class Kimurai::Base
Constants
- DMERGE_EXCLUDE
Don't deep merge the config's headers hash option.
- LoggerFormatter
Attributes
run_info[R]
savers[R]
storage[R]
logger[R]
with_info[RW]
Public Class Methods
add_event(scope, event)
  # File lib/kimurai/base.rb, line 57
  def self.add_event(scope, event)
    return unless @run_info
    @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
  end
completed?()
  # File lib/kimurai/base.rb, line 36
  def self.completed?
    @run_info && @run_info[:status] == :completed
  end
config()
  # File lib/kimurai/base.rb, line 84
  def self.config
    if superclass.equal?(::Object)
      @config
    else
      superclass.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE)
    end
  end
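Subclass configs are layered: each class deep merges its own @config on top of its superclass's, except for the keys in DMERGE_EXCLUDE (the headers hash is replaced rather than merged). A minimal sketch; the class names and option values are hypothetical:

  class ApplicationSpider < Kimurai::Base
    @engine = :mechanize
    @config = { user_agent: "Example UA", headers: { "X-Base" => "1" } }
  end

  class ProductsSpider < ApplicationSpider
    # :headers is excluded from the deep merge, so this hash fully
    # replaces the parent's headers
    @config = { headers: { "X-Child" => "1" } }
  end

  ProductsSpider.config
  # => { user_agent: "Example UA", headers: { "X-Child" => "1" } }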
crawl!(exception_on_fail: true)
  # File lib/kimurai/base.rb, line 102
  def self.crawl!(exception_on_fail: true)
    logger.error "Spider: already running: #{name}" and return false if running?

    @storage = Storage.new
    @savers = {}
    @update_mutex = Mutex.new

    @run_info = {
      spider_name: name, status: :running, error: nil, environment: Kimurai.env,
      start_time: Time.new, stop_time: nil, running_time: nil,
      visits: { requests: 0, responses: 0 },
      items: { sent: 0, processed: 0 },
      events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
    }

    ###

    logger.info "Spider: started: #{name}"
    open_spider if self.respond_to? :open_spider

    spider = self.new
    spider.with_info = true
    if start_urls
      start_urls.each do |start_url|
        if start_url.class == Hash
          spider.request_to(:parse, start_url)
        else
          spider.request_to(:parse, url: start_url)
        end
      end
    else
      spider.parse
    end
  rescue StandardError, SignalException, SystemExit => e
    @run_info.merge!(status: :failed, error: e.inspect)
    exception_on_fail ? raise(e) : [@run_info, e]
  else
    @run_info.merge!(status: :completed)
  ensure
    if spider
      spider.browser.destroy_driver! if spider.instance_variable_get("@browser")

      stop_time  = Time.now
      total_time = (stop_time - @run_info[:start_time]).round(3)
      @run_info.merge!(stop_time: stop_time, running_time: total_time)

      close_spider if self.respond_to? :close_spider

      message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
      failed? ? logger.fatal(message) : logger.info(message)

      @run_info, @storage, @savers, @update_mutex = nil
    end
  end
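crawl! is the entry point for a full run: it sets up the shared run_info, Storage and savers, instantiates the spider with with_info enabled, feeds every start url to :parse (or calls parse directly when there are no start urls), and destroys the browser driver in the ensure block. An illustrative sketch; the spider class, url and selector are made up:

  class ProductsSpider < Kimurai::Base
    @name = "products_spider"
    @engine = :mechanize
    @start_urls = ["https://example.com/products"]

    def parse(response, url:, data: {})
      logger.info "Title: #{response.css('title').text}"
    end
  end

  ProductsSpider.crawl!

  # Or, to get [run_info, exception] back instead of a raised error on failure:
  # ProductsSpider.crawl!(exception_on_fail: false)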
engine()
  # File lib/kimurai/base.rb, line 72
  def self.engine
    @engine ||= superclass.engine
  end
failed?()
  # File lib/kimurai/base.rb, line 40
  def self.failed?
    @run_info && @run_info[:status] == :failed
  end
items()
  # File lib/kimurai/base.rb, line 48
  def self.items
    @run_info && @run_info[:items]
  end
logger()
  # File lib/kimurai/base.rb, line 94
  def self.logger
    @logger ||= Kimurai.configuration.logger || begin
      log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
      log_level = "Logger::#{log_level}".constantize
      Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
    end
  end
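The logger defaults to STDOUT at DEBUG level and can be overridden through Kimurai.configuration or the LOG_LEVEL environment variable. A hedged sketch; the configure block mirrors the gem's global configuration API, and the chosen level is arbitrary:

  Kimurai.configure do |config|
    config.log_level = :warn
  end

  # Or via the environment when launching the spider:
  #   $ LOG_LEVEL=warn ruby products_spider.rb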
name()
  # File lib/kimurai/base.rb, line 68
  def self.name
    @name
  end
new(engine = self.class.engine, config: {})
  # File lib/kimurai/base.rb, line 175
  def initialize(engine = self.class.engine, config: {})
    @engine = engine || self.class.engine
    @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)
    @pipelines = self.class.pipelines.map do |pipeline_name|
      klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
      instance = klass.new
      instance.spider = self
      [pipeline_name, instance]
    end.to_h

    @logger = self.class.logger
    @savers = {}
  end
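Spiders can also be instantiated directly, for example to call handlers manually outside of a full run; the constructor wires up the engine, the merged config, the pipelines and the logger. A sketch with a hypothetical spider class and url:

  spider = ProductsSpider.new(:mechanize, config: { skip_duplicate_requests: true })
  spider.request_to(:parse, url: "https://example.com/products")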
parse!(handler, *args, **request)
  # File lib/kimurai/base.rb, line 156
  def self.parse!(handler, *args, **request)
    spider = self.new

    if args.present?
      spider.public_send(handler, *args)
    elsif request.present?
      spider.request_to(handler, request)
    else
      spider.public_send(handler)
    end
  ensure
    spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
  end
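parse! is handy for trying out a single handler without a full crawl: it builds a throwaway spider, invokes the handler (with positional arguments, a request hash, or nothing at all) and destroys the browser driver afterwards. For example (the handler names and url are hypothetical):

  # Request a url and pass the parsed response to the :parse_product handler:
  ProductsSpider.parse!(:parse_product, url: "https://example.com/products/1")

  # Or call a handler directly with positional arguments:
  ProductsSpider.parse!(:import, "products.csv")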
pipelines()
  # File lib/kimurai/base.rb, line 76
  def self.pipelines
    @pipelines ||= superclass.pipelines
  end
running?()
  # File lib/kimurai/base.rb, line 32
  def self.running?
    @run_info && @run_info[:status] == :running
  end
start_urls()
  # File lib/kimurai/base.rb, line 80
  def self.start_urls
    @start_urls
  end
update(type, subtype)
  # File lib/kimurai/base.rb, line 52
  def self.update(type, subtype)
    return unless @run_info
    @update_mutex.synchronize { @run_info[type][subtype] += 1 }
  end
visits()
  # File lib/kimurai/base.rb, line 44
  def self.visits
    @run_info && @run_info[:visits]
  end
Public Instance Methods
add_event(scope = :custom, event)
  # File lib/kimurai/base.rb, line 238
  def add_event(scope = :custom, event)
    unless self.with_info
      raise "It's allowed to use `add_event` only while performing a full run (`.crawl!` method)"
    end

    self.class.add_event(scope, event)
    logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
  end
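Custom events are only tracked during a full .crawl! run (when with_info is set); inside a handler you can count arbitrary occurrences under the default :custom scope. A sketch with a hypothetical handler and selector:

  def parse_product(response, url:, data: {})
    add_event("Product without price") if response.css(".price").empty?
  end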
browser()
  # File lib/kimurai/base.rb, line 189
  def browser
    @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
  end
console(response = nil, url: nil, data: {})
  # File lib/kimurai/base.rb, line 207
  def console(response = nil, url: nil, data: {})
    binding.pry
  end
request_to(handler, delay = nil, url:, data: {}, response_type: :html)
  # File lib/kimurai/base.rb, line 193
  def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
    raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP)

    if @config[:skip_duplicate_requests] && !unique_request?(url)
      add_event(:duplicate_requests) if self.with_info
      logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
    end

    visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
    return unless visited

    public_send(handler, browser.current_response(response_type), { url: url, data: data })
  end
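Inside a handler, request_to is how you follow links: it validates the url, optionally skips duplicates (see unique_request? below), visits the page (waiting delay seconds first, if given) and passes the parsed response plus url:/data: to the next handler. A sketch; the selector, handler name and urls are illustrative, and the links are assumed to be absolute:

  def parse(response, url:, data: {})
    response.css("a.product-link").each do |a|
      request_to :parse_product, url: a[:href], data: { category: "books" }
    end

    # The optional delay is the second positional argument (seconds to wait
    # before the visit):
    request_to :parse, 3, url: "https://example.com/products?page=2"
  end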
save_to(path, item, format:, position: true, append: false)
  # File lib/kimurai/base.rb, line 223
  def save_to(path, item, format:, position: true, append: false)
    @savers[path] ||= begin
      options = { format: format, position: position, append: append }
      if self.with_info
        self.class.savers[path] ||= Saver.new(path, options)
      else
        Saver.new(path, options)
      end
    end

    @savers[path].save(item)
  end
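save_to lazily creates one Saver per output path (shared at the class level during a full run) and appends the item to it. A sketch, assuming a handler that scraped an item hash; the path, selectors and format are examples (formats such as :json, :pretty_json and :csv are supported by the Saver):

  def parse_product(response, url:, data: {})
    item = {
      title: response.css("h1").text.squish,
      price: response.css(".price").text,
      url:   url
    }

    save_to "results/products.json", item, format: :pretty_json
  end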
storage()
  # File lib/kimurai/base.rb, line 213
  def storage
    # Note: for `.crawl!` uses shared thread safe Storage instance,
    # otherwise, each spider instance will have it's own Storage
    @storage ||= self.with_info ? self.class.storage : Storage.new
  end
unique?(scope, value)
  # File lib/kimurai/base.rb, line 219
  def unique?(scope, value)
    storage.unique?(scope, value)
  end
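unique? registers a value within a storage scope and returns true only the first time the value is seen, which makes it useful for de-duplicating items as well as requests. For example (the scope name and selector are hypothetical):

  def parse_product(response, url:, data: {})
    sku = response.css(".sku").text
    # Only process each product once across the whole run:
    return unless unique?(:product_skus, sku)

    # ... build and send the item ...
  end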
Private Instance Methods
create_browser(engine, config = {})
  # File lib/kimurai/base.rb, line 251
  def create_browser(engine, config = {})
    Kimurai::BrowserBuilder.build(engine, config, spider: self)
  end
in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
  # File lib/kimurai/base.rb, line 290
  def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
    parts = urls.in_sorted_groups(threads, false)
    urls_count = urls.size

    all = []
    start_time = Time.now
    logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"

    parts.each do |part|
      all << Thread.new(part) do |part|
        Thread.current.abort_on_exception = true

        spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
        spider.with_info = true if self.with_info

        part.each do |url_data|
          if url_data.class == Hash
            if url_data[:url].present? && url_data[:data].present?
              spider.request_to(handler, delay, url_data)
            else
              spider.public_send(handler, url_data)
            end
          else
            spider.request_to(handler, delay, url: url_data, data: data)
          end
        end
      ensure
        spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
      end

      sleep 0.5
    end

    all.each(&:join)
    logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
  end
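in_parallel splits the urls into threads groups, starts one spider instance (and browser) per thread, and routes each url (or { url:, data: } hash) through request_to to the given handler. Being a private method, it is typically called from within a handler such as parse. An illustrative sketch; the selector, handler name and data are made up:

  def parse(response, url:, data: {})
    urls = response.css("a.product-link").map { |a| a[:href] }

    # Process the collected urls with 3 concurrent browsers, waiting 2 seconds
    # before each visit and passing shared data along to the handler:
    in_parallel(:parse_product, urls, threads: 3, delay: 2, data: { category: "books" })
  end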
send_item(item, options = {})
  # File lib/kimurai/base.rb, line 269
  def send_item(item, options = {})
    logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
    self.class.update(:items, :sent) if self.with_info

    @pipelines.each do |name, instance|
      item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
    end
  rescue => e
    logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
    add_event(:drop_items_errors, e.inspect) if self.with_info
    false
  else
    self.class.update(:items, :processed) if self.with_info
    logger.info "Pipeline: processed: #{JSON.generate(item)}"
    true
  ensure
    if self.with_info
      logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
    end
  end
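send_item pushes an item through each configured pipeline in order, counts it as processed (or dropped when a pipeline raises) and logs the result. A sketch, assuming the spider declared @pipelines = [:validator] and a matching Validator < Kimurai::Pipeline class exists (both hypothetical); per-pipeline options are passed keyed by pipeline name:

  def parse_product(response, url:, data: {})
    item = { title: response.css("h1").text, url: url }

    send_item item, validator: { strict: true }
  end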
unique_request?(url)
  # File lib/kimurai/base.rb, line 255
  def unique_request?(url)
    options = @config[:skip_duplicate_requests]
    if options.class == Hash
      scope = options[:scope] || :requests_urls

      if options[:check_only]
        storage.include?(scope, url) ? false : true
      else
        storage.unique?(scope, url) ? true : false
      end
    else
      storage.unique?(:requests_urls, url) ? true : false
    end
  end
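Duplicate filtering is driven by the skip_duplicate_requests config key: true uses the default :requests_urls scope, while the hash form allows a custom scope and a check_only mode that looks a url up without registering it. For example (illustrative config values):

  class ProductsSpider < Kimurai::Base
    # Simple form: register urls and skip the ones already requested
    @config = { skip_duplicate_requests: true }

    # Hash form: custom scope, only check without registering the url
    # @config = { skip_duplicate_requests: { scope: :product_urls, check_only: true } }
  end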