class DaimonSkycrawlers::Crawler::Base
The base class of crawlers.
A crawler implementation can inherit from this class and override `#fetch`.
Attributes
@!attribute [r] n_processed_urls
The number of processed URLs.
@return [Integer]
@!attribute [w] storage
Sets the storage used by the crawler instance.
@return [void]
Public Class Methods
@param base_url [String] Base URL for the crawler
@param faraday_options [Hash] options for Faraday
@param options [Hash] options for the crawler
Calls superclass method DaimonSkycrawlers::Callbacks::new
# File lib/daimon_skycrawlers/crawler/base.rb, line 45
def initialize(base_url = nil, faraday_options: {}, options: {})
  super()
  @base_url = base_url
  @faraday_options = faraday_options
  @options = options
  @prepare = ->(connection) {}
  @skipped = false
  @n_processed_urls = 0
  setup_default_filters
  setup_default_post_processes
end
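For example, a crawler for a single site might be constructed like this (MyCrawler is a hypothetical subclass, sketched under #fetch below; the header and option values are illustrative):

crawler = MyCrawler.new(
  "https://example.com",
  faraday_options: { headers: { "User-Agent" => "my-crawler/1.0" } },
  options: { obey_robots_txt: true }
)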
Public Instance Methods
@return [Faraday::Connection]
# File lib/daimon_skycrawlers/crawler/base.rb, line 102
def connection
  @connection ||= Faraday.new(@base_url, @faraday_options)
end
Fetch a URL.
Override this method in a subclass.
@param path [String] URI or path
@param message [Hash] message can include anything
@return [Faraday::Response] HTTP response
# File lib/daimon_skycrawlers/crawler/base.rb, line 147
def fetch(path, message = {})
  raise NotImplementedError, "Must implement this method in subclass"
end
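A minimal sketch of a concrete crawler (the class name and body are illustrative, not part of the library); it fulfills the contract above by returning the Faraday::Response produced by the get helper below:

class MyCrawler < DaimonSkycrawlers::Crawler::Base
  # process passes the resolved URL and the remaining message here;
  # delegate to the GET helper and return its Faraday::Response.
  def fetch(path, message = {})
    get(path)
  end
end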
GET URL with params
@param path [String] URI or path
@param params [Hash] query parameters
@return [Faraday::Response] HTTP response
# File lib/daimon_skycrawlers/crawler/base.rb, line 159
def get(path, params = {})
  @connection.get(path, params)
end
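For instance (the path and query are placeholders). Note that get reads @connection directly rather than going through #connection, so the connection must already exist, e.g. after #connection or #setup_connection has been called:

crawler.connection               # memoizes @connection
response = crawler.get("/search", q: "ruby")
response.status                  # => 200 on success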
POST URL with params
@param path [String] URI or path
@param params [Hash] parameters sent as the request body
@return [Faraday::Response] HTTP response
# File lib/daimon_skycrawlers/crawler/base.rb, line 171
def post(path, params = {})
  @connection.post(path, params)
end
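A sketch of a form submission (the endpoint and fields are placeholders). Passing a Hash body assumes Faraday's url_encoded request middleware is in the stack; see #setup_connection below:

response = crawler.post("/comments", name: "alice", body: "hello")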
Call this method before DaimonSkycrawlers.register_crawler.
For example, you can log in before fetching URLs.
@yield [connection]
# File lib/daimon_skycrawlers/crawler/base.rb, line 79
def prepare(&block)
  @prepare = block
end
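A sketch of logging in before crawling (the login endpoint and credentials are placeholders):

crawler.prepare do |connection|
  connection.post("/login", username: "user", password: "secret")
end
DaimonSkycrawlers.register_crawler(crawler)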
Process crawler sequence:

- Run registered filters
- Prepare connection
- Download (fetch) data from the given URL
- Run post processes (store downloaded data to storage)
@param message [Hash] parameters for crawler
# File lib/daimon_skycrawlers/crawler/base.rb, line 116
def process(message, &block)
  @skipped = false
  @n_processed_urls += 1
  proceeding = run_before_process_callbacks(message)
  unless proceeding
    skip(message[:url])
    return
  end
  # url can be a path
  url = message.delete(:url)
  url = (URI(connection.url_prefix) + url).to_s
  @prepare.call(connection)
  response = fetch(url, message, &block)
  data = {
    url: url,
    message: message,
    response: response,
  }
  run_after_process_callbacks(data)
  data
end
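A hypothetical direct invocation (in normal operation the consumer passes messages from the queue; the URL is a placeholder):

data = crawler.process(url: "/articles/1")
data[:url]      # the fully resolved URL string
data[:response] # the Faraday::Response returned by #fetch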
Set up connection
@param options [Hash] options for Faraday
@yield [faraday]
@yieldparam faraday [Faraday::Connection]
# File lib/daimon_skycrawlers/crawler/base.rb, line 65
def setup_connection(options = {})
  merged_options = @faraday_options.merge(options)
  faraday_options = merged_options.empty? ? nil : merged_options
  @connection = Faraday.new(@base_url, faraday_options) do |faraday|
    yield faraday
  end
end
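For example, customizing the middleware stack with standard Faraday pieces (a sketch; pick middleware to suit your crawler):

crawler.setup_connection do |faraday|
  faraday.request  :url_encoded            # encode Hash bodies as form data
  faraday.response :logger                 # log requests and responses
  faraday.adapter  Faraday.default_adapter
end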
@return [true|false] whether the last #process call was skipped by a filter
# File lib/daimon_skycrawlers/crawler/base.rb, line 95
def skipped?
  @skipped
end
Retrieve the storage instance (defaults to DaimonSkycrawlers::Storage::RDB).
@return [DaimonSkycrawlers::Storage::Base]
# File lib/daimon_skycrawlers/crawler/base.rb, line 88
def storage
  @storage ||= Storage::RDB.new
end
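Because the storage writer attribute above allows injecting another backend, a replacement only needs the save(data) interface used by the default post process. A hypothetical sketch:

class InMemoryStorage < DaimonSkycrawlers::Storage::Base
  def initialize
    @pages = []
  end

  # data is the { url:, message:, response: } hash built by #process
  def save(data)
    @pages << data
  end
end

crawler.storage = InMemoryStorage.new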
Private Instance Methods
# File lib/daimon_skycrawlers/crawler/base.rb, line 211
def schedule_to_process(url, message = {})
  DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
end
# File lib/daimon_skycrawlers/crawler/base.rb, line 177
def setup_default_filters
  if @options[:obey_robots_txt]
    before_process do |m|
      robots_txt_checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
      allowed = robots_txt_checker.allowed?(m)
      log.debug("Not allowed: #{m[:url]}") unless allowed
      allowed
    end
  end
  before_process do |m|
    update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
    updated = update_checker.updated?(m, connection: connection)
    unless updated
      log.debug("Not updated: #{m[:url]}")
    end
    updated
  end
end
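Custom filters follow the same pattern: a before_process callback returning false causes the message to be skipped. A sketch, assuming before_process is callable on the crawler instance as it is here:

crawler.before_process do |message|
  !message[:url].to_s.end_with?(".pdf") # skip PDF links
end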
# File lib/daimon_skycrawlers/crawler/base.rb, line 196
def setup_default_post_processes
  after_process do |data|
    storage.save(data)
    message = data[:message]
    url = data[:url]
    schedule_to_process(url, message)
  end
end
# File lib/daimon_skycrawlers/crawler/base.rb, line 205
def skip(url)
  log.info("Skipped '#{url}' by '#{self.class}'")
  @skipped = true
  schedule_to_process(url.to_s, heartbeat: true)
end