class DaimonSkycrawlers::Crawler::Base

The base class for crawlers.

A crawler implementation can inherit from this class and override `#fetch`.

Attributes

n_processed_urls[R]

@!attribute [r] n_processed_urls

The number of processed URLs.
@return [Integer]
storage[W]

@!attribute [w] storage

Set storage to crawler instance.
@return [void]

Public Class Methods

new(base_url = nil, faraday_options: {}, options: {}) click to toggle source

@param base_url [String] Base URL for crawler
@param faraday_options [Hash] options for Faraday
@param options [Hash] options for crawler

Calls superclass method DaimonSkycrawlers::Callbacks::new
# File lib/daimon_skycrawlers/crawler/base.rb, line 45
# Build a crawler and register its default callbacks.
#
# @param base_url [String, nil] base URL kept for building the connection
# @param faraday_options [Hash] options kept for building the Faraday connection
# @param options [Hash] crawler options (+:obey_robots_txt+ is read by the default filters)
def initialize(base_url = nil, faraday_options: {}, options: {})
  super()

  # Per-crawler bookkeeping.
  @n_processed_urls = 0
  @skipped = false

  # Configuration handed in by the caller.
  @base_url = base_url
  @faraday_options = faraday_options
  @options = options

  # No-op hook until #prepare installs a real one.
  @prepare = ->(_connection) {}

  setup_default_filters
  setup_default_post_processes
end

Public Instance Methods

connection() click to toggle source

@return [Faraday]

# File lib/daimon_skycrawlers/crawler/base.rb, line 102
# Lazily build the Faraday connection from the stored base URL and options.
#
# A connection that has already been assigned (for example by
# #setup_connection) is returned as-is.
#
# @return [Object] the memoized result of Faraday.new
def connection
  return @connection if @connection

  @connection = Faraday.new(@base_url, @faraday_options)
end
fetch(path, message = {}) click to toggle source

Fetch URL

Override this method in subclass.

@param path [String] URI or path
@param message [Hash] message can include anything

@return [Faraday::Response] HTTP response

# File lib/daimon_skycrawlers/crawler/base.rb, line 147
# Template method: this base implementation has no fetch strategy and
# always raises. #process calls it as +fetch(url, message, &block)+, so an
# overriding method may also accept a block.
#
# NotImplementedError descends from ScriptError, not StandardError, so a
# bare +rescue+ does not catch it.
#
# @param path [String] URI or path
# @param message [Hash] extra data accompanying the request
# @raise [NotImplementedError] always, in this base implementation
def fetch(path, message = {})
  raise NotImplementedError, "Must implement this method in subclass"
end
get(path, params = {}) click to toggle source

GET URL with params

@param path [String] URI or path
@param params [Hash] query parameters

@return [Faraday::Response] HTTP response

# File lib/daimon_skycrawlers/crawler/base.rb, line 159
# Issue a GET request through the crawler's connection.
#
# Goes through #connection rather than reading @connection directly, so the
# call also works when no connection has been built yet (the accessor
# creates it on first use and otherwise returns the existing one).
#
# @param path [String] URI or path
# @param params [Hash] parameters handed to the connection's +get+
# @return [Object] whatever the connection's +get+ returns (a Faraday response)
def get(path, params = {})
  connection.get(path, params)
end
post(path, params = {}) click to toggle source

POST URL with params

@param path [String] URI or path
@param params [Hash] parameters sent with the request (passed to the connection's `post`)

@return [Faraday::Response] HTTP response

# File lib/daimon_skycrawlers/crawler/base.rb, line 171
# Issue a POST request through the crawler's connection.
#
# Goes through #connection rather than reading @connection directly, so the
# call also works when no connection has been built yet (the accessor
# creates it on first use and otherwise returns the existing one).
#
# @param path [String] URI or path
# @param params [Hash] handed to the connection's +post+ as its second
#   argument (Faraday treats that argument as the request body)
# @return [Object] whatever the connection's +post+ returns (a Faraday response)
def post(path, params = {})
  connection.post(path, params)
end
prepare(&block) click to toggle source

Call this method before DaimonSkycrawlers.register_crawler. For example, you can log in before fetching a URL.

@yield [connection]

# File lib/daimon_skycrawlers/crawler/base.rb, line 79
# Install the hook that #process calls with the connection before fetching.
#
# When called without a block, the hook is reset to a no-op (the same
# default #initialize installs) instead of storing nil, which would make
# the next #process call fail on +@prepare.call+.
#
# @yield [connection] invoked by #process before each fetch
# @return [Proc] the callable now stored as the hook
def prepare(&block)
  @prepare = block || ->(_connection) {}
end
process(message, &block) click to toggle source

Process crawler sequence

  1. Run registered filters

  2. Prepare connection

  3. Download(fetch) data from given URL

  4. Run post processes (store downloaded data to storage)

@param message [Hash] parameters for crawler

# File lib/daimon_skycrawlers/crawler/base.rb, line 116
# Run one message through the crawler sequence:
#
# 1. run the before-process callbacks (filters)
# 2. call the prepare hook with the connection
# 3. fetch the URL
# 4. run the after-process callbacks
#
# The processed-URL counter is incremented for every call, including
# skipped ones.
#
# The caller's +message+ is left untouched: +:url+ is removed from a shallow
# copy, so the same hash can be passed to #process again.
#
# @param message [Hash] parameters for crawler; +:url+ may be a full URL or
#   a path resolved against the connection's URL prefix
# @return [Hash, nil] +{ url:, message:, response: }+, or nil when a
#   before-process callback rejected the message
def process(message, &block)
  @skipped = false
  @n_processed_urls += 1

  unless run_before_process_callbacks(message)
    skip(message[:url])
    return
  end

  # Copy before stripping :url so the caller's hash is not mutated.
  message = message.dup

  # url can be a path
  url = message.delete(:url)
  url = (URI(connection.url_prefix) + url).to_s

  @prepare.call(connection)
  response = fetch(url, message, &block)
  data = { url: url, message: message, response: response }
  run_after_process_callbacks(data)
  data
end
setup_connection(options = {}) { |faraday| ... } click to toggle source

Set up connection

@param options [Hash] options for Faraday
@yield [faraday]
@yieldparam faraday [Faraday]

# File lib/daimon_skycrawlers/crawler/base.rb, line 65
# Build the connection explicitly, replacing any previously stored one.
#
# The options given here are merged over the ones stored at construction.
# An empty merge result is passed to Faraday.new as nil.
#
# @param options [Hash] extra options merged over the stored Faraday options
# @yield [faraday] the object Faraday.new yields, for configuring the connection
# @return [Object] the newly stored connection
def setup_connection(options = {})
  combined = @faraday_options.merge(options)
  combined = nil if combined.empty?

  @connection = Faraday.new(@base_url, combined) { |builder| yield builder }
end
skipped?() click to toggle source

@return [true|false]

# File lib/daimon_skycrawlers/crawler/base.rb, line 95
# Report whether the most recent #process call was skipped.
#
# The flag is reset to false at the start of every #process call and set
# to true by #skip.
#
# @return [true, false]
def skipped?
  @skipped
end
storage() click to toggle source

Retrieve storage instance

@return [DaimonSkycrawlers::Storage::Base]

# File lib/daimon_skycrawlers/crawler/base.rb, line 88
# Return the storage used by this crawler.
#
# A storage assigned through the writer is returned as-is; otherwise a
# Storage::RDB instance is created on first use and remembered.
#
# @return [Object] the assigned storage, or the memoized Storage::RDB
def storage
  return @storage if @storage

  @storage = Storage::RDB.new
end

Private Instance Methods

schedule_to_process(url, message = {}) click to toggle source
# File lib/daimon_skycrawlers/crawler/base.rb, line 211
# Hand a URL and its message to DaimonSkycrawlers::Processor.enqueue_http_response.
#
# NOTE(review): presumably this queues the URL for the processor side;
# the enqueue implementation is not visible here.
#
# @param url [String] URL to enqueue
# @param message [Hash] data passed along with the URL
# @return [Object] whatever enqueue_http_response returns
def schedule_to_process(url, message = {})
  DaimonSkycrawlers::Processor.enqueue_http_response(url, message)
end
setup_default_filters() click to toggle source
# File lib/daimon_skycrawlers/crawler/base.rb, line 177
# Register the built-in before-process callbacks.
#
# * When the +:obey_robots_txt+ option is truthy, a robots.txt filter is
#   registered first.
# * An update filter is always registered.
#
# Each callback builds a new checker for every message and returns the
# checker's verdict; a falsy verdict is logged at debug level.
def setup_default_filters
  if @options[:obey_robots_txt]
    before_process do |message|
      checker = DaimonSkycrawlers::Filter::RobotsTxtChecker.new(base_url: @base_url)
      checker.allowed?(message).tap do |permitted|
        log.debug("Not allowed: #{message[:url]}") unless permitted
      end
    end
  end

  before_process do |message|
    checker = DaimonSkycrawlers::Filter::UpdateChecker.new(storage: storage)
    checker.updated?(message, connection: connection).tap do |changed|
      log.debug("Not updated: #{message[:url]}") unless changed
    end
  end
end
setup_default_post_processes() click to toggle source
# File lib/daimon_skycrawlers/crawler/base.rb, line 196
# Register the built-in after-process callback: save the processed data to
# storage, then pass its URL and message to #schedule_to_process.
def setup_default_post_processes
  after_process do |result|
    storage.save(result)

    payload = result[:message]
    target_url = result[:url]
    schedule_to_process(target_url, payload)
  end
end
skip(url) click to toggle source
# File lib/daimon_skycrawlers/crawler/base.rb, line 205
# Record that the current message was skipped: log it at info level, raise
# the skipped flag, and schedule the URL with a heartbeat-only message.
#
# @param url [#to_s] URL (or path) of the skipped message
# @return [Object] whatever #schedule_to_process returns
def skip(url)
  crawler_class = self.class
  log.info("Skipped '#{url}' by '#{crawler_class}'")

  @skipped = true
  schedule_to_process(url.to_s, heartbeat: true)
end