class DaimonSkycrawlers::Processor::Spider

Web spider class. By default, it extracts all links on a page and follows them.

@example Google search result (2016-11-29)

spider = DaimonSkycrawlers::Processor::Spider.new
spider.configure do |s|
  s.link_rules = ".g .r a"
  s.extract_link do |element|
    element["data-href"]
  end
  s.link_message = { next: "detail" }
  s.next_page_link_rules = "a#pnnext"
  s.next_page_link_message = { next: "spider" }
end

Attributes

enqueue[RW]

@!attribute [rw] enqueue

If true, enqueue the links that are found.

@!attribute [rw] link_rules

same as Nokogiri::XML::DocumentFragment#search
In general, this can be set to an XPath expression or a CSS selector.

@!attribute [rw] next_page_link_rules

same as Nokogiri::XML::DocumentFragment#search
In general, this can be set to an XPath expression or a CSS selector.

Public Class Methods

new() click to toggle source
Calls superclass method DaimonSkycrawlers::Processor::Base::new
# File lib/daimon_skycrawlers/processor/spider.rb, line 45
# Set up a spider with its default configuration.
#
# Defaults: enqueueing is enabled, the link rule is the single selector
# "a", no next-page rule is set, and both extractors read the "href"
# attribute of the element they are given.
def initialize
  super

  # Per-page working state; nothing has been parsed yet.
  @doc = nil
  @links = nil
  @link_filters = []

  # Found links are enqueued unless this is switched off.
  @enqueue = true

  # Ordinary links: selector, URL extractor and extra message fields.
  @link_rules = ["a"]
  @extract_link = lambda { |element| element["href"] }
  @link_message = {}

  # "Next page" link: same trio, but no selector by default.
  @next_page_link_rules = nil
  @extract_next_page_link = lambda { |element| element["href"] }
  @next_page_link_message = {}
end

Public Instance Methods

call(message) click to toggle source

@param message [Hash] Must have key :url; :depth is optional and defaults to 2

# File lib/daimon_skycrawlers/processor/spider.rb, line 101
# Process one crawled page: read it from storage, parse it, and enqueue
# the links found on it plus the "next page" link, if there is one.
#
# Nothing is done when the depth is 1 or less, or when the page cannot be
# read from storage (a warning is logged in that case). Every enqueued
# message carries the depth reduced by one.
#
# @param message [Hash] read for :depth (defaults to 2 when absent),
#   :url and :key (the latter two are used in the warning text); the
#   whole hash is handed to storage.read
# @raise [ArgumentError] if :depth cannot be converted by Integer()
def call(message)
  remaining_depth = Integer(message[:depth] || 2)
  return if remaining_depth <= 1

  page = storage.read(message)
  unless page
    log.warn("Could not read page: url=#{message[:url]}, key=#{message[:key]}")
    return
  end

  @doc = Nokogiri::HTML(page.body)
  base_message = { depth: remaining_depth - 1 }

  # Configured message fields win over the base fields on key clashes.
  message_for_links = base_message.merge(@link_message)
  links.each { |url| enqueue_url(url, message_for_links) }

  following_page = next_page_link
  return unless following_page

  enqueue_url(following_page, base_message.merge(@next_page_link_message))
end

Private Instance Methods

enqueue_url(url, new_message) click to toggle source
# File lib/daimon_skycrawlers/processor/spider.rb, line 158
# Pass a URL and its message on to the crawler queue, logging it at debug
# level first. Does nothing (and returns nil) when enqueueing is disabled.
#
# @param url [String] URL to enqueue
# @param new_message [Hash] message passed along with the URL
def enqueue_url(url, new_message)
  if @enqueue
    log.debug("Enqueue: URL:#{url}, message: #{new_message}")
    DaimonSkycrawlers::Crawler.enqueue_url(url, new_message)
  end
end