class CrawlStation::Producer

Attributes

cache[RW]
proxies[RW]
proxy[RW]
schedule[RW]

Public Class Methods

new(schedule, cache, proxies = nil)
# File lib/crawl_station/producer.rb, line 7
# Builds a producer around its collaborators.
#
# @param schedule [Object] queue of entries to crawl; stored in @schedule
# @param cache [Object] stored in @cache (queried with `include?` by `parsed?`)
# @param proxies [Object, nil] optional proxy list; stored as-is in @proxies
def initialize(schedule, cache, proxies = nil)
  @schedule, @cache, @proxies = schedule, cache, proxies
end

Public Instance Methods

loop_parser()
# File lib/crawl_station/producer.rb, line 18
# Performs one iteration of the producer loop.
#
# Pops the next entry from the schedule, crawls it through `parse_item`,
# post-processes the result with `parse_links` and saves non-empty data via
# the entry's `item_class`. When the schedule is empty, or the popped entry is
# reported as already parsed, it pauses for 0.2 seconds instead.
#
# @return [Object] a truthy value on every path (`true`, or whatever
#   `sleep(0.2)` returned), so the caller's loop keeps running
def loop_parser
  if @schedule.empty?
    # Nothing queued right now: back off briefly.
    return sleep(0.2) || true
  end

  entry = @schedule.pop
  # Raw hashes are wrapped so the rest of the method can use accessors.
  entry = CS::ParseStruct.new(entry) if entry.is_a?(Hash)
  if parsed?(entry)
    return sleep(0.2) || true
  end

  Logger.debug "start parse #{entry.link}"
  crawled = parse_item(entry)
  return true if crawled.nil? || crawled.empty?

  linked = parse_links(crawled, entry.namespace)
  unless linked.empty?
    entry.item_class.new.save(entry.link, linked)
  end
  true
end
parse_item(item)
# File lib/crawl_station/producer.rb, line 32
# Crawls one scheduled entry and reports the outcome to the schedule.
#
# The crawl (`item.parser_class.new.crawl(item.link)`) is handed as a block to
# `cache(item)`, which is defined outside this method. On success the entry is
# marked done on the schedule and the resulting data is returned. On failure
# the error is logged with up to 11 backtrace frames, the entry is marked
# failed, and nil is returned.
#
# Only StandardError is rescued. Interrupt, SystemExit, NoMemoryError and
# other non-StandardError exceptions propagate so the process can still be
# stopped while a crawl is in flight.
#
# @param item [#link, #parser_class] the entry to crawl
# @return [Object, nil] the data produced by `cache`, or nil when the crawl
#   raised a StandardError
def parse_item(item)
  data = cache(item) { item.parser_class.new.crawl(item.link) }
  @schedule.done(item)
  data
rescue StandardError => e
  # `backtrace` may be nil for exceptions without one; Array() keeps the
  # logging path from raising in that case.
  trace = Array(e.backtrace).first(11).join("\n")
  Logger.error("#{item.link}: #{e.message}\n#{trace}")
  @schedule.failed(item)
  nil
end
parsed?(data)
# File lib/crawl_station/producer.rb, line 53
# Tells whether an entry should be skipped by the producer.
#
# A nil entry counts as parsed; otherwise the answer is whether @cache already
# includes the value stored under the entry's 'link' key.
#
# @param data [#[], nil] the entry to check; read via `data['link']`
# @return [Boolean] true for nil, else the result of `@cache.include?`
def parsed?(data)
  return true if data.nil?

  @cache.include?(data['link'])
end
start()
# File lib/crawl_station/producer.rb, line 13
# Runs the producer: calls `loop_parser` repeatedly until it returns a falsy
# value, then logs that this producer is done.
#
# @return [Object] whatever `Logger.debug` returns
def start
  loop do
    keep_running = loop_parser
    break unless keep_running
  end
  Logger.debug "#{self} done"
end