class CrawlStation::Producer
Attributes
cache[RW]
proxies[RW]
proxy[RW]
schedule[RW]
Public Class Methods
new(schedule, cache, proxies = nil)
click to toggle source
# File lib/crawl_station/producer.rb, line 7
# Builds a producer from its collaborators.
#
# schedule - kept in @schedule; the other methods of this class call
#            empty?, pop, push, done and failed on it.
# cache    - kept in @cache; parsed? calls include? on it.
# proxies  - kept in @proxies; optional, nil when omitted.
def initialize(schedule, cache, proxies = nil)
  @schedule, @cache, @proxies = schedule, cache, proxies
end
Public Instance Methods
loop_parser()
click to toggle source
# File lib/crawl_station/producer.rb, line 18
# Runs one iteration of the producer loop: takes the next item off the
# schedule, crawls it through parse_item, lets parse_links enqueue the
# links it exposes, and hands what is left to
# item.item_class.new.save(item.link, data).
#
# Returns true on every path. The caller (#start) keeps looping while the
# result is truthy, so an idle or failed iteration never stops the
# producer.
def loop_parser
  # Nothing queued: back off briefly. Kernel#sleep returns an Integer,
  # which is always truthy, so `sleep(0.2) || true` could never yield
  # true -- return it explicitly to keep the result consistent.
  if @schedule.empty?
    sleep(0.2)
    return true
  end

  item = @schedule.pop
  item = CS::ParseStruct.new(item) if item.is_a?(Hash)

  # parsed? also answers true for nil, so a pop that yields nothing is
  # treated like an already-seen link: wait and try again.
  if parsed?(item)
    sleep(0.2)
    return true
  end

  Logger.debug "start parse #{item.link}"
  data = parse_item(item)
  # parse_item returns nil after a failed crawl.
  return true if data.nil? || data.empty?

  # parse_links removes the 'pages'/'details' entries; skip saving when
  # nothing else remains.
  data = parse_links(data, item.namespace)
  return true if data.empty?

  item.item_class.new.save(item.link, data)
  true
end
parse_item(item)
click to toggle source
# File lib/crawl_station/producer.rb, line 32
# Crawls a single item and reports the outcome to the schedule.
#
# The crawl (item.parser_class.new.crawl(item.link)) is passed as a block
# to cache(item). NOTE(review): that helper is not visible in this file;
# presumably it yields on a cache miss -- confirm.
#
# Returns the value produced by cache(item) once @schedule.done(item) has
# been called, or nil after the error was logged and
# @schedule.failed(item) was called.
#
# Only StandardError and ScriptError (NotImplementedError, LoadError, ...)
# are treated as per-item failures. Interrupt, SystemExit, NoMemoryError
# and the like propagate, so Ctrl-C or `exit` during a crawl stops the
# process instead of being logged as a failed link.
def parse_item(item)
  data = cache(item) { item.parser_class.new.crawl(item.link) }
  @schedule.done(item)
  data
rescue StandardError, ScriptError => e
  # Keep the log readable: at most the first 11 frames, as before.
  # Array() guards against an exception that carries no backtrace.
  trace = Array(e.backtrace).first(11).join("\n")
  Logger.error("#{item.link}: #{e.message}\n#{trace}")
  @schedule.failed(item)
  nil
end
parse_links(data, namespace)
click to toggle source
# File lib/crawl_station/producer.rb, line 42
# Moves follow-up links out of a crawl result and onto the schedule.
#
# The 'pages' and 'details' entries are deleted from +data+ (the hash is
# mutated in place). Each element of those entries whose 'link' is not
# blank and not already parsed is pushed onto @schedule as a ParseStruct
# built from its 'parser' and 'link' plus the given namespace.
#
# data      - crawl result; must respond to delete(key).
# namespace - passed through unchanged to every ParseStruct.
#
# Returns the same +data+ object, now without 'pages' and 'details'.
def parse_links(data, namespace)
  %w[pages details].each do |field|
    # A missing field deletes to nil; &. skips it.
    data.delete(field)&.each do |entry|
      next if entry['link'].blank? || parsed?(entry)

      @schedule.push ParseStruct.new(parser: entry['parser'], link: entry['link'], namespace: namespace)
    end
  end
  data
end
parsed?(data)
click to toggle source
# File lib/crawl_station/producer.rb, line 53
# Tells whether +data+ needs no further work.
#
# data - nil, or an object indexable with 'link'.
#
# Returns true for nil; otherwise the result of asking @cache whether it
# includes data['link'].
def parsed?(data)
  return true if data.nil?

  @cache.include?(data['link'])
end
start()
click to toggle source
# File lib/crawl_station/producer.rb, line 13
# Drives the producer: calls loop_parser repeatedly until it answers with
# a falsy value, then logs "<self> done" at debug level.
#
# As written in this file, loop_parser answers truthy on every path, so
# the loop only ends if that changes or something raises.
#
# Returns whatever Logger.debug returns.
def start
  loop do
    keep_going = loop_parser
    break unless keep_going
  end
  Logger.debug "#{self} done"
end