class NewsCrawler::Downloader
This class implements a parallel downloader based on Typhoeus with a given queue
Constants
- CONCURRENT_DOWNLOAD
Public Class Methods
new(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
click to toggle source
Constructs a downloader with a URLQueue. @param [ Boolean ] start_on_create whether to start the downloader immediately @param [ NewsCrawler::URLQueue ] queue URL queue
# File lib/news_crawler/downloader.rb, line 42
# Builds a downloader backed by the given URL queue.
#
# @param [ Boolean ] start_on_create begin polling the queue immediately
# @param [ NewsCrawler::URLQueue ] queue source of unvisited URLs
# @param [ Hash ] opts :concurrent overrides the CONCURRENT_DOWNLOAD default
def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
  @queue               = queue
  @urls                = queue.find_unvisited
  @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
  @wait_time           = 1
  @status              = :running
  @stoping             = false
  wait_for_url if start_on_create
end
Public Instance Methods
graceful_terminate()
click to toggle source
Graceful terminate this downloader
# File lib/news_crawler/downloader.rb, line 80
# Requests a graceful shutdown: raises the stop flag, then blocks
# until the download loop has left the :running state.
def graceful_terminate
  @stoping = true
  sleep(1) while @status == :running
end
run()
click to toggle source
Starts the downloader with the current queue. Successfully fetched URLs are marked as visited and their results are stored in the DB
# File lib/news_crawler/downloader.rb, line 54
# Fetches every pending URL in parallel via a Typhoeus::Hydra.
# URLs disallowed by robots.txt are skipped. A successful fetch is
# stored in Storage::RawData and marked visited in the queue; a
# failure is only logged. When the batch finishes, the pending list
# is cleared and the downloader goes back to waiting for new URLs.
def run
  @status = :running
  hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
  # TODO Log here
  # Honour robots.txt before queueing anything.
  @urls = @urls.keep_if do | url |
    Robots.instance.allowed? url
  end
  # Fix: the original collected the requests with `map` into an unused
  # local; `each` expresses the side-effect-only intent.
  @urls.each do | url |
    request = Typhoeus::Request.new(url, followlocation: true)
    request.on_complete do | response |
      if response.success?
        Storage::RawData.add(url, response.response_body)
        @queue.mark_visited url
      else
        NCLogger.get_logger.warn("[WARNING] Fetch error [#{url}]")
      end
    end
    hydra.queue request
  end
  hydra.run
  @urls = []
  wait_for_url
end
Private Instance Methods
get_new_url()
click to toggle source
# File lib/news_crawler/downloader.rb, line 107 def get_new_url @urls = @queue.find_unvisited end
wait_for_url()
click to toggle source
Waiting for new urls’re added to queue, using backoff algorithms
# File lib/news_crawler/downloader.rb, line 89
# Polls the queue for new URLs with exponential backoff (the delay
# doubles while it is below 30s). Returns immediately once the stop
# flag is raised; otherwise hands control to #run as soon as
# unvisited URLs appear.
#
# Fix: rewritten from self-recursion to a loop so that long idle
# periods no longer grow the call stack (MRI does not apply
# tail-call optimization by default).
def wait_for_url
  @status = :waiting
  loop do
    # Check the stop flag before each sleep so graceful_terminate
    # is honoured promptly.
    return if @stoping
    sleep @wait_time
    get_new_url
    break unless @urls.size == 0
    # Back off: double the delay until it reaches at least 30s.
    if @wait_time < 30
      @wait_time = @wait_time * 2
    end
  end
  @wait_time = 1
  run
end