class NewsCrawler::Downloader

This class implements a parallel downloader based on Typhoeus, operating on a given queue

Constants

CONCURRENT_DOWNLOAD

Public Class Methods

new(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts) click to toggle source

Construct a downloader with a URLQueue @param [ Boolean ] start_on_create whether to start the selector immediately @param [ NewsCrawler::URLQueue ] queue the URL queue

# File lib/news_crawler/downloader.rb, line 42
# Construct a downloader backed by a URL queue.
#
# @param [Boolean] start_on_create whether to start polling immediately
# @param [NewsCrawler::URLQueue] queue source of unvisited URLs
# @param [Hash] opts :concurrent overrides CONCURRENT_DOWNLOAD
def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
  # Internal state first: backoff delay, lifecycle status, stop flag.
  @wait_time = 1
  @status    = :running
  @stoping   = false
  @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
  # Remember the queue and seed the working set of URLs from it.
  @queue = queue
  @urls  = @queue.find_unvisited
  wait_for_url if start_on_create
end

Public Instance Methods

graceful_terminate() click to toggle source

Graceful terminate this downloader

# File lib/news_crawler/downloader.rb, line 80
# Request a graceful stop: raise the stop flag and block until the
# download loop has left the :running state.
def graceful_terminate
  @stoping = true
  sleep(1) while @status == :running
end
run() click to toggle source

Start the downloader with the current queue. Successfully fetched URLs are marked as visited and the results are stored in the DB

# File lib/news_crawler/downloader.rb, line 54
# Start downloading the current URL set with a Typhoeus hydra.
# Successfully fetched URLs are marked as visited on the queue and the
# response body is stored via Storage::RawData; failures are logged and
# left unvisited. When the batch finishes, falls back to waiting for
# new URLs.
def run
  @status = :running
  hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
  # TODO Log here
  # Drop URLs disallowed by robots.txt before queueing any request.
  @urls = @urls.keep_if do | url |
    Robots.instance.allowed? url
  end
  # The original assigned the mapped requests to an unused local;
  # `each` makes the side-effecting intent explicit.
  @urls.each do | url |
    request = Typhoeus::Request.new(url, followlocation: true)
    request.on_complete do | response |
      if response.success?
        Storage::RawData.add(url, response.response_body)
        @queue.mark_visited url
      else
        NCLogger.get_logger.warn("[WARNING] Fetch error [#{url}]")
      end
    end
    hydra.queue request
  end
  hydra.run
  @urls = []
  wait_for_url
end

Private Instance Methods

get_new_url() click to toggle source
# File lib/news_crawler/downloader.rb, line 107
# Refresh the working URL set with the queue's currently unvisited URLs.
def get_new_url
  @urls = @queue.find_unvisited
end
wait_for_url() click to toggle source

Wait for new URLs to be added to the queue, using a backoff algorithm

# File lib/news_crawler/downloader.rb, line 89
# Wait for new URLs to be added to the queue, polling with exponential
# backoff (delay doubles from 1s until it reaches 30s).
#
# Fixes over the original:
# * the self-recursive backoff is now a loop, so long idle periods no
#   longer grow the call stack without bound;
# * the stop flag is rechecked after waking, so graceful_terminate is
#   honoured even when new URLs arrived during the last sleep.
# NOTE(review): run and wait_for_url still invoke each other, so the
# stack grows slowly across run/wait cycles — TODO flatten that too.
def wait_for_url
  @status = :waiting
  loop do
    # Bail out when graceful_terminate has requested a stop.
    return if @stoping
    sleep @wait_time
    get_new_url
    break unless @urls.empty?
    # Exponential backoff, capped once the delay reaches 30s.
    @wait_time *= 2 if @wait_time < 30
  end
  @wait_time = 1
  # Do not start a new batch if a stop arrived while we were sleeping.
  return if @stoping
  run
end