class Vessel::Scheduler

Attributes

browser[R]
delay[R]
headers[R]
queue[R]

Public Class Methods

new(queue, settings)
# File lib/vessel/scheduler.rb, line 13
# Sets up the scheduler: stores the result queue, reads the threading, delay
# and header settings, and starts the Ferrum browser used to load pages.
#
# queue    - object that receives the outcome of every scheduled request.
# settings - Hash; the keys read here are :min_threads, :max_threads, :delay,
#            :headers, :ferrum (browser options), :timeout and :intercept
#            (a callable registered as the browser's :request handler).
def initialize(queue, settings)
  @queue = queue
  @min_threads, @max_threads, @delay, @headers =
    settings.values_at(:min_threads, :max_threads, :delay, :headers)

  # Build the browser options on a private copy. Merging the timeout in place
  # would write it back into the caller's settings hash, and would raise when
  # :ferrum is frozen or not provided at all.
  options = (settings[:ferrum] || {}).dup
  options[:timeout] = settings[:timeout] if settings[:timeout]
  @browser = Ferrum::Browser.new(**options)

  if settings[:intercept]
    # Interception has to be switched on before the handler can see requests.
    @browser.network.intercept
    @browser.on(:request, &settings[:intercept])
  end
end

Public Instance Methods

post(*requests)
# File lib/vessel/scheduler.rb, line 28
# Schedules every given request on the thread pool. Each future loads its
# request through #goto and appends the outcome to the queue.
#
# Returns an Array with one future per request, in the order given.
def post(*requests)
  requests.map do |req|
    Concurrent::Promises.future_on(pool, queue, req) { |sink, item| sink << goto(item) }
  end
end
stop()
# File lib/vessel/scheduler.rb, line 36
# Shuts the scheduler down: asks the pool to stop, waits up to 30 seconds for
# running work to finish, kills the pool if it has not terminated by then, and
# finally quits the browser.
#
# Returns whatever browser.quit returns.
def stop
  begin
    pool.shutdown
    pool.kill unless pool.wait_for_termination(30)
  ensure
    # Quit the browser even when the pool shutdown raises or the wait is
    # interrupted; otherwise the browser would be left running.
    outcome = browser.quit
  end
  outcome
end

Private Instance Methods

goto(request)
# File lib/vessel/scheduler.rb, line 52
# Opens a browser page for +request+ and navigates it to the request URL.
#
# Returns [nil, request] for stub requests (no page is opened), and
# [page, request] after a successful navigation. Any StandardError raised
# along the way is rescued and returned as the result instead of being raised.
def goto(request)
  return [nil, request] if request.stub?

  page = browser.create_page
  page.headers.set(headers) if headers
  # Delay is set between requests when we don't want to bombard server with
  # requests so it requires crawler to be single threaded. Otherwise doesn't
  # make sense.
  sleep(delay) if @max_threads == 1 && delay > 0
  page.goto(request.url)
  [page, request]
rescue => e
  # Only the error is handed back, so no caller can close the page opened
  # above; release it here instead of leaking it. page is nil when the failure
  # happened before a page was created.
  begin
    page&.close
  rescue StandardError
    # Best-effort cleanup: the original failure is the one that gets reported.
    nil
  end
  e
end
pool()
# File lib/vessel/scheduler.rb, line 44
# Returns the thread pool that runs scheduled requests, creating it on first
# use and reusing the same instance afterwards. The pool is sized from the
# configured thread limits; max_queue: 0 is concurrent-ruby's setting for an
# unbounded work queue.
def pool
  return @pool if @pool

  limits = {
    max_queue: 0,
    min_threads: @min_threads,
    max_threads: @max_threads
  }
  @pool = Concurrent::ThreadPoolExecutor.new(**limits)
end