class PageByPage::Fetch
Public Class Methods
new(opt = {}, &block)
click to toggle source
Calls superclass method
PageByPage::Common::new
# File lib/page_by_page/fetch.rb, line 11
# Sets the paging defaults (@from = 1, @step = 1, @to = Float::INFINITY),
# forwards +opt+ and the block to the superclass initializer, then builds the
# enum that items_enum pulls page numbers from.
#
# The enum is created only after +super+ returns, so enum_options reads
# whatever @from/@step/@enumerator hold at that point.
# NOTE(review): presumably the superclass initializer applies +opt+ and/or
# evaluates the block (which is how #threads would get called) -- confirm.
def initialize(opt = {}, &block)
  @from = 1
  @step = 1
  @to = Float::INFINITY
  super
  page_numbers = Enum.new(enum_options)
  # defined? is true as soon as @threads has been assigned at all.
  @enum = defined?(@threads) ? MutexEnum.new(page_numbers) : page_numbers
end
Public Instance Methods
enumerator(e)
click to toggle source
# File lib/page_by_page/fetch.rb, line 34
# Stores +e+ in @enumerator; enum_options passes it on as the :enumerator
# option. Returns +e+.
def enumerator(e)
  @enumerator = e
end
from(n)
click to toggle source
# File lib/page_by_page/fetch.rb, line 22
# Stores +n+ in @from (initialize defaults it to 1); enum_options passes it
# on as the :from option. Returns +n+.
def from(n)
  @from = n
end
iterator()
click to toggle source
# File lib/page_by_page/fetch.rb, line 47
# Returns an Enumerator over individual items: it walks items_enum and emits
# every element of each page's item collection in turn, dropping the page
# number. Nothing is fetched until the returned Enumerator is consumed.
def iterator
  Enumerator.new do |yielder|
    items_enum.each do |_page, page_items|
      page_items.each { |item| yielder << item }
    end
  end
end
process()
click to toggle source
# File lib/page_by_page/fetch.rb, line 38
# Fetches every page and returns all items as one flat Array.
#
# Uses parallel_fetch when @threads has been assigned, otherwise _fetch; both
# return a Hash of page number => items. Entries are sorted before their items
# are concatenated, and nil item values are skipped.
def process
  pages = defined?(@threads) ? parallel_fetch : _fetch
  # Emits a bare newline when progress reporting is on.
  puts if @progress

  collected = []
  pages.sort.each do |_page_num, items|
    collected.concat(items) unless items.nil?
  end
  collected
end
step(n)
click to toggle source
# File lib/page_by_page/fetch.rb, line 26
# Stores +n+ in @step (initialize defaults it to 1); enum_options passes it
# on as the :step option. Returns +n+.
def step(n)
  @step = n
end
threads(n)
click to toggle source
# File lib/page_by_page/fetch.rb, line 30
# Stores +n+ in @threads. Once @threads is assigned, initialize wraps the enum
# in MutexEnum and process goes through parallel_fetch, which starts
# @threads threads. Returns +n+.
def threads(n)
  @threads = n
end
url(tmpl)
click to toggle source
# File lib/page_by_page/fetch.rb, line 18
# Compiles the string +tmpl+ into an ERB template and stores it in @tmpl;
# items_enum renders that template once per page. Returns the ERB object.
def url(tmpl)
  template = ERB.new(tmpl)
  @tmpl = template
end
Protected Instance Methods
_fetch()
click to toggle source
# File lib/page_by_page/fetch.rb, line 59
# Drains items_enum and returns a Hash mapping each yielded page number to the
# items yielded with it.
def _fetch
  items_enum.each_with_object({}) do |(page_num, items), pages|
    pages[page_num] = items
  end
end
enum_options()
click to toggle source
# File lib/page_by_page/fetch.rb, line 101
# Returns the options Hash that initialize passes to Enum.new: the current
# @from, @step and @enumerator plus the value of +limit+ (a method defined
# outside this view).
def enum_options
  {
    from: @from,
    step: @step,
    limit: limit,
    enumerator: @enumerator
  }
end
items_enum()
click to toggle source
# File lib/page_by_page/fetch.rb, line 69 def items_enum Enumerator.new do |yielder| items = [nil] catch :no_more do until items.empty? n = @enum.next break if n.nil? url = @tmpl.result binding doc = parse url items = doc.css @selector yielder.yield(n, items) update_progress Thread.current, n if @progress sleep @interval if @interval end end end end
parallel_fetch()
click to toggle source
# File lib/page_by_page/fetch.rb, line 89
# Runs _fetch in @threads threads at once and merges their results.
#
# Each thread's result is read with Thread#value, which waits for the thread
# and returns what its block returned, so no thread-local side channel is
# needed. An exception raised inside a thread is re-raised here when its value
# is read.
#
# Returns a single Hash combining every thread's page number => items Hash;
# on a duplicate key the later thread's entry wins.
def parallel_fetch
  workers = @threads.times.map { Thread.new { _fetch } }
  workers.each_with_object({}) do |worker, pages|
    pages.merge!(worker.value)
  end
end