class Samao::Detector
Public Class Methods
new(params={}) { |self| ... }
click to toggle source
# File lib/samao/detector.rb, line 5
# Builds a detector with empty crawl state and a pool of concurrency tokens.
#
# params:: options hash; only +:max_concurrent+ is read here (defaults to 5).
#
# Yields the new detector to the optional block so it can be configured
# in place.
def initialize(params={})
  # Defined outside this method; presumably prepares the selector/callback
  # tables (@selector, @on) used by the other methods -- confirm.
  matchable

  @max_page = nil
  @from = nil
  @baseurl = nil
  @current_url = nil

  @pages = []
  @items = []
  @queue_of_items = Queue.new

  # One token per allowed concurrent worker; a worker takes a token before
  # it starts and gives it back when it finishes.
  @semaphore = Queue.new
  @max_concurrent = params[:max_concurrent] || 5
  @max_concurrent.times { @semaphore << 1 }

  yield self if block_given?
  self
end
Public Instance Methods
add_detail(detail_key=:url, &block)
click to toggle source
# File lib/samao/detector.rb, line 79
# Enables the detail step for every item found by #run.
#
# detail_key:: item property passed to +item.prop+ to obtain the detail
#              URL (defaults to +:url+).
# block::      optional callback stored as the +:detail+ handler; #run calls
#              it with each detail object.
#
# Returns the detector itself, like the other configuration methods, so the
# call can be chained whether or not a block is given.
def add_detail(detail_key=:url, &block)
  @detail_key = detail_key
  @on[:detail] = block if block
  self
end
baseurl(url)
click to toggle source
Sets the base URL and returns the detector.
# File lib/samao/detector.rb, line 103
# Stores +url+ as the base URL. #from resolves its argument against it
# when no page has been visited yet.
#
# Returns the detector itself so calls can be chained.
def baseurl(url)
  tap { @baseurl = url }
end
find_item(selector, &block)
click to toggle source
# File lib/samao/detector.rb, line 84
# Registers the selector (and optional per-item block) for items.
#
# Delegates to +find+ (defined outside this method) with the +:item+ kind
# and returns whatever +find+ returns.
def find_item(selector, &block)
  kind = :item
  find(kind, selector, &block)
end
from(url)
click to toggle source
Sets the front (starting) page URL and returns the detector.
# File lib/samao/detector.rb, line 89
# Sets the page to fetch next.
#
# url:: a String or URI; when a current page or base URL is already known
#       it is resolved against that (current page first).
#
# Creates a Catcher for the resolved URL, appends the URL to the visited
# page list and makes it the current URL.
#
# Returns the detector itself so calls can be chained.
def from(url)
  reference = @current_url || @baseurl
  url = URI.join(reference, url) if reference
  url = URI(url) unless url.is_a?(URI)

  # The Catcher gets the previous current URL as its base (nil on the
  # first call).
  @from = Catcher.new(url: url, baseurl: @current_url)
  @pages << url
  @current_url = url
  self
end
items()
click to toggle source
Returns the items collected so far.
# File lib/samao/detector.rb, line 130
# Returns the array that #run appends collected item properties to
# (the live array, not a copy).
def items
  @items
end
max_concurrent(max)
click to toggle source
Sets the maximum number of items processed concurrently and returns the detector.
# File lib/samao/detector.rb, line 117
# Sets how many item workers may run at the same time.
#
# max:: a positive Integer.
#
# The limit is enforced through the token queue that #run's workers pop
# from, so the queue is refilled here to hold exactly +max+ tokens; storing
# the number alone would leave the limit chosen at construction in force.
# Call this before #run: refilling while workers hold tokens would let more
# than +max+ of them run.
#
# Raises ArgumentError when +max+ is not a positive Integer (with zero
# tokens every worker would block forever).
#
# Returns the detector itself so calls can be chained.
def max_concurrent(max)
  unless max.is_a?(Integer) && max > 0
    raise ArgumentError, "max_concurrent must be a positive Integer, got #{max.inspect}"
  end

  @max_concurrent = max
  @semaphore ||= Queue.new
  @semaphore.clear
  max.times { @semaphore.push(1) }
  self
end
Also aliased as: concurrent
max_page(max)
click to toggle source
Sets the maximum number of pages to visit and returns the detector.
# File lib/samao/detector.rb, line 110
# Stores the page limit. #run stops following next-page links once the
# number of visited pages reaches it.
#
# Returns the detector itself so calls can be chained.
def max_page(max)
  tap { @max_page = max }
end
pages()
click to toggle source
Returns the URLs of the pages visited so far.
# File lib/samao/detector.rb, line 125
# Returns the array of page URLs recorded by #from (the live array, not a
# copy).
def pages
  @pages
end
run()
click to toggle source
Runs the crawl and returns the Detector itself (self), so calls can be chained.
# File lib/samao/detector.rb, line 24
# Crawls from the page set with #from, following next-page links, and
# collects the properties of every item found into #items.
#
# For each page: fetch it, start one worker thread per node matching the
# +:item+ selector, then move on to the next page or stop. The loop ends
# when a fetch is unsuccessful, the page limit is reached, or no next page
# is found. All workers are joined before results are collected.
#
# Items whose processing raises a StandardError are reported with +p+ and
# skipped. Results are appended in the order workers finish.
#
# Returns the detector itself.
def run
  workers = []

  while @from
    break unless @from.run.success?

    @current_doc = @from.doc
    # Snapshot the page URL: workers may only get to run after #from has
    # already pointed @current_url at the next page.
    page_url = @current_url

    found = @current_doc.css(@selector[:item])
    (found || []).each do |raw_item|
      workers << spawn_item_worker(raw_item, page_url)
    end

    if @max_page && @pages.size >= @max_page
      stop
    elsif (next_url = next_page_url)
      @on[:next].call(next_url) if @on[:next]
      from next_url
    else
      stop
    end
  end

  workers.each(&:join)
  # Drain what was actually produced. A failed worker pushes nothing, so
  # popping once per worker would block forever on the first failure.
  @items << @queue_of_items.pop until @queue_of_items.empty?
  self
end

# Starts a thread that builds one item (and its detail, when a detail key
# is configured) and pushes the item's properties onto the result queue.
def spawn_item_worker(raw_item, page_url)
  Thread.new do
    # Take a concurrency token; blocks while none is available.
    @semaphore.pop
    begin
      item = Item.new(baseurl: page_url, raw_item: raw_item) do |new_item|
        @on[:item].call(new_item) if @on[:item]
      end.run

      if @detail_key
        Detail.new(item: item, url: item.prop(@detail_key)) do |new_detail|
          @on[:detail].call(new_detail) if @on[:detail]
        end.run
      end

      @queue_of_items.push item.prop
    rescue => e
      p e
    ensure
      # Always hand the token back so waiting workers can proceed.
      @semaphore.push(1)
    end
  end
end
private :spawn_item_worker

# Resolves the next-page link of the current document against the current
# URL. Returns nil when no +:next+ selector is set, nothing matches it, or
# the matched node has no +href+.
def next_page_url
  selector = @selector[:next]
  return unless selector

  link = @current_doc.at_css(selector)
  href = link && link['href']
  return unless href

  URI.join(@current_url, href)
end
private :next_page_url
Private Instance Methods
stop()
click to toggle source
# File lib/samao/detector.rb, line 135
# Clears the pending page so the crawl loop in #run ends.
#
# Returns the detector itself.
def stop
  tap { @from = nil }
end