class Samao::Detector

Public Class Methods

new(params={}) { |self| ... } click to toggle source
# File lib/samao/detector.rb, line 5
# Builds a detector with empty crawl state and a pool of tokens that limits
# how many item workers may run at the same time.
#
# params:: options hash; +:max_concurrent+ is the number of tokens put into
#          the pool (5 when the key is absent or nil).
#
# When a block is given, the new detector is yielded to it.
def initialize(params={})
  # NOTE(review): matchable is defined outside this excerpt; presumably it
  # prepares the @on / @selector tables used by the other methods - confirm.
  matchable

  @max_page = nil
  @from = nil
  @baseurl = nil
  @current_url = nil
  @items = []
  @pages = []

  # Results produced by worker threads are handed over through this queue.
  @queue_of_items = Queue.new

  # A queue used as a counting semaphore: one entry per allowed worker.
  @semaphore = Queue.new
  @max_concurrent = params[:max_concurrent] || 5
  @max_concurrent.times { @semaphore << 1 }

  yield(self) if block_given?

  self
end

Public Instance Methods

add_detail(detail_key=:url, &block) click to toggle source
# File lib/samao/detector.rb, line 79
# Enables the detail step for every item.
#
# detail_key:: name of the item property whose value is used as the detail
#              URL (defaults to +:url+).
# block::      optional callback stored under +:detail+; it is invoked with
#              each detail object during #run.
#
# Returns the stored block, or nil when no block was given.
def add_detail(detail_key=:url, &block)
  @detail_key = detail_key
  return unless block

  @on[:detail] = block
end
baseurl(url) click to toggle source

Sets the base URL that relative page URLs are resolved against. Returns self.

# File lib/samao/detector.rb, line 103
# Stores +url+ as the base URL; #from falls back to it when no page has been
# visited yet. Returns self so calls can be chained.
def baseurl(url)
  tap { @baseurl = url }
end
concurrent(max)
Alias for: max_concurrent
find_item(selector, &block) click to toggle source
# File lib/samao/detector.rb, line 84
# Registers +selector+ (and the optional +block+) for the +:item+ kind by
# delegating to #find, and returns whatever #find returns.
def find_item(selector, &block)
  kind = :item
  find(kind, selector, &block)
end
from(url) click to toggle source

Sets the front page, i.e. the URL the next fetch starts from. Returns self.

# File lib/samao/detector.rb, line 89
# Makes +url+ the page to fetch next.
#
# A relative +url+ is resolved against the current page URL, or against the
# base URL when no page has been set yet. The resolved URL is appended to
# the list of pages and becomes the new current URL.
#
# Returns self so calls can be chained.
def from(url)
  base = @current_url || @baseurl
  url = URI.join(base, url) if base
  url = URI(url) unless url.is_a?(URI)

  # The catcher receives the previous current URL as its baseurl, so it has
  # to be built before @current_url is overwritten.
  @from = Catcher.new(url: url, baseurl: @current_url)
  @current_url = url
  @pages << url

  self
end
items() click to toggle source

get items

# File lib/samao/detector.rb, line 130
# Returns the array of collected items. It starts empty and #run appends the
# properties of each item its workers queued.
def items
  @items
end
max_concurrent(max) click to toggle source

Sets the maximum concurrency level. Returns self.

# File lib/samao/detector.rb, line 117
# Sets how many item workers may run at the same time.
#
# The limit is enforced through the token pool in @semaphore, so the pool is
# rebuilt here with +max+ tokens; storing the number alone would leave the
# limit at the value chosen when the detector was created.
#
# Call this before #run: swapping the pool while workers hold tokens would
# let more than +max+ of them run.
#
# max:: number of workers allowed to run concurrently.
#
# Returns self so calls can be chained.
def max_concurrent(max)
  @max_concurrent = max

  @semaphore = Queue.new
  max.times { @semaphore.push(1) }

  self
end
Also aliased as: concurrent
max_page(max) click to toggle source

Sets the maximum number of pages to visit. Returns self.

# File lib/samao/detector.rb, line 110
# Stores +max+ as the page limit; #run stops following next-page links once
# that many pages have been recorded. Returns self so calls can be chained.
def max_page(max)
  tap { @max_page = max }
end
pages() click to toggle source

get pages

# File lib/samao/detector.rb, line 125
# Returns the array of page URLs recorded so far; #from appends one entry
# each time it is called.
def pages
  @pages
end
run() click to toggle source

Runs the crawl and returns the Detector itself.

# File lib/samao/detector.rb, line 24
# Crawls page by page, starting from the page set with #from.
#
# For every page that is fetched successfully, each node matching the
# +:item+ selector is processed in its own thread. A worker waits for a
# token from @semaphore, builds the item (calling the +:item+ callback),
# optionally runs the detail step when a detail key is set, and queues the
# item's properties. A worker that raises prints the exception and
# contributes no item.
#
# After each page the crawl follows the link matched by the +:next+
# selector. It stops when the page limit is reached, when there is no such
# link (or it has no +href+), or when a fetch is not successful.
#
# Once all workers have finished, the queued results are moved into the
# items array. Their order depends on thread scheduling.
#
# Returns self.
def run
  threads = []
  while @from
    break unless @from.run.success?
    @current_doc = @from.doc

    # find items in current page
    found = @current_doc.css(@selector[:item])
    if found && found.size >= 1
      found.each do |raw_item|
        # The page URL is handed to the thread as an argument: workers may
        # only start after the loop has moved @current_url to the next page.
        threads << Thread.new(raw_item, @current_url) do |raw, page_url|
          @semaphore.pop # wait for a concurrency token

          begin
            item = Item.new(baseurl: page_url, raw_item: raw) do |new_item|
              @on[:item].call(new_item) if @on[:item]
            end.run

            if @detail_key
              Detail.new(item: item, url: item.prop(@detail_key)) do |new_detail|
                @on[:detail].call(new_detail) if @on[:detail]
              end.run
            end

            @queue_of_items.push item.prop
          rescue => e
            # Best effort: one broken item must not abort the whole crawl.
            p e
          ensure
            @semaphore.push(1) # always give the token back
          end
        end # end Thread
      end # end found.each loop
    end # end if found

    # find next page in current page
    if @max_page && @pages.size >= @max_page
      stop
    elsif @selector[:next] &&
          (next_link = @current_doc.at_css(@selector[:next])) &&
          (href = next_link['href'])
      next_url = URI.join(@current_url, href)
      @on[:next].call(next_url) if @on[:next]
      from next_url
    else
      stop
    end
  end # end while @from

  threads.each(&:join)

  # Failed workers queue nothing, so take what is actually there instead of
  # popping once per thread (which would block on an empty queue).
  @items << @queue_of_items.pop until @queue_of_items.empty?

  self
end

Private Instance Methods

stop() click to toggle source
# File lib/samao/detector.rb, line 135
# Clears the pending page so the loop in #run ends after the current
# iteration. Returns self.
def stop
  tap { @from = nil }
end