class Ubi::Aranea

Base for araneas (spiders)

Constants

OPTIONS

Attributes

datum[RW]
html[RW]
opts[RW]
text[RW]
thema[RW]
url[RW]

Public Class Methods

new(thema, url, opts = {}) click to toggle source
# File lib/ubi/aranea.rb, line 18
# Builds a spider for one subject and one start URL.
#
# @param thema [Object] subject this spider works for; stored as given
# @param url [Object] location later handed to the page iterator by #crawl!
# @param opts [Hash] overrides merged on top of the class-level OPTIONS
def initialize(thema, url, opts = {})
  @thema = thema
  @url   = url
  @opts  = OPTIONS.merge(opts) # caller-supplied keys win over the defaults
  @datum = []
  @html  = []
  # #parse appends to this buffer in place with <<, so it must be an
  # unfrozen String. A bare '' literal is frozen under
  # `# frozen_string_literal: true`; String.new('') never is.
  @text  = String.new('')
end

Public Instance Methods

crawl!() click to toggle source
# File lib/ubi/aranea.rb, line 29
# Runs the crawl: stamps the start time in @last_run, prints a start
# banner, then hands +url+ and +opts+ to Retriever::PageIterator together
# with a block that parses every page the iterator yields.
#
# NOTE(review): +name+ is not among the attributes visible here; it is
# presumably defined elsewhere in the class or a subclass -- confirm.
#
# @return whatever Retriever::PageIterator.new returns
def crawl!
  @last_run = Time.now
  puts "Crawler start #{name} #{url}"

  Retriever::PageIterator.new(url, opts) do |fetched|
    parse(fetched.source)
    # Debug trace of the page's title and top-level headings.
    headings = [fetched.title, fetched.h1, fetched.h2]
    p headings
  end
end
parse(chunk) click to toggle source
# File lib/ubi/aranea.rb, line 39
# Stores one chunk of markup and its parsed form.
#
# The raw chunk is appended to @datum, the Nokogiri document built from it
# to @html, and that document's text to the running @text buffer.
#
# @param chunk markup passed to Nokogiri::HTML
# @return the @text buffer after the append
def parse(chunk)
  @datum << chunk

  document = Nokogiri::HTML(chunk)
  @html << document

  # No separator is inserted between the text of successive chunks.
  @text << document.text
end
to_s() click to toggle source
# File lib/ubi/aranea.rb, line 50
# One-line summary of the spider: its thema, followed by the size of
# +html+ and the size of +text+.
#
# @return [String]
def to_s
  subject     = thema
  page_count  = html.size
  text_length = text.size

  "#{subject} html: #{page_count} txt: #{text_length}"
end
work() click to toggle source
# File lib/ubi/aranea.rb, line 45
# Ensures the spider has crawled: triggers #crawl! only when @last_run
# has not been set yet, and does nothing on later calls.
#
# @return [true] always
def work
  return true if @last_run

  crawl!
  true
end