class Ubi::Aranea
Base class for araneas (spiders).
Constants
- OPTIONS
Attributes
datum[RW]
html[RW]
opts[RW]
text[RW]
thema[RW]
url[RW]
Public Class Methods
new(thema, url, opts = {})
click to toggle source
# File lib/ubi/aranea.rb, line 18
# Builds a spider for one thema/url pair with empty result buffers.
#
# @param thema [Object] subject this spider collects data for
# @param url [Object] starting address, later handed to the page iterator
# @param opts [Hash] per-instance overrides merged on top of OPTIONS
def initialize(thema, url, opts = {})
  @thema = thema
  @url   = url
  # Keys given in opts win over the class-level OPTIONS defaults.
  @opts  = OPTIONS.merge(opts)
  @datum = []
  @html  = []
  # parse appends to this buffer with <<, so it must be mutable even when
  # the file is loaded with `# frozen_string_literal: true`. Unary plus
  # returns the literal itself when it is not frozen, or an unfrozen copy.
  @text  = +''
end
Public Instance Methods
crawl!()
click to toggle source
# File lib/ubi/aranea.rb, line 29
# Stamps the run time, announces the crawl on stdout and walks the pages
# reachable from +url+, feeding each page's source to #parse.
#
# NOTE(review): `name` is not among this class's documented attributes or
# public methods; presumably it is defined privately or by subclasses --
# confirm, otherwise the log line raises NameError.
#
# Returns the Retriever::PageIterator that was created.
def crawl!
  @last_run = Time.now
  puts "Crawler start #{name} #{url}"
  Retriever::PageIterator.new(url, opts) do |fetched|
    parse(fetched.source)
    # Debug dump of the page headings.
    headings = [fetched.title, fetched.h1, fetched.h2]
    p headings
  end
end
parse(chunk)
click to toggle source
# File lib/ubi/aranea.rb, line 39
# Records one raw chunk: keeps the chunk itself, its Nokogiri HTML
# document, and appends the document's text to the accumulated text.
#
# @param chunk [Object] raw page source passed to Nokogiri::HTML
# @return [String] the accumulated text buffer
def parse(chunk)
  @datum.push(chunk)
  @html.push(Nokogiri::HTML(chunk))
  # Read the document back through the accessor, as the text of the
  # most recently stored page.
  latest_document = html.last
  @text.concat(latest_document.text)
end
to_s()
click to toggle source
# File lib/ubi/aranea.rb, line 50
# One-line summary: the thema, the number of stored HTML documents and
# the size of the accumulated text.
#
# @return [String]
def to_s
  summary = "#{thema} html: #{html.size} txt: #{text.size}"
  summary
end
work()
click to toggle source
# File lib/ubi/aranea.rb, line 45
# Runs the crawl unless @last_run is already set (crawl! sets it when it
# starts), so repeated calls do not crawl again.
#
# @return [true] always
def work
  return true if @last_run

  crawl!
  true
end