class Scrapah::Scraper
Public Class Methods
new(scrape_type=:openuri, caching=false)
click to toggle source
TODO Patterns class, for recursive-autodiscovery proxy-switching etc… ?
# File lib/scrapah/scraper.rb, line 23
# Build a scraper. scrape_type selects the fetch backend (:openuri or
# :headless); caching, when true, loads a Scrapah::Cache immediately.
def initialize(scrape_type = :openuri, caching = false)
  @access_type = scrape_type
  @current_url = ''
  @caching     = caching
  return unless @caching

  @cache = Scrapah::Cache.new
  @cache.load
  # .start automatically?
end
Public Instance Methods
get(url)
click to toggle source
# File lib/scrapah/scraper.rb, line 68
# Fetch +url+ and return a Nokogiri document. With caching enabled the
# cached copy is parsed (populating the cache first when the url is
# absent); otherwise the page is retrieved directly via the backend.
def get(url)
  @current_url = url
  return get_appropriate(url) unless @caching

  # NOTE(review): `go(url)` is not defined in this file — presumably it
  # should populate the cache (cf. visit); confirm against full source.
  go(url) unless @cache.has_key?(url)
  Nokogiri::HTML(@cache.get(url))
end
process(input)
click to toggle source
TODO: split into process! and process.
# File lib/scrapah/scraper.rb, line 82
# Run one extraction command (Regexp, Proc, or "x|"/"c|" string) — or a
# Hash of them — against the current url's document. For a Hash, returns
# a Hash mapping each key to its command's result.
# Fix: the original ended with an unreachable bare `nil` after both
# branches had already returned; removed.
def process(input)
  doc = get(@current_url)
  if input.is_a?(Hash)
    input.each_with_object({}) { |(k, v), result| result[k] = process_appropriate(doc, v) }
  else
    process_appropriate(doc, input)
  end
end
start()
click to toggle source
# File lib/scrapah/scraper.rb, line 37
# Spin up the headless display and a Watir browser; no-op for other
# access types.
def start
  return unless @access_type == :headless

  @headless = Headless.new
  @headless.start
  @browser = Watir::Browser.new # default browser
end
stop()
click to toggle source
# File lib/scrapah/scraper.rb, line 46
# Tear down the browser and headless display; no-op for other access
# types.
def stop
  return unless @access_type == :headless

  @browser.close
  @headless.destroy
end
visit(url)
click to toggle source
# File lib/scrapah/scraper.rb, line 55
# Fetch +url+ and store its source in the cache. Returns nil without
# fetching when caching is disabled.
def visit(url)
  @current_url = url
  return nil unless @caching

  doc = get_appropriate(url)
  @cache.store(url, doc.to_s)
  @cache.save # TODO ???
end
Private Instance Methods
get_appropriate(url)
click to toggle source
TODO: retry & retry strategies. Returns Nokogiri docs.
# File lib/scrapah/scraper.rb, line 102
# Fetch +url+ with the configured backend, retrying up to 4 times with
# a 1.5s pause between attempts. Returns a Nokogiri document.
def get_appropriate(url)
  retryable :tries => 4, :sleep => 1.5 do
    case @access_type
    when :headless then return get_headless(url)
    when :openuri  then return get_openuri(url)
    end
  end
end
get_headless(url)
click to toggle source
# File lib/scrapah/scraper.rb, line 109
# Navigate the headless browser to +url+ and parse the rendered HTML.
# started_headless? raises when start was never called.
def get_headless(url)
  return nil unless started_headless?

  @browser.goto url
  Nokogiri::HTML(@browser.html)
end
get_openuri(url)
click to toggle source
# File lib/scrapah/scraper.rb, line 116
# Fetch +url+ with open-uri and parse it into a Nokogiri document.
# Fix: Kernel#open is dangerous on externally supplied strings — a url
# beginning with "|" spawns a shell command. URI.open dispatches through
# open-uri explicitly and cannot do that.
def get_openuri(url)
  require 'open-uri' # lazy: only this backend needs it
  Nokogiri::HTML(URI.open(url))
end
process_appropriate(doc,cmd)
click to toggle source
Accepts Nokogiri docs only at the moment.
# File lib/scrapah/scraper.rb, line 131
# Dispatch an extraction command: Regexp -> scan, Proc -> call,
# "x|..." -> xpath, "c|..." -> css. Anything else yields nil.
# Accepts Nokogiri documents only at the moment.
def process_appropriate(doc, cmd)
  case cmd
  when Regexp then process_regex(doc, cmd)
  when Proc   then process_proc(doc, cmd)
  when String
    if cmd.start_with?('x|')
      process_xpath(doc, cmd)
    elsif cmd.start_with?('c|')
      process_css(doc, cmd)
    end
  end
end
process_css(doc,css)
click to toggle source
# File lib/scrapah/scraper.rb, line 154
# Run a "c|"-prefixed CSS selector against +doc+ and sanitize the result.
# Fix: the original used css.slice!('c|'), which (a) mutated the
# caller's string in place and (b) removed the first "c|" occurring
# anywhere in the selector, not just the prefix. Strip the prefix
# non-destructively and anchored instead.
def process_css(doc, css)
  selector = css.sub(/\Ac\|/, '')
  sanitize_nokogiri doc.css(selector)
end
process_proc(doc,proc)
click to toggle source
# File lib/scrapah/scraper.rb, line 159
# Apply a caller-supplied callable to the document and return its result.
def process_proc(doc, proc)
  proc.(doc)
end
process_regex(doc,regex)
click to toggle source
# File lib/scrapah/scraper.rb, line 145
# Scan the document's string form with +regex+ and flatten any capture
# groups into a single array of matches.
def process_regex(doc, regex)
  matches = doc.to_s.scan(regex)
  matches.flatten
end
process_xpath(doc,xpath)
click to toggle source
# File lib/scrapah/scraper.rb, line 149
# Run an "x|"-prefixed XPath expression against +doc+ and sanitize the
# result.
# Fix: the original used xpath.slice!('x|'), which (a) mutated the
# caller's string in place and (b) removed the first "x|" occurring
# anywhere in the expression, not just the prefix. Strip the prefix
# non-destructively and anchored instead.
def process_xpath(doc, xpath)
  expression = xpath.sub(/\Ax\|/, '')
  sanitize_nokogiri doc.xpath(expression)
end
sanitize_nokogiri(stuff)
click to toggle source
# File lib/scrapah/scraper.rb, line 164
# Convert a Nokogiri node set to plain strings: a single-element set is
# serialized whole (one string), otherwise each node is serialized into
# an array of strings.
def sanitize_nokogiri(stuff)
  return stuff.to_s if stuff.count == 1

  stuff.map(&:to_s)
end
started_headless?()
click to toggle source
# File lib/scrapah/scraper.rb, line 121
# Guard for :headless operations: true once start has created both the
# headless display and the browser; raises otherwise.
# Fix: the original had an unreachable `return false` immediately after
# the raise; removed.
def started_headless?
  raise 'Call Scraper.start first when using :headless' if @browser.nil? || @headless.nil?

  true
end