class Scrapah::Scraper

Public Class Methods

new(scrape_type=:openuri, caching=false) click to toggle source

TODO: add a Patterns class for recursive autodiscovery, proxy switching, etc.?

# File lib/scrapah/scraper.rb, line 23
# Sets up the scraper state.
#
# scrape_type - stored as the access type; other methods in this class check
#               it against :headless and :openuri (default :openuri).
# caching     - when truthy, a Scrapah::Cache is created and loaded.
def initialize(scrape_type = :openuri, caching = false)
  @current_url = ''
  @access_type = scrape_type
  @caching     = caching

  # No cache object is built unless caching was requested.
  return unless @caching

  @cache = Scrapah::Cache.new
  @cache.load

  # .start automatically?
end

Public Instance Methods

get(url) click to toggle source
# File lib/scrapah/scraper.rb, line 68
# Fetches +url+, records it as the current URL and returns the parsed page.
#
# With caching enabled, a URL that is not yet in the cache is first fetched
# and stored through #visit; the document is then parsed from the cached
# source. Without caching the page is fetched directly via #get_appropriate.
def get(url)
  @current_url = url

  return get_appropriate(url) unless @caching

  # This used to call `go`, which is not defined in this class; #visit is the
  # method that fetches a page and writes it into the cache.
  visit(url) unless @cache.has_key?(url)
  Nokogiri::HTML(@cache.get(url))
end
process(input) click to toggle source

TODO: split into process! and process…

# File lib/scrapah/scraper.rb, line 82
# Runs one command, or a Hash of named commands, against the current URL.
#
# input - a single command, or a Hash whose values are commands.
#
# Returns the command's result; for a Hash, a new Hash with the same keys
# mapped to each command's result.
def process(input)
  # Load the document for whatever URL was fetched or visited last.
  page = get(@current_url)

  return process_appropriate(page, input) unless input.is_a?(Hash)

  input.each_with_object({}) do |(key, command), results|
    results[key] = process_appropriate(page, command)
  end
end
start() click to toggle source
# File lib/scrapah/scraper.rb, line 37
# Starts a Headless instance and opens a Watir browser when the access type
# is :headless; does nothing for any other access type.
def start
  return unless @access_type == :headless

  @headless = Headless.new
  @headless.start
  @browser = Watir::Browser.new # default browser
end
stop() click to toggle source
# File lib/scrapah/scraper.rb, line 46
# Closes the browser and destroys the headless display when the access type
# is :headless; does nothing for any other access type.
#
# Safe to call when #start was never run (nothing to tear down), and the
# headless display is destroyed even if closing the browser raises.
def stop
  return unless @access_type == :headless

  begin
    @browser.close if @browser
  ensure
    # Runs on the error path too, so the display is not leaked.
    @headless.destroy if @headless
  end
end
visit(url) click to toggle source
# File lib/scrapah/scraper.rb, line 55
# Records +url+ as the current URL and, when caching is on, fetches the page
# and writes its source into the cache.
#
# Returns nil when caching is off, otherwise whatever the cache's #save
# returns.
def visit(url)
  @current_url = url
  return nil unless @caching

  # The cache holds the document's string form.
  source = get_appropriate(url).to_s
  @cache.store(url, source)
  @cache.save # TODO ???
end

Private Instance Methods

get_appropriate(url) click to toggle source

TODO: retry & retry strategies. Returns Nokogiri docs.

# File lib/scrapah/scraper.rb, line 102
# Fetches +url+ with the fetcher matching the configured access type, inside
# a `retryable` block (4 tries, 1.5 sleep).
#
# Returns the fetcher's result for :headless or :openuri; for any other
# access type the block yields nil.
def get_appropriate(url)
  retryable(tries: 4, sleep: 1.5) do
    case @access_type
    when :headless then return get_headless(url)
    when :openuri  then return get_openuri(url)
    end
  end
end
get_headless(url) click to toggle source
# File lib/scrapah/scraper.rb, line 109
# Navigates the browser to +url+ and parses the resulting page HTML with
# Nokogiri. Returns nil if #started_headless? is falsy.
def get_headless(url)
  return nil unless started_headless?

  @browser.goto(url)
  page_source = @browser.html
  Nokogiri::HTML(page_source)
end
get_openuri(url) click to toggle source
# File lib/scrapah/scraper.rb, line 116
# Downloads +url+ through open-uri and parses the response with Nokogiri.
#
# URI.open is used instead of the bare Kernel#open: since Ruby 3.0
# Kernel#open no longer opens URLs, and it also treats a string starting
# with "|" as a command to run, which is unsafe for URL input.
# NOTE(review): relies on open-uri already being required by this file, as
# the previous open(url) call did — confirm.
def get_openuri(url)
  Nokogiri::HTML(URI.open(url))
end
process_appropriate(doc,cmd) click to toggle source

Accepts only Nokogiri docs at the moment.

# File lib/scrapah/scraper.rb, line 131
# Dispatches +cmd+ to the matching processor based on its type:
# a Regexp goes to #process_regex, a Proc to #process_proc, a String
# prefixed with "x|" to #process_xpath and one prefixed with "c|" to
# #process_css. Anything else yields nil.
def process_appropriate(doc, cmd)
  case cmd
  when Regexp then process_regex(doc, cmd)
  when Proc   then process_proc(doc, cmd)
  when String
    if cmd.start_with?("x|")
      process_xpath(doc, cmd)
    elsif cmd.start_with?("c|")
      process_css(doc, cmd)
    end
  end
end
process_css(doc,css) click to toggle source
# File lib/scrapah/scraper.rb, line 154
# Runs a CSS query against +doc+ and returns the sanitized result.
#
# css - selector string carrying the "c|" marker, which is removed before
#       querying.
def process_css(doc, css)
  # sub returns a copy, so the caller's string keeps its marker: the same
  # selector can be processed again and frozen strings are accepted.
  # (slice! used to strip the marker from the caller's object in place.)
  selector = css.sub('c|', '')
  sanitize_nokogiri doc.css(selector)
end
process_proc(doc,proc) click to toggle source
# File lib/scrapah/scraper.rb, line 159
# Invokes the given callable with +doc+ and returns its result.
def process_proc(doc, proc)
  proc.(doc)
end
process_regex(doc,regex) click to toggle source
# File lib/scrapah/scraper.rb, line 145
# Scans the string form of +doc+ with +regex+ and returns every match (or
# every capture group, when the pattern has groups) as one flat Array.
def process_regex(doc, regex)
  source  = doc.to_s
  matches = source.scan(regex)
  matches.flatten
end
process_xpath(doc,xpath) click to toggle source
# File lib/scrapah/scraper.rb, line 149
# Runs an XPath query against +doc+ and returns the sanitized result.
#
# xpath - expression string carrying the "x|" marker, which is removed
#         before querying.
def process_xpath(doc, xpath)
  # sub returns a copy, so the caller's string keeps its marker: the same
  # expression can be processed again and frozen strings are accepted.
  # (slice! used to strip the marker from the caller's object in place.)
  expression = xpath.sub('x|', '')
  sanitize_nokogiri doc.xpath(expression)
end
sanitize_nokogiri(stuff) click to toggle source
# File lib/scrapah/scraper.rb, line 164
# Converts a query result to strings: a collection holding exactly one
# element becomes a single String (the collection's own #to_s); anything
# else becomes an Array with each element's #to_s.
def sanitize_nokogiri(stuff)
  if stuff.count == 1
    stuff.to_s
  else
    stuff.map(&:to_s)
  end
end
started_headless?() click to toggle source
# File lib/scrapah/scraper.rb, line 121
# Confirms that both the browser and the headless instance have been set.
#
# Returns true when both are present.
# Raises RuntimeError when either is nil.
def started_headless?
  raise 'Call Scraper.start first when using :headless' if @browser.nil? || @headless.nil?

  true
end