class Scraptory

Attributes

config[R]

Public Class Methods

new(config={}) click to toggle source
# File lib/scraptory.rb, line 23
def initialize(config={})
        @proxies = []
        @proxy_cursor=-1

        set_config(config)
end

Public Instance Methods

_debug(msg,lvl="warn") click to toggle source
# File lib/scraptory.rb, line 169
def _debug(msg,lvl="warn")
        if @config["debug"]
                if lvl == "warn"
                        @logger.warn msg
                elsif lvl == "info"
                        @logger.info msg
                elsif lvl == "debug"
                        @logger.debug msg
                elsif lvl == "error"
                        @logger.error msg
                else
                        @logger.warn msg
                end
        end
end
add_proxy(proxy) click to toggle source
# File lib/scraptory.rb, line 165
def add_proxy(proxy)
        @proxies << proxy
end
build_request(url) click to toggle source
# File lib/scraptory.rb, line 90
def build_request(url)
        Typhoeus::Config.user_agent = UserAgents.rand()

        # proxy_cursor is set to 1 when no using any proxy
        if not @proxies.any? and @proxy_cursor > -1
                proxy = @proxies[@proxy_cursor]
                return Typhoeus::Request.new(url,
                        :timeout            => proxy.timeout,
                        :proxy                      => proxy.url,
                        :proxytype  => proxy.type)
        else
                return Typhoeus::Request.new(url, :timeout => @config['timeout'])
        end
end
on_request_complete(response,request,callback) click to toggle source
# File lib/scraptory.rb, line 105
def on_request_complete(response,request,callback)
        error_msg = "Unknow error"
        add_to_queue = false

        if response.success?
                # SUCCESS
                callback.call(response)
        elsif response.timed_out?
                error_msg = "Timed out ("+request.url+")"
                add_to_queue = true
        elsif response.code == 404
                error_msg = "404 Page not found ("+request.url+")"
                add_to_queue = false

        elsif response.code == 301 or response.code == 302
                error_msg = "301/302 Redirection not followed ("+request.url+")"
                add_to_queue = false

        elsif response.code == 0
                # Could not get an http response, something's wrong.
                error_msg = "Could not get an http response, something's wrong ("+request.url+") : "+response.return_message
                add_to_queue = true
        else
                # Received a non-successful http response.
                error_msg = "Received a non-successful http response ("+request.url+") : "+response.code.to_s
                add_to_queue = true
        end

        self._debug(error_msg)

        if add_to_queue
                @@count_connect_errors = @@count_connect_errors + 1

                if @@count_connect_errors > @config['err_before_chg_ip']
                        self._debug("Changing Proxy","info")

                        @proxies[@proxy_cursor].change_ip()

                        if @proxy_cursor == @proxies.length - 1 and @config["use_clearconnection"]
                                @proxy_cursor = -1
                        elsif @proxy_cursor == @proxies.length - 1 and !@config["use_clearconnection"]
                                @proxy_cursor = 0
                        else
                                @proxy_cursor = @proxy_cursor + 1
                        end

                        @@count_connect_errors = 0
                end

                if @config['retry_on_error']
                        @hydra.queue(request)
                end

        end
end
queue(url,callback) click to toggle source
# File lib/scraptory.rb, line 74
def queue(url,callback)
        request = build_request(url)

        request.on_complete do |response|
                on_request_complete(response,request,callback)
        end
        
        @hydra.queue(request)
end
queues(urls=Array.new,callback) click to toggle source
# File lib/scraptory.rb, line 84
def queues(urls=Array.new,callback)
        urls.each do |url|
                queue(url,callback)
        end
end
scrap() click to toggle source
# File lib/scraptory.rb, line 161
def scrap()
        @hydra.run
end
set_config(config={}) click to toggle source
# File lib/scraptory.rb, line 30
def set_config(config={})
        # Use debug output
        if !config.has_key?("debug")
                config["debug"] = false
        end

        # If debug_file is set and doesn't exists, we create it
        if config.has_key?("debug_file") and !File.exist?(config["debug_file"])
                config["debug"] = true
                FileUtils.touch(config["debug_file"])
                @logger = Logger.new(config["debug_file"])
        elsif config["debug"]
                @logger = Logger.new(STDOUT)
        end

        # If the param nthreads exists and is an integer it is created. Else it is set to 1
        if config.has_key?("nthreads") and config["nthreads"].is_a? Integer
                @hydra = Typhoeus::Hydra.new(max_concurrency: config["nthreads"].to_i)
        else
                @hydra = Typhoeus::Hydra.new(max_concurrency: @@default_nthreads)
        end

        # If wrong data are in timeout config, we set it to default
        if !config.has_key?("timeout") or config["timeout"].to_i < 1
                config["timeout"] = @@default_hydra_timeout
        end

        if !config.has_key?("retry_on_error")
                config["retry_on_error"] = false
        end

        # Switch between proxies and clear connection
        if config["use_clearconnection"].nil?
                config["use_clearconnection"] = false
        end


        if !config["err_before_chg_ip"].nil? or config["err_before_chg_ip"].to_i < 1
                config["err_before_chg_ip"] = @@default_err_before_chg_ip
        end

        @config = config
end