module Salamander

The module containing the Salamander framework itself.

Public Class Methods

crawl(urls, args = {}) { |request, response, depth| ... }

Performs an unauthenticated, breadth-first crawl of the target web asset. Function blocks until all threads terminate. This function can receive a code block like so…

Salamander::crawl(urls, args) do |request, response, depth|
     # request: the URL string used to request the current page
     # response: a hash containing data pertaining to the response to the requested URL
     # depth: a positive integer indicating the breadth/depth of the current page, relative to one of the seed URLs
end

Response Hash Contents

base_uri:         The base_uri field of OpenURI's response
meta:             The meta field of OpenURI's response
status:           The status field of OpenURI's response
content_type:     The content_type field of OpenURI's response
charset:          The charset field of OpenURI's response
content_encoding: The content_encoding field of OpenURI's response
last_modified:    The last_modified field of OpenURI's response
body:             Contains the body of OpenURI's response

Optional Arguments

visit:            A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
delay:            A non-negative float indicating the number of seconds between requests in one thread. Defaults to 1.
threads:          A positive integer indicating the number of allowed simultaneous requests to the target web asset. Defaults to 1.
agent:            The user-agent string to be used. Defaults to "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)".

@param urls A list of strings containing the seed URLs.
@param args A hash containing optional arguments for the function.

# File lib/slmndr.rb, line 81
# Performs an unauthenticated, breadth-first crawl of the target web asset.
# Blocks until every crawl thread terminates. For each fetched page the given
# block is yielded |request, response, depth| where:
#   request  - the URL string used to request the page
#   response - hash of OpenURI response fields (:base_uri, :meta, :status,
#              :content_type, :charset, :content_encoding, :last_modified, :body)
#   depth    - non-negative integer distance from the nearest seed URL
#
# Optional args keys:
#   :visit   - lambda accepting a URL; return false to skip that URL
#   :delay   - non-negative float, seconds between requests per thread (default 1)
#   :threads - positive integer, concurrent request threads (default 1)
#   :agent   - user-agent string (defaults to an MSIE 9 string)
#
# @param urls A list of strings containing the seed URLs.
# @param args A hash containing optional arguments for the function.
def crawl(urls, args = {})
        # Get arguments, applying defaults where no override is supplied
        visit = nil
        if args[:visit] != nil then
                visit = args[:visit]
        end
        delay = 1
        if args[:delay] != nil then
                delay = args[:delay]
        end
        if delay < 0 then
                # A zero delay is legitimate (no pause); only negatives are invalid
                raise "delay must be a non-negative float"
        end
        threads = 1
        if args[:threads] != nil then
                threads = args[:threads]
        end
        if threads < 1 then
                # BUGFIX: was `threads < 0`, which accepted threads == 0 and made
                # the crawl spawn no workers and silently do nothing; the message
                # always demanded a positive integer, so enforce it
                raise "threads must be a positive integer"
        end
        agent = "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        if args[:agent] != nil then
                agent = args[:agent]
        end
        # Thread registry (object_id => { thread:, kill: }) guarded by tlock
        _threads = {}
        tlock = Mutex.new
        # Job map (url-symbol => { state:, depth: }) guarded by jlock
        jobs = {}
        jlock = Mutex.new
        # Pending yields produced by workers, consumed by the main loop
        yields = []
        ylock = Mutex.new
        # Seed the job queue; Job States: 0: waiting, 1: working, 2: done
        urls.each do |url|
                jobs[:"#{url}"] = { state: 0, depth: 0 }
        end
        # Create and launch crawl threads
        for i in 1..threads
                tlock.synchronize do
                        # Create crawl thread
                        thread = Thread.new do
                                # Block until the spawning loop releases tlock,
                                # i.e. until this thread has been registered
                                tlock.synchronize do
                                end
                                # Get thread id
                                _id = Thread.current.object_id
                                # Work loop
                                while true
                                        # Honor a forceful kill request from the main loop
                                        kill = false
                                        tlock.synchronize do
                                                kill = _threads[_id][:kill]
                                        end
                                        if kill then
                                                break
                                        end
                                        # Find a waiting job; assume shutdown unless
                                        # something is waiting or still in flight
                                        kill = true
                                        job_url = nil
                                        jlock.synchronize do
                                                # For each job
                                                jobs.each do |u, j|
                                                        # If job is waiting
                                                        if j[:state] == 0 then
                                                                # Claim the job
                                                                job_url = u
                                                                j[:state] = 1
                                                                kill = false
                                                                break
                                                        elsif j[:state] == 1 then
                                                                # A busy peer may still enqueue more jobs
                                                                kill = false
                                                        end
                                                end
                                        end
                                        # All jobs done and none waiting: shut down
                                        if kill then
                                                break
                                        end
                                        # Nothing waiting but peers are busy: back off briefly
                                        # instead of spinning at 100% CPU (was a busy-wait)
                                        if job_url == nil then
                                                sleep(0.05)
                                                next
                                        end
                                        # Get job depth
                                        job_depth = jobs[:"#{job_url}"][:depth]
                                        # Fetch the page and harvest its links.
                                        # NOTE(review): VERIFY_NONE disables TLS certificate
                                        # checks; acceptable for a security-testing crawler,
                                        # but do not reuse this for trusted traffic.
                                        begin
                                                open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
                                                        _response = {
                                                                base_uri: response.base_uri,
                                                                meta: response.meta,
                                                                status: response.status,
                                                                content_type: response.content_type,
                                                                charset: response.charset,
                                                                content_encoding: response.content_encoding,
                                                                last_modified: response.last_modified,
                                                                body: response.read
                                                        }
                                                        # Queue the callback for the main loop to yield
                                                        ylock.synchronize do
                                                                yields << { request: "#{job_url}", response: _response, depth: job_depth }
                                                        end
                                                        # Only follow links when redirects stayed on the same host
                                                        if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
                                                                # Record the resolved URL as done so redirect targets are not refetched
                                                                jlock.synchronize do
                                                                        if jobs[:"#{response.base_uri}"] == nil then
                                                                                jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
                                                                        end
                                                                end
                                                                # Enqueue each new link approved by the visit filter
                                                                Salamander::get_links(response.base_uri, _response[:body]) do |link|
                                                                        # Determine if the link should be visited
                                                                        if visit.nil? || visit.call(link) then
                                                                                jlock.synchronize do
                                                                                        # If link is not in job queue
                                                                                        if jobs[:"#{link}"] == nil then
                                                                                                # Create job for the given link
                                                                                                jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
                                                                                        end
                                                                                end
                                                                        end
                                                                end
                                                        end
                                                end
                                        rescue
                                                # Unreachable/broken pages are skipped;
                                                # the job is still marked done below
                                        end
                                        # Flag job as complete
                                        jlock.synchronize do
                                                jobs[:"#{job_url}"][:state] = 2
                                        end
                                        # Politeness delay between requests in this thread
                                        sleep(delay)
                                end
                        end
                        _threads[thread.object_id] = { thread: thread, kill: false }
                end
        end
        # Main loop: drain yields and wait for all workers to die
        while true
                # Pop one pending yield
                y = nil
                ylock.synchronize do
                        y = yields.shift
                end
                if y != nil then
                        tlock.synchronize do
                                # Pre-emptively flag kill so workers stop if the
                                # caller's block breaks out of crawl
                                _threads.each do |id, _thread|
                                        _thread[:kill] = true
                                end
                                # Hand the page to the caller
                                yield y[:request], y[:response], y[:depth]
                                # Block returned normally: cancel the kill flags
                                _threads.each do |id, _thread|
                                        _thread[:kill] = false
                                end
                        end
                        next
                end
                # No pending yields: check whether any worker is still alive
                alive = false
                _threads.each do |id, _thread|
                        alive = alive || _thread[:thread].alive?
                        if alive then
                                break
                        end
                end
                if !alive then
                        break
                end
                # Workers still running but nothing to report: back off briefly
                # instead of spinning at 100% CPU (was a busy-wait)
                sleep(0.05)
        end
end

Private Instance Methods

crawl(urls, args = {}) { |request, response, depth| ... }

Performs an unauthenticated, breadth-first crawl of the target web asset. Function blocks until all threads terminate. This function can receive a code block like so…

Salamander::crawl(urls, args) do |request, response, depth|
     # request: the URL string used to request the current page
     # response: a hash containing data pertaining to the response to the requested URL
     # depth: a positive integer indicating the breadth/depth of the current page, relative to one of the seed URLs
end

Response Hash Contents

base_uri:         The base_uri field of OpenURI's response
meta:             The meta field of OpenURI's response
status:           The status field of OpenURI's response
content_type:     The content_type field of OpenURI's response
charset:          The charset field of OpenURI's response
content_encoding: The content_encoding field of OpenURI's response
last_modified:    The last_modified field of OpenURI's response
body:             Contains the body of OpenURI's response

Optional Arguments

visit:            A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
delay:            A non-negative float indicating the number of seconds between requests in one thread. Defaults to 1.
threads:          A positive integer indicating the number of allowed simultaneous requests to the target web asset. Defaults to 1.
agent:            The user-agent string to be used. Defaults to "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)".

@param urls A list of strings containing the seed URLs.
@param args A hash containing optional arguments for the function.

# File lib/slmndr.rb, line 81
# Performs an unauthenticated, breadth-first crawl of the target web asset.
# Blocks until every crawl thread terminates. For each fetched page the given
# block is yielded |request, response, depth| where:
#   request  - the URL string used to request the page
#   response - hash of OpenURI response fields (:base_uri, :meta, :status,
#              :content_type, :charset, :content_encoding, :last_modified, :body)
#   depth    - non-negative integer distance from the nearest seed URL
#
# Optional args keys: :visit (lambda url -> boolean), :delay (non-negative
# float, default 1), :threads (integer, default 1), :agent (user-agent string).
#
# NOTE(review): `threads == 0` passes the `threads < 0` check but spawns no
# workers, so the crawl silently does nothing — looks unintended; confirm.
#
# @param urls A list of strings containing the seed URLs.
# @param args A hash containing optional arguments for the function.
def crawl(urls, args = {})
        # Get arguments, applying defaults where no override is supplied
        visit = nil
        if args[:visit] != nil then
                visit = args[:visit]
        end
        delay = 1
        if args[:delay] != nil then
                delay = args[:delay]
        end
        if delay < 0 then
                # NOTE(review): message says "positive" but 0 is accepted here
                raise "delay must be a positive float"
        end
        threads = 1
        if args[:threads] != nil then
                threads = args[:threads]
        end
        if threads < 0 then
                raise "threads must be a positive integer"
        end
        agent = "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        if args[:agent] != nil then
                agent = args[:agent]
        end
        # Thread registry (object_id => { thread:, kill: }) guarded by tlock
        _threads = {}
        tlock = Mutex.new
        # Job map (url-symbol => { state:, depth: }) guarded by jlock
        jobs = {}
        jlock = Mutex.new
        # Pending yields produced by workers, consumed by the main loop
        yields = []
        ylock = Mutex.new
        # Seed the job queue; Job States: 0: waiting, 1: working, 2: done
        urls.each do |url|
                jobs[:"#{url}"] = { state: 0, depth: 0 }
        end
        # Create and launch crawl threads
        for i in 1..threads
                tlock.synchronize do
                        # Create crawl thread
                        thread = Thread.new do
                                # Block until the spawning loop releases tlock,
                                # i.e. until this thread has been registered
                                tlock.synchronize do
                                end
                                # Get thread id
                                _id = Thread.current.object_id
                                # Work loop
                                while true
                                        # Honor a forceful kill request from the main loop
                                        kill = false
                                        tlock.synchronize do
                                                kill = _threads[_id][:kill]
                                        end
                                        if kill then
                                                break
                                        end
                                        # Find a waiting job; assume shutdown unless
                                        # something is waiting or still in flight
                                        kill = true
                                        job_url = nil
                                        jlock.synchronize do
                                                # For each job
                                                jobs.each do |u, j|
                                                        # If job is waiting
                                                        if j[:state] == 0 then
                                                                # Claim the job
                                                                job_url = u
                                                                j[:state] = 1
                                                                kill = false
                                                                break
                                                        elsif j[:state] == 1 then
                                                                # A busy peer may still enqueue more jobs
                                                                kill = false
                                                        end
                                                end
                                        end
                                        # All jobs done and none waiting: shut down
                                        if kill then
                                                break
                                        end
                                        # Nothing waiting but peers busy: retry immediately
                                        # NOTE(review): this busy-waits at full CPU
                                        if job_url == nil then
                                                next
                                        end
                                        # Get job depth
                                        job_depth = jobs[:"#{job_url}"][:depth]
                                        # Fetch the page and harvest its links.
                                        # NOTE(review): VERIFY_NONE disables TLS certificate
                                        # checks — acceptable for a testing crawler only.
                                        begin
                                                open("#{job_url}", { :allow_redirections => :all, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
                                                        _response = {
                                                                base_uri: response.base_uri,
                                                                meta: response.meta,
                                                                status: response.status,
                                                                content_type: response.content_type,
                                                                charset: response.charset,
                                                                content_encoding: response.content_encoding,
                                                                last_modified: response.last_modified,
                                                                body: response.read
                                                        }
                                                        # Queue the callback for the main loop to yield
                                                        ylock.synchronize do
                                                                yields << { request: "#{job_url}", response: _response, depth: job_depth }
                                                        end
                                                        # Only follow links when redirects stayed on the same host
                                                        if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse("#{job_url}").host then
                                                                # Record the resolved URL as done so redirect
                                                                # targets are not refetched
                                                                jlock.synchronize do
                                                                        if jobs[:"#{response.base_uri}"] == nil then
                                                                                jobs[:"#{response.base_uri}"] = { state: 2, depth: job_depth }
                                                                        end
                                                                end
                                                                # Enqueue each new link approved by the visit filter
                                                                Salamander::get_links(response.base_uri, _response[:body]) do |link|
                                                                        # Determine if the link should be visited
                                                                        if visit.nil? || visit.call(link) then
                                                                                jlock.synchronize do
                                                                                        # If link is not in job queue
                                                                                        if jobs[:"#{link}"] == nil then
                                                                                                # Create job for the given link
                                                                                                jobs[:"#{link}"] = { state: 0, depth: job_depth + 1 }
                                                                                        end
                                                                                end
                                                                        end
                                                                end
                                                        end
                                                end
                                        rescue
                                                # Unreachable/broken pages are silently skipped;
                                                # the job is still marked done below
                                        end
                                        # Flag job as complete
                                        jlock.synchronize do
                                                jobs[:"#{job_url}"][:state] = 2
                                        end
                                        # Politeness delay between requests in this thread
                                        sleep(delay)
                                end
                        end
                        _threads[thread.object_id] = { thread: thread, kill: false }
                end
        end
        # Main loop: drain yields and wait for all workers to die
        while true
                # Pop one pending yield
                y = nil
                ylock.synchronize do
                        y = yields.shift
                end
                if y != nil then
                        tlock.synchronize do
                                # Pre-emptively flag kill so workers stop if the
                                # caller's block breaks out of crawl
                                _threads.each do |id, _thread|
                                        _thread[:kill] = true
                                end
                                # Hand the page to the caller
                                yield y[:request], y[:response], y[:depth]
                                # Block returned normally: cancel the kill flags
                                _threads.each do |id, _thread|
                                        _thread[:kill] = false
                                end
                        end
                        next
                end
                # No pending yields: check whether any worker is still alive
                alive = false
                _threads.each do |id, _thread|
                        alive = alive || _thread[:thread].alive?
                        if alive then
                                break
                        end
                end
                if !alive then
                        break
                end
        end
end