class LinkScrapper

Class for grabbing and parsing the links of a domain.

Public Class Methods

new(settings) click to toggle source
# File lib/link_scrapper.rb, line 10
# Prepare the link stores, work out which domain to search and start the scan.
#
# settings - Hash of options:
#            :domain  - domain to be searched; the value 'ue' prompts for
#                       a domain on standard input instead
#            :verbose - prints output as the script goes along
#            :results - hash or csv
#
# The first command line argument, when present, takes precedence over
# settings[:domain]. When no domain is supplied at all, or the entry is left
# empty, SEARCH_DOMAIN is used.
def initialize(settings)
        # init link store hashes
        @settings = settings
        @search_index = 0
        @search_iteration = 0
        @links = []
        @link_parents = {}
        @checked_links = {}
        @error_links = {}
        @external_links = {}

        # gather search domain; every source is copied because the string is
        # edited in place below and must not leak changes back to the caller
        if ARGV[0]
                @search_domain = ARGV[0].dup
        elsif @settings[:domain] == 'ue'
                puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
                # to_s guards against nil when standard input is already closed
                @search_domain = $stdin.gets.to_s.chomp
        elsif @settings[:domain]
                @search_domain = @settings[:domain].dup
        end

        # override with default domain if entry is missing or left empty
        @search_domain = SEARCH_DOMAIN.dup if @search_domain.nil? || @search_domain.empty?

        # get and store local domain string
        @local_domain = @search_domain.match(/\w+\.\w+(?=\/|\s|$)/)

        # configure initial search uri; this is deliberately the same object as
        # @search_domain so the domain also receives the protocol and trailing
        # slash that relative links are later appended to
        @search_uri = @search_domain

        # verify domain entry includes protocol
        @search_uri.insert(0, 'http://') if @search_uri !~ /^htt(p|ps):/

        # verify trailing forward slash
        @search_uri << '/' unless @search_uri.end_with?('/')

        # start scan
        get_links
end

Public Instance Methods

get_search_uri() click to toggle source

Gathers the next search URI.

# File lib/link_scrapper.rb, line 61
# Pick the next queued link and turn it into the URI to search.
#
# Does nothing on the first iteration (@search_iteration == 0) so the URI
# configured from the initial domain setting is kept.
#
# Once @links has no entry at @search_index the scan is over: with
# settings[:results] == 'csv' the results are saved and the process exits;
# otherwise a Hash with :checked_links, :error_links and :external_links is
# returned.
#
# For every other call @search_uri is set, @search_index is incremented and
# @skip is set to 1 for links that should not be searched (external links,
# listed file types, mailto:/tel: links and invalid uris). Invalid uris are
# also recorded in @error_links; external links are requested once and their
# response code and timing stored in @external_links.
def get_search_uri
        # do not override initial domain setting
        return unless @search_iteration > 0

        queued = @links[@search_index]
        if queued.nil?
                # save results and exit
                if @settings[:results] == 'csv'
                        save_results
                else
                        return { checked_links: @checked_links, error_links: @error_links, external_links: @external_links }
                end
                exit
        end

        # set search uri; chomp returns a new string, so the in-place edits
        # below never modify the entry stored in @links
        @search_uri = queued[0].chomp

        # handle protocol agnostic links ("//host/path") before any prefix is
        # stripped, otherwise they lose their "//" and end up appended to the
        # search domain as if they were relative paths
        if @search_uri.start_with?('//')
                protocol = @search_domain[0, 6] == 'https:' ? 'https:' : 'http:'
                @search_uri = "#{protocol}#{@search_uri}"
        end

        # check for direct link
        if @search_uri =~ /^htt(p|ps):/

                # if external link go to next link
                if @search_uri.index(@local_domain[0]).nil?
                        link_key = @search_uri.to_sym
                        unless @external_links[link_key]
                                # stays nil when the request fails
                                delta = nil
                                begin
                                        # monotonic clock: unaffected by wall clock adjustments
                                        started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
                                        response = Net::HTTP.get_response(URI.parse(@search_uri))
                                        delta = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
                                        code = response.code
                                rescue StandardError
                                        # any failure to fetch is recorded as a 408
                                        code = 408
                                end
                                @external_links[link_key] = { res: code, time: delta }
                        end
                        @skip = 1
                end
        else

                # skip various files
                if @search_uri =~ /[^\s]+(\.(?i)flv|gif|jpg|png|mp3|mp4|m4v|pdf|zip|txt)$/
                        @skip = 1
                end

                # check for mailto link
                if @search_uri.start_with?('mailto:') || @search_uri.start_with?('tel:')
                        @skip = 1
                else
                        # strip indirect prefixes; each step looks at what the
                        # previous one left behind, so the order matters
                        @search_uri[0, 1] = '' if @search_uri[0, 1] == '.'
                        @search_uri[0, 2] = '' if ['//', './', '..'].include?(@search_uri[0, 2])
                        @search_uri[0, 3] = '' if @search_uri[0, 3] == '../'
                        # check for relative link
                        @search_uri[0] = '' if @search_uri[0] == '/'

                        # verify uri portion is valid
                        if @search_uri !~ /^([\w]|%|#|\?)/
                                @search_index += 1
                                @skip = 1
                                @error_links[@search_uri] = ''
                                puts "invalid uri #{@search_uri}" if @settings[:verbose]
                                return
                        end

                        # define uri string
                        @search_uri = "#{@search_domain}#{@search_uri}"
                end
        end

        # increment search index value
        @search_index += 1
end
save_results() click to toggle source

Saves the results to CSV files.

# File lib/link_scrapper.rb, line 235
# Write the collected link data to three CSV files in the current directory:
# results.csv (checked links), external-links.csv (external links) and
# invalid.csv (invalid links). Rows of the first two files hold the link
# followed by its :res, :time and :parent values.
def save_results
        # shared row layout for checked and external links
        link_row = ->(uri, info) { [uri, info[:res], info[:time], info[:parent]] }

        # save search results
        CSV.open('results.csv', 'wb') do |csv|
                @checked_links.each { |uri, info| csv << link_row.call(uri, info) }
        end

        # save list of external links
        CSV.open('external-links.csv', 'wb') do |csv|
                @external_links.each { |uri, info| csv << link_row.call(uri, info) }
        end

        # save list of invalid links; each entry is written as a row as-is
        CSV.open('invalid.csv', 'wb') do |csv|
                @error_links.each { |entry| csv << entry }
        end
end