class LinkScrapper
Class for crawling a domain, gathering and parsing its links, and recording each link's response.
Public Class Methods
new(settings)
# File lib/link_scrapper.rb, line 10
def initialize(settings)
  # available default settings
  # domain: domain to be searched
  # verbose: prints output as the script goes along
  # results: hash or csv

  # init settings, counters, and link store collections
  @settings = settings
  @search_index = 0
  @search_iteration = 0
  @links = Array.new
  @link_parents = Hash.new
  @checked_links = Hash.new
  @error_links = Hash.new
  @external_links = Hash.new

  # gather search domain
  if ARGV[0]
    @search_domain = ARGV[0].dup
  elsif @settings[:domain] == 'ue'
    puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
    @search_domain = gets.chomp
  elsif @settings[:domain]
    @search_domain = @settings[:domain]
  end

  # override with default domain if entry is left empty
  @search_domain = SEARCH_DOMAIN if @search_domain == ''

  # get and store local domain string
  @local_domain = @search_domain.match(/\w+\.\w+(?=\/|\s|$)/)

  # configure initial search uri
  @search_uri = @search_domain

  # verify domain entry includes protocol
  if @search_uri !~ /^htt(p|ps):/
    @search_uri.insert(0, 'http://')
  end

  # verify trailing forward slash
  if @search_uri[@search_uri.length - 1] != '/'
    @search_uri << '/'
  end

  # start scan
  get_links
end
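As a minimal usage sketch (the require path is assumed from the file name lib/link_scrapper.rb, and the example domain is invented; the settings keys mirror the defaults documented in the method above):

  require 'link_scrapper'   # assumed require path

  # hypothetical settings hash using the documented keys
  settings = {
    domain:  'https://example.com/',  # domain to be searched
    verbose: true,                    # print each uri as it is scanned
    results: 'csv'                    # write results.csv, external-links.csv, invalid.csv
  }

  # instantiation starts the scan immediately, since initialize ends by calling get_links
  LinkScrapper.new(settings)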
Public Instance Methods
get_links()
gather link data for the current search uri, then recurse through any newly found links
# File lib/link_scrapper.rb, line 151
def get_links
  # init skip bit
  @skip = 0

  # define search uri if undefined
  get_search_uri

  # check for existing uri hash index
  if @checked_links[@search_uri.to_sym]
    @skip = 1
  end

  # run link scan if @skip bit is not set
  if @skip == 0
    # let user know which uri is currently active
    puts @search_uri if @settings[:verbose]

    # gather page request response
    begin
      t1 = Time.now
      response = Net::HTTP.get_response(URI.parse(@search_uri.strip))
      t2 = Time.now
      delta = t2 - t1

      # store response page body
      body = response.body

      # store response code
      code = response.code

      # extract all links within page
      links_array = body.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)

      # update anchors and indirect links to use direct links
      links_array.each_with_index { |val, index|
        skip = 0

        if (val[0][0,2] == "//" || val[0][0] == "/" || val[0][0,3] == "../") && val[0] !~ /^htt(p|ps):/
          if val[0][0,3] == "../"
            val[0][0,3] = ""
          end
          if val[0][0,2] == "//"
            val[0][0,2] = ""
          end
          if val[0][0] == "/"
            val[0][0] = ""
          end
          val[0] = "#{@search_domain}#{val[0]}"
        end

        @links.each { |lnk|
          if val[0] == lnk[0]
            skip = 1
          end
        }

        if skip == 0
          @link_parents[val[0].chomp.to_sym] = @search_uri.strip
        else
          val.delete_at(index)
        end
      }

      # combine found links with links array
      @links.concat(links_array)

      # remove duplicates
      @links.uniq!
    rescue => ex
      code = 408
    end

    # store results in checked hash
    @checked_links[@search_uri.to_sym] = {res: code, time: delta, parent: @link_parents[@search_uri.to_sym]}
  end

  # iterate through found links
  @search_iteration += 1
  get_links
end
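Link extraction is a single scan over the response body with one regular expression; as a standalone illustration of that pattern (the HTML snippet is invented for the example):

  # same pattern get_links uses: captures the href value and the link text
  body = '<p><a href="/about">About</a> <a href="https://example.org/">Ext</a></p>'
  links = body.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)
  # => [["/about", "About"], ["https://example.org/", "Ext"]]

Each captured pair is rewritten against @search_domain where needed and merged into @links, so the recursion walks the site in the order links are discovered.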
get_search_uri()
select and normalize the next search uri, recording external links and skipping files and mailto/tel links
# File lib/link_scrapper.rb, line 61
def get_search_uri
  # do not override initial domain setting
  if @search_iteration > 0
    # set search uri
    if !@links[@search_index].nil?
      @search_uri = @links[@search_index][0].chomp
    else
      # save results and exit
      if @settings[:results] == 'csv'
        save_results
      else
        return { checked_links: @checked_links, error_links: @error_links, external_links: @external_links }
      end
      exit
    end

    # check for direct link
    if @search_uri =~ /^htt(p|ps):/
      # if external link go to next link
      if @search_uri.index(@local_domain[0]) == nil
        if !@external_links[@search_uri.to_sym]
          begin
            t1 = Time.now
            response = Net::HTTP.get_response(URI.parse(@search_uri))
            t2 = Time.now
            delta = t2 - t1
            code = response.code
          rescue => ex
            code = 408
          end
          @external_links[@search_uri.to_sym] = {res: code, time: delta}
        end
        @skip = 1
      end
    else
      # skip various files
      if @search_uri =~ /[^\s]+(\.(?i)flv|gif|jpg|png|mp3|mp4|m4v|pdf|zip|txt)$/
        @skip = 1
      end

      # check for mailto or tel link
      if @search_uri[0,7] == 'mailto:' || @search_uri[0,4] == 'tel:'
        @skip = 1
      else
        # check for protocol agnostic and indirect links
        case @search_uri[0,1]
        when '.'
          @search_uri[0,1] = ''
        end
        case @search_uri[0,2]
        when '//', './', '..'
          @search_uri[0,2] = ''
        end
        case @search_uri[0,3]
        when '../'
          @search_uri[0,3] = ''
        end

        # check for relative link
        if @search_uri[0] == '/'
          @search_uri[0] = ''
        end

        # verify uri portion is valid
        if @search_uri !~ /^([\w]|%|#|\?)/
          @search_index += 1
          @skip = 1
          @error_links[@search_uri] = ''
          puts "invalid uri #{@search_uri}" if @settings[:verbose]
          return
        end

        # define uri string
        if @search_uri[0,2] != '//'
          @search_uri = "#{@search_domain}#{@search_uri}"
        else
          # handle protocol agnostic link requests
          if @search_domain[0,6] == 'https:'
            @search_uri = "https:#{@search_uri}"
          else
            @search_uri = "http:#{@search_uri}"
          end
        end
      end
    end

    # increment search index value
    @search_index += 1
  end
end
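The relative-link handling amounts to a series of prefix strips before the path is re-anchored on @search_domain; a standalone sketch of that normalization (domain and paths are invented for the example):

  search_domain = 'http://example.com/'   # stand-in for @search_domain

  ['./about', '../contact', '/pricing'].each do |uri|
    uri = uri.dup
    uri[0, 1] = '' if uri[0, 1] == '.'                        # leading '.'
    uri[0, 2] = '' if ['//', './', '..'].include?(uri[0, 2])  # '//', './', '..'
    uri[0, 3] = '' if uri[0, 3] == '../'                      # '../'
    uri[0]    = '' if uri[0] == '/'                           # leading '/'
    puts "#{search_domain}#{uri}"
  end
  # prints http://example.com/about, .../contact, .../pricing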
save_results()
save checked, external, and invalid link results to CSV files
# File lib/link_scrapper.rb, line 235
def save_results
  # save search results
  CSV.open('results.csv', 'wb') { |csv|
    @checked_links.each { |link|
      csv << [link[0], link[1][:res], link[1][:time], link[1][:parent]]
    }
  }

  # save list of external links
  CSV.open('external-links.csv', 'wb') { |csv|
    @external_links.each do |link|
      csv << [link[0], link[1][:res], link[1][:time], link[1][:parent]]
    end
  }

  # save list of invalid links
  CSV.open('invalid.csv', 'wb') { |csv|
    @error_links.each do |link|
      csv << link
    end
  }
end
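Each row written to results.csv is the link, response code, request time, and parent page, in that order; a small sketch of reading the file back (assumes a scan has already produced results.csv in the working directory):

  require 'csv'

  # columns follow the csv << call above: link, response code, time (seconds), parent
  CSV.foreach('results.csv') do |row|
    link, res, time, parent = row
    puts "#{res} #{link} (#{time}s, found on #{parent})"
  end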