class Wmap::UrlCrawler

Web site crawler class

Constants

Crawl_timeout

Hard stop limit for the crawler time-out, set to 1,200 seconds (20 minutes)

Max_http_timeout

Hard stop limit for the HTTP time-out, set to 8 seconds, in order to avoid a severe performance penalty from certain slow or misbehaving sites

Attributes

crawl_depth[RW]
crawl_done[R]
crawl_page_limit[RW]
crawl_start[R]
data_dir[RW]
discovered_urls_by_crawler[R]
http_timeout[RW]
max_parallel[RW]
signature_file[RW]
tag_file[RW]
tag_signatures[R]
tag_store[R]
user_agent[RW]
verbose[RW]
visited_urls_by_crawler[R]

Public Class Methods

new(params = {})

Set the crawler instance's default variables

# File lib/wmap/url_crawler.rb, line 32
def initialize (params = {})
        @verbose=params.fetch(:verbose, false)
        @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
        @http_timeout=params.fetch(:http_timeout, 5000)
        @crawl_depth=params.fetch(:crawl_depth, 4)
        @crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
        @max_parallel=params.fetch(:max_parallel, 40)
        @user_agent=params.fetch(:user_agent, "OWASP WMAP Spider")
        # Discovered data store
        @discovered_urls_by_crawler=Hash.new
        @visited_urls_by_crawler=Hash.new
        @crawl_start=Hash.new
        @crawl_done=Hash.new
        Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
        @log_dir=@data_dir + "/logs/"
        Dir.mkdir(@log_dir) unless Dir.exist?(@log_dir)
        @log_file=@log_dir + "crawler.log"
end
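
A minimal construction sketch; the option values are illustrative, and any omitted option falls back to the defaults shown above:

require 'wmap'

# Create a crawler with a few overridden defaults.
crawler = Wmap::UrlCrawler.new(
        :verbose      => true,
        :crawl_depth  => 3,
        :max_parallel => 20
)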

Public Instance Methods

crawl(url)

A web crawler that crawls a known website and searches for HTML links within the same root domain. For example, by crawling 'www.yahoo.com/' it could discover 'login.yahoo.com/'

# File lib/wmap/url_crawler.rb, line 72
def crawl(url)
        puts "Start web crawling on #{url}"
        result=Array.new
        url=url.chomp.strip
        result.push(url_2_site(url))
        raise "Error! Invalid url format: #{urls}" unless is_url?(url)
        # Add logic to profile the web server before crawling; this is used to optimize the crawling speed
        pre_crawl(url)
        status = Timeout::timeout(Crawl_timeout/1000) {
                result+=crawl_worker(url).keys
        }
        puts "Web crawling time-out on #{url}: #{status}" if @verbose
        return result
rescue => ee
        puts "Exception on method #{__method__} for URL #{url}: #{ee}"
        return result
end
Also aliased as: query
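
A quick usage sketch ('https://www.example.com/' is a placeholder target):

crawler = Wmap::UrlCrawler.new
urls = crawler.crawl("https://www.example.com/")
puts urls            # the site itself plus any URLs discovered under it
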
crawl_file(file)
crawl_worker(url0)

The crawler worker that performs the actual crawling work

# File lib/wmap/url_crawler.rb, line 92
        def crawl_worker(url0)
                puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and discovery contents."
                # Input URL sanity check first
                if is_url?(url0)
                        host=url_2_host(url0)
                        ip=host_2_ip(host).to_s
                        raise "Invalid IP address: #{url0}" if ip.nil?
                        port=url_2_port(url0).to_s
                        raise "Invalid port number: #{url0}" if port.nil?
                else
                        raise "Invalid URL: #{url0}. Please check it out with your browser again."
                end
                log_info=Hash.new
                log_info[1]="Start working on #{url0}"
                url_stores=Hash.new
                url_stores[url0]=true unless url_stores.key?(url0)
                @discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
                @crawl_start[url0]=true unless @crawl_start.key?(url0)
#                       $discovered_urls[url0]=true unless $discovered_urls.key?(url0)
                @crawl_depth.times do
                        url_stores.keys.each do |url|
                                # 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
                                next if @visited_urls_by_crawler.key?(url)
                                url_object = open_url(url)
                                next if url_object == nil
                                url = update_url_if_redirected(url, url_object)
                                url_body = read_url(url)
                                # Protection code - to avoid parsing failure on the empty or nil object
                                next if url_body.nil? or url_body.empty?
                                url_stores[url]=true unless url_stores.key?(url)
                                @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
#                                       $discovered_urls[url]=true unless $discovered_urls.key?(url)
                                doc = Nokogiri::HTML(url_body)
                                next if doc == nil
                                if url_stores.size >= @crawl_page_limit
                                        #@visited_urls_by_crawler.merge!(url_stores)
                                        @discovered_urls_by_crawler.merge!(url_stores)
#                                               $discovered_urls.merge!(url_stores)
                                        puts "Finish web crawling the url: #{url0}"
                                        return url_stores
                                end
                                page_urls = find_urls_on_page(doc, url)
                                page_urls.uniq!
                                page_urls.map do |y|
                                        y=normalize_url(y)
                                        url_stores[y]=true unless url_stores.key?(y)
                                        @discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
#                                               $discovered_urls[y]=true unless $discovered_urls.key?(y)
                                end
                        end
                end
                puts "Finish web crawling on: #{url0}"
                log_info[2]="Finish working on: #{url0}"
                wlog(log_info, "UrlCrawler", @log_file)
                @crawl_done[url0]=true unless @crawl_done.key?(url0)
                return url_stores
        rescue => ee
                puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
                log_info[3]="Exception on #{url0}"
                wlog(log_info,"UrlCrawler",@log_file)
                return url_stores
        end
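
A usage sketch; the worker returns a hash keyed by the discovered URLs (the target is a placeholder):

crawler = Wmap::UrlCrawler.new(:crawl_depth => 2)
found = crawler.crawl_worker("https://www.example.com/")
found.keys.each { |url| puts url }   # each key is a discovered URL
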
crawl_workers(targets,num=@max_parallel)

Fast crawling by using the Parallel gem to fork a number of child processes at once; each child process continuously works on the target pool until all of the work is done

# File lib/wmap/url_crawler.rb, line 157
def crawl_workers (targets,num=@max_parallel)
        raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
        puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
        #puts "This could be awhile depending on the list size. Please be patient ..."
        # 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
        targets -= ["", nil]
        uniq_sites=Hash.new
        targets.dup.map do |target|
                if is_url?(target)
                        host=url_2_host(target)
                        ip=host_2_ip(host).to_s
                        next if ip.nil?
                        port=url_2_port(target).to_s
                        next if port.nil?
                        site_key=ip+":"+port
                        unless uniq_sites.key?(site_key)
                                uniq_sites[site_key]=target
                        end
                end
        end
        puts "Sanitization done! " if @verbose
        puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
        puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
        raise "Error: target list is empty!" if targets.size < 1
        Parallel.map(uniq_sites.values, :in_processes => num) { |target|
                puts "Working on #{target} ..." if @verbose
                crawl(target)
        }.dup.each do |process|
                puts "process.inspect: #{process}" if @verbose
                urls=process
                urls-=["",nil] unless urls.nil?
                if urls.nil?
                        next
                elsif urls.empty?
                        next
                        #do nothing
                else
                        urls.map do |url|
                                url.strip!
                                @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
                                #$discovered_urls[url]=true unless $discovered_urls.key?(url)
                        end
                end
        end
        #return sites
        return @discovered_urls_by_crawler.keys
rescue Exception => ee
        puts "Exception on method #{__method__}: #{ee}" if @verbose
        return nil
end
Also aliased as: crawls
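
A usage sketch; the seed URLs and process count are illustrative:

crawler = Wmap::UrlCrawler.new
seeds = ["https://www.example.com/", "https://login.example.com/"]
# Crawl the de-duplicated seeds with up to 10 parallel child processes.
urls = crawler.crawl_workers(seeds, 10)
puts urls.size if urls
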
crawl_workers_on_file(file)

Fast crawling method - builds the target pool from the input file

# File lib/wmap/url_crawler.rb, line 210
    def crawl_workers_on_file (file)
            puts "Web crawl the list of targets from file: #{file}"
            targets=file_2_list(file)
            sites=crawl_workers(targets,num=@max_parallel)
            return sites
    rescue => ee
puts "Exception on method #{__method__}: #{ee}" if @verbose
return nil
    end
Also aliased as: query_file, crawl_file
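
A usage sketch, assuming the seed file lists one target URL per line ('seeds.txt' is a placeholder path):

crawler = Wmap::UrlCrawler.new
sites = crawler.crawl_workers_on_file("seeds.txt")
puts sites.size if sites
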
crawls(targets,num=@max_parallel)
Alias for: crawl_workers
get_discovered_sites_by_crawler()

Method to retrieve the discovered site results

# File lib/wmap/url_crawler.rb, line 328
      def get_discovered_sites_by_crawler
              puts "Print summary report of discovered sites. " if @verbose
              puts "\nSummary Report of Discovered Sites from the Crawler:"
              sites = Hash.new
              @discovered_urls_by_crawler.keys.each do |url|
                      site=url_2_site(url)
                      sites[site]=true unless sites.key?(site)
              end
              sites.keys.map { |site| puts site }
              puts "Total: #{sites.size}"
              puts "End of the summary"
              return sites.keys
      rescue => ee
              puts "Exception on method #{__method__}: #{ee}" if @verbose
              return nil
      end
Also aliased as: get_sites
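
A usage sketch; the crawler first populates its discovery store, then the unique sites are summarized (the target is a placeholder):

crawler = Wmap::UrlCrawler.new
crawler.crawl("https://www.example.com/")
sites = crawler.get_discovered_sites_by_crawler    # prints the summary and returns the site list
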
get_sites()
pre_crawl(url)

Pre-crawl profiler, used for network profiling to maximize the crawler's performance.

# File lib/wmap/url_crawler.rb, line 52
def pre_crawl(url)
        begin
                puts "Perform network profiling works on the web server before the web crawling: #{url}" if @verbose
                host=url_2_host(url)
                # Use the following formula to 'guess' the right http time-out threshold for the scanner
                nwk_to=Wmap::NetworkProfiler.new.profile(host).to_i
                if (1500 + nwk_to*2) > Max_http_timeout
                        @http_timeout = Max_http_timeout
                else
                        @http_timeout = 1500 + nwk_to*2
                end
                puts "Done with the pre-scan works: reset @http_timeout to: #{@http_timeout} ms" if @verbose
        rescue Exception => ee
                puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
                @http_timeout = Max_http_timeout
        end
end
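
For example, assuming the profiler returns a round-trip measurement of 200 ms, the formula sets @http_timeout to 1500 + 200*2 = 1900 ms; a 4,000 ms measurement would yield 9,500 ms, which exceeds Max_http_timeout (8,000 ms, i.e. the 8-second hard stop above), so the value is capped at Max_http_timeout instead.
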
print_discovered_urls_by_crawler()

Method to print out the discovered URL results

Also aliased as: print
query(url)
Alias for: crawl
query_file(file)
save(file)
save_discovered_urls(file)

Method to save the URL discovery results to a file

# File lib/wmap/url_crawler.rb, line 317
      def save_discovered_urls (file)
              puts "Save discovered urls by the crawler to file: #{file} "
              list_2_file(@discovered_urls_by_crawler.keys, file)
              puts "Done!"
      rescue => ee
              puts "Exception on method #{__method__}: #{ee}" if @verbose
              return nil
      end
Also aliased as: save
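
A usage sketch ('discovered_urls.txt' is a placeholder output path):

crawler = Wmap::UrlCrawler.new
crawler.crawl("https://www.example.com/")
crawler.save_discovered_urls("discovered_urls.txt")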

Private Instance Methods

find_urls_on_page(doc, current_url)

Search the page document fetched from 'current_url' and return the URLs found under the same domain

# File lib/wmap/url_crawler.rb, line 266
      def find_urls_on_page(doc, current_url)
              puts "Search and return URLs within the doc: #{doc}" if @verbose
              urls_list = []
              # case 1 - search embedded HTML tag <a href='url'> for the url elements
              links=doc.css('a')
              links.map do |x|
                      #puts "x: #{x}"
                      new_url = x.attribute('href').to_s
                      unless new_url == nil
                              if new_url.match("http")
                                      #if urls_on_same_domain?(new_url,current_url)
                                              urls_list.push(new_url)
                                      #end
                              else
                                      new_url = make_absolute(current_url, new_url)
                                      urls_list.push(new_url)
                              end
                      end
              end
              # case 2 - search client side redirect - <meta http-equiv="refresh" content="5;URL='http://example.com/'">
              elements=doc.css("meta[http-equiv]")
              unless elements.size == 0
                      link=elements.attr("content").value.split(/url\=/i)[1]
                      unless link.nil?
                              new_url = make_absolute(current_url, link)
                              urls_list.push(new_url) unless new_url.nil?
                      end
              end
              #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
              return urls_list.uniq-["",nil]
      rescue => ee
              puts "Exception on method #{__method__}: #{ee}" if @verbose
              return nil
      end
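
A minimal standalone sketch of the same two extraction cases with Nokogiri; the HTML snippet is made up for illustration:

require "nokogiri"

html = <<-HTML
  <html><body>
    <a href="/login">Login</a>
    <meta http-equiv="refresh" content="5;URL=http://redirect.example.com/">
  </body></html>
HTML

doc = Nokogiri::HTML(html)
# Case 1 - anchor tags: collect the href attribute of every <a> element.
hrefs = doc.css("a").map { |a| a.attribute("href").to_s }
# Case 2 - client-side redirect: pull the URL out of a meta refresh tag.
meta = doc.css("meta[http-equiv]")
refresh = meta.attr("content").value.split(/url\=/i)[1] unless meta.empty?
puts hrefs.inspect   # => ["/login"]
puts refresh         # => "http://redirect.example.com/"
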
open_url(url,user_agent=@user_agent)

Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success

# File lib/wmap/url_crawler.rb, line 223
      def open_url(url,user_agent=@user_agent)
              puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
              if url =~ /http\:/i
                      # patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
                      url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000, "User-Agent"=>user_agent)
                      #url_object = open(url)
              elsif url =~ /https\:/i
                      url_object = open(url, :ssl_verify_mode=>0, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000, "User-Agent"=>user_agent)
                      #url_object = open(url,:ssl_verify_mode => 0)
              else
                      raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
              end
              return url_object
      rescue => ee
              puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
              return nil
      end
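
Note that the :allow_redirections option is not part of stock OpenURI; it appears to come from the open_uri_redirections gem. A minimal standalone sketch of an equivalent call, with a placeholder URL and the same user agent:

require "open-uri"
require "openssl"
require "open_uri_redirections"   # provides the :allow_redirections option

page = URI.open(
        "https://www.example.com/",                       # placeholder URL
        :allow_redirections => :safe,                     # permit http -> https redirects
        :ssl_verify_mode    => OpenSSL::SSL::VERIFY_NONE, # skip certificate verification
        :read_timeout       => 8,                         # seconds, i.e. Max_http_timeout/1000
        "User-Agent"        => "OWASP WMAP Spider"
)
puts page.base_uri                                        # final URL after any redirect
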
read_url(url)

Wrapper around the OpenURI 'read' method to return the URL body contents

# File lib/wmap/url_crawler.rb, line 242
    def read_url(url)
            puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
            url_object=open_url(url)
            @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
            body=url_object.read
            return body
    rescue => ee
puts "Exception on method #{__method__}: #{ee}" if @verbose
return nil
    end
update_url_if_redirected(url, url_object)

Return the destination URL in the case of a URL redirect

# File lib/wmap/url_crawler.rb, line 254
      def update_url_if_redirected(url, url_object)
              #puts "Comparing the original URL with the return object base_uri. Return the one where the true content is found. " if @verbose
              if url != url_object.base_uri.to_s
                      return url_object.base_uri.to_s
              end
              return url
      rescue => ee
              puts "Exception on method #{__method__}: #{ee}" if @verbose
              return nil
      end