class Wmap::UrlCrawler
Web site crawler class
Constants
- Crawl_timeout
Hard stop limit for the crawler time-out: 1,200 seconds (20 minutes).
- Max_http_timeout
Hard stop limit for the HTTP time-out: 8 seconds, to avoid a severe performance penalty on certain slow or misbehaving site(s).
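Expressed in code, these correspond to the following millisecond values (inferred from the Crawl_timeout/1000 and Max_http_timeout/1000 conversions used in the methods below):

# Millisecond values implied by the descriptions above
Crawl_timeout    = 1200000   # 20 minutes, in milliseconds
Max_http_timeout = 8000      # 8 seconds, in milliseconds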
Attributes
Public Class Methods
Instantiate a crawler and set the instance default variables (time-outs, crawl depth, page limit, parallelism, user agent, data and log directories).
# File lib/wmap/url_crawler.rb, line 32
def initialize (params = {})
  @verbose = params.fetch(:verbose, false)
  @data_dir = params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
  @http_timeout = params.fetch(:http_timeout, 5000)
  @crawl_depth = params.fetch(:crawl_depth, 4)
  @crawl_page_limit = params.fetch(:crawl_page_limit, 1000)
  @max_parallel = params.fetch(:max_parallel, 40)
  @user_agent = params.fetch(:user_agent, "OWASP WMAP Spider")
  # Discovered data store
  @discovered_urls_by_crawler = Hash.new
  @visited_urls_by_crawler = Hash.new
  @crawl_start = Hash.new
  @crawl_done = Hash.new
  Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
  @log_dir = @data_dir + "/logs/"
  Dir.mkdir(@log_dir) unless Dir.exist?(@log_dir)
  @log_file = @log_dir + "crawler.log"
end
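Usage sketch for the constructor (the option values below are illustrative; any omitted option falls back to the defaults shown above):

require 'wmap'

# Create a crawler instance, overriding a few defaults
crawler = Wmap::UrlCrawler.new(
  :verbose      => true,
  :crawl_depth  => 3,     # follow links up to 3 levels deep
  :max_parallel => 20     # cap the number of parallel crawl sessions
)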
Public Instance Methods
A web crawler that crawls a known website and searches for HTML links within the same root domain. For example, by crawling 'www.yahoo.com/' it could discover 'login.yahoo.com/'.
# File lib/wmap/url_crawler.rb, line 72
def crawl(url)
  puts "Start web crawling on #{url}"
  result = Array.new
  url = url.chomp.strip
  result.push(url_2_site(url))
  raise "Error! Invalid url format: #{url}" unless is_url?(url)
  # Profile the web server before crawling; this is used to optimize the crawling speed
  pre_crawl(url)
  status = Timeout::timeout(Crawl_timeout/1000) {
    result += crawl_worker(url).keys
  }
  puts "Web crawling time-out on #{url}: #{status}" if @verbose
  return result
rescue => ee
  puts "Exception on method #{__method__} for URL #{url}: #{ee}"
  return result
end
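A minimal single-site sketch (the seed URL is hypothetical):

require 'wmap'

crawler = Wmap::UrlCrawler.new
# Returns an array of URLs discovered under the same root domain
urls = crawler.crawl("http://www.example.com/")
puts urls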
The crawler worker instance that performs the actual crawling work.
# File lib/wmap/url_crawler.rb, line 92
def crawl_worker(url0)
  puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and discovery contents."
  # Input URL sanity check first
  if is_url?(url0)
    host = url_2_host(url0)
    ip = host_2_ip(host).to_s
    raise "Invalid IP address: #{url0}" if ip.nil?
    port = url_2_port(url0).to_s
    raise "Invalid port number: #{url0}" if port.nil?
  else
    raise "Invalid URL: #{url0}. Please check it out with your browser again."
  end
  log_info = Hash.new
  log_info[1] = "Start working on #{url0}"
  url_stores = Hash.new
  url_stores[url0] = true unless url_stores.key?(url0)
  @discovered_urls_by_crawler[url0] = true unless @discovered_urls_by_crawler.key?(url0)
  @crawl_start[url0] = true unless @crawl_start.key?(url0)
  @crawl_depth.times do
    url_stores.keys.each do |url|
      # Avoid unnecessary re-crawling within the same child instance
      next if @visited_urls_by_crawler.key?(url)
      url_object = open_url(url)
      next if url_object == nil
      url = update_url_if_redirected(url, url_object)
      url_body = read_url(url)
      # Protection code - avoid parsing failure on an empty or nil object
      next if url_body.nil? or url_body.empty?
      url_stores[url] = true unless url_stores.key?(url)
      @discovered_urls_by_crawler[url] = true unless @discovered_urls_by_crawler.key?(url)
      doc = Nokogiri::HTML(url_body)
      next if doc == nil
      if url_stores.size >= @crawl_page_limit
        @discovered_urls_by_crawler.merge!(url_stores)
        puts "Finish web crawling the url: #{url0}"
        return url_stores
      end
      page_urls = find_urls_on_page(doc, url)
      page_urls.uniq!
      page_urls.map do |y|
        y = normalize_url(y)
        url_stores[y] = true unless url_stores.key?(y)
        @discovered_urls_by_crawler[y] = true unless @discovered_urls_by_crawler.key?(y)
      end
    end
  end
  puts "Finish web crawling on: #{url0}"
  log_info[2] = "Finish working on: #{url0}"
  wlog(log_info, "UrlCrawler", @log_file)
  @crawl_done[url0] = true unless @crawl_done.key?(url0)
  return url_stores
rescue => ee
  puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
  log_info[3] = "Exception on #{url0}"
  wlog(log_info, "UrlCrawler", @log_file)
  return url_stores
end
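Unlike crawl, the worker returns a Hash keyed by the discovered URLs; a sketch, again with a hypothetical seed:

require 'wmap'

crawler = Wmap::UrlCrawler.new
# Hash keyed by URL; bounded by @crawl_depth and @crawl_page_limit
url_stores = crawler.crawl_worker("http://www.example.com/")
puts "#{url_stores.keys.size} URLs discovered"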
Fast crawling by using the Parallel fork manager to spawn a number of child processes at the same time; each child process works continuously on the target pool until all the work is done.
# File lib/wmap/url_crawler.rb, line 157
def crawl_workers (targets, num=@max_parallel)
  raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
  puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
  # Eliminate the duplicate target site(s) before the crawlers are invoked
  targets -= ["", nil]
  uniq_sites = Hash.new
  targets.dup.map do |target|
    if is_url?(target)
      host = url_2_host(target)
      ip = host_2_ip(host).to_s
      next if ip.nil?
      port = url_2_port(target).to_s
      next if port.nil?
      site_key = ip + ":" + port
      unless uniq_sites.key?(site_key)
        uniq_sites[site_key] = target
      end
    end
  end
  puts "Sanitization done! " if @verbose
  puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
  puts "Maximum number of web crawling sessions allowed: #{num}"
  raise "Error: target list is empty!" if targets.size < 1
  Parallel.map(uniq_sites.values, :in_processes => num) { |target|
    puts "Working on #{target} ..." if @verbose
    crawl(target)
  }.dup.each do |process|
    puts "process.inspect: #{process}" if @verbose
    urls = process
    urls -= ["", nil] unless urls.nil?
    if urls.nil?
      next
    elsif urls.empty?
      next
    else
      urls.map do |url|
        url.strip!
        @discovered_urls_by_crawler[url] = true unless @discovered_urls_by_crawler.key?(url)
      end
    end
  end
  return @discovered_urls_by_crawler.keys
rescue Exception => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
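Usage sketch (the seed URLs are hypothetical; seeds that resolve to the same IP:port pair are collapsed before the child processes are spawned):

require 'wmap'

crawler = Wmap::UrlCrawler.new(:verbose => true)
seeds = ["http://www.example.com/", "https://login.example.com/"]
urls  = crawler.crawl_workers(seeds, 10)   # at most 10 parallel crawl sessions
puts "Discovered #{urls.size} URLs"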
Fast crawling method that builds the target pool from an input file.
# File lib/wmap/url_crawler.rb, line 210
def crawl_workers_on_file (file)
  puts "Web crawl the list of targets from file: #{file}"
  targets = file_2_list(file)
  sites = crawl_workers(targets, num=@max_parallel)
  return sites
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
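Usage sketch ('seeds.txt' is a hypothetical file containing one seed URL per line):

require 'wmap'

crawler = Wmap::UrlCrawler.new
urls = crawler.crawl_workers_on_file("seeds.txt")
puts urls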
Method to retrieve the discovered site results.
# File lib/wmap/url_crawler.rb, line 328
def get_discovered_sites_by_crawler
  puts "Print summary report of discovered sites. " if @verbose
  puts "\nSummary Report of Discovered Sites from the Crawler:"
  sites = Hash.new
  @discovered_urls_by_crawler.keys.each do |url|
    site = url_2_site(url)
    sites[site] = true unless sites.key?(site)
  end
  sites.keys.map { |site| puts site }
  puts "Total: #{sites.size}"
  puts "End of the summary"
  return sites.keys
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
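Usage sketch (seed URL hypothetical); the return value is the list of unique sites derived from the discovered URLs:

require 'wmap'

crawler = Wmap::UrlCrawler.new
crawler.crawl("http://www.example.com/")
sites = crawler.get_discovered_sites_by_crawler
# e.g. ["http://www.example.com/", "https://login.example.com/"]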
Pre-crawl profiler, used for network profiling to maximize the crawler performance.
# File lib/wmap/url_crawler.rb, line 52
def pre_crawl(url)
  begin
    puts "Perform network profiling works on the web server before the web crawling: #{url}" if @verbose
    host = url_2_host(url)
    # Use the following formula to 'guess' the right http time-out threshold for the scanner
    nwk_to = Wmap::NetworkProfiler.new.profile(host).to_i
    if (1500 + Wmap::NetworkProfiler.new.profile(host)*2).to_i > Max_http_timeout
      @http_timeout = Max_http_timeout
    else
      @http_timeout = 1500 + nwk_to*2
    end
    puts "Done with the pre-scan works: reset @http_timeout to: #{@http_timeout} ms" if @verbose
  rescue Exception => ee
    puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
    @http_timeout = Max_http_timeout
  end
end
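The time-out heuristic above amounts to the following, shown here as a standalone sketch (rtt_ms is a hypothetical stand-in for the Wmap::NetworkProfiler#profile result, in milliseconds):

MAX_HTTP_TIMEOUT = 8000                    # mirrors Max_http_timeout
rtt_ms = 350                               # hypothetical profiling result
http_timeout = [1500 + rtt_ms * 2, MAX_HTTP_TIMEOUT].min
puts http_timeout                          # => 2200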
Method to print out the discovered URL results.
# File lib/wmap/url_crawler.rb, line 302
def print_discovered_urls_by_crawler
  puts "Print discovered url by the crawler. " if @verbose
  puts "\nSummary Report of Discovered URLs from the Crawler:"
  @discovered_urls_by_crawler.keys.each do |url|
    puts url
  end
  puts "Total: #{@discovered_urls_by_crawler.keys.size}"
  puts "End of the summary"
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
Method to save the URL discovery results to a file.
# File lib/wmap/url_crawler.rb, line 317
def save_discovered_urls (file)
  puts "Save discovered urls by the crawler to file: #{file} "
  list_2_file(@discovered_urls_by_crawler.keys, file)
  puts "Done!"
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
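Usage sketch combining the reporting helpers (the seed URL and output path are hypothetical):

require 'wmap'

crawler = Wmap::UrlCrawler.new
crawler.crawl("http://www.example.com/")
crawler.print_discovered_urls_by_crawler                  # report to stdout
crawler.save_discovered_urls("/tmp/discovered_urls.txt")  # persist to file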
Private Instance Methods
Search the page document fetched from 'current_url' and return the URLs found under the same domain.
# File lib/wmap/url_crawler.rb, line 266
def find_urls_on_page(doc, current_url)
  puts "Search and return URLs within the doc: #{doc}" if @verbose
  urls_list = []
  # Case 1 - search embedded HTML tag <a href='url'> for the url elements
  links = doc.css('a')
  links.map do |x|
    new_url = x.attribute('href').to_s
    unless new_url == nil
      if new_url.match("http")
        urls_list.push(new_url)
      else
        new_url = make_absolute(current_url, new_url)
        urls_list.push(new_url)
      end
    end
  end
  # Case 2 - search client side redirect - <meta http-equiv="refresh" content="5;URL='http://example.com/'">
  elements = doc.css("meta[http-equiv]")
  unless elements.size == 0
    link = elements.attr("content").value.split(/url\=/i)[1]
    unless link.nil?
      new_url = make_absolute(current_url, link)
      urls_list.push(new_url) unless new_url.nil?
    end
  end
  return urls_list.uniq - ["", nil]
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
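The two extraction cases can be reproduced with a small standalone Nokogiri sketch (the HTML snippet is made up for illustration):

require 'nokogiri'

html = <<-HTML
  <a href="/login">Sign in</a>
  <meta http-equiv="refresh" content="5;URL=http://www.example.com/home">
HTML
doc = Nokogiri::HTML(html)

# Case 1 - anchor tags (relative links are made absolute by make_absolute)
doc.css('a').each { |a| puts a.attribute('href') }   # prints "/login"

# Case 2 - client-side <meta http-equiv="refresh"> redirect
meta = doc.css("meta[http-equiv]")
puts meta.attr("content").value.split(/url\=/i)[1] unless meta.size == 0
# prints "http://www.example.com/home"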
Wrapper for the OpenURI 'open' method - creates an OpenURI object and returns the reference upon success.
# File lib/wmap/url_crawler.rb, line 223
def open_url(url, user_agent=@user_agent)
  puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
  if url =~ /http\:/i
    # Allow the 'un-safe' URL redirection, i.e. https://www.example.com -> http://www.example.com
    url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000, "User-Agent"=>user_agent)
  elsif url =~ /https\:/i
    url_object = open(url, :ssl_verify_mode=>0, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000, "User-Agent"=>user_agent)
  else
    raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
  end
  return url_object
rescue => ee
  puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
  return nil
end
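For context, a standalone OpenURI sketch with the same options; the :allow_redirections option is assumed to be provided by the open_uri_redirections gem, the URL is hypothetical, and URI.open is used here as the modern form of the Kernel#open call in the source above:

require 'open-uri'
require 'open_uri_redirections'   # assumed provider of :allow_redirections

page = URI.open("https://www.example.com/",
                :ssl_verify_mode    => 0,
                :allow_redirections => :safe,
                :read_timeout       => 8,    # Max_http_timeout / 1000
                "User-Agent"        => "OWASP WMAP Spider")
puts page.base_uri   # final URI after any redirect (see update_url_if_redirected)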
Wrapper around the OpenURI 'read' method that returns the URL body contents.
# File lib/wmap/url_crawler.rb, line 242
def read_url(url)
  puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
  url_object = open_url(url)
  @visited_urls_by_crawler[url] = true unless @visited_urls_by_crawler.key?(url)
  body = url_object.read
  return body
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end
Return the destination URL in the case of a URL redirect.
# File lib/wmap/url_crawler.rb, line 254
def update_url_if_redirected(url, url_object)
  # Compare the original URL with the returned object's base_uri and return the one where the true content is found
  if url != url_object.base_uri.to_s
    return url_object.base_uri.to_s
  end
  return url
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  return nil
end