class Crawler
Attributes
driver1[RW]
driver2[RW]
progress[RW]
site[RW]
Public Class Methods
new(site, test, base, browser = :firefox)
click to toggle source
# File lib/site_context.rb, line 89
#
# Prepares a crawl: creates one screenshot directory per configured screen
# resolution, queues +test+ as the first URL to scrape and opens the browser(s).
#
# @param site [#to_s] name used for the data directory below
#   $HOME/DoubleTake_data
# @param test [String] first URL loaded in @driver1; it is appended to
#   $config.to_be_scraped and its length is stored in @test_domain_length
# @param base [String] URL loaded in @driver2; only used when the global $cf
#   is not "scrape"
# @param browser [Symbol] handed to SiteContext#set_driver
def initialize(site, test, base, browser = :firefox)
  @site = site.to_s
  data_root = "#{ENV['HOME']}/DoubleTake_data/#{@site}"

  $config.SCREEN_RESOLUTION.keys.each do |key|
    FileUtils.mkdir_p "#{data_root}/#{key}"
  end
  puts "* Screenshots are saved in #{data_root}"

  $config.to_be_scraped << test
  @test_domain_length = test.length
  puts "Crawler initialized"

  @driver1 = open_driver(browser, test)
  # In "scrape" mode only one browser is started; @driver2 stays unset.
  @driver2 = open_driver(browser, base) unless $cf == "scrape"
end

# Builds a driver via SiteContext#set_driver, loads +url+ and resizes the
# window to the first entry of $config.SCREEN_RESOLUTION.
def open_driver(browser, url)
  driver = SiteContext.new.set_driver(browser)
  driver.get(url)
  first_resolution = $config.SCREEN_RESOLUTION.first[1]
  driver.manage.window.resize_to(first_resolution[0], first_resolution[1])
  driver
end
private :open_driver
Public Instance Methods
bad_link?(link)
click to toggle source
# File lib/site_context.rb, line 116
#
# Decides whether +link+ must be skipped and records rejected links in
# $config.
#
# A link is rejected when, checked in this order, it
# * contains one of the unwanted fragments below (anchors, query strings,
#   static assets, logout / video pop-up paths) or does not start with
#   "http" -- reported as "Bad Link";
# * does not begin with $config.stage (compared over the first
#   @test_domain_length characters) or contains "%" -- reported as
#   "Out of Scope";
# * is already listed in $config.scraped -- reported as "Already scraped".
#
# Every rejected link is added (once) to $config.scraped and
# $config.bad_links. Accepted links are only printed.
#
# @param link [String, nil] absolute URL taken from an href
# @return [Boolean] true when the link must not be crawled
def bad_link?(link)
  # A missing href can never be crawled and there is nothing worth recording.
  return true if link.nil?

  unwanted_fragments = [
    "$", "#", ".png", ".js",
    ".pdf", ".jpeg", ".css", ".jpg",
    "video/pop", "?", "/user/logout"
  ]

  label =
    if unwanted_fragments.any? { |fragment| link.include?(fragment) } || !link.start_with?("http")
      "Bad Link: "
    elsif link[0..@test_domain_length - 1] != $config.stage || link.include?("%")
      "Out of Scope: "
    elsif $config.scraped.include?(link)
      "Already scraped: "
    end

  if label.nil?
    puts "Good Link: #{link}"
    return false
  end

  puts "#{label}#{link}"
  # For "Already scraped" the first append is a no-op by definition.
  $config.scraped << link unless $config.scraped.include?(link)
  $config.bad_links << link unless $config.bad_links.include?(link)
  true
end
clean_up()
click to toggle source
# File lib/site_context.rb, line 110
#
# Quits the WebDriver instances held by this crawler: always @driver1 and,
# unless the global $cf is "scrape", @driver2 as well.
#
# A driver that was never created is skipped, and @driver2 is still quit when
# quitting @driver1 raises, so a failure on the first browser does not leave
# the second one running. The exception itself is not swallowed.
def clean_up
  begin
    @driver1.quit if @driver1
  ensure
    @driver2.quit if @driver2 && $cf != "scrape"
  end
  puts "Destroyed WebDriver instances."
end
crawl()
click to toggle source
# File lib/site_context.rb, line 159
#
# Works through $config.to_be_scraped until it is empty. For every queued
# link it
# 1. loads the link in @driver1 and appends each anchor href that contains
#    "http" and is not rejected by bad_link? to the queue;
# 2. removes already-scraped or bad links from the queue;
# 3. takes screenshots: in "scrape" mode ($cf) of @driver1 only, otherwise it
#    loads the matching $config.prod URL in @driver2 and calls image_stuff;
# 4. marks the link as scraped, drops it from the queue and writes $config as
#    YAML to $HOME/DoubleTake_data/progress_<site>.yml.
#
# Errors raised while handling one link are printed and the crawl moves on to
# the next link. Only StandardError is rescued so that Ctrl-C and exit still
# stop the crawl.
#
# NOTE(review): the queue is reassigned (Array#-) while the outer +each+ is
# still walking the array it started with, so that pass can visit links that
# have already been handled; the "creeped in" check below exists for that.
# NOTE(review): a link that fails before step 4 is left in the queue and is
# retried on the next pass of the outer loop; a link that fails every time
# keeps the loop from terminating.
def crawl
  loop do
    puts "length of progress.to_be_scraped: #{$config.to_be_scraped.length}"
    break if $config.to_be_scraped.length < 1
    puts "length of @to_be_scrapped: #{$config.to_be_scraped.length}"

    $config.to_be_scraped.each do |each_link|
      puts "* length of @to_be_scrapped: #{$config.to_be_scraped.length}"
      puts "** length of $config.scraped: #{$config.scraped.length}"
      puts "Working on: #{each_link}"

      begin
        @driver1.get(each_link)

        # --- Collect new links and clean the queue ---
        @driver1.find_elements(:xpath, '//a').each do |each_a_obj|
          # Read the attribute once: every call is a WebDriver round trip.
          href = each_a_obj.attribute("href")
          next if href.nil? # some anchors carry no href at all

          $config.to_be_scraped << href if href.include?("http") && !bad_link?(href)
        end
        $config.to_be_scraped.uniq! # Remove duplicate links.

        $config.to_be_scraped.each do |each_new_link|
          if $config.scraped.include?(each_new_link) || bad_link?(each_new_link)
            $config.to_be_scraped = $config.to_be_scraped - [each_new_link]
          end
        end

        if $config.scraped.include?(each_link)
          # A link that is already marked as scraped made it into this pass.
          # It is removed from the queue and reported, but (as before) still
          # processed below.
          $config.to_be_scraped = $config.to_be_scraped - [each_link]
          puts "Already Scrapped linked creeped in: #{each_link}"
        end
        # --- End of link collection ---

        stage_uri = each_link[@test_domain_length..-1]
        prod_link = $config.prod + stage_uri

        if $cf == "scrape"
          $config.SCREEN_RESOLUTION.each do |type, res|
            name = sanitize(stage_uri)
            @driver1.manage.window.resize_to(res[0], res[1])
            @driver1.save_screenshot("#{ENV['HOME']}/DoubleTake_data/#{@site}/#{type}/#{name}_stage.png")
          end
          # Back to the first resolution for the next page load.
          @driver1.manage.window.resize_to($config.SCREEN_RESOLUTION.first[1][0], $config.SCREEN_RESOLUTION.first[1][1])
        else
          @driver2.get(prod_link)
          image_stuff(stage_uri)
        end

        $config.scraped << each_link # Last step: mark the URL as scraped.
        $config.scraped.uniq!        # bad_link? may have added it already
        $config.to_be_scraped = $config.to_be_scraped - [nil]
        $config.to_be_scraped = $config.to_be_scraped - [each_link]
        $config.to_be_scraped.uniq!

        File.open("#{ENV['HOME']}/DoubleTake_data/progress_#{@site}.yml", "w") { |f| f.write($config.to_yaml) }
      rescue Selenium::WebDriver::Error::StaleElementReferenceError => e
        # stage_uri is still nil when the error hit during link collection.
        puts "Stale element error occured moving to next link: #{stage_uri || each_link}"
        puts e
      rescue StandardError => e
        puts "Generic Exception occured"
        puts "#{e.class}: #{e.message}"
        puts e.backtrace
      end
    end
  end

  puts "to_be_scraped: " + $config.to_be_scraped.to_s
  puts "scraped : " + $config.scraped.to_s
end
image_stuff(stage_uri)
click to toggle source
# File lib/site_context.rb, line 233
#
# For every entry of $config.SCREEN_RESOLUTION: resizes both browsers,
# saves a "<name>_stage.png" screenshot from @driver1 and a "<name>_prod.png"
# screenshot from @driver2, and compares the two with the mean squared error
# metric. When the metric exceeds $config.IMAGE_THRESHOLD a "<name>_diff.png"
# is written next to them; otherwise both screenshots are deleted.
#
# @param stage_uri [String] passed to sanitize to obtain the file name stem
def image_stuff(stage_uri)
  shot_name = sanitize(stage_uri)

  $config.SCREEN_RESOLUTION.each do |type, res|
    shot_dir = "#{ENV['HOME']}/DoubleTake_data/#{@site}/#{type}"
    stage_png = "#{shot_dir}/#{shot_name}_stage.png"
    prod_png = "#{shot_dir}/#{shot_name}_prod.png"

    [@driver1, @driver2].each do |driver|
      driver.manage.window.resize_to(res[0], res[1])
    end
    @driver1.save_screenshot(stage_png)
    @driver2.save_screenshot(prod_png)

    stage_frames = ImageList.new(stage_png)
    prod_frames = ImageList.new(prod_png)
    difference, distortion = stage_frames[0].compare_channel(prod_frames[0], Magick::MeanSquaredErrorMetric)

    if distortion > $config.IMAGE_THRESHOLD
      difference.write("#{shot_dir}/" + shot_name + "_diff.png")
    else
      # Screenshots considered identical are not kept.
      [stage_png, prod_png].each { |path| File.delete(path) }
    end
  end
end