class Crawler

Attributes

driver1[RW]
driver2[RW]
progress[RW]
site[RW]

Public Class Methods

new(site, test, base, browser = :firefox) click to toggle source
# File lib/site_context.rb, line 89
# Sets up a crawl of one site.
#
# Creates one screenshot directory per key of $config.SCREEN_RESOLUTION under
# ~/DoubleTake_data/<site>, queues the +test+ URL in $config.to_be_scraped and
# remembers its length, then starts the stage browser (@driver1) on +test+.
# Unless the global mode $cf is "scrape", a second browser (@driver2) is
# started on +base+ as well. Every browser window is resized to the first
# configured resolution.
#
# site    - identifier of the site; its string form names the data directory.
# test    - URL opened in @driver1 and queued for crawling.
# base    - URL opened in @driver2 (not used when $cf == "scrape").
# browser - value handed to SiteContext#set_driver (default :firefox).
def initialize(site, test, base, browser = :firefox)
        @site = site.to_s
        data_root = "#{ENV['HOME']}/DoubleTake_data/#{@site}"
        $config.SCREEN_RESOLUTION.keys.each { |key| FileUtils.mkdir_p("#{data_root}/#{key}") }
        puts "* Screenshots are saved in #{data_root}"

        $config.to_be_scraped << test
        @test_domain_length = test.length
        puts "Crawler initialized"

        # Loads +url+ in +driver+ and sizes its window to the first resolution.
        open_at_first_resolution = lambda do |driver, url|
                driver.get(url)
                first_res = $config.SCREEN_RESOLUTION.first[1]
                driver.manage.window.resize_to(first_res[0], first_res[1])
        end

        @driver1 = SiteContext.new.set_driver(browser)
        open_at_first_resolution.call(@driver1, test)
        return if $cf == "scrape"

        @driver2 = SiteContext.new.set_driver(browser)
        open_at_first_resolution.call(@driver2, base)
end

Public Instance Methods

clean_up() click to toggle source
# File lib/site_context.rb, line 110
# Shuts down the WebDriver sessions held by this crawler.
#
# @driver1 is quit first; @driver2 is quit unless the global mode $cf is
# "scrape". The second quit runs from an ensure clause, so an error raised
# while quitting @driver1 no longer leaves the second browser running. A
# driver that is nil is skipped instead of raising NoMethodError.
#
# Raises whatever a driver's quit raises; the confirmation message is printed
# only when no error occurred.
def clean_up
        begin
                @driver1.quit if @driver1
        ensure
                @driver2.quit if @driver2 && $cf != "scrape"
        end
        puts "Destroyed WebDriver instances."
end
crawl() click to toggle source
# File lib/site_context.rb, line 159
# Crawls every queued link until the queue ($config.to_be_scraped) is empty.
#
# For each link the stage browser (@driver1) loads the page, every anchor href
# that contains "http" and is not rejected by bad_link? is queued, and the
# queue is purged of links that are already in $config.scraped or that
# bad_link? rejects. The page is then captured: when $cf is "scrape" one stage
# screenshot per configured resolution is saved, otherwise the prod browser
# (@driver2) loads $config.prod + path and image_stuff compares both sides.
# Finally the link is recorded in $config.scraped, removed from the queue, and
# $config is dumped as YAML to ~/DoubleTake_data/progress_<site>.yml.
#
# A link whose visit raises a StandardError stays queued and is retried on a
# later pass, but at most max_attempts times; after that it is dropped from
# the queue and not queued again during this call, so a permanently failing
# page cannot keep the loop running forever. Interrupt, SystemExit and other
# non-StandardError exceptions propagate to the caller.
#
# max_attempts - number of failed visits tolerated per link (default 3;
#                values below 1 are treated as 1).
#
# Returns the result of the final puts (nil).
def crawl(max_attempts = 3)
        max_attempts = [max_attempts.to_i, 1].max
        failures = Hash.new(0) # link => failed visits during this call

        # Counts a failed visit and drops the link once its attempts are used up.
        note_failure = lambda do |link|
                failures[link] += 1
                if failures[link] >= max_attempts
                        $config.to_be_scraped = $config.to_be_scraped - [link]
                        puts "Giving up on #{link} after #{failures[link]} failed attempt(s)"
                end
        end

        loop do
                puts "length of      progress.to_be_scraped: #{$config.to_be_scraped.length}"
                break if $config.to_be_scraped.empty?
                puts "length of @to_be_scrapped: #{$config.to_be_scraped.length}"
                # Walk a snapshot of the queue: the queue itself is appended to,
                # de-duplicated and reassigned while a page is being processed.
                # Links discovered during this pass are visited on the next pass.
                $config.to_be_scraped.dup.each do |each_link|
                        puts "*  length of @to_be_scrapped: #{$config.to_be_scraped.length}"
                        puts "** length of $config.scraped: #{$config.scraped.length}"
                        puts "Working on: #{each_link}"
                        begin
                                @driver1.get(each_link)

                                # Collect new links from the page.
                                @driver1.find_elements(:xpath, '//a').each do |each_a_obj|
                                        # Read the href once instead of up to four times per anchor.
                                        href = each_a_obj.attribute("href")
                                        next if href.nil? # some anchors carry no href
                                        next unless href.include?("http")
                                        next if failures[href] >= max_attempts # already given up on
                                        $config.to_be_scraped << href if bad_link?(href) == false
                                end
                                $config.to_be_scraped.uniq! # Remove duplicate links.

                                # Drop everything already scraped or rejected by bad_link?.
                                $config.to_be_scraped = $config.to_be_scraped.reject { |queued| $config.scraped.include?(queued) || bad_link?(queued) }

                                if $config.scraped.include?(each_link)
                                        # An already scraped link made it into this pass. It is
                                        # reported and removed from the queue, but it is still
                                        # processed below (the original skip is disabled).
                                        $config.to_be_scraped = $config.to_be_scraped - [each_link]
                                        puts "Already Scrapped linked creeped in: #{each_link}"
                                end

                                # Path of the link relative to the test domain.
                                stage_uri = each_link[@test_domain_length..-1]
                                if $cf == "scrape"
                                        name = sanitize(stage_uri)
                                        $config.SCREEN_RESOLUTION.each do |type, res|
                                                @driver1.manage.window.resize_to(res[0], res[1])
                                                @driver1.save_screenshot("#{ENV['HOME']}/DoubleTake_data/#{@site}/#{type}/#{name}_stage.png")
                                        end
                                        # Back to the first resolution for the next page load.
                                        first_res = $config.SCREEN_RESOLUTION.first[1]
                                        @driver1.manage.window.resize_to(first_res[0], first_res[1])
                                else
                                        # The prod URL is only built when it is actually needed.
                                        @driver2.get($config.prod + stage_uri)
                                        image_stuff(stage_uri)
                                end

                                # Last step: mark the URL as scraped and persist progress.
                                $config.scraped << each_link
                                $config.scraped.uniq! # bad_link? may add duplicate entries
                                $config.to_be_scraped = ($config.to_be_scraped - [nil, each_link]).uniq
                                File.open("#{ENV['HOME']}/DoubleTake_data/progress_#{@site}.yml", "w") { |f| f.write($config.to_yaml) }
                        rescue Selenium::WebDriver::Error::StaleElementReferenceError => e
                                # Report the full link: the relative path is not known yet
                                # when the error comes from the element lookup above.
                                puts "Stale element error occured moving to next link: #{each_link}"
                                puts e
                                note_failure.call(each_link)
                        rescue StandardError => e
                                puts "Generic Exception occured"
                                puts "#{e.class}: #{e.message}"
                                puts e.backtrace
                                note_failure.call(each_link)
                        end
                end
        end
        puts "to_be_scraped: " + $config.to_be_scraped.to_s
        puts "scraped      : " + $config.scraped.to_s
end
image_stuff(stage_uri) click to toggle source
# File lib/site_context.rb, line 233
# Screenshots the current page of both browsers at every configured
# resolution and keeps the images only where they differ.
#
# For each (type, resolution) pair of $config.SCREEN_RESOLUTION both windows
# are resized, @driver1 is saved as <name>_stage.png and @driver2 as
# <name>_prod.png under ~/DoubleTake_data/<site>/<type>/, where <name> is
# sanitize(stage_uri). The first frames of the two files are compared with
# the mean-squared-error metric: above $config.IMAGE_THRESHOLD a
# <name>_diff.png is written next to them, otherwise both screenshots are
# deleted again.
#
# stage_uri - path of the page; only used to derive the file name.
def image_stuff(stage_uri)
        shot_name = sanitize(stage_uri)
        $config.SCREEN_RESOLUTION.each do |type, res|
                # Bring both windows to the same size before capturing.
                [@driver1, @driver2].each { |driver| driver.manage.window.resize_to(res[0], res[1]) }

                shot_dir = "#{ENV['HOME']}/DoubleTake_data/#{@site}/#{type}"
                stage_png = "#{shot_dir}/#{shot_name}_stage.png"
                prod_png = "#{shot_dir}/#{shot_name}_prod.png"
                @driver1.save_screenshot(stage_png)
                @driver2.save_screenshot(prod_png)

                stage_list = ImageList.new(stage_png)
                prod_list = ImageList.new(prod_png)
                diff_img, diff_metric = stage_list[0].compare_channel(prod_list[0], Magick::MeanSquaredErrorMetric)

                if diff_metric > $config.IMAGE_THRESHOLD
                        # Pages differ: keep both screenshots and add the diff image.
                        diff_img.write("#{shot_dir}/#{shot_name}_diff.png")
                        next
                end

                # Pages match: the screenshots are of no further use.
                [stage_png, prod_png].each { |png| File.delete(png) }
        end
end