class GoogleScraper
Constants
- BASE_URL
- CSS_SELECTOR
- RANK_LIMIT
- SEARCH
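The values of these constants are not reproduced on this page. A minimal sketch of definitions consistent with how the methods below compose them might look like the following; every value here is an assumption inferred from usage, not the gem's actual source:

  # All values assumed for illustration; see lib/google_scraper_gem.rb for the real ones.
  BASE_URL     = 'http://www.google.com'  # assumed: checkRank prepends this to SEARCH
  SEARCH       = '/search?q='             # assumed: query path appended before the URI-encoded keyword
  CSS_SELECTOR = 'h3.r a'                 # assumed: matches result links whose href attribute is read
  RANK_LIMIT   = 100                      # assumed: how many results checkRank scans before returning -1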
Public Class Methods
new()
  # File lib/google_scraper_gem.rb, line 11
  def initialize()
    # proxy = {:host => "204.12.216.84", :port => 20602, :username => 'pp-henryjay', :password => 'rein&true'}
    @mech = Mechanize.new { |agent|
      # User Agent list - http://github.com/tenderlove/mechanize/blob/master/lib/mechanize.rb
      agent.user_agent_alias = 'Mac Safari'
    }
    # @mech.keep_alive = false
    # @mech.set_proxy proxy[:host], proxy[:port], proxy[:username], proxy[:password]
  end
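Constructing a scraper is a one-liner. A usage sketch, assuming the gem is required by the file name shown in the source paths above:

  require 'google_scraper_gem'

  scraper = GoogleScraper.new  # the Mechanize agent now identifies itself as Mac Safari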
Public Instance Methods
checkRank(keyword, url, locale = 'us', language = 'en')
@return Rank of url for keyword on google.com for the specified locale and language, or -1 if the URL is not found within the first RANK_LIMIT results
  # File lib/google_scraper_gem.rb, line 23
  def checkRank(keyword, url, locale = 'us', language = 'en')
    rank_count = 0
    page_num = 1
    uri = BASE_URL + SEARCH + URI.encode(keyword) + "&gl=#{locale}&lr=lang_#{language}"
    page = @mech.get(uri)

    while rank_count < RANK_LIMIT
      # This parse definition requires the Mac Safari user agent.
      page.parser.css(CSS_SELECTOR).each do |cite|
        rank_count += 1
        result = cite.attr('href')
        # Strip the scheme so the result can be compared against a bare domain.
        result.gsub!(%r{^http://}, '')
        result.gsub!(%r{^https://}, '')
        return rank_count if result.start_with?(url) or result.start_with?("www." + url)
      end

      # Get next search result page.
      page_num += 1
      # Add random sleep to prevent blocking of IP.
      # sleep(rand(3..9))
      # TODO: Click "next" instead?
      page = page.link_with(:text => page_num.to_s).click
    end

    return -1
  end
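A usage sketch with hypothetical inputs; because the method strips the scheme before comparing, pass url as a bare domain. Note also that URI.encode was removed in Ruby 3.0, so the code as shown targets older Rubies:

  scraper = GoogleScraper.new
  # Hypothetical inputs: where does example.com rank for "ruby scraper"?
  rank = scraper.checkRank('ruby scraper', 'example.com', 'us', 'en')
  puts rank == -1 ? 'not in the first RANK_LIMIT results' : "ranked ##{rank}"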
getTopResults(keyword, extension = '.com', top = 10)
@return Array of result URLs in the order Google returns them.
  # File lib/google_scraper_gem.rb, line 60
  def getTopResults(keyword, extension = '.com', top = 10)
    results = []
    rank_count = 0
    page_num = 1
    uri = 'http://www.google' + extension + SEARCH + URI.encode(keyword)
    page = @mech.get(uri)

    while rank_count < top
      # This parse definition requires the Mac Safari user agent.
      page.parser.css(CSS_SELECTOR).each do |cite|
        rank_count += 1
        result = cite.attr('href')
        puts result
        results << result if result.start_with? 'http'
      end
      return results if rank_count >= top

      # Confirm that there are ten results in this cycle.
      unless rank_count % 10 == 0
        puts "WARNING: There were #{rank_count} results instead of 10 near page #{page_num} for '#{keyword}'."
      end

      # Get next search result page.
      page_num += 1
      # Add random sleep to prevent blocking of IP.
      # TODO: Replace this with proxy swap.
      # sleep(rand(8))
      page = page.link_with(:text => page_num.to_s).click
    end

    results
  end
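A usage sketch with hypothetical inputs. Because the count is only checked between pages, the returned array can hold slightly more than top entries when a page pushes the count past the limit:

  scraper = GoogleScraper.new
  # Hypothetical inputs: top ten results from google.co.uk
  urls = scraper.getTopResults('ruby scraper', '.co.uk', 10)
  urls.each_with_index { |url, i| puts "#{i + 1}. #{url}" }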