class GoogleScraper

Constants

BASE_URL
CSS_SELECTOR
RANK_LIMIT

Public Class Methods

new()
# File lib/google_scraper_gem.rb, line 11
# Builds the Mechanize agent that every scraping method of this class uses.
#
# The agent identifies itself as 'Mac Safari'; the result parsing done by the
# other methods is written against the markup served to that user agent.
# Available aliases: http://github.com/tenderlove/mechanize/blob/master/lib/mechanize.rb
#
# Proxy support and the keep-alive override are currently disabled. If a proxy
# is wired in again (@mech.set_proxy host, port, username, password), read the
# credentials from configuration or the environment rather than hard-coding
# them in this file.
def initialize
  @mech = Mechanize.new do |agent|
    agent.user_agent_alias = 'Mac Safari'
  end
end

Public Instance Methods

checkRank(keyword, url, locale = 'us', language = 'en')

@return [Integer] 1-based rank of URL among the Google results for keyword in the specified locale and language, or -1 if the URL is not found within RANK_LIMIT results.

# File lib/google_scraper_gem.rb, line 23
# Finds the position of +url+ among the Google results for +keyword+.
#
# Result pages are fetched one after another by following the numbered
# pagination links. Every element matched by CSS_SELECTOR counts as one rank
# position, including elements without an href.
#
# RANK_LIMIT is only checked between pages, so a match found further down a
# page that crosses the limit is still reported.
#
# @param keyword [String] search phrase; it is form-encoded before being
#   placed in the query string
# @param url [String] URL prefix to look for, without scheme
#   (for example "example.com"); a leading "www." on the result is tolerated
# @param locale [String] value sent as the +gl+ query parameter
# @param language [String] language code appended to +lr=lang_+
# @return [Integer] 1-based rank of the first matching result, or -1 when no
#   result matches before RANK_LIMIT is reached or the result pages run out
def checkRank(keyword, url, locale = 'us', language = 'en')
  rank_count = 0
  page_num = 1

  # URI.encode no longer exists on Ruby 3.0+ and never escaped reserved
  # characters such as "&", which corrupted the query for such keywords.
  query = URI.encode_www_form_component(keyword)
  uri = "#{BASE_URL}#{SEARCH}#{query}&gl=#{locale}&lr=lang_#{language}"

  page = @mech.get(uri)
  while rank_count < RANK_LIMIT
    # This parse definition requires the Mac Safari user agent.
    page.parser.css(CSS_SELECTOR).each do |cite|
      rank_count += 1

      href = cite.attr('href')
      next unless href # element without a link: occupies a rank but cannot match

      result = href.sub(%r{\Ahttps?://}, '')
      return rank_count if result.start_with?(url, "www.#{url}")
    end

    # Get next search result page.
    page_num += 1

    # Add random sleep to prevent blocking of IP.
    # sleep(rand(3..9))

    # TODO: Click "next" instead?
    next_link = page.link_with(:text => page_num.to_s)
    break unless next_link # no further result pages: the URL is not ranked

    page = next_link.click
  end

  -1
end
getTopResults(keyword, extension = '.com', top = 10)

@return Array of result URLs (hrefs beginning with "http") in the order specified by Google.

# File lib/google_scraper_gem.rb, line 60
# Collects the result links Google returns for +keyword+.
#
# Result pages are fetched one after another by following the numbered
# pagination links until at least +top+ elements matching CSS_SELECTOR have
# been seen or no further page exists.
#
# Counting is done per element and per page: every matched element counts
# towards +top+, but only hrefs beginning with "http" are collected, and a page
# is always processed completely. The returned array can therefore hold more
# or fewer than +top+ entries.
#
# A warning is written to stdout when the running element count after a page
# is not a multiple of ten.
#
# @param keyword [String] search phrase; it is form-encoded before being
#   placed in the query string
# @param extension [String] Google domain suffix appended to
#   "http://www.google" (for example ".com" or ".co.uk")
# @param top [Integer] number of result elements to look at before stopping
# @return [Array<String>] collected hrefs, in page order
def getTopResults(keyword, extension = '.com', top = 10)
  results = []

  rank_count = 0
  page_num = 1

  # URI.encode no longer exists on Ruby 3.0+ and never escaped reserved
  # characters such as "&", which corrupted the query for such keywords.
  query = URI.encode_www_form_component(keyword)
  uri = "http://www.google#{extension}#{SEARCH}#{query}"

  page = @mech.get(uri)
  while rank_count < top
    # This parse definition requires the Mac Safari user agent.
    page.parser.css(CSS_SELECTOR).each do |cite|
      rank_count += 1

      result = cite.attr('href')
      # Elements without an href, or with a non-http href, are counted but not collected.
      results << result if result && result.start_with?('http')
    end

    return results if rank_count >= top

    # Confirm that there are ten results in this cycle.
    unless (rank_count % 10).zero?
      puts "WARNING: There were #{rank_count} results instead of 10 near page #{page_num} for '#{keyword}'."
    end

    # Get next search result page.
    page_num += 1

    # Add random sleep to prevent blocking of IP.
    # TODO: Replace this with proxy swap.
    # rand(8)

    next_link = page.link_with(:text => page_num.to_s)
    break unless next_link # no further result pages: return what was collected

    page = next_link.click
  end

  results
end