class GScraper::Search::WebQuery

Constants

LICENSES

Web Search licenses

PATH

Web Search path

RESULTS_PER_PAGE

Default results per-page

Attributes

filtered[RW]

Filter the search results

in_format[RW]

Search for results in the format

inside_domain[RW]

Search for results inside the domain

not_in_format[RW]

Search for results not in the format

occurs_within[RW]

Search for results where the query occurs within the area

outside_domain[RW]

Search for results outside the domain

region[RW]

Search for results from the region

results_per_page[RW]

Results per-page

rights[RW]

Search for results which have the rights

within_past_day[RW]

Search for results within the past day

within_past_months[RW]

Search for results within the past months

within_past_week[RW]

Search for results within the past week

within_past_year[RW]

Search for results within the past year

Public Class Methods

from_url(url,options={},&block) click to toggle source

Creates a new Web query from a search URL.

@param [URI::HTTP, String] url

The search URL.

@param [Hash] options

Additional options.

@yield [query]

If a block is given, it will be passed the new Web query.

@yieldparam [WebQuery] query

The new web query.

@return [WebQuery]

The new Web query.

@example

WebQuery.from_url('http://www.google.com/search?q=ruby+zen')

@example

WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
  q.within_past_months = 1
  q.occurs_within = :title
end
# File lib/gscraper/search/web_query.rb, line 223
# Creates a new Web query from a search URL.
#
# The query-string parameters of the URL are translated into the option
# keys understood by the constructor. The caller's +options+ hash is
# copied before being filled in, so it is never modified.
#
# @param [URI::HTTP, String] url
#   The search URL.
#
# @param [Hash] options
#   Additional options. Keys derived from the URL take precedence.
#
# @yield [query]
#   If a block is given, it is forwarded to the constructor.
#
# @return [WebQuery]
#   The new Web query.
def self.from_url(url,options={},&block)
  url     = URI(url.to_s)
  params  = url.query_params
  # Work on a copy so the hash passed in by the caller is left untouched.
  options = options.dup

  options[:search_host] = url.host

  options[:results_per_page] = if params['num']
                                 params['num'].to_i
                               else
                                 RESULTS_PER_PAGE
                               end

  options[:query]         = params['q']
  options[:exact_phrase]  = params['as_epq']
  options[:with_words]    = params['as_oq']
  options[:without_words] = params['as_eq']

  options[:language] = params['lr']
  options[:region]   = params['cr']

  options[:filetype] = params['as_filetype'] if params['as_filetype']

  # 'as_qdr' encodes the time window: day, week, N months or year.
  case params['as_qdr']
  when 'd'  then options[:within_past_day]    = true
  when 'w'  then options[:within_past_week]   = true
  when 'm'  then options[:within_past_months] = 1
  when 'm2' then options[:within_past_months] = 2
  when 'm3' then options[:within_past_months] = 3
  when 'm6' then options[:within_past_months] = 6
  when 'y'  then options[:within_past_year]   = true
  end

  if params['as_nlo'] || params['as_nhi']
    # A missing bound becomes 0, since nil.to_i is 0.
    options[:numeric_range] = Range.new(
      params['as_nlo'].to_i,
      params['as_nhi'].to_i
    )
  end

  options[:occurs_within] = params['as_occt'].to_sym if params['as_occt']

  options[:site] = params['as_sitesearch']

  options[:rights] = LICENSES[params['as_rights']]
  # The parameter hash is keyed by Strings (like every other lookup
  # here); a Symbol key would never match and SafeSearch would always
  # be reported as disabled.
  options[:filtered] = (params['safe'] == 'active')

  if params['as_rq']
    options[:related] = params['as_rq']
  elsif params['as_lq']
    options[:link] = params['as_lq']
  end

  WebQuery.new(options,&block)
end
new(options={},&block) click to toggle source

Creates a new Web query.

@param [Hash] options

Additional options.

@option options [String] :search_host (www.google.com)

The host to submit queries to.

@option options [Integer] :results_per_page

Specifies the number of results for each page.

@option options [String, Symbol] :language (Languages.native)

Search for results in the specified language.

@option options [String] :region

Search for results from the specified region.

@option options [Boolean] :within_past_day

Search for results that were created within the past day.

@option options [Boolean] :within_past_week

Search for results that were created within the past week.

@option options [Integer] :within_past_months

Search for results that were created within the given number of past
months (1, 2, 3 or 6).

@option options [Boolean] :within_past_year

Search for results that were created within the past year.

@option options [:title, :body, :url, :links] :occurs_within

Searches for results where the keywords occur within a specific
part of the result page.

@option options [Symbol] :rights

Search for results licensed under the specified license.

@option options [Boolean] :filtered

Specifies whether or not to use SafeSearch.

@yield [query]

If a block is given, it will be passed the new Web query.

@yieldparam [WebQuery] query

The new Web query.

@return [WebQuery]

The new Web query.

@example

WebQuery.new(:query => 'ruby', :with_words => 'sow rspec')

@example

WebQuery.new(:exact_phrase => 'fluent interfaces') do |q|
  q.within_past_week = true
end
Calls superclass method GScraper::Search::Query::new
# File lib/gscraper/search/web_query.rb, line 149
# Creates a new Web query.
#
# At most one of the time-window settings is kept. They are checked in
# the order day, week, months, year, and the first one present in
# +options+ wins; the others are set to +false+.
#
# @param [Hash] options
#   Additional options; also passed on to the superclass constructor.
#
# @yield [query]
#   If a block is given, it is forwarded to the superclass constructor.
def initialize(options={},&block)
  @agent            = GScraper.web_agent(options)
  @results_per_page = options.fetch(:results_per_page,RESULTS_PER_PAGE)
  @region           = options[:region]

  # Start with every time window disabled, then enable the first
  # requested one.
  @within_past_day = @within_past_week = false
  @within_past_months = @within_past_year = false

  if (day = options[:within_past_day])
    @within_past_day = day
  elsif (week = options[:within_past_week])
    @within_past_week = week
  elsif (months = options[:within_past_months])
    @within_past_months = months
  elsif (year = options[:within_past_year])
    @within_past_year = year
  end

  @occurs_within = options[:occurs_within]
  @rights        = options[:rights]
  @filtered      = options[:filtered]

  super(options,&block)
end

Public Instance Methods

page(page_index) click to toggle source

Returns a page containing results at the specific page index.

@param [Integer] page_index

The page index to query.

@return [Page<Result>]

The page at the given index for the query.
# File lib/gscraper/search/web_query.rb, line 387
# Fetches the page of results at the given page index.
#
# @param [Integer] page_index
#   The page index to query.
#
# @return [Page<Result>]
#   The page at the given index for the query.
#
# @raise [Blocked]
#   The fetched document contains the link to Google's answer 86640.
def page(page_index)
  Page.new do |new_page|
    doc = @agent.get(page_url(page_index))

    blocked_notice = doc.at('//div/a[@href="http://www.google.com/support/bin/answer.py?answer=86640"]')
    raise(Blocked,"Google has temporarily blocked our IP Address",caller) if blocked_notice

    hits      = doc.search('//li[@class="g"]')
    hit_count = [@results_per_page, hits.length].min
    base_rank = result_offset_of(page_index)

    (0...hit_count).each do |position|
      hit    = hits[position]
      rank   = base_rank + (position + 1)
      anchor = hit.at('.//h3/a')
      title  = anchor.inner_text

      # The anchor's href carries the real destination in its 'q' parameter.
      target = URI(anchor.get_attribute('href')).query_params['q']
      url    = URI(target)

      summary_text = ''
      snippet      = hit.at('.//div[@class="s"]','.//td[@class="j"]//font')

      if snippet
        # Collect text up to (not including) the first <br> element.
        snippet.children.each do |node|
          break if !node.text? && node.name == 'br'

          summary_text << node.inner_text
        end
      end

      cached_url = similar_url = nil
      links_box  = hit.at('.//div[@class="s"]')

      if links_box
        cached_link = links_box.at('.//a[1]')
        if cached_link
          cached_url = URI("http://#{search_host}" + cached_link.get_attribute('href'))
        end

        similar_link = links_box.at('.//a[2]')
        if similar_link
          similar_url = URI("http://#{search_host}" + similar_link.get_attribute('href'))
        end
      end

      new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
    end
  end
end
page_url(page_index) click to toggle source

Returns the URL that represents the query at a specific page index.

@param [Integer] page_index

The page index to create the URL for.

@return [URI::HTTP]

The URL for a query at the given page index.
# File lib/gscraper/search/web_query.rb, line 369
# Builds the URL that represents the query at a specific page index.
#
# Takes the URL from +search_url+ and sets its 'start' parameter to the
# result offset of the page and its 'sa' parameter to 'N'.
#
# @param [Integer] page_index
#   The page index to create the URL for.
#
# @return [URI::HTTP]
#   The URL for the query at the given page index.
def page_url(page_index)
  search_url.tap do |paged|
    paged.query_params['start'] = result_offset_of(page_index)
    paged.query_params['sa']    = 'N'
  end
end
result_at(index) click to toggle source

Returns the result at the specified index.

@param [Integer] index

The index of the result.
# File lib/gscraper/search/web_query.rb, line 453
# Returns the result at the specified index.
#
# Fetches the page that +page_index_of+ maps the index to, then picks
# the entry at the position given by +result_index_of+.
#
# @param [Integer] index
#   The index of the result.
def result_at(index)
  containing_page = page(page_index_of(index))
  containing_page[result_index_of(index)]
end
search_url() click to toggle source

The URL that represents the query.

@return [URI::HTTP]

The URL for the query.
# File lib/gscraper/search/web_query.rb, line 294
# Builds the URL that represents the query.
#
# Each search setting that is set adds one query parameter; settings
# that are nil or false add nothing.
#
# @return [URI::HTTP]
#   The URL for the query.
def search_url
  url = URI::HTTP.build(:host => search_host, :path => PATH)

  # Adds the parameter only when a value is present.
  set_param = lambda { |param,value|
    url.query_params[param.to_s] = value if value
  }

  set_param.call('num',@results_per_page)
  set_param.call('q',expression)
  set_param.call('as_epq',@exact_phrase)
  set_param.call('as_oq',@with_words)
  set_param.call('as_eq',@without_words)

  set_param.call('lr',@language)
  set_param.call('cr',@region)

  set_param.call('as_filetype',@filetype)

  # Only one time window is sent; day takes priority, then week,
  # months and year. Month counts other than 1, 2, 3 or 6 are ignored.
  time_window = if @within_past_day
                  'd'
                elsif @within_past_week
                  'w'
                elsif @within_past_months
                  case @within_past_months
                  when 1 then 'm'
                  when 2 then 'm2'
                  when 3 then 'm3'
                  when 6 then 'm6'
                  end
                elsif @within_past_year
                  'y'
                end
  set_param.call('as_qdr',time_window)

  if @numeric_range.kind_of?(Range)
    url.query_params['as_nlo'] = @numeric_range.begin
    url.query_params['as_nhi'] = @numeric_range.end
  end

  # Accepts either the Symbol or the String form of each area.
  case @occurs_within
  when :title, 'title' then url.query_params['as_occt'] = 'title'
  when :body, 'body'   then url.query_params['as_occt'] = 'body'
  when :url, 'url'     then url.query_params['as_occt'] = 'url'
  when :links, 'links' then url.query_params['as_occt'] = 'links'
  end

  set_param.call('as_sitesearch',@site)

  # LICENSES is indexed by the parameter value, so the reverse lookup
  # goes through Hash#invert (core Hash has no #reverse method). A
  # license missing from LICENSES adds no parameter.
  set_param.call('as_rights',LICENSES.invert[@rights]) if @rights

  set_param.call('safe','active') if @filtered

  url
end
top_result() click to toggle source

Returns the first result on the first page.

@return [Result]

The first result.
# File lib/gscraper/search/web_query.rb, line 443
# Returns the first result on the first page.
#
# @return [Result]
#   The first result.
def top_result
  opening_page = first_page
  opening_page.first
end