class GScraper::Search::WebQuery
Constants
Attributes
Filter the search results
Search
for results in the format
Search
for results inside the domain
Search
for results not in the format
Search
for results where the query occurs within the area
Search
for results outside the domain
Search
for results from the region
Results per-page
Search
for results which have the rights
Search
for results within the past day
Search
for results within the past months
Search
for results within the past week
Search
for results within the past year
Public Class Methods
Creates a new Web query from a search URL.
@param [URI::HTTP, String] url
The search URL.
@param [Hash] options
Additional options.
@yield [query]
If a block is given, it will be passed the new Web query.
@yieldparam [WebQuery] query
The new web query.
@return [WebQuery]
The new Web query.
@example
WebQuery.from_url('http://www.google.com/search?q=ruby+zen')
@example
WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
  q.within_past_months = 1
  q.occurs_within = :title
end
# File lib/gscraper/search/web_query.rb, line 223
#
# Creates a new Web query from a search URL.
#
# @param [URI::HTTP, String] url
#   The search URL.
#
# @param [Hash] options
#   Additional options. The Hash is copied before the values parsed from
#   the URL are merged in, so the caller's Hash is left untouched.
#
# @yield [query]
#   If a block is given, it will be passed the new Web query.
#
# @yieldparam [WebQuery] query
#   The new web query.
#
# @return [WebQuery]
#   The new Web query.
#
# NOTE: defined with +self.+, which inside the class body is the same
# method as +WebQuery.from_url+.
#
def self.from_url(url,options={},&block)
  url = URI(url.to_s)
  params = url.query_params

  # work on a copy so the caller's options are not modified
  options = options.dup

  options[:search_host] = url.host
  options[:results_per_page] = if params['num']
                                 params['num'].to_i
                               else
                                 RESULTS_PER_PAGE
                               end

  options[:query] = params['q']
  options[:exact_phrase] = params['as_epq']
  options[:with_words] = params['as_oq']
  options[:without_words] = params['as_eq']

  options[:language] = params['lr']
  options[:region] = params['cr']

  options[:filetype] = params['as_filetype'] if params['as_filetype']

  # as_qdr encodes the "within the past ..." time window
  case params['as_qdr']
  when 'd'  then options[:within_past_day] = true
  when 'w'  then options[:within_past_week] = true
  when 'm'  then options[:within_past_months] = 1
  when 'm2' then options[:within_past_months] = 2
  when 'm3' then options[:within_past_months] = 3
  when 'm6' then options[:within_past_months] = 6
  when 'y'  then options[:within_past_year] = true
  end

  # a missing bound is read as 0 by String#to_i / NilClass#to_i
  if params['as_nlo'] || params['as_nhi']
    options[:numeric_range] = Range.new(
      params['as_nlo'].to_i,
      params['as_nhi'].to_i
    )
  end

  options[:occurs_within] = params['as_occt'].to_sym if params['as_occt']

  options[:site] = params['as_sitesearch']
  options[:rights] = LICENSES[params['as_rights']]

  # every other parameter is looked up by its String name; the previous
  # Symbol key (:safe) never matched, so SafeSearch was always reported off
  options[:filtered] = (params['safe'] == 'active')

  # a "related" query takes precedence over a "link" query
  if params['as_rq']
    options[:related] = params['as_rq']
  elsif params['as_lq']
    options[:link] = params['as_lq']
  end

  WebQuery.new(options,&block)
end
Creates a new Web query.
@param [Hash] options
Additional options.
@option options [String] :search_host (www.google.com)
The host to submit queries to.
@option options [Integer] :results_per_page
Specifies the number of results for each page.
@option options [String, Symbol] :language (Languages.native)
Search for results in the specified language.
@option options [String] :region
Search for results from the specified region.
@option options [Boolean] :within_past_day
Search for results that were created within the past day.
@option options [Boolean] :within_past_week
Search for results that were created within the past week.
@option options [Integer] :within_past_months
Search for results that were created within the given number of past months (1, 2, 3 or 6).
@option options [Boolean] :within_past_year
Search for results that were created within the past year.
@option options [:title, :body, :url, :links] :occurs_within
Searches for results where the keywords occur within a specific part of the result page.
@option options [Symbol] :rights
Search for results licensed under the specified license.
@option options [Boolean] :filtered
Specifies whether or not to use SafeSearch.
@yield [query]
If a block is given, it will be passed the new Web query.
@yieldparam [WebQuery] query
The new Web query.
@return [WebQuery]
The new Web query.
@example
WebQuery.new(:query => 'ruby', :with_words => 'sow rspec')
@example
WebQuery.new(:exact_phrase => 'fluent interfaces') do |q| q.within_past_week = true end
GScraper::Search::Query::new
# File lib/gscraper/search/web_query.rb, line 149
#
# Creates a new Web query.
#
# Reads +:results_per_page+, +:region+, the +:within_past_*+ time-window
# options, +:occurs_within+, +:rights+ and +:filtered+ from +options+, then
# hands +options+ and the block on to the superclass initializer.
#
# @param [Hash] options
#   Additional options.
#
# @yield [query]
#   If a block is given, it is forwarded to the superclass initializer.
#
def initialize(options={},&block)
  @agent = GScraper.web_agent(options)

  @results_per_page = options.fetch(:results_per_page,RESULTS_PER_PAGE)
  @region = options[:region]

  # Only one time window is ever active. Start with all of them disabled,
  # then enable the first one given, in day > week > months > year order.
  @within_past_day = false
  @within_past_week = false
  @within_past_months = false
  @within_past_year = false

  if options[:within_past_day]
    @within_past_day = options[:within_past_day]
  elsif options[:within_past_week]
    @within_past_week = options[:within_past_week]
  elsif options[:within_past_months]
    @within_past_months = options[:within_past_months]
  elsif options[:within_past_year]
    @within_past_year = options[:within_past_year]
  end

  @occurs_within = options[:occurs_within]
  @rights = options[:rights]
  @filtered = options[:filtered]

  super(options,&block)
end
Public Instance Methods
Iterates over the sponsored ads on the first page.
@yield [ad]
The given block will be passed each sponsored ad.
@yieldparam [SponsoredAd] ad
A sponsored ad on the first page.
@return [Enumerator]
If no block is given, an Enumerator object will be returned.
# File lib/gscraper/search/web_query.rb, line 499
#
# Iterates over the sponsored ads returned by +sponsored_links+.
#
# @yield [ad]
#   The given block will be passed each sponsored ad.
#
# @return
#   Whatever +each+ on the sponsored links collection returns.
#
def each_sponsored_link(&block)
  ads = sponsored_links
  ads.each(&block)
end
Returns a page containing results at the specific page index.
@param [Integer] page_index
The page index to query.
@return [Page<Result>]
The page at the given index for the query.
# File lib/gscraper/search/web_query.rb, line 387
#
# Returns a page containing results at the specific page index.
#
# @param [Integer] page_index
#   The page index to query.
#
# @return [Page<Result>]
#   The page at the given index for the query.
#
# @raise [Blocked]
#   The fetched document contains a link to Google support answer 86640.
#
def page(page_index)
  Page.new do |new_page|
    doc = @agent.get(page_url(page_index))

    # the presence of this support link is treated as "we have been blocked"
    blocked_notice = doc.at('//div/a[@href="http://www.google.com/support/bin/answer.py?answer=86640"]')
    raise(Blocked,"Google has temporarily blocked our IP Address",caller) if blocked_notice

    nodes = doc.search('//li[@class="g"]')

    # never report more results than were asked for per page
    count = [@results_per_page, nodes.length].min
    first_rank = result_offset_of(page_index) + 1

    (0...count).each do |position|
      node = nodes[position]
      rank = first_rank + position

      anchor = node.at('.//h3/a')
      title = anchor.inner_text

      # the real destination is carried in the "q" parameter of the href
      target = URI(anchor.get_attribute('href')).query_params['q']
      url = URI(target)

      summary_text = ''
      content = node.at('.//div[@class="s"]','.//td[@class="j"]//font')

      if content
        content.children.each do |elem|
          # the summary ends at the first <br> element
          break if !elem.text? && elem.name == 'br'

          summary_text << elem.inner_text
        end
      end

      cached_url = nil
      similar_url = nil
      snippet = node.at('.//div[@class="s"]')

      if snippet
        # first link is used as the cached copy, second as the "similar" link
        cached_link = snippet.at('.//a[1]')
        cached_url = URI("http://#{search_host}" + cached_link.get_attribute('href')) if cached_link

        similar_link = snippet.at('.//a[2]')
        similar_url = URI("http://#{search_host}" + similar_link.get_attribute('href')) if similar_link
      end

      new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
    end
  end
end
Returns the URL that represents the query at a specific page index.
@param [Integer] page_index
The page index to create the URL for.
@return [URI::HTTP]
The URL for a query at the given page index.
# File lib/gscraper/search/web_query.rb, line 369
#
# Returns the URL that represents the query at a specific page index.
#
# @param [Integer] page_index
#   The page index to create the URL for.
#
# @return [URI::HTTP]
#   The search URL with the +start+ and +sa+ parameters added.
#
def page_url(page_index)
  search_url.tap do |url|
    # "start" is the offset of the first result on the requested page
    url.query_params['start'] = result_offset_of(page_index)
    url.query_params['sa'] = 'N'
  end
end
Returns the result at the specified index.
@param [Integer] index
The index of the result.
# File lib/gscraper/search/web_query.rb, line 453
#
# Returns the result at the specified index.
#
# @param [Integer] index
#   The index of the result.
#
# @return
#   The element found at +result_index_of(index)+ within the page at
#   +page_index_of(index)+.
#
def result_at(index)
  containing_page = page(page_index_of(index))
  containing_page[result_index_of(index)]
end
The URL that represents the query.
@return [URI::HTTP]
The URL for the query.
# File lib/gscraper/search/web_query.rb, line 294
#
# The URL that represents the query.
#
# Only parameters that have a value are added to the URL.
#
# @return [URI::HTTP]
#   The URL for the query.
#
def search_url
  url = URI::HTTP.build(:host => search_host, :path => PATH)
  params = url.query_params

  # skips parameters whose value is nil or false
  set_param = lambda { |name,value| params[name.to_s] = value if value }

  set_param.call('num',@results_per_page)
  set_param.call('q',expression)
  set_param.call('as_epq',@exact_phrase)
  set_param.call('as_oq',@with_words)
  set_param.call('as_eq',@without_words)
  set_param.call('lr',@language)
  set_param.call('cr',@region)
  set_param.call('as_filetype',@filetype)

  # Only one time window is sent, checked in day > week > months > year
  # order. A months value other than 1, 2, 3 or 6 sends no as_qdr at all.
  if @within_past_day
    params['as_qdr'] = 'd'
  elsif @within_past_week
    params['as_qdr'] = 'w'
  elsif @within_past_months
    case @within_past_months
    when 1 then params['as_qdr'] = 'm'
    when 2 then params['as_qdr'] = 'm2'
    when 3 then params['as_qdr'] = 'm3'
    when 6 then params['as_qdr'] = 'm6'
    end
  elsif @within_past_year
    params['as_qdr'] = 'y'
  end

  if @numeric_range.kind_of?(Range)
    params['as_nlo'] = @numeric_range.begin
    params['as_nhi'] = @numeric_range.end
  end

  # accepts either a Symbol or a String naming the area
  occurs_within = @occurs_within.to_s
  params['as_occt'] = occurs_within if %w[title body url links].include?(occurs_within)

  set_param.call('as_sitesearch',@site)

  # LICENSES is indexed by the as_rights String (see from_url), so it has
  # to be inverted to find the String for a license; Hash has no #reverse.
  # An unknown license adds no as_rights parameter.
  set_param.call('as_rights',LICENSES.invert[@rights]) if @rights

  params['safe'] = 'active' if @filtered

  url
end
Returns the sponsored links for the query.
@return [SponsoredLinks<SponsoredAd>]
The sponsored links for the query.
# File lib/gscraper/search/web_query.rb, line 463
#
# Returns the sponsored links for the query.
#
# Fetches the search URL and collects every +h3+ link whose +id+ starts
# with "pa".
#
# @return [SponsoredLinks<SponsoredAd>]
#   The sponsored links for the query.
#
def sponsored_links
  SponsoredLinks.new do |links|
    doc = @agent.get(search_url)

    # top and side ads
    ad_anchors = doc.search('//h3/a[starts-with(@id,"pa")]')

    ad_anchors.each do |anchor|
      title = anchor.inner_text
      # the href is appended to the search host to form the ad URL
      ad_url = URI("http://#{search_host}" + anchor.get_attribute('href'))

      links << SponsoredAd.new(title,ad_url)
    end
  end
end
Returns the first result on the first page.
@return [Result]
The first result.
# File lib/gscraper/search/web_query.rb, line 443
#
# Returns the first result on the first page.
#
# @return [Result]
#   The first element of +first_page+.
#
def top_result
  results = first_page
  results.first
end
Returns the first sponsored ad on the first page of results.
@return [SponsoredAd]
The first sponsored ad.
# File lib/gscraper/search/web_query.rb, line 483
#
# Returns the first sponsored ad on the first page of results.
#
# @return [SponsoredAd]
#   The first sponsored ad; whatever +first+ gives for an empty collection
#   when there are no ads.
#
# NOTE(review): this used to call +top_sponsored_links+, which is not among
# the methods visible for this class. +sponsored_links+ already collects
# the top and side ads, so it is used instead. Confirm that no
# +top_sponsored_links+ helper exists elsewhere.
#
def top_sponsored_link
  sponsored_links.first
end