class Scrapix::GoogleImages
download images from a Google Image Search
Public Class Methods
new(query = nil, options = {})
click to toggle source
options can be:
size: named size, e.g. icon, small, medium, large, 13mp, 1280x800, etc. safe: true or false
# File lib/scrapix/google_images.rb, line 10 def initialize(query = nil, options = {}) self.options = options self.query = query self.total = 100 end
Public Instance Methods
find(page_no = 1)
click to toggle source
params: page_no => starting page number for google results
# File lib/scrapix/google_images.rb, line 43 def find(page_no = 1) images = {} return images unless @query while images.count < @num visit search_url(page_no) links = Capybara.page.all("a") links = links.select{|x| x["href"] =~ /^\/imgres/} if links.any? return images unless links.any? page_counter = 0 links.each do |link| attribs = CGI.parse(URI.parse(link["href"]).query) rescue nil next if attribs.nil? hash = Digest::MD5.hexdigest(attribs["imgurl"][0]) unless images.has_key?(hash) images[hash] = { width: attribs["w"][0], height: attribs["h"][0], url: attribs["imgurl"][0], reference_url: attribs["imgrefurl"][0] } page_counter += 1 end end page_no += 1 break if page_counter == 0 end images.take(@num).map{|x| x[1]} end
options=(opts)
click to toggle source
# File lib/scrapix/google_images.rb, line 28 def options=(opts) # convert symbolic keys to string keys options = {} opts.each { |k,v| options[k.to_s] = v } # merge the options with defaults! @options ||= { "safe" => true, "size" => "any" } @options.merge!(options) sanitize_size # parametrize for url purposes @params = create_params end
query=(q)
click to toggle source
# File lib/scrapix/google_images.rb, line 20 def query=(q) @query = URI.escape(q) if q end
search_url(page_no = 1)
click to toggle source
# File lib/scrapix/google_images.rb, line 16 def search_url(page_no = 1) "http://google.com/search?tbm=isch&q=#{@query}#{@params}&start=#{(page_no - 1)*20}" end
total=(n)
click to toggle source
# File lib/scrapix/google_images.rb, line 24 def total=(n) @num = n.to_i end
Private Instance Methods
create_params()
click to toggle source
# File lib/scrapix/google_images.rb, line 106 def create_params string = "" string += "&tbs=#{@options["size"]}" if @options["size"] string += "&safe=off" unless @options["safe"] string end
sanitize_size()
click to toggle source
if width or height is specified, use them as ‘exact’ size otherwise, use a MP size for finding images larger than that size otherwise, use a given named size
# File lib/scrapix/google_images.rb, line 90 def sanitize_size @options["size"] = case when m = @options["size"].match(/^(\d*)x(\d*)$/) then "isz:ex,iszw:#{m[1]},iszh:#{m[2]}" when m = @options["size"].match(/^(\d*)$/) then "isz:ex,iszw:#{m[1]},iszh:#{m[1]}" when m = @options["size"].match(/^(\d*)mp$/) then "isz:lt,islt:#{validate_mp_size(m[1])}mp" when @options["size"] == "large" then "isz:l" when @options["size"] == "medium" then "isz:m" when @options["size"] == "small" then "isz:s" when @options["size"] == "icon" then "isz:i" else nil end end
validate_mp_size(mp)
click to toggle source
# File lib/scrapix/google_images.rb, line 75 def validate_mp_size(mp) mp = mp.to_i lower_bound = 0; upper_bound = 9999; valid_mp_sizes = [ 2, 4, 6, 8, 10, 12, 15, 20, 40, 70 ] valid_mp_sizes.each do |s| return s if s == mp lower_bound = s if s < mp upper_bound = s if s > mp && s < upper_bound end mp - lower_bound > upper_bound - mp ? upper_bound : lower_bound end