class Grabbit::Scrape

Public Class Methods

new(url) click to toggle source
# File lib/grabbit/scrape.rb, line 7
# Creates a scraper for +url+ and fetches the page straight away.
#
# url:: the address handed to get_remote_data (stored in @url).
#
# @doc ends up holding whatever get_remote_data returns, which is nil when
# the page could not be fetched or parsed.
def initialize(url)
  @url = url
  # @url has to be set first: get_remote_data reads it to know what to fetch.
  @doc = get_remote_data
end

Public Instance Methods

description() click to toggle source
# File lib/grabbit/scrape.rb, line 37
# Returns the page description.
#
# Sources are tried in priority order:
# 1. the Open Graph tag   (<meta property="og:description">)
# 2. the Twitter card tag (<meta name="twitter:description">)
# 3. the plain            <meta name="description"> tag
#
# The first non-blank value wins and is returned with surrounding whitespace
# removed. A tag that is present but empty (or whitespace only) is skipped so
# that it cannot hide a usable value from a lower-priority source.
#
# Returns a String ("" when no source has any text), or nil when no document
# was loaded.
def description
  return nil unless @doc

  [
    "//meta[@property='og:description']/@content",
    "//meta[@name='twitter:description']/@content",
    "//meta[@name='description']/@content"
  ].each do |query|
    @doc.xpath(query).each do |attribute|
      text = attribute.value.to_s.strip
      return text unless text.empty?
    end
  end

  # Nothing usable was found anywhere.
  ""
end
images() click to toggle source
# File lib/grabbit/scrape.rb, line 63
# Returns candidate image URLs for the page as absolute URL strings.
#
# The approach follows the ideas in this blog post:
# https://tech.shareaholic.com/2012/11/02/how-to-find-the-image-that-best-respresents-a-web-page/
# If it does not give good results consistently, consider ranking images by
# size with the Fast Image gem (https://github.com/sdsykes/fastimage).
#
# Lookups run from most to least specific, and the result of the first lookup
# that yields at least one usable URL is returned:
# 1. Open Graph image     (<meta property="og:image">)
# 2. Twitter card image   (<meta name="twitter:image:src">)
# 3. <img> with id "main-image" (Amazon) or "prodImage" (WalMart)
# 4. <img> inside an element whose id contains "content", excluding
#    sidebar/comment/footer/header sections
# 5. any <img> outside sidebar/comment/footer/header sections
# 6. any <img> on the page
#
# Blank values and values that cannot be resolved into a URL are skipped, so
# a single malformed src no longer aborts the whole lookup.
#
# Returns an Array of Strings; empty when nothing was found or when no
# document was loaded.
def images
  return [] unless @doc

  # XPath predicate shared by the two "exclude page chrome" lookups.
  outside_chrome = "not(ancestor::*[contains(@id, 'sidebar') or contains(@id, 'comment') " \
                   "or contains(@id, 'footer') or contains(@id, 'header')])"

  [
    '//meta[@property="og:image"]/@content',
    '//meta[@name="twitter:image:src"]/@content',
    '//img[@id="main-image" or @id="prodImage"]/@src',
    "//img[#{outside_chrome} and ancestor::*[contains(@id, 'content')]]/@src",
    "//img[#{outside_chrome}]/@src",
    "//img/@src"
  ].each do |query|
    found = @doc.search(query).each_with_object([]) do |attribute, urls|
      path = attribute.value.to_s.strip
      # An empty src would resolve to the page URL itself, which is not an image.
      next if path.empty?

      begin
        urls << image_absolute_uri(path)
      rescue URI::Error
        # Unparseable reference (e.g. contains a raw space): ignore this one only.
        next
      end
    end
    return found unless found.empty?
  end

  []
end
title() click to toggle source
# File lib/grabbit/scrape.rb, line 12
# Returns the page title.
#
# Sources are tried in priority order:
# 1. the Open Graph tag   (<meta property="og:title">)
# 2. the Twitter card tag (<meta name="twitter:title">)
# 3. the document's <title> element(s)
#
# The first non-blank value wins and is returned with surrounding whitespace
# removed. A tag that is present but empty (or whitespace only) is skipped so
# that it cannot hide a usable value from a lower-priority source.
#
# Returns a String ("" when no source has any text), or nil when no document
# was loaded.
def title
  return nil unless @doc

  [
    "//meta[@property='og:title']/@content",
    "//meta[@name='twitter:title']/@content"
  ].each do |query|
    @doc.xpath(query).each do |attribute|
      text = attribute.value.to_s.strip
      return text unless text.empty?
    end
  end

  # No usable meta tag: fall back to the <title> element text.
  @doc.css("title").each do |element|
    text = element.text.to_s.strip
    return text unless text.empty?
  end

  # Nothing usable was found anywhere.
  ""
end

Private Instance Methods

get_remote_data() click to toggle source
# File lib/grabbit/scrape.rb, line 116
# Downloads @url with HTTParty and parses the response body with
# Nokogiri::HTML.
#
# Returns the parsed document, or nil when the request raises, the response
# code is anything other than 200, or parsing raises.
def get_remote_data
  response = begin
    HTTParty.get(@url)
  rescue
    # Any failure while fetching means there is no document.
    return nil
  end

  # Only a plain 200 response is treated as a usable page.
  return nil unless response.code == 200

  begin
    Nokogiri::HTML(response.body)
  rescue
    nil
  end
end
image_absolute_uri(image_path) click to toggle source
# File lib/grabbit/scrape.rb, line 134
# Resolves +image_path+ against the page URL (@url) and returns the result as
# a String. An +image_path+ that is already absolute is returned as-is.
#
# Errors raised by URI.join for unparseable input are not handled here and
# propagate to the caller.
def image_absolute_uri(image_path)
  resolved = URI.join(@url, image_path)
  resolved.to_s
end