class Rinkusukurepa::Scraper

Attributes

description[RW]
icon[RW]
images[RW]
page_document[RW]
page_type[RW]
page_url[RW]
site_name[RW]
title[RW]
video[RW]

Public Class Methods

new(url) click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 37
# Builds a scraper for +url+ without fetching anything yet.
#
# The raw string is percent-escaped before parsing so that URLs containing
# spaces or other unsafe characters do not make URI.parse raise.
# URI::DEFAULT_PARSER.escape is the call URI.encode used to delegate to;
# URI.encode itself was removed in Ruby 3.0.
#
# @param url [String] address of the page to scrape
def initialize(url)
  @page_url = URI.parse(URI::DEFAULT_PARSER.escape(url))
  @images = []
end
parse_url!(url) click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 31
# Convenience constructor: builds a scraper for +url+, runs the full
# scrape on it and hands back the populated scraper instance.
#
# @param url [String] address of the page to scrape
# @return [Rinkusukurepa::Scraper] the scraper after parse_url! has run
def self.parse_url!(url)
  Rinkusukurepa::Scraper.new(url).tap { |instance| instance.parse_url! }
end

Public Instance Methods

attributes() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 58
# Snapshot of everything scraped so far, keyed by field name.
# +:images+ is always an array (empty when nothing was collected); every
# other field is whatever the matching instance variable currently holds.
#
# @return [Hash{Symbol => Object}]
def attributes
  snapshot = {}
  [:icon, :title, :description, :images, :site_name, :video, :page_type].each do |field|
    snapshot[field] = instance_variable_get("@#{field}")
  end
  snapshot[:images] ||= []
  snapshot
end
get_description() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 100
# Extracts the page description.
# The Open Graph tag (og:description) wins; the plain
# <meta name="description"> tag is only consulted when no Open Graph tag
# exists. When neither is present @description is left untouched.
#
# @return [String, nil] the current value of @description
def get_description
  og_tags = @page_document.css('meta[property="og:description"]')
  unless og_tags.nil? || og_tags.empty?
    @description = og_tags.first.attributes['content'].value
    return @description
  end

  # No Open Graph description: fall back to the standard meta tag.
  meta_tags = @page_document.css('meta[name="description"]')
  found = !(meta_tags.nil? || meta_tags.empty?)
  @description = meta_tags.first.attributes['content'].value if found
  @description
end
get_icon() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 70
# Extracts the favicon URL from <link rel="icon">, falling back to
# <link rel="shortcut icon">.
#
# The href is percent-escaped and then resolved against the page URL, so
# relative hrefs become absolute and the icon keeps the page's scheme and
# port (and the href's query string). When there is no usable link tag,
# @icon is left untouched.
#
# @return [String, nil] the current value of @icon
# @raise [URI::InvalidURIError] if the escaped href still cannot be parsed
def get_icon
  links = @page_document.css('link[rel="icon"]')
  links = @page_document.css('link[rel="shortcut icon"]') if links.nil? || links.empty?
  return @icon if links.nil? || links.empty?

  # A <link> without an href (or with a blank one) carries no icon.
  href_attr = links.first.attributes['href']
  href = href_attr.nil? ? '' : href_attr.value.to_s.strip
  return @icon if href.empty?

  # URI::DEFAULT_PARSER.escape replaces URI.encode, removed in Ruby 3.0.
  # URI#merge returns absolute hrefs unchanged and resolves relative ones.
  @icon = @page_url.merge(URI::DEFAULT_PARSER.escape(href)).to_s
  @icon
end
get_images() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 116
# Collects preview images for the page into @images.
#
# * If the page URL itself has an image extension, it is appended to
#   @images and returned as-is.
# * Otherwise every og:image tag is considered. A candidate is kept only
#   when its URL contains "http", FastImage can size it, and both
#   dimensions exceed the configured minimums.
# * When there are no og:image tags, or none survive the filter,
#   retrieve_all_images_from_document is used as a fallback.
#
# Host-less candidates are resolved against the page URL before they are
# sized and stored.
#
# @return [Array<String>] the current value of @images
def get_images
  # A URL that points straight at an image is its own preview.
  if File.extname(@page_url.to_s).match(Rinkusukurepa.image_extensions)
    @images << @page_url.to_s
    return @images
  end

  preview = @page_document.css('meta[property="og:image"]')
  if preview.nil? || preview.empty?
    retrieve_all_images_from_document
  else
    @images = preview.map do |tag|
      content = tag.attributes['content']
      next if content.nil? # og:image tag without a content attribute

      # URI::DEFAULT_PARSER.escape replaces URI.encode, removed in Ruby 3.0.
      uri = URI.parse(URI::DEFAULT_PARSER.escape(content.value.to_s.strip))
      next unless uri.to_s.match(/http/)

      image_url = uri.host.nil? ? @page_url.merge(uri).to_s : uri.to_s
      size = FastImage.size(image_url)
      next unless size

      big_enough = size[0] > Rinkusukurepa.image_min_width &&
                   size[1] > Rinkusukurepa.image_min_height
      image_url if big_enough
    end.compact.uniq

    retrieve_all_images_from_document if @images.empty?
  end

  @images
end
get_page_type() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 178
# Determines the page type from the og:type tag.
# The declared type is accepted only when Rinkusukurepa.page_types lists
# it; in every other case (no tag, unlisted value) the type is
# Rinkusukurepa::Scraper.website.
#
# @return [Object] the value stored in @page_type
def get_page_type
  og_tags = @page_document.css('meta[property="og:type"]')
  return @page_type = Rinkusukurepa::Scraper.website if og_tags.nil? || og_tags.empty?

  declared = og_tags.first.attributes['content'].value
  recognised = Rinkusukurepa.page_types.include?(declared)
  @page_type = recognised ? declared : Rinkusukurepa::Scraper.website
end
get_site_name() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 154
# Extracts the site name: the og:site_name tag when present, otherwise
# the host of the page URL.
#
# @return [String, nil] the value stored in @site_name
def get_site_name
  og_tags = @page_document.css('meta[property="og:site_name"]')
  @site_name =
    if og_tags.nil? || og_tags.empty?
      @page_url.host
    else
      og_tags.first.attributes['content'].value
    end
end
get_title() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 87
# Extracts the page title: the og:title tag when present, otherwise the
# text of the document's <title> element.
#
# @return [String] the value stored in @title
def get_title
  og_tags = @page_document.css('meta[property="og:title"]')
  return @title = @page_document.css('title').text if og_tags.nil? || og_tags.empty?

  @title = og_tags.first.attributes['content'].value
end
get_video() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 167
# Extracts an embeddable video URL from the og:video:secure_url tags.
#
# The first tag whose content contains "embed" is used. When there are no
# such tags, or none of them points at an embed URL, @video is left
# untouched instead of raising.
#
# @return [String, nil] the current value of @video
def get_video
  videos = @page_document.css('meta[property="og:video:secure_url"]')
  return @video if videos.nil? || videos.empty?

  # Tags without a content attribute are skipped; find may legitimately
  # come back empty, so its result is checked before being stored.
  embed_url = videos.map { |tag| tag.attributes['content'] }
                    .compact
                    .map { |attr| attr.value.to_s }
                    .find { |url| url.include?('embed') }
  @video = embed_url unless embed_url.nil?

  @video
end
parse_url() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 42
# Downloads the page at @page_url and parses it into @page_document.
#
# URI.open is used when open-uri provides it, because Kernel#open no
# longer opens URLs on Ruby 3.0+; otherwise the call falls back to
# Kernel.open as before. The block form closes the downloaded IO as soon
# as it has been parsed.
#
# @return [self]
def parse_url
  opener = URI.respond_to?(:open) ? URI : Kernel
  opener.open(@page_url.to_s, :allow_redirections => :all) do |io|
    @page_document = Nokogiri::HTML(io)
  end
  self
end
parse_url!() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 47
# Fetches the page and then runs every extractor, in a fixed order:
# icon, title, description, images, site name, video and finally page type.
#
# @return [Object] whatever get_page_type returns
def parse_url!
  parse_url
  extractors = [:get_icon, :get_title, :get_description, :get_images, :get_site_name, :get_video]
  extractors.each { |extractor| send(extractor) }
  get_page_type
end

Protected Instance Methods

retrieve_all_images_from_document() click to toggle source
# File lib/rinkusukurepa/scraper.rb, line 194
# Fallback image discovery: scans the document's <img> tags and stores the
# usable ones in @images.
#
# A tag is considered only when its src matches
# Rinkusukurepa.image_extensions and contains "http". The src is then
# percent-escaped, resolved against the page URL when it has no host, and
# kept only if its dimensions exceed the configured minimums. Dimensions
# come from FastImage, with MiniMagick as a fallback when FastImage
# returns nothing.
#
# @return [Array<String>, nil] the new @images, or nil (leaving @images
#   untouched) when the document has no <img> tags
def retrieve_all_images_from_document
  nodes = @page_document.css('img')
  return if nodes.nil? || nodes.empty?

  # NOTE(review): the inclusive range yields up to max_image + 1 nodes;
  # confirm whether max_image is meant as a count or as a last index.
  @images = nodes[0..Rinkusukurepa.max_image].map do |node|
    src = node.attributes['src']
    next if src.blank?

    value = src.value
    next unless value.match(Rinkusukurepa.image_extensions) && value.match(/http/)

    # URI::DEFAULT_PARSER.escape replaces URI.encode, removed in Ruby 3.0.
    uri = URI.parse(URI::DEFAULT_PARSER.escape(value.strip))
    # URI#merge keeps the page's scheme/port and avoids a doubled slash.
    url = uri.host.nil? ? @page_url.merge(uri).to_s : uri.to_s

    size = ::FastImage.size(url)
    if size
      url if size[0] > Rinkusukurepa.image_min_width && size[1] > Rinkusukurepa.image_min_height
    else
      # FastImage could not size the image: try MiniMagick instead.
      begin
        image = ::MiniMagick::Image.open(url)
        big_enough = image.present? &&
                     image.width > Rinkusukurepa.image_min_width &&
                     image.height > Rinkusukurepa.image_min_height
        url if big_enough
      rescue StandardError
        # Best effort: an image that cannot be opened is simply skipped.
        nil
      end
    end
  end.compact.uniq
end