class GDNewsScraper::Scrapers::PolygonCOM::News

Attributes

stream[RW]

Public Class Methods

new(offset = nil)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 21
def initialize(offset = nil)
  unless offset.nil?
    uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::URL }/news/archives/#{ offset }"

    @page   = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
    @stream = Hash.new

    stream[:stream] = Hash.new
    stream[:stream][:size]  = @page.at('.c-pagination__text').text.split.first.to_num
    stream[:stream][:pages] = @page.at('.c-pagination__text').text.split.last.to_num
    stream[:stream][:prev]  = @page.at('.c-pagination__prev')&.attr('href')&.split('/')&.last.to_i
    stream[:stream][:next]  = @page.at('.c-pagination__next')&.attr('href')&.split('/')&.last.to_i

    stream[:feed] = Hash.new
    stream[:feed][:url] = GDNewsScraper::Scrapers::PolygonCOM::URL
    stream[:feed][:source] = 'polygon'
    stream[:feed][:label] = 'Polygon'

    stream[:articles] = Array.new

    perform
  end
end
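
A minimal usage sketch (the require path and the archive offset 2 are assumptions, not part of the documented API):

  require 'GDNewsScraper'

  scraper = GDNewsScraper::Scrapers::PolygonCOM::News.new(2)

  scraper.stream[:stream][:next]   # => offset of the next archive page
  scraper.stream[:articles].length # => number of articles parsed from this page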

Public Instance Methods

parse(article)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 51
def parse(article)
  pulse = Hash.new

  # This allows the Parser to get its data from the Index page, when the
  # article is a Nokogiri::XML::Element, or from the Article page, when
  # the article is a URL.
  #
  # Passing a URL is mainly for debugging in case an Article fails to
  # parse and should only be used as such..
  #
  if article.is_a?(String)
    begin
      article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))

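      # Articles that lead with an embedded video have no cover image, so
      # detect that case before extracting the cover below
      #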
      first_element = article_page.at('.l-col__main').elements.first
      is_a_video    = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'

      key    = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
      url    = article
      title  = strip(article_page.css('.c-entry-hero').at('.c-page-title'))
      cover  = (is_a_video ? nil : article_page.css('.l-col__main').at('.e-image__image').attr('data-original'))
      author = strip(article_page.css('.c-entry-hero').at('.c-byline').css('.c-byline__item > a').children[0])

      begin
        article_date = strip(article_page.css('.c-entry-hero').at('.c-byline').css('time.c-byline__item'))
        parsed_date  = DateTime.parse(article_date)

        date = parsed_date.to_time.to_i

        # This has never failed, so it's not entirely clear what to
        # rescue from, but with dates it's always risky not to rescue
        #
        # TODO: When it fails, find out why and rescue from that instead
        #       of rescuing from 'everything' ..
        #
      rescue
        date = nil
      end
    rescue TypeError
      raise ArgumentError.new('Invalid URL')
    end
  elsif article.is_a?(Nokogiri::XML::Element)
    article_container = article.at('.c-entry-box--compact--article')
    
    if article_container.nil?
      raise StandardError.new('Not an Article, skipping..')
    else
      key    = article_container.attr('data-chorus-optimize-id').to_i
      url    = article.at('.c-entry-box--compact__title').at('> a').attr('href')
      title  = strip(article.at('.c-entry-box--compact__title'))
      cover  = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
      author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
      date   = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i
      
      article_page = url
    end
  else
    raise ArgumentError.new("Make sure the 'article' argument is either a Nokogiri::XML::Element containing the article's markup or a String which is the article's URL")
  end
  
  pulse[:id]      = key
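  # NOTE: Base64.encode64 wraps its output with a newline every 60
  # characters; strict_encode64 would yield a single-line value
  #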
  pulse[:hash]    = ::Base64.encode64("#{ title } - #{ key }")
  pulse[:cover]   = cover
  pulse[:url]     = url
  pulse[:title]   = title
  pulse[:author]  = author
  pulse[:date]    = date
  pulse[:content] = parse_article_body(article_page)
  pulse[:tags]    = title.downcase.split

  return pulse
rescue => e
  {
    success: false,
    message: "There was a problem while parsing this Article: #{ e }"
  }
end
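
As the source comments note, passing a URL is a debugging aid. A hypothetical call (the URL is a placeholder):

  pulse = scraper.parse('https://www.polygon.com/2017/1/1/example-article')

  pulse[:title]   # => the parsed title on success
  pulse[:message] # => the failure reason when parsing raised
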
parse_article_body(article)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 129
def parse_article_body(article)
  if article.is_a?(String)
    article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
  else
    article_page = article
  end

  article_container = article_page.at('.c-entry-content')

  article_body = {
    galleries: { },
    videos: { },

    anchors: { },
    figures: { },

    body: [ ]
  }
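
  # ':body' holds the article's content in document order; embedded
  # media are swapped out for '{{type:id}}' placeholder tokens that key
  # into the maps above
  #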

  # Check here as well since an Article CAN have an embedded video
  # instead of a Cover and still show as a non-video article on the News
  # page, which is where the 'is_a_video' check is first made
  #
  first_element = article_page.at('.l-col__main').elements.first
  is_a_video    = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'

  if is_a_video
    id = unique_id(first_element)

    is_polygon_video = !first_element.attributes['data-volume-uuid'].nil?

    if is_polygon_video
      article_body[:videos][id] = {}
      article_body[:videos][id][:label] = first_element.attr('data-analytics-label')&.split('|')&.first&.strip
      article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ first_element.attr('data-volume-uuid') }"
    else
      article_body[:videos][id] = {}
      article_body[:videos][id][:url] = first_element.at('iframe').attr('src')
    end

    article_body[:body] << first_element.replace("{{video:#{ id }}}").to_html
  end

  article_container.children.each do |node|
    content    = node.content.strip.empty?
    text       = node.text.strip.empty?
    attributes = node.attributes.empty?
    children   = node.children.empty?

    if content && text && attributes && children
      node.remove
    else

      if node.name == 'div'
        
        # Check to see if the div contains an embedded video
        #
        iframe = node.at('iframe')

        if iframe # YouTube videos
          id = unique_id(iframe)

          article_body[:videos][id] = {}
          article_body[:videos][id][:url] = iframe.attr('src')

          article_body[:body] << iframe.replace("{{video:#{ id }}}").to_html
        end

        # Check to see if the Article has a video by Polygon, which is
        # embedded differently from a YouTube video..
        #
        polygon_video = node.attributes['data-volume-uuid']

        unless polygon_video.nil?
          id = unique_id(polygon_video)

          article_body[:videos][id] = {}
          article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
          article_body[:videos][id][:url] = "https://volume.vox-cdn.com/embed/#{ node.attr('data-volume-uuid') }"

          article_body[:body] << node.replace("{{video:#{ id }}}").to_html
        end

        # Check to see if the div contains a gallery
        #
        gallery = node.at('.c-image-gallery')

        if gallery
          gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')

          id = unique_id(gallery)
          article_body[:galleries][id] = []

          gallery_container.children.children.each do |image_container|
            image = image_container.at('a')

            if image
              article_body[:galleries][id] << image.attr('href')
            end
          end

          article_body[:body] << gallery.replace("{{gallery:#{ id }}}").to_html
        end

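        # Keep embedded tweets and Reddit cards verbatim; their markup is
        # self-contained
        #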
        twitdget = node.at('.twitter-tweet')

        if twitdget
          article_body[:body] << twitdget.to_html
        end

        redditget = node.at('.reddit-card')

        if redditget
          article_body[:body] << redditget.to_html
        end
      end

      # Extract 'figure' outside the div check because in many cases it's
      # nested within other HTML elements, which makes it harder to
      # extract without being too specific
      #
      # Check both conditions because 'node.at' only searches descendants,
      # so if the current node is itself a figure, 'node.at' returns nil
      #
      figure = (node.name == 'figure' || node.at('figure.e-image'))

      if figure
        node.css('.e-image__image').each do |image|
          image_url = image.attr('data-original')
  
          id = unique_id(node)
         
          article_body[:figures][id] = { }
          article_body[:figures][id][:image] = image_url

          article_body[:figures][id][:title] = image.at('img')&.attr('title')
          article_body[:figures][id][:alt]   = image.at('img')&.attr('alt')

          image_meta = node.at('.e-image__meta')

          unless image_meta.nil?
            article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
            article_body[:figures][id][:cite]    = strip(image_meta.at('cite'))
          end

          article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
        end

        node.traverse { |child| child.remove }
      end

      # First, ensure the node is an actual element; this filters out
      # text and comment nodes
      #
      # => node.element?
      #
      # Secondly, ensure the node is what we actually want. We don't want
      # <div>s, which are usually used for placing inline advertisements
      # or content specific only to that website
      #
      # => WHITELIST[:default].include?(node.name)
      #
      if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
        node.children.each do |inner_node|
          case inner_node.name
          when 'a'
            id = unique_id(inner_node)
            
            article_body[:anchors][id] = {
              text: inner_node.children.text,
              url: inner_node.attr('href')
            }

            inner_node.replace("{{anchor:#{ id }}}")
          end
        end

        begin

          # Remove all attributes from the node and its descendants
          #
          node.xpath('.//@*').remove

          # Flag nodes with no text content so they can be omitted
          # below; 'content' is a Nokogiri method, so it must be checked
          # before the node is serialised into plain HTML
          #
          omit_node = node.content.empty?

          # Return clean HTML, including HTML elements and text
          #
          parsed_node = node.to_html

        rescue

        end
      end

      article_body[:body] << parsed_node unless parsed_node.nil? || omit_node
    end
  end

  return article_body
rescue => e
  "There was a problem while parsing this Article's body: #{ e }"
end
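
Each String in :body is either raw HTML or contains '{{type:id}}' tokens, which a consumer resolves against the :videos, :galleries, :figures and :anchors maps. A hypothetical resolver sketch (TOKEN_KEYS and the rendering via #inspect are stand-ins, not part of the library):

  TOKEN_KEYS = {
    'video'   => :videos,
    'gallery' => :galleries,
    'figure'  => :figures,
    'anchor'  => :anchors
  }

  def render(article_body)
    article_body[:body].map { |chunk|
      chunk.gsub(/\{\{(video|gallery|figure|anchor):([^}]+)\}\}/) {
        article_body[TOKEN_KEYS[$1]][$2.to_sym].inspect
      }
    }.join("\n")
  end
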
perform()
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 45
def perform
  @page.css('.c-compact-river__entry').each do |article|
    stream[:articles].push(parse(article))
  end
end
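
Once perform has run, stream holds the pagination metadata, feed details and parsed articles assembled in the constructor. Illustrative shape (all values hypothetical):

  {
    stream:   { size: 21, pages: 1375, prev: 2, next: 4 },
    feed:     { url: 'https://www.polygon.com', source: 'polygon', label: 'Polygon' },
    articles: [ ... ]  # one pulse Hash per '.c-compact-river__entry'
  }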

Private Instance Methods

attr(attribute)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 336
def attr(attribute)
  attributes&.fetch(attribute, nil)&.value
end
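
Given that it calls attributes directly, this helper reads as a nil-safe attribute lookup for Nokogiri nodes: it returns nil instead of raising when the attribute is absent. Illustrative only (the attribute value is hypothetical):

  attr('data-volume-uuid') # => "abcd-1234"
  attr('missing')          # => nil
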
strip(string)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 340
def strip(string)
  string&.text&.strip
end
unique_id(node)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 344
def unique_id(node)
  Base64.strict_encode64(node.to_s)
    .reverse
    .gsub(/[^0-9A-Za-z]/, '')[0..100]
    .downcase
    .to_sym
end
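
The placeholder id is the node's serialised markup, Base64-encoded, reversed, stripped to alphanumerics, truncated to 101 characters, downcased and converted to a Symbol, so identical markup always yields the same id. Illustrative only; the exact Symbol depends on the markup:

  node = Nokogiri::HTML.fragment('<iframe src="https://example.com/embed"></iframe>').children.first
  unique_id(node) # => a Symbol of up to 101 lowercased alphanumeric characters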