class GDNewsScraper::Scrapers::PolygonCOM::News
Attributes
stream[RW]
Public Class Methods
new(offset = nil)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 21
def initialize(offset = nil)
  unless offset.nil?
    uri = "#{ GDNewsScraper::Scrapers::PolygonCOM::URL }/news/archives/#{ offset }"

    @page   = Nokogiri::HTML(open(uri, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
    @stream = Hash.new

    stream[:stream]         = Hash.new
    stream[:stream][:size]  = @page.at('.c-pagination__text').text.split.first.to_num
    stream[:stream][:pages] = @page.at('.c-pagination__text').text.split.last.to_num
    stream[:stream][:prev]  = @page.at('.c-pagination__prev')&.attr('href')&.split('/')&.last.to_i
    stream[:stream][:next]  = @page.at('.c-pagination__next')&.attr('href')&.split('/')&.last.to_i

    stream[:feed]          = Hash.new
    stream[:feed][:url]    = GDNewsScraper::Scrapers::PolygonCOM::URL
    stream[:feed][:source] = 'polygon'
    stream[:feed][:label]  = 'Polygon'

    stream[:articles] = Array.new

    perform
  end
end
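A minimal usage sketch, assuming the gem is loaded and open-uri is available (the constructor passes a URI string to open); to_num is a String helper presumed to be defined elsewhere in the gem, and the offset 2 is an arbitrary archive page:

# Hypothetical usage; the offset is an arbitrary /news/archives page number.
scraper = GDNewsScraper::Scrapers::PolygonCOM::News.new(2)

scraper.stream[:stream][:pages]  # => total number of archive pages
scraper.stream[:stream][:next]   # => offset of the next archive page (0 if none)
scraper.stream[:feed][:source]   # => "polygon"
scraper.stream[:articles].length # => one entry per parsed article on the page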
Public Instance Methods
parse(article)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 51
def parse(article)
  pulse = Hash.new

  # This allows the Parser to get its data from the Index page, when the
  # article is a Nokogiri::XML, or from the Article page when the article
  # is a URL.
  #
  # Passing a URL is mainly for debugging in case an Article fails to
  # parse, and should only be used as such.
  #
  if article.is_a?(String)
    begin
      article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))

      first_element = article_page.at('.l-col__main').elements.first
      is_a_video    = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'

      key    = article_page.at('span[data-content-admin-id]').attr('data-content-admin-id').to_i
      url    = article
      title  = strip(article_page.css('.c-entry-hero').at('.c-page-title'))
      cover  = (is_a_video ? nil : article_page.css('.l-col__main').at('.e-image__image').attr('data-original'))
      author = strip(article_page.css('.c-entry-hero').at('.c-byline').css('.c-byline__item > a').children[0])

      begin
        article_date = strip(article_page.css('.c-entry-hero').at('.c-byline').css('time.c-byline__item'))
        parsed_date  = DateTime.parse(article_date)

        date = parsed_date.to_time.to_i

      # Never failed, so not entirely sure what to rescue from, but with
      # dates it's always risky not to rescue.
      #
      # TODO: When it fails, find out why and rescue from that instead
      # of rescuing from 'everything' ..
      #
      rescue
        date = nil
      end
    rescue TypeError
      raise ArgumentError.new('Invalid URL')
    end
  elsif article.is_a?(Nokogiri::XML::Element)
    article_container = article.at('.c-entry-box--compact--article')

    if article_container.nil?
      raise StandardError.new('Not an Article, skipping..')
    else
      key    = article.at('.c-entry-box--compact--article').attr('data-chorus-optimize-id').to_i
      url    = article.at('.c-entry-box--compact__title').at('> a').attr('href')
      title  = strip(article.at('.c-entry-box--compact__title'))
      cover  = (article.at('.c-entry-box--compact__image').at('noscript').at('img').attr('src') rescue nil)
      author = strip(article.at('.c-byline').css('.c-byline__item > a').children[0])
      date   = JSON.parse(article.at('.c-byline').attr('data-cdata'))['timestamp'].to_i

      article_page = url
    end
  else
    raise ArgumentError.new("Make sure the 'article' argument is either a Hash containing the article's initial metadata or a String which is the article's URL")
  end

  pulse[:id]      = key
  pulse[:hash]    = ::Base64.encode64("#{ title } - #{ key }")
  pulse[:cover]   = cover
  pulse[:url]     = url
  pulse[:title]   = title
  pulse[:author]  = author
  pulse[:date]    = date
  pulse[:content] = parse_article_body(article_page)
  pulse[:tags]    = title.downcase.split

  return pulse
rescue => e
  { success: false, message: "There was a problem while parsing this Article: #{ e }" }
end
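Since parse accepts either a Nokogiri::XML::Element from the index page (the normal path, used by perform) or a URL string (the debugging path), a single failing article can be re-parsed in isolation. A sketch, with a placeholder URL:

# Hypothetical debugging call; the URL is a placeholder, not a real article.
pulse = scraper.parse('https://www.polygon.com/path/to/article')

pulse[:id]      # => Polygon's numeric content id
pulse[:hash]    # => Base64 of "<title> - <id>"
pulse[:date]    # => Unix timestamp, or nil when the byline date fails to parse
pulse[:content] # => the Hash built by parse_article_body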
parse_article_body(article)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 129
def parse_article_body(article)
  if article.is_a?(String)
    article_page = Nokogiri::HTML(open(article, GDNewsScraper::Scrapers::PolygonCOM::HEADERS))
  else
    article_page = article
  end

  article_container = article_page.at('.c-entry-content')
  article_body      = { galleries: { }, videos: { }, anchors: { }, figures: { }, body: [ ] }

  # Check here as well, since an Article CAN have an embedded video instead
  # of a Cover and still show as a non-video article on the News page from
  # where we initially took the 'is_a_video' check.
  #
  first_element = article_page.at('.l-col__main').elements.first
  is_a_video    = first_element.attr('class') == 'c-video-embed' || first_element.attr('class') == 'c-video-embed--media'

  if is_a_video
    id = unique_id(first_element)

    is_polygon_video = !first_element.attributes['data-volume-uuid'].nil?

    if is_polygon_video
      article_body[:videos][id] = {}
      article_body[:videos][id][:label] = first_element.attr('data-analytics-label')&.split('|')&.first&.strip
      article_body[:videos][id][:url]   = "https://volume.vox-cdn.com/embed/#{ first_element.attr('data-volume-uuid') }"
    else
      article_body[:videos][id] = {}
      article_body[:videos][id][:url] = first_element.at('iframe').attr('src')
    end

    article_body[:body] << first_element.replace("{{video:#{ id }}}").to_html
  end

  article_container.children.each do |node|
    content    = node.content.strip.empty?
    text       = node.text.strip.empty?
    attributes = node.attributes.empty?
    children   = node.children.empty?

    if content && text && attributes && children
      node.remove
    else
      if node.name == 'div'
        # Check to see if the div contains an embedded video.
        #
        iframe = node.at('iframe')

        if iframe # YouTube videos
          id = unique_id(iframe)

          article_body[:videos][id] = {}
          article_body[:videos][id][:url] = iframe.attr('src')

          article_body[:body] << iframe.replace("{{video:#{ id }}}").to_html
        end

        # Check to see if the Article has a video by Polygon, which is
        # embedded differently than a YouTube video.
        #
        polygon_video = node.attributes['data-volume-uuid']

        unless polygon_video.nil?
          id = unique_id(polygon_video)

          article_body[:videos][id] = {}
          article_body[:videos][id][:label] = node.attr('data-analytics-label').split('|').first.strip
          article_body[:videos][id][:url]   = "https://volume.vox-cdn.com/embed/#{ node.attr('data-volume-uuid') }"

          article_body[:body] << node.replace("{{video:#{ id }}}").to_html
        end

        # Check to see if the div contains a gallery.
        #
        gallery = node.at('.c-image-gallery')

        if gallery
          gallery_container = gallery.at('.c-image-gallery__thumbs-viewport')

          id = unique_id(gallery)

          article_body[:galleries][id] = []

          gallery_container.children.children.each do |image_container|
            image = image_container.at('a')

            if image
              article_body[:galleries][id] << image.attr('href')
            end
          end

          article_body[:body] << gallery.replace("{{gallery:#{ id }}}").to_html
        end

        twitdget = node.at('.twitter-tweet')

        if twitdget
          article_body[:body] << twitdget.to_html
        end

        redditget = node.at('.reddit-card')

        if redditget
          article_body[:body] << redditget.to_html
        end
      end

      # Extract 'figure' outside the node check because in many cases it's
      # nested within other HTML elements, which makes it harder to
      # extract without being too specific.
      #
      # Do a double check, because if the current node is in fact a figure,
      # it will return false.
      #
      figure = (node.name == 'figure' || node.at('figure.e-image'))

      if figure
        node.css('.e-image__image').each do |image|
          image_url = image.attr('data-original')

          id = unique_id(node)

          article_body[:figures][id] = { }
          article_body[:figures][id][:image] = image_url
          article_body[:figures][id][:title] = image.at('img')&.attr('title')
          article_body[:figures][id][:alt]   = image.at('img')&.attr('alt')

          image_meta = node.at('.e-image__meta')

          unless image_meta.nil?
            article_body[:figures][id][:caption] = strip(image_meta.at('figcaption'))
            article_body[:figures][id][:cite]    = strip(image_meta.at('cite'))
          end

          article_body[:body] << node.replace("{{figure:#{ id }}}").to_html
        end

        node.traverse { |children| children.remove }
      end

      # First, ensure the node is an actual element. This removes random
      # HTML elements:
      #
      # => node.element?
      #
      # Secondly, ensure the node is what we actually want. We don't want
      # <div>'s, which are usually used for placing inline advertisements
      # or content specific only to that website:
      #
      # => WHITELIST[:default].include?(node.name)
      #
      if node.element? && GDNewsScraper::Scrapers::PolygonCOM::WHITELIST[:default].include?(node.name)
        node.children.each do |inner_node|
          case inner_node.name
          when 'a'
            id = unique_id(inner_node)

            article_body[:anchors][id] = { text: inner_node.children.text, url: inner_node.attr('href') }

            inner_node.replace("{{anchor:#{ id }}}")
          end
        end

        begin
          # Remove all attributes.
          #
          parsed_node = node.xpath('.//@*').remove

          # Check the integrity of the node before parsing it into HTML,
          # since 'content' is a Nokogiri feature.
          #
          omit_node = node.content.empty?

          # Return clean HTML, including HTML elements and text.
          #
          parsed_node = node.to_html
        rescue
        end
      end

      article_body[:body] << parsed_node unless parsed_node.nil? || omit_node
    end
  end

  return article_body
rescue => e
  "There was a problem while parsing this Article's body: #{ e }"
end
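The returned Hash keeps the prose in :body with {{type:id}} tokens standing in for embeds, while the extracted data lives under the matching key (:videos, :galleries, :anchors, :figures). A consumer re-hydrates the body by substituting the tokens back; a minimal sketch of that inverse step, assuming the shapes shown above:

# Hypothetical re-hydration of the {{type:id}} tokens produced above.
def render(article_body)
  article_body[:body].join.gsub(/\{\{(\w+):(\w+)\}\}/) do
    kind = Regexp.last_match(1)
    id   = Regexp.last_match(2).to_sym

    case kind
    when 'video'   then "<iframe src=\"#{ article_body[:videos][id][:url] }\"></iframe>"
    when 'figure'  then "<img src=\"#{ article_body[:figures][id][:image] }\">"
    when 'anchor'  then "<a href=\"#{ article_body[:anchors][id][:url] }\">#{ article_body[:anchors][id][:text] }</a>"
    when 'gallery' then article_body[:galleries][id].map { |src| "<img src=\"#{ src }\">" }.join
    else ''
    end
  end
end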
perform()
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 45
def perform
  @page.css('.c-compact-river__entry').each do |article|
    stream[:articles].push(parse(article))
  end
end
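Note that parse rescues its own failures and returns { success: false, message: ... } instead of a pulse, so stream[:articles] can mix successful pulses with error Hashes. A hedged sketch of separating the two:

# Hypothetical post-processing: failed parses carry no :id key.
pulses, failures = scraper.stream[:articles].partition { |entry| entry.key?(:id) }

failures.each { |failure| warn failure[:message] }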
Private Instance Methods
attr(attribute)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 336
def attr(attribute)
  # Nil-safe attribute lookup; relies on an 'attributes' method being
  # available in the calling scope.
  attributes&.fetch(attribute, nil)&.value
end
strip(string)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 340
def strip(string)
  string&.text&.strip
end
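Both private helpers are nil-safe by way of the &. operator, which keeps parse from raising when a CSS selector misses. A sketch, assuming a parsed doc:

# Hypothetical: strip returns nil instead of raising when a selector misses.
strip(doc.at('.missing-selector')) # => nil
strip(doc.at('.c-page-title'))     # => "Title text" (whitespace trimmed)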
unique_id(node)
# File lib/GDNewsScraper/scrapers/polygon_com/news.rb, line 344
def unique_id(node)
  Base64.strict_encode64(node.to_s)
        .reverse
        .gsub(/[^0-9A-Za-z]/, '')[0..100]
        .downcase
        .to_sym
end
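Because the id is derived from the node's serialized HTML, the same markup always yields the same id, which is what lets a {{type:id}} token in :body find its entry in the lookup hashes. A sketch (the method is private, so this is illustrative only):

# Hypothetical: identical markup yields an identical, deterministic id.
node = Nokogiri::HTML('<div data-volume-uuid="abc">clip</div>').at('div')

unique_id(node) == unique_id(node.dup) # => true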