class WebPageParser::NewYorkTimesPageParserV2
NewYorkTimesPageParserV2
parses New York Times web pages, including the new format change in January 2014
Public Instance Methods
content()
click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 84
# Returns the story body as an Array of paragraph strings, each stripped of
# surrounding whitespace. The result is memoized in @content.
#
# Several CSS selectors are tried in order, newest page layout first; the
# first one that matches any nodes supplies the paragraphs. If none match,
# the result is an empty Array.
def content
  return @content if @content

  @content = []
  selectors = [
    'article#story div.StoryBodyCompanionColumn p', # 2018
    'p.story-content',                              # 2017
    'p[itemprop=articleBody]'                       # older style
  ]

  paragraphs = nil
  selectors.each do |selector|
    paragraphs = html_doc.css(selector)
    # Stop at the first layout that actually yields paragraphs.
    break unless paragraphs.empty?
  end

  # Appends into @content and returns it.
  paragraphs.each_with_object(@content) do |node, collected|
    collected << node.text.strip
  end
end
date()
click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 103
# Returns the publication date as a DateTime, or nil when no matching meta
# element is found or its content attribute cannot be parsed. A successfully
# parsed value is memoized in @date.
def date
  return @date if @date

  meta_tag = html_doc.at_css('meta[name=dat],meta[itemprop=datePublished]')
  return @date unless meta_tag

  @date =
    begin
      DateTime.parse(meta_tag['content'])
    rescue StandardError
      # Best effort: a missing or unparseable value yields nil, not an error.
      nil
    end
end
html_doc()
click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 76
# Returns the Nokogiri HTML document built from +page+. The document is
# built on first call and cached in @html_document for later calls.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title()
click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 80
# Returns the text of the elements matching h1[itemprop=headline], stripped
# of surrounding whitespace. The value is memoized in @title.
def title
  return @title if @title

  headline = html_doc.css('h1[itemprop=headline]')
  @title = headline.text.strip
end