class WebPageParser::NewYorkTimesPageParserV1
NewYorkTimesPageParserV1
Parses New York Times web pages published up to January 2014.
Constants
- CONTENT_RE
- DATE_RE
- PARA_RE
- STRIP_INLINE_BLOCK
- STRIP_TAGS_RE
- TITLE_RE
Public Instance Methods
retrieve_page()
click to toggle source
We modify the URL so that a multi-page article is fetched in full with a single request.
Calls superclass method
WebPageParser::BaseRegexpParser#retrieve_page
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 25
# Fetches the page with "pagewanted=all" appended to the URL so that a
# multi-page article comes back in a single request.
#
# Returns nil when no url is set, otherwise whatever the superclass
# retrieve_page returns for the modified URL.
# Raises RetrieveError when retreive_successful? rejects the page on both
# the first attempt and the single retry.
def retrieve_page
  return nil unless url

  # Work on a copy: appending to the string returned by `url` would modify
  # the parser's own URL, and every further call would tack on yet another
  # "pagewanted=all".
  spurl = url.dup
  spurl << (spurl.include?("?") ? "&" : "?")
  spurl << "pagewanted=all"
  page = super(spurl)
  # If it fails, reset the session and try one more time
  unless retreive_successful?(page)
    # NOTE(review): `||=` only assigns when no session is set yet, so an
    # existing session is kept rather than replaced - confirm whether a
    # plain `=` was intended here.
    self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
    page = super(spurl)
  end
  raise RetrieveError, "Blocked by NYT paywall" unless retreive_successful?(page)

  page
end
Private Instance Methods
content_processor()
click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 63
# Rewrites @content in three steps: runs it through STRIP_INLINE_BLOCK and
# then STRIP_TAGS_RE (each match replaced with an empty string), then scans
# the result with PARA_RE and keeps the element at index 1 of every match.
# Leaves @content holding that collection, which is also the return value.
def content_processor
  [STRIP_INLINE_BLOCK, STRIP_TAGS_RE].each do |stripper|
    @content = stripper.gsub(@content, '')
  end
  # presumably each scan result is an array of PARA_RE captures - verify
  # against the PARA_RE definition
  paragraphs = @content.scan(PARA_RE)
  @content = paragraphs.map { |captures| captures[1] }
end
date_processor()
click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 54
# Replaces the raw value in @date with a parsed DateTime. When the value
# cannot be parsed, @date falls back to the current UTC time (a Time).
# Returns the new value of @date.
def date_processor
  # OPD is in GMT/UTC, which DateTime seems to use by default
  @date = DateTime.parse(@date)
rescue ArgumentError, TypeError
  # ArgumentError: the string is not a recognisable date.
  # TypeError: @date is nil or otherwise not a String.
  @date = Time.now.utc
end
retreive_successful?(page)
click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 46
# Tells whether a retrieved page is usable: true when none of the Location
# headers in page.curl.header_str point at myaccount.nytimes.com.
# Returns false when page or page.curl is nil/false.
def retreive_successful?(page)
  return false unless page && page.curl

  # Header names are case-insensitive and the space after the colon is
  # optional, so match loosely; the dots in the host name are escaped so
  # that they only match literal dots.
  redirects = page.curl.header_str.scan(/^Location:.*/i)
  redirects.grep(/myaccount\.nytimes\.com/i).empty?
end