class WebPageParser::NewYorkTimesPageParserV1

NewYorkTimesPageParserV1 parses New York Times web pages up to January 2014

Constants

CONTENT_RE
DATE_RE
PARA_RE
STRIP_INLINE_BLOCK
STRIP_TAGS_RE
TITLE_RE

Public Instance Methods

retrieve_page() click to toggle source

Appends "pagewanted=all" to the request URL so that multi-page articles are retrieved in a single request.

# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 25
# Fetches the article, asking for all pages of a multi-page article at once
# by adding "pagewanted=all" to the query string of the request URL.
#
# Returns nil when no url is set; otherwise returns whatever the parent
# implementation returns for the rewritten URL.
# Raises RetrieveError when the page is still judged unsuccessful by
# retreive_successful? after one retry.
def retrieve_page
  return nil unless url
  # Work on a copy: appending to the string returned by `url` would alter the
  # stored URL itself, adding another "pagewanted=all" on every call (and
  # raising if that string is frozen).
  spurl = url.dup
  spurl << (spurl.include?("?") ? "&" : "?")
  spurl << "pagewanted=all"
  page = super(spurl)
  # If the first attempt fails, try one more time. Note that `||=` only
  # creates a session when none is set; an existing session is reused.
  unless retreive_successful?(page)
    self.class.retrieve_session ||= WebPageParser::HTTP::Session.new
    page = super(spurl)
  end
  raise RetrieveError, "Blocked by NYT paywall" unless retreive_successful?(page)
  page
end

Private Instance Methods

content_processor() click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 63
# Cleans up the raw article markup held in @content and replaces it with an
# array built from the PARA_RE matches.
#
# STRIP_INLINE_BLOCK and STRIP_TAGS_RE are invoked as
# pattern.gsub(string, replacement), which is not the core Regexp API --
# presumably they are Oniguruma ORegexp objects; confirm against the
# constant definitions.
def content_processor
  # Apply the strip patterns in order, replacing each match with nothing.
  [STRIP_INLINE_BLOCK, STRIP_TAGS_RE].each do |stripper|
    @content = stripper.gsub(@content, '')
  end
  # Keep the element at index 1 of every PARA_RE scan result.
  @content = @content.scan(PARA_RE).map { |groups| groups[1] }
end
date_processor() click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 54
# Converts the raw date string in @date into a DateTime.
# When the string cannot be parsed (ArgumentError), @date falls back to the
# current time in UTC instead.
def date_processor
  # The original author notes the OPD value is in GMT/UTC, which DateTime
  # appears to assume by default.
  @date = DateTime.parse(@date)
rescue ArgumentError
  @date = Time.now.utc
end
retreive_successful?(page) click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 46
# Reports whether a retrieved page looks like real content rather than a
# redirect to the NYT account site.
#
# page - an object responding to #curl, whose #header_str holds the raw
#        response headers; may be nil.
#
# Returns false when page or page.curl is missing, or when any Location
# header points at myaccount.nytimes.com; true otherwise.
def retreive_successful?(page)
  return false unless page && page.curl

  # HTTP field names are case-insensitive (HTTP/2 responses use lowercase
  # "location:"), so match the header name regardless of case.
  redirects = page.curl.header_str.scan(/^Location: .*/i)
  # Dots are escaped so they match only a literal "." in the host name.
  redirects.grep(/myaccount\.nytimes\.com/i).empty?
end