class WebPageParser::RTPageParserV1

RTPageParserV1 parses RT web pages using HTML parsing.

Public Instance Methods

content() click to toggle source
# File lib/web-page-parser/parsers/rt_page_parser.rb, line 28
# Extracts the article text as an array of strings.
#
# Collects the stripped text of every <p>, <h2> and <h3> that is a direct
# child of div.article__text, skipping elements whose text is blank. When
# div.article__summary has non-blank text it is placed first in the array.
#
# The result is memoised in @content, so repeated calls return the same
# array without querying the document again.
#
# @return [Array<String>] summary (if any) followed by the body paragraphs
def content
  return @content if @content

  summary = html_doc.css('div.article__summary').text.strip
  body_nodes = html_doc.css('div.article__text > *').select do |node|
    %w[p h2 h3].include?(node.name)
  end
  # Strip first, then discard blanks, so whitespace-only elements are
  # dropped as well as truly empty ones.
  paragraphs = body_nodes.map { |node| node.text.strip }.reject(&:empty?)
  paragraphs.unshift(summary) unless summary.empty?
  @content = paragraphs
end
date() click to toggle source
# File lib/web-page-parser/parsers/rt_page_parser.rb, line 39
# Publication date of the article, parsed from the text of the element
# matched by 'div.article time.date:first'. Memoised in @date.
#
# NOTE(review): if at_css finds no matching element it presumably returns
# nil and the .text call below raises -- confirm whether that is intended.
#
# @return [DateTime]
def date
  return @date if @date

  time_node = html_doc.at_css('div.article time.date:first')
  @date = DateTime.parse(time_node.text.strip)
end
filter_url(url) click to toggle source
# File lib/web-page-parser/parsers/rt_page_parser.rb, line 43
# Normalises a URL by replacing every occurrence of the host
# "www.guprod.gnl" with "www.guardian.co.uk". The argument is converted
# with to_s first, so nil yields an empty string.
#
# NOTE(review): both hosts are Guardian ones (a workaround for some older
# Guardian articles); this looks carried over from another parser --
# confirm whether it is needed here.
#
# @param url [#to_s] the URL to normalise
# @return [String] the URL with the host substitution applied
def filter_url(url)
  legacy_host = "www.guprod.gnl"
  canonical_host = "www.guardian.co.uk"
  url.to_s.gsub(legacy_host, canonical_host)
end
html_doc() click to toggle source
# File lib/web-page-parser/parsers/rt_page_parser.rb, line 20
# The page source parsed with Nokogiri::HTML. The document is built on the
# first call and cached in @html_document for later calls.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title() click to toggle source
# File lib/web-page-parser/parsers/rt_page_parser.rb, line 24
# The article headline: the stripped text of 'h1.article__heading'.
# Memoised in @title.
#
# @return [String]
def title
  return @title if @title

  heading = html_doc.css('h1.article__heading')
  @title = heading.text.strip
end