class WebPageParser::RTPageParserV1
RTPageParserV1
parses RT web pages using HTML parsing.
Public Instance Methods
content()
click to toggle source
# File lib/web-page-parser/parsers/rt_page_parser.rb, line 28
# Extracts the article text as an array of stripped strings.
#
# The summary (div.article__summary), when non-empty, comes first, followed
# by the text of every direct <p>, <h2> and <h3> child of div.article__text.
# Elements whose text is empty or whitespace-only are dropped. The result is
# memoized in @content so the document is only walked once.
#
# @return [Array<String>] summary (if any) followed by body paragraphs/headings
def content
  return @content if @content

  story_summary = html_doc.css('div.article__summary').text.strip
  story_elements = html_doc.css('div.article__text > *').select do |e|
    %w[p h2 h3].include?(e.name)
  end
  # Strip before testing for emptiness so whitespace-only elements are
  # dropped as well, and so no nil placeholders ever reach the caller.
  story_body = story_elements.map { |e| e.text.strip }.reject(&:empty?)
  story_body.unshift(story_summary) unless story_summary.empty?
  @content = story_body
end
date()
click to toggle source
# File lib/web-page-parser/parsers/rt_page_parser.rb, line 39
# Returns the article date parsed from the first matching
# `div.article time.date` element, memoized in @date.
#
# @return [DateTime] the value DateTime.parse produces from the element text
def date
  return @date if @date

  time_node = html_doc.at_css('div.article time.date:first')
  @date = DateTime.parse(time_node.text.strip)
end
filter_url(url)
click to toggle source
# File lib/web-page-parser/parsers/rt_page_parser.rb, line 43
# Normalizes a URL by converting it to a String and rewriting every
# occurrence of the host "www.guprod.gnl" to "www.guardian.co.uk".
#
# NOTE(review): this works around a weird Guardian problem with some older
# articles; it looks carried over from another parser — confirm it is
# actually needed for RT pages.
#
# @param url [#to_s] the URL to filter
# @return [String] the filtered URL
def filter_url(url)
  url_string = url.to_s
  url_string.gsub("www.guprod.gnl", "www.guardian.co.uk")
end
html_doc()
click to toggle source
# File lib/web-page-parser/parsers/rt_page_parser.rb, line 20
# Returns the result of Nokogiri::HTML(page), memoized in @html_document so
# the page is only parsed on the first call.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title()
click to toggle source
# File lib/web-page-parser/parsers/rt_page_parser.rb, line 24
# Returns the stripped text of the `h1.article__heading` element(s),
# memoized in @title.
#
# @return [String]
def title
  return @title if @title

  heading = html_doc.css('h1.article__heading')
  @title = heading.text.strip
end