class WebPageParser::BbcNewsPageParserV5
Public Instance Methods
content()
click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 170
# Extracts the story text as an array of stripped strings, one entry per
# paragraph or cross-heading, in document order.
#
# The story container is located by trying a list of CSS selectors from the
# newest page layout to the oldest; the first one that has child nodes wins.
# Only direct children of that container are inspected:
#
# * <p> elements
# * <span> elements whose class contains "cross-head" (pre-April 2015 headings)
# * <h2> elements whose class contains "crosshead" (post-April 2015 headings)
#
# The result is memoized in @content. Returns an empty array when no story
# container with children is found.
def content
  return @content if @content

  # Ordered from newest layout to oldest; order matters because the broader
  # 'div.story-body' would otherwise shadow the more specific inner wrapper.
  selectors = [
    'div.story-body > div.story-body__inner',
    'div.story-body',  # Pre April 2015
    'div#story-body',  # for older bbc articles
    'td.storybody'     # for very old bbc articles
  ]

  story_body = nil
  selectors.each do |selector|
    story_body = html_doc.css(selector)
    break unless story_body.children.empty?
  end

  paragraphs = []
  story_body.children.each do |node|
    wanted =
      case node.name
      when 'p'
        true
      when 'span'
        # A span without a class attribute yields nil here; to_s keeps the
        # check from raising NoMethodError on such nodes.
        node['class'].to_s.include?('cross-head')
      when 'h2'
        node['class'].to_s.include?('crosshead')
      else
        false
      end
    paragraphs << node.text.strip if wanted
  end

  # Assign only once extraction has finished, so an exception part-way
  # through does not leave a partial result memoized.
  @content = paragraphs
end
date()
click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 201
# Returns the publication date as a DateTime, read from the "content"
# attribute of the <meta name="OriginalPublicationDate"> element.
#
# A successfully parsed value is memoized in @date. When the meta element is
# absent, or its content cannot be parsed, the result is nil and the lookup
# is attempted again on the next call.
def date
  return @date if @date

  meta = html_doc.at_css('meta[name=OriginalPublicationDate]')
  return @date unless meta

  # Best effort: any StandardError raised while reading or parsing the
  # attribute is deliberately turned into nil.
  @date = begin
    DateTime.parse(meta['content'])
  rescue StandardError
    nil
  end
end
html_doc()
click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 144
# Returns the document produced by passing +page+ to Nokogiri::HTML.
# The parse happens on first use; the result is kept in @html_document and
# reused by later calls.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title()
click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 148
# Returns the story headline as a stripped string, memoized in @title.
#
# Heading selectors are tried in order and the first non-empty text is used.
# If none of them yields text, the "content" attribute of the
# <meta name="Headline"> element is used when that element exists.
# The result is an empty string when nothing matches.
def title
  return @title if @title

  heading_selectors = [
    'h1.story-body__h1',
    'h1.story-header',          # for older bbc articles
    'div#meta-information h1'
  ]

  heading_selectors.each do |selector|
    @title = html_doc.css(selector).text.strip
    break unless @title.empty?
  end

  # for very old bbc articles
  if @title.empty?
    headline_meta = html_doc.at_css('meta[name=Headline]')
    @title = headline_meta['content'].to_s.strip if headline_meta
  end

  @title
end