class WebPageParser::BbcNewsPageParserV6

Public Instance Methods

content() click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 222
# Returns the story text as an array of whitespace-stripped strings, one per
# paragraph, heading or list item, in document order. The array is stored in
# @content and reused on later calls.
def content
  return @content if @content

  @content = []
  body_nodes = html_doc.css('div.story-body > div.story-body__inner').children

  body_nodes.each do |node|
    tag = node.name
    if %w[p h2 h3].include?(tag)
      @content << node.text.strip
    elsif tag == 'ul' && node['class'] =~ /story-body/
      # Only lists carrying a story-body class contribute their items;
      # any other <ul> is skipped.
      node.css('li').each { |item| @content << item.text.strip }
    end
  end

  @content
end
date() click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 243
# Returns the publication date as a DateTime, read from the page's
# OriginalPublicationDate meta tag. Returns nil when the tag is absent or
# its content cannot be parsed. A successfully parsed value is kept in @date.
def date
  return @date if @date

  meta_tag = html_doc.at_css('meta[name=OriginalPublicationDate]')
  return @date unless meta_tag

  @date = begin
    DateTime.parse(meta_tag['content'])
  rescue StandardError
    # Any parse failure is treated as "no date available".
    nil
  end
end
html_doc() click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 214
# Returns the result of Nokogiri::HTML(page), built on first use and kept in
# @html_document for later calls.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title() click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 218
# Returns the whitespace-stripped text of the h1.story-body__h1 heading,
# kept in @title after the first lookup.
def title
  return @title if @title

  heading = html_doc.css('h1.story-body__h1')
  @title = heading.text.strip
end