class WebPageParser::BbcNewsPageParserV2

BbcNewsPageParserV2 parses BBC News web pages.

Constants

CONTENT_RE
DATE_RE
PARA_RE
STRIP_BLOCKS_RE
STRIP_CAPTIONS_RE
STRIP_COMMENTS_RE
STRIP_TAGS_RE
TITLE_RE
WHITESPACE_RE

Private Instance Methods

content_processor() click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 74
# Cleans up the raw @content string and replaces it with an array of
# paragraph strings.
#
# Steps, in order:
#   1. Each of the four STRIP_* patterns is applied in turn, replacing every
#      match with the empty string.
#   2. Every WHITESPACE_RE match is replaced with a single space.
#   3. The result is split on PARA_RE and stored back in @content.
#
# The patterns are called as pattern.gsub(text, replacement), so they are
# expected to be pattern objects that respond to that signature (core Regexp
# does not) -- presumably Oniguruma ORegexp instances; confirm against the
# constant definitions.
#
# Returns the array assigned to @content.
def content_processor
  # Order matters: captions, comments and whole blocks are dropped before the
  # generic tag stripper runs over whatever is left.
  strip_patterns = [STRIP_CAPTIONS_RE, STRIP_COMMENTS_RE, STRIP_BLOCKS_RE, STRIP_TAGS_RE]
  stripped = strip_patterns.inject(@content) do |text, pattern|
    pattern.gsub(text, '')
  end
  normalised = WHITESPACE_RE.gsub(stripped, ' ')
  @content = normalised.split(PARA_RE)
end
date_processor() click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 83
# Converts the raw @date value into a date/time object, in place.
#
# On success @date becomes the DateTime returned by DateTime.parse. If the
# value cannot be parsed (ArgumentError) or is not a String at all, e.g. nil
# (TypeError), @date falls back to the current time in UTC as a Time
# instance.
#
# Returns the value assigned to @date.
def date_processor
  # OPD (presumably the page's original-publication-date field -- confirm
  # against DATE_RE) is in GMT/UTC, which DateTime seems to use by default.
  @date = DateTime.parse(@date)
rescue ArgumentError, TypeError
  # Best-effort fallback: an unparseable or missing date is treated as "now".
  @date = Time.now.utc
end