class WebPageParser::GuardianPageParserV1

GuardianPageParserV1 parses Guardian web pages using regular expressions.

Constants

CONTENT_RE
DATE_RE
PARA_RE
STRIP_SCRIPTS_RE
STRIP_TAGS_RE
TITLE_RE

Private Instance Methods

content_processor() click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 36
# Turns the markup held in @content into an array of text fragments.
#
# Each stripper constant is applied in turn (tags first, then scripts),
# replacing every match with the empty string. The remaining text is then
# scanned with PARA_RE and, for every match, the element at index 1 of the
# captures is kept.
#
# NOTE(review): the strippers are called as `stripper.gsub(string, replacement)`,
# which core Regexp does not offer - presumably they are Oniguruma-style
# ORegexp objects; confirm against the constant definitions.
#
# Returns the resulting array, which is also stored in @content.
def content_processor
  [STRIP_TAGS_RE, STRIP_SCRIPTS_RE].each do |stripper|
    @content = stripper.gsub(@content, '')
  end
  # index 1 is the second capture (assumes PARA_RE defines at least two groups)
  @content = @content.scan(PARA_RE).map { |captures| captures[1] }
end
date_processor() click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 27
# Replaces the raw value in @date with a parsed DateTime.
#
# When the value cannot be parsed, @date falls back to the current UTC time
# (a Time). That covers both unparseable strings (ArgumentError) and values
# that are not strings at all, such as nil (TypeError), so a missing date no
# longer escapes the fall-back.
#
# Returns the new value of @date.
def date_processor
  # OPD is in GMT/UTC, which DateTime seems to use by default
  @date = DateTime.parse(@date)
rescue ArgumentError, TypeError
  @date = Time.now.utc
end
filter_url(url) click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 42
# Returns +url+ as a String with every occurrence of the host name
# "www.guprod.gnl" replaced by "www.guardian.co.uk".
#
# The original author notes this works around a weird Guardian problem seen
# with some older articles. +url+ is converted with to_s first, so nil yields
# an empty string.
def filter_url(url)
  stale_host = "www.guprod.gnl"
  live_host = "www.guardian.co.uk"
  url.to_s.gsub(stale_host, live_host)
end