class TaiwaneseNewsParser::Parser

Attributes

article[R]
url[RW]

Public Class Methods

applicable?(url) click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 10
# Reports whether this parser class handles +url+.
#
# The check is a plain substring match: it is true when the string returned
# by +domain+ occurs anywhere inside +url+.
#
# @param url [String] address to test
# @return [Boolean] true when +url+ contains this class's domain
def self.applicable?(url)
  handled_domain = domain
  url.include?(handled_domain)
end
applicable_parser(url) click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 14
# Fetches +url+, takes the final address reported by open-uri (+base_uri+,
# i.e. the location after any redirects) and instantiates the first class in
# +subclasses+ whose +applicable?+ accepts that address.
#
# @param url [String] address of the article to parse
# @return [Object, nil] a parser built with the resolved address, or nil when
#   no class in +subclasses+ is applicable
def self.applicable_parser(url)
  # URI.open instead of Kernel#open: Kernel#open stopped fetching URLs in
  # Ruby 3.0 and would run a subprocess for input starting with "|".
  # The block form closes the response stream once the address is read.
  redirected_url = URI.open(url) { |response| response.base_uri.to_s }

  matching_class = subclasses.find { |candidate| candidate.applicable?(redirected_url) }
  matching_class&.new(redirected_url)
end
domain() click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 56
# Placeholder for the domain string a concrete parser handles.
#
# This implementation always raises; a class that wants to be usable with
# +applicable?+ has to supply its own version returning the domain.
#
# @raise [NotImplementedError] always, naming the receiver that lacks an override
def self.domain
  # Same exception class as before; the message identifies the offending
  # receiver so the failure is diagnosable from the backtrace alone.
  raise NotImplementedError, "#{self} must implement self.domain"
end
new(url) click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 25
# Stores +url+ and seeds the article hash with the values derivable from it.
#
# The hash starts with three entries: +:url+ (the argument as given),
# +:web_domain+ (from the class-level +domain+) and +:url_id+ (from the
# class-level +parse_url_id+ applied to +url+).
#
# @param url [String] address of the article
def initialize(url)
  parser_class = self.class
  @url = url
  @article = {
    :url        => url,
    :web_domain => parser_class.domain,
    :url_id     => parser_class.parse_url_id(url)
  }
end
subclasses() click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 52
# Lists the parser classes that +applicable_parser+ searches, in the order
# they are tried (the first applicable one wins).
#
# The list is written out by hand, so a new parser class must be added here
# to be considered. A fresh array is built on every call.
#
# @return [Array<Class>] the known parser classes
def self.subclasses
  [
    Udn,
    LibertyTimes,
    LibertyTimesBig5,
    LibertyTimesNews,
    ChinaTimes,
    ChinaTimesMoney,
    Cna,
    AppleDaily,
    Ettoday,
    Tvbs,
    Cts,
    NowNews
  ]
end

Public Instance Methods

clean_up() click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 39
# Normalises the parsed article in place.
#
# 1. Strips surrounding whitespace from the +:content+, +:title+,
#    +:reporter_name+ and +:company_name+ entries that are present; the
#    existing string objects are mutated rather than replaced.
# 2. Calls +clean_url+ when the object responds to it.
# 3. Records the result of +reproduced?+ under +:reproduced+.
#
# @return [Object] the value stored under +:reproduced+
def clean_up
  text_keys = [:content, :title, :reporter_name, :company_name]
  text_keys.each do |key|
    value = @article[key]
    next unless value

    value.strip!
  end

  # clean_url is an optional hook; it runs only when respond_to? reports it.
  if respond_to?(:clean_url)
    clean_url
  end

  @article[:reproduced] = reproduced?
end
doc() click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 33
# Downloads the page at +url+ and parses it with Nokogiri.
#
# The response body is transcoded from Big5 to UTF-8; bytes that are invalid
# or have no UTF-8 counterpart are dropped (replaced with an empty string).
# The transcoded text is kept in +@raw+ and the parsed document in +@doc+.
# Every call performs a new download; nothing is cached.
#
# @return [Object] whatever +::Nokogiri::HTML+ returns for the page
def doc
  # URI.open instead of Kernel#open: Kernel#open stopped fetching URLs in
  # Ruby 3.0 and would run a subprocess for input starting with "|".
  # The block form closes the response stream as soon as the body is read.
  body = URI.open(url) { |response| response.read }
  @raw = body.encode('utf-8', 'big5', :invalid => :replace, :undef => :replace, :replace => '')
  @doc = ::Nokogiri::HTML(@raw, url)
end
reproduced?() click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 47
# Tells whether the article's company name is absent from the list returned
# by the class-level +names+ method.
#
# @return [Boolean] true when +parse_company_name+ is not among
#   +self.class.names+, false when it is
def reproduced?
  known_names = self.class.names
  company = parse_company_name
  !known_names.include?(company)
end