class TaiwaneseNewsParser::Parser
Attributes
article[R]
url[RW]
Public Class Methods
applicable?(url)
click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 10
# Tells whether this parser class handles the given URL.
#
# url - the URL string to test.
#
# Returns true when the string returned by .domain occurs anywhere in url.
def self.applicable?(url)
  site = domain
  url.include?(site)
end
applicable_parser(url)
click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 14
# Builds a parser instance for the given URL.
#
# The URL is fetched once so that the base URI reported by open-uri (the
# address after any redirects it followed) is what the candidate parsers
# are matched against.
#
# url - the article URL string.
#
# Returns an instance of the first class in .subclasses whose .applicable?
# accepts the resolved URL, or nil when no class does.
# Raises whatever URI.parse / open-uri raise for malformed or unreachable URLs.
def self.applicable_parser(url)
  # Use URI#open instead of Kernel#open: since Ruby 3.0 Kernel#open no longer
  # handles URLs, and on older Rubies it would happily open local files or
  # "|command" pipes. The block form closes the response as soon as we are done.
  redirected_url = URI.parse(url).open { |response| response.base_uri.to_s }

  # Block parameter deliberately named differently from the result local to
  # avoid shadowing it.
  parser_class = subclasses.find do |candidate|
    candidate.applicable?(redirected_url)
  end

  parser_class.new(redirected_url) if parser_class
end
domain()
click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 56
# Abstract hook: the domain string a concrete parser is responsible for.
#
# This base implementation always raises; .applicable? and #initialize call
# it, so each concrete parser is expected to provide its own version.
#
# Raises NotImplementedError naming the receiver that lacks an override.
def self.domain
  raise NotImplementedError, "#{self} must implement .domain"
end
new(url)
click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 25
# Creates a parser bound to one article URL.
#
# url - the article URL string.
#
# Stores the URL in @url and seeds @article with :url, :web_domain (from the
# class-level .domain) and :url_id (from the class-level .parse_url_id).
def initialize(url)
  @url = url
  @article = {
    :url        => url,
    :web_domain => self.class.domain,
    :url_id     => self.class.parse_url_id(url)
  }
end
subclasses()
click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 52
# The hand-maintained list of parser classes, in the order
# .applicable_parser tries them (the first applicable one wins).
#
# NOTE(review): on Ruby 3.1+ this definition takes precedence over the
# built-in Class#subclasses for this class - confirm that is intended.
#
# Returns a new Array of parser classes on every call.
def self.subclasses
  [
    Udn,
    LibertyTimes,
    LibertyTimesBig5,
    LibertyTimesNews,
    ChinaTimes,
    ChinaTimesMoney,
    Cna,
    AppleDaily,
    Ettoday,
    Tvbs,
    Cts,
    NowNews
  ]
end
Public Instance Methods
clean_up()
click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 39
# Normalises the parsed article in place.
#
# Strips surrounding whitespace from the text fields that are present,
# runs the optional #clean_url hook when the object publicly responds to it,
# and records the result of #reproduced? under :reproduced.
#
# Returns the value stored in @article[:reproduced].
def clean_up
  %i[content title reporter_name company_name].each do |field|
    text = @article[field]
    # strip! mutates the stored string, so no reassignment is needed.
    text.strip! if text
  end

  clean_url if respond_to?(:clean_url)

  @article[:reproduced] = reproduced?
end
doc()
click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 33
# Downloads the page at #url and parses it as HTML.
#
# The response body is treated as Big5 and transcoded to UTF-8; bytes that
# are invalid or have no UTF-8 equivalent are dropped (replaced with '').
# The transcoded text is kept in @raw and the parsed document in @doc.
# Every call performs a fresh download - nothing is memoised here.
#
# Returns the Nokogiri HTML document.
# Raises whatever URI.parse / open-uri raise for malformed or unreachable URLs.
def doc
  # URI#open replaces Kernel#open, which stopped handling URLs in Ruby 3.0 and
  # could open local files or "|command" pipes on older Rubies. Passing a
  # block makes open-uri close the response once the body has been read.
  body = URI.parse(url).open(&:read)
  @raw = body.encode('utf-8', 'big5', :invalid => :replace, :undef => :replace, :replace => '')
  @doc = ::Nokogiri::HTML(@raw, url)
end
reproduced?()
click to toggle source
# File lib/taiwanese_news_parser/parser.rb, line 47
# Reports whether the article's company name is absent from the parser
# class's own list of names.
#
# Returns true when self.class.names does not include the value of
# #parse_company_name, false otherwise.
def reproduced?
  own_names = self.class.names
  !own_names.include?(parse_company_name)
end