class TaiwaneseNewsParser::Parser::LibertyTimesNews
Public Class Methods
applicable?(url)
click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 10
# Tells whether this parser handles the given URL.
#
# @param url [String] URL (or URL fragment) to test
# @return [Boolean] true when +url+ contains the Liberty Times news host
def self.applicable?(url)
  news_host = 'news.ltn.com.tw'
  url.include?(news_host)
end
domain()
click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 2
# Domain string associated with this parser.
#
# @return [String] always 'ltn.com.tw'
def self.domain
  'ltn.com.tw'
end
names()
click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 6
# Name under which this news source is known.
#
# NOTE(review): despite the plural method name this returns a single String,
# not an Array (the original literal was %{...}, not %w{...}). Confirm what
# callers expect before changing the return type.
#
# @return [String] '自由時報'
def self.names
  '自由時報'
end
parse_url_id(url)
click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 62
# Extracts the numeric article id from a URL.
#
# The id is the run of digits that follows three slash-separated word
# segments, e.g. ".../news/politics/breakingnews/998126" yields "998126".
#
# @param url [String] article URL
# @return [String, nil] the digits as a String, or nil when the pattern is absent
def self.parse_url_id(url)
  id_match = url.match(%r{\w+/\w+/\w+/(\d+)})
  id_match ? id_match[1] : nil
end
Public Instance Methods
clean_url()
click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 57
# Replaces @article[:url] with the result of passing it through a fresh
# TaiwaneseNewsParser::UrlCleaner.
#
# @return [Object] the cleaned URL that was stored
def clean_url
  @article[:url] = TaiwaneseNewsParser::UrlCleaner.new.clean(@article[:url])
end
doc()
click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 14
# Downloads the page at +url+ and parses it with Nokogiri::HTML.
#
# The parsed document is cached in @doc, so repeated calls (parse calls this
# several times) do not download the page again. The raw body is kept in @raw.
#
# NOTE(review): opening a URL through Kernel#open depends on open-uri and is
# not supported from Ruby 3.0 on; switch to URI.open when the required Ruby
# version allows it.
#
# @return [Object] the document returned by Nokogiri::HTML
def doc
  @doc ||= begin
    # Block form lets open close the handle once the body has been read.
    @raw = open(url, &:read)
    Nokogiri::HTML(@raw)
  end
end
parse()
click to toggle source
url = 'news.ltn.com.tw/news/politics/breakingnews/998126'
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 20
# Fills @article from the article page and returns it.
#
# Sets :title, :company_name, :content and :reporter_name, plus :published_at
# when a script tag carries a `newsTime = '<epoch seconds>';` assignment.
#
# NOTE(review): the source was collapsed onto one line; the trailing
# "clean_up @article" is read as a bare clean_up call followed by returning
# @article — confirm against the real file.
#
# @return [Hash] the populated @article
def parse
  # new layout uses utf-8
  # Fetch the document once; calling doc for every query may re-download it.
  page = doc

  @article[:title] = page.at_css('.content h1').text
  @article[:company_name] = parse_company_name
  @article[:content] = page.css('#newstext p').text

  # The publish time is only available as a timestamp inside an inline script.
  page.css('script').each do |script|
    match = script.content.match(%r{newsTime\s=\s\'(\d+)\';$})
    next if match.nil?

    @article[:published_at] = Time.at(match.captures[0].to_i)
    break
  end

  @article[:reporter_name] = parse_reporter_name

  clean_up

  @article
end
parse_company_name()
click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 53
# Company name recorded for every article handled by this parser.
#
# @return [String] '自由時報'
def parse_company_name
  '自由時報'
end
parse_reporter_name()
click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 42
# Extracts the reporter's name from @article[:content].
#
# Three byline shapes are tried in order:
# 1. 〔記者NAME/...〕  - bracketed byline; the name follows 記者 before the slash
# 2. 記者NAME/         - unbracketed byline
# 3. (文/NAME)         - "text by" credit in ASCII or full-width parentheses
#
# Every slash position accepts "/", "／" (full-width) and "╱".
#
# @return [String, nil] the reporter name, or nil when no byline is recognised
#   (also nil when a bracketed byline exists but contains no 記者)
def parse_reporter_name
  content = @article[:content]

  if (match = content.match(%r{〔(.*?)[/／╱](.*?)〕}))
    match[1][%r{記者(.+)}, 1]
  elsif (match = content.match(%r{記者(.+?)[/／╱]}))
    match[1]
  # Parentheses are literal here; the capture is the name, not the "文/" prefix.
  elsif (match = content.match(%r{[（(]文[/／╱](.*?)[）)]}))
    match[1]
  end
end