class TaiwaneseNewsParser::Parser::Cna
Public Class Methods
domain()
click to toggle source
# File lib/taiwanese_news_parser/parser/cna.rb, line 2
# Returns the domain string associated with this parser.
#
# @return [String] always 'cna.com.tw'
def self.domain
  'cna.com.tw'
end
names()
click to toggle source
# File lib/taiwanese_news_parser/parser/cna.rb, line 6
# Returns the name of this news source.
#
# NOTE(review): the original literal was written as %{...}, which builds a
# single String rather than an Array. If callers expect a list of names,
# %w{...} may have been intended -- confirm before changing the return type.
#
# @return [String] always '中央社'
def self.names
  '中央社'
end
parse_url_id(url)
click to toggle source
# File lib/taiwanese_news_parser/parser/cna.rb, line 56
# Extracts the numeric article id from an article URL.
#
# The id is the run of digits that follows a '/' and precedes the file
# extension dot; a single-digit "-N" suffix between the digits and the dot
# is tolerated and left out of the result.
#
# @param url [String] article URL
# @return [String, nil] the digit string, or nil when the URL has no such segment
def self.parse_url_id(url)
  found = url.match(%r{/(\d+)(?:\-\d)?\.})
  found ? found[1] : nil
end
Public Instance Methods
doc()
click to toggle source
# File lib/taiwanese_news_parser/parser/cna.rb, line 10
# Downloads the page at +url+ and returns it as a parsed HTML document.
#
# The result is memoized in @doc: callers invoke this method several times
# while parsing a single article, and without caching every call would fetch
# and re-parse the page again. The raw response body is kept in @raw.
#
# NOTE(review): this relies on Kernel#open accepting a URL (open-uri). On
# Ruby 3.0+ that requires URI.open instead -- confirm the supported Ruby
# versions before switching.
#
# @return [Object] whatever Nokogiri::HTML returns for the downloaded body
def doc
  return @doc if @doc

  @raw = open(url).read
  @doc = Nokogiri::HTML(@raw)
end
parse()
click to toggle source
url = 'www.cna.com.tw/News/aSaM/201304120296-1.aspx'
# File lib/taiwanese_news_parser/parser/cna.rb, line 16
# Populates @article from the page and returns it.
#
# Fields written: :title, :company_name, :content, :reporter_name and
# :published_at. The publication date is taken from the first run of seven
# digits in the article body, read as a 3-digit year plus 1911 (this looks
# like a Minguo/ROC calendar stamp), a 2-digit month and a 2-digit day; the
# text of the '.date' element is appended before handing the string to
# Time.parse. When the body carries no such stamp, :published_at is set to
# nil instead of raising NoMethodError on a nil match.
#
# @return [Hash] the @article hash after clean_up has run
def parse
  @article[:title] = doc.at_css('.news_content h1, .news_content h2').text
  @article[:company_name] = '中央社'

  # Query the body once and reuse it for both the content and the date stamp.
  body_text = doc.css('.news_content .box_2').text
  @article[:content] = body_text
  @article[:reporter_name] = parse_reporter_name

  stamp = body_text.strip.match(/(\d{3})(\d{2})(\d{2})/)
  @article[:published_at] =
    if stamp
      year = stamp[1].to_i + 1911
      Time.parse("#{year}/#{stamp[2]}/#{stamp[3]} #{doc.css('.date').text}")
    end

  clean_up
  @article
end
parse_reporter_name()
click to toggle source
# File lib/taiwanese_news_parser/parser/cna.rb, line 38
# Extracts the reporter name from the dateline at the start of the article body.
#
# The dateline is the text between an opening parenthesis followed by
# "中央社" and the "<day>日" marker. Both the full-width "（" and the ASCII "("
# are accepted; the pattern as previously written had an unbalanced "(" and
# could not compile. A trailing city name, optionally followed by 縣 or 市,
# is stripped, and whatever follows "記者" is returned.
#
# @return [String, nil] the reporter name, or nil when the body has no
#   dateline or the dateline has no "記者" marker
def parse_reporter_name
  body_text = doc.css('.news_content .box_2').text
  dateline = body_text[/[（(]中央社(.*?)\d{1,2}日/, 1]
  return nil unless dateline

  cities = %w{台北 新北 台中 台南 高雄 基隆 新竹 嘉義 桃園 苗栗 彰化 南投 雲林 屏東 宜蘭 花蓮 台東 澎湖 金門 連江}
  # Drop one trailing city name; the suffix is a single 縣 or 市, not both together.
  byline = dateline.sub(/#{Regexp.union(cities)}[縣市]?\z/, '')

  # TODO proper location name removal
  byline[/記者(.+)/, 1]
end
reproduced?()
click to toggle source
# File lib/taiwanese_news_parser/parser/cna.rb, line 52
# Predicate that always answers false for this parser.
#
# @return [Boolean] always false
def reproduced?
  false
end