class TaiwaneseNewsParser::Parser::NowNews
Public Class Methods
domain()
click to toggle source
# File lib/taiwanese_news_parser/parser/now_news.rb, line 2
# Host name of the news site handled by this parser.
#
# @return [String] always 'nownews.com'
def self.domain
  'nownews.com'
end
names()
click to toggle source
# File lib/taiwanese_news_parser/parser/now_news.rb, line 6
# Names under which this news outlet is known.
#
# @return [Array<String>] a fresh two-element array: the English name
#   followed by the Chinese name
def self.names
  %w[NowNews 今日新聞]
end
parse_url_id(url)
click to toggle source
# File lib/taiwanese_news_parser/parser/now_news.rb, line 46
# Extracts the trailing numeric id from an article URL.
#
# The pattern looks for three slash-separated runs of digits followed by a
# fourth run, and returns that fourth run. For example the path
# "/2014/03/21/1159861" yields "1159861".
#
# @param url [String] the URL (or path) to inspect
# @return [String, nil] the captured digits, or nil when the pattern is absent
def self.parse_url_id(url)
  article_id_pattern = %r{/\d+/\d+/\d+/(\d+)}
  url[article_id_pattern, 1]
end
Public Instance Methods
clean_url()
click to toggle source
# File lib/taiwanese_news_parser/parser/now_news.rb, line 41
# Replaces the article's stored URL with the result of passing it through a
# new TaiwaneseNewsParser::UrlCleaner.
#
# @return [Object] whatever UrlCleaner#clean returned (the value that was
#   stored back into @article[:url])
def clean_url
  @article[:url] = TaiwaneseNewsParser::UrlCleaner.new.clean(@article[:url])
end
doc()
click to toggle source
# File lib/taiwanese_news_parser/parser/now_news.rb, line 10
# Downloads the article page and parses it with Nokogiri.
#
# The parsed document is memoized in @doc, so the page is fetched at most
# once per parser instance no matter how many times this method is called.
# The raw response body is kept in @raw.
#
# NOTE(review): calling Kernel#open with a URL depends on open-uri having
# patched it, and Kernel#open will spawn a subprocess for strings starting
# with "|". Confirm `url` is trusted, and consider URI.open when the
# supported Ruby versions allow it.
#
# @return [Object] the document returned by Nokogiri::HTML
def doc
  @doc ||= begin
    # Block form so the underlying IO is closed as soon as it has been read.
    @raw = open(url, &:read)
    Nokogiri::HTML(@raw)
  end
end
parse()
click to toggle source
url = 'www.nownews.com/n/2014/03/21/1159861'
# File lib/taiwanese_news_parser/parser/now_news.rb, line 16
# Fills @article from the downloaded page and returns it.
#
# Sets :title, :company_name, :content, :reporter_name and :published_at,
# then calls clean_up (defined outside this view).
#
# :published_at is built from the first "<year>年 <month>月 <day>日 … HH:MM"
# timestamp found in the '#reporter_info p' text, as a Time in the local
# zone. When no such timestamp is present it is set to nil rather than
# raising NoMethodError on the failed match.
#
# @return [Hash] the populated @article
def parse
  page = doc # read once; every selector below queries the same document

  @article[:title] = page.css('[itemprop=headline]').text
  @article[:company_name] = self.class.names.first
  @article[:content] = page.css('[itemprop=articleBody]>p').text
  @article[:reporter_name] = parse_reporter_name

  stamp = page.css('#reporter_info p').text.match(/(\d+)年\s*(\d+)月\s*(\d+)日\D*(\d+):(\d+)/)
  @article[:published_at] = stamp && Time.new(*stamp.captures.map(&:to_i))

  clean_up
  @article
end
parse_reporter_name()
click to toggle source
# File lib/taiwanese_news_parser/parser/now_news.rb, line 33
# Extracts the reporter's name from the article body.
#
# Takes the shortest run of characters that follows the first "記者" and is
# terminated by "/" or "╱" in the text of the '[itemprop=articleBody]' node.
#
# NOTE(review): a byline written with the full-width slash (U+FF0F) is not
# terminated by this character class — confirm whether it should be added.
#
# @return [String, nil] the captured name, or nil when no byline is found
def parse_reporter_name
  body_text = doc.css('[itemprop=articleBody]').text
  body_text[%r{記者(.+?)[/╱]}, 1]
end
reproduced?()
click to toggle source
# File lib/taiwanese_news_parser/parser/now_news.rb, line 50
# Always answers false for this parser.
#
# NOTE(review): presumably reports whether the article was republished from
# another outlet — confirm against the callers of this predicate.
#
# @return [Boolean] false
def reproduced?
  false
end