class TaiwaneseNewsParser::Parser::AppleDaily
Public Class Methods
domain()
click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 2
# Host name whose articles this parser is written for.
#
# @return [String] the Apple Daily (Taiwan) domain
def self.domain
  'appledaily.com.tw'
end
names()
click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 6
# Names under which this news outlet is known.
#
# @return [Array<String>] a fresh one-element array on every call
def self.names
  ['蘋果日報']
end
parse_time(raw_time)
click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 55
# Parses a publication timestamp written as "YYYY年MM月DD日HH:MM" or
# "YYYY年MM月DD日".
#
# Leading and trailing whitespace is ignored, since the text is typically
# scraped from markup. The date-and-time format is tried before the date-only
# format.
#
# @param raw_time [String, nil] timestamp text; nil is treated as empty
# @return [DateTime, nil] the parsed value, or nil when no format matches
def self.parse_time(raw_time)
  text = raw_time.to_s.strip

  ['%Y年%m月%d日%H:%M', '%Y年%m月%d日'].each do |format|
    begin
      return DateTime.strptime(text, format)
    rescue ArgumentError
      # Only "this format does not match / invalid date" is expected here;
      # anything else is a real error and must not be swallowed.
      next
    end
  end

  nil
end
parse_url_id(url)
click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 50
# Extracts the article id -- the slash-separated run of digit groups that
# follows "/<section>/article/<category>/" -- from an Apple Daily URL.
#
# The id is returned without a trailing slash. It is recognised whether it is
# followed by a further path segment, a query string, a fragment, or the end
# of the URL. Both http and https URLs are accepted.
#
# @param url [String, nil] article URL
# @return [String, nil] e.g. "20130414/34951658", or nil when the URL does not
#   look like an Apple Daily article URL
def self.parse_url_id(url)
  # The lookahead keeps the id on a path boundary, so a following segment that
  # merely starts with digits (e.g. "/123-title") is not absorbed into the id.
  article_id = %r{https?://www\.appledaily\.com\.tw/\w+/article/\w+/(\d+(?:/\d+)*)(?=[/?#]|\z)}
  url.to_s[article_id, 1]
end
Public Instance Methods
clean_url()
click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 46
# Strips the final "/segment" from the article URL, modifying the string
# stored in @article[:url] in place.
#
# @return [String, nil] the mutated URL string, or nil when nothing was removed
def clean_url
  last_segment = %r{/([^/]*)$}
  article_url = @article[:url]
  article_url.gsub!(last_segment, '')
end
doc()
click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 10
# Downloads the page at +url+ and parses it as HTML.
#
# The result is memoized in @doc, so repeated calls do not download the page
# again. The raw response body is kept in @raw.
#
# NOTE(review): the bare `open` call only fetches URLs when open-uri is loaded
# and the Ruby version still lets Kernel#open handle URIs; it also treats a
# string starting with "|" as a command. Confirm +url+ is trusted, and consider
# URI.open if the supported Ruby versions allow it.
#
# @return [Nokogiri::HTML::Document] the parsed page
def doc
  @doc ||= begin
    # Block form so the underlying handle is closed once the body is read.
    @raw = open(url) { |io| io.read }
    Nokogiri::HTML(@raw)
  end
end
parse()
click to toggle source
url = 'www.appledaily.com.tw/appledaily/article/headline/20130414/34951658'
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 16
# Fills @article with the title, company name, body text, reporter name and
# publication time scraped from the page, runs clean_up, and returns @article.
#
# Raises NoMethodError when the page has no "#h1" element, because the title
# lookup then returns nil.
#
# @return [Hash] the populated @article
def parse
  # Hold on to the document so the title, body and time lookups share one
  # #doc call instead of asking for the page three times.
  page = doc

  @article[:title] = page.at_css('#h1').text
  @article[:company_name] = parse_company_name
  @article[:content] = page.css('.articulum').css('p,h2').text
  @article[:reporter_name] = parse_reporter_name
  @article[:published_at] = self.class.parse_time(page.css('.gggs time').text)

  clean_up

  @article
end
parse_company_name()
click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 32
# Company name recorded on every article handled by this parser. It is a fixed
# value and is not read from the page.
#
# @return [String]
def parse_company_name
  '蘋果日報'
end
parse_reporter_name()
click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 36
# Extracts the reporter's name from the article body text.
#
# Two byline shapes are recognised, in this order:
#   1. "◎記者<name>"          -- the name runs to the end of that line
#   2. "【記者<name>/...】"    -- "記者" is optional; the name ends at the
#                                first slash, which may be "/", "／" or "╱"
#
# @return [String, nil] the reporter name, or nil when no byline is found
def parse_reporter_name
  body_text = doc.css('.articulum').css('p,h2').text.strip

  body_text[%r{◎記者(.+)$}, 1] || body_text[%r{【(?:記者)?(.+?)[/／╱]}, 1]
end