class TaiwaneseNewsParser::Parser::AppleDaily

Public Class Methods

domain() click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 2
# Host name of the Apple Daily (Taiwan) website.
#
# @return [String] the literal host 'appledaily.com.tw'
def self.domain
  host = 'appledaily.com.tw'
  host
end
names() click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 6
# Publisher names associated with this parser.
#
# @return [Array<String>] a new one-element array holding '蘋果日報'
#   (Apple Daily)
def self.names
  ['蘋果日報']
end
parse_time(raw_time) click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 55
# Parses a publication timestamp written in one of two layouts:
# date with time ('%Y年%m月%d日%H:%M') or date only ('%Y年%m月%d日').
# The layouts are tried in that order and the first one that parses wins.
#
# Leading and trailing whitespace is ignored, because #parse passes in the
# raw text of an HTML node. Non-string input (including nil) is converted
# with #to_s and therefore simply fails to parse.
#
# @param raw_time [String, nil] timestamp text, e.g. '2013年04月14日06:00'
# @return [DateTime, nil] the parsed value, or nil when no layout matches
def self.parse_time(raw_time)
  valid_formats = ['%Y年%m月%d日%H:%M', '%Y年%m月%d日']

  valid_formats.each do |format|
    begin
      return DateTime.strptime(raw_time.to_s.strip, format)
    rescue ArgumentError
      # Raised by strptime (and by strip on malformed byte sequences) when
      # the text does not fit this layout; fall through to the next one.
      next
    end
  end

  nil
end
parse_url_id(url) click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 50
# Extracts the numeric id path from an Apple Daily article URL.
#
# The id is the run of all-digit path segments that follows
# '/<section>/article/<category>/'. A segment only counts when it is
# followed by '/', so a final segment with no trailing slash is not
# included.
#
# @param url [String] article URL using http or https on
#   www.appledaily.com.tw
# @return [String, nil] the digit segments joined by '/', for example
#   '20130414/34951658', or nil when the URL does not have the expected
#   shape
def self.parse_url_id(url)
  id_path = url[%r{https?://www\.appledaily\.com\.tw/\w+/article/\w+/((?:\d+/)+)}, 1]

  # The capture always ends with '/', which is not part of the id.
  id_path && id_path.chomp('/')
end

Public Instance Methods

clean_url() click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 46
# Removes the last path segment (the final '/' and everything after it)
# from the article URL. The string stored under @article[:url] is modified
# in place.
#
# @return [String, nil] the modified URL string, or nil when nothing was
#   removed (String#gsub! semantics)
def clean_url
  last_segment = %r{/([^/]*)$}
  article_url = @article[:url]
  article_url.gsub!(last_segment, '')
end
doc() click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 10
# Returns the article page parsed by Nokogiri, downloading it on first use.
#
# The result is memoised in @doc. #parse and #parse_reporter_name call this
# method several times for one article; without the cache each call would
# download and re-parse the same page. The raw response body is kept in
# @raw.
#
# NOTE(review): `url` is not defined in this section; presumably an
# accessor for the article URL. Confirm it cannot change after the first
# call.
# NOTE(review): Kernel#open only handles URLs when open-uri is loaded (and
# no longer does on Ruby 3.0+), and it treats a string starting with '|' as
# a command. Confirm `url` is trusted, or move to URI.open.
#
# @return [Nokogiri::HTML::Document] the parsed page
def doc
  return @doc if @doc

  # Block form so the underlying IO/Tempfile is closed once it is read.
  @raw = open(url) { |io| io.read }
  @doc = Nokogiri::HTML(@raw)
end
parse() click to toggle source

url = 'www.appledaily.com.tw/appledaily/article/headline/20130414/34951658'

# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 16
# Fills @article from the downloaded page and returns it.
#
# Fields written, in order:
#   :title         - text of the element matching '#h1'
#   :company_name  - result of #parse_company_name
#   :content       - text of the 'p,h2' nodes inside '.articulum'
#   :reporter_name - result of #parse_reporter_name
#   :published_at  - '.gggs time' text run through .parse_time
#
# NOTE(review): clean_up is not defined in this section; presumably
# provided by a superclass or mixin. Confirm.
#
# @return [Object] @article after all fields are assigned
def parse
  headline = doc.at_css('#h1')
  @article[:title] = headline.text

  @article[:company_name] = parse_company_name

  body_nodes = doc.css('.articulum').css('p,h2')
  @article[:content] = body_nodes.text

  @article[:reporter_name] = parse_reporter_name

  timestamp = doc.css('.gggs time').text
  @article[:published_at] = self.class.parse_time(timestamp)

  clean_up

  @article
end
parse_company_name() click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 32
# Publisher name recorded for every article handled by this parser. The
# value is fixed and does not depend on the page.
#
# @return [String] '蘋果日報' (Apple Daily)
def parse_company_name
  company = '蘋果日報'
  company
end
parse_reporter_name() click to toggle source
# File lib/taiwanese_news_parser/parser/apple_daily.rb, line 36
# Extracts the reporter's name from the article body text (the 'p,h2'
# nodes inside '.articulum', with surrounding whitespace stripped).
#
# Two byline shapes are recognised, checked in this order:
#   1. '◎記者<name>'  - the name is the rest of that line
#   2. '【記者<name>/' or '【<name>╱' - the name runs up to the first slash
#
# @return [String, nil] the captured name, or nil when neither shape occurs
def parse_reporter_name
  body = doc.css('.articulum').css('p,h2').text.strip

  signed = body.match(%r{◎記者(.+)$})
  return signed[1] if signed

  bracketed = body.match(%r{【(?:記者)?(.+?)[//╱]})
  bracketed ? bracketed[1] : nil
end