class TaiwaneseNewsParser::Parser::Cts

Public Class Methods

applicable?(url)
# File lib/taiwanese_news_parser/parser/cts.rb, line 10
# Reports whether +url+ points at the CTS site.
#
# The host part must be +cts.com.tw+ itself or one of its subdomains:
# the match has to start at the beginning of the string or directly
# after a "/" or ".", so look-alike hosts such as "facts.com.tw" are
# rejected. URLs without a scheme are accepted.
#
# @param url [String] the URL to test
# @return [MatchData, nil] a truthy MatchData when the URL is on
#   cts.com.tw, nil otherwise
def self.applicable?(url)
  url.match(%r{(?:\A|[/.])cts\.com\.tw/})
end
domain()
# File lib/taiwanese_news_parser/parser/cts.rb, line 2
# Domain name associated with this parser.
#
# @return [String] always "cts.com.tw"
def self.domain
  'cts.com.tw'
end
names()
# File lib/taiwanese_news_parser/parser/cts.rb, line 6
# Names associated with this parser.
#
# @return [Array<String>] a one-element list containing "華視"
def self.names
  %w[華視]
end
parse_url_id(url)
# File lib/taiwanese_news_parser/parser/cts.rb, line 49
# Extracts the numeric article id from a CTS article URL.
#
# The id is the run of digits that forms the file name in a path shaped
# like /cts/<section>/<digits>/<id>.html.
#
# @param url [String] the article URL
# @return [String, nil] the id digits, or nil when the URL does not
#   contain such a path
def self.parse_url_id(url)
  id_match = url.match(%r{/cts/.+/\d+/(\d+)\.html})
  id_match && id_match[1]
end

Public Instance Methods

doc()
# File lib/taiwanese_news_parser/parser/cts.rb, line 14
# Downloads the article page and returns it parsed as an HTML document.
#
# The page is fetched and parsed only on the first call; later calls
# return the same document instead of downloading it again. The
# unparsed body is kept in @raw.
#
# NOTE(review): this relies on Kernel#open accepting URLs, which
# presumably comes from open-uri being loaded elsewhere. Newer Ruby
# versions require URI.open for that, and Kernel#open on an untrusted
# string can spawn a subprocess - confirm where +url+ comes from.
#
# @return [Object] the result of Nokogiri::HTML for the page body
def doc
  @doc ||= begin
    # Block form so the handle is closed as soon as the body is read.
    @raw = open(url) { |page| page.read }
    Nokogiri::HTML(@raw)
  end
end
parse()

Example: url = 'news.cts.com.tw/cts/politics/201403/201403191393958.html'

# File lib/taiwanese_news_parser/parser/cts.rb, line 20
# Fills @article from the article page and returns it.
#
# Sets :title, :company_name, :content, :published_at and
# :reporter_name, then runs clean_up.
#
# NOTE(review): the timestamp text carries no zone, so Time.parse
# presumably interprets it in the process's local time zone - confirm
# that is intended.
#
# @return [Hash] the populated @article
# @raise [ArgumentError] from Time.parse when the info line does not
#   start with a "YYYY/M/D HH:MM" timestamp
def parse
  # Ask for the document once; each call to doc may fetch the page again.
  page = doc

  @article[:title] = page.at_css('table h1').text
  @article[:company_name] = parse_company_name
  @article[:content] = page.css('#ctscontent p').text

  info = page.at_css('td.style14 span.info').text
  time = info[%r{^\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
  # The page gives minutes only; append seconds before parsing.
  @article[:published_at] = Time.parse("#{time}:00")

  @article[:reporter_name] = parse_reporter_name

  clean_up

  @article
end
parse_company_name()
# File lib/taiwanese_news_parser/parser/cts.rb, line 45
# Reads the company name from the page.
#
# The value is the alt attribute of the first image matched by the
# selector below (an image inside a link in a right-aligned div of a
# nested table).
#
# @return [String, nil] the alt attribute of that image
def parse_company_name
  logo = doc.at_css('table table div[align="right"] a img')
  logo.attr(:alt)
end
parse_reporter_name()
# File lib/taiwanese_news_parser/parser/cts.rb, line 35
# Extracts the reporter name from the article's info line.
#
# The leading "YYYY/M/D HH:MM" timestamp and anything from "地區" to the
# end of the line are removed first. What precedes " 報導" in the
# remainder is returned.
#
# @return [String, nil] the text before " 報導"; nil when the line
#   contains "綜合報導" or has no " 報導" marker
def parse_reporter_name
  byline = doc.at_css('td.style14 span.info').text
  byline = byline.gsub(%r{^\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}, '')
  byline = byline.gsub(%r{地區.+$}, '')
  return nil if byline.include?('綜合報導')

  byline[%r{(.+) 報導}, 1]
end