class TaiwaneseNewsParser::Parser::LibertyTimesNews

Public Class Methods

applicable?(url) click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 10
# Tells whether this parser handles the given article URL.
#
# @param url [String] the article URL to test
# @return [Boolean] true when the URL contains the Liberty Times news host
def self.applicable?(url)
  news_host = 'news.ltn.com.tw'
  url.include?(news_host)
end
domain() click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 2
# Domain name associated with this parser.
#
# @return [String] always 'ltn.com.tw'
def self.domain
  'ltn.com.tw'
end
names() click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 6
# Publication name recognised by this parser.
#
# NOTE(review): despite the plural method name this returns a single String,
# not an Array (the original literal was %{...}, not %w{...}). Confirm what
# callers expect before changing the return type.
#
# @return [String] '自由時報'
def self.names
  '自由時報'
end
parse_url_id(url) click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 62
# Extracts the numeric article id from a URL.
#
# The id is the first run of digits that follows three slash-separated
# word segments (e.g. ".../news/politics/breakingnews/998126").
#
# @param url [String] the article URL
# @return [String, nil] the id digits, or nil when the URL has no such path
def self.parse_url_id(url)
  found = url.match(%r{\w+/\w+/\w+/(\d+)})
  found ? found[1] : nil
end

Public Instance Methods

clean_url() click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 57
# Replaces @article[:url] with the result of passing it through
# TaiwaneseNewsParser::UrlCleaner#clean.
#
# @return the cleaned URL that was stored
def clean_url
  @article[:url] = TaiwaneseNewsParser::UrlCleaner.new.clean(@article[:url])
end
doc() click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 14
# Fetches the article page and returns it parsed with Nokogiri::HTML.
#
# The parsed document is memoized in @doc, so repeated calls reuse the first
# result instead of fetching and parsing the page again. The raw body read
# from the page is kept in @raw.
#
# NOTE(review): a bare `open` only fetches URLs when open-uri has patched
# Kernel#open, which was removed in Ruby 3.0 — confirm the target Ruby
# version; URI.open is the replacement there.
#
# @return the document built by Nokogiri::HTML
def doc
  @doc ||= begin
    # Block form so the handle returned by open is closed once it is read.
    @raw = open(url) { |io| io.read }
    Nokogiri::HTML(@raw)
  end
end
parse() click to toggle source

url = 'news.ltn.com.tw/news/politics/breakingnews/998126'

# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 20
# Extracts the article fields from the page and returns the populated
# @article hash.
#
# Sets :title, :company_name, :content and :reporter_name. :published_at is
# set only when a <script> tag contains a `newsTime = '<digits>';` assignment;
# the digits are passed to Time.at. Finishes by calling clean_up.
#
# Raises NoMethodError when the page has no `.content h1` element.
#
# @return [Hash] the @article hash
def parse
  # Resolve the document once and reuse it, rather than calling doc for
  # every query below.
  page = doc

  # new layout uses utf-8
  @article[:title] = page.at_css('.content h1').text
  @article[:company_name] = parse_company_name
  @article[:content] = page.css('#newstext p').text

  # The publication time is embedded in inline JavaScript; the first script
  # that declares it wins.
  page.css('script').each do |script|
    match = script.content.match(%r{newsTime\s=\s'(\d+)';$})
    next unless match

    @article[:published_at] = Time.at(match.captures[0].to_i)
    break
  end

  # Must run after :content is set, because the reporter name is read from it.
  @article[:reporter_name] = parse_reporter_name

  clean_up

  @article
end
parse_company_name() click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 53
# Company name recorded for articles handled by this parser.
#
# @return [String] always '自由時報'
def parse_company_name
  '自由時報'
end
parse_reporter_name() click to toggle source
# File lib/taiwanese_news_parser/parser/liberty_times_news.rb, line 42
def parse_reporter_name
  if match = @article[:content].match(%r{〔(.*?)[//╱](.*?)〕})
    reporter_name = match[1][%r{記者(.+)},1]
  elsif match = @article[:content].match(%r{記者(.+?)[//╱]})
    reporter_name = match[1]
  elsif match = @article[:content].match(%r{(文/(.*?))})
    reporter_name = match[1]
  end
  reporter_name
end