class Parser::NewsFr
Public Class Methods
new(source, options = {})
click to toggle source
Calls superclass method
# File lib/fly_parser/sources/news-fr.rb, line 3
# Creates the parser by handing both arguments, unchanged, to the
# superclass constructor; no NewsFr-specific setup happens here.
def initialize(source, options = {})
  super(source, options)
end
Public Instance Methods
parse_all()
click to toggle source
# File lib/fly_parser/sources/news-fr.rb, line 7
# Builds one article hash for every <item> element found in @source.
#
# For each item the linked page is downloaded, the article body
# (`.article-long .bd`) is stripped of scripts, footers, embedded tweets,
# modification notes and link targets, its h1-h3 headings are demoted to h4,
# and a "Source" paragraph built from @copyright is appended.
#
# Returns an Array of Hashes with the keys :title, :content and
# :poster_image. Items are left out when they have no usable http(s) link,
# when the page cannot be fetched (the error message is printed), or when the
# article has no poster image with a src attribute.
def parse_all
  items = @source.search('//item')

  # Idea kept from the original (dev only): restrict items to those whose
  # publication date is newer than a cutoff such as Time.now - 2.years.
  # Nokogiri's NodeSet has no select!/reject!, so that would need a plain select.
  # The publication date is not part of the result, so it is not read here.

  articles = items.map do |item|
    title = item.xpath('title/text()').text

    # The URL is taken from the text node right after <link>.
    # NOTE(review): presumably the feed is parsed as HTML, where <link> is a
    # void element and its text ends up as a sibling - confirm against caller.
    link_node = item.xpath('link/following-sibling::text()[1]').first
    next if link_node.nil?

    url = link_node.text.strip
    # The link comes from the feed, so only plain web URLs are opened; this
    # keeps values such as "|command" or local paths away from Kernel#open.
    next unless url =~ %r{\Ahttps?://}i

    begin
      read_page = proc { |io| Nokogiri::HTML(io) }
      # Block form closes the handle once the document is parsed. URI.open is
      # used when open-uri provides it; otherwise fall back to the original call.
      page = URI.respond_to?(:open) ? URI.open(url, &read_page) : open(url, &read_page)
    rescue StandardError => e
      # StandardError rather than Exception, so interrupts and exits still propagate.
      puts e.message
      next
    end

    # Guard on the same selector the value is read from, and on the attribute itself.
    poster = page.search('.article-long figure.img img').first
    poster_image = poster && poster['src']
    next if poster_image.nil?

    full_desc = page.search('.article-long .bd')
    full_desc.search('.modification').remove
    full_desc.search('script').remove
    full_desc.search('.ft').remove
    full_desc.search('a').remove_attr('href')
    full_desc.search('.twitter-tweet').remove

    # Demote headings by rewriting tag names only, so text or attribute values
    # that merely contain "h1"/"h2"/"h3" are left untouched.
    desc = full_desc.inner_html.gsub(%r{(</?)h[1-3]\b}, '\1h4')

    copyright = "<p>Source: <a href='#{@copyright[:url]}'>#{@copyright[:title]}</a></p>"
    content = desc + copyright

    { title: title, content: content, poster_image: poster_image }
  end

  # Skipped items produced nil above.
  articles.compact
end