class NewsScraper::Transformers::Article
Public Class Methods
new(url:, payload:)
click to toggle source
Initialize an Article object
Params
- url: keyword arg - the URL on which scraping was done
- payload: keyword arg - the result of the scrape
# File lib/news_scraper/transformers/article.rb, line 19
# Set up a transformer for a single scraped page.
#
# *Params*
# - <code>url</code>: keyword arg - the url on which scraping was done
# - <code>payload</code>: keyword arg - the result of the scrape
#
def initialize(url:, payload:)
  @url = url
  # The host of the url is kept separately; #transform uses it as the key
  # into the configured per-domain scrape patterns.
  @root_domain = URIParser.new(url).host
  @payload = payload
end
Public Instance Methods
transform()
click to toggle source
Transform the article
Raises
- ScrapePatternNotDefined: raised if the root domain is not in article_scrape_patterns.yml
Returns
- transformed_response: the response, parsed and transformed into a hash
# File lib/news_scraper/transformers/article.rb, line 33
# Transform the scraped payload into a hash of extracted data.
#
# *Raises*
# - ScrapePatternNotDefined: when the configured scrape patterns have no
#   entry under 'domains' for this article's root domain
#
# *Returns*
# - the hash built by #transformed_response, merged with <code>:url</code>
#   and <code>:root_domain</code>
#
def transform
  domain_patterns = NewsScraper.configuration.scrape_patterns['domains']
  scrape_details = domain_patterns[@root_domain]

  unless scrape_details
    raise ScrapePatternNotDefined.new(url: @url, root_domain: @root_domain)
  end

  extracted = transformed_response(scrape_details)
  extracted.merge(url: @url, root_domain: @root_domain)
end
Private Instance Methods
parsed_data(scrape_method, scrape_pattern)
click to toggle source
# File lib/news_scraper/transformers/article.rb, line 53
# Extract one piece of data from the payload using the given scrape method.
#
# *Params*
# - <code>scrape_method</code>: one of <code>:xpath</code>, <code>:css</code>,
#   <code>:readability</code>, <code>:metainspector</code> or <code>:highscore</code>
# - <code>scrape_pattern</code>: the XPath expression, the CSS selector, or the
#   name of the MetaInspector method to call, depending on the scrape method;
#   not used by <code>:readability</code> and <code>:highscore</code>
#
# *Returns*
# - the value produced by the selected scrape method; <code>nil</code> when
#   <code>scrape_method</code> is none of the above, or when the MetaInspector
#   page does not respond to <code>scrape_pattern</code>
#
def parsed_data(scrape_method, scrape_pattern)
  case scrape_method
  when :xpath
    # "(...)[1]" restricts the result to the first node matching the pattern.
    # NOTE(review): `Nokogiri::Functions` is deliberately left without the
    # leading `::` used for `::Nokogiri::HTML` - presumably it resolves to a
    # project-local handler class; confirm before normalising the two.
    first_match = ::Nokogiri::HTML(@payload).xpath("(#{scrape_pattern})[1]", Nokogiri::Functions.new)
    Sanitize.fragment(first_match).squish
  when :css
    Sanitize.fragment(::Nokogiri::HTML(@payload).css(scrape_pattern)).squish
  when :readability
    content = Readability::Document.new(
      @payload,
      remove_empty_nodes: true,
      tags: %w[div p img a table tr th tbody td h1 h2 h3 h4 h5 h6],
      attributes: %w[src href colspan rowspan]
    ).content
    # Collapse runs of newlines into one and trim the ends before beautifying.
    HtmlBeautifier.beautify(content.squeeze("\n").strip)
  when :metainspector
    page = MetaInspector.new(@url, document: @payload)
    accessor = scrape_pattern.to_sym
    # The pattern comes from configuration, so dispatch with public_send:
    # visibility is enforced at the call itself instead of relying solely on
    # the respond_to? guard.
    page.public_send(accessor) if page.respond_to?(accessor)
  when :highscore
    NewsScraper::Transformers::Helpers::HighScoreParser.keywords(url: @url, payload: @payload)
  end
end
transformed_response(scrape_details)
click to toggle source
# File lib/news_scraper/transformers/article.rb, line 41
# Build the response hash for every configured data type.
#
# *Params*
# - <code>scrape_details</code>: the scrape patterns for one domain, keyed by
#   data type; each entry is read through its 'method' and 'pattern' keys
#
# *Returns*
# - a hash with one symbol key per entry of the configured 'data_types';
#   the value is the result of #parsed_data, or <code>nil</code> when
#   <code>scrape_details</code> has no entry for that data type
#
def transformed_response(scrape_details)
  data_types = NewsScraper.configuration.scrape_patterns['data_types']

  data_types.each_with_object({}) do |data_type, response|
    details = scrape_details[data_type]
    # Every data type gets a key, even when nothing is configured for it.
    response[data_type.to_sym] =
      if details
        parsed_data(details['method'].to_sym, details['pattern'])
      end
  end
end