class NewsScraper::Transformers::Helpers::HighScoreParser

Public Class Methods

keywords(url:, payload:) click to toggle source

NewsScraper::Transformers::Helpers::HighScoreParser.keywords parses out keywords

Params

  • url: keyword argument giving the URL of the page the payload was fetched from

  • payload: keyword argument giving the body (HTML) of the response from a request to the url

Returns

  • keywords: Top 5 keywords from the body of text, as a single comma-separated string

# File lib/news_scraper/transformers/helpers/highscore_parser.rb, line 19
# Parses the top keywords out of an HTML payload.
#
# *Params*
# * +url+:: URL of the page; forwarded to +stopwords+
# * +payload+:: HTML body of the response for +url+
#
# *Returns*
# * the result of +highscore+ for the readable content of the payload
def keywords(url:, payload:)
  blacklist = Highscore::Blacklist.load(stopwords(url, payload))
  # Empty tag/attribute whitelists ask Readability for content without markup;
  # remove_empty_nodes drops nodes left empty by that clean-up.
  readable = Readability::Document.new(
    payload,
    remove_empty_nodes: true,
    tags: [],
    attributes: []
  )
  highscore(readable.content, blacklist)
end

Private Class Methods

highscore(content, blacklist) click to toggle source
# File lib/news_scraper/transformers/helpers/highscore_parser.rb, line 27
# Ranks the keywords of +content+ with Highscore.
#
# *Params*
# * +content+:: text to extract keywords from
# * +blacklist+:: blacklist passed to Highscore::Content together with the content
#
# *Returns*
# * String with the texts of the top 5 keywords, joined by commas
def highscore(content, blacklist)
  scorer = Highscore::Content.new(content, blacklist)

  # Scoring weights applied through Highscore's configuration block.
  scorer.configure do
    set :multiplier, 2
    set :upper_case, 3
    set :long_words, 2
    set :long_words_threshold, 15
    set :ignore_case, true
  end

  top_keywords = scorer.keywords.top(5)
  top_keywords.map(&:text).join(',')
end
stopwords(url, payload) click to toggle source
# File lib/news_scraper/transformers/helpers/highscore_parser.rb, line 39
# Builds the stopword list used to blacklist keywords for a page.
#
# *Params*
# * +url+:: URL handed to MetaInspector
# * +payload+:: document body handed to MetaInspector as the +document+ option
#
# *Returns*
# * the configured stopwords, extended with the lowercased words of the
#   page's +og:site_name+ meta value when that value is present
def stopwords(url, payload)
  site_name = MetaInspector.new(url, document: payload).meta['og:site_name']
  base_words = NewsScraper.configuration.stopwords
  return base_words unless site_name

  # The site name tends to dominate a page, so treat its words as stopwords.
  # `+` builds a new array, leaving the configured list untouched.
  base_words + site_name.downcase.split(' ')
end