class NewsScraper::Transformers::Helpers::HighScoreParser
Public Class Methods
keywords(url:, payload:)
click to toggle source
NewsScraper::Transformers::Helpers::HighScoreParser.keywords
parses out keywords
Params
- url: keyword for the url to parse to a uri
- payload: keyword for the payload from a request to the url (html body)
Returns
- keywords: the top 5 keywords from the body of text, as a comma-separated string
# File lib/news_scraper/transformers/helpers/highscore_parser.rb, line 19
#
# Parses keywords out of an HTML payload.
#
# A blacklist is built from the stop words for this page, the readable
# content is extracted from the payload with Readability (no tags and no
# attributes are kept), and the result is handed to +highscore+.
#
# *Params*
# - <code>url:</code>: keyword for the url of the page
# - <code>payload:</code>: keyword for the payload from a request to the
#   url (html body)
#
# *Returns*
# - whatever +highscore+ returns for the extracted content and blacklist
def keywords(url:, payload:)
  blacklist = Highscore::Blacklist.load(stopwords(url, payload))

  # The option key was previously misspelled as `emove_empty_nodes`, which is
  # not an option name Readability documents, so the intended setting was
  # never passed under its real name.
  document = Readability::Document.new(
    payload,
    remove_empty_nodes: true,
    tags: [],
    attributes: []
  )

  highscore(document.content, blacklist)
end
Private Class Methods
highscore(content, blacklist)
click to toggle source
# File lib/news_scraper/transformers/helpers/highscore_parser.rb, line 27
#
# Ranks the words of +content+ with Highscore and returns the best five.
#
# *Params*
# - <code>content</code>: the text to rank
# - <code>blacklist</code>: blacklist passed straight to Highscore::Content
#
# *Returns*
# - a single String holding the text of the top 5 keywords joined by commas
def highscore(content, blacklist)
  ranked = Highscore::Content.new(content, blacklist)

  # Scoring settings; the block is handed to Highscore's configure DSL.
  ranked.configure do
    set :multiplier, 2
    set :upper_case, 3
    set :long_words, 2
    set :long_words_threshold, 15
    set :ignore_case, true
  end

  best = ranked.keywords.top(5)
  best.map { |keyword| keyword.text }.join(',')
end
stopwords(url, payload)
click to toggle source
# File lib/news_scraper/transformers/helpers/highscore_parser.rb, line 39
#
# Builds the list of stop words to use for a page.
#
# *Params*
# - <code>url</code>: url handed to MetaInspector
# - <code>payload</code>: document body handed to MetaInspector
#
# *Returns*
# - the configured stop words, extended with the lower-cased words of the
#   page's <code>og:site_name</code> meta value when one is present
def stopwords(url, payload)
  inspected = MetaInspector.new(url, document: payload)
  configured = NewsScraper.configuration.stopwords
  site_name = inspected.meta['og:site_name']

  # Without a site name there is nothing to add.
  return configured unless site_name

  # Add the site name to the stop words
  configured + site_name.downcase.split(' ')
end