class NewsScraper::Extractors::GoogleNewsRss
Constants
- BASE_URL
Public Class Methods
new(query:)
click to toggle source
# File lib/news_scraper/extractors/google_news_rss.rb, line 10
# Builds an extractor for a single search query.
#
# query - the search term; stored as-is in @query and later interpolated
#         into the request URL by #extract.
def initialize(query:)
  @query = query
end
Public Instance Methods
extract()
click to toggle source
# File lib/news_scraper/extractors/google_news_rss.rb, line 14
# Requests the feed at BASE_URL for the stored query and turns the response
# body into a list of article URLs.
#
# The query is form-encoded before being appended as the `q` parameter, so
# spaces and reserved characters such as `&`, `#` or `+` cannot corrupt the
# request URL or inject extra parameters.
# NOTE(review): callers that already pass a percent-encoded query would now
# be double-encoded - confirm none do.
#
# Returns whatever http_request returns for the given block (presumably the
# block's value, i.e. the array of article URLs - verify against
# http_request, which is defined outside this block).
def extract
  # Idempotent; guarantees URI is loaded regardless of require order.
  require 'uri'

  encoded_query = URI.encode_www_form_component(@query)
  http_request "#{BASE_URL}&q=#{encoded_query}" do |response|
    google_urls = google_urls_from_resp(response.body)
    extract_article_urls(google_urls)
  end
end
Private Instance Methods
extract_article_urls(google_urls)
click to toggle source
# File lib/news_scraper/extractors/google_news_rss.rb, line 33
# Pulls the target article URL out of each Google redirect link.
#
# google_urls - an array of link strings; nil entries (e.g. from anchors
#               that had no href attribute) are skipped instead of raising
#               NoMethodError.
#
# A link contributes a result only if it contains "&url=" followed by an
# http:// or https:// address; everything after "&url=" up to the end of the
# line is taken as the article URL, so any "&" inside it is preserved.
#
# Returns an array of unique article URL strings in first-seen order.
def extract_article_urls(google_urls)
  article_pattern = %r{&url=(?<url>https?://.*)}

  google_urls
    .compact
    .map { |google_url| google_url[article_pattern, 'url'] }
    .compact
    .uniq
end
google_urls_from_resp(body)
click to toggle source
# File lib/news_scraper/extractors/google_news_rss.rb, line 23
# Collects the href of every <a> element found in the item descriptions of
# an RSS feed.
#
# body - the raw feed document, handed to RSS::Parser.parse.
#
# Each item's description is parsed as HTML and all of its anchors are
# visited. Returns one flat array with an entry per anchor; an anchor
# without an href attribute contributes nil.
def google_urls_from_resp(body)
  feed = RSS::Parser.parse(body)

  feed.items.flat_map do |item|
    anchors = Nokogiri::HTML(item.description).xpath('//a')
    anchors.map { |anchor| anchor['href'] }
  end
end