class GuardianScraper
Scraper for The Guardian's articles about the NSA documents.
Public Class Methods
new(url)
click to toggle source
# File lib/guardianscraper.rb, line 7
#
# Stores the address of the article to scrape.
#
# @param url [String] address of the article; it is only stored here and is
#   not validated or fetched until #getArticle is called.
def initialize(url)
  @url = url
end
Public Instance Methods
getArticle()
click to toggle source
Downloads the article and returns its text and other metadata as a pretty-printed JSON string.
# File lib/guardianscraper.rb, line 12
#
# Downloads the page at @url, parses it with Nokogiri and collects the
# article's metadata, linked documents and body text.
#
# NOTE(review): Nokogiri, JSON and open-uri are assumed to be required at the
# top of the file, which is not visible here.
#
# @return [String] pretty-printed JSON object with the keys headline,
#   description, date, author, published_by, caption, documents (array of
#   href values from links inside the description block), text and plaintext.
def getArticle
  # URI.open replaces the bare `open(@url)`: open-uri's patching of
  # Kernel#open was deprecated in Ruby 2.7 and removed in 3.0. The block form
  # also closes the downloaded IO as soon as parsing is done.
  html = URI.open(@url) { |io| Nokogiri::HTML(io) }

  # Both of these nodes feed two fields each, so query them once.
  description = html.css('div[itemprop="description"]')
  body_text = html.css("div#article-body-blocks").text

  articlehash = {
    # Misc data on the article
    headline: html.css('h1[itemprop="name headline "]').text,
    description: description.text,
    date: html.css('time[itemprop="datePublished"]').text,
    author: html.css("a.contributor").text,
    published_by: "The Guardian",
    caption: html.css("div.caption").text,
    # List of documents linked to from the description
    documents: description.css("a").map { |link| link["href"] },
    # Text of the article; both keys carry the same value, as before
    text: body_text,
    plaintext: body_text
  }

  JSON.pretty_generate(articlehash)
end