class GuardianScraper

Scraper for the articles about the NSA docs in the Guardian

Public Class Methods

new(url) click to toggle source
# File lib/guardianscraper.rb, line 7
# Builds a scraper for a single article page.
#
# url - address of the article; kept in @url until the article is fetched.
def initialize(url)
  @url = url
end

Public Instance Methods

getArticle() click to toggle source

Download the article and return its text and other data as a pretty-printed JSON string

# File lib/guardianscraper.rb, line 12
# Downloads the page at @url, pulls out the article's metadata, the links
# found in its description and its body text, and returns everything as a
# pretty-printed JSON string.
#
# Returns a String of JSON with the keys headline, description, date, author,
# published_by, caption, documents (array of href values), text and plaintext.
#
# Errors raised while downloading or parsing are not rescued here.
def getArticle
  # URI.open (from open-uri) is used instead of Kernel#open: since Ruby 3.0
  # Kernel#open no longer fetches URLs, and it would run a command for input
  # starting with "|". The block form closes the handle once parsing is done.
  html = URI.open(@url) { |page| Nokogiri::HTML(page) }

  description = html.css('div[itemprop="description"]')
  body_text = html.css("div#article-body-blocks").text

  articlehash = {
    # Misc data on the article
    headline: html.css('h1[itemprop="name headline  "]').text,
    description: description.text,
    date: html.css('time[itemprop="datePublished"]').text,
    author: html.css("a.contributor").text,
    published_by: "The Guardian",
    caption: html.css("div.caption").text,
    # Documents linked to from the description
    documents: description.css("a").map { |link| link["href"] },
    # Text of the article (both keys carry the same content)
    text: body_text,
    plaintext: body_text
  }

  JSON.pretty_generate(articlehash)
end