class ACLUScraper
Public Class Methods
new(url)
click to toggle source
# File lib/acluscraper.rb, line 7
# Creates a scraper for a single case page.
#
# url - address of the page that scrapeCase opens and parses.
#
# The scraper starts with an empty @casearray; scrapeCase appends one Hash
# per successfully processed document to it.
def initialize(url)
  @url = url
  @casearray = []
end
Public Instance Methods
scrapeCase()
click to toggle source
Get all the case documents
# File lib/acluscraper.rb, line 13
# Scrapes the case page at @url and collects every linked document.
#
# For each table row that contains a link with an href this records:
#   :date  - text of the row's first cell; a cell holding only a
#            non-breaking space (or a row without cells) inherits the date
#            of the previous dated row
#   :title - text of the row's link(s)
#   :url   - the link target, prefixed with https://www.aclu.org when it is
#            not already an absolute http(s) URL
#   :path  - where wget stores the download under public/uploads
# and then merges in whatever UploadConvert#extractMetadataPDF returns plus
# :text from UploadConvert#detectPDFType.
#
# Rows whose extraction raises a StandardError are skipped with a warning on
# stderr; rows whose link has no href are skipped silently.
#
# Returns @casearray rendered as a pretty-printed JSON String.
# Raises Errno::ENOENT when the wget executable cannot be started.
def scrapeCase
  # URI.open is the explicit open-uri entry point. Kernel#open would execute
  # a command for a string starting with "|" and no longer opens URLs on
  # Ruby 3.0+.
  html = Nokogiri::HTML(URI.open(@url))
  prevdate = ""

  html.css("tbody").each do |t|
    t.css("tr").each do |r|
      links = r.css("a")
      next if links.empty?

      # Surrounding whitespace used to be dropped by the shell; strip it
      # explicitly now that no shell is involved. Anchors without an href
      # have nothing to download.
      href = links[0]["href"].to_s.strip
      next if href.empty?

      dochash = {}

      # Get date for filing
      first_cell = r.css("td")[0]
      celltext = first_cell ? first_cell.text.to_s : "\u00a0"
      prevdate = celltext unless celltext == "\u00a0"
      dochash[:date] = prevdate

      dochash[:title] = links.text

      # Get URL: only a leading scheme makes the link absolute.
      absolute = href =~ %r{\Ahttps?://}i
      dochash[:url] = absolute ? href : "https://www.aclu.org" + href

      # Download documents. The argument-list form of system bypasses the
      # shell, so characters in the scraped URL are never interpreted as
      # shell syntax. stdout is discarded, as it was with backticks.
      started = system("wget", "-P", "public/uploads", dochash[:url], out: File::NULL)
      # system returns nil when the command could not be executed at all;
      # backticks raised in that situation, so keep raising.
      raise Errno::ENOENT, "wget" if started.nil?

      dochash[:path] = "public/uploads/" + dochash[:url].split("/").last.to_s.strip

      # Extract metadata and text
      begin
        u = UploadConvert.new(dochash[:path])
        u.extractMetadataPDF.each { |k, v| dochash[k] = v }
        dochash[:text] = u.detectPDFType
        @casearray.push(dochash)
      rescue StandardError => e
        # Best effort: one unreadable document must not abort the scrape,
        # but the failure should be visible.
        warn "ACLUScraper: skipping #{dochash[:url]}: #{e.class}: #{e.message}"
      end
    end
  end

  JSON.pretty_generate(@casearray)
end