class Lobbyliste::Downloader

This class finds the Lobbyliste PDF on the Bundestag website, downloads it, and extracts the PDF content.

Public Class Methods

new(pdf_link=nil) click to toggle source

Creates a new Downloader. @param [String] pdf_link link that will be used to fetch the Lobbyliste PDF; defaults to nil

# File lib/lobbyliste/downloader.rb, line 11
# Creates a new Downloader.
#
# @param pdf_link [String, nil] link that will be used to fetch the
#   lobbyliste pdf; stored as-is in @pdf_link (nil when omitted)
def initialize(pdf_link = nil)
  @pdf_link = pdf_link
end

Public Instance Methods

html_data() click to toggle source

@return [String] extracted content of pdf file in html format

# File lib/lobbyliste/downloader.rb, line 29
# Returns the cached HTML extraction, running extract_pdf first when
# nothing has been cached in @html_data yet.
#
# @return [String] extracted content of pdf file in html format
def html_data
  return @html_data if @html_data

  extract_pdf
  @html_data
end
pdf_data() click to toggle source

@return [String] raw content of pdf file

# File lib/lobbyliste/downloader.rb, line 16
# Returns the cached PDF bytes, running retrieve_pdf first when nothing
# has been cached in @pdf_data yet.
#
# @return [String] raw content of pdf file
def pdf_data
  return @pdf_data if @pdf_data

  retrieve_pdf
  @pdf_data
end
text_data() click to toggle source

@return [String] extracted content of pdf file

# File lib/lobbyliste/downloader.rb, line 23
# Returns the cached plain-text extraction, running extract_pdf first when
# nothing has been cached in @text_data yet.
#
# @return [String] extracted content of pdf file
def text_data
  return @text_data if @text_data

  extract_pdf
  @text_data
end

Private Instance Methods

extract_pdf() click to toggle source
# File lib/lobbyliste/downloader.rb, line 70
# Writes the PDF content to a temporary file and runs both extraction
# passes on it, storing the results in @text_data (plain pass) and
# @html_data (pass with html=true).
#
# The temporary file is closed and deleted whether or not extraction
# succeeds. Errors raised by pdf_data or run_extraction propagate.
def extract_pdf
  # Created outside the begin/ensure so that a failure in Tempfile.new is
  # reported as-is instead of being masked by a NoMethodError on nil.
  pdf_file = Tempfile.new(["lobbyliste", ".pdf"])
  begin
    # PDF content is binary: disable newline/encoding translation on write.
    pdf_file.binmode
    pdf_file.write(pdf_data)
    # run_extraction is handed the file and reads it by path, so every
    # buffered byte has to be on disk before it runs.
    pdf_file.flush
    pdf_file.rewind

    @text_data = run_extraction(pdf_file)
    @html_data = run_extraction(pdf_file, true)
  ensure
    pdf_file.close
    pdf_file.unlink
  end
end
http_version_of_url(url) click to toggle source
# File lib/lobbyliste/downloader.rb, line 54
# Returns a copy of +url+ with a leading https:// scheme downgraded to
# http://.
#
# Only the scheme at the very start of the string is rewritten, so an
# https:// that appears later (e.g. inside a query parameter) is left
# untouched. The scheme match is case-insensitive. Strings that do not
# start with https:// are returned unchanged.
#
# @param url [String] the URL to convert
# @return [String] the URL with an http:// scheme
def http_version_of_url(url)
  url.sub(%r{\Ahttps://}i, 'http://')
end
jar_path() click to toggle source
# File lib/lobbyliste/downloader.rb, line 92
# Location of the pdfbox.jar, expressed relative to the directory of
# this source file (two levels up, then ext/). The ".." segments are
# left in the returned string rather than being normalised away.
#
# @return [String] path to ext/pdfbox.jar
def jar_path
  source_dir = File.dirname(File.expand_path(__FILE__))
  File.join(source_dir, '../../ext/pdfbox.jar')
end
retrieve_pdf() click to toggle source
# File lib/lobbyliste/downloader.rb, line 59
# Downloads the PDF located at pdf_link and stores its content in
# @pdf_data.
#
# If the first attempt raises a RuntimeError, the download is retried once
# against the http:// version of the link (see http_version_of_url).
# NOTE(review): presumably this targets open-uri's "redirection forbidden"
# RuntimeError on https -> http redirects — confirm.
#
# URI.open (provided by open-uri) is used instead of Kernel#open:
# Kernel#open treats a string starting with "|" as a command to spawn, and
# it no longer opens URLs on Ruby 3.0 and later.
#
# @return [String] the downloaded content
def retrieve_pdf
  @pdf_data = URI.open(pdf_link) { |io| io.read }
rescue RuntimeError
  non_https_link = http_version_of_url(pdf_link)
  @pdf_data = URI.open(non_https_link) { |io| io.read }
end
run_extraction(pdf_file,html=false) click to toggle source
# File lib/lobbyliste/downloader.rb, line 82
# Runs the PDFBox ExtractText tool on +pdf_file+ and returns what it wrote
# to a temporary output file.
#
# The command is passed to system as an argument list rather than as one
# interpolated shell string, so paths containing spaces or shell
# metacharacters reach java unchanged. Standard output and standard error
# of the java process are discarded.
#
# @param pdf_file [#path] file object whose path is handed to PDFBox
# @param html [Boolean] when true, "-html" is added to the command line
# @return [String] content of the output file
# @raise [RuntimeError] when the command cannot be run or exits non-zero
def run_extraction(pdf_file, html = false)
  # Created outside the begin/ensure so that a failure in Tempfile.new is
  # reported as-is instead of being masked by a NoMethodError on nil.
  tmp_file = Tempfile.new("lobbyliste")
  begin
    # Argument order matches the original command line:
    # input file, optional -html flag, output file.
    command = ["/usr/bin/java", "-jar", jar_path, "ExtractText", pdf_file.path]
    command << "-html" if html
    command << tmp_file.path

    # system returns false on a non-zero exit and nil when the command
    # could not be executed at all; both count as failure.
    status = system(*command, out: File::NULL, err: File::NULL)
    raise "PDF extraction failed" unless status

    tmp_file.read
  ensure
    tmp_file.close
    tmp_file.unlink
  end
end