class UploadConvert
Public Class Methods
new(input)
click to toggle source
# File lib/uploadconvert.rb, line 8
# Sets up a converter for a single document.
#
# @param input [String] the document to convert; stored unchanged in @input
#   and later inspected by #handleDoc
def initialize(input)
  # Two separate empty strings: one for the generated output, one for the
  # extracted text.
  @output, @text = "", ""
  @input = input
end
Public Instance Methods
cleanPDF(text)
click to toggle source
Removes the line numbers (1-28) printed along the edges of legal documents
# File lib/uploadconvert.rb, line 99
# Removes numbers from edges of legal documents.
#
# Every line of +text+ that consists solely of an integer from 1 to 28 is
# dropped; all other lines are kept verbatim. CRLF line endings are
# normalised to LF first. A number on a final line that has no trailing
# newline is dropped as well.
#
# +text+ itself is never modified, so frozen strings are accepted.
#
# The cleaned text is appended to @text (repeated calls on the same instance
# accumulate) and the accumulated value is returned.
#
# @param text [String] raw text extracted from a PDF
# @return [String] the accumulated, cleaned text held in @text
def cleanPDF(text)
  margin_numbers = (1..28).map(&:to_s)

  kept_lines = text.gsub(/\r?\n/, "\n").each_line.reject do |line|
    # Strip only the LF terminator so that "7\n" and an unterminated "7"
    # are both recognised, while e.g. "7 \n" is left alone.
    margin_numbers.include?(line.sub(/\n\z/, ""))
  end

  # Append once instead of re-allocating @text for every kept line.
  @text = (@text || "") + kept_lines.join
end
detectPDFType()
click to toggle source
Use the PDF's embedded fonts (as listed by pdffonts) to decide whether to extract embedded text or fall back to OCR
# File lib/uploadconvert.rb, line 49
# Use embedded fonts to detect the type of PDF.
#
# Runs the external +pdffonts+ tool on @input and counts the lines it prints.
# More than four lines selects #embedPDF; anything else selects #ocrPDF.
# NOTE(review): the threshold of 4 presumably accounts for the pdffonts table
# header plus a few font rows -- confirm against real documents.
#
# The tool is started from an argument vector rather than a shell string, so
# spaces or shell metacharacters in @input cannot change the command.
#
# @return [Object] whatever the selected extraction method returns
def detectPDFType
  font_report = IO.popen(["pdffonts", @input], &:read).split("\n")
  font_report.length > 4 ? embedPDF : ocrPDF
end
embedPDF()
click to toggle source
Extract text from embedded text PDFs
# File lib/uploadconvert.rb, line 59
# Extract text from embedded text PDFs.
#
# Has Docsplit write the PDF's text into public/uploads, reads the resulting
# .txt file, deletes it again and passes the text through #cleanPDF.
#
# The .txt file is looked up by the PDF's base name inside public/uploads.
# NOTE(review): this relies on Docsplit naming its output
# "<output dir>/<pdf base name>.txt" -- confirm against the Docsplit version
# in use.
#
# Extraction stays best-effort: any StandardError is reported on stderr and
# nil is returned instead of raising.
#
# @return [String, nil] the cleaned text, or nil when extraction failed
def embedPDF
  text_path = File.join("public/uploads", "#{File.basename(@input, File.extname(@input))}.txt")

  begin
    Docsplit.extract_text(@input, :ocr => false, :output => "public/uploads")
    text = File.read(text_path)
  ensure
    # Remove the intermediate file even when reading it failed.
    File.delete(text_path) if File.exist?(text_path)
  end

  cleanPDF(text)
rescue StandardError => e
  warn "embedPDF: could not extract text from #{@input}: #{e.message}"
  nil
end
extractMetadataPDF()
click to toggle source
Extract PDF metadata
# File lib/uploadconvert.rb, line 118
# Extract PDF metadata.
#
# Calls the matching Docsplit.extract_<field> method on @input for each
# metadata field, in the order listed below, and collects the results in
# @metadata keyed by field name.
#
# @return [Hash] the freshly populated @metadata hash
def extractMetadataPDF
  fields = [:author, :creator, :producer, :title, :subject, :date, :keywords, :length]

  @metadata = {}
  fields.each do |field|
    @metadata[field] = Docsplit.public_send("extract_#{field}", @input)
  end
  @metadata
end
handleDoc()
click to toggle source
Sends the document to the appropriate method
# File lib/uploadconvert.rb, line 15
# Sends the document to the appropriate method.
#
# * If @input is an http:// or https:// URL, it is downloaded with +wget+
#   and @input is replaced by the last path segment of the URL, i.e. the
#   name of the downloaded file.
# * An @input containing ".pdf" is then converted with #pdfTojson.
# * An @input containing ".xml" is read and converted with #xmlTojson.
#
# Only a real URL scheme triggers a download. A local file whose name merely
# contains "http" (e.g. "http_notes.pdf") is processed directly instead of
# being re-dispatched without end.
#
# wget is started from an argument vector rather than a shell string, so the
# URL cannot inject shell commands.
#
# @return [String, nil] the JSON produced by the selected converter, or nil
#   when @input is neither a PDF nor an XML document
def handleDoc
  url = @input.strip
  if url.start_with?("http://", "https://")
    # Output is captured and discarded, as with the former backtick call.
    IO.popen(["wget", url], &:read)
    @input = url.split("/").last
  end

  if @input.include?(".pdf")
    pdfTojson
  elsif @input.include?(".xml")
    xmlTojson(File.read(@input))
  end
end
ocrPDF()
click to toggle source
OCR scanned PDFs and return the cleaned text
# File lib/uploadconvert.rb, line 79
# OCR a PDF and return its cleaned text.
#
# Renders the pages to PNG images with Docsplit, has Docsplit OCR the PDF
# into the "text" directory, reads the resulting .txt file and passes its
# contents through #cleanPDF.
#
# Temporary artefacts are removed even when a step fails:
# * the .txt file,
# * the "text" directory, but only if it is empty afterwards,
# * only those PNG files that appeared in the working directory during this
#   call; PNGs that were already present are left untouched.
#
# The .txt file is looked up by the PDF's base name.
# NOTE(review): this relies on Docsplit naming its output
# "text/<pdf base name>.txt" -- confirm against the Docsplit version in use.
# NOTE(review): extract_text is given the PDF itself, not the PNGs, so the
# extract_images step looks redundant; it is kept to preserve behaviour.
#
# @return [Object] the result of #cleanPDF for the recognised text
def ocrPDF
  text_path = File.join("text", "#{File.basename(@input, File.extname(@input))}.txt")
  existing_images = Dir["*.png"]

  begin
    # Extract individual pages
    Docsplit.extract_images(@input)
    # OCR
    Docsplit.extract_text(@input, :ocr => true, :output => "text")
    text = File.read(text_path)
  ensure
    File.delete(text_path) if File.exist?(text_path)
    if Dir.exist?("text") && (Dir.entries("text") - %w[. ..]).empty?
      Dir.delete("text")
    end
    (Dir["*.png"] - existing_images).each { |image| File.delete(image) }
  end

  cleanPDF(text)
end
pdfTojson()
click to toggle source
Convert PDFs to JSON
# File lib/uploadconvert.rb, line 35
# Convert PDFs to JSON.
#
# Stores the text returned by #detectPDFType in @text, refreshes @metadata
# via #extractMetadataPDF, and serialises the metadata fields followed by
# :text and :input as pretty-printed JSON.
#
# @return [String] the generated JSON, which is also stored in @output
def pdfTojson
  # Extract and clean text
  @text = detectPDFType

  # Extract metadata and generate output
  extractMetadataPDF
  document = @metadata.merge(:text => @text, :input => @input)
  @output = JSON.pretty_generate(document)
end
xmlTojson(xmlin)
click to toggle source
Convert an XML string to pretty-printed JSON
# File lib/uploadconvert.rb, line 29
# Convert XML to JSON.
#
# @param xmlin [String] XML source text, handed to Crack::XML.parse
# @return [String] the parsed structure rendered as pretty-printed JSON
def xmlTojson(xmlin)
  JSON.pretty_generate(Crack::XML.parse(xmlin))
end