class UploadConvert

Public Class Methods

new(input) click to toggle source
# File lib/uploadconvert.rb, line 8
# Sets up a converter for a single document.
#
# input - stored unchanged in @input; the other methods of this class treat it
#         as a String (they call include?/split on it and pass it to tools).
def initialize(input)
  @input = input
  # Two distinct empty strings: the generated output and the extracted text.
  @output, @text = "", ""
end

Public Instance Methods

cleanPDF(text) click to toggle source

Removes lines that contain only a number from 1 to 28 (the line numbers printed in the margins of legal documents)

# File lib/uploadconvert.rb, line 99
# Strips margin line numbers from extracted PDF text.
#
# Line endings are normalised to "\n", then every line whose entire content is
# an integer from 1 to 28 is dropped, including a final line that has no
# trailing newline. (28 presumably corresponds to the numbered lines of legal
# pleading paper -- TODO confirm.)
#
# text - String of extracted text. It is not modified, so frozen strings are
#        accepted.
#
# Replaces @text with the cleaned text (earlier contents are discarded, so
# repeated conversions on one object do not pile up) and returns it.
def cleanPDF(text)
  margin_numbers = (1..28).map { |n| n.to_s }

  kept_lines = text.gsub(/\r?\n/, "\n").each_line.reject do |line|
    margin_numbers.include?(line.chomp("\n"))
  end

  # Join once instead of growing a string with += for every line.
  @text = kept_lines.join
end
detectPDFType() click to toggle source

Use embedded fonts to detect the type of PDF

# File lib/uploadconvert.rb, line 49
# Chooses the text-extraction strategy by inspecting the PDF's fonts.
#
# Runs the external `pdffonts` tool on @input. When its report has more than
# four lines the PDF is handled by embedPDF; otherwise it is handled by ocrPDF.
# (The first lines of the report are presumably a header, so the threshold
# means "several fonts listed" -- TODO confirm against pdffonts output.)
#
# Returns whatever the chosen extraction method returns.
# Raises Errno::ENOENT when the pdffonts executable cannot be found.
def detectPDFType
  # Argument-vector form: no shell is involved, so spaces or shell
  # metacharacters in the file name cannot alter the command.
  font_report = IO.popen(["pdffonts", @input]) { |io| io.read }

  if font_report.split("\n").length > 4
    embedPDF
  else
    ocrPDF
  end
end
embedPDF() click to toggle source

Extract text from embedded text PDFs

# File lib/uploadconvert.rb, line 59
# Extracts the text layer of @input with Docsplit (no OCR) and cleans it.
#
# Docsplit is asked to write into "public/uploads"; the resulting .txt file is
# read, removed again, and its contents passed through cleanPDF.
#
# Returns the cleaned text, or nil when any step fails. Failures are reported
# on stderr instead of being discarded silently.
def embedPDF
  text_path = nil
  # NOTE(review): assumes Docsplit names its output "<basename of the PDF>.txt"
  # inside the :output directory -- confirm against the Docsplit version in use.
  text_path = File.join("public/uploads", File.basename(@input, ".*") + ".txt")

  Docsplit.extract_text(@input, :ocr => false, :output => "public/uploads")
  cleanPDF(File.read(text_path))
rescue StandardError => e
  warn "embedPDF: could not extract text from #{@input.inspect}: #{e.class}: #{e.message}"
  nil
ensure
  # Remove the temporary text file even when reading or cleaning failed.
  File.delete(text_path) if text_path && File.exist?(text_path)
end
extractMetadataPDF() click to toggle source

Extract PDF metadata

# File lib/uploadconvert.rb, line 118
# Collects the PDF's metadata through Docsplit's extract_<field> helpers.
#
# Each field listed below is fetched with Docsplit.extract_<field>(@input), in
# this order, and stored in @metadata under the matching symbol key.
#
# Returns the @metadata Hash.
def extractMetadataPDF
  @metadata = {}
  [:author, :creator, :producer, :title, :subject, :date, :keywords, :length].each do |field|
    @metadata[field] = Docsplit.public_send("extract_#{field}", @input)
  end
  @metadata
end
handleDoc() click to toggle source

Sends the document to the appropriate method

# File lib/uploadconvert.rb, line 15
# Routes @input to the matching converter.
#
# When @input is an http(s) URL it is first downloaded with `wget`, and @input
# is replaced by the last path segment of the URL (the name wget saves under
# in the current directory). The (possibly updated) @input is then dispatched
# on its name: ".pdf" goes to pdfTojson, ".xml" is read and passed to xmlTojson.
#
# Returns the converter's result, or nil when @input matches neither type.
def handleDoc
  source = @input.strip

  # Only a real URL scheme triggers a download; a local file whose name merely
  # contains "http" is processed as a file.
  if source =~ %r{\Ahttps?://}i
    # Argument-vector form: the URL never passes through a shell.
    system("wget", source)
    @input = source.split("/").last.strip
  end

  # Dispatch once after the optional download instead of recursing, so a
  # downloaded file whose name contains "http" cannot loop forever.
  if @input.include? ".pdf"
    pdfTojson
  elsif @input.include? ".xml"
    xmlTojson(File.read(@input))
  end
end
ocrPDF() click to toggle source

OCR the PDF and return the cleaned text

# File lib/uploadconvert.rb, line 79
# Extracts text from an image-only PDF via Docsplit's OCR mode and cleans it.
#
# Page images are written to the current directory and the OCR text to a
# "text" directory; both are removed again before returning, whether or not
# extraction succeeded. PNG files that already existed before the call are
# left untouched.
#
# Returns the text after cleanPDF has been applied.
# Raises whatever Docsplit or File.read raise when extraction fails.
def ocrPDF
  # Snapshot first so cleanup only removes images created by this call.
  existing_images = Dir["*.png"]
  # NOTE(review): assumes Docsplit names its output "<basename of the PDF>.txt"
  # inside the :output directory -- confirm against the Docsplit version in use.
  text_path = File.join("text", File.basename(@input, ".*") + ".txt")

  begin
    # Extract individual pages
    Docsplit.extract_images(@input)

    # OCR
    Docsplit.extract_text(@input, :ocr => true, :output => 'text')
    text = File.read(text_path)
  ensure
    # Clean up text and files
    File.delete(text_path) if File.exist?(text_path)
    # Only remove the directory when nothing else lives in it.
    Dir.delete("text") if Dir.exist?("text") && (Dir.entries("text") - %w[. ..]).empty?
    (Dir["*.png"] - existing_images).each { |image| File.delete(image) }
  end

  cleanPDF(text)
end
pdfTojson() click to toggle source

Convert PDFs to JSON

# File lib/uploadconvert.rb, line 35
# Builds the JSON representation of the PDF in @input.
#
# The text comes from detectPDFType and the metadata from extractMetadataPDF
# (which fills @metadata). The JSON object holds every metadata entry followed
# by :text and :input.
#
# Stores the pretty-printed JSON in @output and returns it.
def pdfTojson
  @text = detectPDFType
  extractMetadataPDF

  # merge builds a new Hash, so @metadata itself is left unchanged.
  document = @metadata.merge(:text => @text, :input => @input)
  @output = JSON.pretty_generate(document)
end
xmlTojson(xmlin) click to toggle source

Convert XML content to JSON

# File lib/uploadconvert.rb, line 29
# Converts an XML document to pretty-printed JSON.
#
# xmlin - the XML source, passed straight to Crack::XML.parse.
#
# Stores the JSON in @output, as pdfTojson does for PDFs, and returns it.
def xmlTojson(xmlin)
  @output = JSON.pretty_generate(Crack::XML.parse(xmlin))
end