class Docsplit::TextExtractor

Delegates to pdftotext and tesseract in order to extract text from PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or forbid OCR extraction, but by default the heuristic works like this:

* Check for the presence of fonts in the PDF. If no fonts are detected,
  OCR is used automatically.
* Extract the text of each page with **pdftotext**. If the page has fewer
  than 100 bytes of text (a scanned image page, or a page that just
  contains a filename and a page number), add it to the list of
  `@pages_to_ocr`.
* Re-OCR each page in the `@pages_to_ocr` list at the end.

Constants

MEMORY_ARGS
MIN_TEXT_PER_PAGE
NO_TEXT_DETECTED
OCR_FLAGS

Public Class Methods

new() click to toggle source
# File lib/docsplit/text_extractor.rb, line 24
# Set up a fresh extractor whose list of pages queued for OCR is empty.
def initialize
  @pages_to_ocr = Array.new
end

Public Instance Methods

contains_text?(pdf) click to toggle source

Does a PDF have any text embedded?

# File lib/docsplit/text_extractor.rb, line 47
# Report whether the PDF appears to carry embedded text.
#
# Runs `pdffonts` on the (shell-escaped) path, capturing stderr together with
# stdout, and returns false when that output matches NO_TEXT_DETECTED,
# true otherwise.
def contains_text?(pdf)
  listing = `pdffonts #{ESCAPE[pdf]} 2>&1`
  listing.match(NO_TEXT_DETECTED).nil?
end
extract(pdfs, opts) click to toggle source

Extract text from a list of PDFs.

# File lib/docsplit/text_extractor.rb, line 29
# Extract text from a list of PDFs.
#
# pdfs - a single PDF path or a (possibly nested) array of paths.
# opts - options hash, passed to +extract_options+.
#
# For every PDF the whole job is sent to OCR when OCR is forced, or when OCR
# is not forbidden and +contains_text?+ reports no embedded text. Otherwise
# the text is extracted directly, and any pages collected in @pages_to_ocr
# are re-run through OCR (when OCR is allowed and tesseract is available).
def extract(pdfs, opts)
  extract_options opts
  # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
  FileUtils.mkdir_p @output unless File.exist?(@output)
  [pdfs].flatten.each do |pdf|
    # Start each document with an empty queue so pages flagged for one PDF
    # are never re-OCR'd against the next one.
    @pages_to_ocr = []
    @pdf_name = File.basename(pdf, File.extname(pdf))
    pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
    if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
      extract_from_ocr(pdf, pages)
    else
      extract_from_pdf(pdf, pages)
      if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
        extract_from_ocr(pdf, @pages_to_ocr)
      end
    end
  end
end
extract_from_ocr(pdf, pages) click to toggle source

Extract a page range worth of text from a PDF via OCR.

# File lib/docsplit/text_extractor.rb, line 59
# Extract a page range worth of text from a PDF via OCR.
#
# pdf   - path of the PDF to rasterise.
# pages - enumerable of 1-based page numbers, or nil/false to OCR the whole
#         document into a single text file.
#
# Each page (or the whole document) is converted to a TIFF inside a temporary
# directory -- with GraphicsMagick when ENV["toolchain"] is 'graphicsmagick',
# otherwise with ImageMagick's `convert` -- and the TIFF is then handed to
# tesseract, which is pointed at `<output>/<pdf_name>_<page>` (per page) or
# `<output>/<pdf_name>` (whole document). The resulting `.txt` file is passed
# through +clean_text+ when @clean_ocr is set.
#
# Any failure of an external command surfaces through +run+; the temporary
# directory is removed whether or not extraction succeeds.
def extract_from_ocr(pdf, pages)
  tempdir = Dir.mktmpdir
  base_path = File.join(@output, @pdf_name)
  escaped_pdf = ESCAPE[pdf]
  use_gm = ENV["toolchain"] == 'graphicsmagick'
  gm_convert = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle"
  im_convert = "convert -define quantum:polarity=min-is-white -endian MSB -units PixelsPerInch -density 204x196 -monochrome -compress Fax -sample 1728"
  if pages
    pages.each do |page|
      tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
      escaped_tiff = ESCAPE[tiff]
      file = "#{base_path}_#{page}"
      # Select just this page (zero-based frame index) for BOTH toolchains;
      # without the selector the whole document would be rendered into every
      # per-page TIFF.
      page_source = "#{escaped_pdf}[#{page - 1}]"
      if use_gm
        run "#{gm_convert} +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{page_source} #{escaped_tiff} 2>&1"
      else
        # stderr is folded into stdout so a failure message reaches +run+.
        run "#{im_convert} #{page_source} #{escaped_tiff} 2>&1"
      end
      run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
      clean_text(file + '.txt') if @clean_ocr
      FileUtils.remove_entry_secure tiff
    end
  else
    tiff = "#{tempdir}/#{@pdf_name}.tif"
    escaped_tiff = ESCAPE[tiff]
    if use_gm
      run "#{gm_convert} #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
    else
      run "#{im_convert} #{escaped_pdf} #{escaped_tiff} 2>&1"
    end
    # Escape the output base path just like the per-page branch does, so
    # output directories or file names with spaces/metacharacters work.
    run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} 2>&1"
    clean_text(base_path + '.txt') if @clean_ocr
  end
ensure
  # tempdir is nil if Dir.mktmpdir itself raised; File.exists? no longer
  # exists on Ruby >= 3.2, so use File.exist?.
  FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
end
extract_from_pdf(pdf, pages) click to toggle source

Extract a page range worth of text from a PDF, directly.

# File lib/docsplit/text_extractor.rb, line 53
# Pull text straight out of the PDF, without OCR.
#
# With a page list, each listed page is extracted on its own via
# +extract_page+; with no page list the whole document is extracted in one
# go via +extract_full+.
def extract_from_pdf(pdf, pages)
  if pages
    pages.each do |number|
      extract_page(pdf, number)
    end
  else
    extract_full(pdf)
  end
end

Private Instance Methods

clean_text(file) click to toggle source
# File lib/docsplit/text_extractor.rb, line 95
# Rewrite +file+ in place with its contents passed through
# Docsplit.clean_text.
#
# The cleaned text is computed BEFORE the file is truncated, so if cleaning
# raises, the original contents are left untouched instead of being lost.
# Returns the value of the final write call.
def clean_text(file)
  File.open(file, 'r+') do |f|
    cleaned = Docsplit.clean_text(f.read)
    f.rewind
    f.truncate(0)
    f.write(cleaned)
  end
end
extract_full(pdf) click to toggle source

Extract the full contents of a pdf as a single file, directly.

# File lib/docsplit/text_extractor.rb, line 112
# Dump the text of the entire PDF into `<output>/<pdf_name>.txt` with a
# single pdftotext invocation (UTF-8 output, stderr merged into stdout).
# Returns whatever +run+ returns.
def extract_full(pdf)
  destination = File.join(@output, "#{@pdf_name}.txt")
  source_arg = ESCAPE[pdf]
  destination_arg = ESCAPE[destination]
  run "pdftotext -enc UTF-8 #{source_arg} #{destination_arg} 2>&1"
end
extract_options(options) click to toggle source
# File lib/docsplit/text_extractor.rb, line 127
# Copy the recognised entries of +options+ into instance state.
#
#   :output   - output directory; '.' when absent
#   :pages    - page selection, stored as given
#   :ocr      - exactly true forces OCR, exactly false forbids it; anything
#               else leaves both flags off
#   :clean    - OCR text is cleaned unless this is exactly false
#   :language - tesseract language; 'eng' when absent
def extract_options(options)
  ocr_choice = options[:ocr]
  clean_choice = options[:clean]

  @output = options[:output] || '.'
  @pages = options[:pages]
  @force_ocr = (ocr_choice == true)
  @forbid_ocr = (ocr_choice == false)
  @clean_ocr = !(clean_choice == false)
  @language = options[:language] || 'eng'
end
extract_page(pdf, page) click to toggle source

Extract the contents of a single page of text, directly, adding it to the `@pages_to_ocr` list if the text length is inadequate.

# File lib/docsplit/text_extractor.rb, line 119
# Extract one page directly with pdftotext into
# `<output>/<pdf_name>_<page>.txt`.
#
# Unless OCR is forbidden, the resulting file is read back and the page is
# queued in @pages_to_ocr when its text is shorter than MIN_TEXT_PER_PAGE.
def extract_page(pdf, page)
  page_file = File.join(@output, "#{@pdf_name}_#{page}.txt")
  run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[page_file]} 2>&1"
  return if @forbid_ocr

  extracted = File.read(page_file)
  @pages_to_ocr.push(page) if extracted.length < MIN_TEXT_PER_PAGE
end
run(command) click to toggle source

Run an external process and raise an exception if it fails.

# File lib/docsplit/text_extractor.rb, line 105
# Execute +command+ through the shell and hand back what it printed.
#
# Raises ExtractionFailed, carrying the captured output as its message,
# when the child's status is anything other than zero.
def run(command)
  output = %x(#{command})
  status = $?
  raise ExtractionFailed, output unless status.to_i.zero?
  output
end