class Docsplit::TextExtractor
Delegates to **pdftotext** and **tesseract** in order to extract text from PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or forbid OCR extraction, but by default the heuristic works like this:
* Check for the presence of fonts in the PDF. If no fonts are detected, OCR is used automatically.
* Extract the text of each page with **pdftotext**. If a page has fewer than 100 bytes of text (a scanned image page, or a page that just contains a filename and a page number), add it to the list of `@pages_to_ocr`.
* Re-OCR each page in the `@pages_to_ocr` list at the end.
Constants
- MEMORY_ARGS
- MIN_TEXT_PER_PAGE
- NO_TEXT_DETECTED
- OCR_FLAGS
Public Class Methods
new()
click to toggle source
# File lib/docsplit/text_extractor.rb, line 24
# Set up a new extractor with an empty queue of pages awaiting OCR.
# Pages are appended to the queue by +extract_page+ and consumed by +extract+.
def initialize
  @pages_to_ocr = []
end
Public Instance Methods
contains_text?(pdf)
click to toggle source
Does a PDF have any text embedded?
# File lib/docsplit/text_extractor.rb, line 47
# Does a PDF have any text embedded?
#
# Runs the external +pdffonts+ tool on the PDF (stderr folded into stdout)
# and returns +true+ unless its output matches NO_TEXT_DETECTED.
#
# pdf - path to the PDF; passed through ESCAPE before being interpolated
#       into the shell command.
def contains_text?(pdf)
  font_report = `pdffonts #{ESCAPE[pdf]} 2>&1`
  font_report.match(NO_TEXT_DETECTED).nil?
end
extract(pdfs, opts)
click to toggle source
Extract text from a list of PDFs.
# File lib/docsplit/text_extractor.rb, line 29
# Extract text from a list of PDFs.
#
# pdfs - a single PDF path or a (possibly nested) array of paths.
# opts - options hash, interpreted by +extract_options+.
#
# For each PDF the whole document is OCR'd when OCR is forced, or when OCR
# is not forbidden and the PDF contains no embedded text. Otherwise the text
# is extracted directly, and any pages that +extract_page+ queued in
# +@pages_to_ocr+ are re-extracted via OCR (only when tesseract is available
# and OCR is not forbidden).
def extract(pdfs, opts)
  extract_options opts
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  FileUtils.mkdir_p @output unless File.exist?(@output)
  [pdfs].flatten.each do |pdf|
    # Start every document with an empty OCR queue; otherwise pages queued
    # for an earlier PDF would be OCR'd again against each later PDF.
    @pages_to_ocr = []
    @pdf_name = File.basename(pdf, File.extname(pdf))
    pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
    if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
      extract_from_ocr(pdf, pages)
    else
      extract_from_pdf(pdf, pages)
      if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
        extract_from_ocr(pdf, @pages_to_ocr)
      end
    end
  end
end
extract_from_ocr(pdf, pages)
click to toggle source
Extract a page range worth of text from a PDF via OCR.
# File lib/docsplit/text_extractor.rb, line 59
# Extract a page range worth of text from a PDF via OCR.
#
# pdf   - path to the source PDF.
# pages - enumerable of 1-based page numbers, or nil to OCR the whole
#         document into a single text file.
#
# Each page (or the whole document) is rasterised to a TIFF inside a
# temporary directory, run through tesseract using +@language+, and the
# resulting text is cleaned when +@clean_ocr+ is set. Output is written to
# "<@output>/<@pdf_name>_<page>.txt" per page, or "<@output>/<@pdf_name>.txt"
# for the whole document. The temporary directory is always removed.
#
# Raises whatever +run+ raises when an external command fails.
def extract_from_ocr(pdf, pages)
  tempdir     = Dir.mktmpdir
  base_path   = File.join(@output, @pdf_name)
  escaped_pdf = ESCAPE[pdf]
  if pages
    pages.each do |page|
      tiff         = "#{tempdir}/#{@pdf_name}_#{page}.tif"
      escaped_tiff = ESCAPE[tiff]
      file         = "#{base_path}_#{page}"
      rasterize_for_ocr(escaped_pdf, escaped_tiff, tempdir, page)
      run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
      clean_text(file + '.txt') if @clean_ocr
      # Drop each page image as soon as it is OCR'd to keep disk usage low.
      FileUtils.remove_entry_secure tiff
    end
  else
    tiff         = "#{tempdir}/#{@pdf_name}.tif"
    escaped_tiff = ESCAPE[tiff]
    rasterize_for_ocr(escaped_pdf, escaped_tiff, tempdir)
    # The output base is escaped here just as in the per-page branch, so
    # output paths containing spaces or shell metacharacters work.
    run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} 2>&1"
    clean_text(base_path + '.txt') if @clean_ocr
  end
ensure
  # tempdir is nil if Dir.mktmpdir itself raised; File.exist? replaces
  # File.exists?, which was removed in Ruby 3.2.
  FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
end

# Rasterise a PDF (or one page of it) to a TIFF suitable for tesseract.
#
# escaped_pdf  - already shell-escaped path of the source PDF.
# escaped_tiff - already shell-escaped path of the TIFF to write.
# tempdir      - scratch directory handed to GraphicsMagick as MAGICK_TMPDIR.
# page         - 1-based page number, or nil to convert the whole document.
#
# Uses GraphicsMagick when ENV["toolchain"] is 'graphicsmagick', otherwise
# ImageMagick's +convert+. When a page is given, both toolchains receive the
# zero-based "[n]" frame selector so that only that page is rasterised.
def rasterize_for_ocr(escaped_pdf, escaped_tiff, tempdir, page = nil)
  source = page ? "#{escaped_pdf}[#{page - 1}]" : escaped_pdf
  if ENV["toolchain"] == 'graphicsmagick'
    adjoin = page ? ' +adjoin' : ''
    run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle#{adjoin} #{MEMORY_ARGS} #{OCR_FLAGS} #{source} #{escaped_tiff} 2>&1"
  else
    # stderr is folded into stdout so a failure message reaches the caller.
    run "convert -define quantum:polarity=min-is-white -endian MSB -units PixelsPerInch -density 204x196 -monochrome -compress Fax -sample 1728 #{source} #{escaped_tiff} 2>&1"
  end
end
private :rasterize_for_ocr
extract_from_pdf(pdf, pages)
click to toggle source
Extract a page range worth of text from a PDF, directly.
# File lib/docsplit/text_extractor.rb, line 53
# Extract a page range worth of text from a PDF, directly.
#
# pdf   - path to the source PDF.
# pages - enumerable of page numbers; when nil the whole document is
#         extracted into a single file via +extract_full+.
def extract_from_pdf(pdf, pages)
  if pages
    pages.each do |page_number|
      extract_page(pdf, page_number)
    end
  else
    extract_full(pdf)
  end
end
Private Instance Methods
clean_text(file)
click to toggle source
# File lib/docsplit/text_extractor.rb, line 95
# Rewrite a text file in place with its contents passed through
# Docsplit.clean_text.
#
# file - path of the text file to clean; it is opened read/write, emptied,
#        and refilled with the cleaned text.
def clean_text(file)
  File.open(file, 'r+') do |io|
    raw_text = io.read
    # Empty the file and move back to the start before writing the result.
    io.truncate(0)
    io.rewind
    cleaned = Docsplit.clean_text(raw_text)
    io.write(cleaned)
  end
end
extract_full(pdf)
click to toggle source
Extract the full contents of a pdf as a single file, directly.
# File lib/docsplit/text_extractor.rb, line 112
# Extract the full contents of a pdf as a single file, directly.
#
# pdf - path to the source PDF.
#
# Writes "<@output>/<@pdf_name>.txt" using pdftotext with UTF-8 encoding and
# returns the command's combined output (see +run+).
def extract_full(pdf)
  destination = File.join(@output, "#{@pdf_name}.txt")
  command = "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[destination]} 2>&1"
  run(command)
end
extract_options(options)
click to toggle source
# File lib/docsplit/text_extractor.rb, line 127
# Copy the extraction options into instance variables.
#
# options - hash with the optional keys:
#           :output   - output directory (default '.')
#           :pages    - pages to extract (stored as given)
#           :ocr      - exactly true forces OCR, exactly false forbids it;
#                       any other value leaves the choice to the heuristic
#           :clean    - OCR output is cleaned unless this is exactly false
#           :language - language passed to tesseract (default 'eng')
def extract_options(options)
  ocr_setting = options[:ocr]

  @output     = options[:output] || '.'
  @pages      = options[:pages]
  @force_ocr  = ocr_setting == true
  @forbid_ocr = ocr_setting == false
  @clean_ocr  = options[:clean] != false
  @language   = options[:language] || 'eng'
end
extract_page(pdf, page)
click to toggle source
Extract the contents of a single page of text, directly, adding it to the `@pages_to_ocr` list if the text length is inadequate.
# File lib/docsplit/text_extractor.rb, line 119
# Extract the contents of a single page of text, directly, adding it to the
# `@pages_to_ocr` list if the text length is inadequate.
#
# pdf  - path to the source PDF.
# page - page number, used for both pdftotext's -f and -l flags.
#
# Writes "<@output>/<@pdf_name>_<page>.txt". Unless OCR is forbidden, the
# page is queued for OCR when the extracted text is shorter than
# MIN_TEXT_PER_PAGE.
def extract_page(pdf, page)
  destination = File.join(@output, "#{@pdf_name}_#{page}.txt")
  run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[destination]} 2>&1"
  return if @forbid_ocr

  extracted = File.read(destination)
  @pages_to_ocr.push(page) if extracted.length < MIN_TEXT_PER_PAGE
end
run(command)
click to toggle source
Run an external process and raise an exception if it fails.
# File lib/docsplit/text_extractor.rb, line 105
# Run an external process and raise an exception if it fails.
#
# command - shell command line to execute.
#
# Returns the command's standard output. Raises ExtractionFailed, carrying
# that output as its message, when the process does not finish successfully.
def run(command)
  output = `#{command}`
  return output if $?.success?

  raise ExtractionFailed, output
end