class Despeck::Ocr

Extracts text in the desired language from an image or PDF file.

Attributes

lang[R]
source_path[R]

Public Class Methods

new(path) click to toggle source
# File lib/despeck/ocr.rb, line 8
# Creates an OCR wrapper around a single source file.
#
# The path is stored as given; nothing is opened or validated here.
#
# @param path [String] location of the file to read text from
#   (#text calls +end_with?+ on it, so a String is expected)
def initialize(path)
  @source_path = path
end

Public Instance Methods

text(lang: :eng) click to toggle source
# File lib/despeck/ocr.rb, line 12
# Runs OCR over the source file and returns the recognised text.
#
# A source whose path ends in ".pdf" (compared case-insensitively, so
# "scan.PDF" also qualifies) is processed through for_each_page_image: every
# yielded image path is recognised separately and the results are
# concatenated in yield order with no separator. Any other source is handed
# to RTesseract directly.
#
# @param lang [Symbol, String] value passed through as RTesseract's +lang+ option
# @return [String] the recognised text
def text(lang: :eng)
  unless source_path.downcase.end_with?('.pdf')
    return RTesseract.new(source_path, lang: lang).to_s
  end

  # Unary plus yields an unfrozen String even under frozen_string_literal,
  # so pages can be appended in place instead of reallocating with +=.
  buffer = +''
  for_each_page_image do |page_path|
    buffer << RTesseract.new(page_path, lang: lang).to_s
  end
  buffer
end

Private Instance Methods

for_each_page_image() { |path| ... } click to toggle source
# File lib/despeck/ocr.rb, line 26
# Writes every image returned by Despeck::PdfTools.pdf_to_images for
# source_path (presumably one per PDF page - confirm against PdfTools) to its
# own temporary ".jpg" file and yields that file's path to the block.
#
# Each temporary file is closed and deleted as soon as the block returns or
# raises, so a yielded path is only valid inside the block.
#
# @yieldparam path [String] path of the temporary image file
# @return [Array] always an empty Array; kept only because the previous
#   implementation returned one. Callers should rely on the yielded paths.
def for_each_page_image
  Despeck::PdfTools.pdf_to_images(source_path).each do |pic|
    tempfile = Tempfile.new(['despeck_page', '.jpg'])
    begin
      pic.write_to_file(tempfile.path)
      yield tempfile.path
    ensure
      # close! closes the handle and unlinks the file right away instead of
      # leaving both around until the Tempfile object is garbage-collected.
      tempfile.close!
    end
  end

  []
end