class Bagira

Constants

FILE_TYPE
VERSION

Attributes

file_type[R]
filepath[R]
page_count[R]

Public Class Methods

new(filepath) click to toggle source
# File lib/bagira.rb, line 12
# Classifies +filepath+ by extension and existence, and records its page count.
#
# @param filepath [String] path to the file to inspect
#
# After construction:
# * @filepath holds the given path.
# * @file_type is FILE_TYPE[:pdf], FILE_TYPE[:png] or FILE_TYPE[:jpg] according
#   to the (case-sensitive) extension, FILE_TYPE[:invalid] for anything else,
#   and FILE_TYPE[:not_found] whenever the path does not exist on disk.
# * @page_count is the PDF::Reader page count for documents, 1 for images and
#   0 otherwise.
def initialize(filepath)
  @filepath = filepath

  # \z anchors at the real end of the string. The previous `$` anchor also
  # matched before a newline, so "scan.png\n" or "a.pdf\nextra" was accepted.
  @file_type =
    case filepath
    when /\.pdf\z/ then FILE_TYPE[:pdf]
    when /\.png\z/ then FILE_TYPE[:png]
    when /\.jpg\z/ then FILE_TYPE[:jpg]
    else FILE_TYPE[:invalid]
    end
  # A missing file overrides whatever the extension suggested.
  @file_type = FILE_TYPE[:not_found] unless File.exist?(filepath)

  @page_count =
    if is_document?
      PDF::Reader.new(filepath).page_count
    elsif is_image?
      1
    else
      0
    end
end

Public Instance Methods

perform_ocr() click to toggle source
# File lib/bagira.rb, line 29
# Runs OCR on the file this instance was built with.
#
# Delegates to process_image when the file is an image and to process_pdf
# when it is a document, returning that helper's result. Returns nil when the
# file is neither.
def perform_ocr
  if is_image?
    process_image(@filepath)
  elsif is_document?
    process_pdf(@filepath)
  end
end

Private Instance Methods

is_document?() click to toggle source
# File lib/bagira.rb, line 54
# Reports whether the file was classified as a PDF, i.e. whether @file_type
# is eql? to FILE_TYPE[:pdf].
def is_document?
  @file_type.eql?(FILE_TYPE[:pdf])
end
is_image?() click to toggle source
# File lib/bagira.rb, line 49
# Reports whether the file was classified as an image, i.e. whether
# @file_type is eql? to FILE_TYPE[:jpg] or FILE_TYPE[:png].
def is_image?
  image_types = [FILE_TYPE[:jpg], FILE_TYPE[:png]]
  image_types.any? { |type| @file_type.eql?(type) }
end
process_image(filepath) click to toggle source
# File lib/bagira.rb, line 35
# Runs the `tesseract` command on an image and returns the recognised text.
#
# @param filepath [String] path of the image handed to tesseract
# @return [String] tesseract's standard output with surrounding whitespace
#   stripped
def process_image(filepath)
  # The argv-array form of IO.popen starts tesseract without a shell, so
  # spaces or shell metacharacters in the path reach it as one literal
  # argument instead of being split or executed.
  IO.popen(['tesseract', filepath.to_s, 'stdout'], &:read).strip
end
process_pdf(filepath) click to toggle source
# File lib/bagira.rb, line 40
# OCRs a PDF page by page and returns the combined text.
#
# For each page from 1 to @page_count, `gs` renders that single page as a
# 300 dpi grayscale PNG on its standard output, which is piped into
# `tesseract stdin stdout`.
#
# @param filepath [String] path of the PDF handed to gs
# @return [String] the stripped output for every page, joined with "\n"
#   (an empty string when @page_count is 0)
def process_pdf(filepath)
  # Stdlib; required here because this method is the only user and the
  # surrounding file shows no shared require list.
  require 'shellwords'

  # The pipeline needs a shell, so the path is escaped to stay one literal
  # argument even when it contains spaces or shell metacharacters.
  escaped_path = Shellwords.escape(filepath.to_s)

  page_texts = (1..@page_count).map do |page|
    %x(gs -q -dNOPAUSE -sDEVICE=pnggray -dTextAlphaBits=4 -r300 -sOutputFile=%stdout -dBATCH -dFirstPage=#{page} -dLastPage=#{page} #{escaped_path}|tesseract stdin stdout).strip
  end
  page_texts.join("\n")
end