class Bagira
Constants
- FILE_TYPE
- VERSION
Attributes
file_type[R]
filepath[R]
page_count[R]
Public Class Methods
new(filepath)
click to toggle source
# File lib/bagira.rb, line 12
# Classifies +filepath+ and records how many pages it contains.
#
# The type is taken from the extension (.pdf, .png or .jpg, compared
# case-insensitively at the very end of the path). Any other extension
# yields FILE_TYPE[:invalid]; a path that does not exist on disk yields
# FILE_TYPE[:not_found] regardless of its extension.
#
# The page count is 1 for images, whatever PDF::Reader reports for PDFs,
# and 0 for everything else.
def initialize(filepath)
  @filepath = filepath

  # end_with? anchors at the true end of the string; the `$` regex anchor
  # would also match just before a newline embedded in the path.
  lowered = filepath.downcase
  @file_type =
    if !File.exist?(filepath)
      FILE_TYPE[:not_found]
    elsif lowered.end_with?('.pdf')
      FILE_TYPE[:pdf]
    elsif lowered.end_with?('.png')
      FILE_TYPE[:png]
    elsif lowered.end_with?('.jpg')
      FILE_TYPE[:jpg]
    else
      FILE_TYPE[:invalid]
    end

  @page_count =
    if is_image?
      1
    elsif is_document?
      PDF::Reader.new(filepath).page_count
    else
      0
    end
end
Public Instance Methods
perform_ocr()
click to toggle source
# File lib/bagira.rb, line 29
# Runs OCR on the file given at construction time.
#
# Images go through process_image and PDFs through process_pdf. When the
# file is neither (invalid type or not found) the result is nil.
def perform_ocr
  if is_image?
    process_image(@filepath)
  elsif is_document?
    process_pdf(@filepath)
  end
end
Private Instance Methods
is_document?()
click to toggle source
# File lib/bagira.rb, line 54
# True when the detected file type is FILE_TYPE[:pdf], false otherwise.
def is_document?
  @file_type.eql?(FILE_TYPE[:pdf]) ? true : false
end
is_image?()
click to toggle source
# File lib/bagira.rb, line 49
# True when the detected file type is FILE_TYPE[:jpg] or FILE_TYPE[:png],
# false otherwise.
def is_image?
  image_types = [FILE_TYPE[:jpg], FILE_TYPE[:png]]
  image_types.any? { |candidate| @file_type.eql?(candidate) }
end
process_image(filepath)
click to toggle source
# File lib/bagira.rb, line 35
# Runs `tesseract <filepath> stdout` and returns its output with leading
# and trailing whitespace removed.
#
# The command is passed to IO.popen as an argument array, so no shell is
# involved: paths containing spaces or shell metacharacters reach
# tesseract as one literal argument instead of being split or executed.
def process_image(filepath)
  IO.popen(['tesseract', filepath, 'stdout'], &:read).strip
end
process_pdf(filepath)
click to toggle source
# File lib/bagira.rb, line 40
# OCRs every page of the PDF at +filepath+ and returns the per-page text
# joined with newlines (an empty string when @page_count is 0).
#
# For each page from 1 to @page_count, Ghostscript renders that page as a
# 300 dpi grayscale PNG on its stdout, which is connected directly to
# tesseract's stdin. Both programs are started from argument arrays, so no
# shell is involved and +filepath+ is always passed as one literal
# argument, even when it contains spaces or shell metacharacters.
#
# Because no shell is involved, a missing gs or tesseract executable
# surfaces as an exception from IO.popen rather than as empty text.
def process_pdf(filepath)
  (1..@page_count).map do |page|
    render_command = [
      'gs', '-q', '-dNOPAUSE', '-sDEVICE=pnggray', '-dTextAlphaBits=4', '-r300',
      '-sOutputFile=%stdout', '-dBATCH',
      "-dFirstPage=#{page}", "-dLastPage=#{page}", filepath
    ]

    # The read end of gs's output pipe becomes tesseract's stdin; this
    # process only reads tesseract's output.
    page_text = IO.popen(render_command) do |rendered|
      IO.popen(%w[tesseract stdin stdout], in: rendered, &:read)
    end
    page_text.strip
  end.join("\n")
end