module ScanDex

Constants

VERSION

Public Class Methods

convert() click to toggle source
# File lib/scandex.rb, line 13
# Locates the ImageMagick +convert+ executable by asking +which+.
#
# Returns the output of `which convert` with surrounding whitespace removed;
# the string is empty when +which+ prints nothing.
def self.convert
    %x(which convert).strip
end
convert_to_gray_scale(source, destination) click to toggle source
# File lib/scandex.rb, line 63
# Renders +source+ into 8-bit grayscale JPEG pages at density 300 inside
# +destination+, using the executable reported by +convert+.
#
# The converter is started with an argument list instead of a shell command
# string, so paths containing spaces, quotes or shell metacharacters reach it
# unchanged and are never interpreted by a shell.
#
# source      - path of the file to convert.
# destination - directory that receives the convert-NNNN.jpg page images.
#
# Returns the sorted paths of all convert-*.jpg files found in +destination+,
# or an empty array when the converter could not be run or reported failure.
def self.convert_to_gray_scale(source, destination)
    page_pattern = "#{destination}/convert-%04d.jpg"
    puts "Converting '#{File.basename(source)}'"
    converted = system(self.convert, '-density', '300', '-depth', '8', '-type', 'grayscale', source, page_pattern)
    unless converted
        puts "Failed to convert #{source}"
        return []
    end
    # Sorted so the pages keep document order; Dir[] does not guarantee an
    # ordering on older Ruby versions.
    Dir["#{destination}/convert-*.jpg"].sort
end
db(store_path) click to toggle source
# File lib/scandex.rb, line 91
# Opens the SQLite database stored as ".scandex.db" inside +store_path+,
# creating the +documents+ table when the database file did not exist yet.
#
# store_path - directory holding the database; nil or "" selects '~/'.
#
# Returns the SQLite3::Database handle. A new handle is opened on every call.
def self.db(store_path)
    store_path = '~/' if store_path.nil? || store_path.empty?
    filename = File.expand_path("#{store_path}/.scandex.db")
    # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling.
    # The check has to happen before the database is opened.
    migrate = !File.exist?(filename)
    database = SQLite3::Database.new(filename)
    if migrate
        puts "Creating DB"
        database.execute("CREATE TABLE documents (name VARCHAR(255), content TEXT, created TEXT, modified TEXT)")
    end
    database
end
doctor() click to toggle source
# File lib/scandex.rb, line 17
# Verifies that the external tools are available by checking, in order, that
# +convert+, +gs+ and +tesseract+ each return a non-empty path.
#
# Prints which tool is missing for the first empty result and stops there.
#
# Returns true when every lookup is non-empty, false otherwise.
def self.doctor
    required_tools = [
        [:convert, "ImageMagick"],
        [:gs, "GhostScript"],
        [:tesseract, "Tesseract"]
    ]
    required_tools.each do |lookup, label|
        # Lookups run lazily, one at a time, so later tools are not probed
        # once a missing one has been found.
        next unless public_send(lookup).empty?
        puts "#{label} is missing"
        return false
    end
    true
end
documents(store_path) click to toggle source
# File lib/scandex.rb, line 109
# Lists every row of the +documents+ table in the database under +store_path+.
#
# Returns whatever the database handle's +execute+ returns for the query
# selecting name, created and modified (the content column is not selected).
def self.documents(store_path)
    self.db(store_path).execute("SELECT name, created, modified FROM documents")
end
gs() click to toggle source
# File lib/scandex.rb, line 9
# Locates the +gs+ executable by asking +which+.
#
# Returns the output of `which gs` with surrounding whitespace removed;
# the string is empty when +which+ prints nothing.
def self.gs
    %x(which gs).strip
end
has_document(store_path, name) click to toggle source
# File lib/scandex.rb, line 103
# Tells whether a document called +name+ is present in the database under
# +store_path+. The name is matched exactly, via a bound query parameter.
#
# Returns true when at least one row matches, false otherwise.
def self.has_document(store_path, name)
    matches = self.db(store_path).execute("SELECT name FROM documents WHERE name = ?", [name])
    !matches.empty?
end
image_to_string(image, language = "eng") click to toggle source

TODO: add automatic orientation and language detection.

# File lib/scandex.rb, line 86
# Runs OCR on a single image through RTesseract.
#
# image    - the image handed to RTesseract.new.
# language - value passed as the :lang option (default "eng").
#
# Returns the result of calling +to_s+ on the RTesseract object.
def self.image_to_string(image, language = "eng")
    RTesseract.new(image, lang: language).to_s
end
index(store_path, source, force = false) click to toggle source
# File lib/scandex.rb, line 45
# Extracts the text of +source+ by converting it to grayscale page images and
# running OCR over them.
#
# A file is skipped (with an "Ignoring" message) when its extension is not one
# of .pdf/.png/.jpg/.jpeg/.tiff (case-insensitive), or when it is already in
# the database and +force+ is false.
#
# store_path - directory of the database, passed on to has_document.
# source     - path of the file to index.
# force      - when true, index even if the document is already stored.
#
# Returns the text returned by ocr, or nil when the file was skipped or the
# conversion produced no pages.
def self.index(store_path, source, force = false)
    accepted_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.tiff']
    supported = accepted_formats.include?(File.extname(source).downcase)
    # index_and_store files documents under their expanded path, so the lookup
    # must use the same form or relative paths are never seen as indexed.
    # The database is only consulted for supported, non-forced files.
    known = supported && !force && self.has_document(store_path, File.expand_path(source))
    if !supported || known
        puts "Ignoring '#{source}'"
        return nil
    end

    puts "Indexing #{source}"
    # Block form removes the temporary directory (and the page images in it)
    # once OCR has finished, instead of leaving one behind per call.
    Dir.mktmpdir('scandex_') do |tmp|
        pages = convert_to_gray_scale(source, tmp)
        puts "Found #{pages.size} page(s)"
        pages.empty? ? nil : ocr(pages)
    end
end
index_and_store(store_path, file, force = false) click to toggle source
# File lib/scandex.rb, line 37
# Indexes +file+ and, when indexing produced content, stores that content in
# the database under the file's expanded path.
#
# store_path - directory of the database.
# file       - path of the file to index.
# force      - forwarded to ScanDex.index.
#
# Returns the result of ScanDex.store_document, or nil when ScanDex.index
# returned nil.
def self.index_and_store(store_path, file, force = false)
    content = ScanDex.index(store_path, file, force)
    return if content.nil?

    ScanDex.store_document(store_path, File.expand_path(file), content)
end
ocr(pages, language = "eng") click to toggle source
# File lib/scandex.rb, line 76
# Runs OCR over each page image in order and concatenates the results, with
# no separator added between pages. A progress line is printed before each
# page is processed.
#
# pages    - list of page image paths.
# language - forwarded to image_to_string (default "eng").
#
# Returns the combined text; an empty string when +pages+ is empty.
def self.ocr(pages, language = "eng")
    pages.inject('') do |combined, page|
        puts "OCR on '#{File.basename(page)}'"
        combined + image_to_string(page, language)
    end
end
search_documents(store_path, text) click to toggle source
# File lib/scandex.rb, line 114
# Finds documents whose content or name contains +text+, compared in lower
# case on both sides.
#
# The search term is wrapped in "%" and bound as a LIKE pattern; any "%" or
# "_" already inside +text+ is passed through as-is.
#
# Returns the rows (name, created, modified) produced by the query.
def self.search_documents(store_path, text)
    connection = self.db(store_path)
    like = "%#{text.downcase}%"
    connection.execute(
        "SELECT name, created, modified FROM documents WHERE LOWER(content) LIKE ? OR LOWER(name) LIKE ?",
        [like, like]
    )
end
store_document(store_path, source, content) click to toggle source
# File lib/scandex.rb, line 120
# Inserts or updates the row for +source+ in the +documents+ table.
#
# A new document is inserted with its name, content, created and modified
# values; for an existing name only content and modified are overwritten.
# Timestamps are stored as UTC ISO 8601 strings (Time#iso8601).
#
# store_path - directory of the database.
# source     - path of the document; used as the row name and read for its
#              file times, so the file must exist.
# content    - text to store for the document.
#
# Returns the result of the INSERT or UPDATE execute call.
def self.store_document(store_path, source, content)
    # "modified" must come from mtime: File.ctime is the inode-change time on
    # Unix (it also moves on chmod/rename), not the content modification time.
    last_modified = File.mtime(source).utc.iso8601
    modified = last_modified
    # NOTE(review): created keeps the original choice of mtime, captured when
    # the document is first inserted — confirm whether a real creation time
    # (e.g. File.birthtime where available) is wanted instead.
    created = last_modified

    db = self.db(store_path)
    # Only existence matters here, so avoid loading the full OCR text.
    rows = db.execute("SELECT name FROM documents WHERE name = ?", [source])
    if rows.empty?
        puts "Insert: #{source} #{created} #{modified}"
        db.execute("INSERT INTO documents (name, content, created, modified) VALUES (?, ?, ?, ?)", [source, content, created, modified])
    else
        puts "Update: #{source} #{created} #{modified}"
        db.execute("UPDATE documents SET content = ?, modified = ? WHERE name = ?", [content, modified, source])
    end
end
tesseract() click to toggle source
# File lib/scandex.rb, line 5
# Locates the +tesseract+ executable by asking +which+.
#
# Returns the output of `which tesseract` with surrounding whitespace removed;
# the string is empty when +which+ prints nothing.
def self.tesseract
    %x(which tesseract).strip
end