class TikaWrapper::Instance

Attributes

options[R]
pid[R]

Public Class Methods

new(options = {}) click to toggle source

@param [Hash] options
@option options [String] :url
@option options [String] :version
@option options [String] :port
@option options [String] :version_file
@option options [String] :instance_dir
@option options [String] :download_path
@option options [String] :md5sum
@option options [String] :tika_xml
@option options [Boolean] :verbose
@option options [Boolean] :managed
@option options [Boolean] :ignore_md5sum
@option options [Hash] :tika_options
@option options [Hash] :env

# File lib/tika_wrapper/instance.rb, line 29
# Remember the configuration for this instance. Only the assignment happens
# here; the hash is stored as given, without copying or validation.
#
# @param options [Hash] settings read later through the `options` reader
def initialize(options = {})
  @options = options
end

Public Instance Methods

clean!() click to toggle source

Clean up any files tika_wrapper may have downloaded

# File lib/tika_wrapper/instance.rb, line 96
# Clean up any files tika_wrapper may have downloaded.
#
# Stops the instance first, then removes the downloaded jar and the cached
# md5 file if they are present on disk.
def clean!
  stop
  # File.exists? was removed in Ruby 3.2; File.exist? is available on every version.
  FileUtils.remove_entry(download_path) if File.exist?(download_path)
  FileUtils.remove_entry(md5sum_path) if File.exist?(md5sum_path)
end
port() click to toggle source

Get the port this tika instance is running at

# File lib/tika_wrapper/instance.rb, line 90
# Port the tika instance is expected to listen on.
#
# @return [String] the :port option converted to a string, "9998" when unset
def port
  configured = options.fetch(:port) { "9998" }
  configured.to_s
end
start() click to toggle source

Start tika and wait for it to become available

# File lib/tika_wrapper/instance.rb, line 42
# Start tika and wait for it to become available.
#
# The jar is downloaded in every case; the server process is only launched
# (and waited for) when the instance is managed.
def start
  download
  return unless managed?

  exec(p: port)

  # Keep polling until the server answers. A single check followed by one
  # sleep returned before tika was actually reachable.
  sleep 1 until status
end
started?() click to toggle source

Is tika running?

# File lib/tika_wrapper/instance.rb, line 84
# Is tika running?
#
# @return [Boolean] the result of `status` coerced to strict true/false
def started?
  status ? true : false
end
status() click to toggle source

Check the status of a managed tika service

# File lib/tika_wrapper/instance.rb, line 71
# Check the status of a managed tika service.
#
# An unmanaged instance is always reported as up. For a managed one, the
# "version" endpoint under `url` is requested and any error counts as "down".
#
# @return [Boolean]
def status
  return true unless managed?

  begin
    # Kernel#open stopped handling URLs in Ruby 3.0, which made this check
    # always fail; URI.open is open-uri's explicit entry point, and the block
    # form closes the response once the probe is done.
    URI.open(url + "version") { true }
  rescue
    false
  end
end
stop() click to toggle source

Stop tika and wait for it to finish exiting

# File lib/tika_wrapper/instance.rb, line 56
# Stop tika and wait for it to finish exiting.
#
# Only a managed instance that this object launched (i.e. with a recorded
# pid) and that currently reports as started is signalled. The recorded pid
# is cleared in every case.
def stop
  # Without a pid there is nothing of ours to signal: nil.to_i is 0, and
  # Process.kill("KILL", 0) targets the caller's own process group.
  if pid && managed? && started?
    Process.kill("KILL", pid.to_i)

    # Wait for tika to stop
    sleep 1 while status
  end

  @pid = nil
end
url() click to toggle source

Get a (likely) URL to the tika instance

# File lib/tika_wrapper/instance.rb, line 104
# Get a (likely) URL to the tika instance: the loopback address combined
# with `port`, with a trailing slash.
#
# @return [String]
def url
  listen_port = port
  "http://127.0.0.1:#{listen_port}/"
end
wrap() { |self| ... } click to toggle source
# File lib/tika_wrapper/instance.rb, line 33
# Run a block against a started instance: `start` is called, the instance
# is yielded, and `stop` runs afterwards even when an error is raised.
#
# @yieldparam instance [self]
# @return the block's result
def wrap(&_block)
  begin
    start
    yield self
  ensure
    stop
  end
end

Protected Instance Methods

download() click to toggle source
# File lib/tika_wrapper/instance.rb, line 110
# Make sure the tika jar is present at `download_path`.
#
# The jar is fetched (and then checked with `validate!`) whenever the file
# is missing or does not pass `validate?`.
#
# @return [String] the download path
def download
  # File.exists? was removed in Ruby 3.2; File.exist? is available on every version.
  unless File.exist?(download_path) && validate?(download_path)
    fetch_with_progressbar download_url, download_path
    validate! download_path
  end

  download_path
end
exec(options = {}) click to toggle source

Run the tika server

# File lib/tika_wrapper/instance.rb, line 131
# Run the tika server.
#
# Builds a `java -jar <tika_binary>` command, appends every entry of
# `tika_options` merged with `options` as a "-key value" pair, spawns it via
# IO.popen with `env`, and records the child's pid in @pid.
#
# NOTE(review): the command is passed as an array, so no shell is involved
# and ">&2" appears to reach java as a literal argument — confirm intended.
#
# @param options [Hash] extra command-line switches; win over tika_options
# @return [Integer] pid of the spawned process
def exec(options = {})
  launcher = ["java", "-jar", tika_binary]
  switches = tika_options.merge(options).flat_map { |name, value| ["-#{name}", value.to_s] }
  command = launcher + switches + [">&2"]
  child = IO.popen(env, command + [err: [:child, :out]])
  @pid = child.pid
end
validate!(file) click to toggle source
# File lib/tika_wrapper/instance.rb, line 123
# Enforce the checksum of a downloaded file.
#
# @param file [String] path handed to `validate?`
# @raise [RuntimeError] "MD5 mismatch" when the check fails and the
#   :ignore_md5sum option is not set
def validate!(file)
  return if validate?(file)
  return if options[:ignore_md5sum]

  raise "MD5 mismatch"
end
validate?(file) click to toggle source
# File lib/tika_wrapper/instance.rb, line 119
# Does the MD5 digest of the file match `expected_md5sum`?
#
# @param file [String] path of the file to hash
# @return [Boolean]
def validate?(file)
  actual = Digest::MD5.file(file).hexdigest
  actual == expected_md5sum
end

Private Instance Methods

default_download_path() click to toggle source
# File lib/tika_wrapper/instance.rb, line 176
# Fallback location for the jar: the system temp directory joined with the
# last path segment of `download_url`.
#
# @return [String]
def default_download_path
  jar_name = File.basename(download_url)
  File.join(Dir.tmpdir, jar_name)
end
default_download_url() click to toggle source
# File lib/tika_wrapper/instance.rb, line 143
# Resolve the jar's download URL through the Apache mirror service.
#
# The JSON answer's 'preferred' and 'path_info' fields are concatenated. The
# result is memoized in @default_url, so the lookup happens at most once.
#
# @return [String]
def default_download_url
  @default_url ||= begin
    mirror_url = "http://www.apache.org/dyn/closer.cgi/tika/tika-server-#{version}.jar?asjson=true"
    # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is open-uri's
    # explicit entry point, and the block form closes the response after reading.
    json = URI.open(mirror_url, &:read)
    doc = JSON.parse(json)
    doc['preferred'] + doc['path_info']
  end
end
default_tika_version() click to toggle source
# File lib/tika_wrapper/instance.rb, line 168
# Version used when the :version option is absent; delegated to the
# module-level TikaWrapper.default_tika_version.
def default_tika_version
  TikaWrapper.default_tika_version
end
download_path() click to toggle source
# File lib/tika_wrapper/instance.rb, line 172
# Where the tika jar is (or will be) stored on disk. Memoized.
#
# @return [String] the :download_path option, else `default_download_path`
def download_path
  # Block form keeps the fallback lazy: it derives from download_url, which
  # may hit the network, so it must not run when the option is supplied.
  @download_path ||= options.fetch(:download_path) { default_download_path }
end
download_url() click to toggle source
# File lib/tika_wrapper/instance.rb, line 139
# URL the tika jar is downloaded from. Memoized.
#
# @return [String] the :url option, else `default_download_url`
def download_url
  # Block form keeps the fallback lazy: the default performs a mirror lookup
  # over the network, which is pointless (and can fail) when :url is given.
  @download_url ||= options.fetch(:url) { default_download_url }
end
env() click to toggle source
# File lib/tika_wrapper/instance.rb, line 164
# Environment handed to the spawned process.
#
# @return [Hash] the :env option, or an empty hash when unset
def env
  options.fetch(:env) { {} }
end
expected_md5sum() click to toggle source
# File lib/tika_wrapper/instance.rb, line 196
# MD5 digest the downloaded jar is expected to have. Memoized in @md5sum.
#
# @return [String] the :md5sum option, else the first whitespace-separated
#   token of the file returned by `md5file`
def expected_md5sum
  # Block form keeps the fallback lazy so the md5 file is not fetched when
  # :md5sum is given; File.read also closes the handle, unlike open(...).read.
  @md5sum ||= options.fetch(:md5sum) { File.read(md5file).split(" ").first }
end
fetch_with_progressbar(url, output) click to toggle source
# File lib/tika_wrapper/instance.rb, line 212
# Download `url` into `output` while driving a progress bar.
#
# The bar's total is set once a positive content length is reported, and its
# progress follows the number of bytes received so far.
#
# @param url [String] remote location to fetch
# @param output [String, IO] destination handed to IO.copy_stream
# @return [Integer] number of bytes copied
def fetch_with_progressbar(url, output)
  pbar = ProgressBar.create(title: File.basename(url), total: nil, format: "%t: |%B| %p%% (%e )")
  on_length = lambda { |size| pbar.total = size if size && 0 < size }
  on_progress = lambda { |received| pbar.progress = received }

  # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is open-uri's
  # explicit entry point and accepts the same progress callbacks.
  URI.open(url, content_length_proc: on_length, progress_proc: on_progress) do |io|
    IO.copy_stream(io, output)
  end
end
managed?() click to toggle source
# File lib/tika_wrapper/instance.rb, line 188
# Should this wrapper launch and stop the tika process itself?
#
# @return [Boolean] the :managed option as strict true/false; true when unset
def managed?
  flag = options.fetch(:managed) { true }
  flag ? true : false
end
md5file() click to toggle source
# File lib/tika_wrapper/instance.rb, line 226
# Make sure the md5 file is available locally, fetching it from `md5url`
# when nothing exists at `md5sum_path` yet.
#
# @return [String] the local md5 file path
def md5file
  # File.exists? was removed in Ruby 3.2; File.exist? is available on every version.
  fetch_with_progressbar(md5url, md5sum_path) unless File.exist?(md5sum_path)

  md5sum_path
end
md5sum_path() click to toggle source
# File lib/tika_wrapper/instance.rb, line 204
# Local path of the md5 file: the system temp directory joined with the
# last path segment of `md5url`.
#
# @return [String]
def md5sum_path
  checksum_name = File.basename(md5url)
  File.join(Dir.tmpdir, checksum_name)
end
md5url() click to toggle source
# File lib/tika_wrapper/instance.rb, line 152
# Address of the md5 file for the configured tika-server version on the
# Apache archive host.
#
# @return [String]
def md5url
  jar_version = version
  "http://archive.apache.org/dist/tika/tika-server-#{jar_version}.jar.md5"
end
tika_binary() click to toggle source
# File lib/tika_wrapper/instance.rb, line 200
# Path of the tika server jar; this is simply `download_path`.
#
# @return [String]
def tika_binary
  download_path
end
tika_dir() click to toggle source
# File lib/tika_wrapper/instance.rb, line 180
# Directory associated with this instance. Memoized.
#
# @return [String] the :instance_dir option, else a temp-dir path named
#   after the download URL's basename without ".jar"
def tika_dir
  # Block form keeps the fallback lazy: it needs download_url, which may do a
  # network lookup, so it must not run when :instance_dir is supplied.
  @tika_dir ||= options.fetch(:instance_dir) do
    File.join(Dir.tmpdir, File.basename(download_url, ".jar"))
  end
end
tika_options() click to toggle source
# File lib/tika_wrapper/instance.rb, line 160
# Command-line switches configured for the tika server.
#
# @return [Hash] the :tika_options option, or an empty hash when unset
def tika_options
  options.fetch(:tika_options) { {} }
end
tmp_save_dir() click to toggle source
# File lib/tika_wrapper/instance.rb, line 208
# A scratch directory created with Dir.mktmpdir on first use and reused on
# later calls.
#
# @return [String]
def tmp_save_dir
  return @tmp_save_dir if @tmp_save_dir

  @tmp_save_dir = Dir.mktmpdir
end
verbose?() click to toggle source
# File lib/tika_wrapper/instance.rb, line 184
# Is verbose mode requested?
#
# @return [Boolean] the :verbose option as strict true/false; false when unset
def verbose?
  flag = options.fetch(:verbose) { false }
  flag ? true : false
end
version() click to toggle source
# File lib/tika_wrapper/instance.rb, line 156
# Tika version to use. Memoized in @version.
#
# @return [String] the :version option, else `default_tika_version`
def version
  return @version if @version

  @version = options.fetch(:version, default_tika_version)
end
version_file() click to toggle source
# File lib/tika_wrapper/instance.rb, line 192
# Path of the VERSION marker file.
#
# @return [String] the :version_file option, else "VERSION" inside `tika_dir`
def version_file
  # Block form keeps the fallback lazy: tika_dir may resolve download_url
  # (a network lookup), so it must not run when :version_file is supplied.
  options.fetch(:version_file) { File.join(tika_dir, "VERSION") }
end