class TikaWrapper::Instance
Attributes
Public Class Methods
@param [Hash] options
@option options [String] :url
@option options [String] :version
@option options [String] :port
@option options [String] :version_file
@option options [String] :instance_dir
@option options [String] :download_path
@option options [String] :md5sum
@option options [String] :tika_xml
@option options [Boolean] :verbose
@option options [Boolean] :managed
@option options [Boolean] :ignore_md5sum
@option options [Hash] :tika_options
@option options [Hash] :env
# File lib/tika_wrapper/instance.rb, line 29
# Create an instance wrapper around the given configuration hash.
#
# @param options [Hash] configuration; the hash is stored as-is (not copied)
def initialize(options = {})
  @options = options
end
Public Instance Methods
Clean up any files tika_wrapper may have downloaded
# File lib/tika_wrapper/instance.rb, line 96
# Stop the server, then delete the downloaded artifact and the cached
# checksum file when they are present on disk.
#
# Uses File.exist?: the File.exists? alias was deprecated and then removed
# in Ruby 3.2, where calling it raises NoMethodError.
def clean!
  stop
  FileUtils.remove_entry(download_path) if File.exist?(download_path)
  FileUtils.remove_entry(md5sum_path) if File.exist?(md5sum_path)
end
Get the port this tika instance is running at
# File lib/tika_wrapper/instance.rb, line 90
# Port for this instance, always returned as a String.
#
# @return [String] the :port option converted with to_s, or "9998" when unset
def port
  configured = options.fetch(:port, "9998")
  configured.to_s
end
Start tika and wait for it to become available
# File lib/tika_wrapper/instance.rb, line 42
# Download the server jar and, for managed instances, launch it and block
# until the status check succeeds.
#
# The original used `unless status; sleep 1; end`, which sleeps at most once
# and returns whether or not the server is up. Polling in a loop matches the
# documented "wait for it to become available" and mirrors the wait in #stop.
# NOTE(review): like #stop, this waits without a timeout.
def start
  download
  return unless managed?

  exec(p: port)
  # Re-check once per second until the server answers.
  sleep 1 until status
end
Is tika running?
# File lib/tika_wrapper/instance.rb, line 84
# Boolean form of #status.
#
# @return [Boolean] true when #status is truthy, false otherwise
def started?
  status ? true : false
end
Check the status of a managed tika service
# File lib/tika_wrapper/instance.rb, line 71
# Report whether the server answers on its "version" endpoint.
#
# Unmanaged instances are reported as up without any request being made.
# For managed instances any StandardError raised by the request counts as
# "not running".
#
# Uses URI.open (open-uri) instead of Kernel#open: Kernel#open no longer
# handles URLs as of Ruby 3.0. The block form also closes the response
# stream, which the original left open.
#
# @return [Boolean]
def status
  return true unless managed?

  begin
    URI.open("#{url}version") { |_response| true }
  rescue StandardError
    false
  end
end
Stop tika and wait for it to finish exiting
# File lib/tika_wrapper/instance.rb, line 56
# Kill the managed server process, wait until the status check fails, and
# forget the recorded pid.
#
# The pid guard matters: when no pid was recorded (for example a server is
# answering on the port but this object never launched it), pid.to_i is 0,
# and Process.kill with pid 0 signals every process in the caller's own
# process group. Without a pid there is nothing this object can stop, so
# the kill and the wait are skipped.
def stop
  if managed? && pid && started?
    Process.kill("KILL", pid.to_i)
    # Poll once per second until the server stops answering.
    sleep 1 while status
  end
  @pid = nil
end
Get a (likely) URL to the tika instance
# File lib/tika_wrapper/instance.rb, line 104
# Base URL built from the loopback address and #port, with a trailing slash.
#
# @return [String]
def url
  listening_port = port
  "http://127.0.0.1:#{listening_port}/"
end
# File lib/tika_wrapper/instance.rb, line 33
# Start the instance, yield it to the given block, and stop it afterwards.
# #stop runs from an ensure clause, so it is called even when #start or the
# block raises.
#
# @yieldparam instance [TikaWrapper::Instance] self
def wrap(&_block)
  begin
    start
    yield self
  ensure
    stop
  end
end
Protected Instance Methods
# File lib/tika_wrapper/instance.rb, line 110
# Ensure the server jar is present at #download_path and return that path.
#
# The file is fetched (and then checked with #validate!) unless it already
# exists and passes #validate?.
#
# Uses File.exist?: the File.exists? alias was removed in Ruby 3.2.
#
# @return [String] the download path
def download
  cached = File.exist?(download_path) && validate?(download_path)
  unless cached
    fetch_with_progressbar download_url, download_path
    validate! download_path
  end
  download_path
end
Run the tika server
# File lib/tika_wrapper/instance.rb, line 131
# Launch `java -jar <tika_binary>` through IO.popen and remember the child's
# pid in @pid.
#
# #tika_options merged with +options+ is rendered as "-key value" argument
# pairs, followed by the literal argument ">&2". The child's stderr is
# redirected to its stdout (err: [:child, :out]).
#
# @param options [Hash] extra server flags; these override #tika_options
# @return [Integer] pid of the spawned process
def exec(options = {})
  launcher = ["java", "-jar", tika_binary]
  flags = tika_options.merge(options).flat_map { |name, value| ["-#{name}", value.to_s] }
  command = launcher + flags + [">&2"]
  child = IO.popen(env, command + [err: [:child, :out]])
  @pid = child.pid
end
# File lib/tika_wrapper/instance.rb, line 123
# Raise unless +file+ passes #validate? or the :ignore_md5sum option is set.
#
# @param file [String] path of the file to check
# @raise [RuntimeError] "MD5 mismatch"
def validate!(file)
  return if validate?(file) || options[:ignore_md5sum]

  raise "MD5 mismatch"
end
# File lib/tika_wrapper/instance.rb, line 119
# Compare the MD5 hex digest of +file+ with #expected_md5sum.
#
# @param file [String] path of the file to hash
# @return [Boolean]
def validate?(file)
  actual = Digest::MD5.file(file).hexdigest
  actual == expected_md5sum
end
Private Instance Methods
# File lib/tika_wrapper/instance.rb, line 176
# Default location for the download: the basename of #download_url inside
# the system temp directory.
#
# @return [String]
def default_download_path
  tmp_root = Dir.tmpdir
  jar_name = File.basename(download_url)
  File.join(tmp_root, jar_name)
end
# File lib/tika_wrapper/instance.rb, line 143
# Ask the Apache mirror resolver (JSON mode) for a download URL for the
# configured #version and memoize the answer in @default_url.
#
# The URL is the response's "preferred" value followed by its "path_info".
#
# Uses URI.open (open-uri) instead of Kernel#open: Kernel#open no longer
# handles URLs as of Ruby 3.0. The block form also closes the response
# stream, which the original left open.
#
# @return [String]
def default_download_url
  @default_url ||= begin
    mirror_url = "http://www.apache.org/dyn/closer.cgi/tika/tika-server-#{version}.jar?asjson=true"
    doc = JSON.parse(URI.open(mirror_url, &:read))
    doc['preferred'] + doc['path_info']
  end
end
# File lib/tika_wrapper/instance.rb, line 168
# Delegates to TikaWrapper.default_tika_version; used when no :version
# option is supplied.
def default_tika_version
  fallback = TikaWrapper.default_tika_version
  fallback
end
# File lib/tika_wrapper/instance.rb, line 172
# Path where the server jar is stored: the :download_path option, or
# #default_download_path when the option is absent. Memoized.
#
# The default is supplied as a block so it is computed only when the option
# is missing. The original passed it as an argument, which evaluated
# #default_download_path even when :download_path was given.
#
# @return [String]
def download_path
  @download_path ||= options.fetch(:download_path) { default_download_path }
end
# File lib/tika_wrapper/instance.rb, line 139
# URL the server jar is fetched from: the :url option, or
# #default_download_url when the option is absent. Memoized.
#
# The default is supplied as a block so the mirror lookup performed by
# #default_download_url (an HTTP request) happens only when no :url was
# configured. The original evaluated it unconditionally.
#
# @return [String]
def download_url
  @download_url ||= options.fetch(:url) { default_download_url }
end
# File lib/tika_wrapper/instance.rb, line 164
# The :env option, or an empty hash when it is not set. Passed to IO.popen
# by #exec as the child's environment.
#
# @return [Hash]
def env
  options.fetch(:env) { {} }
end
# File lib/tika_wrapper/instance.rb, line 196
# Checksum the downloaded jar must match: the :md5sum option, or the first
# whitespace-separated token of the file returned by #md5file. Memoized in
# @md5sum.
#
# The fallback is supplied as a block so #md5file (which may download the
# checksum file) is only consulted when no :md5sum was configured; the
# original evaluated it unconditionally. File.read also closes the file,
# whereas the original `open(...).read` left the handle open.
#
# @return [String, nil] nil when the checksum file has no tokens
def expected_md5sum
  @md5sum ||= options.fetch(:md5sum) { File.read(md5file).split.first }
end
# File lib/tika_wrapper/instance.rb, line 212
# Stream +url+ into the file at +output+ while updating a progress bar.
#
# The bar's total is set once a positive content length is reported, and
# its progress is updated through open-uri's progress callback.
#
# Uses URI.open (open-uri) instead of Kernel#open: Kernel#open no longer
# handles URLs as of Ruby 3.0.
#
# @param url [String] source URL
# @param output [String] destination path
def fetch_with_progressbar(url, output)
  pbar = ProgressBar.create(title: File.basename(url), total: nil, format: "%t: |%B| %p%% (%e )")

  # The reported length may be nil or non-positive; leave the total unset then.
  on_length = lambda { |bytes| pbar.total = bytes if bytes && bytes > 0 }
  on_progress = lambda { |bytes| pbar.progress = bytes }

  URI.open(url, content_length_proc: on_length, progress_proc: on_progress) do |io|
    IO.copy_stream(io, output)
  end
end
# File lib/tika_wrapper/instance.rb, line 188
# Boolean form of the :managed option; defaults to true when unset.
#
# @return [Boolean]
def managed?
  options.fetch(:managed, true) ? true : false
end
# File lib/tika_wrapper/instance.rb, line 226
# Ensure the checksum file exists at #md5sum_path, fetching it from #md5url
# when it is missing, and return its path.
#
# Uses File.exist?: the File.exists? alias was removed in Ruby 3.2.
#
# @return [String] path of the checksum file
def md5file
  fetch_with_progressbar md5url, md5sum_path unless File.exist?(md5sum_path)
  md5sum_path
end
# File lib/tika_wrapper/instance.rb, line 204
# Local path of the checksum file: the basename of #md5url inside the
# system temp directory.
#
# @return [String]
def md5sum_path
  tmp_root = Dir.tmpdir
  checksum_name = File.basename(md5url)
  File.join(tmp_root, checksum_name)
end
# File lib/tika_wrapper/instance.rb, line 152
# URL of the .md5 file for the configured #version on archive.apache.org.
#
# @return [String]
def md5url
  release = version
  "http://archive.apache.org/dist/tika/tika-server-#{release}.jar.md5"
end
# File lib/tika_wrapper/instance.rb, line 200
# Path of the jar passed to `java -jar` by #exec; same value as
# #download_path.
#
# @return [String]
def tika_binary
  jar_path = download_path
  jar_path
end
# File lib/tika_wrapper/instance.rb, line 180
# Instance directory: the :instance_dir option, or a directory in the
# system temp dir named after the download URL's basename without ".jar".
# Memoized.
#
# The default is supplied as a block so #download_url is resolved only when
# :instance_dir is not configured; the original evaluated it
# unconditionally.
#
# @return [String]
def tika_dir
  @tika_dir ||= options.fetch(:instance_dir) do
    File.join(Dir.tmpdir, File.basename(download_url, ".jar"))
  end
end
# File lib/tika_wrapper/instance.rb, line 160
# The :tika_options option, or an empty hash when it is not set. #exec
# merges these with its own arguments to build the server flags.
#
# @return [Hash]
def tika_options
  options.fetch(:tika_options) { {} }
end
# File lib/tika_wrapper/instance.rb, line 208
# Create a temporary directory with Dir.mktmpdir on first use and return
# the same path on later calls.
#
# @return [String]
def tmp_save_dir
  return @tmp_save_dir if @tmp_save_dir

  @tmp_save_dir = Dir.mktmpdir
end
# File lib/tika_wrapper/instance.rb, line 184
# Boolean form of the :verbose option; defaults to false when unset.
#
# @return [Boolean]
def verbose?
  options.fetch(:verbose, false) ? true : false
end
# File lib/tika_wrapper/instance.rb, line 156
# Tika version to use: the :version option, or #default_tika_version when
# the option is absent. Memoized in @version.
def version
  return @version if @version

  @version = options.fetch(:version, default_tika_version)
end
# File lib/tika_wrapper/instance.rb, line 192
# Path of the version marker file: the :version_file option, or "VERSION"
# inside #tika_dir when the option is absent.
#
# The default is supplied as a block so #tika_dir is resolved only when
# :version_file is not configured; the original evaluated it
# unconditionally.
#
# @return [String]
def version_file
  options.fetch(:version_file) { File.join(tika_dir, "VERSION") }
end