class PDF::Reader::ObjectHash
Provides low level access to the objects in a PDF file via a hash-like object.
A PDF file can be viewed as a large hash map. It is a series of objects stored at precise byte offsets, and a table that maps object IDs to byte offsets. Given an object ID, looking up an object is an O(1) operation.
Each PDF object can be mapped to a ruby object, so by passing an object ID to the [] method, a ruby representation of that object will be retrieved.
The class behaves much like a standard Ruby hash, including the use of the Enumerable mixin. The key difference is no []= method - the hash is read only.
Basic Usage
h = PDF::Reader::ObjectHash.new("somefile.pdf") h[1] => 3469 h[PDF::Reader::Reference.new(1,0)] => 3469
Attributes
Public Class Methods
Source
# File lib/pdf/reader/object_hash.rb, line 43
# Build an ObjectHash around a PDF source.
#
# input - resolved to an IO by extract_io_from (an object responding to
#         seek/read, or the name of an existing file).
# opts  - :cache    - object used as the parsed-object cache; a fresh
#                     PDF::Reader::ObjectCache is created when absent.
#         :password - passed through to build_security_handler.
def initialize(input, opts = {})
  @io          = extract_io_from(input)
  @xref        = PDF::Reader::XRef.new(@io)
  @pdf_version = read_version
  @trailer     = @xref.trailer

  @cache = opts[:cache]
  @cache ||= PDF::Reader::ObjectCache.new

  # A placeholder handler is installed before the real one is built.
  # NOTE(review): presumably needed because build_security_handler
  # dereferences trailer entries, and fetching objects consults
  # sec_handler - confirm before removing.
  @sec_handler = NullSecurityHandler.new
  @sec_handler = build_security_handler(opts)
end
Creates a new ObjectHash
object. Input can be a string with a valid filename or an IO-like object.
Valid options:
:password - the user password to decrypt the source PDF
Public Instance Methods
Source
# File lib/pdf/reader/object_hash.rb, line 74
# Look up an object by ID.
#
# key - an int-like value or a PDF::Reader::Reference. Anything that is not
#       a Reference is wrapped as Reference(key.to_i, 0).
#
# Returns `default` when key.to_i is not positive or when the lookup raises
# InvalidObjectError; otherwise the (cached) object.
def [](key)
  return default if key.to_i <= 0

  ref = if key.is_a?(PDF::Reader::Reference)
          key
        else
          PDF::Reader::Reference.new(key.to_i, 0)
        end

  # Try a plain object first, then fall back to an object-stream lookup.
  @cache[ref] ||= (fetch_object(ref) || fetch_object_stream(ref))
rescue InvalidObjectError
  default
end
Access an object from the PDF. key can be an int or a PDF::Reader::Reference
object.
If an int is used, the object with that ID and a generation number of 0 will be returned.
If a PDF::Reader::Reference
object is used the exact ID and generation number can be specified.
Source
# File lib/pdf/reader/object_hash.rb, line 97
# Public entry point for recursive dereferencing. Starts the private
# deref_internal! walk with an empty "seen" table.
def deref!(key)
  seen = {}
  deref_internal!(key, seen)
end
Recursively dereferences the object referred to by key
. If key
is not a PDF::Reader::Reference
, the key is returned unchanged.
Source
# File lib/pdf/reader/object_hash.rb, line 126
# Yield every reference known to @xref together with the object that
# self[] returns for it.
def each(&block)
  @xref.each { |ref| yield ref, self[ref] }
end
iterate over each key, value. Just like a ruby hash.
Source
# File lib/pdf/reader/object_hash.rb, line 135
# Yield each key (the references enumerated by @xref), like Hash#each_key.
#
# The keys are read straight from @xref instead of going through #each,
# because #each also calls self[ref] for every entry and would therefore
# load every object just to discard it.
def each_key(&block)
  @xref.each { |ref| yield ref }
end
iterate over each key. Just like a ruby hash.
Source
# File lib/pdf/reader/object_hash.rb, line 143
# Yield only the object half of every pair produced by #each.
def each_value(&block)
  each { |_ref, obj| yield obj }
end
iterate over each value. Just like a ruby hash.
Source
# File lib/pdf/reader/object_hash.rb, line 158
# True when #size reports zero.
def empty?
  size == 0
end
return true if there are no objects in this file
Source
# File lib/pdf/reader/object_hash.rb, line 238
# True when the trailer carries an :Encrypt entry.
def encrypted?
  trailer.has_key?(:Encrypt)
end
Source
# File lib/pdf/reader/object_hash.rb, line 113
# Look up key via self[], falling back to local_default.
#
# key           - an int or PDF::Reader::Reference, as accepted by self[].
# local_default - returned when self[key] is nil/false and this is truthy.
#
# Raises IndexError when nothing was found, no default was given and
# key.to_i is not positive.
# NOTE(review): a positive key that simply does not resolve returns nil
# rather than raising - confirm that callers rely on this before changing.
def fetch(key, local_default = nil)
  found = self[key]
  return found if found
  return local_default if local_default

  raise IndexError, "#{key} is invalid" if key.to_i <= 0
end
Access an object from the PDF. key can be an int or a PDF::Reader::Reference
object.
If an int is used, the object with that ID and a generation number of 0 will be returned.
If a PDF::Reader::Reference
object is used the exact ID and generation number can be specified.
local_default is the object that will be returned if the requested key doesn’t exist.
Source
# File lib/pdf/reader/object_hash.rb, line 165
# True when check_key matches one of the keys yielded by #each_key.
#
# A PDF::Reader::Reference is compared for equality with each key; any
# other value is compared, via to_i, with each key's id.
def has_key?(check_key)
  # TODO update from O(n) to O(1)
  by_reference = check_key.kind_of?(PDF::Reader::Reference)

  each_key do |key|
    matched = by_reference ? check_key == key : check_key.to_i == key.id
    return true if matched
  end

  false
end
return true if the specified key exists in the file. key can be an int or a PDF::Reader::Reference
Source
# File lib/pdf/reader/object_hash.rb, line 182
# True when some object yielded by #each_value is == value.
def has_value?(value)
  # TODO update from O(n) to O(1)
  each_value { |obj| return true if obj == value }
  false
end
return true if the specified value exists in the file
Source
# File lib/pdf/reader/object_hash.rb, line 197
# Collect everything yielded by #each_key into a new Array.
def keys
  [].tap do |collected|
    each_key { |k| collected << k }
  end
end
return an array of all keys in the file
Source
# File lib/pdf/reader/object_hash.rb, line 54
# Name of the class of the object that ref resolves to, as a Symbol.
# Any StandardError raised during the lookup is swallowed and nil is
# returned instead.
def obj_type(ref)
  self[ref].class.to_s.to_sym
rescue StandardError
  nil
end
returns the type of object a ref points to
Source
# File lib/pdf/reader/object_hash.rb, line 89
# Resolve key through self[] when it is a PDF::Reader::Reference; hand any
# other value back untouched.
def object(key)
  return key unless key.is_a?(PDF::Reader::Reference)

  self[key]
end
If key is a PDF::Reader::Reference
object, lookup the corresponding object in the PDF and return it. Otherwise return key untouched.
Source
# File lib/pdf/reader/object_hash.rb, line 233
# Flattened list of whatever get_page_objects collects starting from the
# :Pages entry of the document root (trailer[:Root]).
#
# The result is memoised in @page_references. The root lookup lives inside
# the memoised expression so that repeat calls do not fetch the root object
# again only to throw it away.
def page_references
  @page_references ||= begin
    root = fetch(trailer[:Root])
    get_page_objects(root[:Pages]).flatten
  end
end
returns an array of PDF::Reader::References. Each reference in the array points to a Page
object, one for each page in the PDF. The first reference is page 1, second reference is page 2, etc.
Useful for apps that want to extract data from specific pages.
Source
# File lib/pdf/reader/object_hash.rb, line 242
# Strict boolean form of sec_handler: true when it is truthy.
def sec_handler?
  sec_handler ? true : false
end
Source
# File lib/pdf/reader/object_hash.rb, line 151
# Number of entries, as reported by the xref table's own #size.
def size
  xref.size
end
return the number of objects in the file. An object with multiple generations is counted once.
Source
# File lib/pdf/reader/object_hash.rb, line 61
# True only when ref is a known key (per has_key?) and the object it
# resolves to is a PDF::Reader::Stream.
def stream?(ref)
  return false unless has_key?(ref)

  self[ref].is_a?(PDF::Reader::Stream)
end
returns true if the supplied reference points to an object with a stream
Source
# File lib/pdf/reader/object_hash.rb, line 219
# Collect every [key, object] pair yielded by #each into a new Array.
def to_a
  [].tap do |pairs|
    each { |id, obj| pairs << [id, obj] }
  end
end
return an array of arrays. Each sub array contains a key/value pair.
Source
# File lib/pdf/reader/object_hash.rb, line 191
# Short human-readable summary that includes the current #size.
def to_s
  "<PDF::Reader::ObjectHash size: #{size}>"
end
Source
# File lib/pdf/reader/object_hash.rb, line 205
# Collect everything yielded by #each_value into a new Array.
def values
  [].tap do |collected|
    each_value { |v| collected << v }
  end
end
return an array of all values in the file
Source
# File lib/pdf/reader/object_hash.rb, line 213
# Look up each of the given ids via self[] and return the results in the
# same order.
def values_at(*ids)
  ids.each_with_object([]) do |id, found|
    found << self[id]
  end
end
return an array of all values from the specified keys
Private Instance Methods
Source
# File lib/pdf/reader/object_hash.rb, line 301
# Pick the security handler matching the trailer's :Encrypt dictionary.
#
# Each handler class is asked, in order, whether it supports the
# dereferenced dictionary: NullSecurityHandler, StandardSecurityHandler,
# StandardSecurityHandlerV5. If none does, an UnimplementedSecurityHandler
# is returned.
#
# opts - :password is forwarded to the Standard/V5 handlers.
def build_security_handler(opts = {})
  encrypt = deref(trailer[:Encrypt])

  return NullSecurityHandler.new if NullSecurityHandler.supports?(encrypt)

  if StandardSecurityHandler.supports?(encrypt)
    # Metadata counts as encrypted unless :EncryptMetadata is present and
    # its string form is something other than "true".
    encmeta = !encrypt.has_key?(:EncryptMetadata) ||
              encrypt[:EncryptMetadata].to_s == "true"

    return StandardSecurityHandler.new(
      key_length: (encrypt[:Length] || 40).to_i,
      revision: encrypt[:R],
      owner_key: encrypt[:O],
      user_key: encrypt[:U],
      permissions: encrypt[:P].to_i,
      encrypted_metadata: encmeta,
      file_id: (deref(trailer[:ID]) || []).first,
      password: opts[:password],
      # :CFM of the crypt filter named by :StmF, or nil if any level is absent.
      cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
    )
  end

  if StandardSecurityHandlerV5.supports?(encrypt)
    return StandardSecurityHandlerV5.new(
      O: encrypt[:O],
      U: encrypt[:U],
      OE: encrypt[:OE],
      UE: encrypt[:UE],
      password: opts[:password]
    )
  end

  UnimplementedSecurityHandler.new
end
Source
# File lib/pdf/reader/object_hash.rb, line 331
# Run obj through the active security handler, recursing into containers.
#
# ref - passed to sec_handler.decrypt alongside each piece of data.
# obj - the object to process:
#       * PDF::Reader::Stream - its data is replaced in place and the same
#         stream is returned
#       * Hash / Array        - a new container with every value/item
#         processed recursively (hash keys are left as they are)
#       * String              - the handler's result for that string
#       * anything else       - returned unchanged
def decrypt(ref, obj)
  case obj
  when PDF::Reader::Stream
    obj.data = sec_handler.decrypt(obj.data, ref)
    obj
  when Hash
    # Build the result directly instead of Hash[*flattened_pairs]: splatting
    # every key and value as an argument can exhaust the VM stack on very
    # large dictionaries and allocates a throwaway array.
    obj.each_with_object({}) do |(key, val), decrypted|
      decrypted[key] = decrypt(ref, val)
    end
  when Array
    obj.map { |item| decrypt(ref, item) }
  when String
    sec_handler.decrypt(obj, ref)
  else
    obj
  end
end
Source
# File lib/pdf/reader/object_hash.rb, line 272
# Recursive worker behind deref!.
#
# key  - a PDF::Reader::Reference or any other object.
# seen - Hash of results built so far, keyed by the Reference itself or,
#        for non-references, by the object's object_id.
#
# Hashes, Streams and Arrays are rebuilt as fresh copies whose children
# have been dereferenced too; every other value is returned as deref gave
# it. Each copy is registered in `seen` before its children are visited,
# so revisiting the same key returns the (possibly still filling) copy
# instead of recursing again.
def deref_internal!(key, seen)
  seen_key = key.is_a?(PDF::Reader::Reference) ? key : key.object_id
  return seen[seen_key] if seen.key?(seen_key)

  object = deref(key)
  case object
  when Hash
    copy = (seen[seen_key] ||= {})
    object.each { |k, value| copy[k] = deref_internal!(value, seen) }
    copy
  when PDF::Reader::Stream
    copy = (seen[seen_key] ||= PDF::Reader::Stream.new({}, object.data))
    object.hash.each { |k, value| copy.hash[k] = deref_internal!(value, seen) }
    copy
  when Array
    copy = (seen[seen_key] ||= [])
    object.each { |value| copy << deref_internal!(value, seen) }
    copy
  else
    object
  end
end
Private implementation of deref!, which exists to ensure the `seen` argument isn’t publicly available. It’s used to avoid endless loops in the recursion, and doesn’t need to be part of the public API.
Source
# File lib/pdf/reader/object_hash.rb, line 383
# Turn the constructor's input into something IO-like.
#
# input - returned as-is when it responds to both seek and read; otherwise,
#         if input.to_s names an existing file, that file's bytes are
#         loaded and wrapped in a StringIO.
#
# Raises ArgumentError when neither applies.
def extract_io_from(input)
  return input if input.respond_to?(:seek) && input.respond_to?(:read)
  return StringIO.new(read_as_binary(input)) if File.file?(input.to_s)

  raise ArgumentError, "input must be an IO-like object or a filename"
end
Source
# File lib/pdf/reader/object_hash.rb, line 251
# Parse a regular object whose xref entry is an Integer (used as the seek
# position for a new buffer), then pass the result through decrypt.
#
# Returns nil when the xref entry for key is not an Integer.
def fetch_object(key)
  return unless xref[key].is_a?(Integer)

  buf = new_buffer(xref[key])
  parsed = Parser.new(buf, self).object(key.id, key.gen)
  decrypt(key, parsed)
end
parse a traditional object from the PDF, starting from the byte offset indicated in the xref table
Source
# File lib/pdf/reader/object_hash.rb, line 260
# Fetch an object whose xref entry is itself a PDF::Reader::Reference,
# i.e. a pointer to the container that holds it.
#
# The container is wrapped in a PDF::Reader::ObjectStream once and kept in
# object_streams; the requested object is then looked up in it by key.id.
#
# Returns nil when the xref entry for key is not a Reference.
def fetch_object_stream(key)
  return unless xref[key].is_a?(PDF::Reader::Reference)

  container_key = xref[key]
  container = (object_streams[container_key] ||=
                 PDF::Reader::ObjectStream.new(object(container_key)))
  container[key.id]
end
parse an object that’s embedded in an object stream in the PDF
Source
# File lib/pdf/reader/object_hash.rb, line 362
# Walk the page tree below ref.
#
# Returns ref itself when the dereferenced dict has :Type == :Page, a
# (possibly nested) array of results for each entry of :Kids when that key
# is present, and nil when the dict has neither.
#
# Raises MalformedPDFError when ref does not dereference to a Hash.
def get_page_objects(ref)
  obj = deref(ref)
  raise MalformedPDFError, "Dereferenced page object must be a dict" unless obj.kind_of?(::Hash)

  return ref if obj[:Type] == :Page
  return unless obj[:Kids]

  deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
end
returns a nested array of object references for all pages in this object store.
Source
# File lib/pdf/reader/object_hash.rb, line 348
# Create a PDF::Reader::Buffer over @io, passing offset as its :seek option.
def new_buffer(offset = 0)
  PDF::Reader::Buffer.new(@io, seek: offset)
end
Source
# File lib/pdf/reader/object_hash.rb, line 356
# Lazily created Hash used by fetch_object_stream to keep one ObjectStream
# per container key. (The backing ivar is the singular @object_stream.)
def object_streams
  @object_stream ||= {}
end
Source
# File lib/pdf/reader/object_hash.rb, line 393
# Read the whole file named by input.to_s in binary mode.
# Uses File.binread when File responds to it, otherwise opens the file
# with mode "rb" and reads it.
def read_as_binary(input)
  path = input.to_s
  return File.binread(path) if File.respond_to?(:binread)

  File.open(path, "rb", &:read)
end
Source
# File lib/pdf/reader/object_hash.rb, line 376
# Read the PDF version from the first 10 bytes of @io.
#
# Returns the version as a Float (e.g. 1.4), or 0.0 when the header does
# not contain "PDF-<digit>.<digit>". @io is rewound to position 0 both
# before and after the read.
def read_version
  @io.seek(0)
  # IO#read(n) returns nil at EOF; to_s keeps an empty stream from raising.
  header = @io.read(10).to_s
  @io.seek(0)
  # The dot is escaped so only a literal "." separates the two digits.
  # A failed match yields nil, and nil.to_f is 0.0.
  header[/PDF-(\d\.\d)/, 1].to_f
end