class HexaPDF::Parser
Parses an IO stream according to PDF1.7 to get at the contained objects.
This class also contains higher-level methods for getting indirect objects and revisions.
See: PDF1.7 s7
Attributes
The IO stream which is parsed.
Public Class Methods
Creates a new parser for the given IO object.
PDF references are resolved using the associated Document object.
# File lib/hexapdf/parser.rb, line 56
#
# Sets up a parser reading from +io+ on behalf of +document+.
#
# The callable stored under the document's 'parser.on_correctable_error' config key is curried
# with the document and handed to the tokenizer. As a last step the PDF header offset and
# version are looked up.
def initialize(io, document)
  @io = io
  @document = document
  @object_stream_data = {}
  @reconstructed_revision = nil
  @in_reconstruct_revision = false

  error_handler = document.config['parser.on_correctable_error'].curry[document]
  @tokenizer = Tokenizer.new(io, on_correctable_error: error_handler)

  retrieve_pdf_header_offset_and_version
end
Public Instance Methods
Returns the PDF version number that is stored in the file header.
See: PDF1.7 s7.5.2
# File lib/hexapdf/parser.rb, line 376
#
# Returns the version that was captured from the file header when the parser was created.
#
# Signals a malformed PDF (at position 0) when no version was captured.
def file_header_version
  @header_version || raise_malformed("PDF file header is missing or corrupt", pos: 0)
end
Loads the compressed object identified by the cross-reference entry.
# File lib/hexapdf/parser.rb, line 197
#
# Loads the object that the given cross-reference entry locates inside an object stream.
#
# The parsed data of each object stream is cached by the oid of the stream, so a stream is only
# parsed the first time one of its objects is requested. The result is whatever
# +object_by_index+ returns (splatted), followed by the generation of the entry and +nil+.
def load_compressed_object(xref_entry)
  stream_oid = xref_entry.objstm

  unless @object_stream_data.key?(stream_oid)
    container = @document.object(stream_oid)
    unless container.respond_to?(:parse_stream)
      raise_malformed("Object with oid=#{stream_oid} is not an object stream")
    end
    @object_stream_data[stream_oid] = container.parse_stream
  end

  located = @object_stream_data[stream_oid].object_by_index(xref_entry.pos)
  [*located, xref_entry.gen, nil]
end
Loads the indirect (potentially compressed) object specified by the given cross-reference entry.
For information about the xref_entry argument, have a look at HexaPDF::XRefSection and HexaPDF::XRefSection::Entry.
# File lib/hexapdf/parser.rb, line 72
#
# Loads the object described by +xref_entry+ and returns it wrapped by the document.
#
# In-use entries are parsed from their file position, compressed entries are taken from their
# object stream and free entries yield a wrapped +nil+. If anything along the way signals a
# malformed PDF, the object is looked up in the reconstructed revision instead, falling back to
# a wrapped +nil+ carrying the oid and generation of the entry.
def load_object(xref_entry)
  entry_oid = xref_entry.oid
  entry_gen = xref_entry.gen

  obj, oid, gen, stream =
    case xref_entry.type
    when :free
      [nil, entry_oid, entry_gen, nil]
    when :compressed
      load_compressed_object(xref_entry)
    when :in_use
      if xref_entry.pos != 0 || entry_oid == 0
        parse_indirect_object(xref_entry.pos)
      else
        # Handle seen-in-the-wild objects with invalid offset 0
        maybe_raise("Indirect object (#{entry_oid},#{entry_gen}) has offset 0", pos: 0)
        [nil, entry_oid, entry_gen, nil]
      end
    else
      raise_malformed("Invalid cross-reference type '#{xref_entry.type}' encountered")
    end

  # Entries with oid 0 are exempt from the consistency check.
  consistent = entry_oid == 0 || (oid == entry_oid && gen == entry_gen)
  unless consistent
    raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
                    "the values (#{entry_oid},#{entry_gen}) from the xref")
  end

  @document.wrap(obj, oid: oid, gen: gen, stream: stream)
rescue HexaPDF::MalformedPDFError
  recovered = reconstructed_revision.object(xref_entry)
  recovered || @document.wrap(nil, oid: xref_entry.oid, gen: xref_entry.gen)
end
Loads a single revision whose cross-reference section/stream is located at the given position.
Returns a HexaPDF::XRefSection object and the accompanying trailer dictionary.
# File lib/hexapdf/parser.rb, line 213
#
# Loads the revision whose cross-reference data starts at +pos+ and returns the pair
# [xref_section, trailer].
#
# If a cross-reference section is found at +pos+, it is parsed together with its trailer.
# Otherwise the object at +pos+ is loaded and has to respond to +xref_section+; a missing entry
# for that object itself is a correctable error and the entry is added. The entry for oid 0 is
# always removed from the returned section.
def load_revision(pos)
  section, trailer =
    if xref_section?(pos)
      parse_xref_section_and_trailer(pos)
    else
      stream_obj = load_object(XRefSection.in_use_entry(0, 0, pos))
      unless stream_obj.respond_to?(:xref_section)
        raise_malformed("Object is not a cross-reference stream", pos: pos)
      end

      stream_section =
        begin
          stream_obj.xref_section
        rescue MalformedPDFError => e
          # Attach the position of the stream object before passing the error on.
          e.pos = pos
          raise
        end
      stream_trailer = stream_obj.trailer

      unless stream_section.entry?(stream_obj.oid, stream_obj.gen)
        maybe_raise("Cross-reference stream doesn't contain entry for itself", pos: pos)
        stream_section.add_in_use_entry(stream_obj.oid, stream_obj.gen, pos)
      end

      [stream_section, stream_trailer]
    end

  section.delete(0)
  [section, trailer]
end
Parses the indirect object at the specified offset.
This method is used by a PDF Document to load objects. It should not be used by any other object because invalid object positions lead to errors.
Returns an array containing [object, oid, gen, stream].
See: PDF1.7 s7.3.10, s7.3.8
# File lib/hexapdf/parser.rb, line 110
#
# Parses the indirect object at +offset+ (relative to the PDF header) or, if no offset is
# given, at the current tokenizer position.
#
# Returns the array [object, oid, gen, stream] where +stream+ is only set if the object value
# is followed by the keyword 'stream'.
#
# Problems that can be worked around are reported through +maybe_raise+; problems that can't
# (no 'oid gen obj' header, a stream without dictionary, a missing end-of-line after 'stream',
# no 'endstream' keyword at all) are reported through +raise_malformed+.
def parse_indirect_object(offset = nil)
  @tokenizer.pos = offset + @header_offset if offset
  oid = @tokenizer.next_token
  gen = @tokenizer.next_token
  tok = @tokenizer.next_token
  unless oid.kind_of?(Integer) && gen.kind_of?(Integer) &&
      tok.kind_of?(Tokenizer::Token) && tok == 'obj'
    raise_malformed("No valid object found", pos: offset)
  end

  # The peeked token is kept in +tok+ because the rescue branch below inspects it.
  if (tok = @tokenizer.peek_token) && tok.kind_of?(Tokenizer::Token) && tok == 'endobj'
    maybe_raise("No indirect object value between 'obj' and 'endobj'", pos: @tokenizer.pos)
    object = nil
  else
    begin
      object = @tokenizer.next_object
    rescue MalformedPDFError
      if tok.kind_of?(Tokenizer::Token) && tok =~ /\A\d+endobj\z/
        # Handle often found invalid indirect object with missing whitespace after number
        maybe_raise("Missing whitespace after number", pos: @tokenizer.pos)
        object = tok.to_i
        @tokenizer.pos -= 6 # step back over the six characters of 'endobj'
      else
        # +object+ stays nil in this case
        maybe_raise("Invalid value after '#{oid} #{gen} obj', treating as null",
                    pos: @tokenizer.pos)
      end
    end
  end

  tok = @tokenizer.next_token

  if tok.kind_of?(Tokenizer::Token) && tok == 'stream'
    unless object.kind_of?(Hash)
      raise_malformed("A stream needs a dictionary, not a(n) #{object.class}", pos: offset)
    end
    tok1 = @tokenizer.next_byte
    if tok1 == 32 # space
      maybe_raise("Keyword stream followed by space instead of LF or CR/LF", pos: @tokenizer.pos)
      tok1 = @tokenizer.next_byte
    end
    tok2 = @tokenizer.next_byte if tok1 == 13 # CR
    if tok1 != 10 && tok1 != 13
      raise_malformed("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos)
    elsif tok1 == 13 && tok2 != 10
      maybe_raise("Keyword stream must be followed by LF or CR/LF, not CR alone",
                  pos: @tokenizer.pos)
      @tokenizer.pos -= 1 # the byte after the lone CR already belongs to the stream data
    end

    # Note that getting :Length might move the IO pointer (when resolving references)
    pos = @tokenizer.pos
    length = if object[:Length].kind_of?(Integer)
               object[:Length]
             elsif object[:Length].kind_of?(Reference)
               @document.deref(object[:Length]).value
             else
               0
             end
    # Fall back to the start of the stream data if the position can't be computed or set.
    @tokenizer.pos = pos + length rescue pos

    tok = @tokenizer.next_token
    unless tok.kind_of?(Tokenizer::Token) && tok == 'endstream'
      # The declared length is wrong: search for the keyword and derive the length from it.
      maybe_raise("Invalid stream length, keyword endstream not found", pos: @tokenizer.pos)
      @tokenizer.pos = pos
      if @tokenizer.scan_until(/(?=\n?endstream)/)
        length = @tokenizer.pos - pos
        tok = @tokenizer.next_token
      else
        raise_malformed("Stream content must be followed by keyword endstream",
                        pos: @tokenizer.pos)
      end
    end
    tok = @tokenizer.next_token

    object[:Length] = length
    stream = StreamData.new(@tokenizer.io, offset: pos, length: length,
                            filter: @document.unwrap(object[:Filter]),
                            decode_parms: @document.unwrap(object[:DecodeParms]))
  end

  unless tok.kind_of?(Tokenizer::Token) && tok == 'endobj'
    maybe_raise("Indirect object must be followed by keyword endobj", pos: @tokenizer.pos)
  end

  [object, oid, gen, stream]
end
Parses the cross-reference section at the given position and the following trailer and returns them as an array consisting of a HexaPDF::XRefSection instance and a hash.
This method can only parse cross-reference sections, not cross-reference streams!
See: PDF1.7 s7.5.4, s7.5.5; ADB1.7 sH.3-3.4.3
# File lib/hexapdf/parser.rb, line 251
#
# Parses the cross-reference section starting at +offset+ (relative to the PDF header) and the
# trailer following it.
#
# Returns the array [xref, trailer] with +xref+ being an XRefSection and +trailer+ a Hash.
#
# A missing 'xref' or 'trailer' keyword, a subsection header without an integer entry count and
# a trailer that is not a Hash are reported through +raise_malformed+. Invalid entries and an
# invalid numbering of the main section go through +maybe_raise+.
def parse_xref_section_and_trailer(offset)
  @tokenizer.pos = offset + @header_offset
  token = @tokenizer.next_token
  unless token.kind_of?(Tokenizer::Token) && token == 'xref'
    raise_malformed("Xref section doesn't start with keyword xref", pos: @tokenizer.pos)
  end

  xref = XRefSection.new
  start = @tokenizer.next_token
  # Each subsection starts with two integers: the first oid and the number of entries.
  while start.kind_of?(Integer)
    number_of_entries = @tokenizer.next_token
    unless number_of_entries.kind_of?(Integer)
      raise_malformed("Invalid cross-reference subsection start", pos: @tokenizer.pos)
    end

    @tokenizer.skip_whitespace
    start.upto(start + number_of_entries - 1) do |oid|
      # The tokenizer yields to the block on an invalid entry; the error is forced when the
      # tokenizer flags the entry as not recoverable.
      pos, gen, type = @tokenizer.next_xref_entry do |recoverable|
        maybe_raise("Invalid cross-reference entry", pos: @tokenizer.pos, force: !recoverable)
      end
      if xref.entry?(oid)
        # The first entry read for an oid wins, later ones are skipped.
        next
      elsif type == 'n'
        if pos == 0 || gen > 65535
          # If the error is not raised, the invalid in-use entry is stored as free entry.
          maybe_raise("Invalid in use cross-reference entry", pos: @tokenizer.pos)
          xref.add_free_entry(oid, gen)
        else
          xref.add_in_use_entry(oid, gen, pos)
        end
      else
        xref.add_free_entry(oid, gen)
      end
    end
    start = @tokenizer.next_token
  end

  unless start.kind_of?(Tokenizer::Token) && start == 'trailer'
    raise_malformed("Trailer doesn't start with keyword trailer", pos: @tokenizer.pos)
  end

  trailer = @tokenizer.next_object
  unless trailer.kind_of?(Hash)
    raise_malformed("Trailer is #{trailer.class} instead of dictionary ", pos: @tokenizer.pos)
  end

  # A section without /Prev in its trailer that has entries but none for oid 0 is treated as
  # wrongly numbered: all oids are shifted down by the oid of the first entry.
  unless trailer[:Prev] || xref.max_oid == 0 || xref.entry?(0)
    first_entry = xref[xref.oids[0]]
    test_entry = xref[xref.oids[-1]]
    # Read the oid that is actually stored in the file at the position of the last entry.
    @tokenizer.pos = test_entry.pos + @header_offset
    test_oid = @tokenizer.next_token
    first_oid = first_entry.oid

    # The shift is only attempted if the first entry is free with generation 65535 and the
    # last entry's oid is off from the oid found in the file by exactly +first_oid+.
    force_failure = !first_entry.free? || first_entry.gen != 65535 ||
      !test_oid.kind_of?(Integer) || xref.oids[-1] - test_oid != first_oid
    maybe_raise("Main cross-reference section has invalid numbering",
                pos: offset + @header_offset, force: force_failure)

    new_xref = XRefSection.new
    xref.oids.each do |oid|
      entry = xref[oid]
      entry.oid -= first_oid
      # +send+ is used for the assignment - presumably because XRefSection#[]= is not public
      new_xref.send(:[]=, entry.oid, entry.gen, entry)
    end
    xref = new_xref
  end

  [xref, trailer]
end
Returns the reconstructed revision.
# File lib/hexapdf/parser.rb, line 369
#
# Returns the reconstructed revision, reconstructing it on first use.
#
# A truthy result is stored and reused; as long as nothing truthy is stored, each call attempts
# the reconstruction again.
def reconstructed_revision
  return @reconstructed_revision if @reconstructed_revision

  @reconstructed_revision = reconstruct_revision
end
Returns the offset of the main cross-reference section/stream.
Implementation note: Normally, the %%EOF marker has to be on the last line, however, Adobe viewers relax this restriction and so do we.
If strict parsing is disabled, the whole file is searched for the offset.
See: PDF1.7 s7.5.5, ADB1.7 sH.3-3.4.4
# File lib/hexapdf/parser.rb, line 330 def startxref_offset return @startxref_offset if defined?(@startxref_offset) @io.seek(0, IO::SEEK_END) step_size = 1024 pos = @io.pos eof_not_found = pos == 0 startxref_missing = false while pos != 0 @io.pos = [pos - step_size, 0].max pos = @io.pos lines = @io.read(step_size + 40).split(/[\r\n]+/) eof_index = lines.rindex {|l| l.strip == '%%EOF' } unless eof_index eof_not_found = true next end unless eof_index >= 2 && lines[eof_index - 2].strip == "startxref" startxref_missing = true next end break # we found the startxref offset end if eof_not_found maybe_raise("PDF file trailer with end-of-file marker not found", pos: pos, force: !eof_index) elsif startxref_missing maybe_raise("PDF file trailer is missing startxref keyword", pos: pos, force: eof_index < 2 || lines[eof_index - 2].strip != "startxref") end @startxref_offset = lines[eof_index - 1].to_i end
Looks at the given offset and returns true if there is a cross-reference section at that position.
# File lib/hexapdf/parser.rb, line 239
#
# Tells whether the next token at +offset+ (relative to the PDF header) is the keyword 'xref'.
#
# The token is only peeked at, but the tokenizer is left positioned at the given offset.
def xref_section?(offset)
  @tokenizer.pos = offset + @header_offset
  candidate = @tokenizer.peek_token
  return false unless candidate.kind_of?(Tokenizer::Token)

  candidate == 'xref'
end
Private Instance Methods
Calls the block stored in the config option parser.on_correctable_error with the document, the given message and the position. If the returned value is true, raises a HexaPDF::MalformedPDFError. Otherwise the error is corrected and parsing continues.
If the option force is used, the block is not called and the error is raised immediately.
# File lib/hexapdf/parser.rb, line 486
#
# Reports a correctable error with the message +msg+ at position +pos+.
#
# Unless +force+ is set, the callable stored under the config key
# 'parser.on_correctable_error' is invoked with the document, the message and the position, and
# nothing is raised if it returns a falsy value. Otherwise a HexaPDF::MalformedPDFError is
# raised whose backtrace starts at the caller of this method.
def maybe_raise(msg, pos:, force: false)
  # With +force+ set, the callable is not even looked up.
  return unless force ||
    @document.config['parser.on_correctable_error'].call(@document, msg, pos)

  failure = HexaPDF::MalformedPDFError.new(msg, pos: pos)
  failure.set_backtrace(caller(1))
  raise failure
end
Raises a HexaPDF::MalformedPDFError with the given message and source position.
# File lib/hexapdf/parser.rb, line 477
#
# Always raises a HexaPDF::MalformedPDFError built from +msg+ and the optional position +pos+.
def raise_malformed(msg, pos: nil)
  failure = HexaPDF::MalformedPDFError.new(msg, pos: pos)
  raise failure
end
Tries to reconstruct the PDF document's main cross-reference table by serially parsing the file and returning a Revision object for loading the found objects.
If the file contains multiple cross-reference sections, all objects will be put into a single cross-reference table, later objects overwriting prior ones.
# File lib/hexapdf/parser.rb, line 403
#
# Rebuilds a cross-reference table by scanning the whole file for 'oid gen obj' headers and
# 'trailer' keywords and returns a Revision that loads objects through the found positions.
#
# Must be called while an exception is being handled: the current exception (<tt>$!</tt>) is
# re-raised if the config option 'parser.try_xref_reconstruction' is not set, and is otherwise
# used for the message passed to the 'parser.on_correctable_error' callable.
#
# Returns +nil+ when called while a reconstruction is already in progress. Reports a malformed
# PDF through +raise_malformed+ if no trailer can be determined.
#
# The in-progress flag is only set once it is clear that the reconstruction is attempted and it
# is reset however the method is left. Otherwise a disabled reconstruction or an unexpected
# error would leave the flag set and turn all later calls into silent no-ops.
def reconstruct_revision
  return if @in_reconstruct_revision
  raise unless @document.config['parser.try_xref_reconstruction']

  @in_reconstruct_revision = true
  begin
    msg = "#{$!} - trying cross-reference table reconstruction"
    @document.config['parser.on_correctable_error'].call(@document, msg, @tokenizer.pos)

    xref = XRefSection.new
    @tokenizer.pos = 0
    linearized = nil
    while true
      @tokenizer.skip_whitespace
      pos = @tokenizer.pos
      # Remember where the next line starts so that unusable lines can be skipped as a whole.
      @tokenizer.scan_until(/(\n|\r\n?)+|\z/)
      next_new_line_pos = @tokenizer.pos
      @tokenizer.pos = pos

      token = @tokenizer.next_integer_or_keyword rescue nil
      if token.kind_of?(Integer)
        gen = @tokenizer.next_integer_or_keyword rescue nil
        tok = @tokenizer.next_integer_or_keyword rescue nil
        if @tokenizer.pos > next_new_line_pos
          # The three tokens are not on one line, so this is not an object header.
          @tokenizer.pos = next_new_line_pos
        elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj'
          xref.add_in_use_entry(token, gen, pos)
          if linearized.nil?
            # Only the first object found decides whether the file counts as linearized.
            obj = @tokenizer.next_object rescue nil
            linearized = obj.kind_of?(Hash) && obj.key?(:Linearized)
          end
          @tokenizer.scan_until(/(?:\n|\r\n?)endobj\b/)
        end
      elsif token.kind_of?(Tokenizer::Token) && token == 'trailer'
        obj = @tokenizer.next_object rescue nil
        # Use last trailer found in case of multiple revisions but use first trailer in case of
        # linearized file.
        trailer = obj if obj.kind_of?(Hash) && (!linearized || trailer.nil?)
      elsif token == Tokenizer::NO_MORE_TOKENS
        break
      else
        @tokenizer.pos = next_new_line_pos
      end
    end

    if !trailer || trailer.empty?
      # No usable trailer was found by scanning: first try the regular way of loading the
      # revision, then search the found objects for the catalog.
      _, trailer = load_revision(startxref_offset) rescue nil
      unless trailer
        xref.each do |_oid, _gen, xref_entry|
          obj, * = parse_indirect_object(xref_entry.pos) rescue nil
          if obj.kind_of?(Hash) && obj[:Type] == :Catalog
            trailer = {Root: HexaPDF::Reference.new(xref_entry.oid, xref_entry.gen)}
            break
          end
        end
      end
      unless trailer
        raise_malformed("Could not reconstruct malformed PDF because trailer was not found",
                        pos: 0)
      end
    end
    trailer&.delete(:Prev) # no need for this and may wreak havoc

    loader = lambda do |xref_entry|
      obj, oid, gen, stream = parse_indirect_object(xref_entry.pos)
      obj = @document.wrap(obj, oid: oid, gen: gen, stream: stream)
      @document.security_handler ? @document.security_handler.decrypt(obj) : obj
    end

    # The scan is finished, so the flag is cleared before the revision object is built.
    @in_reconstruct_revision = false
    Revision.new(@document.wrap(trailer, type: :XXTrailer), xref_section: xref,
                 loader: loader)
  ensure
    @in_reconstruct_revision = false
  end
end
Retrieves the offset of the PDF header and the PDF version number in it.
The PDF header should normally appear on the first line. However, Adobe relaxes this restriction so that the header may appear in the first 1024 bytes. We follow the Adobe convention.
See: PDF1.7 s7.5.2, ADB1.7 sH.3-3.4.1
# File lib/hexapdf/parser.rb, line 392
#
# Searches the first 1024 bytes of the IO for a header of the form '%PDF-x.y'.
#
# Sets @header_offset to the position of the header within the data that was read (0 if there
# is no header) and @header_version to the 'x.y' part (+nil+ if there is no header).
def retrieve_pdf_header_offset_and_version
  @io.seek(0)
  # IO#read returns nil at the end of the IO, i.e. for an empty IO.
  header = (@io.read(1024) || '').match(/%PDF-(\d\.\d)/)
  @header_offset = header ? header.begin(0) : 0
  @header_version = header && header[1]
end