a PDF parser
# File misc/pdfparse.rb, line 71
# Creates a parser. When +str+ is given (and truthy) it is handed to #read
# immediately; otherwise the instance is left untouched.
def initialize(str=nil)
  return unless str
  read(str)
end
Dereferences the references reachable from the specified root object, down to the specified depth.
# File misc/pdfparse.rb, line 333
# Resolves +obj+ if it is a Ref, then (while +depth+ allows) returns a copy of
# the resulting Hash/Array whose elements have been resolved recursively.
#
# obj::   any parsed object; non-Ref, non-container values are returned as is
# depth:: number of levels to resolve; with the default of 1 only +obj+ itself
#         is resolved and containers are returned without being copied
#
# Resolved objects are cached in @xrefs[gen][id]. The read offset @off is
# always restored, even when the lookup or the parse fails.
#
# Raises RuntimeError ('unknown ref off') when no offset is known for the ref.
def deref(obj, depth=1)
  if obj.kind_of? Ref
    cache = (@xrefs[obj.gen] ||= {})
    resolved = cache[obj.id]
    unless resolved
      # look the offset up before touching @off, and tolerate an unknown generation
      target = (@xoff[obj.gen] || {})[obj.id]
      raise 'unknown ref off' unless target
      saved = @off
      begin
        @off = target
        puts "deref #{obj.gen} #{obj.id} => #{@off.to_s(16)}" if $DEBUG
        # :poil is stored when nothing could be read, so the slot is not re-parsed
        resolved = cache[obj.id] = readany || :poil
      ensure
        @off = saved
      end
    end
    obj = resolved
  end

  depth -= 1
  if depth > 0
    case obj
    when Hash
      obj = obj.dup
      obj.each { |k, v| obj[k] = deref(v, depth) }
    when Array
      obj = obj.dup
      obj.each_with_index { |v, i| obj[i] = deref(v, depth) }
    end
  end
  obj
end
# File misc/pdfparse.rb, line 244
# Attaches stream contents to a stream dictionary.
#
# hash:: the stream dictionary; its 'Filter' entry may be absent, a single
#        name or an array of names
# data:: the raw stream bytes
#
# A lone 'FlateDecode' filter is inflated with Zlib; any other filter set is
# reported on stdout and the raw bytes are kept. The (possibly decoded) bytes
# are stored under hash[:data] and the same hash object is returned.
def newstream(hash, data)
  filters = [hash['Filter']].flatten.compact
  if filters.length == 0
    # unfiltered stream: keep the bytes untouched
  elsif filters.length == 1 and filters.first == 'FlateDecode'
    data = Zlib::Inflate.inflate(data)
  else
    puts "stream filter #{filters.inspect} unsupported"
  end
  hash[:data] = data
  hash
end
returns the nr-th page of the pdf as a PSPage
# File misc/pdfparse.rb, line 372
# Returns the nr-th page (counted from 1) as a PSPage, or nil when the tree
# holds fewer than +nr+ pages.
#
# nr:: page number; values <= 1 select the first page
# ar:: list of page-tree nodes to search, defaulting to the Kids of the
#      document's root Pages node
#
# A node with a 'Count' entry is treated as an intermediate node covering that
# many pages: it is either descended into or skipped as a whole. Any other
# node is treated as a single page.
def page(nr, ar=@trailer['Root']['Pages']['Kids'])
  ar.each { |kid|
    count = kid['Count']
    if count
      return page(nr, kid['Kids']) if nr <= count
      nr -= count
    else
      nr -= 1
      return PSPage.new(page_data(kid['Contents'])) if nr <= 0
    end
  }
  # page not found: report that explicitly instead of leaking the Kids array
  nil
end
returns the :data field for a Hash or the concatenation of the :data fields of the children for an Array
# File misc/pdfparse.rb, line 354
# Returns the content bytes for a page's 'Contents' entry.
#
# ct:: a stream Hash, an Array of streams, or a reference to either
#
# For a Hash the :data field is returned; for an Array the :data fields of the
# children are concatenated in order. The dereferenced object (and each
# dereferenced child) is what gets indexed, so this does not rely on the
# reference object forwarding calls to its target.
def page_data(ct)
  ct = deref(ct)
  if ct.kind_of? Array
    ct.map { |c| deref(c)[:data] }.join
  else
    ct[:data]
  end
end
Reads a string as a PDF and interprets its basic information (header, trailer, xref table).
# File misc/pdfparse.rb, line 76
# Loads +str+ as the PDF to parse: stores it, reads the header line, then
# locates the last "trailer" keyword and parses the trailer from there.
#
# Returns self. Raises 'bad pdf: no trailer' when the keyword is absent.
def read(str)
  @str = str
  @off = 0
  readhdr
  # search backwards from the end of the data for the last trailer
  @off = @str.rindex("trailer", @str.length)
  raise 'bad pdf: no trailer' unless @off
  readtrailer
  self
end
Reads and returns any PDF object, chosen (mostly) according to its first character. Updates @xrefs if the object is indirect.
# File misc/pdfparse.rb, line 276
# Reads the object found at @off and returns it, dispatching on its first
# character: name, number (or indirect object / reference), array, string,
# dictionary (optionally followed by a stream) or a bare keyword.
#
# Returns nil at end of input. The keywords true/false/null are returned as
# symbols; an 'xref' section is consumed (table, trailer hash, startxref) and
# :xref is returned. Raises "unknown cmd ..." for any other keyword.
def readany
  case @str[@off, 1]
  when nil then nil
  when '/' then readname
  when '+', '-' then readint
  when '0'..'9' then readnumorref
  when '[' then readarray
  when '(' then readstr
  when '<'
    # '<<' opens a dictionary, a single '<' opens a hex string
    @str[@off+1, 1] == '<' ? readhashorstream : readstr
  else
    cmd = readcmd
    case cmd
    when 'true', 'false', 'null'
      cmd.to_sym
    when 'xref'
      readxrtable
      (@trailer ||= {}).update readhash if readcmd == 'trailer'
      readint if readcmd == 'startxref'
      :xref
    else
      raise "unknown cmd #{cmd.inspect}"
    end
  end
end

# Helper for readany: reads a number and, when a second number follows,
# checks whether the pair introduces an indirect object ("id gen obj") or a
# reference ("id gen R"). Otherwise the second number is left unread.
# An indirect object is cached in @xrefs[gen][id] and its value returned.
def readnumorref
  num = readint
  return num unless ('0'..'9').include?(@str[@off, 1])

  rewind = @off
  gen = readint
  case readcmd
  when 'obj'
    objs = (@xrefs[gen] ||= {})
    num = (objs[num] ||= readany)
    raise 'no endobj' if readcmd != 'endobj'
  when 'R'
    num = Ref.new(self, gen, num)
  else
    # plain number followed by another number: un-read the second one
    @off = rewind
  end
  num
end

# Helper for readany: reads a dictionary and, when it is followed by the
# 'stream' keyword, the stream body whose size is given by the 'Length' entry.
def readhashorstream
  dict = readhash
  return dict unless @str[@off, 6] == 'stream'

  # readcmd may eat spaces that are part of the stream
  eol = @str.index("\n", @off)
  return dict unless eol

  len = dict['Length'].to_i
  dict = newstream(dict, @str[eol+1, len])
  @off = eol+1+len
  skipspc
  raise 'no endstream' if readcmd != 'endstream'
  dict
end
# File misc/pdfparse.rb, line 208
# Reads an array ("[ ... ]") starting at @off and returns its elements as a
# Ruby Array, each element being read with readany.
# Returns nil, without moving @off, when @off is not on a '['.
def readarray
  return unless @str[@off, 1] == '['

  items = []
  @off += 1
  skipspc
  until @str[@off, 1] == ']' or @off >= @str.length
    items << readany
  end
  # step over the closing bracket
  @off += 1
  skipspc
  items
end
# File misc/pdfparse.rb, line 231
# Reads a bare keyword: characters are collected from @off up to the next
# white-space or delimiter character (or the end of the data), then trailing
# blanks are skipped. Returns the keyword, which may be empty.
def readcmd
  cmd = ''
  while (ch = @str[@off, 1])
    break if ch =~ /[\s\(\)\{\}<>\[\]\/%]/
    cmd << ch
    @off += 1
  end
  skipspc
  cmd
end
# File misc/pdfparse.rb, line 219
# Reads a dictionary ("<< /Key value ... >>") starting at @off and returns it
# as a Hash keyed by name. Entries whose value is :null are dropped.
# Returns nil, without moving @off, when @off is not on '<<'.
def readhash
  return unless @str[@off, 2] == '<<'

  dict = {}
  @off += 2
  skipspc
  until @str[@off, 2] == '>>' or @off >= @str.length
    # the key is read first, then its value
    key = readname
    dict[key] = readany
  end
  dict.delete_if { |_key, value| value == :null }
  # step over the closing '>>'
  @off += 2
  skipspc
  dict
end
# File misc/pdfparse.rb, line 85
# Reads the header line: everything from @off up to (not including) the next
# "\n" is stored in @hdr, and @off is left on that newline.
#
# Returns the header text. Raises 'bad pdf: no header' when the data holds no
# newline after @off; @off is left unchanged in that case.
def readhdr
  start = @off
  eol = @str.index("\n", start)
  raise 'bad pdf: no header' unless eol
  @off = eol
  # the length is relative to the start offset, so a non-zero @off works too
  @hdr = @str[start, eol - start]
end
# File misc/pdfparse.rb, line 126
# Reads a number at @off: an optional leading sign, digits, and at most one
# decimal point. Trailing blanks are skipped afterwards.
#
# Returns a Float when the text holds a '.', an Integer otherwise, and nil
# (without skipping blanks) when no numeric character is found at @off.
def readint
  digits = ''
  while (ch = @str[@off, 1])
    case ch
    when '+', '-'
      # a sign is only accepted as the very first character
      break unless digits.empty?
    when '.'
      # a second decimal point ends the number
      break if digits.include?('.')
    when '0'..'9'
      # plain digit: always accepted
    else
      break
    end
    digits << ch
    @off += 1
  end
  return if digits.empty?

  skipspc
  digits.include?('.') ? digits.to_f : digits.to_i
end
# File misc/pdfparse.rb, line 193
# Reads a name object ("/Name") at @off and returns it without the leading
# slash. "#xx" sequences are replaced by the character with that hexadecimal
# code. Trailing blanks are skipped afterwards.
#
# The name ends at white space, at the end of the data, or at any delimiter;
# '%' is included so that a comment directly following a name is not swallowed
# into it (same delimiter set as readcmd).
#
# Returns nil, without moving @off, when @off is not on a '/'.
def readname
  return if @str[@off, 1] != '/'
  buf = ''
  loop do
    @off += 1
    case c = @str[@off, 1]
    when '#'
      # to_s guards against a '#' sitting at the very end of the data
      buf << @str[@off+1, 2].to_s.to_i(16)
      @off += 2
    when nil, /[\s\(\)\{\}<>\[\]\/%]/
      break
    else
      buf << c
    end
  end
  skipspc
  buf
end
# File misc/pdfparse.rb, line 143
# Reads a string object at @off and returns its decoded contents.
#
# Two forms are handled:
# * literal strings "( ... )": balanced inner parentheses are kept; the
#   escapes \n \r \t \b \f and \ddd (1 to 3 octal digits, reduced to one byte)
#   are decoded; a backslash followed by an end of line is dropped (line
#   continuation); a backslash before any other character yields that
#   character.
# * hexadecimal strings "< ... >": blanks between digits are ignored and an
#   odd number of digits is padded with a trailing 0.
#
# Trailing blanks are skipped afterwards. Returns nil, without moving @off,
# when @off is on neither '(' nor '<'.
def readstr
  buf = ''
  case @str[@off, 1]
  when '('
    nest = 0
    loop do
      @off += 1
      case c = @str[@off, 1]
      when '('
        nest += 1
        buf << c
      when ')'
        nest -= 1
        # the parenthesis that closes the string itself is not part of it
        break if nest < 0
        buf << c
      when '\\'
        @off += 1
        case c = @str[@off, 1]
        when 'n'; buf << "\n"
        when 'r'; buf << "\r"
        when 't'; buf << "\t"
        when 'b'; buf << "\b"
        when 'f'; buf << "\f"
        when "\n"
          # escaped end of line: the string continues on the next line
        when "\r"
          # same, for CR and CRLF line ends
          @off += 1 if @str[@off+1, 1] == "\n"
        when '0'..'7'
          # up to two more octal digits may follow
          if ('0'..'7').include?(cc = @str[@off+1, 1])
            @off += 1
            c << cc
            if ('0'..'7').include?(cc = @str[@off+1, 1])
              @off += 1
              c << cc
            end
          end
          # keep the low 8 bits only: \ddd denotes a single byte
          buf << (c.to_i(8) & 0xff)
        when nil
          break
        else
          buf << c
        end
      when nil
        break
      else
        buf << c
      end
    end
  when '<'
    loop do
      @off += 1
      case c = @str[@off, 1]
      when '0'..'9', 'a'..'f', 'A'..'F'
        buf << c
      when ' ', "\n", "\r", "\t"
        # blanks between hex digits are ignored
      else
        break
      end
    end
    buf << '0' if buf.length % 2 == 1
    buf = [buf].pack('H*')
  else
    return
  end
  # step over the closing ')' or '>'
  @off += 1
  skipspc
  buf
end
Reads the PDF trailer. XXX: the xref table referenced here may be the first one in the file, so we assume that the last one sits just before the 'trailer' command.
# File misc/pdfparse.rb, line 91
# Parses the trailer found at @off and loads the cross-reference data.
#
# Steps, in order:
# 1. skip the keyword at @off, read the trailer dictionary into @trailer,
#    skip the next keyword and read the number after it into @xroff
# 2. reset @xoff ([gen] => { id => offset }) and @xrefs ([gen] => { id => obj })
# 3. jump to @xroff, skip the keyword there and read that xref table
# 4. when that table ends before the trailer we started from and is followed
#    by its own 'trailer', also read the table introduced by the last 'xref'
#    located at or before our trailer, then merge the hash of the earlier
#    trailer into @trailer
def readtrailer
  trailer_off = @off
  readcmd
  @trailer = readhash
  readcmd
  @xroff = readint

  @xoff = {}	# [gen] => { id => off }
  @xrefs = {}	# [gen] => { id => obj }

  @off = @xroff
  readcmd
  readxrtable
  after_table = @off

  # same short-circuit order as a chained condition: readcmd only consumes
  # input when the table sits before our trailer
  return unless @off < trailer_off
  return unless readcmd == 'trailer'
  last_xref = @str.rindex('xref', trailer_off)
  return unless last_xref

  @off = last_xref
  readcmd
  readxrtable
  @off = after_table
  readcmd
  @trailer.update readhash
end
# File misc/pdfparse.rb, line 113
# Reads cross-reference subsections starting at @off until the 'trailer'
# keyword is reached.
#
# Each subsection starts with two numbers (first object id, entry count) and
# is then taken to span 20 bytes per entry. Every "offset generation flag"
# entry found in that span advances the object id; entries flagged 'n' are
# recorded as @xoff[generation][id] = offset.
def readxrtable
  until @str[@off, 7] == 'trailer'
    objnr = readint
    objcnt = readint
    span = 20*objcnt
    @str[@off, span].scan(/(\d+) (\d+) (.)/) do |offset, gen, flag|
      (@xoff[gen.to_i] ||= {})[objnr] = offset.to_i if flag == 'n'
      objnr += 1
    end
    @off += span
    skipspc
  end
end
# File misc/pdfparse.rb, line 321
# Advances @off past white space and comments.
#
# White space is space, tab, line feed, carriage return, form feed and NUL.
# A comment runs from '%' to the next end-of-line character, which may be a
# line feed or a carriage return, so CR-only files do not lose everything
# after their first comment.
def skipspc
  while @off < @str.length
    case @str[@off, 1]
    when '%'
      # stop on the end-of-line character; the increment below steps over it
      @off += 1 until @off >= @str.length or @str[@off, 1] == "\n" or @str[@off, 1] == "\r"
    when ' ', "\n", "\r", "\t", "\f", "\0"
      # plain white space
    else
      break
    end
    @off += 1
  end
end