a PostScript page (lines with position information)
# File misc/pdfparse.rb, line 443
# Creates a page, optionally parsing a PostScript string right away.
#
# str:: PostScript text handed to #parse; when nil (or false) nothing is
#       parsed and the page simply starts with no lines.
def initialize(str=nil)
  # Always start from an empty list, so that clip_lines and to_s can be
  # called on a page that was created without a string.
  @lines = []
  parse(str) if str
end
remove lines not within ymin and ymax
# File misc/pdfparse.rb, line 448
# Drops every line whose y coordinate lies outside the closed interval
# delimited by ymin and ymax (the two bounds may be given in either order),
# then drops the paragraphs that are left without any line.
# Modifies @lines in place and returns self.
def clip_lines(ymin, ymax)
  low, high = ymin > ymax ? [ymax, ymin] : [ymin, ymax]
  @lines.each do |paragraph|
    paragraph.reject! { |line| line.y < low || line.y > high }
  end
  @lines.reject! { |paragraph| paragraph.empty? }
  self
end
Parses a PostScript string into an array of paragraphs (each itself an array of lines). Handles text strings and basic cursor position updates.
# File misc/pdfparse.rb, line 457
# Rebuilds @lines from a PostScript string: an array of paragraphs, each
# paragraph being an array of Line objects.
#
# The tokens yielded by ps2tok are interpreted as a postfix program:
# operands (Float, String) are pushed on a stack and consumed by the
# operator symbols handled below.  Unknown operators are ignored (their
# operands stay on the stack).
#
# str:: the PostScript text, passed as is to ps2tok.
#
# When $VERBOSE is set, every token is echoed to stdout.
def parse(str)
  @lines = []
  curx = cury = 0
  fontx = fonty = 12
  charspc = wordspc = 0
  stack = []
  linelead = -12
  ps2tok(str) { |t|
    if $VERBOSE
      case t
      when Float, String then print "#{t} "
      else puts t
      end
    end

    case t
    when Float, String
      # operand: keep it until an operator needs it (postfix notation)
      stack << t
    when :BT
      # begin text: open a new paragraph (ET, end text, needs no action)
      @lines << []
    when :Tj, :TJ
      # print line; open a paragraph on demand so that text shown before
      # any BT does not fail on a missing paragraph
      @lines << [] if @lines.empty?
      @lines.last << Line.new(stack.pop, curx, cury, fontx, fonty, charspc, wordspc)
    when :Td, :TD
      # move cursor: operands are x then y, so y is popped first.
      # TD additionally records the y offset as the leading used by T*
      linelead = stack.last*fonty if t == :TD
      cury += stack.pop*fonty
      curx += stack.pop*fontx
    when :'T*'
      # new line
      cury += linelead
    when :Tc
      # character spacing; it matters for input such as
      #   3.17731 Tc 9 0 0 9 343.41 653.84998 Tm
      #   [(3T)3202(O)729(R)3179(A)-3689(S)3178(I)]TJ
      # which must come out as "3 TO RA SI"
      charspc = stack.pop
    when :Tw
      wordspc = stack.pop
    when :Tm
      # set transform matrix (scale, rotate, translate): the six operands
      # are popped and put back in their original order
      params = Array.new(6) { stack.pop }.reverse
      next if params[0] == 0.0	# rotated text: operands dropped, state kept
      fontx, _, _, fonty, curx, cury = params
    end
  }
end
yields PS tokens: floats, commands, and strings
# File misc/pdfparse.rb, line 498
# Splits a PostScript string into tokens and yields them one by one:
# * numbers are yielded as Float (a leading sign, a leading '.' and a
#   trailing '.' are accepted: "-3", ".5", "4.")
# * literal strings are yielded as String, parentheses included; a
#   backslash escapes the next character, so "\)" does not end the string
# * arrays are yielded as a single String, brackets included
# * operator names are yielded as Symbol
# Whitespace and anything else that is not recognized is skipped silently.
#
# str:: the text to tokenize; it is not modified.
def ps2tok(str)
  loop do
    tok =
      case str
      when ''
        break
      when /\A[+-]?(?:\d+(?:\.\d*)?|\.\d+)/
        $&.to_f
      when /\A\((?:\\.|[^\\)])*\)/m
        # /m lets "\\." also cover a backslash followed by a newline
        $&
      when /\A\[(?:[^\](]*\((?:\\.|[^\\)])*\))*[^\]]*\]/m
        # any run of "(...)" strings, each optionally preceded by other
        # characters, up to the closing bracket
        $&
      when /\A[a-zA-Z0-9_*]+/
        $&.to_sym
      when /\A\S+/, /\A\s+/
        # unknown token or whitespace: consumed below, nothing to yield
        nil
      end
    # drop what was just matched (one of the patterns always matches a
    # non-empty string)
    str = str[$&.length..-1]
    yield tok if tok
  end
end
renders the lines, according to the layout (almost ;) )
# File misc/pdfparse.rb, line 514
# Renders the page as plain text, approximating the original layout:
# paragraphs are emitted from the highest y to the lowest, lines within a
# paragraph by decreasing y then increasing x, and each line is padded
# with spaces to a column derived from its x offset.
#
# Returns the text as a single String, rows joined with "\n".
def to_s
  # leftmost x of the whole page: columns are computed relative to it
  mx = @lines.flatten.map { |l| l.x }.min
  py = nil
  strs = ['']
  # A paragraph without any line has no y coordinate; comparing against it
  # would raise, so such paragraphs are ignored.
  paragraphs = @lines.reject { |la| la.empty? }
  paragraphs.sort_by { |la| -la.map { |l| l.y }.max.to_i }.each { |la|
    y = la.map { |l| l.y }.max
    # vertical gap since the previous paragraph: one empty row per 12 units
    # (presumably the nominal line height, it matches parse's defaults --
    # TODO confirm)
    strs.concat(['']*((py-y)/12)) if py && py > y
    la.sort_by { |l| [-l.y, l.x] }.each { |l|
      # 9 == base font size
      # start a new row when the line sits clearly below the current one,
      # or when the current row already extends past this line's column
      if y > l.y+l.fonty*0.9 || strs.last.length*1000/Line::CHARWIDTH/9 > l.x-mx
        strs << ''
      end
      # ljust returns a new string, so the shared '' filler rows are never
      # modified in place
      strs[-1] = strs.last.ljust((l.x-mx)*1000/Line::CHARWIDTH/9-1) << ' ' << l.str
      y = l.y
    }
    py = y if !py || py > y
  }
  strs.join("\n")
end