class PDFBeads::PDFBuilder

The key class where the actual generation of a PDF file is performed.

Public Class Methods

new( pdfargs ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 57
# Prepare an empty document and remember the build options.
#
# pdfargs -- option hash; it is stored as is and consulted later by #process
def initialize( pdfargs )
  @pdfargs = pdfargs
  @now     = Time.now
  @doc     = Doc.new
  @fdata   = FontDataProvider.new

  # No JBIG2 symbol dictionary has been embedded yet (see loadJBIG2Page)
  @dictobj  = nil
  @dictpath = ''
end

Public Instance Methods

output( outpath ) click to toggle source

Output the created PDF file to the disk.

# File lib/pdfbeads/pdfbuilder.rb, line 365
# Write the assembled PDF document to +outpath+.
#
# outpath -- destination file name, or the literal string 'STDOUT' to send
#            the document to the standard output stream
#
# I/O failures are reported on stderr instead of being raised, so the caller
# is never interrupted. Always returns nil.
def output( outpath )
  # Serialize first: if this fails, an existing target file is left untouched
  data = @doc.to_s

  if outpath.eql? 'STDOUT'
    # A PDF is binary data, so the stream must not translate line endings
    $stdout.binmode
    $stdout.write( data )
  else
    # The block form closes the file even when the write fails half-way
    File.open( outpath,'wb' ) { |out| out.write( data ) }
  end
  nil
rescue StandardError => e
  $stderr.puts( "Error: could not write to #{outpath} (#{e.message})" )
end
process( pagefiles,st_format ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 67
# Build the complete PDF object tree for the supplied pages and add it
# to @doc.
#
# pagefiles -- collection of page descriptors; each one is queried for its
#              size, resolution, stencils, foreground/background layers and
#              optional hOCR path
# st_format -- 'JBIG2' loads stencils with loadJBIG2Page, any other value
#              loads them with loadCCITTPage
#
# Options read from @pdfargs: :labels, :toc, :meta, :textpdf, :pagelayout,
# :rtl, :debug and :delfiles.
def process( pagefiles,st_format )
  labels = toc = nil
  labels = PDFLabels.new( @pdfargs[:labels] ) unless @pdfargs[:labels].nil?
  toc    = PDFTOC.new( @pdfargs[:toc] ) unless @pdfargs[:toc].nil?
  meta   = parseMeta( @pdfargs[:meta] )
  reader = getPDFReader( @pdfargs[:textpdf] )

  cat = XObj.new(Hash[
    'Type'       => '/Catalog',
    'PageLayout' => "/#{@pdfargs[:pagelayout]}"
    ])
  @doc.addObject(cat)

  # PDF date string: D:YYYYMMDDHHmmSS followed by either 'Z' (no offset from
  # UTC) or a signed offset HH'mm. The sign is written separately, so hours
  # and minutes have to be derived from the absolute value of the offset:
  # integer division of a negative number would round towards minus infinity
  # and print a second minus sign.
  creationDate = sprintf( "D:%04d%02d%02d%02d%02d%02d",
    @now.year, @now.month, @now.day, @now.hour, @now.min, @now.sec )
  if @now.gmt_offset == 0
    creationDate << 'Z'
  else
    offsign  = @now.gmt_offset > 0 ? '+' : '-'
    gmt_mins = @now.gmt_offset.abs / 60
    creationDate << sprintf( "%s%02d'%02d", offsign, gmt_mins / 60, gmt_mins % 60 )
  end
  info = XObj.new(Hash[
    'Creator'      => "(PDFBeads)",
    'Producer'     => "(PDFBeads)",
    'CreationDate' => "(#{creationDate})"
  ])
  @doc.addObject(info)
  # Metadata values were converted to UTF-16BE by parseMeta, hence the BOM
  meta.each_key do |key|
    info.addToDict(key, "(\xFE\xFF#{meta[key].to_text})")
  end

  if ( toc != nil and toc.length > 0 ) or @pdfargs[:rtl]
    vpref = XObj.new(Hash.new())
    vpref.addToDict('Direction', "/R2L") if @pdfargs[:rtl]
    @doc.addObject(vpref)
    cat.addToDict('ViewerPreferences', ref(vpref.getID))
  end

  pages = XObj.new(Hash[
    'Type' => '/Pages'
  ])
  @doc.addObject(pages)
  cat.addToDict('Pages', ref(pages.getID))

  creator = XObj.new(Hash[
    'Subtype' => '/Artwork',
    'Creator' => "(PDFBeads)",
    'Feature' => '(Layers)'
  ])
  @doc.addObject(creator)

  # Two optional content groups (layers): foreground and background
  ocFore = XObj.new(Hash[
    'Type'   => '/OCG',
    'Name'   => '(Foreground)',
    'Usage'  => "<</CreatorInfo #{ref(creator.getID)}>>",
    'Intent' => '[/View/Design]'
  ])
  @doc.addObject(ocFore)
  ocBack = XObj.new({
    'Type'   => '/OCG',
    'Name'   => '(Background)',
    'Usage'  => "<</CreatorInfo #{ref(creator.getID)}>>",
    'Intent' => '[/View/Design]'
  })
  @doc.addObject(ocBack)
  cat.addToDict('OCProperties',
    sprintf("<< /OCGs[%s %s] /D<< /Intent /View /BaseState /ON /Order[%s %s] >>>>",
      ref(ocFore.getID), ref(ocBack.getID), ref(ocFore.getID), ref(ocBack.getID)))

  page_objs = Array.new()
  pages_by_num = Hash.new()
  font = nil
  pidx = 0

  if labels != nil and labels.length > 0
    nTree = "<</Nums[\n"
    labels.each do |rng|
      nTree << "#{rng[:first]} << "
      if rng.has_key? :prefix
        begin
          # If possible, use iso8859-1 (aka PDFDocEncoding) for page labels:
          # it is at least guaranteed to be safe
          if rng[:prefix].respond_to? :encode
            ltitl = rng[:prefix].encode( "iso8859-1", "utf-8" )
          else
            ltitl = Iconv.iconv( "iso8859-1", "utf-8", rng[:prefix] ).first
          end
          nTree << "/P (#{ltitl.to_text}) "
        # Iconv::InvalidCharacter, Iconv::IllegalSequence, Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
        rescue
          if rng[:prefix].respond_to? :encode
            ltitl = rng[:prefix].encode( "utf-16be", "utf-8" )
          else
            ltitl = Iconv.iconv( "utf-16be", "utf-8", rng[:prefix] ).first
          end
          # If there is no number (just prefix) then put a zero character after the prefix:
          # this makes acroread happy, but prevents displaying the number in evince
          unless rng.has_key? :style
            nTree << "/P (\xFE\xFF#{ltitl.to_text}\x00\x00) "
          # Otherwise put a formally correct Unicode string, which, however, may stumble acroread
          else
            nTree << "/P (\xFE\xFF#{ltitl.to_text}) "
          end
        end
      end
      nTree << "/S /#{rng[:style]} " if rng.has_key? :style
      nTree << "/St #{rng[:start]}" if rng.has_key? :start
      nTree << ">>\n"
    end

    nTree << "]\n>>"
    cat.addToDict('PageLabels', nTree)
    cur_range_id = 0
  end

  # The hidden text layer needs fonts: either those imported from the
  # external text PDF, or a Times-Roman descriptor for hOCR text
  needs_font = false
  fonts = encodings = nil
  unless reader.nil?
    fdict = importPDFFonts( reader,@pdfargs[:textpdf] )
  else
    pagefiles.each do |p|
      unless p.hocr_path.nil?
        needs_font = true
        break
      end
    end

    if needs_font
      fonts = Array.new()
      encodings = [ [' '] ]
      fdict = XObj.new( Hash[] )
      @doc.addObject( fdict )

      descr = XObj.new( Hash[
        'Type'     => '/FontDescriptor',
        'BaseFont' => '/Times-Roman',
        ] )
      @fdata.header.each_key do |key|
        descr.addToDict( key,@fdata.header[key] )
      end
      @doc.addObject( descr )
    end
  end

  pagefiles.each do |p|
    procSet = ['/PDF', '/ImageB']
    c_str = ''
    doc_objs = Array.new()
    lastimg = 0

    # Page size in points (1/72 inch)
    width = p.width; height = p.height
    xres  = p.x_res; yres   = p.y_res
    pwidth  = width.to_f  / xres * 72
    pheight = height.to_f / yres * 72

    p.stencils.each do |s|
      if st_format.eql? 'JBIG2'
        xobj,width,height,xres,yres = loadJBIG2Page( s[:jbig2path],s[:jbig2dict],ref(ocFore.getID) )
      else
        xobj,width,height,xres,yres = loadCCITTPage( s[:path],ref(ocFore.getID) )
      end
      break if xobj.nil?

      color = s[:rgb].join(' ') << ' rg'
      doc_objs << xobj

      c_str << "#{color} /Im#{lastimg} Do "
      lastimg += 1
    end

    fg_image = bg_image = nil
    fg_image = loadImage( p.fg_layer,ocFore.getID,procSet ) unless p.fg_layer.nil?
    bg_image = loadImage( p.bg_layer,ocBack.getID,procSet ) unless p.bg_layer.nil?

    contents = XObj.new(Hash[
      'Filter' => '/FlateDecode'
    ])
    resobj = XObj.new(Hash.new())
    resources = XObj.new(Hash[
      'XObject' => ref(resobj.getID)
    ])

    unless fg_image.nil?
      # With a foreground image the first stencil becomes its soft mask
      # instead of being painted directly.
      # NOTE(review): doc_objs[0] is nil when no stencil could be loaded --
      # confirm callers never combine a foreground layer with no stencil.
      xobj = doc_objs[0]
      fg_image.addToDict('SMask', ref(xobj.getID))
      xobj.removeFromDict('ImageMask')
      xobj.addToDict('Decode', '[1 0]')
      resobj.addToDict('Im0', ref(fg_image.getID))
      doc_objs << fg_image
      c_str = '/Im0 Do '
    else
      doc_objs.each_index do |i|
        resobj.addToDict( "Im#{i}", ref(doc_objs[i].getID) )
      end
    end

    unless bg_image.nil?
      # The background is painted first, below everything else
      c_str = "/Im#{resobj.dictLength} Do " << c_str
      resobj.addToDict( "Im#{resobj.dictLength}", ref(bg_image.getID) )
      doc_objs << bg_image
    end
    c_str = sprintf( "q %.2f 0 0 %.2f 0 0 cm %sQ",pwidth,pheight,c_str )

    doc_objs.concat( [contents, resobj, resources] )

    hocr = nil
    if not reader.nil?
      procSet << '/Text'
      c_str   << getPDFText( reader,pidx,@pdfargs[:debug] )
    elsif not p.hocr_path.nil?
      # File.open rather than Kernel#open: the path must only ever be
      # treated as a file name
      hocr = File.open( p.hocr_path ) { |f| Nokogiri::HTML( f ) }
      procSet << '/Text'
      c_str   << getHOCRText( hocr,pheight,72.0/xres,72.0/yres,encodings )
    end

    # The content stream is left uncompressed in debug mode
    unless @pdfargs[:debug]
      contents.reinit( Hash[
        'Filter' => '/FlateDecode'
      ], Zlib::Deflate.deflate( c_str,9 ) )
    else
      contents.reinit( Hash[], c_str )
    end
    resources.addToDict( 'ProcSet', "[ #{procSet.join(' ')} ]" )
    resources.addToDict( 'Font', ref( fdict.getID ) ) unless hocr.nil? and reader.nil?

    page = XObj.new(Hash[
      'Type'      => '/Page',
      'Parent'    => "#{pages.getID} 0 R",
      'MediaBox'  => sprintf( "[ 0 0 %.02f %.02f ]",pwidth,pheight ),
      'Contents'  => ref( contents.getID ),
      'Resources' => ref( resources.getID )
    ])
    # By default acroread uses /DeviceCMYK as a transparency blending space,
    # so adding an SMask image to a page would result in colors being shifted,
    # unless we take special care of this. For more details see
    # http://comments.gmane.org/gmane.comp.tex.pdftex/3747
    unless fg_image.nil?
      cspace = '/DeviceRGB'
      cspace = fg_image.getFromDict( 'ColorSpace' ) if fg_image.hasInDict( 'ColorSpace' )
      page.addToDict( 'Group', "<< /S /Transparency /CS #{cspace} >>" )
    end
    doc_objs  << page
    doc_objs.each{ |x| @doc.addObject(x) }
    page_objs << page

    pages.addToDict( 'Count', page_objs.length )
    pages.addToDict( 'Kids', '[' << page_objs.map{|x| ref(x.getID).to_s}.join(' ') << ']' )

    # Remember the page under its label (or its ordinal number) so that
    # TOC entries can be resolved later
    pkey = (pidx + 1).to_s
    pkey = labels.getPageLabel( cur_range_id,pidx ) if labels != nil and labels.length > 0
    pages_by_num[pkey] = page.getID
    pidx += 1
    if labels != nil and labels.length > 0
      if cur_range_id < labels.length - 1 and labels[cur_range_id + 1][:first] == pidx
        cur_range_id += 1
      end
    end

    $stderr.puts("Processed #{p.name}\n")
    $stderr.puts("  Added background image from #{p.bg_layer}\n") unless bg_image.nil?
    $stderr.puts("  Added foreground image from #{p.fg_layer}\n") unless fg_image.nil?
  end

  # One font object per encoding vector collected by getHOCRText
  if needs_font
    fidx = 1
    encodings.each do |enc|
      font = addFont( descr,enc,"Fnt#{fidx}" )
      fdict.addToDict( "Fnt#{fidx}",ref(font.getID) )
      fonts << font
      fidx += 1
    end
  end

  if toc != nil and toc.length > 0
    getOutlineObjs( toc,pages_by_num,page_objs[0].getID )
    cat.addToDict('Outlines', ref(toc[0][:pdfobj].getID))
    cat.addToDict('PageMode', "/UseOutlines")
    vpref.addToDict('NonFullScreenPageMode', "/UseOutlines")
  end

  if @pdfargs[:delfiles]
    pagefiles.each do |p|
      $stderr.puts( "Cleaning up temporary files for #{p.name}" )
      safe_delete( p.fg_layer ) if p.fg_created
      safe_delete( p.bg_layer ) if p.bg_created
      p.stencils.each do |s|
        safe_delete( s[:path] ) if s[:created]
      end
    end
  end
end

Private Instance Methods

addFont( descr,fenc,fname ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 862
# Create a Type1 Times-Roman font object for one encoding vector and add it,
# together with its /Encoding and /ToUnicode objects, to the document.
#
# descr -- font descriptor object referenced from the new font
# fenc  -- encoding vector (array); its length determines /LastChar
# fname -- resource name of the font, without the leading slash
#
# Returns the new font object.
def addFont( descr,fenc,fname )
  enc_str = @fdata.getEncoding( fenc ).join( ' ' )
  enc  = XObj.new( Hash[
    'Type'        => "/Encoding",
    'Differences' => "[ 0 #{enc_str} ]"
  ])
  @doc.addObject( enc )

  toUni = @fdata.getCMAP( fenc )
  @doc.addObject( toUni )

  widths = @fdata.getWidths( fenc ).map{ |w| w.to_s }.join( ' ' )
  font = XObj.new( Hash[
    'BaseFont'       => '/Times-Roman',
    'Name'           => "/#{fname}",
    'Subtype'        => '/Type1',
    'Type'           => '/Font',
    'FirstChar'      => 0,
    'LastChar'       => fenc.length - 1,
    'Widths'         => "[ #{widths} ]",
    'FontDescriptor' => ref(descr.getID),
    'ToUnicode'      => ref(toUni.getID),
  ] )
  # The encoding object is always created above, so it is always referenced
  font.addToDict( 'Encoding',ref(enc.getID) )
  @doc.addObject( font )
  return font
end
elementCoordinates( element,xscale,yscale ) click to toggle source

Returns an array containing the coordinates of the bounding box around an element

# File lib/pdfbeads/pdfbuilder.rb, line 650
# Returns the bounding box of an hOCR element, scaled to output units.
#
# element -- node whose 'title' attribute may contain "bbox x0 y0 x1 y1"
# xscale  -- factor applied to the horizontal coordinates (x0 and x1)
# yscale  -- factor applied to the vertical coordinates (y0 and y1)
#
# Returns [x0, y0, x1, y1] as floats, or [0,0,0,0] if the element carries
# no bounding box.
def elementCoordinates( element,xscale,yscale )
  out = [0,0,0,0]

  if element.attributes.has_key? 'title'
    found = /bbox((\s+\d+){4})/.match( element.attributes['title'].content )
    unless found.nil?
      x0,y0,x1,y1 = found[1].strip.split( /\s+/ ).map { |c| c.to_i }
      # Scale each coordinate along its own axis
      out = [ (x0 * xscale).to_f,(y0 * yscale).to_f,
              (x1 * xscale).to_f,(y1 * yscale).to_f ]
    end
  end
  return out
end
elementText( elem ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 663
# Text content of an element with the surrounding whitespace removed.
# No charset conversion happens here: nokogiri performs it on its own.
def elementText( elem )
  raw = elem.inner_text
  raw.strip
end
encodePDFArray( in_a ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 434
# Serialize a Ruby array as PDF array syntax: strings become literal
# strings in parentheses, symbols become names, nested arrays are encoded
# recursively and everything else is written through to_s.
def encodePDFArray( in_a )
  pieces = in_a.map do |item|
    case item
    when String then "(#{item})"
    when Symbol then "/#{item}"
    when Array  then encodePDFArray( item )
    else item.to_s
    end
  end
  ( ['['] + pieces + [']'] ).join( ' ' )
end
encodePDFObjEntry( inhash,outobj,label ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 452
# Copy one dictionary entry read from an existing PDF into +outobj+.
#
# inhash -- source dictionary
# outobj -- object receiving the entry through addToDict
# label  -- key of the entry to copy
#
# Strings, symbols (names), numbers and arrays are written inline. Hashes
# and PDF::Reader::Stream values become separate document objects that are
# referenced from +outobj+ and filled recursively. Values of any other type
# are skipped.
def encodePDFObjEntry( inhash,outobj,label )
  value = inhash[label]

  case value
  when String
    outobj.addToDict( label,"(#{value})" )

  when Symbol
    outobj.addToDict( label,"/#{value}" )

  # Any number, not just integers: reals are written with to_s, exactly as
  # encodePDFArray does for array members
  when Numeric
    outobj.addToDict( label,value.to_s )

  when Array
    outobj.addToDict( label,encodePDFArray( value ) )

  when Hash
    newobj = XObj.new( Hash.new() )
    @doc.addObject( newobj )
    outobj.addToDict( label,ref(newobj.getID) )
    value.keys.each do |newlabel|
      encodePDFObjEntry( value,newobj,newlabel )
    end

  when PDF::Reader::Stream
    newobj = XObj.new( Hash.new(),value.data )
    @doc.addObject( newobj )
    outobj.addToDict( label,ref(newobj.getID) )
    # :Length is left out of the copy
    value.hash.keys.each do |newlabel|
      encodePDFObjEntry( value.hash,newobj,newlabel ) unless newlabel.eql? :Length
    end
  end
end
getHOCRText( hocr,pheight,xscale,yscale,encodings ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 743
# Build the content-stream fragment (" BT 3 Tr ... ET ") holding the text
# of every 'ocr_line' span found in an hOCR document.
#
# hocr      -- parsed hOCR document (queried with xpath)
# pheight   -- page height; the baseline is computed as pheight minus the
#              bottom edge of the line box
# xscale    -- passed to elementCoordinates / getOCRUnits
# yscale    -- passed to elementCoordinates / getOCRUnits
# encodings -- array of character arrays, extended in place whenever a
#              character is met that no existing array contains; array
#              number N (counting from 1) is selected as font /FntN and a
#              character is written as its index within the array
#
# Returns the fragment as a string.
def getHOCRText( hocr,pheight,xscale,yscale,encodings )
  fsize = 10
  cur_enc = nil
  ret = " BT 3 Tr "

  hocr.xpath("//span[@class='ocr_line']").each do |line|
    lbbox = elementCoordinates( line,xscale,yscale )
    # Skip lines whose box has no positive width or height
    next if lbbox[2] - lbbox[0] <= 0 or lbbox[3] - lbbox[1] <= 0
    units = getOCRUnits( line,lbbox,fsize,xscale,yscale )
    next if units.length == 0

    # Sum of the unit box widths versus the width of the same text as
    # reported by the font data: their ratio scales the text matrix
    wwidth = 0
    ltxt = ''
    units.each do |unit|
      ltxt << unit[0]
      wwidth += ( unit[1][2] - unit[1][0] )
    end
    lw = @fdata.getLineWidth( ltxt,fsize )
    ratio = 1
    ratio = wwidth / lw unless lw == 0
    pos = lbbox[0]
    posdiff = 0

    ret << sprintf( "%f %f %f %f %f %f Tm ",
      ratio, 0, 0, ratio, lbbox[0], pheight - lbbox[3] - @fdata.header['Descent'] * fsize / 1000.0 * ratio)
    in_txt = false

    units.each_index do |i|
      unit = units[i]
      wtxt = unit[0]
      bbox = unit[1]

      # posdiff: gap between where the previous unit ended (pos) and where
      # this unit's box starts, emitted as a number inside the TJ array
      posdiff = ( (pos - bbox[0]) * 1000 / fsize / ratio ).to_i if i > 0
      pos = bbox[0] + ( @fdata.getLineWidth( wtxt,fsize ) * ratio )

      # txt8 accumulates the hex codes of the current run of characters
      txt8 = ''
      wtxt.each_char do |char|
        # NOTE(review): encode! replaces char in place with its UTF-16BE
        # form, while the Iconv branch discards the converted value --
        # confirm that this asymmetry is intended.
        begin
          if char.respond_to? :encode
            char.encode!( "utf-16be", "utf-8" )
          else
            Iconv.iconv( "utf-16be","utf-8",char )
          end
        rescue
          rawbytes = char.unpack( 'C*' )
          bs = ''
          rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
          $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
          char = '?' * rawbytes.length
        end

        encoded = false
        if cur_enc.nil? or not cur_enc.include? char
          # Look for another existing encoding that already has the character
          encodings.each_index do |i|
            enc = encodings[i]
            next if enc == cur_enc

            if enc.include? char
              # Close the open TJ array before switching fonts
              if in_txt
                ret << "#{posdiff} " if posdiff != 0
                ret << "<#{txt8}> " unless txt8.eql? ''
                ret << "] TJ "
              end
              cur_enc = enc
              ret << "/Fnt#{i + 1} #{fsize} Tf "
              txt8 = ''
              posdiff = 0
              encoded = true
              in_txt = false
              break
            end
          end

          # Not found anywhere: append to the last encoding, or start a new
          # one (beginning with a space) once the last one holds 256 entries
          unless encoded
            last = encodings[-1]
            if last.length < 256
              last << char
            else
              last = [ ' ',char ]
              encodings << last
            end

            if cur_enc != last
              if in_txt
                ret << "#{posdiff} " if posdiff != 0
                ret << "<#{txt8}> " unless txt8.eql? ''
                ret << "] TJ "
              end
              cur_enc = last
              ret << "/Fnt#{encodings.length} #{fsize} Tf "
              txt8 = ''
              posdiff = 0
              in_txt = false
            end
          end
        end

        unless in_txt
          ret << "[ "
          in_txt = true
        end
        txt8 << sprintf( "%02X",cur_enc.index(char) )
      end

      # Flush the run collected for this unit
      unless txt8.eql? ''
        ret << "#{posdiff} " if posdiff != 0
        ret << "<#{txt8}> "
      end
    end
    if in_txt
      ret << "] TJ "
      in_txt = false
    end
  end

  ret << "ET "
  return ret
end
getOCRUnits( ocr_line,lbbox,fsize,xscale,yscale ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 668
# Split one hOCR line into positioned text units.
#
# ocr_line -- the 'ocr_line' element
# lbbox    -- bounding box of the whole line, used when no finer-grained
#             boxes are available
# fsize    -- not used by this method
# xscale   -- factor applied to horizontal coordinates
# yscale   -- factor applied to vertical coordinates
#
# Returns an array of [text, [x0, y0, x1, y1]] pairs. A hyphen ending the
# last unit is replaced with a soft hyphen.
def getOCRUnits( ocr_line,lbbox,fsize,xscale,yscale )
  units = Array.new()
  ocr_words = ocr_line.xpath(".//span[@class='ocrx_word']")
  ocr_chars = nil
  ocr_chars = ocr_line.at_xpath(".//span[@class='ocr_cinfo']") if ocr_words.length == 0

  # If 'ocrx_word' elements are available (as in Tesseract output), split the line
  # into individual words
  if ocr_words.length > 0
    ocr_words.each do |word|
      bbox = elementCoordinates( word,xscale,yscale )
      next if bbox == [0,0,0,0]
      txt = elementText( word )
      units << [txt,bbox]
    end

  # If 'ocr_cinfo' data is available (as in Cuneiform output), then split it
  # into individual characters and then combine them into words
  elsif not ocr_chars.nil? and ocr_chars.attributes.has_key? 'title'
    if /x_bboxes([-\s\d]+)/.match( ocr_chars.attributes['title'].content )
      coords = $1.strip.split(/\s+/)
      ltxt = elementText( ocr_line )
      charcnt = 0
      ltxt.each_char { |uc| charcnt += 1 }

      # Proceed only if there is a box (4 numbers) for every character
      if charcnt <= coords.length/4
        i = 0
        wtxt = ''
        bbox = [-1,-1,-1,-1]
        ltxt.each_char do |uc|
          # Boxes are stored as x0 y0 x1 y1: scale each value along its own axis
          cbbox = [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*yscale).to_f,
                    (coords[i*4+2].to_i*xscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]

          unless cbbox[0] < 0
            # Grow the word box so that it covers this character too
            bbox[0] = cbbox[0] if cbbox[0] < bbox[0] or bbox[0] < 0
            bbox[1] = cbbox[1] if cbbox[1] < bbox[1] or bbox[1] < 0
            bbox[2] = cbbox[2] if cbbox[2] > bbox[2] or bbox[2] < 0
            bbox[3] = cbbox[3] if cbbox[3] > bbox[3] or bbox[3] < 0
            wtxt << uc

          else
            # A negative box marks a word boundary
            units << [wtxt,bbox]
            bbox = [-1,-1,-1,-1]
            if /^\s+$/.match( uc )
              wtxt = ''

            # A workaround for probable hpricot bug (TODO: is Nokogiri affected?),
            # which sometimes causes whitespace characters from inside a string
            # to be stripped. So if we find a bounding box with negative values
            # we assume there was a whitespace character here, even if not
            # preserved in the string itself
            else
              wtxt = uc
              i += 1
              bbox =  [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*yscale).to_f,
                        (coords[i*4+2].to_i*xscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
            end
          end
          i += 1
        end
        units << [wtxt,bbox] unless wtxt.eql? ''
      end
    end
  end

  # If neither word nor character bounding boxes are available, then store the line as a whole
  if units.length == 0
    ltxt = elementText( ocr_line )
    units << [ltxt,lbbox] unless ltxt.eql? ''
  end

  # "\xC2\xAD" is the UTF-8 form of U+00AD (soft hyphen)
  units[units.length-1][0].sub!( /-\Z/, "\xC2\xAD" ) unless units.length == 0
  return units
end
getOutlineObjs( toc,page_ids,fp_id ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 592
# Create the outline (bookmark) objects for a table of contents and add
# them to the document. Each created object is stored in its TOC item
# under the :pdfobj key.
#
# toc      -- TOC items; toc[0] is the outline root, the remaining items
#             are the entries
# page_ids -- hash mapping page keys (as used in item[:ref]) to page
#             object IDs
# fp_id    -- ID of the first page; currently not used: an entry that
#             refers to an unknown page gets no destination and is shown
#             in gray instead
def getOutlineObjs( toc,page_ids,fp_id )
  root = toc[0]
  root[:pdfobj] = XObj.new( Hash[
    'Type'  => '/Outlines',
    'Count' => root.getChildrenCount
  ])
  @doc.addObject(root[:pdfobj])

  toc[1..-1].each do |item|
    dest = nil
    if page_ids.has_key? item[:ref]
      dest = page_ids[item[:ref]]
    else
      $stderr.puts("Malformed TOC: there is no page #{item[:ref]} in this document.")
    end

    # Escape backslash and parenthesis bytes for use in a PDF literal string.
    # The block form is required: in a replacement *string* the sequence
    # "\\" stands for one backslash, so backslashes would never be doubled.
    item_text = item[:title].to_binary
    item_text.gsub!( /[\\()]/ ) { |c| "\\" + c }
    item[:pdfobj] = XObj.new(Hash[
      'Title'  => "(\xFE\xFF#{item_text.to_text})",
      'Parent' => ref(item[:parent][:pdfobj].getID),
    ])
    if dest != nil
      item[:pdfobj].addToDict('Dest', "[ #{dest} 0 R /XYZ null null null ]")
    else
      item[:pdfobj].addToDict('C', "[0.75 0.75 0.75]")
    end


    # A negative count marks an entry whose children are initially hidden
    if item[:children].length > 0
      cnt = item.getChildrenCount
      if item[:open]
        item[:pdfobj].addToDict('Count', cnt)
      else
        item[:pdfobj].addToDict('Count', -cnt)
      end
    end

    # Link the entry into the sibling chain of its parent
    unless item.has_key? :prev
      item[:parent][:pdfobj].addToDict('First', ref(item[:pdfobj].getID))
    else
      item[:prev][:pdfobj].addToDict('Next', ref(item[:pdfobj].getID))
      item[:pdfobj].addToDict('Prev', ref(item[:prev][:pdfobj].getID))
    end

    unless item.has_key? :next
      item[:parent][:pdfobj].addToDict('Last', ref(item[:pdfobj].getID))
    end

    @doc.addObject(item[:pdfobj])
  end
end
getPDFReader( path ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 427
# Open +path+ with PDF::Reader.
# Returns nil when the path is nil, empty or does not name a regular file.
def getPDFReader( path )
  usable = !path.nil? && !path.eql?( '' ) && File.file?( path )
  usable ? PDF::Reader.new( path ) : nil
end
getPDFText( reader,pidx,debug ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 520
# Extract the text objects (BT ... ET) of one page of an existing PDF so
# that they can be appended to a generated page.
#
# reader -- object whose #pages returns the pages of the source PDF
# pidx   -- zero-based index of the page
# debug  -- when true the text is made visible (0 Tr), otherwise it is
#           hidden (3 Tr)
#
# Returns "\nq <n> Tr ... Q", or an empty string if the page does not exist
# or yields nothing.
def getPDFText( reader,pidx,debug )
  pages = reader.pages
  return "" unless pages.length > pidx

  pcont = pages[pidx].raw_content.to_binary()
  cidx = 0
  in_t = false
  pstack = 0
  prevc = "\0"
  ch_start = -1
  ret = ""
  tr_val = debug ? 0 : 3

  pcont.each_byte do |char|
    # Track the nesting depth of literal strings; a parenthesis preceded by
    # an odd number of backslashes is escaped and does not count
    if char.chr.eql? '('
      ctx = pcont[0,cidx].match( /\\+$/ )
      pstack += 1 if ( ctx.nil? or ctx[0].length % 2 == 0 )
    elsif char.chr.eql? ')'
      ctx = pcont[0,cidx].match( /\\+$/ )
      pstack -= 1 if ( ctx.nil? or ctx[0].length % 2 == 0 )
    end

    unless pstack > 0
      # Text state operators may occur outside text objects. We have to take care of this
      if not in_t and prevc.eql? 'T'
        case char.chr
          # Operators taking a single number: Tc, Tw, Tz, TL and Ts
          when 'c','w','z','L','s'
            if pcont[0,cidx-1] =~ /([-+]?\d*\.?\d+)\s+$/
              ret << " #{$1} T#{char.chr}"
            end
          when 'f'
            if pcont[0,cidx-1] =~ /\/([A-Za-z0-9]+)\s+([-+]?\d*\.?\d+)\s+$/
              ret << " /#{$1} #{$2} Tf"
            end
          # Tr operators are ignored, since we always need either a hidden text (3 Tr)
          # or (for debugging purposes) a visible text without special effects (0 Tr)
        end
      elsif not in_t and ( prevc + char.chr ).eql? 'BT'
        ch_start = cidx -1
        in_t = true
      elsif in_t and ( prevc + char.chr ).eql? 'ET'
        # Copy the whole text object, forcing our own rendering mode
        chunk = pcont.slice( ch_start,cidx - ch_start + 1 )
        chunk.gsub!( /\d{1}\s+Tr/,"#{tr_val} Tr" )
        ret << "\n" << chunk
        ch_start = -1
        in_t = false
      end
    end

    prevc = char.chr
    cidx += 1
  end
  return "\nq #{tr_val} Tr" << ret << " Q" if ret.length > 0
  return ""
end
importPDFFont( label,font ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 483
# Copy one font dictionary read from an existing PDF into the document.
#
# label -- resource name written as /Name, or nil to omit it (nil is used
#          for descendant fonts)
# font  -- font dictionary, keyed by symbols
#
# Returns the newly created font object.
def importPDFFont( label,font )
  copied_keys = [ :BaseFont, :Type, :Subtype, :FirstChar, :LastChar, :Widths, :FontDescriptor,
                  :Encoding, :ToUnicode, :DW, :W, :CIDSystemInfo, :CIDToGIDMap ]

  fontobj = XObj.new( Hash.new() )
  fontobj.addToDict( 'Name',"/#{label}" ) unless label.nil?
  @doc.addObject( fontobj )

  # Descendant fonts are imported first and then referenced from an array
  if font.has_key? :DescendantFonts
    imported = font[:DescendantFonts].map { |sub| importPDFFont( nil,sub ) }
    refs = imported.map { |sub| ref( sub.getID ) }
    fontobj.addToDict( "DescendantFonts","[ #{refs.join( ' ' )} ]" )
  end

  copied_keys.each do |fontkey|
    next unless font.has_key? fontkey
    encodePDFObjEntry( font,fontobj,fontkey )
  end
  fontobj
end
importPDFFonts( reader,path ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 501
# Import every font used by the pages of an existing PDF and collect them
# in a new font dictionary object.
#
# reader -- object whose #pages returns the pages of the source PDF
# path   -- file name of that PDF; only used in progress messages
#
# When several pages use the same label, the font found first is kept.
# Returns the font dictionary object.
def importPDFFonts( reader,path )
  fonts = Hash.new()
  # Ask the reader for its pages once instead of once per iteration
  reader.pages.each_with_index do |page,i|
    $stderr.puts("Reading font data from #{path}: page #{i}\n")
    page.fonts.each do |label,font|
      fonts[label] = page.objects.deref( font ) unless fonts.has_key? label
    end
  end

  fdict = XObj.new( Hash[] )
  @doc.addObject( fdict )
  # Sorted by name so that the objects are created in a stable order
  fonts.keys.sort_by {|sym| sym.to_s}.each do |label|
    fontobj = importPDFFont( label,fonts[label] )
    fdict.addToDict( label,ref(fontobj.getID) )
  end
  fdict
end
loadCCITTPage( path,ocref ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 893
# Load a bitonal stencil image as a CCITTFaxDecode image XObject.
#
# path  -- image file name
# ocref -- reference string stored under /OC
#
# Returns [xobj, width, height, xres, yres], or nil if the image
# dimensions cannot be determined.
def loadCCITTPage( path,ocref )
  stencil = ImageInspector.new( path )
  return nil if stencil.width.nil?

  width  = stencil.width
  height = stencil.height
  xres   = stencil.x_dpi
  yres   = stencil.y_dpi

  # TIFF tag 0x116 holds the rows-per-strip value
  rows_per_strip = height
  if stencil.format.eql?( :TIFF ) && stencil.tags.has_key?( 0x116 )
    rows_per_strip = stencil.tags[0x116][0]
  end

  # The data is used directly only if it is CCITT-compressed and stored in
  # one strip; anything else is converted in memory to such a TIFF
  ready = stencil.compression.eql?( :CCITTFaxDecode ) && rows_per_strip >= height
  if !ready
    img = ImageList.new( path )
    imgdata = img.to_blob do |imd|
      imd.format = 'TIFF'
      imd.define( 'TIFF','rows-per-strip',height )
      imd.compression = Group4Compression
    end
    stencil = ImageInspector.new( StringIO.new(imgdata) )
    img.destroy!
  end
  body = stencil.getRawData

  # TIFF tag 0x106 holds the photometric interpretation
  photometric = 0
  if stencil.format.eql?( :TIFF ) && stencil.tags.has_key?( 0x106 )
    photometric = stencil.tags[0x106][0]
  end

  xobj = XObj.new(Hash[
    'Type'             => '/XObject',
    'Subtype'          => '/Image',
    'OC'               => ocref,
    'Width'            => width.to_s,
    'Height'           => height.to_s,
    'ImageMask'        => 'true',
    'ColorSpace'       => '/DeviceGray',
    'BitsPerComponent' => '1',
    'Filter'           => '/CCITTFaxDecode',
    'DecodeParms'      => "<< /Columns #{width} /K -1 >>",
  ], body)
  if photometric == 1
    # As ImageMask is always on, BlackIs1 actually doesn't work, while
    # the Decode array does.
    xobj.addToDict( 'BlackIs1', 'true' )
    xobj.addToDict( 'Decode', '[1 0]' )
  end

  [ xobj,width,height,xres,yres ]
end
loadImage( impath,ocID,procSet ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 972
# Load a foreground or background image as an image XObject.
#
# impath  -- image file name
# ocID    -- ID of the optional content group the image belongs to
# procSet -- array of procedure set names; '/ImageI' or '/ImageC' is
#            appended in place when the image requires it
#
# Returns the image object, or nil if the image dimensions cannot be
# determined.
def loadImage( impath,ocID,procSet )
  insp = ImageInspector.new( impath )
  return nil if insp.width.nil?

  # JPEG, JPEG2000 and PNG images can be handled directly. We also can
  # handle uncompressed TIFF files, although it is very unlikely someone
  # would use them for page background. Unfortunately things are more
  # difficult for compressed TIFF images, as they normally contain several
  # compressed chunks, so that we can't just concatenate them. So for all
  # other image types we just call ImageMagick to convert them into a
  # zip-compressed PNG, and then retrieve the raw data from that PNG image.
  direct = [ :JPEG, :JPEG2000, :PNG ].include?( insp.format )
  if !direct && insp.format.eql?( :TIFF )
    if insp.compression.eql? :NoCompression
      direct = true
    elsif [ :FlateDecode,:LZWDecode,:CCITTFaxDecode ].include? insp.compression
      # A compressed TIFF is usable only when it consists of a single strip.
      # Without a rows-per-strip tag (0x0116) the image is treated as one
      # strip, the same default loadCCITTPage applies.
      rows_per_strip = insp.height
      rows_per_strip = insp.tags[0x0116][0] if insp.tags.has_key? 0x0116
      direct = rows_per_strip >= insp.height
    end
  end

  unless direct
    img = ImageList.new( impath )
    imgdata = img.to_blob { |imd|
      imd.format = 'PNG'
      imd.quality = 95
      imd.compression = ZipCompression
    }
    insp = ImageInspector.new( StringIO.new(imgdata) )
    img.destroy!
  end
  rawdata  = insp.getRawData
  cspace   = "/#{insp.cspace}"
  fmt      = insp.format
  imgcompr = insp.compression
  per_comp = 1

  if cspace.eql? '/Indexed' and not insp.palette.nil?
    # The palette base is gray unless at least one entry has differing
    # red, green and blue values
    cspace = '/DeviceGray'; cpal = insp.palette
    rgb = false
    cpal.each do |c|
      if c[0] != c[1] or c[0] != c[2]
        cspace = '/DeviceRGB'
        rgb = true
        break
      end
    end

    cspace = "[/Indexed #{cspace} #{cpal.length - 1} < "
    cpal.each do |c|
      cspace << sprintf( "%02x ",c[0] )
      cspace << sprintf( "%02x %02x ",c[1],c[2] ) if rgb
    end
    cspace << '>]'

    procSet << '/ImageI' unless procSet.include? '/ImageI'

  elsif not cspace.eql? '/DeviceGray' and not procSet.include? '/ImageC'
    procSet << '/ImageC'
  end

  # Number of color components per pixel, needed for the predictor
  if cspace.eql? '/DeviceRGB'
    per_comp = 3
  elsif cspace.eql? '/DeviceCMYK'
    per_comp = 4
  end
  image = XObj.new( Hash[
    'Type'              => '/XObject',
    'Subtype'           => '/Image',
    'OC'                => ref( ocID ),
    'Width'             => insp.width,
    'Height'            => insp.height,
    'Interpolate'       => 'true'
  ], rawdata )

  unless fmt.eql? :JPEG2000
    image.addToDict( 'BitsPerComponent',insp.depth )
    image.addToDict( 'ColorSpace',"#{cspace}" )
  end
  image.addToDict( 'Filter',"/#{imgcompr}" ) unless insp.compression.eql? :NoCompression
  if [:PNG, :TIFF].include? fmt
    predictor = (fmt.eql? :PNG) ? 15 : 2
    image.addToDict( 'DecodeParms',
      "<< /Predictor #{predictor} /Colors #{per_comp} /BitsPerComponent #{insp.depth} /Columns #{insp.width} >>" )
  end
  return image
end
loadJBIG2Page( path,dictpath,ocref ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 941
# Load a JBIG2-compressed stencil as an image XObject.
#
# path     -- file holding the JBIG2 page data
# dictpath -- file holding the symbol dictionary; it is added to the
#             document only when it differs from the one added last
# ocref    -- reference string stored under /OC
#
# Returns [xobj, width, height, xres, yres], or nil (after printing a
# message) if any of the files cannot be read.
def loadJBIG2Page( path,dictpath,ocref )
  begin
    # Block form, so that the file handles are closed right away
    jbig2  = File.open( path,'rb' ) { |f| f.read }
    # Four big-endian 32-bit integers starting at byte 11
    width, height, xres, yres = jbig2[11...27].unpack( 'NNNN' )
    unless @dictpath.eql? dictpath
      symd_f = File.open( dictpath,'rb' ) { |f| f.read }
      symd_o = @doc.addObject( XObj.new(Hash.new(),symd_f) )
      @dictpath = dictpath
      @dictobj  = symd_o
    end
  rescue
    $stderr.puts( "Page not completed: could not access #{path}" )
    return nil
  end

  xobj = XObj.new(Hash[
    'Type'             => '/XObject',
    'Subtype'          => '/Image',
    'OC'               => ocref,
    'Width'            => width.to_s,
    'Height'           => height.to_s,
    'ImageMask'        => 'true',
    'ColorSpace'       => '/DeviceGray',
    'BitsPerComponent' => '1',
    'Filter'           => '/JBIG2Decode',
    'DecodeParms'      => "<< /JBIG2Globals #{@dictobj.getID} 0 R >>"
  ], jbig2)

  return [ xobj,width,height,xres,yres ]
end
parseMeta( path ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 392
# Read document metadata from a text file.
#
# path -- file name; nil or an empty string yields an empty result
#
# Recognized lines look like
#   Title: "Some text"
# (an optional leading slash is allowed, lines starting with '#' are
# comments). Only the Title, Author, Subject and Keywords keys are
# accepted.
#
# Returns a hash mapping each key to its value converted to UTF-16BE, with
# backslash and parenthesis bytes escaped for use in a PDF literal string.
def parseMeta( path )
  ret = Hash.new()
  return ret if path.nil? or path.eql? ''

  keys = [ 'Title', 'Author', 'Subject', 'Keywords' ]
  File.open( path,'r' ) do |fin|
    fin.set_encoding 'UTF-8' if fin.respond_to? :set_encoding
    fin.each do |fl|
      next if /^\#/.match( fl )

      entry = /^\/?([A-Za-z]+)[ \t]*:[ \t]+\"(.*)\"/.match( fl )
      next if entry.nil?
      key   = entry[1]
      value = entry[2]
      next unless keys.include? key

      begin
        tmp_str = ''
        if value.respond_to? :encode
          tmp_str = value.encode( "utf-16be", "utf-8" )
        else
          tmp_str = Iconv.iconv( "utf-16be", "utf-8", value ).first
        end
        # a parenthesis code in a formally correct utf-16 should nevertheless be escaped
        ret[key] = tmp_str.to_binary
        # The block form is required: in a replacement *string* the sequence
        # "\\" stands for one backslash, so backslashes would never be doubled
        ret[key].gsub!( /[\\()]/ ) { |c| "\\" + c }
      rescue
        $stderr.puts("Error: metadata should be specified in utf-8")
      end
    end
  end
  ret
end
ref(x) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 1053
# Format an object ID as an indirect reference with generation number 0.
def ref(x)
  sprintf( "%s 0 R",x )
end
safe_delete( path ) click to toggle source
# File lib/pdfbeads/pdfbuilder.rb, line 383
# Delete a file, reporting the outcome on stderr. A failure is reported
# but never raised.
#
# Only StandardError is handled, so an interrupt or an exit request raised
# during the call still propagates.
def safe_delete( path )
  File.delete( path )
  $stderr.puts( " Deleted #{path}" )
rescue StandardError => e
  $stderr.puts( "Could not delete #{path}: #{e.message}" )
end