class HexaPDF::Font::CMap::Parser

Parses CMap files.

See: Adobe Technical Notes #5014 and #5411

Public Instance Methods

parse(string) click to toggle source

Parses the given string and returns a CMap object.

# File lib/hexapdf/font/cmap/parser.rb, line 50
# Tokenizes +string+ and returns a CMap filled from the operators and dictionary entries
# found in it.
#
# A name (Symbol) token is always paired with the token that follows it: if the follower is
# the keyword +usecmap+, the named CMap is merged in; if it is any other keyword, the pair is
# dropped; otherwise the pair is handled as a dictionary entry. A keyword token starts the
# matching section parser, and +endcmap+ stops parsing. All other tokens are ignored.
#
# Any StandardError raised on the way is re-raised as HexaPDF::Error with the original
# backtrace.
def parse(string)
  tokenizer = HexaPDF::Content::Tokenizer.new(string)
  cmap = CMap.new

  while (token = tokenizer.next_token) != HexaPDF::Tokenizer::NO_MORE_TOKENS
    if token.kind_of?(Symbol)
      # The token after a name decides whether this is "/Name usecmap" or a key/value pair.
      follower = tokenizer.next_token
      if !follower.kind_of?(HexaPDF::Tokenizer::Token)
        parse_dict_mapping(cmap, token, follower)
      elsif follower == 'usecmap'
        parse_cmap(cmap, token)
      end
    elsif token.kind_of?(HexaPDF::Tokenizer::Token)
      case token
      when 'endcmap' then break
      when 'begincodespacerange' then parse_codespace_range(tokenizer, cmap)
      when 'begincidrange' then parse_cid_range(tokenizer, cmap)
      when 'begincidchar' then parse_cid_char(tokenizer, cmap)
      when 'beginbfrange' then parse_bf_range(tokenizer, cmap)
      when 'beginbfchar' then parse_bf_char(tokenizer, cmap)
      end
    end
  end

  cmap
rescue StandardError => error
  raise HexaPDF::Error, "Error parsing CMap: #{error.message}", error.backtrace
end

Private Instance Methods

bytes_to_int(string) click to toggle source

Treats the string as an array of bytes and converts it to an integer.

The bytes are interpreted in big-endian order, i.e. the first byte is the most significant one.

# File lib/hexapdf/font/cmap/parser.rb, line 184
# Interprets +string+ as a big-endian sequence of bytes and returns the resulting integer.
#
# The raw bytes are used regardless of the string's encoding, so a string that contains
# multi-byte characters contributes every one of its bytes (the character count is
# irrelevant). An empty string yields 0.
def bytes_to_int(string)
  string.each_byte.reduce(0) {|result, byte| (result << 8) | byte }
end
parse_bf_char(tokenizer, cmap) click to toggle source

Parses the “bfchar” operator at the current position.

# File lib/hexapdf/font/cmap/parser.rb, line 138
# Reads pairs of tokens from +tokenizer+ until a keyword token is encountered (that keyword
# is consumed).
#
# The first token of each pair is the character code as a byte string, the second one is
# the destination string which is converted in place from UTF-16BE to UTF-8. Each pair is
# registered on +cmap+ via +add_unicode_mapping+.
def parse_bf_char(tokenizer, cmap)
  source = tokenizer.next_token
  until source.kind_of?(HexaPDF::Tokenizer::Token)
    unicode = tokenizer.next_token.encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
    cmap.add_unicode_mapping(bytes_to_int(source), unicode)
    source = tokenizer.next_token
  end
end
parse_bf_range(tokenizer, cmap) click to toggle source

Parses the “bfrange” operator at the current position.

# File lib/hexapdf/font/cmap/parser.rb, line 158
# Reads (first code, last code, destination) triples from +tokenizer+ until a keyword token
# is encountered (that keyword is consumed) and registers the resulting mappings on +cmap+
# via +add_unicode_mapping+.
#
# The destination is either
#
# * a UTF-16BE string: the first code maps to the whole string and each following code maps
#   to the same string with its last code point incremented by one more. Leading characters
#   (e.g. of a ligature expansion) are kept for every code of the range.
#
# * an array of UTF-16BE strings: the n-th code of the range maps to the n-th string, which
#   is converted in place to UTF-8.
#
# Raises HexaPDF::Error if the destination is neither a string nor an array, if a string
# destination is empty or if an array destination has no string for a code of the range.
def parse_bf_range(tokenizer, cmap)
  until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
    code1 = bytes_to_int(code1)
    code2 = bytes_to_int(tokenizer.next_token)
    dest = tokenizer.next_object

    case dest
    when String
      codepoints = dest.force_encoding(::Encoding::UTF_16BE).codepoints
      raise HexaPDF::Error, "Invalid bfrange operator in CMap" if codepoints.empty?

      # Only the last code point is incremented over the range, the rest is a fixed prefix.
      last = codepoints.pop
      prefix = codepoints.each_with_object(+'') {|codepoint, str| str << codepoint }
      code1.upto(code2) do |code|
        cmap.add_unicode_mapping(code, prefix.dup << last)
        last += 1
      end
    when Array
      code1.upto(code2) do |code|
        str = dest[code - code1]
        # A too short array would otherwise fail with a NoMethodError on nil.
        raise HexaPDF::Error, "Invalid bfrange operator in CMap" unless str.kind_of?(String)
        cmap.add_unicode_mapping(code, str.encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE))
      end
    else
      raise HexaPDF::Error, "Invalid bfrange operator in CMap"
    end
  end
end
parse_cid_char(tokenizer, cmap) click to toggle source

Parses the “cidchar” operator at the current position.

# File lib/hexapdf/font/cmap/parser.rb, line 116
# Reads pairs of tokens from +tokenizer+ until a keyword token is encountered (that keyword
# is consumed).
#
# The first token of each pair is the character code as a byte string, the second one is
# passed on unchanged as the CID. Each pair is registered on +cmap+ via +add_cid_mapping+.
def parse_cid_char(tokenizer, cmap)
  code = tokenizer.next_token
  until code.kind_of?(HexaPDF::Tokenizer::Token)
    character_code = bytes_to_int(code)
    cmap.add_cid_mapping(character_code, tokenizer.next_token)
    code = tokenizer.next_token
  end
end
parse_cid_range(tokenizer, cmap) click to toggle source

Parses the “cidrange” operator at the current position.

# File lib/hexapdf/font/cmap/parser.rb, line 123
# Reads (first code, last code, start CID) triples from +tokenizer+ until a keyword token is
# encountered (that keyword is consumed).
#
# A triple whose first and last codes are equal is registered on +cmap+ as a single mapping
# via +add_cid_mapping+, every other triple as a range via +add_cid_range+.
def parse_cid_range(tokenizer, cmap)
  first = tokenizer.next_token
  until first.kind_of?(HexaPDF::Tokenizer::Token)
    low = bytes_to_int(first)
    high = bytes_to_int(tokenizer.next_token)
    cid_start = tokenizer.next_object

    if low != high
      cmap.add_cid_range(low, high, cid_start)
    else
      cmap.add_cid_mapping(low, cid_start)
    end

    first = tokenizer.next_token
  end
end
parse_cmap(cmap, name) click to toggle source

Populates the CMap with the values from the CMap with the given name.

# File lib/hexapdf/font/cmap/parser.rb, line 82
# Looks up the CMap called +name+ (converted to a string) via CMap.for_name and hands it to
# +cmap.use_cmap+, whose result is returned.
def parse_cmap(cmap, name)
  referenced_cmap = CMap.for_name(name.to_s)
  cmap.use_cmap(referenced_cmap)
end
parse_codespace_range(tokenizer, cmap) click to toggle source

Parses the “begincodespacerange” operator at the current position.

# File lib/hexapdf/font/cmap/parser.rb, line 104
# Reads pairs of byte strings from +tokenizer+ until a keyword token is encountered (that
# keyword is consumed).
#
# For each pair one range per byte of the first string is built, spanning from that byte to
# the byte at the same position of the second string. The ranges of a pair are passed
# together to +cmap.add_codespace_range+.
def parse_codespace_range(tokenizer, cmap)
  lower = tokenizer.next_token
  until lower.kind_of?(HexaPDF::Tokenizer::Token)
    upper = tokenizer.next_token
    byte_ranges = lower.each_byte.each_with_index.map do |byte, position|
      (byte..(upper.getbyte(position)))
    end
    cmap.add_codespace_range(*byte_ranges)
    lower = tokenizer.next_token
  end
end
parse_dict_mapping(cmap, name, value) click to toggle source

Parses a single mapping of a dictionary pair. The name and value of the mapping have already been parsed.

# File lib/hexapdf/font/cmap/parser.rb, line 88
# Stores a single, already parsed dictionary entry on +cmap+.
#
# Recognized names and the accepted value types:
#
# :Registry, :Ordering:: String values, tagged in place as UTF-8.
# :Supplement:: Integer values.
# :CMapName:: Symbol values, stored as a UTF-8 string.
# :WMode:: Stored as is, without any type check.
#
# Entries with any other name, or with a value of an unexpected type, are ignored.
def parse_dict_mapping(cmap, name, value)
  case name
  when :WMode
    cmap.wmode = value
  when :CMapName
    if value.is_a?(Symbol)
      cmap.name = value.to_s.dup.force_encoding(::Encoding::UTF_8)
    end
  when :Supplement
    if value.is_a?(Integer)
      cmap.supplement = value
    end
  when :Ordering
    if value.is_a?(String)
      cmap.ordering = value.force_encoding(::Encoding::UTF_8)
    end
  when :Registry
    if value.is_a?(String)
      cmap.registry = value.force_encoding(::Encoding::UTF_8)
    end
  end
end