module HexaPDF::Layout::TextLayouter::SimpleTextSegmentation

Implementation of a simple text segmentation algorithm.

The algorithm breaks TextFragment objects into objects wrapped by Box, Glue or Penalty items, and inserts additional Penalty items when needed.

Constants

BREAK_RE

Breaks are detected at: space, tab, zero-width space, non-breaking space, hyphen, soft hyphen and any valid Unicode newline separator.

Public Class Methods

call(items) click to toggle source

Breaks the items (an array of InlineBox and TextFragment objects) into atomic pieces wrapped by Box, Glue or Penalty items, and returns those as an array.

# File lib/hexapdf/layout/text_layouter.rb, line 211
def self.call(items)
  result = []
  glues = {}
  penalties = {}
  items.each do |item|
    if item.kind_of?(InlineBox)
      result << Box.new(item)
    else
      i = 0
      while i < item.items.size
        # Collect characters and kerning values until break character is encountered
        box_items = []
        while (glyph = item.items[i]) &&
            (glyph.kind_of?(Numeric) || !BREAK_RE.match?(glyph.str))
          box_items << glyph
          i += 1
        end

        # A hyphen belongs to the text fragment
        box_items << glyph if glyph && !glyph.kind_of?(Numeric) && glyph.str == '-'

        unless box_items.empty?
          result << Box.new(TextFragment.new(box_items.freeze, item.style))
        end

        if glyph
          case glyph.str
          when ' '
            glues[item.style] ||=
              Glue.new(TextFragment.new([glyph].freeze, item.style))
            result << glues[item.style]
          when "\n", "\v", "\f", "\u{85}", "\u{2029}"
            penalties[item.style] ||=
              Penalty.new(Penalty::PARAGRAPH_BREAK, 0,
                          item: TextFragment.new([].freeze, item.style))
            result << penalties[item.style]
          when "\u{2028}"
            result << Penalty.new(Penalty::LINE_BREAK, 0,
                                  item: TextFragment.new([].freeze, item.style))
          when "\r"
            if !item.items[i + 1] || item.items[i + 1].kind_of?(Numeric) ||
                item.items[i + 1].str != "\n"
              penalties[item.style] ||=
                Penalty.new(Penalty::PARAGRAPH_BREAK, 0,
                            item: TextFragment.new([].freeze, item.style))
              result << penalties[item.style]
            end
          when '-'
            result << Penalty::Standard
          when "\t"
            spaces = [item.style.font.decode_utf8(" ").first] * 8
            result << Glue.new(TextFragment.new(spaces.freeze, item.style))
          when "\u{00AD}"
            hyphen = item.style.font.decode_utf8("-").first
            frag = TextFragment.new([hyphen].freeze, item.style)
            result << Penalty.new(Penalty::Standard.penalty, frag.width, item: frag)
          when "\u{00A0}"
            space = item.style.font.decode_utf8(" ").first
            frag = TextFragment.new([space].freeze, item.style)
            result << Penalty.new(Penalty::ProhibitedBreak.penalty, frag.width, item: frag)
          when "\u{200B}"
            result << Penalty.new(0)
          end
        end
        i += 1
      end
    end
  end
  result
end