class LLT::Tokenizer::Worker

Public Class Methods

new(metric_text, marker) click to toggle source

TODO (28.11.13 11:45, by LFDM): Are there edge cases? How should merged words be handled?

# File lib/llt/tokenizer/worker.rb, line 17
# Sets up a worker for one tokenized text.
#
# metric_text - Enumerable of tokens; each one is passed through wo_meter to
#               build the bare counterpart kept in @bare_text.
# marker      - value prepended (via string interpolation) to every entry of
#               ENCLITICS to build @marked_enclitics.
def initialize(metric_text, marker)
  @metric_text = metric_text
  # The bare text is derived before @marker is assigned, as wo_meter is
  # defined elsewhere and the original ordering is kept on purpose.
  @bare_text = @metric_text.map do |tok|
    wo_meter(tok)
  end
  @marker = marker
  @marked_enclitics = ENCLITICS.map do |enclitic|
    "#{@marker}#{enclitic}"
  end
end

Public Instance Methods

to_a() click to toggle source
# File lib/llt/tokenizer/worker.rb, line 24
# Runs the alignment pass and hands back @metric_text.
#
# align_metrical_text edits the array in place, so the object yielded to
# tap is the very same one that is returned after the alignment has run.
def to_a
  @metric_text.tap { align_metrical_text }
end

Private Instance Methods

align_metrical_text() click to toggle source

One ugly method, but we don't want to slow it down even more

# File lib/llt/tokenizer/worker.rb, line 32
# Steps through @metric_text and @bare_text side by side (one ArrayScanner
# each) and edits @metric_text in place - through insert!, slice_encl! and
# append_from_deleted_index! - wherever a metric element, stripped of its
# meter by wo_meter, differs from the bare element at the same step.
#
# The loop ends once the bare scanner answers eoa? with a truthy value.
#
# NOTE(review): this depends on ArrayScanner#scan, #current, #peek, #pos and
# #eoa?, none of which are visible here. Presumably scan returns the current
# element and advances, and the metric scanner sees the in-place changes made
# to @metric_text - confirm against the ArrayScanner implementation.
def align_metrical_text
  m = ArrayScanner.new(@metric_text)
  b = ArrayScanner.new(@bare_text)
  loop do
    # metric element
    x = m.scan
    # bare element
    y = b.scan
    no_meter = wo_meter(x)

    # We don't have to do anything if the dequantified metric element
    # was the same as the bare element - the metric_text was right
    # at this position.
    unless no_meter == y

      # If the bare element was a marked enclitic, it must have been
      # shifted. We're looking for the next metric token that has it
      # attached and try to find the string index where it starts, in
      # order to slice it off.
      # Usually the metric element just scanned (x) will have it. If we
      # don't find it there, a double shift has occurred and it should sit
      # right at the current element of the metric ArrayScanner (m).
      # The enclitic (sliced off x) has to be inserted one position before.
      if @marked_enclitics.include?(y)
        # The marker is removed from the bare element to get the plain
        # enclitic, which is then matched at the end of the token.
        clean_encl_re = /#{y.dup.delete(@marker)}$/
        unless index = no_meter =~ clean_encl_re
          x = m.current
          index = wo_meter(x) =~ clean_encl_re
        end
        # slice_encl! cuts the enclitic out of x in place; insert! puts it
        # back into @metric_text with the marker prepended.
        insert!(slice_encl!(x, index), m.pos - 1)

      # If the dequantified metric element has an enclitic attached, the
      # option shifting: false must have been given. The enclitic will
      # follow right after in the @bare_text, we can therefore slice and
      # insert right in place (the next scan round will reveal that the
      # enclitic in metric_text == the enclitic in bare_text).
      elsif encl = ENCLITICS.find { |e| no_meter.end_with?(e) }
        index = no_meter =~ /#{encl}$/
        insert!(slice_encl!(x, index), m.pos)

      # If the bare element has a dot attached, it must have been an
      # abbreviation.
      # The . will appear right afterwards in the metric text. We can
      # delete it and append it to the last scanned metric element (x).
      #
      # We need to do the same if merged words were present.
      # Example: the last metric element was quam, the bare element is
      # quamdiu. We append if the last metric element + the next metric
      # element is the same as the bare element.
      elsif y.end_with?('.') || merged_words_present?(no_meter, y, m)
        append_from_deleted_index!(x, m.pos)
      end
    end
    break if b.eoa?
  end
end
append_from_deleted_index!(token, index) click to toggle source
# File lib/llt/tokenizer/worker.rb, line 97
# Removes the element stored at +index+ from @metric_text and appends it to
# +token+ in place.
#
# token - object that receives the removed element through <<.
# index - position inside @metric_text of the element to remove.
#
# Returns whatever token's << returns.
def append_from_deleted_index!(token, index)
  removed = @metric_text.delete_at(index)
  token << removed
end
insert!(enclitic, position) click to toggle source
# File lib/llt/tokenizer/worker.rb, line 89
# Puts +enclitic+, prefixed with @marker, into @metric_text at +position+.
#
# enclitic - value interpolated after the marker.
# position - index handed to Array#insert.
#
# Returns the result of the insert call on @metric_text.
def insert!(enclitic, position)
  marked = "#{@marker}#{enclitic}"
  @metric_text.insert(position, marked)
end
merged_words_present?(last_metric, last_bare, metric_arr_scanner) click to toggle source
# File lib/llt/tokenizer/worker.rb, line 101
# Tells whether +last_metric+ joined with the element the metric scanner
# peeks at (after wo_meter has been applied to it) equals +last_bare+.
#
# last_metric        - the metric element already stripped of its meter.
# last_bare          - the bare element it is compared against.
# metric_arr_scanner - scanner whose peek supplies the following element.
def merged_words_present?(last_metric, last_bare, metric_arr_scanner)
  upcoming = wo_meter(metric_arr_scanner.peek)
  joined = last_metric + upcoming
  joined == last_bare
end
slice_encl!(token, index) click to toggle source
# File lib/llt/tokenizer/worker.rb, line 93
# Cuts everything from +index+ up to the end out of +token+ (in place) and
# returns the part that was removed.
#
# token - object answering slice!, modified in place.
# index - start of the range that is removed; the range ends at -1.
def slice_encl!(token, index)
  tail = Range.new(index, -1)
  token.slice!(tail)
end