class LLT::Tokenizer

Constants

ABBREVIATIONS

covers abbreviated Roman praenomina like Ti. in Ti. Claudius Nero, as well as Roman date expressions like a. d. V. Kal. Apr.

ABBR_NAME_WITH_DOT
APOSTROPHE_WORDS

covers a list of words that are abbreviated with an apostrophe, like satin' for satisne

ENCLITICS

laetusque to -que laetus
in eoque to -que in eo
honestumne to -ne honestum

but uterque, institutione, sive et al. remain untouched (see the sketch after this constants list).

iuvene might come as a surprise in these lists - it's a hack, but the word is special because it ends in both ve and ne, and both would get split. Such words might be so rare that we postpone proper handling for now.

ENCLITICS_MAP
MERGE_WORDS
PUNCTUATION
PUNCT_ITSELF
ROMAN_DATE_EXPR_WITH_DOT
VERSION
WORDS_ENDING_WITH_NE
WORDS_ENDING_WITH_QUE
WORDS_ENDING_WITH_VE
XML_TAG
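
A hedged sketch of how the enclitic handling above plays out through #tokenize (assuming LLT::Tokenizer.new works without arguments; the output is shown as expected token forms, not verified against the gem):

tokenizer = LLT::Tokenizer.new
tokenizer.tokenize('arma virumque cano')
# expected with the defaults (shifting: true, enclitics_marker: '-'):
#   arma  -que  virum  cano
tokenizer.tokenize('uterque venit')
# uterque survives in one piece - WORDS_ENDING_WITH_QUE blocks the split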

Attributes

default_options[R]

Public Class Methods

default_options()
# File lib/llt/tokenizer.rb, line 25
def self.default_options
  {
    shifting: true,
    enclitics_marker: '-',
    merging: true,
    indexing: true,
    splitting: true,
    xml: false,
    # for Greek
    krasis_marker: '-'
  }
end
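
The defaults can be overridden per call; a minimal sketch (option names taken from the hash above, the output shape is an assumption, not verified):

LLT::Tokenizer.new.tokenize('arma virumque cano', shifting: false, enclitics_marker: '')
# expected: arma  virum  que  cano   (no marker, enclitic stays behind its host word)
LLT::Tokenizer.new.tokenize('arma virumque cano', splitting: false)
# expected: arma  virumque  cano     (no enclitic splitting at all)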

Public Instance Methods

create_tokens()
# File lib/llt/tokenizer.rb, line 370
def create_tokens
  # calling #to_a retrieves (and aligns) optional metrical data
  reset_id
  @worker.to_a.map! do |el|
    case el
    when XML_TAG                  then Token::XmlTag.new(el)
    when ABBR_NAME_WITH_DOT       then raise_id and Token::Filler.new(el, @id)
    when ROMAN_DATE_EXPR_WITH_DOT then raise_id and Token::Filler.new(el, @id)
    when PUNCT_ITSELF             then raise_id and Token::Punctuation.new(el, @id)
    else                               raise_id and Token::Word.new(el, @id)
    end
  end
end
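
For illustration, the mapping a worker array like ['M.', 'Cicero', 'venit', '.'] would go through (class names from the case statement above; the regex pairing is an assumption):

# 'M.'     -> Token::Filler       (matches ABBR_NAME_WITH_DOT), id 1
# 'Cicero' -> Token::Word,                                      id 2
# 'venit'  -> Token::Word,                                      id 3
# '.'      -> Token::Punctuation  (matches PUNCT_ITSELF),       id 4
# an XML tag would become a Token::XmlTag and receive no id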
enclitic(val)
# File lib/llt/tokenizer.rb, line 201
def enclitic(val)
  "#{@enclitics_marker}#{val}"
end
find_abbreviations_and_join_strings()

%w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }

# File lib/llt/tokenizer.rb, line 138
def find_abbreviations_and_join_strings
  arr = []
  @worker.each_with_index do |e, i|
    n = @worker[i + 1]
    if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS) || greek_apostrophe(n,e)
      @worker[i + 1] = n.prepend(e)
      arr << (i - arr.size)
    end
  end

  arr.each { |i| @worker.delete_at(i) }
end
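
The stored index is i - arr.size because every earlier deletion shifts the remaining leftovers one slot to the left. A hedged trace (worker contents assumed, with both M. and C. matching ABBREVIATIONS):

worker = %w{ M . Tullius et C . Marius }
# after the loop the dots have absorbed their letters:
#   ["M", "M.", "Tullius", "et", "C", "C.", "Marius"], arr == [0, 3]
# deleting at 0, then at 3, leaves:
#   ["M.", "Tullius", "et", "C.", "Marius"]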
is_a_mergable_pair?(x, y)
# File lib/llt/tokenizer.rb, line 353
def is_a_mergable_pair?(x, y)
  # x, i.e. quam in quamdiu, needs to be downcased, as it could be in a
  # sentence's first position
  MERGE_WORDS.any? { |a, b| a === x.downcase && b === y  }
end
is_que?(element)
# File lib/llt/tokenizer.rb, line 251
def is_que?(element)
  element == enclitic('que')
end
led_by_preposition?(index)
# File lib/llt/tokenizer.rb, line 255
def led_by_preposition?(index)
  @worker[index - 1] =~ /^(in|ad|ob)$/i # and others
end
lookup(string, type, column, inflection_class = 3)
# File lib/llt/tokenizer.rb, line 321
def lookup(string, type, column, inflection_class = 3)
  string = (type == :persona ? string : string.downcase)
  query = {
            type: type, stem_type: column, stem: string,
            restrictions: { type: :inflection_class, values: Array(inflection_class) }
          }
  @db.look_up_stem(query)
end
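
A hedged example of the query such a call builds, using values from ne_corrections below:

lookup('ration', :noun, :stem, [3, 33])
# passes to @db.look_up_stem:
# { type: :noun, stem_type: :stem, stem: 'ration',
#   restrictions: { type: :inflection_class, values: [3, 33] } }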
make_frequent_corrections()
# File lib/llt/tokenizer.rb, line 225
def make_frequent_corrections
  # uses db lookups
  # # TODO 27.11.13 14:15 by LFDM
  # Implement caching here
  ne_corrections
  ve_corrections
  que_corrections
end
merge_what_needs_merging()

quam diu to quamdiu

# File lib/llt/tokenizer.rb, line 345
def merge_what_needs_merging
  to_delete = []
  @worker.each_overlapping_pair.each_with_index do |pair, i|
    merge_words(pair, i, to_delete) if is_a_mergable_pair?(*pair)
  end
  to_delete.each { |i| @worker.delete_at(i) }
end
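
A hedged sketch of the effect, assuming quam/diu is one of the MERGE_WORDS pairs as the example above suggests:

worker = %w{ quam diu id factum est }
# after merge_what_needs_merging:
#   ["quamdiu", "id", "factum", "est"]
# the first word of the pair absorbs the second, whose slot is deleted afterwards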
merge_words(pair, i, to_delete)
# File lib/llt/tokenizer.rb, line 359
def merge_words(pair, i, to_delete)
  pair.first << pair.last
  to_delete  << (i + 1 - to_delete.size)
end
ne_corrections()
# File lib/llt/tokenizer.rb, line 259
def ne_corrections
  corrections = []
  @worker.each_with_index do |w, i|
    if w == enclitic('ne')
      orig_el = original_word(i)

      entries = []
      entries += lookup(orig_el, :noun, :nom)           if orig_el =~ /io$/   # actio-ne ratio-ne
      entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/    # Plato-ne Cicero-ne Solo-ne
      entries += lookup(orig_el + "n", :noun, :stem, [3, 33])  # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne ig-ne
      entries += lookup(orig_el + "n", :noun, :stem, 2)                       # domi-ne
      entries += lookup(orig_el + "n", :adjective, :stem, [1,3])              # communis commune, or bonus

      entries += lookup(orig_el + "n", :persona, :stem, 2)                    # Pauli-ne

      if entries.any?(&:third_decl_with_possible_ne_abl?)
        corrections << i - corrections.size
      end

      if entries.any?(&:o_decl_with_possible_ne_voc?)
        corrections << i - corrections.size
      end
    end
  end

  reverse_splittings(corrections)
end
open_xml_tag?(str)
# File lib/llt/tokenizer.rb, line 123
def open_xml_tag?(str)
  str.start_with?('<') &! str.end_with?('>')
end
original_word(i)
# File lib/llt/tokenizer.rb, line 310
def original_word(i)
  # there are two possible scenarios at this point
  # with shifting enabled:
  #         i  i + 1
  #   arma que virum
  # with shifting disabled:
  #        i - 1  i
  #   arma virum que
  @worker[i + (@shifting ? 1 : -1)]
end
preliminary()
# File lib/llt/tokenizer.rb, line 398
def preliminary
  @worker.to_a
end
put_xml_attributes_back_together(elements)
# File lib/llt/tokenizer.rb, line 103
def put_xml_attributes_back_together(elements)
  as = ArrayScanner.new(elements)
  loop do
    last = as.look_behind.to_s # catch nil
    if open_xml_tag?(last)
      number_of_xml_elements = as.peek_until do |el|
        el.end_with?('>')
      end.size + 1

      number_of_xml_elements.times do
        last << ' ' << as.current
        elements.delete_at(as.pos)
      end
    else
      as.forward(1)
    end
    break if as.eoa?
  end
end
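
A hedged illustration of why this is needed: split_and_space_text breaks on whitespace, so a tag with attributes arrives in pieces and is glued back together here (element values assumed):

elements = ['<l', 'n="1">', 'arma', 'virumque', 'cano', '</l>']
# after put_xml_attributes_back_together(elements):
#   ['<l n="1">', 'arma', 'virumque', 'cano', '</l>']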
que_corrections()
# File lib/llt/tokenizer.rb, line 234
def que_corrections
  # this is used only in rare cases like in eoque
  # which needs a shift to -que in eo
  if @shifting
    to_be_shifted_que_indices.each do |i|
      @worker.insert(i - 1, @worker.delete_at(i))
    end
  end
end
raise_id()
# File lib/llt/tokenizer.rb, line 388
def raise_id
  if @indexing
    @id += 1
  else
    # need to return true because this is used as first part
    # of an and construction
    true
  end
end
reset_id()
# File lib/llt/tokenizer.rb, line 384
def reset_id
  @id = (@indexing ? @id = 0 : nil)
end
reverse_splittings(indices)
# File lib/llt/tokenizer.rb, line 330
def reverse_splittings(indices)
  indices.each do |i|
    # need to retrieve the orig word before the splitted var is
    # assigned, as it deletes something in the worker
    ow = original_word(i)
    splitted  = @worker.delete_at(i).delete(@enclitics_marker)
    ow << splitted
  end
end
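
A hedged trace with shifting enabled, for a -ne split that turned out to be wrong (worker contents assumed):

# worker holds ["-ne", "ratio"] at positions i and i + 1
# ow       = original_word(i)                    # => "ratio" (the same String object)
# splitted = @worker.delete_at(i).delete('-')    # => "ne", the "-ne" slot is gone
# ow << splitted                                 # the worker now reads "ratione" again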
setup(text, options = {}, worker = [])
# File lib/llt/tokenizer.rb, line 55
def setup(text, options = {}, worker = [])
  @text = text
  evaluate_metrical_presence(@text)
  @enclitics_marker = parse_option(:enclitics_marker, options)
  @merging          = parse_option(:merging, options)
  @shifting         = parse_option(:shifting, options)
  @splitting        = parse_option(:splitting, options)
  @indexing         = parse_option(:indexing, options)
  @xml              = parse_option(:xml, options)
  # for Greek
  @krasis_marker    = parse_option(:krasis_marker, options)
  @worker = setup_worker(worker)
  @shift_range = shift_range(@shifting)
end
setup_worker(worker)

This is here for two reasons:

1) easier test setup, when a preliminary result is to be evaluated further

2) more importantly, adding a level of indirection when
   the given text holds metrical information. It adds a
   substitute implementation for the worker array, but only
   if it's needed - which should perform better when there
   are no metrics involved (the default case)
# File lib/llt/tokenizer.rb, line 81
def setup_worker(worker)
  return worker if worker.any?

  elements = split_and_space_text
  put_xml_attributes_back_together(elements) if @xml

  if metrical?
    Worker.new(elements, @enclitics_marker)
  else
    elements
  end
end
shift_range(shifting_enabled)
# File lib/llt/tokenizer.rb, line 94
def shift_range(shifting_enabled)
  shifting_enabled ? 0 : 1
end
split_and_space_text()
# File lib/llt/tokenizer.rb, line 98
def split_and_space_text
  regex = @xml ? Regexp.union(XML_TAG, PUNCTUATION) : PUNCTUATION
  @text.gsub(regex, ' \0 ').split
end
split_enklitika_and_change_their_position()
# File lib/llt/tokenizer.rb, line 170
def split_enklitika_and_change_their_position
  split_with_force
  split_frequent_enclitics # like latin c, ve or greek te, de
  make_frequent_corrections
end
split_enklitikon(encl, restrictors)
# File lib/llt/tokenizer.rb, line 186
def split_enklitikon(encl, restrictors)
  # needs a word character in front - ne itself should be contained
  regexp = /(?<=\w)#{encl}$/

  indices = []
  @worker.each_with_index do |token, i|
    if token.match(regexp) && restrictors !~ token
      token.slice!(regexp)
      indices << (i + indices.size + @shift_range)
    end
  end

  indices.each { |i| @worker.insert(i, enclitic(encl)) }
end
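
A hedged trace of the index arithmetic (worker contents assumed):

# worker: ["arma", "virumque", "cano"], encl = "que", shifting enabled (@shift_range == 0)
# "virumque" is sliced down to "virum"; indices << (1 + 0 + 0)
# inserting "-que" at 1 pushes its host to the right: ["arma", "-que", "virum", "cano"]
# with shifting disabled (@shift_range == 1) the insert lands at index 2 instead:
#   ["arma", "virum", "-que", "cano"]
# indices.size compensates for enclitics already inserted earlier in the same pass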
split_frequent_enclitics()
# File lib/llt/tokenizer.rb, line 212
def split_frequent_enclitics
  container = []
  @worker.each_with_index do |token, i|
    ENCLITICS_MAP.each do |regex, encl|
      if token.match(regex)
        token.slice!(-encl.length, encl.length)
        container << [encl, (i + container.size + @shift_range)]
      end
    end
  end
  container.each { |encl, i| @worker.insert(i, enclitic(encl)) }
end
split_with_force()
# File lib/llt/tokenizer.rb, line 176
def split_with_force
  # uses brute force at first
  # the restrictor regexps handle only obvious cases

  # don't use c here atm
  ENCLITICS[0..-2].each do |encl|
    split_enklitikon(encl, self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}"))
  end
end
to_be_shifted_que_indices()
# File lib/llt/tokenizer.rb, line 244
def to_be_shifted_que_indices
  # double shifts would properly fail, but they might never happen
  @worker.each_with_index.each_with_object([]) do |(element, index), accumulator|
    accumulator << index if is_que?(element) && led_by_preposition?(index)
  end
end
tokenize(text, add_to: nil, **options)
# File lib/llt/tokenizer.rb, line 38
def tokenize(text, add_to: nil, **options)
  raise ArgumentError.new("The argument passed must be a String") unless text.is_a?(String)
  return [] if text.empty?

  setup(text, options)

  find_abbreviations_and_join_strings
  # for Greek
  split_krasis if @splitting
  split_enklitika_and_change_their_position if @splitting
  merge_what_needs_merging if @merging # quam diu => quamdiu
  tokens = create_tokens

  add_to << tokens if add_to.respond_to?(:<<)
  tokens
end
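
A hedged end-to-end sketch (assuming LLT::Tokenizer.new needs no arguments and that the token forms come out as described under create_tokens):

tokenizer = LLT::Tokenizer.new
tokens    = tokenizer.tokenize('Arma virumque cano.')
# expected token forms with the defaults: Arma  -que  virum  cano  .

container = []
tokenizer.tokenize('Arma virumque cano.', add_to: container)
# the whole token array is pushed onto container via <<,
# so container now holds that array as its single element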
ve_corrections()
# File lib/llt/tokenizer.rb, line 287
def ve_corrections
  corrections = []
  @worker.each_with_index do |w, i|
    if w == enclitic('ve')
      orig_el = original_word(i)

      entries = []
      entries += lookup(orig_el + 'v',  :adjective, :stem, 1)
      entries += lookup(orig_el + 'v',  :adjective, :stem, 3)
      entries += lookup(orig_el + 'v',  :noun,      :stem, [2, 33, 5])
      entries += lookup(orig_el + 'v',  :persona,   :stem, 3)
      entries += lookup(orig_el + 've', :verb,      :pr,   2)
      entries += lookup(orig_el + 'v',  :verb,      :pr,   [3, 5]) # not sure if such a word of 5 exists

      if entries.any?
        corrections << i - corrections.size
      end
    end
  end

  reverse_splittings(corrections)
end