class Glaemscribe::API::UpDownTehtaSplitPreProcessorOperator

Attributes

consonant_list[R]
vowel_list[R]

Public Instance Methods

apply(content) click to toggle source
# File lib/api/pre_processor/up_down_tehta_split.rb, line 115
def apply(content)
  accumulated_word = ""
  
  ret = ""
  
  content.split(//).each{ |letter|
    if @word_split_map[letter]
      accumulated_word += letter
    else
      ret += apply_to_word(accumulated_word)
      ret += letter
      accumulated_word = ""
    end        
  }
  ret += apply_to_word(accumulated_word) 
  ret   
end
apply_to_word(w) click to toggle source
# File lib/api/pre_processor/up_down_tehta_split.rb, line 62
def apply_to_word(w)
  res = []
  
  if w.strip.empty?
    res << w
  else
    while w.length != 0
      r, len = @splitter_tree.transcribe(w)
         
      if r != [UNKNOWN_CHAR_OUTPUT]
        res << r 
      else
        res << w[0..0] # r
      end
    
      w = w[len..-1]
    end
  end
  
  res_modified = []

  # We replace the pattern CVC by CvVC where v is a phantom vowel.
  # This makes the pattern CVC not possible.
  i = 0
  while i < res.count - 2 do
    
    r0 = res[i]
    r1 = res[i+1]
    r2 = res[i+2]
    t0 = type_of(r0)
    t1 = type_of(r1)
    t2 = type_of(r2)
  
    if t0 == "C" && t1 == "V" && t2 == "C"
      res_modified << res[i]
      res_modified << "@"
      res_modified << res[i+1] 
      i += 2
    else
      res_modified << res[i]
      i += 1
    end
  end

  # Add the remaining stuff
  while i < res.count
    res_modified << res[i]
    i += 1
  end

  return res_modified.join("")       
end
finalize(trans_options) click to toggle source
Calls superclass method
# File lib/api/pre_processor/up_down_tehta_split.rb, line 29
def finalize(trans_options)
  super(trans_options)
  
  vowel_list          = finalized_glaeml_element.args[0]
  consonant_list      = finalized_glaeml_element.args[1]
      
  vowel_list          = vowel_list.split(/,/).map{|s| s.strip}
  consonant_list      = consonant_list.split(/,/).map{|s| s.strip}  
     
  @vowel_map          = {} # Recognize vowel tokens
  @consonant_map      = {} # Recognize consonant tokens
  @splitter_tree      = TranscriptionTreeNode.new(nil,nil) # Recognize tokens
  @word_split_map     = {}
  # The word split map will help to recognize words
  # The splitter tree will help to split words into tokens
  
  vowel_list.each      { |v| @splitter_tree.add_subpath(v, v); @vowel_map[v] = v }
  consonant_list.each  { |c| @splitter_tree.add_subpath(c, c); @consonant_map[c] = c}

  all_letters = (vowel_list + consonant_list).join("").split(//).sort.uniq    
  all_letters.each{ |l| @word_split_map[l] = l }
end
type_of(token) click to toggle source
# File lib/api/pre_processor/up_down_tehta_split.rb, line 52
def type_of(token)
  if @vowel_map[token]        
    return "V"
  elsif @consonant_map[token] 
    return "C"
  else                        
    return "X"
  end            
end