class MorMor::Dictionary

Morfologik dictionary client.

@example

dictionary = MorMor::Dictionary.new('path/to/english')
dictionary.lookup('meowing')
# => [#<struct MorMor::Dictionary::Word stem="meow", tags="VBG">]

Constants

DECODERS

@private

Word

Result of {Dictionary#lookup}

`stem` is the base form of the looked-up word; `tags` are the dictionary-dependent part-of-speech / word-form tags.

Attributes

fsa[R]

@private

info[R]

@return [Hash]

Public Class Methods

new(path) click to toggle source

@param path [String] Path to dictionary files. It is expected that `path + ".info"` and

`path + ".dict"` files exist and contain a Morfologik dictionary.
# File lib/mormor/dictionary.rb, line 35
# Builds a dictionary from a pair of files sharing the +path+ prefix:
# `path + '.info'` is handed to #read_info, `path + '.dict'` to FSA.read.
#
# @param path [String] common prefix (without extension) of the dictionary files
def initialize(path)
  info_file = path + '.info'
  dict_file = path + '.dict'

  # Remembered only so that #inspect can show it.
  @path = path

  # Metadata first, then the automaton itself.
  read_info(info_file)
  @fsa = FSA.read(dict_file)
end

Public Instance Methods

inspect() click to toggle source

@return [String]

# File lib/mormor/dictionary.rb, line 44
# Short human-readable representation: the class name followed by the path
# the dictionary was created with.
#
# @return [String]
def inspect
  format('#<%s %s>', self.class, @path)
end
lookup(word) click to toggle source

Finds all forms and POS tags of words in the dictionary.

@param word [String] a word to look up

@return [Array<Word>, nil]

# File lib/mormor/dictionary.rb, line 52
# Looks +word+ up in the dictionary and returns every stem/tags pair stored for it.
#
# The word is transcoded to the dictionary encoding, matched against the FSA, and,
# when a separator arc follows the matched word, every sequence reachable after that
# arc is decoded into a Word.
#
# @param word [String] a word to look up
# @return [Array<Word>, nil] nil when the match kind is not :sequence_is_a_prefix,
#   or when no usable separator arc follows the word
def lookup(word) # rubocop:disable Metrics/AbcSize
  # Deliberately kept as one method so that the original algorithm
  # (DictionaryLookup.java#lookup) stays recognizable, hence rubocop:disable

  raw = word.encode(@encoding).force_encoding('ASCII-8BIT')

  # TODO: there could be "input conversion pairs"

  # Signed bytes on purpose: morfologik expects signed bytes, while String#bytes
  # behaves like unpack('C*') and returns unsigned ones.
  signed_bytes = raw.unpack('c*')
  match = fsa.match(signed_bytes)

  # OC: any other match kind is somewhat confusing: we should have hit the
  # separator first... I don't really know how to deal with it at the time being.
  return unless match.kind == :sequence_is_a_prefix

  # OC: The entire sequence exists in the dictionary. A separator should
  # be the next symbol.
  separator_arc = fsa.find_arc(match.node, @sepbyte)

  # A zero arc is treated as "no separator arc".
  return if separator_arc.zero?

  # OC: The situation when the arc points to a final node should NEVER
  # happen. After all, we want the word to have SOME base form.
  return if fsa.final_arc?(separator_arc)

  # OC: There is such a word in the dictionary. Return its base forms.
  start_node = fsa.end_node(separator_arc)
  fsa.each_sequence(from: start_node).map do |encoded|
    # TODO: there could be "output conversion pairs"

    in_dict_encoding = @decoder.call(raw, encoded).force_encoding(@encoding)
    stem_and_tags = in_dict_encoding.encode('UTF-8').split(@separator, 2)

    Word.new(*stem_and_tags)
  end
end

Private Instance Methods

choose_decoder(name) click to toggle source
# File lib/mormor/dictionary.rb, line 110
# Resolves the encoder name from the dictionary metadata to a decoding method.
#
# The name is upcased before being looked up in DECODERS; the value found there
# is used as the name of a method of this object, which is returned as a Method.
#
# @param name [String] encoder name
# @return [Method]
# @raise [ArgumentError] when DECODERS has no entry for the upcased name
def choose_decoder(name)
  decoder_name = DECODERS.fetch(name.upcase) do
    fail ArgumentError, "Encoder #{name} is not supported yet"
  end

  method(decoder_name)
end
prefix_suffix(source, encoded) click to toggle source
# File lib/mormor/dictionary.rb, line 121
# Decodes a base form stored with prefix-and-suffix trimming.
#
# The first two bytes of +encoded+ are trim codes ('A' = 0, 'B' = 1, ...): how many
# bytes to drop from the beginning and from the end of +source+. The rest of
# +encoded+ is appended to what remains of +source+.
#
# A trim code of 255 in either position means "remove everything": the whole
# +source+ is dropped and only the remainder of +encoded+ is returned.
#
# @param source [String] looked-up word; #lookup passes it as a binary (ASCII-8BIT)
#   string, so `size` counts bytes
# @param encoded [String] sequence read from the dictionary: two trim-code bytes
#   followed by the replacement text
# @return [String]
def prefix_suffix(source, encoded)
  truncate_pref, truncate_suf = encoded.byteslice(0, 2).bytes.map { |b| (b - 65) & 0xff } # 65 is 'A'

  # 255 is the "remove all" marker; without this special case a prefix code of 255
  # made the slice below return nil and the concatenation raise NoMethodError.
  if truncate_pref == 255 || truncate_suf == 255
    truncate_pref = source.size
    truncate_suf = 0
  end

  source[truncate_pref...source.size - truncate_suf] + encoded[2..-1]
end
read_info(path) click to toggle source
# File lib/mormor/dictionary.rb, line 89
# Loads dictionary metadata from the `.info` file at +path+ and caches the values
# needed on every lookup in instance variables.
#
# @param path [String] path to the `.info` file
# @raise [KeyError] when one of the fetched attributes is absent from the file
def read_info(path)
  @info = read_values(path)

  # NB: All possible values described in DictionaryAttribute.java

  # Cached for quick access during lookups.
  @encoding = @info.fetch('fsa.dict.encoding')
  @separator = @info.fetch('fsa.dict.separator')
  # First byte of the separator; #lookup uses it to find the separator arc.
  @sepbyte = @separator.getbyte(0)

  encoder_name = @info.fetch('fsa.dict.encoder')
  @decoder = choose_decoder(encoder_name)
end
read_values(path) click to toggle source
# File lib/mormor/dictionary.rb, line 102
# Parses a properties-like `.info` file into a Hash of strings.
#
# Everything from a `#` to the end of the line is dropped as a comment (so values
# cannot contain `#`), blank lines are skipped, and each remaining line is split
# on the first `=`. Whitespace around the key and the value is stripped, so both
# `key=value` and `key = value` produce `"key" => "value"`.
#
# @param path [String] path to the `.info` file
# @return [Hash{String => String}]
# @raise [ArgumentError] when the file does not exist
def read_values(path)
  raise ArgumentError, "#{path} does not exist" unless File.exist?(path)

  File.read(path).split("\n")
      .map { |ln| ln.sub(/\#.*$/, '').strip }.reject(&:empty?)
      .map { |ln| ln.split('=', 2).map(&:strip) }
      .to_h
end
suffix(source, encoded) click to toggle source
# File lib/mormor/dictionary.rb, line 115
# Decodes a base form stored with suffix trimming.
#
# The first byte of +encoded+ is a trim code ('A' = 0, 'B' = 1, ...): how many bytes
# to drop from the end of +source+. The rest of +encoded+ is appended to what
# remains of +source+.
#
# A trim code of 255 means "remove everything": the whole +source+ is dropped and
# only the remainder of +encoded+ is returned.
#
# @param source [String] looked-up word; #lookup passes it as a binary (ASCII-8BIT)
#   string, so `size` counts bytes
# @param encoded [String] sequence read from the dictionary: one trim-code byte
#   followed by the replacement text
# @return [String]
def suffix(source, encoded)
  truncate_suf = (encoded.getbyte(0) - 65) & 0xff # 65 is 'A'

  # 255 is the "remove all" marker. Subtracting it literally gives a negative range
  # end, which for sources of 128..254 bytes wraps around and keeps part of the word.
  truncate_suf = source.size if truncate_suf == 255

  source[0...source.size - truncate_suf] + encoded[1..-1]
end