class Myasorubka::AOT::Dictionary
MRD file is a text file that contains a morphological dictionary of a natural language. MRD is an abbreviation of “morphological dictionary”.
All words in an MRD file are written in UPPERCASE. An MRD file has the following sections: the section of flexion and prefix models, the section of accentual models, the section of user sessions, the section of prefix sets, and the section of lemmas.
Attributes
accents_offset[R]
filename[R]
language[R]
lemmas_offset[R]
lines[R]
logs_offset[R]
prefixes_offset[R]
rules_offset[R]
Public Class Methods
new(filename, language = nil, ee = nil, ie = Encoding.default_external)
click to toggle source
The parser should be initialized by passing filename and language parameters.
# File lib/myasorubka/aot/dictionary.rb, line 19
# Reads every line of the MRD file and computes the line offset at which
# each section starts.
#
# filename:: path of the MRD file to read.
# language:: optional language marker, stored as-is in @language.
# ee::       external encoding handed to File.readlines (nil by default).
# ie::       internal encoding handed to File.readlines
#            (Encoding.default_external by default).
def initialize(filename, language = nil, ee = nil, ie = Encoding.default_external)
  encoding = { internal_encoding: ie, external_encoding: ee }

  @filename = filename
  # The encoding options must be splatted as keywords: since Ruby 3.0 a
  # trailing positional Hash is no longer converted to keyword arguments, so
  # it would be taken for the +limit+ argument and raise a TypeError.
  @lines = File.readlines(filename, $/, **encoding)
  @language = language

  # The sections follow one another. Each offset is the previous section's
  # offset plus that section's length plus one (presumably the extra line is
  # the one holding the section's entry count -- confirm against Section).
  @rules_offset = 0
  @accents_offset = rules_offset + rules.length + 1
  @logs_offset = accents_offset + accents.length + 1
  @prefixes_offset = logs_offset + logs.length + 1
  @lemmas_offset = prefixes_offset + prefixes.length + 1
end
Public Instance Methods
accents()
click to toggle source
Accents section accessor.
# File lib/myasorubka/aot/dictionary.rb, line 93
# Accents section accessor.
#
# Builds the Section that starts at +accents_offset+ on first use and
# returns the same cached object on every later call.
def accents
  return @accents if @accents

  @accents = Section.new(lines, accents_offset)
end
lemmas()
click to toggle source
Lemmas section accessor.
# File lib/myasorubka/aot/dictionary.rb, line 111
# Lemmas section accessor.
#
# Builds the Section that starts at +lemmas_offset+ on first use and caches
# it. The block given to Section turns one whitespace-separated line into
# [stem, rule_id, accent_id, session_id, ancode, prefix_id].
def lemmas
  return @lemmas if @lemmas

  @lemmas = Section.new(lines, lemmas_offset) do |line|
    stem, rule_id, accent_id, session_id, ancode, prefix_id = line.split

    # For Russian, "Ё"/"ё" are folded into "Е"/"е".
    stem &&= stem.tr('Ёё', 'Ее') if language == :russian

    [
      stem == '#' ? nil : stem,                  # '#' becomes a nil stem
      rule_id.to_i,
      accent_id.to_i,
      session_id.to_i,
      ancode == '-' ? nil : ancode[0..1],        # '-' becomes nil; otherwise first two characters
      prefix_id == '-' ? nil : prefix_id.to_i    # '-' becomes a nil prefix id
    ]
  end
end
logs()
click to toggle source
Logs section accessor.
# File lib/myasorubka/aot/dictionary.rb, line 99
# Logs section accessor.
#
# Builds the Section that starts at +logs_offset+ on first use and returns
# the same cached object on every later call.
def logs
  return @logs if @logs

  @logs = Section.new(lines, logs_offset)
end
prefixes()
click to toggle source
Prefixes section accessor.
# File lib/myasorubka/aot/dictionary.rb, line 105
# Prefixes section accessor.
#
# Builds the Section that starts at +prefixes_offset+ on first use and
# returns the same cached object on every later call.
def prefixes
  return @prefixes if @prefixes

  @prefixes = Section.new(lines, prefixes_offset)
end
rules()
click to toggle source
Rules section accessor.
# File lib/myasorubka/aot/dictionary.rb, line 73
# Rules section accessor.
#
# Builds the Section that starts at +rules_offset+ on first use and caches
# it. The block given to Section splits one line on '%' into rule entries
# and turns every non-empty entry, itself split on '*', into
# [suffix, ancode, prefix].
def rules
  return @rules if @rules

  @rules = Section.new(lines, rules_offset) do |line|
    line.split('%').each_with_object([]) do |rule_line, parsed|
      # Skip blank entries, e.g. the empty string before a leading '%'.
      next if rule_line.nil? || rule_line.empty?

      suffix, ancode, prefix = rule_line.split('*')

      # For Russian, "Ё"/"ё" are folded into "Е"/"е".
      if language == :russian
        suffix &&= suffix.tr('Ёё', 'Ее')
        prefix &&= prefix.tr('Ёё', 'Ее')
      end

      # Only the first two characters of the ancode are kept.
      parsed << [suffix, ancode[0..1], prefix]
    end
  end
end