class PROIEL::DictionaryBuilder
Constants
- CURRENT_SCHEMA_VERSION
Attributes
language[R]
lemmata[R]
license[R]
sources[R]
Public Class Methods
new()
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 15
# Creates an empty builder: language and license are unset, there are no
# sources or lemmata yet, and a fresh valency lexicon is allocated.
def initialize
  @language = @license = nil
  @sources = []
  @lemmata = {}
  @valency = PROIEL::Valency::Lexicon.new
end
Public Instance Methods
add_external_glosses!(filename, languages = %i(eng))
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 60
# Merges glosses from a tab-separated file into the lemma table.
#
# The file is read with a header row whose names are converted to symbols.
# Every row is looked up (or created) through initialize_lemma! using its
# :lemma and :part_of_speech columns, and the columns named in +languages+
# are merged into that lemma's :glosses hash.
#
# @param filename [String] path of the tab-separated gloss file
# @param languages [Array<Symbol>] header names of the gloss columns to import
# @raise [ArgumentError] if +filename+ is not a String or the file is missing
def add_external_glosses!(filename, languages = %i(eng))
  raise ArgumentError, 'filename expected' unless filename.is_a?(String)
  # File.exists? was removed in Ruby 3.2; File.exist? is available everywhere.
  raise ArgumentError, 'file not found' unless File.exist?(filename)

  # NOTE(review): quote_char is "\b", presumably so that literal double quotes
  # in the data are not interpreted as CSV quoting -- confirm with the data.
  csv_options = {
    headers: true,
    encoding: 'utf-8',
    col_sep: "\t",
    header_converters: :symbol,
    quote_char: "\b"
  }

  CSV.foreach(filename, **csv_options) do |row|
    h = row.to_h
    data = languages.map { |l| [l, h[l]] }.to_h
    lemma = initialize_lemma!(row[:lemma], row[:part_of_speech])
    lemma[:glosses] ||= {}
    lemma[:glosses].merge!(data)
  end
end
add_source!(source)
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 23
# Adds a source to the dictionary and indexes all of its tokens.
#
# The first source added fixes the builder's language and license; every
# later source must match both. After the tokens are indexed the homograph
# index is rebuilt.
#
# @param source [PROIEL::Source] the source to add
# @raise [ArgumentError] if +source+ is not a PROIEL::Source, or if its
#   language or license differs from the ones already recorded
def add_source!(source)
  raise ArgumentError, 'source expected' unless source.is_a?(PROIEL::Source)

  language_ok = @language.nil? || @language == source.language
  raise ArgumentError, 'incompatible language' unless language_ok

  license_ok = @license.nil? || @license == source.license
  raise ArgumentError, 'incompatible license' unless license_ok

  @language ||= source.language
  @license ||= source.license
  @sources << source

  source.tokens.each do |token|
    index_token!(token)
  end

  index_homographs!
end
to_xml(io)
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 39
# Serialises the dictionary as XML onto +io+.
#
# The document has a <proiel> root carrying the export time and schema
# version, containing a <dictionary> element with the list of sources and
# the lemmata. Lemmata are emitted ordered by their downcased
# "form,part-of-speech" key.
#
# @param io [#<<] target handed to Builder::XmlMarkup
def to_xml(io)
  xml = ::Builder::XmlMarkup.new(target: io, indent: 2)
  xml.instruct! :xml, version: '1.0', encoding: 'UTF-8'

  xml.proiel('export-time': DateTime.now.xmlschema, 'schema-version': CURRENT_SCHEMA_VERSION) do
    xml.dictionary(language: @language) do
      xml.sources do
        @sources.each { |source| xml.source(idref: source.id, license: source.license) }
      end

      xml.lemmata do
        ordered = @lemmata.sort_by { |key, _| key.downcase }

        ordered.each do |key, data|
          # The key is "form,part-of-speech"; only the form part is needed here.
          form = key.split(',').first
          lemma_to_xml(xml, form, data)
        end
      end
    end
  end
end
Private Instance Methods
distribution_to_xml(builder, data)
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 103
# Emits a <distribution> element with one <source> child per entry of
# data[:distribution], ordered by source id. Emits nothing when the
# distribution is empty.
#
# @param builder [#distribution, #source] XML builder
# @param data [Hash] lemma record; :distribution maps source id to a count
def distribution_to_xml(builder, data)
  counts = data[:distribution]
  return if counts.empty?

  builder.distribution do
    counts.sort_by { |source_id, _| source_id }.each do |source_id, n|
      builder.source(idref: source_id, n: n)
    end
  end
end
glosses_to_xml(builder, data)
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 113
# Emits a <glosses> element with one <gloss> child per entry of
# data[:glosses], in the hash's own order. Emits nothing when there are no
# glosses.
#
# @param builder [#glosses, #gloss] XML builder
# @param data [Hash] lemma record; :glosses maps a language to gloss text
def glosses_to_xml(builder, data)
  glosses = data[:glosses]
  return if glosses.empty?

  builder.glosses do
    glosses.each_pair do |language, value|
      builder.gloss(value, language: language)
    end
  end
end
homographs_to_xml(builder, data)
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 123
# Emits a <homographs> element with one <homograph> child per entry of
# data[:homographs]. Each entry is a "lemma,part-of-speech" string that is
# split on the comma into the two attributes. Emits nothing when the list is
# empty.
#
# @param builder [#homographs, #homograph] XML builder
# @param data [Hash] lemma record; :homographs is a list of encoded lemma keys
def homographs_to_xml(builder, data)
  encoded = data[:homographs]
  return if encoded.count.zero?

  builder.homographs do
    encoded.each do |key|
      lemma, part_of_speech = key.split(',')
      builder.homograph(lemma: lemma, "part-of-speech": part_of_speech)
    end
  end
end
index_homographs!()
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 184
# Rebuilds the :homographs list of every lemma.
#
# Lemma keys are grouped by the text before the first ',' or '#'. In every
# group with more than one key, each lemma's :homographs is set to the other
# keys of the group. Lemmata that are alone in their group are left untouched.
def index_homographs!
  groups = @lemmata.keys.group_by { |key| key.split(/[,#]/).first }

  groups.each_value do |keys|
    next unless keys.count > 1

    keys.each do |key|
      @lemmata[key][:homographs] = keys.reject { |other| other == key }
    end
  end
end
index_token!(token)
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 194
# Records one token in the lemma table.
#
# Tokens without a lemma or part of speech are ignored. Otherwise the lemma's
# total count, its per-source distribution and its paradigm
# (morphology => form => count) are incremented. For tokens whose part of
# speech matches /^V/ the token id is also filed under its argument frame:
# in the :r list when the token has a dependent with relation 'aux' and part
# of speech 'Pk', in the :a list otherwise.
#
# @param token [#lemma, #part_of_speech, #source, #morphology, #form, #dependents, #id]
def index_token!(token)
  return unless token.lemma && token.part_of_speech

  record = initialize_lemma!(token.lemma, token.part_of_speech)
  record[:n] += 1

  distribution = record[:distribution]
  distribution[token.source.id] ||= 0
  distribution[token.source.id] += 1

  forms = (record[:paradigm][token.morphology] ||= {})
  forms[token.form] ||= 0
  forms[token.form] += 1

  # Find verbal nodes
  return unless token.part_of_speech[/^V/]

  frame = PROIEL::Valency::Arguments.get_argument_frame(token)
  bucket = (record[:valency][frame] ||= { a: [], r: [] })
  has_pk_aux = token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }

  bucket[has_pk_aux ? :r : :a] << token.id
end
initialize_lemma!(lemma, part_of_speech)
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 77
# Looks up the record for a lemma, creating it and filling in any missing
# fields with defaults.
#
# Records are stored in @lemmata under the key "lemma,part_of_speech".
# Fields that already hold a value are left as they are.
#
# @param lemma [String] lemma form
# @param part_of_speech [String] part-of-speech tag
# @return [Hash] the (possibly newly created) lemma record
def initialize_lemma!(lemma, part_of_speech)
  key = [lemma, part_of_speech].join(',')
  record = (@lemmata[key] ||= {})

  record[:lemma] ||= lemma
  record[:part_of_speech] ||= part_of_speech
  record[:homographs] ||= []
  record[:n] ||= 0
  %i(distribution glosses paradigm valency).each { |field| record[field] ||= {} }

  record
end
lemma_to_xml(builder, form, data)
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 93
# Emits one <lemma> element for a lemma record, with its distribution,
# glosses, homographs, paradigm and valency sections in that order.
#
# @param builder [#lemma] XML builder
# @param form [String] value of the lemma attribute
# @param data [Hash] lemma record; :part_of_speech supplies the
#   part-of-speech attribute
def lemma_to_xml(builder, form, data)
  part_of_speech = data[:part_of_speech]

  builder.lemma(lemma: form, "part-of-speech": part_of_speech) do
    distribution_to_xml(builder, data)
    glosses_to_xml(builder, data)
    homographs_to_xml(builder, data)
    paradigm_to_xml(builder, data)
    valency_to_xml(builder, data)
  end
end
paradigm_to_xml(builder, data)
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 134
# Emits a <paradigm> element. Each morphology tag becomes a <slot1> element
# and each form recorded under it a nested <slot2> element carrying the form
# and its count. Both levels are ordered by their key. Emits nothing when
# the paradigm is empty.
#
# @param builder [#paradigm, #slot1, #slot2] XML builder
# @param data [Hash] lemma record; :paradigm maps morphology => form => count
def paradigm_to_xml(builder, data)
  paradigm = data[:paradigm]
  return if paradigm.empty?

  builder.paradigm do
    paradigm.sort_by { |morphology, _| morphology }.each do |morphology, forms|
      builder.slot1(morphology: morphology) do
        forms.sort_by { |form, _| form }.each do |form, n|
          builder.slot2(form: form, n: n)
        end
      end
    end
  end
end
valency_to_xml(builder, data)
click to toggle source
# File lib/proiel/dictionary/builder.rb, line 148
# Emits a <valency> element with one <frame> per entry of data[:valency],
# in the order returned by PROIEL::Valency::Obliqueness.sort_frames.
#
# Each frame lists its arguments and, when any token ids are recorded, a
# <tokens> element with the :a ids (flags 'a') followed by the :r ids
# (flags 'r'). Emits nothing when there is no valency data.
#
# @param builder [#valency, #frame, #arguments, #argument, #tokens, #token]
#   XML builder
# @param data [Hash] lemma record; :valency maps a list of argument hashes
#   to { a: [token ids], r: [token ids] }
def valency_to_xml(builder, data)
  return if data[:valency].empty?

  builder.valency do
    frames = data[:valency].map do |arguments, token_ids|
      { arguments: arguments, tokens: token_ids }
    end

    PROIEL::Valency::Obliqueness.sort_frames(frames).each do |frame|
      builder.frame do
        builder.arguments do
          frame[:arguments].each do |argument|
            # Rename :part_of_speech to the XML attribute name on a copy.
            # The argument hashes are part of the keys of data[:valency];
            # mutating them in place would corrupt the stored frames and
            # break later hash lookups on them.
            attributes = argument.reject { |key, _| key == :part_of_speech }
            attributes[:"part-of-speech"] = argument[:part_of_speech] if argument[:part_of_speech]

            builder.argument attributes
          end
        end

        a_ids = frame[:tokens][:a]
        r_ids = frame[:tokens][:r]
        next if a_ids.empty? && r_ids.empty?

        builder.tokens do
          a_ids.each { |token_id| builder.token(flags: 'a', idref: token_id) }
          r_ids.each { |token_id| builder.token(flags: 'r', idref: token_id) }
        end
      end
    end
  end
end