class Lemmatizer::Lemmatizer
Constants
- DATA_DIR
- MORPHOLOGICAL_SUBSTITUTIONS
- WN_FILES
Public Class Methods
new(dict = nil)
click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 61
#
# Sets up a lemmatizer. An empty word list and an empty exception table are
# created for every part of speech that has an entry in
# MORPHOLOGICAL_SUBSTITUTIONS, each WN_FILES entry is loaded through
# load_wordnet_files, and finally any caller-supplied dictionaries are loaded.
#
# dict - optional dictionary source, or a (possibly nested) array of sources;
#        every element is passed to load_provided_dict.
def initialize(dict = nil)
  @wordlists = {}
  @exceptions = {}

  MORPHOLOGICAL_SUBSTITUTIONS.each_key do |pos|
    @wordlists[pos] = {}
    @exceptions[pos] = {}
  end

  WN_FILES.each_pair do |pos, files|
    load_wordnet_files(pos, files[0], files[1])
  end

  return unless dict

  # Wrap-and-flatten accepts a single source as well as an array of sources.
  [dict].flatten.each { |source| load_provided_dict(source) }
end
Public Instance Methods
inspect()
click to toggle source
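Returns the object's to_s string instead of Ruby's default inspect output, so inspecting a lemmatizer (for example when a console echoes the result of new) does not print its instance variables.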
# File lib/lemmatizer/lemmatizer.rb, line 99
#
# Returns the same text as to_s rather than Ruby's default inspect output,
# which would list the object's instance variables.
def inspect
  to_s
end
lemma(form, pos = nil)
click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 81
#
# Returns a lemma for +form+.
#
# form - the word form to look up.
# pos  - part-of-speech symbol; when omitted, :verb, :noun, :adj, :adv and
#        :abbr are tried in that order and the first result that differs
#        from +form+ is returned.
#
# Returns the first candidate yielded by each_lemma, or +form+ itself when
# there is none.
def lemma(form, pos = nil)
  if pos
    # The first candidate wins; +return+ inside the block exits this method.
    each_lemma(form, pos) { |candidate| return candidate }
    return form
  end

  [:verb, :noun, :adj, :adv, :abbr].each do |tag|
    found = lemma(form, tag)
    return found unless found == form
  end

  form
end
Private Instance Methods
each_lemma(form, pos) { |x| ... }
click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 147
#
# Yields every lemma candidate for +form+ under part of speech +pos+.
#
# Entries from the exception table for +pos+ are yielded first. After that,
# a noun ending in "ful" is handled by lemmatizing the part before "ful" and
# re-appending "ful" to each result; every other form is handed to
# each_substitutions.
#
# form - the word form to analyse.
# pos  - part-of-speech symbol used to index @exceptions.
def each_lemma(form, pos)
  irregular = @exceptions[pos][form]
  irregular.each { |x| yield x } if irregular

  # String#end_with? is the core method; the previous +endwith+ call only
  # worked if String had been extended elsewhere.
  if pos == :noun && form.end_with?('ful')
    each_lemma(form[0, form.length - 3], pos) { |stem| yield stem + 'ful' }
  else
    each_substitutions(form, pos) { |x| yield x }
  end
end
each_substitutions(form, pos) { |lemma| ... }
click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 132
#
# Yields lemma candidates for +form+ found through suffix rewriting.
#
# The word-list entry for +form+ is yielded when one exists. Then, for every
# [old, new] pair in MORPHOLOGICAL_SUBSTITUTIONS[pos] whose +old+ suffix ends
# +form+, the suffix is replaced by +new+ and the rewritten form is processed
# recursively.
#
# form - the word form to analyse.
# pos  - part-of-speech symbol used to index @wordlists and
#        MORPHOLOGICAL_SUBSTITUTIONS.
def each_substitutions(form, pos)
  known = @wordlists[pos][form]
  yield known if known

  MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |old, replacement|
    # String#end_with? is the core method and compares the suffix literally;
    # the previous +endwith+ call only worked if String had been extended
    # elsewhere.
    next unless form.end_with?(old)

    rewritten = form[0, form.length - old.length] + replacement
    each_substitutions(rewritten, pos) { |lemma| yield lemma }
  end
end
load_provided_dict(dict)
click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 181
#
# Loads user-supplied lemma mappings into @wordlists.
#
# Each line is expected to hold three whitespace-separated fields: a
# part-of-speech tag (converted with str_to_pos), a word form, and the
# substitute to store for that form. The substitute may be wrapped in double
# or single quotes, which are removed. Lines with fewer than three fields,
# lines whose substitute is empty, and lines whose part of speech has no
# word list are skipped.
#
# dict - a source accepted by open_file.
def load_provided_dict(dict)
  open_file(dict) do |io|
    io.each_line do |line|
      # pos must be either n|v|r|a or noun|verb|adverb|adjective
      tag, word, rest = line.split(/\s+/, 3)

      # Blank or short lines leave fields nil; skip them before calling
      # strip, which would otherwise raise NoMethodError on nil.
      next unless word && rest

      substitute = rest.strip
      substitute = Regexp.last_match(1) if /\A\"(.*)\"\z/ =~ substitute
      substitute = Regexp.last_match(1) if /\A\'(.*)\'\z/ =~ substitute
      next if substitute.empty?

      entries = @wordlists[str_to_pos(tag)]
      next unless entries

      entries[word] = substitute
    end
  end
end
load_wordnet_files(pos, list, exc)
click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 115
#
# Fills the word list and the exception table for one part of speech.
#
# pos  - part-of-speech symbol used to index @wordlists and @exceptions.
# list - source of known words; the first whitespace-separated token of each
#        line is stored, mapped to itself.
# exc  - source of exceptions; the first token of each line is the key and
#        the second token is appended to that key's array.
#
# Both sources are opened through open_file.
def load_wordnet_files(pos, list, exc)
  known_words = @wordlists[pos]
  open_file(list) do |io|
    io.each_line do |line|
      token = line.split(/\s+/).first
      known_words[token] = token
    end
  end

  irregular = @exceptions[pos]
  open_file(exc) do |io|
    io.each_line do |line|
      inflected, base = line.split(/\s+/)
      (irregular[inflected] ||= []) << base
    end
  end
end
open_file(*args) { |io| ... }
click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 105
#
# Yields a readable stream for the given source and returns the block's value.
#
# args - either an already-open stream as the first element, which is yielded
#        unchanged, or arguments for File.open, in which case the file is
#        opened for the duration of the block.
#
# A first argument counts as a stream when it is an IO, or when it responds
# to each_line without being string-like (to_str) or path-like (to_path).
# This covers StringIO without referencing that constant, so a plain path
# argument no longer raises NameError when stringio has not been loaded.
# Strings and path-like objects still go to File.open.
def open_file(*args)
  source = args[0]
  path_like = source.respond_to?(:to_str) || source.respond_to?(:to_path)

  if source.is_a?(IO) || (source.respond_to?(:each_line) && !path_like)
    yield source
  else
    File.open(*args) { |io| yield io }
  end
end
str_to_pos(str)
click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 164
#
# Converts a part-of-speech tag string into the matching symbol.
#
# str - the tag text; both short and long spellings are accepted:
#       "n"/"noun", "v"/"verb", "a"/"j"/"adjective"/"adj",
#       "r"/"adverb"/"adv", "b"/"abbrev"/"abbr"/"abr".
#
# Returns :noun, :verb, :adj, :adv or :abbr, or :unknown for any other value.
def str_to_pos(str)
  case str
  when "n", "noun" then :noun
  when "v", "verb" then :verb # previously returned :noun, filing verbs as nouns
  when "a", "j", "adjective", "adj" then :adj
  when "r", "adverb", "adv" then :adv
  when "b", "abbrev", "abbr", "abr" then :abbr
  else :unknown
  end
end