class Lemmatizer::Lemmatizer

Constants

DATA_DIR
MORPHOLOGICAL_SUBSTITUTIONS
WN_FILES

Public Class Methods

new(dict = nil) click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 61
# Builds the in-memory lookup tables used for lemmatization.
#
# One empty wordlist and one empty exception table are created per key of
# MORPHOLOGICAL_SUBSTITUTIONS, then every entry of WN_FILES is loaded via
# load_wordnet_files, and finally any caller-supplied dictionaries are
# loaded via load_provided_dict.
#
# dict - optional; a single dictionary source or an (arbitrarily nested)
#        array of them. Each element is handed to load_provided_dict.
def initialize(dict = nil)
  @wordlists  = {}
  @exceptions = {}

  MORPHOLOGICAL_SUBSTITUTIONS.each_key do |pos|
    @wordlists[pos]  = {}
    @exceptions[pos] = {}
  end

  WN_FILES.each_pair do |pos, files|
    load_wordnet_files(pos, files[0], files[1])
  end

  # Wrap-and-flatten so a lone source and a list of sources are handled alike.
  [dict].flatten.each { |source| load_provided_dict(source) } if dict
end

Public Instance Methods

inspect() click to toggle source

Returns the object's short to_s form instead of the default inspect output, which would dump every instance variable.

# File lib/lemmatizer/lemmatizer.rb, line 99
# Returns the object's to_s representation, so inspecting the object does
# not produce the default listing of its instance variables.
def inspect
  to_s
end
lemma(form, pos = nil) click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 81
# Returns a lemma for +form+.
#
# form - the word form to look up.
# pos  - optional part-of-speech symbol. When given, the first candidate
#        yielded by each_lemma for that part of speech is returned, or
#        +form+ itself when nothing is yielded. When omitted, :verb, :noun,
#        :adj, :adv and :abbr are tried in that order and the first result
#        that differs from +form+ wins; +form+ is returned if none differs.
def lemma(form, pos = nil)
  if pos
    # A `return` inside the block exits this method with the first candidate.
    each_lemma(form, pos) { |candidate| return candidate }
    return form
  end

  %i[verb noun adj adv abbr].each do |tag|
    found = lemma(form, tag)
    return found unless found == form
  end

  form
end

Private Instance Methods

each_lemma(form, pos) { |x| ... } click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 147
# Yields every lemma candidate for +form+ under part of speech +pos+.
#
# Entries from the exception table are yielded first. Then, for nouns
# ending in "ful", the candidates of the word without that suffix are
# yielded with "ful" re-appended; for everything else the candidates
# produced by each_substitutions are yielded.
#
# NOTE(review): String#endwith is not a core Ruby method; it is presumably
# a String extension defined elsewhere in the project.
def each_lemma(form, pos)
  irregular = @exceptions[pos][form]
  irregular.each { |x| yield x } if irregular

  if pos == :noun && form.endwith('ful')
    stem = form[0, form.length - 3]
    each_lemma(stem, pos) { |base| yield base + 'ful' }
  else
    each_substitutions(form, pos) { |candidate| yield candidate }
  end
end
each_substitutions(form, pos) { |lemma| ... } click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 132
# Yields lemma candidates for +form+ found by suffix rewriting.
#
# If +form+ is present in the wordlist for +pos+ its stored value is
# yielded. Then, for every (suffix, replacement) rule registered for +pos+
# in MORPHOLOGICAL_SUBSTITUTIONS whose suffix matches the end of +form+,
# the suffix is swapped for the replacement and the search recurses on the
# rewritten word.
#
# NOTE(review): String#endwith is not a core Ruby method; it is presumably
# a String extension defined elsewhere in the project.
def each_substitutions(form, pos)
  known = @wordlists[pos][form]
  yield known if known

  MORPHOLOGICAL_SUBSTITUTIONS[pos].each do |suffix, replacement|
    next unless form.endwith(suffix)

    stem = form[0, form.length - suffix.length]
    each_substitutions(stem + replacement, pos) { |x| yield x }
  end
end
load_provided_dict(dict) click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 181
# Loads user-supplied word -> lemma mappings into the wordlists.
#
# dict - anything accepted by open_file.
#
# Each line is expected to hold three whitespace-separated fields:
# a part-of-speech tag (resolved through str_to_pos), the word form, and
# its lemma. The lemma is the remainder of the line, so it may contain
# spaces, and may be wrapped in double or single quotes, which are removed.
#
# Lines that are blank or lack a lemma are skipped instead of raising, and
# entries whose tag does not resolve to an existing wordlist are ignored.
def load_provided_dict(dict)
  open_file(dict) do |io|
    io.each_line do |line|
      # Limit of 3 keeps any whitespace inside the lemma field intact.
      pos_tag, word, raw = line.split(/\s+/, 3)
      # Blank lines and a two-field last line without a newline leave raw nil.
      next unless word && raw

      substitute = raw.strip
      # A two-field line ending in a newline yields an empty third field.
      next if substitute.empty?

      substitute = Regexp.last_match(1) if /\A\"(.*)\"\z/ =~ substitute
      substitute = Regexp.last_match(1) if /\A\'(.*)\'\z/ =~ substitute

      wordlist = @wordlists[str_to_pos(pos_tag)]
      next unless wordlist

      wordlist[word] = substitute
    end
  end
end
load_wordnet_files(pos, list, exc) click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 115
# Fills the wordlist and exception table for one part of speech.
#
# pos  - key into @wordlists / @exceptions.
# list - source (anything open_file accepts) whose first whitespace-separated
#        token on each line is a known word; it is stored mapped to itself.
# exc  - source whose lines start with an irregular form followed by a
#        lemma; only the first two tokens of each line are used, and lemmas
#        for a repeated form accumulate in an array.
def load_wordnet_files(pos, list, exc)
  open_file(list) do |io|
    io.each_line do |line|
      entry = line.split(/\s+/).first
      @wordlists[pos][entry] = entry
    end
  end

  open_file(exc) do |io|
    io.each_line do |line|
      irregular, base = line.split(/\s+/)
      (@exceptions[pos][irregular] ||= []) << base
    end
  end
end
open_file(*args) { |args| ... } click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 105
# Yields a readable stream for the given source and returns the block's value.
#
# When the first argument is already an IO or StringIO it is yielded as-is
# (and is not closed here); otherwise all arguments are forwarded to
# File.open, which closes the file once the block finishes.
def open_file(*args)
  source = args.first

  case source
  when IO, StringIO
    yield source
  else
    File.open(*args) { |io| yield io }
  end
end
str_to_pos(str) click to toggle source
# File lib/lemmatizer/lemmatizer.rb, line 164
# Maps a textual part-of-speech tag to its symbol.
#
# str - tag such as "n"/"noun", "v"/"verb", "a"/"j"/"adj"/"adjective",
#       "r"/"adv"/"adverb", or "b"/"abbr"/"abbrev"/"abr".
#
# Returns :noun, :verb, :adj, :adv or :abbr, or :unknown for any other
# value (including nil).
def str_to_pos(str)
  case str
  when "n", "noun"
    :noun
  when "v", "verb"
    # Verb tags previously resolved to :noun by mistake.
    :verb
  when "a", "j", "adjective", "adj"
    :adj
  when "r", "adverb", "adv"
    :adv
  when "b", "abbrev", "abbr", "abr"
    :abbr
  else
    :unknown
  end
end