class Enparser::Extractor

Attributes

forms[R]
skips[R]
words[R]

Public Class Methods

new() click to toggle source
# File lib/enparser/extractor.rb, line 12
# Creates an extractor with empty tallies and its own lemmatizer.
#
# @words maps a lemma to its occurrence count, @forms maps a lemma to a hash
# of the surface forms seen for it, and @skips holds the lemmas to ignore.
def initialize
  @words, @skips, @forms = {}, {}, {}
  @lmlzr = Lemmatizer.new
end

Public Instance Methods

each() { |k, v, keys.join(', ')| ... } click to toggle source

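Sorts the words by count, then yields |word, count, forms| to the given block for each word; forms is a comma-separated string of the word's forms. Does nothing when no block is given.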

# File lib/enparser/extractor.rb, line 79
# Yields every counted lemma to the given block as (lemma, count, forms),
# where forms is the lemma's recorded surface forms joined with ', '.
# The tally is re-sorted via sort! before iterating.
#
# Returns nil when called without a block.
def each
  if block_given?
    sort!
    @words.each_pair do |lemma, count|
      yield(lemma, count, @forms[lemma].keys.join(', '))
    end
  end
end
extract(line) click to toggle source
# File lib/enparser/extractor.rb, line 56
# Tokenizes +line+ with split_by_word and records every token, downcased,
# through extract_word.
#
# Returns the array of tokens produced by split_by_word.
def extract(line)
  tokens = split_by_word(line)
  tokens.each do |token|
    extract_word(token.downcase)
  end
end
extract_word(word) click to toggle source
# File lib/enparser/extractor.rb, line 38
# Counts one occurrence of +word+ under its lemma, unless skip? rejects the
# lemma. When the word differs from its lemma, the word is also remembered
# as one of that lemma's forms.
def extract_word(word)
  lemma = @lmlzr.lemma(word)
  return if skip?(lemma)

  @words[lemma] = (@words[lemma] || 0) + 1
  @forms[lemma] ||= {}
  # Only inflected forms are stored; the lemma itself is implied.
  @forms[lemma][word] = 0 unless lemma.eql?(word)
end
extracted?(word) click to toggle source
# File lib/enparser/extractor.rb, line 47
# Tells whether +word+ (compared case-insensitively) has already been
# recorded, either as a counted lemma or as one of that lemma's forms.
def extracted?(word)
  normalized = word.downcase
  lemma = @lmlzr.lemma(normalized)
  known = !@words[lemma].nil?
  known && (normalized == lemma || @forms[lemma].key?(normalized))
end
load_skip_deafult() click to toggle source

TODO: make some lists and put them into the data directory.

# File lib/enparser/extractor.rb, line 86
# Loads the default skip lists by passing the pattern 'word/*' to
# load_skip_file.
# TODO: make some lists and put them into the data directory.
def load_skip_default
  load_skip_file('word/*')
end

# Original, misspelled name; kept so existing callers keep working.
alias load_skip_deafult load_skip_default
load_skip_file(file_pattern) click to toggle source
# File lib/enparser/extractor.rb, line 60
# Reads every regular file matched by +file_pattern+ and passes each of its
# lines to skip_line.
#
# file_pattern:: one or more glob patterns separated by ';'
#
# Returns the array of files that were read.
def load_skip_file(file_pattern)
  # A glob such as 'word/*' can also match directories, which File.foreach
  # cannot read (Errno::EISDIR), so only regular files are kept.
  skipfiles = Dir.glob(file_pattern.split(';')).uniq.select { |path| File.file?(path) }
  skipfiles.each do |path|
    File.foreach(path) { |line| skip_line(line) }
  end
end
parse_files(file_pattern) click to toggle source
# File lib/enparser/extractor.rb, line 67
# Reads every regular file matched by +file_pattern+ and passes each of its
# lines to extract.
#
# file_pattern:: one or more glob patterns separated by ';'
#
# Returns the array of files that were read.
def parse_files(file_pattern)
  # Globs may match directories too; File.foreach raises Errno::EISDIR on
  # those, so only regular files are kept.
  files = Dir.glob(file_pattern.split(';')).uniq.select { |path| File.file?(path) }
  files.each do |path|
    File.foreach(path) { |line| extract(line) }
  end
end
skip?(word) click to toggle source
# File lib/enparser/extractor.rb, line 19
# Tells whether the lemma of +word+ (downcased first) is registered in the
# skip table.
def skip?(word)
  lemma = @lmlzr.lemma(word.downcase)
  entry = @skips[lemma]
  !entry.nil?
end
skip_line(line) click to toggle source
# File lib/enparser/extractor.rb, line 34
# Registers the lemma of every word found in +line+ (downcased) in the skip
# table.
#
# Returns the array of tokens produced by split_by_word.
def skip_line(line)
  tokens = split_by_word(line.downcase)
  tokens.each do |token|
    @skips[@lmlzr.lemma(token)] = 0
  end
end
sort!() click to toggle source
# File lib/enparser/extractor.rb, line 74
# Rebuilds @words so that its entries are ordered by ascending count.
#
# Returns the newly ordered hash.
def sort!
  ordered = @words.sort_by { |_lemma, count| count }
  @words = ordered.to_h
end
split_by_word(line) click to toggle source
# File lib/enparser/extractor.rb, line 23
# Splits +line+ into word tokens and strips common English contractions.
#
# A token is a run of ASCII letters, optionally followed by a hyphen and/or
# an apostrophe and more letters. "can't" becomes "can", "won't" becomes
# "will", and the suffixes 'm, 's, 're, 'd, 've, 'll and n't are removed.
#
# line:: the text to tokenize; it is not modified
#
# Returns an array of token strings.
def split_by_word(line)
  # Non-destructive scrub: invalid bytes would make scan raise, but the
  # caller's string (possibly frozen) must stay untouched.
  tokens = line.scrub.scan(/[a-zA-Z]+\-?\'?[a-zA-Z]*/)
  tokens.map do |token|
    # Matching is case-insensitive because callers may downcase only after
    # splitting; otherwise "Can't" would lose its n't and turn into "Ca".
    token.sub(/\Acan't/i, 'can')
         .sub(/\Awon't/i, 'will')
         .sub(/(\'m|\'s|\'re|\'d|\'ve|\'ll|n\'t)\z/i, '')
  end
end