class Enparser::Extractor
Attributes
forms[R]
skips[R]
words[R]
Public Class Methods
new()
click to toggle source
# File lib/enparser/extractor.rb, line 12
# Builds an extractor with empty state.
#
# @words, @skips and @forms each start out as an empty Hash, and @lmlzr
# holds a newly created Lemmatizer used by the other methods to reduce
# words to their lemma.
def initialize
  @lmlzr = Lemmatizer.new
  @forms = {}
  @skips = {}
  @words = {}
end
Public Instance Methods
each() { |k, v, keys.join(', ')| ... }
click to toggle source
Accepts a block that receives |word, count, forms|.
# File lib/enparser/extractor.rb, line 79
# Yields every counted lemma to the given block as |word, count, forms|.
#
# Without a block nothing happens and nil is returned. With a block,
# @words is first re-ordered through sort!, then each entry is yielded
# together with the keys of @forms for that lemma joined by ', '.
def each
  if block_given?
    sort!
    @words.each_pair do |lemma, count|
      variants = @forms[lemma].keys
      yield(lemma, count, variants.join(', '))
    end
  end
end
extract(line)
click to toggle source
# File lib/enparser/extractor.rb, line 56
# Tokenizes one line of text and records every token.
#
# The line is split with split_by_word; each resulting token is
# lower-cased and handed to extract_word. Returns the token array
# produced by split_by_word.
def extract(line)
  tokens = split_by_word(line)
  tokens.each do |token|
    extract_word(token.downcase)
  end
end
extract_word(word)
click to toggle source
# File lib/enparser/extractor.rb, line 38
# Records a single occurrence of +word+ under its lemma.
#
# The lemma is obtained from @lmlzr. Lemmas reported by skip? are
# ignored. Otherwise the lemma's counter in @words is incremented
# (starting from 0 on first sight) and, when the surface word differs
# from its lemma, the word is registered as a key in @forms[lemma].
def extract_word(word)
  lemma = @lmlzr.lemma(word)
  unless skip?(lemma)
    @words[lemma] ||= 0
    @forms[lemma] ||= {}
    @words[lemma] += 1
    # Only inflected variants are tracked; the lemma itself is implied.
    @forms[lemma][word] = 0 unless lemma.eql?(word)
  end
end
extracted?(word)
click to toggle source
# File lib/enparser/extractor.rb, line 47
# Tells whether +word+ (compared case-insensitively) has been seen.
#
# The word is lower-cased and lemmatized with @lmlzr. It counts as
# extracted when its lemma is present in @words and the lower-cased word
# is either the lemma itself or one of the forms recorded in
# @forms[lemma].
#
# Returns true or false.
def extracted?(word)
  surface = word.downcase
  lemma = @lmlzr.lemma(surface)
  return false if @words[lemma].nil?

  # Direct lookups instead of copying every recorded form into a new Array.
  surface == lemma || @forms.fetch(lemma, {}).key?(surface)
end
load_skip_deafult()
click to toggle source
TODO: make some lists and put them into the data directory
# File lib/enparser/extractor.rb, line 86
# Loads the default skip lists by passing the glob pattern 'word/*' to
# load_skip_file. The pattern is relative, so it is resolved against the
# process's current working directory.
def load_skip_deafult
  load_skip_file('word/*')
end

# Correctly spelled name for load_skip_deafult; the misspelled original is
# kept so existing callers continue to work.
alias load_skip_default load_skip_deafult
load_skip_file(file_pattern)
click to toggle source
# File lib/enparser/extractor.rb, line 60
# Reads skip words from every file matching +file_pattern+.
#
# +file_pattern+ is one or more glob patterns separated by ';'. Each
# matching regular file is read line by line and every line is passed to
# skip_line. A path matched by several patterns is read only once.
#
# Returns the array of files that were read.
def load_skip_file(file_pattern)
  matches = Dir.glob(file_pattern.split(';')).uniq
  # A glob such as 'word/*' can match directories too; File.foreach would
  # raise Errno::EISDIR on those, so only regular files are kept.
  skipfiles = matches.select { |path| File.file?(path) }
  skipfiles.each do |path|
    File.foreach(path) { |line| skip_line(line) }
  end
end
parse_files(file_pattern)
click to toggle source
# File lib/enparser/extractor.rb, line 67
# Extracts words from every file matching +file_pattern+.
#
# +file_pattern+ is one or more glob patterns separated by ';'. Each
# matching regular file is read line by line and every line is passed to
# extract. A path matched by several patterns is read only once.
#
# Returns the array of files that were read.
def parse_files(file_pattern)
  matches = Dir.glob(file_pattern.split(';')).uniq
  # Globs can match directories; File.foreach would raise Errno::EISDIR on
  # those, so only regular files are kept.
  files = matches.select { |path| File.file?(path) }
  files.each do |path|
    File.foreach(path) { |line| extract(line) }
  end
end
skip?(word)
click to toggle source
# File lib/enparser/extractor.rb, line 19
# Tells whether +word+ is on the skip list.
#
# The word is lower-cased and lemmatized with @lmlzr, and the resulting
# lemma is looked up among the keys of @skips.
#
# Returns true or false.
def skip?(word)
  @skips.key?(@lmlzr.lemma(word.downcase))
end
skip_line(line)
click to toggle source
# File lib/enparser/extractor.rb, line 34
# Adds every word found in +line+ to the skip list.
#
# The line is lower-cased and tokenized with split_by_word; each token is
# lemmatized with @lmlzr and the lemma is stored as a key of @skips (the
# stored value is always 0). Returns the token array.
def skip_line(line)
  tokens = split_by_word(line.downcase)
  tokens.each do |token|
    lemma = @lmlzr.lemma(token)
    @skips[lemma] = 0
  end
end
sort!()
click to toggle source
# File lib/enparser/extractor.rb, line 74
# Re-orders @words by ascending count, replacing it with a new Hash.
#
# Enumerable#sort_by is not guaranteed to be stable, so lemmas with equal
# counts are additionally ordered by the lemma itself to make the result
# deterministic.
#
# Returns the newly ordered Hash.
def sort!
  @words = @words.sort_by { |word, count| [count, word] }.to_h
end
split_by_word(line)
click to toggle source
# File lib/enparser/extractor.rb, line 23
# Splits +line+ into English word tokens with contractions reduced.
#
# Invalid byte sequences are replaced on a copy of the line, so the
# caller's string is never modified and frozen strings are accepted.
# Tokens are runs of ASCII letters, optionally followed by a hyphen and/or
# an apostrophe and more letters. Contractions are then normalized:
# "can't" becomes "can", "won't" becomes "will", and a trailing
# 'm, 's, 're, 'd, 've, 'll or n't is removed.
#
# Contractions are matched case-insensitively because callers may pass text
# that has not been lower-cased yet; previously "Can't" was reduced to
# "Ca". The "can"/"will" replacements are always lower-case.
#
# Returns an Array of new Strings.
def split_by_word(line)
  line.scrub.scan(/[a-zA-Z]+\-?\'?[a-zA-Z]*/).map do |token|
    # The irregular contractions must be handled before the generic n't
    # suffix rule, which would otherwise leave "ca" and "wo".
    token.sub(/\Acan't/i, 'can')
         .sub(/\Awon't/i, 'will')
         .sub(/(\'m|\'s|\'re|\'d|\'ve|\'ll|n\'t)\z/i, '')
  end
end