class NlpToolz::PosTags

Constants

FileInputStream

Load Java classes

POSModel
POSTaggerME

Attributes

input[RW]
lang[RW]
model[RW]
model_name[RW]
tokenized[RW]

Public Class Methods

new(input, lang = nil) click to toggle source
# File lib/nlp_toolz/pos_tags.rb, line 20
# Sets up a tagger instance for +input+.
#
# input - the text to tag; stored as-is in @input.
# lang  - optional language code. When nil (or false) the language is
#         looked up via NlpToolz::Language.get_language(input).
#
# The model file name is derived from the language as
# "<lang>-pos-maxent.bin", and get_model is then called to load it.
def initialize(input, lang = nil)
  @input = input
  lang ||= NlpToolz::Language.get_language(input)
  @lang = lang
  @model_name = "#{@lang}-pos-maxent.bin"
  get_model
end

Public Instance Methods

get_pos_tags() click to toggle source
# File lib/nlp_toolz/pos_tags.rb, line 27
# Runs the tagger over the cleaned-up input and stores the parsed result.
#
# Returns the hash produced by tokenize_it (also kept in @tokenized), or
# nil when has_model? is falsy, in which case @tokenized is left untouched.
def get_pos_tags
  return unless has_model?

  tagged = @tagger.tag(@input.clean_up)
  @tokenized = tokenize_it(tagged)
end
has_model?() click to toggle source
# File lib/nlp_toolz/pos_tags.rb, line 41
# Reports whether a model is available.
#
# Returns the raw value of @model rather than a strict boolean: get_model
# assigns either the loaded model object or false, so the result is
# truthy only when a model was loaded.
def has_model?
  @model
end
tags() click to toggle source
# File lib/nlp_toolz/pos_tags.rb, line 37
# Returns the list stored under :tags in @tokenized.
#
# Returns nil when nothing has been tokenized yet (get_pos_tags was not
# called, or it returned early because no model is available) instead of
# raising NoMethodError on nil.
def tags
  @tokenized && @tokenized[:tags]
end
tokens() click to toggle source
# File lib/nlp_toolz/pos_tags.rb, line 33
# Returns the list stored under :tokens in @tokenized.
#
# Returns nil when nothing has been tokenized yet (get_pos_tags was not
# called, or it returned early because no model is available) instead of
# raising NoMethodError on nil.
def tokens
  @tokenized && @tokenized[:tokens]
end

Private Instance Methods

get_model() click to toggle source
# File lib/nlp_toolz/pos_tags.rb, line 47
# Loads the POS model named by @model_name from "#{MODELS}/pos/".
#
# On success @model holds the POSModel and @tagger a POSTaggerME built
# from it. When the model file does not exist, @model is set to false
# (which is what has_model? reports) and @tagger is left unchanged.
def get_model
  model_file = "#{MODELS}/pos/#{@model_name}"
  # File.exists? was deprecated and later removed in Ruby 3.2; File.exist?
  # works on every Ruby version.
  if File.exist?(model_file)
    stream = FileInputStream.new(model_file)
    begin
      @model = POSModel.new(stream)
    ensure
      # The model is built from the stream inside the constructor call, so
      # the stream can be released afterwards instead of leaking the handle.
      stream.close
    end
    @tagger = POSTaggerME.new(@model)
  else
    @model = false
  end
end
tokenize_it(stream) click to toggle source

ToDo 2012-11-28: this is only a workaround until the OpenNLP tokenizer is implemented

# File lib/nlp_toolz/pos_tags.rb, line 58
# Parses tagger output into parallel token and tag lists.
#
# stream - a String of whitespace-separated entries, each made of
#          "/"-separated parts.
#
# The last part of every entry is taken as the tag; each preceding part
# becomes its own token paired with that tag (so "and/or/CC" yields the
# tokens "and" and "or", both tagged "CC"). Entries without a "/"
# contribute nothing.
#
# Returns a Hash with :tokens and :tags arrays of equal length.
def tokenize_it(stream)
  stream.split.each_with_object({tokens: [], tags: []}) do |entry, result|
    *words, tag = entry.split("/")
    words.each do |word|
      result[:tokens] << word
      result[:tags] << tag
    end
  end
end