class NlpToolz::Parser

Constants

FileInputStream

load java classes

Leaf
Node

Attributes

input[RW]
lang[RW]
model[RW]
model_name[RW]
parse_hash[RW]
parsed[R]

Public Class Methods

new(input, lang = nil) click to toggle source
# File lib/nlp_toolz/parser.rb, line 18
# Sets up a parser for the given text.
#
# input - the text to be parsed.
# lang  - language code; when omitted (nil) it is looked up via
#         NlpToolz::Language.get_language(input).
#
# The grammar file name is derived from the language and get_model is then
# called to resolve it.
def initialize(input, lang = nil)
  @input = input
  language = lang || NlpToolz::Language.get_language(input)
  @lang = language
  @model_name = "#{language}-sm5.gr"
  get_model
end

Public Instance Methods

has_model?() click to toggle source
# File lib/nlp_toolz/parser.rb, line 41
# Reports whether a grammar model is available for parsing.
#
# Returns @model as-is rather than a strict boolean: get_model stores the
# model file path when the file exists and false otherwise, so the result is
# truthy exactly when a model file was found.
def has_model?
  @model
end
hash() click to toggle source
# File lib/nlp_toolz/parser.rb, line 49
# Returns the value stored in @parse_hash, which parse_output_to_hash fills
# with the result of make_hash_hash.
#
# NOTE(review): this method name shadows Object#hash, which Ruby calls when
# an object is used as a Hash key and which is expected to return an Integer
# -- confirm that Parser instances are never used as hash keys.
def hash
  @parse_hash
end
layer(level = nil) click to toggle source
# File lib/nlp_toolz/parser.rb, line 45
# Returns @first_layer, the {tags: [...], tokens: [...]} structure that
# create_leafs fills while pairing tags with tokens.
#
# level - accepted but not used by the current implementation.
def layer(level = nil)
  @first_layer
end
parse_text() click to toggle source
# File lib/nlp_toolz/parser.rb, line 25
# Runs the Berkeley parser on @input and stores the result.
#
# When no grammar model is available (has_model? is falsy) nothing is done.
# Otherwise the cleaned input is written to a temp file, the parser jar is
# run with the model, the raw bracketed output is stored in @parsed and
# parse_output_to_hash converts it into @parse_hash.
#
# Returns the parse hash, or nil when no model is available.
def parse_text
  return unless has_model?

  jar = "#{JARS}/BerkeleyParser-1.7.jar"
  in_file = nil
  out_file = nil

  begin
    in_file = make_tmp_file_from @input.clean_up
    out_file = make_tmp_file_from

    # Argument-array form: no shell is involved, so paths containing spaces
    # or shell metacharacters are passed through unchanged.
    command = ['java', '-Xmx4g', '-jar', jar, '-gr', @model,
               '-inputFile', in_file.path, '-outputFile', out_file.path,
               '-tokenize', '-maxLength', '500']
    # The parser writes its result to out_file; stdout is drained and dropped.
    IO.popen(command, &:read)

    # File.read closes the handle and yields "" (not nil) for an empty file.
    @parsed = File.read(out_file.path).chomp

    parse_output_to_hash
  ensure
    # Remove the temp files even when the parser run or the conversion raises.
    delete_and_unlink_tmp_file in_file if in_file
    delete_and_unlink_tmp_file out_file if out_file
  end
end

Private Instance Methods

create_leafs(parsed) click to toggle source
  Second step of the tree -> hash conversion (after split_parse_tree): merge tags and tokens, create leaves

# File lib/nlp_toolz/parser.rb, line 89
# Turns the token list from split_parse_tree into a node list in which every
# "{TAG" / "token}" pair is replaced by a Leaf.
#
# parsed - Array of Strings such as "{NP", "{DT", "The}", "}".
#
# Side effect: resets @first_layer and records every leaf's tag and token in
# @first_layer[:tags] / @first_layer[:tokens], in order of appearance.
#
# Returns the node Array: remaining bracket/tag Strings mixed with Leaf
# objects (or Arrays of them, when the previous node is a Hash or an Array).
def create_leafs(parsed)
  @first_layer = { tags: [], tokens: [] }
  nodes = []

  parsed.each_with_index do |part, idx|
    upcoming = parsed[idx + 1]

    if part =~ /\{([\w\-]+|\$\p{P}|\p{P})/ && upcoming =~ /([\p{L}\p{N}\-\.]+|\p{P})\}/
      # An opening tag directly followed by its closing token: a leaf.
      tag = part.delete("{")
      token = upcoming.delete("}")
      @first_layer[:tags] << tag
      @first_layer[:tokens] << token

      leaf = Leaf.new(tag.to_sym, token)

      # Group the leaf with the preceding node when that node is a Hash or
      # an Array; otherwise it becomes a node of its own.
      case nodes.last
      when Hash  then nodes[-1] = [nodes.last, leaf]
      when Array then nodes.last << leaf
      else            nodes << leaf
      end
    elsif part !~ /([\p{L}\p{N}\-]+|\p{P})\}/
      # Not the token half of a leaf: keep it as a structural string,
      # normalised to a single leading brace when it is an opening tag.
      nodes << (part =~ /(\{)(.+)/ ? "{#{part.delete("{")}" : "#{part}")
    end
  end

  nodes
end
get_model() click to toggle source

helper for initialize

# File lib/nlp_toolz/parser.rb, line 57
# Resolves the grammar file for the current @model_name.
#
# Looks for "#{MODELS}/parser/<model_name>" and stores the result in @model:
# the file path when it exists, false otherwise.
#
# Returns the value stored in @model.
def get_model
  model_file = "#{MODELS}/parser/#{@model_name}"
  # File.exist? replaces File.exists?, which is no longer available on
  # Ruby 3.2 and later.
  @model = File.exist?(model_file) ? model_file : false
end
make_hash_hash(nodes) click to toggle source
# File lib/nlp_toolz/parser.rb, line 121
# Collapses the flat node list from create_leafs into one nested Hash.
#
# Each pass replaces the right-most opening tag ("{TAG") that has a closing
# "}" after it, together with everything in between, by {TAG: [children]}.
# Passes repeat while the list has more than three elements; the result is
# then the middle element of what is left, which for parser output of the
# form "( (S ...) )" is the hash between the bare root braces.
#
# nodes - Array of Strings ("{", "{TAG", "}"), leaves and already built
#         Hashes.
#
# Returns the nested Hash. Returns nil when a list longer than three elements
# has nothing left to collapse (for example several trees in one list); such
# input previously recursed without end.
def make_hash_hash(nodes)
  collapsed = collapse_last_constituent(nodes)

  if collapsed.nil?
    # No collapsible opening tag. Short lists keep the historic fallback
    # (second element counted from the end).
    return nodes.length > 3 ? nil : nodes.reverse[1]
  end

  collapsed.length > 3 ? make_hash_hash(collapsed) : collapsed[1]
end

# Replaces the right-most "{TAG" ... "}" span of nodes with {TAG: [children]}.
# Returns the new flattened list, or nil when no such span exists.
def collapse_last_constituent(nodes)
  (nodes.length - 1).downto(0) do |open_at|
    node = nodes[open_at]
    # Only Strings can be opening tags; leaves and Hashes must not be matched
    # against the regex (Object#=~ is gone on Ruby 3.2 and later).
    tag = node[/\{(\w+)/, 1] if node.is_a?(String)
    next unless tag

    close_at = ((open_at + 1)...nodes.length).find { |idx| nodes[idx] == "}" }
    next unless close_at

    part = { tag.to_sym => nodes[(open_at + 1)...close_at] }
    # Positive indices keep the tail empty when the closing brace is the
    # last element of the list.
    return [nodes[0...open_at], part, nodes[(close_at + 1)..-1]].flatten
  end

  nil
end
parse_output_to_hash() click to toggle source

convert: tree -> hash

# File lib/nlp_toolz/parser.rb, line 67
# Converts the raw bracketed parser output (the parsed reader) into a nested
# hash: split_parse_tree tokenises it, create_leafs pairs tags with tokens,
# and make_hash_hash nests the result.
#
# Stores the result in @parse_hash and returns it.
def parse_output_to_hash
  tokens = split_parse_tree(self.parsed)
  @parse_hash = make_hash_hash(create_leafs(tokens))
end
split_parse_tree(parsed) click to toggle source
  First step of the tree -> hash conversion: split the bracketed tree into tokens

# File lib/nlp_toolz/parser.rb, line 80
# Tokenises a bracketed parse tree.
#
# parsed - String such as "( (S (NP (DT The)) ) )".
#
# Adjacent closing parentheses are separated by a space so that each one
# becomes its own token, round brackets are swapped for curly braces, and the
# string is split on whitespace.
#
# Returns an Array of Strings, e.g. ["{", "{S", "{NP", "{DT", "The}", "}", ...].
def split_parse_tree(parsed)
  # A ")" that is directly followed by another ")" gets a trailing space.
  spaced = parsed.gsub(/\)(?=\))/, ") ")
  spaced.tr("()", "{}").split
end