class Corenlp::Treebank

Attributes

deps_dir[RW]
  Directory containing the Stanford CoreNLP jars and their dependencies (default "./lib/ext").
filenames[RW]
  Paths of the input text files queued for processing.
java_max_memory[RW]
  JVM heap option passed to the java command (default "-Xmx3g").
output_directory[RW]
  Directory where input files, the summary file, and CoreNLP's XML output are written (default "./tmp/language_processing").
raw_text[RW]
  The text to annotate.
sentences[RW]
  Sentence objects built from CoreNLP's XML output by build_treebank.
summary_file[RW]
  File listing the input files, passed to CoreNLP via -filelist.
threads_to_use[RW]
  Value for CoreNLP's -nthreads option (default 4).

Public Class Methods

new(attrs = {})
# File lib/corenlp.rb, line 9
def initialize(attrs = {})
  self.raw_text = attrs[:raw_text] || ""
  self.filenames = []
  self.output_directory = attrs[:output_directory] || "./tmp/language_processing"
  self.summary_file = "#{output_directory}/summary_file_#{object_id}_#{Time.now.to_i}.txt"
  self.threads_to_use = attrs[:threads_to_use] || 4
  self.java_max_memory = attrs[:java_max_memory] || "-Xmx3g"
  self.sentences = []
  self.deps_dir = attrs[:deps_dir] || "./lib/ext"
end
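
All attributes can be supplied at construction time; any omitted key falls back to the defaults above. A minimal sketch (the values shown are illustrative, not required):

require 'corenlp'

treebank = Corenlp::Treebank.new(
  raw_text: "The quick brown fox jumped over the lazy dog.",
  output_directory: "./tmp/language_processing",
  threads_to_use: 2,
  java_max_memory: "-Xmx2g",
  deps_dir: "./lib/ext"
)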

Public Instance Methods

build_treebank()

Parses the CoreNLP XML output for each processed file into Sentence, Token, and TokenDependency objects.
# File lib/corenlp.rb, line 48
def build_treebank
  filenames.each do |filename|
    # CoreNLP writes its annotations to "<input filename>.xml" in the output
    # directory, so each processed input file has an XML counterpart.
    xml_file = "#{filename}.xml"
    doc = File.open(xml_file){|f| Nokogiri.XML(f) }
    doc.xpath("//sentences/sentence").each_with_index do |sentence_node, idx|
      sentence = Sentence.new(index: idx)
      self.sentences << sentence
      sentence_node.xpath(".//token").each_with_index do |token_node, index|
        text = token_node.children.at('word').text
        text = Token.clean_stanford_text(text)
        cleaned_stanford_lemma = Token.clean_stanford_text(token_node.children.at('lemma').text)
        token_attrs = {
          index: index,
          text: text,
          penn_treebank_tag: token_node.children.at('POS').text,
          stanford_lemma: cleaned_stanford_lemma,
          type: Token.token_subclass_from_text(text),
          ner: token_node.children.at('NER').text
        }
        token = Token.token_subclass_from_text(text).new(token_attrs)
        sentence.tokens << token
      end
      sentence_node.xpath(".//dependencies[@type='collapsed-dependencies']/dep").each do |dep_node|
        # CoreNLP token idx attributes are 1-based, and idx 0 denotes the
        # artificial ROOT node; subtracting 1 converts to 0-based indices and
        # lets the guard below skip ROOT arcs.
        dependent_index = dep_node.children.at('dependent').attr('idx').to_i - 1
        governor_index = dep_node.children.at('governor').attr('idx').to_i - 1
        if dependent_index >= 0 && governor_index >= 0
          dependent = sentence.get_dependency_token_by_index(dependent_index)
          governor  = sentence.get_dependency_token_by_index(governor_index)
          relation  = dep_node.attr('type')
          if dependent && governor && relation
            token_dep = TokenDependency.new({
              dependent: dependent,
              governor: governor,
              relation: relation
            })
            sentence.token_dependencies << token_dep
          end
        end
      end
      sentence_node.xpath(".//parse").each do |parse_node|
        sentence.parse_tree_raw = parse_node.text
      end
    end
  end
end
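
Once built, the treebank can be inspected through the structures populated above. A sketch, assuming Token and TokenDependency expose readers for the attributes they are constructed with:

treebank.sentences.each do |sentence|
  sentence.tokens.each do |token|
    puts [token.text, token.penn_treebank_tag, token.stanford_lemma].join("\t")
  end
  sentence.token_dependencies.each do |dep|
    puts "#{dep.relation}(#{dep.governor.text}, #{dep.dependent.text})"
  end
end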
parse()

The high-level entry point: writes the input and summary files, runs Stanford CoreNLP over them, builds the treebank, and returns self.
# File lib/corenlp.rb, line 94
def parse
  write_output_file_and_summary_file
  process_files_with_stanford_corenlp
  build_treebank
  self
end
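
A minimal end-to-end sketch, assuming the CoreNLP 3.4 jars are present in deps_dir, java is on the PATH, and output_directory already exists:

treebank = Corenlp::Treebank.new(raw_text: "Stanford CoreNLP annotates text.")
treebank.parse
treebank.sentences.first.tokens.map(&:text)
# e.g. ["Stanford", "CoreNLP", "annotates", "text", "."]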
process_files_with_stanford_corenlp()
# File lib/corenlp.rb, line 28
def process_files_with_stanford_corenlp
  classpath = "#{deps_dir}/stanford-corenlp-3.4.jar:#{deps_dir}/stanford-corenlp-3.4-models.jar:#{deps_dir}/xom.jar:#{deps_dir}/joda-time.jar:#{deps_dir}/jollyday.jar:#{deps_dir}/ejml-0.23.jar"
  stanford_bin = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
  annotators = "tokenize,ssplit,pos,lemma,parse,ner"

  options = []
  options << ["-cp", classpath]
  options << [java_max_memory, stanford_bin]
  options << ["-annotators", annotators]
  options << ["-ner.useSUTime", 0] # turn this off
  #options << ["-sutime.binders", 0]
  options << ["-outputDirectory", output_directory]
  options << ["-nthreads", threads_to_use]
  options << ["-filelist", summary_file] # a file with one zone file per line

  command = "java #{options.map{|x| x.join(" ")}.join(" ")}"
  puts "Running command: \n\n#{command}\n\n"
  `#{command}`
end
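
With the defaults above, the interpolated command has roughly this shape (classpath abbreviated, line continuations added for readability; the actual summary file name contains the object id and a timestamp):

java -cp ./lib/ext/stanford-corenlp-3.4.jar:./lib/ext/stanford-corenlp-3.4-models.jar:... \
  -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP \
  -annotators tokenize,ssplit,pos,lemma,parse,ner \
  -ner.useSUTime 0 \
  -outputDirectory ./tmp/language_processing \
  -nthreads 4 \
  -filelist ./tmp/language_processing/summary_file_<object_id>_<timestamp>.txt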
write_output_file_and_summary_file()
# File lib/corenlp.rb, line 21
def write_output_file_and_summary_file
  input_file = File.join(output_directory, "text_#{object_id}_#{Time.now.to_i}.txt")
  filenames << input_file
  # Write the raw text to the input file, then record every input file in the
  # summary file that CoreNLP later consumes via -filelist.
  File.open(input_file, "w"){|f| f.write(raw_text)}
  File.open(summary_file, "w"){|f| f.write(filenames.join("\n"))}
end
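
Note that nothing in the methods shown creates output_directory, so File.open raises Errno::ENOENT if it is missing. A one-line guard before calling parse:

require 'fileutils'
FileUtils.mkdir_p(treebank.output_directory)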