class Corenlp::Treebank
Attributes
deps_dir[RW]
    Directory holding the Stanford CoreNLP jars (default "./lib/ext").
filenames[RW]
    Paths of the input text files handed to CoreNLP.
java_max_memory[RW]
    JVM heap option passed to java (default "-Xmx3g").
output_directory[RW]
    Directory where input files and CoreNLP's XML output are written (default "./tmp/language_processing").
raw_text[RW]
    The raw text to annotate.
sentences[RW]
    Sentence objects accumulated by build_treebank.
summary_file[RW]
    Path of the file passed to CoreNLP via -filelist; lists one input file per line.
threads_to_use[RW]
    Value for CoreNLP's -nthreads option (default 4).
Public Class Methods
new(attrs = {})
# File lib/corenlp.rb, line 9
def initialize(attrs = {})
  self.raw_text         = attrs[:raw_text] || ""
  self.filenames        = []
  self.output_directory = attrs[:output_directory] || "./tmp/language_processing"
  self.summary_file     = "#{output_directory}/summary_file_#{object_id}_#{Time.now.to_i}.txt"
  self.threads_to_use   = attrs[:threads_to_use] || 4
  self.java_max_memory  = attrs[:java_max_memory] || "-Xmx3g"
  self.sentences        = []
  self.deps_dir         = attrs[:deps_dir] || "./lib/ext"
end
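For reference, a minimal construction sketch; every key is optional and falls back to the defaults shown above (the raw_text value is purely illustrative):

require 'corenlp'

treebank = Corenlp::Treebank.new(
  raw_text:         "Stanford CoreNLP parses this sentence.", # hypothetical input
  output_directory: "./tmp/language_processing",
  threads_to_use:   4,
  java_max_memory:  "-Xmx3g",
  deps_dir:         "./lib/ext" # directory holding the CoreNLP jars
)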
Public Instance Methods
build_treebank()
# File lib/corenlp.rb, line 48
def build_treebank
  filenames.each do |filename|
    xml_file = "#{filename}.xml"
    doc = Nokogiri.XML(File.open(xml_file))
    doc.xpath("//sentences/sentence").each_with_index do |sentence_node, idx|
      sentence = Sentence.new(index: idx)
      self.sentences << sentence
      sentence_node.xpath(".//token").each_with_index do |token_node, index|
        text = Token.clean_stanford_text(token_node.children.at('word').text)
        cleaned_stanford_lemma = Token.clean_stanford_text(token_node.children.at('lemma').text)
        token_attrs = {
          index: index,
          text: text,
          penn_treebank_tag: token_node.children.at('POS').text,
          stanford_lemma: cleaned_stanford_lemma,
          type: Token.token_subclass_from_text(text),
          ner: token_node.children.at('NER').text
        }
        sentence.tokens << Token.token_subclass_from_text(text).new(token_attrs)
      end
      sentence_node.xpath(".//dependencies[@type='collapsed-dependencies']/dep").each do |dep_node|
        dependent_index = dep_node.children.at('dependent').attr('idx').to_i - 1
        governor_index  = dep_node.children.at('governor').attr('idx').to_i - 1
        if dependent_index >= 0 && governor_index >= 0
          dependent = sentence.get_dependency_token_by_index(dependent_index)
          governor  = sentence.get_dependency_token_by_index(governor_index)
          relation  = dep_node.attr('type')
          if dependent && governor && relation
            sentence.token_dependencies << TokenDependency.new(
              dependent: dependent,
              governor:  governor,
              relation:  relation
            )
          end
        end
      end
      sentence_node.xpath(".//parse").each do |parse_node|
        sentence.parse_tree_raw = parse_node.text
      end
    end
  end
end
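To make the XPath queries above concrete, here is a hypothetical, heavily trimmed fragment of the per-file XML that CoreNLP emits, with the element names the method walks. Note that a governor idx of 0 denotes CoreNLP's artificial ROOT node; after the - 1 shift it becomes negative, which is why the method guards on indexes >= 0.

require 'nokogiri'

xml = <<-XML
<root><document><sentences>
  <sentence id="1">
    <tokens>
      <token id="1">
        <word>Hello</word><lemma>hello</lemma><POS>UH</POS><NER>O</NER>
      </token>
    </tokens>
    <parse>(ROOT (INTJ (UH Hello)))</parse>
    <dependencies type="collapsed-dependencies">
      <dep type="root">
        <governor idx="0">ROOT</governor>
        <dependent idx="1">Hello</dependent>
      </dep>
    </dependencies>
  </sentence>
</sentences></document></root>
XML

doc = Nokogiri.XML(xml)
token_node = doc.xpath("//sentences/sentence").first.at_xpath(".//token")
token_node.at_xpath("word").text             # => "Hello"
doc.at_xpath("//dep/dependent").attr("idx")  # => "1"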
parse()
# File lib/corenlp.rb, line 94
def parse
  write_output_file_and_summary_file
  process_files_with_stanford_corenlp
  build_treebank
  self
end
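A minimal end-to-end sketch, assuming the CoreNLP jars listed in process_files_with_stanford_corenlp sit under ./lib/ext and the default output directory exists; the token and sentence accessors used below are inferred from build_treebank above:

require 'corenlp'

treebank = Corenlp::Treebank.new(raw_text: "The quick brown fox jumped.")
treebank.parse # writes the input files, shells out to CoreNLP, builds the treebank

treebank.sentences.each do |sentence|
  sentence.tokens.each { |token| puts "#{token.text}\t#{token.penn_treebank_tag}" }
  puts sentence.parse_tree_raw
end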
process_files_with_stanford_corenlp()
# File lib/corenlp.rb, line 28
def process_files_with_stanford_corenlp
  classpath = "#{deps_dir}/stanford-corenlp-3.4.jar:#{deps_dir}/stanford-corenlp-3.4-models.jar:" \
              "#{deps_dir}/xom.jar:#{deps_dir}/joda-time.jar:#{deps_dir}/jollyday.jar:#{deps_dir}/ejml-0.23.jar"
  stanford_bin = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
  annotators = "tokenize,ssplit,pos,lemma,parse,ner"
  options = []
  options << ["-cp", classpath]
  options << [java_max_memory, stanford_bin]
  options << ["-annotators", annotators]
  options << ["-ner.useSUTime", 0] # turn SUTime off
  # options << ["-sutime.binders", 0]
  options << ["-outputDirectory", output_directory]
  options << ["-nthreads", threads_to_use]
  options << ["-filelist", summary_file] # a file listing one input file per line
  command = "java #{options.map { |x| x.join(" ") }.join(" ")}"
  puts "Running command: \n\n#{command}\n\n"
  `#{command}`
end
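With the constructor defaults, the interpolated command comes out roughly as below (the method emits it on a single line; it is wrapped here for readability, and the summary-file name embeds the object id and a timestamp, so the exact path varies):

java -cp ./lib/ext/stanford-corenlp-3.4.jar:./lib/ext/stanford-corenlp-3.4-models.jar:./lib/ext/xom.jar:./lib/ext/joda-time.jar:./lib/ext/jollyday.jar:./lib/ext/ejml-0.23.jar
  -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP
  -annotators tokenize,ssplit,pos,lemma,parse,ner
  -ner.useSUTime 0
  -outputDirectory ./tmp/language_processing
  -nthreads 4
  -filelist ./tmp/language_processing/summary_file_<object_id>_<timestamp>.txt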
write_output_file_and_summary_file()
# File lib/corenlp.rb, line 21
def write_output_file_and_summary_file
  input_file = File.join(output_directory, "text_#{object_id}_#{Time.now.to_i}.txt")
  filenames << input_file
  File.open(input_file, "w") { |f| f.write(raw_text) }
  File.open(summary_file, "w") { |f| f.write(filenames.join("\n")) }
end
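A small sketch of the effect, assuming the output directory already exists (the method does not create it):

treebank = Corenlp::Treebank.new(raw_text: "Some text.")
treebank.write_output_file_and_summary_file

# Two files now exist under output_directory:
#   text_<object_id>_<timestamp>.txt          -- the raw text
#   summary_file_<object_id>_<timestamp>.txt  -- one input-file path per line
File.read(treebank.summary_file) # => "./tmp/language_processing/text_..."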