class PROIEL::Treebank
A class representing a PROIEL treebank containing any number of sources. The sources must use the same annotation schema.
Constants
- METADATA_ELEMENTS
Available metadata elements for sources.
Attributes
@return [AnnotationSchema] annotation schema for the treebank
@return [Array<Dictionary>] dictionaries in the treebank
@return [String] PROIEL XML schema version for the treebank
@return [Array<Source>] sources in the treebank
Public Class Methods
Creates a new treebank object.
# File lib/proiel/treebank.rb, line 67

# Creates a new, empty treebank object: no annotation schema or schema
# version, no sources or dictionaries, and empty ID lookup tables.
def initialize
  # Both stay nil until a file has been loaded.
  @schema_version = @annotation_schema = nil

  # Top-level objects held by the treebank.
  @sources = []
  @dictionaries = []

  # Lookup tables mapping an ID to the object carrying it.
  @source_index = {}
  @div_index = {}
  @sentence_index = {}
  @token_index = {}
end
Public Instance Methods
Finds the {Div} object corresponding to a div ID.
@param id [Integer]
@return [nil, Div]
# File lib/proiel/treebank.rb, line 146

# Looks up a {Div} by its ID in the div index.
#
# @param id [Integer] div ID
# @return [nil, Div] the matching div, or nil if the ID is not indexed
# @raise [ArgumentError] if +id+ is not an Integer
def find_div(id)
  return @div_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end
Finds the {Sentence} object corresponding to a sentence ID.
@param id [Integer]
@return [nil, Sentence]
# File lib/proiel/treebank.rb, line 157

# Looks up a {Sentence} by its ID in the sentence index.
#
# @param id [Integer] sentence ID
# @return [nil, Sentence] the matching sentence, or nil if the ID is not indexed
# @raise [ArgumentError] if +id+ is not an Integer
def find_sentence(id)
  return @sentence_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end
Finds the {Source} object corresponding to a source ID.
@param id [String]
@return [nil, Source]
# File lib/proiel/treebank.rb, line 135

# Looks up a {Source} by its ID in the source index.
#
# @param id [String] source ID
# @return [nil, Source] the matching source, or nil if the ID is not indexed
# @raise [ArgumentError] if +id+ is not a String
def find_source(id)
  return @source_index[id] if id.is_a?(String)

  raise ArgumentError, 'string expected'
end
Finds the {Token} object corresponding to a token ID.
@param id [Integer]
@return [nil, Token]
# File lib/proiel/treebank.rb, line 168

# Looks up a {Token} by its ID in the token index.
#
# @param id [Integer] token ID
# @return [nil, Token] the matching token, or nil if the ID is not indexed
# @raise [ArgumentError] if +id+ is not an Integer
def find_token(id)
  return @token_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end
Loads one or more PROIEL XML files.
@param f [String, IO, Array] PROIEL XML files to load
@return [Treebank] treebank object
# File lib/proiel/treebank.rb, line 85

# Loads one or more PROIEL XML files into this treebank.
#
# A String is treated as a filename and opened for reading, an Array is
# loaded element by element, and an IO is parsed directly. Only genuine IO
# instances match the IO branch, so look-alikes such as StringIO raise
# ArgumentError.
#
# @param f [String, IO, Array] PROIEL XML files to load
# @return [Treebank] treebank object
# @raise [ArgumentError] if +f+ is not a filename, an IO or an array of these
# @raise [SchemaMismatch] if the annotation schema or schema version of a
#   file differs from what the treebank already holds; the sources and
#   dictionaries of that file are then not added
def load_from_xml(f)
  case f
  when Array
    f.each { |filename| load_from_xml(filename) }
  when String
    # Block form closes the file handle once loading finishes or raises.
    File.open(f) { |io| load_from_xml(io) }
  when IO
    tf = PROIELXML::Reader.parse_io(f)

    annotation_schema = AnnotationSchema.new(tf.proiel.annotation)
    schema_version = tf.proiel.schema_version

    # The first file loaded decides the schema for the whole treebank.
    @annotation_schema ||= annotation_schema
    @schema_version ||= schema_version

    # Check compatibility before touching @sources, @dictionaries or the
    # indexes, so a mismatching file is rejected without being half-loaded.
    unless @annotation_schema == annotation_schema && @schema_version == schema_version
      raise SchemaMismatch
    end

    # FIXME: consolidate export times? This is a design flaw in PROIEL XML
    # 2.0: export time ought to be per source not per PROIEL XML file, so
    # not clear what to do here. Pass it down to the source object?
    # @export_time = tf.proiel.export_time

    tf.proiel.sources.each do |s|
      source = Source.new(self, s.id, tf.proiel.export_time, s.language, s.dialect,
                          bundle_metadata(s), s.alignment_id) do |new_source|
        build_divs(s, new_source)
      end

      @sources << source
      index_source_objects!(source)
    end

    tf.proiel.dictionaries.each do |s|
      dictionary = Dictionary.new(self, tf.proiel.export_time, s.language, s.dialect, s)

      @dictionaries << dictionary
      index_dictionary_objects!(dictionary)
    end
  else
    raise ArgumentError, 'expected filename, IO or array of these'
  end

  self
end
Private Instance Methods
# File lib/proiel/treebank.rb, line 180

# Builds one Div per parsed div of +s+, attaching each to +source+ and
# building its sentences from inside the Div constructor's block.
#
# For PROIEL XML 2.0 we generate an ID, for PROIEL XML >= 2.1 we respect the
# ID from the XML file. A generated ID is the div's 1-based position within
# +s+, so generated IDs start again at 1 for each call.
#
# @param s parsed source element responding to +divs+
# @param source object the new divs are attached to
# @return [Array] the Div objects, in document order
def build_divs(s, source)
  s.divs.map.with_index(1) do |d, position|
    Div.new(source,
            d.id || position,
            d.title,
            d.presentation_before,
            d.presentation_after,
            d.alignment_id) { |div| build_sentences(d, div) }
  end
end
# File lib/proiel/treebank.rb, line 191

# Builds one Sentence per parsed sentence of +d+, attaching each to +div+
# and building its tokens from inside the Sentence constructor's block.
#
# @param d parsed div element responding to +sentences+
# @param div object the new sentences are attached to
# @return [Array] the Sentence objects, in document order
def build_sentences(d, div)
  d.sentences.map do |e|
    Sentence.new(div,
                 e.id,
                 e.status,
                 e.presentation_before,
                 e.presentation_after,
                 e.alignment_id,
                 e.annotated_by,
                 e.reviewed_by,
                 e.annotated_at,
                 e.reviewed_at) { |sentence| build_tokens(e, sentence) }
  end
end
# File lib/proiel/treebank.rb, line 202

# Builds one Token per parsed token of +e+, attaching each to +sentence+.
#
# @param e parsed sentence element responding to +tokens+
# @param sentence object the new tokens are attached to
# @return [Array] the Token objects, in document order
def build_tokens(e, sentence)
  e.tokens.map do |t|
    Token.new(sentence,
              t.id,
              t.head_id,
              t.form,
              t.lemma,
              t.part_of_speech,
              t.morphology,
              t.relation,
              t.empty_token_sort,
              t.citation_part,
              t.presentation_before,
              t.presentation_after,
              t.antecedent_id,
              t.information_status,
              t.contrast_group,
              t.foreign_ids,
              t.slashes,
              t.alignment_id)
  end
end
# File lib/proiel/treebank.rb, line 176

# Collects the metadata of a parsed source into a hash by calling, on +s+,
# the method named by each entry of METADATA_ELEMENTS.
#
# @param s parsed source element responding to every metadata element name
# @return [Hash] metadata element name => value read from +s+
def bundle_metadata(s)
  METADATA_ELEMENTS.each_with_object({}) do |element, bundle|
    bundle[element] = s.send(element)
  end
end
# File lib/proiel/treebank.rb, line 230

# Placeholder for indexing the contents of a dictionary. It currently does
# nothing with +dictionary+ and returns nil.
#
# @param dictionary the dictionary that was just added to the treebank
# @return [nil]
def index_dictionary_objects!(dictionary)
  # TODO
end
# File lib/proiel/treebank.rb, line 214

# Registers +source+ and every div, sentence and token below it in the ID
# lookup tables. Entries are keyed by ID alone, so an object whose ID is
# already present replaces the earlier entry.
#
# @param source source whose object tree should be indexed
# @return the value of +source.divs.each+
def index_source_objects!(source)
  @source_index[source.id] = source

  source.divs.each do |div|
    @div_index[div.id] = div

    div.sentences.each do |sentence|
      @sentence_index[sentence.id] = sentence

      sentence.tokens.each { |token| @token_index[token.id] = token }
    end
  end
end