class PROIEL::Treebank

A class representing a PROIEL treebank containing any number of sources. All sources must use the same annotation schema and the same PROIEL XML schema version.

Constants

METADATA_ELEMENTS

Available metadata elements for sources.

Attributes

annotation_schema[R]

@return [AnnotationSchema] annotation schema for the treebank

dictionaries[R]

@return [Array<Dictionary>] dictionaries in the treebank

schema_version[R]

@return [String] PROIEL XML schema version for the treebank

sources[R]

@return [Array<Source>] sources in the treebank

Public Class Methods

new() click to toggle source

Creates a new treebank object.

# File lib/proiel/treebank.rb, line 67
# Creates an empty treebank: no schema information, no sources, no
# dictionaries, and empty ID lookup tables.
def initialize
  # Schema information starts out unset.
  @annotation_schema = @schema_version = nil

  # Top-level collections exposed through the public readers.
  @sources, @dictionaries = [], []

  # ID => object lookup tables, one independent hash per object type.
  @source_index, @div_index, @sentence_index = {}, {}, {}
  @token_index = {}
end

Public Instance Methods

find_div(id) click to toggle source

Finds the {Div} object corresponding to a div ID.

@param id [Integer]

@return [nil, Div]

# File lib/proiel/treebank.rb, line 146
# Looks up a div by its ID in the div index.
#
# @param id [Integer] div ID
#
# @return [nil, Div] the indexed div, or nil if the ID is not indexed
#
# @raise [ArgumentError] if id is not an Integer
def find_div(id)
  return @div_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end
find_sentence(id) click to toggle source

Finds the {Sentence} object corresponding to a sentence ID.

@param id [Integer]

@return [nil, Sentence]

# File lib/proiel/treebank.rb, line 157
# Looks up a sentence by its ID in the sentence index.
#
# @param id [Integer] sentence ID
#
# @return [nil, Sentence] the indexed sentence, or nil if the ID is not indexed
#
# @raise [ArgumentError] if id is not an Integer
def find_sentence(id)
  if id.is_a?(Integer)
    @sentence_index[id]
  else
    raise ArgumentError, 'integer expected'
  end
end
find_source(id) click to toggle source

Finds the {Source} object corresponding to a source ID.

@param id [String]

@return [nil, Source]

# File lib/proiel/treebank.rb, line 135
# Looks up a source by its ID in the source index.
#
# @param id [String] source ID
#
# @return [nil, Source] the indexed source, or nil if the ID is not indexed
#
# @raise [ArgumentError] if id is not a String
def find_source(id)
  if id.is_a?(String)
    @source_index[id]
  else
    raise ArgumentError, 'string expected'
  end
end
find_token(id) click to toggle source

Finds the {Token} object corresponding to a token ID.

@param id [Integer]

@return [nil, Token]

# File lib/proiel/treebank.rb, line 168
# Looks up a token by its ID in the token index.
#
# @param id [Integer] token ID
#
# @return [nil, Token] the indexed token, or nil if the ID is not indexed
#
# @raise [ArgumentError] if id is not an Integer
def find_token(id)
  return @token_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end
load_from_xml(f) click to toggle source

Loads one or more PROIEL XML files.

@param f [String, IO, Array] a filename, an open IO object, or an array of these

@return [Treebank] treebank object

# File lib/proiel/treebank.rb, line 85
# Loads one or more PROIEL XML files into this treebank.
#
# A String is treated as a filename: the file is opened for the duration of
# the load and closed afterwards, even if loading raises. An IO passed in by
# the caller is read but not closed. An Array is loaded element by element.
#
# The annotation schema and schema version of a file are compared with those
# already recorded for the treebank before any of the file's sources or
# dictionaries are added, so a mismatching file does not leave its contents
# half-loaded in the treebank.
#
# @param f [String, IO, Array] a filename, an IO object, or an array of these
#
# @return [Treebank] self
#
# @raise [ArgumentError] if f is not a String, an IO or an Array
# @raise [SchemaMismatch] if the file's annotation schema or schema version
#   differs from the one already recorded for this treebank
def load_from_xml(f)
  case f
  when Array
    f.each { |filename| load_from_xml(filename) }
  when String
    # Block form guarantees the handle is closed once loading finishes or fails.
    File.open(f) { |io| load_from_xml(io) }
  when IO
    tf = PROIELXML::Reader.parse_io(f)

    annotation_schema = AnnotationSchema.new(tf.proiel.annotation)
    schema_version = tf.proiel.schema_version

    # The first file loaded defines the schema; later files must agree with it.
    expected_annotation_schema = @annotation_schema || annotation_schema
    expected_schema_version = @schema_version || schema_version

    compatible = expected_annotation_schema == annotation_schema &&
                 expected_schema_version == schema_version
    raise SchemaMismatch unless compatible

    # FIXME: consolidate export times? This is a design flaw in PROIEL XML
    # 2.0: export time ought to be per source not per PROIEL XML file, so
    # not clear what to do here. Pass it down to the source object?
    # @export_time = tf.proiel.export_time

    tf.proiel.sources.each do |s|
      @sources << Source.new(self, s.id, tf.proiel.export_time, s.language, s.dialect,
                             bundle_metadata(s), s.alignment_id) do |source|
        build_divs(s, source)
      end

      index_source_objects!(@sources.last)
    end

    tf.proiel.dictionaries.each do |s|
      @dictionaries << Dictionary.new(self, tf.proiel.export_time, s.language, s.dialect, s)

      index_dictionary_objects!(@dictionaries.last)
    end

    # Record the schema only after the file's contents were added, as before.
    @annotation_schema = expected_annotation_schema
    @schema_version = expected_schema_version
  else
    raise ArgumentError, 'expected filename, IO or array of these'
  end

  self
end

Private Instance Methods

build_divs(s, source) click to toggle source
# File lib/proiel/treebank.rb, line 180
# Builds one Div per entry in s.divs, attached to the given source, and
# returns them as an array. Each Div is handed a block that builds its
# sentences from the corresponding parsed div.
def build_divs(s, source)
  # PROIEL XML 2.0 has no div IDs, so the 1-based position is used instead;
  # PROIEL XML >= 2.1 supplies an ID, which takes precedence.
  s.divs.map.with_index(1) do |xml_div, position|
    div_id = xml_div.id || position

    Div.new(source, div_id, xml_div.title, xml_div.presentation_before,
            xml_div.presentation_after, xml_div.alignment_id) do |div|
      build_sentences(xml_div, div)
    end
  end
end
build_sentences(d, div) click to toggle source
# File lib/proiel/treebank.rb, line 191
# Builds one Sentence per entry in d.sentences, attached to the given div,
# and returns them as an array. Each Sentence is handed a block that builds
# its tokens from the corresponding parsed sentence.
def build_sentences(d, div)
  d.sentences.map do |xml_sentence|
    # Positional constructor arguments following the div, in order.
    attributes = [
      xml_sentence.id,
      xml_sentence.status,
      xml_sentence.presentation_before,
      xml_sentence.presentation_after,
      xml_sentence.alignment_id,
      xml_sentence.annotated_by,
      xml_sentence.reviewed_by,
      xml_sentence.annotated_at,
      xml_sentence.reviewed_at
    ]

    Sentence.new(div, *attributes) { |sentence| build_tokens(xml_sentence, sentence) }
  end
end
build_tokens(e, sentence) click to toggle source
# File lib/proiel/treebank.rb, line 202
# Builds one Token per entry in e.tokens, attached to the given sentence,
# and returns them as an array.
def build_tokens(e, sentence)
  # Attributes read from each parsed token, in the positional order in which
  # they are passed to Token.new after the sentence.
  fields = %i[
    id head_id form lemma
    part_of_speech morphology relation
    empty_token_sort citation_part
    presentation_before presentation_after
    antecedent_id information_status
    contrast_group foreign_ids
    slashes alignment_id
  ]

  e.tokens.map do |xml_token|
    values = fields.map { |field| xml_token.public_send(field) }
    Token.new(sentence, *values)
  end
end
bundle_metadata(s) click to toggle source
# File lib/proiel/treebank.rb, line 176
# Collects the metadata of a parsed source into a hash keyed by the entries
# of METADATA_ELEMENTS, reading each value by calling the method of the same
# name on s.
def bundle_metadata(s)
  METADATA_ELEMENTS.each_with_object({}) do |element, metadata|
    metadata[element] = s.send(element)
  end
end
index_dictionary_objects!(dictionary) click to toggle source
# File lib/proiel/treebank.rb, line 230
# Placeholder for indexing the contents of a dictionary. The body is
# currently empty, so nothing is indexed and nil is returned.
#
# @param dictionary [Dictionary] the dictionary to index (currently unused)
#
# @return [nil]
def index_dictionary_objects!(dictionary)
  # TODO: not implemented; no dictionary objects are indexed yet.
end
index_source_objects!(source) click to toggle source
# File lib/proiel/treebank.rb, line 214
# Registers a source and everything below it (divs, sentences, tokens) in
# the ID lookup tables. An existing entry with the same ID is overwritten.
#
# NOTE(review): div IDs generated by build_divs for PROIEL XML 2.0 restart at
# 1 for every source, so with several such sources later divs would replace
# earlier ones in the div index - confirm whether this is intended.
def index_source_objects!(source)
  @source_index.store(source.id, source)

  source.divs.each do |d|
    @div_index.store(d.id, d)

    d.sentences.each do |s|
      @sentence_index.store(s.id, s)

      s.tokens.each { |t| @token_index.store(t.id, t) }
    end
  end
end