class PROIEL::PROIELXML::Validator

A validator object that uses an XML schema as well as additional integrity checks to validate a PROIEL XML file. Functionality for loading the XML schema and checking the PROIEL XML version number is found in {PROIEL::PROIELXML::Schema}.

Attributes

errors[R]

Returns an array of error messages generated during validation.

Public Class Methods

new(filename, aligned_filename = nil) click to toggle source

Creates a new validator for a PROIEL XML file.

@param filename [String] name of PROIEL XML file to validate
@param aligned_filename [NilClass, String] name of PROIEL XML file to validate alignments against

# File lib/proiel/proiel_xml/validator.rb, line 21
# Creates a new validator for a PROIEL XML file.
#
# @param filename [String] name of PROIEL XML file to validate
# @param aligned_filename [NilClass, String] name of PROIEL XML file to
#   validate alignments against
def initialize(filename, aligned_filename = nil)
  @filename, @aligned_filename = filename, aligned_filename

  # Error messages produced by the individual checks are collected here.
  @errors = []
end

Public Instance Methods

has_referential_integrity?() click to toggle source

Checks the referential integrity of the PROIEL XML file.

If inconsistencies are found, error messages will be appended to `errors`.

@return [true, false]

# File lib/proiel/proiel_xml/validator.rb, line 106
# Checks the referential integrity of the PROIEL XML file.
#
# The file is loaded into a fresh treebank and inspected in several passes:
# duplicate sentence/token IDs, locality of head, slash and antecedent
# references, alignment IDs that lack an alignment ID on the source and, when
# an aligned file was given, consistency of the alignments themselves.
#
# If inconsistencies are found, error messages will be appended to `errors`.
#
# @return [true, false]
def has_referential_integrity?
  treebank = PROIEL::Treebank.new
  treebank.load_from_xml(@filename)

  problems = []

  # Pass 1: index all sentence and token IDs and report IDs seen before
  seen_sentence_ids = {}
  token_index = {}

  treebank.sources.each do |source|
    source.divs.each do |div|
      div.sentences.each do |sentence|
        if seen_sentence_ids.key?(sentence.id)
          problems << "Repeated sentence ID #{sentence.id}"
        end
        seen_sentence_ids[sentence.id] = true

        sentence.tokens.each do |token|
          if token_index.key?(token.id)
            problems << "Repeated token ID #{token.id}"
          end
          # Remember where each token lives so that pass 2 can test locality.
          token_index[token.id] = { sentence: sentence.id, div: div.id, source: source.id }
        end
      end
    end
  end

  # Pass 2: check object ID references
  treebank.sources.each do |source|
    source.tokens.each do |token|
      # Head IDs and slash IDs should be sentence internal
      check_reference_locality(problems, token, token_index, :head_id, token.head_id,
                               domain: :sentence, allow_nil: true)

      token.slashes.each do |_relation, target_id|
        check_reference_locality(problems, token, token_index, :slash_id, target_id,
                                 domain: :sentence, allow_nil: false)
      end

      # Antecedent IDs should be source internal
      check_reference_locality(problems, token, token_index, :antecedent_id, token.antecedent_id,
                               domain: :source, allow_nil: true)
    end
  end

  # Pass 3: verify that all features are defined
  # TBD

  # Pass 4: alignment_id on div, sentence or token requires an alignment_id on source
  treebank.sources.each do |source|
    next unless source.alignment_id.nil?

    aligned_children = source.divs.any?(&:alignment_id) ||
                       source.sentences.any?(&:alignment_id) ||
                       source.tokens.any?(&:alignment_id)

    problems << "Alignment ID(s) on divs, sentences or tokens without alignment ID on source" if aligned_children
  end

  # Pass 5: if div is aligned, sentences and tokens within should belong
  # to aligned div(s); if sentence aligned, tokens within should belong
  # to aligned sentence(s). Skip if no alignment_id on source (see pass
  # 4) or if aligned source not available.
  if @aligned_filename
    aligned_treebank = PROIEL::Treebank.new
    aligned_treebank.load_from_xml(@aligned_filename)

    treebank.sources.each do |source|
      next unless source.alignment_id

      counterpart = aligned_treebank.find_source(source.alignment_id)

      if counterpart
        check_alignment_integrity(problems, source, counterpart)
      else
        problems << "Aligned source not available in treebank"
      end
    end
  end

  # Decide if there were any errors
  return true if problems.empty?

  @errors += problems

  false
end
valid?() click to toggle source

Checks if the PROIEL XML file is valid. This checks for well-formedness, a valid schema version, validation against the schema and referential integrity.

If invalid, `errors` will contain error messages.

@return [true, false]

# File lib/proiel/proiel_xml/validator.rb, line 35
# Checks if the PROIEL XML file is valid: well-formedness, schema version,
# schema validation and referential integrity, in that order. Evaluation
# stops at the first check that fails, so later checks never run on a file
# that already failed an earlier one.
#
# If invalid, `errors` will contain error messages.
#
# @return [true, false]
def valid?
  wellformed? &&
    valid_schema_version? &&
    validates? &&
    has_referential_integrity?
end
valid_schema_version?() click to toggle source

Checks if the PROIEL XML file has a valid schema version number.

If invalid, an error message will be appended to `errors`.

@return [true, false]

# File lib/proiel/proiel_xml/validator.rb, line 61
# Checks if the PROIEL XML file has a valid schema version number.
#
# The version is obtained from
# `PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file`; a nil result
# and a raised `InvalidSchemaVersion` are both treated as invalid.
#
# If invalid, an error message will be appended to `errors`.
#
# @return [true, false]
def valid_schema_version?
  version = PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file(@filename)
  return true unless version.nil?

  @errors << 'invalid schema version'

  false
rescue PROIEL::PROIELXML::Schema::InvalidSchemaVersion => error
  # The exception's own message is reported as the error.
  @errors << error.message

  false
end
validates?() click to toggle source

Checks if the PROIEL XML file validates against the schema.

If invalid, error messages will be appended to `errors`.

@return [true, false]

# File lib/proiel/proiel_xml/validator.rb, line 83
# Checks if the PROIEL XML file validates against the schema.
#
# The file is parsed, its schema version is looked up, and the schema loaded
# for that version is run against the parsed document. Each validation
# problem is reported as "Line <n>: <message>".
#
# If invalid, error messages will be appended to `errors`.
#
# @return [true, false]
def validates?
  document = Nokogiri::XML(File.read(@filename))

  version = PROIEL::PROIELXML::Schema.check_schema_version_of_xml_file(@filename)

  violations = PROIEL::PROIELXML::Schema.load_proiel_xml_schema(version).validate(document)
  return true if violations.empty?

  @errors += violations.map { |violation| "Line #{violation.line}: #{violation.message}" }

  false
end
wellformed?() click to toggle source

Checks if the PROIEL XML file is well-formed XML.

If not well-formed, an error message will be appended to `errors`.

@return [true, false]

# File lib/proiel/proiel_xml/validator.rb, line 45
# Checks if the PROIEL XML file is well-formed XML.
#
# The file is parsed with the `strict` parse option switched on; a
# `Nokogiri::XML::SyntaxError` raised while parsing means the file is not
# well-formed.
#
# If not well-formed, an error message will be appended to `errors`.
#
# @return [true, false]
def wellformed?
  # The parsed document itself is not needed, only whether parsing raised.
  Nokogiri::XML(File.read(@filename), &:strict)

  true
rescue Nokogiri::XML::SyntaxError
  @errors << 'XML file is not wellformed'

  false
end

Private Instance Methods

check_alignment_integrity(errors, source, aligned_source) click to toggle source
# File lib/proiel/proiel_xml/validator.rb, line 207
# Checks that the alignments declared on the sentences and divs of a source
# agree with the alignments that can be inferred from its aligned tokens.
#
# For every token with an alignment ID the target token is looked up in the
# treebank of the aligned source; a missing target is reported. The sentences
# containing the targets found are the inferred sentence alignment, and the
# divs containing those sentences are the inferred div alignment. Each is
# compared with the `alignment_id` declared on the sentence or div, which is
# read as a comma-separated list of IDs.
#
# @param errors [Array<String>] collector that error messages are appended to
# @param source [#divs] source whose alignments are checked
# @param aligned_source [#id, #treebank] source that `source` is aligned to
# @return [void]
def check_alignment_integrity(errors, source, aligned_source)
  # Declared IDs are split out of a string, whereas inferred IDs come from
  # the loaded objects and may be integers (assumption: their type is not
  # visible here). Both lists are therefore converted to strings before
  # sorting; sorting one side as strings and the other numerically would
  # order "9,10" as "10,9" on one side only and report a false mismatch.
  canonical = ->(ids) { ids.map(&:to_s).sort.join(',') }

  source.divs.each do |div|
    target_sentences =
      div.sentences.map do |sentence|
        target_tokens =
          sentence.tokens.select(&:alignment_id).map do |token|
            # Check that target token exists in aligned source
            aligned_token = aligned_source.treebank.find_token(token.alignment_id)

            if aligned_token
              aligned_token
            else
              errors << "Token #{token.id}: aligned to token #{aligned_source.id}:#{token.alignment_id} which does not exist"
              nil
            end
          end

        inferred_target_sentences = target_tokens.compact.map(&:sentence).sort_by(&:id).uniq

        if sentence.alignment_id
          a = canonical.call(sentence.alignment_id.to_s.split(','))
          i = canonical.call(inferred_target_sentences.map(&:id))

          # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
          if a != i
            errors << "Sentence #{sentence.id}: aligned to sentence #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
          end
        end

        # The block value feeds the div-level inference below.
        inferred_target_sentences
      end

    inferred_target_divs = target_sentences.flatten.compact.map(&:div).uniq

    if div.alignment_id
      a = canonical.call(div.alignment_id.to_s.split(','))
      i = canonical.call(inferred_target_divs.map(&:id))

      # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
      if a != i
        errors << "Div #{div.id}: aligned to div #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
      end
    end
  end
end
check_reference_locality(errors, token, token_ids, attribute_name, attribute_value, domain: :sentence, allow_nil: false) click to toggle source
# File lib/proiel/proiel_xml/validator.rb, line 190
# Checks that a token ID stored in one of a token's attributes refers to a
# known token located in the same domain as the referring token.
#
# @param errors [Array<String>] collector that error messages are appended to
# @param token [#id] the referring token; it must also respond to the method
#   named by `domain`, returning an object with an `id`
# @param token_ids [Hash] index from token ID to a hash that is looked up by
#   `domain`
# @param attribute_name [Symbol] name of the attribute, used in messages
# @param attribute_value [Object, nil] the referenced token ID
# @param domain [Symbol] domain the reference must stay within
# @param allow_nil [true, false] whether a missing reference is acceptable
def check_reference_locality(errors, token, token_ids, attribute_name,
                             attribute_value, domain: :sentence, allow_nil: false)
  unless attribute_value
    # No reference at all: acceptable only when the caller says so.
    return if allow_nil

    return errors << "Token #{token.id}: #{attribute_name} is null"
  end

  target = token_ids[attribute_value]
  return errors << "Token #{token.id}: #{attribute_name} references an unknown token" if target.nil?
  return if target[domain] == token.send(domain).id

  errors << "Token #{token.id}: #{attribute_name} references a token in a different #{domain}"
end