module PROIEL::Alignment::Builder

Public Class Methods

compute_matrix(alignment, source, blacklist = [], log_directory = nil) click to toggle source

Computes a matrix of aligned original and translation sentences. For now, this function does not handle unaligned translation sentences (this is tricky to handle robustly!). As the current treebank collection stands, this issue should not arise, so for now this is a reasonable approximation.

# File lib/proiel/alignment/builder.rb, line 9
# Builds the merged alignment matrix for an original text and its translation.
#
# The sentences are grouped twice, once keyed on the original text
# (+group_backwards+) and once keyed on the translation (+group_forwards+).
# The two row sequences are then walked in parallel: rows without a
# counterpart (empty +:translation+ in the first sequence, empty +:original+
# in the second) are copied through unchanged, and every other pair of rows
# is reconciled by +repair+, which may merge up to five consecutive rows from
# either sequence.
#
# @param alignment object responding to +sentences+ (the original text)
# @param source object responding to +sentences+ and, when logging, +id+
#   (the translation)
# @param blacklist [Array] passed on unchanged to the grouping helpers
# @param log_directory [String, nil] when given, the two intermediate
#   matrices and the merged matrix are dumped to files named after
#   +source.id+ with the suffixes 1, 2 and 3
# @return [Array<Hash>] rows of the form
#   <tt>{ original: [ids], translation: [ids] }</tt>
# @raise [RuntimeError] if a grouping or the merged matrix does not list the
#   sentence IDs in text order, or if two rows cannot be reconciled
def self.compute_matrix(alignment, source, blacklist = [], log_directory = nil)
  original_ids = alignment.sentences.map(&:id)
  translation_ids = source.sentences.map(&:id)

  # Writes one inspected row per line; a no-op unless logging was requested.
  write_log = lambda do |suffix, rows|
    next unless log_directory

    File.open(File.join(log_directory, "#{source.id}#{suffix}"), 'w') do |f|
      rows.each { |row| f.puts row.inspect }
    end
  end

  matrix1 = group_backwards(alignment, source, blacklist)
  unless matrix1.flat_map { |r| r[:original] }.flatten.compact == original_ids
    raise 'backward grouping does not list the original sentence IDs in text order'
  end

  matrix2 = group_forwards(alignment, source, blacklist)
  unless matrix2.flat_map { |r| r[:translation] }.flatten.compact == translation_ids
    raise 'forward grouping does not list the translation sentence IDs in text order'
  end

  # Verify that both texts are still in the correct sequence
  write_log.call(1, matrix1)
  write_log.call(2, matrix2)

  # Look-ahead combinations in the order they are attempted: an exact match
  # first, then progressively larger merges of up to four extra rows per side.
  deltas = (0..4).flat_map do |n|
    (0...n).flat_map { |k| [[n, k], [k, n]] } << [n, n]
  end

  matrix = []
  iter1 = { i: 0, m: matrix1 }
  iter2 = { i: 0, m: matrix2 }

  loop do
    # Take from matrix1 unless we have a translation
    while iter1[:i] < iter1[:m].length && iter1[:m][iter1[:i]][:translation].empty?
      matrix << iter1[:m][iter1[:i]]
      iter1[:i] += 1
    end

    # Take from matrix2 unless we have an original
    while iter2[:i] < iter2[:m].length && iter2[:m][iter2[:i]][:original].empty?
      matrix << iter2[:m][iter2[:i]]
      iter2[:i] += 1
    end

    unless iter1[:i] < iter1[:m].length && iter2[:i] < iter2[:m].length
      # At least one side is exhausted; both must be, or rows were left over.
      unless iter1[:i] == iter1[:m].length && iter2[:i] == iter2[:m].length
        raise "alignment matrices ended out of step (row #{iter1[:i]} of #{iter1[:m].length} " \
              "versus row #{iter2[:i]} of #{iter2[:m].length})"
      end

      break
    end

    # Now the two should match provided alignments are sorted the same way,
    # so take one from each. If they don't match outright, we may have a case
    # of swapped sentence orders or a gap (one sentence unaligned in one of
    # the texts surrounded by two sentences that are aligned to the same
    # sentence in the other text). We'll try to repair this by merging bits
    # from the next row in various combinations.
    #
    # When adding to the new matrix, pick original from matrix1 and
    # translation from matrix2 so that the original textual order is
    # preserved
    next if deltas.any? { |delta1, delta2| repair(matrix, iter1, delta1, iter2, delta2) }

    STDERR.puts iter1[:i], iter1[:m][iter1[:i]].inspect
    STDERR.puts iter2[:i], iter2[:m][iter2[:i]].inspect
    raise "unable to reconcile alignment rows #{iter1[:i]} and #{iter2[:i]}"
  end

  write_log.call(3, matrix)

  unless matrix.flat_map { |r| r[:original] }.flatten.compact == original_ids
    raise 'merged matrix does not list the original sentence IDs in text order'
  end
  unless matrix.flat_map { |r| r[:translation] }.flatten.compact == translation_ids
    raise 'merged matrix does not list the translation sentence IDs in text order'
  end

  matrix
end

Private Class Methods

group_backwards(alignment, source, blacklist = []) click to toggle source
# File lib/proiel/alignment/builder.rb, line 145
# Groups sentences from the point of view of the original text.
#
# Every sentence in +alignment+ gets a row, in text order, listing the IDs of
# the +source+ sentences whose inferred alignment points at it. Sentences in
# +source+ whose ID is in +blacklist+ contribute nothing. Neighbouring rows
# that share at least one translation ID are merged into one m-to-n row.
#
# Returns an array of <tt>{ original: [ids], translation: [ids] }</tt> hashes.
def self.group_backwards(alignment, source, blacklist = [])
  # Original ID => translation IDs, seeded so unaligned originals keep a row
  translations_by_original = alignment.sentences.each_with_object({}) do |sentence, table|
    table[sentence.id] = []
  end

  source.sentences.each do |sentence|
    next if blacklist.include?(sentence.id)

    sentence.inferred_alignment(alignment).map(&:id).each do |original_id|
      translations_by_original[original_id] << sentence.id
    end
  end

  rows = translations_by_original.map do |original_id, translation_ids|
    { original: [original_id], translation: translation_ids }
  end

  # Adjacent rows belong together when their translation IDs overlap
  groups = rows.chunk_while do |left, right|
    overlap = left[:translation] & right[:translation]
    !overlap.empty?
  end

  groups.map do |group|
    {
      original: group.flat_map { |row| row[:original] },
      translation: group.flat_map { |row| row[:translation] }.uniq
    }
  end
end
group_forwards(alignment, source, blacklist = []) click to toggle source
# File lib/proiel/alignment/builder.rb, line 115
# Groups sentences from the point of view of the translation.
#
# Every sentence in +source+ gets a row, in text order, listing the IDs of the
# sentences its inferred alignment points at; a sentence whose ID is in
# +blacklist+ gets an empty list instead. Neighbouring rows that share at
# least one original ID are merged into one m-to-n row.
#
# Returns an array of <tt>{ original: [ids], translation: [ids] }</tt> hashes.
def self.group_forwards(alignment, source, blacklist = [])
  # Translation ID => original IDs
  originals_by_translation = source.sentences.each_with_object({}) do |sentence, table|
    table[sentence.id] =
      if blacklist.include?(sentence.id)
        []
      else
        sentence.inferred_alignment(alignment).map(&:id)
      end
  end

  rows = originals_by_translation.map do |translation_id, original_ids|
    { original: original_ids, translation: [translation_id] }
  end

  # Adjacent rows belong together when their original IDs overlap
  groups = rows.chunk_while do |left, right|
    overlap = left[:original] & right[:original]
    !overlap.empty?
  end

  groups.map do |group|
    {
      original: group.flat_map { |row| row[:original] }.uniq,
      translation: group.flat_map { |row| row[:translation] }
    }
  end
end
repair(matrix, iter1, delta1, iter2, delta2) click to toggle source
# File lib/proiel/alignment/builder.rb, line 191
# Tries to reconcile the current rows of two row sequences, optionally merging
# look-ahead rows, and appends the reconciled row to +matrix+ on success.
#
# Rows +iter1[:i]..iter1[:i] + delta1+ and +iter2[:i]..iter2[:i] + delta2+ are
# merged. The merge is accepted when both sides list the same original IDs and
# the same translation IDs, ignoring originals from rows of the first sequence
# that have no translation and translations from rows of the second sequence
# that have no original.
#
# @param matrix [Array<Hash>] output rows; appended to on success
# @param iter1 [Hash] cursor <tt>{ i: index, m: rows }</tt>; advanced on success
# @param delta1 [Integer] number of extra rows to merge from +iter1+
# @param iter2 [Hash] cursor <tt>{ i: index, m: rows }</tt>; advanced on success
# @param delta2 [Integer] number of extra rows to merge from +iter2+
# @return [Boolean] true if a row was appended, false if nothing was changed
def self.repair(matrix, iter1, delta1, iter2, delta2)
  # A look-ahead that runs past the last row cannot be a valid repair; bail
  # out here instead of letting the helpers index into nil.
  return false if iter1[:i] + delta1 >= iter1[:m].length
  return false if iter2[:i] + delta2 >= iter2[:m].length

  o1 = repair_merge_cells(iter1, delta1, :original)
  o2 = repair_merge_cells(iter2, delta2, :original)

  t1 = repair_merge_cells(iter1, delta1, :translation)
  t2 = repair_merge_cells(iter2, delta2, :translation)

  # IDs that only one side can know about because the row has no counterpart
  u1 = select_unaligned(iter1, delta1, :original, :translation)
  u2 = select_unaligned(iter2, delta2, :translation, :original)

  return false unless o1.sort - u1 == o2.sort.uniq && t1.sort.uniq == t2.sort - u2

  unless delta1.zero? && delta2.zero?
    STDERR.puts "Assuming #{delta1 + 1}/#{delta2 + 1} swapped sentence order:"
    STDERR.puts ' * ' + (0..delta1).map { |j| iter1[:m][iter1[:i] + j].inspect }.join(' + ')
    STDERR.puts ' * ' + (0..delta2).map { |j| iter2[:m][iter2[:i] + j].inspect }.join(' + ')
  end

  # Originals come from the first sequence and translations from the second
  # so that each keeps its own textual order.
  matrix << { original: o1, translation: t2 }

  iter1[:i] += delta1 + 1
  iter2[:i] += delta2 + 1

  true
end
repair_merge_cells(iter, delta, field) click to toggle source
# File lib/proiel/alignment/builder.rb, line 181
# Concatenates the +field+ cells of rows +iter[:i]+ through
# +iter[:i] + delta+ of +iter[:m]+, in row order.
def self.repair_merge_cells(iter, delta, field)
  rows = iter[:m]
  start = iter[:i]
  cells = 0.upto(delta).map { |offset| rows[start + offset][field] }
  cells.reduce(:+)
end
select_unaligned(iter, delta, field, check_field) click to toggle source
# File lib/proiel/alignment/builder.rb, line 186
# Collects, flattened, the +field+ cells of those rows among +iter[:i]+
# through +iter[:i] + delta+ of +iter[:m]+ whose +check_field+ cell is empty.
def self.select_unaligned(iter, delta, field, check_field)
  rows = iter[:m]
  start = iter[:i]
  picked = []

  0.upto(delta) do |offset|
    row = rows[start + offset]
    picked << row[field] if row[check_field].empty?
  end

  picked.flatten
end