class AmazonTRP::Page

Attributes

blocks[R]
content[R]
form[R]
geometry[R]
id[R]
lines[R]
tables[R]
text[R]

Public Class Methods

new(blocks, blockMap)
# File lib/amazon-textract-parser-ruby.rb, line 435
# Builds a page from the blocks that belong to it.
#
# blocks   - the page's blocks; stored as-is and iterated by _parse.
# blockMap - handed unchanged to _parse.
#
# All collections start out empty and are filled in by the _parse call
# at the end of construction.
def initialize(blocks, blockMap)
  @blocks = blocks

  # Empty containers that _parse populates.
  @lines, @tables, @content = [], [], []
  @text = ""
  @form = Form.new

  _parse(blockMap)
end

Public Instance Methods

_parse(blockMap)
# File lib/amazon-textract-parser-ruby.rb, line 454
# Walks every block of this page and fills in the page's state.
#
# Dispatch is on item[:block_type]:
# * "PAGE"          - records the page geometry and id.
# * "LINE"          - wraps the block in a Line, stores it in @lines and
#                     @content, and appends its text plus a newline to @text.
# * "TABLE"         - wraps the block in a Table, stores it in @tables and
#                     @content.
# * "KEY_VALUE_SET" - only blocks whose :entity_types include 'KEY' are
#                     wrapped in a Field; the field is kept (in @form and
#                     @content) only when it has a key.
# Blocks of any other type are ignored.
#
# blockMap - passed unchanged to the Line, Table and Field constructors.
def _parse(blockMap)
  @blocks.each do |item|
    case item[:block_type]
    when "PAGE"
      @geometry = Geometry.new(item[:geometry])
      @id = item[:id]
    when "LINE"
      line = Line.new(item, blockMap)
      @lines.append(line)
      @content.append(line)
      # Double quotes are required: '\n' in single quotes is a literal
      # backslash followed by "n", not a line break.
      @text = @text + line.text + "\n"
    when "TABLE"
      table = Table.new(item, blockMap)
      @tables.append(table)
      @content.append(table)
    when "KEY_VALUE_SET"
      next unless item[:entity_types].include?('KEY')

      field = Field.new(item, blockMap)
      if field.key
        @form.addField(field)
        @content.append(field)
      end
    end
  end
end
getLinesInBoundingBox(boundingBox)
# File lib/amazon-textract-parser-ruby.rb, line 515
# Returns the lines whose bounding-box origin (left/top) falls inside
# +boundingBox+, edges included.
#
# Only the line's left and top coordinates are tested; the line's own
# right/bottom extent is not considered.
#
# boundingBox - object responding to left, right, top and bottom.
#
# Returns a new Array of the matching entries from @lines, in their
# original order.
def getLinesInBoundingBox(boundingBox)
  @lines.select do |candidate|
    origin = candidate.geometry.boundingBox

    origin.left >= boundingBox.left &&
      origin.left <= boundingBox.right &&
      origin.top >= boundingBox.top &&
      origin.top <= boundingBox.bottom
  end
end
getLinesInReadingOrder()
# File lib/amazon-textract-parser-ruby.rb, line 480
# Groups the page's lines into columns and returns them column by column.
#
# Each line is assigned to the first known column it horizontally overlaps:
# either the line's centre lies strictly inside the column, or the column's
# centre lies strictly inside the line. A line matching no column opens a
# new one spanning the line's own left/right edges.
#
# Returns the result of AmazonTRP::stable_sort_by over hashes of the form
# {:column => Integer, :text => line text}, keyed on :column.
def getLinesInReadingOrder
  columns = []
  tagged = []

  @lines.each do |line|
    box = line.geometry.boundingBox

    # Index of the first column this line overlaps, or nil when none does.
    slot = columns.index do |col|
      box_centre = box.left + box.width/2
      col_centre = col[:left] + ((col[:right] - col[:left]) / 2)

      (box_centre > col[:left] && box_centre < col[:right]) ||
        (col_centre > box.left && col_centre < box.right)
    end

    if slot.nil?
      # No overlap with any known column: this line starts a new one.
      columns << { :left => box.left, :right => box.right }
      slot = columns.size - 1
    end

    tagged << { :column => slot, :text => line.text }
  end

  AmazonTRP.stable_sort_by(tagged) { |entry| entry[:column] }
end
getTextInReadingOrder()
# File lib/amazon-textract-parser-ruby.rb, line 506
# Joins the text of every entry returned by getLinesInReadingOrder into a
# single String, each entry followed by a newline.
#
# Returns "" when there are no lines.
def getTextInReadingOrder
  getLinesInReadingOrder.inject("") do |joined, entry|
    joined + entry[:text] + "\n"
  end
end
to_s()
# File lib/amazon-textract-parser-ruby.rb, line 446
# Renders the page as text: a "Page:" header line followed by the to_s of
# every entry in @content, one per line, in the order they were added.
def to_s
  @content.inject("Page:\n") do |rendered, entry|
    rendered + entry.to_s + "\n"
  end
end