class AmazonTRP::Page
Attributes
blocks[R]
content[R]
form[R]
geometry[R]
id[R]
lines[R]
tables[R]
text[R]
Public Class Methods
new(blocks, blockMap)
click to toggle source
# File lib/amazon-textract-parser-ruby.rb, line 435
# Creates a page from its blocks: stores the blocks, starts with empty text,
# lines, tables and content plus a fresh Form, then runs _parse to fill them.
#
# @param blocks   stored as-is in @blocks and iterated by _parse
# @param blockMap handed straight to _parse
def initialize(blocks, blockMap)
  @blocks = blocks
  @lines, @tables, @content = [], [], []
  @form = Form.new
  @text = ""
  _parse(blockMap)
end
Public Instance Methods
_parse(blockMap)
click to toggle source
# File lib/amazon-textract-parser-ruby.rb, line 454
# Walks @blocks and populates the page according to each block's :block_type.
#
# - "PAGE" sets @geometry (from the block's :geometry) and @id.
# - "LINE" builds a Line, records it in @lines and @content, and appends its
#   text followed by a newline to @text.
# - "TABLE" builds a Table and records it in @tables and @content.
# - "KEY_VALUE_SET" whose :entity_types include 'KEY' builds a Field; the
#   field is added to @form and @content only when it has a key.
#
# Blocks of any other type are skipped.
#
# @param blockMap passed unchanged to the Line, Table and Field constructors
def _parse(blockMap)
  @blocks.each do |item|
    case item[:block_type]
    when "PAGE"
      @geometry = Geometry.new(item[:geometry])
      @id = item[:id]
    when "LINE"
      line = Line.new(item, blockMap)
      @lines.append(line)
      @content.append(line)
      # Must be double-quoted: '\n' in single quotes is a literal backslash
      # followed by "n", which would leave @text without real line breaks.
      @text = @text + line.text + "\n"
    when "TABLE"
      table = Table.new(item, blockMap)
      @tables.append(table)
      @content.append(table)
    when "KEY_VALUE_SET"
      # Only the KEY half of a key/value pair yields a Field.
      next unless item[:entity_types].include?('KEY')

      field = Field.new(item, blockMap)
      next unless field.key

      @form.addField(field)
      @content.append(field)
    end
  end
end
getLinesInBoundingBox(boundingBox)
click to toggle source
# File lib/amazon-textract-parser-ruby.rb, line 515
# Returns the lines whose bounding-box top-left corner (left, top) falls
# inside the given box, edges included. Only the corner is tested; a line's
# width and height are not taken into account.
#
# @param boundingBox object responding to left, right, top and bottom
# @return [Array] matching entries of @lines, in their original order
def getLinesInBoundingBox(boundingBox)
  @lines.select do |candidate|
    corner = candidate.geometry.boundingBox
    next false unless corner.left >= boundingBox.left && corner.left <= boundingBox.right

    corner.top >= boundingBox.top && corner.top <= boundingBox.bottom
  end
end
getLinesInReadingOrder()
click to toggle source
# File lib/amazon-textract-parser-ruby.rb, line 480
# Groups the page's lines into columns and returns them ordered by column.
#
# Each line is matched against the columns discovered so far, in discovery
# order. A line belongs to the first column where either the line's
# horizontal centre lies strictly inside the column, or the column's centre
# lies strictly inside the line. A line that matches no column opens a new
# column spanning the line's own left/right edges.
#
# @return the list of {:column => Integer, :text => line text} hashes after
#   ordering by :column through AmazonTRP::stable_sort_by (defined elsewhere;
#   presumably it keeps encounter order within a column — confirm there)
def getLinesInReadingOrder
  columns = []
  placed = []

  @lines.each do |item|
    # These depend only on the line, so compute them once per line rather
    # than once per candidate column.
    bbox = item.geometry.boundingBox
    bbox_left = bbox.left
    bbox_right = bbox.right
    bbox_centre = bbox_left + bbox.width / 2

    column_index = columns.find_index do |column|
      column_centre = column[:left] + ((column[:right] - column[:left]) / 2)
      (bbox_centre > column[:left] && bbox_centre < column[:right]) ||
        (column_centre > bbox_left && column_centre < bbox_right)
    end

    if column_index.nil?
      # No existing column fits: this line starts a new one.
      columns.append({:left => bbox_left, :right => bbox_right})
      column_index = columns.count - 1
    end

    placed.append({:column => column_index, :text => item.text})
  end

  AmazonTRP::stable_sort_by(placed) { |entry| entry[:column] }
end
getTextInReadingOrder()
click to toggle source
# File lib/amazon-textract-parser-ruby.rb, line 506
# Joins the :text of every entry returned by getLinesInReadingOrder into one
# string, with a newline after each entry (including the last).
#
# @return [String] empty when there are no lines
def getTextInReadingOrder
  getLinesInReadingOrder.inject("") do |joined, entry|
    joined + entry[:text] + "\n"
  end
end
to_s()
click to toggle source
# File lib/amazon-textract-parser-ruby.rb, line 446
# Renders the page as text: a "Page:" header line followed by the to_s of
# every entry in @content, each on its own line.
#
# @return [String]
def to_s
  @content.inject("Page:\n") do |rendered, entry|
    rendered + entry.to_s + "\n"
  end
end