class AIPP::PDF
PDF to text reader with support for pages and fencing
@example
  pdf = AIPP::PDF.new("/path/to/file.pdf")
  pdf.file   # => #<Pathname:/path/to/file.pdf>
  pdf.from(100).to(200).each_line do |line, page, last|
    line     # => line content (e.g. "first line")
    page     # => page number (e.g. 1)
    last     # => last line boolean (true for last line, false otherwise)
  end
Attributes
Public Class Methods
# File lib/aipp/pdf.rb, line 16
#
# Load the text of a PDF file and fence it to the whole text.
#
# @param file [Pathname, String] path of the PDF file
# @param cache [Boolean] +true+ to load the text via +read_cache+,
#   +false+ to load it via +read+
def initialize(file, cache: true)
  # Kernel#Pathname hands back a Pathname argument unchanged, so no
  # explicit type check is needed here.
  @file = Pathname(file)
  @text, @page_ranges = cache ? read_cache : read
  @from = 0
  # Both the upper fence and the last existing position start out on the
  # final character of the text.
  @to = @last = @text.length - 1
end
Public Instance Methods
Executes the block for every line and passes the line content, page number and end of document boolean.
If no block is given, an enumerator is returned instead.
@yieldparam line [String] content of the line
@yieldparam page [Integer] page number the line is found on within the PDF
@yieldparam last [Boolean] true for the last line, false otherwise
@return [Enumerator]
# File lib/aipp/pdf.rb, line 82
#
# Yield every line along with its page number and a last-line flag.
#
# @yieldparam line [String] content of the line
# @yieldparam page [Integer] page number as reported by +page_for+
# @yieldparam last [Boolean] true for the last line, false otherwise
# @return [Enumerator] if no block is given
def each_line
  # Enumerate this very method instead of relying on a separately defined
  # +each+ being available.
  return enum_for(__method__) unless block_given?

  all_lines = lines # split the text once rather than once per use
  offset = @from # position of the current line within the unfenced text
  last_line_index = all_lines.count - 1
  all_lines.each_with_index do |line, line_index|
    yield(line, page_for(index: offset), line_index == last_line_index)
    offset += line.length
  end
end
Fence the PDF beginning with this index
@param index [Integer, Symbol] either an integer position within the
+text+ string or +:begin+ to indicate "first existing position"
@return [self]
# File lib/aipp/pdf.rb, line 33
#
# Set the lower fence.
#
# @param index [Integer, Symbol] position within the text or +:begin+
#   for position 0
# @return [self]
# @raise [ArgumentError] if the index is not within 0..@to
def from(index)
  index = 0 if index == :begin
  unless (0..@to).include?(index)
    raise ArgumentError, "index #{index.inspect} is not within 0..#{@to}"
  end

  @from = index
  self
end
@return [String]
# File lib/aipp/pdf.rb, line 24
#
# @return [String] class name, file and current fencing range
def inspect
  "#<#{self.class} file=#{@file} range=#{range}>"
end
Text split to individual lines
@return [Array] lines
# File lib/aipp/pdf.rb, line 69
#
# Split +text+ into lines, each keeping its trailing newline or form feed.
#
# @return [Array<String>] lines
def lines
  # Zero-width lookbehind: cut right after every "\n" or "\f" without
  # consuming the separator.
  text.split(/(?<=[\n\f])/)
end
Get the current fencing range
@return [Range<Integer>]
# File lib/aipp/pdf.rb, line 55
#
# @return [Range<Integer>] inclusive range from the lower to the upper fence
def range
  @from..@to
end
Fence the PDF ending with this index
@param index [Integer, Symbol] either an integer position within the
+text+ string or +:end+ to indicate "last existing position"
@return [self]
# File lib/aipp/pdf.rb, line 45
#
# Set the upper fence.
#
# @param index [Integer, Symbol] position within the text or +:end+ for
#   the last existing position
# @return [self]
# @raise [ArgumentError] if the index is not within @from..@last
def to(index)
  index = @last if index == :end
  unless (@from..@last).include?(index)
    raise ArgumentError, "index #{index.inspect} is not within #{@from}..#{@last}"
  end

  @to = index
  self
end
Private Instance Methods
# File lib/aipp/pdf.rb, line 119
#
# Find the page a text position belongs to.
#
# @param index [Integer] position within the text
# @return [Integer] page number, counting from 1
# @raise [IndexError] if the position lies beyond the last page boundary
def page_for(index:)
  # @page_ranges is searched as a sorted list: the first boundary at or
  # after the position identifies the page.
  page_index = @page_ranges.bsearch_index { _1 >= index }
  raise IndexError, "index #{index} lies beyond the last page" unless page_index

  page_index + 1
end
# File lib/aipp/pdf.rb, line 111
#
# Compute the closing position of every page within the text obtained by
# joining all page texts with a single form feed.
#
# For every page but the last, the boundary is the position of the form
# feed which terminates the page; for the last page it equals the length of
# the joined text.
#
# @param pages [Array<#text>] pages in document order
# @return [Array<Integer>] one ascending boundary per page
def page_ranges_for(pages)
  # Start one position before the text so that each page contributes its
  # own length plus exactly one separator. Adding the page index instead
  # would count earlier separators again and shift later boundaries.
  boundary = -1
  pages.map { |page| boundary += page.text.length + 1 }
end
# File lib/aipp/pdf.rb, line 94
#
# Extract the text of all pages from the PDF file.
#
# @return [Array] two elements: the page texts joined with form feeds and
#   the page boundaries as computed by +page_ranges_for+
def read
  pages = ::PDF::Reader.new(@file).pages
  text = pages.map(&:text).join("\f")
  [text, page_ranges_for(pages)]
end
# File lib/aipp/pdf.rb, line 99
#
# Load the text from a JSON cache file next to the PDF file, or build that
# cache file from +read+ if it is missing or out of date.
#
# The cache counts as current when its modification time is less than one
# second apart from that of the PDF file.
#
# @return [Array] same structure as returned by +read+
def read_cache
  cache_file = Pathname.new("#{@file}.json")
  if cache_file.exist? && (@file.stat.mtime - cache_file.stat.mtime).abs < 1
    # The cache holds plain data only, so parse it strictly as such.
    JSON.parse(cache_file.read)
  else
    read.tap do |data|
      cache_file.write(data.to_json)
      # Stamp the cache with the mtime of the PDF to mark it as current.
      FileUtils.touch(cache_file, mtime: @file.stat.mtime)
    end
  end
end