class BankStatementParser::HSBC
Parser for HSBC
bank statements
Constants
- COLUMN_HEADINGS_1ST
N.B. Unicode pound symbol deleted from brackets in balance column heading
- COLUMN_HEADINGS_2ND
- MONTHS
- TYPES
Public Instance Methods
handle_line(line)
click to toggle source
Handle the specified line
Returns true if parsing should continue; false to terminate the parser
# File lib/bank_statement_parser/hsbc.rb, line 31
#
# Process a single line of statement text.
#
# The line is first passed through Utils.ascii_filter. Blank lines are
# ignored, a line containing a TAB raises, and either of the two recognised
# "stop" lines ends parsing. Until they have been found, the line is also
# inspected for the sort code / account number (and account holder name) and
# for the statement date, which additionally fixes the statement format.
# Once sort code, account number and statement date are all known, the line
# is handed to parse_record_line_format together with the column headings
# for the detected format.
#
# Returns true if parsing should continue; false to terminate the parser.
#
# Raises RuntimeError if the line contains TAB characters, or if records
# are reached while the statement format is still FORMAT_UNKNOWN.
def handle_line line
  line = Utils.ascii_filter(line)

  # Whitespace-only lines carry no information
  return true if line =~ /\A\s*\z/

  raise "line contains TAB characters" if line =~ /\t/

  # Two different terminator lines are recognised
  if line =~ /\A\s+AER\s+EAR\s*\z/
    logger.debug { "Found stop line (2nd form)" }
    return false
  end
  if line =~ /\AStatements produced from \d{1,2} (?:#{MONTHS.join('|')}) \d{4} are available in PDF format\.\s*\z/
    logger.debug { "Found stop line (1st form)" }
    return false
  end

  # Account metadata: only searched for while neither value is known
  if sort_code.nil? && account_number.nil?
    account = line.match(/(?:\A[A-Z][\w\s]+|,)\s+(?<sort_code>\d{2}-\d{2}-\d{2})\s+(?<account_number>\d{8})(?:\s*|\s+\d+)\z/)
    if account
      logger.debug { "Found sort code and account number" }
      self.sort_code = account[:sort_code]
      self.account_number = account[:account_number]

      # The same line may also carry the account [holder] name as its
      # first field; the new-style layout is tried before the old-style one
      if (holder = line.match(/^\s*(?<account_name>.+)\s+(\d{2}-\d{2}-\d{2})\s+(\d{8})\s+(\d+)\s*$/))
        self.name = holder[:account_name].strip
        logger.debug { "Found account holder name (2nd form): #{self.name}" }
      elsif (holder = line.match(/^\s*(?<account_name>.+)\s*,\s+(\d{2}-\d{2}-\d{2})\s+(\d{8})\s*$/))
        self.name = holder[:account_name].strip
        logger.debug { "Found account holder name (1st form): #{self.name}" }
      end
    end
  end

  # Statement date: the shape of the date line also identifies the format
  if statement_date.nil?
    if (single_date = line.match(/\A\s*(?<statement_date>\d{2} (?:#{MONTHS.map{|m| m[0,3]}.join('|')}) \d{4})\s*\z/))
      logger.debug { "Found statement date (1st form)" }
      @statement_format = StatementFormat::FORMAT_1ST
      self.statement_date = Date.parse(single_date[:statement_date])
    elsif (date_range = line.match(/\A(?<date_range_start>\d+\s+(?:#{MONTHS.join('|')})(?:\s+\d{4})?)\s+to\s+(?<date_range_end>\d+\s+(?:#{MONTHS.join('|')})\s+\d{4})\b/))
      logger.debug { "Found statement date (2nd form)" }
      @statement_format = StatementFormat::FORMAT_2ND
      date_range_start = date_range[:date_range_start]
      date_range_end = date_range[:date_range_end]
      logger.debug { "Found statement date range #{date_range_start}-#{date_range_end}" }
      # The end of the range is used as the statement date
      self.statement_date = Date.parse(date_range_end)
    end
  end

  # Record lines are only looked for once all the metadata is in hand
  return true if sort_code.nil? || account_number.nil? || statement_date.nil?

  headings =
    case @statement_format
    when StatementFormat::FORMAT_UNKNOWN
      raise "Failed to detect statement format before start of records"
    when StatementFormat::FORMAT_1ST
      COLUMN_HEADINGS_1ST
    when StatementFormat::FORMAT_2ND
      COLUMN_HEADINGS_2ND
    end

  logger.debug { "Parsing potential record line (format #{@statement_format}): #{line}" }
  parse_record_line_format(line, headings)

  return true
end
Private Instance Methods
fix_record_date_year(record_date)
click to toggle source
Fix the year of the specified record date
Returns the record date, with the year fixed
# File lib/bank_statement_parser/hsbc.rb, line 180
#
# Fix the year of the specified record date
#
# A record date whose year differs from the current year is returned
# untouched. Otherwise the year is taken from the statement date; when the
# record's month falls later in the calendar than the statement's month,
# the record is moved back into the previous year, on the assumption that
# records never post-date their statement and the statement therefore
# crosses a year boundary.
#
# record_date - the Date parsed from a record line
#
# Returns the record date, with the year fixed
#
# Raises RuntimeError if there is no statement date, or if the statement is
# dated in January and the record's month is neither January nor December.
def fix_record_date_year record_date
  # Sanity checking: only dates carrying the current year are rewritten
  if Date.today.year != record_date.year
    logger.info { "No need to fix year for statement record date" }
    return record_date
  end

  # The date we have parsed will have the year set to the current year.
  #
  # We need to figure out the correct year, from the statement date.
  raise "No statement date" unless statement_date

  record_date = Date.new(statement_date.year, record_date.month, record_date.day)
  logger.debug { "record date #{record_date}" }

  if statement_date.month != record_date.month
    logger.debug { "record month differs from statement month" }

    # A record month later than the statement month cannot belong to the
    # statement's own year: the statement crosses a year boundary and the
    # record is from the previous year. Previously this was only handled
    # for statements dated in January.
    if record_date.month > statement_date.month
      if 1 == statement_date.month
        # For a January statement the record must be from December
        raise "Expected a record from December" unless 12 == record_date.month
      end
      record_date = record_date.prev_year
    end
  end

  record_date
end
get_column_fragments(line)
click to toggle source
Split the specified line into an array of column fragments
# File lib/bank_statement_parser/hsbc.rb, line 252
#
# Split the specified line into an array of column fragments
#
# The column start offsets cached in @cols are walked from right to left;
# at each one the tail of the (progressively shortened) line is cut off,
# stripped and stored, with whitespace-only fragments stored as nil.
#
# Returns an array with one entry (String or nil) per entry of @cols, in
# left-to-right column order.
#
# Raises RuntimeError when a misaligned boundary could not be adjusted
# (see the note on the check below).
def get_column_fragments line
  boundaries = @cols.reverse
  remaining = line
  fragments = []

  boundaries.each_with_index do |col_start, position|
    boundary = col_start

    # Columns can (and do) fail to line up with the heading alignments.
    # A boundary is accepted as-is when there is whitespace on at least
    # one side of it; otherwise (somewhat arbitrarily, based on cases that
    # have been seen) it is moved left until whitespace is hit.
    if boundary > 0 && boundary < remaining.length
      before = remaining[boundary - 1]
      after = remaining[boundary]

      if before !~ /\A\s\z/ && after !~ /\A\s\z/
        logger.warn { "Column boundary failure: #{before}|#{after}" }

        # Never search further left than the start of the next column over
        limit = (position + 1) < boundaries.size ? boundaries[position + 1] : -1
        logger.debug { "Boundary adjust limit #{limit}" }

        cursor = boundary
        while cursor > limit
          if remaining[cursor] =~ /\A\s\z/
            logger.debug { "Adjusting column boundary from #{boundary} to #{cursor}" }
            boundary = cursor
            break
          end
          cursor -= 1
        end

        # NOTE(review): this only fires when the search ends at offset 0;
        # a search that runs out at a limit other than 0 leaves the
        # boundary unadjusted without raising - confirm that is intended
        raise "Failed to adjust column boundary" if 0 == cursor
      end
    end

    fragment = remaining[boundary...(remaining.length)]
    unless fragment.nil?
      fragment = fragment.strip
      fragment = nil if fragment.empty?
    end

    fragments.unshift(fragment)
    remaining = remaining[0...boundary]
  end

  fragments
end
parse_record_line_format(line, headings)
click to toggle source
Parse the specified line, looking for records
# File lib/bank_statement_parser/hsbc.rb, line 307
#
# Parse the specified line, looking for records
#
# A headings line only updates the column alignments (and un-pauses the
# parser). Any other line is split into five column fragments - date,
# payment type and details, paid out, paid in, balance - and then:
#
# * "BALANCE CARRIED FORWARD" records the closing balance and pauses the
#   parser until the next headings line;
# * "BALANCE BROUGHT FORWARD" records the opening balance, if none has been
#   recorded yet;
# * anything else contributes its details to the record being accumulated,
#   which is emitted through add_record as soon as a line carries a
#   paid-out or paid-in amount.
#
# line     - the statement line to parse
# headings - the column headings for the detected statement format
#
# Raises RuntimeError if the date column cannot be parsed as a date.
def parse_record_line_format line, headings
  if update_columns(line, headings)
    if @parser_paused
      logger.debug { "Resuming parser: set/updated columns" }
      @parser_paused = false
    end
    return
  end

  # Skip known "noise" lines
  return if line =~ /\A\s*A\s*\z/

  return if @cols.empty?

  # While paused, nothing below runs: only a headings line (handled above)
  # resumes the parser
  return if @parser_paused

  col_fragments = get_column_fragments(line)

  # N.B. Detect and fix up failed column splitting: upper-case text at the
  # end of the date fragment really belongs to the start of the details
  date_string = col_fragments[0]
  if !date_string.nil? && date_string =~ /(?<date_proper>.+)\s+(?<spurious_tail>[A-Z]+)\z/
    date_proper = Regexp.last_match(:date_proper)
    spurious_tail = Regexp.last_match(:spurious_tail)
    logger.warn { "Must fix date string #{date_string}|#{date_proper}|#{spurious_tail}" }
    col_fragments[0] = date_proper
    # The details fragment is nil when that column was otherwise empty, so
    # it cannot simply be concatenated onto the tail
    col_fragments[1] = [spurious_tail, col_fragments[1]].compact.join(" ")
  end

  # A line without a date fragment keeps the most recently seen record date
  date_string = col_fragments[0]
  unless date_string.nil?
    begin
      @cached_statement_date = Date.parse(date_string)
      @cached_statement_date = fix_record_date_year(@cached_statement_date)
    rescue ArgumentError => e
      raise "Failed to parse date/time '#{date_string}': #{e}"
    end
  end

  payment_type_and_details = col_fragments[1]
  if payment_type_and_details =~ /\ABALANCE CARRIED FORWARD\z/i
    cb = parse_balance_fragment(col_fragments[4])
    unless cb.nil?
      self.closing_balance = cb
      logger.debug { "Found potential closing balance: #{cb}" }
    end

    logger.debug { "Pausing parser" }
    @parser_paused = true
    return
  elsif payment_type_and_details =~ /\ABALANCE BROUGHT FORWARD(\s+\.)?\z/i
    if opening_balance.nil?
      ob = parse_balance_fragment(col_fragments[4])
      unless ob.nil?
        self.opening_balance = ob
        logger.debug { "Found probable opening balance: #{ob}" }
      end
    end

    # The parser cannot be paused at this point (see the early return
    # above), so there is never anything to resume here
    logger.debug { "Skipping parser resume line" }
    return
  end

  payment_details = nil
  if payment_type_and_details =~ /\A(?<payment_type>#{TYPES.keys.map{ |t| Regexp.quote(t) }.join('|')})\s+(?<payment_details>.*)\z/
    logger.debug { "Found the start of a record (group)" }
    @cached_payment_type = Regexp.last_match(:payment_type)
    payment_details = Regexp.last_match(:payment_details)
  else
    payment_details = payment_type_and_details
  end
  @cached_details << payment_details

  paid_out = col_fragments[2]
  paid_in = col_fragments[3]
  paid_out = paid_out.delete(",") unless paid_out.nil?
  paid_in = paid_in.delete(",") unless paid_in.nil?

  # Same conversion as for the opening/closing balances, so an overdrawn
  # balance (trailing " D") comes out negative here as well
  balance = parse_balance_fragment(col_fragments[4])

  if !paid_out.nil? || !paid_in.nil?
    logger.debug { "Found the end of a record (group)" }

    full_details = @cached_details.join("\n")

    record_credit = !paid_in.nil?
    record_amount = record_credit ? paid_in.to_f : paid_out.to_f

    # Create statement record
    record = StatementRecord.new(date: @cached_statement_date,
                                 type: @cached_payment_type,
                                 record_type: TYPES[@cached_payment_type],
                                 credit: record_credit,
                                 amount: record_amount,
                                 detail: full_details,
                                 balance: balance)
    logger.debug { "Created statement record: #{record}" }
    add_record record

    @cached_payment_type = nil
    @cached_details = []
  end
end

# Convert a balance column fragment to a Float
#
# Thousands separators are removed, and a trailing " D" (overdrawn) marker
# is removed and turns the value negative.
#
# Returns the balance as a Float, or nil when the fragment is nil
def parse_balance_fragment fragment
  return nil if fragment.nil?

  amount = fragment
  if amount =~ /\s+D\z/
    # Overdrawn; remove suffix and make negative
    amount = '-' + amount.sub(/\s+D\z/, '')
  end
  amount.delete(",").to_f
end
reset()
click to toggle source
Reset the parser
Calls superclass method
BankStatementParser::Base#reset
# File lib/bank_statement_parser/hsbc.rb, line 156
#
# Reset the parser
#
# Calls the superclass implementation, then returns all of this parser's
# own state to its initial values.
def reset
  super

  # No statement layout has been detected yet
  @statement_format = StatementFormat::FORMAT_UNKNOWN

  # Parsing starts un-paused, with no column alignments known
  @parser_paused = false
  @cols = []

  # Per-record caches: the most recent record date and payment type, and
  # the detail lines collected for the record currently being assembled
  @cached_statement_date = nil
  @cached_payment_type = nil
  @cached_details = []
end
update_columns(line, headings)
click to toggle source
If the specified line is a headings line, use it to update our column alignments
Returns true if column alignments were updated; false otherwise
# File lib/bank_statement_parser/hsbc.rb, line 213
#
# If the specified line is a headings line, use it to update our column
# alignments
#
# The headings are assembled into one anchored regular expression with a
# named group per column. The first heading may be preceded by whitespace,
# every heading but the last must be followed by at least two whitespace
# characters, and the last may be followed by any amount of whitespace.
# Heading text is inserted into the pattern as-is. On a match, the start
# offset of each group is stored in @cols.
#
# line     - the statement line to test
# headings - the five column headings to look for
#
# Returns true if column alignments were updated; false otherwise
#
# Raises RuntimeError unless exactly five headings are supplied.
def update_columns line, headings
  raise "Expected a five-column layout" unless 5 == headings.size

  last_index = headings.length - 1
  groups = headings.each_with_index.map do |heading, idx|
    leading = idx.zero? ? '\s*' : ''
    quantifier = (!idx.zero? && idx == last_index) ? '*' : '{2,}'
    "(?<col#{idx}>#{leading}#{heading}\\s#{quantifier})"
  end
  heading_pattern = Regexp.new('\A' + groups.join + '\z')

  heading_match = heading_pattern.match(line)
  return false unless heading_match

  if @cols.empty?
    logger.debug { "Setting column alignments from line #{line}" }
  else
    logger.debug { "Updating column alignments from line #{line}" }
  end

  # Each column starts where its named group starts
  headings.each_index do |idx|
    @cols[idx] = heading_match.begin("col#{idx}".to_sym)
  end

  return true
end