class NumberNormalizer

Constants

ADJ_WORD
MULT_WORD
NUM_WORD
VERSION

Attributes

matches[R]
text[R]

Public Class Methods

new(text) click to toggle source
# File lib/number_normalizer.rb, line 57
# Builds a normalizer for +text+ and immediately scans it for numbers
# written with digits and numbers written with words.
#
# text - the String to scan.
def initialize(text)
  @text = text
  @token = ' '
  @text_array = []

  # Results of the scan, filled in by find_all_digits / find_all_words.
  # Each key is the matched text; each value is a Hash that carries
  # :pos and :type, plus :digit_form once a numeric value is known.
  @matches = {}

  # Regexp source fragments (plain Strings, compiled later).
  # Plain and fractional digit runs: "12", "3.5", ".5".
  @p_digits_only = '\d+'
  @p_floating_digits = '\d+\.\d+'
  @p_floating_digits_2 = '\.\d+'
  # Digit groups of three joined by whitespace or by commas,
  # e.g. "1 000 000" or "1,234.5".
  @p_space_sperated_digits = '\d{,3}(\s\d{3}){1,}(\.\d{1,3}){,1}(\s\d{3})*(\s\d{,3})?'
  @p_comma_sperated_digits = '\d{,3}(,\d{3}){1,}(\.\d{1,3}){,1}(,\d{3})*(,\d{,3})?'
  # Leading anchor is currently empty; the trailing anchor allows one
  # optional non-word character and then requires whitespace or end of line.
  @p_begin = ''
  @p_end = '\W?(\s|$)'

  find_all_digits
  find_all_words
end

Public Instance Methods

digit_numbers() click to toggle source
# File lib/number_normalizer.rb, line 80
# Returns an Array with the distinct numeric values of every recorded
# match, in the order they were first recorded.
#
# Digit-string matches are converted to numbers first; entries that have
# no :digit_form key are skipped.
def digit_numbers
  convert_digit_string_to_numbers

  resolved = @matches.values.select { |entry| entry.key?(:digit_form) }
  # uniq compares with hash/eql?, so 12 and 12.0 stay separate entries.
  resolved.map { |entry| entry[:digit_form] }.uniq
end

Protected Instance Methods

convert_digit_string_to_numbers() click to toggle source
# File lib/number_normalizer.rb, line 213
# Fills in :digit_form for every match whose :type is :digits.
#
# The matched text (the Hash key) has whitespace and commas removed; the
# result becomes a Float when it contains a decimal point and an Integer
# otherwise. Entries of any other type are left untouched.
def convert_digit_string_to_numbers
  @matches.each do |raw, entry|
    next unless entry[:type] == :digits

    compact = raw.gsub(/[\s,]/, '')
    entry[:digit_form] = compact.include?('.') ? compact.to_f : compact.to_i
  end
end
find_all_digits() click to toggle source
# File lib/number_normalizer.rb, line 102
# Scans @text for numbers written with digits and records them in
# @matches.
#
# The matched number text (first capture group, without the trailing
# delimiter) is the key. Each entry stores every start offset of the
# match in :pos and is tagged :type => :digits.
def find_all_digits
  # Order matters: the grouped forms are tried before the plain ones.
  alternatives = [
    @p_space_sperated_digits,
    @p_comma_sperated_digits,
    @p_floating_digits,
    @p_floating_digits_2,
    @p_digits_only
  ].join('|')
  digits_pattern = Regexp.new("(#{alternatives})#{@p_end}")

  @text.scan(digits_pattern) do |captures|
    number_text = captures.first
    # The block form of scan sets the last-match data for each hit.
    offset = Regexp.last_match.begin(0)

    if @matches.key?(number_text)
      @matches[number_text][:pos] << offset
    else
      @matches[number_text] = { :pos => [offset], :type => :digits }
    end
  end
end
find_all_words() click to toggle source
# File lib/number_normalizer.rb, line 124
# Scans @text for numbers written in words and records each one through
# save_num_words(words, value).
#
# Tokens come from preprocess(@text) and are classified by lookup in
# NUM_WORD (plain number words), MULT_WORD (multipliers) and ADJ_WORD
# (connectors; only '-' and 'and' are acted on here).
#
# Working state:
#   num_words   - words collected so far for the number being built
#   hundred_num - value of the group currently being built
#   final_num   - sum of the groups already closed (values >= 1000)
#   temp_num /
#   temp_words  - a "pending" group: number words seen after a large
#                 multiplier (or after "<multiplier> and"). Whether they
#                 belong to the current number or start a new one is only
#                 known when the next multiplier or a non-number token
#                 arrives.
#   temp_mode   - last multiplier seen, plus 1 when it was followed by
#                 'and' (e.g. 100 -> 101); reset to 0 by a plain number
#                 word outside the pending group.
#   flag        - true while a number is being assembled.
#
# Pending words and the pending value are always folded into the number
# they end up belonging to: when a multiplier continues the same number,
# when a non-number token ends it, and when the text ends.
def find_all_words
  text_arr = preprocess(@text)

  num_words = ''
  temp_words = ''
  temp_num = 0
  temp_mode = 0
  hundred_num = 0
  final_num = 0
  flag = false
  is_prev_mult = false

  text_arr.each do |str|
    num = NUM_WORD[str]
    mult = MULT_WORD[str]
    adj = ADJ_WORD[str]

    if !num.nil?
      if temp_mode > 100
        # After a large multiplier: hold the word in the pending group.
        temp_num += num
        temp_words = "#{temp_words} #{str}"
      else
        hundred_num += num
        num_words = "#{num_words} #{str}"
        temp_mode = 0
      end
      flag = true
      is_prev_mult = false
    elsif flag && !adj.nil? && (str == '-' || (is_prev_mult && str == 'and'))
      # Remember that the multiplier was followed by 'and' (100 -> 101).
      temp_mode += 1 if temp_mode >= 100 && temp_mode % 10 == 0 && str == 'and'

      if str == 'and'
        num_words = "#{num_words} #{str}"
      elsif temp_words.empty?
        num_words += str
      else
        # The hyphen follows a pending number word, so keep it with it.
        temp_words += str
      end
      is_prev_mult = false
    elsif flag && !mult.nil?
      if (temp_mode > 1000 && temp_mode % 10 == 1) || (temp_mode > 100 && temp_mode < 1000 && mult < 1000)
        # Two numbers: store the one built so far, then restart from the
        # pending group.
        final_num += hundred_num
        save_num_words(num_words.strip, final_num)
        num_words = temp_words
        hundred_num = temp_num
        temp_words = ''
        temp_num = 0
        final_num = 0
      else
        # One number: the pending group (value and words) belongs to it.
        hundred_num += temp_num
        num_words += temp_words
        temp_num = 0
        temp_words = ''
      end
      hundred_num *= mult
      if hundred_num >= 1000
        final_num += hundred_num
        hundred_num = 0
      end
      num_words = "#{num_words} #{str}"
      is_prev_mult = true
      temp_mode = mult
    elsif flag
      # Any other token ends the number in progress.
      final_num += hundred_num + temp_num
      save_num_words((num_words + temp_words).strip, final_num)
      num_words = ''
      final_num = 0
      hundred_num = 0
      temp_num = 0
      temp_words = ''
      temp_mode = 0
      flag = false
      is_prev_mult = false
    end
  end

  # Text ended in the middle of a number: flush it, pending group included.
  save_num_words((num_words + temp_words).strip, final_num + hundred_num + temp_num) if flag
end
preprocess(str) click to toggle source
# File lib/number_normalizer.rb, line 222
# Splits +str+ into lowercase tokens.
#
# Every character that is not an ASCII letter, a digit or whitespace is
# surrounded with spaces first, so punctuation (including '-') comes out
# as its own token. Returns an Array of Strings.
def preprocess(str)
  lowered = str.downcase
  padded = lowered.gsub(/([^a-zA-Z0-9\s])/, ' \1 ')
  padded.split
end
save_num_words(num_words, number) click to toggle source
# File lib/number_normalizer.rb, line 207
# Records a number that was written in words.
#
# num_words - the matched words, used as the key in @matches.
# number    - the numeric value, stored under :digit_form.
#
# Any existing entry for the same words is replaced. The position is
# always stored as [1].
def save_num_words(num_words, number)
  entry = { :pos => [1], :type => :words, :digit_form => number }
  @matches.store(num_words, entry)
end
tokenlize() click to toggle source
# File lib/number_normalizer.rb, line 98
# Splits @text on the separator held in @token, stores the pieces in
# @text_array and returns them.
def tokenlize
  separator = @token
  @text_array = @text.split(separator)
end