class PragmaticSegmenter::AbbreviationReplacer

This class searches the text for known abbreviations and replaces the periods that belong to them with a placeholder character (∯).

Attributes

text[R]

Public Class Methods

new(text:, language: ) click to toggle source
# File lib/pragmatic_segmenter/abbreviation_replacer.rb, line 11
# Builds a replacer for one piece of text.
#
# text     - the text to process; a copy (text.dup) is stored, so the
#            caller's object is not the one this instance modifies.
# language - namespace object whose constants (rules, abbreviation lists)
#            are read by the other methods through @language::...
def initialize(text:, language: )
  @language = language
  @text = text.dup
end

Public Instance Methods

replace() click to toggle source
# File lib/pragmatic_segmenter/abbreviation_replacer.rb, line 16
# Runs the abbreviation pipeline over @text and returns the result of the
# last step. The steps run in a fixed order:
#   1. possessive, Kommanditgesellschaft and single-letter rules
#   2. known abbreviations found in the text
#   3. multi-period abbreviations
#   4. a.m./p.m. rules
#   5. restoring periods before known sentence starters
def replace
  leading_rules = [
    @language::PossessiveAbbreviationRule,
    @language::KommanditgesellschaftRule,
    @language::SingleLetterAbbreviationRules::All
  ]
  # The return value of Rule.apply is not used here, so it is presumably
  # expected to modify @text in place -- confirm against Rule.
  Rule.apply(@text, *leading_rules)

  @text = search_for_abbreviations_in_string(@text)
  @text = replace_multi_period_abbreviations(@text)

  am_pm_rules = @language::AmPmRules::All
  Rule.apply(@text, am_pm_rules)

  replace_abbreviation_as_sentence_boundary(@text)
end

Private Instance Methods

replace_abbreviation_as_sentence_boundary(txt) click to toggle source
# File lib/pragmatic_segmenter/abbreviation_replacer.rb, line 64
# Turns the '∯' placeholder back into a real period after a small set of
# abbreviations (U.S, U.K, E.U, U.S.A, I, i.v / I.V forms) when the next
# word is one of the language's SENTENCE_STARTERS.
#
# txt - String, modified in place.
#
# Returns txt.
def replace_abbreviation_as_sentence_boundary(txt)
  # Design note: the segmenter is conservative. When a boundary after an
  # abbreviation is ambiguous it keeps the text as one sentence rather than
  # splitting. The only places it is willing to split are those where the
  # following word is known never to follow these abbreviations
  # (e.g. "U.S. Government" stays together, a sentence starter does not).
  # SENTENCE_STARTERS can never be exhaustive; the aim is to be pragmatic
  # and cover the words that most often open a sentence.

  # Rubular: http://rubular.com/r/PkBQ3PVBS8
  starters = @language::AbbreviationReplacer::SENTENCE_STARTERS
  starters.each_with_object(txt) do |starter, text|
    # The starter must be surrounded by whitespace on both sides.
    boundary = /(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s#{Regexp.escape(starter)}\s)/
    text.gsub!(boundary, '\1.')
  end
end
replace_multi_period_abbreviations(txt) click to toggle source
# File lib/pragmatic_segmenter/abbreviation_replacer.rb, line 89
# Masks every period inside each multi-period abbreviation found in txt
# by replacing it with '∯'.
#
# txt - String, modified in place.
#
# Returns txt.
def replace_multi_period_abbreviations(txt)
  matches = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
  matches.uniq.each do |abbreviation|
    # Use the non-bang gsub: gsub! returns nil when the match contains no
    # period, and interpolating that nil would replace the match with an
    # empty string, deleting it from the text.
    masked = abbreviation.gsub('.', '∯')
    # A String pattern is matched literally, so no Regexp.escape is needed.
    txt.gsub!(abbreviation, masked)
  end
  txt
end
replace_period_of_abbr(txt, abbr) click to toggle source
# File lib/pragmatic_segmenter/abbreviation_replacer.rb, line 110
# Masks the period that directly follows abbr with '∯' when the period is
# followed by one of:
#   * another period, a colon, a question mark (or a hyphen when the
#     abbreviation is preceded by whitespace),
#   * whitespace plus a lowercase letter, "I ", "I'm", "I'll" or a digit
#     (or an opening parenthesis when preceded by whitespace),
#   * a comma.
# The abbreviation must be preceded by whitespace or start a line.
#
# txt  - String, modified in place.
# abbr - the abbreviation text; surrounding whitespace is stripped.
#
# Returns txt.
def replace_period_of_abbr(txt, abbr)
  # Match the abbreviation literally rather than as regex source, so a
  # period in "a.b" no longer acts as a wildcard that also matches "axb".
  # Each period may also appear as the '∯' placeholder, which the previous
  # wildcard matched as well.
  literal = abbr.strip.split('.', -1).map { |part| Regexp.escape(part) }.join('[.∯]')
  txt.gsub!(/(?<=\s#{literal})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^#{literal})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
  txt.gsub!(/(?<=\s#{literal})\.(?=,)|(?<=^#{literal})\.(?=,)/, '∯')
  txt
end
replace_possessive_abbreviations(txt) click to toggle source
# File lib/pragmatic_segmenter/abbreviation_replacer.rb, line 116
# Replaces every match of the language's POSSESSIVE_ABBREVIATION_REGEX in
# txt with the '∯' placeholder.
#
# txt - String, modified in place.
#
# Returns txt (the same object), whether or not anything matched.
def replace_possessive_abbreviations(txt)
  possessive = @language::POSSESSIVE_ABBREVIATION_REGEX
  txt.tap { |text| text.gsub!(possessive, '∯') }
end
replace_pre_number_abbr(txt, abbr) click to toggle source
# File lib/pragmatic_segmenter/abbreviation_replacer.rb, line 98
# Masks the period that directly follows abbr with '∯' when the period is
# followed by whitespace and a digit, or by whitespace and an opening
# parenthesis. The abbreviation must be preceded by whitespace or start a
# line.
#
# txt  - String, modified in place.
# abbr - the abbreviation text; surrounding whitespace is stripped.
#
# Returns txt.
def replace_pre_number_abbr(txt, abbr)
  # Match the abbreviation literally rather than as regex source, so a
  # period in "a.b" no longer acts as a wildcard that also matches "axb".
  # Each period may also appear as the '∯' placeholder, which the previous
  # wildcard matched as well.
  literal = abbr.strip.split('.', -1).map { |part| Regexp.escape(part) }.join('[.∯]')
  txt.gsub!(/(?<=\s#{literal})\.(?=\s\d)|(?<=^#{literal})\.(?=\s\d)/, '∯')
  txt.gsub!(/(?<=\s#{literal})\.(?=\s+\()|(?<=^#{literal})\.(?=\s+\()/, '∯')
  txt
end
replace_prepositive_abbr(txt, abbr) click to toggle source
# File lib/pragmatic_segmenter/abbreviation_replacer.rb, line 104
# Masks the period that directly follows abbr with '∯' when the period is
# followed by whitespace, or by a colon and digits. The abbreviation must
# be preceded by whitespace or start a line.
#
# txt  - String, modified in place.
# abbr - the abbreviation text; surrounding whitespace is stripped.
#
# Returns txt.
def replace_prepositive_abbr(txt, abbr)
  # Match the abbreviation literally rather than as regex source, so a
  # period in "a.b" no longer acts as a wildcard that also matches "axb".
  # Each period may also appear as the '∯' placeholder, which the previous
  # wildcard matched as well.
  literal = abbr.strip.split('.', -1).map { |part| Regexp.escape(part) }.join('[.∯]')
  txt.gsub!(/(?<=\s#{literal})\.(?=\s)|(?<=^#{literal})\.(?=\s)/, '∯')
  txt.gsub!(/(?<=\s#{literal})\.(?=:\d+)|(?<=^#{literal})\.(?=:\d+)/, '∯')
  txt
end
scan_for_replacements(txt, am, index, character_array) click to toggle source
# File lib/pragmatic_segmenter/abbreviation_replacer.rb, line 47
# Chooses which period-masking helper to apply for one abbreviation match.
#
# txt             - the text being processed.
# am              - the matched abbreviation text; it is stripped and
#                   downcased before being looked up in the language lists.
# index           - position used to read character_array.
# character_array - Array whose element at index is tested for an uppercase
#                   letter (a missing element counts as "not uppercase").
#
# Decision table:
#   * abbreviation is prepositive      -> replace_prepositive_abbr
#   * otherwise, element is uppercase  -> txt returned unchanged
#   * abbreviation is a number abbr    -> replace_pre_number_abbr
#   * anything else                    -> replace_period_of_abbr
#
# Returns whatever the chosen helper returns, or txt.
def scan_for_replacements(txt, am, index, character_array)
  following = character_array[index].to_s
  abbreviations = @language::Abbreviation
  prepositive_list = abbreviations::PREPOSITIVE_ABBREVIATIONS
  number_list = abbreviations::NUMBER_ABBREVIATIONS
  key = Unicode::downcase(am.strip)

  # Prepositive abbreviations are handled regardless of what follows.
  return replace_prepositive_abbr(txt, am) if prepositive_list.include?(key)
  return txt if following =~ /[[:upper:]]/
  return replace_pre_number_abbr(txt, am) if number_list.include?(key)

  replace_period_of_abbr(txt, am)
end
search_for_abbreviations_in_string(txt) click to toggle source
# File lib/pragmatic_segmenter/abbreviation_replacer.rb, line 30
# Walks the language's abbreviation list and, for every abbreviation that
# occurs in the text, hands each occurrence to scan_for_replacements.
#
# txt - the text to process.
#
# Returns the value produced by the last scan_for_replacements call, or
# txt itself when nothing was found.
def search_for_abbreviations_in_string(txt)
  # Occurrences are located in a snapshot taken before any replacement.
  snapshot = txt.dup
  lowered = Unicode::downcase(txt)
  @language::Abbreviation::ABBREVIATIONS.each do |abbreviation|
    needle = abbreviation.strip
    # Cheap substring test before running the regex scan.
    next unless lowered.include?(needle)
    pattern = Regexp.escape(needle)
    occurrences = snapshot.scan(/(?:^|\s|\r|\n)#{pattern}/i)
    next if occurrences.empty?
    # NOTE(review): this scan reads @text rather than the txt argument, is
    # case-sensitive (unlike the scan above), and looks for the abbreviation
    # followed directly by a space. Its results are therefore not guaranteed
    # to line up one-to-one with `occurrences` -- confirm this is intended.
    following_chars = @text.scan(/(?<=#{pattern} ).{1}/)
    occurrences.each_with_index do |occurrence, position|
      txt = scan_for_replacements(txt, occurrence, position, following_chars)
    end
  end
  txt
end