class PragmaticSegmenter::Languages::Danish::AbbreviationReplacer

Constants

SENTENCE_STARTERS

Public Instance Methods

replace_abbreviation_as_sentence_boundary(txt)
# File lib/pragmatic_segmenter/languages/danish.rb, line 54
# Restores a real sentence-ending period after selected abbreviations when
# they are directly followed by a known sentence starter.
#
# The `∯` character is the placeholder this method looks for in place of an
# abbreviation's period (presumably inserted by an earlier abbreviation pass
# -- confirm against the caller). When an abbreviation such as `U∯S∯` is
# followed by whitespace, one of SENTENCE_STARTERS, and whitespace again, its
# final `∯` is turned back into `.`.
#
# Rationale: the segmenter is conservative and keeps ambiguous boundaries as
# one sentence. Splitting is only done before words that are known to start a
# sentence and never to follow these abbreviations. SENTENCE_STARTERS cannot
# cover every case; it is a pragmatic list of the most common ones.
#
# @param txt [String] text to process; it is modified in place
# @return [String] the same +txt+ object
def replace_abbreviation_as_sentence_boundary(txt)
  # [regexp source for the abbreviation, text that replaces it]
  # Only the trailing placeholder becomes a period, except for s.u./S.U.
  # where the inner separator is normalised to a period as well.
  rules = [
    ['U∯S∯',     'U∯S.'],
    ['U\.S∯',    'U.S.'],
    ['U∯K∯',     'U∯K.'],
    ['U\.K∯',    'U.K.'],
    ['E∯U∯',     'E∯U.'],
    ['E\.U∯',    'E.U.'],
    ['U∯S∯A∯',   'U∯S∯A.'],
    ['U\.S\.A∯', 'U.S.A.'],
    ['I∯',       'I.'],
    # The separator is restricted to a period or the placeholder; a bare `.`
    # here would match any character and rewrite unrelated text.
    ['s[.∯]u∯',  's.u.'],
    ['S[.∯]U∯',  'S.U.']
  ].freeze

  @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
    # Zero-width lookahead: the starter and the whitespace around it are
    # only tested, never rewritten. This keeps newlines/tabs intact and
    # keeps the regexp-escaped word out of the replacement string.
    followed_by_starter = "(?=\\s#{Regexp.escape(word)}\\s)"

    rules.each do |abbreviation, restored|
      txt.gsub!(/#{abbreviation}#{followed_by_starter}/, restored)
    end
  end
  txt
end