class PragmaticSegmenter::Languages::Danish::AbbreviationReplacer
Constants
- SENTENCE_STARTERS
Public Instance Methods
replace_abbreviation_as_sentence_boundary(txt)
click to toggle source
# File lib/pragmatic_segmenter/languages/danish.rb, line 54
#
# Restores the final period of selected abbreviations when they are
# directly followed by a word from SENTENCE_STARTERS.
#
# In the incoming text the character "∯" stands in for an abbreviation
# period. For every sentence starter, an occurrence of
# "<abbreviation>∯<whitespace><starter><whitespace>" has its trailing "∯"
# turned back into a real ".".
#
# Rationale: we are conservative and keep ambiguous sentence boundaries as
# one sentence instead of splitting into two, so we only split at words
# that we know for certain never follow these abbreviations. Some might say
# that the set of words that follow an abbreviation such as U.S. (i.e.
# U.S. Government) is smaller than the set of words that could start a
# sentence and never follow U.S. However, since we do not split by default,
# we need to look for places where we definitely can split. Obviously
# SENTENCE_STARTERS will never cover all cases, but as the gem is named
# 'Pragmatic Segmenter' we need to be pragmatic and try to cover the words
# that most often start a sentence but could never follow one of the
# abbreviations below.
#
# @param txt [String] text to process; it is modified in place
# @return [String] the same +txt+ object after the substitutions
def replace_abbreviation_as_sentence_boundary(txt)
  # Ordered [regex source, replacement] pairs; the order is significant and
  # is applied as listed for each sentence starter.
  rules = [
    ['U∯S∯',      'U∯S.'],
    ['U\.S∯',     'U.S.'],
    ['U∯K∯',      'U∯K.'],
    ['U\.K∯',     'U.K.'],
    ['E∯U∯',      'E∯U.'],
    ['E\.U∯',     'E.U.'],
    ['U∯S∯A∯',    'U∯S∯A.'],
    ['U\.S\.A∯',  'U.S.A.'],
    ['I∯',        'I.'],
    # The inner separator may be either a real period or the "∯" placeholder.
    # It must not be a bare `.` wildcard, otherwise unrelated tokens such as
    # "sku∯" would be rewritten to "s.u.".
    ['s[.∯]u∯',   's.u.'],
    ['S[.∯]U∯',   'S.U.']
  ]

  @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
    escaped = Regexp.escape(word)
    rules.each do |abbreviation, restored|
      # The whitespace on both sides of the starter is matched with \s and
      # written back as a single space.
      txt.gsub!(/#{abbreviation}\s#{escaped}\s/, "#{restored} #{escaped} ")
    end
  end

  txt
end