class Rudge
Constants
- EOSAW
- EOSBW
End-of-sentence markers: EOSBW is inserted before the whitespace that follows a sentence, and EOSAW after that whitespace.
Public Class Methods
sentences(text, options={})
click to toggle source
# File lib/rudge.rb, line 8
#
# Splits +text+ into an array of sentence strings.
#
# The input is never modified; all work happens on a copy. Sentence
# boundaries are first marked wherever one of ".", "?" or "!" (optionally
# followed by a closing quote or bracket) is followed by whitespace. Markers
# are then removed again after runs of two or more dots and after the
# abbreviations returned by Rudge::Abbreviations.list.
#
# text    - String to split.
# options - Hash of options (default: {}):
#           :keep_trailing_whitespace - when truthy, each sentence keeps the
#             whitespace that followed it and leading whitespace of the text
#             is preserved. The historical misspelling
#             :keep_trailling_whitespace is still accepted.
#
# Returns an Array of Strings.
def self.sentences(text, options = {})
  # Honour both spellings so existing callers using the misspelled key keep
  # working.
  keep_whitespace = options[:keep_trailling_whitespace] ||
                    options[:keep_trailing_whitespace]

  marked = text.dup

  # Initial sentence markers, before and after the whitespace run.
  marked.gsub!(/([.?!]["')\]}]?)(\s+)/) do
    "#{Regexp.last_match(1)}#{EOSBW}#{Regexp.last_match(2)}#{EOSAW}"
  end

  # Remove sentence markers after an ellipsis (two or more dots).
  marked.gsub!(/(\.\.\.*)#{EOSBW}(\s+)#{EOSAW}/) do
    "#{Regexp.last_match(1)}#{Regexp.last_match(2)}"
  end

  # Remove sentence markers after abbreviations.
  # NOTE(review): entries are interpolated into the pattern unescaped, as
  # before; confirm whether Rudge::Abbreviations.list holds literal strings.
  abbreviations = Rudge::Abbreviations.list.join("|")
  unless abbreviations.empty?
    # An abbreviation may start the text, follow whitespace, or directly
    # follow the EOSAW marker of the previous sentence. Look-behinds are used
    # so that context is not consumed and consecutive abbreviations match.
    pattern = /(?:(?<!\S)|(?<=#{EOSAW}))(#{abbreviations})\.#{EOSBW}(\s+)#{EOSAW}/i
    marked.gsub!(pattern) do
      "#{Regexp.last_match(1)}.#{Regexp.last_match(2)}"
    end
  end

  if keep_whitespace
    # Split after the whitespace, then drop the remaining EOSBW markers.
    marked.split(EOSAW).map { |sentence| sentence.gsub(EOSBW, "") }
  else
    # Drop leading whitespace, then split on marker-whitespace-marker.
    marked.gsub(/\A\s+/, "").split(/#{EOSBW}\s+#{EOSAW}/)
  end
end