module RubyMarkovify::Splitters
Constants
- ABBR_CAPPED
- ABBR_LOWERCASE
- ASCII_LOWERCASE
- ASCII_UPPERCASE
- END_PATTERN
A word that ends with punctuation, followed by optional quotes/parens/etc., followed by whitespace and then a character that is not a lowercase letter or a dash.
- EXCEPTIONS
- INITIALS
- MONTHS
- PUNCTUATION
- STATES
States w/ thanks to github.com/unitedstates/python-us. Titles w/ thanks to github.com/nytimes/emphasis and @donohoe.
- STREETS
- TITLES
- UNITED_STATES
Public Instance Methods
is_abbreviation(dotted_word)
click to toggle source
# File lib/ruby_markovify/splitters.rb, line 24
# Reports whether +dotted_word+ is a known abbreviation.
#
# The final character of +dotted_word+ (expected to be the trailing dot) is
# removed and the remaining stem is looked up:
# * if the stem's first character is in ASCII_UPPERCASE, the downcased stem
#   is checked against ABBR_CAPPED;
# * otherwise the stem is checked, as written, against ABBR_LOWERCASE.
#
# @param dotted_word [String] a token whose last character is dropped
#   before the lookup, e.g. "Mr." or "etc."
# @return [Boolean] true when the stem is a listed abbreviation
def is_abbreviation(dotted_word)
  clipped = dotted_word[0..-2]

  # A bare "." (or an empty string) leaves nothing to look up. Without this
  # guard clipped[0] is nil and is handed straight to ASCII_UPPERCASE.include?,
  # which raises TypeError when that constant is a String.
  return false if clipped.empty?

  if ASCII_UPPERCASE.include? clipped[0]
    ABBR_CAPPED.include? clipped.downcase
  else
    ABBR_LOWERCASE.include? clipped
  end
end
is_sentence_ender(word)
click to toggle source
# File lib/ruby_markovify/splitters.rb, line 33
# Decides whether +word+ closes a sentence.
#
# Checks are applied in order; the first one that matches wins:
# 1. a word listed in EXCEPTIONS never ends a sentence;
# 2. a word whose last character is in PUNCTUATION always does;
# 3. a word containing more than one capital letter (A-Z) does;
# 4. a word ending in "." does, unless is_abbreviation recognises it.
#
# @param word [String] a single token including its trailing punctuation
# @return [Boolean] true when the token should be treated as a sentence end
def is_sentence_ender(word)
  return false if EXCEPTIONS.include? word
  return true if PUNCTUATION.include? word[-1]

  # gsub, not sub: every non-capital must be stripped so that what remains is
  # the number of capitals. With sub only the first such character was
  # removed, so nearly any word of three or more characters passed this test
  # and the abbreviation check below was never reached.
  return true if word.gsub(/[^A-Z]/, '').length > 1

  return true if word[-1] == '.' && !is_abbreviation(word)

  false
end
split_into_sentences(text)
click to toggle source
# File lib/ruby_markovify/splitters.rb, line 46
# Splits +text+ into an array of whitespace-stripped sentences.
#
# Every END_PATTERN match is a candidate sentence end; it is kept only when
# is_sentence_ender accepts the match's first capture group. The cut position
# is the match start plus the lengths of the first two capture groups.
#
# Whatever follows the last accepted cut is returned as a final sentence when
# it is non-blank. Previously that tail was thrown away, so the closing
# sentence of a text, or the whole text when no cut was found, was lost.
#
# @param text [String] the text to split
# @return [Array<String>] the sentences in order of appearance; empty when
#   +text+ is empty or blank
def split_into_sentences(text)
  end_indices = []
  text.scan(END_PATTERN) do |groups|
    # Read the match position before calling anything else that may run a regexp.
    match_start = Regexp.last_match.begin(0)
    next unless is_sentence_ender(groups[0])

    end_indices << match_start + groups[0].length + groups[1].length
  end

  # Consecutive cut positions delimit the sentences. The range is inclusive,
  # as before, so each slice also takes the character at the cut position;
  # strip removes it when it is whitespace.
  sentences = ([0] + end_indices).each_cons(2).map do |start_idx, end_idx|
    text[start_idx..end_idx].strip
  end

  # Text after the last cut: the final sentence has no following whitespace
  # for END_PATTERN to match, so it never produces a cut of its own.
  remainder = text[(end_indices.last || 0)..-1].strip
  sentences << remainder unless remainder.empty?

  sentences
end