module RubyMarkovify::Splitters

Constants

ABBR_CAPPED
ABBR_LOWERCASE
ASCII_LOWERCASE
ASCII_UPPERCASE
END_PATTERN

A word that ends with punctuation, followed by optional quotes/parentheses/etc., followed by whitespace and then a character that is not a lowercase letter or a dash.

EXCEPTIONS
INITIALS
MONTHS
PUNCTUATION
STATES

States with thanks to github.com/unitedstates/python-us. Titles with thanks to github.com/nytimes/emphasis and @donohoe.

STREETS
TITLES
UNITED_STATES

Public Instance Methods

is_abbreviation(dotted_word) click to toggle source
# File lib/ruby_markovify/splitters.rb, line 24
# Reports whether +dotted_word+ is a known abbreviation.
#
# The final character of +dotted_word+ (expected to be the trailing dot) is
# dropped before the lookup. When the remaining stem starts with a character
# found in ASCII_UPPERCASE, its lower-cased form is looked up in ABBR_CAPPED;
# otherwise the stem is looked up unchanged in ABBR_LOWERCASE.
#
# @param dotted_word [String] a word including its trailing character
# @return [Boolean] result of the corresponding +include?+ lookup
def is_abbreviation(dotted_word)
  stem = dotted_word[0...-1]
  return ABBR_LOWERCASE.include?(stem) unless ASCII_UPPERCASE.include?(stem[0])

  ABBR_CAPPED.include?(stem.downcase)
end
is_sentence_ender(word) click to toggle source
# File lib/ruby_markovify/splitters.rb, line 33
# Decides whether +word+ terminates a sentence.
#
# Rules are applied in order:
# 1. a word listed in EXCEPTIONS never ends a sentence;
# 2. a word whose last character is in PUNCTUATION always does;
# 3. a word containing more than one ASCII uppercase letter does;
# 4. a word ending in "." does, unless is_abbreviation recognises it.
#
# @param word [String] candidate token, including its trailing punctuation
# @return [Boolean] true when the word should be treated as a sentence end
def is_sentence_ender(word)
  return false if EXCEPTIONS.include? word
  return true if PUNCTUATION.include? word[-1]
  # Count every uppercase letter. Stripping with `sub` removed only the first
  # non-uppercase character, so nearly any word of three or more characters
  # passed this test and the abbreviation check below was never reached.
  return true if word.count('A-Z') > 1
  return true if word[-1] == '.' && !is_abbreviation(word)
  false
end
split_into_sentences(text) click to toggle source
# File lib/ruby_markovify/splitters.rb, line 46
# Splits +text+ into an array of stripped sentence strings.
#
# Every END_PATTERN match is a candidate boundary. A candidate is kept when
# is_sentence_ender accepts its first capture group (the punctuated word).
# The boundary sits just after the first two capture groups, i.e. after the
# word and any closing characters captured with it.
#
# Text following the last boundary is returned as a final sentence when it is
# not blank, so input without any detected boundary yields a single sentence
# instead of an empty array.
#
# @param text [String] the text to split
# @return [Array<String>] sentences in order of appearance, whitespace-stripped
def split_into_sentences(text)
  end_indices = []
  text.scan(END_PATTERN) do |groups|
    # Capture the offset first: it belongs to the match that scan just yielded.
    match_start = Regexp.last_match.begin(0)
    word, closers = groups
    next unless is_sentence_ender(word)

    end_indices << match_start + word.length + closers.length
  end

  sentences = []
  start_idx = 0
  end_indices.each do |end_idx|
    # Exclusive range: the character at end_idx belongs to the next sentence.
    sentences << text[start_idx...end_idx].strip
    start_idx = end_idx
  end

  # Keep whatever follows the final boundary (previously discarded).
  tail = text[start_idx..-1].to_s.strip
  sentences << tail unless tail.empty?
  sentences
end