class SplitSentence
A class that takes a corpus and breaks it down into an array of sentences; each element of the array is one sentence string.
Constants
- ABBREVIATIONS
- ENDERS
Attributes
corpus[R]
Public Class Methods
new(corpus = nil)
click to toggle source
# File lib/markovite/splitter.rb, line 22
# Creates a splitter for +corpus+. When +corpus+ is nil (or false) an empty
# string is used instead. The value is stored through the corpus= writer.
def initialize(corpus = nil)
  initial_text = corpus
  initial_text = "" unless initial_text
  self.corpus = initial_text
end
Public Instance Methods
corpus=(text)
click to toggle source
# File lib/markovite/splitter.rb, line 26
# Replaces the corpus with a private copy of +text+.
#
# The copy means later changes to the caller's string do not alter the
# stored corpus. +text+ is converted with +to_s+ first, so assigning nil
# stores an empty string (matching what the constructor does for a nil
# corpus) instead of leaving the corpus as nil.
#
# text:: the new corpus; anything responding to +to_s+
def corpus=(text)
  @corpus = text.to_s.dup
end
expand_corpus(text)
click to toggle source
# File lib/markovite/splitter.rb, line 61
# Appends +text+ to the current corpus, separated by a single space.
#
# When the corpus is still empty no separator is inserted, so the corpus
# does not start with a stray space (which would otherwise become an empty
# word once the corpus is split on spaces).
#
# text:: the text to append
#
# Returns the new corpus string.
def expand_corpus(text)
  existing = corpus.to_s
  self.corpus = existing.empty? ? text.to_s : "#{existing} #{text}"
end
split_text(new_text = nil)
click to toggle source
It might be cool to count punctuation separately: we could point to the punctuation as a way to indicate the end of a sentence. If the sentences are delimited by \n, we could have nil be the value it points to instead. This way, we can impose grammatical rules by making the first word of the sentence capitalized, and the end of the sentence will end with some sort of punctuation.
# File lib/markovite/splitter.rb, line 39
# Splits text into an array of sentence strings.
#
# The text is tokenised with split_words. A sentence ends either at a word
# that is_end_of_sentence? accepts, or at a newline inside a word. Every
# newline closes the sentence in progress, however many newlines a single
# token contains, and each newline-separated segment is checked for
# sentence-ending punctuation on its own. Sentences that would be empty
# (for example from a leading or trailing newline) are not emitted.
#
# new_text:: the text to split; the stored corpus is used when nil
#
# Returns an Array of Strings, one per sentence.
def split_text(new_text = nil)
  sentences = []
  current_sentence = []

  # Emits the sentence in progress, if it has any text, and starts a new one.
  close_sentence = lambda do
    finished = add_sentence(current_sentence, nil)
    sentences << finished unless finished.empty?
    current_sentence.clear
  end

  split_words(new_text || corpus).each do |word|
    # A limit of -1 keeps trailing empty segments, so a trailing newline
    # still closes the sentence.
    segments = has_newline?(word) ? word.split("\n", -1) : [word]

    segments.each_with_index do |segment, index|
      # Every segment after the first is preceded by a newline.
      close_sentence.call if index > 0
      # Empty segments only mark where a newline was; they carry no word.
      next if segment.empty? && segments.size > 1

      current_sentence << segment
      close_sentence.call if is_end_of_sentence?(segment)
    end
  end

  close_sentence.call
  sentences
end
Private Instance Methods
add_sentence(sentence, word)
click to toggle source
# File lib/markovite/splitter.rb, line 67
# Appends +word+ to the +sentence+ array (when +word+ is truthy) and returns
# the sentence as one string: nil entries dropped, the rest joined by spaces.
# Note that +sentence+ itself is modified by the append.
def add_sentence(sentence, word)
  sentence.push(word) if word
  present_words = sentence.reject(&:nil?)
  present_words.join(" ")
end
has_newline?(word)
click to toggle source
# File lib/markovite/splitter.rb, line 80
# True when +word+ contains at least one newline character.
def has_newline?(word)
  !word.index("\n").nil?
end
is_abbreviation(word)
click to toggle source
# File lib/markovite/splitter.rb, line 76
# Checks whether +word+, lower-cased, is listed in ABBREVIATIONS.
def is_abbreviation(word)
  normalized = word.downcase
  ABBREVIATIONS.include?(normalized)
end
is_end_of_sentence?(word)
click to toggle source
# File lib/markovite/splitter.rb, line 88
# True when +word+ finishes a sentence: its last character is one of ENDERS
# and the word is not an abbreviation.
def is_end_of_sentence?(word)
  # Look at the punctuation first so the abbreviation lookup is skipped
  # for the many words that do not end in an ENDERS character.
  last_char = word[-1]
  return false unless ENDERS.include?(last_char)

  !is_abbreviation(word)
end
split_newline(word)
click to toggle source
# File lib/markovite/splitter.rb, line 84
# Splits +word+ on newlines and returns the pieces, with every empty piece
# replaced by nil.
def split_newline(word)
  word.split("\n").map { |piece| piece unless piece.empty? }
end
split_words(text)
click to toggle source
# File lib/markovite/splitter.rb, line 72
# Breaks +text+ into words separated by spaces.
#
# Only the space character separates words; newlines and tabs stay inside
# the returned words. Leading, trailing and repeated spaces never produce
# empty-string words.
#
# text:: the String to tokenise
#
# Returns an Array of non-empty Strings.
def split_words(text)
  text.scan(/[^ ]+/)
end