class TeRex::Corpus::Body
Attributes
category_klass[R]
format_klass[R]
sample_size[R]
set[R]
testing[R]
total_sentences[R]
training[R]
Public Class Methods
new(glob: "", partition: :file, format_klass: NilClass, category_klass: NilClass)
click to toggle source
# File lib/te_rex/corpus.rb, line 7
# Records the corpus configuration. Nothing is read from disk here.
#
# glob           - pattern stored in @glob (default "").
# partition      - partition mode stored in @partition (default :file).
# format_klass   - class stored in @format_klass (default NilClass).
# category_klass - class stored in @category_klass (default NilClass).
def initialize(glob: "", partition: :file, format_klass: NilClass, category_klass: NilClass)
  @glob, @partition = glob, partition
  @format_klass, @category_klass = format_klass, category_klass
end
Public Instance Methods
build()
click to toggle source
@sample_size = (@set.count * 0.75).round
# File lib/te_rex/corpus.rb, line 15
# Loads the file set with define_set, then partitions it.
#
# When @partition matches /file/ the split is done by file_partition,
# otherwise by sentence_partition. Returns whatever the chosen
# partition method returns.
def build
  define_set
  strategy = case @partition
             when /file/ then :file_partition
             else :sentence_partition
             end
  send(strategy)
end
build_superset()
click to toggle source
# File lib/te_rex/corpus.rb, line 41
# Collects the `sentences` of every entry in @set into one flat array.
# Nested arrays are flattened at every depth.
def build_superset
  @set.map(&:sentences).flatten
end
file_partition()
click to toggle source
# File lib/te_rex/corpus.rb, line 25
# Splits the corpus into training and testing data at file level.
#
# Sets @sample_size to 75% of the number of entries in @set (rounded),
# then fills @training and @testing from the file-based helpers.
# Returns the value of count_all.
def file_partition
  file_total = @set.count.to_f
  @sample_size = (file_total * 0.75).round
  @training = partition_training_by_file
  @testing = partition_test_by_file
  count_all
end
sentence_partition()
click to toggle source
# File lib/te_rex/corpus.rb, line 32
# Splits the corpus into training and testing data at sentence level.
#
# The scanned items from every file are shuffled once and cut at the 75%
# mark, so @training and @testing never share an item and together cover
# the whole pool. (Drawing two independent random samples from the same
# pool lets test items leak into the training data.)
#
# Sets @training, @testing and @sample_size. Returns the value of count_all.
def sentence_partition
  shuffled = partition_files_for_sentences.shuffle
  # Truncate the fractional size, the same way Array#sample treats a Float.
  cut = (shuffled.count * 0.75).to_i
  @training = shuffled.first(cut)
  @testing = shuffled.drop(cut)

  total = count_all
  # Rounded so sample_size is an Integer here, as it is after file_partition.
  @sample_size = (total.to_f * 0.75).round
  total
end
Private Instance Methods
count_all()
click to toggle source
# File lib/te_rex/corpus.rb, line 80
# Adds up `sentences.count` over every entry in @set, stores the total in
# @total_sentences and returns it.
def count_all
  @total_sentences = @set.inject(0) { |total, file| total + file.sentences.count }
end
define_set()
click to toggle source
# File lib/te_rex/corpus.rb, line 48
# Builds @set (memoized): one @format_klass instance per path matched by
# @glob, each constructed as @format_klass.new(path, @category_klass).
#
# Paths are sorted before wrapping. Dir[] only guarantees sorted results
# from Ruby 3.0 on; on older rubies the order is OS-dependent, which would
# make a position-based split of @set differ between machines.
#
# Returns @set.
def define_set
  @set ||= Dir[@glob].sort.map do |path|
    @format_klass.new(path, @category_klass)
  end
end
partition_files_for_sentences()
click to toggle source
# File lib/te_rex/corpus.rb, line 66
# Calls `scanner` on every entry in @set and returns the results as one
# flat array (nested arrays are flattened at every depth).
def partition_files_for_sentences
  scanned = @set.map(&:scanner)
  scanned.flatten
end
partition_test_by_file()
click to toggle source
# File lib/te_rex/corpus.rb, line 60
# Returns the flattened `scanner` output of the test files: every entry of
# @set after the first @sample_size entries.
#
# Starting at @sample_size (not @sample_size - 1) keeps the last training
# file out of the test data. Array#drop also returns [] for an empty @set,
# where a range slice starting at -1 yields nil and the chained map raises.
def partition_test_by_file
  @set.drop(@sample_size).map(&:scanner).flatten
end
partition_test_by_sentence(c_set)
click to toggle source
# File lib/te_rex/corpus.rb, line 76
# Returns a random sample of c_set sized at 25% of its element count.
#
# c_set - the collection to sample from; must respond to count and sample.
def partition_test_by_sentence(c_set)
  quota = c_set.count * 0.25
  c_set.sample(quota)
end
partition_training_by_file()
click to toggle source
# File lib/te_rex/corpus.rb, line 54
# Returns the flattened `scanner` output of the training files: the first
# @sample_size entries of @set.
#
# Array#first takes exactly @sample_size entries. An inclusive range
# 0..@sample_size would take one file more than the sample size and would
# still pick a file when the sample size is 0.
def partition_training_by_file
  @set.first(@sample_size).map(&:scanner).flatten
end
partition_training_by_sentence(c_set)
click to toggle source
# File lib/te_rex/corpus.rb, line 72
# Returns a random sample of c_set sized at 75% of its element count.
#
# c_set - the collection to sample from; must respond to count and sample.
def partition_training_by_sentence(c_set)
  quota = c_set.count * 0.75
  c_set.sample(quota)
end