class TeRex::Corpus::Body

Attributes

category_klass[R]
format_klass[R]
sample_size[R]
set[R]
testing[R]
total_sentences[R]
training[R]

Public Class Methods

new(glob: "", partition: :file, format_klass: NilClass, category_klass: NilClass) click to toggle source
# File lib/te_rex/corpus.rb, line 7
# Records the corpus configuration. Nothing is globbed or parsed here;
# that happens later in #build.
#
# glob           - pattern later handed to Dir[] by define_set.
# partition      - partitioning mode; #build matches it against /file/.
# format_klass   - class instantiated once per globbed file.
# category_klass - passed as the second argument to format_klass.new.
def initialize(glob: "", partition: :file, format_klass: NilClass, category_klass: NilClass)
  @glob, @format_klass, @category_klass = glob, format_klass, category_klass
  @partition = partition
end

Public Instance Methods

build() click to toggle source

@sample_size = (@set.count * 0.75).round

# File lib/te_rex/corpus.rb, line 15
# Populates the set of formatted files (define_set) and then partitions it.
#
# A @partition value matching /file/ selects file_partition; every other
# value (including nil) falls through to sentence_partition.
#
# Returns whatever the selected partition method returns.
def build
  define_set
  # Regexp#=== is the same test `case ... when /file/` performs, and it is
  # safe for symbols and nil alike.
  return file_partition if /file/ === @partition

  sentence_partition
end
build_superset() click to toggle source
# File lib/te_rex/corpus.rb, line 41
# Gathers the `sentences` of every entry in @set into one flat array.
#
# Returns an Array; nested arrays inside `sentences` are flattened too.
def build_superset
  per_file = @set.map { |formatter| formatter.sentences }
  per_file.flatten
end
file_partition() click to toggle source
# File lib/te_rex/corpus.rb, line 25
# Partitions the corpus at file granularity.
#
# @sample_size becomes 75% of the number of entries in @set, rounded to the
# nearest integer. It must be assigned before the two partition helpers
# run, because both read it.
#
# Returns the value of count_all.
def file_partition
  file_total = @set.count
  @sample_size = (file_total * 0.75).round
  @training = partition_training_by_file
  @testing = partition_test_by_file
  count_all
end
sentence_partition() click to toggle source
# File lib/te_rex/corpus.rb, line 32
# Partitions the corpus at sentence granularity.
#
# The flattened scanner output of every file (partition_files_for_sentences)
# is handed to both partition_training_by_sentence and
# partition_test_by_sentence, then the sentence total is stored by count_all.
#
# NOTE(review): both helpers draw from the same collection independently,
# so @training and @testing may share items - confirm this is intended.
#
# @sample_size is 75% of the total reported by count_all, rounded to a whole
# number so it has the same Integer form that file_partition produces.
#
# Returns the total computed by count_all.
def sentence_partition
  corpus_set = partition_files_for_sentences
  @training = partition_training_by_sentence(corpus_set)
  @testing = partition_test_by_sentence(corpus_set)
  total = count_all
  @sample_size = (total.to_f * 0.75).round
  total
end

Private Instance Methods

count_all() click to toggle source
# File lib/te_rex/corpus.rb, line 80
# Sums `sentences.count` over every entry in @set and stores the result in
# @total_sentences.
#
# Returns the total (0 when @set is empty).
def count_all
  # A fold keeps the running total local instead of mutating an outer
  # counter from inside a throwaway `map`.
  @total_sentences = @set.reduce(0) { |total, file| total + file.sentences.count }
end
define_set() click to toggle source
# File lib/te_rex/corpus.rb, line 48
# Builds @set once: every path matched by Dir[@glob] is wrapped in
# @format_klass.new(path, @category_klass). Later calls reuse the stored
# value as long as it is truthy.
#
# Returns @set.
def define_set
  return @set if @set

  paths = Dir[@glob]
  @set = paths.map { |path| @format_klass.new(path, @category_klass) }
end
partition_files_for_sentences() click to toggle source
# File lib/te_rex/corpus.rb, line 66
# Calls `scanner` on every entry in @set and merges the results.
#
# Returns one flat Array (nested arrays are flattened at every depth).
def partition_files_for_sentences
  @set.map(&:scanner).flatten
end
partition_test_by_file() click to toggle source
# File lib/te_rex/corpus.rb, line 60
# Builds the test partition from the files that come after the first
# @sample_size entries of @set.
#
# The previous slice began at @sample_size - 1, which put the last training
# file into the test partition as well, and it produced nil (then a
# NoMethodError) for an empty set. `drop` skips exactly the training files
# and yields [] when nothing is left.
#
# Returns the flattened `scanner` output of the remaining files.
def partition_test_by_file
  @set.drop(@sample_size).map(&:scanner).flatten
end
partition_test_by_sentence(c_set) click to toggle source
# File lib/te_rex/corpus.rb, line 76
# Draws a random test sample from c_set.
#
# c_set - collection responding to `count` and `sample`.
#
# The requested size is a quarter of c_set.count; the Float is handed to
# `sample` unchanged.
#
# Returns the sampled elements.
def partition_test_by_sentence(c_set)
  quota = c_set.count * 0.25
  c_set.sample(quota)
end
partition_training_by_file() click to toggle source
# File lib/te_rex/corpus.rb, line 54
# Builds the training partition from the first @sample_size entries of @set.
#
# The previous inclusive range 0..@sample_size selected one file more than
# @sample_size; `first` takes exactly that many (or fewer when the set is
# shorter).
#
# Returns the flattened `scanner` output of those files.
def partition_training_by_file
  @set.first(@sample_size).map(&:scanner).flatten
end
partition_training_by_sentence(c_set) click to toggle source
# File lib/te_rex/corpus.rb, line 72
# Draws a random training sample from c_set.
#
# c_set - collection responding to `count` and `sample`.
#
# The requested size is three quarters of c_set.count; the Float is handed
# to `sample` unchanged.
#
# Returns the sampled elements.
def partition_training_by_sentence(c_set)
  quota = c_set.count * 0.75
  c_set.sample(quota)
end