class DelimitedWordDataSource

Attributes

buckets[R]
wordAsEncountered[R]
wordCounts[R]
wordValueSequence[R]

Public Class Methods

new(filePath, lineStateMachine, limit) click to toggle source
Calls superclass method WordDataSource::new
# File lib/data/word_data_source.rb, line 123
# Sets up empty word-statistics state and then delegates to the superclass
# constructor.
#
# @param filePath [Object] passed straight through to super (presumably the
#   path of the file to read -- confirm against the superclass)
# @param lineStateMachine [Object] collaborator kept in @lineStateMachine;
#   other methods of this class call #process, #bucket and #pages on it
# @param limit [Integer] kept in @limit; processData stops once @count
#   reaches it, provided it is greater than zero
def initialize(filePath, lineStateMachine, limit)
  @lineStateMachine, @limit = lineStateMachine, limit

  # number of words accepted so far (compared against @limit)
  @count = 0

  # bucket => { word => occurrences within that bucket }
  @buckets = {}
  # word => occurrences overall
  @wordCounts = {}

  # every accepted word, stored as its index into @wordAsEncountered
  @wordValueSequence = []
  # word => position at which it was first encountered
  @wordAsEncounteredIndex = {}
  # distinct words, appended only the first time each one is seen
  @wordAsEncountered = []
  @nextWordEncounteredIndex = 0

  # NOTE(review): the second argument is a String that merely looks like a
  # regexp literal -- confirm the superclass expects a String here.
  super(filePath, "/[^[:print:]]/")
end

Public Instance Methods

bucket() click to toggle source
# File lib/data/word_data_source.rb, line 136
# Returns whatever the line state machine currently reports as its bucket.
def bucket
  return @lineStateMachine.bucket
end
has_terminator?() click to toggle source
# File lib/data/word_data_source.rb, line 222
# Always answers true: this data source reports that it has a terminator.
def has_terminator?
  return true
end
metaDataFor(offset) click to toggle source

TODO: the metadata lookup is a linear scan, O(N); it should be a binary search, O(log N).

# File lib/data/word_data_source.rb, line 162
# Finds the metadata entry in effect for a given word offset.
#
# The state machine's pages are enumerated as (metadata, wordOffset) pairs.
# The answer is the metadata of the pair with the greatest wordOffset that is
# still strictly below +offset+; pairs sharing the same wordOffset are ranked
# by metadata, the greater one winning.
#
# This is done in a single pass (filter, then take the maximum) instead of
# sorting the whole page table on every call.
#
# @param offset [Integer] word offset to look up
# @return [Object] the matching metadata, or the string "unknown" when no
#   page starts before +offset+
def metaDataFor(offset)
  earlierPages = @lineStateMachine.pages.select do |_metadata, wordOffset|
    wordOffset < offset
  end
  # Same ordering key as sorting by the reversed pair: offset first, then metadata.
  latest = earlierPages.max_by { |metadata, wordOffset| [wordOffset, metadata] }
  latest ? latest.first : "unknown"
end
process(line) click to toggle source
# File lib/data/word_data_source.rb, line 207
# Feeds one input line through the pipeline: preprocessLine, then the line
# state machine, then processData.
#
# The state machine is also given the number of words recorded so far
# (the current length of @wordValueSequence).
#
# @param line [Object] raw line, handed to preprocessLine first
# @return [Object] false when the state machine yields no data for the line;
#   otherwise whatever processData returns for that data
def process(line)
  cleaned = self.preprocessLine(line)
  tokens = @lineStateMachine.process(cleaned, @wordValueSequence.length)
  return false unless tokens.length > 0

  currentBucket = @lineStateMachine.bucket
  # make sure the bucket has a counter table before words are tallied into it
  @buckets[currentBucket] = {} unless @buckets.has_key?(currentBucket)
  self.processData(tokens, currentBucket)
end
processData(data,bucket) click to toggle source
# File lib/data/word_data_source.rb, line 179
# Tallies a batch of words into the overall and per-bucket statistics.
#
# Each entry has one trailing "," and then one trailing "." removed; entries
# that are empty afterwards are skipped. Every remaining word is appended to
# @words, counted in @wordCounts and in @buckets[bucket], and recorded in
# @wordValueSequence as its first-encounter index.
#
# @param data [Enumerable<String>] words to record
# @param bucket [Object] key into @buckets; its entry must already exist
# @return [Boolean] true as soon as a positive @limit has been reached
#   (remaining entries are left unprocessed), false otherwise
def processData(data,bucket)
  data.each do |rawWord|
    token = rawWord.chomp(",").chomp(".")
    next if token.empty?

    @words << token

    if @wordCounts.has_key?(token)
      @wordCounts[token] += 1
    else
      # first sighting: assign the next encounter index and start the count
      @wordAsEncounteredIndex[token] = @nextWordEncounteredIndex
      @wordAsEncountered << token
      @nextWordEncounteredIndex += 1
      @wordCounts[token] = 1
    end

    perBucket = @buckets[bucket]
    perBucket[token] = perBucket.fetch(token, 0) + 1

    @wordValueSequence << @wordAsEncounteredIndex[token]
    @count += 1
    return true if @limit > 0 && @count >= @limit
  end
  false
end
save() click to toggle source
# File lib/data/word_data_source.rb, line 140
# Writes three companion files whose names are @filePath plus a suffix:
#
# * ".words"   - the distinct words in first-encounter order, one per line
# * ".values"  - @wordValueSequence packed with the "N*" directive, written
#                in binary mode
# * ".summary" - the word totals followed by one "offset page" line per
#                entry of the state machine's pages
def save
  File.open("#{@filePath}.words", 'w') do |wordsFile|
    @wordAsEncountered.each { |entry| wordsFile.write("#{entry}\n") }
  end

  packedValues = @wordValueSequence.pack("N*")
  File.open("#{@filePath}.values", 'wb') { |valuesFile| valuesFile << packedValues }

  File.open("#{@filePath}.summary", "w") do |summaryFile|
    summaryFile << "#{@numberWordsInFile} words in file\n"
    summaryFile << "#{@nextWordEncounteredIndex} distinct words\n"
    summaryFile << "Metadata\n"

    # sort_by builds a new array ordered by (wordOffset, page); the pages
    # collection itself is not modified by this call.
    orderedPages = @lineStateMachine.pages.sort_by(&:reverse)
    orderedPages.each do |page, wordOffset|
      summaryFile << "#{wordOffset} #{page}\n"
    end
  end
end
terminator() click to toggle source
# File lib/data/word_data_source.rb, line 226
# Returns the terminator marker string used by this data source.
def terminator
  return "END_OF_DOCUMENT"
end
verify(word, count) click to toggle source
# File lib/data/word_data_source.rb, line 218
# Checks whether the recorded number of occurrences of +word+ equals +count+.
#
# @param word [String] word to look up in @wordCounts
# @param count [Object] expected value
# @return [Boolean] result of comparing the stored entry (nil when the word
#   has no entry) with +count+
def verify(word, count)
  recorded = @wordCounts[word]
  recorded == count
end
wordCount(word) click to toggle source
# File lib/data/word_data_source.rb, line 174
# Looks up how many times +word+ has been recorded.
#
# @param word [String] word to look up in @wordCounts
# @return [Object] the stored count, or 0 when the word has no entry
def wordCount(word)
  @wordCounts.fetch(word, 0)
end