class Tokkens::Tokens

Converts a string token to a uniquely identifying sequential number.

Useful for working with a {https://en.wikipedia.org/wiki/Vector_space_model vector space model} for text.

Attributes

offset[RW]

@!attribute [r] offset

@return [Fixnum] Number of first token.

Public Class Methods

new(offset: 1) click to toggle source
# File lib/tokkens/tokens.rb, line 12
# Set up an empty, unfrozen token table.
#
# @param offset [Integer] number given to the first token; it is also the
#   starting value of the internal counter used for later tokens
def initialize(offset: 1)
  # The default of 1 exists because liblinear can't use offset 0, while
  # libsvm doesn't mind starting at one.
  @offset = @next_number = offset
  @tokens = {}
  @frozen = false
end

Public Instance Methods

find(i, prefix: nil) click to toggle source

Return a token by number.

This class is optimized for retrieving by token, not by number.

@param i [Fixnum] number to return token for @param prefix [String] optional string to remove from beginning of token @return [String, NilClass] given token, or nil when not found

# File lib/tokkens/tokens.rb, line 78
# Look up the token that was assigned a given number.
#
# The table is keyed by token, so this walks the entries until the first one
# whose number equals +i+.
#
# @param i [Integer] number to return the token for
# @param prefix [String, nil] optional string to strip from the start of the token
# @return [String, nil] the matching token (without +prefix+ when the token
#   starts with it), or nil when no token has that number
def find(i, prefix: nil)
  match = @tokens.find { |_token, data| data[0] == i }
  return nil unless match

  token = match.first
  if prefix && token.start_with?(prefix)
    token[prefix.length..-1]
  else
    token
  end
end
freeze!() click to toggle source

Stop assigning new numbers to tokens. @see frozen? @see thaw!

# File lib/tokkens/tokens.rb, line 23
# Mark the token table as frozen.
#
# Only sets the flag; the flag is what {#get} consults to decide between a
# plain lookup and a lookup that may create a new entry.
#
# @return [Boolean] true (the value just assigned)
def freeze!
  @frozen = true
end
frozen?() click to toggle source

@return [Boolean] Whether the tokens are frozen or not. @see freeze! @see thaw!

# File lib/tokkens/tokens.rb, line 37
# Report the current value of the frozen flag.
#
# @return [Boolean] true after {#freeze!} (which {#load} also calls), false
#   after {#thaw!} or right after construction
def frozen?
  @frozen
end
get(s, **kwargs) click to toggle source

Return a number for a new or existing token.

When the token was seen before, the same number is returned. If the token is first seen and this class isn't {#frozen?}, a new number is returned; else nil is returned.

@param s [String] token to return number for @option kwargs [String] :prefix optional string to prepend to the token @return [Fixnum, NilClass] number for given token

# File lib/tokkens/tokens.rb, line 66
# Return the number for a token, creating one when allowed.
#
# Nil, false and whitespace-only tokens yield nil. Otherwise a frozen table
# only looks the token up, while an unfrozen table looks it up or adds it.
#
# @param s [String] token to return the number for
# @option kwargs [String] :prefix optional string to prepend to the token
# @return [Integer, nil] number for the given token, or nil when there is none
def get(s, **kwargs)
  return nil unless s
  return nil if s.strip == ''

  if @frozen
    retrieve(s, **kwargs)
  else
    upsert(s, **kwargs)
  end
end
indexes() click to toggle source

Return indexes for all of the current tokens.

@return [Array<Fixnum>] All current token numbers. @see limit!

# File lib/tokkens/tokens.rb, line 91
# List the numbers of all tokens currently in the table.
#
# @return [Array<Integer>] token numbers, in the table's iteration order
def indexes
  @tokens.map { |_token, data| data[0] }
end
limit!(max_size: nil, min_occurence: nil) click to toggle source

Limit the number of tokens.

@param max_size [Fixnum] Maximum number of tokens to retain @param min_occurence [Fixnum] Keep only tokens seen at least this many times @return [Fixnum] Number of tokens left

# File lib/tokkens/tokens.rb, line 46
# Limit the number of tokens.
#
# Both filters are optional and are applied in this order: first every token
# seen fewer than +min_occurence+ times is dropped, then only the +max_size+
# most frequently seen tokens are kept.
#
# @param max_size [Integer, nil] maximum number of tokens to retain; 0 removes
#   all tokens
# @param min_occurence [Integer, nil] keep only tokens seen at least this many times
# @return [Integer] number of tokens left
# @raise [ArgumentError] when +max_size+ is negative
def limit!(max_size: nil, min_occurence: nil)
  # @todo raise if frozen
  if min_occurence
    @tokens.delete_if { |_name, data| data[1] < min_occurence }
  end
  if max_size
    # Array#first(n) keeps exactly n entries. Slicing with 0..(max_size - 1)
    # would become 0..-1 for max_size 0 and keep every token instead of none.
    most_seen = @tokens.sort_by { |_name, data| -data[1] }.first(max_size)
    @tokens = Hash[most_seen]
  end
  @tokens.length
end
load(filename) click to toggle source

Load tokens from file.

The tokens are frozen by default. All previously existing tokens are removed.

@param filename [String] Filename

# File lib/tokkens/tokens.rb, line 101
# Load tokens from file.
#
# Each line is expected to hold "<number> <count> <token>", separated by
# whitespace; the token itself may contain whitespace. Lines with fewer than
# three fields (such as blank lines) are skipped. All previously existing
# tokens are replaced, and the table is frozen afterwards.
#
# @param filename [String] path of the file to read
# @return [Boolean] the result of {#freeze!}
def load(filename)
  loaded = {}
  File.open(filename) do |f|
    f.each_line do |line|
      id, count, name = line.strip.split(/\s+/, 3)
      # Fewer than three fields: nothing usable on this line.
      next if name.nil?
      # Store the count as an Integer so it can be incremented and compared
      # numerically later on; split only yields Strings.
      loaded[name] = [id.to_i, count.to_i]
    end
  end
  # Swap in the new table only once the whole file was read.
  @tokens = loaded
  # Continue numbering after the highest loaded number, so tokens added after
  # thawing cannot reuse a number that is already taken.
  highest = loaded.values.map(&:first).max
  @next_number = highest ? highest + 1 : @offset
  # safer
  freeze!
end
save(filename) click to toggle source

Save tokens to file.

@param filename [String] Filename

# File lib/tokkens/tokens.rb, line 116
# Write all tokens to a file, overwriting it.
#
# One line is written per token, in the form "<number> <count> <token>".
#
# @param filename [String] path of the file to write
def save(filename)
  File.open(filename, 'w') do |out|
    @tokens.each_pair do |token, data|
      number, count = data
      out.puts "#{number} #{count} #{token}"
    end
  end
end
thaw!() click to toggle source

Allow new tokens to be created. @see freeze! @see frozen?

# File lib/tokkens/tokens.rb, line 30
# Clear the frozen flag set by {#freeze!}.
#
# With the flag cleared, {#get} goes back to the code path that may add
# tokens it has not seen before.
#
# @return [Boolean] false (the value just assigned)
def thaw!
  @frozen = false
end

Private Instance Methods

retrieve(s, prefix: '') click to toggle source
# File lib/tokkens/tokens.rb, line 126
# Look up the number of an existing token without modifying the table.
#
# @param s [String] token to look up
# @param prefix [String] string prepended to the token to form the lookup key
# @return [Integer, nil] the token's number, or nil when it is not in the table
def retrieve(s, prefix: '')
  entry = @tokens[prefix + s]
  return nil unless entry

  entry[0]
end
upsert(s, prefix: '') click to toggle source

return token number, update next_number; always returns a number

# File lib/tokkens/tokens.rb, line 132
# Return the number of a token, adding the token first when it is new.
#
# A new token gets the current value of the counter, which is then advanced.
# Every call, for new and existing tokens alike, increases the token's
# occurrence count by one.
#
# @param s [String] token to look up or add
# @param prefix [String] string prepended to the token to form the table key
# @return [Integer] the token's number
def upsert(s, prefix: '')
  key = prefix + s
  entry = @tokens[key]
  unless entry
    entry = [@next_number, 0]
    @tokens[key] = entry
    @next_number += 1
  end
  entry[1] += 1
  entry[0]
end