class UV::BufferedTokenizer

Constants

DEFAULT_ENCODING

Attributes

delimiter[RW]
indicator[RW]
size_limit[RW]
verbose[RW]

Public Class Methods

new(options) click to toggle source

@param [Hash] options

# File lib/uv-rays/buffered_tokenizer.rb, line 25
# Build a tokenizer from an options hash.
#
# Recognised keys: :delimiter, :indicator, :msg_length, :size_limit,
# :min_length (falls back to 1), :verbose (only read when :size_limit is
# given) and :encoding (falls back to DEFAULT_ENCODING).
#
# @param [Hash] options
# @raise [ArgumentError] when neither a delimiter nor an indicator plus
#   message length is supplied
def initialize(options)
    @encoding   = options[:encoding] || DEFAULT_ENCODING
    @min_length = options[:min_length] || 1

    @delimiter  = options[:delimiter]
    @indicator  = options[:indicator]
    @msg_length = options[:msg_length]

    @size_limit = options[:size_limit]
    # verbose only matters to the buffer limit check, so it is ignored
    # when no limit is configured
    @verbose = options[:verbose] if @size_limit

    # A delimiter takes priority; otherwise fixed-length extraction needs
    # both the indicator and the message length.
    @extract_method =
        if @delimiter
            method(:delimiter_extract)
        elsif @indicator && @msg_length
            method(:length_extract)
        else
            raise ArgumentError, 'no delimiter provided'
        end

    init_buffer
end

Public Instance Methods

bytesize() click to toggle source

@return [Integer]

# File lib/uv-rays/buffered_tokenizer.rb, line 77
# Number of bytes currently held in the input buffer.
#
# @return [Integer]
def bytesize
    buffered = @input
    buffered.bytesize
end
empty?() click to toggle source

@return [Boolean]

# File lib/uv-rays/buffered_tokenizer.rb, line 72
# Whether the input buffer currently holds no data.
#
# @return [Boolean]
def empty?
    @input.length.zero?
end
extract(data) click to toggle source

Extract takes an arbitrary string of input data and returns an array of tokenized entities, provided there were any available to extract.

@example

tokenizer.extract(data).
    map { |entity| Decode(entity) }.each { ... }

@param [String] data

# File lib/uv-rays/buffered_tokenizer.rb, line 54
# Append a chunk of raw input to the buffer and run the configured
# extraction strategy over it.
#
# The chunk is re-tagged with the tokenizer's encoding on a copy, so the
# caller's string is left untouched and frozen strings are accepted.
#
# @param [String] data
# @return whatever the extraction strategy returns
def extract(data)
    # force_encoding mutates its receiver (and raises on a frozen string),
    # so it is applied to a duplicate rather than to the argument itself.
    @input << data.dup.force_encoding(@encoding)

    @extract_method.call
end
flush() click to toggle source

Flush the contents of the input buffer, i.e. return the input buffer even though a token has not yet been encountered.

@return [String]

# File lib/uv-rays/buffered_tokenizer.rb, line 65
# Hand back everything currently buffered, even though no complete token
# has been seen, and start over with a fresh buffer.
#
# @return [String] the previous buffer contents
def flush
    # reset assigns a new string to @input; tap still returns the old one
    @input.tap { reset }
end

Protected Instance Methods

empty_string() click to toggle source
# File lib/uv-rays/buffered_tokenizer.rb, line 172
# Build a new, mutable, zero-length string tagged with the tokenizer's
# encoding.
#
# @return [String]
def empty_string
    blank = String.new
    blank.force_encoding(@encoding)
end

Private Instance Methods

check_buffer_limits() click to toggle source

Checks whether the input buffer has exceeded its capacity, when a size limit is being imposed.

# File lib/uv-rays/buffered_tokenizer.rb, line 137
# Enforce the optional size limit on the input buffer.
#
# Does nothing unless @size_limit is set and the buffer holds more
# characters than that. On overflow the buffer is cut down: with an
# indicator that responds to #length, only the trailing
# (indicator.length - 1) characters are kept, so an indicator that was one
# character short can still match on the next extract. In every other case
# the buffer is emptied via reset.
#
# @raise [RuntimeError] after trimming, when @verbose is truthy
def check_buffer_limits
    return unless @size_limit && @input.size > @size_limit

    # Regexp indicators do not respond to #length, so they (and a missing
    # indicator) always take the full reset path.
    keep = @indicator.respond_to?(:length) ? @indicator.length - 1 : 0
    # Never ask for more characters than the buffer holds; an out-of-range
    # slice would return nil and leave the buffer unusable.
    keep = @input.size if keep > @input.size

    if keep > 0
        # A start/length slice is used on purpose: the range -0..-1 selects
        # the whole string, which would leave an over-limit buffer untrimmed
        # for one-character indicators.
        @input = @input[-keep, keep]
    else
        reset
    end

    raise 'input buffer exceeded limit' if @verbose
end
delimiter_extract() click to toggle source
# File lib/uv-rays/buffered_tokenizer.rb, line 85
# Delimiter-mode extraction: cut the buffer on @delimiter and return the
# complete entities found so far.
#
# Whatever follows the last delimiter stays in the buffer for the next
# call. When an @indicator is configured, each complete chunk is reduced to
# the text after its last indicator, and chunks containing no indicator are
# dropped. Entities shorter than @min_length are removed from the result.
#
# @return [Array<String>]
def delimiter_extract
    # The negative limit stops split from discarding a trailing empty
    # string, so the final element is always the not-yet-terminated
    # remainder (empty when the buffer ends exactly on a delimiter).
    chunks = @input.split(@delimiter, -1)

    # An empty buffer splits to [], hence the fallback.
    @input = chunks.pop || empty_string

    entities =
        if @indicator
            chunks.each_with_object([]) do |chunk, found|
                parts = chunk.split(@indicator, -1)
                # more than one part means the indicator occurred in this chunk
                found << parts.last if parts.length > 1
            end
        else
            chunks
        end

    check_buffer_limits

    # Enforce the minimum entity length
    entities.reject! { |entity| entity.length < @min_length }

    entities
end
init_buffer() click to toggle source
# File lib/uv-rays/buffered_tokenizer.rb, line 152
# Prepare the tokenizer's working state: start with an empty buffer and
# replace String delimiters/indicators with frozen copies tagged with the
# tokenizer's encoding. Non-String values are left as they are.
def init_buffer
    @input = empty_string

    # String.new copies first, so the caller's original string is neither
    # re-encoded nor frozen.
    @delimiter = String.new(@delimiter).force_encoding(@encoding).freeze if @delimiter.is_a?(String)
    @indicator = String.new(@indicator).force_encoding(@encoding).freeze if @indicator.is_a?(String)
end
length_extract() click to toggle source
# File lib/uv-rays/buffered_tokenizer.rb, line 114
# Fixed-length extraction: every message is the @msg_length characters that
# follow an occurrence of @indicator.
#
# Data before the first indicator is discarded as junk. Segments shorter
# than @msg_length that are followed by another indicator are dropped. A
# trailing incomplete message is kept in the buffer, together with its
# indicator, until more data arrives.
#
# When the buffer contains no indicator at all, nothing is extracted and the
# junk is discarded. Only the last (indicator.length - 1) characters are
# kept for String indicators, so an indicator split across two reads can
# still be recognised. No indicator is ever fabricated in the buffer.
#
# @return [Array<String>] complete messages, each exactly @msg_length long
def length_extract
    segments = @input.split(@indicator, -1)

    # Fewer than two segments means the indicator does not occur in the
    # buffer. Re-seeding the buffer with the indicator here would make the
    # next chunk of junk look like a genuine message.
    if segments.length < 2
        keep = @indicator.respond_to?(:length) ? @indicator.length - 1 : 0
        keep = @input.length if keep > @input.length

        if keep > 0
            @input = @input[-keep, keep]
        else
            reset
        end

        check_buffer_limits
        return []
    end

    segments.shift # discard junk data preceding the first indicator

    # At least one segment remains, so this is never nil
    last = segments.pop

    # Keep segments that are long enough, then trim the trailing junk
    messages = segments.select { |segment| segment.length >= @msg_length }
    messages.map! { |segment| segment[0...@msg_length] }

    if last.length >= @msg_length
        messages << last[0...@msg_length]
        @input = last[@msg_length..-1]
    else
        # Put the indicator back in front of the partial message so that it
        # is found again once the rest of the message arrives.
        # NOTE(review): for a Regexp indicator this interpolates the
        # pattern's string form rather than the matched text - confirm
        # whether Regexp indicators are meant to be supported here.
        reset("#{@indicator}#{last}")
    end

    check_buffer_limits

    messages
end
reset(value = nil) click to toggle source
# File lib/uv-rays/buffered_tokenizer.rb, line 164
# Replace the input buffer with a new string in the tokenizer's encoding.
#
# @param [String, nil] value initial buffer contents; empty when omitted
# @return [String] the new buffer
def reset(value = nil)
    contents = value || ''
    # String.new copies, so the buffer never aliases the argument
    replacement = String.new(contents)
    @input = replacement.force_encoding(@encoding)
end