class UV::BufferedTokenizer
Constants
- DEFAULT_ENCODING
Attributes
delimiter[RW]
indicator[RW]
size_limit[RW]
verbose[RW]
Public Class Methods
new(options)
@param [Hash] options
# File lib/uv-rays/buffered_tokenizer.rb, line 25
def initialize(options)
    @delimiter  = options[:delimiter]
    @indicator  = options[:indicator]
    @msg_length = options[:msg_length]
    @size_limit = options[:size_limit]
    @min_length = options[:min_length] || 1
    @verbose    = options[:verbose] if @size_limit
    @encoding   = options[:encoding] || DEFAULT_ENCODING

    if @delimiter
        @extract_method = method(:delimiter_extract)
    elsif @indicator && @msg_length
        @extract_method = method(:length_extract)
    else
        raise ArgumentError, 'no delimiter provided'
    end

    init_buffer
end
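A minimal construction sketch based on the initializer above (assuming the gem is loaded with require 'uv-rays'): either a :delimiter is given, or an :indicator together with :msg_length; any other combination raises ArgumentError.

require 'uv-rays'   # assumed require path for the gem

# Delimiter-based tokenizing, e.g. newline-terminated messages
newline_tok = UV::BufferedTokenizer.new(delimiter: "\r\n")

# Fixed-length messages preceded by an indicator
fixed_tok = UV::BufferedTokenizer.new(indicator: '$', msg_length: 4)

# Neither configuration supplied
UV::BufferedTokenizer.new({})   # raises ArgumentError, 'no delimiter provided'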
Public Instance Methods
bytesize()
@return [Integer]
# File lib/uv-rays/buffered_tokenizer.rb, line 77
def bytesize
    @input.bytesize
end
empty?()
@return [Boolean]
# File lib/uv-rays/buffered_tokenizer.rb, line 72
def empty?
    @input.empty?
end
extract(data)
Extract takes an arbitrary string of input data and returns an array of tokenized entities, if any are available to extract.
@example
tokenizer.extract(data).map { |entity| Decode(entity) }.each { ... }
@param [String] data
# File lib/uv-rays/buffered_tokenizer.rb, line 54
def extract(data)
    data.force_encoding(@encoding)
    @input << data

    @extract_method.call
end
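A short usage sketch of extract, based on the source above; a partial trailing segment stays buffered until its delimiter arrives:

tok = UV::BufferedTokenizer.new(delimiter: "\r\n")
tok.extract("PING\r\nPO")   # => ["PING"]   ("PO" remains buffered)
tok.extract("NG\r\n")       # => ["PONG"]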
flush()
Flush the contents of the input buffer, i.e. return whatever is buffered even though a complete token has not yet been encountered, and reset the buffer.
@return [String]
# File lib/uv-rays/buffered_tokenizer.rb, line 65
def flush
    buffer = @input
    reset
    buffer
end
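A sketch showing flush alongside bytesize and empty?, derived from the methods above:

tok = UV::BufferedTokenizer.new(delimiter: "\n")
tok.extract('incomple')   # => []  (no delimiter seen yet)
tok.bytesize              # => 8
tok.flush                 # => "incomple"
tok.empty?                # => true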
Protected Instance Methods
empty_string()
# File lib/uv-rays/buffered_tokenizer.rb, line 172
def empty_string
    String.new.force_encoding(@encoding)
end
Private Instance Methods
check_buffer_limits()
Check to see if the buffer has exceeded capacity, if we're imposing a limit
# File lib/uv-rays/buffered_tokenizer.rb, line 137
def check_buffer_limits
    if @size_limit && @input.size > @size_limit
        if @indicator && @indicator.respond_to?(:length) # check for regex
            # save enough of the buffer that if one character of the indicator were
            # missing we would match on next extract (very much an edge case) and
            # best we can do with a full buffer. If we were one char short of a
            # delimiter it would be unfortunate
            @input = @input[-(@indicator.length - 1)..-1]
        else
            reset
        end

        raise 'input buffer exceeded limit' if @verbose
    end
end
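A sketch of the overflow behaviour above: with :size_limit set and :verbose enabled, an extract that leaves more than :size_limit bytes buffered discards the buffer and raises.

tok = UV::BufferedTokenizer.new(delimiter: "\n", size_limit: 10, verbose: true)
begin
    tok.extract('this line never terminates ...')
rescue RuntimeError => e
    e.message   # => "input buffer exceeded limit"
end
tok.empty?      # => true  (the buffer was reset)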
delimiter_extract()
# File lib/uv-rays/buffered_tokenizer.rb, line 85
def delimiter_extract
    # Extract token-delimited entities from the input string with the split command.
    # There's a bit of craftiness here with the -1 parameter. Normally split would
    # behave no differently regardless of if the token lies at the very end of the
    # input buffer or not (i.e. a literal edge case) Specifying -1 forces split to
    # return "" in this case, meaning that the last entry in the list represents a
    # new segment of data where the token has not been encountered
    messages = @input.split(@delimiter, -1)

    if @indicator
        @input = messages.pop || empty_string
        entities = []
        messages.each do |msg|
            res = msg.split(@indicator, -1)
            entities << res.last if res.length > 1
        end
    else
        entities = messages
        @input = entities.pop || empty_string
    end

    check_buffer_limits

    # Check min-length is met
    entities.select! { |msg| msg.length >= @min_length }

    return entities
end
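When both :delimiter and :indicator are configured, delimiter_extract keeps only delimited messages that contain the indicator and returns the portion after it, as in this sketch:

tok = UV::BufferedTokenizer.new(delimiter: "\n", indicator: 'DATA:')
tok.extract("noise\nDATA:alpha\nDATA:beta\npartial")
# => ["alpha", "beta"]   ("noise" lacks the indicator; "partial" stays buffered)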
init_buffer()
# File lib/uv-rays/buffered_tokenizer.rb, line 152
def init_buffer
    @input = empty_string

    if @delimiter.is_a?(String)
        @delimiter = String.new(@delimiter).force_encoding(@encoding).freeze
    end

    if @indicator.is_a?(String)
        @indicator = String.new(@indicator).force_encoding(@encoding).freeze
    end
end
length_extract()
# File lib/uv-rays/buffered_tokenizer.rb, line 114
def length_extract
    messages = @input.split(@indicator, -1)
    messages.shift # discard junk data

    last = messages.pop || empty_string

    # Select messages of the right size then remove junk data
    messages.select! { |msg| msg.length >= @msg_length ? true : false }
    messages.map! { |msg| msg[0...@msg_length] }

    if last.length >= @msg_length
        messages << last[0...@msg_length]
        @input = last[@msg_length..-1]
    else
        reset("#{@indicator}#{last}")
    end

    check_buffer_limits

    return messages
end
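A sketch of the indicator plus fixed-length mode implemented by length_extract: each message is the :msg_length characters following the :indicator, and a short trailing fragment is re-buffered (with its indicator) for the next call.

tok = UV::BufferedTokenizer.new(indicator: '$', msg_length: 4)
tok.extract('$ABCD$EF')   # => ["ABCD"]           ("$EF" is re-buffered)
tok.extract('GH$WXYZ')    # => ["EFGH", "WXYZ"]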
reset(value = nil)
# File lib/uv-rays/buffered_tokenizer.rb, line 164
def reset(value = nil)
    @input = String.new(value || '').force_encoding(@encoding)
end