class LogStash::Filters::Hashid

This filter allows you to generate predictable, string-encoded hashed keys based on event contents and timestamp. This can be used to avoid getting duplicate records indexed into Elasticsearch.

Hashed keys can be generated from full or partial hashes, and can optionally be prefixed with the event timestamp in order to make them largely ordered by timestamp, which tends to lead to increased indexing performance for event-based use cases where data is being indexed in near real time.

When used with the timestamp prefix enabled, it should ideally be run after the date filter has run and populated the @timestamp field.

Constants

CHARS
SHIFTS

Public Instance Methods

encode_to_sortable_string(data) click to toggle source
# File lib/logstash/filters/hashid.rb, line 95
# Encodes an array of byte values as a string drawn from the CHARS alphabet.
#
# The bytes are consumed three at a time; each group is packed into a single
# 24-bit integer and then cut into 6-bit indexes, one per entry in SHIFTS
# (presumably 18, 12, 6, 0 - confirm against the constant definition).
# A short final group is zero-padded, and one trailing output character is
# dropped for every padding byte that was added.
#
# @param data [Array<Integer>] byte values to encode
# @return [String] the encoded string; empty when +data+ is empty
def encode_to_sortable_string(data)
  sextets = []
  padding = 0

  data.each_slice(3) do |triple|
    high, middle, low = triple

    # Number of bytes missing from this group (only the last one can be short).
    padding =
      if middle.nil?
        2
      elsif low.nil?
        1
      else
        0
      end

    middle = 0 if padding == 2
    low = 0 if padding > 0

    packed = (high << 16) | (middle << 8) | low
    SHIFTS.each { |shift| sextets << ((packed >> shift) & 0x3f) }
  end

  # Characters that only encode padding carry no information, so drop them.
  kept = sextets.take(sextets.size - padding)
  kept.map { |index| CHARS[index] }.join
end
filter(event) click to toggle source
# File lib/logstash/filters/hashid.rb, line 50
# Computes a keyed hash over the configured source fields of +event+ and
# stores its encoded form in the field named by @target.
#
# Steps:
# 1. An HMAC (key @key, digest class @digest) is fed "|name|value" for every
#    field name in @source, in sorted order.
# 2. When @hash_bytes_used is a positive number smaller than the digest
#    length, only the trailing @hash_bytes_used bytes of the digest are kept.
# 3. When @add_timestamp_prefix is set, four bytes derived from the integer
#    value of the event's @timestamp (presumably epoch seconds) are put in
#    front, most significant byte first.
# 4. The byte array is passed through encode_to_sortable_string and the
#    result is tagged as UTF-8.
#
# @param event [#get, #set] the event to read fields from and write to
# @return whatever +event.set+ returns
def filter(event)
  signer = OpenSSL::HMAC.new(@key, @digest.new)
  @source.sort.each do |field|
    signer.update("|#{field}|#{event.get(field)}")
  end
  mac = signer.digest

  # Optionally shorten the hash, keeping its trailing bytes.
  limit = @hash_bytes_used
  if !limit.nil? && limit > 0 && limit < mac.length
    mac = mac[-limit, limit]
  end

  prefix =
    if @add_timestamp_prefix
      seconds = event.get('@timestamp').to_i
      [seconds >> 24, (seconds >> 16) & 0xff, (seconds >> 8) & 0xff, seconds & 0xff]
    else
      []
    end

  encoded = encode_to_sortable_string(prefix + mac.bytes)
  event.set(@target, encoded.force_encoding(Encoding::UTF_8))
end
register() click to toggle source
# File lib/logstash/filters/hashid.rb, line 44
# Normalises the configured method name to a symbol and resolves the digest
# class for it via select_digest, storing both in instance variables.
def register
  # symbols compare faster than strings
  method_name = @method.to_sym
  @method = method_name
  @digest = select_digest(method_name)
end
select_digest(method) click to toggle source
# File lib/logstash/filters/hashid.rb, line 77
# Maps a digest name to the matching OpenSSL digest class.
#
# @param method [Symbol] one of :SHA1, :SHA256, :SHA384, :SHA512 or :MD5
# @return [Class] the corresponding class under OpenSSL::Digest
# @raise [LogStash::ConfigurationError] for any other value
def select_digest(method)
  supported = %i[SHA1 SHA256 SHA384 SHA512 MD5]

  unless supported.include?(method)
    # we really should never get here
    raise(LogStash::ConfigurationError, "Unknown digest for method=#{method.to_s}")
  end

  # The supported names are exactly the constant names under OpenSSL::Digest.
  OpenSSL::Digest.const_get(method)
end