module Synt::Similar

Public Instance Methods

compare(opts) click to toggle source
# File lib/synt/similar.rb, line 11
# Scores how similar two inputs are and returns the score as a Float rounded
# to two decimal places.
#
# opts - Hash of options:
#        :compare         - first input (required).
#        :to              - second input (required).
#        "string-compare" - String key; forwarded to parse_data as its
#                           is_string flag for both inputs.
#        :algorithm       - key into #algorithms; defaults to 'jaccard'.
#        :ngram           - n-gram spec handed to ngram_range.
#
# A missing :compare/:to or an unrecognised :algorithm is reported via #error.
def compare opts
  error 'no compare propery provided' unless opts[:compare]
  error 'no to propery provided' unless opts[:to]

  # Resolve the algorithm up front: an unknown name would otherwise leave
  # `algorithm` nil and only fail later with NoMethodError on `nil.compare`.
  algorithm_name = (opts[:algorithm] || 'jaccard').to_sym
  algorithm = algorithms[algorithm_name]
  error "unknown algorithm: #{algorithm_name}" unless algorithm

  # TODO: string-compare sucks to reference..
  src = parse_data opts["string-compare"], opts[:compare]
  cmp = parse_data opts["string-compare"], opts[:to]
  n_start, n_end = ngram_range opts[:ngram]
  src_t = normalize_ripper_tokens Synt::Parser.parse(src)
  cmp_t = normalize_ripper_tokens Synt::Parser.parse(cmp)

  a = generate_ngrams src_t, n_start, n_end
  b = generate_ngrams cmp_t, n_start, n_end

  algorithm.compare(a, b).to_f.round 2
end

Private Instance Methods

algorithms() click to toggle source
# File lib/synt/similar.rb, line 33
# Lookup table of the available comparison algorithms, keyed by the symbol
# form of the algorithm name.
def algorithms
  table = {}
  table[:jaccard] = Jaccard
  table[:tanimoto] = Tanimoto
  table
end
error(msg) click to toggle source
# File lib/synt/similar.rb, line 88
# Reports a fatal problem: writes +msg+ to standard output, then terminates
# the process with exit status 1.
def error msg
  $stdout.puts(msg)
  Kernel.exit(1)
end
generate_ngrams(arr, start, nend) click to toggle source
# File lib/synt/similar.rb, line 45
# Builds the n-grams of +arr+ for every size from +start+ to +nend+.
#
# arr   - Array of tokens; each n-gram is its tokens joined with no separator.
# start - smallest n-gram size; nil means 1, values below 1 are treated as 1.
# nend  - largest n-gram size; nil means arr.length.
#
# Returns +arr+ itself when only unigrams are requested, otherwise a new Array
# holding every window of each size, smallest size first. When +nend+ is
# larger than the number of tokens a notice is printed and unigrams are used.
def generate_ngrams arr, start, nend
  nend ||= arr.length
  start ||= 1

  if nend > arr.length
    start = nend = 1
    puts 'ngram end value exceeds max length- setting start/end to: 1'
  end

  return arr if start == 1 && nend == 1 # unigrams are the tokens themselves

  # each_cons(size) yields exactly the consecutive windows of `size` tokens,
  # so the loop runs over the n-gram sizes themselves rather than over the
  # positions of those sizes in a list. It rejects sizes below 1, hence the
  # clamp.
  smallest = [start, 1].max
  (smallest..nend).flat_map do |size|
    arr.each_cons(size).map { |gram| gram.join('') }
  end
end
ngram_range(ngram) click to toggle source
# File lib/synt/similar.rb, line 68
# Translates an n-gram spec into a [start, end] pair of sizes.
#
# ngram - nil/false     -> 1, 1
#         'all'         -> nil, nil
#         'A..B'        -> A, B (each side converted with to_i)
#         anything else -> N, N where N is the spec converted with to_i
#
# The spec is converted to a String first, so an Integer or Symbol (e.g. 2 or
# :all) is handled the same way as its String form instead of depending on
# those classes responding to =~ / to_i.
def ngram_range ngram
  return 1, 1 unless ngram

  spec = ngram.to_s
  return nil, nil if spec == 'all'

  if spec.include?('..')
    low, high = spec.split '..'
    return low.to_i, high.to_i
  end

  size = spec.to_i
  return size, size
end
normalize_ripper_tokens(tokens) click to toggle source
# File lib/synt/similar.rb, line 84
# Drops nil/false entries and tokens made up solely of whitespace (including
# the empty string), keeping the remaining tokens in their original order.
#
# The pattern is anchored with \A and \z so the whole token must be blank.
# With the line anchors ^ and $ a token that merely contains or starts with
# an empty line (e.g. "x\n\ny") would match and be discarded.
def normalize_ripper_tokens tokens
  tokens.select { |token| token && token !~ /\A\s*\z/ }
end
parse_data(is_string, value) click to toggle source
# File lib/synt/similar.rb, line 37
# Returns the text for one side of the comparison.
#
# is_string - when truthy, +value+ already is the text and is returned as is.
# value     - the text itself, or the path of a file whose contents are read.
#
# File.read is used rather than IO.read so that a value beginning with "|"
# is treated as a file name and is never run as a shell command.
def parse_data is_string, value
  return value if is_string

  File.read value
end