class Amadeusz::Jekyll::RelatedPosts

Public Class Methods

new() click to toggle source
# File lib/jekyll-related-posts.rb, line 18
# Starts with no posts and no keywords, and prepares the English
# whitespace tokenizer and Snowball stop-word filter that #stem relies on.
def initialize
  @posts, @keywords = [], []
  @tokenizer = Tokenizer::WhitespaceTokenizer.new(:en)
  @stopwords_filter = Stopwords::Snowball::Filter.new('en')
end

Public Instance Methods

add_post(post) click to toggle source
# File lib/jekyll-related-posts.rb, line 25
# Registers a post for the relatedness computation.
#
# Stores the post's url, a copy of its title and the stemmed terms of its
# content followed by the stemmed terms of its title, then merges those
# terms into the global keyword list (duplicates removed, first-seen order).
#
# post - object responding to #url, #content and #data (with a 'title' key).
def add_post(post)
  title = post.data['title']
  terms = stem(post.content) + stem(title)

  @posts << { url: post.url, title: title.dup, content: terms }

  @keywords = @keywords + terms
  @keywords.uniq!
end
build!(site) click to toggle source
# File lib/jekyll-related-posts.rb, line 37
# Injects the rendered related-posts list into every generated post page.
#
# Loads the plugin configuration, computes keyword weights and the related
# posts for every registered post, then for each post reads its generated
# file below site.config['destination'] (using index.html when the post URL
# maps to a directory), replaces every '<related-posts />' marker with the
# rendered template and writes the file back.
#
# site - object whose #config hash provides 'destination' (plus whatever
#        #config and #template_path read).
def build!(site)
  conf = config(site)
  @weights = keywords_weights(conf['weights'])
  related = find_releated(conf['max_count'], conf['min_score'], conf['accuracy'])
  template = Liquid::Template.parse(File.read(template_path(site)))

  @posts.each do |post|
    filename = File.join(site.config['destination'], post[:url])
    filename = File.join(filename, 'index.html') if File.directory?(filename)

    output = template.render('related_posts' => related[post])

    # Block form on purpose: a String replacement would interpret backslash
    # sequences such as \0 or \1 found in the rendered HTML (e.g. in a post
    # title) as back-references and corrupt the page.
    rendered = File.read(filename).gsub('<related-posts />') { output }
    File.write(filename, rendered)
  end
end

Private Instance Methods

bag_of_words() click to toggle source
# File lib/jekyll-related-posts.rb, line 138
# Builds the posts x keywords matrix of raw term counts.
#
# Side effects: stores the highest keyword count of each post in @max
# (0.0 for a post that contains none of the keywords) and keeps a copy of
# the count matrix in @bag_of_words.
#
# Returns the count matrix (an NMatrix of shape [posts, keywords]).
def bag_of_words
  result = NMatrix.new([@posts.size, @keywords.size], 0.0)
  @max = NMatrix.new([@posts.size], 0.0)

  @posts.each_with_index do |post, pi|
    # Count every term of the post once instead of rescanning the whole
    # content array for each keyword.
    counts = Hash.new(0)
    post[:content].each { |term| counts[term] += 1 }

    row_max = 0.0
    @keywords.each_with_index do |keyword, ki|
      occurrences = counts[keyword]
      result[pi, ki] = occurrences
      row_max = occurrences if occurrences > row_max
    end

    @max[pi] = row_max
  end

  @bag_of_words = result.dup
  result
end
config(site) click to toggle source
# File lib/jekyll-related-posts.rb, line 57
# Resolves the effective 'related' settings.
#
# Reads the '_config.yml' that sits next to this source file and overlays
# the site's own 'related' section (if any) on top of the built-in
# 'related' defaults.
#
# site - object whose #config hash may contain a 'related' hash.
#
# Returns the merged settings hash.
def config(site)
  plugin_dir = File.absolute_path(File.dirname(__FILE__))
  builtin = YAML.load_file(File.join(plugin_dir, '_config.yml'))
  overrides = site.config['related'] || {}

  builtin['related'].merge(overrides)
end
document_correleation(accuracy = 1.0) click to toggle source
# File lib/jekyll-related-posts.rb, line 118
# Builds the post-to-post similarity matrix.
#
# The per-post score vectors come straight from #tfidf when accuracy is
# exactly 1.0, otherwise from #lsi applied to the tf-idf matrix. With
# P = scores . scores^T, every off-diagonal cell becomes
#
#   P[u, v] / (P[u, u] + P[v, v] - P[u, v])
#
# and the diagonal is set to 0.0.
#
# accuracy - Float forwarded to #lsi when different from 1.0.
#
# Returns the square similarity matrix (posts x posts).
def document_correleation(accuracy = 1.0)
  scores = accuracy == 1.0 ? tfidf : lsi(tfidf, accuracy)
  result = scores.dot(scores.transpose)

  # Snapshot the diagonal before rewriting the matrix in place: the loop
  # below zeroes the diagonal cells, and reading them back afterwards would
  # replace a post's self-product with 0 in the denominators of other cells.
  self_products = Array.new(result.rows) { |i| result[i, i] }

  result.each_with_indices do |_, u, v|
    if u == v
      result[u, v] = 0.0
    else
      result[u, v] /= (self_products[u] + self_products[v] - result[u, v])
    end
  end

  result
end
find_releated(count = 5, min_score = -10.0, accuracy = 1.0) click to toggle source
# File lib/jekyll-related-posts.rb, line 75
# Picks the best-scoring related posts for every registered post.
#
# count     - maximum number of related posts per post (capped at the
#             number of registered posts).
# min_score - candidates scoring below this value are dropped.
# accuracy  - forwarded to #document_correleation.
#
# Returns a Hash mapping each post record to an Array of
# { 'score', 'url', 'title' } hashes, taken from the priority queue in the
# order it yields them (comparator: higher score first).
def find_releated(count = 5, min_score = -10.0, accuracy = 1.0)
  dc = document_correleation(accuracy)
  result = {}
  count = [count, @posts.size].min

  @posts.each_with_index do |post, index|
    # A post is never related to itself: its own cell carries a score of
    # 0.0, which would otherwise pass any min_score <= 0 and get listed.
    candidates = dc.row(index).each_with_index.select do |score, id|
      id != index && score >= min_score
    end

    queue = PQueue.new(candidates) do |a, b|
      a[0] > b[0]
    end

    result[post] = []
    count.times do
      score, id = queue.pop
      break unless score

      candidate = @posts[id]
      break unless candidate

      result[post] << {
        'score' => score,
        'url' => candidate[:url],
        'title' => candidate[:title]
      }
    end
  end

  result
end
inverse_document_frequency() click to toggle source
# File lib/jekyll-related-posts.rb, line 179
# Computes the inverse document frequency of every keyword.
#
# For each column of @bag_of_words (one column per keyword) counts the
# cells holding a positive value and stores
# ln(column size / that count) at the keyword's position; keywords that
# occur in no post keep 0.0.
#
# Requires @bag_of_words to be populated (see #bag_of_words).
#
# Returns a 1 x keywords NMatrix.
def inverse_document_frequency
  result = NMatrix.new([1, @keywords.size], 0.0)

  @bag_of_words.each_column do |column|
    # Seed the fold explicitly: without a seed the first cell's raw count
    # would be used as the starting total instead of being tested like the
    # other cells. The Float seed also keeps the division below fractional.
    occurences = column.reduce(0.0) { |total, cell| cell > 0 ? total + 1.0 : total }
    next unless occurences > 0

    result[0, column.offset[1]] = Math.log(column.size / occurences)
  end

  result
end
keywords_weights(weights) click to toggle source
# File lib/jekyll-related-posts.rb, line 165
# Builds the per-keyword weight row used to scale term frequencies.
#
# Every keyword starts at weight 1.0. Each configured word is stemmed and,
# when the stem is a known keyword, its weight replaces the default.
#
# weights - Hash of word => weight, or nil when no weights are configured.
#
# Returns a 1 x keywords NMatrix.
def keywords_weights(weights)
  result = NMatrix.new([1, @keywords.size], 1.0)

  # nil happens when the configuration has no (or an empty) weights entry.
  (weights || {}).each do |word, weight|
    position = @keywords.index(word.to_s.stem.to_sym)
    result[0, position] = weight if position
  end

  result
end
lsi(matrix, accuracy) click to toggle source
# File lib/jekyll-related-posts.rb, line 104
# Reduces the given score matrix before similarity is computed.
# NOTE(review): the name and the gesdd call suggest latent semantic indexing
# via a truncated SVD — confirm against the NMatrix/LAPACK documentation.
#
# matrix   - score matrix; presumably posts x keywords as produced by
#            #tfidf — verify against the caller.
# accuracy - factor applied to the keyword count to choose how many leading
#            components are kept.
#
# Returns the transposed product u2 . sigma_d . vt.
def lsi(matrix, accuracy)
  # Index of the last component kept (so degree + 1 components in total).
  degree = (@keywords.size * accuracy - 1).floor
  # gesdd is called on the transposed matrix and yields three values.
  # NOTE(review): presumably U, the singular values and V^T — confirm.
  u, sigma, vt = matrix.transpose.gesdd

  # Keep the leading (degree + 1) x (degree + 1) corner of u.
  # NOTE(review): this truncates rows as well as columns — confirm intended.
  u2 = u.slice(0..degree, 0..degree)
  # Place the first degree + 1 values of sigma on the diagonal of a zero
  # matrix with degree + 1 rows and one column per post.
  sigma_d = NMatrix.zeros([degree+1, @posts.size])
  sigma.each_with_indices do |v, i, j|
    break if i > degree
    sigma_d[i, i] = v
  end

  return u2.dot(sigma_d).dot(vt).transpose
end
stem(data) click to toggle source
# File lib/jekyll-related-posts.rb, line 204
# Turns raw text into a list of stemmed keyword symbols.
#
# Strips every character that is not a letter, space, tab, newline or one
# of ' _ - . , +, tokenizes the rest, lower-cases the tokens, removes stop
# words, stems what is left and drops tokens whose stem is empty.
#
# data - String to process.
#
# Returns an Array of Symbols (duplicates preserved, in input order).
def stem(data)
  cleaned = data.gsub(/[^a-z \t'_\-\n.,+]/i, '')
  words = @tokenizer.tokenize(cleaned).map { |word| word.downcase }
  meaningful = @stopwords_filter.filter(words)

  meaningful
    .map { |word| word.stem }
    .reject { |stemmed| stemmed.empty? }
    .map { |stemmed| stemmed.to_sym }
end
template_path(site) click to toggle source
# File lib/jekyll-related-posts.rb, line 64
# Locates the template used to render the related-posts list.
#
# A 'related.html' inside the site's layouts directory
# (site.config['source'] + site.config['layouts_dir']) wins when it exists;
# otherwise the 'related.html' next to this source file is used.
#
# Returns the chosen path as a String.
def template_path(site)
  override = File.join(site.config['source'], site.config['layouts_dir'], 'related.html')
  return override if File.exist?(override)

  File.join(File.absolute_path(File.dirname(__FILE__)), 'related.html')
end
term_frequency() click to toggle source
# File lib/jekyll-related-posts.rb, line 154
# Converts raw keyword counts into weighted, normalised term frequencies.
#
# Takes the matrix from #bag_of_words and, row by row, multiplies it by
# @weights and then divides it by that post's maximum count (@max, filled
# in by #bag_of_words).
#
# Returns the modified matrix.
def term_frequency
  frequencies = bag_of_words

  0.upto(frequencies.rows - 1) do |row|
    frequencies[row, 0..-1] *= @weights
    frequencies[row, 0..-1] /= @max[row]
  end

  frequencies
end
tfidf() click to toggle source
# File lib/jekyll-related-posts.rb, line 193
# Combines term frequencies with inverse document frequencies.
#
# Multiplies every row of the #term_frequency matrix by the row returned
# by #inverse_document_frequency.
#
# Returns the modified matrix.
def tfidf
  # Order matters: #term_frequency runs #bag_of_words, which fills the
  # @bag_of_words that #inverse_document_frequency reads.
  weighted = term_frequency
  idf_row = inverse_document_frequency

  0.upto(weighted.rows - 1) do |row|
    weighted[row, 0..-1] *= idf_row
  end

  weighted
end