class SangsooNam::Jekyll::TFIDFRelatedPosts
Public Class Methods
new()
click to toggle source
# File lib/jekyll-tfidf-related-posts.rb, line 11
# Starts with empty collections for the registered documents, the keyword
# vocabulary and the tag/category terms, and creates the English Snowball
# stop-word filter.
def initialize
  @docs = []
  @keywords = []
  @tags_and_categories = []
  @stopwords_filter = Stopwords::Snowball::Filter.new('en')
end
Public Instance Methods
add_post(post)
click to toggle source
# File lib/jekyll-tfidf-related-posts.rb, line 18
# Registers a post: stems its content and title, appends one marker symbol
# per tag ("@tag:<name>") and per category ("@category:<name>"), and records
# the resulting term list in @docs, @keywords and @tags_and_categories.
#
# post - object responding to #content and #data (a Hash read with the
#        'tags', 'categories' and 'title' keys).
#
# Returns the accumulated @tags_and_categories array.
def add_post(post)
  # Array() tolerates a missing (nil) or scalar front-matter value, and
  # interpolation tolerates non-string entries such as numeric tags, which
  # would raise TypeError with String#+.
  tags = Array(post.data['tags']).map { |tag| :"@tag:#{tag}" }
  categories = Array(post.data['categories']).map { |category| :"@category:#{category}" }

  # A post without a title contributes no title terms instead of crashing.
  terms = stem(post.content) + stem(post.data['title'].to_s) + tags + categories

  @docs << { post: post, content: terms }
  # Append in place rather than rebuilding the accumulators on every call.
  @keywords.concat(terms)
  @tags_and_categories.concat(tags + categories)
end
build(site)
click to toggle source
# File lib/jekyll-tfidf-related-posts.rb, line 30
# De-duplicates the collected keywords and tag/category terms, computes the
# term weights, and stores on every registered post (in its @related_posts
# instance variable) the posts returned for it by
# build_related_docs_with_score.
#
# site - object whose #config hash may carry 'related_posts_count'; 4 is
#        used when that key is absent.
def build(site)
  @keywords.uniq!
  @tags_and_categories.uniq!
  @weights = custom_weights(@tags_and_categories)

  limit = site.config['related_posts_count'] || 4
  related_by_doc = build_related_docs_with_score(limit)

  @docs.each do |doc|
    posts = related_by_doc[doc].map { |entry| entry[:post] }
    doc[:post].instance_variable_set(:@related_posts, posts)
  end
end
Private Instance Methods
bag_of_words()
click to toggle source
# File lib/jekyll-tfidf-related-posts.rb, line 85
# Builds the documents x keywords matrix of raw term counts.
#
# Side effects: stores the largest count of each document in @max and a copy
# of the count matrix in @bag_of_words.
#
# Returns the count matrix (NMatrix, @docs.size x @keywords.size).
def bag_of_words
  # Tally each document's terms once; calling Array#count per matrix cell
  # would rescan the whole term list for every keyword.
  counts = @docs.map do |doc|
    doc[:content].each_with_object(Hash.new(0)) { |term, tally| tally[term] += 1 }
  end

  result = NMatrix.new([@docs.size, @keywords.size], 0.0)
  @max = NMatrix.new([@docs.size], 0.0)
  result.each_with_indices do |_, pi, ki|
    count = counts[pi][@keywords[ki]]
    result[pi, ki] = count
    @max[pi] = count if count > @max[pi]
  end

  @bag_of_words = result.dup
  result
end
custom_weights(terms, weight = 8.0)
click to toggle source
# File lib/jekyll-tfidf-related-posts.rb, line 112
# Builds a 1 x @keywords.size weight row that is 1.0 everywhere except at
# the columns of the given terms, which receive +weight+.
#
# terms  - keywords to boost.
# weight - value written at each boosted column (default 8.0).
#
# Returns the weight row (NMatrix).
def custom_weights(terms, weight = 8.0)
  result = NMatrix.new([1, @keywords.size], 1.0)

  # Resolve every keyword's first column once instead of scanning @keywords
  # with Array#index for each term.
  columns = {}
  @keywords.each_with_index { |keyword, index| columns[keyword] ||= index }

  terms.each do |term|
    column = columns[term]
    # A term that is not in the vocabulary has no column to boost; skip it
    # rather than indexing the matrix with nil.
    result[0, column] = weight if column
  end

  result
end
document_correleation()
click to toggle source
# File lib/jekyll-tfidf-related-posts.rb, line 70
# Computes the pairwise similarity of all documents from their TF-IDF rows:
#
#   score(u, v) = (u . v) / (u . u + v . v - u . v)
#
# i.e. the Tanimoto coefficient of the two row vectors. The diagonal
# (a document compared with itself) is set to 0.0.
#
# Returns a square matrix of scores, one row/column per document.
def document_correleation
  scores = tfidf
  result = scores.dot(scores.transpose)

  # Snapshot the squared norms (the diagonal) before the matrix is rewritten
  # in place: the loop below zeroes the diagonal, and reading it afterwards
  # would drop one of the two norms from every denominator.
  norms = Array.new(result.rows) { |i| result[i, i] }

  result.each_with_indices do |_, u, v|
    if u == v
      result[u, v] = 0.0
    else
      shared = result[u, v]
      denominator = norms[u] + norms[v] - shared
      # Two all-zero documents give 0/0; report "unrelated" instead of NaN.
      result[u, v] = denominator.zero? ? 0.0 : shared / denominator
    end
  end

  result
end
inverse_document_frequency()
click to toggle source
# File lib/jekyll-tfidf-related-posts.rb, line 122
# Computes, for every keyword column of @bag_of_words,
#
#   idf = log(number of documents / number of documents containing the term)
#
# Columns whose term occurs in no document keep the initial 0.0.
#
# Returns the IDF values as a 1 x @keywords.size row (NMatrix).
def inverse_document_frequency
  result = NMatrix.new([1, @keywords.size], 0.0)

  @bag_of_words.each_column do |column|
    # Seed the fold with 0.0. Without an initial value, reduce starts from
    # the first document's raw count, so a count of 3 there would be treated
    # as three containing documents.
    occurences = column.reduce(0.0) { |total, count| total + (count > 0 ? 1.0 : 0.0) }
    # column.offset[1] is read as the column's index in the parent matrix.
    result[0, column.offset[1]] = Math.log(column.size / occurences) if occurences > 0
  end

  result
end
stem(data)
click to toggle source
# File lib/jekyll-tfidf-related-posts.rb, line 147
# Turns raw text into a list of stemmed keyword symbols: Liquid tags are
# blanked out, the text is split into lower-cased word tokens, stop words are
# removed through @stopwords_filter, each token is stemmed, and stems of a
# single character are dropped.
#
# data - text to process; nil is treated as empty text.
#
# Returns an Array of Symbols.
def stem(data)
  # Lazy quantifier: strip each {% ... %} tag on its own. A greedy match
  # would also swallow the text between the first and the last tag on a line.
  text = data.to_s.gsub(/\{%.+?%\}/, ' ')
  tokens = text.scan(/\w+/).map(&:downcase)

  @stopwords_filter.filter(tokens)
                   .map(&:stem)
                   .select { |word| word.length > 1 }
                   .map(&:to_sym)
end
term_frequency()
click to toggle source
# File lib/jekyll-tfidf-related-posts.rb, line 101
# Converts the raw counts from bag_of_words into weighted term frequencies:
# every document row is multiplied by @weights and then divided by that
# document's entry in @max.
#
# Returns the modified count matrix.
def term_frequency
  frequencies = bag_of_words

  (0...frequencies.rows).each do |row|
    weighted = frequencies[row, 0..-1] * @weights
    frequencies[row, 0..-1] = weighted
    frequencies[row, 0..-1] = frequencies[row, 0..-1] / @max[row]
  end

  frequencies
end
tfidf()
click to toggle source
# File lib/jekyll-tfidf-related-posts.rb, line 136
# Combines term_frequency and inverse_document_frequency: every document row
# of the term-frequency matrix is multiplied by the IDF row.
#
# Returns the modified term-frequency matrix.
def tfidf
  weighted = term_frequency
  idf_row = inverse_document_frequency

  (0...weighted.rows).each do |row|
    weighted[row, 0..-1] = weighted[row, 0..-1] * idf_row
  end

  weighted
end