class RakeText

Public Class Methods

FOX() click to toggle source
# File lib/rake_text.rb, line 10
# Accessor for the class variable @@stoplist_fox.
# NOTE(review): presumably the "Fox" stop word list; the variable is assigned
# outside this excerpt - confirm.
def self.FOX
        @@stoplist_fox
end
SMART() click to toggle source
# File lib/rake_text.rb, line 6
# Accessor for the class variable @@stoplist_smart.
# NOTE(review): presumably the "SMART" stop word list; the variable is assigned
# outside this excerpt - confirm.
def self.SMART
        @@stoplist_smart
end
new() click to toggle source
# File lib/rake_text.rb, line 14
# Constructor. The body is intentionally empty: it takes no arguments and
# sets up no instance state.
def initialize
end

Public Instance Methods

analyse(text, stoplist, verbose=false) click to toggle source
# File lib/rake_text.rb, line 17
# Runs the whole keyword-extraction pipeline over +text+.
#
# text     - String to analyse.
# stoplist - stop words, handed unchanged to buildStopwordRegExPattern.
# verbose  - only when exactly +true+: prints one "score - phrase" line per
#            candidate to standard output, highest score first.
#
# Returns the phrase => score Hash produced by generateCandidateKeywordScores.
def analyse(text, stoplist, verbose = false)
        stop_pattern = buildStopwordRegExPattern(stoplist)

        # Cut the text into fragments at punctuation.
        # NOTE(review): inside the character class "\\-\\" is a range from
        # backslash to backslash, so a backslash is a delimiter here but a
        # plain hyphen is not - confirm that this is intended.
        fragments = text.split(/[.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]/u)

        keyword_phrases = generateCandidateKeywords(fragments, stop_pattern)
        per_word_scores = calculateWordScores(keyword_phrases)
        candidates      = generateCandidateKeywordScores(keyword_phrases, per_word_scores)

        if verbose == true
                # Ascending sort walked backwards gives the highest score first.
                candidates.sort_by { |_phrase, value| value }.reverse_each do |phrase, value|
                        puts format('%.2f - %s', value, phrase)
                end
        end

        candidates
end

Private Instance Methods

buildStopwordRegExPattern(words) click to toggle source

Step 1: create the stop-word regular expression pattern.

# File lib/rake_text.rb, line 38
# Builds one case-insensitive regular expression that matches any of the
# given stop words as a whole word (each word is wrapped in \b ... \b).
#
# words - enumerable of stop words; every entry is converted with to_s.
#
# Each word is passed through Regexp.escape so that characters such as "."
# or "+" are matched literally instead of being read as regex syntax (or
# producing an invalid pattern). Blank entries are skipped because "\b\b"
# would match at every word boundary. When no usable word is left, a pattern
# that never matches is returned; an empty pattern would instead match at
# every position of the input.
#
# Returns a Regexp.
def buildStopwordRegExPattern(words)
        usable = words.map(&:to_s).reject { |word| word.strip.empty? }
        alternatives = usable.map { |word| "\\b#{Regexp.escape(word)}\\b" }

        # "(?!)" is an empty negative lookahead: it can never succeed.
        return Regexp.new('(?!)', Regexp::IGNORECASE) if alternatives.empty?

        Regexp.new(alternatives.join('|'), Regexp::IGNORECASE)
end
calculateWordScores(phrases) click to toggle source

Step 3: calculate the individual word scores.

# File lib/rake_text.rb, line 69
# Scores every word that occurs in +phrases+.
#
# phrases - Array of candidate phrase Strings; each one is tokenised with
#           seperateWords.
#
# For each word two totals are collected: how often it occurs, and the sum
# of (phrase length - 1) over all its occurrences. The score is
# (that sum + occurrences) / occurrences, always as a Float.
#
# Returns a Hash word => score whose default value for unknown words is 0.
def calculateWordScores(phrases)
        occurrences = Hash.new(0)
        neighbours  = Hash.new(0)

        phrases.each do |phrase|
                tokens = seperateWords(phrase)
                others = tokens.length - 1

                tokens.each do |token|
                        occurrences[token] += 1
                        neighbours[token]  += others
                end
        end

        occurrences.each_with_object(Hash.new(0)) do |(token, count), scores|
                # The word counts towards its own degree once per occurrence.
                scores[token] = (neighbours[token] + count) / (count * 1.0)
        end
end
generateCandidateKeywordScores(phrases, scores) click to toggle source

Step 4: generate the candidate keyword scores.

# File lib/rake_text.rb, line 99
# Scores each candidate phrase as the sum of the scores of its words.
#
# phrases - Array of candidate phrase Strings; each one is tokenised with
#           seperateWords.
# scores  - word => score lookup (indexed with []).
#
# A phrase that occurs more than once ends up with a single entry.
#
# Returns a Hash phrase => score whose default value for unknown phrases is 0.
def generateCandidateKeywordScores(phrases, scores)
        phrases.each_with_object(Hash.new(0)) do |phrase, totals|
                # Plain left-to-right addition starting from the Integer 0.
                totals[phrase] = seperateWords(phrase).inject(0) do |running, token|
                        running + scores[token]
                end
        end
end
generateCandidateKeywords(sentences, pattern) click to toggle source

Step 2: generate the candidate keywords.

# File lib/rake_text.rb, line 48
# Breaks sentence fragments into candidate phrases at stop words.
#
# sentences - Array of Strings.
# pattern   - Regexp (or String) matching the stop words; every match is
#             replaced by "|" and the fragment is then split on "|".
#
# Each resulting piece is stripped and lower-cased; empty pieces are dropped.
#
# Returns an Array of phrase Strings in order of appearance.
def generateCandidateKeywords(sentences, pattern)
        sentences.flat_map do |sentence|
                marked = sentence.strip.gsub(pattern, "|")

                marked.split("|")
                      .map { |piece| piece.strip.downcase }
                      .reject(&:empty?)
        end
end
seperateWords(text) click to toggle source
# File lib/rake_text.rb, line 114
# Splits +text+ into lower-cased word tokens.
#
# text - String to tokenise.
#
# The text is split at every character outside the set: ASCII letters,
# digits, "_", "+", "-" and backslash. Empty tokens and tokens that
# Kernel#Float accepts as a number are dropped.
#
# Returns an Array of Strings.
def seperateWords(text)
        text.split(/[^a-zA-Z0-9_\\+\\-]/).each_with_object([]) do |fragment, tokens|
                token = fragment.strip.downcase
                next if token.empty?

                # Float() raises for anything that is not a numeric literal.
                numeric = begin
                        Float(token)
                        true
                rescue StandardError
                        false
                end

                tokens << token unless numeric
        end
end