module Dopeness
sentenceを解析してchunkの配列を返す
sentenceを解析して配列を返す
Constants
- VERSION
Public Class Methods
dope(verse)
click to toggle source
# File lib/dopeness.rb, line 10 def self.dope(verse) threshold = 0.0 surfaces = parse_surfaces(verse) pronuncitaitons = parse_pronunciation(verse) vowels = [] pronuncitaitons.each do |pronuncitaiton| vowels.push(parse_vowels(pronuncitaiton.to_roman)) end chunk_features = {} (0...vowels.size).each do |i| once_chunk_hash = {} max = i + 10 if vowels.size < max max = vowels.size end (i...max).each do |j| # 母音の3-gram類似度 # 母音のLevenshtein距離 # 発音のLevenshtein距離 # 対象までの物理的距離 からスコアを計算する trigram_evaluation = Trigram.compare(vowels[i], vowels[j]) if trigram_evaluation.nan? trigram_evaluation = 0 end vowels_distance = 1 - Levenshtein.normalized_distance(vowels[i], vowels[j]) pronuncitaitons_distance = 1 - Levenshtein.normalized_distance(pronuncitaitons[i], pronuncitaitons[j]) physical_distance = 1 - ((j - i) / 10) matching_score = trigram_evaluation + vowels_distance + pronuncitaitons_distance + physical_distance once_chunk_hash.store(j, matching_score) end sorted_score = Hash[once_chunk_hash.sort_by{ |_, v| -v }] chunk_features.store(i, sorted_score) end return surfaces, chunk_features end
parse_pronunciation(sentence)
click to toggle source
# File lib/dopeness/parse_pronunciation.rb, line 8 def parse_pronunciation(sentence) parser = Parser.new; tree = parser.parse(sentence) tree.set_output_layer(OUTPUT_RAW_SENTENCE) chunks = [] (0 ... tree.chunk_size).each do |i| chunk = tree.chunk(i) x = (0 ... chunk.token_size).map do |j| if tree.token(chunk.token_pos + j).feature_list(tree.token(chunk.token_pos + j).feature_list_size - 1).force_encoding("UTF-8") != "*" then tree.token(chunk.token_pos + j).feature_list(tree.token(chunk.token_pos + j).feature_list_size - 1).force_encoding("UTF-8") else tree.token(chunk.token_pos).surface.force_encoding("UTF-8") end end.join("") chunks.push(x) end return chunks end
parse_surfaces(sentence)
click to toggle source
# File lib/dopeness/parse_surfaces.rb, line 8 def parse_surfaces(sentence) parser = Parser.new; tree = parser.parse(sentence) tree.set_output_layer(OUTPUT_RAW_SENTENCE) surfaces = [] (0 ... tree.chunk_size).each do |i| chunk = tree.chunk(i) x = (0 ... chunk.token_size).map do |j| surface = tree.token(chunk.token_pos + j).normalized_surface.force_encoding("UTF-8") if surface != "*" then if surface != "。" surface end else surface = tree.token(chunk.token_pos).normalized_surface.force_encoding("UTF-8") if surface != "。" surface end end end.join("") surfaces.push(x) end return surfaces end
parse_vowels(str)
click to toggle source
# File lib/dopeness/parse_vowels.rb, line 4 def parse_vowels(str) vowel = ["a", "i", "u", "e", "o"] rhyme = "" str.each_char do |ch| if vowel.include?(ch) rhyme += ch end end return rhyme end
Private Instance Methods
parse_pronunciation(sentence)
click to toggle source
# File lib/dopeness/parse_pronunciation.rb, line 8 def parse_pronunciation(sentence) parser = Parser.new; tree = parser.parse(sentence) tree.set_output_layer(OUTPUT_RAW_SENTENCE) chunks = [] (0 ... tree.chunk_size).each do |i| chunk = tree.chunk(i) x = (0 ... chunk.token_size).map do |j| if tree.token(chunk.token_pos + j).feature_list(tree.token(chunk.token_pos + j).feature_list_size - 1).force_encoding("UTF-8") != "*" then tree.token(chunk.token_pos + j).feature_list(tree.token(chunk.token_pos + j).feature_list_size - 1).force_encoding("UTF-8") else tree.token(chunk.token_pos).surface.force_encoding("UTF-8") end end.join("") chunks.push(x) end return chunks end
parse_surfaces(sentence)
click to toggle source
# File lib/dopeness/parse_surfaces.rb, line 8 def parse_surfaces(sentence) parser = Parser.new; tree = parser.parse(sentence) tree.set_output_layer(OUTPUT_RAW_SENTENCE) surfaces = [] (0 ... tree.chunk_size).each do |i| chunk = tree.chunk(i) x = (0 ... chunk.token_size).map do |j| surface = tree.token(chunk.token_pos + j).normalized_surface.force_encoding("UTF-8") if surface != "*" then if surface != "。" surface end else surface = tree.token(chunk.token_pos).normalized_surface.force_encoding("UTF-8") if surface != "。" surface end end end.join("") surfaces.push(x) end return surfaces end
parse_vowels(str)
click to toggle source
# File lib/dopeness/parse_vowels.rb, line 4 def parse_vowels(str) vowel = ["a", "i", "u", "e", "o"] rhyme = "" str.each_char do |ch| if vowel.include?(ch) rhyme += ch end end return rhyme end