class Datasets::SudachiSynonymDictionary
Public Class Methods
new()
click to toggle source
Calls superclass method
Datasets::Dataset::new
# File lib/datasets/sudachi-synonym-dictionary.rb, line 19
# Fills in the dataset metadata: id, name, URL, licenses and description.
# The description is wrapped in a lambda, so download_description runs
# only when that lambda is invoked.
def initialize
  super()
  metadata = @metadata
  metadata.id = "sudachi-synonym-dictionary"
  metadata.name = "Sudachi synonym dictionary"
  metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
  metadata.licenses = ["Apache-2.0"]
  metadata.description = -> { download_description }
end
Public Instance Methods
each() { |synonym| ... }
click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 32
# Yields one Synonym for every row read through open_data.
# Returns an enumerator for this method when no block is given.
def each
  unless block_given?
    return to_enum(__method__)
  end

  context = {}
  open_data do |csv|
    csv.each do |row|
      group_id = row[0]
      # When the group changes, reset the counter that is handed to
      # normalize_lexeme_id through the context hash.
      unless group_id == context[:group_id]
        context.update(group_id: group_id, counter: 0)
      end
      # Arguments are evaluated left to right, i.e. in column order.
      yield(Synonym.new(group_id,
                        row[1] == "1",
                        normalize_expansion_type(row[2]),
                        normalize_lexeme_id(row[3], context),
                        normalize_form_type(row[4]),
                        normalize_acronym_type(row[5]),
                        normalize_variant_type(row[6]),
                        normalize_categories(row[7]),
                        row[8]))
    end
  end
end
Private Instance Methods
download_description()
click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 79
# Returns the contents of "synonyms.md" under cache_dir_path, calling
# download first when that file does not exist yet.
def download_description
  cached = cache_dir_path + "synonyms.md"
  return cached.read if cached.exist?

  download(cached,
           "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md")
  cached.read
end
normalize_acronym_type(type)
click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 132
# Converts the raw acronym type column into a symbol.
# "0" and "" both map to :typical; any unlisted value raises Error.
def normalize_acronym_type(type)
  acronym_types = {
    "0" => :typical,
    "" => :typical,
    "1" => :alphabet,
    "2" => :others,
  }
  acronym_types.fetch(type) do
    raise Error, "unknown acronym type: #{type.inspect}"
  end
end
normalize_categories(categories)
click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 160
# Converts the raw categories column into an array of category names.
# "" yields nil; "(a/b)" yields ["a", "b"]; anything else raises Error.
def normalize_categories(categories)
  case categories
  when "" then nil
  when /\A\((.*)\)\z/ then Regexp.last_match(1).split("/")
  else raise Error, "invalid categories: #{categories.inspect}"
  end
end
normalize_expansion_type(type)
click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 88
# Converts the raw expansion type column into a symbol.
# "0" and "" both map to :always; any unlisted value raises Error.
def normalize_expansion_type(type)
  expansion_types = {
    "0" => :always,
    "" => :always,
    "1" => :expanded,
    "2" => :never,
  }
  expansion_types.fetch(type) do
    raise Error, "unknown expansion type: #{type.inspect}"
  end
end
normalize_form_type(type)
click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 115
# Converts the raw form type column into a symbol.
# "0" and "" both map to :typical; any unlisted value raises Error.
def normalize_form_type(type)
  form_types = {
    "0" => :typical,
    "" => :typical,
    "1" => :translation,
    "2" => :alias,
    "3" => :old_name,
    "4" => :misnomer,
  }
  form_types.fetch(type) do
    raise Error, "unknown form type: #{type.inspect}"
  end
end
normalize_lexeme_id(id, context)
click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 101
# Converts the raw lexeme ID column into an Integer.
#
# id      - the raw column value.
# context - a Hash whose :counter entry is incremented in place when
#           the ID is blank.
#
# Returns the incremented counter for a blank ID, otherwise the first
# "/"-separated ID parsed as a base-10 Integer (Integer() raises
# ArgumentError on non-numeric text).
def normalize_lexeme_id(id, context)
  case id
  when ""
    # Use the +context+ parameter here: the caller's hash is not a
    # local of this method, so any other name raises NameError.
    context[:counter] += 1
    context[:counter]
  else
    # NOTE(review): CSV yields nil (not "") for unquoted empty fields
    # by default; a nil id reaches this branch and fails on #split --
    # confirm whether nil should be treated like "".
    #
    # Use only the first lexeme ID.
    # Example:
    #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
    #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
    Integer(id.split("/").first, 10)
  end
end
normalize_variant_type(type)
click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 145
# Converts the raw variant type column into a symbol.
# "0" and "" both map to :typical; any unlisted value raises Error.
def normalize_variant_type(type)
  variant_types = {
    "0" => :typical,
    "" => :typical,
    "1" => :alphabet,
    "2" => :general,
    "3" => :misspelled,
  }
  variant_types.fetch(type) do
    raise Error, "unknown variant type: #{type.inspect}"
  end
end
open_data() { |csv| ... }
click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 66
# Opens "synonyms.txt" under cache_dir_path as UTF-8 CSV, skipping
# blank lines, and yields the CSV object to the caller's block.
# Calls download first when the file does not exist yet.
def open_data
  path = cache_dir_path + "synonyms.txt"
  download(path, "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt") unless path.exist?
  CSV.open(path, encoding: "UTF-8", skip_blanks: true) { |csv| yield(csv) }
end