class Datasets::SudachiSynonymDictionary

Public Class Methods

new() click to toggle source
Calls superclass method Datasets::Dataset::new
# File lib/datasets/sudachi-synonym-dictionary.rb, line 19
# Fills in the dataset metadata (id, name, URL, licenses, description).
#
# The description is stored as a lambda wrapping download_description, so
# the description text is only read when that lambda is called.
def initialize
  super()
  @metadata.id = "sudachi-synonym-dictionary"
  @metadata.name = "Sudachi synonym dictionary"
  @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
  @metadata.licenses = ["Apache-2.0"]
  @metadata.description = -> { download_description }
end

Public Instance Methods

each() { |synonym| ... } click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 32
# Walks every row of the synonym data and yields one Synonym per row.
#
# Returns an Enumerator over the same sequence when no block is given.
def each
  return to_enum(__method__) unless block_given?

  # Remembers the group ID of the previous row plus a per-group counter.
  # The counter restarts at 0 each time a row belongs to a different group,
  # and the whole hash is handed to normalize_lexeme_id.
  context = {}
  open_data do |csv|
    csv.each do |row|
      group_id = row[0]
      unless group_id == context[:group_id]
        context[:group_id] = group_id
        context[:counter] = 0
      end
      # Arguments are evaluated left to right, i.e. in column order.
      yield(Synonym.new(group_id,
                        row[1] == "1",
                        normalize_expansion_type(row[2]),
                        normalize_lexeme_id(row[3], context),
                        normalize_form_type(row[4]),
                        normalize_acronym_type(row[5]),
                        normalize_variant_type(row[6]),
                        normalize_categories(row[7]),
                        row[8]))
    end
  end
end

Private Instance Methods

download_description() click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 79
# Returns the contents of synonyms.md from the cache directory.
#
# The file is downloaded first when it is not already present there.
def download_description
  path = cache_dir_path + "synonyms.md"
  download(path, "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md") unless path.exist?
  path.read
end
normalize_acronym_type(type) click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 132
# Maps the raw acronym-type field to a symbol.
#
# "0" or "" => :typical, "1" => :alphabet, "2" => :others.
# Raises Error for any other value.
def normalize_acronym_type(type)
  case type
  when "0", "" then :typical
  when "1" then :alphabet
  when "2" then :others
  else
    raise Error, "unknown acronym type: #{type.inspect}"
  end
end
normalize_categories(categories) click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 160
# Parses the raw categories field.
#
# An empty string yields nil. A value wrapped in parentheses yields the
# text between them split on "/". Anything else raises Error.
def normalize_categories(categories)
  case categories
  when "" then nil
  when /\A\((.*)\)\z/ then Regexp.last_match(1).split("/")
  else
    raise Error, "invalid categories: #{categories.inspect}"
  end
end
normalize_expansion_type(type) click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 88
# Maps the raw expansion-type field to a symbol.
#
# "0" or "" => :always, "1" => :expanded, "2" => :never.
# Raises Error for any other value.
def normalize_expansion_type(type)
  case type
  when "0", "" then :always
  when "1" then :expanded
  when "2" then :never
  else
    raise Error, "unknown expansion type: #{type.inspect}"
  end
end
normalize_form_type(type) click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 115
# Maps the raw form-type field to a symbol.
#
# "0" or "" => :typical, "1" => :translation, "2" => :alias,
# "3" => :old_name, "4" => :misnomer.
# Raises Error for any other value.
def normalize_form_type(type)
  case type
  when "0", "" then :typical
  when "1" then :translation
  when "2" then :alias
  when "3" then :old_name
  when "4" then :misnomer
  else
    raise Error, "unknown form type: #{type.inspect}"
  end
end
normalize_lexeme_id(id, context) click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 101
# Resolves the lexeme ID of a row to an Integer.
#
# id      - the raw lexeme ID field.
# context - Hash holding a :counter entry. When id is the empty string the
#           counter is incremented in place and its new value is returned.
#
# Raises ArgumentError (from Integer()) when a non-empty id does not start
# with a base-10 number.
def normalize_lexeme_id(id, context)
  case id
  when ""
    # Must go through the `context` parameter: no local or method named
    # lexeme_id_context exists in this scope, so referring to it raised
    # NameError on every row with an empty lexeme ID.
    context[:counter] += 1
  else
    # Use only the first lexeme ID.
    # Example:
    #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
    #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
    Integer(id.split("/").first, 10)
  end
end
normalize_variant_type(type) click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 145
# Maps the raw variant-type field to a symbol.
#
# "0" or "" => :typical, "1" => :alphabet, "2" => :general,
# "3" => :misspelled.
# Raises Error for any other value.
def normalize_variant_type(type)
  case type
  when "0", "" then :typical
  when "1" then :alphabet
  when "2" then :general
  when "3" then :misspelled
  else
    raise Error, "unknown variant type: #{type.inspect}"
  end
end
open_data() { |csv| ... } click to toggle source
# File lib/datasets/sudachi-synonym-dictionary.rb, line 66
# Opens synonyms.txt from the cache directory as UTF-8 CSV (blank lines
# skipped) and yields the CSV object to the block.
#
# The file is downloaded first when it is not already present there.
# Returns whatever the block returns.
def open_data
  path = cache_dir_path + "synonyms.txt"
  download(path, "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt") unless path.exist?
  CSV.open(path, encoding: "UTF-8", skip_blanks: true) { |csv| yield(csv) }
end