class Ting::HanyuPinyinParser

Public Instance Methods

all_syllables() click to toggle source
# File lib/ting/hanyu_pinyin_parser.rb, line 13
# Every syllable known to Ting, passed through hanyu_writer and ordered from
# longest to shortest. The list is computed on first use and cached afterwards.
def all_syllables
  return @all_syllables if @all_syllables

  written = Ting.all_syllables.map(&hanyu_writer)
  # Longest first, so that an alternation built from this list tries the
  # longest candidate before any shorter one.
  @all_syllables = written.sort_by(&:length).reverse
end
call(pinyin)
Alias for: parse
consonant_syllables() click to toggle source
# File lib/ting/hanyu_pinyin_parser.rb, line 17
# The subset of all_syllables whose first letter is a consonant (matched
# case-insensitively), in the same order as all_syllables. Cached after the
# first call.
def consonant_syllables
  return @consonant_syllables if @consonant_syllables

  @consonant_syllables = all_syllables.select do |syllable|
    syllable =~ /^[bcdfghjklmnpqrstwxyz]/i
  end
end
hanyu_reader() click to toggle source
# File lib/ting/hanyu_pinyin_parser.rb, line 9
# The object returned by Ting.reader(:hanyu, :accents), created on first use
# and reused on every later call.
def hanyu_reader
  return @hanyu_reader if @hanyu_reader

  @hanyu_reader = Ting.reader(:hanyu, :accents)
end
hanyu_writer() click to toggle source
# File lib/ting/hanyu_pinyin_parser.rb, line 5
# The object returned by Ting.writer(:hanyu, :accents), created on first use
# and reused on every later call.
def hanyu_writer
  return @hanyu_writer if @hanyu_writer

  @hanyu_writer = Ting.writer(:hanyu, :accents)
end
parse(pinyin) click to toggle source
# File lib/ting/hanyu_pinyin_parser.rb, line 64
# Parses a string of pinyin text.
#
# The input is lowercased, cut into clusters wherever pinyin_separator_regexp
# matches, each cluster is broken into syllables by parse_cluster, and every
# syllable is then handed to hanyu_reader. The reader results are flattened
# one level into a single array, which is returned.
#
# Errors raised by parse_cluster for an unparseable cluster propagate to the
# caller.
def parse(pinyin)
  # hanyu_reader cannot parse uppercase pinyin.
  lowered = pinyin.downcase

  syllables = lowered.split(pinyin_separator_regexp).flat_map do |cluster|
    parse_cluster(cluster)
  end

  syllables.flat_map(&hanyu_reader)
end
Also aliased as: call
parse_cluster(pinyin) click to toggle source
# File lib/ting/hanyu_pinyin_parser.rb, line 41
# Splits one cluster (a run of pinyin letters without separators) into its
# syllables.
#
# Returns the syllables as an array of strings in reading order. When the
# cluster ends in an erhua 'r', an 'er' entry follows the syllable it was
# attached to. An empty cluster yields an empty array.
#
# Raises ArgumentError when the text cannot be fully consumed.
def parse_cluster(pinyin)
  remaining = pinyin
  collected = [] # filled back to front, flipped just before returning

  # A capture group inside a repetition only remembers its *last* iteration,
  # so the same expression is applied over and over, peeling one syllable off
  # the end of the text each time until nothing matches any more.
  loop do
    found = pinyin_regexp.match(remaining)
    break unless found

    # A trailing 'r' can only be captured when everything before it was
    # consumed as syllables, which means the cluster uses erhua.
    if found[3] == 'r'
      collected.push('er')
      remaining = remaining.chop
    end

    # Group 2 holds the final repeated syllable; if there was no repetition
    # the only syllable is the one in group 1.
    tail = found[2] || found[1]
    collected.push(tail)
    remaining = remaining[0, remaining.length - tail.length]
  end

  raise ArgumentError, "Unparseable pinyin fragment encountered: #{remaining}" unless remaining.empty?

  collected.reverse
end
pinyin_regexp() click to toggle source
# File lib/ting/hanyu_pinyin_parser.rb, line 21
# Builds (once) and returns the regular expression that matches one complete
# cluster of pinyin, i.e. an uninterrupted string of pinyin characters without
# punctuation.
#
# Capture groups, as consumed by parse_cluster:
#   1 - the first syllable of the cluster
#   2 - the last of the syllables following the first (nil if there are none);
#       being inside a repetition, only the final iteration is retained
#   3 - a trailing erhua "r" (nil if absent)
#
# The expression is anchored with \A and \z so that it only succeeds when the
# match ends at the very last character. \Z would also succeed just before a
# trailing newline, which breaks parse_cluster's assumption that the captured
# syllable is the literal tail of the string it slices.
def pinyin_regexp
  @pinyin_cluster_regexp ||= /\A
    # Every syllable can appear at the start of a cluster.
    (#{Regexp.union(all_syllables)})
    # However, only syllables starting with a consonant can follow, as syllables starting with a vowel have to
    # be prefixed with an apostrophe.
    # Since it is common to omit the apostrophe when there is no ambiguity, also allow syllables starting with
    # a vowel after all letters except n and g, and after -ong, since -on does not appear at the end of a valid
    # syllable.
    (#{Regexp.union(consonant_syllables)}|(?<=[^ng]|[ōóǒòo]ng)#{Regexp.union(all_syllables)})*
    (r)?
    \z/x
end
pinyin_separator_regexp() click to toggle source
# File lib/ting/hanyu_pinyin_parser.rb, line 36
# A regular expression matching one or more consecutive characters that can
# *not* appear in pinyin. The set of pinyin characters is taken from the
# lowercased letters of all_syllables. Built on first use and cached.
def pinyin_separator_regexp
  return @pinyin_separator_regexp if @pinyin_separator_regexp

  # Distinct lowercase characters occurring in any syllable, in sorted order.
  alphabet = all_syllables.join.downcase.chars.uniq.sort.join
  @pinyin_separator_regexp = Regexp.new("[^#{alphabet}]+")
end