class Datte::Train

Constants

FNAME

Public Class Methods

new(path = FNAME) click to toggle source

SEE: https://qiita.com/Hironsan/items/326b66711eb4196aa9d4

# File lib/datte/train.rb, line 7
# Debug-style constructor: reads the corpus, then prints the first sentence,
# a '===' separator, and the features computed for that sentence.
#
# @param path [String] corpus location.
#   NOTE(review): this argument is currently unused here -- corpus_read is
#   called without it; confirm whether the path should be honoured.
def initialize(path = FNAME)
  # Parse the corpus once and reuse the result (it used to be read twice).
  train_sents = corpus_read
  x = train_sents[0]
  p x
  p '==='
  p sent2features(x)
end

Public Instance Methods

train() click to toggle source
# File lib/datte/train.rb, line 41
# Placeholder for model training. There is no implementation yet, so the
# method does nothing and returns nil.
def train
  nil
end
x_test() click to toggle source
# File lib/datte/train.rb, line 29
# Builds the feature sequences for every test sentence.
#
# A `return` inside `each` previously exited on the first sentence, so only
# one sentence's features were ever produced.
#
# NOTE(review): `test_sents` is not defined in the visible source;
# presumably an accessor defined elsewhere -- confirm.
#
# @return [Array] one sent2features result per sentence in test_sents
def x_test
  test_sents.map { |s| sent2features(s) }
end
x_train() click to toggle source
# File lib/datte/train.rb, line 17
# Builds the feature sequences for every training sentence.
#
# A `return` inside `each` previously exited on the first sentence, so only
# one sentence's features were ever produced.
#
# NOTE(review): `train_sents` is not defined as a method or instance
# variable in the visible source; presumably an accessor -- confirm.
#
# @return [Array] one sent2features result per sentence in train_sents
def x_train
  train_sents.map { |s| sent2features(s) }
end
y_test() click to toggle source
# File lib/datte/train.rb, line 35
# Builds the label sequences for every test sentence.
#
# A `return` inside `each` previously exited on the first sentence, so only
# one sentence's labels were ever produced.
#
# NOTE(review): `test_sents` is not defined in the visible source;
# presumably an accessor defined elsewhere -- confirm.
#
# @return [Array] one sent2labels result per sentence in test_sents
def y_test
  test_sents.map { |s| sent2labels(s) }
end
y_train() click to toggle source
# File lib/datte/train.rb, line 23
# Builds the label sequences for every training sentence.
#
# A `return` inside `each` previously exited on the first sentence, so only
# one sentence's labels were ever produced.
#
# NOTE(review): `train_sents` is not defined as a method or instance
# variable in the visible source; presumably an accessor -- confirm.
#
# @return [Array] one sent2labels result per sentence in train_sents
def y_train
  train_sents.map { |s| sent2labels(s) }
end

Private Instance Methods

chara_type(ch) click to toggle source
# File lib/datte/train.rb, line 73
# Maps a character to a character-class tag.
#
# The predicates are tried in a fixed order and the first one that holds
# decides the tag; 'OTHER' is returned when none of them match.
#
# @param ch [String] the character to classify
# @return [String] 'ZSPACE', 'ZDIGIT', 'ZLLET', 'ZULET', 'HIRAG', 'KATAK'
#   or 'OTHER'
def chara_type(ch)
  return 'ZSPACE' if space?(ch)
  return 'ZDIGIT' if integer?(ch)
  return 'ZLLET' if lower?(ch)
  return 'ZULET' if upper?(ch)
  return 'HIRAG' if hiragana?(ch)
  return 'KATAK' if katakana?(ch)

  'OTHER'
end
chara_types(str) click to toggle source
# File lib/datte/train.rb, line 84
# Summarises which character classes occur in a string.
#
# Each character is classified with chara_type; the distinct tags are then
# sorted and joined with '-'.
#
# @param str [String] the text to inspect
# @return [String] sorted, de-duplicated tags joined by '-' ('' for an
#   empty string)
def chara_types(str)
  tags = str.each_char.map { |ch| chara_type(ch) }
  tags.uniq.sort.join('-')
end
corpus_read() click to toggle source
# File lib/datte/train.rb, line 185
# Reads a corpus file into a list of sentences.
#
# Every non-blank line is split on whitespace into an array of fields (one
# morpheme); blank lines separate sentences.
#
# Differences from the earlier implementation:
# * the final sentence is kept even when the file does not end with a
#   blank line;
# * whitespace-only and CRLF separator lines count as blank;
# * consecutive blank lines no longer yield empty sentences;
# * the file to read can be passed in (defaults to FNAME as before).
#
# @param path [String] file to read
# @return [Array<Array<Array<String>>>] sentences -> morphemes -> fields
def corpus_read(path = FNAME)
  sents = []
  sent = []

  File.foreach(path) do |line|
    fields = line.split(' ')
    if fields.empty?
      # Sentence boundary: flush whatever has been collected so far.
      sents.push(sent) unless sent.empty?
      sent = []
    else
      sent.push(fields)
    end
  end

  # Flush the trailing sentence when the file lacks a final blank line.
  sents.push(sent) unless sent.empty?
  sents
end
extract_pos(morph) click to toggle source
# File lib/datte/train.rb, line 91
# Joins the fields that sit between the first field and the first '*'
# field of a morpheme with '-'.
#
# @param morph [Array<String>] morpheme fields; index 0 is skipped
# @return [String] fields 1 up to (not including) the first '*', joined
#   by '-'
def extract_pos(morph)
  star_at = morph.index('*')
  pos_fields = morph.slice(1, star_at - 1)
  pos_fields.join('-')
end
hiragana?(ch) click to toggle source
# File lib/datte/train.rb, line 45
# Tells whether the code point of +ch+ (as given by String#ord) lies in
# U+3040..U+309F, the Hiragana range.
#
# @param ch [String] the character to test
# @return [Boolean]
def hiragana?(ch)
  (0x3040..0x309F).cover?(ch.ord)
end
integer?(ch) click to toggle source
# File lib/datte/train.rb, line 57
# Tells whether +ch+ can be parsed by Kernel#Integer.
#
# The conversion used to be attempted twice in a row; a single attempt is
# enough to decide.
#
# @param ch [String] the text to test
# @return [Boolean] true when Integer(ch) succeeds, false when it raises
#   ArgumentError
def integer?(ch)
  Integer(ch)
  true
rescue ArgumentError
  false
end
katakana(ch) click to toggle source
# File lib/datte/train.rb, line 49
# Tells whether the code point of +ch+ (as given by String#ord) lies in
# U+30A0..U+30FF, the Katakana range.
#
# chara_type calls this predicate as `katakana?`, so it is defined under
# that name; the old name `katakana` is kept as an alias for compatibility.
#
# @param ch [String] the character to test
# @return [Boolean]
def katakana?(ch)
  ch.ord.between?(0x30A0, 0x30FF)
end
alias katakana katakana?
lower?(ch) click to toggle source
# File lib/datte/train.rb, line 65
# Tells whether +ch+ is lower-case text.
#
# Comparing with `downcase` alone is also true for characters that have no
# case at all (digits, kana, punctuation), which made chara_type tag them
# as lower-case letters. The text must therefore also change under
# `upcase`, i.e. contain at least one cased character.
#
# @param ch [String] the text to test
# @return [Boolean]
def lower?(ch)
  ch == ch.downcase && ch != ch.upcase
end
sent2features(sent) click to toggle source
# File lib/datte/train.rb, line 167
# Builds the feature list for every position of a sentence.
#
# Two defects are fixed: the index range used to be inclusive of
# sent.length (one past the last element), and a `return` inside the block
# made the method hand back only the first word's features.
#
# @param sent [Array] the sentence, one entry per morpheme
# @return [Array] word2features(sent, i) for each valid index i ([] for an
#   empty sentence)
def sent2features(sent)
  sent.each_index.map { |i| word2features(sent, i) }
end
sent2labels(sent) click to toggle source
# File lib/datte/train.rb, line 173
# Collects the last field of each morpheme in a sentence.
#
# @param sent [Array<Array>] the sentence, one array of fields per morpheme
# @return [Array] the final element of every morpheme
def sent2labels(sent)
  sent.map(&:last)
end
sent2tokens(sent) click to toggle source
# File lib/datte/train.rb, line 179
# Collects the first field of each morpheme in a sentence.
#
# @param sent [Array<Array>] the sentence, one array of fields per morpheme
# @return [Array] the leading element of every morpheme
def sent2tokens(sent)
  sent.map(&:first)
end
space?(ch) click to toggle source
# File lib/datte/train.rb, line 53
# Tells whether +ch+ matches /^\s*$/, i.e. contains a line made up only of
# whitespace (an empty string matches as well).
#
# @param ch [String] the text to test
# @return [Boolean]
def space?(ch)
  /^\s*$/.match?(ch)
end
upper?(ch) click to toggle source
# File lib/datte/train.rb, line 69
# Tells whether +ch+ is upper-case text.
#
# Comparing with `upcase` alone is also true for characters that have no
# case at all (digits, kana, punctuation). The text must therefore also
# change under `downcase`, i.e. contain at least one cased character.
#
# @param ch [String] the text to test
# @return [Boolean]
def upper?(ch)
  ch == ch.upcase && ch != ch.downcase
end
word2features(sent, i) click to toggle source
# File lib/datte/train.rb, line 96
# Builds the list of feature strings for the morpheme at index +i+.
#
# The list starts with 'bias' plus the word, character-type and POS
# features of the current morpheme, followed by context features in this
# order: two back, one back, one ahead, two ahead. Backward context also
# carries the neighbour's last field as an iobtag feature. When a backward
# neighbour does not exist 'BOS' is appended in its place, and 'EOS' for a
# missing forward neighbour.
#
# @param sent [Array<Array<String>>] the sentence, one array of fields per
#   morpheme (index 0 is the word, the last field is used as the tag)
# @param i [Integer] position of the morpheme to describe
# @return [Array<String>] the feature strings
def word2features(sent, i)
  current = sent[i]
  token = current[0]
  token_type = chara_types(token)
  token_pos = extract_pos(current)

  features = ['bias', 'word=' + token, 'type=' + token_type, 'pos_tag=' + token_pos]

  # Two morphemes back.
  if i < 2
    features << 'BOS'
  else
    back2 = sent[i - 2]
    back2_word = back2[0]
    back2_type = chara_types(back2_word)
    back2_pos = extract_pos(back2)
    back2_tag = back2[-1]
    features.concat(
      [
        '-2:word=' + back2_word,
        '-2:type=' + back2_type,
        '-2:postag=' + back2_pos,
        '-2:iobtag=' + back2_tag
      ]
    )
  end

  # One morpheme back.
  if i < 1
    features << 'BOS'
  else
    back1 = sent[i - 1]
    back1_word = back1[0]
    back1_type = chara_types(back1_word)
    back1_pos = extract_pos(back1)
    back1_tag = back1[-1]
    features.concat(
      [
        '-1:word=' + back1_word,
        '-1:type=' + back1_type,
        '-1:postag=' + back1_pos,
        '-1:iobtag=' + back1_tag
      ]
    )
  end

  # One morpheme ahead.
  if i < sent.length - 1
    ahead1 = sent[i + 1]
    ahead1_word = ahead1[0]
    ahead1_type = chara_types(ahead1_word)
    ahead1_pos = extract_pos(ahead1)
    features.concat(
      [
        '+1:word=' + ahead1_word,
        '+1:type=' + ahead1_type,
        '+1:postag=' + ahead1_pos
      ]
    )
  else
    features << 'EOS'
  end

  # Two morphemes ahead.
  if i < sent.length - 2
    ahead2 = sent[i + 2]
    ahead2_word = ahead2[0]
    ahead2_type = chara_types(ahead2_word)
    ahead2_pos = extract_pos(ahead2)
    features.concat(
      [
        '+2:word=' + ahead2_word,
        '+2:type=' + ahead2_type,
        '+2:postag=' + ahead2_pos
      ]
    )
  else
    features << 'EOS'
  end

  features
end