class Genebrand::PosParser

Public Class Methods

new() click to toggle source

Fills parts of speech table

# File lib/genebrand/posparser.rb, line 7
# Builds the part-of-speech lookup tables.
#
# @parsed maps a part-of-speech name to the list of words collected for it.
# @table maps the one-letter tag to the very same list object, so a word
# pushed through @table shows up in @parsed as well.
def initialize
  @parsed = {}
  @table = {}

  [
    %w[N noun],         # nouns
    %w[P plur],         # plurals
    %w[V verb_part],    # verbs: participle
    %w[t verb_trans],   # verbs: transitive
    %w[i verb_intrans], # verbs: intransitive
    %w[A adj]           # adjectives
  ].each do |tag, name|
    @table[tag] = @parsed[name] = []
  end
end

Public Instance Methods

getparts(data) click to toggle source
# File lib/genebrand/posparser.rb, line 44
# Files one word under every part of speech listed for it.
#
# Each character of the tag string is looked up in @table; characters that
# have no entry there (for example a trailing newline) are ignored. The word
# is stored downcased.
#
# @param [Array<String>] data one input line split on tabs: data[0] is the
#   word, data[1] the string of one-letter part-of-speech tags
# @return [Array<String>] the tag characters that were examined; empty when
#   the word or the tag column is missing
def getparts(data)
  word, tags = data
  # A line without a tab has no tag column; skip it rather than fail with
  # NoMethodError on nil.
  return [] if word.nil? || tags.nil?

  tags.chars.each do |tag|
    @table[tag].push(word.downcase) if @table.key?(tag)
  end
end
is_numeric?(obj) click to toggle source
# File lib/genebrand/posparser.rb, line 120
# Tells whether the string form of +obj+ looks like a decimal number: an
# optional sign, one or more digits and an optional fractional part.
# The \Z anchor also tolerates a single trailing newline.
#
# @param [Object] obj value to test; converted with #to_s first
# @return [Boolean] true when the string matches, false otherwise
def is_numeric?(obj)
  text = obj.to_s
  if /\A[+-]?\d+?(\.\d+)?\Z/.match(text)
    true
  else
    false
  end
end
parse(filename) click to toggle source

Parses a file of tab-separated `word<TAB>partofspeech` lines

@param [String] filename file that should be parsed
@return [Hash] of partofspeech => words

# File lib/genebrand/posparser.rb, line 26
# Parses a tab-separated file of words and their part-of-speech tags.
#
# Every line is split on tabs and handed to getparts, which files the word
# into the part-of-speech tables.
#
# @param [String] filename file that should be parsed
# @return [Hash] the @parsed table (part of speech => words)
# @raise [RuntimeError] when +filename+ does not exist
def parse(filename)
  # init is defined outside this block; presumably it resets the tables
  # before a new run -- confirm against its definition.
  init

  raise "File not found: #{filename}" unless File.exist?(filename)

  puts 'Seeding'
  # File.foreach closes the file as soon as iteration finishes.
  File.foreach(filename) do |line|
    getparts(line.split("\t"))
  end

  @parsed
end
parse_top(filename, top) click to toggle source
# File lib/genebrand/posparser.rb, line 50
# Parses a tab-separated word file, keeping only the words that also appear
# in a "top" word list.
#
# The top list is read one word per line, stripped and downcased. Progress
# (the number of lines read so far) is printed every 10,000 lines.
#
# @param [String] filename tab-separated word/part-of-speech file
# @param [String] top file with one wanted word per line
# @return [Hash] the @parsed table (part of speech => words)
# @raise [RuntimeError] when +filename+ does not exist
def parse_top(filename, top)
  # init is defined outside this block; presumably it resets the tables
  # before a new run -- confirm against its definition.
  init

  raise "File not found: #{filename}" unless File.exist?(filename)

  puts 'Load top'
  toparr = File.readlines(top).map { |line| line.strip.downcase }
  puts toparr.count

  # A hash gives a constant-time membership test per input line instead of
  # scanning the whole top list for every word.
  wanted = {}
  toparr.each { |word| wanted[word] = true }

  puts 'Seeding'
  seen = 0
  # File.foreach closes the file as soon as iteration finishes.
  File.foreach(filename) do |line|
    data = line.split("\t")

    # NOTE(review): data[0] is compared as read, while the top list is
    # downcased, so capitalised entries never match -- confirm this is wanted.
    getparts(data) if wanted.key?(data[0])
    seen += 1
    puts seen if seen % 10_000 == 0
  end

  @parsed
end
parseandsave(filename, to) click to toggle source
# File lib/genebrand/posparser.rb, line 100
# Parses +filename+ and writes the resulting table to +to+ as JSON.
#
# @param [String] filename tab-separated word/part-of-speech file
# @param [String] to path of the JSON file to write
# @return [Integer] number of bytes written
def parseandsave(filename, to)
  FileUtils.mkdir_p 'lib/data'
  # Build the JSON before opening the target: 'w+' truncates, so opening
  # first would wipe an existing file even when parsing then fails.
  json = parse(filename).to_json
  File.open(to, 'w+') { |f| f.write(json) }
end
parseandsave_preseed(filename, to) click to toggle source
# File lib/genebrand/posparser.rb, line 105
# Runs preseed on +filename+ and writes the lines it returns to +to+,
# unchanged and in order.
#
# @param [String] filename tab-separated word/part-of-speech file
# @param [String] to path of the file to write
# @return [Array<String>] the lines that were written
def parseandsave_preseed(filename, to)
  FileUtils.mkdir_p 'lib/data'
  # Collect the lines before opening the target: 'w+' truncates, so opening
  # first would wipe an existing file even when preseed then fails.
  lines = preseed(filename)
  File.open(to, 'w+') do |f|
    lines.each { |line| f.write(line) }
  end
end
parseandsave_top(filename, top, to) click to toggle source
# File lib/genebrand/posparser.rb, line 115
# Parses +filename+ restricted to the words listed in +top+ and writes the
# resulting table to +to+ as JSON.
#
# @param [String] filename tab-separated word/part-of-speech file
# @param [String] top file with one wanted word per line
# @param [String] to path of the JSON file to write
# @return [Integer] number of bytes written
def parseandsave_top(filename, top, to)
  FileUtils.mkdir_p 'lib/data'
  # Build the JSON before opening the target: 'w+' truncates, so opening
  # first would wipe an existing file even when parsing then fails.
  json = parse_top(filename, top).to_json
  File.open(to, 'w+') { |f| f.write(json) }
end
preseed(filename) click to toggle source
# File lib/genebrand/posparser.rb, line 78
# Filters a tab-separated word file down to the lines worth seeding.
#
# A line is kept when its first tab-separated field is 2 to 10 ASCII letters
# or digits and is not a number according to is_numeric?. Kept lines are
# returned unchanged, in file order.
#
# @param [String] filename tab-separated word/part-of-speech file
# @return [Array<String>] the lines that passed the filter
# @raise [RuntimeError] when +filename+ does not exist
def preseed(filename)
  # init is defined outside this block; presumably it resets the tables
  # before a new run -- confirm against its definition.
  init

  raise "File not found: #{filename}" unless File.exist?(filename)

  puts 'Preseed'
  # File.foreach closes the file as soon as iteration finishes.
  File.foreach(filename).select do |line|
    word = line.split("\t")[0]
    !is_numeric?(word) && /\A[a-zA-Z0-9]{2,10}\z/ =~ word
  end
end