class Persian::Tokenizer

Tokenizer class for Persian text.

Public Class Methods

split_paragraphs(text) click to toggle source

Splits the text into paragraphs at newlines, dropping empty lines. Returns an array of paragraphs.

# File lib/persian/tokenizer.rb, line 36
# Break +text+ into paragraphs.
#
# The text is cut at every "\n"; lines that are exactly empty are dropped
# (lines containing only spaces are kept as-is).
#
# text - the String to split.
#
# Returns an Array of paragraph Strings.
def self.split_paragraphs(text)
  lines = text.split("\n")
  lines.reject { |line| line.empty? }
end
tokenize(text) click to toggle source

Basic Persian word tokenizer. Returns an array of words.

# File lib/persian/tokenizer.rb, line 8
# Basic Persian word tokenizer.
#
# The text is first cut on whitespace. Every resulting chunk is then broken
# into alternating runs of "word" characters and runs of delimiter characters
# (punctuation, brackets, quotes), so "تهران-مشهد" yields three tokens.
# Consecutive delimiter characters stay together in one token (e.g. "!!").
#
# text - the String to tokenize.
#
# Returns an Array of token Strings, or [''] when +text+ contains nothing but
# whitespace (or is empty).
def self.tokenize(text)
  symbols = ['!', '﷼', ':', '؛', '؟', '،', '-', '.']
  pair_pre = ['(', '{', '«', '<', '[']
  pair_post = [')', '}', '»', '>', ']']
  prepost = ["'", '"']

  # Split text with space characters; repeated whitespace produces empty
  # chunks, which simply contribute no tokens below.
  chunks = text.split(/\s/)

  return [''] if chunks.empty?

  delimiters = Regexp.escape((symbols + pair_pre + pair_post + prepost).join)

  # Either a run of non-delimiter characters (a word) or a run of delimiters.
  # Scanning the whole chunk (rather than partitioning on the first word only)
  # keeps words that follow punctuation from being glued to it.
  run = /[^#{delimiters}]+|[#{delimiters}]+/

  chunks.flat_map { |chunk| chunk.scan(run) }
end