class Persian::Tokenizer
Persian
Tokenizer class
Public Class Methods
split_paragraphs(text)
click to toggle source
Splits text into paragraphs. Returns an array of paragraphs.
# File lib/persian/tokenizer.rb, line 36
# Split +text+ into paragraphs, one per line.
#
# Lines are separated by "\n" or "\r\n", so text with Windows line endings
# does not leave a stray "\r" on each paragraph. Empty lines are dropped.
#
# text - the String to split.
#
# Returns an Array of non-empty paragraph Strings (empty Array for "").
def self.split_paragraphs(text)
  text.split(/\r?\n/).reject(&:empty?)
end
tokenize(text)
click to toggle source
Basic Persian word tokenizer. Returns an array of words.
# File lib/persian/tokenizer.rb, line 8
# Basic Persian word tokenizer.
#
# The text is first split on whitespace. Each resulting chunk is then cut
# into alternating runs of word characters and punctuation characters, so
# punctuation is separated from the words it touches wherever it occurs in
# the chunk (leading, trailing, or interior). A run of consecutive
# punctuation characters is kept together as one token.
#
# text - the String to tokenize.
#
# Returns an Array of token Strings. For empty or whitespace-only input the
# result is [''] (kept for backward compatibility).
def self.tokenize(text)
  symbols = ['!', '﷼', ':', '؛', '؟', '،', '-', '.']
  pair_pre = ['(', '{', '«', '<', '[']
  pair_post = [')', '}', '»', '>', ']']
  prepost = ["'", '"']

  # Split text with space characters
  chunks = text.split(/\s/)
  return [''] if chunks.empty?

  delimiters = Regexp.escape((symbols + pair_pre + pair_post + prepost).join)
  # Either a run of non-punctuation (a word) or a run of punctuation.
  token_pattern = /[^#{delimiters}]+|[#{delimiters}]+/

  # scan walks the whole chunk, so text after interior punctuation is
  # tokenized too; empty chunks (from repeated whitespace) yield nothing.
  chunks.flat_map { |chunk| chunk.scan(token_pattern) }
end