class Enparser::Segmenter
Public Class Methods
new()
click to toggle source
# File lib/enparser/segmenter.rb, line 8
# Creates a segmenter with no skip patterns and an empty carry-over buffer.
#
# @skips collects the compiled patterns that #skip? consults.
# @previous is the unfinished text #parse_line carries from one line to the
# next; initialising it here lets #parse_line be called directly instead of
# only after #segment_stream has reset it.
def initialize
  @skips = []
  @previous = ''
end
Public Instance Methods
add_skip_pattern(pattern)
click to toggle source
# File lib/enparser/segmenter.rb, line 23
# Compiles +pattern+ with Regexp.new and appends it to the skip patterns.
#
# @param pattern [String, Regexp] regular expression source (or a Regexp)
# @return [Array<Regexp>] the skip pattern list, including the new entry
def add_skip_pattern(pattern)
  compiled = Regexp.new(pattern)
  @skips.push(compiled)
end
completed?(line)
click to toggle source
# File lib/enparser/segmenter.rb, line 12
# Tells whether +line+ finishes a sentence.
#
# The line is complete when a line end within it is preceded by ".", "!" or
# "?" that itself follows at least one character other than those three, and
# the line does not end in an abbreviation-like "x.y." sequence or in "...".
#
# @param line [String] text to inspect
# @return [Boolean]
def completed?(line)
  ends_sentence     = /[^\.!?]+[\.!?]$/.match?(line) # end of sentence
  ends_abbreviation = /\w\.\w\.$/.match?(line)       # abbreviation
  ends_ellipsis     = /\.\.\.$/.match?(line)         # ellipsis

  ends_sentence && !ends_abbreviation && !ends_ellipsis
end
load_skip_patters(file_name)
click to toggle source
# File lib/enparser/segmenter.rb, line 19
# Registers every non-empty line of +file_name+ as a skip pattern.
#
# Empty lines are ignored: an empty source compiles to a regexp that matches
# every string, so a single blank line in the file would otherwise cause
# every input line to be skipped.
#
# @param file_name [String] path of a file holding one pattern per line
# @return [nil]
def load_skip_patters(file_name)
  File.foreach(file_name) do |line|
    pattern = line.chomp
    add_skip_pattern(pattern) unless pattern.empty?
  end
end

# Correctly spelled name; the original spelling stays for existing callers.
alias load_skip_patterns load_skip_patters
parse_line(line)
click to toggle source
# File lib/enparser/segmenter.rb, line 46
# Feeds one raw input line into the sentence buffer.
#
# Lines matched by #skip? are dropped. Otherwise the line is cleaned with
# #strip! and appended to the text carried over in @previous. When the
# combined text is a completed sentence (see #completed?) it is returned and
# the buffer is cleared; otherwise it is kept in @previous and '' is returned.
#
# @param line [String] raw line; it is not modified
# @return [String] the completed text, or '' when nothing is ready yet
def parse_line(line)
  return '' if skip?(line)

  # The buffer may not have been set yet when this is called directly.
  @previous ||= ''

  # Clean a copy so the caller's string is left alone and frozen input works.
  line = strip!(line.dup)

  # A blank line adds nothing; joining it would only pad the buffer with spaces.
  return '' if line.empty?

  line = "#{@previous} #{line}" unless @previous.empty?

  if completed?(line)
    @previous = ''
    line
  else
    @previous = line
    ''
  end
end
segment(text)
click to toggle source
@return [Array<String>]
# File lib/enparser/segmenter.rb, line 41
# Splits +text+ into sentences with PragmaticSegmenter and strips the
# surrounding whitespace from each one.
#
# @param text [String] text to segment
# @return [Array<String>]
def segment(text)
  sentences = PragmaticSegmenter::Segmenter.new(text: text).segment
  sentences.map { |sentence| sentence.strip }
end
segment_file(file_name) { |l| ... }
click to toggle source
# File lib/enparser/segmenter.rb, line 74
# Opens +file_name+ and passes the open file to #segment_stream, forwarding
# the given block.
#
# Without a block nothing is done and nil is returned, matching
# #segment_stream; previously a wrapper block was always passed on, which
# ended in a LocalJumpError at the first yielded sentence.
#
# @param file_name [String] path of the file to read
# @yieldparam l [String] value yielded by #segment_stream
# @return [Object, nil] the result of #segment_stream, or nil without a block
def segment_file(file_name, &block)
  return unless block_given?

  File.open(file_name) { |file| segment_stream(file, &block) }
end
segment_stream(input) { |s| ... }
click to toggle source
@param input [IO] a stream that is already open for reading
@yieldparam s [String] one segmented sentence
# File lib/enparser/segmenter.rb, line 64
# Reads +input+ line by line and yields every sentence found.
#
# Each line goes through #parse_line; whenever that returns non-empty text,
# the text is split with #segment and each piece is yielded. Text still
# buffered in @previous when the input ends (for example a last line without
# terminal punctuation) is segmented and yielded too instead of being lost.
#
# @param input [#each_line] an open stream
# @yieldparam s [String] one segmented sentence
# @return [Object, nil] whatever input.each_line returns, or nil when no
#   block is given
def segment_stream(input, &block)
  return unless block_given?

  @previous = ''
  result = input.each_line do |line|
    parsed = parse_line(line)
    next if parsed.empty?

    segment(parsed).each(&block)
  end

  # Flush the unfinished remainder so trailing text is not dropped.
  unless @previous.empty?
    leftover = @previous
    @previous = ''
    segment(leftover).each(&block)
  end

  result
end
skip?(line)
click to toggle source
# File lib/enparser/segmenter.rb, line 27
# Tells whether +line+ matches at least one registered skip pattern.
#
# @param line [String] line to test
# @return [Boolean] false when no pattern matches or none is registered
def skip?(line)
  @skips.any? { |pattern| pattern.match?(line) }
end
strip!(line)
click to toggle source
# File lib/enparser/segmenter.rb, line 32
# Cleans +line+ in place and returns it.
#
# In order: invalid byte sequences are scrubbed, anything that looks like an
# HTML tag is deleted, surrounding whitespace is stripped and a trailing
# record separator is chomped.
#
# @param line [String] string to clean; it is modified
# @return [String] the same string object
def strip!(line)
  line.tap do |text|
    text.scrub!
    text.gsub!(/<\/?[^>]*>/, "") # remove html tags
    text.strip!
    text.chomp!
  end
end