class Enparser::Segmenter

Public Class Methods

new() click to toggle source
# File lib/enparser/segmenter.rb, line 8
# Creates a segmenter with no skip patterns and no buffered partial line.
#
# @previous is initialised here (not only in #segment_stream) so that
# #parse_line can be called directly on a fresh instance without hitting
# nil.empty?.
def initialize
  @skips = []
  @previous = ''
end

Public Instance Methods

add_skip_pattern(pattern) click to toggle source
# File lib/enparser/segmenter.rb, line 23
# Compiles +pattern+ into a Regexp and appends it to the skip list.
#
# @param pattern [String, Regexp] source handed to Regexp.new
# @return [Array<Regexp>] the updated skip list
def add_skip_pattern(pattern)
  compiled = Regexp.new(pattern)
  @skips.push(compiled)
end
completed?(line) click to toggle source
# File lib/enparser/segmenter.rb, line 12
# Tells whether +line+ ends a sentence.
#
# The line must end with a single '.', '!' or '?' that follows at least one
# non-terminator character, and must not end with a dotted abbreviation
# (e.g. "U.S.") or an ellipsis ("...").
#
# @param line [String]
# @return [Boolean]
def completed?(line)
  /[^\.!?]+[\.!?]$/.match?(line) && # sentence terminator at end of line
    !/\w\.\w\.$/.match?(line) &&    # not an abbreviation
    !/\.\.\.$/.match?(line)         # not an ellipsis
end
load_skip_patters(file_name) click to toggle source
# File lib/enparser/segmenter.rb, line 19
# Reads skip patterns from +file_name+, one regular expression per line,
# and registers each through #add_skip_pattern.
#
# Blank lines are ignored: an empty pattern would compile to //, which
# matches every input line and would make the segmenter skip everything.
#
# @param file_name [String] path of the pattern file
# @return [nil]
def load_skip_patters(file_name)
  File.foreach(file_name) do |line|
    pattern = line.chomp
    next if pattern.empty?

    add_skip_pattern(pattern)
  end
end
parse_line(line) click to toggle source
# File lib/enparser/segmenter.rb, line 46
# Feeds one raw input line into the sentence buffer.
#
# Lines matching a skip pattern are dropped. Otherwise the line is cleaned
# in place via #strip! and appended (space-separated) to any text buffered
# from earlier lines. When the combined text ends a sentence according to
# #completed? it is returned and the buffer is cleared; otherwise it is
# kept in the buffer and an empty string is returned.
#
# @param line [String] raw line; mutated in place by #strip!
# @return [String] the completed text, or '' when nothing is ready yet
def parse_line(line)
  return '' if skip?(line)

  strip!(line)
  # @previous is nil when this method is called before #segment_stream has
  # reset the buffer; treat that the same as an empty buffer.
  pending = @previous.to_s
  line = pending + ' ' + line unless pending.empty?

  if completed?(line)
    @previous = ''
    line
  else
    @previous = line
    ''
  end
end
segment(text) click to toggle source

@return [Array<String>]

# File lib/enparser/segmenter.rb, line 41
# Splits +text+ into sentences with PragmaticSegmenter and strips
# surrounding whitespace from each one.
#
# @param text [String]
# @return [Array<String>]
def segment(text)
  sentences = PragmaticSegmenter::Segmenter.new(text: text).segment
  sentences.map { |sentence| sentence.strip }
end
segment_file(file_name) { |l| ... } click to toggle source
# File lib/enparser/segmenter.rb, line 74
# Opens +file_name+ and runs #segment_stream over its contents.
#
# The caller's block is forwarded as-is, so calling without a block behaves
# like #segment_stream without a block (nothing is yielded) instead of
# raising LocalJumpError from an inner +yield+.
#
# @param file_name [String] path of the file to segment
# @yield [l] each segmented sentence
# @yieldparam l [String]
# @return whatever #segment_stream returns
def segment_file(file_name, &block)
  File.open(file_name) { |file| segment_stream(file, &block) }
end
segment_stream(input) { |s| ... } click to toggle source

@param input [IO] an already-opened stream to read lines from. @param block [Proc] called once with each segmented sentence (String).

# File lib/enparser/segmenter.rb, line 64
# Reads +input+ line by line, joins lines into complete sentences via
# #parse_line, and yields every sentence produced by #segment.
#
# Text still buffered when the input ends (for example a last line with no
# terminal punctuation) is segmented and yielded as well, rather than being
# silently discarded.
#
# @param input [IO] an opened stream responding to #each_line
# @yield [s] each segmented sentence
# @yieldparam s [String]
# @return [IO, nil] +input+, or nil when no block is given
def segment_stream(input, &block)
  return unless block_given?

  @previous = ''
  input.each_line do |line|
    text = parse_line(line)
    next if text.empty?

    segment(text).each(&block)
  end

  # Flush whatever #parse_line was still holding back at end of input.
  leftover = @previous.to_s.strip
  @previous = ''
  segment(leftover).each(&block) unless leftover.empty?

  input
end
skip?(line) click to toggle source
# File lib/enparser/segmenter.rb, line 27
# Tells whether +line+ matches any registered skip pattern.
#
# @param line [String]
# @return [Boolean]
def skip?(line)
  @skips.any? { |pattern| pattern.match?(line) }
end
strip!(line) click to toggle source
# File lib/enparser/segmenter.rb, line 32
# Cleans +line+ in place: replaces invalid byte sequences, removes HTML
# tags, and trims surrounding whitespace and any trailing line terminator.
#
# @param line [String] mutated in place
# @return [String] the same +line+ object
def strip!(line)
  line.tap do |text|
    text.scrub!
    text.gsub!(/<\/?[^>]*>/, "") # drop anything that looks like an HTML tag
    text.strip!
    text.chomp!
  end
end