class PragmaticSegmenter::Processor

This class handles the segmenting of the text.

Attributes

text[R]

Public Class Methods

new(language: Languages::Common) click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 16
# Creates a processor bound to a language.
#
# language - object whose nested constants (rules, regexes, punctuation list)
#            the other methods read through @language::...; defaults to
#            Languages::Common.
def initialize(language: Languages::Common)
  @language = language
end

Public Instance Methods

process(text:) click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 20
# Segments +text+ and returns whatever split_into_segments produces.
#
# The input is passed through List#add_line_break and stored in @text; the
# replace_* helpers and the language rules below then rewrite @text, in this
# exact order, before it is split.
#
# text - the text to segment.
def process(text:)
  @text = List.new(text: text).add_line_break

  replace_abbreviations
  replace_numbers
  replace_continuous_punctuation
  replace_periods_before_numeric_references

  # The return value of Rule.apply is not used here, so these calls rely on
  # Rule.apply changing @text in place.
  [
    @language::Abbreviations::WithMultiplePeriodsAndEmailRule,
    @language::GeoLocationRule,
    @language::FileFormatRule
  ].each { |rule| Rule.apply(@text, rule) }

  split_into_segments
end

Private Instance Methods

abbreviations_replacer() click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 108
# Returns the abbreviation replacer class to use: the language's own
# AbbreviationReplacer when @language defines one, otherwise the default
# AbbreviationReplacer.
def abbreviations_replacer
  return @language::AbbreviationReplacer if defined?(@language::AbbreviationReplacer)

  AbbreviationReplacer
end
between_punctuation(txt) click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 128
# Builds the between-punctuation processor for +txt+ and returns the result
# of its #replace call.
def between_punctuation(txt)
  processor = between_punctuation_processor.new(text: txt)
  processor.replace
end
between_punctuation_processor() click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 120
# Returns the between-punctuation class to use: the language's own
# BetweenPunctuation when @language defines one, otherwise the default
# BetweenPunctuation.
def between_punctuation_processor
  return @language::BetweenPunctuation if defined?(@language::BetweenPunctuation)

  BetweenPunctuation
end
check_for_parens_between_quotes(txt) click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 60
# Within every match of the language's PARENS_BETWEEN_DOUBLE_QUOTES_REGEX,
# turns the whitespace character directly before "(" and directly after ")"
# into "\r". +txt+ is modified in place and returned; when nothing matches it
# is returned untouched.
def check_for_parens_between_quotes(txt)
  pattern = @language::PARENS_BETWEEN_DOUBLE_QUOTES_REGEX

  if txt =~ pattern
    txt.gsub!(pattern) do |quoted|
      before_paren = quoted.gsub(/\s(?=\()/, "\r")
      before_paren.gsub(/(?<=\))\s/, "\r")
    end
  else
    txt
  end
end
check_for_punctuation(txt) click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 82
# Hands +txt+ to process_text when it contains any entry of the language's
# Punctuations list; otherwise returns +txt+ unchanged.
def check_for_punctuation(txt)
  has_punctuation = @language::Punctuations.any? { |mark| txt.include?(mark) }
  return txt unless has_punctuation

  process_text(txt)
end
consecutive_underscore?(txt) click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 77
# True when nothing is left of +txt+ after removing every run of three or
# more underscores (which includes the empty string).
def consecutive_underscore?(txt)
  # Rubular: http://rubular.com/r/fTF2Ff3WBL
  remainder = txt.gsub(/_{3,}/, '')
  remainder.empty?
end
post_process_segments(txt) click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 44
# Final clean-up of a single segment.
#
# Returns:
# * +txt+ itself when it is shorter than two characters and matches
#   /\A[a-zA-Z]*\Z/;
# * nil for any other text shorter than two characters, and for text for
#   which consecutive_underscore? is true;
# * an Array of pieces when the text matches the language's
#   QUOTATION_AT_END_OF_SENTENCE_REGEX (split on
#   SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX);
# * otherwise the text with "\n" removed and surrounding whitespace stripped.
def post_process_segments(txt)
  if txt.length < 2
    return txt if txt =~ /\A[a-zA-Z]*\Z/

    return
  end
  return if consecutive_underscore?(txt)

  # The result of Rule.apply is not captured, so this step relies on
  # Rule.apply changing +txt+ in place.
  Rule.apply(txt, @language::ReinsertEllipsisRules::All, @language::ExtraWhiteSpaceRule)

  unless txt =~ @language::QUOTATION_AT_END_OF_SENTENCE_REGEX
    return txt.tr("\n", '').strip
  end

  txt.split(@language::SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
end
process_text(txt) click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 90
# Runs the punctuation pipeline on one segment and returns the result of
# sentence_boundary_punctuation.
#
# 'ȸ' is appended to +txt+ when its last character contains none of the
# language's Punctuations. ExclamationWords.apply_rules and
# between_punctuation are called for their effect on +txt+ (their return
# values are not used); the punctuation rules and List#replace_parens are
# then applied before the final scan.
def process_text(txt)
  last_char = txt[-1]
  ends_with_punctuation = @language::Punctuations.any? { |mark| last_char.include?(mark) }
  txt << 'ȸ' unless ends_with_punctuation

  ExclamationWords.apply_rules(txt)
  between_punctuation(txt)

  punctuation_rules = [
    @language::DoublePunctuationRules::All,
    @language::QuestionMarkInQuotationRule,
    @language::ExclamationPointRules::All
  ]
  ruled = Rule.apply(txt, *punctuation_rules)
  without_parens = List.new(text: ruled).replace_parens

  sentence_boundary_punctuation(without_parens)
end
replace_abbreviations() click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 116
# Replaces @text with the output of the abbreviation replacer, which is
# constructed with the current @text and @language.
def replace_abbreviations
  replacer = abbreviations_replacer.new(text: @text, language: @language)
  @text = replacer.replace
end
replace_continuous_punctuation() click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 67
# Inside every match of the language's CONTINUOUS_PUNCTUATION_REGEX, swaps
# each "!" for '&ᓴ&' and each "?" for '&ᓷ&'. @text is modified in place;
# the return value is that of String#gsub! (nil when nothing matched).
def replace_continuous_punctuation
  placeholders = { '!' => '&ᓴ&', '?' => '&ᓷ&' }

  @text.gsub!(@language::CONTINUOUS_PUNCTUATION_REGEX) do |run|
    run.gsub(/[!?]/, placeholders)
  end
end
replace_numbers() click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 104
# Applies the language's Numbers::All rules to @text via Rule.apply and
# returns whatever Rule.apply returns.
def replace_numbers
  number_rules = @language::Numbers::All
  Rule.apply(@text, number_rules)
end
replace_periods_before_numeric_references() click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 73
# Rewrites every match of the language's NUMBERED_REFERENCE_REGEX in @text
# as '∯', capture group 2, a carriage return and capture group 7. @text is
# modified in place; the return value is that of String#gsub! (nil when
# nothing matched).
def replace_periods_before_numeric_references
  pattern = @language::NUMBERED_REFERENCE_REGEX
  replacement = "∯\\2\r\\7"
  @text.gsub!(pattern, replacement)
end
sentence_boundary_punctuation(txt) click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 132
# Applies the language's optional ReplaceColonBetweenNumbersRule and
# ReplaceNonSentenceBoundaryCommaRule (each only when @language defines it),
# then returns the result of scanning the text with the language's
# SENTENCE_BOUNDARY_REGEX.
def sentence_boundary_punctuation(txt)
  if defined?(@language::ReplaceColonBetweenNumbersRule)
    txt = Rule.apply(txt, @language::ReplaceColonBetweenNumbersRule)
  end

  if defined?(@language::ReplaceNonSentenceBoundaryCommaRule)
    txt = Rule.apply(txt, @language::ReplaceNonSentenceBoundaryCommaRule)
  end

  txt.scan(@language::SENTENCE_BOUNDARY_REGEX)
end
split_into_segments() click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 34
# Splits @text on "\r" and runs every piece through the segment pipeline,
# returning the final Array of segments.
#
# Stages, in order: parens-between-quotes handling on the whole text, split
# on "\r", SingleNewLineRule + EllipsisRules, check_for_punctuation (which
# may yield several pieces per segment), SubSymbolsRules,
# post_process_segments, removal of nil and empty entries, and finally
# SubSingleQuoteRule.
def split_into_segments
  raw_segments = check_for_parens_between_quotes(@text).split("\r")

  prepared = raw_segments.map do |segment|
    Rule.apply(segment, @language::SingleNewLineRule, @language::EllipsisRules::All)
  end

  # flatten (not flat_map) is kept so arbitrarily nested results collapse.
  punctuated = prepared.map { |segment| check_for_punctuation(segment) }.flatten

  restored = punctuated.map do |segment|
    Rule.apply(segment, @language::SubSymbolsRules::All)
  end

  cleaned = restored
            .map { |segment| post_process_segments(segment) }
            .flatten
            .compact
            .reject(&:empty?)

  cleaned.map { |segment| Rule.apply(segment, @language::SubSingleQuoteRule) }
end