class PragmaticSegmenter::Processor
This class handles processing and segmenting the text.
Attributes
text[R]
Public Class Methods
new(language: Languages::Common)
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 16
#
# Builds a processor bound to one language definition.
#
# language - namespace whose constants (rules, regexes, punctuation list)
#            are looked up by the other methods of this class via
#            `@language::...`. Defaults to Languages::Common.
def initialize(language: Languages::Common)
  @language = language
end
Public Instance Methods
process(text:)
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 20
#
# Entry point: runs every replacement pass over the text and returns the
# result of #split_into_segments.
#
# text - the text to segment. It is first wrapped in List and passed through
#        List#add_line_break; the result is stored in @text.
def process(text:)
  @text = List.new(text: text).add_line_break

  # Replacement passes that operate on @text, in this fixed order.
  replace_abbreviations
  replace_numbers
  replace_continuous_punctuation
  replace_periods_before_numeric_references

  # Language-level rules applied one at a time, in this order. The return
  # value of Rule.apply is not used here.
  [
    @language::Abbreviations::WithMultiplePeriodsAndEmailRule,
    @language::GeoLocationRule,
    @language::FileFormatRule
  ].each { |rule| Rule.apply(@text, rule) }

  split_into_segments
end
Private Instance Methods
abbreviations_replacer()
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 108
#
# Picks the abbreviation replacer class: the language's own
# AbbreviationReplacer when it defines one, otherwise the generic
# AbbreviationReplacer.
def abbreviations_replacer
  return @language::AbbreviationReplacer if defined?(@language::AbbreviationReplacer)

  AbbreviationReplacer
end
between_punctuation(txt)
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 128
#
# Instantiates the class chosen by #between_punctuation_processor for +txt+
# and returns the result of its #replace.
def between_punctuation(txt)
  processor = between_punctuation_processor.new(text: txt)
  processor.replace
end
between_punctuation_processor()
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 120
#
# Picks the between-punctuation class: the language's own BetweenPunctuation
# when it defines one, otherwise the generic BetweenPunctuation.
def between_punctuation_processor
  defined?(@language::BetweenPunctuation) ? @language::BetweenPunctuation : BetweenPunctuation
end
check_for_parens_between_quotes(txt)
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 60
#
# Within every span matched by the language's
# PARENS_BETWEEN_DOUBLE_QUOTES_REGEX, replaces the whitespace character
# directly before "(" and the one directly after ")" with "\r" (the
# character #split_into_segments later splits on).
#
# txt - String to rewrite; it is mutated in place, so it must not be frozen.
#
# Returns the same +txt+ object, whether or not anything matched.
def check_for_parens_between_quotes(txt)
  # A single gsub! pass replaces the former `=~` pre-check followed by gsub!:
  # the text is scanned once, and the return value no longer depends on
  # gsub! returning nil when nothing matches.
  txt.gsub!(@language::PARENS_BETWEEN_DOUBLE_QUOTES_REGEX) do |match|
    match.gsub(/\s(?=\()/, "\r").gsub(/(?<=\))\s/, "\r")
  end
  txt
end
check_for_punctuation(txt)
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 82
#
# Sends +txt+ through #process_text when it contains at least one entry of
# the language's Punctuations list; otherwise returns +txt+ untouched.
def check_for_punctuation(txt)
  has_punctuation = @language::Punctuations.any? { |mark| txt.include?(mark) }
  return txt unless has_punctuation

  process_text(txt)
end
consecutive_underscore?(txt)
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 77
#
# Tells whether nothing is left of +txt+ once every run of three or more
# underscores has been removed. An empty string therefore also yields true.
#
# Returns true or false.
def consecutive_underscore?(txt)
  # Rubular: http://rubular.com/r/fTF2Ff3WBL
  txt.gsub(/_{3,}/, '').empty?
end
post_process_segments(txt)
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 44
#
# Final clean-up of a single segment.
#
# Returns
# * +txt+ itself when it is shorter than two characters and matches
#   /\A[a-zA-Z]*\Z/;
# * nil when it is made up only of long underscore runs (see
#   #consecutive_underscore?) or is otherwise shorter than two characters;
# * an Array of Strings when the segment matches the language's
#   QUOTATION_AT_END_OF_SENTENCE_REGEX (it is split on
#   SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX);
# * otherwise the segment with newlines removed and surrounding whitespace
#   stripped.
def post_process_segments(txt)
  too_short = txt.length < 2
  return txt if too_short && txt.match?(/\A[a-zA-Z]*\Z/)
  return nil if consecutive_underscore?(txt) || too_short

  # The return value is not used; this relies on Rule.apply updating +txt+
  # in place -- NOTE(review): confirm against Rule.apply.
  Rule.apply(
    txt,
    @language::ReinsertEllipsisRules::All,
    @language::ExtraWhiteSpaceRule
  )

  unless txt.match?(@language::QUOTATION_AT_END_OF_SENTENCE_REGEX)
    return txt.tr("\n", '').strip
  end

  txt.split(@language::SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
end
process_text(txt)
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 90
#
# Runs the punctuation-related passes over one chunk of text and returns the
# result of #sentence_boundary_punctuation for it.
#
# txt - String chunk; it is appended to in place when its last character
#       contains none of the language's Punctuations entries.
def process_text(txt)
  # Append the 'ȸ' marker when the chunk does not end in a punctuation mark.
  last_char = txt[-1]
  ends_with_mark = @language::Punctuations.any? { |mark| last_char.include?(mark) }
  txt << 'ȸ' unless ends_with_mark

  # Return values of these two calls are not used.
  ExclamationWords.apply_rules(txt)
  between_punctuation(txt)

  ruled = Rule.apply(
    txt,
    @language::DoublePunctuationRules::All,
    @language::QuestionMarkInQuotationRule,
    @language::ExclamationPointRules::All
  )
  without_parens = List.new(text: ruled).replace_parens
  sentence_boundary_punctuation(without_parens)
end
replace_abbreviations()
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 116
#
# Replaces @text with the output of the abbreviation replacer chosen by
# #abbreviations_replacer for the current language.
def replace_abbreviations
  replacer_class = abbreviations_replacer
  replacer = replacer_class.new(text: @text, language: @language)
  @text = replacer.replace
end
replace_continuous_punctuation()
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 67
#
# Inside every span of @text matched by the language's
# CONTINUOUS_PUNCTUATION_REGEX, swaps each "!" for '&ᓴ&' and each "?" for
# '&ᓷ&'. @text is modified in place.
#
# Returns whatever String#gsub! returns (nil when nothing matched).
def replace_continuous_punctuation
  placeholders = { '!' => '&ᓴ&', '?' => '&ᓷ&' }
  @text.gsub!(@language::CONTINUOUS_PUNCTUATION_REGEX) do |run|
    # Neither placeholder contains "!" or "?", so one pass over both
    # characters gives the same result as two sequential substitutions.
    run.gsub(/[!?]/, placeholders)
  end
end
replace_numbers()
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 104
#
# Applies the language's Numbers::All rules to @text and returns whatever
# Rule.apply returns.
def replace_numbers
  Rule.apply(@text, @language::Numbers::All)
end
replace_periods_before_numeric_references()
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 73
#
# Rewrites, in place, every match of the language's NUMBERED_REFERENCE_REGEX
# in @text as "∯" + capture group 2 + "\r" + capture group 7.
#
# Returns whatever String#gsub! returns (nil when nothing matched).
def replace_periods_before_numeric_references
  reference_pattern = @language::NUMBERED_REFERENCE_REGEX
  @text.gsub!(reference_pattern, "∯\\2\r\\7")
end
sentence_boundary_punctuation(txt)
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 132
#
# Applies the two optional language rules (each only when the language
# defines it) and then scans the text with the language's
# SENTENCE_BOUNDARY_REGEX.
#
# Returns the Array produced by String#scan.
def sentence_boundary_punctuation(txt)
  if defined?(@language::ReplaceColonBetweenNumbersRule)
    txt = Rule.apply(txt, @language::ReplaceColonBetweenNumbersRule)
  end

  if defined?(@language::ReplaceNonSentenceBoundaryCommaRule)
    txt = Rule.apply(txt, @language::ReplaceNonSentenceBoundaryCommaRule)
  end

  txt.scan(@language::SENTENCE_BOUNDARY_REGEX)
end
split_into_segments()
click to toggle source
# File lib/pragmatic_segmenter/processor.rb, line 34
#
# Turns @text into the final list of segments. The stages run in a fixed
# order:
# 1. split on "\r" (after #check_for_parens_between_quotes) and apply the
#    SingleNewLineRule and EllipsisRules::All to each piece;
# 2. run each piece through #check_for_punctuation and flatten the results;
# 3. apply SubSymbolsRules::All;
# 4. run #post_process_segments, flatten, and drop nil and empty entries;
# 5. apply SubSingleQuoteRule.
#
# Returns the resulting Array.
def split_into_segments
  pieces = check_for_parens_between_quotes(@text).split("\r")
  pieces.map! do |piece|
    Rule.apply(piece, @language::SingleNewLineRule, @language::EllipsisRules::All)
  end

  sentences = pieces.map { |piece| check_for_punctuation(piece) }.flatten
  sentences.map! { |sentence| Rule.apply(sentence, @language::SubSymbolsRules::All) }

  cleaned = sentences.map { |sentence| post_process_segments(sentence) }.flatten.compact
  cleaned.delete_if(&:empty?)
  cleaned.map! { |sentence| Rule.apply(sentence, @language::SubSingleQuoteRule) }
end