class PragmaticSegmenter::Cleaner

This is an opinionated class that removes errant newlines, xhtml, inline formatting, etc.

This is an opinionated class that removes errant newlines, xhtml, inline formatting, etc.

Attributes

doc_type[R]
text[R]

Public Class Methods

new(text:, doc_type: nil, language: Languages::Common) click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 12
# Sets up a cleaner for the given text.
#
# text     - the text to clean. A duplicate is stored, so the in-place edits
#            made by the cleaning steps never touch the caller's object.
# doc_type - optional document type; replace_newlines compares it to 'pdf'.
# language - namespace that exposes Abbreviation::ABBREVIATIONS
#            (defaults to Languages::Common).
def initialize(text:, doc_type: nil, language: Languages::Common)
  @doc_type = doc_type
  @text     = text.dup
  @language = language
end

Public Instance Methods

clean() click to toggle source

Clean text of unwanted formatting

Example:

>> text = "This is a sentence\ncut off in the middle because pdf."
>> PragmaticSegmenter::Cleaner.new(text: text).clean
=> "This is a sentence cut off in the middle because pdf."

Arguments:

text:       (String)  *required
language:   (Module)  *optional
            (a language namespace exposing Abbreviation::ABBREVIATIONS; defaults to Languages::Common)
doc_type:   (String)  *optional
            (e.g. 'pdf')
# File lib/pragmatic_segmenter/cleaner.rb, line 32
# Runs every cleaning step, in a fixed order, against @text.
#
# Returns nil when text is nil; otherwise returns whatever the final step,
# clean_consecutive_characters, returns.
def clean
  return unless text
  # Newline handling runs first, before any markup or punctuation step.
  remove_all_newlines
  replace_double_newlines
  replace_newlines
  replace_escaped_newlines

  # The return value is discarded here, so Rule.apply presumably edits @text
  # in place -- TODO confirm against Rule.
  Rule.apply(@text, HTML::All)

  replace_punctuation_in_brackets
  Rule.apply(@text, InlineFormattingRule)
  clean_quotations
  clean_table_of_contents
  check_for_no_space_in_between_sentences
  # Last expression: its result is the return value of #clean.
  clean_consecutive_characters
end

Private Instance Methods

abbreviations() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 51
# Returns the ABBREVIATIONS constant found under the Abbreviation namespace
# of the language this cleaner was built with.
def abbreviations
  abbreviation_namespace = @language::Abbreviation
  abbreviation_namespace::ABBREVIATIONS
end
check_for_no_space_in_between_sentences() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 55
# Splits @text on whitespace and hands every token to
# search_for_connected_sentences twice: first with the plain regex/rule pair,
# then with the digit regex/rule pair.
#
# The token list is taken once, up front, so edits made to @text while
# iterating do not change which tokens are visited.
#
# Returns @text.
def check_for_no_space_in_between_sentences
  @text.split(' ').each do |token|
    [
      [NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule],
      [NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule]
    ].each do |pattern, rule|
      search_for_connected_sentences(token, @text, pattern, rule)
    end
  end
  @text
end
clean_consecutive_characters() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 131
# Applies ConsecutivePeriodsRule and then ConsecutiveForwardSlashRule to @text
# through Rule.apply, and returns Rule.apply's result.
def clean_consecutive_characters
  Rule.apply(
    @text,
    ConsecutivePeriodsRule,
    ConsecutiveForwardSlashRule
  )
end
clean_quotations() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 122
# Applies the two quotation rules to @text, first rule first, through
# Rule.apply, and returns Rule.apply's result.
def clean_quotations
  quotation_rules = [QuotationsFirstRule, QuotationsSecondRule]
  Rule.apply(@text, *quotation_rules)
end
clean_table_of_contents() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 126
# Applies TableOfContentsRule, then ConsecutivePeriodsRule, then
# ConsecutiveForwardSlashRule to @text through Rule.apply, and returns
# Rule.apply's result.
def clean_table_of_contents
  Rule.apply(
    @text,
    TableOfContentsRule,
    ConsecutivePeriodsRule,
    ConsecutiveForwardSlashRule
  )
end
remove_all_newlines() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 81
# Runs the two newline-removal steps in order: newlines in the middle of a
# sentence first, then newlines in the middle of a word.
# Returns the result of the second step.
def remove_all_newlines
  remove_newline_in_middle_of_sentence
  remove_newline_in_middle_of_word
end
remove_newline_in_middle_of_sentence() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 86
# Deletes every NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX match from @text.
#
# The text is processed one run of non-period characters at a time, so the
# newline regex is only ever evaluated against a single period-free segment.
#
# Returns @text.
def remove_newline_in_middle_of_sentence
  @text.gsub!(/(?:[^\.])*/) { |segment| segment.gsub(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '') }
  @text
end
remove_newline_in_middle_of_word() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 93
# Applies NewLineInMiddleOfWordRule to @text through Rule.apply and returns
# Rule.apply's result.
def remove_newline_in_middle_of_word
  Rule.apply(@text, NewLineInMiddleOfWordRule)
end
remove_pdf_line_breaks() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 115
# Applies the PDF line-break rules to @text through Rule.apply, in this order:
# NewLineFollowedByBulletRule, PDF::NewLineInMiddleOfSentenceRule,
# PDF::NewLineInMiddleOfSentenceNoSpacesRule. Returns Rule.apply's result.
def remove_pdf_line_breaks
  pdf_rules = [
    NewLineFollowedByBulletRule,
    PDF::NewLineInMiddleOfSentenceRule,
    PDF::NewLineInMiddleOfSentenceNoSpacesRule
  ]
  Rule.apply(@text, *pdf_rules)
end
replace_double_newlines() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 102
# Applies DoubleNewLineWithSpaceRule and then DoubleNewLineRule to @text
# through Rule.apply, and returns Rule.apply's result.
def replace_double_newlines
  Rule.apply(
    @text,
    DoubleNewLineWithSpaceRule,
    DoubleNewLineRule
  )
end
replace_escaped_newlines() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 97
# Applies the four escaped-newline / escaped-carriage-return rules to @text
# through Rule.apply, in the order listed, and returns Rule.apply's result.
def replace_escaped_newlines
  escaped_rules = [
    EscapedNewLineRule,
    EscapedCarriageReturnRule,
    TypoEscapedNewLineRule,
    TypoEscapedCarriageReturnRule
  ]
  Rule.apply(@text, *escaped_rules)
end
replace_newlines() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 106
# Chooses the newline strategy from doc_type.
#
# When doc_type is the String 'pdf', delegates to remove_pdf_line_breaks.
# For any other value (nil included), applies NewLineFollowedByPeriodRule and
# then ReplaceNewlineWithCarriageReturnRule to @text through Rule.apply.
#
# Returns the result of whichever branch ran.
def replace_newlines
  return remove_pdf_line_breaks if doc_type.eql?('pdf')

  Rule.apply(
    @text,
    NewLineFollowedByPeriodRule,
    ReplaceNewlineWithCarriageReturnRule
  )
end
replace_punctuation_in_brackets() click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 64
# Replaces every '?' that sits inside a [...] span of @text with the
# placeholder '&ᓷ&'; text outside brackets is left untouched.
#
# A span runs from a '[' to the first ']' that follows it.
#
# The substitution uses the block form of gsub!, so the bracket contents are
# inserted literally. Passing them as a replacement *string* would make Ruby
# expand backslash sequences such as \0 or \1 found inside the brackets as
# back-references and corrupt the text.
#
# Returns @text.
def replace_punctuation_in_brackets
  @text.gsub!(/\[(?:[^\]])*\]/) { |bracketed| bracketed.gsub('?', '&ᓷ&') }
  @text
end
search_for_connected_sentences(word, txt, regex, rule) click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 70
# Rewrites +word+ inside +txt+ when it looks like two sentences run together.
#
# word  - a single whitespace-delimited token taken from the text.
# txt   - the String to edit in place.
# regex - pattern that marks the token as a candidate.
# rule  - rule handed to Rule.apply to build the replacement token.
#
# Nothing happens unless +word+ matches +regex+, and tokens matching any
# URL_EMAIL_KEYWORDS entry or any abbreviation (case-insensitive) are skipped.
# Otherwise every literal occurrence of +word+ in +txt+ is replaced with the
# result of applying +rule+ to a copy of +word+.
#
# Returns the result of gsub! (+txt+, or nil when +word+ does not occur in
# it), or nil when the token was skipped.
def search_for_connected_sentences(word, txt, regex, rule)
  return unless word =~ regex
  # NOTE(review): keywords and abbreviations are interpolated as regex source
  # without escaping, so a '.' in an entry matches any character -- confirm
  # whether that is intended before tightening it.
  return if URL_EMAIL_KEYWORDS.any? { |web| word =~ /#{web}/ }
  return if abbreviations.any? { |abbr| word =~ /#{abbr}/i }

  new_word = Rule.apply(word.dup, rule)
  # A String pattern matches literally, and the block form inserts new_word
  # verbatim; a replacement string would have its backslash sequences
  # (\0, \1, \\) expanded as back-references.
  txt.gsub!(word) { new_word }
end