class PragmaticSegmenter::Cleaner
This is an opinionated class that removes errant newlines, xhtml, inline formatting, etc.
This is an opinionated class that removes errant newlines, xhtml, inline formatting, etc.
Attributes
doc_type[R]
text[R]
Public Class Methods
new(text:, doc_type: nil, language: Languages::Common)
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 12
# Builds a cleaner for the given text.
#
# text:     the string to clean; a copy (text.dup) is stored in @text, so the
#           object passed in by the caller is not the one held by the cleaner.
# doc_type: optional document type, stored as-is in @doc_type (nil by default).
# language: stored as-is in @language; defaults to Languages::Common.
def initialize(text:, doc_type: nil, language: Languages::Common)
  @text, @doc_type, @language = text.dup, doc_type, language
end
Public Instance Methods
clean()
click to toggle source
Clean text of unwanted formatting
Example:
>> text = "This is a sentence\ncut off in the middle because pdf."
>> PragmaticSegmenter::Cleaner.new(text: text).clean
=> "This is a sentence cut off in the middle because pdf."
Arguments:
text: (String) *required
language: (String) *optional (two character ISO 639-1 code e.g. 'en')
doc_type: (String) *optional (e.g. 'pdf')
# File lib/pragmatic_segmenter/cleaner.rb, line 32
# Runs the full cleaning pipeline over @text. The steps are order-dependent
# and are executed exactly in the sequence listed below.
#
# Returns nil immediately when text is nil/false; otherwise returns the value
# of the final step, clean_consecutive_characters.
def clean
  return unless text

  # Newline handling.
  remove_all_newlines
  replace_double_newlines
  replace_newlines
  replace_escaped_newlines

  # Markup, bracketed punctuation and inline formatting.
  # NOTE(review): the return value of Rule.apply is discarded here, so it is
  # presumably expected to modify @text in place - confirm against Rule.
  Rule.apply @text, HTML::All
  replace_punctuation_in_brackets
  Rule.apply @text, InlineFormattingRule

  # Remaining clean-up passes; the last call supplies the return value.
  clean_quotations
  clean_table_of_contents
  check_for_no_space_in_between_sentences
  clean_consecutive_characters
end
Private Instance Methods
abbreviations()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 51
# Returns the ABBREVIATIONS constant found under the Abbreviation namespace of
# the language held in @language.
def abbreviations
  abbreviation_namespace = @language::Abbreviation
  abbreviation_namespace::ABBREVIATIONS
end
check_for_no_space_in_between_sentences()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 55
# Splits @text on whitespace and hands every resulting token to
# search_for_connected_sentences twice: first with the letter-based
# regex/rule pair, then with the digit-based pair. The token list is taken
# before any replacement happens, while @text itself is passed as the string
# to be edited.
#
# Returns @text.
def check_for_no_space_in_between_sentences
  @text.split(' ').each do |token|
    search_for_connected_sentences(
      token, @text, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule
    )
    search_for_connected_sentences(
      token, @text, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule
    )
  end
  @text
end
clean_consecutive_characters()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 131
# Passes @text to Rule.apply together with ConsecutivePeriodsRule and
# ConsecutiveForwardSlashRule (in that argument order) and returns whatever
# Rule.apply returns.
def clean_consecutive_characters
  Rule.apply(
    @text,
    ConsecutivePeriodsRule,
    ConsecutiveForwardSlashRule
  )
end
clean_quotations()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 122
# Passes @text to Rule.apply together with QuotationsFirstRule and
# QuotationsSecondRule (in that argument order) and returns whatever
# Rule.apply returns.
def clean_quotations
  Rule.apply(
    @text,
    QuotationsFirstRule,
    QuotationsSecondRule
  )
end
clean_table_of_contents()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 126
# Passes @text to Rule.apply together with TableOfContentsRule,
# ConsecutivePeriodsRule and ConsecutiveForwardSlashRule (in that argument
# order) and returns whatever Rule.apply returns.
def clean_table_of_contents
  Rule.apply(
    @text,
    TableOfContentsRule,
    ConsecutivePeriodsRule,
    ConsecutiveForwardSlashRule
  )
end
remove_all_newlines()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 81
# Runs the two newline-removal passes back to back: the mid-sentence pass
# first, then the mid-word pass. The result of the mid-word pass is the
# return value.
def remove_all_newlines
  remove_newline_in_middle_of_sentence
  remove_newline_in_middle_of_word
end
remove_newline_in_middle_of_sentence()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 86
# Edits @text in place: every run of characters that contains no period is
# rewritten with all matches of NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX deleted.
#
# Returns @text (explicitly, because gsub! returns nil when nothing matched).
def remove_newline_in_middle_of_sentence
  @text.gsub!(/(?:[^\.])*/) { |segment| segment.gsub(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '') }
  @text
end
remove_newline_in_middle_of_word()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 93
# Passes @text to Rule.apply with NewLineInMiddleOfWordRule and returns
# whatever Rule.apply returns.
def remove_newline_in_middle_of_word
  Rule.apply(@text, NewLineInMiddleOfWordRule)
end
remove_pdf_line_breaks()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 115
# Passes @text to Rule.apply together with NewLineFollowedByBulletRule,
# PDF::NewLineInMiddleOfSentenceRule and
# PDF::NewLineInMiddleOfSentenceNoSpacesRule (in that argument order) and
# returns whatever Rule.apply returns.
def remove_pdf_line_breaks
  Rule.apply(
    @text,
    NewLineFollowedByBulletRule,
    PDF::NewLineInMiddleOfSentenceRule,
    PDF::NewLineInMiddleOfSentenceNoSpacesRule
  )
end
replace_double_newlines()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 102
# Passes @text to Rule.apply together with DoubleNewLineWithSpaceRule and
# DoubleNewLineRule (in that argument order) and returns whatever Rule.apply
# returns.
def replace_double_newlines
  Rule.apply(
    @text,
    DoubleNewLineWithSpaceRule,
    DoubleNewLineRule
  )
end
replace_escaped_newlines()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 97
# Passes @text to Rule.apply together with EscapedNewLineRule,
# EscapedCarriageReturnRule, TypoEscapedNewLineRule and
# TypoEscapedCarriageReturnRule (in that argument order) and returns whatever
# Rule.apply returns.
def replace_escaped_newlines
  Rule.apply(
    @text,
    EscapedNewLineRule,
    EscapedCarriageReturnRule,
    TypoEscapedNewLineRule,
    TypoEscapedCarriageReturnRule
  )
end
replace_newlines()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 106
# Chooses the newline treatment by document type.
#
# When doc_type is exactly the string 'pdf', delegates to
# remove_pdf_line_breaks. For any other doc_type (including nil), passes
# @text to Rule.apply with NewLineFollowedByPeriodRule and
# ReplaceNewlineWithCarriageReturnRule. Returns the result of whichever
# branch ran.
def replace_newlines
  return remove_pdf_line_breaks if doc_type.eql?('pdf')

  Rule.apply(@text, NewLineFollowedByPeriodRule, ReplaceNewlineWithCarriageReturnRule)
end
replace_punctuation_in_brackets()
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 64
# Masks question marks that occur inside square-bracketed spans of @text by
# replacing each one with the placeholder '&ᓷ&'. Question marks outside
# brackets are left untouched. @text is edited in place.
#
# The replacement is produced by a block rather than passed as a replacement
# string, so the bracket contents are inserted literally: backslash sequences
# inside a bracket (e.g. "\0" or "\1") are not expanded as back-references.
#
# Returns @text. (The previous implementation returned a scratch copy or nil;
# the visible caller, clean, ignores the return value.)
def replace_punctuation_in_brackets
  @text.gsub!(/\[(?:[^\]])*\]/) do |bracketed|
    bracketed.include?('?') ? bracketed.gsub('?', '&ᓷ&') : bracketed
  end
  @text
end
search_for_connected_sentences(word, txt, regex, rule)
click to toggle source
# File lib/pragmatic_segmenter/cleaner.rb, line 70
# Rewrites one whitespace-delimited word inside txt when it looks like two
# sentences run together.
#
# word:  the candidate token.
# txt:   the string edited in place; every literal occurrence of word is
#        replaced.
# regex: pattern the word must match to be considered at all.
# rule:  rule handed to Rule.apply (on a copy of word) to build the
#        replacement.
#
# Nothing is changed when word does not match regex, when it matches any
# entry of URL_EMAIL_KEYWORDS, or when it matches any abbreviation
# (case-insensitively). Keywords and abbreviations are interpolated into a
# regex unescaped, exactly as before, so characters such as "." in them act
# as regex metacharacters.
#
# Returns nil when the word is skipped, otherwise the result of gsub!.
def search_for_connected_sentences(word, txt, regex, rule)
  return unless word =~ regex
  return if URL_EMAIL_KEYWORDS.any? { |web| word =~ /#{web}/ }
  return if abbreviations.any? { |abbr| word =~ /#{abbr}/i }

  new_word = Rule.apply(word.dup, rule)
  # A String pattern matches literally, and the block form inserts new_word
  # verbatim; a replacement *string* would expand backslash sequences in the
  # word (e.g. "\0") as back-references and corrupt the text.
  txt.gsub!(word) { new_word }
end