class PragmaticSegmenter::List
This class searches for a list within a string and adds newlines before each list item.
Constants
- ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX
Rubular: rubular.com/r/wMpnVedEIb
- ALPHABETICAL_LIST_WITH_PARENS
Rubular: rubular.com/r/Gu5rQapywf
- ALPHABETICAL_LIST_WITH_PERIODS
Rubular: rubular.com/r/XcpaJKH0sz
- EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX
Rubular: rubular.com/r/NsNFSqrNvJ
- LATIN_NUMERALS
- ListMarkerRule
- NUMBERED_LIST_PARENS_REGEX
- NUMBERED_LIST_REGEX_1
- NUMBERED_LIST_REGEX_2
- ROMAN_NUMERALS
- ROMAN_NUMERALS_IN_PARENTHESES
Rubular: rubular.com/r/GcnmQt4a3I
- SpaceBetweenListItemsFirstRule
Rubular: rubular.com/r/Wv4qLdoPx7
- SpaceBetweenListItemsSecondRule
Rubular: rubular.com/r/AizHXC6HxK
- SpaceBetweenListItemsThirdRule
Rubular: rubular.com/r/GE5q6yID2j
- SubstituteListPeriodRule
Attributes
text[R]
Public Class Methods
new(text:)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 49
# Builds a list formatter around a private copy of the given text.
#
# The copy matters because the other methods of this class edit @text in
# place (gsub!), which would otherwise modify the caller's object.
#
# @param text the text to search for lists; only a duplicate of it is kept
def initialize(text:)
  working_copy = text.dup
  @text = working_copy
end
Public Instance Methods
add_line_break()
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 53
# Runs every list formatter in a fixed order: alphabetical lists, roman
# numeral lists, numbered lists with periods, numbered lists with parens.
#
# @return whatever format_numbered_list_with_parens returns (the last step)
def add_line_break
  earlier_steps = %i[
    format_alphabetical_lists
    format_roman_numeral_lists
    format_numbered_list_with_periods
  ]
  earlier_steps.each { |step| send(step) }
  # Kept as the final expression so its result stays the method's return value.
  format_numbered_list_with_parens
end
replace_parens()
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 60
# Rewrites, in place, every match of ROMAN_NUMERALS_IN_PARENTHESES as
# "&✂&" + the first capture group + "&⌬&".
#
# @return the text attribute (the same object that was modified)
def replace_parens
  text.tap do |str|
    str.gsub!(ROMAN_NUMERALS_IN_PARENTHESES, '&✂&\1&⌬&'.freeze)
  end
end
Private Instance Methods
add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 136
# Feeds ALPHABETICAL_LIST_WITH_PARENS to iterate_alphabet_array with
# parens: true.
#
# @param roman_numeral [Boolean] forwarded unchanged to iterate_alphabet_array
# @return the result of iterate_alphabet_array
def add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
  options = { parens: true, roman_numeral: roman_numeral }
  iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS, **options)
end
add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 132
# Feeds ALPHABETICAL_LIST_WITH_PERIODS to iterate_alphabet_array, leaving
# its parens option at the default (false).
#
# @param roman_numeral [Boolean] forwarded unchanged to iterate_alphabet_array
# @return the result of iterate_alphabet_array
def add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
  options = { roman_numeral: roman_numeral }
  iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, **options)
end
add_line_breaks_for_numbered_list_with_parens()
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 105
# Applies SpaceBetweenListItemsThirdRule to @text, but only when the text
# contains a '☝' marker and no two markers are already separated by a
# newline or carriage return.
#
# @return the result of Rule.apply, or nil when the rule is skipped
def add_line_breaks_for_numbered_list_with_parens
  return unless @text.include?('☝')
  # Markers already on separate lines: nothing to do.
  return if @text =~ /☝.+\n.+☝|☝.+\r.+☝/

  Rule.apply(@text, SpaceBetweenListItemsThirdRule)
end
add_line_breaks_for_numbered_list_with_periods()
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 93
# Applies SpaceBetweenListItemsFirstRule and SpaceBetweenListItemsSecondRule
# to @text when all of the following hold:
# * the text contains a '♨' marker,
# * no two markers are already separated by a newline or carriage return,
# * the text has no "for <1-2 digits>♨ <lowercase letter>" sequence.
#
# @return the result of Rule.apply, or nil when the rules are skipped
def add_line_breaks_for_numbered_list_with_periods
  return unless @text.include?('♨')
  return if @text =~ /♨.+\n.+♨|♨.+\r.+♨/
  return if @text =~ /for\s\d{1,2}♨\s[a-z]/

  Rule.apply(@text, SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
end
format_alphabetical_lists()
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 79
# Processes alphabetical lists: first the period-style pass, then the
# paren-style pass, both with roman_numeral: false.
#
# @return the result of the paren-style pass
def format_alphabetical_lists
  roman = false
  add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: roman)
  add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: roman)
end
format_numbered_list_with_parens()
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 67
# Three-step pipeline for numbered lists whose numbers are followed by a
# parenthesis:
# 1. replace_parens_in_numbered_list marks the list numbers,
# 2. add_line_breaks_for_numbered_list_with_parens spaces the items,
# 3. ListMarkerRule is applied to @text.
#
# @return the result of Rule.apply for ListMarkerRule
def format_numbered_list_with_parens
  replace_parens_in_numbered_list
  add_line_breaks_for_numbered_list_with_parens

  Rule.apply(@text, ListMarkerRule)
end
format_numbered_list_with_periods()
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 73
# Three-step pipeline for numbered lists whose numbers are followed by a
# period:
# 1. replace_periods_in_numbered_list marks the list numbers,
# 2. add_line_breaks_for_numbered_list_with_periods spaces the items,
# 3. SubstituteListPeriodRule is applied to @text.
#
# @return the result of Rule.apply for SubstituteListPeriodRule
def format_numbered_list_with_periods
  replace_periods_in_numbered_list
  add_line_breaks_for_numbered_list_with_periods

  Rule.apply(@text, SubstituteListPeriodRule)
end
format_roman_numeral_lists()
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 84
# Processes roman numeral lists: first the period-style pass, then the
# paren-style pass, both with roman_numeral: true.
#
# @return the result of the paren-style pass
def format_roman_numeral_lists
  roman = true
  add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: roman)
  add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: roman)
end
iterate_alphabet_array(regex, parens: false, roman_numeral: false)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 184
# Collects candidate list markers from @text and hands each one to the
# matching replacement helper.
#
# Steps:
# 1. scan @text with +regex+ and downcase every match,
# 2. keep only matches that occur as a substring of some alphabet entry
#    (String#include?, so this is a substring test, not exact membership),
# 3. send the final match to last_array_item_replacement and every other
#    match to other_items_replacement.
#
# @param regex [Regexp] pattern whose matches are the candidate markers
# @param parens [Boolean] forwarded to the replacement helpers
# @param roman_numeral [Boolean] selects ROMAN_NUMERALS when true,
#   LATIN_NUMERALS otherwise
# @return [Array] the filtered, downcased candidates
def iterate_alphabet_array(regex, parens: false, roman_numeral: false)
  candidates = @text.scan(regex).map { |s| Unicode::downcase(s) }
  alphabet = roman_numeral ? ROMAN_NUMERALS : LATIN_NUMERALS
  candidates.keep_if { |item| alphabet.any? { |entry| entry.include?(item) } }

  final_index = candidates.length - 1
  candidates.each_with_index do |marker, index|
    if index.eql?(final_index)
      last_array_item_replacement(marker, index, alphabet, candidates, parens)
    else
      other_items_replacement(marker, index, alphabet, candidates, parens)
    end
  end
end
last_array_item_replacement(a, i, alphabet, list_array, parens)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 166
# Decides whether the final candidate marker is a genuine list item and, if
# so, rewrites it through replace_correct_alphabet_list.
#
# The marker is accepted only when it and the entry before it are both in
# +alphabet+ and sit exactly one position apart there (either direction).
# With a single-element list, list_array[i - 1] is the marker itself, so the
# distance is 0 and nothing is replaced.
#
# @param a [String] the candidate marker
# @param i [Integer] index of +a+ in +list_array+
# @param alphabet [Array<String>] ordered list of valid markers
# @param list_array [Array<String>] all candidate markers found in the text
# @param parens [Boolean] forwarded to replace_correct_alphabet_list
# @return nil when rejected, otherwise the result of
#   replace_correct_alphabet_list
def last_array_item_replacement(a, i, alphabet, list_array, parens)
  previous_item = list_array[i - 1]
  return if (alphabet & list_array).empty?
  return unless alphabet.include?(previous_item) && alphabet.include?(a)

  distance = (alphabet.index(previous_item) - alphabet.index(a)).abs
  return unless distance == 1

  replace_correct_alphabet_list(a, parens)
end
other_items_replacement(a, i, alphabet, list_array, parens)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 174
# Decides whether a non-final candidate marker is a genuine list item and,
# if so, rewrites it through replace_correct_alphabet_list.
#
# The marker and both of its neighbours must be in +alphabet+. It is then
# accepted when the next entry is its direct successor in +alphabet+, or
# when the previous entry is one position away in either direction.
#
# NOTE: for i == 0, list_array[i - 1] is list_array[-1], i.e. the LAST
# candidate acts as the "previous" entry of the first one.
#
# @param a [String] the candidate marker
# @param i [Integer] index of +a+ in +list_array+
# @param alphabet [Array<String>] ordered list of valid markers
# @param list_array [Array<String>] all candidate markers found in the text
# @param parens [Boolean] forwarded to replace_correct_alphabet_list
# @return nil when rejected, otherwise the result of
#   replace_correct_alphabet_list
def other_items_replacement(a, i, alphabet, list_array, parens)
  previous_item = list_array[i - 1]
  next_item = list_array[i + 1]
  return if (alphabet & list_array).empty?
  return unless [previous_item, a, next_item].all? { |item| alphabet.include?(item) }

  position = alphabet.index(a)
  followed_by_successor = alphabet.index(next_item) - position == 1
  beside_previous = (alphabet.index(previous_item) - position).abs == 1
  return unless followed_by_successor || beside_previous

  replace_correct_alphabet_list(a, parens)
end
replace_alphabet_list(a)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 142
# Rewrites, in place, each match of ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX
# whose text (minus one trailing '.') equals +a+: the match becomes a
# carriage return, the marker, and '∯'. All other matches are left as found.
#
# @param a [String] the list marker to rewrite
# @return the result of String#gsub! on @text
def replace_alphabet_list(a)
  @text.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX) do |match|
    if a.eql?(match.chomp('.'))
      "\r#{Regexp.escape(a.to_s)}∯"
    else
      match
    end
  end
end
replace_alphabet_list_parens(a)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 148
# Rewrites, in place, each match of EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX
# that corresponds to the marker +a+ (compared case-insensitively via
# Unicode::downcase):
# * a match containing '(' has its '(' removed for the comparison and, when
#   equal, becomes "\r&✂&" followed by the match without '(',
# * any other match, when equal, becomes "\r" followed by the match.
# Matches that do not equal +a+ are left as found.
#
# @param a [String] the (downcased) list marker to rewrite
# @return the result of String#gsub! on @text
def replace_alphabet_list_parens(a)
  @text.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX) do |match|
    lowered = Unicode::downcase(match.dup)
    if match.include?('(')
      letters = match.delete('(')
      a.eql?(lowered.delete('(')) ? "\r&✂&#{Regexp.escape(letters)}" : match
    else
      a.eql?(lowered) ? "\r#{Regexp.escape(match)}" : match
    end
  end
end
replace_correct_alphabet_list(a, parens)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 158
# Dispatches the rewrite of marker +a+ to the paren-style or the
# period-style helper.
#
# @param a [String] the list marker to rewrite
# @param parens [Boolean] truthy selects replace_alphabet_list_parens,
#   falsy selects replace_alphabet_list
# @return the result of the selected helper
def replace_correct_alphabet_list(a, parens)
  return replace_alphabet_list_parens(a) if parens

  replace_alphabet_list(a)
end
replace_parens_in_numbered_list()
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 99
# Marks numbered list items in @text with '☝' by running scan_lists twice
# with NUMBERED_LIST_PARENS_REGEX as both the scanning and the
# substitution pattern.
#
# NOTE(review): the two passes are identical on purpose as far as this code
# shows; presumably the second pass re-examines numbers the first pass left
# unmarked against their new neighbours. Confirm before collapsing to one.
#
# @return the result of the second scan_lists call
def replace_parens_in_numbered_list
  pattern = NUMBERED_LIST_PARENS_REGEX
  scan_lists(pattern, pattern, '☝')
  scan_lists(pattern, pattern, '☝')
end
replace_periods_in_numbered_list()
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 89
# Marks numbered list items in @text with '♨': scan_lists collects the
# numbers with NUMBERED_LIST_REGEX_1 and substitutes through
# NUMBERED_LIST_REGEX_2, with strip: true.
#
# @return the result of scan_lists
def replace_periods_in_numbered_list
  scanner = NUMBERED_LIST_REGEX_1
  substituter = NUMBERED_LIST_REGEX_2
  scan_lists(scanner, substituter, '♨', strip: true)
end
scan_lists(regex1, regex2, replacement, strip: false)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 111
# Finds the numbers in @text that belong to a running sequence and marks
# each of them through substitute_found_list_items.
#
# A number counts as part of a sequence when one of these holds:
# * the next number found is exactly one greater,
# * the previous number found is exactly one smaller,
# * it is 0 and the previous number is 9, or it is 9 and the next is 0
#   (single-digit roll-over).
#
# The first number has no predecessor. Indexing with i - 1 at i == 0 would
# read list_array[-1], the LAST number, and could mark an unrelated first
# number (e.g. 5 in "5) .. 9) .. 4)"), so that case is handled explicitly.
#
# @param regex1 [Regexp] pattern used to collect the numbers (via to_i)
# @param regex2 [Regexp] pattern passed on for the actual substitution
# @param replacement [String] marker appended to each accepted number
# @param strip [Boolean] forwarded to substitute_found_list_items
# @return [Array<Integer>] the numbers collected with +regex1+
def scan_lists(regex1, regex2, replacement, strip: false)
  list_array = @text.scan(regex1).map(&:to_i)
  list_array.each_with_index do |a, i|
    following = list_array[i + 1]
    # A negative index would wrap to the end of the array; use nil instead.
    preceding = i.zero? ? nil : list_array[i - 1]

    next unless (a + 1).eql?(following) ||
                (a - 1).eql?(preceding) ||
                (a.eql?(0) && preceding.eql?(9)) ||
                (a.eql?(9) && following.eql?(0))

    substitute_found_list_items(regex2, a, strip, replacement)
  end
end
substitute_found_list_items(regex, a, strip, replacement)
click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 122
# Rewrites, in place, every match of +regex+ in @text that represents the
# number +a+: the match is replaced by the number followed by +replacement+.
# Matches for other numbers are left as found.
#
# @param regex [Regexp] pattern locating candidate list numbers
# @param a [Integer] the number to mark
# @param strip [Boolean] when true, a match is compared after removing
#   surrounding whitespace and its last character; when false, as is
# @param replacement [String] marker appended after the number
# @return the result of String#gsub! on @text
def substitute_found_list_items(regex, a, strip, replacement)
  wanted = a.to_s
  @text.gsub!(regex) do |match|
    candidate = strip ? match.strip.chop : match
    next match unless wanted.eql?(candidate)

    Regexp.escape(wanted) + replacement
  end
end