class PragmaticSegmenter::List

This class searches for a list within a string and adds newlines before each list item.

Constants

ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX

Rubular: rubular.com/r/wMpnVedEIb

ALPHABETICAL_LIST_WITH_PARENS

Rubular: rubular.com/r/Gu5rQapywf

ALPHABETICAL_LIST_WITH_PERIODS

Rubular: rubular.com/r/XcpaJKH0sz

EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX

Rubular: rubular.com/r/NsNFSqrNvJ

LATIN_NUMERALS
ListMarkerRule
NUMBERED_LIST_PARENS_REGEX
NUMBERED_LIST_REGEX_1
NUMBERED_LIST_REGEX_2
ROMAN_NUMERALS
ROMAN_NUMERALS_IN_PARENTHESES

Rubular: rubular.com/r/GcnmQt4a3I

SpaceBetweenListItemsFirstRule

Rubular: rubular.com/r/Wv4qLdoPx7

SpaceBetweenListItemsSecondRule

Rubular: rubular.com/r/AizHXC6HxK

SpaceBetweenListItemsThirdRule

Rubular: rubular.com/r/GE5q6yID2j

SubstituteListPeriodRule

Attributes

text[R]

Public Class Methods

new(text:) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 49
# Builds a list formatter that works on its own copy of the input.
#
# text - the string to scan. It is duplicated here, so the in-place edits
#        (gsub!) performed by this class's methods are made on the copy
#        held in @text rather than on the caller's object.
def initialize(text:)
  @text = text.dup
end

Public Instance Methods

add_line_break() click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 53
# Runs every list-formatting pass over the text in a fixed order:
# alphabetical lists, roman-numeral lists, numbered lists written with
# periods, and finally numbered lists written with parentheses.
#
# Returns whatever the final pass (format_numbered_list_with_parens) returns.
def add_line_break
  passes = %i[
    format_alphabetical_lists
    format_roman_numeral_lists
    format_numbered_list_with_periods
    format_numbered_list_with_parens
  ]
  # Order matters: the passes share @text, so later passes see earlier edits.
  passes.map { |pass| send(pass) }.last
end
replace_parens() click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 60
# Rewrites every match of ROMAN_NUMERALS_IN_PARENTHESES in place as
# '&✂&' + first capture group + '&⌬&'.
#
# Returns the (possibly modified) text, even when nothing matched.
def replace_parens
  text.tap do |buffer|
    buffer.gsub!(ROMAN_NUMERALS_IN_PARENTHESES, '&✂&\1&⌬&'.freeze)
  end
end

Private Instance Methods

add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 136
# Processes alphabetical list markers that use parentheses by delegating to
# iterate_alphabet_array with ALPHABETICAL_LIST_WITH_PARENS.
#
# roman_numeral - forwarded unchanged; selects the roman-numeral alphabet
#                 instead of the latin one inside iterate_alphabet_array.
def add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
  options = { parens: true, roman_numeral: roman_numeral }
  iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS, **options)
end
add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 132
# Processes alphabetical list markers that use periods by delegating to
# iterate_alphabet_array with ALPHABETICAL_LIST_WITH_PERIODS.
#
# roman_numeral - forwarded unchanged; selects the roman-numeral alphabet
#                 instead of the latin one inside iterate_alphabet_array.
def add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
  # parens: false is iterate_alphabet_array's default, spelled out here to
  # mirror the parenthesised variant of this method.
  iterate_alphabet_array(
    ALPHABETICAL_LIST_WITH_PERIODS,
    parens: false,
    roman_numeral: roman_numeral
  )
end
add_line_breaks_for_numbered_list_with_parens() click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 105
# Applies SpaceBetweenListItemsThirdRule to the text, but only when:
#   * the text contains at least one '☝' marker, and
#   * no two markers are already separated by a "\n" or "\r".
#
# Returns the result of Rule.apply, or nil when the rule is skipped.
def add_line_breaks_for_numbered_list_with_parens
  return unless @text.include?('☝')
  # Markers already separated by a line break: leave the text alone.
  return if @text =~ /☝.+\n.+☝|☝.+\r.+☝/

  Rule.apply(@text, SpaceBetweenListItemsThirdRule)
end
add_line_breaks_for_numbered_list_with_periods() click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 93
# Applies SpaceBetweenListItemsFirstRule and SpaceBetweenListItemsSecondRule
# to the text, but only when:
#   * the text contains at least one '♨' marker,
#   * no two markers are already separated by a "\n" or "\r", and
#   * the text has no "for <1-2 digits>♨ <lowercase letter>" sequence.
#
# Returns the result of Rule.apply, or nil when the rules are skipped.
def add_line_breaks_for_numbered_list_with_periods
  return unless @text.include?('♨')
  # Markers already separated by a line break: leave the text alone.
  return if @text =~ /♨.+\n.+♨|♨.+\r.+♨/
  # A marked number right after "for" and before a lowercase word is skipped.
  return if @text =~ /for\s\d{1,2}♨\s[a-z]/

  Rule.apply(@text, SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
end
format_alphabetical_lists() click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 79
# Handles lists labelled with latin letters: first the period-style markers,
# then the parenthesis-style markers, both with roman_numeral disabled.
#
# Returns the result of the parenthesis-style pass.
def format_alphabetical_lists
  latin_mode = { roman_numeral: false }
  add_line_breaks_for_alphabetical_list_with_periods(**latin_mode)
  add_line_breaks_for_alphabetical_list_with_parens(**latin_mode)
end
format_numbered_list_with_parens() click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 67
# Handles numbered lists whose markers are followed by a parenthesis.
#
# Steps, in order (all operate on @text):
#   1. replace_parens_in_numbered_list tags recognised list numbers with '☝'.
#   2. add_line_breaks_for_numbered_list_with_parens applies
#      SpaceBetweenListItemsThirdRule when its guard conditions hold.
#   3. ListMarkerRule is applied last.
#      NOTE(review): presumably this cleans up the temporary '☝' markers —
#      confirm against the rule's definition.
#
# Returns the result of Rule.apply for ListMarkerRule.
def format_numbered_list_with_parens
  replace_parens_in_numbered_list
  add_line_breaks_for_numbered_list_with_parens
  Rule.apply(@text, ListMarkerRule)
end
format_numbered_list_with_periods() click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 73
# Handles numbered lists whose markers are followed by a period.
#
# Steps, in order (all operate on @text):
#   1. replace_periods_in_numbered_list tags recognised list numbers with '♨'.
#   2. add_line_breaks_for_numbered_list_with_periods applies the
#      SpaceBetweenListItems First/Second rules when its guard conditions hold.
#   3. SubstituteListPeriodRule is applied last.
#      NOTE(review): presumably this swaps the temporary '♨' markers for a
#      placeholder period — confirm against the rule's definition.
#
# Returns the result of Rule.apply for SubstituteListPeriodRule.
def format_numbered_list_with_periods
  replace_periods_in_numbered_list
  add_line_breaks_for_numbered_list_with_periods
  Rule.apply(@text, SubstituteListPeriodRule)
end
format_roman_numeral_lists() click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 84
# Handles lists labelled with roman numerals: first the period-style markers,
# then the parenthesis-style markers, both with roman_numeral enabled.
#
# Returns the result of the parenthesis-style pass.
def format_roman_numeral_lists
  roman_mode = { roman_numeral: true }
  add_line_breaks_for_alphabetical_list_with_periods(**roman_mode)
  add_line_breaks_for_alphabetical_list_with_parens(**roman_mode)
end
iterate_alphabet_array(regex, parens: false, roman_numeral: false) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 184
# Collects candidate list labels from the text and hands each one to the
# appropriate replacement helper.
#
# regex         - pattern whose matches (via String#scan) are the candidates.
# parens        - forwarded to the replacement helpers.
# roman_numeral - when true the labels are checked against ROMAN_NUMERALS,
#                 otherwise against LATIN_NUMERALS.
#
# Returns the filtered, lower-cased array of candidate labels.
def iterate_alphabet_array(regex, parens: false, roman_numeral: false)
  alphabet = roman_numeral ? ROMAN_NUMERALS : LATIN_NUMERALS
  labels = @text.scan(regex).map { |found| Unicode::downcase(found) }
  # Keep a label only if it occurs inside at least one alphabet entry.
  labels.keep_if { |label| alphabet.any? { |entry| entry.include?(label) } }

  final_index = labels.length - 1
  labels.each_with_index do |label, index|
    # The last label has no successor, so it gets its own adjacency check.
    if index == final_index
      last_array_item_replacement(label, index, alphabet, labels, parens)
    else
      other_items_replacement(label, index, alphabet, labels, parens)
    end
  end
end
last_array_item_replacement(a, i, alphabet, list_array, parens) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 166
# Decides whether the final candidate label belongs to a list and, if so,
# triggers its replacement.
#
# a          - the candidate label.
# i          - index of `a` within list_array.
# alphabet   - ordered array of valid labels.
# list_array - all candidate labels found in the text.
# parens     - forwarded to replace_correct_alphabet_list.
#
# The label is replaced only when both it and the preceding candidate are in
# the alphabet and sit exactly one position apart there.
# Returns nil when skipped, otherwise the helper's result.
def last_array_item_replacement(a, i, alphabet, list_array, parens)
  # For i == 0 this index is -1, i.e. the label itself in a one-item list.
  previous = list_array[i - 1]
  return if (alphabet & list_array).empty?
  return unless alphabet.include?(previous) && alphabet.include?(a)

  distance = alphabet.index(previous) - alphabet.index(a)
  return unless distance.abs == 1

  replace_correct_alphabet_list(a, parens)
end
other_items_replacement(a, i, alphabet, list_array, parens) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 174
# Decides whether a non-final candidate label belongs to a list and, if so,
# triggers its replacement.
#
# a          - the candidate label.
# i          - index of `a` within list_array.
# alphabet   - ordered array of valid labels.
# list_array - all candidate labels found in the text.
# parens     - forwarded to replace_correct_alphabet_list.
#
# The label, its predecessor and its successor must all be in the alphabet.
# It is then replaced when the successor is the very next alphabet entry, or
# when the predecessor is one position away in either direction.
# Returns nil when skipped, otherwise the helper's result.
def other_items_replacement(a, i, alphabet, list_array, parens)
  # For i == 0 the predecessor index is -1, which reads the last candidate.
  before = list_array[i - 1]
  after = list_array[i + 1]
  return if (alphabet & list_array).empty?
  return unless [before, a, after].all? { |label| alphabet.include?(label) }

  position = alphabet.index(a)
  followed_by_next = (alphabet.index(after) - position) == 1
  beside_previous = (alphabet.index(before) - position).abs == 1
  return unless followed_by_next || beside_previous

  replace_correct_alphabet_list(a, parens)
end
replace_alphabet_list(a) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 142
# Marks every period-style occurrence of the label `a` in the text.
#
# Each match of ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX whose text,
# minus a trailing '.', equals `a` is rewritten as "\r" + label + '∯'.
# All other matches are left as they are.
#
# Returns the result of String#gsub! (nil when the regex matched nothing).
def replace_alphabet_list(a)
  marked_label = "\r#{Regexp.escape(a.to_s)}∯"
  @text.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX) do |match|
    a.eql?(match.chomp('.')) ? marked_label : match
  end
end
replace_alphabet_list_parens(a) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 148
# Marks every parenthesis-style occurrence of the label `a` in the text.
#
# For each match of EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX:
#   * with a '(' in it: when the lower-cased match minus '(' equals `a`, it
#     becomes "\r&✂&" + the match without '(';
#   * without '(': when the lower-cased match equals `a`, it becomes
#     "\r" + the match;
#   * otherwise it is left as it is.
#
# Returns the result of String#gsub! (nil when the regex matched nothing).
def replace_alphabet_list_parens(a)
  @text.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX) do |match|
    opens_paren = match.include?('(')
    lowered = Unicode::downcase(match.dup)
    lowered = lowered.delete('(') if opens_paren

    if !a.eql?(lowered)
      match
    elsif opens_paren
      "\r&✂&#{Regexp.escape(match.delete('('))}"
    else
      "\r#{Regexp.escape(match)}"
    end
  end
end
replace_correct_alphabet_list(a, parens) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 158
# Dispatches the label `a` to the replacement routine matching the list style.
#
# a      - the label to mark in the text.
# parens - truthy for parenthesis-style lists, falsy for period-style lists.
#
# Returns whatever the chosen routine returns.
def replace_correct_alphabet_list(a, parens)
  return replace_alphabet_list_parens(a) if parens

  replace_alphabet_list(a)
end
replace_parens_in_numbered_list() click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 99
# Tags the numbers of parenthesis-style numbered lists with '☝' by running
# scan_lists with NUMBERED_LIST_PARENS_REGEX as both the scanning and the
# substitution pattern.
#
# Returns the result of the second scan_lists call.
def replace_parens_in_numbered_list
  pattern = NUMBERED_LIST_PARENS_REGEX
  scan_lists(pattern, pattern, '☝')
  # NOTE(review): the identical second pass is kept from the original;
  # presumably it catches numbers that only look consecutive once the first
  # pass has tagged their neighbours — confirm before removing.
  scan_lists(pattern, pattern, '☝')
end
replace_periods_in_numbered_list() click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 89
# Tags the numbers of period-style numbered lists with '♨'.
#
# NUMBERED_LIST_REGEX_1 finds the candidate numbers and NUMBERED_LIST_REGEX_2
# is the pattern used for substitution; strip: true makes the substitution
# step compare each match after stripping whitespace and dropping its last
# character.
#
# Returns the result of scan_lists.
def replace_periods_in_numbered_list
  marker = '♨'
  scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, marker, strip: true)
end
scan_lists(regex1, regex2, replacement, strip: false) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 111
# Finds numbers in the text that look like consecutive list items and tags
# them with a marker.
#
# regex1      - pattern used with String#scan to collect candidate numbers
#               (each match is converted with to_i).
# regex2      - pattern handed to substitute_found_list_items for the actual
#               in-place substitution.
# replacement - marker string appended to each recognised number.
# strip       - forwarded to substitute_found_list_items.
#
# A number counts as a list item when the next candidate is one greater, the
# previous candidate is one smaller, or the pair wraps around 9 -> 0.
#
# Returns the array of candidate numbers.
def scan_lists(regex1, regex2, replacement, strip: false)
  list_array = @text.scan(regex1).map(&:to_i)
  list_array.each_with_index do |a, i|
    # The first candidate has no predecessor. Reading list_array[i - 1] at
    # i == 0 would wrap around to the LAST candidate and could wrongly tag
    # the first number (e.g. 2, 5, 1 would tag "2").
    previous_item = i > 0 ? list_array[i - 1] : nil
    next_item = list_array[i + 1] # nil past the end, so the checks fail safely
    next unless (a + 1).eql?(next_item) ||
                (a - 1).eql?(previous_item) ||
                (a.eql?(0) && previous_item.eql?(9)) ||
                (a.eql?(9) && next_item.eql?(0))
    substitute_found_list_items(regex2, a, strip, replacement)
  end
end
substitute_found_list_items(regex, a, strip, replacement) click to toggle source
# File lib/pragmatic_segmenter/list.rb, line 122
# Appends a marker to every occurrence of the number `a` matched by `regex`.
#
# regex       - pattern whose matches are inspected one by one.
# a           - the list number to tag (compared via its to_s form).
# strip       - when truthy, each match is compared after stripping
#               surrounding whitespace and dropping its last character;
#               otherwise the raw match is compared.
# replacement - marker string appended to the number.
#
# Matches that do not correspond to `a` are left as they are.
# Returns the result of String#gsub! (nil when the regex matched nothing).
def substitute_found_list_items(regex, a, strip, replacement)
  number = a.to_s
  @text.gsub!(regex) do |match|
    candidate = strip ? match.strip.chop : match
    if number.eql?(candidate)
      Regexp.escape(number) + replacement
    else
      match
    end
  end
end