class String

Constants

APPROXIMATIONS

Transliterations (like the i18n defaults); see github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb

ASCII_SPACE
BAD_ENCODING

When strings are mistakenly encoded as single-byte character sets instead of UTF-8, there are some distinctive character combinations that we can spot and fix. There is a useful table at www.i18nqa.com/debug/utf8-debug.html

BAD_ENCODING_PATTERNS
COMPOUND_NAMES
NAME_MODIFIERS
NONBREAKING_SPACE

Public Instance Methods

ansi_attributes(*args) click to toggle source
# File lib/name_tamer/string.rb, line 276
# Wraps the receiver in an ANSI escape sequence: the given attribute codes
# are joined with ';' to form the opening sequence, and the reset sequence
# ("\e[0m") is appended. Returns a new string; the receiver is not modified.
def ansi_attributes(*args)
  codes = args.join(';')
  opening = "\e[#{codes}m"
  reset = "\e[0m"
  "#{opening}#{self}#{reset}"
end
approximate_latin_chars!() click to toggle source

Any characters that resemble Latin characters might usefully be transliterated into ones that are easy to type on an anglophone keyboard.

# File lib/name_tamer/string.rb, line 68
# Replaces, in place, every non-ASCII character that has an entry in
# APPROXIMATIONS with that entry; characters without an entry are kept.
# Always returns the receiver so calls can be chained.
def approximate_latin_chars!
  gsub!(/[^\x00-\x7f]/u) do |non_ascii|
    approximation = APPROXIMATIONS[non_ascii]
    approximation || non_ascii
  end

  self # gsub! returns nil when nothing matched
end
downcase_after_apostrophe!() click to toggle source
# File lib/name_tamer/string.rb, line 82
# Lowercases, in place, a single word character that directly follows an
# apostrophe and ends the word (e.g. the possessive 'S becomes 's).
# Always returns the receiver.
def downcase_after_apostrophe!
  gsub!(/\'\w\b/) { |apostrophe_and_letter| apostrophe_and_letter.downcase }

  self # gsub! returns nil when nothing matched
end
ensure_safe!() click to toggle source
# File lib/name_tamer/string.rb, line 171
# Transcodes the receiver to UTF-8 in place, dropping (replacing with the
# empty string) any invalid byte sequences and any characters that are
# undefined in the target encoding.
def ensure_safe!
  drop_bad_sequences = { invalid: :replace, undef: :replace, replace: '' }
  encode!('UTF-8', **drop_bad_sequences)
end
ensure_space_after_initials!() click to toggle source
# File lib/name_tamer/string.rb, line 167
# Inserts, in place, a space after an initial ("X.") that is immediately
# followed by at least two letters or digits, e.g. "J.Smith" -> "J. Smith".
# Always returns the receiver.
def ensure_space_after_initials!
  # \1 is the initial together with its period
  gsub!(/\b([a-z]\.)(?=[a-z0-9]{2,})/i, '\1 ')

  self # gsub! returns nil when nothing matched
end
fix_apostrophe_modifiers!() click to toggle source
# File lib/name_tamer/string.rb, line 125
# Lowercases, in place, the modifiers "Dell'" and "D'" when they are
# preceded by any character and followed by a word character,
# e.g. "Commedia Dell'Arte" -> "Commedia dell'Arte".
# The preceding character is part of the match, so a modifier at the very
# start of the string is left alone. Always returns the receiver.
def fix_apostrophe_modifiers!
  %w(Dell D).each do |prefix|
    pattern = /(.#{prefix}')(\w)/

    gsub!(pattern) do
      match = Regexp.last_match
      lowered = match[1].rstrip.downcase
      "#{lowered}#{match[2]}"
    end
  end

  self # Allows chaining
end
fix_encoding_errors!() click to toggle source

Strings that were wrongly encoded with single-byte encodings sometimes have tell-tale substrings that we can put back into the correct UTF-8 characters.

# File lib/name_tamer/string.rb, line 74
# Replaces, in place, every substring matched by BAD_ENCODING_PATTERNS with
# its entry in BAD_ENCODING; a match with no entry is left as it was.
# Always returns the receiver.
def fix_encoding_errors!
  gsub!(BAD_ENCODING_PATTERNS) do |garbled|
    repaired = BAD_ENCODING[garbled]
    repaired || garbled
  end

  self # gsub! returns nil when nothing matched
end
fix_ff!() click to toggle source

Fix the "ff" oddities: surnames such as Fforde that are written with a lowercase initial double f (fforde)

# File lib/name_tamer/string.rb, line 103
# Rewrites, in place, a fixed list of surnames starting with "Ff" so that
# they are entirely lowercase (e.g. "Fforde" -> "fforde").
# Always returns the receiver.
def fix_ff!
  ff_surnames = %w(Fforbes Fforde Ffinch Ffrench Ffoulkes)

  ff_surnames.each do |capitalised|
    substitute!(capitalised, capitalised.downcase)
  end

  self # Allows chaining
end
fix_mac!() click to toggle source

Our list of terminal characters that indicate a non-Celtic name used to include "o", but we removed it because of MacMurdo.

# File lib/name_tamer/string.rb, line 88
# Capitalises, in place, the part of a name that follows a "Mac"/"Mc"
# prefix (e.g. "Macdonald" -> "MacDonald"), then restores a fixed list of
# names where that capitalisation is wrong (e.g. "MacHin" -> "Machin").
#
# Nothing is changed unless the string contains either a "Mc" word, or a
# "Mac" word of sufficient length whose final character is not one of
# a, c, i, z or j. Always returns the receiver.
def fix_mac!
  looks_celtic = self =~ /\bMac[A-Za-z]{2,}[^acizj]\b/ || self =~ /\bMc/
  return self unless looks_celtic

  gsub!(/\b(Ma?c)([A-Za-z]+)/) do
    prefix, remainder = Regexp.last_match.captures
    prefix + remainder.capitalize
  end

  # Fix Mac exceptions
  exceptions = %w(
    MacEdo MacEvicius MacHado MacHar MacHin MacHlin MacIas MacIulis MacKie
    MacKle MacKlin MacKmin MacKmurdo MacQuarie MacLise MacKenzie
  )
  exceptions.each do |mac_name|
    substitute!(/\b#{mac_name}/, mac_name.capitalize)
  end

  self # Allows chaining
end
fix_name_modifiers!() click to toggle source

Fixes for name modifiers followed by a space; also replaces those spaces with non-breaking spaces. Also fixes name modifiers followed by an apostrophe, e.g. d’Artagnan, Commedia dell’Arte.

# File lib/name_tamer/string.rb, line 114
# For each entry in NAME_MODIFIERS that appears at the start of a line or
# after whitespace and is followed by whitespace or a hyphen: lowercases
# the modifier in place and converts ASCII spaces in the following
# whitespace to non-breaking spaces. Then delegates to
# fix_apostrophe_modifiers! for the apostrophe forms.
# Always returns the receiver.
def fix_name_modifiers!
  NAME_MODIFIERS.each do |modifier|
    pattern = /((?:[[:space:]]|^)#{modifier})([[:space:]]+|-)/

    gsub!(pattern) do
      leading, gap = Regexp.last_match.captures
      lowered = leading.rstrip.downcase
      "#{lowered}#{gap.tr(ASCII_SPACE, NONBREAKING_SPACE)}"
    end
  end

  fix_apostrophe_modifiers!
  self # Allows chaining
end
fix_separators!(separator) click to toggle source

Make sure separators are not where they shouldn’t be

# File lib/name_tamer/string.rb, line 53
# Tidies up occurrences of +separator+ in place:
#
# * runs of two or more consecutive separators collapse to a single one;
# * a separator at the start or end of a line is removed.
#
# separator - the separator String. When nil or empty, nothing is changed.
#
# Returns the receiver.
def fix_separators!(separator)
  return self if separator.nil? || separator.empty?

  escaped = Regexp.escape(separator)

  # No more than one of the separator in a row.
  # The non-capturing group makes the quantifier apply to the whole
  # separator; without it, a multi-character separator such as "ab" would
  # compile to /ab{2,}/ and only repeat its last character.
  substitute!(/(?:#{escaped}){2,}/, separator)

  # Remove leading/trailing separator.
  substitute!(/^#{escaped}|#{escaped}$/i, '')
end
invalid_chars_to!(separator) click to toggle source

Change some characters embedded in words to our separator character, e.g. example.com -> example-com

# File lib/name_tamer/string.rb, line 30
# Replaces, in place, every period or slash that has no whitespace on
# either side with +separator+ (e.g. "example.com" -> "example-com").
# Delegates to substitute! and returns its result.
def invalid_chars_to!(separator)
  embedded_dot_or_slash = %r{(?<![[:space:]])[\.\/](?![[:space:]])}
  substitute!(embedded_dot_or_slash, separator)
end
nbsp_in_compound_name!() click to toggle source

Fix known last names that have spaces (not hyphens!)

# File lib/name_tamer/string.rb, line 141
# For every entry in COMPOUND_NAMES, replaces occurrences of that name in
# place with a copy whose ASCII spaces are turned into non-breaking
# spaces, so the name stays together. Always returns the receiver.
def nbsp_in_compound_name!
  COMPOUND_NAMES.each do |surname|
    glued = surname.tr(ASCII_SPACE, NONBREAKING_SPACE)
    substitute!(surname, glued)
  end

  self # Allows chaining
end
nbsp_in_name_modifier!() click to toggle source
# File lib/name_tamer/string.rb, line 149
# For every entry in NAME_MODIFIERS (matched case-insensitively) that is
# preceded by whitespace and followed by a single whitespace character,
# replaces that following character in place with a non-breaking space.
# Always returns the receiver.
def nbsp_in_name_modifier!
  NAME_MODIFIERS.each do |modifier|
    gsub!(/([[:space:]]#{modifier})([[:space:]])/i) do
      kept = Regexp.last_match(1)
      "#{kept}#{NONBREAKING_SPACE}"
    end
  end

  self # Allows chaining
end
presence() click to toggle source
# File lib/name_tamer/string.rb, line 4
# Returns the receiver, or nil when the receiver is the empty string.
# A string consisting only of whitespace is not empty and is returned.
def presence
  empty? ? nil : self
end
remove_periods_from_initials!() click to toggle source
# File lib/name_tamer/string.rb, line 157
# Removes, in place, the period after a single-letter initial,
# e.g. "J.R. Smith" -> "JR Smith". Always returns the receiver.
def remove_periods_from_initials!
  # \1 is the initial without its period
  gsub!(/\b([a-z])\./i, '\1')

  self # gsub! returns nil when nothing matched
end
remove_spaces_from_initials!() click to toggle source
# File lib/name_tamer/string.rb, line 161
# Removes, in place, the space after a single-letter initial (with or
# without a period) unless the space is followed by a word of two or more
# characters, e.g. "J R R Tolkien" -> "JRR Tolkien".
# Always returns the receiver.
def remove_spaces_from_initials!
  # \1 is the initial, \2 its (optional) period; the space is dropped
  gsub!(/\b([a-z])(\.)* \b(?![a-z0-9'\u00C0-\u00FF]{2,})/i, '\1\2')

  self # gsub! returns nil when nothing matched
end
safe_unescape!() click to toggle source

Unescape percent-encoded characters. This might introduce invalid UTF-8 byte sequences, so we take precautions.

# File lib/name_tamer/string.rb, line 37
# Decodes percent-encoded octets (e.g. "%20") in place.
#
# Decoding can produce byte sequences that are not valid UTF-8, so when the
# string actually changes it is passed through ensure_safe! afterwards.
# If decoding raises Encoding::CompatibilityError the receiver is left
# untouched.
#
# Returns the receiver.
def safe_unescape!
  # URI.unescape was deprecated and then removed in Ruby 3.0; the default
  # parser's #unescape performs the same %XX decoding and is available on
  # both older and newer Rubies.
  unescaped = URI::DEFAULT_PARSER.unescape(self)
rescue Encoding::CompatibilityError # e.g. "\u2019%80"
  self
else
  return self if self == unescaped

  replace unescaped
  ensure_safe!
end
space_around_comma!() click to toggle source

Ensure commas have no whitespace before them and exactly one space after them

# File lib/name_tamer/string.rb, line 24
# Normalises, in place, each comma together with any whitespace around it
# to a comma followed by one space. Delegates to substitute! and returns
# its result.
def space_around_comma!
  comma_with_padding = /[[:space:]]*,[[:space:]]*/
  substitute!(comma_with_padding, ', ')
end
strip_or_self!() click to toggle source
# File lib/name_tamer/string.rb, line 14
# Strips leading and trailing whitespace in place and always returns the
# receiver (String#strip! on its own returns nil when nothing was removed).
def strip_or_self!
  strip!
  self
end
strip_unwanted!(filter) click to toggle source

Strip illegal characters out completely

# File lib/name_tamer/string.rb, line 10
# Deletes, in place, everything matched by +filter+ by substituting the
# empty string. Delegates to substitute! and returns its result.
def strip_unwanted!(filter)
  nothing = ''
  substitute!(filter, nothing)
end
substitute!(pattern, replacement) click to toggle source
# File lib/name_tamer/string.rb, line 175
# Chainable wrapper around String#gsub!: performs the substitution in
# place and always returns the receiver, even when nothing matched
# (gsub! itself returns nil in that case).
def substitute!(pattern, replacement)
  gsub!(pattern, replacement)
  self
end
unescape_html!() click to toggle source

Remove HTML entities

# File lib/name_tamer/string.rb, line 48
# Replaces the receiver's contents with the result of CGI.unescapeHTML,
# turning HTML entities such as "&amp;" back into their characters.
# Returns the receiver.
def unescape_html!
  decoded = CGI.unescapeHTML(self)
  replace(decoded)
end
upcase_first_letter!() click to toggle source
# File lib/name_tamer/string.rb, line 78
# Upcases, in place, the first word character of every word (each word
# character that sits at a word boundary). Always returns the receiver.
def upcase_first_letter!
  gsub!(/\b\w/) { |initial| initial.upcase }

  self # gsub! returns nil when nothing matched
end
upcase_initials!() click to toggle source

Upcase words with no vowels, e.g. JPR Williams, except Ng

# File lib/name_tamer/string.rb, line 135
# Upcases, in place, every word made up solely of the consonants listed in
# the pattern (no vowels, no "y"), e.g. "Jpr" -> "JPR", and then restores
# the surname "Ng" to capitalised form. Always returns the receiver.
def upcase_initials!
  gsub!(/\b([bcdfghjklmnpqrstvwxz]+)\b/i) { |vowelless_word| vowelless_word.upcase }

  # http://en.wikipedia.org/wiki/Ng
  gsub!(/\b(NG)\b/i) { |ng| ng.capitalize }

  self
end
whitespace_to!(separator) click to toggle source

Change any whitespace into our separator character

# File lib/name_tamer/string.rb, line 19
# Replaces, in place, every run of one or more whitespace characters with
# a single +separator+. Delegates to substitute! and returns its result.
def whitespace_to!(separator)
  whitespace_run = /[[:space:]]+/
  substitute!(whitespace_run, separator)
end