module Sterile

Constants

VERSION

Public Class Methods

decode_entities(string) click to toggle source

The reverse of encode_entities. Turns HTML or numeric entities into their Unicode counterparts.

# File lib/sterile/entities.rb, line 26
# Decode HTML entities in +string+ into the Unicode characters they denote.
#
# Three entity forms are recognised: hexadecimal (<tt>&#x41;</tt>), decimal
# (<tt>&#65;</tt>) and named (<tt>&amp;</tt>). Named entities are looked up in
# +html_entities_data+; a name with no entry there is left untouched.
#
# The string is scanned in a single pass, so text produced by decoding one
# entity is never decoded a second time (<tt>&#x26;#65;</tt> becomes
# <tt>&#65;</tt>, not <tt>A</tt>). The argument itself is never modified, which
# also makes frozen strings safe to pass in.
#
# Returns a new String.
def decode_entities(string)
  string.gsub(/&(?:#x([0-9a-fA-F]{1,7})|#(\d{1,7})|([a-zA-Z0-9]+));/) do
    if $1
      # Hexadecimal numeric entity; only real hex digits are accepted.
      [$1.to_i(16)].pack("U")
    elsif $2
      # Decimal numeric entity.
      [$2.to_i].pack("U")
    else
      # Named entity: fall back to the original text when the name is unknown.
      codepoint = html_entities_data[$3]
      codepoint ? [codepoint].pack("U") : $&
    end
  end
end
encode_entities(string) click to toggle source

Turn Unicode characters into their HTML equivalents. If a valid HTML entity is not possible, it will create a numeric entity.

%q{“Economy Hits Bottom,” ran the headline}.encode_entities # => &ldquo;Economy Hits Bottom,&rdquo; ran the headline
# File lib/sterile/entities.rb, line 12
# Replace characters in +string+ with HTML entities.
#
# Codepoints 32 through 126 are emitted as the first element of their mapping.
# Every other codepoint becomes an entity: the name held in the third element
# of the mapping when there is one, otherwise a decimal numeric entity.
def encode_entities(string)
  transmogrify(string) do |mapping, codepoint|
    next mapping[0] if codepoint >= 32 && codepoint <= 126

    entity = mapping[2] || "#" + codepoint.to_s
    "&" + entity + ";"
  end
end
gsub_tags(string) { |content| ... } click to toggle source

Similar to gsub, except it works in between HTML/XML tags and yields text to a block. Text will be replaced by what the block returns. Warning: does not work in some degenerate cases.

# File lib/sterile/tags.rb, line 57
# Parse +string+ as an HTML fragment, pass the content of every text node to
# the block and replace that content with the block's return value.
#
# Returns the fragment serialised back to HTML.
# Raises RuntimeError ("No block given") when called without a block.
def gsub_tags(string, &block)
  raise "No block given" if block.nil?

  doc = Nokogiri::HTML::DocumentFragment.parse(string)
  doc.traverse do |element|
    next unless element.text?

    element.content = block.call(element.content)
  end
  doc.to_html
end
plain_format(string) click to toggle source
# File lib/sterile/plain_format.rb, line 7
# Entity-encode +string+, then apply each of the +plain_format_rules+ to the
# encoded copy in order. A rule is indexed as [pattern, replacement].
#
# Returns the encoded and rewritten string.
def plain_format(string)
  plain_format_rules.each_with_object(string.encode_entities) do |rule, text|
    text.gsub!(rule[0], rule[1])
  end
end
plain_format_tags(string) click to toggle source

Like plain_format, but works with HTML/XML (somewhat).

# File lib/sterile/plain_format.rb, line 18
# Apply plain_format to the text between the tags of +string+ (via gsub_tags),
# decoding entities in each piece of text, then entity-encode the whole result.
def plain_format_tags(string)
  formatted = string.gsub_tags { |text| text.plain_format.decode_entities }
  formatted.encode_entities
end
scan_tags(string) { |content| ... } click to toggle source

Iterates over all text in between HTML/XML tags and yields it to a block. Warning: does not work in some degenerate cases.

# File lib/sterile/tags.rb, line 72
# Parse +string+ as an HTML fragment and pass the content of every text node
# to the block. The block's return value is ignored.
#
# Always returns nil.
# Raises RuntimeError ("No block given") when called without a block.
def scan_tags(string, &block)
  raise "No block given" if block.nil?

  doc = Nokogiri::HTML::DocumentFragment.parse(string)
  doc.traverse do |element|
    block.call(element.content) if element.text?
  end
  nil
end
sluggerize(string, options = {}) click to toggle source

Transliterate to ASCII, downcase and format for URL permalink/slug by stripping out all non-alphanumeric characters and replacing spaces with a delimiter (defaults to '-').

"Hello World!".sluggerize # => "hello-world"
# File lib/sterile/utilities.rb, line 32
# Build a URL slug from +string+.
#
# The string is sterilized and stripped, whitespace runs become hyphens,
# everything except ASCII letters, digits and hyphens is removed, and each run
# of hyphens is replaced by the delimiter. The result is downcased.
#
# options:
#   :delimiter - separator placed between words (default "-")
def sluggerize(string, options = {})
  delimiter = { :delimiter => "-" }.merge!(options)[:delimiter]

  slug = sterilize(string).strip
  slug = slug.gsub(/\s+/, "-")
  slug = slug.gsub(/[^a-zA-Z0-9\-]/, "")
  slug.gsub(/-+/, delimiter).downcase
end
Also aliased as: to_slug
smart_format(string) click to toggle source

Format text with proper “curly” quotes, m-dashes, copyright, trademark, etc.

%q{"He said, 'Away with you, Drake!'"}.smart_format # => “He said, ‘Away with you, Drake!’”
# File lib/sterile/smart_format.rb, line 11
# Apply each of the +smart_format_rules+ to a copy of +string+, in order.
# A rule is indexed as [pattern, replacement].
#
# The argument is converted with +to_s+ and always duplicated before the
# in-place substitutions run, so the caller's string is never modified
# (previously only frozen strings were copied, and a mutable argument was
# rewritten in place).
#
# Returns the formatted copy.
def smart_format(string)
  formatted = string.to_s.dup
  smart_format_rules.each do |rule|
    formatted.gsub!(rule[0], rule[1])
  end
  formatted
end
smart_format_tags(string) click to toggle source

Like smart_format, but works with HTML/XML (somewhat).

# File lib/sterile/smart_format.rb, line 23
# Apply smart_format to the text between the tags of +string+ and
# entity-encode the result.
#
# Before formatting, whitespace that sits just inside a closing tag and is
# followed by a quote plus a letter is moved outside the tag, e.g.
# "<em>Dan. </em>'And". After encoding, an opening quote entity that directly
# follows a closing tag is turned into the matching closing quote entity.
def smart_format_tags(string)
  prepared = string.gsub(/[\p{Z}\s]+(<\/[a-zA-Z]+>)(['"][a-zA-Z])/, "\\1 \\2")
  formatted = prepared.gsub_tags { |text| text.smart_format }

  encoded = formatted.encode_entities
  encoded = encoded.gsub(/(\<\/\w+\>)&ldquo;/, "\\1&rdquo;")
  encoded.gsub(/(\<\/\w+\>)&lsquo;/, "\\1&rsquo;")
end
sterilize(string) click to toggle source

Transliterate to ASCII and strip out any HTML/XML tags.

"<b>nåsty</b>".sterilize # => "nasty"
# File lib/sterile/utilities.rb, line 21
# Transliterate +string+ and then strip tags from the transliterated text.
def sterilize(string)
  ascii = transliterate(string)
  strip_tags(ascii)
end
strip_tags(string, options = {}) click to toggle source

Remove HTML/XML tags from text. Also strips out comments, PHP and ERB style tags. CDATA is considered text unless :keep_cdata => false is specified. Redundant whitespace will be removed unless :keep_whitespace => true is specified.

# File lib/sterile/tags.rb, line 13
# Remove markup from +string+ and return the remaining text.
#
# Processing order: PHP/ERB style tags, comments, CDATA sections, then
# HTML/XML tags. The argument is never modified (every step builds a new
# string), so frozen strings are accepted.
#
# Comments and CDATA sections are matched non-greedily across lines, so a
# comment containing a hyphen or a CDATA section containing "]" is handled.
#
# options:
#   :keep_whitespace - when true, return the text as-is instead of passing it
#                      through trim_whitespace (default false)
#   :keep_cdata      - when true, keep the content of CDATA sections; when
#                      false, drop it (default true)
def strip_tags(string, options = {})
  options = {
    :keep_whitespace => false,
    :keep_cdata      => true
  }.merge!(options)

  text = string.gsub(/<[%?](php)?[^>]*>/, '') # strip php, erb et al
  text = text.gsub(/<!--.*?-->/m, '')         # strip comments

  # Unwrap or drop CDATA sections; kept content still goes through the tag
  # stripping below.
  text = text.gsub(
    /
      <!\[CDATA\[
      (.*?)
      \]\]>
    /xmi,
    options[:keep_cdata] ? '\\1' : ''
  )

  html_name = /[\w:-]+/
  html_data = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
  html_attr = /(#{html_name}(\s*=\s*#{html_data})?)/

  # Opening, closing and self-closing tags with optional attributes.
  text = text.gsub(
    /
      <
      [\/]?
      #{html_name}
      (\s+(#{html_attr}(\s+#{html_attr})*))?
      \s*
      [\/]?
      >
    /xi,
    ''
  )

  options[:keep_whitespace] ? text : trim_whitespace(text)
end
titlecase(string) click to toggle source

Format text appropriately for titles. This method is much smarter than ActiveSupport's titlecase. The algorithm is based on work done by John Gruber et al. (daringfireball.net/2008/08/title_case_update).

# File lib/sterile/titlecase.rb, line 11
# Format +string+ as a title and return the result.
#
# Steps, in order:
# 1. Strip the text, collapse whitespace runs to single spaces, and downcase
#    it when it contains no lowercase letter at all (e.g. all-caps input).
# 2. Walk every word: things that look like a URL, domain or email are left
#    alone, "small words" (a, an, and, ...) are downcased, words without
#    internal capitals are capitalized, and any other word is kept as written.
# 3. Re-capitalize a small word at the start of the title, after sentence
#    punctuation, or after an opening quote/bracket.
# 4. Re-capitalize a small word at the end of the title or just before a
#    closing quote/bracket.
# 5. Downcase the letter that follows a single-letter prefix and a dash.
# 6. Normalise "q&a" to "Q&A".
#
# The argument is never modified: all in-place edits run on a copy, so frozen
# strings are accepted.
def titlecase(string)
  lsquo = [8216].pack("U")
  rsquo = [8217].pack("U")
  ldquo = [8220].pack("U")
  rdquo = [8221].pack("U")
  ndash = [8211].pack("U")

  # Work on a copy so the bang methods below never touch the caller's string.
  string = string.dup
  string.strip!
  string.gsub!(/\s+/, " ")
  string.downcase! unless string =~ /[[:lower:]]/

  small_words = %w{ a an and as at(?!&t) but by en for if in nor of on or the to v[.]? via vs[.]? }.join("|")
  apos = / (?: ['#{rsquo}] [[:lower:]]* )? /xu

  string.gsub!(
    /
      \b
      ([_\*]*)
      (?:
        ( [-\+\w]+ [@.\:\/] [-\w@.\:\/]+ #{apos} )      # URL, domain, or email
        |
        ( (?i: #{small_words} ) #{apos} )               # or small word, case-insensitive
        |
        ( [[:alpha:]] [[:lower:]'#{rsquo}()\[\]{}]* #{apos} )  # or word without internal caps
        |
        ( [[:alpha:]] [[:alpha:]'#{rsquo}()\[\]{}]* #{apos} )  # or some other word
      )
      ([_\*]*)
      \b
    /xu
  ) do
    ($1 ? $1 : "") +
    ($2 ? $2 : ($3 ? $3.downcase : ($4 ? $4.downcase.capitalize : $5))) +
    ($6 ? $6 : "")
  end

  # Legacy interpreters only: capitalize both words joined by an en dash.
  # The character class is the POSIX bracket [[:alpha:]] and the right-hand
  # word is $3 (the original repeated $1 and so duplicated the left word).
  if RUBY_VERSION < "1.9.0"
    string.gsub!(
      /
        \b
        ([[:alpha:]]+)
        (#{ndash})
        ([[:alpha:]]+)
        \b
      /xu
    ) do
      $1.downcase.capitalize + $2 + $3.downcase.capitalize
    end
  end

  string.gsub!(
    /
      (
        \A [[:punct:]]*     # start of title
        | [:.;?!][ ]+       # or of subsentence
        | [ ]['"#{ldquo}#{lsquo}(\[][ ]*  # or of inserted subphrase
      )
      ( #{small_words} )    # followed by a small-word
      \b
    /xiu
  ) do
    $1 + $2.downcase.capitalize
  end

  string.gsub!(
    /
      \b
      ( #{small_words} )    # small-word
      (?=
        [[:punct:]]* \Z     # at the end of the title
        |
        ['"#{rsquo}#{rdquo})\]] [ ]       # or of an inserted subphrase
      )
    /xu
  ) do
    $1.downcase.capitalize
  end

  string.gsub!(
    /
      (
        \b
        [[:alpha:]]         # single first letter
        [\-#{ndash}]               # followed by a dash
      )
      ( [[:alpha:]] )       # followed by a letter
    /xu
  ) do
    $1 + $2.downcase
  end

  string.gsub!(/q&a/i, 'Q&A')

  string
end
Also aliased as: titleize
titleize(string)
Alias for: titlecase
to_ascii(string, options = {})
Alias for: transliterate
to_slug(string, options = {})
Alias for: sluggerize
transliterate(string, options = {}) click to toggle source

Transliterate Unicode [and accented ASCII] characters to their plain-text ASCII equivalents. This is based on data from the stringex gem (github.com/rsl/stringex), which is in turn a port of Perl's Unidecode and ostensibly provides superior results to iconv. The optical conversion data is based on work by Eric Boehs at github.com/ericboehs/to_slug. Passing an option of :optical => true will prefer optical mapping instead of more pedantic matches.

"ýůçký".transliterate # => "yucky"
# File lib/sterile/transliterate.rb, line 34
# Replace each character of +string+ with an entry from its mapping.
#
# By default the first mapping entry is used, falling back to the second.
# With :optical => true the preference is reversed. A character whose mapping
# has neither entry contributes an empty string.
#
# options:
#   :optical - prefer the second mapping entry over the first (default false)
def transliterate(string, options = {})
  settings = { :optical => false }.merge!(options)
  preferred, fallback = settings[:optical] ? [1, 0] : [0, 1]

  transmogrify(string) do |mapping, _codepoint|
    mapping[preferred] || mapping[fallback] || ""
  end
end
Also aliased as: to_ascii
transmogrify(string) { |mapping, codepoint| ... } click to toggle source
# File lib/sterile/transliterate.rb, line 7
# Build a new string by yielding every codepoint of +string+ to the block.
#
# For each codepoint the block receives +mapping+ (the entry found in
# +codepoints_data+ under the codepoint's high bits, then its low byte,
# wrapped with Array()) and the integer +codepoint+. Whatever the block
# returns is appended to the result.
#
# A codepoint is skipped when +codepoints_data+ has no page for it or when the
# block returns nil/false. These cases are handled explicitly rather than by a
# blanket rescue, so an exception raised inside the block now reaches the
# caller instead of silently dropping the character.
#
# Returns the assembled String.
# Raises RuntimeError ("No block given") when called without a block.
def transmogrify(string, &block)
  raise "No block given" unless block_given?

  result = "".dup
  string.unpack("U*").each do |codepoint|
    # Data is indexed by the codepoint's high bits, then by its low byte.
    page = codepoints_data[codepoint >> 8]
    next if page.nil?

    piece = yield(Array(page[codepoint & 0xFF]), codepoint)
    result << piece if piece
  end

  result
end
trim_whitespace(string) click to toggle source

Trim whitespace from start and end of string and remove any redundant whitespace in between.

" Hello  world! ".trim_whitespace # => "Hello world!"
# File lib/sterile/utilities.rb, line 12
# Collapse every run of whitespace in +string+ to a single space, then strip
# the ends. Returns a new String.
def trim_whitespace(string)
  collapsed = string.gsub(/\s+/, " ")
  collapsed.strip
end

Private Class Methods

codepoints_data() click to toggle source

Lazy load codepoints data

# File lib/sterile/transliterate.rb, line 56
# Return the codepoint data, requiring its data file on first use and caching
# the value in @codepoints_data.
def codepoints_data
  return @codepoints_data if @codepoints_data

  require "sterile/data/codepoints_data"
  @codepoints_data = Data.codepoints_data
end
html_entities_data() click to toggle source

Lazy load html entities

# File lib/sterile/entities.rb, line 40
# Return the HTML entity data, requiring its data file on first use and
# caching the value in @html_entities_data.
def html_entities_data
  return @html_entities_data if @html_entities_data

  require "sterile/data/html_entities_data"
  @html_entities_data = Data.html_entities_data
end
plain_format_rules() click to toggle source

Lazy load plain formatting rules

# File lib/sterile/plain_format.rb, line 29
# Return the plain formatting rules, requiring their data file on first use
# and caching the value in @plain_format_rules.
def plain_format_rules
  return @plain_format_rules if @plain_format_rules

  require "sterile/data/plain_format_rules"
  @plain_format_rules = Data.plain_format_rules
end
smart_format_rules() click to toggle source

Lazy load smart formatting rules

# File lib/sterile/smart_format.rb, line 35
# Return the smart formatting rules, requiring their data file on first use
# and caching the value in @smart_format_rules.
def smart_format_rules
  return @smart_format_rules if @smart_format_rules

  require "sterile/data/smart_format_rules"
  @smart_format_rules = Data.smart_format_rules
end