class Html2Text

Constants

DO_NOT_TOUCH_WHITESPACE
VERSION

Attributes

doc[R]

Public Class Methods

convert(html) click to toggle source
# File lib/html2text.rb, line 10
# Converts HTML into plain text.
#
# The argument is coerced with +to_s+, so nil and non-String values are
# accepted. Input recognised by is_office_document? first has its
# Office-specific paragraph markup rewritten to <br> tags. Entities and
# line endings are normalised before the markup is handed to
# Nokogiri::HTML, and the parsed document is rendered by Html2Text#convert.
def self.convert(html)
  markup = html.to_s

  if is_office_document?(markup)
    # Emulate the CSS rendering of Office documents
    office_substitutions = {
      "<p class=MsoNormal>" => "<br>",
      "<o:p>&nbsp;</o:p>" => "<br>",
      "<o:p></o:p>" => ""
    }
    office_substitutions.each do |office_tag, replacement|
      markup = markup.gsub(office_tag, replacement)
    end
  end

  # Wrap fragments in a <div> to stop Nokogiri from inserting <p> tags
  markup = "<div>#{markup}</div>" unless markup.include?("<html")

  parsed = Nokogiri::HTML(fix_newlines(replace_entities(markup)))
  Html2Text.new(parsed).convert
end
fix_newlines(text) click to toggle source
# File lib/html2text.rb, line 31
# Normalises line endings: every "\r\n" pair and every remaining lone "\r"
# becomes a single "\n". Returns a new string.
def self.fix_newlines(text)
  # Order matters: CRLF pairs are collapsed before lone CRs are rewritten,
  # otherwise each pair would turn into two newlines.
  ["\r\n", "\r"].reduce(text) do |normalised, line_ending|
    normalised.gsub(line_ending, "\n")
  end
end
new(doc) click to toggle source
# File lib/html2text.rb, line 6
# Builds a converter around +doc+, keeping a reference to it in @doc.
# Html2Text.convert passes the result of Nokogiri::HTML here, so +doc+ is
# presumably always a parsed Nokogiri document -- confirm for other callers.
def initialize(doc)
  @doc = doc
end
replace_entities(text) click to toggle source
# File lib/html2text.rb, line 35
# Replaces non-breaking spaces (the "&nbsp;" entity and the literal U+00A0
# character) with ordinary spaces and removes the "&zwnj;" entity.
# Returns a new string.
def self.replace_entities(text)
  without_nbsp_entity = text.gsub("&nbsp;", " ")
  without_nbsp = without_nbsp_entity.gsub("\u00a0", " ")
  without_nbsp.gsub("&zwnj;", "")
end

Private Class Methods

is_office_document?(text) click to toggle source
# File lib/html2text.rb, line 66
# Reports whether +text+ mentions the Microsoft Office XML namespace
# prefix, which this class treats as the sign of an Office-generated
# document.
def self.is_office_document?(text)
  office_namespace = "urn:schemas-microsoft-com:office"
  !text.index(office_namespace).nil?
end

Public Instance Methods

convert() click to toggle source
# File lib/html2text.rb, line 39
# Renders the wrapped document as plain text: walks the node tree, tidies
# whitespace around line breaks and tabs, collapses runs of empty lines,
# and strips leading/trailing whitespace from the final result.
def convert
  rendered = iterate_over(doc)
  tidied = remove_unnecessary_empty_lines(remove_leading_and_trailing_whitespace(rendered))
  tidied.strip
end
remove_leading_and_trailing_whitespace(text) click to toggle source
# File lib/html2text.rb, line 48
# Strips spaces and tabs surrounding each newline, and spaces surrounding
# each tab, everywhere except inside the regions delimited by
# DO_NOT_TOUCH_WHITESPACE (the rendered <pre> blocks), which are copied
# through untouched. The delimiters themselves are dropped from the result.
def remove_leading_and_trailing_whitespace(text)
  # Splitting on the marker alternates segments: even positions are
  # ordinary text, odd positions are protected <pre> content.
  segments = text.split(DO_NOT_TOUCH_WHITESPACE)

  cleaned = segments.each_with_index.map do |segment, position|
    next segment if position.odd?

    segment.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
  end

  cleaned.join("")
end

Private Instance Methods

image_text(node) click to toggle source
# File lib/html2text.rb, line 219
# Text stand-in for an image node: its "title" attribute in square
# brackets, falling back to its "alt" attribute, or an empty string when
# the node has neither.
def image_text(node)
  label = node.attribute("title") || node.attribute("alt")
  label ? "[#{label}]" : ""
end
iterate_over(node) click to toggle source
# File lib/html2text.rb, line 85
# Recursively renders +node+ and its descendants as text.
#
# * a <br> directly followed by non-blank text becomes a single newline
# * text nodes have their whitespace runs collapsed
# * style/head/title/meta/script elements produce nothing
# * <pre> content is emitted verbatim between DO_NOT_TOUCH_WHITESPACE markers
# * any other element is its prefix whitespace, its rendered children and
#   its suffix whitespace joined together; links are then passed through
#   wrap_link and images are replaced by image_text
def iterate_over(node)
  tag = node.name.downcase

  return "\n" if tag == "br" && next_node_is_text?(node)
  return trimmed_whitespace(node.text) if node.text?

  case tag
  when "style", "head", "title", "meta", "script"
    return ""
  when "pre"
    return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}"
  end

  pieces = [prefix_whitespace(node)]
  node.children.each { |child| pieces << iterate_over(child) }
  pieces << suffix_whitespace(node)

  # The whitespace helpers return nil for tags they do not handle.
  rendered = pieces.compact.join("")

  case tag
  when "a" then wrap_link(node, rendered)
  when "img" then image_text(node)
  else rendered
  end
end
next_node_is_text?(node) click to toggle source
# File lib/html2text.rb, line 241
# True when the sibling immediately after +node+ is a text node that
# contains something other than whitespace.
def next_node_is_text?(node)
  following = node.next_sibling
  return false if following.nil?

  following.text? && !following.text.strip.empty?
end
next_node_name(node) click to toggle source
# File lib/html2text.rb, line 229
# Lower-cased name of the first element among the siblings that follow
# +node+, skipping non-element nodes; nil when no such element exists.
def next_node_name(node)
  candidate = node.next_sibling
  candidate = candidate.next_sibling until candidate.nil? || candidate.element?

  candidate.name.downcase if candidate
end
prefix_whitespace(node) click to toggle source
# File lib/html2text.rb, line 117
# Whitespace (or marker text) emitted before the content of +node+, chosen
# by its lower-cased tag name. Returns nil for tags not listed here.
def prefix_whitespace(node)
  case node.name.downcase
  when "hr" then "\n---------------------------------------------------------------\n"
  when "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "p" then "\n\n"
  when "tr" then "\n"
  when "td", "th" then "\t"
  when "li" then "- "
  when "div"
    # A div whose stripped text equals that of its parent div gets no
    # break of its own.
    enclosing = node.parent
    mirrors_parent = enclosing.name == "div" && (enclosing.text.strip == node.text.strip)
    mirrors_parent ? "" : "\n"
  end
end
previous_node_is_text?(node) click to toggle source
# File lib/html2text.rb, line 257
# True when the sibling immediately before +node+ is a text node that
# contains something other than whitespace.
def previous_node_is_text?(node)
  preceding = node.previous_sibling
  return false if preceding.nil?

  preceding.text? && !preceding.text.strip.empty?
end
previous_node_name(node) click to toggle source
# File lib/html2text.rb, line 245
# Lower-cased name of the nearest element among the siblings that precede
# +node+, skipping non-element nodes; nil when no such element exists.
def previous_node_name(node)
  candidate = node.previous_sibling
  candidate = candidate.previous_sibling until candidate.nil? || candidate.element?

  candidate.name.downcase if candidate
end
remove_unnecessary_empty_lines(text) click to toggle source
# File lib/html2text.rb, line 70
# Collapses every run of two or more consecutive newlines down to exactly
# two, so the output never contains more than one empty line in a row.
def remove_unnecessary_empty_lines(text)
  newline_run = /\n\n\n*/im
  text.gsub(newline_run) { "\n\n" }
end
suffix_whitespace(node) click to toggle source
# File lib/html2text.rb, line 146
# Whitespace emitted after the content of +node+, chosen by its
# lower-cased tag name. Returns nil for tags not listed here, and for
# <br>/<div> nodes when no break is required.
def suffix_whitespace(node)
  case node.name.downcase
  when "h1", "h2", "h3", "h4", "h5", "h6", "p" then "\n\n"
  when "li" then "\n"
  when "br"
    # Break only when another element follows and it is not a div.
    upcoming = next_node_name(node)
    "\n" unless upcoming.nil? || upcoming == "div"
  when "div"
    return "\n" if next_node_is_text?(node)

    # Otherwise the same rule as for <br>.
    upcoming = next_node_name(node)
    "\n" unless upcoming.nil? || upcoming == "div"
  end
end
trimmed_whitespace(text) click to toggle source
# File lib/html2text.rb, line 74
# Collapses every run of tab, newline, form-feed, carriage-return and
# space characters in +text+ into a single space and returns the result
# as a new string.
#
# Text whose bytes are not valid in its declared encoding is reinterpreted
# as Windows-1252 and transcoded to UTF-8 first. Bytes that cannot be
# converted are replaced with "?" instead of raising. The caller's string
# is never modified, so frozen input is accepted.
def trimmed_whitespace(text)
  unless text.valid_encoding?
    # Work on a copy: force_encoding mutates its receiver and would raise
    # on a frozen string. undef: :replace covers bytes such as 0x81 that
    # have no Windows-1252 mapping and would otherwise raise
    # Encoding::UndefinedConversionError.
    text = text.dup.force_encoding("WINDOWS-1252")
               .encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
  end

  # Replace whitespace characters with a space (equivalent to \s)
  text.gsub(/[\t\n\f\r ]+/im, " ")
end