class ChupaText::Decomposers::HTML

Constants

TARGET_EXTENSIONS
TARGET_MIME_TYPES

Public Instance Methods

decompose(data) { |decomposed_data| ... }
# File lib/chupa-text/decomposers/html.rb, line 47
# Parses the HTML held in +data.body+ and yields one TextData containing
# the text extracted from the <body> element.
#
# The parser encoding comes from guess_encoding. When the document has a
# <body>, its extracted text is scrubbed of invalid bytes and whitespace
# runs at the start and end of each line are removed (which also drops
# blank lines, since \s matches newlines). Without a <body> the text is
# empty.
#
# The yielded TextData gets +title+ set from head/title (when present)
# and +encoding+ set from the parsed document's encoding (when present).
def decompose(data)
  source = data.body
  document = Nokogiri::HTML.parse(source, nil, guess_encoding(source))

  body_node = document % "body"
  text =
    if body_node
      extract_text(body_node, "").scrub.gsub(/^\s+|\s+$/, '')
    else
      ""
    end

  result = TextData.new(text, :source_data => data)
  metadata = result.attributes

  title_node = document % "head/title"
  metadata.title = title_node.text if title_node

  document_encoding = document.encoding
  metadata.encoding = document_encoding if document_encoding

  yield(result)
end
target?(data)
# File lib/chupa-text/decomposers/html.rb, line 30
# Reports whether +data+ looks like HTML that this decomposer should handle.
#
# Checks, in order:
# 1. If any entry of data["source-mime-types"] is one of TARGET_MIME_TYPES
#    the data is rejected (presumably so that text already derived from an
#    HTML source is not decomposed again -- confirm against callers).
# 2. A matching extension or MIME type accepts the data.
# 3. Otherwise the start of the body is sniffed for an HTML doctype or an
#    opening <html tag. A nil body is rejected.
#
# The doctype sniffing accepts both the legacy form, where "html" is
# followed by a space and a public identifier, and the HTML5 form
# "<!DOCTYPE html>", in the two common spellings of the keyword.
#
# Returns true or false.
def target?(data)
  source_mime_types = data["source-mime-types"] || []
  if source_mime_types.any? { |mime_type| TARGET_MIME_TYPES.include?(mime_type) }
    return false
  end

  return true if TARGET_EXTENSIONS.include?(data.extension)
  return true if TARGET_MIME_TYPES.include?(data.mime_type)

  body = data.body
  return false if body.nil?

  # String prefixes (not a regexp) so that bodies with invalid byte
  # sequences can still be sniffed safely.
  body.start_with?("<!DOCTYPE html ",
                   "<!DOCTYPE html>",
                   "<!doctype html ",
                   "<!doctype html>",
                   "<html")
end

Private Instance Methods

aside_element?(element, name, classes)
# File lib/chupa-text/decomposers/html.rb, line 212
# Tells whether an element should be treated as an "aside".
#
# +element+ is only asked for its "id" attribute via element["id"];
# +name+ is the tag name and +classes+ the list of class names, both
# supplied by the caller.
#
# Returns true when the tag name, any class name, or the id is exactly
# "aside"; false otherwise.
def aside_element?(element, name, classes)
  return true if name == "aside"
  return true if classes.include?("aside")

  element["id"] == "aside"
end
extract_text(element, text)
# File lib/chupa-text/decomposers/html.rb, line 112
# Recursively appends the text found under +element+ to +text+.
#
# The whole subtree is skipped when the element is classified as a
# noindex, header, footer, navigation or aside element. Otherwise text
# nodes are appended in document order and child elements are descended
# into; any other kind of child node is ignored.
#
# +text+ is mutated in place with << and is also the return value.
def extract_text(element, text)
  tag_name = element.name.downcase
  class_names = (element["class"] || "").split

  skipped =
    noindex_element?(element, tag_name, class_names) ||
    header_element?(element, tag_name, class_names) ||
    footer_element?(element, tag_name, class_names) ||
    navigation_element?(element, tag_name, class_names) ||
    aside_element?(element, tag_name, class_names)
  return text if skipped

  element.children.each do |node|
    if node.is_a?(Nokogiri::XML::Text)
      text << node.text
    elsif node.is_a?(Nokogiri::XML::Element)
      extract_text(node, text)
    end
  end

  text
end
guess_encoding(text)
# File lib/chupa-text/decomposers/html.rb, line 67
def guess_encoding(text)
  unless text.encoding.ascii_compatible?
    return text.encoding.name
  end

  case text
  when /\A<\?xml.+?encoding=(['"])([a-zA-Z0-9_-]+)\1/
    $2
  when /<meta\s[^>]*
         http-equiv=(['"])content-type\1\s+
         content=(['"])(.+?)\2/imx # "
    content_type = $3
    _, parameters = content_type.split(/;\s*/, 2)
    encoding = nil
    if parameters and /\bcharset=([a-zA-Z0-9_-]+)/i =~ parameters
      encoding = normalize_charset($1)
    end
    encoding
  when /<meta\s[^>]*charset=(['"])(.+?)\1/imx # "
    charset = $2
    normalize_charset(charset)
  else
    if text.encoding != Encoding::ASCII_8BIT and text.valid_encoding?
      text.encoding.to_s
    else
      guess_encoding_nkf(text)
    end
  end
end
guess_encoding_nkf(text)
# File lib/chupa-text/decomposers/html.rb, line 108
# Asks NKF to guess the encoding of +text+ and returns the name of the
# guessed encoding.
def guess_encoding_nkf(text)
  guessed = NKF.guess(text)
  guessed.name
end
header_element?(element, name, classes)
# File lib/chupa-text/decomposers/html.rb, line 149
# Tells whether an element should be treated as a page header.
#
# +element+ is only asked for its "id" attribute via element["id"];
# +name+ is the tag name and +classes+ the list of class names, both
# supplied by the caller.
#
# Returns true when the tag name, any class name, or the id is exactly
# "header"; false otherwise.
def header_element?(element, name, classes)
  return true if name == "header"
  return true if classes.include?("header")

  element["id"] == "header"
end
navigation_element?(element, name, classes)
noindex_element?(element, name, classes)
# File lib/chupa-text/decomposers/html.rb, line 133
# Tells whether an element's content must be left out of the extracted
# text.
#
# True for the tags script, noscript, link and style, and for any
# element carrying the class "noindex" or "robots-noindex".
# +element+ itself is not inspected; only +name+ and +classes+ are used.
def noindex_element?(element, name, classes)
  return true if ["script", "noscript", "link", "style"].include?(name)

  classes.any? { |klass| klass == "noindex" || klass == "robots-noindex" }
end
normalize_charset(charset)
# File lib/chupa-text/decomposers/html.rb, line 97
# Maps charset labels found in HTML onto the encoding name to use.
#
# "x-sjis" and "Shift_JIS"/"Shift-JIS" (matched case-insensitively, whole
# string only) both become "Windows-31J". Anything else is returned
# unchanged.
def normalize_charset(charset)
  case charset
  when /\Ax-sjis\z/i, /\Ashift[_-]jis\z/i
    "Windows-31J"
  else
    charset
  end
end