class ChupaText::Decomposers::HTML
Constants
- TARGET_EXTENSIONS
- TARGET_MIME_TYPES
Public Instance Methods
decompose(data) { |decomposed_data| ... }
click to toggle source
# File lib/chupa-text/decomposers/html.rb, line 47
# Parses the HTML in +data.body+, collects the text of its +body+ element
# through #extract_text and yields the result wrapped in a TextData whose
# +:source_data+ is +data+.
#
# Two attributes are set on the yielded data when available: +title+ (the
# text of +head/title+) and +encoding+ (the encoding reported by the parsed
# document). When the document has no +body+ element the text is empty.
#
# @param data [#body] data whose body holds HTML markup
# @yieldparam decomposed_data [TextData] the extracted text and attributes
# @return [Object] whatever the given block returns
def decompose(data)
  markup = data.body
  document = Nokogiri::HTML.parse(markup, nil, guess_encoding(markup))

  body_node = document.at("body")
  plain_text =
    if body_node
      # Replace invalid byte sequences, then remove whitespace runs at the
      # start and end of lines.
      extract_text(body_node, "").scrub.gsub(/^\s+|\s+$/, '')
    else
      ""
    end

  decomposed_data = TextData.new(plain_text, :source_data => data)
  attributes = decomposed_data.attributes

  title_node = document.at("head/title")
  attributes.title = title_node.text if title_node

  document_encoding = document.encoding
  attributes.encoding = document_encoding if document_encoding

  yield(decomposed_data)
end
target?(data)
click to toggle source
# File lib/chupa-text/decomposers/html.rb, line 30
# Decides whether this decomposer should handle +data+.
#
# Checks run in this order:
# 1. Data whose "source-mime-types" already contains one of
#    TARGET_MIME_TYPES is rejected.
# 2. Data whose extension is in TARGET_EXTENSIONS is accepted.
# 3. Data whose MIME type is in TARGET_MIME_TYPES is accepted.
# 4. Otherwise the body is sniffed: it must start with "<!DOCTYPE html "
#    (note the trailing space) or "<html". A nil body is rejected.
#
# @param data [#[], #extension, #mime_type, #body] candidate data
# @return [Boolean] true when the data should be decomposed here
def target?(data)
  source_mime_types = data["source-mime-types"] || []
  already_html = source_mime_types.any? do |source_mime_type|
    TARGET_MIME_TYPES.include?(source_mime_type)
  end
  return false if already_html

  return true if TARGET_EXTENSIONS.include?(data.extension)
  return true if TARGET_MIME_TYPES.include?(data.mime_type)

  content = data.body
  return false if content.nil?

  content.start_with?("<!DOCTYPE html ", "<html")
end
Private Instance Methods
aside_element?(element, name, classes)
click to toggle source
# File lib/chupa-text/decomposers/html.rb, line 212
# Tells whether an element is marked as an aside: its tag name, one of its
# classes, or its +id+ attribute is exactly "aside".
#
# @param element [#[]] the element; only its "id" attribute is read
# @param name [String] the element's tag name
# @param classes [Array<String>] the element's class names
# @return [Boolean]
def aside_element?(element, name, classes)
  name == "aside" ||
    classes.include?("aside") ||
    element["id"] == "aside"
end
extract_text(element, text)
click to toggle source
# File lib/chupa-text/decomposers/html.rb, line 112
# Recursively appends the text found under +element+ to +text+.
#
# An element is skipped together with all of its descendants when any of
# the noindex/header/footer/navigation/aside predicates accepts it. Text
# nodes are appended as they are; child elements are visited recursively;
# every other node type is ignored.
#
# @param element [Nokogiri::XML::Element] the element to walk
# @param text [String] accumulator, mutated in place with +<<+
# @return [String] the same +text+ object
def extract_text(element, text)
  tag_name = element.name.downcase
  class_names = (element["class"] || "").split

  excluded =
    noindex_element?(element, tag_name, class_names) ||
    header_element?(element, tag_name, class_names) ||
    footer_element?(element, tag_name, class_names) ||
    navigation_element?(element, tag_name, class_names) ||
    aside_element?(element, tag_name, class_names)
  return text if excluded

  element.children.each do |node|
    if node.is_a?(Nokogiri::XML::Text)
      text << node.text
    elsif node.is_a?(Nokogiri::XML::Element)
      extract_text(node, text)
    end
  end

  text
end
guess_encoding(text)
click to toggle source
# File lib/chupa-text/decomposers/html.rb, line 67
# Guesses the name of the encoding of an HTML document.
#
# The first rule that applies wins:
# 1. A string in a non-ASCII-compatible encoding reports its own encoding.
# 2. An XML declaration at the very start supplies its +encoding+ value
#    (returned as written, not normalized).
# 3. A +meta http-equiv="content-type"+ tag supplies the normalized
#    +charset+ parameter of its +content+; when that parameter is missing
#    the result is nil and no later rule is tried.
# 4. A +meta+ tag with a quoted +charset+ attribute supplies the
#    normalized charset.
# 5. A string that is valid in its own, non-binary encoding reports it.
# 6. Anything else is handed to #guess_encoding_nkf.
#
# @param text [String] the HTML source
# @return [String, nil] an encoding name, or nil (see rule 3)
def guess_encoding(text)
  own_encoding = text.encoding
  return own_encoding.name unless own_encoding.ascii_compatible?

  xml_declaration = /\A<\?xml.+?encoding=(['"])([a-zA-Z0-9_-]+)\1/.match(text)
  return xml_declaration[2] if xml_declaration

  http_equiv = /<meta\s[^>]* http-equiv=(['"])content-type\1\s+ content=(['"])(.+?)\2/imx.match(text)
  if http_equiv
    # Everything after the first ";" of the content value holds the parameters.
    _, parameters = http_equiv[3].split(/;\s*/, 2)
    charset = parameters && parameters[/\bcharset=([a-zA-Z0-9_-]+)/i, 1]
    return charset && normalize_charset(charset)
  end

  meta_charset = /<meta\s[^>]*charset=(['"])(.+?)\1/imx.match(text)
  return normalize_charset(meta_charset[2]) if meta_charset

  if own_encoding != Encoding::ASCII_8BIT && text.valid_encoding?
    own_encoding.to_s
  else
    guess_encoding_nkf(text)
  end
end
guess_encoding_nkf(text)
click to toggle source
# File lib/chupa-text/decomposers/html.rb, line 108
# Asks NKF to guess the encoding of +text+ and returns its name.
#
# @param text [String] the bytes to inspect
# @return [String] the name of the encoding NKF guessed
def guess_encoding_nkf(text)
  guessed = NKF.guess(text)
  guessed.name
end
header_element?(element, name, classes)
click to toggle source
# File lib/chupa-text/decomposers/html.rb, line 149
# Tells whether an element is marked as a header: its tag name, one of its
# classes, or its +id+ attribute is exactly "header".
#
# @param element [#[]] the element; only its "id" attribute is read
# @param name [String] the element's tag name
# @param classes [Array<String>] the element's class names
# @return [Boolean]
def header_element?(element, name, classes)
  name == "header" ||
    classes.include?("header") ||
    element["id"] == "header"
end
noindex_element?(element, name, classes)
click to toggle source
# File lib/chupa-text/decomposers/html.rb, line 133
# Tells whether an element should be left out of the extracted text:
# either its tag is one of script/noscript/link/style, or one of its
# classes is "noindex" or "robots-noindex".
#
# @param element [Object] not inspected by this predicate
# @param name [String] the element's tag name
# @param classes [Array<String>] the element's class names
# @return [Boolean]
def noindex_element?(element, name, classes)
  return true if %w[script noscript link style].include?(name)

  classes.any? { |klass| klass == "noindex" || klass == "robots-noindex" }
end
normalize_charset(charset)
click to toggle source
# File lib/chupa-text/decomposers/html.rb, line 97
# Maps Shift_JIS spellings of a charset name to "Windows-31J".
#
# "x-sjis", "shift_jis" and "shift-jis" (all matched case-insensitively
# against the whole name) become "Windows-31J"; every other value is
# returned unchanged.
#
# @param charset [String] a charset name as written in the document
# @return [String] the normalized charset name
def normalize_charset(charset)
  shift_jis_spellings = [/\Ax-sjis\z/i, /\Ashift[_-]jis\z/i]
  return "Windows-31J" if shift_jis_spellings.any? { |spelling| spelling =~ charset }

  charset
end