class CW::RSSClean
Constants
- NON_CONTENT_TAGS
- WHITESPACE_CONTENT_TAGS
Attributes
html_fragment[R]
Public Class Methods
new(html_fragment)
click to toggle source
# File lib/cw/rss_clean.rb, line 5
# Builds a cleaner around the given HTML fragment.
#
# The fragment is stored untouched; no parsing happens here.
#
# @param html_fragment the HTML to be scrubbed later
def initialize(html_fragment)
  @html_fragment = html_fragment
end
Public Instance Methods
scrub(options = {})
click to toggle source
# File lib/cw/rss_clean.rb, line 9
# Parses the stored HTML fragment and returns the result of sanitizing its
# top-level nodes, skipping every blacklisted tag.
#
# @param options [Hash] optional settings
# @option options [Array<String, Symbol>, String, Symbol, nil] :blacklist
#   extra tag names to drop in addition to NON_CONTENT_TAGS
# @return [String] the sanitized text
def scrub(options = {})
  # Normalise the caller-supplied list: tolerate nil or a single value, and
  # coerce symbols to strings so that `:a` is matched the same way as 'a'.
  extra_tags = Array(options.fetch(:blacklist, [])).map(&:to_s)
  blacklisted_tags = NON_CONTENT_TAGS + extra_tags

  sanitize(Oga.parse_html(html_fragment).children, blacklisted_tags)
end
Private Instance Methods
sanitize(node_set, blacklisted_tags)
click to toggle source
# File lib/cw/rss_clean.rb, line 22
# Flattens a set of nodes into a single string of text content.
#
# Text nodes are always kept. Any other node is kept only when it exposes a
# `name` that is not blacklisted; nodes with no `name` at all are dropped
# instead of raising NoMethodError (presumably comment-like nodes in Oga —
# verify against the Oga node classes).
#
# @param node_set [Enumerable] nodes to process
# @param blacklisted_tags [Array<String>] tag names whose nodes are skipped
# @return [String] concatenated text, padded by `whitespace` around each node
def sanitize(node_set, blacklisted_tags)
  content_nodes = node_set.select do |node|
    text?(node) || (node.respond_to?(:name) && !blacklisted_tags.include?(node.name))
  end

  content_nodes.flat_map do |node|
    [whitespace(node, :prefix), text(node, blacklisted_tags), whitespace(node, :suffix)]
  end.join
end
text(node, blacklisted_tags)
click to toggle source
# File lib/cw/rss_clean.rb, line 36
# Yields the textual content of a single node.
#
# A text node contributes its own text; any other node contributes the
# sanitized content of its children.
#
# @param node the node to read
# @param blacklisted_tags [Array<String>] tag names passed on to `sanitize`
# @return the node's text, or the sanitized text of its children
def text(node, blacklisted_tags)
  text?(node) ? node.text : sanitize(node.children, blacklisted_tags)
end
text?(node)
click to toggle source
# File lib/cw/rss_clean.rb, line 27
# Tells whether the node is an Oga text node.
#
# @param node the node to inspect
# @return [Boolean] true when the node is an Oga::XML::Text
def text?(node)
  node.is_a?(Oga::XML::Text)
end
whitespace(node, _position)
click to toggle source
# File lib/cw/rss_clean.rb, line 31
# Chooses the padding to emit next to a node.
#
# @param node the node being rendered
# @param _position unused; the same padding is returned for either side
# @return [String] a single space for non-text nodes whose name is listed in
#   WHITESPACE_CONTENT_TAGS, otherwise an empty string
def whitespace(node, _position)
  needs_padding = !text?(node) && WHITESPACE_CONTENT_TAGS.include?(node.name)
  needs_padding ? ' ' : ''
end