class CW::RSSClean

Constants

NON_CONTENT_TAGS
WHITESPACE_CONTENT_TAGS

Attributes

html_fragment[R]

Public Class Methods

new(html_fragment) click to toggle source
# File lib/cw/rss_clean.rb, line 5
# Stores the HTML fragment that this cleaner operates on.
#
# html_fragment - the markup to clean. It is kept exactly as given: no
#                 copy, validation or parsing happens at construction time.
def initialize(html_fragment)
  @html_fragment = html_fragment
end

Public Instance Methods

scrub(options = {}) click to toggle source
# File lib/cw/rss_clean.rb, line 9
# Parses the stored HTML fragment and flattens it to a string, leaving out
# every tag listed in NON_CONTENT_TAGS plus any caller-supplied extras.
#
# options - Hash of settings (default: {}):
#           :blacklist - additional tag name(s) to drop. May be an Array,
#                        a single tag name, or nil/absent, which is
#                        treated as "no extra tags".
#
# Returns the String produced by #sanitize for the fragment's top-level nodes.
def scrub(options = {})
  # Array() tolerates an explicit nil or a lone tag name; either of those
  # would otherwise make `NON_CONTENT_TAGS + ...` raise a TypeError.
  extra_tags = Array(options[:blacklist])
  blacklisted_tags = NON_CONTENT_TAGS + extra_tags

  sanitize(Oga.parse_html(html_fragment).children, blacklisted_tags)
end

Private Instance Methods

sanitize(node_set, blacklisted_tags) click to toggle source
# File lib/cw/rss_clean.rb, line 22
# Flattens a set of nodes into one string.
#
# Text nodes contribute their text; other nodes contribute whatever #text
# returns for them, wrapped in the padding returned by #whitespace.
#
# node_set         - enumerable collection of parsed nodes.
# blacklisted_tags - Array of tag names whose nodes (and everything below
#                    them) are left out of the result.
#
# Returns a String ('' for an empty node set).
def sanitize(node_set, blacklisted_tags)
  content_nodes = node_set.select do |node|
    # A non-text node is only kept when it can report a tag name that is not
    # blacklisted. Nodes that have no `name` at all are dropped here instead
    # of raising NoMethodError on the name lookup.
    # NOTE(review): in Oga this appears to cover comments/CDATA/doctype
    # nodes — confirm against the Oga version in use.
    text?(node) || (node.respond_to?(:name) && !blacklisted_tags.include?(node.name))
  end

  content_nodes.flat_map do |node|
    [whitespace(node, :prefix), text(node, blacklisted_tags), whitespace(node, :suffix)]
  end.join
end
text(node, blacklisted_tags) click to toggle source
# File lib/cw/rss_clean.rb, line 36
# Resolves the string content of a single node.
#
# node             - the node to read.
# blacklisted_tags - Array of tag names handed on to #sanitize when the
#                    node's children have to be processed.
#
# Returns the node's own text for a text node, otherwise the result of
# running #sanitize over the node's children.
def text(node, blacklisted_tags)
  if text?(node)
    node.text
  else
    sanitize(node.children, blacklisted_tags)
  end
end
text?(node) click to toggle source
# File lib/cw/rss_clean.rb, line 27
# Tells whether +node+ is a text node.
#
# node - any object.
#
# Returns true when the node is an Oga::XML::Text (or an instance of a
# subclass of it), false otherwise.
def text?(node)
  node.kind_of?(Oga::XML::Text)
end
whitespace(node, _position) click to toggle source
# File lib/cw/rss_clean.rb, line 31
# Returns the padding to place next to a node's content.
#
# node      - the node being rendered.
# _position - :prefix or :suffix at the call sites; currently ignored, so
#             the same padding is used on both sides.
#
# Returns ' ' when the node is a non-text node whose name is listed in
# WHITESPACE_CONTENT_TAGS, and '' in every other case.
def whitespace(node, _position)
  # Text nodes never get padding, and a node that cannot report a tag name
  # cannot be a whitespace tag — answer '' instead of raising NoMethodError
  # on the name lookup.
  return '' if text?(node) || !node.respond_to?(:name)

  WHITESPACE_CONTENT_TAGS.include?(node.name) ? ' ' : ''
end