class Html2rss::AttributePostProcessors::SanitizeHtml

Returns sanitized HTML code as String.

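It adds:

- rel="nofollow noopener noreferrer" and target="_blank" to <a> tags
- a "no-referrer" referrer policy attribute to <img> tags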

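It also:

- wraps every <img> tag whose direct parent is not an <a> into an <a> linking to the <img>'s src
- turns relative URLs of the elements listed in URL_ELEMENTS_WITH_URL_ATTRIBUTE into absolute ones, based on the channel URL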

Imagine this HTML structure:

<section>
  Lorem <b>ipsum</b> dolor...
  <iframe src="https://evil.corp/miner"></iframe>
  <script>alert();</script>
</section>

YAML usage example:

selectors:
  description:
    selector: 'section'
    extractor: html
    post_process:
      name: sanitize_html

Would return:

'<p>Lorem <b>ipsum</b> dolor ...</p>'

Constants

URL_ELEMENTS_WITH_URL_ATTRIBUTE

Public Class Methods

new(value, env) click to toggle source
# File lib/html2rss/attribute_post_processors/sanitize_html.rb, line 41
# Keeps the value to sanitize and the channel URL taken from the config.
#
# @param value [Object] the value later handed to Sanitize.fragment by #get
#   (presumably an HTML String)
# @param env [Hash] must provide a :config entry whose object responds to #url
def initialize(value, env)
  config = env[:config]

  @value = value
  @channel_url = config.url
end

Public Instance Methods

get() click to toggle source

@return [String]

# File lib/html2rss/attribute_post_processors/sanitize_html.rb, line 52
# Sanitizes the stored value with the configuration from #sanitize_config
# and collapses every run of whitespace into a single space.
#
# @return [String] the sanitized markup, without leading or trailing whitespace
def get
  sanitized = Sanitize.fragment(@value, sanitize_config).to_s
  tokens = sanitized.split

  tokens.join(' ')
end

Private Instance Methods

sanitize_config() click to toggle source
# File lib/html2rss/attribute_post_processors/sanitize_html.rb, line 58
# Builds the Sanitize configuration used by #get.
#
# Starts from Sanitize::Config::RELAXED and merges in:
# - the attributes permitted on all elements,
# - attributes that are always added to <a> and <img> elements,
# - the custom transformers (absolute URLs, wrapping <img> in <a>).
#
# @return [Object] whatever Sanitize::Config.merge returns (a config Hash)
def sanitize_config
  Sanitize::Config.merge(
    Sanitize::Config::RELAXED,
    attributes: { all: %w[dir lang alt title translate] },
    add_attributes: {
      'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
      # The HTML content attribute is spelled `referrerpolicy`;
      # `referrer-policy` is the HTTP header name and has no effect on elements.
      'img' => { 'referrerpolicy' => 'no-referrer' }
    },
    transformers: [transform_urls_to_absolute_ones, wrap_img_in_a]
  )
end
transform_urls_to_absolute_ones() click to toggle source
# File lib/html2rss/attribute_post_processors/sanitize_html.rb, line 70
# Builds a Sanitize transformer that rewrites relative URLs into absolute ones.
#
# Only elements listed in URL_ELEMENTS_WITH_URL_ATTRIBUTE are considered; the
# constant maps an element name to the attribute holding its URL. Relative
# URLs are resolved against @channel_url via
# Html2rss::Utils.build_absolute_url_from_relative.
#
# Elements lacking the attribute and URLs that URI() cannot parse are left
# untouched, so a single odd element does not abort the whole sanitization.
#
# @return [Proc] a lambda accepting the transformer env (reads :node_name and :node)
def transform_urls_to_absolute_ones
  lambda do |env|
    url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[env[:node_name]]
    return unless url_attribute

    node = env[:node]
    url = node[url_attribute]

    # E.g. <a name="anchor"> has no href; URI(nil) would raise ArgumentError.
    return unless url

    begin
      return if URI(url).absolute?
    rescue URI::InvalidURIError
      # Malformed URL: nothing sensible to resolve, keep the attribute as it is.
      return
    end

    node[url_attribute] = Html2rss::Utils.build_absolute_url_from_relative(url, @channel_url)
  end
end
wrap_img_in_a() click to toggle source
# File lib/html2rss/attribute_post_processors/sanitize_html.rb, line 85
# Builds a Sanitize transformer that wraps <img> elements in a link.
#
# An <img> whose direct parent is not an <a> is replaced by a new <a> element
# whose href is the image's src and whose only child is a copy of the image.
# Images already sitting directly inside an <a> are left alone.
#
# @return [Proc] a lambda accepting the transformer env (reads :node_name and :node)
def wrap_img_in_a
  lambda do |env|
    return unless env[:node_name] == 'img'

    img = env[:node]
    return if img.parent.name == 'a'

    # Node.new expects the owning document; passing a node instead is
    # deprecated by Nokogiri.
    anchor = Nokogiri::XML::Node.new('a', img.document)
    anchor[:href] = img[:src]
    anchor.add_child(img.dup)

    img.replace(anchor)
  end
end