class Rack::UTF8Sanitizer
Constants
- DEFAULT_STRATEGIES
- HTTP_
- SANITIZABLE_CONTENT_TYPES
- StringIO
- UNRESERVED_OR_UTF8
This regexp matches all 'unreserved' characters from RFC3986 (2.3), plus all multibyte UTF-8 characters.
- UNSAFE
This regexp matches unsafe characters, i.e. everything except 'reserved' and 'unreserved' characters from RFC3986 (2.3), and additionally '%', as percent-encoded unreserved characters could be left over from the `unescape_unreserved` invocation.
See also URI::REGEXP::PATTERN::{UNRESERVED,RESERVED}.
- URI_ENCODED_CONTENT_TYPES
- URI_FIELDS
- UTF8_BOM
- UTF8_BOM_SIZE
Public Class Methods
# File lib/rack/utf8_sanitizer.rb, line 12 def initialize(app, options={}) @app = app @strategy = build_strategy(options) @sanitizable_content_types = options[:sanitizable_content_types] @sanitizable_content_types ||= SANITIZABLE_CONTENT_TYPES + (options[:additional_content_types] || []) @only = Array(options[:only]).flatten @except = Array(options[:except]).flatten end
Public Instance Methods
# File lib/rack/utf8_sanitizer.rb, line 21 def call(env) @app.call(sanitize(env)) end
# File lib/rack/utf8_sanitizer.rb, line 64 def sanitize(env) sanitize_rack_input(env) sanitize_cookies(env) env.each do |key, value| next if skip?(key) if URI_FIELDS.include?(key) env[key] = transfer_frozen(value, sanitize_uri_encoded_string(value)) elsif key.to_s.start_with?(HTTP_) # Just sanitize the headers and leave them in UTF-8. There is # no reason to have UTF-8 in headers, but if it's valid, let it be. env[key] = transfer_frozen(value, sanitize_string(value)) end end end
Protected Instance Methods
# File lib/rack/utf8_sanitizer.rb, line 91 def build_strategy(options) strategy = options.fetch(:strategy) { :replace } return strategy unless DEFAULT_STRATEGIES.key?(strategy) DEFAULT_STRATEGIES[strategy] end
# File lib/rack/utf8_sanitizer.rb, line 196 def decode_string(input) unescape_unreserved( sanitize_string(input). force_encoding(Encoding::ASCII_8BIT)) end
Performs the reverse function of `unescape_unreserved`. Unlike the previous function, we can reuse the logic in URI#encode
# File lib/rack/utf8_sanitizer.rb, line 233 def escape_unreserved(input) URI::DEFAULT_PARSER.escape(input, UNSAFE) end
# File lib/rack/utf8_sanitizer.rb, line 191 def reencode_string(decoded_value) escape_unreserved( sanitize_string(decoded_value)) end
# File lib/rack/utf8_sanitizer.rb, line 152 def sanitize_io(io, uri_encoded = false) input = io.read sanitized_input = sanitize_string(strip_byte_order_mark(input)) if uri_encoded sanitized_input = sanitize_uri_encoded_string(sanitized_input). force_encoding(Encoding::UTF_8) end sanitized_input = transfer_frozen(input, sanitized_input) SanitizedRackInput.new(io, StringIO.new(sanitized_input)) end
# File lib/rack/utf8_sanitizer.rb, line 99 def sanitize_rack_input(env) # https://github.com/rack/rack/blob/master/lib/rack/request.rb#L42 # Logic borrowed from Rack::Request#media_type,#media_type_params,#content_charset # Ignoring charset in content type. content_type = env['CONTENT_TYPE'] content_type &&= content_type.split(/\s*[;,]\s*/, 2).first content_type &&= content_type.downcase return unless @sanitizable_content_types.any? {|type| content_type == type } uri_encoded = URI_ENCODED_CONTENT_TYPES.any? {|type| content_type == type} if env['rack.input'] sanitized_input = sanitize_io(env['rack.input'], uri_encoded) env['rack.input'] = sanitized_input env['CONTENT_LENGTH'] &&= sanitized_input.size.to_s end end
# File lib/rack/utf8_sanitizer.rb, line 237 def sanitize_string(input) if input.is_a? String input = input.dup.force_encoding(Encoding::UTF_8) if input.valid_encoding? input else @strategy.call(input) end else input end end
URI.encode/decode expect the input to be in ASCII-8BIT. However, there could be invalid UTF-8 characters both in raw and percent-encoded form.
So, first sanitize the value, then percent-decode it while treating as UTF-8, then sanitize the result and encode it back.
The result is guaranteed to be UTF-8-safe.
# File lib/rack/utf8_sanitizer.rb, line 185 def sanitize_uri_encoded_string(input) return input if input.nil? decoded_value = decode_string(input) reencode_string(decoded_value) end
# File lib/rack/utf8_sanitizer.rb, line 84 def skip?(rack_env_key) return true if !@except.empty? && @except.any? { |matcher| rack_env_key[matcher] } return true if !@only.empty? && @only.none? { |matcher| rack_env_key[matcher] } false end
# File lib/rack/utf8_sanitizer.rb, line 262 def strip_byte_order_mark(input) return input unless input.start_with?(UTF8_BOM) input.byteslice(UTF8_BOM_SIZE..-1) end
# File lib/rack/utf8_sanitizer.rb, line 251 def transfer_frozen(from, to) if from.frozen? to.freeze else to end end
RFC3986, 2.2 states that the characters from 'reserved' group must be protected during normalization (which is what UTF8Sanitizer
does).
However, the regexp approach used by URI.unescape is not sophisticated enough for our task.
# File lib/rack/utf8_sanitizer.rb, line 211 def unescape_unreserved(input) input.gsub(/%([a-f\d]{2})/i) do |encoded| decoded = $1.hex.chr if decoded =~ UNRESERVED_OR_UTF8 decoded else encoded end end end