class GoogleSafeBrowsing::Canonicalize
Helpers to canonicalize urls and generate url permutations for lookups
Constants
- DEFAULT_PROTOCOL
- PROTOCOL_DELIMITER
Public Class Methods
Returns the cartesian product of two arrays by concatenation of the string representation of the elements
@param (Array) a_one array of strings @param (Array) a_two array of strings @return (Array) cartesian product of arrays with elements concatenated
# File lib/google_safe_browsing/canonicalize.rb, line 122
# Pairs every element of the first array with every element of the second
# and interpolates each pair into a single string. Results are ordered by
# the first array, then by the second.
#
# @param (Array) a_one array of strings used as the left-hand part
# @param (Array) a_two array of strings used as the right-hand part
# @return (Array) one concatenated string per (a_one, a_two) pair
def self.cart_prod(a_one, a_two)
  a_one.flat_map do |left|
    a_two.map { |right| "#{left}#{right}" }
  end
end
Apply initial fixes to host string
@param (String) host host string @return (String) standardized host string
# File lib/google_safe_browsing/canonicalize.rb, line 175
# Standardizes the host portion of a url: strips leading/trailing dots,
# collapses runs of dots, lower-cases, and rewrites numeric / dotted-quad
# hosts through the IP library. Credentials and port, when present, are
# re-attached around the fixed host.
#
# The argument is not modified; a new string is returned.
#
# @param (String) host host string (may carry "creds@" and ":port")
# @return (String) standardized host string
def self.fix_host(host)
  # Non-bang calls so the caller's string is left untouched (and frozen
  # strings are accepted).
  cleaned = host.gsub(/\A\.+|\.+\Z/, '').gsub(/\.+/, '.').downcase

  host_splits = split_username_password_and_port(cleaned)
  bare_host = host_splits[:host]

  # The IP conversions must look only at the host part: feeding them the
  # full string ("user@3279880203", "1.2.3.4:80") either yields the wrong
  # address or makes the parse fail.
  if bare_host =~ /^\d+$/
    # NOTE(review): IP::V4 comes from outside this file; assumed to accept
    # an Integer address, as the original code relied on.
    host_splits[:host] = IP::V4.new(bare_host.to_i).to_addr
  elsif bare_host =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/
    begin
      host_splits[:host] = IP.new(bare_host).to_addr
    rescue ArgumentError
      # Best effort: when IP.new rejects the string, keep the host as is.
    end
  end

  result = host_splits[:host]
  result = "#{host_splits[:creds]}@#{result}" unless host_splits[:creds].blank?
  result = "#{result}:#{host_splits[:port]}" unless host_splits[:port].blank?
  result
end
Apply initial fixes to path string
@param (String) path path string @return (String) standardized path string
# File lib/google_safe_browsing/canonicalize.rb, line 203
# Standardizes the path portion of a url: drops one leading slash, collapses
# repeated slashes and resolves "." and ".." segments. A query string (from
# the first '?' onwards) is carried over verbatim, and a trailing slash on
# the path itself is preserved.
#
# The argument is not modified; a new string is returned.
#
# @param (String) path path string, optionally followed by "?query"
# @return (String) standardized path string (without a leading slash)
def self.fix_path(path)
  # remove leading slash
  path = path[1..-1] if path.start_with?('/')

  # Separate the query first. The exclusive range matters: with an
  # inclusive "0..(index - 1)" a query at index 0 turns into "0..-1",
  # i.e. the whole string, and the query ends up duplicated.
  params = nil
  query_start = path.index('?')
  if query_start
    params = path[query_start..-1]
    path = path[0...query_start]
  end

  # Decide on the trailing slash from the path alone, so the query can
  # neither hide a real trailing slash nor fake one.
  preserve_trailing_slash = path.end_with?('/')

  segments = []
  # squeeze collapses runs of '/' without mutating the caller's string
  path.squeeze('/').split('/').each do |segment|
    if segment == '..'
      segments.pop
    elsif segment != '.'
      segments << segment
    end
  end

  result = segments.join('/')
  result += '/' if preserve_trailing_slash
  result += params if params
  result
end
Generates the path permutations from the raw path string
@param (String) raw_path path split from the full url string @return (Array) array of path permutation strings
# File lib/google_safe_browsing/canonicalize.rb, line 85
# Builds the path permutations used for lookups: the root, plus the path
# truncated to its first three components and each shorter prefix of it.
# Candidates without a '.' are treated as directories and get a trailing
# slash. When a query string is present, every candidate that does not end
# in '/' is additionally offered with "?query" appended.
#
# @param (String) raw_path path split from the full url string
# @return (Array) array of path permutation strings
def self.generate_path_strings(raw_path)
  return ['/', ''] if raw_path == ''

  # Limit of 2 keeps the whole query together; a plain split('?') would
  # silently drop everything after a second '?'.
  path, params = raw_path.split('?', 2)
  path ||= ''
  params ||= ''

  components = path.split('/').first(3)
  prefixes = components.length.downto(1).map do |count|
    '/' + components.first(count).join('/')
  end

  path_strings = (['/'] + prefixes).map do |candidate|
    candidate += '/' unless candidate.include?('.')
    # Non-bang gsub: gsub! returns nil when nothing matches, which the
    # original had to paper over with compact!.
    candidate.gsub(%r{/+}, '/')
  end
  path_strings.uniq!

  return path_strings if params.blank?

  with_query = path_strings.map do |candidate|
    candidate.end_with?('/') ? candidate : "#{candidate}?#{params}"
  end
  path_strings | with_query
end
Continues to unescape the url until unescaping has no effect
@param (String) url url string @return (String) fully unescaped url string
# File lib/google_safe_browsing/canonicalize.rb, line 161
# Percent-decodes the url repeatedly until a pass changes nothing, so that
# multiply-escaped input ("%2525") is fully unescaped.
#
# @param (String) url url string
# @return (String) fully unescaped url string (always a new string)
def self.recursively_unescape(url)
  loop do
    # URI.unescape was removed in Ruby 3.0; the parser object offers the
    # same decoding on both old and current Rubies.
    unescaped = URI::DEFAULT_PARSER.unescape(url)
    return unescaped if unescaped == url

    url = unescaped
  end
end
Strips the fragment portion of the url string (the first ‘#’ and everything after)
@param (String) string url @return (String) parameter with the fragment removed
# File lib/google_safe_browsing/canonicalize.rb, line 152
# Strips the fragment from a url: the first '#' and everything after it.
# A string without '#' is returned as is.
#
# @param (String) string url
# @return (String) parameter with the fragment removed
def self.remove_fragment(string)
  fragment_start = string.index('#')
  return string unless fragment_start

  # Exclusive range: an inclusive "0..(index - 1)" becomes "0..-1" when the
  # '#' is the first character and would return the string unchanged.
  string[0...fragment_start]
end
Strip port number from host string
@param (see strip_username_password_and_port_from_host) @return (String) host part without the port number
# File lib/google_safe_browsing/canonicalize.rb, line 271
# Returns only the :host entry of what split_port produces for the given
# string, i.e. the host with the port part discarded.
#
# @param (String) host_string host portion of the url
# @return (String) host part without the port number
def self.remove_port(host_string)
  host_and_port = split_port(host_string)
  host_and_port[:host]
end
Strip the leading protocol from the url string
@param (String) cann url string @return (String) url string without the protocol
# File lib/google_safe_browsing/canonicalize.rb, line 248
# Strips the leading protocol (scheme plus PROTOCOL_DELIMITER) from the url
# string and remembers the scheme in @protocol. @protocol is reset to nil
# when the url carries no usable scheme, so a scheme seen on an earlier url
# cannot leak into a later one.
#
# @param (String) cann url string
# @return (String) url string without the protocol
def self.remove_protocol(cann)
  @protocol = nil

  delimiting_index = cann.index(PROTOCOL_DELIMITER)
  return cann unless delimiting_index

  # Exclusive range: "0..(index - 1)" would be "0..-1" (the whole string)
  # when the delimiter is the very first thing in the url.
  scheme = cann[0...delimiting_index]

  # A delimiter preceded by something that is not a scheme belongs to a
  # later part of the url (e.g. a url embedded in the query string), so the
  # url itself has no protocol to strip. Scheme syntax per RFC 3986.
  return cann unless scheme.empty? || scheme =~ /\A[a-z][a-z0-9+.\-]*\z/i

  # An empty scheme is stripped but not remembered, leaving the caller to
  # fall back to its default protocol.
  @protocol = scheme unless scheme.empty?
  cann[(delimiting_index + PROTOCOL_DELIMITER.length)..-1]
end
Strip user name and password from host part of url
@param (see remove_port) @return (String) host part of url without user name or password
# File lib/google_safe_browsing/canonicalize.rb, line 279
# Returns only the :host entry of what split_username_and_password produces
# for the given string, i.e. the host with any credentials discarded.
#
# @param (String) host_string host portion of the url
# @return (String) host part of url without user name or password
def self.remove_username_and_password(host_string)
  creds_and_host = split_username_and_password(host_string)
  creds_and_host[:host]
end
Takes the canonicalized url and splits the host and the path apart
@param (String) cann canonicalized url string @return (Hash) !{ host: host_part, path: path_part }
# File lib/google_safe_browsing/canonicalize.rb, line 136
# Splits a protocol-less url at its first '/' into host and path. The
# separating slash belongs to neither part; without a slash the whole
# string is the host and the path is empty.
#
# @param (String) cann canonicalized url string
# @return (Hash) { host: host_part, path: path_part }
def self.split_host_path(cann)
  # partition handles a slash at index 0 correctly (empty host), where the
  # former "0..split_point - 1" slice returned the entire string.
  host, _slash, path = cann.partition('/')
  { host: host, path: path }
end
Split port number and host string into a hash
@param (see remove_port) @return (Hash) :host has the host string, :port holds the port number
# File lib/google_safe_browsing/canonicalize.rb, line 305
# Splits a host string at its last ':' into host and port.
#
# @param (String) host_string host portion of the url
# @return (Hash) :host has the host string, :port holds the port number
#   (nil when there is no ':' or nothing follows it)
def self.split_port(host_string)
  port_sep = host_string.rindex(':')
  return { host: host_string, port: nil } unless port_sep

  # Slice around the separator that was actually located. The earlier
  # split(':') ignored it, which returned a nil host for ":" and the wrong
  # pieces whenever the string held more than one colon.
  port = host_string[(port_sep + 1)..-1]
  { host: host_string[0...port_sep], port: (port.empty? ? nil : port) }
end
Split user name and password from the host
@param (see remove_port) @return (Hash) :host has the host string, :creds holds the username and password string
# File lib/google_safe_browsing/canonicalize.rb, line 287
# Splits "creds@host" at the last '@' into credentials and host.
#
# @param (String) host_string host portion of the url
# @return (Hash) :host has the host string, :creds holds the username and
#   password string (nil when the string contains no '@')
def self.split_username_and_password(host_string)
  # The host starts after the LAST '@': credentials may themselves contain
  # '@' (for instance after unescaping "%40"). rpartition also yields an
  # empty host, not nil, for input such as "user@".
  creds, at_sign, host = host_string.rpartition('@')
  return { host: host_string, creds: nil } if at_sign.empty?

  { host: host, creds: creds }
end
Split the user name, password and port from the host string
@param (see remove_port) @return (Hash) :host has the host string; :creds has the username and password; :port holds the port number
# File lib/google_safe_browsing/canonicalize.rb, line 323
# Separates credentials first, then splits the port off the remaining host,
# and merges both results into one hash (the port split's :host replaces
# the one from the credential split).
#
# @param (String) host_string host portion of the url
# @return (Hash) :host as the host string; :creds has the username and
#   password; :port holds the port number
def self.split_username_password_and_port(host_string)
  creds_split = split_username_and_password(host_string)
  port_split = split_port(creds_split[:host])
  creds_split.merge(port_split)
end
Escape the url, but do not escape certain characters, such as the caret
@param (String) url url string @return (String) escaped url string
# File lib/google_safe_browsing/canonicalize.rb, line 235
# Percent-escapes the url, then restores the caret, which is meant to stay
# literal.
#
# @param (String) url url string
# @return (String) escaped url string (a new string)
def self.strict_escape(url)
  # URI.escape was removed in Ruby 3.0; the parser object provides the same
  # escaping on both old and current Rubies.
  escaped = URI::DEFAULT_PARSER.escape(url)

  # unescape caret, may need other optionally escapeable chars
  escaped.gsub('%5E', '^')
end
Strip the user name, password and port number from the url
@param (String) host_string host portion of the url @return (String) host portion of the url without the username, password and port
# File lib/google_safe_browsing/canonicalize.rb, line 262
# Reduces the host portion of a url to the bare host by discarding the
# credentials and the port number.
#
# @param (String) host_string host portion of the url
# @return (String) host portion of the url without the username, password
#   and port
def self.strip_username_password_and_port_from_host(host_string)
  # Credentials go first, as in split_username_password_and_port: the
  # "user:password" pair contains a ':' of its own, so splitting off the
  # port before the credentials cuts "user:pass@host:80" down to "user".
  remove_port(remove_username_and_password(host_string))
end
Base Canonicalizer method
@param (String) raw_url uncanonicalized url string @return (String) canonicalized url string
# File lib/google_safe_browsing/canonicalize.rb, line 15
# Base canonicalizer. Cleans whitespace, drops the fragment, fully
# unescapes, normalizes host and path, re-attaches the protocol (falling
# back to DEFAULT_PROTOCOL) and escapes the result once.
#
# The argument is not modified. @protocol is left holding the protocol used
# for this url.
#
# @param (String) raw_url uncanonicalized url string (to_s is applied)
# @return (String) canonicalized url string
def self.url(raw_url)
  # Work on a private copy: force_encoding and the in-place edits below
  # would otherwise change the caller's string, and raise on frozen input
  # (which includes nil.to_s on recent Rubies).
  # Change encoding from UTF-8 to ASCII-8BIT to avoid
  # InvalidByteSequenceError
  cann = raw_url.to_s.dup.force_encoding('ASCII-8BIT')

  # remove tabs, carriage returns and line feeds
  cann.delete!("\t\r\n")
  cann.gsub!(/\A\s+|\s+\Z/, '')

  cann = remove_fragment(cann)

  # repeatedly unescape until no more escaping
  cann = recursively_unescape(cann)

  # remove leading PROTOCOL; clear the remembered one first so a url
  # without a protocol gets the default, not the previous url's protocol
  @protocol = nil
  cann = remove_protocol(cann)

  # split into host and path components
  splits = split_host_path(cann)

  cann = fix_host(splits[:host]) + '/' + fix_path(splits[:path])

  # add leading protocol
  @protocol ||= DEFAULT_PROTOCOL
  cann = @protocol + PROTOCOL_DELIMITER + cann

  strict_escape(cann)
end
Generate the url permutations for lookup
@param (String) lookup_url uncanonicalized url string @return (Array) array of canonicalized url permutation strings
# File lib/google_safe_browsing/canonicalize.rb, line 54
# Canonicalizes the url and returns every host/path combination to look up:
# the exact host plus progressively shorter suffixes built from at most its
# last five parts, each crossed with the path permutations.
#
# @param (String) lookup_url uncanonicalized url string
# @return (Array) array of canonicalized url permutation strings; empty when
#   the canonical url is blank or the host contains no period
def self.urls_for_lookup(lookup_url)
  canonical = url(lookup_url)
  # return empty array if url returns nil; for invalid url
  return [] if canonical.blank?

  parts = split_host_path(remove_protocol(canonical))
  host_string = strip_username_password_and_port_from_host(parts[:host])

  # return empty array unless host_string has at least one period
  return [] unless host_string.include?('.')

  # NOTE(review): TopLevelDomain is defined outside this file; presumably
  # it returns the host's parts with the top-level domain kept whole.
  labels = TopLevelDomain.split_from_host(host_string).last(5)
  suffixes = (0...(labels.length - 1)).map do |start|
    labels[start..-1].join('.')
  end
  host_strings = ([host_string] + suffixes).uniq

  cart_prod(host_strings, generate_path_strings(parts[:path]))
end