class GoogleSafeBrowsing::Canonicalize

Helpers to Canonicalize urls and generate url permutations for lookups

Constants

DEFAULT_PROTOCOL
PROTOCOL_DELIMITER

Public Class Methods

cart_prod(a_one, a_two) click to toggle source

Returns the cartesian product of two arrays by concatenation of the string representation of the elements

@param (Array) a_one array of strings @param (Array) a_two array of strings @return (Array) cartesian product of arrays with elements concatenated

# File lib/google_safe_browsing/canonicalize.rb, line 122
# Pairs every element of +a_one+ with every element of +a_two+ and joins
# each pair by string interpolation.
#
# @param a_one [Array] left-hand elements (outer iteration)
# @param a_two [Array] right-hand elements (inner iteration)
# @return [Array<String>] concatenated pairs, ordered by +a_one+ first and
#   +a_two+ second
def self.cart_prod(a_one, a_two)
  a_one.flat_map do |left|
    a_two.map { |right| "#{left}#{right}" }
  end
end
fix_host(host) click to toggle source

Apply initial fixes to host string

@param (String) host host string @return (String) standardized host string

# File lib/google_safe_browsing/canonicalize.rb, line 175
# Normalizes the host portion of a URL.
#
# Leading/trailing dots are removed, runs of dots are collapsed to one, the
# text is lower-cased, and a host written as a bare integer or as a dotted
# quad is rewritten through the IP library's +to_addr+. Credentials and port
# are split off before the IP handling and re-attached afterwards.
#
# @param host [String] host string (may carry "creds@" and ":port"); it is
#   not modified
# @return [String] standardized host string
def self.fix_host(host)
  # Non-bang calls: the caller's string is left untouched (and may be frozen).
  host = host.gsub(/\A\.+|\.+\Z/, '').gsub(/\.+/, '.').downcase

  host_splits = split_username_password_and_port(host)
  bare_host = host_splits[:host]

  # The conversions below must see only the host part. Feeding them the full
  # string (with credentials or port) turns "user@3232235777" into 0 and
  # makes dotted quads with a port fail to parse.
  if bare_host =~ /\A\d+\z/
    # NOTE(review): IP / IP::V4 presumably come from the ruby-ip gem - confirm.
    bare_host = IP::V4.new(bare_host.to_i).to_addr
  elsif bare_host =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/
    begin
      bare_host = IP.new(bare_host).to_addr
    rescue ArgumentError
      # Looked like an address but did not parse as one: keep the text as is.
    end
  end

  result = bare_host
  # NOTE(review): blank? is not core Ruby; presumably ActiveSupport - confirm.
  result = "#{host_splits[:creds]}@#{result}" unless host_splits[:creds].blank?
  result = "#{result}:#{host_splits[:port]}" unless host_splits[:port].blank?
  result
end
fix_path(path) click to toggle source

Apply initial fixes to path string

@param (String) path path string @return (String) standardized path string

# File lib/google_safe_browsing/canonicalize.rb, line 203
# Normalizes the path portion of a URL (the part after the host's slash).
#
# One leading slash is dropped, everything from the first '?' onward is set
# aside untouched, empty and '.' segments are removed, and each '..' removes
# the segment before it. A trailing slash on the path itself is kept, except
# when the path collapses to nothing.
#
# @param path [String] path string, optionally followed by "?query"; it is
#   not modified
# @return [String] standardized path string with the query re-attached
def self.fix_path(path)
  # remove leading slash
  path = path[1..-1] if path.start_with?('/')

  query_start = path.index('?')
  if query_start
    params = path[query_start..-1]
    # Exclusive range: with '?' at index 0 the path part must be empty. An
    # inclusive 0..-1 would return the whole string and duplicate the query.
    path = path[0...query_start]
  end

  # Decide this after the query has been removed so that a '/' ending the
  # query string is not mistaken for a trailing slash of the path.
  preserve_trailing_slash = path.end_with?('/')

  segments = []
  path.split('/').each do |segment|
    if segment == '..'
      segments.pop
    elsif segment != '' && segment != '.'
      # Skipping '' collapses repeated slashes without mutating the input.
      segments << segment
    end
  end

  cleaned = segments.join('/')
  # An empty path gets no slash of its own; the caller already supplies the
  # separator between host and path.
  cleaned += '/' if preserve_trailing_slash && !cleaned.empty?
  params ? cleaned + params : cleaned
end
generate_path_strings(raw_path) click to toggle source

Generates the path permutations from the raw path string

@param (String) raw_path path split from the full url string @return (Array) array of path permutation strings

# File lib/google_safe_browsing/canonicalize.rb, line 85
# Generates the path permutations used for lookups from a raw path string.
#
# The result starts with '/', followed by the path built from at most the
# first three segments and each of its shorter prefixes. Entries containing
# no '.' get a trailing slash. When a query string is present, every entry
# not ending in '/' is additionally emitted with "?query" appended.
#
# @param raw_path [String] path split from the full url string (without the
#   leading slash), optionally followed by "?query"
# @return [Array<String>] unique path permutation strings
def self.generate_path_strings(raw_path)
  return ['/', ''] if raw_path == ''

  # Limit of 2 keeps any further '?' inside the query instead of dropping it.
  path, params = raw_path.split('?', 2)
  path ||= ''
  params ||= ''

  components = path.split('/').first(3)
  path_strings = ['/']
  components.length.downto(1) do |count|
    path_strings << "/#{components.first(count).join('/')}"
  end

  path_strings.map! do |candidate|
    candidate += '/' unless candidate.include?('.')
    # Non-bang gsub: gsub! returns nil when nothing was replaced, which
    # previously caused every already-clean entry to be discarded.
    candidate.gsub(%r{/+}, '/')
  end
  path_strings.uniq!

  return path_strings if params.strip.empty?

  with_query = path_strings.map do |candidate|
    candidate.end_with?('/') ? candidate : "#{candidate}?#{params}"
  end
  path_strings | with_query
end
recursively_unescape(url) click to toggle source

Continues to unescape the url until unescaping has no effect

@param (String) url url string @return (String) fully unescaped url string

# File lib/google_safe_browsing/canonicalize.rb, line 161
# Percent-decodes the url repeatedly until a pass changes nothing.
#
# Each "%XX" (two hex digits) is replaced by the byte it encodes; '+' and
# malformed escapes are left alone. This decodes inline instead of calling
# URI.unescape, which no longer exists in Ruby 3.0 and later.
#
# @param url [String] url string; it is not modified
# @return [String] fully unescaped url string
def self.recursively_unescape(url)
  # Decoded bytes are tagged with the input's encoding so gsub can splice
  # them back in; US-ASCII input is treated as UTF-8 because decoded bytes
  # may be >= 0x80.
  encoding = url.encoding
  encoding = Encoding::UTF_8 if encoding == Encoding::US_ASCII

  loop do
    decoded = url.gsub(/%[a-fA-F\d]{2}/) do |escape|
      escape[1, 2].hex.chr.force_encoding(encoding)
    end
    return decoded if decoded == url

    url = decoded
  end
end
remove_fragment(string) click to toggle source

Strips the fragment portion of the url string (the first ‘#’ and everything after)

@param (String) string url @return (String) parameter with the fragment removed

# File lib/google_safe_browsing/canonicalize.rb, line 152
# Strips the fragment from a url: the first '#' and everything after it.
#
# @param string [String] url
# @return [String] the url without its fragment; the argument itself is
#   returned when it contains no '#'
def self.remove_fragment(string)
  fragment_start = string.index('#')
  return string unless fragment_start

  # Exclusive range so a '#' at index 0 yields '' rather than the whole
  # string (0..-1 would keep everything).
  string[0...fragment_start]
end
remove_port(host_string) click to toggle source

Strip port number from host string

@param (see strip_username_password_and_port_from_host) @return (String) host part without the port number

# File lib/google_safe_browsing/canonicalize.rb, line 271
# Returns the :host entry produced by split_port, i.e. the host string with
# its port number removed.
#
# @param host_string [String] host portion of the url
# @return [String] host part without the port number
def self.remove_port(host_string)
  parts = split_port(host_string)
  parts[:host]
end
remove_protocol(cann) click to toggle source

Strip the leading protocol from the url string

@param (String) cann url string @return (String) url string without the protocol

# File lib/google_safe_browsing/canonicalize.rb, line 248
# Removes the leading protocol and PROTOCOL_DELIMITER from the url string.
#
# Side effect: when the delimiter is found, the text in front of it is
# stored in @protocol on the receiver.
#
# @param cann [String] url string
# @return [String] the text after the first PROTOCOL_DELIMITER, or +cann+
#   itself when the delimiter does not occur
def self.remove_protocol(cann)
  delimiter_at = cann.index(PROTOCOL_DELIMITER)
  return cann unless delimiter_at

  @protocol = cann[0..(delimiter_at - 1)]
  cann[(delimiter_at + PROTOCOL_DELIMITER.length)..-1]
end
remove_username_and_password(host_string) click to toggle source

Strip user name and password from host part of url

@param (see remove_port) @return (String) host part of url without user name or password

# File lib/google_safe_browsing/canonicalize.rb, line 279
# Returns the :host entry produced by split_username_and_password, i.e. the
# host string with any user name and password removed.
#
# @param host_string [String] host portion of the url
# @return [String] host part of url without user name or password
def self.remove_username_and_password(host_string)
  parts = split_username_and_password(host_string)
  parts[:host]
end
split_host_path(cann) click to toggle source

Takes the canonicalized url and splits the host and the path apart

@param (String) cann canonicalized url string @return (Hash) { host: host_part, path: path_part }

# File lib/google_safe_browsing/canonicalize.rb, line 136
# Splits a protocol-less url into host and path at the first '/'.
#
# The separating slash belongs to neither part. Without a slash the whole
# string is the host and the path is ''. A string that starts with '/' has
# an empty host.
#
# @param cann [String] canonicalized url string
# @return [Hash] { host: host_part, path: path_part }
def self.split_host_path(cann)
  # partition handles a slash at index 0 correctly; slicing with
  # 0..(index - 1) would return the entire string as the host in that case.
  host, _slash, path = cann.partition('/')
  { host: host, path: path }
end
split_port(host_string) click to toggle source

Split port number and host string into a hash

@param (See remove_port) @return (Hash) :host has the host string, :port holds the port number

# File lib/google_safe_browsing/canonicalize.rb, line 305
# Splits a trailing ":port" off the host string.
#
# Only the text after the LAST ':' is considered, and only when it consists
# solely of digits. Colons that belong to credentials ("user:pass@host") or
# to an IPv6 literal are therefore not mistaken for the port separator.
#
# @param host_string [String] host portion of the url
# @return [Hash] :host has the host string, :port holds the port number as
#   a String, or nil when there is none (including a bare trailing ':')
def self.split_port(host_string)
  port_sep = host_string.rindex(':')
  candidate = port_sep ? host_string[(port_sep + 1)..-1] : nil

  if candidate && candidate =~ /\A\d*\z/
    { host: host_string[0...port_sep], port: (candidate.empty? ? nil : candidate) }
  else
    { host: host_string, port: nil }
  end
end
split_username_and_password(host_string) click to toggle source

Split user name and password from the host

@param (see remove_port) @return (Hash) :host has the host string, :creds holds the username and password string

# File lib/google_safe_browsing/canonicalize.rb, line 287
# Splits "user:password@" credentials off the host string.
#
# The split happens at the LAST '@', so an '@' inside the password stays
# with the credentials. :host is always a String; it is '' when nothing
# follows the '@'.
#
# @param host_string [String] host portion of the url
# @return [Hash] :host has the host string, :creds holds the username and
#   password string, or nil when there is no '@'
def self.split_username_and_password(host_string)
  separator = host_string.rindex('@')
  return { host: host_string, creds: nil } unless separator

  {
    host: host_string[(separator + 1)..-1],
    creds: host_string[0...separator]
  }
end
split_username_password_and_port(host_string) click to toggle source

Split the user name, password and port from the host string

@param (see remove_port) @return (Hash) :host has the host string; :creds has the username and password; :port holds the port number

# File lib/google_safe_browsing/canonicalize.rb, line 323
# Splits credentials and port off the host string in one step.
#
# Credentials are separated first; the remaining host text is then passed
# to split_port, whose :host and :port entries are merged over the first
# result.
#
# @param host_string [String] host portion of the url
# @return [Hash] :host has the host string; :creds has the username and
#   password; :port holds the port number
def self.split_username_password_and_port(host_string)
  without_creds = split_username_and_password(host_string)
  host_and_port = split_port(without_creds[:host])
  without_creds.merge(host_and_port)
end
strict_escape(url) click to toggle source

Escape the url, but do not escape certain characters, such as the caret

@param (String) url url string @return (String) escaped url string

# File lib/google_safe_browsing/canonicalize.rb, line 235
# Percent-escapes the url, leaving the caret ('^') unescaped.
#
# Every character outside the set
#   - _ . ! ~ * ' ( ) letters digits ; / ? : @ & = + $ , [ ] ^
# is replaced by "%XX" (upper-case hex) for each of its bytes. Apart from
# the caret this is the character set URI.escape used by default; the
# escaping is done inline because URI.escape no longer exists in Ruby 3.0
# and later.
#
# @param url [String] url string; it is not modified
# @return [String] escaped url string (US-ASCII encoded)
def self.strict_escape(url)
  # '^' is listed as safe directly. '%' is always escaped to "%25", so a
  # "%5E" in the output could only have come from a caret anyway.
  # may need other optionally escapable chars
  unsafe = %r{[^\-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]\^]}

  escaped = url.gsub(unsafe) do |char|
    char.each_byte.map { |byte| format('%%%02X', byte) }.join
  end
  escaped.force_encoding(Encoding::US_ASCII)
end
strip_username_password_and_port_from_host(host_string) click to toggle source

Strip the user name, password and port number from the url

@param (String) host_string host portion of the url @return (String) host portion of the url without the username, password and port

# File lib/google_safe_browsing/canonicalize.rb, line 262
# Strips the user name, password and port number from the host string.
#
# Credentials are removed first. The ':' between user name and password
# would otherwise be taken for the port separator, and
# "user:pass@host:80" would reduce to "user".
#
# @param host_string [String] host portion of the url
# @return [String] host portion of the url without the username, password
#   and port
def self.strip_username_password_and_port_from_host(host_string)
  without_creds = remove_username_and_password(host_string)
  remove_port(without_creds)
end
url(raw_url) click to toggle source

Base Canonicalizer method

@param (String) raw_url uncanonicalized url string @return (String) canonicalized url string

# File lib/google_safe_browsing/canonicalize.rb, line 15
# Base canonicalizer: turns a raw url into its canonical form.
#
# Steps, in order: strip tabs/CR/LF and surrounding whitespace, drop the
# fragment, unescape repeatedly, split off the protocol, normalize host and
# path, re-attach the protocol (DEFAULT_PROTOCOL when the url had none) and
# escape the result.
#
# Side effect: @protocol on the receiver ends up holding the protocol used
# for this url.
#
# @param raw_url [#to_s] uncanonicalized url; it is not modified
# @return [String] canonicalized url string
def self.url(raw_url)
  # String#to_s returns the receiver itself, so take a copy before any
  # in-place change; the caller's (possibly frozen) string stays untouched.
  # The copy is tagged ASCII-8BIT to avoid InvalidByteSequenceError.
  cann = raw_url.to_s.dup.force_encoding('ASCII-8BIT')

  # remove tabs, carriage returns and line feeds, then surrounding whitespace
  cann.delete!("\t\r\n")
  cann.gsub!(/\A\s+|\s+\Z/, '')

  cann = remove_fragment(cann)

  # repeatedly unescape until no more escaping
  cann = recursively_unescape(cann)

  # remove_protocol only assigns @protocol when the url has one. Clear the
  # value left by a previous call so a protocol-less url falls back to
  # DEFAULT_PROTOCOL instead of inheriting an earlier url's protocol.
  @protocol = nil
  cann = remove_protocol(cann)

  # split into host and path components
  splits = split_host_path(cann)

  cann = fix_host(splits[:host]) + '/' + fix_path(splits[:path])

  # add leading protocol
  @protocol ||= DEFAULT_PROTOCOL
  cann = @protocol + PROTOCOL_DELIMITER + cann

  strict_escape(cann)
end
urls_for_lookup(lookup_url) click to toggle source

Generate the url permutations for lookup

@param (String) lookup_url uncanonicalized url string @return (Array) array of canonicalized url permutation strings

# File lib/google_safe_browsing/canonicalize.rb, line 54
# Generates the host/path permutations to look up for a url.
#
# The url is canonicalized, stripped of protocol, credentials and port, and
# split into host and path. Host candidates are the full host plus the
# progressively shorter suffixes of the (at most five) trailing parts
# returned by TopLevelDomain.split_from_host. Each host candidate is
# combined with each path permutation.
#
# @param lookup_url [String] uncanonicalized url string
# @return [Array<String>] canonicalized url permutation strings; empty when
#   the canonical url is blank or the host contains no period
def self.urls_for_lookup(lookup_url)
  canonical = url(lookup_url)
  # nothing to look up for a blank canonical url
  return [] if canonical.blank?

  parts = split_host_path(remove_protocol(canonical))
  host_string = strip_username_password_and_port_from_host(parts[:host])

  # a usable host needs at least one period
  return [] unless host_string.include?('.')

  # NOTE(review): split_from_host presumably returns the host's labels with
  # the top-level domain kept together as the final element - confirm.
  labels = TopLevelDomain.split_from_host(host_string).last(5)

  # Every suffix except the last element on its own.
  suffixes = (0...(labels.length - 1)).map { |skip| labels.drop(skip).join('.') }
  host_strings = ([host_string] + suffixes).uniq

  cart_prod(host_strings, generate_path_strings(parts[:path]))
end