lib/google_safe_browsing/canonicalize.rb
require 'uri'
require 'ip'
require File.dirname(__FILE__) + '/top_level_domain.rb'
module GoogleSafeBrowsing
# Helpers to Canonicalize urls and generate url permutations for lookups
class Canonicalize
# Separator between the scheme and the rest of a url.
PROTOCOL_DELIMITER = '://'.freeze
# Scheme prepended to canonicalized urls that did not carry one.
DEFAULT_PROTOCOL = 'http'.freeze
# Base canonicalizer method.
#
# Steps, in order: strip tabs/CR/LF and surrounding whitespace, drop the
# fragment, unescape until stable, split off the protocol, normalize host
# and path, re-attach the protocol and escape the result.
#
# The argument is never modified; all work happens on a binary copy.
#
# @param (String) raw_url uncanonicalized url string (anything responding to to_s)
# @return (String) canonicalized url string
def self.url(raw_url)
  # force_encoding and gsub! modify their receiver, so take a private copy
  # first. ASCII-8BIT avoids InvalidByteSequenceError on malformed input.
  cann = raw_url.to_s.dup.force_encoding('ASCII-8BIT')

  # remove tabs, carriage returns and line feeds
  cann.gsub!("\t", '')
  cann.gsub!("\r", '')
  cann.gsub!("\n", '')
  cann.gsub!(/\A\s+|\s+\Z/, '')

  cann = remove_fragment(cann)
  # repeatedly unescape until no more escaping
  cann = recursively_unescape(cann)

  # remove_protocol records the scheme in @protocol only when it finds one,
  # so clear the value left behind by any earlier call before asking.
  @protocol = nil
  cann = remove_protocol(cann)

  # split into host and path components
  splits = split_host_path(cann)
  cann = fix_host(splits[:host]) + '/' + fix_path(splits[:path])

  # add leading protocol, falling back to the default when none was given
  protocol = @protocol
  protocol = DEFAULT_PROTOCOL if protocol.nil? || protocol.empty?
  strict_escape(protocol + PROTOCOL_DELIMITER + cann)
end
# Build every host/path combination that should be looked up for a url.
#
# The url is canonicalized first; the host is then reduced to itself plus
# the suffixes formed from (at most) its last five components, and each
# host is combined with every path permutation.
#
# @param (String) lookup_url uncanonicalized url string
# @return (Array) array of canonicalized url permutation strings; empty when
#   the canonical url is blank or the host contains no period
def self.urls_for_lookup(lookup_url)
  canonical = url(lookup_url)
  return [] if canonical.blank?

  parts = split_host_path(remove_protocol(canonical))
  bare_host = strip_username_password_and_port_from_host(parts[:host])
  return [] unless bare_host.include?('.')

  labels = TopLevelDomain.split_from_host(bare_host).last(5)
  # One suffix per leading component dropped, stopping before the final one.
  suffixes = (0...(labels.length - 1)).map { |skip| labels[skip..-1].join('.') }
  hosts = ([bare_host] + suffixes).uniq

  cart_prod(hosts, generate_path_strings(parts[:path]))
end
# Generates the path permutations from the raw path string.
#
# The result always starts with '/', followed by the prefixes built from the
# first three path components (longest first). A candidate without a '.' is
# treated as a directory and gets a trailing slash. When a query string is
# present, each candidate that does not end in '/' is additionally returned
# with the query appended.
#
# @param (String) raw_path path split from the full url string
# @return (Array) array of path permutation strings
def self.generate_path_strings(raw_path)
  return ['/', ''] if raw_path == ''

  # Split on the first '?' only, so a query that itself contains '?' is
  # kept whole instead of being truncated at the second '?'.
  path, params = raw_path.split('?', 2)
  path ||= ''
  params ||= ''

  components = path.split('/').first(3)
  prefixes = components.length.downto(1).map do |count|
    '/' + components.first(count).join('/')
  end

  path_strings = (['/'] + prefixes).map do |candidate|
    candidate += '/' unless candidate.include?('.')
    # squeeze never returns nil, unlike gsub!, so no candidate can be lost
    candidate.squeeze('/')
  end
  path_strings.uniq!
  return path_strings if params.blank?

  with_query = path_strings.map do |candidate|
    candidate.end_with?('/') ? candidate : "#{candidate}?#{params}"
  end
  path_strings | with_query
end
# Cartesian product of two collections, with each pair joined into a single
# string by concatenating the string form of both elements. Results are
# ordered by the first collection, then by the second.
#
# @param (Array) a_one array of strings
# @param (Array) a_two array of strings
# @return (Array) every "#{left}#{right}" combination
def self.cart_prod(a_one, a_two)
  a_one.flat_map do |left|
    a_two.map { |right| "#{left}#{right}" }
  end
end
# Splits a protocol-less url at its first '/' into host and path.
#
# The separating slash belongs to neither part. Without any slash the whole
# string is the host and the path is empty; a leading slash yields an empty
# host.
#
# @param (String) cann canonicalized url string without its protocol
# @return (Hash) !{ host: host_part, path: path_part }
def self.split_host_path(cann)
  # partition handles a slash at index 0 correctly, where the range
  # 0..(index - 1) would become 0..-1 and select the entire string.
  host, _slash, path = cann.partition('/')
  { host: host, path: path }
end
# Strips the fragment portion of the url string (the first '#' and
# everything after it).
#
# @param (String) string url
# @return (String) parameter with the fragment removed; the string itself
#   when it contains no '#'
def self.remove_fragment(string)
  hash_at = string.index('#')
  # Exclusive range: with '#' at index 0 the result must be empty, whereas
  # 0..(index - 1) would turn into 0..-1 and keep the whole string.
  hash_at ? string[0...hash_at] : string
end
# Keeps percent-decoding the url until another pass changes nothing.
#
# Uses the RFC 2396 parser object rather than URI.unescape, which no longer
# exists on Ruby 3.0 and later.
#
# @param (String) url url string
# @return (String) fully unescaped url string
def self.recursively_unescape(url)
  # URI::RFC2396_PARSER exists on newer uri versions; on older ones the
  # default parser is the RFC 2396 implementation.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
  loop do
    decoded = parser.unescape(url)
    return decoded if decoded == url

    url = decoded
  end
end
# Apply initial fixes to the host string.
#
# Leading/trailing dots are removed, runs of dots collapse to one and the
# string is lower-cased. A host part that is a bare integer or a dotted quad
# is rewritten to the address string produced by the IP library. Credentials
# and port, when present, are re-attached around the normalized host.
#
# The argument is not modified.
#
# @param (String) host host string, optionally with "creds@" and ":port"
# @return (String) standardized host string
def self.fix_host(host)
  # remove leading and trailing dots, multiple dots to one
  host = host.gsub(/\A\.+|\.+\Z/, '').gsub(/\.+/, '.').downcase

  parts = split_username_password_and_port(host)
  bare_host = parts[:host]

  # Convert using the bare host part only; the full string may still carry
  # credentials and a port, which would corrupt the numeric conversion.
  if bare_host =~ /\A\d+\z/
    parts[:host] = IP::V4.new(bare_host.to_i).to_addr
  elsif bare_host =~ /\A\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\z/
    begin
      parts[:host] = IP.new(bare_host).to_addr
    rescue ArgumentError
      # Not a parseable address after all: keep the textual host as it is.
    end
  end

  result = parts[:host]
  result = "#{parts[:creds]}@#{result}" unless parts[:creds].blank?
  result = "#{result}:#{parts[:port]}" unless parts[:port].blank?
  result
end
# Apply initial fixes to the path string.
#
# Everything from the first '?' onward is treated as the query and passed
# through untouched. In the path itself empty and '.' segments are dropped,
# '..' removes the preceding segment, and a trailing slash is kept when the
# resulting path is not empty. The argument is not modified.
#
# @param (String) path path string (without the slash that follows the host)
# @return (String) standardized path string, query re-attached
def self.fix_path(path)
  query_at = path.index('?')
  if query_at
    params = path[query_at..-1]
    # Exclusive range: a query at index 0 must leave an empty path, where
    # 0..(index - 1) would be 0..-1 and duplicate the query in the result.
    path = path[0...query_at]
  end

  # Decide this on the path alone; a '/' ending the query says nothing
  # about the path.
  preserve_trailing_slash = path.end_with?('/')

  segments = []
  path.split('/').each do |segment|
    case segment
    when '', '.'
      # leading or repeated slashes and "current directory": contribute nothing
    when '..'
      segments.pop
    else
      segments << segment
    end
  end

  path = segments.join('/')
  path += '/' if preserve_trailing_slash && !path.empty?
  path += params if params
  path
end
# Escape the url, but leave the caret ('^') unescaped.
#
# Uses the RFC 2396 parser object rather than URI.escape, which no longer
# exists on Ruby 3.0 and later.
#
# @param (String) url url string
# @return (String) escaped url string
def self.strict_escape(url)
  # URI::RFC2396_PARSER exists on newer uri versions; on older ones the
  # default parser is the RFC 2396 implementation.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
  # unescape caret, may need other optionally escapable chars
  parser.escape(url).gsub('%5E', '^')
end
# Strip the leading protocol from the url string.
#
# Side effect: @protocol is set to the protocol that was removed, or to nil
# when the string does not start with one.
#
# A '://' only counts as the protocol delimiter when everything before it is
# made of scheme characters (letters, digits, '+', '.', '-'). This keeps a
# '://' that appears later in a scheme-less url (for instance inside a query
# string) from being mistaken for the protocol.
#
# @param (String) cann url string
# @return (String) url string without the protocol
def self.remove_protocol(cann)
  delimiter_at = cann.index(PROTOCOL_DELIMITER)
  # Exclusive range so a delimiter at index 0 gives an empty prefix.
  prefix = delimiter_at && cann[0...delimiter_at]

  if prefix && prefix =~ /\A[a-zA-Z0-9+.\-]*\z/
    @protocol = prefix.empty? ? nil : prefix
    cann[(delimiter_at + PROTOCOL_DELIMITER.length)..-1]
  else
    # Clear any protocol remembered from a previous call.
    @protocol = nil
    cann
  end
end
# Strip the user name, password and port number from the host part of a url.
#
# Credentials are removed first: they may contain a ':' of their own
# ("user:password@host"), which must not be taken for the port separator.
#
# @param (String) host_string host portion of the url
# @return (String) host portion of the url without the username, password and port
def self.strip_username_password_and_port_from_host(host_string)
  remove_port(remove_username_and_password(host_string))
end
# Drop the port from a host string, keeping only the :host entry of the
# split_port result.
#
# @param (see strip_username_password_and_port_from_host)
# @return (String) host part without the port number
def self.remove_port(host_string)
  parts = split_port(host_string)
  parts[:host]
end
# Drop the credentials from a host string, keeping only the :host entry of
# the split_username_and_password result.
#
# @param (see remove_port)
# @return (String) host part of url without user name or password
def self.remove_username_and_password(host_string)
  parts = split_username_and_password(host_string)
  parts[:host]
end
# Split user name and password from the host.
#
# The split happens at the last '@', so credentials that themselves contain
# an '@' do not end up in the host. With no '@' present the whole string is
# the host and :creds is nil.
#
# @param (see remove_port)
# @return (Hash) :host has the host string, :creds holds the username and password string
def self.split_username_and_password(host_string)
  creds, at_sign, host = host_string.rpartition('@')
  # rpartition returns an empty separator when '@' does not occur.
  return { host: host_string, creds: nil } if at_sign.empty?

  { host: host, creds: creds }
end
# Split port number and host string into a hash.
#
# The split happens at the last ':'. Everything before it is the host and
# everything after it is the port; an empty port is reported as nil. With no
# ':' present the whole string is the host.
#
# @param (see remove_port)
# @return (Hash) :host has the host string, :port holds the port number
def self.split_port(host_string)
  colon_at = host_string.rindex(':')
  return { host: host_string, port: nil } unless colon_at

  # Slice around the same colon that rindex located, so the host and port
  # stay consistent even when the string contains several colons.
  port = host_string[(colon_at + 1)..-1]
  { host: host_string[0...colon_at], port: (port.empty? ? nil : port) }
end
# Break a host string into credentials, host and port.
#
# Credentials are separated first; the remaining host is then split into
# host and port, and that second result overrides :host in the first.
#
# @param (see remove_port)
# @return (Hash) :host as the host string; :creds has the username and password; :port holds the port number
def self.split_username_password_and_port(host_string)
  creds_split = split_username_and_password(host_string)
  port_split = split_port(creds_split[:host])
  creds_split.merge(port_split)
end
end
end