# mobiledefense/google_safe_browsing
#
# View on GitHub
# lib/google_safe_browsing/canonicalize.rb
#
# Summary
#
# Maintainability
# B
# 4 hrs
# Test Coverage
require 'uri'
require 'ip'
require File.dirname(__FILE__) + '/top_level_domain.rb'

module GoogleSafeBrowsing
  # Helpers to Canonicalize urls and generate url permutations for lookups
  class Canonicalize
    # Separator between the scheme and the rest of a url
    PROTOCOL_DELIMITER = '://'.freeze
    # Scheme prepended to canonicalized urls that did not carry one
    DEFAULT_PROTOCOL = 'http'.freeze

    # Base Canonicalizer method
    #
    # Strips tabs/CR/LF and surrounding whitespace, removes the fragment,
    # fully percent-decodes, normalizes host and path, and re-escapes the
    # result. The scheme found on the url is kept; a url without one gets
    # DEFAULT_PROTOCOL.
    #
    # The argument itself is never modified.
    #
    # NOTE: the scheme is handed over by remove_protocol through the
    # class-level @protocol, which is shared state; concurrent calls can
    # interfere with each other.
    #
    # @param (String) raw_url uncanonicalized url string
    # @return (String) canonicalized url string
    def self.url(raw_url)
      # String#to_s returns the receiver itself, so duplicate before changing
      # the encoding. ASCII-8BIT is used to avoid InvalidByteSequenceError.
      cann = raw_url.to_s.dup.force_encoding('ASCII-8BIT')

      # remove tabs, carriage returns and line feeds, then surrounding
      # whitespace
      cann = cann.delete("\t\r\n").gsub(/\A\s+|\s+\Z/, '')

      cann = remove_fragment(cann)

      # repeatedly unescape until no more escaping
      cann = recursively_unescape(cann)

      # Forget the scheme recorded by any earlier call, then let
      # remove_protocol record the scheme of this url (if it has one).
      @protocol = nil
      cann = remove_protocol(cann)
      @protocol ||= DEFAULT_PROTOCOL
      protocol = @protocol

      # split into host and path components and normalize each
      splits = split_host_path(cann)
      cann = fix_host(splits[:host]) + '/' + fix_path(splits[:path])

      # add leading protocol and escape
      strict_escape(protocol + PROTOCOL_DELIMITER + cann)
    end

    # Builds every host/path combination that should be looked up for a url
    #
    # The url is canonicalized first; credentials and port are dropped from
    # the host. Host candidates are the full host plus shorter suffixes built
    # from at most the last five parts returned by
    # TopLevelDomain.split_from_host (presumably the host's labels with the
    # top level domain kept together - confirm against TopLevelDomain).
    #
    # @param (String) lookup_url uncanonicalized url string
    # @return (Array) array of cannonicalized url permutation strings; empty
    #   when the host contains no period
    def self.urls_for_lookup(lookup_url)
      canonical = url(lookup_url)
      # defensive guard: nothing to look up when canonicalization is blank
      return [] if canonical.blank?

      parts = split_host_path(remove_protocol(canonical))
      bare_host = strip_username_password_and_port_from_host(parts[:host])

      # a host without at least one period is not looked up
      return [] unless bare_host.include?('.')

      labels = TopLevelDomain.split_from_host(bare_host).last(5)
      # every suffix of the labels that still has at least two parts
      suffixes = (0...(labels.length - 1)).map do |start|
        labels[start..-1].join('.')
      end
      host_strings = ([bare_host] + suffixes).uniq

      cart_prod(host_strings, generate_path_strings(parts[:path]))
    end

    # Generates the path permutations from the raw path string
    #
    # Candidates are '/', plus the path truncated to its first three
    # components and each shorter prefix of those. A candidate without a '.'
    # gets a trailing slash. When a query is present, every candidate that
    # does not end in '/' is also returned with "?query" appended.
    #
    # @param (String) raw_path path split from the full url string (no
    #   leading host, may include a query)
    # @return (Array) array of path permutation strings
    def self.generate_path_strings(raw_path)
      return ['/', ''] if raw_path == ''

      # Split on the first '?' only, so a query that itself contains '?' is
      # kept whole instead of being cut at the second '?'
      path, params = raw_path.split('?', 2)
      path ||= ''
      params ||= ''

      path_components = path.split('/').first(3)
      path_strings = ['/']
      # longest prefix first, down to the single first component
      path_components.length.downto(1) do |count|
        path_strings << '/' + path_components.first(count).join('/')
      end

      path_strings.map! do |candidate|
        candidate += '/' unless candidate.include?('.')
        # non-bang gsub: gsub! returns nil when nothing is replaced
        candidate.gsub(%r{/+}, '/')
      end
      path_strings.uniq!

      return path_strings if params.blank?

      with_query = path_strings.reject { |candidate| candidate.end_with?('/') }
                               .map { |candidate| "#{candidate}?#{params}" }
      path_strings | with_query
    end

    # Builds the cartesian product of two arrays, joining each pair into a
    # single string (string form of the first element followed by the string
    # form of the second). Pairs are ordered by the first array, then by the
    # second.
    #
    # @param (Array) a_one array of strings
    # @param (Array) a_two array of strings
    # @return (Array) cartesian product of arrays with elements concatenated
    def self.cart_prod(a_one, a_two)
      a_one.flat_map do |left|
        a_two.map { |right| "#{left}#{right}" }
      end
    end

    # Takes the canonicalized url (without protocol) and splits the host and
    # the path apart at the first '/'. The slash itself belongs to neither
    # part. Without any '/', the whole string is the host and the path is ''.
    #
    # @param (String) cann canonicalized url string
    # @return (Hash) { host: host_part, path: path_part }
    def self.split_host_path(cann)
      # partition also handles a '/' at position 0 (empty host), where an
      # inclusive 0..(index - 1) range would wrap around to the whole string
      host, _slash, path = cann.partition('/')
      { host: host, path: path }
    end

    # Strips the fragment portion of the url string (the first '#' and
    # everything after it)
    #
    # @param (String) string url
    # @return (String) parameter with the fragment removed; the parameter
    #   itself when it has no '#'
    def self.remove_fragment(string)
      hash_index = string.index('#')
      return string unless hash_index

      # exclusive range: with '#' at position 0 the result must be '', while
      # an inclusive 0..(index - 1) range would wrap around to the whole string
      string[0...hash_index]
    end

    # Continues to percent-decode the url until decoding has no effect
    #
    # @param (String) url url string
    # @return (String) fully unescaped url string
    def self.recursively_unescape(url)
      # URI.unescape was removed in Ruby 3.0; the RFC 2396 parser still
      # provides it. Newer uri releases expose that parser as
      # URI::RFC2396_PARSER, older ones only as URI::DEFAULT_PARSER.
      parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

      loop do
        decoded = parser.unescape(url)
        # a pass that changes nothing means no escapes are left
        return decoded if decoded == url

        url = decoded
      end
    end

    # Apply initial fixes to host string
    #
    # Removes leading/trailing dots, collapses runs of dots and lowercases
    # the host. A host that is a single decimal number or looks like a dotted
    # quad is rewritten through the IP library. Credentials and port, when
    # present, are put back around the normalized host.
    #
    # The argument itself is not modified.
    #
    # @param (String) host host string, optionally with "creds@" and ":port"
    # @return (String) standardized host string
    def self.fix_host(host)
      # remove leading and trailing dots, multiple dots to one (on a copy)
      host = host.gsub(/\A\.+|\.+\Z/, '').gsub(/\.+/, '.').downcase

      host_splits = split_username_password_and_port(host)
      bare_host = host_splits[:host]

      # Only the bare host is handed to the IP library: parsing the full
      # string would include credentials and port in the number or make the
      # address parse fail.
      if bare_host =~ /\A\d+\z/
        # a single decimal number: an IPv4 address written as an integer
        host_splits[:host] = IP::V4.new(bare_host.to_i).to_addr
      elsif bare_host =~ /\A\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\z/
        begin
          host_splits[:host] = IP.new(bare_host).to_addr
        rescue ArgumentError
          # not accepted as an address; keep the host text as it is
        end
      end

      result = host_splits[:host]
      result = "#{host_splits[:creds]}@#{result}" unless host_splits[:creds].blank?
      result = "#{result}:#{host_splits[:port]}" unless host_splits[:port].blank?
      result
    end

    # Apply initial fixes to path string
    #
    # Drops one leading slash, collapses runs of slashes, resolves '.' and
    # '..' segments and keeps a trailing slash of the path. The query
    # (everything from the first '?') is carried over untouched.
    #
    # The argument itself is not modified.
    #
    # @param (String) path path string, may include a query
    # @return (String) standardized path string
    def self.fix_path(path)
      # remove leading slash
      path = path[1..-1] if path.start_with?('/')

      # Separate the query first. partition copes with a '?' at position 0,
      # where an inclusive 0..(index - 1) range would wrap around and
      # duplicate the query.
      path, question, query = path.partition('?')
      params = question + query

      # decided on the path alone, so a '/' inside the query does not count
      # and a query does not hide the path's own trailing slash
      preserve_trailing_slash = path.end_with?('/')

      segments = []
      path.split('/').each do |segment|
        case segment
        when '', '.'
          # empty segments come from repeated slashes; '.' is a no-op
          next
        when '..'
          segments.pop
        else
          segments << segment
        end
      end

      path = segments.join('/')
      # no slash for an empty path: the caller already supplies the one
      # between host and path
      path += '/' if preserve_trailing_slash && !segments.empty?
      path + params
    end

    # Escape the url, but do not escape certain characters; such as the caret
    #
    # @param (String) url url string
    # @return (String) escaped url string
    def self.strict_escape(url)
      # URI.escape was removed in Ruby 3.0; the RFC 2396 parser still
      # provides it. Newer uri releases expose that parser as
      # URI::RFC2396_PARSER, older ones only as URI::DEFAULT_PARSER.
      parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
      escaped = parser.escape(url)

      # unescape caret, may need other optionally escapeable chars
      escaped.gsub('%5E', '^')
    end

    # Strip the leading protocol from the url string
    #
    # When a protocol is stripped it is stored in the class-level @protocol as
    # a side effect; otherwise @protocol is left alone.
    #
    # @param (String) cann url string
    # @return (String) url string without the protocol; the parameter itself
    #   when it has no leading protocol
    def self.remove_protocol(cann)
      scheme, delimiter, rest = cann.partition(PROTOCOL_DELIMITER)

      # Only text in front of the host can be a protocol. An empty prefix, or
      # one that already contains '/' or '?', means the delimiter belongs to
      # the path or query (e.g. "a.com/?u=http://b.com").
      return cann if delimiter.empty? || scheme.empty? || scheme =~ %r{[/?]}

      @protocol = scheme
      rest
    end

    # Strip the user name, password and port number from the host part
    #
    # Credentials are removed first: the "user:password" part contains a ':'
    # of its own, so looking for the port separator before the credentials
    # are gone would split the string in the wrong place.
    #
    # @param (String) host_string host portion of the url
    # @return (String) host portion of the url without the username, password and port
    def self.strip_username_password_and_port_from_host(host_string)
      remove_port(remove_username_and_password(host_string))
    end

    # Returns the host string with its port number dropped, i.e. the :host
    # entry of split_port
    #
    # @param (String) host_string host portion of the url
    # @return (String) host part without the port number
    def self.remove_port(host_string)
      parts = split_port(host_string)
      parts[:host]
    end

    # Returns the host string with user name and password dropped, i.e. the
    # :host entry of split_username_and_password
    #
    # @param (String) host_string host portion of the url
    # @return (String) host part of url without user name or password
    def self.remove_username_and_password(host_string)
      parts = split_username_and_password(host_string)
      parts[:host]
    end

    # Split user name and password from the host
    #
    # The credentials end at the LAST '@', so an '@' inside the password
    # stays with the credentials. Without any '@' the whole string is the
    # host and :creds is nil.
    #
    # @param (String) host_string host portion of the url
    # @return (Hash) :host has the host string, :creds holds the username and password string
    def self.split_username_and_password(host_string)
      creds, at_sign, host = host_string.rpartition('@')
      return { host: host_string, creds: nil } if at_sign.empty?

      # host is '' (never nil) for input that ends in '@'
      { host: host, creds: creds }
    end

    # Split port number and host string into a hash
    #
    # The port is whatever follows the LAST ':'; everything before it is the
    # host. Without any ':' (or with nothing after it) :port is nil.
    # Credentials are expected to have been removed already, since they may
    # contain a ':' themselves.
    #
    # @param (String) host_string host portion of the url
    # @return (Hash) :host has the host string, :port holds the port number
    def self.split_port(host_string)
      host, colon, port = host_string.rpartition(':')
      return { host: host_string, port: nil } if colon.empty?

      { host: host, port: (port.empty? ? nil : port) }
    end

    # Breaks a host string into credentials, bare host and port
    #
    # Credentials are separated first; the remaining host is then split into
    # host and port, and that result overrides the :host of the first step.
    #
    # @param (String) host_string host portion of the url
    # @return (Hash) :host as the host string; :creds has the username and password; :port holds the port number
    def self.split_username_password_and_port(host_string)
      credentials = split_username_and_password(host_string)
      host_and_port = split_port(credentials[:host])
      credentials.merge(host_and_port)
    end
  end
end