VIPnytt/RobotsTxtParser

View on GitHub
src/Parser/UriParser.php

Summary

Maintainability
A
0 mins
Test Coverage
A
100%
<?php
/**
 * vipnytt/RobotsTxtParser
 *
 * @link https://github.com/VIPnytt/RobotsTxtParser
 * @license https://github.com/VIPnytt/RobotsTxtParser/blob/master/LICENSE The MIT License (MIT)
 */

namespace vipnytt\RobotsTxtParser\Parser;

/**
 * Class UriParser
 *
 * Small helper around a single URI string: percent-encodes it, validates its
 * scheme/host against a white-list, resolves relative paths against a fallback
 * base and extracts the `scheme://host:port` base.
 *
 * Note that encode() and convertToFull() modify the URI stored in the instance.
 */
class UriParser
{
    /**
     * Scheme white-list
     * @var string[]
     */
    protected $schemes = [
        'http',
        'https',
        'ftp',
        'ftps',
        'sftp',
    ];

    /**
     * URI
     * @var string
     */
    private $uri;

    /**
     * UriParser constructor.
     *
     * @param string $uri URI to work on (absolute, or a path starting with `/`)
     */
    public function __construct($uri)
    {
        $this->uri = $uri;
    }

    /**
     * Convert relative to full
     *
     * The stored URI is encoded first. If it is then a valid absolute URI it is
     * returned as-is; if it starts with `/` it is appended to the base of
     * $fallbackBase (which replaces the stored URI in the process).
     *
     * @param string $fallbackBase Absolute URI whose base is used for relative paths
     * @return string
     * @throws \InvalidArgumentException When the URI is neither valid nor a `/`-prefixed
     *                                   path, or when $fallbackBase itself is invalid
     */
    public function convertToFull($fallbackBase)
    {
        $this->encode();
        if ($this->validate()) {
            return $this->uri;
        } elseif (strpos($this->uri, '/') === 0) {
            $relative = $this->uri;
            // base() operates on the stored URI, so swap in the fallback before calling it.
            $this->uri = $fallbackBase;
            return $this->base() . $relative;
        }
        throw new \InvalidArgumentException("Invalid URI `$this->uri`");
    }

    /**
     * URI encoder according to RFC 3986
     * Returns a string containing the encoded URI with disallowed characters converted to their percentage encodings.
     * The stored URI is replaced by the encoded one, and its scheme/host part is lower-cased.
     * @link http://publicmind.in/blog/url-encoding/
     *
     * @return string
     */
    public function encode()
    {
        $reserved = [
            '!%21!ui' => "!",
            '!%23!ui' => "#",
            '!%24!ui' => "$",
            '!%26!ui' => "&",
            '!%27!ui' => "'",
            '!%28!ui' => "(",
            '!%29!ui' => ")",
            '!%2A!ui' => "*",
            '!%2B!ui' => "+",
            '!%2C!ui' => ",",
            '!%2F!ui' => "/",
            '!%3A!ui' => ":",
            '!%3B!ui' => ";",
            '!%3D!ui' => "=",
            '!%3F!ui' => "?",
            '!%40!ui' => "@",
            '!%5B!ui' => "[",
            '!%5D!ui' => "]",
            '!%25!ui' => "%",
        ];
        // The % character must be the last in the $reserved array.
        // This makes sure that the already encoded values are not lost or encoded again.
        $this->uri = preg_replace(array_keys($reserved), array_values($reserved), rawurlencode($this->uri));
        return $this->baseToLowercase();
    }

    /**
     * Base uri to lowercase
     *
     * Lower-cases everything from the start of the URI up to and including the
     * host. URIs without a parsable host are left untouched.
     *
     * @return string
     */
    private function baseToLowercase()
    {
        $host = parse_url($this->uri, PHP_URL_HOST);
        // parse_url() yields null when there is no host and false on malformed input.
        if (!is_string($host)) {
            return $this->uri;
        }
        $pos = strpos($this->uri, $host) + strlen($host);
        return $this->uri = substr_replace($this->uri, strtolower(substr($this->uri, 0, $pos)), 0, $pos);
    }

    /**
     * Validate
     *
     * A URI is valid when it passes FILTER_VALIDATE_URL (or has an IP host),
     * has both a scheme and a host, the host is a valid host name or IP
     * address, and the scheme is white-listed.
     *
     * @return bool
     */
    public function validate()
    {
        $host = parse_url($this->uri, PHP_URL_HOST);
        if (
            filter_var($this->uri, FILTER_VALIDATE_URL) === false &&
            // PHP 5.x bug fix: FILTER_VALIDATE_URL doesn't support IPv6 urls. IP check not needed in the future.
            !$this->validateIP(is_string($host) ? $host : '')
        ) {
            return false;
        }
        $parsed = parse_url($this->uri);
        // Both parts are required below; without this guard a missing key would be
        // read as null and the part validators would silently inspect the path instead.
        if ($parsed === false || !isset($parsed['host'], $parsed['scheme'])) {
            return false;
        }
        return (
            (
                $this->validateHost($parsed['host']) ||
                $this->validateIP($parsed['host'])
            ) &&
            $this->validateScheme($parsed['scheme'])
        );
    }

    /**
     * Validate IPv4 or IPv6
     *
     * IPv6 addresses may be wrapped in square brackets, as they are in URIs.
     *
     * @param  string|null $ipAddress Address to check; defaults to the host of the stored URI
     * @return bool
     */
    public function validateIP($ipAddress = null)
    {
        if ($ipAddress === null) {
            $ipAddress = parse_url($this->uri, PHP_URL_HOST);
        }
        // No host available (null/false) or empty input: nothing to validate.
        if (!is_string($ipAddress) || $ipAddress === '') {
            return false;
        }
        return (
            filter_var($ipAddress, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false ||
            filter_var(trim($ipAddress, '[]'), FILTER_VALIDATE_IP, FILTER_FLAG_IPV6) !== false
        );
    }

    /**
     * Validate host name
     *
     * @link http://stackoverflow.com/questions/1755144/how-to-validate-domain-name-in-php
     *
     * @param  string|null $host Host to check; defaults to the host of the stored URI,
     *                           or its path when no host component is present
     * @return bool
     */
    public function validateHost($host = null)
    {
        if ($host === null) {
            $parsed = parse_url($this->uri);
            if (isset($parsed['host'])) {
                $host = $parsed['host'];
            } else {
                // A bare host name such as `example.com` is reported by parse_url() as a path.
                $host = isset($parsed['path']) ? $parsed['path'] : '';
            }
        }
        return (
            preg_match("/^([a-z\d](-*[a-z\d])*)(\.([a-z\d](-*[a-z\d])*))*$/i", $host) //valid chars check
            && preg_match("/^.{1,253}$/", $host) //overall length check
            && preg_match("/^[^\.]{1,63}(\.[^\.]{1,63})*$/", $host) //length of each label
            && !$this->validateIP($host)
        );
    }

    /**
     * Validate scheme
     *
     * @param  string|null $scheme Scheme to check; defaults to the scheme of the stored URI
     * @return bool True when the scheme is in the white-list
     */
    public function validateScheme($scheme = null)
    {
        if ($scheme === null) {
            // Read the scheme component itself (null when the URI has none).
            $scheme = parse_url($this->uri, PHP_URL_SCHEME);
        }
        return in_array($scheme, $this->schemes, true);
    }

    /**
     * Base
     *
     * Builds the lower-cased `scheme://host:port` part of the stored URI. When
     * the URI carries no explicit port, the port is looked up from the scheme's
     * service name; if that lookup fails the port part is left empty.
     *
     * @return string
     * @throws \InvalidArgumentException When the stored URI is not valid
     */
    public function base()
    {
        if (!$this->validate()) {
            throw new \InvalidArgumentException("Invalid URI: $this->uri");
        }
        $parts = [
            'scheme' => parse_url($this->uri, PHP_URL_SCHEME),
            'host' => parse_url($this->uri, PHP_URL_HOST),
        ];
        $parts['port'] = is_int($port = parse_url($this->uri, PHP_URL_PORT)) ? $port : getservbyname($parts['scheme'], 'tcp');
        return strtolower($parts['scheme'] . '://' . $parts['host'] . ':' . $parts['port']);
    }

    /**
     * Strip fragment
     *
     * Returns the stored URI without its `#fragment` part; the stored URI
     * itself is not modified.
     *
     * @return string
     */
    public function stripFragment()
    {
        return explode('#', $this->uri, 2)[0];
    }
}