VIPnytt/SitemapParser

View on GitHub
src/SitemapParser.php

Summary

Maintainability
D
1 day
Test Coverage
A
90%
<?php

namespace vipnytt;

use GuzzleHttp;
use SimpleXMLElement;
use vipnytt\SitemapParser\Exceptions;
use vipnytt\SitemapParser\UrlParser;

/**
 * SitemapParser class
 *
 * @license https://opensource.org/licenses/MIT MIT license
 * @link https://github.com/VIPnytt/SitemapParser
 *
 * Specifications:
 * @link http://www.sitemaps.org/protocol.html
 */
class SitemapParser
{
    use UrlParser;

    /**
     * Default User-Agent
     */
    const DEFAULT_USER_AGENT = 'SitemapParser-VIPnytt/1.1 (+https://github.com/VIPnytt/SitemapParser/blob/master/README.md)';

    /**
     * Default encoding
     */
    const ENCODING = 'UTF-8';

    /**
     * XML file extension
     */
    const XML_EXTENSION = 'xml';

    /**
     * Compressed XML file extension
     */
    const XML_EXTENSION_COMPRESSED = 'xml.gz';

    /**
     * XML Sitemap tag
     */
    const XML_TAG_SITEMAP = 'sitemap';

    /**
     * XML URL tag
     */
    const XML_TAG_URL = 'url';

    /**
     * Robots.txt path
     */
    const ROBOTSTXT_PATH = '/robots.txt';

    /**
     * User-Agent to send with every HTTP(S) request
     * @var string
     */
    protected $userAgent;

    /**
     * Configuration options
     * @var array
     */
    protected $config = [];

    /**
     * Sitemaps discovered
     * @var array
     */
    protected $sitemaps = [];

    /**
     * URLs discovered
     * @var array
     */
    protected $urls = [];

    /**
     * Sitemap URLs discovered but not yet parsed
     * @var array
     */
    protected $queue = [];

    /**
     * Parsed URLs history
     * @var array
     */
    protected $history = [];

    /**
     * Current URL being parsed
     * @var null|string
     */
    protected $currentURL;

    /**
     * @var \GuzzleHttp\Client
     */
    protected $client;

    /**
     * Constructor
     *
     * @param string $userAgent User-Agent to send with every HTTP(S) request
     * @param array $config Configuration options
     * @throws Exceptions\SitemapParserException
     */
    public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config = [], GuzzleHttp\Client $client = null)
    {
        mb_language("uni");
        if (!mb_internal_encoding(self::ENCODING)) {
            throw new Exceptions\SitemapParserException('Unable to set internal character encoding to `' . self::ENCODING . '`');
        }
        $this->userAgent = $userAgent;
        $this->config = $config;

        if (!is_null($client)) {
            $this->setClient($client);
        }
    }

    /**
     * Parse Recursive
     *
     * @param string $url
     * @return void
     * @throws Exceptions\SitemapParserException
     */
    public function parseRecursive($url)
    {
        $this->addToQueue([$url]);
        $this->clean();
        while (count($todo = $this->getQueue()) > 0) {
            $sitemaps = $this->sitemaps;
            $urls = $this->urls;
            try {
                $this->parse($todo[0]);
            } catch (Exceptions\TransferException $e) {
                // Keep crawling
            }
            $this->sitemaps = array_merge_recursive($sitemaps, $this->sitemaps);
            $this->urls = array_merge_recursive($urls, $this->urls);
        }
    }

    /**
     * Add an array of URLs to the parser queue
     *
     * @param array $urlArray
     */
    public function addToQueue(array $urlArray)
    {
        foreach ($urlArray as $url) {
            $url = $this->urlEncode($url);
            if ($this->urlValidate($url)) {
                $this->queue[] = $url;
            }
        }
    }

    /**
     * Sitemap URLs discovered but not yet parsed
     *
     * @return array
     */
    public function getQueue()
    {
        $this->queue = array_values(array_diff(array_unique(array_merge($this->queue, array_keys($this->sitemaps))), $this->history));
        return $this->queue;
    }

    /**
     * Parse
     *
     * @param string $url URL to parse
     * @param string|null $urlContent URL body content (provide to skip download)
     * @return void
     * @throws Exceptions\TransferException
     * @throws Exceptions\SitemapParserException
     */
    public function parse($url, $urlContent = null)
    {
        $this->clean();
        $this->currentURL = $this->urlEncode($url);
        if (!$this->urlValidate($this->currentURL)) {
            throw new Exceptions\SitemapParserException('Invalid URL');
        }
        $this->history[] = $this->currentURL;
        $response = is_string($urlContent) ? $urlContent : $this->getContent();
        if (parse_url($this->currentURL, PHP_URL_PATH) === self::ROBOTSTXT_PATH) {
            $this->parseRobotstxt($response);
            return;
        }
        // Check if content is an gzip file
        if (mb_strpos($response, "\x1f\x8b\x08", 0, "US-ASCII") === 0) {
            $response = gzdecode($response);
        }
        $sitemapJson = $this->generateXMLObject($response);
        if ($sitemapJson instanceof SimpleXMLElement === false) {
            $this->parseString($response);
            return;
        }
        $this->parseJson(self::XML_TAG_SITEMAP, $sitemapJson);
        $this->parseJson(self::XML_TAG_URL, $sitemapJson);
    }

    /**
     * Cleanup between each parse
     *
     * @return void
     */
    protected function clean()
    {
        $this->sitemaps = [];
        $this->urls = [];
    }

    /**
     * Request the body content of an URL
     *
     * @return string Raw body content
     * @throws Exceptions\TransferException
     * @throws Exceptions\SitemapParserException
     */
    protected function getContent()
    {
        $this->currentURL = $this->urlEncode($this->currentURL);
        if (!$this->urlValidate($this->currentURL)) {
            throw new Exceptions\SitemapParserException('Invalid URL');
        }
        try {
            if (strpos($this->currentURL, 'file://') === 0) {
                $path = parse_url($this->currentURL, PHP_URL_PATH);
                if (!$this->urlValidatePath($path)) {
                    throw new Exceptions\SitemapParserException('Invalid file path');
                }
                if (!file_exists($path) && PHP_OS === 'WINNT') {
                    return file_get_contents(urldecode($path));
                }
                return file_get_contents($path);
            }
            if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
                $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
            }
            $client = $this->getClient();
            $res = $client->request('GET', $this->currentURL, $this->config['guzzle']);
            return $res->getBody()->getContents();
        } catch (GuzzleHttp\Exception\TransferException $e) {
            throw new Exceptions\TransferException('Unable to fetch URL contents', 0, $e);
        } catch (GuzzleHttp\Exception\GuzzleException $e) {
            throw new Exceptions\SitemapParserException('GuzzleHttp exception', 0, $e);
        }
    }

    /**
     * Search for sitemaps in the robots.txt content
     *
     * @param string $robotstxt
     * @return bool
     */
    protected function parseRobotstxt($robotstxt)
    {
        // Split lines into array
        $lines = array_filter(array_map('trim', mb_split('\r\n|\n|\r', $robotstxt)));
        // Parse each line individually
        foreach ($lines as $line) {
            // Remove comments
            $line = mb_split('#', $line, 2)[0];
            // Split by directive and rule
            $pair = array_map('trim', mb_split(':', $line, 2));
            // Check if the line contains a sitemap
            if (
                mb_strtolower($pair[0]) !== self::XML_TAG_SITEMAP ||
                empty($pair[1])
            ) {
                // Line does not contain any supported directive
                continue;
            }
            $url = $this->urlEncode($pair[1]);
            if ($this->urlValidate($url)) {
                $this->addArray(self::XML_TAG_SITEMAP, ['loc' => $url]);
            }
        }
        return true;
    }

    /**
     * Validate URL arrays and add them to their corresponding arrays
     *
     * @param string $type sitemap|url
     * @param array $array Tag array
     * @return bool
     */
    protected function addArray($type, array $array)
    {
        if (!isset($array['loc'])) {
            return false;
        }
        $array['loc'] = $this->urlEncode(trim($array['loc']));
        if ($this->urlValidate($array['loc'])) {
            switch ($type) {
                case self::XML_TAG_SITEMAP:
                    $this->sitemaps[$array['loc']] = $this->fixMissingTags(['lastmod'], $array);
                    return true;
                case self::XML_TAG_URL:
                    $this->urls[$array['loc']] = $this->fixMissingTags(['lastmod', 'changefreq', 'priority'], $array);
                    return true;
            }
        }
        return false;
    }

    /**
     * Check for missing values and set them to null
     *
     * @param array $tags Tags check if exists
     * @param array $array Array to check
     * @return array
     */
    protected function fixMissingTags(array $tags, array $array)
    {
        foreach ($tags as $tag) {
            if (empty($array[$tag])) {
                $array[$tag] = null;
            }
        }
        return $array;
    }

    /**
     * Generate the \SimpleXMLElement object if the XML is valid
     *
     * @param string $xml
     * @return \SimpleXMLElement|false
     */
    protected function generateXMLObject($xml)
    {
        // strip XML comments from files
        // if they occur at the beginning of the file it will invalidate the XML
        // this occurs with certain versions of Yoast
        $xml = preg_replace('/\s*\<\!\-\-((?!\-\-\>)[\s\S])*\-\-\>\s*/', '', (string)$xml);
        try {
            libxml_use_internal_errors(true);
            return new SimpleXMLElement($xml, LIBXML_NOCDATA);
        } catch (\Exception $e) {
            return false;
        }
    }

    /**
     * Parse line separated text string
     *
     * @param string $string
     * @return bool
     */
    protected function parseString($string)
    {
        if (!isset($this->config['strict']) || $this->config['strict'] !== false) {
            // Strings are not part of any documented sitemap standard
            return false;
        }
        $array = array_filter(array_map('trim', mb_split('\r\n|\n|\r', $string)));
        foreach ($array as $line) {
            if ($this->isSitemapURL($line)) {
                $this->addArray(self::XML_TAG_SITEMAP, ['loc' => $line]);
                continue;
            }
            $this->addArray(self::XML_TAG_URL, ['loc' => $line]);
        }
        return true;
    }

    /**
     * Check if the URL may contain an Sitemap
     *
     * @param string $url
     * @return bool
     */
    protected function isSitemapURL($url)
    {
        $path = parse_url($this->urlEncode($url), PHP_URL_PATH);
        return $this->urlValidate($url) && (
                mb_substr($path, -mb_strlen(self::XML_EXTENSION) - 1) == '.' . self::XML_EXTENSION ||
                mb_substr($path, -mb_strlen(self::XML_EXTENSION_COMPRESSED) - 1) == '.' . self::XML_EXTENSION_COMPRESSED
            );
    }

    /**
     * Convert object to array recursively
     *
     * @param $object
     * @return array
     */
    protected function objectToArray($object)
    {
        if (is_object($object) || is_array($object)) {
            if (is_object($object) && $object instanceof SimpleXMLElement and count($object->getNamespaces()) != 0 ) {
                if (count($object->attributes()) != 0) {
                    $ret = [];
                    foreach($object->attributes() as $attribute) {
                        $ret[$attribute->getName()] = $attribute->__toString();
                    }
                } else {
                    $ret = (array)$object;
                }
            } else {
                $ret = (array)$object;
            }

            foreach($ret as &$item) {
                $item = $this->objectToArray($item);
            }

            return $ret;
        }
        return $object;
    }

    /**
     * Parse Json object
     *
     * @param string $type Sitemap or URL
     * @param \SimpleXMLElement $json object
     * @return bool
     */
    protected function parseJson($type, \SimpleXMLElement $json)
    {
        if (!isset($json->$type)) {
            return false;
        }

        $nameSpaces = $json->getDocNamespaces();
        $notEmptyNamespaceNames = array_filter(array_keys($nameSpaces));
        if (!empty($nameSpaces)) {
            foreach ($json->$type as $node) {
                $tags = ["namespaces" => array_combine($notEmptyNamespaceNames, array_fill(0,count($notEmptyNamespaceNames),[]))];
                foreach ($nameSpaces as $nameSpace => $value) {
                    if ($nameSpace != "") {
                        foreach($node->children($nameSpace, true) as $child) {
                            $tags["namespaces"][$nameSpace][] = [$child->getName() => $this->objectToArray($child)];
                        }
                    } else {
                        $tags = array_merge($tags, (array)$node);
                    }
                }

                $this->addArray($type, $tags);
            }
        } else {
            foreach ($json->$type as $node) {
                $this->addArray($type, (array)$node);
            }
        }

        return true;
    }

    /**
     * Sitemaps discovered
     *
     * @return array
     */
    public function getSitemaps()
    {
        return $this->sitemaps;
    }

    /**
     * URLs discovered
     *
     * @return array
     */
    public function getURLs()
    {
        return $this->urls;
    }

    /**
     * Get config
     *
     * @return array
     */
    public function getConfig(): array {
        return $this->config;
    }

    /**
     * Set config
     *
     * @param array $config
     * @return void
     */
    public function setConfig(array $config) {
        $this->config = $config;
    }

    /**
     * Get user agent
     *
     * @return string
     */
    public function getUserAgent() {
        return $this->userAgent;
    }

    /**
     * Change user agent after object creation
     *
     * @param string $userAgent
     */
    public function setUserAgent(string $userAgent) {
        $this->userAgent = $userAgent;
    }

    /**
     * @return \GuzzleHttp\Client
     */
    protected function getClient()
    {
        if (empty($this->client)) {
            $this->client = new \GuzzleHttp\Client();
        }
        return $this->client;
    }

    /**
     * @param mixed $client
     * @return $this
     */
    public function setClient(\GuzzleHttp\Client $client)
    {
        $this->client = $client;
        return $this;
    }
}