suitmedia/suitcoda

View on GitHub
app/Supports/CrawlerUrl.php

Summary

Maintainability
C
1 day
Test Coverage
<?php

namespace Suitcoda\Supports;

use GuzzleHttp\Client;
use GuzzleHttp\HandlerStack;
use Sabre\Uri;
use Suitcoda\Exceptions\HtmlContentTypeException;
use Suitcoda\Supports\EffectiveUrlMiddleware;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Crawls a single website starting from a base url.
 *
 * While crawling it collects the html pages, stylesheets and scripts that
 * belong to the same site, together with the links that turned out to be
 * broken or that did not end up on a new internal html page.
 */
class CrawlerUrl
{
    /**
     * Stores the base url of current crawling session
     *
     * @var string
     */
    protected $baseUrl;

    /**
     * Guzzle Http Client
     *
     * @var Client
     */
    protected $client;

    /**
     * Symfony DOM Crawler
     *
     * @var Crawler
     */
    protected $crawler;

    /**
     * Maximum attempt to retry on any unsuccessful request
     *
     * @var int
     */
    protected $retry = 3;

    /**
     * List of Broken links
     *
     * @var array
     */
    protected $siteBrokenLink = [];

    /**
     * List of Css (each entry holds 'type', 'url' and 'bodyContent')
     *
     * @var array
     */
    protected $siteCss = [];

    /**
     * List of Javascript (each entry holds 'type', 'url' and 'bodyContent')
     *
     * @var array
     */
    protected $siteJs = [];

    /**
     * List of redirected url
     *
     * @var array
     */
    protected $siteRedirectLink = [];

    /**
     * List of html url (each entry is the info array built in handleResponse)
     *
     * @var array
     */
    protected $siteUrl = [];

    /**
     * List of unvisited url
     *
     * @var array
     */
    protected $unvisitedUrl = [];

    /**
     * Class constructor
     *
     * @param Client  $client  [Guzzle HTTP Client]
     * @param Crawler $crawler [Symfony DOM Crawler]
     */
    public function __construct(Client $client, Crawler $crawler)
    {
        $this->client = $client;
        $this->crawler = $crawler;
    }

    /**
     * Get list of url from the web
     *
     * The stored list is sorted in place with PHP's default sort() before
     * it is returned.
     *
     * @return array
     */
    public function getSiteUrl()
    {
        sort($this->siteUrl);
        return $this->siteUrl;
    }

    /**
     * Get list of css from the web
     *
     * @return array
     */
    public function getSiteCss()
    {
        return $this->siteCss;
    }

    /**
     * Get list of js from the web
     *
     * @return array
     */
    public function getSiteJs()
    {
        return $this->siteJs;
    }

    /**
     * Get list of broken links
     *
     * @return array
     */
    public function getSiteBrokenLink()
    {
        return $this->siteBrokenLink;
    }

    /**
     * Get list of redirected url
     *
     * @return array
     */
    public function getSiteRedirectUrl()
    {
        return $this->siteRedirectLink;
    }

    /**
     * Get list of unvisited url
     *
     * @return array
     */
    public function getUnvisitedUrl()
    {
        return $this->unvisitedUrl;
    }

    /**
     * Set website url for crawling
     *
     * "http://" is prepended when the given url does not start with an
     * http(s) scheme. The check looks at the prefix only, so a host such as
     * "httpbin.org" still receives a scheme.
     *
     * @param string $baseUrl []
     *
     * @return  void
     */
    public function setBaseUrl($baseUrl)
    {
        if (!preg_match('@^https?://@i', $baseUrl)) {
            $baseUrl = 'http://' . $baseUrl;
        }
        $this->baseUrl = Uri\normalize($baseUrl);
    }

    /**
     * Get main url of website
     *
     * @return string
     */
    public function getBaseUrl()
    {
        return $this->baseUrl;
    }

    /**
     * Check url is crawlable or not
     *
     * Empty values, in-page anchors, "void(0);" and links using a scheme that
     * cannot be requested over HTTP (javascript:, mailto:, tel:, data:) are
     * rejected.
     *
     * @param  string $url []
     *
     * @return bool
     */
    public function checkIfCrawlable($url)
    {
        if (empty($url)) {
            return false;
        }
        $patterns = [
            // Scheme names are case-insensitive, hence the "i" modifier.
            '@^(javascript|mailto|tel|data)\:@i',
            '@^#.*@',
            '@^void\(0\);$@',
        ];
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $url)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check url is external url or not
     *
     * A url is internal when it starts with "http://" or "https://" followed
     * by the base url (without its own scheme).
     *
     * @param  string $url []
     *
     * @return bool
     */
    public function checkIfExternal($url)
    {
        $baseUrlTrimmed = preg_replace('@^https?://@i', '', (string) $this->baseUrl);

        // The base url is quoted so that "." only matches a literal dot, and
        // the pattern is anchored so that the base url appearing later in a
        // foreign url (e.g. inside a query string) does not count as a match.
        $pattern = '@^https?://' . preg_quote($baseUrlTrimmed, '@') . '@';

        return !preg_match($pattern, $url);
    }

    /**
     * Checking a url in the array of site that crawl or not
     *
     * @param  string $url []
     * @param  array $siteLink [list of entries carrying a 'url' key]
     *
     * @return bool [true when the url is in none of the lists]
     */
    public function checkNotInList($url, &$siteLink)
    {
        return !in_array($url, array_column($siteLink, 'url'), true)
            && !in_array($url, $this->siteBrokenLink, true)
            && !in_array($url, $this->siteRedirectLink, true);
    }

    /**
     * Function to trigger start crawling a website
     *
     * The base url is queued first; every crawled page may append further
     * urls to the queue, which is processed until it is empty.
     *
     * @return void
     */
    public function start()
    {
        $this->unvisitedUrl[] = $this->baseUrl;

        while (!empty($this->unvisitedUrl)) {
            $url = array_shift($this->unvisitedUrl);
            if (!$url) {
                // Nothing to request (e.g. the base url was never set).
                continue;
            }
            $this->crawl($url);
        }
    }

    /**
     * Function to filter a url base on html tag
     *
     * Requests the url and, when a response is returned by doRequest,
     * collects the stylesheets, scripts and anchors found in the document.
     *
     * @param  string $url []
     *
     * @return  void
     */
    protected function crawl($url)
    {
        if (!is_null($this->doRequest($url))) {
            $listCss = $this->crawler->filterXPath('//*[@rel="stylesheet"]')->extract('href');
            $this->getAllLink($url, $listCss, $this->siteCss);

            $listJs = $this->crawler->filter('script')->extract('src');
            $this->getAllLink($url, $listJs, $this->siteJs);

            $listUrl = $this->crawler->filter('a')->extract('href');
            $this->getAllLink($url, $listUrl, $this->siteUrl, 1);
        }
        // Reset the crawler so the next page starts from an empty document.
        $this->crawler->clear();
    }

    /**
     * Function to get all link url, css and js from a url
     *
     * @param  string $currentUrl [url of the page the links were found on]
     * @param  array  $lists      [raw attribute values extracted from the page]
     * @param  array  $siteLink   [list that receives downloaded css/js entries]
     * @param  int    $recursive  [truthy to queue the links for crawling]
     *
     * @return void
     */
    protected function getAllLink($currentUrl, $lists, &$siteLink, $recursive = 0)
    {
        foreach ($lists as $link) {
            if (!$link || !$this->checkIfCrawlable($link)) {
                continue;
            }

            // Runs of "../" are collapsed into a single "/" before the link
            // is resolved against the current page.
            $link = $this->encodeUrl(preg_replace('/(\.\.\/)+/', '/', $link));
            $link = Uri\resolve($currentUrl, $link);
            if ($this->checkIfExternal($link) || !$this->checkNotInList($link, $siteLink)) {
                continue;
            }

            $extension = $recursive ? null : $this->getExtension($link);
            if ($extension) {
                $this->fetchAsset($link, $extension, $siteLink);
            } elseif (!in_array($link, $this->unvisitedUrl, true)) {
                $this->unvisitedUrl[] = $link;
            }
        }
    }

    /**
     * Download a css/js asset and store it in the given list
     *
     * A failing request is recorded as a broken link instead of letting the
     * exception abort the whole crawl.
     *
     * @param  string $url      []
     * @param  string $type     [extension returned by getExtension]
     * @param  array  $siteLink [list that receives the asset entry]
     *
     * @return void
     */
    protected function fetchAsset($url, $type, &$siteLink)
    {
        try {
            $response = $this->client->get($url);
        } catch (\Exception $e) {
            $this->siteBrokenLink[] = $url;
            return;
        }

        $siteLink[] = [
            'type' => $type,
            'url' => $url,
            'bodyContent' => gzdeflate($response->getBody()->getContents()),
        ];
    }

    /**
     * Function to checking and retrying a url
     *
     * The request is repeated while it throws, until the retry budget is
     * used up. An exception caused by HtmlContentTypeException (non-html
     * content) is not retried.
     *
     * @param  string   $url []
     * @param  int|null $try [remaining retries, defaults to $this->retry]
     *
     * @return \Psr\Http\Message\ResponseInterface|null [null unless a new
     *                                                   internal page was stored]
     */
    public function doRequest($url, $try = null)
    {
        $attemptsLeft = is_null($try) ? $this->retry : $try;

        while ($attemptsLeft >= 0) {
            try {
                return $this->handleResponse($url, $this->getEffectiveUrl($url));
            } catch (\Exception $e) {
                if ($e->getPrevious() instanceof HtmlContentTypeException) {
                    return null;
                }
                $attemptsLeft--;
            }
        }
        return null;
    }

    /**
     * Classify a response and store the page when it is a new internal one
     *
     * - status 200, internal and not yet listed: the page is loaded into the
     *   DOM crawler, appended to the site url list and the response returned
     * - status >= 400: the effective url is recorded as broken
     * - anything else: the requested url is recorded as redirected
     *
     * @param  string                              $url      [requested url]
     * @param  \Psr\Http\Message\ResponseInterface $response []
     *
     * @return \Psr\Http\Message\ResponseInterface|null
     */
    protected function handleResponse($url, $response)
    {
        $effectiveUrl = $response->getHeaderLine('X-GUZZLE-EFFECTIVE-URL');
        $statusCode = $response->getStatusCode();

        if ($statusCode === 200 &&
            !$this->checkIfExternal($effectiveUrl) &&
            $this->checkNotInList($effectiveUrl, $this->siteUrl)) {
            $bodyContent = $response->getBody()->getContents();
            $this->crawler->addHtmlContent($bodyContent);

            $urlInfo = [
                'type' => 'url',
                'url' => $effectiveUrl,
                'title' => '',
                'titleTag' => '',
                'desc' => '',
                'descTag' => '',
                'bodyContent' => gzdeflate($bodyContent),
                'depth' => $this->getUrlDepth($effectiveUrl),
            ];

            $head = $this->crawler->filterXPath('//head');
            // html() is only called when a <head> node was actually matched.
            $headTag = $head->count() ? $head->html() : '';
            if (!empty($headTag)) {
                $titleTag = $this->getTitleTag($headTag);
                $descTag = $this->getDescTag($headTag);
                $urlInfo['title'] = $titleTag[1];
                $urlInfo['titleTag'] = $titleTag[0];
                $urlInfo['desc'] = $descTag[1];
                $urlInfo['descTag'] = $descTag[0];
            }

            $this->siteUrl[] = $urlInfo;
            return $response;
        }

        if ($statusCode >= 400) {
            $this->siteBrokenLink[] = $effectiveUrl;
            return null;
        }

        $this->siteRedirectLink[] = $url;
        return null;
    }

    /**
     * Get the last redirect from a url
     *
     * The request runs with the EffectiveUrlMiddleware pushed on the handler
     * stack and with http errors disabled. As soon as the headers arrive, a
     * Content-Type that does not contain "text/html" raises
     * HtmlContentTypeException.
     *
     * @param  string $url []
     *
     * @return \Psr\Http\Message\ResponseInterface
     */
    public function getEffectiveUrl($url)
    {
        $stack = HandlerStack::create();
        $stack->push(EffectiveUrlMiddleware::middleware());

        return $this->client->get($url, [
            'handler' => $stack,
            'http_errors' => false,
            'on_headers' => function (\Psr\Http\Message\ResponseInterface $response) {
                if (strpos($response->getHeaderLine('Content-Type'), 'text/html') === false) {
                    throw new HtmlContentTypeException;
                }
            },
        ]);
    }

    /**
     * Get the title tag and content of tag
     *
     * @param  string $html []
     *
     * @return array [0 => whole title tag, 1 => trimmed title text; two empty
     *                strings when no title tag is found]
     */
    public function getTitleTag($html)
    {
        // Tag names are matched case-insensitively.
        if (!preg_match('/<title[^>]*>([\s\S]*?)<\/title>/i', $html, $matches)) {
            return ['', ''];
        }
        return [$matches[0], trim($matches[1])];
    }

    /**
     * Get the meta tag description and content of tag
     *
     * Only tags written with double quotes and with the name attribute placed
     * before the content attribute are recognised.
     *
     * @param  string $html []
     *
     * @return array [0 => whole meta tag, 1 => trimmed description; two empty
     *                strings when no description tag is found]
     */
    public function getDescTag($html)
    {
        $pattern = '/<meta name="description"(.*?)content="(.*?)"(.*?\/*)>/i';
        if (!preg_match($pattern, $html, $matches)) {
            return ['', ''];
        }
        return [$matches[0], trim($matches[2])];
    }

    /**
     * Get extension of the given url
     *
     * Only the path component is inspected, so a query string such as
     * "?v=1.2" cannot hide the extension, and the extension has to match
     * exactly ("json" or "scss" are not reported as "js" / "css").
     *
     * @param  string $url [A valid URL address]
     *
     * @return string|null [Extension ("css" or "js"), null for anything else]
     */
    public function getExtension($url)
    {
        $path = parse_url($url, PHP_URL_PATH);
        if (!is_string($path)) {
            return null;
        }

        $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));

        return in_array($extension, ['css', 'js'], true) ? $extension : null;
    }

    /**
     * Get depth of given url
     *
     * The depth is the number of "/"-separated segments of the url path;
     * a missing, unparsable or slash-only path has depth 0.
     *
     * @param  string $url []
     * @return int
     */
    public function getUrlDepth($url)
    {
        $urlPath = parse_url($url, PHP_URL_PATH);
        if (!is_string($urlPath)) {
            // parse_url gives null without a path and false for malformed urls.
            return 0;
        }

        $trimmedPath = trim($urlPath, '/');
        if ($trimmedPath === '') {
            return 0;
        }
        return count(explode('/', $trimmedPath));
    }

    /**
     * Percent-encode the path segments of a url
     *
     * The scheme and authority ("http://host:port"), the query string and the
     * fragment are passed through untouched. Every path segment is decoded
     * before it is encoded again, so segments that are already encoded are
     * not encoded twice. Urls without a hierarchy (scheme not followed by
     * "//", e.g. "mailto:") are returned as they are.
     *
     * @param  string $url []
     * @return string
     */
    public function encodeUrl($url)
    {
        // Opaque urls have no path segments to encode.
        if (preg_match('@^[a-z][a-z0-9+.\-]*:(?!//)@i', $url)) {
            return $url;
        }

        // Keep "[scheme:]//authority" verbatim; encoding it would turn the
        // ":" of a port into "%3A".
        $authority = '';
        $path = $url;
        if (preg_match('@^(?:[a-z][a-z0-9+.\-]*:)?//[^/?#]*@i', $url, $match)) {
            $authority = $match[0];
            $path = (string) substr($url, strlen($authority));
        }

        // Cut off the fragment first, then the query string.
        $suffix = '';
        foreach (['#', '?'] as $delimiter) {
            $position = strpos($path, $delimiter);
            if ($position !== false) {
                $suffix = substr($path, $position) . $suffix;
                $path = (string) substr($path, 0, $position);
            }
        }

        // rawurlencode writes a space as "%20"; "+" only means space inside
        // form-encoded query strings, not inside a path.
        $segments = array_map(function ($segment) {
            return rawurlencode(rawurldecode($segment));
        }, explode('/', $path));

        return $authority . implode('/', $segments) . $suffix;
    }
}