eliashaeussler/cache-warmup

View on GitHub
src/Command/CacheWarmupCommand.php

Summary

Maintainability
A
0 mins
Test Coverage
A
100%
<?php

declare(strict_types=1);

/*
 * This file is part of the Composer package "eliashaeussler/cache-warmup".
 *
 * Copyright (C) 2020-2024 Elias Häußler <elias@haeussler.dev>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

namespace EliasHaeussler\CacheWarmup\Command;

use EliasHaeussler\CacheWarmup\CacheWarmer;
use EliasHaeussler\CacheWarmup\Config;
use EliasHaeussler\CacheWarmup\Crawler;
use EliasHaeussler\CacheWarmup\Exception;
use EliasHaeussler\CacheWarmup\Formatter;
use EliasHaeussler\CacheWarmup\Helper;
use EliasHaeussler\CacheWarmup\Log;
use EliasHaeussler\CacheWarmup\Result;
use EliasHaeussler\CacheWarmup\Sitemap;
use EliasHaeussler\CacheWarmup\Time;
use GuzzleHttp\Client;
use GuzzleHttp\ClientInterface;
use Psr\Log\LogLevel;
use Symfony\Component\Console;
use Symfony\Component\Filesystem;

use function array_map;
use function array_unshift;
use function count;
use function getenv;
use function implode;
use function in_array;
use function is_string;
use function json_encode;
use function sleep;
use function sprintf;
use function strtolower;

/**
 * CacheWarmupCommand.
 *
 * @author Elias Häußler <elias@haeussler.dev>
 * @license GPL-3.0-or-later
 */
final class CacheWarmupCommand extends Console\Command\Command
{
    private const SUCCESSFUL = 0;
    private const FAILED = 1;

    private readonly Time\TimeTracker $timeTracker;
    private Config\CacheWarmupConfig $config;
    private Console\Style\SymfonyStyle $io;
    private Formatter\Formatter $formatter;
    private Crawler\CrawlerFactory $crawlerFactory;
    private bool $firstRun = true;

    /**
     * Creates the command under the name "cache-warmup".
     *
     * @param ClientInterface $client HTTP client handed over to the cache warmer
     */
    public function __construct(
        private readonly ClientInterface $client = new Client(),
    ) {
        $this->timeTracker = new Time\TimeTracker();

        parent::__construct('cache-warmup');
    }

    /**
     * Registers description, help text, the "sitemaps" argument and all
     * options of the command.
     *
     * Class names, formatter types, strategy names and log levels shown in
     * the help text are resolved from the respective classes and interpolated
     * into the heredoc below.
     */
    protected function configure(): void
    {
        $crawler = Crawler\Crawler::class;
        $configurableCrawler = Crawler\ConfigurableCrawler::class;
        $stoppableCrawler = Crawler\StoppableCrawler::class;
        $textFormatter = Formatter\TextFormatter::getType();
        $jsonFormatter = Formatter\JsonFormatter::getType();
        $sortByChangeFrequencyStrategy = Crawler\Strategy\SortByChangeFrequencyStrategy::getName();
        $sortByLastModificationDateStrategy = Crawler\Strategy\SortByLastModificationDateStrategy::getName();
        $sortByPriorityStrategy = Crawler\Strategy\SortByPriorityStrategy::getName();
        // Render all supported log levels as lower-cased bullet list
        $logLevels = implode(
            PHP_EOL,
            array_map(
                static fn (string $logLevel): string => '   * <comment>'.strtolower($logLevel).'</comment>',
                Log\LogLevel::getAll(),
            ),
        );

        $this->setDescription('Warms up caches of URLs provided by a given set of XML sitemaps.');
        // Note: The config file example uses the long option name, because
        // "--config" has no shortcut ("-c" belongs to "--crawler").
        $this->setHelp(<<<HELP
This command can be used to warm up website caches.
It requires a set of XML sitemaps offering several URLs which will be crawled.

<info>Sitemaps</info>
<info>========</info>
The list of sitemaps to be crawled can be defined as command argument:

   * <comment>%command.full_name% https://www.example.com/sitemap.xml</comment> (URL)
   * <comment>%command.full_name% /var/www/html/sitemap.xml</comment> (local file)

You are free to crawl as many sitemaps as you want.
Alternatively, sitemaps can be specified from user input when application is in interactive mode.

<info>Custom URLs</info>
<info>===========</info>
In addition or as an alternative to sitemaps, it's also possible to provide a given URL set using the <comment>--urls</comment> option:

   <comment>%command.full_name% -u https://www.example.com/foo -u https://www.example.com/baz</comment>

<info>Config file</info>
<info>===========</info>
All command parameters can be configured in an external config file.
Use the <comment>--config</comment> option to specify the config file:

   <comment>%command.full_name% --config cache-warmup.php</comment>

The following formats are currently supported:

   * <comment>json</comment>
   * <comment>php</comment>
   * <comment>yaml/yml</comment>

<info>Exclude patterns</info>
<info>================</info>
You can specify exclude patterns to be applied on URLs in order to ignore them from cache warming.
Use the <comment>--exclude</comment> (or <comment>-e</comment>) option to specify one or more patterns:

   <comment>%command.full_name% -e "*no_cache=1*" -e "*no_warming=1*"</comment>

You can also specify regular expressions as exclude patterns.
Note that each expression must start and end with a <comment>#</comment> symbol:

   <comment>%command.full_name% -e "#(no_cache|no_warming)=1#"</comment>

<info>Progress bar</info>
<info>============</info>
You can track the cache warmup progress by using the <comment>--progress</comment> option:

   <comment>%command.full_name% --progress</comment>

This shows a compact progress bar, including current warmup failures.
For a more verbose output, add the <comment>--verbose</comment> option:

   <comment>%command.full_name% --progress --verbose</comment>

<info>URL limit</info>
<info>=========</info>
The number of URLs to be crawled can be limited using the <comment>--limit</comment> option:

   <comment>%command.full_name% --limit 50</comment>

<info>Crawler</info>
<info>=======</info>
By default, cache warmup will be done using concurrent HEAD requests.
This behavior can be overridden in case a special crawler is defined using the <comment>--crawler</comment> option:

   <comment>%command.full_name% --crawler "Vendor\Crawler\MyCrawler"</comment>

It's up to you to ensure the given crawler class is available and fully loaded.
This can best be achieved by registering the class with Composer autoloader.
Also make sure the crawler implements <comment>{$crawler}</comment>.

<info>Crawler options</info>
<info>===============</info>
For crawlers implementing <comment>{$configurableCrawler}</comment>,
it is possible to pass a JSON-encoded array of crawler options by using the <comment>--crawler-options</comment> option:

   <comment>%command.full_name% --crawler-options '{"concurrency": 3}'</comment>

<info>Crawling strategy</info>
<info>=================</info>
URLs can be crawled using a specific crawling strategy, e.g. by sorting them by a specific property.
For this, use the <comment>--strategy</comment> option together with a predefined value:

   <comment>%command.full_name% --strategy {$sortByPriorityStrategy}</comment>

The following strategies are currently available:

   * <comment>{$sortByChangeFrequencyStrategy}</comment>
   * <comment>{$sortByLastModificationDateStrategy}</comment>
   * <comment>{$sortByPriorityStrategy}</comment>

<info>Allow failures</info>
<info>==============</info>
If a sitemap cannot be parsed or a URL fails to be crawled, this command normally exits
with a non-zero exit code. This is not always the desired behavior. Therefore, you can change
this behavior by using the <comment>--allow-failures</comment> option:

   <comment>%command.full_name% --allow-failures</comment>

<info>Stop on failure</info>
<info>===============</info>
For crawlers implementing <comment>{$stoppableCrawler}</comment>,
you can also configure the crawler to stop on failure. The <comment>--stop-on-failure</comment> option
exists for this case:

   <comment>%command.full_name% --stop-on-failure</comment>

<info>Format output</info>
<info>=============</info>
By default, all user-oriented output is printed as plain text to the console.
However, you can use other formatters by using the <comment>--format</comment> option:

   <comment>%command.full_name% --format json</comment>

Currently, the following formatters are available:

   * <comment>{$textFormatter}</comment> (default)
   * <comment>{$jsonFormatter}</comment>

<info>Logging</info>
<info>=======</info>
You can log the crawling results of each crawled URL to an external log file.
For this, the <comment>--log-file</comment> option exists:

   <comment>%command.full_name% --log-file crawling-errors.log</comment>

When logging is enabled, by default only crawling failures are logged.
You can increase the log level to log successful crawlings as well:

   * <comment>%command.full_name% --log-level error</comment> (default)
   * <comment>%command.full_name% --log-level info</comment>

The following log levels are currently available:

{$logLevels}

HELP);

        $this->addArgument(
            'sitemaps',
            Console\Input\InputArgument::OPTIONAL | Console\Input\InputArgument::IS_ARRAY,
            'URLs or local filenames of XML sitemaps to be used for cache warming',
        );
        $this->addOption(
            'urls',
            'u',
            Console\Input\InputOption::VALUE_REQUIRED | Console\Input\InputOption::VALUE_IS_ARRAY,
            'Custom additional URLs to be used for cache warming',
        );
        // No shortcut on purpose: "-c" is already taken by the "crawler" option
        $this->addOption(
            'config',
            null,
            Console\Input\InputOption::VALUE_REQUIRED,
            'Path to configuration file',
        );
        $this->addOption(
            'exclude',
            'e',
            Console\Input\InputOption::VALUE_REQUIRED | Console\Input\InputOption::VALUE_IS_ARRAY,
            'Patterns for URLs to be excluded from cache warming',
        );
        $this->addOption(
            'limit',
            'l',
            Console\Input\InputOption::VALUE_REQUIRED,
            'Limit the number of URLs to be processed',
            0,
        );
        $this->addOption(
            'progress',
            'p',
            Console\Input\InputOption::VALUE_NONE,
            'Show progress bar during cache warmup',
        );
        $this->addOption(
            'crawler',
            'c',
            Console\Input\InputOption::VALUE_REQUIRED,
            'FQCN of the crawler to be used for cache warming',
        );
        $this->addOption(
            'crawler-options',
            'o',
            Console\Input\InputOption::VALUE_REQUIRED,
            'Additional config for configurable crawlers',
        );
        $this->addOption(
            'strategy',
            's',
            Console\Input\InputOption::VALUE_REQUIRED,
            'Optional strategy to prepare URLs before crawling them',
        );
        $this->addOption(
            'allow-failures',
            null,
            Console\Input\InputOption::VALUE_NONE,
            'Allow failures during URL crawling and exit with zero',
        );
        $this->addOption(
            'stop-on-failure',
            null,
            Console\Input\InputOption::VALUE_NONE,
            'Cancel further cache warmup requests on failure',
        );
        $this->addOption(
            'format',
            'f',
            Console\Input\InputOption::VALUE_REQUIRED,
            'Formatter used to print the cache warmup result',
            Formatter\TextFormatter::getType(),
        );
        $this->addOption(
            'log-file',
            null,
            Console\Input\InputOption::VALUE_REQUIRED,
            'File where to log crawling results',
        );
        $this->addOption(
            'log-level',
            null,
            Console\Input\InputOption::VALUE_REQUIRED,
            'Log level used to determine which crawling results to log (see help for more information)',
            LogLevel::ERROR,
        );
        $this->addOption(
            'repeat-after',
            null,
            Console\Input\InputOption::VALUE_REQUIRED,
            'Run cache warmup in endless loop and repeat x seconds after each run',
            0,
        );
    }

    /**
     * Prepares config, console style, formatter and crawler factory.
     *
     * The config is composed from the following adapters, in this order:
     * config file from the `--config` option (if given), config file from
     * the `CACHE_WARMUP_CONFIG` environment variable (if set), console input
     * and environment variables.
     *
     * @throws Exception\ConfigFileIsNotSupported
     * @throws Exception\FormatterIsNotSupported
     * @throws Exception\LogLevelIsNotSupported
     */
    protected function initialize(Console\Input\InputInterface $input, Console\Output\OutputInterface $output): void
    {
        $fileFromOption = $input->getOption('config');
        $fileFromEnvironment = getenv('CACHE_WARMUP_CONFIG');
        $inputAdapter = new Config\Adapter\ConsoleInputConfigAdapter($input);
        $environmentAdapter = new Config\Adapter\EnvironmentVariablesConfigAdapter();

        // Collect file-based adapters; the file from the command option is
        // placed in front of the file from the environment variable
        $fileAdapters = [];
        if (false !== $fileFromEnvironment) {
            $fileAdapters[] = $this->loadConfigFromFile($fileFromEnvironment);
        }
        if (null !== $fileFromOption) {
            $fileAdapters = [$this->loadConfigFromFile($fileFromOption), ...$fileAdapters];
        }

        $compositeAdapter = new Config\Adapter\CompositeConfigAdapter(
            [...$fileAdapters, $inputAdapter, $environmentAdapter],
        );

        $this->config = $compositeAdapter->get();
        $this->io = new Console\Style\SymfonyStyle($input, $output);

        $formatterFactory = new Formatter\FormatterFactory($this->io);
        $this->formatter = $formatterFactory->get($this->config->getFormat());

        $logFile = $this->config->getLogFile();
        $logLevel = $this->config->getLogLevel();
        $stopOnFailure = $this->config->shouldStopOnFailure();

        // A logger is only created if a log file is configured
        $logger = is_string($logFile) ? new Log\FileLogger($logFile) : null;

        // Reject unknown log levels
        $supportedLogLevels = Log\LogLevel::getAll();
        if (!in_array($logLevel, $supportedLogLevels, true)) {
            throw new Exception\LogLevelIsNotSupported($logLevel);
        }

        // Non-verbose formatters must not share the regular output with the
        // crawler: use the error output (and enable the progress bar) if
        // available, otherwise discard crawler output completely
        if ($this->formatter->isVerbose()) {
            $crawlerOutput = $output;
        } elseif ($output instanceof Console\Output\ConsoleOutputInterface) {
            $crawlerOutput = $output->getErrorOutput();

            $this->config->enableProgressBar();
        } else {
            $crawlerOutput = new Console\Output\NullOutput();
        }

        $this->crawlerFactory = new Crawler\CrawlerFactory($crawlerOutput, $logger, $logLevel, $stopOnFailure);
    }

    /**
     * Asks for sitemaps interactively if neither sitemaps nor URLs are configured.
     *
     * Questions are repeated until an answer does not resolve to a sitemap.
     * All sitemaps entered so far are then written to the config.
     *
     * @throws Console\Exception\RuntimeException if no sitemap was entered
     */
    protected function interact(Console\Input\InputInterface $input, Console\Output\OutputInterface $output): void
    {
        // Nothing to ask for if sitemaps or URLs are already specified
        $nothingConfigured = [] === $this->config->getSitemaps() && [] === $this->config->getUrls();
        if (!$nothingConfigured) {
            return;
        }

        $collectedSitemaps = [];
        $questionHelper = $this->getHelper('question');

        // Keep asking until the answer is no longer a sitemap
        while (true) {
            $prompt = new Console\Question\Question('Please enter the URL of a XML sitemap: ');
            $prompt->setValidator($this->validateSitemap(...));

            $answer = $questionHelper->ask($input, $output, $prompt);

            if (!($answer instanceof Sitemap\Sitemap)) {
                break;
            }

            $collectedSitemaps[] = $answer;
            $output->writeln(sprintf('<info>Sitemap added: %s</info>', $answer));
        }

        // At least one sitemap is required
        if ([] === $collectedSitemaps) {
            throw new Console\Exception\RuntimeException('You must enter at least one sitemap URL.', 1604258903);
        }

        $this->config->setSitemaps($collectedSitemaps);
    }

    /**
     * Runs the cache warmup and prints parser and crawling results using the
     * configured formatter.
     *
     * If a repeat interval greater than zero is configured, the warmup is
     * repeated after sleeping for that interval until a run fails while
     * failures are not allowed. Repetition is implemented as a loop instead
     * of a recursive call, because each recursion would add another stack
     * frame and an endless run would eventually exhaust memory or hit the
     * call stack limit.
     *
     * @return int `self::FAILED` if a sitemap could not be parsed or crawling
     *             was not successful while failures are not allowed,
     *             `self::SUCCESSFUL` otherwise
     *
     * @throws Exception\ConfigFileIsNotSupported
     * @throws Console\Exception\RuntimeException if neither sitemaps nor URLs are defined
     */
    protected function execute(Console\Input\InputInterface $input, Console\Output\OutputInterface $output): int
    {
        while (true) {
            $sitemaps = $this->config->getSitemaps();
            $urls = $this->config->getUrls();
            $repeatAfter = $this->config->getRepeatAfter();

            // Throw exception if neither sitemaps nor URLs are defined
            if ([] === $sitemaps && [] === $urls) {
                throw new Console\Exception\RuntimeException('Neither sitemaps nor URLs are defined.', 1604261236);
            }

            // Show header
            if ($this->formatter->isVerbose()) {
                $this->printHeader();
            }

            // Show warning on endless runs (only once, on first run)
            if ($this->firstRun && $repeatAfter > 0) {
                $this->showEndlessModeWarning($repeatAfter);
                $this->firstRun = false;
            }

            // Initialize components
            $crawler = $this->initializeCrawler();
            $cacheWarmer = $this->timeTracker->track(fn () => $this->initializeCacheWarmer($crawler));
            // Remember parse duration before the tracker is reused for crawling
            $parseTime = $this->timeTracker->getLastDuration();

            // Start crawling
            $result = $this->timeTracker->track(
                fn () => $this->runCacheWarmup(
                    $cacheWarmer,
                    $crawler instanceof Crawler\VerboseCrawler,
                ),
            );

            // Print formatted parser result
            $this->formatter->formatParserResult(
                new Result\ParserResult($cacheWarmer->getSitemaps(), $cacheWarmer->getUrls()),
                new Result\ParserResult($cacheWarmer->getFailedSitemaps()),
                new Result\ParserResult($cacheWarmer->getExcludedSitemaps(), $cacheWarmer->getExcludedUrls()),
                $parseTime,
            );

            // Print formatted cache warmup result
            $this->formatter->formatCacheWarmupResult($result, $this->timeTracker->getLastDuration());

            // Early return if parsing or crawling failed
            if (!$this->config->areFailuresAllowed()
                && ([] !== $cacheWarmer->getFailedSitemaps() || !$result->isSuccessful())
            ) {
                return self::FAILED;
            }

            // Finish unless endless mode is enabled
            if ($repeatAfter <= 0) {
                return self::SUCCESSFUL;
            }

            // Wait for the configured interval, then start the next run
            sleep($repeatAfter);
        }
    }

    /**
     * Runs the given cache warmer and reports progress if the formatter is verbose.
     *
     * @param CacheWarmer $cacheWarmer      Cache warmer to be run
     * @param bool        $isVerboseCrawler Whether the used crawler is a verbose crawler; if so, the
     *                                      initial message is terminated with a newline and no final
     *                                      status is written by this method
     *
     * @return Result\CacheWarmupResult Result returned by the cache warmer
     */
    private function runCacheWarmup(CacheWarmer $cacheWarmer, bool $isVerboseCrawler): Result\CacheWarmupResult
    {
        $pluralSuffix = 1 === count($cacheWarmer->getUrls()) ? '' : 's';

        if ($this->formatter->isVerbose()) {
            $message = sprintf('Crawling URL%s... ', $pluralSuffix);
            $this->io->write($message, $isVerboseCrawler);
        }

        $warmupResult = $cacheWarmer->run();

        // No final status for non-verbose formatters or verbose crawlers
        if (!$this->formatter->isVerbose() || $isVerboseCrawler) {
            return $warmupResult;
        }

        $status = $warmupResult->wasCancelled()
            ? '<comment>Cancelled</comment>'
            : '<info>Done</info>'
        ;
        $this->io->writeln($status);

        return $warmupResult;
    }

    /**
     * Creates a cache warmer from the current config and feeds it with all
     * configured sitemaps and URLs.
     *
     * @param Crawler\Crawler $crawler Crawler to be used by the cache warmer
     *
     * @throws Console\Exception\RuntimeException if the configured strategy name is unknown
     */
    private function initializeCacheWarmer(Crawler\Crawler $crawler): CacheWarmer
    {
        if ($this->formatter->isVerbose()) {
            $this->io->write('Parsing sitemaps... ');
        }

        // Resolve strategy names to strategy objects, pass everything else through
        $configuredStrategy = $this->config->getStrategy();
        $strategy = is_string($configuredStrategy)
            ? match ($configuredStrategy) {
                Crawler\Strategy\SortByChangeFrequencyStrategy::getName() => new Crawler\Strategy\SortByChangeFrequencyStrategy(),
                Crawler\Strategy\SortByLastModificationDateStrategy::getName() => new Crawler\Strategy\SortByLastModificationDateStrategy(),
                Crawler\Strategy\SortByPriorityStrategy::getName() => new Crawler\Strategy\SortByPriorityStrategy(),
                default => throw new Console\Exception\RuntimeException('The given crawling strategy is invalid.', 1677618007),
            }
            : $configuredStrategy
        ;

        // Create cache warmer; the fifth argument is the negated "allow failures" setting
        $limit = $this->config->getLimit();
        $failuresNotAllowed = !$this->config->areFailuresAllowed();
        $excludePatterns = $this->config->getExcludePatterns();
        $warmer = new CacheWarmer($limit, $this->client, $crawler, $strategy, $failuresNotAllowed, $excludePatterns);

        // Register XML sitemaps first, followed by custom URLs
        $warmer->addSitemaps($this->config->getSitemaps());
        foreach ($this->config->getUrls() as $customUrl) {
            $warmer->addUrl($customUrl);
        }

        if ($this->formatter->isVerbose()) {
            $this->io->writeln('<info>Done</info>');
        }

        return $warmer;
    }

    /**
     * Resolves the crawler to be used for cache warmup.
     *
     * If no crawler is configured, a default crawler is selected depending on
     * whether the progress bar is enabled. Warnings are logged if crawler
     * options or "stop on failure" are configured for a crawler that does not
     * implement the respective interface.
     */
    private function initializeCrawler(): Crawler\Crawler
    {
        $crawler = $this->config->getCrawler();
        $options = $this->crawlerFactory->parseCrawlerOptions($this->config->getCrawlerOptions());
        $stopOnFailure = $this->config->shouldStopOnFailure();

        // Fall back to default crawler
        $crawler ??= $this->config->isProgressBarEnabled()
            ? Crawler\OutputtingCrawler::class
            : Crawler\ConcurrentCrawler::class
        ;

        // Class names are turned into crawler objects by the factory
        if (is_string($crawler)) {
            $crawler = $this->crawlerFactory->get($crawler, $options);
        }

        $hasOptions = [] !== $options;

        if (!($crawler instanceof Crawler\ConfigurableCrawler)) {
            // Options are useless for crawlers which cannot be configured
            if ($hasOptions) {
                $this->formatter->logMessage(
                    'You passed crawler options to a non-configurable crawler.',
                    Formatter\MessageSeverity::Warning,
                );
            }
        } elseif ($this->formatter->isVerbose() && $this->io->isVerbose() && $hasOptions) {
            // Print crawler options in verbose mode
            $encodedOptions = (string) json_encode($options, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);

            $this->io->section('Using custom crawler options:');
            $this->io->writeln($encodedOptions);
            $this->io->newLine();
        }

        // "Stop on failure" only works with stoppable crawlers
        $isStoppable = $crawler instanceof Crawler\StoppableCrawler;
        if ($stopOnFailure && !$isStoppable) {
            $this->formatter->logMessage(
                'You configured "stop on failure" for a non-stoppable crawler.',
                Formatter\MessageSeverity::Warning,
            );
        }

        return $crawler;
    }

    /**
     * Creates the config adapter matching the extension of the given config file.
     *
     * Files ending with "php" are handled by the PHP config adapter, files
     * ending with "json", "yaml" or "yml" by the generic file config adapter.
     *
     * @throws Exception\ConfigFileIsNotSupported if the file extension matches none of the above
     */
    private function loadConfigFromFile(string $configFile): Config\Adapter\ConfigAdapter
    {
        $resolvedFile = Helper\FilesystemHelper::resolveRelativePath($configFile);
        // Second argument requests a lower-cased extension
        $fileExtension = Filesystem\Path::getExtension($resolvedFile, true);

        if ('php' === $fileExtension) {
            return new Config\Adapter\PhpConfigAdapter($resolvedFile);
        }

        if (in_array($fileExtension, ['json', 'yaml', 'yml'], true)) {
            return new Config\Adapter\FileConfigAdapter($resolvedFile);
        }

        throw new Exception\ConfigFileIsNotSupported($resolvedFile);
    }

    /**
     * Logs a warning which announces that the command is repeated endlessly.
     *
     * @param int $interval Number of seconds between two runs
     */
    private function showEndlessModeWarning(int $interval): void
    {
        // Singular form is used for exactly one second only
        $pluralSuffix = 1 === $interval ? '' : 's';
        $warning = sprintf(
            'Command is scheduled to run forever. It will be repeated %d second%s after each run.',
            $interval,
            $pluralSuffix,
        );

        $this->formatter->logMessage($warning, Formatter\MessageSeverity::Warning);
    }

    /**
     * Converts the given user input to a sitemap.
     *
     * @param string|null $input User input, `null` is passed through unchanged
     *
     * @return Sitemap\Sitemap|null Sitemap created from the given string or `null` if no input is given
     */
    private function validateSitemap(?string $input): ?Sitemap\Sitemap
    {
        return null === $input
            ? null
            : Sitemap\Sitemap::createFromString($input)
        ;
    }

    /**
     * Prints the application header, including the current version if it is known.
     */
    private function printHeader(): void
    {
        $version = Helper\VersionHelper::getCurrentVersion();

        // Version is omitted from the header if it cannot be determined
        $versionSuffix = '';
        if (null !== $version) {
            $versionSuffix = ' <comment>'.$version.'</comment>';
        }

        $header = sprintf(
            'Running <info>cache warmup</info>%s by Elias Häußler and contributors.',
            $versionSuffix,
        );

        $this->io->writeln($header);
    }
}