kiler129/TorrentGhost

View on GitHub
src/Aggregator/RssAggregator.php

Summary

Maintainability
B
5 hrs
Test Coverage
<?php
/*
 * This file is part of TorrentGhost project.
 * You are using it at your own risk and you are fully responsible
 *  for everything that code will do.
 *
 * (c) Grzegorz Zdanowski <grzegorz@noflash.pl>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace noFlash\TorrentGhost\Aggregator;


use GuzzleHttp\Psr7\Request;
use noFlash\TorrentGhost\Configuration\AggregatorAbstractConfiguration;
use noFlash\TorrentGhost\Configuration\RssAggregatorConfiguration;
use noFlash\TorrentGhost\Configuration\TorrentGhostConfiguration;
use noFlash\TorrentGhost\Http\FetchJob;
use noFlash\TorrentGhost\Util\Blacklist;
use Psr\Log\LoggerInterface;

/**
 * Fetches data from RSS feed.
 */
class RssAggregator extends AbstractAggregator
{
    /**
     * @inheritDoc
     */
    const TYPE = 'rss';

    /**
     * @var RssAggregatorConfiguration
     */
    protected $configuration;

    /**
     * @var int Unix timestamp of the last successful feed download (-1 if feed was never downloaded)
     */
    private $lastSuccessfulDownload = -1;

    /**
     * @var Blacklist
     */
    private $blacklist;

    /**
     * @var array[] Pool of collected entries, each in form of ['name' => ..., 'link' => ...].
     *              Initialized to empty array so getLinks()/count() are safe before first fetch.
     */
    private $links = [];

    /**
     * @inheritDoc
     *
     * @throws \RuntimeException When SimpleXML extension is not available.
     */
    public function __construct(
        TorrentGhostConfiguration $appConfiguration,
        AggregatorAbstractConfiguration $configuration,
        LoggerInterface $logger)
    {
        parent::__construct($appConfiguration, $configuration, $logger);

        if (!function_exists('simplexml_load_string')) {
            throw new \RuntimeException(
                'Your PHP interpreter was compiled without SimpleXML - you need it for RSS aggregator'
            );
        }

        // Disable external entities loading (XXE protection). The function is deprecated since PHP 8.0 (calling it
        // there raises E_DEPRECATED), so it is invoked on older interpreters only.
        if (PHP_VERSION_ID < 80000) {
            libxml_disable_entity_loader(true);
        }
        libxml_use_internal_errors(true); //Prevent printing XML errors as E_WARNING

        $this->blacklist = new Blacklist($logger);
    }

    /**
     * @inheritDoc
     *
     * Does nothing if configured interval did not pass since last successful download or if configuration is
     * invalid. Otherwise new data are fetched from origin.
     */
    public function ping()
    {
        if (time() - $this->lastSuccessfulDownload < $this->configuration->getInterval()) {
            $this->logger->debug($this->getName() . ' got ping but it is too early to do something, skipping');

            return;
        }

        if (!$this->configuration->isValid()) {
            $this->logger->warning($this->getName() . ' configuration is not valid - skipping');

            return;
        }

        $this->getNewDataFromOrigin();

        $this->logger->debug('Pool in ' . $this->getName() . ' now contains ' . count($this->links) . ' entries');
    }

    /**
     * @inheritDoc
     */
    public function getPingInterval()
    {
        return $this->configuration->getInterval();
    }

    /**
     * @inheritDoc
     *
     * @return array[] Entries collected so far, each in form of ['name' => ..., 'link' => ...].
     */
    public function getLinks()
    {
        return $this->links;
    }

    /**
     * @inheritDoc
     */
    public function flushLinksPool()
    {
        $this->links = [];
    }

    /**
     * @inheritDoc
     *
     * HTML entities (e.g. &amp;) are decoded before string is passed to parent implementation.
     */
    public function extractLink($string)
    {
        return parent::extractLink(htmlspecialchars_decode($string));
    }

    /**
     * Triggers downloading new data from origin and parsing it.
     * Normally this method is executed by ping(), but you may decide to call it manually at any time.
     *
     * Failures (download or parsing) are logged and leave links pool untouched.
     */
    public function getNewDataFromOrigin()
    {
        //TODO maybe add checksum to skip all operations if it's the same as previous?
        $url = $this->configuration->getUrl();

        $feed = $this->fetchFeed($url);
        if ($feed === false) {
            $this->logger->error($this->getName() . ': Failed to fetch feed from ' . $url);

            return;
        }

        $this->logger->debug('Downloaded RSS for ' . $url . ' (' . strlen($feed) . ' bytes)');
        // Download is considered successful even if parsing fails below - next attempt waits for full interval
        $this->lastSuccessfulDownload = time();

        $feed = $this->parseFeedXml($feed);
        if ($feed === false) {
            return;
        }

        // NOTE(review): presumably filters $feed in place (by reference) leaving unseen entries only - confirm
        // against Blacklist::applyBlacklist()
        $this->blacklist->applyBlacklist($feed);

        if (empty($feed)) {
            $this->logger->debug($this->getName() . ': No new links in feed');

            return;
        }

        foreach ($feed as $name => $link) {
            $this->links[] = ['name' => $name, 'link' => $link];
        }
    }

    /**
     * Downloads feed contents using FetchJob.
     *
     * @param string $url Address of RSS feed
     *
     * @return bool|string Returns false on error or downloaded RSS file content
     */
    private function fetchFeed($url)
    {
        $rssRequest = new Request('GET', $url);
        $fetchJob = new FetchJob($this->appConfiguration, $rssRequest);
        //TODO: Implement RSS size limit when FetchJob start supporting it

        try {
            if (!$fetchJob->execute()) {
                throw new \Exception('RSS download job failed for unknown reason');
            }

        } catch (\Exception $e) {
            $type = get_class($e);
            $this->logger->error($type . ': ' . $e->getMessage());

            return false;
        }

        $responseStream = $fetchJob->getResponseStream();
        fseek($responseStream, 0); //Stream pointer may be at the end after download

        return stream_get_contents($responseStream);
    }

    /**
     * Parses given XML and returns array with titles & processed links.
     *
     * Items are read from <channel><item> nodes. Item is skipped (with warning logged) if name or link tag is
     * missing, if name/link extraction pattern does not match (first capture group is used as value), if name was
     * already seen in the same feed or if link transformation fails.
     *
     * @param string $feed Valid RSS XML.
     *
     * @return array|false Key contains title, value contains processed link. It can also return bool false on error.
     *
     * @todo This method sucks...
     */
    public function parseFeedXml($feed)
    {
        // Internal errors buffer is never emptied by libxml itself - drop leftovers so the log contains errors
        // of this feed only and the buffer does not grow forever in long running process.
        libxml_clear_errors();

        $xml = simplexml_load_string($feed);
        if ($xml === false) {
            $this->logger->error('RSS XML parsing failed in ' . $this->getName(), libxml_get_errors());
            libxml_clear_errors();

            return false;
        }
        libxml_clear_errors(); //Recoverable problems are buffered too

        if (!isset($xml->channel->item)) {
            $this->logger->error($this->getName() . ' failed to extract items from XML');

            return false;
        }

        $nameTag = $this->configuration->getNameTagName();
        $linkTag = $this->configuration->getLinkTagName();
        $nameExtract = $this->configuration->getNameExtractPattern();
        $linkExtract = $this->configuration->getLinkExtractPattern();
        $linkTransform = $this->configuration->getLinkTransformPattern();

        $result = [];
        $counter = -1; //Incremented at the beginning of every iteration, so first item is item#0
        foreach ($xml->channel->item as $item) {
            // Counter has to advance for skipped items too, otherwise item# in logs points to wrong entry
            $counter++;

            if (!isset($item->{$nameTag})) {
                $this->logger->warning(
                    'Unable to locate name tag <' . $nameTag . '> in ' . $this->getName() . ' for item#' . $counter .
                    '. Skipping item.'
                );
                continue;
            }

            if (!isset($item->{$linkTag})) {
                $this->logger->warning(
                    'Unable to locate link tag <' . $linkTag . '> in ' . $this->getName() . ' for item#' . $counter .
                    '. Skipping item.'
                );
                continue;
            }

            // SimpleXMLElement nodes are converted explicitly instead of relying on implicit string coercion
            $rawName = (string)$item->{$nameTag};
            $rawLink = (string)$item->{$linkTag};

            $nameMatches = [];
            if (preg_match($nameExtract, $rawName, $nameMatches) < 1 || !isset($nameMatches[1])) {
                $this->logger->warning(
                    'Extracting name from "' . $rawName . '" failed in ' . $this->getName() . ' for item#' .
                    $counter . '. Skipping item.'
                );
                continue;
            }
            $name = $nameMatches[1];

            if (isset($result[$name])) {
                $this->logger->warning(
                    'Duplicate name "' . $rawName . '" in ' . $this->getName() . ' for item#' . $counter .
                    '. Skipping item.'
                );
                continue;
            }

            $linkMatches = [];
            if (preg_match($linkExtract, $rawLink, $linkMatches) < 1 || !isset($linkMatches[1])) {
                $this->logger->warning(
                    'Extracting link from "' . $rawLink . '" failed in ' . $this->getName() . ' for item#' .
                    $counter . '. Skipping item.'
                );
                continue;
            }
            $link = $linkMatches[1];

            if ($linkTransform !== null) {
                $transformedLink = preg_replace($linkTransform[0], $linkTransform[1], $link);
                if ($transformedLink === null) { //preg_replace() returns null on PCRE error
                    $this->logger->warning(
                        'Transforming link "' . $link . '" failed in ' . $this->getName() . ' for item#' .
                        $counter . '. Skipping item.'
                    );
                    continue;
                }

                $link = $transformedLink;
            }

            $result[$name] = $link;
        }

        return $result;
    }
}