luyadev/luya-module-crawler

View on GitHub
src/crawler/ResultHandler.php

Summary

Maintainability
B
4 hrs
Test Coverage
B
87%
<?php

namespace luya\crawler\crawler;

use luya\crawler\frontend\commands\CrawlController;
use luya\crawler\models\Builderindex;
use luya\crawler\models\Index;
use luya\crawler\models\Link;
use Nadar\Crawler\Crawler;
use Nadar\Crawler\Interfaces\HandlerInterface;
use Nadar\Crawler\Result;
use Yii;
use yii\base\Event;
use yii\console\Exception;
use yii\helpers\Console;

/**
 * Handle the Crawler Results
 *
 * Add new pages to the index and sync the production index from builder index on finish.
 *
 * @author Basil Suter <git@nadar.io>
 * @since 3.0.0
 */
class ResultHandler implements HandlerInterface
{
    const EVENT_BEFORE_PROCESS = 'beforeProcess';
    
    const EVENT_AFTER_INDEX = 'afterIndex';

    /**
     * @var CrawlController
     */
    protected $controller;

    /**
     * Constructor
     *
     * @param CrawlController $controller
     */
    public function __construct(CrawlController $controller)
    {
        $this->controller = $controller;
    }
    
    /**
    * {@inheritDoc}
    */
    public function onSetup(Crawler $crawler)
    {
        Builderindex::deleteAll();
    }

    /**
     * {@inheritDoc}
     */
    public function afterRun(Result $result)
    {
        $url = $result->url->getNormalized();
        
        $index = Builderindex::findOne(['url' => $url]);

        if (!$index) {
            $index = new Builderindex();
            $index->url = $url;
        }

        $content = $result->content;
        if (!empty($result->keywords)) {
            $content.= ' ' . $result->keywords;
        }
        
        $index->content = $content;
        $index->title = $result->title;
        $index->description = $result->description;
        $index->language_info = $result->language;
        $index->url_found_on_page = $result->refererUrl->getNormalized();
        $index->group = $result->group;

        if (!$index->save()) {
            return $this->controller->outputError($index->getErrors());
        }
        
        unset($index, $content);
        
        if ($this->controller->linkcheck && !$result->parserResult->ignore) {
            foreach ($result->parserResult->links as $linkUrl => $value) {
                Link::add($linkUrl, $value, $url, $this->controller->encode);
            }
        }
    }

    /**
    * {@inheritDoc}
    */
    public function onEnd(Crawler $crawler)
    {
        $transaction = Yii::$app->db->beginTransaction();
        try {
            $keepIndexIds = [];
            $currentTotal = (int) Index::find()->count();
            $total = (int) Builderindex::find()->count();

            if (!$this->controller->purging && ($currentTotal > 0 && $total == 0)) {
                throw new Exception("The old index contained {$currentTotal} while the new index is empty. Possible misconfiguration or error while crawling the website. The force an empty index us --purging=1");
            }

            $this->controller->module->trigger(self::EVENT_BEFORE_PROCESS, new Event());

            $i = 0;
            if ($this->controller->verbose) {
                Console::startProgress(0, $total, 'synchronize index: ', false);
            }
            foreach (Builderindex::find()->batch() as $batch) {
                foreach ($batch as $builderIndex) {
                    $index = Index::findOne(['url' => $builderIndex->url]);

                    if (!$index) {
                        $index = new Index();
                        $index->added_to_index = time();
                    }

                    $index->url = $builderIndex->url;
                    $index->title = $builderIndex->title;
                    $index->description = $builderIndex->description;
                    $index->content = $builderIndex->content;
                    $index->language_info = $builderIndex->language_info;
                    $index->last_update = time();
                    $index->url_found_on_page = $builderIndex->url_found_on_page;
                    $index->group = $builderIndex->group;
                    $index->save();

                    $keepIndexIds[] = $index->id;
                    unset($index, $builderIndex);
                    $i++;

                    if ($this->controller->verbose) {
                        Console::updateProgress($i, $total);
                    }
                }
            }

            Index::deleteAll(['not in', 'id', $keepIndexIds]);

            if ($this->controller->verbose) {
                Console::endProgress("done." . PHP_EOL);
            }

            $this->controller->module->trigger(self::EVENT_AFTER_INDEX, new Event());

            $transaction->commit();
            unset($batch);
        } catch (\Exception $e) {
            $transaction->rollBack();
            throw $e;
        } catch (\Throwable $e) {
            $transaction->rollBack();
            throw $e;
        }
    }
}