BookStackApp/BookStack

View on GitHub
app/Search/SearchIndex.php

Summary

Maintainability
A
0 mins
Test Coverage
<?php

namespace BookStack\Search;

use BookStack\Activity\Models\Tag;
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
use BookStack\Util\HtmlDocument;
use DOMNode;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Support\Collection;

class SearchIndex
{
    /**
     * A list of delimiter characters used to break-up parsed content into terms for indexing.
     */
    public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";

    public function __construct(
        protected EntityProvider $entityProvider
    ) {
    }

    /**
     * Index the given entity.
     * Any search terms already stored for the entity are removed first,
     * then freshly generated terms are inserted.
     */
    public function indexEntity(Entity $entity): void
    {
        $this->deleteEntityTerms($entity);
        $this->insertTerms($this->entityToTermDataArray($entity));
    }

    /**
     * Index multiple Entities at once.
     * Existing terms for the given entities are NOT removed by this method;
     * it only generates and inserts new term rows.
     *
     * @param Entity[] $entities
     */
    public function indexEntities(array $entities): void
    {
        $terms = [];
        foreach ($entities as $entity) {
            array_push($terms, ...$this->entityToTermDataArray($entity));
        }

        $this->insertTerms($terms);
    }

    /**
     * Delete and re-index the terms for all entities in the system.
     * Can take a callback which is used for reporting progress.
     * Callback receives three arguments:
     * - An instance of the model being processed
     * - The number that have been processed so far.
     * - The total number of that model to be processed.
     *
     * @param callable(Entity, int, int):void|null $progressCallback
     */
    public function indexAllEntities(?callable $progressCallback = null): void
    {
        SearchTerm::query()->truncate();

        foreach ($this->entityProvider->all() as $entityModel) {
            // Only load the columns which are read when generating terms.
            $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
            $selectFields = ['id', 'name', $indexContentField];

            // Count using the same (default) query scope as the chunked query below,
            // so the reported total matches the entities which are actually processed.
            $total = $entityModel->newQuery()->count();
            $chunkSize = 250;
            $processed = 0;

            $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $entityModel): void {
                $this->indexEntities($entities->all());
                // Advance by the real chunk size since the last chunk may be partial.
                $processed = min($processed + $entities->count(), $total);

                if ($progressCallback !== null) {
                    $progressCallback($entityModel, $processed, $total);
                }
            };

            $entityModel->newQuery()
                ->select($selectFields)
                ->with(['tags:id,name,value,entity_id,entity_type'])
                ->chunk($chunkSize, $chunkCallback);
        }
    }

    /**
     * Delete related Entity search terms.
     */
    public function deleteEntityTerms(Entity $entity): void
    {
        $entity->searchTerms()->delete();
    }

    /**
     * Insert the given raw term data rows into the database.
     * Rows are inserted in chunks so that a single statement never carries
     * an unbounded number of rows, regardless of how much content was indexed.
     *
     * @param array{term: string, score: int|float, entity_id: int, entity_type: string}[] $terms
     */
    protected function insertTerms(array $terms): void
    {
        foreach (array_chunk($terms, 500) as $termChunk) {
            SearchTerm::query()->insert($termChunk);
        }
    }

    /**
     * Create a scored term array from the given text, where the keys are the terms
     * and the values are their scores.
     * Each score is the term's occurrence count multiplied by the given adjustment,
     * rounded down to a whole number.
     *
     * @return array<string, int>
     */
    protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
    {
        $termMap = $this->textToTermCountMap($text);

        foreach ($termMap as $term => $count) {
            // floor() returns a float, so cast to provide the documented integer score.
            $termMap[$term] = (int) floor($count * $scoreAdjustment);
        }

        return $termMap;
    }

    /**
     * Create a scored term array from the given HTML, where the keys are the terms
     * and the values are their scores.
     * Terms are counted per direct child of the document body, and counts for terms
     * within a heading child (h1-h6) are multiplied by a per-level weighting.
     * Scores can be fractional since the h6 weighting is not a whole number.
     *
     * @return array<string, int|float>
     */
    protected function generateTermScoreMapFromHtml(string $html): array
    {
        if (empty($html)) {
            return [];
        }

        $scoresByTerm = [];
        $elementScoreAdjustmentMap = [
            'h1' => 10,
            'h2' => 5,
            'h3' => 4,
            'h4' => 3,
            'h5' => 2,
            'h6' => 1.5,
        ];

        // Convert line breaks to newlines so the words either side of a break
        // remain separated once only the text content is read.
        $html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
        $doc = new HtmlDocument($html);

        /** @var DOMNode $child */
        foreach ($doc->getBodyChildren() as $child) {
            $scoreAdjustment = $elementScoreAdjustmentMap[$child->nodeName] ?? 1;
            $termCounts = $this->textToTermCountMap(trim($child->textContent));
            foreach ($termCounts as $term => $count) {
                $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + ($count * $scoreAdjustment);
            }
        }

        return $scoresByTerm;
    }

    /**
     * Create a scored term map from the given set of entity tags.
     * Terms found in tag names are weighted by 3, terms found in tag values by 5.
     *
     * @param Tag[] $tags
     *
     * @return array<string, int>
     */
    protected function generateTermScoreMapFromTags(array $tags): array
    {
        $names = [];
        $values = [];

        foreach ($tags as $tag) {
            $names[] = $tag->name;
            $values[] = $tag->value;
        }

        $nameMap = $this->generateTermScoreMapFromText(implode(' ', $names), 3);
        $valueMap = $this->generateTermScoreMapFromText(implode(' ', $values), 5);

        return $this->mergeTermScoreMaps($nameMap, $valueMap);
    }

    /**
     * For the given text, return an array where the keys are the unique term words
     * and the values are the frequency of that term.
     * The text is split on the characters within static::$delimiters.
     * Note: PHP stores purely numeric terms (Such as "2024") as integer array keys.
     *
     * @return array<string, int>
     */
    protected function textToTermCountMap(string $text): array
    {
        $tokenMap = []; // {TextToken => OccurrenceCount}
        $splitChars = static::$delimiters;

        for ($token = strtok($text, $splitChars); $token !== false; $token = strtok($splitChars)) {
            $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
        }

        return $tokenMap;
    }

    /**
     * For the given entity, Generate an array of term data details.
     * Is the raw term data, not instances of SearchTerm models.
     * Combines terms from the entity name, its tags, and its body content
     * (Page HTML, or the "description" attribute for other entity types).
     *
     * @return array{term: string, score: int|float, entity_id: int, entity_type: string}[]
     */
    protected function entityToTermDataArray(Entity $entity): array
    {
        $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
        $tagTermsMap = $this->generateTermScoreMapFromTags($entity->tags->all());

        if ($entity instanceof Page) {
            $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
        } else {
            $bodyTermsMap = $this->generateTermScoreMapFromText($entity->getAttribute('description') ?? '', $entity->searchFactor);
        }

        $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap, $tagTermsMap);

        $dataArray = [];
        $entityId = $entity->id;
        $entityType = $entity->getMorphClass();
        foreach ($mergedScoreMap as $term => $score) {
            $dataArray[] = [
                // Numeric terms are integer array keys, so cast back to a string
                // to ensure every row holds the same value types.
                'term'        => (string) $term,
                'score'       => $score,
                'entity_type' => $entityType,
                'entity_id'   => $entityId,
            ];
        }

        return $dataArray;
    }

    /**
     * For the given term data arrays, Merge their contents by term
     * while combining any scores.
     * Scores for a term which exists in more than one map are summed.
     *
     * @param array<string, int|float>[] ...$scoreMaps
     *
     * @return array<string, int|float>
     */
    protected function mergeTermScoreMaps(...$scoreMaps): array
    {
        $mergedMap = [];

        foreach ($scoreMaps as $scoreMap) {
            foreach ($scoreMap as $term => $score) {
                $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;
            }
        }

        return $mergedMap;
    }
}