markusos/simple-search

View on GitHub
src/Search/Engine.php

Summary

Maintainability
A
0 mins
Test Coverage
<?php namespace Search;

use Search\Config\Config;

/**
 * Class Engine
 * Main Search Engine class. Wires together the document store, the index,
 * the tokenizer and the ranker that are supplied by the Config.
 * @package Search
 */
class Engine
{

    /**
     * @var Config Configuration the store, index, tokenizer, ranker and stop words are read from
     */
    private $config;

    /**
     * @var mixed Document store, as returned by Config::getStore()
     */
    private $store;

    /**
     * @var mixed Search index, as returned by Config::getIndex()
     */
    private $index;

    /**
     * Construct a new Search Engine instance
     * @param Config $config Set the search engine configuration.
     *                       When omitted, the builder's default configuration is used.
     */
    public function __construct(Config $config = null)
    {
        if ($config === null) {
            $config = Config::createBuilder()->defaultConfig()->build();
        }

        $this->config = $config;
        $this->store = $this->config->getStore();
        $this->index = $this->config->getIndex();
    }

    /**
     * Add a new Document to the search index.
     * The document is mutated: its id is set to the current store size and
     * its tokens are set to the tokenized content, before it is handed to
     * both the store and the index.
     * @param Document $document
     */
    public function addDocument(Document $document)
    {
        // The id is taken from the store size *before* the document is stored.
        $document->id = $this->size();
        $document->tokens = $this->config->getTokenizer()->tokenize($document->content);

        $this->store->addDocument($document);
        $this->index->addDocument($document);
    }

    /**
     * Get the size of the search index
     * @return int number of indexed documents, as reported by the document store
     */
    public function size()
    {
        return $this->store->size();
    }

    /**
     * Clear the search index of all indexed documents
     * @param string $clear what to clear, default 'all', supports 'store', 'index' and 'all'.
     *                      Any other value is silently ignored.
     */
    public function clear($clear = 'all')
    {
        switch ($clear) {
            case 'store':
                $this->store->clear();
                break;
            case 'index':
                $this->index->clear();
                break;
            case 'all':
                $this->store->clear();
                $this->index->clear();
                break;
        }
    }

    /**
     * Search the search index for matching documents.
     * Result is ranked and ordered by the document ranker class
     * @param string $query The search query used to find matching documents
     * @return array Array of Documents matching the search query, highest score first.
     *               Each returned document has its score property set.
     */
    public function search($query)
    {
        $ranker = $this->config->getRanker();
        $queryTokens = $this->config->getTokenizer()->tokenize($query);

        // Filter stop words. The list is fetched once instead of once per token,
        // and compared strictly so that numeric-looking tokens (e.g. "1e3")
        // are not treated as equal to a different stop word (e.g. "1000").
        // Note: array_filter preserves the original token keys.
        $stopWords = $this->config->getStopWords();
        $queryTokens = array_filter($queryTokens, function ($token) use ($stopWords) {
            return !in_array($token, $stopWords, true);
        });

        // Init the ranker with the query
        $ranker->init($queryTokens, $this->size());

        // Find matching documents
        $documentIds = [];
        foreach ($queryTokens as $token) {
            $result = $this->index->search($token);
            $ranker->cacheTokenFrequency($token, count($result));
            // NOTE(review): array union keeps the first value seen for each key,
            // so this presumably relies on the index returning results keyed by
            // document id - confirm against the index implementation.
            $documentIds += $result;
        }

        // Get matching documents from document store
        $documents = $this->store->getDocuments($documentIds);

        // Rank found documents
        foreach ($documents as $document) {
            $document->score = $ranker->rank($document);
        }

        // Sort the result according to document rank, highest score first.
        // Written with explicit branches: an unparenthesized nested ternary is
        // left-associative before PHP 8 (ties would compare as 1, not 0) and
        // is a compile error from PHP 8 on.
        usort($documents, function ($a, $b) {
            if ($a->score == $b->score) {
                return 0;
            }

            return ($a->score > $b->score) ? -1 : 1;
        });

        return $documents;
    }

    /**
     * Utils function used to find keywords in a query.
     * Unlike search(), stop words are not filtered out of the query here.
     * @param $query string to identify keywords in
     * @return array of keywords, ordered by the ranker class
     */
    public function findKeywords($query)
    {

        $tokenizer = $this->config->getTokenizer();
        $ranker = $this->config->getRanker();

        $tokens = $tokenizer->tokenize($query);
        $ranker->init($tokens, $this->size());

        // Feed the ranker the number of index hits for every token
        foreach ($tokens as $token) {
            $result = $this->index->search($token);
            $ranker->cacheTokenFrequency($token, count($result));
        }

        $keywords = $ranker->findKeywords($tokens);

        // If tokens are stemmed, look up original word
        // (each keyword entry is an array carrying the token under 'keyword')
        if ($tokenizer instanceof Tokenizer\StemTokenizer) {
            $keywords = array_map(function ($token) use ($tokenizer) {
                $token['keyword'] = $tokenizer->getWord($token['keyword']);
                return $token;
            }, $keywords);
        }

        return $keywords;
    }
}