app/resto/core/RestoQueryAnalyzer.php

Summary

Maintainability
D
2 days
Test Coverage
<?php
/*
 * Copyright 2018 Jérôme Gasperi
 *
 * Licensed under the Apache License, version 2.0 (the "License");
 * You may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

/**
 * Simple query analyzer
 *
 * Turns the raw search parameters (free text "searchTerms", STAC/WFS datetime,
 * toponym names) into the filters that are eventually applied to a search,
 * and reports how the input was understood.
 */
class RestoQueryAnalyzer
{
    /**
     * @var RestoContext
     */
    private $context;

    /**
     * @var RestoUser
     */
    private $user;

    /**
     * Reference to Gazetteer add-on - left unset when the add-on is not
     * declared in the context
     *
     * @var Gazetteer|null
     */
    private $gazetteer;

    /**
     * Constructor
     *
     * @param RestoContext $context
     * @param RestoUser $user
     */
    public function __construct($context, $user)
    {
        $this->context = $context;
        $this->user = $user;
        if (isset($this->context->addons['Gazetteer'])) {
            $this->gazetteer = new Gazetteer($this->context, $this->user);
        }
    }

    /**
     * Query analyzer process searchTerms and modify query parameters accordingly
     *
     * The returned array always contains:
     *   - 'inputFilters' : $params exactly as received
     *   - 'details'      : analysis result, including 'appliedFilters'
     * and additionally 'notUnderstood' => true when searchTerms were provided
     * but nothing (What, When, Where) could be extracted from them.
     *
     * @param array $params
     * @param RestoModel $model
     * @return array
     */
    public function analyze($params, $model)
    {
        /*
         * Store original params
         */
        $inputFilters = $params;

        /*
         * [STAC][WFS] datetime is converted into start/end
         */
        if (isset($params['resto:datetime'])) {
            $this->splitDatetime($params['resto:datetime'], $params);
            unset($params['resto:datetime']);
        }

        /*
         * Check dates
         */
        if (isset($params['time:start'], $params['time:end']) && $params['time:start'] > $params['time:end']) {
            RestoLogUtil::httpError(400, 'Invalid dates range - start cannot be greater than end');
        }

        /*
         * Details analysis
         */
        $details = array(
            'language' => $this->context->lang,
            'What' => array(),
            'When' => array(),
            'Where' => array(),
            'Errors' => array(),
            'Explained' => array()
        );

        $hashTodiscard = null;

        /*
         * Query Analyzer on searchTerms
         */
        if (isset($params['searchTerms']) && isset($this->context->addons['NLP'])) {
            $nlp = new NLP($this->context, $this->user);
            $details = $nlp->process(array(
                'q' => $params['searchTerms']
            ));
        } else {
            /*
             * Extract hashtags (i.e. #something or -#something)
             */
            $hashtags = isset($params['searchTerms']) ? RestoUtil::extractHashtags($params['searchTerms']) : array();
            $nbOfHashtags = count($hashtags);
            if ($nbOfHashtags > 0) {
                /*
                 * Special gazetteer hashtags - if found, the first is converted to geouid
                 * A gazetteer hashtag format is type:name:geouid
                 *
                 * The geouid prefix is built with the same separator that
                 * extractToponym() uses to recognize it
                 */
                if (!isset($params['geo:name'])) {
                    for ($i = 0; $i < $nbOfHashtags; $i++) {
                        $splitted = explode(RestoConstants::TAG_SEPARATOR, $hashtags[$i]);
                        if (count($splitted) === 3 && is_numeric($splitted[2])) {
                            $params['geo:name'] = 'geouid' . RestoConstants::TAG_SEPARATOR . $splitted[2];
                            // The hashtag is consumed as a location - do not search on it as a keyword
                            array_splice($hashtags, $i, 1);
                            break;
                        }
                    }
                }

                $details['What'] = array(
                    'searchTerms' => $this->appendSkos($hashtags)
                );
            }

            /*
             * Extract toponym
             */
            if (isset($this->gazetteer)) {
                $this->extractToponym($params, $details, $hashTodiscard);
            }
        }

        /*
         * Not understood
         */
        if (isset($params['searchTerms']) && empty($details['What']) && empty($details['When']) && empty($details['Where'])) {
            $details['appliedFilters'] = $this->addOperation($params, $model->searchFilters);
            return array(
                'inputFilters' => $inputFilters,
                'notUnderstood' => true,
                'details' => $details
            );
        }

        /*
         * What, When, Where - applied in that order because setWhatFilters
         * resets searchTerms to an array that the next two steps append to
         * and that setWhereFilters finally joins back into a string
         */
        $appliedParams = $this->setWhatFilters($details['What'], $params);
        $appliedParams = $this->setWhenFilters($details['When'], $appliedParams);
        $appliedParams = $this->setWhereFilters($details['Where'], $appliedParams, $hashTodiscard);

        $details['appliedFilters'] = $this->addOperation($appliedParams, $model->searchFilters);
        return array(
            'inputFilters' => $inputFilters,
            'details' => $details
        );
    }

    /**
     * Parse input $hashtags array and replace individual hashtag with skos related
     * hashtags.
     *
     * When the SKOS class is available, the last character of a hashtag selects
     * the relation to expand:
     *  - "!" : search for broader search terms
     *  - "*" : search for narrower search terms
     *  - "~" : search for related search terms
     * The suffix is removed and the relations found (if any) are appended to the
     * hashtag, separated by "|". Other hashtags are returned untouched.
     *
     * @param array $hashtags
     * @return array
     */
    private function appendSkos($hashtags)
    {
        $suffixes = array('!', '*', '~');

        foreach ($hashtags as $index => $hashtag) {
            $lastCharacter = substr($hashtag, -1);

            // SKOS static properties must not be touched unless the class exists
            if (!in_array($lastCharacter, $suffixes, true) || !class_exists('SKOS')) {
                continue;
            }

            $relationTypes = array(
                '!' => SKOS::$SKOS_BROADER,
                '*' => SKOS::$SKOS_NARROWER,
                '~' => SKOS::$SKOS_RELATED
            );

            $term = substr($hashtag, 0, -1);

            // Don't forget to trim # prefix (i.e. first character) before asking for relations
            $related = (new SKOS($this->context, $this->user))->retrieveRecursiveRelations(substr($term, 1), $relationTypes[$lastCharacter]);

            $hashtags[$index] = count($related) > 0 ? $term . '|' . join('|', $related) : $term;
        }

        return $hashtags;
    }

    /**
     * Extract toponym from gazetteer
     *
     * $params is received by value and is only read here: the location found
     * is prepended to $details['Where'] (later consumed by setWhereFilters) and
     * described in $details['Explained'].
     *
     * NOTE(review): the previous implementation also wrote geo:geometry /
     * geo:lon / geo:lat into its local copy of $params; these writes were never
     * visible to the caller and have been removed. Confirm that a by-reference
     * $params was not the original intent.
     *
     * @param array $params
     * @param array $details
     * @param array $hashToDiscard Set to the "hash" of the toponym when it is
     *                             retrieved from its identifier and has one
     */
    private function extractToponym($params, &$details, &$hashToDiscard)
    {
        /*
         * Order is "name" over "searchTerms"
         */
        $locationName = $params['geo:name'] ?? $params['searchTerms'] ?? null;

        /*
         * No toponym search without a name or when a position/geometry is explicitly provided
         */
        if (!isset($locationName) || isset($params['geo:lon']) || isset($params['geo:geometry'])) {
            return;
        }

        $foundLocation = null;
        $geouidPrefix = 'geouid' . RestoConstants::TAG_SEPARATOR;

        if (strpos($locationName, $geouidPrefix) === 0) {
            /*
             * Search on toponym identifier i.e. geo:name starts with geouid
             */
            $location = $this->gazetteer->getToponym(array(
                'id' => substr($locationName, strlen($geouidPrefix)),
                'index' => $this->context->core['planet']
            ));
            if (isset($location['_source'])) {
                $foundLocation = array_merge(array('_id' => $location['_id']), $location['_source']);
                if (isset($foundLocation['hash'])) {
                    $hashToDiscard = $foundLocation['hash'];
                }
            }
        } else {
            /*
             * [IMPORTANT] The search is performed on a modified "searchTerms" with hashtags REMOVED
             */
            $locationName = trim(preg_replace("/(#|-#)([^ ]+)/", '', $locationName));
            if ($locationName !== '') {
                $locations = $this->gazetteer->search(array(
                    'q' => $locationName,
                    'index' => $this->context->core['planet']
                ));

                // Only the first hit is used - a response without usable hits is ignored
                if (isset($locations['hits']['hits'][0]['_source'])) {
                    $firstHit = $locations['hits']['hits'][0];
                    $foundLocation = array_merge(array('_id' => $firstHit['_id']), $firstHit['_source']);
                }
            }
        }

        if (isset($foundLocation)) {
            $details['Where'] = array_merge(array($foundLocation), $details['Where']);
            $details['Explained'] = array_merge(array(
                'processor' => 'WhereProcessor::processIn',
                'word' => $foundLocation['name'] ?? null
            ), $details['Explained']);
        }
    }


    /**
     * Set location filters from query analysis
     *
     * Entries of $where are processed from the last one to the first one, so
     * when several entries set the same filter the entry with the lowest index
     * is applied last. $params['searchTerms'] is expected to be an array (as set
     * by setWhatFilters); it is joined into a space separated string, or removed
     * when empty, before returning.
     *
     * @param array $where
     * @param array $params
     * @param string $hashTodiscard
     * @return array
     */
    private function setWhereFilters($where, $params, $hashTodiscard = null)
    {
        for ($i = count($where) - 1; $i >= 0; $i--) {
            $entry = $where[$i];

            if (isset($entry['wkt'])) {
                /*
                 * Geometry
                 */
                $params['geo:geometry'] = $entry['wkt'];
            } elseif (isset($entry['coordinates'])) {
                /*
                 * Coordinates are read as "lat,lon"
                 */
                $coordinates = array_map('trim', explode(',', $entry['coordinates']));
                $params['geo:lon'] = floatval($coordinates[1]);
                $params['geo:lat'] = floatval($coordinates[0]);
            } elseif (isset($entry['searchTerms'])) {
                /*
                 * Searching for hash/keywords is faster than geometry
                 */
                $params['searchTerms'][] = $entry['searchTerms'];
            } elseif (isset($entry['geouid'])) {
                /*
                 * Skip the toponym whose hash was flagged by extractToponym
                 */
                if (!isset($hashTodiscard) || ($entry['hash'] ?? null) !== $hashTodiscard) {
                    // NOTE(review): the branch is selected on "geouid" but the identifier was read
                    // from "geonameid" only - fall back on "geouid" when "geonameid" is absent.
                    // Confirm which key the entries actually carry.
                    $params['searchTerms'][] = 'geouid' . RestoConstants::TAG_SEPARATOR . ($entry['geonameid'] ?? $entry['geouid']);
                }
            }
        }

        if (count($params['searchTerms']) > 0) {
            $params['searchTerms'] = join(' ', $params['searchTerms']);
        } else {
            unset($params['searchTerms']);
        }
        return $params;
    }

    /**
     * Set what filters from query analysis
     *
     * $params['searchTerms'] is always reset to an array: it receives the
     * "searchTerms" entries of $what in reverse order. Every other key of $what
     * is copied as is into $params.
     *
     * @param array $what
     * @param array $params
     * @return array
     */
    private function setWhatFilters($what, $params)
    {
        $params['searchTerms'] = array();
        foreach ($what as $key => $value) {
            if ($key !== 'searchTerms') {
                $params[$key] = $value;
                continue;
            }
            for ($i = count($value) - 1; $i >= 0; $i--) {
                $params['searchTerms'][] = $value[$i];
            }
        }
        return $params;
    }

    /**
     * Set when filters from query analysis
     *
     * @param array $when
     * @param array $params
     * @return array
     */
    private function setWhenFilters($when, $params)
    {
        foreach ($when as $key => $value) {
            if ($key === 'times') {
                /*
                 * times is an array of time:start/time:end pairs
                 * All pairs are flattened by timesToOpenSearch - when several pairs
                 * define the same key, the last pair wins
                 */
                $params = array_merge($params, $this->timesToOpenSearch($value));
            } else {
                /*
                 * Any other entry becomes a key<separator>value search term
                 */
                $params['searchTerms'][] = $key . RestoConstants::TAG_SEPARATOR . $value;
            }
        }
        return $params;
    }

    /**
     * Flatten an array of key/value pairs arrays into a single key/value array.
     * A key present in several pairs takes the value of the last pair.
     *
     * @param array $times
     * @return array
     */
    private function timesToOpenSearch($times)
    {
        $params = array();
        foreach ($times as $pair) {
            foreach ($pair as $key => $value) {
                $params[$key] = $value;
            }
        }
        return $params;
    }

    /**
     * Convert datetime to start/end filters
     *
     * $datetime is either a single date or a "start/end" interval where one
     * bound can be left open with an empty string or "..". Each provided bound
     * is stripped from anything looking like a <tag>, stored in $params under
     * the filter name given by DefaultModel and validated by that model.
     *
     * @param string $datetime
     * @param array $params
     */
    private function splitDatetime($datetime, &$params)
    {
        $openBounds = array('', '..');
        $dates = explode('/', trim($datetime));

        /*
         * Double-open-ended queries are not allowed in STAC API
         */
        if (count($dates) > 2) {
            RestoLogUtil::httpError(400, 'Invalid dates range - too many /');
        } elseif (count($dates) == 2 && in_array($dates[0], $openBounds, true) && in_array($dates[1], $openBounds, true)) {
            RestoLogUtil::httpError(400, 'Invalid dates range - double-open-ended queries are not allowed in STAC API /');
        }

        $model = new DefaultModel();

        if (isset($dates[0]) && !in_array($dates[0], $openBounds, true)) {
            $filterKey = $model->getFilterName('start');
            $params[$filterKey] = preg_replace('/<.+?>/', '', $dates[0]);
            $model->validateFilter($filterKey, $params[$filterKey]);
        }
        if (isset($dates[1]) && !in_array($dates[1], $openBounds, true)) {
            $filterKey = $model->getFilterName('end');
            $params[$filterKey] = preg_replace('/<.+?>/', '', $dates[1]);
            $model->validateFilter($filterKey, $params[$filterKey]);
        }
    }

    /**
     * Return parameters with value and operation
     *
     * Each parameter is wrapped into array('value' => ..., 'operation' => ...)
     * where operation is read from $searchFilters[<key>]['operation'] (null if
     * not defined). A parameter that already carries an 'operation' entry is
     * returned untouched.
     *
     * @param array $params
     * @param array $searchFilters
     * @return array
     */
    private function addOperation($params, $searchFilters)
    {
        $paramsWithOperation = array();
        foreach ($params as $key => $value) {
            // Only add operation if not already there
            if (!is_string($value) && isset($value['operation'])) {
                $paramsWithOperation[$key] = $value;
                continue;
            }
            $paramsWithOperation[$key] = array(
                'value' => $value,
                'operation' => $searchFilters[$key]['operation'] ?? null
            );
        }
        return $paramsWithOperation;
    }
}