src/Builder/ElasticaQueryBuilder.php
<?php
declare(strict_types=1);
namespace Gdbots\QueryParser\Builder;
use Elastica\Query\AbstractQuery;
use Elastica\Query\BoolQuery;
use Elastica\Query\Exists;
use Elastica\Query\Nested;
use Elastica\Query\Range as RangeQuery;
use Elastica\QueryBuilder as RuflinQueryBuilder;
use Gdbots\QueryParser\Enum\BoolOperator;
use Gdbots\QueryParser\Enum\ComparisonOperator;
use Gdbots\QueryParser\Node\Date;
use Gdbots\QueryParser\Node\DateRange;
use Gdbots\QueryParser\Node\Emoji;
use Gdbots\QueryParser\Node\Emoticon;
use Gdbots\QueryParser\Node\Field;
use Gdbots\QueryParser\Node\Node;
use Gdbots\QueryParser\Node\Numbr;
use Gdbots\QueryParser\Node\Phrase;
use Gdbots\QueryParser\Node\Range;
use Gdbots\QueryParser\Node\Subquery;
use Gdbots\QueryParser\Node\Word;
/**
 * Query builder that turns the nodes it is handed into Elastica query
 * objects and collects them in a single bool query, which is exposed
 * through {@see ElasticaQueryBuilder::getBoolQuery()}.
 */
class ElasticaQueryBuilder extends AbstractQueryBuilder
{
    /**
     * Elastica's query builder, used as the factory for the query
     * objects created by this class.
     */
    protected RuflinQueryBuilder $qb;

    /**
     * The bool query that clauses are currently added to. While a subquery
     * is being processed this is a fresh bool query, see startSubquery().
     */
    protected BoolQuery $boolQuery;

    /**
     * When a subquery is entered we'll take the current query
     * and save it here. After the subquery completes we inject
     * the query back into the outer query.
     *
     * @var BoolQuery
     */
    protected BoolQuery $outerBoolQuery;

    /** When true, Emoji nodes are skipped by addTermToQuery(). */
    protected bool $ignoreEmojis = true;

    /** When true, Emoticon nodes are skipped by addTermToQuery(). */
    protected bool $ignoreEmoticons = true;

    /** When true, Word nodes flagged as stop words are skipped by addTextToQuery(). */
    protected bool $ignoreStopWords = true;

    /** When true, term values (except those of Numbr nodes) are lower cased by addTermToQuery(). */
    protected bool $lowerCaseTerms = true;

    /**
     * Field names which are nested objects in ElasticSearch and
     * must be queried using a nested query.
     *
     * The field paths are stored as the array KEYS so membership can be
     * checked with isset(); the values carry no meaning.
     *
     * @link https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html
     *
     * @var array<string, mixed>
     */
    protected array $nestedFields = [];

    /**
     * Any fields encountered that are nested are stored as a nested query
     * keyed by the nested field path and query method. e.g. "comments-addMust"
     *
     * The nested query contains a bool query and works exactly like the bool
     * query non-nested queries are added to.
     *
     * @var Nested[]
     */
    protected array $nestedQueries = [];

    /**
     * Sets "_all" as the default field name, creates the Elastica query
     * builder and initializes the (empty) bool query state via clear().
     */
    public function __construct()
    {
        $this->defaultFieldName = '_all';
        $this->qb = new RuflinQueryBuilder();
        $this->clear();
    }

    /**
     * Resets the builder: a new empty bool query becomes both the active and
     * the outer bool query and all tracked nested queries are dropped.
     *
     * @return self
     */
    public function clear(): self
    {
        $this->boolQuery = $this->qb->query()->bool();
        $this->outerBoolQuery = $this->boolQuery;
        $this->nestedQueries = [];

        return $this;
    }

    /**
     * @param bool $ignoreEmojis When true, Emoji nodes are not added to the query.
     *
     * @return self
     */
    public function ignoreEmojis(bool $ignoreEmojis = true): self
    {
        $this->ignoreEmojis = $ignoreEmojis;

        return $this;
    }

    /**
     * @param bool $ignoreEmoticons When true, Emoticon nodes are not added to the query.
     *
     * @return self
     */
    public function ignoreEmoticons(bool $ignoreEmoticons = true): self
    {
        $this->ignoreEmoticons = $ignoreEmoticons;

        return $this;
    }

    /**
     * @param bool $ignoreStopWords When true, Word nodes that are stop words are not added to the query.
     *
     * @return self
     */
    public function ignoreStopWords(bool $ignoreStopWords = true): self
    {
        $this->ignoreStopWords = $ignoreStopWords;

        return $this;
    }

    /**
     * @param bool $lowerCaseTerms When true, term values (except numbers) are lower cased.
     *
     * @return self
     */
    public function lowerCaseTerms(bool $lowerCaseTerms = true): self
    {
        $this->lowerCaseTerms = $lowerCaseTerms;

        return $this;
    }

    /**
     * Replaces the set of nested field paths.
     *
     * @param string[] $fields Nested field paths, e.g. ["comments"].
     *
     * @return self
     */
    public function setNestedFields(array $fields): self
    {
        // store the paths as keys with the same "true" marker addNestedField() uses
        $this->nestedFields = array_fill_keys($fields, true);

        return $this;
    }

    /**
     * Marks a single field path as nested.
     *
     * @param string $fieldName
     *
     * @return self
     */
    public function addNestedField(string $fieldName): self
    {
        $this->nestedFields[$fieldName] = true;

        return $this;
    }

    /**
     * Removes a field path from the set of nested fields (no-op when absent).
     *
     * @param string $fieldName
     *
     * @return self
     */
    public function removeNestedField(string $fieldName): self
    {
        unset($this->nestedFields[$fieldName]);

        return $this;
    }

    /**
     * @return array The nested field paths currently registered.
     */
    public function getNestedFields(): array
    {
        return array_keys($this->nestedFields);
    }

    /**
     * Returns the bool query built so far.
     *
     * Note that when the query has no "must" clause this call sets
     * minimum_should_match to "2<80%" on the bool query before returning it.
     *
     * @link https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-minimum-should-match.html
     *
     * @return BoolQuery
     */
    public function getBoolQuery(): BoolQuery
    {
        if ($this->boolQuery->hasParam('must')) {
            // if a "must" is used we assume they wanted everything else optional
            return $this->boolQuery;
        }

        return $this->boolQuery->setMinimumShouldMatch('2<80%');
    }

    /**
     * Adds a range query for the field to the active query.
     *
     * The field's bool operator selects must/must_not/should. When the range
     * is cacheable, anything that is not prohibited goes into the filter
     * context instead and no boost is applied.
     *
     * @param Range $range
     * @param Field $field
     * @param bool  $cacheable
     */
    protected function handleRange(Range $range, Field $field, bool $cacheable = false): void
    {
        $useBoost = $field->useBoost();
        $boost = $field->getBoost();

        $method = match ($field->getBoolOperator()) {
            BoolOperator::REQUIRED => 'addMust',
            BoolOperator::PROHIBITED => 'addMustNot',
            default => 'addShould',
        };

        if ($range->isExclusive()) {
            $lowerOperator = 'gt';
            $upperOperator = 'lt';
        } else {
            $lowerOperator = 'gte';
            $upperOperator = 'lte';
        }

        $data = [];

        if ($range instanceof DateRange) {
            if ($range->hasLowerNode()) {
                $data[$lowerOperator] = $range->getLowerNode()
                    ->toDateTime($this->localTimeZone)
                    ->format('Y-m-d');
            }

            if ($range->hasUpperNode()) {
                // the upper bound is pushed to the following day
                $data[$upperOperator] = $range->getUpperNode()
                    ->toDateTime($this->localTimeZone)
                    ->modify('+1 day')
                    ->format('Y-m-d');
            }
        } else {
            if ($range->hasLowerNode()) {
                $data[$lowerOperator] = $range->getLowerNode()->getValue();
            }

            if ($range->hasUpperNode()) {
                $data[$upperOperator] = $range->getUpperNode()->getValue();
            }
        }

        $fieldName = $field->getName();

        if ($cacheable) {
            // prohibited ranges stay in must_not, everything else is a filter
            $target = 'addMustNot' === $method ? $method : 'addFilter';
            $this->addToBoolQuery($target, $fieldName, $this->qb->query()->range($fieldName, $data));

            return;
        }

        if ($useBoost) {
            $data['boost'] = $boost;
        }

        $this->addToBoolQuery($method, $fieldName, $this->qb->query()->range($fieldName, $data));
    }

    /**
     * Remembers the active bool query as the outer query and makes a new,
     * empty bool query the active one for the duration of the subquery.
     *
     * @param Subquery   $subquery
     * @param Field|null $field
     */
    protected function startSubquery(Subquery $subquery, ?Field $field = null): void
    {
        $this->outerBoolQuery = $this->boolQuery;
        $this->boolQuery = $this->qb->query()->bool();
    }

    /**
     * Attaches the subquery's bool query (if anything was added to it) to the
     * outer bool query and makes the outer query the active one again.
     *
     * Boost and bool operator are taken from the field when inField()
     * reports true, otherwise from the subquery itself.
     *
     * @param Subquery   $subquery
     * @param Field|null $field
     */
    protected function endSubquery(Subquery $subquery, ?Field $field = null): void
    {
        $params = $this->boolQuery->getParams();

        if (!empty($params)) {
            $this->boolQuery->setMinimumShouldMatch(1);

            if ($this->inField()) {
                $useBoost = $field->useBoost();
                $boost = $field->getBoost();
                $boolOp = $field->getBoolOperator();
            } else {
                $useBoost = $subquery->useBoost();
                $boost = $subquery->getBoost();
                $boolOp = $subquery->getBoolOperator();
            }

            if ($useBoost) {
                $this->boolQuery->setBoost($boost);
            }

            if ($boolOp === BoolOperator::REQUIRED) {
                $this->outerBoolQuery->addMust($this->boolQuery);
            } elseif ($boolOp === BoolOperator::PROHIBITED) {
                $this->outerBoolQuery->addMustNot($this->boolQuery);
            } else {
                $this->outerBoolQuery->addShould($this->boolQuery);
            }
        }

        $this->boolQuery = $this->outerBoolQuery;
    }

    /**
     * Adds the node as a required ("must") text clause.
     *
     * @param Node       $node
     * @param Field|null $field
     */
    protected function mustMatch(Node $node, ?Field $field = null): void
    {
        $this->addTextToQuery('addMust', $node, $field);
    }

    /**
     * Adds the node as an optional ("should") text clause.
     *
     * @param Node       $node
     * @param Field|null $field
     */
    protected function shouldMatch(Node $node, ?Field $field = null): void
    {
        $this->addTextToQuery('addShould', $node, $field);
    }

    /**
     * Adds the node as a prohibited ("must_not") text clause.
     *
     * @param Node       $node
     * @param Field|null $field
     */
    protected function mustNotMatch(Node $node, ?Field $field = null): void
    {
        $this->addTextToQuery('addMustNot', $node, $field);
    }

    /**
     * Adds a text node to the active query. These all use the "match" when full
     * text searching is needed/supported.
     *
     * Depending on the node this creates a match_phrase query (Phrase), a
     * fuzzy query (fuzzy enabled), a wildcard query (Word with a trailing
     * wildcard) or a plain match query.
     *
     * @param string     $method BoolQuery method to add the clause with, e.g. "addMust".
     * @param Node       $node
     * @param Field|null $field
     */
    protected function addTextToQuery(string $method, Node $node, ?Field $field = null): void
    {
        if ($node instanceof Word && $node->isStopWord() && $this->ignoreStopWords) {
            return;
        }

        $fieldName = $this->inField() ? $field->getName() : $this->defaultFieldName;

        // boost/fuzzy come from the field unless we're inside of a subquery
        if ($this->inField() && !$this->inSubquery()) {
            $useBoost = $field->useBoost();
            $boost = $field->getBoost();
            $useFuzzy = $field->useFuzzy();
            $fuzzy = $field->getFuzzy();
        } else {
            $useBoost = $node->useBoost();
            $boost = $node->getBoost();
            $useFuzzy = $node->useFuzzy();
            $fuzzy = $node->getFuzzy();
        }

        /*
         * Look for special chars and if found, enforce fuzzy.
         * todo: review this with more test cases
         */
        if (!$useFuzzy
            && $node instanceof Phrase
            && 'addShould' === $method
            && preg_match('/[^a-zA-Z0-9\s\._-]+/', $node->getValue())
        ) {
            $useFuzzy = true;
            $fuzzy = 1;
        }

        if ($node instanceof Phrase) {
            $data = ['query' => $node->getValue()];

            if ($useBoost) {
                $data['boost'] = $boost;
            }

            if ($useFuzzy) {
                // for phrases the fuzzy value is applied as the slop
                $data['slop'] = $fuzzy;
            }

            $query = $this->qb->query()->match_phrase($fieldName, $data);
        } elseif ($useFuzzy) {
            $query = $this->qb->query()->fuzzy($fieldName, $node->getValue());
            $query->setFieldOption('fuzziness', $fuzzy);

            if ($useBoost) {
                $query->setFieldOption('boost', $boost);
            }
        } elseif ($node instanceof Word && $node->hasTrailingWildcard()) {
            $query = $this->qb->query()->wildcard(
                $fieldName,
                strtolower($node->getValue()) . '*',
                $useBoost ? $boost : Word::DEFAULT_BOOST
            );
        } else {
            $data = ['query' => $node->getValue(), 'operator' => 'and', 'lenient' => true];

            if ($useBoost) {
                $data['boost'] = $boost;
            }

            $query = $this->qb->query()->match($fieldName, $data);
        }

        $this->addToBoolQuery($method, $fieldName, $query);
    }

    /**
     * Adds the node as a required ("must") term clause.
     *
     * @param Node       $node
     * @param Field|null $field
     * @param bool       $cacheable
     */
    protected function mustMatchTerm(Node $node, ?Field $field = null, bool $cacheable = false): void
    {
        $this->addTermToQuery('addMust', $node, $field, $cacheable);
    }

    /**
     * Adds the node as an optional ("should") term clause.
     *
     * @param Node       $node
     * @param Field|null $field
     */
    protected function shouldMatchTerm(Node $node, ?Field $field = null): void
    {
        $this->addTermToQuery('addShould', $node, $field);
    }

    /**
     * Adds the node as a prohibited ("must_not") term clause.
     *
     * @param Node       $node
     * @param Field|null $field
     * @param bool       $cacheable
     */
    protected function mustNotMatchTerm(Node $node, ?Field $field = null, bool $cacheable = false): void
    {
        $this->addTermToQuery('addMustNot', $node, $field, $cacheable);
    }

    /**
     * Adds a term to the bool query or filter context. Filter context is used when the
     * request for that item could be cached, like documents with hashtag of cats.
     *
     * The special field names "_exists_" and "_missing_" produce an exists
     * query on the field named by the node's value (must / must_not
     * respectively) and are always treated as cacheable.
     *
     * @param string     $method BoolQuery method to add the clause with, e.g. "addMust".
     * @param Node       $node
     * @param Field|null $field
     * @param bool       $cacheable
     */
    protected function addTermToQuery(string $method, Node $node, ?Field $field = null, bool $cacheable = false): void
    {
        if ($node instanceof Emoji && $this->ignoreEmojis) {
            return;
        }

        if ($node instanceof Emoticon && $this->ignoreEmoticons) {
            return;
        }

        $value = $this->lowerCaseTerms && !$node instanceof Numbr
            ? strtolower((string)$node->getValue())
            : $node->getValue();
        $fieldName = $this->inField() ? $field->getName() : $this->defaultFieldName;

        // boost comes from the field unless we're inside of a subquery
        if ($this->inField() && !$this->inSubquery()) {
            $useBoost = $field->useBoost();
            $boost = $field->getBoost();
        } else {
            $useBoost = $node->useBoost();
            $boost = $node->getBoost();
        }

        if ('_exists_' === $fieldName) {
            $term = new Exists($value);
            $method = 'addMust';
            $cacheable = true;
        } elseif ('_missing_' === $fieldName) {
            $term = new Exists($value);
            $method = 'addMustNot';
            $cacheable = true;
        } elseif ($node instanceof Date) {
            $term = $this->createDateRangeForSingleNode(
                $fieldName,
                $node,
                $cacheable,
                $useBoost ? $boost : Date::DEFAULT_BOOST
            );
        } elseif ($node instanceof Numbr && $node->useComparisonOperator()) {
            $data = [$node->getComparisonOperator()->value => $value];

            if ($useBoost) {
                $data['boost'] = $boost;
            }

            $term = $this->qb->query()->range($fieldName, $data);
        } else {
            $term = $this->qb->query()->term();
            $term->setTerm($fieldName, $value, $boost);
        }

        // cacheable clauses go to the filter context, except prohibited
        // ones which have to remain in must_not.
        if ($cacheable && 'addMustNot' !== $method) {
            $method = 'addFilter';
        }

        $this->addToBoolQuery($method, $fieldName, $term);
    }

    /**
     * When dealing with dates we have to create a range, even when the user provides
     * an exact date. This is because a user asking for documents on date 2015-12-01
     * but the value is stored as a timestamp (for example).
     * So we ask for documents >=2015-12-01 and <2015-12-02
     *
     * For any comparison operator other than EQ a single bound using that
     * operator's value is created instead.
     *
     * The Date node is a date with no time component. @see Date::toDateTime
     *
     * @param string $fieldName
     * @param Date   $node
     * @param bool   $cacheable When true no boost is added to the range.
     * @param float  $boost
     *
     * @return RangeQuery
     */
    protected function createDateRangeForSingleNode(
        string $fieldName,
        Date $node,
        bool $cacheable = false,
        float $boost = Date::DEFAULT_BOOST
    ): RangeQuery {
        $operator = $node->getComparisonOperator();

        if ($operator === ComparisonOperator::EQ) {
            $date = $node->toDateTime($this->localTimeZone);
            $data = [
                'gte' => $date->format('Y-m-d'),
                'lt' => $date->modify('+1 day')->format('Y-m-d'),
            ];
        } else {
            $data = [$operator->value => $node->toDateTime($this->localTimeZone)->format('Y-m-d')];
        }

        if (!$cacheable) {
            $data['boost'] = $boost;
        }

        return $this->qb->query()->range($fieldName, $data);
    }

    /**
     * Adds the query to the active bool query, wrapping it in a nested query
     * when the field lives under one of the registered nested field paths.
     *
     * One nested query is created per nested path and method combination and
     * reused for every further clause with that combination.
     *
     * @param string        $method    BoolQuery method to call, e.g. "addMust" or "addFilter".
     * @param string        $fieldName
     * @param AbstractQuery $query
     */
    protected function addToBoolQuery(string $method, string $fieldName, AbstractQuery $query): void
    {
        if (!str_contains($fieldName, '.')) {
            $this->boolQuery->$method($query);
            return;
        }

        $fieldName = str_replace('.raw', '', $fieldName);
        $lastDot = strrpos($fieldName, '.');

        if (false === $lastDot) {
            // e.g. "title.raw" is just "title" once ".raw" is removed, so there is
            // no parent path left to look up. Passing the "false" on to substr()
            // would throw a TypeError because this file uses strict types.
            $this->boolQuery->$method($query);
            return;
        }

        $nestedPath = substr($fieldName, 0, $lastDot);

        if (!isset($this->nestedFields[$nestedPath])) {
            $this->boolQuery->$method($query);
            return;
        }

        $nestedQuery = $nestedPath . '-' . $method;

        if (!isset($this->nestedQueries[$nestedQuery])) {
            $this->nestedQueries[$nestedQuery] = (new Nested())
                ->setQuery($this->qb->query()->bool()->setMinimumShouldMatch('2<80%'))
                ->setPath($nestedPath)
                ->setParam('ignore_unmapped', true);
            $this->boolQuery->$method($this->nestedQueries[$nestedQuery]);
        }

        $this->nestedQueries[$nestedQuery]->getParam('query')->$method($query);
    }
}