src/Tokenizer.php
<?php
declare(strict_types=1);
namespace Gdbots\QueryParser;
final class Tokenizer
{
/**
 * Matches one ASCII emoticon (":)", ":-(", "<3", ";-P", ...) that sits at the very start
 * of the subject or directly after a whitespace character (enforced by the lookbehind).
 *
 * It is the first alternative of the split regex built in the constructor and is also
 * used by extractTokens() to classify a chunk as T_EMOTICON.
 *
 * NOTE(review): the alternative `:\-?\|:\-?\/` contains an escaped (literal) pipe, so it
 * matches the combined text ":|:/" rather than ":|" or ":/" separately, and no other
 * alternative matches ":/" on its own. This looks like a missing alternation; confirm
 * the intent before changing the pattern.
 */
const REGEX_EMOTICON = '(?<=^|\s)(?:>:\-?\(|:\-?\)|<3|:\'\(|:\-?\|:\-?\/|:\-?\(|:\-?\*|:\-?\||:o\)|:\-?o|=\-?\)|:\-?D|:\-?p|:\-?P|:\-?b|;\-?p|;\-?P|;\-?b|;\-?\))';
/**
 * Matches a single emoji. Must be used with the "u" (UTF-8) modifier because it relies
 * on \x{...} code points above 0xFF.
 *
 * The pattern is a top-level alternation of three kinds of sequences:
 *   1. one code point out of a character class of symbols and pictographs,
 *   2. a keycap sequence: "#", "*" or a digit followed by U+20E3,
 *   3. a pair of regional-indicator symbols (U+1F1E6..U+1F1FF), grouped by the first one.
 *
 * The literal is assembled from short pieces on purpose: as one multi-kilobyte line it
 * had been broken in the middle of the escape sequence "\x{1f1f6}-\x{1f1fa}", which put a
 * backslash + newline into the character class and made the whole pattern fail to compile.
 * Pieces are only ever split between two complete "\x{...}" items.
 */
const REGEX_EMOJI =
    // 1. single code point symbols and pictographs
    '[\x{2712}\x{2714}\x{2716}\x{271d}\x{2721}\x{2728}\x{2733}\x{2734}\x{2744}\x{2747}\x{274c}\x{274e}\x{2753}-\x{2755}\x{2757}\x{2763}\x{2764}\x{2795}-\x{2797}\x{27a1}\x{27b0}\x{27bf}\x{2934}\x{2935}\x{2b05}-\x{2b07}\x{2b1b}\x{2b1c}\x{2b50}\x{2b55}\x{3030}\x{303d}'
    . '\x{1f004}\x{1f0cf}\x{1f170}\x{1f171}\x{1f17e}\x{1f17f}\x{1f18e}\x{1f191}-\x{1f19a}\x{1f201}\x{1f202}\x{1f21a}\x{1f22f}\x{1f232}-\x{1f23a}\x{1f250}\x{1f251}\x{1f300}-\x{1f321}\x{1f324}-\x{1f393}\x{1f396}\x{1f397}\x{1f399}-\x{1f39b}\x{1f39e}-\x{1f3f0}\x{1f3f3}-\x{1f3f5}\x{1f3f7}-\x{1f4fd}\x{1f4ff}-\x{1f53d}'
    . '\x{1f549}-\x{1f54e}\x{1f550}-\x{1f567}\x{1f56f}\x{1f570}\x{1f573}-\x{1f579}\x{1f587}\x{1f58a}-\x{1f58d}\x{1f590}\x{1f595}\x{1f596}\x{1f5a5}\x{1f5a8}\x{1f5b1}\x{1f5b2}\x{1f5bc}\x{1f5c2}-\x{1f5c4}\x{1f5d1}-\x{1f5d3}\x{1f5dc}-\x{1f5de}\x{1f5e1}\x{1f5e3}\x{1f5ef}\x{1f5f3}\x{1f5fa}-\x{1f64f}'
    . '\x{1f680}-\x{1f6c5}\x{1f6cb}-\x{1f6d0}\x{1f6e0}-\x{1f6e5}\x{1f6e9}\x{1f6eb}\x{1f6ec}\x{1f6f0}\x{1f6f3}\x{1f910}-\x{1f918}\x{1f980}-\x{1f984}\x{1f9c0}'
    . '\x{3297}\x{3299}\x{a9}\x{ae}\x{203c}\x{2049}\x{2122}\x{2139}\x{2194}-\x{2199}\x{21a9}\x{21aa}\x{231a}\x{231b}\x{2328}\x{2388}\x{23cf}\x{23e9}-\x{23f3}\x{23f8}-\x{23fa}\x{24c2}\x{25aa}\x{25ab}\x{25b6}\x{25c0}\x{25fb}-\x{25fe}'
    . '\x{2600}-\x{2604}\x{260e}\x{2611}\x{2614}\x{2615}\x{2618}\x{261d}\x{2620}\x{2622}\x{2623}\x{2626}\x{262a}\x{262e}\x{262f}\x{2638}-\x{263a}\x{2648}-\x{2653}\x{2660}\x{2663}\x{2665}\x{2666}\x{2668}\x{267b}\x{267f}\x{2692}-\x{2694}\x{2696}\x{2697}\x{2699}\x{269b}\x{269c}'
    . '\x{26a0}\x{26a1}\x{26aa}\x{26ab}\x{26b0}\x{26b1}\x{26bd}\x{26be}\x{26c4}\x{26c5}\x{26c8}\x{26ce}\x{26cf}\x{26d1}\x{26d3}\x{26d4}\x{26e9}\x{26ea}\x{26f0}-\x{26f5}\x{26f7}-\x{26fa}\x{26fd}\x{2702}\x{2705}\x{2708}-\x{270d}\x{270f}]'
    // 2. keycap sequences
    . '|\x{23}\x{20e3}|\x{2a}\x{20e3}|\x{30}\x{20e3}|\x{31}\x{20e3}|\x{32}\x{20e3}|\x{33}\x{20e3}|\x{34}\x{20e3}|\x{35}\x{20e3}|\x{36}\x{20e3}|\x{37}\x{20e3}|\x{38}\x{20e3}|\x{39}\x{20e3}'
    // 3. regional-indicator pairs
    . '|\x{1f1e6}[\x{1f1e8}-\x{1f1ec}\x{1f1ee}\x{1f1f1}\x{1f1f2}\x{1f1f4}\x{1f1f6}-\x{1f1fa}\x{1f1fc}\x{1f1fd}\x{1f1ff}]'
    . '|\x{1f1e7}[\x{1f1e6}\x{1f1e7}\x{1f1e9}-\x{1f1ef}\x{1f1f1}-\x{1f1f4}\x{1f1f6}-\x{1f1f9}\x{1f1fb}\x{1f1fc}\x{1f1fe}\x{1f1ff}]'
    . '|\x{1f1e8}[\x{1f1e6}\x{1f1e8}\x{1f1e9}\x{1f1eb}-\x{1f1ee}\x{1f1f0}-\x{1f1f5}\x{1f1f7}\x{1f1fa}-\x{1f1ff}]'
    . '|\x{1f1e9}[\x{1f1ea}\x{1f1ec}\x{1f1ef}\x{1f1f0}\x{1f1f2}\x{1f1f4}\x{1f1ff}]'
    . '|\x{1f1ea}[\x{1f1e6}\x{1f1e8}\x{1f1ea}\x{1f1ec}\x{1f1ed}\x{1f1f7}-\x{1f1fa}]'
    . '|\x{1f1eb}[\x{1f1ee}-\x{1f1f0}\x{1f1f2}\x{1f1f4}\x{1f1f7}]'
    . '|\x{1f1ec}[\x{1f1e6}\x{1f1e7}\x{1f1e9}-\x{1f1ee}\x{1f1f1}-\x{1f1f3}\x{1f1f5}-\x{1f1fa}\x{1f1fc}\x{1f1fe}]'
    . '|\x{1f1ed}[\x{1f1f0}\x{1f1f2}\x{1f1f3}\x{1f1f7}\x{1f1f9}\x{1f1fa}]'
    . '|\x{1f1ee}[\x{1f1e8}-\x{1f1ea}\x{1f1f1}-\x{1f1f4}\x{1f1f6}-\x{1f1f9}]'
    . '|\x{1f1ef}[\x{1f1ea}\x{1f1f2}\x{1f1f4}\x{1f1f5}]'
    . '|\x{1f1f0}[\x{1f1ea}\x{1f1ec}-\x{1f1ee}\x{1f1f2}\x{1f1f3}\x{1f1f5}\x{1f1f7}\x{1f1fc}\x{1f1fe}\x{1f1ff}]'
    . '|\x{1f1f1}[\x{1f1e6}-\x{1f1e8}\x{1f1ee}\x{1f1f0}\x{1f1f7}-\x{1f1fb}\x{1f1fe}]'
    . '|\x{1f1f2}[\x{1f1e6}\x{1f1e8}-\x{1f1ed}\x{1f1f0}-\x{1f1ff}]'
    . '|\x{1f1f3}[\x{1f1e6}\x{1f1e8}\x{1f1ea}-\x{1f1ec}\x{1f1ee}\x{1f1f1}\x{1f1f4}\x{1f1f5}\x{1f1f7}\x{1f1fa}\x{1f1ff}]'
    . '|\x{1f1f4}\x{1f1f2}'
    . '|\x{1f1f5}[\x{1f1e6}\x{1f1ea}-\x{1f1ed}\x{1f1f0}-\x{1f1f3}\x{1f1f7}-\x{1f1f9}\x{1f1fc}\x{1f1fe}]'
    . '|\x{1f1f6}\x{1f1e6}'
    . '|\x{1f1f7}[\x{1f1ea}\x{1f1f4}\x{1f1f8}\x{1f1fa}\x{1f1fc}]'
    . '|\x{1f1f8}[\x{1f1e6}-\x{1f1ea}\x{1f1ec}-\x{1f1f4}\x{1f1f7}-\x{1f1f9}\x{1f1fb}\x{1f1fd}-\x{1f1ff}]'
    . '|\x{1f1f9}[\x{1f1e6}\x{1f1e8}\x{1f1e9}\x{1f1eb}-\x{1f1ed}\x{1f1ef}-\x{1f1f4}\x{1f1f7}\x{1f1f9}\x{1f1fb}\x{1f1fc}\x{1f1ff}]'
    . '|\x{1f1fa}[\x{1f1e6}\x{1f1ec}\x{1f1f2}\x{1f1f8}\x{1f1fe}\x{1f1ff}]'
    . '|\x{1f1fb}[\x{1f1e6}\x{1f1e8}\x{1f1ea}\x{1f1ec}\x{1f1ee}\x{1f1f3}\x{1f1fa}]'
    . '|\x{1f1fc}[\x{1f1eb}\x{1f1f8}]'
    . '|\x{1f1fd}\x{1f1f0}'
    . '|\x{1f1fe}[\x{1f1ea}\x{1f1f9}]'
    . '|\x{1f1ff}[\x{1f1e6}\x{1f1f2}\x{1f1fc}]';
/*
 * Building blocks for the tokenizer. None of them carries delimiters or anchors; callers
 * wrap them ("/^" . X . "$/", or as alternatives of the split regex) as needed.
 * Most allow an optional leading "+" or "-" (the required/prohibited operators).
 */

/** "scheme://rest": the rest runs up to the next whitespace, "^" or "~". */
const REGEX_URL = '[+-]?[\w-]+:\/\/[^\s\/$.?#].[^\s\^~]*';
/** A double quoted phrase; a doubled quote ("") is accepted inside the phrase. */
const REGEX_PHRASE = '[+-]?"(?:""|[^"])*"';
/** One or more "#" followed by ASCII letters, digits or underscores. */
const REGEX_HASHTAG = '[+-]?#+[a-zA-Z0-9_]+';
/** One or more "@" followed by ASCII letters, digits, underscores, and then optionally dots/hyphens too. */
const REGEX_MENTION = '[+-]?@+[a-zA-Z0-9_]+(?:[a-zA-Z0-9_\.\-]+)?';
/**
 * Integer/decimal with optional exponent. Not referenced by the methods of this class
 * visible here (extractTokens() detects numbers with is_numeric()).
 */
const REGEX_NUMBER = '(?:[+-]?[0-9]+(?:[\.][0-9]+)*)(?:[eE][+-]?[0-9]+)?';
/** A date shaped like YYYY-MM-DD; only the digit counts are checked, not the calendar validity. */
const REGEX_DATE = '[+-]?\d{4}-\d{2}-\d{2}';
/** A field name terminated by ":"; it must start with an ASCII letter or underscore. */
const REGEX_FIELD = '[+-]?[a-zA-Z\_]+(?:[a-zA-Z0-9_\.\-]+)?:';
/**
 * A run of characters that are not whitespace and not one of ( ) \ ^ < > [ ] { } ~ =
 * Because of the "*" quantifier this also matches the empty string.
 */
const REGEX_WORD = '[+-]?[^\s\(\)\\\\^\<\>\[\]\{\}~=]*';
/** A word must contain at least one ASCII letter/digit or Unicode letter (\pL), otherwise it is ignored. */
const REGEX_WORD_MINIMUM = '[a-zA-Z0-9\pL]+';
const IGNORED_LEAD_TRAIL_CHARS = "#@,.!?;|&+-^~*\\\"' \t\n\r ";
/**
 * True while a field is open, i.e. after a T_FIELD_START token was added and before
 * the matching T_FIELD_END. Switching it on/off is what establishes a proper T_FIELD_END.
 * It is also consulted to enforce the range and subquery rules (comparison operators and
 * range brackets are only accepted while a field is open).
 *
 * @var bool
 */
private bool $inField = false;
/**
 * True between T_SUBQUERY_START and T_SUBQUERY_END.
 *
 * This tokenizer only supports one level of subquery (for now): an opening parenthesis
 * seen while a subquery is already open is dropped. The goal is only to take a query
 * from a user like "funny #cats plays:>500" and parse that to a simple object which
 * can be translated to a sql, elasticsearch, riak, etc. query.
 *
 * @var bool
 */
private bool $inSubquery = false;
/**
 * This tokenizer only supports one range to be open at a time (excl or incl).
 * Starting a new range of any type is ignored if one is already open and
 * closing a range that never started is also ignored.
 *
 * The value is the token type of the range that is open
 * (Token::T_RANGE_INCL_START or Token::T_RANGE_EXCL_START) or 0 when none is open.
 *
 * @var int
 */
private int $inRange = 0;
/**
 * The regex used to split the initial input into chunks that will be
 * checked for tokens during scan/tokenization. Built once in the constructor.
 *
 * @var string
 */
private string $splitRegex;
/**
 * Tokens collected for the input currently being scanned, in input order.
 * While scanning this also holds whitespace and ignored tokens; scan() filters
 * those out before building the TokenStream.
 *
 * @var Token[]
 */
private array $tokens = [];
/**
 * The last token that was added (whitespace and ignored tokens included).
 * Initialised to a T_WHITE_SPACE token so the first chunk is treated as
 * following whitespace.
 *
 * @var Token
 */
private Token $lastToken;
/**
 * Builds the split regex and puts the tokenizer into its initial state.
 *
 * Each pattern becomes its own capture group in one case-insensitive, UTF-8
 * alternation; the order below is the order in which the alternatives are tried.
 */
public function __construct()
{
    $alternatives = [
        self::REGEX_EMOTICON,
        self::REGEX_URL,
        self::REGEX_PHRASE,
        self::REGEX_FIELD,
        self::REGEX_WORD,
    ];

    $this->splitRegex = '/(' . implode(')|(', $alternatives) . ')/iu';
    $this->lastToken = new Token(Token::T_WHITE_SPACE);
}
/**
 * Tokenizes the given input from scratch.
 *
 * The tokenizer state is reset first, so tokens left over from a previous
 * input are discarded. A field or subquery that is still open when the input
 * ends is closed automatically. Whitespace and ignored tokens are removed
 * from the result.
 *
 * @param string $input
 *
 * @return TokenStream
 */
public function scan(string $input): TokenStream
{
    // Prefix a space, squeeze every whitespace run into one space and pull doubled quotes apart.
    $squeezed = preg_replace('/\s+/', ' ', ' ' . $input);
    $input = str_replace('""', '" "', $squeezed);

    $this->inField = false;
    $this->inSubquery = false;
    $this->inRange = 0;
    $this->tokens = [];
    $this->lastToken = new Token(Token::T_WHITE_SPACE);

    // every split result is a [text, offset] pair; only the text is needed
    foreach ($this->splitInput($input) as [$chunk]) {
        $this->extractTokens(trim($chunk));

        // whitespace ends a field, unless a range or a subquery is still open
        $fieldIsOver = $this->lastToken->isWhiteSpace()
            && $this->inField
            && 0 === $this->inRange
            && !$this->inSubquery;

        if ($fieldIsOver) {
            $this->inField = false;
            $this->addOperatorToken(Token::T_FIELD_END);
        }
    }

    // close whatever the input left open
    if ($this->inField) {
        $this->inField = false;
        $this->addOperatorToken(Token::T_FIELD_END);
    }

    if ($this->inSubquery) {
        $this->inSubquery = false;
        $this->addOperatorToken(Token::T_SUBQUERY_END);
    }

    $meaningful = array_filter(
        $this->tokens,
        static fn (Token $token): bool => !($token->isWhiteSpace() || $token->isIgnored())
    );
    $this->tokens = array_values($meaningful);

    return new TokenStream($this->tokens);
}
/**
 * Splits the input into chunks that will be scanned for tokens.
 *
 * Every chunk is a two element array: the text at index 0 and its byte offset
 * in the input at index 1 (PREG_SPLIT_OFFSET_CAPTURE). Empty chunks are dropped and
 * the text matched by the split regex itself is kept (PREG_SPLIT_DELIM_CAPTURE).
 *
 * @param string $input
 *
 * @return array An empty array when the input could not be split.
 */
private function splitInput(string $input): array
{
    $flags = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE;
    $chunks = preg_split($this->splitRegex, $input, -1, $flags);

    // preg_split() returns false on failure, e.g. when the input is not valid UTF-8
    // (the split regex uses the "u" modifier). Returning that value from a method
    // declared ": array" would raise a TypeError, so treat such input as containing no tokens.
    if (false === $chunks) {
        return [];
    }

    return $chunks;
}
/**
 * Appends a value-less (operator) token and remembers it as the last token.
 * Nothing happens when the last token already has the same type, so the
 * same operator is never added twice in a row.
 *
 * @param int $type
 */
private function addOperatorToken(int $type): void
{
    if (!$this->lastToken->typeEquals($type)) {
        $this->lastToken = new Token($type);
        $this->tokens[] = $this->lastToken;
    }
}
/**
 * Appends a token carrying a value and remembers it as the last token.
 * Unlike addOperatorToken() this never skips repeated types.
 *
 * @param int               $type
 * @param float|string|null $value
 */
private function addToken(int $type, float|string|null $value): void
{
    $this->lastToken = new Token($type, $value);
    $this->tokens[] = $this->lastToken;
}
/**
 * Turns one chunk of the split input into zero or more tokens.
 *
 * The checks run in a fixed priority order: whitespace, number, symbol/keyword,
 * leading "+"/"-" operator, emoticon, emoji, url, field start, phrase, ".." range
 * shorthand, hashtag, mention, date and finally a plain word. A chunk that fits
 * none of them is stored as T_IGNORED.
 *
 * @param string $value A trimmed chunk; the empty string stands for whitespace.
 */
private function extractTokens(string $value): void
{
    if ('' === $value) {
        // An operator or ignored token directly in front of whitespace applies to nothing: drop it.
        if ($this->lastToken->typeEqualsAnyOf([Token::T_REQUIRED, Token::T_PROHIBITED, Token::T_IGNORED])) {
            // todo: review the process of bool operators following ignored values.
            array_pop($this->tokens);
        }

        $this->addOperatorToken(Token::T_WHITE_SPACE);
        return;
    }

    if (is_numeric($value)) {
        $this->addToken(Token::T_NUMBER, (float)$value);
        return;
    }

    if ($this->extractSymbolOrKeyword($value)) {
        return;
    }

    // A leading "+" or "-" becomes its own operator token; the remainder is classified below.
    switch ($value[0]) {
        case '+':
            $this->addOperatorToken(Token::T_REQUIRED);
            $value = substr($value, 1);
            break;

        case '-':
            $this->addOperatorToken(Token::T_PROHIBITED);
            $value = substr($value, 1);
            break;

        default:
            break;
    }

    if (preg_match('/^' . self::REGEX_EMOTICON . '$/', $value)) {
        $this->addToken(Token::T_EMOTICON, trim($value, self::IGNORED_LEAD_TRAIL_CHARS));
        return;
    }

    // NOTE(review): REGEX_EMOJI is a top-level alternation, so "^" and "$" only bind to its
    // first and last alternative; a chunk that merely starts with an emoji matches too.
    // Left as is because callers may rely on it - confirm before anchoring the whole pattern.
    if (preg_match('/^' . self::REGEX_EMOJI . '$/u', $value)) {
        $this->addToken(Token::T_EMOJI, trim($value, self::IGNORED_LEAD_TRAIL_CHARS));
        return;
    }

    if (preg_match('/^' . self::REGEX_URL . '$/', $value)) {
        $this->addToken(Token::T_URL, trim($value, self::IGNORED_LEAD_TRAIL_CHARS));
        return;
    }

    // A field can only start outside of another field or subquery and only right after
    // whitespace, a bool operator, the end of a field or the start of a subquery.
    if (!$this->inField && !$this->inSubquery
        && preg_match('/^' . self::REGEX_FIELD . '$/', $value)
        && $this->lastToken->typeEqualsAnyOf([
            Token::T_WHITE_SPACE,
            Token::T_REQUIRED,
            Token::T_PROHIBITED,
            Token::T_FIELD_END,
            Token::T_SUBQUERY_START,
        ])
    ) {
        $this->inField = true;
        $this->addToken(Token::T_FIELD_START, rtrim($value, ':'));
        return;
    }

    if (preg_match('/^' . self::REGEX_PHRASE . '$/', $value)) {
        $value = trim(trim($value, '"'));

        // Compare against '' explicitly: empty() is also true for the string "0",
        // which would silently discard the perfectly valid phrase "0".
        if ('' !== $value) {
            $this->addToken(Token::T_PHRASE, $value);
        } else {
            $this->addToken(Token::T_IGNORED, $value);
        }

        return;
    }

    // "a..b" range shorthand: tokenize both sides separately with a ".." symbol in between.
    if (str_contains($value, '..')) {
        $parts = explode('..', $value, 2);
        $this->extractTokens($parts[0]);
        $this->extractSymbolOrKeyword('..');
        $this->extractTokens($parts[1] ?? '');
        return;
    }

    // Hashtags, mentions and dates tolerate trailing punctuation; the stored value has the
    // ignorable characters stripped from both ends (which also removes the "#" / "@" prefix).
    if (preg_match('/^' . self::REGEX_HASHTAG . '$/', rtrim($value, self::IGNORED_LEAD_TRAIL_CHARS))) {
        $this->addToken(Token::T_HASHTAG, trim($value, self::IGNORED_LEAD_TRAIL_CHARS));
        return;
    }

    if (preg_match('/^' . self::REGEX_MENTION . '$/', rtrim($value, self::IGNORED_LEAD_TRAIL_CHARS))) {
        $this->addToken(Token::T_MENTION, trim($value, self::IGNORED_LEAD_TRAIL_CHARS));
        return;
    }

    if (preg_match('/^' . self::REGEX_DATE . '$/', rtrim($value, self::IGNORED_LEAD_TRAIL_CHARS))) {
        $this->addToken(Token::T_DATE, trim($value, self::IGNORED_LEAD_TRAIL_CHARS));
        return;
    }

    // REGEX_WORD is unanchored and may match the empty string, so any chunk that gets
    // this far passes the check; the real filtering happens on the trimmed value below.
    if (preg_match('/' . self::REGEX_WORD . '/', $value)) {
        $hasTrailingWildcard = str_ends_with($value, '*');
        $value2 = trim($value, self::IGNORED_LEAD_TRAIL_CHARS . '/');

        // Explicit '' comparison for the same reason as above: with empty() a chunk such
        // as "0!" or "0," (trimmed to "0") was ignored while "1!" became a word.
        if ('' !== $value2) {
            /*
             * When in a field or subquery you can get a value which itself looks like the start
             * of a field, e.g. "field:vevo:video". We don't want two words here so
             * merge the last "word" token value with this one.
             */
            if ($this->lastToken->typeEquals(Token::T_WORD)
                && str_ends_with($this->lastToken->getValue(), ':')
            ) {
                $value2 = array_pop($this->tokens)->getValue() . $value2;
            }

            // a word without a single letter or digit carries no search value
            if (!preg_match('/' . self::REGEX_WORD_MINIMUM . '/u', $value2)) {
                $this->addToken(Token::T_IGNORED, $value2);
                return;
            }

            $this->addToken(Token::T_WORD, $value2);

            if ($hasTrailingWildcard) {
                $this->addOperatorToken(Token::T_WILDCARD);
            }

            return;
        }
    }

    $this->addToken(Token::T_IGNORED, $value);
}
/**
 * Tries to interpret the chunk as an operator symbol or a reserved keyword.
 *
 * A recognised symbol is always consumed (true is returned), even when the current
 * state makes it meaningless and no token is produced for it. E.g. you can't boost
 * whitespace (" ^5") and a range bracket outside of a field does nothing; in those
 * cases the symbol is silently dropped.
 *
 * @param string $value
 *
 * @return bool True if a symbol or keyword was extracted/processed.
 */
private function extractSymbolOrKeyword(string $value): bool
{
    $length = strlen($value);
    if ($length > 3) {
        // no symbol or keyword handled below is longer than three bytes
        return false;
    }

    switch ($value) {
        case '+':
        case '-':
            $this->addOperatorToken('+' === $value ? Token::T_REQUIRED : Token::T_PROHIBITED);
            return true;

        case '>':
        case '<':
            // comparisons are only accepted on a field value and never inside a range
            if ($this->inField && 0 === $this->inRange) {
                $this->addOperatorToken('>' === $value ? Token::T_GREATER_THAN : Token::T_LESS_THAN);
            }
            return true;

        case '=':
            // only meaningful as the second half of ">=" or "<="
            if ($this->lastToken->typeEquals(Token::T_GREATER_THAN)
                || $this->lastToken->typeEquals(Token::T_LESS_THAN)
            ) {
                $this->addOperatorToken(Token::T_EQUALS);
            }
            return true;

        case '~':
        case '^':
            // can't fuzzy/boost parts of a range or sub query, nor whitespace: the symbol is ignored
            if ($this->inSubquery || 0 !== $this->inRange || $this->lastToken->isWhiteSpace()) {
                return true;
            }

            // the modifier applies to the field as a whole, so the field ends here
            if ($this->inField) {
                $this->inField = false;
                $this->addOperatorToken(Token::T_FIELD_END);
            }

            $this->addOperatorToken('~' === $value ? Token::T_FUZZY : Token::T_BOOST);
            return true;

        case '[':
        case '{':
            // a range can only be opened inside a field and only one at a time
            if ($this->inField && 0 === $this->inRange) {
                $this->inRange = '[' === $value ? Token::T_RANGE_INCL_START : Token::T_RANGE_EXCL_START;
                $this->addOperatorToken($this->inRange);
            }
            return true;

        case ']':
        case '}':
            if (0 === $this->inRange) {
                // nothing to close
                return true;
            }

            // the end token follows the type of range that was opened, not the bracket used to close it
            $this->addOperatorToken(
                Token::T_RANGE_INCL_START === $this->inRange
                    ? Token::T_RANGE_INCL_END
                    : Token::T_RANGE_EXCL_END
            );
            $this->inRange = 0;
            $this->inField = false;
            $this->addOperatorToken(Token::T_FIELD_END);
            return true;

        case '(':
            // sub queries can't be nested or exist in a range.
            if ($this->inSubquery || 0 !== $this->inRange) {
                return true;
            }

            $this->addOperatorToken(Token::T_SUBQUERY_START);
            $this->inSubquery = true;
            return true;

        case ')':
            if (!$this->inSubquery || 0 !== $this->inRange) {
                return true;
            }

            $this->inSubquery = false;
            $this->addOperatorToken(Token::T_SUBQUERY_END);

            // a subquery used as a field value also terminates that field
            if ($this->inField) {
                $this->addOperatorToken(Token::T_FIELD_END);
                $this->inField = false;
            }
            return true;

        case '*':
            $this->addOperatorToken(Token::T_WILDCARD);
            return true;

        case '||':
        case 'OR':
            $this->addOperatorToken(Token::T_OR);
            return true;

        case '&&':
        case 'AND':
            $this->addOperatorToken(Token::T_AND);
            return true;

        case '..':
        case 'TO':
            if (0 !== $this->inRange) {
                $this->addOperatorToken(Token::T_TO);
                return true;
            }

            // outside of a range ".." is dropped while "TO" is just an ordinary word
            if ('TO' === $value) {
                $this->addToken(Token::T_WORD, $value);
            }
            return true;

        default:
            if (1 !== $length) {
                break;
            }

            if (!ctype_alpha($value)) {
                $this->addToken(Token::T_IGNORED, $value);
                return true;
            }

            /*
             * A word, followed ":", followed by a single char "thing:a".
             * can be made into one token.
             * todo: review words that look like fields. seems wonky.
             */
            if ($this->lastToken->typeEquals(Token::T_WORD)
                && str_ends_with($this->lastToken->getValue(), ':')
            ) {
                $value = array_pop($this->tokens)->getValue() . $value;
            }

            $this->addToken(Token::T_WORD, $value);
            return true;
    }

    return false;
}
}