src/Lexing/Lexer.php
<?php
declare(strict_types=1);
namespace MyEval\Lexing;
use MyEval\Exceptions\UnknownTokenException;
use function strlen;
/**
* Lexer and Token related classes.
*
* [Lexical analysis](https://en.wikipedia.org/wiki/Lexical_analysis) or *lexing* is the process of converting an input
* string into a sequence of tokens, representing discrete parts of the input each carrying certain meaning.
*
*
* Generic very simple lexer, capable of matching tokens defined by regular expressions.
*
* The Lexer works on an input string, sequentially building a list of matched Tokens (or throwing an Exception if the
* input string cannot be tokenized).
*
* The Lexer is context independent and without lookahead, and cannot for example distinguish between `-` used as a
* binary subtraction operation or as a unary negation. This is handled by the parser.
*
* Tokens are added to the Lexer as TokenDefinition instances, and are saved in an ordered list. Hence, some care has
* to be taken when defining the Lexer (see the implementation of StdMathLexer). For example, if we want the lexer to
* recognize `sin` as well as `sinh` as separate tokens, the more specific `sinh` pattern should be added to the Lexer
* before `sin`.
*/
class Lexer
{
/**
* @var TokenDefinition[] $tokenDefinition List of tokens recognized by the Lexer.
*/
private array $tokenDefinitions = [];
/**
* Add a Token to the list of tokens recognized by the Lexer.
*
* Adds the supplied TokenDefinition at the end of the list of known tokens.
*
* @param TokenDefinition $tokenDefinition Token to add to the list of known tokens.
*
* @return void
*/
public function add(TokenDefinition $tokenDefinition): void
{
$this->tokenDefinitions[] = $tokenDefinition;
}
/**
* Convert an input string to a list of tokens.
*
* Using the list of knows tokens, sequentially match the input string to known tokens.
* Note that the first matching token from the list is chosen, so if there are tokens sharing parts of the pattern
* (e.g. `sin` and `sinh`), care should be taken to add `sinh` before `sin`, otherwise the lexer will never match a
* `sinh`.
*
* @param string $input String to tokenize.
*
* @return Token[] Sequence of recognized tokens that don't match any known token.
* @throws UnknownTokenException Throws when encountering characters in the input string.
*/
public function tokenize(string $input): array
{
// The list of tokens we'll eventually return
$tokens = [];
// The currentIndex indicates where we are inside the input string
$currentIndex = 0;
while ($currentIndex < strlen($input)) {
// We try to match only what is after the currentIndex, as the content before is already converted to tokens
$token = $this->findMatchingToken(substr($input, $currentIndex));
// If no tokens were matched, it means that the string has invalid tokens for which we did not define a
// token definition
if (!$token) {
throw new UnknownTokenException(substr($input, $currentIndex));
}
// Add the matched token to our list of token
$tokens[] = $token;
// Increment the string index by the length of the matched token, so we can now process the rest of string.
$currentIndex += $token->length();
}
return $tokens;
}
/**
* Find a matching token at the beginning of the provided input.
*
* @param string $input
*
* @return Token|null Matched token.
* @throws UnknownTokenException
*/
private function findMatchingToken(string $input): ?Token
{
// Check with all tokenDefinitions
foreach ($this->tokenDefinitions as $tokenDefinition) {
$token = $tokenDefinition->match($input);
// Return the first token that was matched.
if ($token) {
return $token;
}
}
// Return null if no tokens were matched.
return null;
}
}