compiler/src/main/scala/amyc/parsing/Lexer.scala
package amyc
package parsing
import amyc.{core, parsing}
import amyc.utils.*
import amyc.utils.Position
import amyc.utils.Position.withPosition
import java.io.File
import silex.*
// The lexer for Amy.
object Lexer extends Pipeline[List[File], Iterator[Token]] with Lexers {
/**
* Tiny Silex reference:
* \============================== Silex's lexer essentially allows you to
* define a list of regular expressions in their order of priority. To
* tokenize a given input stream of characters, each individual regular
* expression is applied in turn. If a given expression matches, it is used
* to produce a token of maximal length. Whenever a regular expression does
* not match, the expression of next-highest priority is tried. The result is
* a stream of tokens.
*
* Regular expressions `r` can be built using the following operators:
* - `word("abc")` matches the sequence "abc" exactly
* - `r1 | r2` matches either expression `r1` or expression `r2`
* - `r1 ~ r2` matches `r1` followed by `r2`
* - `oneOf("xy")` matches either "x" or "y" (i.e., it is a shorthand of
* `word` and `|` for single characters)
* - `elem(c)` matches character `c`
* - `elem(f)` matches any character for which the boolean predicate `f`
* holds
* - `opt(r)` matches `r` or nothing at all
* - `many(r)` matches any number of repetitions of `r` (including none at
* all)
* - `many1(r)` matches any non-zero number of repetitions of `r`
*
* To define the token that should be output for a given expression, one can
* use the `|>` combinator with an expression on the left-hand side and a
* function producing the token on the right. The function is given the
* sequence of matched characters and the source-position range as arguments.
*
* For instance,
*
* `elem(_.isDigit) ~ word("kg") |> { (cs, range) =>
* WeightLiteralToken(cs.mkString).setPos(range._1)) }`
*
* will match a single digit followed by the characters "kg" and turn them
* into a "WeightLiteralToken" whose value will be the full string matched
* (e.g. "1kg").
*/
// Type of characters consumed.
type Character = Char
// Type of positions.
type Position = SourcePosition
// Type of tokens produced.
type Token = parsing.Token
private type P = Producer
private type L = Lexer
import Tokens.*
lazy val keywords: P =
parsing.keywords.map(kw => word(kw.toString)).reduce(_ | _)
|> { (cs, range) =>
withPosition(range._1) {
KeywordToken(cs.mkString)
}
}
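// Illustration (hedged, since `parsing.keywords` is defined elsewhere): if the
// keyword list contained "if" and "else", the map/reduce above would build the
// single producer `word("if") | word("else")`, i.e. one alternative per keyword.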
lazy val modifiers: P =
parsing.modifiers.map(mod => word(mod.toString)).reduce(_ | _)
|> { (cs, range) =>
withPosition(range._1){
ModifierToken(cs.mkString)
}
}
// Boolean literals.
lazy val boolLitToken: P =
word("true") | word("false")
|> { (cs, range) =>
withPosition(range._1) {
BoolLitToken(java.lang.Boolean.parseBoolean(cs.mkString))
}
}
// Identifiers and operators.
// TODO HR: only accepts '=='.
lazy val identifiers: P =
(elem(_.isUnicodeIdentifierStart) ~ many(elem(_.isUnicodeIdentifierPart))) |
(oneOf("+-*%!<&|") ~ many(oneOf("=+-*/%!<&|"))) |
(oneOf("+-*/%!<&|") ~ many(oneOf("=+-%!<&|"))) | // Not allowing tokens such as // and /*
word("==") | oneOf("+-!")
|> { (cs, range) =>
withPosition(range._1) {
cs.mkString match
case "+" | "-" | "!" => OperatorToken(cs.mkString)
case _ => IdentifierToken(cs.mkString)
}
}
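// Examples (sketch): "foo", "x1", "<=" and "==" all become IdentifierToken,
// while a lone '+', '-' or '!' is emitted as an OperatorToken by the match above.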
// Integer literals.
lazy val intLitToken: P =
many1(elem(_.isDigit))
|> { (cs, range) =>
withPosition(range._1) {
val literal = BigInt(cs.mkString)
if literal <= Int.MaxValue then IntLitToken(literal.toInt)
else ErrorToken(cs.mkString)
}
}
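// Example: "42" yields IntLitToken(42), whereas "9999999999" does not fit in
// an Int (max 2147483647) and yields an ErrorToken instead.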
// String literals.
lazy val stringLitToken: P =
word("\"") ~ many(elem(x => !x.isControl && x != '"')) ~ word("\"")
|> { (cs, range) =>
withPosition(range._1) {
val str = cs.mkString
StringLitToken(str.substring(1, str.length - 1))
}
}
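// Note on the rule above: `!x.isControl` rejects newlines, so string literals
// cannot span lines; e.g. "\"hello\"" becomes StringLitToken("hello"). An
// unclosed '"' matches no producer and falls through to the `onError` handler below.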
lazy val delimiters: P =
oneOf(";,{}():.=") | word("=>") | word("::")
|> { (cs, range) =>
withPosition(range._1) {
DelimiterToken(cs.mkString)
}
}
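// For the delimiters above, maximal-length matching means "=>" lexes as a
// single DelimiterToken("=>") rather than as DelimiterToken("=") followed by '>'.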
// Whitespace.
lazy val whitespace: P =
elem(_.isWhitespace)
|> { (_, range) =>
withPosition(range._1) {
SpaceToken()
}
}
// Single-line comments.
lazy val singleLineComment: P =
word("//") ~ many(elem(_ != '\n'))
|> { (cs, range) =>
withPosition(range._1) {
CommentToken(cs.mkString(""))
}
}
// Multi-line comments.
// NOTE: Amy does not support nested multi-line comments (e.g. `/* foo /* bar */ */`).
// Unclosed multi-line comments are caught by `multiLineCommentError` below and reported as an ErrorToken.
lazy val multiLineComment: P =
word("/*") ~ many(
(many1(word("*")) ~ elem(x => x != '/' && x != '*')) | elem(_ != '*')
) ~ many(word("*")) ~ word("*/")
|> { (cs, range) =>
withPosition(range._1) {
val str = cs.mkString
CommentToken(str.substring(2, str.length - 2))
}
}
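// Example: "/* hello *** world */" is matched in full by the rule above and,
// once the "/*" and "*/" markers are stripped, becomes CommentToken(" hello *** world ").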
lazy val multiLineCommentError: P =
word("/*") ~ many(
(many1(word("*")) ~ elem(x => x != '/' && x != '*')) | elem(_ != '*')
)
|> { (cs, range) =>
withPosition(range._1) {
val str = cs.mkString
ErrorToken(str)
}
}
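// A closed comment is matched by `multiLineComment` (tried first, and over
// more characters), so only an unterminated "/* ..." reaches
// `multiLineCommentError`, which reports it as an ErrorToken.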
// ==============================================================================================
// =========================================== LEXER ============================================
// ==============================================================================================
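// Producers listed first take priority on ties: assuming "if" is among
// `parsing.keywords`, it matches both `keywords` and `identifiers` at the
// same length, and lexes as a KeywordToken because `keywords` comes first.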
lazy val lexer: L = Lexer(
keywords,
modifiers,
boolLitToken,
identifiers,
intLitToken,
stringLitToken,
delimiters,
whitespace,
singleLineComment,
multiLineComment,
multiLineCommentError
) onError {
// We also emit ErrorTokens for Silex-handled errors.
(cs, range) =>
withPosition(range._1) {
ErrorToken(cs.mkString)
}
} onEnd {
// Once all the input has been consumed, we emit one EOFToken.
withPosition(_) {
EOFToken()
}
}
override val name = "Lexer"
override def run(files: List[File])(using core.Context): Iterator[Token] = {
var it = Iterator.empty[Token]
for(file <- files) {
val source = Source.fromFile(file.toString, SourcePositioner(file))
it ++= lexer.spawn(source).filter {
case CommentToken(_) => false
case SpaceToken() => false
case token @ ErrorToken(error) =>
reporter.error(s"Unknown token: $error", token.position)
false
case _ => true
}
}
it
}
}