// crates/parser/src/bibtex.rs
use logos::Logos;
use rowan::{GreenNode, GreenNodeBuilder};
use syntax::bibtex::SyntaxKind::{self, *};
/// Parses BibTeX source text into a rowan green tree rooted at a `ROOT` node.
///
/// The input is scanned with the top-level `RootToken` lexer. `@preamble`,
/// `@string` and every other `@type` keyword are handed to their dedicated
/// sub-parsers; `@comment` keywords and the text between entries are added to
/// the tree as single tokens.
pub fn parse_bibtex(input: &str) -> GreenNode {
    let mut cursor = TokenPtr {
        builder: GreenNodeBuilder::new(),
        lexer: RootToken::lexer(input),
        token: None,
    };

    cursor.builder.start_node(ROOT.into());
    loop {
        // Each sub-parser takes the cursor by value and hands it back.
        cursor = match cursor.current() {
            None => break,
            Some(RootToken::Preamble) => preamble(cursor),
            Some(RootToken::String) => string(cursor),
            Some(RootToken::Entry) => entry(cursor),
            Some(RootToken::Comment | RootToken::Junk) => {
                cursor.bump();
                cursor
            }
        };
    }
    cursor.builder.finish_node();
    cursor.builder.finish()
}
fn preamble(mut ptr: TokenPtr<RootToken>) -> TokenPtr<RootToken> {
ptr.builder.start_node(PREAMBLE.into());
ptr.bump();
let mut ptr = ptr.morph();
ptr.expect(BodyToken::Whitespace);
ptr.expect(BodyToken::LDelim);
ptr.expect(BodyToken::Whitespace);
ptr = value(ptr.morph()).morph();
ptr.expect(BodyToken::RDelim);
ptr.builder.finish_node();
ptr.morph()
}
fn string(mut ptr: TokenPtr<RootToken>) -> TokenPtr<RootToken> {
ptr.builder.start_node(STRING.into());
ptr.bump();
let mut ptr = ptr.morph();
ptr.expect(BodyToken::Whitespace);
ptr.expect(BodyToken::LDelim);
ptr.expect(BodyToken::Whitespace);
ptr.expect(BodyToken::Name);
ptr.expect(BodyToken::Whitespace);
ptr.expect(BodyToken::Eq);
ptr.expect(BodyToken::Whitespace);
ptr = value(ptr.morph()).morph();
ptr.expect(BodyToken::RDelim);
ptr.builder.finish_node();
ptr.morph()
}
fn entry(mut ptr: TokenPtr<RootToken>) -> TokenPtr<RootToken> {
ptr.builder.start_node(ENTRY.into());
ptr.bump();
let mut ptr = ptr.morph();
ptr.expect(BodyToken::Whitespace);
ptr.expect(BodyToken::LDelim);
ptr.expect(BodyToken::Whitespace);
ptr.expect(BodyToken::Name);
ptr.expect(BodyToken::Whitespace);
ptr.expect(BodyToken::Comma);
ptr.expect(BodyToken::Whitespace);
while ptr.at(BodyToken::Name) {
ptr = field(ptr);
ptr.expect(BodyToken::Whitespace);
}
ptr.expect(BodyToken::RDelim);
ptr.builder.finish_node();
ptr.morph()
}
/// Parses a single `name = value,` field into a `FIELD` node.
///
/// The current token is consumed unconditionally as the field name (`entry`
/// only calls this when positioned at a name); the `=`, the value and the
/// trailing comma are skipped when absent.
fn field(mut ptr: TokenPtr<BodyToken>) -> TokenPtr<BodyToken> {
    ptr.builder.start_node(FIELD.into());
    // Field name.
    ptr.bump();
    for kind in [
        BodyToken::Whitespace,
        BodyToken::Eq,
        BodyToken::Whitespace,
    ] {
        ptr.expect(kind);
    }

    // The value uses its own token set; switch back to the body set afterwards.
    let mut rest = value(ptr.morph()).morph::<BodyToken>();
    rest.expect(BodyToken::Whitespace);
    rest.expect(BodyToken::Comma);
    rest.builder.finish_node();
    rest
}
/// Parses a value: a literal, a curly group or a quote group.
///
/// If the value is followed by `#`, the value, the `#` and the (recursively
/// parsed) right-hand side are wrapped together in a `JOIN` node. Nothing is
/// consumed when the input is exhausted or the current token is `#`, `,` or
/// `}`.
///
/// # Panics
///
/// Panics if the current token is whitespace; the callers in this file skip
/// whitespace before calling.
fn value(mut ptr: TokenPtr<ValueToken>) -> TokenPtr<ValueToken> {
    // Remember where the value starts so it can be wrapped in a JOIN later.
    let join_start = ptr.builder.checkpoint();

    let mut ptr = match ptr.current() {
        None
        | Some(ValueToken::Pound)
        | Some(ValueToken::Comma)
        | Some(ValueToken::RCurly) => return ptr,
        Some(ValueToken::Whitespace) => unreachable!(),
        Some(ValueToken::Integer) | Some(ValueToken::Name) => literal(ptr),
        // Groups are lexed with the content token set.
        Some(ValueToken::LCurly) => curly_group(ptr.morph()).morph::<ValueToken>(),
        Some(ValueToken::Quote) => quote_group(ptr.morph()).morph::<ValueToken>(),
    };

    ptr.expect(ValueToken::Whitespace);
    if !ptr.at(ValueToken::Pound) {
        return ptr;
    }

    // Concatenation: retroactively open the JOIN node in front of the left operand.
    ptr.builder.start_node_at(join_start, JOIN.into());
    ptr.bump();
    ptr.expect(ValueToken::Whitespace);
    let mut ptr = value(ptr);
    ptr.builder.finish_node();
    ptr
}
/// Wraps the current token in a `LITERAL` node.
///
/// # Panics
///
/// Panics if there is no current token (see `TokenPtr::bump`).
fn literal(mut cursor: TokenPtr<ValueToken>) -> TokenPtr<ValueToken> {
    let tree = &mut cursor.builder;
    tree.start_node(LITERAL.into());
    cursor.bump();
    cursor.builder.finish_node();
    cursor
}
/// Parses a `{ ... }` group into a `CURLY_GROUP` node.
///
/// The current token is consumed unconditionally as the opening brace. The
/// content is read until a closing brace or the end of input; nested groups,
/// accents and commands get their own nodes, everything else is added as
/// plain tokens. The closing brace is consumed when present.
fn curly_group(mut ptr: TokenPtr<ContentToken>) -> TokenPtr<ContentToken> {
    ptr.builder.start_node(CURLY_GROUP.into());
    // Opening brace.
    ptr.bump();
    ptr.expect(ContentToken::Whitespace);

    loop {
        ptr = match ptr.current() {
            None | Some(ContentToken::RCurly) => break,
            Some(ContentToken::LCurly) => curly_group(ptr),
            Some(ContentToken::AccentName) => accent(ptr),
            Some(ContentToken::CommandName) => command(ptr),
            Some(
                ContentToken::Whitespace
                | ContentToken::Nbsp
                | ContentToken::Comma
                | ContentToken::Integer
                | ContentToken::Quote
                | ContentToken::Word,
            ) => {
                ptr.bump();
                ptr
            }
        };
    }

    ptr.expect(ContentToken::RCurly);
    ptr.builder.finish_node();
    ptr
}
/// Parses a `" ... "` group into a `QUOTE_GROUP` node.
///
/// The current token is consumed unconditionally as the opening quote. The
/// content is read until the next quote token or the end of input; nested
/// curly groups, accents and commands get their own nodes, everything else
/// (including a stray `}`) is added as plain tokens. The closing quote is
/// consumed when present.
fn quote_group(mut ptr: TokenPtr<ContentToken>) -> TokenPtr<ContentToken> {
    ptr.builder.start_node(QUOTE_GROUP.into());
    // Opening quote.
    ptr.bump();
    ptr.expect(ContentToken::Whitespace);

    loop {
        ptr = match ptr.current() {
            None | Some(ContentToken::Quote) => break,
            Some(ContentToken::LCurly) => curly_group(ptr),
            Some(ContentToken::AccentName) => accent(ptr),
            Some(ContentToken::CommandName) => command(ptr),
            Some(
                ContentToken::Whitespace
                | ContentToken::Nbsp
                | ContentToken::Comma
                | ContentToken::RCurly
                | ContentToken::Integer
                | ContentToken::Word,
            ) => {
                ptr.bump();
                ptr
            }
        };
    }

    ptr.expect(ContentToken::Quote);
    ptr.builder.finish_node();
    ptr
}
/// Parses an accent command and its optional target into an `ACCENT` node.
///
/// The current token is consumed unconditionally as the accent name. The
/// target is a single word or command name, either bare or wrapped in one
/// pair of braces, with optional whitespace around it. The target and the
/// closing brace are skipped when absent.
fn accent(mut ptr: TokenPtr<ContentToken>) -> TokenPtr<ContentToken> {
    ptr.builder.start_node(ACCENT.into());
    // Accent name.
    ptr.bump();
    ptr.expect(ContentToken::Whitespace);

    // Remember whether the target is braced so the closing brace is only
    // looked for in that case.
    let braced = ptr.at(ContentToken::LCurly);
    if braced {
        ptr.bump();
        ptr.expect(ContentToken::Whitespace);
    }

    let has_target = matches!(
        ptr.current(),
        Some(ContentToken::Word | ContentToken::CommandName)
    );
    if has_target {
        ptr.bump();
    }

    if braced {
        ptr.expect(ContentToken::Whitespace);
        ptr.expect(ContentToken::RCurly);
    }

    ptr.builder.finish_node();
    ptr
}
/// Wraps the current token in a `COMMAND` node.
///
/// # Panics
///
/// Panics if there is no current token (see `TokenPtr::bump`).
fn command(mut cursor: TokenPtr<ContentToken>) -> TokenPtr<ContentToken> {
    let tree = &mut cursor.builder;
    tree.start_node(COMMAND.into());
    cursor.bump();
    cursor.builder.finish_node();
    cursor
}
/// Parser cursor: a lexer for token type `T`, a one-token lookahead buffer and
/// the green-tree builder that receives the parsed tokens.
struct TokenPtr<'a, T>
where
    T: Logos<'a>,
{
    /// Tree builder that nodes and tokens are emitted into.
    builder: GreenNodeBuilder<'static>,
    /// Lexer producing tokens of type `T` from the remaining input.
    lexer: logos::Lexer<'a, T>,
    /// Token that has been lexed but not yet consumed, with its source text.
    token: Option<(T, &'a str)>,
}
impl<'a, T> TokenPtr<'a, T>
where
    T: Logos<'a, Source = str> + Eq + Copy + Into<SyntaxKind>,
    T::Extras: Default,
{
    /// Converts this cursor into one that lexes the remaining input with a
    /// different token type `U`, keeping the same tree builder.
    ///
    /// Lexing restarts at the beginning of the current (not yet consumed)
    /// token, so no input is lost in the switch.
    pub fn morph<U>(mut self) -> TokenPtr<'a, U>
    where
        U: Logos<'a, Source = str> + Eq + Copy + Into<SyntaxKind>,
        U::Extras: Default,
    {
        // Force a lookahead so the lexer span points at the unconsumed token.
        self.peek();
        let resume_at = self.lexer.span().start;
        let remaining = &self.lexer.source()[resume_at..];
        let lexer = U::lexer(remaining);
        TokenPtr {
            builder: self.builder,
            lexer,
            token: None,
        }
    }

    /// Returns `true` if the current token has the given kind.
    #[must_use]
    pub fn at(&mut self, kind: T) -> bool {
        matches!(self.peek(), Some((found, _)) if found == kind)
    }

    /// Returns the kind of the current token, or `None` at the end of input.
    #[must_use]
    pub fn current(&mut self) -> Option<T> {
        let (kind, _) = self.peek()?;
        Some(kind)
    }

    /// Consumes the current token and adds it to the tree.
    ///
    /// # Panics
    ///
    /// Panics if there is no current token.
    pub fn bump(&mut self) {
        let (kind, text) = self.peek().unwrap();
        // Clear the lookahead so the next `peek` advances the lexer.
        self.token = None;
        let syntax_kind: SyntaxKind = kind.into();
        self.builder
            .token(rowan::SyntaxKind::from(syntax_kind), text);
    }

    /// Consumes the current token if it has the given kind; otherwise does
    /// nothing.
    pub fn expect(&mut self, kind: T) {
        if !self.at(kind) {
            return;
        }
        self.bump();
    }

    /// Returns the current token and its text, lexing it first if the
    /// lookahead buffer is empty. Returns `None` at the end of input.
    ///
    /// # Panics
    ///
    /// Panics if the lexer reports an error for the next token.
    fn peek(&mut self) -> Option<(T, &'a str)> {
        if let Some(cached) = self.token {
            return Some(cached);
        }
        let kind = self.lexer.next()?.unwrap();
        let lexed = (kind, self.lexer.slice());
        self.token = Some(lexed);
        self.token
    }
}
/// Tokens recognised at the top level of a BibTeX file, i.e. outside of any
/// entry body.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Logos)]
enum RootToken {
    /// The `@preamble` keyword (ASCII case-insensitive).
    #[token(r"@preamble", ignore(ascii_case))]
    Preamble,
    /// The `@string` keyword (ASCII case-insensitive).
    #[token(r"@string", ignore(ascii_case))]
    String,
    /// The `@comment` keyword (ASCII case-insensitive).
    #[token(r"@comment", ignore(ascii_case))]
    Comment,
    /// `@` followed by zero or more ASCII letters: the type of a regular entry.
    #[regex(r"@[a-zA-Z]*")]
    Entry,
    /// A run of characters that contains no `@`.
    #[regex(r"[^@]+")]
    Junk,
}
/// Tokens recognised inside the body of a preamble, string or entry, outside
/// of values.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Logos)]
enum BodyToken {
    /// One or more whitespace characters.
    #[regex(r"\s+")]
    Whitespace,
    /// Opening delimiter: `{` or `(`.
    #[token("{")]
    #[token("(")]
    LDelim,
    /// Closing delimiter: `}` or `)`.
    #[token("}")]
    #[token(")")]
    RDelim,
    /// A `,`.
    #[token(",")]
    Comma,
    /// A `=`.
    #[token("=")]
    Eq,
    /// A run of characters that are not whitespace, delimiters, `@`, `,` or `=`.
    #[regex(r"[^\s\(\)\{\}@,=]+")]
    Name,
    /// A lone `@`. The parser functions in this file never consume it, and the
    /// `SyntaxKind` conversion panics on it.
    #[token("@")]
    Error,
}
/// Tokens recognised where a value is expected (right-hand side of a field or
/// string definition, or the preamble content).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Logos)]
enum ValueToken {
    /// One or more whitespace characters.
    #[regex(r"\s+")]
    Whitespace,
    /// A `#`, used by the parser as the concatenation operator.
    #[token("#")]
    Pound,
    /// A `,`.
    #[token(",")]
    Comma,
    /// A `{`.
    #[token("{")]
    LCurly,
    /// A `}`.
    #[token("}")]
    RCurly,
    /// A `"`.
    #[token("\"")]
    Quote,
    /// A run of digits.
    // NOTE(review): the explicit priority is presumably there to disambiguate
    // against `Name`, whose pattern also matches digit runs — confirm against
    // the logos priority rules.
    #[regex(r"\d+", priority = 3)]
    Integer,
    /// A run of characters that are not whitespace, `"`, `{`, `}`, `,` or `#`.
    #[regex(r#"[^\s"\{\},#]+"#)]
    Name,
}
/// Tokens recognised inside curly and quote groups.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, Logos)]
enum ContentToken {
    /// One or more whitespace characters.
    #[regex(r"\s+")]
    Whitespace,
    /// A `,`.
    #[token(",")]
    Comma,
    /// A `{`.
    #[token("{")]
    LCurly,
    /// A `}`.
    #[token("}")]
    RCurly,
    /// A `"`.
    #[token("\"")]
    Quote,
    /// A run of digits.
    // NOTE(review): the explicit priority is presumably there to disambiguate
    // against `Word`, whose pattern also matches digit runs — confirm against
    // the logos priority rules.
    #[regex(r"\d+", priority = 3)]
    Integer,
    /// A `~`.
    #[token(r#"~"#)]
    Nbsp,
    /// A backslash followed by one of the accent characters listed below.
    #[token(r#"\`"#)]
    #[token(r#"\'"#)]
    #[token(r#"\^"#)]
    #[token(r#"\""#)]
    #[token(r#"\H"#)]
    #[token(r#"\~"#)]
    #[token(r#"\c"#)]
    #[token(r#"\k"#)]
    #[token(r#"\="#)]
    #[token(r#"\b"#)]
    #[token(r#"\."#)]
    #[token(r#"\d"#)]
    #[token(r#"\r"#)]
    #[token(r#"\u"#)]
    #[token(r#"\v"#)]
    #[token(r#"\t"#)]
    AccentName,
    /// A backslash, optionally followed by either a single character other
    /// than CR/LF or a run of `@`, ASCII letters, `:` and `_` with an optional
    /// trailing `*`.
    #[regex(r"\\([^\r\n]|[@a-zA-Z:_]+\*?)?")]
    CommandName,
    /// A run of characters that are not whitespace, `"`, `{`, `}`, `\`, `~`
    /// or `,`.
    #[regex(r#"[^\s"\{\}\\~,]+"#)]
    Word,
}
impl From<RootToken> for SyntaxKind {
    /// Maps a top-level token to its syntax kind: junk becomes `JUNK`, every
    /// `@...` keyword becomes `TYPE`.
    fn from(token: RootToken) -> Self {
        match token {
            RootToken::Junk => JUNK,
            RootToken::Preamble
            | RootToken::String
            | RootToken::Comment
            | RootToken::Entry => TYPE,
        }
    }
}
impl From<BodyToken> for SyntaxKind {
    /// Maps a body token to its syntax kind.
    ///
    /// # Panics
    ///
    /// Panics for `BodyToken::Error`, which has no syntax kind.
    fn from(token: BodyToken) -> Self {
        match token {
            BodyToken::Error => unreachable!(),
            BodyToken::Name => NAME,
            BodyToken::Eq => EQ,
            BodyToken::Comma => COMMA,
            BodyToken::LDelim => L_DELIM,
            BodyToken::RDelim => R_DELIM,
            BodyToken::Whitespace => WHITESPACE,
        }
    }
}
impl From<ValueToken> for SyntaxKind {
    /// Maps a value token to its syntax kind.
    fn from(token: ValueToken) -> Self {
        match token {
            // Literals.
            ValueToken::Name => NAME,
            ValueToken::Integer => INTEGER,
            // Group delimiters.
            ValueToken::Quote => QUOTE,
            ValueToken::LCurly => L_CURLY,
            ValueToken::RCurly => R_CURLY,
            // Punctuation and trivia.
            ValueToken::Pound => POUND,
            ValueToken::Comma => COMMA,
            ValueToken::Whitespace => WHITESPACE,
        }
    }
}
impl From<ContentToken> for SyntaxKind {
    /// Maps a group-content token to its syntax kind.
    fn from(token: ContentToken) -> Self {
        match token {
            // Text.
            ContentToken::Word => WORD,
            ContentToken::Integer => INTEGER,
            // Backslash sequences.
            ContentToken::CommandName => COMMAND_NAME,
            ContentToken::AccentName => ACCENT_NAME,
            // Group delimiters.
            ContentToken::Quote => QUOTE,
            ContentToken::LCurly => L_CURLY,
            ContentToken::RCurly => R_CURLY,
            // Punctuation and trivia.
            ContentToken::Nbsp => NBSP,
            ContentToken::Comma => COMMA,
            ContentToken::Whitespace => WHITESPACE,
        }
    }
}
#[cfg(test)]
mod tests;