thi-ng/umbrella

View on GitHub
packages/parse/src/grammar.ts

Summary

Maintainability
A
1 hr
Test Coverage
import type { Fn } from "@thi.ng/api";
import type { MultiFn4 } from "@thi.ng/defmulti";
import { DEFAULT, defmulti } from "@thi.ng/defmulti/defmulti";
import { illegalArgs } from "@thi.ng/errors/illegal-arguments";
import { unsupported } from "@thi.ng/errors/unsupported";
import type {
    DynamicParser,
    GrammarOpts,
    Language,
    Parser,
    ParseScope,
    RuleTransforms,
} from "./api.js";
import { alt, altD } from "./combinators/alt.js";
import { dynamic } from "./combinators/dynamic.js";
import { lookahead } from "./combinators/lookahead.js";
import { maybe } from "./combinators/maybe.js";
import { not } from "./combinators/not.js";
import { oneOrMore, repeat, zeroOrMore } from "./combinators/repeat.js";
import { seq, seqD } from "./combinators/seq.js";
import { xform } from "./combinators/xform.js";
import { defContext } from "./context.js";
import { ALPHA, ALPHA_NUM } from "./presets/alpha.js";
import { BIT } from "./presets/bits.js";
import { DIGIT } from "./presets/digits.js";
import { ESC, UNICODE } from "./presets/escape.js";
import { HEX_DIGIT } from "./presets/hex.js";
import { FLOAT, INT, UINT } from "./presets/numbers.js";
import { STRING } from "./presets/string.js";
import { DNL, NL, WS, WS0, WS1 } from "./presets/whitespace.js";
import { always, alwaysD } from "./prims/always.js";
import {
    inputEnd,
    inputStart,
    lineEnd,
    lineStart,
    wordBoundary,
} from "./prims/anchor.js";
import { lit, litD } from "./prims/lit.js";
import { noneOf } from "./prims/none-of.js";
import { oneOf } from "./prims/one-of.js";
import { range, rangeD } from "./prims/range.js";
import { string, stringD } from "./prims/string.js";
import { collect, xfCollect } from "./xform/collect.js";
import { xfCount } from "./xform/count.js";
import { discard, xfDiscard } from "./xform/discard.js";
import { hoist, hoistResult, xfHoist, xfHoistResult } from "./xform/hoist.js";
import { join, xfJoin } from "./xform/join.js";
import { xfJson } from "./xform/json.js";
import { nest } from "./xform/nest.js";
import { xfFloat, xfInt } from "./xform/number.js";
import { print, xfPrint } from "./xform/print.js";
import { xfReplace } from "./xform/replace.js";
import { xfTrim } from "./xform/trim.js";
import { withID } from "./xform/with-id.js";

const apos = litD("'");
const dash = litD("-");

const REPEAT = maybe(
    alt([
        oneOf("?*+", "repeat"),
        collect(
            seq(
                [litD("{"), UINT, maybe(lit(",")), maybe(UINT), litD("}")],
                "repeatN"
            )
        ),
    ])
);

const DISCARD = maybe(lit("!"), undefined, "discard");

const CHAR_OR_ESC = alt([UNICODE, ESC, always()]);

const CHAR_RANGE = seq([CHAR_OR_ESC, dash, CHAR_OR_ESC], "charRange");

const CHAR_SEL = seq(
    [
        litD("["),
        maybe(lit("^", "invert")),
        oneOrMore(alt([CHAR_RANGE, UNICODE, noneOf("]", "char")]), "choice"),
        litD("]"),
    ],
    "charSel"
);

const ANY = lit(".", "any");

const LIT = hoistResult(seq([apos, CHAR_OR_ESC, apos], "char"));

const SYM = join(oneOrMore(alt([ALPHA_NUM, oneOf(".-_$")]), "sym"));

const RULE_REF = seq([litD("<"), SYM, litD(">")], "ref");

const TERM_BODY = alt([RULE_REF, ANY, LIT, STRING, CHAR_SEL]);

const LOOK_AHEAD = maybe(
    seq(
        [
            stringD("(?"),
            oneOf("-+"),
            seq([TERM_BODY, REPEAT, DISCARD], "lhterm"),
            litD(")"),
        ],
        "lhspec"
    ),
    undefined,
    "lhnone"
);

const TERM = seq([TERM_BODY, REPEAT, DISCARD, LOOK_AHEAD], "term");

const ALT = seq(
    [
        litD("("),
        WS0,
        TERM,
        zeroOrMore(seq([WS0, litD("|"), WS0, TERM])),
        WS0,
        litD(")"),
        REPEAT,
        DISCARD,
        LOOK_AHEAD,
    ],
    "alt"
);

const RULE_XF = hoist(
    seq([stringD("=>"), WS1, alt([SYM, RULE_REF, STRING]), WS1], "xform")
);

const RULE = seq(
    [
        WS0,
        SYM,
        WS0,
        litD(":"),
        oneOrMore(alt([TERM, ALT, WS1]), "body"),
        maybe(RULE_XF),
        litD(";"),
        WS0,
    ],
    "rule"
);

const COMMENT = seqD([WS0, litD("#"), lookahead(always(), DNL)]);

export const GRAMMAR = zeroOrMore(alt([RULE, COMMENT]), "rules");

const first = ($: ParseScope<any>) => $.children![0];

const nth = ($: ParseScope<any>, n: number) => $.children![n];

interface CompileFlags {
    discard?: boolean;
}

const compile: MultiFn4<
    ParseScope<string>,
    Language,
    GrammarOpts,
    CompileFlags,
    any
> = defmulti<ParseScope<string>, Language, GrammarOpts, CompileFlags, any>(
    (scope) => scope.id,
    {
        unicode: "char",
    },
    {
        [DEFAULT]: ($: ParseScope<string>) =>
            unsupported(`unknown op: ${$.id}`),
        root: ($, lang, opts, flags) => {
            const rules = first($).children!;
            rules.reduce(
                (acc, r) => ((acc[first(r).result] = dynamic()), acc),
                lang.rules
            );
            for (let r of rules) {
                const id = first(r).result;
                (<DynamicParser<string>>lang.rules[id]).set(
                    compile(r, lang, opts, flags)
                );
            }
            return lang;
        },
        rule: ($, lang, opts, flags) => {
            const [id, body, xf] = $.children!;
            opts.debug && console.log(`rule: ${id.result}`, xf);
            const acc: Parser<string>[] = [];
            for (let b of body.children!) {
                const c = compile(b, lang, opts, flags);
                c && acc.push(c);
            }
            let parser =
                acc.length > 1
                    ? seq(acc, id.result)
                    : withID(id.result, acc[0]);
            if (xf.id === "sym") {
                const $xf = lang.env[xf.result];
                if (!$xf) illegalArgs(`missing xform: ${xf.result}`);
                parser = xform(parser, $xf);
            } else if (xf.id === "ref") {
                const $id = first(xf).result;
                if ($id === id) illegalArgs(`self-referential: ${$id}`);
                const $xf = lang.rules[$id];
                if (!$xf) illegalArgs(`missing xform rule: ${$id}`);
                parser = nest(parser, $xf);
            } else if (xf.id === "string") {
                parser = xform(parser, xfReplace(xf.result));
            }
            return parser;
        },
        ref: ($, lang, opts, flags) => {
            const id = first($).result;
            opts.debug && console.log(`ref: ${id}`, flags);
            const ref = lang.rules[id];
            return ref
                ? flags.discard
                    ? discard(ref)
                    : ref
                : illegalArgs(`invalid rule ref: ${id}`);
        },
        term: ($, lang, opts, flags) => {
            const [term, repeat, discard, lookahead] = $!.children!;
            opts.debug && console.log(`term: ${term.id}`, flags);
            return compileRDL(
                (discard) => compile(term, lang, opts, { ...flags, discard }),
                repeat,
                discard,
                lookahead,
                lang,
                opts
            );
        },
        lhterm: ($, lang, opts, flags) => {
            const [term, repeat, discard] = $.children!;
            opts.debug && console.log(`lhterm: ${term.id}`);
            return compileRD(
                (discard) => compile(term, lang, opts, { ...flags, discard }),
                repeat,
                discard,
                opts
            );
        },
        alt: ($, lang, opts, flags) => {
            opts.debug && console.log(`alt: ${$.id}`, flags);
            const [term0, { children: terms }, repeat, disc, lookahead] =
                $.children!;
            const acc: Parser<string>[] = [compile(term0, lang, opts, flags)];
            if (terms) {
                for (let c of terms) {
                    acc.push(compile(first(c), lang, opts, flags));
                }
            }
            return compileRDL(
                (optimize) =>
                    optimize || flags.discard
                        ? acc.length > 1
                            ? altD(acc)
                            : discard(acc[0])
                        : acc.length > 1
                        ? alt(acc)
                        : acc[0],
                repeat,
                disc,
                lookahead,
                lang,
                opts
            );
        },
        any: (_, __, opts, flags) => {
            opts.debug && console.log(`any`, flags);
            return flags.discard ? alwaysD() : always("any");
        },
        char: ($, _, opts, flags) => {
            const x = $.result;
            opts.debug && console.log(`lit: '${x}'`, flags);
            return (flags.discard ? litD : lit)(x);
        },
        string: ($, _, opts, flags) => {
            const x = $.result;
            opts.debug && console.log(`string: "${x}"`, flags);
            return (flags.discard ? stringD : string)(x);
        },
        charRange: ($, _, opts, flags) => {
            const [a, b] = $.children!;
            opts.debug &&
                console.log(`range: ${a.result} - ${b.result}`, flags);
            return (flags.discard ? rangeD : range)(a.result, b.result);
        },
        charSel: ($, lang, opts, flags) => {
            opts.debug && console.log("charSel", flags);
            const choices = nth($, 1).children!.map((c) =>
                compile(c, lang, opts, flags)
            );
            const invert = first($).result;
            const parser = choices.length > 1 ? alt(choices) : choices[0];
            opts.debug && console.log(`invert: ${invert}`);
            return invert
                ? not(parser, flags.discard ? alwaysD() : always())
                : parser;
        },
    }
);

const compileRepeat = (
    parser: Parser<string>,
    rspec: ParseScope<string>,
    opts: GrammarOpts
) => {
    opts.debug && console.log(`repeat: ${rspec.id}`);
    if (rspec.id === "repeat") {
        switch (rspec.result) {
            case "?":
                return maybe(parser);
            case "*":
                return zeroOrMore(parser);
            case "+":
                return oneOrMore(parser);
            default:
                return parser;
        }
    } else if (rspec.id === "repeatN") {
        const [n, sep, m] = rspec.result;
        return repeat(parser, n, sep ? m || Infinity : m || n);
    }
    return parser;
};

const compileDiscard = (
    parser: Parser<string>,
    dspec: ParseScope<string>,
    opts: GrammarOpts
) => {
    opts.debug && console.log(`discard:`, dspec.result);
    return dspec.result === "!" ? discard(parser) : parser;
};

const compileLookahead = (
    parser: Parser<string>,
    spec: ParseScope<string>,
    lang: Language,
    opts: GrammarOpts
) => {
    opts.debug && console.log(`lookahead:`, spec.id);
    return spec.id === "lhspec"
        ? lookahead(
                parser,
                compile(nth(spec, 1), lang, opts, {}),
                first(spec).result === "+"
          )
        : parser;
};

const compileRD = (
    parser: Fn<boolean, Parser<string>>,
    rspec: ParseScope<string>,
    dspec: ParseScope<string>,
    opts: GrammarOpts
) =>
    dspec.result != null && rspec.result == null
        ? parser(true)
        : compileDiscard(
                compileRepeat(parser(false), rspec, opts),
                dspec,
                opts
          );

const compileRDL = (
    parser: Fn<boolean, Parser<string>>,
    rspec: ParseScope<string>,
    dspec: ParseScope<string>,
    lhspec: ParseScope<string>,
    lang: Language,
    opts: GrammarOpts
) =>
    compileLookahead(compileRD(parser, rspec, dspec, opts), lhspec, lang, opts);

export const defGrammar = (
    rules: string,
    env?: RuleTransforms,
    opts?: Partial<GrammarOpts>
) => {
    opts = { debug: false, optimize: true, ...opts };
    env = {
        binary: xfInt(2),
        collect: xfCollect,
        count: xfCount,
        discard: xfDiscard,
        float: xfFloat,
        hex: xfInt(16),
        hoist: xfHoist,
        hoistR: xfHoistResult,
        int: xfInt(10),
        join: xfJoin,
        json: xfJson,
        print: xfPrint(),
        trim: xfTrim,
        ...env,
    };
    const ctx = defContext(rules);
    const result = (opts.debug ? print(GRAMMAR) : GRAMMAR)(ctx);
    if (result) {
        return <Language>compile(
            ctx.root,
            {
                env,
                grammar: ctx,
                rules: {
                    ALPHA_NUM,
                    ALPHA,
                    BIT,
                    DIGIT,
                    DNL,
                    END: inputEnd,
                    ESC,
                    FLOAT,
                    HEX_DIGIT,
                    INT,
                    LEND: lineEnd,
                    LSTART: lineStart,
                    NL,
                    START: inputStart,
                    STRING,
                    UNICODE,
                    WB: wordBoundary,
                    WS,
                    WS0,
                    WS1,
                },
            },
            <GrammarOpts>opts,
            {}
        );
    }
};