tom-weatherhead/thaw-parser

View on GitHub
src/lalr1-parser.ts

Summary

Maintainability
A
0 mins
Test Coverage
// tom-weatherhead/thaw-parser/src/lalr1-parser.ts

import { createSet, IImmutableSet, ISet, Stack } from 'thaw-common-utilities.ts';

import { GrammarSymbol, IGrammar } from 'thaw-interpreter-types';

import { CFSMState, LR0Configuration, LR0Parser } from './lr0-parser';

import { LR1Configuration, LR1Parser } from './lr1-parser';

import { ShiftReduceAction } from './shift-reduce-actions';

import { InternalErrorException } from './exceptions/internal-error';
import { ReduceReduceConflictException } from './exceptions/reduce-reduce-conflict';
import { ShiftReduceConflictException } from './exceptions/shift-reduce-conflict';
// import { ParserException } from './exceptions/parser';
// import { SyntaxException } from './exceptions/syntax';

export class LALR1Configuration extends LR0Configuration {
    public static fromLR0(
        c: LR0Configuration,
        lookaheads: IImmutableSet<GrammarSymbol>
    ): LALR1Configuration {
        const result = new LALR1Configuration(c.ProductionLHS);

        result.ProductionRHS.push(...c.ProductionRHS);
        result.Lookaheads.unionInPlace(lookaheads);

        return result;
    }

    public static fromLR1(c: LR1Configuration, ...lookaheads: GrammarSymbol[]): LALR1Configuration {
        const result = new LALR1Configuration(c.ProductionLHS, c.Lookahead);

        result.ProductionRHS.push(...c.ProductionRHS);

        for (const lookahead of lookaheads) {
            result.Lookaheads.add(lookahead);
        }

        return result;
    }

    public readonly Lookaheads = createSet<GrammarSymbol>();
    public readonly PropagateLinks = createSet<LALR1Configuration>();

    constructor(lhs: GrammarSymbol, ...looks: GrammarSymbol[]) {
        super(lhs);

        for (const look of looks) {
            this.Lookaheads.add(look);
        }
    }

    public override equals(other: unknown): boolean {
        const otherConfig = other as LALR1Configuration;

        // TODO: Try this:
        // if (otherConfig === this) {
        //     return true;
        // }

        if (
            typeof otherConfig === 'undefined' ||
            !(other instanceof LALR1Configuration) ||
            !super.equals(other) ||
            !this.Lookaheads.equals(otherConfig.Lookaheads)
        ) {
            return false;
        }

        return true;
    }

    // public override int GetHashCode() {
    //     /*
    //     int hashCode = base.GetHashCode() * 103;
    //
    //     foreach (Symbol symbol in Lookaheads)
    //     {
    //         hashCode += symbol.GetHashCode();   // The order of the lookahead symbols does not alter the hash code.
    //     }
    //
    //     return hashCode;
    //      */
    //
    //     // The order of the lookahead symbols does not alter the hash code.
    //     return Lookaheads
    //         .Select(symbol => symbol.GetHashCode())
    //         .Aggregate(base.GetHashCode() * 103, (accumulator, hashCode) => accumulator + hashCode);
    // }

    public override toString(): string {
        // return `${super.toString()}; ${this.Lookahead}`;

        return 'LALR1Configuration.toString()';
    }

    // The "new" keyword is used here (in C#) because this function hides
    // a function in the base class which differs only by return type.

    public override AdvanceDot(): LALR1Configuration {
        return LALR1Configuration.fromLR0(super.AdvanceDot(), this.Lookaheads);
    }

    public ToLR0Configuration(): LR0Configuration {
        // Unit tests show that this is necessary.
        return new LR0Configuration(this.ProductionLHS, this.ProductionRHS);
    }
}

export class LALR1CFSMState {
    public readonly Transitions = new Map<GrammarSymbol, LALR1CFSMState>();

    constructor(public readonly ConfigurationSet: IImmutableSet<LALR1Configuration>) {}

    public equals(other: unknown): boolean {
        // TODO: Find a better implementation for this function.  Beware of cycles in the finite state machine (or ignore the transitions in this function).

        // TODO: Should we also consider Transitions.Keys?

        const that = other as LALR1CFSMState;

        return typeof that !== 'undefined' && this.ConfigurationSet.equals(that.ConfigurationSet);
    }

    // public override int GetHashCode() {
    //     // TODO: Find a better implementation for this function.  Beware of cycles in the finite state machine (or ignore the transitions in this function).
    //     /*
    //     int hashCode = 0;
    //
    //     foreach (LALR1Configuration conf in ConfigurationSet)
    //     {
    //         // The order of the configurations in the set doesn't affect the hash code.
    //         hashCode += conf.GetHashCode();
    //     }
    //
    //     // TODO: Should we also consider Transitions.Keys?
    //     return hashCode;
    //      */
    //     return ConfigurationSet
    //         .Select(conf => conf.GetHashCode())
    //         .Aggregate(0, (accumulator, hashCode) => accumulator + hashCode);
    // }
}

export class LALR1CFSM {
    public readonly StateList: LALR1CFSMState[];
    public readonly StartState: LALR1CFSMState;

    constructor(ss: LALR1CFSMState) {
        this.StartState = ss;
        this.StateList = [ss];
    }

    public FindStateWithLabel(cs: IImmutableSet<LALR1Configuration>): LALR1CFSMState {
        const result = this.StateList.find((state: LALR1CFSMState) =>
            state.ConfigurationSet.equals(cs)
        );

        if (typeof result === 'undefined') {
            throw new InternalErrorException(
                `LALR1CFSM.FindStateWithLabel() : No matching state found; label size == ${cs.size}.`
            );
        }

        return result;
    }
}

export class LALR1StateSymbolPair {
    constructor(public readonly state: LALR1CFSMState, public readonly symbol: GrammarSymbol) {}

    public equals(other: unknown): boolean {
        const that = other as LALR1StateSymbolPair;

        return (
            typeof that !== 'undefined' &&
            this.state.equals(that.state) &&
            this.symbol === that.symbol
        );
    }
}

export class LookaheadPropagationRecord {
    constructor(
        public readonly configuration: LALR1Configuration,
        public readonly lookahead: GrammarSymbol
    ) {}
}

export class LALR1Parser extends LR0Parser {
    private readonly cognateDict = new Map<string, IImmutableSet<LALR1Configuration>>();

    constructor(g: IGrammar) {
        super(g);

        if (this.machine.StateList.every((state: CFSMState) => state.Transitions.size === 0)) {
            throw new InternalErrorException('All machine states have zero transitions.');
        }

        this.BuildLALR1CFSM();
    }

    // public LALR1Parser(GrammarSelector gs)
    //     : this(GrammarFactory.Create(gs))
    // {
    // }

    // See Fischer and LeBlanc, page 167.

    private Core(s: IImmutableSet<LALR1Configuration>): IImmutableSet<LR0Configuration> {
        const result = createSet<LR0Configuration>();

        for (const c of s.toArray()) {
            result.add(c.ToLR0Configuration());
        }

        return result;
    }

    private CognateHelper(
        s: IImmutableSet<LALR1Configuration>,
        cognateResult: ISet<LALR1Configuration>,
        configDict: Map<LR0Configuration, LALR1Configuration>
    ): void {
        for (const c of s.toArray()) {
            const lr0Config = c.ToLR0Configuration();
            const c2 = configDict.get(lr0Config);

            if (typeof c2 !== 'undefined') {
                if (!c.Lookaheads.isASubsetOf(c2.Lookaheads)) {
                    // Note: This probably changes the hash code of c2; cognateResult may need to be rehashed, if possible.
                    // A better solution: Remove c2 from cognateResult before modifying c2, and re-add it afterwards:
                    cognateResult.remove(c2);
                    c2.Lookaheads.unionInPlace(c.Lookaheads);
                    cognateResult.add(c2);
                }
            } else {
                cognateResult.add(c);
                configDict.set(lr0Config, c);
            }
        }
    }

    // See Fischer and LeBlanc, page 167.

    private Cognate1(
        s_bar: IImmutableSet<LR0Configuration>,
        stateList: IImmutableSet<LALR1Configuration>[]
    ): IImmutableSet<LALR1Configuration> {
        const result = createSet<LALR1Configuration>();
        const configDict = new Map<LR0Configuration, LALR1Configuration>();

        for (const s of stateList) {
            const s_core = this.Core(s);

            if (!s_bar.equals(s_core)) {
                continue;
            }

            // Solution 1:
            //result.UnionWith(s);    // Question: Could this inadvertently cause result to contain multiple configurations that differ only by lookahead?

            // Solution 2:
            this.CognateHelper(s, result, configDict);
        }

        return result;
    }

    private Cognate2(s_bar: CFSMState, lalr1machine: LALR1CFSM): IImmutableSet<LALR1Configuration> {
        const result = createSet<LALR1Configuration>();
        const configDict = new Map<LR0Configuration, LALR1Configuration>();

        for (const s of lalr1machine.StateList) {
            const s_core = this.Core(s.ConfigurationSet);

            if (!s_bar.ConfigurationSet.equals(s_core)) {
                continue;
            }

            // Solution 1:
            //result.UnionWith(s.ConfigurationSet);    // Question: Could this inadvertently cause result to contain multiple configurations that differ only by lookahead?

            // Solution 2:
            this.CognateHelper(s.ConfigurationSet, result, configDict);
        }

        return result;
    }

    private BuildLALR1CFSM(): void {
        if (this.machine.StateList.length === 0) {
            throw new InternalErrorException('machine.StateList is empty');
        }

        // 1) Create the machine object with all of its states.
        // See Fischer and LeBlanc, page 167.

        // 1a) Create the start state so that the machine object can be created.
        const stateList: IImmutableSet<LALR1Configuration>[] = [];
        const lr1parser = new LR1Parser(this.grammar);

        for (const lr1State of lr1parser.machine.StateList) {
            const cs = createSet<LALR1Configuration>();

            for (const c of lr1State.ConfigurationSet) {
                cs.add(LALR1Configuration.fromLR1(c));
            }

            stateList.push(cs);
        }

        for (const lr0state of this.machine.StateList) {
            this.cognateDict.set(
                lr0state.toString(),
                this.Cognate1(lr0state.ConfigurationSet, stateList)
            );
        }

        const ss = this.cognateDict.get(this.machine.StartState.toString());

        if (typeof ss === 'undefined') {
            throw new InternalErrorException('ss is undefined');
        }

        const startState = new LALR1CFSMState(ss);
        const result = new LALR1CFSM(startState);

        // 1b) Create the other states.

        for (const lr0state of this.machine.StateList) {
            const cognate = this.cognateDict.get(lr0state.toString());

            if (typeof cognate === 'undefined') {
                throw new InternalErrorException('cognate is undefined');
            }

            const lalr1State = new LALR1CFSMState(cognate);

            // if (!result.StateList.Contains(lalr1State))
            if (
                typeof result.StateList.find((s: LALR1CFSMState) => lalr1State.equals(s)) ===
                'undefined'
            ) {
                result.StateList.push(lalr1State);
            }
        }

        // 2) Add the transitions.

        if (result.StateList.length === 0) {
            throw new InternalErrorException('result.StateList is empty');
        }

        for (const lr0state of this.machine.StateList) {
            const cognate2 = this.cognateDict.get(lr0state.toString());

            if (typeof cognate2 === 'undefined') {
                throw new InternalErrorException('cognate2 is undefined');
            }

            const lalr1State = result.FindStateWithLabel(cognate2);

            if (typeof lalr1State === 'undefined') {
                throw new InternalErrorException('lalr1State is undefined');
            }

            const transitionsKeys = Array.from(lr0state.Transitions.keys());

            // if (transitionsKeys.length === 0) {
            //     // BUG 2021-08-09 : It blows up here.
            //     console.error('List of transitionsKeys is empty');
            //     console.error('lr0state is:', lr0state);
            //     console.error('lr0state.Transitions is:', lr0state.Transitions);
            //     throw new InternalErrorException('List of transitionsKeys is empty');
            // }

            for (const symbol of transitionsKeys) {
                const transition = lr0state.Transitions.get(symbol);

                if (typeof transition === 'undefined') {
                    throw new InternalErrorException('transition is undefined');
                }

                const cognate3 = this.cognateDict.get(transition.toString());

                if (typeof cognate3 === 'undefined') {
                    throw new InternalErrorException('cognate3 is undefined');
                }

                const lalr1StateDest = result.FindStateWithLabel(cognate3);

                // BUG 2021-08-09 : It seems like no transitions are being set. (When unit testing with the Chapter1 grammar.)
                lalr1State.Transitions.set(symbol, lalr1StateDest);
            }

            // if (lalr1State.Transitions.size === 0) {
            //     throw new InternalErrorException('lalr1State.Transitions is empty');
            // }
        }

        // 3) Add the lookaheads.
        // See Fischer and LeBlanc, pages 171-173.

        for (const s of result.StateList) {
            for (const c of s.ConfigurationSet.toArray()) {
                c.Lookaheads.clear(); // Test; hack.  The real lookahead symbols will be added below.
            }
        }

        // 3a) Create the propagate links between configurations:
        //   i) When one configuration is created from another in a previous state via a shift operation.

        for (const s of result.StateList) {
            for (const c of s.ConfigurationSet.toArray()) {
                const symbol = c.FindSymbolAfterDot();

                if (typeof symbol === 'undefined') {
                    continue;
                }

                const nextState = s.Transitions.get(symbol);

                if (typeof nextState === 'undefined') {
                    continue;
                }

                const configurationToMatch = c.AdvanceDot();

                for (const cNext of nextState.ConfigurationSet.toArray()) {
                    if (cNext.equals(configurationToMatch)) {
                        c.PropagateLinks.add(cNext);
                        break;
                    }
                }
            }
        }

        //   ii) When a configuration is created as a result of a closure or prediction operation on another configuration.
        //     - A -> dot alpha, L2; B -> beta dot A gamma, L1
        //       - ThAW note: It appears that these two configurations will always be in the same state if the first conf results from the closure of the second.
        //         ? Is this also true in the case of prediction?
        //     1) Spontaneous: L2 = First(gamma), which does not include lambda.
        //     2) Propagate: When gamma can derive lambda.

        for (const s of result.StateList) {
            for (const c of s.ConfigurationSet.toArray()) {
                const symbol = c.FindSymbolAfterDot();

                if (typeof symbol === 'undefined') {
                    continue;
                }

                for (const cNext of s.ConfigurationSet.toArray()) {
                    if (cNext.ProductionLHS !== symbol || cNext.FindDot() !== 0) {
                        continue;
                    }

                    // cNext is of the form "A -> dot alpha", where A == symbol.

                    const gamma = c.FindSuffix(1);

                    if (typeof gamma === 'undefined') {
                        continue;
                    }

                    const firstOfGamma = this.computeFirst(gamma);
                    const gammaCanDeriveLambda = firstOfGamma.contains(GrammarSymbol.Lambda);

                    if (gammaCanDeriveLambda) {
                        // Propagate (adjective) lookaheads.
                        c.PropagateLinks.add(cNext);

                        // Should we also add the non-lambda members of firstOfGamma to cNext.Lookaheads?
                        firstOfGamma.remove(GrammarSymbol.Lambda);
                    }

                    cNext.Lookaheads.unionInPlace(firstOfGamma); // Add spontaneous lookaheads.
                }
            }
        }

        // 3b) Add the spontaneous lookaheads: the non-lambda values of First(gamma). : Done above.
        //  - Also initialize the lookahead set of the initial configuration to the empty set. : Done by default.

        // 3c) Propagate the lookaheads.
        const lookaheadPropagationStack = new Stack<LookaheadPropagationRecord>();

        for (const s of result.StateList) {
            for (const c of s.ConfigurationSet.toArray()) {
                for (const l of c.Lookaheads.toArray()) {
                    lookaheadPropagationStack.push(new LookaheadPropagationRecord(c, l));
                }
            }
        }

        while (!lookaheadPropagationStack.isEmpty()) {
            const lpr = lookaheadPropagationStack.pop();

            for (const cLinked of lpr.configuration.PropagateLinks.toArray()) {
                if (!cLinked.Lookaheads.contains(lpr.lookahead)) {
                    cLinked.Lookaheads.add(lpr.lookahead);
                    lookaheadPropagationStack.push(
                        new LookaheadPropagationRecord(cLinked, lpr.lookahead)
                    );
                }
            }
        }

        // Clear cognateDict and re-populate it with configurations from the newly created LALR(1) machine
        // in order to ensure that the values of cognateDict contain all of the correct lookaheads?

        this.cognateDict.clear();

        for (const lr0state of this.machine.StateList) {
            this.cognateDict.set(lr0state.toString(), this.Cognate2(lr0state, result));
        }
    }

    // Adapted from Fischer and LeBlanc, pages 167-168.

    private GetActionLALR(
        S: CFSMState,
        tokenAsSymbol: GrammarSymbol
    ): { reduceProductionNum: number; action: ShiftReduceAction } {
        let result = ShiftReduceAction.Error;
        let reduceResultFound = false; // In order for the grammar to be LALR(1), there must be at most one result per state-symbol pair.
        let reduceProductionNum = -1;

        // 1) Search for Reduce actions.
        const cognateS = this.cognateDict.get(S.toString());

        if (typeof cognateS === 'undefined') {
            throw new InternalErrorException('GetActionLALR() : cognateS is undefined');
        }

        for (const c of cognateS.toArray()) {
            if (!c.Lookaheads.contains(tokenAsSymbol)) {
                continue;
            }

            const matchedProduction = c.ConvertToProductionIfAllMatched();

            if (typeof matchedProduction === 'undefined') {
                continue;
            }

            for (let i = 0; i < this.grammar.productions.length; ++i) {
                const productionToCompare = this.grammar.productions[i].stripOutSemanticActions();

                if (matchedProduction.equals(productionToCompare)) {
                    if (reduceResultFound && reduceProductionNum !== i) {
                        throw new ReduceReduceConflictException(
                            `GetActionLALR() : Multiple actions found: productions ${this.grammar.productions[reduceProductionNum]} and ${this.grammar.productions[i]}; grammar is not LALR(1).`
                        );
                    }

                    result = ShiftReduceAction.Reduce;
                    reduceProductionNum = i;
                    reduceResultFound = true;
                }
            }
        }

        // 2) Search for Shift and Accept actions.
        const shiftOrAcceptResultFound = S.ConfigurationSet.toArray().some(
            (c: LR0Configuration) => c.FindSymbolAfterDot() === tokenAsSymbol
        );

        if (shiftOrAcceptResultFound) {
            if (reduceResultFound) {
                const configurationToShift = S.ConfigurationSet.toArray().find(
                    (c: LR0Configuration) => c.FindSymbolAfterDot() === tokenAsSymbol
                );

                throw new ShiftReduceConflictException(
                    `GetActionLALR() : Multiple actions found: shift: ${configurationToShift}; reduce: ${this.grammar.productions[reduceProductionNum]}; grammar is not LALR(1).`
                );
            }

            result =
                tokenAsSymbol == GrammarSymbol.terminalEOF
                    ? ShiftReduceAction.Accept
                    : ShiftReduceAction.Shift;
        }

        return {
            reduceProductionNum,
            action: result
        };
    }

    protected override GetActionCaller(
        S: CFSMState,
        tokenAsSymbol: GrammarSymbol
    ): { reduceProductionNum: number; action: ShiftReduceAction } {
        return this.GetActionLALR(S, tokenAsSymbol);
    }
}