src/main/java/innovimax/quixproc/datamodel/ValidQuiXTokenStream.java
/*
* QuiXProc: efficient evaluation of XProc Pipelines.
* Copyright (C) 2011-2018 Innovimax
* All rights reserved.
* Licensed under the Apache License, Version 2.0 (the "License");
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0*/
package innovimax.quixproc.datamodel;
import java.util.Arrays;
import java.util.EnumSet;
import innovimax.quixproc.datamodel.event.IQuiXEventStreamReader;
import innovimax.quixproc.datamodel.filter.AQuiXEventStreamFilter;
/**
* <p>
* The {@code ValidQuiXTokenStream} is a lightweight state machine that checks
* the following rules about {@code QuiXToken}
* </p>
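* <table>
* <tr>
* <th>sequence</th>
* <td>:=</td>
* <td>{@code START_SEQUENCE},
* (<b>document</b>|<b>json</b>|<b>rdf</b>|<b>table</b>)*,
* {@code END_SEQUENCE}</td>
* </tr>
* <tr>
* <th>document</th>
* <td>:=</td>
* <td>{@code START_DOCUMENT}, ({@code PROCESSING_INSTRUCTION}|{@code COMMENT}
* )*, <b>element</b>, ({@code PROCESSING_INSTRUCTION}|{@code COMMENT})*,
* {@code END_DOCUMENT}</td>
* </tr>
* <tr>
* <th>element</th>
* <td>:=</td>
* <td>{@code START_ELEMENT}, ({@code NAMESPACE}|{@code ATTRIBUTE})*, (
* {@code TEXT}|<b>element</b>|{@code PROCESSING_INSTRUCTION}|{@code COMMENT})*,
* {@code END_ELEMENT}</td>
* </tr>
* <tr>
* <th>json</th>
* <td>:=</td>
* <td>{@code START_JSON}, <b>object</b>, {@code END_JSON}</td>
* </tr>
* <tr>
* <th>object</th>
* <td>:=</td>
* <td>{@code START_OBJECT}, ({@code KEY_NAME}, <b>value</b>)*,
* {@code END_OBJECT}</td>
* </tr>
* <tr>
* <th>value</th>
* <td>:=</td>
* <td><b>object</b>|<b>array</b>|{@code VALUE_FALSE}|{@code VALUE_TRUE}|
* {@code VALUE_NUMBER}|{@code VALUE_NULL}|{@code VALUE_STRING}</td>
* </tr>
* <tr>
* <th>array</th>
* <td>:=</td>
* <td>{@code START_ARRAY}, <b>value</b>*, {@code END_ARRAY}</td>
* </tr>
* <tr>
* <th>rdf</th>
* <td>:=</td>
* <td>{@code START_RDF}, <b>statement</b>*, {@code END_RDF}</td>
* </tr>
* <tr>
* <th>statement</th>
* <td>:=</td>
* <td>{@code START_PREDICATE}, {@code SUBJECT}, {@code OBJECT},
* {@code GRAPH}?, {@code END_PREDICATE}</td>
* </tr>
* <tr>
* <th>table</th>
* <td>:=</td>
* <td>{@code START_TABLE}, {@code COLNAME}*, <b>array_of_array</b>,
* {@code END_TABLE}</td>
* </tr>
* <tr>
* <th>array_of_array</th>
* <td>:=</td>
* <td>{@code START_ARRAY}, <b>flat_array</b>+, {@code END_ARRAY}</td>
* </tr>
* <tr>
* <th>flat_array</th>
* <td>:=</td>
* <td>{@code START_ARRAY}, ({@code VALUE_FALSE}|{@code VALUE_TRUE}|
* {@code VALUE_NUMBER}|{@code VALUE_NULL}|{@code VALUE_STRING})*,
* {@code END_ARRAY}</td>
* </tr>
* </table>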
*
* @author innovimax
*
*/
public class ValidQuiXTokenStream extends AQuiXEventStreamFilter {

    /*
     * Tokens accepted in each state. These sets are shared constants and are
     * never modified after class initialisation; hoisting them avoids building
     * a fresh EnumSet for every token that goes through the filter.
     */
    private static final EnumSet<QuiXToken> EXPECT_NOTHING = EnumSet.noneOf(QuiXToken.class);
    private static final EnumSet<QuiXToken> EXPECT_START = EnumSet.of(QuiXToken.START_SEQUENCE);
    private static final EnumSet<QuiXToken> EXPECT_IN_SEQUENCE = EnumSet.of(QuiXToken.START_DOCUMENT,
            QuiXToken.START_JSON, QuiXToken.START_RDF, QuiXToken.START_TABLE, QuiXToken.END_SEQUENCE);
    private static final EnumSet<QuiXToken> EXPECT_IN_DOCUMENT = EnumSet.of(QuiXToken.PROCESSING_INSTRUCTION,
            QuiXToken.COMMENT, QuiXToken.START_ELEMENT);
    private static final EnumSet<QuiXToken> EXPECT_IN_DOCUMENT_AFTER_ROOT = EnumSet
            .of(QuiXToken.PROCESSING_INSTRUCTION, QuiXToken.COMMENT, QuiXToken.END_DOCUMENT);
    private static final EnumSet<QuiXToken> EXPECT_IN_ELEMENT = EnumSet.of(QuiXToken.NAMESPACE,
            QuiXToken.ATTRIBUTE, QuiXToken.TEXT, QuiXToken.PROCESSING_INSTRUCTION, QuiXToken.COMMENT,
            QuiXToken.START_ELEMENT, QuiXToken.END_ELEMENT);
    private static final EnumSet<QuiXToken> EXPECT_IN_CONTENT = EnumSet.of(QuiXToken.TEXT,
            QuiXToken.PROCESSING_INSTRUCTION, QuiXToken.COMMENT, QuiXToken.START_ELEMENT,
            QuiXToken.END_ELEMENT);
    private static final EnumSet<QuiXToken> EXPECT_IN_CONTENT_TEXT = EnumSet.of(
            QuiXToken.PROCESSING_INSTRUCTION, QuiXToken.COMMENT, QuiXToken.START_ELEMENT,
            QuiXToken.END_ELEMENT);
    private static final EnumSet<QuiXToken> EXPECT_IN_OBJECT = EnumSet.of(QuiXToken.KEY_NAME,
            QuiXToken.END_OBJECT);
    private static final EnumSet<QuiXToken> EXPECT_IN_OBJECT_VALUE = EnumSet.of(QuiXToken.VALUE_FALSE,
            QuiXToken.VALUE_TRUE, QuiXToken.VALUE_NULL, QuiXToken.VALUE_NUMBER, QuiXToken.VALUE_STRING,
            QuiXToken.START_ARRAY, QuiXToken.START_OBJECT);
    private static final EnumSet<QuiXToken> EXPECT_IN_ARRAY = EnumSet.of(QuiXToken.VALUE_FALSE,
            QuiXToken.VALUE_TRUE, QuiXToken.VALUE_NULL, QuiXToken.VALUE_NUMBER, QuiXToken.VALUE_STRING,
            QuiXToken.START_ARRAY, QuiXToken.START_OBJECT, QuiXToken.END_ARRAY);
    private static final EnumSet<QuiXToken> EXPECT_IN_JSON = EnumSet.of(QuiXToken.START_OBJECT);
    private static final EnumSet<QuiXToken> EXPECT_IN_JSON_AFTER_ROOT = EnumSet.of(QuiXToken.END_JSON);
    private static final EnumSet<QuiXToken> EXPECT_IN_RDF = EnumSet.of(QuiXToken.START_PREDICATE,
            QuiXToken.END_RDF);
    private static final EnumSet<QuiXToken> EXPECT_IN_PREDICATE = EnumSet.of(QuiXToken.SUBJECT);
    private static final EnumSet<QuiXToken> EXPECT_IN_PREDICATE_AFTER_SUBJECT = EnumSet.of(QuiXToken.OBJECT);
    private static final EnumSet<QuiXToken> EXPECT_IN_PREDICATE_AFTER_OBJECT = EnumSet.of(QuiXToken.GRAPH,
            QuiXToken.END_PREDICATE);
    private static final EnumSet<QuiXToken> EXPECT_IN_PREDICATE_AFTER_GRAPH = EnumSet
            .of(QuiXToken.END_PREDICATE);
    private static final EnumSet<QuiXToken> EXPECT_IN_TABLE = EnumSet.of(QuiXToken.COLNAME,
            QuiXToken.START_ARRAY);
    private static final EnumSet<QuiXToken> EXPECT_IN_TABLE_AFTER_ROOT = EnumSet.of(QuiXToken.END_TABLE);
    private static final EnumSet<QuiXToken> EXPECT_IN_ARRAY_OF_ARRAY = EnumSet.of(QuiXToken.START_ARRAY);
    private static final EnumSet<QuiXToken> EXPECT_IN_ARRAY_OF_ARRAY_AFTER_FIRST = EnumSet
            .of(QuiXToken.START_ARRAY, QuiXToken.END_ARRAY);
    private static final EnumSet<QuiXToken> EXPECT_IN_FLAT_ARRAY = EnumSet.of(QuiXToken.VALUE_FALSE,
            QuiXToken.VALUE_TRUE, QuiXToken.VALUE_NULL, QuiXToken.VALUE_NUMBER, QuiXToken.VALUE_STRING,
            QuiXToken.END_ARRAY);

    /** State the validator is currently in; decides which tokens are legal next. */
    private State state = State.START;

    /** Currently open nodes, innermost on top. */
    private final NodeStack stack = new NodeStack();

    /**
     * Wraps a token stream so that every token going through
     * {@link #process(IQuiXToken)} is checked against the grammar.
     *
     * @param stream
     *            the token stream to validate
     */
    public ValidQuiXTokenStream(final IQuiXStream<IQuiXToken> stream) {
        this(stream, ExtraProcess.NONE);
    }

    /**
     * Same as {@link #ValidQuiXTokenStream(IQuiXStream)}, starting from an event
     * reader that is first viewed as a token stream.
     *
     * @param stream
     *            the event reader to validate
     */
    public ValidQuiXTokenStream(final IQuiXEventStreamReader stream) {
        super(stream.asIQuiXTokenStream());
    }

    /**
     * Selector for optional additional checks. Only {@link #NONE} exists and
     * the value is currently ignored by the validator.
     */
    enum ExtraProcess {
        NONE,
    }

    private ValidQuiXTokenStream(final IQuiXStream<IQuiXToken> stream, final ExtraProcess process) {
        super(stream);
    }

    /** States of the validating automaton. */
    private enum State {
        START, IN_SEQUENCE,
        IN_DOCUMENT, IN_DOCUMENT_AFTER_ROOT, IN_ELEMENT, IN_CONTENT_TEXT, IN_CONTENT,
        IN_JSON, IN_JSON_AFTER_ROOT, IN_OBJECT, IN_OBJECT_VALUE, IN_ARRAY,
        IN_RDF, IN_PREDICATE, IN_PREDICATE_AFTER_SUBJECT, IN_PREDICATE_AFTER_OBJECT, IN_PREDICATE_AFTER_GRAPH,
        IN_TABLE, IN_TABLE_AFTER_ROOT, IN_ARRAY_OF_ARRAY, IN_ARRAY_OF_ARRAY_AFTER_FIRST, IN_FLAT_ARRAY,
        END
    }

    /** Kinds of nodes that can be open (and therefore sit on the stack). */
    private enum Node {
        DOCUMENT, ELEMENT, JSON, OBJECT, ARRAY, RDF, PREDICATE, TABLE, ARRAY_OF_ARRAY, FLAT_ARRAY
    }

    /**
     * Compact stack of {@link Node} values.
     * <p>
     * Consecutive pushes of the same node kind are run-length encoded in a
     * single {@code int}: the low bits ({@link #MASK}) hold the node ordinal and
     * the bits selected by {@link #UPPER_MASK} hold the number of
     * <em>additional</em> repetitions of that node. Deeply nested structures of
     * one kind (elements in elements, arrays in arrays) therefore use one slot.
     * </p>
     */
    private static final class NodeStack {
        /** Cached once: {@code Node.values()} clones its array on every call. */
        private static final Node[] NODES = Node.values();
        private static final int START_SIZE = 8;
        /** Smallest power of two strictly greater than the highest ordinal. */
        private static final int UNIT = Math.max(1, Integer.highestOneBit(NODES.length - 1) << 1);
        /** Selects the node ordinal inside a slot. */
        private static final int MASK = UNIT - 1;
        /** Largest value a slot is allowed to hold. */
        private static final int MAX_ALLOWED = Integer.MAX_VALUE >> 1;
        /** Selects the repetition counter inside a slot. */
        private static final int UPPER_MASK = MAX_ALLOWED ^ MASK;

        private int[] data = new int[START_SIZE];
        /** Index of the top slot, {@code -1} when the stack is empty. */
        private int pos = -1;

        /**
         * Pushes {@code node}. If the top slot already holds the same node kind
         * and its counter is not saturated, only the counter is incremented.
         */
        void push(final Node node) {
            final int value = node.ordinal();
            if (this.pos >= 0) {
                final int top = this.data[this.pos];
                // The counter must have room left, otherwise the increment
                // would spill outside UPPER_MASK and corrupt pop(); in that
                // case a new slot is started instead.
                if ((top & MASK) == value && (top & UPPER_MASK) != UPPER_MASK) {
                    this.data[this.pos] = top + UNIT;
                    return;
                }
            }
            this.pos++;
            if (this.pos >= this.data.length) {
                this.data = Arrays.copyOf(this.data, this.data.length * 3 / 2 + 1);
            }
            this.data[this.pos] = value;
        }

        /** @return {@code true} when no node is open */
        boolean empty() {
            return this.pos < 0;
        }

        /**
         * Removes and returns the innermost open node.
         *
         * @throws IllegalStateException
         *             if the stack is empty
         */
        Node pop() {
            checkNotEmpty();
            final int top = this.data[this.pos];
            if ((top & UPPER_MASK) == 0) {
                // last occurrence stored in this slot: release the slot
                this.pos--;
                return NODES[top];
            }
            // more occurrences remain: just decrement the counter
            this.data[this.pos] = top - UNIT;
            return NODES[top & MASK];
        }

        /**
         * Returns the innermost open node without removing it.
         *
         * @throws IllegalStateException
         *             if the stack is empty
         */
        Node peek() {
            checkNotEmpty();
            return NODES[this.data[this.pos] & MASK];
        }

        private void checkNotEmpty() {
            if (this.pos < 0) {
                throw new IllegalStateException("Node stack is empty");
            }
        }
    }

    /**
     * Validates one token against the current state and advances the state
     * machine.
     *
     * @param item
     *            the incoming token
     * @return the type of {@code item} when it is legal in the current state
     * @throws IllegalStateException
     *             if the token is not allowed in the current state, or if it
     *             closes a node that is not the innermost open one
     */
    @Override
    public IQuiXToken process(final IQuiXToken item) {
        final QuiXToken token = item.getType();
        switch (this.state) {
        case START:
            // sequence := START_SEQUENCE, (document|json|table|rdf)*, END_SEQUENCE
            accept(token, EXPECT_START);
            this.state = State.IN_SEQUENCE;
            break;
        case IN_SEQUENCE:
            accept(token, EXPECT_IN_SEQUENCE);
            onSequenceToken(token);
            break;
        case END:
            // nothing may follow END_SEQUENCE: the empty set rejects everything
            accept(token, EXPECT_NOTHING);
            break;

        // ---- XML ------------------------------------------------------
        case IN_DOCUMENT:
            // document := START_DOCUMENT, (PROCESSING-INSTRUCTION|COMMENT)*,
            // element, (PROCESSING-INSTRUCTION|COMMENT)*, END_DOCUMENT
            accept(token, EXPECT_IN_DOCUMENT);
            if (token == QuiXToken.START_ELEMENT) {
                enter(Node.ELEMENT, State.IN_ELEMENT);
            }
            break;
        case IN_DOCUMENT_AFTER_ROOT:
            accept(token, EXPECT_IN_DOCUMENT_AFTER_ROOT);
            if (token == QuiXToken.END_DOCUMENT) {
                acceptStackAndSetState(token, Node.DOCUMENT);
            }
            break;
        case IN_ELEMENT:
            // element := START_ELEMENT, (NAMESPACE|ATTRIBUTE)*, TEXT?,
            // ((element|PROCESSING-INSTRUCTION|COMMENT)+, TEXT)*,
            // (element|PROCESSING-INSTRUCTION|COMMENT)*, END_ELEMENT
            accept(token, EXPECT_IN_ELEMENT);
            onElementToken(token);
            break;
        case IN_CONTENT:
            // past the NAMESPACE/ATTRIBUTE part, last token was not TEXT
            accept(token, EXPECT_IN_CONTENT);
            onElementToken(token);
            break;
        case IN_CONTENT_TEXT:
            // directly after a TEXT: a second consecutive TEXT is rejected
            accept(token, EXPECT_IN_CONTENT_TEXT);
            onElementToken(token);
            break;

        // ---- JSON -----------------------------------------------------
        case IN_JSON:
            // json := START_JSON, object, END_JSON
            accept(token, EXPECT_IN_JSON);
            enter(Node.OBJECT, State.IN_OBJECT);
            break;
        case IN_JSON_AFTER_ROOT:
            accept(token, EXPECT_IN_JSON_AFTER_ROOT);
            acceptStackAndSetState(token, Node.JSON);
            break;
        case IN_OBJECT:
            // object := START_OBJECT, (KEY_NAME, value)*, END_OBJECT
            accept(token, EXPECT_IN_OBJECT);
            if (token == QuiXToken.KEY_NAME) {
                this.state = State.IN_OBJECT_VALUE;
            } else if (token == QuiXToken.END_OBJECT) {
                acceptStackAndSetState(token, Node.OBJECT);
            }
            break;
        case IN_OBJECT_VALUE:
            // value := object|array|VALUE_FALSE|VALUE_TRUE|VALUE_NUMBER|
            // VALUE_NULL|VALUE_STRING
            accept(token, EXPECT_IN_OBJECT_VALUE);
            if (token == QuiXToken.START_OBJECT) {
                enter(Node.OBJECT, State.IN_OBJECT);
            } else if (token == QuiXToken.START_ARRAY) {
                enter(Node.ARRAY, State.IN_ARRAY);
            } else {
                // scalar value: back to expecting a key or END_OBJECT
                this.state = State.IN_OBJECT;
            }
            break;
        case IN_ARRAY:
            // array := START_ARRAY, value*, END_ARRAY
            accept(token, EXPECT_IN_ARRAY);
            if (token == QuiXToken.START_ARRAY) {
                enter(Node.ARRAY, State.IN_ARRAY);
            } else if (token == QuiXToken.START_OBJECT) {
                enter(Node.OBJECT, State.IN_OBJECT);
            } else if (token == QuiXToken.END_ARRAY) {
                acceptStackAndSetState(token, Node.ARRAY);
            }
            // scalar values: stay in this state
            break;

        // ---- RDF ------------------------------------------------------
        case IN_RDF:
            // semantic := START_RDF, statement*, END_RDF
            accept(token, EXPECT_IN_RDF);
            if (token == QuiXToken.START_PREDICATE) {
                enter(Node.PREDICATE, State.IN_PREDICATE);
            } else if (token == QuiXToken.END_RDF) {
                acceptStackAndSetState(token, Node.RDF);
            }
            break;
        case IN_PREDICATE:
            // statement := START_PREDICATE, SUBJECT, OBJECT, GRAPH?,
            // END_PREDICATE
            accept(token, EXPECT_IN_PREDICATE);
            this.state = State.IN_PREDICATE_AFTER_SUBJECT;
            break;
        case IN_PREDICATE_AFTER_SUBJECT:
            accept(token, EXPECT_IN_PREDICATE_AFTER_SUBJECT);
            this.state = State.IN_PREDICATE_AFTER_OBJECT;
            break;
        case IN_PREDICATE_AFTER_OBJECT:
            accept(token, EXPECT_IN_PREDICATE_AFTER_OBJECT);
            if (token == QuiXToken.GRAPH) {
                this.state = State.IN_PREDICATE_AFTER_GRAPH;
            } else if (token == QuiXToken.END_PREDICATE) {
                acceptStackAndSetState(token, Node.PREDICATE);
            }
            break;
        case IN_PREDICATE_AFTER_GRAPH:
            accept(token, EXPECT_IN_PREDICATE_AFTER_GRAPH);
            acceptStackAndSetState(token, Node.PREDICATE);
            break;

        // ---- table ----------------------------------------------------
        case IN_TABLE:
            // table := START_TABLE, header*, array_of_array, END_TABLE
            accept(token, EXPECT_IN_TABLE);
            if (token == QuiXToken.START_ARRAY) {
                enter(Node.ARRAY_OF_ARRAY, State.IN_ARRAY_OF_ARRAY);
            }
            // COLNAME: stay in this state
            break;
        case IN_TABLE_AFTER_ROOT:
            accept(token, EXPECT_IN_TABLE_AFTER_ROOT);
            acceptStackAndSetState(token, Node.TABLE);
            break;
        case IN_ARRAY_OF_ARRAY:
            // array_of_array := START_ARRAY, array+, END_ARRAY
            accept(token, EXPECT_IN_ARRAY_OF_ARRAY);
            enter(Node.FLAT_ARRAY, State.IN_FLAT_ARRAY);
            break;
        case IN_ARRAY_OF_ARRAY_AFTER_FIRST:
            accept(token, EXPECT_IN_ARRAY_OF_ARRAY_AFTER_FIRST);
            if (token == QuiXToken.START_ARRAY) {
                enter(Node.FLAT_ARRAY, State.IN_FLAT_ARRAY);
            } else if (token == QuiXToken.END_ARRAY) {
                acceptStackAndSetState(token, Node.ARRAY_OF_ARRAY);
            }
            break;
        case IN_FLAT_ARRAY:
            // flat_array := START_ARRAY, flat_value*, END_ARRAY
            // flat_value :=
            // VALUE_FALSE|VALUE_TRUE|VALUE_NUMBER|VALUE_NULL|VALUE_STRING
            accept(token, EXPECT_IN_FLAT_ARRAY);
            if (token == QuiXToken.END_ARRAY) {
                acceptStackAndSetState(token, Node.FLAT_ARRAY);
            }
            // scalar values: stay in this state
            break;
        default:
            // every State constant is handled above
            throw new IllegalStateException("Unhandled validator state " + this.state);
        }
        // NOTE(review): the token *type* is returned, not {@code item} itself;
        // confirm that downstream consumers do not need the original item.
        return token;
    }

    /** Handles a token already accepted in {@code IN_SEQUENCE}. */
    private void onSequenceToken(final QuiXToken token) {
        switch (token) {
        case START_DOCUMENT:
            enter(Node.DOCUMENT, State.IN_DOCUMENT);
            break;
        case START_JSON:
            enter(Node.JSON, State.IN_JSON);
            break;
        case START_RDF:
            enter(Node.RDF, State.IN_RDF);
            break;
        case START_TABLE:
            enter(Node.TABLE, State.IN_TABLE);
            break;
        case END_SEQUENCE:
            this.state = State.END;
            break;
        default:
            break;
        }
    }

    /**
     * Handles a token already accepted in one of the three element states
     * ({@code IN_ELEMENT}, {@code IN_CONTENT}, {@code IN_CONTENT_TEXT}); the
     * transitions are the same, only the accepted sets differ.
     */
    private void onElementToken(final QuiXToken token) {
        switch (token) {
        case TEXT:
            this.state = State.IN_CONTENT_TEXT;
            break;
        case PROCESSING_INSTRUCTION:
        case COMMENT:
            this.state = State.IN_CONTENT;
            break;
        case START_ELEMENT:
            // A child element always starts in IN_ELEMENT so that its own
            // NAMESPACE/ATTRIBUTE/TEXT tokens are accepted, whatever content
            // preceded it in the parent.
            enter(Node.ELEMENT, State.IN_ELEMENT);
            break;
        case END_ELEMENT:
            acceptStackAndSetState(token, Node.ELEMENT);
            break;
        default:
            // NAMESPACE / ATTRIBUTE: stay in IN_ELEMENT
            break;
        }
    }

    /** Records {@code node} as opened and moves to {@code next}. */
    private void enter(final Node node, final State next) {
        this.stack.push(node);
        this.state = next;
    }

    /**
     * Closes the innermost open node, which must be {@code node}, and restores
     * the state that applies inside its parent.
     *
     * @param token
     *            the closing token, used for error reporting only
     * @param node
     *            the node kind that {@code token} closes
     * @throws IllegalStateException
     *             if no node is open or the innermost open node is of another
     *             kind
     */
    private void acceptStackAndSetState(final QuiXToken token, final Node node) {
        if (this.stack.empty()) {
            throw new IllegalStateException(
                    "Invalid state " + token + ". Closing a node " + node + " that is not opened");
        }
        final Node last = this.stack.pop();
        if (last != node) {
            throw new IllegalStateException(
                    "Invalid state " + token + ". Closing a node " + node + " while last open is a " + last);
        }
        if (this.stack.empty()) {
            // a top-level item was closed: back in the sequence
            this.state = State.IN_SEQUENCE;
            return;
        }
        switch (this.stack.peek()) {
        case DOCUMENT:
            this.state = State.IN_DOCUMENT_AFTER_ROOT;
            break;
        case JSON:
            this.state = State.IN_JSON_AFTER_ROOT;
            break;
        case ELEMENT:
            this.state = State.IN_CONTENT;
            break;
        case OBJECT:
            this.state = State.IN_OBJECT;
            break;
        case ARRAY:
            this.state = State.IN_ARRAY;
            break;
        case PREDICATE:
        case RDF:
            this.state = State.IN_RDF;
            break;
        case TABLE:
            this.state = State.IN_TABLE_AFTER_ROOT;
            break;
        case ARRAY_OF_ARRAY:
            this.state = State.IN_ARRAY_OF_ARRAY_AFTER_FIRST;
            break;
        default:
            // FLAT_ARRAY cannot be a parent: nothing is ever opened inside it
            break;
        }
    }

    /**
     * Checks that {@code token} belongs to {@code expecteds}.
     *
     * @throws IllegalStateException
     *             if it does not
     */
    private static void accept(final QuiXToken token, final EnumSet<QuiXToken> expecteds) {
        if (!expecteds.contains(token)) {
            throw new IllegalStateException("Invalid state " + token
                    + ". One of the following state was expected: " + expecteds.toString());
        }
    }
}