src/js-scanner.ts
/*!
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Copyright (C) 2020 jeffy-g <hirotom1107@gmail.com>
Released under the MIT license
https://opensource.org/licenses/mit-license.php
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/// <reference path="./index.d.ts"/>
import * as util from "./reutil";
const {
detectNewLine,
detectRegex,
lookupRegexes
} = util;
type TScannerContext = {
/** The source offset currently being scanned is recorded. (read, write) */
offset: number;
/** new line character at source. (read) */
readonly newline: util.TDetectedNewLines;
/**
* When set, the `slash` scanner no longer invokes the scan event listener
* and appends the comment fragment to `result` as-is.
*
* `apply` sets this before its second pass so the listener is not run twice.
* @date 2022/4/6
*/
eventDone?: true;
/**
* Whether to record the detected regex. (read, write)
*
* @date 2020/5/8
*/
collectRegex?: boolean;
// - - - - - - - - - - - - - - -
// replace mode
// - - - - - - - - - - - - - - -
/** replacement result (read, write) */
result: string;
// - - - - - - - - - - - - - - -
// walk through mode
// - - - - - - - - - - - - - - -
/**
* always true if running at walk through mode
* @since 3.1
* @date 2020/5/14
*/
isWalk?: boolean;
/**
* whether to continue walk through
* (in walk through mode this receives the return value of the scan event listener)
* @since 3.1
* @date 2020/5/14
*/
proceed?: boolean;
};
/**
* Character codes of the characters that need special handling while scanning.
*
* The quote, slash and back quote codes are used as indexes of the `scanners` registry,
* `RIGHT_CURLY` is used by the back quote scanner to find the end of a `${ ... }` section.
*
* @date 2022/4/22
* @since 3.3.1
*/
const enum EMetaChars {
/**
* ```js
* '"'.charCodeAt(0)
* ```
*/
DOUBLE_QUOTE = 34,
/**
* ```js
* "'".charCodeAt(0)
* ```
*/
SINGLE_QUOTE = 39,
/**
* ```js
* "/".charCodeAt(0)
* ```
*/
SLASH = 47,
/**
* ```js
* "`".charCodeAt(0)
* ```
*/
BACK_QUOTE = 96,
/**
* ```js
* "}".charCodeAt(0)
* ```
*/
RIGHT_CURLY = 125,
}
/**
* #### Signature of a character scanner (the main scan functions).
*
* A scanner is called with `ctx.offset` pointing at the character it is registered for.
*
* + returns `true` - the scanner consumed something and updated the context (`ctx.offset`, and `ctx.result` in replace mode)
* + returns `false` - nothing was consumed, the caller advances `ctx.offset` by one itself
*
* `| never` in the return type only documents that a scanner may throw instead of returning.
*/
type TCharScannerFunction =
/**
* @param src current replacement source.
* @param ctx see {@link TScannerContext}
*/
(src: string, ctx: TScannerContext) => boolean | never;
/**
* @typedef {object} TScannerContext
* @prop {number} offset The source offset currently being scanned is recorded. (read, write)
* @prop {util.TDetectedNewLines} newline new line character at source. (read)
* @prop {boolean} [collectRegex] Whether to record the detected regex. (read, write)
* @prop {string} result replacement result (read, write)
* @prop {boolean} [isWalk] always true if running at walk through mode
* @prop {boolean} [proceed] whether to continue walk through
* @prop {true} [eventDone] suppress secondary scan event
*/
/**
* @typedef {(
* source: string, context: TScannerContext
* ) => boolean | never} TCharScannerFunction if returns true then context has been changed.
*/
/**
 * Build a fresh {@link TScannerContext} for one scan of `src`.
 *
 * + walk through mode (`isWalk` is `true`): `isWalk` and `proceed` are set to `true`, `result` is not set
 * + replace mode (default): `result` starts as an empty string
 *
 * @param {string} src parsing source, its new line character is detected by `detectNewLine`
 * @param {true} [collectRegex] stored on the context as-is
 * @param {boolean} [isWalk] create the context for walk through mode (default `false`)
 * @returns {TScannerContext}
 */
const createWhite = (src: string, collectRegex?: true, isWalk: boolean = false): TScannerContext => {
    // the mode specific properties are added below, hence the assertion
    const fresh = {
        offset: 0,
        collectRegex,
        newline: detectNewLine(src)
    } as TScannerContext;

    if (!isWalk) {
        // replace mode accumulates its output here
        fresh.result = "";
        return fresh;
    }

    fresh.isWalk = true;
    fresh.proceed = true;
    return fresh;
};
/**
 * Scanner for single quoted and double quoted strings.
 *
 * `ctx.offset` must point at the opening quote character.
 * The string is followed up to its matching, unescaped closing quote, then
 *
 * + `ctx.offset` is moved right behind the closing quote
 * + in replace mode the whole literal (quotes included) is appended to `ctx.result`
 *
 * ```
 * case "'": // single quote
 * case '"': // double quote
 * ```
 * @param {string} src
 * @param {TScannerContext} ctx
 * @returns {boolean} always `true` (an unterminated string throws instead)
 * @throws {SyntaxError} when no closing quote exists before the end of `src`
 */
const quote: TCharScannerFunction = (src: string, ctx: TScannerContext): boolean => {
    const begin = ctx.offset;
    // the character that opened the string also has to close it
    const delimiter = src[begin];
    const end = src.length;
    // true while the previous character was an unpaired back slash
    let escaped = false;

    for (let pos = begin + 1; pos < end; pos++) {
        const current = src[pos];
        if (current === "\\") {
            // a second back slash in a row cancels the first one
            escaped = !escaped;
            continue;
        }
        if (escaped) {
            // the escaped character itself is never a delimiter
            escaped = false;
            continue;
        }
        if (current === delimiter) {
            const behindClose = pos + 1;
            ctx.offset = behindClose;
            if (!ctx.isWalk) {
                // walk through mode does not build a result
                ctx.result += src.substring(begin, behindClose);
            }
            return true;
        }
    }

    throw new SyntaxError(`incomplete quote, offset=${begin}, remaining=<[${src.substring(begin, begin + 200)}]>`);
};
/**
 * Scanner for back quote (template) strings.
 *
 * In a back quote string, an expression can be embedded by enclosing it with "${" and "}",
 * so special processing is required (at es6):
 *
 * + plain template text is skipped, honoring back slash escapes
 * + inside "${ ... }" every character registered in `scanners` (quotes, slash, nested back quote)
 *   is delegated to its scanner, so comments, regexes and nested strings are handled there
 * + curly braces nested in the expression (arrow function bodies, object literals, ...) are
 *   counted, only the "}" that balances the opening "${" ends the expression.
 *   Braces inside nested strings/comments/regexes never reach the counter because the
 *   delegated scanner consumes them.
 *
 * On success `ctx.offset` points right behind the closing back quote and, in replace mode,
 * the scanned text has been appended to `ctx.result`.
 *
 * ```
 * case "`": // back quote
 * ```
 * @param {string} src
 * @param {TScannerContext} ctx
 * @returns {boolean} always `true` (an unterminated template throws instead)
 * @throws {SyntaxError} when the end of `src` is reached before the closing back quote
 */
const backQuote: TCharScannerFunction = (src: string, ctx: TScannerContext): boolean | never => {
    // "{".charCodeAt(0) === 123
    const LEFT_CURLY = 123;
    // limiter
    const limiter = src.length;
    // start of the text that has not been copied to ctx.result yet
    let startOffset = ctx.offset;
    // position of the character examined next (skips the opening back quote)
    let next = startOffset + 1;
    // toggle escape flag.
    let inEscape = false;

    template: while (next < limiter) {
        // fetch "next" char, if its back slash then toggle escape state.
        const ch = src[next++];
        if (ch === "\\") {
            inEscape = !inEscape;
        } else if (inEscape) {
            // last state is "in escape" then current ch is ignored.
            inEscape = false;
        } else if (ch === "$") {
            if (src[next] === "{") {
                next++;
                // start of the expression text not yet copied to ctx.result
                let prevOffset = next;
                // number of "{" opened inside the expression and not closed yet
                let depth = 0;
                if (!ctx.isWalk) {
                    // flush the template text up to and including "${"
                    ctx.result += src.substring(startOffset, next);
                }
                while (next < limiter) {
                    const code = src.charCodeAt(next);
                    if (code === LEFT_CURLY) {
                        depth++;
                        next++;
                        continue;
                    }
                    // "}".charCodeAt(0) === 125
                    if (code === EMetaChars.RIGHT_CURLY) {
                        if (depth > 0) {
                            // closes a brace nested in the expression, not the "${"
                            depth--;
                            next++;
                            continue;
                        }
                        startOffset = ++next;
                        if (!ctx.isWalk) {
                            ctx.result += src.substring(prevOffset, next);
                        }
                        // back to plain template text
                        continue template;
                    }
                    const inspectable = scanners[code];
                    if (!inspectable) {
                        next++;
                        continue;
                    }
                    if (!ctx.isWalk) {
                        // flush the expression text in front of the delegated part
                        ctx.result += src.substring(prevOffset, next);
                    }
                    ctx.offset = next;
                    // when the scanner consumed nothing, step over the character ourselves;
                    // prevOffset then stays on it so that it gets copied with the next flush
                    prevOffset = inspectable(src, ctx)? ctx.offset: ctx.offset++;
                    next = ctx.offset;
                }
                // NOTE: falling out of the inner loop means the source ended inside "${ ... }",
                // the outer condition then fails as well and the SyntaxError below is thrown.
            }
        } else if (ch === "`") {
            ctx.offset = next;
            if (!ctx.isWalk) {
                ctx.result += src.substring(startOffset, next);
            }
            return true;
        }
    }
    // NOTE: startOffset is the start of the last unflushed text segment,
    // which is behind the last completed "${ ... }" section if there was one.
    throw new SyntaxError(`incomplete backquote, offset=${startOffset}, remaining=<[${src.substring(startOffset, startOffset + 200)}]>`);
};
/**
* regex literal bodies recorded by the `slash` scanner while `ctx.collectRegex` is enabled.
*
* @type {string[]}
*/
const detectedReLiterals: string[] = [];
// next write position in `detectedReLiterals` (the `slash` scanner assigns by index instead of using `push`)
let drlIndex = 0;
/**
* test for "<reference .../>" and "///@ts-(ignore|check|...)..."
*
* Matches, at the head of the fragment only:
* + two or three slashes, optional white spaces, then "@ts-" followed by word characters or hyphens
* + three slashes, optional white spaces, then "<reference"
*
* regex cache (no "g" flag, so `test` keeps no state between calls)
*/
const reTsrefOrPramga = /^(?:\/\/\/?\s*@ts-[-\w]+|\/\/\/\s*<reference)/;
//
// DEVNOTE: 2019-5-12
//
// 1. Regex literals are detected in this program because the characters ['"`]
// may be included in a regex and have to be skipped in order to detect quoted strings correctly.
//
// 2. It is possible that text inside a calculation statement, which was not written as a regex literal, is misdetected as one,
// but even for a misdetected regex literal the cost of the processing done up to that point is not wasted:
// the detected text is simply concatenated to the result string.
//
/**
 * Scanner for the slash character.
 *
 * A slash may open a multiline comment, a single line comment or a regex literal,
 * they are checked in exactly this priority: (multiline comment || line comment) > regex.
 *
 * + comment - unless `ctx.eventDone` is set, the scan listener is asked about the fragment.
 *   In replace mode its answer decides whether the comment is kept in `ctx.result`
 *   (ts reference tags / "@ts-" pragmas are always kept without asking),
 *   in walk through mode its answer is stored in `ctx.proceed`.
 * + regex literal - skipped as a whole (and copied to `ctx.result` in replace mode),
 *   its body is recorded when `ctx.collectRegex` is enabled.
 *
 * ```
 * case "/": // slash
 * ```
 * @param {string} src
 * @param {TScannerContext} ctx
 * @returns {boolean} `false` when the slash is neither a comment nor a detected regex (context untouched)
 * @throws {SyntaxError} when a multiline comment is not closed
 */
const slash: TCharScannerFunction = (src: string, ctx: TScannerContext): boolean => {
    // position of the slash itself
    const head = ctx.offset;
    // the character right behind the slash decides the kind of comment
    const follower = src[head + 1];

    //
    // - - - multiline comment - - -
    //
    if (follower === "*") {
        const closeAt = src.indexOf("*/", head + 2);
        if (closeAt === -1) {
            throw new SyntaxError("incomplete multi line comment");
        }
        const tail = closeAt + 2;
        const comment = src.substring(head, tail);
        if (ctx.eventDone) {
            // the listener has been notified in an earlier pass, keep the fragment as-is
            ctx.result += comment;
        } else {
            const eventContext: TScannerEventContext = {
                event: EScannerEvent.MultiLineComment,
                fragment: comment,
                offset: head
            };
            const answer = scanListener!(eventContext);
            if (ctx.isWalk) {
                ctx.proceed = answer;
            } else if (answer) {
                ctx.result += comment;
            }
        }
        // update offset.
        ctx.offset = tail;
        return true;
    }

    // avoid jsx, tsx tag
    if (src[head - 1] === "<") {
        return false;
    }

    // the remaining part of the current line: up to the next new line or the end of source.
    // DEVNOTE: 2022/04/08 - an index of zero is not allowed here, it is treated like "not found"
    const newlineAt = ctx.newline ? src.indexOf(ctx.newline, head + 1) : -1;
    const lineEnd = newlineAt > 0 ? newlineAt : src.length;
    const restOfLine = src.substring(head, lineEnd);

    //
    // - - - ts reference tag or single line comment - - -
    //
    if (follower === "/") {
        // update offset. (end of source when no new line character follows)
        ctx.offset = lineEnd;
        if (ctx.eventDone) {
            ctx.result += restOfLine;
            return true;
        }
        const eventContext: TScannerEventContext = {
            event: EScannerEvent.SingleLineComment,
            fragment: restOfLine,
            offset: head
        };
        if (ctx.isWalk) {
            // walk through mode
            ctx.proceed = scanListener!(eventContext);
        } else if (reTsrefOrPramga.test(restOfLine) || scanListener!(eventContext)) {
            // replace mode: ts reference tags are kept without consulting the listener
            ctx.result += restOfLine;
        }
        return true;
    }

    //
    // - - - regexp literal - - -
    //
    const detected = detectRegex(restOfLine);
    if (!detected) {
        return false;
    }
    if (ctx.collectRegex) {
        detectedReLiterals[drlIndex++] = detected.body;
    }
    // update offset.
    const regexEnd = head + detected.lastIndex;
    ctx.offset = regexEnd;
    if (!ctx.isWalk) {
        ctx.result += src.substring(head, regexEnd);
    }
    return true;
};
/**
* CharScannerFunction registry.
*
* A sparse array indexed by character code (see `EMetaChars`):
* the scan loops look up `scanners[src.charCodeAt(offset)]` and
* every character without an entry is simply skipped.
*
* + JSBench.Me test [array index access or object key access](https://jsbench.me/chl08elhyd/1)
*
* @type {TCharScannerFunction[]}
*/
const scanners: Array<TBD<TCharScannerFunction>> = [];
scanners[EMetaChars.BACK_QUOTE] = backQuote; // 96
scanners[EMetaChars.DOUBLE_QUOTE] = quote; // 34
scanners[EMetaChars.SINGLE_QUOTE] = quote; // 39
scanners[EMetaChars.SLASH] = slash; // 47
// default listener: always answers `false`
// (replace mode - the comment is not copied to the result, walk through mode - `ctx.proceed` becomes false)
const emptyListener = () => false;
/**
* The listener currently consulted by the `slash` scanner for every detected comment,
* replaced via `setListener`.
*
* @type {IScanEventCallback}
*/
let scanListener: IScanEventCallback = emptyListener;
/**
* ### replace mode
*
* NOTE:
* This function is implemented to correctly judge quoted string,
* line comment, multiline comment, regexp literal,
* and delete line comment, multiline comment. (maybe...
*
* Processing is done in two passes:
*
* 1. scan `src` with the `scanners` registry; whether a comment is kept or dropped is decided in the `slash` scanner (scan listener)
* 2. unless `opt.preserveBlanks` is set, scan the result of pass 1 again with the regexes of `lookupRegexes`
*    to drop the text they match, while quoted strings, regex literals and back quote strings are kept untouched.
*    Finally one trailing new line is removed.
*
* ---
* CHANGES: 2019-5-23
* + In v2.x and later, it is optimized to node v10 and later.
* This works even on node v9 and earlier, but with poor performance.
* - > The performance differs depending on whether the structure of `scanners` is a builtin hash map (e.g - { "/": <scanner function> }) or an array
*
* @param {string} src
* @param {TRemoveCStyleCommentsOpt} opt `collectRegex` and `preserveBlanks` are read here
* @returns {string} the processed source
* @throws {SyntaxError} thrown by the scanners for an incomplete quote, back quote or multiline comment
*/
const apply = (src: string, opt: TRemoveCStyleCommentsOpt): string => {
//
// step 1. remove {line, block} comments
//
const size = src.length;
const ctx = createWhite(src, opt.collectRegex);
let offset = 0;
// start of the text that has not been copied to ctx.result yet
let prevOffset = 0;
while (offset < size) {
const inspectable = scanners[src.charCodeAt(offset)];
if (!inspectable) {
offset++;
} else {
// flush the plain text in front of the character handled by the scanner
ctx.result += src.substring(prevOffset, offset);
ctx.offset = offset;
// scanner returned false: nothing consumed, step over the character ourselves.
// prevOffset then stays on that character so it is copied with the next flush
prevOffset = inspectable(src, ctx)? ctx.offset: ctx.offset++;
offset = ctx.offset;
}
}
// adjust remaining
if (size - prevOffset > 0) {
ctx.result += src.substring(prevOffset, offset);
}
if (opt.preserveBlanks) {
return ctx.result;
}
//
// step 2. remove blank line and trailing whitespaces
//
// - - - -
// DEVNOTE: 2019-5-25
// Since string.replace method by regex only is difficult to control,
// so we implemented code + regex replacement.
//
// ✅ This makes it possible to hold the contents of nested es6 template string.
// - - - -
/* replace removed comments result */
src = ctx.result;
/* reset context */
ctx.result = "";//, ctx.offset = 0;
// fix for "The current implementation calls the scan event listener twice"
ctx.eventDone = true;
/* halt collectRegex */
ctx.collectRegex = false;
const regexes = lookupRegexes(ctx.newline);
// NOTE(review): presumably matches white space runs/blank lines, quoted strings, "/" and "`" - verify against lookupRegexes
const reWsqs = regexes.wsqs;
// NOTE(review): the loop below relies on reWsqs.lastIndex; if lookupRegexes hands out a shared regex instance,
// a SyntaxError thrown by a scanner mid-loop would leave lastIndex non-zero for the next call - TODO confirm
let m: TBC<RegExpExecArray>;
prevOffset = 0;
// NOTE: need skip quoted string, regexp literal.
while (m = reWsqs.exec(src)) {
// const [head] = m[0];
// first character of the matched text
const head = m[0]![0];
if (head === "/" || head === "`") {
// hand over to the slash / back quote scanner (the listener is not called again, see ctx.eventDone)
ctx.offset = m.index;
if (prevOffset !== ctx.offset) {
ctx.result += src.substring(prevOffset, ctx.offset);
}
prevOffset = scanners[
head === "/" ? EMetaChars.SLASH : EMetaChars.BACK_QUOTE
]!(src, ctx)? ctx.offset: ctx.offset++;
// resume the regex search behind the part handled by the scanner
reWsqs.lastIndex = ctx.offset;
continue;
}
// DEVNOTE: check the double quote first
// quoted string: copy up to the end of the match (the string is kept),
// any other match: copy only up to the start of the match (the matched text is dropped)
const sublast = (head === '"' || head === "'")? reWsqs.lastIndex: m.index;
ctx.result += src.substring(prevOffset, sublast);
prevOffset = reWsqs.lastIndex;
}
// adjust remaining
if (src.length - prevOffset > 0) {
ctx.result += src.substring(prevOffset, src.length);
}
//
// DEVNOTE: 2020/5/17 - It turned out that regex `/\n$/g` has a little cost, so I rewritten it by script code.
//
// NOTE(review): `regexes.first` presumably removes the new line(s) at the head - confirm in reutil
src = ctx.result.replace(regexes.first, "");
// newline length
const nll = ctx.newline.length;
// index where a trailing new line would start
const lidx = src.length - nll;
if (
(nll === 1 && src[lidx] === ctx.newline) ||
(nll === 2 && src[lidx] === "\r" && src[lidx + 1] === "\n")
) {
// drop exactly one trailing new line
return src.substring(0, lidx);
}
// DEVNOTE: which means nothing newline at head and tail in original source
return src;
};
/**
 * ### walk through mode
 *
 * Scans `src` without building a result: the scanners only move the offset and
 * report comments to the scan listener.
 * Scanning stops at the end of `src` or as soon as `ctx.proceed` becomes falsy
 * (in this mode it receives the return value of the scan listener).
 *
 * @param {string} src
 * @param {TRemoveCStyleCommentsOpt} opt only `collectRegex` is read here
 * @returns {void}
 * @throws {SyntaxError} thrown by the scanners for an incomplete quote, back quote or multiline comment
 */
const walk = (src: string, opt: TRemoveCStyleCommentsOpt): void => {
    const ctx = createWhite(src, opt.collectRegex, true);
    const end = src.length;
    const registry = scanners;
    let pos = 0;
    while (ctx.proceed && pos < end) {
        const scan = registry[src.charCodeAt(pos)];
        if (scan) {
            ctx.offset = pos;
            // a scanner that consumed nothing leaves the offset alone, step over the character then
            pos = scan(src, ctx) ? ctx.offset : ++ctx.offset;
        } else {
            pos++;
        }
    }
};
/**
 * Remove duplicated entries, the first occurrence of each entry keeps its position.
 *
 * The input array is not modified, a new array is returned.
 *
 * @param {string[]} ra regex literal array
 * @returns {string[]} array of the unique entries of `ra`
 */
const uniq = (ra: string[]) => {
    // a Set keeps insertion order and ignores entries it already knows
    const known = new Set<string>(ra);
    /** @type {typeof ra} */
    const unique: typeof ra = Array.from(known);
    return unique;
};
/**
 * acquire the regex detection related context
 *
 * + `detectedReLiterals` - the recorded regex literals (the internal array itself, not a copy)
 * + `uniqReLiterals` - a sorted copy without duplicates
 */
const getDetectedReContext = () => {
    // `uniq` returns a new array, so sorting does not touch the recorded order
    const uniqReLiterals = uniq(detectedReLiterals);
    uniqReLiterals.sort();
    return {
        detectedReLiterals,
        uniqReLiterals
    };
};
/**
 * reset the regex detection related context
 *
 * Empties the recorded regex literals in place and rewinds the write position.
 */
const reset = () => {
    // rewind the write position used by the slash scanner
    drlIndex = 0;
    // drop every recorded literal, the array instance itself is kept
    detectedReLiterals.splice(0, detectedReLiterals.length);
};
/**
 * Install the scan event listener consulted for every detected comment.
 *
 * Anything that is not a function (including no argument) restores the default listener.
 *
 * @param {IScanEventCallback=} listener
 */
const setListener = (listener?: IScanEventCallback) => {
    if (typeof listener === "function") {
        scanListener = listener;
        return;
    }
    scanListener = emptyListener;
};
export {
apply,
walk,
getDetectedReContext,
reset,
setListener,
};