import type { Fn } from "";
import { DEFAULT, defmulti } from "";
import type { ContextOpts, ParseScope } from "";
import { defContext } from "";
import { defGrammar } from "";
import { unescapeEntities } from "";
/**
 * Options controlling how parsed HTML is filtered, cleaned and transformed
 * into hiccup elements.
 */
export interface ParseOpts {
	/**
	 * Array of element names to ignore.
	 */
	ignoreElements: string[];
	/**
	 * Array of attribute names to ignore.
	 */
	ignoreAttribs: string[];
	/**
	 * Keep data attribs.
	 *
	 * @defaultValue true
	 */
	dataAttribs: boolean;
	/**
	 * Keep `<!doctype ...>` element.
	 *
	 * @defaultValue false
	 */
	doctype: boolean;
	/**
	 * Keep whitespace-only text bodies.
	 *
	 * @defaultValue false
	 */
	whitespace: boolean;
	/**
	 * If enabled, collapses all whitespace to single space (`\u0020`)
	 * characters.
	 *
	 * @defaultValue true
	 */
	collapse: boolean;
	/**
	 * If enabled, unescapes known named and numeric HTML entities (i.e.
	 * replaces them with their original characters).
	 *
	 * @defaultValue true
	 */
	unescape: boolean;
	/**
	 * Keep comments.
	 *
	 * @defaultValue false
	 */
	comments: boolean;
	/**
	 * Element transform/filter. Receives a hiccup element before it is added
	 * to its parent. The function has full freedom to replace the element
	 * with any value. If the function returns a nullish result the element
	 * will be skipped/omitted entirely.
	 */
	tx: Fn<any[], any>;
	/**
	 * Plain text transform/filter. If the function returns a nullish result
	 * the text will be skipped/omitted entirely.
	 */
	txBody: Fn<string, any>;
	/**
	 * Parser's internal max recursion limit. Parsing will terminate once this
	 * limit is reached.
	 *
	 * @defaultValue 128
	 */
	maxDepth: number;
	/**
	 * True to enable parser debug output. Will emit details of each parse
	 * scope.
	 *
	 * @defaultValue false
	 */
	debug: boolean;
}
/**
 * Anything which can appear as a child of an {@link Element}: either a plain
 * text string or a nested element.
 */
export type ElementBody = Element | string;

/**
 * Hiccup-style element tuple: tag name, attribute object, followed by zero or
 * more child bodies.
 */
export type Element = [
	string,
	Record<string, any>,
	...ElementBody[]
];
/**
 * Outcome of a parse attempt. Only `type` is always present; the other fields
 * are filled in depending on the outcome.
 */
export interface ParseResult {
	/**
	 * Outcome category of the parse attempt.
	 */
	type: "success" | "partial" | "fail" | "error";
	/**
	 * Parser position (character offset, line and column), when reported.
	 */
	loc?: { offset: number; line: number; column: number };
	/**
	 * Error caught while parsing (used with the `"error"` type).
	 */
	err?: Error;
	/**
	 * Parsed elements, when any were produced.
	 */
	result?: Element[];
}
// HTML parse grammar rules (see: readme for details)
// playground URL: TODO(review) - the link is missing here
//
// Rule overview:
// - main:      optional doctype followed by one or more nodes
// - node:      comment, cdata element, void element or normal element
// - el:        name, attributes and either a body + closing tag or "/>"
// - void_el:   elements listed in `void_name`
// - cdata_el:  elements listed in `cdata_name` (script, style); their body
//              runs up to the matching `cdata_close`
// - attrib:    name with optional double/single quoted (or empty) value
// - body:      text up to the next '<'
//
// The rule names double as scope IDs used for dispatch when the parse tree is
// transformed, so renaming a rule requires updating the transformer too.
export const lang = defGrammar(`
node: '<'! (<comment> | <cdata_el> | <void_el> | <el>) ;
el: <name> <attrib>* (<el_body> | <el_close>! ) ;
el_body: <WS0> '>'! (<body> | <node>)* "</"! <name>! <WS0> '>'! => hoist ;
el_close: <WS0> "/>"! ;
name: [A-Za-z0-9_:\\-]+ => join ;
attrib: <WS1> <name> <attval>? ;
attval: '='! (<val> | <alt_val> | <empty> | <alt_empty>) ;
val: '"'! .(?+'"'!) => join ;
alt_val: '\\''! .(?+'\\''!) => join ;
empty: '"' '"' ;
alt_empty: '\\''! '\\''! ;
body: .(?-'<'!) => join ;
void_el: <void_name> <attrib>* <WS0> '/'?! '>'! ;
void_name: ("area" | "base" | "br" | "col" | "embed" | "hr" | "img" | "input" | "link" | "meta" | "source" | "track" | "wbr") ;
cdata_el: <cdata_name> <attrib>* '>'! <cdata_body> ;
cdata_name: ("script" | "style") ;
cdata_body: .(?-<cdata_close>!) <cdata_close>! => join ;
cdata_close: "</"! <cdata_name>! <WS0> '>'! ;
doctype: "<!"! ("doctype" | "DOCTYPE")! <WS1> .(?+'>'!) <WS0> => join ;
comment: "!--"! .(?+"-->"!) => join ;
main: <START> <doctype>? <node>+ <END> ;
`);
/**
 * Creates a parser context for given source string and calls the main parser
 * rule. Returns result object, incl. the context for further inspection and
 * transformation.
 *
 * @param src - source string to parse
 * @param opts - optional parser context options, passed to `defContext`
 * @returns object with the main rule's `result` and the parser context `ctx`
 */
export const parseRaw = (src: string, opts?: Partial<ContextOpts>) => {
	const ctx = defContext(src, opts);
	return { result: lang!.rules.main(ctx), ctx };
};
/**
 * Trims given HTML source string and attempts to parse it into a collection of
 * elements in hiccup format, using provided options to transform, clean or
 * filter elements.
 *
 * @remarks
 * Never throws: errors raised while parsing or transforming are caught and
 * reported via a result of type `"error"`. An empty source string yields a
 * `"success"` result with no elements.
 *
 * @param src - HTML source string
 * @param opts - parse options; merged over the built-in defaults
 * @returns parse result with outcome type, parser location and elements
 */
export const parseHtml = (
	src: string,
	opts?: Partial<ParseOpts>
): ParseResult => {
	if (!src) return { type: "success", result: [] };
	// defaults first, so that any caller-supplied option takes precedence
	const merged: Partial<ParseOpts> = {
		debug: false,
		collapse: true,
		unescape: true,
		maxDepth: 128,
		...opts,
	};
	try {
		const { result, ctx } = parseRaw(src.trim(), {
			debug: merged.debug,
			maxDepth: merged.maxDepth,
		});
		// position at which the parser stopped
		const loc = {
			offset: ctx.state.p,
			line: ctx.state.l,
			column: ctx.state.c,
		};
		if (!result) return { type: "fail", loc };
		const acc: Element[] = [];
		__transformScope(ctx.root, merged, acc);
		return {
			// "partial" = parser matched, but did not consume all input
			type: ctx.done ? "success" : "partial",
			loc,
			result: acc,
		};
	} catch (e) {
		// anything can be thrown, so only pass real Error instances through
		return {
			type: "error",
			err: e instanceof Error ? e : new Error(String(e)),
		};
	}
};
/**
 * Recursive depth-first transformation function to process the parse tree (this
 * is where the actual conversion to hiccup format happens).
 *
 * @remarks
 * The dispatch values for the various implementations here correspond to the
 * above grammar rules. Each implementation receives the current parse scope,
 * the parse options and the accumulator array to which it appends its output.
 *
 * @internal
 */
const __transformScope = defmulti<
	ParseScope<string>,
	Partial<ParseOpts>,
	any[],
	void
>(
	// dispatch on the scope ID, i.e. the name of the grammar rule
	(x) => x.id,
	// `cdata_el` and `void_el` scopes share the `el` implementation
	{ cdata_el: "el", void_el: "el" },
	{
		[DEFAULT]: (scope: ParseScope<string>) => {
			throw new Error(`missing impl for scope ID: ${scope.id}`);
		},

		// root node of the parse tree
		root: ({ children }, opts, acc) => {
			if (!children) return;
			// children of the first (main rule) scope: [doctype, nodes]
			const main = children[0].children;
			if (opts.doctype && main?.[0]) {
				acc.push(["!DOCTYPE", main[0].result]);
			}
			for (const x of main![1].children!) {
				__transformScope(x, opts, acc);
			}
		},

		// wrapper scope, delegates to the single matched alternative
		node: ({ children }, opts, acc) => {
			__transformScope(children![0], opts, acc);
		},

		// comments are only emitted if explicitly enabled
		comment: ({ result }, opts, acc) => {
			if (opts.comments) acc.push(["__COMMENT__", result.trim()]);
		},

		// element node transformer, collects & filters attributes/children
		// adds resulting hiccup element to accumulator array
		el: ({ children }, opts, acc) => {
			const [name, { children: $attribs }, body] = children!;
			if (opts.ignoreElements?.includes(name.result)) return;
			const attribs: any = {};
			const el: Element = [name.result, attribs];
			if ($attribs) {
				for (const a of $attribs) {
					const key: string = a.children![0].result;
					// data attribs are kept unless explicitly disabled
					if (opts.dataAttribs === false && key.startsWith("data-"))
						continue;
					if (opts.ignoreAttribs?.includes(key)) continue;
					if (a.children![1].children) {
						const val = a.children![1].children[0].result;
						if (val != null) attribs[key] = unescapeEntities(val);
					} else {
						// attribute given without a value
						attribs[key] = true;
					}
				}
			}
			if (body) {
				if (body.result) {
					// verbatim text body: add as single trimmed child
					// (trimmed in the same way as comment text)
					el.push(body.result.trim());
				} else if (body.children) {
					// nested bodies/elements are collected into this element
					for (const x of body.children) {
						__transformScope(x, opts, el);
					}
				}
			}
			const result = opts.tx ? opts.tx(el) : el;
			if (result != null) acc.push(result);
		},

		// plain text transform (by default only resolves HTML entities)
		body: ({ result }, opts, acc) => {
			if (!opts.whitespace && /^\s+$/.test(result)) return;
			if (opts.collapse) result = (<string>result).replace(/\s+/gm, " ");
			if (opts.unescape) result = unescapeEntities(result);
			result = opts.txBody ? opts.txBody(result) : result;
			if (result != null) acc.push(result);
		},
	}
);