packages/base/src/typedefs.js
/**
* @typedef { object } Options
* HtmlToText options.
*
* @property { BaseElementsOptions } [baseElements]
* Options for narrowing down to informative parts of HTML document.
*
* @property { boolean } [decodeEntities]
* Specify whether HTML entities should be decoded in the text output.
*
* @property { {[key: string]: string | false} | ((str: string) => string) | undefined } [encodeCharacters]
* A dictionary mapping from input text characters to escape sequences
* (you can set values to false to disable escaping characters that are enabled by default)
* or a function that does the replacement.
*
* @property { {[key: string]: FormatCallback} } [formatters = {}]
* A dictionary with custom formatting functions for specific kinds of elements.
*
* Keys are custom string identifiers, values are callbacks.
*
* @property { LimitsOptions } [limits]
* Options for handling complex documents and limiting the output size.
*
* @property { LongWordSplitOptions } [longWordSplit]
* Describes how to wrap long words.
*
* @property { boolean } [preserveNewlines = false]
* By default, any newlines `\n` from the input HTML are collapsed into a space, like any other HTML whitespace characters.
* If `true`, these newlines will be preserved in the output.
* This is only useful when input HTML carries some plain text formatting instead of proper tags.
*
* @property { SelectorDefinition[] } [selectors = []]
* Instructions for how to render HTML elements based on matched selectors.
*
* Use this to (re)define options for new or already supported tags.
*
* @property { string[] | boolean } [tables = []]
* Deprecated. Use selectors with `format: 'dataTable'` instead.
*
* @property { string } [whitespaceCharacters = ' \t\r\n\f\u200b']
* All characters that are considered whitespace.
* Default is according to HTML specifications.
*
* @property { number | boolean | null } [wordwrap = 80]
* After how many chars a line break should follow in blocks.
*
* Set to `null` or `false` to disable word-wrapping.
*/
/**
* @typedef { object } BaseElementsOptions
* Options for narrowing down to informative parts of HTML document.
*
* @property { string[] } [selectors = ['body']]
* The resulting text output will be composed from the text content of elements
* matched with these selectors.
*
* @property { 'selectors' | 'occurrence' } [orderBy = 'selectors']
* When multiple selectors are set, this option specifies
* whether the selectors order has to be reflected in the output text.
*
* `'selectors'` (default) - matches for the first selector will appear first, etc;
*
* `'occurrence'` - all bases will appear in the same order as in input HTML.
*
* @property { boolean } [returnDomByDefault = true]
* Use the entire document if none of the provided selectors matched.
*/
/**
* @typedef { object } DecodeOptions
* Text decoding options given to `he.decode`.
*
* For more information see the [he](https://github.com/mathiasbynens/he) module.
*
* @property { boolean } [isAttributeValue = false]
* TL;DR: If set to `true` - leave attribute values raw, don't parse them as text content.
*
* @property { boolean } [strict = false]
* TL;DR: If set to `true` - throw an error on invalid HTML input.
*/
/**
* @typedef { object } LimitsOptions
* Options for handling complex documents and limiting the output size.
*
* @property { string } [ellipsis = '...']
* A string to put in place of skipped content.
*
* @property { number | undefined } [maxBaseElements = undefined]
* Stop looking for new base elements after this number of matches.
*
* No ellipsis is used when this condition is met.
*
* No limit if undefined.
*
* @property { number | undefined } [maxChildNodes = undefined]
* Process only this many child nodes of any element.
*
* Remaining nodes, if any, will be replaced with ellipsis.
*
* Text nodes are counted along with tags.
*
* No limit if undefined.
*
* @property { number | undefined } [maxDepth = undefined]
* Only go to a certain depth starting from `Options.baseElements`.
*
* Replace deeper nodes with ellipsis.
*
* No depth limit if undefined.
*
* @property { number } [maxInputLength = 16_777_216]
* If the input string is longer than this value - it will be truncated
* and a message will be sent to `stderr`.
*
* Ellipsis is not used in this case.
*/
/**
* @typedef { object } LongWordSplitOptions
* Describes how to wrap long words.
*
* @property { boolean } [forceWrapOnLimit = false]
* Break long words on the `Options.wordwrap` limit when there are no characters to wrap on.
*
* @property { string[] } [wrapCharacters = []]
* An array containing the characters that may be wrapped on.
*/
/**
* @typedef { object } SelectorDefinition
* Describes how to handle tags matched by a selector.
*
* @property { string } selector
* CSS selector. Refer to README for notes on supported selectors etc.
*
* @property { string } format
* Identifier of a {@link FormatCallback}, built-in or provided in `Options.formatters` dictionary.
*
* @property { FormatOptions } options
* Options to customize the formatter for this element.
*/
/**
* @typedef { object } FormatOptions
* Options specific to different formatters ({@link FormatCallback}).
* This is an umbrella type definition. Each formatter supports its own subset of options.
*
* @property { number } [leadingLineBreaks]
* Number of line breaks to separate previous block from this one.
*
* Note that N+1 line breaks are needed to make N empty lines.
*
* @property { number } [trailingLineBreaks]
* Number of line breaks to separate this block from the next one.
*
* Note that N+1 line breaks are needed to make N empty lines.
*
* @property { string | null } [baseUrl = null]
* (Only for: `anchor` and `image` formatters.) Server host for link `href` attributes and image `src` attributes
* relative to the root (the ones that start with `/`).
*
* For example, with `baseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>`
* the link in the text will be `http://asdf.com/dir/subdir`.
*
* Keep in mind that `baseUrl` should not end with a `/`.
*
* @property { boolean } [hideLinkHrefIfSameAsText = false]
* (Only for: `anchor` formatter.) By default links are translated in the following way:
*
* `<a href='link'>text</a>` => becomes => `text [link]`.
*
* If this option is set to `true` and `link` and `text` are the same,
* `[link]` will be omitted and only `text` will be present.
*
* @property { boolean } [ignoreHref = false]
* (Only for: `anchor` formatter.) Ignore all links. Only process internal text of anchor tags.
*
* @property { [string, string] | false } [linkBrackets]
* (Only for: `anchor` and `image` formatters.) Surround links with these brackets. Default: `['[', ']']`.
*
* Set to `false` or `['', '']` to disable.
*
* @property { boolean } [noAnchorUrl = true]
* (Only for: `anchor` formatter.) Ignore anchor links (where `href='#...'`).
*
* @property { string } [itemPrefix = ' * ']
* (Only for: `unorderedList` formatter.) String prefix for each list item.
*
* @property { boolean } [uppercase = true]
* (Only for: `heading` formatter.) By default, headings (`<h1>`, `<h2>`, etc) are uppercased.
*
* Set this to `false` to leave headings as they are.
*
* @property { number | undefined } [length = undefined]
* (Only for: `horizontalLine` formatter.) Length of the `<hr/>` line.
*
* If numeric value is provided - it is used.
* Otherwise, if global `wordwrap` number is provided - it is used.
* If neither is true, then the fallback value of 40 is used.
*
* @property { boolean } [trimEmptyLines = true]
* (Only for: `blockquote` formatter.) Trim empty lines from blockquote.
*
* @property { boolean } [uppercaseHeaderCells = true]
* (Only for: `table` and `dataTable` formatters.) By default, heading cells (`<th>`) are uppercased.
*
* Set this to `false` to leave heading cells as they are.
*
* @property { number | undefined } [maxColumnWidth = 60]
* (Only for: `table` and `dataTable` formatters.) Data table cell content will be wrapped to fit this width
* instead of global `wordwrap` limit.
*
* Set this to `undefined` in order to fall back to `wordwrap` limit.
*
* @property { number } [colSpacing = 3]
* (Only for: `table` and `dataTable` formatters.) Number of spaces between data table columns.
*
* @property { number } [rowSpacing = 0]
* (Only for: `table` and `dataTable` formatters.) Number of empty lines between data table rows.
*
*/
/**
* @typedef { object } DomNode
* Simplified definition of [htmlparser2](https://github.com/fb55/htmlparser2) Node type.
*
* Makes no distinction between elements (tags) and data nodes (good enough for now).
*
* @property { string } type Type of node - "text", "tag", "comment", "script", etc.
* @property { string } [data] Content of a data node.
* @property { string } [name] Tag name.
* @property { {[key: string]: string} } [attribs] Tag attributes dictionary.
* @property { DomNode[] } [children] Child nodes.
* @property { DomNode } [parent] Parent node.
*/
/**
* A function to stringify a DOM node.
*
* @callback FormatCallback
*
* @param { DomNode } elem A DOM node as returned by [htmlparser2](https://github.com/fb55/htmlparser2).
* @param { RecursiveCallback } walk Recursive callback to process child nodes.
* @param { BlockTextBuilder } builder Passed around to accumulate output text. Contains options object.
* @param { FormatOptions } formatOptions Options specific to this callback.
*/
/**
* A function to process child nodes.
* Passed into a {@link FormatCallback} as an argument.
*
* @callback RecursiveCallback
*
* @param { DomNode[] } [nodes] DOM nodes array.
* @param { BlockTextBuilder } builder Passed around to accumulate output text. Contains options object.
*/
/**
* @typedef { object } TablePrinterCell
* Cell definition for a table printer.
*
* @property { number } colspan Number of columns this cell occupies.
* @property { number } rowspan Number of rows this cell occupies.
* @property { string } text Cell contents (pre-wrapped).
*/
/**
* Render a table into a string.
* Cells can contain multiline text and span across multiple rows and columns.
*
* Can modify cells.
*
* @callback TablePrinter
*
* @param { TablePrinterCell[][] } tableRows Table to render.
* @returns { string }
*/