werk85/node-html-to-text

View on GitHub
packages/base/src/typedefs.js

Summary

Maintainability
A
0 mins
Test Coverage

/**
 * @typedef { object } Options
 * HtmlToText options.
 *
 * @property { BaseElementsOptions }  [baseElements]
 * Options for narrowing down to informative parts of HTML document.
 *
 * @property { boolean } [decodeEntities]
 * Specify whether HTML entities should be decoded in the text output.
 *
 * @property { {[key: string]: string | false} | ((str: string) => string) | undefined } [encodeCharacters]
 * A dictionary mapping from input text characters to escape sequences
 * (you can set values to false to disable escaping characters that are enabled by default)
 * or a function that does the replacement.
 *
 * @property { {[key: string]: FormatCallback} } [formatters = {}]
 * A dictionary with custom formatting functions for specific kinds of elements.
 *
 * Keys are custom string identifiers, values are callbacks.
 *
 * @property { LimitsOptions }        [limits]
 * Options for handling complex documents and limiting the output size.
 *
 * @property { LongWordSplitOptions } [longWordSplit]
 * Describes how to wrap long words.
 *
 * @property { boolean }              [preserveNewlines = false]
 * By default, any newlines `\n` from the input HTML are collapsed into a space, like any other HTML whitespace characters.
 * If `true`, these newlines will be preserved in the output.
 * This is only useful when input HTML carries some plain text formatting instead of proper tags.
 *
 * @property { SelectorDefinition[] } [selectors = []]
 * Instructions for how to render HTML elements based on matched selectors.
 *
 * Use this to (re)define options for new or already supported tags.
 *
 * @property { string[] | boolean }   [tables = []]
 * Deprecated. Use selectors with `format: 'dataTable'` instead.
 *
 * @property { string }               [whitespaceCharacters = ' \t\r\n\f\u200b']
 * All characters that are considered whitespace.
 * Default is according to HTML specifications.
 *
 * @property { number | boolean | null } [wordwrap = 80]
 * After how many chars a line break should follow in blocks.
 *
 * Set to `null` or `false` to disable word-wrapping.
 */

/**
 * @typedef { object } BaseElementsOptions
 * Options for narrowing down to informative parts of HTML document.
 *
 * @property { string[] } [selectors = ['body']]
 * The resulting text output will be composed from the text content of elements
 * matched with these selectors.
 *
 * @property { 'selectors' | 'occurrence' } [orderBy = 'selectors']
 * When multiple selectors are set, this option specifies
 * whether the selectors order has to be reflected in the output text.
 *
 * `'selectors'` (default) - matches for the first selector will appear first, etc;
 *
 * `'occurrence'` - all bases will appear in the same order as in input HTML.
 *
 * @property { boolean } [returnDomByDefault = true]
 * Use the entire document if none of provided selectors matched.
 */

/**
 * @typedef { object } DecodeOptions
 * Text decoding options given to `he.decode`.
 *
 * For more information see the [he](https://github.com/mathiasbynens/he) module.
 *
 * @property { boolean } [isAttributeValue = false]
 * TL;DR: If set to `true` - leave attribute values raw, don't parse them as text content.
 *
 * @property { boolean } [strict = false]
 * TL;DR: If set to `true` - throw an error on invalid HTML input.
 */

/**
 * @typedef { object } LimitsOptions
 * Options for handling complex documents and limiting the output size.
 *
 * @property { string } [ellipsis = '...']
 * A string to put in place of skipped content.
 *
 * @property { number | undefined } [maxBaseElements = undefined]
 * Stop looking for new base elements after this number of matches.
 *
 * No ellipsis is used when this condition is met.
 *
 * No limit if undefined.
 *
 * @property { number | undefined } [maxChildNodes = undefined]
 * Process only this many child nodes of any element.
 *
 * Remaining nodes, if any, will be replaced with ellipsis.
 *
 * Text nodes are counted along with tags.
 *
 * No limit if undefined.
 *
 * @property { number | undefined } [maxDepth = undefined]
 * Only go to a certain depth starting from `Options.baseElements`.
 *
 * Replace deeper nodes with ellipsis.
 *
 * No depth limit if undefined.
 *
 * @property { number } [maxInputLength = 16_777_216]
 * If the input string is longer than this value - it will be truncated
 * and a message will be sent to `stderr`.
 *
 * Ellipsis is not used in this case.
 */

/**
 * @typedef { object } LongWordSplitOptions
 * Describes how to wrap long words.
 *
 * @property { boolean }  [forceWrapOnLimit = false]
 * Break long words on the `Options.wordwrap` limit when there are no characters to wrap on.
 *
 * @property { string[] } [wrapCharacters = []]
 * An array containing the characters that may be wrapped on.
 */

/**
 * @typedef { object } SelectorDefinition
 * Describes how to handle tags matched by a selector.
 *
 * @property { string } selector
 * CSS selector. Refer to README for notes on supported selectors etc.
 *
 * @property { string } format
 * Identifier of a {@link FormatCallback}, built-in or provided in `Options.formatters` dictionary.
 *
 * @property { FormatOptions } options
 * Options to customize the formatter for this element.
 */

/**
 * @typedef { object } FormatOptions
 * Options specific to different formatters ({@link FormatCallback}).
 * This is an umbrella type definition. Each formatter supports its own subset of options.
 *
 * @property { number } [leadingLineBreaks]
 * Number of line breaks to separate previous block from this one.
 *
 * Note that N+1 line breaks are needed to make N empty lines.
 *
 * @property { number } [trailingLineBreaks]
 * Number of line breaks to separate this block from the next one.
 *
 * Note that N+1 line breaks are needed to make N empty lines.
 *
 * @property { string | null } [baseUrl = null]
 * (Only for: `anchor` and `image` formatters.) Server host for link `href` attributes and image `src` attributes
 * relative to the root (the ones that start with `/`).
 *
 * For example, with `baseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>`
 * the link in the text will be `http://asdf.com/dir/subdir`.
 *
 * Keep in mind that `baseUrl` should not end with a `/`.
 *
 * @property { boolean } [hideLinkHrefIfSameAsText = false]
 * (Only for: `anchor` formatter.) By default links are translated in the following way:
 *
 * `<a href='link'>text</a>` => becomes => `text [link]`.
 *
 * If this option is set to `true` and `link` and `text` are the same,
 * `[link]` will be omitted and only `text` will be present.
 *
 * @property { boolean } [ignoreHref = false]
 * (Only for: `anchor` formatter.) Ignore all links. Only process internal text of anchor tags.
 *
 * @property { [string, string] | false } [linkBrackets]
 * (Only for: `anchor` and `image` formatters.) Surround links with these brackets. Default: `['[', ']']`.
 *
 * Set to `false` or `['', '']` to disable.
 *
 * @property { boolean } [noAnchorUrl = true]
 * (Only for: `anchor` formatter.) Ignore anchor links (where `href='#...'`).
 *
 * @property { string } [itemPrefix = ' * ']
 * (Only for: `unorderedList` formatter.) String prefix for each list item.
 *
 * @property { boolean } [uppercase = true]
 * (Only for: `heading` formatter.) By default, headings (`<h1>`, `<h2>`, etc) are uppercased.
 *
 * Set this to `false` to leave headings as they are.
 *
 * @property { number | undefined } [length = undefined]
 * (Only for: `horizontalLine` formatter.) Length of the `<hr/>` line.
 *
 * If numeric value is provided - it is used.
 * Otherwise, if global `wordwrap` number is provided - it is used.
 * If neither is true, then the fallback value of 40 is used.
 *
 * @property { boolean } [trimEmptyLines = true]
 * (Only for: `blockquote` formatter.) Trim empty lines from blockquote.
 *
 * @property { boolean } [uppercaseHeaderCells = true]
 * (Only for: `table` and `dataTable` formatters.) By default, heading cells (`<th>`) are uppercased.
 *
 * Set this to `false` to leave heading cells as they are.
 *
 * @property { number } [maxColumnWidth = 60]
 * (Only for: `table` and `dataTable` formatters.) Data table cell content will be wrapped to fit this width
 * instead of global `wordwrap` limit.
 *
 * Set this to `undefined` in order to fall back to `wordwrap` limit.
 *
 * @property { number } [colSpacing = 3]
 * (Only for: `table` and `dataTable` formatters.) Number of spaces between data table columns.
 *
 * @property { number } [rowSpacing = 0]
 * (Only for: `table` and `dataTable` formatters.) Number of empty lines between data table rows.
 *
 */

/**
 * @typedef { object } DomNode
 * Simplified definition of [htmlparser2](https://github.com/fb55/htmlparser2) Node type.
 *
 * Makes no distinction between elements (tags) and data nodes (good enough for now).
 *
 * @property { string }                  type       Type of node - "text", "tag", "comment", "script", etc.
 * @property { string }                  [data]     Content of a data node.
 * @property { string }                  [name]     Tag name.
 * @property { {[key: string]: string} } [attribs]  Tag attributes dictionary.
 * @property { DomNode[] }               [children] Child nodes.
 * @property { DomNode }                 [parent]   Parent node.
 */

/**
 * A function to stringify a DOM node.
 *
 * @callback FormatCallback
 *
 * @param   { DomNode }           elem          A DOM node as returned by [htmlparser2](https://github.com/fb55/htmlparser2).
 * @param   { RecursiveCallback } walk          Recursive callback to process child nodes.
 * @param   { BlockTextBuilder }  builder       Passed around to accumulate output text. Contains options object.
 * @param   { FormatOptions }     formatOptions Options specific to this callback.
 */

/**
 * A function to process child nodes.
 * Passed into a {@link FormatCallback} as an argument.
 *
 * @callback RecursiveCallback
 *
 * @param   { DomNode[] }        [nodes] DOM nodes array.
 * @param   { BlockTextBuilder } builder Passed around to accumulate output text. Contains options object.
 */

/**
 * @typedef { object } TablePrinterCell
 * Cell definition for a table printer.
 *
 * @property { number } colspan Number of columns this cell occupies.
 * @property { number } rowspan Number of rows this cell occupies.
 * @property { string } text Cell contents (pre-wrapped).
 */

/**
 * Render a table into a string.
 * Cells can contain multiline text and span across multiple rows and columns.
 *
 * Can modify cells.
 *
 * @callback TablePrinter
 *
 * @param { TablePrinterCell[][] } tableRows Table to render.
 * @returns { string }
 */