sparklemotion/nokogiri

View on GitHub
gumbo-parser/src/nokogiri_gumbo.h

Summary

Maintainability
Test Coverage
// Copyright 2010 Google Inc.
// Copyright 2018 Craig Barnes.
// Licensed under the Apache License, version 2.0.

// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
// GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
// static constants

/**
 * @file
 * @mainpage Gumbo HTML Parser
 *
 * This provides a conformant, no-dependencies implementation of the
 * [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
 * to parse a different encoding, run a preprocessing step to convert
 * to UTF-8. It returns a parse tree made of the structs in this file.
 *
 * Example:
 * @code
 *    GumboOutput* output = gumbo_parse(input);
 *    do_something_with_doctype(output->document);
 *    do_something_with_html_tree(output->root);
 *    gumbo_destroy_output(output);
 * @endcode
 *
 * [HTML5]: https://html.spec.whatwg.org/multipage/
 */

#ifndef GUMBO_H
#define GUMBO_H

#include <stdbool.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * A struct representing a character position within the original text
 * buffer. Line and column numbers are 1-based and offsets are 0-based,
 * which matches how most editors and command-line tools work.
 */
typedef struct {
  size_t line;
  size_t column;
  size_t offset;
} GumboSourcePosition;

/**
 * A struct representing a string or part of a string. Strings within
 * the parser are represented by a `char*` and a length; the `char*`
 * points into an existing data buffer owned by some other code (often
 * the original input). `GumboStringPiece`s are assumed (by convention)
 * to be immutable, because they may share data. Clients should assume
 * that it is not NUL-terminated and should always use explicit lengths
 * when manipulating them.
 */
typedef struct {
  /** A pointer to the beginning of the string. `NULL` if `length == 0`. */
  const char* data;

  /** The length of the string fragment, in bytes (may be zero). */
  size_t length;
} GumboStringPiece;

#define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
/** A constant to represent a 0-length null string. */
#define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT

/**
 * Compares two `GumboStringPiece`s, and returns `true` if they're
 * equal or `false` otherwise.
 */
bool gumbo_string_equals (
  const GumboStringPiece* str1,
  const GumboStringPiece* str2
);

/**
 * Compares two `GumboStringPiece`s, ignoring case, and returns `true`
 * if they're equal or `false` otherwise.
 */
bool gumbo_string_equals_ignore_case (
  const GumboStringPiece* str1,
  const GumboStringPiece* str2
);

/**
 * Check if the first `GumboStringPiece` is a prefix of the second, ignoring
 * case.
 */
bool gumbo_string_prefix_ignore_case (
  const GumboStringPiece* prefix,
  const GumboStringPiece* str
);

/**
 * A simple vector implementation. This stores a pointer to a data array
 * and a length. All elements are stored as `void*`; client code must
 * cast to the appropriate type. Overflows upon addition result in
 * reallocation of the data array, with the size doubling to maintain
 * `O(1)` amortized cost. There is no removal function, as this isn't
 * needed for any of the operations within this library. Iteration can
 * be done through inspecting the structure directly in a `for` loop.
 */
typedef struct {
  /**
   * Data elements. This points to a dynamically-allocated array of
   * `capacity` elements, each a `void*` to the element itself.
   */
  void** data;

  /** Number of elements currently in the vector. */
  unsigned int length;

  /** Current array capacity. */
  unsigned int capacity;
} GumboVector;

# define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
/** An empty (0-length, 0-capacity) `GumboVector`. */
#define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT

/**
 * Returns the first index at which an element appears in this vector
 * (testing by pointer equality), or `-1` if it never does.
 */
int gumbo_vector_index_of(GumboVector* vector, const void* element);

/**
 * An `enum` for all the tags defined in the HTML5 standard. These
 * correspond to the tag names themselves. Enum constants exist only
 * for tags that appear in the spec itself (or for tags with special
 * handling in the SVG and MathML namespaces). Any other tags appear
 * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
 * through `original_tag`.
 *
 * This is mostly for API convenience, so that clients of this library
 * don't need to perform a `strcasecmp` to find the normalized tag
 * name. It also has efficiency benefits, by letting the parser work
 * with enums instead of strings.
 */
typedef enum {
  GUMBO_TAG_HTML,
  GUMBO_TAG_HEAD,
  GUMBO_TAG_TITLE,
  GUMBO_TAG_BASE,
  GUMBO_TAG_LINK,
  GUMBO_TAG_META,
  GUMBO_TAG_STYLE,
  GUMBO_TAG_SCRIPT,
  GUMBO_TAG_NOSCRIPT,
  GUMBO_TAG_TEMPLATE,
  GUMBO_TAG_BODY,
  GUMBO_TAG_ARTICLE,
  GUMBO_TAG_SECTION,
  GUMBO_TAG_NAV,
  GUMBO_TAG_ASIDE,
  GUMBO_TAG_H1,
  GUMBO_TAG_H2,
  GUMBO_TAG_H3,
  GUMBO_TAG_H4,
  GUMBO_TAG_H5,
  GUMBO_TAG_H6,
  GUMBO_TAG_HGROUP,
  GUMBO_TAG_HEADER,
  GUMBO_TAG_FOOTER,
  GUMBO_TAG_ADDRESS,
  GUMBO_TAG_P,
  GUMBO_TAG_HR,
  GUMBO_TAG_PRE,
  GUMBO_TAG_BLOCKQUOTE,
  GUMBO_TAG_OL,
  GUMBO_TAG_UL,
  GUMBO_TAG_LI,
  GUMBO_TAG_DL,
  GUMBO_TAG_DT,
  GUMBO_TAG_DD,
  GUMBO_TAG_FIGURE,
  GUMBO_TAG_FIGCAPTION,
  GUMBO_TAG_MAIN,
  GUMBO_TAG_DIV,
  GUMBO_TAG_A,
  GUMBO_TAG_EM,
  GUMBO_TAG_STRONG,
  GUMBO_TAG_SMALL,
  GUMBO_TAG_S,
  GUMBO_TAG_CITE,
  GUMBO_TAG_Q,
  GUMBO_TAG_DFN,
  GUMBO_TAG_ABBR,
  GUMBO_TAG_DATA,
  GUMBO_TAG_TIME,
  GUMBO_TAG_CODE,
  GUMBO_TAG_VAR,
  GUMBO_TAG_SAMP,
  GUMBO_TAG_KBD,
  GUMBO_TAG_SUB,
  GUMBO_TAG_SUP,
  GUMBO_TAG_I,
  GUMBO_TAG_B,
  GUMBO_TAG_U,
  GUMBO_TAG_MARK,
  GUMBO_TAG_RUBY,
  GUMBO_TAG_RT,
  GUMBO_TAG_RP,
  GUMBO_TAG_BDI,
  GUMBO_TAG_BDO,
  GUMBO_TAG_SPAN,
  GUMBO_TAG_BR,
  GUMBO_TAG_WBR,
  GUMBO_TAG_INS,
  GUMBO_TAG_DEL,
  GUMBO_TAG_IMAGE,
  GUMBO_TAG_IMG,
  GUMBO_TAG_IFRAME,
  GUMBO_TAG_EMBED,
  GUMBO_TAG_OBJECT,
  GUMBO_TAG_PARAM,
  GUMBO_TAG_VIDEO,
  GUMBO_TAG_AUDIO,
  GUMBO_TAG_SOURCE,
  GUMBO_TAG_TRACK,
  GUMBO_TAG_CANVAS,
  GUMBO_TAG_MAP,
  GUMBO_TAG_AREA,
  GUMBO_TAG_MATH,
  GUMBO_TAG_MI,
  GUMBO_TAG_MO,
  GUMBO_TAG_MN,
  GUMBO_TAG_MS,
  GUMBO_TAG_MTEXT,
  GUMBO_TAG_MGLYPH,
  GUMBO_TAG_MALIGNMARK,
  GUMBO_TAG_ANNOTATION_XML,
  GUMBO_TAG_SVG,
  GUMBO_TAG_FOREIGNOBJECT,
  GUMBO_TAG_DESC,
  GUMBO_TAG_TABLE,
  GUMBO_TAG_CAPTION,
  GUMBO_TAG_COLGROUP,
  GUMBO_TAG_COL,
  GUMBO_TAG_TBODY,
  GUMBO_TAG_THEAD,
  GUMBO_TAG_TFOOT,
  GUMBO_TAG_TR,
  GUMBO_TAG_TD,
  GUMBO_TAG_TH,
  GUMBO_TAG_FORM,
  GUMBO_TAG_FIELDSET,
  GUMBO_TAG_LEGEND,
  GUMBO_TAG_LABEL,
  GUMBO_TAG_INPUT,
  GUMBO_TAG_BUTTON,
  GUMBO_TAG_SELECT,
  GUMBO_TAG_DATALIST,
  GUMBO_TAG_OPTGROUP,
  GUMBO_TAG_OPTION,
  GUMBO_TAG_TEXTAREA,
  GUMBO_TAG_KEYGEN,
  GUMBO_TAG_OUTPUT,
  GUMBO_TAG_PROGRESS,
  GUMBO_TAG_METER,
  GUMBO_TAG_DETAILS,
  GUMBO_TAG_SUMMARY,
  GUMBO_TAG_MENU,
  GUMBO_TAG_MENUITEM,
  GUMBO_TAG_APPLET,
  GUMBO_TAG_ACRONYM,
  GUMBO_TAG_BGSOUND,
  GUMBO_TAG_DIR,
  GUMBO_TAG_FRAME,
  GUMBO_TAG_FRAMESET,
  GUMBO_TAG_NOFRAMES,
  GUMBO_TAG_LISTING,
  GUMBO_TAG_XMP,
  GUMBO_TAG_NEXTID,
  GUMBO_TAG_NOEMBED,
  GUMBO_TAG_PLAINTEXT,
  GUMBO_TAG_RB,
  GUMBO_TAG_STRIKE,
  GUMBO_TAG_BASEFONT,
  GUMBO_TAG_BIG,
  GUMBO_TAG_BLINK,
  GUMBO_TAG_CENTER,
  GUMBO_TAG_FONT,
  GUMBO_TAG_MARQUEE,
  GUMBO_TAG_MULTICOL,
  GUMBO_TAG_NOBR,
  GUMBO_TAG_SPACER,
  GUMBO_TAG_TT,
  GUMBO_TAG_RTC,
  GUMBO_TAG_DIALOG,
  GUMBO_TAG_SEARCH,
  // Used for all tags that don't have special handling in HTML.
  GUMBO_TAG_UNKNOWN,
  // A marker value to indicate the end of the enum, for iterating over it.
  GUMBO_TAG_LAST,
} GumboTag;

/**
 * Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
 * return value is static data owned by the library.
 */
const char* gumbo_normalized_tagname(GumboTag tag);

/**
 * Extracts the tag name from the `original_text` field of an element
 * or token by stripping off `</>` characters and attributes and
 * adjusting the passed-in `GumboStringPiece` appropriately. The tag
 * name is in the original case and shares a buffer with the original
 * text, to simplify memory management. Behavior is undefined if a
 * string piece that doesn't represent an HTML tag (`<tagname>` or
 * `</tagname>`) is passed in. If the string piece is completely
 * empty (`NULL` data pointer), then this function will exit
 * successfully as a no-op.
 */
void gumbo_tag_from_original_text(GumboStringPiece* text);

/**
 * Fixes the case of SVG elements that are not all lowercase. This is
 * not done at parse time because there's no place to store a mutated
 * tag name. `tag_name` is an enum (which will be `TAG_UNKNOWN` for most
 * SVG tags without special handling), while `original_tag_name` is a
 * pointer into the original buffer. Instead, we provide this helper
 * function that clients can use to rename SVG tags as appropriate.
 * Returns the case-normalized SVG tagname if a replacement is found, or
 * `NULL` if no normalization is called for. The return value is static
 * data and owned by the library.
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
 */
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);

/**
 * Converts a tag name string (which may be in upper or mixed case) to a
 * tag enum.
 */
GumboTag gumbo_tagn_enum(const char* tagname, size_t length);

/**
 * Attribute namespaces.
 * HTML includes special handling for XLink, XML, and XMLNS namespaces
 * on attributes. Everything else goes in the generic "NONE" namespace.
 */
typedef enum {
  GUMBO_ATTR_NAMESPACE_NONE,
  GUMBO_ATTR_NAMESPACE_XLINK,
  GUMBO_ATTR_NAMESPACE_XML,
  GUMBO_ATTR_NAMESPACE_XMLNS,
} GumboAttributeNamespaceEnum;

/**
 * A struct representing a single attribute on a HTML tag. This is a
 * name-value pair, but also includes information about source locations
 * and original source text.
 */
typedef struct {
  /**
   * The namespace for the attribute. This will usually be
   * `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
   * take special values, per:
   * https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
   */
  GumboAttributeNamespaceEnum attr_namespace;

  /**
   * The name of the attribute. This is in a freshly-allocated buffer to
   * deal with case-normalization and is null-terminated.
   */
  const char* name;

  /**
   * The original text of the attribute name, as a pointer into the
   * original source buffer.
   */
  GumboStringPiece original_name;

  /**
   * The value of the attribute. This is in a freshly-allocated buffer
   * to deal with unescaping and is null-terminated. It does not include
   * any quotes that surround the attribute. If the attribute has no
   * value (for example, `selected` on a checkbox) this will be an empty
   * string.
   */
  const char* value;

  /**
   * The original text of the value of the attribute. This points into
   * the original source buffer. It includes any quotes that surround
   * the attribute and you can look at `original_value.data[0]` and
   * `original_value.data[original_value.length - 1]` to determine what
   * the quote characters were. If the attribute has no value this will
   * be a 0-length string.
   */
  GumboStringPiece original_value;

  /** The starting position of the attribute name. */
  GumboSourcePosition name_start;

  /**
   * The ending position of the attribute name. This is not always derivable
   * from the starting position of the value because of the possibility of
   * whitespace around the `=` sign.
   */
  GumboSourcePosition name_end;

  /** The starting position of the attribute value. */
  GumboSourcePosition value_start;

  /** The ending position of the attribute value. */
  GumboSourcePosition value_end;
} GumboAttribute;

/**
 * Given a vector of `GumboAttribute`s, look up the one with the
 * specified name and return it, or `NULL` if no such attribute exists.
 * This uses a case-insensitive match, as HTML is case-insensitive.
 */
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);

/**
 * Enum denoting the type of node. This determines the type of the
 * `node.v` union.
 */
typedef enum {
  /** Document node. `v` will be a `GumboDocument`. */
  GUMBO_NODE_DOCUMENT,
  /** Element node. `v` will be a `GumboElement`. */
  GUMBO_NODE_ELEMENT,
  /** Text node. `v` will be a `GumboText`. */
  GUMBO_NODE_TEXT,
  /** CDATA node. `v` will be a `GumboText`. */
  GUMBO_NODE_CDATA,
  /** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
  GUMBO_NODE_COMMENT,
  /** Text node, where all contents is whitespace. `v` will be a `GumboText`. */
  GUMBO_NODE_WHITESPACE,
  /**
   * Template node. This is separate from `GUMBO_NODE_ELEMENT` because
   * many client libraries will want to ignore the contents of template
   * nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
   * do the right thing here, while clients that want to include template
   * contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
   * `GumboElement`.
   */
  GUMBO_NODE_TEMPLATE
} GumboNodeType;

/**
 * Forward declaration of GumboNode so it can be used recursively in
 * GumboNode.parent.
 */
typedef struct GumboInternalNode GumboNode;

/** https://dom.spec.whatwg.org/#concept-document-quirks */
typedef enum {
  GUMBO_DOCTYPE_NO_QUIRKS,
  GUMBO_DOCTYPE_QUIRKS,
  GUMBO_DOCTYPE_LIMITED_QUIRKS
} GumboQuirksModeEnum;

/**
 * Namespaces.
 * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
 * Rather, anything inside an `<svg>` tag is in the SVG namespace,
 * anything inside the `<math>` tag is in the MathML namespace, and
 * anything else is inside the HTML namespace. No other namespaces are
 * supported, so this can be an `enum`.
 */
typedef enum {
  GUMBO_NAMESPACE_HTML,
  GUMBO_NAMESPACE_SVG,
  GUMBO_NAMESPACE_MATHML
} GumboNamespaceEnum;

/**
 * Parse flags.
 * We track the reasons for parser insertion of nodes and store them in
 * a bitvector in the node itself. This lets client code optimize out
 * nodes that are implied by the HTML structure of the document, or flag
 * constructs that may not be allowed by a style guide, or track the
 * prevalence of incorrect or tricky HTML code.
 */
typedef enum {
  /**
   * A normal node -- both start and end tags appear in the source,
   * nothing has been reparented.
   */
  GUMBO_INSERTION_NORMAL = 0,

  /**
   * A node inserted by the parser to fulfill some implicit insertion
   * rule. This is usually set in addition to some other flag giving a
   * more specific insertion reason; it's a generic catch-all term
   * meaning "The start tag for this node did not appear in the document
   * source".
   */
  GUMBO_INSERTION_BY_PARSER = 1 << 0,

  /**
   * A flag indicating that the end tag for this node did not appear in
   * the document source. Note that in some cases, you can still have
   * parser-inserted nodes with an explicit end tag. For example,
   * `Text</html>` has `GUMBO_INSERTED_BY_PARSER` set on the `<html>`
   * node, but `GUMBO_INSERTED_END_TAG_IMPLICITLY` is unset, as the
   * `</html>` tag actually exists.
   *
   * This flag will be set only if the end tag is completely missing.
   * In some cases, the end tag may be misplaced (e.g. a `</body>` tag
   * with text afterwards), which will leave this flag unset and require
   * clients to inspect the parse errors for that case.
   */
  GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,

  // Value 1 << 2 was for a flag that has since been removed.

  /**
   * A flag for nodes that are inserted because their presence is
   * implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
   * `<tbody>`, etc.
   */
  GUMBO_INSERTION_IMPLIED = 1 << 3,

  /**
   * A flag for nodes that are converted from their end tag equivalents.
   * For example, `</p>` when no paragraph is open implies that the
   * parser should create a `<p>` tag and immediately close it, while
   * `</br>` means the same thing as `<br>`.
   */
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,

  // Value 1 << 5 was for a flag that has since been removed.

  /** A flag for `<image>` tags that are rewritten as `<img>`. */
  GUMBO_INSERTION_FROM_IMAGE = 1 << 6,

  /**
   * A flag for nodes that are cloned as a result of the reconstruction
   * of active formatting elements. This is set only on the clone; the
   * initial portion of the formatting run is a NORMAL node with an
   * `IMPLICIT_END_TAG`.
   */
  GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,

  /** A flag for nodes that are cloned by the adoption agency algorithm. */
  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,

  /** A flag for nodes that are moved by the adoption agency algorithm. */
  GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,

  /**
   * A flag for nodes that have been foster-parented out of a table (or
   * should've been foster-parented, if verbatim mode is set).
   */
  GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
} GumboParseFlags;

/** Information specific to document nodes. */
typedef struct {
  /**
   * An array of `GumboNode`s, containing the children of this element.
   * This will normally consist of the `<html>` element and any comment
   * nodes found. Pointers are owned.
   */
  GumboVector /* GumboNode* */ children;

  /**
   * `true` if there was an explicit doctype token, as opposed to it
   * being omitted.
   */
  bool has_doctype;

  // Fields from the doctype token, copied verbatim.
  const char* name;
  const char* public_identifier;
  const char* system_identifier;

  /**
   * Whether or not the document is in QuirksMode, as determined by the
   * values in the GumboTokenDocType template.
   */
  GumboQuirksModeEnum doc_type_quirks_mode;
} GumboDocument;

/**
 * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
 * elements. This contains just a block of text and its position.
 */
typedef struct {
  /**
   * The text of this node, after entities have been parsed and decoded.
   * For comment and cdata nodes, this does not include the comment
   * delimiters.
   */
  const char* text;

  /**
   * The original text of this node, as a pointer into the original
   * buffer. For comment/cdata nodes, this includes the comment
   * delimiters.
   */
  GumboStringPiece original_text;

  /**
   * The starting position of this node. This corresponds to the
   * position of `original_text`, before entities are decoded.
   * */
  GumboSourcePosition start_pos;
} GumboText;

/**
 * The struct used to represent all HTML elements. This contains
 * information about the tag, attributes, and child nodes.
 */
typedef struct {
  /**
   * An array of `GumboNode`s, containing the children of this element.
   * Pointers are owned.
   */
  GumboVector /* GumboNode* */ children;

  /** The GumboTag enum for this element. */
  GumboTag tag;

  /** The name for this element. */
  const char* name;

  /** The GumboNamespaceEnum for this element. */
  GumboNamespaceEnum tag_namespace;

  /**
   * A `GumboStringPiece` pointing to the original tag text for this
   * element, pointing directly into the source buffer. If the tag was
   * inserted algorithmically (for example, `<head>` or `<tbody>`
   * insertion), this will be a zero-length string.
   */
  GumboStringPiece original_tag;

  /**
   * A `GumboStringPiece` pointing to the original end tag text for this
   * element. If the end tag was inserted algorithmically, (for example,
   * closing a self-closing tag), this will be a zero-length string.
   */
  GumboStringPiece original_end_tag;

  /** The source position for the start of the start tag. */
  GumboSourcePosition start_pos;

  /** The source position for the start of the end tag. */
  GumboSourcePosition end_pos;

  /**
   * An array of `GumboAttribute`s, containing the attributes for this
   * tag in the order that they were parsed. Pointers are owned.
   */
  GumboVector /* GumboAttribute* */ attributes;
} GumboElement;

/**
 * A supertype for `GumboElement` and `GumboText`, so that we can
 * include one generic type in lists of children and cast as necessary
 * to subtypes.
 */
struct GumboInternalNode {
  /** The type of node that this is. */
  GumboNodeType type;

  /** Pointer back to parent node. Not owned. */
  GumboNode* parent;

  /** The index within the parent's children vector of this node. */
  unsigned int index_within_parent;

  /**
   * A bitvector of flags containing information about why this element
   * was inserted into the parse tree, including a variety of special
   * parse situations.
   */
  GumboParseFlags parse_flags;

  /** The actual node data. */
  union {
    GumboDocument document;  // For GUMBO_NODE_DOCUMENT.
    GumboElement element;    // For GUMBO_NODE_ELEMENT.
    GumboText text;          // For everything else.
  } v;
};

/**
 * Input struct containing configuration options for the parser.
 * These let you specify alternate memory managers, provide different
 * error handling, etc. Use `kGumboDefaultOptions` for sensible
 * defaults and only set what you need.
 */
typedef struct GumboInternalOptions {
  /**
   * The tab-stop size, for computing positions in HTML files that
   * use tabs. Default: `8`.
   */
  int tab_stop;

  /**
   * Whether or not to stop parsing when the first error is encountered.
   * Default: `false`.
   */
  bool stop_on_first_error;

  /**
   * Maximum allowed number of attributes per element. If this limit is
   * exceeded, the parser will return early with a partial document and
   * the returned `GumboOutput` will have its `status` field set to
   * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
   * Default: `400`.
   */
  int max_attributes;

  /**
   * Maximum allowed depth for the parse tree. If this limit is exceeded,
   * the parser will return early with a partial document and the returned
   * `GumboOutput` will have its `status` field set to
   * `GUMBO_STATUS_TREE_TOO_DEEP`.
   * Default: `400`.
   */
  unsigned int max_tree_depth;

  /**
   * The maximum number of errors before the parser stops recording
   * them. This is provided so that if the page is totally borked, we
   * don't completely fill up the errors vector and exhaust memory with
   * useless redundant errors. Set to `-1` to disable the limit.
   * Default: `-1`.
   */
  int max_errors;

  /**
   * The fragment context for parsing:
   * https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
   *
   * If `NULL` is passed here, it is assumed to be "no
   * fragment", i.e. the regular parsing algorithm. Otherwise, pass the
   * tag name for the intended parent of the parsed fragment. We use the
   * tag name, namespace, and encoding attribute which are sufficient to
   * set all of the parsing context needed for fragment parsing.
   *
   * Default: `NULL`.
   */
  const char* fragment_context;

  /**
   * The namespace for the fragment context. This lets client code
   * differentiate between, say, parsing a `<title>` tag in SVG vs.
   * parsing it in HTML.
   *
   * Default: `GUMBO_NAMESPACE_HTML`.
   */
  GumboNamespaceEnum fragment_namespace;

  /**
   * The value of the fragment context's `encoding` attribute, if any.
   * Set to `NULL` for no `encoding` attribute.
   *
   * Default: `NULL`.
   */
  const char* fragment_encoding;

  /**
   * Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
   * be looked up using `gumbo_compute_quirks_mode()`.
   *
   * Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
   */
  GumboQuirksModeEnum quirks_mode;

  /**
   * For fragment parsing. Set this to true if the context node has a form
   * element as an ancestor.
   *
   * Default: `false`.
   */
  bool fragment_context_has_form_ancestor;
} GumboOptions;

/** Default options struct; use this with gumbo_parse_with_options. */
extern const GumboOptions kGumboDefaultOptions;

/**
 * Status code indicating whether parsing finished successfully or
 * was stopped mid-document due to exceptional circumstances.
 */
typedef enum {
  /**
   * Indicates that parsing completed successfully. The resulting tree
   * will be a complete document.
   */
  GUMBO_STATUS_OK,

  /**
   * Indicates that the maximum element nesting limit
   * (`GumboOptions::max_tree_depth`) was reached during parsing. The
   * resulting tree will be a partial document, with no further nodes
   * created after the point where the limit was reached. The partial
   * document may be useful for constructing an error message but
   * typically shouldn't be used for other purposes.
   */
  GUMBO_STATUS_TREE_TOO_DEEP,

  /**
   * Indicates that the maximum number of attributes per element
   * (`GumboOptions::max_attributes`) was reached during parsing. The
   * resulting tree will be a partial document, with no further nodes
   * created after the point where the limit was reached. The partial
   * document may be useful for constructing an error message but
   * typically shouldn't be used for other purposes.
   */
  GUMBO_STATUS_TOO_MANY_ATTRIBUTES,

  // Currently unused
  GUMBO_STATUS_OUT_OF_MEMORY,
} GumboOutputStatus;


/** The output struct containing the results of the parse. */
typedef struct GumboInternalOutput {
  /**
   * Pointer to the document node. This is a `GumboNode` of type
   * `NODE_DOCUMENT` that contains the entire document as its child.
   */
  GumboNode* document;

  /**
   * Pointer to the root node. This is the `<html>` tag that forms the
   * root of the document.
   */
  GumboNode* root;

  /**
   * A list of errors that occurred during the parse.
   */
  GumboVector /* GumboError */ errors;

  /**
   * True if the parser encountered an error.
   *
   * This can be true and `errors` an empty `GumboVector` if the `max_errors`
   * option was set to 0.
   */
  bool document_error;

  /**
   * A status code indicating whether parsing finished successfully or was
   * stopped mid-document due to exceptional circumstances.
   */
  GumboOutputStatus status;
} GumboOutput;

/**
 * Parses a buffer of UTF-8 text into an `GumboNode` parse tree. The
 * buffer must live at least as long as the parse tree, as some fields
 * (eg. `original_text`) point directly into the original buffer.
 *
 * This doesn't support buffers longer than 4 gigabytes.
 */
GumboOutput* gumbo_parse(const char* buffer);

/**
 * Extended version of `gumbo_parse` that takes an explicit options
 * structure, buffer, and length.
 */
GumboOutput* gumbo_parse_with_options (
  const GumboOptions* options,
  const char* buffer,
  size_t buffer_length
);

/**
 * Compute the quirks mode based on the name, public identifier, and system
 * identifier. Any of these may be `NULL` to indicate a missing value.
 */
GumboQuirksModeEnum gumbo_compute_quirks_mode (
  const char *name,
  const char *pubid,
  const char *sysid
);

/** Convert a `GumboOutputStatus` code into a readable description. */
const char* gumbo_status_to_string(GumboOutputStatus status);

/** Release the memory used for the parse tree and parse errors. */
void gumbo_destroy_output(GumboOutput* output);

/** Opaque GumboError type */
typedef struct GumboInternalError GumboError;

/**
 * Returns the position of the error.
 */
GumboSourcePosition gumbo_error_position(const GumboError* error);

/**
 * Returns a constant string representation of the error's code. This is owned
 * by the library and should not be freed by the caller.
 */
const char* gumbo_error_code(const GumboError* error);

/**
 * Prints an error to a string. This stores a freshly-allocated buffer
 * containing the error message text in output. The caller is responsible for
 * freeing the buffer. The size of the error message is returned. The error
 * message itself may not be NULL-terminated and may contain NULL bytes so the
 * returned size must be used.
 */
size_t gumbo_error_to_string(const GumboError* error, char **output);

/**
 * Prints a caret diagnostic to a string. This stores a freshly-allocated
 * buffer containing the error message text in output. The caller is responsible for
 * freeing the buffer. The size of the error message is returned. The error
 * message itself may not be NULL-terminated and may contain NULL bytes so the
 * returned size must be used.
 */
size_t gumbo_caret_diagnostic_to_string (
  const GumboError* error,
  const char* source_text,
  size_t source_length,
  char** output
);

/**
 * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
 * instead of writing to a string.
 */
void gumbo_print_caret_diagnostic (
  const GumboError* error,
  const char* source_text,
  size_t source_length
);

#ifdef __cplusplus
}
#endif

#endif // GUMBO_H