ext/nokogiri/gumbo.c
//
// Copyright 2013-2021 Sam Ruby, Stephen Checkoway
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//
// nokogumbo.c defines the following:
//
// class Nokogumbo
// def parse(utf8_string) # returns Nokogiri::HTML5::Document
// end
//
// Processing starts by calling gumbo_parse_with_options. The resulting document tree
// is then walked, a parallel libxml2 tree is constructed, and the final document is
// then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
// requirements as Ruby objects are only built when necessary.
//
#include <nokogiri.h>
#include "nokogiri_gumbo.h"
VALUE cNokogiriHtml5Document;
// Interned symbols
static ID internal_subset;
static ID parent;
/* Backwards compatibility to Ruby 2.1.0 */
#if RUBY_API_VERSION_CODE < 20200
#define ONIG_ESCAPE_UCHAR_COLLISION 1
#include <ruby/encoding.h>
static VALUE
rb_utf8_str_new(const char *str, long length)
{
return rb_enc_str_new(str, length, rb_utf8_encoding());
}
static VALUE
rb_utf8_str_new_cstr(const char *str)
{
return rb_enc_str_new_cstr(str, rb_utf8_encoding());
}
static VALUE
rb_utf8_str_new_static(const char *str, long length)
{
return rb_enc_str_new(str, length, rb_utf8_encoding());
}
#endif
#include <nokogiri.h>
#include <libxml/tree.h>
#include <libxml/HTMLtree.h>
// URI = system id
// external id = public id
static xmlDocPtr
new_html_doc(const char *dtd_name, const char *system, const char *public)
{
// These two libxml2 functions take the public and system ids in
// opposite orders.
htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
assert(doc);
if (dtd_name) {
xmlCreateIntSubset(doc, (const xmlChar *)dtd_name, (const xmlChar *)public, (const xmlChar *)system);
}
return doc;
}
static xmlNodePtr
get_parent(xmlNodePtr node)
{
return node->parent;
}
static GumboOutput *
perform_parse(const GumboOptions *options, VALUE input)
{
assert(RTEST(input));
Check_Type(input, T_STRING);
GumboOutput *output = gumbo_parse_with_options(
options,
RSTRING_PTR(input),
RSTRING_LEN(input)
);
const char *status_string = gumbo_status_to_string(output->status);
switch (output->status) {
case GUMBO_STATUS_OK:
break;
case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
case GUMBO_STATUS_TREE_TOO_DEEP:
gumbo_destroy_output(output);
rb_raise(rb_eArgError, "%s", status_string);
case GUMBO_STATUS_OUT_OF_MEMORY:
gumbo_destroy_output(output);
rb_raise(rb_eNoMemError, "%s", status_string);
}
return output;
}
static xmlNsPtr
lookup_or_add_ns(
xmlDocPtr doc,
xmlNodePtr root,
const char *href,
const char *prefix
)
{
xmlNsPtr ns = xmlSearchNs(doc, root, (const xmlChar *)prefix);
if (ns) {
return ns;
}
return xmlNewNs(root, (const xmlChar *)href, (const xmlChar *)prefix);
}
static void
set_line(xmlNodePtr node, size_t line)
{
// libxml2 uses 65535 to mean look elsewhere for the line number on some
// nodes.
if (line < 65535) {
node->line = (unsigned short)line;
}
}
// Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted
// at gumbo_node.
static void
build_tree(
xmlDocPtr doc,
xmlNodePtr xml_output_node,
const GumboNode *gumbo_node
)
{
xmlNodePtr xml_root = NULL;
xmlNodePtr xml_node = xml_output_node;
size_t child_index = 0;
while (true) {
assert(gumbo_node != NULL);
const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT ?
&gumbo_node->v.document.children : &gumbo_node->v.element.children;
if (child_index >= children->length) {
// Move up the tree and to the next child.
if (xml_node == xml_output_node) {
// We've built as much of the tree as we can.
return;
}
child_index = gumbo_node->index_within_parent + 1;
gumbo_node = gumbo_node->parent;
xml_node = get_parent(xml_node);
// Children of fragments don't share the same root, so reset it and
// it'll be set below. In the non-fragment case, this will only happen
// after the html element has been finished at which point there are no
// further elements.
if (xml_node == xml_output_node) {
xml_root = NULL;
}
continue;
}
const GumboNode *gumbo_child = children->data[child_index++];
xmlNodePtr xml_child;
switch (gumbo_child->type) {
case GUMBO_NODE_DOCUMENT:
abort(); // Bug in Gumbo.
case GUMBO_NODE_TEXT:
case GUMBO_NODE_WHITESPACE:
xml_child = xmlNewDocText(doc, (const xmlChar *)gumbo_child->v.text.text);
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;
case GUMBO_NODE_CDATA:
xml_child = xmlNewCDataBlock(doc, (const xmlChar *)gumbo_child->v.text.text,
(int) strlen(gumbo_child->v.text.text));
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;
case GUMBO_NODE_COMMENT:
xml_child = xmlNewDocComment(doc, (const xmlChar *)gumbo_child->v.text.text);
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;
case GUMBO_NODE_TEMPLATE:
// XXX: Should create a template element and a new DocumentFragment
case GUMBO_NODE_ELEMENT: {
xml_child = xmlNewDocNode(doc, NULL, (const xmlChar *)gumbo_child->v.element.name, NULL);
set_line(xml_child, gumbo_child->v.element.start_pos.line);
if (xml_root == NULL) {
xml_root = xml_child;
}
xmlNsPtr ns = NULL;
switch (gumbo_child->v.element.tag_namespace) {
case GUMBO_NAMESPACE_HTML:
break;
case GUMBO_NAMESPACE_SVG:
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/svg", "svg");
break;
case GUMBO_NAMESPACE_MATHML:
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1998/Math/MathML", "math");
break;
}
if (ns != NULL) {
xmlSetNs(xml_child, ns);
}
xmlAddChild(xml_node, xml_child);
// Add the attributes.
const GumboVector *attrs = &gumbo_child->v.element.attributes;
for (size_t i = 0; i < attrs->length; i++) {
const GumboAttribute *attr = attrs->data[i];
switch (attr->attr_namespace) {
case GUMBO_ATTR_NAMESPACE_XLINK:
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1999/xlink", "xlink");
break;
case GUMBO_ATTR_NAMESPACE_XML:
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/XML/1998/namespace", "xml");
break;
case GUMBO_ATTR_NAMESPACE_XMLNS:
ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/xmlns/", "xmlns");
break;
default:
ns = NULL;
}
xmlNewNsProp(xml_child, ns, (const xmlChar *)attr->name, (const xmlChar *)attr->value);
}
// Add children for this element.
child_index = 0;
gumbo_node = gumbo_child;
xml_node = xml_child;
}
}
}
}
static void
add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
{
const char *input_str = RSTRING_PTR(input);
size_t input_len = RSTRING_LEN(input);
// Add parse errors to rdoc.
if (output->errors.length) {
const GumboVector *errors = &output->errors;
VALUE rerrors = rb_ary_new2(errors->length);
for (size_t i = 0; i < errors->length; i++) {
GumboError *err = errors->data[i];
GumboSourcePosition position = gumbo_error_position(err);
char *msg;
size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
VALUE err_str = rb_utf8_str_new(msg, size);
free(msg);
VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
const char *error_code = gumbo_error_code(err);
VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
rb_iv_set(syntax_error, "@file", url);
rb_iv_set(syntax_error, "@line", SIZET2NUM(position.line));
rb_iv_set(syntax_error, "@str1", str1);
rb_iv_set(syntax_error, "@str2", Qnil);
rb_iv_set(syntax_error, "@str3", Qnil);
rb_iv_set(syntax_error, "@int1", INT2NUM(0));
rb_iv_set(syntax_error, "@column", SIZET2NUM(position.column));
rb_ary_push(rerrors, syntax_error);
}
rb_iv_set(rdoc, "@errors", rerrors);
}
}
typedef struct {
GumboOutput *output;
VALUE input;
VALUE url_or_frag;
VALUE klass;
xmlDocPtr doc;
} ParseArgs;
static VALUE
parse_cleanup(VALUE parse_args)
{
ParseArgs *args = (ParseArgs *)parse_args;
gumbo_destroy_output(args->output);
// Make sure garbage collection doesn't mark the objects as being live based
// on references from the ParseArgs. This may be unnecessary.
args->input = Qnil;
args->url_or_frag = Qnil;
if (args->doc != NULL) {
xmlFreeDoc(args->doc);
}
return Qnil;
}
static VALUE parse_continue(VALUE parse_args);
/*
* @!visibility protected
*/
static VALUE
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
{
GumboOptions options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(max_attributes);
options.max_errors = NUM2INT(max_errors);
options.max_tree_depth = NUM2INT(max_depth);
GumboOutput *output = perform_parse(&options, input);
ParseArgs args = {
.output = output,
.input = input,
.url_or_frag = url,
.klass = klass,
.doc = NULL,
};
return rb_ensure(parse_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args));
}
static VALUE
parse_continue(VALUE parse_args)
{
ParseArgs *args = (ParseArgs *)parse_args;
GumboOutput *output = args->output;
xmlDocPtr doc;
if (output->document->v.document.has_doctype) {
const char *name = output->document->v.document.name;
const char *public = output->document->v.document.public_identifier;
const char *system = output->document->v.document.system_identifier;
public = public[0] ? public : NULL;
system = system[0] ? system : NULL;
doc = new_html_doc(name, system, public);
} else {
doc = new_html_doc(NULL, NULL, NULL);
}
args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
build_tree(doc, (xmlNodePtr)doc, output->document);
VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
rb_iv_set(rdoc, "@url", args->url_or_frag);
rb_iv_set(rdoc, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
add_errors(output, rdoc, args->input, args->url_or_frag);
return rdoc;
}
static int
lookup_namespace(VALUE node, bool require_known_ns)
{
ID namespace, href;
CONST_ID(namespace, "namespace");
CONST_ID(href, "href");
VALUE ns = rb_funcall(node, namespace, 0);
if (NIL_P(ns)) {
return GUMBO_NAMESPACE_HTML;
}
ns = rb_funcall(ns, href, 0);
assert(RTEST(ns));
Check_Type(ns, T_STRING);
const char *href_ptr = RSTRING_PTR(ns);
size_t href_len = RSTRING_LEN(ns);
#define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) {
return GUMBO_NAMESPACE_HTML;
}
if (NAMESPACE_P("http://www.w3.org/1998/Math/MathML")) {
return GUMBO_NAMESPACE_MATHML;
}
if (NAMESPACE_P("http://www.w3.org/2000/svg")) {
return GUMBO_NAMESPACE_SVG;
}
#undef NAMESPACE_P
if (require_known_ns) {
rb_raise(rb_eArgError, "Unexpected namespace URI \"%*s\"", (int)href_len, href_ptr);
}
return -1;
}
static xmlNodePtr
extract_xml_node(VALUE node)
{
xmlNodePtr xml_node;
Noko_Node_Get_Struct(node, xmlNode, xml_node);
return xml_node;
}
static VALUE fragment_continue(VALUE parse_args);
/*
* @!visibility protected
*/
static VALUE
fragment(
VALUE self,
VALUE doc_fragment,
VALUE tags,
VALUE ctx,
VALUE max_attributes,
VALUE max_errors,
VALUE max_depth
)
{
ID name = rb_intern_const("name");
const char *ctx_tag;
GumboNamespaceEnum ctx_ns;
GumboQuirksModeEnum quirks_mode;
bool form = false;
const char *encoding = NULL;
if (NIL_P(ctx)) {
ctx_tag = "body";
ctx_ns = GUMBO_NAMESPACE_HTML;
} else if (TYPE(ctx) == T_STRING) {
ctx_tag = StringValueCStr(ctx);
ctx_ns = GUMBO_NAMESPACE_HTML;
size_t len = RSTRING_LEN(ctx);
const char *colon = memchr(ctx_tag, ':', len);
if (colon) {
switch (colon - ctx_tag) {
case 3:
if (st_strncasecmp(ctx_tag, "svg", 3) != 0) {
goto error;
}
ctx_ns = GUMBO_NAMESPACE_SVG;
break;
case 4:
if (st_strncasecmp(ctx_tag, "html", 4) == 0) {
ctx_ns = GUMBO_NAMESPACE_HTML;
} else if (st_strncasecmp(ctx_tag, "math", 4) == 0) {
ctx_ns = GUMBO_NAMESPACE_MATHML;
} else {
goto error;
}
break;
default:
error:
rb_raise(rb_eArgError, "Invalid context namespace '%*s'", (int)(colon - ctx_tag), ctx_tag);
}
ctx_tag = colon + 1;
} else {
// For convenience, put 'svg' and 'math' in their namespaces.
if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0) {
ctx_ns = GUMBO_NAMESPACE_SVG;
} else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0) {
ctx_ns = GUMBO_NAMESPACE_MATHML;
}
}
// Check if it's a form.
form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
} else {
ID element_ = rb_intern_const("element?");
// Context fragment name.
VALUE tag_name = rb_funcall(ctx, name, 0);
assert(RTEST(tag_name));
Check_Type(tag_name, T_STRING);
ctx_tag = StringValueCStr(tag_name);
// Context fragment namespace.
ctx_ns = lookup_namespace(ctx, true);
// Check for a form ancestor, including self.
for (VALUE node = ctx;
!NIL_P(node);
node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
if (!RTEST(rb_funcall(node, element_, 0))) {
continue;
}
VALUE element_name = rb_funcall(node, name, 0);
if (RSTRING_LEN(element_name) == 4
&& !st_strcasecmp(RSTRING_PTR(element_name), "form")
&& lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
form = true;
break;
}
}
// Encoding.
if (ctx_ns == GUMBO_NAMESPACE_MATHML
&& RSTRING_LEN(tag_name) == 14
&& !st_strcasecmp(ctx_tag, "annotation-xml")) {
VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
1,
rb_utf8_str_new_static("encoding", 8));
if (RTEST(enc)) {
Check_Type(enc, T_STRING);
encoding = StringValueCStr(enc);
}
}
}
// Quirks mode.
VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
VALUE dtd = rb_funcall(doc, internal_subset, 0);
VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
} else if (NIL_P(dtd)) {
quirks_mode = GUMBO_DOCTYPE_QUIRKS;
} else {
VALUE dtd_name = rb_funcall(dtd, name, 0);
VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
quirks_mode = gumbo_compute_quirks_mode(
NIL_P(dtd_name) ? NULL : StringValueCStr(dtd_name),
NIL_P(pubid) ? NULL : StringValueCStr(pubid),
NIL_P(sysid) ? NULL : StringValueCStr(sysid)
);
}
// Perform a fragment parse.
int depth = NUM2INT(max_depth);
GumboOptions options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(max_attributes);
options.max_errors = NUM2INT(max_errors);
// Add one to account for the HTML element.
options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
options.fragment_context = ctx_tag;
options.fragment_namespace = ctx_ns;
options.fragment_encoding = encoding;
options.quirks_mode = quirks_mode;
options.fragment_context_has_form_ancestor = form;
GumboOutput *output = perform_parse(&options, tags);
ParseArgs args = {
.output = output,
.input = tags,
.url_or_frag = doc_fragment,
.doc = (xmlDocPtr)extract_xml_node(doc),
};
rb_ensure(fragment_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args));
return Qnil;
}
static VALUE
fragment_continue(VALUE parse_args)
{
ParseArgs *args = (ParseArgs *)parse_args;
GumboOutput *output = args->output;
VALUE doc_fragment = args->url_or_frag;
xmlDocPtr xml_doc = args->doc;
args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
build_tree(xml_doc, xml_frag, output->root);
rb_iv_set(doc_fragment, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
return Qnil;
}
// Initialize the Nokogumbo class and fetch constants we will use later.
void
noko_init_gumbo(void)
{
// Class constants.
cNokogiriHtml5Document = rb_define_class_under(mNokogiriHtml5, "Document", cNokogiriHtml4Document);
rb_gc_register_mark_object(cNokogiriHtml5Document);
// Interned symbols.
internal_subset = rb_intern_const("internal_subset");
parent = rb_intern_const("parent");
// Define Nokogumbo module with parse and fragment methods.
rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
}
// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: