sparklemotion/nokogiri

View on GitHub
ext/java/nokogiri/XmlDtd.java

Summary

Maintainability
D
2 days
Test Coverage
package nokogiri;

import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
import static nokogiri.internals.NokogiriHelpers.nonEmptyStringOrNil;
import static nokogiri.internals.NokogiriHelpers.stringOrNil;
import static org.jruby.runtime.Helpers.invoke;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.xerces.xni.QName;
import org.cyberneko.dtd.DTDConfiguration;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyClass;
import org.jruby.RubyHash;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;

import nokogiri.internals.NokogiriHelpers;
import nokogiri.internals.SaveContextVisitor;

/**
 * Class for Nokogiri::XML::DTD
 *
 * @author sergio
 * @author Patrick Mahoney <pat@polycrystal.org>
 * @author Yoko Harada <yokolet@gmail.com>
 */
@JRubyClass(name = "Nokogiri::XML::DTD", parent = "Nokogiri::XML::Node")
public class XmlDtd extends XmlNode
{
  /** Serialization version identifier. */
  private static final long serialVersionUID = 1L;

  /**
   * Cache of children: a Nokogiri::XML::NodeSet, or nil when there is
   * no backing node. Stays <code>null</code> until the declarations
   * have been extracted for the first time.
   */
  protected IRubyObject children = null;

  /** Cache of attribute name => XmlAttributeDecl; <code>null</code> until extracted. */
  protected RubyHash attributes = null;

  /** Cache of element name => XmlElementDecl; <code>null</code> until extracted. */
  protected RubyHash elements = null;

  /** Cache of entity name => XmlEntityDecl; <code>null</code> until extracted. */
  protected RubyHash entities = null;

  /** Cache of notation name => Nokogiri::XML::Notation; <code>null</code> until extracted. */
  protected RubyHash notations = null;

  /** The Nokogiri::XML::Notation Ruby class, looked up in setNode(). */
  protected RubyClass notationClass;

  /**
   * Temporary store of content models (keyed by element name) before
   * they are added to their XmlElementDecl.
   */
  protected RubyHash contentModels;

  /** Node name of the DTD, or nil. */
  protected IRubyObject name;

  /** Public ID (reported to Ruby as the external ID), or nil. */
  protected IRubyObject pubId;

  /** System ID, or nil. */
  protected IRubyObject sysId;

  /**
   * Creates a DTD object by delegating to the XmlNode constructor
   * without supplying a DOM node.
   *
   * @param ruby      the JRuby runtime
   * @param rubyClass the Ruby class of the new object
   */
  public
  XmlDtd(Ruby ruby, RubyClass rubyClass)
  {
    super(ruby, rubyClass);
  }

  /**
   * Attaches <code>dtd</code> as the backing node of this object and
   * refreshes the cached name, public id and system id.
   *
   * The three ids are first reset to nil. They are then read from the
   * doctype of the document that owns <code>dtd</code>. If
   * <code>dtd</code> has no owner document but is itself a
   * DocumentType, its own values are used instead.
   *
   * @param runtime the JRuby runtime
   * @param dtd     the node to wrap; may be <code>null</code>, in which
   *                case the ids stay nil
   */
  public void
  setNode(Ruby runtime, Node dtd)
  {
    this.node = dtd;
    notationClass = (RubyClass) runtime.getClassFromPath("Nokogiri::XML::Notation");

    name = pubId = sysId = runtime.getNil();
    if (dtd == null) { return; }

    // This is the dtd declaration stored in the document; it
    // contains the DTD name (root element) and public and system
    // ids. The actual declarations are in the NekoDTD 'dtd'
    // variable. I don't know of a way to consolidate the two.

    // getOwnerDocument() is null for nodes that are not (yet) owned by
    // a document, so it must not be dereferenced unconditionally.
    Document owner = dtd.getOwnerDocument();
    DocumentType otherDtd = null;
    if (owner != null) {
      otherDtd = owner.getDoctype();
    } else if (dtd instanceof DocumentType) {
      otherDtd = (DocumentType) dtd;
    }
    if (otherDtd == null) { return; }

    name = stringOrNil(runtime, otherDtd.getNodeName());
    pubId = nonEmptyStringOrNil(runtime, otherDtd.getPublicId());
    sysId = nonEmptyStringOrNil(runtime, otherDtd.getSystemId());
  }

  /**
   * Creates a DTD object for <code>dtd</code>: delegates to the
   * XmlNode constructor and then calls setNode() so the notation
   * class and the name/public id/system id fields are initialized.
   *
   * @param ruby      the JRuby runtime
   * @param rubyClass the Ruby class of the new object
   * @param dtd       the DOM node to wrap
   */
  public
  XmlDtd(Ruby ruby, RubyClass rubyClass, Node dtd)
  {
    super(ruby, rubyClass, dtd);
    setNode(ruby, dtd);
  }

  /**
   * Builds a DTD object without any declarations for <code>doc</code>.
   *
   * If the document has no doctype yet, one is created from the given
   * name and ids and appended to the document; otherwise the existing
   * doctype is reused as the backing node. In both cases the returned
   * object reports the name and ids passed in here.
   *
   * @param runtime     the JRuby runtime
   * @param doc         the document that receives (or already has) the doctype
   * @param name        Ruby string with the DTD name
   * @param external_id Ruby string with the public (external) id
   * @param system_id   Ruby string with the system id
   * @return the newly allocated Nokogiri::XML::DTD
   */
  public static XmlDtd
  newEmpty(Ruby runtime,
           Document doc,
           IRubyObject name,
           IRubyObject external_id,
           IRubyObject system_id)
  {
    DocumentType docType = doc.getDoctype();
    if (docType == null) {
      String dtdName = NokogiriHelpers.rubyStringToString(name);
      String publicId = NokogiriHelpers.rubyStringToString(external_id);
      String systemId = NokogiriHelpers.rubyStringToString(system_id);
      docType = doc.getImplementation().createDocumentType(dtdName, publicId, systemId);
      doc.appendChild(docType);
    }

    // FIXME: what if the document had a doc type, why are we here ?
    RubyClass dtdClass = getNokogiriClass(runtime, "Nokogiri::XML::DTD");
    XmlDtd result = (XmlDtd) NokogiriService.XML_DTD_ALLOCATOR.allocate(runtime, dtdClass);
    result.setNode(runtime, docType);

    // The caller-supplied values override whatever setNode() read
    // from the document's doctype.
    result.name = name;
    result.pubId = external_id;
    result.sysId = system_id;
    return result;
  }


  /**
   * Create an unparented element that contains DTD declarations
   * parsed from the internal subset attached as user data to
   * <code>doc</code>.  The attached dtd must be the tree from
   * NekoDTD. The owner document of the returned tree will be
   * <code>doc</code>.
   *
   * NekoDTD parser returns a new document node containing elements
   * representing the dtd declarations. The root dtd element is
   * located and imported into <code>doc</code>, stripping the
   * Document provided by NekoDTD.
   *
   * When no NekoDTD tree is attached, or it has no root dtd element,
   * the returned object wraps a <code>null</code> node.
   *
   * @param runtime the JRuby runtime
   * @param doc     the document carrying the raw DTD tree as user data
   * @return a newly allocated Nokogiri::XML::DTD (never <code>null</code>)
   */
  public static XmlDtd
  newFromInternalSubset(Ruby runtime, Document doc)
  {
    Object rawTree = doc.getUserData(XmlDocument.DTD_RAW_DOCUMENT);
    Node subset = (rawTree == null) ? null : getInternalSubset((Node) rawTree);

    // Import the node into doc so it has the correct owner document.
    Node owned = (subset == null) ? null : doc.importNode(subset, true);

    RubyClass dtdClass = getNokogiriClass(runtime, "Nokogiri::XML::DTD");
    XmlDtd result = (XmlDtd) NokogiriService.XML_DTD_ALLOCATOR.allocate(runtime, dtdClass);
    result.setNode(runtime, owned);
    return result;
  }

  /**
   * Builds a DTD object from the external subset found in the NekoDTD
   * tree attached as user data to <code>doc</code>.
   *
   * @param runtime the JRuby runtime
   * @param doc     the document carrying the raw DTD tree as user data
   * @return a newly allocated Nokogiri::XML::DTD wrapping a copy of the
   *         external subset imported into <code>doc</code>, or nil when
   *         no tree is attached, no external subset exists, or the
   *         external subset has no child nodes
   */
  public static IRubyObject
  newFromExternalSubset(Ruby runtime, Document doc)
  {
    Object rawTree = doc.getUserData(XmlDocument.DTD_RAW_DOCUMENT);
    if (rawTree == null) { return runtime.getNil(); }

    Node subset = getExternalSubset((Node) rawTree);
    if (subset == null || !subset.hasChildNodes()) { return runtime.getNil(); }

    // Import the node into doc so it has the correct owner document.
    Node owned = doc.importNode(subset, true);
    RubyClass dtdClass = getNokogiriClass(runtime, "Nokogiri::XML::DTD");
    XmlDtd result = (XmlDtd) NokogiriService.XML_DTD_ALLOCATOR.allocate(runtime, dtdClass);
    result.setNode(runtime, owned);
    return result;
  }

  /**
   * Finds the root dtd element among the direct children of a NekoDTD
   * tree.
   *
   * <code>dtdTree</code> is the document node of a NekoDTD tree.
   * NekoDTD tree looks like this:
   *
   * <code><pre>
   * [#document: null]
   *   [#comment: ...]
   *   [#comment: ...]
   *   [dtd: null]   // a DocumentType; isDTD(node) => false
   *   [dtd: null]   // root of dtd, an Element node; isDTD(node) => true
   *     ... decls, content models, etc. ...
   *     [externalSubset: null] pubid="the pubid" sysid="the sysid"
   *       ... external subset decls, etc. ...
   * </pre></code>
   *
   * @param dtdTree the document node of the NekoDTD tree
   * @return the first child for which isDTD() holds, or
   *         <code>null</code> if there is none
   */
  protected static Node
  getInternalSubset(Node dtdTree)
  {
    Node candidate = dtdTree.getFirstChild();
    // Skip siblings (comments, the DocumentType) until the dtd Element.
    while (candidate != null && !isDTD(candidate)) {
      candidate = candidate.getNextSibling();
    }
    return candidate;
  }

  /**
   * Finds the 'externalSubset' node among the direct children of the
   * root dtd element of a NekoDTD tree.
   *
   * @param dtdTree the document node of the NekoDTD tree
   * @return the first matching child, or <code>null</code> when the
   *         tree has no root dtd element or no external subset
   */
  protected static Node
  getExternalSubset(Node dtdTree)
  {
    Node root = getInternalSubset(dtdTree);
    if (root == null) { return null; }

    Node candidate = root.getFirstChild();
    while (candidate != null && !isExternalSubset(candidate)) {
      candidate = candidate.getNextSibling();
    }
    return candidate;
  }

  /**
   * Returns the hash of attribute declarations keyed by attribute
   * name, extracting all declarations on first use.
   *
   * This overrides the #attributes method defined in
   * lib/nokogiri/xml/node.rb.
   */
  @JRubyMethod
  public IRubyObject
  attributes(ThreadContext context)
  {
    if (attributes != null) { return attributes; }

    extractDecls(context);
    return attributes;
  }

  /**
   * Returns the hash of element declarations keyed by element name,
   * extracting all declarations on first use.
   */
  @JRubyMethod
  public IRubyObject
  elements(ThreadContext context)
  {
    if (elements != null) { return elements; }

    extractDecls(context);
    return elements;
  }

  /**
   * Returns the hash of entity declarations keyed by entity name,
   * extracting all declarations on first use.
   */
  @JRubyMethod
  public IRubyObject
  entities(ThreadContext context)
  {
    if (entities != null) { return entities; }

    extractDecls(context);
    return entities;
  }

  /**
   * Returns the hash of notations keyed by notation name, extracting
   * all declarations on first use.
   */
  @JRubyMethod
  public IRubyObject
  notations(ThreadContext context)
  {
    if (notations != null) { return notations; }

    extractDecls(context);
    return notations;
  }

  /**
   * Returns the declarations of this DTD as its children, extracting
   * them on first use.
   *
   * Our "node" object is as-returned by NekoDTD.  The actual
   * "children" that we're interested in (Attribute declarations,
   * etc.) are a few layers deep, so they are collected by
   * extractDecls() rather than read directly from the node.
   */
  @Override
  @JRubyMethod
  public IRubyObject
  children(ThreadContext context)
  {
    if (children != null) { return children; }

    extractDecls(context);
    return children;
  }

  /**
   * Returns the cached name of the dtd (nil when none was found).
   */
  @Override
  @JRubyMethod
  public IRubyObject
  node_name(ThreadContext context)
  {
    return this.name;
  }

  /**
   * Rejects any attempt to rename the dtd.
   *
   * @throws org.jruby.exceptions.RaiseException always (a Ruby RuntimeError)
   */
  @Override
  @JRubyMethod(name = "node_name=")
  public IRubyObject
  node_name_set(ThreadContext context, IRubyObject name)
  {
    Ruby runtime = context.getRuntime();
    throw runtime.newRuntimeError("cannot change name of DTD");
  }

  /**
   * Returns the cached system id of the dtd (nil when absent).
   */
  @JRubyMethod
  public IRubyObject
  system_id(ThreadContext context)
  {
    return this.sysId;
  }

  /**
   * Returns the cached public id of the dtd, which Ruby exposes as
   * the external id (nil when absent).
   */
  @JRubyMethod
  public IRubyObject
  external_id(ThreadContext context)
  {
    return this.pubId;
  }

  /**
   * Returns the errors recorded on <code>doc</code>.
   *
   * No validation is performed here: when <code>doc</code> is an
   * XmlDocument, the array stored in its <code>@errors</code> instance
   * variable is returned as-is. In every other case, including an
   * unset or non-array <code>@errors</code>, a new empty array is
   * returned so the caller never receives Java <code>null</code> and
   * no ClassCastException is raised.
   *
   * @param context the current thread context
   * @param doc     the document whose errors are reported
   * @return a Ruby array of errors, possibly empty
   */
  @JRubyMethod
  public IRubyObject
  validate(ThreadContext context, IRubyObject doc)
  {
    if (doc instanceof XmlDocument) {
      IRubyObject recorded = ((XmlDocument) doc).getInstanceVariable("@errors");
      // The variable may be unset (null) or hold something other than
      // an array (e.g. nil); only hand back a genuine array.
      if (recorded instanceof RubyArray) {
        return recorded;
      }
    }
    return RubyArray.newArray(context.getRuntime());
  }

  /**
   * Tells whether the node name of <code>node</code> equals the local
   * part of <code>name</code>.
   */
  public static boolean
  nameEquals(Node node, QName name)
  {
    String expected = name.localpart;
    return expected.equals(node.getNodeName());
  }

  /**
   * Tells whether <code>node</code> is named like NekoDTD's external
   * subset element.
   */
  public static boolean
  isExternalSubset(Node node)
  {
    QName expected = DTDConfiguration.E_EXTERNAL_SUBSET;
    return nameEquals(node, expected);
  }

  /**
   * Tells whether <code>node</code> is the root dtd element of a
   * NekoDTD tree.
   *
   * The instanceof Element check makes this return false for a
   * DocumentType node (NekoDTD uses Element for all its nodes).
   */
  public static boolean
  isDTD(Node node)
  {
    if (!(node instanceof Element)) { return false; }

    return nameEquals(node, DTDConfiguration.E_DTD);
  }

  /**
   * Tells whether <code>node</code> is named like NekoDTD's attribute
   * declaration element.
   */
  public static boolean
  isAttributeDecl(Node node)
  {
    QName expected = DTDConfiguration.E_ATTRIBUTE_DECL;
    return nameEquals(node, expected);
  }

  /**
   * Tells whether <code>node</code> is named like NekoDTD's element
   * declaration element.
   */
  public static boolean
  isElementDecl(Node node)
  {
    QName expected = DTDConfiguration.E_ELEMENT_DECL;
    return nameEquals(node, expected);
  }

  /**
   * Tells whether <code>node</code> is named like either of NekoDTD's
   * internal or unparsed entity declaration elements.
   */
  public static boolean
  isEntityDecl(Node node)
  {
    if (nameEquals(node, DTDConfiguration.E_INTERNAL_ENTITY_DECL)) {
      return true;
    }
    return nameEquals(node, DTDConfiguration.E_UNPARSED_ENTITY_DECL);
  }

  /**
   * Tells whether <code>node</code> is named like NekoDTD's notation
   * declaration element.
   */
  public static boolean
  isNotationDecl(Node node)
  {
    QName expected = DTDConfiguration.E_NOTATION_DECL;
    return nameEquals(node, expected);
  }

  /**
   * Tells whether <code>node</code> is named like NekoDTD's content
   * model element.
   */
  public static boolean
  isContentModel(Node node)
  {
    QName expected = DTDConfiguration.E_CONTENT_MODEL;
    return nameEquals(node, expected);
  }

  /**
   * Extracts the various DTD declarations from the backing node and
   * fills the caches (attributes, elements, entities, notations,
   * content models and children).
   *
   * All caches are reset first. Without a backing node the hashes
   * stay empty and children is nil. Otherwise, once the declarations
   * are collected, every attribute declaration is appended to the
   * element declaration it names, and every content model is set on
   * its element declaration unless that declaration reports itself as
   * empty.
   */
  protected void
  extractDecls(ThreadContext context)
  {
    Ruby runtime = context.runtime;

    // start from fresh, empty caches
    children = runtime.getNil();
    attributes = RubyHash.newHash(runtime);
    elements = RubyHash.newHash(runtime);
    entities = RubyHash.newHash(runtime);
    notations = RubyHash.newHash(runtime);
    contentModels = RubyHash.newHash(runtime);

    // nothing to walk: leave all the decl hashes empty
    if (node == null) { return; }

    // walk the tree and expose every collected decl as a NodeSet
    IRubyObject[] allDecls = extractDecls(context, node.getFirstChild());
    children = XmlNodeSet.newNodeSet(runtime, allDecls);

    // attach each attribute decl to the element decl it belongs to
    RubyArray<?> attrNames = attributes.keys();
    for (int idx = 0; idx < attrNames.getLength(); ++idx) {
      IRubyObject attr = attributes.op_aref(context, attrNames.entry(idx));
      if (!attr.isNil()) {
        XmlAttributeDecl attrDecl = (XmlAttributeDecl) attr;
        IRubyObject owner = elements.op_aref(context, attrDecl.element_name(context));
        if (!owner.isNil()) {
          ((XmlElementDecl) owner).appendAttrDecl(attrDecl);
        }
      }
    }

    // hand each content model to the element decl of the same name
    RubyArray<?> modelNames = contentModels.keys();
    for (int idx = 0; idx < modelNames.getLength(); ++idx) {
      IRubyObject elementName = modelNames.entry(idx);
      IRubyObject model = contentModels.op_aref(context, elementName);

      IRubyObject owner = elements.op_aref(context, elementName);
      if (!owner.isNil() && !((XmlElementDecl) owner).isEmpty()) {
        ((XmlElementDecl) owner).setContentModel(model);
      }
    }
  }

  /**
   * Walks <code>node</code> and its following siblings, registering
   * each declaration in the matching cache and returning the
   * declarations in document order.
   *
   * The <code>node</code> is either the first child of the root dtd
   * node (as returned by getInternalSubset()) or the first child of
   * the external subset node (as returned by getExternalSubset()).
   *
   * This recursive function will not descend into an
   * 'externalSubset' node (it stops at the first one it meets), thus
   * for an internal subset it only extracts nodes in the internal
   * subset, and for an external subset it extracts everything and
   * assumes <code>node</code> and all children are part of the
   * external subset.
   *
   * Content models are stored in the contentModels hash only; they
   * are not part of the returned array. Nodes of any other kind are
   * descended into.
   *
   * @param context the current thread context
   * @param node    the first node to inspect; may be <code>null</code>
   * @return the attribute, element, entity and notation declarations found
   */
  protected IRubyObject[]
  extractDecls(ThreadContext context, Node node)
  {
    List<IRubyObject> collected = new ArrayList<IRubyObject>();

    for (Node current = node; current != null; current = current.getNextSibling()) {
      // everything from the external subset onwards belongs elsewhere
      if (isExternalSubset(current)) { break; }

      if (isAttributeDecl(current)) {
        XmlAttributeDecl attrDecl = XmlAttributeDecl.create(context, current);
        attributes.op_aset(context, attrDecl.attribute_name(context), attrDecl);
        collected.add(attrDecl);
        continue;
      }

      if (isElementDecl(current)) {
        XmlElementDecl elemDecl = XmlElementDecl.create(context, current);
        elements.op_aset(context, elemDecl.element_name(context), elemDecl);
        collected.add(elemDecl);
        continue;
      }

      if (isEntityDecl(current)) {
        XmlEntityDecl entityDecl = XmlEntityDecl.create(context, current);
        entities.op_aset(context, entityDecl.node_name(context), entityDecl);
        collected.add(entityDecl);
        continue;
      }

      if (isNotationDecl(current)) {
        // wrap the DOM node so its attributes can be read as Ruby values
        XmlNode wrapper = (XmlNode) NokogiriHelpers.constructNode(context.getRuntime(), current);
        IRubyObject notation = invoke(context, notationClass, "new",
                                      wrapper.getAttribute(context, "name"),
                                      wrapper.getAttribute(context, "pubid"),
                                      wrapper.getAttribute(context, "sysid"));
        notations.op_aset(context, wrapper.getAttribute(context, "name"), notation);
        collected.add(notation);
        continue;
      }

      if (isContentModel(current)) {
        XmlDocument owner = (XmlDocument) document(context);
        XmlElementContent model = new XmlElementContent(context.getRuntime(), owner, current);
        contentModels.op_aset(context, model.element_name(context), model);
        continue;
      }

      // any other node: recurse into its children
      IRubyObject[] nested = extractDecls(context, current.getFirstChild());
      collected.addAll(Arrays.asList(nested));
    }

    return collected.toArray(new IRubyObject[collected.size()]);
  }

  /**
   * Serializes this DTD by passing the owner document's doctype to
   * <code>visitor</code>.
   *
   * Since we use nekoDTD to parse dtd, the backing node might be an
   * ElementImpl rather than a DocumentType. An external subset doesn't
   * need to show up, so this method only visits the document's
   * doctype.
   *
   * Nothing is emitted when this object has no backing node (as
   * created by newFromInternalSubset() for documents without a DTD),
   * when the node has no owner document, or when that document has no
   * doctype.
   *
   * @param context the current thread context
   * @param visitor the visitor that receives the doctype
   */
  @Override
  public void
  accept(ThreadContext context, SaveContextVisitor visitor)
  {
    if (node == null) { return; }

    Document owner = node.getOwnerDocument();
    if (owner == null) { return; }

    DocumentType docType = owner.getDoctype();
    if (docType == null) { return; }

    visitor.enter(docType);
    visitor.leave(docType);
  }
}