sparklemotion/nokogiri

View on GitHub
ext/java/nokogiri/XmlDocument.java

Summary

Maintainability
F
3 days
Test Coverage
package nokogiri;

import static nokogiri.internals.NokogiriHelpers.clearXpathContext;
import static nokogiri.internals.NokogiriHelpers.getCachedNodeOrCreate;
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
import static nokogiri.internals.NokogiriHelpers.isNamespace;
import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
import static nokogiri.internals.NokogiriHelpers.stringOrNil;

import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.jcodings.specific.USASCIIEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
import org.jruby.RubyString;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.exceptions.RaiseException;
import org.jruby.javasupport.JavaUtil;
import org.jruby.runtime.Block;
import org.jruby.runtime.Helpers;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.Visibility;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.ByteList;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import nokogiri.internals.NokogiriHelpers;
import nokogiri.internals.NokogiriNamespaceCache;
import nokogiri.internals.SaveContextVisitor;
import nokogiri.internals.XmlDomParserContext;
import nokogiri.internals.c14n.CanonicalFilter;
import nokogiri.internals.c14n.CanonicalizationException;
import nokogiri.internals.c14n.Canonicalizer;

/**
 * Class for Nokogiri::XML::Document
 *
 * @author sergio
 * @author Yoko Harada <yokolet@gmail.com>
 * @author John Shahid <jvshahid@gmail.com>
 */
@JRubyClass(name = "Nokogiri::XML::Document", parent = "Nokogiri::XML::Node")
public class XmlDocument extends XmlNode
{
  private static final long serialVersionUID = 1L;

  private NokogiriNamespaceCache nsCache;

  /* UserData keys for storing extra info in the document node. */
  public final static String DTD_RAW_DOCUMENT = "DTD_RAW_DOCUMENT";
  public final static String DTD_INTERNAL_SUBSET = "DTD_INTERNAL_SUBSET";
  public final static String DTD_EXTERNAL_SUBSET = "DTD_EXTERNAL_SUBSET";

  /* DocumentBuilderFactory implementation class name. This needs to set a classloader into it.
   * Setting an appropriate classloader resolves issue 380.
   */
  private static final String DOCUMENTBUILDERFACTORY_IMPLE_NAME = "org.apache.xerces.jaxp.DocumentBuilderFactoryImpl";

  private static final ByteList DOCUMENT = ByteList.create("document");
  static { DOCUMENT.setEncoding(USASCIIEncoding.INSTANCE); }

  private static boolean substituteEntities = false;
  private static boolean loadExternalSubset = false; // TODO: Verify this.

  /** cache variables */
  protected IRubyObject encoding;
  protected IRubyObject url;

  public
  XmlDocument(Ruby runtime, RubyClass klazz)
  {
    super(runtime, klazz, createNewDocument(runtime));
  }

  public
  XmlDocument(Ruby runtime, Document document)
  {
    this(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Document"), document);
  }

  public
  XmlDocument(Ruby runtime, RubyClass klass, Document document)
  {
    super(runtime, klass, document);
    init(runtime, document);
  }

  void
  init(Ruby runtime, Document document)
  {
    stabilizeTextContent(document);
    if (document.getDocumentElement() != null) {
      createAndCacheNamespaces(runtime, document.getDocumentElement());
    }
    setInstanceVariable("@decorators", runtime.getNil());
  }

  public final void
  setDocumentNode(Ruby runtime, Document node)
  {
    super.setNode(runtime, node);
    if (node != null) { init(runtime, node); }
    else { setInstanceVariable("@decorators", runtime.getNil()); }
  }

  public void
  setEncoding(IRubyObject encoding)
  {
    this.encoding = encoding;
  }

  public IRubyObject
  getEncoding()
  {
    return encoding;
  }

  // not sure, but like attribute values, text value will be lost
  // unless it is referred once before this document is used.
  // this seems to happen only when the fragment is parsed from Node#in_context.
  protected static void
  stabilizeTextContent(Document document)
  {
    if (document.getDocumentElement() != null) { document.getDocumentElement().getTextContent(); }
  }

  private static void
  createAndCacheNamespaces(Ruby runtime, Node node)
  {
    if (node.hasAttributes()) {
      NamedNodeMap nodeMap = node.getAttributes();
      for (int i = 0; i < nodeMap.getLength(); i++) {
        Node n = nodeMap.item(i);
        if (n instanceof Attr) {
          Attr attr = (Attr) n;
          stabilizeAttr(attr);
          if (isNamespace(attr.getName())) {
            // create and cache
            XmlNamespace.createFromAttr(runtime, attr);
          }
        }
      }
    }
    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      createAndCacheNamespaces(runtime, children.item(i));
    }
  }

  static void
  stabilizeAttr(final Attr attr)
  {
    // TODO not sure, but need to get value always before document is referred or lose attribute value
    attr.getValue(); // don't delete this line
  }

  // When a document is created from fragment with a context (reference) document,
  // namespace should be resolved based on the context document.
  public
  XmlDocument(Ruby ruby, RubyClass klass, Document document, XmlDocument contextDoc)
  {
    super(ruby, klass, document);
    nsCache = contextDoc.getNamespaceCache();
    String default_href = nsCache.getDefault().getHref();
    resolveNamespaceIfNecessary(document.getDocumentElement(), default_href);
  }

  private void
  resolveNamespaceIfNecessary(Node node, String default_href)
  {
    if (node == null) { return; }
    String nodePrefix = node.getPrefix();
    if (nodePrefix == null) { // default namespace
      NokogiriHelpers.renameNode(node, default_href, node.getNodeName());
    } else {
      String href = getNamespaceCache().get(node, nodePrefix).getHref();
      NokogiriHelpers.renameNode(node, href, node.getNodeName());
    }
    resolveNamespaceIfNecessary(node.getNextSibling(), default_href);
    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      resolveNamespaceIfNecessary(children.item(i), default_href);
    }
  }

  public NokogiriNamespaceCache
  getNamespaceCache()
  {
    if (nsCache == null) { nsCache = new NokogiriNamespaceCache(); }
    return nsCache;
  }

  public Document
  getDocument()
  {
    return (Document) node;
  }

  @Override
  protected IRubyObject
  getNodeName(ThreadContext context)
  {
    if (name == null) { name = RubyString.newStringShared(context.runtime, DOCUMENT); }
    return name;
  }

  public void
  setUrl(IRubyObject url)
  {
    this.url = url;
  }

  protected IRubyObject
  getUrl()
  {
    return this.url;
  }

  @JRubyMethod
  public IRubyObject
  url(ThreadContext context)
  {
    return getUrl();
  }

  public static Document
  createNewDocument(final Ruby runtime)
  {
    try {
      return DocumentBuilderFactoryHolder.INSTANCE.newDocumentBuilder().newDocument();
    } catch (ParserConfigurationException e) {
      throw asRuntimeError(runtime, null, e);
    }
  }

  private static class DocumentBuilderFactoryHolder
  {
    static final DocumentBuilderFactory INSTANCE;
    static
    {
      INSTANCE = DocumentBuilderFactory.newInstance(DOCUMENTBUILDERFACTORY_IMPLE_NAME,
                 NokogiriService.class.getClassLoader());
    }
  }

  static RaiseException
  asRuntimeError(Ruby runtime, String message, Exception cause)
  {
    if (cause instanceof RaiseException) { return (RaiseException) cause; }

    if (message == null) { message = cause.toString(); }
    else { message = message + '(' + cause.toString() + ')'; }
    RaiseException ex = runtime.newRuntimeError(message);
    ex.initCause(cause);
    return ex;
  }

  /*
   * call-seq:
   *  new(version = default)
   *
   * Create a new document with +version+ (defaults to "1.0")
   */
  @JRubyMethod(name = "new", meta = true, rest = true, required = 0)
  public static IRubyObject
  rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args)
  {
    final Ruby runtime = context.runtime;
    XmlDocument xmlDocument;
    try {
      Document docNode = createNewDocument(runtime);
      if ("Nokogiri::HTML4::Document".equals(((RubyClass)klazz).getName())) {
        xmlDocument = new Html4Document(context.runtime, (RubyClass) klazz, docNode);
      } else {
        xmlDocument = new XmlDocument(context.runtime, (RubyClass) klazz, docNode);
      }
    } catch (Exception ex) {
      throw asRuntimeError(runtime, "couldn't create document: ", ex);
    }

    Helpers.invoke(context, xmlDocument, "initialize", args);

    return xmlDocument;
  }

  @JRubyMethod(required = 1, optional = 4)
  public IRubyObject
  create_entity(ThreadContext context, IRubyObject[] argv)
  {
    // FIXME: Entity node should be create by some right way.
    // this impl passes tests, but entity doesn't exists in DTD, which
    // would cause validation failure.
    if (argv.length == 0) { throw context.runtime.newRuntimeError("Could not create entity"); }
    String tagName = rubyStringToString(argv[0]);
    Node node = getOwnerDocument().createElement(tagName);
    return XmlEntityDecl.create(context, node, argv);
  }

  @Override
  XmlDocument
  document(Ruby runtime)
  {
    return this;
  }

  @JRubyMethod(name = "encoding=")
  public IRubyObject
  encoding_set(IRubyObject encoding)
  {
    this.encoding = encoding;
    return this;
  }

  @JRubyMethod
  public IRubyObject
  encoding(ThreadContext context)
  {
    if (this.encoding == null || this.encoding.isNil()) {
      final String enc = getDocument().getXmlEncoding();
      if (enc == null) {
        this.encoding = context.nil;
      } else {
        this.encoding = context.runtime.newString(enc);
      }
    }

    return this.encoding.isNil() ? this.encoding : this.encoding.asString().encode(context,
           context.getRuntime().newString("UTF-8"));
  }

  @JRubyMethod(meta = true)
  public static IRubyObject
  load_external_subsets_set(ThreadContext context, IRubyObject cls, IRubyObject value)
  {
    XmlDocument.loadExternalSubset = value.isTrue();
    return context.nil;
  }

  @JRubyMethod(meta = true, required = 4)
  public static IRubyObject
  read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args)
  {
    XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[2], args[3]);
    ctx.setIOInputSource(context, args[0], args[1]);
    return ctx.parse(context, (RubyClass) klass, args[1]);
  }

  @JRubyMethod(meta = true, required = 4)
  public static IRubyObject
  read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args)
  {
    XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[2], args[3]);
    ctx.setStringInputSource(context, args[0], args[1]);
    return ctx.parse(context, (RubyClass) klass, args[1]);
  }

  @JRubyMethod(name = "remove_namespaces!")
  public IRubyObject
  remove_namespaces(ThreadContext context)
  {
    removeNamespaceRecursively(this);
    if (nsCache != null) { nsCache.clear(); }
    clearXpathContext(getNode());
    return this;
  }

  private void
  removeNamespaceRecursively(XmlNode xmlNode)
  {
    Node node = xmlNode.node;
    if (node.getNodeType() == Node.ELEMENT_NODE) {
      node.setPrefix(null);
      NokogiriHelpers.renameNode(node, null, node.getLocalName());
      NamedNodeMap attrs = node.getAttributes();
      for (int i = 0; i < attrs.getLength(); i++) {
        Attr attr = (Attr) attrs.item(i);
        if (isNamespace(attr.getNodeName())) {
          ((org.w3c.dom.Element) node).removeAttributeNode(attr);
        } else {
          attr.setPrefix(null);
          NokogiriHelpers.renameNode(attr, null, attr.getLocalName());
        }
      }
    }
    IRubyObject[] nodes = xmlNode.getChildren();
    for (int i = 0; i < nodes.length; i++) {
      XmlNode childNode = (XmlNode) nodes[i];
      removeNamespaceRecursively(childNode);
    }
  }

  @JRubyMethod
  public IRubyObject
  root(ThreadContext context)
  {
    Node rootNode = getDocument().getDocumentElement();
    if (rootNode == null) { return context.nil; }

    Object invalid = rootNode.getUserData(NokogiriHelpers.ROOT_NODE_INVALID);
    if (invalid != null && ((Boolean) invalid)) { return context.nil; }

    return getCachedNodeOrCreate(context.runtime, rootNode);
  }

  @JRubyMethod(visibility = Visibility.PROTECTED)
  public IRubyObject
  initialize_copy_with_args(ThreadContext context, IRubyObject other, IRubyObject level)
  {
    super.initialize_copy_with_args(context, other, level, null);
    resetCache();
    return this;
  }

  @JRubyMethod(name = "root=")
  public IRubyObject
  root_set(ThreadContext context, IRubyObject new_root)
  {
    // in case of document fragment, temporary root node should be deleted.

    // Java can't have a root whose value is null. Instead of setting null,
    // the method sets user data so that other methods are able to know the root
    // should be nil.
    if (new_root == context.nil) {
      getDocument().getDocumentElement().setUserData(NokogiriHelpers.ROOT_NODE_INVALID, Boolean.TRUE, null);
      return new_root;
    }
    if (!(new_root instanceof XmlNode)) {
      throw context.runtime.newArgumentError("expected Nokogiri::XML::Node but received " + new_root.getType());
    }
    XmlNode newRoot = asXmlNode(context, new_root);

    IRubyObject root = root(context);
    if (root.isNil()) {
      Node newRootNode;
      if (getDocument() == newRoot.getOwnerDocument()) {
        newRootNode = newRoot.node;
      } else {
        // must copy otherwise newRoot may exist in two places
        // with different owner document.
        newRootNode = getDocument().importNode(newRoot.node, true);
      }
      add_child_node(context, getCachedNodeOrCreate(context.runtime, newRootNode));
    } else {
      Node rootNode = asXmlNode(context, root).node;
      ((XmlNode) getCachedNodeOrCreate(context.runtime, rootNode)).replace_node(context, newRoot);
    }

    return newRoot;
  }

  @JRubyMethod
  public IRubyObject
  version(ThreadContext context)
  {
    return stringOrNil(context.runtime, getDocument().getXmlVersion());
  }

  @JRubyMethod(meta = true)
  public static IRubyObject
  substitute_entities_set(ThreadContext context, IRubyObject cls, IRubyObject value)
  {
    XmlDocument.substituteEntities = value.isTrue();
    return context.nil;
  }

  public IRubyObject
  getInternalSubset(ThreadContext context)
  {
    IRubyObject dtd = (IRubyObject) node.getUserData(DTD_INTERNAL_SUBSET);

    if (dtd == null) {
      Document document = getDocument();
      if (document.getUserData(XmlDocument.DTD_RAW_DOCUMENT) != null) {
        dtd = XmlDtd.newFromInternalSubset(context.runtime, document);
      } else if (document.getDoctype() != null) {
        DocumentType docType = document.getDoctype();
        IRubyObject name, publicId, systemId;
        name = publicId = systemId = context.nil;
        if (docType.getName() != null) {
          name = context.runtime.newString(docType.getName());
        }
        if (docType.getPublicId() != null) {
          publicId = context.runtime.newString(docType.getPublicId());
        }
        if (docType.getSystemId() != null) {
          systemId = context.runtime.newString(docType.getSystemId());
        }
        dtd = XmlDtd.newEmpty(context.runtime, document, name, publicId, systemId);
      } else {
        dtd = context.nil;
      }

      setInternalSubset(dtd);
    }

    return dtd;
  }

  /**
   * Assumes XmlNode#internal_subset() has returned nil. (i.e. there
   * is not already an internal subset).
   */
  public IRubyObject
  createInternalSubset(ThreadContext context,
                       IRubyObject name,
                       IRubyObject external_id,
                       IRubyObject system_id)
  {
    XmlDtd dtd = XmlDtd.newEmpty(context.runtime, getDocument(), name, external_id, system_id);
    setInternalSubset(dtd);
    return dtd;
  }

  protected void
  setInternalSubset(IRubyObject data)
  {
    node.setUserData(DTD_INTERNAL_SUBSET, data, null);
  }

  public IRubyObject
  getExternalSubset(ThreadContext context)
  {
    IRubyObject dtd = (IRubyObject) node.getUserData(DTD_EXTERNAL_SUBSET);

    if (dtd == null) { return context.nil; }
    return dtd;
  }

  /**
   * Assumes XmlNode#external_subset() has returned nil. (i.e. there
   * is not already an external subset).
   */
  public IRubyObject
  createExternalSubset(ThreadContext context,
                       IRubyObject name,
                       IRubyObject external_id,
                       IRubyObject system_id)
  {
    XmlDtd dtd = XmlDtd.newEmpty(context.runtime, getDocument(), name, external_id, system_id);
    setExternalSubset(dtd);
    return dtd;
  }

  protected void
  setExternalSubset(IRubyObject data)
  {
    node.setUserData(DTD_EXTERNAL_SUBSET, data, null);
  }

  @Override
  public void
  accept(ThreadContext context, SaveContextVisitor visitor)
  {
    Document document = getDocument();
    visitor.enter(document);
    NodeList children = document.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      Node child = children.item(i);
      short type = child.getNodeType();
      if (type == Node.COMMENT_NODE) {
        XmlComment xmlComment = (XmlComment) getCachedNodeOrCreate(context.runtime, child);
        xmlComment.accept(context, visitor);
      } else if (type == Node.DOCUMENT_TYPE_NODE) {
        XmlDtd xmlDtd = (XmlDtd) getCachedNodeOrCreate(context.runtime, child);
        xmlDtd.accept(context, visitor);
      } else if (type == Node.PROCESSING_INSTRUCTION_NODE) {
        XmlProcessingInstruction xmlProcessingInstruction = (XmlProcessingInstruction) getCachedNodeOrCreate(context.runtime,
            child);
        xmlProcessingInstruction.accept(context, visitor);
      } else if (type == Node.TEXT_NODE) {
        XmlText xmlText = (XmlText) getCachedNodeOrCreate(context.runtime, child);
        xmlText.accept(context, visitor);
      } else if (type == Node.ELEMENT_NODE) {
        XmlElement xmlElement = (XmlElement) getCachedNodeOrCreate(context.runtime, child);
        xmlElement.accept(context, visitor);
      }
    }
    visitor.leave(document);
  }

  @JRubyMethod(meta = true)
  public static IRubyObject
  wrap(ThreadContext context, IRubyObject klass, IRubyObject arg)
  {
    XmlDocument xmlDocument = new XmlDocument(context.runtime, (RubyClass) klass, arg.toJava(Document.class));
    Helpers.invoke(context, xmlDocument, "initialize");
    return xmlDocument;
  }

  @Deprecated
  @JRubyMethod(meta = true, visibility = Visibility.PRIVATE)
  public static IRubyObject
  wrapJavaDocument(ThreadContext context, IRubyObject klass, IRubyObject arg)
  {
    return wrap(context, klass, arg);
  }

  @Deprecated // default to_java works (due inherited from XmlNode#toJava)
  @JRubyMethod(visibility = Visibility.PRIVATE)
  public IRubyObject
  toJavaDocument(ThreadContext context)
  {
    return JavaUtil.convertJavaToUsableRubyObject(context.getRuntime(), node);
  }

  /* call-seq:
   *  doc.canonicalize(mode=XML_C14N_1_0,inclusive_namespaces=nil,with_comments=false)
   *  doc.canonicalize { |obj, parent| ... }
   *
   * Canonicalize a document and return the results.  Takes an optional block
   * that takes two parameters: the +obj+ and that node's +parent+.
   * The  +obj+ will be either a Nokogiri::XML::Node, or a Nokogiri::XML::Namespace
   * The block must return a non-nil, non-false value if the +obj+ passed in
   * should be included in the canonicalized document.
   */
  @JRubyMethod(optional = 3)
  public IRubyObject
  canonicalize(ThreadContext context, IRubyObject[] args, Block block)
  {
    int mode = 0;
    String inclusive_namespace = null;
    Boolean with_comments = false;
    if (args.length > 0 && !(args[0].isNil())) {
      mode = RubyFixnum.fix2int(args[0]);
    }
    if (args.length > 1) {
      if (!args[1].isNil() && !(args[1] instanceof List)) {
        throw context.runtime.newTypeError("Expected array");
      }
      if (!args[1].isNil()) {
        inclusive_namespace = ((RubyArray)args[1])
                              .join(context, context.runtime.newString(" "))
                              .asString()
                              .asJavaString(); // OMG I wish I knew JRuby better, this is ugly
      }
    }
    if (args.length > 2) {
      with_comments = args[2].isTrue();
    }
    String algorithmURI = null;
    switch (mode) {
      case 0:  // XML_C14N_1_0
        if (with_comments) { algorithmURI = Canonicalizer.ALGO_ID_C14N_WITH_COMMENTS; }
        else { algorithmURI = Canonicalizer.ALGO_ID_C14N_OMIT_COMMENTS; }
        break;
      case 1:  // XML_C14N_EXCLUSIVE_1_0
        if (with_comments) { algorithmURI = Canonicalizer.ALGO_ID_C14N_EXCL_WITH_COMMENTS; }
        else { algorithmURI = Canonicalizer.ALGO_ID_C14N_EXCL_OMIT_COMMENTS; }
        break;
      case 2: // XML_C14N_1_1 = 2
        if (with_comments) { algorithmURI = Canonicalizer.ALGO_ID_C14N11_WITH_COMMENTS; }
        else { algorithmURI = Canonicalizer.ALGO_ID_C14N11_OMIT_COMMENTS; }
    }
    try {
      Canonicalizer canonicalizer = Canonicalizer.getInstance(algorithmURI);
      XmlNode startingNode = getStartingNode(block);
      byte[] result;
      CanonicalFilter filter = new CanonicalFilter(context, block);
      if (inclusive_namespace == null) {
        result = canonicalizer.canonicalizeSubtree(startingNode.getNode(), filter);
      } else {
        result = canonicalizer.canonicalizeSubtree(startingNode.getNode(), inclusive_namespace, filter);
      }
      return RubyString.newString(context.runtime, new ByteList(result, UTF8Encoding.INSTANCE));
    } catch (Exception e) {
      throw context.getRuntime().newRuntimeError(e.getMessage());
    }
  }

  private XmlNode
  getStartingNode(Block block)
  {
    if (block.isGiven()) {
      IRubyObject boundSelf = block.getBinding().getSelf();
      if (boundSelf instanceof XmlNode) { return (XmlNode) boundSelf; }
    }
    return this;
  }

  public void
  resetNamespaceCache(ThreadContext context)
  {
    nsCache = new NokogiriNamespaceCache();
    createAndCacheNamespaces(context.runtime, node);
  }
}