sparklemotion/nokogiri

View on GitHub
ext/java/nokogiri/internals/SaveContextVisitor.java

Summary

Maintainability
F
1 wk
Test Coverage
package nokogiri.internals;

import static nokogiri.internals.NokogiriHelpers.canonicalizeWhitespace;
import static nokogiri.internals.NokogiriHelpers.encodeJavaString;
import static nokogiri.internals.NokogiriHelpers.isNamespace;
import static nokogiri.internals.NokogiriHelpers.isBlank;
import static nokogiri.internals.NokogiriHelpers.shouldEncode;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sourceforge.htmlunit.cyberneko.HTMLElements;
import org.w3c.dom.Attr;
import org.w3c.dom.CDATASection;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Entity;
import org.w3c.dom.EntityReference;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.Notation;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.Text;

/**
 * A class for serializing a document.
 *
 * @author sergio
 * @author Patrick Mahoney <pat@polycrystal.org>
 * @author Yoko Harada <yokolet@gmail.com>
 */
public class SaveContextVisitor
{

  private final StringBuilder buffer;
  private final Stack<String> indentation;
  private String encoding;
  private final CharSequence indentString;
  private boolean format;
  private final boolean noDecl;
  private final boolean noEmpty;
  private final boolean noXhtml;
  private final boolean asXhtml;
  private boolean asXml;
  private final boolean asHtml;
  private final boolean asBuilder;
  private boolean htmlDoc;
  private final boolean fragment;
  private final boolean canonical, incl_ns, with_comments;
  private boolean subsets;
  private boolean exclusive;
  private final List<Node> c14nNodeList;
  private final Deque<Attr[]> c14nNamespaceStack;
  private final Deque<Attr[]> c14nAttrStack;
  //private List<String> c14nExclusiveInclusivePrefixes = null;

  /*
   * U can't touch this.
   * http://www.youtube.com/watch?v=WJ2ZFVx6A4Q
   *
   * Taken from libxml save options.
   */

  public static final int FORMAT = 1;
  public static final int NO_DECL = 2;
  public static final int NO_EMPTY = 4;
  public static final int NO_XHTML = 8;
  public static final int AS_XHTML = 16;
  public static final int AS_XML = 32;
  public static final int AS_HTML = 64;
  public static final int AS_BUILDER = 128;
  public static final int DEFAULT_HTML = NO_DECL | NO_EMPTY | AS_HTML;

  public static final int CANONICAL = 1;
  public static final int INCL_NS = 2;
  public static final int WITH_COMMENTS = 4;
  public static final int SUBSETS = 8;
  public static final int EXCLUSIVE = 16;

  private static final HTMLElements htmlElements_ = new HTMLElements();

  public
  SaveContextVisitor(int options, CharSequence indent, String encoding, boolean htmlDoc, boolean fragment,
                     int canonicalOpts)
  {
    buffer = new StringBuilder();
    this.encoding = encoding;
    indentation = new Stack<String>();
    indentation.push("");
    this.htmlDoc = htmlDoc;
    this.fragment = fragment;
    c14nNodeList = new ArrayList<Node>();
    c14nNamespaceStack = new ArrayDeque<Attr[]>();
    c14nAttrStack = new ArrayDeque<Attr[]>();
    format = (options & FORMAT) == FORMAT;

    noDecl = (options & NO_DECL) == NO_DECL;
    noEmpty = (options & NO_EMPTY) == NO_EMPTY;
    noXhtml = (options & NO_XHTML) == NO_XHTML;
    asXhtml = (options & AS_XHTML) == AS_XHTML;
    asXml = (options & AS_XML) == AS_XML;
    asHtml = (options & AS_HTML) == AS_HTML;
    asBuilder = (options & AS_BUILDER) == AS_BUILDER;

    canonical = (canonicalOpts & CANONICAL) == CANONICAL;
    incl_ns = (canonicalOpts & INCL_NS) == INCL_NS;
    with_comments = (canonicalOpts & WITH_COMMENTS) == WITH_COMMENTS;
    subsets = (canonicalOpts & SUBSETS) == SUBSETS;

    if ((format && indent == null) || (format && indent.length() == 0)) { indent = "  "; } // default, two spaces
    if ((!format && indent != null) && indent.length() > 0) { format = true; }
    if ((asBuilder && indent == null) || (asBuilder && indent.length() == 0)) { indent = "  "; } // default, two spaces
    indentString = indent;
    if (!asXml && !asHtml && !asXhtml && !asBuilder) { asXml = true; }
  }

  @Override
  public String
  toString()
  {
    return buffer.toString();
  }

  public StringBuilder
  getInternalBuffer() { return buffer; }

  public void
  setHtmlDoc(boolean htmlDoc)
  {
    this.htmlDoc = htmlDoc;
  }

  public void
  setEncoding(String encoding)
  {
    this.encoding = encoding;
  }

  public boolean
  enter(Node node)
  {
    if (node instanceof Document) {
      return enter((Document)node);
    }
    if (node instanceof Element) {
      return enter((Element)node);
    }
    if (node instanceof Attr) {
      return enter((Attr)node);
    }
    if (node instanceof Text) {
      return enter((Text)node);
    }
    if (node instanceof CDATASection) {
      return enter((CDATASection)node);
    }
    if (node instanceof Comment) {
      return enter((Comment)node);
    }
    if (node instanceof DocumentType) {
      return enter((DocumentType)node);
    }
    if (node instanceof Entity) {
      return enter((Entity)node);
    }
    if (node instanceof EntityReference) {
      return enter((EntityReference) node);
    }
    if (node instanceof Notation) {
      return enter((Notation)node);
    }
    if (node instanceof ProcessingInstruction) {
      return enter((ProcessingInstruction)node);
    }
    return false;
  }

  public void
  leave(Node node)
  {
    if (node instanceof Document) {
      leave((Document)node);
      return;
    }
    if (node instanceof Element) {
      leave((Element)node);
      return;
    }
    if (node instanceof Attr) {
      leave((Attr)node);
      return;
    }
    if (node instanceof Text) {
      return;
    }
    if (node instanceof CDATASection) {
      leave((CDATASection)node);
      return;
    }
    if (node instanceof Comment) {
      leave((Comment)node);
      return;
    }
    if (node instanceof DocumentType) {
      leave((DocumentType)node);
      return;
    }
    if (node instanceof Entity) {
      leave((Entity)node);
      return;
    }
    if (node instanceof EntityReference) {
      leave((EntityReference) node);
      return;
    }
    if (node instanceof Notation) {
      leave((Notation)node);
      return;
    }
    if (node instanceof ProcessingInstruction) {
      leave((ProcessingInstruction)node);
      return;
    }
  }

  public boolean
  enter(String string)
  {
    buffer.append(string);
    return true;
  }

  public void
  leave(String string)
  {
    // no-op
  }

  public boolean
  enter(Attr attr)
  {
    String name = attr.getName();
    buffer.append(name);
    if (!asHtml || !isHtmlBooleanAttr(name)) {
      buffer.append('=');
      buffer.append('"');
      String value = replaceCharsetIfNecessary(attr);
      buffer.append(serializeAttrTextContent(value, htmlDoc));
      buffer.append('"');
    }
    return true;
  }

  private static final Pattern CHARSET =
    Pattern.compile("charset(()|\\s+)=(()|\\s+)(\\w|\\_|\\.|\\-)+", Pattern.CASE_INSENSITIVE);

  private String
  replaceCharsetIfNecessary(Attr attr)
  {
    String value = attr.getValue();
    if (encoding == null) { return value; }   // unable to replace in any case
    if (!"content".equals(attr.getName().toLowerCase())) { return value; }  // must be content attr
    if (!"meta".equals(attr.getOwnerElement().getNodeName().toLowerCase())) { return value; }
    Matcher m = CHARSET.matcher(value);
    if (!m.find()) { return value; }
    if (value.contains(encoding)) { return value; }  // no need to replace
    return value.replace(m.group(), "charset=" + encoding);
  }

  static final Set<String> HTML_BOOLEAN_ATTRS;
  static
  {
    final String[] _HTML_BOOLEAN_ATTRS = {
      "checked", "compact", "declare", "defer", "disabled", "ismap",
      "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
      "selected"
    };
    HTML_BOOLEAN_ATTRS = new HashSet<String>(Arrays.asList(_HTML_BOOLEAN_ATTRS));
  }

  private static boolean
  isHtmlBooleanAttr(String name)
  {
    return HTML_BOOLEAN_ATTRS.contains(name);
  }

  private static CharSequence
  serializeAttrTextContent(String str, boolean htmlDoc)
  {
    if (str == null || str.length() == 0) { return ""; }

    StringBuilder buffer = new StringBuilder(str.length() + 16);

    for (int i = 0; i < str.length(); i++) {
      char c;
      switch (c = str.charAt(i)) {
        case '\n':
          buffer.append("&#10;");
          break;
        case '\r':
          buffer.append("&#13;");
          break;
        case '\t':
          buffer.append("&#9;");
          break;
        case '"':
          if (htmlDoc) { buffer.append("%22"); }
          else { buffer.append("&quot;"); }
          break;
        case '<':
          buffer.append("&lt;");
          break;
        case '>':
          buffer.append("&gt;");
          break;
        case '&':
          buffer.append("&amp;");
          break;
        default:
          buffer.append(c);
      }
    }

    return buffer;
  }

  public void
  leave(Attr attr)
  {
    // no-op
  }

  public boolean
  enter(CDATASection cdata)
  {
    buffer.append("<![CDATA[");
    buffer.append(cdata.getData());
    buffer.append("]]>");
    return true;
  }

  public void
  leave(CDATASection cdata)
  {
    // no-op
  }

  public boolean
  enter(Comment comment)
  {
    if (canonical) {
      c14nNodeList.add(comment);
      if (!with_comments) { return true; }
    }
    buffer.append("<!--");
    buffer.append(comment.getData());
    buffer.append("-->");
    return true;
  }

  public void
  leave(Comment comment)
  {
    // no-op
  }

  public boolean
  enter(Document document)
  {
    if (!noDecl) {
      buffer.append("<?xml version=\"");
      buffer.append(document.getXmlVersion());
      buffer.append("\"");

      if (encoding != null) {
        buffer.append(" encoding=\"");
        buffer.append(encoding);
        buffer.append("\"");
      }
      buffer.append("?>\n");
    }
    return true;
  }

  public void
  leave(Document document)
  {
    // no-op
  }

  public boolean
  enter(DocumentType docType)
  {
    if (canonical) {
      c14nNodeList.add(docType);
      return true;
    }
    String name = docType.getName();
    String pubId = docType.getPublicId();
    String sysId = docType.getSystemId();
    String internalSubset = docType.getInternalSubset();
    if (docType.getPreviousSibling() != null) {
      buffer.append('\n');
    }
    buffer.append("<!DOCTYPE ").append(name).append(' ');
    if (pubId != null) {
      buffer.append("PUBLIC \"").append(pubId).append('"');
      if (sysId != null) { buffer.append(" \"").append(sysId).append('"'); }
    } else if (sysId != null) {
      buffer.append("SYSTEM \"").append(sysId).append('"');
    }
    if (internalSubset != null) {
      buffer.append(' ').append('[');
      buffer.append(internalSubset);
      buffer.append(']');
    }
    buffer.append(">\n");
    return true;
  }

  public void
  leave(DocumentType docType)
  {
    // no-op
  }

  public boolean
  enter(Element element)
  {
    if (canonical) {
      c14nNodeList.add(element);
      if (element == element.getOwnerDocument().getDocumentElement()) {
        c14nNodeList.add(element.getOwnerDocument());
      }
    }
    String current = indentation.peek();
    buffer.append(current);
    if (needIndent(element)) {
      indentation.push(current + indentString);
    }
    String name = element.getTagName();
    buffer.append('<').append(name);
    Attr[] attrs = getAttrsAndNamespaces(element);
    for (Attr attr : attrs) {
      if (attr.getSpecified()) {
        buffer.append(' ');
        enter(attr);
        leave(attr);
      }
    }
    if (element.hasChildNodes()) {
      buffer.append('>');
      if (needBreakInOpening(element)) { buffer.append('\n'); }
      return true;
    }
    // no child
    if (asHtml) {
      buffer.append('>');
    } else if (asXml && noEmpty) {
      buffer.append('>');
    } else if (asXhtml) {
      if (isEmpty(name)) {
        buffer.append(" />"); // see http://www.w3.org/TR/xhtml1/#C_2
      } else {
        buffer.append('>');
      }
    } else {
      buffer.append("/>");
    }
    if (needBreakInOpening(element)) {
      buffer.append('\n');
    }
    return true;
  }

  private boolean
  needIndent(Element element)
  {
    if (containsText(element)) { return false; }
    if (fragment) { return false; }  // a given option might be fragment and format. fragment matters
    if (format || asBuilder) { return true; }
    return false;
  }

  private boolean
  needBreakInOpening(Element element)
  {
    if (containsText(element)) { return false; }
    if (fragment) { return false; }
    if (format) { return true; }
    if (asBuilder && element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.ELEMENT_NODE) { return true; }
    if (format && element.getNextSibling() == null && element.hasChildNodes()) { return true; }
    return false;
  }

  private boolean
  isEmpty(String name)
  {
    HTMLElements.Element element = htmlElements_.getElement(name);
    return element.isEmpty();
  }

  private Attr[]
  getAttrsAndNamespaces(Element element)
  {
    NamedNodeMap attrs = element.getAttributes();
    if (!canonical) {
      if (attrs == null || attrs.getLength() == 0) { return new Attr[0]; }
      Attr[] attrsAndNamespaces = new Attr[attrs.getLength()];
      for (int i = 0; i < attrs.getLength(); i++) {
        attrsAndNamespaces[i] = (Attr) attrs.item(i);
      }
      return attrsAndNamespaces;
    } else {
      List<Attr> namespaces = new ArrayList<Attr>();
      List<Attr> attributes = new ArrayList<Attr>();
      if (subsets) {
        getAttrsOfAncestors(element.getParentNode(), namespaces, attributes);
        Attr[] namespaceOfAncestors = getSortedArray(namespaces);
        Attr[] attributeOfAncestors = getSortedArray(attributes);
        c14nNamespaceStack.push(namespaceOfAncestors);
        c14nAttrStack.push(attributeOfAncestors);
        subsets = false; // namespace propagation should be done only once on top level node.
      }

      getNamespacesAndAttrs(element, namespaces, attributes);

      Attr[] namespaceArray = getSortedArray(namespaces);
      Attr[] attributeArray = getSortedArray(attributes);
      Attr[] allAttrs = new Attr[namespaceArray.length + attributeArray.length];
      for (int i = 0; i < allAttrs.length; i++) {
        if (i < namespaceArray.length) {
          allAttrs[i] = namespaceArray[i];
        } else {
          allAttrs[i] = attributeArray[i - namespaceArray.length];
        }
      }
      c14nNamespaceStack.push(namespaceArray);
      c14nAttrStack.push(attributeArray);
      return allAttrs;
    }

  }

  private void
  getAttrsOfAncestors(Node parent, List<Attr> namespaces, List<Attr> attributes)
  {
    if (parent == null) { return; }
    NamedNodeMap attrs = parent.getAttributes();
    if (attrs == null || attrs.getLength() == 0) { return; }
    for (int i = 0; i < attrs.getLength(); i++) {
      Attr attr = (Attr)attrs.item(i);
      if (isNamespace(attr.getNodeName())) { namespaces.add(attr); }
      else { attributes.add(attr); }
    }
    getAttrsOfAncestors(parent.getParentNode(), namespaces, attributes);
  }

  private void
  getNamespacesAndAttrs(Node current, List<Attr> namespaces, List<Attr> attributes)
  {
    NamedNodeMap attrs = current.getAttributes();
    for (int i = 0; i < attrs.getLength(); i++) {
      Attr attr = (Attr)attrs.item(i);
      if (isNamespace(attr.getNodeName())) {
        getNamespacesWithPropagated(namespaces, attr);
      } else {
        getAttributesWithPropagated(attributes, attr);
      }
      if (exclusive) {
        verifyXmlSpace(attributes, attrs);
      }
    }
  }

  private void
  getNamespacesWithPropagated(List<Attr> namespaces, Attr attr)
  {
    boolean newNamespace = true;
    Iterator<Attr[]> iter = c14nNamespaceStack.iterator();
    while (iter.hasNext()) {
      Attr[] parentNamespaces = iter.next();
      for (int n = 0; n < parentNamespaces.length; n++) {
        if (parentNamespaces[n].getNodeName().equals(attr.getNodeName())) {
          if (parentNamespaces[n].getNodeValue().equals(attr.getNodeValue())) {
            // exactly the same namespace should not be added
            newNamespace = false;
          } else {
            // in case of namespace url change, propagated namespace will be override
            namespaces.remove(parentNamespaces[n]);
          }
        }
      }
      if (newNamespace && !namespaces.contains(attr)) { namespaces.add(attr); }
    }
  }

  private void
  getAttributesWithPropagated(List<Attr> attributes, Attr attr)
  {
    boolean newAttribute = true;
    Iterator<Attr[]> iter = c14nAttrStack.iterator();
    while (iter.hasNext()) {
      Attr[] parentAttr = iter.next();
      for (int n = 0; n < parentAttr.length; n++) {
        if (!parentAttr[n].getNodeName().startsWith("xml:")) { continue; }
        if (parentAttr[n].getNodeName().equals(attr.getNodeName())) {
          if (parentAttr[n].getNodeValue().equals(attr.getNodeValue())) {
            // exactly the same attribute should not be added
            newAttribute = false;
          } else {
            // in case of attribute value change, propagated attribute will be override
            attributes.remove(parentAttr[n]);
          }
        }
      }
      if (newAttribute) { attributes.add(attr); }
    }
  }

  private void
  verifyXmlSpace(List<Attr> attributes, NamedNodeMap attrs)
  {
    Attr attr = (Attr) attrs.getNamedItem("xml:space");
    if (attr == null) {
      for (int i = 0; i < attributes.size(); i++) {
        if (attributes.get(i).getNodeName().equals("xml:space")) {
          attributes.remove(i);
          break;
        }
      }
    }
  }

  private Attr[]
  getSortedArray(List<Attr> attrList)
  {
    Attr[] attrArray = attrList.toArray(new Attr[0]);
    Arrays.sort(attrArray, new Comparator<Attr>() {
      @Override
      public int compare(Attr attr0, Attr attr1) {
        return attr0.getNodeName().compareTo(attr1.getNodeName());
      }
    });
    return attrArray;
  }

  public void
  leave(Element element)
  {
    if (canonical) {
      c14nNamespaceStack.poll();
      c14nAttrStack.poll();
    }
    String name = element.getTagName();
    if (element.hasChildNodes()) {
      if (needIndentInClosing(element)) {
        indentation.pop();
        buffer.append(indentation.peek());
      } else if (asBuilder) {
        if (!containsText(element)) { indentation.pop(); }
      }
      buffer.append("</").append(name).append('>');
      if (needBreakInClosing(element)) {
        buffer.append('\n');
      }
      return;
    }
    // no child, but HTML might need a closing tag.
    if (asHtml || noEmpty) {
      if (!isEmpty(name) && noEmpty) {
        buffer.append("</").append(name).append('>');
      }
    } else if (asXhtml && !isEmpty(name)) {
      buffer.append("</").append(name).append('>');
    }
    if (needBreakInClosing(element)) {
      if (!containsText(element)) { indentation.pop(); }
      buffer.append('\n');
    }
  }

  private boolean
  needIndentInClosing(Element element)
  {
    if (containsText(element)) { return false; }

    if (fragment) { return false; }  // a given option might be fragment and format. fragment matters
    if (format) { return true; }
    if (asBuilder && element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.ELEMENT_NODE) { return true; }
    return false;
  }

  private boolean
  needBreakInClosing(Element element)
  {
    if (fragment) { return false; }
    if (format || asBuilder) { return true; }
    return false;
  }

  private boolean
  containsText(Element element)
  {
    return (element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.TEXT_NODE);
  }

  public boolean
  enter(Entity entity)
  {
    String name = entity.getNodeName();
    String pubId = entity.getPublicId();
    String sysId = entity.getSystemId();
    String notation = entity.getNotationName();
    buffer.append("<!ENTITY ");
    buffer.append(name);
    if (pubId != null) {
      buffer.append(" PUBLIC \"");
      buffer.append(pubId);
      buffer.append("\"");
    }
    if (sysId != null) {
      buffer.append(" SYSTEM \"");
      buffer.append(sysId);
      buffer.append("\"");
    }
    if (notation != null) {
      buffer.append(" NDATA ");
      buffer.append(notation);
    }
    buffer.append(">");
    return true;
  }

  public void
  leave(Entity entity)
  {
    // no-op
  }

  public boolean
  enter(EntityReference entityRef)
  {
    buffer.append('&').append(entityRef.getNodeName()).append(';');
    return true;
  }
  public void
  leave(EntityReference entityRef)
  {
    // no-op
  }

  public boolean
  enter(Notation notation)
  {
    String name = notation.getNodeName();
    String pubId = notation.getPublicId();
    String sysId = notation.getSystemId();
    buffer.append("<!NOTATION ");
    buffer.append(name);
    if (pubId != null) {
      buffer.append(" PUBLIC \"");
      buffer.append(pubId);
      buffer.append("\"");
      if (sysId != null) {
        buffer.append(" \"");
        buffer.append(sysId);
        buffer.append("\"");
      }
    } else if (sysId != null) {
      buffer.append(" SYSTEM \"");
      buffer.append(sysId);
      buffer.append("\"");
    }
    buffer.append(">");
    return true;
  }

  public void
  leave(Notation notation)
  {
    // no-op
  }

  public boolean
  enter(ProcessingInstruction pi)
  {
    buffer.append("<?");
    buffer.append(pi.getTarget());
    buffer.append(" ");
    buffer.append(pi.getData());
    if (asHtml) { buffer.append(">"); }
    else { buffer.append("?>"); }
    buffer.append("\n");
    if (canonical) { c14nNodeList.add(pi); }
    return true;
  }

  public void
  leave(ProcessingInstruction pi)
  {
    // no-op
  }

  private boolean
  isCDATA(Text text)
  {
    Node parentNode = text.getParentNode();
    return htmlDoc && parentNode != null && (parentNode.getNodeName().equals("style")
           || parentNode.getNodeName().equals("script"));
  }


  public boolean
  enter(Text text)
  {
    CharSequence textContent = text.getNodeValue();
    if (canonical) {
      c14nNodeList.add(text);
      if (isBlank(textContent)) {
        buffer.append(canonicalizeWhitespace(textContent));
        return true;
      }
    }

    if (shouldEncode(text) && !isCDATA(text)) {
      textContent = encodeJavaString(textContent);
    }

    textContent = encodeStringToHtmlEntity(textContent);
    buffer.append(textContent);
    return true;
  }

  private CharSequence
  encodeStringToHtmlEntity(CharSequence text)
  {
    if (encoding == null) { return text; }

    CharsetEncoder encoder = Charset.forName(encoding).newEncoder();
    StringBuilder sb = new StringBuilder(text.length() + 16);
    // make sure we can handle code points that are higher than 2 bytes
    for (int i = 0; i < text.length();) {
      int code = Character.codePointAt(text, i);
      // TODO not sure about bigger offset then 2 ?!
      int offset = code > 65535 ? 2 : 1;
      CharSequence substr = text.subSequence(i, i + offset);
      boolean canEncode = encoder.canEncode(substr);
      if (canEncode) {
        sb.append(substr);
      } else {
        sb.append("&#x").append(Integer.toHexString(code)).append(';');
      }
      i += offset;
    }
    return sb;
  }
}