sparklemotion/nokogiri

View on GitHub
ext/java/nokogiri/XmlReader.java

Summary

Maintainability
D
1 day
Test Coverage
package nokogiri;

import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
import static nokogiri.internals.NokogiriHelpers.stringOrBlank;

import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.List;
import java.util.Stack;

import org.apache.xerces.impl.Constants;
import org.apache.xerces.impl.xs.opti.DefaultXMLDocumentHandler;
import org.apache.xerces.parsers.StandardParserConfiguration;
import org.apache.xerces.util.EntityResolver2Wrapper;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLErrorHandler;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParseException;
import org.apache.xerces.xni.parser.XMLPullParserConfiguration;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyBoolean;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
import org.jruby.RubyObject;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.exceptions.RaiseException;
import org.jruby.runtime.Block;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.IOInputStream;
import org.xml.sax.InputSource;

import nokogiri.internals.NokogiriEntityResolver;
import nokogiri.internals.ParserContext;
import nokogiri.internals.ParserContext.Options;
import nokogiri.internals.ReaderNode;
import nokogiri.internals.ReaderNode.ClosingNode;
import nokogiri.internals.ReaderNode.ElementNode;
import nokogiri.internals.ReaderNode.TextNode;

/**
 * Class for Nokogiri:XML::Reader
 *
 * @author sergio
 * @author Yoko Harada <yokolet@gmail.com>
 */
@JRubyClass(name = "Nokogiri::XML::Reader")
public class XmlReader extends RubyObject
{
  private static final long serialVersionUID = 1L;

  // Reader mode codes exposed through state(). The names presumably mirror
  // libxml2's xmlTextReaderMode enum -- TODO confirm the numeric values.
  private static final int XML_TEXTREADER_MODE_INITIAL = 0;
  private static final int XML_TEXTREADER_MODE_INTERACTIVE = 1;
  private static final int XML_TEXTREADER_MODE_ERROR = 2;
  private static final int XML_TEXTREADER_MODE_EOF = 3;
  private static final int XML_TEXTREADER_MODE_CLOSED = 4;
  private static final int XML_TEXTREADER_MODE_READING = 5;

  // Nodes appended by the DocumentHandler callbacks as parsing proceeds.
  List<ReaderNode> nodeQueue;
  // Current mode code; written only through setState().
  private int state;
  // Index into nodeQueue of the node the reader is positioned on.
  private int position = 0;
  // Pull-parser configuration created in setInput(); parsed incrementally by readMoreData().
  private XMLPullParserConfiguration config;
  // Result of the last config.parse(false) call; once false, readMoreData() refuses to parse.
  private boolean continueParsing = true;

  /**
   * Creates a reader instance of the given Ruby class. The node queue is not
   * created here; it is set up by {@link #init(Ruby)}.
   *
   * @param runtime the JRuby runtime
   * @param klazz the Ruby class of the new object
   */
  public
  XmlReader(Ruby runtime, RubyClass klazz)
  {
    super(runtime, klazz);
  }

  /**
   * Create and return a copy of this object by delegating to
   * {@code super.clone()}; no reader-specific state is copied explicitly here.
   *
   * @return a clone of this object
   * @throws CloneNotSupportedException if the superclass refuses to clone
   */
  @Override
  public Object
  clone() throws CloneNotSupportedException
  {
    return super.clone();
  }

  /**
   * Replaces the node queue with a fresh list whose only element is a
   * {@link ReaderNode.EmptyNode} placeholder.
   *
   * @param runtime the JRuby runtime handed to the placeholder node
   */
  public void
  init(Ruby runtime)
  {
    List<ReaderNode> freshQueue = new LinkedList<ReaderNode>();
    freshQueue.add(new ReaderNode.EmptyNode(runtime));
    nodeQueue = freshQueue;
  }

  /**
   * Builds the pull-parser configuration and points it at the given stream.
   * The state is set to READING while the input is being wired up and to
   * CLOSED once that is done.
   *
   * @param context the calling thread context
   * @param in the stream the parser will read from
   * @param url Ruby value passed to {@code ParserContext.setUrl} to derive the ids
   * @param options parser options used when creating the configuration
   */
  private void
  setInput(ThreadContext context, InputStream in, IRubyObject url, Options options)
  {
    final Ruby runtime = context.getRuntime();
    setState(XML_TEXTREADER_MODE_READING);
    config = createReader(runtime, options);

    // The SAX InputSource only serves as a holder for the ids derived from url.
    InputSource idHolder = new InputSource();
    ParserContext.setUrl(context, idHolder, url);
    String publicId = idHolder.getPublicId();
    String systemId = idHolder.getSystemId();
    XMLInputSource source = new XMLInputSource(publicId, systemId, null, in, null);

    try {
      config.setInputSource(source);
    } catch (IOException ioe) {
      throw runtime.newRuntimeError(ioe.getMessage());
    }
    setState(XML_TEXTREADER_MODE_CLOSED);
  }

  /** Records the reader mode code later reported by {@code state(ThreadContext)}. */
  private void
  setState(int state) { this.state = state; }

  /**
   * Looks up an attribute of the current node by name.
   *
   * @param context the calling thread context (not used)
   * @param name attribute name, handed to the node unchanged
   * @return the result of the current node's {@code getAttributeByName}
   */
  @JRubyMethod
  public IRubyObject
  attribute(ThreadContext context, IRubyObject name)
  {
    final ReaderNode node = currentNode();
    return node.getAttributeByName(name);
  }

  /**
   * Looks up an attribute of the current node by position.
   *
   * @param context the calling thread context (not used)
   * @param index attribute index, handed to the node unchanged
   * @return the result of the current node's {@code getAttributeByIndex}
   */
  @JRubyMethod
  public IRubyObject
  attribute_at(ThreadContext context, IRubyObject index)
  {
    final ReaderNode node = currentNode();
    return node.getAttributeByIndex(index);
  }

  /**
   * Reports the attribute count of the current node.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code getAttributeCount}
   */
  @JRubyMethod
  public IRubyObject
  attribute_count(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getAttributeCount();
  }

  /**
   * Returns the attributes of the current node.
   *
   * @param context the calling thread context, forwarded to the node
   * @return the result of the current node's {@code getAttributes}
   */
  @JRubyMethod
  public IRubyObject
  attribute_hash(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getAttributes(context);
  }

  /**
   * Ruby {@code attributes?}: asks the current node whether it has attributes.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code hasAttributes}
   */
  @JRubyMethod(name = "attributes?")
  public IRubyObject
  attributes_p(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.hasAttributes();
  }

  /**
   * Returns the xml:base value tracked for the current node.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code getXmlBase}
   */
  @JRubyMethod
  public IRubyObject
  base_uri(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getXmlBase();
  }

  /**
   * Ruby {@code default?}: forwards to the current node's {@code isDefault}.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code isDefault}
   */
  @JRubyMethod(name = "default?")
  public IRubyObject
  default_p(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.isDefault();
  }

  /**
   * Returns the depth recorded on the current node.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code getDepth}
   */
  @JRubyMethod
  public IRubyObject
  depth(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getDepth();
  }

  /**
   * Ruby {@code empty_element?} / {@code self_closing?}: tells whether the
   * current node is an element without children.
   *
   * @param context the calling thread context
   * @return nil when there is no current node, false when the current node is
   *         not an {@code ElementNode}, otherwise true exactly when the
   *         element has no children
   */
  @JRubyMethod(name = {"empty_element?", "self_closing?"})
  public IRubyObject
  empty_element_p(ThreadContext context)
  {
    final Ruby runtime = context.getRuntime();
    ReaderNode readerNode = currentNode();
    // Bail out before ensureNodeClosed, which dereferences the current node.
    if (readerNode == null) { return runtime.getNil(); }

    // hasChildren is only settled once the element's end has been seen.
    ensureNodeClosed(context);

    // Previously the false value was computed but never returned, so
    // non-element nodes fell through to the hasChildren test below.
    if (!(readerNode instanceof ElementNode)) { return runtime.getFalse(); }
    return RubyBoolean.newBoolean(runtime, !readerNode.hasChildren);
  }

  /**
   * Returns the encoding stored in the {@code @encoding} instance variable,
   * or nil when that variable is nil.
   *
   * @param context the calling thread context
   * @return the stored encoding, or nil
   */
  @JRubyMethod
  public IRubyObject
  encoding(ThreadContext context)
  {
    IRubyObject fromConstructor = getInstanceVariable("@encoding");
    if (fromConstructor.isNil()) {
      // TODO: get the parser's detected encoding
      return context.getRuntime().getNil();
    }
    return fromConstructor;
  }

  /**
   * Creates a reader that pulls XML from a Ruby IO-like object.
   *
   * @param context the calling thread context
   * @param cls the receiver class (not used)
   * @param args args[0]: io, args[1]: url, args[2]: encoding, args[3]: options;
   *        only args[0] is required
   * @return the new reader
   * @throws RaiseException ArgumentError when no argument is given or the io is nil
   */
  @JRubyMethod(meta = true, rest = true)
  public static IRubyObject
  from_io(ThreadContext context, IRubyObject cls, IRubyObject args[])
  {
    Ruby runtime = context.getRuntime();
    // rest = true allows an empty argument list; reject it as a Ruby error
    // instead of failing on args[0] with ArrayIndexOutOfBoundsException.
    if (args.length == 0) {
      throw runtime.newArgumentError("wrong number of arguments (given 0, expected 1..4)");
    }
    // Not nil allowed!
    if (args[0].isNil()) { throw runtime.newArgumentError("io cannot be nil"); }

    XmlReader reader = (XmlReader) NokogiriService.XML_READER_ALLOCATOR.allocate(runtime, getNokogiriClass(runtime,
                       "Nokogiri::XML::Reader"));
    reader.init(runtime);
    // Only to pass the  source test.
    reader.setInstanceVariable("@source", args[0]);
    reader.setInstanceVariable("@errors", runtime.newArray());
    IRubyObject url = context.nil;
    if (args.length > 1) { url = args[1]; }
    if (args.length > 2) { reader.setInstanceVariable("@encoding", args[2]); }

    Options options;
    if (args.length > 3) {
      options = new ParserContext.Options(args[3].toJava(Long.class));
    } else {
      // use the default options RECOVER | NONET
      options = new ParserContext.Options(2048 | 1);
    }

    InputStream in = new IOInputStream(args[0]);
    reader.setInput(context, in, url, options);
    return reader;
  }

  /**
   * Creates a reader over an in-memory string by wrapping it in a Ruby
   * {@code StringIO} and reading from that.
   *
   * @param context the calling thread context
   * @param cls the receiver class (not used)
   * @param args args[0]: string, args[1]: url, args[2]: encoding, args[3]: options;
   *        only args[0] is required
   * @return the new reader
   * @throws RaiseException ArgumentError when no argument is given or the string is nil
   */
  @JRubyMethod(meta = true, rest = true)
  public static IRubyObject
  from_memory(ThreadContext context, IRubyObject cls, IRubyObject args[])
  {
    Ruby runtime = context.getRuntime();
    // rest = true allows an empty argument list; reject it as a Ruby error
    // instead of failing on args[0] with ArrayIndexOutOfBoundsException.
    if (args.length == 0) {
      throw runtime.newArgumentError("wrong number of arguments (given 0, expected 1..4)");
    }
    // Not nil allowed!
    if (args[0].isNil()) { throw runtime.newArgumentError("string cannot be nil"); }

    XmlReader reader = (XmlReader) NokogiriService.XML_READER_ALLOCATOR.allocate(runtime, getNokogiriClass(runtime,
                       "Nokogiri::XML::Reader"));
    reader.init(runtime);
    reader.setInstanceVariable("@source", args[0]);
    reader.setInstanceVariable("@errors", runtime.newArray());
    IRubyObject url = context.nil;
    if (args.length > 1) { url = args[1]; }
    if (args.length > 2) { reader.setInstanceVariable("@encoding", args[2]); }

    Options options;
    if (args.length > 3) {
      options = new ParserContext.Options(args[3].toJava(Long.class));
    } else {
      // use the default options RECOVER | NONET
      options = new ParserContext.Options(2048 | 1);
    }
    IRubyObject stringIO = runtime.getClass("StringIO").newInstance(context, args[0], Block.NULL_BLOCK);
    InputStream in = new IOInputStream(stringIO);
    reader.setInput(context, in, url, options);
    return reader;
  }

  /**
   * Returns the type of the current node, substituting Fixnum zero when the
   * node reports no type.
   *
   * @param context the calling thread context
   * @return the node type, or zero when the node's type is {@code null}
   */
  @JRubyMethod
  public IRubyObject
  node_type(ThreadContext context)
  {
    IRubyObject type = currentNode().getNodeType();
    if (type != null) { return type; }
    return RubyFixnum.zero(context.getRuntime());
  }

  /**
   * Returns the serialized content between the current node's start and end.
   * Parsing is advanced first so that the node's end offset is known.
   *
   * @param context the calling thread context
   * @return the inner XML passed through {@code stringOrBlank}
   */
  @JRubyMethod
  public IRubyObject
  inner_xml(ThreadContext context)
  {
    ensureNodeClosed(context);
    String xml = getInnerXml(currentNode());
    return stringOrBlank(context.getRuntime(), xml);
  }

  /**
   * Concatenates the string form of every queued node strictly between the
   * given node's start and end offsets.
   *
   * @param current the node whose content is wanted; may be {@code null}
   * @return the concatenated text, or {@code null} when there is no node, its
   *         depth is negative, or it has no children
   */
  private String
  getInnerXml(ReaderNode current)
  {
    // Null guard added for parity with getOuterXml.
    if (current == null || current.depth < 0 || !current.hasChildren) { return null; }

    StringBuilder sb = new StringBuilder();
    int from = current.startOffset + 1;
    int to = current.endOffset; // exclusive
    if (from < to) {
      // Iterate a sub-list view rather than calling get(i) repeatedly:
      // nodeQueue is a LinkedList, where each indexed get is a linear walk.
      for (ReaderNode inner : nodeQueue.subList(from, to)) {
        sb.append(inner.getString());
      }
    }
    return sb.toString();
  }

  /**
   * Returns the serialized form of the current node including its own tags.
   * Parsing is advanced first so that the node's end offset is known.
   *
   * @param context the calling thread context
   * @return the outer XML passed through {@code stringOrBlank}
   */
  @JRubyMethod
  public IRubyObject
  outer_xml(ThreadContext context)
  {
    ensureNodeClosed(context);
    String xml = getOuterXml();
    return stringOrBlank(context.getRuntime(), xml);
  }

  /**
   * Builds the outer XML of the current node from the queued nodes.
   *
   * @return {@code null} when there is no current node or its depth is
   *         negative; a self-closing tag built from the node name for a
   *         closing node; otherwise the concatenated string form of the
   *         queued nodes from the current position through the node's end offset
   */
  private String
  getOuterXml()
  {
    ReaderNode node = currentNode();
    if (node == null) { return null; }
    if (node.depth < 0) { return null; }

    if (node instanceof ClosingNode) {
      return "<" + node.name + "/>";
    }

    StringBuilder xml = new StringBuilder();
    int index = position;
    while (index <= node.endOffset) {
      xml.append(nodeQueue.get(index).getString());
      index++;
    }
    return xml.toString();
  }

  /**
   * Returns the language recorded on the current node.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code getLang}
   */
  @JRubyMethod
  public IRubyObject
  lang(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getLang();
  }

  /**
   * Returns the local name of the current node.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code getLocalName}
   */
  @JRubyMethod
  public IRubyObject
  local_name(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getLocalName();
  }

  /**
   * Returns the name of the current node.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code getName}
   */
  @JRubyMethod
  public IRubyObject
  name(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getName();
  }

  /**
   * Returns the namespace URI of the current node.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code getUri}
   */
  @JRubyMethod
  public IRubyObject
  namespace_uri(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getUri();
  }

  /**
   * Returns the namespaces of the current node.
   *
   * @param context the calling thread context, forwarded to the node
   * @return the result of the current node's {@code getNamespaces}
   */
  @JRubyMethod
  public IRubyObject
  namespaces(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getNamespaces(context);
  }

  /**
   * Returns the namespace prefix of the current node.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code getPrefix}
   */
  @JRubyMethod
  public IRubyObject
  prefix(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getPrefix();
  }

  /**
   * Runs one incremental parse step and remembers whether the parser reports
   * that more input remains.
   *
   * @param context the calling thread context
   * @throws RaiseException a RuntimeError when parsing has already finished or
   *         an I/O error occurs, or a Nokogiri::XML::SyntaxError when the
   *         parser throws an {@code XNIException}
   */
  private void
  readMoreData(ThreadContext context)
  {
    if (continueParsing) {
      try {
        continueParsing = config.parse(false);
      } catch (XNIException parseError) {
        throw XmlSyntaxError.createXMLSyntaxError(context.runtime, parseError).toThrowable(); // Nokogiri::XML::SyntaxError
      } catch (IOException ioError) {
        throw context.runtime.newRuntimeError(ioError.toString());
      }
      return;
    }
    throw context.runtime.newRuntimeError("Cannot parse more data");
  }

  /**
   * Keeps parsing until the current node's end offset has been recorded.
   * Does nothing when there is no current node or when it is a text node.
   *
   * @param context the calling thread context
   * @throws RaiseException whatever {@code readMoreData} raises, including
   *         the RuntimeError raised once no more data can be parsed
   */
  private void
  ensureNodeClosed(ThreadContext context)
  {
    ReaderNode node = currentNode();
    // currentNode() yields null once the position is past the queue; without
    // this guard the loop below would throw NullPointerException.
    if (node == null || node instanceof TextNode) { return; }
    while (node.endOffset < 1) { readMoreData(context); }
  }

  /**
   * Advances the reader by one node, parsing more input when the queue does
   * not yet hold a node at the new position.
   *
   * @param context the calling thread context
   * @return this reader, or nil when no node exists at the new position
   * @throws RaiseException when parsing fails or the new current node is an
   *         error node
   */
  @JRubyMethod
  public IRubyObject
  read(ThreadContext context)
  {
    position++;
    RaiseException parseFailure = null;
    try {
      while (nodeQueue.size() <= position && continueParsing) {
        readMoreData(context);
      }
    } catch (RaiseException ex) {
      parseFailure = ex;
    }
    // Invoke the error bookkeeping exactly once, outside the try block.
    // Previously an exception raised by that bookkeeping for an already
    // queued error node was caught here and the bookkeeping ran a second
    // time, appending the same error to @errors twice.
    return setAndRaiseErrorsIfAny(context.runtime, parseFailure);
  }

  /**
   * Finishes a read step: records an error node in {@code @errors} and raises,
   * re-raises a pending exception, or returns the reader.
   *
   * @param runtime the JRuby runtime
   * @param ex an exception caught while parsing, or {@code null}
   * @return nil when there is no current node (a pending {@code ex} is
   *         dropped in that case), otherwise this reader
   * @throws RaiseException {@code ex} when it is non-null and a current node
   *         exists; otherwise the current node's syntax error when that node
   *         is an error node
   */
  private IRubyObject
  setAndRaiseErrorsIfAny(final Ruby runtime, final RaiseException ex) throws RaiseException
  {
    final ReaderNode node = currentNode();
    if (node == null) { return runtime.getNil(); }

    if (!node.isError()) {
      if (ex == null) { return this; }
      throw ex;
    }

    RubyArray<?> errors = (RubyArray) getInstanceVariable("@errors");
    IRubyObject syntaxError = node.toSyntaxError();
    errors.append(syntaxError);
    setInstanceVariable("@errors", errors);

    // A pending parser exception takes precedence over the node's own error.
    if (ex != null) { throw ex; }
    throw ((XmlSyntaxError) syntaxError).toThrowable();
  }

  /**
   * Returns the queued node at the current position.
   *
   * @return the node, or {@code null} when the position is at or beyond the
   *         end of the queue
   */
  private ReaderNode
  currentNode()
  {
    return position < nodeQueue.size() ? nodeQueue.get(position) : null;
  }

  /**
   * Returns the reader's mode code as a Ruby Fixnum.
   *
   * @param context the calling thread context
   * @return the value last stored by {@code setState}
   */
  @JRubyMethod
  public IRubyObject
  state(ThreadContext context)
  {
    final Ruby runtime = context.getRuntime();
    return runtime.newFixnum(state);
  }

  /**
   * Returns the value of the current node.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code getValue}
   */
  @JRubyMethod
  public IRubyObject
  value(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getValue();
  }

  /**
   * Ruby {@code value?}: asks the current node whether it carries a value.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code hasValue}
   */
  @JRubyMethod(name = "value?")
  public IRubyObject
  value_p(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.hasValue();
  }

  /**
   * Returns the XML version reported by the current node.
   *
   * @param context the calling thread context (not used)
   * @return the result of the current node's {@code getXmlVersion}
   */
  @JRubyMethod
  public IRubyObject
  xml_version(ThreadContext context)
  {
    final ReaderNode node = currentNode();
    return node.getXmlVersion();
  }

  /**
   * Builds a Xerces parser configuration whose document, DTD and error
   * callbacks all go to one new {@code DocumentHandler}, and whose entity
   * resolution goes through a {@code NokogiriEntityResolver}.
   *
   * @param ruby the JRuby runtime handed to the handler and resolver
   * @param options parser options; external DTD loading is enabled when
   *        either {@code dtdLoad} or {@code dtdValid} is set
   * @return the configured pull-parser configuration
   */
  protected XMLPullParserConfiguration
  createReader(Ruby ruby, Options options)
  {
    final DocumentHandler callbacks = new DocumentHandler(ruby);
    final boolean loadExternalDtd = options.dtdLoad || options.dtdValid;

    StandardParserConfiguration parserConfig = new StandardParserConfiguration();
    // XMLReader reader = XMLReaderFactory.createXMLReader();
    parserConfig.setDocumentHandler(callbacks);
    parserConfig.setDTDHandler(callbacks);
    parserConfig.setErrorHandler(callbacks);

    NokogiriEntityResolver resolver = new NokogiriEntityResolver(ruby, null, options);
    parserConfig.setEntityResolver(new EntityResolver2Wrapper(resolver));
    // config.setFeature("http://xml.org/sax/features/xmlns-uris", true);
    // config.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
    parserConfig.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", loadExternalDtd);
    return parserConfig;
  }

  /**
   * Receives parser callbacks and turns them into ReaderNode entries appended
   * to the enclosing reader's nodeQueue. Also serves as the parser's error
   * handler.
   */
  private class DocumentHandler extends DefaultXMLDocumentHandler implements XMLErrorHandler
  {

    // Stacks handed to the ReaderNode factory methods; pushed in
    // commonElement and popped in endElement.
    Stack<String> langStack;
    // Nesting level: incremented for each non-empty start element, decremented in endElement.
    int depth;
    Stack<String> xmlBaseStack;
    // Start-element nodes whose end has not been seen yet.
    Stack<ReaderNode.ElementNode> elementStack;
    private final Ruby ruby;

    /**
     * @param ruby the JRuby runtime passed to every node this handler creates
     */
    public
    DocumentHandler(Ruby ruby)
    {
      this.ruby = ruby;
    }

    /**
     * Queues an ExceptionNode (with no underlying exception) when the
     * augmentations flag the entity as skipped; otherwise does nothing.
     */
    @Override
    public void
    startGeneralEntity(String name, XMLResourceIdentifier identifier,
                       String encoding, Augmentations augs) throws XNIException
    {
      Object entitySkipped;
      if (augs != null && (entitySkipped = augs.getItem(Constants.ENTITY_SKIPPED)) != null && ((Boolean) entitySkipped)) {
        nodeQueue.add(new ReaderNode.ExceptionNode(ruby, null));
      }
    }



    /** Resets the depth counter and creates fresh tracking stacks. */
    @Override
    public void
    startDocument(XMLLocator locator, String encoding, NamespaceContext context, Augmentations augs)
    {
      depth = 0;
      langStack = new Stack<String>();
      xmlBaseStack = new Stack<String>();
      elementStack = new Stack<ReaderNode.ElementNode>();
    }

    /** Drops the tracking stacks. */
    @Override
    public void
    endDocument(Augmentations augs)
    {
      langStack = null;
      xmlBaseStack = null;
      elementStack = null;
    }

    /** Handles a start tag as a non-empty element. */
    @Override
    public void
    startElement(QName element, XMLAttributes attrs, Augmentations augs)
    {
      commonElement(element, attrs, false);
    }

    /**
     * Closes the innermost open element: records its end offset and, when
     * anything was queued after its start node, queues a closing node.
     */
    @Override
    public void
    endElement(QName element, Augmentations augs)
    {
      String uri = element.uri;
      String localName = element.localpart;
      String qName = element.rawname;
      depth--;
      ElementNode startElementNode = elementStack.pop();
      ReaderNode node = ReaderNode.createClosingNode(ruby, uri, localName, qName, depth, langStack, xmlBaseStack);

      // Provisional end offset: index of the last node queued so far.
      startElementNode.endOffset = nodeQueue.size() - 1;

      if (startElementNode.endOffset != startElementNode.startOffset) {
        // this node isn't empty
        node.attributeList = startElementNode.attributeList;
        node.namespaces = startElementNode.namespaces;
        node.startOffset = startElementNode.startOffset;
        // Both nodes end at the index the closing node is about to occupy.
        node.endOffset = ++startElementNode.endOffset;
        node.hasChildren = startElementNode.hasChildren = true;
        nodeQueue.add(node);
      }
      // When the offsets are equal nothing followed the start node, so no
      // closing node is queued for it.
      if (!langStack.isEmpty()) { langStack.pop(); }
      if (!xmlBaseStack.isEmpty()) { xmlBaseStack.pop(); }
    }

    /** Handles an empty-element tag; no matching endElement bookkeeping is set up. */
    @Override
    public void
    emptyElement(QName element, XMLAttributes attrs, Augmentations augs)
    {
      commonElement(element, attrs, true);
    }

    /**
     * Queues an element node and marks the enclosing open element, if any, as
     * having children.
     *
     * @param isEmpty true for an empty-element tag: the node is closed
     *        immediately; false pushes it on the open-element stack and
     *        increases the depth
     */
    private void
    commonElement(QName element, XMLAttributes attrs, boolean isEmpty)
    {
      String qName = element.rawname;
      String uri = element.uri;
      String localName = element.localpart;
      ReaderNode readerNode = ReaderNode.createElementNode(ruby, uri, localName, qName, attrs, depth, langStack,
                              xmlBaseStack);
      if (!elementStack.isEmpty()) {
        ElementNode parent = elementStack.peek();
        parent.hasChildren = true;
      }
      nodeQueue.add(readerNode);
      // The node's own index in the queue.
      readerNode.startOffset = nodeQueue.size() - 1;
      if (!isEmpty) {
        depth++;
        // Values are pushed only when the node carries them.
        if (readerNode.lang != null) { langStack.push(readerNode.lang); }
        if (readerNode.xmlBase != null) { xmlBaseStack.push(readerNode.xmlBase); }
        elementStack.push((ReaderNode.ElementNode)readerNode);
      } else {
        readerNode.endOffset = readerNode.startOffset;
        readerNode.hasChildren = false;
      }
    }

    /** Queues a text node whose start and end offsets are both its own index. */
    @Override
    public void
    characters(XMLString string, Augmentations augs)
    {
      ReaderNode.TextNode node = ReaderNode.createTextNode(ruby, string.toString(), depth, langStack, xmlBaseStack);
      nodeQueue.add(node);
      node.startOffset = node.endOffset = nodeQueue.size() - 1;
    }

    /** Queues an ExceptionNode for the error, then rethrows it to the parser. */
    @Override
    public void
    error(String domain, String key, XMLParseException ex)
    {
      nodeQueue.add(new ReaderNode.ExceptionNode(ruby, ex));
      throw ex;
    }

    /** Queues an ExceptionNode for the fatal error, then rethrows it to the parser. */
    @Override
    public void
    fatalError(String domain, String key, XMLParseException ex)
    {
      nodeQueue.add(new ReaderNode.ExceptionNode(ruby, ex));
      throw ex;
    }

    /** Queues an ExceptionNode for the warning and rethrows it, exactly as for errors. */
    @Override
    public void
    warning(String domain, String key, XMLParseException ex)
    {
      nodeQueue.add(new ReaderNode.ExceptionNode(ruby, ex));
      throw ex;
    }
  }
}