// ext/java/nokogiri/XmlReader.java
package nokogiri;
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
import static nokogiri.internals.NokogiriHelpers.stringOrBlank;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.List;
import java.util.Stack;
import org.apache.xerces.impl.Constants;
import org.apache.xerces.impl.xs.opti.DefaultXMLDocumentHandler;
import org.apache.xerces.parsers.StandardParserConfiguration;
import org.apache.xerces.util.EntityResolver2Wrapper;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLErrorHandler;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParseException;
import org.apache.xerces.xni.parser.XMLPullParserConfiguration;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyBoolean;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
import org.jruby.RubyObject;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.exceptions.RaiseException;
import org.jruby.runtime.Block;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.IOInputStream;
import org.xml.sax.InputSource;
import nokogiri.internals.NokogiriEntityResolver;
import nokogiri.internals.ParserContext;
import nokogiri.internals.ParserContext.Options;
import nokogiri.internals.ReaderNode;
import nokogiri.internals.ReaderNode.ClosingNode;
import nokogiri.internals.ReaderNode.ElementNode;
import nokogiri.internals.ReaderNode.TextNode;
/**
* Class for Nokogiri::XML::Reader
*
* @author sergio
* @author Yoko Harada <yokolet@gmail.com>
*/
@JRubyClass(name = "Nokogiri::XML::Reader")
public class XmlReader extends RubyObject
{
// Serialization version identifier.
private static final long serialVersionUID = 1L;
// Reader mode codes exposed through state(). In this class only READING and
// CLOSED are ever assigned (both in setInput); the remaining codes are declared
// but not referenced in the visible code.
// NOTE(review): names and values look like libxml2's xmlTextReaderMode enum — confirm.
private static final int XML_TEXTREADER_MODE_INITIAL = 0;
private static final int XML_TEXTREADER_MODE_INTERACTIVE = 1;
private static final int XML_TEXTREADER_MODE_ERROR = 2;
private static final int XML_TEXTREADER_MODE_EOF = 3;
private static final int XML_TEXTREADER_MODE_CLOSED = 4;
private static final int XML_TEXTREADER_MODE_READING = 5;
// Nodes appended by the parser callbacks in the order they arrive; init() seeds
// index 0 with an EmptyNode placeholder.
List<ReaderNode> nodeQueue;
// Last mode code stored by setState(); returned by state().
private int state;
// Index into nodeQueue of the node the reader is currently on; advanced by read().
private int position = 0;
// Pull-parser configuration created by createReader() during setInput().
private XMLPullParserConfiguration config;
// Result of the most recent config.parse(false) call; once false, readMoreData()
// refuses to parse further.
private boolean continueParsing = true;
/**
 * Creates a reader object of the given Ruby class. The node queue is not
 * created here; {@link #init(Ruby)} must be called before the reader is used.
 *
 * @param runtime the JRuby runtime
 * @param klazz the Ruby class of the new object
 */
public
XmlReader(Ruby runtime, RubyClass klazz)
{
super(runtime, klazz);
}
/**
 * Returns a copy of this reader obtained by delegating to {@code super.clone()}.
 *
 * @return the object produced by the superclass clone
 * @throws CloneNotSupportedException if the superclass refuses to clone
 */
@Override
public Object
clone() throws CloneNotSupportedException
{
  final Object copy = super.clone();
  return copy;
}
/**
 * Resets the node queue to a fresh list whose only entry is an
 * {@code EmptyNode} placeholder, so index 0 is occupied before any parsing.
 *
 * @param runtime the JRuby runtime handed to the placeholder node
 */
public void
init(Ruby runtime)
{
  this.nodeQueue = new LinkedList<ReaderNode>();
  final ReaderNode placeholder = new ReaderNode.EmptyNode(runtime);
  this.nodeQueue.add(placeholder);
}
/**
 * Builds the pull-parser configuration and attaches the given byte stream to it.
 * The state is set to READING while the input is wired up and to CLOSED once
 * the input source has been accepted.
 *
 * @param context current thread context
 * @param in byte stream carrying the XML document
 * @param url Ruby value passed to {@code ParserContext.setUrl} to fill in the source ids
 * @param options parser options forwarded to {@link #createReader}
 */
private void
setInput(ThreadContext context, InputStream in, IRubyObject url, Options options)
{
  setState(XML_TEXTREADER_MODE_READING);
  this.config = createReader(context.getRuntime(), options);

  // An InputSource is used only as a carrier for the public/system ids derived from url.
  final InputSource idCarrier = new InputSource();
  ParserContext.setUrl(context, idCarrier, url);
  final String publicId = idCarrier.getPublicId();
  final String systemId = idCarrier.getSystemId();
  final XMLInputSource parserInput = new XMLInputSource(publicId, systemId, null, in, null);

  try {
    this.config.setInputSource(parserInput);
  } catch (IOException ioFailure) {
    throw context.getRuntime().newRuntimeError(ioFailure.getMessage());
  }
  setState(XML_TEXTREADER_MODE_CLOSED);
}
/** Stores the given reader mode code; it is later reported by {@code state()}. */
private void
setState(int state)
{
  this.state = state;
}
/**
 * Delegates to the current node's {@code getAttributeByName}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
attribute(ThreadContext context, IRubyObject name)
{
  final ReaderNode node = currentNode();
  return node.getAttributeByName(name);
}
/**
 * Delegates to the current node's {@code getAttributeByIndex}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
attribute_at(ThreadContext context, IRubyObject index)
{
  final ReaderNode node = currentNode();
  return node.getAttributeByIndex(index);
}
/**
 * Delegates to the current node's {@code getAttributeCount}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
attribute_count(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getAttributeCount();
}
/**
 * Delegates to the current node's {@code getAttributes}, passing the thread context.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
attribute_hash(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getAttributes(context);
}
/**
 * Ruby {@code attributes?}: delegates to the current node's {@code hasAttributes}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod(name = "attributes?")
public IRubyObject
attributes_p(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.hasAttributes();
}
/**
 * Delegates to the current node's {@code getXmlBase}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
base_uri(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getXmlBase();
}
/**
 * Ruby {@code default?}: delegates to the current node's {@code isDefault}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod(name = "default?")
public IRubyObject
default_p(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.isDefault();
}
/**
 * Delegates to the current node's {@code getDepth}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
depth(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getDepth();
}
/**
 * Ruby {@code empty_element?} / {@code self_closing?}: tells whether the current
 * node is an element that has no children.
 *
 * @param context current thread context
 * @return nil when there is no current node, false when the current node is not
 *         an {@code ElementNode}, otherwise the negation of the node's
 *         {@code hasChildren} flag
 */
@JRubyMethod(name = {"empty_element?", "self_closing?"})
public IRubyObject
empty_element_p(ThreadContext context)
{
  final Ruby runtime = context.getRuntime();
  final ReaderNode readerNode = currentNode();
  // Test for a missing node first: ensureNodeClosed dereferences the current node.
  if (readerNode == null) { return runtime.getNil(); }
  // Parse ahead until the node's end is known so hasChildren is settled.
  ensureNodeClosed(context);
  // Only elements can be empty elements; the result must actually be returned.
  if (!(readerNode instanceof ElementNode)) { return runtime.getFalse(); }
  return RubyBoolean.newBoolean(runtime, !readerNode.hasChildren);
}
/**
 * Returns the {@code @encoding} instance variable when it is not nil,
 * otherwise nil.
 */
@JRubyMethod
public IRubyObject
encoding(ThreadContext context)
{
  final IRubyObject fromConstructor = getInstanceVariable("@encoding");
  if (fromConstructor.isNil()) {
    // TODO: get the parser's detected encoding
    return context.getRuntime().getNil();
  }
  return fromConstructor;
}
/**
 * Ruby {@code Reader.from_io}: builds a reader over a Ruby IO object.
 * Positional arguments: args[0] io (must not be nil), args[1] url,
 * args[2] encoding, args[3] parser option bits.
 *
 * @return the newly created reader
 */
@JRubyMethod(meta = true, rest = true)
public static IRubyObject
from_io(ThreadContext context, IRubyObject cls, IRubyObject args[])
{
  final Ruby runtime = context.getRuntime();
  final IRubyObject io = args[0];
  if (io.isNil()) { throw runtime.newArgumentError("io cannot be nil"); }

  final XmlReader reader = (XmlReader) NokogiriService.XML_READER_ALLOCATOR.allocate(
                             runtime, getNokogiriClass(runtime, "Nokogiri::XML::Reader"));
  reader.init(runtime);
  // Keep the original io reachable from Ruby as @source.
  reader.setInstanceVariable("@source", io);
  reader.setInstanceVariable("@errors", runtime.newArray());
  if (args.length > 2) { reader.setInstanceVariable("@encoding", args[2]); }

  final IRubyObject url = args.length > 1 ? args[1] : context.nil;
  // Without explicit options, fall back to the defaults RECOVER | NONET.
  final Options options = args.length > 3
                          ? new ParserContext.Options(args[3].toJava(Long.class))
                          : new ParserContext.Options(2048 | 1);

  final InputStream in = new IOInputStream(io);
  reader.setInput(context, in, url, options);
  return reader;
}
/**
 * Ruby {@code Reader.from_memory}: builds a reader over an in-memory string by
 * wrapping it in a Ruby {@code StringIO}.
 * Positional arguments: args[0] string (must not be nil), args[1] url,
 * args[2] encoding, args[3] parser option bits.
 *
 * @return the newly created reader
 */
@JRubyMethod(meta = true, rest = true)
public static IRubyObject
from_memory(ThreadContext context, IRubyObject cls, IRubyObject args[])
{
  final Ruby runtime = context.getRuntime();
  final IRubyObject xml = args[0];
  if (xml.isNil()) { throw runtime.newArgumentError("string cannot be nil"); }

  final XmlReader reader = (XmlReader) NokogiriService.XML_READER_ALLOCATOR.allocate(
                             runtime, getNokogiriClass(runtime, "Nokogiri::XML::Reader"));
  reader.init(runtime);
  reader.setInstanceVariable("@source", xml);
  reader.setInstanceVariable("@errors", runtime.newArray());
  if (args.length > 2) { reader.setInstanceVariable("@encoding", args[2]); }

  final IRubyObject url = args.length > 1 ? args[1] : context.nil;
  // Without explicit options, fall back to the defaults RECOVER | NONET.
  final Options options = args.length > 3
                          ? new ParserContext.Options(args[3].toJava(Long.class))
                          : new ParserContext.Options(2048 | 1);

  // The string is read through a StringIO so it can share the IO-based input path.
  final IRubyObject stringIO = runtime.getClass("StringIO").newInstance(context, xml, Block.NULL_BLOCK);
  final InputStream in = new IOInputStream(stringIO);
  reader.setInput(context, in, url, options);
  return reader;
}
/**
 * Returns the current node's {@code getNodeType} result, substituting Fixnum
 * zero when that result is a Java null.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
node_type(ThreadContext context)
{
  final IRubyObject type = currentNode().getNodeType();
  if (type == null) {
    return RubyFixnum.zero(context.getRuntime());
  }
  return type;
}
/**
 * Returns the serialized content between the current node's start and end,
 * after parsing ahead far enough for the node's end to be known. The result
 * is passed through {@code stringOrBlank}.
 */
@JRubyMethod
public IRubyObject
inner_xml(ThreadContext context)
{
  ensureNodeClosed(context);
  final Ruby runtime = context.getRuntime();
  final String inner = getInnerXml(currentNode());
  return stringOrBlank(runtime, inner);
}
/**
 * Concatenates the string form of every queued node strictly between the
 * given node's start and end offsets.
 *
 * @param current node whose inner content is wanted; may be null
 * @return the concatenated text, or null when there is no node, its depth is
 *         negative, or it has no children
 */
private String
getInnerXml(ReaderNode current)
{
  // Null guard mirrors getOuterXml: currentNode() returns null past the end of the queue.
  if (current == null || current.depth < 0) { return null; }
  if (!current.hasChildren) { return null; }
  // StringBuilder (as in getOuterXml): no synchronization is needed for a local buffer.
  final StringBuilder sb = new StringBuilder();
  for (int i = current.startOffset + 1; i < current.endOffset; i++) {
    sb.append(nodeQueue.get(i).getString());
  }
  return sb.toString();
}
/**
 * Returns the serialized form of the current node including its own markup,
 * after parsing ahead far enough for the node's end to be known. The result
 * is passed through {@code stringOrBlank}.
 */
@JRubyMethod
public IRubyObject
outer_xml(ThreadContext context)
{
  ensureNodeClosed(context);
  final Ruby runtime = context.getRuntime();
  final String outer = getOuterXml();
  return stringOrBlank(runtime, outer);
}
/**
 * Builds the outer XML of the current node.
 *
 * @return null when there is no current node or its depth is negative; for a
 *         closing node the text {@code <name/>}; otherwise the concatenated
 *         string form of the queued nodes from the current position through
 *         the node's end offset
 */
private String
getOuterXml()
{
  final ReaderNode node = currentNode();
  if (node == null) { return null; }
  if (node.depth < 0) { return null; }
  if (node instanceof ClosingNode) {
    return "<" + node.name + "/>";
  }
  final StringBuilder out = new StringBuilder();
  int index = position;
  while (index <= node.endOffset) {
    out.append(nodeQueue.get(index).getString());
    index++;
  }
  return out.toString();
}
/**
 * Delegates to the current node's {@code getLang}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
lang(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getLang();
}
/**
 * Delegates to the current node's {@code getLocalName}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
local_name(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getLocalName();
}
/**
 * Delegates to the current node's {@code getName}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
name(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getName();
}
/**
 * Delegates to the current node's {@code getUri}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
namespace_uri(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getUri();
}
/**
 * Delegates to the current node's {@code getNamespaces}, passing the thread context.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
namespaces(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getNamespaces(context);
}
/**
 * Delegates to the current node's {@code getPrefix}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
prefix(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getPrefix();
}
/**
 * Runs one incremental parse step ({@code config.parse(false)}) and records
 * its boolean result in {@code continueParsing}.
 *
 * Raises a Ruby RuntimeError when a previous step already reported that
 * parsing cannot continue or when the parser hits an I/O failure, and a
 * Nokogiri::XML::SyntaxError when the parser throws an XNIException.
 */
private void
readMoreData(ThreadContext context)
{
  if (continueParsing) {
    try {
      continueParsing = config.parse(false);
    } catch (XNIException parseFailure) {
      // Surfaces to Ruby as Nokogiri::XML::SyntaxError.
      throw XmlSyntaxError.createXMLSyntaxError(context.runtime, parseFailure).toThrowable();
    } catch (IOException ioFailure) {
      throw context.runtime.newRuntimeError(ioFailure.toString());
    }
  } else {
    throw context.runtime.newRuntimeError("Cannot parse more data");
  }
}
/**
 * Parses ahead until the current node's end offset has been recorded
 * (i.e. is at least 1). Text nodes need no look-ahead, and when there is no
 * current node at all there is nothing to wait for.
 *
 * May raise whatever {@code readMoreData} raises, including the RuntimeError
 * it throws once the parser has no more input.
 */
private void
ensureNodeClosed(ThreadContext context)
{
  final ReaderNode node = currentNode();
  // currentNode() yields null past the end of the queue; callers handle that case themselves.
  if (node == null) { return; }
  if (node instanceof TextNode) { return; }
  while (node.endOffset < 1) { readMoreData(context); }
}
/**
 * Ruby {@code read}: advances to the next node, parsing more input when the
 * queue does not yet hold a node at the new position.
 *
 * @param context current thread context
 * @return this reader, or nil when no node exists at the new position
 * @throws RaiseException when parsing raised, or when the node at the new
 *         position is an error node (the error is also appended to
 *         {@code @errors})
 */
@JRubyMethod
public IRubyObject
read(ThreadContext context)
{
  position++;
  RaiseException raised = null;
  try {
    while (nodeQueue.size() <= position && continueParsing) {
      readMoreData(context);
    }
  } catch (RaiseException ex) {
    raised = ex;
  }
  // Called exactly once, outside the try: if it ran inside the try, the exception
  // it throws for an error node would be caught above and the same error would be
  // appended to @errors a second time.
  return setAndRaiseErrorsIfAny(context.runtime, raised);
}
/**
 * Decides the outcome of a read step for the node at the current position.
 *
 * @param runtime the JRuby runtime
 * @param ex exception raised while parsing, or null when parsing went cleanly
 * @return nil when there is no current node, otherwise this reader
 * @throws RaiseException {@code ex} when it is non-null and a current node
 *         exists; for an error node with no {@code ex}, the throwable built
 *         from the node's syntax error
 */
private IRubyObject
setAndRaiseErrorsIfAny(final Ruby runtime, final RaiseException ex) throws RaiseException
{
  final ReaderNode node = currentNode();
  // NOTE(review): with no current node a non-null ex is dropped here — confirm this is intended.
  if (node == null) { return runtime.getNil(); }

  if (!node.isError()) {
    if (ex != null) { throw ex; }
    return this;
  }

  // Record the node's syntax error on @errors before raising.
  RubyArray<?> recorded = (RubyArray) getInstanceVariable("@errors");
  IRubyObject syntaxError = node.toSyntaxError();
  recorded.append(syntaxError);
  setInstanceVariable("@errors", recorded);
  if (ex != null) { throw ex; }
  throw ((XmlSyntaxError) syntaxError).toThrowable();
}
/**
 * Looks up the queued node at the current position.
 *
 * @return the node at {@code position}, or null when the position lies at or
 *         beyond the end of the queue
 */
private ReaderNode
currentNode()
{
  return position < nodeQueue.size() ? nodeQueue.get(position) : null;
}
/** Returns the stored reader mode code as a Ruby Fixnum. */
@JRubyMethod
public IRubyObject
state(ThreadContext context)
{
  final Ruby runtime = context.getRuntime();
  return runtime.newFixnum(this.state);
}
/**
 * Delegates to the current node's {@code getValue}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
value(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getValue();
}
/**
 * Ruby {@code value?}: delegates to the current node's {@code hasValue}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod(name = "value?")
public IRubyObject
value_p(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.hasValue();
}
/**
 * Delegates to the current node's {@code getXmlVersion}.
 * The current node is dereferenced without a null check.
 */
@JRubyMethod
public IRubyObject
xml_version(ThreadContext context)
{
  final ReaderNode node = currentNode();
  return node.getXmlVersion();
}
/**
 * Creates the Xerces parser configuration used for pull parsing. A single
 * {@code DocumentHandler} instance is installed as document, DTD and error
 * handler, and entities are resolved through a {@code NokogiriEntityResolver}.
 *
 * @param ruby the JRuby runtime
 * @param options parser options; external DTD loading is enabled when either
 *        {@code dtdLoad} or {@code dtdValid} is set
 * @return the configured parser configuration
 */
protected XMLPullParserConfiguration
createReader(Ruby ruby, Options options)
{
  final StandardParserConfiguration parserConfig = new StandardParserConfiguration();
  final DocumentHandler callbacks = new DocumentHandler(ruby);
  parserConfig.setDocumentHandler(callbacks);
  parserConfig.setDTDHandler(callbacks);
  parserConfig.setErrorHandler(callbacks);

  final NokogiriEntityResolver resolver = new NokogiriEntityResolver(ruby, null, options);
  parserConfig.setEntityResolver(new EntityResolver2Wrapper(resolver));

  // The SAX features "xmlns-uris" and "namespace-prefixes" are deliberately not set here.
  final boolean loadExternalDtd = options.dtdLoad || options.dtdValid;
  parserConfig.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", loadExternalDtd);
  return parserConfig;
}
/**
 * Receives the parser's document and error callbacks and turns them into
 * ReaderNode entries appended to the enclosing reader's nodeQueue. It is a
 * non-static inner class because it writes to that queue directly.
 */
private class DocumentHandler extends DefaultXMLDocumentHandler implements XMLErrorHandler
{
// Stack of lang values pushed by open elements (see commonElement).
Stack<String> langStack;
// Nesting depth assigned to the next node created.
int depth;
// Stack of xmlBase values pushed by open elements (see commonElement).
Stack<String> xmlBaseStack;
// Start-element nodes whose end has not been seen yet.
Stack<ReaderNode.ElementNode> elementStack;
private final Ruby ruby;
public
DocumentHandler(Ruby ruby)
{
this.ruby = ruby;
}
/**
 * Queues an ExceptionNode (with a null exception) when the augmentations
 * carry a true ENTITY_SKIPPED item; otherwise does nothing.
 */
@Override
public void
startGeneralEntity(String name, XMLResourceIdentifier identifier,
String encoding, Augmentations augs) throws XNIException
{
Object entitySkipped;
if (augs != null && (entitySkipped = augs.getItem(Constants.ENTITY_SKIPPED)) != null && ((Boolean) entitySkipped)) {
nodeQueue.add(new ReaderNode.ExceptionNode(ruby, null));
}
}
/** Resets the depth counter and creates fresh stacks for a new document. */
@Override
public void
startDocument(XMLLocator locator, String encoding, NamespaceContext context, Augmentations augs)
{
depth = 0;
langStack = new Stack<String>();
xmlBaseStack = new Stack<String>();
elementStack = new Stack<ReaderNode.ElementNode>();
}
/** Drops the per-document stacks. */
@Override
public void
endDocument(Augmentations augs)
{
langStack = null;
xmlBaseStack = null;
elementStack = null;
}
/** Handles an element with a separate end tag; see commonElement. */
@Override
public void
startElement(QName element, XMLAttributes attrs, Augmentations augs)
{
commonElement(element, attrs, false);
}
/**
 * Closes the most recently opened element: records its end offset and, when
 * any node was queued after its start node, queues a closing node as well.
 */
@Override
public void
endElement(QName element, Augmentations augs)
{
String uri = element.uri;
String localName = element.localpart;
String qName = element.rawname;
depth--;
ElementNode startElementNode = elementStack.pop();
ReaderNode node = ReaderNode.createClosingNode(ruby, uri, localName, qName, depth, langStack, xmlBaseStack);
// Index of the last node queued so far.
startElementNode.endOffset = nodeQueue.size() - 1;
// Equal offsets mean nothing was queued after the start node; in that case no
// closing node is queued and the start node ends at itself.
if (startElementNode.endOffset != startElementNode.startOffset) {
// this node isn't empty
node.attributeList = startElementNode.attributeList;
node.namespaces = startElementNode.namespaces;
node.startOffset = startElementNode.startOffset;
// Both end offsets advance by one to the index the closing node is about to occupy.
node.endOffset = ++startElementNode.endOffset;
node.hasChildren = startElementNode.hasChildren = true;
nodeQueue.add(node);
}
// NOTE(review): these pops run for every end element, while commonElement pushes
// only when the element node's lang/xmlBase is non-null — confirm the stacks stay balanced.
if (!langStack.isEmpty()) { langStack.pop(); }
if (!xmlBaseStack.isEmpty()) { xmlBaseStack.pop(); }
}
/** Handles a self-closing element; see commonElement. */
@Override
public void
emptyElement(QName element, XMLAttributes attrs, Augmentations augs)
{
commonElement(element, attrs, true);
}
/**
 * Queues an element node. Marks the enclosing open element (if any) as having
 * children. For a non-empty element the depth is increased and the node is
 * pushed on the element stack until its end is seen; an empty element is
 * closed immediately with endOffset equal to startOffset.
 */
private void
commonElement(QName element, XMLAttributes attrs, boolean isEmpty)
{
String qName = element.rawname;
String uri = element.uri;
String localName = element.localpart;
ReaderNode readerNode = ReaderNode.createElementNode(ruby, uri, localName, qName, attrs, depth, langStack,
xmlBaseStack);
if (!elementStack.isEmpty()) {
ElementNode parent = elementStack.peek();
parent.hasChildren = true;
}
nodeQueue.add(readerNode);
// The node's own index in the queue.
readerNode.startOffset = nodeQueue.size() - 1;
if (!isEmpty) {
depth++;
if (readerNode.lang != null) { langStack.push(readerNode.lang); }
if (readerNode.xmlBase != null) { xmlBaseStack.push(readerNode.xmlBase); }
elementStack.push((ReaderNode.ElementNode)readerNode);
} else {
readerNode.endOffset = readerNode.startOffset;
readerNode.hasChildren = false;
}
}
/** Queues a text node whose start and end offsets are both its own queue index. */
@Override
public void
characters(XMLString string, Augmentations augs)
{
ReaderNode.TextNode node = ReaderNode.createTextNode(ruby, string.toString(), depth, langStack, xmlBaseStack);
nodeQueue.add(node);
node.startOffset = node.endOffset = nodeQueue.size() - 1;
}
/** Queues an ExceptionNode for the reported error, then rethrows it. */
@Override
public void
error(String domain, String key, XMLParseException ex)
{
nodeQueue.add(new ReaderNode.ExceptionNode(ruby, ex));
throw ex;
}
/** Queues an ExceptionNode for the reported fatal error, then rethrows it. */
@Override
public void
fatalError(String domain, String key, XMLParseException ex)
{
nodeQueue.add(new ReaderNode.ExceptionNode(ruby, ex));
throw ex;
}
/** Queues an ExceptionNode for the reported warning, then rethrows it (warnings are treated like errors here). */
@Override
public void
warning(String domain, String key, XMLParseException ex)
{
nodeQueue.add(new ReaderNode.ExceptionNode(ruby, ex));
throw ex;
}
}
}