LoboHTML/src/main/java/org/loboevolution/html/parser/XHtmlParser.java
/*
* MIT License
*
* Copyright (c) 2014 - 2024 LoboEvolution
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Contact info: ivan.difrancesco@yahoo.it
*/
package org.loboevolution.html.parser;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.htmlunit.cssparser.dom.DOMException;
import org.loboevolution.common.Strings;
import org.loboevolution.html.Entities;
import org.loboevolution.html.HTMLEntities;
import org.loboevolution.html.HTMLTag;
import org.loboevolution.html.dom.domimpl.HTMLDocumentImpl;
import org.loboevolution.html.dom.nodeimpl.DocumentTypeImpl;
import org.loboevolution.html.dom.nodeimpl.ElementImpl;
import org.loboevolution.html.dom.nodeimpl.EntityReferenceImpl;
import org.loboevolution.html.dom.nodeimpl.NotationImpl;
import org.loboevolution.html.node.*;
import org.loboevolution.http.UserAgentContext;
import org.loboevolution.info.AttributeInfo;
import org.loboevolution.info.ElementInfo;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
/**
* The XHtmlParser class is an HTML DOM parser. This parser provides
* the functionality for the standard DOM parser implementation
* {@link org.loboevolution.html.parser.DocumentBuilderImpl}. This parser class
* may be used directly when a different DOM implementation is preferred.
*/
@Slf4j
public class XHtmlParser {
/**
 * User-data key ("cobra.suspend") that this parser sets to {@code Boolean.TRUE}
 * on a node while it is appending to it and back to {@code Boolean.FALSE}
 * once it is done.
 */
public static final String MODIFYING_KEY = "cobra.suspend";
// Token codes returned by parseToken() and parseForEndTag().
// Returned for "<!...>" declarations (DOCTYPE, ENTITY, NOTATION and unrecognised ones).
private static final int TOKEN_BAD = 6;
// An element was created but no matching end tag was consumed for it.
private static final int TOKEN_BEGIN_ELEMENT = 3;
// A comment was read and appended.
private static final int TOKEN_COMMENT = 1;
// An end tag was read; its name is left in normalLastTag.
private static final int TOKEN_END_ELEMENT = 4;
// End of input.
private static final int TOKEN_EOD = 0;
// An element together with its end tag, or a processing instruction, was read.
private static final int TOKEN_FULL_ELEMENT = 5;
// Text with no tag following it, or the "<?xml ...>" declaration.
private static final int TOKEN_TEXT = 2;
// Document used to create every node.
private final Document document;
// Scanner state flags, updated by the read*/pass* helpers after each tag fragment.
// True when the tag that was just closed ended with "/>".
private boolean justReadEmptyElement = false;
// True when the last character consumed was the '<' that opens a tag.
private boolean justReadTagBegin = false;
// True when the '>' that closes the current tag has already been consumed.
private boolean justReadTagEnd = false;
// Name of the most recently processed tag (set by parseToken and parseForEndTag).
private String normalLastTag = null;
// May be null; consulted to decide how NOSCRIPT content is parsed.
private final UserAgentContext ucontext;
// Last HTML / HEAD / BODY nodes seen or synthesized; only tracked while needRoot is true.
private Node lastRootElement = null;
private Node lastHeadElement = null;
private Node lastBodyElement = null;
// When true, missing HTML, HEAD and BODY elements are created. Cleared as soon
// as a DOCTYPE, ENTITY or NOTATION declaration is parsed.
private boolean needRoot = false;
// Namespace prefix -> namespace URI, keyed case-insensitively. The Lombok
// getter is what the rest of this class calls as getNamespaces().
@Getter
private final Map<String, String> namespaces = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
/**
 * Creates a parser that builds nodes with the given document. The
 * {@code needRoot} option keeps its default value of {@code false}.
 *
 * @param ucontext The user agent context.
 * @param document A W3C Document instance.
 */
public XHtmlParser(final UserAgentContext ucontext, final Document document) {
    this.document = document;
    this.ucontext = ucontext;
}
/**
 * Creates a parser that builds nodes with the given document and optionally
 * synthesizes the HTML, HEAD and BODY elements when the input lacks them.
 *
 * @param ucontext The user agent context.
 * @param document A W3C Document instance.
 * @param needRoot {@code true} to have missing HTML, HEAD and BODY elements created.
 */
public XHtmlParser(final UserAgentContext ucontext, final Document document, final boolean needRoot) {
    this.needRoot = needRoot;
    this.document = document;
    this.ucontext = ucontext;
}
/**
 * Tells whether entity references inside the named element should be decoded.
 * Elements with no registered {@link ElementInfo} default to {@code true}.
 *
 * @param elementName the element name, in any case; must not be {@code null}.
 * @return {@code true} if entities in the element's content are to be decoded.
 */
public static boolean isDecodeEntities(final String elementName) {
    // Locale.ROOT keeps the lookup key stable regardless of the JVM default
    // locale (e.g. "title" must become "TITLE", not the Turkish "TİTLE").
    final HTMLTag tag = HTMLTag.get(elementName.toUpperCase(Locale.ROOT));
    final ElementInfo einfo = HTMLEntities.ELEMENT_INFOS.get(tag);
    return einfo == null || einfo.isDecodeEntities();
}
/**
 * Parses the input and builds the DOM under the given node, for example when
 * innerHTML is assigned from Javascript. The parent is flagged with
 * {@link #MODIFYING_KEY} for the duration of the parse.
 *
 * @param reader A LineNumberReader for the document.
 * @param parent The root node for the parsed DOM.
 * @throws java.io.IOException if reading fails.
 * @throws org.xml.sax.SAXException if a stop request escapes the top level.
 */
public void parse(final LineNumberReader reader, final Node parent) throws IOException, SAXException {
    // Note: Parser does not clear document. It could be used incrementally.
    try {
        parent.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
        try {
            int token;
            do {
                // Each top-level token starts with a fresh ancestor list.
                token = this.parseToken(parent, reader, null, new LinkedList<>());
            } while (token != TOKEN_EOD);
        } catch (final StopException se) {
            throw new SAXException("Unexpected flow exception", se);
        }
    } finally {
        if (needRoot) {
            ensureRootElement(parent);
            ensureHeadElement(lastRootElement);
            ensureBodyElement(lastRootElement);
        }
        parent.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
    }
}
/**
 * Parses the markup supplied by a Reader and appends the resulting nodes to
 * the document this parser was constructed with.
 *
 * @param reader An instance of Reader.
 * @throws java.io.IOException if reading fails.
 * @throws org.xml.sax.SAXException if parsing fails.
 */
public void parse(final Reader reader) throws IOException, SAXException {
    final LineNumberReader lineReader = new LineNumberReader(reader);
    this.parse(lineReader);
}
/**
 * Parses the markup supplied by a Reader and builds the DOM under the given
 * node, for example when innerHTML is assigned from Javascript.
 *
 * @param reader A document reader.
 * @param parent The root node for the parsed DOM.
 * @throws java.io.IOException if reading fails.
 * @throws org.xml.sax.SAXException if parsing fails.
 */
public void parse(final Reader reader, final Node parent) throws IOException, SAXException {
    final LineNumberReader lineReader = new LineNumberReader(reader);
    this.parse(lineReader, parent);
}
/**
 * Parses the input and appends the resulting nodes to the document this
 * parser was constructed with.
 *
 * @param reader a {@link java.io.LineNumberReader} object.
 * @throws java.io.IOException if reading fails.
 * @throws org.xml.sax.SAXException if parsing fails.
 */
public void parse(final LineNumberReader reader) throws IOException, SAXException {
    final Node root = this.document;
    this.parse(reader, root);
}
/**
 * Parses text followed by one tag (comment, declaration, processing
 * instruction, end tag, or a whole element including its children).
 * If an element whose tag is in {@code stopTags} is opened, the element is
 * created and a StopException carrying it is thrown.
 *
 * @param parent a {@link org.loboevolution.html.node.Node} that receives the parsed nodes.
 * @param reader a {@link java.io.LineNumberReader} positioned at the next unread character.
 * @param stopTags tags that end the current element implicitly; may be {@code null}.
 * @param ancestors upper-cased names of the open elements, innermost first.
 * @return one of the {@code TOKEN_*} codes describing what was read.
 * @throws IOException if reading fails.
 * @throws StopException if an element listed in {@code stopTags} was opened.
 */
private int parseToken(final Node parent, final LineNumberReader reader, final Set<HTMLTag> stopTags,
                       final LinkedList<String> ancestors) throws IOException, StopException {
    final Document doc = this.document;
    final HTMLDocumentImpl htmlDoc = (HTMLDocumentImpl) doc;
    final StringBuilder textSb = this.readUpToTagBegin(reader);
    if (textSb == null) {
        return TOKEN_EOD;
    }
    if (textSb.length() > 0) {
        final String text = textSb.toString();
        if (Strings.isNotBlank(text.trim())) {
            appendTextContent(parent, text);
        }
    }
    if (!this.justReadTagBegin) {
        this.normalLastTag = null;
        return TOKEN_TEXT;
    }
    String tag = this.readTag(parent, reader);
    if (Strings.isBlank(tag)) {
        return TOKEN_EOD;
    }
    // Locale.ROOT: tag lookups must not depend on the JVM default locale.
    String normalTag = tag.toUpperCase(Locale.ROOT);
    try {
        if (tag.startsWith("!")) {
            switch (tag) {
                case "!--":
                    final StringBuilder comment = this.passEndOfComment(reader);
                    final StringBuilder decText = entityDecode(comment);
                    safeAppendChild(parent, doc.createComment(decText.toString()));
                    return TOKEN_COMMENT;
                case "!DOCTYPE":
                    parseDoctypeDeclaration(reader, htmlDoc);
                    needRoot = false;
                    return TOKEN_BAD;
                case "!ENTITY":
                    parseEntityDeclaration(reader, htmlDoc);
                    needRoot = false;
                    return TOKEN_BAD;
                case "!NOTATION":
                    parseNotationDeclaration(reader, htmlDoc);
                    needRoot = false;
                    return TOKEN_BAD;
                default:
                    passEndOfTag(reader);
                    return TOKEN_BAD;
            }
        } else if (tag.startsWith("/")) {
            normalTag = normalTag.substring(1);
            this.passEndOfTag(reader);
            return TOKEN_END_ELEMENT;
        } else if (tag.startsWith("?")) {
            tag = tag.substring(1);
            final StringBuilder data = readProcessingInstruction(reader);
            if (!tag.equals("xml")) {
                String processData = data.toString();
                // Drop the last character (normally the '?' of "?>"). The data is
                // empty for input such as "<?target?>", which used to throw.
                if (!processData.isEmpty()) {
                    processData = processData.substring(0, processData.length() - 1);
                }
                final ProcessingInstruction pi = doc.createProcessingInstruction(tag, processData);
                parent.appendChild(pi);
                return TOKEN_FULL_ELEMENT;
            } else {
                this.document.setXml(true);
                return TOKEN_TEXT;
            }
        } else {
            final List<AttributeInfo> attributeInfo = new ArrayList<>();
            ElementImpl element = null;
            try {
                if (!this.justReadTagEnd) {
                    while (this.readAttribute(reader, attributeInfo)) {
                        // EMPTY LOOP
                    }
                }
                if (this.document.isXml()) {
                    final String elm = normalTag;
                    String namespaceUri = null;
                    if (attributeInfo.isEmpty()) {
                        namespaceUri = getNamespaces().get(elm.contains(":") ? elm.split(":")[0] : "");
                    }
                    for (final AttributeInfo info : attributeInfo) {
                        final String attribute = info.getAttributeName();
                        final int index = attribute.contains("xmlns") ? 1 : 0;
                        final String attributeSplit = attribute.contains(":") ? attribute.split(":")[index] : attribute;
                        if (attribute.equals("xmlns")
                                || attributeSplit.equalsIgnoreCase(
                                        (elm.contains(":") ? elm.split(":")[0] : elm).toLowerCase(Locale.ROOT))) {
                            if (getNamespaces().get(attributeSplit) == null) {
                                getNamespaces().put(attributeSplit, info.getAttributeValue());
                                namespaceUri = info.getAttributeValue();
                            } else {
                                namespaceUri = getNamespaces().get(attributeSplit);
                            }
                        }
                    }
                    element = (ElementImpl) doc.createElementNS(namespaceUri, normalTag);
                } else {
                    element = (ElementImpl) doc.createElement(normalTag);
                }
                element.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
                safeAppendChild(parent, element);
                for (final AttributeInfo info : attributeInfo) {
                    setAttributeNode(element, info.getAttributeName(), info.getAttributeValue());
                }
                if (stopTags != null && stopTags.contains(HTMLTag.get(normalTag))) {
                    throw new StopException(element);
                }
                if (!this.justReadEmptyElement) {
                    ElementInfo einfo = HTMLEntities.ELEMENT_INFOS.get(HTMLTag.get(normalTag.toUpperCase(Locale.ROOT)));
                    int endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.getEndElementType();
                    if (endTagType != ElementInfo.END_ELEMENT_FORBIDDEN) {
                        boolean childrenOk = einfo == null || einfo.isChildElementOk();
                        Set<HTMLTag> newStopSet = einfo == null ? null : einfo.getStopTags();
                        if (newStopSet == null) {
                            if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
                                newStopSet = Collections.singleton(HTMLTag.get(normalTag));
                            }
                        }
                        if (stopTags != null) {
                            if (newStopSet != null) {
                                final Set<HTMLTag> newStopSet2 = new HashSet<>();
                                newStopSet2.addAll(stopTags);
                                newStopSet2.addAll(newStopSet);
                                newStopSet = newStopSet2;
                            } else {
                                newStopSet = endTagType == ElementInfo.END_ELEMENT_REQUIRED ? null : stopTags;
                            }
                        }
                        ancestors.addFirst(normalTag);
                        try {
                            for (;;) {
                                try {
                                    final int token;
                                    if ((einfo != null) && einfo.isNoScriptElement()) {
                                        final UserAgentContext ucontext = this.ucontext;
                                        if ((ucontext == null) || ucontext.isScriptingEnabled()) {
                                            // Scripting on: NOSCRIPT content is skipped (no text node added).
                                            token = this.parseForEndTag(parent, reader, tag, false, shouldDecodeEntities(einfo));
                                        } else {
                                            token = this.parseToken(element, reader, newStopSet, ancestors);
                                        }
                                    } else {
                                        token = childrenOk
                                                ? this.parseToken(element, reader, newStopSet, ancestors)
                                                : this.parseForEndTag(element, reader, tag, true,
                                                        shouldDecodeEntities(einfo));
                                    }
                                    if (token == TOKEN_END_ELEMENT) {
                                        final String normalLastTag = this.normalLastTag;
                                        if (normalTag.equalsIgnoreCase(normalLastTag)) {
                                            return TOKEN_FULL_ELEMENT;
                                        } else {
                                            final ElementInfo closeTagInfo = HTMLEntities.ELEMENT_INFOS
                                                    .get(HTMLTag.get(normalLastTag.toUpperCase(Locale.ROOT)));
                                            if ((closeTagInfo == null)
                                                    || (closeTagInfo.getEndElementType() != ElementInfo.END_ELEMENT_FORBIDDEN)) {
                                                // If the end tag closes an outer element, unwind to it.
                                                final Iterator<String> i = ancestors.iterator();
                                                if (i.hasNext()) {
                                                    i.next();
                                                    while (i.hasNext()) {
                                                        final String normalAncestorTag = i.next();
                                                        if (normalLastTag.equals(normalAncestorTag)) {
                                                            normalTag = normalLastTag;
                                                            return TOKEN_END_ELEMENT;
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    } else if (token == TOKEN_EOD) {
                                        getNamespaces().clear();
                                        return TOKEN_EOD;
                                    }
                                } catch (final StopException se) {
                                    // newElement does not have a parent.
                                    final Element newElement = se.getElement();
                                    tag = newElement.getTagName();
                                    normalTag = tag.toUpperCase(Locale.ROOT);
                                    if (stopTags != null && stopTags.contains(HTMLTag.get(normalTag))) {
                                        throw se;
                                    }
                                    einfo = HTMLEntities.ELEMENT_INFOS.get(HTMLTag.get(normalTag.toUpperCase(Locale.ROOT)));
                                    endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED
                                            : einfo.getEndElementType();
                                    childrenOk = einfo == null || einfo.isChildElementOk();
                                    newStopSet = einfo == null ? null : einfo.getStopTags();
                                    if (newStopSet == null) {
                                        if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
                                            newStopSet = Collections.singleton(HTMLTag.get(normalTag));
                                        }
                                    }
                                    if (stopTags != null && newStopSet != null) {
                                        final Set<HTMLTag> newStopSet2 = new HashSet<>();
                                        newStopSet2.addAll(stopTags);
                                        newStopSet2.addAll(newStopSet);
                                        newStopSet = newStopSet2;
                                    }
                                    ancestors.removeFirst();
                                    ancestors.addFirst(normalTag);
                                    // Switch element
                                    element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
                                    // newElement should have been suspended.
                                    element = (ElementImpl) newElement;
                                    // Add to parent
                                    safeAppendChild(parent, element);
                                    if (this.justReadEmptyElement) {
                                        return TOKEN_BEGIN_ELEMENT;
                                    }
                                }
                            }
                        } finally {
                            ancestors.removeFirst();
                        }
                    }
                }
                return TOKEN_BEGIN_ELEMENT;
            } finally {
                if (element != null) {
                    element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
                }
            }
        }
    } finally {
        this.normalLastTag = normalTag;
    }
}

/**
 * Splits character data into text, entity-reference and CDATA nodes and
 * appends them to {@code parent}. A DOMException is logged unless it is a
 * HIERARCHY_REQUEST_ERR raised while appending to a document node.
 */
private void appendTextContent(final Node parent, final String text) {
    final Document doc = this.document;
    final StringBuilder ent = new StringBuilder();
    final StringBuilder txt = new StringBuilder();
    final StringBuilder cdata = new StringBuilder();
    boolean inEntity = false;
    boolean inCdata = false;
    try {
        for (int idx = 0; idx < text.length(); idx++) {
            final char ch = text.charAt(idx);
            if (ch == '&') {
                // Flush the text collected so far; an entity reference starts here.
                if (txt.length() > 0) {
                    safeAppendChild(parent, doc.createTextNode(txt.toString()));
                    txt.setLength(0);
                }
                inEntity = true;
            }
            if (ch == '<') {
                inCdata = true;
            }
            if (ch == '>') {
                inCdata = false;
                cdata.append(ch);
                safeAppendChild(parent, doc.createCDATASection(cdata.toString()));
                cdata.setLength(0);
            }
            if (!inEntity && !inCdata) {
                txt.append(ch);
            } else if (inCdata) {
                cdata.append(ch);
            } else {
                ent.append(ch);
            }
            if (ch == ';') {
                // NOTE(review): this also fires for a ';' outside any entity, creating
                // an entity reference with an empty name - confirm that is intended.
                inEntity = false;
                safeAppendChild(parent, doc.createEntityReference(ent.toString()));
                ent.setLength(0);
            }
        }
        if (txt.length() > 0) {
            safeAppendChild(parent, doc.createTextNode(txt.toString()));
            txt.setLength(0);
        }
    } catch (final DOMException de) {
        if ((parent.getNodeType() != Node.DOCUMENT_NODE) || (de.getCode() != DOMException.HIERARCHY_REQUEST_ERR)) {
            log.error("parseToken(): Unable to append child to {} ", parent, de);
        }
    }
}

/** Reads the rest of a {@code <!DOCTYPE ...>} declaration and installs the resulting doctype on the document. */
private void parseDoctypeDeclaration(final LineNumberReader reader, final HTMLDocumentImpl htmlDoc)
        throws IOException {
    final String doctypeStr = this.parseEndOfTag(reader);
    String qName = null;
    String publicId = null;
    String systemId = null;
    if (Strings.containsIgnoreCase(doctypeStr, "public")) {
        final String[] publics = Strings.splitIgnoreCase(doctypeStr, "public");
        final String[] result = publics[1].replace("[", "").split("\"");
        final List<String> list = Arrays.stream(result)
                .filter(s -> Strings.isNotBlank(s) && s.length() > 1)
                .collect(Collectors.toList());
        if (list.size() == 1) {
            publicId = list.get(0);
        }
        if (list.size() == 2) {
            publicId = list.get(0);
            systemId = list.get(1);
        }
        qName = publics[0];
    }
    if (qName == null && Strings.containsIgnoreCase(doctypeStr, "svg")) {
        final String[] publics = Strings.splitIgnoreCase(doctypeStr, "svg");
        qName = publics[0];
        this.document.setXml(true);
    }
    if (qName == null && Strings.containsIgnoreCase(doctypeStr, "html")) {
        qName = "html";
    }
    final DocumentType docType = new DocumentTypeImpl(qName, publicId, systemId);
    docType.setOwnerDocument(htmlDoc);
    htmlDoc.setDoctype(docType);
}

/**
 * Reads the rest of an {@code <!ENTITY ...>} declaration and registers the
 * entity on the document's doctype, when the document has one.
 */
private void parseEntityDeclaration(final LineNumberReader reader, final HTMLDocumentImpl htmlDoc)
        throws IOException {
    String declaration = this.parseEndOfTag(reader);
    // Drop the last character (normally the closing '>'). The string is empty
    // when the tag was already closed, e.g. "<!ENTITY>", which used to throw.
    if (!declaration.isEmpty()) {
        declaration = declaration.substring(0, declaration.length() - 1);
    }
    final String[] sp = declaration.split("\"");
    final EntityReferenceImpl reference;
    if (sp.length == 2) {
        // Form: name "value"
        reference = new EntityReferenceImpl(null, null, sp[0].trim(), sp[1], null);
        if (htmlDoc.getDoctype() != null) {
            htmlDoc.getDoctype().getEntities().setNamedItem(reference);
        }
    } else {
        reference = new EntityReferenceImpl();
    }
    reference.setOwnerDocument(document);
    reference.setParentImpl(document);
    if (sp.length > 2) {
        // Form: name [PUBLIC "publicId"] "systemId" [NDATA notation]
        int index = 0;
        boolean expectPublicId = false;
        boolean expectNotation = false;
        for (final String s : declaration.split("[\"\\s+]")) {
            if (Strings.isNotBlank(s)) {
                if (index == 0) {
                    reference.setNodeName(s.trim());
                } else if (expectPublicId) {
                    reference.setPublicId(s);
                    expectPublicId = false;
                } else if (expectNotation) {
                    reference.setNotationName(s);
                    expectNotation = false;
                } else if (s.equals("PUBLIC")) {
                    expectPublicId = true;
                } else if (s.equals("NDATA")) {
                    expectNotation = true;
                } else {
                    reference.setSystemId(s);
                }
                index++;
            }
        }
        if (htmlDoc.getDoctype() != null) {
            htmlDoc.getDoctype().getEntities().setNamedItem(reference);
        }
    }
}

/**
 * Reads the rest of a {@code <!NOTATION ...>} declaration and registers the
 * notation on the document's doctype, when the document has one.
 */
private void parseNotationDeclaration(final LineNumberReader reader, final HTMLDocumentImpl htmlDoc)
        throws IOException {
    final String notationStr = this.parseEndOfTag(reader);
    final NotationImpl not = new NotationImpl();
    not.setOwnerDocument(document);
    not.setParentImpl(document);
    // NOTE(review): the SYSTEM identifier is stored through setPublicId, exactly
    // as before - confirm whether a system-id setter should be used instead.
    for (final String keyword : new String[] {"PUBLIC", "SYSTEM"}) {
        if (notationStr.contains(keyword)) {
            final String[] parts = notationStr.split(keyword);
            if (parts.length > 0) {
                not.setNodeName(parts[0].trim());
            }
            if (parts.length > 1) {
                final String[] quoted = parts[1].split("\"");
                // An unquoted identifier has no second piece; skip it rather than throw.
                if (quoted.length > 1) {
                    not.setPublicId(quoted[1].trim());
                }
            }
        }
    }
    if (htmlDoc.getDoctype() != null) {
        htmlDoc.getDoctype().getNotations().setNamedItem(not);
    }
}
/**
 * Collects characters up to the next '<'. On return the reader is positioned
 * just past that '<' and the tag-begin flag is set. At end of input all
 * scanner flags are cleared and whatever was collected is returned.
 *
 * @return the text read (possibly empty), or {@code null} only when end of
 *         input was hit before any character was read.
 */
private StringBuilder readUpToTagBegin(final LineNumberReader reader) throws IOException {
    StringBuilder text = null;
    for (int code = reader.read(); code != -1; code = reader.read()) {
        if (code == '<') {
            this.justReadTagBegin = true;
            this.justReadTagEnd = false;
            this.justReadEmptyElement = false;
            return text != null ? text : new StringBuilder(0);
        }
        if (text == null) {
            text = new StringBuilder();
        }
        text.append((char) code);
    }
    this.justReadTagBegin = false;
    this.justReadTagEnd = false;
    this.justReadEmptyElement = false;
    return text;
}
/**
 * Treats everything up to the matching {@code </tagName>} as raw text.
 * CDATA sections are unwrapped; any other markup is kept verbatim in the text.
 *
 * @param parent node that receives the collected text when {@code addTextNode} is set.
 * @param reader source positioned just after the element's start tag.
 * @param tagName name of the element whose end tag terminates the text (compared ignoring case).
 * @param addTextNode whether the collected text is appended to {@code parent}.
 * @param decodeEntities whether entity references in the text are decoded first.
 * @return {@code TOKEN_END_ELEMENT} when the end tag was found, {@code TOKEN_EOD} at end of input.
 * @throws IOException if reading fails.
 */
private int parseForEndTag(final Node parent, final LineNumberReader reader, final String tagName,
                           final boolean addTextNode, final boolean decodeEntities) throws IOException {
    final StringBuilder sb = new StringBuilder();
    int intCh;
    while ((intCh = reader.read()) != -1) {
        char ch = (char) intCh;
        if (ch != '<') {
            sb.append(ch);
            continue;
        }
        intCh = reader.read();
        if (intCh == -1) {
            sb.append('<');
            continue;
        }
        ch = (char) intCh;
        if (ch == '/') {
            final StringBuilder tempBuffer = new StringBuilder();
            boolean closed = false;
            while ((intCh = reader.read()) != -1) {
                ch = (char) intCh;
                if (ch == '>') {
                    closed = true;
                    break;
                }
                tempBuffer.append(ch);
            }
            if (closed) {
                final String thisTag = tempBuffer.toString().trim();
                if (thisTag.equalsIgnoreCase(tagName)) {
                    this.justReadTagBegin = false;
                    this.justReadTagEnd = true;
                    this.justReadEmptyElement = false;
                    this.normalLastTag = thisTag;
                    if (addTextNode) {
                        appendRawText(parent, sb, decodeEntities);
                    }
                    return TOKEN_END_ELEMENT;
                }
            }
            // Some other end tag (or an unterminated one): it is plain content.
            sb.append("</");
            sb.append(tempBuffer);
            if (closed) {
                // Restore the '>' that was consumed; it used to be dropped.
                sb.append('>');
            }
        } else if (ch == '!') {
            final String nextSeven = readN(reader, 7);
            if ("[CDATA[".equals(nextSeven)) {
                readCData(reader, sb);
            } else {
                // Restore the consumed '<' as well; it used to be dropped.
                sb.append("<!");
                if (nextSeven != null) {
                    sb.append(nextSeven);
                }
            }
        } else {
            sb.append('<');
            sb.append(ch);
        }
    }
    this.justReadTagBegin = false;
    this.justReadTagEnd = false;
    this.justReadEmptyElement = false;
    if (addTextNode) {
        appendRawText(parent, sb, decodeEntities);
    }
    return XHtmlParser.TOKEN_EOD;
}

/**
 * Appends the collected raw text to {@code parent} unless it is blank. Text
 * that starts with '&' after trimming becomes an entity reference node.
 */
private void appendRawText(final Node parent, final StringBuilder raw, final boolean decodeEntities) {
    final String text = (decodeEntities ? entityDecode(raw) : raw).toString();
    if (Strings.isNotBlank(text.trim())) {
        final Node textNode = text.trim().startsWith("&")
                ? this.document.createEntityReference(text)
                : this.document.createTextNode(text);
        safeAppendChild(parent, textNode);
    }
}
/**
 * Copies characters into {@code sb} until the CDATA terminator {@code ]]>}
 * has been consumed or the input ends. The terminator itself is not copied.
 * The terminator is recognised at any position, including right after other
 * ']' characters (e.g. {@code x[0]]]>} yields {@code x[0]}). When the input
 * ends first, everything read is kept.
 *
 * @param reader source positioned just after the CDATA opener.
 * @param sb receives the section content.
 * @throws IOException if reading fails.
 */
private static void readCData(final LineNumberReader reader, final StringBuilder sb) throws IOException {
    // Number of consecutive ']' read but not yet known to be content.
    int pendingBrackets = 0;
    for (int next = reader.read(); next >= 0; next = reader.read()) {
        final char ch = (char) next;
        if (ch == ']') {
            pendingBrackets++;
        } else if (ch == '>' && pendingBrackets >= 2) {
            // The last two brackets belong to the terminator; earlier ones are content.
            for (int k = 2; k < pendingBrackets; k++) {
                sb.append(']');
            }
            return;
        } else {
            for (int k = 0; k < pendingBrackets; k++) {
                sb.append(']');
            }
            pendingBrackets = 0;
            sb.append(ch);
        }
    }
    // Unterminated section: keep the trailing brackets instead of losing them.
    for (int k = 0; k < pendingBrackets; k++) {
        sb.append(']');
    }
}
/**
 * Reads up to {@code n} characters, stopping early at end of input or when
 * the reader fails.
 *
 * @return the characters read, or {@code null} when none could be read.
 */
private static String readN(final LineNumberReader reader, final int n) {
    final StringBuilder collected = new StringBuilder(n);
    try {
        while (collected.length() < n) {
            final int code = reader.read();
            if (code < 0) {
                break;
            }
            collected.append((char) code);
        }
    } catch (final IOException e) {
        // A read failure ends the attempt; whatever was read so far is returned.
    }
    return collected.length() == 0 ? null : collected.toString();
}
/**
* Reads a tag name. The reader offset should be just past the opening '<'.
* Returns the name including any leading '!', '/' or '?' marker ("!--" for a
* comment opener). Stray "<" runs and "< " sequences met before a real name
* are appended to {@code parent} as text, and a "<![" sequence is appended as
* a CDATA section. The scanner flags are updated to tell the caller whether
* the closing '>' has already been consumed.
*
* @param parent node that receives stray text / CDATA found before the name.
* @param reader source positioned just after '<'.
* @return the tag name read so far; empty when the input ended first.
*/
private String readTag(final Node parent, final LineNumberReader reader) throws IOException {
final StringBuilder sb = new StringBuilder();
int chInt;
chInt = reader.read();
if (chInt != -1) {
// cont == false means the name is already complete (or input ended) and
// the name-accumulating loop below must be skipped.
boolean cont = true;
char ch;
// First loop: classify the character(s) that follow '<'.
for (; ; ) {
ch = (char) chInt;
if (Character.isLetter(ch)) {
// Speed up normal case
break;
} else if (ch == '!') {
sb.append('!');
chInt = reader.read();
if (chInt != -1) {
ch = (char) chInt;
if (ch == '-') {
sb.append('-');
chInt = reader.read();
if (chInt != -1) {
ch = (char) chInt;
if (ch == '-') {
// "!--": comment opener, returned as the complete tag name.
sb.append('-');
cont = false;
}
} else {
cont = false;
}
} else{
if (ch == '[') {
// "<![": read through the CDATA terminator and keep the raw markup.
final StringBuilder ltText = new StringBuilder();
readCData(reader, ltText);
parent.appendChild(document.createCDATASection("<![" + ltText + "]]"));
}
}
} else {
cont = false;
}
} else if (ch == '/') {
// End tag: keep the '/' marker and move on to the first name character.
sb.append(ch);
chInt = reader.read();
if (chInt != -1) {
ch = (char) chInt;
} else {
cont = false;
}
} else if (ch == '<') {
// A run of '<' characters is literal text, not a tag.
final StringBuilder ltText = new StringBuilder(3);
ltText.append('<');
while ((chInt = reader.read()) == '<') {
ltText.append('<');
}
final String text = ltText.toString();
final Node textNode = text.trim().startsWith("&") ? this.document.createEntityReference(text) : this.document.createTextNode(text);
try {
parent.appendChild(textNode);
} catch (final DOMException de) {
if ((parent.getNodeType() != Node.DOCUMENT_NODE)
|| (de.getCode() != DOMException.HIERARCHY_REQUEST_ERR)) {
log.error("parseToken(): Unable to append child to {} ", parent, de);
}
}
if (chInt == -1) {
cont = false;
} else {
// Re-classify the first character that was not '<'.
continue;
}
} else if (Character.isWhitespace(ch)) {
// '<' followed by whitespace is literal text up to the next '<'.
final StringBuilder ltText = new StringBuilder();
ltText.append('<');
ltText.append(ch);
while ((chInt = reader.read()) != -1) {
ch = (char) chInt;
if (ch == '<') {
chInt = reader.read();
break;
}
ltText.append(ch);
}
final String text = ltText.toString();
final Node textNode = text.trim().startsWith("&") ? this.document.createEntityReference(text) : this.document.createTextNode(text);
try {
parent.appendChild(textNode);
} catch (final DOMException de) {
if ((parent.getNodeType() != Node.DOCUMENT_NODE)
|| (de.getCode() != DOMException.HIERARCHY_REQUEST_ERR)) {
log.error("parseToken(): Unable to append child to {} ", parent, de);
}
}
if (chInt == -1) {
cont = false;
} else {
// Re-classify the character that follows the new '<'.
continue;
}
}
break;
}
if (cont) {
// Second loop: accumulate the name until whitespace or '>'.
boolean lastCharSlash = false;
for (;;) {
if (Character.isWhitespace(ch)) {
break;
} else if (ch == '>') {
// Tag closed with no attributes; a '/' right before '>' marks an empty element.
this.justReadTagEnd = true;
this.justReadTagBegin = false;
this.justReadEmptyElement = lastCharSlash;
return sb.toString();
} else if (ch == '/') {
lastCharSlash = true;
} else {
// A '/' that was not followed by '>' is part of the name.
if (lastCharSlash) {
sb.append('/');
}
lastCharSlash = false;
sb.append(ch);
}
chInt = reader.read();
if (chInt == -1) {
break;
}
ch = (char) chInt;
}
}
}
// A name was read but the tag is still open: attributes or '>' are yet to come.
if (sb.length() > 0) {
this.justReadTagEnd = false;
this.justReadTagBegin = false;
this.justReadEmptyElement = false;
}
return sb.toString();
}
/**
* Reads a comment body up to and including its closing "-->" and returns the
* body without the terminator. Extra dashes or whitespace between "--" and
* '>' are tolerated and discarded. At end of input the text collected so far
* is returned.
*
* @param reader source positioned just after the opening "<!--".
* @return the comment text; empty when the tag end had already been consumed.
*/
private StringBuilder passEndOfComment(final LineNumberReader reader) throws IOException {
if (this.justReadTagEnd) {
return new StringBuilder(0);
}
final StringBuilder sb = new StringBuilder();
OUTER: for (;;) {
int chInt = reader.read();
if (chInt == -1) {
break;
}
char ch = (char) chInt;
if (ch == '-') {
chInt = reader.read();
if (chInt == -1) {
sb.append(ch);
break;
}
ch = (char) chInt;
if (ch == '-') {
// Seen "--": look for '>' while remembering what sits in between.
// extra stays null until a dash or whitespace follows the "--".
StringBuilder extra = null;
for (; ; ) {
chInt = reader.read();
if (chInt == -1) {
if (extra != null) {
sb.append(extra);
}
break OUTER;
}
ch = (char) chInt;
if (ch == '>') {
// End of comment; anything held in extra is discarded.
this.justReadTagBegin = false;
this.justReadTagEnd = true;
return sb;
} else if (ch == '-') {
// Allow any number of dashes at the end
if (extra == null) {
extra = new StringBuilder();
extra.append("--");
}
extra.append("-");
} else if (Character.isWhitespace(ch)) {
if (extra == null) {
extra = new StringBuilder();
extra.append("--");
}
extra.append(ch);
} else {
// Not a terminator after all: the held characters are comment text.
// NOTE(review): when "--" is followed directly by an ordinary character,
// extra is still null, so the two dashes are not copied - confirm intended.
if (extra != null) {
sb.append(extra);
}
sb.append(ch);
break;
}
}
} else {
sb.append('-');
sb.append(ch);
}
} else {
sb.append(ch);
}
}
// Input ended inside the comment.
if (sb.length() > 0) {
this.justReadTagBegin = false;
this.justReadTagEnd = false;
}
return sb;
}
/**
 * Reads the remainder of the current tag and returns it, including the
 * terminating '>' or '[' when one is found. Returns an empty string without
 * reading when the tag end has already been consumed.
 */
private String parseEndOfTag(final Reader reader) throws IOException {
    if (this.justReadTagEnd) {
        return "";
    }
    final StringBuilder result = new StringBuilder();
    int code;
    while ((code = reader.read()) != -1) {
        final char ch = (char) code;
        result.append(ch);
        if (ch == '>' || ch == '[') {
            this.justReadTagEnd = true;
            this.justReadTagBegin = false;
            return result.toString();
        }
    }
    // Input ended before a terminator; flags change only if something was read.
    if (result.length() > 0) {
        this.justReadTagBegin = false;
        this.justReadTagEnd = false;
    }
    return result.toString();
}
/**
 * Skips the remainder of the current tag, through its closing '>'. Does
 * nothing when the tag end has already been consumed.
 */
private void passEndOfTag(final Reader reader) throws IOException {
    if (this.justReadTagEnd) {
        return;
    }
    int code = reader.read();
    if (code == -1) {
        // Nothing left to read; the flags stay as they are.
        return;
    }
    while (code != -1) {
        if (code == '>') {
            this.justReadTagEnd = true;
            this.justReadTagBegin = false;
            return;
        }
        code = reader.read();
    }
    // Input ended before '>' was found.
    this.justReadTagBegin = false;
    this.justReadTagEnd = false;
}
/**
 * Reads processing-instruction data up to, but not including, the next '>'.
 * Returns an empty builder without reading when the tag end has already been
 * consumed.
 */
private StringBuilder readProcessingInstruction(final LineNumberReader reader) throws IOException {
    final StringBuilder pidata = new StringBuilder();
    if (this.justReadTagEnd) {
        return pidata;
    }
    int ch = reader.read();
    while (ch != -1 && ch != '>') {
        pidata.append((char) ch);
        ch = reader.read();
    }
    this.justReadTagBegin = false;
    // The tag end was consumed only if the loop stopped on '>' rather than on end of input.
    this.justReadTagEnd = ch != -1;
    return pidata;
}
/**
* Reads the next attribute of the current tag and adds it to {@code attributes}.
* Attribute names that appear without a value are added with the name as
* their value. Values are entity-decoded before being stored.
*
* @param reader source positioned inside a start tag, after the name or a previous attribute.
* @param attributes list that receives the attributes read.
* @return {@code true} when an attribute was read and the tag is still open;
*         {@code false} when the closing '>' was consumed or the input ended.
*/
private boolean readAttribute(final LineNumberReader reader, final List<AttributeInfo> attributes)
throws IOException {
if (this.justReadTagEnd) {
return false;
}
// Read attribute name up to '=' character.
// May read several attribute names without explicit values.
// NOTE(review): Strings.isStringBuilderNotBlack presumably means "not null and
// not blank" - confirm against the helper.
StringBuilder attributeName = null;
boolean blankFound = false;
boolean lastCharSlash = false;
for (;;) {
final int chInt = reader.read();
if (chInt == -1) {
// Input ended inside the tag: keep a pending value-less attribute.
if (Strings.isStringBuilderNotBlack(attributeName)) {
final String attributeNameStr = attributeName.toString();
attributes.add(new AttributeInfo(attributeNameStr, attributeNameStr));
attributeName.setLength(0);
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
this.justReadEmptyElement = false;
return false;
}
final char ch = (char) chInt;
if (ch == '=') {
lastCharSlash = false;
blankFound = false;
break;
} else if (ch == '>') {
// Tag closed; a '/' right before '>' marks an empty element.
if (Strings.isStringBuilderNotBlack(attributeName)) {
final String attributeNameStr = attributeName.toString();
attributes.add(new AttributeInfo(attributeNameStr, attributeNameStr));
}
this.justReadTagBegin = false;
this.justReadTagEnd = true;
this.justReadEmptyElement = lastCharSlash;
return false;
} else if (ch == '/') {
// '/' separates names just like whitespace does.
blankFound = true;
lastCharSlash = true;
} else if (Character.isWhitespace(ch)) {
lastCharSlash = false;
blankFound = true;
} else {
lastCharSlash = false;
if (blankFound) {
// A new name starts after a blank: the previous one had no value.
blankFound = false;
if (Strings.isStringBuilderNotBlack(attributeName)) {
final String attributeNameStr = attributeName.toString();
attributes.add(new AttributeInfo(attributeNameStr, attributeNameStr));
attributeName.setLength(0);
}
}
if (attributeName == null) {
attributeName = new StringBuilder(6);
}
attributeName.append(ch);
}
}
// Read blanks up to open quote or first non-blank.
StringBuilder attributeValue = null;
// The quote character that opened the value, or -1 for an unquoted value.
int openQuote = -1;
for (;;) {
final int chInt = reader.read();
if (chInt == -1) {
break;
}
final char ch = (char) chInt;
if (ch == '>') {
// '=' with no value before the tag closed: the name is used as the value.
if (Strings.isStringBuilderNotBlack(attributeName)) {
final String attributeNameStr = attributeName.toString();
attributes.add(new AttributeInfo(attributeNameStr, attributeNameStr));
}
this.justReadTagBegin = false;
this.justReadTagEnd = true;
this.justReadEmptyElement = lastCharSlash;
return false;
} else if (ch == '/') {
lastCharSlash = true;
} else if (Character.isWhitespace(ch)) {
lastCharSlash = false;
} else {
if (ch == '"') {
openQuote = '"';
} else if (ch == '\'') {
openQuote = '\'';
} else {
// Unquoted value: this character is already part of it.
openQuote = -1;
attributeValue = new StringBuilder(6);
if (lastCharSlash) {
attributeValue.append('/');
}
attributeValue.append(ch);
}
lastCharSlash = false;
break;
}
}
// Read attribute value
for (;;) {
final int chInt = reader.read();
if (chInt == -1) {
break;
}
final char ch = (char) chInt;
if (ch == openQuote) {
// Closing quote: the attribute is complete and the tag is still open.
lastCharSlash = false;
if (attributeName != null) {
final String attributeNameStr = attributeName.toString();
attributes.add(new AttributeInfo(attributeNameStr, attributeValue == null ? "" : entityDecode(attributeValue).toString()));
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
return true;
} else if (openQuote == -1 && ch == '>') {
// Unquoted value ended by the end of the tag.
if (attributeName != null) {
final String attributeNameStr = attributeName.toString();
attributes.add(new AttributeInfo(attributeNameStr, attributeValue == null ? "" : entityDecode(attributeValue).toString()));
}
this.justReadTagBegin = false;
this.justReadTagEnd = true;
this.justReadEmptyElement = lastCharSlash;
return false;
} else if (openQuote == -1 && Character.isWhitespace(ch)) {
// Unquoted value ended by whitespace; more attributes may follow.
lastCharSlash = false;
if (attributeName != null) {
final String attributeNameStr = attributeName.toString();
attributes.add(new AttributeInfo(attributeNameStr, attributeValue == null ? "" : entityDecode(attributeValue).toString()));
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
return true;
} else {
if (attributeValue == null) {
attributeValue = new StringBuilder(6);
}
if (lastCharSlash) {
attributeValue.append('/');
}
lastCharSlash = false;
attributeValue.append(ch);
}
}
// Input ended while reading the value: store what was collected.
this.justReadTagBegin = false;
this.justReadTagEnd = false;
if (attributeName != null) {
final String attributeNameStr = attributeName.toString();
attributes.add(new AttributeInfo(attributeNameStr, attributeValue == null ? "" : entityDecode(attributeValue).toString()));
}
return false;
}
/**
 * Tells whether {@code node} or any of its ancestors has the given node
 * name, compared ignoring case.
 */
private static boolean hasAncestorTag(final Node node, final String tag) {
    for (Node current = node; current != null; current = current.getParentNode()) {
        if (tag.equalsIgnoreCase(current.getNodeName())) {
            return true;
        }
    }
    return false;
}
/**
 * Appends {@code child} to {@code parent}. When a root is required, an HTML
 * child is remembered as the root, and an element arriving near the top of
 * the tree outside any HTML element is redirected under the (possibly newly
 * created) root element.
 */
private void safeAppendChild(final Node parent, final Node child) {
    Node target = parent;
    if (needRoot) {
        if ("HTML".equalsIgnoreCase(child.getNodeName())) {
            lastRootElement = child;
        } else {
            final boolean needsRootParent = (child instanceof Element)
                    && depthAtMost(parent, 1)
                    && !hasAncestorTag(parent, "HTML");
            if (needsRootParent) {
                ensureRootElement(parent);
                target = lastRootElement;
            }
        }
    }
    ensureBodyAppendChild(target, child);
}
/** Creates an HTML element under {@code parent} unless a root element is already known. */
private void ensureRootElement(final Node parent) {
    if (lastRootElement != null) {
        return;
    }
    lastRootElement = document.createElement("HTML");
    parent.appendChild(lastRootElement);
}
/**
 * Tells whether {@code n} has fewer than {@code maxDepth} ancestors, i.e.
 * whether walking up at most {@code maxDepth} parents runs off the top of
 * the tree. Always {@code false} when {@code maxDepth} is not positive.
 */
private static boolean depthAtMost(final Node n, final int maxDepth) {
    Node current = n;
    for (int remaining = maxDepth; remaining > 0; remaining--) {
        current = current.getParentNode();
        if (current == null) {
            return true;
        }
    }
    return false;
}
/**
 * Appends {@code child} to {@code parent} (when the parent is not null).
 * When a root is required, BODY and HEAD children are remembered first.
 */
private void ensureBodyAppendChild(final Node parent, final Node child) {
    if (needRoot) {
        switch (child.getNodeName().toUpperCase()) {
            case "BODY":
                lastBodyElement = child;
                break;
            case "HEAD":
                lastHeadElement = child;
                break;
            default:
                break;
        }
    }
    if (parent != null) {
        parent.appendChild(child);
    }
}
/** Creates a BODY element under {@code parent} unless a body element is already known. */
private void ensureBodyElement(final Node parent) {
    if (lastBodyElement != null) {
        return;
    }
    lastBodyElement = document.createElement("BODY");
    parent.appendChild(lastBodyElement);
}
/** Creates a HEAD element under {@code parent} unless a head element is already known. */
private void ensureHeadElement(final Node parent) {
    if (lastHeadElement != null) {
        return;
    }
    lastHeadElement = document.createElement("HEAD");
    parent.appendChild(lastHeadElement);
}
/** Entities are decoded unless the element info is present and says otherwise. */
private boolean shouldDecodeEntities(final ElementInfo einfo) {
    if (einfo == null) {
        return true;
    }
    return einfo.isDecodeEntities();
}
/**
 * Replaces entity references ({@code &name;}, {@code &#NN;}, {@code &#xHH;})
 * with the characters they denote.
 * <ul>
 * <li>A '&' with no ';' after it is copied as is.</li>
 * <li>An unknown named entity is replaced by its bare name.</li>
 * <li>An unparsable numeric reference is logged and yields U+0000.</li>
 * <li>Numeric references above U+FFFF are emitted as a surrogate pair.</li>
 * </ul>
 *
 * @param rawText the text to decode; not modified.
 * @return {@code rawText} itself when it contains no '&', otherwise a new builder.
 */
private static StringBuilder entityDecode(final StringBuilder rawText) {
    int startIdx = 0;
    StringBuilder sb = null;
    for (;;) {
        final int ampIdx = rawText.indexOf("&", startIdx);
        if (ampIdx == -1) {
            if (sb == null) {
                return rawText;
            }
            sb.append(rawText, startIdx, rawText.length());
            return sb;
        }
        if (sb == null) {
            sb = new StringBuilder();
        }
        sb.append(rawText, startIdx, ampIdx);
        final int colonIdx = rawText.indexOf(";", ampIdx);
        if (colonIdx == -1) {
            sb.append('&');
            startIdx = ampIdx + 1;
            continue;
        }
        final String spec = rawText.substring(ampIdx + 1, colonIdx);
        if (spec.startsWith("#")) {
            // Locale.ROOT so the "x" prefix test does not depend on the default locale.
            final String number = spec.substring(1).toLowerCase(Locale.ROOT);
            int codePoint;
            try {
                if (number.startsWith("x")) {
                    codePoint = Integer.parseInt(number.substring(1), 16);
                } else {
                    codePoint = Integer.parseInt(number);
                }
            } catch (final NumberFormatException nfe) {
                log.warn("entityDecode() ", nfe);
                codePoint = 0;
            }
            if (Character.isSupplementaryCodePoint(codePoint)) {
                // A plain (char) cast would truncate e.g. &#128512; to a wrong character.
                sb.appendCodePoint(codePoint);
            } else {
                sb.append((char) codePoint);
            }
        } else {
            final int chInt = getEntityChar(spec);
            if (chInt == -1) {
                sb.append(spec);
            } else {
                sb.append((char) chInt);
            }
        }
        startIdx = colonIdx + 1;
    }
}
/**
 * Looks up the character for a named entity, first with the name as given
 * and then lower-cased.
 *
 * @param spec the entity name without '&' and ';'.
 * @return the character code, or -1 when the entity is unknown.
 */
private static int getEntityChar(final String spec) {
    Character c = HTMLEntities.ENTITIES.get(Entities.get(spec));
    if (c == null) {
        // Locale.ROOT: under e.g. a Turkish default locale "I" would lower-case
        // to a dotless 'ı' and the fallback lookup would miss.
        c = HTMLEntities.ENTITIES.get(Entities.get(spec.toLowerCase(Locale.ROOT)));
    }
    if (c == null) {
        return -1;
    }
    return c;
}
/**
 * Sets an attribute on the element. For XML documents a prefixed attribute
 * is resolved against the known namespaces (registering the attribute value
 * under the prefix when it is new) and set namespace-aware when a non-blank
 * namespace results. Otherwise the attribute is set plainly.
 */
private void setAttributeNode(final ElementImpl element, final String attributeName, final String attributeValue) {
    if (!this.document.isXml()) {
        element.setAttribute(attributeName, attributeValue);
        return;
    }
    String namespaceURI = null;
    if (attributeName.contains(":")) {
        // For "xmlns:prefix" the key is the part after the colon, otherwise the prefix.
        final int part = attributeName.contains("xmlns") ? 1 : 0;
        final String key = attributeName.split(":")[part];
        final Map<String, String> known = getNamespaces();
        final String registered = known.get(key);
        if (registered == null) {
            known.put(key, attributeValue);
            namespaceURI = attributeValue;
        } else {
            namespaceURI = registered;
        }
    }
    if (Strings.isNotBlank(namespaceURI)) {
        element.setAttributeNS(namespaceURI, attributeName, attributeValue);
    } else {
        element.setAttribute(attributeName, attributeValue);
    }
}
}