NaturalIntelligence/fast-xml-parser

View on GitHub
src/v5/Xml2JsParser.js

Summary

Maintainability
B
5 hrs
Test Coverage
const StringSource = require("./inputSource/StringSource");
const BufferSource = require("./inputSource/BufferSource");
const {readTagExp,readClosingTagName} = require("./XmlPartReader");
const {readComment, readCdata,readDocType,readPiTag} = require("./XmlSpecialTagsReader");
const TagPath = require("./TagPath");
const TagPathMatcher = require("./TagPathMatcher");
const EntitiesParser = require('./EntitiesParser');

//To hold the data of current tag
//This is usually used to compare jpath expression against current tag
class TagDetail{
  constructor(name){
    this.name = name;
    this.position = 0;
    // this.attributes = {};
  }
}

class Xml2JsParser {
    constructor(options) {
      this.options = options;
      
      this.currentTagDetail = null;
      this.tagTextData = "";
      this.tagsStack = [];
      this.entityParser = new EntitiesParser(options.htmlEntities);
      this.stopNodes = [];
      for (let i = 0; i < this.options.stopNodes.length; i++) {
        this.stopNodes.push(new TagPath(this.options.stopNodes[i]));
      }
    }

    parse(strData) {
        this.source = new StringSource(strData);
        this.parseXml();
        return this.outputBuilder.getOutput();
    }
    parseBytesArr(data) {
        this.source = new BufferSource(data );
        this.parseXml();
        return this.outputBuilder.getOutput();
    }
  
    parseXml() {
      //TODO: Separate TagValueParser as separate class. So no scope issue in node builder class 

      //OutputBuilder should be set in XML Parser
      this.outputBuilder = this.options.OutputBuilder.getInstance(this.options);
      this.root = { root: true};
      this.currentTagDetail = this.root;

      while(this.source.canRead()){
        let ch = this.source.readCh();
        if (ch === "") break;
        
        if(ch === "<"){//tagStart
          let nextChar = this.source.readChAt(0);
          if (nextChar === "" ) throw new Error("Unexpected end of source");
          
        
          if(nextChar === "!" || nextChar === "?"){
            this.source.updateBufferBoundary();
            //previously collected text should be added to current node
            this.addTextNode(); 
            
            this.readSpecialTag(nextChar);// Read DOCTYPE, comment, CDATA, PI tag
          }else if(nextChar === "/"){
            this.source.updateBufferBoundary();
            this.readClosingTag();
            // console.log(this.source.buffer.length, this.source.readable);
            // console.log(this.tagsStack.length);
          }else{//opening tag
            this.readOpeningTag();
          }
        }else{
          this.tagTextData += ch;
        }
      }//End While loop
      if(this.tagsStack.length > 0 || ( this.tagTextData !== "undefined" && this.tagTextData.trimEnd().length > 0) ) throw new Error("Unexpected data in the end of document");
    }
  
    /**
     * read closing paired tag. Set parent tag in scope.
     * skip a node on user's choice
     */
    readClosingTag(){
      const tagName = this.processTagName(readClosingTagName(this.source));
      // console.log(tagName, this.tagsStack.length);
      this.validateClosingTag(tagName);
      // All the text data collected, belongs to current tag.
      if(!this.currentTagDetail.root) this.addTextNode();
      this.outputBuilder.closeTag();
      // Since the tag is closed now, parent tag comes in scope
      this.currentTagDetail = this.tagsStack.pop(); 
    }

    validateClosingTag(tagName){
      // This can't be unpaired tag, or a stop tag.
      if(this.isUnpaired(tagName) || this.isStopNode(tagName)) throw new Error(`Unexpected closing tag '${tagName}'`);
      // This must match with last opening tag
      else if(tagName !== this.currentTagDetail.name) 
        throw new Error(`Unexpected closing tag '${tagName}' expecting '${this.currentTagDetail.name}'`)
    }

    /**
     * Read paired, unpaired, self-closing, stop and special tags.
     * Create a new node
     * Push paired tag in stack.
     */
    readOpeningTag(){
      //save previously collected text data to current node
      this.addTextNode();

      //create new tag
      let tagExp = readTagExp(this, ">" );
      
      // process and skip from tagsStack For unpaired tag, self closing tag, and stop node
      const tagDetail = new TagDetail(tagExp.tagName);
      if(this.isUnpaired(tagExp.tagName)) {
        //TODO: this will lead 2 extra stack operation
        this.outputBuilder.addTag(tagDetail);
        this.outputBuilder.closeTag();
      } else if(tagExp.selfClosing){
        this.outputBuilder.addTag(tagDetail);
        this.outputBuilder.closeTag();
      } else if(this.isStopNode(this.currentTagDetail)){
        // TODO: let's user set a stop node boundary detector for complex contents like script tag
        //TODO: pass tag name only to avoid string operations
        const content = source.readUptoCloseTag(`</${tagExp.tagName}`);
        this.outputBuilder.addTag(tagDetail);
        this.outputBuilder.addValue(content);
        this.outputBuilder.closeTag();
      }else{//paired tag
        //set new nested tag in scope.
        this.tagsStack.push(this.currentTagDetail);
        this.outputBuilder.addTag(tagDetail);
        this.currentTagDetail = tagDetail;
      }
      // console.log(tagExp.tagName,this.tagsStack.length);
      // this.options.onClose()

    }

    readSpecialTag(startCh){
      if(startCh == "!"){
        let nextChar = this.source.readCh();
        if (nextChar === null || nextChar === undefined) throw new Error("Unexpected ending of the source");
        
        if(nextChar === "-"){//comment
          readComment(this);
        }else if(nextChar === "["){//CDATA
          readCdata(this);
        }else if(nextChar === "D"){//DOCTYPE
          readDocType(this);
        }
      }else if(startCh === "?"){
        readPiTag(this);
      }else{
        throw new Error(`Invalid tag '<${startCh}' at ${this.source.line}:${this.source.col}`)
      }
    }
    addTextNode = function() {
      // if(this.currentTagDetail){
        //save text as child node
        // if(this.currentTagDetail.tagname !== '!xml')
        if (this.tagTextData !== undefined && this.tagTextData !== "") { //store previously collected data as textNode
          if(this.tagTextData.trim().length > 0){
            //TODO: shift parsing to output builder

            this.outputBuilder.addValue(this.replaceEntities(this.tagTextData));
          }
          this.tagTextData = "";
        }
      // }
    }

    processAttrName(name){
      if(name === "__proto__") name  = "#__proto__";
      name = resolveNameSpace(name, this.removeNSPrefix);
      return name;
    }
    
    processTagName(name){
      if(name === "__proto__") name  = "#__proto__";
      name = resolveNameSpace(name, this.removeNSPrefix);
      return name;
    }

    /**
     * Generate tags path from tagsStack
     */
    tagsPath(tagName){
      //TODO: return TagPath Object. User can call match method with path
      return "";
    }

    isUnpaired(tagName){
      return this.options.tags.unpaired.indexOf(tagName) !== -1;
    }

    /**
     * valid expressions are 
     * tag nested
     * * nested
     * tag nested[attribute]
     * tag nested[attribute=""]
     * tag nested[attribute!=""]
     * tag nested:0 //for future
     * @param {string} tagName 
     * @returns 
     */
    isStopNode(node){
      for (let i = 0; i < this.stopNodes.length; i++) {
        const givenPath = this.stopNodes[i];
        if(givenPath.match(this.tagsStack, node)) return true;
      }
      return false 
    }

    replaceEntities(text){
      //TODO: if option is set then replace entities
      return this.entityParser.parse(text)
    }
}

function resolveNameSpace(name, removeNSPrefix) {
  if (removeNSPrefix) {
    const parts = name.split(':');
    if(parts.length === 2){
      if (parts[0] === 'xmlns') return '';
      else return parts[1];
    }else reportError(`Multiple namespaces ${name}`)
  }
  return name;
}

module.exports = Xml2JsParser;