stage3systems/node-html5-purifier

View on GitHub
lib/html_parser/index.js

Summary

Maintainability
A
1 hr
Test Coverage
'use strict';
var async = require('async');
var attributeValidator = require('./attribute_validator');
var namespacer = require('./namespacer');
var pcdataValidator = require('./pcdata_validator');
var sanitizer = require('./sanitizer');
var tagValidator = require('./tag_validator');

/**
 * HTML Purifier HTML Parser
 *
 * @impl parse
 */

/**
 * Facade over the purification pipeline. The provided htmlInput is passed
 * through the pcdata filter, the sanitizer, the tag whitelist and the attribute
 * whitelist, in that order. When a prefix or postfix is supplied, id and class
 * attributes are then namespaced with them; when both are undefined the
 * namespacing step is skipped.
 *
 * @cb err, string
 * @pattern facade
 */
function parse(htmlInput, prefix, postfix, cb) {
  // stage 1: drop invalid pcdata, ie) <![if mso ...]>
  function filterPcdata(next) {
    pcdataValidator.filter(htmlInput, next);
  }

  // stage 2: caja sanitizes, tag balances, and converts html entities
  function sanitizeHtml(html, next) {
    sanitizer.sanitize(html, next);
  }

  // stage 3: drop every tag that is not on the tags whitelist
  function validateTags(html, next) {
    tagValidator.validate(html, next);
  }

  // stage 4: drop every attribute that is not on the attributes whitelist
  function validateAttributes(html, next) {
    attributeValidator.validate(html, next);
  }

  // stage 5: namespace id and class attributes, only if a namespace was given
  function namespaceIdsAndClasses(html, next) {
    var hasNamespace = typeof prefix !== 'undefined' || typeof postfix !== 'undefined';
    if (!hasNamespace) {
      return next(null, html);
    }
    namespacer.namespace(html, prefix, postfix, next);
  }

  async.waterfall([
    filterPcdata,
    sanitizeHtml,
    validateTags,
    validateAttributes,
    namespaceIdsAndClasses,
  ], cb);
}

/**
 * Runs the provided htmlInput through the same validation stages as parse
 * (pcdata filter, sanitizer, tag whitelist, attribute whitelist). When a prefix
 * or postfix is supplied, the namespace is then stripped from id and class
 * attributes repeatedly, until a pass reports that nothing was replaced. When
 * both are undefined the stripping step is skipped.
 *
 * Any error reported by a stage, including namespacer.stripNamespace, is
 * forwarded to cb and stops the pipeline.
 *
 * @cb err, string
 */
function parsePurified(htmlInput, prefix, postfix, cb) {
  async.waterfall([
    // remove invalid pcdata, ie) <![if mso ...]>
    function(cb) {
      pcdataValidator.filter(htmlInput, cb);
    },
    // sanitize - caja sanitizes, tag balances, and converts html entities
    function(pcdataValidated, cb) {
      sanitizer.sanitize(pcdataValidated, cb);
    },
    // remove all tags that are not on the tags whitelist
    function(sanitizedHtml, cb) {
      tagValidator.validate(sanitizedHtml, cb);
    },
    // remove all attributes that are not on the attributes whitelist
    function(tagValidatedHtml, cb) {
      attributeValidator.validate(tagValidatedHtml, cb);
    },
    // strip prefix and postfix from id and class attributes recursively
    function(attributeSanitizedHtml, cb) {
      if (typeof prefix === 'undefined' && typeof postfix === 'undefined') {
        return cb(null, attributeSanitizedHtml);
      }

      // a single pass may leave nested namespaces behind, so keep stripping
      // until stripNamespace reports that it replaced nothing
      var isReplaced = true;
      async.whilst(function() {
        return isReplaced;
      }, function(next) {
        namespacer.stripNamespace(attributeSanitizedHtml, prefix, postfix, function(err, revertedHtml, replaced) {
          // propagate the failure instead of silently ending the loop with
          // an undefined result
          if (err) return next(err);
          attributeSanitizedHtml = revertedHtml;
          isReplaced = replaced;
          next();
        });
      }, function(err) {
        if (err) return cb(err);
        return cb(null, attributeSanitizedHtml);
      });
    },
  ], cb);
}


// Public API: both entry points report their result through an (err, string)
// callback.
exports.parse = parse;
exports.parsePurified = parsePurified;