spicycms/spicy.core

View on GitHub
src/spicy/core/siteskin/html.py

Summary

Maintainability
C
1 day
Test Coverage
# -*- coding: utf-8 -*-
import html5lib
import re
from xml.sax.saxutils import escape, unescape
from html5lib import sanitizer, treebuilders, treewalkers, serializer


# Compatibility shim so the module works with both html5lib v0.11 and
# v0.90: use the library's own ``tokenTypes`` mapping when it can be
# imported, otherwise build a stand-in.
try:
    from html5lib.constants import tokenTypes
except ImportError:
    # html5lib v0.11 fallback: every token type name maps to itself, so
    # ``tokenTypes["StartTag"]`` style lookups keep working.
    tokenTypes = dict(
        (name, name) for name in (
            "StartTag", "EndTag", "EmptyTag",
            "selfClosing", 'Comment', 'Characters'))

# Default value of ``TokenSanitazer.allowed_classes`` (empty: no tag has
# any whitelisted CSS class).
SAFE_CLASSES = {}


class TokenSanitazer(sanitizer.HTMLSanitizer):
    """html5lib token sanitizer limited to HTML tags, with class filtering.

    Differences from the base ``HTMLSanitizer`` visible in this class:

    * only the HTML element/attribute whitelists are used (no SVG/MathML);
    * tags outside ``allowed_elements`` are dropped unless
      ``escape_invalid_tags`` is true, in which case they are turned into
      text;
    * comment tokens are dropped;
    * ``class`` attribute values are filtered through ``allowed_classes``.

    Every name listed in ``options`` may be passed as a keyword argument
    to the constructor to override the class-level default on the
    instance.
    """
    escape_invalid_tags = False

    # only html (not SVG or MathML) elements and attributes
    allowed_elements = sanitizer.HTMLSanitizer.acceptable_elements
    allowed_attributes = sanitizer.HTMLSanitizer.acceptable_attributes
    # mapping: tag name -> container of class names, or a predicate
    # ``callable(class_name) -> bool``
    allowed_classes = SAFE_CLASSES

    options = (
        'allowed_elements', 'allowed_attributes', 'allowed_css_properties',
        'allowed_css_keywords', 'allowed_protocols', 'escape_invalid_tags',
        'allowed_classes', 'attr_val_is_uri')
    # names from genshi-like style for backward compatibility
    property_aliases = [
        ('safe_tags', 'allowed_elements'),
        ('safe_attrs', 'allowed_attributes'), ('uri_attrs', 'attr_val_is_uri'),
        ('classes', 'allowed_classes'), ('safe_schemes', 'allowed_protocols')]

    # keyword arguments handed over unchanged to the base class constructor
    _passthrough_options = (
        'encoding', 'parseMeta', 'useChardet',
        'lowercaseElementName', 'lowercaseAttrName')

    # Characters stripped from a URI attribute value before its scheme is
    # inspected (pattern taken from html5lib), and the scheme matcher.
    # Raw strings keep ``\s`` and the octal escapes for the regex engine.
    _uri_junk_re = re.compile(r"[`\000-\040\177-\240\s]+")
    _uri_scheme_re = re.compile(r"^[a-z0-9][-+.a-z0-9]*:")

    def __init__(self, *args, **kwargs):
        """Apply option overrides, then initialise the base class.

        Keyword arguments are handled in three groups:

        * legacy names from ``property_aliases`` are renamed to their
          current equivalents (the legacy value wins if both are given);
        * names in ``options`` are set as instance attributes;
        * the base-class names in ``_passthrough_options`` are forwarded;
          anything else is silently discarded.
        """
        for old, new in self.property_aliases:
            if old in kwargs:
                # XXX write warning
                kwargs[new] = kwargs.pop(old)

        # Iterate over a snapshot of the keys: entries are removed from
        # ``kwargs`` inside the loop, which is an error when iterating a
        # live dictionary view.
        for key in list(kwargs):
            if key in self.options:
                setattr(self, key, kwargs.pop(key))
            elif key not in self._passthrough_options:
                del kwargs[key]
        super(TokenSanitazer, self).__init__(*args, **kwargs)

    def sanitize_token(self, token):
        """Clean a single tokenizer token.

        Returns the token (tag tokens are modified in place) when it is
        kept, or ``None`` when it is rejected: comment tokens, and tags
        outside ``allowed_elements`` while ``escape_invalid_tags`` is
        false. Rejected tags are instead converted to text by
        ``escape_token`` when ``escape_invalid_tags`` is true. Tokens
        that are neither tags nor comments are returned unchanged.
        """
        tag_types = (
            tokenTypes["StartTag"], tokenTypes["EndTag"],
            tokenTypes["EmptyTag"])
        if token["type"] in tag_types:
            if token["name"] in self.allowed_elements:
                if 'data' in token:
                    token["data"] = self._sanitize_attributes(token)
                return token
            if self.escape_invalid_tags:
                return self.escape_token(token)
            return None
        if token["type"] == tokenTypes["Comment"]:
            return None
        return token

    def _sanitize_attributes(self, token):
        """Return the cleaned attributes of *token* as ``[name, value]`` pairs.

        Attributes outside ``allowed_attributes`` are removed, URI
        attributes with a scheme outside ``allowed_protocols`` are
        removed, ``style`` is passed through ``sanitize_css`` and
        ``class`` through ``sanitize_classes``.
        """
        # Reversed so that the first occurrence of a repeated attribute
        # is the one that ends up in the dictionary.
        attrs = dict(
            (name, val) for name, val in token["data"][::-1]
            if name in self.allowed_attributes)

        # URI check, adapted from html5lib
        for attr in self.attr_val_is_uri:
            if attr not in attrs:
                continue
            val_unescaped = self._uri_junk_re.sub(
                '', unescape(attrs[attr])).lower()
            # remove replacement characters from unescaped characters
            val_unescaped = val_unescaped.replace(u"\ufffd", "")
            if (self._uri_scheme_re.match(val_unescaped) and
                    val_unescaped.split(':')[0] not in
                    self.allowed_protocols):
                del attrs[attr]

        if 'style' in attrs:
            # keep the attribute only if something survives CSS cleaning
            styles = self.sanitize_css(attrs.pop('style'))
            if styles:
                attrs['style'] = styles
        if 'class' in attrs:
            attrs = self.sanitize_classes(token, attrs)
        return [[name, val] for name, val in attrs.items()]

    def sanitize_classes(self, token, attrs):
        """Drop restricted class names from ``attrs['class']``.

        *attrs* must contain a ``class`` key. The entry registered in
        ``allowed_classes`` for the token's tag name is either a
        predicate taking a class name, or a container tested with ``in``.
        The ``class`` attribute is removed entirely when the tag has no
        entry or when no class name passes. Returns *attrs*, which is
        modified in place.
        """
        classes = attrs.pop('class').split()
        if token['name'] not in self.allowed_classes:
            return attrs
        allowed = self.allowed_classes[token['name']]
        if callable(allowed):
            kept = [cls for cls in classes if allowed(cls)]
        else:
            kept = [cls for cls in classes if cls in allowed]
        if kept:
            attrs['class'] = ' '.join(kept)
        return attrs

    def escape_token(self, token):
        """Turn a tag token into a ``Characters`` token holding its markup.

        A part of the html5lib ``sanitize_token`` method. The token is
        modified in place: ``data`` becomes the tag's source text
        (attribute values passed through ``escape``), ``type`` becomes
        ``Characters`` and ``name`` is removed. Returns the token.
        """
        if token["type"] == tokenTypes["EndTag"]:
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            attrs = ''.join(
                [' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
            token["data"] = "<%s%s>" % (token["name"], attrs)
        else:
            token["data"] = "<%s>" % token["name"]
        if token["type"] == tokenTypes["EmptyTag"]:
            # replace the closing ">" with "/>"
            token["data"] = token["data"][:-1] + "/>"
        token["type"] = tokenTypes["Characters"]
        del token["name"]
        return token


class Sanitizer(object):
    """Parse an HTML fragment through a token sanitizer and re-serialize it.

    Keyword arguments named in ``options`` override the class-level
    defaults on the instance. All remaining keyword arguments are kept in
    ``self.kwargs`` and passed both to the token sanitizer constructor
    and to every callback.
    """
    # NOTE: these two lists are class-level defaults shared by every
    # instance that does not override them; they are only iterated here.
    dom_callbacks = []      # each called as callback(dom_tree, **kwargs)
    string_callbacks = []   # each called as callback(html_string, **kwargs)
    method = 'xhtml'        # 'xhtml' selects XHTMLSerializer, else HTML
    strip_whitespace = True
    tokensanitazer = TokenSanitazer

    options = (
        'dom_callbacks', 'string_callbacks', 'method', 'strip_whitespace')

    def __init__(self, **kwargs):
        """Split *kwargs* into instance options and pass-through arguments."""
        # Iterate over a snapshot of the keys: entries are popped from
        # ``kwargs`` inside the loop, which is an error when iterating a
        # live dictionary view.
        for key in list(kwargs):
            if key in self.options:
                setattr(self, key, kwargs.pop(key))
        self.kwargs = kwargs

    def token_sanitizer(self):
        '''Return a factory that builds the token sanitizer.

        The returned function accepts whatever arguments the parser
        supplies, adds ``self.kwargs`` on top (these take precedence over
        keyword arguments of the same name) and instantiates
        ``self.tokensanitazer``.
        '''
        def func(*args, **kwargs):
            kwargs.update(self.kwargs)
            return self.tokensanitazer(*args, **kwargs)
        return func

    def get_dom(self, buf):
        '''Parse *buf* as an HTML fragment into a DOM tree.

        Leading and trailing whitespace is stripped first; ``None`` is
        returned when nothing is left.
        '''
        buf = buf.strip()
        if not buf:
            return None
        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                tokenizer=self.token_sanitizer())
        return p.parseFragment(buf)

    def render(self, dom_tree):
        '''Serialize *dom_tree* with the serializer chosen by ``method``.'''
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        if self.method == "xhtml":
            Serializer = serializer.xhtmlserializer.XHTMLSerializer
        else:
            Serializer = serializer.htmlserializer.HTMLSerializer
        ser = Serializer(
            strip_whitespace=self.strip_whitespace,
            quote_attr_values=True, omit_optional_tags=False)
        return ser.render(stream)

    def sanitize(self, buf):
        '''
            HTML sanitization with html5lib-like style interface

            Parses *buf*, applies ``dom_callbacks`` to the tree, renders
            it, applies ``string_callbacks`` to the rendered markup and
            returns the result as a text string. Returns ``''`` when
            *buf* is empty or whitespace only.
        '''
        dom_tree = self.get_dom(buf)
        if dom_tree is None:
            return ''

        for callback in self.dom_callbacks:
            dom_tree = callback(dom_tree, **self.kwargs)

        clean = self.render(dom_tree)

        for callback in self.string_callbacks:
            clean = callback(clean, **self.kwargs)

        # ``unicode`` does not exist on every interpreter; fall back to
        # ``str`` where it is missing.
        try:
            text_type = unicode
        except NameError:
            text_type = str
        return text_type(clean)