# podcasts/utils/filters.py
"""Bleach-based HTML cleaners and link helpers for feed content."""

from urllib.parse import urlparse

from bleach.sanitizer import BleachSanitizerFilter, Cleaner
from django.utils.text import format_lazy
from html5lib.filters.base import Filter

# Field names whose HTML is cleaned.
# NOTE(review): presumably feed-level vs. episode-level fields; confirm at the call sites.
CLEAN_HTML_GLOBAL = ["summary", "subtitle"]
CLEAN_HTML_EPISODE = ["description", "subtitle"]

ALLOWED_HTML_TAGS = [
    "a",
    "abbr",
    "acronym",
    "b",
    "blockquote",
    "code",
    "em",
    "i",
    "li",
    "ol",
    "p",
    "strong",
    "ul",
]

ALLOWED_HTML_ATTRIBUTES = {
    "a": ["href", "title"],
    "acronym": ["title"],
    "abbr": ["title"],
}

EXTENDED_HTML_TAGS = [
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "img",
    "table",
    "thead",
    "tbody",
    "tr",
    "th",
    "td",
]

EXTENDED_HTML_ATTRIBUTES = {"img": ["rel", "src", "alt"], "td": ["colspan", "rowspan"]}


def clean_link(link, include_path=False):
    """Return a shortened, display-friendly form of *link*.

    The result is the URL's network location with a leading ``www.``
    removed. With ``include_path=True`` the path is appended (trailing
    slashes stripped); paths with more than one segment are abbreviated to
    ``/…/<last segment>``.
    """
    parsed = urlparse(link)
    netloc = parsed.netloc
    if netloc.startswith("www."):
        netloc = netloc[4:]

    if not include_path:
        return netloc

    path = parsed.path.rstrip("/")
    splits = path.split("/")
    # An absolute path splits into a leading "" plus its segments, so more
    # than two parts means at least two real segments.
    if len(splits) > 2:
        path = "/…/" + splits[-1]
    return netloc + path


class CleanerWithOptions(Cleaner):
    """A ``Cleaner`` whose extra filters receive per-call options."""

    def clean(self, text, allowed_domains=False):
        """Sanitize *text* and return the cleaned markup.

        ``allowed_domains`` is forwarded as a keyword argument to the
        ``__iter__`` of every filter in ``self.filters``; any falsy value
        is treated as an empty list.

        Raises:
            TypeError: if *text* is not a ``str``.
        """
        if not allowed_domains:
            allowed_domains = []

        if not isinstance(text, str):
            message = f"argument cannot be of '{text.__class__.__name__}' type, must be of text type"
            raise TypeError(message)

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        # NOTE(review): these keyword names follow the bleach release this
        # project was written against; verify them when upgrading bleach.
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),
            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,
            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            fc = filter_class(source=filtered)
            # Called explicitly (not via iter()) so the option can be passed.
            filtered = fc.__iter__(allowed_domains=allowed_domains)

        return self.serializer.render(filtered)


class ImgSrcFilter(Filter):
    """Token filter that blanks ``src`` attributes from non-allowed domains."""

    def __iter__(self, **kwargs):
        """Yield every token, masking foreign ``src`` attributes in place.

        Keyword Args:
            allowed_domains: domains (as returned by ``clean_link``) whose
                sources are left untouched. Defaults to an empty list.
        """
        allowed_domains = kwargs.pop("allowed_domains", [])
        for token in Filter.__iter__(self):
            if token["type"] in ("StartTag", "EmptyTag") and token["data"]:
                self._mask_foreign_source(token["data"], allowed_domains)
            yield token

    @staticmethod
    def _mask_foreign_source(attributes, allowed_domains):
        """Rewrite *attributes* so a source outside *allowed_domains* is not loaded."""
        data_alt = None
        data_src = None
        # Attribute keys are tuples; index 1 holds the attribute name.
        for attr, value in attributes.items():
            if attr[1] == "alt":
                data_alt = value
            elif attr[1] == "src":
                data_src = value

        if not data_src:
            return
        domain = clean_link(data_src)
        if domain in allowed_domains:
            return

        # Keep the real values in data-* attributes and replace the visible
        # ones with a placeholder.
        attributes[(None, "data-src")] = data_src
        attributes[(None, "class")] = "has-src"
        attributes[(None, "alt")] = format_lazy("Image from {domain}", domain=domain)
        attributes[(None, "src")] = ""
        if data_alt:
            attributes[(None, "data-alt")] = data_alt


# Strips all markup.
subtitle_cleaner = Cleaner(tags=[], strip=True)

# Basic inline/list markup only.
summary_cleaner = Cleaner(tags=ALLOWED_HTML_TAGS, attributes=ALLOWED_HTML_ATTRIBUTES, strip=True)

# Basic markup plus headings, images and tables.
shownotes_cleaner = Cleaner(
    tags=ALLOWED_HTML_TAGS + EXTENDED_HTML_TAGS,
    attributes={**ALLOWED_HTML_ATTRIBUTES, **EXTENDED_HTML_ATTRIBUTES},
    strip=True,
)

# Same tag set as shownotes_cleaner, with sources from non-allowed domains masked.
shownotes_image_cleaner = CleanerWithOptions(
    tags=ALLOWED_HTML_TAGS + EXTENDED_HTML_TAGS,
    attributes={**ALLOWED_HTML_ATTRIBUTES, **EXTENDED_HTML_ATTRIBUTES},
    strip=True,
    filters=[ImgSrcFilter],
)