weblyzard/inscriptis

View on GitHub
src/inscriptis/annotation/output/html.py

Summary

Maintainability
A
1 hr
Test Coverage
"""HTML Annotation Processor."""
from collections import defaultdict
from itertools import cycle
from typing import Dict, Any, List

from inscriptis.annotation.output import AnnotationProcessor

# Background colors used for highlighting annotations, written as 8-digit
# hex values because they are inserted verbatim into the generated CSS
# ``background-color`` rules.  HtmlExtractor._get_label_colors cycles through
# this tuple, so colors repeat once there are more labels than entries.
COLOR_SCHEMA = ("#D8115980", "#8F2D5680", "#21838080", "#FBB13C80", "#73D2DE80")


class HtmlExtractor(AnnotationProcessor):
    """Provides an HTML version of the extracted text.

    The generated HTML colors annotations based on the COLOR_SCHEMA
    constant. Every annotation is rendered as a small label badge followed
    by a ``<span>`` that wraps the annotated text; every line of the text is
    placed into its own ``<pre>`` element.
    """

    verbatim = True

    # Characters that must not be copied verbatim into HTML text content.
    # Kept as a local table so that no additional import is required.
    _HTML_ESCAPES = {"&": "&amp;", "<": "&lt;", ">": "&gt;"}

    def __call__(self, annotated_text: Dict[str, Any]) -> str:
        """Render the annotated text as a standalone HTML document.

        Args:
            annotated_text: a dictionary providing the text under the key
                ``"text"`` and the annotations under the key ``"label"``.
                Every annotation is a ``(start, end, label)`` triple, where
                ``start`` (inclusive) and ``end`` (exclusive) are character
                indices into the text.

        Returns:
            The HTML document as a string.
        """
        annotations = annotated_text["label"]

        # start index -> [(end, label), ...] of the annotations opening there
        opening = defaultdict(list)
        closing_indices = set()
        for start, end, label in annotations:
            # Empty or inverted annotations cover no text; opening them would
            # leave a span that is never closed at its own end index.
            if start < end:
                opening[start].append((end, label))
                closing_indices.add(end)

        # Stack of the currently open (end, label) spans, outermost first.
        open_tags = []
        tagged_content = [
            "<html><head><style>",
            self._get_css(annotations),
            "</style></head><body><pre>",
        ]
        for idx, ch in enumerate(annotated_text["text"]):
            # close tags:
            if idx in closing_indices:
                self._close_ended_spans(idx, open_tags, tagged_content)

            # open tags: the annotation ending last becomes the outermost
            # span; ties are broken by reverse label order.
            for entry in sorted(opening.get(idx, ()), reverse=True):
                open_tags.append(entry)
                tagged_content.append(
                    '<span class="{tag}-label">{tag}</span>'
                    '<span class="{tag}">'.format(tag=entry[1])
                )

            if ch == "\n":
                # A span must not cross a <pre> boundary: close all open
                # spans, start a new <pre> and re-open them (without badge).
                tagged_content.extend("</span>" for _ in open_tags)
                tagged_content.append("</pre>\n<pre>")
                tagged_content.extend(
                    '<span class="{tag}">'.format(tag=label)
                    for _, label in open_tags
                )
            else:
                tagged_content.append(self._HTML_ESCAPES.get(ch, ch))

        # Annotations ending at (or beyond) the end of the text are never
        # reached by the loop above and have to be closed explicitly.
        tagged_content.extend("</span>" for _ in open_tags)
        tagged_content.append("</pre></body></html>")
        return "".join(tagged_content)

    @staticmethod
    def _close_ended_spans(
        idx: int, open_tags: List[Any], tagged_content: List[str]
    ) -> None:
        """Close all open spans whose annotation ends at the given index.

        Spans that were opened inside an ending span but continue past
        ``idx`` are closed together with it and immediately re-opened, so
        that the generated markup stays well-formed for overlapping
        annotations.

        Args:
            idx: the current character index.
            open_tags: the stack of open ``(end, label)`` spans (outermost
                first); modified in place.
            tagged_content: the list of output fragments; modified in place.
        """
        outermost = next(
            (pos for pos, (end, _) in enumerate(open_tags) if end == idx),
            None,
        )
        if outermost is None:
            return

        affected = open_tags[outermost:]
        del open_tags[outermost:]
        tagged_content.extend("</span>" for _ in affected)
        for entry in affected:
            if entry[0] != idx:
                open_tags.append(entry)
                tagged_content.append(
                    '<span class="{tag}">'.format(tag=entry[1])
                )

    @staticmethod
    def _get_label_colors(labels: List[Any]) -> Dict[str, str]:
        """Compute the mapping between annotation labels and colors.

        The used color schema is available in the global variable COLOR_SCHEMA.
        Colors are assigned to the alphabetically sorted label names, which
        keeps the mapping identical between runs.

        Args:
            labels: a list of ``(start, end, label)`` annotations; only the
                    label names (e.g., heading, etc.) are used.
        Returns:
            A mapping between the available labels and the corresponding color
            from the COLOR_SCHEMA.
        """
        # Sort *after* removing duplicates: iterating the bare set would
        # yield the label names in an arbitrary order.
        return dict(zip(sorted({a[2] for a in labels}), cycle(COLOR_SCHEMA)))

    def _get_css(self, labels: List[Any]) -> str:
        """Compute the CSS to be included into the HTML output.

        Args:
            labels: a list of ``(start, end, label)`` annotations whose
                    labels need to be color-coded.

        Returns:
            A string containing the CSS to be embedded into the HTML output;
            the empty string, if there are no annotations.

        """
        label_colors = self._get_label_colors(labels)
        if not label_colors:
            return ""

        rules = [
            ".{label} {{\n"
            "  background-color: {color};\n"
            "  border-radius: 0.4em;\n"
            "}}\n"
            ".{label}-label {{\n"
            "  top: -1.0em;\n"
            '  content: "{label}";\n'
            "  position: absolute;\n"
            "  background-color: {color};\n"
            "  font-size: 75%; }}\n".format(label=label, color=color)
            for label, color in sorted(label_colors.items())
        ]
        # The rule for <pre> does not depend on the label: emit it only once.
        return "pre{  position: relative;\n}\n" + "\n".join(rules)