python/whylogs/viz/notebook_profile_viz.py from whylabs/whylogs-python

python/whylogs/viz/notebook_profile_viz.py
Summary

Maintainability

5 hrs
Test Coverage

Issues
import html
import json
import logging
import os
from typing import Any, Dict, List, Optional, Union

from IPython.core.display import HTML  # type: ignore

import whylogs.viz.drift.column_drift_algorithms as column_drift_algorithms
from whylogs.api.usage_stats import emit_usage
from whylogs.core.configs import SummaryConfig
from whylogs.core.constraints import Constraints
from whylogs.core.view.dataset_profile_view import DatasetProfileView
from whylogs.migration.uncompound import _uncompound_dataset_profile
from whylogs.viz.enums.enums import PageSpec, PageSpecEnum
from whylogs.viz.utils.frequent_items_calculations import zero_padding_frequent_items
from whylogs.viz.utils.html_template_utils import _get_compiled_template
from whylogs.viz.utils.profile_viz_calculations import (
    add_feature_statistics,
    frequent_items_from_view,
    generate_profile_summary,
    generate_summaries_with_drift_score,
    histogram_from_view,
)

# Module-level logger, named after this module; the visualizer reports its warnings and errors through it.
logger = logging.getLogger(__name__)
# Executed once, when this module is imported.
# NOTE(review): presumably records a "visualizer" usage-statistics event -- confirm in whylogs.api.usage_stats.
emit_usage("visualizer")


class NotebookProfileVisualizer:
    """
    Visualize and compare profiles for drift detection, data quality, distribution comparison and feature statistics.

    NotebookProfileVisualizer enables visualization features for Jupyter Notebook environments, but also enables
    download of the generated reports as HTML files.

    Examples
    --------

    Create target and reference dataframes:

    .. code-block:: python

        import pandas as pd

        data_target = {
            "animal": ["cat", "hawk", "snake", "cat", "snake", "cat", "cat", "snake", "hawk","cat"],
            "legs": [4, 2, 0, 4, 0, 4, 4, 0, 2, 4],
            "weight": [4.3, None, 2.3, 7.8, 3.7, 2.5, 5.5, 3.3, 0.6, 13.3],
        }

        data_reference = {
            "animal": ["hawk", "hawk", "snake", "hawk", "snake", "snake", "cat", "snake", "hawk","snake"],
            "legs": [2, 2, 0, 2, 0, 0, 4, 0, 2, 0],
            "weight": [2.7, None, 1.2, 10.5, 2.2, 4.6, 3.8, 4.7, 0.6, 11.2],
        }

        target_df = pd.DataFrame(data_target)
        reference_df = pd.DataFrame(data_reference)

    Log data and create profile views:

    .. code-block:: python

        import whylogs as why

        results = why.log(pandas=target_df)
        prof_view = results.view()

        results_ref = why.log(pandas=reference_df)
        prof_view_ref = results_ref.view()

    Log data and create profile views:

    .. code-block:: python

        import whylogs as why

        results = why.log(pandas=target_df)
        prof_view = results.view()

        results_ref = why.log(pandas=reference_df)
        prof_view_ref = results_ref.view()

    Instantiate and set profile views:

    .. code-block:: python

        from whylogs.viz import NotebookProfileVisualizer

        visualization = NotebookProfileVisualizer()
        visualization.set_profiles(target_profile_view=prof_view,reference_profile_view=prof_view_ref)
    """

    _ref_view: Optional[DatasetProfileView]
    _target_view: DatasetProfileView
    _drift_map: Optional[Dict[str, column_drift_algorithms.ColumnDriftAlgorithm]] = None

    @staticmethod
    def _display(template: str, page_spec: PageSpec, height: Optional[str]) -> "HTML":
        """Wrap a rendered template in an iframe and return it as an IPython ``HTML`` object.

        Parameters
        ----------
        template: str
            Rendered page markup. It is HTML-escaped and embedded in the iframe's ``srcdoc`` attribute.
        page_spec: PageSpec
            Page specification; its ``height`` is used when `height` is empty or None.
        height: str, optional
            Height to put on the iframe.

        Returns
        -------
        HTML
            Object wrapping the iframe markup.
        """
        frame_height = height or page_spec.height
        escaped_page = html.escape(template)
        # The line break and the eight spaces before ``frameBorder`` are part of the emitted markup.
        markup = (
            f'<div></div><iframe srcdoc="{escaped_page}" width=100% height={frame_height}\n'
            "        frameBorder=0></iframe>"
        )
        return HTML(markup)

    def _display_distribution_chart(
        self,
        feature_name: str,
        difference: bool,
        cell_height: Optional[str] = None,
        config: Optional[SummaryConfig] = None,
    ) -> Optional[HTML]:
        """Render the frequent-items chart of one feature for the target (and reference) profile.

        Parameters
        ----------
        feature_name: str
            Column whose frequent items are plotted.
        difference: bool
            Selects the ``DIFFERENCED_CHART`` page when true, the ``DISTRIBUTION_CHART`` page otherwise.
        cell_height: str, optional
            Height for the in-notebook iframe. (Default is None)
        config: SummaryConfig, optional
            Passed to `frequent_items_from_view`; a default `SummaryConfig()` is created when None.

        Returns
        -------
        HTML or None
            The rendered chart, or None (after a warning) when no target profile is set.

        Raises
        ------
        ValueError
            If the feature has no column in the target profile or in the reference profile (when one is set).
        """
        summary_config = SummaryConfig() if config is None else config
        page_kind = PageSpecEnum.DIFFERENCED_CHART if difference else PageSpecEnum.DISTRIBUTION_CHART
        page_spec = page_kind.value
        template = _get_compiled_template(page_spec.html)

        if not self._target_view:
            logger.warning("This method has to get at least a target profile, with valid feature title")
            return None

        target_column = self._target_view.get_column(feature_name)
        if not target_column:
            raise ValueError(f"ColumnProfileView for feature {feature_name} not found.")
        target_items = frequent_items_from_view(target_column, feature_name, summary_config)

        if self._ref_view:
            reference_column = self._ref_view.get_column(feature_name)
            if not reference_column:
                raise ValueError(f"ColumnProfileView for feature {feature_name} not found.")
            reference_items = frequent_items_from_view(reference_column, feature_name, summary_config)
            target_items, reference_items = zero_padding_frequent_items(
                target_feature_items=target_items,
                reference_feature_items=reference_items,
            )
        else:
            logger.warning("Reference profile not detected. Plotting only for target feature.")
            # Mirror the target's categories with a zero estimate so the chart shows a single profile.
            reference_items = [{"value": item["value"], "estimate": 0} for item in target_items]

        target_payload: Dict[str, Dict[str, Any]] = {feature_name: {"frequentItems": target_items}}
        reference_payload: Dict[str, Dict[str, Any]] = {feature_name: {"frequentItems": reference_items}}
        rendered_chart = template(
            {
                "profile_from_whylogs": json.dumps(target_payload),
                "reference_profile_from_whylogs": json.dumps(reference_payload),
            }
        )
        return self._display(rendered_chart, page_spec, cell_height)

    def _display_histogram_chart(self, feature_name: str, cell_height: Optional[str] = None) -> Optional[HTML]:
        """Render the histogram chart of one feature for the target (and reference) profile.

        Parameters
        ----------
        feature_name: str
            Column whose histogram is plotted.
        cell_height: str, optional
            Height for the in-notebook iframe. (Default is None)

        Returns
        -------
        HTML or None
            The rendered chart, or None (after a warning) when no target profile is set.

        Raises
        ------
        ValueError
            If the feature has no column in the target profile or in the reference profile (when one is set).
        """
        page_spec = PageSpecEnum.DOUBLE_HISTOGRAM.value
        template = _get_compiled_template(page_spec.html)

        if not self._target_view:
            logger.warning("This method has to get at least a target profile, with valid feature title")
            return None

        target_column = self._target_view.get_column(feature_name)
        if not target_column:
            raise ValueError("ColumnProfileView for feature {} not found.".format(feature_name))
        target_histogram = histogram_from_view(target_column, feature_name)

        if self._ref_view:
            reference_column = self._ref_view.get_column(feature_name)
            if not reference_column:
                raise ValueError(f"ColumnProfileView for feature {feature_name} not found.")
            reference_histogram = histogram_from_view(reference_column, feature_name)
        else:
            logger.warning("Reference profile not detected. Plotting only for target feature.")
            # Reuse the target's histogram layout with all counts zeroed to plot a single profile.
            reference_histogram = target_histogram.copy()
            reference_histogram["counts"] = [0] * len(reference_histogram["counts"])

        target_entry: Dict[str, Any] = {"histogram": target_histogram}
        if target_histogram["n"] == 1:
            # A single observation would be hidden in the histogram, so a vertical line is drawn
            # at the max (which is the observed value).
            target_entry["vertical_line"] = target_histogram["max"]
        reference_entry: Dict[str, Any] = {"histogram": reference_histogram}

        rendered_chart = template(
            {
                "profile_from_whylogs": json.dumps({feature_name: target_entry}),
                "reference_profile_from_whylogs": json.dumps({feature_name: reference_entry}),
            }
        )
        return self._display(rendered_chart, page_spec, height=cell_height)

    def add_drift_config(
        self, column_names: List[str], algorithm: column_drift_algorithms.ColumnDriftAlgorithm
    ) -> None:
        """Add drift configuration.

        The algorithms and thresholds added through this method will be used to calculate drift scores in the
        `summary_drift_report()` method.
        If any drift configuration exists, the new configuration will overwrite the standard behavior when applicable.
        If a column has multiple configurations defined, the last one defined will be used.

        Parameters
        ----------
        column_names: List[str], required
            Columns the algorithm applies to. Every name must exist in both the target and the reference profile.
        algorithm: ColumnDriftAlgorithm, required
            Drift algorithm to associate with each of the given columns.

        Raises
        ------
        ValueError
            If `algorithm` is not a `ColumnDriftAlgorithm` instance, if target and reference profiles were not set
            with `set_profiles`, if `column_names` is empty, or if a column is missing from either profile.
        """
        # This check also rejects None, so no separate None check is needed.
        if not isinstance(algorithm, column_drift_algorithms.ColumnDriftAlgorithm):
            raise ValueError("Algorithm must be of class ColumnDriftAlgorithm.")

        # The view attributes only exist after `set_profiles`; getattr turns that case into the
        # ValueError below instead of an AttributeError.
        target_view = getattr(self, "_target_view", None)
        ref_view = getattr(self, "_ref_view", None)
        if not target_view or not ref_view:
            message = "Set target and reference profiles before adding drift configuration."
            logger.error(message)
            raise ValueError(message)
        if not column_names:
            raise ValueError("Drift configuration must have at least one column name.")

        # Validate every column against BOTH profiles before touching any state.
        target_columns = target_view.get_columns()
        reference_columns = ref_view.get_columns()
        for column_name in column_names:
            if column_name not in target_columns:
                raise ValueError(f"Column {column_name} not found in target profile.")
            if column_name not in reference_columns:
                raise ValueError(f"Column {column_name} not found in reference profile.")

        if not self._drift_map:
            self._drift_map = {}
        for column_name in column_names:
            if column_name in self._drift_map:
                logger.warning("Overwriting existing drift configuration for column %s.", column_name)
            self._drift_map[column_name] = algorithm

    def set_profiles(
        self, target_profile_view: DatasetProfileView, reference_profile_view: Optional[DatasetProfileView] = None
    ) -> None:
        """Store the profile views used by the visualization and comparison methods.

        Each view that is passed goes through `_uncompound_dataset_profile` before being stored; a missing
        (falsy) view is stored as None.
        Drift calculation is done if both `target_profile` and `reference profile` are passed.

        Parameters
        ----------
        target_profile_view: DatasetProfileView, required
            Target profile to visualize.
        reference_profile_view: DatasetProfileView, optional
            Reference, or baseline, profile to be compared against the target profile.

        """
        if target_profile_view:
            self._target_view = _uncompound_dataset_profile(target_profile_view)
        else:
            self._target_view = None

        if reference_profile_view:
            self._ref_view = _uncompound_dataset_profile(reference_profile_view)
        else:
            self._ref_view = None

    def profile_summary(self, cell_height: Optional[str] = None) -> HTML:
        """Render the profile summary page for the target profile.

        Parameters
        ----------
        cell_height: str, optional
            Preferred cell height for in-notebook visualization. (Default is None)

        Returns
        -------
        HTML
            HTML Page of the rendered summary.

        Raises
        ------
        ValueError
            Re-raised, after logging an error, when building or rendering the summary raises it.
        """
        page_spec = PageSpecEnum.PROFILE_SUMMARY.value
        template = _get_compiled_template(page_spec.html)

        try:
            summary = generate_profile_summary(self._target_view, config=None)
            return self._display(template(summary), page_spec, cell_height)
        except ValueError:
            logger.error("This method has to get target Dataset Profile View")
            raise

    def summary_drift_report(self, height: Optional[str] = None) -> HTML:
        """Generate drift report between target and reference profiles.

        KS is calculated if distribution metrics exists for said column.
        If not, Chi2 is calculated if frequent items, cardinality and count metric exists. If not, then no drift
        value is associated to the column.
        If feature is missing from any profile, it will not be included in the report.
        Both target_profile_view and reference_profile_view must be set previously with `set_profiles`.
        If custom drift behavior is desired, use `add_drift_config` before calling this method.

        Parameters
        ----------
        height: str, optional
            Preferred height, in pixels, for in-notebook visualization. Example:
            `"1000px"`. (Default is None)

        Returns
        -------
        HTML
            HTML Page of the given plot.

        Raises
        ------
        ValueError
            If the target or the reference profile is not set.

        Examples
        --------

        Generate Summary Drift Report (after setting profiles with `set_profiles`):

        .. code-block:: python

            from whylogs.viz.drift.column_drift_algorithms import Hellinger, ChiSquare
            from whylogs.viz import NotebookProfileVisualizer

            visualization = NotebookProfileVisualizer()
            visualization.set_profiles(target_profile_view=target_view, reference_profile_view=ref_view)

            visualization.add_drift_config(column_names=["weight"], algorithm=Hellinger())
            visualization.add_drift_config(column_names=["legs"], algorithm=ChiSquare())

            visualization.summary_drift_report()

        """
        if not (self._target_view and self._ref_view):
            logger.error("This method has to get both target and reference profiles")
            raise ValueError

        page_spec = PageSpecEnum.SUMMARY_REPORT.value
        template = _get_compiled_template(page_spec.html)
        summaries = generate_summaries_with_drift_score(
            self._target_view, self._ref_view, config=None, drift_map=self._drift_map
        )
        return self._display(template(summaries), page_spec, height)

    def double_histogram(self, feature_name: Union[str, List[str]], cell_height: Optional[str] = None) -> HTML:
        """Plot overlayed histograms for specified feature present in both `target_profile` and `reference_profile`.

        Applicable to numerical features only.
        If reference profile was not set, `double_histogram` will plot single histogram for target profile.

        Parameters
        ----------
        feature_name: str or List[str]
            Name of the feature, or list of feature names, to generate histograms for. One chart is produced
            per feature and the charts are joined with ``<br>``.
        cell_height: str, optional
            Preferred cell height, in pixels, for in-notebook visualization. Example:
            `"1000px"`. (Default is None)

        Returns
        -------
        HTML
            HTML Page of the given plot(s).

        Examples
        --------

        Generate double histogram plot for feature named `weight` (after setting profiles with `set_profiles`)

        .. code-block:: python

            visualization.double_histogram(feature_name="weight")
        """
        features = [feature_name] if isinstance(feature_name, str) else feature_name
        charts = (self._display_histogram_chart(name, cell_height) for name in features)
        # Features for which no chart was produced are left out of the page.
        return HTML("<br>".join(chart.data for chart in charts if chart))

    def distribution_chart(self, feature_name: Union[str, List[str]], cell_height: Optional[str] = None) -> HTML:
        """Plot overlayed distribution charts for specified feature between two profiles.

        Applicable to categorical features.
        If reference profile was not set, `distribution_chart` will plot single chart for target profile.

        Parameters
        ----------
        feature_name : str or List[str]
            Name of the feature, or list of feature names, to plot. One chart is produced per feature and
            the charts are joined with ``<br>``.
        cell_height : str, optional
            Preferred cell height, in pixels, for in-notebook visualization. Example:
            `cell_height="1000px"`. (Default is None)

        Returns
        -------
        HTML
            HTML Page of the given plot(s).

        Examples
        --------

        Generate distribution chart for `animal` feature (after setting profiles with `set_profiles`):

        .. code-block:: python

            visualization.distribution_chart(feature_name="animal")
        """
        features = [feature_name] if isinstance(feature_name, str) else feature_name
        pages: List[str] = []
        for name in features:
            chart = self._display_distribution_chart(name, False, cell_height)
            if not chart:
                # No chart was produced for this feature; leave it out of the page.
                continue
            pages.append(chart.data)
        return HTML("<br>".join(pages))

    def difference_distribution_chart(
        self, feature_name: Union[str, List[str]], cell_height: Optional[str] = None
    ) -> HTML:
        """Plot overlayed distribution charts of differences between the categories of both profiles.

        Applicable to categorical features.

        Parameters
        ----------
        feature_name : str or List[str]
            Name of the feature, or list of feature names, to plot. One chart is produced per feature and
            the charts are joined with ``<br>``.
        cell_height : str, optional
            Preferred cell height, in pixels, for in-notebook visualization. Example:
            `cell_height="1000px"`. (Default is None)

        Returns
        -------
        HTML
            HTML Page of the given plot(s).

        Examples
        --------

        Generate Difference Distribution Chart for feature named "animal":

        .. code-block:: python

            visualization.difference_distribution_chart(feature_name="animal")

        """
        features = [feature_name] if isinstance(feature_name, str) else feature_name
        charts = (self._display_distribution_chart(name, True, cell_height) for name in features)
        # Features for which no chart was produced are left out of the page.
        return HTML("<br>".join(chart.data for chart in charts if chart))

    def constraints_report(self, constraints: Constraints, cell_height: Optional[str] = None) -> HTML:
        """Render the constraints report page for the given constraints.

        Parameters
        ----------
        constraints: Constraints
            Constraints whose report, generated with ``with_summary=True``, is serialized to JSON and
            passed to the page template.
        cell_height: str, optional
            Preferred cell height for in-notebook visualization. (Default is None)

        Returns
        -------
        HTML
            HTML Page of the rendered report.
        """
        page_spec = PageSpecEnum.CONSTRAINTS_REPORT.value
        template = _get_compiled_template(page_spec.html)
        report_json = json.dumps(constraints.generate_constraints_report(with_summary=True))
        rendered_page = template({"constraints_report": report_json})
        return self._display(rendered_page, page_spec, cell_height)

    def feature_statistics(
        self, feature_name: Union[str, List[str]], profile: str = "reference", cell_height: Optional[str] = None
    ) -> HTML:
        """
        Generate a report for the main statistics of specified feature, for a given profile (target or reference).

        Statistics include overall metrics such as distinct and missing values, as well as quantile and descriptive
        statistics.
        If `profile` is not passed, the default is the reference profile. The reference profile is only used when
        it is set and `profile` equals "reference" (case-insensitive); in every other case the target profile
        is used.

        Parameters
        ----------
        feature_name: str or List[str]
            Name of the feature, or list of feature names, to generate statistics for. One report is produced
            per feature and the reports are joined with ``<br>``.
        profile: str
            Profile to be used to generate the report. (Default is `reference`)
        cell_height: str, optional
            Preferred cell height, in pixels, for in-notebook visualization. Example:
            `cell_height="1000px"`. (Default is None)

        Returns
        -------
        HTML
            HTML Page of the generated report(s).

        Examples
        --------

        Generate feature statistics for feature named "weight", for target profile:

        .. code-block:: python

            visualization.feature_statistics(feature_name="weight", profile="target")

        """
        features = [feature_name] if isinstance(feature_name, str) else feature_name
        page_spec = PageSpecEnum.FEATURE_STATISTICS.value
        template = _get_compiled_template(page_spec.html)

        rendered_pages: List[str] = []
        for name in features:
            use_reference = self._ref_view and profile.lower() == "reference"
            source_view = self._ref_view if use_reference else self._target_view
            column_view = source_view.get_column(name)

            statistics_json = json.dumps(add_feature_statistics(name, column_view))
            page = template({"profile_feature_statistics_from_whylogs": statistics_json})
            rendered_pages.append(self._display(page, page_spec, cell_height).data)
        return HTML("<br>".join(rendered_pages))

    @staticmethod
    def write(
        rendered_html: HTML,
        preferred_path: Optional[str] = None,  # type: ignore
        html_file_name: Optional[str] = None,  # type: ignore
    ) -> None:
        """Create HTML file for a given report.

        The file is written as UTF-8, and the destination directory is created if it does not exist yet.

        Parameters
        ----------
        rendered_html: HTML, required
            Rendered HTML returned by a given report.
        preferred_path: str, optional
            Preferred directory to write the HTML file to (``~`` is expanded). If none is passed, the file is
            written to ``html_reports`` under the parent of the current working directory.
        html_file_name: str, optional
            Name for the created HTML file, without the ``.html`` extension. If none is passed, created HTML
            will be named `ProfileVisualizer.html`

        Examples
        --------
        Writes an HTML page named `test.html` into the current working directory, with feature statistics for
        `weight` feature for the target profile.

        .. code-block:: python

            import os
            visualization.write(
                rendered_html=visualization.feature_statistics(feature_name="weight", profile="target"),
                html_file_name=os.getcwd() + "/test",
            )


        """
        file_name = f"{html_file_name or 'ProfileVisualizer'}.html"
        if preferred_path:
            target_dir = os.path.expanduser(preferred_path)
        else:
            target_dir = os.path.join(os.pardir, "html_reports")
        # An absolute `html_file_name` makes os.path.join discard `target_dir`, as in the docstring example.
        full_path = os.path.abspath(os.path.join(target_dir, file_name))

        # The default `../html_reports` directory usually does not exist; create it instead of failing.
        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        # Explicit encoding: the platform default may be unable to encode non-ASCII report content.
        with open(full_path, "w", encoding="utf-8") as saved_html:
            saved_html.write(rendered_html.data)