whylabs/whylogs-python

View on GitHub
python/whylogs/viz/utils/frequent_items_calculations.py

Summary

Maintainability
A
25 mins
Test Coverage
from typing import List, Optional, Tuple

from typing_extensions import TypedDict

from whylogs.core.configs import SummaryConfig
from whylogs.core.metrics.metrics import FrequentItem
from whylogs.core.view.column_profile_view import ColumnProfileView


# Plain-dictionary form of one frequent item, as built by
# get_frequent_items_estimate and padded by zero_padding_frequent_items.
FrequentItemEstimate = TypedDict(
    "FrequentItemEstimate",
    {
        "value": str,  # the item itself
        "estimate": float,  # estimated count reported for the item
    },
)


def get_frequent_items_estimate(frequent_items: List[FrequentItem]) -> List[FrequentItemEstimate]:
    """Convert frequent-item records into plain ``value``/``estimate`` dictionaries.

    Parameters
    ----------
    frequent_items : List[FrequentItem]
        Records exposing ``value`` and ``est`` attributes.

    Returns
    -------
    List[FrequentItemEstimate]
        One dictionary per input record, in the same order as the input.
    """
    estimates: List[FrequentItemEstimate] = [
        {"value": item.value, "estimate": item.est} for item in frequent_items
    ]
    return estimates


# Summary returned by get_frequent_stats for a single column.
FrequentStats = TypedDict(
    "FrequentStats",
    {
        "frequent_items": List[FrequentItemEstimate],  # per-item estimates
        "total_count": int,  # value read from the "counts" metric
        "unique_count": int,  # integer cardinality estimate
    },
)


def get_frequent_stats(
    column_view: ColumnProfileView, config: Optional[SummaryConfig] = None
) -> Optional[FrequentStats]:
    """Collect frequent-item estimates together with total and unique counts for a column.

    Parameters
    ----------
    column_view : ColumnProfileView
        Column view queried for the ``frequent_items``, ``counts`` and
        ``cardinality`` metrics.
    config : Optional[SummaryConfig]
        Configuration passed to the frequent items metric when building its
        summary. A default ``SummaryConfig()`` is used when omitted.

    Returns
    -------
    Optional[FrequentStats]
        The frequent items, the value of the counts metric's ``n`` component and
        the cardinality metric's ``hll`` estimate converted to ``int``.
        ``None`` when any of the three metrics is missing from the column view.
    """
    frequent_items_metric = column_view.get_metric("frequent_items")
    if frequent_items_metric is None:
        return None

    # A missing "counts" metric is treated like the other missing metrics
    # instead of failing with an AttributeError on ``None.n``.
    counts_metric = column_view.get_metric("counts")
    if counts_metric is None:
        return None

    cardinality_metric = column_view.get_metric("cardinality")
    if not cardinality_metric:
        return None

    # Only build the config and the summary once every required metric is known
    # to be available.
    if config is None:
        config = SummaryConfig()
    frequent_strings = frequent_items_metric.to_summary_dict(config)["frequent_strings"]

    frequent_stats: FrequentStats = {
        "frequent_items": get_frequent_items_estimate(frequent_strings),
        "total_count": counts_metric.n.value,
        "unique_count": int(cardinality_metric.hll.value.get_estimate()),
    }
    return frequent_stats


def frequent_items_from_view(
    column_view: ColumnProfileView, feature_name: str, config: Optional[SummaryConfig] = None
) -> List[FrequentItemEstimate]:
    """Return the frequent-item estimates of a column, failing if the metric is absent.

    Parameters
    ----------
    column_view : ColumnProfileView
        Column view queried for its ``frequent_items`` metric.
    feature_name : str
        Name of the feature; only used in the error message.
    config : Optional[SummaryConfig]
        Configuration passed to the metric when building its summary. A default
        ``SummaryConfig()`` is used when no (or a falsy) config is given.

    Returns
    -------
    List[FrequentItemEstimate]
        The entries under ``frequent_strings`` in the metric summary, converted
        to ``value``/``estimate`` dictionaries.

    Raises
    ------
    ValueError
        If the column view has no ``frequent_items`` metric.
    """
    if not config:
        config = SummaryConfig()

    fi_metric = column_view.get_metric("frequent_items")
    if not fi_metric:
        message = "Frequent Items Metrics not found for feature {}.".format(feature_name)
        raise ValueError(message)

    summary = fi_metric.to_summary_dict(config)
    return get_frequent_items_estimate(summary["frequent_strings"])


def zero_padding_frequent_items(
    target_feature_items: List[FrequentItemEstimate], reference_feature_items: List[FrequentItemEstimate]
) -> Tuple[List[FrequentItemEstimate], List[FrequentItemEstimate]]:
    """Fills estimate value of item with 0 when such item is present in the other profile but absent in the current profile.
    This is done for both profiles passed.

    Both input lists are extended in place; padded entries are appended at the end,
    in the order in which the missing values appear in the other list.

    Parameters
    ----------
    target_feature_items : List[FrequentItemEstimate]
        A list of frequent items of a given column for target profile
    reference_feature_items : List[FrequentItemEstimate]
        A list of frequent items of a given column for reference profile

    Returns
    -------
    Tuple[List[FrequentItemEstimate], List[FrequentItemEstimate]]
        The same list of items given in the input, but with zero padding for absent items.
    """
    # Track the values already present in each list with a set so that every
    # membership test is O(1) instead of rebuilding a list of values per item.
    # Item values are therefore expected to be hashable (they are declared as str).
    target_values = {item["value"] for item in target_feature_items}
    for reference_item in reference_feature_items:
        item_value = reference_item["value"]
        if item_value not in target_values:
            target_values.add(item_value)
            target_feature_items.append({"value": item_value, "estimate": 0})

    reference_values = {item["value"] for item in reference_feature_items}
    for target_item in target_feature_items:
        item_value = target_item["value"]
        if item_value not in reference_values:
            reference_values.add(item_value)
            reference_feature_items.append({"value": item_value, "estimate": 0})

    return target_feature_items, reference_feature_items