# hyperparameter_hunter/i_o/leaderboards.py
"""This module defines the Leaderboard classes that are saved to the
'HyperparameterHunterAssets/Leaderboards' subdirectory. It provides the ability to compare all
Experiment results at a glance

Related
-------
:mod:`hyperparameter_hunter.recorders`
    This module initiates the saving of Experiment entries to Leaderboards"""
##################################################
# Import Miscellaneous Assets
##################################################
from abc import ABCMeta, abstractmethod
from collections import OrderedDict
from numbers import Number
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Tuple


class Leaderboard(metaclass=ABCMeta):
    def __init__(self, data=None):
        """The Leaderboard class is used for reading, updating, and saving leaderboard files within
        the 'HyperparameterHunterAssets/Leaderboards' subdirectory

        Parameters
        ----------
        data: pd.DataFrame, or None, default=None
            The starting state of the Leaderboard. If None, an empty DataFrame is used"""
        self.data = data if data is not None else pd.DataFrame()

    def __getattr__(self, item):
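        # `__getattr__` is only invoked for attributes not found on the Leaderboard itself, so
        # lookups like `self.columns` or `self.shape` fall through to the underlying DataFrame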
        return self.data.__getattribute__(item)

    @classmethod
    def from_path(cls, path, assert_existence=False):
        """Initialize a Leaderboard from a .csv `path`

        Parameters
        ----------
        path: str
            The path of the file to read in as a DataFrame
        assert_existence: boolean, default=False
            If False, and :func:`pandas.read_csv` raises FileNotFoundError, the Leaderboard will be
            initialized with None. Else the exception is raised normally"""
        try:
            data = pd.read_csv(path, index_col=None)
        except FileNotFoundError:
            if assert_existence is True:
                raise
            data = None
        return cls(data=data)

    @abstractmethod
    def add_entry(self, experiment, **kwargs):
        """Add an entry row for `experiment` to :attr:`data`

        Parameters
        ----------
        experiment: :class:`experiments.BaseExperiment`
            An instance of a completed Experiment from which to construct a Leaderboard entry
        **kwargs: Dict
            Extra keyword arguments"""

    def save(self, path, **kwargs):
        """Save the Leaderboard instance

        Parameters
        ----------
        path: str
            The file to which the Leaderboard instance should be saved
        **kwargs: Dict
            Additional arguments to supply to :meth:`pandas.DataFrame.to_csv`"""
        self.data.to_csv(path_or_buf=path, index=False, float_format="%.10f", **kwargs)

    def sort(self, by, ascending=False):
        """Sort the rows in :attr:`data` according to the values of a column

        Parameters
        ----------
        by: str, or list of str
            The column name(s) by which to sort the rows of :attr:`data`
        ascending: boolean, default=False
            The direction in which to sort the rows of :attr:`data`"""
        self.data.sort_values(by=by, axis=0, inplace=True, ascending=ascending)


class GlobalLeaderboard(Leaderboard):
    def add_entry(self, experiment, **kwargs):
        """Add an entry row to :attr:`Leaderboard.data` (pandas.DataFrame). This method also handles
        column conflicts to an extent

        Parameters
        ----------
        experiment: Instance of :class:`experiments.BaseExperiment` descendant
            An Experiment instance for which a leaderboard entry row should be added
        **kwargs: Dict
            Extra keyword arguments"""
        final_evaluations = experiment.last_evaluation_results
        entry_columns, entry_data = [], []
        # TODO: Resolve cases where `data` contains an aliased column for a metric, but the current experiment uses the
        # TODO: ... standard metric name. EX) 'oof_roc' vs 'oof_roc_auc_score' - They should be considered the same - Use alias
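        # Convert the final evaluation results to (column, value) pairs, e.g. ("oof_roc_auc_score", 0.96)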
        evaluation_columns, evaluation_values = list(
            zip(*evaluations_to_columns(final_evaluations))
        )
        entry_columns.extend(evaluation_columns)
        entry_data.extend(evaluation_values)

        identifier_cols = [
            "experiment_id",
            "hyperparameter_key",
            "cross_experiment_key",
            "algorithm_name",
        ]
        entry_columns.extend(identifier_cols)
        for id_col in identifier_cols:
            entry_data.append(str(getattr(experiment, id_col)))

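        # Use the current number of existing rows as this entry's sequential `experiment_#`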
        entry_columns.append("experiment_#")
        entry_data.append(self.data.shape[0])
        identifier_cols.append("experiment_#")

        entry = pd.DataFrame(data=[entry_data], columns=entry_columns)

        # `DataFrame.append` was removed in Pandas 2.0, so concatenate instead, then reorder the
        # combined columns so that the identifier columns stay last
        self.data = pd.concat([self.data, entry], ignore_index=True)[
            combine_column_order(self.data, entry, both_cols=identifier_cols)
        ]


# class AlgorithmLeaderboard(Leaderboard):
#     pass


# class EnvironmentLeaderboard(Leaderboard):
#     pass


def evaluations_to_columns(
    evaluation: Dict[str, Optional[OrderedDict]], decimals=10
) -> List[Tuple[str, Number]]:
    """Convert the results of :meth:`metrics.ScoringMixIn.evaluate` to a pd.DataFrame-ready format

    Parameters
    ----------
    evaluation: Dict[str, Optional[OrderedDict]]
        The result of consecutive calls to :meth:`metrics.ScoringMixIn.evaluate` for all given
        dataset types. Values are None for dataset types that were not evaluated
    decimals: Int, default=10
        Number of decimal places to which to round. If `decimals` is negative, it specifies the
        number of positions to the left of the decimal point

    Returns
    -------
    column_metrics: list of pairs
        A pair for each data_type-metric combination, where the first item is the key, and the
        second is the metric value

    Examples
    --------
    >>> evaluations_to_columns({
    ...     'in_fold': None,
    ...     'holdout': OrderedDict([('roc_auc_score', 0.9856), ('f1_score', 0.9768)]),
    ...     'oof': OrderedDict([('roc_auc_score', 0.9634)])
    ... })
    [('oof_roc_auc_score', 0.9634), ('holdout_roc_auc_score', 0.9856), ('holdout_f1_score', 0.9768)]
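
    The `decimals` argument is forwarded to :func:`numpy.round`, so coarser rounding can be
    requested:

    >>> evaluations_to_columns({
    ...     'in_fold': None,
    ...     'holdout': None,
    ...     'oof': OrderedDict([('roc_auc_score', 0.9634)])
    ... }, decimals=2)
    [('oof_roc_auc_score', 0.96)]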
    """
    data_types = ["oof", "holdout", "in_fold"]
    column_metrics = []

    for data_type in data_types:
        if evaluation[data_type] is not None:
            for metric_key, metric_val in evaluation[data_type].items():
                column_metrics.append((f"{data_type}_{metric_key}", np.round(metric_val, decimals)))

    return column_metrics


def combine_column_order(df_1, df_2, both_cols=None):
    """Determine the sort order for the combined columns of two DataFrames

    Parameters
    ----------
    df_1: pd.DataFrame
        The first DataFrame, whose columns will be sorted. Columns unique to `df_1` will be sorted
        before those of `df_2`
    df_2: pd.DataFrame
        The second DataFrame, whose columns will be sorted. Columns unique to `df_2` will be sorted
        after those of `df_1`
    both_cols: list, or None, default=None
        If list, the column names that should be common to both DataFrames and placed last in the
        sort order

    Returns
    -------
    combined_cols: list of strings
        The result of combining and sorting column names from `df_1`, and `df_2`

    Examples
    --------
    >>> df_1 = pd.DataFrame(columns=['A', 'B', 'C', 'Common_1', 'Common_2'])
    >>> df_2 = pd.DataFrame(columns=['A', 'D', 'E', 'Common_1', 'Common_2'])
    >>> combine_column_order(df_1, df_2, both_cols=['Common_1', 'Common_2'])
    ['A', 'B', 'C', 'D', 'E', 'Common_1', 'Common_2']
    >>> combine_column_order(df_1, df_2, both_cols=None)
    ['A', 'Common_1', 'Common_2', 'B', 'C', 'D', 'E']
    """
    both_cols = both_cols or []
    df_1_cols = [_ for _ in list(df_1.columns) if _ not in both_cols]
    df_2_cols = [_ for _ in list(df_2.columns) if _ not in both_cols]

    common_cols = [_ for _ in df_1_cols if _ in df_2_cols]
    unique_cols_1 = [_ for _ in df_1_cols if _ not in df_2_cols]
    unique_cols_2 = [_ for _ in df_2_cols if _ not in df_1_cols]

    combined_cols = common_cols + unique_cols_1 + unique_cols_2 + both_cols
    return combined_cols


if __name__ == "__main__":
    pass
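    # Minimal usage sketch: `FakeExperiment` is a hypothetical stand-in for a completed
    # Experiment, exposing only the attributes read by `GlobalLeaderboard.add_entry`, since
    # constructing a real :class:`experiments.BaseExperiment` requires a full Environment
    import os
    from collections import namedtuple
    from tempfile import NamedTemporaryFile

    FakeExperiment = namedtuple(
        "FakeExperiment",
        [
            "last_evaluation_results",
            "experiment_id",
            "hyperparameter_key",
            "cross_experiment_key",
            "algorithm_name",
        ],
    )
    fake_experiment = FakeExperiment(
        last_evaluation_results=dict(
            oof=OrderedDict([("roc_auc_score", 0.9634)]),
            holdout=OrderedDict([("roc_auc_score", 0.9856), ("f1_score", 0.9768)]),
            in_fold=None,
        ),
        experiment_id="0" * 32,
        hyperparameter_key="fake_hyperparameter_key",
        cross_experiment_key="fake_cross_experiment_key",
        algorithm_name="FakeClassifier",
    )

    # Build a leaderboard, add the fake entry, and sort by the out-of-fold score
    leaderboard = GlobalLeaderboard()
    leaderboard.add_entry(fake_experiment)
    leaderboard.sort(by="oof_roc_auc_score")

    # Round-trip the leaderboard through a temporary CSV via `save` and `from_path`
    with NamedTemporaryFile(suffix=".csv", delete=False) as temp_file:
        temp_path = temp_file.name
    leaderboard.save(temp_path)
    reloaded = GlobalLeaderboard.from_path(temp_path, assert_existence=True)
    print(reloaded.data)
    os.remove(temp_path)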