HunterMcGushion/hyperparameter_hunter

View on GitHub
hyperparameter_hunter/keys/makers.py

Summary

Maintainability
B
5 hrs
Test Coverage
"""This module handles the creation of `cross_experiment_key` s and `hyperparameter_key` s for
:class:`hyperparameter_hunter.environment.Environment`, and
:class:`hyperparameter_hunter.experiments.BaseExperiment`, respectively. It also handles the
treatment of complex-typed inputs and their storage in the 'KeyAttributeLookup' subdirectory. The
descendants of :class:`hyperparameter_hunter.keys.makers.KeyMaker` defined herein are each
responsible for the generation and saving of their keys, as well as determining whether such a key
already exists.

Related
-------
:mod:`hyperparameter_hunter.environment`
    This module uses :class:`hyperparameter_hunter.keys.makers.CrossExperimentKeyMaker` to set
    :attr:`hyperparameter_hunter.environment.Environment.cross_experiment_key`
:mod:`hyperparameter_hunter.experiments`
    This module uses :class:`hyperparameter_hunter.keys.makers.HyperparameterKeyMaker` to set
    :attr:`hyperparameter_hunter.experiments.BaseExperiment.hyperparameter_key`"""
##################################################
# Import Own Assets
##################################################
from hyperparameter_hunter.feature_engineering import FeatureEngineer, EngineerStep
from hyperparameter_hunter.i_o.exceptions import EnvironmentInvalidError, EnvironmentInactiveError
from hyperparameter_hunter.keys.hashing import make_hash_sha256
from hyperparameter_hunter.compat.keras_helper import (
    keras_callback_to_dict,
    keras_initializer_to_dict,
    parameterize_compiled_keras_model,
)
from hyperparameter_hunter.compat.keras_optimization_helper import initialize_dummy_model
from hyperparameter_hunter.metrics import Metric
from hyperparameter_hunter.sentinels import Sentinel
from hyperparameter_hunter.settings import G
from hyperparameter_hunter.utils.file_utils import write_json, read_json, add_to_json, make_dirs
from hyperparameter_hunter.utils.file_utils import RetryMakeDirs
from hyperparameter_hunter.utils.general_utils import subdict
from hyperparameter_hunter.utils.boltons_utils import remap, default_enter

##################################################
# Import Miscellaneous Assets
##################################################
from abc import ABCMeta, abstractmethod
from copy import deepcopy
from functools import partial
from inspect import isclass, getsource
from os import listdir
import os.path
import pandas as pd
import shelve

##################################################
# Import Learning Assets
##################################################
try:
    from keras.callbacks import Callback as BaseKerasCallback
    from keras.initializers import Initializer as BaseKerasInitializer
except ModuleNotFoundError:
    # Keras could not be imported: bind both names to freshly created, empty placeholder classes
    # so `isinstance` checks against them remain valid expressions
    BaseKerasCallback = type("BaseKerasCallback", (), {})
    BaseKerasInitializer = type("BaseKerasInitializer", (), {})


##################################################
# KeyMaker Base Class:
##################################################
class KeyMaker(metaclass=ABCMeta):
    def __init__(self, parameters, **kwargs):
        """Abstract base for classes that build a key hash from a dict of parameters and check
        whether that key has already been recorded. While building the key, complex-typed
        parameter values are replaced by hashes, and lookup entries linking each hash to its
        original value are saved

        Parameters
        ----------
        parameters: Dict
            All the parameters to be included when creating the key hash. Keys should correspond to
            parameter names, and values should be the values of the corresponding keys
        **kwargs: Dict
            Additional arguments. Not used by this base implementation

        Attributes
        ----------
        parameters: Dict
            A deep copy of the given `parameters` input, in which complex-typed values have been
            replaced by :meth:`handle_complex_types`
        key: Str, or None
            If a key has been generated for `parameters`, it is saved here. Else, None
        exists: Boolean
            True if `key` is not None, and already exists in `tested_keys_dir`. Else, False
        lookup_dir: Str
            The directory in which complex-typed parameter entries will be saved
        tested_keys_dir: Str, or None
            The directory in which `key` will be saved if it does not already contain `key`"""
        self.parameters = deepcopy(parameters)
        self.key, self.exists = None, False
        self.lookup_dir, self.tested_keys_dir = None, None

        # Order matters: the directories must be resolved before complex types are processed
        # (lookup entries are written there), and `parameters` must hold hashes instead of
        # complex values before the key itself is hashed and checked
        self.validate_environment()
        self.handle_complex_types()
        self.make_key()

        self.does_key_exist()

    def __repr__(self):
        return "{}(key={!r})".format(self.__class__.__name__, self.key)

    def __str__(self):
        return str(self.key)

    def __eq__(self, other):
        """Compare :attr:`key` directly against `other`"""
        return self.key == other

    def __ne__(self, other):
        """Instance will always return True for a non-equality check if `key` is unset (None)"""
        if self.key is None:
            return True
        return self.key != other

    ##################################################
    # Core Methods
    ##################################################
    def validate_environment(self):
        """Check that the currently active Environment is suitable, then set :attr:`lookup_dir`
        and :attr:`tested_keys_dir` from its `result_paths`, creating the latter directory if it
        does not exist yet

        Raises
        ------
        EnvironmentInactiveError
            If `G.Env` is None
        EnvironmentInvalidError
            If `G.Env` lacks either of the attributes "result_paths", or "cross_experiment_key\""""
        if G.Env is None:
            raise EnvironmentInactiveError("")

        required_attrs = ("result_paths", "cross_experiment_key")
        if not all(hasattr(G.Env, attr) for attr in required_attrs):
            raise EnvironmentInvalidError("")

        try:
            self.lookup_dir = G.Env.result_paths["key_attribute_lookup"]
            self.tested_keys_dir = G.Env.result_paths["tested_keys"]
            # TypeError is raised here if :func:`os.path.exists` receives invalid input
            directory_found = os.path.exists(self.tested_keys_dir)
        except TypeError:
            directory_found = False

        if directory_found:
            return
        if self.tested_keys_dir is None:  # Key-making blacklisted
            return
        # Ensure :attr:`tested_keys_dir` exists before :meth:`does_key_exist` is called
        make_dirs(self.tested_keys_dir)

    def handle_complex_types(self):
        """Walk :attr:`parameters`, replacing complex-typed values with simpler stand-ins (dicts
        for Keras callbacks/initializers, the `sentinel` attribute for Sentinels, and hashes for
        callables and DataFrames). Lookup entries linking hashed values to their originals are
        added unless :attr:`tested_keys_dir` is None. A warning is issued if multiple DataFrame
        parameters produce the same hash"""
        # Maps each DataFrame hash to the parameter names whose values produced it
        dataframe_hashes = {}

        def _enter(path, key, value):
            """Produce the iterable of attributes to remap for instances of
            :class:`metrics.Metric`, `EngineerStep`, and `FeatureEngineer`. All other values are
            delegated to `default_enter`"""
            if isinstance(value, Metric):
                attr_names = ("name", "metric_function", "direction")
                return ({}, [(name, getattr(value, name)) for name in attr_names])

            if isinstance(value, (EngineerStep, FeatureEngineer)):
                return ({}, list(value.get_key_data().items()))

            return default_enter(path, key, value)

        def _visit(path, key, value):
            """Return `key` paired with either the unchanged `value`, or a simplified/hashed
            replacement if `value` is of a complex type

            Parameters
            ----------
            path: Tuple
                The path of keys that leads to `key`
            key: Str
                The parameter name
            value: *
                The value of the parameter `key`

            Returns
            -------
            Tuple of (`key`, value), in which value is either unchanged or a replacement for the
            original `value`"""
            if isinstance(value, BaseKerasCallback):
                return (key, keras_callback_to_dict(value))
            if isinstance(value, BaseKerasInitializer):
                return (key, keras_initializer_to_dict(value))
            if isinstance(value, Sentinel):
                return (key, value.sentinel)

            is_dataframe = isinstance(value, pd.DataFrame)
            if not (callable(value) or is_dataframe):
                return (key, value)

            # FLAG: Look into adding package version number to hashed attributes
            hashed_value = make_hash_sha256(value)

            if is_dataframe:
                dataframe_hashes.setdefault(hashed_value, []).append(key)

            if self.tested_keys_dir is not None:  # Key-making not blacklisted
                self.add_complex_type_lookup_entry(path, key, value, hashed_value)
            return (key, hashed_value)

        self.parameters = remap(self.parameters, visit=_visit, enter=_enter)

        #################### Check for Identical DataFrames ####################
        for df_names in dataframe_hashes.values():
            if len(df_names) > 1:
                G.warn(f"The dataframes: {df_names} are identical. Scores may be misleading!")

    @RetryMakeDirs()
    def add_complex_type_lookup_entry(self, path, key, value, hashed_value):
        """Save a lookup entry beneath :attr:`lookup_dir` for a complex-typed parameter, linking
        the parameter `key`, its `value`, and its `hashed_value`. Classes and the parameters named
        "model_initializer"/"cv_type" are stored in a shelve file, DataFrames are written to CSV,
        and the source code of anything else is added to a JSON file

        Parameters
        ----------
        path: Tuple
            The path of keys that leads to `key`
        key: Str
            The parameter name
        value: *
            The value of the parameter `key`
        hashed_value: Str
            The hash produced for `value`"""
        shelve_params = ("model_initializer", "cv_type")
        path_parts = [f"{_}" for _ in path]

        def lookup_path(*tail):
            """Join :attr:`lookup_dir`, the stringified `path`, and any extra components"""
            return os.path.join(self.lookup_dir, *path_parts, *tail)

        if isclass(value) or (key in shelve_params):
            make_dirs(lookup_path(), exist_ok=True)

            with shelve.open(lookup_path(f"{key}"), flag="c") as shelf:
                # NOTE: When reading from shelve file, DO NOT add the ".db" file extension
                shelf[hashed_value] = value
            return

        if isinstance(value, pd.DataFrame):
            make_dirs(lookup_path(key), exist_ok=True)
            value.to_csv(lookup_path(key, f"{hashed_value}.csv"), index=False)
            return

        # Possible types: partial, function, *other
        # NOTE(review): `getsource` looks like it would raise for values it cannot find source
        #   for (such as `functools.partial` instances) - confirm whether those can reach here
        add_to_json(
            file_path=lookup_path(f"{key}.json"),
            data_to_add=getsource(value),
            key=hashed_value,
            condition=lambda existing: hashed_value not in existing.keys(),
            default={},
        )

    def make_key(self):
        """Set :attr:`key` to an sha256 hash for a filtered deep copy of :attr:`parameters`"""
        parameters_copy = deepcopy(self.parameters)
        parameters_to_hash = self._filter_parameters_to_hash(parameters_copy)
        self.key = make_hash_sha256(parameters_to_hash)

    @staticmethod
    def _filter_parameters_to_hash(parameters):
        """Produce a filtered version of `parameters` that does not include values that should be
        ignored during hashing. This base implementation performs no filtering

        Parameters
        ----------
        parameters: Dict
            The full dictionary of initial parameters to be filtered

        Returns
        -------
        parameters: Dict
            The given `parameters`, unchanged"""
        return parameters

    ##################################################
    # Abstract Methods
    ##################################################
    @property
    @abstractmethod
    def key_type(self) -> str:
        """Str in ["hyperparameter", "cross_experiment"], denoting the key type being processed"""

    @abstractmethod
    def does_key_exist(self) -> bool:
        """Check if key hash exists among saved keys in the contents of :attr:`tested_keys_dir`"""

    @abstractmethod
    def save_key(self):
        """Save the key hash and the parameters used to make it to :attr:`tested_keys_dir`"""


class CrossExperimentKeyMaker(KeyMaker):
    key_type = "cross_experiment"

    def __init__(self, parameters, **kwargs):
        """A KeyMaker class dedicated to creating cross-experiment keys, which determine when
        experiments were executed under sufficiently similar conditions to permit proper comparison.
        Two separate instances of :class:`environment.Environment` should produce identical
        `cross_experiment_key` s if their arguments are the same (or close enough)

        Parameters
        ----------
        parameters: Dict
            All the parameters to be included when creating the key hash. Keys should correspond to
            parameter names, and values should be the values of the corresponding keys
        **kwargs: Dict
            Additional arguments supplied to :meth:`keys.makers.KeyMaker.__init__`"""
        KeyMaker.__init__(self, parameters, **kwargs)

    def does_key_exist(self):
        """Check if a file corresponding to this cross_experiment_key already exists in
        :attr:`tested_keys_dir`, and update :attr:`exists` accordingly. File extensions are
        ignored when comparing filenames to :attr:`key`

        Returns
        -------
        Boolean
            True if a file named after :attr:`key` is in :attr:`tested_keys_dir`. False otherwise,
            including when :attr:`tested_keys_dir` is None"""
        if self.tested_keys_dir is None:
            # There is no directory of tested keys to search. This check is required because
            # `listdir(None)` would list the current working directory, rather than fail
            self.exists = False
            return self.exists

        self.exists = any(
            os.path.splitext(filename)[0] == self.key for filename in listdir(self.tested_keys_dir)
        )
        return self.exists

    def save_key(self):
        """Create a new, empty JSON file for this cross_experiment_key in :attr:`tested_keys_dir`
        if :attr:`exists` is False. Otherwise, only log that saving was skipped"""
        if self.exists:
            G.log(f'{self.key_type}_key "{self.key}" already exists - Skipped saving', 4)
            return

        # NOTE(review): If `tested_keys_dir` is None, this path starts with the literal "None".
        #   Presumably callers skip saving when key-making is blacklisted - confirm
        write_json(f"{self.tested_keys_dir}/{self.key}.json", {})
        self.exists = True
        G.log(f'Saved {self.key_type}_key: "{self.key}"', 4)


class HyperparameterKeyMaker(KeyMaker):
    key_type = "hyperparameter"

    def __init__(self, parameters, cross_experiment_key, **kwargs):
        """A KeyMaker class dedicated to creating hyperparameter keys, which determine when
        experiments were executed using identical hyperparameters. Two separate instances of
        :class:`experiments.CVExperiment` should produce identical `hyperparameter_key` s if their
        hyperparameters are the same (or close enough)

        Parameters
        ----------
        parameters: Dict
            All the parameters to be included when creating the key hash. Keys should correspond to
            parameter names, and values should be the values of the corresponding keys
        cross_experiment_key: Str
            The key produced by the active Environment via
            :class:`keys.makers.CrossExperimentKeyMaker`, used for determining when a
            hyperparameter key has already been tested under the same cross-experiment parameters
        **kwargs: Dict
            Additional arguments supplied to :meth:`keys.makers.KeyMaker.__init__`"""
        self.cross_experiment_key = cross_experiment_key
        self.is_task_keras = (
            hasattr(G.Env, "current_task")
            and G.Env.current_task
            and G.Env.current_task.module_name == "keras"
        )

        if self.is_task_keras:
            # Work on a copy, so the caller's `parameters` are not modified by the updates below
            parameters = deepcopy(parameters)

            #################### Initialize and Parameterize Dummy Model ####################
            dummy_model = initialize_dummy_model(
                parameters["model_initializer"],
                parameters["model_init_params"]["build_fn"],
                parameters["model_extra_params"],
            )
            layers, compile_params = parameterize_compiled_keras_model(dummy_model)

            #################### Process Parameters ####################
            # Visit callback returns False for keys in ('input_shape', 'input_dim'). Else, True
            layers = remap(
                layers, visit=lambda path, key, value: key not in ("input_shape", "input_dim")
            )

            init_params = parameters["model_init_params"]
            init_params["layers"] = layers
            init_params["compile_params"] = compile_params

            extra_params = parameters["model_extra_params"]
            parameters["model_extra_params"] = subdict(extra_params, drop=["params"])

        KeyMaker.__init__(self, parameters, **kwargs)

    def _filter_parameters_to_hash(self, parameters):
        """Produce a filtered version of `parameters` that does not include hyperparameters that
        should be ignored during hashing, such as those pertaining to verbosity, seeds, and random
        states, as they have no effect on HyperparameterHunter experiment results. If
        :attr:`is_task_keras` is truthy, "build_fn" is ignored, as well

        Parameters
        ----------
        parameters: Dict
            Full dictionary of initial parameters to be filtered. Its "model_init_params" and
            "model_extra_params" entries are replaced in place

        Returns
        -------
        parameters: Dict
            Filtered version of the given `parameters`"""
        reject = [
            "verbose",
            "verbosity",
            "silent",
            "random_state",
            "random_seed",
            "seed",
            "n_jobs",
            "nthread",
        ]

        if self.is_task_keras:
            reject.append("build_fn")

        for params_group in ("model_init_params", "model_extra_params"):
            parameters[params_group] = subdict(parameters[params_group], drop=reject)
        return parameters

    def does_key_exist(self):
        """Check that 1) :attr:`cross_experiment_key` has `exists` set to True, 2) the JSON file
        named after its `key` contains the key :attr:`key`, and 3) the value at :attr:`key` is a
        non-empty list. :attr:`exists` is set to True only if all three conditions are met

        Returns
        -------
        Boolean"""
        if self.cross_experiment_key.exists is not True:
            return self.exists

        records = read_json(f"{self.tested_keys_dir}/{self.cross_experiment_key.key}.json")

        if self.key in records.keys():
            experiments_run = records[self.key]
            if isinstance(experiments_run, list) and experiments_run:
                self.exists = True

        return self.exists

    def save_key(self):
        """Create an entry in the dict contained in the file named after
        :attr:`cross_experiment_key.key`, whose key is :attr:`key`, and whose value is an empty
        list if :attr:`exists` is False. Otherwise, only log that saving was skipped

        Raises
        ------
        ValueError
            If :attr:`exists` is False, and `exists` of :attr:`cross_experiment_key` is False"""
        if self.exists:
            G.log(f'{self.key_type}_key "{self.key}" already exists - Skipped saving', 4)
            return

        if self.cross_experiment_key.exists is False:
            _err = "Cannot save hyperparameter_key: '{}', before cross_experiment_key '{}'"
            raise ValueError(_err.format(self.key, self.cross_experiment_key.key))

        key_path = f"{self.tested_keys_dir}/{self.cross_experiment_key.key}.json"
        add_to_json(
            key_path, [], key=self.key, condition=lambda saved: self.key not in saved.keys()
        )

        self.exists = True
        G.log(f'Saved {self.key_type}_key: "{self.key}"', 4)