hyperparameter_hunter/feature_engineering.py
"""This module organizes and executes feature engineering/preprocessing step functions. The central
components of the module are :class:`FeatureEngineer` and :class:`EngineerStep` - everything else
is built to support those two classes. This module works with a very broad definition of
"feature engineering". The following is a non-exhaustive list of transformations that are
considered valid for `FeatureEngineer` step functions:
* Manual feature creation
* Input data scaling/normalization/standardization
* Target data transformation
* Re-sampling
* Data imputation
* Feature selection/elimination
* Encoding (one-hot, label, etc.)
* Binarization/binning/discretization
* Feature extraction (as for NLP/image recognition tasks)
* Feature shuffling
Related
-------
:mod:`hyperparameter_hunter.space`
Only related when optimizing `FeatureEngineer` steps within an Optimization Protocol, but
defines :class:`~hyperparameter_hunter.space.dimensions.Categorical`, which is the mechanism for
defining a feature engineer step search space, and
:class:`~hyperparameter_hunter.space.dimensions.RejectedOptional`, which is used to represent
the absence of a feature engineer step, when labeled as `optional`"""
##################################################
# Import Own Assets
##################################################
from hyperparameter_hunter.keys.hashing import make_hash_sha256
from hyperparameter_hunter.space.dimensions import Categorical, RejectedOptional
from hyperparameter_hunter.utils.boltons_utils import remap, default_visit, default_enter
from hyperparameter_hunter.utils.general_utils import subdict
##################################################
# Import Miscellaneous Assets
##################################################
import ast
from contextlib import suppress
from inspect import getsource
import numpy as np
import pandas as pd
from typing import List, Callable, Dict, Union, Tuple
##################################################
# Global Variables
##################################################
EMPTY_SENTINEL = type("EMPTY_SENTINEL", tuple(), {})
DFDict = Dict[str, pd.DataFrame]
DescendantsType = Dict[str, Union["DescendantsType", None]]
N_DATASET_TRAIN = ["train_data", "train_inputs", "train_targets"]
N_DATASET_VALIDATION = ["validation_data", "validation_inputs", "validation_targets"]
N_DATASET_HOLDOUT = ["holdout_data", "holdout_inputs", "holdout_targets"]
N_DATASET_TEST = ["test_inputs"]
N_DATASET_ALL = ["all_data", "all_inputs", "all_targets"]
N_DATASET_NON_TRAIN = ["non_train_data", "non_train_inputs", "non_train_targets"]
STANDARD_DATASET_NAMES = N_DATASET_TRAIN + N_DATASET_VALIDATION + N_DATASET_HOLDOUT + N_DATASET_TEST
MERGED_DATASET_NAMES = N_DATASET_ALL + N_DATASET_NON_TRAIN
COUPLED_DATASET_CANDIDATES = [
N_DATASET_TRAIN,
N_DATASET_VALIDATION,
N_DATASET_HOLDOUT,
N_DATASET_ALL,
N_DATASET_NON_TRAIN,
]
class DatasetNameReport:
def __init__(self, params: Tuple[str], stage: str):
"""Characterize the relationships between the dataset names `params`
Parameters
----------
params: Tuple[str]
Dataset names requested by a feature engineering step callable. Must be a subset of
{"train_data", "train_inputs", "train_targets", "validation_data", "validation_inputs",
"validation_targets", "holdout_data", "holdout_inputs", "holdout_targets",
"test_inputs", "all_data", "all_inputs", "all_targets", "non_train_data",
"non_train_inputs", "non_train_targets"}
stage: String in {"pre_cv", "intra_cv"}
Feature engineering stage during which the datasets `params` are requested
Attributes
----------
merged_datasets: List[tuple]
Tuples of strings denoting paths to datasets that represent a merge between multiple
datasets. Merged datasets are those prefixed with either "all" or "non_train". These
paths are locations in `descendants`
coupled_datasets: List[tuple]
Tuples of strings denoting paths to datasets that represent a coupling of "inputs" and
"targets" datasets. Coupled datasets are those suffixed with "data". These paths are
locations in `descendants`, and the values at each path should be a dict containing keys
with "inputs" and "targets" suffixes
leaves: Dict[tuple, str]
Mapping of full path tuples in `descendants` to their leaf values. Tuple paths represent
the steps necessary to reach the standard dataset leaf value in `descendants` by
traversing merged and coupled datasets. Values in `leaves` should be identical to the
last element of the corresponding tuple key
descendants: DescendantsType
Nested dict in which all keys are dataset name strings, and all leaf values are `None`.
Represents the structure of the requested dataset names, traversing over merged and
coupled datasets (if necessary) in order to reach the standard dataset leaves"""
self.params: Tuple[str] = params
self.stage: str = stage
self.merged_datasets: List[tuple] = []
self.coupled_datasets: List[tuple] = []
self.leaves: Dict[tuple, str] = dict()
self.descendants: DescendantsType = remap(
{_: _ for _ in self.params}, visit=self._visit, enter=self._enter, use_registry=False
)
@staticmethod
def _visit(path, key, value):
"""If `key` == `value`, return tuple of (`key`, None). Else `default_visit`"""
if key and key == value:
return (key, None)
return default_visit(path, key, value)
def _enter(self, path, key, value):
"""Update contents of `merged_datasets`, `coupled_datasets`, and `leaves` and direct
traversal of the sub-datasets that compose the current dataset name"""
#################### Merged Datasets ####################
if value in MERGED_DATASET_NAMES:
self.merged_datasets.append(path + (key,))
_names_for_merge = names_for_merge(value, self.stage)
return dict(), zip(_names_for_merge, _names_for_merge)
#################### Coupled Datasets ####################
for coupled_candidate in COUPLED_DATASET_CANDIDATES:
if value == coupled_candidate[0]:
self.coupled_datasets.append(path + (key,))
return dict(), zip(coupled_candidate[1:], coupled_candidate[1:])
#################### Leaf Datasets ####################
if key:
self.leaves[path + (key,)] = key
return default_enter(path, key, value)
def names_for_merge(merge_to: str, stage: str) -> List[str]:
"""Retrieve the names of the standard datasets that are allowed to be included in a merged
DataFrame of type `merge_to` at stage `stage`
Parameters
----------
merge_to: String
Type of merged dataframe to produce. Should be one of the following: {"all_data",
"all_inputs", "all_targets", "non_train_data", "non_train_inputs", "non_train_targets"}
stage: String in {"pre_cv", "intra_cv}
Feature engineering stage for which the merged dataframe is requested. The results produced
with each option differ only in that a `merged_df` created with `stage="pre_cv"` will never
contain "validation" data because it doesn't exist before cross-validation has begun.
Conversely, a `merged_df` created with `stage="intra_cv"` will contain the appropriate
"validation" data if it exists
Returns
-------
names: List
Subset of {"train_data", "train_inputs", "train_targets", "validation_data",
"validation_inputs", "validation_targets", "holdout_data", "holdout_inputs",
"holdout_targets", "test_inputs"}
Examples
--------
>>> names_for_merge("all_data", "intra_cv")
['train_data', 'validation_data', 'holdout_data']
>>> names_for_merge("all_inputs", "intra_cv")
['train_inputs', 'validation_inputs', 'holdout_inputs', 'test_inputs']
>>> names_for_merge("all_targets", "intra_cv")
['train_targets', 'validation_targets', 'holdout_targets']
>>> names_for_merge("all_data", "pre_cv")
['train_data', 'holdout_data']
>>> names_for_merge("all_inputs", "pre_cv")
['train_inputs', 'holdout_inputs', 'test_inputs']
>>> names_for_merge("all_targets", "pre_cv")
['train_targets', 'holdout_targets']
>>> names_for_merge("non_train_data", "intra_cv")
['validation_data', 'holdout_data']
>>> names_for_merge("non_train_inputs", "intra_cv")
['validation_inputs', 'holdout_inputs', 'test_inputs']
>>> names_for_merge("non_train_targets", "intra_cv")
['validation_targets', 'holdout_targets']
>>> names_for_merge("non_train_data", "pre_cv")
['holdout_data']
>>> names_for_merge("non_train_inputs", "pre_cv")
['holdout_inputs', 'test_inputs']
>>> names_for_merge("non_train_targets", "pre_cv")
['holdout_targets']"""
merge_type, data_group = merge_to.rsplit("_", 1)
names = [_ for _ in STANDARD_DATASET_NAMES if _.endswith(data_group)]
if stage == "pre_cv":
names = [_ for _ in names if _ not in N_DATASET_VALIDATION]
if merge_type == "non_train":
names = [_ for _ in names if not _.startswith("train")]
return names
def merge_dfs(merge_to: str, stage: str, dfs: DFDict) -> pd.DataFrame:
"""Construct a multi-indexed DataFrame containing the values of `dfs` deemed necessary by
`merge_to` and `stage`. This is the opposite of `split_merged_df`
Parameters
----------
merge_to: String
Type of `merged_df` to produce. Should be one of the following: {"all_data", "all_inputs",
"all_targets", "non_train_data", "non_train_inputs", "non_train_targets"}
stage: String in {"pre_cv", "intra_cv}
Feature engineering stage for which `merged_df` is requested
dfs: Dict
Mapping of dataset names to their DataFrame values. Keys in `dfs` should be a subset of
{"train_data", "train_inputs", "train_targets", "validation_data", "validation_inputs",
"validation_targets", "holdout_data", "holdout_inputs", "holdout_targets", "test_inputs"}
Returns
-------
merged_df: pd.DataFrame
Multi-indexed DataFrame, in which the first index is a string naming the dataset in `dfs`
from which the corresponding data originates. The following index(es) are the original
index(es) from the dataset in `dfs`. All primary indexes in `merged_df` will be one of the
strings considered to be valid keys for `dfs`
Raises
------
ValueError
If all the DataFrames that would have been used in `merged_df` are None. This can happen if
requesting `merge_to="non_train_targets"` during `stage="pre_cv"` when there is no holdout
dataset available. Under these circumstances, the holdout dataset targets would be the sole
contents of `merged_df`, rendering `merged_df` invalid since the data is unavailable
See Also
--------
names_for_merge: Describes how `stage` values differ"""
df_names = names_for_merge(merge_to, stage)
df_names = [_ for _ in df_names if isinstance(dfs.get(_, None), pd.DataFrame)]
try:
merged_df = pd.concat([dfs[_] for _ in df_names], keys=df_names)
except ValueError as _ex:
raise ValueError(f"Merging {df_names} into {merge_to} does not produce DataFrame") from _ex
# TODO: Add more specific error message for below scenario?
# Tricky: This will be raised when `stage`="pre_cv" and `merge_to`="non_train..." if
# holdout/test data not available in `dfs`. May occur, for example, when using a step
# function that requests "non_train_inputs", with an `Environment` that has neither
# `holdout_dataset` nor `test_dataset` IF attempting to force the `EngineerStep`'s
# `stage`="pre_cv", instead of its default "intra_cv". This is correct behavior because
# "non_train_inputs" cannot be made under these circumstances; however, the precise cause
# of the problem may not be immediately apparent
return merged_df
def split_merged_df(merged_df: pd.DataFrame) -> DFDict:
"""Separate a multi-indexed DataFrame into a dict mapping primary indexes in `merged_df` to
DataFrames containing one fewer dimension than `merged_df`. This is the opposite of `merge_dfs`
Parameters
----------
merged_df: pd.DataFrame
Multi-indexed DataFrame of the form returned by :func:`merge_dfs` to split into the separate
DataFrames named by the primary indexes of `merged_df`
Returns
-------
dfs: Dict
Mapping of dataset names to their DataFrame values. Keys in `dfs` will be a subset of
{"train_data", "train_inputs", "train_targets", "validation_data", "validation_inputs",
"validation_targets", "holdout_data", "holdout_inputs", "holdout_targets", "test_inputs"}
containing only those values that are also primary indexes in `merged_df`"""
dfs = dict()
for df_index in merged_df.index.levels[0]:
dfs[df_index] = merged_df.loc[df_index, :].copy()
return dfs
def validate_dataset_names(params: Tuple[str], stage: str) -> List[str]:
"""Produce the names of merged datasets in `params` and verify there are no duplicate references
to any datasets in `params`
Parameters
----------
params: Tuple[str]
Dataset names requested by a feature engineering step callable. Must be a subset of
{"train_data", "train_inputs", "train_targets", "validation_data", "validation_inputs",
"validation_targets", "holdout_data", "holdout_inputs", "holdout_targets",
"test_inputs", "all_data", "all_inputs", "all_targets", "non_train_data",
"non_train_inputs", "non_train_targets"}
stage: String in {"pre_cv", "intra_cv}
Feature engineering stage for which `merged_df` is requested
Returns
-------
List[str]
Names of merged datasets in `params`
Raises
------
ValueError
If requested `params` contain a duplicate reference to any dataset, either by way of
merging/coupling or not"""
report = DatasetNameReport(params, stage)
reverse_multidict = dict()
for leaf_path, leaf_name in report.leaves.items():
reverse_multidict.setdefault(leaf_name, set()).add(leaf_path)
for leaf_name, leaf_paths in reverse_multidict.items():
if len(leaf_paths) > 1:
err_str = f"Requested params include duplicate references to `{leaf_name}` by way of:"
err_str += "".join([f"\n - {a_path}" for a_path in sorted(leaf_paths)])
err_str += "\nEach dataset may only be requested by a single param for each function"
raise ValueError(err_str)
return [_[0] if len(_) == 1 else _ for _ in report.merged_datasets]
class EngineerStep:
def __init__(self, f: Callable, stage=None, name=None, params=None, do_validate=False):
"""Container for individual :class:`FeatureEngineer` step functions
Compartmentalizes functions of singular engineer steps and allows for greater customization
than a raw engineer step function
Parameters
----------
f: Callable
Feature engineering step function that requests, modifies, and returns datasets `params`
Step functions should follow these guidelines:
1. Request as input a subset of the 11 data strings listed in `params`
2. Do whatever you want to the DataFrames given as input
3. Return new DataFrame values of the input parameters in same order as requested
If performing a task like target transformation, causing predictions to be transformed,
it is often desirable to inverse-transform the predictions to be of the expected form.
This can easily be done by returning an extra value from `f` (after the datasets) that
is either a callable, or a transformer class that was fitted during the execution of `f`
and implements an `inverse_transform` method. This is the only instance in which it is
acceptable for `f` to return values that don't mimic its input parameters. See the
engineer function definition using SKLearn's `QuantileTransformer` in the Examples
section below for an actual inverse-transformation-compatible implementation
stage: String in {"pre_cv", "intra_cv"}, or None, default=None
Feature engineering stage during which the callable `f` will be given the datasets
`params` to modify and return. If None, will be inferred based on `params`.
* "pre_cv" functions are applied only once in the experiment: when it starts
* "intra_cv" functions are reapplied for each fold in the cross-validation splits
If `stage` is left to be inferred, "pre_cv" will *usually* be selected. However, if
any `params` (or parameters in the signature of `f`) are prefixed with "validation..."
or "non_train...", then `stage` will inferred as "intra_cv". See the Notes section
below for suggestions on the `stage` to use for different functions
name: String, or None, default=None
Identifier for the transformation applied by this engineering step. If None,
`f.__name__` will be used
params: Tuple[str], or None, default=None
Dataset names requested by feature engineering step callable `f`. If None, will be
inferred by parsing the signature of `f`. Must be a subset of the following 11 strings:
Input Data
1. "train_inputs"
2. "validation_inputs"
3. "holdout_inputs"
4. "test_inputs"
5. "all_inputs"
``("train_inputs" + ["validation_inputs"] + "holdout_inputs" + "test_inputs")``
6. "non_train_inputs"
``(["validation_inputs"] + "holdout_inputs" + "test_inputs")``
Target Data
7. "train_targets"
8. "validation_targets"
9. "holdout_targets"
10. "all_targets"
``("train_targets" + ["validation_targets"] + "holdout_targets")``
11. "non_train_targets"
``(["validation_targets"] + "holdout_targets")``
As an alternative to the above list, just remember that the first half of all parameter
names should be one of {"train", "validation", "holdout", "test", "all", "non_train"},
and the second half should be either "inputs" or "targets". The only exception to this
rule is "test_targets", which doesn't exist.
Inference of "validation" `params` is affected by `stage`. During the "pre_cv" stage,
the validation dataset has not yet been created and is still a part of the train
dataset. During the "intra_cv" stage, the validation dataset is created by removing a
portion of the train dataset, and their values passed to `f` reflect this fact. This
also means that the values of the merged ("all"/"non_train"-prefixed) datasets may or
may not contain "validation" data depending on the `stage`; however, this is all handled
internally, so you probably don't need to worry about it.
`params` may not include multiple references to the same dataset, either directly or
indirectly. This means `("train_inputs", "train_inputs")` is invalid due to duplicate
direct references. Less obviously, `("train_inputs", "all_inputs")` is invalid because
"all_inputs" includes "train_inputs"
do_validate: Boolean, or "strict", default=False
... Experimental...
Whether to validate the datasets resulting from feature engineering steps. If True,
hashes of the new datasets will be compared to those of the originals to ensure they
were actually modified. Results will be logged. If `do_validate` = "strict", an
exception will be raised if any anomalies are found, rather than logging a message. If
`do_validate` = False, no validation will be performed
See Also
--------
:class:`FeatureEngineer`
The container for `EngineerStep` instances - `EngineerStep`s should always be provided
to HyperparameterHunter through a `FeatureEngineer`
:class:`~hyperparameter_hunter.space.dimensions.Categorical`
Can be used during optimization to search through a group of `EngineerStep`s given as
`categories`. The `optional` kwarg of `Categorical` designates a `FeatureEngineer` step
that may be one of the `EngineerStep`s in `categories`, or may be omitted entirely
:func:`get_engineering_step_stage`
More information on `stage` inference and situations where overriding it may be prudent
Notes
-----
`stage`: Generally, feature engineering conducted in the "pre_cv" stage should regard each
sample/row as independent entities. For example, steps like converting a string day of the
week to one-hot encoded columns, or imputing missing values by replacement with -1 might be
conducted "pre_cv", since they are unlikely to introduce an information leakage. Conversely,
steps like scaling/normalization, whose results for the data in one row are affected by the
data in other rows should be performed "intra_cv" in order to recalculate the final values
of the datasets for each cross validation split and avoid information leakage.
`params`: In the list of the 11 valid `params` strings, "test_inputs" is notably missing the
"..._targets" counterpart accompanying the other datasets. The "targets" suffix is missing
because test data targets are never given. Note that although "test_inputs" is still
included in both "all_inputs" and "non_train_inputs", its lack of a target column means that
"all_targets" and "non_train_targets" may have different lengths than their
"inputs"-suffixed counterparts
Examples
--------
>>> from sklearn.preprocessing import StandardScaler, QuantileTransformer
>>> def s_scale(train_inputs, non_train_inputs):
... s = StandardScaler()
... train_inputs[train_inputs.columns] = s.fit_transform(train_inputs.values)
... non_train_inputs[train_inputs.columns] = s.transform(non_train_inputs.values)
... return train_inputs, non_train_inputs
>>> # Sensible parameter defaults inferred based on `f`
>>> es_0 = EngineerStep(s_scale)
>>> es_0.stage
'intra_cv'
>>> es_0.name
's_scale'
>>> es_0.params
('train_inputs', 'non_train_inputs')
>>> # Override `stage` if you want to fit your scaler on OOF data like a crazy person
>>> es_1 = EngineerStep(s_scale, stage="pre_cv")
>>> es_1.stage
'pre_cv'
*Watch out for multiple requests to the same data*
>>> es_2 = EngineerStep(s_scale, params=("train_inputs", "all_inputs"))
Traceback (most recent call last):
File "feature_engineering.py", line ? in validate_dataset_names
ValueError: Requested params include duplicate references to `train_inputs` by way of:
- ('all_inputs', 'train_inputs')
- ('train_inputs',)
Each dataset may only be requested by a single param for each function
*Error is the same if `(train_inputs, all_inputs)` is in the actual function signature*
*EngineerStep functions aren't just limited to transformations. Make your own features!*
>>> def sqr_sum(all_inputs):
... all_inputs["square_sum"] = all_inputs.agg(
... lambda row: np.sqrt(np.sum([np.square(_) for _ in row])), axis="columns"
... )
... return all_inputs
>>> es_3 = EngineerStep(sqr_sum)
>>> es_3.stage
'pre_cv'
>>> es_3.name
'sqr_sum'
>>> es_3.params
('all_inputs',)
*Inverse-transformation Implementation:*
>>> def q_transform(train_targets, non_train_targets):
... t = QuantileTransformer(output_distribution="normal")
... train_targets[train_targets.columns] = t.fit_transform(train_targets.values)
... non_train_targets[train_targets.columns] = t.transform(non_train_targets.values)
... return train_targets, non_train_targets, t
>>> # Note that `train_targets` and `non_train_targets` must still be returned in order,
>>> # but they are followed by `t`, an instance of `QuantileTransformer` we just fitted,
>>> # whose `inverse_transform` method will be called on predictions
>>> es_4 = EngineerStep(q_transform)
>>> es_4.stage
'intra_cv'
>>> es_4.name
'q_transform'
>>> es_4.params
('train_targets', 'non_train_targets')
>>> # `params` does not include any returned transformers - Only data requested as input
"""
self._f = f
self._name = name
self.params = params
self._stage = stage
self.do_validate = do_validate
self.inversion = None
self.merged_datasets: List[str] = validate_dataset_names(self.params, self.stage)
self.original_hashes = dict()
self.updated_hashes = dict()
def __call__(self, **datasets: DFDict) -> DFDict:
"""Apply :attr:`f` to `datasets` to produce updated datasets. If `f` requests any
merged/coupled datasets (as reflected by :attr:`params`), conversions to accommodate those
requests will take place here
Parameters
----------
**datasets: DFDict
Original dict of datasets, containing all datasets, some of which may be superfluous, or
may require additional processing to resolve merged/coupled datasets
Returns
-------
new_datasets: DFDict
Dict of datasets, which have been updated by :attr:`f`. Any datasets that may have been
merged prior to being given to :attr:`f` have been split back into the original
datasets, with the updates made by :attr:`f`"""
if self.do_validate:
self.original_hashes = hash_datasets(datasets)
datasets_for_f = self.get_datasets_for_f(datasets)
step_result = self.f(**datasets_for_f)
step_result = (step_result,) if not isinstance(step_result, tuple) else step_result
if len(step_result) == len(self.params) + 1:
self.inversion, step_result = step_result[-1], step_result[:-1]
new_datasets = dict(zip(self.params, step_result))
for dataset_name, dataset_value in new_datasets.items():
if dataset_name in self.merged_datasets:
new_datasets = dict(new_datasets, **split_merged_df(dataset_value))
new_datasets = dict(datasets, **new_datasets)
if self.do_validate:
self.updated_hashes = hash_datasets(new_datasets)
return new_datasets
def inverse_transform(self, data):
"""Perform the inverse transformation for this engineer step (if it exists)
Parameters
----------
data: Array-like
Data to inverse transform with :attr:`inversion` or :attr:`inversion.inverse_transform`
Returns
-------
Array-like
If :attr:`inversion` is None, return `data` unmodified. Else, return the result of
:attr:`inversion` or :attr:`inversion.inverse_transform`, given `data`"""
if not self.inversion:
return data
elif callable(getattr(self.inversion, "inverse_transform", None)):
return self.inversion.inverse_transform(data)
elif callable(self.inversion):
return self.inversion(data)
raise TypeError(
f"`inversion` must be callable, or class with `inverse_transform`, not {self.inversion}"
)
def get_datasets_for_f(self, datasets: DFDict) -> DFDict:
"""Produce a dict of DataFrames containing only the merged datasets and standard datasets
requested in :attr:`params`. In other words, add the requested merged datasets and remove
unnecessary standard datasets
Parameters
----------
datasets: DFDict
Original dict of datasets, containing all datasets provided to
:meth:`EngineerStep.__call__`, some of which may be superfluous, or may require
additional processing to resolve merged/coupled datasets
Returns
-------
DFDict
Updated version of `datasets`, in which unnecessary datasets have been filtered out, and
the requested merged datasets have been added"""
datasets_for_f = datasets
for _dataset_name in self.merged_datasets:
datasets_for_f[_dataset_name] = merge_dfs(_dataset_name, self.stage, datasets)
return subdict(datasets_for_f, keep=self.params)
def get_key_data(self) -> dict:
"""Produce a dict of critical attributes describing the :class:`EngineerStep` instance for
use by key-making classes
Returns
-------
Dict
Important attributes describing this :class:`EngineerStep` instance"""
return dict(
name=self.name,
f=self.f,
params=self.params,
stage=self.stage,
do_validate=self.do_validate,
original_hashes=self.original_hashes,
updated_hashes=self.updated_hashes,
)
##################################################
# Properties
##################################################
@property
def f(self) -> Callable:
"""Feature engineering step callable that requests, modifies, and returns datasets"""
return self._f
@property
def name(self) -> str:
"""Identifier for the transformation applied by this engineering step"""
if self._name is None:
self._name = self.f.__name__
return self._name
@property
def params(self) -> tuple:
"""Dataset names requested by feature engineering step callable :attr:`f`. See documentation
in :meth:`EngineerStep.__init__` for more information/restrictions"""
return self._params
@params.setter
def params(self, value):
self._params = value if value is not None else get_engineering_step_params(self.f)
@property
def stage(self) -> str:
"""Feature engineering stage during which the `EngineerStep` will be executed"""
if self._stage is None:
self._stage = get_engineering_step_stage(self.params)
return self._stage
##################################################
# Comparison Methods
##################################################
def __hash__(self):
return hash((self.name, self.f, self.params, self.stage, self.do_validate))
def __eq__(self, other):
"""Check whether `other` is equal to `self`
The two are considered equal if `other` has the following attributes and their values
are equal to those of `self`: :attr:`name`, :attr:`f`, :attr:`params`, :attr:`stage`, and
:attr:`do_validate`. The values of all the aforementioned attributes will have been set on
initialization (either explicitly or by inference), and they should never be altered
Parameters
----------
other: EngineerStep, dict, str
Object to compare to `self`. If dict, the critical attributes mentioned above are
regarded as keys of `other`, and `other` should be of the form returned by
:meth:`EngineerStep.get_comparison_attrs`. If str, `other` will be compared to the
result of `self`'s :meth:`EngineerStep.stringify`
Returns
-------
Boolean
True if `other` is equal to `self`, else False
Examples
--------
>>> def dummy_f(train_inputs, non_train_inputs):
... return train_inputs, non_train_inputs
>>> es_0 = EngineerStep(dummy_f)
>>> assert es_0 == EngineerStep(dummy_f)
>>> assert es_0 == EngineerStep.get_comparison_attrs(es_0)
>>> assert es_0 == es_0.stringify()
"""
if isinstance(other, str):
return self.stringify() == other
elif isinstance(other, (dict, EngineerStep)):
# Collect dicts of attributes for comparison
other_attrs = self.get_comparison_attrs(other)
own_attrs = self.get_comparison_attrs(self)
# If `other_attrs["f"]` is str, should be SHA256 - Use hash of `self.f` to compare
if isinstance(other_attrs["f"], str):
own_attrs["f"] = make_hash_sha256(own_attrs["f"])
return own_attrs == other_attrs
return False
@staticmethod
def get_comparison_attrs(step_obj: Union["EngineerStep", dict]) -> dict:
"""Build a dict of critical :class:`EngineerStep` attributes
Parameters
----------
step_obj: EngineerStep, dict
Object for which critical :class:`EngineerStep` attributes should be collected
Returns
-------
attr_vals: Dict
Critical :class:`EngineerStep` attributes. If `step_obj` does not have a necessary
attribute (for `EngineerStep`) or a necessary key (for dict), its value in `attr_vals`
will be a placeholder object. This is to facilitate comparison, while also ensuring
missing values will always be considered unequal to other values
Examples
--------
>>> def dummy_f(train_inputs, non_train_inputs):
... return train_inputs, non_train_inputs
>>> es_0 = EngineerStep(dummy_f)
>>> EngineerStep.get_comparison_attrs(es_0) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
{'name': 'dummy_f',
'f': <function dummy_f at ...>,
'params': ('train_inputs', 'non_train_inputs'),
'stage': 'intra_cv',
'do_validate': False}
>>> EngineerStep.get_comparison_attrs(
... dict(foo="hello", f=dummy_f, params=["all_inputs", "all_targets"], stage="pre_cv")
... ) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
{'name': <object object at ...>,
'f': <function dummy_f at ...>,
'params': ('all_inputs', 'all_targets'),
'stage': 'pre_cv',
'do_validate': <object object at ...>}
"""
# Attributes necessary for equality
attr_names = ("name", "f", "params", "stage", "do_validate")
if isinstance(step_obj, dict):
attr_vals = {_: step_obj.get(_, object()) for _ in attr_names}
else:
attr_vals = {_: getattr(step_obj, _, object()) for _ in attr_names}
# Ensure :attr:`params` is always a tuple, not a list
attr_vals["params"] = tuple(attr_vals["params"])
return attr_vals
def stringify(self) -> str:
"""Make a stringified representation of `self`, compatible with :meth:`EngineerStep.__eq__`
Returns
-------
String
String describing all critical attributes of the :class:`EngineerStep` instance. This
value is not particularly human-friendly due to both its length and the fact that
:attr:`EngineerStep.f` is represented by its hash
Examples
--------
>>> def dummy_f(train_inputs, non_train_inputs):
... return train_inputs, non_train_inputs
>>> EngineerStep(dummy_f).stringify() # doctest: +ELLIPSIS
"EngineerStep(dummy_f, ..., ('train_inputs', 'non_train_inputs'), intra_cv, False)"
>>> EngineerStep(dummy_f, stage="pre_cv").stringify() # doctest: +ELLIPSIS
"EngineerStep(dummy_f, ..., ('train_inputs', 'non_train_inputs'), pre_cv, False)"
"""
return "{}({}, {}, {}, {}, {})".format(
self.__class__.__name__,
self.name,
make_hash_sha256(self.f),
self.params,
self.stage,
self.do_validate,
)
@classmethod
def honorary_step_from_dict(cls, step_dict: dict, dimension: Categorical):
"""Get an `EngineerStep` from `dimension` that is equal to its dict form, `step_dict`
Parameters
----------
step_dict: Dict
Dict of form saved in Experiment description files for `EngineerStep`. Expected to
have following keys, with values of the given types:
* "name": String
* "f": String (SHA256 hash)
* "params": List[str], or Tuple[str, ...]
* "stage": String in {"pre_cv", "intra_cv"}
* "do_validate": Boolean
dimension: Categorical
`Categorical` instance expected to contain the `EngineerStep` equivalent of `step_dict`
in its categories
Returns
-------
EngineerStep
From `dimension.categories` if it is the `EngineerStep` equivalent of `step_dict`
Raises
------
ValueError
If `dimension.categories` does not contain an `EngineerStep` matching `step_dict`"""
for category in dimension.categories:
if category == step_dict:
return category
raise ValueError("`step_dict` could not be found in `dimension`")
def __repr__(self) -> str:
return "{}({})".format(self.__class__.__name__, self.name)
class FeatureEngineer:
def __init__(self, steps=None, do_validate=False, **datasets: DFDict):
"""Class to organize feature engineering step callables `steps` (:class:`EngineerStep`
instances) and the datasets that the steps request and return.
Parameters
----------
steps: List, or None, default=None
List of arbitrary length, containing any of the following values:
1. :class:`EngineerStep` instance,
2. Function to provide as input to :class:`EngineerStep`, or
3. :class:`~hyperparameter_hunter.space.dimensions.Categorical`, with `categories`
comprising a selection of the previous two `steps` values (optimization only)
The third value can only be used during optimization. The `feature_engineer` provided to
:class:`~hyperparameter_hunter.experiments.CVExperiment`, for example, may only contain
the first two values. To search a space optionally including an `EngineerStep`, use the
`optional` kwarg of :class:`~hyperparameter_hunter.space.dimensions.Categorical`.
See :class:`EngineerStep` for information on properly formatted `EngineerStep`
functions. Additional engineering steps may be added via :meth:`add_step`
do_validate: Boolean, or "strict", default=False
... Experimental...
Whether to validate the datasets resulting from feature engineering steps. If True,
hashes of the new datasets will be compared to those of the originals to ensure they
were actually modified. Results will be logged. If `do_validate` = "strict", an
exception will be raised if any anomalies are found, rather than logging a message. If
`do_validate` = False, no validation will be performed
**datasets: DFDict
This is not expected to be provided on initialization and is offered primarily for
debugging/testing. Mapping of datasets necessary to perform feature engineering steps
See Also
--------
:class:`EngineerStep`
For proper formatting of non-`Categorical` values of `steps`
Notes
-----
If `steps` does include any instances of
:class:`hyperparameter_hunter.space.dimensions.Categorical`, this `FeatureEngineer` instance
will not be usable by Experiments. It can only be used by Optimization Protocols.
Furthermore, the `FeatureEngineer` that the Optimization Protocol actually ends up using
will not pass identity checks against the original `FeatureEngineer` that contained
`Categorical` steps
Examples
--------
>>> from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
>>> # Define some engineer step functions to play with
>>> def s_scale(train_inputs, non_train_inputs):
... s = StandardScaler()
... train_inputs[train_inputs.columns] = s.fit_transform(train_inputs.values)
... non_train_inputs[train_inputs.columns] = s.transform(non_train_inputs.values)
... return train_inputs, non_train_inputs
>>> def mm_scale(train_inputs, non_train_inputs):
... s = MinMaxScaler()
... train_inputs[train_inputs.columns] = s.fit_transform(train_inputs.values)
... non_train_inputs[train_inputs.columns] = s.transform(non_train_inputs.values)
... return train_inputs, non_train_inputs
>>> def q_transform(train_targets, non_train_targets):
... t = QuantileTransformer(output_distribution="normal")
... train_targets[train_targets.columns] = t.fit_transform(train_targets.values)
... non_train_targets[train_targets.columns] = t.transform(non_train_targets.values)
... return train_targets, non_train_targets, t
>>> def sqr_sum(all_inputs):
... all_inputs["square_sum"] = all_inputs.agg(
... lambda row: np.sqrt(np.sum([np.square(_) for _ in row])), axis="columns"
... )
... return all_inputs
*FeatureEngineer steps wrapped by `EngineerStep` == raw function steps - as long as the
`EngineerStep` is using the default parameters*
>>> # FeatureEngineer steps wrapped by `EngineerStep` == raw function steps
>>> # ... As long as the `EngineerStep` is using the default parameters
>>> fe_0 = FeatureEngineer([sqr_sum, s_scale])
>>> fe_1 = FeatureEngineer([EngineerStep(sqr_sum), EngineerStep(s_scale)])
>>> fe_0.steps == fe_1.steps
True
>>> fe_2 = FeatureEngineer([sqr_sum, EngineerStep(s_scale), q_transform])
*`Categorical` can be used during optimization and placed anywhere in `steps`. `Categorical`
can also handle either `EngineerStep` categories or raw functions. Use the `optional` kwarg
of `Categorical` to test some questionable steps*
>>> fe_3 = FeatureEngineer([sqr_sum, Categorical([s_scale, mm_scale]), q_transform])
>>> fe_4 = FeatureEngineer([Categorical([sqr_sum], optional=True), s_scale, q_transform])
>>> fe_5 = FeatureEngineer([
... Categorical([sqr_sum], optional=True),
... Categorical([EngineerStep(s_scale), mm_scale]),
... q_transform
... ])
"""
self.steps = []
self.do_validate = do_validate
self.datasets = datasets or {}
for step in steps or []:
self.add_step(step)
def __call__(self, stage: str, **datasets: DFDict):
"""Execute all feature engineering steps in :attr:`steps` for `stage`, with datasets
`datasets` as inputs
Parameters
----------
stage: String in {"pre_cv", "intra_cv"}
Feature engineering stage, specifying which :class:`EngineerStep` instances in
:attr:`steps` should be executed
datasets: DFDict
Original dict of datasets, containing all datasets, some of which may be superfluous, or
may require additional processing to resolve merged/coupled datasets"""
if datasets:
self.datasets = datasets
for i, step in enumerate(self.steps):
if step.stage == stage:
self.datasets = step(**self.datasets)
def __eq__(self, other: "FeatureEngineer"):
return (
isinstance(other, FeatureEngineer)
and len(self.steps) == len(other.steps)
and all(s_self == s_other for (s_self, s_other) in zip(self.steps, other.steps))
)
def inverse_transform(self, data):
"""Perform the inverse transformation for all engineer steps in :attr:`steps` in sequence
on `data`
Parameters
----------
data: Array-like
Data to inverse transform with any inversions present in :attr:`steps`
Returns
-------
Array-like
Result of sequentially calling inverse transformations in :attr:`steps` on `data`. If
any step has :attr:`EngineerStep.inversion` = None, `data` is unmodified for that step,
and proceeds to next engineer step inversion"""
inverted_data = data
# TODO: Make sure "pre_cv"-stage steps are inverted first, then "intra_cv"-stage
for i, step in enumerate(self.steps):
inverted_data = step.inverse_transform(inverted_data)
return inverted_data
@property
def steps(self) -> List[EngineerStep]:
"""Feature engineering steps to execute in sequence on :meth:`FeatureEngineer.__call__`"""
return self._steps
@steps.setter
def steps(self, value: list):
self._steps = value
def get_key_data(self) -> dict:
"""Produce a dict of critical attributes describing the :class:`FeatureEngineer` instance
for use by key-making classes
Returns
-------
Dict
Important attributes describing this :class:`FeatureEngineer` instance"""
return dict(
steps=[_.get_key_data() if isinstance(_, EngineerStep) else _ for _ in self.steps],
do_validate=self.do_validate,
datasets=self.datasets,
)
def add_step(
self,
step: Union[Callable, EngineerStep, Categorical],
stage: str = None,
name: str = None,
before: str = EMPTY_SENTINEL,
after: str = EMPTY_SENTINEL,
number: int = EMPTY_SENTINEL,
):
"""Add an engineering step to :attr:`steps` to be executed with the other contents of
:attr:`steps` on :meth:`FeatureEngineer.__call__`
Parameters
----------
step: Callable, or `EngineerStep`, or `Categorical`
If `EngineerStep` instance, will be added directly to :attr:`steps`. Otherwise, must be
a feature engineering step callable that requests, modifies, and returns datasets, which
will be used to instantiate a :class:`EngineerStep` to add to :attr:`steps`. If
`Categorical`, `categories` should contain `EngineerStep` instances or callables
stage: String in {"pre_cv", "intra_cv"}, or None, default=None
Feature engineering stage during which the callable `step` will be executed
name: String, or None, default=None
Identifier for the transformation applied by this engineering step. If None and `step`
is not an `EngineerStep`, will be inferred during :class:`EngineerStep` instantiation
before: String, default=EMPTY_SENTINEL
... Experimental...
after: String, default=EMPTY_SENTINEL
... Experimental...
number: String, default=EMPTY_SENTINEL
... Experimental..."""
if isinstance(step, Categorical):
cat_params = step.get_params()
cat_params["categories"] = [self._to_step(_) for _ in cat_params["categories"]]
self._steps.append(Categorical(**cat_params))
else:
self._steps.append(self._to_step(step, stage=stage, name=name))
def _to_step(self, step: Union[Callable, EngineerStep], stage=None, name=None) -> EngineerStep:
"""Ensure a candidate `step` is an `EngineerStep` instance, and return it
Parameters
----------
step: Callable, or `EngineerStep`
If `EngineerStep` instance, will be added directly to :attr:`steps`. Otherwise, must be
a feature engineering step callable that requests, modifies, and returns datasets, which
will be used to instantiate a :class:`EngineerStep` to add to :attr:`steps`
stage: String in {"pre_cv", "intra_cv"}, or None, default=None
Feature engineering stage during which the callable `step` will be executed
name: String, or None, default=None
Identifier for the transformation applied by this engineering step. If None and `step`
is not an `EngineerStep`, will be inferred during :class:`EngineerStep` instantiation
Returns
-------
EngineerStep
`step` if already an instance of `EngineerStep`. Else an `EngineerStep` initialized
using `step`, `name`, and `stage`"""
if isinstance(step, EngineerStep):
return step
elif step == RejectedOptional():
return step # Return as-is - OptimizationProtocol will take care of it
else:
return EngineerStep(step, name=name, stage=stage, do_validate=self.do_validate)
# FLAG: Tally number of columns "transformed" and "added" at each step and report
def get_engineering_step_stage(datasets: Tuple[str, ...]) -> str:
"""Determine the stage in which a feature engineering step that requests `datasets` as input
should be executed
Parameters
----------
datasets: Tuple[str]
Dataset names requested by a feature engineering step callable
Returns
-------
stage: {"pre_cv", "intra_cv"}
"pre_cv" if a step processing the given `datasets` should be executed in the
pre-cross-validation stage. "intra_cv" if the step should be executed for each
cross-validation split. If any of the elements in `datasets` is prefixed with "validation"
or "non_train", `stage` will be "intra_cv". Otherwise, it will be "pre_cv"
Notes
-----
Generally, feature engineering conducted in the "pre_cv" stage should regard each sample/row as
independent entities. For example, steps like converting a string day of the week to one-hot
encoded columns, or imputing missing values by replacement with -1 might be conducted "pre_cv",
since they are unlikely to introduce an information leakage. Conversely, steps like
scaling/normalization, whose results for the data in one row are affected by the data in other
rows should be performed "intra_cv" in order to recalculate the final values of the datasets for
each cross validation split and avoid information leakage
Technically, the inference of `stage="intra_cv"` due to the existence of a "non_train"-prefixed
value in `datasets` could unnecessarily force steps to be executed "intra_cv" if, for example,
there is no validation data. However, this is safer than the alternative of executing these
steps "pre_cv", in which validation data would be a subset of train data, probably introducing
information leakage. A simple workaround for this is to explicitly provide :class:`EngineerStep`
with the desired `stage` parameter to bypass this inference
Examples
--------
>>> get_engineering_step_stage(("train_inputs", "validation_inputs", "holdout_inputs"))
'intra_cv'
>>> get_engineering_step_stage(("all_data"))
'pre_cv'
>>> get_engineering_step_stage(("all_inputs", "all_targets"))
'pre_cv'
>>> get_engineering_step_stage(("train_data", "non_train_data"))
'intra_cv'"""
if any(_.startswith("validation_") for _ in datasets):
return "intra_cv"
if any(_.startswith("non_train_") for _ in datasets):
return "intra_cv"
return "pre_cv"
class ParameterParser(ast.NodeVisitor):
def __init__(self):
"""`ast.NodeVisitor` subclass that collects the arguments specified in the signature of a
callable node, as well as the values returned by the callable, in the attributes `args` and
`returns`, respectively"""
self.args = []
self.returns = []
def visit_arg(self, node):
with suppress(AttributeError):
if isinstance(node.parent.parent, ast.FunctionDef):
if isinstance(node.parent.parent.parent, ast.Module):
self.args.append(node.arg)
self.generic_visit(node)
def visit_Return(self, node):
try:
self.returns.append(node.value.id)
except AttributeError:
for element in node.value.elts:
try:
self.returns.append(element.id)
except AttributeError: # Straight-up function probably, instead of variable name
self.returns.append(getattr(element, "attr", element.__class__.__name__))
self.generic_visit(node)
def get_engineering_step_params(f: callable) -> Tuple[str]:
"""Verify that callable `f` requests valid input parameters, and returns a tuple of the same
parameters, with the assumption that the parameters are modified by `f`
Parameters
----------
f: Callable
Feature engineering step function that requests, modifies, and returns datasets
Returns
-------
Tuple
Argument/return value names declared by `f`
Examples
--------
>>> def impute_negative_one(all_inputs):
... all_inputs.fillna(-1, inplace=True)
... return all_inputs
>>> get_engineering_step_params(impute_negative_one)
('all_inputs',)
>>> def standard_scale(train_inputs, non_train_inputs):
... scaler = StandardScaler()
... train_inputs[train_inputs.columns] = scaler.fit_transform(train_inputs.values)
... non_train_inputs[train_inputs.columns] = scaler.transform(non_train_inputs.values)
... return train_inputs, non_train_inputs
>>> get_engineering_step_params(standard_scale)
('train_inputs', 'non_train_inputs')
>>> def error_invalid_dataset(train_inputs, foo):
... return train_inputs, foo
>>> get_engineering_step_params(error_invalid_dataset)
Traceback (most recent call last):
File "feature_engineering.py", line ?, in get_engineering_step_params
ValueError: Invalid dataset name: 'foo'"""
valid_datasets = MERGED_DATASET_NAMES + STANDARD_DATASET_NAMES
source_code = getsource(f)
tree = ast.parse(source_code)
#################### Add Links to Nodes' Parents ####################
for node in ast.walk(tree):
for child in ast.iter_child_nodes(node):
child.parent = node
#################### Collect Parameters and Returns ####################
parser = ParameterParser()
parser.visit(tree)
for name in parser.args:
if name not in valid_datasets:
raise ValueError(f"Invalid dataset name: {name!r}")
if name.endswith("_data"):
raise ValueError(
f"Sorry, 'data'-suffixed parameters like {name!r} are not supported yet. "
"Try using both the 'inputs' and 'targets' params for this dataset, instead!"
)
return tuple(parser.args)
def _hash_dataset(dataset: pd.DataFrame) -> dict:
"""Generate hashes for `dataset` at various levels of specificity
Parameters
----------
dataset: pandas.DataFrame
DataFrame to be described with a dict of hashes
Returns
-------
dict
"dataset" (str): Hash of `dataset`, itself
"column_names" (str): Hash of `dataset.columns`, capturing names, order, and add/drops
"column_values" (dict): Keys are `dataset.columns`, and values are hashes for each column
Examples
--------
>>> _hash_dataset(pd.DataFrame(dict(a=[0, 1], b=[2, 3]))) # doctest: +NORMALIZE_WHITESPACE
{'dataset': 't0rdT14SDIH-CVm-dce1Hlsr2oM7q6pss_GpV3rJ6bw=',
'column_names': 't2r52T-rdDqIDs75-83buoieqk0KyHEpRJMJAAzfzb4=',
'column_values': {'a': 'buQ0yuUUbLN57tC6050g7yWrvAdk-NwGIEEWHJC88EY=',
'b': 'j9nBFZVu4ZEnsoaRYiI93DcrbV3A_hzcKdf0P5gS7g4='}}
>>> _hash_dataset(pd.DataFrame(dict(x=[0, 1], b=[6, 7]))) # doctest: +NORMALIZE_WHITESPACE
{'dataset': 'TNLSddRnWVfoytkhHrSNWXqVW2TV7cHKht8MMLWcbhY=',
'column_names': '9l1vTGGIxfuA4rJZ-ePalM-9Q5D0BfLp5bogE0U-oYQ=',
'column_values': {'x': 'l2dZ6AeGRuHH97J0qb8I1H-pwK-ubHqElDqFIuKAbIw=',
'b': 'uIvA32AuBuj9LTU652UQUBI0VH9UmF2ZJeL4NefiiLg='}}
>>> _hash_dataset(None)
{'dataset': None, 'column_names': None, 'column_values': None}"""
if (not isinstance(dataset, pd.DataFrame)) and (dataset is None or dataset == 0):
return dict(dataset=None, column_names=None, column_values=None)
return dict(
dataset=make_hash_sha256(dataset),
column_names=make_hash_sha256(dataset.columns),
column_values={_: make_hash_sha256(dataset[_]) for _ in dataset.columns},
)
def hash_datasets(datasets: dict) -> dict:
"""Describe `datasets` with dicts of hashes for their values, column names, and column values
Parameters
----------
datasets: Dict
Mapping of dataset names to `pandas.DataFrame` instances
Returns
-------
hashes: Dict
Mapping with same keys as `datasets`, whose values are dicts returned from
:func:`_hash_dataset` that provide hashes for each DataFrame and its column names/values
Examples
--------
>>> df_x = pd.DataFrame(dict(a=[0, 1], b=[2, 3], c=[4, 5]))
>>> df_y = pd.DataFrame(dict(a=[0, 1], b=[6, 7], d=[8, 9]))
>>> hash_datasets(dict(x=df_x, y=df_y)) == dict(x=_hash_dataset(df_x), y=_hash_dataset(df_y))
True"""
hashes = {k: _hash_dataset(v) for k, v in datasets.items()}
return hashes
# def _compare_hash_(columns_a: dict, columns_b: dict):
# """
#
# Parameters
# ----------
# columns_a
# columns_b
#
# Returns
# -------
#
# """
# columns_added = dict()
# columns_dropped = dict()
# columns_modified = dict()
# columns_unchanged = dict()
#
#
# def compare_dataset_columns(datasets_a: dict, datasets_b: dict):
# compare_column_hashes(..., ...)
# def step(order=None, before=None, after=None, returns="frame"):
# """
#
# Parameters
# ----------
# order: Integer, or None, default=None
# ...
# before: String, or None, default=None
# ...
# after: String, or None, default=None
# ...
# returns: {"frame", "cols"}, default="frame"
# ...
#
# Returns
# -------
#
# """
# ...