pyod/models/feature_bagging.py
# -*- coding: utf-8 -*-
"""Feature bagging detector
"""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause
import numbers
import numpy as np
from sklearn.base import clone
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted
from .base import BaseDetector
from .combination import average, maximization
from .lof import LOF
from ..utils.utility import check_detector
from ..utils.utility import check_parameter
from ..utils.utility import generate_bagging_indices
MAX_INT = np.iinfo(np.int32).max
def _set_random_states(estimator, random_state=None):
"""Sets fixed random_state parameters for an estimator. Internal use only.
Modified from sklearn/base.py
Finds all parameters ending ``random_state`` and sets them to integers
derived from ``random_state``.
Parameters
----------
estimator : estimator supporting get/set_params
Estimator with potential randomness managed by random_state
parameters.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Notes
-----
This does not necessarily set *all* ``random_state`` attributes that
control an estimator's randomness, only those accessible through
``estimator.get_params()``. ``random_state``s not controlled include
those belonging to:
* cross-validation splitters
* ``scipy.stats`` rvs
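    Examples
    --------
    A minimal usage sketch; ``IsolationForest`` is used purely for
    illustration because it exposes a ``random_state`` parameter through
    ``get_params``:
    >>> from sklearn.ensemble import IsolationForest
    >>> est = IsolationForest()
    >>> _set_random_states(est, random_state=42)
    >>> est.get_params()['random_state'] is not None
    True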
"""
random_state = check_random_state(random_state)
to_set = {}
for key in sorted(estimator.get_params(deep=True)):
if key == 'random_state' or key.endswith('__random_state'):
to_set[key] = random_state.randint(MAX_INT)
if to_set:
estimator.set_params(**to_set)
# def _parallel_decision_function(estimators, estimators_features, X):
# n_samples = X.shape[0]
# scores = np.zeros((n_samples, len(estimators)))
#
# for i, (estimator, features) in enumerate(
# zip(estimators, estimators_features)):
# if hasattr(estimator, 'decision_function'):
# estimator_score = estimator.decision_function(
# X[:, features])
# scores[:, i] = estimator_score
# else:
# raise NotImplementedError(
# 'current base detector has no decision_function')
# return scores
# TODO: should support parallelization at the model level
# TODO: detector score combination through BFS should be implemented
# See https://github.com/yzhao062/pyod/issues/59
class FeatureBagging(BaseDetector):
""" A feature bagging detector is a meta estimator that fits a number of
base detectors on various sub-samples of the dataset and use averaging
or other combination methods to improve the predictive accuracy and
control over-fitting.
The sub-sample size is always the same as the original input sample size
but the features are randomly sampled from half of the features to all
features.
By default, LOF is used as the base estimator. However, any estimator
could be used as the base estimator, such as kNN and ABOD.
Feature bagging first construct n subsamples by random selecting a subset
of features, which induces the diversity of base estimators.
Finally, the prediction score is generated by averaging/taking the maximum
of all base detectors. See :cite:`lazarevic2005feature` for details.
Parameters
----------
base_estimator : object or None, optional (default=None)
The base estimator to fit on random subsets of the dataset.
If None, then the base estimator is a LOF detector.
n_estimators : int, optional (default=10)
The number of base estimators in the ensemble.
contamination : float in (0., 0.5), optional (default=0.1)
The amount of contamination of the data set,
i.e. the proportion of outliers in the data set. Used when fitting to
define the threshold on the decision function.
max_features : int or float, optional (default=1.0)
The number of features to draw from X to train each base estimator.
- If int, then draw `max_features` features.
- If float, then draw `max_features * X.shape[1]` features.
bootstrap_features : bool, optional (default=False)
Whether features are drawn with replacement.
check_detector : bool, optional (default=True)
If set to True, check whether the base estimator is consistent with
pyod standard.
check_estimator : bool, optional (default=False)
If set to True, check whether the base estimator is consistent with
sklearn standard.
        .. deprecated:: 0.6.9
           `check_estimator` will be removed in pyod 0.8.0; it will be
           replaced by `check_detector`.
    n_jobs : int, optional (default=1)
The number of jobs to run in parallel for both `fit` and
`predict`. If -1, then the number of jobs is set to the
number of cores.
random_state : int, RandomState or None, optional (default=None)
If int, random_state is the seed used by the random
number generator; If RandomState instance, random_state is the random
number generator; If None, the random number generator is the
RandomState instance used by `np.random`.
combination : str, optional (default='average')
The method of combination:
- if 'average': take the average of all detectors
- if 'max': take the maximum scores of all detectors
verbose : int, optional (default=0)
Controls the verbosity of the building process.
    estimator_params : dict, optional (default=None)
        The dictionary of parameters to use when instantiating a new base
        estimator. If None, default parameters are used.
Attributes
----------
decision_scores_ : numpy array of shape (n_samples,)
The outlier scores of the training data.
The higher, the more abnormal. Outliers tend to have higher
scores. This value is available once the detector is
fitted.
    threshold_ : float
        The threshold is based on ``contamination``. It is set so that the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_`` are flagged as outliers, and it is used to
        generate binary outlier labels.
    labels_ : numpy array of shape (n_samples,)
        The binary labels of the training data, where 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
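    Examples
    --------
    A minimal usage sketch on synthetic data (exact scores depend on the
    data and the random seed):
    >>> import numpy as np
    >>> from pyod.models.feature_bagging import FeatureBagging
    >>> X = np.random.RandomState(42).randn(100, 4)
    >>> clf = FeatureBagging(n_estimators=5, random_state=42)
    >>> clf = clf.fit(X)
    >>> clf.decision_scores_.shape
    (100,)
    >>> clf.predict(X).shape
    (100,)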
"""
def __init__(self, base_estimator=None, n_estimators=10, contamination=0.1,
max_features=1.0, bootstrap_features=False,
check_detector=True, check_estimator=False, n_jobs=1,
random_state=None, combination='average', verbose=0,
estimator_params=None):
super(FeatureBagging, self).__init__(contamination=contamination)
self.base_estimator = base_estimator
self.n_estimators = n_estimators
self.max_features = max_features
self.bootstrap_features = bootstrap_features
self.check_detector = check_detector
self.check_estimator = check_estimator
self.combination = combination
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
if estimator_params is not None:
self.estimator_params = estimator_params
else:
self.estimator_params = {}
def fit(self, X, y=None):
"""Fit detector. y is ignored in unsupervised methods.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
Fitted estimator.
"""
random_state = check_random_state(self.random_state)
X = check_array(X)
self.n_samples_, self.n_features_ = X.shape[0], X.shape[1]
self._set_n_classes(y)
        # expect at least 2 features; feature bagging does not make sense
        # with only 1 feature
check_parameter(self.n_features_, low=2, include_left=True,
param_name='n_features')
# check parameters
self._validate_estimator(default=LOF(n_jobs=self.n_jobs))
# use at least half of the features
self.min_features_ = int(0.5 * self.n_features_)
# Validate max_features
if isinstance(self.max_features, (numbers.Integral, np.integer)):
self.max_features_ = self.max_features
else: # float
self.max_features_ = int(self.max_features * self.n_features_)
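            # e.g., max_features=0.8 with 10 input features gives
            # max_features_ = int(0.8 * 10) = 8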
        # min_features_ and max_features_ may be equal
check_parameter(self.max_features_, low=self.min_features_,
param_name='max_features', high=self.n_features_,
include_left=True, include_right=True)
self.estimators_ = []
self.estimators_features_ = []
n_more_estimators = self.n_estimators - len(self.estimators_)
        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be greater than or '
                             'equal to len(estimators_)=%d'
                             % (self.n_estimators, len(self.estimators_)))
seeds = random_state.randint(MAX_INT, size=n_more_estimators)
self._seeds = seeds
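        # one seed per base estimator so that each estimator's feature
        # subset and internal randomness are reproducible from random_state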
for i in range(self.n_estimators):
random_state = np.random.RandomState(seeds[i])
            # max_features_ is incremented by one since the sampling
            # interval is [min_features_, max_features_)
features = generate_bagging_indices(random_state,
self.bootstrap_features,
self.n_features_,
self.min_features_,
self.max_features_ + 1)
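            # e.g., with n_features_ = 10, each base estimator receives a
            # random subset of 5 to 10 features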
# initialize and append estimators
estimator = self._make_estimator(append=False,
random_state=random_state)
estimator.fit(X[:, features])
self.estimators_.append(estimator)
self.estimators_features_.append(features)
# decision score matrix from all estimators
all_decision_scores = self._get_decision_scores()
if self.combination == 'average':
self.decision_scores_ = average(all_decision_scores)
else:
self.decision_scores_ = maximization(all_decision_scores)
self._process_decision_scores()
return self
def decision_function(self, X):
"""Predict raw anomaly score of X using the fitted detector.
        The anomaly score of an input sample is computed based on the fitted
        base detectors. For consistency, outliers are assigned larger
        anomaly scores.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
if they are supported by the base estimator.
Returns
-------
anomaly_scores : numpy array of shape (n_samples,)
The anomaly score of the input samples.
"""
check_is_fitted(self, ['estimators_', 'estimators_features_',
'decision_scores_', 'threshold_', 'labels_'])
X = check_array(X)
if self.n_features_ != X.shape[1]:
raise ValueError("Number of features of the model must "
"match the input. Model n_features is {0} and "
"input n_features is {1}."
"".format(self.n_features_, X.shape[1]))
# Parallel loop
# n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators,
# self.n_jobs)
# all_pred_scores = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
# delayed(_parallel_decision_function)(
# self.estimators_[starts[i]:starts[i + 1]],
# self.estimators_features_[starts[i]:starts[i + 1]],
# X)
# for i in range(n_jobs))
#
# # Reduce
# all_pred_scores = np.concatenate(all_pred_scores, axis=1)
all_pred_scores = self._predict_decision_scores(X)
if self.combination == 'average':
return average(all_pred_scores)
else:
return maximization(all_pred_scores)
def _predict_decision_scores(self, X):
all_pred_scores = np.zeros([X.shape[0], self.n_estimators])
for i in range(self.n_estimators):
features = self.estimators_features_[i]
all_pred_scores[:, i] = self.estimators_[i].decision_function(
X[:, features])
return all_pred_scores
def _get_decision_scores(self):
all_decision_scores = np.zeros([self.n_samples_, self.n_estimators])
for i in range(self.n_estimators):
all_decision_scores[:, i] = self.estimators_[i].decision_scores_
return all_decision_scores
def _validate_estimator(self, default=None):
"""Check the estimator and the n_estimator attribute, set the
`base_estimator_` attribute."""
if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
raise ValueError("n_estimators must be an integer, "
"got {0}.".format(type(self.n_estimators)))
if self.n_estimators <= 0:
raise ValueError("n_estimators must be greater than zero, "
"got {0}.".format(self.n_estimators))
if self.base_estimator is not None:
self.base_estimator_ = self.base_estimator
else:
self.base_estimator_ = default
if self.base_estimator_ is None:
raise ValueError("base_estimator cannot be None")
        # make sure the base estimator is consistent with the pyod standard
if self.check_detector:
check_detector(self.base_estimator_)
def _make_estimator(self, append=True, random_state=None):
"""Make and configure a copy of the `base_estimator_` attribute.
sklearn/base.py
Warning: This method should be used to properly instantiate new
sub-estimators.
"""
# TODO: add a check for estimator_param
estimator = clone(self.base_estimator_)
estimator.set_params(**self.estimator_params)
if random_state is not None:
_set_random_states(estimator, random_state)
if append:
self.estimators_.append(estimator)
return estimator
def __len__(self):
"""Returns the number of estimators in the ensemble."""
return len(self.estimators_)
def __getitem__(self, index):
"""Returns the index'th estimator in the ensemble."""
return self.estimators_[index]
def __iter__(self):
"""Returns iterator over estimators in the ensemble."""
return iter(self.estimators_)