pyod/models/rgraph.py
# -*- coding: utf-8 -*-
""" R-graph
"""
# Author: Michiel Bongaerts (but not author of the R-graph method)
# License: BSD 2 clause
import warnings
import numpy as np
from scipy import sparse
from sklearn.decomposition import sparse_encode
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.utils import check_array
from .base import BaseDetector
class RGraph(BaseDetector):
""" Outlier Detection via R-graph.
Paper: https://openaccess.thecvf.com/content_cvpr_2017/papers/You_Provable_Self-Representation_Based_CVPR_2017_paper.pdf
See :cite:`you2017provable` for details.
Parameters
----------
    transition_steps : int, optional (default=10)
        Number of transition steps that are taken in the graph, after which
        the outlier scores are determined.
    gamma : float, optional (default=50.0)
    gamma_nz : boolean, optional (default=True)
        gamma and gamma_nz together determine the parameter alpha.
When ``gamma_nz = False``, alpha = gamma.
When ``gamma_nz = True``, then alpha = gamma * alpha0, where alpha0 is
the largest number such that the solution to the optimization problem
with alpha = alpha0 is the zero vector (see Proposition 1 in [1]).
Therefore, when ``gamma_nz = True``, gamma should be a value greater
than 1.0. A good choice is typically in the range [5, 500].
tau : float, default 1.0
Parameter for elastic net penalty term.
When tau = 1.0, the method reduces to sparse subspace clustering with
basis pursuit (SSC-BP) [2].
When tau = 0.0, the method reduces to least squares regression (LSR).
    algorithm : string, optional (default='lasso_lars')
        Algorithm for computing the representation. Either ``lasso_lars`` or
        ``lasso_cd``.
        Note: ``lasso_lars`` and ``lasso_cd`` only support tau = 1; smaller
        values of tau are reset to 1.0 with a warning. For ``gamma >= 1e4``,
        plain linear regression is used instead (see ``fit_intercept_LR``).
    fit_intercept_LR : bool, optional (default=False)
        For ``gamma >= 1e4``, linear regression is used instead of
        ``lasso_lars`` or ``lasso_cd``. This parameter determines whether the
        intercept for that linear model is calculated.
maxiter_lasso : int, default 1000
The maximum number of iterations for ``lasso_lars`` and ``lasso_cd``.
    n_nonzero : int, optional (default=10)
        This is an upper bound on the number of nonzero entries of each
        representation vector. If there are more than ``n_nonzero`` nonzero
        entries, only the ``n_nonzero`` entries with the largest absolute
        values are kept.
active_support: boolean, default True
Set to True to use the active support algorithm in [1] for solving the
optimization problem. This should significantly reduce the running time
when n_samples is large.
    support_init : string, optional (default='L2')
        Determines how the active support is initialized.
        It can be either ``knn`` or ``L2``.
        Ignored when ``active_support=False``.
    support_size : int, optional (default=100)
        Size of the working set of the active support algorithm. A small
        ``support_size`` decreases the runtime per iteration while increasing
        the number of iterations. Ignored when ``active_support=False``.
    maxiter : int, optional (default=40)
        Termination condition for the active support update; see
        ``active_support_elastic_net`` for details.
        Ignored when ``active_support=False``.
preprocessing : bool, optional (default=True)
If True, apply standardization on the data.
    verbose : int, optional (default=1)
        Verbosity mode.
        - 0 = silent
        - 1 = print progress while computing the representation vectors and
          while scoring test blocks.
contamination : float in (0., 0.5), optional (default=0.1)
The amount of contamination of the data set, i.e.
the proportion of outliers in the data set. When fitting this is used
to define the threshold on the decision function.
    blocksize_test_data : int, optional (default=10)
        The test set is split into blocks of size ``blocksize_test_data``, so
        that the test set is at least partially separated from the train set
        when scoring.
Attributes
----------
    transition_matrix_ : numpy array of shape (n_samples, n_samples)
        Transition matrix from the last fitted data; this might include
        training and test data.
decision_scores_ : numpy array of shape (n_samples,)
The outlier scores of the training data.
The higher, the more abnormal. Outliers tend to have higher
scores. This value is available once the detector is
fitted.
threshold_ : float
The threshold is based on ``contamination``. It is the
``n_samples * contamination`` most abnormal samples in
``decision_scores_``. The threshold is calculated for generating
binary outlier labels.
labels_ : int, either 0 or 1
The binary labels of the training data. 0 stands for inliers
and 1 for outliers/anomalies. It is generated by applying
``threshold_`` on ``decision_scores_``.
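
    Examples
    --------
    A minimal usage sketch on random data (parameter values are illustrative,
    not tuned)::

        import numpy as np
        from pyod.models.rgraph import RGraph

        X_train = np.random.randn(100, 5)
        clf = RGraph(gamma=50.0, transition_steps=10, verbose=0)
        clf.fit(X_train)
        clf.decision_scores_  # higher score -> more likely an outlier
        clf.labels_           # binary labels from the contamination threshold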
"""
def __init__(self, transition_steps=10, n_nonzero=10, gamma=50.0,
gamma_nz=True, algorithm='lasso_lars', tau=1.0,
maxiter_lasso=1000, preprocessing=True, contamination=0.1,
blocksize_test_data=10,
support_init='L2', maxiter=40, support_size=100,
active_support=True, fit_intercept_LR=False,
verbose=True):
super(RGraph, self).__init__(contamination=contamination)
self.transition_steps = transition_steps
self.n_nonzero = n_nonzero
self.gamma = gamma
self.gamma_nz = gamma_nz
self.algorithm = algorithm
self.tau = tau
self.preprocessing = preprocessing
self.contamination = contamination
self.maxiter_lasso = maxiter_lasso
self.support_init = support_init
self.maxiter = maxiter
self.support_size = support_size
self.active_support = active_support
self.verbose = verbose
self.blocksize_test_data = blocksize_test_data
self.fit_intercept_LR = fit_intercept_LR
def active_support_elastic_net(self, X, y, alpha, tau=1.0,
algorithm='lasso_lars', support_init='L2',
support_size=100, maxiter=40,
maxiter_lasso=1000):
"""
Source: https://github.com/ChongYou/subspace-clustering/blob/master/cluster/selfrepresentation.py
An active support based algorithm for solving the elastic net optimization problem
min_{c} tau ||c||_1 + (1-tau)/2 ||c||_2^2 + alpha / 2 ||y - c X ||_2^2.
Parameters
-----------
X : array-like, shape (n_samples, n_features)
y : array-like, shape (1, n_features)
alpha : float
tau : float, default 1.0
        algorithm : string, default ``lasso_lars``
            Algorithm for solving the subproblems. Either lasso_lars or
            lasso_cd. (The original ``spams`` option has been removed here to
            avoid the extra dependency.)
            Note: ``lasso_lars`` and ``lasso_cd`` only support tau = 1.
        support_init : string, default ``L2``
            This determines how the active support is initialized.
            It can be either ``knn`` or ``L2``.
        support_size : int, default 100
            This determines the size of the working set.
            A small support_size decreases the runtime per iteration while
            increasing the number of iterations.
        maxiter : int, default 40
            Termination condition for the active support update.
Returns
-------
        c : numpy array of shape (n_samples,)
            The optimal solution to the optimization problem.
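
        Examples
        --------
        A hedged sketch on random data (values are illustrative; the default
        ``lasso_lars`` solver is assumed)::

            import numpy as np
            rg = RGraph(verbose=0)
            X = np.random.randn(300, 10)
            y = X[0:1, :].copy()   # represent x_0 via the rows of X
            c = rg.active_support_elastic_net(X, y, alpha=0.1)
            # c has shape (300,); c[0] dominates since y equals x_0 itself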
"""
n_samples = X.shape[0]
        if n_samples <= support_size:
            # Skip the active support search for small scale data: using the
            # full support makes the iteration below converge in one step.
            supp = np.arange(n_samples, dtype=int)
else:
if support_init == 'L2':
L2sol = np.linalg.solve(
np.identity(y.shape[1]) * alpha + np.dot(X.T, X), y.T)
c0 = np.dot(X, L2sol)[:, 0]
supp = np.argpartition(-np.abs(c0), support_size)[
0:support_size]
elif support_init == 'knn':
supp = np.argpartition(-np.abs(np.dot(y, X.T)[0]),
support_size)[0:support_size]
curr_obj = float("inf")
for _ in range(maxiter):
Xs = X[supp, :]
## Removed the original option to use 'spams' since this would
# require the spams dependency
# if algorithm == 'spams':
# cs = spams.lasso(np.asfortranarray(y.T), D=np.asfortranarray(Xs.T),
# lambda1=tau*alpha, lambda2=(1.0-tau)*alpha)
# cs = np.asarray(cs.todense()).T
# else:
cs = sparse_encode(y, Xs, algorithm=algorithm, alpha=alpha,
max_iter=maxiter_lasso)
delta = (y - np.dot(cs, Xs)) / alpha
obj = tau * np.sum(np.abs(cs[0])) + (1.0 - tau) / 2.0 * np.sum(
np.power(cs[0], 2.0)) + alpha / 2.0 * np.sum(
np.power(delta, 2.0))
if curr_obj - obj < 1.0e-10 * curr_obj:
break
curr_obj = obj
coherence = np.abs(np.dot(delta, X.T))[0]
coherence[supp] = 0
addedsupp = np.nonzero(coherence > tau + 1.0e-10)[0]
if addedsupp.size == 0: # converged
break
# Find the set of nonzero entries of cs.
activesupp = supp[np.abs(cs[0]) > 1.0e-10]
if activesupp.size > 0.8 * support_size: # this suggests that support_size is too small and needs to be increased
support_size = min(
[round(max([activesupp.size, support_size]) * 1.1),
n_samples])
if addedsupp.size + activesupp.size > support_size:
                order = np.argpartition(
                    -coherence[addedsupp],
                    support_size - activesupp.size)[
                        0:support_size - activesupp.size]
                addedsupp = addedsupp[order]
supp = np.concatenate([activesupp, addedsupp])
c = np.zeros(n_samples)
c[supp] = cs
return c
def elastic_net_subspace_clustering(self, X, gamma=50.0, gamma_nz=True,
tau=1.0, algorithm='lasso_lars',
fit_intercept_LR=False,
active_support=True,
active_support_params=None,
n_nonzero=50,
maxiter_lasso=1000):
"""
Source: https://github.com/ChongYou/subspace-clustering/blob/master/cluster/selfrepresentation.py
Elastic net subspace clustering (EnSC) [1].
Compute self-representation matrix C from solving the following optimization problem
min_{c_j} tau ||c_j||_1 + (1-tau)/2 ||c_j||_2^2 + alpha / 2 ||x_j - c_j X ||_2^2 s.t. c_jj = 0,
where c_j and x_j are the j-th rows of C and X, respectively.
        Parameter ``algorithm`` specifies the algorithm for solving the optimization problem.
        ``lasso_lars`` and ``lasso_cd`` are algorithms implemented in sklearn. The original
        implementation also supports ``spams``, the same algorithm as ``lasso_lars`` but
        implemented in the spams package (http://spams-devel.gforge.inria.fr/); that option
        has been removed in this port to avoid the extra dependency.
        In principle, all of these algorithms give the same result.
For large scale data (e.g. with > 5000 data points), use any of these algorithms in
conjunction with ``active_support=True``. It adopts an efficient active support
strategy that solves the optimization problem by breaking it into a sequence of
small scale optimization problems as described in [1].
If tau = 1.0, the method reduces to sparse subspace clustering with basis pursuit (SSC-BP) [2].
If tau = 0.0, the method reduces to least squares regression (LSR) [3].
Note: ``lasso_lars`` and ``lasso_cd`` only support tau = 1.
Parameters
-----------
X : array-like, shape (n_samples, n_features)
Input data to be clustered
gamma : float
gamma_nz : boolean, default True
            gamma and gamma_nz together determine the parameter alpha. When ``gamma_nz = False``,
alpha = gamma. When ``gamma_nz = True``, then alpha = gamma * alpha0, where alpha0 is
the largest number such that the solution to the optimization problem with alpha = alpha0
is the zero vector (see Proposition 1 in [1]). Therefore, when ``gamma_nz = True``, gamma
should be a value greater than 1.0. A good choice is typically in the range [5, 500].
tau : float, default 1.0
Parameter for elastic net penalty term.
When tau = 1.0, the method reduces to sparse subspace clustering with basis pursuit (SSC-BP) [2].
When tau = 0.0, the method reduces to least squares regression (LSR) [3].
        algorithm : string, default ``lasso_lars``
            Algorithm for computing the representation. Either lasso_lars or
            lasso_cd (the ``spams`` option of the original implementation has
            been removed).
            Note: ``lasso_lars`` and ``lasso_cd`` only support tau = 1.
n_nonzero : int, default 50
This is an upper bound on the number of nonzero entries of each representation vector.
If there are more than n_nonzero nonzero entries, only the top n_nonzero number of
entries with largest absolute value are kept.
active_support: boolean, default True
Set to True to use the active support algorithm in [1] for solving the optimization problem.
This should significantly reduce the running time when n_samples is large.
active_support_params: dictionary of string to any, optional
Parameters (keyword arguments) and values for the active support algorithm. It may be
used to set the parameters ``support_init``, ``support_size`` and ``maxiter``, see
``active_support_elastic_net`` for details.
Example: active_support_params={'support_size':50, 'maxiter':100}
Ignored when ``active_support=False``
Returns
-------
representation_matrix_ : csr matrix, shape: n_samples by n_samples
The self-representation matrix.
References
-----------
[1] C. You, C.-G. Li, D. Robinson, R. Vidal, Oracle Based Active Set Algorithm for Scalable Elastic Net Subspace Clustering, CVPR 2016
        [2] E. Elhamifar, R. Vidal, Sparse Subspace Clustering: Algorithm, Theory, and Applications, TPAMI 2013
[3] C. Lu, et al. Robust and efficient subspace segmentation via least squares regression, ECCV 2012
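
        Examples
        --------
        A hedged sketch on random data (illustrative parameters)::

            import numpy as np
            rg = RGraph(verbose=0)
            X = np.random.randn(100, 5)
            C = rg.elastic_net_subspace_clustering(X, gamma=50.0)
            C.shape  # (100, 100) sparse self-representation matrix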
"""
        if algorithm in ('lasso_lars', 'lasso_cd') and tau < 1.0 - 1.0e-10:
            warnings.warn(
                'algorithm {} cannot handle tau smaller than 1. '
                'Using tau = 1'.format(algorithm))
            tau = 1.0
        if active_support and active_support_params is None:
            active_support_params = {}
n_samples = X.shape[0]
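        # Preallocate COO triplets (row, col, value); each sample contributes
        # at most n_nonzero entries to the self-representation matrix.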
rows = np.zeros(n_samples * n_nonzero)
cols = np.zeros(n_samples * n_nonzero)
vals = np.zeros(n_samples * n_nonzero)
curr_pos = 0
gamma_is_zero_notification = False
for i in range(n_samples):
if ((i % 25 == 0) and (self.verbose == 1)):
print('{}/{}'.format(i, n_samples))
y = X[i, :].copy().reshape(1, -1)
X[i, :] = 0
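            # Row i is zeroed out so that sample i cannot represent itself
            # (enforcing the constraint c_ii = 0); the row is restored below
            # once its representation vector has been computed.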
if algorithm in ('lasso_lars', 'lasso_cd'):
                if gamma_nz:
coh = np.delete(np.absolute(np.dot(X, y.T)), i)
alpha0 = np.amax(
coh) / tau # value for which the solution is zero
alpha = alpha0 / gamma
else:
alpha = 1.0 / gamma
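                # Illustrative numbers for the gamma_nz branch: if the largest
                # coherence max_{j != i} |<x_j, y>| is 2.0 and tau = 1.0, then
                # alpha0 = 2.0 is (up to solver scaling) the penalty weight at
                # which the lasso solution becomes all-zero; gamma = 50 then
                # gives alpha = 2.0 / 50 = 0.04, small enough to yield a
                # nonzero representation.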
                if gamma >= 10 ** 4:
                    if not gamma_is_zero_notification:
                        warnings.warn(
                            'alpha is set to 0, i.e. LinearRegression() is used')
                        gamma_is_zero_notification = True
                    alpha = 0
                if alpha == 0:
lr = LinearRegression(fit_intercept=fit_intercept_LR)
lr.fit(X.T, y[0])
c = lr.coef_
                elif active_support:
c = self.active_support_elastic_net(X, y, alpha, tau,
algorithm,
**active_support_params)
else:
## Removed the original option to use 'spams' since this would require the spams dependency
# if algorithm == 'spams':
# c = spams.lasso(np.asfortranarray(y.T), D=np.asfortranarray(X.T),
# lambda1=tau * alpha, lambda2=(1.0-tau) * alpha)
# c = np.asarray(c.todense()).T[0]
# else:
c = sparse_encode(y, X, algorithm=algorithm, alpha=alpha,
max_iter=maxiter_lasso)[0]
            else:
                # The original code only warned here, which made the following
                # use of `c` fail with a NameError; raise instead.
                raise ValueError("algorithm {} not found".format(algorithm))
index = np.flatnonzero(c)
if index.size > n_nonzero:
# warnings.warn("The number of nonzero entries in sparse subspace clustering exceeds n_nonzero")
index = index[np.argsort(-np.absolute(c[index]))[0:n_nonzero]]
rows[curr_pos:curr_pos + len(index)] = i
cols[curr_pos:curr_pos + len(index)] = index
vals[curr_pos:curr_pos + len(index)] = c[index]
curr_pos += len(index)
X[i, :] = y
return sparse.csr_matrix((vals, (rows, cols)),
shape=(n_samples, n_samples))
def fit(self, X, y=None):
"""Fit detector. y is ignored in unsupervised methods.
Parameters
----------
X : numpy array of shape (n_samples, n_features)
The input samples.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
Fitted estimator.
"""
# If we "re-fit" the model then we need to make sure previour self.X_train is first deleted
# since this parameter is used downstream in the analysis in self.decision_function().
if hasattr(self, 'X_train'):
del self.X_train
X = check_array(X)
# Fit scaler on train set
if self.preprocessing:
self.scaler_ = StandardScaler()
self.scaler_.fit(X)
self._set_n_classes(y)
self.decision_scores_ = self.decision_function(X)
self.X_train = X
self._process_decision_scores()
return self
def decision_function(self, X):
"""Predict raw anomaly score of X using the fitted detector.
The anomaly score of an input sample is computed based on different
detector algorithms. For consistency, outliers are assigned with
larger anomaly scores.
Parameters
----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.
Returns
-------
anomaly_scores : numpy array of shape (n_samples,)
The anomaly score of the input samples.
"""
X = check_array(X)
        # Since we already have a train set, we only want to partially
        # concatenate the test set to the train set. This is done by
        # splitting the test set into blocks and concatenating each block
        # to the train set separately.
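        # Illustrative example: with 25 test samples and
        # blocksize_test_data=10, the blocks are rows [0:10], [10:20] and
        # [20:25]; each block is scored jointly with the train set and only
        # the scores of the test rows are kept.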
if hasattr(self, 'X_train'):
N = int(X.shape[0] / self.blocksize_test_data) + 1
scores = []
for i in range(N):
if (self.verbose == 1):
print("Test block {}/{}".format(i, N))
X_block_i = np.copy(X[i * self.blocksize_test_data: (
i + 1) * self.blocksize_test_data])
if (X_block_i.shape[0] >= 1):
original_size_i = X_block_i.shape[0]
# Concatenate train set with part of the test set
X_i = np.concatenate((self.X_train, X_block_i), axis=0)
if self.preprocessing:
# Scale concatenated data
X_i_norm = self.scaler_.transform(X_i)
else:
X_i_norm = np.copy(X_i)
scores_i = self._decision_function(X_i_norm)
scores_i = scores_i[-original_size_i:]
scores.extend(list(scores_i))
scores = np.array(scores)
return scores
else:
if self.preprocessing:
# Scale train set
X_norm = self.scaler_.transform(X)
else:
X_norm = np.copy(X)
scores = self._decision_function(X_norm)
return scores
def _decision_function(self, X_norm):
A = self.elastic_net_subspace_clustering(
X_norm, gamma=self.gamma,
gamma_nz=self.gamma_nz,
tau=self.tau,
algorithm=self.algorithm,
fit_intercept_LR=self.fit_intercept_LR,
active_support=self.active_support,
n_nonzero=self.n_nonzero,
maxiter_lasso=self.maxiter_lasso,
active_support_params={
'support_init': self.support_init,
'support_size': self.support_size,
'maxiter': self.maxiter}
)
self.transition_matrix_ = normalize(np.abs(A.toarray()), norm='l1')
pi = np.ones((1, len(self.transition_matrix_)), dtype='float64') / len(
self.transition_matrix_)
pi_bar = np.zeros((1, len(self.transition_matrix_)), dtype='float64')
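        # Perform a random walk on the R-graph: pi starts as the uniform
        # distribution over all samples and is repeatedly multiplied by the
        # row-normalized transition matrix; pi_bar accumulates the average
        # visiting probability over `transition_steps` steps. Inliers lie in
        # well-connected subspaces and are visited often, whereas outliers
        # receive little probability mass.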
# Do transition steps
for _ in range(self.transition_steps):
pi = pi @ self.transition_matrix_
pi_bar += pi
pi_bar /= self.transition_steps
scores = pi_bar[0]
        # Smaller average visiting probabilities correspond to outliers, so
        # we negate the scores such that higher scores are associated with
        # outliers.
        scores = -1 * scores
return scores