# yzhao062/Pyod: pyod/models/thresholds.py
def AUCP(**kwargs):
    """AUCP class for Area Under Curve Precentage thresholder.

       Use the area under the curve to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond where the auc of the kde is less
       than the (mean + abs(mean-median)) percent of the total kde auc.
    """

    from pythresh.thresholds.aucp import AUCP as AUCP_thres
    return AUCP_thres(**kwargs)
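

# A minimal usage sketch (illustrative, not part of the public API): recent
# PyOD releases accept a thresholder instance in place of a float
# ``contamination`` value, so any wrapper in this module can be handed to a
# detector directly. The detector choice and training data are assumptions.
def _example_contamination_usage(X_train):  # pragma: no cover - illustration
    from pyod.models.knn import KNN

    clf = KNN(contamination=AUCP())  # threshold decision scores with AUCP
    clf.fit(X_train)                 # X_train: (n_samples, n_features) array
    return clf.labels_               # binary labels: 0 = inlier, 1 = outlier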


def BOOT(**kwargs):
    """BOOT class for Bootstrapping thresholder.

       Use a bootstrapping based method to find a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond the mean of the confidence intervals.
   
       Parameters
       ----------
       
       random_state : int, optional (default=1234)
            Random seed for bootstrapping a confidence interval. Can also be set to None.

    """

    from pythresh.thresholds.boot import BOOT as BOOT_thres
    return BOOT_thres(**kwargs)
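

# Standalone sketch (illustrative): each wrapper forwards its keyword
# arguments unchanged to the underlying pythresh class, and pythresh
# thresholders expose an eval() method that maps an array of decision scores
# to binary labels. The scores below are assumed to come from a fitted
# detector's decision_scores_.
def _example_boot_eval(decision_scores):  # pragma: no cover - illustration
    thres = BOOT(random_state=42)       # random_state forwarded to pythresh
    return thres.eval(decision_scores)  # 0 = inlier, 1 = outlier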


def CHAU(**kwargs):
    """CHAU class for Chauvenet's criterion thresholder.

       Use Chauvenet's criterion to evaluate a non-parametric
       means to threshold scores generated by the decision_scores
       where outliers are set to any value below Chauvenet's
       criterion.

       Parameters
       ----------

       method : {'mean', 'median', 'gmean'}, optional (default='mean')
            Calculate the area normal to distance using a scaler
       
            - 'mean':  Construct a scaler with the mean of the scores
            - 'median': Construct a scaler with the median of the scores
            - 'gmean': Construct a scaler with the geometric mean of the scores
    """

    from pythresh.thresholds.chau import CHAU as CHAU_thres
    return CHAU_thres(**kwargs)


def CLF(**kwargs):
    """CLF class for Trained Classifier thresholder.

       Use the trained linear classifier to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond 0.

       Parameters
       ----------

       method : {'simple', 'complex'}, optional (default='complex')
            Type of linear model

            - 'simple':  Uses only the scores
            - 'complex': Uses the scores, log of the scores, and the scores' PDF
    """

    from pythresh.thresholds.clf import CLF as CLF_thres
    return CLF_thres(**kwargs)


def CLUST(**kwargs):
    """CLUST class for clustering type thresholders.

       Use the clustering methods to evaluate a non-parametric means to
       threshold scores generated by the decision_scores where outliers
       are set to any value not labelled as part of the main cluster.

       Parameters
       ----------
        
       method : {'agg', 'birch', 'bang', 'bgm', 'bsas', 'dbscan', 'ema', 'kmeans', 'mbsas', 'mshift', 'optics', 'somsc', 'spec', 'xmeans'}, optional (default='spec')
            Clustering method
        
            - 'agg':    Agglomerative
            - 'birch':  Balanced Iterative Reducing and Clustering using Hierarchies
            - 'bang':   BANG
            - 'bgm':    Bayesian Gaussian Mixture
            - 'bsas':   Basic Sequential Algorithmic Scheme
            - 'dbscan': Density-based spatial clustering of applications with noise
            - 'ema':    Expectation-Maximization clustering algorithm for Gaussian Mixture Model
            - 'kmeans': K-means
            - 'mbsas':  Modified Basic Sequential Algorithmic Scheme
            - 'mshift': Mean shift
            - 'optics': Ordering Points To Identify Clustering Structure
            - 'somsc':  Self-organized feature map 
            - 'spec':   Clustering to a projection of the normalized Laplacian 
            - 'xmeans': X-means
            
       random_state : int, optional (default=1234)
            Random seed for the BayesianGaussianMixture clustering (method='bgm'). Can
            also be set to None.
    """

    from pythresh.thresholds.clust import CLUST as CLUST_thres
    return CLUST_thres(**kwargs)
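

# Sketch (illustrative): select a specific clustering backend and fix the
# seed; both keyword arguments are forwarded to pythresh's CLUST as listed
# above. The IForest detector is an assumed example.
def _example_clust_usage(X_train):  # pragma: no cover - illustration
    from pyod.models.iforest import IForest

    clf = IForest(contamination=CLUST(method='dbscan', random_state=42))
    clf.fit(X_train)
    return clf.labels_  # binary labels thresholded by the main cluster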


def CPD(**kwargs):
    """CPD class for Change Point Detection thresholder.

       Use change point detection to find a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond the detected change point.

       Parameters
       ----------

       method : {'Dynp', 'KernelCPD', 'Binseg', 'BottomUp'}, optional (default='Dynp')
            Method for change point detection

            - 'Dynp':      Dynamic programming (optimal minimum sum of errors per partition)
            - 'KernelCPD': RBF kernel function (optimal minimum sum of errors per partition)
            - 'Binseg':    Binary segmentation
            - 'BottomUp':  Bottom-up segmentation

       transform : {'cdf', 'kde'}, optional (default='cdf')
            Data transformation method prior to fit

            - 'cdf': Use the cumulative distribution function
            - 'kde': Use the kernel density estimation
            
    """

    from pythresh.thresholds.cpd import CPD as CPD_thres
    return CPD_thres(**kwargs)
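

# Sketch (illustrative): configure the change point method and the data
# transform documented above; any unspecified options keep pythresh defaults.
def _example_cpd_config():  # pragma: no cover - illustration
    # binary segmentation on the kernel density estimate of the scores
    return CPD(method='Binseg', transform='kde')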


def DECOMP(**kwargs):
    """DECOMP class for Decomposition based thresholders.

       Use decomposition to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond the maximum of the decomposed
       matrix that results from decomposing the cumulative distribution
       function of the decision scores.
       
       Parameters
       ----------

       method : {'NMF', 'PCA', 'GRP', 'SRP'}, optional (default='PCA')
            Method to use for decomposition

            - 'NMF':  Non-Negative Matrix Factorization
            - 'PCA':  Principal Component Analysis
            - 'GRP':  Gaussian Random Projection
            - 'SRP':  Sparse Random Projection
            
       random_state : int, optional (default=1234)
            Random seed for the decomposition algorithm. Can also be set to None.

    """

    from pythresh.thresholds.decomp import DECOMP as DECOMP_thres
    return DECOMP_thres(**kwargs)


def DSN(**kwargs):
    """DSN class for Distance Shift from Normal thresholder.

       Use the distance shift from normal to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond the distance calculated by the selected
       metric.
       
       Parameters
       ----------

       metric : {'JS', 'WS', 'ENG', 'BHT', 'HLL', 'HI', 'LK', 'LP', 'MAH', 'TMT', 'RES', 'KS', 'INT', 'MMD'}, optional (default='MAH')
            Metric to use for distance computation
        
            - 'JS':  Jensen-Shannon distance
            - 'WS':  Wasserstein or Earth Movers distance
            - 'ENG': Energy distance
            - 'BHT': Bhattacharyya distance
            - 'HLL': Hellinger distance
            - 'HI':  Histogram intersection distance
            - 'LK':  Lukaszyk-Karmowski metric for normal distributions
            - 'LP':  Levy-Prokhorov metric
            - 'MAH': Mahalanobis distance
            - 'TMT': Tanimoto distance
            - 'RES': Studentized residual distance
            - 'KS':  Kolmogorov-Smirnov distance
            - 'INT': Weighted spline interpolated distance
            - 'MMD': Maximum Mean Discrepancy distance
            
       random_state : int, optional (default=1234)
            Random seed for the normal distribution. Can also be set to None.

    """

    from pythresh.thresholds.dsn import DSN as DSN_thres
    return DSN_thres(**kwargs)


def EB(**kwargs):
    """EB class for Elliptical Boundary thresholder.

       Use pseudo-random elliptical boundaries to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond a pseudo-random elliptical boundary set
       between inliers and outliers.
    """

    from pythresh.thresholds.eb import EB as EB_thres
    return EB_thres(**kwargs)


def FGD(**kwargs):
    """FGD class for Fixed Gradient Descent thresholder.

       Use the fixed gradient descent to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond where the first derivative of the kde
       with respect to the decision scores passes the mean of the first 
       and second inflection points.
    """

    from pythresh.thresholds.fgd import FGD as FGD_thres
    return FGD_thres(**kwargs)


def FILTER(**kwargs):
    """FILTER class for Filtering based thresholders.

       Use the filtering based methods to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond the maximum filter value.
       See :cite:`hashemi2019filter` for details.
       
       Parameters
       ----------

       method : {'gaussian', 'savgol', 'hilbert', 'wiener', 'medfilt', 'decimate', 'detrend', 'resample'}, optional (default='savgol')
            Method to filter the scores
       
            - 'gaussian': use a gaussian based filter
            - 'savgol':   use the savgol based filter
            - 'hilbert':  use the hilbert based filter
            - 'wiener':   use the wiener based filter
            - 'medfilt':  use a median based filter
            - 'decimate': use a decimate based filter
            - 'detrend':  use a detrend based filter
            - 'resample': use a resampling based filter
        

       sigma : int, optional (default='auto') 
            Variable specific to each filter type, default sets sigma to len(scores)*np.std(scores)
       
            - 'gaussian': standard deviation for Gaussian kernel
            - 'savgol':   savgol filter window size
            - 'hilbert':  number of Fourier components
            - 'medfilt':  kernel size
            - 'decimate': downsampling factor
            - 'detrend':  number of break points
            - 'resample': resampling window size
    """

    from pythresh.thresholds.filter import FILTER as FILTER_thres
    return FILTER_thres(**kwargs)
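

# Sketch (illustrative): override the 'auto' sigma with an explicit filter
# parameter; its meaning depends on the chosen method (window size for
# 'savgol', kernel size for 'medfilt', etc.), as documented above.
def _example_filter_config():  # pragma: no cover - illustration
    return FILTER(method='savgol', sigma=15)  # savgol window of 15 samples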


def FWFM(**kwargs):
    """FWFM class for Full Width at Full Minimum thresholder.

       Use the full width at full minimum (aka base width) to evaluate
       a non-parametric means to threshold scores generated by the
       decision_scores where outliers are set to any value beyond the base
       width.
    """

    from pythresh.thresholds.fwfm import FWFM as FWFM_thres
    return FWFM_thres(**kwargs)


def GESD(**kwargs):
    """GESD class for Generalized Extreme Studentized Deviate thresholder.

       Use the generalized extreme studentized deviate to evaluate a
       non-parametric means to threshold scores generated by the decision_scores
       where outliers are set to any value beyond the smallest detected outlier.
       
       Parameters
       ----------

       max_outliers : int, optional (default='auto')
            maximum number of outliers that the dataset may have. Default sets
            max_outliers to be half the size of the dataset

       alpha : float, optional (default=0.05)
            significance level
    """

    from pythresh.thresholds.gesd import GESD as GESD_thres
    return GESD_thres(**kwargs)


def HIST(**kwargs):
    """HIST class for Histogram based thresholders.

       Use histograms methods as described in scikit-image.filters to
       evaluate a non-parametric means to threshold scores generated by
       the decision_scores where outliers are set by histogram generated
       thresholds depending on the selected methods. 
       
       Parameters
       ----------
       
       nbins : int, optional (default='auto')
            Number of bins to use in the histogram, default set to int(len(scores)**0.7)
               
       method : {'otsu', 'yen', 'isodata', 'li', 'minimum', 'triangle'}, optional (default='triangle')
            Histogram filtering based method
            
            - 'otsu':     OTSU's method for filtering
            - 'yen':      Yen's method for filtering
            - 'isodata':  Ridler-Calvard or inter-means method for filtering
            - 'li':       Li's iterative Minimum Cross Entropy method for filtering
            - 'minimum':  Minimum between two maxima via smoothing method for filtering
            - 'triangle': Triangle algorithm method for filtering
    """

    from pythresh.thresholds.hist import HIST as HIST_thres
    return HIST_thres(**kwargs)
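

# Sketch (illustrative): the 'auto' bin count can be replaced with an explicit
# integer, and any of the scikit-image style filters above can be selected.
def _example_hist_config():  # pragma: no cover - illustration
    return HIST(nbins=50, method='otsu')  # 50 bins, OTSU filtering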


def IQR(**kwargs):
    """IQR class for Inter-Qaurtile Region thresholder.

       Use the inter-quartile region to evaluate a non-parametric
       means to threshold scores generated by the decision_scores
       where outliers are set to any value beyond the third quartile
       plus 1.5 times the inter-quartile region.
    """

    from pythresh.thresholds.iqr import IQR as IQR_thres
    return IQR_thres(**kwargs)


def KARCH(**kwargs):
    """KARCH class for Riemannian Center of Mass thresholder.

       Use the Karcher mean (Riemannian Center of Mass) to evaluate a
       non-parametric means to threshold scores generated by the
       decision_scores where outliers are set to any value beyond the
       Karcher mean + one standard deviation of the decision_scores.
       
       Parameters
       ----------

       ndim : int, optional (default=2)
            Number of dimensions to construct the Euclidean manifold

       method : {'simple', 'complex'}, optional (default='complex')
            Method for computing the Karcher mean
       
            - 'simple':  Compute the Karcher mean using the 1D array of scores
            - 'complex': Compute the Karcher mean between a 2D array dot product of the scores and the sorted scores arrays
    """

    from pythresh.thresholds.karch import KARCH as KARCH_thres
    return KARCH_thres(**kwargs)


def MAD(**kwargs):
    """MAD class for Median Absolute Deviation thresholder.

       Use the median absolute deviation to evaluate a non-parametric
       means to threshold scores generated by the decision_scores
       where outliers are set to any value beyond the mean plus the 
       median absolute deviation over the standard deviation.
    """

    from pythresh.thresholds.mad import MAD as MAD_thres
    return MAD_thres(**kwargs)


def MCST(**kwargs):
    """MCST class for Monte Carlo Shapiro Tests thresholder.

       Use uniform random sampling and statistical testing to evaluate a
       non-parametric means to threshold scores generated by the decision_scores
       where outliers are set to any value beyond the minimum value left after
       iterative Shapiro-Wilk tests have occurred. Note: accuracy decreases with
       array size; for good results the array should contain fewer than 1000
       scores. However, this thresholding method may still fail at any array size.
       
       Parameters
       ----------

       random_state : int, optional (default=1234)
            Random seed for the uniform distribution. Can also be set to None.

    """

    from pythresh.thresholds.mcst import MCST as MCST_thres
    return MCST_thres(**kwargs)


def META(**kwargs):
    """META class for Meta-modelling thresholder.

       Use a trained meta-model to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set based on the trained meta-model classifier. 
       
       Parameters
       ----------

       method : {'LIN', 'GNB', 'GNBC', 'GNBM'}, optional (default='GNBM')
           Meta-model to apply

           - 'LIN':  RidgeCV trained linear classifier meta-model on true labels
           - 'GNB':  Gaussian Naive Bayes trained classifier meta-model on true labels
           - 'GNBC': Gaussian Naive Bayes trained classifier meta-model on best contamination
           - 'GNBM': Gaussian Naive Bayes multivariate trained classifier meta-model

    """

    from pythresh.thresholds.meta import META as META_thres
    return META_thres(**kwargs)


def MOLL(**kwargs):
    """MOLL class for Friedrichs' mollifier thresholder.

       Use the Friedrichs' mollifier to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond one minus the maximum of the smoothed
       dataset via convolution.
    """

    from pythresh.thresholds.moll import MOLL as MOLL_thres
    return MOLL_thres(**kwargs)


def MTT(**kwargs):
    """MTT class for Modified Thompson Tau test thresholder.

       Use the modified Thompson Tau test to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond the smallest outlier detected by the test.
       
       Parameters
       ----------

       strictness : [1,2,3,4,5], optional (default=4)
            Level of strictness corresponding to the t-Student distribution map to sample
    """
    from pythresh.thresholds.mtt import MTT as MTT_thres
    return MTT_thres(**kwargs)


def OCSVM(**kwargs):
    """OCSVM class for One-Class Support Vector Machine thresholder.

       Use a one-class svm to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are determined by the one-class svm using a polynomial kernel
       with the polynomial degree either set or determined by regression
       internally.
       
       Parameters
       ----------
       
       model : {'poly', 'sgd'}, optional (default='sgd')
           OCSVM model to apply

           - 'poly':  Use a polynomial kernel with a regular OCSVM
           - 'sgd':   Use the Additive Chi2 kernel approximation with an SGDOneClassSVM

       degree : int, optional (default='auto')
           Polynomial degree to use for the one-class svm.
           Default 'auto' finds the optimal degree with linear regression

       gamma : float, optional (default='auto')
           Kernel coefficient for polynomial fit for the one-class svm.
           Default 'auto' uses 1 / n_features

       criterion : {'aic', 'bic'}, optional (default='bic')
           regression performance metric. AIC is the Akaike Information Criterion,
           and BIC is the Bayesian Information Criterion. This only applies
           when degree is set to 'auto'

       nu : float, optional (default='auto')
           An upper bound on the fraction of training errors and a lower bound
           of the fraction of support vectors. Default 'auto' sets nu to the
           number of points that are less than or equal to the median plus the
           absolute difference between the mean and the geometric mean, divided
           by the number of points in the entire dataset

       tol : float, optional (default=1e-3)
           The stopping criterion for the one-class svm
           
       random_state : int, optional (default=1234)
           Random seed for the SVM's data sampling. Can also be set to None.

    """

    from pythresh.thresholds.ocsvm import OCSVM as OCSVM_thres
    return OCSVM_thres(**kwargs)
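

# Sketch (illustrative): the 'auto' defaults above can be overridden
# explicitly, e.g. forcing the plain polynomial-kernel OCSVM with a fixed
# degree instead of the regression-selected one.
def _example_ocsvm_config():  # pragma: no cover - illustration
    return OCSVM(model='poly', degree=3, criterion='bic', random_state=42)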


def QMCD(**kwargs):
    """QMCD class for Quasi-Monte Carlo Discreprancy thresholder.

       Use the quasi-Monte Carlo discreprancy to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond and percentile or quantile of one minus the
       descreperancy (Note** A discrepancy quantifies the distance between the
       continuous uniform distribution on a hypercube and the discrete uniform
       distribution on distinct sample points).
       
       Parameters
       ----------

       method : {'CD', 'WD', 'MD', 'L2-star'}, optional (default='WD')
            Type of discrepancy
        
            - 'CD':      Centered Discrepancy
            - 'WD':      Wrap-around Discrepancy
            - 'MD':      Mix between CD/WD
            - 'L2-star': L2-star discrepancy

       lim : {'Q', 'P'}, optional (default='P')
            Filtering method to threshold scores using 1 - discrepancy
        
            - 'Q': Use quantile limiting
            - 'P': Use percentile limiting
    """

    from pythresh.thresholds.qmcd import QMCD as QMCD_thres
    return QMCD_thres(**kwargs)


def REGR(**kwargs):
    """REGR class for Regression based thresholder.

       Use the regression to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond the y-intercept value of the linear fit.
       
       Parameters
       ----------

       method : {'siegel', 'theil'}, optional (default='siegel')
            Regression based method to calculate the y-intercept
            
            - 'siegel': implements a method for robust linear regression using repeated medians
            - 'theil':  implements a method for robust linear regression using paired values
            
       random_state : int, optional (default=1234)
            Random seed for the normal distribution. Can also be set to None.

    """

    from pythresh.thresholds.regr import REGR as REGR_thres
    return REGR_thres(**kwargs)


def VAE(**kwargs):
    """VAE class for Variational AutoEncoder thresholder.

       Use a VAE to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond the maximum minus the minimum of the
       reconstructed distribution probabilities after encoding.

       Parameters
       ----------

       verbose : bool, optional (default=False)
            display training progress

       device : str, optional (default='cpu')
            device for pytorch

       latent_dims : int, optional (default='auto')
            number of latent dimensions the encoder will map the scores to.
            Default 'auto' applies automatic dimensionality selection using
            a profile likelihood.

       random_state : int, optional (default=1234)
            random seed for the normal distribution. Can also be set to None

       epochs : int, optional (default=100)
            number of epochs to train the VAE

       batch_size : int, optional (default=64)
            batch size for the dataloader during training

       loss : str, optional (default='kl')
            Loss function during training

            - 'kl' : use the combined negative log likelihood and Kullback-Leibler divergence
            - 'mmd': use the combined negative log likelihood and maximum mean discrepancy

       Attributes
       ----------

       thresh_ : threshold value that separates inliers from outliers
    """

    from pythresh.thresholds.vae import VAE as VAE_thres
    return VAE_thres(**kwargs)
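

# Sketch (illustrative): the VAE thresholder is configured like the others and
# exposes a thresh_ attribute (documented above) after evaluation. The scores
# passed in are assumed to come from a fitted detector's decision_scores_.
def _example_vae_usage(decision_scores):  # pragma: no cover - illustration
    thres = VAE(device='cpu', epochs=50, batch_size=32, loss='kl')
    labels = thres.eval(decision_scores)  # binary labels from the VAE fit
    return labels, thres.thresh_          # learned threshold on the scores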


def WIND(**kwargs):
    """WIND class for topological Winding number thresholder.

       Use the topological winding number (with respect to the origin) to
       evaluate a non-parametric means to threshold scores generated by
       the decision_scores where outliers are set to any value beyond the
       mean intersection point calculated from the winding number.
       
       Parameters
       ----------

       random_state : int, optional (default=1234)
            Random seed for the normal distribution. Can also be set to None.

    """

    from pythresh.thresholds.wind import WIND as WIND_thres
    return WIND_thres(**kwargs)


def YJ(**kwargs):
    """YJ class for Yeo-Johnson transformation thresholder.

       Use the Yeo-Johnson transformation to evaluate
       a non-parametric means to threshold scores generated by the
       decision_scores where outliers are set to any value beyond the
       max value in the YJ transformed data.
    """

    from pythresh.thresholds.yj import YJ as YJ_thres
    return YJ_thres(**kwargs)


def ZSCORE(**kwargs):
    """ZSCORE class for ZSCORE thresholder.

       Use the zscore to evaluate a non-parametric means to threshold
       scores generated by the decision_scores where outliers are set
       to any value beyond a zscore of one.
    """

    from pythresh.thresholds.zscore import ZSCORE as ZSCORE_thres
    return ZSCORE_thres(**kwargs)