DeveloperCAP/MLCAT

View on GitHub
lib/util/preprocess.py

Summary

Maintainability
A
0 mins
Test Coverage
import numpy as np


def is_outlier(points, thresh=3.5):
    """
    Returns a boolean array with True if points are outliers and False otherwise.

    :param points: An numobservations by numdimensions array of observations.
    :param thresh: The modified z-score to use as a threshold. Observations with a modified z-score (based on the median absolute deviation) greater than this value will be classified as outliers.
    :return: A numobservations-length boolean array.

    .. note:: Reference: Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and Handle Outliers", The ASQC Basic References in Quality Control:Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
    """
    if len(points.shape) == 1:
        return np.array([False], np.bool)
    else:
        median = np.median(points, axis=0)
        diff = np.sum((points - median)**2, axis=-1)
        diff = np.sqrt(diff)
        med_abs_deviation = np.median(diff)

        modified_z_score = 0.6745 * diff / med_abs_deviation

        return modified_z_score > thresh