src/tse/tse/tse_datautils.py from cgddrd/CS39440-major-project

src/tse/tse/tse_datautils.py
Summary

Maintainability

6 days
Test Coverage

Issues
"""

Module Name: TSEDataUtils

Description: Utility module providing common data handling and statistical processing functionality
for use with single and multi-dimensional data structures.

"""

from __future__ import division

import numpy as np

__author__ = 'Connor Luke Goddard (clg11@aber.ac.uk)'


class TSEDataUtils(object):
    """
    Collection of static helper functions for common data handling and statistical processing
    over single and multi-dimensional data structures (plain Python collections and 'Numpy' arrays).

    Every helper is a static method, so the class never needs to be instantiated.
    """

    def __init__(self):
        # All helpers are static, therefore there is no instance state to initialise.
        pass

    @staticmethod
    def get_smallest_key_dict(dict):
        """
        Returns the key that is associated with the smallest value held in a dictionary.

        NOTE: The parameter name shadows the 'dict' builtin. It is kept unchanged so that existing keyword callers continue to work.

        :param dict: Dictionary whose values can be compared with one another (e.g. integers).
        :return: The KEY whose value is the smallest (not the value itself). Ties resolve to the first such key encountered.
        :raises ValueError: If the dictionary is empty.
        """
        return min(dict, key=dict.get)

    @staticmethod
    def get_largest_key_dict(dict):
        """
        Returns the key that is associated with the largest value held in a dictionary.

        :param dict: Dictionary whose values can be compared with one another (e.g. integers).
        :return: The KEY whose value is the largest (not the value itself). Ties resolve to the first such key encountered.
        :raises ValueError: If the dictionary is empty.
        """
        return max(dict, key=dict.get)

    @staticmethod
    def get_smallest_key_value_dict(dict):
        """
        Returns the smallest value held in a dictionary.

        :param dict: Dictionary whose values can be compared with one another (e.g. integers).
        :return: The smallest VALUE from the dictionary (i.e. the value stored under the key returned by 'get_smallest_key_dict').
        :raises ValueError: If the dictionary is empty.
        """
        return dict[TSEDataUtils.get_smallest_key_dict(dict)]

    @staticmethod
    def string_2d_list_to_int_2d_list(original_list):
        """
        Converts a 2D list containing String values to a 2D list of integers.

        :param original_list: The 2-dimensional list of String values.
        :return: A 2-dimensional list of Int values.
        """
        # A list comprehension is used rather than 'map', because 'map' returns a lazy iterator (not a list) under Python 3.
        return [TSEDataUtils.convert_list_to_int(row) for row in original_list]

    @staticmethod
    def convert_list_to_int(original_list):
        """
        Converts a 1D list containing String values to a 1D list of integers.

        :param original_list: The list of String values.
        :return: A list of Int values.
        :raises ValueError: If any element cannot be parsed by 'int'.
        """
        # A list comprehension is used rather than 'map', because 'map' returns a lazy iterator (not a list) under Python 3.
        return [int(value) for value in original_list]

    @staticmethod
    def calc_centered_moving_average(values, window):
        """
        Calculates the centered moving average from a collection of Float or Integer values.

        :param values: The original list of Float or Integer values to be averaged.
        :param window: The size of the centered convolution window (i.e. the number of neighbouring elements averaged around the current index).
        :return: A 'Numpy' array, the same length as the longer of the input and the window, holding the averages.
        """
        # Using 'same' mode for the 'numpy.convolve' function will centre the convolution window at each point of the array overlap.

        # WARNING: Boundary effects will be observed at the ends as a result of the arrays not fully overlapping.
        return TSEDataUtils.calc_moving_average(values, window, 'same')

    @staticmethod
    def calc_moving_average(values, window, mode='valid'):
        """
        Calculates the moving average from a collection of Float or Integer values.

        :param values: The original list of Float or Integer values to be averaged.
        :param window: The size of the convolution window (i.e. the number of consecutive elements averaged together). Must be at least 1.
        :param mode: The type of convolution mode for the 'numpy.convolve' function. (Default: 'valid')
        :return: A 'Numpy' array of Float values representing the averages of the original list.
        :raises ValueError: If the window size is smaller than 1.
        """
        # An empty (or negative) window cannot be averaged over; fail with a clear message instead of an opaque 'Numpy' error.
        if window < 1:
            raise ValueError("The moving average window must contain at least one element.")

        # Create the 'window' from which to select the elements to average (each element is weighted equally).
        weights = np.repeat(1.0, window) / window

        # Including 'valid' MODE will REQUIRE there to be enough data points before beginning convolution.
        # 'valid' mode only convolves where points overlap exactly. This is equivalent to a Simple Moving Average.
        # See: http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html
        return np.convolve(values, weights, mode)

    @staticmethod
    def convert_array_to_numpy_array(original_array):
        """
        Converts a Python list to a 'Numpy' array using the 'np.array' convenience function.

        :param original_array: The standard 'Python' list to be converted.
        :return: An equivalent 'Numpy' array ('np.array' copies its input, so the original is never aliased).
        """
        return np.array(original_array)

    # This code has been modified from an original source: http://stackoverflow.com/a/11146645/4768230
    @staticmethod
    def calc_cartesian_product(arrays):
        """
        Calculates the cartesian product of elements between a collection of 1D arrays.

        :param arrays: The collection of 1D arrays of which each element will be combined with each other.
        :return: A 2D 'Numpy' array in which each row is one combination. The result uses the default
                 'np.empty' dtype, so Integer inputs are returned as Floats.
        """
        array_count = len(arrays)

        # One axis per input array, plus a trailing axis holding the members of each combination.
        product = np.empty([len(a) for a in arrays] + [array_count])

        # 'np.ix_' yields one broadcastable index grid per input array; assigning it fills the matching column.
        for column, grid in enumerate(np.ix_(*arrays)):
            product[..., column] = grid

        # Flatten the leading axes so that every row holds a single combination.
        return product.reshape(-1, array_count)

    @staticmethod
    def calc_1d_array_average(data_collection):
        """
        Calculates the mean over a numeric collection (Int or Float) using the 'np.mean' function.

        :param data_collection: The collection of Integers or Floats to be averaged.
        :return: Float holding the average of all the numeric input values.
        """
        # In single precision, mean can be inaccurate. Computing the mean in float64 is more accurate.
        # (http://docs.scipy.org/doc/numpy/reference/generated/numpy.mean.html#numpy.mean)
        return np.mean(data_collection, dtype=np.float64)

    @staticmethod
    def numpy_array_indices_subset(data_collection, indices_list):
        """
        Returns a subset of values from a collection based upon a list of specified indices.

        :param data_collection: The collection of values from which to extract specific values.
        :param indices_list: The collection of indices used to select the element values to extract from the data collection.
        :return: A 'Numpy' array holding the subset of the original data collection defined by the specified indices.
        """
        # Ensure we are dealing with a numpy array before operating (plain lists do not support index-list selection).
        data_collection = TSEDataUtils.convert_array_to_numpy_array(data_collection)
        return data_collection[indices_list]

    # Modified from original source: http://stackoverflow.com/a/11686764
    @staticmethod
    def filter_outliers_mean_stdev(data_collection, stdev_factor=2):
        """
        Filters outliers from a collection of Integers or Floats by removing all values whose distance from the mean
        is not strictly less than 'stdev_factor' standard deviations.

        :param data_collection: The collection of values to be filtered.
        :param stdev_factor: The number of standard deviations from the mean within which a value is retained.
        :return: A 'Numpy' array holding the filtered results.
        """
        # Ensure we are dealing with a numpy array before operating.
        data_collection = TSEDataUtils.convert_array_to_numpy_array(data_collection)

        # Nothing to filter; returning early also avoids the 'Numpy' warning raised by averaging an empty collection.
        if data_collection.size == 0:
            return data_collection.ravel()

        spread = np.std(data_collection)

        if spread == 0:
            # All values are identical, so none of them can be an outlier. Without this guard the strict
            # comparison '0 < 0' below would incorrectly discard every value.
            keep_mask = np.ones(data_collection.shape, dtype=bool)
        else:
            distance_from_mean = np.abs(data_collection - np.mean(data_collection))
            keep_mask = distance_from_mean < stdev_factor * spread

        return data_collection[keep_mask]

    # This is a convenience function for 'TSEDataUtils.filter_outliers_ab_dist_median_indices' to return the actual data.
    @staticmethod
    def filter_outliers_ab_dist_median(data_collection, ab_dist_median_factor=2.):
        """
        Filters outliers from a collection of numeric values by removing all values whose absolute distance from the
        median is not strictly less than 'ab_dist_median_factor' times the median of those absolute distances.

        :param data_collection: The collection of values to be filtered.
        :param ab_dist_median_factor: The multiple of the median absolute distance within which a value is retained.
        :return: A 'Numpy' array holding the filtered results.
        """
        # Ensure we are dealing with a numpy array before operating.
        data_collection = TSEDataUtils.convert_array_to_numpy_array(data_collection)

        retained_indices = TSEDataUtils.filter_outliers_ab_dist_median_indices(data_collection, ab_dist_median_factor)
        return data_collection[retained_indices]

    # Modified from original source: http://stackoverflow.com/a/16562028
    @staticmethod
    def filter_outliers_ab_dist_median_indices(data, ab_dist_median_factor=2.):
        """
        Filters outliers from a collection of numeric values by removing all values whose absolute distance from the
        median is not strictly less than 'ab_dist_median_factor' times the median of those absolute distances.
        Returns the indices of the retained numeric values as opposed to the values themselves.

        :param data: The collection of values to be filtered.
        :param ab_dist_median_factor: The multiple of the median absolute distance within which a value is retained.
        :return: A 'Numpy' array holding the indices of the retained numeric values.
        """
        # Ensure we are dealing with a numpy array before operating.
        data = TSEDataUtils.convert_array_to_numpy_array(data)

        # Nothing to filter; returning early also avoids the 'Numpy' warning raised by taking the median of an empty collection.
        if data.size == 0:
            return np.array([], dtype=np.intp)

        # Absolute distance of every value from the median, and the median of those distances.
        distance_from_median = np.abs(data - np.median(data))
        median_distance = np.median(distance_from_median)

        if median_distance:
            scaled_distance = distance_from_median / median_distance
        else:
            # Prevent a division by zero. A per-element array of zeros is used (rather than a scalar) so that
            # every index is retained; 'np.where' over a scalar would not yield one index per element.
            scaled_distance = np.zeros_like(distance_from_median, dtype=np.float64)

        # 'np.where' returns the indices of the elements that match the mask. See: http://stackoverflow.com/a/9891802
        return np.where(scaled_distance < ab_dist_median_factor)[0]

    @staticmethod
    def extract_tuple_elements_list(tuple_collection, tuple_index):
        """
        Extracts a specified element index from each tuple within a collection.

        :param tuple_collection: The collection of tuples from which values are to be extracted.
        :param tuple_index: The specified index to be extracted from each tuple.
        :return: A list of the extracted values, in the same order as the original collection.
        :raises IndexError: If any tuple is too short to contain the specified index.
        """
        return [element[tuple_index] for element in tuple_collection]

    # Modified from original source: http://stackoverflow.com/a/16158798
    @staticmethod
    def calc_element_wise_average(data_collection):
        """
        Calculates the average across corresponding elements within a collection of numeric arrays.

        :param data_collection: A collection of numeric arrays, e.g.: '[[1, 2, 3], [1, 3, 4], [2, 4, 5]]'
        :return: A single list containing the element-wise averages calculated. Arrays of differing length
                 are truncated to the shortest, because 'zip' stops at the shortest input.
        """
        # 'zip(*...)' groups together the values found at the same position of every array.
        # True division relies on the module-level 'from __future__ import division' under Python 2.
        return [sum(column) / len(column) for column in zip(*data_collection)]