# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import scipy.spatial

from ..stats import density
from .utils_complexity_embedding import complexity_embedding
from .utils import _phi


def complexity_tolerance(
    signal, method="maxApEn", r_range=None, delay=None, dimension=None, show=False
):
    """**Automated selection of tolerance (r)**

    Estimate and select the optimal tolerance (*r*) parameter used by entropy and other
    complexity algorithms.

    Many complexity algorithms are built on the notion of self-similarity and recurrence, and how
    often a system revisits its past states. Considering two states as identical is straightforward
    for discrete systems (e.g., a sequence of ``"A"``, ``"B"`` and ``"C"`` states), but for
    continuous signals, we cannot simply look for when the two numbers are exactly the same.
    Instead, we have to pick a threshold by which to consider two points as similar.

    The tolerance *r* is essentially this threshold value (the numerical difference between two
    similar points that we "tolerate"). This parameter has a critical impact and is a major
    source of inconsistencies in the literature.
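
    For instance, with a tolerance of ``r = 0.5``, the values 0.1 and 0.4 would be counted as
    "similar" (their absolute difference is below *r*), whereas 0.1 and 0.9 would not. A minimal
    sketch of this idea (plain NumPy, for illustration only; not part of this function's API):

    .. ipython:: python

      import numpy as np

      x = np.array([0.1, 0.4, 0.9])
      r = 0.5
      # Pairwise similarity: which pairs of points fall within r of each other?
      np.abs(x[:, None] - x[None, :]) <= r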

    Different methods have been described to estimate the most appropriate tolerance value:

    * ``"sd"`` (as in Standard Deviation): r = 0.2 * standard deviation of the signal will be
      returned.
    * ``"adjusted_sd"``: Adjusted value based on the SD and the dimension. The rationale is that
      the Chebyshev distance (used in various metrics) rises logarithmically with increasing
      dimension. ``0.5627 * np.log(dimension) + 1.3334`` is the logarithmic trend line for the
      Chebyshev distance of vectors sampled from a univariate normal distribution. A constant of
      ``0.1164`` is used so that ``tolerance = 0.2 * SD`` for ``dimension = 2`` (originally in
      https://github.com/CSchoel/nolds). See the sketch after this list for an illustration.
    * ``"maxApEn"``: Different values of tolerance will be tested and the one where the
      approximate entropy (ApEn) is maximized will be selected and returned.
    * ``"recurrence"``: The tolerance that yields a recurrence rate (see ``RQA``) close to 5% will
      be returned.
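
    As an illustration of the ``"adjusted_sd"`` formula above, here is a minimal sketch (plain
    NumPy, mirroring the formula for explanation only):

    .. ipython:: python

      import numpy as np

      x = np.random.normal(size=1000)
      dimension = 2
      # Constant * SD * logarithmic trend of the Chebyshev distance
      0.1164 * np.std(x, ddof=1) * (0.5627 * np.log(dimension) + 1.3334)
      # For dimension = 2, this is approximately the "sd" method's 0.2 * SD
      0.2 * np.std(x, ddof=1)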

    Parameters
    ----------
    signal : Union[list, np.array, pd.Series]
        The signal (i.e., a time series) in the form of a vector of values.
    method : str
        Can be ``"maxApEn"`` (default), ``"sd"`` (or ``"default"``), or ``"recurrence"``.
    r_range : Union[list, int]
        The range of tolerance values (or the number of values) to test. Only used if ``method`` is
        ``"maxApEn"`` or ``"recurrence"``. If ``None`` (default), the default range will be used:
        40 values from ``np.linspace(0.02, 0.8, r_range) * np.std(signal, ddof=1)`` for
        ``"maxApEn"``, and 50 values from ``np.linspace(0, np.max(d), r_range + 1)[1:]`` (where
        ``d`` is the distance matrix) for ``"recurrence"``. You can set a lower number for faster
        results.
    delay : int
        Only used if ``method`` is ``"maxApEn"`` or ``"recurrence"``. See
        :func:`entropy_approximate`.
    dimension : int
        Only used if ``method`` is ``"maxApEn"``, ``"recurrence"``, or ``"adjusted_sd"``. See
        :func:`entropy_approximate`.
    show : bool
        If ``True``, will plot the optimization (e.g., the ApEn values for each tested value of
        *r* when ``method="maxApEn"``).

    See Also
    --------
    complexity, complexity_delay, complexity_dimension, complexity_embedding

    Returns
    ----------
    float
        The optimal tolerance value.
    dict
        A dictionary containing the method used and, for the ``"maxApEn"`` and ``"recurrence"``
        methods, the tested values of r and the corresponding scores.

    Examples
    ----------
    * **Example 1**: The method based on the SD of the signal is fast. The plot shows the
      distribution of the values making up the signal, and the width of the arrow represents the
      chosen ``r`` parameter.

    .. ipython:: python

      import neurokit2 as nk

      # Simulate signal
      signal = nk.signal_simulate(duration=2, frequency=5)

      # Fast method (based on the standard deviation)
      @savefig p_complexity_tolerance1.png scale=100%
      r, info = nk.complexity_tolerance(signal, method = "sd", show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    The dimension can be taken into account:

    .. ipython:: python

      # Adjusted SD
      @savefig p_complexity_tolerance2.png scale=100%
      r, info = nk.complexity_tolerance(signal, method = "adjusted_sd", dimension=3, show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r


    * **Example 2**: The method based on the recurrence rate will display the recurrence rates
      obtained at different tolerance values. The horizontal line indicates the 5% target.

    .. ipython:: python

      @savefig p_complexity_tolerance3.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=1, dimension=10,
                                        method="recurrence", show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    * **Example 3**: The default method selects the tolerance at which *ApEn* is maximized.

    .. ipython:: python

      # Slow method
      @savefig p_complexity_tolerance4.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=8, dimension=6,
                                        method="maxApEn", show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    * **Example 4**: The tolerance values that are tested can be modified to get a more precise
      estimate.

    .. ipython:: python

      # Narrower range
      import numpy as np

      @savefig p_complexity_tolerance5.png scale=100%
      r, info = nk.complexity_tolerance(signal, delay=8, dimension=6, method="maxApEn",
                                        r_range=np.linspace(0.002, 0.8, 30), show=True)
      @suppress
      plt.close()

    .. ipython:: python

      r

    References
    -----------
    * Lu, S., Chen, X., Kanters, J. K., Solomon, I. C., & Chon, K. H. (2008). Automatic selection of
      the threshold value r for approximate entropy. IEEE Transactions on Biomedical Engineering,
      55(8), 1966-1972.
    """
    # If a tolerance value is passed directly (instead of a method name), return it as-is
    if not isinstance(method, str):
        return method, {"Method": "None"}

    # Method
    method = method.lower()
    if method in ["traditional", "sd", "std", "default"]:
        r = 0.2 * np.std(signal, ddof=1)
        info = {"Method": "20% SD"}
    elif method in ["adjusted_sd"] and isinstance(dimension, int):
        if dimension is None:
            raise ValueError("'dimension' cannot be empty for the 'adjusted_sd' method.")
        r = 0.11604738531196232 * np.std(signal, ddof=1) * (0.5627 * np.log(dimension) + 1.3334)
        info = {"Method": "Adjusted 20% SD"}
    elif method in ["maxapen", "optimize"]:
        r, info = _optimize_tolerance_maxapen(
            signal, r_range=r_range, delay=delay, dimension=dimension
        )
        info.update({"Method": "Max ApEn"})
    elif method in ["recurrence", "rqa"]:
        r, info = _optimize_tolerance_recurrence(
            signal, r_range=r_range, delay=delay, dimension=dimension
        )
        info.update({"Method": "5% Recurrence Rate"})
    else:
        raise ValueError("NeuroKit error: complexity_tolerance(): 'method' not recognized.")

    if show is True:
        _optimize_tolerance_plot(r, info, method=method, signal=signal)
    return r, info


# =============================================================================
# Internals
# =============================================================================


def _optimize_tolerance_recurrence(signal, r_range=None, delay=None, dimension=None):

    # Optimize missing parameters
    if delay is None or dimension is None:
        raise ValueError("If method='RQA', both delay and dimension must be specified.")

    # Compute distance matrix
    emb = complexity_embedding(signal, delay=delay, dimension=dimension)
    d = scipy.spatial.distance.cdist(emb, emb, metric="euclidean")

    if r_range is None:
        r_range = 50
    if isinstance(r_range, int):
        r_range = np.linspace(0, np.max(d), r_range + 1)[1:]

    recurrence_rate = np.zeros_like(r_range)
    # Indices of the lower triangular (without the diagonal)
    idx = np.tril_indices(len(d), k=-1)
    n = len(d[idx])
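    # Recurrence rate: proportion of pairwise distances that fall within tolerance r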
    for i, r in enumerate(r_range):
        recurrence_rate[i] = (d[idx] <= r).sum() / n
    # Closest to 0.05 (5%)
    optimal = r_range[np.abs(recurrence_rate - 0.05).argmin()]

    return optimal, {"Values": r_range, "Scores": recurrence_rate}


def _optimize_tolerance_maxapen(signal, r_range=None, delay=None, dimension=None):

    # Optimize missing parameters
    if delay is None or dimension is None:
        raise ValueError("If method='maxApEn', both delay and dimension must be specified.")

    if r_range is None:
        r_range = 40
    if isinstance(r_range, int):
        r_range = np.linspace(0.02, 0.8, r_range) * np.std(signal, ddof=1)

    # Compute ApEn for each candidate tolerance; the value of r that maximizes
    # ApEn is selected (Lu et al., 2008)
    apens = [_entropy_apen(signal, delay=delay, dimension=dimension, tolerance=r) for r in r_range]

    return r_range[np.argmax(apens)], {"Values": r_range, "Scores": np.array(apens)}


def _entropy_apen(signal, delay, dimension, tolerance, **kwargs):

    phi = _phi(
        signal,
        delay=delay,
        dimension=dimension,
        tolerance=tolerance,
        approximate=True,
        **kwargs,
    )

    # ApEn is the (absolute) difference between the phi statistics computed at
    # dimension and dimension + 1
    return np.abs(np.subtract(phi[0], phi[1]))


# =============================================================================
# Plotting
# =============================================================================
def _optimize_tolerance_plot(r, info, ax=None, method="maxApEn", signal=None):

    if ax is None:
        fig, ax = plt.subplots()
    else:
        fig = None

    if method in ["traditional", "sd", "std", "default", "none", "adjusted_sd"]:
        x, y = density(signal)
        arrow_y = np.mean([np.max(y), np.min(y)])
        x_range = np.max(x) - np.min(x)
        ax.plot(x, y, color="#80059c", label="Optimal r: " + str(np.round(r, 3)))
        # Draw two arrows of length r/2 from the center, so the total arrow width equals r
        ax.arrow(
            np.mean(x),
            arrow_y,
            r / 2,
            0,
            head_width=0.01 * x_range,
            head_length=0.01 * x_range,
            linewidth=4,
            color="g",
            length_includes_head=True,
        )
        ax.arrow(
            np.mean(x),
            arrow_y,
            -r / 2,
            0,
            head_width=0.01 * x_range,
            head_length=0.01 * x_range,
            linewidth=4,
            color="g",
            length_includes_head=True,
        )
        ax.set_title("Optimization of Tolerance Threshold (r)")
        ax.set_xlabel("Signal values")
        ax.set_ylabel("Distribution")
        ax.legend(loc="upper right")
        return fig

    r_range = info["Values"]
    y_values = info["Scores"]

    # Custom legend depending on method
    if method in ["maxapen", "optimize"]:
        ylabel = "Approximate Entropy $ApEn$"
        legend = "$ApEn$"
    else:
        ylabel = "Recurrence Rate $RR$"
        legend = "$RR$"
        y_values = y_values * 100  # Convert to percentage (without mutating info["Scores"])
        ax.axhline(y=5, color="grey")  # The 5% target recurrence rate
        ax.yaxis.set_major_formatter(matplotlib.ticker.PercentFormatter())

    ax.set_title("Optimization of Tolerance Threshold (r)")
    ax.set_xlabel("Tolerance threshold $r$")
    ax.set_ylabel(ylabel)
    ax.plot(r_range, y_values, "o-", label=legend, color="#80059c")
    ax.axvline(x=r, color="#E91E63", label="Optimal r: " + str(np.round(r, 3)))
    ax.legend(loc="upper right")

    return fig