pysprint/core/bases/algorithms.py from Ptrskay3/PySprint

pysprint/core/bases/algorithms.py
Summary

Maintainability

0 mins
Test Coverage

Issues
import numpy as np


def longest_common_subsequence(x1, y1, x2, y2, tol=None):
    """
    Given two datasets with x-y values, find the longest common
    subsequence of them including a small threshold which might be
    present due to numerical errors.
    This function is mainly used when two datasets's y values need
    to be multiplied together, but their domains are slightly off.

    Parameters
    ----------
    x1 : np.ndarray-like
        The x values for the original array
    y1 : np.ndarray-like
        The y values for the original array
    x2 : np.ndarray-like
        The x values for the second array
    y2 : np.ndarray-like
        The y values for the second array
    tol : float, optional
        The tolerance which determines how big difference is allowed
        _between x values to interpret them as the same datapoint.

    Returns
    -------
    longest_x1: np.ndarray
        The x values of longest common subsequence in x1.
    longest_y1: np.ndarray
        The y values of longest common subsequence in y1.
    longest_x2: np.ndarray
        The x values of longest common subsequence in x2.
    longest_y2: np.ndarray
        The y values of longest common subsequence in y2.
    """
    # sort both datasets first to maintain order
    idx1 = np.argsort(x1)
    x1, y1 = x1[idx1], y1[idx1]
    idx2 = np.argsort(x2)
    x2, y2 = x2[idx2], y2[idx2]

    if tol is None:
        tol = np.max(np.diff(x2))

    dist = np.abs(x1 - x2[:, None])
    i1 = np.argmin(dist, axis=1)
    i2 = np.flatnonzero(dist[np.arange(x2.size), i1] < tol)
    i1 = i1[i2]

    mask = (np.diff(i1) == 1) & (np.diff(i2) == 1)

    # smear the mask to include both endpoints
    mask = np.r_[False, mask] | np.r_[mask, False]

    # pad the mask to ensure proper indexing and find the changeover points
    locs = np.diff(np.r_[False, mask, False])
    inds = np.flatnonzero(locs)
    lengths = inds[1::2] - inds[::2]

    k = np.argmax(lengths)
    start = inds[2 * k]
    stop = inds[2 * k + 1]

    longest_x1 = x1[i1[start:stop]]
    longest_y1 = y1[i1[start:stop]]
    longest_x2 = x2[i2[start:stop]]
    longest_y2 = y2[i2[start:stop]]

    return longest_x1, longest_y1, longest_x2, longest_y2