chrislit/abydos

View on GitHub
abydos/distance/_sokal_sneath_v.py

Summary

Maintainability
A
0 mins
Test Coverage
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._sokal_sneath_v.

Sokal & Sneath V similarity
"""

from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union

from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer

# Names exported by ``from ... import *``: only the public measure class.
__all__ = ['SokalSneathV']


class SokalSneathV(_TokenDistance):
    r"""Sokal & Sneath V similarity.

    Given two sets X and Y drawn from a population N, Sokal & Sneath V
    similarity :cite:`Sokal:1963` is defined as

        .. math::

            sim_{SokalSneathV}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y|}
            {\sqrt{|X| \cdot |Y| \cdot |N \setminus Y| \cdot |N \setminus X|}}

    It is the fifth of the five "Unnamed coefficients" presented in
    :cite:`Sokal:1963`, namely the second "Marginal totals in the
    Denominator" coefficient with "Negative Matches in Numerator Included".
    It is also sometimes referred to as Ochiai II similarity. The
    corresponding "Negative Matches in Numerator Excluded" coefficient is the
    Cosine similarity, :class:`.Cosine`.

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, where
    a+b+c+d=n, the measure is

        .. math::

            sim_{SokalSneathV} =
            \frac{ad}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize SokalSneathV instance.

        All arguments are forwarded unchanged to the :py:class:`_TokenDistance`
        initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type, and thereby the set type:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Supplying this together with
            tokenizer=None makes the instance use the QGram tokenizer with
            this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Sokal & Sneath V similarity of two strings.

        Equal inputs short-circuit to 1.0 and an empty input on either side
        short-circuits to 0.0, in both cases before any tokenization occurs.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Sokal & Sneath V similarity

        Examples
        --------
        >>> cmp = SokalSneathV()
        >>> cmp.sim('cat', 'hat')
        0.4987179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.3635068033537323
        >>> cmp.sim('aluminum', 'Catalan')
        0.11671286273067434
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        if not (src and tar):
            return 0.0

        self._tokenize(src, tar)

        # The four cells of the 2x2 confusion table (a, b, c, d).
        both = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()

        numerator = both * neither
        if not numerator:
            # Also keeps the division below safe: a non-zero numerator means
            # both ``both`` and ``neither`` are non-zero, so no marginal
            # total in the denominator can be zero.
            return 0.0

        marginal_product = (
            (both + src_only)
            * (both + tar_only)
            * (src_only + neither)
            * (tar_only + neither)
        )
        return numerator / marginal_product ** 0.5


if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod

    testmod()