# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.tokenizer._tokenize.

_Tokenizer base class
"""

from collections import Counter, defaultdict
from math import exp, log1p, log2
from typing import (
    Any,
    Callable,
    Counter as TCounter,
    DefaultDict,
    List,
    Optional,
    Set,
    Union,
    cast,
)

__all__ = ['_Tokenizer']


class _Tokenizer:
    """Abstract _Tokenizer class.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        scaler: Optional[Union[str, Callable[[float], float]]] = None,
        *args: Any,
        **kwargs: Any
    ) -> None:
        """Initialize Tokenizer.

        Parameters
        ----------
        scaler : None, str, or function
            A scaling function for the Counter:

                - None : no scaling
                - 'set' : All non-zero values are set to 1.
                - 'length' : Each token has weight equal to its length.
                - 'length-log' : Each token has weight equal to the natural
                  log of 1 plus its length (i.e. log1p(length)).
                - 'length-exp' : Each token has weight equal to e raised to
                  its length.
                - 'entropy' : Weights are scaled to the (log_2) information
                  entropy of each key's frequency.
                - a callable function : The function is applied to each value
                  in the Counter. Some useful functions include math.exp,
                  math.log1p, math.sqrt, and indexes into interesting integer
                  sequences such as the Fibonacci sequence.


        .. versionadded:: 0.4.0

        """
        super().__init__()

        self._scaler = scaler
        self._tokens = defaultdict(int)  # type: DefaultDict[str, float]
        self._string = ''
        self._ordered_tokens = []  # type: List[str]
        self._ordered_weights = []  # type: List[float]

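    # A minimal, illustrative sketch (not executed) of how the scaler options
    # documented above play out with this base class, which treats the whole
    # input as a single token; the values follow from _scale_and_counterize()
    # and get_counter():
    #
    #   >>> _Tokenizer().tokenize('term').get_counter()
    #   Counter({'term': 1})
    #   >>> _Tokenizer(scaler='length').tokenize('term').get_counter()
    #   Counter({'term': 4.0})
    #   >>> from math import log1p
    #   >>> _Tokenizer(scaler=log1p).tokenize('term').get_counter()
    #   Counter({'term': 0.6931471805599453})
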
    def tokenize(self, string: str) -> '_Tokenizer':
        """Tokenize the term and store it.

        The tokenized term is stored as an ordered list and as a defaultdict
        object.

        Parameters
        ----------
        string : str
            The string to tokenize


        .. versionadded:: 0.4.0
        .. versionchanged:: 0.4.1
            Added 'length', 'entropy', and related scalers
        .. versionchanged:: 0.6.0
            Moved scaling & counterizing to separate function

        """
        self._string = string
        self._ordered_tokens = [self._string]
        self._ordered_weights = [1]

        self._scale_and_counterize()
        return self

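    # A hedged sketch of the pattern a concrete subclass is expected to
    # follow: override tokenize() to fill self._ordered_tokens and
    # self._ordered_weights, then delegate to _scale_and_counterize(). The
    # class name below is hypothetical and its body is simplified for
    # illustration only.
    #
    #   class _WhitespaceSplitTokenizer(_Tokenizer):
    #       def tokenize(self, string: str) -> '_Tokenizer':
    #           self._string = string
    #           self._ordered_tokens = string.split()
    #           self._ordered_weights = [1] * len(self._ordered_tokens)
    #           self._scale_and_counterize()
    #           return self
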
    def _scale_and_counterize(self) -> None:
        """Scale the tokens and store them in a defaultdict.

        .. versionadded:: 0.6.0

        """
        if self._scaler in {'SSK', 'length', 'length-log', 'length-exp'}:
            self._tokens = defaultdict(float)
            if cast(str, self._scaler).startswith('length'):
                self._ordered_weights = [len(_) for _ in self._ordered_tokens]
                if self._scaler == 'length-log':
                    self._ordered_weights = [
                        log1p(_) for _ in self._ordered_weights
                    ]
                elif self._scaler == 'length-exp':
                    self._ordered_weights = [
                        exp(_) for _ in self._ordered_weights
                    ]
            for token, weight in zip(
                self._ordered_tokens, self._ordered_weights
            ):
                self._tokens[token] += weight
        elif self._scaler == 'entropy':
            counts = Counter(self._ordered_tokens)
            n = len(self._ordered_tokens)
            self._tokens = defaultdict(float)
            self._tokens.update(
                {
                    key: -(val / n) * log2(val / n)
                    for key, val in counts.items()
                }
            )
            self._ordered_weights = [
                self._tokens[tok] / counts[tok] for tok in self._ordered_tokens
            ]
        else:
            self._tokens = defaultdict(int)
            self._tokens.update(Counter(self._ordered_tokens))

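    # Worked example of the 'entropy' branch above: for ordered tokens
    # ['a', 'a', 'b'], counts are {'a': 2, 'b': 1} and n = 3, so
    #
    #   self._tokens['a'] = -(2/3) * log2(2/3)   # ~0.3900
    #   self._tokens['b'] = -(1/3) * log2(1/3)   # ~0.5283
    #
    # and the per-occurrence weights become ~[0.1950, 0.1950, 0.5283].
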
    def count(self) -> int:
        """Return token count.

        Returns
        -------
        int
            The total count of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.count()
        1


        .. versionadded:: 0.4.0

        """
        return sum(self.get_counter().values())

    def count_unique(self) -> int:
        """Return the number of unique elements.

        Returns
        -------
        int
            The number of unique tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.count_unique()
        1


        .. versionadded:: 0.4.0

        """
        return len(self._tokens)

    def get_counter(self) -> TCounter[str]:
        """Return the tokens as a Counter object.

        Returns
        -------
        Counter
            The Counter of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_counter()
        Counter({'term': 1})


        .. versionadded:: 0.4.0

        """
        if self._scaler == 'set':
            return Counter({key: 1 for key in self._tokens.keys()})
        elif callable(self._scaler):
            return Counter(
                {key: self._scaler(val) for key, val in self._tokens.items()}
            )
        else:
            return Counter(self._tokens)

    def get_set(self) -> Set[str]:
        """Return the unique tokens as a set.

        Returns
        -------
        set
            The set of tokens

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_set()
        {'term'}


        .. versionadded:: 0.4.0

        """
        return set(self._tokens.keys())

    def get_list(self) -> List[str]:
        """Return the tokens as an ordered list.

        Returns
        -------
        list
            The list of tokens in the order they were added.

        Examples
        --------
        >>> tok = _Tokenizer().tokenize('term')
        >>> tok.get_list()
        ['term']


        .. versionadded:: 0.4.0

        """
        return self._ordered_tokens

    def __repr__(self) -> str:
        """Return representation of tokens object.

        .. versionadded:: 0.4.0

        """
        # A plain dict keeps the repr independent of the default factory.
        return '{}({})'.format(self.__class__.__name__, dict(self._tokens))

    def __and__(self, other: '_Tokenizer') -> TCounter[str]:
        """Return intersection with other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() & other.get_counter()

    def __add__(self, other: '_Tokenizer') -> TCounter[str]:
        """Return union with other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() + other.get_counter()

    def __sub__(self, other: '_Tokenizer') -> TCounter[str]:
        """Return difference from other tokens.

        .. versionadded:: 0.4.0

        """
        return self.get_counter() - other.get_counter()

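    # Illustrative (not executed) use of the operator overloads above; the
    # results follow from Counter's &, +, and - semantics:
    #
    #   >>> a = _Tokenizer().tokenize('term')
    #   >>> b = _Tokenizer().tokenize('term')
    #   >>> a & b
    #   Counter({'term': 1})
    #   >>> a + b
    #   Counter({'term': 2})
    #   >>> a - b
    #   Counter()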

if __name__ == '__main__':
    import doctest

    doctest.testmod()