
# View on GitHub
#
# 3 hrs
# Test Coverage
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.


"""QGrams multi-set class."""

from collections.abc import Iterable
from typing import Callable, Iterable as TIterable, Optional, Union, cast

from ._tokenizer import _Tokenizer

__all__ = ['QGrams']

class QGrams(_Tokenizer):
    """A q-gram class, which functions like a bag/multiset.

    A q-gram is here defined as all sequences of q characters. Q-grams are also
    known as k-grams and n-grams, but the term n-gram more typically refers to
    sequences of whitespace-delimited words in a string, where q-gram refers
    to sequences of characters in a word or string.

    .. versionadded:: 0.1.0
    """

    def __init__(
        self,
        qval: Union[int, TIterable[int]] = 2,
        start_stop: str = '$#',
        skip: Union[int, TIterable[int]] = 0,
        scaler: Optional[Union[str, Callable[[float], float]]] = None,
    ) -> None:
        """Initialize QGrams.

        Parameters
        ----------
        qval : int or Iterable
            The q-gram length (defaults to 2), can be an integer, range object,
            or list
        start_stop : str
            A string of length >= 0 indicating start & stop symbols.
            If the string is '', q-grams will be calculated without start &
            stop symbols appended to each end.
            Otherwise, the first character of start_stop will pad the
            beginning of the string and the last character of start_stop
            will pad the end of the string before q-grams are calculated.
            (In the case that start_stop is only 1 character long, the same
            symbol will be used for both.)
        skip : int or Iterable
            The number of characters to skip, can be an integer, range object,
            or list
        scaler : None, str, or function
            A scaling function for the Counter:

                - None : no scaling
                - 'set' : All non-zero values are set to 1.
                - 'length' : Each token has weight equal to its length.
                - 'length-log' : Each token has weight equal to the log of its
                   length + 1.
                - 'length-exp' : Each token has weight equal to e raised to its
                   length.
                - a callable function : The function is applied to each value
                  in the Counter. Some useful functions include math.exp,
                  math.log1p, math.sqrt, and indexes into interesting integer
                  sequences such as the Fibonacci sequence.

        Raises
        ------
        ValueError
            Use WhitespaceTokenizer instead of qval=0.

        Examples
        --------
        >>> qg = QGrams().tokenize('AATTATAT')
        >>> qg
        QGrams({'$A': 1, 'AA': 1, 'AT': 3, 'TT': 1, 'TA': 2, 'T#': 1})

        >>> qg = QGrams(qval=1, start_stop='').tokenize('AATTATAT')
        >>> qg
        QGrams({'A': 4, 'T': 4})

        >>> qg = QGrams(qval=3, start_stop='').tokenize('AATTATAT')
        >>> qg
        QGrams({'AAT': 1, 'ATT': 1, 'TTA': 1, 'TAT': 2, 'ATA': 1})

        >>> QGrams(qval=2, start_stop='$#').tokenize('interning')
        QGrams({'$i': 1, 'in': 2, 'nt': 1, 'te': 1, 'er': 1, 'rn': 1, 'ni': 1,
        'ng': 1, 'g#': 1})

        >>> QGrams(start_stop='', skip=1).tokenize('AACTAGAAC')
        QGrams({'AC': 2, 'AT': 1, 'CA': 1, 'TG': 1, 'AA': 1, 'GA': 1, 'A': 1})

        >>> QGrams(start_stop='', skip=[0, 1]).tokenize('AACTAGAAC')
        QGrams({'AA': 3, 'AC': 4, 'CT': 1, 'TA': 1, 'AG': 1, 'GA': 2, 'AT': 1,
        'CA': 1, 'TG': 1, 'A': 1})

        >>> QGrams(qval=range(3), skip=[0, 1]).tokenize('interdisciplinarian')
        QGrams({'i': 10, 'n': 7, 't': 2, 'e': 2, 'r': 4, 'd': 2, 's': 2,
        'c': 2, 'p': 2, 'l': 2, 'a': 4, '$i': 1, 'in': 3, 'nt': 1, 'te': 1,
        'er': 1, 'rd': 1, 'di': 1, 'is': 1, 'sc': 1, 'ci': 1, 'ip': 1, 'pl': 1,
        'li': 1, 'na': 1, 'ar': 1, 'ri': 2, 'ia': 2, 'an': 1, 'n#': 1, '$n': 1,
        'it': 1, 'ne': 1, 'tr': 1, 'ed': 1, 'ds': 1, 'ic': 1, 'si': 1, 'cp': 1,
        'il': 1, 'pi': 1, 'ln': 1, 'nr': 1, 'ai': 1, 'ra': 1, 'a#': 1})

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.4.0
            Broke tokenization functions out into tokenize method

        """
        if qval == 0:
            raise ValueError('Use WhitespaceTokenizer instead of qval=0.')
        super().__init__(scaler)

        # Save parameters
        self.qval = qval
        self.start_stop = start_stop
        # Unigrams are never padded: with qval == 1 the pad width
        # (qval - 1) is zero, so the start/stop symbols are dropped.
        if qval == 1:
            self.start_stop = ''
        self.skip = skip

        # self._string is read here before this class assigns it, so it is
        # presumably initialised by the _Tokenizer constructor (not visible
        # in this file) -- TODO confirm.
        self._string_ss = self._string

    def tokenize(self, string: str) -> 'QGrams':
        """Tokenize the term and store it.

        The tokenized term is stored as an ordered list and as a Counter
        object.

        Parameters
        ----------
        string : str
            The string to tokenize

        Returns
        -------
        QGrams
            This instance, so that calls can be chained

        .. versionadded:: 0.4.0

        """
        self._string = string
        self._ordered_tokens = []

        # Normalise scalar parameters to 1-tuples so that both can be
        # iterated uniformly below.
        if not isinstance(self.qval, Iterable):
            self.qval = (self.qval,)
        if not isinstance(self.skip, Iterable):
            self.skip = (self.skip,)

        if string:
            for qval_i in cast(TIterable[int], self.qval):
                for skip_i in cast(TIterable[int], self.skip):
                    # Non-positive lengths (e.g. the 0 in range(3)) yield
                    # no q-grams.
                    if qval_i < 1:
                        continue

                    if self.start_stop:
                        # Pad each end with qval_i - 1 copies of the start
                        # and stop symbols respectively.
                        string = (
                            self.start_stop[0] * (qval_i - 1)
                            + self._string
                            + self.start_stop[-1] * (qval_i - 1)
                        )
                    else:
                        string = self._string

                    # The (possibly padded) string is too short to hold
                    # even one q-gram of this length.
                    if qval_i > 1 and len(string) < qval_i:
                        continue

                    # Having appended start & stop symbols (or not), save the
                    # result, but only for the longest valid qval_i
                    if len(string) > len(self._string_ss):
                        self._string_ss = string

                    # A skip of n means taking every (n + 1)-th character.
                    step = skip_i + 1
                    self._ordered_tokens += [
                        string[i : i + (qval_i * step) : step]
                        for i in range(len(string) - (qval_i - 1))
                    ]

        # NOTE(review): builds the Counter promised by the docstring via the
        # base-class hook. _scale_and_counterize is defined on _Tokenizer,
        # outside this file -- confirm the name against that module.
        super()._scale_and_counterize()
        return self

# Script entry point: when the module is executed directly, doctest is
# imported.
# NOTE(review): no doctest.testmod() call is visible after the import, so
# as shown the docstring examples are never run -- confirm that the line
# was not lost.
if __name__ == '__main__':
    import doctest
