# chrislit/abydos
#
# View on GitHub
# abydos/stemmer/_paice_husk.py
#
# Summary
#
# Maintainability
# B
# 4 hrs
# Test Coverage
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._paice_husk.

Paice-Husk Stemmer
"""

from typing import Dict, Optional, Tuple

from ._stemmer import _Stemmer

# Public names exported by a star-import of this module.
__all__ = ['PaiceHusk']


class PaiceHusk(_Stemmer):
    """Paice-Husk stemmer.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk.

    This is based on the algorithm's description in :cite:`Paice:1990`.

    .. versionadded:: 0.3.6
    """

    # Suffix rules, keyed first by suffix length and then by the suffix
    # itself. Each suffix maps to a tuple of rules that are tried in order;
    # every rule is a 4-tuple ``(only_intact, del_len, add_str,
    # set_terminate)``:
    #
    #   only_intact   -- the rule may fire only while no earlier rule has
    #                    been applied to the word
    #   del_len       -- number of trailing characters to delete (0 = none)
    #   add_str       -- string appended after the deletion (None = nothing)
    #   set_terminate -- whether stemming stops once this rule is applied
    _rule_table = {
        6: {
            'ifiabl': ((False, 6, None, True),),
            'plicat': ((False, 4, 'y', True),),
        },
        5: {
            'guish': ((False, 5, 'ct', True),),
            'sumpt': ((False, 2, None, True),),
            'istry': ((False, 5, None, True),),
        },
        4: {
            'ytic': ((False, 3, 's', True),),
            'ceed': ((False, 2, 'ss', True),),
            'hood': ((False, 4, None, False),),
            'lief': ((False, 1, 'v', True),),
            'verj': ((False, 1, 't', True),),
            'misj': ((False, 2, 't', True),),
            'iabl': ((False, 4, 'y', True),),
            'iful': ((False, 4, 'y', True),),
            'sion': ((False, 4, 'j', False),),
            'xion': ((False, 4, 'ct', True),),
            'ship': ((False, 4, None, False),),
            'ness': ((False, 4, None, False),),
            'ment': ((False, 4, None, False),),
            'ript': ((False, 2, 'b', True),),
            'orpt': ((False, 2, 'b', True),),
            'duct': ((False, 1, None, True),),
            'cept': ((False, 2, 'iv', True),),
            'olut': ((False, 2, 'v', True),),
            'sist': ((False, 0, None, True),),
        },
        3: {
            'ied': ((False, 3, 'y', False),),
            'eed': ((False, 1, None, True),),
            'ing': ((False, 3, None, False),),
            'iag': ((False, 3, 'y', True),),
            'ish': ((False, 3, None, False),),
            'fuj': ((False, 1, 's', True),),
            'hej': ((False, 1, 'r', True),),
            'abl': ((False, 3, None, False),),
            'ibl': ((False, 3, None, True),),
            'bil': ((False, 2, 'l', False),),
            'ful': ((False, 3, None, False),),
            'ial': ((False, 3, None, False),),
            'ual': ((False, 3, None, False),),
            'ium': ((False, 3, None, True),),
            'ism': ((False, 3, None, False),),
            'ion': ((False, 3, None, False),),
            'ian': ((False, 3, None, False),),
            'een': ((False, 0, None, True),),
            'ear': ((False, 0, None, True),),
            'ier': ((False, 3, 'y', False),),
            'ies': ((False, 3, 'y', False),),
            'sis': ((False, 2, None, True),),
            'ous': ((False, 3, None, False),),
            'ent': ((False, 3, None, False),),
            'ant': ((False, 3, None, False),),
            'ist': ((False, 3, None, False),),
            'iqu': ((False, 3, None, True),),
            'ogu': ((False, 1, None, True),),
            'siv': ((False, 3, 'j', False),),
            'eiv': ((False, 0, None, True),),
            'bly': ((False, 1, None, False),),
            'ily': ((False, 3, 'y', False),),
            'ply': ((False, 0, None, True),),
            'ogy': ((False, 1, None, True),),
            'phy': ((False, 1, None, True),),
            'omy': ((False, 1, None, True),),
            'opy': ((False, 1, None, True),),
            'ity': ((False, 3, None, False),),
            'ety': ((False, 3, None, False),),
            'lty': ((False, 2, None, True),),
            'ary': ((False, 3, None, False),),
            'ory': ((False, 3, None, False),),
            'ify': ((False, 3, None, True),),
            'ncy': ((False, 2, 't', False),),
            'acy': ((False, 3, None, False),),
        },
        2: {
            'ia': ((True, 2, None, True),),
            'bb': ((False, 1, None, True),),
            'ic': ((False, 2, None, False),),
            'nc': ((False, 1, 't', False),),
            'dd': ((False, 1, None, True),),
            'ed': ((False, 2, None, False),),
            'if': ((False, 2, None, False),),
            'ag': ((False, 2, None, False),),
            'gg': ((False, 1, None, True),),
            'th': ((True, 2, None, True),),
            'ij': ((False, 1, 'd', True),),
            'uj': ((False, 1, 'd', True),),
            'oj': ((False, 1, 'd', True),),
            'nj': ((False, 1, 'd', True),),
            'cl': ((False, 1, None, True),),
            'ul': ((False, 2, None, True),),
            'al': ((False, 2, None, False),),
            'll': ((False, 1, None, True),),
            'um': ((True, 2, None, True),),
            'mm': ((False, 1, None, True),),
            'an': ((False, 2, None, False),),
            'en': ((False, 2, None, False),),
            'nn': ((False, 1, None, True),),
            'pp': ((False, 1, None, True),),
            'er': ((False, 2, None, False),),
            'ar': ((False, 2, None, True),),
            'or': ((False, 2, None, False),),
            'ur': ((False, 2, None, False),),
            'rr': ((False, 1, None, True),),
            'tr': ((False, 1, None, False),),
            'is': ((False, 2, None, False),),
            'ss': ((False, 0, None, True),),
            'us': ((True, 2, None, True),),
            'at': ((False, 2, None, False),),
            'tt': ((False, 1, None, True),),
            'iv': ((False, 2, None, False),),
            'ly': ((False, 2, None, False),),
            'iz': ((False, 2, None, False),),
            'yz': ((False, 1, 's', True),),
        },
        1: {
            'a': ((True, 1, None, True),),
            'e': ((False, 1, None, False),),
            'i': ((True, 1, None, True), (False, 1, 'y', False)),
            'j': ((False, 1, 's', True),),
            's': ((True, 1, None, False), (False, 0, None, True)),
        },
    }  # type: Dict[int, Dict[str, Tuple[Tuple[bool, int, Optional[str], bool], ...]]]  # noqa: E501

    def _has_vowel(self, word: str) -> bool:
        """Return whether the word contains a vowel or y.

        Parameters
        ----------
        word : str
            The string to inspect

        Returns
        -------
        bool
            True if any character of the word is one of a, e, i, o, u, or y

        """
        return any(char in {'a', 'e', 'i', 'o', 'u', 'y'} for char in word)

    def _acceptable(self, word: str) -> bool:
        """Return whether a candidate stem is acceptable.

        A stem beginning with a, e, i, o, or u must be at least two
        characters long. Any other stem must be at least three characters
        long and contain a vowel or y after its first character. The empty
        string is never acceptable.

        Parameters
        ----------
        word : str
            The candidate stem

        Returns
        -------
        bool
            True if the candidate may replace the current word

        """
        if word and word[0] in {'a', 'e', 'i', 'o', 'u'}:
            return len(word) > 1
        return len(word) > 2 and self._has_vowel(word[1:])

    def _apply_rule(
        self,
        word: str,
        rule: Tuple[bool, int, Optional[str], bool],
        intact: bool,
        terminate: bool,
    ) -> Tuple[str, bool, bool, bool]:
        """Try to apply a single rule to a word.

        Parameters
        ----------
        word : str
            The word in its current state
        rule : tuple
            A rule of the form (only_intact, del_len, add_str, set_terminate)
        intact : bool
            Whether the word is still unmodified by any earlier rule
        terminate : bool
            The current termination flag, passed through unchanged if the
            rule is not applied

        Returns
        -------
        tuple
            A 4-tuple (word, accepted, intact, terminate). If the rule was
            applied, this holds the new word, True, False, and the rule's
            own termination flag. Otherwise it holds the original word,
            False, and the intact and terminate values that were passed in.

        """
        only_intact, del_len, add_str, set_terminate = rule

        # An intact-only rule cannot fire once the word has been altered.
        if only_intact and not intact:
            return word, False, intact, terminate

        # A del_len of 0 means "delete nothing"; it must be special-cased
        # because word[:-0] would yield the empty string.
        candidate = word[:-del_len] if del_len else word
        if add_str:
            candidate += add_str

        if self._acceptable(candidate):
            return candidate, True, False, set_terminate
        # The result would be too short or vowel-less: reject the rule.
        return word, False, intact, terminate

    def stem(self, word: str) -> str:
        """Return Paice-Husk stem.

        The word is matched exactly as given: the rule table contains only
        lowercase suffixes and no case normalization is performed here.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = PaiceHusk()
        >>> stmr.stem('assumption')
        'assum'
        >>> stmr.stem('verifiable')
        'ver'
        >>> stmr.stem('fancies')
        'fant'
        >>> stmr.stem('fanciful')
        'fancy'
        >>> stmr.stem('torment')
        'tor'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        terminate = False
        intact = True
        # Longer suffixes take precedence over shorter ones; the lengths are
        # taken from the rule table rather than hard-coded.
        suffix_lengths = sorted(self._rule_table, reverse=True)

        while not terminate:
            for length in suffix_lengths:
                accept = False
                # A word shorter than `length` yields a slice that is too
                # short to be a key at this level, so no rule is tried.
                for rule in self._rule_table[length].get(word[-length:], ()):
                    word, accept, intact, terminate = self._apply_rule(
                        word, rule, intact, terminate
                    )
                    if accept:
                        break

                if accept:
                    # Restart suffix matching on the modified word.
                    break
            else:
                # No rule at any suffix length could be applied.
                break

        return word


if __name__ == '__main__':
    # When run as a script, execute the doctest examples in this module.
    from doctest import testmod

    testmod()