# abydos/stemmer/_paice_husk.py
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.stemmer._paice_husk.
Paice-Husk Stemmer
"""
from typing import Dict, Optional, Tuple
from ._stemmer import _Stemmer
__all__ = ['PaiceHusk']
class PaiceHusk(_Stemmer):
    """Paice-Husk stemmer.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk.

    This is based on the algorithm's description in :cite:`Paice:1990`.

    .. versionadded:: 0.3.6
    """

    # Stemming rules, keyed first by suffix length and then by the suffix
    # itself. Each suffix maps to a tuple of rules that are tried in order.
    # A rule is a 4-tuple ``(only_intact, del_len, add_str, set_terminate)``:
    #
    #   only_intact   -- apply the rule only while no rule has been accepted
    #                    yet for the word being stemmed
    #   del_len       -- number of trailing characters to remove (0 = none)
    #   add_str       -- string appended after the removal (None = nothing)
    #   set_terminate -- stop stemming once this rule has been accepted
    _rule_table = {
        6: {
            'ifiabl': ((False, 6, None, True),),
            'plicat': ((False, 4, 'y', True),),
        },
        5: {
            'guish': ((False, 5, 'ct', True),),
            'sumpt': ((False, 2, None, True),),
            'istry': ((False, 5, None, True),),
        },
        4: {
            'ytic': ((False, 3, 's', True),),
            'ceed': ((False, 2, 'ss', True),),
            'hood': ((False, 4, None, False),),
            'lief': ((False, 1, 'v', True),),
            'verj': ((False, 1, 't', True),),
            'misj': ((False, 2, 't', True),),
            'iabl': ((False, 4, 'y', True),),
            'iful': ((False, 4, 'y', True),),
            'sion': ((False, 4, 'j', False),),
            'xion': ((False, 4, 'ct', True),),
            'ship': ((False, 4, None, False),),
            'ness': ((False, 4, None, False),),
            'ment': ((False, 4, None, False),),
            'ript': ((False, 2, 'b', True),),
            'orpt': ((False, 2, 'b', True),),
            'duct': ((False, 1, None, True),),
            'cept': ((False, 2, 'iv', True),),
            'olut': ((False, 2, 'v', True),),
            'sist': ((False, 0, None, True),),
        },
        3: {
            'ied': ((False, 3, 'y', False),),
            'eed': ((False, 1, None, True),),
            'ing': ((False, 3, None, False),),
            'iag': ((False, 3, 'y', True),),
            'ish': ((False, 3, None, False),),
            'fuj': ((False, 1, 's', True),),
            'hej': ((False, 1, 'r', True),),
            'abl': ((False, 3, None, False),),
            'ibl': ((False, 3, None, True),),
            'bil': ((False, 2, 'l', False),),
            'ful': ((False, 3, None, False),),
            'ial': ((False, 3, None, False),),
            'ual': ((False, 3, None, False),),
            'ium': ((False, 3, None, True),),
            'ism': ((False, 3, None, False),),
            'ion': ((False, 3, None, False),),
            'ian': ((False, 3, None, False),),
            'een': ((False, 0, None, True),),
            'ear': ((False, 0, None, True),),
            'ier': ((False, 3, 'y', False),),
            'ies': ((False, 3, 'y', False),),
            'sis': ((False, 2, None, True),),
            'ous': ((False, 3, None, False),),
            'ent': ((False, 3, None, False),),
            'ant': ((False, 3, None, False),),
            'ist': ((False, 3, None, False),),
            'iqu': ((False, 3, None, True),),
            'ogu': ((False, 1, None, True),),
            'siv': ((False, 3, 'j', False),),
            'eiv': ((False, 0, None, True),),
            'bly': ((False, 1, None, False),),
            'ily': ((False, 3, 'y', False),),
            'ply': ((False, 0, None, True),),
            'ogy': ((False, 1, None, True),),
            'phy': ((False, 1, None, True),),
            'omy': ((False, 1, None, True),),
            'opy': ((False, 1, None, True),),
            'ity': ((False, 3, None, False),),
            'ety': ((False, 3, None, False),),
            'lty': ((False, 2, None, True),),
            'ary': ((False, 3, None, False),),
            'ory': ((False, 3, None, False),),
            'ify': ((False, 3, None, True),),
            'ncy': ((False, 2, 't', False),),
            'acy': ((False, 3, None, False),),
        },
        2: {
            'ia': ((True, 2, None, True),),
            'bb': ((False, 1, None, True),),
            'ic': ((False, 2, None, False),),
            'nc': ((False, 1, 't', False),),
            'dd': ((False, 1, None, True),),
            'ed': ((False, 2, None, False),),
            'if': ((False, 2, None, False),),
            'ag': ((False, 2, None, False),),
            'gg': ((False, 1, None, True),),
            'th': ((True, 2, None, True),),
            'ij': ((False, 1, 'd', True),),
            'uj': ((False, 1, 'd', True),),
            'oj': ((False, 1, 'd', True),),
            'nj': ((False, 1, 'd', True),),
            'cl': ((False, 1, None, True),),
            'ul': ((False, 2, None, True),),
            'al': ((False, 2, None, False),),
            'll': ((False, 1, None, True),),
            'um': ((True, 2, None, True),),
            'mm': ((False, 1, None, True),),
            'an': ((False, 2, None, False),),
            'en': ((False, 2, None, False),),
            'nn': ((False, 1, None, True),),
            'pp': ((False, 1, None, True),),
            'er': ((False, 2, None, False),),
            'ar': ((False, 2, None, True),),
            'or': ((False, 2, None, False),),
            'ur': ((False, 2, None, False),),
            'rr': ((False, 1, None, True),),
            'tr': ((False, 1, None, False),),
            'is': ((False, 2, None, False),),
            'ss': ((False, 0, None, True),),
            'us': ((True, 2, None, True),),
            'at': ((False, 2, None, False),),
            'tt': ((False, 1, None, True),),
            'iv': ((False, 2, None, False),),
            'ly': ((False, 2, None, False),),
            'iz': ((False, 2, None, False),),
            'yz': ((False, 1, 's', True),),
        },
        1: {
            'a': ((True, 1, None, True),),
            'e': ((False, 1, None, False),),
            'i': ((True, 1, None, True), (False, 1, 'y', False)),
            'j': ((False, 1, 's', True),),
            's': ((True, 1, None, False), (False, 0, None, True)),
        },
    }  # type: Dict[int, Dict[str, Tuple[Tuple[bool, int, Optional[str], bool], ...]]] # noqa: E501

    def _has_vowel(self, word: str) -> bool:
        """Return whether the word contains a vowel.

        Parameters
        ----------
        word : str
            The string to inspect

        Returns
        -------
        bool
            True if any character is one of a, e, i, o, u, or y

        """
        return any(char in {'a', 'e', 'i', 'o', 'u', 'y'} for char in word)

    def _acceptable(self, word: str) -> bool:
        """Return whether the word is an acceptable stem.

        Parameters
        ----------
        word : str
            The candidate stem

        Returns
        -------
        bool
            For a candidate starting with a, e, i, o, or u: True if it is
            longer than one character. Otherwise: True if it is longer than
            two characters and has a vowel (or y) after its first character.

        """
        if word and word[0] in {'a', 'e', 'i', 'o', 'u'}:
            return len(word) > 1
        return len(word) > 2 and self._has_vowel(word[1:])

    def _apply_rule(
        self,
        word: str,
        rule: Tuple[bool, int, Optional[str], bool],
        intact: bool,
        terminate: bool,
    ) -> Tuple[str, bool, bool, bool]:
        """Try a single rule on the word.

        Parameters
        ----------
        word : str
            The word in its current state
        rule : tuple
            A rule of the form (only_intact, del_len, add_str, set_terminate)
        intact : bool
            Whether no rule has been accepted for this word so far
        terminate : bool
            The current value of the termination flag

        Returns
        -------
        tuple
            (word, accepted, intact, terminate). If the rule is accepted,
            this is the rewritten word, True, False, and the rule's own
            termination flag. Otherwise it is the unchanged word, False, and
            the intact and terminate values that were passed in.

        """
        only_intact, del_len, add_str, set_terminate = rule

        # Intact-only rules are skipped once the word has been altered.
        if only_intact and not intact:
            return word, False, intact, terminate

        # A del_len of 0 must not slice: word[:-0] would be the empty string.
        candidate = word[:-del_len] if del_len else word
        if add_str:
            candidate += add_str

        if self._acceptable(candidate):
            return candidate, True, False, set_terminate
        return word, False, intact, terminate

    def stem(self, word: str) -> str:
        """Return Paice-Husk stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = PaiceHusk()
        >>> stmr.stem('assumption')
        'assum'
        >>> stmr.stem('verifiable')
        'ver'
        >>> stmr.stem('fancies')
        'fant'
        >>> stmr.stem('fanciful')
        'fancy'
        >>> stmr.stem('torment')
        'tor'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        terminate = False
        intact = True

        while not terminate:
            # Longest suffixes are tried first.
            for n in range(6, 0, -1):
                rules = self._rule_table[n].get(word[-n:])
                if rules is None:
                    continue

                accept = False
                for rule in rules:
                    word, accept, intact, terminate = self._apply_rule(
                        word, rule, intact, terminate
                    )
                    if accept:
                        break

                if accept:
                    # Restart the suffix scan on the rewritten word (or leave
                    # the outer loop if the rule set the terminate flag).
                    break
            else:
                # No rule was accepted for any suffix length: stemming is done.
                break

        return word
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod as _run_doctests

    _run_doctests()