abydos/phonetic/_roger_root.py from chrislit/abydos

abydos/phonetic/_roger_root.py
Summary

Maintainability

1 hr
Test Coverage

Issues
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._roger_root.

Roger Root phonetic algorithm
"""

from unicodedata import normalize as unicode_normalize

from ._phonetic import _Phonetic

__all__ = ['RogerRoot']


class RogerRoot(_Phonetic):
    """Roger Root code.

    This is Roger Root name coding, described in :cite:`Moore:1977`.

    .. versionadded:: 0.3.6
    """

    # '*' is used to prevent combining by _delete_consecutive_repeats()
    _init_patterns = {
        4: {'TSCH': '06'},
        3: {'TSH': '06', 'SCH': '06'},
        2: {
            'CE': '0*0',
            'CH': '06',
            'CI': '0*0',
            'CY': '0*0',
            'DG': '07',
            'GF': '08',
            'GM': '03',
            'GN': '02',
            'KN': '02',
            'PF': '08',
            'PH': '08',
            'PN': '02',
            'SH': '06',
            'TS': '0*0',
            'WR': '04',
        },
        1: {
            'A': '1',
            'B': '09',
            'C': '07',
            'D': '01',
            'E': '1',
            'F': '08',
            'G': '07',
            'H': '2',
            'I': '1',
            'J': '3',
            'K': '07',
            'L': '05',
            'M': '03',
            'N': '02',
            'O': '1',
            'P': '09',
            'Q': '07',
            'R': '04',
            'S': '0*0',
            'T': '01',
            'U': '1',
            'V': '08',
            'W': '4',
            'X': '07',
            'Y': '5',
            'Z': '0*0',
        },
    }

    _med_patterns = {
        4: {'TSCH': '6'},
        3: {'TSH': '6', 'SCH': '6'},
        2: {
            'CE': '0',
            'CH': '6',
            'CI': '0',
            'CY': '0',
            'DG': '7',
            'PH': '8',
            'SH': '6',
            'TS': '0',
        },
        1: {
            'B': '9',
            'C': '7',
            'D': '1',
            'F': '8',
            'G': '7',
            'J': '6',
            'K': '7',
            'L': '5',
            'M': '3',
            'N': '2',
            'P': '9',
            'Q': '7',
            'R': '4',
            'S': '0',
            'T': '1',
            'V': '8',
            'X': '7',
            'Z': '0',
            'A': '*',
            'E': '*',
            'H': '*',
            'I': '*',
            'O': '*',
            'U': '*',
            'W': '*',
            'Y': '*',
        },
    }

    _alphabetic_initial = dict(zip((ord(_) for _ in '012345'), ' AHJWY'))
    _alphabetic = dict(zip((ord(_) for _ in '0123456789'), 'STNMRLJKFP'))

    def __init__(self, max_length: int = 5, zero_pad: bool = True) -> None:
        """Initialize RogerRoot instance.

        Parameters
        ----------
        max_length : int
            The maximum length (default 5) of the code to return
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string


        .. versionadded:: 0.4.0

        """
        self._max_length = max_length
        self._zero_pad = zero_pad

    def encode_alpha(self, word: str) -> str:
        """Return the alphabetic Roger Root code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Roger Root code

        Examples
        --------
        >>> pe = RogerRoot()
        >>> pe.encode_alpha('Christopher')
        'JRST'
        >>> pe.encode_alpha('Niall')
        'NL'
        >>> pe.encode_alpha('Smith')
        'SMT'
        >>> pe.encode_alpha('Schmidt')
        'JMT'


        .. versionadded:: 0.4.0

        """
        code = self.encode(word).rstrip('0')
        return code[:1].translate(self._alphabetic_initial).strip() + code[
            1:
        ].translate(self._alphabetic)

    def encode(self, word: str) -> str:
        """Return the Roger Root code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Roger Root code

        Examples
        --------
        >>> pe = RogerRoot()
        >>> pe.encode('Christopher')
        '06401'
        >>> pe.encode('Niall')
        '02500'
        >>> pe.encode('Smith')
        '00310'
        >>> pe.encode('Schmidt')
        '06310'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # uppercase, normalize, decompose, and filter non-A-Z out
        word = unicode_normalize('NFKD', word.upper())
        word = ''.join(c for c in word if c in self._uc_set)

        code = ''
        pos = 0

        # Do first digit(s) first
        for num in range(4, 0, -1):
            if word[:num] in self._init_patterns[num]:
                code = self._init_patterns[num][word[:num]]
                pos += num
                break

        # Then code subsequent digits
        while pos < len(word):
            for num in range(4, 0, -1):  # pragma: no branch
                if word[pos : pos + num] in self._med_patterns[num]:
                    code += self._med_patterns[num][word[pos : pos + num]]
                    pos += num
                    break

        code = self._delete_consecutive_repeats(code)
        code = code.replace('*', '')

        if self._zero_pad:
            code += '0' * self._max_length

        return code[: self._max_length]


if __name__ == '__main__':
    import doctest

    doctest.testmod()