tanbro/emoji-data

View on GitHub
src/emoji_data/sequence.py

Summary

Maintainability
C
7 hrs
Test Coverage
B
86%
from __future__ import annotations

import re
import sys
from typing import ClassVar, Generator, Iterable, Iterator, Optional, Pattern, Sequence, Tuple, Union, final

if sys.version_info < (3, 11):  # pragma: no cover
    from typing_extensions import Self
else:  # pragma: no cover
    from typing import Self

from .character import EmojiCharacter
from .types import BaseDictContainer
from .utils import emoji_data_lines

__all__ = ["EmojiSequence"]

# http://www.unicode.org/reports/tr51/#Data_Files_Table
# keep the order!
DATA_FILES = ("emoji-variation-sequences.txt", "emoji-zwj-sequences.txt", "emoji-sequences.txt")


class MetaClass(BaseDictContainer[str, "EmojiSequence"]):
    pass


@final
class EmojiSequence(metaclass=MetaClass):  # pyright: ignore[reportGeneralTypeIssues]
    """Emoji and Text Presentation Sequences used to represent emoji

    See also:
        http://www.unicode.org/reports/tr51/#Emoji_Sequences
    """

    def __init__(
        self,
        code_points: Union[int, Iterable[int]],
        type_field: Optional[str] = None,
        version: Optional[str] = None,
        variation: Optional[str] = None,
        description: Optional[str] = None,
    ):
        if isinstance(code_points, Iterable):
            self._code_points = list(code_points)
        else:
            self._code_points = [code_points]
        self._string = "".join(chr(n) for n in self._code_points)
        self._characters = [EmojiCharacter.from_hex(n) for n in self._code_points]
        self._type_field = type_field or ""
        self._version = version or ""
        self._variation = variation or ""
        self._description = description or ""
        # regex
        self._regex = r""
        if not self._regex:
            self._regex = "".join(m.regex for m in self._characters)
        self._regex_pat = re.compile(self._regex)

    def __len__(self):
        return len(self._code_points)

    def __str__(self):
        return self._string

    def __repr__(self):
        return "<{} code_points={!r} string={!r} version={!r} description={!r}>".format(
            type(self).__name__, self.code_points_string, self.string, self.version, self.description
        )

    _initialed = False

    pattern: ClassVar[Pattern[str]]
    """Compiled regular expression pattern object for all-together Emoji sequences.
    """

    @classmethod
    def initial(cls):
        """Initial the class

        Load Emoji Sequences from package data file into class internal dictionary
        """
        if cls._initialed:
            return
        EmojiCharacter.initial()

        def _decode_code_points(_cps, **_kwargs):
            try:
                _cp_head, _cp_tail = _cps.split("..", 1)  # begin..end form
            except ValueError:
                _arr_cp = [int(x, 16) for x in _cps.split()]
                _seq = cls(_arr_cp, **_kwargs)
                cls[_seq.string] = _seq
            else:
                # begin..end form: A range of single char emoji-seq
                for _cp in range(int(_cp_head, 16), 1 + int(_cp_tail, 16)):
                    _seq = cls(_cp, **_kwargs)
                    cls[_seq.string] = _seq

        for fname in DATA_FILES:
            for content, comment in emoji_data_lines(fname):
                if fname in ("emoji-sequences.txt", "emoji-zwj-sequences.txt"):
                    cps, type_field, description = (part.strip() for part in content.split(";", 2))
                    version = comment.split(maxsplit=1)[0]
                    _decode_code_points(cps, type_field=type_field, version=version, description=description)
                elif fname == "emoji-variation-sequences.txt":
                    cps, variation, _ = (part.strip() for part in content.split(";", 2))
                    version, description = (x.strip() for x in comment.split(maxsplit=1))
                    version = "E" + version.lstrip("(").rstrip(")").strip()
                    description = f"{description.lower()} ({variation})"
                    _decode_code_points(cps, version=version, variation=variation, description=description)
                else:
                    raise RuntimeError(f"Invalid data file name {fname}")
        # build regex
        cls.pattern = re.compile(
            r"|".join(
                m.regex
                for m in sorted(
                    (m for m in cls.values()),
                    key=lambda x: len(x.code_points),
                    reverse=True,
                )
            )
        )
        # initialed OK
        cls._initialed = True

    @classmethod
    def release(cls):
        if not cls._initialed:
            return
        keys = list(cls)
        for k in keys:
            del cls[k]
        cls._initialed = False

    @classmethod
    def items(cls) -> Iterator[Tuple[str, Self]]:
        """Returns an iterator of all string -> emoji-sequence pairs of the class"""
        return ((k, cls[k]) for k in cls)  # pyright: ignore[reportReturnType]

    @classmethod
    def keys(cls) -> Iterator[str]:
        """Returns an iterator of each emoji-sequence's key string of the class"""
        yield from cls

    @classmethod
    def values(cls) -> Iterator[Self]:
        """Returns an iterator of all emoji-sequences of the class"""
        return (cls[k] for k in cls)  # pyright: ignore[reportReturnType]

    @classmethod
    def from_string(cls, s: str) -> Self:
        """Get an :class:`EmojiSequence` instance from string

        Args:
            s: Emoji string

        Returns:
            Instance from internal dictionary

        Raises:
            KeyError: When passed-in ``s`` not found in internal dictionary
        """
        return cls[s]  # pyright: ignore[reportReturnType]

    @classmethod
    def from_characters(cls, value: Union[EmojiCharacter, Iterable[EmojiCharacter]]) -> Self:
        """Get an :class:`EmojiSequence` instance from :class:`EmojiCharacter` object or list

        Args:
            value: Single or iterable object of :class:`EmojiCharacter`, composing the sequence

        Returns:
            Instance from internal dictionary

        Raises:
            KeyError: When passed-in value not found in internal dictionary
            TypeError: When passed-in value is not :class:`.EmojiCharacter` object or list
        """
        if isinstance(value, EmojiCharacter):
            s = value.string
        elif isinstance(value, Iterable):
            s = "".join(m.string for m in value)
        else:
            raise TypeError("Argument `value` must be one of `EmojiCharacter` or `Iterable[EmojiCharacter]`")
        return cls.from_string(s)

    @classmethod
    def from_hex(cls, value: Union[int, str, Iterable[Union[int, str]]]) -> Self:
        """Get an :class:`EmojiSequence` instance by unicode code point(s)

        Args:
            value: A single or sequence of HEX string/code.

                it could be:

                - one or more code-point(s) in HEX format string, separated by spaces
                - a single code-point integer
                - An iterable object whose members are code-point string in HEX format
                - An iterable object whose members are code-point integer

        Returns:
            Instance returned from the class's internal dictionary

        Raises:
            KeyError: When passed-in value not found in the class internal dictionary
        """
        cps_array: Iterable[Union[int, str]]
        if isinstance(value, str):
            cps_array = value.split()
        elif isinstance(value, int):
            cps_array = (value,)
        elif isinstance(value, Iterable):
            cps_array = value
        else:
            raise TypeError(
                f"Argument `value` expects to be one of `str`, `bytes`, `int`, or a sequence of that, but actual is {type(value)}"
            )
        return cls.from_characters(EmojiCharacter.from_hex(cp) for cp in cps_array)  # pyright: ignore[reportReturnType]

    @property
    def type_field(self) -> str:
        """A convenience for parsing the emoji sequence files, and is not intended to be maintained as a property.

        may be one of:

        - `"Basic_Emoji"`
        - `"Emoji_Keycap_Sequence"`
        - `"Emoji_Flag_Sequence"`
        - `"Emoji_Tag_Sequence"`
        - `"Emoji_Modifier_Sequence"`
        - `"RGI_Emoji_ZWJ_Sequence"`
        """
        return self._type_field

    @property
    def description(self) -> str:
        """Description"""
        return self._description

    @property
    def version(self) -> str:
        """Version of the Emoji.

        Example:
            ``E0.0``, ``E0.6``, ``E11.0``
        """
        return self._version

    @property
    def variation(self) -> str:
        """``"emoji style"`` or ``"text style"`` of a variable sequence"""
        return self._variation

    @property
    def characters(self) -> Sequence[EmojiCharacter]:
        """Emoji character objects list which makes up the Emoji Sequence"""
        return self._characters

    @property
    def hex(self) -> str:
        """Python style hex string of each emoji-characters's code-point, separated by spaces

        Example:
            ``"0xa9 0xfe0f"``
        """
        return " ".join(c.hex for c in self.characters)

    @property
    def string(self) -> str:
        """string of the Emoji Sequence"""
        return self._string

    @property
    def regex(self) -> str:
        """Regular expression string of the Emoji Sequence"""
        return self._regex

    @property
    def regex_pattern(self):
        """Compiled regular expression pattern of the Emoji Sequence"""
        return self._regex_pat

    @property
    def code_points(self) -> Sequence[int]:
        """List of unicode integer value of the characters who make up this Emoji Sequence"""
        return self._code_points

    @property
    def code_points_string(self) -> str:
        """Unicode style hex string of each emoji-characters's code-point, separated by spaces

        eg: ``"00A9 FE0F"``
        """
        return " ".join(c.code_point_string for c in self.characters)

    @classmethod
    def find_all(cls, s: str) -> Sequence[Tuple[Self, int, int]]:
        """Find out all emoji sequences in a string, and return them in a list

        Items of the returned list is the same as ``yield`` result of :meth:`find`

        The function equals::

            list(EmojiSequence.find(s))

        or ::

            [x for x in EmojiSequence.find(s)]
        """
        return list(cls.find(s))  # pyright: ignore[reportReturnType]

    @classmethod
    def find(cls, s: str) -> Generator[Tuple[Self, int, int], None, None]:
        """Return an iterator which yields all emoji sequences in a string, without actually storing them all simultaneously.

        Args:
            s: The string to find emoji sequences in it

        Yields:
            : Yields at every matched emoji sequence as a 3-members tuple, whose members are:

                0. The found :class:`.EmojiSequence` object
                1. Begin position of the emoji sequence in the string
                2. End position of the emoji sequence in the string
        """
        for m in cls.pattern.finditer(s):
            yield cls.from_string(m.group()), m.start(), m.end()


EmojiSequence.initial()