hSaria/ChromaTerm

View on GitHub
chromaterm/pcre.py

Summary

Maintainability
A
0 mins
Test Coverage
'''A PCRE2 implementation. Similar to Python's `re`, albeit very minimal.'''
import collections
import re
from ctypes import (CDLL, POINTER, byref, c_char, c_size_t, c_uint32,
                    create_string_buffer)
from ctypes.util import find_library

# Handle to the PCRE2 shared library looked up under the name 'pcre2-8'; every
# function used in this module carries the matching `_8` suffix.
PCRE2 = CDLL(find_library('pcre2-8'))
# ctypes treats a foreign function's return value as a C `int` unless told
# otherwise. These two functions hand back pointers, so declare them as such.
PCRE2.pcre2_compile_8.restype = POINTER(c_char)
PCRE2.pcre2_match_data_create_8.restype = POINTER(c_char)

# Options passed to the match call in `Pattern.finditer`.
# PCRE2_NOTEMPTY = 4
# NOTE(review): this comment used to name PCRE2_NOTEMPTY_ATSTART, but pcre2.h
# defines that flag as 8 and PCRE2_NOTEMPTY as 4 - confirm which is intended.
MATCH_OPTIONS = c_uint32(4)


class GroupIndex(collections.UserDict):
    '''Extracting the named groups table is too much work. This class patches
    `__getitem__` to extract a named group on-demand (much easier to do).

    The backing dictionary always stays empty, so `__contains__` is patched as
    well; otherwise membership tests (and any inherited helper that relies on
    them) would never see a named group.'''

    def __init__(self, regex):
        '''Constructor.

        Args:
            regex: The compiled PCRE2 code handle to query for group names.
        '''
        super().__init__()
        self._regex = regex

    def __getitem__(self, name):
        '''Return the number of the capture group called `name`.

        Args:
            name (str): The name of the group.

        Raises:
            KeyError: If PCRE2 does not report a group number for `name`.
        '''
        group_id = PCRE2.pcre2_substring_number_from_name_8(
            self._regex,
            name.encode(),
        )

        # Failures are reported as negative codes and group 0 is never named
        if group_id < 1:
            raise KeyError(name)

        return group_id

    def __contains__(self, name):
        '''Return whether `name` resolves to a capture group.

        Args:
            name: The candidate group name. Anything other than `str` is
                reported as absent, rather than failing on `encode`.
        '''
        if not isinstance(name, str):
            return False

        try:
            self[name]
        except KeyError:
            return False

        return True


class Pattern:
    '''PCRE2 Pattern. Like that of Python's, if it was stripped of everything.

    Attributes:
        pattern (bytes): The regex pattern, as passed to the constructor.
        groups (int): The capture count reported by PCRE2.
        groupindex (GroupIndex): Resolves group names to numbers on demand.
    '''

    def __init__(self, pattern):
        '''Constructor.

        Args:
            pattern (bytes): The regex pattern.

        Raises:
            re.error: If PCRE2 fails to compile the pattern.
        '''
        error_code = c_uint32()
        error_offset = c_size_t()
        groups = c_uint32()

        self.pattern = pattern
        self._regex = PCRE2.pcre2_compile_8(
            self.pattern,
            c_size_t(len(self.pattern)),
            c_uint32(0),
            byref(error_code),
            byref(error_offset),
            None,
        )

        # A NULL return is the documented failure signal of `pcre2_compile`;
        # the error code and offset are only meaningful in that case.
        if not self._regex:
            error_buffer = create_string_buffer(4096)

            # The buffer length is a by-value argument, not a pointer
            PCRE2.pcre2_get_error_message_8(
                error_code,
                error_buffer,
                c_size_t(len(error_buffer)),
            )
            msg = f'position {error_offset.value}: {error_buffer.value.decode()}'

            raise re.error(msg)

        # PCRE2_JIT_COMPLETE = 1
        jit_status = PCRE2.pcre2_jit_compile_8(self._regex, c_uint32(1))

        # `pcre2_jit_match` only works on patterns that were JIT-compiled; it
        # reports an error (i.e. no match, ever) otherwise. Fall back to the
        # regular matcher, which takes the same arguments, when JIT failed.
        if jit_status == 0:
            self._matcher = PCRE2.pcre2_jit_match_8
        else:
            self._matcher = PCRE2.pcre2_match_8

        # PCRE2_INFO_CAPTURECOUNT = 4
        PCRE2.pcre2_pattern_info_8(self._regex, c_uint32(4), byref(groups))

        self.groups = groups.value
        self.groupindex = GroupIndex(self._regex)

        # One pair of offsets per group, plus one pair for the whole match
        self._match_data = PCRE2.pcre2_match_data_create_8(
            c_uint32(self.groups + 1), None)

        # The return type is set right before the call because the array
        # length depends on this pattern's group count.
        PCRE2.pcre2_get_ovector_pointer_8.restype = POINTER(
            2 * (self.groups + 1) * c_size_t)

        # Emulate `span` method of `re.Match`
        match = PCRE2.pcre2_get_ovector_pointer_8(self._match_data).contents
        match.span = lambda gid: (match[2 * gid], match[2 * gid + 1])

        self._match = match
        self._data_len = c_size_t()

    def __del__(self):
        '''Release the match data and the compiled code held by this pattern.'''
        # NOTE(review): the `PCRE2` check presumably guards against the module
        # global being cleared at interpreter shutdown - confirm.
        if PCRE2:
            # The attributes are missing if the constructor did not get to them
            if hasattr(self, '_match_data'):
                PCRE2.pcre2_match_data_free_8(self._match_data)
            if hasattr(self, '_regex'):
                PCRE2.pcre2_code_free_8(self._regex)

    def finditer(self, data):
        '''A generator for finding matches.

        The same object is yielded for every match; it is a view of the offsets
        vector that PCRE2 overwrites on each iteration. Read what you need from
        it, using indexing or `span(group_number)`, before advancing.

        Args:
            data (bytes): The subject of the search.
        '''
        # Reset the startoffset to 0
        self._match[1] = 0
        self._data_len.value = len(data)

        # The end of the previous match (index 1 of the offsets vector) is the
        # start offset of the next search. It is wrapped in `c_size_t` because
        # ctypes would otherwise pass the Python int as a C `int`.
        while self._matcher(
                self._regex,
                data,
                self._data_len,
                c_size_t(self._match[1]),
                MATCH_OPTIONS,
                self._match_data,
                None,
        ) >= 0:
            yield self._match