toddy15/spamdetect

View on GitHub
src/Tokenizer.php

Summary

Maintainability
A
0 mins
Test Coverage
A
100%
<?php

declare(strict_types=1);

namespace Toddy15\SpamDetect;

/**
 * Splits input strings into a list of unique tokens.
 *
 * Common markups (HTML/XML tags and emoji notations) are kept together
 * as single tokens, everything else is split on whitespace.
 */
class Tokenizer
{
    /**
     * Define start and ending tags of some common markups:
     * - HTML or XML tags like <h1>
     * - Emoji notation like :smile:
     * - Emoji notation like [wink]
     *
     * Each array consists of the start and end delimiter
     * for the markup. The delimiters are inserted verbatim into a
     * regular expression, so they must already be escaped for it.
     *
     * @var list<array{string, string}>
     */
    private array $commonMarkups = [
        ['<', '>'],
        [':', ':'],
        // Needs to be escaped for the regular expression.
        ['\[', '\]'],
    ];

    /**
     * Current result of tokenizing
     *
     * @var array<int, string>
     */
    private array $tokens;

    /**
     * @param array<array-key, string> $input Strings which should be tokenized.
     */
    public function __construct(array $input)
    {
        $this->tokens = $input;
    }

    /**
     * Split input into tokens.
     *
     * First, the common markups are separated from the surrounding
     * text. Afterwards, every resulting string is split on whitespace.
     *
     * @return list<string> Unique, non-empty tokens in order of first appearance.
     */
    public function tokenize(): array
    {
        foreach ($this->commonMarkups as [$start, $end]) {
            $this->tokenizeCommonMarkups($start, $end);
        }

        // Perform the splitting on whitespace
        $result = [];
        foreach ($this->tokens as $token) {
            $split_tokens = preg_split("/\s+/", $token);
            if ($split_tokens === false) {
                $split_tokens = [$token]; // @codeCoverageIgnore
            }
            foreach ($split_tokens as $split_token) {
                // Only drop really empty strings. A plain array_filter()
                // would also remove the valid token "0".
                if ($split_token !== '') {
                    $result[] = $split_token;
                }
            }
        }

        // Finally, remove identical tokens. Each token
        // should appear only once in the result.
        return array_values(array_unique($result));
    }

    /**
     * Split common markups into tokens.
     *
     * Every current token is split at each occurrence of the markup,
     * keeping the markup itself (including its delimiters) as a
     * separate token. A markup consists of the start delimiter,
     * at least one character which is neither the end delimiter
     * nor whitespace, and the end delimiter.
     *
     * @param string $start Start delimiter, escaped for use in a regular expression.
     * @param string $end   End delimiter, escaped for use in a regular expression.
     */
    private function tokenizeCommonMarkups(string $start, string $end): void
    {
        $pattern = '/('.$start.'[^'.$end.'\s]+?'.$end.')/';
        $result = [];

        foreach ($this->tokens as $s) {
            $tokens = preg_split($pattern, $s, 0, PREG_SPLIT_DELIM_CAPTURE);
            if ($tokens === false) {
                $tokens = [$s]; // @codeCoverageIgnore
            }
            foreach ($tokens as $token) {
                // Remove empty tokens, but keep falsy strings like "0",
                // e.g. the content of "<b>0</b>".
                if ($token !== '') {
                    $result[] = $token;
                }
            }
        }

        $this->tokens = $result;
    }
}