fisharebest/webtrees

View on GitHub
app/Encodings/AbstractUTF16Encoding.php

Summary

Maintainability
A
3 hrs
Test Coverage
<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2023 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Encodings;

use function chr;
use function intdiv;
use function ord;
use function str_split;
use function strlen;

/**
 * Convert between an encoding and UTF-16.
 *
 * Each character is handled as a single 16-bit unit, so only code points
 * U+0000 - U+FFFF can be converted.  Anything else is replaced.
 */
abstract class AbstractUTF16Encoding implements EncodingInterface
{
    // Concrete classes should implement this.
    public const REPLACEMENT_CHARACTER = '';

    /**
     * Convert a string from UTF-8 to another encoding.
     *
     * Malformed, truncated or overlong sequences, encoded surrogates, and
     * sequences of four or more bytes are each converted to
     * static::REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function fromUtf8(string $text): string
    {
        $out = '';
        $len = strlen($text);

        for ($n = 0; $n < $len; ++$n) {
            $byte1 = ord($text[$n]);

            if ($byte1 <= 0x7F) {
                // 1 byte => 7 bits
                $out .= $this->codePointToCharacter($byte1);
            } elseif ($byte1 <= 0xBF) {
                // Invalid - a continuation byte without a lead byte.
                $out .= static::REPLACEMENT_CHARACTER;
            } elseif ($byte1 <= 0xDF) {
                // 2 bytes (5,6) => 11 bits
                // Check the length first, so that truncated input does not read beyond the end of the string.
                $byte2 = $n + 1 < $len ? ord($text[$n + 1]) : 0;

                if (($byte2 & 0xC0) !== 0x80) {
                    // Invalid - do not consume the next byte, as it may start a valid character.
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    ++$n;

                    // Explicit parentheses are required: "+" binds tighter than "<<", which binds tighter than "&".
                    $code_point = (($byte1 & 0x1F) << 6) | ($byte2 & 0x3F);

                    if ($code_point < 0x80) {
                        // Invalid - overlong encoding of a 7 bit character.
                        $out .= static::REPLACEMENT_CHARACTER;
                    } else {
                        $out .= $this->codePointToCharacter($code_point);
                    }
                }
            } elseif ($byte1 <= 0xEF) {
                // 3 bytes (4,6,6) => 16 bits
                $byte2 = $n + 1 < $len ? ord($text[$n + 1]) : 0;
                $byte3 = $n + 2 < $len ? ord($text[$n + 2]) : 0;

                if (($byte2 & 0xC0) !== 0x80 || ($byte3 & 0xC0) !== 0x80) {
                    // Invalid - do not consume the following bytes, as they may start a valid character.
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    $n += 2;

                    $code_point = (($byte1 & 0x0F) << 12) | (($byte2 & 0x3F) << 6) | ($byte3 & 0x3F);

                    if ($code_point < 0x800 || ($code_point >= 0xD800 && $code_point <= 0xDFFF)) {
                        // Invalid - overlong encoding, or a surrogate (which toUtf8() also rejects).
                        $out .= static::REPLACEMENT_CHARACTER;
                    } else {
                        $out .= $this->codePointToCharacter($code_point);
                    }
                }
            } else {
                // Invalid - lead byte of a sequence that cannot fit in 16 bits.
                // Its continuation bytes are replaced one at a time by the 0x80 - 0xBF branch above.
                $out .= static::REPLACEMENT_CHARACTER;
            }
        }

        return $out;
    }

    /**
     * Convert a string from another encoding to UTF-8.
     *
     * The text is processed two bytes at a time.  Code points that are
     * considered invalid, and any incomplete final character, are converted
     * to UTF8::REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function toUtf8(string $text): string
    {
        // Before PHP 8.2, str_split('') returns [''] rather than [].
        if ($text === '') {
            return '';
        }

        $utf8 = '';

        foreach (str_split($text, 2) as $character) {
            if (strlen($character) < 2) {
                // Odd-length input - the final chunk is not a complete character.
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
                continue;
            }

            $code_point = $this->characterToCodePoint($character);

            if ($code_point <= 0x7F) {
                // 7 bits => 1 byte
                $utf8 .= chr($code_point);
            } elseif ($code_point <= 0xFF) {
                // U+80 - U+FF are invalid
                // NOTE(review): Unicode assigns characters in this range (Latin-1 Supplement).
                // Confirm that rejecting them is intended before relying on this for accented text.
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
            } elseif ($code_point <= 0x7FF) {
                // 11 bits (5,6) => 2 bytes
                $utf8 .= chr(0xC0 | ($code_point >> 6));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } elseif ($code_point <= 0xD7FF || $code_point >= 0xE000) {
                // 16 bits (4,6,6) => 3 bytes
                $utf8 .= chr(0xE0 | ($code_point >> 12));
                $utf8 .= chr(0x80 | (($code_point >> 6) & 0x3F));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } else {
                // U+D800 - U+DFFF are invalid
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
            }
        }

        return $utf8;
    }

    /**
     * When reading multi-byte encodings using a stream, we must avoid incomplete characters.
     *
     * Characters are two bytes long, so this is the length of the text, rounded down to an even number.
     *
     * @param string $text
     *
     * @return int
     */
    public function convertibleBytes(string $text): int
    {
        return 2 * intdiv(strlen($text), 2);
    }

    /**
     * Convert two bytes to a code-point, taking care of byte-order.
     *
     * @param string $character
     *
     * @return int
     */
    abstract protected function characterToCodePoint(string $character): int;

    /**
     * Convert a code-point to two bytes, taking care of byte-order.
     *
     * @param int $code_point
     *
     * @return string
     */
    abstract protected function codePointToCharacter(int $code_point): string;
}