src/Debug/Utility/Utf8.php from bkdotcom/PHPDebugConsole

src/Debug/Utility/Utf8.php
Summary

Maintainability

0 mins
Test Coverage

96%
Issues
Coverage
<?php

/**
 * This file is part of PHPDebugConsole
 *
 * @package   PHPDebugConsole
 * @author    Brad Kent <bkfake-github@yahoo.com>
 * @license   http://opensource.org/licenses/MIT MIT
 * @copyright 2014-2024 Brad Kent
 * @since     2.0
 */

namespace bdk\Debug\Utility;

use bdk\Debug\Utility\Utf8Buffer;

/**
 * Validate Utf8 / "highlight" non-utf8, control, & whitespace characters
 *
 * @link https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
 * @link http://www.i18nqa.com/debug/utf8-debug.html
 */
class Utf8
{
    const TYPE_OTHER = 'other';
    const TYPE_UTF8 = 'utf8';
    const TYPE_UTF8_CONTROL = 'utf8Control'; // control character (sans \r\n\t)

    /**
     * Buffer wrapping the string currently being processed by strcut().
     *
     * Only populated for the duration of a strcut() call so that the private
     * strcutGet*() helpers can share it.
     *
     * @var Utf8Buffer|null
     */
    private static $buffer;

    /**
     * Convert code point to its UTF-8 encoded character
     *
     * @param int $codePoint Unicode code-point (0 - 0x10FFFF)
     *
     * @return string UTF-8 byte sequence (1 to 4 bytes),
     *                or empty string if code point is outside of 0 - 0x10FFFF
     */
    public static function chr($codePoint)
    {
        if ($codePoint < 0 || $codePoint > 0x10FFFF) {
            // Outside of the Unicode code space: nothing we can encode.
            // (previously fell off the end of the method / passed a negative value to \chr())
            return '';
        }
        if ($codePoint <= 0x7F) {
            // Plain ASCII
            return \chr($codePoint);
        }
        if ($codePoint <= 0x07FF) {
            // 2-byte unicode (range 0x80-0x7FF)
            return ''
                . \chr((($codePoint >> 6) & 0x1F) | 0xC0)
                . \chr((($codePoint >> 0) & 0x3F) | 0x80);
        }
        if ($codePoint <= 0xFFFF) {
            // 3-byte unicode (range 0x800-0xFFFF)
            return ''
                . \chr((($codePoint >> 12) & 0x0F) | 0xE0)
                . \chr((($codePoint >>  6) & 0x3F) | 0x80)
                . \chr((($codePoint >>  0) & 0x3F) | 0x80);
        }
        // 4-byte unicode (range 0x10000-0x10FFFF)
        return ''
            . \chr((($codePoint >> 18) & 0x07) | 0xF0)
            . \chr((($codePoint >> 12) & 0x3F) | 0x80)
            . \chr((($codePoint >>  6) & 0x3F) | 0x80)
            . \chr((($codePoint >>  0) & 0x3F) | 0x80);
    }

    /**
     * Determine if string is UTF-8 encoded
     *
     * Wraps the string in a Utf8Buffer and returns the result of its isUtf8() check.
     *
     * @param string $str string to check
     *
     * @return bool
     */
    public static function isUtf8($str)
    {
        $buffer = new Utf8Buffer($str);
        return $buffer->isUtf8();
    }

    /**
     * Get Unicode code point of character
     *
     * Only the first character of the string is examined.
     * The number of bytes consumed is determined by the lead byte.
     * Continuation bytes are not validated beyond confirming they are present.
     *
     * @param string $char Character to get code point for
     *
     * @return int|false The Unicode code point for the first character of string or false on failure
     *                   (empty string, string starting with a continuation / invalid lead byte,
     *                   or a multi-byte sequence that is cut short).
     */
    public static function ord($char)
    {
        $len = self::strlen($char);
        if ($len === 0) {
            // nothing to decode (avoids reading an uninitialized string offset)
            return false;
        }
        $ord = \ord($char[0]);
        if ($ord < 0x80) {
            // single byte (ASCII)
            return $ord;
        }
        if ($ord < 0xc0) {
            // 0x80 - 0xBF are continuation bytes... they can't start a character
            return false;
        }
        if ($ord < 0xe0) {
            // 2-byte sequence
            if ($len < 2) {
                return false;
            }
            return ($ord - 0xc0 << 6) + \ord($char[1]) - 0x80;
        }
        if ($ord < 0xf0) {
            // 3-byte sequence
            if ($len < 3) {
                return false;
            }
            return ($ord - 0xe0 << 12)
                + (\ord($char[1]) - 0x80 << 6)
                + \ord($char[2]) - 0x80;
        }
        if ($ord < 0xf8) {
            // 4-byte sequence
            if ($len < 4) {
                return false;
            }
            return ($ord - 0xf0 << 18)
                + (\ord($char[1]) - 0x80 << 12)
                + (\ord($char[2]) - 0x80 << 6)
                + \ord($char[3]) - 0x80;
        }
        // 0xF8 and above are not valid UTF-8 lead bytes
        return false;
    }

    /**
     * mb_strcut implementation
     *
     * Start and end are moved back (never forward) so that the cut does not
     * begin or end in the middle of a multi-byte character.
     *
     * @param string   $str    The string being cut
     * @param int      $start  start position (in bytes).  Values <= 0 are treated as 0
     * @param int|null $length length in bytes.  Negative = stop that many bytes from end of string.
     *                           null = through end of string
     *
     * @return string
     * @see    https://www.php.net/manual/en/function.mb-strcut.php
     */
    public static function strcut($str, $start, $length = null)
    {
        self::$buffer = new Utf8Buffer($str);
        $start = self::strcutGetStart($start);
        $length = $length !== null
            ? self::strcutGetLength($start, $length)
            : self::$buffer->strlen() - $start;
        self::$buffer->seek($start);
        $return = self::$buffer->read($length);
        // don't hold on to (a potentially large) string once we're done
        self::$buffer = null;
        return $return;
    }

    /**
     * Get string's length in bytes
     *
     * Uses mb_strlen's '8bit' encoding when the (legacy) mbstring.func_overload
     * setting has overloaded strlen, otherwise plain strlen.
     *
     * @param string $string string to calculate
     *
     * @return int
     */
    public static function strlen($string)
    {
        return \function_exists('mb_strlen') && ((int) \ini_get('mbstring.func_overload') & 2)
            ? \mb_strlen($string, '8bit')
            : \strlen($string);
    }

    /**
     * Attempt to convert string to UTF-8 encoding
     *
     * String is returned as-is if it is already ASCII / UTF-8
     * or if the mbstring or iconv extension is unavailable.
     * Otherwise the string is assumed to be Windows-1252 and is converted.
     *
     * @param string $str string to convert
     *
     * @return string
     */
    public static function toUtf8($str)
    {
        if (\extension_loaded('mbstring') === false || \function_exists('iconv') === false) {
            return $str; // @codeCoverageIgnore
        }
        // 'Windows-1252' detection only seems to work in PHP-8 ?
        // we won't include... ISO-8859-1  too many false positive
        $encodings = ['ASCII', 'UTF-8'];
        $encoding = \mb_detect_encoding($str, $encodings, true);
        if ($encoding === false) {
            // Assume Windows-1252
            return self::toUtf8Unknown($str);
        }
        return $str;
    }

    /**
     * Find length value...
     *
     * Translates the user supplied length into an end offset, then backs the
     * end offset up (at most 4 attempts) until the buffer reports a valid
     * UTF-8 offset so that we don't end in the middle of a character.
     *
     * @param int $start  Our start value
     * @param int $length User supplied length (negative = relative to end of string)
     *
     * @return int length in bytes (never negative)
     */
    private static function strcutGetLength($start, $length)
    {
        $length = (int) $length;
        $strlen = self::$buffer->strlen();
        $end = $length >= 0
            ? $start + $length
            : $strlen + $length;
        if ($end >= $strlen) {
            // through end of string... no boundary to check
            return $strlen - $start;
        }
        $end++; // increment to offset the initial decrement below
        // Both conditions must hold.
        //   (a comma here would evaluate both but only honor the last one,
        //    leaving the number of attempts unbounded)
        for ($i = 0; $i < 4 && $end > 0; $i++) {
            $end--;
            self::$buffer->seek($end);
            if (self::$buffer->isOffsetUtf8()) {
                break;
            }
        }
        return \max($end - $start, 0);
    }

    /**
     * Find start position
     *
     * Backs the start offset up (at most 4 attempts) until we reach the
     * beginning of the string or the buffer reports a valid UTF-8 offset.
     *
     * @param int $start User supplied start position
     *
     * @return int
     */
    private static function strcutGetStart($start)
    {
        if ($start <= 0) {
            return 0;
        }
        $start++; // increment so that we start at original
        for ($i = 0; $i < 4; $i++) {
            $start--;
            self::$buffer->seek($start);
            if ($start === 0 || self::$buffer->isOffsetUtf8()) {
                break;
            }
        }
        return $start;
    }

    /**
     * Attempt to convert string to UTF-8 when unable to determine current encoding
     *
     * String is treated as Windows-1252.
     * Only called from toUtf8() which has confirmed both iconv and mbstring are available.
     *
     * @param string $str string to convert
     *
     * @return string
     */
    private static function toUtf8Unknown($str)
    {
        $strConv = \iconv('Windows-1252', 'UTF-8', $str);
        if ($strConv === false) {
            // iconv refused the input (ie a byte that is undefined in Windows-1252).
            // Previously fell back to htmlentities() without specifying a charset...
            //   it would treat the string as UTF-8 and return an empty string for
            //   the invalid sequences that got us here in the first place.
            $strConv = \mb_convert_encoding($str, 'UTF-8', 'Windows-1252');
        }
        return $strConv;
    }
}