src/Debug/Utility/Utf8Dump.php
<?php
/**
* This file is part of PHPDebugConsole
*
* @package PHPDebugConsole
* @author Brad Kent <bkfake-github@yahoo.com>
* @license http://opensource.org/licenses/MIT MIT
* @copyright 2014-2024 Brad Kent
* @version v3.0
*/
namespace bdk\Debug\Utility;
use bdk\Debug\Utility\Utf8;
/**
* Dump strings / "highlight" non-printing & whitespace characters
*/
class Utf8Dump
{
/** @var array<int, string> */
private $charDesc = array( // @phpcs:ignore SlevomatCodingStandard.Arrays.AlphabeticallySortedByKeys
0x00 => 'NUL',
0x01 => 'SOH (start of heading)',
0x02 => 'STX (start of text)',
0x03 => 'ETX (end of text)',
0x04 => 'EOT (end of transmission)',
0x05 => 'ENQ (enquiry)',
0x06 => 'ACK (acknowledge)',
0x07 => 'BEL (bell)',
0x08 => 'BS (backspace)',
0x09 => 'HT (horizontal tab)', // \t not treated special by default
0x0A => 'LF (NL line feed / new line)', // \n not treated special by default
0x0B => 'VT (vertical tab)',
0x0C => 'FF (NP form feed / new page)',
0x0D => 'CR (carriage return)', // \r not treated special by default
0x0E => 'SO (shift out)',
0x0F => 'SI (shift in)',
0x10 => 'DLE (data link escape)',
0x11 => 'DC1 (device control 1)',
0x12 => 'DC2 (device control 2)',
0x13 => 'DC3 (device control 3)',
0x14 => 'DC4 (device control 4)',
0x15 => 'NAK (negative acknowledge)',
0x16 => 'SYN (synchronous idle)',
0x17 => 'ETB (end of trans. block)',
0x18 => 'CAN (cancel)',
0x19 => 'EM (end of medium)',
0x1A => 'SUB (substitute)',
0x1B => 'ESC (escape)',
0x1C => 'FS (file seperator)',
0x1D => 'GS (group seperator)',
0x1E => 'RS (record seperator)',
0x1F => 'US (unit seperator)',
0x7F => 'DEL',
0x00A0 => 'NBSP',
0x1680 => 'Ogham Space Mark',
0x180E => 'Mongolian Vowel Separator',
0x2000 => 'En Quad',
0x2001 => 'Em Quad',
0x2002 => 'En Space',
0x2003 => 'Em Space',
0x2004 => 'Three-Per-Em Space',
0x2005 => 'Four-Per-Em Space',
0x2006 => 'Six-Per-Em Space',
0x2007 => 'Figure Space',
0x2008 => 'Punctuation Space',
0x2009 => 'Thin Space',
0x200A => 'Hair Space',
0x200B => 'Zero Width Space', // not included in Separator Category
0x2028 => 'Line Separator',
0x2029 => 'Paragraph Separator',
0x202F => 'Narrow No-Break Space',
0x205F => 'Medium Mathematical Space',
0x3000 => 'Ideographic Space',
0xFEFF => 'BOM / Zero Width No-Break Space', // not included in Separator Category
0xFFFD => 'Replacement Character',
);
/** @var array<string,mixed> */
private $options = array(
'prefix' => true,
'sanitizeNonBinary' => false,
'useHtml' => false,
);
/**
* Format a block of text
*
* @param string $str string to output
* @param Utf8::TYPE_* $blockType one of the Utf8::_TYPE_* constants
*
* @return string hidden/special chars converted to visible human-readable
*/
public function dumpBlock($str, $blockType)
{
if ($str === '') {
return '';
}
switch ($blockType) {
case Utf8::TYPE_SPECIAL:
return $this->dumpBlockSpecial($str);
case Utf8::TYPE_CONTROL:
case Utf8::TYPE_OTHER:
$str = $this->dumpBlockCtrlOther($str);
return $this->options['useHtml']
? '<span class="binary">' . $str . '</span>'
: $str;
}
// default / 'utf8'
return $this->options['sanitizeNonBinary']
? \htmlspecialchars($str)
: $str;
}
/**
* Set one or more options
*
* setOptions('key', 'value')
* setOptions(array('k1'=>'v1', 'k2'=>'v2'))
*
* @param array<string,mixed>|string $mixed key=>value array or key
* @param mixed $val new value
*
* @return void
*/
public function setOptions($mixed, $val = null)
{
if (\is_string($mixed)) {
$mixed = array($mixed => $val);
}
$this->options = \array_merge($this->options, $mixed);
}
/**
* Dump "other" characters (ie control char)
*
* @param string $str string/char
*
* @return string
*/
private function dumpBlockCtrlOther($str)
{
if ($this->options['prefix'] === false) {
$str = \bin2hex($str);
return \trim(\chunk_split($str, 2, ' '));
}
if ($this->options['useHtml'] === false) {
$prefix = '\\x';
$str = \bin2hex($str);
$str = \trim(\chunk_split($str, 2, ' '));
return $prefix . \str_replace(' ', $prefix, $str);
}
$chars = \str_split($str);
foreach ($chars as $i => $char) {
$chars[$i] = $this->dumpCtrlOtherCharHtml($char);
}
return \implode('', $chars);
}
/**
* Dump a "special" char (ie hidden/whitespace)
*
* @param string $str string/char
*
* @return string
*/
private function dumpBlockSpecial($str)
{
$strNew = '';
$pos = 0; // ordUtf8 updates
$length = Utf8::strlen($str);
while ($pos < $length) {
$char = '';
$ord = self::ord($str, $pos, $char);
$ordHex = \dechex($ord);
$ordHex = \str_pad($ordHex, 4, '0', STR_PAD_LEFT);
if ($this->options['useHtml'] === false) {
$strNew .= '\\u{' . $ordHex . '}';
continue;
}
$title = isset($this->charDesc[$ord])
? 'U-' . $ordHex . ': ' . $this->charDesc[$ord]
: 'U-' . $ordHex;
$strNew .= \sprintf(
'<a class="unicode" href="%s" target="unicode" title="%s">\u%s</a>',
'https://symbl.cc/en/' . $ordHex,
$title,
$ordHex
);
}
return $strNew;
}
/**
* Dump control and "other" character
*
* @param string $char single (may be multi-byte) char
*
* @return string
*/
private function dumpCtrlOtherCharHtml($char)
{
$ord = \ord($char);
$prefix = '\\x';
$hex = $prefix . \bin2hex($char); // could use dechex($ord), but would require padding
if (!isset($this->charDesc[$ord])) {
// other
return $hex;
}
// lets use the control pictures
$chr = $ord === 0x7f
? "\xe2\x90\xa1" // "del" char
: "\xe2\x90" . \chr($ord + 128); // chars for 0x00 - 0x1F
return '<span class="c1-control" title="' . $this->charDesc[$ord] . ': ' . $hex . '">' . $chr . '</span>';
}
/**
* Returns decimal code-point for multi-byte character
*
* Use dechex to convert to hex (ie \uxxxx)
*
* $ord = ordUtf8($char)
* $ordHex = \dechex($ord);
* $ordHex = \str_pad($ordHex, 4, '0', STR_PAD_LEFT);
* $ordHex = '\\u{' . $ordHex . '}';
*
* @param string $str A string or single character
* @param int $offset (0) Zero-based offset will be updated to offset of next char
* @param string $char will be populated with the character found at offset
*
* @return int
*/
private static function ord($str, &$offset = 0, &$char = '')
{
$code = \ord($str[$offset]);
$numBytes = 1;
if ($code < 0x80) {
$numBytes = 1;
} elseif ($code < 0xe0) { // 110xxxxx
$code -= 0xc0;
$numBytes = 2;
} elseif ($code < 0xf0) { // 1110xxxx
$code -= 0xe0;
$numBytes = 3;
} elseif ($code < 0xf8) {
$code -= 0xf0;
$numBytes = 4; // 11110xxx
}
for ($i = 1; $i < $numBytes; $i++) {
$code2 = \ord($str[$offset + $i]) - 0x80; // 10xxxxxx
$code = $code * 64 + $code2;
}
$char = \substr($str, $offset, $numBytes);
$offset = $offset + $numBytes;
return $code;
}
}