garrettw/stringobject

View on GitHub
src/UString.php

Summary

Maintainability
C
1 day
Test Coverage
<?php

namespace StringObject;

class UString extends AnyString
{
    const RAW = 0;
    const NFC = 1;
    const NFD = 2;
    const NFK = 4;
    const NFKC = 5;
    const NFKD = 6;

    protected $chars = [];
    protected $uhandler;
    protected $normform = self::RAW;

    protected static $spec = [
        2 => ['mask' => 0b00011111, 'start' => 0x80],
        3 => ['mask' => 0b00001111, 'start' => 0x800],
        4 => ['mask' => 0b00000111, 'start' => 0x10000],
        5 => ['mask' => 0b00000011, 'start' => 0x200000],
        6 => ['mask' => 0b00000001, 'start' => 0x4000000],
    ];
    protected static $winc1umap = [
        0x80 => 0x20AC,
        0x81 => 0xFFFD, // invalid
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8D => 0xFFFD, // invalid
        0x8E => 0x017D,
        0x8F => 0xFFFD, // invalid
        0x90 => 0xFFFD, // invalid
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9D => 0xFFFD, // invalid
        0x9E => 0x017E,
        0x9F => 0x0178,
    ];

    public function toArray($delim = '', $limit = null)
    {
        $this->parse();

        if (empty($delim)) {
            return $this->chars;
        }
        if (is_int($delim)) {
            return \str_split($this->raw, $delim);
        }
        if ($limit === null) {
            return \explode($delim, $this->raw);
        }
        return \explode($delim, $this->raw, $limit);
    }

    /**
     * @return string
     */
    public function charAt($index)
    {
        $this->parse();
        return $this->chars[$index][0];
    }

    /**
     * @return int
     */
    public function charCodeAt($index)
    {
        $this->parse();
        return $this->chars[$index][1];
    }

    public function detectForm()
    {

    }

    public function length()
    {
        $this->parse();
        return \count($this->chars);
    }

    public function normalize(Normalize\Normalizer $norm)
    {
        $this->parse();
        $result = $norm->normalize($this);

        if (is_int($result)) {
            // it was already normalized
            $this->normform = $result;
            return $this;
        }
        return $this->replaceWhole($result);
    }

    /**
     *
     */
    protected static function cpToUtf8Char($cpt)
    {
        if ($cpt < self::$spec[2]['start']) {
            return \chr($cpt);
        }

        if ($cpt == 0xFEFF) {
            return '';
        }

        if (($cpt >= 0xD800 && $cpt <= 0xDFFF) || $cpt > 0x10FFFF) {
            return "\xEF\xBF\xBD"; // U+FFFD; invalid symbol
        }

        if ($cpt < self::$spec[3]['start']) {
            $data = [
                0b11000000 | ($cpt >> 6),
                0b10000000 | ($cpt & 0b00111111)
            ];
        } elseif ($cpt < self::$spec[4]['start']) {
            $data = [
                0b11100000 | ($cpt >> 12),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        } else {
            $data = [
                0b11110100,
                0b10000000 | (($cpt >> 12) & 0b00111111),
                0b10000000 | (($cpt >> 6) & 0b00111111),
                0b10000000 | ($cpt & 0b00111111),
            ];
        }

        return implode(array_map('chr', $data));
    }
    /**
     * @param integer $byte
     */
    protected static function charLength($byte)
    {
        if ($byte < 192) {
            return 1;
        }
        if (($byte & 0b11100000) === 0b11000000) {
            return 2;
        }
        if (($byte & 0b11110000) === 0b11100000) {
            return 3;
        }
        if (($byte & 0b11111000) === 0b11110000) {
            return 4;
        }
        if (($byte & 0b11111100) === 0b11111000) {
            return 5;
        }
        if (($byte & 0b11111110) === 0b11111100) {
            return 6;
        }
        return false;
    }

    private function parse()
    {
        if (!empty($this->chars)) {
            return;
        }

        $len = \strlen($this->raw);
        $inside = false; // are we "inside" of evaluating a valid UTF-8 char?
        $invalid = false;

        for ($offset = 0; $offset < $len; $offset++) {
            $char = $this->raw{$offset};
            $ord = \ord($char);

            if ($inside === false) {
                $bytes = self::charLength($ord);

                if ($bytes > 1 && $offset + $bytes <= $len && $invalid === false) {
                    // valid UTF-8 multibyte start
                    $inside = true;
                    $cache = $char;
                    $ordcache = ($ord & self::$spec[$bytes]['mask']) << (6 * ($bytes - 1));
                    $originOffset = $offset;
                    continue;
                }
                if ($ord < self::$spec[2]['start']) {
                    // ASCII 7-bit char
                    $this->chars[] = [$char, $ord];
                    continue;
                }

                // either C0/C1 block or higher; map from cp1252 to utf8 or just convert
                if (isset(self::$winc1umap[$ord])) {
                    $ord = self::$winc1umap[$ord];
                }
                $this->chars[] = [self::cpToUtf8Char($ord), $ord];
                $invalid = false;
                continue;
            }

            // $inside === true, i.e. *should be* continuation character
            if (($ord & 0b11000000) !== 0b10000000) {
                // actually, it's not one, so now the whole UTF-8 char is invalid
                // go back and force it to parse as ISO or 1252
                $inside = false;
                $invalid = true;
                $offset = $originOffset - 1;
                continue;
            }

            // put this byte's data where it needs to go
            $ordcache |= ($ord & 0b00111111) << (6 * ($bytes - 1 - ($offset - $originOffset)));
            $cache .= $char;

            if ($originOffset + ($bytes - 1) === $offset) {
                // we're done parsing this char, now let's verify
                $inside = false;

                // check for overlong, surrogate, too large, BOM, or C0/C1
                $overlong = ($ordcache < self::$spec[$bytes]['start']);
                $surrogate = ($ordcache & 0xFFFFF800 === 0xD800);
                $toobig = ($ordcache > 0x10FFFF);

                if ($overlong || $surrogate || $toobig) {
                    $invalid = true;
                    $offset = $originOffset - 1;
                    continue;
                }

                if ($ordcache === 0xFEFF) { // BOM
                    if ($originOffset !== 0) {
                        // if not at beginning, store as word joiner U+2060
                        $this->chars[] = ["\xE2\x81\xA0", 0x2060];
                    }
                    // otherwise discard
                    continue;
                }

                // verification passed, now store it
                $this->chars[] = [$cache, $ordcache];
            }
        }
    }
}