status-im/status-go

View on GitHub
abi-spec/utf8.go

Summary

Maintainability
A
0 mins
Test Coverage
D
66%
package abispec

import (
    "fmt"
    "unicode/utf8"

    "github.com/ethereum/go-ethereum/common/hexutil"
)

// stringToRunes splits str into runes, tolerating input that is not valid
// UTF-8.
//
// Every sequence that decodes to something other than utf8.RuneError is
// emitted as that single rune. When the decoder reports utf8.RuneError, each
// byte it consumed is emitted as its own rune instead. That covers malformed
// bytes (consumed one at a time) and also a correctly encoded U+FFFD, whose
// three bytes therefore come out as three separate runes.
//
// The result is nil for an empty string.
func stringToRunes(str string) []rune {
    var result []rune
    for pos := 0; pos < len(str); {
        r, width := utf8.DecodeRuneInString(str[pos:])
        if r != utf8.RuneError {
            result = append(result, r)
        } else {
            // Fall back to the raw bytes the decoder stepped over.
            for _, b := range []byte(str[pos : pos+width]) {
                result = append(result, rune(b))
            }
        }
        pos += width
    }
    return result
}

// ucs2decode converts str to runes with stringToRunes and then folds any
// UTF-16 surrogate pair found among those runes into the single code point it
// represents. All other values, including unpaired surrogates, are passed
// through unchanged.
//
// Taken from https://mths.be/punycode
func ucs2decode(str string) []rune {
    units := stringToRunes(str)
    var decoded []rune
    for i := 0; i < len(units); i++ {
        unit := units[i]
        isHighSurrogate := unit >= 0xD800 && unit <= 0xDBFF
        if !isHighSurrogate || i+1 >= len(units) {
            // Ordinary value, or a high surrogate with nothing after it.
            decoded = append(decoded, unit)
            continue
        }
        next := units[i+1]
        if next&0xFC00 != 0xDC00 {
            // Unmatched surrogate: emit only this unit and leave the next
            // one in place, since it may itself start a surrogate pair.
            decoded = append(decoded, unit)
            continue
        }
        decoded = append(decoded, ((unit&0x3FF)<<10)+(next&0x3FF)+0x10000)
        i++ // the low surrogate has been consumed as well
    }
    return decoded
}

// ucs2encode returns the UTF-8 encoding of the code points in array,
// concatenated in order. The result is nil when array is empty.
//
// Adapted from https://mths.be/punycode. The JavaScript original splits code
// points above 0xFFFF into a UTF-16 surrogate pair, which is right for a
// JavaScript string but wrong for UTF-8 bytes: utf8.EncodeRune writes U+FFFD
// for a surrogate half, so each supplementary character would come out as two
// replacement characters. Every code point is therefore encoded directly.
//
// Values that are not valid code points (surrogate halves, negative values,
// values above utf8.MaxRune) are written as U+FFFD by utf8.EncodeRune.
func ucs2encode(array []rune) []byte {
    var output []byte
    var scratch [utf8.UTFMax]byte
    for _, value := range array {
        n := utf8.EncodeRune(scratch[:], value)
        output = append(output, scratch[:n]...)
    }
    return output
}

// appendBytes appends every byte of bytes to dest and returns the extended
// slice. As with the built-in append, the result may or may not share storage
// with dest, so callers must use the returned value.
func appendBytes(dest []byte, bytes []byte) []byte {
    return append(dest, bytes...)
}

// checkScalarValue returns an error when codePoint lies in the surrogate
// range 0xD800-0xDFFF, and nil for every other value.
func checkScalarValue(codePoint rune) error {
    if codePoint < 0xD800 || codePoint > 0xDFFF {
        return nil
    }
    return fmt.Errorf("lone surrogate U+%s is not a scalar value", hexutil.EncodeUint64(uint64(codePoint)))
}

// stringFromCharCode returns the UTF-8 encoding of codePoint as a byte slice
// of one to four bytes. Values that utf8.EncodeRune cannot encode (such as
// surrogate halves) are returned as the encoding of U+FFFD.
func stringFromCharCode(codePoint rune) []byte {
    encoded := make([]byte, utf8.UTFMax)
    return encoded[:utf8.EncodeRune(encoded, codePoint)]
}

// createByte builds one UTF-8 continuation unit from codePoint: it takes the
// six bits starting at bit position shift, sets the 0x80 continuation marker,
// and returns that value as encoded by stringFromCharCode.
func createByte(codePoint rune, shift uint32) []byte {
    payload := (codePoint >> shift) & 0x3F
    return stringFromCharCode(payload | 0x80)
}

// encodeCodePoint computes the UTF-8 byte values for codePoint and returns
// them with each byte value passed through stringFromCharCode, i.e. one
// character per UTF-8 byte.
//
// An error is returned when codePoint needs a 3-byte sequence and
// checkScalarValue rejects it as a lone surrogate.
func encodeCodePoint(codePoint rune) ([]byte, error) {
    bits := uint32(codePoint)

    // 1-byte sequence: the value is emitted as is.
    if bits&0xFFFFFF80 == 0 {
        return stringFromCharCode(codePoint), nil
    }

    // Lead byte plus all but the last continuation byte.
    var symbol []byte
    switch {
    case bits&0xFFFFF800 == 0: // 2-byte sequence
        symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0)
    case bits&0xFFFF0000 == 0: // 3-byte sequence
        if err := checkScalarValue(codePoint); err != nil {
            return nil, err
        }
        symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0)
        symbol = appendBytes(symbol, createByte(codePoint, 6))
    case bits&0xFFE00000 == 0: // 4-byte sequence
        symbol = stringFromCharCode(((codePoint >> 18) & 0x07) | 0xF0)
        symbol = appendBytes(symbol, createByte(codePoint, 12))
        symbol = appendBytes(symbol, createByte(codePoint, 6))
    }

    // Every multi-byte form ends with the low six bits as a continuation byte.
    return appendBytes(symbol, stringFromCharCode((codePoint&0x3F)|0x80)), nil
}

// Utf8encode converts str to code points with ucs2decode, encodes each one
// with encodeCodePoint, and returns the concatenated result as a string. It
// stops at the first code point that encodeCodePoint rejects and returns that
// error together with an empty string.
//
// implementation referenced from https://github.com/mathiasbynens/utf8.js/blob/2ce09544b62f2a274dbcd249473c0986e3660849/utf8.js
func Utf8encode(str string) (string, error) {
    var encoded []byte
    for _, codePoint := range ucs2decode(str) {
        symbol, err := encodeCodePoint(codePoint)
        if err != nil {
            return "", err
        }
        encoded = appendBytes(encoded, symbol)
    }
    return string(encoded), nil
}

// readContinuationByte consumes the element at *pByteIndex of byteArray,
// treating only its low eight bits as the byte value, and advances
// *pByteIndex by one.
//
// It returns the six payload bits when the byte has the 10xxxxxx continuation
// form. It returns utf8.RuneError and an error when *pByteIndex is already at
// or past byteCount (the index is then left untouched), or when the byte is
// not a continuation byte (the index has already been advanced).
func readContinuationByte(byteArray []rune, byteCount int, pByteIndex *int) (rune, error) {
    pos := *pByteIndex
    if pos >= byteCount {
        return utf8.RuneError, fmt.Errorf("invalid byte index")
    }

    current := byteArray[pos] & 0xFF
    *pByteIndex = pos + 1

    if current&0xC0 != 0x80 {
        // Top two bits are not "10", so this cannot continue a sequence.
        return utf8.RuneError, fmt.Errorf("invalid continuation byte")
    }
    return current & 0x3F, nil
}

// decodeSymbol decodes one UTF-8 sequence from byteArray starting at
// *pByteIndex, advancing *pByteIndex past every element it consumes. Only the
// low eight bits of each element are used as the byte value.
//
// The boolean result reports whether a code point was produced:
//   - (codePoint, true, nil) for a successfully decoded sequence;
//   - (utf8.RuneError, false, nil) when *pByteIndex equals byteCount, i.e.
//     the input is exhausted;
//   - (utf8.RuneError, false, err) when the index is past byteCount, a
//     continuation byte is missing or malformed, the sequence is overlong,
//     a 3-byte sequence is rejected by checkScalarValue, or the lead byte
//     does not start any 1- to 4-byte form.
func decodeSymbol(byteArray []rune, byteCount int, pByteIndex *int) (rune, bool, error) {
    switch {
    case *pByteIndex > byteCount:
        return utf8.RuneError, false, fmt.Errorf("invalid byte index")
    case *pByteIndex == byteCount:
        return utf8.RuneError, false, nil
    }

    // Consume the lead byte.
    lead := byteArray[*pByteIndex] & 0xFF
    *pByteIndex++

    // next pulls the following continuation byte's payload bits.
    next := func() (rune, error) {
        return readContinuationByte(byteArray, byteCount, pByteIndex)
    }

    switch {
    case lead&0x80 == 0:
        // 1-byte sequence (no continuation bytes)
        return lead, true, nil

    case lead&0xE0 == 0xC0:
        // 2-byte sequence
        b2, err := next()
        if err != nil {
            return utf8.RuneError, false, err
        }
        codePoint := (lead&0x1F)<<6 | b2
        if codePoint < 0x80 {
            // Overlong form of a value that fits in one byte.
            return utf8.RuneError, false, fmt.Errorf("invalid continuation byte")
        }
        return codePoint, true, nil

    case lead&0xF0 == 0xE0:
        // 3-byte sequence (may include unpaired surrogates)
        b2, err := next()
        if err != nil {
            return utf8.RuneError, false, err
        }
        b3, err := next()
        if err != nil {
            return utf8.RuneError, false, err
        }
        codePoint := (lead&0x0F)<<12 | b2<<6 | b3
        if codePoint < 0x0800 {
            // Overlong form of a value that fits in two bytes.
            return utf8.RuneError, false, fmt.Errorf("invalid continuation byte")
        }
        if err := checkScalarValue(codePoint); err != nil {
            return utf8.RuneError, false, err
        }
        return codePoint, true, nil

    case lead&0xF8 == 0xF0:
        // 4-byte sequence
        b2, err := next()
        if err != nil {
            return utf8.RuneError, false, err
        }
        b3, err := next()
        if err != nil {
            return utf8.RuneError, false, err
        }
        b4, err := next()
        if err != nil {
            return utf8.RuneError, false, err
        }
        codePoint := (lead&0x07)<<0x12 | b2<<0x0C | b3<<0x06 | b4
        if codePoint >= 0x010000 && codePoint <= 0x10FFFF {
            return codePoint, true, nil
        }
        // Out-of-range results fall through to the generic error below.
    }

    return utf8.RuneError, false, fmt.Errorf("invalid UTF-8 detected")
}

// Utf8decode converts str to a sequence of values with ucs2decode, decodes
// that sequence one symbol at a time with decodeSymbol until it reports the
// end of input, and returns the collected code points as encoded by
// ucs2encode. The first decoding error aborts the call and is returned with a
// nil slice.
//
// implementation referenced from https://github.com/mathiasbynens/utf8.js/blob/2ce09544b62f2a274dbcd249473c0986e3660849/utf8.js
func Utf8decode(str string) ([]byte, error) {
    units := ucs2decode(str)
    pos := 0
    var decoded []rune
    for {
        codePoint, more, err := decodeSymbol(units, len(units), &pos)
        if err != nil {
            return nil, err
        }
        if !more {
            return ucs2encode(decoded), nil
        }
        decoded = append(decoded, codePoint)
    }
}