ARM-software/golang-utils

View on GitHub
utils/charset/charset.go

Summary

Maintainability
A
0 mins
Test Coverage
/*
 * Copyright (C) 2020-2022 Arm Limited or its affiliates and Contributors. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
package charset

import (
    "bufio"
    "fmt"
    "io"
    "unicode/utf8"

    "github.com/gogs/chardet"
    "golang.org/x/text/encoding"
    "golang.org/x/text/encoding/htmlindex"
    "golang.org/x/text/encoding/ianaindex"

    "github.com/ARM-software/golang-utils/utils/charset/iconv"
    "github.com/ARM-software/golang-utils/utils/commonerrors"
)

// DetectTextEncoding returns the best guess of the text encoding of content,
// together with the charset name reported by LookupCharset.
//
// Content that is valid UTF-8 (an empty slice included) is reported as UTF-8
// without running the detector. Otherwise the best match of the chardet text
// detector is looked up. A detector failure is returned wrapped in
// commonerrors.ErrNotFound.
func DetectTextEncoding(content []byte) (encoding.Encoding, string, error) {
    charsetLabel := "UTF-8"
    if !utf8.Valid(content) {
        best, detectErr := chardet.NewTextDetector().DetectBest(content)
        if detectErr != nil {
            return nil, "", fmt.Errorf("%w: %v", commonerrors.ErrNotFound, detectErr)
        }
        charsetLabel = best.Charset
    }
    return LookupCharset(charsetLabel)
}

// DetectTextEncodingFromReader returns the best guess of the text encoding of
// the content served by reader. Only the first 1024 bytes are examined, in the
// same way as https://pkg.go.dev/golang.org/x/net/html/charset#DetermineEncoding
//
// A reader that ends before 1024 bytes is not an error: detection runs on
// whatever could be read. Any other read error is returned unchanged.
// The sample is read through a temporary bufio.Reader, so the data it buffers
// is consumed from reader and is not put back.
func DetectTextEncodingFromReader(reader io.Reader) (encoding.Encoding, string, error) {
    const sampleSize = 1024
    sample, peekErr := bufio.NewReader(reader).Peek(sampleSize)
    // A short read (EOF) still leaves a usable sample.
    if commonerrors.Any(peekErr, nil, io.EOF, commonerrors.ErrEOF) {
        return DetectTextEncoding(sample)
    }
    return nil, "", peekErr
}

// LookupCharset returns the encoding with the specified charsetLabel, and its
// canonical name. Matching is case-insensitive and ignores leading and
// trailing whitespace.
//
// When the encoding cannot be found, the error wraps commonerrors.ErrUnsupported
// if the lookup reported the charset as unsupported, and commonerrors.ErrInvalid
// otherwise. The canonical name is taken from the HTML index first and from the
// IANA index as a fallback; the error of that fallback, if any, is returned as is.
func LookupCharset(charsetLabel string) (charsetEnc encoding.Encoding, charsetName string, err error) {
    found, findErr := findCharsetEncoding(charsetLabel)
    if findErr != nil {
        if commonerrors.Any(findErr, commonerrors.ErrUnsupported) {
            return found, "", fmt.Errorf("%w: charset [%v] is not supported by go: %v", commonerrors.ErrUnsupported, charsetLabel, findErr.Error())
        }
        return found, "", fmt.Errorf("%w: charset [%v] is invalid: %v", commonerrors.ErrInvalid, charsetLabel, findErr.Error())
    }

    // Prefer the HTML index name; only fall back to IANA when it is missing.
    if htmlName, nameErr := htmlindex.Name(found); nameErr == nil {
        return found, htmlName, nil
    }
    ianaName, ianaErr := ianaindex.IANA.Name(found)
    return found, ianaName, ianaErr
}

// findCharsetEncoding searches for charsetLabel in several indexes, in order,
// and stops at the first index which either finds the encoding or reports it
// as unsupported. If no index does, the result of the last search is returned.
func findCharsetEncoding(charsetLabel string) (charsetEnc encoding.Encoding, err error) {
    searches := []func(string) (encoding.Encoding, error){
        // Index described in http://www.w3.org/TR/encoding
        htmlindex.Get,
        // Index described in https://www.iana.org/assignments/character-sets/character-sets.xhtml
        ianaindex.IANA.Encoding,
        // List of known unsupported charsets
        GetUnsupported,
    }
    for _, search := range searches {
        charsetEnc, err = findCharsetEncodingInAnIndex(search, charsetLabel)
        if commonerrors.Any(err, nil, commonerrors.ErrUnsupported) {
            break
        }
    }
    return charsetEnc, err
}

// findCharsetEncodingInAnIndex looks charsetLabel up using indexSearch.
// If that lookup neither succeeds nor reports the charset as unsupported, the
// label is translated to its canonical name through the encoding mapping and
// the lookup is attempted once more with that name. A failure to translate the
// label is returned to the caller.
func findCharsetEncodingInAnIndex(indexSearch func(string) (encoding.Encoding, error), charsetLabel string) (charsetEnc encoding.Encoding, err error) {
    charsetEnc, err = checkEncodingSupport(indexSearch(charsetLabel))
    if commonerrors.Any(err, nil, commonerrors.ErrUnsupported) {
        return charsetEnc, err
    }

    // The label as given was not recognised: retry with its canonical name.
    canonicalLabel, err := getEncodingMapping().GetCanonicalName(charsetLabel)
    if err == nil {
        return checkEncodingSupport(indexSearch(canonicalLabel))
    }
    return charsetEnc, err
}

// checkEncodingSupport post-processes the result of an index lookup.
// According to the index documentation, a nil encoding returned with a nil
// error means the encoding is not supported by the language; that case is
// turned into an error wrapping commonerrors.ErrUnsupported. Any other result
// is passed through untouched.
func checkEncodingSupport(charsetEnc encoding.Encoding, err error) (encoding.Encoding, error) {
    if err == nil && charsetEnc == nil {
        return charsetEnc, fmt.Errorf("%w charset encoding", commonerrors.ErrUnsupported)
    }
    return charsetEnc, err
}

// IconvString converts the string input from the text encoding fromEncoding
// to the text encoding toEncoding and returns the converted string.
func IconvString(input string, fromEncoding encoding.Encoding, toEncoding encoding.Encoding) (string, error) {
    converter := iconv.NewConverter(fromEncoding, toEncoding)
    return converter.ConvertString(input)
}

// IconvStringFromLabels is similar to IconvString but takes charset labels
// instead of encodings. Both labels are resolved with LookupCharset; if either
// lookup fails, its error is returned and no conversion is attempted.
func IconvStringFromLabels(input string, fromEncodingLabel string, toEncodingLabel string) (transformedText string, err error) {
    source, _, err := LookupCharset(fromEncodingLabel)
    if err != nil {
        return "", err
    }
    target, _, err := LookupCharset(toEncodingLabel)
    if err != nil {
        return "", err
    }
    return IconvString(input, source, target)
}

// IconvBytes converts the bytes in input from the text encoding fromEncoding
// to the text encoding toEncoding and returns the converted bytes.
func IconvBytes(input []byte, fromEncoding encoding.Encoding, toEncoding encoding.Encoding) ([]byte, error) {
    converter := iconv.NewConverter(fromEncoding, toEncoding)
    return converter.ConvertBytes(input)
}

// IconvBytesFromLabels is similar to IconvBytes but takes charset labels
// instead of encodings. Both labels are resolved with LookupCharset; if either
// lookup fails, its error is returned and no conversion is attempted.
func IconvBytesFromLabels(input []byte, fromEncodingLabel string, toEncodingLabel string) (transformedBytes []byte, err error) {
    source, _, err := LookupCharset(fromEncodingLabel)
    if err != nil {
        return nil, err
    }
    target, _, err := LookupCharset(toEncodingLabel)
    if err != nil {
        return nil, err
    }
    return IconvBytes(input, source, target)
}

// Iconv converts from any supported text encoding to any other, through
// Unicode conversion. It is similar to https://www.gnu.org/software/libiconv/
// but uses pure go, as opposed to many go libraries.
// The returned reader serves the content of reader converted from fromEncoding
// to toEncoding.
func Iconv(reader io.Reader, fromEncoding encoding.Encoding, toEncoding encoding.Encoding) io.Reader {
    converter := iconv.NewConverter(fromEncoding, toEncoding)
    return converter.Convert(reader)
}

// IconvFromLabels is similar to Iconv but takes charset labels instead of
// encodings. Both labels are resolved with LookupCharset; if either lookup
// fails, its error is returned along with a nil reader.
func IconvFromLabels(reader io.Reader, fromEncodingLabel string, toEncodingLabel string) (transformedReader io.Reader, err error) {
    source, _, err := LookupCharset(fromEncodingLabel)
    if err != nil {
        return nil, err
    }
    target, _, err := LookupCharset(toEncodingLabel)
    if err != nil {
        return nil, err
    }
    return Iconv(reader, source, target), nil
}