ReanGD/go-web-search

View on GitHub
crawler/body_parser.go

Summary

Maintainability
A
35 mins
Test Coverage
package crawler

// status: ok
import (
    "bufio"
    "bytes"
    "compress/gzip"
    "io"
    "io/ioutil"

    "github.com/ReanGD/go-web-search/werrors"
    "github.com/uber-go/zap"

    "golang.org/x/net/html"
    "golang.org/x/net/html/charset"
    "golang.org/x/text/encoding"
    "golang.org/x/text/transform"
)

// readBody reads the whole of body and returns it, decompressing the data
// when contentEncoding asks for it.
//
// contentEncoding is the value of the Content-Encoding header. It is matched
// case-insensitively, because HTTP content codings are case-insensitive:
//   - "gzip" and its legacy alias "x-gzip": body is read through a gzip reader;
//     any failure is reported as ErrReadGZipResponse.
//   - "identity" or the empty string: body is read as-is; a failure is
//     reported as ErrReadResponse.
//   - anything else: ErrUnknownContentEncoding, carrying the header value
//     exactly as it was received.
//
// On error the returned slice holds whatever was read before the failure
// (possibly nothing).
func readBody(contentEncoding string, body io.Reader) ([]byte, error) {
    result := []byte{}

    // Normalise only for matching; the original value is kept for reporting.
    coding := string(bytes.ToLower([]byte(contentEncoding)))

    switch coding {
    case "gzip", "x-gzip":
        reader, err := gzip.NewReader(body)
        if err != nil {
            return result, werrors.NewDetails(ErrReadGZipResponse, err)
        }
        result, err = ioutil.ReadAll(reader)
        // Always close the gzip reader, but let a read error take precedence
        // over a close error: the read error is the root cause.
        closeErr := reader.Close()
        if err == nil {
            err = closeErr
        }
        if err != nil {
            return result, werrors.NewDetails(ErrReadGZipResponse, err)
        }
    case "identity", "":
        var err error
        result, err = ioutil.ReadAll(body)
        if err != nil {
            return result, werrors.NewDetails(ErrReadResponse, err)
        }
    default:
        return result, werrors.NewFields(ErrUnknownContentEncoding, zap.String("encoding", contentEncoding))
    }

    return result, nil
}

// isHTML reports whether an <html> start tag is found while tokenizing the
// beginning of content. Only the first 1024 bytes are inspected; empty input
// is never considered HTML.
func isHTML(content []byte) bool {
    const sniffLen = 1024

    if len(content) == 0 {
        return false
    }

    head := content
    if len(head) > sniffLen {
        head = head[:sniffLen]
    }

    tokenizer := html.NewTokenizer(bytes.NewReader(head))
    for {
        tokenType := tokenizer.Next()

        // ErrorToken marks the end of the inspected prefix (or a tokenizer
        // failure): no <html> start tag was seen.
        if tokenType == html.ErrorToken {
            return false
        }
        if tokenType != html.StartTagToken {
            continue
        }

        if name, _ := tokenizer.TagName(); string(name) == "html" {
            return true
        }
    }
}

// bodyToUTF8 wraps body in a reader that decodes it with the encoding
// chosen by charset.DetermineEncoding from the body bytes and the
// contentType value.
//
// When the determined encoding is encoding.Nop, no reader is produced and
// ErrEncodingNotFound is returned instead.
func bodyToUTF8(body []byte, contentType string) (*transform.Reader, error) {
    detected, _, _ := charset.DetermineEncoding(body, contentType)

    if detected != encoding.Nop {
        source := bytes.NewReader(body)
        decoder := detected.NewDecoder()
        return transform.NewReader(source, decoder), nil
    }

    return nil, werrors.New(ErrEncodingNotFound)
}

// bodyMinification runs the HTML minification pass and then the text
// minification pass over node, and finally renders the resulting tree
// into buf.
//
// An error from either minification pass is returned unchanged and stops
// processing. An error while rendering or flushing the output is wrapped
// as ErrRenderHTML.
func bodyMinification(node *html.Node, buf io.Writer) error {
    htmlPass := minificationHTML{}
    if err := htmlPass.Run(node); err != nil {
        return err
    }

    textPass := minificationText{}
    if err := textPass.Run(node); err != nil {
        return err
    }

    // Render through a buffered writer; the flush pushes the remaining
    // bytes to buf and is part of the render step for error reporting.
    out := bufio.NewWriter(buf)
    if err := html.Render(out, node); err != nil {
        return werrors.NewDetails(ErrRenderHTML, err)
    }
    if err := out.Flush(); err != nil {
        return werrors.NewDetails(ErrRenderHTML, err)
    }

    return nil
}