grokify/mogo

View on GitHub
html/htmlutil/htmlutil.go

Summary

Maintainability
A
0 mins
Test Coverage
package htmlutil

import (
    "html"
    "regexp"
    "strings"

    "github.com/microcosm-cc/bluemonday"
    xhtml "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

// ChartColor1 is the color palette for Google Charts as collected by
// Craig Davis here: https://gist.github.com/there4/2579834
//
// It is a fixed-size array of 20 hex color strings in "#RRGGBB" form.
// Note that "#3B3EAC" appears twice (indexes 5 and 19), so the palette
// holds 19 distinct colors.
var ChartColor1 = [...]string{
    "#3366CC",
    "#DC3912",
    "#FF9900",
    "#109618",
    "#990099",
    "#3B3EAC",
    "#0099C6",
    "#DD4477",
    "#66AA00",
    "#B82E2E",
    "#316395",
    "#994499",
    "#22AA99",
    "#AAAA11",
    "#6633CC",
    "#E67300",
    "#8B0707",
    "#329262",
    "#5574A6",
    "#3B3EAC",
}

// Link is a struct to hold information for an HTML link.
type Link struct {
    // Href is presumably the link target, i.e. the value of the anchor's
    // href attribute — confirm against callers.
    Href      string
    // InnerHTML is presumably the markup placed between the opening and
    // closing anchor tags — confirm against callers.
    InnerHTML string
}

// Named colors expressed as "#RRGGBB" hex strings.
//
// The Color2* constants form a green/yellow/red set; the RingCentral*
// constants are presumably RingCentral brand colors (verify against the
// brand guidelines before relying on that).
const (
    Color2GreenHex       = "#00FF2A"
    Color2YellowHex      = "#DFDD13"
    Color2RedHex         = "#FF0000"
    RingCentralOrangeHex = "#FF8800"
    RingCentralBlueHex   = "#0073AE"
    RingCentralGreyHex   = "#585858"
)

// Package-level sanitizer policy, precompiled regular expressions, and
// string constants shared by the conversion helpers in this file. The
// regular expressions are compiled once at package initialization.
var (
    // bluemondayStrictPolicy is a bluemonday strict policy, built once and
    // reused by HTMLToText and HTMLToTextCondensed to sanitize input.
    bluemondayStrictPolicy                  = bluemonday.StrictPolicy()
    // rxHTMLToTextNewLines matches "<p>" and "</p>" (no attributes),
    // case-insensitively. The group only scopes the flag; it does not capture.
    rxHTMLToTextNewLines     *regexp.Regexp = regexp.MustCompile(`(?i:</?p>)`)
    // rxCarriageReturn matches a single carriage return.
    rxCarriageReturn         *regexp.Regexp = regexp.MustCompile(`\r`)
    // rxDiv matches an opening "<div>" tag without attributes,
    // case-insensitively.
    rxDiv                    *regexp.Regexp = regexp.MustCompile(`(?i)<div>`)
    // rxLineFeed matches a single line feed.
    rxLineFeed               *regexp.Regexp = regexp.MustCompile(`\n`)
    // rxLineFeedMore matches a run of one or more line feeds.
    rxLineFeedMore           *regexp.Regexp = regexp.MustCompile(`\n+`)
    // rxCarriageReturnLineFeed matches a CRLF pair.
    rxCarriageReturnLineFeed *regexp.Regexp = regexp.MustCompile(`\r\n`)
    // rxLineFeedMore2 matches a run of two or more line feeds.
    rxLineFeedMore2          *regexp.Regexp = regexp.MustCompile(`\n\n+`)
    // doubleLinefeed is the blank-line separator inserted by HTMLToText.
    doubleLinefeed                          = "\n\n"
    // rxCarriageReturnLineFeedMore *regexp.Regexp = regexp.MustCompile(`[\r\n]+`)
    // rxEndingSpacesLineFeed       *regexp.Regexp = regexp.MustCompile(`\s+\n`)
)

// EscapeStrings returns a new slice holding the HTML-escaped form of every
// element of s, in the same order, as produced by html.EscapeString. The
// input slice is left untouched. When s has no elements the result is nil.
func EscapeStrings(s []string) []string {
    if len(s) == 0 {
        return nil
    }
    escaped := make([]string, len(s))
    for i := range s {
        escaped[i] = html.EscapeString(s[i])
    }
    return escaped
}

// StreamlineCRLFs normalizes line endings and per-line whitespace in s.
//
// Every CRLF pair and every lone CR is first converted to a single LF, so
// that all three line-ending conventions are treated as line breaks. Each
// resulting line is then trimmed of leading and trailing whitespace, and any
// run of consecutive line feeds (including lines that were blank or
// whitespace-only) is collapsed to a single LF.
//
// Line endings are normalized before the lines are split and trimmed;
// converting lone CRs only afterwards would leave the whitespace around them
// in place (for example "a \r b" would become "a \n b" rather than "a\nb").
func StreamlineCRLFs(s string) string {
    normalized := strings.ReplaceAll(strings.ReplaceAll(s, "\r\n", "\n"), "\r", "\n")

    var b strings.Builder
    b.Grow(len(normalized))
    // endsWithLF tracks whether the last byte written was a separator, so
    // that blank lines do not produce a run of line feeds.
    endsWithLF := false
    for i, line := range strings.Split(normalized, "\n") {
        if i > 0 && !endsWithLF {
            b.WriteByte('\n')
            endsWithLF = true
        }
        if trimmed := strings.TrimSpace(line); trimmed != "" {
            b.WriteString(trimmed)
            endsWithLF = false
        }
    }
    return b.String()
}

// HTMLToTextCondensed turns HTML into a single line of plain text.
//
// The input is sanitized with the package's strict bluemonday policy, HTML
// entities are then unescaped, and finally the text is split on whitespace
// (as defined by strings.Fields, which includes non-breaking spaces) and
// rejoined with single spaces. Leading and trailing whitespace is dropped.
func HTMLToTextCondensed(s string) string {
    stripped := bluemondayStrictPolicy.Sanitize(s)
    decoded := html.UnescapeString(stripped)
    words := strings.Fields(decoded)
    return strings.Join(words, " ")
}

// HTMLToText converts HTML to multi-line text.
func HTMLToText(s string) string {
    return rxLineFeedMore2.ReplaceAllString(
        strings.TrimSpace(
            html.UnescapeString(
                bluemondayStrictPolicy.Sanitize(
                    rxHTMLToTextNewLines.ReplaceAllString(
                        rxDiv.ReplaceAllString(s, doubleLinefeed),
                        "$1"+doubleLinefeed),
                ),
            ),
        ),
        doubleLinefeed,
    )
}

// HTMLToTextH1 is a convenience wrapper around HTMLToTextAtom that uses
// atom.H1 as the element to look for. The arguments b and policy are passed
// through unchanged, and the result and error are those of HTMLToTextAtom.
func HTMLToTextH1(b []byte, policy *bluemonday.Policy) (string, error) {
    return HTMLToTextAtom(b, policy, atom.H1)
}

// HTMLToTextAtom extracts the tokens delimited by the element a from the
// HTML in b and returns them as sanitized, whitespace-trimmed text.
//
// Tokens are collected with NextTokens, using a start-tag token for a as the
// start filter and an end-tag token for a as the end filter, with
// IncludeChain and InclusiveMatch enabled and SkipErrors disabled. The exact
// matching semantics are defined by NextTokens, which is not shown here —
// presumably the first matching element is returned; confirm there.
//
// The collected tokens are rendered to a string and passed through policy.
// When policy is nil, a fresh bluemonday strict policy is used. Any error
// from NextTokens is returned unchanged together with an empty string.
func HTMLToTextAtom(b []byte, policy *bluemonday.Policy, a atom.Atom) (string, error) {
    openTag := xhtml.Token{DataAtom: a, Type: xhtml.StartTagToken}
    closeTag := xhtml.Token{DataAtom: a, Type: xhtml.EndTagToken}

    matched, err := NextTokens(NewTokenizerBytes(b), NextTokensOpts{
        SkipErrors:     false,
        IncludeChain:   true,
        InclusiveMatch: true,
        StartFilter:    []xhtml.Token{openTag},
        EndFilter:      []xhtml.Token{closeTag},
    })
    if err != nil {
        return "", err
    }

    sanitizer := policy
    if sanitizer == nil {
        sanitizer = bluemonday.StrictPolicy()
    }
    cleaned := sanitizer.Sanitize(matched.String())
    return strings.TrimSpace(cleaned), nil
}

// SimplifyHTMLText reduces arbitrary HTML to a flat sequence of paragraphs.
//
// The input is first converted to plain text with HTMLToText. The text is
// then split on line feeds, each line is trimmed, empty lines are dropped,
// and every remaining line is emitted as "<p>…</p>". The paragraphs are
// concatenated with no separator; an input with no text yields "".
//
// Because HTMLToText unescapes HTML entities, each line is re-escaped with
// html.EscapeString before being wrapped. Without this, entity-encoded
// markup in the input (for example "&lt;script&gt;") would reappear as live
// markup in the returned HTML.
func SimplifyHTMLText(s string) string {
    var b strings.Builder
    for _, line := range strings.Split(HTMLToText(s), "\n") {
        line = strings.TrimSpace(line)
        if line == "" {
            continue
        }
        b.WriteString("<p>")
        b.WriteString(html.EscapeString(line))
        b.WriteString("</p>")
    }
    return b.String()
}

func TextToHTML(s string) string {
    return rxLineFeed.ReplaceAllString(StreamlineCRLFs(s), "<br/>")
}

func TextToHTMLBr2(s string) string {
    return rxLineFeed.ReplaceAllString(
        rxLineFeedMore.ReplaceAllString(StreamlineCRLFs(s), "\n"),
        "<br/><br/>",
    )
}