grokify/mogo

View on GitHub
html/htmlutil/tokenizer.go

Summary

Maintainability
B
4 hrs
Test Coverage
package htmlutil

import (
    "bytes"
    "errors"
    "io"
    "os"

    "github.com/grokify/mogo/type/stringsutil"
    "golang.org/x/net/html"
)

var (
    ErrTokenNotFound           = errors.New("token(s) not found")
    ErrTokenizerNotInitialized = errors.New("tokenizer not initialized")
)

// NewTokenizerBytes returns an `html.Tokenizer` that reads from the
// supplied byte slice.
func NewTokenizerBytes(b []byte) *html.Tokenizer {
	rdr := bytes.NewReader(b)
	return html.NewTokenizer(rdr)
}

// NewTokenizerFile returns an `html.Tokenizer` that reads the contents
// of the named file, or an error if the file cannot be read.
func NewTokenizerFile(name string) (*html.Tokenizer, error) {
	data, err := os.ReadFile(name)
	if err != nil {
		return nil, err
	}
	return NewTokenizerBytes(data), nil
}

/*
// TokensBetweenAtom returns the tokens that represent the `innerHtml`
// between a start and end tag token.
func TokensBetweenAtom(z *html.Tokenizer, skipErrors, inclusive bool, htmlAtom atom.Atom) ([]html.Token, error) {
    return TokensBetween(z, skipErrors, inclusive,
        TokenFilters{{
            TokenType: html.StartTagToken,
            AtomSet:   NewAtomSet(htmlAtom)}},
        TokenFilters{{
            TokenType: html.EndTagToken,
            AtomSet:   NewAtomSet(htmlAtom)}})
}
*/

/*
func TokensBetween(z *html.Tokenizer, skipErrors, inclusive bool, begin, end TokenFilters) ([]html.Token, error) {
    tokens := []html.Token{}
    tmsBegin, err := NextTokenMatch(z, skipErrors, false, false, begin...)
    if err != nil {
        return tokens, err
    }
    if inclusive {
        tokens = append(tokens, tmsBegin...)
    }
    tokensChain, err := NextTokenMatch(z, skipErrors, true, inclusive, end...)
    if err != nil {
        return tokens, err
    }
    tokens = append(tokens, tokensChain...)
    return tokens, nil
}
*/

/*

func TokensBetweenNew(z *html.Tokenizer, skipErrors, inclusive bool, begin, end []html.Token) ([]html.Token, error) {
    begFilters := Tokens(begin)
    endFilter := Tokens(end)

    tokens := []html.Token{}
    tmsBegin, err := NextToken(z, skipErrors, begin...)
    if err != nil {
        return tokens, err
    }
    if inclusive {
        tokens = append(tokens, tmsBegin...)
    }
    tokensChain, err := NextTokenMatch(z, skipErrors, true, inclusive, end...)
    if err != nil {
        return tokens, err
    }
    tokens = append(tokens, tokensChain...)
    return tokens, nil
}
*/

// NextToken advances the tokenizer until a token matching any of
// `tokFilters` is found and returns that token. It returns
// `ErrTokenNotFound` if the input is exhausted without a match. When
// `skipErrors` is true, non-EOF tokenizer errors are skipped.
func NextToken(z *html.Tokenizer, skipErrors bool, tokFilters ...html.Token) (html.Token, error) {
	opts := NextTokensOpts{
		SkipErrors:     skipErrors,
		IncludeChain:   false,
		InclusiveMatch: true,
		StartFilter:    []html.Token{},
		EndFilter:      tokFilters,
	}
	toks, err := NextTokens(z, opts)
	if err != nil {
		return html.Token{}, err
	} else if len(toks) == 0 {
		return html.Token{}, ErrTokenNotFound
	} else if len(toks) > 1 {
		// With `IncludeChain: false`, `NextTokens` returns at most one
		// token, so this branch indicates an internal invariant violation.
		// Return an error rather than panic: libraries should not panic
		// on conditions a caller could otherwise recover from.
		return html.Token{}, errors.New("internal error: too many tokens (>1) found")
	}
	return toks[0], nil
}

// NextTokensOpts configures the scanning behavior of `NextTokens` and
// `NextTextToken`.
type NextTokensOpts struct {
	// SkipErrors, when true, ignores tokenizer errors other than `io.EOF`.
	SkipErrors               bool
	// IncludeChain, when true, accumulates every token seen after the
	// start match and before the end match into the result.
	IncludeChain             bool
	// InclusiveMatch, when true, includes the matching start/end tokens
	// themselves in the result.
	InclusiveMatch           bool
	// StartFilter lists tokens that mark the start of the region of
	// interest; an empty filter means collection starts immediately.
	StartFilter              Tokens
	// StartAttributeValueMatch controls how attribute values are compared
	// when evaluating `StartFilter`; a nil value defaults to exact match
	// (see `NextTokens`).
	StartAttributeValueMatch *stringsutil.MatchInfo
	// EndFilter lists tokens that terminate the scan.
	EndFilter                Tokens
}

// NextTokens advances the tokenizer, optionally waiting for a token that
// matches `opts.StartFilter`, then collects tokens until one matches
// `opts.EndFilter` or the input is exhausted. See `NextTokensOpts` for
// the behavior flags. It returns `ErrTokenizerNotInitialized` if `z` is
// nil.
func NextTokens(z *html.Tokenizer, opts NextTokensOpts) (Tokens, error) {
	matches := []html.Token{}
	if z == nil {
		return matches, ErrTokenizerNotInitialized
	}
	if opts.StartAttributeValueMatch == nil {
		// Default to exact attribute-value matching.
		opts.StartAttributeValueMatch = &stringsutil.MatchInfo{
			MatchType: stringsutil.MatchExact,
		}
	}
	// An empty start filter means collection is active immediately.
	foundStartToken := len(opts.StartFilter) == 0

	for {
		ttThis := z.Next()
		switch ttThis {
		case html.ErrorToken:
			err := z.Err()
			if err == io.EOF {
				return matches, nil
			} else if !opts.SkipErrors {
				return matches, err
			}
			// The tokenizer cannot advance past an error state: once
			// `ErrorToken` is returned, every subsequent `Next` returns it
			// again. The previous code fell through here when skipping a
			// non-EOF error, looping forever; treat it as end-of-input.
			return matches, nil
		default:
			tok := z.Token()
			if foundStartToken {
				if opts.EndFilter.MatchLeft(tok, nil) {
					if opts.InclusiveMatch {
						matches = append(matches, tok)
					}
					return matches, nil
				} else if opts.IncludeChain {
					matches = append(matches, tok)
				}
			} else {
				if opts.StartFilter.MatchLeft(tok, opts.StartAttributeValueMatch) {
					if opts.InclusiveMatch {
						matches = append(matches, tok)
					}
					foundStartToken = true
				}
			}
		}
	}
}

// NextTextToken returns the next text token, optionally only after a token
// matching `opts.StartFilter` has been seen. It uses `NextTokensOpts`
// specifically for `SkipErrors`, `StartFilter`, and
// `StartAttributeValueMatch`. It returns `ErrTokenNotFound` if the input
// is exhausted first, and `ErrTokenizerNotInitialized` if `z` is nil.
func NextTextToken(z *html.Tokenizer, opts NextTokensOpts) (html.Token, error) {
	if z == nil {
		return html.Token{}, ErrTokenizerNotInitialized
	}
	if opts.StartAttributeValueMatch == nil {
		// Default to exact attribute-value matching.
		opts.StartAttributeValueMatch = &stringsutil.MatchInfo{
			MatchType: stringsutil.MatchExact,
		}
	}
	foundStartToken := false
	for {
		tokType := z.Next()
		tok := z.Token()
		if tokType == html.ErrorToken {
			err := z.Err()
			if err == io.EOF {
				return tok, ErrTokenNotFound
			} else if !opts.SkipErrors {
				return tok, err
			}
			// The tokenizer stays in the error state after a non-EOF
			// error; skipping it previously looped forever. Treat the
			// skipped error as end-of-input.
			return tok, ErrTokenNotFound
		} else if tokType == html.TextToken &&
			(len(opts.StartFilter) == 0 || foundStartToken) {
			// Parentheses fix an operator-precedence bug: `&&` binds
			// tighter than `||`, so the original condition returned ANY
			// token type — not just text tokens — once the start filter
			// had matched.
			return tok, nil
		} else if opts.StartFilter.MatchLeft(tok, opts.StartAttributeValueMatch) {
			foundStartToken = true
		}
	}
}

/*
// NextTokenMatch returns a string of matches. `includeMatch` is only used when `includeChain` is included.
func NextTokenMatch(z *html.Tokenizer, skipErrors, includeChain, includeMatch bool, filters ...TokenFilter) ([]html.Token, error) {
    matches := []html.Token{}
    if len(filters) == 0 {
        return matches, errors.New("no filters provided")
    }
    filtersMore := TokenFilters(filters)
    for {
        tt := z.Next()
        token := z.Token()
        if token.Type == html.ErrorToken {
            break
        }
        filtersForType := filtersMore.ByTokenType(tt)
        if len(filtersForType) > 0 {
            for _, filter := range filtersForType {
                if filter.AtomSet.Len() == 0 {
                    if !includeChain || includeMatch {
                        matches = append(matches, token)
                    }
                    return matches, nil
                } else if filter.AtomSet.Exists(token.DataAtom) {
                    if !includeChain || includeMatch {
                        matches = append(matches, token)
                    }
                    return matches, nil
                }
            }
        }
        if includeChain {
            matches = append(matches, token)
        }
    }
    return matches, nil
}
*/

/*
func NextStartToken(z *html.Tokenizer, skipErrors bool, htmlAtoms ...atom.Atom) (html.Token, error) {
    if len(htmlAtoms) == 0 {
        return html.Token{}, errors.New("no atoms requested")
    }
    atoms := NewAtomSet(htmlAtoms...)
    for {
        ttThis := z.Next()
        switch ttThis {
        case html.ErrorToken:
            err := z.Err()
            if z.Err() == io.EOF {
                return html.Token{}, ErrTokenNotFound
            } else if !skipErrors {
                return html.Token{}, err
            }
        case html.StartTagToken:
            tok := z.Token()
            if atoms.Exists(tok.DataAtom) {
                return tok, nil
            }
        }
    }
}
*/

/*
func NextTextToken(z *html.Tokenizer, skipErrors bool, htmlAtoms ...atom.Atom) (html.Token, error) {
    atoms := NewAtomSet(htmlAtoms...)
    for {
        tokType := z.Next()
        tok := z.Token()
        if tokType == html.ErrorToken {
            err := z.Err()
            if err == io.EOF {
                return tok, ErrTokenNotFound
            } else if !skipErrors {
                return tok, err
            }
        } else if atoms.Len() == 0 && tokType == html.TextToken {
            return tok, nil
        } else if atoms.Len() > 0 &&
            tokType == html.StartTagToken &&
            atoms.Exists(tok.DataAtom) {
            return NextTextToken(z, skipErrors)
        }
    }
}
*/

/*
func NextTextToken(z *html.Tokenizer, skipErrors bool, start []html.Token) (html.Token, error) {
    _, err := NextToken(z, skipErrors, start...)
    if err != nil {
        return html.Token{}, err
    }
    return NextToken(z, skipErrors, html.Token{Type: html.TextToken})
}
*/