s0rg/crawley

View on GitHub
internal/crawler/util.go

Summary

Maintainability
A
40 mins
Test Coverage
A
93%
package crawler

import (
    "encoding/base64"
    "hash/fnv"
    "io"
    "log"
    "mime"
    "net/url"
    "path"
    "path/filepath"
    "strings"

    "github.com/s0rg/set"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"

    "github.com/s0rg/crawley/internal/links"
)

const (
    // Header name and authentication scheme that proxyAuthHeader joins into
    // a single "<key>: <scheme> <credentials>" line.
    proxyAuthKey   = "Proxy-Authorization"
    proxyAuthBasic = "Basic"

    // contentType is not referenced by the helpers in this file; presumably
    // it is the response header name looked up by callers elsewhere in the
    // package (TODO confirm).
    contentType = "Content-Type"
    // Media types compared against the parsed header value by isHTML, isCSS
    // and isJS respectively.
    contentHTML = "text/html"
    contentCSS  = "text/css"
    contentJS   = "application/javascript"
    // File extensions used by isJS and isCSS as a fallback when the media
    // type does not match; both are also registered in parsableExts.
    fileExtJS   = ".js"
    fileExtCSS  = ".css"
)

// parsableExts holds the file extensions (leading dot, lower case) that
// canParse accepts; canParse lower-cases the candidate extension before the
// lookup. The set is built with set.Load, which presumably inserts every
// listed value into the freshly made set (verify against the set package).
var parsableExts = set.Load(make(set.Unordered[string]),
    ".asp",
    ".aspx",
    ".cgi",
    ".htm",
    ".html",
    ".jsp",
    ".php",
    ".pl",
    ".xhtml",
    ".xml",
    fileExtJS,
    fileExtCSS,
)

// proxyAuthHeader renders a full proxy authorization header line of the form
// "<proxyAuthKey>: <proxyAuthBasic> <credentials>", where the credentials are
// v encoded with the standard base64 alphabet.
func proxyAuthHeader(v string) (rv string) {
    credentials := base64.StdEncoding.EncodeToString([]byte(v))

    // Header name carries the colon; the remaining parts are space-separated.
    parts := []string{proxyAuthKey + ":", proxyAuthBasic, credentials}

    rv = strings.Join(parts, " ")

    return rv
}

// prepareFilter turns a list of HTML tag names into a links.TokenFilter.
//
// An empty list yields links.AllowALL. Otherwise every name is resolved with
// atom.Lookup; names that resolve to the zero atom are logged and skipped.
// The returned filter reports whether a token's DataAtom is one of the
// resolved atoms.
func prepareFilter(tags []string) links.TokenFilter {
    if len(tags) == 0 {
        return links.AllowALL
    }

    allowed := make(set.Unordered[atom.Atom])

    for _, name := range tags {
        known := atom.Lookup([]byte(name))
        if known == 0 {
            // Unknown tag names are reported but do not abort the setup.
            log.Printf("[!] invalid tag: `%s` skipping...", name)

            continue
        }

        allowed.Add(known)
    }

    return func(tok html.Token) bool {
        return allowed.Has(tok.DataAtom)
    }
}

// canCrawl reports whether URL b may be visited when crawling from base URL a.
//
// Hosts (exactly as stored in url.URL.Host, so including any port) are
// compared case-insensitively. When they differ, b is accepted only if
// subdomains is true and b's host ends with "." followed by a's host, i.e.
// b's host is a's host with at least one extra label in front.
//
// The path of b must then lie at or below the path of a (an empty path is
// treated as "/"). When d is non-negative, b may be at most d path segments
// deeper than a; a negative d disables the depth limit.
func canCrawl(a, b *url.URL, d int, subdomains bool) (yes bool) {
    // Host names are case-insensitive, but url.Parse keeps their original
    // spelling, so normalize both sides before comparing.
    baseHost, foundHost := strings.ToLower(a.Host), strings.ToLower(b.Host)

    if baseHost != foundHost {
        if !subdomains {
            return false
        }

        // Equivalent to comparing the dot-separated labels from the right
        // and requiring the found host to have more labels than the base.
        if !strings.HasSuffix(foundHost, "."+baseHost) {
            return false
        }
    }

    basePath, foundPath := a.Path, b.Path

    if basePath == "" {
        basePath = "/"
    }

    if foundPath == "" {
        foundPath = "/"
    }

    depth, found := relativeDepth(basePath, foundPath)
    if !found {
        return false
    }

    return d < 0 || depth <= d
}

// relativeDepth reports how many path segments sub lies below base.
//
// Both arguments are normalized with path.Clean first. ok is false when sub
// is not located at or beneath base; in that case n is 0. When sub equals
// base the result is (0, true).
func relativeDepth(base, sub string) (n int, ok bool) {
    const pathSep = '/'

    bn, sn := path.Clean(base), path.Clean(sub)

    if !strings.HasPrefix(sn, bn) {
        return 0, false
    }

    rest := sn[len(bn):]

    // A plain string prefix is not enough: "/foo" must not match
    // "/foobar/baz". The remainder has to be empty or start on a segment
    // boundary, unless base itself already ends with the separator (after
    // path.Clean that is only the root "/"). bn is never empty because
    // path.Clean returns "." for an empty path.
    if rest != "" && rest[0] != pathSep && bn[len(bn)-1] != pathSep {
        return 0, false
    }

    // FieldsFunc never yields empty fields, so the field count is the depth.
    segments := strings.FieldsFunc(rest, func(r rune) bool {
        return r == pathSep
    })

    return len(segments), true
}

// webExt returns the file extension (including the leading dot) of a web
// path or URL, or "" when there is none. Anything from the first '?' or '#'
// onward (query string or fragment) is ignored, so "app.js?v=2" and
// "app.js#top" both yield ".js".
func webExt(v string) (rv string) {
    // Whichever of the two markers comes first ends the path part.
    if i := strings.IndexAny(v, "?#"); i >= 0 {
        v = v[:i]
    }

    return filepath.Ext(v)
}

// canParse reports whether the resource named by v is worth parsing for
// links. A path without a file name, or whose file name has no extension,
// is accepted; otherwise the lower-cased extension must be present in
// parsableExts.
func canParse(v string) (yes bool) {
    // Drop query and fragment before looking for the last path segment;
    // otherwise a '/' inside the query (e.g. "/page?img=/a/b.png") would be
    // mistaken for a path separator and the wrong file name examined.
    if i := strings.IndexAny(v, "?#"); i >= 0 {
        v = v[:i]
    }

    _, name := path.Split(v)
    if name == "" {
        return true
    }

    ext := webExt(name)
    if ext == "" {
        return true
    }

    return parsableExts.Has(strings.ToLower(ext))
}

// isSitemap reports whether s ends with one of the recognized sitemap file
// names ("sitemap.xml" or "sitemap-index.xml"). The match is a
// case-sensitive suffix comparison.
func isSitemap(s string) (yes bool) {
    suffixes := [...]string{
        "sitemap.xml",
        "sitemap-index.xml",
    }

    for _, suffix := range suffixes {
        if strings.HasSuffix(s, suffix) {
            return true
        }
    }

    return false
}

// isResorce reports whether v names a file-like resource: its last path
// segment is non-empty and carries a file extension.
func isResorce(v string) (yes bool) {
    // Drop query and fragment before looking for the last path segment;
    // otherwise a '/' inside the query (e.g. "/page?next=/img/logo.png")
    // would be mistaken for a path separator.
    if i := strings.IndexAny(v, "?#"); i >= 0 {
        v = v[:i]
    }

    _, name := path.Split(v)
    if name == "" {
        return false
    }

    return webExt(name) != ""
}

// isHTML reports whether the header value v parses as a media type equal to
// contentHTML. A value that mime.ParseMediaType rejects is never HTML.
func isHTML(v string) (yes bool) {
    if mediaType, _, err := mime.ParseMediaType(v); err == nil {
        yes = mediaType == contentHTML
    }

    return yes
}

// isJS reports whether a resource is JavaScript, judged first by its media
// type header value v and, failing that, by the extension of its name n.
//
// Both "application/javascript" (contentJS) and "text/javascript" are
// accepted as media types. The extension fallback is case-insensitive, in
// line with canParse, which lower-cases extensions before matching.
func isJS(v, n string) (yes bool) {
    // RFC 9239 registers text/javascript as the media type for scripts, and
    // many servers send it instead of application/javascript.
    const contentJSText = "text/javascript"

    typ, _, err := mime.ParseMediaType(v)
    if err == nil && (typ == contentJS || typ == contentJSText) {
        return true
    }

    return strings.EqualFold(webExt(n), fileExtJS)
}

// isCSS reports whether a resource is a stylesheet, judged first by its
// media type header value v (must equal contentCSS) and, failing that, by
// the extension of its name n.
//
// The extension fallback is case-insensitive, in line with canParse, which
// lower-cases extensions before matching.
func isCSS(v, n string) (yes bool) {
    typ, _, err := mime.ParseMediaType(v)
    if err == nil && typ == contentCSS {
        return true
    }

    return strings.EqualFold(webExt(n), fileExtCSS)
}

// urlhash returns the 64-bit FNV-1 hash of s after lower-casing it, so
// inputs that differ only in letter case hash to the same value.
func urlhash(s string) (rv uint64) {
    lowered := strings.ToLower(s)
    hasher := fnv.New64()

    // The write result is deliberately discarded, as in every revision of
    // this helper.
    _, _ = io.WriteString(hasher, lowered)

    rv = hasher.Sum64()

    return rv
}

// resolveRef resolves uri against base and returns the resulting URL as a
// string. ok is false, and rv empty, when either argument fails url.Parse.
func resolveRef(base, uri string) (rv string, ok bool) {
    var (
        baseURL, refURL *url.URL
        err             error
    )

    if baseURL, err = url.Parse(base); err != nil {
        return "", false
    }

    if refURL, err = url.Parse(uri); err != nil {
        return "", false
    }

    return baseURL.ResolveReference(refURL).String(), true
}