gps/verify/digest.go

// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package verify

import (
    "bytes"
    "crypto/sha256"
    "encoding/binary"
    "encoding/hex"
    "fmt"
    "hash"
    "io"
    "os"
    "path/filepath"
    "sort"
    "strconv"
    "strings"

    "github.com/pkg/errors"
)

// HashVersion is an arbitrary number that identifies the hash algorithm used by
// the directory hasher.
//
//   1: SHA256, as implemented in crypto/sha256
const HashVersion = 1

const osPathSeparator = string(filepath.Separator)

// lineEndingReader is an `io.Reader` that converts CRLF sequences to LF.
//
// When cloning or checking out repositories, some Version Control Systems
// (VCSs), on some supported Go operating systems (GOOS), will automatically
// convert line endings that end in a single line feed byte, LF, into line
// endings that end in the two byte sequence of carriage return, CR, followed
// by LF. This LF to CRLF conversion would cause otherwise identical versioned
// files to have different on-disk contents simply based on which VCS and GOOS
// are involved. Different contents for the same file would cause the resultant
// hashes to differ. To ensure file contents normalize to the same hash, this
// structure wraps an io.Reader and modifies the stream as it is read,
// translating all CRLF sequences to LF.
type lineEndingReader struct {
    src             io.Reader // source io.Reader from which this reads
    prevReadEndedCR bool      // used to track whether final byte of previous Read was CR
}

// newLineEndingReader returns a new lineEndingReader that reads from the
// specified source io.Reader.
func newLineEndingReader(src io.Reader) *lineEndingReader {
    return &lineEndingReader{src: src}
}
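
// A minimal usage sketch (the input literal is hypothetical, not from this
// package's tests): wrapping any io.Reader normalizes CRLF sequences on the
// fly, so identical content hashes identically regardless of how the VCS
// wrote the file:
//
//	r := newLineEndingReader(strings.NewReader("alpha\r\nbeta\r\n"))
//	normalized, _ := ioutil.ReadAll(r)
//	// normalized == []byte("alpha\nbeta\n")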

var crlf = []byte("\r\n")

// Read consumes bytes from the structure's source io.Reader to fill the
// specified slice of bytes. It converts all CRLF byte sequences to LF, and
// handles cases where CR and LF are split across two Read operations.
func (f *lineEndingReader) Read(buf []byte) (int, error) {
    buflen := len(buf)
    if f.prevReadEndedCR {
        // Read one fewer byte so we have room if the first byte of the
        // upcoming Read is not an LF, in which case we will need to insert the
        // trailing CR from the previous read.
        buflen--
    }
    nr, er := f.src.Read(buf[:buflen])
    if nr > 0 {
        if f.prevReadEndedCR && buf[0] != '\n' {
            // Having a CRLF split across two Read operations is rare, so the
            // performance impact of copying the entire buffer to the right by
            // one byte, while suboptimal, will at least not happen very often.
            // This negative performance impact is mitigated somewhat on many
            // Go compilation architectures, GOARCH, because the `copy` builtin
            // uses a machine opcode for performing the memory copy on possibly
            // overlapping regions of memory. This machine opcode is not
            // instantaneous and does require multiple CPU cycles to complete,
            // but it is significantly faster than the application looping
            // through bytes.
            copy(buf[1:nr+1], buf[:nr]) // shift data to right one byte
            buf[0] = '\r'               // insert the previous skipped CR byte at start of buf
            nr++                        // pretend we read one more byte
        }

        // Remove any CRLF sequences in the buffer using `bytes.Index` because,
        // like the `copy` builtin on many GOARCHs, it also takes advantage of a
        // machine opcode to search for byte patterns.
        var searchOffset int // index within the buffer where each loop iteration's search begins; set to the end of the previous match
        var shiftCount int   // each subsequent shift operation needs to shift bytes to the left by one more position than the shift that preceded it
        previousIndex := -1  // index of previously found CRLF; -1 means no previous index
        for {
            index := bytes.Index(buf[searchOffset:nr], crlf)
            if index == -1 {
                break
            }
            index += searchOffset // convert relative index to absolute
            if previousIndex != -1 {
                // shift substring between previous index and this index
                copy(buf[previousIndex-shiftCount:], buf[previousIndex+1:index])
                shiftCount++ // next shift needs to be 1 byte to the left
            }
            previousIndex = index
            searchOffset = index + 2 // start next search after len(crlf)
        }
        if previousIndex != -1 {
            // handle final shift
            copy(buf[previousIndex-shiftCount:], buf[previousIndex+1:nr])
            shiftCount++
        }
        nr -= shiftCount // shorten byte read count by number of shifts executed

        // When the final byte from a read operation is CR, do not emit it
        // until we ensure the first byte of the next read is not an LF.
        if f.prevReadEndedCR = buf[nr-1] == '\r'; f.prevReadEndedCR {
            nr-- // pretend byte was never read from source
        }
    } else if f.prevReadEndedCR {
        // Reading from the source returned nothing, but this struct is
        // sitting on a trailing CR from the previous Read, so give it to the
        // client now.
        buf[0] = '\r'
        nr = 1
        er = nil
        f.prevReadEndedCR = false // prevent infinite loop
    }
    return nr, er
}
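
// An illustrative sketch of the straddle case (hypothetical input, using
// testing/iotest to force one-byte reads from the source): a CR arriving at
// the end of one read is withheld until the next read reveals whether an LF
// follows, so the pair still collapses to a single LF:
//
//	src := iotest.OneByteReader(strings.NewReader("a\r\nb"))
//	all, _ := ioutil.ReadAll(newLineEndingReader(src))
//	// all == []byte("a\nb")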

// writeBytesWithNull writes the specified data to the specified hash, followed
// by a NUL byte, in order to make accidental hash collisions less likely.
func writeBytesWithNull(h hash.Hash, data []byte) {
    // Ignore the return values from writing to the hash, because hash.Hash's
    // Write method always returns a nil error.
    _, _ = h.Write(append(data, 0))
}
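
// An illustrative sketch (not from the original source): without the NUL
// delimiter, hashing the pairs ("ab", "c") and ("a", "bc") would feed the
// identical byte stream "abc" to the hash; with it, the streams become
// "ab\x00c\x00" and "a\x00bc\x00", which produce different digests:
//
//	h1, h2 := sha256.New(), sha256.New()
//	writeBytesWithNull(h1, []byte("ab"))
//	writeBytesWithNull(h1, []byte("c"))
//	writeBytesWithNull(h2, []byte("a"))
//	writeBytesWithNull(h2, []byte("bc"))
//	// !bytes.Equal(h1.Sum(nil), h2.Sum(nil))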

// dirWalkClosure is used to reduce the number of allocations involved in
// closing over these variables.
type dirWalkClosure struct {
    someCopyBuffer []byte // allocate once and reuse for each file copy
    someModeBytes  []byte // allocate once and reuse for each node
    someDirLen     int
    someHash       hash.Hash
}

// DigestFromDirectory returns a hash of the specified directory contents, which
// will match the hash computed for any directory on any supported Go platform
// whose contents exactly match the specified directory.
//
// This function ignores any file system node named `vendor`, `.bzr`, `.git`,
// `.hg`, and `.svn`, as these are typically used as Version Control System
// (VCS) directories.
//
// Other than the `vendor` and VCS directories mentioned above, the calculated
// hash includes the pathname to every discovered file system node, whether it
// is an empty directory, a non-empty directory, an empty file, or a non-empty file.
//
// Symbolic links are excluded, as they are not considered valid elements in the
// definition of a Go module.
func DigestFromDirectory(osDirname string) (VersionedDigest, error) {
    osDirname = filepath.Clean(osDirname)

    // Create a single hash instance for the entire operation, rather than a new
    // hash for each node we encounter.

    closure := dirWalkClosure{
        someCopyBuffer: make([]byte, 4*1024), // only allocate a single page
        someModeBytes:  make([]byte, 4),      // scratch place to store encoded os.FileMode (uint32)
        someDirLen:     len(osDirname) + len(osPathSeparator),
        someHash:       sha256.New(),
    }

    err := filepath.Walk(osDirname, func(osPathname string, info os.FileInfo, err error) error {
        if err != nil {
            return err
        }

        // Completely ignore symlinks.
        if info.Mode()&os.ModeSymlink != 0 {
            return nil
        }

        var osRelative string
        if len(osPathname) > closure.someDirLen {
            osRelative = osPathname[closure.someDirLen:]
        }

        switch filepath.Base(osRelative) {
        case "vendor", ".bzr", ".git", ".hg", ".svn":
            return filepath.SkipDir
        }

        // We could make our own enum-like data type for encoding the file type,
        // but Go's runtime already gives us architecture independent file
        // modes, as discussed in `os/types.go`:
        //
        //    Go's runtime FileMode type has same definition on all systems, so
        //    that information about files can be moved from one system to
        //    another portably.
        var mt os.FileMode

        // We only care about the bits that identify the type of a file system
        // node, and can ignore the append, exclusive, temporary, setuid,
        // setgid, permission, and sticky bits, which occupy bits separate from
        // those that declare the type of the file system node.
        modeType := info.Mode() & os.ModeType
        var shouldSkip bool // skip some types of file system nodes

        switch {
        case modeType&os.ModeDir > 0:
            mt = os.ModeDir
            // This func does not need to enumerate children, because
            // filepath.Walk will do that for us.
            shouldSkip = true
        case modeType&os.ModeNamedPipe > 0:
            mt = os.ModeNamedPipe
            shouldSkip = true
        case modeType&os.ModeSocket > 0:
            mt = os.ModeSocket
            shouldSkip = true
        case modeType&os.ModeDevice > 0:
            mt = os.ModeDevice
            shouldSkip = true
        }

        // Write the relative pathname to the hash because the hash is a
        // function of the node names, node types, and node contents. An added
        // benefit is that empty directories, named pipes, sockets, and devices
        // also affect the hash. Use `filepath.ToSlash` to ensure the relative
        // pathname is os-agnostic.
        writeBytesWithNull(closure.someHash, []byte(filepath.ToSlash(osRelative)))

        binary.LittleEndian.PutUint32(closure.someModeBytes, uint32(mt)) // encode the type of mode
        writeBytesWithNull(closure.someHash, closure.someModeBytes)      // and write to hash

        if shouldSkip {
            return nil // nothing more to do for some of the node types
        }

        // If we get here, the node is a regular file.
        fh, err := os.Open(osPathname)
        if err != nil {
            return errors.Wrap(err, "cannot Open")
        }

        var bytesWritten int64
        bytesWritten, err = io.CopyBuffer(closure.someHash, newLineEndingReader(fh), closure.someCopyBuffer) // fast copy of file contents to hash
        err = errors.Wrap(err, "cannot Copy")                                                                // errors.Wrap only wraps non-nil, so skip extra check
        writeBytesWithNull(closure.someHash, []byte(strconv.FormatInt(bytesWritten, 10)))                    // 10: format file size as base 10 integer

        // Close the file handle to the open file without masking
        // possible previous error value.
        if er := fh.Close(); err == nil {
            err = errors.Wrap(er, "cannot Close")
        }
        return err
    })

    if err != nil {
        return VersionedDigest{}, err
    }

    return VersionedDigest{
        HashVersion: HashVersion,
        Digest:      closure.someHash.Sum(nil),
    }, nil
}
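
// A minimal usage sketch (the directory path is hypothetical):
//
//	vd, err := DigestFromDirectory("vendor/github.com/alice/alice1")
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println(vd) // prints HashVersion, a colon, then the hex-encoded SHA256 sum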

// VendorStatus represents one of a handful of possible status conditions for a
// particular file system node in the vendor directory tree.
type VendorStatus uint8

const (
    // NotInLock is used when a file system node exists for which there is no
    // corresponding dependency in the lock file.
    NotInLock VendorStatus = iota

    // NotInTree is used when a lock file dependency exists for which there is
    // no corresponding file system node.
    NotInTree

    // NoMismatch is used when the digest for a dependency listed in the
    // lockfile matches what is calculated from the file system.
    NoMismatch

    // EmptyDigestInLock is used when the digest for a dependency listed in the
    // lock file is the empty string. While this is a special case of
    // DigestMismatchInLock, separating the cases is a desired feature.
    EmptyDigestInLock

    // DigestMismatchInLock is used when the digest for a dependency listed in
    // the lock file does not match what is calculated from the file system.
    DigestMismatchInLock

    // HashVersionMismatch indicates that the hashing algorithm used to generate
    // the digest being compared against is not the same as the one used by the
    // current program.
    HashVersionMismatch
)

func (ls VendorStatus) String() string {
    switch ls {
    case NotInLock:
        return "not in lock"
    case NotInTree:
        return "not in tree"
    case NoMismatch:
        return "match"
    case EmptyDigestInLock:
        return "empty digest in lock"
    case DigestMismatchInLock:
        return "mismatch"
    case HashVersionMismatch:
        return "hasher changed"
    }
    return "unknown"
}

// fsnode is used to track which file system nodes are required by the lock
// file. When a directory is found whose name matches one of the declared
// projects in the lock file, e.g., "github.com/alice/alice1", an fsnode is
// created for that directory, but not for any of its children. Every other
// file system node encountered will result in an fsnode being created to
// represent it.
type fsnode struct {
    osRelative           string // os-specific relative path of a resource under vendor root
    isRequiredAncestor   bool   // true iff this node or one of its descendants is in the lock file
    myIndex, parentIndex int    // index of this node and its parent in the tree's slice
}

// VersionedDigest comprises both a hash digest, and a simple integer indicating
// the version of the hash algorithm that produced the digest.
type VersionedDigest struct {
    HashVersion int
    Digest      []byte
}

func (vd VersionedDigest) String() string {
    return fmt.Sprintf("%d:%s", vd.HashVersion, hex.EncodeToString(vd.Digest))
}

// IsEmpty indicates if the VersionedDigest is the zero value.
func (vd VersionedDigest) IsEmpty() bool {
    return vd.HashVersion == 0 && len(vd.Digest) == 0
}

// ParseVersionedDigest decodes the string representation of versioned digest
// information - a colon-separated string with a version number in the first
// part and the hex-encoded hash digest in the second - as a VersionedDigest.
func ParseVersionedDigest(input string) (VersionedDigest, error) {
    var vd VersionedDigest
    var err error

    parts := strings.Split(input, ":")
    if len(parts) != 2 {
        return VersionedDigest{}, errors.Errorf("expected two colon-separated components in the versioned hash digest, got %q", input)
    }
    if vd.Digest, err = hex.DecodeString(parts[1]); err != nil {
        return VersionedDigest{}, err
    }
    if vd.HashVersion, err = strconv.Atoi(parts[0]); err != nil {
        return VersionedDigest{}, err
    }

    return vd, nil
}
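
// Round-tripping a digest through its string form (an illustrative sketch; the
// hex value below is a made-up placeholder, not a real digest):
//
//	vd, err := ParseVersionedDigest("1:c0ffee")
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println(vd.HashVersion) // 1
//	fmt.Println(vd)             // "1:c0ffee"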

// CheckDepTree verifies a dependency tree according to expected digest sums,
// and returns an associative array of file system nodes and their respective
// vendor status conditions.
//
// The keys to the expected digest sums associative array represent the
// project's dependencies, and each is required to be expressed using the
// solidus character, `/`, as its path separator. For example, even on a GOOS
// platform where the file system path separator is a character other than
// solidus, one particular dependency would be represented as
// "github.com/alice/alice1".
func CheckDepTree(osDirname string, wantDigests map[string]VersionedDigest) (map[string]VendorStatus, error) {
    osDirname = filepath.Clean(osDirname)

    // Create associative array to store the results of calling this function.
    slashStatus := make(map[string]VendorStatus)

    // Ensure the top level pathname is a directory.
    fi, err := os.Stat(osDirname)
    if err != nil {
        // If the dir doesn't exist at all, that's OK - just consider all the
        // wanted paths absent.
        if os.IsNotExist(err) {
            for path := range wantDigests {
                slashStatus[path] = NotInTree
            }
            return slashStatus, nil
        }
        return nil, errors.Wrap(err, "cannot Stat")
    }

    if !fi.IsDir() {
        return nil, errors.Errorf("cannot verify non-directory: %q", osDirname)
    }

    // Initialize the work queue with a node representing the specified
    // directory, declaring its relative pathname under that directory to be
    // the empty string.
    currentNode := &fsnode{osRelative: "", parentIndex: -1, isRequiredAncestor: true}
    queue := []*fsnode{currentNode} // queue of directories that must be inspected

    // In order to identify all file system nodes that are not in the lock
    // file, represented by the specified expected sums parameter, and in order
    // to only report the top level of a subtree of file system nodes, rather
    // than every node internal to it, we will create a tree of nodes stored in
    // a slice. We do this because we cannot predict the depth at which project
    // roots occur. Some project roots sit shallower and some deeper than the
    // typical three directory levels under the vendor root directory.
    //
    // For the following few examples, assume the vendor root directory below:
    //
    // github.com/alice/alice1/a1.go
    // github.com/alice/alice2/a2.go
    // github.com/bob/bob1/b1.go
    // github.com/bob/bob2/b2.go
    // launchpad.net/nifty/n1.go
    //
    // 1) If only the `alice1` and `alice2` projects were in the lock file,
    // we'd prefer the output to state that both `github.com/bob` and
    // `launchpad.net/nifty` are `NotInLock`.
    //
    // 2) If `alice1`, `alice2`, and `bob1` were in the lock file, we'd want to
    // report `github.com/bob/bob2` and `launchpad.net/nifty` as `NotInLock`.
    //
    // 3) If none of `alice1`, `alice2`, `bob1`, or `bob2` were in the lock
    // file, the entire `github.com` directory would be reported as
    // `NotInLock`, along with `launchpad.net/nifty`.
    //
    // Each node in our tree has the slice index of its parent node, so once we
    // can categorically state a particular directory is required because it is
    // in the lock file, we can mark all of its ancestors as also being
    // required. Then, when we finish walking the directory hierarchy, any nodes
    // which are not required but have a required parent will be marked as
    // `NotInLock`.
    nodes := []*fsnode{currentNode}

    // Mark directories of expected projects as required. When each respective
    // project is later found while traversing the vendor root hierarchy, its
    // status will be updated to reflect whether its digest is empty, or,
    // whether or not it matches the expected digest.
    for slashPathname := range wantDigests {
        slashStatus[slashPathname] = NotInTree
    }

    for len(queue) > 0 {
        // Pop a node from the top of the queue (depth first traversal, reverse
        // lexicographical order inside a directory), clearing the value stored
        // in the slice's backing array as we proceed.
        lq1 := len(queue) - 1
        currentNode, queue[lq1], queue = queue[lq1], nil, queue[:lq1]
        slashPathname := filepath.ToSlash(currentNode.osRelative)
        osPathname := filepath.Join(osDirname, currentNode.osRelative)

        if expectedSum, ok := wantDigests[slashPathname]; ok {
            ls := EmptyDigestInLock
            if expectedSum.HashVersion != HashVersion {
                if !expectedSum.IsEmpty() {
                    ls = HashVersionMismatch
                }
            } else if len(expectedSum.Digest) > 0 {
                projectSum, err := DigestFromDirectory(osPathname)
                if err != nil {
                    return nil, errors.Wrap(err, "cannot compute dependency hash")
                }
                if bytes.Equal(projectSum.Digest, expectedSum.Digest) {
                    ls = NoMismatch
                } else {
                    ls = DigestMismatchInLock
                }
            }
            slashStatus[slashPathname] = ls

            // Mark the current node and all of its parents as required.
            for i := currentNode.myIndex; i != -1; i = nodes[i].parentIndex {
                nodes[i].isRequiredAncestor = true
            }

            // We do not need to process this directory's contents because we
            // already accounted for them while calculating its digest.
            continue
        }

        osChildrenNames, err := sortedChildrenFromDirname(osPathname)
        if err != nil {
            return nil, errors.Wrap(err, "cannot get sorted list of directory children")
        }
        for _, osChildName := range osChildrenNames {
            switch osChildName {
            case ".", "..", "vendor", ".bzr", ".git", ".hg", ".svn":
                // skip
            default:
                osChildRelative := filepath.Join(currentNode.osRelative, osChildName)
                osChildPathname := filepath.Join(osDirname, osChildRelative)

                // Create a new fsnode for this file system node, with a parent
                // index set to the index of the current node.
                otherNode := &fsnode{osRelative: osChildRelative, myIndex: len(nodes), parentIndex: currentNode.myIndex}

                fi, err := os.Stat(osChildPathname)
                if err != nil {
                    return nil, errors.Wrap(err, "cannot Stat")
                }
                nodes = append(nodes, otherNode) // Track all file system nodes...
                if fi.IsDir() {
                    queue = append(queue, otherNode) // but only need to add directories to the work queue.
                }
            }
        }
    }

    // Ignoring the first node in the list, walk the nodes from last to first.
    // Whenever the current node is not required but its parent is required,
    // the current node ought to be marked as `NotInLock`.
    for len(nodes) > 1 {
        // Pop the node from the top of the nodes list, clearing the value
        // stored in the slice's backing array as we proceed.
        ln1 := len(nodes) - 1
        currentNode, nodes[ln1], nodes = nodes[ln1], nil, nodes[:ln1]

        if !currentNode.isRequiredAncestor && nodes[currentNode.parentIndex].isRequiredAncestor {
            slashStatus[filepath.ToSlash(currentNode.osRelative)] = NotInLock
        }
    }
    currentNode, nodes = nil, nil

    return slashStatus, nil
}
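
// A minimal usage sketch (paths and the lockDigest variable are hypothetical):
// callers pass the vendor root plus the digests recorded in the lock file,
// keyed by slash-separated import path, then inspect each node's status:
//
//	want := map[string]VersionedDigest{
//		"github.com/alice/alice1": lockDigest, // as read from the lock file
//	}
//	statuses, err := CheckDepTree("vendor", want)
//	if err != nil {
//		log.Fatal(err)
//	}
//	for path, status := range statuses {
//		fmt.Println(path, status) // e.g. "github.com/alice/alice1 match"
//	}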

// sortedChildrenFromDirname returns a lexicographically sorted list of child
// nodes for the specified directory.
func sortedChildrenFromDirname(osDirname string) ([]string, error) {
    fh, err := os.Open(osDirname)
    if err != nil {
        return nil, errors.Wrap(err, "cannot Open")
    }

    osChildrenNames, err := fh.Readdirnames(0) // 0: read names of all children
    if err != nil {
        return nil, errors.Wrap(err, "cannot Readdirnames")
    }
    sort.Strings(osChildrenNames)

    // Close the file handle to the open directory without masking possible
    // previous error value.
    if er := fh.Close(); err == nil {
        err = errors.Wrap(er, "cannot Close")
    }
    return osChildrenNames, err
}