github.com/golang/dep@v0.5.4/gps/verify/digest.go (about)

     1  // Copyright 2017 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package verify
     6  
     7  import (
     8  	"bytes"
     9  	"crypto/sha256"
    10  	"encoding/binary"
    11  	"encoding/hex"
    12  	"fmt"
    13  	"hash"
    14  	"io"
    15  	"os"
    16  	"path/filepath"
    17  	"sort"
    18  	"strconv"
    19  	"strings"
    20  
    21  	"github.com/pkg/errors"
    22  )
    23  
    24  // HashVersion is an arbitrary number that identifies the hash algorithm used by
    25  // the directory hasher.
    26  //
    27  //   1: SHA256, as implemented in crypto/sha256
    28  const HashVersion = 1
    29  
    30  const osPathSeparator = string(filepath.Separator)
    31  
    32  // lineEndingReader is a `io.Reader` that converts CRLF sequences to LF.
    33  //
    34  // When cloning or checking out repositories, some Version Control Systems,
    35  // VCSs, on some supported Go Operating System architectures, GOOS, will
    36  // automatically convert line endings that end in a single line feed byte, LF,
    37  // to line endings that end in a two byte sequence of carriage return, CR,
    38  // followed by LF. This LF to CRLF conversion would cause otherwise identical
    39  // versioned files to have different on disk contents simply based on which VCS
    40  // and GOOS are involved. Different file contents for the same file would cause
    41  // the resultant hashes to differ. In order to ensure file contents normalize
    42  // and produce the same hash, this structure wraps an io.Reader that modifies
    43  // the file's contents when it is read, translating all CRLF sequences to LF.
    44  type lineEndingReader struct {
    45  	src             io.Reader // source io.Reader from which this reads
    46  	prevReadEndedCR bool      // used to track whether final byte of previous Read was CR
    47  }
    48  
    49  // newLineEndingReader returns a new lineEndingReader that reads from the
    50  // specified source io.Reader.
    51  func newLineEndingReader(src io.Reader) *lineEndingReader {
    52  	return &lineEndingReader{src: src}
    53  }
    54  
    55  var crlf = []byte("\r\n")
    56  
    57  // Read consumes bytes from the structure's source io.Reader to fill the
    58  // specified slice of bytes. It converts all CRLF byte sequences to LF, and
    59  // handles cases where CR and LF straddle across two Read operations.
    60  func (f *lineEndingReader) Read(buf []byte) (int, error) {
    61  	buflen := len(buf)
    62  	if f.prevReadEndedCR {
    63  		// Read one fewer bytes so we have room if the first byte of the
    64  		// upcoming Read is not a LF, in which case we will need to insert
    65  		// trailing CR from previous read.
    66  		buflen--
    67  	}
    68  	nr, er := f.src.Read(buf[:buflen])
    69  	if nr > 0 {
    70  		if f.prevReadEndedCR && buf[0] != '\n' {
    71  			// Having a CRLF split across two Read operations is rare, so the
    72  			// performance impact of copying entire buffer to the right by one
    73  			// byte, while suboptimal, will at least will not happen very
    74  			// often. This negative performance impact is mitigated somewhat on
    75  			// many Go compilation architectures, GOARCH, because the `copy`
    76  			// builtin uses a machine opcode for performing the memory copy on
    77  			// possibly overlapping regions of memory. This machine opcodes is
    78  			// not instantaneous and does require multiple CPU cycles to
    79  			// complete, but is significantly faster than the application
    80  			// looping through bytes.
    81  			copy(buf[1:nr+1], buf[:nr]) // shift data to right one byte
    82  			buf[0] = '\r'               // insert the previous skipped CR byte at start of buf
    83  			nr++                        // pretend we read one more byte
    84  		}
    85  
    86  		// Remove any CRLF sequences in the buffer using `bytes.Index` because,
    87  		// like the `copy` builtin on many GOARCHs, it also takes advantage of a
    88  		// machine opcode to search for byte patterns.
    89  		var searchOffset int // index within buffer from whence the search will commence for each loop; set to the index of the end of the previous loop.
    90  		var shiftCount int   // each subsequenct shift operation needs to shift bytes to the left by one more position than the shift that preceded it.
    91  		previousIndex := -1  // index of previously found CRLF; -1 means no previous index
    92  		for {
    93  			index := bytes.Index(buf[searchOffset:nr], crlf)
    94  			if index == -1 {
    95  				break
    96  			}
    97  			index += searchOffset // convert relative index to absolute
    98  			if previousIndex != -1 {
    99  				// shift substring between previous index and this index
   100  				copy(buf[previousIndex-shiftCount:], buf[previousIndex+1:index])
   101  				shiftCount++ // next shift needs to be 1 byte to the left
   102  			}
   103  			previousIndex = index
   104  			searchOffset = index + 2 // start next search after len(crlf)
   105  		}
   106  		if previousIndex != -1 {
   107  			// handle final shift
   108  			copy(buf[previousIndex-shiftCount:], buf[previousIndex+1:nr])
   109  			shiftCount++
   110  		}
   111  		nr -= shiftCount // shorten byte read count by number of shifts executed
   112  
   113  		// When final byte from a read operation is CR, do not emit it until
   114  		// ensure first byte on next read is not LF.
   115  		if f.prevReadEndedCR = buf[nr-1] == '\r'; f.prevReadEndedCR {
   116  			nr-- // pretend byte was never read from source
   117  		}
   118  	} else if f.prevReadEndedCR {
   119  		// Reading from source returned nothing, but this struct is sitting on a
   120  		// trailing CR from previous Read, so let's give it to client now.
   121  		buf[0] = '\r'
   122  		nr = 1
   123  		er = nil
   124  		f.prevReadEndedCR = false // prevent infinite loop
   125  	}
   126  	return nr, er
   127  }
   128  
   129  // writeBytesWithNull appends the specified data to the specified hash, followed by
   130  // the NULL byte, in order to make accidental hash collisions less likely.
   131  func writeBytesWithNull(h hash.Hash, data []byte) {
   132  	// Ignore return values from writing to the hash, because hash write always
   133  	// returns nil error.
   134  	_, _ = h.Write(append(data, 0))
   135  }
   136  
   137  // dirWalkClosure is used to reduce number of allocation involved in closing
   138  // over these variables.
   139  type dirWalkClosure struct {
   140  	someCopyBufer []byte // allocate once and reuse for each file copy
   141  	someModeBytes []byte // allocate once and reuse for each node
   142  	someDirLen    int
   143  	someHash      hash.Hash
   144  }
   145  
   146  // DigestFromDirectory returns a hash of the specified directory contents, which
   147  // will match the hash computed for any directory on any supported Go platform
   148  // whose contents exactly match the specified directory.
   149  //
   150  // This function ignores any file system node named `vendor`, `.bzr`, `.git`,
   151  // `.hg`, and `.svn`, as these are typically used as Version Control System
   152  // (VCS) directories.
   153  //
   154  // Other than the `vendor` and VCS directories mentioned above, the calculated
   155  // hash includes the pathname to every discovered file system node, whether it
   156  // is an empty directory, a non-empty directory, an empty file, or a non-empty file.
   157  //
   158  // Symbolic links are excluded, as they are not considered valid elements in the
   159  // definition of a Go module.
   160  func DigestFromDirectory(osDirname string) (VersionedDigest, error) {
   161  	osDirname = filepath.Clean(osDirname)
   162  
   163  	// Create a single hash instance for the entire operation, rather than a new
   164  	// hash for each node we encounter.
   165  
   166  	closure := dirWalkClosure{
   167  		someCopyBufer: make([]byte, 4*1024), // only allocate a single page
   168  		someModeBytes: make([]byte, 4),      // scratch place to store encoded os.FileMode (uint32)
   169  		someDirLen:    len(osDirname) + len(osPathSeparator),
   170  		someHash:      sha256.New(),
   171  	}
   172  
   173  	err := filepath.Walk(osDirname, func(osPathname string, info os.FileInfo, err error) error {
   174  		if err != nil {
   175  			return err
   176  		}
   177  
   178  		// Completely ignore symlinks.
   179  		if info.Mode()&os.ModeSymlink != 0 {
   180  			return nil
   181  		}
   182  
   183  		var osRelative string
   184  		if len(osPathname) > closure.someDirLen {
   185  			osRelative = osPathname[closure.someDirLen:]
   186  		}
   187  
   188  		switch filepath.Base(osRelative) {
   189  		case "vendor", ".bzr", ".git", ".hg", ".svn":
   190  			return filepath.SkipDir
   191  		}
   192  
   193  		// We could make our own enum-like data type for encoding the file type,
   194  		// but Go's runtime already gives us architecture independent file
   195  		// modes, as discussed in `os/types.go`:
   196  		//
   197  		//    Go's runtime FileMode type has same definition on all systems, so
   198  		//    that information about files can be moved from one system to
   199  		//    another portably.
   200  		var mt os.FileMode
   201  
   202  		// We only care about the bits that identify the type of a file system
   203  		// node, and can ignore append, exclusive, temporary, setuid, setgid,
   204  		// permission bits, and sticky bits, which are coincident to bits which
   205  		// declare type of the file system node.
   206  		modeType := info.Mode() & os.ModeType
   207  		var shouldSkip bool // skip some types of file system nodes
   208  
   209  		switch {
   210  		case modeType&os.ModeDir > 0:
   211  			mt = os.ModeDir
   212  			// This func does not need to enumerate children, because
   213  			// filepath.Walk will do that for us.
   214  			shouldSkip = true
   215  		case modeType&os.ModeNamedPipe > 0:
   216  			mt = os.ModeNamedPipe
   217  			shouldSkip = true
   218  		case modeType&os.ModeSocket > 0:
   219  			mt = os.ModeSocket
   220  			shouldSkip = true
   221  		case modeType&os.ModeDevice > 0:
   222  			mt = os.ModeDevice
   223  			shouldSkip = true
   224  		}
   225  
   226  		// Write the relative pathname to hash because the hash is a function of
   227  		// the node names, node types, and node contents. Added benefit is that
   228  		// empty directories, named pipes, sockets, and devices. Use
   229  		// `filepath.ToSlash` to ensure relative pathname is os-agnostic.
   230  		writeBytesWithNull(closure.someHash, []byte(filepath.ToSlash(osRelative)))
   231  
   232  		binary.LittleEndian.PutUint32(closure.someModeBytes, uint32(mt)) // encode the type of mode
   233  		writeBytesWithNull(closure.someHash, closure.someModeBytes)      // and write to hash
   234  
   235  		if shouldSkip {
   236  			return nil // nothing more to do for some of the node types
   237  		}
   238  
   239  		// If we get here, node is a regular file.
   240  		fh, err := os.Open(osPathname)
   241  		if err != nil {
   242  			return errors.Wrap(err, "cannot Open")
   243  		}
   244  
   245  		var bytesWritten int64
   246  		bytesWritten, err = io.CopyBuffer(closure.someHash, newLineEndingReader(fh), closure.someCopyBufer) // fast copy of file contents to hash
   247  		err = errors.Wrap(err, "cannot Copy")                                                               // errors.Wrap only wraps non-nil, so skip extra check
   248  		writeBytesWithNull(closure.someHash, []byte(strconv.FormatInt(bytesWritten, 10)))                   // 10: format file size as base 10 integer
   249  
   250  		// Close the file handle to the open file without masking
   251  		// possible previous error value.
   252  		if er := fh.Close(); err == nil {
   253  			err = errors.Wrap(er, "cannot Close")
   254  		}
   255  		return err
   256  	})
   257  
   258  	if err != nil {
   259  		return VersionedDigest{}, err
   260  	}
   261  
   262  	return VersionedDigest{
   263  		HashVersion: HashVersion,
   264  		Digest:      closure.someHash.Sum(nil),
   265  	}, nil
   266  }
   267  
   268  // VendorStatus represents one of a handful of possible status conditions for a
   269  // particular file system node in the vendor directory tree.
   270  type VendorStatus uint8
   271  
   272  const (
   273  	// NotInLock is used when a file system node exists for which there is no
   274  	// corresponding dependency in the lock file.
   275  	NotInLock VendorStatus = iota
   276  
   277  	// NotInTree is used when a lock file dependency exists for which there is
   278  	// no corresponding file system node.
   279  	NotInTree
   280  
   281  	// NoMismatch is used when the digest for a dependency listed in the
   282  	// lockfile matches what is calculated from the file system.
   283  	NoMismatch
   284  
   285  	// EmptyDigestInLock is used when the digest for a dependency listed in the
   286  	// lock file is the empty string. While this is a special case of
   287  	// DigestMismatchInLock, separating the cases is a desired feature.
   288  	EmptyDigestInLock
   289  
   290  	// DigestMismatchInLock is used when the digest for a dependency listed in
   291  	// the lock file does not match what is calculated from the file system.
   292  	DigestMismatchInLock
   293  
   294  	// HashVersionMismatch indicates that the hashing algorithm used to generate
   295  	// the digest being compared against is not the same as the one used by the
   296  	// current program.
   297  	HashVersionMismatch
   298  )
   299  
   300  func (ls VendorStatus) String() string {
   301  	switch ls {
   302  	case NotInLock:
   303  		return "not in lock"
   304  	case NotInTree:
   305  		return "not in tree"
   306  	case NoMismatch:
   307  		return "match"
   308  	case EmptyDigestInLock:
   309  		return "empty digest in lock"
   310  	case DigestMismatchInLock:
   311  		return "mismatch"
   312  	case HashVersionMismatch:
   313  		return "hasher changed"
   314  	}
   315  	return "unknown"
   316  }
   317  
   318  // fsnode is used to track which file system nodes are required by the lock
   319  // file. When a directory is found whose name matches one of the declared
   320  // projects in the lock file, e.g., "github.com/alice/alice1", an fsnode is
   321  // created for that directory, but not for any of its children. All other file
   322  // system nodes encountered will result in a fsnode created to represent it.
   323  type fsnode struct {
   324  	osRelative           string // os-specific relative path of a resource under vendor root
   325  	isRequiredAncestor   bool   // true iff this node or one of its descendants is in the lock file
   326  	myIndex, parentIndex int    // index of this node and its parent in the tree's slice
   327  }
   328  
   329  // VersionedDigest comprises both a hash digest, and a simple integer indicating
   330  // the version of the hash algorithm that produced the digest.
   331  type VersionedDigest struct {
   332  	HashVersion int
   333  	Digest      []byte
   334  }
   335  
   336  func (vd VersionedDigest) String() string {
   337  	return fmt.Sprintf("%s:%s", strconv.Itoa(vd.HashVersion), hex.EncodeToString(vd.Digest))
   338  }
   339  
   340  // IsEmpty indicates if the VersionedDigest is the zero value.
   341  func (vd VersionedDigest) IsEmpty() bool {
   342  	return vd.HashVersion == 0 && len(vd.Digest) == 0
   343  }
   344  
   345  // ParseVersionedDigest decodes the string representation of versioned digest
   346  // information - a colon-separated string with a version number in the first
   347  // part and the hex-encdoed hash digest in the second - as a VersionedDigest.
   348  func ParseVersionedDigest(input string) (VersionedDigest, error) {
   349  	var vd VersionedDigest
   350  	var err error
   351  
   352  	parts := strings.Split(input, ":")
   353  	if len(parts) != 2 {
   354  		return VersionedDigest{}, errors.Errorf("expected two colon-separated components in the versioned hash digest, got %q", input)
   355  	}
   356  	if vd.Digest, err = hex.DecodeString(parts[1]); err != nil {
   357  		return VersionedDigest{}, err
   358  	}
   359  	if vd.HashVersion, err = strconv.Atoi(parts[0]); err != nil {
   360  		return VersionedDigest{}, err
   361  	}
   362  
   363  	return vd, nil
   364  }
   365  
   366  // CheckDepTree verifies a dependency tree according to expected digest sums,
   367  // and returns an associative array of file system nodes and their respective
   368  // vendor status conditions.
   369  //
   370  // The keys to the expected digest sums associative array represent the
   371  // project's dependencies, and each is required to be expressed using the
   372  // solidus character, `/`, as its path separator. For example, even on a GOOS
   373  // platform where the file system path separator is a character other than
   374  // solidus, one particular dependency would be represented as
   375  // "github.com/alice/alice1".
   376  func CheckDepTree(osDirname string, wantDigests map[string]VersionedDigest) (map[string]VendorStatus, error) {
   377  	osDirname = filepath.Clean(osDirname)
   378  
   379  	// Create associative array to store the results of calling this function.
   380  	slashStatus := make(map[string]VendorStatus)
   381  
   382  	// Ensure top level pathname is a directory
   383  	fi, err := os.Stat(osDirname)
   384  	if err != nil {
   385  		// If the dir doesn't exist at all, that's OK - just consider all the
   386  		// wanted paths absent.
   387  		if os.IsNotExist(err) {
   388  			for path := range wantDigests {
   389  				slashStatus[path] = NotInTree
   390  			}
   391  			return slashStatus, nil
   392  		}
   393  		return nil, errors.Wrap(err, "cannot Stat")
   394  	}
   395  
   396  	if !fi.IsDir() {
   397  		return nil, errors.Errorf("cannot verify non directory: %q", osDirname)
   398  	}
   399  
   400  	// Initialize work queue with a node representing the specified directory
   401  	// name by declaring its relative pathname under the directory name as the
   402  	// empty string.
   403  	currentNode := &fsnode{osRelative: "", parentIndex: -1, isRequiredAncestor: true}
   404  	queue := []*fsnode{currentNode} // queue of directories that must be inspected
   405  
   406  	// In order to identify all file system nodes that are not in the lock file,
   407  	// represented by the specified expected sums parameter, and in order to
   408  	// only report the top level of a subdirectory of file system nodes, rather
   409  	// than every node internal to them, we will create a tree of nodes stored
   410  	// in a slice. We do this because we cannot predict the depth at which
   411  	// project roots occur. Some projects are fewer than and some projects more
   412  	// than the typical three layer subdirectory under the vendor root
   413  	// directory.
   414  	//
   415  	// For a following few examples, assume the below vendor root directory:
   416  	//
   417  	// github.com/alice/alice1/a1.go
   418  	// github.com/alice/alice2/a2.go
   419  	// github.com/bob/bob1/b1.go
   420  	// github.com/bob/bob2/b2.go
   421  	// launchpad.net/nifty/n1.go
   422  	//
   423  	// 1) If only the `alice1` and `alice2` projects were in the lock file, we'd
   424  	// prefer the output to state that `github.com/bob` is `NotInLock`, and
   425  	// `launchpad.net/nifty` is `NotInLock`.
   426  	//
   427  	// 2) If `alice1`, `alice2`, and `bob1` were in the lock file, we'd want to
   428  	// report `github.com/bob/bob2` as `NotInLock`, and `launchpad.net/nifty` is
   429  	// `NotInLock`.
   430  	//
   431  	// 3) If none of `alice1`, `alice2`, `bob1`, or `bob2` were in the lock
   432  	// file, the entire `github.com` directory would be reported as `NotInLock`,
   433  	// along with `launchpad.net/nifty` is `NotInLock`.
   434  	//
   435  	// Each node in our tree has the slice index of its parent node, so once we
   436  	// can categorically state a particular directory is required because it is
   437  	// in the lock file, we can mark all of its ancestors as also being
   438  	// required. Then, when we finish walking the directory hierarchy, any nodes
   439  	// which are not required but have a required parent will be marked as
   440  	// `NotInLock`.
   441  	nodes := []*fsnode{currentNode}
   442  
   443  	// Mark directories of expected projects as required. When each respective
   444  	// project is later found while traversing the vendor root hierarchy, its
   445  	// status will be updated to reflect whether its digest is empty, or,
   446  	// whether or not it matches the expected digest.
   447  	for slashPathname := range wantDigests {
   448  		slashStatus[slashPathname] = NotInTree
   449  	}
   450  
   451  	for len(queue) > 0 {
   452  		// Pop node from the top of queue (depth first traversal, reverse
   453  		// lexicographical order inside a directory), clearing the value stored
   454  		// in the slice's backing array as we proceed.
   455  		lq1 := len(queue) - 1
   456  		currentNode, queue[lq1], queue = queue[lq1], nil, queue[:lq1]
   457  		slashPathname := filepath.ToSlash(currentNode.osRelative)
   458  		osPathname := filepath.Join(osDirname, currentNode.osRelative)
   459  
   460  		if expectedSum, ok := wantDigests[slashPathname]; ok {
   461  			ls := EmptyDigestInLock
   462  			if expectedSum.HashVersion != HashVersion {
   463  				if !expectedSum.IsEmpty() {
   464  					ls = HashVersionMismatch
   465  				}
   466  			} else if len(expectedSum.Digest) > 0 {
   467  				projectSum, err := DigestFromDirectory(osPathname)
   468  				if err != nil {
   469  					return nil, errors.Wrap(err, "cannot compute dependency hash")
   470  				}
   471  				if bytes.Equal(projectSum.Digest, expectedSum.Digest) {
   472  					ls = NoMismatch
   473  				} else {
   474  					ls = DigestMismatchInLock
   475  				}
   476  			}
   477  			slashStatus[slashPathname] = ls
   478  
   479  			// Mark current nodes and all its parents as required.
   480  			for i := currentNode.myIndex; i != -1; i = nodes[i].parentIndex {
   481  				nodes[i].isRequiredAncestor = true
   482  			}
   483  
   484  			// Do not need to process this directory's contents because we
   485  			// already accounted for its contents while calculating its digest.
   486  			continue
   487  		}
   488  
   489  		osChildrenNames, err := sortedChildrenFromDirname(osPathname)
   490  		if err != nil {
   491  			return nil, errors.Wrap(err, "cannot get sorted list of directory children")
   492  		}
   493  		for _, osChildName := range osChildrenNames {
   494  			switch osChildName {
   495  			case ".", "..", "vendor", ".bzr", ".git", ".hg", ".svn":
   496  				// skip
   497  			default:
   498  				osChildRelative := filepath.Join(currentNode.osRelative, osChildName)
   499  				osChildPathname := filepath.Join(osDirname, osChildRelative)
   500  
   501  				// Create a new fsnode for this file system node, with a parent
   502  				// index set to the index of the current node.
   503  				otherNode := &fsnode{osRelative: osChildRelative, myIndex: len(nodes), parentIndex: currentNode.myIndex}
   504  
   505  				fi, err := os.Stat(osChildPathname)
   506  				if err != nil {
   507  					return nil, errors.Wrap(err, "cannot Stat")
   508  				}
   509  				nodes = append(nodes, otherNode) // Track all file system nodes...
   510  				if fi.IsDir() {
   511  					queue = append(queue, otherNode) // but only need to add directories to the work queue.
   512  				}
   513  			}
   514  		}
   515  	}
   516  
   517  	// Ignoring first node in the list, walk nodes from last to first. Whenever
   518  	// the current node is not required, but its parent is required, then the
   519  	// current node ought to be marked as `NotInLock`.
   520  	for len(nodes) > 1 {
   521  		// Pop node from top of queue, clearing the value stored in the slice's
   522  		// backing array as we proceed.
   523  		ln1 := len(nodes) - 1
   524  		currentNode, nodes[ln1], nodes = nodes[ln1], nil, nodes[:ln1]
   525  
   526  		if !currentNode.isRequiredAncestor && nodes[currentNode.parentIndex].isRequiredAncestor {
   527  			slashStatus[filepath.ToSlash(currentNode.osRelative)] = NotInLock
   528  		}
   529  	}
   530  	currentNode, nodes = nil, nil
   531  
   532  	return slashStatus, nil
   533  }
   534  
   535  // sortedChildrenFromDirname returns a lexicographically sorted list of child
   536  // nodes for the specified directory.
   537  func sortedChildrenFromDirname(osDirname string) ([]string, error) {
   538  	fh, err := os.Open(osDirname)
   539  	if err != nil {
   540  		return nil, errors.Wrap(err, "cannot Open")
   541  	}
   542  
   543  	osChildrenNames, err := fh.Readdirnames(0) // 0: read names of all children
   544  	if err != nil {
   545  		return nil, errors.Wrap(err, "cannot Readdirnames")
   546  	}
   547  	sort.Strings(osChildrenNames)
   548  
   549  	// Close the file handle to the open directory without masking possible
   550  	// previous error value.
   551  	if er := fh.Close(); err == nil {
   552  		err = errors.Wrap(er, "cannot Close")
   553  	}
   554  	return osChildrenNames, err
   555  }