// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package verify

import (
	"bytes"
	"crypto/sha256"
	"encoding/binary"
	"encoding/hex"
	"fmt"
	"hash"
	"io"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"github.com/pkg/errors"
)

// HashVersion is an arbitrary number that identifies the hash algorithm used by
// the directory hasher.
//
//   1: SHA256, as implemented in crypto/sha256
const HashVersion = 1

// osPathSeparator is the platform's path separator as a string.
const osPathSeparator = string(filepath.Separator)

// lineEndingReader is an `io.Reader` that converts CRLF sequences to LF.
//
// When cloning or checking out repositories, some Version Control Systems,
// VCSs, on some supported Go Operating System architectures, GOOS, will
// automatically convert line endings that end in a single line feed byte, LF,
// to line endings that end in a two byte sequence of carriage return, CR,
// followed by LF. This LF to CRLF conversion would cause otherwise identical
// versioned files to have different on disk contents simply based on which VCS
// and GOOS are involved. Different file contents for the same file would cause
// the resultant hashes to differ. In order to ensure file contents normalize
// and produce the same hash, this structure wraps an io.Reader that modifies
// the file's contents when it is read, translating all CRLF sequences to LF.
type lineEndingReader struct {
	src             io.Reader // source io.Reader from which this reads
	prevReadEndedCR bool      // used to track whether final byte of previous Read was CR
}

// newLineEndingReader returns a new lineEndingReader that reads from the
// specified source io.Reader.
51 func newLineEndingReader(src io.Reader) *lineEndingReader { 52 return &lineEndingReader{src: src} 53 } 54 55 var crlf = []byte("\r\n") 56 57 // Read consumes bytes from the structure's source io.Reader to fill the 58 // specified slice of bytes. It converts all CRLF byte sequences to LF, and 59 // handles cases where CR and LF straddle across two Read operations. 60 func (f *lineEndingReader) Read(buf []byte) (int, error) { 61 buflen := len(buf) 62 if f.prevReadEndedCR { 63 // Read one fewer bytes so we have room if the first byte of the 64 // upcoming Read is not a LF, in which case we will need to insert 65 // trailing CR from previous read. 66 buflen-- 67 } 68 nr, er := f.src.Read(buf[:buflen]) 69 if nr > 0 { 70 if f.prevReadEndedCR && buf[0] != '\n' { 71 // Having a CRLF split across two Read operations is rare, so the 72 // performance impact of copying entire buffer to the right by one 73 // byte, while suboptimal, will at least will not happen very 74 // often. This negative performance impact is mitigated somewhat on 75 // many Go compilation architectures, GOARCH, because the `copy` 76 // builtin uses a machine opcode for performing the memory copy on 77 // possibly overlapping regions of memory. This machine opcodes is 78 // not instantaneous and does require multiple CPU cycles to 79 // complete, but is significantly faster than the application 80 // looping through bytes. 81 copy(buf[1:nr+1], buf[:nr]) // shift data to right one byte 82 buf[0] = '\r' // insert the previous skipped CR byte at start of buf 83 nr++ // pretend we read one more byte 84 } 85 86 // Remove any CRLF sequences in the buffer using `bytes.Index` because, 87 // like the `copy` builtin on many GOARCHs, it also takes advantage of a 88 // machine opcode to search for byte patterns. 89 var searchOffset int // index within buffer from whence the search will commence for each loop; set to the index of the end of the previous loop. 
90 var shiftCount int // each subsequenct shift operation needs to shift bytes to the left by one more position than the shift that preceded it. 91 previousIndex := -1 // index of previously found CRLF; -1 means no previous index 92 for { 93 index := bytes.Index(buf[searchOffset:nr], crlf) 94 if index == -1 { 95 break 96 } 97 index += searchOffset // convert relative index to absolute 98 if previousIndex != -1 { 99 // shift substring between previous index and this index 100 copy(buf[previousIndex-shiftCount:], buf[previousIndex+1:index]) 101 shiftCount++ // next shift needs to be 1 byte to the left 102 } 103 previousIndex = index 104 searchOffset = index + 2 // start next search after len(crlf) 105 } 106 if previousIndex != -1 { 107 // handle final shift 108 copy(buf[previousIndex-shiftCount:], buf[previousIndex+1:nr]) 109 shiftCount++ 110 } 111 nr -= shiftCount // shorten byte read count by number of shifts executed 112 113 // When final byte from a read operation is CR, do not emit it until 114 // ensure first byte on next read is not LF. 115 if f.prevReadEndedCR = buf[nr-1] == '\r'; f.prevReadEndedCR { 116 nr-- // pretend byte was never read from source 117 } 118 } else if f.prevReadEndedCR { 119 // Reading from source returned nothing, but this struct is sitting on a 120 // trailing CR from previous Read, so let's give it to client now. 121 buf[0] = '\r' 122 nr = 1 123 er = nil 124 f.prevReadEndedCR = false // prevent infinite loop 125 } 126 return nr, er 127 } 128 129 // writeBytesWithNull appends the specified data to the specified hash, followed by 130 // the NULL byte, in order to make accidental hash collisions less likely. 131 func writeBytesWithNull(h hash.Hash, data []byte) { 132 // Ignore return values from writing to the hash, because hash write always 133 // returns nil error. 134 _, _ = h.Write(append(data, 0)) 135 } 136 137 // dirWalkClosure is used to reduce number of allocation involved in closing 138 // over these variables. 
139 type dirWalkClosure struct { 140 someCopyBufer []byte // allocate once and reuse for each file copy 141 someModeBytes []byte // allocate once and reuse for each node 142 someDirLen int 143 someHash hash.Hash 144 } 145 146 // DigestFromDirectory returns a hash of the specified directory contents, which 147 // will match the hash computed for any directory on any supported Go platform 148 // whose contents exactly match the specified directory. 149 // 150 // This function ignores any file system node named `vendor`, `.bzr`, `.git`, 151 // `.hg`, and `.svn`, as these are typically used as Version Control System 152 // (VCS) directories. 153 // 154 // Other than the `vendor` and VCS directories mentioned above, the calculated 155 // hash includes the pathname to every discovered file system node, whether it 156 // is an empty directory, a non-empty directory, an empty file, or a non-empty file. 157 // 158 // Symbolic links are excluded, as they are not considered valid elements in the 159 // definition of a Go module. 160 func DigestFromDirectory(osDirname string) (VersionedDigest, error) { 161 osDirname = filepath.Clean(osDirname) 162 163 // Create a single hash instance for the entire operation, rather than a new 164 // hash for each node we encounter. 165 166 closure := dirWalkClosure{ 167 someCopyBufer: make([]byte, 4*1024), // only allocate a single page 168 someModeBytes: make([]byte, 4), // scratch place to store encoded os.FileMode (uint32) 169 someDirLen: len(osDirname) + len(osPathSeparator), 170 someHash: sha256.New(), 171 } 172 173 err := filepath.Walk(osDirname, func(osPathname string, info os.FileInfo, err error) error { 174 if err != nil { 175 return err 176 } 177 178 // Completely ignore symlinks. 
179 if info.Mode()&os.ModeSymlink != 0 { 180 return nil 181 } 182 183 var osRelative string 184 if len(osPathname) > closure.someDirLen { 185 osRelative = osPathname[closure.someDirLen:] 186 } 187 188 switch filepath.Base(osRelative) { 189 case "vendor", ".bzr", ".git", ".hg", ".svn": 190 return filepath.SkipDir 191 } 192 193 // We could make our own enum-like data type for encoding the file type, 194 // but Go's runtime already gives us architecture independent file 195 // modes, as discussed in `os/types.go`: 196 // 197 // Go's runtime FileMode type has same definition on all systems, so 198 // that information about files can be moved from one system to 199 // another portably. 200 var mt os.FileMode 201 202 // We only care about the bits that identify the type of a file system 203 // node, and can ignore append, exclusive, temporary, setuid, setgid, 204 // permission bits, and sticky bits, which are coincident to bits which 205 // declare type of the file system node. 206 modeType := info.Mode() & os.ModeType 207 var shouldSkip bool // skip some types of file system nodes 208 209 switch { 210 case modeType&os.ModeDir > 0: 211 mt = os.ModeDir 212 // This func does not need to enumerate children, because 213 // filepath.Walk will do that for us. 214 shouldSkip = true 215 case modeType&os.ModeNamedPipe > 0: 216 mt = os.ModeNamedPipe 217 shouldSkip = true 218 case modeType&os.ModeSocket > 0: 219 mt = os.ModeSocket 220 shouldSkip = true 221 case modeType&os.ModeDevice > 0: 222 mt = os.ModeDevice 223 shouldSkip = true 224 } 225 226 // Write the relative pathname to hash because the hash is a function of 227 // the node names, node types, and node contents. Added benefit is that 228 // empty directories, named pipes, sockets, and devices. Use 229 // `filepath.ToSlash` to ensure relative pathname is os-agnostic. 
230 writeBytesWithNull(closure.someHash, []byte(filepath.ToSlash(osRelative))) 231 232 binary.LittleEndian.PutUint32(closure.someModeBytes, uint32(mt)) // encode the type of mode 233 writeBytesWithNull(closure.someHash, closure.someModeBytes) // and write to hash 234 235 if shouldSkip { 236 return nil // nothing more to do for some of the node types 237 } 238 239 // If we get here, node is a regular file. 240 fh, err := os.Open(osPathname) 241 if err != nil { 242 return errors.Wrap(err, "cannot Open") 243 } 244 245 var bytesWritten int64 246 bytesWritten, err = io.CopyBuffer(closure.someHash, newLineEndingReader(fh), closure.someCopyBufer) // fast copy of file contents to hash 247 err = errors.Wrap(err, "cannot Copy") // errors.Wrap only wraps non-nil, so skip extra check 248 writeBytesWithNull(closure.someHash, []byte(strconv.FormatInt(bytesWritten, 10))) // 10: format file size as base 10 integer 249 250 // Close the file handle to the open file without masking 251 // possible previous error value. 252 if er := fh.Close(); err == nil { 253 err = errors.Wrap(er, "cannot Close") 254 } 255 return err 256 }) 257 258 if err != nil { 259 return VersionedDigest{}, err 260 } 261 262 return VersionedDigest{ 263 HashVersion: HashVersion, 264 Digest: closure.someHash.Sum(nil), 265 }, nil 266 } 267 268 // VendorStatus represents one of a handful of possible status conditions for a 269 // particular file system node in the vendor directory tree. 270 type VendorStatus uint8 271 272 const ( 273 // NotInLock is used when a file system node exists for which there is no 274 // corresponding dependency in the lock file. 275 NotInLock VendorStatus = iota 276 277 // NotInTree is used when a lock file dependency exists for which there is 278 // no corresponding file system node. 279 NotInTree 280 281 // NoMismatch is used when the digest for a dependency listed in the 282 // lockfile matches what is calculated from the file system. 
283 NoMismatch 284 285 // EmptyDigestInLock is used when the digest for a dependency listed in the 286 // lock file is the empty string. While this is a special case of 287 // DigestMismatchInLock, separating the cases is a desired feature. 288 EmptyDigestInLock 289 290 // DigestMismatchInLock is used when the digest for a dependency listed in 291 // the lock file does not match what is calculated from the file system. 292 DigestMismatchInLock 293 294 // HashVersionMismatch indicates that the hashing algorithm used to generate 295 // the digest being compared against is not the same as the one used by the 296 // current program. 297 HashVersionMismatch 298 ) 299 300 func (ls VendorStatus) String() string { 301 switch ls { 302 case NotInLock: 303 return "not in lock" 304 case NotInTree: 305 return "not in tree" 306 case NoMismatch: 307 return "match" 308 case EmptyDigestInLock: 309 return "empty digest in lock" 310 case DigestMismatchInLock: 311 return "mismatch" 312 case HashVersionMismatch: 313 return "hasher changed" 314 } 315 return "unknown" 316 } 317 318 // fsnode is used to track which file system nodes are required by the lock 319 // file. When a directory is found whose name matches one of the declared 320 // projects in the lock file, e.g., "github.com/alice/alice1", an fsnode is 321 // created for that directory, but not for any of its children. All other file 322 // system nodes encountered will result in a fsnode created to represent it. 323 type fsnode struct { 324 osRelative string // os-specific relative path of a resource under vendor root 325 isRequiredAncestor bool // true iff this node or one of its descendants is in the lock file 326 myIndex, parentIndex int // index of this node and its parent in the tree's slice 327 } 328 329 // VersionedDigest comprises both a hash digest, and a simple integer indicating 330 // the version of the hash algorithm that produced the digest. 
331 type VersionedDigest struct { 332 HashVersion int 333 Digest []byte 334 } 335 336 func (vd VersionedDigest) String() string { 337 return fmt.Sprintf("%s:%s", strconv.Itoa(vd.HashVersion), hex.EncodeToString(vd.Digest)) 338 } 339 340 // IsEmpty indicates if the VersionedDigest is the zero value. 341 func (vd VersionedDigest) IsEmpty() bool { 342 return vd.HashVersion == 0 && len(vd.Digest) == 0 343 } 344 345 // ParseVersionedDigest decodes the string representation of versioned digest 346 // information - a colon-separated string with a version number in the first 347 // part and the hex-encdoed hash digest in the second - as a VersionedDigest. 348 func ParseVersionedDigest(input string) (VersionedDigest, error) { 349 var vd VersionedDigest 350 var err error 351 352 parts := strings.Split(input, ":") 353 if len(parts) != 2 { 354 return VersionedDigest{}, errors.Errorf("expected two colon-separated components in the versioned hash digest, got %q", input) 355 } 356 if vd.Digest, err = hex.DecodeString(parts[1]); err != nil { 357 return VersionedDigest{}, err 358 } 359 if vd.HashVersion, err = strconv.Atoi(parts[0]); err != nil { 360 return VersionedDigest{}, err 361 } 362 363 return vd, nil 364 } 365 366 // CheckDepTree verifies a dependency tree according to expected digest sums, 367 // and returns an associative array of file system nodes and their respective 368 // vendor status conditions. 369 // 370 // The keys to the expected digest sums associative array represent the 371 // project's dependencies, and each is required to be expressed using the 372 // solidus character, `/`, as its path separator. For example, even on a GOOS 373 // platform where the file system path separator is a character other than 374 // solidus, one particular dependency would be represented as 375 // "github.com/alice/alice1". 
376 func CheckDepTree(osDirname string, wantDigests map[string]VersionedDigest) (map[string]VendorStatus, error) { 377 osDirname = filepath.Clean(osDirname) 378 379 // Create associative array to store the results of calling this function. 380 slashStatus := make(map[string]VendorStatus) 381 382 // Ensure top level pathname is a directory 383 fi, err := os.Stat(osDirname) 384 if err != nil { 385 // If the dir doesn't exist at all, that's OK - just consider all the 386 // wanted paths absent. 387 if os.IsNotExist(err) { 388 for path := range wantDigests { 389 slashStatus[path] = NotInTree 390 } 391 return slashStatus, nil 392 } 393 return nil, errors.Wrap(err, "cannot Stat") 394 } 395 396 if !fi.IsDir() { 397 return nil, errors.Errorf("cannot verify non directory: %q", osDirname) 398 } 399 400 // Initialize work queue with a node representing the specified directory 401 // name by declaring its relative pathname under the directory name as the 402 // empty string. 403 currentNode := &fsnode{osRelative: "", parentIndex: -1, isRequiredAncestor: true} 404 queue := []*fsnode{currentNode} // queue of directories that must be inspected 405 406 // In order to identify all file system nodes that are not in the lock file, 407 // represented by the specified expected sums parameter, and in order to 408 // only report the top level of a subdirectory of file system nodes, rather 409 // than every node internal to them, we will create a tree of nodes stored 410 // in a slice. We do this because we cannot predict the depth at which 411 // project roots occur. Some projects are fewer than and some projects more 412 // than the typical three layer subdirectory under the vendor root 413 // directory. 
414 // 415 // For a following few examples, assume the below vendor root directory: 416 // 417 // github.com/alice/alice1/a1.go 418 // github.com/alice/alice2/a2.go 419 // github.com/bob/bob1/b1.go 420 // github.com/bob/bob2/b2.go 421 // launchpad.net/nifty/n1.go 422 // 423 // 1) If only the `alice1` and `alice2` projects were in the lock file, we'd 424 // prefer the output to state that `github.com/bob` is `NotInLock`, and 425 // `launchpad.net/nifty` is `NotInLock`. 426 // 427 // 2) If `alice1`, `alice2`, and `bob1` were in the lock file, we'd want to 428 // report `github.com/bob/bob2` as `NotInLock`, and `launchpad.net/nifty` is 429 // `NotInLock`. 430 // 431 // 3) If none of `alice1`, `alice2`, `bob1`, or `bob2` were in the lock 432 // file, the entire `github.com` directory would be reported as `NotInLock`, 433 // along with `launchpad.net/nifty` is `NotInLock`. 434 // 435 // Each node in our tree has the slice index of its parent node, so once we 436 // can categorically state a particular directory is required because it is 437 // in the lock file, we can mark all of its ancestors as also being 438 // required. Then, when we finish walking the directory hierarchy, any nodes 439 // which are not required but have a required parent will be marked as 440 // `NotInLock`. 441 nodes := []*fsnode{currentNode} 442 443 // Mark directories of expected projects as required. When each respective 444 // project is later found while traversing the vendor root hierarchy, its 445 // status will be updated to reflect whether its digest is empty, or, 446 // whether or not it matches the expected digest. 447 for slashPathname := range wantDigests { 448 slashStatus[slashPathname] = NotInTree 449 } 450 451 for len(queue) > 0 { 452 // Pop node from the top of queue (depth first traversal, reverse 453 // lexicographical order inside a directory), clearing the value stored 454 // in the slice's backing array as we proceed. 
455 lq1 := len(queue) - 1 456 currentNode, queue[lq1], queue = queue[lq1], nil, queue[:lq1] 457 slashPathname := filepath.ToSlash(currentNode.osRelative) 458 osPathname := filepath.Join(osDirname, currentNode.osRelative) 459 460 if expectedSum, ok := wantDigests[slashPathname]; ok { 461 ls := EmptyDigestInLock 462 if expectedSum.HashVersion != HashVersion { 463 if !expectedSum.IsEmpty() { 464 ls = HashVersionMismatch 465 } 466 } else if len(expectedSum.Digest) > 0 { 467 projectSum, err := DigestFromDirectory(osPathname) 468 if err != nil { 469 return nil, errors.Wrap(err, "cannot compute dependency hash") 470 } 471 if bytes.Equal(projectSum.Digest, expectedSum.Digest) { 472 ls = NoMismatch 473 } else { 474 ls = DigestMismatchInLock 475 } 476 } 477 slashStatus[slashPathname] = ls 478 479 // Mark current nodes and all its parents as required. 480 for i := currentNode.myIndex; i != -1; i = nodes[i].parentIndex { 481 nodes[i].isRequiredAncestor = true 482 } 483 484 // Do not need to process this directory's contents because we 485 // already accounted for its contents while calculating its digest. 486 continue 487 } 488 489 osChildrenNames, err := sortedChildrenFromDirname(osPathname) 490 if err != nil { 491 return nil, errors.Wrap(err, "cannot get sorted list of directory children") 492 } 493 for _, osChildName := range osChildrenNames { 494 switch osChildName { 495 case ".", "..", "vendor", ".bzr", ".git", ".hg", ".svn": 496 // skip 497 default: 498 osChildRelative := filepath.Join(currentNode.osRelative, osChildName) 499 osChildPathname := filepath.Join(osDirname, osChildRelative) 500 501 // Create a new fsnode for this file system node, with a parent 502 // index set to the index of the current node. 
503 otherNode := &fsnode{osRelative: osChildRelative, myIndex: len(nodes), parentIndex: currentNode.myIndex} 504 505 fi, err := os.Stat(osChildPathname) 506 if err != nil { 507 return nil, errors.Wrap(err, "cannot Stat") 508 } 509 nodes = append(nodes, otherNode) // Track all file system nodes... 510 if fi.IsDir() { 511 queue = append(queue, otherNode) // but only need to add directories to the work queue. 512 } 513 } 514 } 515 } 516 517 // Ignoring first node in the list, walk nodes from last to first. Whenever 518 // the current node is not required, but its parent is required, then the 519 // current node ought to be marked as `NotInLock`. 520 for len(nodes) > 1 { 521 // Pop node from top of queue, clearing the value stored in the slice's 522 // backing array as we proceed. 523 ln1 := len(nodes) - 1 524 currentNode, nodes[ln1], nodes = nodes[ln1], nil, nodes[:ln1] 525 526 if !currentNode.isRequiredAncestor && nodes[currentNode.parentIndex].isRequiredAncestor { 527 slashStatus[filepath.ToSlash(currentNode.osRelative)] = NotInLock 528 } 529 } 530 currentNode, nodes = nil, nil 531 532 return slashStatus, nil 533 } 534 535 // sortedChildrenFromDirname returns a lexicographically sorted list of child 536 // nodes for the specified directory. 537 func sortedChildrenFromDirname(osDirname string) ([]string, error) { 538 fh, err := os.Open(osDirname) 539 if err != nil { 540 return nil, errors.Wrap(err, "cannot Open") 541 } 542 543 osChildrenNames, err := fh.Readdirnames(0) // 0: read names of all children 544 if err != nil { 545 return nil, errors.Wrap(err, "cannot Readdirnames") 546 } 547 sort.Strings(osChildrenNames) 548 549 // Close the file handle to the open directory without masking possible 550 // previous error value. 551 if er := fh.Close(); err == nil { 552 err = errors.Wrap(er, "cannot Close") 553 } 554 return osChildrenNames, err 555 }