github.com/maruel/nin@v0.0.0-20220112143044-f35891e3ce7e/deps_log.go (about) 1 // Copyright 2012 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package nin 16 17 import ( 18 "bufio" 19 "bytes" 20 "encoding/binary" 21 "errors" 22 "fmt" 23 "io/ioutil" 24 "os" 25 ) 26 27 // Deps is the reading (startup-time) struct. 28 type Deps struct { 29 MTime TimeStamp 30 Nodes []*Node 31 } 32 33 // NewDeps returns an initialized Deps. 34 func NewDeps(mtime TimeStamp, nodeCount int) *Deps { 35 return &Deps{ 36 MTime: mtime, 37 Nodes: make([]*Node, nodeCount), 38 } 39 } 40 41 // DepsLog represents a .ninja_deps log file to accelerate incremental build. 42 // 43 // As build commands run they can output extra dependency information 44 // (e.g. header dependencies for C source) dynamically. DepsLog collects 45 // that information at build time and uses it for subsequent builds. 46 // 47 // The on-disk format is based on two primary design constraints: 48 // 49 // - it must be written to as a stream (during the build, which may be 50 // interrupted); 51 // 52 // - it can be read all at once on startup. (Alternative designs, where 53 // it contains indexing information, were considered and discarded as 54 // too complicated to implement; if the file is small than reading it 55 // fully on startup is acceptable.) 56 // 57 // Here are some stats from the Windows Chrome dependency files, to 58 // help guide the design space. The total text in the files sums to 59 // 90mb so some compression is warranted to keep load-time fast. 60 // There's about 10k files worth of dependencies that reference about 61 // 40k total paths totalling 2mb of unique strings. 62 // 63 // Based on these stats, here's the current design. 64 // 65 // The file is structured as version header followed by a sequence of records. 66 // Each record is either a path string or a dependency list. 67 // Numbering the path strings in file order gives them dense integer ids. 68 // A dependency list maps an output id to a list of input ids. 69 // 70 // Concretely, a record is: 71 // four bytes record length, high bit indicates record type 72 // (but max record sizes are capped at 512kB) 73 // path records contain the string name of the path, followed by up to 3 74 // padding bytes to align on 4 byte boundaries, followed by the 75 // one's complement of the expected index of the record (to detect 76 // concurrent writes of multiple ninja processes to the log). 77 // dependency records are an array of 4-byte integers 78 // [output path id, 79 // output path mtime (lower 4 bytes), output path mtime (upper 4 bytes), 80 // input path id, input path id...] 81 // (The mtime is compared against the on-disk output path mtime 82 // to verify the stored data is up-to-date.) 83 // If two records reference the same output the latter one in the file 84 // wins, allowing updates to just be appended to the file. A separate 85 // repacking step can run occasionally to remove dead records. 86 type DepsLog struct { 87 // Maps id -> Node. 88 Nodes []*Node 89 // Maps id -> Deps of that id. 90 Deps []*Deps 91 92 filePath string 93 file *os.File 94 buf *bufio.Writer 95 needsRecompaction bool 96 } 97 98 // The version is stored as 4 bytes after the signature and also serves as a 99 // byte order mark. Signature and version combined are 16 bytes long. 100 const ( 101 depsLogFileSignature = "# ninjadeps\n" 102 depsLogCurrentVersion = uint32(4) 103 ) 104 105 // Record size is currently limited to less than the full 32 bit, due to 106 // internal buffers having to have this size. 107 const maxRecordSize = (1 << 19) - 1 108 109 // OpenForWrite prepares writing to the log file without actually opening it - 110 // that will happen when/if it's needed. 111 func (d *DepsLog) OpenForWrite(path string) error { 112 if d.needsRecompaction { 113 if err := d.Recompact(path); err != nil { 114 return err 115 } 116 } 117 118 if d.file != nil { 119 panic("M-A") 120 } 121 // we don't actually open the file right now, but will do 122 // so on the first write attempt 123 d.filePath = path 124 return nil 125 } 126 127 func (d *DepsLog) recordDeps(node *Node, mtime TimeStamp, nodes []*Node) error { 128 nodeCount := len(nodes) 129 // Track whether there's any new data to be recorded. 130 madeChange := false 131 132 // Assign ids to all nodes that are missing one. 133 if node.ID < 0 { 134 if err := d.recordID(node); err != nil { 135 return err 136 } 137 madeChange = true 138 } 139 for i := 0; i < nodeCount; i++ { 140 if nodes[i].ID < 0 { 141 if err := d.recordID(nodes[i]); err != nil { 142 return err 143 } 144 madeChange = true 145 } 146 } 147 148 // See if the new data is different than the existing data, if any. 149 if !madeChange { 150 deps := d.GetDeps(node) 151 if deps == nil || deps.MTime != mtime || len(deps.Nodes) != nodeCount { 152 madeChange = true 153 } else { 154 for i := 0; i < nodeCount; i++ { 155 if deps.Nodes[i] != nodes[i] { 156 madeChange = true 157 break 158 } 159 } 160 } 161 } 162 163 // Don't write anything if there's no new info. 164 if !madeChange { 165 return nil 166 } 167 168 // Update on-disk representation. 169 size := uint32(4 * (1 + 2 + nodeCount)) 170 if size > maxRecordSize { 171 return errors.New("too many dependencies") 172 } 173 if err := d.openForWriteIfNeeded(); err != nil { 174 return err 175 } 176 size |= 0x80000000 // Deps record: set high bit. 177 178 if err := binary.Write(d.buf, binary.LittleEndian, size); err != nil { 179 return err 180 } 181 if err := binary.Write(d.buf, binary.LittleEndian, uint32(node.ID)); err != nil { 182 return err 183 } 184 if err := binary.Write(d.buf, binary.LittleEndian, uint64(mtime)); err != nil { 185 return err 186 } 187 for i := 0; i < nodeCount; i++ { 188 if err := binary.Write(d.buf, binary.LittleEndian, uint32(nodes[i].ID)); err != nil { 189 return err 190 } 191 } 192 if err := d.buf.Flush(); err != nil { 193 return err 194 } 195 196 // Update in-memory representation. 197 deps := NewDeps(mtime, nodeCount) 198 for i := 0; i < nodeCount; i++ { 199 deps.Nodes[i] = nodes[i] 200 } 201 d.updateDeps(node.ID, deps) 202 return nil 203 } 204 205 // Close closes the file handle. 206 func (d *DepsLog) Close() error { 207 // create the file even if nothing has been recorded 208 if err := d.openForWriteIfNeeded(); err != nil { 209 return err 210 } 211 var err error 212 if d.file != nil { 213 if err2 := d.buf.Flush(); err2 != nil { 214 err = err2 215 } 216 if err2 := d.file.Close(); err2 != nil { 217 err = err2 218 } 219 } 220 d.buf = nil 221 d.file = nil 222 return err 223 } 224 225 // Load loads a .ninja_deps to accelerate incremental build. 226 // 227 // Note: For version differences, this should migrate to the new format. 228 // But the v1 format could sometimes (rarely) end up with invalid data, so 229 // don't migrate v1 to v3 to force a rebuild. (v2 only existed for a few days, 230 // and there was no release with it, so pretend that it never happened.) 231 // 232 // Warning: the whole file content is kept alive. 233 // 234 // TODO(maruel): Make it an option so that when used as a library it doesn't 235 // become a memory bloat. This is especially important when recompacting. 236 func (d *DepsLog) Load(path string, state *State) (LoadStatus, error) { 237 defer metricRecord(".ninja_deps load")() 238 // Read the file all at once. The drawback is that it will fail hard on 32 239 // bits OS on large builds. This should be rare in 2022. For small builds, it 240 // will be fine (and faster). 241 data, err := ioutil.ReadFile(path) 242 if err != nil { 243 if os.IsNotExist(err) { 244 return LoadNotFound, err 245 } 246 return LoadError, err 247 } 248 249 // Validate header. 250 validHeader := false 251 version := uint32(0) 252 if len(data) >= len(depsLogFileSignature)+4 && unsafeString(data[:len(depsLogFileSignature)]) == depsLogFileSignature { 253 version = binary.LittleEndian.Uint32(data[len(depsLogFileSignature):]) 254 validHeader = version == depsLogCurrentVersion 255 } 256 if !validHeader { 257 // Don't report this as a failure. An empty deps log will cause 258 // us to rebuild the outputs anyway. 259 _ = os.Remove(path) 260 if version == 1 { 261 return LoadSuccess, errors.New("deps log version change; rebuilding") 262 } 263 l := bytes.IndexByte(data[:], 0) 264 if l <= 0 { 265 return LoadSuccess, errors.New("bad deps log signature or version; starting over") 266 } 267 return LoadSuccess, fmt.Errorf("bad deps log signature %q or version %d; starting over", data[:l], version) 268 } 269 270 // Skip the header. 271 // TODO(maruel): Calculate if it is faster to do "data = data[4:8]" or use 272 // "data[offset+4:offset+8]". 273 // Offset is kept to keep the last successful read, to truncate in case of 274 // failure. 275 offset := int64(len(depsLogFileSignature) + 4) 276 data = data[offset:] 277 uniqueDepRecordCount := 0 278 totalDepRecordCount := 0 279 for len(data) != 0 { 280 // A minimal record is size (4 bytes) plus one of: 281 // - content (>=4 + checksum(4)); CanonicalizePath() rejects empty paths. 282 // - (id(4)+mtime(8)+nodes(4x) >12) for deps node. 283 if len(data) < 12 { 284 err = fmt.Errorf("premature end of file after %d bytes", int(offset)+len(data)) 285 break 286 } 287 size := binary.LittleEndian.Uint32(data[:4]) 288 // Skip |size|. Only bump offset after a successful read down below. 289 isDeps := size&0x80000000 != 0 290 size = size & ^uint32(0x80000000) 291 data = data[4:] 292 if len(data) < int(size) { 293 err = fmt.Errorf("premature end of file after %d bytes", int(offset)+len(data)+4) 294 break 295 } 296 if size%4 != 0 || size < 8 || size > maxRecordSize { 297 // It'd be nice to do a check for "size < 12" instead. The likelihood of 298 // a path with 3 characters or less is very small. 299 err = fmt.Errorf("record size %d is out of bounds", size) 300 break 301 } 302 if isDeps { 303 if size < 12 { 304 err = errors.New("record size is too small for deps") 305 break 306 } 307 outID := int32(binary.LittleEndian.Uint32(data[:4])) 308 if outID < 0 || outID >= 0x1000000 { 309 // That's a lot of nodes. 310 err = errors.New("record deps id is out of bounds") 311 break 312 } 313 mtime := TimeStamp(binary.LittleEndian.Uint64(data[4:12])) 314 depsCount := int(size-12) / 4 315 316 // TODO(maruel): Redesign to reduce bound checks. 317 deps := NewDeps(mtime, depsCount) 318 x := 12 319 for i := 0; i < depsCount; i++ { 320 v := binary.LittleEndian.Uint32(data[x : x+4]) 321 if int(v) >= len(d.Nodes) || d.Nodes[v] == nil { 322 err = errors.New("record deps node id is out of bounds") 323 break 324 } 325 deps.Nodes[i] = d.Nodes[v] 326 x += 4 327 } 328 329 totalDepRecordCount++ 330 if !d.updateDeps(outID, deps) { 331 uniqueDepRecordCount++ 332 } 333 } else { 334 pathSize := size - 4 335 // There can be up to 3 bytes of padding. 336 if data[pathSize-1] == '\x00' { 337 pathSize-- 338 if data[pathSize-1] == '\x00' { 339 pathSize-- 340 if data[pathSize-1] == '\x00' { 341 pathSize-- 342 } 343 } 344 } 345 346 // TODO(maruel): We need to differentiate if we are using the GC or not. 347 // When the GC is disabled, #YOLO, the buffer will never go away anyway 348 // so better to leverage it! 349 subpath := unsafeString(data[:pathSize]) 350 // Here we make a copy, because we do not want to keep a reference to the 351 // read buffer. 352 // subpath := string(data[:pathSize]) 353 354 // It is not necessary to pass in a correct slashBits here. It will 355 // either be a Node that's in the manifest (in which case it will already 356 // have a correct slashBits that GetNode will look up), or it is an 357 // implicit dependency from a .d which does not affect the build command 358 // (and so need not have its slashes maintained). 359 node := state.GetNode(subpath, 0) 360 361 // Check that the expected index matches the actual index. This can only 362 // happen if two ninja processes write to the same deps log concurrently. 363 // (This uses unary complement to make the checksum look less like a 364 // dependency record entry.) 365 checksum := binary.LittleEndian.Uint32(data[size-4 : size]) 366 expectedID := ^checksum 367 id := int32(len(d.Nodes)) 368 if id != int32(expectedID) { 369 err = errors.New("node id checksum is invalid") 370 break 371 } 372 if node.ID >= 0 { 373 err = errors.New("node is duplicate") 374 break 375 } 376 node.ID = id 377 d.Nodes = append(d.Nodes, node) 378 } 379 // Register the successful read. 380 data = data[size:] 381 offset += int64(size) + 4 382 } 383 384 if err != nil { 385 // An error occurred while loading; try to recover by truncating the 386 // file to the last fully-read record. 387 if err2 := os.Truncate(path, offset); err2 != nil { 388 return LoadError, fmt.Errorf("truncating failed while parsing error %q: %w", err, err2) 389 } 390 391 // The truncate succeeded; we'll just report the load error as a 392 // warning because the build can proceed. 393 err = errors.New(err.Error() + "; recovering") 394 return LoadSuccess, err 395 } 396 397 // Rebuild the log if there are too many dead records. 398 const minCompactionEntryCount = 1000 399 kCompactionRatio := 3 400 if totalDepRecordCount > minCompactionEntryCount && totalDepRecordCount > uniqueDepRecordCount*kCompactionRatio { 401 d.needsRecompaction = true 402 } 403 return LoadSuccess, nil 404 } 405 406 // GetDeps returns the Deps for this node ID. 407 // 408 // Silently ignore invalid node ID. 409 func (d *DepsLog) GetDeps(node *Node) *Deps { 410 // Abort if the node has no id (never referenced in the deps) or if 411 // there's no deps recorded for the node. 412 if node.ID < 0 || int(node.ID) >= len(d.Deps) { 413 return nil 414 } 415 return d.Deps[node.ID] 416 } 417 418 // GetFirstReverseDepsNode returns something? 419 // 420 // TODO(maruel): Understand better. 421 func (d *DepsLog) GetFirstReverseDepsNode(node *Node) *Node { 422 for id := 0; id < len(d.Deps); id++ { 423 deps := d.Deps[id] 424 if deps == nil { 425 continue 426 } 427 for _, n := range deps.Nodes { 428 if n == node { 429 return d.Nodes[id] 430 } 431 } 432 } 433 return nil 434 } 435 436 // Recompact rewrites the known log entries, throwing away old data. 437 func (d *DepsLog) Recompact(path string) error { 438 defer metricRecord(".ninja_deps recompact")() 439 440 if err := d.Close(); err != nil { 441 return err 442 } 443 tempPath := path + ".recompact" 444 445 // OpenForWrite() opens for append. Make sure it's not appending to a 446 // left-over file from a previous recompaction attempt that crashed somehow. 447 if err := os.Remove(tempPath); err != nil && !os.IsNotExist(err) { 448 return err 449 } 450 451 // Create a new temporary log to regenerate everything. 452 newLog := DepsLog{} 453 if err := newLog.OpenForWrite(tempPath); err != nil { 454 return err 455 } 456 457 // Clear all known ids so that new ones can be reassigned. The new indices 458 // will refer to the ordering in newLog, not in the current log. 459 for _, i := range d.Nodes { 460 i.ID = -1 461 } 462 463 // Write out all deps again. 464 for oldID := 0; oldID < len(d.Deps); oldID++ { 465 deps := d.Deps[oldID] 466 if deps == nil { // If nodes[oldID] is a leaf, it has no deps. 467 continue 468 } 469 470 if !d.IsDepsEntryLiveFor(d.Nodes[oldID]) { 471 continue 472 } 473 474 if err := newLog.recordDeps(d.Nodes[oldID], deps.MTime, deps.Nodes); err != nil { 475 _ = newLog.Close() 476 return err 477 } 478 } 479 480 if err := newLog.Close(); err != nil { 481 return err 482 } 483 484 // All nodes now have ids that refer to newLog, so steal its data. 485 d.Deps = newLog.Deps 486 d.Nodes = newLog.Nodes 487 488 if err := os.Remove(path); err != nil { 489 return err 490 } 491 return os.Rename(tempPath, path) 492 } 493 494 // IsDepsEntryLiveFor returns if the deps entry for a node is still reachable 495 // from the manifest. 496 // 497 // The deps log can contain deps entries for files that were built in the 498 // past but are no longer part of the manifest. This function returns if 499 // this is the case for a given node. This function is slow, don't call 500 // it from code that runs on every build. 501 func (d *DepsLog) IsDepsEntryLiveFor(node *Node) bool { 502 // Skip entries that don't have in-edges or whose edges don't have a 503 // "deps" attribute. They were in the deps log from previous builds, but 504 // the the files they were for were removed from the build and their deps 505 // entries are no longer needed. 506 // (Without the check for "deps", a chain of two or more nodes that each 507 // had deps wouldn't be collected in a single recompaction.) 508 return node.InEdge != nil && node.InEdge.GetBinding("deps") != "" 509 } 510 511 // Updates the in-memory representation. Takes ownership of |deps|. 512 // Returns true if a prior deps record was deleted. 513 func (d *DepsLog) updateDeps(outID int32, deps *Deps) bool { 514 if n := int(outID) + 1 - len(d.Deps); n > 0 { 515 d.Deps = append(d.Deps, make([]*Deps, n)...) 516 } 517 existed := d.Deps[outID] != nil 518 d.Deps[outID] = deps 519 return existed 520 } 521 522 var zeroBytes [4]byte 523 524 // Write a node name record, assigning it an id. 525 func (d *DepsLog) recordID(node *Node) error { 526 if node.Path == "" { 527 return errors.New("node.Path is empty") 528 } 529 pathSize := len(node.Path) 530 padding := (4 - pathSize%4) % 4 // Pad path to 4 byte boundary. 531 532 size := uint32(pathSize + padding + 4) 533 if size > maxRecordSize { 534 return errors.New("node.Path is too long") 535 } 536 if err := d.openForWriteIfNeeded(); err != nil { 537 return nil 538 } 539 if err := binary.Write(d.buf, binary.LittleEndian, size); err != nil { 540 return nil 541 } 542 if _, err := d.buf.WriteString(node.Path); err != nil { 543 return nil 544 } 545 if padding != 0 { 546 if _, err := d.buf.Write(zeroBytes[:padding]); err != nil { 547 return nil 548 } 549 } 550 id := int32(len(d.Nodes)) 551 checksum := ^uint32(id) 552 if err := binary.Write(d.buf, binary.LittleEndian, checksum); err != nil { 553 return nil 554 } 555 if err := d.buf.Flush(); err != nil { 556 return nil 557 } 558 node.ID = id 559 d.Nodes = append(d.Nodes, node) 560 return nil 561 } 562 563 // openForWriteIfNeeded should be called before using file. 564 func (d *DepsLog) openForWriteIfNeeded() error { 565 if d.filePath == "" { 566 return nil 567 } 568 if d.file != nil { 569 panic("surprising state") 570 } 571 var err error 572 d.file, err = os.OpenFile(d.filePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o666) 573 if err != nil { 574 return err 575 } 576 // Set the buffer size large and flush the file buffer after every record to 577 // make sure records aren't written partially. 578 d.buf = bufio.NewWriterSize(d.file, maxRecordSize+1) 579 580 // Opening a file in append mode doesn't set the file pointer to the file's 581 // end on Windows. Do that explicitly. 582 offset, err := d.file.Seek(0, os.SEEK_END) 583 if err != nil { 584 return err 585 } 586 587 if offset == 0 { 588 if _, err = d.buf.WriteString(depsLogFileSignature); err != nil { 589 return err 590 } 591 if err = binary.Write(d.buf, binary.LittleEndian, depsLogCurrentVersion); err != nil { 592 return err 593 } 594 } 595 if err = d.buf.Flush(); err != nil { 596 return err 597 } 598 d.filePath = "" 599 return nil 600 }