github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/manifest/version_edit.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package manifest

import (
	"bufio"
	"bytes"
	stdcmp "cmp"
	"encoding/binary"
	"fmt"
	"io"
	"slices"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
)

// TODO(peter): describe the MANIFEST file format, independently of the C++
// project.

var errCorruptManifest = base.CorruptionErrorf("pebble: corrupt manifest")

type byteReader interface {
	io.ByteReader
	io.Reader
}

// Tags for the versionEdit disk format.
// Tag 8 is no longer used.
const (
	// LevelDB tags.
	tagComparator     = 1
	tagLogNumber      = 2
	tagNextFileNumber = 3
	tagLastSequence   = 4
	tagCompactPointer = 5
	tagDeletedFile    = 6
	tagNewFile        = 7
	tagPrevLogNumber  = 9

	// RocksDB tags.
	tagNewFile2         = 100
	tagNewFile3         = 102
	tagNewFile4         = 103
	tagColumnFamily     = 200
	tagColumnFamilyAdd  = 201
	tagColumnFamilyDrop = 202
	tagMaxColumnFamily  = 203

	// Pebble tags.
	tagNewFile5            = 104 // Range keys.
	tagCreatedBackingTable = 105
	tagRemovedBackingTable = 106

	// The custom tags sub-format used by tagNewFile4 and above.
	customTagTerminate         = 1
	customTagNeedsCompaction   = 2
	customTagCreationTime      = 6
	customTagPathID            = 65
	customTagNonSafeIgnoreMask = 1 << 6
	customTagVirtual           = 66
)

// DeletedFileEntry holds the state for a file deletion from a level. The file
// itself might still be referenced by another level.
type DeletedFileEntry struct {
	Level   int
	FileNum base.FileNum
}

// NewFileEntry holds the state for a new file or one moved from a different
// level.
type NewFileEntry struct {
	Level int
	Meta  *FileMetadata
	// BackingFileNum is only set during manifest replay, and only for virtual
	// sstables.
	BackingFileNum base.DiskFileNum
}

// VersionEdit holds the state for an edit to a Version along with other
// on-disk state (log numbers, next file number, and the last sequence number).
type VersionEdit struct {
	// ComparerName is the value of Options.Comparer.Name. This is only set in
	// the first VersionEdit in a manifest (either when the DB is created, or
	// when a new manifest is created) and is used to verify that the comparer
	// specified at Open matches the comparer that was previously used.
	ComparerName string

	// MinUnflushedLogNum is the smallest WAL log file number corresponding to
	// mutations that have not been flushed to an sstable.
	//
	// This is an optional field, and a value of 0 indicates that it is not set.
	MinUnflushedLogNum base.DiskFileNum

	// ObsoletePrevLogNum is a historic artifact from LevelDB that is not used by
	// Pebble, RocksDB, or even LevelDB. Its use in LevelDB was deprecated in
	// 6/2011. We keep it around purely for informational purposes when
	// displaying MANIFEST contents.
	ObsoletePrevLogNum uint64

	// The next file number. A single counter is used to assign file numbers
	// for the WAL, MANIFEST, sstable, and OPTIONS files.
	NextFileNum uint64

	// LastSeqNum is an upper bound on the sequence numbers that have been
	// assigned in flushed WALs. Unflushed WALs (that will be replayed during
	// recovery) may contain sequence numbers greater than this value.
	LastSeqNum uint64

	// A file num may be present in both deleted files and new files when it
	// is moved from a lower level to a higher level (when the compaction
	// found that there was no overlapping file at the higher level).
	DeletedFiles map[DeletedFileEntry]*FileMetadata
	NewFiles     []NewFileEntry
	// CreatedBackingTables can be used to preserve the FileBacking associated
	// with a physical sstable. This is useful when virtual sstables in the
	// latest version are reconstructed during manifest replay, and we also need
	// to reconstruct the FileBacking which is required by these virtual
	// sstables.
	//
	// INVARIANT: The FileBacking associated with a physical sstable must only
	// be added as a backing file in the same version edit where the physical
	// sstable is first virtualized. This means that the physical sstable must
	// be present in DeletedFiles and that there must be at least one virtual
	// sstable with the same FileBacking as the physical sstable in NewFiles. A
	// file must be present in CreatedBackingTables in exactly one version edit.
	// The physical sstable associated with the FileBacking must also not be
	// present in NewFiles.
	CreatedBackingTables []*FileBacking
	// RemovedBackingTables is used to remove the FileBacking associated with a
	// virtual sstable. Note that a backing sstable can be removed as soon as
	// there are no virtual sstables in the latest version which are using the
	// backing sstable, but the backing sstable doesn't necessarily have to be
	// removed atomically with the version edit which removes the last virtual
	// sstable associated with the backing sstable. The removal can happen in a
	// future version edit.
	//
	// INVARIANT: A file must only be added to RemovedBackingTables if it was
	// added to CreatedBackingTables in a prior version edit. The same version
	// edit also cannot have the same file present in both CreatedBackingTables
	// and RemovedBackingTables. A file must be present in RemovedBackingTables
	// in exactly one version edit.
	RemovedBackingTables []base.DiskFileNum
}
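
// Editor's sketch (not part of the original file): how a VersionEdit
// round-trips through Encode and Decode, both defined later in this file.
// The field values and the helper name exampleVersionEditRoundTrip are
// invented for illustration.
func exampleVersionEditRoundTrip() (*VersionEdit, error) {
	ve := &VersionEdit{
		ComparerName:       "leveldb.BytewiseComparator",
		MinUnflushedLogNum: base.DiskFileNum(3),
		NextFileNum:        7,
		LastSeqNum:         42,
	}
	var buf bytes.Buffer
	if err := ve.Encode(&buf); err != nil {
		return nil, err
	}
	// Decode reads the uvarint-tagged fields back into a fresh edit.
	var decoded VersionEdit
	if err := decoded.Decode(&buf); err != nil {
		return nil, err
	}
	return &decoded, nil
}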

// Decode decodes an edit from the specified reader.
//
// Note that the Decode step will not set the FileBacking for virtual sstables
// and the responsibility is left to the caller. However, the Decode step will
// populate the NewFileEntry.BackingFileNum in VersionEdit.NewFiles.
func (v *VersionEdit) Decode(r io.Reader) error {
	br, ok := r.(byteReader)
	if !ok {
		br = bufio.NewReader(r)
	}
	d := versionEditDecoder{br}
	for {
		tag, err := binary.ReadUvarint(br)
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		switch tag {
		case tagComparator:
			s, err := d.readBytes()
			if err != nil {
				return err
			}
			v.ComparerName = string(s)

		case tagLogNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.MinUnflushedLogNum = base.DiskFileNum(n)

		case tagNextFileNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.NextFileNum = n

		case tagLastSequence:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.LastSeqNum = n

		case tagCompactPointer:
			if _, err := d.readLevel(); err != nil {
				return err
			}
			if _, err := d.readBytes(); err != nil {
				return err
			}
			// NB: RocksDB does not use compaction pointers anymore.

		case tagRemovedBackingTable:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.RemovedBackingTables = append(
				v.RemovedBackingTables, base.FileNum(n).DiskFileNum(),
			)
		case tagCreatedBackingTable:
			dfn, err := d.readUvarint()
			if err != nil {
				return err
			}
			size, err := d.readUvarint()
			if err != nil {
				return err
			}
			fileBacking := &FileBacking{
				DiskFileNum: base.FileNum(dfn).DiskFileNum(),
				Size:        size,
			}
			v.CreatedBackingTables = append(v.CreatedBackingTables, fileBacking)
		case tagDeletedFile:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readFileNum()
			if err != nil {
				return err
			}
			if v.DeletedFiles == nil {
				v.DeletedFiles = make(map[DeletedFileEntry]*FileMetadata)
			}
			v.DeletedFiles[DeletedFileEntry{level, fileNum}] = nil

		case tagNewFile, tagNewFile2, tagNewFile3, tagNewFile4, tagNewFile5:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readFileNum()
			if err != nil {
				return err
			}
			if tag == tagNewFile3 {
				// The pathID field appears unused in RocksDB.
				_ /* pathID */, err := d.readUvarint()
				if err != nil {
					return err
				}
			}
			size, err := d.readUvarint()
			if err != nil {
				return err
			}
			// We read the smallest / largest key bounds differently depending on
			// whether we have point, range or both types of keys present in the
			// table.
			var (
				smallestPointKey, largestPointKey []byte
				smallestRangeKey, largestRangeKey []byte
				parsedPointBounds                 bool
				boundsMarker                      byte
			)
			if tag != tagNewFile5 {
				// Range keys not present in the table. Parse the point key bounds.
				smallestPointKey, err = d.readBytes()
				if err != nil {
					return err
				}
				largestPointKey, err = d.readBytes()
				if err != nil {
					return err
				}
			} else {
				// Range keys are present in the table. Determine whether we have point
				// keys to parse, in addition to the bounds.
				boundsMarker, err = d.ReadByte()
				if err != nil {
					return err
				}
				// Parse point key bounds, if present.
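				// Editor's note (not in the original file): the mask constants used
				// below are defined elsewhere in this package. Assuming the layout is
				// one bit per flag (contains-point-keys, smallest-is-point-key,
				// largest-is-point-key), a marker of 0b101 would describe a table that
				// has point keys and whose overall largest bound is a point key, while
				// its overall smallest bound comes from its range keys.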
				if boundsMarker&maskContainsPointKeys > 0 {
					smallestPointKey, err = d.readBytes()
					if err != nil {
						return err
					}
					largestPointKey, err = d.readBytes()
					if err != nil {
						return err
					}
					parsedPointBounds = true
				} else {
					// The table does not have point keys.
					// Sanity check: the bounds must be range keys.
					if boundsMarker&maskSmallest != 0 || boundsMarker&maskLargest != 0 {
						return base.CorruptionErrorf(
							"new-file-4-range-keys: table without point keys has point key bounds: marker=%x",
							boundsMarker,
						)
					}
				}
				// Parse range key bounds.
				smallestRangeKey, err = d.readBytes()
				if err != nil {
					return err
				}
				largestRangeKey, err = d.readBytes()
				if err != nil {
					return err
				}
			}
			var smallestSeqNum uint64
			var largestSeqNum uint64
			if tag != tagNewFile {
				smallestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
				largestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
			}
			var markedForCompaction bool
			var creationTime uint64
			virtualState := struct {
				virtual        bool
				backingFileNum uint64
			}{}
			if tag == tagNewFile4 || tag == tagNewFile5 {
				for {
					customTag, err := d.readUvarint()
					if err != nil {
						return err
					}
					if customTag == customTagTerminate {
						break
					} else if customTag == customTagVirtual {
						virtualState.virtual = true
						n, err := d.readUvarint()
						if err != nil {
							return err
						}
						virtualState.backingFileNum = n
						continue
					}

					field, err := d.readBytes()
					if err != nil {
						return err
					}
					switch customTag {
					case customTagNeedsCompaction:
						if len(field) != 1 {
							return base.CorruptionErrorf("new-file4: need-compaction field wrong size")
						}
						markedForCompaction = (field[0] == 1)

					case customTagCreationTime:
						var n int
						creationTime, n = binary.Uvarint(field)
						if n != len(field) {
							return base.CorruptionErrorf("new-file4: invalid file creation time")
						}

					case customTagPathID:
						return base.CorruptionErrorf("new-file4: path-id field not supported")

					default:
						if (customTag & customTagNonSafeIgnoreMask) != 0 {
							return base.CorruptionErrorf("new-file4: custom field not supported: %d", customTag)
						}
					}
				}
			}
			m := &FileMetadata{
				FileNum:             fileNum,
				Size:                size,
				CreationTime:        int64(creationTime),
				SmallestSeqNum:      smallestSeqNum,
				LargestSeqNum:       largestSeqNum,
				MarkedForCompaction: markedForCompaction,
				Virtual:             virtualState.virtual,
			}
			if tag != tagNewFile5 { // no range keys present
				m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey)
				m.LargestPointKey = base.DecodeInternalKey(largestPointKey)
				m.HasPointKeys = true
				m.Smallest, m.Largest = m.SmallestPointKey, m.LargestPointKey
				m.boundTypeSmallest, m.boundTypeLargest = boundTypePointKey, boundTypePointKey
			} else { // range keys present
				// Set point key bounds, if parsed.
				if parsedPointBounds {
					m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey)
					m.LargestPointKey = base.DecodeInternalKey(largestPointKey)
					m.HasPointKeys = true
				}
				// Set range key bounds.
				m.SmallestRangeKey = base.DecodeInternalKey(smallestRangeKey)
				m.LargestRangeKey = base.DecodeInternalKey(largestRangeKey)
				m.HasRangeKeys = true
				// Set overall bounds (by default assume range keys).
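				// Editor's illustration (not in the original file): for a table
				// holding point keys c..x and range keys a..q, the overall smallest
				// bound is the range-key bound "a" while the overall largest is the
				// point-key bound "x", so the marker would have only its
				// largest-is-point-key bit set (besides the contains-point-keys bit)
				// and the code below overrides m.Largest accordingly.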
				m.Smallest, m.Largest = m.SmallestRangeKey, m.LargestRangeKey
				m.boundTypeSmallest, m.boundTypeLargest = boundTypeRangeKey, boundTypeRangeKey
				if boundsMarker&maskSmallest == maskSmallest {
					m.Smallest = m.SmallestPointKey
					m.boundTypeSmallest = boundTypePointKey
				}
				if boundsMarker&maskLargest == maskLargest {
					m.Largest = m.LargestPointKey
					m.boundTypeLargest = boundTypePointKey
				}
			}
			m.boundsSet = true
			if !virtualState.virtual {
				m.InitPhysicalBacking()
			}

			nfe := NewFileEntry{
				Level: level,
				Meta:  m,
			}
			if virtualState.virtual {
				nfe.BackingFileNum = base.FileNum(virtualState.backingFileNum).DiskFileNum()
			}
			v.NewFiles = append(v.NewFiles, nfe)

		case tagPrevLogNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.ObsoletePrevLogNum = n

		case tagColumnFamily, tagColumnFamilyAdd, tagColumnFamilyDrop, tagMaxColumnFamily:
			return base.CorruptionErrorf("column families are not supported")

		default:
			return errCorruptManifest
		}
	}
	return nil
}

func (v *VersionEdit) string(verbose bool, fmtKey base.FormatKey) string {
	var buf bytes.Buffer
	if v.ComparerName != "" {
		fmt.Fprintf(&buf, " comparer: %s\n", v.ComparerName)
	}
	if v.MinUnflushedLogNum != 0 {
		fmt.Fprintf(&buf, " log-num: %d\n", v.MinUnflushedLogNum)
	}
	if v.ObsoletePrevLogNum != 0 {
		fmt.Fprintf(&buf, " prev-log-num: %d\n", v.ObsoletePrevLogNum)
	}
	if v.NextFileNum != 0 {
		fmt.Fprintf(&buf, " next-file-num: %d\n", v.NextFileNum)
	}
	if v.LastSeqNum != 0 {
		fmt.Fprintf(&buf, " last-seq-num: %d\n", v.LastSeqNum)
	}
	entries := make([]DeletedFileEntry, 0, len(v.DeletedFiles))
	for df := range v.DeletedFiles {
		entries = append(entries, df)
	}
	slices.SortFunc(entries, func(a, b DeletedFileEntry) int {
		if v := stdcmp.Compare(a.Level, b.Level); v != 0 {
			return v
		}
		return stdcmp.Compare(a.FileNum, b.FileNum)
	})
	for _, df := range entries {
		fmt.Fprintf(&buf, " deleted: L%d %s\n", df.Level, df.FileNum)
	}
	for _, nf := range v.NewFiles {
		fmt.Fprintf(&buf, " added: L%d", nf.Level)
		if verbose {
			fmt.Fprintf(&buf, " %s", nf.Meta.DebugString(fmtKey, true /* verbose */))
		} else {
			fmt.Fprintf(&buf, " %s", nf.Meta.String())
		}
		if nf.Meta.CreationTime != 0 {
			fmt.Fprintf(&buf, " (%s)",
				time.Unix(nf.Meta.CreationTime, 0).UTC().Format(time.RFC3339))
		}
		fmt.Fprintln(&buf)
	}
	return buf.String()
}

// DebugString is a more verbose version of String(). Use this in tests.
func (v *VersionEdit) DebugString(fmtKey base.FormatKey) string {
	return v.string(true /* verbose */, fmtKey)
}

// String implements fmt.Stringer for a VersionEdit.
func (v *VersionEdit) String() string {
	return v.string(false /* verbose */, base.DefaultFormatter)
}

// Encode encodes an edit to the specified writer.
func (v *VersionEdit) Encode(w io.Writer) error {
	e := versionEditEncoder{new(bytes.Buffer)}

	if v.ComparerName != "" {
		e.writeUvarint(tagComparator)
		e.writeString(v.ComparerName)
	}
	if v.MinUnflushedLogNum != 0 {
		e.writeUvarint(tagLogNumber)
		e.writeUvarint(uint64(v.MinUnflushedLogNum))
	}
	if v.ObsoletePrevLogNum != 0 {
		e.writeUvarint(tagPrevLogNumber)
		e.writeUvarint(v.ObsoletePrevLogNum)
	}
	if v.NextFileNum != 0 {
		e.writeUvarint(tagNextFileNumber)
		e.writeUvarint(uint64(v.NextFileNum))
	}
	for _, dfn := range v.RemovedBackingTables {
		e.writeUvarint(tagRemovedBackingTable)
		e.writeUvarint(uint64(dfn.FileNum()))
	}
	for _, fileBacking := range v.CreatedBackingTables {
		e.writeUvarint(tagCreatedBackingTable)
		e.writeUvarint(uint64(fileBacking.DiskFileNum.FileNum()))
		e.writeUvarint(fileBacking.Size)
	}
	// RocksDB requires LastSeqNum to be encoded for the first MANIFEST entry,
	// even though its value is zero. We detect this by encoding LastSeqNum when
	// ComparerName is set.
	if v.LastSeqNum != 0 || v.ComparerName != "" {
		e.writeUvarint(tagLastSequence)
		e.writeUvarint(v.LastSeqNum)
	}
	for x := range v.DeletedFiles {
		e.writeUvarint(tagDeletedFile)
		e.writeUvarint(uint64(x.Level))
		e.writeUvarint(uint64(x.FileNum))
	}
	for _, x := range v.NewFiles {
		customFields := x.Meta.MarkedForCompaction || x.Meta.CreationTime != 0 || x.Meta.Virtual
		var tag uint64
		switch {
		case x.Meta.HasRangeKeys:
			tag = tagNewFile5
		case customFields:
			tag = tagNewFile4
		default:
			tag = tagNewFile2
		}
		e.writeUvarint(tag)
		e.writeUvarint(uint64(x.Level))
		e.writeUvarint(uint64(x.Meta.FileNum))
		e.writeUvarint(x.Meta.Size)
		if !x.Meta.HasRangeKeys {
			// If we have no range keys, preserve the original format and write the
			// smallest and largest point keys.
			e.writeKey(x.Meta.SmallestPointKey)
			e.writeKey(x.Meta.LargestPointKey)
		} else {
			// When range keys are present, we first write a marker byte that
			// indicates if the table also contains point keys, in addition to how the
			// overall bounds for the table should be reconstructed. This byte is
			// followed by the keys themselves.
			b, err := x.Meta.boundsMarker()
			if err != nil {
				return err
			}
			if err = e.WriteByte(b); err != nil {
				return err
			}
			// Write point key bounds (if present).
			if x.Meta.HasPointKeys {
				e.writeKey(x.Meta.SmallestPointKey)
				e.writeKey(x.Meta.LargestPointKey)
			}
			// Write range key bounds.
			e.writeKey(x.Meta.SmallestRangeKey)
			e.writeKey(x.Meta.LargestRangeKey)
		}
		e.writeUvarint(x.Meta.SmallestSeqNum)
		e.writeUvarint(x.Meta.LargestSeqNum)
		if customFields {
			if x.Meta.CreationTime != 0 {
				e.writeUvarint(customTagCreationTime)
				var buf [binary.MaxVarintLen64]byte
				n := binary.PutUvarint(buf[:], uint64(x.Meta.CreationTime))
				e.writeBytes(buf[:n])
			}
			if x.Meta.MarkedForCompaction {
				e.writeUvarint(customTagNeedsCompaction)
				e.writeBytes([]byte{1})
			}
			if x.Meta.Virtual {
				e.writeUvarint(customTagVirtual)
				e.writeUvarint(uint64(x.Meta.FileBacking.DiskFileNum.FileNum()))
			}
			e.writeUvarint(customTagTerminate)
		}
	}
	_, err := w.Write(e.Bytes())
	return err
}
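
// Editor's sketch (not part of the original file): the custom-field trailer
// carried by tagNewFile4 and tagNewFile5 entries is a flat sequence of custom
// tags, each followed by its payload, terminated by customTagTerminate. The
// helper name exampleCustomTagTrailer is invented for illustration.
func exampleCustomTagTrailer() []byte {
	e := versionEditEncoder{new(bytes.Buffer)}
	e.writeUvarint(customTagNeedsCompaction)
	e.writeBytes([]byte{1}) // length-prefixed payload: marked for compaction
	e.writeUvarint(customTagTerminate)
	return e.Bytes()
}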

// versionEditDecoder should be used to decode version edits.
type versionEditDecoder struct {
	byteReader
}

func (d versionEditDecoder) readBytes() ([]byte, error) {
	n, err := d.readUvarint()
	if err != nil {
		return nil, err
	}
	s := make([]byte, n)
	_, err = io.ReadFull(d, s)
	if err != nil {
		if err == io.ErrUnexpectedEOF {
			return nil, errCorruptManifest
		}
		return nil, err
	}
	return s, nil
}

func (d versionEditDecoder) readLevel() (int, error) {
	u, err := d.readUvarint()
	if err != nil {
		return 0, err
	}
	if u >= NumLevels {
		return 0, errCorruptManifest
	}
	return int(u), nil
}

func (d versionEditDecoder) readFileNum() (base.FileNum, error) {
	u, err := d.readUvarint()
	if err != nil {
		return 0, err
	}
	return base.FileNum(u), nil
}

func (d versionEditDecoder) readUvarint() (uint64, error) {
	u, err := binary.ReadUvarint(d)
	if err != nil {
		if err == io.EOF {
			return 0, errCorruptManifest
		}
		return 0, err
	}
	return u, nil
}

type versionEditEncoder struct {
	*bytes.Buffer
}

func (e versionEditEncoder) writeBytes(p []byte) {
	e.writeUvarint(uint64(len(p)))
	e.Write(p)
}

func (e versionEditEncoder) writeKey(k InternalKey) {
	e.writeUvarint(uint64(k.Size()))
	e.Write(k.UserKey)
	buf := k.EncodeTrailer()
	e.Write(buf[:])
}

func (e versionEditEncoder) writeString(s string) {
	e.writeUvarint(uint64(len(s)))
	e.WriteString(s)
}

func (e versionEditEncoder) writeUvarint(u uint64) {
	var buf [binary.MaxVarintLen64]byte
	n := binary.PutUvarint(buf[:], u)
	e.Write(buf[:n])
}
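
// Editor's sketch (not part of the original file): every string and key in
// this format is length-prefixed with a uvarint, so a round trip through the
// helpers above looks as follows. The helper name exampleLengthPrefixed is
// invented for illustration.
func exampleLengthPrefixed() (string, error) {
	e := versionEditEncoder{new(bytes.Buffer)}
	e.writeString("pebble.internal.testkeys") // uvarint length, then raw bytes
	d := versionEditDecoder{bytes.NewReader(e.Bytes())}
	s, err := d.readBytes()
	if err != nil {
		return "", err
	}
	return string(s), nil
}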

// BulkVersionEdit summarizes the files added and deleted from a set of version
// edits.
//
// INVARIANTS:
// No file can be added to a level more than once. This is true globally, and
// also true for all of the calls to Accumulate for a single bulk version edit.
//
// No file can be removed from a level more than once. This is true globally,
// and also true for all of the calls to Accumulate for a single bulk version
// edit.
//
// A file must not be added and removed from a given level in the same version
// edit.
//
// A file that is being removed from a level must have been added to that level
// before (in a prior version edit). Note that a given file can be deleted from
// a level and added to another level in a single version edit.
type BulkVersionEdit struct {
	Added   [NumLevels]map[base.FileNum]*FileMetadata
	Deleted [NumLevels]map[base.FileNum]*FileMetadata

	// AddedFileBacking is a map to support lookup so that we can populate the
	// FileBacking of virtual sstables during manifest replay.
	AddedFileBacking   map[base.DiskFileNum]*FileBacking
	RemovedFileBacking []base.DiskFileNum

	// AddedByFileNum maps file number to file metadata for all added files
	// from accumulated version edits. AddedByFileNum is only populated if set
	// to non-nil by a caller. It must be set to non-nil when replaying
	// version edits read from a MANIFEST (as opposed to VersionEdits
	// constructed in-memory). While replaying a MANIFEST file,
	// VersionEdit.DeletedFiles map entries have nil values, because the
	// on-disk deletion record encodes only the file number. Accumulate
	// uses AddedByFileNum to correctly populate the BulkVersionEdit's Deleted
	// field with non-nil *FileMetadata.
	AddedByFileNum map[base.FileNum]*FileMetadata

	// MarkedForCompactionCountDiff holds the aggregated count of files
	// marked for compaction added or removed.
	MarkedForCompactionCountDiff int
}
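
// Editor's sketch (not part of the original file): accumulating two edits
// into a single BulkVersionEdit using Accumulate (defined below). If the
// first edit adds a file to a level and the second deletes it from that
// level, the file cancels out of the bulk edit. The helper name
// exampleAccumulate is invented for illustration.
func exampleAccumulate(ve1, ve2 *VersionEdit) (*BulkVersionEdit, error) {
	var b BulkVersionEdit
	// AddedByFileNum must be non-nil when the edits come from a MANIFEST,
	// because on-disk deletion records carry only file numbers.
	b.AddedByFileNum = make(map[base.FileNum]*FileMetadata)
	for _, ve := range []*VersionEdit{ve1, ve2} {
		if err := b.Accumulate(ve); err != nil {
			return nil, err
		}
	}
	return &b, nil
}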

// Accumulate adds the file addition and deletions in the specified version
// edit to the bulk edit's internal state.
//
// INVARIANTS:
// If a file is added to a given level in a call to Accumulate and then removed
// from that level in a subsequent call, the file will not be present in the
// resulting BulkVersionEdit.Deleted for that level.
//
// After accumulation of version edits, the bulk version edit may have
// information about a file which has been deleted from a level, but it may
// not have information about the same file added to the same level. The add
// could've occurred as part of a previous bulk version edit. In this case,
// the deleted file must be present in BulkVersionEdit.Deleted, at the end
// of the accumulation, because we need to decrease the refcount of the
// deleted file in Apply.
func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) error {
	for df, m := range ve.DeletedFiles {
		dmap := b.Deleted[df.Level]
		if dmap == nil {
			dmap = make(map[base.FileNum]*FileMetadata)
			b.Deleted[df.Level] = dmap
		}

		if m == nil {
			// m is nil only when replaying a MANIFEST.
			if b.AddedByFileNum == nil {
				return errors.Errorf("deleted file L%d.%s's metadata is absent and bve.AddedByFileNum is nil", df.Level, df.FileNum)
			}
			m = b.AddedByFileNum[df.FileNum]
			if m == nil {
				return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", df.Level, df.FileNum)
			}
		}
		if m.MarkedForCompaction {
			b.MarkedForCompactionCountDiff--
		}
		if _, ok := b.Added[df.Level][df.FileNum]; !ok {
			dmap[df.FileNum] = m
		} else {
			// Present in b.Added for the same level.
			delete(b.Added[df.Level], df.FileNum)
		}
	}

	// Generate state for added backing files. Note that these must be generated
	// before we loop through the NewFiles, because we need to populate the
	// FileBackings which might be used by the NewFiles loop.
	if b.AddedFileBacking == nil {
		b.AddedFileBacking = make(map[base.DiskFileNum]*FileBacking)
	}
	for _, fb := range ve.CreatedBackingTables {
		if _, ok := b.AddedFileBacking[fb.DiskFileNum]; ok {
			// There is already a FileBacking associated with fb.DiskFileNum.
			// This should never happen. There must always be only one FileBacking
			// associated with a backing sstable.
			panic(fmt.Sprintf("pebble: duplicate file backing %s", fb.DiskFileNum.String()))
		}
		b.AddedFileBacking[fb.DiskFileNum] = fb
	}

	for _, nf := range ve.NewFiles {
		// A new file should not have been deleted in this or a preceding
		// VersionEdit at the same level (though files can move across levels).
		if dmap := b.Deleted[nf.Level]; dmap != nil {
			if _, ok := dmap[nf.Meta.FileNum]; ok {
				return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", nf.Level, nf.Meta.FileNum)
			}
		}
		if nf.Meta.Virtual && nf.Meta.FileBacking == nil {
			// FileBacking for a virtual sstable must only be nil if we're performing
			// manifest replay.
			nf.Meta.FileBacking = b.AddedFileBacking[nf.BackingFileNum]
			if nf.Meta.FileBacking == nil {
				return errors.Errorf("FileBacking for virtual sstable must not be nil")
			}
		} else if nf.Meta.FileBacking == nil {
			return errors.Errorf("added file L%d.%s has no FileBacking", nf.Level, nf.Meta.FileNum)
		}

		if b.Added[nf.Level] == nil {
			b.Added[nf.Level] = make(map[base.FileNum]*FileMetadata)
		}
		b.Added[nf.Level][nf.Meta.FileNum] = nf.Meta
		if b.AddedByFileNum != nil {
			b.AddedByFileNum[nf.Meta.FileNum] = nf.Meta
		}
		if nf.Meta.MarkedForCompaction {
			b.MarkedForCompactionCountDiff++
		}
	}

	// Since a file can be removed from backing files in exactly one version
	// edit it is safe to just append without any de-duplication.
	b.RemovedFileBacking = append(b.RemovedFileBacking, ve.RemovedBackingTables...)

	return nil
}

// AccumulateIncompleteAndApplySingleVE should be called if a single version edit
// is to be applied to the provided curr Version and if the caller needs to
// update the versionSet.zombieTables map. This function exists separately from
// BulkVersionEdit.Apply because it is easier to reason about properties
// regarding BulkVersionEdit.Accumulate/Apply and zombie table generation, if we
// know that exactly one version edit is being accumulated.
//
// Note that the version edit passed into this function may be incomplete
// because compactions don't have the ref counting information necessary to
// populate VersionEdit.RemovedBackingTables. This function will complete such a
// version edit by populating RemovedBackingTables.
//
// Invariant: Any file being deleted through ve must belong to the curr Version.
// We can't have a delete for some arbitrary file which does not exist in curr.
func AccumulateIncompleteAndApplySingleVE(
	ve *VersionEdit,
	curr *Version,
	cmp Compare,
	formatKey base.FormatKey,
	flushSplitBytes int64,
	readCompactionRate int64,
	backingStateMap map[base.DiskFileNum]*FileBacking,
	addBackingFunc func(*FileBacking),
	removeBackingFunc func(base.DiskFileNum),
	orderingInvariants OrderingInvariants,
) (_ *Version, zombies map[base.DiskFileNum]uint64, _ error) {
	if len(ve.RemovedBackingTables) != 0 {
		panic("pebble: invalid incomplete version edit")
	}
	var b BulkVersionEdit
	err := b.Accumulate(ve)
	if err != nil {
		return nil, nil, err
	}
	zombies = make(map[base.DiskFileNum]uint64)
	v, err := b.Apply(
		curr, cmp, formatKey, flushSplitBytes, readCompactionRate, zombies, orderingInvariants,
	)
	if err != nil {
		return nil, nil, err
	}

	for _, s := range b.AddedFileBacking {
		addBackingFunc(s)
	}

	for fileNum := range zombies {
		if _, ok := backingStateMap[fileNum]; ok {
			// This table was backing some virtual sstable in the latest version,
			// but is now a zombie. We add RemovedBackingTables entries for
			// these, before the version edit is written to disk.
			ve.RemovedBackingTables = append(
				ve.RemovedBackingTables, fileNum,
			)
			removeBackingFunc(fileNum)
		}
	}
	return v, zombies, nil
}
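
// Editor's sketch (not part of the original file): how a caller might wire up
// AccumulateIncompleteAndApplySingleVE. The backing map and callbacks stand in
// for versionSet bookkeeping; the helper name, the flush-split and
// read-compaction constants, and the parameter plumbing are all invented for
// illustration.
func exampleApplySingleVE(
	ve *VersionEdit,
	curr *Version,
	cmp Compare,
	formatKey base.FormatKey,
	backings map[base.DiskFileNum]*FileBacking,
	inv OrderingInvariants,
) (*Version, error) {
	v, _, err := AccumulateIncompleteAndApplySingleVE(
		ve, curr, cmp, formatKey,
		2<<20,  // flushSplitBytes: assumed 2 MiB, illustrative only
		32<<10, // readCompactionRate: mirrors the 32KB default noted in Apply
		backings,
		func(fb *FileBacking) { backings[fb.DiskFileNum] = fb },
		func(dfn base.DiskFileNum) { delete(backings, dfn) },
		inv,
	)
	return v, err
}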

// Apply applies the delta b to the current version to produce a new
// version. The new version is consistent with respect to the comparer cmp.
//
// curr may be nil, which is equivalent to a pointer to a zero version.
//
// On success, if a non-nil zombies map is provided to Apply, the map is updated
// with file numbers and file sizes of deleted files. These files are
// considered zombies because they are no longer referenced by the returned
// Version, but cannot be deleted from disk as they are still in use by the
// incoming Version.
func (b *BulkVersionEdit) Apply(
	curr *Version,
	cmp Compare,
	formatKey base.FormatKey,
	flushSplitBytes int64,
	readCompactionRate int64,
	zombies map[base.DiskFileNum]uint64,
	orderingInvariants OrderingInvariants,
) (*Version, error) {
	addZombie := func(state *FileBacking) {
		if zombies != nil {
			zombies[state.DiskFileNum] = state.Size
		}
	}
	removeZombie := func(state *FileBacking) {
		if zombies != nil {
			delete(zombies, state.DiskFileNum)
		}
	}

	v := new(Version)

	// Adjust the count of files marked for compaction.
	if curr != nil {
		v.Stats.MarkedForCompaction = curr.Stats.MarkedForCompaction
	}
	v.Stats.MarkedForCompaction += b.MarkedForCompactionCountDiff
	if v.Stats.MarkedForCompaction < 0 {
		return nil, base.CorruptionErrorf("pebble: version marked for compaction count negative")
	}

	for level := range v.Levels {
		if curr == nil || curr.Levels[level].tree.root == nil {
			v.Levels[level] = makeLevelMetadata(cmp, level, nil /* files */)
		} else {
			v.Levels[level] = curr.Levels[level].clone()
		}
		if curr == nil || curr.RangeKeyLevels[level].tree.root == nil {
			v.RangeKeyLevels[level] = makeLevelMetadata(cmp, level, nil /* files */)
		} else {
			v.RangeKeyLevels[level] = curr.RangeKeyLevels[level].clone()
		}

		if len(b.Added[level]) == 0 && len(b.Deleted[level]) == 0 {
			// There are no edits on this level.
			if level == 0 {
				// Initialize L0Sublevels.
				if curr == nil || curr.L0Sublevels == nil {
					if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
						return nil, errors.Wrap(err, "pebble: internal error")
					}
				} else {
					v.L0Sublevels = curr.L0Sublevels
					v.L0SublevelFiles = v.L0Sublevels.Levels
				}
			}
			continue
		}

		// Some edits on this level.
		lm := &v.Levels[level]
		lmRange := &v.RangeKeyLevels[level]

		addedFilesMap := b.Added[level]
		deletedFilesMap := b.Deleted[level]
		if n := v.Levels[level].Len() + len(addedFilesMap); n == 0 {
			return nil, base.CorruptionErrorf(
				"pebble: internal error: No current or added files but have deleted files: %d",
				errors.Safe(len(deletedFilesMap)))
		}

		// NB: addedFilesMap may be empty. If a file is present in addedFilesMap
		// for a level, it won't be present in deletedFilesMap for the same
		// level.

		for _, f := range deletedFilesMap {
			if obsolete := v.Levels[level].remove(f); obsolete {
				// Deleting a file from the B-Tree may decrement its
				// reference count. However, because we cloned the
				// previous level's B-Tree, this should never result in a
				// file's reference count dropping to zero.
				err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during B-Tree removal", level, f.FileNum)
				return nil, err
			}
			if f.HasRangeKeys {
				if obsolete := v.RangeKeyLevels[level].remove(f); obsolete {
					// Deleting a file from the B-Tree may decrement its
					// reference count. However, because we cloned the
					// previous level's B-Tree, this should never result in a
					// file's reference count dropping to zero.
					err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during range-key B-Tree removal", level, f.FileNum)
					return nil, err
				}
			}

			// Note that a backing sst will only become a zombie if the
			// references to it in the latest version are 0. We will remove the
			// backing sst from the zombie list in the next loop if one of the
			// addedFiles in any of the levels is referencing the backing sst.
			// This is possible if a physical sstable is virtualized, or if it
			// is moved.
			latestRefCount := f.LatestRefs()
			if latestRefCount <= 0 {
				// If a file is present in deletedFilesMap for a level, then it
				// must have already been added to the level previously, which
				// means that its latest ref count cannot be 0.
				err := errors.Errorf("pebble: internal error: incorrect latestRefs reference counting for file %s", f.FileNum)
				return nil, err
			} else if f.LatestUnref() == 0 {
				addZombie(f.FileBacking)
			}
		}

		addedFiles := make([]*FileMetadata, 0, len(addedFilesMap))
		for _, f := range addedFilesMap {
			addedFiles = append(addedFiles, f)
		}
		// Sort addedFiles by file number. This isn't necessary, but tests which
		// replay invalid manifests check the error output, and the error output
		// depends on the order in which files are added to the btree.
		slices.SortFunc(addedFiles, func(a, b *FileMetadata) int {
			return stdcmp.Compare(a.FileNum, b.FileNum)
		})

		var sm, la *FileMetadata
		for _, f := range addedFiles {
			// NB: allowedSeeks is used for read triggered compactions. It is set using
			// Options.Experimental.ReadCompactionRate which defaults to 32KB.
			var allowedSeeks int64
			if readCompactionRate != 0 {
				allowedSeeks = int64(f.Size) / readCompactionRate
			}
			if allowedSeeks < 100 {
				allowedSeeks = 100
			}
			f.AllowedSeeks.Store(allowedSeeks)
			f.InitAllowedSeeks = allowedSeeks

			err := lm.insert(f)
			// We're adding this file to the new version, so increment the
			// latest refs count.
			f.LatestRef()
			if err != nil {
				return nil, errors.Wrap(err, "pebble")
			}
			if f.HasRangeKeys {
				err = lmRange.insert(f)
				if err != nil {
					return nil, errors.Wrap(err, "pebble")
				}
			}
			removeZombie(f.FileBacking)
			// Track the files with the smallest and largest keys, so that we can
			// check consistency of the modified span.
			if sm == nil || base.InternalCompare(cmp, sm.Smallest, f.Smallest) > 0 {
				sm = f
			}
			if la == nil || base.InternalCompare(cmp, la.Largest, f.Largest) < 0 {
				la = f
			}
		}

		if level == 0 {
			if curr != nil && curr.L0Sublevels != nil && len(deletedFilesMap) == 0 {
				// Flushes and ingestions that do not delete any L0 files do not require
				// a regeneration of L0Sublevels from scratch. We can instead generate
				// it incrementally.
				var err error
				// AddL0Files requires addedFiles to be sorted in seqnum order.
				SortBySeqNum(addedFiles)
				v.L0Sublevels, err = curr.L0Sublevels.AddL0Files(addedFiles, flushSplitBytes, &v.Levels[0])
				if errors.Is(err, errInvalidL0SublevelsOpt) {
					err = v.InitL0Sublevels(cmp, formatKey, flushSplitBytes)
				} else if invariants.Enabled && err == nil {
					copyOfSublevels, err := NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
					if err != nil {
						panic(fmt.Sprintf("error when regenerating sublevels: %s", err))
					}
					s1 := describeSublevels(base.DefaultFormatter, false /* verbose */, copyOfSublevels.Levels)
					s2 := describeSublevels(base.DefaultFormatter, false /* verbose */, v.L0Sublevels.Levels)
					if s1 != s2 {
						panic(fmt.Sprintf("incremental L0 sublevel generation produced different output than regeneration: %s != %s", s1, s2))
					}
				}
				if err != nil {
					return nil, errors.Wrap(err, "pebble: internal error")
				}
				v.L0SublevelFiles = v.L0Sublevels.Levels
			} else if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
				return nil, errors.Wrap(err, "pebble: internal error")
			}
			if err := CheckOrdering(cmp, formatKey, Level(0), v.Levels[level].Iter(), orderingInvariants); err != nil {
				return nil, errors.Wrap(err, "pebble: internal error")
			}
			continue
		}

		// Check consistency of the level in the vicinity of our edits.
		if sm != nil && la != nil {
			overlap := overlaps(v.Levels[level].Iter(), cmp, sm.Smallest.UserKey,
				la.Largest.UserKey, la.Largest.IsExclusiveSentinel())
			// overlap contains all of the added files. We want to ensure that
			// the added files are consistent with neighboring existing files
			// too, so reslice the overlap to pull in a neighbor on each side.
			check := overlap.Reslice(func(start, end *LevelIterator) {
				if m := start.Prev(); m == nil {
					start.Next()
				}
				if m := end.Next(); m == nil {
					end.Prev()
				}
			})
			if err := CheckOrdering(cmp, formatKey, Level(level), check.Iter(), orderingInvariants); err != nil {
				return nil, errors.Wrap(err, "pebble: internal error")
			}
		}
	}
	return v, nil
}
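
// Editor's sketch (not part of the original file): an end-to-end manifest
// replay loop tying the pieces in this file together: decode each edit,
// accumulate it, then apply the result once to obtain the final Version.
// Record framing (splitting the MANIFEST into per-edit readers) is elided;
// the helper name, constants, and parameters are invented for illustration.
func exampleReplay(
	records []io.Reader,
	cmp Compare,
	formatKey base.FormatKey,
	inv OrderingInvariants,
) (*Version, error) {
	var b BulkVersionEdit
	b.AddedByFileNum = make(map[base.FileNum]*FileMetadata) // required for replay
	for _, rec := range records {
		var ve VersionEdit
		if err := ve.Decode(rec); err != nil {
			return nil, err
		}
		if err := b.Accumulate(&ve); err != nil {
			return nil, err
		}
	}
	v, err := b.Apply(
		nil /* curr */, cmp, formatKey,
		2<<20,  // flushSplitBytes: assumed, illustrative only
		32<<10, // readCompactionRate: mirrors the 32KB default noted above
		nil /* zombies */, inv,
	)
	return v, err
}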