github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/manifest/version_edit.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package manifest

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"sort"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
)

// TODO(peter): describe the MANIFEST file format, independently of the C++
// project.

var errCorruptManifest = base.CorruptionErrorf("pebble: corrupt manifest")

type byteReader interface {
	io.ByteReader
	io.Reader
}

// Tags for the versionEdit disk format.
// Tag 8 is no longer used.
const (
	// LevelDB tags.
	tagComparator     = 1
	tagLogNumber      = 2
	tagNextFileNumber = 3
	tagLastSequence   = 4
	tagCompactPointer = 5
	tagDeletedFile    = 6
	tagNewFile        = 7
	tagPrevLogNumber  = 9

	// RocksDB tags.
	tagNewFile2         = 100
	tagNewFile3         = 102
	tagNewFile4         = 103
	tagColumnFamily     = 200
	tagColumnFamilyAdd  = 201
	tagColumnFamilyDrop = 202
	tagMaxColumnFamily  = 203

	// Pebble tags.
	tagNewFile5            = 104 // Range keys.
	tagCreatedBackingTable = 105
	tagRemovedBackingTable = 106

	// The custom tags sub-format used by tagNewFile4 and above.
	customTagTerminate         = 1
	customTagNeedsCompaction   = 2
	customTagCreationTime      = 6
	customTagPathID            = 65
	customTagNonSafeIgnoreMask = 1 << 6
	customTagVirtual           = 66
)

// DeletedFileEntry holds the state for a file deletion from a level. The file
// itself might still be referenced by another level.
type DeletedFileEntry struct {
	Level   int
	FileNum base.FileNum
}

// NewFileEntry holds the state for a new file or one moved from a different
// level.
type NewFileEntry struct {
	Level int
	Meta  *FileMetadata
	// BackingFileNum is only set during manifest replay, and only for virtual
	// sstables.
	BackingFileNum base.DiskFileNum
}

// VersionEdit holds the state for an edit to a Version along with other
// on-disk state (log numbers, next file number, and the last sequence number).
type VersionEdit struct {
	// ComparerName is the value of Options.Comparer.Name. This is only set in
	// the first VersionEdit in a manifest (either when the DB is created, or
	// when a new manifest is created) and is used to verify that the comparer
	// specified at Open matches the comparer that was previously used.
	ComparerName string

	// MinUnflushedLogNum is the smallest WAL log file number corresponding to
	// mutations that have not been flushed to an sstable.
	//
	// This is an optional field; a value of 0 means it is not set.
	MinUnflushedLogNum base.FileNum

	// ObsoletePrevLogNum is a historic artifact from LevelDB that is not used by
	// Pebble, RocksDB, or even LevelDB. Its use in LevelDB was deprecated in
	// 6/2011. We keep it around purely for informational purposes when
	// displaying MANIFEST contents.
	ObsoletePrevLogNum uint64

	// The next file number. A single counter is used to assign file numbers
	// for the WAL, MANIFEST, sstable, and OPTIONS files.
	NextFileNum base.FileNum

	// LastSeqNum is an upper bound on the sequence numbers that have been
	// assigned in flushed WALs.
	// Unflushed WALs (that will be replayed during recovery) may contain
	// sequence numbers greater than this value.
	LastSeqNum uint64

	// A file num may be present in both deleted files and new files when it
	// is moved from a lower level to a higher level (when the compaction
	// found that there was no overlapping file at the higher level).
	DeletedFiles map[DeletedFileEntry]*FileMetadata
	NewFiles     []NewFileEntry
	// CreatedBackingTables can be used to preserve the FileBacking associated
	// with a physical sstable. This is useful when virtual sstables in the
	// latest version are reconstructed during manifest replay, and we also need
	// to reconstruct the FileBacking which is required by these virtual
	// sstables.
	//
	// INVARIANT: The FileBacking associated with a physical sstable must only
	// be added as a backing file in the same version edit where the physical
	// sstable is first virtualized. This means that the physical sstable must
	// be present in DeletedFiles and that there must be at least one virtual
	// sstable with the same FileBacking as the physical sstable in NewFiles. A
	// file must be present in CreatedBackingTables in exactly one version edit.
	// The physical sstable associated with the FileBacking must also not be
	// present in NewFiles.
	CreatedBackingTables []*FileBacking
	// RemovedBackingTables is used to remove the FileBacking associated with a
	// virtual sstable. Note that a backing sstable can be removed as soon as
	// there are no virtual sstables in the latest version which are using the
	// backing sstable, but the backing sstable doesn't necessarily have to be
	// removed atomically with the version edit which removes the last virtual
	// sstable associated with the backing sstable. The removal can happen in a
	// future version edit.
	//
	// INVARIANT: A file must only be added to RemovedBackingTables if it was
	// added to CreatedBackingTables in a prior version edit. The same version
	// edit also cannot have the same file present in both CreatedBackingTables
	// and RemovedBackingTables. A file must be present in RemovedBackingTables
	// in exactly one version edit.
	RemovedBackingTables []base.DiskFileNum
}

// Decode decodes an edit from the specified reader.
//
// Note that the Decode step will not set the FileBacking for virtual sstables
// and the responsibility is left to the caller. However, the Decode step will
// populate the NewFileEntry.BackingFileNum in VersionEdit.NewFiles.
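//
// A minimal round-trip sketch (hypothetical usage; a real MANIFEST wraps each
// edit in a record via the record package, which is elided here):
//
//	var buf bytes.Buffer
//	ve := &VersionEdit{MinUnflushedLogNum: 2, NextFileNum: 5}
//	if err := ve.Encode(&buf); err != nil {
//		return err
//	}
//	var decoded VersionEdit
//	if err := decoded.Decode(&buf); err != nil {
//		return err
//	}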
func (v *VersionEdit) Decode(r io.Reader) error {
	br, ok := r.(byteReader)
	if !ok {
		br = bufio.NewReader(r)
	}
	d := versionEditDecoder{br}
	for {
		tag, err := binary.ReadUvarint(br)
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		switch tag {
		case tagComparator:
			s, err := d.readBytes()
			if err != nil {
				return err
			}
			v.ComparerName = string(s)

		case tagLogNumber:
			n, err := d.readFileNum()
			if err != nil {
				return err
			}
			v.MinUnflushedLogNum = n

		case tagNextFileNumber:
			n, err := d.readFileNum()
			if err != nil {
				return err
			}
			v.NextFileNum = n

		case tagLastSequence:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.LastSeqNum = n

		case tagCompactPointer:
			if _, err := d.readLevel(); err != nil {
				return err
			}
			if _, err := d.readBytes(); err != nil {
				return err
			}
			// NB: RocksDB does not use compaction pointers anymore.

		case tagRemovedBackingTable:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.RemovedBackingTables = append(
				v.RemovedBackingTables, base.FileNum(n).DiskFileNum(),
			)
		case tagCreatedBackingTable:
			dfn, err := d.readUvarint()
			if err != nil {
				return err
			}
			size, err := d.readUvarint()
			if err != nil {
				return err
			}
			fileBacking := &FileBacking{
				DiskFileNum: base.FileNum(dfn).DiskFileNum(),
				Size:        size,
			}
			v.CreatedBackingTables = append(v.CreatedBackingTables, fileBacking)
		case tagDeletedFile:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readFileNum()
			if err != nil {
				return err
			}
			if v.DeletedFiles == nil {
				v.DeletedFiles = make(map[DeletedFileEntry]*FileMetadata)
			}
			v.DeletedFiles[DeletedFileEntry{level, fileNum}] = nil

		case tagNewFile, tagNewFile2, tagNewFile3, tagNewFile4, tagNewFile5:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readFileNum()
			if err != nil {
				return err
			}
			if tag == tagNewFile3 {
				// The pathID field appears unused in RocksDB.
				_ /* pathID */, err := d.readUvarint()
				if err != nil {
					return err
				}
			}
			size, err := d.readUvarint()
			if err != nil {
				return err
			}
			// We read the smallest / largest key bounds differently depending on
			// whether we have point, range or both types of keys present in the
			// table.
			var (
				smallestPointKey, largestPointKey []byte
				smallestRangeKey, largestRangeKey []byte
				parsedPointBounds                 bool
				boundsMarker                      byte
			)
			if tag != tagNewFile5 {
				// Range keys not present in the table. Parse the point key bounds.
				smallestPointKey, err = d.readBytes()
				if err != nil {
					return err
				}
				largestPointKey, err = d.readBytes()
				if err != nil {
					return err
				}
			} else {
				// Range keys are present in the table. Determine whether we have point
				// keys to parse, in addition to the bounds.
				boundsMarker, err = d.ReadByte()
				if err != nil {
					return err
				}
				// Parse point key bounds, if present.
				if boundsMarker&maskContainsPointKeys > 0 {
					smallestPointKey, err = d.readBytes()
					if err != nil {
						return err
					}
					largestPointKey, err = d.readBytes()
					if err != nil {
						return err
					}
					parsedPointBounds = true
				} else {
					// The table does not have point keys.
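					// Only the range key bounds follow in the encoding; the
					// remaining marker bits, if set, would claim a point key
					// bound that cannot exist.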
					// Sanity check: the bounds must be range keys.
					if boundsMarker&maskSmallest != 0 || boundsMarker&maskLargest != 0 {
						return base.CorruptionErrorf(
							"new-file-4-range-keys: table without point keys has point key bounds: marker=%x",
							boundsMarker,
						)
					}
				}
				// Parse range key bounds.
				smallestRangeKey, err = d.readBytes()
				if err != nil {
					return err
				}
				largestRangeKey, err = d.readBytes()
				if err != nil {
					return err
				}
			}
			var smallestSeqNum uint64
			var largestSeqNum uint64
			if tag != tagNewFile {
				smallestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
				largestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
			}
			var markedForCompaction bool
			var creationTime uint64
			virtualState := struct {
				virtual        bool
				backingFileNum uint64
			}{}
			if tag == tagNewFile4 || tag == tagNewFile5 {
				for {
					customTag, err := d.readUvarint()
					if err != nil {
						return err
					}
					if customTag == customTagTerminate {
						break
					} else if customTag == customTagVirtual {
						virtualState.virtual = true
						n, err := d.readUvarint()
						if err != nil {
							return err
						}
						virtualState.backingFileNum = n
						continue
					}

					field, err := d.readBytes()
					if err != nil {
						return err
					}
					switch customTag {
					case customTagNeedsCompaction:
						if len(field) != 1 {
							return base.CorruptionErrorf("new-file4: need-compaction field wrong size")
						}
						markedForCompaction = (field[0] == 1)

					case customTagCreationTime:
						var n int
						creationTime, n = binary.Uvarint(field)
						if n != len(field) {
							return base.CorruptionErrorf("new-file4: invalid file creation time")
						}

					case customTagPathID:
						return base.CorruptionErrorf("new-file4: path-id field not supported")

					default:
						if (customTag & customTagNonSafeIgnoreMask) != 0 {
							return base.CorruptionErrorf("new-file4: custom field not supported: %d", customTag)
						}
					}
				}
			}
			m := &FileMetadata{
				FileNum:             fileNum,
				Size:                size,
				CreationTime:        int64(creationTime),
				SmallestSeqNum:      smallestSeqNum,
				LargestSeqNum:       largestSeqNum,
				MarkedForCompaction: markedForCompaction,
				Virtual:             virtualState.virtual,
			}
			if tag != tagNewFile5 { // no range keys present
				m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey)
				m.LargestPointKey = base.DecodeInternalKey(largestPointKey)
				m.HasPointKeys = true
				m.Smallest, m.Largest = m.SmallestPointKey, m.LargestPointKey
				m.boundTypeSmallest, m.boundTypeLargest = boundTypePointKey, boundTypePointKey
			} else { // range keys present
				// Set point key bounds, if parsed.
				if parsedPointBounds {
					m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey)
					m.LargestPointKey = base.DecodeInternalKey(largestPointKey)
					m.HasPointKeys = true
				}
				// Set range key bounds.
				m.SmallestRangeKey = base.DecodeInternalKey(smallestRangeKey)
				m.LargestRangeKey = base.DecodeInternalKey(largestRangeKey)
				m.HasRangeKeys = true
				// Set overall bounds (by default assume range keys).
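				// The marker bits parsed above override either bound with the
				// corresponding point key where the point key is the more
				// extreme bound.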
				m.Smallest, m.Largest = m.SmallestRangeKey, m.LargestRangeKey
				m.boundTypeSmallest, m.boundTypeLargest = boundTypeRangeKey, boundTypeRangeKey
				if boundsMarker&maskSmallest == maskSmallest {
					m.Smallest = m.SmallestPointKey
					m.boundTypeSmallest = boundTypePointKey
				}
				if boundsMarker&maskLargest == maskLargest {
					m.Largest = m.LargestPointKey
					m.boundTypeLargest = boundTypePointKey
				}
			}
			m.boundsSet = true
			if !virtualState.virtual {
				m.InitPhysicalBacking()
			}

			nfe := NewFileEntry{
				Level: level,
				Meta:  m,
			}
			if virtualState.virtual {
				nfe.BackingFileNum = base.FileNum(virtualState.backingFileNum).DiskFileNum()
			}
			v.NewFiles = append(v.NewFiles, nfe)

		case tagPrevLogNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.ObsoletePrevLogNum = n

		case tagColumnFamily, tagColumnFamilyAdd, tagColumnFamilyDrop, tagMaxColumnFamily:
			return base.CorruptionErrorf("column families are not supported")

		default:
			return errCorruptManifest
		}
	}
	return nil
}

func (v *VersionEdit) string(verbose bool, fmtKey base.FormatKey) string {
	var buf bytes.Buffer
	if v.ComparerName != "" {
		fmt.Fprintf(&buf, " comparer: %s\n", v.ComparerName)
	}
	if v.MinUnflushedLogNum != 0 {
		fmt.Fprintf(&buf, " log-num: %d\n", v.MinUnflushedLogNum)
	}
	if v.ObsoletePrevLogNum != 0 {
		fmt.Fprintf(&buf, " prev-log-num: %d\n", v.ObsoletePrevLogNum)
	}
	if v.NextFileNum != 0 {
		fmt.Fprintf(&buf, " next-file-num: %d\n", v.NextFileNum)
	}
	if v.LastSeqNum != 0 {
		fmt.Fprintf(&buf, " last-seq-num: %d\n", v.LastSeqNum)
	}
	entries := make([]DeletedFileEntry, 0, len(v.DeletedFiles))
	for df := range v.DeletedFiles {
		entries = append(entries, df)
	}
	sort.Slice(entries, func(i, j int) bool {
		if entries[i].Level != entries[j].Level {
			return entries[i].Level < entries[j].Level
		}
		return entries[i].FileNum < entries[j].FileNum
	})
	for _, df := range entries {
		fmt.Fprintf(&buf, " deleted: L%d %s\n", df.Level, df.FileNum)
	}
	for _, nf := range v.NewFiles {
		fmt.Fprintf(&buf, " added: L%d", nf.Level)
		if verbose {
			fmt.Fprintf(&buf, " %s", nf.Meta.DebugString(fmtKey, true /* verbose */))
		} else {
			fmt.Fprintf(&buf, " %s", nf.Meta.String())
		}
		if nf.Meta.CreationTime != 0 {
			fmt.Fprintf(&buf, " (%s)",
				time.Unix(nf.Meta.CreationTime, 0).UTC().Format(time.RFC3339))
		}
		fmt.Fprintln(&buf)
	}
	return buf.String()
}

// DebugString is a more verbose version of String(). Use this in tests.
func (v *VersionEdit) DebugString(fmtKey base.FormatKey) string {
	return v.string(true /* verbose */, fmtKey)
}

// String implements fmt.Stringer for a VersionEdit.
func (v *VersionEdit) String() string {
	return v.string(false /* verbose */, base.DefaultFormatter)
}

// Encode encodes an edit to the specified writer.
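//
// Fields at their zero values are omitted, so an edit can be as small as a
// single tagged field; a hypothetical sketch:
//
//	var buf bytes.Buffer
//	e := &VersionEdit{NextFileNum: 42}
//	_ = e.Encode(&buf) // writes two uvarint bytes: tagNextFileNumber, then 42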
func (v *VersionEdit) Encode(w io.Writer) error {
	e := versionEditEncoder{new(bytes.Buffer)}

	if v.ComparerName != "" {
		e.writeUvarint(tagComparator)
		e.writeString(v.ComparerName)
	}
	if v.MinUnflushedLogNum != 0 {
		e.writeUvarint(tagLogNumber)
		e.writeUvarint(uint64(v.MinUnflushedLogNum))
	}
	if v.ObsoletePrevLogNum != 0 {
		e.writeUvarint(tagPrevLogNumber)
		e.writeUvarint(v.ObsoletePrevLogNum)
	}
	if v.NextFileNum != 0 {
		e.writeUvarint(tagNextFileNumber)
		e.writeUvarint(uint64(v.NextFileNum))
	}
	for _, dfn := range v.RemovedBackingTables {
		e.writeUvarint(tagRemovedBackingTable)
		e.writeUvarint(uint64(dfn.FileNum()))
	}
	for _, fileBacking := range v.CreatedBackingTables {
		e.writeUvarint(tagCreatedBackingTable)
		e.writeUvarint(uint64(fileBacking.DiskFileNum.FileNum()))
		e.writeUvarint(fileBacking.Size)
	}
	// RocksDB requires LastSeqNum to be encoded for the first MANIFEST entry,
	// even though its value is zero. We detect this by encoding LastSeqNum when
	// ComparerName is set.
	if v.LastSeqNum != 0 || v.ComparerName != "" {
		e.writeUvarint(tagLastSequence)
		e.writeUvarint(v.LastSeqNum)
	}
	for x := range v.DeletedFiles {
		e.writeUvarint(tagDeletedFile)
		e.writeUvarint(uint64(x.Level))
		e.writeUvarint(uint64(x.FileNum))
	}
	for _, x := range v.NewFiles {
		customFields := x.Meta.MarkedForCompaction || x.Meta.CreationTime != 0 || x.Meta.Virtual
		var tag uint64
		switch {
		case x.Meta.HasRangeKeys:
			tag = tagNewFile5
		case customFields:
			tag = tagNewFile4
		default:
			tag = tagNewFile2
		}
		e.writeUvarint(tag)
		e.writeUvarint(uint64(x.Level))
		e.writeUvarint(uint64(x.Meta.FileNum))
		e.writeUvarint(x.Meta.Size)
		if !x.Meta.HasRangeKeys {
			// If we have no range keys, preserve the original format and write the
			// smallest and largest point keys.
			e.writeKey(x.Meta.SmallestPointKey)
			e.writeKey(x.Meta.LargestPointKey)
		} else {
			// When range keys are present, we first write a marker byte that
			// indicates if the table also contains point keys, in addition to how the
			// overall bounds for the table should be reconstructed. This byte is
			// followed by the keys themselves.
			b, err := x.Meta.boundsMarker()
			if err != nil {
				return err
			}
			if err = e.WriteByte(b); err != nil {
				return err
			}
			// Write point key bounds (if present).
			if x.Meta.HasPointKeys {
				e.writeKey(x.Meta.SmallestPointKey)
				e.writeKey(x.Meta.LargestPointKey)
			}
			// Write range key bounds.
			e.writeKey(x.Meta.SmallestRangeKey)
			e.writeKey(x.Meta.LargestRangeKey)
		}
		e.writeUvarint(x.Meta.SmallestSeqNum)
		e.writeUvarint(x.Meta.LargestSeqNum)
		if customFields {
			if x.Meta.CreationTime != 0 {
				e.writeUvarint(customTagCreationTime)
				var buf [binary.MaxVarintLen64]byte
				n := binary.PutUvarint(buf[:], uint64(x.Meta.CreationTime))
				e.writeBytes(buf[:n])
			}
			if x.Meta.MarkedForCompaction {
				e.writeUvarint(customTagNeedsCompaction)
				e.writeBytes([]byte{1})
			}
			if x.Meta.Virtual {
				e.writeUvarint(customTagVirtual)
				e.writeUvarint(uint64(x.Meta.FileBacking.DiskFileNum.FileNum()))
			}
			e.writeUvarint(customTagTerminate)
		}
	}
	_, err := w.Write(e.Bytes())
	return err
}

// versionEditDecoder should be used to decode version edits.
type versionEditDecoder struct {
	byteReader
}

func (d versionEditDecoder) readBytes() ([]byte, error) {
	n, err := d.readUvarint()
	if err != nil {
		return nil, err
	}
	s := make([]byte, n)
	_, err = io.ReadFull(d, s)
	if err != nil {
		if err == io.ErrUnexpectedEOF {
			return nil, errCorruptManifest
		}
		return nil, err
	}
	return s, nil
}

func (d versionEditDecoder) readLevel() (int, error) {
	u, err := d.readUvarint()
	if err != nil {
		return 0, err
	}
	if u >= NumLevels {
		return 0, errCorruptManifest
	}
	return int(u), nil
}

func (d versionEditDecoder) readFileNum() (base.FileNum, error) {
	u, err := d.readUvarint()
	if err != nil {
		return 0, err
	}
	return base.FileNum(u), nil
}

func (d versionEditDecoder) readUvarint() (uint64, error) {
	u, err := binary.ReadUvarint(d)
	if err != nil {
		if err == io.EOF {
			return 0, errCorruptManifest
		}
		return 0, err
	}
	return u, nil
}

type versionEditEncoder struct {
	*bytes.Buffer
}

func (e versionEditEncoder) writeBytes(p []byte) {
	e.writeUvarint(uint64(len(p)))
	e.Write(p)
}

func (e versionEditEncoder) writeKey(k InternalKey) {
	e.writeUvarint(uint64(k.Size()))
	e.Write(k.UserKey)
	buf := k.EncodeTrailer()
	e.Write(buf[:])
}

func (e versionEditEncoder) writeString(s string) {
	e.writeUvarint(uint64(len(s)))
	e.WriteString(s)
}

func (e versionEditEncoder) writeUvarint(u uint64) {
	var buf [binary.MaxVarintLen64]byte
	n := binary.PutUvarint(buf[:], u)
	e.Write(buf[:n])
}

// BulkVersionEdit summarizes the files added and deleted from a set of version
// edits.
//
// INVARIANTS:
// No file can be added to a level more than once. This is true globally, and
// also true for all of the calls to Accumulate for a single bulk version edit.
//
// No file can be removed from a level more than once. This is true globally,
// and also true for all of the calls to Accumulate for a single bulk version
// edit.
//
// A file must not be added and removed from a given level in the same version
// edit.
//
// A file that is being removed from a level must have been added to that level
// before (in a prior version edit). Note that a given file can be deleted from
// a level and added to another level in a single version edit.
type BulkVersionEdit struct {
	Added   [NumLevels]map[base.FileNum]*FileMetadata
	Deleted [NumLevels]map[base.FileNum]*FileMetadata

	// AddedFileBacking is a map to support lookup so that we can populate the
	// FileBacking of virtual sstables during manifest replay.
	AddedFileBacking   map[base.DiskFileNum]*FileBacking
	RemovedFileBacking []base.DiskFileNum

	// AddedByFileNum maps file number to file metadata for all added files
	// from accumulated version edits. AddedByFileNum is only populated if set
	// to non-nil by a caller. It must be set to non-nil when replaying
	// version edits read from a MANIFEST (as opposed to VersionEdits
	// constructed in-memory). While replaying a MANIFEST file,
	// VersionEdit.DeletedFiles map entries have nil values, because the
	// on-disk deletion record encodes only the file number. Accumulate
	// uses AddedByFileNum to correctly populate the BulkVersionEdit's Deleted
	// field with non-nil *FileMetadata.
	AddedByFileNum map[base.FileNum]*FileMetadata

	// MarkedForCompactionCountDiff holds the aggregated count of files
	// marked for compaction added or removed.
	MarkedForCompactionCountDiff int
}

// Accumulate adds the file addition and deletions in the specified version
// edit to the bulk edit's internal state.
//
// INVARIANTS:
// If a file is added to a given level in a call to Accumulate and then removed
// from that level in a subsequent call, the file will not be present in the
// resulting BulkVersionEdit.Deleted for that level.
//
// After accumulation of version edits, the bulk version edit may have
// information about a file which has been deleted from a level, but it may
// not have information about the same file added to the same level. The add
// could've occurred as part of a previous bulk version edit. In this case,
// the deleted file must be present in BulkVersionEdit.Deleted, at the end
// of the accumulation, because we need to decrease the refcount of the
// deleted file in Apply.
func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) error {
	for df, m := range ve.DeletedFiles {
		dmap := b.Deleted[df.Level]
		if dmap == nil {
			dmap = make(map[base.FileNum]*FileMetadata)
			b.Deleted[df.Level] = dmap
		}

		if m == nil {
			// m is nil only when replaying a MANIFEST.
			if b.AddedByFileNum == nil {
				return errors.Errorf("deleted file L%d.%s's metadata is absent and bve.AddedByFileNum is nil", df.Level, df.FileNum)
			}
			m = b.AddedByFileNum[df.FileNum]
			if m == nil {
				return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", df.Level, df.FileNum)
			}
		}
		if m.MarkedForCompaction {
			b.MarkedForCompactionCountDiff--
		}
		if _, ok := b.Added[df.Level][df.FileNum]; !ok {
			dmap[df.FileNum] = m
		} else {
			// Present in b.Added for the same level.
			delete(b.Added[df.Level], df.FileNum)
		}
	}

	// Generate state for Added backing files. Note that these must be generated
	// before we loop through the NewFiles, because we need to populate the
	// FileBackings which might be used by the NewFiles loop.
	if b.AddedFileBacking == nil {
		b.AddedFileBacking = make(map[base.DiskFileNum]*FileBacking)
	}
	for _, fb := range ve.CreatedBackingTables {
		if _, ok := b.AddedFileBacking[fb.DiskFileNum]; ok {
			// There is already a FileBacking associated with fb.DiskFileNum.
			// This should never happen. There must always be only one FileBacking
			// associated with a backing sstable.
			panic(fmt.Sprintf("pebble: duplicate file backing %s", fb.DiskFileNum.String()))
		}
		b.AddedFileBacking[fb.DiskFileNum] = fb
	}

	for _, nf := range ve.NewFiles {
		// A new file should not have been deleted in this or a preceding
		// VersionEdit at the same level (though files can move across levels).
		if dmap := b.Deleted[nf.Level]; dmap != nil {
			if _, ok := dmap[nf.Meta.FileNum]; ok {
				return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", nf.Level, nf.Meta.FileNum)
			}
		}
		if nf.Meta.Virtual && nf.Meta.FileBacking == nil {
			// FileBacking for a virtual sstable must only be nil if we're performing
			// manifest replay.
			nf.Meta.FileBacking = b.AddedFileBacking[nf.BackingFileNum]
			if nf.Meta.FileBacking == nil {
				return errors.Errorf("FileBacking for virtual sstable must not be nil")
			}
		} else if nf.Meta.FileBacking == nil {
			return errors.Errorf("Added file L%d.%s has no FileBacking", nf.Level, nf.Meta.FileNum)
		}

		if b.Added[nf.Level] == nil {
			b.Added[nf.Level] = make(map[base.FileNum]*FileMetadata)
		}
		b.Added[nf.Level][nf.Meta.FileNum] = nf.Meta
		if b.AddedByFileNum != nil {
			b.AddedByFileNum[nf.Meta.FileNum] = nf.Meta
		}
		if nf.Meta.MarkedForCompaction {
			b.MarkedForCompactionCountDiff++
		}
	}

	// Since a file can be removed from backing files in exactly one version
	// edit it is safe to just append without any de-duplication.
	b.RemovedFileBacking = append(b.RemovedFileBacking, ve.RemovedBackingTables...)

	return nil
}

// AccumulateIncompleteAndApplySingleVE should be called if a single version edit
// is to be applied to the provided curr Version and if the caller needs to
// update the versionSet.zombieTables map. This function exists separately from
// BulkVersionEdit.Apply because it is easier to reason about properties
// regarding BulkVersionEdit.Accumulate/Apply and zombie table generation, if we
// know that exactly one version edit is being accumulated.
//
// Note that the version edit passed into this function may be incomplete
// because compactions don't have the ref counting information necessary to
// populate VersionEdit.RemovedBackingTables. This function will complete such a
// version edit by populating RemovedBackingTables.
//
// Invariant: Any file being deleted through ve must belong to the curr Version.
// We can't have a delete for some arbitrary file which does not exist in curr.
func AccumulateIncompleteAndApplySingleVE(
	ve *VersionEdit,
	curr *Version,
	cmp Compare,
	formatKey base.FormatKey,
	flushSplitBytes int64,
	readCompactionRate int64,
	backingStateMap map[base.DiskFileNum]*FileBacking,
	addBackingFunc func(*FileBacking),
	removeBackingFunc func(base.DiskFileNum),
	orderingInvariants OrderingInvariants,
) (_ *Version, zombies map[base.DiskFileNum]uint64, _ error) {
	if len(ve.RemovedBackingTables) != 0 {
		panic("pebble: invalid incomplete version edit")
	}
	var b BulkVersionEdit
	err := b.Accumulate(ve)
	if err != nil {
		return nil, nil, err
	}
	zombies = make(map[base.DiskFileNum]uint64)
	v, err := b.Apply(
		curr, cmp, formatKey, flushSplitBytes, readCompactionRate, zombies, orderingInvariants,
	)
	if err != nil {
		return nil, nil, err
	}

	for _, s := range b.AddedFileBacking {
		addBackingFunc(s)
	}

	for fileNum := range zombies {
		if _, ok := backingStateMap[fileNum]; ok {
			// This table was backing some virtual sstable in the latest version,
			// but is now a zombie. We add RemovedBackingTables entries for
			// these, before the version edit is written to disk.
			ve.RemovedBackingTables = append(
				ve.RemovedBackingTables, fileNum,
			)
			removeBackingFunc(fileNum)
		}
	}
	return v, zombies, nil
}

// Apply applies the delta b to the current version to produce a new
// version. The new version is consistent with respect to the comparer cmp.
//
// curr may be nil, which is equivalent to a pointer to a zero version.
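//
// Apply clones curr's per-level B-Trees rather than mutating them, so curr
// remains usable and keeps its references to its files.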
//
// On success, if a non-nil zombies map is provided to Apply, the map is updated
// with file numbers and file sizes of deleted files. These files are
// considered zombies because they are no longer referenced by the returned
// Version, but cannot be deleted from disk as they are still in use by the
// incoming Version.
func (b *BulkVersionEdit) Apply(
	curr *Version,
	cmp Compare,
	formatKey base.FormatKey,
	flushSplitBytes int64,
	readCompactionRate int64,
	zombies map[base.DiskFileNum]uint64,
	orderingInvariants OrderingInvariants,
) (*Version, error) {
	addZombie := func(state *FileBacking) {
		if zombies != nil {
			zombies[state.DiskFileNum] = state.Size
		}
	}
	removeZombie := func(state *FileBacking) {
		if zombies != nil {
			delete(zombies, state.DiskFileNum)
		}
	}

	v := new(Version)

	// Adjust the count of files marked for compaction.
	if curr != nil {
		v.Stats.MarkedForCompaction = curr.Stats.MarkedForCompaction
	}
	v.Stats.MarkedForCompaction += b.MarkedForCompactionCountDiff
	if v.Stats.MarkedForCompaction < 0 {
		return nil, base.CorruptionErrorf("pebble: version marked for compaction count negative")
	}

	for level := range v.Levels {
		if curr == nil || curr.Levels[level].tree.root == nil {
			v.Levels[level] = makeLevelMetadata(cmp, level, nil /* files */)
		} else {
			v.Levels[level] = curr.Levels[level].clone()
		}
		if curr == nil || curr.RangeKeyLevels[level].tree.root == nil {
			v.RangeKeyLevels[level] = makeLevelMetadata(cmp, level, nil /* files */)
		} else {
			v.RangeKeyLevels[level] = curr.RangeKeyLevels[level].clone()
		}

		if len(b.Added[level]) == 0 && len(b.Deleted[level]) == 0 {
			// There are no edits on this level.
			if level == 0 {
				// Initialize L0Sublevels.
				if curr == nil || curr.L0Sublevels == nil {
					if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
						return nil, errors.Wrap(err, "pebble: internal error")
					}
				} else {
					v.L0Sublevels = curr.L0Sublevels
					v.L0SublevelFiles = v.L0Sublevels.Levels
				}
			}
			continue
		}

		// Some edits on this level.
		lm := &v.Levels[level]
		lmRange := &v.RangeKeyLevels[level]

		addedFilesMap := b.Added[level]
		deletedFilesMap := b.Deleted[level]
		if n := v.Levels[level].Len() + len(addedFilesMap); n == 0 {
			return nil, base.CorruptionErrorf(
				"pebble: internal error: No current or added files but have deleted files: %d",
				errors.Safe(len(deletedFilesMap)))
		}

		// NB: addedFilesMap may be empty. If a file is present in addedFilesMap
		// for a level, it won't be present in deletedFilesMap for the same
		// level.

		for _, f := range deletedFilesMap {
			if obsolete := v.Levels[level].remove(f); obsolete {
				// Deleting a file from the B-Tree may decrement its
				// reference count. However, because we cloned the
				// previous level's B-Tree, this should never result in a
				// file's reference count dropping to zero.
				err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during B-Tree removal", level, f.FileNum)
				return nil, err
			}
			if f.HasRangeKeys {
				if obsolete := v.RangeKeyLevels[level].remove(f); obsolete {
					// Deleting a file from the B-Tree may decrement its
					// reference count. However, because we cloned the
					// previous level's B-Tree, this should never result in a
					// file's reference count dropping to zero.
					err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during range-key B-Tree removal", level, f.FileNum)
					return nil, err
				}
			}

			// Note that a backing sst will only become a zombie if the number
			// of references to it in the latest version is zero. We will
			// remove the backing sst from the zombie list in the next loop if
			// one of the addedFiles in any of the levels is referencing the
			// backing sst. This is possible if a physical sstable is
			// virtualized, or if it is moved.
			latestRefCount := f.LatestRefs()
			if latestRefCount <= 0 {
				// If a file is present in deletedFilesMap for a level, then it
				// must have already been added to the level previously, which
				// means that its latest ref count cannot be 0.
				err := errors.Errorf("pebble: internal error: incorrect latestRefs reference counting for file %s", f.FileNum)
				return nil, err
			} else if f.LatestUnref() == 0 {
				addZombie(f.FileBacking)
			}
		}

		addedFiles := make([]*FileMetadata, 0, len(addedFilesMap))
		for _, f := range addedFilesMap {
			addedFiles = append(addedFiles, f)
		}
		// Sort addedFiles by file number. This isn't necessary, but tests which
		// replay invalid manifests check the error output, and the error output
		// depends on the order in which files are added to the btree.
		sort.Slice(addedFiles, func(i, j int) bool {
			return addedFiles[i].FileNum < addedFiles[j].FileNum
		})

		var sm, la *FileMetadata
		for _, f := range addedFiles {
			// NB: allowedSeeks is used for read triggered compactions. It is set using
			// Options.Experimental.ReadCompactionRate which defaults to 32KB.
			var allowedSeeks int64
			if readCompactionRate != 0 {
				allowedSeeks = int64(f.Size) / readCompactionRate
			}
			if allowedSeeks < 100 {
				allowedSeeks = 100
			}
			f.AllowedSeeks.Store(allowedSeeks)
			f.InitAllowedSeeks = allowedSeeks

			err := lm.insert(f)
			// We're adding this file to the new version, so increment the
			// latest refs count.
			f.LatestRef()
			if err != nil {
				return nil, errors.Wrap(err, "pebble")
			}
			if f.HasRangeKeys {
				err = lmRange.insert(f)
				if err != nil {
					return nil, errors.Wrap(err, "pebble")
				}
			}
			removeZombie(f.FileBacking)
			// Track the files containing the smallest and largest keys, so
			// that we can check consistency of the modified span.
			if sm == nil || base.InternalCompare(cmp, sm.Smallest, f.Smallest) > 0 {
				sm = f
			}
			if la == nil || base.InternalCompare(cmp, la.Largest, f.Largest) < 0 {
				la = f
			}
		}

		if level == 0 {
			if curr != nil && curr.L0Sublevels != nil && len(deletedFilesMap) == 0 {
				// Flushes and ingestions that do not delete any L0 files do not require
				// a regeneration of L0Sublevels from scratch. We can instead generate
				// it incrementally.
				var err error
				// AddL0Files requires addedFiles to be sorted in seqnum order.
				SortBySeqNum(addedFiles)
				v.L0Sublevels, err = curr.L0Sublevels.AddL0Files(addedFiles, flushSplitBytes, &v.Levels[0])
				if errors.Is(err, errInvalidL0SublevelsOpt) {
					err = v.InitL0Sublevels(cmp, formatKey, flushSplitBytes)
				} else if invariants.Enabled && err == nil {
					copyOfSublevels, err := NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes)
					if err != nil {
						panic(fmt.Sprintf("error when regenerating sublevels: %s", err))
					}
					s1 := describeSublevels(base.DefaultFormatter, false /* verbose */, copyOfSublevels.Levels)
					s2 := describeSublevels(base.DefaultFormatter, false /* verbose */, v.L0Sublevels.Levels)
					if s1 != s2 {
						panic(fmt.Sprintf("incremental L0 sublevel generation produced different output than regeneration: %s != %s", s1, s2))
					}
				}
				if err != nil {
					return nil, errors.Wrap(err, "pebble: internal error")
				}
				v.L0SublevelFiles = v.L0Sublevels.Levels
			} else if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
				return nil, errors.Wrap(err, "pebble: internal error")
			}
			if err := CheckOrdering(cmp, formatKey, Level(0), v.Levels[level].Iter(), orderingInvariants); err != nil {
				return nil, errors.Wrap(err, "pebble: internal error")
			}
			continue
		}

		// Check consistency of the level in the vicinity of our edits.
		if sm != nil && la != nil {
			overlap := overlaps(v.Levels[level].Iter(), cmp, sm.Smallest.UserKey,
				la.Largest.UserKey, la.Largest.IsExclusiveSentinel())
			// overlap contains all of the added files. We want to ensure that
			// the added files are consistent with neighboring existing files
			// too, so reslice the overlap to pull in a neighbor on each side.
			check := overlap.Reslice(func(start, end *LevelIterator) {
				if m := start.Prev(); m == nil {
					start.Next()
				}
				if m := end.Next(); m == nil {
					end.Prev()
				}
			})
			if err := CheckOrdering(cmp, formatKey, Level(level), check.Iter(), orderingInvariants); err != nil {
				return nil, errors.Wrap(err, "pebble: internal error")
			}
		}
	}
	return v, nil
}