// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package manifest

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"io"
	"sync/atomic"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
)

// TODO(peter): describe the MANIFEST file format, independently of the C++
// project.

// errCorruptManifest is the generic error returned when a MANIFEST record
// cannot be decoded (unknown tag, truncated varint, short read, ...).
var errCorruptManifest = base.CorruptionErrorf("bitalostable: corrupt manifest")

// byteReader is the reader shape required by the decoder: byte-at-a-time
// reads for varints (io.ByteReader) plus bulk reads for keys (io.Reader).
type byteReader interface {
	io.ByteReader
	io.Reader
}

// Tags for the versionEdit disk format.
// Tag 8 is no longer used.
const (
	// LevelDB tags.
	tagComparator     = 1
	tagLogNumber      = 2
	tagNextFileNumber = 3
	tagLastSequence   = 4
	tagCompactPointer = 5
	tagDeletedFile    = 6
	tagNewFile        = 7
	tagPrevLogNumber  = 9

	// RocksDB tags.
	tagNewFile2         = 100
	tagNewFile3         = 102
	tagNewFile4         = 103
	tagColumnFamily     = 200
	tagColumnFamilyAdd  = 201
	tagColumnFamilyDrop = 202
	tagMaxColumnFamily  = 203

	// Pebble tags.
	tagNewFile5 = 104 // Range keys.

	// The custom tags sub-format used by tagNewFile4 and above.
	customTagTerminate       = 1
	customTagNeedsCompaction = 2
	customTagCreationTime    = 6
	customTagPathID          = 65
	// Custom tags with this bit set must be understood by the decoder;
	// unknown custom tags without it are skipped (their field is consumed
	// but otherwise ignored). See the default case in Decode.
	customTagNonSafeIgnoreMask = 1 << 6
)

// DeletedFileEntry holds the state for a file deletion from a level. The file
// itself might still be referenced by another level.
type DeletedFileEntry struct {
	Level   int
	FileNum base.FileNum
}

// NewFileEntry holds the state for a new file or one moved from a different
// level.
type NewFileEntry struct {
	Level int
	Meta  *FileMetadata
}

// VersionEdit holds the state for an edit to a Version along with other
// on-disk state (log numbers, next file number, and the last sequence number).
type VersionEdit struct {
	// ComparerName is the value of Options.Comparer.Name. This is only set in
	// the first VersionEdit in a manifest (either when the DB is created, or
	// when a new manifest is created) and is used to verify that the comparer
	// specified at Open matches the comparer that was previously used.
	ComparerName string

	// MinUnflushedLogNum is the smallest WAL log file number corresponding to
	// mutations that have not been flushed to an sstable.
	//
	// This is an optional field, and 0 represents it is not set.
	MinUnflushedLogNum base.FileNum

	// ObsoletePrevLogNum is a historic artifact from LevelDB that is not used by
	// Pebble, RocksDB, or even LevelDB. Its use in LevelDB was deprecated in
	// 6/2011. We keep it around purely for informational purposes when
	// displaying MANIFEST contents.
	ObsoletePrevLogNum uint64

	// The next file number. A single counter is used to assign file numbers
	// for the WAL, MANIFEST, sstable, and OPTIONS files.
	NextFileNum base.FileNum

	// LastSeqNum is an upper bound on the sequence numbers that have been
	// assigned in flushed WALs. Unflushed WALs (that will be replayed during
	// recovery) may contain sequence numbers greater than this value.
	LastSeqNum uint64

	// A file num may be present in both deleted files and new files when it
	// is moved from a lower level to a higher level (when the compaction
	// found that there was no overlapping file at the higher level).
	DeletedFiles map[DeletedFileEntry]*FileMetadata
	NewFiles     []NewFileEntry
}

// Decode decodes an edit from the specified reader. It reads tagged records
// until EOF, populating the receiver's fields; any unknown tag or truncated
// record yields a corruption error.
func (v *VersionEdit) Decode(r io.Reader) error {
	// Wrap r in a bufio.Reader only if it does not already provide
	// byte-at-a-time reads (needed by binary.ReadUvarint).
	br, ok := r.(byteReader)
	if !ok {
		br = bufio.NewReader(r)
	}
	d := versionEditDecoder{br}
	// Read tagged records until EOF. io.EOF at a tag boundary is the normal
	// termination; any other error is propagated.
	for {
		tag, err := binary.ReadUvarint(br)
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		switch tag {
		case tagComparator:
			s, err := d.readBytes()
			if err != nil {
				return err
			}
			v.ComparerName = string(s)

		case tagLogNumber:
			n, err := d.readFileNum()
			if err != nil {
				return err
			}
			v.MinUnflushedLogNum = n

		case tagNextFileNumber:
			n, err := d.readFileNum()
			if err != nil {
				return err
			}
			v.NextFileNum = n

		case tagLastSequence:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.LastSeqNum = n

		case tagCompactPointer:
			// Consume and discard: level + key.
			if _, err := d.readLevel(); err != nil {
				return err
			}
			if _, err := d.readBytes(); err != nil {
				return err
			}
			// NB: RocksDB does not use compaction pointers anymore.

		case tagDeletedFile:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readFileNum()
			if err != nil {
				return err
			}
			if v.DeletedFiles == nil {
				v.DeletedFiles = make(map[DeletedFileEntry]*FileMetadata)
			}
			// The on-disk record carries only the file number; the metadata
			// value is filled in later (see BulkVersionEdit.AddedByFileNum).
			v.DeletedFiles[DeletedFileEntry{level, fileNum}] = nil

		case tagNewFile, tagNewFile2, tagNewFile3, tagNewFile4, tagNewFile5:
			level, err := d.readLevel()
			if err != nil {
				return err
			}
			fileNum, err := d.readFileNum()
			if err != nil {
				return err
			}
			if tag == tagNewFile3 {
				// The pathID field appears unused in RocksDB.
				_ /* pathID */, err := d.readUvarint()
				if err != nil {
					return err
				}
			}
			size, err := d.readUvarint()
			if err != nil {
				return err
			}
			// We read the smallest / largest key bounds differently depending on
			// whether we have point, range or both types of keys present in the
			// table.
			var (
				smallestPointKey, largestPointKey []byte
				smallestRangeKey, largestRangeKey []byte
				parsedPointBounds                 bool
				boundsMarker                      byte
			)
			if tag != tagNewFile5 {
				// Range keys not present in the table. Parse the point key bounds.
				smallestPointKey, err = d.readBytes()
				if err != nil {
					return err
				}
				largestPointKey, err = d.readBytes()
				if err != nil {
					return err
				}
			} else {
				// Range keys are present in the table. Determine whether we have point
				// keys to parse, in addition to the bounds.
				boundsMarker, err = d.ReadByte()
				if err != nil {
					return err
				}
				// Parse point key bounds, if present.
				if boundsMarker&maskContainsPointKeys > 0 {
					smallestPointKey, err = d.readBytes()
					if err != nil {
						return err
					}
					largestPointKey, err = d.readBytes()
					if err != nil {
						return err
					}
					parsedPointBounds = true
				} else {
					// The table does not have point keys.
					// Sanity check: the bounds must be range keys.
					if boundsMarker&maskSmallest != 0 || boundsMarker&maskLargest != 0 {
						return base.CorruptionErrorf(
							"new-file-4-range-keys: table without point keys has point key bounds: marker=%x",
							boundsMarker,
						)
					}
				}
				// Parse range key bounds.
				smallestRangeKey, err = d.readBytes()
				if err != nil {
					return err
				}
				largestRangeKey, err = d.readBytes()
				if err != nil {
					return err
				}
			}
			var smallestSeqNum uint64
			var largestSeqNum uint64
			// The original LevelDB tagNewFile record has no sequence numbers;
			// all newer formats do.
			if tag != tagNewFile {
				smallestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
				largestSeqNum, err = d.readUvarint()
				if err != nil {
					return err
				}
			}
			var markedForCompaction bool
			var creationTime uint64
			if tag == tagNewFile4 || tag == tagNewFile5 {
				// Parse the custom-tag sub-format: (tag, length-prefixed field)
				// pairs terminated by customTagTerminate.
				for {
					customTag, err := d.readUvarint()
					if err != nil {
						return err
					}
					if customTag == customTagTerminate {
						break
					}
					field, err := d.readBytes()
					if err != nil {
						return err
					}
					switch customTag {
					case customTagNeedsCompaction:
						if len(field) != 1 {
							return base.CorruptionErrorf("new-file4: need-compaction field wrong size")
						}
						markedForCompaction = (field[0] == 1)

					case customTagCreationTime:
						var n int
						creationTime, n = binary.Uvarint(field)
						if n != len(field) {
							return base.CorruptionErrorf("new-file4: invalid file creation time")
						}

					case customTagPathID:
						return base.CorruptionErrorf("new-file4: path-id field not supported")

					default:
						// Unknown safe-to-ignore tags are skipped (their field
						// was already consumed above); non-safe ones are fatal.
						if (customTag & customTagNonSafeIgnoreMask) != 0 {
							return base.CorruptionErrorf("new-file4: custom field not supported: %d", customTag)
						}
					}
				}
			}
			m := &FileMetadata{
				FileNum:             fileNum,
				Size:                size,
				CreationTime:        int64(creationTime),
				SmallestSeqNum:      smallestSeqNum,
				LargestSeqNum:       largestSeqNum,
				MarkedForCompaction: markedForCompaction,
			}
			if tag != tagNewFile5 { // no range keys present
				m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey)
				m.LargestPointKey = base.DecodeInternalKey(largestPointKey)
				m.HasPointKeys = true
				m.Smallest, m.Largest = m.SmallestPointKey, m.LargestPointKey
				m.boundTypeSmallest, m.boundTypeLargest = boundTypePointKey, boundTypePointKey
			} else { // range keys present
				// Set point key bounds, if parsed.
				if parsedPointBounds {
					m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey)
					m.LargestPointKey = base.DecodeInternalKey(largestPointKey)
					m.HasPointKeys = true
				}
				// Set range key bounds.
				m.SmallestRangeKey = base.DecodeInternalKey(smallestRangeKey)
				m.LargestRangeKey = base.DecodeInternalKey(largestRangeKey)
				m.HasRangeKeys = true
				// Set overall bounds (by default assume range keys).
				m.Smallest, m.Largest = m.SmallestRangeKey, m.LargestRangeKey
				m.boundTypeSmallest, m.boundTypeLargest = boundTypeRangeKey, boundTypeRangeKey
				// The marker bits override the default when a point key forms
				// the overall smallest/largest bound.
				if boundsMarker&maskSmallest == maskSmallest {
					m.Smallest = m.SmallestPointKey
					m.boundTypeSmallest = boundTypePointKey
				}
				if boundsMarker&maskLargest == maskLargest {
					m.Largest = m.LargestPointKey
					m.boundTypeLargest = boundTypePointKey
				}
			}
			m.boundsSet = true
			v.NewFiles = append(v.NewFiles, NewFileEntry{
				Level: level,
				Meta:  m,
			})

		case tagPrevLogNumber:
			n, err := d.readUvarint()
			if err != nil {
				return err
			}
			v.ObsoletePrevLogNum = n

		case tagColumnFamily, tagColumnFamilyAdd, tagColumnFamilyDrop, tagMaxColumnFamily:
			return base.CorruptionErrorf("column families are not supported")

		default:
			return errCorruptManifest
		}
	}
	return nil
}

// Encode encodes an edit to the specified writer. The output is buffered
// internally and written with a single w.Write call.
func (v *VersionEdit) Encode(w io.Writer) error {
	e := versionEditEncoder{new(bytes.Buffer)}

	if v.ComparerName != "" {
		e.writeUvarint(tagComparator)
		e.writeString(v.ComparerName)
	}
	if v.MinUnflushedLogNum != 0 {
		e.writeUvarint(tagLogNumber)
		e.writeUvarint(uint64(v.MinUnflushedLogNum))
	}
	if v.ObsoletePrevLogNum != 0 {
		e.writeUvarint(tagPrevLogNumber)
		e.writeUvarint(v.ObsoletePrevLogNum)
	}
	if v.NextFileNum != 0 {
		e.writeUvarint(tagNextFileNumber)
		e.writeUvarint(uint64(v.NextFileNum))
	}
	// RocksDB requires LastSeqNum to be encoded for the first MANIFEST entry,
	// even though its value is zero. We detect this by encoding LastSeqNum when
	// ComparerName is set.
	if v.LastSeqNum != 0 || v.ComparerName != "" {
		e.writeUvarint(tagLastSequence)
		e.writeUvarint(v.LastSeqNum)
	}
	for x := range v.DeletedFiles {
		// Only the level and file number are persisted for deletions.
		e.writeUvarint(tagDeletedFile)
		e.writeUvarint(uint64(x.Level))
		e.writeUvarint(uint64(x.FileNum))
	}
	for _, x := range v.NewFiles {
		customFields := x.Meta.MarkedForCompaction || x.Meta.CreationTime != 0
		// Pick the most compact record format that can represent this file:
		// tagNewFile5 when range keys are present, tagNewFile4 when custom
		// fields are needed, otherwise the plain tagNewFile2 format.
		var tag uint64
		switch {
		case x.Meta.HasRangeKeys:
			tag = tagNewFile5
		case customFields:
			tag = tagNewFile4
		default:
			tag = tagNewFile2
		}
		e.writeUvarint(tag)
		e.writeUvarint(uint64(x.Level))
		e.writeUvarint(uint64(x.Meta.FileNum))
		e.writeUvarint(x.Meta.Size)
		if !x.Meta.HasRangeKeys {
			// If we have no range keys, preserve the original format and write the
			// smallest and largest point keys.
			e.writeKey(x.Meta.SmallestPointKey)
			e.writeKey(x.Meta.LargestPointKey)
		} else {
			// When range keys are present, we first write a marker byte that
			// indicates if the table also contains point keys, in addition to how the
			// overall bounds for the table should be reconstructed. This byte is
			// followed by the keys themselves.
			b, err := x.Meta.boundsMarker()
			if err != nil {
				return err
			}
			if err = e.WriteByte(b); err != nil {
				return err
			}
			// Write point key bounds (if present).
			if x.Meta.HasPointKeys {
				e.writeKey(x.Meta.SmallestPointKey)
				e.writeKey(x.Meta.LargestPointKey)
			}
			// Write range key bounds.
			e.writeKey(x.Meta.SmallestRangeKey)
			e.writeKey(x.Meta.LargestRangeKey)
		}
		e.writeUvarint(x.Meta.SmallestSeqNum)
		e.writeUvarint(x.Meta.LargestSeqNum)
		if customFields {
			if x.Meta.CreationTime != 0 {
				e.writeUvarint(customTagCreationTime)
				var buf [binary.MaxVarintLen64]byte
				n := binary.PutUvarint(buf[:], uint64(x.Meta.CreationTime))
				e.writeBytes(buf[:n])
			}
			if x.Meta.MarkedForCompaction {
				e.writeUvarint(customTagNeedsCompaction)
				e.writeBytes([]byte{1})
			}
			e.writeUvarint(customTagTerminate)
		}
	}
	_, err := w.Write(e.Bytes())
	return err
}

// versionEditDecoder wraps a byteReader with helpers for the varint and
// length-prefixed primitives used by the MANIFEST format.
type versionEditDecoder struct {
	byteReader
}

// readBytes reads a uvarint length followed by that many bytes. A short read
// is reported as manifest corruption.
func (d versionEditDecoder) readBytes() ([]byte, error) {
	n, err := d.readUvarint()
	if err != nil {
		return nil, err
	}
	s := make([]byte, n)
	_, err = io.ReadFull(d, s)
	if err != nil {
		if err == io.ErrUnexpectedEOF {
			return nil, errCorruptManifest
		}
		return nil, err
	}
	return s, nil
}

// readLevel reads a uvarint level and validates it against NumLevels.
func (d versionEditDecoder) readLevel() (int, error) {
	u, err := d.readUvarint()
	if err != nil {
		return 0, err
	}
	if u >= NumLevels {
		return 0, errCorruptManifest
	}
	return int(u), nil
}

// readFileNum reads a uvarint-encoded file number.
func (d versionEditDecoder) readFileNum() (base.FileNum, error) {
	u, err := d.readUvarint()
	if err != nil {
		return 0, err
	}
	return base.FileNum(u), nil
}

// readUvarint reads a uvarint, translating EOF in the middle of a record
// into a corruption error (EOF is only legal at a tag boundary).
func (d versionEditDecoder) readUvarint() (uint64, error) {
	u, err := binary.ReadUvarint(d)
	if err != nil {
		if err == io.EOF {
			return 0, errCorruptManifest
		}
		return 0, err
	}
	return u, nil
}

// versionEditEncoder wraps a bytes.Buffer with helpers for the varint and
// length-prefixed primitives used by the MANIFEST format. Writes to the
// underlying buffer cannot fail, so the helpers return no error.
type versionEditEncoder struct {
	*bytes.Buffer
}

// writeBytes writes a uvarint length prefix followed by p.
func (e versionEditEncoder) writeBytes(p []byte) {
	e.writeUvarint(uint64(len(p)))
	e.Write(p)
}

// writeKey writes an internal key as a length prefix, the user key, and the
// encoded trailer.
func (e versionEditEncoder) writeKey(k InternalKey) {
	e.writeUvarint(uint64(k.Size()))
	e.Write(k.UserKey)
	buf := k.EncodeTrailer()
	e.Write(buf[:])
}

// writeString writes a uvarint length prefix followed by s.
func (e versionEditEncoder) writeString(s string) {
	e.writeUvarint(uint64(len(s)))
	e.WriteString(s)
}

// writeUvarint writes u in Go's unsigned varint encoding.
func (e versionEditEncoder) writeUvarint(u uint64) {
	var buf [binary.MaxVarintLen64]byte
	n := binary.PutUvarint(buf[:], u)
	e.Write(buf[:n])
}

// BulkVersionEdit summarizes the files added and deleted from a set of version
// edits.
type BulkVersionEdit struct {
	Added   [NumLevels][]*FileMetadata
	Deleted [NumLevels]map[base.FileNum]*FileMetadata

	// AddedByFileNum maps file number to file metadata for all added files
	// from accumulated version edits. AddedByFileNum is only populated if set
	// to non-nil by a caller. It must be set to non-nil when replaying
	// version edits read from a MANIFEST (as opposed to VersionEdits
	// constructed in-memory). While replaying a MANIFEST file,
	// VersionEdit.DeletedFiles map entries have nil values, because the
	// on-disk deletion record encodes only the file number. Accumulate
	// uses AddedByFileNum to correctly populate the BulkVersionEdit's Deleted
	// field with non-nil *FileMetadata.
	AddedByFileNum map[base.FileNum]*FileMetadata

	// MarkedForCompactionCountDiff holds the aggregated count of files
	// marked for compaction added or removed.
	MarkedForCompactionCountDiff int
}

// Accumulate adds the file addition and deletions in the specified version
// edit to the bulk edit's internal state.
561 func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) error { 562 for df, m := range ve.DeletedFiles { 563 dmap := b.Deleted[df.Level] 564 if dmap == nil { 565 dmap = make(map[base.FileNum]*FileMetadata) 566 b.Deleted[df.Level] = dmap 567 } 568 569 if m == nil { 570 // m is nil only when replaying a MANIFEST. 571 if b.AddedByFileNum == nil { 572 return errors.Errorf("deleted file L%d.%s's metadata is absent and bve.AddedByFileNum is nil", df.Level, df.FileNum) 573 } 574 m = b.AddedByFileNum[df.FileNum] 575 if m == nil { 576 return base.CorruptionErrorf("bitalostable: file deleted L%d.%s before it was inserted", df.Level, df.FileNum) 577 } 578 } 579 if m.MarkedForCompaction { 580 b.MarkedForCompactionCountDiff-- 581 } 582 dmap[df.FileNum] = m 583 } 584 585 for _, nf := range ve.NewFiles { 586 // A new file should not have been deleted in this or a preceding 587 // VersionEdit at the same level (though files can move across levels). 588 if dmap := b.Deleted[nf.Level]; dmap != nil { 589 if _, ok := dmap[nf.Meta.FileNum]; ok { 590 return base.CorruptionErrorf("bitalostable: file deleted L%d.%s before it was inserted", nf.Level, nf.Meta.FileNum) 591 } 592 } 593 b.Added[nf.Level] = append(b.Added[nf.Level], nf.Meta) 594 if b.AddedByFileNum != nil { 595 b.AddedByFileNum[nf.Meta.FileNum] = nf.Meta 596 } 597 if nf.Meta.MarkedForCompaction { 598 b.MarkedForCompactionCountDiff++ 599 } 600 } 601 return nil 602 } 603 604 // Apply applies the delta b to the current version to produce a new 605 // version. The new version is consistent with respect to the comparer cmp. 606 // 607 // curr may be nil, which is equivalent to a pointer to a zero version. 608 // 609 // On success, a map of zombie files containing the file numbers and sizes of 610 // deleted files is returned. These files are considered zombies because they 611 // are no longer referenced by the returned Version, but cannot be deleted from 612 // disk as they are still in use by the incoming Version. 
func (b *BulkVersionEdit) Apply(
	curr *Version,
	cmp Compare,
	formatKey base.FormatKey,
	flushSplitBytes int64,
	readCompactionRate int64,
) (_ *Version, zombies map[base.FileNum]uint64, _ error) {
	// addZombie lazily allocates the zombies map and records a deleted
	// file's number and size.
	addZombie := func(fileNum base.FileNum, size uint64) {
		if zombies == nil {
			zombies = make(map[base.FileNum]uint64)
		}
		zombies[fileNum] = size
	}
	// The remove zombie function is used to handle tables that are moved from
	// one level to another during a version edit (i.e. a "move" compaction).
	removeZombie := func(fileNum base.FileNum) {
		if zombies != nil {
			delete(zombies, fileNum)
		}
	}

	v := new(Version)

	// Adjust the count of files marked for compaction.
	if curr != nil {
		v.Stats.MarkedForCompaction = curr.Stats.MarkedForCompaction
	}
	v.Stats.MarkedForCompaction += b.MarkedForCompactionCountDiff
	if v.Stats.MarkedForCompaction < 0 {
		return nil, nil, base.CorruptionErrorf("bitalostable: version marked for compaction count negative")
	}

	for level := range v.Levels {
		// Start each level from a clone of the current version's level (or
		// an empty level when there is no current state for it).
		if curr == nil || curr.Levels[level].tree.root == nil {
			v.Levels[level] = makeLevelMetadata(cmp, level, nil /* files */)
		} else {
			v.Levels[level] = curr.Levels[level].clone()
		}
		if curr == nil || curr.RangeKeyLevels[level].tree.root == nil {
			v.RangeKeyLevels[level] = makeLevelMetadata(cmp, level, nil /* files */)
		} else {
			v.RangeKeyLevels[level] = curr.RangeKeyLevels[level].clone()
		}

		if len(b.Added[level]) == 0 && len(b.Deleted[level]) == 0 {
			// There are no edits on this level.
			if level == 0 {
				// Initialize L0Sublevels.
				if curr == nil || curr.L0Sublevels == nil {
					if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
						return nil, nil, errors.Wrap(err, "bitalostable: internal error")
					}
				} else {
					// No L0 edits: the previous version's sublevels carry over.
					v.L0Sublevels = curr.L0Sublevels
					v.L0SublevelFiles = v.L0Sublevels.Levels
				}
			}
			continue
		}

		// Some edits on this level.
		lm := &v.Levels[level]
		lmRange := &v.RangeKeyLevels[level]
		addedFiles := b.Added[level]
		deletedMap := b.Deleted[level]
		if n := v.Levels[level].Len() + len(addedFiles); n == 0 {
			return nil, nil, base.CorruptionErrorf(
				"bitalostable: internal error: No current or added files but have deleted files: %d",
				errors.Safe(len(deletedMap)))
		}

		// NB: addedFiles may be empty and it also is not necessarily
		// internally consistent: it does not reflect deletions in deletedMap.

		for _, f := range deletedMap {
			addZombie(f.FileNum, f.Size)
			if obsolete := v.Levels[level].tree.delete(f); obsolete {
				// Deleting a file from the B-Tree may decrement its
				// reference count. However, because we cloned the
				// previous level's B-Tree, this should never result in a
				// file's reference count dropping to zero.
				err := errors.Errorf("bitalostable: internal error: file L%d.%s obsolete during B-Tree removal", level, f.FileNum)
				return nil, nil, err
			}
			if f.HasRangeKeys {
				if obsolete := v.RangeKeyLevels[level].tree.delete(f); obsolete {
					// Deleting a file from the B-Tree may decrement its
					// reference count. However, because we cloned the
					// previous level's B-Tree, this should never result in a
					// file's reference count dropping to zero.
					err := errors.Errorf("bitalostable: internal error: file L%d.%s obsolete during range-key B-Tree removal", level, f.FileNum)
					return nil, nil, err
				}
			}
		}

		// sm/la track the smallest and largest added files so the modified
		// span of the level can be consistency-checked below.
		var sm, la *FileMetadata
		for _, f := range addedFiles {
			if _, ok := deletedMap[f.FileNum]; ok {
				// Already called addZombie on this file in the preceding
				// loop, so we don't need to do it here.
				continue
			}

			// NB: allowedSeeks is used for read triggered compactions. It is set using
			// Options.Experimental.ReadCompactionRate which defaults to 32KB.
			var allowedSeeks int64
			if readCompactionRate != 0 {
				allowedSeeks = int64(f.Size) / readCompactionRate
			}
			if allowedSeeks < 100 {
				allowedSeeks = 100
			}
			atomic.StoreInt64(&f.Atomic.AllowedSeeks, allowedSeeks)
			f.InitAllowedSeeks = allowedSeeks

			err := lm.tree.insert(f)
			if err != nil {
				return nil, nil, errors.Wrap(err, "bitalostable")
			}
			if f.HasRangeKeys {
				err = lmRange.tree.insert(f)
				if err != nil {
					return nil, nil, errors.Wrap(err, "bitalostable")
				}
			}
			// The file is referenced by the new version; it is not a zombie
			// even if an earlier level deleted it (a "move" compaction).
			removeZombie(f.FileNum)
			// Track the keys with the smallest and largest keys, so that we can
			// check consistency of the modified span.
			if sm == nil || base.InternalCompare(cmp, sm.Smallest, f.Smallest) > 0 {
				sm = f
			}
			if la == nil || base.InternalCompare(cmp, la.Largest, f.Largest) < 0 {
				la = f
			}
		}

		if level == 0 {
			if curr != nil && curr.L0Sublevels != nil && len(deletedMap) == 0 {
				// Flushes and ingestions that do not delete any L0 files do not require
				// a regeneration of L0Sublevels from scratch. We can instead generate
				// it incrementally.
				var err error
				// AddL0Files requires addedFiles to be sorted in seqnum order.
				addedFiles = append([]*FileMetadata(nil), addedFiles...)
				SortBySeqNum(addedFiles)
				v.L0Sublevels, err = curr.L0Sublevels.AddL0Files(addedFiles, flushSplitBytes, &v.Levels[0])
				if errors.Is(err, errInvalidL0SublevelsOpt) {
					// The incremental path declined; fall back to a full rebuild.
					err = v.InitL0Sublevels(cmp, formatKey, flushSplitBytes)
				}
				if err != nil {
					return nil, nil, errors.Wrap(err, "bitalostable: internal error")
				}
				v.L0SublevelFiles = v.L0Sublevels.Levels
			} else if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
				return nil, nil, errors.Wrap(err, "bitalostable: internal error")
			}
			if err := CheckOrdering(cmp, formatKey, Level(0), v.Levels[level].Iter()); err != nil {
				return nil, nil, errors.Wrap(err, "bitalostable: internal error")
			}
			continue
		}

		// Check consistency of the level in the vicinity of our edits.
		if sm != nil && la != nil {
			overlap := overlaps(v.Levels[level].Iter(), cmp, sm.Smallest.UserKey,
				la.Largest.UserKey, la.Largest.IsExclusiveSentinel())
			// overlap contains all of the added files. We want to ensure that
			// the added files are consistent with neighboring existing files
			// too, so reslice the overlap to pull in a neighbor on each side.
			check := overlap.Reslice(func(start, end *LevelIterator) {
				if m := start.Prev(); m == nil {
					start.Next()
				}
				if m := end.Next(); m == nil {
					end.Prev()
				}
			})
			if err := CheckOrdering(cmp, formatKey, Level(level), check.Iter()); err != nil {
				return nil, nil, errors.Wrap(err, "bitalostable: internal error")
			}
		}
	}
	return v, zombies, nil
}