github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/version_set.go

// Copyright 2012 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"bytes"
	"fmt"
	"io"
	"math"
	"sync"
	"sync/atomic"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/errors/oserror"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/invariants"
	"github.com/zuoyebang/bitalostable/internal/manifest"
	"github.com/zuoyebang/bitalostable/record"
	"github.com/zuoyebang/bitalostable/vfs"
	"github.com/zuoyebang/bitalostable/vfs/atomicfs"
)

const numLevels = manifest.NumLevels

const manifestMarkerName = `manifest`

// Provide type aliases for the various manifest structs.
type bulkVersionEdit = manifest.BulkVersionEdit
type deletedFileEntry = manifest.DeletedFileEntry
type fileMetadata = manifest.FileMetadata
type newFileEntry = manifest.NewFileEntry
type version = manifest.Version
type versionEdit = manifest.VersionEdit
type versionList = manifest.VersionList

// versionSet manages a collection of immutable versions, and manages the
// creation of a new version from the most recent version. A new version is
// created from an existing version by applying a version edit which is just
// like it sounds: a delta from the previous version. Version edits are logged
// to the MANIFEST file, which is replayed at startup.
type versionSet struct {
	// WARNING: The following struct `atomic` contains fields which are
	// accessed atomically.
	//
	// Go allocations are guaranteed to be 64-bit aligned which we take advantage
	// of by placing the 64-bit fields which we access atomically at the beginning
	// of the versionSet struct.
	// For more information, see https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
	atomic struct {
		logSeqNum uint64 // next seqNum to use for WAL writes

		// The upper bound on sequence numbers that have been assigned so far.
		// A suffix of these sequence numbers may not have been written to a
		// WAL. Both logSeqNum and visibleSeqNum are atomically updated by the
		// commitPipeline.
		visibleSeqNum uint64 // visible seqNum (<= logSeqNum)

		// Number of bytes present in sstables being written by in-progress
		// compactions. This value will be zero if there are no in-progress
		// compactions. Updated and read atomically.
		atomicInProgressBytes int64
	}

	// Immutable fields.
	dirname string
	// Set to DB.mu.
	mu             *sync.Mutex
	opts           *Options
	fs             vfs.FS
	cmp            Compare
	cmpName        string
	diskAvailBytes func() uint64
	// Dynamic base level allows the dynamic base level computation to be
	// disabled. Used by tests which want to create specific LSM structures.
	dynamicBaseLevel bool

	// Mutable fields.
	versions versionList
	picker   compactionPicker

	metrics Metrics

	// A pointer to versionSet.addObsoleteLocked. Avoids allocating a new closure
	// on the creation of every version.
	obsoleteFn        func(obsolete []*manifest.FileMetadata)
	obsoleteTables    []*manifest.FileMetadata
	obsoleteManifests []fileInfo
	obsoleteOptions   []fileInfo

	// Zombie tables which have been removed from the current version but are
	// still referenced by an in-use iterator.
	zombieTables map[FileNum]uint64 // filenum -> size

	// minUnflushedLogNum is the smallest WAL log file number corresponding to
	// mutations that have not been flushed to an sstable.
	minUnflushedLogNum FileNum

	// The next file number. A single counter is used to assign file numbers
	// for the WAL, MANIFEST, sstable, and OPTIONS files.
	nextFileNum FileNum

	// The current manifest file number.
	manifestFileNum FileNum
	manifestMarker  *atomicfs.Marker

	manifestFile vfs.File
	manifest     *record.Writer
	setCurrent   func(FileNum) error

	writing    bool
	writerCond sync.Cond
}

func (vs *versionSet) init(
	dirname string,
	opts *Options,
	marker *atomicfs.Marker,
	setCurrent func(FileNum) error,
	mu *sync.Mutex,
) {
	vs.dirname = dirname
	vs.mu = mu
	vs.writerCond.L = mu
	vs.opts = opts
	vs.fs = opts.FS
	vs.cmp = opts.Comparer.Compare
	vs.cmpName = opts.Comparer.Name
	vs.dynamicBaseLevel = true
	vs.versions.Init(mu)
	vs.obsoleteFn = vs.addObsoleteLocked
	vs.zombieTables = make(map[FileNum]uint64)
	vs.nextFileNum = 1
	vs.manifestMarker = marker
	vs.setCurrent = setCurrent
	if vs.diskAvailBytes == nil {
		vs.diskAvailBytes = func() uint64 { return math.MaxUint64 }
	}
}

// create creates a version set for a fresh DB.
func (vs *versionSet) create(
	jobID int,
	dirname string,
	opts *Options,
	marker *atomicfs.Marker,
	setCurrent func(FileNum) error,
	mu *sync.Mutex,
) error {
	vs.init(dirname, opts, marker, setCurrent, mu)
	newVersion := &version{}
	vs.append(newVersion)
	var err error

	vs.picker = newCompactionPicker(newVersion, vs.opts, nil, vs.metrics.levelSizes(), vs.diskAvailBytes)

	// Note that a "snapshot" version edit is written to the manifest when it is
	// created.
	vs.manifestFileNum = vs.getNextFileNum()
	err = vs.createManifest(vs.dirname, vs.manifestFileNum, vs.minUnflushedLogNum, vs.nextFileNum)
	if err == nil {
		if err = vs.manifest.Flush(); err != nil {
			vs.opts.Logger.Fatalf("MANIFEST flush failed: %v", err)
		}
	}
	if err == nil {
		if err = vs.manifestFile.Sync(); err != nil {
			vs.opts.Logger.Fatalf("MANIFEST sync failed: %v", err)
		}
	}
	if err == nil {
		// NB: setCurrent is responsible for syncing the data directory.
		if err = vs.setCurrent(vs.manifestFileNum); err != nil {
			vs.opts.Logger.Fatalf("MANIFEST set current failed: %v", err)
		}
	}

	vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
		JobID:   jobID,
		Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, vs.manifestFileNum),
		FileNum: vs.manifestFileNum,
		Err:     err,
	})
	if err != nil {
		return err
	}
	return nil
}

// load loads the version set from the manifest file.
func (vs *versionSet) load(
	dirname string,
	opts *Options,
	manifestFileNum FileNum,
	marker *atomicfs.Marker,
	setCurrent func(FileNum) error,
	mu *sync.Mutex,
) error {
	vs.init(dirname, opts, marker, setCurrent, mu)

	vs.manifestFileNum = manifestFileNum
	manifestPath := base.MakeFilepath(opts.FS, dirname, fileTypeManifest, vs.manifestFileNum)
	manifestFilename := opts.FS.PathBase(manifestPath)

	// Read the versionEdits in the manifest file.
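	// Each record decodes into a single versionEdit. The edits are accumulated
	// into one bulkVersionEdit and applied below to reconstruct the most recent
	// version; minUnflushedLogNum, nextFileNum and logSeqNum are taken from the
	// last edit that set them.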
	var bve bulkVersionEdit
	bve.AddedByFileNum = make(map[base.FileNum]*fileMetadata)
	manifest, err := vs.fs.Open(manifestPath)
	if err != nil {
		return errors.Wrapf(err, "bitalostable: could not open manifest file %q for DB %q",
			errors.Safe(manifestFilename), dirname)
	}
	defer manifest.Close()
	rr := record.NewReader(manifest, 0 /* logNum */)
	for {
		r, err := rr.Next()
		if err == io.EOF || record.IsInvalidRecord(err) {
			break
		}
		if err != nil {
			return errors.Wrapf(err, "bitalostable: error when loading manifest file %q",
				errors.Safe(manifestFilename))
		}
		var ve versionEdit
		err = ve.Decode(r)
		if err != nil {
			// Break instead of returning an error if the record is corrupted
			// or invalid.
			if err == io.EOF || record.IsInvalidRecord(err) {
				break
			}
			return err
		}
		if ve.ComparerName != "" {
			if ve.ComparerName != vs.cmpName {
				return errors.Errorf("bitalostable: manifest file %q for DB %q: "+
					"comparer name from file %q != comparer name from Options %q",
					errors.Safe(manifestFilename), dirname, errors.Safe(ve.ComparerName), errors.Safe(vs.cmpName))
			}
		}
		if err := bve.Accumulate(&ve); err != nil {
			return err
		}
		if ve.MinUnflushedLogNum != 0 {
			vs.minUnflushedLogNum = ve.MinUnflushedLogNum
		}
		if ve.NextFileNum != 0 {
			vs.nextFileNum = ve.NextFileNum
		}
		if ve.LastSeqNum != 0 {
			// logSeqNum is the _next_ sequence number that will be assigned,
			// while LastSeqNum is the last assigned sequence number. Note that
			// this behaviour mimics that in RocksDB; the first sequence number
			// assigned is one greater than the one present in the manifest
			// (assuming no WALs contain higher sequence numbers than the
			// manifest's LastSeqNum). Increment LastSeqNum by 1 to get the
			// next sequence number that will be assigned.
			vs.atomic.logSeqNum = ve.LastSeqNum + 1
		}
	}
	// vs.nextFileNum was initialized to a non-zero value in init and could only
	// have been updated above to some other non-zero value, so it cannot be 0
	// here.
	if vs.minUnflushedLogNum == 0 {
		if vs.nextFileNum >= 2 {
			// We either have a freshly created DB, or a DB created by RocksDB
			// that has not had a single flushed SSTable yet. This is because
			// RocksDB bumps up nextFileNum in this case without bumping up
			// minUnflushedLogNum, even if WALs with non-zero file numbers are
			// present in the directory.
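			// Leave minUnflushedLogNum at zero; the markFileNumUsed call below
			// is then a no-op.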
		} else {
			return base.CorruptionErrorf("bitalostable: malformed manifest file %q for DB %q",
				errors.Safe(manifestFilename), dirname)
		}
	}
	vs.markFileNumUsed(vs.minUnflushedLogNum)

	newVersion, _, err := bve.Apply(nil, vs.cmp, opts.Comparer.FormatKey, opts.FlushSplitBytes, opts.Experimental.ReadCompactionRate)
	if err != nil {
		return err
	}
	newVersion.L0Sublevels.InitCompactingFileInfo(nil /* in-progress compactions */)
	vs.append(newVersion)

	for i := range vs.metrics.Levels {
		l := &vs.metrics.Levels[i]
		l.NumFiles = int64(newVersion.Levels[i].Len())
		files := newVersion.Levels[i].Slice()
		l.Size = int64(files.SizeSum())
	}

	vs.picker = newCompactionPicker(newVersion, vs.opts, nil, vs.metrics.levelSizes(), vs.diskAvailBytes)
	return nil
}

func (vs *versionSet) close() error {
	if vs.manifestFile != nil {
		if err := vs.manifestFile.Close(); err != nil {
			return err
		}
	}
	if vs.manifestMarker != nil {
		if err := vs.manifestMarker.Close(); err != nil {
			return err
		}
	}
	return nil
}

// logLock locks the manifest for writing. The lock must be released by either
// a call to logUnlock or logAndApply.
//
// DB.mu must be held when calling this method, but the mutex may be dropped and
// re-acquired during the course of this method.
func (vs *versionSet) logLock() {
	// Wait for any existing writing to the manifest to complete, then mark the
	// manifest as busy.
	for vs.writing {
		vs.writerCond.Wait()
	}
	vs.writing = true
}

// logUnlock releases the lock for manifest writing.
//
// DB.mu must be held when calling this method.
func (vs *versionSet) logUnlock() {
	if !vs.writing {
		vs.opts.Logger.Fatalf("MANIFEST not locked for writing")
	}
	vs.writing = false
	vs.writerCond.Signal()
}

// logAndApply logs the version edit to the manifest, applies the version edit
// to the current version, and installs the new version.
//
// DB.mu must be held when calling this method and will be released temporarily
// while performing file I/O. Requires that the manifest is locked for writing
// (see logLock). Will unconditionally release the manifest lock (via
// logUnlock) even if an error occurs.
//
// inProgressCompactions is called while DB.mu is held, to get the list of
// in-progress compactions.
func (vs *versionSet) logAndApply(
	jobID int,
	ve *versionEdit,
	metrics map[int]*LevelMetrics,
	forceRotation bool,
	inProgressCompactions func() []compactionInfo,
) error {
	if !vs.writing {
		vs.opts.Logger.Fatalf("MANIFEST not locked for writing")
	}
	defer vs.logUnlock()

	if ve.MinUnflushedLogNum != 0 {
		if ve.MinUnflushedLogNum < vs.minUnflushedLogNum ||
			vs.nextFileNum <= ve.MinUnflushedLogNum {
			panic(fmt.Sprintf("bitalostable: inconsistent versionEdit minUnflushedLogNum %d",
				ve.MinUnflushedLogNum))
		}
	}

	// This is the next manifest filenum, but if the current file is too big we
	// will write this ve to the next file which means what ve encodes is the
	// current filenum and not the next one.
	//
	// TODO(sbhola): figure out why this is correct and update comment.
	ve.NextFileNum = vs.nextFileNum

	// LastSeqNum is set to the current upper bound on the assigned sequence
	// numbers. Note that this is exactly the behavior of RocksDB. LastSeqNum is
	// used to initialize versionSet.logSeqNum and versionSet.visibleSeqNum on
	// replay. It must be higher than or equal to any sequence number
	// written to an sstable, including sequence numbers in ingested files.
	// Note that LastSeqNum is not (and cannot be) the minimum unflushed sequence
	// number. This is fallout from ingestion which allows a sequence number X to
	// be assigned to an ingested sstable even though sequence number X-1 resides
	// in an unflushed memtable. logSeqNum is the _next_ sequence number that
	// will be assigned, so subtract 1 from it to get the upper bound on the
	// last assigned sequence number.
	logSeqNum := atomic.LoadUint64(&vs.atomic.logSeqNum)
	ve.LastSeqNum = logSeqNum - 1
	if logSeqNum == 0 {
		// logSeqNum is initialized to 1 in Open() if there are no previous WAL
		// or manifest records, so this case should never happen.
		vs.opts.Logger.Fatalf("logSeqNum must be a positive integer: %d", logSeqNum)
	}

	currentVersion := vs.currentVersion()
	var newVersion *version

	// Generate a new manifest if we don't currently have one, or the current one
	// is too large.
	var newManifestFileNum FileNum
	var prevManifestFileSize uint64
	if forceRotation || vs.manifest == nil || vs.manifest.Size() >= vs.opts.MaxManifestFileSize {
		newManifestFileNum = vs.getNextFileNum()
		prevManifestFileSize = uint64(vs.manifest.Size())
	}

	// Grab certain values before releasing vs.mu, in case createManifest() needs
	// to be called.
	minUnflushedLogNum := vs.minUnflushedLogNum
	nextFileNum := vs.nextFileNum

	var zombies map[FileNum]uint64
	if err := func() error {
		vs.mu.Unlock()
		defer vs.mu.Lock()

		var bve bulkVersionEdit
		if err := bve.Accumulate(ve); err != nil {
			return err
		}

		var err error
		newVersion, zombies, err = bve.Apply(currentVersion, vs.cmp, vs.opts.Comparer.FormatKey, vs.opts.FlushSplitBytes, vs.opts.Experimental.ReadCompactionRate)
		if err != nil {
			return errors.Wrap(err, "MANIFEST apply failed")
		}

		if newManifestFileNum != 0 {
			if err := vs.createManifest(vs.dirname, newManifestFileNum, minUnflushedLogNum, nextFileNum); err != nil {
				vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
					JobID:   jobID,
					Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, newManifestFileNum),
					FileNum: newManifestFileNum,
					Err:     err,
				})
				return errors.Wrap(err, "MANIFEST create failed")
			}
		}

		w, err := vs.manifest.Next()
		if err != nil {
			return errors.Wrap(err, "MANIFEST next record write failed")
		}
		// NB: Any error from this point on is considered fatal as we don't know if
		// the MANIFEST write occurred or not. Trying to determine that is
		// fraught. Instead we rely on the standard recovery mechanism run when a
		// database is open. In particular, that mechanism generates a new MANIFEST
		// and ensures it is synced.
		if err := ve.Encode(w); err != nil {
			return errors.Wrap(err, "MANIFEST write failed")
		}
		if err := vs.manifest.Flush(); err != nil {
			return errors.Wrap(err, "MANIFEST flush failed")
		}
		if err := vs.manifestFile.Sync(); err != nil {
			return errors.Wrap(err, "MANIFEST sync failed")
		}
		if newManifestFileNum != 0 {
			// NB: setCurrent is responsible for syncing the data directory.
			if err := vs.setCurrent(newManifestFileNum); err != nil {
				return errors.Wrap(err, "MANIFEST set current failed")
			}
			vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
				JobID:   jobID,
				Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, newManifestFileNum),
				FileNum: newManifestFileNum,
			})
		}
		return nil
	}(); err != nil {
		// Any error encountered during any of the operations in the previous
		// closure is considered fatal. Treating such errors as fatal is preferred
		// to attempting to unwind various file and b-tree reference counts, and
		// re-generating L0 sublevel metadata. This may change in the future, if
		// certain manifest / WAL operations become retryable. For more context, see
		// #1159 and #1792.
		vs.opts.Logger.Fatalf("%s", err)
		return err
	}

	// Now that DB.mu is held again, initialize compacting file info in
	// L0Sublevels.
	inProgress := inProgressCompactions()

	newVersion.L0Sublevels.InitCompactingFileInfo(inProgressL0Compactions(inProgress))

	// Update the zombie tables set first, as installation of the new version
	// will unref the previous version which could result in addObsoleteLocked
	// being called.
	for fileNum, size := range zombies {
		vs.zombieTables[fileNum] = size
	}

	// Install the new version.
	vs.append(newVersion)
	if ve.MinUnflushedLogNum != 0 {
		vs.minUnflushedLogNum = ve.MinUnflushedLogNum
	}
	if newManifestFileNum != 0 {
		if vs.manifestFileNum != 0 {
			vs.obsoleteManifests = append(vs.obsoleteManifests, fileInfo{
				fileNum:  vs.manifestFileNum,
				fileSize: prevManifestFileSize,
			})
		}
		vs.manifestFileNum = newManifestFileNum
	}

	for level, update := range metrics {
		vs.metrics.Levels[level].Add(update)
	}
	for i := range vs.metrics.Levels {
		l := &vs.metrics.Levels[i]
		l.Sublevels = 0
		if l.NumFiles > 0 {
			l.Sublevels = 1
		}
		if invariants.Enabled {
			if count := int64(newVersion.Levels[i].Len()); l.NumFiles != count {
				vs.opts.Logger.Fatalf("versionSet metrics L%d NumFiles = %d, actual count = %d", i, l.NumFiles, count)
			}
			levelFiles := newVersion.Levels[i].Slice()
			if size := int64(levelFiles.SizeSum()); l.Size != size {
				vs.opts.Logger.Fatalf("versionSet metrics L%d Size = %d, actual size = %d", i, l.Size, size)
			}
		}
	}
	vs.metrics.Levels[0].Sublevels = int32(len(newVersion.L0SublevelFiles))

	vs.picker = newCompactionPicker(newVersion, vs.opts, inProgress, vs.metrics.levelSizes(), vs.diskAvailBytes)
	if !vs.dynamicBaseLevel {
		vs.picker.forceBaseLevel1()
	}
	return nil
}

func (vs *versionSet) incrementCompactions(kind compactionKind, extraLevels []*compactionLevel) {
	switch kind {
	case compactionKindDefault:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.DefaultCount++

	case compactionKindFlush:
		vs.metrics.Flush.Count++

	case compactionKindMove:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.MoveCount++

	case compactionKindDeleteOnly:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.DeleteOnlyCount++

	case compactionKindElisionOnly:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.ElisionOnlyCount++

	case compactionKindRead:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.ReadCount++

	case compactionKindRewrite:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.RewriteCount++
	}
	if len(extraLevels) > 0 {
		vs.metrics.Compact.MultiLevelCount++
	}
}

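// incrementCompactionBytes adjusts the running total of bytes written by
// in-progress compactions (vs.atomic.atomicInProgressBytes) by numBytes.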
func (vs *versionSet) incrementCompactionBytes(numBytes int64) {
	atomic.AddInt64(&vs.atomic.atomicInProgressBytes, numBytes)
}

// createManifest creates a manifest file that contains a snapshot of vs.
func (vs *versionSet) createManifest(
	dirname string, fileNum, minUnflushedLogNum, nextFileNum FileNum,
) (err error) {
	var (
		filename     = base.MakeFilepath(vs.fs, dirname, fileTypeManifest, fileNum)
		manifestFile vfs.File
		manifest     *record.Writer
	)
	defer func() {
		if manifest != nil {
			manifest.Close()
		}
		if manifestFile != nil {
			manifestFile.Close()
		}
		if err != nil {
			vs.fs.Remove(filename)
		}
	}()
	manifestFile, err = vs.fs.Create(filename)
	if err != nil {
		return err
	}
	manifest = record.NewWriter(manifestFile)

	snapshot := versionEdit{
		ComparerName: vs.cmpName,
	}
	for level, levelMetadata := range vs.currentVersion().Levels {
		iter := levelMetadata.Iter()
		for meta := iter.First(); meta != nil; meta = iter.Next() {
			snapshot.NewFiles = append(snapshot.NewFiles, newFileEntry{
				Level: level,
				Meta:  meta,
			})
		}
	}

	// When creating a version snapshot for an existing DB, this snapshot VersionEdit will be
	// immediately followed by another VersionEdit (being written in logAndApply()). That
	// VersionEdit always contains a LastSeqNum, so we don't need to include that in the snapshot.
	// But it does not necessarily include MinUnflushedLogNum, NextFileNum, so we initialize those
	// using the corresponding fields in the versionSet (which came from the latest preceding
	// VersionEdit that had those fields).
	snapshot.MinUnflushedLogNum = minUnflushedLogNum
	snapshot.NextFileNum = nextFileNum

	w, err1 := manifest.Next()
	if err1 != nil {
		return err1
	}
	if err := snapshot.Encode(w); err != nil {
		return err
	}

	if vs.manifest != nil {
		vs.manifest.Close()
		vs.manifest = nil
	}
	if vs.manifestFile != nil {
		if err := vs.manifestFile.Close(); err != nil {
			return err
		}
		vs.manifestFile = nil
	}

	vs.manifest, manifest = manifest, nil
	vs.manifestFile, manifestFile = manifestFile, nil
	return nil
}

func (vs *versionSet) markFileNumUsed(fileNum FileNum) {
	if vs.nextFileNum <= fileNum {
		vs.nextFileNum = fileNum + 1
	}
}

func (vs *versionSet) getNextFileNum() FileNum {
	x := vs.nextFileNum
	vs.nextFileNum++
	return x
}

func (vs *versionSet) append(v *version) {
	if v.Refs() != 0 {
		panic("bitalostable: version should be unreferenced")
	}
	if !vs.versions.Empty() {
		vs.versions.Back().UnrefLocked()
	}
	v.Deleted = vs.obsoleteFn
	v.Ref()
	vs.versions.PushBack(v)
}

func (vs *versionSet) currentVersion() *version {
	return vs.versions.Back()
}

func (vs *versionSet) addLiveFileNums(m map[FileNum]struct{}) {
	current := vs.currentVersion()
	for v := vs.versions.Front(); true; v = v.Next() {
		for _, lm := range v.Levels {
			iter := lm.Iter()
			for f := iter.First(); f != nil; f = iter.Next() {
				m[f.FileNum] = struct{}{}
			}
		}
		if v == current {
			break
		}
	}
}

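// addObsoleteLocked appends the specified tables to the obsolete tables list
// and updates the obsolete table metrics. The tables must already be present
// in the zombie tables map. DB.mu must be held when calling this method.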
func (vs *versionSet) addObsoleteLocked(obsolete []*manifest.FileMetadata) {
	for _, fileMeta := range obsolete {
		// Note that the obsolete tables are no longer zombie by the definition of
		// zombie, but we leave them in the zombie tables map until they are
		// deleted from disk.
		if _, ok := vs.zombieTables[fileMeta.FileNum]; !ok {
			vs.opts.Logger.Fatalf("MANIFEST obsolete table %s not marked as zombie", fileMeta.FileNum)
		}
	}
	vs.obsoleteTables = append(vs.obsoleteTables, obsolete...)
	vs.incrementObsoleteTablesLocked(obsolete)
}

func (vs *versionSet) incrementObsoleteTablesLocked(obsolete []*manifest.FileMetadata) {
	for _, fileMeta := range obsolete {
		vs.metrics.Table.ObsoleteCount++
		vs.metrics.Table.ObsoleteSize += fileMeta.Size
	}
}

func setCurrentFunc(
	vers FormatMajorVersion, marker *atomicfs.Marker, fs vfs.FS, dirname string, dir vfs.File,
) func(FileNum) error {
	if vers < formatVersionedManifestMarker {
		// Pebble versions before `formatVersionedManifestMarker` used
		// the CURRENT file to signal which MANIFEST is current. Ignore
		// the filename read during LocateMarker.
		return func(manifestFileNum FileNum) error {
			if err := setCurrentFile(dirname, fs, manifestFileNum); err != nil {
				return err
			}
			if err := dir.Sync(); err != nil {
				// This is a panic here, rather than higher in the call
				// stack, for parity with the atomicfs.Marker behavior.
				// A panic is always necessary because failed Syncs are
				// unrecoverable.
				panic(errors.Wrap(err, "fatal: MANIFEST dirsync failed"))
			}
			return nil
		}
	}
	return setCurrentFuncMarker(marker, fs, dirname)
}

func setCurrentFuncMarker(marker *atomicfs.Marker, fs vfs.FS, dirname string) func(FileNum) error {
	return func(manifestFileNum FileNum) error {
		return marker.Move(base.MakeFilename(fileTypeManifest, manifestFileNum))
	}
}

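// findCurrentManifest locates the current MANIFEST and returns its file
// number. For format major versions below formatVersionedManifestMarker the
// file number is read from the CURRENT file; otherwise the manifest marker is
// the source of truth. The returned marker is non-nil even when no manifest
// exists yet, in which case exists is false.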
func findCurrentManifest(
	vers FormatMajorVersion, fs vfs.FS, dirname string,
) (marker *atomicfs.Marker, manifestNum FileNum, exists bool, err error) {
	// NB: We always locate the manifest marker, even if we might not
	// actually use it (because we're opening the database at an earlier
	// format major version that uses the CURRENT file). Locating a
	// marker should succeed even if the marker has never been placed.
	var filename string
	marker, filename, err = atomicfs.LocateMarker(fs, dirname, manifestMarkerName)
	if err != nil {
		return nil, 0, false, err
	}

	if vers < formatVersionedManifestMarker {
		// Pebble versions before `formatVersionedManifestMarker` used
		// the CURRENT file to signal which MANIFEST is current. Ignore
		// the filename read during LocateMarker.

		manifestNum, err = readCurrentFile(fs, dirname)
		if oserror.IsNotExist(err) {
			return marker, 0, false, nil
		} else if err != nil {
			return marker, 0, false, err
		}
		return marker, manifestNum, true, nil
	}

	// The current format major version is >=
	// formatVersionedManifestMarker indicating that the
	// atomicfs.Marker is the source of truth on the current manifest.

	if filename == "" {
		// The marker hasn't been set yet. This database doesn't exist.
		return marker, 0, false, nil
	}

	var ok bool
	_, manifestNum, ok = base.ParseFilename(fs, filename)
	if !ok {
		return marker, 0, false, base.CorruptionErrorf("bitalostable: MANIFEST name %q is malformed", errors.Safe(filename))
	}
	return marker, manifestNum, true, nil
}

func readCurrentFile(fs vfs.FS, dirname string) (FileNum, error) {
	// Read the CURRENT file to find the current manifest file.
	current, err := fs.Open(base.MakeFilepath(fs, dirname, fileTypeCurrent, 0))
	if err != nil {
		return 0, errors.Wrapf(err, "bitalostable: could not open CURRENT file for DB %q", dirname)
	}
	defer current.Close()
	stat, err := current.Stat()
	if err != nil {
		return 0, err
	}
	n := stat.Size()
	if n == 0 {
		return 0, errors.Errorf("bitalostable: CURRENT file for DB %q is empty", dirname)
	}
	if n > 4096 {
		return 0, errors.Errorf("bitalostable: CURRENT file for DB %q is too large", dirname)
	}
	b := make([]byte, n)
	_, err = current.ReadAt(b, 0)
	if err != nil {
		return 0, err
	}
	if b[n-1] != '\n' {
		return 0, base.CorruptionErrorf("bitalostable: CURRENT file for DB %q is malformed", dirname)
	}
	b = bytes.TrimSpace(b)

	_, manifestFileNum, ok := base.ParseFilename(fs, string(b))
	if !ok {
		return 0, base.CorruptionErrorf("bitalostable: MANIFEST name %q is malformed", errors.Safe(b))
	}
	return manifestFileNum, nil
}

func newFileMetrics(newFiles []manifest.NewFileEntry) map[int]*LevelMetrics {
	m := map[int]*LevelMetrics{}
	for _, nf := range newFiles {
		lm := m[nf.Level]
		if lm == nil {
			lm = &LevelMetrics{}
			m[nf.Level] = lm
		}
		lm.NumFiles++
		lm.Size += int64(nf.Meta.Size)
	}
	return m
}