// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"fmt"
	"io"
	"sync"
	"sync/atomic"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/errors/oserror"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/record"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/pebble/vfs/atomicfs"
)

const numLevels = manifest.NumLevels

const manifestMarkerName = `manifest`

// Provide type aliases for the various manifest structs.
type bulkVersionEdit = manifest.BulkVersionEdit
type deletedFileEntry = manifest.DeletedFileEntry
type fileMetadata = manifest.FileMetadata
type physicalMeta = manifest.PhysicalFileMeta
type virtualMeta = manifest.VirtualFileMeta
type fileBacking = manifest.FileBacking
type newFileEntry = manifest.NewFileEntry
type version = manifest.Version
type versionEdit = manifest.VersionEdit
type versionList = manifest.VersionList
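
// Illustrative sketch of what a versionEdit "delta" carries (the fileMetadata
// value meta for a new L6 sstable is assumed, and the file numbers are
// hypothetical): an edit that swaps table 000012 for the new table could be
// expressed roughly as
//
//	ve := versionEdit{
//		NewFiles: []newFileEntry{{Level: 6, Meta: meta}},
//		DeletedFiles: map[deletedFileEntry]*fileMetadata{
//			{Level: 6, FileNum: 12}: nil, // metadata may be unknown on replay
//		},
//	}
//
// Replaying the MANIFEST amounts to accumulating such edits into a
// bulkVersionEdit and applying the result to an empty version (see load).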

// versionSet manages a collection of immutable versions, and manages the
// creation of a new version from the most recent version. A new version is
// created from an existing version by applying a version edit which is just
// like it sounds: a delta from the previous version. Version edits are logged
// to the MANIFEST file, which is replayed at startup.
type versionSet struct {
	// Next seqNum to use for WAL writes.
	logSeqNum atomic.Uint64

	// The upper bound on sequence numbers that have been assigned so far. A
	// suffix of these sequence numbers may not have been written to a WAL. Both
	// logSeqNum and visibleSeqNum are atomically updated by the commitPipeline.
	// visibleSeqNum is <= logSeqNum.
	visibleSeqNum atomic.Uint64

	// Number of bytes present in sstables being written by in-progress
	// compactions. This value will be zero if there are no in-progress
	// compactions. Updated and read atomically.
	atomicInProgressBytes atomic.Int64

	// Immutable fields.
	dirname string
	// Set to DB.mu.
	mu      *sync.Mutex
	opts    *Options
	fs      vfs.FS
	cmp     Compare
	cmpName string
	// Dynamic base level allows the dynamic base level computation to be
	// disabled. Used by tests which want to create specific LSM structures.
	dynamicBaseLevel bool

	// Mutable fields.
	versions versionList
	picker   compactionPicker

	metrics Metrics

	// A pointer to versionSet.addObsoleteLocked. Avoids allocating a new closure
	// on the creation of every version.
	obsoleteFn        func(obsolete []*fileBacking)
	obsoleteTables    []fileInfo
	obsoleteManifests []fileInfo
	obsoleteOptions   []fileInfo

	// Zombie tables which have been removed from the current version but are
	// still referenced by an in-use iterator.
	zombieTables map[base.DiskFileNum]uint64 // filenum -> size

	// backingState is protected by the versionSet.logLock. It's populated
	// during Open in versionSet.load, but it's not used concurrently during
	// load.
	backingState struct {
		// fileBackingMap is a map for the FileBacking which is supporting virtual
		// sstables in the latest version. Once a file backing no longer backs any
		// virtual sstables in the latest version, it is removed from this map and
		// the corresponding state is added to the zombieTables map. Note that we
		// don't keep track of a file backing which only supports virtual sstables
		// that are not in the latest version.
		fileBackingMap map[base.DiskFileNum]*fileBacking
		// fileBackingSize is the sum of the sizes of the fileBackings in the
		// fileBackingMap.
		fileBackingSize uint64
	}

	// minUnflushedLogNum is the smallest WAL log file number corresponding to
	// mutations that have not been flushed to an sstable.
	minUnflushedLogNum FileNum

	// The next file number. A single counter is used to assign file numbers
	// for the WAL, MANIFEST, sstable, and OPTIONS files.
	nextFileNum FileNum

	// The current manifest file number.
	manifestFileNum FileNum
	manifestMarker  *atomicfs.Marker

	manifestFile          vfs.File
	manifest              *record.Writer
	setCurrent            func(FileNum) error
	getFormatMajorVersion func() FormatMajorVersion

	writing    bool
	writerCond sync.Cond
	// State for deciding when to write a snapshot. Protected by mu.
	rotationHelper record.RotationHelper
}

func (vs *versionSet) init(
	dirname string,
	opts *Options,
	marker *atomicfs.Marker,
	setCurrent func(FileNum) error,
	getFMV func() FormatMajorVersion,
	mu *sync.Mutex,
) {
	vs.dirname = dirname
	vs.mu = mu
	vs.writerCond.L = mu
	vs.opts = opts
	vs.fs = opts.FS
	vs.cmp = opts.Comparer.Compare
	vs.cmpName = opts.Comparer.Name
	vs.dynamicBaseLevel = true
	vs.versions.Init(mu)
	vs.obsoleteFn = vs.addObsoleteLocked
	vs.zombieTables = make(map[base.DiskFileNum]uint64)
	vs.backingState.fileBackingMap = make(map[base.DiskFileNum]*fileBacking)
	vs.backingState.fileBackingSize = 0
	vs.nextFileNum = 1
	vs.manifestMarker = marker
	vs.setCurrent = setCurrent
	vs.getFormatMajorVersion = getFMV
}

// create creates a version set for a fresh DB.
func (vs *versionSet) create(
	jobID int,
	dirname string,
	opts *Options,
	marker *atomicfs.Marker,
	setCurrent func(FileNum) error,
	getFormatMajorVersion func() FormatMajorVersion,
	mu *sync.Mutex,
) error {
	vs.init(dirname, opts, marker, setCurrent, getFormatMajorVersion, mu)
	newVersion := &version{}
	vs.append(newVersion)
	var err error

	vs.picker = newCompactionPicker(newVersion, vs.opts, nil)
	// Note that a "snapshot" version edit is written to the manifest when it is
	// created.
	vs.manifestFileNum = vs.getNextFileNum()
	err = vs.createManifest(vs.dirname, vs.manifestFileNum, vs.minUnflushedLogNum, vs.nextFileNum)
	if err == nil {
		if err = vs.manifest.Flush(); err != nil {
			vs.opts.Logger.Fatalf("MANIFEST flush failed: %v", err)
		}
	}
	if err == nil {
		if err = vs.manifestFile.Sync(); err != nil {
			vs.opts.Logger.Fatalf("MANIFEST sync failed: %v", err)
		}
	}
	if err == nil {
		// NB: setCurrent is responsible for syncing the data directory.
		if err = vs.setCurrent(vs.manifestFileNum); err != nil {
			vs.opts.Logger.Fatalf("MANIFEST set current failed: %v", err)
		}
	}

	vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
		JobID:   jobID,
		Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, vs.manifestFileNum.DiskFileNum()),
		FileNum: vs.manifestFileNum,
		Err:     err,
	})
	if err != nil {
		return err
	}
	return nil
}

// load loads the version set from the manifest file.
func (vs *versionSet) load(
	dirname string,
	opts *Options,
	manifestFileNum FileNum,
	marker *atomicfs.Marker,
	setCurrent func(FileNum) error,
	getFormatMajorVersion func() FormatMajorVersion,
	mu *sync.Mutex,
) error {
	vs.init(dirname, opts, marker, setCurrent, getFormatMajorVersion, mu)

	vs.manifestFileNum = manifestFileNum
	manifestPath := base.MakeFilepath(opts.FS, dirname, fileTypeManifest, vs.manifestFileNum.DiskFileNum())
	manifestFilename := opts.FS.PathBase(manifestPath)

	// Read the versionEdits in the manifest file.
	var bve bulkVersionEdit
	bve.AddedByFileNum = make(map[base.FileNum]*fileMetadata)
	manifest, err := vs.fs.Open(manifestPath)
	if err != nil {
		return errors.Wrapf(err, "pebble: could not open manifest file %q for DB %q",
			errors.Safe(manifestFilename), dirname)
	}
	defer manifest.Close()
	rr := record.NewReader(manifest, 0 /* logNum */)
	for {
		r, err := rr.Next()
		if err == io.EOF || record.IsInvalidRecord(err) {
			break
		}
		if err != nil {
			return errors.Wrapf(err, "pebble: error when loading manifest file %q",
				errors.Safe(manifestFilename))
		}
		var ve versionEdit
		err = ve.Decode(r)
		if err != nil {
			// Break instead of returning an error if the record is corrupted
			// or invalid.
			if err == io.EOF || record.IsInvalidRecord(err) {
				break
			}
			return err
		}
		if ve.ComparerName != "" {
			if ve.ComparerName != vs.cmpName {
				return errors.Errorf("pebble: manifest file %q for DB %q: "+
					"comparer name from file %q != comparer name from Options %q",
					errors.Safe(manifestFilename), dirname, errors.Safe(ve.ComparerName), errors.Safe(vs.cmpName))
			}
		}
		if err := bve.Accumulate(&ve); err != nil {
			return err
		}
		if ve.MinUnflushedLogNum != 0 {
			vs.minUnflushedLogNum = ve.MinUnflushedLogNum
		}
		if ve.NextFileNum != 0 {
			vs.nextFileNum = ve.NextFileNum
		}
		if ve.LastSeqNum != 0 {
			// logSeqNum is the _next_ sequence number that will be assigned,
			// while LastSeqNum is the last assigned sequence number. Note that
			// this behaviour mimics that in RocksDB; the first sequence number
			// assigned is one greater than the one present in the manifest
			// (assuming no WALs contain higher sequence numbers than the
			// manifest's LastSeqNum). Increment LastSeqNum by 1 to get the
			// next sequence number that will be assigned.
			//
			// If LastSeqNum is less than SeqNumStart, increase it to at least
			// SeqNumStart to leave ample room for reserved sequence numbers.
			if ve.LastSeqNum+1 < base.SeqNumStart {
				vs.logSeqNum.Store(base.SeqNumStart)
			} else {
				vs.logSeqNum.Store(ve.LastSeqNum + 1)
			}
		}
	}
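
	// Worked example of the sequence number handling above (illustrative
	// numbers): a manifest whose most recent edit recorded LastSeqNum = 41
	// yields logSeqNum = 42, so the first mutation after Open is assigned
	// sequence number 42. If LastSeqNum is small enough that LastSeqNum+1 is
	// below SeqNumStart, logSeqNum is set to SeqNumStart instead, keeping the
	// reserved low sequence numbers available.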

	// We have already initialized vs.nextFileNum to a non-zero value in
	// vs.init and could have only updated it to some other non-zero value, so
	// it cannot be 0 here.
	if vs.minUnflushedLogNum == 0 {
		if vs.nextFileNum >= 2 {
			// We either have a freshly created DB, or a DB created by RocksDB
			// that has not had a single flushed SSTable yet. This is because
			// RocksDB bumps up nextFileNum in this case without bumping up
			// minUnflushedLogNum, even if WALs with non-zero file numbers are
			// present in the directory.
		} else {
			return base.CorruptionErrorf("pebble: malformed manifest file %q for DB %q",
				errors.Safe(manifestFilename), dirname)
		}
	}
	vs.markFileNumUsed(vs.minUnflushedLogNum)

	// Populate the fileBackingMap and the FileBacking for virtual sstables since
	// we have finished version edit accumulation.
	for _, s := range bve.AddedFileBacking {
		vs.addFileBacking(s)
	}

	for _, fileNum := range bve.RemovedFileBacking {
		vs.removeFileBacking(fileNum)
	}

	newVersion, err := bve.Apply(
		nil, vs.cmp, opts.Comparer.FormatKey, opts.FlushSplitBytes,
		opts.Experimental.ReadCompactionRate, nil, /* zombies */
		getFormatMajorVersion().orderingInvariants(),
	)
	if err != nil {
		return err
	}
	newVersion.L0Sublevels.InitCompactingFileInfo(nil /* in-progress compactions */)
	vs.append(newVersion)

	for i := range vs.metrics.Levels {
		l := &vs.metrics.Levels[i]
		l.NumFiles = int64(newVersion.Levels[i].Len())
		files := newVersion.Levels[i].Slice()
		l.Size = int64(files.SizeSum())
	}

	vs.picker = newCompactionPicker(newVersion, vs.opts, nil)
	return nil
}

func (vs *versionSet) close() error {
	if vs.manifestFile != nil {
		if err := vs.manifestFile.Close(); err != nil {
			return err
		}
	}
	if vs.manifestMarker != nil {
		if err := vs.manifestMarker.Close(); err != nil {
			return err
		}
	}
	return nil
}

// logLock locks the manifest for writing. The lock must be released by either
// a call to logUnlock or logAndApply.
//
// DB.mu must be held when calling this method, but the mutex may be dropped and
// re-acquired during the course of this method.
func (vs *versionSet) logLock() {
	// Wait for any existing writing to the manifest to complete, then mark the
	// manifest as busy.
	for vs.writing {
		vs.writerCond.Wait()
	}
	vs.writing = true
}

// logUnlock releases the lock for manifest writing.
//
// DB.mu must be held when calling this method.
func (vs *versionSet) logUnlock() {
	if !vs.writing {
		vs.opts.Logger.Fatalf("MANIFEST not locked for writing")
	}
	vs.writing = false
	vs.writerCond.Signal()
}
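
// Illustrative usage sketch (an assumption about callers, not taken from this
// file): code that changes the LSM shape follows the lock protocol documented
// on logLock, logUnlock and logAndApply, roughly
//
//	d.mu.Lock()
//	vs.logLock()
//	// ... build a versionEdit ve describing the change ...
//	err := vs.logAndApply(jobID, ve, metrics, false /* forceRotation */, inProgressCompactions)
//	// logAndApply releases the manifest lock (via logUnlock) even on error.
//	d.mu.Unlock()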

// Only call if the DiskFileNum doesn't exist in the fileBackingMap.
func (vs *versionSet) addFileBacking(backing *manifest.FileBacking) {
	_, ok := vs.backingState.fileBackingMap[backing.DiskFileNum]
	if ok {
		panic("pebble: trying to add an existing file backing")
	}
	vs.backingState.fileBackingMap[backing.DiskFileNum] = backing
	vs.backingState.fileBackingSize += backing.Size
}

// Only call if the DiskFileNum exists in the fileBackingMap.
func (vs *versionSet) removeFileBacking(dfn base.DiskFileNum) {
	backing, ok := vs.backingState.fileBackingMap[dfn]
	if !ok {
		panic("pebble: trying to remove an unknown file backing")
	}
	delete(vs.backingState.fileBackingMap, dfn)
	vs.backingState.fileBackingSize -= backing.Size
}
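
// Sketch of the backing lifecycle (an illustrative restatement of the
// backingState comment above; the file numbers are hypothetical): when, e.g.,
// physical sstable 000007 is split into virtual tables 000009 and 000010, the
// FileBacking for 000007 is added here and counted in fileBackingSize; once a
// later version edit removes the last virtual sstable referencing it from the
// latest version, removeFileBacking drops it from the map and the backing is
// tracked in zombieTables until it is deleted from disk.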

// logAndApply logs the version edit to the manifest, applies the version edit
// to the current version, and installs the new version.
//
// DB.mu must be held when calling this method and will be released temporarily
// while performing file I/O. Requires that the manifest is locked for writing
// (see logLock). Will unconditionally release the manifest lock (via
// logUnlock) even if an error occurs.
//
// inProgressCompactions is called while DB.mu is held, to get the list of
// in-progress compactions.
func (vs *versionSet) logAndApply(
	jobID int,
	ve *versionEdit,
	metrics map[int]*LevelMetrics,
	forceRotation bool,
	inProgressCompactions func() []compactionInfo,
) error {
	if !vs.writing {
		vs.opts.Logger.Fatalf("MANIFEST not locked for writing")
	}
	defer vs.logUnlock()

	if ve.MinUnflushedLogNum != 0 {
		if ve.MinUnflushedLogNum < vs.minUnflushedLogNum ||
			vs.nextFileNum <= ve.MinUnflushedLogNum {
			panic(fmt.Sprintf("pebble: inconsistent versionEdit minUnflushedLogNum %d",
				ve.MinUnflushedLogNum))
		}
	}

	// This is the next manifest filenum, but if the current file is too big we
	// will write this ve to the next file which means what ve encodes is the
	// current filenum and not the next one.
	//
	// TODO(sbhola): figure out why this is correct and update comment.
	ve.NextFileNum = vs.nextFileNum

	// LastSeqNum is set to the current upper bound on the assigned sequence
	// numbers. Note that this is exactly the behavior of RocksDB. LastSeqNum is
	// used to initialize versionSet.logSeqNum and versionSet.visibleSeqNum on
	// replay. It must be higher than or equal to any sequence number
	// written to an sstable, including sequence numbers in ingested files.
	// Note that LastSeqNum is not (and cannot be) the minimum unflushed sequence
	// number. This is fallout from ingestion which allows a sequence number X to
	// be assigned to an ingested sstable even though sequence number X-1 resides
	// in an unflushed memtable. logSeqNum is the _next_ sequence number that
	// will be assigned, so subtract 1 from it to get the upper bound on the
	// last assigned sequence number.
	logSeqNum := vs.logSeqNum.Load()
	ve.LastSeqNum = logSeqNum - 1
	if logSeqNum == 0 {
		// logSeqNum is initialized to 1 in Open() if there are no previous WAL
		// or manifest records, so this case should never happen.
		vs.opts.Logger.Fatalf("logSeqNum must be a positive integer: %d", logSeqNum)
	}

	currentVersion := vs.currentVersion()
	fmv := vs.getFormatMajorVersion()
	orderingInvariants := fmv.orderingInvariants()
	var newVersion *version

	// Generate a new manifest if we don't currently have one, or forceRotation
	// is true, or the current one is too large.
	//
	// For largeness, we do not exclusively use the MaxManifestFileSize size
	// threshold since we have had incidents where due to either large keys or
	// large numbers of files, each edit results in a snapshot + write of the
	// edit. This slows the system down since each flush or compaction is
	// writing a new manifest snapshot. The primary goal of the size-based
	// rollover logic is to ensure that when reopening a DB, the number of edits
	// that need to be replayed on top of the snapshot is "sane". Rolling over
	// to a new manifest after each edit is not relevant to that goal.
	//
	// Consider the following cases:
	// - The number of live files F in the DB is roughly stable: after writing
	//   the snapshot (with F files), say we require that there be enough edits
	//   such that the cumulative number of files in those edits, E, be greater
	//   than F. This will ensure that the total amount of time in logAndApply
	//   that is spent in snapshot writing is ~50%.
	//
	// - The number of live files F in the DB is shrinking drastically, say from
	//   F to F/10: This can happen for various reasons, like wide range
	//   tombstones, or large numbers of smaller than usual files that are being
	//   merged together into larger files. And say the new files generated
	//   during this shrinkage are insignificant compared to F/10, and so for
	//   this example we will assume it is effectively 0. After this shrinking,
	//   E = 0.9F, and so if we used the previous snapshot file count, F, as the
	//   threshold that needs to be exceeded, we will further delay the snapshot
	//   writing. Which means on DB reopen we will need to replay 0.9F edits to
	//   get to a version with 0.1F files. It would be better to create a new
	//   snapshot when E exceeds the number of files in the current version.
	//
	// - The number of live files F in the DB is growing via perfect ingests
	//   into L6: Say we wrote the snapshot when there were F files and now we
	//   have 10F files, so E = 9F. We will further delay writing a new
	//   snapshot. This case can be critiqued as contrived, but we consider it
	//   nonetheless.
	//
	// The logic below uses the min of the last snapshot file count and the file
	// count in the current version.
	vs.rotationHelper.AddRecord(int64(len(ve.DeletedFiles) + len(ve.NewFiles)))
	sizeExceeded := vs.manifest.Size() >= vs.opts.MaxManifestFileSize
	requireRotation := forceRotation || vs.manifest == nil

	var nextSnapshotFilecount int64
	for i := range vs.metrics.Levels {
		nextSnapshotFilecount += vs.metrics.Levels[i].NumFiles
	}
	if sizeExceeded && !requireRotation {
		requireRotation = vs.rotationHelper.ShouldRotate(nextSnapshotFilecount)
	}
	var newManifestFileNum FileNum
	var prevManifestFileSize uint64
	if requireRotation {
		newManifestFileNum = vs.getNextFileNum()
		prevManifestFileSize = uint64(vs.manifest.Size())
	}
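
	// Worked example of the rotation heuristic (an illustrative reading of the
	// comment above, with made-up numbers): if the last snapshot described
	// F = 1000 files and the edits accumulated since then have added/deleted a
	// cumulative E = 300 files, crossing the size threshold alone does not
	// rotate. Once E exceeds min(F, current file count) -- e.g. E = 1100 with a
	// stable file count, or E = 150 after the version shrinks to 120 files --
	// the next size-triggered check rotates and writes a fresh snapshot.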

	// Grab certain values before releasing vs.mu, in case createManifest() needs
	// to be called.
	minUnflushedLogNum := vs.minUnflushedLogNum
	nextFileNum := vs.nextFileNum

	var zombies map[base.DiskFileNum]uint64
	if err := func() error {
		vs.mu.Unlock()
		defer vs.mu.Lock()

		var err error
		if vs.getFormatMajorVersion() < FormatVirtualSSTables && len(ve.CreatedBackingTables) > 0 {
			return errors.AssertionFailedf("MANIFEST cannot contain virtual sstable records due to format major version")
		}
		newVersion, zombies, err = manifest.AccumulateIncompleteAndApplySingleVE(
			ve, currentVersion, vs.cmp, vs.opts.Comparer.FormatKey,
			vs.opts.FlushSplitBytes, vs.opts.Experimental.ReadCompactionRate,
			vs.backingState.fileBackingMap, vs.addFileBacking, vs.removeFileBacking,
			orderingInvariants,
		)
		if err != nil {
			return errors.Wrap(err, "MANIFEST apply failed")
		}

		if newManifestFileNum != 0 {
			if err := vs.createManifest(vs.dirname, newManifestFileNum, minUnflushedLogNum, nextFileNum); err != nil {
				vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
					JobID:   jobID,
					Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, newManifestFileNum.DiskFileNum()),
					FileNum: newManifestFileNum,
					Err:     err,
				})
				return errors.Wrap(err, "MANIFEST create failed")
			}
		}

		w, err := vs.manifest.Next()
		if err != nil {
			return errors.Wrap(err, "MANIFEST next record write failed")
		}

		// NB: Any error from this point on is considered fatal as we don't know if
		// the MANIFEST write occurred or not. Trying to determine that is
		// fraught. Instead we rely on the standard recovery mechanism run when a
		// database is opened. In particular, that mechanism generates a new
		// MANIFEST and ensures it is synced.
		if err := ve.Encode(w); err != nil {
			return errors.Wrap(err, "MANIFEST write failed")
		}
		if err := vs.manifest.Flush(); err != nil {
			return errors.Wrap(err, "MANIFEST flush failed")
		}
		if err := vs.manifestFile.Sync(); err != nil {
			return errors.Wrap(err, "MANIFEST sync failed")
		}
		if newManifestFileNum != 0 {
			// NB: setCurrent is responsible for syncing the data directory.
			if err := vs.setCurrent(newManifestFileNum); err != nil {
				return errors.Wrap(err, "MANIFEST set current failed")
			}
			vs.opts.EventListener.ManifestCreated(ManifestCreateInfo{
				JobID:   jobID,
				Path:    base.MakeFilepath(vs.fs, vs.dirname, fileTypeManifest, newManifestFileNum.DiskFileNum()),
				FileNum: newManifestFileNum,
			})
		}
		return nil
	}(); err != nil {
		// Any error encountered during any of the operations in the previous
		// closure is considered fatal. Treating such errors as fatal is preferred
		// to attempting to unwind various file and b-tree reference counts, and
		// re-generating L0 sublevel metadata. This may change in the future, if
		// certain manifest / WAL operations become retryable. For more context, see
		// #1159 and #1792.
		vs.opts.Logger.Fatalf("%s", err)
		return err
	}

	if requireRotation {
		// Successfully rotated.
		vs.rotationHelper.Rotate(nextSnapshotFilecount)
	}
	// Now that DB.mu is held again, initialize compacting file info in
	// L0Sublevels.
	inProgress := inProgressCompactions()

	newVersion.L0Sublevels.InitCompactingFileInfo(inProgressL0Compactions(inProgress))

	// Update the zombie tables set first, as installation of the new version
	// will unref the previous version which could result in addObsoleteLocked
	// being called.
	for fileNum, size := range zombies {
		vs.zombieTables[fileNum] = size
	}

	// Install the new version.
	vs.append(newVersion)
	if ve.MinUnflushedLogNum != 0 {
		vs.minUnflushedLogNum = ve.MinUnflushedLogNum
	}
	if newManifestFileNum != 0 {
		if vs.manifestFileNum != 0 {
			vs.obsoleteManifests = append(vs.obsoleteManifests, fileInfo{
				fileNum:  vs.manifestFileNum.DiskFileNum(),
				fileSize: prevManifestFileSize,
			})
		}
		vs.manifestFileNum = newManifestFileNum
	}

	for level, update := range metrics {
		vs.metrics.Levels[level].Add(update)
	}
	for i := range vs.metrics.Levels {
		l := &vs.metrics.Levels[i]
		l.NumFiles = int64(newVersion.Levels[i].Len())
		l.NumVirtualFiles = newVersion.Levels[i].NumVirtual
		l.VirtualSize = newVersion.Levels[i].VirtualSize
		l.Size = int64(newVersion.Levels[i].Size())

		l.Sublevels = 0
		if l.NumFiles > 0 {
			l.Sublevels = 1
		}
		if invariants.Enabled {
			levelFiles := newVersion.Levels[i].Slice()
			if size := int64(levelFiles.SizeSum()); l.Size != size {
				vs.opts.Logger.Fatalf("versionSet metrics L%d Size = %d, actual size = %d", i, l.Size, size)
			}
			if nVirtual := levelFiles.NumVirtual(); nVirtual != l.NumVirtualFiles {
				vs.opts.Logger.Fatalf(
					"versionSet metrics L%d NumVirtual = %d, actual NumVirtual = %d",
					i, l.NumVirtualFiles, nVirtual,
				)
			}
			if vSize := levelFiles.VirtualSizeSum(); vSize != l.VirtualSize {
				vs.opts.Logger.Fatalf(
					"versionSet metrics L%d Virtual size = %d, actual size = %d",
					i, l.VirtualSize, vSize,
				)
			}
		}
	}
	vs.metrics.Levels[0].Sublevels = int32(len(newVersion.L0SublevelFiles))

	vs.picker = newCompactionPicker(newVersion, vs.opts, inProgress)
	if !vs.dynamicBaseLevel {
		vs.picker.forceBaseLevel1()
	}
	return nil
}

func (vs *versionSet) incrementCompactions(
	kind compactionKind, extraLevels []*compactionLevel, pickerMetrics compactionPickerMetrics,
) {
	switch kind {
	case compactionKindDefault:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.DefaultCount++

	case compactionKindFlush, compactionKindIngestedFlushable:
		vs.metrics.Flush.Count++

	case compactionKindMove:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.MoveCount++

	case compactionKindDeleteOnly:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.DeleteOnlyCount++

	case compactionKindElisionOnly:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.ElisionOnlyCount++

	case compactionKindRead:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.ReadCount++

	case compactionKindRewrite:
		vs.metrics.Compact.Count++
		vs.metrics.Compact.RewriteCount++
	}
	if len(extraLevels) > 0 {
		vs.metrics.Compact.MultiLevelCount++
	}
}

func (vs *versionSet) incrementCompactionBytes(numBytes int64) {
	vs.atomicInProgressBytes.Add(numBytes)
}

// createManifest creates a manifest file that contains a snapshot of vs.
func (vs *versionSet) createManifest(
	dirname string, fileNum, minUnflushedLogNum, nextFileNum FileNum,
) (err error) {
	var (
		filename     = base.MakeFilepath(vs.fs, dirname, fileTypeManifest, fileNum.DiskFileNum())
		manifestFile vfs.File
		manifest     *record.Writer
	)
	defer func() {
		if manifest != nil {
			manifest.Close()
		}
		if manifestFile != nil {
			manifestFile.Close()
		}
		if err != nil {
			vs.fs.Remove(filename)
		}
	}()
	manifestFile, err = vs.fs.Create(filename)
	if err != nil {
		return err
	}
	manifest = record.NewWriter(manifestFile)

	snapshot := versionEdit{
		ComparerName: vs.cmpName,
	}
	dedup := make(map[base.DiskFileNum]struct{})
	for level, levelMetadata := range vs.currentVersion().Levels {
		iter := levelMetadata.Iter()
		for meta := iter.First(); meta != nil; meta = iter.Next() {
			snapshot.NewFiles = append(snapshot.NewFiles, newFileEntry{
				Level: level,
				Meta:  meta,
			})
			if _, ok := dedup[meta.FileBacking.DiskFileNum]; meta.Virtual && !ok {
				dedup[meta.FileBacking.DiskFileNum] = struct{}{}
				snapshot.CreatedBackingTables = append(
					snapshot.CreatedBackingTables,
					meta.FileBacking,
				)
			}
		}
	}

	// When creating a version snapshot for an existing DB, this snapshot
	// VersionEdit will be immediately followed by another VersionEdit (being
	// written in logAndApply()). That VersionEdit always contains a LastSeqNum,
	// so we don't need to include that in the snapshot. But it does not
	// necessarily include MinUnflushedLogNum, NextFileNum, so we initialize
	// those using the corresponding fields in the versionSet (which came from
	// the latest preceding VersionEdit that had those fields).
	snapshot.MinUnflushedLogNum = minUnflushedLogNum
	snapshot.NextFileNum = nextFileNum

	w, err1 := manifest.Next()
	if err1 != nil {
		return err1
	}
	if err := snapshot.Encode(w); err != nil {
		return err
	}

	if vs.manifest != nil {
		vs.manifest.Close()
		vs.manifest = nil
	}
	if vs.manifestFile != nil {
		if err := vs.manifestFile.Close(); err != nil {
			return err
		}
		vs.manifestFile = nil
	}

	vs.manifest, manifest = manifest, nil
	vs.manifestFile, manifestFile = manifestFile, nil
	return nil
}

func (vs *versionSet) markFileNumUsed(fileNum FileNum) {
	if vs.nextFileNum <= fileNum {
		vs.nextFileNum = fileNum + 1
	}
}

func (vs *versionSet) getNextFileNum() FileNum {
	x := vs.nextFileNum
	vs.nextFileNum++
	return x
}
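
// Illustrative note on file number allocation (the numbers are hypothetical):
// the counter is shared by every file kind, so after
//
//	vs.markFileNumUsed(7)    // nextFileNum becomes 8 if it was <= 7
//	n := vs.getNextFileNum() // n == 8, nextFileNum becomes 9
//
// the WAL, MANIFEST, sstable and OPTIONS files all receive distinct numbers
// drawn from the same sequence.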

func (vs *versionSet) append(v *version) {
	if v.Refs() != 0 {
		panic("pebble: version should be unreferenced")
	}
	if !vs.versions.Empty() {
		vs.versions.Back().UnrefLocked()
	}
	v.Deleted = vs.obsoleteFn
	v.Ref()
	vs.versions.PushBack(v)
}

func (vs *versionSet) currentVersion() *version {
	return vs.versions.Back()
}

func (vs *versionSet) addLiveFileNums(m map[base.DiskFileNum]struct{}) {
	current := vs.currentVersion()
	for v := vs.versions.Front(); true; v = v.Next() {
		for _, lm := range v.Levels {
			iter := lm.Iter()
			for f := iter.First(); f != nil; f = iter.Next() {
				m[f.FileBacking.DiskFileNum] = struct{}{}
			}
		}
		if v == current {
			break
		}
	}
}

// addObsoleteLocked will add the fileInfo associated with obsolete backing
// sstables to the obsolete tables list.
//
// The file backings in the obsolete list must not appear more than once.
//
// DB.mu must be held when addObsoleteLocked is called.
func (vs *versionSet) addObsoleteLocked(obsolete []*fileBacking) {
	if len(obsolete) == 0 {
		return
	}

	obsoleteFileInfo := make([]fileInfo, len(obsolete))
	for i, bs := range obsolete {
		obsoleteFileInfo[i].fileNum = bs.DiskFileNum
		obsoleteFileInfo[i].fileSize = bs.Size
	}

	if invariants.Enabled {
		dedup := make(map[base.DiskFileNum]struct{})
		for _, fi := range obsoleteFileInfo {
			dedup[fi.fileNum] = struct{}{}
		}
		if len(dedup) != len(obsoleteFileInfo) {
			panic("pebble: duplicate FileBacking present in obsolete list")
		}
	}

	for _, fi := range obsoleteFileInfo {
		// Note that the obsolete tables are no longer zombie by the definition of
		// zombie, but we leave them in the zombie tables map until they are
		// deleted from disk.
		if _, ok := vs.zombieTables[fi.fileNum]; !ok {
			vs.opts.Logger.Fatalf("MANIFEST obsolete table %s not marked as zombie", fi.fileNum)
		}
	}

	vs.obsoleteTables = append(vs.obsoleteTables, obsoleteFileInfo...)
	vs.updateObsoleteTableMetricsLocked()
}

// addObsolete will acquire DB.mu, so DB.mu must not be held when this is
// called.
func (vs *versionSet) addObsolete(obsolete []*fileBacking) {
	vs.mu.Lock()
	defer vs.mu.Unlock()
	vs.addObsoleteLocked(obsolete)
}

func (vs *versionSet) updateObsoleteTableMetricsLocked() {
	vs.metrics.Table.ObsoleteCount = int64(len(vs.obsoleteTables))
	vs.metrics.Table.ObsoleteSize = 0
	for _, fi := range vs.obsoleteTables {
		vs.metrics.Table.ObsoleteSize += fi.fileSize
	}
}

func setCurrentFunc(
	vers FormatMajorVersion, marker *atomicfs.Marker, fs vfs.FS, dirname string, dir vfs.File,
) func(FileNum) error {
	if vers < formatVersionedManifestMarker {
		// Pebble versions before `formatVersionedManifestMarker` used
		// the CURRENT file to signal which MANIFEST is current. Ignore
		// the filename read during LocateMarker.
		return func(manifestFileNum FileNum) error {
			if err := setCurrentFile(dirname, fs, manifestFileNum.DiskFileNum()); err != nil {
				return err
			}
			if err := dir.Sync(); err != nil {
				// This is a panic here, rather than higher in the call
				// stack, for parity with the atomicfs.Marker behavior.
				// A panic is always necessary because failed Syncs are
				// unrecoverable.
				panic(errors.Wrap(err, "fatal: MANIFEST dirsync failed"))
			}
			return nil
		}
	}
	return setCurrentFuncMarker(marker, fs, dirname)
}

func setCurrentFuncMarker(marker *atomicfs.Marker, fs vfs.FS, dirname string) func(FileNum) error {
	return func(manifestFileNum FileNum) error {
		return marker.Move(base.MakeFilename(fileTypeManifest, manifestFileNum.DiskFileNum()))
	}
}
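
// Illustrative note (an inference from readCurrentFile below, not a statement
// of the exact on-disk format): the legacy CURRENT file holds a single
// newline-terminated line naming the manifest, e.g.
//
//	MANIFEST-000017
//
// while at format major versions >= formatVersionedManifestMarker the same
// information is carried by the atomicfs.Marker named `manifest`, which
// setCurrentFuncMarker moves to point at the manifest filename.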

func findCurrentManifest(
	vers FormatMajorVersion, fs vfs.FS, dirname string,
) (marker *atomicfs.Marker, manifestNum base.DiskFileNum, exists bool, err error) {
	// NB: We always locate the manifest marker, even if we might not
	// actually use it (because we're opening the database at an earlier
	// format major version that uses the CURRENT file). Locating a
	// marker should succeed even if the marker has never been placed.
	var filename string
	marker, filename, err = atomicfs.LocateMarker(fs, dirname, manifestMarkerName)
	if err != nil {
		return nil, base.FileNum(0).DiskFileNum(), false, err
	}

	if vers < formatVersionedManifestMarker {
		// Pebble versions before `formatVersionedManifestMarker` used
		// the CURRENT file to signal which MANIFEST is current. Ignore
		// the filename read during LocateMarker.

		manifestNum, err = readCurrentFile(fs, dirname)
		if oserror.IsNotExist(err) {
			return marker, base.FileNum(0).DiskFileNum(), false, nil
		} else if err != nil {
			return marker, base.FileNum(0).DiskFileNum(), false, err
		}
		return marker, manifestNum, true, nil
	}

	// The current format major version is >=
	// formatVersionedManifestMarker indicating that the
	// atomicfs.Marker is the source of truth on the current manifest.

	if filename == "" {
		// The marker hasn't been set yet. This database doesn't exist.
		return marker, base.FileNum(0).DiskFileNum(), false, nil
	}

	var ok bool
	_, manifestNum, ok = base.ParseFilename(fs, filename)
	if !ok {
		return marker, base.FileNum(0).DiskFileNum(), false, base.CorruptionErrorf("pebble: MANIFEST name %q is malformed", errors.Safe(filename))
	}
	return marker, manifestNum, true, nil
}

func readCurrentFile(fs vfs.FS, dirname string) (base.DiskFileNum, error) {
	// Read the CURRENT file to find the current manifest file.
	current, err := fs.Open(base.MakeFilepath(fs, dirname, fileTypeCurrent, base.FileNum(0).DiskFileNum()))
	if err != nil {
		return base.FileNum(0).DiskFileNum(), errors.Wrapf(err, "pebble: could not open CURRENT file for DB %q", dirname)
	}
	defer current.Close()
	stat, err := current.Stat()
	if err != nil {
		return base.FileNum(0).DiskFileNum(), err
	}
	n := stat.Size()
	if n == 0 {
		return base.FileNum(0).DiskFileNum(), errors.Errorf("pebble: CURRENT file for DB %q is empty", dirname)
	}
	if n > 4096 {
		return base.FileNum(0).DiskFileNum(), errors.Errorf("pebble: CURRENT file for DB %q is too large", dirname)
	}
	b := make([]byte, n)
	_, err = current.ReadAt(b, 0)
	if err != nil {
		return base.FileNum(0).DiskFileNum(), err
	}
	if b[n-1] != '\n' {
		return base.FileNum(0).DiskFileNum(), base.CorruptionErrorf("pebble: CURRENT file for DB %q is malformed", dirname)
	}
	b = bytes.TrimSpace(b)

	_, manifestFileNum, ok := base.ParseFilename(fs, string(b))
	if !ok {
		return base.FileNum(0).DiskFileNum(), base.CorruptionErrorf("pebble: MANIFEST name %q is malformed", errors.Safe(b))
	}
	return manifestFileNum, nil
}

func newFileMetrics(newFiles []manifest.NewFileEntry) map[int]*LevelMetrics {
	m := map[int]*LevelMetrics{}
	for _, nf := range newFiles {
		lm := m[nf.Level]
		if lm == nil {
			lm = &LevelMetrics{}
			m[nf.Level] = lm
		}
		lm.NumFiles++
		lm.Size += int64(nf.Meta.Size)
	}
	return m
}
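
// Illustrative note: the map produced by newFileMetrics matches the metrics
// parameter of logAndApply, so a caller that has just built a versionEdit ve
// can derive the per-level deltas with
//
//	metrics := newFileMetrics(ve.NewFiles)
//
// and pass them along with the edit (see the usage sketch near logUnlock).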