github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/format_major_version.go (about) 1 // Copyright 2021 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package bitalostable 6 7 import ( 8 "fmt" 9 "strconv" 10 11 "github.com/cockroachdb/errors" 12 "github.com/zuoyebang/bitalostable/internal/base" 13 "github.com/zuoyebang/bitalostable/internal/manifest" 14 "github.com/zuoyebang/bitalostable/sstable" 15 "github.com/zuoyebang/bitalostable/vfs" 16 "github.com/zuoyebang/bitalostable/vfs/atomicfs" 17 ) 18 19 // FormatMajorVersion is a constant controlling the format of persisted 20 // data. Backwards incompatible changes to durable formats are gated 21 // behind new format major versions. 22 // 23 // At any point, a database's format major version may be bumped. 24 // However, once a database's format major version is increased, 25 // previous versions of Pebble will refuse to open the database. 26 // 27 // The zero value format is the FormatDefault constant. The exact 28 // FormatVersion that the default corresponds to may change with time. 29 type FormatMajorVersion uint64 30 31 // String implements fmt.Stringer. 32 func (v FormatMajorVersion) String() string { 33 // NB: This must not change. It's used as the value for the the 34 // on-disk version marker file. 35 // 36 // Specifically, this value must always parse as a base 10 integer 37 // that fits in a uint64. We format it as zero-padded, 3-digit 38 // number today, but the padding may change. 39 return fmt.Sprintf("%03d", v) 40 } 41 42 const ( 43 // 21.2 versions. 44 45 // FormatDefault leaves the format version unspecified. The 46 // FormatDefault constant may be ratcheted upwards over time. 47 FormatDefault FormatMajorVersion = iota 48 // FormatMostCompatible maintains the most backwards compatibility, 49 // maintaining bi-directional compatibility with RocksDB 6.2.1 in 50 // the particular configuration described in the Pebble README. 51 FormatMostCompatible 52 // formatVersionedManifestMarker is the first 53 // backwards-incompatible change made to Pebble, introducing the 54 // format-version marker file for handling backwards-incompatible 55 // changes more broadly, and replacing the `CURRENT` file with a 56 // marker file. 57 // 58 // This format version is intended as an intermediary version state. 59 // It is deliberately unexported to discourage direct use of this 60 // format major version. Clients should use FormatVersioned which 61 // also ensures earlier versions of Pebble fail to open a database 62 // written in a future format major version. 63 formatVersionedManifestMarker 64 // FormatVersioned is a new format major version that replaces the 65 // old `CURRENT` file with a new 'marker' file scheme. Previous 66 // Pebble versions will be unable to open the database unless 67 // they're aware of format versions. 68 FormatVersioned 69 // FormatSetWithDelete is a format major version that introduces a new key 70 // kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be 71 // unable to open this database. 72 FormatSetWithDelete 73 74 // 22.1 versions. 75 76 // FormatBlockPropertyCollector is a format major version that introduces 77 // BlockPropertyCollectors. 78 FormatBlockPropertyCollector 79 // FormatSplitUserKeysMarked is a format major version that guarantees that 80 // all files that share user keys with neighbors are marked for compaction 81 // in the manifest. Ratcheting to FormatSplitUserKeysMarked will block 82 // (without holding mutexes) until the scan of the LSM is complete and the 83 // manifest has been rotated. 84 FormatSplitUserKeysMarked 85 86 // 22.2 versions. 87 88 // FormatSplitUserKeysMarkedCompacted is a format major version that 89 // guarantees that all files explicitly marked for compaction in the manifest 90 // have been compacted. Combined with the FormatSplitUserKeysMarked format 91 // major version, this version guarantees that there are no user keys split 92 // across multiple files within a level L1+. Ratcheting to this format version 93 // will block (without holding mutexes) until all necessary compactions for 94 // files marked for compaction are complete. 95 FormatSplitUserKeysMarkedCompacted 96 // FormatRangeKeys is a format major version that introduces range keys. 97 FormatRangeKeys 98 // FormatMinTableFormatPebblev1 is a format major version that guarantees that 99 // tables created by or ingested into the DB at or above this format major 100 // version will have a table format version of at least Pebblev1 (Block 101 // Properties). 102 FormatMinTableFormatPebblev1 103 // FormatPrePebblev1Marked is a format major version that guarantees that all 104 // sstables with a table format version pre-Pebblev1 (i.e. those that are 105 // guaranteed to not contain block properties) are marked for compaction in 106 // the manifest. Ratcheting to FormatPrePebblev1Marked will block (without 107 // holding mutexes) until the scan of the LSM is complete and the manifest has 108 // been rotated. 109 FormatPrePebblev1Marked 110 111 // 23.1 versions. 112 113 // FormatPrePebblev1MarkedCompacted is a format major version that 114 // guarantees that all sstables explicitly marked for compaction in the 115 // manifest have been compacted. Ratcheting to this format version will block 116 // (without holding mutexes) until all necessary compactions for files marked 117 // for compaction are complete. 118 FormatPrePebblev1MarkedCompacted 119 120 // FormatNewest always contains the most recent format major version. 121 // NB: When adding new versions, the MaxTableFormat method should also be 122 // updated to return the maximum allowable version for the new 123 // FormatMajorVersion. 124 FormatNewest FormatMajorVersion = FormatPrePebblev1MarkedCompacted 125 ) 126 127 // MaxTableFormat returns the maximum sstable.TableFormat that can be used at 128 // this FormatMajorVersion. 129 func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat { 130 switch v { 131 case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker, 132 FormatVersioned, FormatSetWithDelete: 133 return sstable.TableFormatRocksDBv2 134 case FormatBlockPropertyCollector, FormatSplitUserKeysMarked, 135 FormatSplitUserKeysMarkedCompacted: 136 return sstable.TableFormatPebblev1 137 case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, 138 FormatPrePebblev1MarkedCompacted: 139 return sstable.TableFormatPebblev2 140 default: 141 panic(fmt.Sprintf("bitalostable: unsupported format major version: %s", v)) 142 } 143 } 144 145 // MinTableFormat returns the minimum sstable.TableFormat that can be used at 146 // this FormatMajorVersion. 147 func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat { 148 switch v { 149 case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker, 150 FormatVersioned, FormatSetWithDelete, FormatBlockPropertyCollector, 151 FormatSplitUserKeysMarked, FormatSplitUserKeysMarkedCompacted, 152 FormatRangeKeys: 153 return sstable.TableFormatLevelDB 154 case FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, 155 FormatPrePebblev1MarkedCompacted: 156 return sstable.TableFormatPebblev1 157 default: 158 panic(fmt.Sprintf("bitalostable: unsupported format major version: %s", v)) 159 } 160 } 161 162 // formatMajorVersionMigrations defines the migrations from one format 163 // major version to the next. Each migration is defined as a closure 164 // which will be invoked on the database before the new format major 165 // version is committed. Migrations must be idempotent. Migrations are 166 // invoked with d.mu locked. 167 // 168 // Each migration is responsible for invoking finalizeFormatVersUpgrade 169 // to set the new format major version. RatchetFormatMajorVersion will 170 // panic if a migration returns a nil error but fails to finalize the 171 // new format major version. 172 var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{ 173 FormatMostCompatible: func(d *DB) error { return nil }, 174 formatVersionedManifestMarker: func(d *DB) error { 175 // formatVersionedManifestMarker introduces the use of a marker 176 // file for pointing to the current MANIFEST file. 177 178 // Lock the manifest. 179 d.mu.versions.logLock() 180 defer d.mu.versions.logUnlock() 181 182 // Construct the filename of the currently active manifest and 183 // move the manifest marker to that filename. The marker is 184 // guaranteed to exist, because we unconditionally locate it 185 // during Open. 186 manifestFileNum := d.mu.versions.manifestFileNum 187 filename := base.MakeFilename(fileTypeManifest, manifestFileNum) 188 if err := d.mu.versions.manifestMarker.Move(filename); err != nil { 189 return errors.Wrap(err, "moving manifest marker") 190 } 191 192 // Now that we have a manifest marker file in place and pointing 193 // to the current MANIFEST, finalize the upgrade. If we fail for 194 // some reason, a retry of this migration is guaranteed to again 195 // move the manifest marker file to the latest manifest. If 196 // we're unable to finalize the upgrade, a subsequent call to 197 // Open will ignore the manifest marker. 198 if err := d.finalizeFormatVersUpgrade(formatVersionedManifestMarker); err != nil { 199 return err 200 } 201 202 // We've finalized the upgrade. All subsequent Open calls will 203 // ignore the CURRENT file and instead read the manifest marker. 204 // Before we unlock the manifest, we need to update versionSet 205 // to use the manifest marker on future rotations. 206 d.mu.versions.setCurrent = setCurrentFuncMarker( 207 d.mu.versions.manifestMarker, 208 d.mu.versions.fs, 209 d.mu.versions.dirname) 210 return nil 211 }, 212 // The FormatVersioned version is split into two, each with their 213 // own migration to ensure the post-migration cleanup happens even 214 // if there's a crash immediately after finalizing the version. Once 215 // a new format major version is finalized, its migration will never 216 // run again. Post-migration cleanup like the one in the migration 217 // below must be performed in a separate migration or every time the 218 // database opens. 219 FormatVersioned: func(d *DB) error { 220 // Replace the `CURRENT` file with one that points to the 221 // nonexistent `MANIFEST-000000` file. If an earlier Pebble 222 // version that does not know about format major versions 223 // attempts to open the database, it will error avoiding 224 // accidental corruption. 225 if err := setCurrentFile(d.mu.versions.dirname, d.mu.versions.fs, 0); err != nil { 226 return err 227 } 228 return d.finalizeFormatVersUpgrade(FormatVersioned) 229 }, 230 // As SetWithDelete is a new key kind, there is nothing to migrate. We can 231 // simply finalize the format version and we're done. 232 FormatSetWithDelete: func(d *DB) error { 233 return d.finalizeFormatVersUpgrade(FormatSetWithDelete) 234 }, 235 FormatBlockPropertyCollector: func(d *DB) error { 236 return d.finalizeFormatVersUpgrade(FormatBlockPropertyCollector) 237 }, 238 FormatSplitUserKeysMarked: func(d *DB) error { 239 // Mark any unmarked files with split-user keys. Note all format major 240 // versions migrations are invoked with DB.mu locked. 241 if err := d.markFilesLocked(markFilesWithSplitUserKeys(d.opts.Comparer.Equal)); err != nil { 242 return err 243 } 244 return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarked) 245 }, 246 FormatSplitUserKeysMarkedCompacted: func(d *DB) error { 247 // Before finalizing the format major version, rewrite any sstables 248 // still marked for compaction. Note all format major versions 249 // migrations are invoked with DB.mu locked. 250 if err := d.compactMarkedFilesLocked(); err != nil { 251 return err 252 } 253 return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarkedCompacted) 254 }, 255 FormatRangeKeys: func(d *DB) error { 256 return d.finalizeFormatVersUpgrade(FormatRangeKeys) 257 }, 258 FormatMinTableFormatPebblev1: func(d *DB) error { 259 return d.finalizeFormatVersUpgrade(FormatMinTableFormatPebblev1) 260 }, 261 FormatPrePebblev1Marked: func(d *DB) error { 262 // Mark any unmarked files that contain only table properties. Note all 263 // format major versions migrations are invoked with DB.mu locked. 264 if err := d.markFilesLocked(markFilesPrePebblev1(d.tableCache)); err != nil { 265 return err 266 } 267 return d.finalizeFormatVersUpgrade(FormatPrePebblev1Marked) 268 }, 269 FormatPrePebblev1MarkedCompacted: func(d *DB) error { 270 // Before finalizing the format major version, rewrite any sstables 271 // still marked for compaction. Note all format major versions 272 // migrations are invoked with DB.mu locked. 273 if err := d.compactMarkedFilesLocked(); err != nil { 274 return err 275 } 276 return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted) 277 }, 278 } 279 280 const formatVersionMarkerName = `format-version` 281 282 func lookupFormatMajorVersion( 283 fs vfs.FS, dirname string, 284 ) (FormatMajorVersion, *atomicfs.Marker, error) { 285 m, versString, err := atomicfs.LocateMarker(fs, dirname, formatVersionMarkerName) 286 if err != nil { 287 return 0, nil, err 288 } 289 if versString == "" { 290 return FormatMostCompatible, m, nil 291 } 292 v, err := strconv.ParseUint(versString, 10, 64) 293 if err != nil { 294 return 0, nil, errors.Wrap(err, "parsing format major version") 295 } 296 vers := FormatMajorVersion(v) 297 if vers == FormatDefault { 298 return 0, nil, errors.Newf("bitalostable: default format major version should not persisted", vers) 299 } 300 if vers > FormatNewest { 301 return 0, nil, errors.Newf("bitalostable: database %q written in format major version %d", dirname, vers) 302 } 303 return vers, m, nil 304 } 305 306 // FormatMajorVersion returns the database's active format major 307 // version. The format major version may be higher than the one 308 // provided in Options when the database was opened if the existing 309 // database was written with a higher format version. 310 func (d *DB) FormatMajorVersion() FormatMajorVersion { 311 d.mu.Lock() 312 defer d.mu.Unlock() 313 return d.mu.formatVers.vers 314 } 315 316 // RatchetFormatMajorVersion ratchets the opened database's format major 317 // version to the provided version. It errors if the provided format 318 // major version is below the database's current version. Once a 319 // database's format major version is upgraded, previous Pebble versions 320 // that do not know of the format version will be unable to open the 321 // database. 322 func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error { 323 if err := d.closed.Load(); err != nil { 324 panic(err) 325 } 326 327 d.mu.Lock() 328 defer d.mu.Unlock() 329 return d.ratchetFormatMajorVersionLocked(fmv) 330 } 331 332 func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error { 333 if d.opts.ReadOnly { 334 return ErrReadOnly 335 } 336 if formatVers > FormatNewest { 337 // Guard against accidentally forgetting to update FormatNewest. 338 return errors.Errorf("bitalostable: unknown format version %d", formatVers) 339 } 340 if d.mu.formatVers.vers > formatVers { 341 return errors.Newf("bitalostable: database already at format major version %d; cannot reduce to %d", 342 d.mu.formatVers.vers, formatVers) 343 } 344 if d.mu.formatVers.ratcheting { 345 return errors.Newf("bitalostable: database format major version upgrade is in-progress") 346 } 347 d.mu.formatVers.ratcheting = true 348 defer func() { d.mu.formatVers.ratcheting = false }() 349 350 for nextVers := d.mu.formatVers.vers + 1; nextVers <= formatVers; nextVers++ { 351 if err := formatMajorVersionMigrations[nextVers](d); err != nil { 352 return errors.Wrapf(err, "migrating to version %d", nextVers) 353 } 354 355 // NB: The migration is responsible for calling 356 // finalizeFormatVersUpgrade to finalize the upgrade. This 357 // structure is necessary because some migrations may need to 358 // update in-memory state (without ever dropping locks) after 359 // the upgrade is finalized. Here we assert that the upgrade 360 // did occur. 361 if d.mu.formatVers.vers != nextVers { 362 d.opts.Logger.Fatalf("bitalostable: successful migration to format version %d never finalized the upgrade", nextVers) 363 } 364 } 365 return nil 366 } 367 368 // finalizeFormatVersUpgrade is typically only be called from within a 369 // format major version migration. 370 // 371 // See formatMajorVersionMigrations. 372 func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error { 373 // We use the marker to encode the active format version in the 374 // marker filename. Unlike other uses of the atomic marker, there is 375 // no file with the filename `formatVers.String()` on the 376 // filesystem. 377 if err := d.mu.formatVers.marker.Move(formatVers.String()); err != nil { 378 return err 379 } 380 d.mu.formatVers.vers = formatVers 381 d.opts.EventListener.FormatUpgrade(formatVers) 382 return nil 383 } 384 385 // compactMarkedFilesLocked performs a migration that schedules rewrite 386 // compactions to compact away any sstables marked for compaction. 387 // compactMarkedFilesLocked is run while ratcheting the database's format major 388 // version to FormatSplitUserKeysMarkedCompacted. 389 // 390 // Note that while this method is called with the DB.mu held, and will not 391 // return until all marked files have been compacted, the mutex is dropped while 392 // waiting for compactions to complete (or for slots to free up). 393 func (d *DB) compactMarkedFilesLocked() error { 394 curr := d.mu.versions.currentVersion() 395 for curr.Stats.MarkedForCompaction > 0 { 396 // Attempt to schedule a compaction to rewrite a file marked for 397 // compaction. 398 d.maybeScheduleCompactionPicker(func(picker compactionPicker, env compactionEnv) *pickedCompaction { 399 return picker.pickRewriteCompaction(env) 400 }) 401 402 // The above attempt might succeed and schedule a rewrite compaction. Or 403 // there might not be available compaction concurrency to schedule the 404 // compaction. Or compaction of the file might have already been in 405 // progress. In any scenario, wait until there's some change in the 406 // state of active compactions. 407 408 // Before waiting, check that the database hasn't been closed. Trying to 409 // schedule the compaction may have dropped d.mu while waiting for a 410 // manifest write to complete. In that dropped interim, the database may 411 // have been closed. 412 if err := d.closed.Load(); err != nil { 413 return err.(error) 414 } 415 // NB: Waiting on this condition variable drops d.mu while blocked. 416 d.mu.compact.cond.Wait() 417 418 // Some flush or compaction was scheduled or completed. Loop again to 419 // check again for files that must be compacted. The next iteration may 420 // find same file again, but that's okay. It'll eventually succeed in 421 // scheduling the compaction and eventually be woken by its completion. 422 curr = d.mu.versions.currentVersion() 423 } 424 return nil 425 } 426 427 // findFilesFunc scans the LSM for files, returning true if at least one 428 // file was found. The returned array contains the matched files, if any, per 429 // level. 430 type findFilesFunc func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) 431 432 // markFilesWithSplitUserKeys scans the LSM's levels 1 through 6 for adjacent 433 // files that contain the same user key. Such arrangements of files were 434 // permitted in RocksDB and in Pebble up to SHA a860bbad. 435 var markFilesWithSplitUserKeys = func(equal Equal) findFilesFunc { 436 return func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) { 437 // Files with split user keys are expected to be rare and performing key 438 // comparisons for every file within the LSM is expensive, so drop the 439 // database lock while scanning the file metadata. 440 for l := numLevels - 1; l > 0; l-- { 441 iter := v.Levels[l].Iter() 442 var prevFile *fileMetadata 443 var prevUserKey []byte 444 for f := iter.First(); f != nil; f = iter.Next() { 445 if prevUserKey != nil && equal(prevUserKey, f.Smallest.UserKey) { 446 // NB: We may append a file twice, once as prevFile and once 447 // as f. That's okay, and handled below. 448 files[l] = append(files[l], prevFile, f) 449 found = true 450 } 451 if f.Largest.IsExclusiveSentinel() { 452 prevUserKey = nil 453 prevFile = nil 454 } else { 455 prevUserKey = f.Largest.UserKey 456 prevFile = f 457 } 458 } 459 } 460 return 461 } 462 } 463 464 // markFilesPrePebblev1 scans the LSM for files that do not support block 465 // properties (i.e. a table format version pre-Pebblev1). 466 var markFilesPrePebblev1 = func(tc *tableCacheContainer) findFilesFunc { 467 return func(v *version) (found bool, files [numLevels][]*fileMetadata, err error) { 468 for l := numLevels - 1; l > 0; l-- { 469 iter := v.Levels[l].Iter() 470 for f := iter.First(); f != nil; f = iter.Next() { 471 err = tc.withReader(f, func(r *sstable.Reader) error { 472 tf, err := r.TableFormat() 473 if err != nil { 474 return err 475 } 476 if tf < sstable.TableFormatPebblev1 { 477 found = true 478 files[l] = append(files[l], f) 479 } 480 return nil 481 }) 482 if err != nil { 483 return 484 } 485 } 486 } 487 return 488 } 489 } 490 491 // markFilesLock durably marks the files that match the given findFilesFunc for 492 // compaction. 493 func (d *DB) markFilesLocked(findFn findFilesFunc) error { 494 jobID := d.mu.nextJobID 495 d.mu.nextJobID++ 496 497 vers := d.mu.versions.currentVersion() 498 var ( 499 found bool 500 files [numLevels][]*fileMetadata 501 err error 502 ) 503 func() { 504 // Note the unusual locking: unlock, defer Lock(). The scan of the files in 505 // the version does not need to block other operations that require the 506 // DB.mu. Drop it for the scan, before re-acquiring it. 507 d.mu.Unlock() 508 defer d.mu.Lock() 509 found, files, err = findFn(vers) 510 }() 511 if err != nil { 512 return err 513 } 514 515 // The database lock has been acquired again by the defer within the above 516 // anonymous function. 517 if !found { 518 // Nothing to do. 519 return nil 520 } 521 522 // After scanning, if we found files to mark, we fetch the current state of 523 // the LSM (which may have changed) and set MarkedForCompaction on the files, 524 // and update the version's Stats.MarkedForCompaction count, which are both 525 // protected by d.mu. 526 527 // Lock the manifest for a coherent view of the LSM. The database lock has 528 // been re-acquired by the defer within the above anonymous function. 529 d.mu.versions.logLock() 530 vers = d.mu.versions.currentVersion() 531 for l, filesToMark := range files { 532 if len(filesToMark) == 0 { 533 continue 534 } 535 for _, f := range filesToMark { 536 // Ignore files to be marked that have already been compacted or marked. 537 if f.CompactionState == manifest.CompactionStateCompacted || 538 f.MarkedForCompaction { 539 continue 540 } 541 // Else, mark the file for compaction in this version. 542 vers.Stats.MarkedForCompaction++ 543 f.MarkedForCompaction = true 544 } 545 // The compaction picker uses the markedForCompactionAnnotator to 546 // quickly find files marked for compaction, or to quickly determine 547 // that there are no such files marked for compaction within a level. 548 // A b-tree node may be annotated with an annotation recording that 549 // there are no files marked for compaction within the node's subtree, 550 // based on the assumption that it's static. 551 // 552 // Since we're marking files for compaction, these b-tree nodes' 553 // annotations will be out of date. Clear the compaction-picking 554 // annotation, so that it's recomputed the next time the compaction 555 // picker looks for a file marked for compaction. 556 vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{}) 557 } 558 559 // The 'marked-for-compaction' bit is persisted in the MANIFEST file 560 // metadata. We've already modified the in-memory file metadata, but the 561 // manifest hasn't been updated. Force rotation to a new MANIFEST file, 562 // which will write every file metadata to the new manifest file and ensure 563 // that the now marked-for-compaction file metadata are persisted as marked. 564 // NB: This call to logAndApply will unlockthe MANIFEST, which we locked up 565 // above before obtaining `vers`. 566 return d.mu.versions.logAndApply( 567 jobID, 568 &manifest.VersionEdit{}, 569 map[int]*LevelMetrics{}, 570 true, /* forceRotation */ 571 func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) }) 572 }