// Source: github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/format_major_version.go

// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"fmt"
	"strconv"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/sstable"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/pebble/vfs/atomicfs"
)

// FormatMajorVersion is a constant controlling the format of persisted
// data. Backwards incompatible changes to durable formats are gated
// behind new format major versions.
//
// At any point, a database's format major version may be bumped.
// However, once a database's format major version is increased,
// previous versions of Pebble will refuse to open the database.
//
// The zero value format is the FormatDefault constant. The exact
// FormatVersion that the default corresponds to may change with time.
type FormatMajorVersion uint64

// SafeValue implements redact.SafeValue.
func (v FormatMajorVersion) SafeValue() {}

// String implements fmt.Stringer. It renders the version as a zero-padded
// base 10 integer.
func (v FormatMajorVersion) String() string {
	// NB: This must not change. It's used as the value for the on-disk
	// version marker file.
	//
	// Specifically, this value must always parse as a base 10 integer
	// that fits in a uint64. We format it as zero-padded, 3-digit
	// number today, but the padding may change.
	return fmt.Sprintf("%03d", v)
}

// The numeric value of each constant below is assigned by iota from its
// position in the block, and that number is what String writes into the
// on-disk version marker. Inserting or reordering entries would therefore
// change the meaning of already-persisted markers; new versions belong at the
// end, immediately before internalFormatNewest.
const (
	// 21.2 versions.

	// FormatDefault leaves the format version unspecified. The
	// FormatDefault constant may be ratcheted upwards over time.
	FormatDefault FormatMajorVersion = iota
	// FormatMostCompatible maintains the most backwards compatibility,
	// maintaining bi-directional compatibility with RocksDB 6.2.1 in
	// the particular configuration described in the Pebble README.
	FormatMostCompatible
	// formatVersionedManifestMarker is the first
	// backwards-incompatible change made to Pebble, introducing the
	// format-version marker file for handling backwards-incompatible
	// changes more broadly, and replacing the `CURRENT` file with a
	// marker file.
	//
	// This format version is intended as an intermediary version state.
	// It is deliberately unexported to discourage direct use of this
	// format major version. Clients should use FormatVersioned which
	// also ensures earlier versions of Pebble fail to open a database
	// written in a future format major version.
	formatVersionedManifestMarker
	// FormatVersioned is a new format major version that replaces the
	// old `CURRENT` file with a new 'marker' file scheme. Previous
	// Pebble versions will be unable to open the database unless
	// they're aware of format versions.
	FormatVersioned
	// FormatSetWithDelete is a format major version that introduces a new key
	// kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be
	// unable to open this database.
	FormatSetWithDelete

	// 22.1 versions.

	// FormatBlockPropertyCollector is a format major version that introduces
	// BlockPropertyCollectors.
	FormatBlockPropertyCollector
	// FormatSplitUserKeysMarked is a format major version that guarantees that
	// all files that share user keys with neighbors are marked for compaction
	// in the manifest. Ratcheting to FormatSplitUserKeysMarked will block
	// (without holding mutexes) until the scan of the LSM is complete and the
	// manifest has been rotated.
	FormatSplitUserKeysMarked

	// 22.2 versions.

	// FormatSplitUserKeysMarkedCompacted is a format major version that
	// guarantees that all files explicitly marked for compaction in the manifest
	// have been compacted. Combined with the FormatSplitUserKeysMarked format
	// major version, this version guarantees that there are no user keys split
	// across multiple files within a level L1+. Ratcheting to this format version
	// will block (without holding mutexes) until all necessary compactions for
	// files marked for compaction are complete.
	FormatSplitUserKeysMarkedCompacted
	// FormatRangeKeys is a format major version that introduces range keys.
	FormatRangeKeys
	// FormatMinTableFormatPebblev1 is a format major version that guarantees that
	// tables created by or ingested into the DB at or above this format major
	// version will have a table format version of at least Pebblev1 (Block
	// Properties).
	FormatMinTableFormatPebblev1
	// FormatPrePebblev1Marked is a format major version that guarantees that all
	// sstables with a table format version pre-Pebblev1 (i.e. those that are
	// guaranteed to not contain block properties) are marked for compaction in
	// the manifest. Ratcheting to FormatPrePebblev1Marked will block (without
	// holding mutexes) until the scan of the LSM is complete and the manifest has
	// been rotated.
	FormatPrePebblev1Marked

	// 23.1 versions.

	// formatUnusedPrePebblev1MarkedCompacted is an unused format major version.
	// This format major version was originally intended to ship in the 23.1
	// release. It was later decided that this should be deferred until a
	// subsequent release. The original ordering is preserved so as not to
	// introduce breaking changes in Cockroach.
	formatUnusedPrePebblev1MarkedCompacted

	// FormatSSTableValueBlocks is a format major version that adds support for
	// storing values in value blocks in the sstable. Value block support is not
	// necessarily enabled when writing sstables, when running with this format
	// major version.
	//
	// WARNING: In development, so no production code should upgrade to this
	// format, since a DB with this format major version will not actually
	// interoperate correctly with another DB with the same format major
	// version. This format major version is introduced so that tests can start
	// being executed up to this version. Note that these tests succeed despite
	// the incomplete support since they do not enable value blocks and use
	// TableFormatPebblev2.
	FormatSSTableValueBlocks

	// FormatFlushableIngest is a format major version that enables lazy
	// addition of ingested sstables into the LSM structure. When an ingest
	// overlaps with a memtable, a record of the ingest is written to the WAL
	// without waiting for a flush. Subsequent reads treat the ingested files as
	// a level above the overlapping memtable. Once the memtable is flushed, the
	// ingested files are moved into the lowest possible levels.
	//
	// This feature is behind a format major version because it required
	// breaking changes to the WAL format.
	FormatFlushableIngest

	// 23.2 versions.

	// FormatPrePebblev1MarkedCompacted is a format major version that guarantees
	// that all sstables explicitly marked for compaction in the manifest (see
	// FormatPrePebblev1Marked) have been compacted. Ratcheting to this format
	// version will block (without holding mutexes) until all necessary
	// compactions for files marked for compaction are complete.
	FormatPrePebblev1MarkedCompacted

	// FormatDeleteSizedAndObsolete is a format major version that adds support
	// for deletion tombstones that encode the size of the value they're
	// expected to delete. This format major version is required before the
	// associated key kind may be committed through batch applications or
	// ingests. It also adds support for keys that are marked obsolete (see
	// sstable/format.go for details).
	FormatDeleteSizedAndObsolete

	// FormatVirtualSSTables is a format major version that adds support for
	// virtual sstables that can reference a sub-range of keys in an underlying
	// physical sstable. This information is persisted through new,
	// backward-incompatible fields in the Manifest, and therefore requires
	// a format major version.
	FormatVirtualSSTables

	// internalFormatNewest holds the newest format major version, including
	// experimental ones excluded from the exported FormatNewest constant until
	// they've stabilized. Used in tests.
	//
	// iota on this line is one greater than the value of the last version
	// declared above, so `iota - 1` evaluates to that last version.
	internalFormatNewest FormatMajorVersion = iota - 1

	// FormatNewest always contains the most recent format major version.
	FormatNewest FormatMajorVersion = internalFormatNewest
)

// MaxTableFormat returns the maximum sstable.TableFormat that can be used at
// this FormatMajorVersion.
183 func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat { 184 switch v { 185 case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker, 186 FormatVersioned, FormatSetWithDelete: 187 return sstable.TableFormatRocksDBv2 188 case FormatBlockPropertyCollector, FormatSplitUserKeysMarked, 189 FormatSplitUserKeysMarkedCompacted: 190 return sstable.TableFormatPebblev1 191 case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, 192 formatUnusedPrePebblev1MarkedCompacted: 193 return sstable.TableFormatPebblev2 194 case FormatSSTableValueBlocks, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted: 195 return sstable.TableFormatPebblev3 196 case FormatDeleteSizedAndObsolete, FormatVirtualSSTables: 197 return sstable.TableFormatPebblev4 198 default: 199 panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) 200 } 201 } 202 203 // MinTableFormat returns the minimum sstable.TableFormat that can be used at 204 // this FormatMajorVersion. 205 func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat { 206 switch v { 207 case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker, 208 FormatVersioned, FormatSetWithDelete, FormatBlockPropertyCollector, 209 FormatSplitUserKeysMarked, FormatSplitUserKeysMarkedCompacted, 210 FormatRangeKeys: 211 return sstable.TableFormatLevelDB 212 case FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, 213 formatUnusedPrePebblev1MarkedCompacted, FormatSSTableValueBlocks, 214 FormatFlushableIngest, FormatPrePebblev1MarkedCompacted, 215 FormatDeleteSizedAndObsolete, FormatVirtualSSTables: 216 return sstable.TableFormatPebblev1 217 default: 218 panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) 219 } 220 } 221 222 // orderingInvariants returns an enum encoding the set of invariants that must 223 // hold within the receiver format major version. 
Invariants only get stricter 224 // as the format major version advances, so it is okay to retrieve the 225 // invariants from the current format major version and by the time the 226 // invariants are enforced, the format major version has advanced. 227 func (v FormatMajorVersion) orderingInvariants() manifest.OrderingInvariants { 228 if v < FormatSplitUserKeysMarkedCompacted { 229 return manifest.AllowSplitUserKeys 230 } 231 return manifest.ProhibitSplitUserKeys 232 } 233 234 // formatMajorVersionMigrations defines the migrations from one format 235 // major version to the next. Each migration is defined as a closure 236 // which will be invoked on the database before the new format major 237 // version is committed. Migrations must be idempotent. Migrations are 238 // invoked with d.mu locked. 239 // 240 // Each migration is responsible for invoking finalizeFormatVersUpgrade 241 // to set the new format major version. RatchetFormatMajorVersion will 242 // panic if a migration returns a nil error but fails to finalize the 243 // new format major version. 244 var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{ 245 FormatMostCompatible: func(d *DB) error { return nil }, 246 formatVersionedManifestMarker: func(d *DB) error { 247 // formatVersionedManifestMarker introduces the use of a marker 248 // file for pointing to the current MANIFEST file. 249 250 // Lock the manifest. 251 d.mu.versions.logLock() 252 defer d.mu.versions.logUnlock() 253 254 // Construct the filename of the currently active manifest and 255 // move the manifest marker to that filename. The marker is 256 // guaranteed to exist, because we unconditionally locate it 257 // during Open. 
258 manifestFileNum := d.mu.versions.manifestFileNum 259 filename := base.MakeFilename(fileTypeManifest, manifestFileNum.DiskFileNum()) 260 if err := d.mu.versions.manifestMarker.Move(filename); err != nil { 261 return errors.Wrap(err, "moving manifest marker") 262 } 263 264 // Now that we have a manifest marker file in place and pointing 265 // to the current MANIFEST, finalize the upgrade. If we fail for 266 // some reason, a retry of this migration is guaranteed to again 267 // move the manifest marker file to the latest manifest. If 268 // we're unable to finalize the upgrade, a subsequent call to 269 // Open will ignore the manifest marker. 270 if err := d.finalizeFormatVersUpgrade(formatVersionedManifestMarker); err != nil { 271 return err 272 } 273 274 // We've finalized the upgrade. All subsequent Open calls will 275 // ignore the CURRENT file and instead read the manifest marker. 276 // Before we unlock the manifest, we need to update versionSet 277 // to use the manifest marker on future rotations. 278 d.mu.versions.setCurrent = setCurrentFuncMarker( 279 d.mu.versions.manifestMarker, 280 d.mu.versions.fs, 281 d.mu.versions.dirname) 282 return nil 283 }, 284 // The FormatVersioned version is split into two, each with their 285 // own migration to ensure the post-migration cleanup happens even 286 // if there's a crash immediately after finalizing the version. Once 287 // a new format major version is finalized, its migration will never 288 // run again. Post-migration cleanup like the one in the migration 289 // below must be performed in a separate migration or every time the 290 // database opens. 291 FormatVersioned: func(d *DB) error { 292 // Replace the `CURRENT` file with one that points to the 293 // nonexistent `MANIFEST-000000` file. If an earlier Pebble 294 // version that does not know about format major versions 295 // attempts to open the database, it will error avoiding 296 // accidental corruption. 
297 if err := setCurrentFile(d.mu.versions.dirname, d.mu.versions.fs, base.FileNum(0).DiskFileNum()); err != nil { 298 return err 299 } 300 return d.finalizeFormatVersUpgrade(FormatVersioned) 301 }, 302 // As SetWithDelete is a new key kind, there is nothing to migrate. We can 303 // simply finalize the format version and we're done. 304 FormatSetWithDelete: func(d *DB) error { 305 return d.finalizeFormatVersUpgrade(FormatSetWithDelete) 306 }, 307 FormatBlockPropertyCollector: func(d *DB) error { 308 return d.finalizeFormatVersUpgrade(FormatBlockPropertyCollector) 309 }, 310 FormatSplitUserKeysMarked: func(d *DB) error { 311 // Mark any unmarked files with split-user keys. Note all format major 312 // versions migrations are invoked with DB.mu locked. 313 if err := d.markFilesLocked(markFilesWithSplitUserKeys(d.opts.Comparer.Equal)); err != nil { 314 return err 315 } 316 return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarked) 317 }, 318 FormatSplitUserKeysMarkedCompacted: func(d *DB) error { 319 // Before finalizing the format major version, rewrite any sstables 320 // still marked for compaction. Note all format major versions 321 // migrations are invoked with DB.mu locked. 322 if err := d.compactMarkedFilesLocked(); err != nil { 323 return err 324 } 325 return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarkedCompacted) 326 }, 327 FormatRangeKeys: func(d *DB) error { 328 return d.finalizeFormatVersUpgrade(FormatRangeKeys) 329 }, 330 FormatMinTableFormatPebblev1: func(d *DB) error { 331 return d.finalizeFormatVersUpgrade(FormatMinTableFormatPebblev1) 332 }, 333 FormatPrePebblev1Marked: func(d *DB) error { 334 // Mark any unmarked files that contain only table properties. Note all 335 // format major versions migrations are invoked with DB.mu locked. 
336 if err := d.markFilesLocked(markFilesPrePebblev1(d.tableCache)); err != nil { 337 return err 338 } 339 return d.finalizeFormatVersUpgrade(FormatPrePebblev1Marked) 340 }, 341 formatUnusedPrePebblev1MarkedCompacted: func(d *DB) error { 342 // Intentional no-op. 343 return d.finalizeFormatVersUpgrade(formatUnusedPrePebblev1MarkedCompacted) 344 }, 345 FormatSSTableValueBlocks: func(d *DB) error { 346 return d.finalizeFormatVersUpgrade(FormatSSTableValueBlocks) 347 }, 348 FormatFlushableIngest: func(d *DB) error { 349 return d.finalizeFormatVersUpgrade(FormatFlushableIngest) 350 }, 351 FormatPrePebblev1MarkedCompacted: func(d *DB) error { 352 // Before finalizing the format major version, rewrite any sstables 353 // still marked for compaction. Note all format major versions 354 // migrations are invoked with DB.mu locked. 355 if err := d.compactMarkedFilesLocked(); err != nil { 356 return err 357 } 358 return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted) 359 }, 360 FormatDeleteSizedAndObsolete: func(d *DB) error { 361 return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete) 362 }, 363 FormatVirtualSSTables: func(d *DB) error { 364 return d.finalizeFormatVersUpgrade(FormatVirtualSSTables) 365 }, 366 } 367 368 const formatVersionMarkerName = `format-version` 369 370 func lookupFormatMajorVersion( 371 fs vfs.FS, dirname string, 372 ) (FormatMajorVersion, *atomicfs.Marker, error) { 373 m, versString, err := atomicfs.LocateMarker(fs, dirname, formatVersionMarkerName) 374 if err != nil { 375 return 0, nil, err 376 } 377 if versString == "" { 378 return FormatMostCompatible, m, nil 379 } 380 v, err := strconv.ParseUint(versString, 10, 64) 381 if err != nil { 382 return 0, nil, errors.Wrap(err, "parsing format major version") 383 } 384 vers := FormatMajorVersion(v) 385 if vers == FormatDefault { 386 return 0, nil, errors.Newf("pebble: default format major version should not persisted", vers) 387 } 388 if vers > internalFormatNewest { 389 return 0, 
nil, errors.Newf("pebble: database %q written in format major version %d", dirname, vers) 390 } 391 return vers, m, nil 392 } 393 394 // FormatMajorVersion returns the database's active format major 395 // version. The format major version may be higher than the one 396 // provided in Options when the database was opened if the existing 397 // database was written with a higher format version. 398 func (d *DB) FormatMajorVersion() FormatMajorVersion { 399 return FormatMajorVersion(d.mu.formatVers.vers.Load()) 400 } 401 402 // RatchetFormatMajorVersion ratchets the opened database's format major 403 // version to the provided version. It errors if the provided format 404 // major version is below the database's current version. Once a 405 // database's format major version is upgraded, previous Pebble versions 406 // that do not know of the format version will be unable to open the 407 // database. 408 func (d *DB) RatchetFormatMajorVersion(fmv FormatMajorVersion) error { 409 if err := d.closed.Load(); err != nil { 410 panic(err) 411 } 412 413 d.mu.Lock() 414 defer d.mu.Unlock() 415 return d.ratchetFormatMajorVersionLocked(fmv) 416 } 417 418 func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) error { 419 if d.opts.ReadOnly { 420 return ErrReadOnly 421 } 422 if formatVers > internalFormatNewest { 423 // Guard against accidentally forgetting to update internalFormatNewest. 
424 return errors.Errorf("pebble: unknown format version %d", formatVers) 425 } 426 if currentVers := d.FormatMajorVersion(); currentVers > formatVers { 427 return errors.Newf("pebble: database already at format major version %d; cannot reduce to %d", 428 currentVers, formatVers) 429 } 430 if d.mu.formatVers.ratcheting { 431 return errors.Newf("pebble: database format major version upgrade is in-progress") 432 } 433 d.mu.formatVers.ratcheting = true 434 defer func() { d.mu.formatVers.ratcheting = false }() 435 436 for nextVers := d.FormatMajorVersion() + 1; nextVers <= formatVers; nextVers++ { 437 if err := formatMajorVersionMigrations[nextVers](d); err != nil { 438 return errors.Wrapf(err, "migrating to version %d", nextVers) 439 } 440 441 // NB: The migration is responsible for calling 442 // finalizeFormatVersUpgrade to finalize the upgrade. This 443 // structure is necessary because some migrations may need to 444 // update in-memory state (without ever dropping locks) after 445 // the upgrade is finalized. Here we assert that the upgrade 446 // did occur. 447 if d.FormatMajorVersion() != nextVers { 448 d.opts.Logger.Fatalf("pebble: successful migration to format version %d never finalized the upgrade", nextVers) 449 } 450 } 451 return nil 452 } 453 454 // finalizeFormatVersUpgrade is typically only be called from within a 455 // format major version migration. 456 // 457 // See formatMajorVersionMigrations. 458 func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error { 459 // We use the marker to encode the active format version in the 460 // marker filename. Unlike other uses of the atomic marker, there is 461 // no file with the filename `formatVers.String()` on the 462 // filesystem. 
463 if err := d.mu.formatVers.marker.Move(formatVers.String()); err != nil { 464 return err 465 } 466 d.mu.formatVers.vers.Store(uint64(formatVers)) 467 d.opts.EventListener.FormatUpgrade(formatVers) 468 return nil 469 } 470 471 // compactMarkedFilesLocked performs a migration that schedules rewrite 472 // compactions to compact away any sstables marked for compaction. 473 // compactMarkedFilesLocked is run while ratcheting the database's format major 474 // version to FormatSplitUserKeysMarkedCompacted. 475 // 476 // Note that while this method is called with the DB.mu held, and will not 477 // return until all marked files have been compacted, the mutex is dropped while 478 // waiting for compactions to complete (or for slots to free up). 479 func (d *DB) compactMarkedFilesLocked() error { 480 curr := d.mu.versions.currentVersion() 481 for curr.Stats.MarkedForCompaction > 0 { 482 // Attempt to schedule a compaction to rewrite a file marked for 483 // compaction. 484 d.maybeScheduleCompactionPicker(func(picker compactionPicker, env compactionEnv) *pickedCompaction { 485 return picker.pickRewriteCompaction(env) 486 }) 487 488 // The above attempt might succeed and schedule a rewrite compaction. Or 489 // there might not be available compaction concurrency to schedule the 490 // compaction. Or compaction of the file might have already been in 491 // progress. In any scenario, wait until there's some change in the 492 // state of active compactions. 493 494 // Before waiting, check that the database hasn't been closed. Trying to 495 // schedule the compaction may have dropped d.mu while waiting for a 496 // manifest write to complete. In that dropped interim, the database may 497 // have been closed. 498 if err := d.closed.Load(); err != nil { 499 return err.(error) 500 } 501 502 // Some flush or compaction may have scheduled or completed while we waited 503 // for the manifest lock in maybeScheduleCompactionPicker. 
Get the latest 504 // Version before waiting on a compaction. 505 curr = d.mu.versions.currentVersion() 506 507 // Only wait on compactions if there are files still marked for compaction. 508 // NB: Waiting on this condition variable drops d.mu while blocked. 509 if curr.Stats.MarkedForCompaction > 0 { 510 if d.mu.compact.compactingCount == 0 { 511 panic("expected a compaction of marked files in progress") 512 } 513 d.mu.compact.cond.Wait() 514 // Refresh the current version again. 515 curr = d.mu.versions.currentVersion() 516 } 517 } 518 return nil 519 } 520 521 // findFilesFunc scans the LSM for files, returning true if at least one 522 // file was found. The returned array contains the matched files, if any, per 523 // level. 524 type findFilesFunc func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) 525 526 // markFilesWithSplitUserKeys scans the LSM's levels 1 through 6 for adjacent 527 // files that contain the same user key. Such arrangements of files were 528 // permitted in RocksDB and in Pebble up to SHA a860bbad. 529 var markFilesWithSplitUserKeys = func(equal Equal) findFilesFunc { 530 return func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) { 531 // Files with split user keys are expected to be rare and performing key 532 // comparisons for every file within the LSM is expensive, so drop the 533 // database lock while scanning the file metadata. 534 for l := numLevels - 1; l > 0; l-- { 535 iter := v.Levels[l].Iter() 536 var prevFile *fileMetadata 537 var prevUserKey []byte 538 for f := iter.First(); f != nil; f = iter.Next() { 539 if prevUserKey != nil && equal(prevUserKey, f.Smallest.UserKey) { 540 // NB: We may append a file twice, once as prevFile and once 541 // as f. That's okay, and handled below. 
542 files[l] = append(files[l], prevFile, f) 543 found = true 544 } 545 if f.Largest.IsExclusiveSentinel() { 546 prevUserKey = nil 547 prevFile = nil 548 } else { 549 prevUserKey = f.Largest.UserKey 550 prevFile = f 551 } 552 } 553 } 554 return 555 } 556 } 557 558 // markFilesPrePebblev1 scans the LSM for files that do not support block 559 // properties (i.e. a table format version pre-Pebblev1). 560 var markFilesPrePebblev1 = func(tc *tableCacheContainer) findFilesFunc { 561 return func(v *version) (found bool, files [numLevels][]*fileMetadata, err error) { 562 for l := numLevels - 1; l > 0; l-- { 563 iter := v.Levels[l].Iter() 564 for f := iter.First(); f != nil; f = iter.Next() { 565 if f.Virtual { 566 // Any physical sstable which has been virtualized must 567 // have already undergone this migration, and we don't 568 // need to worry about the virtual sstable themselves. 569 panic("pebble: unexpected virtual sstable during migration") 570 } 571 err = tc.withReader( 572 f.PhysicalMeta(), func(r *sstable.Reader) error { 573 tf, err := r.TableFormat() 574 if err != nil { 575 return err 576 } 577 if tf < sstable.TableFormatPebblev1 { 578 found = true 579 files[l] = append(files[l], f) 580 } 581 return nil 582 }) 583 if err != nil { 584 return 585 } 586 } 587 } 588 return 589 } 590 } 591 592 // markFilesLock durably marks the files that match the given findFilesFunc for 593 // compaction. 594 func (d *DB) markFilesLocked(findFn findFilesFunc) error { 595 jobID := d.mu.nextJobID 596 d.mu.nextJobID++ 597 598 // Acquire a read state to have a view of the LSM and a guarantee that none 599 // of the referenced files will be deleted until we've unreferenced the read 600 // state. Some findFilesFuncs may read the files, requiring they not be 601 // deleted. 602 rs := d.loadReadState() 603 var ( 604 found bool 605 files [numLevels][]*fileMetadata 606 err error 607 ) 608 func() { 609 defer rs.unrefLocked() 610 // Note the unusual locking: unlock, defer Lock(). 
The scan of the files in 611 // the version does not need to block other operations that require the 612 // DB.mu. Drop it for the scan, before re-acquiring it. 613 d.mu.Unlock() 614 defer d.mu.Lock() 615 found, files, err = findFn(rs.current) 616 }() 617 if err != nil { 618 return err 619 } 620 621 // The database lock has been acquired again by the defer within the above 622 // anonymous function. 623 if !found { 624 // Nothing to do. 625 return nil 626 } 627 628 // After scanning, if we found files to mark, we fetch the current state of 629 // the LSM (which may have changed) and set MarkedForCompaction on the files, 630 // and update the version's Stats.MarkedForCompaction count, which are both 631 // protected by d.mu. 632 633 // Lock the manifest for a coherent view of the LSM. The database lock has 634 // been re-acquired by the defer within the above anonymous function. 635 d.mu.versions.logLock() 636 vers := d.mu.versions.currentVersion() 637 for l, filesToMark := range files { 638 if len(filesToMark) == 0 { 639 continue 640 } 641 for _, f := range filesToMark { 642 // Ignore files to be marked that have already been compacted or marked. 643 if f.CompactionState == manifest.CompactionStateCompacted || 644 f.MarkedForCompaction { 645 continue 646 } 647 // Else, mark the file for compaction in this version. 648 vers.Stats.MarkedForCompaction++ 649 f.MarkedForCompaction = true 650 } 651 // The compaction picker uses the markedForCompactionAnnotator to 652 // quickly find files marked for compaction, or to quickly determine 653 // that there are no such files marked for compaction within a level. 654 // A b-tree node may be annotated with an annotation recording that 655 // there are no files marked for compaction within the node's subtree, 656 // based on the assumption that it's static. 657 // 658 // Since we're marking files for compaction, these b-tree nodes' 659 // annotations will be out of date. 
Clear the compaction-picking 660 // annotation, so that it's recomputed the next time the compaction 661 // picker looks for a file marked for compaction. 662 vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{}) 663 } 664 665 // The 'marked-for-compaction' bit is persisted in the MANIFEST file 666 // metadata. We've already modified the in-memory file metadata, but the 667 // manifest hasn't been updated. Force rotation to a new MANIFEST file, 668 // which will write every file metadata to the new manifest file and ensure 669 // that the now marked-for-compaction file metadata are persisted as marked. 670 // NB: This call to logAndApply will unlockthe MANIFEST, which we locked up 671 // above before obtaining `vers`. 672 return d.mu.versions.logAndApply( 673 jobID, 674 &manifest.VersionEdit{}, 675 map[int]*LevelMetrics{}, 676 true, /* forceRotation */ 677 func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) }) 678 }