github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/open.go (about) 1 // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "bytes" 9 "cmp" 10 "context" 11 "encoding/binary" 12 "fmt" 13 "io" 14 "math" 15 "os" 16 "slices" 17 "sync/atomic" 18 "time" 19 20 "github.com/cockroachdb/errors" 21 "github.com/cockroachdb/errors/oserror" 22 "github.com/cockroachdb/pebble/internal/arenaskl" 23 "github.com/cockroachdb/pebble/internal/base" 24 "github.com/cockroachdb/pebble/internal/cache" 25 "github.com/cockroachdb/pebble/internal/constants" 26 "github.com/cockroachdb/pebble/internal/invariants" 27 "github.com/cockroachdb/pebble/internal/manifest" 28 "github.com/cockroachdb/pebble/internal/manual" 29 "github.com/cockroachdb/pebble/objstorage" 30 "github.com/cockroachdb/pebble/objstorage/objstorageprovider" 31 "github.com/cockroachdb/pebble/record" 32 "github.com/cockroachdb/pebble/sstable" 33 "github.com/cockroachdb/pebble/vfs" 34 "github.com/prometheus/client_golang/prometheus" 35 ) 36 37 const ( 38 initialMemTableSize = 256 << 10 // 256 KB 39 40 // The max batch size is limited by the uint32 offsets stored in 41 // internal/batchskl.node, DeferredBatchOp, and flushableBatchEntry. 42 // 43 // We limit the size to MaxUint32 (just short of 4GB) so that the exclusive 44 // end of an allocation fits in uint32. 45 // 46 // On 32-bit systems, slices are naturally limited to MaxInt (just short of 47 // 2GB). 48 maxBatchSize = constants.MaxUint32OrInt 49 50 // The max memtable size is limited by the uint32 offsets stored in 51 // internal/arenaskl.node, DeferredBatchOp, and flushableBatchEntry. 52 // 53 // We limit the size to MaxUint32 (just short of 4GB) so that the exclusive 54 // end of an allocation fits in uint32. 55 // 56 // On 32-bit systems, slices are naturally limited to MaxInt (just short of 57 // 2GB). 58 maxMemTableSize = constants.MaxUint32OrInt 59 ) 60 61 // TableCacheSize can be used to determine the table 62 // cache size for a single db, given the maximum open 63 // files which can be used by a table cache which is 64 // only used by a single db. 65 func TableCacheSize(maxOpenFiles int) int { 66 tableCacheSize := maxOpenFiles - numNonTableCacheFiles 67 if tableCacheSize < minTableCacheSize { 68 tableCacheSize = minTableCacheSize 69 } 70 return tableCacheSize 71 } 72 73 // Open opens a DB whose files live in the given directory. 74 func Open(dirname string, opts *Options) (db *DB, _ error) { 75 // Make a copy of the options so that we don't mutate the passed in options. 76 opts = opts.Clone() 77 opts = opts.EnsureDefaults() 78 if err := opts.Validate(); err != nil { 79 return nil, err 80 } 81 if opts.LoggerAndTracer == nil { 82 opts.LoggerAndTracer = &base.LoggerWithNoopTracer{Logger: opts.Logger} 83 } else { 84 opts.Logger = opts.LoggerAndTracer 85 } 86 87 // In all error cases, we return db = nil; this is used by various 88 // deferred cleanups. 89 90 // Open the database and WAL directories first. 91 walDirname, dataDir, walDir, err := prepareAndOpenDirs(dirname, opts) 92 if err != nil { 93 return nil, errors.Wrapf(err, "error opening database at %q", dirname) 94 } 95 defer func() { 96 if db == nil { 97 if walDir != dataDir { 98 walDir.Close() 99 } 100 dataDir.Close() 101 } 102 }() 103 104 // Lock the database directory. 
105 var fileLock *Lock 106 if opts.Lock != nil { 107 // The caller already acquired the database lock. Ensure that the 108 // directory matches. 109 if dirname != opts.Lock.dirname { 110 return nil, errors.Newf("pebble: opts.Lock acquired in %q not %q", opts.Lock.dirname, dirname) 111 } 112 if err := opts.Lock.refForOpen(); err != nil { 113 return nil, err 114 } 115 fileLock = opts.Lock 116 } else { 117 fileLock, err = LockDirectory(dirname, opts.FS) 118 if err != nil { 119 return nil, err 120 } 121 } 122 defer func() { 123 if db == nil { 124 fileLock.Close() 125 } 126 }() 127 128 // Establish the format major version. 129 formatVersion, formatVersionMarker, err := lookupFormatMajorVersion(opts.FS, dirname) 130 if err != nil { 131 return nil, err 132 } 133 defer func() { 134 if db == nil { 135 formatVersionMarker.Close() 136 } 137 }() 138 139 // Find the currently active manifest, if there is one. 140 manifestMarker, manifestFileNum, manifestExists, err := findCurrentManifest(formatVersion, opts.FS, dirname) 141 if err != nil { 142 return nil, errors.Wrapf(err, "pebble: database %q", dirname) 143 } 144 defer func() { 145 if db == nil { 146 manifestMarker.Close() 147 } 148 }() 149 150 // Atomic markers may leave behind obsolete files if there's a crash 151 // mid-update. Clean these up if we're not in read-only mode. 152 if !opts.ReadOnly { 153 if err := formatVersionMarker.RemoveObsolete(); err != nil { 154 return nil, err 155 } 156 if err := manifestMarker.RemoveObsolete(); err != nil { 157 return nil, err 158 } 159 } 160 161 if opts.Cache == nil { 162 opts.Cache = cache.New(cacheDefaultSize) 163 } else { 164 opts.Cache.Ref() 165 } 166 167 d := &DB{ 168 cacheID: opts.Cache.NewID(), 169 dirname: dirname, 170 walDirname: walDirname, 171 opts: opts, 172 cmp: opts.Comparer.Compare, 173 equal: opts.equal(), 174 merge: opts.Merger.Merge, 175 split: opts.Comparer.Split, 176 abbreviatedKey: opts.Comparer.AbbreviatedKey, 177 largeBatchThreshold: (opts.MemTableSize - uint64(memTableEmptySize)) / 2, 178 fileLock: fileLock, 179 dataDir: dataDir, 180 walDir: walDir, 181 logRecycler: logRecycler{limit: opts.MemTableStopWritesThreshold + 1}, 182 closed: new(atomic.Value), 183 closedCh: make(chan struct{}), 184 } 185 d.mu.versions = &versionSet{} 186 d.diskAvailBytes.Store(math.MaxUint64) 187 188 defer func() { 189 // If an error or panic occurs during open, attempt to release the manually 190 // allocated memory resources. Note that rather than look for an error, we 191 // look for the return of a nil DB pointer. 192 if r := recover(); db == nil { 193 // Release our references to the Cache. Note that both the DB, and 194 // tableCache have a reference. When we release the reference to 195 // the tableCache, and if there are no other references to 196 // the tableCache, then the tableCache will also release its 197 // reference to the cache. 
198 opts.Cache.Unref() 199 200 if d.tableCache != nil { 201 _ = d.tableCache.close() 202 } 203 204 for _, mem := range d.mu.mem.queue { 205 switch t := mem.flushable.(type) { 206 case *memTable: 207 manual.Free(t.arenaBuf) 208 t.arenaBuf = nil 209 } 210 } 211 if d.cleanupManager != nil { 212 d.cleanupManager.Close() 213 } 214 if d.objProvider != nil { 215 d.objProvider.Close() 216 } 217 if r != nil { 218 panic(r) 219 } 220 } 221 }() 222 223 d.commit = newCommitPipeline(commitEnv{ 224 logSeqNum: &d.mu.versions.logSeqNum, 225 visibleSeqNum: &d.mu.versions.visibleSeqNum, 226 apply: d.commitApply, 227 write: d.commitWrite, 228 }) 229 d.mu.nextJobID = 1 230 d.mu.mem.nextSize = opts.MemTableSize 231 if d.mu.mem.nextSize > initialMemTableSize { 232 d.mu.mem.nextSize = initialMemTableSize 233 } 234 d.mu.compact.cond.L = &d.mu.Mutex 235 d.mu.compact.inProgress = make(map[*compaction]struct{}) 236 d.mu.compact.noOngoingFlushStartTime = time.Now() 237 d.mu.snapshots.init() 238 // logSeqNum is the next sequence number that will be assigned. 239 // Start assigning sequence numbers from base.SeqNumStart to leave 240 // room for reserved sequence numbers (see comments around 241 // SeqNumStart). 242 d.mu.versions.logSeqNum.Store(base.SeqNumStart) 243 d.mu.formatVers.vers.Store(uint64(formatVersion)) 244 d.mu.formatVers.marker = formatVersionMarker 245 246 d.timeNow = time.Now 247 d.openedAt = d.timeNow() 248 249 d.mu.Lock() 250 defer d.mu.Unlock() 251 252 jobID := d.mu.nextJobID 253 d.mu.nextJobID++ 254 255 setCurrent := setCurrentFunc(d.FormatMajorVersion(), manifestMarker, opts.FS, dirname, d.dataDir) 256 257 if !manifestExists { 258 // DB does not exist. 259 if d.opts.ErrorIfNotExists || d.opts.ReadOnly { 260 return nil, errors.Wrapf(ErrDBDoesNotExist, "dirname=%q", dirname) 261 } 262 263 // Create the DB. 264 if err := d.mu.versions.create(jobID, dirname, opts, manifestMarker, setCurrent, d.FormatMajorVersion, &d.mu.Mutex); err != nil { 265 return nil, err 266 } 267 } else { 268 if opts.ErrorIfExists { 269 return nil, errors.Wrapf(ErrDBAlreadyExists, "dirname=%q", dirname) 270 } 271 // Load the version set. 272 if err := d.mu.versions.load(dirname, opts, manifestFileNum, manifestMarker, setCurrent, d.FormatMajorVersion, &d.mu.Mutex); err != nil { 273 return nil, err 274 } 275 if opts.ErrorIfNotPristine { 276 liveFileNums := make(map[base.DiskFileNum]struct{}) 277 d.mu.versions.addLiveFileNums(liveFileNums) 278 if len(liveFileNums) != 0 { 279 return nil, errors.Wrapf(ErrDBNotPristine, "dirname=%q", dirname) 280 } 281 } 282 } 283 284 // In read-only mode, we replay directly into the mutable memtable but never 285 // flush it. We need to delay creation of the memtable until we know the 286 // sequence number of the first batch that will be inserted. 287 if !d.opts.ReadOnly { 288 var entry *flushableEntry 289 d.mu.mem.mutable, entry = d.newMemTable(0 /* logNum */, d.mu.versions.logSeqNum.Load()) 290 d.mu.mem.queue = append(d.mu.mem.queue, entry) 291 } 292 293 // List the objects 294 ls, err := opts.FS.List(d.walDirname) 295 if err != nil { 296 return nil, err 297 } 298 if d.dirname != d.walDirname { 299 ls2, err := opts.FS.List(d.dirname) 300 if err != nil { 301 return nil, err 302 } 303 ls = append(ls, ls2...) 
304 } 305 providerSettings := objstorageprovider.Settings{ 306 Logger: opts.Logger, 307 FS: opts.FS, 308 FSDirName: dirname, 309 FSDirInitialListing: ls, 310 FSCleaner: opts.Cleaner, 311 NoSyncOnClose: opts.NoSyncOnClose, 312 BytesPerSync: opts.BytesPerSync, 313 } 314 providerSettings.Remote.StorageFactory = opts.Experimental.RemoteStorage 315 providerSettings.Remote.CreateOnShared = opts.Experimental.CreateOnShared 316 providerSettings.Remote.CreateOnSharedLocator = opts.Experimental.CreateOnSharedLocator 317 providerSettings.Remote.CacheSizeBytes = opts.Experimental.SecondaryCacheSizeBytes 318 319 d.objProvider, err = objstorageprovider.Open(providerSettings) 320 if err != nil { 321 return nil, err 322 } 323 324 d.cleanupManager = openCleanupManager(opts, d.objProvider, d.onObsoleteTableDelete, d.getDeletionPacerInfo) 325 326 if manifestExists { 327 curVersion := d.mu.versions.currentVersion() 328 if err := checkConsistency(curVersion, dirname, d.objProvider); err != nil { 329 return nil, err 330 } 331 } 332 333 tableCacheSize := TableCacheSize(opts.MaxOpenFiles) 334 d.tableCache = newTableCacheContainer( 335 opts.TableCache, d.cacheID, d.objProvider, d.opts, tableCacheSize, 336 &sstable.CategoryStatsCollector{}) 337 d.newIters = d.tableCache.newIters 338 d.tableNewRangeKeyIter = d.tableCache.newRangeKeyIter 339 340 // Replay any newer log files than the ones named in the manifest. 341 type fileNumAndName struct { 342 num base.DiskFileNum 343 name string 344 } 345 var logFiles []fileNumAndName 346 var previousOptionsFileNum FileNum 347 var previousOptionsFilename string 348 for _, filename := range ls { 349 ft, fn, ok := base.ParseFilename(opts.FS, filename) 350 if !ok { 351 continue 352 } 353 354 // Don't reuse any obsolete file numbers to avoid modifying an 355 // ingested sstable's original external file. 356 if d.mu.versions.nextFileNum <= uint64(fn.FileNum()) { 357 d.mu.versions.nextFileNum = uint64(fn.FileNum()) + 1 358 } 359 360 switch ft { 361 case fileTypeLog: 362 if fn >= d.mu.versions.minUnflushedLogNum { 363 logFiles = append(logFiles, fileNumAndName{fn, filename}) 364 } 365 if d.logRecycler.minRecycleLogNum <= fn.FileNum() { 366 d.logRecycler.minRecycleLogNum = fn.FileNum() + 1 367 } 368 case fileTypeOptions: 369 if previousOptionsFileNum < fn.FileNum() { 370 previousOptionsFileNum = fn.FileNum() 371 previousOptionsFilename = filename 372 } 373 case fileTypeTemp, fileTypeOldTemp: 374 if !d.opts.ReadOnly { 375 // Some codepaths write to a temporary file and then 376 // rename it to its final location when complete. A 377 // temp file is leftover if a process exits before the 378 // rename. Remove it. 379 err := opts.FS.Remove(opts.FS.PathJoin(dirname, filename)) 380 if err != nil { 381 return nil, err 382 } 383 } 384 } 385 } 386 387 // Ratchet d.mu.versions.nextFileNum ahead of all known objects in the 388 // objProvider. This avoids FileNum collisions with obsolete sstables. 389 objects := d.objProvider.List() 390 for _, obj := range objects { 391 if d.mu.versions.nextFileNum <= uint64(obj.DiskFileNum) { 392 d.mu.versions.nextFileNum = uint64(obj.DiskFileNum) + 1 393 } 394 } 395 396 // Validate the most-recent OPTIONS file, if there is one. 
397 var strictWALTail bool 398 if previousOptionsFilename != "" { 399 path := opts.FS.PathJoin(dirname, previousOptionsFilename) 400 strictWALTail, err = checkOptions(opts, path) 401 if err != nil { 402 return nil, err 403 } 404 } 405 406 slices.SortFunc(logFiles, func(a, b fileNumAndName) int { 407 return cmp.Compare(a.num, b.num) 408 }) 409 410 var ve versionEdit 411 var toFlush flushableList 412 for i, lf := range logFiles { 413 lastWAL := i == len(logFiles)-1 414 flush, maxSeqNum, err := d.replayWAL(jobID, &ve, opts.FS, 415 opts.FS.PathJoin(d.walDirname, lf.name), lf.num, strictWALTail && !lastWAL) 416 if err != nil { 417 return nil, err 418 } 419 toFlush = append(toFlush, flush...) 420 d.mu.versions.markFileNumUsed(lf.num) 421 if d.mu.versions.logSeqNum.Load() < maxSeqNum { 422 d.mu.versions.logSeqNum.Store(maxSeqNum) 423 } 424 } 425 d.mu.versions.visibleSeqNum.Store(d.mu.versions.logSeqNum.Load()) 426 427 if !d.opts.ReadOnly { 428 // Create an empty .log file. 429 newLogNum := d.mu.versions.getNextDiskFileNum() 430 431 // This logic is slightly different than RocksDB's. Specifically, RocksDB 432 // sets MinUnflushedLogNum to max-recovered-log-num + 1. We set it to the 433 // newLogNum. There should be no difference in using either value. 434 ve.MinUnflushedLogNum = newLogNum 435 436 // Create the manifest with the updated MinUnflushedLogNum before 437 // creating the new log file. If we created the log file first, a 438 // crash before the manifest is synced could leave two WALs with 439 // unclean tails. 440 d.mu.versions.logLock() 441 if err := d.mu.versions.logAndApply(jobID, &ve, newFileMetrics(ve.NewFiles), false /* forceRotation */, func() []compactionInfo { 442 return nil 443 }); err != nil { 444 return nil, err 445 } 446 447 for _, entry := range toFlush { 448 entry.readerUnrefLocked(true) 449 } 450 451 newLogName := base.MakeFilepath(opts.FS, d.walDirname, fileTypeLog, newLogNum) 452 d.mu.log.queue = append(d.mu.log.queue, fileInfo{fileNum: newLogNum, fileSize: 0}) 453 logFile, err := opts.FS.Create(newLogName) 454 if err != nil { 455 return nil, err 456 } 457 if err := d.walDir.Sync(); err != nil { 458 return nil, err 459 } 460 d.opts.EventListener.WALCreated(WALCreateInfo{ 461 JobID: jobID, 462 Path: newLogName, 463 FileNum: newLogNum, 464 }) 465 // This isn't strictly necessary as we don't use the log number for 466 // memtables being flushed, only for the next unflushed memtable. 467 d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum = newLogNum 468 469 logFile = vfs.NewSyncingFile(logFile, vfs.SyncingFileOptions{ 470 NoSyncOnClose: d.opts.NoSyncOnClose, 471 BytesPerSync: d.opts.WALBytesPerSync, 472 PreallocateSize: d.walPreallocateSize(), 473 }) 474 d.mu.log.metrics.fsyncLatency = prometheus.NewHistogram(prometheus.HistogramOpts{ 475 Buckets: FsyncLatencyBuckets, 476 }) 477 478 logWriterConfig := record.LogWriterConfig{ 479 WALMinSyncInterval: d.opts.WALMinSyncInterval, 480 WALFsyncLatency: d.mu.log.metrics.fsyncLatency, 481 QueueSemChan: d.commit.logSyncQSem, 482 } 483 d.mu.log.LogWriter = record.NewLogWriter(logFile, newLogNum, logWriterConfig) 484 d.mu.versions.metrics.WAL.Files++ 485 } 486 d.updateReadStateLocked(d.opts.DebugCheck) 487 488 // If the Options specify a format major version higher than the 489 // loaded database's, upgrade it. If this is a new database, this 490 // code path also performs an initial upgrade from the starting 491 // implicit MostCompatible version. 
492 // 493 // We ratchet the version this far into Open so that migrations have a read 494 // state available. 495 if !d.opts.ReadOnly && opts.FormatMajorVersion > d.FormatMajorVersion() { 496 if err := d.ratchetFormatMajorVersionLocked(opts.FormatMajorVersion); err != nil { 497 return nil, err 498 } 499 } 500 501 if !d.opts.ReadOnly { 502 // Write the current options to disk. 503 d.optionsFileNum = d.mu.versions.getNextDiskFileNum() 504 tmpPath := base.MakeFilepath(opts.FS, dirname, fileTypeTemp, d.optionsFileNum) 505 optionsPath := base.MakeFilepath(opts.FS, dirname, fileTypeOptions, d.optionsFileNum) 506 507 // Write them to a temporary file first, in case we crash before 508 // we're done. A corrupt options file prevents opening the 509 // database. 510 optionsFile, err := opts.FS.Create(tmpPath) 511 if err != nil { 512 return nil, err 513 } 514 serializedOpts := []byte(opts.String()) 515 if _, err := optionsFile.Write(serializedOpts); err != nil { 516 return nil, errors.CombineErrors(err, optionsFile.Close()) 517 } 518 d.optionsFileSize = uint64(len(serializedOpts)) 519 if err := optionsFile.Sync(); err != nil { 520 return nil, errors.CombineErrors(err, optionsFile.Close()) 521 } 522 if err := optionsFile.Close(); err != nil { 523 return nil, err 524 } 525 // Atomically rename to the OPTIONS-XXXXXX path. This rename is 526 // guaranteed to be atomic because the destination path does not 527 // exist. 528 if err := opts.FS.Rename(tmpPath, optionsPath); err != nil { 529 return nil, err 530 } 531 if err := d.dataDir.Sync(); err != nil { 532 return nil, err 533 } 534 } 535 536 if !d.opts.ReadOnly { 537 d.scanObsoleteFiles(ls) 538 d.deleteObsoleteFiles(jobID) 539 } else { 540 // All the log files are obsolete. 541 d.mu.versions.metrics.WAL.Files = int64(len(logFiles)) 542 } 543 d.mu.tableStats.cond.L = &d.mu.Mutex 544 d.mu.tableValidation.cond.L = &d.mu.Mutex 545 if !d.opts.ReadOnly { 546 d.maybeCollectTableStatsLocked() 547 } 548 d.calculateDiskAvailableBytes() 549 550 d.maybeScheduleFlush() 551 d.maybeScheduleCompaction() 552 553 // Note: this is a no-op if invariants are disabled or race is enabled. 554 // 555 // Setting a finalizer on *DB causes *DB to never be reclaimed and the 556 // finalizer to never be run. The problem is due to this limitation of 557 // finalizers mentioned in the SetFinalizer docs: 558 // 559 // If a cyclic structure includes a block with a finalizer, that cycle is 560 // not guaranteed to be garbage collected and the finalizer is not 561 // guaranteed to run, because there is no ordering that respects the 562 // dependencies. 563 // 564 // DB has cycles with several of its internal structures: readState, 565 // newIters, tableCache, versions, etc. Each of these individually causes a 566 // cycle and prevents the finalizer from being run. But we can work around this 567 // finalizer limitation by setting a finalizer on another object that is 568 // tied to the lifetime of DB: the DB.closed atomic.Value. 569 dPtr := fmt.Sprintf("%p", d) 570 invariants.SetFinalizer(d.closed, func(obj interface{}) { 571 v := obj.(*atomic.Value) 572 if err := v.Load(); err == nil { 573 fmt.Fprintf(os.Stderr, "%s: unreferenced DB not closed\n", dPtr) 574 os.Exit(1) 575 } 576 }) 577 578 return d, nil 579 } 580 581 // prepareAndOpenDirs opens the directories for the store (and creates them if 582 // necessary). 583 // 584 // Returns an error if ReadOnly is set and the directories don't exist.
585 func prepareAndOpenDirs( 586 dirname string, opts *Options, 587 ) (walDirname string, dataDir vfs.File, walDir vfs.File, err error) { 588 walDirname = opts.WALDir 589 if opts.WALDir == "" { 590 walDirname = dirname 591 } 592 593 // Create directories if needed. 594 if !opts.ReadOnly { 595 if err := opts.FS.MkdirAll(dirname, 0755); err != nil { 596 return "", nil, nil, err 597 } 598 if walDirname != dirname { 599 if err := opts.FS.MkdirAll(walDirname, 0755); err != nil { 600 return "", nil, nil, err 601 } 602 } 603 } 604 605 dataDir, err = opts.FS.OpenDir(dirname) 606 if err != nil { 607 if opts.ReadOnly && oserror.IsNotExist(err) { 608 return "", nil, nil, errors.Errorf("pebble: database %q does not exist", dirname) 609 } 610 return "", nil, nil, err 611 } 612 613 if walDirname == dirname { 614 walDir = dataDir 615 } else { 616 walDir, err = opts.FS.OpenDir(walDirname) 617 if err != nil { 618 dataDir.Close() 619 return "", nil, nil, err 620 } 621 } 622 return walDirname, dataDir, walDir, nil 623 } 624 625 // GetVersion returns the engine version string from the latest options 626 // file present in dir. Used to check what Pebble or RocksDB version was last 627 // used to write to the database stored in this directory. An empty string is 628 // returned if no valid OPTIONS file with a version key was found. 629 func GetVersion(dir string, fs vfs.FS) (string, error) { 630 ls, err := fs.List(dir) 631 if err != nil { 632 return "", err 633 } 634 var version string 635 lastOptionsSeen := FileNum(0) 636 for _, filename := range ls { 637 ft, fn, ok := base.ParseFilename(fs, filename) 638 if !ok { 639 continue 640 } 641 switch ft { 642 case fileTypeOptions: 643 // If this file has a higher number than the last options file 644 // processed, reset version. This is because rocksdb often 645 // writes multiple options files without deleting previous ones. 646 // Otherwise, skip parsing this options file. 647 if fn.FileNum() > lastOptionsSeen { 648 version = "" 649 lastOptionsSeen = fn.FileNum() 650 } else { 651 continue 652 } 653 f, err := fs.Open(fs.PathJoin(dir, filename)) 654 if err != nil { 655 return "", err 656 } 657 data, err := io.ReadAll(f) 658 f.Close() 659 660 if err != nil { 661 return "", err 662 } 663 err = parseOptions(string(data), func(section, key, value string) error { 664 switch { 665 case section == "Version": 666 switch key { 667 case "pebble_version": 668 version = value 669 case "rocksdb_version": 670 version = fmt.Sprintf("rocksdb v%s", value) 671 } 672 } 673 return nil 674 }) 675 if err != nil { 676 return "", err 677 } 678 } 679 } 680 return version, nil 681 } 682 683 // replayWAL replays the edits in the specified log file. If the DB is in 684 // read only mode, then the WALs are replayed into memtables and not flushed. If 685 // the DB is not in read only mode, then the contents of the WAL are guaranteed 686 // to be flushed. 687 // 688 // The toFlush return value is a list of flushables associated with the WAL 689 // being replayed which will be flushed. Once the version edit has been applied 690 // to the manifest, it is up to the caller of replayWAL to unreference the 691 // toFlush flushables returned by replayWAL. 692 // 693 // d.mu must be held when calling this, but the mutex may be dropped and 694 // re-acquired during the course of this method. 
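//
// At a high level, replayWAL reads records from the log via a record.Reader,
// reconstructs each record into a Batch, and then routes it: batches that
// exceed largeBatchThreshold become flushable batches, batches containing
// only InternalKeyKindIngestSST keys have their sstables loaded into an
// ingestedFlushable, and all other batches are applied to a memtable
// (rotating memtables as they fill). The resulting flushables are returned
// in toFlush, or appended to the memtable queue in read-only mode.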
695 func (d *DB) replayWAL( 696 jobID int, 697 ve *versionEdit, 698 fs vfs.FS, 699 filename string, 700 logNum base.DiskFileNum, 701 strictWALTail bool, 702 ) (toFlush flushableList, maxSeqNum uint64, err error) { 703 file, err := fs.Open(filename) 704 if err != nil { 705 return nil, 0, err 706 } 707 defer file.Close() 708 var ( 709 b Batch 710 buf bytes.Buffer 711 mem *memTable 712 entry *flushableEntry 713 rr = record.NewReader(file, logNum) 714 offset int64 // byte offset in rr 715 lastFlushOffset int64 716 keysReplayed int64 // number of keys replayed 717 batchesReplayed int64 // number of batches replayed 718 ) 719 720 // TODO(jackson): This function is interspersed with panics, in addition to 721 // corruption error propagation. Audit them to ensure we're truly only 722 // panicking where the error points to Pebble bug and not user or 723 // hardware-induced corruption. 724 725 if d.opts.ReadOnly { 726 // In read-only mode, we replay directly into the mutable memtable which will 727 // never be flushed. 728 mem = d.mu.mem.mutable 729 if mem != nil { 730 entry = d.mu.mem.queue[len(d.mu.mem.queue)-1] 731 } 732 } 733 734 // Flushes the current memtable, if not nil. 735 flushMem := func() { 736 if mem == nil { 737 return 738 } 739 var logSize uint64 740 if offset >= lastFlushOffset { 741 logSize = uint64(offset - lastFlushOffset) 742 } 743 // Else, this was the initial memtable in the read-only case which must have 744 // been empty, but we need to flush it since we don't want to add to it later. 745 lastFlushOffset = offset 746 entry.logSize = logSize 747 if !d.opts.ReadOnly { 748 toFlush = append(toFlush, entry) 749 } 750 mem, entry = nil, nil 751 } 752 // Creates a new memtable if there is no current memtable. 753 ensureMem := func(seqNum uint64) { 754 if mem != nil { 755 return 756 } 757 mem, entry = d.newMemTable(logNum, seqNum) 758 if d.opts.ReadOnly { 759 d.mu.mem.mutable = mem 760 d.mu.mem.queue = append(d.mu.mem.queue, entry) 761 } 762 } 763 764 // updateVE is used to update ve with information about new files created 765 // during the flush of any flushable not of type ingestedFlushable. For the 766 // flushable of type ingestedFlushable we use custom handling below. 767 updateVE := func() error { 768 // TODO(bananabrick): See if we can use the actual base level here, 769 // instead of using 1. 770 c := newFlush(d.opts, d.mu.versions.currentVersion(), 771 1 /* base level */, toFlush, d.timeNow()) 772 newVE, _, _, err := d.runCompaction(jobID, c) 773 if err != nil { 774 return errors.Wrapf(err, "running compaction during WAL replay") 775 } 776 ve.NewFiles = append(ve.NewFiles, newVE.NewFiles...) 777 return nil 778 } 779 defer func() { 780 if err != nil { 781 err = errors.WithDetailf(err, "replaying log %s, offset %d", logNum, offset) 782 } 783 }() 784 785 for { 786 offset = rr.Offset() 787 r, err := rr.Next() 788 if err == nil { 789 _, err = io.Copy(&buf, r) 790 } 791 if err != nil { 792 // It is common to encounter a zeroed or invalid chunk due to WAL 793 // preallocation and WAL recycling. We need to distinguish these 794 // errors from EOF in order to recognize that the record was 795 // truncated and to avoid replaying subsequent WALs, but want 796 // to otherwise treat them like EOF. 
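// (Background for readers of this code: the log number passed to
// record.NewReader above is embedded in each chunk written by the recyclable
// record format, which is how the reader distinguishes a recycled WAL's stale
// chunks from this log's own records; a zeroed tail is what preallocated but
// never-written space looks like.)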
797 if err == io.EOF { 798 break 799 } else if record.IsInvalidRecord(err) && !strictWALTail { 800 break 801 } 802 return nil, 0, errors.Wrap(err, "pebble: error when replaying WAL") 803 } 804 805 if buf.Len() < batchHeaderLen { 806 return nil, 0, base.CorruptionErrorf("pebble: corrupt log file %q (num %s)", 807 filename, errors.Safe(logNum)) 808 } 809 810 if d.opts.ErrorIfNotPristine { 811 return nil, 0, errors.WithDetailf(ErrDBNotPristine, "location: %q", d.dirname) 812 } 813 814 // Specify Batch.db so that Batch.SetRepr will compute Batch.memTableSize 815 // which is used below. 816 b = Batch{} 817 b.db = d 818 b.SetRepr(buf.Bytes()) 819 seqNum := b.SeqNum() 820 maxSeqNum = seqNum + uint64(b.Count()) 821 keysReplayed += int64(b.Count()) 822 batchesReplayed++ 823 { 824 br := b.Reader() 825 if kind, encodedFileNum, _, ok, err := br.Next(); err != nil { 826 return nil, 0, err 827 } else if ok && kind == InternalKeyKindIngestSST { 828 fileNums := make([]base.DiskFileNum, 0, b.Count()) 829 addFileNum := func(encodedFileNum []byte) { 830 fileNum, n := binary.Uvarint(encodedFileNum) 831 if n <= 0 { 832 panic("pebble: ingest sstable file num is invalid.") 833 } 834 fileNums = append(fileNums, base.FileNum(fileNum).DiskFileNum()) 835 } 836 addFileNum(encodedFileNum) 837 838 for i := 1; i < int(b.Count()); i++ { 839 kind, encodedFileNum, _, ok, err := br.Next() 840 if err != nil { 841 return nil, 0, err 842 } 843 if kind != InternalKeyKindIngestSST { 844 panic("pebble: invalid batch key kind.") 845 } 846 if !ok { 847 panic("pebble: invalid batch count.") 848 } 849 addFileNum(encodedFileNum) 850 } 851 852 if _, _, _, ok, err := br.Next(); err != nil { 853 return nil, 0, err 854 } else if ok { 855 panic("pebble: invalid number of entries in batch.") 856 } 857 858 meta := make([]*fileMetadata, len(fileNums)) 859 for i, n := range fileNums { 860 var readable objstorage.Readable 861 objMeta, err := d.objProvider.Lookup(fileTypeTable, n) 862 if err != nil { 863 return nil, 0, errors.Wrap(err, "pebble: error when looking up ingested SSTs") 864 } 865 if objMeta.IsRemote() { 866 readable, err = d.objProvider.OpenForReading(context.TODO(), fileTypeTable, n, objstorage.OpenOptions{MustExist: true}) 867 if err != nil { 868 return nil, 0, errors.Wrap(err, "pebble: error when opening flushable ingest files") 869 } 870 } else { 871 path := base.MakeFilepath(d.opts.FS, d.dirname, fileTypeTable, n) 872 f, err := d.opts.FS.Open(path) 873 if err != nil { 874 return nil, 0, err 875 } 876 877 readable, err = sstable.NewSimpleReadable(f) 878 if err != nil { 879 return nil, 0, err 880 } 881 } 882 // NB: ingestLoad1 will close readable. 883 meta[i], err = ingestLoad1(d.opts, d.FormatMajorVersion(), readable, d.cacheID, n) 884 if err != nil { 885 return nil, 0, errors.Wrap(err, "pebble: error when loading flushable ingest files") 886 } 887 } 888 889 if uint32(len(meta)) != b.Count() { 890 panic("pebble: couldn't load all files in WAL entry.") 891 } 892 893 entry, err = d.newIngestedFlushableEntry( 894 meta, seqNum, logNum, 895 ) 896 if err != nil { 897 return nil, 0, err 898 } 899 900 if d.opts.ReadOnly { 901 d.mu.mem.queue = append(d.mu.mem.queue, entry) 902 // We added the IngestSST flushable to the queue. But there 903 // must be at least one WAL entry waiting to be replayed. We 904 // have to ensure this newer WAL entry isn't replayed into 905 // the current value of d.mu.mem.mutable because the current 906 // mutable memtable exists before this flushable entry in 907 // the memtable queue. 
To ensure this, we just need to unset 908 // d.mu.mem.mutable. When a newer WAL is replayed, we will 909 // set d.mu.mem.mutable to a newer value. 910 d.mu.mem.mutable = nil 911 } else { 912 toFlush = append(toFlush, entry) 913 // During WAL replay, the LSM only has L0, hence the 914 // baseLevel is 1. For the sake of simplicity, we place the 915 // ingested files in L0 here, instead of finding their 916 // target levels. This keeps the replay code 917 // simpler. It is expected that WAL replay should be 918 // rare, and that flushables of type ingestedFlushable 919 // should also be rare. So, placing the ingested files in L0 920 // is acceptable. 921 // 922 // TODO(bananabrick): Maybe refactor this function to allow 923 // us to easily place ingested files in levels as low as 924 // possible during WAL replay. It would require breaking up 925 // the application of ve to the manifest into chunks and is 926 // not pretty w/o a refactor to this function and how it's 927 // used. 928 c := newFlush( 929 d.opts, d.mu.versions.currentVersion(), 930 1, /* base level */ 931 []*flushableEntry{entry}, 932 d.timeNow(), 933 ) 934 for _, file := range c.flushing[0].flushable.(*ingestedFlushable).files { 935 ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: 0, Meta: file.FileMetadata}) 936 } 937 } 938 return toFlush, maxSeqNum, nil 939 } 940 } 941 942 if b.memTableSize >= uint64(d.largeBatchThreshold) { 943 flushMem() 944 // Make a copy of the data slice since it is currently owned by buf and will 945 // be reused in the next iteration. 946 b.data = slices.Clone(b.data) 947 b.flushable, err = newFlushableBatch(&b, d.opts.Comparer) 948 if err != nil { 949 return nil, 0, err 950 } 951 entry := d.newFlushableEntry(b.flushable, logNum, b.SeqNum()) 952 // Disable memory accounting by adding a reader ref that will never be 953 // removed. 954 entry.readerRefs.Add(1) 955 if d.opts.ReadOnly { 956 d.mu.mem.queue = append(d.mu.mem.queue, entry) 957 // We added the flushable batch to the flushable queue. 958 // But there must be at least one WAL entry waiting to be 959 // replayed. We have to ensure this newer WAL entry isn't 960 // replayed into the current value of d.mu.mem.mutable because 961 // the current mutable memtable exists before this flushable 962 // entry in the memtable queue. To ensure this, we just need to 963 // unset d.mu.mem.mutable. When a newer WAL is replayed, we will 964 // set d.mu.mem.mutable to a newer value. 965 d.mu.mem.mutable = nil 966 } else { 967 toFlush = append(toFlush, entry) 968 } 969 } else { 970 ensureMem(seqNum) 971 if err = mem.prepare(&b); err != nil && err != arenaskl.ErrArenaFull { 972 return nil, 0, err 973 } 974 // We loop since DB.newMemTable() slowly grows the size of allocated memtables, so the 975 // batch may not initially fit, but will eventually fit (since it is smaller than 976 // largeBatchThreshold). 977 for err == arenaskl.ErrArenaFull { 978 flushMem() 979 ensureMem(seqNum) 980 err = mem.prepare(&b) 981 if err != nil && err != arenaskl.ErrArenaFull { 982 return nil, 0, err 983 } 984 } 985 if err = mem.apply(&b, seqNum); err != nil { 986 return nil, 0, err 987 } 988 mem.writerUnref() 989 } 990 buf.Reset() 991 } 992 993 d.opts.Logger.Infof("[JOB %d] WAL file %s with log number %s stopped reading at offset: %d; replayed %d keys in %d batches", jobID, filename, logNum.String(), offset, keysReplayed, batchesReplayed) 994 flushMem() 995 996 // mem is nil here.
997 if !d.opts.ReadOnly { 998 err = updateVE() 999 if err != nil { 1000 return nil, 0, err 1001 } 1002 } 1003 return toFlush, maxSeqNum, err 1004 } 1005 1006 func checkOptions(opts *Options, path string) (strictWALTail bool, err error) { 1007 f, err := opts.FS.Open(path) 1008 if err != nil { 1009 return false, err 1010 } 1011 defer f.Close() 1012 1013 data, err := io.ReadAll(f) 1014 if err != nil { 1015 return false, err 1016 } 1017 return opts.checkOptions(string(data)) 1018 } 1019 1020 // DBDesc briefly describes high-level state about a database. 1021 type DBDesc struct { 1022 // Exists is true if an existing database was found. 1023 Exists bool 1024 // FormatMajorVersion indicates the database's current format 1025 // version. 1026 FormatMajorVersion FormatMajorVersion 1027 // ManifestFilename is the filename of the current active manifest, 1028 // if the database exists. 1029 ManifestFilename string 1030 } 1031 1032 // Peek looks for an existing database in dirname on the provided FS. It 1033 // returns a brief description of the database. Peek is read-only and 1034 // does not open the database. 1035 func Peek(dirname string, fs vfs.FS) (*DBDesc, error) { 1036 vers, versMarker, err := lookupFormatMajorVersion(fs, dirname) 1037 if err != nil { 1038 return nil, err 1039 } 1040 // TODO(jackson): Immediately closing the marker is clunky. Add a 1041 // PeekMarker variant that avoids opening the directory. 1042 if err := versMarker.Close(); err != nil { 1043 return nil, err 1044 } 1045 1046 // Find the currently active manifest, if there is one. 1047 manifestMarker, manifestFileNum, exists, err := findCurrentManifest(vers, fs, dirname) 1048 if err != nil { 1049 return nil, err 1050 } 1051 // TODO(jackson): Immediately closing the marker is clunky. Add a 1052 // PeekMarker variant that avoids opening the directory. 1053 if err := manifestMarker.Close(); err != nil { 1054 return nil, err 1055 } 1056 1057 desc := &DBDesc{ 1058 Exists: exists, 1059 FormatMajorVersion: vers, 1060 } 1061 if exists { 1062 desc.ManifestFilename = base.MakeFilepath(fs, dirname, fileTypeManifest, manifestFileNum) 1063 } 1064 return desc, nil 1065 } 1066 1067 // LockDirectory acquires the database directory lock in the named directory, 1068 // preventing another process from opening the database. LockDirectory returns a 1069 // handle to the held lock that may be passed to Open through Options.Lock to 1070 // subsequently open the database, skipping lock acquisition during Open. 1071 // 1072 // LockDirectory may be used to expand the critical section protected by the 1073 // database lock to include setup before the call to Open. 1074 func LockDirectory(dirname string, fs vfs.FS) (*Lock, error) { 1075 fileLock, err := fs.Lock(base.MakeFilepath(fs, dirname, fileTypeLock, base.FileNum(0).DiskFileNum())) 1076 if err != nil { 1077 return nil, err 1078 } 1079 l := &Lock{dirname: dirname, fileLock: fileLock} 1080 l.refs.Store(1) 1081 invariants.SetFinalizer(l, func(obj interface{}) { 1082 if refs := obj.(*Lock).refs.Load(); refs > 0 { 1083 panic(errors.AssertionFailedf("lock for %q finalized with %d refs", dirname, refs)) 1084 } 1085 }) 1086 return l, nil 1087 } 1088 1089 // Lock represents a file lock on a directory. It may be passed to Open through 1090 // Options.Lock to elide lock acquisition during Open. 1091 type Lock struct { 1092 dirname string 1093 fileLock io.Closer 1094 // refs is a count of the number of handles on the lock. refs must be 0, 1 1095 // or 2.
1096 // 1097 // When acquired by the client and passed to Open, refs = 1 and the Open 1098 // call increments it to 2. When the database is closed, it's decremented to 1099 // 1. Finally, when the original caller calls Close on the Lock, it's 1100 // decremented to zero and the underlying file lock is released. 1101 // 1102 // When Open acquires the file lock, refs remains at 1 until the database is 1103 // closed. 1104 refs atomic.Int32 1105 } 1106 1107 func (l *Lock) refForOpen() error { 1108 // During Open, when a user passed in a lock, the reference count must be 1109 // exactly 1. If it's zero, the lock is no longer held and is invalid. If 1110 // it's 2, the lock is already in use by another database within the 1111 // process. 1112 if !l.refs.CompareAndSwap(1, 2) { 1113 return errors.Errorf("pebble: unexpected Lock reference count; is the lock already in use?") 1114 } 1115 return nil 1116 } 1117 1118 // Close releases the lock, permitting another process to lock and open the 1119 // database. Close must not be called until after a database using the Lock has 1120 // been closed. 1121 func (l *Lock) Close() error { 1122 if l.refs.Add(-1) > 0 { 1123 return nil 1124 } 1125 defer func() { l.fileLock = nil }() 1126 return l.fileLock.Close() 1127 } 1128 1129 // ErrDBDoesNotExist is generated when ErrorIfNotExists is set and the database 1130 // does not exist. 1131 // 1132 // Note that errors can be wrapped with more details; use errors.Is(). 1133 var ErrDBDoesNotExist = errors.New("pebble: database does not exist") 1134 1135 // ErrDBAlreadyExists is generated when ErrorIfExists is set and the database 1136 // already exists. 1137 // 1138 // Note that errors can be wrapped with more details; use errors.Is(). 1139 var ErrDBAlreadyExists = errors.New("pebble: database already exists") 1140 1141 // ErrDBNotPristine is generated when ErrorIfNotPristine is set and the database 1142 // already exists and is not pristine. 1143 // 1144 // Note that errors can be wrapped with more details; use errors.Is(). 1145 var ErrDBNotPristine = errors.New("pebble: database already exists and is not pristine") 1146 1147 // IsCorruptionError returns true if the given error indicates database 1148 // corruption. 1149 func IsCorruptionError(err error) bool { 1150 return errors.Is(err, base.ErrCorruption) 1151 } 1152 1153 func checkConsistency(v *manifest.Version, dirname string, objProvider objstorage.Provider) error { 1154 var errs []error 1155 dedup := make(map[base.DiskFileNum]struct{}) 1156 for level, files := range v.Levels { 1157 iter := files.Iter() 1158 for f := iter.First(); f != nil; f = iter.Next() { 1159 backingState := f.FileBacking 1160 if _, ok := dedup[backingState.DiskFileNum]; ok { 1161 continue 1162 } 1163 dedup[backingState.DiskFileNum] = struct{}{} 1164 fileNum := backingState.DiskFileNum 1165 fileSize := backingState.Size 1166 // We skip over remote objects; those are instead checked asynchronously 1167 // by the table stats loading job.
1168 meta, err := objProvider.Lookup(base.FileTypeTable, fileNum) 1169 var size int64 1170 if err == nil { 1171 if meta.IsRemote() { 1172 continue 1173 } 1174 size, err = objProvider.Size(meta) 1175 } 1176 if err != nil { 1177 errs = append(errs, errors.Wrapf(err, "L%d: %s", errors.Safe(level), fileNum)) 1178 continue 1179 } 1180 1181 if size != int64(fileSize) { 1182 errs = append(errs, errors.Errorf( 1183 "L%d: %s: object size mismatch (%s): %d (disk) != %d (MANIFEST)", 1184 errors.Safe(level), fileNum, objProvider.Path(meta), 1185 errors.Safe(size), errors.Safe(fileSize))) 1186 continue 1187 } 1188 } 1189 } 1190 return errors.Join(errs...) 1191 }
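// The exported pieces in this file (Open, Peek, LockDirectory, the ErrDB*
// sentinels, and IsCorruptionError) are typically used together. The commented
// sketch below is illustrative only and is not part of the package: the
// directory name, the use of an in-memory VFS, and the helper name
// openOrCreate are assumptions made for the example.
//
//	import (
//		"github.com/cockroachdb/errors"
//		"github.com/cockroachdb/pebble"
//		"github.com/cockroachdb/pebble/vfs"
//	)
//
//	func openOrCreate() (*pebble.DB, *pebble.Lock, error) {
//		fs := vfs.NewMem() // assumption: in-memory FS keeps the sketch self-contained
//
//		// Inspect the directory without opening the database.
//		desc, err := pebble.Peek("demo", fs)
//		if err != nil {
//			return nil, nil, err
//		}
//		_ = desc.Exists // e.g. decide whether to run one-time setup
//
//		// Optionally take the directory lock early so that setup work is
//		// covered by the same critical section, then hand it to Open.
//		lock, err := pebble.LockDirectory("demo", fs)
//		if err != nil {
//			return nil, nil, err
//		}
//
//		db, err := pebble.Open("demo", &pebble.Options{FS: fs, Lock: lock})
//		if err != nil {
//			// When the corresponding ErrorIf* options are set, Open may return
//			// wrapped forms of the sentinel errors above; test with errors.Is.
//			// IsCorruptionError detects corruption.
//			if errors.Is(err, pebble.ErrDBAlreadyExists) || pebble.IsCorruptionError(err) {
//				// handle, log, etc.
//			}
//			return nil, nil, errors.CombineErrors(err, lock.Close())
//		}
//		// The caller still holds a reference on lock; Close it only after
//		// db.Close().
//		return db, lock, nil
//	}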