github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/open.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"math"
	"os"
	"sort"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/errors/oserror"
	"github.com/cockroachdb/pebble/internal/arenaskl"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/cache"
	"github.com/cockroachdb/pebble/internal/constants"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/internal/manual"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
	"github.com/cockroachdb/pebble/record"
	"github.com/cockroachdb/pebble/sstable"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/prometheus/client_golang/prometheus"
)

const (
	initialMemTableSize = 256 << 10 // 256 KB

	// The max batch size is limited by the uint32 offsets stored in
	// internal/batchskl.node, DeferredBatchOp, and flushableBatchEntry.
	//
	// We limit the size to MaxUint32 (just short of 4GB) so that the exclusive
	// end of an allocation fits in uint32.
	//
	// On 32-bit systems, slices are naturally limited to MaxInt (just short of
	// 2GB).
	maxBatchSize = constants.MaxUint32OrInt

	// The max memtable size is limited by the uint32 offsets stored in
	// internal/arenaskl.node, DeferredBatchOp, and flushableBatchEntry.
	//
	// We limit the size to MaxUint32 (just short of 4GB) so that the exclusive
	// end of an allocation fits in uint32.
	//
	// On 32-bit systems, slices are naturally limited to MaxInt (just short of
	// 2GB).
	maxMemTableSize = constants.MaxUint32OrInt
)

// TableCacheSize can be used to determine the table cache size for a single
// db, given the maximum number of open files that can be used by a table
// cache which is only used by a single db.
func TableCacheSize(maxOpenFiles int) int {
	tableCacheSize := maxOpenFiles - numNonTableCacheFiles
	if tableCacheSize < minTableCacheSize {
		tableCacheSize = minTableCacheSize
	}
	return tableCacheSize
}
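// The following is an illustrative sketch, not part of the original file. It
// shows how TableCacheSize relates an open-file budget (e.g. a value destined
// for Options.MaxOpenFiles) to the table cache size that Open computes below.
// The function name and the concrete number are assumptions for the example.
func exampleTableCacheSizing() {
	// With a per-DB budget of 1000 open files, the table cache gets whatever
	// remains after the non-table-cache file slots, with a floor of
	// minTableCacheSize.
	fmt.Println("table cache size:", TableCacheSize(1000))
}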
// Open opens a DB whose files live in the given directory.
func Open(dirname string, opts *Options) (db *DB, _ error) {
	// Make a copy of the options so that we don't mutate the passed in options.
	opts = opts.Clone()
	opts = opts.EnsureDefaults()
	if err := opts.Validate(); err != nil {
		return nil, err
	}
	if opts.LoggerAndTracer == nil {
		opts.LoggerAndTracer = &base.LoggerWithNoopTracer{Logger: opts.Logger}
	} else {
		opts.Logger = opts.LoggerAndTracer
	}

	// In all error cases, we return db = nil; this is used by various
	// deferred cleanups.

	// Open the database and WAL directories first.
	walDirname, dataDir, walDir, err := prepareAndOpenDirs(dirname, opts)
	if err != nil {
		return nil, errors.Wrapf(err, "error opening database at %q", dirname)
	}
	defer func() {
		if db == nil {
			if walDir != dataDir {
				walDir.Close()
			}
			dataDir.Close()
		}
	}()

	// Lock the database directory.
	var fileLock *Lock
	if opts.Lock != nil {
		// The caller already acquired the database lock. Ensure that the
		// directory matches.
		if dirname != opts.Lock.dirname {
			return nil, errors.Newf("pebble: opts.Lock acquired in %q not %q", opts.Lock.dirname, dirname)
		}
		if err := opts.Lock.refForOpen(); err != nil {
			return nil, err
		}
		fileLock = opts.Lock
	} else {
		fileLock, err = LockDirectory(dirname, opts.FS)
		if err != nil {
			return nil, err
		}
	}
	defer func() {
		if db == nil {
			fileLock.Close()
		}
	}()

	// Establish the format major version.
	formatVersion, formatVersionMarker, err := lookupFormatMajorVersion(opts.FS, dirname)
	if err != nil {
		return nil, err
	}
	defer func() {
		if db == nil {
			formatVersionMarker.Close()
		}
	}()

	// Find the currently active manifest, if there is one.
	manifestMarker, manifestFileNum, manifestExists, err := findCurrentManifest(formatVersion, opts.FS, dirname)
	if err != nil {
		return nil, errors.Wrapf(err, "pebble: database %q", dirname)
	}
	defer func() {
		if db == nil {
			manifestMarker.Close()
		}
	}()

	// Atomic markers may leave behind obsolete files if there's a crash
	// mid-update. Clean these up if we're not in read-only mode.
	if !opts.ReadOnly {
		if err := formatVersionMarker.RemoveObsolete(); err != nil {
			return nil, err
		}
		if err := manifestMarker.RemoveObsolete(); err != nil {
			return nil, err
		}
	}

	if opts.Cache == nil {
		opts.Cache = cache.New(cacheDefaultSize)
	} else {
		opts.Cache.Ref()
	}

	d := &DB{
		cacheID:             opts.Cache.NewID(),
		dirname:             dirname,
		walDirname:          walDirname,
		opts:                opts,
		cmp:                 opts.Comparer.Compare,
		equal:               opts.equal(),
		merge:               opts.Merger.Merge,
		split:               opts.Comparer.Split,
		abbreviatedKey:      opts.Comparer.AbbreviatedKey,
		largeBatchThreshold: (opts.MemTableSize - uint64(memTableEmptySize)) / 2,
		fileLock:            fileLock,
		dataDir:             dataDir,
		walDir:              walDir,
		logRecycler:         logRecycler{limit: opts.MemTableStopWritesThreshold + 1},
		closed:              new(atomic.Value),
		closedCh:            make(chan struct{}),
	}
	d.mu.versions = &versionSet{}
	d.diskAvailBytes.Store(math.MaxUint64)

	defer func() {
		// If an error or panic occurs during open, attempt to release the manually
		// allocated memory resources. Note that rather than look for an error, we
		// look for the return of a nil DB pointer.
		if r := recover(); db == nil {
			// Release our references to the Cache. Note that both the DB, and
			// tableCache have a reference. When we release the reference to
			// the tableCache, and if there are no other references to
			// the tableCache, then the tableCache will also release its
			// reference to the cache.
			opts.Cache.Unref()

			if d.tableCache != nil {
				_ = d.tableCache.close()
			}

			for _, mem := range d.mu.mem.queue {
				switch t := mem.flushable.(type) {
				case *memTable:
					manual.Free(t.arenaBuf)
					t.arenaBuf = nil
				}
			}
			if d.cleanupManager != nil {
				d.cleanupManager.Close()
			}
			if d.objProvider != nil {
				d.objProvider.Close()
			}
			if r != nil {
				panic(r)
			}
		}
	}()

	d.commit = newCommitPipeline(commitEnv{
		logSeqNum:     &d.mu.versions.logSeqNum,
		visibleSeqNum: &d.mu.versions.visibleSeqNum,
		apply:         d.commitApply,
		write:         d.commitWrite,
	})
	d.mu.nextJobID = 1
	d.mu.mem.nextSize = opts.MemTableSize
	if d.mu.mem.nextSize > initialMemTableSize {
		d.mu.mem.nextSize = initialMemTableSize
	}
	d.mu.compact.cond.L = &d.mu.Mutex
	d.mu.compact.inProgress = make(map[*compaction]struct{})
	d.mu.compact.noOngoingFlushStartTime = time.Now()
	d.mu.snapshots.init()
	// logSeqNum is the next sequence number that will be assigned.
	// Start assigning sequence numbers from base.SeqNumStart to leave
	// room for reserved sequence numbers (see comments around
	// SeqNumStart).
	d.mu.versions.logSeqNum.Store(base.SeqNumStart)
	d.mu.formatVers.vers.Store(uint64(formatVersion))
	d.mu.formatVers.marker = formatVersionMarker

	d.timeNow = time.Now
	d.openedAt = d.timeNow()

	d.mu.Lock()
	defer d.mu.Unlock()

	jobID := d.mu.nextJobID
	d.mu.nextJobID++

	setCurrent := setCurrentFunc(d.FormatMajorVersion(), manifestMarker, opts.FS, dirname, d.dataDir)

	if !manifestExists {
		// DB does not exist.
		if d.opts.ErrorIfNotExists || d.opts.ReadOnly {
			return nil, errors.Wrapf(ErrDBDoesNotExist, "dirname=%q", dirname)
		}

		// Create the DB.
		if err := d.mu.versions.create(jobID, dirname, opts, manifestMarker, setCurrent, d.FormatMajorVersion, &d.mu.Mutex); err != nil {
			return nil, err
		}
	} else {
		if opts.ErrorIfExists {
			return nil, errors.Wrapf(ErrDBAlreadyExists, "dirname=%q", dirname)
		}
		// Load the version set.
		if err := d.mu.versions.load(dirname, opts, manifestFileNum.FileNum(), manifestMarker, setCurrent, d.FormatMajorVersion, &d.mu.Mutex); err != nil {
			return nil, err
		}
		if opts.ErrorIfNotPristine {
			liveFileNums := make(map[base.DiskFileNum]struct{})
			d.mu.versions.addLiveFileNums(liveFileNums)
			if len(liveFileNums) != 0 {
				return nil, errors.Wrapf(ErrDBNotPristine, "dirname=%q", dirname)
			}
		}
	}

	// In read-only mode, we replay directly into the mutable memtable but never
	// flush it. We need to delay creation of the memtable until we know the
	// sequence number of the first batch that will be inserted.
	if !d.opts.ReadOnly {
		var entry *flushableEntry
		d.mu.mem.mutable, entry = d.newMemTable(0 /* logNum */, d.mu.versions.logSeqNum.Load())
		d.mu.mem.queue = append(d.mu.mem.queue, entry)
	}

	// List the objects
	ls, err := opts.FS.List(d.walDirname)
	if err != nil {
		return nil, err
	}
	if d.dirname != d.walDirname {
		ls2, err := opts.FS.List(d.dirname)
		if err != nil {
			return nil, err
		}
		ls = append(ls, ls2...)
	}
	providerSettings := objstorageprovider.Settings{
		Logger:              opts.Logger,
		FS:                  opts.FS,
		FSDirName:           dirname,
		FSDirInitialListing: ls,
		FSCleaner:           opts.Cleaner,
		NoSyncOnClose:       opts.NoSyncOnClose,
		BytesPerSync:        opts.BytesPerSync,
	}
	providerSettings.Remote.StorageFactory = opts.Experimental.RemoteStorage
	providerSettings.Remote.CreateOnShared = opts.Experimental.CreateOnShared
	providerSettings.Remote.CreateOnSharedLocator = opts.Experimental.CreateOnSharedLocator
	providerSettings.Remote.CacheSizeBytes = opts.Experimental.SecondaryCacheSizeBytes

	d.objProvider, err = objstorageprovider.Open(providerSettings)
	if err != nil {
		return nil, err
	}

	d.cleanupManager = openCleanupManager(opts, d.objProvider, d.onObsoleteTableDelete, d.getDeletionPacerInfo)

	if manifestExists {
		curVersion := d.mu.versions.currentVersion()
		if err := checkConsistency(curVersion, dirname, d.objProvider); err != nil {
			return nil, err
		}
	}

	tableCacheSize := TableCacheSize(opts.MaxOpenFiles)
	d.tableCache = newTableCacheContainer(opts.TableCache, d.cacheID, d.objProvider, d.opts, tableCacheSize)
	d.newIters = d.tableCache.newIters
	d.tableNewRangeKeyIter = d.tableCache.newRangeKeyIter

	// Replay any newer log files than the ones named in the manifest.
	type fileNumAndName struct {
		num  FileNum
		name string
	}
	var logFiles []fileNumAndName
	var previousOptionsFileNum FileNum
	var previousOptionsFilename string
	for _, filename := range ls {
		ft, fn, ok := base.ParseFilename(opts.FS, filename)
		if !ok {
			continue
		}

		// Don't reuse any obsolete file numbers to avoid modifying an
		// ingested sstable's original external file.
		if d.mu.versions.nextFileNum <= fn.FileNum() {
			d.mu.versions.nextFileNum = fn.FileNum() + 1
		}

		switch ft {
		case fileTypeLog:
			if fn.FileNum() >= d.mu.versions.minUnflushedLogNum {
				logFiles = append(logFiles, fileNumAndName{fn.FileNum(), filename})
			}
			if d.logRecycler.minRecycleLogNum <= fn.FileNum() {
				d.logRecycler.minRecycleLogNum = fn.FileNum() + 1
			}
		case fileTypeOptions:
			if previousOptionsFileNum < fn.FileNum() {
				previousOptionsFileNum = fn.FileNum()
				previousOptionsFilename = filename
			}
		case fileTypeTemp, fileTypeOldTemp:
			if !d.opts.ReadOnly {
				// Some codepaths write to a temporary file and then
				// rename it to its final location when complete. A
				// temp file is leftover if a process exits before the
				// rename. Remove it.
				err := opts.FS.Remove(opts.FS.PathJoin(dirname, filename))
				if err != nil {
					return nil, err
				}
			}
		}
	}

	// Ratchet d.mu.versions.nextFileNum ahead of all known objects in the
	// objProvider. This avoids FileNum collisions with obsolete sstables.
	objects := d.objProvider.List()
	for _, obj := range objects {
		if d.mu.versions.nextFileNum <= obj.DiskFileNum.FileNum() {
			d.mu.versions.nextFileNum = obj.DiskFileNum.FileNum() + 1
		}
	}

	// Validate the most-recent OPTIONS file, if there is one.
	var strictWALTail bool
	if previousOptionsFilename != "" {
		path := opts.FS.PathJoin(dirname, previousOptionsFilename)
		strictWALTail, err = checkOptions(opts, path)
		if err != nil {
			return nil, err
		}
	}

	sort.Slice(logFiles, func(i, j int) bool {
		return logFiles[i].num < logFiles[j].num
	})

	var ve versionEdit
	var toFlush flushableList
	for i, lf := range logFiles {
		lastWAL := i == len(logFiles)-1
		flush, maxSeqNum, err := d.replayWAL(jobID, &ve, opts.FS,
			opts.FS.PathJoin(d.walDirname, lf.name), lf.num, strictWALTail && !lastWAL)
		if err != nil {
			return nil, err
		}
		toFlush = append(toFlush, flush...)
		d.mu.versions.markFileNumUsed(lf.num)
		if d.mu.versions.logSeqNum.Load() < maxSeqNum {
			d.mu.versions.logSeqNum.Store(maxSeqNum)
		}
	}
	d.mu.versions.visibleSeqNum.Store(d.mu.versions.logSeqNum.Load())

	if !d.opts.ReadOnly {
		// Create an empty .log file.
		newLogNum := d.mu.versions.getNextFileNum()

		// This logic is slightly different than RocksDB's. Specifically, RocksDB
		// sets MinUnflushedLogNum to max-recovered-log-num + 1. We set it to the
		// newLogNum. There should be no difference in using either value.
		ve.MinUnflushedLogNum = newLogNum

		// Create the manifest with the updated MinUnflushedLogNum before
		// creating the new log file. If we created the log file first, a
		// crash before the manifest is synced could leave two WALs with
		// unclean tails.
		d.mu.versions.logLock()
		if err := d.mu.versions.logAndApply(jobID, &ve, newFileMetrics(ve.NewFiles), false /* forceRotation */, func() []compactionInfo {
			return nil
		}); err != nil {
			return nil, err
		}

		for _, entry := range toFlush {
			entry.readerUnrefLocked(true)
		}

		newLogName := base.MakeFilepath(opts.FS, d.walDirname, fileTypeLog, newLogNum.DiskFileNum())
		d.mu.log.queue = append(d.mu.log.queue, fileInfo{fileNum: newLogNum.DiskFileNum(), fileSize: 0})
		logFile, err := opts.FS.Create(newLogName)
		if err != nil {
			return nil, err
		}
		if err := d.walDir.Sync(); err != nil {
			return nil, err
		}
		d.opts.EventListener.WALCreated(WALCreateInfo{
			JobID:   jobID,
			Path:    newLogName,
			FileNum: newLogNum,
		})
		// This isn't strictly necessary as we don't use the log number for
		// memtables being flushed, only for the next unflushed memtable.
		d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum = newLogNum

		logFile = vfs.NewSyncingFile(logFile, vfs.SyncingFileOptions{
			NoSyncOnClose:   d.opts.NoSyncOnClose,
			BytesPerSync:    d.opts.WALBytesPerSync,
			PreallocateSize: d.walPreallocateSize(),
		})
		d.mu.log.metrics.fsyncLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
			Buckets: FsyncLatencyBuckets,
		})

		logWriterConfig := record.LogWriterConfig{
			WALMinSyncInterval: d.opts.WALMinSyncInterval,
			WALFsyncLatency:    d.mu.log.metrics.fsyncLatency,
			QueueSemChan:       d.commit.logSyncQSem,
		}
		d.mu.log.LogWriter = record.NewLogWriter(logFile, newLogNum, logWriterConfig)
		d.mu.versions.metrics.WAL.Files++
	}
	d.updateReadStateLocked(d.opts.DebugCheck)

	// If the Options specify a format major version higher than the
	// loaded database's, upgrade it. If this is a new database, this
	// code path also performs an initial upgrade from the starting
	// implicit MostCompatible version.
	//
	// We ratchet the version this far into Open so that migrations have a read
	// state available.
	if !d.opts.ReadOnly && opts.FormatMajorVersion > d.FormatMajorVersion() {
		if err := d.ratchetFormatMajorVersionLocked(opts.FormatMajorVersion); err != nil {
			return nil, err
		}
	}

	if !d.opts.ReadOnly {
		// Write the current options to disk.
		d.optionsFileNum = d.mu.versions.getNextFileNum().DiskFileNum()
		tmpPath := base.MakeFilepath(opts.FS, dirname, fileTypeTemp, d.optionsFileNum)
		optionsPath := base.MakeFilepath(opts.FS, dirname, fileTypeOptions, d.optionsFileNum)

		// Write them to a temporary file first, in case we crash before
		// we're done. A corrupt options file prevents opening the
		// database.
		optionsFile, err := opts.FS.Create(tmpPath)
		if err != nil {
			return nil, err
		}
		serializedOpts := []byte(opts.String())
		if _, err := optionsFile.Write(serializedOpts); err != nil {
			return nil, errors.CombineErrors(err, optionsFile.Close())
		}
		d.optionsFileSize = uint64(len(serializedOpts))
		if err := optionsFile.Sync(); err != nil {
			return nil, errors.CombineErrors(err, optionsFile.Close())
		}
		if err := optionsFile.Close(); err != nil {
			return nil, err
		}
		// Atomically rename to the OPTIONS-XXXXXX path. This rename is
		// guaranteed to be atomic because the destination path does not
		// exist.
		if err := opts.FS.Rename(tmpPath, optionsPath); err != nil {
			return nil, err
		}
		if err := d.dataDir.Sync(); err != nil {
			return nil, err
		}
	}

	if !d.opts.ReadOnly {
		d.scanObsoleteFiles(ls)
		d.deleteObsoleteFiles(jobID)
	} else {
		// All the log files are obsolete.
		d.mu.versions.metrics.WAL.Files = int64(len(logFiles))
	}
	d.mu.tableStats.cond.L = &d.mu.Mutex
	d.mu.tableValidation.cond.L = &d.mu.Mutex
	if !d.opts.ReadOnly {
		d.maybeCollectTableStatsLocked()
	}
	d.calculateDiskAvailableBytes()

	d.maybeScheduleFlush()
	d.maybeScheduleCompaction()

	// Note: this is a no-op if invariants are disabled or race is enabled.
	//
	// Setting a finalizer on *DB causes *DB to never be reclaimed and the
	// finalizer to never be run. The problem is due to this limitation of
	// finalizers mentioned in the SetFinalizer docs:
	//
	//   If a cyclic structure includes a block with a finalizer, that cycle is
	//   not guaranteed to be garbage collected and the finalizer is not
	//   guaranteed to run, because there is no ordering that respects the
	//   dependencies.
	//
	// DB has cycles with several of its internal structures: readState,
	// newIters, tableCache, versions, etc. Each of these individually causes a
	// cycle and prevents the finalizer from being run. But we can work around
	// this finalizer limitation by setting a finalizer on another object that
	// is tied to the lifetime of DB: the DB.closed atomic.Value.
	dPtr := fmt.Sprintf("%p", d)
	invariants.SetFinalizer(d.closed, func(obj interface{}) {
		v := obj.(*atomic.Value)
		if err := v.Load(); err == nil {
			fmt.Fprintf(os.Stderr, "%s: unreferenced DB not closed\n", dPtr)
			os.Exit(1)
		}
	})

	return d, nil
}
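// The sketch below is not part of the original file; it illustrates a typical
// call into Open: an on-disk store, a caller-provided block cache, and
// ErrorIfNotExists left false so a missing directory is created rather than
// rejected. The path, cache size, and function name are assumptions for the
// example only.
func exampleOpen() error {
	opts := &Options{
		Cache:        NewCache(64 << 20), // 64 MB block cache; Open takes its own reference.
		MemTableSize: 64 << 20,           // 64 MB memtables
	}
	// Drop the example's reference; the DB holds its own until Close.
	defer opts.Cache.Unref()

	db, err := Open("/tmp/pebble-example" /* hypothetical directory */, opts)
	if err != nil {
		return err
	}
	return db.Close()
}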
// prepareAndOpenDirs opens the directories for the store (and creates them if
// necessary).
//
// Returns an error if ReadOnly is set and the directories don't exist.
func prepareAndOpenDirs(
	dirname string, opts *Options,
) (walDirname string, dataDir vfs.File, walDir vfs.File, err error) {
	walDirname = opts.WALDir
	if opts.WALDir == "" {
		walDirname = dirname
	}

	// Create directories if needed.
	if !opts.ReadOnly {
		if err := opts.FS.MkdirAll(dirname, 0755); err != nil {
			return "", nil, nil, err
		}
		if walDirname != dirname {
			if err := opts.FS.MkdirAll(walDirname, 0755); err != nil {
				return "", nil, nil, err
			}
		}
	}

	dataDir, err = opts.FS.OpenDir(dirname)
	if err != nil {
		if opts.ReadOnly && oserror.IsNotExist(err) {
			return "", nil, nil, errors.Errorf("pebble: database %q does not exist", dirname)
		}
		return "", nil, nil, err
	}

	if walDirname == dirname {
		walDir = dataDir
	} else {
		walDir, err = opts.FS.OpenDir(walDirname)
		if err != nil {
			dataDir.Close()
			return "", nil, nil, err
		}
	}
	return walDirname, dataDir, walDir, nil
}

// GetVersion returns the engine version string from the latest options
// file present in dir. Used to check what Pebble or RocksDB version was last
// used to write to the database stored in this directory. An empty string is
// returned if no valid OPTIONS file with a version key was found.
func GetVersion(dir string, fs vfs.FS) (string, error) {
	ls, err := fs.List(dir)
	if err != nil {
		return "", err
	}
	var version string
	lastOptionsSeen := FileNum(0)
	for _, filename := range ls {
		ft, fn, ok := base.ParseFilename(fs, filename)
		if !ok {
			continue
		}
		switch ft {
		case fileTypeOptions:
			// If this file has a higher number than the last options file
			// processed, reset version. This is because rocksdb often
			// writes multiple options files without deleting previous ones.
			// Otherwise, skip parsing this options file.
			if fn.FileNum() > lastOptionsSeen {
				version = ""
				lastOptionsSeen = fn.FileNum()
			} else {
				continue
			}
			f, err := fs.Open(fs.PathJoin(dir, filename))
			if err != nil {
				return "", err
			}
			data, err := io.ReadAll(f)
			f.Close()

			if err != nil {
				return "", err
			}
			err = parseOptions(string(data), func(section, key, value string) error {
				switch {
				case section == "Version":
					switch key {
					case "pebble_version":
						version = value
					case "rocksdb_version":
						version = fmt.Sprintf("rocksdb v%s", value)
					}
				}
				return nil
			})
			if err != nil {
				return "", err
			}
		}
	}
	return version, nil
}
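// A minimal sketch (not in the original file) of using GetVersion before
// opening a store, e.g. to log which engine version last wrote to the
// directory. The path and function name are assumptions for the example.
func exampleGetVersion() {
	version, err := GetVersion("/tmp/pebble-example" /* hypothetical path */, vfs.Default)
	if err != nil {
		// The directory could not be listed or an OPTIONS file was unreadable.
		return
	}
	if version == "" {
		// No OPTIONS file with a version key was found.
		return
	}
	fmt.Println("last written by", version)
}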
// replayWAL replays the edits in the specified log file. If the DB is in
// read only mode, then the WALs are replayed into memtables and not flushed. If
// the DB is not in read only mode, then the contents of the WAL are guaranteed
// to be flushed.
//
// The toFlush return value is a list of flushables associated with the WAL
// being replayed which will be flushed. Once the version edit has been applied
// to the manifest, it is up to the caller of replayWAL to unreference the
// toFlush flushables returned by replayWAL.
//
// d.mu must be held when calling this, but the mutex may be dropped and
// re-acquired during the course of this method.
func (d *DB) replayWAL(
	jobID int, ve *versionEdit, fs vfs.FS, filename string, logNum FileNum, strictWALTail bool,
) (toFlush flushableList, maxSeqNum uint64, err error) {
	file, err := fs.Open(filename)
	if err != nil {
		return nil, 0, err
	}
	defer file.Close()
	var (
		b               Batch
		buf             bytes.Buffer
		mem             *memTable
		entry           *flushableEntry
		rr              = record.NewReader(file, logNum)
		offset          int64 // byte offset in rr
		lastFlushOffset int64
		keysReplayed    int64 // number of keys replayed
		batchesReplayed int64 // number of batches replayed
	)

	// TODO(jackson): This function is interspersed with panics, in addition to
	// corruption error propagation. Audit them to ensure we're truly only
	// panicking where the error points to a Pebble bug and not user or
	// hardware-induced corruption.

	if d.opts.ReadOnly {
		// In read-only mode, we replay directly into the mutable memtable which will
		// never be flushed.
		mem = d.mu.mem.mutable
		if mem != nil {
			entry = d.mu.mem.queue[len(d.mu.mem.queue)-1]
		}
	}

	// Flushes the current memtable, if not nil.
	flushMem := func() {
		if mem == nil {
			return
		}
		var logSize uint64
		if offset >= lastFlushOffset {
			logSize = uint64(offset - lastFlushOffset)
		}
		// Else, this was the initial memtable in the read-only case which must have
		// been empty, but we need to flush it since we don't want to add to it later.
		lastFlushOffset = offset
		entry.logSize = logSize
		if !d.opts.ReadOnly {
			toFlush = append(toFlush, entry)
		}
		mem, entry = nil, nil
	}
	// Creates a new memtable if there is no current memtable.
	ensureMem := func(seqNum uint64) {
		if mem != nil {
			return
		}
		mem, entry = d.newMemTable(logNum, seqNum)
		if d.opts.ReadOnly {
			d.mu.mem.mutable = mem
			d.mu.mem.queue = append(d.mu.mem.queue, entry)
		}
	}

	// updateVE is used to update ve with information about new files created
	// during the flush of any flushable not of type ingestedFlushable. For the
	// flushable of type ingestedFlushable we use custom handling below.
	updateVE := func() error {
		// TODO(bananabrick): See if we can use the actual base level here,
		// instead of using 1.
		c := newFlush(d.opts, d.mu.versions.currentVersion(),
			1 /* base level */, toFlush, d.timeNow())
		newVE, _, _, err := d.runCompaction(jobID, c)
		if err != nil {
			return errors.Wrapf(err, "running compaction during WAL replay")
		}
		ve.NewFiles = append(ve.NewFiles, newVE.NewFiles...)
		return nil
	}
	defer func() {
		if err != nil {
			err = errors.WithDetailf(err, "replaying log %s, offset %d", logNum, offset)
		}
	}()

	for {
		offset = rr.Offset()
		r, err := rr.Next()
		if err == nil {
			_, err = io.Copy(&buf, r)
		}
		if err != nil {
			// It is common to encounter a zeroed or invalid chunk due to WAL
			// preallocation and WAL recycling. We need to distinguish these
			// errors from EOF in order to recognize that the record was
			// truncated and to avoid replaying subsequent WALs, but want
			// to otherwise treat them like EOF.
			if err == io.EOF {
				break
			} else if record.IsInvalidRecord(err) && !strictWALTail {
				break
			}
			return nil, 0, errors.Wrap(err, "pebble: error when replaying WAL")
		}

		if buf.Len() < batchHeaderLen {
			return nil, 0, base.CorruptionErrorf("pebble: corrupt log file %q (num %s)",
				filename, errors.Safe(logNum))
		}

		if d.opts.ErrorIfNotPristine {
			return nil, 0, errors.WithDetailf(ErrDBNotPristine, "location: %q", d.dirname)
		}

		// Specify Batch.db so that Batch.SetRepr will compute Batch.memTableSize
		// which is used below.
		b = Batch{}
		b.db = d
		b.SetRepr(buf.Bytes())
		seqNum := b.SeqNum()
		maxSeqNum = seqNum + uint64(b.Count())
		keysReplayed += int64(b.Count())
		batchesReplayed++
		{
			br := b.Reader()
			if kind, encodedFileNum, _, ok, err := br.Next(); err != nil {
				return nil, 0, err
			} else if ok && kind == InternalKeyKindIngestSST {
				fileNums := make([]base.DiskFileNum, 0, b.Count())
				addFileNum := func(encodedFileNum []byte) {
					fileNum, n := binary.Uvarint(encodedFileNum)
					if n <= 0 {
						panic("pebble: ingest sstable file num is invalid.")
					}
					fileNums = append(fileNums, base.FileNum(fileNum).DiskFileNum())
				}
				addFileNum(encodedFileNum)

				for i := 1; i < int(b.Count()); i++ {
					kind, encodedFileNum, _, ok, err := br.Next()
					if err != nil {
						return nil, 0, err
					}
					if kind != InternalKeyKindIngestSST {
						panic("pebble: invalid batch key kind.")
					}
					if !ok {
						panic("pebble: invalid batch count.")
					}
					addFileNum(encodedFileNum)
				}

				if _, _, _, ok, err := br.Next(); err != nil {
					return nil, 0, err
				} else if ok {
					panic("pebble: invalid number of entries in batch.")
				}

				meta := make([]*fileMetadata, len(fileNums))
				for i, n := range fileNums {
					var readable objstorage.Readable
					objMeta, err := d.objProvider.Lookup(fileTypeTable, n)
					if err != nil {
						return nil, 0, errors.Wrap(err, "pebble: error when looking up ingested SSTs")
					}
					if objMeta.IsRemote() {
						readable, err = d.objProvider.OpenForReading(context.TODO(), fileTypeTable, n, objstorage.OpenOptions{MustExist: true})
						if err != nil {
							return nil, 0, errors.Wrap(err, "pebble: error when opening flushable ingest files")
						}
					} else {
						path := base.MakeFilepath(d.opts.FS, d.dirname, fileTypeTable, n)
						f, err := d.opts.FS.Open(path)
						if err != nil {
							return nil, 0, err
						}

						readable, err = sstable.NewSimpleReadable(f)
						if err != nil {
							return nil, 0, err
						}
					}
					// NB: ingestLoad1 will close readable.
					meta[i], err = ingestLoad1(d.opts, d.FormatMajorVersion(), readable, d.cacheID, n)
					if err != nil {
						return nil, 0, errors.Wrap(err, "pebble: error when loading flushable ingest files")
					}
				}

				if uint32(len(meta)) != b.Count() {
					panic("pebble: couldn't load all files in WAL entry.")
				}

				entry, err = d.newIngestedFlushableEntry(
					meta, seqNum, logNum,
				)
				if err != nil {
					return nil, 0, err
				}

				if d.opts.ReadOnly {
					d.mu.mem.queue = append(d.mu.mem.queue, entry)
					// We added the IngestSST flushable to the queue. But there
					// must be at least one WAL entry waiting to be replayed. We
					// have to ensure this newer WAL entry isn't replayed into
					// the current value of d.mu.mem.mutable because the current
					// mutable memtable exists before this flushable entry in
					// the memtable queue. To ensure this, we just need to unset
					// d.mu.mem.mutable. When a newer WAL is replayed, we will
					// set d.mu.mem.mutable to a newer value.
					d.mu.mem.mutable = nil
				} else {
					toFlush = append(toFlush, entry)
					// During WAL replay, the LSM only has L0, hence the
					// baseLevel is 1. For the sake of simplicity, we place the
					// ingested files in L0 here, instead of finding their
					// target levels. It is expected that WAL replay should be
					// rare, and that flushables of type ingestedFlushable
					// should also be rare. So, placing the ingested files in L0
					// is alright.
					//
					// TODO(bananabrick): Maybe refactor this function to allow
					// us to easily place ingested files in levels as low as
					// possible during WAL replay. It would require breaking up
					// the application of ve to the manifest into chunks and is
					// not pretty w/o a refactor to this function and how it's
					// used.
					c := newFlush(
						d.opts, d.mu.versions.currentVersion(),
						1, /* base level */
						[]*flushableEntry{entry},
						d.timeNow(),
					)
					for _, file := range c.flushing[0].flushable.(*ingestedFlushable).files {
						ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: 0, Meta: file.FileMetadata})
					}
				}
				return toFlush, maxSeqNum, nil
			}
		}

		if b.memTableSize >= uint64(d.largeBatchThreshold) {
			flushMem()
			// Make a copy of the data slice since it is currently owned by buf and will
			// be reused in the next iteration.
			b.data = append([]byte(nil), b.data...)
			b.flushable, err = newFlushableBatch(&b, d.opts.Comparer)
			if err != nil {
				return nil, 0, err
			}
			entry := d.newFlushableEntry(b.flushable, logNum, b.SeqNum())
			// Disable memory accounting by adding a reader ref that will never be
			// removed.
			entry.readerRefs.Add(1)
			if d.opts.ReadOnly {
				d.mu.mem.queue = append(d.mu.mem.queue, entry)
				// We added the flushable batch to the flushable queue. But
				// there must be at least one WAL entry waiting to be replayed.
				// We have to ensure this newer WAL entry isn't replayed into
				// the current value of d.mu.mem.mutable because the current
				// mutable memtable exists before this flushable entry in the
				// memtable queue. To ensure this, we just need to unset
				// d.mu.mem.mutable. When a newer WAL is replayed, we will set
				// d.mu.mem.mutable to a newer value.
				d.mu.mem.mutable = nil
			} else {
				toFlush = append(toFlush, entry)
			}
		} else {
			ensureMem(seqNum)
			if err = mem.prepare(&b); err != nil && err != arenaskl.ErrArenaFull {
				return nil, 0, err
			}
			// We loop since DB.newMemTable() slowly grows the size of allocated memtables, so the
			// batch may not initially fit, but will eventually fit (since it is smaller than
			// largeBatchThreshold).
			for err == arenaskl.ErrArenaFull {
				flushMem()
				ensureMem(seqNum)
				err = mem.prepare(&b)
				if err != nil && err != arenaskl.ErrArenaFull {
					return nil, 0, err
				}
			}
			if err = mem.apply(&b, seqNum); err != nil {
				return nil, 0, err
			}
			mem.writerUnref()
		}
		buf.Reset()
	}

	d.opts.Logger.Infof("[JOB %d] WAL file %s with log number %s stopped reading at offset: %d; replayed %d keys in %d batches", jobID, filename, logNum.String(), offset, keysReplayed, batchesReplayed)
	flushMem()

	// mem is nil here.
	if !d.opts.ReadOnly {
		err = updateVE()
		if err != nil {
			return nil, 0, err
		}
	}
	return toFlush, maxSeqNum, err
}
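// Sketch (not part of the original file): opening an existing store read-only.
// As described in replayWAL above, in read-only mode WAL contents are replayed
// into memtables but never flushed, so Open leaves the directory untouched.
// The path, key, and function name are assumptions for the example.
func exampleOpenReadOnly() error {
	db, err := Open("/tmp/pebble-example" /* hypothetical path */, &Options{ReadOnly: true})
	if err != nil {
		return err
	}
	defer db.Close()

	_, closer, err := db.Get([]byte("some-key"))
	if err != nil {
		return err // includes ErrNotFound when the key is absent
	}
	return closer.Close()
}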
func checkOptions(opts *Options, path string) (strictWALTail bool, err error) {
	f, err := opts.FS.Open(path)
	if err != nil {
		return false, err
	}
	defer f.Close()

	data, err := io.ReadAll(f)
	if err != nil {
		return false, err
	}
	return opts.checkOptions(string(data))
}

// DBDesc briefly describes high-level state about a database.
type DBDesc struct {
	// Exists is true if an existing database was found.
	Exists bool
	// FormatMajorVersion indicates the database's current format
	// version.
	FormatMajorVersion FormatMajorVersion
	// ManifestFilename is the filename of the current active manifest,
	// if the database exists.
	ManifestFilename string
}

// Peek looks for an existing database in dirname on the provided FS. It
// returns a brief description of the database. Peek is read-only and
// does not open the database.
func Peek(dirname string, fs vfs.FS) (*DBDesc, error) {
	vers, versMarker, err := lookupFormatMajorVersion(fs, dirname)
	if err != nil {
		return nil, err
	}
	// TODO(jackson): Immediately closing the marker is clunky. Add a
	// PeekMarker variant that avoids opening the directory.
	if err := versMarker.Close(); err != nil {
		return nil, err
	}

	// Find the currently active manifest, if there is one.
	manifestMarker, manifestFileNum, exists, err := findCurrentManifest(vers, fs, dirname)
	if err != nil {
		return nil, err
	}
	// TODO(jackson): Immediately closing the marker is clunky. Add a
	// PeekMarker variant that avoids opening the directory.
	if err := manifestMarker.Close(); err != nil {
		return nil, err
	}

	desc := &DBDesc{
		Exists:             exists,
		FormatMajorVersion: vers,
	}
	if exists {
		desc.ManifestFilename = base.MakeFilepath(fs, dirname, fileTypeManifest, manifestFileNum)
	}
	return desc, nil
}
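// Sketch (not part of the original file): using Peek to check for an existing
// store without opening it, e.g. to decide whether Open would create a fresh
// DB. The path and function name are assumptions for the example.
func examplePeek() {
	desc, err := Peek("/tmp/pebble-example" /* hypothetical path */, vfs.Default)
	if err != nil {
		return
	}
	if !desc.Exists {
		fmt.Println("no database here; Open would create one")
		return
	}
	fmt.Println("format version:", desc.FormatMajorVersion, "manifest:", desc.ManifestFilename)
}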
// LockDirectory acquires the database directory lock in the named directory,
// preventing another process from opening the database. LockDirectory returns a
// handle to the held lock that may be passed to Open through Options.Lock to
// subsequently open the database, skipping lock acquisition during Open.
//
// LockDirectory may be used to expand the critical section protected by the
// database lock to include setup before the call to Open.
func LockDirectory(dirname string, fs vfs.FS) (*Lock, error) {
	fileLock, err := fs.Lock(base.MakeFilepath(fs, dirname, fileTypeLock, base.FileNum(0).DiskFileNum()))
	if err != nil {
		return nil, err
	}
	l := &Lock{dirname: dirname, fileLock: fileLock}
	l.refs.Store(1)
	invariants.SetFinalizer(l, func(obj interface{}) {
		if refs := obj.(*Lock).refs.Load(); refs > 0 {
			panic(errors.AssertionFailedf("lock for %q finalized with %d refs", dirname, refs))
		}
	})
	return l, nil
}

// Lock represents a file lock on a directory. It may be passed to Open through
// Options.Lock to elide lock acquisition during Open.
type Lock struct {
	dirname  string
	fileLock io.Closer
	// refs is a count of the number of handles on the lock. refs must be 0, 1
	// or 2.
	//
	// When acquired by the client and passed to Open, refs = 1 and the Open
	// call increments it to 2. When the database is closed, it's decremented to
	// 1. Finally, when the original caller calls Close on the Lock, it's
	// decremented to zero and the underlying file lock is released.
	//
	// When Open acquires the file lock, refs remains at 1 until the database is
	// closed.
	refs atomic.Int32
}

func (l *Lock) refForOpen() error {
	// During Open, when a user passed in a lock, the reference count must be
	// exactly 1. If it's zero, the lock is no longer held and is invalid. If
	// it's 2, the lock is already in use by another database within the
	// process.
	if !l.refs.CompareAndSwap(1, 2) {
		return errors.Errorf("pebble: unexpected Lock reference count; is the lock already in use?")
	}
	return nil
}

// Close releases the lock, permitting another process to lock and open the
// database. Close must not be called until after a database using the Lock has
// been closed.
func (l *Lock) Close() error {
	if l.refs.Add(-1) > 0 {
		return nil
	}
	defer func() { l.fileLock = nil }()
	return l.fileLock.Close()
}

// ErrDBDoesNotExist is generated when ErrorIfNotExists is set and the database
// does not exist.
//
// Note that errors can be wrapped with more details; use errors.Is().
var ErrDBDoesNotExist = errors.New("pebble: database does not exist")

// ErrDBAlreadyExists is generated when ErrorIfExists is set and the database
// already exists.
//
// Note that errors can be wrapped with more details; use errors.Is().
var ErrDBAlreadyExists = errors.New("pebble: database already exists")

// ErrDBNotPristine is generated when ErrorIfNotPristine is set and the database
// already exists and is not pristine.
//
// Note that errors can be wrapped with more details; use errors.Is().
var ErrDBNotPristine = errors.New("pebble: database already exists and is not pristine")

// IsCorruptionError returns true if the given error indicates database
// corruption.
func IsCorruptionError(err error) bool {
	return errors.Is(err, base.ErrCorruption)
}
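// Sketch (not part of the original file): acquiring the directory lock ahead
// of time with LockDirectory, then handing it to Open via Options.Lock. Error
// handling uses errors.Is with the sentinel errors defined above. The path and
// function name are assumptions for the example.
func exampleLockThenOpen() error {
	lock, err := LockDirectory("/tmp/pebble-example" /* hypothetical path */, vfs.Default)
	if err != nil {
		return err
	}
	// The caller keeps its own reference on the Lock; release it last. The
	// deferred Close runs after db.Close() below has completed.
	defer lock.Close()

	db, err := Open("/tmp/pebble-example", &Options{
		Lock:             lock,
		ErrorIfNotExists: true, // fail instead of creating a new store
	})
	switch {
	case errors.Is(err, ErrDBDoesNotExist):
		return err // no store at this path
	case IsCorruptionError(err):
		return err // on-disk state is corrupt
	case err != nil:
		return err
	}
	return db.Close()
}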
func checkConsistency(v *manifest.Version, dirname string, objProvider objstorage.Provider) error {
	var buf bytes.Buffer
	var args []interface{}

	dedup := make(map[base.DiskFileNum]struct{})
	for level, files := range v.Levels {
		iter := files.Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			backingState := f.FileBacking
			if _, ok := dedup[backingState.DiskFileNum]; ok {
				continue
			}
			dedup[backingState.DiskFileNum] = struct{}{}
			fileNum := backingState.DiskFileNum
			fileSize := backingState.Size
			// We allow foreign objects to have a mismatch between sizes. This is
			// because we might skew the backing size stored by our objprovider
			// to prevent us from over-prioritizing this file for compaction.
			meta, err := objProvider.Lookup(base.FileTypeTable, fileNum)
			var size int64
			if err == nil {
				if objProvider.IsSharedForeign(meta) {
					continue
				}
				size, err = objProvider.Size(meta)
			}
			if err != nil {
				buf.WriteString("L%d: %s: %v\n")
				args = append(args, errors.Safe(level), errors.Safe(fileNum), err)
				continue
			}

			if size != int64(fileSize) {
				buf.WriteString("L%d: %s: object size mismatch (%s): %d (disk) != %d (MANIFEST)\n")
				args = append(args, errors.Safe(level), errors.Safe(fileNum), objProvider.Path(meta),
					errors.Safe(size), errors.Safe(fileSize))
				continue
			}
		}
	}

	if buf.Len() == 0 {
		return nil
	}
	return errors.Errorf(buf.String(), args...)
}
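// Final sketch (not part of the original file): an in-memory store, handy for
// tests. Setting Options.FormatMajorVersion asks Open to ratchet a newly
// created (or older) store up to that version, as done near the end of Open
// above. The function name is an assumption for the example.
func exampleInMemoryOpen() error {
	db, err := Open("", &Options{
		FS:                 vfs.NewMem(),
		FormatMajorVersion: FormatNewest,
	})
	if err != nil {
		return err
	}
	return db.Close()
}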