github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/open.go

// Copyright 2012 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
    "bytes"
    "fmt"
    "io"
    "io/ioutil"
    "math"
    "os"
    "sort"
    "sync/atomic"
    "time"

    "github.com/cockroachdb/errors"
    "github.com/zuoyebang/bitalostable/internal/arenaskl"
    "github.com/zuoyebang/bitalostable/internal/base"
    "github.com/zuoyebang/bitalostable/internal/cache"
    "github.com/zuoyebang/bitalostable/internal/invariants"
    "github.com/zuoyebang/bitalostable/internal/manual"
    "github.com/zuoyebang/bitalostable/internal/rate"
    "github.com/zuoyebang/bitalostable/record"
    "github.com/zuoyebang/bitalostable/vfs"
)

const (
    initialMemTableSize = 256 << 10 // 256 KB

    // The max batch size is limited by the uint32 offsets stored in
    // internal/batchskl.node, DeferredBatchOp, and flushableBatchEntry.
    maxBatchSize = 4 << 30 // 4 GB

    // The max memtable size is limited by the uint32 offsets stored in
    // internal/arenaskl.node, DeferredBatchOp, and flushableBatchEntry.
    maxMemTableSize = 4 << 30 // 4 GB
)

// TableCacheSize returns the table cache size to use for a single DB, given
// the maximum number of open files available to a table cache that is used
// by only that DB.
func TableCacheSize(maxOpenFiles int) int {
    tableCacheSize := maxOpenFiles - numNonTableCacheFiles
    if tableCacheSize < minTableCacheSize {
        tableCacheSize = minTableCacheSize
    }
    return tableCacheSize
}

// Open opens a DB whose files live in the given directory.
func Open(dirname string, opts *Options) (db *DB, _ error) {
    // Make a copy of the options so that we don't mutate the passed in options.
    opts = opts.Clone()
    opts = opts.EnsureDefaults()
    if err := opts.Validate(); err != nil {
        return nil, err
    }

    if opts.Cache == nil {
        opts.Cache = cache.New(cacheDefaultSize)
    } else {
        opts.Cache.Ref()
    }

    d := &DB{
        cacheID:             opts.Cache.NewID(),
        dirname:             dirname,
        walDirname:          opts.WALDir,
        opts:                opts,
        cmp:                 opts.Comparer.Compare,
        equal:               opts.equal(),
        merge:               opts.Merger.Merge,
        split:               opts.Comparer.Split,
        abbreviatedKey:      opts.Comparer.AbbreviatedKey,
        largeBatchThreshold: (opts.MemTableSize - int(memTableEmptySize)) / 2,
        logRecycler:         logRecycler{limit: opts.MemTableStopWritesThreshold + 1},
        closed:              new(atomic.Value),
        closedCh:            make(chan struct{}),
    }
    d.mu.versions = &versionSet{}
    d.atomic.diskAvailBytes = math.MaxUint64
    d.mu.versions.diskAvailBytes = d.getDiskAvailableBytesCached

    defer func() {
        // If an error or panic occurs during open, attempt to release the manually
        // allocated memory resources. Note that rather than look for an error, we
        // look for the return of a nil DB pointer.
        if r := recover(); db == nil {
            // Release our references to the Cache. Note that both the DB, and
            // tableCache have a reference. When we release the reference to
            // the tableCache, and if there are no other references to
            // the tableCache, then the tableCache will also release its
            // reference to the cache.
            opts.Cache.Unref()

            if d.tableCache != nil {
                _ = d.tableCache.close()
            }

            for _, mem := range d.mu.mem.queue {
                switch t := mem.flushable.(type) {
                case *memTable:
                    manual.Free(t.arenaBuf)
                    t.arenaBuf = nil
                }
            }
            if r != nil {
                panic(r)
            }
        }
    }()

    tableCacheSize := TableCacheSize(opts.MaxOpenFiles)
    d.tableCache = newTableCacheContainer(opts.TableCache, d.cacheID, dirname, opts.FS, d.opts, tableCacheSize)
    d.newIters = d.tableCache.newIters
    d.tableNewRangeKeyIter = d.tableCache.newRangeKeyIter

    d.commit = newCommitPipeline(commitEnv{
        logSeqNum:     &d.mu.versions.atomic.logSeqNum,
        visibleSeqNum: &d.mu.versions.atomic.visibleSeqNum,
        apply:         d.commitApply,
        write:         d.commitWrite,
    })
    d.deletionLimiter = rate.NewLimiter(
        rate.Limit(d.opts.Experimental.MinDeletionRate),
        d.opts.Experimental.MinDeletionRate)
    d.mu.nextJobID = 1
    d.mu.mem.nextSize = opts.MemTableSize
    //if d.mu.mem.nextSize > initialMemTableSize {
    //    d.mu.mem.nextSize = initialMemTableSize
    //}
    d.mu.mem.cond.L = &d.mu.Mutex
    d.mu.cleaner.cond.L = &d.mu.Mutex
    d.mu.compact.cond.L = &d.mu.Mutex
    d.mu.compact.inProgress = make(map[*compaction]struct{})
    d.mu.compact.noOngoingFlushStartTime = time.Now()
    d.mu.snapshots.init()
    // logSeqNum is the next sequence number that will be assigned. Start
    // assigning sequence numbers from 1 to match rocksdb.
    d.mu.versions.atomic.logSeqNum = 1

    d.timeNow = time.Now

    d.mu.Lock()
    defer d.mu.Unlock()

    if !d.opts.ReadOnly {
        err := opts.FS.MkdirAll(dirname, 0755)
        if err != nil {
            return nil, err
        }
    }

    // Ensure we close resources if we error out early. If the database is
    // successfully opened, the named return value `db` will be set to `d`.
    defer func() {
        if db != nil {
            // The database was successfully opened.
            return
        }
        if d.dataDir != nil {
            d.dataDir.Close()
        }
        if d.walDirname != d.dirname && d.walDir != nil {
            d.walDir.Close()
        }
        if d.mu.formatVers.marker != nil {
            d.mu.formatVers.marker.Close()
        }
    }()

    // Open the database and WAL directories first in order to check for their
    // existence.
    var err error
    d.dataDir, err = opts.FS.OpenDir(dirname)
    if err != nil {
        return nil, err
    }
    if d.walDirname == "" {
        d.walDirname = d.dirname
    }
    if d.walDirname == d.dirname {
        d.walDir = d.dataDir
    } else {
        if !d.opts.ReadOnly {
            err := opts.FS.MkdirAll(d.walDirname, 0755)
            if err != nil {
                return nil, err
            }
        }
        d.walDir, err = opts.FS.OpenDir(d.walDirname)
        if err != nil {
            return nil, err
        }
    }

    // Lock the database directory.
    fileLock, err := opts.FS.Lock(base.MakeFilepath(opts.FS, dirname, fileTypeLock, 0))
    if err != nil {
        d.dataDir.Close()
        if d.dataDir != d.walDir {
            d.walDir.Close()
        }
        return nil, err
    }
    defer func() {
        if fileLock != nil {
            fileLock.Close()
        }
    }()

    // Establish the format major version.
    {
        d.mu.formatVers.vers, d.mu.formatVers.marker, err = lookupFormatMajorVersion(opts.FS, dirname)
        if err != nil {
            return nil, err
        }
        if !d.opts.ReadOnly {
            if err := d.mu.formatVers.marker.RemoveObsolete(); err != nil {
                return nil, err
            }
        }
    }

    jobID := d.mu.nextJobID
    d.mu.nextJobID++

    // Find the currently active manifest, if there is one.
    manifestMarker, manifestFileNum, exists, err := findCurrentManifest(d.mu.formatVers.vers, opts.FS, dirname)
    setCurrent := setCurrentFunc(d.mu.formatVers.vers, manifestMarker, opts.FS, dirname, d.dataDir)
    defer func() {
        // Ensure we close the manifest marker if we error out for any reason.
        // If the database is successfully opened, the *versionSet will take
        // ownership over the manifest marker, ensuring it's closed when the DB
        // is closed.
        if db == nil {
            manifestMarker.Close()
        }
    }()
    if err != nil {
        return nil, errors.Wrapf(err, "bitalostable: database %q", dirname)
    } else if !exists && !d.opts.ReadOnly && !d.opts.ErrorIfNotExists {
        // Create the DB if it did not already exist.

        if err := d.mu.versions.create(jobID, dirname, opts, manifestMarker, setCurrent, &d.mu.Mutex); err != nil {
            return nil, err
        }
    } else if opts.ErrorIfExists {
        return nil, errors.Errorf("bitalostable: database %q already exists", dirname)
    } else {
        // Load the version set.
        if err := d.mu.versions.load(dirname, opts, manifestFileNum, manifestMarker, setCurrent, &d.mu.Mutex); err != nil {
            return nil, err
        }
        if err := d.mu.versions.currentVersion().CheckConsistency(dirname, opts.FS); err != nil {
            return nil, err
        }
    }

    // If the Options specify a format major version higher than the
    // loaded database's, upgrade it. If this is a new database, this
    // code path also performs an initial upgrade from the starting
    // implicit MostCompatible version.
    if !d.opts.ReadOnly && opts.FormatMajorVersion > d.mu.formatVers.vers {
        if err := d.ratchetFormatMajorVersionLocked(opts.FormatMajorVersion); err != nil {
            return nil, err
        }
    }

    // Atomic markers like the one used for the MANIFEST may leave
    // behind obsolete files if there's a crash mid-update. Clean these
    // up if we're not in read-only mode.
    if !d.opts.ReadOnly {
        if err := manifestMarker.RemoveObsolete(); err != nil {
            return nil, err
        }
    }

    // In read-only mode, we replay directly into the mutable memtable but never
    // flush it. We need to delay creation of the memtable until we know the
    // sequence number of the first batch that will be inserted.
    if !d.opts.ReadOnly {
        var entry *flushableEntry
        d.mu.mem.mutable, entry = d.newMemTable(0 /* logNum */, d.mu.versions.atomic.logSeqNum)
        d.mu.mem.queue = append(d.mu.mem.queue, entry)
    }

    ls, err := opts.FS.List(d.walDirname)
    if err != nil {
        return nil, err
    }
    if d.dirname != d.walDirname {
        ls2, err := opts.FS.List(d.dirname)
        if err != nil {
            return nil, err
        }
        ls = append(ls, ls2...)
    }

    // Replay any newer log files than the ones named in the manifest.
    type fileNumAndName struct {
        num  FileNum
        name string
    }
    var logFiles []fileNumAndName
    var previousOptionsFileNum FileNum
    var previousOptionsFilename string
    for _, filename := range ls {
        ft, fn, ok := base.ParseFilename(opts.FS, filename)
        if !ok {
            continue
        }

        // Don't reuse any obsolete file numbers to avoid modifying an
        // ingested sstable's original external file.
        if d.mu.versions.nextFileNum <= fn {
            d.mu.versions.nextFileNum = fn + 1
        }

        switch ft {
        case fileTypeLog:
            if fn >= d.mu.versions.minUnflushedLogNum {
                logFiles = append(logFiles, fileNumAndName{fn, filename})
            }
            if d.logRecycler.minRecycleLogNum <= fn {
                d.logRecycler.minRecycleLogNum = fn + 1
            }
        case fileTypeOptions:
            if previousOptionsFileNum < fn {
                previousOptionsFileNum = fn
                previousOptionsFilename = filename
            }
        case fileTypeTemp, fileTypeOldTemp:
            if !d.opts.ReadOnly {
                // Some codepaths write to a temporary file and then
                // rename it to its final location when complete. A
                // temp file is leftover if a process exits before the
                // rename. Remove it.
                err := opts.FS.Remove(opts.FS.PathJoin(dirname, filename))
                if err != nil {
                    return nil, err
                }
            }
        }
    }

    // Validate the most-recent OPTIONS file, if there is one.
    var strictWALTail bool
    if previousOptionsFilename != "" {
        path := opts.FS.PathJoin(dirname, previousOptionsFilename)
        strictWALTail, err = checkOptions(opts, path)
        if err != nil {
            return nil, err
        }
    }

    sort.Slice(logFiles, func(i, j int) bool {
        return logFiles[i].num < logFiles[j].num
    })

    var ve versionEdit
    for i, lf := range logFiles {
        lastWAL := i == len(logFiles)-1
        maxSeqNum, err := d.replayWAL(jobID, &ve, opts.FS,
            opts.FS.PathJoin(d.walDirname, lf.name), lf.num, strictWALTail && !lastWAL)
        if err != nil {
            return nil, err
        }
        d.mu.versions.markFileNumUsed(lf.num)
        if d.mu.versions.atomic.logSeqNum < maxSeqNum {
            d.mu.versions.atomic.logSeqNum = maxSeqNum
        }
    }
    d.mu.versions.atomic.visibleSeqNum = d.mu.versions.atomic.logSeqNum

    if !d.opts.ReadOnly {
        // Create an empty .log file.
        newLogNum := d.mu.versions.getNextFileNum()

        // This logic is slightly different than RocksDB's. Specifically, RocksDB
        // sets MinUnflushedLogNum to max-recovered-log-num + 1. We set it to the
        // newLogNum. There should be no difference in using either value.
        ve.MinUnflushedLogNum = newLogNum

        // Create the manifest with the updated MinUnflushedLogNum before
        // creating the new log file. If we created the log file first, a
        // crash before the manifest is synced could leave two WALs with
        // unclean tails.
        d.mu.versions.logLock()
        if err := d.mu.versions.logAndApply(jobID, &ve, newFileMetrics(ve.NewFiles), false /* forceRotation */, func() []compactionInfo {
            return nil
        }); err != nil {
            return nil, err
        }

        newLogName := base.MakeFilepath(opts.FS, d.walDirname, fileTypeLog, newLogNum)
        d.mu.log.queue = append(d.mu.log.queue, fileInfo{fileNum: newLogNum, fileSize: 0})
        logFile, err := opts.FS.Create(newLogName)
        if err != nil {
            return nil, err
        }
        if err := d.walDir.Sync(); err != nil {
            return nil, err
        }
        d.opts.EventListener.WALCreated(WALCreateInfo{
            JobID:   jobID,
            Path:    newLogName,
            FileNum: newLogNum,
        })
        // This isn't strictly necessary as we don't use the log number for
        // memtables being flushed, only for the next unflushed memtable.
        d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum = newLogNum

        logFile = vfs.NewSyncingFile(logFile, vfs.SyncingFileOptions{
            NoSyncOnClose:   d.opts.NoSyncOnClose,
            BytesPerSync:    d.opts.WALBytesPerSync,
            PreallocateSize: d.walPreallocateSize(),
        })
        d.mu.log.LogWriter = record.NewLogWriter(logFile, newLogNum)
        d.mu.log.LogWriter.SetMinSyncInterval(d.opts.WALMinSyncInterval)
        d.mu.versions.metrics.WAL.Files++
    }
    d.updateReadStateLocked(d.opts.DebugCheck)

    if !d.opts.ReadOnly {
        // Write the current options to disk.
        d.optionsFileNum = d.mu.versions.getNextFileNum()
        tmpPath := base.MakeFilepath(opts.FS, dirname, fileTypeTemp, d.optionsFileNum)
        optionsPath := base.MakeFilepath(opts.FS, dirname, fileTypeOptions, d.optionsFileNum)

        // Write them to a temporary file first, in case we crash before
        // we're done. A corrupt options file prevents opening the
        // database.
        optionsFile, err := opts.FS.Create(tmpPath)
        if err != nil {
            return nil, err
        }
        serializedOpts := []byte(opts.String())
        if _, err := optionsFile.Write(serializedOpts); err != nil {
            return nil, errors.CombineErrors(err, optionsFile.Close())
        }
        d.optionsFileSize = uint64(len(serializedOpts))
        if err := optionsFile.Sync(); err != nil {
            return nil, errors.CombineErrors(err, optionsFile.Close())
        }
        if err := optionsFile.Close(); err != nil {
            return nil, err
        }
        // Atomically rename to the OPTIONS-XXXXXX path. This rename is
        // guaranteed to be atomic because the destination path does not
        // exist.
        if err := opts.FS.Rename(tmpPath, optionsPath); err != nil {
            return nil, err
        }
        if err := d.dataDir.Sync(); err != nil {
            return nil, err
        }
    }

    if !d.opts.ReadOnly {
        d.scanObsoleteFiles(ls)
        d.deleteObsoleteFiles(jobID, true /* waitForOngoing */)
    } else {
        // All the log files are obsolete.
        d.mu.versions.metrics.WAL.Files = int64(len(logFiles))
    }
    d.mu.tableStats.cond.L = &d.mu.Mutex
    d.mu.tableValidation.cond.L = &d.mu.Mutex
    if !d.opts.ReadOnly && !d.opts.private.disableTableStats {
        d.maybeCollectTableStatsLocked()
    }
    d.calculateDiskAvailableBytes()

    d.maybeScheduleFlush(false)
    d.maybeScheduleCompaction()

    // Note: this is a no-op if invariants are disabled or race is enabled.
    //
    // Setting a finalizer on *DB causes *DB to never be reclaimed and the
    // finalizer to never be run. The problem is due to this limitation of
    // finalizers mentioned in the SetFinalizer docs:
    //
    //   If a cyclic structure includes a block with a finalizer, that cycle is
    //   not guaranteed to be garbage collected and the finalizer is not
    //   guaranteed to run, because there is no ordering that respects the
    //   dependencies.
    //
    // DB has cycles with several of its internal structures: readState,
    // newIters, tableCache, versions, etc. Each of these individually causes a
    // cycle and prevents the finalizer from being run. But we can work around
    // this finalizer limitation by setting a finalizer on another object that is
    // tied to the lifetime of DB: the DB.closed atomic.Value.
    dPtr := fmt.Sprintf("%p", d)
    invariants.SetFinalizer(d.closed, func(obj interface{}) {
        v := obj.(*atomic.Value)
        if err := v.Load(); err == nil {
            fmt.Fprintf(os.Stderr, "%s: unreferenced DB not closed\n", dPtr)
            os.Exit(1)
        }
    })

    d.fileLock, fileLock = fileLock, nil
    d.opts.Logger.Info("open bitalostable success")
    return d, nil
}

// GetVersion returns the engine version string from the latest options
// file present in dir. Used to check what Pebble or RocksDB version was last
// used to write to the database stored in this directory. An empty string is
// returned if no valid OPTIONS file with a version key was found.
func GetVersion(dir string, fs vfs.FS) (string, error) {
    ls, err := fs.List(dir)
    if err != nil {
        return "", err
    }
    var version string
    lastOptionsSeen := FileNum(0)
    for _, filename := range ls {
        ft, fn, ok := base.ParseFilename(fs, filename)
        if !ok {
            continue
        }
        switch ft {
        case fileTypeOptions:
            // If this file has a higher number than the last options file
            // processed, reset version. This is because rocksdb often
            // writes multiple options files without deleting previous ones.
            // Otherwise, skip parsing this options file.
            if fn > lastOptionsSeen {
                version = ""
                lastOptionsSeen = fn
            } else {
                continue
            }
            f, err := fs.Open(fs.PathJoin(dir, filename))
            if err != nil {
                return "", err
            }
            data, err := ioutil.ReadAll(f)
            f.Close()

            if err != nil {
                return "", err
            }
            err = parseOptions(string(data), func(section, key, value string) error {
                switch {
                case section == "Version":
                    switch key {
                    case "bitalostable_version":
                        version = value
                    case "rocksdb_version":
                        version = fmt.Sprintf("rocksdb v%s", value)
                    }
                }
                return nil
            })
            if err != nil {
                return "", err
            }
        }
    }
    return version, nil
}

// replayWAL replays the edits in the specified log file.
//
// d.mu must be held when calling this, but the mutex may be dropped and
// re-acquired during the course of this method.
func (d *DB) replayWAL(
    jobID int, ve *versionEdit, fs vfs.FS, filename string, logNum FileNum, strictWALTail bool,
) (maxSeqNum uint64, err error) {
    file, err := fs.Open(filename)
    if err != nil {
        return 0, err
    }
    defer file.Close()

    var (
        b               Batch
        buf             bytes.Buffer
        mem             *memTable
        entry           *flushableEntry
        toFlush         flushableList
        rr              = record.NewReader(file, logNum)
        offset          int64 // byte offset in rr
        lastFlushOffset int64
    )

    if d.opts.ReadOnly {
        // In read-only mode, we replay directly into the mutable memtable which will
        // never be flushed.
        mem = d.mu.mem.mutable
        if mem != nil {
            entry = d.mu.mem.queue[len(d.mu.mem.queue)-1]
        }
    }

    // Flushes the current memtable, if not nil.
    flushMem := func() {
        if mem == nil {
            return
        }
        var logSize uint64
        if offset >= lastFlushOffset {
            logSize = uint64(offset - lastFlushOffset)
        }
        // Else, this was the initial memtable in the read-only case which must have
        // been empty, but we need to flush it since we don't want to add to it later.
        lastFlushOffset = offset
        entry.logSize = logSize
        if !d.opts.ReadOnly {
            toFlush = append(toFlush, entry)
        }
        mem, entry = nil, nil
    }
    // Creates a new memtable if there is no current memtable.
    ensureMem := func(seqNum uint64) {
        if mem != nil {
            return
        }
        mem, entry = d.newMemTable(logNum, seqNum)
        if d.opts.ReadOnly {
            d.mu.mem.mutable = mem
            d.mu.mem.queue = append(d.mu.mem.queue, entry)
        }
    }
    for {
        offset = rr.Offset()
        r, err := rr.Next()
        if err == nil {
            _, err = io.Copy(&buf, r)
        }
        if err != nil {
            // It is common to encounter a zeroed or invalid chunk due to WAL
            // preallocation and WAL recycling. We need to distinguish these
            // errors from EOF in order to recognize that the record was
            // truncated and to avoid replaying subsequent WALs, but want
            // to otherwise treat them like EOF.
            if err == io.EOF {
                break
            } else if record.IsInvalidRecord(err) && !strictWALTail {
                break
            }
            return 0, errors.Wrap(err, "bitalostable: error when replaying WAL")
        }

        if buf.Len() < batchHeaderLen {
            return 0, base.CorruptionErrorf("bitalostable: corrupt log file %q (num %s)",
                filename, errors.Safe(logNum))
        }

        // Specify Batch.db so that Batch.SetRepr will compute Batch.memTableSize
        // which is used below.
        b = Batch{db: d}
        b.SetRepr(buf.Bytes())
        seqNum := b.SeqNum()
        maxSeqNum = seqNum + uint64(b.Count())

        if b.memTableSize >= uint64(d.largeBatchThreshold) {
            flushMem()
            // Make a copy of the data slice since it is currently owned by buf and will
            // be reused in the next iteration.
            b.data = append([]byte(nil), b.data...)
            b.flushable = newFlushableBatch(&b, d.opts.Comparer)
            entry := d.newFlushableEntry(b.flushable, logNum, b.SeqNum())
            // Disable memory accounting by adding a reader ref that will never be
            // removed.
            entry.readerRefs++
            if d.opts.ReadOnly {
                d.mu.mem.queue = append(d.mu.mem.queue, entry)
            } else {
                toFlush = append(toFlush, entry)
            }
        } else {
            ensureMem(seqNum)
            if err = mem.prepare(&b); err != nil && err != arenaskl.ErrArenaFull {
                return 0, err
            }
            // We loop since DB.newMemTable() slowly grows the size of allocated memtables, so the
            // batch may not initially fit, but will eventually fit (since it is smaller than
            // largeBatchThreshold).
            for err == arenaskl.ErrArenaFull {
                flushMem()
                ensureMem(seqNum)
                err = mem.prepare(&b)
                if err != nil && err != arenaskl.ErrArenaFull {
                    return 0, err
                }
            }
            if err = mem.apply(&b, seqNum); err != nil {
                return 0, err
            }
            mem.writerUnref()
        }
        buf.Reset()
    }
    flushMem()
    // mem is nil here.
    if !d.opts.ReadOnly {
        c := newFlush(d.opts, d.mu.versions.currentVersion(),
            1 /* base level */, toFlush)
        newVE, _, err := d.runCompaction(jobID, c)
        if err != nil {
            return 0, err
        }
        ve.NewFiles = append(ve.NewFiles, newVE.NewFiles...)
        for i := range toFlush {
            toFlush[i].readerUnref()
        }
    }
    return maxSeqNum, err
}

func checkOptions(opts *Options, path string) (strictWALTail bool, err error) {
    f, err := opts.FS.Open(path)
    if err != nil {
        return false, err
    }
    defer f.Close()

    data, err := ioutil.ReadAll(f)
    if err != nil {
        return false, err
    }
    return opts.checkOptions(string(data))
}

// DBDesc briefly describes high-level state about a database.
type DBDesc struct {
    // Exists is true if an existing database was found.
    Exists bool
    // FormatMajorVersion indicates the database's current format
    // version.
    FormatMajorVersion FormatMajorVersion
    // ManifestFilename is the filename of the current active manifest,
    // if the database exists.
    ManifestFilename string
}

// Peek looks for an existing database in dirname on the provided FS. It
// returns a brief description of the database. Peek is read-only and
// does not open the database.
func Peek(dirname string, fs vfs.FS) (*DBDesc, error) {
    vers, versMarker, err := lookupFormatMajorVersion(fs, dirname)
    if err != nil {
        return nil, err
    }
    // TODO(jackson): Immediately closing the marker is clunky. Add a
    // PeekMarker variant that avoids opening the directory.
    if err := versMarker.Close(); err != nil {
        return nil, err
    }

    // Find the currently active manifest, if there is one.
    manifestMarker, manifestFileNum, exists, err := findCurrentManifest(vers, fs, dirname)
    if err != nil {
        return nil, err
    }
    // TODO(jackson): Immediately closing the marker is clunky. Add a
    // PeekMarker variant that avoids opening the directory.
    if err := manifestMarker.Close(); err != nil {
        return nil, err
    }

    desc := &DBDesc{
        Exists:             exists,
        FormatMajorVersion: vers,
    }
    if exists {
        desc.ManifestFilename = base.MakeFilepath(fs, dirname, fileTypeManifest, manifestFileNum)
    }
    return desc, nil
}
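
// The example functions below are illustrative sketches only; they are not
// part of the original open.go. They show typical use of the exported entry
// points in this file. Identifiers not defined in this file (DB.Close,
// vfs.NewMem) are assumed to exist as in the upstream Pebble API that
// bitalostable is derived from.

// exampleOpenClose sketches the basic Open/Close lifecycle against an
// in-memory filesystem so that nothing is left on disk.
func exampleOpenClose() error {
    opts := &Options{
        // Assumption: vfs.NewMem provides an in-memory vfs.FS, as in upstream Pebble.
        FS: vfs.NewMem(),
    }
    // Open clones and validates opts, creates the directory if needed,
    // replays any WALs, and writes a fresh OPTIONS file before returning.
    db, err := Open("demo", opts)
    if err != nil {
        return err
    }
    // Assumption: DB.Close exists (as in upstream Pebble) and releases the
    // file lock, table cache, and cache references acquired in Open.
    return db.Close()
}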
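
// examplePeek sketches how Peek and GetVersion (both defined above) can be
// combined to inspect a database directory without opening it. It is an
// illustrative helper, not part of the original file.
func examplePeek(dirname string, fs vfs.FS) (string, error) {
    desc, err := Peek(dirname, fs)
    if err != nil {
        return "", err
    }
    if !desc.Exists {
        // No active manifest was found, so there is no engine version to report.
        return "", nil
    }
    // GetVersion scans the OPTIONS files and returns the version string of the
    // engine that last wrote to the directory ("" if no version key was found).
    return GetVersion(dirname, fs)
}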
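
// writeFileAtomically is a sketch of the write-to-temp-then-rename pattern
// that Open uses for the OPTIONS file: write and sync a temporary file,
// rename it onto the final path (atomic because the destination does not
// exist), then sync the parent directory so the rename itself is durable.
// It is not part of the original file; it only restates the pattern using
// the same vfs.FS calls Open makes.
func writeFileAtomically(fs vfs.FS, dir, tmpPath, finalPath string, data []byte) error {
    f, err := fs.Create(tmpPath)
    if err != nil {
        return err
    }
    if _, err := f.Write(data); err != nil {
        return errors.CombineErrors(err, f.Close())
    }
    if err := f.Sync(); err != nil {
        return errors.CombineErrors(err, f.Close())
    }
    if err := f.Close(); err != nil {
        return err
    }
    if err := fs.Rename(tmpPath, finalPath); err != nil {
        return err
    }
    dirFile, err := fs.OpenDir(dir)
    if err != nil {
        return err
    }
    if err := dirFile.Sync(); err != nil {
        return errors.CombineErrors(err, dirFile.Close())
    }
    return dirFile.Close()
}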