// Source: github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/snapshot.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"context"
	"io"
	"math"
	"sync"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/rangekey"
	"github.com/cockroachdb/pebble/sstable"
)

// ErrSnapshotExcised is returned from WaitForFileOnlySnapshot if an excise
// overlapping with one of the EventuallyFileOnlySnapshot's KeyRanges gets
// applied before the transition of that EFOS to a file-only snapshot.
var ErrSnapshotExcised = errors.New("pebble: snapshot excised before conversion to file-only snapshot")

// Snapshot provides a read-only point-in-time view of the DB state.
type Snapshot struct {
	// The db the snapshot was created from. It is reset to nil when the
	// snapshot is closed; the read methods panic with ErrClosed once it is nil.
	db *DB
	// The sequence number handed to iterators and scans created from this
	// snapshot (via snapshotIterOpts.seqNum).
	seqNum uint64

	// Set if part of an EventuallyFileOnlySnapshot.
	efos *EventuallyFileOnlySnapshot

	// The list the snapshot is linked into.
	list *snapshotList

	// The next/prev link for the snapshotList doubly-linked list of snapshots.
	prev, next *Snapshot
}

// Compile-time check that *Snapshot satisfies the Reader interface.
var _ Reader = (*Snapshot)(nil)

// Get gets the value for the given key. It returns ErrNotFound if the Snapshot
// does not contain the key.
//
// The caller should not modify the contents of the returned slice, but it is
// safe to modify the contents of the argument after Get returns. The returned
// slice will remain valid until the returned Closer is closed. On success, the
// caller MUST call closer.Close() or a memory leak will occur.
51 func (s *Snapshot) Get(key []byte) ([]byte, io.Closer, error) { 52 if s.db == nil { 53 panic(ErrClosed) 54 } 55 return s.db.getInternal(key, nil /* batch */, s) 56 } 57 58 // NewIter returns an iterator that is unpositioned (Iterator.Valid() will 59 // return false). The iterator can be positioned via a call to SeekGE, 60 // SeekLT, First or Last. 61 func (s *Snapshot) NewIter(o *IterOptions) (*Iterator, error) { 62 return s.NewIterWithContext(context.Background(), o) 63 } 64 65 // NewIterWithContext is like NewIter, and additionally accepts a context for 66 // tracing. 67 func (s *Snapshot) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) { 68 if s.db == nil { 69 panic(ErrClosed) 70 } 71 return s.db.newIter(ctx, nil /* batch */, newIterOpts{ 72 snapshot: snapshotIterOpts{seqNum: s.seqNum}, 73 }, o), nil 74 } 75 76 // ScanInternal scans all internal keys within the specified bounds, truncating 77 // any rangedels and rangekeys to those bounds. For use when an external user 78 // needs to be aware of all internal keys that make up a key range. 79 // 80 // See comment on db.ScanInternal for the behaviour that can be expected of 81 // point keys deleted by range dels and keys masked by range keys. 
82 func (s *Snapshot) ScanInternal( 83 ctx context.Context, 84 categoryAndQoS sstable.CategoryAndQoS, 85 lower, upper []byte, 86 visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error, 87 visitRangeDel func(start, end []byte, seqNum uint64) error, 88 visitRangeKey func(start, end []byte, keys []rangekey.Key) error, 89 visitSharedFile func(sst *SharedSSTMeta) error, 90 ) error { 91 if s.db == nil { 92 panic(ErrClosed) 93 } 94 scanInternalOpts := &scanInternalOptions{ 95 CategoryAndQoS: categoryAndQoS, 96 visitPointKey: visitPointKey, 97 visitRangeDel: visitRangeDel, 98 visitRangeKey: visitRangeKey, 99 visitSharedFile: visitSharedFile, 100 skipSharedLevels: visitSharedFile != nil, 101 IterOptions: IterOptions{ 102 KeyTypes: IterKeyTypePointsAndRanges, 103 LowerBound: lower, 104 UpperBound: upper, 105 }, 106 } 107 108 iter, err := s.db.newInternalIter(ctx, snapshotIterOpts{seqNum: s.seqNum}, scanInternalOpts) 109 if err != nil { 110 return err 111 } 112 defer iter.close() 113 114 return scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts) 115 } 116 117 // closeLocked is similar to Close(), except it requires that db.mu be held 118 // by the caller. 119 func (s *Snapshot) closeLocked() error { 120 s.db.mu.snapshots.remove(s) 121 122 // If s was the previous earliest snapshot, we might be able to reclaim 123 // disk space by dropping obsolete records that were pinned by s. 124 if e := s.db.mu.snapshots.earliest(); e > s.seqNum { 125 s.db.maybeScheduleCompactionPicker(pickElisionOnly) 126 } 127 s.db = nil 128 return nil 129 } 130 131 // Close closes the snapshot, releasing its resources. Close must be called. 132 // Failure to do so will result in a tiny memory leak and a large leak of 133 // resources on disk due to the entries the snapshot is preventing from being 134 // deleted. 135 // 136 // d.mu must NOT be held by the caller. 
137 func (s *Snapshot) Close() error { 138 db := s.db 139 if db == nil { 140 panic(ErrClosed) 141 } 142 db.mu.Lock() 143 defer db.mu.Unlock() 144 return s.closeLocked() 145 } 146 147 type snapshotList struct { 148 root Snapshot 149 } 150 151 func (l *snapshotList) init() { 152 l.root.next = &l.root 153 l.root.prev = &l.root 154 } 155 156 func (l *snapshotList) empty() bool { 157 return l.root.next == &l.root 158 } 159 160 func (l *snapshotList) count() int { 161 if l.empty() { 162 return 0 163 } 164 var count int 165 for i := l.root.next; i != &l.root; i = i.next { 166 count++ 167 } 168 return count 169 } 170 171 func (l *snapshotList) earliest() uint64 { 172 v := uint64(math.MaxUint64) 173 if !l.empty() { 174 v = l.root.next.seqNum 175 } 176 return v 177 } 178 179 func (l *snapshotList) toSlice() []uint64 { 180 if l.empty() { 181 return nil 182 } 183 var results []uint64 184 for i := l.root.next; i != &l.root; i = i.next { 185 results = append(results, i.seqNum) 186 } 187 return results 188 } 189 190 func (l *snapshotList) pushBack(s *Snapshot) { 191 if s.list != nil || s.prev != nil || s.next != nil { 192 panic("pebble: snapshot list is inconsistent") 193 } 194 s.prev = l.root.prev 195 s.prev.next = s 196 s.next = &l.root 197 s.next.prev = s 198 s.list = l 199 } 200 201 func (l *snapshotList) remove(s *Snapshot) { 202 if s == &l.root { 203 panic("pebble: cannot remove snapshot list root node") 204 } 205 if s.list != l { 206 panic("pebble: snapshot list is inconsistent") 207 } 208 s.prev.next = s.next 209 s.next.prev = s.prev 210 s.next = nil // avoid memory leaks 211 s.prev = nil // avoid memory leaks 212 s.list = nil // avoid memory leaks 213 } 214 215 // EventuallyFileOnlySnapshot (aka EFOS) provides a read-only point-in-time view 216 // of the database state, similar to Snapshot. An EventuallyFileOnlySnapshot 217 // induces less write amplification than Snapshot, at the cost of increased space 218 // amplification. 
While a Snapshot may increase write amplification across all 219 // flushes and compactions for the duration of its lifetime, an 220 // EventuallyFileOnlySnapshot only incurs that cost for flushes/compactions if 221 // memtables at the time of EFOS instantiation contained keys that the EFOS is 222 // interested in (i.e. its protectedRanges). In that case, the EFOS prevents 223 // elision of keys visible to it, similar to a Snapshot, until those memtables 224 // are flushed, and once that happens, the "EventuallyFileOnlySnapshot" 225 // transitions to a file-only snapshot state in which it pins zombies sstables 226 // like an open Iterator would, without pinning any memtables. Callers that can 227 // tolerate the increased space amplification of pinning zombie sstables until 228 // the snapshot is closed may prefer EventuallyFileOnlySnapshots for their 229 // reduced write amplification. Callers that desire the benefits of the file-only 230 // state that requires no pinning of memtables should call 231 // `WaitForFileOnlySnapshot()` (and possibly re-mint an EFOS if it returns 232 // ErrSnapshotExcised) before relying on the EFOS to keep producing iterators 233 // with zero write-amp and zero pinning of memtables in memory. 234 // 235 // EventuallyFileOnlySnapshots interact with the IngestAndExcise operation in 236 // subtle ways. No new iterators can be created once 237 // EventuallyFileOnlySnapshot.excised is set to true. 238 type EventuallyFileOnlySnapshot struct { 239 mu struct { 240 // NB: If both this mutex and db.mu are being grabbed, db.mu should be 241 // grabbed _before_ grabbing this one. 242 sync.Mutex 243 244 // Either the snap field is set below, or the version is set at any given 245 // point of time. If a snapshot is referenced, this is not a file-only 246 // snapshot yet, and if a version is set (and ref'd) this is a file-only 247 // snapshot. 248 249 // The wrapped regular snapshot, if not a file-only snapshot yet. 
250 snap *Snapshot 251 // The wrapped version reference, if a file-only snapshot. 252 vers *version 253 254 // The readState corresponding to when this EFOS was created. Only set 255 // if alwaysCreateIters is true. 256 rs *readState 257 } 258 259 // Key ranges to watch for an excise on. 260 protectedRanges []KeyRange 261 // excised, if true, signals that the above ranges were excised during the 262 // lifetime of this snapshot. 263 excised atomic.Bool 264 265 // The db the snapshot was created from. 266 db *DB 267 seqNum uint64 268 269 // If true, this EventuallyFileOnlySnapshot will always generate iterators that 270 // retain snapshot semantics, by holding onto the readState if a conflicting 271 // excise were to happen. Only used in some tests to enforce deterministic 272 // behaviour around excises. 273 alwaysCreateIters bool 274 275 closed chan struct{} 276 } 277 278 func (d *DB) makeEventuallyFileOnlySnapshot( 279 keyRanges []KeyRange, internalKeyRanges []internalKeyRange, 280 ) *EventuallyFileOnlySnapshot { 281 isFileOnly := true 282 283 d.mu.Lock() 284 defer d.mu.Unlock() 285 seqNum := d.mu.versions.visibleSeqNum.Load() 286 // Check if any of the keyRanges overlap with a memtable. 287 for i := range d.mu.mem.queue { 288 mem := d.mu.mem.queue[i] 289 if ingestMemtableOverlaps(d.cmp, mem, internalKeyRanges) { 290 isFileOnly = false 291 break 292 } 293 } 294 es := &EventuallyFileOnlySnapshot{ 295 db: d, 296 seqNum: seqNum, 297 protectedRanges: keyRanges, 298 closed: make(chan struct{}), 299 alwaysCreateIters: d.opts.private.efosAlwaysCreatesIterators, 300 } 301 if es.alwaysCreateIters { 302 es.mu.rs = d.loadReadState() 303 } 304 if isFileOnly { 305 es.mu.vers = d.mu.versions.currentVersion() 306 es.mu.vers.Ref() 307 } else { 308 s := &Snapshot{ 309 db: d, 310 seqNum: seqNum, 311 } 312 s.efos = es 313 es.mu.snap = s 314 d.mu.snapshots.pushBack(s) 315 } 316 return es 317 } 318 319 // Transitions this EventuallyFileOnlySnapshot to a file-only snapshot. 
Requires 320 // earliestUnflushedSeqNum and vers to correspond to the same Version from the 321 // current or a past acquisition of db.mu. vers must have been Ref()'d before 322 // that mutex was released, if it was released. 323 // 324 // NB: The caller is expected to check for es.excised before making this 325 // call. 326 // 327 // d.mu must be held when calling this method. 328 func (es *EventuallyFileOnlySnapshot) transitionToFileOnlySnapshot(vers *version) error { 329 es.mu.Lock() 330 select { 331 case <-es.closed: 332 vers.UnrefLocked() 333 es.mu.Unlock() 334 return ErrClosed 335 default: 336 } 337 if es.mu.snap == nil { 338 es.mu.Unlock() 339 panic("pebble: tried to transition an eventually-file-only-snapshot twice") 340 } 341 // The caller has already called Ref() on vers. 342 es.mu.vers = vers 343 // NB: The callers should have already done a check of es.excised. 344 oldSnap := es.mu.snap 345 es.mu.snap = nil 346 es.mu.Unlock() 347 return oldSnap.closeLocked() 348 } 349 350 // hasTransitioned returns true if this EFOS has transitioned to a file-only 351 // snapshot. 352 func (es *EventuallyFileOnlySnapshot) hasTransitioned() bool { 353 es.mu.Lock() 354 defer es.mu.Unlock() 355 return es.mu.vers != nil 356 } 357 358 // waitForFlush waits for a flush on any memtables that need to be flushed 359 // before this EFOS can transition to a file-only snapshot. If this EFOS is 360 // waiting on a flush of the mutable memtable, it forces a rotation within 361 // `dur` duration. For immutable memtables, it schedules a flush and waits for 362 // it to finish. 
363 func (es *EventuallyFileOnlySnapshot) waitForFlush(ctx context.Context, dur time.Duration) error { 364 es.db.mu.Lock() 365 defer es.db.mu.Unlock() 366 367 earliestUnflushedSeqNum := es.db.getEarliestUnflushedSeqNumLocked() 368 for earliestUnflushedSeqNum < es.seqNum { 369 select { 370 case <-es.closed: 371 return ErrClosed 372 case <-ctx.Done(): 373 return ctx.Err() 374 default: 375 } 376 // Check if the current mutable memtable contains keys less than seqNum. 377 // If so, rotate it. 378 if es.db.mu.mem.mutable.logSeqNum < es.seqNum && dur.Nanoseconds() > 0 { 379 es.db.maybeScheduleDelayedFlush(es.db.mu.mem.mutable, dur) 380 } else { 381 // Find the last memtable that contains seqNums less than es.seqNum, 382 // and force a flush on it. 383 var mem *flushableEntry 384 for i := range es.db.mu.mem.queue { 385 if es.db.mu.mem.queue[i].logSeqNum < es.seqNum { 386 mem = es.db.mu.mem.queue[i] 387 } 388 } 389 mem.flushForced = true 390 es.db.maybeScheduleFlush() 391 } 392 es.db.mu.compact.cond.Wait() 393 394 earliestUnflushedSeqNum = es.db.getEarliestUnflushedSeqNumLocked() 395 } 396 if es.excised.Load() { 397 return ErrSnapshotExcised 398 } 399 return nil 400 } 401 402 // WaitForFileOnlySnapshot blocks the calling goroutine until this snapshot 403 // has been converted into a file-only snapshot (i.e. all memtables containing 404 // keys < seqNum are flushed). A duration can be passed in, and if nonzero, 405 // a delayed flush will be scheduled at that duration if necessary. 406 // 407 // Idempotent; can be called multiple times with no side effects. 408 func (es *EventuallyFileOnlySnapshot) WaitForFileOnlySnapshot( 409 ctx context.Context, dur time.Duration, 410 ) error { 411 if es.hasTransitioned() { 412 return nil 413 } 414 415 if err := es.waitForFlush(ctx, dur); err != nil { 416 return err 417 } 418 419 if invariants.Enabled { 420 // Since we aren't returning an error, we _must_ have transitioned to a 421 // file-only snapshot by now. 
422 if !es.hasTransitioned() { 423 panic("expected EFOS to have transitioned to file-only snapshot after flush") 424 } 425 } 426 return nil 427 } 428 429 // Close closes the file-only snapshot and releases all referenced resources. 430 // Not idempotent. 431 func (es *EventuallyFileOnlySnapshot) Close() error { 432 close(es.closed) 433 es.db.mu.Lock() 434 defer es.db.mu.Unlock() 435 es.mu.Lock() 436 defer es.mu.Unlock() 437 438 if es.mu.snap != nil { 439 if err := es.mu.snap.closeLocked(); err != nil { 440 return err 441 } 442 } 443 if es.mu.vers != nil { 444 es.mu.vers.UnrefLocked() 445 } 446 if es.mu.rs != nil { 447 es.mu.rs.unrefLocked() 448 } 449 return nil 450 } 451 452 // Get implements the Reader interface. 453 func (es *EventuallyFileOnlySnapshot) Get(key []byte) (value []byte, closer io.Closer, err error) { 454 // TODO(jackson): Use getInternal. 455 iter, err := es.NewIter(nil) 456 if err != nil { 457 return nil, nil, err 458 } 459 var valid bool 460 if es.db.opts.Comparer.Split != nil { 461 valid = iter.SeekPrefixGE(key) 462 } else { 463 valid = iter.SeekGE(key) 464 } 465 if !valid { 466 if err = firstError(iter.Error(), iter.Close()); err != nil { 467 return nil, nil, err 468 } 469 return nil, nil, ErrNotFound 470 } 471 if !es.db.equal(iter.Key(), key) { 472 return nil, nil, firstError(iter.Close(), ErrNotFound) 473 } 474 return iter.Value(), iter, nil 475 } 476 477 // NewIter returns an iterator that is unpositioned (Iterator.Valid() will 478 // return false). The iterator can be positioned via a call to SeekGE, 479 // SeekLT, First or Last. 480 func (es *EventuallyFileOnlySnapshot) NewIter(o *IterOptions) (*Iterator, error) { 481 return es.NewIterWithContext(context.Background(), o) 482 } 483 484 func (es *EventuallyFileOnlySnapshot) newAlwaysCreateIterWithContext( 485 ctx context.Context, o *IterOptions, 486 ) (*Iterator, error) { 487 // Grab the db mutex. 
This avoids races down below, where we could get 488 // excised between the es.excised.Load() call, and the newIter call. 489 es.db.mu.Lock() 490 defer es.db.mu.Unlock() 491 es.mu.Lock() 492 defer es.mu.Unlock() 493 if es.mu.vers != nil { 494 sOpts := snapshotIterOpts{seqNum: es.seqNum, vers: es.mu.vers} 495 return es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o), nil 496 } 497 498 sOpts := snapshotIterOpts{seqNum: es.seqNum} 499 if es.excised.Load() { 500 if es.mu.rs == nil { 501 return nil, errors.AssertionFailedf("unexpected nil readState in EFOS' alwaysCreateIters mode") 502 } 503 sOpts.readState = es.mu.rs 504 } 505 iter := es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o) 506 return iter, nil 507 } 508 509 // NewIterWithContext is like NewIter, and additionally accepts a context for 510 // tracing. 511 func (es *EventuallyFileOnlySnapshot) NewIterWithContext( 512 ctx context.Context, o *IterOptions, 513 ) (*Iterator, error) { 514 select { 515 case <-es.closed: 516 panic(ErrClosed) 517 default: 518 } 519 520 if es.alwaysCreateIters { 521 return es.newAlwaysCreateIterWithContext(ctx, o) 522 } 523 es.mu.Lock() 524 defer es.mu.Unlock() 525 if es.mu.vers != nil { 526 sOpts := snapshotIterOpts{seqNum: es.seqNum, vers: es.mu.vers} 527 return es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o), nil 528 } 529 530 sOpts := snapshotIterOpts{seqNum: es.seqNum} 531 if es.excised.Load() { 532 return nil, ErrSnapshotExcised 533 } 534 iter := es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o) 535 536 // If excised is true, then keys relevant to the snapshot might not be 537 // present in the readState being used by the iterator. 538 if es.excised.Load() { 539 iter.Close() 540 return nil, ErrSnapshotExcised 541 } 542 return iter, nil 543 } 544 545 // ScanInternal scans all internal keys within the specified bounds, truncating 546 // any rangedels and rangekeys to those bounds. 
For use when an external user 547 // needs to be aware of all internal keys that make up a key range. 548 // 549 // See comment on db.ScanInternal for the behaviour that can be expected of 550 // point keys deleted by range dels and keys masked by range keys. 551 func (es *EventuallyFileOnlySnapshot) ScanInternal( 552 ctx context.Context, 553 categoryAndQoS sstable.CategoryAndQoS, 554 lower, upper []byte, 555 visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error, 556 visitRangeDel func(start, end []byte, seqNum uint64) error, 557 visitRangeKey func(start, end []byte, keys []rangekey.Key) error, 558 visitSharedFile func(sst *SharedSSTMeta) error, 559 ) error { 560 if es.db == nil { 561 panic(ErrClosed) 562 } 563 if es.excised.Load() && !es.alwaysCreateIters { 564 return ErrSnapshotExcised 565 } 566 var sOpts snapshotIterOpts 567 opts := &scanInternalOptions{ 568 CategoryAndQoS: categoryAndQoS, 569 IterOptions: IterOptions{ 570 KeyTypes: IterKeyTypePointsAndRanges, 571 LowerBound: lower, 572 UpperBound: upper, 573 }, 574 visitPointKey: visitPointKey, 575 visitRangeDel: visitRangeDel, 576 visitRangeKey: visitRangeKey, 577 visitSharedFile: visitSharedFile, 578 skipSharedLevels: visitSharedFile != nil, 579 } 580 if es.alwaysCreateIters { 581 // Grab the db mutex. This avoids races down below as it prevents excises 582 // from taking effect until the iterator is instantiated. 
583 es.db.mu.Lock() 584 } 585 es.mu.Lock() 586 if es.mu.vers != nil { 587 sOpts = snapshotIterOpts{ 588 seqNum: es.seqNum, 589 vers: es.mu.vers, 590 } 591 } else { 592 if es.excised.Load() && es.alwaysCreateIters { 593 sOpts = snapshotIterOpts{ 594 readState: es.mu.rs, 595 seqNum: es.seqNum, 596 } 597 } else { 598 sOpts = snapshotIterOpts{ 599 seqNum: es.seqNum, 600 } 601 } 602 } 603 es.mu.Unlock() 604 iter, err := es.db.newInternalIter(ctx, sOpts, opts) 605 if err != nil { 606 return err 607 } 608 defer iter.close() 609 if es.alwaysCreateIters { 610 // See the similar conditional above where we grab this mutex. 611 es.db.mu.Unlock() 612 } 613 614 // If excised is true, then keys relevant to the snapshot might not be 615 // present in the readState being used by the iterator. Error out. 616 if es.excised.Load() && !es.alwaysCreateIters { 617 return ErrSnapshotExcised 618 } 619 620 return scanInternalImpl(ctx, lower, upper, iter, opts) 621 }