github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/snapshot.go (about) 1 // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "context" 9 "io" 10 "math" 11 "sync" 12 "sync/atomic" 13 "time" 14 15 "github.com/cockroachdb/errors" 16 "github.com/cockroachdb/pebble/internal/invariants" 17 "github.com/cockroachdb/pebble/rangekey" 18 ) 19 20 // ErrSnapshotExcised is returned from WaitForFileOnlySnapshot if an excise 21 // overlapping with one of the EventuallyFileOnlySnapshot's KeyRanges gets 22 // applied before the transition of that EFOS to a file-only snapshot. 23 var ErrSnapshotExcised = errors.New("pebble: snapshot excised before conversion to file-only snapshot") 24 25 // Snapshot provides a read-only point-in-time view of the DB state. 26 type Snapshot struct { 27 // The db the snapshot was created from. 28 db *DB 29 seqNum uint64 30 31 // Set if part of an EventuallyFileOnlySnapshot. 32 efos *EventuallyFileOnlySnapshot 33 34 // The list the snapshot is linked into. 35 list *snapshotList 36 37 // The next/prev link for the snapshotList doubly-linked list of snapshots. 38 prev, next *Snapshot 39 } 40 41 var _ Reader = (*Snapshot)(nil) 42 43 // Get gets the value for the given key. It returns ErrNotFound if the Snapshot 44 // does not contain the key. 45 // 46 // The caller should not modify the contents of the returned slice, but it is 47 // safe to modify the contents of the argument after Get returns. The returned 48 // slice will remain valid until the returned Closer is closed. On success, the 49 // caller MUST call closer.Close() or a memory leak will occur. 50 func (s *Snapshot) Get(key []byte) ([]byte, io.Closer, error) { 51 if s.db == nil { 52 panic(ErrClosed) 53 } 54 return s.db.getInternal(key, nil /* batch */, s) 55 } 56 57 // NewIter returns an iterator that is unpositioned (Iterator.Valid() will 58 // return false). The iterator can be positioned via a call to SeekGE, 59 // SeekLT, First or Last. 60 func (s *Snapshot) NewIter(o *IterOptions) (*Iterator, error) { 61 return s.NewIterWithContext(context.Background(), o) 62 } 63 64 // NewIterWithContext is like NewIter, and additionally accepts a context for 65 // tracing. 66 func (s *Snapshot) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) { 67 if s.db == nil { 68 panic(ErrClosed) 69 } 70 return s.db.newIter(ctx, nil /* batch */, snapshotIterOpts{seqNum: s.seqNum}, o), nil 71 } 72 73 // ScanInternal scans all internal keys within the specified bounds, truncating 74 // any rangedels and rangekeys to those bounds. For use when an external user 75 // needs to be aware of all internal keys that make up a key range. 76 // 77 // See comment on db.ScanInternal for the behaviour that can be expected of 78 // point keys deleted by range dels and keys masked by range keys. 79 func (s *Snapshot) ScanInternal( 80 ctx context.Context, 81 lower, upper []byte, 82 visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error, 83 visitRangeDel func(start, end []byte, seqNum uint64) error, 84 visitRangeKey func(start, end []byte, keys []rangekey.Key) error, 85 visitSharedFile func(sst *SharedSSTMeta) error, 86 ) error { 87 if s.db == nil { 88 panic(ErrClosed) 89 } 90 scanInternalOpts := &scanInternalOptions{ 91 visitPointKey: visitPointKey, 92 visitRangeDel: visitRangeDel, 93 visitRangeKey: visitRangeKey, 94 visitSharedFile: visitSharedFile, 95 skipSharedLevels: visitSharedFile != nil, 96 IterOptions: IterOptions{ 97 KeyTypes: IterKeyTypePointsAndRanges, 98 LowerBound: lower, 99 UpperBound: upper, 100 }, 101 } 102 103 iter := s.db.newInternalIter(snapshotIterOpts{seqNum: s.seqNum}, scanInternalOpts) 104 defer iter.close() 105 106 return scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts) 107 } 108 109 // closeLocked is similar to Close(), except it requires that db.mu be held 110 // by the caller. 111 func (s *Snapshot) closeLocked() error { 112 s.db.mu.snapshots.remove(s) 113 114 // If s was the previous earliest snapshot, we might be able to reclaim 115 // disk space by dropping obsolete records that were pinned by s. 116 if e := s.db.mu.snapshots.earliest(); e > s.seqNum { 117 s.db.maybeScheduleCompactionPicker(pickElisionOnly) 118 } 119 s.db = nil 120 return nil 121 } 122 123 // Close closes the snapshot, releasing its resources. Close must be called. 124 // Failure to do so will result in a tiny memory leak and a large leak of 125 // resources on disk due to the entries the snapshot is preventing from being 126 // deleted. 127 // 128 // d.mu must NOT be held by the caller. 129 func (s *Snapshot) Close() error { 130 db := s.db 131 if db == nil { 132 panic(ErrClosed) 133 } 134 db.mu.Lock() 135 defer db.mu.Unlock() 136 return s.closeLocked() 137 } 138 139 type snapshotList struct { 140 root Snapshot 141 } 142 143 func (l *snapshotList) init() { 144 l.root.next = &l.root 145 l.root.prev = &l.root 146 } 147 148 func (l *snapshotList) empty() bool { 149 return l.root.next == &l.root 150 } 151 152 func (l *snapshotList) count() int { 153 if l.empty() { 154 return 0 155 } 156 var count int 157 for i := l.root.next; i != &l.root; i = i.next { 158 count++ 159 } 160 return count 161 } 162 163 func (l *snapshotList) earliest() uint64 { 164 v := uint64(math.MaxUint64) 165 if !l.empty() { 166 v = l.root.next.seqNum 167 } 168 return v 169 } 170 171 func (l *snapshotList) toSlice() []uint64 { 172 if l.empty() { 173 return nil 174 } 175 var results []uint64 176 for i := l.root.next; i != &l.root; i = i.next { 177 results = append(results, i.seqNum) 178 } 179 return results 180 } 181 182 func (l *snapshotList) pushBack(s *Snapshot) { 183 if s.list != nil || s.prev != nil || s.next != nil { 184 panic("pebble: snapshot list is inconsistent") 185 } 186 s.prev = l.root.prev 187 s.prev.next = s 188 s.next = &l.root 189 s.next.prev = s 190 s.list = l 191 } 192 193 func (l *snapshotList) remove(s *Snapshot) { 194 if s == &l.root { 195 panic("pebble: cannot remove snapshot list root node") 196 } 197 if s.list != l { 198 panic("pebble: snapshot list is inconsistent") 199 } 200 s.prev.next = s.next 201 s.next.prev = s.prev 202 s.next = nil // avoid memory leaks 203 s.prev = nil // avoid memory leaks 204 s.list = nil // avoid memory leaks 205 } 206 207 // EventuallyFileOnlySnapshot (aka EFOS) provides a read-only point-in-time view 208 // of the database state, similar to Snapshot. An EventuallyFileOnlySnapshot 209 // induces less write amplification than Snapshot, at the cost of increased space 210 // amplification. While a Snapshot may increase write amplification across all 211 // flushes and compactions for the duration of its lifetime, an 212 // EventuallyFileOnlySnapshot only incurs that cost for flushes/compactions if 213 // memtables at the time of EFOS instantiation contained keys that the EFOS is 214 // interested in (i.e. its protectedRanges). In that case, the EFOS prevents 215 // elision of keys visible to it, similar to a Snapshot, until those memtables 216 // are flushed, and once that happens, the "EventuallyFileOnlySnapshot" 217 // transitions to a file-only snapshot state in which it pins zombies sstables 218 // like an open Iterator would, without pinning any memtables. Callers that can 219 // tolerate the increased space amplification of pinning zombie sstables until 220 // the snapshot is closed may prefer EventuallyFileOnlySnapshots for their 221 // reduced write amplification. Callers that desire the benefits of the file-only 222 // state that requires no pinning of memtables should call 223 // `WaitForFileOnlySnapshot()` (and possibly re-mint an EFOS if it returns 224 // ErrSnapshotExcised) before relying on the EFOS to keep producing iterators 225 // with zero write-amp and zero pinning of memtables in memory. 226 // 227 // EventuallyFileOnlySnapshots interact with the IngestAndExcise operation in 228 // subtle ways. No new iterators can be created once 229 // EventuallyFileOnlySnapshot.excised is set to true. 230 type EventuallyFileOnlySnapshot struct { 231 mu struct { 232 // NB: If both this mutex and db.mu are being grabbed, db.mu should be 233 // grabbed _before_ grabbing this one. 234 sync.Mutex 235 236 // Either the snap field is set below, or the version is set at any given 237 // point of time. If a snapshot is referenced, this is not a file-only 238 // snapshot yet, and if a version is set (and ref'd) this is a file-only 239 // snapshot. 240 241 // The wrapped regular snapshot, if not a file-only snapshot yet. 242 snap *Snapshot 243 // The wrapped version reference, if a file-only snapshot. 244 vers *version 245 } 246 247 // Key ranges to watch for an excise on. 248 protectedRanges []KeyRange 249 // excised, if true, signals that the above ranges were excised during the 250 // lifetime of this snapshot. 251 excised atomic.Bool 252 253 // The db the snapshot was created from. 254 db *DB 255 seqNum uint64 256 257 closed chan struct{} 258 } 259 260 func (d *DB) makeEventuallyFileOnlySnapshot( 261 keyRanges []KeyRange, internalKeyRanges []internalKeyRange, 262 ) *EventuallyFileOnlySnapshot { 263 isFileOnly := true 264 265 d.mu.Lock() 266 defer d.mu.Unlock() 267 seqNum := d.mu.versions.visibleSeqNum.Load() 268 // Check if any of the keyRanges overlap with a memtable. 269 for i := range d.mu.mem.queue { 270 mem := d.mu.mem.queue[i] 271 if ingestMemtableOverlaps(d.cmp, mem, internalKeyRanges) { 272 isFileOnly = false 273 break 274 } 275 } 276 es := &EventuallyFileOnlySnapshot{ 277 db: d, 278 seqNum: seqNum, 279 protectedRanges: keyRanges, 280 closed: make(chan struct{}), 281 } 282 if isFileOnly { 283 es.mu.vers = d.mu.versions.currentVersion() 284 es.mu.vers.Ref() 285 } else { 286 s := &Snapshot{ 287 db: d, 288 seqNum: seqNum, 289 } 290 s.efos = es 291 es.mu.snap = s 292 d.mu.snapshots.pushBack(s) 293 } 294 return es 295 } 296 297 // Transitions this EventuallyFileOnlySnapshot to a file-only snapshot. Requires 298 // earliestUnflushedSeqNum and vers to correspond to the same Version from the 299 // current or a past acquisition of db.mu. vers must have been Ref()'d before 300 // that mutex was released, if it was released. 301 // 302 // NB: The caller is expected to check for es.excised before making this 303 // call. 304 // 305 // d.mu must be held when calling this method. 306 func (es *EventuallyFileOnlySnapshot) transitionToFileOnlySnapshot(vers *version) error { 307 es.mu.Lock() 308 select { 309 case <-es.closed: 310 vers.UnrefLocked() 311 es.mu.Unlock() 312 return ErrClosed 313 default: 314 } 315 if es.mu.snap == nil { 316 es.mu.Unlock() 317 panic("pebble: tried to transition an eventually-file-only-snapshot twice") 318 } 319 // The caller has already called Ref() on vers. 320 es.mu.vers = vers 321 // NB: The callers should have already done a check of es.excised. 322 oldSnap := es.mu.snap 323 es.mu.snap = nil 324 es.mu.Unlock() 325 return oldSnap.closeLocked() 326 } 327 328 // hasTransitioned returns true if this EFOS has transitioned to a file-only 329 // snapshot. 330 func (es *EventuallyFileOnlySnapshot) hasTransitioned() bool { 331 es.mu.Lock() 332 defer es.mu.Unlock() 333 return es.mu.vers != nil 334 } 335 336 // waitForFlush waits for a flush on any memtables that need to be flushed 337 // before this EFOS can transition to a file-only snapshot. If this EFOS is 338 // waiting on a flush of the mutable memtable, it forces a rotation within 339 // `dur` duration. For immutable memtables, it schedules a flush and waits for 340 // it to finish. 341 func (es *EventuallyFileOnlySnapshot) waitForFlush(ctx context.Context, dur time.Duration) error { 342 es.db.mu.Lock() 343 defer es.db.mu.Unlock() 344 345 earliestUnflushedSeqNum := es.db.getEarliestUnflushedSeqNumLocked() 346 for earliestUnflushedSeqNum < es.seqNum { 347 select { 348 case <-es.closed: 349 return ErrClosed 350 case <-ctx.Done(): 351 return ctx.Err() 352 default: 353 } 354 // Check if the current mutable memtable contains keys less than seqNum. 355 // If so, rotate it. 356 if es.db.mu.mem.mutable.logSeqNum < es.seqNum && dur.Nanoseconds() > 0 { 357 es.db.maybeScheduleDelayedFlush(es.db.mu.mem.mutable, dur) 358 } else { 359 // Find the last memtable that contains seqNums less than es.seqNum, 360 // and force a flush on it. 361 var mem *flushableEntry 362 for i := range es.db.mu.mem.queue { 363 if es.db.mu.mem.queue[i].logSeqNum < es.seqNum { 364 mem = es.db.mu.mem.queue[i] 365 } 366 } 367 mem.flushForced = true 368 es.db.maybeScheduleFlush() 369 } 370 es.db.mu.compact.cond.Wait() 371 372 earliestUnflushedSeqNum = es.db.getEarliestUnflushedSeqNumLocked() 373 } 374 if es.excised.Load() { 375 return ErrSnapshotExcised 376 } 377 return nil 378 } 379 380 // WaitForFileOnlySnapshot blocks the calling goroutine until this snapshot 381 // has been converted into a file-only snapshot (i.e. all memtables containing 382 // keys < seqNum are flushed). A duration can be passed in, and if nonzero, 383 // a delayed flush will be scheduled at that duration if necessary. 384 // 385 // Idempotent; can be called multiple times with no side effects. 386 func (es *EventuallyFileOnlySnapshot) WaitForFileOnlySnapshot( 387 ctx context.Context, dur time.Duration, 388 ) error { 389 if es.hasTransitioned() { 390 return nil 391 } 392 393 if err := es.waitForFlush(ctx, dur); err != nil { 394 return err 395 } 396 397 if invariants.Enabled { 398 // Since we aren't returning an error, we _must_ have transitioned to a 399 // file-only snapshot by now. 400 if !es.hasTransitioned() { 401 panic("expected EFOS to have transitioned to file-only snapshot after flush") 402 } 403 } 404 return nil 405 } 406 407 // Close closes the file-only snapshot and releases all referenced resources. 408 // Not idempotent. 409 func (es *EventuallyFileOnlySnapshot) Close() error { 410 close(es.closed) 411 es.db.mu.Lock() 412 defer es.db.mu.Unlock() 413 es.mu.Lock() 414 defer es.mu.Unlock() 415 416 if es.mu.snap != nil { 417 if err := es.mu.snap.closeLocked(); err != nil { 418 return err 419 } 420 } 421 if es.mu.vers != nil { 422 es.mu.vers.UnrefLocked() 423 } 424 return nil 425 } 426 427 // Get implements the Reader interface. 428 func (es *EventuallyFileOnlySnapshot) Get(key []byte) (value []byte, closer io.Closer, err error) { 429 // TODO(jackson): Use getInternal. 430 iter, err := es.NewIter(nil) 431 if err != nil { 432 return nil, nil, err 433 } 434 var valid bool 435 if es.db.opts.Comparer.Split != nil { 436 valid = iter.SeekPrefixGE(key) 437 } else { 438 valid = iter.SeekGE(key) 439 } 440 if !valid { 441 if err = firstError(iter.Error(), iter.Close()); err != nil { 442 return nil, nil, err 443 } 444 return nil, nil, ErrNotFound 445 } 446 if !es.db.equal(iter.Key(), key) { 447 return nil, nil, firstError(iter.Close(), ErrNotFound) 448 } 449 return iter.Value(), iter, nil 450 } 451 452 // NewIter returns an iterator that is unpositioned (Iterator.Valid() will 453 // return false). The iterator can be positioned via a call to SeekGE, 454 // SeekLT, First or Last. 455 func (es *EventuallyFileOnlySnapshot) NewIter(o *IterOptions) (*Iterator, error) { 456 return es.NewIterWithContext(context.Background(), o) 457 } 458 459 // NewIterWithContext is like NewIter, and additionally accepts a context for 460 // tracing. 461 func (es *EventuallyFileOnlySnapshot) NewIterWithContext( 462 ctx context.Context, o *IterOptions, 463 ) (*Iterator, error) { 464 select { 465 case <-es.closed: 466 panic(ErrClosed) 467 default: 468 } 469 470 es.mu.Lock() 471 defer es.mu.Unlock() 472 if es.mu.vers != nil { 473 sOpts := snapshotIterOpts{seqNum: es.seqNum, vers: es.mu.vers} 474 return es.db.newIter(ctx, nil /* batch */, sOpts, o), nil 475 } 476 477 if es.excised.Load() { 478 return nil, ErrSnapshotExcised 479 } 480 sOpts := snapshotIterOpts{seqNum: es.seqNum} 481 iter := es.db.newIter(ctx, nil /* batch */, sOpts, o) 482 483 // If excised is true, then keys relevant to the snapshot might not be 484 // present in the readState being used by the iterator. Error out. 485 if es.excised.Load() { 486 iter.Close() 487 return nil, ErrSnapshotExcised 488 } 489 return iter, nil 490 } 491 492 // ScanInternal scans all internal keys within the specified bounds, truncating 493 // any rangedels and rangekeys to those bounds. For use when an external user 494 // needs to be aware of all internal keys that make up a key range. 495 // 496 // See comment on db.ScanInternal for the behaviour that can be expected of 497 // point keys deleted by range dels and keys masked by range keys. 498 func (es *EventuallyFileOnlySnapshot) ScanInternal( 499 ctx context.Context, 500 lower, upper []byte, 501 visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error, 502 visitRangeDel func(start, end []byte, seqNum uint64) error, 503 visitRangeKey func(start, end []byte, keys []rangekey.Key) error, 504 visitSharedFile func(sst *SharedSSTMeta) error, 505 ) error { 506 if es.db == nil { 507 panic(ErrClosed) 508 } 509 if es.excised.Load() { 510 return ErrSnapshotExcised 511 } 512 var sOpts snapshotIterOpts 513 es.mu.Lock() 514 if es.mu.vers != nil { 515 sOpts = snapshotIterOpts{ 516 seqNum: es.seqNum, 517 vers: es.mu.vers, 518 } 519 } else { 520 sOpts = snapshotIterOpts{ 521 seqNum: es.seqNum, 522 } 523 } 524 es.mu.Unlock() 525 opts := &scanInternalOptions{ 526 IterOptions: IterOptions{ 527 KeyTypes: IterKeyTypePointsAndRanges, 528 LowerBound: lower, 529 UpperBound: upper, 530 }, 531 visitPointKey: visitPointKey, 532 visitRangeDel: visitRangeDel, 533 visitRangeKey: visitRangeKey, 534 visitSharedFile: visitSharedFile, 535 skipSharedLevels: visitSharedFile != nil, 536 } 537 iter := es.db.newInternalIter(sOpts, opts) 538 defer iter.close() 539 540 // If excised is true, then keys relevant to the snapshot might not be 541 // present in the readState being used by the iterator. Error out. 542 if es.excised.Load() { 543 return ErrSnapshotExcised 544 } 545 546 return scanInternalImpl(ctx, lower, upper, iter, opts) 547 }