github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/snapshot.go (about)

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"context"
     9  	"io"
    10  	"math"
    11  	"sync"
    12  	"sync/atomic"
    13  	"time"
    14  
    15  	"github.com/cockroachdb/errors"
    16  	"github.com/cockroachdb/pebble/internal/invariants"
    17  	"github.com/cockroachdb/pebble/rangekey"
    18  )
    19  
// ErrSnapshotExcised is returned from WaitForFileOnlySnapshot if an excise
// overlapping with one of the EventuallyFileOnlySnapshot's KeyRanges gets
// applied before the transition of that EFOS to a file-only snapshot.
//
// The same error is returned by the EFOS read paths in this file when they
// observe the excised flag: NewIter/NewIterWithContext while the EFOS has not
// yet become file-only, and ScanInternal.
var ErrSnapshotExcised = errors.New("pebble: snapshot excised before conversion to file-only snapshot")
    24  
// Snapshot provides a read-only point-in-time view of the DB state.
//
// A Snapshot doubles as a node of a snapshotList (see prev/next below).
// closeLocked unlinks it from that list and clears db; a nil db is what makes
// Get, NewIterWithContext, ScanInternal and Close panic with ErrClosed
// afterwards.
type Snapshot struct {
	// The db the snapshot was created from. Reset to nil once the snapshot
	// has been closed.
	db     *DB
	seqNum uint64

	// Set if part of an EventuallyFileOnlySnapshot.
	efos *EventuallyFileOnlySnapshot

	// The list the snapshot is linked into. Nil while the snapshot is not a
	// member of any list.
	list *snapshotList

	// The next/prev link for the snapshotList doubly-linked list of snapshots.
	prev, next *Snapshot
}

// Compile-time assertion that *Snapshot satisfies the Reader interface.
var _ Reader = (*Snapshot)(nil)
    42  
    43  // Get gets the value for the given key. It returns ErrNotFound if the Snapshot
    44  // does not contain the key.
    45  //
    46  // The caller should not modify the contents of the returned slice, but it is
    47  // safe to modify the contents of the argument after Get returns. The returned
    48  // slice will remain valid until the returned Closer is closed. On success, the
    49  // caller MUST call closer.Close() or a memory leak will occur.
    50  func (s *Snapshot) Get(key []byte) ([]byte, io.Closer, error) {
    51  	if s.db == nil {
    52  		panic(ErrClosed)
    53  	}
    54  	return s.db.getInternal(key, nil /* batch */, s)
    55  }
    56  
    57  // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
    58  // return false). The iterator can be positioned via a call to SeekGE,
    59  // SeekLT, First or Last.
    60  func (s *Snapshot) NewIter(o *IterOptions) (*Iterator, error) {
    61  	return s.NewIterWithContext(context.Background(), o)
    62  }
    63  
    64  // NewIterWithContext is like NewIter, and additionally accepts a context for
    65  // tracing.
    66  func (s *Snapshot) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) {
    67  	if s.db == nil {
    68  		panic(ErrClosed)
    69  	}
    70  	return s.db.newIter(ctx, nil /* batch */, snapshotIterOpts{seqNum: s.seqNum}, o), nil
    71  }
    72  
    73  // ScanInternal scans all internal keys within the specified bounds, truncating
    74  // any rangedels and rangekeys to those bounds. For use when an external user
    75  // needs to be aware of all internal keys that make up a key range.
    76  //
    77  // See comment on db.ScanInternal for the behaviour that can be expected of
    78  // point keys deleted by range dels and keys masked by range keys.
    79  func (s *Snapshot) ScanInternal(
    80  	ctx context.Context,
    81  	lower, upper []byte,
    82  	visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error,
    83  	visitRangeDel func(start, end []byte, seqNum uint64) error,
    84  	visitRangeKey func(start, end []byte, keys []rangekey.Key) error,
    85  	visitSharedFile func(sst *SharedSSTMeta) error,
    86  ) error {
    87  	if s.db == nil {
    88  		panic(ErrClosed)
    89  	}
    90  	scanInternalOpts := &scanInternalOptions{
    91  		visitPointKey:    visitPointKey,
    92  		visitRangeDel:    visitRangeDel,
    93  		visitRangeKey:    visitRangeKey,
    94  		visitSharedFile:  visitSharedFile,
    95  		skipSharedLevels: visitSharedFile != nil,
    96  		IterOptions: IterOptions{
    97  			KeyTypes:   IterKeyTypePointsAndRanges,
    98  			LowerBound: lower,
    99  			UpperBound: upper,
   100  		},
   101  	}
   102  
   103  	iter := s.db.newInternalIter(snapshotIterOpts{seqNum: s.seqNum}, scanInternalOpts)
   104  	defer iter.close()
   105  
   106  	return scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts)
   107  }
   108  
   109  // closeLocked is similar to Close(), except it requires that db.mu be held
   110  // by the caller.
   111  func (s *Snapshot) closeLocked() error {
   112  	s.db.mu.snapshots.remove(s)
   113  
   114  	// If s was the previous earliest snapshot, we might be able to reclaim
   115  	// disk space by dropping obsolete records that were pinned by s.
   116  	if e := s.db.mu.snapshots.earliest(); e > s.seqNum {
   117  		s.db.maybeScheduleCompactionPicker(pickElisionOnly)
   118  	}
   119  	s.db = nil
   120  	return nil
   121  }
   122  
   123  // Close closes the snapshot, releasing its resources. Close must be called.
   124  // Failure to do so will result in a tiny memory leak and a large leak of
   125  // resources on disk due to the entries the snapshot is preventing from being
   126  // deleted.
   127  //
   128  // d.mu must NOT be held by the caller.
   129  func (s *Snapshot) Close() error {
   130  	db := s.db
   131  	if db == nil {
   132  		panic(ErrClosed)
   133  	}
   134  	db.mu.Lock()
   135  	defer db.mu.Unlock()
   136  	return s.closeLocked()
   137  }
   138  
// snapshotList is a circular doubly-linked list of snapshots. root is a
// sentinel node: after init() its next/prev point back at itself, and real
// snapshots are linked between root.prev and root (see pushBack). The list
// must be initialized with init() before use.
type snapshotList struct {
	root Snapshot
}
   142  
   143  func (l *snapshotList) init() {
   144  	l.root.next = &l.root
   145  	l.root.prev = &l.root
   146  }
   147  
   148  func (l *snapshotList) empty() bool {
   149  	return l.root.next == &l.root
   150  }
   151  
   152  func (l *snapshotList) count() int {
   153  	if l.empty() {
   154  		return 0
   155  	}
   156  	var count int
   157  	for i := l.root.next; i != &l.root; i = i.next {
   158  		count++
   159  	}
   160  	return count
   161  }
   162  
   163  func (l *snapshotList) earliest() uint64 {
   164  	v := uint64(math.MaxUint64)
   165  	if !l.empty() {
   166  		v = l.root.next.seqNum
   167  	}
   168  	return v
   169  }
   170  
   171  func (l *snapshotList) toSlice() []uint64 {
   172  	if l.empty() {
   173  		return nil
   174  	}
   175  	var results []uint64
   176  	for i := l.root.next; i != &l.root; i = i.next {
   177  		results = append(results, i.seqNum)
   178  	}
   179  	return results
   180  }
   181  
   182  func (l *snapshotList) pushBack(s *Snapshot) {
   183  	if s.list != nil || s.prev != nil || s.next != nil {
   184  		panic("pebble: snapshot list is inconsistent")
   185  	}
   186  	s.prev = l.root.prev
   187  	s.prev.next = s
   188  	s.next = &l.root
   189  	s.next.prev = s
   190  	s.list = l
   191  }
   192  
   193  func (l *snapshotList) remove(s *Snapshot) {
   194  	if s == &l.root {
   195  		panic("pebble: cannot remove snapshot list root node")
   196  	}
   197  	if s.list != l {
   198  		panic("pebble: snapshot list is inconsistent")
   199  	}
   200  	s.prev.next = s.next
   201  	s.next.prev = s.prev
   202  	s.next = nil // avoid memory leaks
   203  	s.prev = nil // avoid memory leaks
   204  	s.list = nil // avoid memory leaks
   205  }
   206  
// EventuallyFileOnlySnapshot (aka EFOS) provides a read-only point-in-time view
// of the database state, similar to Snapshot. An EventuallyFileOnlySnapshot
// induces less write amplification than Snapshot, at the cost of increased space
// amplification. While a Snapshot may increase write amplification across all
// flushes and compactions for the duration of its lifetime, an
// EventuallyFileOnlySnapshot only incurs that cost for flushes/compactions if
// memtables at the time of EFOS instantiation contained keys that the EFOS is
// interested in (i.e. its protectedRanges). In that case, the EFOS prevents
// elision of keys visible to it, similar to a Snapshot, until those memtables
// are flushed, and once that happens, the "EventuallyFileOnlySnapshot"
// transitions to a file-only snapshot state in which it pins zombie sstables
// like an open Iterator would, without pinning any memtables. Callers that can
// tolerate the increased space amplification of pinning zombie sstables until
// the snapshot is closed may prefer EventuallyFileOnlySnapshots for their
// reduced write amplification. Callers that desire the benefits of the file-only
// state that requires no pinning of memtables should call
// `WaitForFileOnlySnapshot()` (and possibly re-mint an EFOS if it returns
// ErrSnapshotExcised) before relying on the EFOS to keep producing iterators
// with zero write-amp and zero pinning of memtables in memory.
//
// EventuallyFileOnlySnapshots interact with the IngestAndExcise operation in
// subtle ways. No new iterators can be created once
// EventuallyFileOnlySnapshot.excised is set to true.
type EventuallyFileOnlySnapshot struct {
	mu struct {
		// NB: If both this mutex and db.mu are being grabbed, db.mu should be
		// grabbed _before_ grabbing this one.
		sync.Mutex

		// Either the snap field is set below, or the version is set at any given
		// point of time. If a snapshot is referenced, this is not a file-only
		// snapshot yet, and if a version is set (and ref'd) this is a file-only
		// snapshot.

		// The wrapped regular snapshot, if not a file-only snapshot yet.
		snap *Snapshot
		// The wrapped version reference, if a file-only snapshot.
		vers *version
	}

	// Key ranges to watch for an excise on.
	protectedRanges []KeyRange
	// excised, if true, signals that the above ranges were excised during the
	// lifetime of this snapshot.
	excised atomic.Bool

	// The db the snapshot was created from.
	db *DB
	// seqNum is the sequence number the EFOS reads at; it is passed to every
	// iterator the EFOS creates.
	seqNum uint64

	// closed is closed by Close. Other methods poll it with a non-blocking
	// select to detect that the EFOS has been closed.
	closed chan struct{}
}
   259  
   260  func (d *DB) makeEventuallyFileOnlySnapshot(
   261  	keyRanges []KeyRange, internalKeyRanges []internalKeyRange,
   262  ) *EventuallyFileOnlySnapshot {
   263  	isFileOnly := true
   264  
   265  	d.mu.Lock()
   266  	defer d.mu.Unlock()
   267  	seqNum := d.mu.versions.visibleSeqNum.Load()
   268  	// Check if any of the keyRanges overlap with a memtable.
   269  	for i := range d.mu.mem.queue {
   270  		mem := d.mu.mem.queue[i]
   271  		if ingestMemtableOverlaps(d.cmp, mem, internalKeyRanges) {
   272  			isFileOnly = false
   273  			break
   274  		}
   275  	}
   276  	es := &EventuallyFileOnlySnapshot{
   277  		db:              d,
   278  		seqNum:          seqNum,
   279  		protectedRanges: keyRanges,
   280  		closed:          make(chan struct{}),
   281  	}
   282  	if isFileOnly {
   283  		es.mu.vers = d.mu.versions.currentVersion()
   284  		es.mu.vers.Ref()
   285  	} else {
   286  		s := &Snapshot{
   287  			db:     d,
   288  			seqNum: seqNum,
   289  		}
   290  		s.efos = es
   291  		es.mu.snap = s
   292  		d.mu.snapshots.pushBack(s)
   293  	}
   294  	return es
   295  }
   296  
// transitionToFileOnlySnapshot transitions this EventuallyFileOnlySnapshot to
// a file-only snapshot backed by vers. vers must correspond to a Version from
// the current or a past acquisition of db.mu, and must have been Ref()'d
// before that mutex was released, if it was released. Ownership of that
// reference passes to the EFOS on success.
//
// If the EFOS has already been closed, the reference on vers is dropped and
// ErrClosed is returned. The method panics if the EFOS no longer wraps a
// regular snapshot, which is treated as a second transition.
//
// NB: The caller is expected to check for es.excised before making this
// call.
//
// d.mu must be held when calling this method.
func (es *EventuallyFileOnlySnapshot) transitionToFileOnlySnapshot(vers *version) error {
	es.mu.Lock()
	// Non-blocking check for a concurrent/earlier Close.
	select {
	case <-es.closed:
		// Nobody will take ownership of vers; release the caller's reference.
		vers.UnrefLocked()
		es.mu.Unlock()
		return ErrClosed
	default:
	}
	if es.mu.snap == nil {
		es.mu.Unlock()
		panic("pebble: tried to transition an eventually-file-only-snapshot twice")
	}
	// The caller has already called Ref() on vers.
	es.mu.vers = vers
	// NB: The callers should have already done a check of es.excised.
	oldSnap := es.mu.snap
	es.mu.snap = nil
	es.mu.Unlock()
	// The wrapped snapshot is closed after es.mu has been released; db.mu is
	// still held by the caller, as closeLocked requires.
	return oldSnap.closeLocked()
}
   327  
   328  // hasTransitioned returns true if this EFOS has transitioned to a file-only
   329  // snapshot.
   330  func (es *EventuallyFileOnlySnapshot) hasTransitioned() bool {
   331  	es.mu.Lock()
   332  	defer es.mu.Unlock()
   333  	return es.mu.vers != nil
   334  }
   335  
// waitForFlush waits for a flush on any memtables that need to be flushed
// before this EFOS can transition to a file-only snapshot. If this EFOS is
// waiting on a flush of the mutable memtable, it forces a rotation within
// `dur` duration. For immutable memtables, it schedules a flush and waits for
// it to finish.
//
// It returns ErrClosed if the EFOS is closed, ctx.Err() if ctx is done, and
// ErrSnapshotExcised if the excised flag is set once the wait completes. The
// closed channel and ctx are only polled at the top of each loop iteration;
// in between the goroutine is blocked in db.mu.compact.cond.Wait(), so
// closure or cancellation is noticed only after that condition variable is
// next signalled.
//
// The method acquires db.mu; the caller must not hold it.
func (es *EventuallyFileOnlySnapshot) waitForFlush(ctx context.Context, dur time.Duration) error {
	es.db.mu.Lock()
	defer es.db.mu.Unlock()

	earliestUnflushedSeqNum := es.db.getEarliestUnflushedSeqNumLocked()
	// Loop until everything below the snapshot's sequence number is flushed.
	for earliestUnflushedSeqNum < es.seqNum {
		// Non-blocking poll for closure/cancellation.
		select {
		case <-es.closed:
			return ErrClosed
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		// Check if the current mutable memtable contains keys less than seqNum.
		// If so, rotate it.
		if es.db.mu.mem.mutable.logSeqNum < es.seqNum && dur.Nanoseconds() > 0 {
			es.db.maybeScheduleDelayedFlush(es.db.mu.mem.mutable, dur)
		} else {
			// Find the last memtable that contains seqNums less than es.seqNum,
			// and force a flush on it.
			var mem *flushableEntry
			for i := range es.db.mu.mem.queue {
				if es.db.mu.mem.queue[i].logSeqNum < es.seqNum {
					mem = es.db.mu.mem.queue[i]
				}
			}
			// NOTE(review): mem is dereferenced without a nil check. This
			// presumably relies on the loop condition implying that at least
			// one queued memtable has logSeqNum < es.seqNum; confirm against
			// getEarliestUnflushedSeqNumLocked.
			mem.flushForced = true
			es.db.maybeScheduleFlush()
		}
		// Releases db.mu while blocked and re-acquires it before returning.
		es.db.mu.compact.cond.Wait()

		earliestUnflushedSeqNum = es.db.getEarliestUnflushedSeqNumLocked()
	}
	if es.excised.Load() {
		return ErrSnapshotExcised
	}
	return nil
}
   379  
   380  // WaitForFileOnlySnapshot blocks the calling goroutine until this snapshot
   381  // has been converted into a file-only snapshot (i.e. all memtables containing
   382  // keys < seqNum are flushed). A duration can be passed in, and if nonzero,
   383  // a delayed flush will be scheduled at that duration if necessary.
   384  //
   385  // Idempotent; can be called multiple times with no side effects.
   386  func (es *EventuallyFileOnlySnapshot) WaitForFileOnlySnapshot(
   387  	ctx context.Context, dur time.Duration,
   388  ) error {
   389  	if es.hasTransitioned() {
   390  		return nil
   391  	}
   392  
   393  	if err := es.waitForFlush(ctx, dur); err != nil {
   394  		return err
   395  	}
   396  
   397  	if invariants.Enabled {
   398  		// Since we aren't returning an error, we _must_ have transitioned to a
   399  		// file-only snapshot by now.
   400  		if !es.hasTransitioned() {
   401  			panic("expected EFOS to have transitioned to file-only snapshot after flush")
   402  		}
   403  	}
   404  	return nil
   405  }
   406  
   407  // Close closes the file-only snapshot and releases all referenced resources.
   408  // Not idempotent.
   409  func (es *EventuallyFileOnlySnapshot) Close() error {
   410  	close(es.closed)
   411  	es.db.mu.Lock()
   412  	defer es.db.mu.Unlock()
   413  	es.mu.Lock()
   414  	defer es.mu.Unlock()
   415  
   416  	if es.mu.snap != nil {
   417  		if err := es.mu.snap.closeLocked(); err != nil {
   418  			return err
   419  		}
   420  	}
   421  	if es.mu.vers != nil {
   422  		es.mu.vers.UnrefLocked()
   423  	}
   424  	return nil
   425  }
   426  
   427  // Get implements the Reader interface.
   428  func (es *EventuallyFileOnlySnapshot) Get(key []byte) (value []byte, closer io.Closer, err error) {
   429  	// TODO(jackson): Use getInternal.
   430  	iter, err := es.NewIter(nil)
   431  	if err != nil {
   432  		return nil, nil, err
   433  	}
   434  	var valid bool
   435  	if es.db.opts.Comparer.Split != nil {
   436  		valid = iter.SeekPrefixGE(key)
   437  	} else {
   438  		valid = iter.SeekGE(key)
   439  	}
   440  	if !valid {
   441  		if err = firstError(iter.Error(), iter.Close()); err != nil {
   442  			return nil, nil, err
   443  		}
   444  		return nil, nil, ErrNotFound
   445  	}
   446  	if !es.db.equal(iter.Key(), key) {
   447  		return nil, nil, firstError(iter.Close(), ErrNotFound)
   448  	}
   449  	return iter.Value(), iter, nil
   450  }
   451  
   452  // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
   453  // return false). The iterator can be positioned via a call to SeekGE,
   454  // SeekLT, First or Last.
   455  func (es *EventuallyFileOnlySnapshot) NewIter(o *IterOptions) (*Iterator, error) {
   456  	return es.NewIterWithContext(context.Background(), o)
   457  }
   458  
   459  // NewIterWithContext is like NewIter, and additionally accepts a context for
   460  // tracing.
   461  func (es *EventuallyFileOnlySnapshot) NewIterWithContext(
   462  	ctx context.Context, o *IterOptions,
   463  ) (*Iterator, error) {
   464  	select {
   465  	case <-es.closed:
   466  		panic(ErrClosed)
   467  	default:
   468  	}
   469  
   470  	es.mu.Lock()
   471  	defer es.mu.Unlock()
   472  	if es.mu.vers != nil {
   473  		sOpts := snapshotIterOpts{seqNum: es.seqNum, vers: es.mu.vers}
   474  		return es.db.newIter(ctx, nil /* batch */, sOpts, o), nil
   475  	}
   476  
   477  	if es.excised.Load() {
   478  		return nil, ErrSnapshotExcised
   479  	}
   480  	sOpts := snapshotIterOpts{seqNum: es.seqNum}
   481  	iter := es.db.newIter(ctx, nil /* batch */, sOpts, o)
   482  
   483  	// If excised is true, then keys relevant to the snapshot might not be
   484  	// present in the readState being used by the iterator. Error out.
   485  	if es.excised.Load() {
   486  		iter.Close()
   487  		return nil, ErrSnapshotExcised
   488  	}
   489  	return iter, nil
   490  }
   491  
   492  // ScanInternal scans all internal keys within the specified bounds, truncating
   493  // any rangedels and rangekeys to those bounds. For use when an external user
   494  // needs to be aware of all internal keys that make up a key range.
   495  //
   496  // See comment on db.ScanInternal for the behaviour that can be expected of
   497  // point keys deleted by range dels and keys masked by range keys.
   498  func (es *EventuallyFileOnlySnapshot) ScanInternal(
   499  	ctx context.Context,
   500  	lower, upper []byte,
   501  	visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error,
   502  	visitRangeDel func(start, end []byte, seqNum uint64) error,
   503  	visitRangeKey func(start, end []byte, keys []rangekey.Key) error,
   504  	visitSharedFile func(sst *SharedSSTMeta) error,
   505  ) error {
   506  	if es.db == nil {
   507  		panic(ErrClosed)
   508  	}
   509  	if es.excised.Load() {
   510  		return ErrSnapshotExcised
   511  	}
   512  	var sOpts snapshotIterOpts
   513  	es.mu.Lock()
   514  	if es.mu.vers != nil {
   515  		sOpts = snapshotIterOpts{
   516  			seqNum: es.seqNum,
   517  			vers:   es.mu.vers,
   518  		}
   519  	} else {
   520  		sOpts = snapshotIterOpts{
   521  			seqNum: es.seqNum,
   522  		}
   523  	}
   524  	es.mu.Unlock()
   525  	opts := &scanInternalOptions{
   526  		IterOptions: IterOptions{
   527  			KeyTypes:   IterKeyTypePointsAndRanges,
   528  			LowerBound: lower,
   529  			UpperBound: upper,
   530  		},
   531  		visitPointKey:    visitPointKey,
   532  		visitRangeDel:    visitRangeDel,
   533  		visitRangeKey:    visitRangeKey,
   534  		visitSharedFile:  visitSharedFile,
   535  		skipSharedLevels: visitSharedFile != nil,
   536  	}
   537  	iter := es.db.newInternalIter(sOpts, opts)
   538  	defer iter.close()
   539  
   540  	// If excised is true, then keys relevant to the snapshot might not be
   541  	// present in the readState being used by the iterator. Error out.
   542  	if es.excised.Load() {
   543  		return ErrSnapshotExcised
   544  	}
   545  
   546  	return scanInternalImpl(ctx, lower, upper, iter, opts)
   547  }