github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/commit.go (about)

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"runtime"
     9  	"sync"
    10  	"sync/atomic"
    11  	"time"
    12  
    13  	"github.com/cockroachdb/pebble/record"
    14  )
    15  
    16  // commitQueue is a lock-free fixed-size single-producer, multi-consumer
    17  // queue. The single producer can enqueue (push) to the head, and consumers can
    18  // dequeue (pop) from the tail.
    19  //
    20  // It has the added feature that it nils out unused slots to avoid unnecessary
    21  // retention of objects.
    22  type commitQueue struct {
    23  	// headTail packs together a 32-bit head index and a 32-bit tail index. Both
    24  	// are indexes into slots modulo len(slots)-1.
    25  	//
    26  	// tail = index of oldest data in queue
    27  	// head = index of next slot to fill
    28  	//
    29  	// Slots in the range [tail, head) are owned by consumers.  A consumer
    30  	// continues to own a slot outside this range until it nils the slot, at
    31  	// which point ownership passes to the producer.
    32  	//
    33  	// The head index is stored in the most-significant bits so that we can
    34  	// atomically add to it and the overflow is harmless.
    35  	headTail atomic.Uint64
    36  
    37  	// slots is a ring buffer of values stored in this queue. The size must be a
    38  	// power of 2. A slot is in use until *both* the tail index has moved beyond
    39  	// it and the slot value has been set to nil. The slot value is set to nil
    40  	// atomically by the consumer and read atomically by the producer.
    41  	slots [record.SyncConcurrency]atomic.Pointer[Batch]
    42  }
    43  
    44  const dequeueBits = 32
    45  
    46  func (q *commitQueue) unpack(ptrs uint64) (head, tail uint32) {
    47  	const mask = 1<<dequeueBits - 1
    48  	head = uint32((ptrs >> dequeueBits) & mask)
    49  	tail = uint32(ptrs & mask)
    50  	return
    51  }
    52  
    53  func (q *commitQueue) pack(head, tail uint32) uint64 {
    54  	const mask = 1<<dequeueBits - 1
    55  	return (uint64(head) << dequeueBits) |
    56  		uint64(tail&mask)
    57  }
    58  
    59  func (q *commitQueue) enqueue(b *Batch) {
    60  	ptrs := q.headTail.Load()
    61  	head, tail := q.unpack(ptrs)
    62  	if (tail+uint32(len(q.slots)))&(1<<dequeueBits-1) == head {
    63  		// Queue is full. This should never be reached because commitPipeline.commitQueueSem
    64  		// limits the number of concurrent operations.
    65  		panic("pebble: not reached")
    66  	}
    67  	slot := &q.slots[head&uint32(len(q.slots)-1)]
    68  
    69  	// Check if the head slot has been released by dequeueApplied.
    70  	for slot.Load() != nil {
    71  		// Another goroutine is still cleaning up the tail, so the queue is
    72  		// actually still full. We spin because this should resolve itself
    73  		// momentarily.
    74  		runtime.Gosched()
    75  	}
    76  
    77  	// The head slot is free, so we own it.
    78  	slot.Store(b)
    79  
    80  	// Increment head. This passes ownership of slot to dequeueApplied and acts as a
    81  	// store barrier for writing the slot.
    82  	q.headTail.Add(1 << dequeueBits)
    83  }
    84  
    85  // dequeueApplied removes the earliest enqueued Batch, if it is applied.
    86  //
    87  // Returns nil if the commit queue is empty or the earliest Batch is not yet
    88  // applied.
    89  func (q *commitQueue) dequeueApplied() *Batch {
    90  	for {
    91  		ptrs := q.headTail.Load()
    92  		head, tail := q.unpack(ptrs)
    93  		if tail == head {
    94  			// Queue is empty.
    95  			return nil
    96  		}
    97  
    98  		slot := &q.slots[tail&uint32(len(q.slots)-1)]
    99  		b := slot.Load()
   100  		if b == nil || !b.applied.Load() {
   101  			// The batch is not ready to be dequeued, or another goroutine has
   102  			// already dequeued it.
   103  			return nil
   104  		}
   105  
   106  		// Confirm head and tail (for our speculative check above) and increment
   107  		// tail. If this succeeds, then we own the slot at tail.
   108  		ptrs2 := q.pack(head, tail+1)
   109  		if q.headTail.CompareAndSwap(ptrs, ptrs2) {
   110  			// We now own slot.
   111  			//
   112  			// Tell enqueue that we're done with this slot. Zeroing the slot is also
   113  			// important so we don't leave behind references that could keep this object
   114  			// live longer than necessary.
   115  			slot.Store(nil)
   116  			// At this point enqueue owns the slot.
   117  			return b
   118  		}
   119  	}
   120  }
   121  
   122  // commitEnv contains the environment that a commitPipeline interacts
   123  // with. This allows fine-grained testing of commitPipeline behavior without
   124  // construction of an entire DB.
   125  type commitEnv struct {
   126  	// The next sequence number to give to a batch. Protected by
   127  	// commitPipeline.mu.
   128  	logSeqNum *atomic.Uint64
   129  	// The visible sequence number at which reads should be performed. Ratcheted
   130  	// upwards atomically as batches are applied to the memtable.
   131  	visibleSeqNum *atomic.Uint64
   132  
   133  	// Apply the batch to the specified memtable. Called concurrently.
   134  	apply func(b *Batch, mem *memTable) error
   135  	// Write the batch to the WAL. If wg != nil, the data will be persisted
   136  	// asynchronously and done will be called on wg upon completion. If wg != nil
   137  	// and err != nil, a failure to persist the WAL will populate *err. Returns
   138  	// the memtable the batch should be applied to. Serial execution enforced by
   139  	// commitPipeline.mu.
   140  	write func(b *Batch, wg *sync.WaitGroup, err *error) (*memTable, error)
   141  }
   142  
   143  // A commitPipeline manages the stages of committing a set of mutations
   144  // (contained in a single Batch) atomically to the DB. The steps are
   145  // conceptually:
   146  //
   147  //  1. Write the batch to the WAL and optionally sync the WAL
   148  //  2. Apply the mutations in the batch to the memtable
   149  //
   150  // These two simple steps are made complicated by the desire for high
   151  // performance. In the absence of concurrency, performance is limited by how
   152  // fast a batch can be written (and synced) to the WAL and then added to the
   153  // memtable, both of which are outside the purview of the commit
   154  // pipeline. Performance under concurrency is the primary concern of the commit
   155  // pipeline, though it also needs to maintain two invariants:
   156  //
   157  //  1. Batches need to be written to the WAL in sequence number order.
   158  //  2. Batches need to be made visible for reads in sequence number order. This
   159  //     invariant arises from the use of a single sequence number which
   160  //     indicates which mutations are visible.
   161  //
   162  // Taking these invariants into account, let's revisit the work the commit
   163  // pipeline needs to perform. Writing the batch to the WAL is necessarily
   164  // serialized as there is a single WAL object. The order of the entries in the
   165  // WAL defines the sequence number order. Note that writing to the WAL is
   166  // extremely fast, usually just a memory copy. Applying the mutations in a
   167  // batch to the memtable can occur concurrently as the underlying skiplist
   168  // supports concurrent insertions. Publishing the visible sequence number is
   169  // another serialization point, but one with a twist: the visible sequence
   170  // number cannot be bumped until the mutations for earlier batches have
   171  // finished applying to the memtable (the visible sequence number only ratchets
   172  // up). Lastly, if requested, the commit waits for the WAL to sync. Note that
   173  // waiting for the WAL sync after ratcheting the visible sequence number allows
   174  // another goroutine to read committed data before the WAL has synced. This is
   175  // similar behavior to RocksDB's manual WAL flush functionality. Application
   176  // code needs to protect against this if necessary.
   177  //
   178  // The full outline of the commit pipeline operation is as follows:
   179  //
   180  //	with commitPipeline mutex locked:
   181  //	  assign batch sequence number
   182  //	  write batch to WAL
   183  //	(optionally) add batch to WAL sync list
   184  //	apply batch to memtable (concurrently)
   185  //	wait for earlier batches to apply
   186  //	ratchet read sequence number
   187  //	(optionally) wait for the WAL to sync
   188  //
   189  // As soon as a batch has been written to the WAL, the commitPipeline mutex is
   190  // released allowing another batch to write to the WAL. Each commit operation
   191  // individually applies its batch to the memtable providing concurrency. The
   192  // WAL sync happens concurrently with applying to the memtable (see
   193  // commitPipeline.syncLoop).
   194  //
   195  // The "waits for earlier batches to apply" work is more complicated than might
   196  // be expected. The obvious approach would be to keep a queue of pending
   197  // batches and for each batch to wait for the previous batch to finish
   198  // committing. This approach was tried initially and turned out to be too
   199  // slow. The problem is that it causes excessive goroutine activity as each
   200  // committing goroutine needs to wake up in order for the next goroutine to be
   201  // unblocked. The approach taken in the current code is conceptually similar,
   202  // though it avoids waking a goroutine to perform work that another goroutine
   203  // can perform. A commitQueue (a single-producer, multiple-consumer queue)
   204  // holds the ordered list of committing batches. Addition to the queue is done
   205  // while holding commitPipeline.mutex ensuring the same ordering of batches in
   206  // the queue as the ordering in the WAL. When a batch finishes applying to the
   207  // memtable, it atomically updates its Batch.applied field. Ratcheting of the
   208  // visible sequence number is done by commitPipeline.publish which loops
   209  // dequeueing "applied" batches and ratcheting the visible sequence number. If
   210  // we hit an unapplied batch at the head of the queue we can block as we know
   211  // that committing of that unapplied batch will eventually find our (applied)
   212  // batch in the queue. See commitPipeline.publish for additional commentary.
   213  type commitPipeline struct {
   214  	// WARNING: The following struct `commitQueue` contains fields which will
   215  	// be accessed atomically.
   216  	//
   217  	// Go allocations are guaranteed to be 64-bit aligned which we take advantage
   218  	// of by placing the 64-bit fields which we access atomically at the beginning
   219  	// of the commitPipeline struct.
   220  	// For more information, see https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
   221  	// Queue of pending batches to commit.
   222  	pending commitQueue
   223  	env     commitEnv
   224  	// The commit path has two queues:
   225  	// - commitPipeline.pending contains batches whose seqnums have not yet been
   226  	//   published. It is a lock-free single producer multi consumer queue.
   227  	// - LogWriter.flusher.syncQ contains state for batches that have asked for
   228  	//   a sync. It is a lock-free single producer single consumer queue.
   229  	// These lock-free queues have a fixed capacity. And since they are
   230  	// lock-free, we cannot do blocking waits when pushing onto these queues, in
   231  	// case they are full. Additionally, adding to these queues happens while
   232  	// holding commitPipeline.mu, and we don't want to block while holding that
   233  	// mutex since it is also needed by other code.
   234  	//
   235  	// Popping from these queues is independent and for a particular batch can
   236  	// occur in either order, though it is more common that popping from the
   237  	// commitPipeline.pending will happen first.
   238  	//
   239  	// Due to these constraints, we reserve a unit of space in each queue before
   240  	// acquiring commitPipeline.mu, which also ensures that the push operation
   241  	// is guaranteed to have space in the queue. The commitQueueSem and
   242  	// logSyncQSem are used for this reservation.
   243  	commitQueueSem chan struct{}
   244  	logSyncQSem    chan struct{}
   245  	ingestSem      chan struct{}
   246  	// The mutex to use for synchronizing access to logSeqNum and serializing
   247  	// calls to commitEnv.write().
   248  	mu sync.Mutex
   249  }
   250  
   251  func newCommitPipeline(env commitEnv) *commitPipeline {
   252  	p := &commitPipeline{
   253  		env: env,
   254  		// The capacity of both commitQueue.slots and syncQueue.slots is set to
   255  		// record.SyncConcurrency, which also determines the value of these
   256  		// semaphores. We used to have a single semaphore, which required that the
   257  		// capacity of these queues be the same. Now that we have two semaphores,
   258  		// the capacity of these queues could be changed to be different. Say half
   259  		// of the batches asked to be synced, but syncing took 5x the latency of
   260  		// adding to the memtable and publishing. Then syncQueue.slots could be
   261  		// sized as 0.5*5 of the commitQueue.slots. We can explore this if we find
   262  		// that LogWriterMetrics.SyncQueueLen has high utilization under some
   263  		// workloads.
   264  		//
   265  		// NB: the commit concurrency is one less than SyncConcurrency because we
   266  		// have to allow one "slot" for a concurrent WAL rotation which will close
   267  		// and sync the WAL.
   268  		commitQueueSem: make(chan struct{}, record.SyncConcurrency-1),
   269  		logSyncQSem:    make(chan struct{}, record.SyncConcurrency-1),
   270  		ingestSem:      make(chan struct{}, 1),
   271  	}
   272  	return p
   273  }
   274  
   275  // directWrite is used to directly write to the WAL. commitPipeline.mu must be
   276  // held while this is called. DB.mu must not be held. directWrite will only
   277  // return once the WAL sync is complete. Note that DirectWrite is a special case
   278  // function which is currently only used when ingesting sstables as a flushable.
   279  // Reason carefully about the correctness argument when calling this function
   280  // from any context.
   281  func (p *commitPipeline) directWrite(b *Batch) error {
   282  	var syncWG sync.WaitGroup
   283  	var syncErr error
   284  	syncWG.Add(1)
   285  	p.logSyncQSem <- struct{}{}
   286  	_, err := p.env.write(b, &syncWG, &syncErr)
   287  	syncWG.Wait()
   288  	err = firstError(err, syncErr)
   289  	return err
   290  }
   291  
   292  // Commit the specified batch, writing it to the WAL, optionally syncing the
   293  // WAL, and applying the batch to the memtable. Upon successful return the
   294  // batch's mutations will be visible for reading.
   295  // REQUIRES: noSyncWait => syncWAL
   296  func (p *commitPipeline) Commit(b *Batch, syncWAL bool, noSyncWait bool) error {
   297  	if b.Empty() {
   298  		return nil
   299  	}
   300  
   301  	commitStartTime := time.Now()
   302  	// Acquire semaphores.
   303  	p.commitQueueSem <- struct{}{}
   304  	if syncWAL {
   305  		p.logSyncQSem <- struct{}{}
   306  	}
   307  	b.commitStats.SemaphoreWaitDuration = time.Since(commitStartTime)
   308  
   309  	// Prepare the batch for committing: enqueuing the batch in the pending
   310  	// queue, determining the batch sequence number and writing the data to the
   311  	// WAL.
   312  	//
   313  	// NB: We set Batch.commitErr on error so that the batch won't be a candidate
   314  	// for reuse. See Batch.release().
   315  	mem, err := p.prepare(b, syncWAL, noSyncWait)
   316  	if err != nil {
   317  		b.db = nil // prevent batch reuse on error
   318  		// NB: we are not doing <-p.commitQueueSem since the batch is still
   319  		// sitting in the pending queue. We should consider fixing this by also
   320  		// removing the batch from the pending queue.
   321  		return err
   322  	}
   323  
   324  	// Apply the batch to the memtable.
   325  	if err := p.env.apply(b, mem); err != nil {
   326  		b.db = nil // prevent batch reuse on error
   327  		// NB: we are not doing <-p.commitQueueSem since the batch is still
   328  		// sitting in the pending queue. We should consider fixing this by also
   329  		// removing the batch from the pending queue.
   330  		return err
   331  	}
   332  
   333  	// Publish the batch sequence number.
   334  	p.publish(b)
   335  
   336  	<-p.commitQueueSem
   337  
   338  	if !noSyncWait {
   339  		// Already waited for commit, so look at the error.
   340  		if b.commitErr != nil {
   341  			b.db = nil // prevent batch reuse on error
   342  			err = b.commitErr
   343  		}
   344  	}
   345  	// Else noSyncWait. The LogWriter can be concurrently writing to
   346  	// b.commitErr. We will read b.commitErr in Batch.SyncWait after the
   347  	// LogWriter is done writing.
   348  
   349  	b.commitStats.TotalDuration = time.Since(commitStartTime)
   350  
   351  	return err
   352  }
   353  
   354  // AllocateSeqNum allocates count sequence numbers, invokes the prepare
   355  // callback, then the apply callback, and then publishes the sequence
   356  // numbers. AllocateSeqNum does not write to the WAL or add entries to the
   357  // memtable. AllocateSeqNum can be used to sequence an operation such as
   358  // sstable ingestion within the commit pipeline. The prepare callback is
   359  // invoked with commitPipeline.mu held, but note that DB.mu is not held and
   360  // must be locked if necessary.
   361  func (p *commitPipeline) AllocateSeqNum(
   362  	count int, prepare func(seqNum uint64), apply func(seqNum uint64),
   363  ) {
   364  	// This method is similar to Commit and prepare. Be careful about trying to
   365  	// share additional code with those methods because Commit and prepare are
   366  	// performance critical code paths.
   367  
   368  	b := newBatch(nil)
   369  	defer b.release()
   370  
   371  	// Give the batch a count of 1 so that the log and visible sequence number
   372  	// are incremented correctly.
   373  	b.data = make([]byte, batchHeaderLen)
   374  	b.setCount(uint32(count))
   375  	b.commit.Add(1)
   376  
   377  	p.commitQueueSem <- struct{}{}
   378  
   379  	p.mu.Lock()
   380  
   381  	// Enqueue the batch in the pending queue. Note that while the pending queue
   382  	// is lock-free, we want the order of batches to be the same as the sequence
   383  	// number order.
   384  	p.pending.enqueue(b)
   385  
   386  	// Assign the batch a sequence number. Note that we use atomic operations
   387  	// here to handle concurrent reads of logSeqNum. commitPipeline.mu provides
   388  	// mutual exclusion for other goroutines writing to logSeqNum.
   389  	logSeqNum := p.env.logSeqNum.Add(uint64(count)) - uint64(count)
   390  	seqNum := logSeqNum
   391  	if seqNum == 0 {
   392  		// We can't use the value 0 for the global seqnum during ingestion, because
   393  		// 0 indicates no global seqnum. So allocate one more seqnum.
   394  		p.env.logSeqNum.Add(1)
   395  		seqNum++
   396  	}
   397  	b.setSeqNum(seqNum)
   398  
   399  	// Wait for any outstanding writes to the memtable to complete. This is
   400  	// necessary for ingestion so that the check for memtable overlap can see any
   401  	// writes that were sequenced before the ingestion. The spin loop is
   402  	// unfortunate, but obviates the need for additional synchronization.
   403  	for {
   404  		visibleSeqNum := p.env.visibleSeqNum.Load()
   405  		if visibleSeqNum == logSeqNum {
   406  			break
   407  		}
   408  		runtime.Gosched()
   409  	}
   410  
   411  	// Invoke the prepare callback. Note the lack of error reporting. Even if the
   412  	// callback internally fails, the sequence number needs to be published in
   413  	// order to allow the commit pipeline to proceed.
   414  	prepare(b.SeqNum())
   415  
   416  	p.mu.Unlock()
   417  
   418  	// Invoke the apply callback.
   419  	apply(b.SeqNum())
   420  
   421  	// Publish the sequence number.
   422  	p.publish(b)
   423  
   424  	<-p.commitQueueSem
   425  }
   426  
   427  func (p *commitPipeline) prepare(b *Batch, syncWAL bool, noSyncWait bool) (*memTable, error) {
   428  	n := uint64(b.Count())
   429  	if n == invalidBatchCount {
   430  		return nil, ErrInvalidBatch
   431  	}
   432  	var syncWG *sync.WaitGroup
   433  	var syncErr *error
   434  	switch {
   435  	case !syncWAL:
   436  		// Only need to wait for the publish.
   437  		b.commit.Add(1)
   438  	// Remaining cases represent syncWAL=true.
   439  	case noSyncWait:
   440  		syncErr = &b.commitErr
   441  		syncWG = &b.fsyncWait
   442  		// Only need to wait synchronously for the publish. The user will
   443  		// (asynchronously) wait on the batch's fsyncWait.
   444  		b.commit.Add(1)
   445  		b.fsyncWait.Add(1)
   446  	case !noSyncWait:
   447  		syncErr = &b.commitErr
   448  		syncWG = &b.commit
   449  		// Must wait for both the publish and the WAL fsync.
   450  		b.commit.Add(2)
   451  	}
   452  
   453  	p.mu.Lock()
   454  
   455  	// Enqueue the batch in the pending queue. Note that while the pending queue
   456  	// is lock-free, we want the order of batches to be the same as the sequence
   457  	// number order.
   458  	p.pending.enqueue(b)
   459  
   460  	// Assign the batch a sequence number. Note that we use atomic operations
   461  	// here to handle concurrent reads of logSeqNum. commitPipeline.mu provides
   462  	// mutual exclusion for other goroutines writing to logSeqNum.
   463  	b.setSeqNum(p.env.logSeqNum.Add(n) - n)
   464  
   465  	// Write the data to the WAL.
   466  	mem, err := p.env.write(b, syncWG, syncErr)
   467  
   468  	p.mu.Unlock()
   469  
   470  	return mem, err
   471  }
   472  
   473  func (p *commitPipeline) publish(b *Batch) {
   474  	// Mark the batch as applied.
   475  	b.applied.Store(true)
   476  
   477  	// Loop dequeuing applied batches from the pending queue. If our batch was
   478  	// the head of the pending queue we are guaranteed that either we'll publish
   479  	// it or someone else will dequeueApplied and publish it. If our batch is not the
   480  	// head of the queue then either we'll dequeueApplied applied batches and reach our
   481  	// batch or there is an unapplied batch blocking us. When that unapplied
   482  	// batch applies it will go through the same process and publish our batch
   483  	// for us.
   484  	for {
   485  		t := p.pending.dequeueApplied()
   486  		if t == nil {
   487  			// Wait for another goroutine to publish us. We might also be waiting for
   488  			// the WAL sync to finish.
   489  			now := time.Now()
   490  			b.commit.Wait()
   491  			b.commitStats.CommitWaitDuration += time.Since(now)
   492  			break
   493  		}
   494  		if !t.applied.Load() {
   495  			panic("not reached")
   496  		}
   497  
   498  		// We're responsible for publishing the sequence number for batch t, but
   499  		// another concurrent goroutine might sneak in and publish the sequence
   500  		// number for a subsequent batch. That's ok as all we're guaranteeing is
   501  		// that the sequence number ratchets up.
   502  		for {
   503  			curSeqNum := p.env.visibleSeqNum.Load()
   504  			newSeqNum := t.SeqNum() + uint64(t.Count())
   505  			if newSeqNum <= curSeqNum {
   506  				// t's sequence number has already been published.
   507  				break
   508  			}
   509  			if p.env.visibleSeqNum.CompareAndSwap(curSeqNum, newSeqNum) {
   510  				// We successfully published t's sequence number.
   511  				break
   512  			}
   513  		}
   514  
   515  		t.commit.Done()
   516  	}
   517  }