github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/commit.go

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"runtime"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/petermattis/pebble/internal/record"
)

// The maximum concurrency allowed for commit operations. This limit is
// enforced by commitPipeline.sem.
const commitConcurrency = record.SyncConcurrency

// commitQueue is a lock-free fixed-size single-producer, multi-consumer
// queue. The single producer can enqueue (push) to the head, and consumers can
// dequeue (pop) from the tail.
//
// It has the added feature that it nils out unused slots to avoid unnecessary
// retention of objects.
type commitQueue struct {
	// headTail packs together a 32-bit head index and a 32-bit tail index. Both
	// are indexes into slots modulo len(slots) (the indexing below masks with
	// len(slots)-1, so len(slots) must be a power of two).
	//
	// tail = index of oldest data in queue
	// head = index of next slot to fill
	//
	// Slots in the range [tail, head) are owned by consumers.  A consumer
	// continues to own a slot outside this range until it nils the slot, at
	// which point ownership passes to the producer.
	//
	// The head index is stored in the most-significant bits so that we can
	// atomically add to it and the overflow is harmless.
	headTail uint64

	// slots is a ring buffer of values stored in this queue. The size must be a
	// power of 2. A slot is in use until *both* the tail index has moved beyond
	// it and the slot value has been set to nil. The slot value is set to nil
	// atomically by the consumer and read atomically by the producer.
	slots [commitConcurrency]unsafe.Pointer
}

const dequeueBits = 32

func (q *commitQueue) unpack(ptrs uint64) (head, tail uint32) {
	const mask = 1<<dequeueBits - 1
	head = uint32((ptrs >> dequeueBits) & mask)
	tail = uint32(ptrs & mask)
	return
}

func (q *commitQueue) pack(head, tail uint32) uint64 {
	const mask = 1<<dequeueBits - 1
	return (uint64(head) << dequeueBits) |
		uint64(tail&mask)
}
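
// Illustrative sketch, not part of the original file: a hypothetical helper
// showing how a head/tail pair round-trips through pack and unpack. With
// head=3 and tail=1 the packed value is (3<<32)|1; adding 1<<dequeueBits, as
// enqueue does, increments only the head half.
func examplePackUnpack() {
	var q commitQueue
	ptrs := q.pack(3, 1)         // packed as (3<<32)|1
	head, tail := q.unpack(ptrs) // head == 3, tail == 1
	_, _ = head, tail
}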

func (q *commitQueue) enqueue(b *Batch) {
	for {
		ptrs := atomic.LoadUint64(&q.headTail)
		head, tail := q.unpack(ptrs)
		if (tail+uint32(len(q.slots)))&(1<<dequeueBits-1) == head {
			// Queue is full.
			panic("not reached")
		}
		slot := &q.slots[head&uint32(len(q.slots)-1)]

		// Check if the head slot has been released by dequeue.
		for atomic.LoadPointer(slot) != nil {
			// Another goroutine is still cleaning up the tail, so the queue is
			// actually still full. We spin because this should resolve itself
			// momentarily.
			runtime.Gosched()
		}

		// The head slot is free, so we own it.
		atomic.StorePointer(slot, unsafe.Pointer(b))

		// Increment head. This passes ownership of slot to dequeue and acts as a
		// store barrier for writing the slot.
		atomic.AddUint64(&q.headTail, 1<<dequeueBits)
		return
	}
}

func (q *commitQueue) dequeue() *Batch {
	for {
		ptrs := atomic.LoadUint64(&q.headTail)
		head, tail := q.unpack(ptrs)
		if tail == head {
			// Queue is empty.
			return nil
		}

		slot := &q.slots[tail&uint32(len(q.slots)-1)]
		b := (*Batch)(atomic.LoadPointer(slot))
		if b == nil || atomic.LoadUint32(&b.applied) == 0 {
			// The batch is not ready to be dequeued, or another goroutine has
			// already dequeued it.
			return nil
		}

		// Confirm head and tail (for our speculative check above) and increment
		// tail. If this succeeds, then we own the slot at tail.
		ptrs2 := q.pack(head, tail+1)
		if atomic.CompareAndSwapUint64(&q.headTail, ptrs, ptrs2) {
			// We now own slot.
			//
			// Tell enqueue that we're done with this slot. Zeroing the slot is also
			// important so we don't leave behind references that could keep this object
			// live longer than necessary.
			atomic.StorePointer(slot, nil)
			// At this point enqueue owns the slot.
			return b
		}
	}
}
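
// Illustrative sketch, not part of the original file: the protocol the rest of
// this file follows around commitQueue. A single producer enqueues batches (in
// practice while holding commitPipeline.mu), the batch is marked applied once
// its memtable insertion finishes, and any goroutine may then dequeue it and
// publish its sequence number.
func exampleCommitQueueProtocol(q *commitQueue, b *Batch) {
	q.enqueue(b)                      // single producer, serialized by a mutex in practice
	atomic.StoreUint32(&b.applied, 1) // mark the batch as applied to the memtable
	if t := q.dequeue(); t != nil {
		// t is an applied batch whose sequence number may now be published.
		_ = t
	}
}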

// commitEnv contains the environment that a commitPipeline interacts
// with. This allows fine-grained testing of commitPipeline behavior without
// construction of an entire DB.
type commitEnv struct {
	// The next sequence number to give to a batch. Protected by
	// commitPipeline.mu.
	logSeqNum *uint64
	// The visible sequence number at which reads should be performed. Ratcheted
	// upwards atomically as batches are applied to the memtable.
	visibleSeqNum *uint64

	// Apply the batch to the specified memtable. Called concurrently.
	apply func(b *Batch, mem *memTable) error
	// Write the batch to the WAL. If wg != nil, the data will be persisted
	// asynchronously and wg.Done will be called upon completion. Returns the
	// memtable the batch should be applied to. Serial execution is enforced by
	// commitPipeline.mu.
	write func(b *Batch, wg *sync.WaitGroup) (*memTable, error)
}

// A commitPipeline manages the stages of committing a set of mutations
// (contained in a single Batch) atomically to the DB. The steps are
// conceptually:
//
//   1. Write the batch to the WAL and optionally sync the WAL
//   2. Apply the mutations in the batch to the memtable
//
// These two simple steps are made complicated by the desire for high
// performance. In the absence of concurrency, performance is limited by how
// fast a batch can be written (and synced) to the WAL and then added to the
// memtable, both of which are outside the purview of the commit
// pipeline. Performance under concurrency is the primary concern of the commit
// pipeline, though it also needs to maintain two invariants:
//
//   1. Batches need to be written to the WAL in sequence number order.
//   2. Batches need to be made visible for reads in sequence number order. This
//      invariant arises from the use of a single sequence number which
//      indicates which mutations are visible.
//
// Taking these invariants into account, let's revisit the work the commit
// pipeline needs to perform. Writing the batch to the WAL is necessarily
// serialized as there is a single WAL object. The order of the entries in the
// WAL defines the sequence number order. Note that writing to the WAL is
// extremely fast, usually just a memory copy. Applying the mutations in a
// batch to the memtable can occur concurrently as the underlying skiplist
// supports concurrent insertions. Publishing the visible sequence number is
// another serialization point, but one with a twist: the visible sequence
// number cannot be bumped until the mutations for earlier batches have
// finished applying to the memtable (the visible sequence number only ratchets
// up). Lastly, if requested, the commit waits for the WAL to sync. Note that
// waiting for the WAL sync after ratcheting the visible sequence number allows
// another goroutine to read committed data before the WAL has synced. This is
// similar behavior to RocksDB's manual WAL flush functionality. Application
// code needs to protect against this if necessary.
//
// The full outline of the commit pipeline operation is as follows:
//
//   with commitPipeline mutex locked:
//     assign batch sequence number
//     write batch to WAL
//   (optionally) add batch to WAL sync list
//   apply batch to memtable (concurrently)
//   wait for earlier batches to apply
//   ratchet read sequence number
//   (optionally) wait for the WAL to sync
//
// As soon as a batch has been written to the WAL, the commitPipeline mutex is
// released allowing another batch to write to the WAL. Each commit operation
// individually applies its batch to the memtable providing concurrency. The
// WAL sync happens concurrently with applying to the memtable (see
// commitPipeline.syncLoop).
//
// The "waits for earlier batches to apply" work is more complicated than might
// be expected. The obvious approach would be to keep a queue of pending
// batches and for each batch to wait for the previous batch to finish
// committing. This approach was tried initially and turned out to be too
// slow. The problem is that it causes excessive goroutine activity as each
// committing goroutine needs to wake up in order for the next goroutine to be
// unblocked. The approach taken in the current code is conceptually similar,
// though it avoids waking a goroutine to perform work that another goroutine
// can perform. A commitQueue (a single-producer, multiple-consumer queue)
// holds the ordered list of committing batches. Addition to the queue is done
// while holding commitPipeline.mu, ensuring the same ordering of batches in
// the queue as the ordering in the WAL. When a batch finishes applying to the
// memtable, it atomically updates its Batch.applied field. Ratcheting of the
// visible sequence number is done by commitPipeline.publish which loops
// dequeueing "applied" batches and ratcheting the visible sequence number. If
// we hit an unapplied batch at the head of the queue we can block as we know
// that committing of that unapplied batch will eventually find our (applied)
// batch in the queue. See commitPipeline.publish for additional commentary.
type commitPipeline struct {
	env commitEnv
	sem chan struct{}
	// The mutex to use for synchronizing access to logSeqNum and serializing
	// calls to commitEnv.write().
	mu sync.Mutex
	// Queue of pending batches to commit.
	pending commitQueue
}

func newCommitPipeline(env commitEnv) *commitPipeline {
	p := &commitPipeline{
		env: env,
		sem: make(chan struct{}, commitConcurrency),
	}
	return p
}
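
// Illustrative sketch, not part of the original file: wiring a commitPipeline
// to a minimal commitEnv, roughly in the way a test might, per the commitEnv
// comment about fine-grained testing without constructing an entire DB. The
// callback bodies are placeholders; a real environment applies the batch to a
// memtable and writes it to the WAL.
func exampleNewCommitPipeline() *commitPipeline {
	var logSeqNum, visibleSeqNum uint64
	return newCommitPipeline(commitEnv{
		logSeqNum:     &logSeqNum,
		visibleSeqNum: &visibleSeqNum,
		apply: func(b *Batch, mem *memTable) error {
			// A real implementation applies b's mutations to mem.
			return nil
		},
		write: func(b *Batch, wg *sync.WaitGroup) (*memTable, error) {
			// A real implementation writes b to the WAL and, if wg != nil,
			// arranges for wg.Done to be called once the sync completes.
			if wg != nil {
				wg.Done()
			}
			return nil, nil
		},
	})
}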

func (p *commitPipeline) Close() {
}

// Commit the specified batch, writing it to the WAL, optionally syncing the
// WAL, and applying the batch to the memtable. Upon successful return the
// batch's mutations will be visible for reading.
func (p *commitPipeline) Commit(b *Batch, syncWAL bool) error {
	if b.Empty() {
		return nil
	}

	p.sem <- struct{}{}

	// Prepare the batch for committing: enqueuing the batch in the pending
	// queue, determining the batch sequence number and writing the data to the
	// WAL.
	mem, err := p.prepare(b, syncWAL)
	if err != nil {
		// TODO(peter): what to do on error? the pipeline will be horked at this
		// point.
		panic(err)
	}

	// Apply the batch to the memtable.
	if err := p.env.apply(b, mem); err != nil {
		// TODO(peter): what to do on error? the pipeline will be horked at this
		// point.
		panic(err)
	}

	// Publish the batch sequence number.
	p.publish(b)

	<-p.sem
	return nil
}
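
// Illustrative sketch, not part of the original file: committing a batch with
// a synced WAL write. In the real code path Commit is invoked from the DB's
// write path when a batch is applied.
func exampleCommit(p *commitPipeline, b *Batch) error {
	return p.Commit(b, true /* syncWAL */)
}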

// AllocateSeqNum allocates count sequence numbers, invokes the prepare
// callback, then the apply callback, and then publishes the sequence
// numbers. AllocateSeqNum does not write to the WAL or add entries to the
// memtable. AllocateSeqNum can be used to sequence an operation such as
// sstable ingestion within the commit pipeline. The prepare callback is
// invoked with commitPipeline.mu held, but note that DB.mu is not held and
// must be locked if necessary.
func (p *commitPipeline) AllocateSeqNum(count int, prepare func(), apply func(seqNum uint64)) {
	// This method is similar to Commit and prepare. Be careful about trying to
	// share additional code with those methods because Commit and prepare are
	// performance critical code paths.

	b := newBatch(nil)
	defer b.release()

	// Give the batch a count equal to the requested count so that the log and
	// visible sequence numbers are incremented correctly.
	b.storage.data = make([]byte, batchHeaderLen)
	b.setCount(uint32(count))
	b.commit.Add(1)

	p.sem <- struct{}{}

	p.mu.Lock()

	// Enqueue the batch in the pending queue. Note that while the pending queue
	// is lock-free, we want the order of batches to be the same as the sequence
	// number order.
	p.pending.enqueue(b)

	// Assign the batch a sequence number. Note that we use atomic operations
	// here to handle concurrent reads of logSeqNum. commitPipeline.mu provides
	// mutual exclusion for other goroutines writing to logSeqNum.
	seqNum := atomic.AddUint64(p.env.logSeqNum, uint64(count)) - uint64(count)
	if seqNum == 0 {
		// We can't use the value 0 for the global seqnum during ingestion, because
		// 0 indicates no global seqnum. So allocate one more seqnum.
		atomic.AddUint64(p.env.logSeqNum, 1)
		seqNum++
		b.setCount(1 + uint32(count))
	}
	b.setSeqNum(seqNum)

	// Invoke the prepare callback. Note the lack of error reporting. Even if the
	// callback internally fails, the sequence number needs to be published in
	// order to allow the commit pipeline to proceed.
	prepare()

	p.mu.Unlock()

	// Invoke the apply callback.
	apply(b.SeqNum())

	// Publish the sequence number.
	p.publish(b)

	<-p.sem
}
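
// Illustrative sketch, not part of the original file: sequencing an
// out-of-band operation, such as sstable ingestion, through the pipeline. The
// callback bodies and the count of 1 are placeholders.
func exampleAllocateSeqNum(p *commitPipeline) {
	p.AllocateSeqNum(1, func() {
		// Prepare work performed while commitPipeline.mu is held (elided).
	}, func(seqNum uint64) {
		// Apply work performed with the allocated sequence number (elided).
		_ = seqNum
	})
}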

func (p *commitPipeline) prepare(b *Batch, syncWAL bool) (*memTable, error) {
	n := uint64(b.Count())
	if n == invalidBatchCount {
		return nil, ErrInvalidBatch
	}
	count := 1
	if syncWAL {
		count++
	}
	b.commit.Add(count)

	var syncWG *sync.WaitGroup
	if syncWAL {
		syncWG = &b.commit
	}

	p.mu.Lock()

	// Enqueue the batch in the pending queue. Note that while the pending queue
	// is lock-free, we want the order of batches to be the same as the sequence
	// number order.
	p.pending.enqueue(b)

	// Assign the batch a sequence number. Note that we use atomic operations
	// here to handle concurrent reads of logSeqNum. commitPipeline.mu provides
	// mutual exclusion for other goroutines writing to logSeqNum.
	b.setSeqNum(atomic.AddUint64(p.env.logSeqNum, n) - n)

	// Write the data to the WAL.
	mem, err := p.env.write(b, syncWG)

	p.mu.Unlock()

	return mem, err
}

func (p *commitPipeline) publish(b *Batch) {
	// Mark the batch as applied.
	atomic.StoreUint32(&b.applied, 1)

	// Loop dequeuing applied batches from the pending queue. If our batch was
	// the head of the pending queue we are guaranteed that either we'll publish
	// it or someone else will dequeue and publish it. If our batch is not the
	// head of the queue then either we'll dequeue applied batches and reach our
	// batch or there is an unapplied batch blocking us. When that unapplied
	// batch applies it will go through the same process and publish our batch
	// for us.
	for {
		t := p.pending.dequeue()
		if t == nil {
			// Wait for another goroutine to publish us. We might also be waiting for
			// the WAL sync to finish.
			b.commit.Wait()
			break
		}
		if atomic.LoadUint32(&t.applied) != 1 {
			panic("not reached")
		}

		// We're responsible for publishing the sequence number for batch t, but
		// another concurrent goroutine might sneak in and publish the sequence
		// number for a subsequent batch. That's ok as all we're guaranteeing is
		// that the sequence number ratchets up.
		for {
			curSeqNum := atomic.LoadUint64(p.env.visibleSeqNum)
			newSeqNum := t.SeqNum() + uint64(t.Count())
			if newSeqNum <= curSeqNum {
				// t's sequence number has already been published.
				break
			}
			if atomic.CompareAndSwapUint64(p.env.visibleSeqNum, curSeqNum, newSeqNum) {
				// We successfully published t's sequence number.
				break
			}
		}

		t.commit.Done()
	}
}