github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/commit.go

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"runtime"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/petermattis/pebble/internal/record"
)

// The maximum concurrency allowed for commit operations. This limit is
// enforced by commitPipeline.sem.
const commitConcurrency = record.SyncConcurrency

// commitQueue is a lock-free fixed-size single-producer, multi-consumer
// queue. The single producer can enqueue (push) to the head, and consumers can
// dequeue (pop) from the tail.
//
// It has the added feature that it nils out unused slots to avoid unnecessary
// retention of objects.
type commitQueue struct {
	// headTail packs together a 32-bit head index and a 32-bit tail index. Both
	// are indexes into slots modulo len(slots).
	//
	// tail = index of oldest data in queue
	// head = index of next slot to fill
	//
	// Slots in the range [tail, head) are owned by consumers. A consumer
	// continues to own a slot outside this range until it nils the slot, at
	// which point ownership passes to the producer.
	//
	// The head index is stored in the most-significant bits so that we can
	// atomically add to it and the overflow is harmless.
	headTail uint64

	// slots is a ring buffer of values stored in this queue. The size must be a
	// power of 2. A slot is in use until *both* the tail index has moved beyond
	// it and the slot value has been set to nil. The slot value is set to nil
	// atomically by the consumer and read atomically by the producer.
	slots [commitConcurrency]unsafe.Pointer
}

const dequeueBits = 32

func (q *commitQueue) unpack(ptrs uint64) (head, tail uint32) {
	const mask = 1<<dequeueBits - 1
	head = uint32((ptrs >> dequeueBits) & mask)
	tail = uint32(ptrs & mask)
	return
}

func (q *commitQueue) pack(head, tail uint32) uint64 {
	const mask = 1<<dequeueBits - 1
	return (uint64(head) << dequeueBits) |
		uint64(tail&mask)
}
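
// To illustrate the packing scheme with made-up values (the real queue size is
// record.SyncConcurrency): if head == 10 and tail == 9, headTail holds
// (10<<32)|9, the single pending batch lives in slots[9&(len(slots)-1)], and
// the next enqueue fills slots[10&(len(slots)-1)] before atomically adding
// 1<<dequeueBits to headTail to advance the head.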

func (q *commitQueue) enqueue(b *Batch) {
	for {
		ptrs := atomic.LoadUint64(&q.headTail)
		head, tail := q.unpack(ptrs)
		if (tail+uint32(len(q.slots)))&(1<<dequeueBits-1) == head {
			// Queue is full.
			panic("not reached")
		}
		slot := &q.slots[head&uint32(len(q.slots)-1)]

		// Check if the head slot has been released by dequeue.
		for atomic.LoadPointer(slot) != nil {
			// Another goroutine is still cleaning up the tail, so the queue is
			// actually still full. We spin because this should resolve itself
			// momentarily.
			runtime.Gosched()
		}

		// The head slot is free, so we own it.
		atomic.StorePointer(slot, unsafe.Pointer(b))

		// Increment head. This passes ownership of slot to dequeue and acts as a
		// store barrier for writing the slot.
		atomic.AddUint64(&q.headTail, 1<<dequeueBits)
		return
	}
}

func (q *commitQueue) dequeue() *Batch {
	for {
		ptrs := atomic.LoadUint64(&q.headTail)
		head, tail := q.unpack(ptrs)
		if tail == head {
			// Queue is empty.
			return nil
		}

		slot := &q.slots[tail&uint32(len(q.slots)-1)]
		b := (*Batch)(atomic.LoadPointer(slot))
		if b == nil || atomic.LoadUint32(&b.applied) == 0 {
			// The batch is not ready to be dequeued, or another goroutine has
			// already dequeued it.
			return nil
		}

		// Confirm head and tail (for our speculative check above) and increment
		// tail. If this succeeds, then we own the slot at tail.
		ptrs2 := q.pack(head, tail+1)
		if atomic.CompareAndSwapUint64(&q.headTail, ptrs, ptrs2) {
			// We now own slot.
			//
			// Tell enqueue that we're done with this slot. Zeroing the slot is
			// also important so we don't leave behind references that could keep
			// this object live longer than necessary.
			atomic.StorePointer(slot, nil)
			// At this point enqueue owns the slot.
			return b
		}
	}
}

// commitEnv contains the environment that a commitPipeline interacts
// with. This allows fine-grained testing of commitPipeline behavior without
// construction of an entire DB.
type commitEnv struct {
	// The next sequence number to give to a batch. Protected by
	// commitPipeline.mu.
	logSeqNum *uint64
	// The visible sequence number at which reads should be performed. Ratcheted
	// upwards atomically as batches are applied to the memtable.
	visibleSeqNum *uint64

	// Apply the batch to the specified memtable. Called concurrently.
	apply func(b *Batch, mem *memTable) error
	// Write the batch to the WAL. If wg != nil, the data will be persisted
	// asynchronously and Done will be called on the wait group upon
	// completion. Returns the memtable the batch should be applied to. Serial
	// execution is enforced by commitPipeline.mu.
	write func(b *Batch, wg *sync.WaitGroup) (*memTable, error)
}

// A commitPipeline manages the stages of committing a set of mutations
// (contained in a single Batch) atomically to the DB. The steps are
// conceptually:
//
//  1. Write the batch to the WAL and optionally sync the WAL
//  2. Apply the mutations in the batch to the memtable
//
// These two simple steps are made complicated by the desire for high
// performance. In the absence of concurrency, performance is limited by how
// fast a batch can be written (and synced) to the WAL and then added to the
// memtable, both of which are outside the purview of the commit
// pipeline. Performance under concurrency is the primary concern of the commit
// pipeline, though it also needs to maintain two invariants:
//
//  1. Batches need to be written to the WAL in sequence number order.
//  2. Batches need to be made visible for reads in sequence number order. This
//     invariant arises from the use of a single sequence number which
//     indicates which mutations are visible.
//
// Taking these invariants into account, let's revisit the work the commit
// pipeline needs to perform. Writing the batch to the WAL is necessarily
// serialized as there is a single WAL object. The order of the entries in the
// WAL defines the sequence number order. Note that writing to the WAL is
// extremely fast, usually just a memory copy. Applying the mutations in a
// batch to the memtable can occur concurrently as the underlying skiplist
// supports concurrent insertions.
// Publishing the visible sequence number is another serialization point, but
// one with a twist: the visible sequence number cannot be bumped until the
// mutations for earlier batches have finished applying to the memtable (the
// visible sequence number only ratchets up). Lastly, if requested, the commit
// waits for the WAL to sync. Note that waiting for the WAL sync after
// ratcheting the visible sequence number allows another goroutine to read
// committed data before the WAL has synced. This is similar behavior to
// RocksDB's manual WAL flush functionality. Application code needs to protect
// against this if necessary.
//
// The full outline of the commit pipeline operation is as follows:
//
//   with commitPipeline mutex locked:
//     assign batch sequence number
//     write batch to WAL
//   (optionally) add batch to WAL sync list
//   apply batch to memtable (concurrently)
//   wait for earlier batches to apply
//   ratchet read sequence number
//   (optionally) wait for the WAL to sync
//
// As soon as a batch has been written to the WAL, the commitPipeline mutex is
// released allowing another batch to write to the WAL. Each commit operation
// individually applies its batch to the memtable providing concurrency. The
// WAL sync happens concurrently with applying to the memtable (see
// commitPipeline.syncLoop).
//
// The "waits for earlier batches to apply" work is more complicated than might
// be expected. The obvious approach would be to keep a queue of pending
// batches and for each batch to wait for the previous batch to finish
// committing. This approach was tried initially and turned out to be too
// slow. The problem is that it causes excessive goroutine activity as each
// committing goroutine needs to wake up in order for the next goroutine to be
// unblocked. The approach taken in the current code is conceptually similar,
// though it avoids waking a goroutine to perform work that another goroutine
// can perform. A commitQueue (a single-producer, multiple-consumer queue)
// holds the ordered list of committing batches. Addition to the queue is done
// while holding commitPipeline.mu, ensuring the same ordering of batches in
// the queue as the ordering in the WAL. When a batch finishes applying to the
// memtable, it atomically updates its Batch.applied field. Ratcheting of the
// visible sequence number is done by commitPipeline.publish which loops
// dequeueing "applied" batches and ratcheting the visible sequence number. If
// we hit an unapplied batch at the head of the queue we can block as we know
// that committing of that unapplied batch will eventually find our (applied)
// batch in the queue. See commitPipeline.publish for additional commentary.
type commitPipeline struct {
	env commitEnv
	sem chan struct{}
	// The mutex to use for synchronizing access to logSeqNum and serializing
	// calls to commitEnv.write().
	mu sync.Mutex
	// Queue of pending batches to commit.
	pending commitQueue
}

func newCommitPipeline(env commitEnv) *commitPipeline {
	p := &commitPipeline{
		env: env,
		sem: make(chan struct{}, commitConcurrency),
	}
	return p
}

func (p *commitPipeline) Close() {
}

// Commit the specified batch, writing it to the WAL, optionally syncing the
// WAL, and applying the batch to the memtable. Upon successful return the
// batch's mutations will be visible for reading.
func (p *commitPipeline) Commit(b *Batch, syncWAL bool) error {
	if b.Empty() {
		return nil
	}

	p.sem <- struct{}{}

	// Prepare the batch for committing: enqueuing the batch in the pending
	// queue, determining the batch sequence number and writing the data to the
	// WAL.
	mem, err := p.prepare(b, syncWAL)
	if err != nil {
		// TODO(peter): what to do on error? the pipeline will be horked at this
		// point.
		panic(err)
	}

	// Apply the batch to the memtable.
	if err := p.env.apply(b, mem); err != nil {
		// TODO(peter): what to do on error? the pipeline will be horked at this
		// point.
		panic(err)
	}

	// Publish the batch sequence number.
	p.publish(b)

	<-p.sem
	return nil
}
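
// A minimal sketch of how a caller might drive the pipeline (the environment
// wiring below is illustrative, not the actual DB hookup):
//
//	var logSeqNum, visibleSeqNum uint64
//	pipeline := newCommitPipeline(commitEnv{
//		logSeqNum:     &logSeqNum,
//		visibleSeqNum: &visibleSeqNum,
//		apply:         applyToMemTable, // hypothetical helper
//		write:         writeToWAL,      // hypothetical helper
//	})
//	if err := pipeline.Commit(batch, true /* syncWAL */); err != nil {
//		// The batch was not committed.
//	}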

// AllocateSeqNum allocates count sequence numbers, invokes the prepare
// callback, then the apply callback, and then publishes the sequence
// numbers. AllocateSeqNum does not write to the WAL or add entries to the
// memtable. AllocateSeqNum can be used to sequence an operation such as
// sstable ingestion within the commit pipeline. The prepare callback is
// invoked with commitPipeline.mu held, but note that DB.mu is not held and
// must be locked if necessary.
func (p *commitPipeline) AllocateSeqNum(count int, prepare func(), apply func(seqNum uint64)) {
	// This method is similar to Commit and prepare. Be careful about trying to
	// share additional code with those methods because Commit and prepare are
	// performance critical code paths.

	b := newBatch(nil)
	defer b.release()

	// Give the batch a count equal to the number of allocated sequence numbers
	// so that the log and visible sequence numbers are incremented correctly.
	b.storage.data = make([]byte, batchHeaderLen)
	b.setCount(uint32(count))
	b.commit.Add(1)

	p.sem <- struct{}{}

	p.mu.Lock()

	// Enqueue the batch in the pending queue. Note that while the pending queue
	// is lock-free, we want the order of batches to be the same as the sequence
	// number order.
	p.pending.enqueue(b)

	// Assign the batch a sequence number. Note that we use atomic operations
	// here to handle concurrent reads of logSeqNum. commitPipeline.mu provides
	// mutual exclusion for other goroutines writing to logSeqNum.
	seqNum := atomic.AddUint64(p.env.logSeqNum, uint64(count)) - uint64(count)
	if seqNum == 0 {
		// We can't use the value 0 for the global seqnum during ingestion, because
		// 0 indicates no global seqnum. So allocate one more seqnum.
		atomic.AddUint64(p.env.logSeqNum, 1)
		seqNum++
		b.setCount(1 + uint32(count))
	}
	b.setSeqNum(seqNum)

	// Invoke the prepare callback. Note the lack of error reporting. Even if the
	// callback internally fails, the sequence number needs to be published in
	// order to allow the commit pipeline to proceed.
	prepare()

	p.mu.Unlock()

	// Invoke the apply callback.
	apply(b.SeqNum())

	// Publish the sequence number.
	p.publish(b)

	<-p.sem
}
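
// As an illustration of the intended use (hypothetical, not the actual
// ingestion code): a caller ingesting a set of sstables could allocate one
// sequence number per table, record the assignment in prepare, and perform
// the ingestion work in apply:
//
//	p.AllocateSeqNum(len(tables), func() {
//		// Record the sequence number assignment (hypothetical).
//	}, func(seqNum uint64) {
//		// Ingest the tables using seqNum as the global seqnum (hypothetical).
//	})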

func (p *commitPipeline) prepare(b *Batch, syncWAL bool) (*memTable, error) {
	n := uint64(b.Count())
	if n == invalidBatchCount {
		return nil, ErrInvalidBatch
	}
	count := 1
	if syncWAL {
		count++
	}
	b.commit.Add(count)

	var syncWG *sync.WaitGroup
	if syncWAL {
		syncWG = &b.commit
	}

	p.mu.Lock()

	// Enqueue the batch in the pending queue. Note that while the pending queue
	// is lock-free, we want the order of batches to be the same as the sequence
	// number order.
	p.pending.enqueue(b)

	// Assign the batch a sequence number. Note that we use atomic operations
	// here to handle concurrent reads of logSeqNum. commitPipeline.mu provides
	// mutual exclusion for other goroutines writing to logSeqNum.
	b.setSeqNum(atomic.AddUint64(p.env.logSeqNum, n) - n)

	// Write the data to the WAL.
	mem, err := p.env.write(b, syncWG)

	p.mu.Unlock()

	return mem, err
}

func (p *commitPipeline) publish(b *Batch) {
	// Mark the batch as applied.
	atomic.StoreUint32(&b.applied, 1)

	// Loop dequeuing applied batches from the pending queue. If our batch was
	// the head of the pending queue we are guaranteed that either we'll publish
	// it or someone else will dequeue and publish it. If our batch is not the
	// head of the queue then either we'll dequeue applied batches and reach our
	// batch or there is an unapplied batch blocking us. When that unapplied
	// batch applies it will go through the same process and publish our batch
	// for us.
	for {
		t := p.pending.dequeue()
		if t == nil {
			// Wait for another goroutine to publish us. We might also be waiting for
			// the WAL sync to finish.
			b.commit.Wait()
			break
		}
		if atomic.LoadUint32(&t.applied) != 1 {
			panic("not reached")
		}

		// We're responsible for publishing the sequence number for batch t, but
		// another concurrent goroutine might sneak in and publish the sequence
		// number for a subsequent batch. That's ok as all we're guaranteeing is
		// that the sequence number ratchets up.
		for {
			curSeqNum := atomic.LoadUint64(p.env.visibleSeqNum)
			newSeqNum := t.SeqNum() + uint64(t.Count())
			if newSeqNum <= curSeqNum {
				// t's sequence number has already been published.
				break
			}
			if atomic.CompareAndSwapUint64(p.env.visibleSeqNum, curSeqNum, newSeqNum) {
				// We successfully published t's sequence number.
				break
			}
		}

		t.commit.Done()
	}
}
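
// A worked example of the ratchet in publish, with illustrative numbers: if a
// dequeued batch t has SeqNum() == 7 and Count() == 3, publish attempts to CAS
// visibleSeqNum from its current value up to 10 (the first sequence number not
// belonging to t). If a concurrent publisher has already advanced
// visibleSeqNum to 12 or beyond, newSeqNum <= curSeqNum and the CAS is
// skipped, since the visible sequence number only moves forward.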