github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/internal/record/log_writer.go

// Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package record

import (
	"context"
	"encoding/binary"
	"io"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"time"

	"github.com/zuoyebang/bitalosdb/internal/base"
	"github.com/zuoyebang/bitalosdb/internal/crc"

	"github.com/cockroachdb/errors"
)

var walSyncLabels = pprof.Labels("bitalosdb", "wal-sync")

type block struct {
	// buf[:written] has already been filled with fragments. Updated atomically.
	written int32
	// buf[:flushed] has already been flushed to w.
	flushed int32
	buf     [blockSize]byte
}

type flusher interface {
	Flush() error
}

type syncer interface {
	Sync() error
}

const (
	syncConcurrencyBits = 9

	// SyncConcurrency is the maximum number of concurrent sync operations that
	// can be performed. Note that a sync operation is initiated either by a call
	// to SyncRecord or by a call to Close. Exported as this value also limits
	// the commit concurrency in commitPipeline.
	SyncConcurrency = 1 << syncConcurrencyBits
)

type syncSlot struct {
	wg  *sync.WaitGroup
	err *error
}

// syncQueue is a lock-free, fixed-size, single-producer, single-consumer
// queue. The single producer can push to the head, and the single consumer can
// pop multiple values from the tail. Popping calls Done() on each of the
// available *sync.WaitGroup elements.
type syncQueue struct {
	// headTail packs together a 32-bit head index and a 32-bit tail index. Both
	// are indexes into slots modulo len(slots).
	//
	// tail = index of oldest data in queue
	// head = index of next slot to fill
	//
	// Slots in the range [tail, head) are owned by consumers. A consumer
	// continues to own a slot outside this range until it nils the slot, at
	// which point ownership passes to the producer.
	//
	// The head index is stored in the most-significant bits so that we can
	// atomically add to it and the overflow is harmless.
	headTail uint64

	// slots is a ring buffer of values stored in this queue. The size must be a
	// power of 2. A slot is in use until the tail index has moved beyond it.
	slots [SyncConcurrency]syncSlot

	// blocked is an atomic boolean which indicates whether syncing is currently
	// blocked or can proceed. It is used by the implementation of
	// min-sync-interval to block syncing until the min interval has passed.
	blocked uint32
}
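
// Editor's note: the following is an illustrative sketch, not part of the
// original source. It shows how a head index and a tail index round-trip
// through the packed headTail word via the unpack helper below: the head
// occupies the upper 32 bits and the tail the lower 32 bits, so bumping the
// head is a single atomic add of 1<<dequeueBits.
//
//	var q syncQueue
//	atomic.StoreUint64(&q.headTail, uint64(5)<<dequeueBits|3)
//	head, tail := q.unpack(atomic.LoadUint64(&q.headTail))
//	// head == 5, tail == 3; two entries (slots 3 and 4) are in the queue.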

const dequeueBits = 32

func (q *syncQueue) unpack(ptrs uint64) (head, tail uint32) {
	const mask = 1<<dequeueBits - 1
	head = uint32((ptrs >> dequeueBits) & mask)
	tail = uint32(ptrs & mask)
	return
}

func (q *syncQueue) push(wg *sync.WaitGroup, err *error) {
	ptrs := atomic.LoadUint64(&q.headTail)
	head, tail := q.unpack(ptrs)
	if (tail+uint32(len(q.slots)))&(1<<dequeueBits-1) == head {
		panic("bitalosdb: queue is full")
	}

	slot := &q.slots[head&uint32(len(q.slots)-1)]
	slot.wg = wg
	slot.err = err

	// Increment head. This passes ownership of slot to dequeue and acts as a
	// store barrier for writing the slot.
	atomic.AddUint64(&q.headTail, 1<<dequeueBits)
}

func (q *syncQueue) setBlocked() {
	atomic.StoreUint32(&q.blocked, 1)
}

func (q *syncQueue) clearBlocked() {
	atomic.StoreUint32(&q.blocked, 0)
}

func (q *syncQueue) empty() bool {
	head, tail := q.load()
	return head == tail
}

func (q *syncQueue) load() (head, tail uint32) {
	if atomic.LoadUint32(&q.blocked) == 1 {
		return 0, 0
	}

	ptrs := atomic.LoadUint64(&q.headTail)
	head, tail = q.unpack(ptrs)
	return head, tail
}

func (q *syncQueue) pop(head, tail uint32, err error) error {
	if tail == head {
		// Queue is empty.
		return nil
	}

	for ; tail != head; tail++ {
		slot := &q.slots[tail&uint32(len(q.slots)-1)]
		wg := slot.wg
		if wg == nil {
			return errors.Errorf("nil waiter at %d", errors.Safe(tail&uint32(len(q.slots)-1)))
		}
		*slot.err = err
		slot.wg = nil
		slot.err = nil
		// We need to bump the tail count before signalling the wait group as
		// signalling the wait group can trigger the release of a blocked
		// goroutine, which will try to enqueue before we've "freed" space in
		// the queue.
		atomic.AddUint64(&q.headTail, 1)
		wg.Done()
	}

	return nil
}

// flusherCond is a specialized condition variable that allows its condition to
// change and readiness to be signalled without holding its associated mutex.
// In particular, when a waiter is atomically added to the syncQueue, this
// condition variable can be signalled without holding flusher.Mutex.
type flusherCond struct {
	mu   *sync.Mutex
	q    *syncQueue
	cond sync.Cond
}

func (c *flusherCond) init(mu *sync.Mutex, q *syncQueue) {
	c.mu = mu
	c.q = q
	// Yes, this is a bit circular, but that is intentional. flusherCond.cond.L
	// points to the flusherCond so that when cond.L.Unlock is called,
	// flusherCond.Unlock will be called and we can check the !syncQueue.empty()
	// condition.
	c.cond.L = c
}

func (c *flusherCond) Signal() {
	// Pass-through to the cond var.
	c.cond.Signal()
}

func (c *flusherCond) Wait() {
	// Pass-through to the cond var. Note that internally the cond var implements
	// Wait as:
	//
	//	t := notifyListAdd()
	//	L.Unlock()
	//	notifyListWait(t)
	//	L.Lock()
	//
	// We've configured the cond var to call flusherCond.Unlock() which allows
	// us to check the !syncQueue.empty() condition without a danger of missing a
	// notification. Any call to flusherCond.Signal() after notifyListAdd() is
	// called will cause the subsequent notifyListWait() to return immediately.
	c.cond.Wait()
}
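
// Editor's note: an illustrative sketch, not part of the original source. It
// shows the pattern that makes flusherCond useful: a producer can enqueue a
// sync waiter lock-free and then wake the flush loop without acquiring
// flusher.Mutex, which is exactly what SyncRecord does further below (wg is a
// *sync.WaitGroup, err is a *error).
//
//	f.syncQ.push(wg, err) // lock-free enqueue of the waiter
//	f.ready.Signal()      // safe to call without holding flusher.Mutex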

func (c *flusherCond) Lock() {
	c.mu.Lock()
}

func (c *flusherCond) Unlock() {
	c.mu.Unlock()
	if !c.q.empty() {
		// If the current goroutine is about to block on sync.Cond.Wait, this call
		// to Signal will prevent that. The comment in Wait above explains a bit
		// about what is going on here, but it is worth reiterating:
		//
		//	flusherCond.Wait()
		//	  sync.Cond.Wait()
		//	    t := notifyListAdd()
		//	    flusherCond.Unlock()    <-- we are here
		//	    notifyListWait(t)
		//	    flusherCond.Lock()
		//
		// The call to Signal here results in:
		//
		//	sync.Cond.Signal()
		//	  notifyListNotifyOne()
		//
		// The call to notifyListNotifyOne() will prevent the call to
		// notifyListWait(t) from blocking.
		c.cond.Signal()
	}
}

type durationFunc func() time.Duration

// syncTimer is an interface for timers, modeled on the closure callback mode
// of time.Timer. See time.AfterFunc and LogWriter.afterFunc. syncTimer is used
// by tests to mock out the timer functionality used to implement
// min-sync-interval.
type syncTimer interface {
	Reset(time.Duration) bool
	Stop() bool
}

// LogWriter writes records to an underlying io.Writer. In order to support WAL
// file reuse, a LogWriter's records are tagged with the WAL's file number.
// When reading a log file, a record from a previous incarnation of the file
// will return the error ErrInvalidLogNum.
type LogWriter struct {
	// w is the underlying writer.
	w io.Writer
	// c is w as a closer.
	c io.Closer
	// s is w as a syncer.
	s syncer
	// logNum is the low 32-bits of the log's file number.
	logNum uint32
	// blockNum is the zero based block number for the current block.
	blockNum int64
	// err is any accumulated error.
	err error
	// block is the current block being written. Protected by flusher.Mutex.
	block *block
	free  struct {
		sync.Mutex
		// Condition variable used to signal that a block has been freed.
		cond      sync.Cond
		blocks    []*block
		allocated int
	}

	flusher struct {
		sync.Mutex
		// ready is a condition variable that is signalled when there are blocks
		// to flush, syncing has been requested, or the LogWriter has been
		// closed. For signalling of a sync, it is safe to call without holding
		// flusher.Mutex.
		ready flusherCond
		// Set to true when the flush loop should be closed.
		close bool
		// Closed when the flush loop has terminated.
		closed chan struct{}
		// Accumulated flush error.
		err error
		// minSyncInterval is the minimum duration between syncs.
		minSyncInterval durationFunc
		pending         []*block
		syncQ           syncQueue
	}

	// afterFunc is a hook to allow tests to mock out the timer functionality
	// used for min-sync-interval. In normal operation this points to
	// time.AfterFunc.
	afterFunc func(d time.Duration, f func()) syncTimer
}

// NewLogWriter returns a new LogWriter.
func NewLogWriter(w io.Writer, logNum base.FileNum) *LogWriter {
	c, _ := w.(io.Closer)
	s, _ := w.(syncer)
	r := &LogWriter{
		w: w,
		c: c,
		s: s,
		// NB: we truncate the 64-bit log number to 32-bits. This is ok because a)
		// we are very unlikely to reach a file number of 4 billion and b) the log
		// number is used as a validation check and using only the low 32-bits is
		// sufficient for that purpose.
		logNum: uint32(logNum),
		afterFunc: func(d time.Duration, f func()) syncTimer {
			return time.AfterFunc(d, f)
		},
	}
	r.free.cond.L = &r.free.Mutex
	r.free.blocks = make([]*block, 0, 16)
	r.free.allocated = 1
	r.block = &block{}
	r.flusher.ready.init(&r.flusher.Mutex, &r.flusher.syncQ)
	r.flusher.closed = make(chan struct{})
	r.flusher.pending = make([]*block, 0, cap(r.free.blocks))
	go func() {
		pprof.Do(context.Background(), walSyncLabels, r.flushLoop)
	}()
	return r
}
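
// Editor's note: an illustrative usage sketch, not part of the original
// source. It assumes f is an *os.File (or any io.Writer that also implements
// the syncer and io.Closer interfaces) and a hypothetical payload []byte:
//
//	w := NewLogWriter(f, base.FileNum(1))
//	if _, err := w.WriteRecord(payload); err != nil {
//		// handle the write error
//	}
//	// Close flushes and syncs any buffered records before closing f.
//	if err := w.Close(); err != nil {
//		// handle the flush/sync/close error
//	}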

// SetMinSyncInterval sets the closure to invoke for retrieving the minimum
// sync duration between syncs.
func (w *LogWriter) SetMinSyncInterval(minSyncInterval durationFunc) {
	f := &w.flusher
	f.Lock()
	f.minSyncInterval = minSyncInterval
	f.Unlock()
}

func (w *LogWriter) flushLoop(context.Context) {
	f := &w.flusher
	f.Lock()

	var syncTimer syncTimer
	defer func() {
		if syncTimer != nil {
			syncTimer.Stop()
		}
		close(f.closed)
		f.Unlock()
	}()

	// The flush loop performs flushing of full and partial data blocks to the
	// underlying writer (LogWriter.w), syncing of the writer, and notification
	// to sync requests that they have completed.
	//
	// - flusher.ready is a condition variable that is signalled when there is
	//   work to do. Full blocks are contained in flusher.pending. The current
	//   partial block is in LogWriter.block. And sync operations are held in
	//   flusher.syncQ.
	//
	// - The decision to sync is determined by whether there are any sync
	//   requests present in flusher.syncQ and whether enough time has elapsed
	//   since the last sync. If not enough time has elapsed since the last sync,
	//   flusher.syncQ.blocked will be set to 1. If syncing is blocked,
	//   syncQueue.empty() will return true and syncQueue.load() will return 0,0
	//   (i.e. an empty list).
	//
	// - flusher.syncQ.blocked is cleared by a timer that is initialized when
	//   blocked is set to 1. When blocked is 1, no syncing will take place, but
	//   flushing will continue to be performed. The on/off toggle for syncing
	//   does not need to be carefully synchronized with the rest of processing
	//   -- all we need to ensure is that after any transition to blocked=1 there
	//   is eventually a transition to blocked=0. syncTimer performs this
	//   transition. Note that any change to min-sync-interval will not take
	//   effect until the previous timer elapses.
	//
	// - Picking up the syncing work to perform requires coordination with
	//   picking up the flushing work. Specifically, flushing work is queued
	//   before syncing work. The guarantee of this code is that when a sync is
	//   requested, any previously queued flush work will be synced. This
	//   motivates reading the syncing work (f.syncQ.load()) before picking up
	//   the flush work (atomic.LoadInt32(&w.block.written)).

	// The list of full blocks that need to be written. This is copied from
	// f.pending on every loop iteration, though the number of elements is small
	// (usually 1, max 16).
	pending := make([]*block, 0, cap(f.pending))

	for {
		for {
			// Grab the portion of the current block that requires flushing. Note
			// that the current block can be added to the pending blocks list
			// after we release the flusher lock, but it won't be part of pending.
			written := atomic.LoadInt32(&w.block.written)
			if len(f.pending) > 0 || written > w.block.flushed || !f.syncQ.empty() {
				break
			}
			if f.close {
				// If the writer is closed, pretend the sync timer fired immediately so
				// that we can process any queued sync requests.
				f.syncQ.clearBlocked()
				if !f.syncQ.empty() {
					break
				}
				return
			}
			f.ready.Wait()
			continue
		}

		pending = pending[:len(f.pending)]
		copy(pending, f.pending)
		f.pending = f.pending[:0]

		// Grab the list of sync waiters. Note that syncQueue.load() will return
		// 0,0 while we're waiting for the min-sync-interval to expire. This
		// allows flushing to proceed even if we're not ready to sync.
		head, tail := f.syncQ.load()

		// Grab the portion of the current block that requires flushing. Note that
		// the current block can be added to the pending blocks list after we
		// release the flusher lock, but it won't be part of pending. This has to
		// be ordered after we get the list of sync waiters from syncQ in order to
		// prevent a race where a waiter adds itself to syncQ, but this thread
		// picks up the entry in syncQ and not the buffered data.
		written := atomic.LoadInt32(&w.block.written)
		data := w.block.buf[w.block.flushed:written]
		w.block.flushed = written

		// If the flusher has an error, we propagate it to waiters. Note that in
		// spite of the error we consume the pending list above to free blocks
		// for writers.
		if f.err != nil {
			f.syncQ.pop(head, tail, f.err)
			continue
		}
		f.Unlock()
		synced, err := w.flushPending(data, pending, head, tail)
		f.Lock()
		f.err = err
		if f.err != nil {
			f.syncQ.clearBlocked()
			continue
		}

		if synced && f.minSyncInterval != nil {
			// A sync was performed. Make sure we've waited for the min sync
			// interval before syncing again.
			if min := f.minSyncInterval(); min > 0 {
				f.syncQ.setBlocked()
				if syncTimer == nil {
					syncTimer = w.afterFunc(min, func() {
						f.syncQ.clearBlocked()
						f.ready.Signal()
					})
				} else {
					syncTimer.Reset(min)
				}
			}
		}
	}
}
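
// Editor's note: an illustrative sketch, not part of the original source. It
// shows how a caller can bound sync frequency using SetMinSyncInterval; while
// syncQ is blocked by the timer, flushing still proceeds and only the sync is
// deferred. The 1ms value is an arbitrary example.
//
//	w.SetMinSyncInterval(func() time.Duration {
//		return time.Millisecond
//	})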

func (w *LogWriter) flushPending(
	data []byte, pending []*block, head, tail uint32,
) (synced bool, err error) {
	defer func() {
		// Translate panics into errors. The errors will cause flushLoop to shut
		// down, but this allows us to do so in a controlled way and avoids
		// swallowing the stack that created the panic if panicking itself hits a
		// panic (e.g. unlock of an unlocked mutex).
		if r := recover(); r != nil {
			err = errors.Newf("%v", r)
		}
	}()

	for _, b := range pending {
		if err = w.flushBlock(b); err != nil {
			break
		}
	}
	if err == nil && len(data) > 0 {
		_, err = w.w.Write(data)
	}

	synced = head != tail
	if synced {
		if err == nil && w.s != nil {
			err = w.s.Sync()
		}
		f := &w.flusher
		if popErr := f.syncQ.pop(head, tail, err); popErr != nil {
			return synced, popErr
		}
	}

	return synced, err
}

func (w *LogWriter) flushBlock(b *block) error {
	if _, err := w.w.Write(b.buf[b.flushed:]); err != nil {
		return err
	}
	b.written = 0
	b.flushed = 0
	w.free.Lock()
	w.free.blocks = append(w.free.blocks, b)
	w.free.cond.Signal()
	w.free.Unlock()
	return nil
}
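
// Editor's note: an illustrative sketch, not part of the original source. It
// shows how a flush or sync error reaches a waiter: the *error passed to
// SyncRecord is the same pointer stored in the syncQueue slot, so once the
// WaitGroup is released by syncQueue.pop the caller can inspect it. The names
// wg, syncErr and p are hypothetical.
//
//	var wg sync.WaitGroup
//	var syncErr error
//	wg.Add(1)
//	if _, err := w.SyncRecord(p, &wg, &syncErr); err != nil {
//		// the LogWriter was already in an error state or closed
//	}
//	wg.Wait()
//	if syncErr != nil {
//		// the WAL flush or sync for this record failed
//	}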

// queueBlock queues the current block for writing to the underlying writer,
// allocates a new block and reserves space for the next header.
func (w *LogWriter) queueBlock() {
	// Allocate a new block, blocking until one is available. We do this first
	// because w.block is protected by w.flusher.Mutex.
	w.free.Lock()
	if len(w.free.blocks) == 0 {
		if w.free.allocated < cap(w.free.blocks) {
			w.free.allocated++
			w.free.blocks = append(w.free.blocks, &block{})
		} else {
			for len(w.free.blocks) == 0 {
				w.free.cond.Wait()
			}
		}
	}
	nextBlock := w.free.blocks[len(w.free.blocks)-1]
	w.free.blocks = w.free.blocks[:len(w.free.blocks)-1]
	w.free.Unlock()

	f := &w.flusher
	f.Lock()
	f.pending = append(f.pending, w.block)
	w.block = nextBlock
	f.ready.Signal()
	w.err = w.flusher.err
	f.Unlock()

	w.blockNum++
}

// Close flushes and syncs any unwritten data and closes the writer.
// Where required, external synchronisation is provided by commitPipeline.mu.
func (w *LogWriter) Close() error {
	f := &w.flusher

	// Emit an EOF trailer signifying the end of this log. This helps readers
	// differentiate a corrupted entry in the middle of a log from garbage at
	// the tail of a recycled log file.
	w.emitEOFTrailer()

	// Signal the flush loop to close.
	f.Lock()
	f.close = true
	f.ready.Signal()
	f.Unlock()

	// Wait for the flush loop to close. The flush loop will not close until all
	// pending data has been written or an error occurs.
	<-f.closed

	// Sync any flushed data to disk. NB: flushLoop will sync after flushing the
	// last buffered data only if it was requested via syncQ, so we need to sync
	// here to ensure that all the data is synced.
	err := w.flusher.err
	if err == nil && w.s != nil {
		err = w.s.Sync()
	}

	if w.c != nil {
		cerr := w.c.Close()
		w.c = nil
		if cerr != nil {
			return cerr
		}
	}
	w.err = errors.New("bitalosdb/record: closed LogWriter")
	return err
}

// WriteRecord writes a complete record. Returns the offset just past the end
// of the record.
// External synchronisation provided by commitPipeline.mu.
func (w *LogWriter) WriteRecord(p []byte) (int64, error) {
	return w.SyncRecord(p, nil, nil)
}

// SyncRecord writes a complete record. If wg != nil the record will be
// asynchronously persisted to the underlying writer and Done will be called on
// the wait group upon completion. Returns the offset just past the end of the
// record.
// External synchronisation provided by commitPipeline.mu.
func (w *LogWriter) SyncRecord(p []byte, wg *sync.WaitGroup, err *error) (int64, error) {
	if w.err != nil {
		return -1, w.err
	}

	// The `i == 0` condition ensures we handle empty records. Such records can
	// possibly be generated for VersionEdits stored in the MANIFEST. While the
	// MANIFEST is currently written using Writer, it is good to support the same
	// semantics with LogWriter.
	for i := 0; i == 0 || len(p) > 0; i++ {
		p = w.emitFragment(i, p)
	}

	if wg != nil {
		// If we've been asked to persist the record, add the WaitGroup to the sync
		// queue and signal the flushLoop. Note that flushLoop will write partial
		// blocks to the file if syncing has been requested. The contract is that
		// any record written to the LogWriter to this point will be flushed to the
		// OS and synced to disk.
		f := &w.flusher
		f.syncQ.push(wg, err)
		f.ready.Signal()
	}

	offset := w.blockNum*blockSize + int64(w.block.written)
	// Note that we don't return w.err here as a concurrent call to Close would
	// race with our read. That's ok because the only error we could be seeing is
	// a sync error, which the caller can receive notification of by passing in a
	// non-nil err argument.
	return offset, nil
}
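
// Editor's note: an illustrative observation, not part of the original source.
// Because SyncRecord always runs the i == 0 iteration, an empty record still
// emits a single full chunk consisting of only the recyclable header (11
// bytes, per the offsets used in emitFragment below), so Size grows even for
// zero-length records:
//
//	before := w.Size()
//	if _, err := w.WriteRecord(nil); err == nil {
//		// w.Size() == before+11, assuming the header fit in the current block.
//	}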

// Size returns the current size of the file.
// External synchronisation provided by commitPipeline.mu.
func (w *LogWriter) Size() int64 {
	return w.blockNum*blockSize + int64(w.block.written)
}

func (w *LogWriter) emitEOFTrailer() {
	// Write a recyclable chunk header with a different log number. Readers
	// will treat the header as EOF when the log number does not match.
	b := w.block
	i := b.written
	binary.LittleEndian.PutUint32(b.buf[i+0:i+4], 0) // CRC
	binary.LittleEndian.PutUint16(b.buf[i+4:i+6], 0) // Size
	b.buf[i+6] = recyclableFullChunkType
	binary.LittleEndian.PutUint32(b.buf[i+7:i+11], w.logNum+1) // Log number
	atomic.StoreInt32(&b.written, i+int32(recyclableHeaderSize))
}

func (w *LogWriter) emitFragment(n int, p []byte) []byte {
	b := w.block
	i := b.written
	first := n == 0
	last := blockSize-i-recyclableHeaderSize >= int32(len(p))

	if last {
		if first {
			b.buf[i+6] = recyclableFullChunkType
		} else {
			b.buf[i+6] = recyclableLastChunkType
		}
	} else {
		if first {
			b.buf[i+6] = recyclableFirstChunkType
		} else {
			b.buf[i+6] = recyclableMiddleChunkType
		}
	}

	binary.LittleEndian.PutUint32(b.buf[i+7:i+11], w.logNum)

	r := copy(b.buf[i+recyclableHeaderSize:], p)
	j := i + int32(recyclableHeaderSize+r)
	binary.LittleEndian.PutUint32(b.buf[i+0:i+4], crc.New(b.buf[i+6:j]).Value())
	binary.LittleEndian.PutUint16(b.buf[i+4:i+6], uint16(r))
	atomic.StoreInt32(&b.written, j)

	if blockSize-b.written < recyclableHeaderSize {
		// There is no room for another fragment in the block, so fill the
		// remaining bytes with zeros and queue the block for flushing.
		for i := b.written; i < blockSize; i++ {
			b.buf[i] = 0
		}
		w.queueBlock()
	}
	return p[r:]
}
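
// Editor's note: an illustrative sketch, not part of the original source. Per
// the offsets used in emitFragment and emitEOFTrailer above, each recyclable
// chunk written to a block has the following layout, with the CRC computed
// over the chunk type, log number and payload (b.buf[i+6:j]):
//
//	+---------+----------+----------+-------------+-----------------+
//	| CRC: 4B | size: 2B | type: 1B | log num: 4B | payload: size B |
//	+---------+----------+----------+-------------+-----------------+
//
// A record that does not fit in the current block is split into
// first/middle/last fragments, each carrying its own header, and any tail of a
// block too small to hold another header is zero-filled before the block is
// queued for flushing.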