github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/record/log_writer.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package record 6 7 import ( 8 "context" 9 "encoding/binary" 10 "io" 11 "runtime/pprof" 12 "sync" 13 "sync/atomic" 14 "time" 15 16 "github.com/cockroachdb/errors" 17 "github.com/cockroachdb/pebble/internal/base" 18 "github.com/cockroachdb/pebble/internal/crc" 19 "github.com/prometheus/client_golang/prometheus" 20 ) 21 22 var walSyncLabels = pprof.Labels("pebble", "wal-sync") 23 var errClosedWriter = errors.New("pebble/record: closed LogWriter") 24 25 type block struct { 26 // buf[:written] has already been filled with fragments. Updated atomically. 27 written atomic.Int32 28 // buf[:flushed] has already been flushed to w. 29 flushed int32 30 buf [blockSize]byte 31 } 32 33 type flusher interface { 34 Flush() error 35 } 36 37 type syncer interface { 38 Sync() error 39 } 40 41 const ( 42 syncConcurrencyBits = 12 43 44 // SyncConcurrency is the maximum number of concurrent sync operations that 45 // can be performed. Note that a sync operation is initiated either by a call 46 // to SyncRecord or by a call to Close. Exported as this value also limits 47 // the commit concurrency in commitPipeline. 48 SyncConcurrency = 1 << syncConcurrencyBits 49 ) 50 51 type syncSlot struct { 52 wg *sync.WaitGroup 53 err *error 54 } 55 56 // syncQueue is a lock-free fixed-size single-producer, single-consumer 57 // queue. The single-producer can push to the head, and the single-consumer can 58 // pop multiple values from the tail. Popping calls Done() on each of the 59 // available *sync.WaitGroup elements. 60 type syncQueue struct { 61 // headTail packs together a 32-bit head index and a 32-bit tail index. Both 62 // are indexes into slots modulo len(slots)-1. 63 // 64 // tail = index of oldest data in queue 65 // head = index of next slot to fill 66 // 67 // Slots in the range [tail, head) are owned by consumers. A consumer 68 // continues to own a slot outside this range until it nils the slot, at 69 // which point ownership passes to the producer. 70 // 71 // The head index is stored in the most-significant bits so that we can 72 // atomically add to it and the overflow is harmless. 73 headTail atomic.Uint64 74 75 // slots is a ring buffer of values stored in this queue. The size must be a 76 // power of 2. A slot is in use until the tail index has moved beyond it. 77 slots [SyncConcurrency]syncSlot 78 79 // blocked is an atomic boolean which indicates whether syncing is currently 80 // blocked or can proceed. It is used by the implementation of 81 // min-sync-interval to block syncing until the min interval has passed. 82 blocked atomic.Bool 83 } 84 85 const dequeueBits = 32 86 87 func (q *syncQueue) unpack(ptrs uint64) (head, tail uint32) { 88 const mask = 1<<dequeueBits - 1 89 head = uint32((ptrs >> dequeueBits) & mask) 90 tail = uint32(ptrs & mask) 91 return 92 } 93 94 func (q *syncQueue) push(wg *sync.WaitGroup, err *error) { 95 ptrs := q.headTail.Load() 96 head, tail := q.unpack(ptrs) 97 if (tail+uint32(len(q.slots)))&(1<<dequeueBits-1) == head { 98 panic("pebble: queue is full") 99 } 100 101 slot := &q.slots[head&uint32(len(q.slots)-1)] 102 slot.wg = wg 103 slot.err = err 104 105 // Increment head. This passes ownership of slot to dequeue and acts as a 106 // store barrier for writing the slot. 107 q.headTail.Add(1 << dequeueBits) 108 } 109 110 func (q *syncQueue) setBlocked() { 111 q.blocked.Store(true) 112 } 113 114 func (q *syncQueue) clearBlocked() { 115 q.blocked.Store(false) 116 } 117 118 func (q *syncQueue) empty() bool { 119 head, tail, _ := q.load() 120 return head == tail 121 } 122 123 // load returns the head, tail of the queue for what should be synced to the 124 // caller. It can return a head, tail of zero if syncing is blocked due to 125 // min-sync-interval. It additionally returns the real length of this queue, 126 // regardless of whether syncing is blocked. 127 func (q *syncQueue) load() (head, tail, realLength uint32) { 128 ptrs := q.headTail.Load() 129 head, tail = q.unpack(ptrs) 130 realLength = head - tail 131 if q.blocked.Load() { 132 return 0, 0, realLength 133 } 134 return head, tail, realLength 135 } 136 137 // REQUIRES: queueSemChan is non-nil. 138 func (q *syncQueue) pop(head, tail uint32, err error, queueSemChan chan struct{}) error { 139 if tail == head { 140 // Queue is empty. 141 return nil 142 } 143 144 for ; tail != head; tail++ { 145 slot := &q.slots[tail&uint32(len(q.slots)-1)] 146 wg := slot.wg 147 if wg == nil { 148 return errors.Errorf("nil waiter at %d", errors.Safe(tail&uint32(len(q.slots)-1))) 149 } 150 *slot.err = err 151 slot.wg = nil 152 slot.err = nil 153 // We need to bump the tail count before signalling the wait group as 154 // signalling the wait group can trigger release a blocked goroutine which 155 // will try to enqueue before we've "freed" space in the queue. 156 q.headTail.Add(1) 157 wg.Done() 158 // Is always non-nil in production. 159 if queueSemChan != nil { 160 <-queueSemChan 161 } 162 } 163 164 return nil 165 } 166 167 // flusherCond is a specialized condition variable that allows its condition to 168 // change and readiness be signalled without holding its associated mutex. In 169 // particular, when a waiter is added to syncQueue atomically, this condition 170 // variable can be signalled without holding flusher.Mutex. 171 type flusherCond struct { 172 mu *sync.Mutex 173 q *syncQueue 174 cond sync.Cond 175 } 176 177 func (c *flusherCond) init(mu *sync.Mutex, q *syncQueue) { 178 c.mu = mu 179 c.q = q 180 // Yes, this is a bit circular, but that is intentional. flusherCond.cond.L 181 // points flusherCond so that when cond.L.Unlock is called flusherCond.Unlock 182 // will be called and we can check the !syncQueue.empty() condition. 183 c.cond.L = c 184 } 185 186 func (c *flusherCond) Signal() { 187 // Pass-through to the cond var. 188 c.cond.Signal() 189 } 190 191 func (c *flusherCond) Wait() { 192 // Pass-through to the cond var. Note that internally the cond var implements 193 // Wait as: 194 // 195 // t := notifyListAdd() 196 // L.Unlock() 197 // notifyListWait(t) 198 // L.Lock() 199 // 200 // We've configured the cond var to call flusherReady.Unlock() which allows 201 // us to check the !syncQueue.empty() condition without a danger of missing a 202 // notification. Any call to flusherReady.Signal() after notifyListAdd() is 203 // called will cause the subsequent notifyListWait() to return immediately. 204 c.cond.Wait() 205 } 206 207 func (c *flusherCond) Lock() { 208 c.mu.Lock() 209 } 210 211 func (c *flusherCond) Unlock() { 212 c.mu.Unlock() 213 if !c.q.empty() { 214 // If the current goroutine is about to block on sync.Cond.Wait, this call 215 // to Signal will prevent that. The comment in Wait above explains a bit 216 // about what is going on here, but it is worth reiterating: 217 // 218 // flusherCond.Wait() 219 // sync.Cond.Wait() 220 // t := notifyListAdd() 221 // flusherCond.Unlock() <-- we are here 222 // notifyListWait(t) 223 // flusherCond.Lock() 224 // 225 // The call to Signal here results in: 226 // 227 // sync.Cond.Signal() 228 // notifyListNotifyOne() 229 // 230 // The call to notifyListNotifyOne() will prevent the call to 231 // notifyListWait(t) from blocking. 232 c.cond.Signal() 233 } 234 } 235 236 type durationFunc func() time.Duration 237 238 // syncTimer is an interface for timers, modeled on the closure callback mode 239 // of time.Timer. See time.AfterFunc and LogWriter.afterFunc. syncTimer is used 240 // by tests to mock out the timer functionality used to implement 241 // min-sync-interval. 242 type syncTimer interface { 243 Reset(time.Duration) bool 244 Stop() bool 245 } 246 247 // LogWriter writes records to an underlying io.Writer. In order to support WAL 248 // file reuse, a LogWriter's records are tagged with the WAL's file 249 // number. When reading a log file a record from a previous incarnation of the 250 // file will return the error ErrInvalidLogNum. 251 type LogWriter struct { 252 // w is the underlying writer. 253 w io.Writer 254 // c is w as a closer. 255 c io.Closer 256 // s is w as a syncer. 257 s syncer 258 // logNum is the low 32-bits of the log's file number. 259 logNum uint32 260 // blockNum is the zero based block number for the current block. 261 blockNum int64 262 // err is any accumulated error. TODO(peter): This needs to be protected in 263 // some fashion. Perhaps using atomic.Value. 264 err error 265 // block is the current block being written. Protected by flusher.Mutex. 266 block *block 267 free struct { 268 sync.Mutex 269 blocks []*block 270 } 271 272 flusher struct { 273 sync.Mutex 274 // Flusher ready is a condition variable that is signalled when there are 275 // blocks to flush, syncing has been requested, or the LogWriter has been 276 // closed. For signalling of a sync, it is safe to call without holding 277 // flusher.Mutex. 278 ready flusherCond 279 // Set to true when the flush loop should be closed. 280 close bool 281 // Closed when the flush loop has terminated. 282 closed chan struct{} 283 // Accumulated flush error. 284 err error 285 // minSyncInterval is the minimum duration between syncs. 286 minSyncInterval durationFunc 287 fsyncLatency prometheus.Histogram 288 pending []*block 289 syncQ syncQueue 290 metrics *LogWriterMetrics 291 } 292 293 // afterFunc is a hook to allow tests to mock out the timer functionality 294 // used for min-sync-interval. In normal operation this points to 295 // time.AfterFunc. 296 afterFunc func(d time.Duration, f func()) syncTimer 297 298 // See the comment for LogWriterConfig.QueueSemChan. 299 queueSemChan chan struct{} 300 } 301 302 // LogWriterConfig is a struct used for configuring new LogWriters 303 type LogWriterConfig struct { 304 WALMinSyncInterval durationFunc 305 WALFsyncLatency prometheus.Histogram 306 // QueueSemChan is an optional channel to pop from when popping from 307 // LogWriter.flusher.syncQueue. It functions as a semaphore that prevents 308 // the syncQueue from overflowing (which will cause a panic). All production 309 // code ensures this is non-nil. 310 QueueSemChan chan struct{} 311 } 312 313 // initialAllocatedBlocksCap is the initial capacity of the various slices 314 // intended to hold LogWriter blocks. The LogWriter may allocate more blocks 315 // than this threshold allows. 316 const initialAllocatedBlocksCap = 32 317 318 // blockPool pools *blocks to avoid allocations. Blocks are only added to the 319 // Pool when a LogWriter is closed. Before that, free blocks are maintained 320 // within a LogWriter's own internal free list `w.free.blocks`. 321 var blockPool = sync.Pool{ 322 New: func() any { return &block{} }, 323 } 324 325 // NewLogWriter returns a new LogWriter. 326 func NewLogWriter( 327 w io.Writer, logNum base.DiskFileNum, logWriterConfig LogWriterConfig, 328 ) *LogWriter { 329 c, _ := w.(io.Closer) 330 s, _ := w.(syncer) 331 r := &LogWriter{ 332 w: w, 333 c: c, 334 s: s, 335 // NB: we truncate the 64-bit log number to 32-bits. This is ok because a) 336 // we are very unlikely to reach a file number of 4 billion and b) the log 337 // number is used as a validation check and using only the low 32-bits is 338 // sufficient for that purpose. 339 logNum: uint32(logNum), 340 afterFunc: func(d time.Duration, f func()) syncTimer { 341 return time.AfterFunc(d, f) 342 }, 343 queueSemChan: logWriterConfig.QueueSemChan, 344 } 345 r.free.blocks = make([]*block, 0, initialAllocatedBlocksCap) 346 r.block = blockPool.Get().(*block) 347 r.flusher.ready.init(&r.flusher.Mutex, &r.flusher.syncQ) 348 r.flusher.closed = make(chan struct{}) 349 r.flusher.pending = make([]*block, 0, cap(r.free.blocks)) 350 r.flusher.metrics = &LogWriterMetrics{} 351 352 f := &r.flusher 353 f.minSyncInterval = logWriterConfig.WALMinSyncInterval 354 f.fsyncLatency = logWriterConfig.WALFsyncLatency 355 356 go func() { 357 pprof.Do(context.Background(), walSyncLabels, r.flushLoop) 358 }() 359 return r 360 } 361 362 func (w *LogWriter) flushLoop(context.Context) { 363 f := &w.flusher 364 f.Lock() 365 366 // Initialize idleStartTime to when the loop starts. 367 idleStartTime := time.Now() 368 var syncTimer syncTimer 369 defer func() { 370 // Capture the idle duration between the last piece of work and when the 371 // loop terminated. 372 f.metrics.WriteThroughput.IdleDuration += time.Since(idleStartTime) 373 if syncTimer != nil { 374 syncTimer.Stop() 375 } 376 close(f.closed) 377 f.Unlock() 378 }() 379 380 // The flush loop performs flushing of full and partial data blocks to the 381 // underlying writer (LogWriter.w), syncing of the writer, and notification 382 // to sync requests that they have completed. 383 // 384 // - flusher.ready is a condition variable that is signalled when there is 385 // work to do. Full blocks are contained in flusher.pending. The current 386 // partial block is in LogWriter.block. And sync operations are held in 387 // flusher.syncQ. 388 // 389 // - The decision to sync is determined by whether there are any sync 390 // requests present in flusher.syncQ and whether enough time has elapsed 391 // since the last sync. If not enough time has elapsed since the last sync, 392 // flusher.syncQ.blocked will be set to 1. If syncing is blocked, 393 // syncQueue.empty() will return true and syncQueue.load() will return 0,0 394 // (i.e. an empty list). 395 // 396 // - flusher.syncQ.blocked is cleared by a timer that is initialized when 397 // blocked is set to 1. When blocked is 1, no syncing will take place, but 398 // flushing will continue to be performed. The on/off toggle for syncing 399 // does not need to be carefully synchronized with the rest of processing 400 // -- all we need to ensure is that after any transition to blocked=1 there 401 // is eventually a transition to blocked=0. syncTimer performs this 402 // transition. Note that any change to min-sync-interval will not take 403 // effect until the previous timer elapses. 404 // 405 // - Picking up the syncing work to perform requires coordination with 406 // picking up the flushing work. Specifically, flushing work is queued 407 // before syncing work. The guarantee of this code is that when a sync is 408 // requested, any previously queued flush work will be synced. This 409 // motivates reading the syncing work (f.syncQ.load()) before picking up 410 // the flush work (w.block.written.Load()). 411 412 // The list of full blocks that need to be written. This is copied from 413 // f.pending on every loop iteration, though the number of elements is 414 // usually small (most frequently 1). In the case of the WAL LogWriter, the 415 // number of blocks is bounded by the size of the WAL's corresponding 416 // memtable (MemtableSize/BlockSize). With the default 64 MiB memtables, 417 // this works out to at most 2048 elements if the entirety of the memtable's 418 // contents are queued. 419 pending := make([]*block, 0, cap(f.pending)) 420 for { 421 for { 422 // Grab the portion of the current block that requires flushing. Note that 423 // the current block can be added to the pending blocks list after we release 424 // the flusher lock, but it won't be part of pending. 425 written := w.block.written.Load() 426 if len(f.pending) > 0 || written > w.block.flushed || !f.syncQ.empty() { 427 break 428 } 429 if f.close { 430 // If the writer is closed, pretend the sync timer fired immediately so 431 // that we can process any queued sync requests. 432 f.syncQ.clearBlocked() 433 if !f.syncQ.empty() { 434 break 435 } 436 return 437 } 438 f.ready.Wait() 439 continue 440 } 441 // Found work to do, so no longer idle. 442 workStartTime := time.Now() 443 idleDuration := workStartTime.Sub(idleStartTime) 444 pending = append(pending[:0], f.pending...) 445 f.pending = f.pending[:0] 446 f.metrics.PendingBufferLen.AddSample(int64(len(pending))) 447 448 // Grab the list of sync waiters. Note that syncQueue.load() will return 449 // 0,0 while we're waiting for the min-sync-interval to expire. This 450 // allows flushing to proceed even if we're not ready to sync. 451 head, tail, realSyncQLen := f.syncQ.load() 452 f.metrics.SyncQueueLen.AddSample(int64(realSyncQLen)) 453 454 // Grab the portion of the current block that requires flushing. Note that 455 // the current block can be added to the pending blocks list after we 456 // release the flusher lock, but it won't be part of pending. This has to 457 // be ordered after we get the list of sync waiters from syncQ in order to 458 // prevent a race where a waiter adds itself to syncQ, but this thread 459 // picks up the entry in syncQ and not the buffered data. 460 written := w.block.written.Load() 461 data := w.block.buf[w.block.flushed:written] 462 w.block.flushed = written 463 464 // If flusher has an error, we propagate it to waiters. Note in spite of 465 // error we consume the pending list above to free blocks for writers. 466 if f.err != nil { 467 f.syncQ.pop(head, tail, f.err, w.queueSemChan) 468 // Update the idleStartTime if work could not be done, so that we don't 469 // include the duration we tried to do work as idle. We don't bother 470 // with the rest of the accounting, which means we will undercount. 471 idleStartTime = time.Now() 472 continue 473 } 474 f.Unlock() 475 synced, syncLatency, bytesWritten, err := w.flushPending(data, pending, head, tail) 476 f.Lock() 477 if synced && f.fsyncLatency != nil { 478 f.fsyncLatency.Observe(float64(syncLatency)) 479 } 480 f.err = err 481 if f.err != nil { 482 f.syncQ.clearBlocked() 483 // Update the idleStartTime if work could not be done, so that we don't 484 // include the duration we tried to do work as idle. We don't bother 485 // with the rest of the accounting, which means we will undercount. 486 idleStartTime = time.Now() 487 continue 488 } 489 490 if synced && f.minSyncInterval != nil { 491 // A sync was performed. Make sure we've waited for the min sync 492 // interval before syncing again. 493 if min := f.minSyncInterval(); min > 0 { 494 f.syncQ.setBlocked() 495 if syncTimer == nil { 496 syncTimer = w.afterFunc(min, func() { 497 f.syncQ.clearBlocked() 498 f.ready.Signal() 499 }) 500 } else { 501 syncTimer.Reset(min) 502 } 503 } 504 } 505 // Finished work, and started idling. 506 idleStartTime = time.Now() 507 workDuration := idleStartTime.Sub(workStartTime) 508 f.metrics.WriteThroughput.Bytes += bytesWritten 509 f.metrics.WriteThroughput.WorkDuration += workDuration 510 f.metrics.WriteThroughput.IdleDuration += idleDuration 511 } 512 } 513 514 func (w *LogWriter) flushPending( 515 data []byte, pending []*block, head, tail uint32, 516 ) (synced bool, syncLatency time.Duration, bytesWritten int64, err error) { 517 defer func() { 518 // Translate panics into errors. The errors will cause flushLoop to shut 519 // down, but allows us to do so in a controlled way and avoid swallowing 520 // the stack that created the panic if panic'ing itself hits a panic 521 // (e.g. unlock of unlocked mutex). 522 if r := recover(); r != nil { 523 err = errors.Newf("%v", r) 524 } 525 }() 526 527 for _, b := range pending { 528 bytesWritten += blockSize - int64(b.flushed) 529 if err = w.flushBlock(b); err != nil { 530 break 531 } 532 } 533 if n := len(data); err == nil && n > 0 { 534 bytesWritten += int64(n) 535 _, err = w.w.Write(data) 536 } 537 538 synced = head != tail 539 if synced { 540 if err == nil && w.s != nil { 541 syncLatency, err = w.syncWithLatency() 542 } 543 f := &w.flusher 544 if popErr := f.syncQ.pop(head, tail, err, w.queueSemChan); popErr != nil { 545 return synced, syncLatency, bytesWritten, popErr 546 } 547 } 548 549 return synced, syncLatency, bytesWritten, err 550 } 551 552 func (w *LogWriter) syncWithLatency() (time.Duration, error) { 553 start := time.Now() 554 err := w.s.Sync() 555 syncLatency := time.Since(start) 556 return syncLatency, err 557 } 558 559 func (w *LogWriter) flushBlock(b *block) error { 560 if _, err := w.w.Write(b.buf[b.flushed:]); err != nil { 561 return err 562 } 563 b.written.Store(0) 564 b.flushed = 0 565 w.free.Lock() 566 w.free.blocks = append(w.free.blocks, b) 567 w.free.Unlock() 568 return nil 569 } 570 571 // queueBlock queues the current block for writing to the underlying writer, 572 // allocates a new block and reserves space for the next header. 573 func (w *LogWriter) queueBlock() { 574 // Allocate a new block, blocking until one is available. We do this first 575 // because w.block is protected by w.flusher.Mutex. 576 w.free.Lock() 577 if len(w.free.blocks) == 0 { 578 w.free.blocks = append(w.free.blocks, blockPool.Get().(*block)) 579 } 580 nextBlock := w.free.blocks[len(w.free.blocks)-1] 581 w.free.blocks = w.free.blocks[:len(w.free.blocks)-1] 582 w.free.Unlock() 583 584 f := &w.flusher 585 f.Lock() 586 f.pending = append(f.pending, w.block) 587 w.block = nextBlock 588 f.ready.Signal() 589 w.err = w.flusher.err 590 f.Unlock() 591 592 w.blockNum++ 593 } 594 595 // Close flushes and syncs any unwritten data and closes the writer. 596 // Where required, external synchronisation is provided by commitPipeline.mu. 597 func (w *LogWriter) Close() error { 598 f := &w.flusher 599 600 // Emit an EOF trailer signifying the end of this log. This helps readers 601 // differentiate between a corrupted entry in the middle of a log from 602 // garbage at the tail from a recycled log file. 603 w.emitEOFTrailer() 604 605 // Signal the flush loop to close. 606 f.Lock() 607 f.close = true 608 f.ready.Signal() 609 f.Unlock() 610 611 // Wait for the flush loop to close. The flush loop will not close until all 612 // pending data has been written or an error occurs. 613 <-f.closed 614 615 // Sync any flushed data to disk. NB: flushLoop will sync after flushing the 616 // last buffered data only if it was requested via syncQ, so we need to sync 617 // here to ensure that all the data is synced. 618 err := w.flusher.err 619 var syncLatency time.Duration 620 if err == nil && w.s != nil { 621 syncLatency, err = w.syncWithLatency() 622 } 623 f.Lock() 624 if f.fsyncLatency != nil { 625 f.fsyncLatency.Observe(float64(syncLatency)) 626 } 627 free := w.free.blocks 628 f.Unlock() 629 630 if w.c != nil { 631 cerr := w.c.Close() 632 w.c = nil 633 if cerr != nil { 634 return cerr 635 } 636 } 637 638 for _, b := range free { 639 b.flushed = 0 640 b.written.Store(0) 641 blockPool.Put(b) 642 } 643 644 w.err = errClosedWriter 645 return err 646 } 647 648 // WriteRecord writes a complete record. Returns the offset just past the end 649 // of the record. 650 // External synchronisation provided by commitPipeline.mu. 651 func (w *LogWriter) WriteRecord(p []byte) (int64, error) { 652 logSize, err := w.SyncRecord(p, nil, nil) 653 return logSize, err 654 } 655 656 // SyncRecord writes a complete record. If wg != nil the record will be 657 // asynchronously persisted to the underlying writer and done will be called on 658 // the wait group upon completion. Returns the offset just past the end of the 659 // record. 660 // External synchronisation provided by commitPipeline.mu. 661 func (w *LogWriter) SyncRecord( 662 p []byte, wg *sync.WaitGroup, err *error, 663 ) (logSize int64, err2 error) { 664 if w.err != nil { 665 return -1, w.err 666 } 667 668 // The `i == 0` condition ensures we handle empty records. Such records can 669 // possibly be generated for VersionEdits stored in the MANIFEST. While the 670 // MANIFEST is currently written using Writer, it is good to support the same 671 // semantics with LogWriter. 672 for i := 0; i == 0 || len(p) > 0; i++ { 673 p = w.emitFragment(i, p) 674 } 675 676 if wg != nil { 677 // If we've been asked to persist the record, add the WaitGroup to the sync 678 // queue and signal the flushLoop. Note that flushLoop will write partial 679 // blocks to the file if syncing has been requested. The contract is that 680 // any record written to the LogWriter to this point will be flushed to the 681 // OS and synced to disk. 682 f := &w.flusher 683 f.syncQ.push(wg, err) 684 f.ready.Signal() 685 } 686 687 offset := w.blockNum*blockSize + int64(w.block.written.Load()) 688 // Note that we don't return w.err here as a concurrent call to Close would 689 // race with our read. That's ok because the only error we could be seeing is 690 // one to syncing for which the caller can receive notification of by passing 691 // in a non-nil err argument. 692 return offset, nil 693 } 694 695 // Size returns the current size of the file. 696 // External synchronisation provided by commitPipeline.mu. 697 func (w *LogWriter) Size() int64 { 698 return w.blockNum*blockSize + int64(w.block.written.Load()) 699 } 700 701 func (w *LogWriter) emitEOFTrailer() { 702 // Write a recyclable chunk header with a different log number. Readers 703 // will treat the header as EOF when the log number does not match. 704 b := w.block 705 i := b.written.Load() 706 binary.LittleEndian.PutUint32(b.buf[i+0:i+4], 0) // CRC 707 binary.LittleEndian.PutUint16(b.buf[i+4:i+6], 0) // Size 708 b.buf[i+6] = recyclableFullChunkType 709 binary.LittleEndian.PutUint32(b.buf[i+7:i+11], w.logNum+1) // Log number 710 b.written.Store(i + int32(recyclableHeaderSize)) 711 } 712 713 func (w *LogWriter) emitFragment(n int, p []byte) (remainingP []byte) { 714 b := w.block 715 i := b.written.Load() 716 first := n == 0 717 last := blockSize-i-recyclableHeaderSize >= int32(len(p)) 718 719 if last { 720 if first { 721 b.buf[i+6] = recyclableFullChunkType 722 } else { 723 b.buf[i+6] = recyclableLastChunkType 724 } 725 } else { 726 if first { 727 b.buf[i+6] = recyclableFirstChunkType 728 } else { 729 b.buf[i+6] = recyclableMiddleChunkType 730 } 731 } 732 733 binary.LittleEndian.PutUint32(b.buf[i+7:i+11], w.logNum) 734 735 r := copy(b.buf[i+recyclableHeaderSize:], p) 736 j := i + int32(recyclableHeaderSize+r) 737 binary.LittleEndian.PutUint32(b.buf[i+0:i+4], crc.New(b.buf[i+6:j]).Value()) 738 binary.LittleEndian.PutUint16(b.buf[i+4:i+6], uint16(r)) 739 b.written.Store(j) 740 741 if blockSize-b.written.Load() < recyclableHeaderSize { 742 // There is no room for another fragment in the block, so fill the 743 // remaining bytes with zeros and queue the block for flushing. 744 clear(b.buf[b.written.Load():]) 745 w.queueBlock() 746 } 747 return p[r:] 748 } 749 750 // Metrics must be called after Close. The callee will no longer modify the 751 // returned LogWriterMetrics. 752 func (w *LogWriter) Metrics() *LogWriterMetrics { 753 return w.flusher.metrics 754 } 755 756 // LogWriterMetrics contains misc metrics for the log writer. 757 type LogWriterMetrics struct { 758 WriteThroughput base.ThroughputMetric 759 PendingBufferLen base.GaugeSampleMetric 760 SyncQueueLen base.GaugeSampleMetric 761 } 762 763 // Merge merges metrics from x. Requires that x is non-nil. 764 func (m *LogWriterMetrics) Merge(x *LogWriterMetrics) error { 765 m.WriteThroughput.Merge(x.WriteThroughput) 766 m.PendingBufferLen.Merge(x.PendingBufferLen) 767 m.SyncQueueLen.Merge(x.SyncQueueLen) 768 return nil 769 }