github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/record/log_writer_test.go

// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package record

import (
	"bytes"
	"fmt"
	"math"
	"sort"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/humanize"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/pebble/vfs/errorfs"
	"github.com/cockroachdb/pebble/vfs/vfstest"
	"github.com/prometheus/client_golang/prometheus"
	prometheusgo "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/require"
)

type syncErrorFile struct {
	vfs.File
	err error
}

func (f syncErrorFile) Sync() error {
	return f.err
}

func TestSyncQueue(t *testing.T) {
	var q syncQueue
	var closed atomic.Bool

	var flusherWG sync.WaitGroup
	flusherWG.Add(1)
	go func() {
		defer flusherWG.Done()
		for {
			if closed.Load() {
				return
			}
			head, tail, _ := q.load()
			q.pop(head, tail, nil, nil)
		}
	}()

	var commitMu sync.Mutex
	var doneWG sync.WaitGroup
	for i := 0; i < SyncConcurrency; i++ {
		doneWG.Add(1)
		go func(i int) {
			defer doneWG.Done()
			for j := 0; j < 1000; j++ {
				wg := &sync.WaitGroup{}
				wg.Add(1)
				// syncQueue is a single-producer, single-consumer queue. We need to
				// provide mutual exclusion on the producer side.
				commitMu.Lock()
				q.push(wg, new(error))
				commitMu.Unlock()
				wg.Wait()
			}
		}(i)
	}
	doneWG.Wait()

	closed.Store(true)
	flusherWG.Wait()
}

func TestFlusherCond(t *testing.T) {
	var mu sync.Mutex
	var q syncQueue
	var c flusherCond
	var closed bool

	c.init(&mu, &q)

	var flusherWG sync.WaitGroup
	flusherWG.Add(1)
	go func() {
		defer flusherWG.Done()

		mu.Lock()
		defer mu.Unlock()

		for {
			for {
				if closed {
					return
				}
				if !q.empty() {
					break
				}
				c.Wait()
			}

			head, tail, _ := q.load()
			q.pop(head, tail, nil, nil)
		}
	}()

	var commitMu sync.Mutex
	var doneWG sync.WaitGroup
	// NB: we're testing with low concurrency here, because what we want to
	// stress is that signalling of the flusherCond works correctly.
	// Specifically, we want to make sure that a signal is not "lost", which
	// would cause the test to wedge.
	for i := 0; i < 2; i++ {
		doneWG.Add(1)
		go func(i int) {
			defer doneWG.Done()
			for j := 0; j < 10000; j++ {
				wg := &sync.WaitGroup{}
				wg.Add(1)
				// syncQueue is a single-producer, single-consumer queue. We need to
				// provide mutual exclusion on the producer side.
				commitMu.Lock()
				q.push(wg, new(error))
				commitMu.Unlock()
				c.Signal()
				wg.Wait()
			}
		}(i)
	}
	doneWG.Wait()

	mu.Lock()
	closed = true
	c.Signal()
	mu.Unlock()
	flusherWG.Wait()
}

func TestSyncError(t *testing.T) {
	mem := vfs.NewMem()
	f, err := mem.Create("log")
	require.NoError(t, err)

	injectedErr := errors.New("injected error")
	w := NewLogWriter(syncErrorFile{f, injectedErr}, 0, LogWriterConfig{
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})

	syncRecord := func() {
		var syncErr error
		var syncWG sync.WaitGroup
		syncWG.Add(1)
		_, err = w.SyncRecord([]byte("hello"), &syncWG, &syncErr)
		require.NoError(t, err)
		syncWG.Wait()
		if injectedErr != syncErr {
			t.Fatalf("unexpected %v but found %v", injectedErr, syncErr)
		}
	}
	// First waiter receives error.
	syncRecord()
	// All subsequent waiters also receive the error.
	syncRecord()
	syncRecord()
}

type syncFile struct {
	writePos atomic.Int64
	syncPos  atomic.Int64
}

func (f *syncFile) Write(buf []byte) (int, error) {
	n := len(buf)
	f.writePos.Add(int64(n))
	return n, nil
}

func (f *syncFile) Sync() error {
	f.syncPos.Store(f.writePos.Load())
	return nil
}

func TestSyncRecord(t *testing.T) {
	f := &syncFile{}
	w := NewLogWriter(f, 0, LogWriterConfig{WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{})})

	var syncErr error
	for i := 0; i < 100000; i++ {
		var syncWG sync.WaitGroup
		syncWG.Add(1)
		offset, err := w.SyncRecord([]byte("hello"), &syncWG, &syncErr)
		require.NoError(t, err)
		syncWG.Wait()
		require.NoError(t, syncErr)
		if v := f.writePos.Load(); offset != v {
			t.Fatalf("expected write pos %d, but found %d", offset, v)
		}
		if v := f.syncPos.Load(); offset != v {
			t.Fatalf("expected sync pos %d, but found %d", offset, v)
		}
	}
}

func TestSyncRecordWithSignalChan(t *testing.T) {
	f := &syncFile{}
	semChan := make(chan struct{}, 5)
	for i := 0; i < cap(semChan); i++ {
		semChan <- struct{}{}
	}
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
		QueueSemChan:    semChan,
	})
	require.Equal(t, cap(semChan), len(semChan))
	var syncErr error
	for i := 0; i < 5; i++ {
		var syncWG sync.WaitGroup
		syncWG.Add(1)
		_, err := w.SyncRecord([]byte("hello"), &syncWG, &syncErr)
		require.NoError(t, err)
		syncWG.Wait()
		require.NoError(t, syncErr)
		// The waitgroup is released before the channel is read, so wait if
		// necessary.
		require.Eventually(t, func() bool {
			return cap(semChan)-(i+1) == len(semChan)
		}, 10*time.Second, time.Millisecond)
	}
}

type fakeTimer struct {
	f func()
}

func (t *fakeTimer) Reset(d time.Duration) bool {
	return false
}

func (t *fakeTimer) Stop() bool {
	return false
}

func try(initialSleep, maxTotalSleep time.Duration, f func() error) error {
	totalSleep := time.Duration(0)
	for d := initialSleep; ; d *= 2 {
		time.Sleep(d)
		totalSleep += d
		if err := f(); err == nil || totalSleep >= maxTotalSleep {
			return err
		}
	}
}

func TestMinSyncInterval(t *testing.T) {
	const minSyncInterval = 100 * time.Millisecond

	f := &syncFile{}
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALMinSyncInterval: func() time.Duration {
			return minSyncInterval
		},
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})

	var timer fakeTimer
	w.afterFunc = func(d time.Duration, f func()) syncTimer {
		if d != minSyncInterval {
			t.Fatalf("expected minSyncInterval %s, but found %s", minSyncInterval, d)
		}
		timer.f = f
		timer.Reset(d)
		return &timer
	}

	syncRecord := func(n int) *sync.WaitGroup {
		wg := &sync.WaitGroup{}
		wg.Add(1)
		_, err := w.SyncRecord(bytes.Repeat([]byte{'a'}, n), wg, new(error))
		require.NoError(t, err)
		return wg
	}

	// Sync one record which will cause the sync timer to kick in.
	syncRecord(1).Wait()

	startWritePos := f.writePos.Load()
	startSyncPos := f.syncPos.Load()

	// Write a bunch of large records. The sync position should not change
	// because we haven't triggered the timer. But note that the writes should
	// not block either even though syncing isn't being done.
	var wg *sync.WaitGroup
	for i := 0; i < 100; i++ {
		wg = syncRecord(10000)
		if v := f.syncPos.Load(); startSyncPos != v {
			t.Fatalf("expected syncPos %d, but found %d", startSyncPos, v)
		}
		// NB: we can't use syncQueue.load() here as that will return 0,0 while the
		// syncQueue is blocked.
		head, tail := w.flusher.syncQ.unpack(w.flusher.syncQ.headTail.Load())
		waiters := head - tail
		if waiters != uint32(i+1) {
			t.Fatalf("expected %d waiters, but found %d", i+1, waiters)
		}
	}

	err := try(time.Millisecond, 5*time.Second, func() error {
		v := f.writePos.Load()
		if v > startWritePos {
			return nil
		}
		return errors.Errorf("expected writePos > %d, but found %d", startWritePos, v)
	})
	require.NoError(t, err)

	// Fire the timer, and then wait for the last record to sync.
	timer.f()
	wg.Wait()

	if w, s := f.writePos.Load(), f.syncPos.Load(); w != s {
		t.Fatalf("expected syncPos %d, but found %d", s, w)
	}
}

func TestMinSyncIntervalClose(t *testing.T) {
	const minSyncInterval = 100 * time.Millisecond

	f := &syncFile{}
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALMinSyncInterval: func() time.Duration {
			return minSyncInterval
		},
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})

	var timer fakeTimer
	w.afterFunc = func(d time.Duration, f func()) syncTimer {
		if d != minSyncInterval {
			t.Fatalf("expected minSyncInterval %s, but found %s", minSyncInterval, d)
		}
		timer.f = f
		timer.Reset(d)
		return &timer
	}

	syncRecord := func(n int) *sync.WaitGroup {
		wg := &sync.WaitGroup{}
		wg.Add(1)
		_, err := w.SyncRecord(bytes.Repeat([]byte{'a'}, n), wg, new(error))
		require.NoError(t, err)
		return wg
	}

	// Sync one record which will cause the sync timer to kick in.
	syncRecord(1).Wait()

	// Syncing another record will not complete until the timer is fired OR the
	// writer is closed.
	wg := syncRecord(1)
	require.NoError(t, w.Close())
	wg.Wait()
}

type syncFileWithWait struct {
	f       syncFile
	writeWG sync.WaitGroup
	syncWG  sync.WaitGroup
}

func (f *syncFileWithWait) Write(buf []byte) (int, error) {
	f.writeWG.Wait()
	return f.f.Write(buf)
}

func (f *syncFileWithWait) Sync() error {
	f.syncWG.Wait()
	return f.f.Sync()
}

func TestMetricsWithoutSync(t *testing.T) {
	f := &syncFileWithWait{}
	f.writeWG.Add(1)
	w := NewLogWriter(f, 0, LogWriterConfig{WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{})})
	offset, err := w.SyncRecord([]byte("hello"), nil, nil)
	require.NoError(t, err)
	const recordSize = 16
	require.EqualValues(t, recordSize, offset)
	// We have 512KB of buffer capacity, and 5 bytes + overhead = 16 bytes for
	// each record. Write 28 * 1024 records to fill it up to 87.5%. This
	// constitutes ~14 blocks (each 32KB).
	const numRecords = 28 << 10
	for i := 0; i < numRecords; i++ {
		_, err = w.SyncRecord([]byte("hello"), nil, nil)
		require.NoError(t, err)
	}
	// Unblock the flush loop. It will run once or twice to write these blocks,
	// plus may run one more time due to the Close, so up to 3 runs. So ~14
	// blocks flushed over up to 3 runs.
	f.writeWG.Done()
	w.Close()
	m := w.Metrics()
	// Mean is >= 4 filled blocks.
	require.LessOrEqual(t, float64(4), m.PendingBufferLen.Mean())
	// None of these writes asked to be synced.
	require.EqualValues(t, 0, int(m.SyncQueueLen.Mean()))
	require.Less(t, int64(numRecords*recordSize), m.WriteThroughput.Bytes)
}

func TestMetricsWithSync(t *testing.T) {
	f := &syncFileWithWait{}
	f.syncWG.Add(1)
	syncLatencyMicros := prometheus.NewHistogram(prometheus.HistogramOpts{
		Buckets: []float64{0,
			float64(time.Millisecond),
			float64(2 * time.Millisecond),
			float64(3 * time.Millisecond),
			float64(4 * time.Millisecond),
			float64(5 * time.Millisecond),
			float64(6 * time.Millisecond),
			float64(7 * time.Millisecond),
			float64(8 * time.Millisecond),
			float64(9 * time.Millisecond),
			float64(10 * time.Millisecond)},
	})

	w := NewLogWriter(f, 0, LogWriterConfig{
		WALFsyncLatency: syncLatencyMicros,
	},
	)
	var wg sync.WaitGroup
	wg.Add(100)
	for i := 0; i < 100; i++ {
		var syncErr error
		_, err := w.SyncRecord([]byte("hello"), &wg, &syncErr)
		require.NoError(t, err)
	}

	const syncLatency = 100 * time.Millisecond
	go func() {
		time.Sleep(syncLatency)
		// Unblock the flush loop. It may have run once or twice for these writes,
		// plus may run one more time due to the Close, so up to 3 runs. So 100
		// elements in the sync queue, spread over up to 3 runs.
		f.syncWG.Done()
	}()

	// Close() will only return after flushing is finished.
	require.NoError(t, w.Close())

	m := w.Metrics()
	require.LessOrEqual(t, float64(30), m.SyncQueueLen.Mean())

	writeTo := &prometheusgo.Metric{}
	require.NoError(t, syncLatencyMicros.Write(writeTo))
	for i := 0; i < 100; i += 10 {
		t.Logf("%d%%: %v", i, valueAtQuantileWindowed(writeTo.Histogram, float64(i)))
	}
	// Allow for some inaccuracy in sleep and for two syncs, one of which was
	// fast.
	require.LessOrEqual(t, float64(syncLatency/(2*time.Microsecond)),
		valueAtQuantileWindowed(writeTo.Histogram, 90))
	require.LessOrEqual(t, syncLatency/2, m.WriteThroughput.WorkDuration)
}

func valueAtQuantileWindowed(histogram *prometheusgo.Histogram, q float64) float64 {
	buckets := histogram.Bucket
	n := float64(*histogram.SampleCount)
	if n == 0 {
		return 0
	}

	// NB: The 0.5 is added for rounding purposes; it helps in cases where
	// SampleCount is small.
	rank := uint64(((q / 100) * n) + 0.5)

	// Since we are missing the +Inf bucket, CumulativeCounts may never exceed
	// rank. By omitting the highest bucket we have from the search, the failed
	// search will land on that last bucket and we don't have to do any special
	// checks regarding landing on a non-existent bucket.
	b := sort.Search(len(buckets)-1, func(i int) bool { return *buckets[i].CumulativeCount >= rank })

	var (
		bucketStart float64 // defaults to 0, which we assume is the lower bound of the smallest bucket
		bucketEnd   = *buckets[b].UpperBound
		count       = *buckets[b].CumulativeCount
	)

	// Calculate the linearly interpolated value within the bucket.
	if b > 0 {
		bucketStart = *buckets[b-1].UpperBound
		count -= *buckets[b-1].CumulativeCount
		rank -= *buckets[b-1].CumulativeCount
	}
	val := bucketStart + (bucketEnd-bucketStart)*(float64(rank)/float64(count))
	if math.IsNaN(val) || math.IsInf(val, -1) {
		return 0
	}

	// Should not extrapolate past the upper bound of the largest bucket.
	//
	// NB: SampleCount includes the implicit +Inf bucket but the
	// buckets[len(buckets)-1].UpperBound refers to the largest bucket defined
	// by us -- the client library doesn't give us access to the +Inf bucket
	// which Prometheus uses under the hood. With a high enough quantile, the
	// val computed further below surpasses the upper bound of the largest
	// bucket. Using that interpolated value feels wrong since we'd be
	// extrapolating. Also, for specific metrics if we see our q99 values to be
	// hitting the top-most bucket boundary, that's an indication for us to
	// choose better buckets for more accuracy. It's also worth noting that the
	// prometheus client library does the same thing when the resulting value is
	// in the +Inf bucket, whereby they return the upper bound of the second
	// last bucket -- see [1].
	//
	// [1]: https://github.com/prometheus/prometheus/blob/d9162189/promql/quantile.go#L103.
	if val > *buckets[len(buckets)-1].UpperBound {
		return *buckets[len(buckets)-1].UpperBound
	}

	return val
}

// TestQueueWALBlocks tests queueing many un-flushed WAL blocks when syncing is
// blocked.
func TestQueueWALBlocks(t *testing.T) {
	blockWriteCh := make(chan struct{}, 1)
	f := errorfs.WrapFile(vfstest.DiscardFile, errorfs.InjectorFunc(func(op errorfs.Op) error {
		if op.Kind == errorfs.OpFileWrite {
			<-blockWriteCh
		}
		return nil
	}))
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})
	const numBlocks = 1024
	var b [blockSize]byte
	var logSize int64
	for i := 0; i < numBlocks; i++ {
		var err error
		logSize, err = w.SyncRecord(b[:], nil, nil)
		if err != nil {
			t.Fatal(err)
		}
	}
	close(blockWriteCh)
	require.NoError(t, w.Close())

	m := w.Metrics()
	t.Logf("LogSize is %s", humanize.Bytes.Int64(logSize))
	t.Logf("Mean pending buffer len is %.2f", m.PendingBufferLen.Mean())
	require.GreaterOrEqual(t, logSize, int64(numBlocks*blockSize))
}

// BenchmarkQueueWALBlocks exercises queueing within the LogWriter. It can be
// useful to measure allocations involved when flushing is slow enough to
// accumulate a large backlog of queued blocks.
func BenchmarkQueueWALBlocks(b *testing.B) {
	const dataVolume = 64 << 20 /* 64 MB */
	for _, writeSize := range []int64{64, 512, 1024, 2048, 32768} {
		b.Run(fmt.Sprintf("record-size=%s", humanize.Bytes.Int64(writeSize)), func(b *testing.B) {
			record := make([]byte, writeSize)
			numRecords := int(dataVolume / writeSize)

			for j := 0; j < b.N; j++ {
				b.StopTimer()
				blockWriteCh := make(chan struct{}, 1)
				f := errorfs.WrapFile(vfstest.DiscardFile, errorfs.InjectorFunc(func(op errorfs.Op) error {
					if op.Kind == errorfs.OpFileWrite {
						<-blockWriteCh
					}
					return nil
				}))
				w := NewLogWriter(f, 0, LogWriterConfig{
					WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
				})

				b.StartTimer()
				for n := numRecords; n > 0; n-- {
					if _, err := w.SyncRecord(record[:], nil, nil); err != nil {
						b.Fatal(err)
					}
				}
				b.StopTimer()

				b.SetBytes(dataVolume)
				close(blockWriteCh)
				require.NoError(b, w.Close())
			}
		})
	}
}
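
// The sketch below is not part of the upstream test suite; it is an
// illustrative example (with a hypothetical test name and hand-built inputs)
// of the linear interpolation performed by valueAtQuantileWindowed above.
// With 10 samples split evenly across buckets with upper bounds 1 and 2, the
// p75 rank (0.75*10 + 0.5 = 8) lands 3/5 of the way through the second
// bucket, so the interpolated value is 1.6.
func TestValueAtQuantileWindowedSketch(t *testing.T) {
	u64 := func(v uint64) *uint64 { return &v }
	f64 := func(v float64) *float64 { return &v }
	h := &prometheusgo.Histogram{
		SampleCount: u64(10),
		Bucket: []*prometheusgo.Bucket{
			{UpperBound: f64(1), CumulativeCount: u64(5)},
			{UpperBound: f64(2), CumulativeCount: u64(10)},
		},
	}
	// rank = 8; the search lands in the second bucket, so
	// val = 1 + (2-1)*((8-5)/(10-5)) = 1.6.
	require.InDelta(t, 1.6, valueAtQuantileWindowed(h, 75), 1e-9)
}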