github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/record/log_writer_test.go

// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package record

import (
	"bytes"
	"fmt"
	"math"
	"sort"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/humanize"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/cockroachdb/pebble/vfs/errorfs"
	"github.com/cockroachdb/pebble/vfs/vfstest"
	"github.com/prometheus/client_golang/prometheus"
	prometheusgo "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/require"
)

type syncErrorFile struct {
	vfs.File
	err error
}

func (f syncErrorFile) Sync() error {
	return f.err
}

func TestSyncQueue(t *testing.T) {
	var q syncQueue
	var closed atomic.Bool

	var flusherWG sync.WaitGroup
	flusherWG.Add(1)
	go func() {
		defer flusherWG.Done()
		for {
			if closed.Load() {
				return
			}
			head, tail, _ := q.load()
			q.pop(head, tail, nil, nil)
		}
	}()

	var commitMu sync.Mutex
	var doneWG sync.WaitGroup
	for i := 0; i < SyncConcurrency; i++ {
		doneWG.Add(1)
		go func(i int) {
			defer doneWG.Done()
			for j := 0; j < 1000; j++ {
				wg := &sync.WaitGroup{}
				wg.Add(1)
				// syncQueue is a single-producer, single-consumer queue. We need to
				// provide mutual exclusion on the producer side.
				commitMu.Lock()
				q.push(wg, new(error))
				commitMu.Unlock()
				wg.Wait()
			}
		}(i)
	}
	doneWG.Wait()

	closed.Store(true)
	flusherWG.Wait()
}

func TestFlusherCond(t *testing.T) {
	var mu sync.Mutex
	var q syncQueue
	var c flusherCond
	var closed bool

	c.init(&mu, &q)

	var flusherWG sync.WaitGroup
	flusherWG.Add(1)
	go func() {
		defer flusherWG.Done()

		mu.Lock()
		defer mu.Unlock()

		for {
			for {
				if closed {
					return
				}
				if !q.empty() {
					break
				}
				c.Wait()
			}

			head, tail, _ := q.load()
			q.pop(head, tail, nil, nil)
		}
	}()

	var commitMu sync.Mutex
	var doneWG sync.WaitGroup
	// NB: we're testing with low concurrency here, because what we want to
	// stress is that signalling of the flusherCond works correctly.
	// Specifically, we want to make sure that a signal is not lost, as a lost
	// signal would cause the test to wedge.
	for i := 0; i < 2; i++ {
		doneWG.Add(1)
		go func(i int) {
			defer doneWG.Done()
			for j := 0; j < 10000; j++ {
				wg := &sync.WaitGroup{}
				wg.Add(1)
				// syncQueue is a single-producer, single-consumer queue. We need to
				// provide mutual exclusion on the producer side.
				commitMu.Lock()
				q.push(wg, new(error))
				commitMu.Unlock()
				c.Signal()
				wg.Wait()
			}
		}(i)
	}
	doneWG.Wait()

	mu.Lock()
	closed = true
	c.Signal()
	mu.Unlock()
	flusherWG.Wait()
}

func TestSyncError(t *testing.T) {
	mem := vfs.NewMem()
	f, err := mem.Create("log")
	require.NoError(t, err)

	injectedErr := errors.New("injected error")
	w := NewLogWriter(syncErrorFile{f, injectedErr}, 0, LogWriterConfig{
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})

	syncRecord := func() {
		var syncErr error
		var syncWG sync.WaitGroup
		syncWG.Add(1)
		_, err = w.SyncRecord([]byte("hello"), &syncWG, &syncErr)
		require.NoError(t, err)
		syncWG.Wait()
		if injectedErr != syncErr {
			t.Fatalf("expected %v but found %v", injectedErr, syncErr)
		}
	}
	// First waiter receives error.
	syncRecord()
	// All subsequent waiters also receive the error.
	syncRecord()
	syncRecord()
}

type syncFile struct {
	writePos atomic.Int64
	syncPos  atomic.Int64
}

func (f *syncFile) Write(buf []byte) (int, error) {
	n := len(buf)
	f.writePos.Add(int64(n))
	return n, nil
}

func (f *syncFile) Sync() error {
	f.syncPos.Store(f.writePos.Load())
	return nil
}

func TestSyncRecord(t *testing.T) {
	f := &syncFile{}
	w := NewLogWriter(f, 0, LogWriterConfig{WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{})})

	var syncErr error
	for i := 0; i < 100000; i++ {
		var syncWG sync.WaitGroup
		syncWG.Add(1)
		offset, err := w.SyncRecord([]byte("hello"), &syncWG, &syncErr)
		require.NoError(t, err)
		syncWG.Wait()
		require.NoError(t, syncErr)
		if v := f.writePos.Load(); offset != v {
			t.Fatalf("expected write pos %d, but found %d", offset, v)
		}
		if v := f.syncPos.Load(); offset != v {
			t.Fatalf("expected sync pos %d, but found %d", offset, v)
		}
	}
}

func TestSyncRecordWithSignalChan(t *testing.T) {
	f := &syncFile{}
	semChan := make(chan struct{}, 5)
	for i := 0; i < cap(semChan); i++ {
		semChan <- struct{}{}
	}
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
		QueueSemChan:    semChan,
	})
	require.Equal(t, cap(semChan), len(semChan))
	var syncErr error
	for i := 0; i < 5; i++ {
		var syncWG sync.WaitGroup
		syncWG.Add(1)
		_, err := w.SyncRecord([]byte("hello"), &syncWG, &syncErr)
		require.NoError(t, err)
		syncWG.Wait()
		require.NoError(t, syncErr)
		// The waitgroup is released before the channel is read, so wait if
		// necessary.
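		// Each synced record should consume exactly one token from
		// QueueSemChan, so after i+1 records the channel length should have
		// dropped by i+1.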
		require.Eventually(t, func() bool {
			return cap(semChan)-(i+1) == len(semChan)
		}, 10*time.Second, time.Millisecond)
	}
}

type fakeTimer struct {
	f func()
}

func (t *fakeTimer) Reset(d time.Duration) bool {
	return false
}

func (t *fakeTimer) Stop() bool {
	return false
}

func try(initialSleep, maxTotalSleep time.Duration, f func() error) error {
	totalSleep := time.Duration(0)
	for d := initialSleep; ; d *= 2 {
		time.Sleep(d)
		totalSleep += d
		if err := f(); err == nil || totalSleep >= maxTotalSleep {
			return err
		}
	}
}

func TestMinSyncInterval(t *testing.T) {
	const minSyncInterval = 100 * time.Millisecond

	f := &syncFile{}
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALMinSyncInterval: func() time.Duration {
			return minSyncInterval
		},
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})

	var timer fakeTimer
	w.afterFunc = func(d time.Duration, f func()) syncTimer {
		if d != minSyncInterval {
			t.Fatalf("expected minSyncInterval %s, but found %s", minSyncInterval, d)
		}
		timer.f = f
		timer.Reset(d)
		return &timer
	}

	syncRecord := func(n int) *sync.WaitGroup {
		wg := &sync.WaitGroup{}
		wg.Add(1)
		_, err := w.SyncRecord(bytes.Repeat([]byte{'a'}, n), wg, new(error))
		require.NoError(t, err)
		return wg
	}

	// Sync one record which will cause the sync timer to kick in.
	syncRecord(1).Wait()

	startWritePos := f.writePos.Load()
	startSyncPos := f.syncPos.Load()

	// Write a bunch of large records. The sync position should not change
	// because we haven't triggered the timer. But note that the writes should
	// not block either even though syncing isn't being done.
	var wg *sync.WaitGroup
	for i := 0; i < 100; i++ {
		wg = syncRecord(10000)
		if v := f.syncPos.Load(); startSyncPos != v {
			t.Fatalf("expected syncPos %d, but found %d", startSyncPos, v)
		}
		// NB: we can't use syncQueue.load() here as that will return 0,0 while
		// the syncQueue is blocked.
		head, tail := w.flusher.syncQ.unpack(w.flusher.syncQ.headTail.Load())
		waiters := head - tail
		if waiters != uint32(i+1) {
			t.Fatalf("expected %d waiters, but found %d", i+1, waiters)
		}
	}

	err := try(time.Millisecond, 5*time.Second, func() error {
		v := f.writePos.Load()
		if v > startWritePos {
			return nil
		}
		return errors.Errorf("expected writePos > %d, but found %d", startWritePos, v)
	})
	require.NoError(t, err)

	// Fire the timer, and then wait for the last record to sync.
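	// The fakeTimer never fires on its own (Reset and Stop are no-ops), so
	// invoking the captured callback directly stands in for the
	// min-sync-interval timer expiring and unblocks the queued syncs.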
	timer.f()
	wg.Wait()

	if w, s := f.writePos.Load(), f.syncPos.Load(); w != s {
		t.Fatalf("expected syncPos %d, but found %d", s, w)
	}
}

func TestMinSyncIntervalClose(t *testing.T) {
	const minSyncInterval = 100 * time.Millisecond

	f := &syncFile{}
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALMinSyncInterval: func() time.Duration {
			return minSyncInterval
		},
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})

	var timer fakeTimer
	w.afterFunc = func(d time.Duration, f func()) syncTimer {
		if d != minSyncInterval {
			t.Fatalf("expected minSyncInterval %s, but found %s", minSyncInterval, d)
		}
		timer.f = f
		timer.Reset(d)
		return &timer
	}

	syncRecord := func(n int) *sync.WaitGroup {
		wg := &sync.WaitGroup{}
		wg.Add(1)
		_, err := w.SyncRecord(bytes.Repeat([]byte{'a'}, n), wg, new(error))
		require.NoError(t, err)
		return wg
	}

	// Sync one record which will cause the sync timer to kick in.
	syncRecord(1).Wait()

	// Syncing another record will not complete until the timer is fired OR the
	// writer is closed.
	wg := syncRecord(1)
	require.NoError(t, w.Close())
	wg.Wait()
}

type syncFileWithWait struct {
	f       syncFile
	writeWG sync.WaitGroup
	syncWG  sync.WaitGroup
}

func (f *syncFileWithWait) Write(buf []byte) (int, error) {
	f.writeWG.Wait()
	return f.f.Write(buf)
}

func (f *syncFileWithWait) Sync() error {
	f.syncWG.Wait()
	return f.f.Sync()
}

func TestMetricsWithoutSync(t *testing.T) {
	f := &syncFileWithWait{}
	f.writeWG.Add(1)
	w := NewLogWriter(f, 0, LogWriterConfig{WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{})})
	offset, err := w.SyncRecord([]byte("hello"), nil, nil)
	require.NoError(t, err)
	const recordSize = 16
	require.EqualValues(t, recordSize, offset)
	// We have 512KB of buffer capacity, and 5 bytes + overhead = 16 bytes for
	// each record. Write 28 * 1024 records to fill it up to 87.5%. This
	// constitutes ~14 blocks (each 32KB).
	const numRecords = 28 << 10
	for i := 0; i < numRecords; i++ {
		_, err = w.SyncRecord([]byte("hello"), nil, nil)
		require.NoError(t, err)
	}
	// Unblock the flush loop. It will run once or twice to write these blocks,
	// plus may run one more time due to the Close, so up to 3 runs. So ~14
	// blocks flushed over up to 3 runs.
	f.writeWG.Done()
	w.Close()
	m := w.Metrics()
	// Mean is >= 4 filled blocks.
	require.LessOrEqual(t, float64(4), m.PendingBufferLen.Mean())
	// None of these writes asked to be synced.
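	// Consequently the sync queue stayed empty (its mean length is zero),
	// while the write throughput still accounts for all of the bytes written.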
	require.EqualValues(t, 0, int(m.SyncQueueLen.Mean()))
	require.Less(t, int64(numRecords*recordSize), m.WriteThroughput.Bytes)
}

func TestMetricsWithSync(t *testing.T) {
	f := &syncFileWithWait{}
	f.syncWG.Add(1)
	writeTo := &prometheusgo.Metric{}
	syncLatencyMicros := prometheus.NewHistogram(prometheus.HistogramOpts{
		Buckets: []float64{0,
			float64(time.Millisecond),
			float64(2 * time.Millisecond),
			float64(3 * time.Millisecond),
			float64(4 * time.Millisecond),
			float64(5 * time.Millisecond),
			float64(6 * time.Millisecond),
			float64(7 * time.Millisecond),
			float64(8 * time.Millisecond),
			float64(9 * time.Millisecond),
			float64(10 * time.Millisecond)},
	})

	w := NewLogWriter(f, 0, LogWriterConfig{
		WALFsyncLatency: syncLatencyMicros,
	},
	)
	var wg sync.WaitGroup
	wg.Add(100)
	for i := 0; i < 100; i++ {
		var syncErr error
		_, err := w.SyncRecord([]byte("hello"), &wg, &syncErr)
		require.NoError(t, err)
	}
	// Unblock the flush loop. It may have run once or twice for these writes,
	// plus may run one more time due to the Close, so up to 3 runs. So 100
	// elements in the sync queue, spread over up to 3 runs.
	syncLatency := 10 * time.Millisecond
	time.Sleep(syncLatency)
	f.syncWG.Done()
	w.Close()
	m := w.Metrics()
	require.LessOrEqual(t, float64(30), m.SyncQueueLen.Mean())
	syncLatencyMicros.Write(writeTo)
	// Allow for some inaccuracy in sleep and for two syncs, one of which was
	// fast.
	require.LessOrEqual(t, float64(syncLatency/(2*time.Microsecond)),
		valueAtQuantileWindowed(writeTo.Histogram, 90))
	require.LessOrEqual(t, int64(syncLatency/2), int64(m.WriteThroughput.WorkDuration))
}

func valueAtQuantileWindowed(histogram *prometheusgo.Histogram, q float64) float64 {
	buckets := histogram.Bucket
	n := float64(*histogram.SampleCount)
	if n == 0 {
		return 0
	}

	// NB: The 0.5 is added for rounding purposes; it helps in cases where
	// SampleCount is small.
	rank := uint64(((q / 100) * n) + 0.5)

	// Since we are missing the +Inf bucket, CumulativeCounts may never exceed
	// rank. By omitting the highest bucket we have from the search, the failed
	// search will land on that last bucket and we don't have to do any special
	// checks regarding landing on a non-existent bucket.
	b := sort.Search(len(buckets)-1, func(i int) bool { return *buckets[i].CumulativeCount >= rank })

	var (
		bucketStart float64 // defaults to 0, which we assume is the lower bound of the smallest bucket
		bucketEnd   = *buckets[b].UpperBound
		count       = *buckets[b].CumulativeCount
	)

	// Calculate the linearly interpolated value within the bucket.
	if b > 0 {
		bucketStart = *buckets[b-1].UpperBound
		count -= *buckets[b-1].CumulativeCount
		rank -= *buckets[b-1].CumulativeCount
	}
	val := bucketStart + (bucketEnd-bucketStart)*(float64(rank)/float64(count))
	if math.IsNaN(val) || math.IsInf(val, -1) {
		return 0
	}

	// Should not extrapolate past the upper bound of the largest bucket.
	//
	// NB: SampleCount includes the implicit +Inf bucket but the
	// buckets[len(buckets)-1].UpperBound refers to the largest bucket defined
	// by us -- the client library doesn't give us access to the +Inf bucket
	// which Prometheus uses under the hood.
	// With a high enough quantile, the val computed above can surpass the
	// upper bound of the largest bucket. Using that interpolated value feels
	// wrong since we'd be extrapolating. Also, for specific metrics if we see
	// our q99 values to be hitting the top-most bucket boundary, that's an
	// indication for us to choose better buckets for more accuracy. It's also
	// worth noting that the prometheus client library does the same thing when
	// the resulting value is in the +Inf bucket, whereby they return the upper
	// bound of the second last bucket -- see [1].
	//
	// [1]: https://github.com/prometheus/prometheus/blob/d9162189/promql/quantile.go#L103.
	if val > *buckets[len(buckets)-1].UpperBound {
		return *buckets[len(buckets)-1].UpperBound
	}

	return val
}

// TestQueueWALBlocks tests queueing many un-flushed WAL blocks when syncing is
// blocked.
func TestQueueWALBlocks(t *testing.T) {
	blockWriteCh := make(chan struct{}, 1)
	f := errorfs.WrapFile(vfstest.DiscardFile, errorfs.InjectorFunc(func(op errorfs.Op, path string) error {
		if op == errorfs.OpFileWrite {
			<-blockWriteCh
		}
		return nil
	}))
	w := NewLogWriter(f, 0, LogWriterConfig{
		WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
	})
	const numBlocks = 1024
	var b [blockSize]byte
	var logSize int64
	for i := 0; i < numBlocks; i++ {
		var err error
		logSize, err = w.SyncRecord(b[:], nil, nil)
		if err != nil {
			t.Fatal(err)
		}
	}
	close(blockWriteCh)
	require.NoError(t, w.Close())

	m := w.Metrics()
	t.Logf("LogSize is %s", humanize.Bytes.Int64(logSize))
	t.Logf("Mean pending buffer len is %.2f", m.PendingBufferLen.Mean())
	require.GreaterOrEqual(t, logSize, int64(numBlocks*blockSize))
}

// BenchmarkQueueWALBlocks exercises queueing within the LogWriter. It can be
// useful to measure allocations involved when flushing is slow enough to
// accumulate a large backlog of queued blocks.
func BenchmarkQueueWALBlocks(b *testing.B) {
	const dataVolume = 64 << 20 /* 64 MB */
	for _, writeSize := range []int64{64, 512, 1024, 2048, 32768} {
		b.Run(fmt.Sprintf("record-size=%s", humanize.Bytes.Int64(writeSize)), func(b *testing.B) {
			record := make([]byte, writeSize)
			numRecords := int(dataVolume / writeSize)

			for j := 0; j < b.N; j++ {
				b.StopTimer()
				blockWriteCh := make(chan struct{}, 1)
				f := errorfs.WrapFile(vfstest.DiscardFile, errorfs.InjectorFunc(func(op errorfs.Op, path string) error {
					if op == errorfs.OpFileWrite {
						<-blockWriteCh
					}
					return nil
				}))
				w := NewLogWriter(f, 0, LogWriterConfig{
					WALFsyncLatency: prometheus.NewHistogram(prometheus.HistogramOpts{}),
				})

				b.StartTimer()
				for n := numRecords; n > 0; n-- {
					if _, err := w.SyncRecord(record[:], nil, nil); err != nil {
						b.Fatal(err)
					}
				}
				b.StopTimer()

				b.SetBytes(dataVolume)
				close(blockWriteCh)
				require.NoError(b, w.Close())
			}
		})
	}
}
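
// TestValueAtQuantileWindowedExample is an illustrative sketch added by the
// editor (not part of the original test suite; the test name is invented). It
// walks through the linear interpolation performed by valueAtQuantileWindowed
// on a hand-built histogram. With 100 samples, 50 in the (0, 1ms] bucket and
// 50 in the (1ms, 2ms] bucket, the requested p75 falls in the second bucket:
// rank = 0.75*100 + 0.5 = 75 (truncated), in-bucket rank = 75-50 = 25 out of
// 50, so the interpolated value is 1ms + (2ms-1ms)*(25/50) = 1.5ms.
func TestValueAtQuantileWindowedExample(t *testing.T) {
	ub1, ub2 := float64(time.Millisecond), float64(2*time.Millisecond)
	c1, c2 := uint64(50), uint64(100)
	n := uint64(100)
	h := &prometheusgo.Histogram{
		SampleCount: &n,
		Bucket: []*prometheusgo.Bucket{
			{UpperBound: &ub1, CumulativeCount: &c1},
			{UpperBound: &ub2, CumulativeCount: &c2},
		},
	}
	require.InDelta(t, float64(1500*time.Microsecond), valueAtQuantileWindowed(h, 75), 1.0)

	// Samples that landed in the implicit +Inf bucket are reflected in
	// SampleCount but not in any finite bucket, so a high quantile would
	// interpolate past 2ms; valueAtQuantileWindowed clamps the result to the
	// upper bound of the largest finite bucket instead.
	nWithInf := uint64(120)
	h.SampleCount = &nWithInf
	require.Equal(t, ub2, valueAtQuantileWindowed(h, 99))
}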