github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/cmd/pebble/write_bench.go

// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/cockroachdb/pebble/internal/ackseq"
	"github.com/cockroachdb/pebble/internal/randvar"
	"github.com/cockroachdb/pebble/internal/rate"
	"github.com/spf13/cobra"
)

// The following constants match the values that Cockroach uses in Admission
// Control at the time of writing.
// See: https://github.com/cockroachdb/cockroach/blob/cb5d5108a7705eac7be82bc7f0f8b6f4dc825b96/pkg/util/admission/granter.go#L1212-L1229
const (
	defaultL0FileLimit     = 1000
	defaultL0SubLevelLimit = 20
)

var writeBenchConfig struct {
	batch              *randvar.Flag
	keys               string
	values             *randvar.BytesFlag
	concurrency        int
	rateStart          int
	incBase            int
	testPeriod         time.Duration
	cooloffPeriod      time.Duration
	targetL0Files      int
	targetL0SubLevels  int
	maxRateDipFraction float64
	debug              bool
}

var writeBenchCmd = &cobra.Command{
	Use:   "write <dir>",
	Short: "Run YCSB F to find a sustainable write throughput",
	Long: `
Run YCSB F (100% writes) at varying levels of sustained write load (ops/sec) to
determine an optimal value of write throughput.

The benchmark works by maintaining a fixed amount of write load on the DB for a
fixed amount of time. If the database can handle the sustained load - determined
by a heuristic that takes into account the number of files in L0, the number of
L0 sub-levels, and whether the DB has encountered a write stall (i.e. measured
load on the DB drops to zero) - the load on the DB is increased.

Load increases exponentially from an initial load. If the DB fails the heuristic
at the given write load, the load on the DB is paused for a period of time (the
cool-off period) before returning to the last value at which the DB could handle
the load. The exponent is then reset and the process repeats from this new
initial value. This allows the benchmark to converge on and oscillate around the
optimal write load.

The values of load at which the DB passes and fails the heuristic are maintained
over the duration of the benchmark. On completion of the benchmark, an "optimal"
value is computed. The optimal value is computed as the value that minimizes the
mis-classification of the recorded "passes" and "fails". This can be visualized
as a point on the x-axis that separates the passes and fails into the left and
right half-planes, minimizing the number of fails that fall to the left of this
point (i.e. mis-classified fails) and the number of passes that fall to the
right (i.e. mis-classified passes).

The resultant "optimal sustained write load" value provides an estimate of the
write load that the DB can sustain without failing the target heuristic.

A typical invocation of the benchmark is as follows:

pebble bench write [PATH] --wipe -c 1024 -d 8h --rate-start 30000 --debug
`,
	Args: cobra.ExactArgs(1),
	RunE: runWriteBenchmark,
}

func init() {
	initWriteBench(writeBenchCmd)
}

func initWriteBench(cmd *cobra.Command) {
	// Default values for custom flags.
	writeBenchConfig.batch = randvar.NewFlag("1")
	writeBenchConfig.values = randvar.NewBytesFlag("1000")

	cmd.Flags().Var(
		writeBenchConfig.batch, "batch",
		"batch size distribution [{zipf,uniform}:]min[-max]")
	cmd.Flags().StringVar(
		&writeBenchConfig.keys, "keys", "zipf", "latest, uniform, or zipf")
	cmd.Flags().Var(
		writeBenchConfig.values, "values",
		"value size distribution [{zipf,uniform}:]min[-max][/<target-compression>]")
	cmd.Flags().IntVarP(
		&writeBenchConfig.concurrency, "concurrency", "c",
		1, "number of concurrent workers")
	cmd.Flags().IntVar(
		&writeBenchConfig.rateStart, "rate-start",
		1000, "starting write load (ops/sec)")
	cmd.Flags().IntVar(
		&writeBenchConfig.incBase, "rate-inc-base",
		100, "increment / decrement base")
	cmd.Flags().DurationVar(
		&writeBenchConfig.testPeriod, "test-period",
		60*time.Second, "time to run at a given write load")
	cmd.Flags().DurationVar(
		&writeBenchConfig.cooloffPeriod, "cooloff-period",
		30*time.Second, "time to pause write load after a failure")
	cmd.Flags().IntVar(
		&writeBenchConfig.targetL0Files, "l0-files",
		defaultL0FileLimit, "target L0 file count")
	cmd.Flags().IntVar(
		&writeBenchConfig.targetL0SubLevels, "l0-sublevels",
		defaultL0SubLevelLimit, "target L0 sublevel count")
	cmd.Flags().BoolVarP(
		&wipe, "wipe", "w", false, "wipe the database before starting")
	cmd.Flags().Float64Var(
		&writeBenchConfig.maxRateDipFraction, "max-rate-dip-fraction", 0.1,
		"fraction by which the measured rate may dip below the desired rate before a test-run is marked as failed")
	cmd.Flags().BoolVar(
		&writeBenchConfig.debug, "debug", false, "print benchmark debug information")
}

// writeBenchResult contains the results of a test run at a given rate. The
// independent variable is the rate (in ops/sec) and the dependent variable is
// whether the test passed or failed. Additional metadata associated with the
// test run is also captured.
type writeBenchResult struct {
	name     string
	rate     int           // The rate at which the test is currently running.
	passed   bool          // Was the test successful at this rate.
	elapsed  time.Duration // The total elapsed time of the test.
	bytes    uint64        // The size of the LSM.
	levels   int           // The number of levels occupied in the LSM.
	writeAmp float64       // The write amplification.
}

// String implements fmt.Stringer, printing a raw benchmark line. These lines
// are used when performing analysis on a given benchmark run.
func (r writeBenchResult) String() string {
	return fmt.Sprintf("BenchmarkRaw%s %d ops/sec %v pass %s elapsed %d bytes %d levels %.2f writeAmp",
		r.name,
		r.rate,
		r.passed,
		r.elapsed,
		r.bytes,
		r.levels,
		r.writeAmp,
	)
}

func runWriteBenchmark(_ *cobra.Command, args []string) error {
	const workload = "F" // 100% inserts.
	var (
		writers      []*pauseWriter
		writersWg    *sync.WaitGroup // Tracks completion of all pauseWriters.
		cooloff      bool            // Is cool-off enabled.
		streak       int             // The number of successive passes.
		clockStart   time.Time       // Start time for current load.
		cooloffStart time.Time       // When cool-off was enabled.
		stack        []int           // Stack of passing load values.
		pass, fail   []int           // Values of load that pass and fail, respectively.
		rateAcc      float64         // Accumulator of measured rates for a single test run.
	)

	desiredRate := writeBenchConfig.rateStart
	incBase := writeBenchConfig.incBase
	weights, err := ycsbParseWorkload(workload)
	if err != nil {
		return err
	}

	keyDist, err := ycsbParseKeyDist(writeBenchConfig.keys)
	if err != nil {
		return err
	}
	batchDist := writeBenchConfig.batch
	valueDist := writeBenchConfig.values

	// Construct a new YCSB F benchmark with the configured values.
	y := newYcsb(weights, keyDist, batchDist, nil /* scans */, valueDist)
	y.keyNum = ackseq.New(0)

	setLimit := func(l int) {
		perWriterRate := float64(l) / float64(len(writers))
		for _, w := range writers {
			w.setRate(perWriterRate)
		}
	}

	// Function closure to run on test-run failure.
	onTestFail := func(r writeBenchResult, cancel func()) {
		fail = append(fail, desiredRate)

		// Emit a benchmark raw datapoint.
		fmt.Println(r)

		// We failed at the current load; we have two options:

		// a) No room to backtrack. We're done.
		if len(stack) == 0 {
			debugPrint("no room to backtrack; exiting ...\n")
			cancel()
			writersWg.Wait()
			return
		}

		// b) We still have room to backtrack. Reduce the load to the
		// last known passing value.
		desiredRate, stack = stack[len(stack)-1], stack[:len(stack)-1]
		setLimit(desiredRate)

		// Enter the cool-off period.
		cooloff = true
		var wg sync.WaitGroup
		for _, w := range writers {
			// With a large number of writers, pausing synchronously can
			// take a material amount of time. Instead, pause the
			// writers in parallel in the background, and wait for all
			// to complete before continuing.
			wg.Add(1)
			go func(writer *pauseWriter) {
				writer.pause()
				wg.Done()
			}(w)
		}
		wg.Wait()

		// Reset the counters and clocks.
		streak = 0
		rateAcc = 0
		cooloffStart = time.Now()
		clockStart = time.Now()
		debugPrint("Fail. Pausing writers for cool-off period.\n")
		debugPrint(fmt.Sprintf("new rate=%d\npasses=%v\nfails=%v\nstack=%v\n",
			desiredRate, pass, fail, stack))
	}

	// Function closure to run on test-run success.
	onTestSuccess := func(r writeBenchResult) {
		streak++
		pass = append(pass, desiredRate)
		stack = append(stack, desiredRate)

		// Emit a benchmark raw datapoint.
		r.passed = true
		fmt.Println(r)

		// Increase the rate. The increment doubles with each successive pass
		// (e.g. with incBase=100: +100 after the first pass, +200 after the
		// second, +400 after the third, and so on).
		desiredRate = desiredRate + incBase*(1<<(streak-1))
		setLimit(desiredRate)

		// Restart the test.
		rateAcc = 0
		clockStart = time.Now()

		debugPrint(fmt.Sprintf("Pass.\nnew rate=%d\npasses=%v\nfails=%v\nstreak=%d\nstack=%v\n",
			desiredRate, pass, fail, streak, stack))
	}

	name := fmt.Sprintf("write/values=%s", writeBenchConfig.values)
	ctx, cancel := context.WithCancel(context.Background())
	runTest(args[0], test{
		init: func(db DB, wg *sync.WaitGroup) {
			y.db = db
			writersWg = wg

			// Spawn the writers.
			for i := 0; i < writeBenchConfig.concurrency; i++ {
				writer := newPauseWriter(y, float64(desiredRate))
				writers = append(writers, writer)
				writersWg.Add(1)
				go writer.run(ctx, wg)
			}
			setLimit(desiredRate)

			// Start the clock on the current load.
			clockStart = time.Now()
		},
		tick: func(elapsed time.Duration, i int) {
			m := y.db.Metrics()
			if i%20 == 0 {
				if writeBenchConfig.debug && i > 0 {
					fmt.Printf("%s\n", m)
				}
				fmt.Println("___elapsed___clock___rate(desired)___rate(actual)___L0files___L0levels___levels______lsmBytes___writeAmp")
			}

			// Print the current stats.
			l0Files := m.Levels[0].NumFiles
			l0Sublevels := m.Levels[0].Sublevels
			nLevels := 0
			for _, l := range m.Levels {
				if l.BytesIn > 0 {
					nLevels++
				}
			}
			lsmBytes := m.DiskSpaceUsage()
			total := m.Total()
			writeAmp := (&total).WriteAmp()

			var currRate float64
			var stalled bool
			y.reg.Tick(func(tick histogramTick) {
				h := tick.Hist
				currRate = float64(h.TotalCount()) / tick.Elapsed.Seconds()
				stalled = !cooloff && currRate == 0
			})
			rateAcc += currRate

			// The heuristic by which the DB can sustain a given write load is
			// determined by whether the DB, for the configured window of time:
			//   1) did not encounter a write stall (i.e. write load fell to
			//      zero),
			//   2) kept the number of files in L0 at or below the target, and
			//   3) kept the number of L0 sub-levels at or below the target.
			failed := stalled ||
				int(l0Files) > writeBenchConfig.targetL0Files ||
				int(l0Sublevels) > writeBenchConfig.targetL0SubLevels

			// Print the result for this tick.
			fmt.Printf("%10s %7s %15d %14.1f %9d %10d %8d %13d %10.2f\n",
				time.Duration(elapsed.Seconds()+0.5)*time.Second,
				time.Duration(time.Since(clockStart).Seconds()+0.5)*time.Second,
				desiredRate,
				currRate,
				l0Files,
				l0Sublevels,
				nLevels,
				lsmBytes,
				writeAmp,
			)

			// If we're in cool-off mode, allow it to complete before resuming
			// writing.
			if cooloff {
				if time.Since(cooloffStart) < writeBenchConfig.cooloffPeriod {
					return
				}
				debugPrint("ending cool-off")

				// Else, resume writing.
				cooloff = false
				for _, w := range writers {
					w.unpause()
				}
				clockStart = time.Now()

				return
			}

			r := writeBenchResult{
				name:     name,
				rate:     desiredRate,
				elapsed:  time.Duration(elapsed.Seconds()+0.5) * time.Second,
				bytes:    lsmBytes,
				levels:   nLevels,
				writeAmp: writeAmp,
			}

			if failed {
				onTestFail(r, cancel)
				return
			}

			// Else, the DB could handle the current load. We only increase the
			// load after a fixed amount of time at this load has elapsed.
			testElapsed := time.Since(clockStart)
			if testElapsed < writeBenchConfig.testPeriod {
				// This test-run still has time on the clock.
				return
			}

			// This test-run has completed.

			// If the average rate over the test is less than the desired rate,
			// mark this test-run as a failure. This handles cases where we
			// encounter a bottleneck that limits write throughput but would
			// otherwise incorrectly mark the test as passed.
			diff := 1 - rateAcc/(float64(desiredRate)*testElapsed.Seconds())
			if diff > writeBenchConfig.maxRateDipFraction {
				if writeBenchConfig.debug {
					debugPrint(fmt.Sprintf(
						"difference in rates (%.2f) exceeded threshold (%.2f); marking test as failed\n",
						diff, writeBenchConfig.maxRateDipFraction,
					))
				}
				onTestFail(r, cancel)
				return
			}

			// Mark this test-run as passed.
			onTestSuccess(r)
		},
		done: func(elapsed time.Duration) {
			// Print final analysis.
			var total int64
			y.reg.Tick(func(tick histogramTick) {
				total = tick.Cumulative.TotalCount()
			})
			fmt.Println("___elapsed___ops(total)")
			fmt.Printf("%10s %12d\n", elapsed.Truncate(time.Second), total)
		},
	})

	return nil
}

// debugPrint prints a debug line to stdout if debug logging is enabled via the
// --debug flag.
func debugPrint(s string) {
	if !writeBenchConfig.debug {
		return
	}
	fmt.Print("DEBUG: " + s)
}

// pauseWriter issues load against a pebble instance, and can be paused on
// demand to allow the DB to recover.
type pauseWriter struct {
	y        *ycsb
	limiter  *rate.Limiter
	pauseC   chan struct{}
	unpauseC chan struct{}
}

// newPauseWriter returns a new pauseWriter.
func newPauseWriter(y *ycsb, initialRate float64) *pauseWriter {
	// Set the burst rate for the limiter to the lowest sensible value to
	// prevent excessive bursting. Note that a burst of zero effectively
	// disables the rate limiter, as a wait time of +Inf is returned from all
	// calls, and `wait(l *rate.Limiter)` will not sleep in this case.
	const burst = 1
	return &pauseWriter{
		y:        y,
		limiter:  rate.NewLimiter(float64(initialRate), burst),
		pauseC:   make(chan struct{}),
		unpauseC: make(chan struct{}),
	}
}

// run starts the pauseWriter, issuing load against the DB.
func (w *pauseWriter) run(ctx context.Context, wg *sync.WaitGroup) {
	defer wg.Done()

	buf := &ycsbBuf{rng: randvar.NewRand()}
	hist := w.y.reg.Register("insert")
	for {
		select {
		case <-ctx.Done():
			return
		case <-w.pauseC:
			// Hold the goroutine here until we unpause.
			<-w.unpauseC
		default:
			wait(w.limiter)
			start := time.Now()
			w.y.insert(w.y.db, buf)
			hist.Record(time.Since(start))
		}
	}
}

// pause signals that the writer should pause after the current operation.
func (w *pauseWriter) pause() {
	w.pauseC <- struct{}{}
}

// unpause unpauses the writer.
func (w *pauseWriter) unpause() {
	w.unpauseC <- struct{}{}
}

// setRate sets the rate limit for this writer.
func (w *pauseWriter) setRate(r float64) {
	w.limiter.SetRate(r)
}
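
// Illustrative sketch, not part of write_bench.go: one way the "optimal
// sustained write load" described in the command's Long help could be derived
// from the recorded pass and fail rates. The actual analysis is performed
// outside this file on the emitted "BenchmarkRaw" lines; the function name
// optimalRate and its presence here are hypothetical.
//
// The chosen threshold x minimizes the number of mis-classified samples:
// fails that fall to the left of x plus passes that fall to the right of x.
func optimalRate(pass, fail []int) int {
	// Consider every observed rate as a candidate threshold.
	candidates := append(append([]int(nil), pass...), fail...)
	best, bestErrs := 0, len(pass)+len(fail)+1
	for _, x := range candidates {
		errs := 0
		for _, f := range fail {
			if f < x { // mis-classified fail: left of the threshold
				errs++
			}
		}
		for _, p := range pass {
			if p > x { // mis-classified pass: right of the threshold
				errs++
			}
		}
		if errs < bestErrs {
			best, bestErrs = x, errs
		}
	}
	return best
}

// For example, with pass = [1000, 2000, 2500] and fail = [3000, 3500], every
// recorded pass lies at or below 2500 and every recorded fail lies above it,
// so optimalRate returns 2500 with zero mis-classifications.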