github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/cmd/pebble/write_bench.go

// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/cockroachdb/pebble/internal/ackseq"
	"github.com/cockroachdb/pebble/internal/randvar"
	"github.com/cockroachdb/pebble/internal/rate"
	"github.com/spf13/cobra"
)

// The following constants match the values that Cockroach uses in Admission
// Control at the time of writing.
// See: https://github.com/cockroachdb/cockroach/blob/cb5d5108a7705eac7be82bc7f0f8b6f4dc825b96/pkg/util/admission/granter.go#L1212-L1229
const (
	defaultL0FileLimit     = 1000
	defaultL0SubLevelLimit = 20
)

var writeBenchConfig struct {
	batch              *randvar.Flag
	keys               string
	values             *randvar.BytesFlag
	concurrency        int
	rateStart          int
	incBase            int
	testPeriod         time.Duration
	cooloffPeriod      time.Duration
	targetL0Files      int
	targetL0SubLevels  int
	maxRateDipFraction float64
	debug              bool
}

var writeBenchCmd = &cobra.Command{
	Use:   "write <dir>",
	Short: "Run YCSB F to find a sustainable write throughput",
	Long: `
Run YCSB F (100% writes) at varying levels of sustained write load (ops/sec) to
determine an optimal sustainable write throughput.

The benchmark works by maintaining a fixed amount of write load on the DB for a
fixed amount of time. If the database can handle the sustained load - determined
by a heuristic that takes into account the number of files in L0, the number of
L0 sub-levels, and whether the DB has encountered a write stall (i.e. measured
load on the DB drops to zero) - the load on the DB is increased.

Load increases exponentially from an initial load. If the DB fails the heuristic
at the given write load, the load on the DB is paused for a period of time (the
cool-off period) before returning to the last value at which the DB could handle
the load. The exponent is then reset and the process repeats from this new
initial value. This allows the benchmark to converge on and oscillate around the
optimal write load.

The values of load at which the DB passes and fails the heuristic are maintained
over the duration of the benchmark. On completion of the benchmark, an "optimal"
value is computed. The optimal value is computed as the value that minimizes the
mis-classification of the recorded "passes" and "fails". This can be visualized
as a point on the x-axis that separates the passes and fails into the left and
right half-planes, minimizing the number of fails that fall to the left of this
point (i.e. mis-classified fails) and the number of passes that fall to the
right (i.e. mis-classified passes).

The resultant "optimal sustained write load" value provides an estimate of the
write load that the DB can sustain without failing the target heuristic.

A typical invocation of the benchmark is as follows:

  pebble bench write [PATH] --wipe -c 1024 -d 8h --rate-start 30000 --debug
`,
	Args: cobra.ExactArgs(1),
	RunE: runWriteBenchmark,
}
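
// The analysis described in the command help above (computing the "optimal"
// value from the recorded passes and fails) is not performed in this file.
// The following is a minimal sketch, under the assumption that the passing and
// failing rates have been collected into slices, of how such a value could be
// derived: each candidate threshold is scored by the number of fails at or
// below it plus the number of passes above it, and the candidate with the
// lowest score wins. The function name and scoring details are illustrative,
// not the canonical implementation.
func optimalRateSketch(passes, fails []int) int {
	candidates := append(append([]int{}, passes...), fails...)
	best, bestScore := 0, len(candidates)+1
	for _, c := range candidates {
		score := 0
		for _, f := range fails {
			if f <= c {
				score++ // A mis-classified fail: it falls to the left of the threshold.
			}
		}
		for _, p := range passes {
			if p > c {
				score++ // A mis-classified pass: it falls to the right of the threshold.
			}
		}
		if score < bestScore {
			best, bestScore = c, score
		}
	}
	return best
}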

func init() {
	initWriteBench(writeBenchCmd)
}

func initWriteBench(cmd *cobra.Command) {
	// Default values for custom flags.
	writeBenchConfig.batch = randvar.NewFlag("1")
	writeBenchConfig.values = randvar.NewBytesFlag("1000")

	cmd.Flags().Var(
		writeBenchConfig.batch, "batch",
		"batch size distribution [{zipf,uniform}:]min[-max]")
	cmd.Flags().StringVar(
		&writeBenchConfig.keys, "keys", "zipf", "latest, uniform, or zipf")
	cmd.Flags().Var(
		writeBenchConfig.values, "values",
		"value size distribution [{zipf,uniform}:]min[-max][/<target-compression>]")
	cmd.Flags().IntVarP(
		&writeBenchConfig.concurrency, "concurrency", "c",
		1, "number of concurrent workers")
	cmd.Flags().IntVar(
		&writeBenchConfig.rateStart, "rate-start",
		1000, "starting write load (ops/sec)")
	cmd.Flags().IntVar(
		&writeBenchConfig.incBase, "rate-inc-base",
		100, "increment / decrement base")
	cmd.Flags().DurationVar(
		&writeBenchConfig.testPeriod, "test-period",
		60*time.Second, "time to run at a given write load")
	cmd.Flags().DurationVar(
		&writeBenchConfig.cooloffPeriod, "cooloff-period",
		30*time.Second, "time to pause write load after a failure")
	cmd.Flags().IntVar(
		&writeBenchConfig.targetL0Files, "l0-files",
		defaultL0FileLimit, "target L0 file count")
	cmd.Flags().IntVar(
		&writeBenchConfig.targetL0SubLevels, "l0-sublevels",
		defaultL0SubLevelLimit, "target L0 sublevel count")
	cmd.Flags().BoolVarP(
		&wipe, "wipe", "w", false, "wipe the database before starting")
	cmd.Flags().Float64Var(
		&writeBenchConfig.maxRateDipFraction, "max-rate-dip-fraction", 0.1,
		"fraction by which the actual rate may dip below the desired rate before a test-run is marked as failed")
	cmd.Flags().BoolVar(
		&writeBenchConfig.debug, "debug", false, "print benchmark debug information")
}
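
// For illustration only (not part of the original file), the distribution
// flags accept specs matching the help strings above, e.g.:
//
//	pebble bench write /mnt/data --batch zipf:1-4 --values uniform:512-2048 --keys uniform
//
// The path and parameter values are made up.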

// writeBenchResult contains the results of a test run at a given rate. The
// independent variable is the rate (in ops/sec) and the dependent variable is
// whether the test passed or failed. Additional metadata associated with the
// test run is also captured.
type writeBenchResult struct {
	name     string
	rate     int           // The rate at which the test is currently running.
	passed   bool          // Whether the test passed at this rate.
	elapsed  time.Duration // The total elapsed time of the test.
	bytes    uint64        // The size of the LSM.
	levels   int           // The number of levels occupied in the LSM.
	writeAmp float64       // The write amplification.
}

// String implements fmt.Stringer, printing a raw benchmark line. These lines
// are used when performing analysis on a given benchmark run.
func (r writeBenchResult) String() string {
	return fmt.Sprintf("BenchmarkRaw%s %d ops/sec %v pass %s elapsed %d bytes %d levels %.2f writeAmp",
		r.name,
		r.rate,
		r.passed,
		r.elapsed,
		r.bytes,
		r.levels,
		r.writeAmp,
	)
}
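
// The function below is an illustrative sketch (not part of the original file)
// showing the shape of the raw benchmark line emitted for a hypothetical
// passing run; all of the values are made up.
func exampleWriteBenchResultLine() {
	r := writeBenchResult{
		name:     "write/values=1000",
		rate:     30000,
		passed:   true,
		elapsed:  time.Minute,
		bytes:    64 << 30, // A 64 GiB LSM.
		levels:   5,
		writeAmp: 7.5,
	}
	// Prints:
	// BenchmarkRawwrite/values=1000 30000 ops/sec true pass 1m0s elapsed 68719476736 bytes 5 levels 7.50 writeAmp
	fmt.Println(r)
}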

func runWriteBenchmark(_ *cobra.Command, args []string) error {
	const workload = "F" // 100% inserts.
	var (
		writers      []*pauseWriter
		writersWg    *sync.WaitGroup // Tracks completion of all pauseWriters.
		cooloff      bool            // Whether cool-off is currently in effect.
		streak       int             // The number of successive passes.
		clockStart   time.Time       // Start time for the current load.
		cooloffStart time.Time       // When cool-off was enabled.
		stack        []int           // Stack of passing load values.
		pass, fail   []int           // Values of load that pass and fail, respectively.
		rateAcc      float64         // Accumulator of measured rates for a single test run.
	)

	desiredRate := writeBenchConfig.rateStart
	incBase := writeBenchConfig.incBase
	weights, err := ycsbParseWorkload(workload)
	if err != nil {
		return err
	}

	keyDist, err := ycsbParseKeyDist(writeBenchConfig.keys)
	if err != nil {
		return err
	}
	batchDist := writeBenchConfig.batch
	valueDist := writeBenchConfig.values

	// Construct a new YCSB F benchmark with the configured values.
	y := newYcsb(weights, keyDist, batchDist, nil /* scans */, valueDist)
	y.keyNum = ackseq.New(0)

	setLimit := func(l int) {
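		// Split the limit evenly across the writers. For example (illustrative
		// numbers), with --concurrency=1024 and a desired load of 30,000
		// ops/sec, each writer is limited to roughly 29.3 ops/sec.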
		perWriterRate := float64(l) / float64(len(writers))
		for _, w := range writers {
			w.setRate(perWriterRate)
		}
	}

	// Function closure to run on test-run failure.
	onTestFail := func(r writeBenchResult, cancel func()) {
		fail = append(fail, desiredRate)

		// Emit a benchmark raw datapoint.
		fmt.Println(r)

		// We failed at the current load; we have two options:

		// a) No room to backtrack. We're done.
		if len(stack) == 0 {
			debugPrint("no room to backtrack; exiting ...\n")
			cancel()
			writersWg.Wait()
			return
		}

		// b) We still have room to backtrack. Reduce the load to the
		// last known passing value.
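		// For example (illustrative numbers), with --rate-start 30000 and
		// --rate-inc-base 100, a run that passed at 30,000 and 30,100 ops/sec
		// but failed at 30,300 backs off to 30,100, and the exponent is reset
		// so that the next increase after a pass is again 100.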
		desiredRate, stack = stack[len(stack)-1], stack[:len(stack)-1]
		setLimit(desiredRate)

		// Enter the cool-off period.
		cooloff = true
		var wg sync.WaitGroup
		for _, w := range writers {
			// With a large number of writers, pausing synchronously can
			// take a material amount of time. Instead, pause the
			// writers in parallel in the background, and wait for all
			// to complete before continuing.
			wg.Add(1)
			go func(writer *pauseWriter) {
				writer.pause()
				wg.Done()
			}(w)
		}
		wg.Wait()

		// Reset the counters and clocks.
		streak = 0
		rateAcc = 0
		cooloffStart = time.Now()
		clockStart = time.Now()
		debugPrint("Fail. Pausing writers for cool-off period.\n")
		debugPrint(fmt.Sprintf("new rate=%d\npasses=%v\nfails=%v\nstack=%v\n",
			desiredRate, pass, fail, stack))
	}

	// Function closure to run on test-run success.
	onTestSuccess := func(r writeBenchResult) {
		streak++
		pass = append(pass, desiredRate)
		stack = append(stack, desiredRate)

		// Emit a benchmark raw datapoint.
		r.passed = true
		fmt.Println(r)

		// Increase the rate.
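		// The increment grows exponentially with the current passing streak:
		// for example (illustrative numbers), with --rate-inc-base=100 the
		// rate grows by 100 after the first consecutive pass, 200 after the
		// second, 400 after the third, and so on, until a failure resets the
		// streak.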
		desiredRate = desiredRate + incBase*(1<<(streak-1))
		setLimit(desiredRate)

		// Restart the test.
		rateAcc = 0
		clockStart = time.Now()

		debugPrint(fmt.Sprintf("Pass.\nnew rate=%d\npasses=%v\nfails=%v\nstreak=%d\nstack=%v\n",
			desiredRate, pass, fail, streak, stack))
	}

	name := fmt.Sprintf("write/values=%s", writeBenchConfig.values)
	ctx, cancel := context.WithCancel(context.Background())
	runTest(args[0], test{
		init: func(db DB, wg *sync.WaitGroup) {
			y.db = db
			writersWg = wg

			// Spawn the writers.
			for i := 0; i < writeBenchConfig.concurrency; i++ {
				writer := newPauseWriter(y, float64(desiredRate))
				writers = append(writers, writer)
				writersWg.Add(1)
				go writer.run(ctx, wg)
			}
			setLimit(desiredRate)

			// Start the clock on the current load.
			clockStart = time.Now()
		},
		tick: func(elapsed time.Duration, i int) {
			m := y.db.Metrics()
			if i%20 == 0 {
				if writeBenchConfig.debug && i > 0 {
					fmt.Printf("%s\n", m)
				}
				fmt.Println("___elapsed___clock___rate(desired)___rate(actual)___L0files___L0levels___levels______lsmBytes___writeAmp")
			}

			// Print the current stats.
			l0Files := m.Levels[0].NumFiles
			l0Sublevels := m.Levels[0].Sublevels
			nLevels := 0
			for _, l := range m.Levels {
				if l.BytesIn > 0 {
					nLevels++
				}
			}
			lsmBytes := m.DiskSpaceUsage()
			total := m.Total()
			writeAmp := (&total).WriteAmp()

			var currRate float64
			var stalled bool
			y.reg.Tick(func(tick histogramTick) {
				h := tick.Hist
				currRate = float64(h.TotalCount()) / tick.Elapsed.Seconds()
				stalled = !cooloff && currRate == 0
			})
			rateAcc += currRate

			// The heuristic by which the DB can sustain a given write load is
			// determined by whether the DB, for the configured window of time:
			// 1) did not encounter a write stall (i.e. write load fell to
			//    zero),
			// 2) kept the number of files in L0 at or below the target, and
			// 3) kept the number of L0 sub-levels at or below the target.
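			// For example (illustrative numbers), with the default targets of
			// 1000 L0 files and 20 L0 sub-levels, a tick that observes 25
			// sub-levels fails the heuristic even if the write load never
			// stalled.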
			failed := stalled ||
				int(l0Files) > writeBenchConfig.targetL0Files ||
				int(l0Sublevels) > writeBenchConfig.targetL0SubLevels

			// Print the result for this tick.
			fmt.Printf("%10s %7s %15d %14.1f %9d %10d %8d %13d %10.2f\n",
				time.Duration(elapsed.Seconds()+0.5)*time.Second,
				time.Duration(time.Since(clockStart).Seconds()+0.5)*time.Second,
				desiredRate,
				currRate,
				l0Files,
				l0Sublevels,
				nLevels,
				lsmBytes,
				writeAmp,
			)

			// If we're in cool-off mode, allow it to complete before resuming
			// writing.
			if cooloff {
				if time.Since(cooloffStart) < writeBenchConfig.cooloffPeriod {
					return
				}
				debugPrint("ending cool-off\n")

				// Else, resume writing.
				cooloff = false
				for _, w := range writers {
					w.unpause()
				}
				clockStart = time.Now()

				return
			}

			r := writeBenchResult{
				name:     name,
				rate:     desiredRate,
				elapsed:  time.Duration(elapsed.Seconds()+0.5) * time.Second,
				bytes:    lsmBytes,
				levels:   nLevels,
				writeAmp: writeAmp,
			}

			if failed {
				onTestFail(r, cancel)
				return
			}

			// Else, the DB could handle the current load. We only increase
			// the load after a fixed amount of time at this load has elapsed.
			testElapsed := time.Since(clockStart)
			if testElapsed < writeBenchConfig.testPeriod {
				// This test-run still has time on the clock.
				return
			}

			// This test-run has completed.

			// If the average rate over the test is less than the desired rate,
			// we mark this test-run as a failure. This handles cases where we
			// encounter a bottleneck that limits write throughput but would
			// otherwise incorrectly mark the test as passed.
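			// For example (illustrative numbers, assuming ticks arrive once
			// per second so that rateAcc approximates the average measured
			// rate times the elapsed seconds): with a desired rate of 30,000
			// ops/sec and --max-rate-dip-fraction=0.1, an average measured
			// rate below roughly 27,000 ops/sec marks the run as failed.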
			diff := 1 - rateAcc/(float64(desiredRate)*testElapsed.Seconds())
			if diff > writeBenchConfig.maxRateDipFraction {
				if writeBenchConfig.debug {
					debugPrint(fmt.Sprintf(
						"difference in rates (%.2f) exceeded threshold (%.2f); marking test as failed\n",
						diff, writeBenchConfig.maxRateDipFraction,
					))
				}
				onTestFail(r, cancel)
				return
			}

			// Mark this test-run as passed.
			onTestSuccess(r)
		},
		done: func(elapsed time.Duration) {
			// Print final analysis.
			var total int64
			y.reg.Tick(func(tick histogramTick) {
				total = tick.Cumulative.TotalCount()
			})
			fmt.Println("___elapsed___ops(total)")
			fmt.Printf("%10s %12d\n", elapsed.Truncate(time.Second), total)
		},
	})

	return nil
}

// debugPrint prints a debug line to stdout if debug logging is enabled via the
// --debug flag.
func debugPrint(s string) {
	if !writeBenchConfig.debug {
		return
	}
	fmt.Print("DEBUG: " + s)
}

// pauseWriter issues load against a pebble instance, and can be paused on
// demand to allow the DB to recover.
type pauseWriter struct {
	y        *ycsb
	limiter  *rate.Limiter
	pauseC   chan struct{}
	unpauseC chan struct{}
}

// newPauseWriter returns a new pauseWriter.
func newPauseWriter(y *ycsb, initialRate float64) *pauseWriter {
	// Set the burst rate for the limiter to the lowest sensible value to
	// prevent excessive bursting. Note that a burst of zero effectively
	// disables the rate limiter, as a wait time of +Inf is returned from all
	// calls, and `wait(l *rate.Limiter)` will not sleep in this case.
	const burst = 1
	return &pauseWriter{
		y:        y,
		limiter:  rate.NewLimiter(float64(initialRate), burst),
		pauseC:   make(chan struct{}),
		unpauseC: make(chan struct{}),
	}
}

// run starts the pauseWriter, issuing load against the DB.
func (w *pauseWriter) run(ctx context.Context, wg *sync.WaitGroup) {
	defer wg.Done()

	buf := &ycsbBuf{rng: randvar.NewRand()}
	hist := w.y.reg.Register("insert")
	for {
		select {
		case <-ctx.Done():
			return
		case <-w.pauseC:
			// Hold the goroutine here until we unpause.
			<-w.unpauseC
		default:
			wait(w.limiter)
			start := time.Now()
			w.y.insert(w.y.db, buf)
			hist.Record(time.Since(start))
		}
	}
}

// pause signals that the writer should pause after the current operation.
func (w *pauseWriter) pause() {
	w.pauseC <- struct{}{}
}

// unpause unpauses the writer.
func (w *pauseWriter) unpause() {
	w.unpauseC <- struct{}{}
}

// setRate sets the rate limit for this writer.
func (w *pauseWriter) setRate(r float64) {
	w.limiter.SetRate(r)
}
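
// The function below is an illustrative sketch (not part of the original file)
// of how a single pauseWriter is driven: pause() blocks until the writer's run
// loop receives on pauseC, after which the loop parks on unpauseC until
// unpause() is called. The ycsb instance and the rates used here are assumed
// to come from the caller and are made up.
func examplePauseWriterUsage(ctx context.Context, y *ycsb) {
	var wg sync.WaitGroup
	w := newPauseWriter(y, 1000 /* initial ops/sec */)
	wg.Add(1)
	go w.run(ctx, &wg)

	w.setRate(2000)         // Adjust the per-writer rate limit.
	w.pause()               // Blocks until the run loop acknowledges the pause.
	time.Sleep(time.Second) // Let the DB recover while the writer is parked.
	w.unpause()             // Resume issuing load.

	wg.Wait() // Returns once ctx is cancelled and the run loop exits.
}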