github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/tool/db_io_bench.go (about)

     1  // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package tool
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  	"io"
    11  	"math"
    12  	"math/rand"
    13  	"sort"
    14  	"strconv"
    15  	"strings"
    16  	"sync"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/errors"
    20  	"github.com/cockroachdb/pebble"
    21  	"github.com/cockroachdb/pebble/internal/base"
    22  	"github.com/cockroachdb/pebble/objstorage"
    23  	"github.com/spf13/cobra"
    24  )
    25  
// benchIO describes one benchmark read: which object to read, at what
// offset, and how many bytes. The same struct also carries the result
// (elapsed time) once the IO has been performed.
type benchIO struct {
	// readableIdx indexes into the readables slice passed to
	// genBenchIOs/performIOs, identifying the object to read from.
	readableIdx int
	// ofs is the byte offset of the read within the object; genBenchIOs
	// aligns it to size.
	ofs         int64
	// size is the number of bytes to read (at most maxIOSize).
	size        int
	// elapsed time for the IO, filled out by performIOs.
	elapsed time.Duration
}
    33  
    34  const maxIOSize = 1024 * 1024
    35  
    36  // runIOBench runs an IO benchmark against the current sstables of a database.
    37  // The workload is random IO, with various IO sizes. The main goal of the
    38  // benchmark is to establish the relationship between IO size and latency,
    39  // especially against shared object storage.
    40  func (d *dbT) runIOBench(cmd *cobra.Command, args []string) {
    41  	stdout := cmd.OutOrStdout()
    42  
    43  	ioSizes, err := parseIOSizes(d.ioSizes)
    44  	if err != nil {
    45  		fmt.Fprintf(stdout, "error parsing io-sizes: %s\n", err)
    46  		return
    47  	}
    48  
    49  	db, err := d.openDB(args[0])
    50  	if err != nil {
    51  		fmt.Fprintf(stdout, "%s\n", err)
    52  		return
    53  	}
    54  	defer d.closeDB(stdout, db)
    55  
    56  	readables, err := d.openBenchTables(db)
    57  	if err != nil {
    58  		fmt.Fprintf(stdout, "%s\n", err)
    59  		return
    60  	}
    61  
    62  	defer func() {
    63  		for _, r := range readables {
    64  			r.Close()
    65  		}
    66  	}()
    67  
    68  	ios := genBenchIOs(stdout, readables, d.ioCount, ioSizes)
    69  
    70  	levels := "L5,L6"
    71  	if d.allLevels {
    72  		levels = "all"
    73  	}
    74  	fmt.Fprintf(stdout, "IO count: %d  Parallelism: %d  Levels: %s\n", d.ioCount, d.ioParallelism, levels)
    75  
    76  	var wg sync.WaitGroup
    77  	wg.Add(d.ioParallelism)
    78  	remainingIOs := ios
    79  	for i := 0; i < d.ioParallelism; i++ {
    80  		// We want to distribute the IOs among d.ioParallelism goroutines. At each
    81  		// step, we look at the number of IOs remaining and take the average (across
    82  		// the goroutines that are left); this deals with any rounding issues.
    83  		n := len(remainingIOs) / (d.ioParallelism - i)
    84  		go func(workerIdx int, ios []benchIO) {
    85  			defer wg.Done()
    86  			if err := performIOs(readables, ios); err != nil {
    87  				fmt.Fprintf(stdout, "worker %d encountered error: %v", workerIdx, err)
    88  			}
    89  		}(i, remainingIOs[:n])
    90  		remainingIOs = remainingIOs[n:]
    91  	}
    92  	wg.Wait()
    93  
    94  	elapsed := make([]time.Duration, d.ioCount)
    95  	for _, ioSize := range ioSizes {
    96  		elapsed = elapsed[:0]
    97  		for i := range ios {
    98  			if ios[i].size == ioSize {
    99  				elapsed = append(elapsed, ios[i].elapsed)
   100  			}
   101  		}
   102  		fmt.Fprintf(stdout, "%4dKB  --  %s\n", ioSize/1024, getStats(elapsed))
   103  	}
   104  }
   105  
   106  // genBenchIOs generates <count> IOs for each given size. All IOs (across all
   107  // sizes) are in random order.
   108  func genBenchIOs(
   109  	stdout io.Writer, readables []objstorage.Readable, count int, sizes []int,
   110  ) []benchIO {
   111  	// size[i] is the size of the object, in blocks of maxIOSize.
   112  	size := make([]int, len(readables))
   113  	// sum[i] is the sum (size[0] + ... + size[i]).
   114  	sum := make([]int, len(readables))
   115  	total := 0
   116  	for i, r := range readables {
   117  		size[i] = int(r.Size() / maxIOSize)
   118  		total += size[i]
   119  		sum[i] = total
   120  	}
   121  	fmt.Fprintf(stdout, "Opened %d objects; total size %d MB.\n", len(readables), total*maxIOSize/(1024*1024))
   122  
   123  	// To avoid a lot of overlap between the reads, the total size should be a
   124  	// factor larger than the size we will actually read (for the largest IO
   125  	// size).
   126  	const sizeFactor = 2
   127  	if total*maxIOSize < count*sizes[len(sizes)-1]*sizeFactor {
   128  		fmt.Fprintf(stdout, "Warning: store too small for the given IO count and sizes.\n")
   129  	}
   130  
   131  	// Choose how many IOs we do for each object, by selecting a random block
   132  	// across all file blocks.
   133  	// The choice of objects will be the same across all IO sizes.
   134  	b := make([]int, count)
   135  	for i := range b {
   136  		b[i] = rand.Intn(total)
   137  	}
   138  	// For each b[i], find the index such that sum[idx-1] <= b < sum[idx].
   139  	// Sorting b makes this easier: we can "merge" the sorted arrays b and sum.
   140  	sort.Ints(b)
   141  	rIdx := make([]int, count)
   142  	currIdx := 0
   143  	for i := range b {
   144  		for b[i] >= sum[currIdx] {
   145  			currIdx++
   146  		}
   147  		rIdx[i] = currIdx
   148  	}
   149  
   150  	res := make([]benchIO, 0, count*len(sizes))
   151  	for _, ioSize := range sizes {
   152  		for _, idx := range rIdx {
   153  			// Random ioSize aligned offset.
   154  			ofs := ioSize * rand.Intn(size[idx]*maxIOSize/ioSize)
   155  
   156  			res = append(res, benchIO{
   157  				readableIdx: idx,
   158  				ofs:         int64(ofs),
   159  				size:        ioSize,
   160  			})
   161  		}
   162  	}
   163  	rand.Shuffle(len(res), func(i, j int) {
   164  		res[i], res[j] = res[j], res[i]
   165  	})
   166  	return res
   167  }
   168  
   169  // openBenchTables opens the sstables for the benchmark and returns them as a
   170  // list of Readables.
   171  //
   172  // By default, only L5/L6 sstables are used; all levels are used if the
   173  // allLevels flag is set.
   174  //
   175  // Note that only sstables that are at least maxIOSize (1MB) are used.
   176  func (d *dbT) openBenchTables(db *pebble.DB) ([]objstorage.Readable, error) {
   177  	tables, err := db.SSTables()
   178  	if err != nil {
   179  		return nil, err
   180  	}
   181  	startLevel := 5
   182  	if d.allLevels {
   183  		startLevel = 0
   184  	}
   185  
   186  	var nums []base.DiskFileNum
   187  	numsMap := make(map[base.DiskFileNum]struct{})
   188  	for l := startLevel; l < len(tables); l++ {
   189  		for _, t := range tables[l] {
   190  			n := t.BackingSSTNum.DiskFileNum()
   191  			if _, ok := numsMap[n]; !ok {
   192  				nums = append(nums, n)
   193  				numsMap[n] = struct{}{}
   194  			}
   195  		}
   196  	}
   197  
   198  	p := db.ObjProvider()
   199  	var res []objstorage.Readable
   200  	for _, n := range nums {
   201  		r, err := p.OpenForReading(context.Background(), base.FileTypeTable, n, objstorage.OpenOptions{})
   202  		if err != nil {
   203  			for _, r := range res {
   204  				_ = r.Close()
   205  			}
   206  			return nil, err
   207  		}
   208  		if r.Size() < maxIOSize {
   209  			_ = r.Close()
   210  			continue
   211  		}
   212  		res = append(res, r)
   213  	}
   214  	if len(res) == 0 {
   215  		return nil, errors.Errorf("no sstables (with size at least %d)", maxIOSize)
   216  	}
   217  
   218  	return res, nil
   219  }
   220  
   221  // parseIOSizes parses a comma-separated list of IO sizes, in KB.
   222  func parseIOSizes(sizes string) ([]int, error) {
   223  	var res []int
   224  	for _, s := range strings.Split(sizes, ",") {
   225  		n, err := strconv.Atoi(s)
   226  		if err != nil {
   227  			return nil, err
   228  		}
   229  		ioSize := n * 1024
   230  		if ioSize > maxIOSize {
   231  			return nil, errors.Errorf("IO sizes over %d not supported", maxIOSize)
   232  		}
   233  		if maxIOSize%ioSize != 0 {
   234  			return nil, errors.Errorf("IO size must be a divisor of %d", maxIOSize)
   235  		}
   236  		res = append(res, ioSize)
   237  	}
   238  	if len(res) == 0 {
   239  		return nil, errors.Errorf("no IO sizes specified")
   240  	}
   241  	sort.Ints(res)
   242  	return res, nil
   243  }
   244  
   245  // performIOs performs the given list of IOs and populates the elapsed fields.
   246  func performIOs(readables []objstorage.Readable, ios []benchIO) error {
   247  	ctx := context.Background()
   248  	rh := make([]objstorage.ReadHandle, len(readables))
   249  	for i := range rh {
   250  		rh[i] = readables[i].NewReadHandle(ctx)
   251  	}
   252  	defer func() {
   253  		for i := range rh {
   254  			rh[i].Close()
   255  		}
   256  	}()
   257  
   258  	buf := make([]byte, maxIOSize)
   259  	startTime := time.Now()
   260  	var firstErr error
   261  	var nOtherErrs int
   262  	for i := range ios {
   263  		if err := rh[ios[i].readableIdx].ReadAt(ctx, buf[:ios[i].size], ios[i].ofs); err != nil {
   264  			if firstErr == nil {
   265  				firstErr = err
   266  			} else {
   267  				nOtherErrs++
   268  			}
   269  		}
   270  		endTime := time.Now()
   271  		ios[i].elapsed = endTime.Sub(startTime)
   272  		startTime = endTime
   273  	}
   274  	if nOtherErrs > 0 {
   275  		return errors.Errorf("%v; plus %d more errors", firstErr, nOtherErrs)
   276  	}
   277  	return firstErr
   278  }
   279  
   280  // getStats calculates various statistics given a list of elapsed times.
   281  func getStats(d []time.Duration) string {
   282  	sort.Slice(d, func(i, j int) bool { return d[i] < d[j] })
   283  
   284  	factor := 1.0 / float64(len(d))
   285  	var mean float64
   286  	for i := range d {
   287  		mean += float64(d[i]) * factor
   288  	}
   289  	var variance float64
   290  	for i := range d {
   291  		delta := float64(d[i]) - mean
   292  		variance += delta * delta * factor
   293  	}
   294  
   295  	toStr := func(d time.Duration) string {
   296  		if d < 10*time.Millisecond {
   297  			return fmt.Sprintf("%1.2fms", float64(d)/float64(time.Millisecond))
   298  		}
   299  		if d < 100*time.Millisecond {
   300  			return fmt.Sprintf("%2.1fms", float64(d)/float64(time.Millisecond))
   301  		}
   302  		return fmt.Sprintf("%4dms", d/time.Millisecond)
   303  	}
   304  
   305  	return fmt.Sprintf(
   306  		"avg %s   stddev %s   p10 %s   p50 %s   p90 %s   p95 %s   p99 %s",
   307  		toStr(time.Duration(mean)),
   308  		toStr(time.Duration(math.Sqrt(variance))),
   309  		toStr(d[len(d)*10/100]),
   310  		toStr(d[len(d)*50/100]),
   311  		toStr(d[len(d)*90/100]),
   312  		toStr(d[len(d)*95/100]),
   313  		toStr(d[len(d)*99/100]),
   314  	)
   315  }