github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/tool/db_io_bench.go (about)

     1  // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package tool
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  	"io"
    11  	"math"
    12  	"math/rand"
    13  	"slices"
    14  	"sort"
    15  	"strconv"
    16  	"strings"
    17  	"sync"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/errors"
    21  	"github.com/cockroachdb/pebble"
    22  	"github.com/cockroachdb/pebble/internal/base"
    23  	"github.com/cockroachdb/pebble/objstorage"
    24  	"github.com/spf13/cobra"
    25  )
    26  
// benchIO describes one benchmark read: which object to read, at what
// offset, and how many bytes. The IOs are generated up front (genBenchIOs)
// and executed later (performIOs).
type benchIO struct {
	// readableIdx is an index into the slice of opened Readables.
	readableIdx int
	// ofs is the byte offset within the object; always a multiple of size
	// (see genBenchIOs).
	ofs         int64
	// size is the read length in bytes; at most maxIOSize.
	size        int
	// elapsed time for the IO, filled out by performIOs.
	elapsed time.Duration
}

// maxIOSize is the largest supported IO size (and the scratch buffer size),
// in bytes. All benchmark IO sizes must be divisors of this value.
const maxIOSize = 1024 * 1024
    36  
    37  // runIOBench runs an IO benchmark against the current sstables of a database.
    38  // The workload is random IO, with various IO sizes. The main goal of the
    39  // benchmark is to establish the relationship between IO size and latency,
    40  // especially against shared object storage.
    41  func (d *dbT) runIOBench(cmd *cobra.Command, args []string) {
    42  	stdout := cmd.OutOrStdout()
    43  
    44  	ioSizes, err := parseIOSizes(d.ioSizes)
    45  	if err != nil {
    46  		fmt.Fprintf(stdout, "error parsing io-sizes: %s\n", err)
    47  		return
    48  	}
    49  
    50  	db, err := d.openDB(args[0])
    51  	if err != nil {
    52  		fmt.Fprintf(stdout, "%s\n", err)
    53  		return
    54  	}
    55  	defer d.closeDB(stdout, db)
    56  
    57  	readables, err := d.openBenchTables(db)
    58  	if err != nil {
    59  		fmt.Fprintf(stdout, "%s\n", err)
    60  		return
    61  	}
    62  
    63  	defer func() {
    64  		for _, r := range readables {
    65  			r.Close()
    66  		}
    67  	}()
    68  
    69  	ios := genBenchIOs(stdout, readables, d.ioCount, ioSizes)
    70  
    71  	levels := "L5,L6"
    72  	if d.allLevels {
    73  		levels = "all"
    74  	}
    75  	fmt.Fprintf(stdout, "IO count: %d  Parallelism: %d  Levels: %s\n", d.ioCount, d.ioParallelism, levels)
    76  
    77  	var wg sync.WaitGroup
    78  	wg.Add(d.ioParallelism)
    79  	remainingIOs := ios
    80  	for i := 0; i < d.ioParallelism; i++ {
    81  		// We want to distribute the IOs among d.ioParallelism goroutines. At each
    82  		// step, we look at the number of IOs remaining and take the average (across
    83  		// the goroutines that are left); this deals with any rounding issues.
    84  		n := len(remainingIOs) / (d.ioParallelism - i)
    85  		go func(workerIdx int, ios []benchIO) {
    86  			defer wg.Done()
    87  			if err := performIOs(readables, ios); err != nil {
    88  				fmt.Fprintf(stdout, "worker %d encountered error: %v", workerIdx, err)
    89  			}
    90  		}(i, remainingIOs[:n])
    91  		remainingIOs = remainingIOs[n:]
    92  	}
    93  	wg.Wait()
    94  
    95  	elapsed := make([]time.Duration, d.ioCount)
    96  	for _, ioSize := range ioSizes {
    97  		elapsed = elapsed[:0]
    98  		for i := range ios {
    99  			if ios[i].size == ioSize {
   100  				elapsed = append(elapsed, ios[i].elapsed)
   101  			}
   102  		}
   103  		fmt.Fprintf(stdout, "%4dKB  --  %s\n", ioSize/1024, getStats(elapsed))
   104  	}
   105  }
   106  
   107  // genBenchIOs generates <count> IOs for each given size. All IOs (across all
   108  // sizes) are in random order.
// genBenchIOs generates <count> IOs for each given size. All IOs (across all
// sizes) are in random order.
//
// Objects (and maxIOSize-aligned blocks within them) are chosen uniformly
// across the total data size, so larger objects receive proportionally more
// IOs. The same set of <count> objects is reused for every IO size, making
// results comparable across sizes.
func genBenchIOs(
	stdout io.Writer, readables []objstorage.Readable, count int, sizes []int,
) []benchIO {
	// size[i] is the size of the object, in blocks of maxIOSize.
	size := make([]int, len(readables))
	// sum[i] is the sum (size[0] + ... + size[i]).
	sum := make([]int, len(readables))
	total := 0
	for i, r := range readables {
		size[i] = int(r.Size() / maxIOSize)
		total += size[i]
		sum[i] = total
	}
	fmt.Fprintf(stdout, "Opened %d objects; total size %d MB.\n", len(readables), total*maxIOSize/(1024*1024))

	// To avoid a lot of overlap between the reads, the total size should be a
	// factor larger than the size we will actually read (for the largest IO
	// size).
	const sizeFactor = 2
	if total*maxIOSize < count*sizes[len(sizes)-1]*sizeFactor {
		fmt.Fprintf(stdout, "Warning: store too small for the given IO count and sizes.\n")
	}

	// Choose how many IOs we do for each object, by selecting a random block
	// across all file blocks.
	// The choice of objects will be the same across all IO sizes.
	b := make([]int, count)
	for i := range b {
		b[i] = rand.Intn(total)
	}
	// For each b[i], find the index such that sum[idx-1] <= b < sum[idx].
	// Sorting b makes this easier: we can "merge" the sorted arrays b and sum.
	sort.Ints(b)
	rIdx := make([]int, count)
	currIdx := 0
	for i := range b {
		// Advance currIdx until block b[i] falls within object currIdx's
		// range. Safe: every b[i] < total == sum[len(sum)-1].
		for b[i] >= sum[currIdx] {
			currIdx++
		}
		rIdx[i] = currIdx
	}

	res := make([]benchIO, 0, count*len(sizes))
	for _, ioSize := range sizes {
		for _, idx := range rIdx {
			// Random ioSize aligned offset.
			// Note: size[idx] >= 1 (openBenchTables skips objects smaller
			// than maxIOSize) and ioSize divides maxIOSize (enforced by
			// parseIOSizes), so the Intn argument is always positive.
			ofs := ioSize * rand.Intn(size[idx]*maxIOSize/ioSize)

			res = append(res, benchIO{
				readableIdx: idx,
				ofs:         int64(ofs),
				size:        ioSize,
			})
		}
	}
	// Interleave the different IO sizes (and objects) during the run.
	rand.Shuffle(len(res), func(i, j int) {
		res[i], res[j] = res[j], res[i]
	})
	return res
}
   169  
   170  // openBenchTables opens the sstables for the benchmark and returns them as a
   171  // list of Readables.
   172  //
   173  // By default, only L5/L6 sstables are used; all levels are used if the
   174  // allLevels flag is set.
   175  //
   176  // Note that only sstables that are at least maxIOSize (1MB) are used.
   177  func (d *dbT) openBenchTables(db *pebble.DB) ([]objstorage.Readable, error) {
   178  	tables, err := db.SSTables()
   179  	if err != nil {
   180  		return nil, err
   181  	}
   182  	startLevel := 5
   183  	if d.allLevels {
   184  		startLevel = 0
   185  	}
   186  
   187  	var nums []base.DiskFileNum
   188  	numsMap := make(map[base.DiskFileNum]struct{})
   189  	for l := startLevel; l < len(tables); l++ {
   190  		for _, t := range tables[l] {
   191  			n := t.BackingSSTNum.DiskFileNum()
   192  			if _, ok := numsMap[n]; !ok {
   193  				nums = append(nums, n)
   194  				numsMap[n] = struct{}{}
   195  			}
   196  		}
   197  	}
   198  
   199  	p := db.ObjProvider()
   200  	var res []objstorage.Readable
   201  	for _, n := range nums {
   202  		r, err := p.OpenForReading(context.Background(), base.FileTypeTable, n, objstorage.OpenOptions{})
   203  		if err != nil {
   204  			for _, r := range res {
   205  				_ = r.Close()
   206  			}
   207  			return nil, err
   208  		}
   209  		if r.Size() < maxIOSize {
   210  			_ = r.Close()
   211  			continue
   212  		}
   213  		res = append(res, r)
   214  	}
   215  	if len(res) == 0 {
   216  		return nil, errors.Errorf("no sstables (with size at least %d)", maxIOSize)
   217  	}
   218  
   219  	return res, nil
   220  }
   221  
   222  // parseIOSizes parses a comma-separated list of IO sizes, in KB.
   223  func parseIOSizes(sizes string) ([]int, error) {
   224  	var res []int
   225  	for _, s := range strings.Split(sizes, ",") {
   226  		n, err := strconv.Atoi(s)
   227  		if err != nil {
   228  			return nil, err
   229  		}
   230  		ioSize := n * 1024
   231  		if ioSize > maxIOSize {
   232  			return nil, errors.Errorf("IO sizes over %d not supported", maxIOSize)
   233  		}
   234  		if maxIOSize%ioSize != 0 {
   235  			return nil, errors.Errorf("IO size must be a divisor of %d", maxIOSize)
   236  		}
   237  		res = append(res, ioSize)
   238  	}
   239  	if len(res) == 0 {
   240  		return nil, errors.Errorf("no IO sizes specified")
   241  	}
   242  	sort.Ints(res)
   243  	return res, nil
   244  }
   245  
   246  // performIOs performs the given list of IOs and populates the elapsed fields.
   247  func performIOs(readables []objstorage.Readable, ios []benchIO) error {
   248  	ctx := context.Background()
   249  	rh := make([]objstorage.ReadHandle, len(readables))
   250  	for i := range rh {
   251  		rh[i] = readables[i].NewReadHandle(ctx)
   252  	}
   253  	defer func() {
   254  		for i := range rh {
   255  			rh[i].Close()
   256  		}
   257  	}()
   258  
   259  	buf := make([]byte, maxIOSize)
   260  	startTime := time.Now()
   261  	var firstErr error
   262  	var nOtherErrs int
   263  	for i := range ios {
   264  		if err := rh[ios[i].readableIdx].ReadAt(ctx, buf[:ios[i].size], ios[i].ofs); err != nil {
   265  			if firstErr == nil {
   266  				firstErr = err
   267  			} else {
   268  				nOtherErrs++
   269  			}
   270  		}
   271  		endTime := time.Now()
   272  		ios[i].elapsed = endTime.Sub(startTime)
   273  		startTime = endTime
   274  	}
   275  	if nOtherErrs > 0 {
   276  		return errors.Errorf("%v; plus %d more errors", firstErr, nOtherErrs)
   277  	}
   278  	return firstErr
   279  }
   280  
   281  // getStats calculates various statistics given a list of elapsed times.
   282  func getStats(d []time.Duration) string {
   283  	slices.Sort(d)
   284  
   285  	factor := 1.0 / float64(len(d))
   286  	var mean float64
   287  	for i := range d {
   288  		mean += float64(d[i]) * factor
   289  	}
   290  	var variance float64
   291  	for i := range d {
   292  		delta := float64(d[i]) - mean
   293  		variance += delta * delta * factor
   294  	}
   295  
   296  	toStr := func(d time.Duration) string {
   297  		if d < 10*time.Millisecond {
   298  			return fmt.Sprintf("%1.2fms", float64(d)/float64(time.Millisecond))
   299  		}
   300  		if d < 100*time.Millisecond {
   301  			return fmt.Sprintf("%2.1fms", float64(d)/float64(time.Millisecond))
   302  		}
   303  		return fmt.Sprintf("%4dms", d/time.Millisecond)
   304  	}
   305  
   306  	return fmt.Sprintf(
   307  		"avg %s   stddev %s   p10 %s   p50 %s   p90 %s   p95 %s   p99 %s",
   308  		toStr(time.Duration(mean)),
   309  		toStr(time.Duration(math.Sqrt(variance))),
   310  		toStr(d[len(d)*10/100]),
   311  		toStr(d[len(d)*50/100]),
   312  		toStr(d[len(d)*90/100]),
   313  		toStr(d[len(d)*95/100]),
   314  		toStr(d[len(d)*99/100]),
   315  	)
   316  }