// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package tool

import (
	"context"
	"fmt"
	"io"
	"math"
	"math/rand"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/spf13/cobra"
)

// benchIO describes a single read issued by the IO benchmark.
type benchIO struct {
	// readableIdx is the index of the object to read from, within the slice of
	// readables that is passed to performIOs.
	readableIdx int
	// ofs is the offset within the object at which the read starts.
	ofs int64
	// size is the number of bytes to read.
	size int
	// elapsed time for the IO, filled out by performIOs.
	elapsed time.Duration
}

// maxIOSize is the largest IO size the benchmark supports, in bytes (1MB).
// parseIOSizes rejects larger sizes and sizes that do not divide it evenly.
const maxIOSize = 1024 * 1024

// runIOBench runs an IO benchmark against the current sstables of a database.
// The workload is random IO, with various IO sizes. The main goal of the
// benchmark is to establish the relationship between IO size and latency,
// especially against shared object storage.
40 func (d *dbT) runIOBench(cmd *cobra.Command, args []string) { 41 stdout := cmd.OutOrStdout() 42 43 ioSizes, err := parseIOSizes(d.ioSizes) 44 if err != nil { 45 fmt.Fprintf(stdout, "error parsing io-sizes: %s\n", err) 46 return 47 } 48 49 db, err := d.openDB(args[0]) 50 if err != nil { 51 fmt.Fprintf(stdout, "%s\n", err) 52 return 53 } 54 defer d.closeDB(stdout, db) 55 56 readables, err := d.openBenchTables(db) 57 if err != nil { 58 fmt.Fprintf(stdout, "%s\n", err) 59 return 60 } 61 62 defer func() { 63 for _, r := range readables { 64 r.Close() 65 } 66 }() 67 68 ios := genBenchIOs(stdout, readables, d.ioCount, ioSizes) 69 70 levels := "L5,L6" 71 if d.allLevels { 72 levels = "all" 73 } 74 fmt.Fprintf(stdout, "IO count: %d Parallelism: %d Levels: %s\n", d.ioCount, d.ioParallelism, levels) 75 76 var wg sync.WaitGroup 77 wg.Add(d.ioParallelism) 78 remainingIOs := ios 79 for i := 0; i < d.ioParallelism; i++ { 80 // We want to distribute the IOs among d.ioParallelism goroutines. At each 81 // step, we look at the number of IOs remaining and take the average (across 82 // the goroutines that are left); this deals with any rounding issues. 83 n := len(remainingIOs) / (d.ioParallelism - i) 84 go func(workerIdx int, ios []benchIO) { 85 defer wg.Done() 86 if err := performIOs(readables, ios); err != nil { 87 fmt.Fprintf(stdout, "worker %d encountered error: %v", workerIdx, err) 88 } 89 }(i, remainingIOs[:n]) 90 remainingIOs = remainingIOs[n:] 91 } 92 wg.Wait() 93 94 elapsed := make([]time.Duration, d.ioCount) 95 for _, ioSize := range ioSizes { 96 elapsed = elapsed[:0] 97 for i := range ios { 98 if ios[i].size == ioSize { 99 elapsed = append(elapsed, ios[i].elapsed) 100 } 101 } 102 fmt.Fprintf(stdout, "%4dKB -- %s\n", ioSize/1024, getStats(elapsed)) 103 } 104 } 105 106 // genBenchIOs generates <count> IOs for each given size. All IOs (across all 107 // sizes) are in random order. 
108 func genBenchIOs( 109 stdout io.Writer, readables []objstorage.Readable, count int, sizes []int, 110 ) []benchIO { 111 // size[i] is the size of the object, in blocks of maxIOSize. 112 size := make([]int, len(readables)) 113 // sum[i] is the sum (size[0] + ... + size[i]). 114 sum := make([]int, len(readables)) 115 total := 0 116 for i, r := range readables { 117 size[i] = int(r.Size() / maxIOSize) 118 total += size[i] 119 sum[i] = total 120 } 121 fmt.Fprintf(stdout, "Opened %d objects; total size %d MB.\n", len(readables), total*maxIOSize/(1024*1024)) 122 123 // To avoid a lot of overlap between the reads, the total size should be a 124 // factor larger than the size we will actually read (for the largest IO 125 // size). 126 const sizeFactor = 2 127 if total*maxIOSize < count*sizes[len(sizes)-1]*sizeFactor { 128 fmt.Fprintf(stdout, "Warning: store too small for the given IO count and sizes.\n") 129 } 130 131 // Choose how many IOs we do for each object, by selecting a random block 132 // across all file blocks. 133 // The choice of objects will be the same across all IO sizes. 134 b := make([]int, count) 135 for i := range b { 136 b[i] = rand.Intn(total) 137 } 138 // For each b[i], find the index such that sum[idx-1] <= b < sum[idx]. 139 // Sorting b makes this easier: we can "merge" the sorted arrays b and sum. 140 sort.Ints(b) 141 rIdx := make([]int, count) 142 currIdx := 0 143 for i := range b { 144 for b[i] >= sum[currIdx] { 145 currIdx++ 146 } 147 rIdx[i] = currIdx 148 } 149 150 res := make([]benchIO, 0, count*len(sizes)) 151 for _, ioSize := range sizes { 152 for _, idx := range rIdx { 153 // Random ioSize aligned offset. 
154 ofs := ioSize * rand.Intn(size[idx]*maxIOSize/ioSize) 155 156 res = append(res, benchIO{ 157 readableIdx: idx, 158 ofs: int64(ofs), 159 size: ioSize, 160 }) 161 } 162 } 163 rand.Shuffle(len(res), func(i, j int) { 164 res[i], res[j] = res[j], res[i] 165 }) 166 return res 167 } 168 169 // openBenchTables opens the sstables for the benchmark and returns them as a 170 // list of Readables. 171 // 172 // By default, only L5/L6 sstables are used; all levels are used if the 173 // allLevels flag is set. 174 // 175 // Note that only sstables that are at least maxIOSize (1MB) are used. 176 func (d *dbT) openBenchTables(db *pebble.DB) ([]objstorage.Readable, error) { 177 tables, err := db.SSTables() 178 if err != nil { 179 return nil, err 180 } 181 startLevel := 5 182 if d.allLevels { 183 startLevel = 0 184 } 185 186 var nums []base.DiskFileNum 187 numsMap := make(map[base.DiskFileNum]struct{}) 188 for l := startLevel; l < len(tables); l++ { 189 for _, t := range tables[l] { 190 n := t.BackingSSTNum.DiskFileNum() 191 if _, ok := numsMap[n]; !ok { 192 nums = append(nums, n) 193 numsMap[n] = struct{}{} 194 } 195 } 196 } 197 198 p := db.ObjProvider() 199 var res []objstorage.Readable 200 for _, n := range nums { 201 r, err := p.OpenForReading(context.Background(), base.FileTypeTable, n, objstorage.OpenOptions{}) 202 if err != nil { 203 for _, r := range res { 204 _ = r.Close() 205 } 206 return nil, err 207 } 208 if r.Size() < maxIOSize { 209 _ = r.Close() 210 continue 211 } 212 res = append(res, r) 213 } 214 if len(res) == 0 { 215 return nil, errors.Errorf("no sstables (with size at least %d)", maxIOSize) 216 } 217 218 return res, nil 219 } 220 221 // parseIOSizes parses a comma-separated list of IO sizes, in KB. 
222 func parseIOSizes(sizes string) ([]int, error) { 223 var res []int 224 for _, s := range strings.Split(sizes, ",") { 225 n, err := strconv.Atoi(s) 226 if err != nil { 227 return nil, err 228 } 229 ioSize := n * 1024 230 if ioSize > maxIOSize { 231 return nil, errors.Errorf("IO sizes over %d not supported", maxIOSize) 232 } 233 if maxIOSize%ioSize != 0 { 234 return nil, errors.Errorf("IO size must be a divisor of %d", maxIOSize) 235 } 236 res = append(res, ioSize) 237 } 238 if len(res) == 0 { 239 return nil, errors.Errorf("no IO sizes specified") 240 } 241 sort.Ints(res) 242 return res, nil 243 } 244 245 // performIOs performs the given list of IOs and populates the elapsed fields. 246 func performIOs(readables []objstorage.Readable, ios []benchIO) error { 247 ctx := context.Background() 248 rh := make([]objstorage.ReadHandle, len(readables)) 249 for i := range rh { 250 rh[i] = readables[i].NewReadHandle(ctx) 251 } 252 defer func() { 253 for i := range rh { 254 rh[i].Close() 255 } 256 }() 257 258 buf := make([]byte, maxIOSize) 259 startTime := time.Now() 260 var firstErr error 261 var nOtherErrs int 262 for i := range ios { 263 if err := rh[ios[i].readableIdx].ReadAt(ctx, buf[:ios[i].size], ios[i].ofs); err != nil { 264 if firstErr == nil { 265 firstErr = err 266 } else { 267 nOtherErrs++ 268 } 269 } 270 endTime := time.Now() 271 ios[i].elapsed = endTime.Sub(startTime) 272 startTime = endTime 273 } 274 if nOtherErrs > 0 { 275 return errors.Errorf("%v; plus %d more errors", firstErr, nOtherErrs) 276 } 277 return firstErr 278 } 279 280 // getStats calculates various statistics given a list of elapsed times. 
func getStats(d []time.Duration) string {
	// With no samples there is nothing to average and no percentile to index;
	// report that instead of panicking on d[0].
	if len(d) == 0 {
		return "no samples"
	}

	// Note: d is sorted in place; the percentiles below index into the sorted
	// slice.
	sort.Slice(d, func(i, j int) bool { return d[i] < d[j] })

	// Mean and (population) variance, both in nanoseconds. Each term is scaled
	// by 1/len(d) as it is accumulated.
	factor := 1.0 / float64(len(d))
	var mean float64
	for i := range d {
		mean += float64(d[i]) * factor
	}
	var variance float64
	for i := range d {
		delta := float64(d[i]) - mean
		variance += delta * delta * factor
	}

	// toStr formats a duration in milliseconds, using fewer decimals as the
	// value grows.
	toStr := func(d time.Duration) string {
		if d < 10*time.Millisecond {
			return fmt.Sprintf("%1.2fms", float64(d)/float64(time.Millisecond))
		}
		if d < 100*time.Millisecond {
			return fmt.Sprintf("%2.1fms", float64(d)/float64(time.Millisecond))
		}
		return fmt.Sprintf("%4dms", d/time.Millisecond)
	}

	// percentile returns the sample at index len(d)*p/100 of the sorted slice;
	// for p < 100 the index is always in range.
	percentile := func(p int) time.Duration {
		return d[len(d)*p/100]
	}

	return fmt.Sprintf(
		"avg %s stddev %s p10 %s p50 %s p90 %s p95 %s p99 %s",
		toStr(time.Duration(mean)),
		toStr(time.Duration(math.Sqrt(variance))),
		toStr(percentile(10)),
		toStr(percentile(50)),
		toStr(percentile(90)),
		toStr(percentile(95)),
		toStr(percentile(99)),
	)
}