github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/tool/db_io_bench.go

// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package tool

import (
	"context"
	"fmt"
	"io"
	"math"
	"math/rand"
	"slices"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/spf13/cobra"
)

type benchIO struct {
	readableIdx int
	ofs         int64
	size        int
	// elapsed time for the IO, filled out by performIOs.
	elapsed time.Duration
}

const maxIOSize = 1024 * 1024

// runIOBench runs an IO benchmark against the current sstables of a database.
// The workload is random IO, with various IO sizes. The main goal of the
// benchmark is to establish the relationship between IO size and latency,
// especially against shared object storage.
func (d *dbT) runIOBench(cmd *cobra.Command, args []string) {
	stdout := cmd.OutOrStdout()

	ioSizes, err := parseIOSizes(d.ioSizes)
	if err != nil {
		fmt.Fprintf(stdout, "error parsing io-sizes: %s\n", err)
		return
	}

	db, err := d.openDB(args[0])
	if err != nil {
		fmt.Fprintf(stdout, "%s\n", err)
		return
	}
	defer d.closeDB(stdout, db)

	readables, err := d.openBenchTables(db)
	if err != nil {
		fmt.Fprintf(stdout, "%s\n", err)
		return
	}

	defer func() {
		for _, r := range readables {
			r.Close()
		}
	}()

	ios := genBenchIOs(stdout, readables, d.ioCount, ioSizes)

	levels := "L5,L6"
	if d.allLevels {
		levels = "all"
	}
	fmt.Fprintf(stdout, "IO count: %d Parallelism: %d Levels: %s\n", d.ioCount, d.ioParallelism, levels)

	var wg sync.WaitGroup
	wg.Add(d.ioParallelism)
	remainingIOs := ios
	for i := 0; i < d.ioParallelism; i++ {
		// We want to distribute the IOs among d.ioParallelism goroutines. At each
		// step, we look at the number of IOs remaining and take the average (across
		// the goroutines that are left); this deals with any rounding issues.
		n := len(remainingIOs) / (d.ioParallelism - i)
		go func(workerIdx int, ios []benchIO) {
			defer wg.Done()
			if err := performIOs(readables, ios); err != nil {
				fmt.Fprintf(stdout, "worker %d encountered error: %v\n", workerIdx, err)
			}
		}(i, remainingIOs[:n])
		remainingIOs = remainingIOs[n:]
	}
	wg.Wait()

	elapsed := make([]time.Duration, d.ioCount)
	for _, ioSize := range ioSizes {
		elapsed = elapsed[:0]
		for i := range ios {
			if ios[i].size == ioSize {
				elapsed = append(elapsed, ios[i].elapsed)
			}
		}
		fmt.Fprintf(stdout, "%4dKB -- %s\n", ioSize/1024, getStats(elapsed))
	}
}

// genBenchIOs generates <count> IOs for each given size. All IOs (across all
// sizes) are in random order.
func genBenchIOs(
	stdout io.Writer, readables []objstorage.Readable, count int, sizes []int,
) []benchIO {
	// size[i] is the size of the object, in blocks of maxIOSize.
	size := make([]int, len(readables))
	// sum[i] is the sum (size[0] + ... + size[i]).
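	// As an illustration (hypothetical sizes): objects of 3, 5, and 2 blocks
	// yield size = [3, 5, 2] and sum = [3, 8, 10]; a random block index
	// b in [0, 10) then maps to the object idx satisfying
	// sum[idx-1] <= b[i] < sum[idx], which is how rIdx is computed below.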
	sum := make([]int, len(readables))
	total := 0
	for i, r := range readables {
		size[i] = int(r.Size() / maxIOSize)
		total += size[i]
		sum[i] = total
	}
	fmt.Fprintf(stdout, "Opened %d objects; total size %d MB.\n", len(readables), total*maxIOSize/(1024*1024))

	// To avoid a lot of overlap between the reads, the total size should be a
	// factor larger than the size we will actually read (for the largest IO
	// size).
	const sizeFactor = 2
	if total*maxIOSize < count*sizes[len(sizes)-1]*sizeFactor {
		fmt.Fprintf(stdout, "Warning: store too small for the given IO count and sizes.\n")
	}

	// Choose how many IOs we do for each object, by selecting a random block
	// across all file blocks.
	// The choice of objects will be the same across all IO sizes.
	b := make([]int, count)
	for i := range b {
		b[i] = rand.Intn(total)
	}
	// For each b[i], find the index idx such that sum[idx-1] <= b[i] < sum[idx].
	// Sorting b makes this easier: we can "merge" the sorted arrays b and sum.
	sort.Ints(b)
	rIdx := make([]int, count)
	currIdx := 0
	for i := range b {
		for b[i] >= sum[currIdx] {
			currIdx++
		}
		rIdx[i] = currIdx
	}

	res := make([]benchIO, 0, count*len(sizes))
	for _, ioSize := range sizes {
		for _, idx := range rIdx {
			// Random ioSize-aligned offset.
			ofs := ioSize * rand.Intn(size[idx]*maxIOSize/ioSize)

			res = append(res, benchIO{
				readableIdx: idx,
				ofs:         int64(ofs),
				size:        ioSize,
			})
		}
	}
	rand.Shuffle(len(res), func(i, j int) {
		res[i], res[j] = res[j], res[i]
	})
	return res
}

// openBenchTables opens the sstables for the benchmark and returns them as a
// list of Readables.
//
// By default, only L5/L6 sstables are used; all levels are used if the
// allLevels flag is set.
//
// Note that only sstables that are at least maxIOSize (1MB) are used.
func (d *dbT) openBenchTables(db *pebble.DB) ([]objstorage.Readable, error) {
	tables, err := db.SSTables()
	if err != nil {
		return nil, err
	}
	startLevel := 5
	if d.allLevels {
		startLevel = 0
	}

	var nums []base.DiskFileNum
	numsMap := make(map[base.DiskFileNum]struct{})
	for l := startLevel; l < len(tables); l++ {
		for _, t := range tables[l] {
			n := t.BackingSSTNum.DiskFileNum()
			if _, ok := numsMap[n]; !ok {
				nums = append(nums, n)
				numsMap[n] = struct{}{}
			}
		}
	}

	p := db.ObjProvider()
	var res []objstorage.Readable
	for _, n := range nums {
		r, err := p.OpenForReading(context.Background(), base.FileTypeTable, n, objstorage.OpenOptions{})
		if err != nil {
			for _, r := range res {
				_ = r.Close()
			}
			return nil, err
		}
		if r.Size() < maxIOSize {
			_ = r.Close()
			continue
		}
		res = append(res, r)
	}
	if len(res) == 0 {
		return nil, errors.Errorf("no sstables (with size at least %d)", maxIOSize)
	}

	return res, nil
}

// parseIOSizes parses a comma-separated list of IO sizes, in KB.
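// For example, the (hypothetical) input "4,16,64" parses to 4096, 16384, and
// 65536 bytes. Each size must be at most maxIOSize and divide it evenly, so
// that genBenchIOs can generate offsets aligned to the IO size.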
func parseIOSizes(sizes string) ([]int, error) {
	var res []int
	for _, s := range strings.Split(sizes, ",") {
		n, err := strconv.Atoi(s)
		if err != nil {
			return nil, err
		}
		ioSize := n * 1024
		if ioSize > maxIOSize {
			return nil, errors.Errorf("IO sizes over %d not supported", maxIOSize)
		}
		if maxIOSize%ioSize != 0 {
			return nil, errors.Errorf("IO size must be a divisor of %d", maxIOSize)
		}
		res = append(res, ioSize)
	}
	if len(res) == 0 {
		return nil, errors.Errorf("no IO sizes specified")
	}
	sort.Ints(res)
	return res, nil
}

// performIOs performs the given list of IOs and populates the elapsed fields.
func performIOs(readables []objstorage.Readable, ios []benchIO) error {
	ctx := context.Background()
	rh := make([]objstorage.ReadHandle, len(readables))
	for i := range rh {
		rh[i] = readables[i].NewReadHandle(ctx)
	}
	defer func() {
		for i := range rh {
			rh[i].Close()
		}
	}()

	buf := make([]byte, maxIOSize)
	startTime := time.Now()
	var firstErr error
	var nOtherErrs int
	for i := range ios {
		if err := rh[ios[i].readableIdx].ReadAt(ctx, buf[:ios[i].size], ios[i].ofs); err != nil {
			if firstErr == nil {
				firstErr = err
			} else {
				nOtherErrs++
			}
		}
		endTime := time.Now()
		ios[i].elapsed = endTime.Sub(startTime)
		startTime = endTime
	}
	if nOtherErrs > 0 {
		return errors.Errorf("%v; plus %d more errors", firstErr, nOtherErrs)
	}
	return firstErr
}

// getStats calculates various statistics given a list of elapsed times.
func getStats(d []time.Duration) string {
	slices.Sort(d)

	factor := 1.0 / float64(len(d))
	var mean float64
	for i := range d {
		mean += float64(d[i]) * factor
	}
	var variance float64
	for i := range d {
		delta := float64(d[i]) - mean
		variance += delta * delta * factor
	}

	toStr := func(d time.Duration) string {
		if d < 10*time.Millisecond {
			return fmt.Sprintf("%1.2fms", float64(d)/float64(time.Millisecond))
		}
		if d < 100*time.Millisecond {
			return fmt.Sprintf("%2.1fms", float64(d)/float64(time.Millisecond))
		}
		return fmt.Sprintf("%4dms", d/time.Millisecond)
	}

	return fmt.Sprintf(
		"avg %s stddev %s p10 %s p50 %s p90 %s p95 %s p99 %s",
		toStr(time.Duration(mean)),
		toStr(time.Duration(math.Sqrt(variance))),
		toStr(d[len(d)*10/100]),
		toStr(d[len(d)*50/100]),
		toStr(d[len(d)*90/100]),
		toStr(d[len(d)*95/100]),
		toStr(d[len(d)*99/100]),
	)
}
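
// A sample report line printed by runIOBench, with illustrative (made-up)
// latencies, might look like:
//
//	  64KB -- avg 1.20ms stddev 0.45ms p10 0.80ms p50 1.10ms p90 1.70ms p95 1.90ms p99 2.50ms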