github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/mkbench/ycsb.go (about) 1 // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package main 6 7 import ( 8 "bufio" 9 "bytes" 10 "compress/bzip2" 11 "compress/gzip" 12 "encoding/json" 13 "fmt" 14 "io" 15 "log" 16 "math" 17 "os" 18 "sort" 19 "strings" 20 21 "github.com/cockroachdb/errors/oserror" 22 "github.com/spf13/cobra" 23 ) 24 25 const ( 26 defaultDir = "data" 27 defaultCookedFile = "data.js" 28 ) 29 30 func getYCSBCommand() *cobra.Command { 31 c := &cobra.Command{ 32 Use: "ycsb", 33 Short: "parse YCSB benchmark data", 34 RunE: func(cmd *cobra.Command, args []string) error { 35 dataDir, err := cmd.Flags().GetString("dir") 36 if err != nil { 37 return err 38 } 39 40 inFile, err := cmd.Flags().GetString("in") 41 if err != nil { 42 return err 43 } 44 45 outFile, err := cmd.Flags().GetString("out") 46 if err != nil { 47 return err 48 } 49 50 parseYCSB(dataDir, inFile, outFile) 51 return nil 52 }, 53 } 54 55 c.Flags().String("dir", defaultDir, "path to data directory") 56 c.Flags().String("in", defaultCookedFile, "path to (possibly non-empty) input cooked data file") 57 c.Flags().String("out", defaultCookedFile, "path to output data file") 58 c.SilenceUsage = true 59 60 return c 61 } 62 63 type ycsbRun struct { 64 opsSec float64 65 readBytes int64 66 writeBytes int64 67 readAmp float64 68 writeAmp float64 69 } 70 71 func (r ycsbRun) formatCSV() string { 72 return fmt.Sprintf("%.1f,%d,%d,%.1f,%.1f", 73 r.opsSec, r.readBytes, r.writeBytes, r.readAmp, r.writeAmp) 74 } 75 76 type ycsbWorkload struct { 77 days map[string][]ycsbRun // data -> runs 78 } 79 80 type ycsbLoader struct { 81 cookedDays map[string]bool // set of already cooked days 82 data map[string]*ycsbWorkload // workload name -> workload data 83 } 84 85 func newYCSBLoader() *ycsbLoader { 86 return &ycsbLoader{ 87 cookedDays: make(map[string]bool), 88 data: make(map[string]*ycsbWorkload), 89 } 90 } 91 92 func (l *ycsbLoader) addRun(name, day string, r ycsbRun) { 93 w := l.data[name] 94 if w == nil { 95 w = &ycsbWorkload{days: make(map[string][]ycsbRun)} 96 l.data[name] = w 97 } 98 w.days[day] = append(w.days[day], r) 99 } 100 101 func (l *ycsbLoader) loadCooked(path string) { 102 data, err := os.ReadFile(path) 103 if oserror.IsNotExist(err) { 104 return 105 } 106 if err != nil { 107 log.Fatal(err) 108 } 109 110 data = bytes.TrimSpace(data) 111 112 prefix := []byte("data = ") 113 if !bytes.HasPrefix(data, prefix) { 114 log.Fatalf("missing '%s' prefix", prefix) 115 } 116 data = bytes.TrimPrefix(data, prefix) 117 118 suffix := []byte(";") 119 if !bytes.HasSuffix(data, suffix) { 120 log.Fatalf("missing '%s' suffix", suffix) 121 } 122 data = bytes.TrimSuffix(data, suffix) 123 124 m := make(map[string]string) 125 if err := json.Unmarshal(data, &m); err != nil { 126 log.Fatal(err) 127 } 128 129 for name, data := range m { 130 s := bufio.NewScanner(strings.NewReader(data)) 131 for s.Scan() { 132 line := s.Text() 133 line = strings.Replace(line, ",", " ", -1) 134 135 var r ycsbRun 136 var day string 137 n, err := fmt.Sscanf(line, "%s %f %d %d %f %f", 138 &day, &r.opsSec, &r.readBytes, &r.writeBytes, &r.readAmp, &r.writeAmp) 139 if err != nil || n != 6 { 140 log.Fatalf("%s: %+v", line, err) 141 } 142 l.cookedDays[day] = true 143 l.addRun(name, day, r) 144 } 145 } 146 } 147 148 func (l *ycsbLoader) loadRaw(dir string) { 149 walkFn := func(path, pathRel string, info os.FileInfo) error { 150 // The directory structure is of the form: 151 // $date/pebble/ycsb/$name/$run/$file 152 parts := strings.Split(pathRel, string(os.PathSeparator)) 153 if len(parts) < 6 { 154 return nil // stumble forward on invalid paths 155 } 156 157 // We're only interested in YCSB benchmark data. 158 if parts[2] != "ycsb" { 159 return nil 160 } 161 162 day := parts[0] 163 if l.cookedDays[day] { 164 return nil 165 } 166 167 f, err := os.Open(path) 168 if err != nil { 169 fmt.Fprintf(os.Stderr, "%+v\n", err) 170 return nil // stumble forward on error 171 } 172 defer f.Close() 173 174 r := io.Reader(f) 175 if strings.HasSuffix(path, ".bz2") { 176 r = bzip2.NewReader(f) 177 } else if strings.HasSuffix(path, ".gz") { 178 var err error 179 r, err = gzip.NewReader(f) 180 if err != nil { 181 fmt.Fprintf(os.Stderr, "%+v\n", err) 182 return nil // stumble forward on error 183 } 184 } 185 186 s := bufio.NewScanner(r) 187 for s.Scan() { 188 line := s.Text() 189 if !strings.HasPrefix(line, "Benchmark") { 190 continue 191 } 192 193 var r ycsbRun 194 var name string 195 var ops int64 196 n, err := fmt.Sscanf(line, 197 "Benchmark%s %d %f ops/sec %d read %d write %f r-amp %f w-amp", 198 &name, &ops, &r.opsSec, &r.readBytes, &r.writeBytes, &r.readAmp, &r.writeAmp) 199 if err != nil || n != 7 { 200 fmt.Fprintf(os.Stderr, "%s: %v\n", s.Text(), err) 201 // Stumble forward on error. 202 continue 203 } 204 205 fmt.Fprintf(os.Stderr, "%s: adding %s\n", day, name) 206 l.addRun(name, day, r) 207 } 208 return nil 209 } 210 211 _ = walkDir(dir, walkFn) 212 } 213 214 func (l *ycsbLoader) cook(path string) { 215 m := make(map[string]string) 216 for name, workload := range l.data { 217 m[name] = l.cookWorkload(workload) 218 } 219 220 out := []byte("data = ") 221 out = append(out, prettyJSON(m)...) 222 out = append(out, []byte(";\n")...) 223 if err := os.WriteFile(path, out, 0644); err != nil { 224 log.Fatal(err) 225 } 226 } 227 228 func (l *ycsbLoader) cookWorkload(w *ycsbWorkload) string { 229 days := make([]string, 0, len(w.days)) 230 for day := range w.days { 231 days = append(days, day) 232 } 233 sort.Strings(days) 234 235 var buf bytes.Buffer 236 for _, day := range days { 237 fmt.Fprintf(&buf, "%s,%s\n", day, l.cookDay(w.days[day])) 238 } 239 return buf.String() 240 } 241 242 func (l *ycsbLoader) cookDay(runs []ycsbRun) string { 243 if len(runs) == 1 { 244 return runs[0].formatCSV() 245 } 246 247 // The benchmarks show significant run-to-run variance due to 248 // instance-to-instance performance variability on AWS. We attempt to smooth 249 // out this variance by excluding outliers: any run that is more than one 250 // stddev from the average, and then taking the average of the remaining 251 // runs. Note that the runs on a given day are all from the same SHA, so this 252 // smoothing will not affect exceptional day-to-day performance changes. 253 254 var sum float64 255 for i := range runs { 256 sum += runs[i].opsSec 257 } 258 mean := sum / float64(len(runs)) 259 260 var sum2 float64 261 for i := range runs { 262 v := runs[i].opsSec - mean 263 sum2 += v * v 264 } 265 266 stddev := math.Sqrt(sum2 / float64(len(runs))) 267 lo := mean - stddev 268 hi := mean + stddev 269 270 var avg ycsbRun 271 var count int 272 for i := range runs { 273 r := &runs[i] 274 if r.opsSec < lo || r.opsSec > hi { 275 continue 276 } 277 count++ 278 avg.opsSec += r.opsSec 279 avg.readBytes += r.readBytes 280 avg.writeBytes += r.writeBytes 281 avg.readAmp += r.readAmp 282 avg.writeAmp += r.writeAmp 283 } 284 285 avg.opsSec /= float64(count) 286 avg.readBytes /= int64(count) 287 avg.writeBytes /= int64(count) 288 avg.readAmp /= float64(count) 289 avg.writeAmp /= float64(count) 290 return avg.formatCSV() 291 } 292 293 // parseYCSB coalesces YCSB benchmark data. 294 func parseYCSB(dataDir, inFile, outFile string) { 295 log.SetFlags(log.Lshortfile) 296 297 l := newYCSBLoader() 298 l.loadCooked(inFile) 299 l.loadRaw(dataDir) 300 l.cook(outFile) 301 }