github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/mkbench/ycsb.go (about)

     1  // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package main
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"compress/bzip2"
    11  	"compress/gzip"
    12  	"encoding/json"
    13  	"fmt"
    14  	"io"
    15  	"log"
    16  	"math"
    17  	"os"
    18  	"sort"
    19  	"strings"
    20  
    21  	"github.com/cockroachdb/errors/oserror"
    22  	"github.com/spf13/cobra"
    23  )
    24  
    25  const (
    26  	defaultDir        = "data"
    27  	defaultCookedFile = "data.js"
    28  )
    29  
    30  func getYCSBCommand() *cobra.Command {
    31  	c := &cobra.Command{
    32  		Use:   "ycsb",
    33  		Short: "parse YCSB benchmark data",
    34  		RunE: func(cmd *cobra.Command, args []string) error {
    35  			dataDir, err := cmd.Flags().GetString("dir")
    36  			if err != nil {
    37  				return err
    38  			}
    39  
    40  			inFile, err := cmd.Flags().GetString("in")
    41  			if err != nil {
    42  				return err
    43  			}
    44  
    45  			outFile, err := cmd.Flags().GetString("out")
    46  			if err != nil {
    47  				return err
    48  			}
    49  
    50  			parseYCSB(dataDir, inFile, outFile)
    51  			return nil
    52  		},
    53  	}
    54  
    55  	c.Flags().String("dir", defaultDir, "path to data directory")
    56  	c.Flags().String("in", defaultCookedFile, "path to (possibly non-empty) input cooked data file")
    57  	c.Flags().String("out", defaultCookedFile, "path to output data file")
    58  	c.SilenceUsage = true
    59  
    60  	return c
    61  }
    62  
    63  type ycsbRun struct {
    64  	opsSec     float64
    65  	readBytes  int64
    66  	writeBytes int64
    67  	readAmp    float64
    68  	writeAmp   float64
    69  }
    70  
    71  func (r ycsbRun) formatCSV() string {
    72  	return fmt.Sprintf("%.1f,%d,%d,%.1f,%.1f",
    73  		r.opsSec, r.readBytes, r.writeBytes, r.readAmp, r.writeAmp)
    74  }
    75  
    76  type ycsbWorkload struct {
    77  	days map[string][]ycsbRun // data -> runs
    78  }
    79  
    80  type ycsbLoader struct {
    81  	cookedDays map[string]bool          // set of already cooked days
    82  	data       map[string]*ycsbWorkload // workload name -> workload data
    83  }
    84  
    85  func newYCSBLoader() *ycsbLoader {
    86  	return &ycsbLoader{
    87  		cookedDays: make(map[string]bool),
    88  		data:       make(map[string]*ycsbWorkload),
    89  	}
    90  }
    91  
    92  func (l *ycsbLoader) addRun(name, day string, r ycsbRun) {
    93  	w := l.data[name]
    94  	if w == nil {
    95  		w = &ycsbWorkload{days: make(map[string][]ycsbRun)}
    96  		l.data[name] = w
    97  	}
    98  	w.days[day] = append(w.days[day], r)
    99  }
   100  
   101  func (l *ycsbLoader) loadCooked(path string) {
   102  	data, err := os.ReadFile(path)
   103  	if oserror.IsNotExist(err) {
   104  		return
   105  	}
   106  	if err != nil {
   107  		log.Fatal(err)
   108  	}
   109  
   110  	data = bytes.TrimSpace(data)
   111  
   112  	prefix := []byte("data = ")
   113  	if !bytes.HasPrefix(data, prefix) {
   114  		log.Fatalf("missing '%s' prefix", prefix)
   115  	}
   116  	data = bytes.TrimPrefix(data, prefix)
   117  
   118  	suffix := []byte(";")
   119  	if !bytes.HasSuffix(data, suffix) {
   120  		log.Fatalf("missing '%s' suffix", suffix)
   121  	}
   122  	data = bytes.TrimSuffix(data, suffix)
   123  
   124  	m := make(map[string]string)
   125  	if err := json.Unmarshal(data, &m); err != nil {
   126  		log.Fatal(err)
   127  	}
   128  
   129  	for name, data := range m {
   130  		s := bufio.NewScanner(strings.NewReader(data))
   131  		for s.Scan() {
   132  			line := s.Text()
   133  			line = strings.Replace(line, ",", " ", -1)
   134  
   135  			var r ycsbRun
   136  			var day string
   137  			n, err := fmt.Sscanf(line, "%s %f %d %d %f %f",
   138  				&day, &r.opsSec, &r.readBytes, &r.writeBytes, &r.readAmp, &r.writeAmp)
   139  			if err != nil || n != 6 {
   140  				log.Fatalf("%s: %+v", line, err)
   141  			}
   142  			l.cookedDays[day] = true
   143  			l.addRun(name, day, r)
   144  		}
   145  	}
   146  }
   147  
   148  func (l *ycsbLoader) loadRaw(dir string) {
   149  	walkFn := func(path, pathRel string, info os.FileInfo) error {
   150  		// The directory structure is of the form:
   151  		//   $date/pebble/ycsb/$name/$run/$file
   152  		parts := strings.Split(pathRel, string(os.PathSeparator))
   153  		if len(parts) < 6 {
   154  			return nil // stumble forward on invalid paths
   155  		}
   156  
   157  		// We're only interested in YCSB benchmark data.
   158  		if parts[2] != "ycsb" {
   159  			return nil
   160  		}
   161  
   162  		day := parts[0]
   163  		if l.cookedDays[day] {
   164  			return nil
   165  		}
   166  
   167  		f, err := os.Open(path)
   168  		if err != nil {
   169  			fmt.Fprintf(os.Stderr, "%+v\n", err)
   170  			return nil // stumble forward on error
   171  		}
   172  		defer f.Close()
   173  
   174  		r := io.Reader(f)
   175  		if strings.HasSuffix(path, ".bz2") {
   176  			r = bzip2.NewReader(f)
   177  		} else if strings.HasSuffix(path, ".gz") {
   178  			var err error
   179  			r, err = gzip.NewReader(f)
   180  			if err != nil {
   181  				fmt.Fprintf(os.Stderr, "%+v\n", err)
   182  				return nil // stumble forward on error
   183  			}
   184  		}
   185  
   186  		s := bufio.NewScanner(r)
   187  		for s.Scan() {
   188  			line := s.Text()
   189  			if !strings.HasPrefix(line, "Benchmark") {
   190  				continue
   191  			}
   192  
   193  			var r ycsbRun
   194  			var name string
   195  			var ops int64
   196  			n, err := fmt.Sscanf(line,
   197  				"Benchmark%s %d %f ops/sec %d read %d write %f r-amp %f w-amp",
   198  				&name, &ops, &r.opsSec, &r.readBytes, &r.writeBytes, &r.readAmp, &r.writeAmp)
   199  			if err != nil || n != 7 {
   200  				fmt.Fprintf(os.Stderr, "%s: %v\n", s.Text(), err)
   201  				// Stumble forward on error.
   202  				continue
   203  			}
   204  
   205  			fmt.Fprintf(os.Stderr, "%s: adding %s\n", day, name)
   206  			l.addRun(name, day, r)
   207  		}
   208  		return nil
   209  	}
   210  
   211  	_ = walkDir(dir, walkFn)
   212  }
   213  
   214  func (l *ycsbLoader) cook(path string) {
   215  	m := make(map[string]string)
   216  	for name, workload := range l.data {
   217  		m[name] = l.cookWorkload(workload)
   218  	}
   219  
   220  	out := []byte("data = ")
   221  	out = append(out, prettyJSON(m)...)
   222  	out = append(out, []byte(";\n")...)
   223  	if err := os.WriteFile(path, out, 0644); err != nil {
   224  		log.Fatal(err)
   225  	}
   226  }
   227  
   228  func (l *ycsbLoader) cookWorkload(w *ycsbWorkload) string {
   229  	days := make([]string, 0, len(w.days))
   230  	for day := range w.days {
   231  		days = append(days, day)
   232  	}
   233  	sort.Strings(days)
   234  
   235  	var buf bytes.Buffer
   236  	for _, day := range days {
   237  		fmt.Fprintf(&buf, "%s,%s\n", day, l.cookDay(w.days[day]))
   238  	}
   239  	return buf.String()
   240  }
   241  
   242  func (l *ycsbLoader) cookDay(runs []ycsbRun) string {
   243  	if len(runs) == 1 {
   244  		return runs[0].formatCSV()
   245  	}
   246  
   247  	// The benchmarks show significant run-to-run variance due to
   248  	// instance-to-instance performance variability on AWS. We attempt to smooth
   249  	// out this variance by excluding outliers: any run that is more than one
   250  	// stddev from the average, and then taking the average of the remaining
   251  	// runs. Note that the runs on a given day are all from the same SHA, so this
   252  	// smoothing will not affect exceptional day-to-day performance changes.
   253  
   254  	var sum float64
   255  	for i := range runs {
   256  		sum += runs[i].opsSec
   257  	}
   258  	mean := sum / float64(len(runs))
   259  
   260  	var sum2 float64
   261  	for i := range runs {
   262  		v := runs[i].opsSec - mean
   263  		sum2 += v * v
   264  	}
   265  
   266  	stddev := math.Sqrt(sum2 / float64(len(runs)))
   267  	lo := mean - stddev
   268  	hi := mean + stddev
   269  
   270  	var avg ycsbRun
   271  	var count int
   272  	for i := range runs {
   273  		r := &runs[i]
   274  		if r.opsSec < lo || r.opsSec > hi {
   275  			continue
   276  		}
   277  		count++
   278  		avg.opsSec += r.opsSec
   279  		avg.readBytes += r.readBytes
   280  		avg.writeBytes += r.writeBytes
   281  		avg.readAmp += r.readAmp
   282  		avg.writeAmp += r.writeAmp
   283  	}
   284  
   285  	avg.opsSec /= float64(count)
   286  	avg.readBytes /= int64(count)
   287  	avg.writeBytes /= int64(count)
   288  	avg.readAmp /= float64(count)
   289  	avg.writeAmp /= float64(count)
   290  	return avg.formatCSV()
   291  }
   292  
   293  // parseYCSB coalesces YCSB benchmark data.
   294  func parseYCSB(dataDir, inFile, outFile string) {
   295  	log.SetFlags(log.Lshortfile)
   296  
   297  	l := newYCSBLoader()
   298  	l.loadCooked(inFile)
   299  	l.loadRaw(dataDir)
   300  	l.cook(outFile)
   301  }