github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/file/filebench/bigmachine.go (about)

     1  package filebench
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"encoding/gob"
     7  	"fmt"
     8  	"io"
     9  	"os"
    10  	"os/exec"
    11  	"path"
    12  	"time"
    13  
    14  	"github.com/Schaudge/grailbase/errors"
    15  	"github.com/Schaudge/grailbase/must"
    16  	"github.com/Schaudge/grailbase/traverse"
    17  	"github.com/grailbio/bigmachine"
    18  )
    19  
// Bigmachine configures a cluster of remote machines to each execute benchmarks and then report
// their results.
type Bigmachine struct {
	// Bs is the collection of bigmachines in which to run benchmarks. EC2 instance type is a
	// property of *bigmachine.B, so this lets callers benchmark several EC2 instance types.
	// (*B).Name() is printed to identify results. Caller remains responsible for shutdown.
	Bs []*bigmachine.B
	// Environ is passed to (*bigmachine.B).Start for every machine that RunAndPrint starts.
	Environ bigmachine.Environ
	// Services is passed to (*bigmachine.B).Start for every machine that RunAndPrint starts.
	// NewBigmachine registers the "FileBench" service here; AddS3FUSE adds "s3FUSE".
	Services bigmachine.Services
}
    30  
    31  // NewBigmachine returns a new configuration, ready for callers to configure. Callers likely want
    32  // to add bigmachines for remote execution (otherwise it just falls back to local).
    33  // Or, they may add environment variables or services for AWS credentials.
    34  func NewBigmachine(rs ReadSizes) Bigmachine {
    35  	return Bigmachine{
    36  		Services: bigmachine.Services{
    37  			"FileBench": benchService{rs},
    38  		},
    39  	}
    40  }
    41  
    42  // RunAndPrint starts a machine in each d.Bs and then executes ReadSizes.RunAndPrint on it.
    43  // It writes all the machine results to out, identifying each section by d.Bs's keys.
    44  func (d Bigmachine) RunAndPrint(
    45  	ctx context.Context,
    46  	out io.Writer,
    47  	pathPrefixes []Prefix,
    48  	pathSuffix0 string,
    49  	pathSuffixes ...string,
    50  ) error {
    51  	var results = make([]string, len(d.Bs))
    52  
    53  	err := traverse.Each(len(d.Bs), func(bIdx int) error {
    54  		b := d.Bs[bIdx]
    55  		machines, err := b.Start(ctx, 1, d.Environ, d.Services)
    56  		if err != nil {
    57  			return err
    58  		}
    59  		machine := machines[0]
    60  
    61  		// Benchmark runs have encountered some throttling from S3 (503 SlowDown). Sleep a bit
    62  		// to separate out the benchmark runs, so that each benchmarking machine is likely to
    63  		// locate some distinct S3 remote IPs. (Due to VPC DNS caching, if all the machines start
    64  		// simultaneously, they're likely to use the same S3 peers.) Of course, this introduces
    65  		// systematic bias in comparing results between machines, but we accept that for now.
    66  		time.Sleep(time.Minute * time.Duration(bIdx))
    67  
    68  		return machine.Call(ctx, "FileBench.Run",
    69  			benchRequest{pathPrefixes, pathSuffix0, pathSuffixes},
    70  			&results[bIdx])
    71  	})
    72  
    73  	for bIdx, result := range results {
    74  		if result == "" {
    75  			continue
    76  		}
    77  		if bIdx > 0 {
    78  			_, err := fmt.Fprintln(out)
    79  			must.Nil(err)
    80  		}
    81  		_, err := fmt.Fprintf(out, "[%d] %s\n%s", bIdx, d.Bs[bIdx].Name(), result)
    82  		must.Nil(err)
    83  	}
    84  	return err
    85  }
    86  
type (
	// benchService is the service registered under the name "FileBench". It embeds the
	// ReadSizes configuration that its Run method executes.
	benchService struct{ ReadSizes }
	// benchRequest is the argument to benchService.Run. Its fields are forwarded, in order,
	// as the path arguments of ReadSizes.RunAndPrint.
	benchRequest struct {
		PathPrefixes []Prefix
		PathSuffix0  string
		PathSuffixes []string
	}

	// fuseService is the service registered under the name "s3FUSE" by AddS3FUSE. It carries
	// no state; its Init method mounts S3 through FUSE at s3FUSEPath.
	fuseService struct{}
)
    97  
    98  func init() {
    99  	gob.Register(benchService{})
   100  	gob.Register(fuseService{})
   101  }
   102  
   103  func (s benchService) Run(ctx context.Context, req benchRequest, out *string) error {
   104  	var buf bytes.Buffer
   105  	s.ReadSizes.RunAndPrint(ctx, &buf, req.PathPrefixes, req.PathSuffix0, req.PathSuffixes...)
   106  	*out = buf.String()
   107  	return nil
   108  }
   109  
   110  // AddS3FUSE configures d so that each machine running benchmarks can access S3 objects through
   111  // the local filesystem, at mountPath. For example, object s3://b/my/key will appear at
   112  // $mountPath/b/my/key. Callers can use this to construct paths for RunAndPrint.
   113  func (d Bigmachine) AddS3FUSE() (mountPath string) {
   114  	must.True(len(s3FUSEBinary) > 0)
   115  	d.Services["s3FUSE"] = fuseService{}
   116  	return s3FUSEPath
   117  }
   118  
// s3FUSEPath is the local directory at which fuseService.Init mounts S3, and the mount path
// returned by AddS3FUSE.
const s3FUSEPath = "/tmp/s3"
   120  
   121  func (fuseService) Init(*bigmachine.B) (err error) {
   122  	defer func() {
   123  		if err != nil {
   124  			err = errors.E(err, errors.Fatal)
   125  		}
   126  	}()
   127  	if err := os.MkdirAll(s3FUSEPath, 0700); err != nil {
   128  		return err
   129  	}
   130  	ents, err := os.ReadDir(s3FUSEPath)
   131  	if err != nil {
   132  		return err
   133  	}
   134  	if len(ents) > 0 {
   135  		return errors.New("s3 fuse mount is non-empty")
   136  	}
   137  	tmpDir, err := os.MkdirTemp("", "s3fuse-*")
   138  	if err != nil {
   139  		return err
   140  	}
   141  	exe := path.Join(tmpDir, "s3fuse")
   142  	if err := os.WriteFile(exe, s3FUSEBinary, 0700); err != nil {
   143  		return err
   144  	}
   145  	cmdErrC := make(chan error)
   146  	go func() {
   147  		out, err := exec.Command(exe, s3FUSEPath).CombinedOutput()
   148  		if err == nil {
   149  			err = errors.E("s3fuse exited unexpectedly")
   150  		}
   151  		cmdErrC <- errors.E(err, fmt.Sprintf("s3fuse output:\n%s", out))
   152  	}()
   153  	readDirC := make(chan error)
   154  	go func() {
   155  		for {
   156  			ents, err := os.ReadDir(s3FUSEPath)
   157  			if err != nil {
   158  				readDirC <- err
   159  				return
   160  			}
   161  			if len(ents) > 0 {
   162  				readDirC <- nil
   163  			}
   164  			time.Sleep(time.Second)
   165  		}
   166  	}()
   167  	select {
   168  	case err = <-cmdErrC:
   169  	case err = <-readDirC:
   170  	case <-time.After(10 * time.Second):
   171  		err = errors.New("ran out of time waiting for FUSE mount")
   172  	}
   173  	return err
   174  }