github.com/grailbio/base@v0.0.11/file/filebench/bigmachine.go (about) 1 package filebench 2 3 import ( 4 "bytes" 5 "context" 6 "encoding/gob" 7 "fmt" 8 "io" 9 "os" 10 "os/exec" 11 "path" 12 "time" 13 14 "github.com/grailbio/base/errors" 15 "github.com/grailbio/base/must" 16 "github.com/grailbio/base/traverse" 17 "github.com/grailbio/bigmachine" 18 ) 19 20 // Bigmachine configures a cluster of remote machines to each execute benchmarks and then report 21 // their results. 22 type Bigmachine struct { 23 // Bs is the collection of bigmachines in which to run benchmarks. EC2 instance type is a 24 // property of *bigmachine.B, so this lets callers benchmark several EC2 instance types. 25 // (*B).Name() is printed to identify resutls. Caller remains responsible for shutdown. 26 Bs []*bigmachine.B 27 Environ bigmachine.Environ 28 Services bigmachine.Services 29 } 30 31 // NewBigmachine returns a new configuration, ready for callers to configure. Callers likely want 32 // to add bigmachines for remote execution (otherwise it just falls back to local). 33 // Or, they may add environment variables or services for AWS credentials. 34 func NewBigmachine(rs ReadSizes) Bigmachine { 35 return Bigmachine{ 36 Services: bigmachine.Services{ 37 "FileBench": benchService{rs}, 38 }, 39 } 40 } 41 42 // RunAndPrint starts a machine in each d.Bs and then executes ReadSizes.RunAndPrint on it. 43 // It writes all the machine results to out, identifying each section by d.Bs's keys. 44 func (d Bigmachine) RunAndPrint( 45 ctx context.Context, 46 out io.Writer, 47 pathPrefixes []Prefix, 48 pathSuffix0 string, 49 pathSuffixes ...string, 50 ) error { 51 var results = make([]string, len(d.Bs)) 52 53 err := traverse.Each(len(d.Bs), func(bIdx int) error { 54 b := d.Bs[bIdx] 55 machines, err := b.Start(ctx, 1, d.Environ, d.Services) 56 if err != nil { 57 return err 58 } 59 machine := machines[0] 60 61 // Benchmark runs have encountered some throttling from S3 (503 SlowDown). Sleep a bit 62 // to separate out the benchmark runs, so that each benchmarking machine is likely to 63 // locate some distinct S3 remote IPs. (Due to VPC DNS caching, if all the machines start 64 // simultaneously, they're likely to use the same S3 peers.) Of course, this introduces 65 // systematic bias in comparing results between machines, but we accept that for now. 66 time.Sleep(time.Minute * time.Duration(bIdx)) 67 68 return machine.Call(ctx, "FileBench.Run", 69 benchRequest{pathPrefixes, pathSuffix0, pathSuffixes}, 70 &results[bIdx]) 71 }) 72 73 for bIdx, result := range results { 74 if result == "" { 75 continue 76 } 77 if bIdx > 0 { 78 _, err := fmt.Fprintln(out) 79 must.Nil(err) 80 } 81 _, err := fmt.Fprintf(out, "[%d] %s\n%s", bIdx, d.Bs[bIdx].Name(), result) 82 must.Nil(err) 83 } 84 return err 85 } 86 87 type ( 88 benchService struct{ ReadSizes } 89 benchRequest struct { 90 PathPrefixes []Prefix 91 PathSuffix0 string 92 PathSuffixes []string 93 } 94 95 fuseService struct{} 96 ) 97 98 func init() { 99 gob.Register(benchService{}) 100 gob.Register(fuseService{}) 101 } 102 103 func (s benchService) Run(ctx context.Context, req benchRequest, out *string) error { 104 var buf bytes.Buffer 105 s.ReadSizes.RunAndPrint(ctx, &buf, req.PathPrefixes, req.PathSuffix0, req.PathSuffixes...) 106 *out = buf.String() 107 return nil 108 } 109 110 // AddS3FUSE configures d so that each machine running benchmarks can access S3 objects through 111 // the local filesystem, at mountPath. For example, object s3://b/my/key will appear at 112 // $mountPath/b/my/key. Callers can use this to construct paths for RunAndPrint. 113 func (d Bigmachine) AddS3FUSE() (mountPath string) { 114 must.True(len(s3FUSEBinary) > 0) 115 d.Services["s3FUSE"] = fuseService{} 116 return s3FUSEPath 117 } 118 119 const s3FUSEPath = "/tmp/s3" 120 121 func (fuseService) Init(*bigmachine.B) (err error) { 122 defer func() { 123 if err != nil { 124 err = errors.E(err, errors.Fatal) 125 } 126 }() 127 if err := os.MkdirAll(s3FUSEPath, 0700); err != nil { 128 return err 129 } 130 ents, err := os.ReadDir(s3FUSEPath) 131 if err != nil { 132 return err 133 } 134 if len(ents) > 0 { 135 return errors.New("s3 fuse mount is non-empty") 136 } 137 tmpDir, err := os.MkdirTemp("", "s3fuse-*") 138 if err != nil { 139 return err 140 } 141 exe := path.Join(tmpDir, "s3fuse") 142 if err := os.WriteFile(exe, s3FUSEBinary, 0700); err != nil { 143 return err 144 } 145 cmdErrC := make(chan error) 146 go func() { 147 out, err := exec.Command(exe, s3FUSEPath).CombinedOutput() 148 if err == nil { 149 err = errors.E("s3fuse exited unexpectedly") 150 } 151 cmdErrC <- errors.E(err, fmt.Sprintf("s3fuse output:\n%s", out)) 152 }() 153 readDirC := make(chan error) 154 go func() { 155 for { 156 ents, err := os.ReadDir(s3FUSEPath) 157 if err != nil { 158 readDirC <- err 159 return 160 } 161 if len(ents) > 0 { 162 readDirC <- nil 163 } 164 time.Sleep(time.Second) 165 } 166 }() 167 select { 168 case err = <-cmdErrC: 169 case err = <-readDirC: 170 case <-time.After(10 * time.Second): 171 err = errors.New("ran out of time waiting for FUSE mount") 172 } 173 return err 174 }