github.com/square/finch@v0.0.0-20240412205204-6530c03e2b96/compute/client.go (about) 1 // Copyright 2023 Block, Inc. 2 3 package compute 4 5 import ( 6 "context" 7 "encoding/json" 8 "fmt" 9 "log" 10 "net/http" 11 "os" 12 "path/filepath" 13 "strings" 14 "time" 15 16 "github.com/square/finch" 17 "github.com/square/finch/config" 18 "github.com/square/finch/data" 19 "github.com/square/finch/proto" 20 "github.com/square/finch/stage" 21 "github.com/square/finch/stats" 22 ) 23 24 // Client is a remote Instance that sends everything to the --server specified 25 // on the command line. The client handles client-server communication, and it 26 // wraps a Local that runs stages locally. 27 type Client struct { 28 name string 29 addr string 30 // -- 31 gds *data.Scope 32 client *proto.Client 33 } 34 35 func NewClient(name, addr string) *Client { 36 if !strings.HasPrefix(addr, "http://") { 37 addr = "http://" + addr 38 } 39 40 return &Client{ 41 name: name, 42 addr: strings.TrimSuffix(addr, "/"), 43 // -- 44 gds: data.NewScope(), 45 client: proto.NewClient(name, addr), 46 } 47 } 48 49 func (c *Client) Run(ctxFinch context.Context) error { 50 //for { 51 c.gds.Reset() // keep data from globally-scoped generators; delete the rest 52 if err := c.run(ctxFinch); err != nil { 53 if ctxFinch.Err() != nil { 54 return nil 55 } 56 log.Println(err) 57 time.Sleep(2 * time.Second) // prevent uncontrolled error loop 58 } 59 //} 60 return nil 61 } 62 63 func (c *Client) run(ctxFinch context.Context) error { 64 // ------------------------------------------------------------------ 65 // Fetch stage fails (wait for GET /boot to return) 66 var cfg config.Stage 67 log.Printf("Waiting to boot from %s...", c.addr) 68 c.client.PrintErrors = false 69 _, body, err := c.client.Get(ctxFinch, "/boot", nil, proto.R{2 * time.Second, 1 * time.Second, -1}) 70 if err != nil { 71 return err 72 } 73 c.client.PrintErrors = true 74 if err := json.Unmarshal(body, &cfg); err != nil { 75 return fmt.Errorf("cannot decode stage config file from server: %s", err) 76 } 77 stageName := cfg.Name 78 c.client.StageId = cfg.Id 79 defer func() { c.client.StageId = "" }() 80 fmt.Printf("#\n# %s (%s)\n#\n", stageName, cfg.Id) 81 82 // ---------------------------------------------------------------------- 83 // Fetch all stage and trx files from server, put in local temp dir 84 tmpdir, err := os.MkdirTemp("", "finch") 85 if err != nil { 86 return fmt.Errorf("cannot make temp dir: %s", err) 87 } 88 if !finch.Debugging { 89 defer os.RemoveAll(tmpdir) 90 } 91 finch.Debug("tmp dir: %s", tmpdir) 92 if err := c.getTrxFiles(ctxFinch, cfg, tmpdir); err != nil { 93 return err 94 } 95 96 // ------------------------------------------------------------------ 97 // Local boot and ack 98 for k := range cfg.Stats.Report { 99 if k == "stdout" { 100 continue 101 } 102 delete(cfg.Stats.Report, k) 103 } 104 cfg.Stats.Report["server"] = map[string]string{ 105 "server": c.addr, 106 "client": c.name, 107 "stage-id": c.client.StageId, 108 } 109 stats, err := stats.NewCollector(cfg.Stats, c.name, 1) 110 if err != nil { 111 return err 112 } 113 114 log.Printf("[%s] Booting", stageName) 115 local := stage.New(cfg, c.gds, stats) 116 if err := local.Prepare(ctxFinch); err != nil { 117 log.Printf("[%s] Boot error, notifying server: %s", stageName, err) 118 c.client.Send(ctxFinch, "/boot", err.Error(), proto.R{500 * time.Millisecond, 100 * time.Millisecond, 3}) // don't care if this fails 119 return err // return original error not Send error 120 } 121 122 // Boot ack; don't continue on error because we're no longer in sync with server 123 log.Printf("[%s] Boot successful, notifying server", stageName) 124 if err := c.client.Send(ctxFinch, "/boot", nil, proto.R{1 * time.Second, 100 * time.Millisecond, 10}); err != nil { 125 log.Printf("[%s] Sending book ack to server failed: %s", stageName, err) 126 return err 127 } 128 129 // ---------------------------------------------------------------------- 130 // Wait for run signal. This might be a little while if server is for 131 // other remote instances. 132 log.Printf("[%s] Waiting for run signal", stageName) 133 resp, _, err := c.client.Get(ctxFinch, "/run", nil, proto.R{60 * time.Second, 100 * time.Millisecond, 3}) 134 if err != nil { 135 log.Printf("[%s] Timeout waiting for run signal after successful boot, giving up (is the server offline?)", stageName) 136 return err 137 } 138 if resp.StatusCode == http.StatusResetContent { 139 log.Printf("[%s] Boot test successful", stageName) 140 return nil 141 } 142 143 // ---------------------------------------------------------------------- 144 // Local run and ack 145 ctxRun, cancelRun := context.WithCancel(ctxFinch) 146 doneChan := make(chan struct{}) 147 defer close(doneChan) 148 lostServer := false 149 stageDone := false 150 go func() { 151 defer cancelRun() 152 for { 153 time.Sleep(1 * time.Second) 154 select { 155 case <-doneChan: 156 finch.Debug("stop check goroutine stopped") 157 return 158 default: 159 } 160 resp, _, err := c.client.Get(ctxFinch, "/ping", nil, proto.R{500 * time.Millisecond, 100 * time.Millisecond, 5}) 161 if err != nil { 162 log.Printf("[%s] Lost contact with server while running, aborting", stageName) 163 lostServer = true 164 return 165 } 166 if resp.StatusCode == http.StatusResetContent { 167 stageDone = true 168 log.Printf("[%s] Server stopped stage", stageName) 169 return 170 } 171 } 172 }() 173 174 local.Run(ctxRun) 175 log.Printf("[%s] Run stopped: %v (lost server:%v stage stopped:%v); sending done signal to server (5s timeout)", stageName, err, lostServer, stageDone) 176 177 // Run ack; ok if this fails because we're done, nothing left to sync with server 178 ctxDone, ctxCancel := context.WithTimeout(context.Background(), 5*time.Second) 179 defer ctxCancel() 180 if err := c.client.Send(ctxDone, "/run", err, proto.R{500 * time.Millisecond, 100 * time.Millisecond, 3}); err != nil { 181 log.Printf("[%s] Sending done signal to server failed, ignoring: %s", stageName, err) 182 } 183 184 return nil 185 } 186 187 func (c *Client) getTrxFiles(ctxFinch context.Context, cfg config.Stage, tmpdir string) error { 188 trx := cfg.Trx 189 for i := range trx { 190 if config.FileExists(trx[i].File) { 191 log.Printf("Have local stage %s file %s; not fetching from server", cfg.Name, trx[i].File) 192 continue 193 } 194 log.Printf("Fetching stage %s file %s...", cfg.Name, trx[i].File) 195 ref := [][]string{ 196 {"stage", cfg.Name}, 197 {"i", fmt.Sprintf("%d", i)}, 198 } 199 resp, body, err := c.client.Get(ctxFinch, "/file", ref, proto.R{5 * time.Second, 100 * time.Millisecond, 3}) 200 if err != nil { 201 return err // Get retries so error is final 202 } 203 finch.Debug("%+v", resp) 204 205 filename := filepath.Join(tmpdir, filepath.Base(trx[i].File)) 206 f, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE, 0440) 207 if err != nil { 208 return err 209 } 210 if _, err := f.Write(body); err != nil { 211 return err 212 } 213 if err := f.Close(); err != nil { 214 return err 215 } 216 finch.Debug("wrote %s", filename) 217 trx[i].File = filename 218 } 219 return nil 220 }