github.com/square/finch@v0.0.0-20240412205204-6530c03e2b96/compute/server.go (about) 1 // Copyright 2024 Block, Inc. 2 3 package compute 4 5 import ( 6 "context" 7 "fmt" 8 "log" 9 "os" 10 "path/filepath" 11 "sync" 12 13 "github.com/rs/xid" 14 15 "github.com/square/finch" 16 "github.com/square/finch/config" 17 "github.com/square/finch/data" 18 "github.com/square/finch/stage" 19 "github.com/square/finch/stats" 20 ) 21 22 // Server coordinates instances: the local and any remotes. Server implements 23 // Compute so server.Server (the Finch core server) can run as a client or server. 24 type Server struct { 25 api *API // handles remote compute (rc) 26 name string // defaults to "local" 27 test bool 28 // -- 29 gds *data.Scope // global data scope 30 cfg config.Stage 31 } 32 33 type ack struct { 34 name string // "" for local, else remote.name 35 err error 36 } 37 38 func NewServer(name, addr string, test bool) *Server { 39 s := &Server{ 40 name: name, 41 test: test, 42 gds: data.NewScope(), // global data 43 } 44 if addr != "" { 45 s.api = NewAPI(finch.WithPort(addr, finch.DEFAULT_SERVER_PORT)) 46 } 47 return s 48 } 49 50 func (s *Server) Run(ctxFinch context.Context, stages []config.Stage) error { 51 for _, cfg := range stages { 52 // cd dir of config file so relative file paths in config work 53 if err := os.Chdir(filepath.Dir(cfg.File)); err != nil { 54 return err 55 } 56 57 if err := s.run(ctxFinch, cfg); err != nil { 58 return err 59 } 60 61 if ctxFinch.Err() != nil { 62 finch.Debug("finch terminated") 63 return nil 64 } 65 } 66 return nil 67 } 68 69 // Run runs all the stages on all the instances (local and remote). 70 func (s *Server) run(ctxFinch context.Context, cfg config.Stage) error { 71 var err error 72 stageName := cfg.Name 73 74 nInstances := finch.Uint(cfg.Compute.Instances) 75 nRemotes := nInstances - 1 // -1 for local unless.. 76 if cfg.Compute.DisableLocal { 77 nRemotes += 1 // no local, so all instances are remote 78 } 79 if nRemotes == 0 { 80 fmt.Printf("#\n# %s\n#\n", stageName) 81 } else { 82 cfg.Id = xid.New().String() // unique stage ID for remotes 83 fmt.Printf("#\n# %s (%s)\n#\n", stageName, cfg.Id) 84 } 85 86 m := &stageMeta{ 87 Mutex: &sync.Mutex{}, 88 cfg: cfg, 89 nRemotes: nRemotes, 90 bootChan: make(chan ack, nInstances), 91 runChan: make(chan struct{}), 92 doneChan: make(chan ack, nInstances), 93 clients: map[string]*client{}, 94 } 95 96 if !config.True(cfg.Stats.Disable) { 97 m.stats, err = stats.NewCollector(cfg.Stats, s.name, nInstances) 98 if err != nil { 99 return err 100 } 101 } 102 103 s.gds.Reset() // keep data global and stage data, delete the rest 104 105 // Create and boot local instance first because if this doesn't work, 106 // then remotes shouldn't work either because they all boot with the 107 // exact same config. 108 var local *stage.Stage 109 if !cfg.Compute.DisableLocal { 110 local = stage.New(cfg, s.gds, m.stats) 111 if err := local.Prepare(ctxFinch); err != nil { 112 return err 113 } 114 m.bootChan <- ack{name: s.name} // must ack local, too 115 } 116 117 // Set stage in API to trigger remote instances to boot 118 if s.api != nil && nRemotes > 0 { 119 if err := s.api.Stage(m); err != nil { 120 return err 121 } 122 } 123 124 // Wait for the required number instances to boot. If running only local, 125 // this will be instant because local already booted and acked above. 126 // But with remotes, this might take a few milliseconds over the network. 127 if nInstances > 1 { 128 log.Printf("Waiting for %d instances to boot...", nInstances) 129 } 130 booted := uint(0) 131 for booted < nInstances { 132 select { 133 case ack := <-m.bootChan: 134 if ack.err != nil { 135 log.Printf("Remote %s error on boot: %s", ack.name, ack.err) 136 continue 137 } 138 booted += 1 139 if nInstances > 1 { 140 log.Printf("%s booted", ack.name) 141 } 142 case <-ctxFinch.Done(): 143 return nil 144 } 145 } 146 147 // Close stage in API to prevent remotes from joining 148 m.Lock() 149 m.booted = true 150 m.Unlock() 151 152 if s.test { 153 return nil 154 } 155 156 // ---------------------------------------------------------------------- 157 // Run stage 158 // ---------------------------------------------------------------------- 159 160 finch.Debug("run %s", stageName) 161 close(m.runChan) // signal remotes to run 162 163 if local != nil { // start local instance 164 go func() { 165 local.Run(ctxFinch) 166 m.doneChan <- ack{name: s.name} 167 }() 168 } 169 170 // Wait for instances to finish running 171 running := booted 172 for running > 0 { 173 select { 174 case ack := <-m.doneChan: 175 running -= 1 176 if ack.err != nil { 177 log.Printf("%s error running stage %s: %s", ack.name, stageName, ack.err) 178 } 179 if nInstances > 1 { 180 log.Printf("%s completed stage %s", ack.name, stageName) 181 if running > 0 { 182 log.Printf("%d/%d instances running", running, nInstances) 183 } 184 } 185 case <-ctxFinch.Done(): 186 // Signal remote instances to stop early and (maybe) send finals stats 187 if s.api != nil { 188 s.api.Stage(nil) 189 } 190 } 191 } 192 193 return nil 194 }