code.vegaprotocol.io/vega@v0.79.0/visor/binaries_runner.go (about) 1 // Copyright (C) 2023 Gobalsky Labs Limited 2 // 3 // This program is free software: you can redistribute it and/or modify 4 // it under the terms of the GNU Affero General Public License as 5 // published by the Free Software Foundation, either version 3 of the 6 // License, or (at your option) any later version. 7 // 8 // This program is distributed in the hope that it will be useful, 9 // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 // GNU Affero General Public License for more details. 12 // 13 // You should have received a copy of the GNU Affero General Public License 14 // along with this program. If not, see <http://www.gnu.org/licenses/>. 15 16 package visor 17 18 import ( 19 "context" 20 "errors" 21 "fmt" 22 "os" 23 "os/exec" 24 "path" 25 "path/filepath" 26 "strconv" 27 "sync" 28 "syscall" 29 "time" 30 31 "code.vegaprotocol.io/vega/core/types" 32 "code.vegaprotocol.io/vega/logging" 33 "code.vegaprotocol.io/vega/visor/config" 34 "code.vegaprotocol.io/vega/visor/utils" 35 36 "golang.org/x/sync/errgroup" 37 ) 38 39 const snapshotBlockHeightFlagName = "--snapshot.load-from-block-height" 40 41 type BinariesRunner struct { 42 mut sync.RWMutex 43 running map[int]*exec.Cmd 44 binsFolder string 45 log *logging.Logger 46 stopDelay time.Duration 47 stopTimeout time.Duration 48 releaseInfo *types.ReleaseInfo 49 } 50 51 func NewBinariesRunner(log *logging.Logger, binsFolder string, stopDelay, stopTimeout time.Duration, rInfo *types.ReleaseInfo) *BinariesRunner { 52 return &BinariesRunner{ 53 binsFolder: binsFolder, 54 running: map[int]*exec.Cmd{}, 55 log: log, 56 stopDelay: stopDelay, 57 stopTimeout: stopTimeout, 58 releaseInfo: rInfo, 59 } 60 } 61 62 func (r *BinariesRunner) cleanBinaryPath(binPath string) string { 63 if !filepath.IsAbs(binPath) { 64 return path.Join(r.binsFolder, binPath) 65 } 66 67 return binPath 68 } 69 70 func (r *BinariesRunner) runBinary(ctx context.Context, binPath string, args []string) error { 71 binPath = r.cleanBinaryPath(binPath) 72 73 if err := utils.EnsureBinary(binPath); err != nil { 74 return fmt.Errorf("failed to locate binary %s %v: %w", binPath, args, err) 75 } 76 77 if r.releaseInfo != nil { 78 if err := ensureBinaryVersion(binPath, r.releaseInfo.VegaReleaseTag); err != nil { 79 return err 80 } 81 } 82 83 cmd := exec.CommandContext(ctx, binPath, args...) 84 cmd.Stdout = os.Stdout 85 cmd.Stderr = os.Stderr 86 87 r.log.Debug("Starting binary", 88 logging.String("binaryPath", binPath), 89 logging.Strings("args", args), 90 ) 91 92 if err := cmd.Start(); err != nil { 93 return fmt.Errorf("failed to start binary %s %v: %w", binPath, args, err) 94 } 95 96 processID := cmd.Process.Pid 97 98 // Ensures that if one binary fails all of them are killed 99 go func() { 100 <-ctx.Done() 101 102 if cmd.Process == nil { 103 return 104 } 105 106 // Process has already exited - no need to kill it 107 if cmd.ProcessState != nil { 108 return 109 } 110 111 r.log.Debug("Stopping binary", logging.String("binaryPath", binPath)) 112 113 if err := cmd.Process.Signal(syscall.SIGTERM); err != nil { 114 r.log.Debug("Failed to stop binary, resorting to force kill", 115 logging.String("binaryPath", binPath), 116 logging.Error(err), 117 ) 118 if err := cmd.Process.Kill(); err != nil { 119 r.log.Debug("Failed to force kill binary", 120 logging.String("binaryPath", binPath), 121 logging.Error(err), 122 ) 123 } 124 } 125 }() 126 127 r.mut.Lock() 128 r.running[processID] = cmd 129 r.mut.Unlock() 130 131 defer func() { 132 r.mut.Lock() 133 delete(r.running, processID) 134 r.mut.Unlock() 135 }() 136 137 if err := cmd.Wait(); err != nil { 138 return fmt.Errorf("failed after waiting for binary %s %v: %w", binPath, args, err) 139 } 140 141 return nil 142 } 143 144 func (r *BinariesRunner) prepareVegaArgs(runConf *config.RunConfig, isRestart bool) (Args, error) { 145 args := Args(runConf.Vega.Binary.Args) 146 147 // if a node restart happens (not due protocol upgrade) and data node is present 148 // we need to make sure that they will start on the block that data node has already processed. 149 if isRestart && runConf.DataNode != nil { 150 r.log.Debug("Getting latest history segment from data node (will lock the latest LevelDB snapshot!)") 151 // this locks the levelDB file 152 latestSegment, err := latestDataNodeHistorySegment( 153 r.cleanBinaryPath(runConf.DataNode.Binary.Path), 154 runConf.DataNode.Binary.Args, 155 ) 156 r.log.Debug("Got latest history segment from data node", logging.Bool("success", err == nil)) 157 158 if err == nil { 159 args.Set(snapshotBlockHeightFlagName, strconv.FormatUint(uint64(latestSegment.LatestSegment.Height), 10)) 160 return args, nil 161 } 162 163 // no segment was found - do not load from snapshot 164 if errors.Is(err, ErrNoHistorySegmentFound) { 165 return args, nil 166 } 167 168 return nil, fmt.Errorf("failed to get latest history segment from data node: %w", err) 169 } 170 171 if r.releaseInfo != nil { 172 args.Set(snapshotBlockHeightFlagName, strconv.FormatUint(r.releaseInfo.UpgradeBlockHeight, 10)) 173 } 174 175 return args, nil 176 } 177 178 func (r *BinariesRunner) Run(ctx context.Context, runConf *config.RunConfig, isRestart bool) chan error { 179 r.log.Debug("Preparing Vega args") 180 errChan := make(chan error) 181 eg, ctx := errgroup.WithContext(ctx) 182 183 // this may call a datanode CLI so to avoid a race we do this sync before we start 184 // the data node process 185 args, err := r.prepareVegaArgs(runConf, isRestart) 186 if err != nil { 187 go func() { 188 errChan <- fmt.Errorf("failed to prepare args for Vega binary: %w", err) 189 }() 190 return errChan 191 } 192 193 eg.Go(func() error { 194 r.log.Debug("Starting Vega binary") 195 return r.runBinary(ctx, runConf.Vega.Binary.Path, args) 196 }) 197 198 if runConf.DataNode != nil { 199 eg.Go(func() error { 200 r.log.Debug("Starting Data Node binary") 201 return r.runBinary(ctx, runConf.DataNode.Binary.Path, runConf.DataNode.Binary.Args) 202 }) 203 } 204 205 go func() { 206 err := eg.Wait() 207 if err != nil { 208 errChan <- err 209 } 210 }() 211 212 return errChan 213 } 214 215 func (r *BinariesRunner) signal(signal syscall.Signal) error { 216 r.mut.RLock() 217 defer r.mut.RUnlock() 218 219 var err error 220 for _, c := range r.running { 221 r.log.Info("Signaling process", 222 logging.String("binaryName", c.Path), 223 logging.String("signal", signal.String()), 224 logging.Strings("args", c.Args), 225 ) 226 227 err = c.Process.Signal(signal) 228 if err != nil { 229 r.log.Error("Failed to signal running binary", 230 logging.String("binaryPath", c.Path), 231 logging.Strings("args", c.Args), 232 logging.Error(err), 233 ) 234 } 235 } 236 237 return err 238 } 239 240 func (r *BinariesRunner) Stop() error { 241 r.log.Info("Stopping binaries", logging.Duration("stop delay", r.stopDelay)) 242 243 time.Sleep(r.stopDelay) 244 245 if err := r.signal(syscall.SIGTERM); err != nil { 246 return err 247 } 248 249 r.mut.RLock() 250 timeout := time.After(r.stopTimeout) 251 r.mut.RUnlock() 252 253 ticker := time.NewTicker(time.Second / 10) 254 defer ticker.Stop() 255 256 for { 257 select { 258 case <-timeout: 259 return fmt.Errorf("failed to gracefully shut down processes: timed out") 260 case <-ticker.C: 261 r.mut.RLock() 262 if len(r.running) == 0 { 263 r.mut.RUnlock() 264 return nil 265 } 266 r.mut.RUnlock() 267 } 268 } 269 } 270 271 func (r *BinariesRunner) Kill() error { 272 return r.signal(syscall.SIGKILL) 273 }