code.vegaprotocol.io/vega@v0.79.0/visor/binaries_runner.go (about)

     1  // Copyright (C) 2023 Gobalsky Labs Limited
     2  //
     3  // This program is free software: you can redistribute it and/or modify
     4  // it under the terms of the GNU Affero General Public License as
     5  // published by the Free Software Foundation, either version 3 of the
     6  // License, or (at your option) any later version.
     7  //
     8  // This program is distributed in the hope that it will be useful,
     9  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    10  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11  // GNU Affero General Public License for more details.
    12  //
    13  // You should have received a copy of the GNU Affero General Public License
    14  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15  
    16  package visor
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"fmt"
    22  	"os"
    23  	"os/exec"
    24  	"path"
    25  	"path/filepath"
    26  	"strconv"
    27  	"sync"
    28  	"syscall"
    29  	"time"
    30  
    31  	"code.vegaprotocol.io/vega/core/types"
    32  	"code.vegaprotocol.io/vega/logging"
    33  	"code.vegaprotocol.io/vega/visor/config"
    34  	"code.vegaprotocol.io/vega/visor/utils"
    35  
    36  	"golang.org/x/sync/errgroup"
    37  )
    38  
// snapshotBlockHeightFlagName is the command line flag that prepareVegaArgs
// sets on the Vega binary arguments to select the block height of the
// snapshot to load from.
const snapshotBlockHeightFlagName = "--snapshot.load-from-block-height"
    40  
    41  type BinariesRunner struct {
    42  	mut         sync.RWMutex
    43  	running     map[int]*exec.Cmd
    44  	binsFolder  string
    45  	log         *logging.Logger
    46  	stopDelay   time.Duration
    47  	stopTimeout time.Duration
    48  	releaseInfo *types.ReleaseInfo
    49  }
    50  
    51  func NewBinariesRunner(log *logging.Logger, binsFolder string, stopDelay, stopTimeout time.Duration, rInfo *types.ReleaseInfo) *BinariesRunner {
    52  	return &BinariesRunner{
    53  		binsFolder:  binsFolder,
    54  		running:     map[int]*exec.Cmd{},
    55  		log:         log,
    56  		stopDelay:   stopDelay,
    57  		stopTimeout: stopTimeout,
    58  		releaseInfo: rInfo,
    59  	}
    60  }
    61  
    62  func (r *BinariesRunner) cleanBinaryPath(binPath string) string {
    63  	if !filepath.IsAbs(binPath) {
    64  		return path.Join(r.binsFolder, binPath)
    65  	}
    66  
    67  	return binPath
    68  }
    69  
    70  func (r *BinariesRunner) runBinary(ctx context.Context, binPath string, args []string) error {
    71  	binPath = r.cleanBinaryPath(binPath)
    72  
    73  	if err := utils.EnsureBinary(binPath); err != nil {
    74  		return fmt.Errorf("failed to locate binary %s %v: %w", binPath, args, err)
    75  	}
    76  
    77  	if r.releaseInfo != nil {
    78  		if err := ensureBinaryVersion(binPath, r.releaseInfo.VegaReleaseTag); err != nil {
    79  			return err
    80  		}
    81  	}
    82  
    83  	cmd := exec.CommandContext(ctx, binPath, args...)
    84  	cmd.Stdout = os.Stdout
    85  	cmd.Stderr = os.Stderr
    86  
    87  	r.log.Debug("Starting binary",
    88  		logging.String("binaryPath", binPath),
    89  		logging.Strings("args", args),
    90  	)
    91  
    92  	if err := cmd.Start(); err != nil {
    93  		return fmt.Errorf("failed to start binary %s %v: %w", binPath, args, err)
    94  	}
    95  
    96  	processID := cmd.Process.Pid
    97  
    98  	// Ensures that if one binary fails all of them are killed
    99  	go func() {
   100  		<-ctx.Done()
   101  
   102  		if cmd.Process == nil {
   103  			return
   104  		}
   105  
   106  		// Process has already exited - no need to kill it
   107  		if cmd.ProcessState != nil {
   108  			return
   109  		}
   110  
   111  		r.log.Debug("Stopping binary", logging.String("binaryPath", binPath))
   112  
   113  		if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
   114  			r.log.Debug("Failed to stop binary, resorting to force kill",
   115  				logging.String("binaryPath", binPath),
   116  				logging.Error(err),
   117  			)
   118  			if err := cmd.Process.Kill(); err != nil {
   119  				r.log.Debug("Failed to force kill binary",
   120  					logging.String("binaryPath", binPath),
   121  					logging.Error(err),
   122  				)
   123  			}
   124  		}
   125  	}()
   126  
   127  	r.mut.Lock()
   128  	r.running[processID] = cmd
   129  	r.mut.Unlock()
   130  
   131  	defer func() {
   132  		r.mut.Lock()
   133  		delete(r.running, processID)
   134  		r.mut.Unlock()
   135  	}()
   136  
   137  	if err := cmd.Wait(); err != nil {
   138  		return fmt.Errorf("failed after waiting for binary %s %v: %w", binPath, args, err)
   139  	}
   140  
   141  	return nil
   142  }
   143  
   144  func (r *BinariesRunner) prepareVegaArgs(runConf *config.RunConfig, isRestart bool) (Args, error) {
   145  	args := Args(runConf.Vega.Binary.Args)
   146  
   147  	// if a node restart happens (not due protocol upgrade) and data node is present
   148  	// we need to make sure that they will start on the block that data node has already processed.
   149  	if isRestart && runConf.DataNode != nil {
   150  		r.log.Debug("Getting latest history segment from data node (will lock the latest LevelDB snapshot!)")
   151  		// this locks the levelDB file
   152  		latestSegment, err := latestDataNodeHistorySegment(
   153  			r.cleanBinaryPath(runConf.DataNode.Binary.Path),
   154  			runConf.DataNode.Binary.Args,
   155  		)
   156  		r.log.Debug("Got latest history segment from data node", logging.Bool("success", err == nil))
   157  
   158  		if err == nil {
   159  			args.Set(snapshotBlockHeightFlagName, strconv.FormatUint(uint64(latestSegment.LatestSegment.Height), 10))
   160  			return args, nil
   161  		}
   162  
   163  		// no segment was found - do not load from snapshot
   164  		if errors.Is(err, ErrNoHistorySegmentFound) {
   165  			return args, nil
   166  		}
   167  
   168  		return nil, fmt.Errorf("failed to get latest history segment from data node: %w", err)
   169  	}
   170  
   171  	if r.releaseInfo != nil {
   172  		args.Set(snapshotBlockHeightFlagName, strconv.FormatUint(r.releaseInfo.UpgradeBlockHeight, 10))
   173  	}
   174  
   175  	return args, nil
   176  }
   177  
   178  func (r *BinariesRunner) Run(ctx context.Context, runConf *config.RunConfig, isRestart bool) chan error {
   179  	r.log.Debug("Preparing Vega args")
   180  	errChan := make(chan error)
   181  	eg, ctx := errgroup.WithContext(ctx)
   182  
   183  	// this may call a datanode CLI so to avoid a race we do this sync before we start
   184  	// the data node process
   185  	args, err := r.prepareVegaArgs(runConf, isRestart)
   186  	if err != nil {
   187  		go func() {
   188  			errChan <- fmt.Errorf("failed to prepare args for Vega binary: %w", err)
   189  		}()
   190  		return errChan
   191  	}
   192  
   193  	eg.Go(func() error {
   194  		r.log.Debug("Starting Vega binary")
   195  		return r.runBinary(ctx, runConf.Vega.Binary.Path, args)
   196  	})
   197  
   198  	if runConf.DataNode != nil {
   199  		eg.Go(func() error {
   200  			r.log.Debug("Starting Data Node binary")
   201  			return r.runBinary(ctx, runConf.DataNode.Binary.Path, runConf.DataNode.Binary.Args)
   202  		})
   203  	}
   204  
   205  	go func() {
   206  		err := eg.Wait()
   207  		if err != nil {
   208  			errChan <- err
   209  		}
   210  	}()
   211  
   212  	return errChan
   213  }
   214  
   215  func (r *BinariesRunner) signal(signal syscall.Signal) error {
   216  	r.mut.RLock()
   217  	defer r.mut.RUnlock()
   218  
   219  	var err error
   220  	for _, c := range r.running {
   221  		r.log.Info("Signaling process",
   222  			logging.String("binaryName", c.Path),
   223  			logging.String("signal", signal.String()),
   224  			logging.Strings("args", c.Args),
   225  		)
   226  
   227  		err = c.Process.Signal(signal)
   228  		if err != nil {
   229  			r.log.Error("Failed to signal running binary",
   230  				logging.String("binaryPath", c.Path),
   231  				logging.Strings("args", c.Args),
   232  				logging.Error(err),
   233  			)
   234  		}
   235  	}
   236  
   237  	return err
   238  }
   239  
   240  func (r *BinariesRunner) Stop() error {
   241  	r.log.Info("Stopping binaries", logging.Duration("stop delay", r.stopDelay))
   242  
   243  	time.Sleep(r.stopDelay)
   244  
   245  	if err := r.signal(syscall.SIGTERM); err != nil {
   246  		return err
   247  	}
   248  
   249  	r.mut.RLock()
   250  	timeout := time.After(r.stopTimeout)
   251  	r.mut.RUnlock()
   252  
   253  	ticker := time.NewTicker(time.Second / 10)
   254  	defer ticker.Stop()
   255  
   256  	for {
   257  		select {
   258  		case <-timeout:
   259  			return fmt.Errorf("failed to gracefully shut down processes: timed out")
   260  		case <-ticker.C:
   261  			r.mut.RLock()
   262  			if len(r.running) == 0 {
   263  				r.mut.RUnlock()
   264  				return nil
   265  			}
   266  			r.mut.RUnlock()
   267  		}
   268  	}
   269  }
   270  
   271  func (r *BinariesRunner) Kill() error {
   272  	return r.signal(syscall.SIGKILL)
   273  }