code.vegaprotocol.io/vega@v0.79.0/visor/visor.go (about)

     1  // Copyright (C) 2023 Gobalsky Labs Limited
     2  //
     3  // This program is free software: you can redistribute it and/or modify
     4  // it under the terms of the GNU Affero General Public License as
     5  // published by the Free Software Foundation, either version 3 of the
     6  // License, or (at your option) any later version.
     7  //
     8  // This program is distributed in the hope that it will be useful,
     9  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    10  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11  // GNU Affero General Public License for more details.
    12  //
    13  // You should have received a copy of the GNU Affero General Public License
    14  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15  
    16  package visor
    17  
    18  import (
    19  	"context"
    20  	"fmt"
    21  	"time"
    22  
    23  	"code.vegaprotocol.io/vega/core/types"
    24  	"code.vegaprotocol.io/vega/logging"
    25  	"code.vegaprotocol.io/vega/visor/client"
    26  	"code.vegaprotocol.io/vega/visor/config"
    27  	"code.vegaprotocol.io/vega/visor/utils"
    28  )
    29  
const (
	// upgradeAPICallTickerDuration is the period of the ticker on which Run
	// queries the upgrade status API.
	upgradeAPICallTickerDuration = time.Second * 2
	// namedLogger is the name applied to the logger held by Visor.
	namedLogger                  = "visor"
)
    34  
// Visor starts the configured binaries, restarts them when they fail and
// drives upgrades reported by the upgrade status API (see Run).
type Visor struct {
	// conf is the visor configuration loaded from the home path in NewVisor.
	conf          *config.VisorConfig
	// clientFactory creates the client Run uses to query the upgrade status.
	clientFactory client.Factory
	// log is the logger passed to NewVisor, named with namedLogger.
	log           *logging.Logger
}
    40  
    41  func NewVisor(ctx context.Context, log *logging.Logger, clientFactory client.Factory, homePath string) (*Visor, error) {
    42  	homePath, err := utils.AbsPath(homePath)
    43  	if err != nil {
    44  		return nil, err
    45  	}
    46  
    47  	homeExists, err := utils.PathExists(homePath)
    48  	if err != nil {
    49  		return nil, err
    50  	}
    51  
    52  	if !homeExists {
    53  		return nil, fmt.Errorf("visor is not initialized, call the `init` command first")
    54  	}
    55  
    56  	visorConf, err := config.NewVisorConfig(log, homePath)
    57  	if err != nil {
    58  		// Do not wrap error as underlying errors are meaningful enough.
    59  		return nil, err
    60  	}
    61  
    62  	currentFolderExists, err := utils.PathExists(visorConf.CurrentRunConfigPath())
    63  	if err != nil {
    64  		return nil, err
    65  	}
    66  
    67  	v := &Visor{
    68  		conf:          visorConf,
    69  		clientFactory: clientFactory,
    70  		log:           log.Named(namedLogger),
    71  	}
    72  
    73  	if !currentFolderExists {
    74  		if err := v.setCurrentFolder(visorConf.GenesisFolder(), visorConf.CurrentFolder()); err != nil {
    75  			return nil, fmt.Errorf("failed to set current folder to %q: %w", visorConf.CurrentFolder(), err)
    76  		}
    77  	}
    78  
    79  	go v.watchForConfigUpdates(ctx)
    80  
    81  	return v, nil
    82  }
    83  
    84  func (v *Visor) watchForConfigUpdates(ctx context.Context) {
    85  	for {
    86  		v.log.Debug("starting config file watcher")
    87  		if err := v.conf.WatchForUpdate(ctx); err != nil {
    88  			v.log.Error("config file watcher has failed", logging.Error(err))
    89  		}
    90  	}
    91  }
    92  
    93  func (v *Visor) Run(ctx context.Context) error {
    94  	numOfRestarts := 0
    95  	var currentReleaseInfo *types.ReleaseInfo
    96  
    97  	upgradeTicker := time.NewTicker(upgradeAPICallTickerDuration)
    98  	defer upgradeTicker.Stop()
    99  
   100  	ctx, cancel := context.WithCancel(ctx)
   101  	defer cancel()
   102  
   103  	var isRestarting bool
   104  
   105  	for {
   106  		runConf, err := config.ParseRunConfig(v.conf.CurrentRunConfigPath())
   107  		if err != nil {
   108  			return fmt.Errorf("failed to parse run config: %w", err)
   109  		}
   110  
   111  		c := v.clientFactory.GetClient(
   112  			runConf.Vega.RCP.SocketPath,
   113  			runConf.Vega.RCP.HTTPPath,
   114  		)
   115  
   116  		// how many times to try and connect on the first start up of the binaries
   117  		maxNumberOfFirstConnectionRetries := v.conf.MaxNumberOfFirstConnectionRetries()
   118  
   119  		// how many times to try and connect in subsequent restarts of the binaries where it is expected to be much quicker
   120  		maxUpgradeStatusErrs := v.conf.MaxNumberOfRestartConnectionRetries()
   121  		numOfUpgradeStatusErrs := 0
   122  		maxNumRestarts := v.conf.MaxNumberOfRestarts()
   123  		restartsDelay := time.Second * time.Duration(v.conf.RestartsDelaySeconds())
   124  
   125  		if isRestarting {
   126  			v.log.Info("Restarting binaries")
   127  		} else {
   128  			v.log.Info("Starting binaries")
   129  		}
   130  
   131  		binRunner := NewBinariesRunner(
   132  			v.log,
   133  			v.conf.CurrentFolder(),
   134  			time.Second*time.Duration(v.conf.StopDelaySeconds()),
   135  			time.Second*time.Duration(v.conf.StopSignalTimeoutSeconds()),
   136  			currentReleaseInfo,
   137  		)
   138  		binErrs := binRunner.Run(ctx, runConf, isRestarting)
   139  
   140  		upgradeTicker.Reset(upgradeAPICallTickerDuration)
   141  		isRestarting = false
   142  
   143  	CheckLoop:
   144  		for {
   145  			select {
   146  			case <-ctx.Done():
   147  				return ctx.Err()
   148  			case err := <-binErrs:
   149  				v.log.Error("Binaries executions has failed", logging.Error(err))
   150  
   151  				if numOfRestarts >= maxNumRestarts {
   152  					return fmt.Errorf("maximum number of possible restarts has been reached: %w", err)
   153  				}
   154  
   155  				numOfRestarts++
   156  				v.log.Info("Binaries restart is scheduled", logging.Duration("restartDelay", restartsDelay))
   157  				time.Sleep(restartsDelay)
   158  				v.log.Info("Restarting binaries", logging.Int("remainingRestarts", maxNumRestarts-numOfRestarts))
   159  
   160  				isRestarting = true
   161  
   162  				break CheckLoop
   163  			case <-upgradeTicker.C:
   164  				upStatus, err := c.UpgradeStatus(ctx)
   165  				if err != nil {
   166  					// Binary has not started yet - waiting for first startup
   167  					isFirstStartup := numOfRestarts == 0
   168  					if isFirstStartup {
   169  						if numOfUpgradeStatusErrs >= maxNumberOfFirstConnectionRetries {
   170  							return failedToGetStatusErr(maxNumberOfFirstConnectionRetries, err)
   171  						}
   172  					} else { // Binary has been started already. Something has failed after the startup
   173  						if numOfUpgradeStatusErrs >= maxUpgradeStatusErrs {
   174  							return failedToGetStatusErr(maxUpgradeStatusErrs, err)
   175  						}
   176  					}
   177  
   178  					v.log.Debug("Failed to get upgrade status from API", logging.Error(err))
   179  
   180  					numOfUpgradeStatusErrs++
   181  
   182  					attemptsLeft := maxUpgradeStatusErrs - numOfUpgradeStatusErrs
   183  					if isFirstStartup {
   184  						attemptsLeft = maxNumberOfFirstConnectionRetries - numOfUpgradeStatusErrs
   185  					}
   186  					v.log.Info("Still waiting for vega to start...", logging.Int("attemptLeft", attemptsLeft))
   187  
   188  					break
   189  				}
   190  
   191  				if !upStatus.ReadyToUpgrade {
   192  					numOfUpgradeStatusErrs = 0
   193  					break
   194  				}
   195  
   196  				currentReleaseInfo = upStatus.AcceptedReleaseInfo
   197  
   198  				v.log.Info("Preparing upgrade")
   199  
   200  				if err := binRunner.Stop(); err != nil {
   201  					v.log.Info("Failed to stop binaries, resorting to force kill", logging.Error(err))
   202  					if err := binRunner.Kill(); err != nil {
   203  						return fmt.Errorf("failed to force kill the running processes: %w", err)
   204  					}
   205  				}
   206  
   207  				v.log.Info("Starting upgrade")
   208  
   209  				if err := v.prepareNextUpgradeFolder(ctx, currentReleaseInfo.VegaReleaseTag); err != nil {
   210  					return fmt.Errorf("failed to prepare next upgrade folder: %w", err)
   211  				}
   212  
   213  				numOfRestarts = 0
   214  				numOfUpgradeStatusErrs = 0
   215  
   216  				break CheckLoop
   217  			}
   218  		}
   219  	}
   220  }
   221  
   222  func failedToGetStatusErr(numberOfErrs int, err error) error {
   223  	return fmt.Errorf("failed to get upgrade status for maximum amount of %d times: %w", numberOfErrs, err)
   224  }