code.vegaprotocol.io/vega@v0.79.0/visor/visor.go (about) 1 // Copyright (C) 2023 Gobalsky Labs Limited 2 // 3 // This program is free software: you can redistribute it and/or modify 4 // it under the terms of the GNU Affero General Public License as 5 // published by the Free Software Foundation, either version 3 of the 6 // License, or (at your option) any later version. 7 // 8 // This program is distributed in the hope that it will be useful, 9 // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 // GNU Affero General Public License for more details. 12 // 13 // You should have received a copy of the GNU Affero General Public License 14 // along with this program. If not, see <http://www.gnu.org/licenses/>. 15 16 package visor 17 18 import ( 19 "context" 20 "fmt" 21 "time" 22 23 "code.vegaprotocol.io/vega/core/types" 24 "code.vegaprotocol.io/vega/logging" 25 "code.vegaprotocol.io/vega/visor/client" 26 "code.vegaprotocol.io/vega/visor/config" 27 "code.vegaprotocol.io/vega/visor/utils" 28 ) 29 30 const ( 31 upgradeAPICallTickerDuration = time.Second * 2 32 namedLogger = "visor" 33 ) 34 35 type Visor struct { 36 conf *config.VisorConfig 37 clientFactory client.Factory 38 log *logging.Logger 39 } 40 41 func NewVisor(ctx context.Context, log *logging.Logger, clientFactory client.Factory, homePath string) (*Visor, error) { 42 homePath, err := utils.AbsPath(homePath) 43 if err != nil { 44 return nil, err 45 } 46 47 homeExists, err := utils.PathExists(homePath) 48 if err != nil { 49 return nil, err 50 } 51 52 if !homeExists { 53 return nil, fmt.Errorf("visor is not initialized, call the `init` command first") 54 } 55 56 visorConf, err := config.NewVisorConfig(log, homePath) 57 if err != nil { 58 // Do not wrap error as underlying errors are meaningful enough. 59 return nil, err 60 } 61 62 currentFolderExists, err := utils.PathExists(visorConf.CurrentRunConfigPath()) 63 if err != nil { 64 return nil, err 65 } 66 67 v := &Visor{ 68 conf: visorConf, 69 clientFactory: clientFactory, 70 log: log.Named(namedLogger), 71 } 72 73 if !currentFolderExists { 74 if err := v.setCurrentFolder(visorConf.GenesisFolder(), visorConf.CurrentFolder()); err != nil { 75 return nil, fmt.Errorf("failed to set current folder to %q: %w", visorConf.CurrentFolder(), err) 76 } 77 } 78 79 go v.watchForConfigUpdates(ctx) 80 81 return v, nil 82 } 83 84 func (v *Visor) watchForConfigUpdates(ctx context.Context) { 85 for { 86 v.log.Debug("starting config file watcher") 87 if err := v.conf.WatchForUpdate(ctx); err != nil { 88 v.log.Error("config file watcher has failed", logging.Error(err)) 89 } 90 } 91 } 92 93 func (v *Visor) Run(ctx context.Context) error { 94 numOfRestarts := 0 95 var currentReleaseInfo *types.ReleaseInfo 96 97 upgradeTicker := time.NewTicker(upgradeAPICallTickerDuration) 98 defer upgradeTicker.Stop() 99 100 ctx, cancel := context.WithCancel(ctx) 101 defer cancel() 102 103 var isRestarting bool 104 105 for { 106 runConf, err := config.ParseRunConfig(v.conf.CurrentRunConfigPath()) 107 if err != nil { 108 return fmt.Errorf("failed to parse run config: %w", err) 109 } 110 111 c := v.clientFactory.GetClient( 112 runConf.Vega.RCP.SocketPath, 113 runConf.Vega.RCP.HTTPPath, 114 ) 115 116 // how many times to try and connect on the first start up of the binaries 117 maxNumberOfFirstConnectionRetries := v.conf.MaxNumberOfFirstConnectionRetries() 118 119 // how many times to try and connect in subsequent restarts of the binaries where it is expected to be much quicker 120 maxUpgradeStatusErrs := v.conf.MaxNumberOfRestartConnectionRetries() 121 numOfUpgradeStatusErrs := 0 122 maxNumRestarts := v.conf.MaxNumberOfRestarts() 123 restartsDelay := time.Second * time.Duration(v.conf.RestartsDelaySeconds()) 124 125 if isRestarting { 126 v.log.Info("Restarting binaries") 127 } else { 128 v.log.Info("Starting binaries") 129 } 130 131 binRunner := NewBinariesRunner( 132 v.log, 133 v.conf.CurrentFolder(), 134 time.Second*time.Duration(v.conf.StopDelaySeconds()), 135 time.Second*time.Duration(v.conf.StopSignalTimeoutSeconds()), 136 currentReleaseInfo, 137 ) 138 binErrs := binRunner.Run(ctx, runConf, isRestarting) 139 140 upgradeTicker.Reset(upgradeAPICallTickerDuration) 141 isRestarting = false 142 143 CheckLoop: 144 for { 145 select { 146 case <-ctx.Done(): 147 return ctx.Err() 148 case err := <-binErrs: 149 v.log.Error("Binaries executions has failed", logging.Error(err)) 150 151 if numOfRestarts >= maxNumRestarts { 152 return fmt.Errorf("maximum number of possible restarts has been reached: %w", err) 153 } 154 155 numOfRestarts++ 156 v.log.Info("Binaries restart is scheduled", logging.Duration("restartDelay", restartsDelay)) 157 time.Sleep(restartsDelay) 158 v.log.Info("Restarting binaries", logging.Int("remainingRestarts", maxNumRestarts-numOfRestarts)) 159 160 isRestarting = true 161 162 break CheckLoop 163 case <-upgradeTicker.C: 164 upStatus, err := c.UpgradeStatus(ctx) 165 if err != nil { 166 // Binary has not started yet - waiting for first startup 167 isFirstStartup := numOfRestarts == 0 168 if isFirstStartup { 169 if numOfUpgradeStatusErrs >= maxNumberOfFirstConnectionRetries { 170 return failedToGetStatusErr(maxNumberOfFirstConnectionRetries, err) 171 } 172 } else { // Binary has been started already. Something has failed after the startup 173 if numOfUpgradeStatusErrs >= maxUpgradeStatusErrs { 174 return failedToGetStatusErr(maxUpgradeStatusErrs, err) 175 } 176 } 177 178 v.log.Debug("Failed to get upgrade status from API", logging.Error(err)) 179 180 numOfUpgradeStatusErrs++ 181 182 attemptsLeft := maxUpgradeStatusErrs - numOfUpgradeStatusErrs 183 if isFirstStartup { 184 attemptsLeft = maxNumberOfFirstConnectionRetries - numOfUpgradeStatusErrs 185 } 186 v.log.Info("Still waiting for vega to start...", logging.Int("attemptLeft", attemptsLeft)) 187 188 break 189 } 190 191 if !upStatus.ReadyToUpgrade { 192 numOfUpgradeStatusErrs = 0 193 break 194 } 195 196 currentReleaseInfo = upStatus.AcceptedReleaseInfo 197 198 v.log.Info("Preparing upgrade") 199 200 if err := binRunner.Stop(); err != nil { 201 v.log.Info("Failed to stop binaries, resorting to force kill", logging.Error(err)) 202 if err := binRunner.Kill(); err != nil { 203 return fmt.Errorf("failed to force kill the running processes: %w", err) 204 } 205 } 206 207 v.log.Info("Starting upgrade") 208 209 if err := v.prepareNextUpgradeFolder(ctx, currentReleaseInfo.VegaReleaseTag); err != nil { 210 return fmt.Errorf("failed to prepare next upgrade folder: %w", err) 211 } 212 213 numOfRestarts = 0 214 numOfUpgradeStatusErrs = 0 215 216 break CheckLoop 217 } 218 } 219 } 220 } 221 222 func failedToGetStatusErr(numberOfErrs int, err error) error { 223 return fmt.Errorf("failed to get upgrade status for maximum amount of %d times: %w", numberOfErrs, err) 224 }