github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/upgradesteps/worker.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package upgradesteps

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names/v5"
	"github.com/juju/retry"
	"github.com/juju/version/v2"
	"github.com/juju/worker/v3"
	"gopkg.in/tomb.v2"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	agenterrors "github.com/juju/juju/cmd/jujud/agent/errors"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/state"
	"github.com/juju/juju/upgrades"
	jujuversion "github.com/juju/juju/version"
	"github.com/juju/juju/worker/gate"
	"github.com/juju/juju/wrench"
)

var logger = loggo.GetLogger("juju.worker.upgradesteps")

// TODO (manadart 2021-05-18): These are exported for tests and, in the case
// of the timeout, for feature tests. The timeout especially should be a
// dependency of the worker.
var (
	PerformUpgrade = upgrades.PerformUpgrade

	// UpgradeStartTimeoutController is the maximum time a controller will
	// wait for other controllers to come up and indicate they are ready
	// to begin running upgrade steps.
	UpgradeStartTimeoutController = time.Minute * 15
)

// NewLock creates a gate.Lock to be used to synchronise workers which
// need to start after upgrades have completed. The returned Lock should
// be passed to NewWorker. If the agent has already upgraded to the
// current version, then the lock will be returned in the released state.
func NewLock(agentConfig agent.Config) gate.Lock {
	lock := gate.NewLock()

	if wrench.IsActive(wrenchKey(agentConfig), "always-try-upgrade") {
		// Always enter upgrade mode. This allows testing of upgrades
		// even when there are actually no upgrade steps to run.
		return lock
	}

	// Build numbers are irrelevant to upgrade steps.
	upgradedToVersion := agentConfig.UpgradedToVersion().ToPatch()
	currentVersion := jujuversion.Current.ToPatch()
	if upgradedToVersion == currentVersion {
		logger.Infof(
			"upgrade steps for %v have already been run.",
			jujuversion.Current,
		)
		lock.Unlock()
	}

	return lock
}

// StatusSetter defines the single method required to set an agent's
// status.
type StatusSetter interface {
	SetStatus(setableStatus status.Status, info string, data map[string]interface{}) error
}
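
// The lock and worker are wired together by the calling agent. As an
// illustrative sketch only (the surrounding variable names are assumed
// and are not part of this file), construction from a caller's
// perspective looks roughly like:
//
//	lock := upgradesteps.NewLock(agentConfig)
//	w, err := upgradesteps.NewWorker(
//		lock, a, apiConn, isController, openState,
//		preUpgradeSteps, retryStrategy, statusSetter, isCaas,
//	)
//	if err != nil {
//		return errors.Trace(err)
//	}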

// NewWorker returns a new instance of the upgradeSteps worker. It
// will run any required steps to upgrade to the currently running
// Juju version.
func NewWorker(
	upgradeComplete gate.Lock,
	agent agent.Agent,
	apiConn api.Connection,
	isController bool,
	openState func() (*state.StatePool, error),
	preUpgradeSteps upgrades.PreUpgradeStepsFunc,
	retryStrategy retry.CallArgs,
	entity StatusSetter,
	isCaas bool,
) (worker.Worker, error) {
	w := &upgradeSteps{
		upgradeComplete: upgradeComplete,
		agent:           agent,
		apiConn:         apiConn,
		openState:       openState,
		preUpgradeSteps: preUpgradeSteps,
		retryStrategy:   retryStrategy,
		entity:          entity,
		tag:             agent.CurrentConfig().Tag(),
		isController:    isController,
		isCaas:          isCaas,
	}
	w.tomb.Go(w.run)
	return w, nil
}

type upgradeSteps struct {
	tomb            tomb.Tomb
	upgradeComplete gate.Lock
	agent           agent.Agent
	apiConn         api.Connection
	openState       func() (*state.StatePool, error)
	preUpgradeSteps upgrades.PreUpgradeStepsFunc
	entity          StatusSetter
	retryStrategy   retry.CallArgs

	fromVersion version.Number
	toVersion   version.Number
	tag         names.Tag
	// If the agent is a machine agent for a controller, flag that state
	// needs to be opened before running upgrade steps.
	isController bool
	isCaas       bool
	pool         *state.StatePool
}

// Kill is part of the worker.Worker interface.
func (w *upgradeSteps) Kill() {
	w.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *upgradeSteps) Wait() error {
	return w.tomb.Wait()
}

type apiLostDuringUpgrade struct {
	err error
}

func (e *apiLostDuringUpgrade) Error() string {
	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
}

func isAPILostDuringUpgrade(err error) bool {
	_, ok := err.(*apiLostDuringUpgrade)
	return ok
}

func (w *upgradeSteps) wrenchKey() string {
	return wrenchKey(w.agent.CurrentConfig())
}

func wrenchKey(agentConfig agent.Config) string {
	return agentConfig.Tag().Kind() + "-agent"
}

func (w *upgradeSteps) run() error {
	if wrench.IsActive(w.wrenchKey(), "fail-upgrade-start") {
		return nil // Make the worker stop.
	}

	if w.upgradeComplete.IsUnlocked() {
		// Our work is already done (we're probably being restarted
		// because the API connection has gone down), so do nothing.
		return nil
	}

	w.fromVersion = w.agent.CurrentConfig().UpgradedToVersion()
	w.toVersion = jujuversion.Current
	if w.fromVersion == w.toVersion {
		logger.Infof("upgrade to %v already completed.", w.toVersion)
		w.upgradeComplete.Unlock()
		return nil
	}

	// We need a *state.State for upgrades. We open it independently
	// of StateWorker, because we have no guarantees about when
	// and how often StateWorker might run.
	if w.isController {
		var err error
		if w.pool, err = w.openState(); err != nil {
			return err
		}
		defer func() { _ = w.pool.Close() }()

		st, err := w.pool.SystemState()
		if err != nil {
			return errors.Trace(err)
		}
		model, err := st.Model()
		if err != nil {
			return errors.Trace(err)
		}
		w.isCaas = model.Type() == state.ModelTypeCAAS
	}

	if err := w.runUpgrades(); err != nil {
		// Only return an error from the worker if the connection to
		// state went away (possible mongo primary change). Returning
		// an error when the connection is lost will cause the agent
		// to restart.
		//
		// For other errors, the error is not returned because we want
		// the agent to stay running in an error state waiting
		// for user intervention.
		if isAPILostDuringUpgrade(err) {
			return err
		}
		w.reportUpgradeFailure(err, false)
	} else {
		// Upgrade succeeded - signal that the upgrade is complete.
		logger.Infof("upgrade to %v completed successfully.", w.toVersion)
		_ = w.entity.SetStatus(status.Started, "", nil)
		w.upgradeComplete.Unlock()
	}
	return nil
}

// runUpgrades runs the upgrade operations for each job type and
// updates the agent's UpgradedToVersion on success.
func (w *upgradeSteps) runUpgrades() error {
	upgradeInfo, err := w.prepareForUpgrade()
	if err != nil {
		return err
	}

	if wrench.IsActive(w.wrenchKey(), "fail-upgrade") {
		return errors.New("wrench")
	}

	if err := w.agent.ChangeConfig(w.runUpgradeSteps); err != nil {
		return err
	}

	if err := w.finaliseUpgrade(upgradeInfo); err != nil {
		return err
	}
	return nil
}

func (w *upgradeSteps) prepareForUpgrade() (*state.UpgradeInfo, error) {
	logger.Infof("checking that upgrade can proceed")
	if err := w.preUpgradeSteps(w.pool, w.agent.CurrentConfig(), w.pool != nil, w.isCaas); err != nil {
		return nil, errors.Annotatef(err, "%s cannot be upgraded", names.ReadableString(w.tag))
	}

	if w.isController {
		return w.prepareControllerForUpgrade()
	}
	return nil, nil
}

func (w *upgradeSteps) prepareControllerForUpgrade() (*state.UpgradeInfo, error) {
	logger.Infof("signalling that this controller is ready for upgrade")
	st, err := w.pool.SystemState()
	if err != nil {
		return nil, errors.Trace(err)
	}
	info, err := st.EnsureUpgradeInfo(w.tag.Id(), w.fromVersion, w.toVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Controllers need to wait for other controllers to be ready
	// to run the upgrade steps.
	logger.Infof("waiting for other controllers to be ready for upgrade")
	if err := w.waitForOtherControllers(info); err != nil {
		if err == tomb.ErrDying {
			logger.Warningf("stopped waiting for other controllers: %v", err)
			return nil, err
		}
		logger.Errorf("aborted wait for other controllers: %v", err)
		return nil, errors.Annotate(err, "aborted wait for other controllers")
	}

	logger.Infof("finished waiting - all controllers are ready to run upgrade steps")
	return info, nil
}

func (w *upgradeSteps) waitForOtherControllers(info *state.UpgradeInfo) error {
	watcher := info.Watch()
	defer func() { _ = watcher.Stop() }()

	maxWait := w.getUpgradeStartTimeout()
	timeout := time.After(maxWait)
	for {
		select {
		case <-watcher.Changes():
			if err := info.Refresh(); err != nil {
				return errors.Trace(err)
			}

			allReady, err := info.AllProvisionedControllersReady()
			if err != nil {
				return errors.Trace(err)
			}
			if allReady {
				return errors.Trace(info.SetStatus(state.UpgradeRunning))
			}
		case <-timeout:
			if err := info.Abort(); err != nil {
				return errors.Annotate(err, "unable to abort upgrade")
			}
			return errors.Errorf("timed out after %s", maxWait)
		case <-w.tomb.Dying():
			return tomb.ErrDying
		}
	}
}

// runUpgradeSteps runs the required upgrade steps for the agent,
// retrying on failure. The agent's UpgradedToVersion is set
// once the upgrade is complete.
//
// This function conforms to the agent.ConfigMutator type and is
// designed to be called via an agent's ChangeConfig method.
func (w *upgradeSteps) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	if err := w.entity.SetStatus(status.Started, fmt.Sprintf("upgrading to %v", w.toVersion), nil); err != nil {
		return errors.Trace(err)
	}

	stBackend := upgrades.NewStateBackend(w.pool)
	context := upgrades.NewContext(agentConfig, w.apiConn, stBackend)
	logger.Infof("starting upgrade from %v to %v for %q", w.fromVersion, w.toVersion, w.tag)

	targets := upgradeTargets(w.isController)

	retryStrategy := w.retryStrategy
	retryStrategy.IsFatalError = func(err error) bool {
		// Abort if API connection has gone away!
		return agenterrors.ConnectionIsDead(logger, w.apiConn)
	}
	retryStrategy.NotifyFunc = func(lastErr error, attempt int) {
		if retryStrategy.Attempts != 0 && attempt != retryStrategy.Attempts {
			w.reportUpgradeFailure(lastErr, true)
		}
	}
	retryStrategy.Func = func() error {
		err := PerformUpgrade(w.fromVersion, targets, context)
		// w.entity.SetStatus(status.Error, fmt.Sprintf("TEST inner %v", err), nil)
		return err
	}

	err := retry.Call(retryStrategy)
	// w.entity.SetStatus(status.Error, fmt.Sprintf("TEST outer %v", err), nil)
	if retry.IsAttemptsExceeded(err) || retry.IsDurationExceeded(err) {
		err = retry.LastError(err)
		return err
	}
	if err != nil {
		return &apiLostDuringUpgrade{err}
	}

	agentConfig.SetUpgradedToVersion(w.toVersion)
	return nil
}

func (w *upgradeSteps) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
		w.fromVersion, w.toVersion, w.tag, retryText, err)
	_ = w.entity.SetStatus(status.Error,
		fmt.Sprintf("upgrade to %v failed (%s): %v", w.toVersion, retryText, err), nil)
}

func (w *upgradeSteps) finaliseUpgrade(info *state.UpgradeInfo) error {
	if !w.isController {
		return nil
	}

	if err := info.SetControllerDone(w.tag.Id()); err != nil {
		return errors.Annotate(err, "upgrade done but failed to synchronise")
	}

	return nil
}

func (w *upgradeSteps) getUpgradeStartTimeout() time.Duration {
	if wrench.IsActive(w.wrenchKey(), "short-upgrade-timeout") {
		// This duration is fairly arbitrary. During manual testing it
		// avoids the normal long wait but still provides a small
		// window to check the environment status and logs before the
		// timeout is triggered.
		return time.Minute
	}
	return UpgradeStartTimeoutController
}

// upgradeTargets determines the upgrade targets corresponding to the
// role of an agent. This determines the upgrade steps
// which will run during an upgrade.
func upgradeTargets(isController bool) []upgrades.Target {
	var targets []upgrades.Target
	if isController {
		targets = []upgrades.Target{upgrades.Controller}
	}
	return append(targets, upgrades.HostMachine)
}
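
// For illustration only (the results follow directly from the function
// above): a controller agent runs steps for both targets, while a plain
// machine agent runs only the host-machine steps.
//
//	upgradeTargets(true)  // []upgrades.Target{upgrades.Controller, upgrades.HostMachine}
//	upgradeTargets(false) // []upgrades.Target{upgrades.HostMachine}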