github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/worker/upgradesteps/worker.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package upgradesteps

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/utils"
	"github.com/juju/version"
	"gopkg.in/juju/names.v2"
	"gopkg.in/juju/worker.v1"
	"gopkg.in/tomb.v2"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/upgrades"
	jujuversion "github.com/juju/juju/version"
	"github.com/juju/juju/worker/gate"
	"github.com/juju/juju/wrench"
)

var logger = loggo.GetLogger("juju.worker.upgradesteps")

var (
	PerformUpgrade = upgrades.PerformUpgrade // Allow patching

	// UpgradeStartTimeoutMaster is the maximum time a master
	// controller will wait for other controllers to come up and
	// indicate they are ready to begin running upgrade steps.
	UpgradeStartTimeoutMaster = time.Minute * 15

	// UpgradeStartTimeoutSecondary is the maximum time a secondary
	// controller will wait for other controllers to come up and
	// indicate they are ready to begin running upgrade steps. This is
	// effectively "forever" because we don't really want secondaries
	// to ever give up once they've indicated that they're ready to
	// upgrade. It's up to the master to abort the upgrade if required.
	//
	// This should be reduced when/if master re-elections are
	// introduced to handle the case of a master that fails to come up
	// for the upgrade.
	UpgradeStartTimeoutSecondary = time.Hour * 4
)

// NewLock creates a gate.Lock to be used to synchronise workers which
// need to start after upgrades have completed. The returned Lock should
// be passed to NewWorker. If the agent has already upgraded to the
// current version, then the lock will be returned in the released state.
func NewLock(agentConfig agent.Config) gate.Lock {
	lock := gate.NewLock()

	if wrench.IsActive(wrenchKey(agentConfig), "always-try-upgrade") {
		// Always enter upgrade mode. This allows testing of upgrades
		// even when there are actually no upgrade steps to run.
		return lock
	}

	// Build numbers are irrelevant to upgrade steps.
	upgradedToVersion := agentConfig.UpgradedToVersion()
	upgradedToVersion.Build = 0
	currentVersion := jujuversion.Current
	currentVersion.Build = 0
	if upgradedToVersion == currentVersion {
		logger.Infof(
			"upgrade steps for %v have already been run.",
			jujuversion.Current,
		)
		lock.Unlock()
	}

	return lock
}

// StatusSetter defines the single method required to set an agent's
// status.
type StatusSetter interface {
	SetStatus(setableStatus status.Status, info string, data map[string]interface{}) error
}
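// The sketch below is illustrative and not part of the original file: it
// shows how a downstream worker typically consumes the gate.Lock returned
// by NewLock, blocking until this worker releases it once upgrade steps
// have run. The function name and abort channel are hypothetical.
func exampleWaitForUpgradeSteps(upgradeComplete gate.Lock, abort <-chan struct{}) error {
	select {
	case <-upgradeComplete.Unlocked():
		// Lock released: upgrade steps have completed (or were never
		// needed), so dependent workers may start.
		return nil
	case <-abort:
		return errors.New("aborted while waiting for upgrade steps")
	}
}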
// NewWorker returns a new instance of the upgradesteps worker. It
// will run any required steps to upgrade to the currently running
// Juju version.
func NewWorker(
	upgradeComplete gate.Lock,
	agent agent.Agent,
	apiConn api.Connection,
	jobs []multiwatcher.MachineJob,
	openState func() (*state.StatePool, error),
	preUpgradeSteps func(st *state.StatePool, agentConf agent.Config, isController, isMasterServer bool) error,
	machine StatusSetter,
	newEnvironFunc environs.NewEnvironFunc,
) (worker.Worker, error) {
	w := &upgradesteps{
		upgradeComplete: upgradeComplete,
		agent:           agent,
		apiConn:         apiConn,
		jobs:            jobs,
		openState:       openState,
		preUpgradeSteps: preUpgradeSteps,
		machine:         machine,
		tag:             agent.CurrentConfig().Tag(),
	}
	w.tomb.Go(w.run)
	return w, nil
}

type upgradesteps struct {
	tomb            tomb.Tomb
	upgradeComplete gate.Lock
	agent           agent.Agent
	apiConn         api.Connection
	jobs            []multiwatcher.MachineJob
	openState       func() (*state.StatePool, error)
	preUpgradeSteps func(st *state.StatePool, agentConf agent.Config, isController, isMaster bool) error
	machine         StatusSetter

	fromVersion  version.Number
	toVersion    version.Number
	tag          names.Tag
	isMaster     bool
	isController bool
	pool         *state.StatePool
}

// Kill is part of the worker.Worker interface.
func (w *upgradesteps) Kill() {
	w.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *upgradesteps) Wait() error {
	return w.tomb.Wait()
}

type apiLostDuringUpgrade struct {
	err error
}

func (e *apiLostDuringUpgrade) Error() string {
	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
}

func isAPILostDuringUpgrade(err error) bool {
	_, ok := err.(*apiLostDuringUpgrade)
	return ok
}

func (w *upgradesteps) wrenchKey() string {
	return wrenchKey(w.agent.CurrentConfig())
}

func wrenchKey(agentConfig agent.Config) string {
	return agentConfig.Tag().Kind() + "-agent"
}
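// As a minimal sketch (not part of the original file), the Kill/Wait pair
// above composes into the conventional juju worker shutdown sequence;
// this is what worker.Stop from gopkg.in/juju/worker.v1 does.
func exampleStopWorker(w worker.Worker) error {
	w.Kill()        // ask the tomb to start dying
	return w.Wait() // block until run() has returned; report its error
}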
func (w *upgradesteps) run() error {
	if wrench.IsActive(w.wrenchKey(), "fail-upgrade-start") {
		return nil // Make the worker stop
	}

	if w.upgradeComplete.IsUnlocked() {
		// Our work is already done (we're probably being restarted
		// because the API connection has gone down), so do nothing.
		return nil
	}

	w.fromVersion = w.agent.CurrentConfig().UpgradedToVersion()
	w.toVersion = jujuversion.Current
	if w.fromVersion == w.toVersion {
		logger.Infof("upgrade to %v already completed.", w.toVersion)
		w.upgradeComplete.Unlock()
		return nil
	}

	// If the agent is a machine agent for a controller, flag that state
	// needs to be opened before running upgrade steps.
	for _, job := range w.jobs {
		if job == multiwatcher.JobManageModel {
			w.isController = true
		}
	}

	// We need a *state.State for upgrades. We open it independently
	// of StateWorker, because we have no guarantees about when
	// and how often StateWorker might run.
	if w.isController {
		var err error
		if w.pool, err = w.openState(); err != nil {
			return err
		}
		defer w.pool.Close()

		if w.isMaster, err = IsMachineMaster(w.pool, w.tag.Id()); err != nil {
			return errors.Trace(err)
		}
	}

	if err := w.runUpgrades(); err != nil {
		// Only return an error from the worker if the connection to
		// state went away (possible mongo master change). Returning
		// an error when the connection is lost will cause the agent
		// to restart.
		//
		// For other errors, the error is not returned because we want
		// the agent to stay running in an error state waiting
		// for user intervention.
		if isAPILostDuringUpgrade(err) {
			return err
		}
		w.reportUpgradeFailure(err, false)
	} else {
		// Upgrade succeeded - signal that the upgrade is complete.
		logger.Infof("upgrade to %v completed successfully.", w.toVersion)
		w.machine.SetStatus(status.Started, "", nil)
		w.upgradeComplete.Unlock()
	}
	return nil
}

// runUpgrades runs the upgrade operations for each job type and
// updates the agent's UpgradedToVersion on success.
func (w *upgradesteps) runUpgrades() error {
	upgradeInfo, err := w.prepareForUpgrade()
	if err != nil {
		return err
	}

	if wrench.IsActive(w.wrenchKey(), "fail-upgrade") {
		return errors.New("wrench")
	}

	if err := w.agent.ChangeConfig(w.runUpgradeSteps); err != nil {
		return err
	}

	if err := w.finaliseUpgrade(upgradeInfo); err != nil {
		return err
	}
	return nil
}

func (w *upgradesteps) prepareForUpgrade() (*state.UpgradeInfo, error) {
	logger.Infof("checking that upgrade can proceed")
	if err := w.preUpgradeSteps(w.pool, w.agent.CurrentConfig(), w.pool != nil, w.isMaster); err != nil {
		return nil, errors.Annotatef(err, "%s cannot be upgraded", names.ReadableString(w.tag))
	}

	if w.isController {
		return w.prepareControllerForUpgrade()
	}
	return nil, nil
}
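// examplePreUpgradeSteps is an illustrative sketch (not part of the
// original file) of the preUpgradeSteps callback passed to NewWorker and
// invoked by prepareForUpgrade above: it vetoes the upgrade by returning
// an error before any steps run. The real callback wired in by the
// machine agent (upgrades.PreUpgradeSteps, assuming this revision matches
// upstream juju) performs richer checks such as available disk space.
func examplePreUpgradeSteps(pool *state.StatePool, agentConf agent.Config, isController, isMaster bool) error {
	if isController && pool == nil {
		return errors.New("controller upgrade requires state access")
	}
	return nil
}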
func (w *upgradesteps) prepareControllerForUpgrade() (*state.UpgradeInfo, error) {
	logger.Infof("signalling that this controller is ready for upgrade")
	st := w.pool.SystemState()
	info, err := st.EnsureUpgradeInfo(w.tag.Id(), w.fromVersion, w.toVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Controllers need to wait for other controllers to be ready
	// to run the upgrade steps.
	logger.Infof("waiting for other controllers to be ready for upgrade")
	if err := w.waitForOtherControllers(info); err != nil {
		if err == tomb.ErrDying {
			logger.Warningf(`stopped waiting for other controllers: %v`, err)
			return nil, err
		}
		logger.Errorf(`aborted wait for other controllers: %v`, err)
		// If master, trigger a rollback to the previous agent version.
		if w.isMaster {
			logger.Errorf("downgrading model agent version to %v due to aborted upgrade",
				w.fromVersion)
			if rollbackErr := st.SetModelAgentVersion(w.fromVersion, true); rollbackErr != nil {
				logger.Errorf("rollback failed: %v", rollbackErr)
				return nil, errors.Annotate(rollbackErr, "failed to roll back desired agent version")
			}
		}
		return nil, errors.Annotate(err, "aborted wait for other controllers")
	}
	if w.isMaster {
		logger.Infof("finished waiting - all controllers are ready to run upgrade steps")
	} else {
		logger.Infof("finished waiting - the master has completed its upgrade steps")
	}
	return info, nil
}

func (w *upgradesteps) waitForOtherControllers(info *state.UpgradeInfo) error {
	watcher := info.Watch()
	defer watcher.Stop()

	maxWait := w.getUpgradeStartTimeout()
	timeout := time.After(maxWait)
	for {
		select {
		case <-watcher.Changes():
			if err := info.Refresh(); err != nil {
				return errors.Trace(err)
			}
			if w.isMaster {
				if ready, err := info.AllProvisionedControllersReady(); err != nil {
					return errors.Trace(err)
				} else if ready {
					// All controllers are ready to start the upgrade.
					err := info.SetStatus(state.UpgradeRunning)
					return errors.Trace(err)
				}
			} else {
				if info.Status() == state.UpgradeFinishing {
					// The master is done; ok to proceed.
					return nil
				}
			}
		case <-timeout:
			if w.isMaster {
				if err := info.Abort(); err != nil {
					return errors.Annotate(err, "unable to abort upgrade")
				}
			}
			return errors.Errorf("timed out after %s", maxWait)
		case <-w.tomb.Dying():
			return tomb.ErrDying
		}
	}
}
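// exampleRetry is an illustrative sketch (not part of the original file)
// of the utils.AttemptStrategy loop used by runUpgradeSteps below: with a
// Delay of two minutes and a Min of 5, Next allows at least five attempts,
// sleeping Delay between them, while HasNext reports whether another
// attempt will follow a failure.
func exampleRetry(op func() error) error {
	var err error
	for attempt := getUpgradeRetryStrategy().Start(); attempt.Next(); {
		if err = op(); err == nil {
			return nil
		}
		if attempt.HasNext() {
			logger.Debugf("attempt failed, retrying: %v", err)
		}
	}
	return err
}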
// runUpgradeSteps runs the required upgrade steps for the agent,
// retrying on failure. The agent's UpgradedToVersion is set
// once the upgrade is complete.
//
// This function conforms to the agent.ConfigMutator type and is
// designed to be called via an agent's ChangeConfig method.
func (w *upgradesteps) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	var upgradeErr error
	w.machine.SetStatus(status.Started, fmt.Sprintf("upgrading to %v", w.toVersion), nil)

	stBackend := upgrades.NewStateBackend(w.pool)
	context := upgrades.NewContext(agentConfig, w.apiConn, stBackend)
	logger.Infof("starting upgrade from %v to %v for %q", w.fromVersion, w.toVersion, w.tag)

	targets := jobsToTargets(w.jobs, w.isMaster)
	attempts := getUpgradeRetryStrategy()
	for attempt := attempts.Start(); attempt.Next(); {
		upgradeErr = PerformUpgrade(w.fromVersion, targets, context)
		if upgradeErr == nil {
			break
		}
		if cmdutil.ConnectionIsDead(logger, w.apiConn) {
			// API connection has gone away - abort!
			return &apiLostDuringUpgrade{upgradeErr}
		}
		if attempt.HasNext() {
			w.reportUpgradeFailure(upgradeErr, true)
		}
	}
	if upgradeErr != nil {
		return upgradeErr
	}
	agentConfig.SetUpgradedToVersion(w.toVersion)
	return nil
}

func (w *upgradesteps) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
		w.fromVersion, w.toVersion, w.tag, retryText, err)
	w.machine.SetStatus(status.Error,
		fmt.Sprintf("upgrade to %v failed (%s): %v", w.toVersion, retryText, err), nil)
}

func (w *upgradesteps) finaliseUpgrade(info *state.UpgradeInfo) error {
	if !w.isController {
		return nil
	}

	if w.isMaster {
		// Tell other controllers that the master has completed its
		// upgrade steps.
		if err := info.SetStatus(state.UpgradeFinishing); err != nil {
			return errors.Annotate(err, "upgrade done but failed to set status")
		}
	}

	if err := info.SetControllerDone(w.tag.Id()); err != nil {
		return errors.Annotate(err, "upgrade done but failed to synchronise")
	}

	return nil
}

func (w *upgradesteps) getUpgradeStartTimeout() time.Duration {
	if wrench.IsActive(w.wrenchKey(), "short-upgrade-timeout") {
		// This duration is fairly arbitrary. During manual testing it
		// avoids the normal long wait but still provides a small
		// window to check the environment status and logs before the
		// timeout is triggered.
		return time.Minute
	}

	if w.isMaster {
		return UpgradeStartTimeoutMaster
	}
	return UpgradeStartTimeoutSecondary
}

// IsMachineMaster reports whether the given machine is the mongo master.
// It is a package variable so that it can be patched in tests.
var IsMachineMaster = func(pool *state.StatePool, machineId string) (bool, error) {
	if pool == nil {
		// If there is no state pool, we aren't a master.
		return false, nil
	}
	// Not calling the agent openState method as it does other checks
	// we really don't care about here. All we need here is the machine
	// so we can determine if we are the master or not.
	st := pool.SystemState()
	machine, err := st.Machine(machineId)
	if err != nil {
		// This shouldn't happen, and if it does, the state worker will
		// have found out before us, and already errored, or is likely
		// to error out very shortly. All we do here is return the
		// error. The state worker returns an error that will cause the
		// agent to be terminated.
		return false, errors.Trace(err)
	}
	isMaster, err := mongo.IsMaster(st.MongoSession(), machine)
	if err != nil {
		return false, errors.Trace(err)
	}
	return isMaster, nil
}

// TODO(katco): 2016-08-09: lp:1611427
var getUpgradeRetryStrategy = func() utils.AttemptStrategy {
	return utils.AttemptStrategy{
		Delay: 2 * time.Minute,
		Min:   5,
	}
}

// jobsToTargets determines the upgrade targets corresponding to the
// jobs assigned to an agent. This determines the upgrade steps
// which will run during an upgrade.
func jobsToTargets(jobs []multiwatcher.MachineJob, isMaster bool) (targets []upgrades.Target) {
	for _, job := range jobs {
		switch job {
		case multiwatcher.JobManageModel:
			targets = append(targets, upgrades.Controller)
			if isMaster {
				targets = append(targets, upgrades.DatabaseMaster)
			}
		case multiwatcher.JobHostUnits:
			targets = append(targets, upgrades.HostMachine)
		}
	}
	return
}
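// exampleJobsToTargets is an illustrative sketch (not part of the original
// file) showing the target sets jobsToTargets produces for two common
// agent configurations.
func exampleJobsToTargets() {
	controllerJobs := []multiwatcher.MachineJob{
		multiwatcher.JobManageModel,
		multiwatcher.JobHostUnits,
	}
	// A master controller that also hosts units upgrades as
	// [Controller, DatabaseMaster, HostMachine].
	logger.Infof("%v", jobsToTargets(controllerJobs, true))
	// A plain machine agent upgrades only as [HostMachine].
	logger.Infof("%v", jobsToTargets([]multiwatcher.MachineJob{multiwatcher.JobHostUnits}, false))
}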