github.com/axw/juju@v0.0.0-20161005053422-4bd6544d08d4/worker/upgradesteps/worker.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package upgradesteps

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/utils"
	"gopkg.in/juju/names.v2"
	"gopkg.in/tomb.v1"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/status"
	"github.com/juju/juju/upgrades"
	jujuversion "github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/gate"
	"github.com/juju/juju/wrench"
	"github.com/juju/version"
)

var logger = loggo.GetLogger("juju.worker.upgradesteps")

var (
	PerformUpgrade = upgrades.PerformUpgrade // Allow patching

	// The maximum time a master controller will wait for other
	// controllers to come up and indicate they are ready to begin
	// running upgrade steps.
	UpgradeStartTimeoutMaster = time.Minute * 15

	// The maximum time a secondary controller will wait for other
	// controllers to come up and indicate they are ready to begin
	// running upgrade steps. This is effectively "forever" because we
	// don't really want secondaries to ever give up once they've
	// indicated that they're ready to upgrade. It's up to the master
	// to abort the upgrade if required.
	//
	// This should get reduced when/if master re-elections are
	// introduced to handle the case of a master that fails to come up
	// for upgrade.
	UpgradeStartTimeoutSecondary = time.Hour * 4
)

// NewLock creates a gate.Lock to be used to synchronise workers which
// need to start after upgrades have completed. If no upgrade steps
// are required, the Lock is unlocked and the version in the agent's
// configuration is updated to the currently running version.
//
// The returned Lock should be passed to NewWorker.
func NewLock(a agent.Agent) (gate.Lock, error) {
	lock := gate.NewLock()

	if wrench.IsActive("machine-agent", "always-try-upgrade") {
		// Always enter upgrade mode. This allows testing of upgrades
		// even when there are actually no upgrade steps to run.
		return lock, nil
	}

	err := a.ChangeConfig(func(agentConfig agent.ConfigSetter) error {
		if !upgrades.AreUpgradesDefined(agentConfig.UpgradedToVersion()) {
			logger.Infof("no upgrade steps required or upgrade steps for %v "+
				"have already been run.", jujuversion.Current)
			lock.Unlock()

			// Even if no upgrade is required, the version number in
			// the agent's config still needs to be bumped.
			agentConfig.SetUpgradedToVersion(jujuversion.Current)
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return lock, nil
}
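
// The snippet below is an illustrative sketch (not used by this package) of
// how NewLock and NewWorker are typically wired together when a machine
// agent starts up. The variables a, apiConn, jobs, openState, preUpgradeSteps
// and machine are assumed to be supplied by the agent:
//
//	lock, err := NewLock(a)
//	if err != nil {
//		return errors.Trace(err)
//	}
//	w, err := NewWorker(lock, a, apiConn, jobs, openState, preUpgradeSteps, machine)
//	if err != nil {
//		return errors.Trace(err)
//	}
//	// The same lock is handed to workers that must wait for the upgrade;
//	// they block until the worker calls lock.Unlock.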

// StatusSetter defines the single method required to set an agent's
// status.
type StatusSetter interface {
	SetStatus(setableStatus status.Status, info string, data map[string]interface{}) error
}

// NewWorker returns a new instance of the upgradesteps worker. It
// will run any required steps to upgrade to the currently running
// Juju version.
func NewWorker(
	upgradeComplete gate.Lock,
	agent agent.Agent,
	apiConn api.Connection,
	jobs []multiwatcher.MachineJob,
	openState func() (*state.State, error),
	preUpgradeSteps func(st *state.State, agentConf agent.Config, isController, isMasterServer bool) error,
	machine StatusSetter,
) (worker.Worker, error) {
	tag, ok := agent.CurrentConfig().Tag().(names.MachineTag)
	if !ok {
		return nil, errors.New("machine agent's tag is not a MachineTag")
	}
	w := &upgradesteps{
		upgradeComplete: upgradeComplete,
		agent:           agent,
		apiConn:         apiConn,
		jobs:            jobs,
		openState:       openState,
		preUpgradeSteps: preUpgradeSteps,
		machine:         machine,
		tag:             tag,
	}
	go func() {
		defer w.tomb.Done()
		w.tomb.Kill(w.run())
	}()
	return w, nil
}

type upgradesteps struct {
	tomb            tomb.Tomb
	upgradeComplete gate.Lock
	agent           agent.Agent
	apiConn         api.Connection
	jobs            []multiwatcher.MachineJob
	openState       func() (*state.State, error)
	preUpgradeSteps func(st *state.State, agentConf agent.Config, isController, isMaster bool) error
	machine         StatusSetter

	fromVersion  version.Number
	toVersion    version.Number
	tag          names.MachineTag
	isMaster     bool
	isController bool
	st           *state.State
}

// Kill is part of the worker.Worker interface.
func (w *upgradesteps) Kill() {
	w.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *upgradesteps) Wait() error {
	return w.tomb.Wait()
}

type apiLostDuringUpgrade struct {
	err error
}

func (e *apiLostDuringUpgrade) Error() string {
	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
}

func isAPILostDuringUpgrade(err error) bool {
	_, ok := err.(*apiLostDuringUpgrade)
	return ok
}

func (w *upgradesteps) run() error {
	if wrench.IsActive("machine-agent", "fail-upgrade-start") {
		return nil // Make the worker stop
	}

	if w.upgradeComplete.IsUnlocked() {
		// Our work is already done (we're probably being restarted
		// because the API connection has gone down), so do nothing.
		return nil
	}

	w.fromVersion = w.agent.CurrentConfig().UpgradedToVersion()
	w.toVersion = jujuversion.Current
	if w.fromVersion == w.toVersion {
		logger.Infof("upgrade to %v already completed.", w.toVersion)
		w.upgradeComplete.Unlock()
		return nil
	}

	// If the machine agent is a controller, flag that state
	// needs to be opened before running upgrade steps.
	for _, job := range w.jobs {
		if job == multiwatcher.JobManageModel {
			w.isController = true
		}
	}

	// We need a *state.State for upgrades. We open it independently
	// of StateWorker, because we have no guarantees about when
	// and how often StateWorker might run.
	if w.isController {
		var err error
		if w.st, err = w.openState(); err != nil {
			return err
		}
		defer w.st.Close()

		if w.isMaster, err = IsMachineMaster(w.st, w.tag.Id()); err != nil {
			return errors.Trace(err)
		}
	}

	if err := w.runUpgrades(); err != nil {
		// Only return an error from the worker if the connection to
		// state went away (possible mongo master change). Returning
		// an error when the connection is lost will cause the agent
		// to restart.
		//
		// For other errors, the error is not returned because we want
		// the machine agent to stay running in an error state waiting
		// for user intervention.
		if isAPILostDuringUpgrade(err) {
			return err
		}
		w.reportUpgradeFailure(err, false)
	} else {
		// Upgrade succeeded - signal that the upgrade is complete.
		logger.Infof("upgrade to %v completed successfully.", w.toVersion)
		w.machine.SetStatus(status.Started, "", nil)
		w.upgradeComplete.Unlock()
	}
	return nil
}
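
// The sketch below is illustrative only (not part of the package API). It
// shows how a caller such as a test might interpret this worker's exit,
// given the error handling in run above. Here w is a worker returned by
// NewWorker and upgradeComplete is the gate.Lock that was passed to it:
//
//	err := w.Wait()
//	switch {
//	case err != nil:
//		// API connection lost mid-upgrade; restarting the agent retries it.
//	case upgradeComplete.IsUnlocked():
//		// Upgrade steps ran successfully; gated workers may now start.
//	default:
//		// Upgrade failed; the agent stays up in an error state awaiting
//		// user intervention.
//	}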

// runUpgrades runs the upgrade operations for each job type and
// updates the agent's UpgradedToVersion on success.
func (w *upgradesteps) runUpgrades() error {
	upgradeInfo, err := w.prepareForUpgrade()
	if err != nil {
		return err
	}

	if wrench.IsActive("machine-agent", "fail-upgrade") {
		return errors.New("wrench")
	}

	if err := w.agent.ChangeConfig(w.runUpgradeSteps); err != nil {
		return err
	}

	if err := w.finaliseUpgrade(upgradeInfo); err != nil {
		return err
	}
	return nil
}

func (w *upgradesteps) prepareForUpgrade() (*state.UpgradeInfo, error) {
	logger.Infof("checking that upgrade can proceed")
	if err := w.preUpgradeSteps(w.st, w.agent.CurrentConfig(), w.st != nil, w.isMaster); err != nil {
		return nil, errors.Annotatef(err, "%s cannot be upgraded", names.ReadableString(w.tag))
	}

	if !w.isController {
		return nil, nil
	}

	logger.Infof("signalling that this controller is ready for upgrade")
	info, err := w.st.EnsureUpgradeInfo(w.tag.Id(), w.fromVersion, w.toVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Controllers need to wait for other controllers to be ready
	// to run the upgrade steps.
	logger.Infof("waiting for other controllers to be ready for upgrade")
	if err := w.waitForOtherControllers(info); err != nil {
		if err == tomb.ErrDying {
			logger.Warningf(`stopped waiting for other controllers: %v`, err)
			return nil, err
		}
		logger.Errorf(`aborted wait for other controllers: %v`, err)
		// If master, trigger a rollback to the previous agent version.
		if w.isMaster {
			logger.Errorf("downgrading model agent version to %v due to aborted upgrade",
				w.fromVersion)
			if rollbackErr := w.st.SetModelAgentVersion(w.fromVersion); rollbackErr != nil {
				logger.Errorf("rollback failed: %v", rollbackErr)
				return nil, errors.Annotate(rollbackErr, "failed to roll back desired agent version")
			}
		}
		return nil, errors.Annotate(err, "aborted wait for other controllers")
	}
	if w.isMaster {
		logger.Infof("finished waiting - all controllers are ready to run upgrade steps")
	} else {
		logger.Infof("finished waiting - the master has completed its upgrade steps")
	}
	return info, nil
}
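
// For orientation, the coordination sequence implied by prepareForUpgrade,
// waitForOtherControllers and finaliseUpgrade is roughly as follows. This is
// a summary sketch based only on the calls made in this file; machineId,
// from and to stand in for the real arguments:
//
//	info, _ := st.EnsureUpgradeInfo(machineId, from, to) // each controller declares readiness
//	ready, _ := info.AllProvisionedControllersReady()    // master: polled on watcher changes
//	_ = info.SetStatus(state.UpgradeRunning)             // master: everyone is ready, start
//	_ = info.SetStatus(state.UpgradeFinishing)           // master: its steps are done
//	_ = info.SetControllerDone(machineId)                // each controller records completion
//
// Secondaries simply wait for the UpgradeFinishing status before running
// their own steps.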

func (w *upgradesteps) waitForOtherControllers(info *state.UpgradeInfo) error {
	watcher := info.Watch()
	defer watcher.Stop()

	maxWait := getUpgradeStartTimeout(w.isMaster)
	timeout := time.After(maxWait)
	for {
		select {
		case <-watcher.Changes():
			if err := info.Refresh(); err != nil {
				return errors.Trace(err)
			}
			if w.isMaster {
				if ready, err := info.AllProvisionedControllersReady(); err != nil {
					return errors.Trace(err)
				} else if ready {
					// All controllers are ready to start the upgrade.
					err := info.SetStatus(state.UpgradeRunning)
					return errors.Trace(err)
				}
			} else {
				if info.Status() == state.UpgradeFinishing {
					// Master is done, ok to proceed.
					return nil
				}
			}
		case <-timeout:
			if w.isMaster {
				if err := info.Abort(); err != nil {
					return errors.Annotate(err, "unable to abort upgrade")
				}
			}
			return errors.Errorf("timed out after %s", maxWait)
		case <-w.tomb.Dying():
			return tomb.ErrDying
		}
	}
}

// runUpgradeSteps runs the required upgrade steps for the machine
// agent, retrying on failure. The agent's UpgradedToVersion is set
// once the upgrade is complete.
//
// This function conforms to the agent.ConfigMutator type and is
// designed to be called via a machine agent's ChangeConfig method.
func (w *upgradesteps) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	var upgradeErr error
	w.machine.SetStatus(status.Started, fmt.Sprintf("upgrading to %v", w.toVersion), nil)

	context := upgrades.NewContext(agentConfig, w.apiConn, w.st)
	logger.Infof("starting upgrade from %v to %v for %q", w.fromVersion, w.toVersion, w.tag)

	targets := jobsToTargets(w.jobs, w.isMaster)
	attempts := getUpgradeRetryStrategy()
	for attempt := attempts.Start(); attempt.Next(); {
		upgradeErr = PerformUpgrade(w.fromVersion, targets, context)
		if upgradeErr == nil {
			break
		}
		if cmdutil.ConnectionIsDead(logger, w.apiConn) {
			// The API connection has gone away - abort!
			return &apiLostDuringUpgrade{upgradeErr}
		}
		if attempt.HasNext() {
			w.reportUpgradeFailure(upgradeErr, true)
		}
	}
	if upgradeErr != nil {
		return upgradeErr
	}
	agentConfig.SetUpgradedToVersion(w.toVersion)
	return nil
}
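
// As a rough worked example of the retry loop above (assuming the strategy
// defined at the bottom of this file is unchanged): getUpgradeRetryStrategy
// returns utils.AttemptStrategy{Delay: 2 * time.Minute, Min: 5}, so a
// persistently failing set of upgrade steps is attempted at least 5 times
// with 2-minute pauses between attempts - roughly 8 minutes of waiting, plus
// however long each attempt itself takes - before runUpgradeSteps gives up
// and returns the error.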

func (w *upgradesteps) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
		w.fromVersion, w.toVersion, w.tag, retryText, err)
	w.machine.SetStatus(status.Error,
		fmt.Sprintf("upgrade to %v failed (%s): %v", w.toVersion, retryText, err), nil)
}

func (w *upgradesteps) finaliseUpgrade(info *state.UpgradeInfo) error {
	if !w.isController {
		return nil
	}

	if w.isMaster {
		// Tell other controllers that the master has completed its
		// upgrade steps.
		if err := info.SetStatus(state.UpgradeFinishing); err != nil {
			return errors.Annotate(err, "upgrade done but")
		}
	}

	if err := info.SetControllerDone(w.tag.Id()); err != nil {
		return errors.Annotate(err, "upgrade done but failed to synchronise")
	}

	return nil
}

func getUpgradeStartTimeout(isMaster bool) time.Duration {
	if wrench.IsActive("machine-agent", "short-upgrade-timeout") {
		// This duration is fairly arbitrary. During manual testing it
		// avoids the normal long wait but still provides a small
		// window to check the environment status and logs before the
		// timeout is triggered.
		return time.Minute
	}

	if isMaster {
		return UpgradeStartTimeoutMaster
	}
	return UpgradeStartTimeoutSecondary
}

var IsMachineMaster = func(st *state.State, machineId string) (bool, error) {
	if st == nil {
		// If there is no state, we aren't a master.
		return false, nil
	}
	// Not calling the agent openState method as it does other checks
	// we really don't care about here. All we need here is the machine
	// so we can determine if we are the master or not.
	machine, err := st.Machine(machineId)
	if err != nil {
		// This shouldn't happen, and if it does, the state worker will have
		// found out before us, and already errored, or is likely to error out
		// very shortly. All we do here is return the error. The state worker
		// returns an error that will cause the agent to be terminated.
		return false, errors.Trace(err)
	}
	isMaster, err := mongo.IsMaster(st.MongoSession(), machine)
	if err != nil {
		return false, errors.Trace(err)
	}
	return isMaster, nil
}

// TODO(katco): 2016-08-09: lp:1611427
var getUpgradeRetryStrategy = func() utils.AttemptStrategy {
	return utils.AttemptStrategy{
		Delay: 2 * time.Minute,
		Min:   5,
	}
}

// jobsToTargets determines the upgrade targets corresponding to the
// jobs assigned to a machine agent. This determines the upgrade steps
// which will run during an upgrade.
func jobsToTargets(jobs []multiwatcher.MachineJob, isMaster bool) (targets []upgrades.Target) {
	for _, job := range jobs {
		switch job {
		case multiwatcher.JobManageModel:
			targets = append(targets, upgrades.Controller)
			if isMaster {
				targets = append(targets, upgrades.DatabaseMaster)
			}
		case multiwatcher.JobHostUnits:
			targets = append(targets, upgrades.HostMachine)
		}
	}
	return
}
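
// As an illustrative example (not used by the package), a master controller
// machine that also hosts units - i.e. one with both JobManageModel and
// JobHostUnits - maps to all three upgrade targets:
//
//	jobsToTargets([]multiwatcher.MachineJob{
//		multiwatcher.JobManageModel,
//		multiwatcher.JobHostUnits,
//	}, true)
//	// => [upgrades.Controller, upgrades.DatabaseMaster, upgrades.HostMachine]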