github.com/altoros/juju-vmware@v0.0.0-20150312064031-f19ae857ccca/cmd/jujud/agent/upgrade.go

package agent

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/names"
	"github.com/juju/utils"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	"github.com/juju/juju/apiserver/params"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/state/storage"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/wrench"
)

type upgradingMachineAgent interface {
	ensureMongoServer(agent.Config) error
	setMachineStatus(*api.State, params.Status, string) error
	CurrentConfig() agent.Config
	ChangeConfig(AgentConfigMutator) error
	Dying() <-chan struct{}
}

var (
	upgradesPerformUpgrade = upgrades.PerformUpgrade // Allow patching

	// The maximum time a master state server will wait for other
	// state servers to come up and indicate they are ready to begin
	// running upgrade steps.
	upgradeStartTimeoutMaster = time.Minute * 15

	// The maximum time a secondary state server will wait for other
	// state servers to come up and indicate they are ready to begin
	// running upgrade steps. This is effectively "forever" because we
	// don't really want secondaries to ever give up once they've
	// indicated that they're ready to upgrade. It's up to the master
	// to abort the upgrade if required.
	//
	// This should be reduced when/if master re-elections are
	// introduced to handle the case of a master failing to come up
	// for upgrade.
	upgradeStartTimeoutSecondary = time.Hour * 4
)

func NewUpgradeWorkerContext() *upgradeWorkerContext {
	return &upgradeWorkerContext{
		UpgradeComplete: make(chan struct{}),
	}
}

type upgradeWorkerContext struct {
	UpgradeComplete chan struct{}
	fromVersion     version.Number
	toVersion       version.Number
	agent           upgradingMachineAgent
	tag             names.MachineTag
	machineId       string
	isMaster        bool
	apiState        *api.State
	jobs            []multiwatcher.MachineJob
	agentConfig     agent.Config
	isStateServer   bool
	st              *state.State
}

// InitializeUsingAgent sets up an upgradeWorkerContext from a machine agent instance.
// It may update the agent's configuration.
func (c *upgradeWorkerContext) InitializeUsingAgent(a upgradingMachineAgent) error {
	if wrench.IsActive("machine-agent", "always-try-upgrade") {
		// Always enter upgrade mode. This allows testing of upgrades
		// even when there are actually no upgrade steps to run.
		return nil
	}
	return a.ChangeConfig(func(agentConfig agent.ConfigSetter) error {
		if !upgrades.AreUpgradesDefined(agentConfig.UpgradedToVersion()) {
			logger.Infof("no upgrade steps required or upgrade steps for %v "+
				"have already been run.", version.Current.Number)
			close(c.UpgradeComplete)

			// Even if no upgrade is required the version number in
			// the agent's config still needs to be bumped.
			agentConfig.SetUpgradedToVersion(version.Current.Number)
		}
		return nil
	})
}
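
// A minimal usage sketch (assumed agent wiring, not part of the original
// file): the machine agent would typically create the context once, let it
// inspect the current config, and later hand the returned worker to its
// runner. The names a, runner, apiState and jobs below are placeholders.
//
//	upgradeCtx := NewUpgradeWorkerContext()
//	if err := upgradeCtx.InitializeUsingAgent(a); err != nil {
//		return err
//	}
//	runner.StartWorker("upgrade-steps", func() (worker.Worker, error) {
//		return upgradeCtx.Worker(a, apiState, jobs), nil
//	})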

func (c *upgradeWorkerContext) Worker(
	agent upgradingMachineAgent,
	apiState *api.State,
	jobs []multiwatcher.MachineJob,
) worker.Worker {
	c.agent = agent
	c.apiState = apiState
	c.jobs = jobs
	return worker.NewSimpleWorker(c.run)
}

func (c *upgradeWorkerContext) IsUpgradeRunning() bool {
	select {
	case <-c.UpgradeComplete:
		return false
	default:
		return true
	}
}

type apiLostDuringUpgrade struct {
	err error
}

func (e *apiLostDuringUpgrade) Error() string {
	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
}

func isAPILostDuringUpgrade(err error) bool {
	_, ok := err.(*apiLostDuringUpgrade)
	return ok
}

func (c *upgradeWorkerContext) run(stop <-chan struct{}) error {
	if wrench.IsActive("machine-agent", "fail-upgrade-start") {
		return nil // Make the worker stop
	}

	select {
	case <-c.UpgradeComplete:
		// Our work is already done (we're probably being restarted
		// because the API connection has gone down), so do nothing.
		return nil
	default:
	}

	c.agentConfig = c.agent.CurrentConfig()

	c.fromVersion = c.agentConfig.UpgradedToVersion()
	c.toVersion = version.Current.Number
	if c.fromVersion == c.toVersion {
		logger.Infof("upgrade to %v already completed.", c.toVersion)
		close(c.UpgradeComplete)
		return nil
	}

	if err := c.initTag(c.agentConfig.Tag()); err != nil {
		return errors.Trace(err)
	}

	// If the machine agent is a state server, flag that state
	// needs to be opened before running upgrade steps.
	for _, job := range c.jobs {
		if job == multiwatcher.JobManageEnviron {
			c.isStateServer = true
		}
	}

	// We need a *state.State for upgrades. We open it independently
	// of StateWorker, because we have no guarantees about when
	// and how often StateWorker might run.
	if c.isStateServer {
		var err error
		if c.st, err = openStateForUpgrade(c.agent, c.agentConfig); err != nil {
			return err
		}
		defer c.st.Close()

		if c.isMaster, err = isMachineMaster(c.st, c.machineId); err != nil {
			return errors.Trace(err)
		}

		stor := storage.NewStorage(c.st.EnvironUUID(), c.st.MongoSession())
		registerSimplestreamsDataSource(stor)
	}
	if err := c.runUpgrades(); err != nil {
		// Only return an error from the worker if the connection to
		// state went away (possible mongo master change). Returning
		// an error when the connection is lost will cause the agent
		// to restart.
		//
		// For other errors, the error is not returned because we want
		// the machine agent to stay running in an error state waiting
		// for user intervention.
		if isAPILostDuringUpgrade(err) {
			return err
		}
		c.reportUpgradeFailure(err, false)
	} else {
		// Upgrade succeeded - signal that the upgrade is complete.
		logger.Infof("upgrade to %v completed successfully.", c.toVersion)
		c.agent.setMachineStatus(c.apiState, params.StatusStarted, "")
		close(c.UpgradeComplete)
	}
	return nil
}
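
// Other workers in the agent can gate on UpgradeComplete to avoid doing real
// work while upgrade steps are still running; the channel is closed exactly
// once, whether the steps ran or were skipped. A sketch of such a gate
// (assumed caller code, not part of the original file):
//
//	select {
//	case <-upgradeCtx.UpgradeComplete:
//		// upgrade steps finished (or were not needed); safe to proceed
//	case <-stop:
//		return nil
//	}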

func (c *upgradeWorkerContext) initTag(tag names.Tag) error {
	var ok bool
	if c.tag, ok = tag.(names.MachineTag); !ok {
		return errors.New("machine agent's tag is not a MachineTag")
	}
	c.machineId = c.tag.Id()
	return nil
}

var agentTerminating = errors.New("machine agent is terminating")

// runUpgrades runs the upgrade operations for each job type and
// updates the agent's UpgradedToVersion on success.
func (c *upgradeWorkerContext) runUpgrades() error {
	upgradeInfo, err := c.prepareForUpgrade()
	if err != nil {
		return err
	}

	if wrench.IsActive("machine-agent", "fail-upgrade") {
		return errors.New("wrench")
	}

	if err := c.agent.ChangeConfig(c.runUpgradeSteps); err != nil {
		return err
	}

	if err := c.finaliseUpgrade(upgradeInfo); err != nil {
		return err
	}
	return nil
}

func (c *upgradeWorkerContext) prepareForUpgrade() (*state.UpgradeInfo, error) {
	if !c.isStateServer {
		return nil, nil
	}

	logger.Infof("signalling that this state server is ready for upgrade")
	info, err := c.st.EnsureUpgradeInfo(c.machineId, c.fromVersion, c.toVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// State servers need to wait for other state servers to be ready
	// to run the upgrade steps.
	logger.Infof("waiting for other state servers to be ready for upgrade")
	if err := c.waitForOtherStateServers(info); err != nil {
		if err == agentTerminating {
			logger.Warningf(`stopped waiting for other state servers: %v`, err)
		} else {
			logger.Errorf(`aborted wait for other state servers: %v`, err)
			// If master, trigger a rollback to the previous agent version.
			if c.isMaster {
				logger.Errorf("downgrading environment agent version to %v due to aborted upgrade",
					c.fromVersion)
				if rollbackErr := c.st.SetEnvironAgentVersion(c.fromVersion); rollbackErr != nil {
					logger.Errorf("rollback failed: %v", rollbackErr)
					return nil, errors.Annotate(rollbackErr, "failed to roll back desired agent version")
				}
			}
		}
		return nil, errors.Annotate(err, "aborted wait for other state servers")
	}
	if c.isMaster {
		logger.Infof("finished waiting - all state servers are ready to run upgrade steps")
	} else {
		logger.Infof("finished waiting - the master has completed its upgrade steps")
	}
	return info, nil
}
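
// To summarise the coordination above and below (inferred from this file):
// every state server registers itself via EnsureUpgradeInfo; the master waits
// until AllProvisionedStateServersReady reports true and then moves the shared
// UpgradeInfo document to UpgradeRunning; secondaries instead wait until the
// master sets UpgradeFinishing, which it only does after its own upgrade steps
// have completed (see finaliseUpgrade below).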

func (c *upgradeWorkerContext) waitForOtherStateServers(info *state.UpgradeInfo) error {
	watcher := info.Watch()

	maxWait := getUpgradeStartTimeout(c.isMaster)
	timeout := time.After(maxWait)
	for {
		select {
		case <-watcher.Changes():
			if err := info.Refresh(); err != nil {
				return errors.Trace(err)
			}
			if c.isMaster {
				if ready, err := info.AllProvisionedStateServersReady(); err != nil {
					return errors.Trace(err)
				} else if ready {
					// All state servers ready to start upgrade
					err := info.SetStatus(state.UpgradeRunning)
					return errors.Trace(err)
				}
			} else {
				if info.Status() == state.UpgradeFinishing {
					// Master is done, ok to proceed
					return nil
				}
			}
		case <-timeout:
			if c.isMaster {
				if err := info.Abort(); err != nil {
					return errors.Annotate(err, "unable to abort upgrade")
				}
			}
			return errors.Errorf("timed out after %s", maxWait)
		case <-c.agent.Dying():
			return agentTerminating
		}
	}
}

// runUpgradeSteps runs the required upgrade steps for the machine
// agent, retrying on failure. The agent's UpgradedToVersion is set
// once the upgrade is complete.
//
// This function conforms to the AgentConfigMutator type and is
// designed to be called via a machine agent's ChangeConfig method.
func (c *upgradeWorkerContext) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	var upgradeErr error
	a := c.agent
	a.setMachineStatus(c.apiState, params.StatusStarted, fmt.Sprintf("upgrading to %v", c.toVersion))

	context := upgrades.NewContext(agentConfig, c.apiState, c.st)
	logger.Infof("starting upgrade from %v to %v for %q", c.fromVersion, c.toVersion, c.tag)

	targets := jobsToTargets(c.jobs, c.isMaster)
	attempts := getUpgradeRetryStrategy()
	for attempt := attempts.Start(); attempt.Next(); {
		upgradeErr = upgradesPerformUpgrade(c.fromVersion, targets, context)
		if upgradeErr == nil {
			break
		}
		if cmdutil.ConnectionIsDead(logger, c.apiState) {
			// API connection has gone away - abort!
			return &apiLostDuringUpgrade{upgradeErr}
		}
		if attempt.HasNext() {
			c.reportUpgradeFailure(upgradeErr, true)
		}
	}
	if upgradeErr != nil {
		return upgradeErr
	}
	agentConfig.SetUpgradedToVersion(c.toVersion)
	return nil
}
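
// upgradesPerformUpgrade is a package-level variable so the call above can be
// patched out in tests. A hypothetical stub (test scaffolding assumed, not
// part of the original file; the signature is inferred from the call site and
// the assignment to upgrades.PerformUpgrade) might look like:
//
//	upgradesPerformUpgrade = func(_ version.Number, _ []upgrades.Target, _ upgrades.Context) error {
//		return nil // pretend all upgrade steps succeeded
//	}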

func (c *upgradeWorkerContext) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
		c.fromVersion, c.toVersion, c.tag, retryText, err)
	c.agent.setMachineStatus(c.apiState, params.StatusError,
		fmt.Sprintf("upgrade to %v failed (%s): %v", c.toVersion, retryText, err))
}

func (c *upgradeWorkerContext) finaliseUpgrade(info *state.UpgradeInfo) error {
	if !c.isStateServer {
		return nil
	}

	if c.isMaster {
		// Tell other state servers that the master has completed its
		// upgrade steps.
		if err := info.SetStatus(state.UpgradeFinishing); err != nil {
			return errors.Annotate(err, "upgrade done but")
		}
	}

	if err := info.SetStateServerDone(c.machineId); err != nil {
		return errors.Annotate(err, "upgrade done but failed to synchronise")
	}

	return nil
}

func getUpgradeStartTimeout(isMaster bool) time.Duration {
	if wrench.IsActive("machine-agent", "short-upgrade-timeout") {
		// This duration is fairly arbitrary. During manual testing it
		// avoids the normal long wait but still provides a small
		// window to check the environment status and logs before the
		// timeout is triggered.
		return time.Minute
	}

	if isMaster {
		return upgradeStartTimeoutMaster
	}
	return upgradeStartTimeoutSecondary
}

var openStateForUpgrade = func(
	agent upgradingMachineAgent,
	agentConfig agent.Config,
) (*state.State, error) {
	if err := agent.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}
	info, ok := agentConfig.MongoInfo()
	if !ok {
		return nil, fmt.Errorf("no state info available")
	}
	st, err := state.Open(info, mongo.DefaultDialOpts(), environs.NewStatePolicy())
	if err != nil {
		return nil, err
	}
	return st, nil
}

var isMachineMaster = func(st *state.State, machineId string) (bool, error) {
	if st == nil {
		// If there is no state, we aren't a master.
		return false, nil
	}
	// Not calling the agent openState method as it does other checks
	// we really don't care about here. All we need here is the machine
	// so we can determine if we are the master or not.
	machine, err := st.Machine(machineId)
	if err != nil {
		// This shouldn't happen, and if it does, the state worker will have
		// found out before us, and already errored, or is likely to error out
		// very shortly. All we do here is return the error. The state worker
		// returns an error that will cause the agent to be terminated.
		return false, errors.Trace(err)
	}
	isMaster, err := mongo.IsMaster(st.MongoSession(), machine)
	if err != nil {
		return false, errors.Trace(err)
	}
	return isMaster, nil
}

var getUpgradeRetryStrategy = func() utils.AttemptStrategy {
	return utils.AttemptStrategy{
		Delay: 2 * time.Minute,
		Min:   5,
	}
}
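
// With the strategy above, and assuming the usual utils.AttemptStrategy
// behaviour (at least Min attempts with Delay between them), the upgrade
// steps in runUpgradeSteps are tried at least five times with a two minute
// pause between attempts, so a persistent failure only becomes final
// (reportUpgradeFailure with willRetry=false, called from run) after roughly
// eight minutes of pauses plus the time the steps themselves take.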

// jobsToTargets determines the upgrade targets corresponding to the
// jobs assigned to a machine agent. This determines the upgrade steps
// which will run during an upgrade.
func jobsToTargets(jobs []multiwatcher.MachineJob, isMaster bool) (targets []upgrades.Target) {
	for _, job := range jobs {
		switch job {
		case multiwatcher.JobManageEnviron:
			targets = append(targets, upgrades.StateServer)
			if isMaster {
				targets = append(targets, upgrades.DatabaseMaster)
			}
		case multiwatcher.JobHostUnits:
			targets = append(targets, upgrades.HostMachine)
		}
	}
	return
}
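
// For example, a machine holding both JobManageEnviron and JobHostUnits that
// is also the mongo master yields the targets
// [StateServer, DatabaseMaster, HostMachine], while a plain unit-hosting
// machine yields just [HostMachine].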