github.com/cloud-green/juju@v0.0.0-20151002100041-a00291338d3d/cmd/jujud/agent/upgrade.go

package agent

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/names"
	"github.com/juju/utils"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	"github.com/juju/juju/apiserver/params"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/state/storage"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/wrench"
)

type upgradingMachineAgent interface {
	ensureMongoServer(agent.Config) error
	setMachineStatus(api.Connection, params.Status, string) error
	CurrentConfig() agent.Config
	ChangeConfig(agent.ConfigMutator) error
	Dying() <-chan struct{}
}

var (
	upgradesPerformUpgrade = upgrades.PerformUpgrade // Allow patching

	// The maximum time a master state server will wait for other
	// state servers to come up and indicate they are ready to begin
	// running upgrade steps.
	upgradeStartTimeoutMaster = time.Minute * 15

	// The maximum time a secondary state server will wait for other
	// state servers to come up and indicate they are ready to begin
	// running upgrade steps. This is effectively "forever" because we
	// don't really want secondaries to ever give up once they've
	// indicated that they're ready to upgrade. It's up to the master
	// to abort the upgrade if required.
	//
	// This should be reduced when/if master re-elections are
	// introduced to handle a master that fails to come up for
	// upgrade.
	upgradeStartTimeoutSecondary = time.Hour * 4
)

func NewUpgradeWorkerContext() *upgradeWorkerContext {
	return &upgradeWorkerContext{
		UpgradeComplete: make(chan struct{}),
	}
}

type upgradeWorkerContext struct {
	UpgradeComplete chan struct{}
	fromVersion     version.Number
	toVersion       version.Number
	agent           upgradingMachineAgent
	tag             names.MachineTag
	machineId       string
	isMaster        bool
	apiState        api.Connection
	jobs            []multiwatcher.MachineJob
	agentConfig     agent.Config
	isStateServer   bool
	st              *state.State
}

// InitializeUsingAgent sets up an upgradeWorkerContext from a machine agent instance.
// It may update the agent's configuration.
func (c *upgradeWorkerContext) InitializeUsingAgent(a upgradingMachineAgent) error {
	if wrench.IsActive("machine-agent", "always-try-upgrade") {
		// Always enter upgrade mode. This allows testing of upgrades
		// even when there are actually no upgrade steps to run.
		return nil
	}
	return a.ChangeConfig(func(agentConfig agent.ConfigSetter) error {
		if !upgrades.AreUpgradesDefined(agentConfig.UpgradedToVersion()) {
			logger.Infof("no upgrade steps required or upgrade steps for %v "+
				"have already been run.", version.Current.Number)
			close(c.UpgradeComplete)

			// Even if no upgrade is required, the version number in
			// the agent's config still needs to be bumped.
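			// Bumping it means a later run of this worker will see
			// fromVersion == toVersion and treat the upgrade as
			// already completed (see run below).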
			agentConfig.SetUpgradedToVersion(version.Current.Number)
		}
		return nil
	})
}

func (c *upgradeWorkerContext) Worker(
	agent upgradingMachineAgent,
	apiState api.Connection,
	jobs []multiwatcher.MachineJob,
) worker.Worker {
	c.agent = agent
	c.apiState = apiState
	c.jobs = jobs
	return worker.NewSimpleWorker(c.run)
}

func (c *upgradeWorkerContext) IsUpgradeRunning() bool {
	select {
	case <-c.UpgradeComplete:
		return false
	default:
		return true
	}
}

type apiLostDuringUpgrade struct {
	err error
}

func (e *apiLostDuringUpgrade) Error() string {
	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
}

func isAPILostDuringUpgrade(err error) bool {
	_, ok := err.(*apiLostDuringUpgrade)
	return ok
}

func (c *upgradeWorkerContext) run(stop <-chan struct{}) error {
	if wrench.IsActive("machine-agent", "fail-upgrade-start") {
		return nil // Make the worker stop
	}

	select {
	case <-c.UpgradeComplete:
		// Our work is already done (we're probably being restarted
		// because the API connection has gone down), so do nothing.
		return nil
	default:
	}

	c.agentConfig = c.agent.CurrentConfig()

	c.fromVersion = c.agentConfig.UpgradedToVersion()
	c.toVersion = version.Current.Number
	if c.fromVersion == c.toVersion {
		logger.Infof("upgrade to %v already completed.", c.toVersion)
		close(c.UpgradeComplete)
		return nil
	}

	if err := c.initTag(c.agentConfig.Tag()); err != nil {
		return errors.Trace(err)
	}

	// If the machine agent is a state server, flag that state
	// needs to be opened before running upgrade steps.
	for _, job := range c.jobs {
		if job == multiwatcher.JobManageEnviron {
			c.isStateServer = true
		}
	}

	// We need a *state.State for upgrades. We open it independently
	// of StateWorker, because we have no guarantees about when
	// and how often StateWorker might run.
	if c.isStateServer {
		var err error
		if c.st, err = openStateForUpgrade(c.agent, c.agentConfig); err != nil {
			return err
		}
		defer c.st.Close()

		if c.isMaster, err = isMachineMaster(c.st, c.machineId); err != nil {
			return errors.Trace(err)
		}

		stor := storage.NewStorage(c.st.EnvironUUID(), c.st.MongoSession())
		registerSimplestreamsDataSource(stor)

		// This state-dependent data source will be useless once state
		// is closed by the previous defer, so un-register it too.
		defer unregisterSimplestreamsDataSource()
	}
	if err := c.runUpgrades(); err != nil {
		// Only return an error from the worker if the connection to
		// state went away (possible mongo master change). Returning
		// an error when the connection is lost will cause the agent
		// to restart.
		//
		// For other errors, the error is not returned because we want
		// the machine agent to stay running in an error state waiting
		// for user intervention.
		if isAPILostDuringUpgrade(err) {
			return err
		}
		c.reportUpgradeFailure(err, false)

	} else {
		// Upgrade succeeded - signal that the upgrade is complete.
		logger.Infof("upgrade to %v completed successfully.", c.toVersion)
		c.agent.setMachineStatus(c.apiState, params.StatusStarted, "")
		close(c.UpgradeComplete)
	}
	return nil
}

func (c *upgradeWorkerContext) initTag(tag names.Tag) error {
	var ok bool
	if c.tag, ok = tag.(names.MachineTag); !ok {
		return errors.New("machine agent's tag is not a MachineTag")
	}
	c.machineId = c.tag.Id()
	return nil
}

var agentTerminating = errors.New("machine agent is terminating")

// runUpgrades runs the upgrade operations for each job type and
// updates the agent's UpgradedToVersion on success.
func (c *upgradeWorkerContext) runUpgrades() error {
	upgradeInfo, err := c.prepareForUpgrade()
	if err != nil {
		return err
	}

	if wrench.IsActive("machine-agent", "fail-upgrade") {
		return errors.New("wrench")
	}

	if err := c.agent.ChangeConfig(c.runUpgradeSteps); err != nil {
		return err
	}

	if err := c.finaliseUpgrade(upgradeInfo); err != nil {
		return err
	}
	return nil
}

func (c *upgradeWorkerContext) prepareForUpgrade() (*state.UpgradeInfo, error) {
	if !c.isStateServer {
		return nil, nil
	}

	logger.Infof("signalling that this state server is ready for upgrade")
	info, err := c.st.EnsureUpgradeInfo(c.machineId, c.fromVersion, c.toVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// State servers need to wait for other state servers to be ready
	// to run the upgrade steps.
	logger.Infof("waiting for other state servers to be ready for upgrade")
	if err := c.waitForOtherStateServers(info); err != nil {
		if err == agentTerminating {
			logger.Warningf(`stopped waiting for other state servers: %v`, err)
		} else {
			logger.Errorf(`aborted wait for other state servers: %v`, err)
			// If master, trigger a rollback to the previous agent version.
			if c.isMaster {
				logger.Errorf("downgrading environment agent version to %v due to aborted upgrade",
					c.fromVersion)
				if rollbackErr := c.st.SetEnvironAgentVersion(c.fromVersion); rollbackErr != nil {
					logger.Errorf("rollback failed: %v", rollbackErr)
					return nil, errors.Annotate(rollbackErr, "failed to roll back desired agent version")
				}
			}
		}
		return nil, errors.Annotate(err, "aborted wait for other state servers")
	}
	if c.isMaster {
		logger.Infof("finished waiting - all state servers are ready to run upgrade steps")
	} else {
		logger.Infof("finished waiting - the master has completed its upgrade steps")
	}
	return info, nil
}

func (c *upgradeWorkerContext) waitForOtherStateServers(info *state.UpgradeInfo) error {
	watcher := info.Watch()
	defer watcher.Stop()

	maxWait := getUpgradeStartTimeout(c.isMaster)
	timeout := time.After(maxWait)
	for {
		select {
		case <-watcher.Changes():
			if err := info.Refresh(); err != nil {
				return errors.Trace(err)
			}
			if c.isMaster {
				if ready, err := info.AllProvisionedStateServersReady(); err != nil {
					return errors.Trace(err)
				} else if ready {
					// All state servers ready to start upgrade.
					err := info.SetStatus(state.UpgradeRunning)
					return errors.Trace(err)
				}
			} else {
				if info.Status() == state.UpgradeFinishing {
					// Master is done, ok to proceed.
					return nil
				}
			}
		case <-timeout:
			if c.isMaster {
				if err := info.Abort(); err != nil {
					return errors.Annotate(err, "unable to abort upgrade")
				}
			}
			return errors.Errorf("timed out after %s", maxWait)
		case <-c.agent.Dying():
			return agentTerminating
		}
	}
}

// runUpgradeSteps runs the required upgrade steps for the machine
// agent, retrying on failure. The agent's UpgradedToVersion is set
// once the upgrade is complete.
//
// This function conforms to the agent.ConfigMutator type and is
// designed to be called via a machine agent's ChangeConfig method.
func (c *upgradeWorkerContext) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	var upgradeErr error
	a := c.agent
	a.setMachineStatus(c.apiState, params.StatusStarted, fmt.Sprintf("upgrading to %v", c.toVersion))

	context := upgrades.NewContext(agentConfig, c.apiState, c.st)
	logger.Infof("starting upgrade from %v to %v for %q", c.fromVersion, c.toVersion, c.tag)

	targets := jobsToTargets(c.jobs, c.isMaster)
	attempts := getUpgradeRetryStrategy()
	for attempt := attempts.Start(); attempt.Next(); {
		upgradeErr = upgradesPerformUpgrade(c.fromVersion, targets, context)
		if upgradeErr == nil {
			break
		}
		if cmdutil.ConnectionIsDead(logger, c.apiState) {
			// API connection has gone away - abort!
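			// Wrapping the error as apiLostDuringUpgrade makes run()
			// return it, restarting the agent rather than leaving it
			// waiting for user intervention.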
			return &apiLostDuringUpgrade{upgradeErr}
		}
		if attempt.HasNext() {
			c.reportUpgradeFailure(upgradeErr, true)
		}
	}
	if upgradeErr != nil {
		return upgradeErr
	}
	agentConfig.SetUpgradedToVersion(c.toVersion)
	return nil
}

func (c *upgradeWorkerContext) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
		c.fromVersion, c.toVersion, c.tag, retryText, err)
	c.agent.setMachineStatus(c.apiState, params.StatusError,
		fmt.Sprintf("upgrade to %v failed (%s): %v", c.toVersion, retryText, err))
}

func (c *upgradeWorkerContext) finaliseUpgrade(info *state.UpgradeInfo) error {
	if !c.isStateServer {
		return nil
	}

	if c.isMaster {
		// Tell other state servers that the master has completed its
		// upgrade steps.
		if err := info.SetStatus(state.UpgradeFinishing); err != nil {
			return errors.Annotate(err, "upgrade done but")
		}
	}

	if err := info.SetStateServerDone(c.machineId); err != nil {
		return errors.Annotate(err, "upgrade done but failed to synchronise")
	}

	return nil
}

func getUpgradeStartTimeout(isMaster bool) time.Duration {
	if wrench.IsActive("machine-agent", "short-upgrade-timeout") {
		// This duration is fairly arbitrary. During manual testing it
		// avoids the normal long wait but still provides a small
		// window to check the environment status and logs before the
		// timeout is triggered.
		return time.Minute
	}

	if isMaster {
		return upgradeStartTimeoutMaster
	}
	return upgradeStartTimeoutSecondary
}

var openStateForUpgrade = func(
	agent upgradingMachineAgent,
	agentConfig agent.Config,
) (*state.State, error) {
	if err := agent.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}
	var err error
	info, ok := agentConfig.MongoInfo()
	if !ok {
		return nil, fmt.Errorf("no state info available")
	}
	st, err := state.Open(agentConfig.Environment(), info, mongo.DefaultDialOpts(), environs.NewStatePolicy())
	if err != nil {
		return nil, err
	}
	return st, nil
}

var isMachineMaster = func(st *state.State, machineId string) (bool, error) {
	if st == nil {
		// If there is no state, we aren't a master.
		return false, nil
	}
	// Not calling the agent openState method as it does other checks
	// we really don't care about here. All we need here is the machine
	// so we can determine if we are the master or not.
	machine, err := st.Machine(machineId)
	if err != nil {
		// This shouldn't happen, and if it does, the state worker will have
		// found out before us, and already errored, or is likely to error out
		// very shortly. All we do here is return the error. The state worker
		// returns an error that will cause the agent to be terminated.
		return false, errors.Trace(err)
	}
	isMaster, err := mongo.IsMaster(st.MongoSession(), machine)
	if err != nil {
		return false, errors.Trace(err)
	}
	return isMaster, nil
}

var getUpgradeRetryStrategy = func() utils.AttemptStrategy {
	return utils.AttemptStrategy{
		Delay: 2 * time.Minute,
		Min:   5,
	}
}

// jobsToTargets determines the upgrade targets corresponding to the
// jobs assigned to a machine agent.
// This determines the upgrade steps
// which will run during an upgrade.
func jobsToTargets(jobs []multiwatcher.MachineJob, isMaster bool) (targets []upgrades.Target) {
	for _, job := range jobs {
		switch job {
		case multiwatcher.JobManageEnviron:
			targets = append(targets, upgrades.StateServer)
			if isMaster {
				targets = append(targets, upgrades.DatabaseMaster)
			}
		case multiwatcher.JobHostUnits:
			targets = append(targets, upgrades.HostMachine)
		}
	}
	return
}
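
// The function below is a hedged usage sketch, not part of the original
// file: it only illustrates the targets jobsToTargets produces for a
// master state server that also hosts units. The name
// exampleJobsToTargets is illustrative and nothing else in the package
// calls it.
func exampleJobsToTargets() []upgrades.Target {
	jobs := []multiwatcher.MachineJob{
		multiwatcher.JobManageEnviron,
		multiwatcher.JobHostUnits,
	}
	// For the master this yields StateServer, DatabaseMaster and
	// HostMachine, in that order.
	return jobsToTargets(jobs, true)
}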