github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/cmd/jujud/agent/upgrade.go

package agent

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/names"
	"github.com/juju/utils"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	"github.com/juju/juju/apiserver/params"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/state/storage"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/wrench"
)

type upgradingMachineAgent interface {
	ensureMongoServer(agent.Config) error
	setMachineStatus(api.Connection, params.Status, string) error
	CurrentConfig() agent.Config
	ChangeConfig(agent.ConfigMutator) error
	Dying() <-chan struct{}
}

var (
	upgradesPerformUpgrade = upgrades.PerformUpgrade // Allow patching

	// The maximum time a master state server will wait for other
	// state servers to come up and indicate they are ready to begin
	// running upgrade steps.
	upgradeStartTimeoutMaster = time.Minute * 15

	// The maximum time a secondary state server will wait for other
	// state servers to come up and indicate they are ready to begin
	// running upgrade steps. This is effectively "forever" because we
	// don't really want secondaries to ever give up once they've
	// indicated that they're ready to upgrade. It's up to the master
	// to abort the upgrade if required.
	//
	// This should get reduced when/if master re-elections are
	// introduced, to handle the case of a master failing to come up
	// for upgrade.
	upgradeStartTimeoutSecondary = time.Hour * 4
)

func NewUpgradeWorkerContext() *upgradeWorkerContext {
	return &upgradeWorkerContext{
		UpgradeComplete: make(chan struct{}),
	}
}

type upgradeWorkerContext struct {
	UpgradeComplete chan struct{}
	fromVersion     version.Number
	toVersion       version.Number
	agent           upgradingMachineAgent
	tag             names.MachineTag
	machineId       string
	isMaster        bool
	apiState        api.Connection
	jobs            []multiwatcher.MachineJob
	agentConfig     agent.Config
	isStateServer   bool
	st              *state.State
}

// InitializeUsingAgent sets up an upgradeWorkerContext from a machine agent instance.
// It may update the agent's configuration.
func (c *upgradeWorkerContext) InitializeUsingAgent(a upgradingMachineAgent) error {
	if wrench.IsActive("machine-agent", "always-try-upgrade") {
		// Always enter upgrade mode. This allows upgrades to be tested
		// even when there are actually no upgrade steps to run.
		return nil
	}
	return a.ChangeConfig(func(agentConfig agent.ConfigSetter) error {
		if !upgrades.AreUpgradesDefined(agentConfig.UpgradedToVersion()) {
			logger.Infof("no upgrade steps required or upgrade steps for %v "+
				"have already been run.", version.Current.Number)
			close(c.UpgradeComplete)

			// Even if no upgrade is required the version number in
			// the agent's config still needs to be bumped.
			agentConfig.SetUpgradedToVersion(version.Current.Number)
		}
		return nil
	})
}

func (c *upgradeWorkerContext) Worker(
	agent upgradingMachineAgent,
	apiState api.Connection,
	jobs []multiwatcher.MachineJob,
) worker.Worker {
	c.agent = agent
	c.apiState = apiState
	c.jobs = jobs
	return worker.NewSimpleWorker(c.run)
}

func (c *upgradeWorkerContext) IsUpgradeRunning() bool {
	select {
	case <-c.UpgradeComplete:
		return false
	default:
		return true
	}
}

type apiLostDuringUpgrade struct {
	err error
}

func (e *apiLostDuringUpgrade) Error() string {
	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
}

func isAPILostDuringUpgrade(err error) bool {
	_, ok := err.(*apiLostDuringUpgrade)
	return ok
}

func (c *upgradeWorkerContext) run(stop <-chan struct{}) error {
	if wrench.IsActive("machine-agent", "fail-upgrade-start") {
		return nil // Make the worker stop
	}

	select {
	case <-c.UpgradeComplete:
		// Our work is already done (we're probably being restarted
		// because the API connection has gone down), so do nothing.
		return nil
	default:
	}

	c.agentConfig = c.agent.CurrentConfig()

	c.fromVersion = c.agentConfig.UpgradedToVersion()
	c.toVersion = version.Current.Number
	if c.fromVersion == c.toVersion {
		logger.Infof("upgrade to %v already completed.", c.toVersion)
		close(c.UpgradeComplete)
		return nil
	}

	if err := c.initTag(c.agentConfig.Tag()); err != nil {
		return errors.Trace(err)
	}

	// If the machine agent is a state server, flag that state
	// needs to be opened before running upgrade steps.
	for _, job := range c.jobs {
		if job == multiwatcher.JobManageEnviron {
			c.isStateServer = true
		}
	}

	// We need a *state.State for upgrades. We open it independently
	// of StateWorker, because we have no guarantees about when
	// and how often StateWorker might run.
	if c.isStateServer {
		var err error
		if c.st, err = openStateForUpgrade(c.agent, c.agentConfig); err != nil {
			return err
		}
		defer c.st.Close()

		if c.isMaster, err = isMachineMaster(c.st, c.machineId); err != nil {
			return errors.Trace(err)
		}

		stor := storage.NewStorage(c.st.EnvironUUID(), c.st.MongoSession())
		registerSimplestreamsDataSource(stor)
	}
	if err := c.runUpgrades(); err != nil {
		// Only return an error from the worker if the connection to
		// state went away (possible mongo master change). Returning
		// an error when the connection is lost will cause the agent
		// to restart.
		//
		// For other errors, the error is not returned because we want
		// the machine agent to stay running in an error state waiting
		// for user intervention.
		if isAPILostDuringUpgrade(err) {
			return err
		}
		c.reportUpgradeFailure(err, false)
	} else {
		// Upgrade succeeded - signal that the upgrade is complete.
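		// Closing UpgradeComplete lets IsUpgradeRunning report false
		// and lets a restarted worker see that the upgrade steps have
		// already run.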
		logger.Infof("upgrade to %v completed successfully.", c.toVersion)
		c.agent.setMachineStatus(c.apiState, params.StatusStarted, "")
		close(c.UpgradeComplete)
	}
	return nil
}

func (c *upgradeWorkerContext) initTag(tag names.Tag) error {
	var ok bool
	if c.tag, ok = tag.(names.MachineTag); !ok {
		return errors.New("machine agent's tag is not a MachineTag")
	}
	c.machineId = c.tag.Id()
	return nil
}

var agentTerminating = errors.New("machine agent is terminating")

// runUpgrades runs the upgrade operations for each job type and
// updates the agent's UpgradedToVersion on success.
func (c *upgradeWorkerContext) runUpgrades() error {
	upgradeInfo, err := c.prepareForUpgrade()
	if err != nil {
		return err
	}

	if wrench.IsActive("machine-agent", "fail-upgrade") {
		return errors.New("wrench")
	}

	if err := c.agent.ChangeConfig(c.runUpgradeSteps); err != nil {
		return err
	}

	if err := c.finaliseUpgrade(upgradeInfo); err != nil {
		return err
	}
	return nil
}

func (c *upgradeWorkerContext) prepareForUpgrade() (*state.UpgradeInfo, error) {
	if !c.isStateServer {
		return nil, nil
	}

	logger.Infof("signalling that this state server is ready for upgrade")
	info, err := c.st.EnsureUpgradeInfo(c.machineId, c.fromVersion, c.toVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// State servers need to wait for other state servers to be ready
	// to run the upgrade steps.
	logger.Infof("waiting for other state servers to be ready for upgrade")
	if err := c.waitForOtherStateServers(info); err != nil {
		if err == agentTerminating {
			logger.Warningf(`stopped waiting for other state servers: %v`, err)
		} else {
			logger.Errorf(`aborted wait for other state servers: %v`, err)
			// If master, trigger a rollback to the previous agent version.
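			// Reverting the environment's desired agent version stops
			// the remaining agents from attempting the aborted upgrade.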
			if c.isMaster {
				logger.Errorf("downgrading environment agent version to %v due to aborted upgrade",
					c.fromVersion)
				if rollbackErr := c.st.SetEnvironAgentVersion(c.fromVersion); rollbackErr != nil {
					logger.Errorf("rollback failed: %v", rollbackErr)
					return nil, errors.Annotate(rollbackErr, "failed to roll back desired agent version")
				}
			}
		}
		return nil, errors.Annotate(err, "aborted wait for other state servers")
	}
	if c.isMaster {
		logger.Infof("finished waiting - all state servers are ready to run upgrade steps")
	} else {
		logger.Infof("finished waiting - the master has completed its upgrade steps")
	}
	return info, nil
}

func (c *upgradeWorkerContext) waitForOtherStateServers(info *state.UpgradeInfo) error {
	watcher := info.Watch()
	defer watcher.Stop()

	maxWait := getUpgradeStartTimeout(c.isMaster)
	timeout := time.After(maxWait)
	for {
		select {
		case <-watcher.Changes():
			if err := info.Refresh(); err != nil {
				return errors.Trace(err)
			}
			if c.isMaster {
				if ready, err := info.AllProvisionedStateServersReady(); err != nil {
					return errors.Trace(err)
				} else if ready {
					// All state servers ready to start upgrade
					err := info.SetStatus(state.UpgradeRunning)
					return errors.Trace(err)
				}
			} else {
				if info.Status() == state.UpgradeFinishing {
					// Master is done, ok to proceed
					return nil
				}
			}
		case <-timeout:
			if c.isMaster {
				if err := info.Abort(); err != nil {
					return errors.Annotate(err, "unable to abort upgrade")
				}
			}
			return errors.Errorf("timed out after %s", maxWait)
		case <-c.agent.Dying():
			return agentTerminating
		}
	}
}

// runUpgradeSteps runs the required upgrade steps for the machine
// agent, retrying on failure. The agent's UpgradedToVersion is set
// once the upgrade is complete.
//
// This function conforms to the agent.ConfigMutator type and is
// designed to be called via a machine agent's ChangeConfig method.
func (c *upgradeWorkerContext) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	var upgradeErr error
	a := c.agent
	a.setMachineStatus(c.apiState, params.StatusStarted, fmt.Sprintf("upgrading to %v", c.toVersion))

	context := upgrades.NewContext(agentConfig, c.apiState, c.st)
	logger.Infof("starting upgrade from %v to %v for %q", c.fromVersion, c.toVersion, c.tag)

	targets := jobsToTargets(c.jobs, c.isMaster)
	attempts := getUpgradeRetryStrategy()
	for attempt := attempts.Start(); attempt.Next(); {
		upgradeErr = upgradesPerformUpgrade(c.fromVersion, targets, context)
		if upgradeErr == nil {
			break
		}
		if cmdutil.ConnectionIsDead(logger, c.apiState) {
			// API connection has gone away - abort!
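			// Wrapping the error in apiLostDuringUpgrade lets run()
			// distinguish this case and return it, restarting the
			// agent instead of leaving it waiting for intervention.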
			return &apiLostDuringUpgrade{upgradeErr}
		}
		if attempt.HasNext() {
			c.reportUpgradeFailure(upgradeErr, true)
		}
	}
	if upgradeErr != nil {
		return upgradeErr
	}
	agentConfig.SetUpgradedToVersion(c.toVersion)
	return nil
}

func (c *upgradeWorkerContext) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
		c.fromVersion, c.toVersion, c.tag, retryText, err)
	c.agent.setMachineStatus(c.apiState, params.StatusError,
		fmt.Sprintf("upgrade to %v failed (%s): %v", c.toVersion, retryText, err))
}

func (c *upgradeWorkerContext) finaliseUpgrade(info *state.UpgradeInfo) error {
	if !c.isStateServer {
		return nil
	}

	if c.isMaster {
		// Tell other state servers that the master has completed its
		// upgrade steps.
		if err := info.SetStatus(state.UpgradeFinishing); err != nil {
			return errors.Annotate(err, "upgrade done but failed to set upgrade status")
		}
	}

	if err := info.SetStateServerDone(c.machineId); err != nil {
		return errors.Annotate(err, "upgrade done but failed to synchronise")
	}

	return nil
}

func getUpgradeStartTimeout(isMaster bool) time.Duration {
	if wrench.IsActive("machine-agent", "short-upgrade-timeout") {
		// This duration is fairly arbitrary. During manual testing it
		// avoids the normal long wait but still provides a small
		// window to check the environment status and logs before the
		// timeout is triggered.
		return time.Minute
	}

	if isMaster {
		return upgradeStartTimeoutMaster
	}
	return upgradeStartTimeoutSecondary
}

var openStateForUpgrade = func(
	agent upgradingMachineAgent,
	agentConfig agent.Config,
) (*state.State, error) {
	if err := agent.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}
	var err error
	info, ok := agentConfig.MongoInfo()
	if !ok {
		return nil, fmt.Errorf("no state info available")
	}
	st, err := state.Open(agentConfig.Environment(), info, mongo.DefaultDialOpts(), environs.NewStatePolicy())
	if err != nil {
		return nil, err
	}
	return st, nil
}

var isMachineMaster = func(st *state.State, machineId string) (bool, error) {
	if st == nil {
		// If there is no state, we aren't a master.
		return false, nil
	}
	// Not calling the agent openState method as it does other checks
	// we really don't care about here. All we need here is the machine
	// so we can determine if we are the master or not.
	machine, err := st.Machine(machineId)
	if err != nil {
		// This shouldn't happen, and if it does, the state worker will have
		// found out before us, and already errored, or is likely to error out
		// very shortly. All we do here is return the error. The state worker
		// returns an error that will cause the agent to be terminated.
		return false, errors.Trace(err)
	}
	isMaster, err := mongo.IsMaster(st.MongoSession(), machine)
	if err != nil {
		return false, errors.Trace(err)
	}
	return isMaster, nil
}

var getUpgradeRetryStrategy = func() utils.AttemptStrategy {
	return utils.AttemptStrategy{
		Delay: 2 * time.Minute,
		Min:   5,
	}
}

// jobsToTargets determines the upgrade targets corresponding to the
// jobs assigned to a machine agent.
// This determines the upgrade steps which will run during an upgrade.
func jobsToTargets(jobs []multiwatcher.MachineJob, isMaster bool) (targets []upgrades.Target) {
	for _, job := range jobs {
		switch job {
		case multiwatcher.JobManageEnviron:
			targets = append(targets, upgrades.StateServer)
			if isMaster {
				targets = append(targets, upgrades.DatabaseMaster)
			}
		case multiwatcher.JobHostUnits:
			targets = append(targets, upgrades.HostMachine)
		}
	}
	return
}
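
// The sketch below shows how this context is intended to be wired into a
// machine agent, based only on the exported API in this file. It is
// illustrative: the variables a, apiState and jobs stand in for the agent,
// API connection and machine jobs that the surrounding agent code supplies.
//
//	ctx := NewUpgradeWorkerContext()
//	if err := ctx.InitializeUsingAgent(a); err != nil {
//		// handle error
//	}
//	w := ctx.Worker(a, apiState, jobs)
//	// Run the returned worker under the agent. Other code can call
//	// ctx.IsUpgradeRunning() or wait on ctx.UpgradeComplete to know
//	// when the upgrade steps have finished.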