github.com/altoros/juju-vmware@v0.0.0-20150312064031-f19ae857ccca/cmd/jujud/agent/upgrade_test.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package agent

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"sync"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	jc "github.com/juju/testing/checkers"
	"github.com/juju/utils"
	"github.com/juju/utils/apt"
	gc "gopkg.in/check.v1"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	"github.com/juju/juju/apiserver/params"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/constraints"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/environs/config"
	envtesting "github.com/juju/juju/environs/testing"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/state/watcher"
	coretesting "github.com/juju/juju/testing"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/upgrader"
)

type UpgradeSuite struct {
	commonMachineSuite

	aptCmds         []*exec.Cmd
	oldVersion      version.Binary
	logWriter       loggo.TestWriter
	connectionDead  bool
	machineIsMaster bool
	aptMutex        sync.Mutex
}

var _ = gc.Suite(&UpgradeSuite{})

type exposedAPI bool

var (
	FullAPIExposed       exposedAPI = true
	RestrictedAPIExposed exposedAPI = false
)

const fails = true
const succeeds = false

func (s *UpgradeSuite) setAptCmds(cmd *exec.Cmd) []*exec.Cmd {
	s.aptMutex.Lock()
	defer s.aptMutex.Unlock()
	if cmd == nil {
		s.aptCmds = nil
	} else {
		s.aptCmds = append(s.aptCmds, cmd)
	}
	return s.aptCmds
}

func (s *UpgradeSuite) getAptCmds() []*exec.Cmd {
	s.aptMutex.Lock()
	defer s.aptMutex.Unlock()
	return s.aptCmds
}

func (s *UpgradeSuite) SetUpTest(c *gc.C) {
	s.commonMachineSuite.SetUpTest(c)

	// Capture all apt commands.
	s.aptCmds = nil
	aptCmds := s.AgentSuite.HookCommandOutput(&apt.CommandOutput, nil, nil)
	go func() {
		for cmd := range aptCmds {
			s.setAptCmds(cmd)
		}
	}()

	s.oldVersion = version.Current
	s.oldVersion.Major = 1
	s.oldVersion.Minor = 16

	// Don't wait so long in tests.
	s.PatchValue(&upgradeStartTimeoutMaster, time.Duration(time.Millisecond*50))
	s.PatchValue(&upgradeStartTimeoutSecondary, time.Duration(time.Millisecond*60))

	// Allow tests to make the API connection appear to be dead.
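	// Tests set s.connectionDead to true to simulate a dropped
	// connection (see TestApiConnectionFailure).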
	s.connectionDead = false
	s.PatchValue(&cmdutil.ConnectionIsDead, func(loggo.Logger, cmdutil.Pinger) bool {
		return s.connectionDead
	})

	var fakeOpenStateForUpgrade = func(upgradingMachineAgent, agent.Config) (*state.State, error) {
		mongoInfo := s.State.MongoConnectionInfo()
		st, err := state.Open(mongoInfo, mongo.DefaultDialOpts(), environs.NewStatePolicy())
		c.Assert(err, jc.ErrorIsNil)
		return st, nil
	}
	s.PatchValue(&openStateForUpgrade, fakeOpenStateForUpgrade)

	s.machineIsMaster = true
	fakeIsMachineMaster := func(*state.State, string) (bool, error) {
		return s.machineIsMaster, nil
	}
	s.PatchValue(&isMachineMaster, fakeIsMachineMaster)
}

func (s *UpgradeSuite) captureLogs(c *gc.C) {
	c.Assert(loggo.RegisterWriter("upgrade-tests", &s.logWriter, loggo.INFO), gc.IsNil)
	s.AddCleanup(func(*gc.C) {
		loggo.RemoveWriter("upgrade-tests")
		s.logWriter.Clear()
	})
}

func (s *UpgradeSuite) countUpgradeAttempts(upgradeErr error) *int {
	count := 0
	s.PatchValue(&upgradesPerformUpgrade, func(version.Number, []upgrades.Target, upgrades.Context) error {
		count++
		return upgradeErr
	})
	return &count
}

func (s *UpgradeSuite) TestContextInitializeWhenNoUpgradeRequired(c *gc.C) {
	// Set the agent's initial upgradedToVersion to almost the same as
	// the current version. We want it to be different to
	// version.Current (so that we can see it change) but not to
	// trigger upgrade steps.
	config := NewFakeConfigSetter(names.NewMachineTag("0"), makeBumpedCurrentVersion().Number)
	agent := NewFakeUpgradingMachineAgent(config)

	context := NewUpgradeWorkerContext()
	context.InitializeUsingAgent(agent)

	select {
	case <-context.UpgradeComplete:
		// Success
	default:
		c.Fatal("UpgradeComplete channel should be closed because no upgrade is required")
	}
	// The agent's version should have been updated.
	c.Assert(config.Version, gc.Equals, version.Current.Number)
}

func (s *UpgradeSuite) TestContextInitializeWhenUpgradeRequired(c *gc.C) {
	// Set the agent's upgradedToVersion so that upgrade steps are required.
	initialVersion := version.MustParse("1.16.0")
	config := NewFakeConfigSetter(names.NewMachineTag("0"), initialVersion)
	agent := NewFakeUpgradingMachineAgent(config)

	context := NewUpgradeWorkerContext()
	context.InitializeUsingAgent(agent)

	select {
	case <-context.UpgradeComplete:
		c.Fatal("UpgradeComplete channel shouldn't be closed because upgrade is required")
	default:
		// Success
	}
	// The agent's version should NOT have been updated.
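	// It should still report the 1.16.0 version set above.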
	c.Assert(config.Version, gc.Equals, initialVersion)
}

func (s *UpgradeSuite) TestRetryStrategy(c *gc.C) {
	retries := getUpgradeRetryStrategy()
	c.Assert(retries.Delay, gc.Equals, 2*time.Minute)
	c.Assert(retries.Min, gc.Equals, 5)
}

func (s *UpgradeSuite) TestIsUpgradeRunning(c *gc.C) {
	context := NewUpgradeWorkerContext()
	c.Assert(context.IsUpgradeRunning(), jc.IsTrue)

	close(context.UpgradeComplete)
	c.Assert(context.IsUpgradeRunning(), jc.IsFalse)
}

func (s *UpgradeSuite) TestNoUpgradeNecessary(c *gc.C) {
	attemptsP := s.countUpgradeAttempts(nil)
	s.captureLogs(c)
	s.oldVersion = version.Current // nothing to do

	workerErr, config, _, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, version.Current.Number)
	assertUpgradeComplete(c, context)
}

func (s *UpgradeSuite) TestUpgradeStepsFailure(c *gc.C) {
	// This test checks what happens when every upgrade attempt fails.
	// A number of retries should be observed and the agent should end
	// up in a state where it is still running but is reporting an
	// error and the upgrade is not flagged as having completed (which
	// prevents most of the agent's workers from running and keeps the
	// API in restricted mode).

	attemptsP := s.countUpgradeAttempts(errors.New("boom"))
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	// The worker shouldn't return an error so that the worker and
	// agent keep running.
	c.Check(workerErr, gc.IsNil)

	c.Check(*attemptsP, gc.Equals, maxUpgradeRetries)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't finish
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals,
		s.makeExpectedStatusCalls(maxUpgradeRetries-1, fails, "boom"))
	c.Assert(s.logWriter.Log(), jc.LogMatches,
		s.makeExpectedUpgradeLogs(maxUpgradeRetries-1, "hostMachine", fails, "boom"))
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestUpgradeStepsRetries(c *gc.C) {
	// This test checks what happens when the first upgrade attempt
	// fails but the following one succeeds. The final state should be
	// the same as a successful upgrade which worked first go.
	attempts := 0
	fail := true
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		attempts++
		if fail {
			fail = false
			return errors.New("boom")
		} else {
			return nil
		}
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.IsNil)
	c.Check(attempts, gc.Equals, 2)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade finished
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, s.makeExpectedStatusCalls(1, succeeds, "boom"))
	c.Assert(s.logWriter.Log(), jc.LogMatches, s.makeExpectedUpgradeLogs(1, "hostMachine", succeeds, "boom"))
	assertUpgradeComplete(c, context)
}

func (s *UpgradeSuite) TestOtherUpgradeRunFailure(c *gc.C) {
	// This test checks what happens when something other than the
	// upgrade steps themselves fails, ensuring that the failure is
	// logged and the agent status is updated.

	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Delete UpgradeInfo for the upgrade so that finaliseUpgrade() will fail.
		s.State.ClearUpgradeInfo()
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)
	s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade almost finished
	failReason := `upgrade done but: cannot set upgrade status to "finishing": ` +
		`Another status change may have occurred concurrently`
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals,
		s.makeExpectedStatusCalls(0, fails, failReason))
	c.Assert(s.logWriter.Log(), jc.LogMatches,
		s.makeExpectedUpgradeLogs(0, "databaseMaster", fails, failReason))
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestApiConnectionFailure(c *gc.C) {
	// This test checks what happens when an upgrade fails because the
	// connection to mongo has gone away. This will happen when the
	// mongo master changes. In this case we want the upgrade worker
	// to return immediately without further retries. The error should
	// be returned by the worker so that the agent will restart.

	attemptsP := s.countUpgradeAttempts(errors.New("boom"))
	s.connectionDead = true // Make the connection to state appear to be dead
	s.captureLogs(c)

	workerErr, config, _, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.ErrorMatches, "API connection lost during upgrade: boom")
	c.Check(*attemptsP, gc.Equals, 1)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't finish
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestAbortWhenOtherStateServerDoesntStartUpgrade(c *gc.C) {
	// This test checks what happens when a state server is upgrading
	// and one of the other state servers doesn't signal that it is
	// ready in time.

	// The master state server in this scenario is functionally tested
	// elsewhere in this suite.
	s.machineIsMaster = false

	s.createUpgradingStateServers(c)
	s.captureLogs(c)
	attemptsP := s.countUpgradeAttempts(nil)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't happen
	assertUpgradeNotComplete(c, context)

	// The environment agent-version should still be the new version.
	// It's up to the master to trigger the rollback.
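	// (The master-driven downgrade is covered by
	// TestDowngradeOnMasterWhenOtherStateServerDoesntStartUpgrade.)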
	s.assertEnvironAgentVersion(c, version.Current.Number)

	causeMsg := " timed out after 60ms"
	c.Assert(s.logWriter.Log(), jc.LogMatches, []jc.SimpleMessage{
		{loggo.INFO, "waiting for other state servers to be ready for upgrade"},
		{loggo.ERROR, "aborted wait for other state servers: timed out after 60ms"},
		{loggo.ERROR, `upgrade from .+ to .+ for "machine-0" failed \(giving up\): ` +
			"aborted wait for other state servers:" + causeMsg},
	})
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, []MachineStatusCall{{
		params.StatusError,
		fmt.Sprintf(
			"upgrade to %s failed (giving up): aborted wait for other state servers:"+causeMsg,
			version.Current.Number),
	}})
}

func (s *UpgradeSuite) TestWorkerAbortsIfAgentDies(c *gc.C) {
	s.machineIsMaster = false
	s.captureLogs(c)
	attemptsP := s.countUpgradeAttempts(nil)

	s.primeAgent(c, s.oldVersion, state.JobManageEnviron)

	config := s.makeFakeConfig()
	agent := NewFakeUpgradingMachineAgent(config)
	close(agent.DyingCh)
	workerErr, context := s.runUpgradeWorkerUsingAgent(c, agent, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't happen
	assertUpgradeNotComplete(c, context)
	c.Assert(s.logWriter.Log(), jc.LogMatches, []jc.SimpleMessage{
		{loggo.WARNING, "stopped waiting for other state servers: machine agent is terminating"},
	})
}

func (s *UpgradeSuite) TestSuccessMaster(c *gc.C) {
	// This test checks what happens when an upgrade works on the
	// first attempt on a master state server.
	s.machineIsMaster = true
	info := s.checkSuccess(c, "databaseMaster", func(*state.UpgradeInfo) {})
	c.Assert(info.Status(), gc.Equals, state.UpgradeFinishing)
}

func (s *UpgradeSuite) TestSuccessSecondary(c *gc.C) {
	// This test checks what happens when an upgrade works on the
	// first attempt on a secondary state server.
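	// A secondary waits for the master to finish its upgrade steps, so
	// mungeInfo below marks the master as done before the worker runs.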
	s.machineIsMaster = false
	mungeInfo := func(info *state.UpgradeInfo) {
		// Indicate that the master is done.
		err := info.SetStatus(state.UpgradeRunning)
		c.Assert(err, jc.ErrorIsNil)
		err = info.SetStatus(state.UpgradeFinishing)
		c.Assert(err, jc.ErrorIsNil)
	}
	s.checkSuccess(c, "stateServer", mungeInfo)
}

func (s *UpgradeSuite) checkSuccess(c *gc.C, target string, mungeInfo func(*state.UpgradeInfo)) *state.UpgradeInfo {
	_, machineIdB, machineIdC := s.createUpgradingStateServers(c)

	// Indicate that machines B and C are ready to upgrade.
	vPrevious := s.oldVersion.Number
	vNext := version.Current.Number
	info, err := s.State.EnsureUpgradeInfo(machineIdB, vPrevious, vNext)
	c.Assert(err, jc.ErrorIsNil)
	_, err = s.State.EnsureUpgradeInfo(machineIdC, vPrevious, vNext)
	c.Assert(err, jc.ErrorIsNil)

	mungeInfo(info)

	attemptsP := s.countUpgradeAttempts(nil)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 1)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade finished
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, s.makeExpectedStatusCalls(0, succeeds, ""))
	c.Assert(s.logWriter.Log(), jc.LogMatches, s.makeExpectedUpgradeLogs(0, target, succeeds, ""))
	assertUpgradeComplete(c, context)

	err = info.Refresh()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(info.StateServersDone(), jc.DeepEquals, []string{"0"})
	return info
}

func (s *UpgradeSuite) TestJobsToTargets(c *gc.C) {
	check := func(jobs []multiwatcher.MachineJob, isMaster bool, expectedTargets ...upgrades.Target) {
		c.Assert(jobsToTargets(jobs, isMaster), jc.SameContents, expectedTargets)
	}

	check([]multiwatcher.MachineJob{multiwatcher.JobHostUnits}, false, upgrades.HostMachine)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron}, false, upgrades.StateServer)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron}, true,
		upgrades.StateServer, upgrades.DatabaseMaster)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron, multiwatcher.JobHostUnits}, false,
		upgrades.StateServer, upgrades.HostMachine)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron, multiwatcher.JobHostUnits}, true,
		upgrades.StateServer, upgrades.DatabaseMaster, upgrades.HostMachine)
}

func (s *UpgradeSuite) TestUpgradeStepsStateServer(c *gc.C) {
	s.setInstantRetryStrategy(c)
	// Upload tools to provider storage, so they can be migrated to environment storage.
	stor, err := environs.LegacyStorage(s.State)
	if !errors.IsNotSupported(err) {
		c.Assert(err, jc.ErrorIsNil)
		envtesting.AssertUploadFakeToolsVersions(
			c, stor, "releases", s.Environ.Config().AgentStream(), s.oldVersion)
	}

	s.assertUpgradeSteps(c, state.JobManageEnviron)
	s.assertStateServerUpgrades(c)
}

func (s *UpgradeSuite) TestUpgradeStepsHostMachine(c *gc.C) {
	s.setInstantRetryStrategy(c)
	// We need to first start up a state server that thinks it has already been upgraded.
	ss, _, _ := s.primeAgent(c, version.Current, state.JobManageEnviron)
	a := s.newAgent(c, ss)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	defer func() { c.Check(a.Stop(), gc.IsNil) }()
	// Now run the test.
	s.assertUpgradeSteps(c, state.JobHostUnits)
	s.assertHostUpgrades(c)
}

func (s *UpgradeSuite) TestLoginsDuringUpgrade(c *gc.C) {
	// Create a machine agent to upgrade.
	machine, machine0Conf, _ := s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	a := s.newAgent(c, machine)

	// Mock out upgrade logic, using a channel so that the test knows
	// when upgrades have started and can control when upgrades
	// should finish.
	upgradeCh := make(chan bool)
	abort := make(chan bool)
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Signal that the upgrade has started.
		select {
		case upgradeCh <- true:
		case <-abort:
			return nil
		}

		// Wait for the signal that upgrades should finish.
		select {
		case <-upgradeCh:
		case <-abort:
			return nil
		}
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)

	// Start the API server and upgrade-steps workers just as the agent would.
	runner := worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant)
	defer func() {
		close(abort)
		runner.Kill()
		runner.Wait()
	}()
	certChangedChan := make(chan params.StateServingInfo)
	runner.StartWorker("apiserver", a.apiserverWorkerStarter(s.State, certChangedChan))
	runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(
		s.APIState,
		[]multiwatcher.MachineJob{multiwatcher.JobManageEnviron},
	))

	// Set up a second machine to log in as.
	// API logins are tested manually so there's no need to actually
	// start this machine.
	var machine1Conf agent.Config
	_, machine1Conf, _ = s.primeAgent(c, version.Current, state.JobHostUnits)

	c.Assert(waitForUpgradeToStart(upgradeCh), jc.IsTrue)

	// Only user and local logins are allowed during upgrade. Users get a restricted API.
	s.checkLoginToAPIAsUser(c, machine0Conf, RestrictedAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsFalse)

	close(upgradeCh) // Allow upgrade to complete

	waitForUpgradeToFinish(c, machine0Conf)

	// All logins are allowed after upgrade.
	s.checkLoginToAPIAsUser(c, machine0Conf, FullAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsTrue)
}

func (s *UpgradeSuite) TestUpgradeSkippedIfNoUpgradeRequired(c *gc.C) {
	attempts := 0
	upgradeCh := make(chan bool)
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Note: this shouldn't run.
		attempts++
		// If execution ends up here, wait so it can be detected (by
		// checking for the restricted API).
		<-upgradeCh
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)

	// Set up a machine agent running the current version.
	//
	// Set the agent's initial upgradedToVersion to be almost the same
	// as version.Current but not quite. We want it to be different to
	// version.Current (so that we can see it change) but not to
	// trigger upgrade steps.
	initialVersion := makeBumpedCurrentVersion()
	machine, agentConf, _ := s.primeAgent(c, initialVersion, state.JobManageEnviron)
	a := s.newAgent(c, machine)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	defer func() {
		close(upgradeCh)
		c.Check(a.Stop(), gc.IsNil)
	}()

	// Test that unrestricted API logins are possible (i.e. no
	// "upgrade mode" in force).
	s.checkLoginToAPIAsUser(c, agentConf, FullAPIExposed)
	c.Assert(attempts, gc.Equals, 0) // There should have been no attempt to upgrade.

	// Even though no upgrade was done, upgradedToVersion should have been updated.
	c.Assert(a.CurrentConfig().UpgradedToVersion(), gc.Equals, version.Current.Number)
}

func (s *UpgradeSuite) TestDowngradeOnMasterWhenOtherStateServerDoesntStartUpgrade(c *gc.C) {
	// This test checks that the master triggers a downgrade if one of
	// the other state servers fails to signal that it is ready for
	// upgrade.
	//
	// This test is functional, ensuring that the upgrader worker
	// terminates the machine agent with the UpgradeReadyError which
	// makes the downgrade happen.

	// Speed up the watcher frequency to make the test much faster.
	s.PatchValue(&watcher.Period, 200*time.Millisecond)

	// Provide (fake) tools so that the upgrader has something to downgrade to.
	envtesting.AssertUploadFakeToolsVersions(
		c, s.DefaultToolsStorage, s.Environ.Config().AgentStream(), s.Environ.Config().AgentStream(), s.oldVersion)

	// Only the first machine is going to be ready for upgrade.
	machineIdA, machineIdB, _ := s.createUpgradingStateServers(c)

	// One of the other state servers is ready for upgrade (but machine C isn't).
	info, err := s.State.EnsureUpgradeInfo(machineIdB, s.oldVersion.Number, version.Current.Number)
	c.Assert(err, jc.ErrorIsNil)

	agent := s.newAgentFromMachineId(c, machineIdA)
	defer agent.Stop()

	s.machineIsMaster = true

	var agentErr error
	agentDone := make(chan bool)
	go func() {
		agentErr = agent.Run(nil)
		close(agentDone)
	}()

	select {
	case <-agentDone:
		upgradeReadyErr, ok := agentErr.(*upgrader.UpgradeReadyError)
		if !ok {
			c.Fatalf("didn't see UpgradeReadyError, instead got: %v", agentErr)
		}
		// Confirm that the downgrade is back to the previous version.
		c.Assert(upgradeReadyErr.OldTools, gc.Equals, version.Current)
		c.Assert(upgradeReadyErr.NewTools, gc.Equals, s.oldVersion)

	case <-time.After(coretesting.LongWait):
		c.Fatal("machine agent did not exit as expected")
	}

	// The UpgradeInfo doc should now be archived.
	err = info.Refresh()
	c.Assert(err, gc.ErrorMatches, "current upgrade info not found")
}

// runUpgradeWorker runs just the upgrade-steps worker with a fake
// machine agent and fake agent config.
func (s *UpgradeSuite) runUpgradeWorker(c *gc.C, jobs ...multiwatcher.MachineJob) (
	error, *fakeConfigSetter, *fakeUpgradingMachineAgent, *upgradeWorkerContext,
) {
	config := s.makeFakeConfig()
	agent := NewFakeUpgradingMachineAgent(config)
	err, context := s.runUpgradeWorkerUsingAgent(c, agent, jobs...)
	return err, config, agent, context
}

// runUpgradeWorkerUsingAgent runs just the upgrade-steps worker with
// the fake machine agent provided.
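// The instant retry strategy is installed so that failed attempts are
// retried without delay.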
func (s *UpgradeSuite) runUpgradeWorkerUsingAgent(
	c *gc.C,
	agent *fakeUpgradingMachineAgent,
	jobs ...multiwatcher.MachineJob,
) (error, *upgradeWorkerContext) {
	s.setInstantRetryStrategy(c)
	context := NewUpgradeWorkerContext()
	worker := context.Worker(agent, nil, jobs)
	return worker.Wait(), context
}

func (s *UpgradeSuite) makeFakeConfig() *fakeConfigSetter {
	return NewFakeConfigSetter(names.NewMachineTag("0"), s.oldVersion.Number)
}

// createUpgradingStateServers creates 3 configured state servers that
// appear to be running tools with version s.oldVersion and returns
// their ids.
func (s *UpgradeSuite) createUpgradingStateServers(c *gc.C) (machineIdA, machineIdB, machineIdC string) {
	machine0, _, _ := s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	machineIdA = machine0.Id()

	changes, err := s.State.EnsureAvailability(3, constraints.Value{}, "quantal", nil)
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(len(changes.Added), gc.Equals, 2)
	machineIdB = changes.Added[0]
	s.configureMachine(c, machineIdB, s.oldVersion)
	machineIdC = changes.Added[1]
	s.configureMachine(c, machineIdC, s.oldVersion)

	return
}

func (s *UpgradeSuite) newAgentFromMachineId(c *gc.C, machineId string) *MachineAgent {
	machine, err := s.State.Machine(machineId)
	c.Assert(err, jc.ErrorIsNil)
	return s.newAgent(c, machine)
}

// makeBumpedCurrentVersion returns a version the same as the current
// software version, but with the build number bumped.
//
// The version Tag is also cleared so that upgrades.PerformUpgrade
// doesn't think it needs to run upgrade steps unnecessarily.
func makeBumpedCurrentVersion() version.Binary {
	v := version.Current
	v.Build++
	v.Tag = ""
	return v
}

func waitForUpgradeToStart(upgradeCh chan bool) bool {
	select {
	case <-upgradeCh:
		return true
	case <-time.After(coretesting.LongWait):
		return false
	}
}

const maxUpgradeRetries = 3

func (s *UpgradeSuite) setInstantRetryStrategy(c *gc.C) {
	s.PatchValue(&getUpgradeRetryStrategy, func() utils.AttemptStrategy {
		c.Logf("setting instant retry strategy for upgrade: retries=%d", maxUpgradeRetries)
		return utils.AttemptStrategy{
			Delay: 0,
			Min:   maxUpgradeRetries,
		}
	})
}

func (s *UpgradeSuite) makeExpectedStatusCalls(retryCount int, expectFail bool, failReason string) []MachineStatusCall {
	calls := []MachineStatusCall{{
		params.StatusStarted,
		fmt.Sprintf("upgrading to %s", version.Current.Number),
	}}
	for i := 0; i < retryCount; i++ {
		calls = append(calls, MachineStatusCall{
			params.StatusError,
			fmt.Sprintf("upgrade to %s failed (will retry): %s", version.Current.Number, failReason),
		})
	}
	if expectFail {
		calls = append(calls, MachineStatusCall{
			params.StatusError,
			fmt.Sprintf("upgrade to %s failed (giving up): %s", version.Current.Number, failReason),
		})
	} else {
		calls = append(calls, MachineStatusCall{params.StatusStarted, ""})
	}
	return calls
}

func (s *UpgradeSuite) makeExpectedUpgradeLogs(
	retryCount int,
	target string,
	expectFail bool,
	failReason string,
) []jc.SimpleMessage {
	outLogs := []jc.SimpleMessage{}

	if target == "databaseMaster" || target == "stateServer" {
		outLogs = append(outLogs, jc.SimpleMessage{
			loggo.INFO, "waiting for other state servers to be ready for upgrade",
		})
		var waitMsg string
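		// The "finished waiting" message differs between the database
		// master and a secondary state server.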
		switch target {
		case "databaseMaster":
			waitMsg = "all state servers are ready to run upgrade steps"
		case "stateServer":
			waitMsg = "the master has completed its upgrade steps"
		}
		outLogs = append(outLogs, jc.SimpleMessage{loggo.INFO, "finished waiting - " + waitMsg})
	}

	outLogs = append(outLogs, jc.SimpleMessage{
		loggo.INFO, fmt.Sprintf(
			`starting upgrade from %s to %s for "machine-0"`,
			s.oldVersion.Number, version.Current.Number),
	})

	failMessage := fmt.Sprintf(
		`upgrade from %s to %s for "machine-0" failed \(%%s\): %s`,
		s.oldVersion.Number, version.Current.Number, failReason)

	for i := 0; i < retryCount; i++ {
		outLogs = append(outLogs, jc.SimpleMessage{loggo.ERROR, fmt.Sprintf(failMessage, "will retry")})
	}
	if expectFail {
		outLogs = append(outLogs, jc.SimpleMessage{loggo.ERROR, fmt.Sprintf(failMessage, "giving up")})
	} else {
		outLogs = append(outLogs, jc.SimpleMessage{loggo.INFO,
			fmt.Sprintf(`upgrade to %s completed successfully.`, version.Current.Number)})
	}
	return outLogs
}

func (s *UpgradeSuite) assertUpgradeSteps(c *gc.C, job state.MachineJob) {
	agent, stopFunc := s.createAgentAndStartUpgrade(c, job)
	defer stopFunc()
	waitForUpgradeToFinish(c, agent.CurrentConfig())
}

func (s *UpgradeSuite) keyFile() string {
	return filepath.Join(s.DataDir(), "system-identity")
}

func (s *UpgradeSuite) assertCommonUpgrades(c *gc.C) {
	// rsyslog-gnutls should have been installed.
	cmds := s.getAptCmds()
	c.Assert(cmds, gc.HasLen, 1)
	args := cmds[0].Args
	c.Assert(len(args), jc.GreaterThan, 1)
	c.Assert(args[0], gc.Equals, "apt-get")
	c.Assert(args[len(args)-1], gc.Equals, "rsyslog-gnutls")
}

func (s *UpgradeSuite) assertStateServerUpgrades(c *gc.C) {
	s.assertCommonUpgrades(c)
	// System SSH key
	c.Assert(s.keyFile(), jc.IsNonEmptyFile)
	// Syslog port should have been updated.
	cfg, err := s.State.EnvironConfig()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(cfg.SyslogPort(), gc.Equals, config.DefaultSyslogPort)
	// Deprecated attributes should have been deleted - just test a couple.
	allAttrs := cfg.AllAttrs()
	_, ok := allAttrs["public-bucket"]
	c.Assert(ok, jc.IsFalse)
	_, ok = allAttrs["public-bucket-region"]
	c.Assert(ok, jc.IsFalse)
}

func (s *UpgradeSuite) assertHostUpgrades(c *gc.C) {
	s.assertCommonUpgrades(c)
	// Lock directory
	lockdir := filepath.Join(s.DataDir(), "locks")
	c.Assert(lockdir, jc.IsDirectory)
	// SSH key file should not be generated for hosts.
	_, err := os.Stat(s.keyFile())
	c.Assert(err, jc.Satisfies, os.IsNotExist)
	// Syslog port should not have been updated.
	cfg, err := s.State.EnvironConfig()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(cfg.SyslogPort(), gc.Not(gc.Equals), config.DefaultSyslogPort)
	// Add other checks as needed...
}

func (s *UpgradeSuite) createAgentAndStartUpgrade(c *gc.C, job state.MachineJob) (*MachineAgent, func()) {
	machine, _, _ := s.primeAgent(c, s.oldVersion, job)
	a := s.newAgent(c, machine)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	return a, func() { c.Check(a.Stop(), gc.IsNil) }
}

func (s *UpgradeSuite) assertEnvironAgentVersion(c *gc.C, expected version.Number) {
	envConfig, err := s.State.EnvironConfig()
	c.Assert(err, jc.ErrorIsNil)
	agentVersion, ok := envConfig.AgentVersion()
	c.Assert(ok, jc.IsTrue)
	c.Assert(agentVersion, gc.Equals, expected)
}

func waitForUpgradeToFinish(c *gc.C, conf agent.Config) {
	success := false
	for attempt := coretesting.LongAttempt.Start(); attempt.Next(); {
		diskConf := readConfigFromDisk(c, conf.DataDir(), conf.Tag())
		success = diskConf.UpgradedToVersion() == version.Current.Number
		if success {
			break
		}
	}
	c.Assert(success, jc.IsTrue)
}

func readConfigFromDisk(c *gc.C, dir string, tag names.Tag) agent.Config {
	conf, err := agent.ReadConfig(agent.ConfigPath(dir, tag))
	c.Assert(err, jc.ErrorIsNil)
	return conf
}

func (s *UpgradeSuite) checkLoginToAPIAsUser(c *gc.C, conf agent.Config, expectFullApi exposedAPI) {
	info := conf.APIInfo()
	info.Tag = s.AdminUserTag(c)
	info.Password = "dummy-secret"
	info.Nonce = ""

	apiState, err := api.Open(info, upgradeTestDialOpts)
	c.Assert(err, jc.ErrorIsNil)
	defer apiState.Close()

	// This call should always work.
	var result api.Status
	err = apiState.APICall("Client", 0, "", "FullStatus", nil, &result)
	c.Assert(err, jc.ErrorIsNil)

	// This call should only work if the API is not restricted.
	err = apiState.APICall("Client", 0, "", "DestroyEnvironment", nil, nil)
	if expectFullApi {
		c.Assert(err, jc.ErrorIsNil)
	} else {
		c.Assert(err, gc.ErrorMatches, "upgrade in progress .+")
	}
}

func canLoginToAPIAsMachine(c *gc.C, fromConf, toConf agent.Config) bool {
	info := fromConf.APIInfo()
	info.Addrs = toConf.APIInfo().Addrs
	apiState, err := api.Open(info, upgradeTestDialOpts)
	if apiState != nil {
		apiState.Close()
	}
	return apiState != nil && err == nil
}

var upgradeTestDialOpts = api.DialOpts{
	Timeout:             2 * time.Minute,
	RetryDelay:          250 * time.Millisecond,
	DialAddressInterval: 50 * time.Millisecond,
}

func assertUpgradeComplete(c *gc.C, context *upgradeWorkerContext) {
	select {
	case <-context.UpgradeComplete:
	default:
		c.Error("UpgradeComplete channel is open but shouldn't be")
	}
}

func assertUpgradeNotComplete(c *gc.C, context *upgradeWorkerContext) {
	select {
	case <-context.UpgradeComplete:
		c.Error("UpgradeComplete channel is closed but shouldn't be")
	default:
	}
}

// NewFakeConfigSetter returns a fakeConfigSetter which implements
// just enough of the agent.ConfigSetter interface to keep the upgrade
// steps worker happy.
func NewFakeConfigSetter(agentTag names.Tag, initialVersion version.Number) *fakeConfigSetter {
	return &fakeConfigSetter{
		AgentTag: agentTag,
		Version:  initialVersion,
	}
}

type fakeConfigSetter struct {
	agent.ConfigSetter
	AgentTag names.Tag
	Version  version.Number
}

func (s *fakeConfigSetter) Tag() names.Tag {
	return s.AgentTag
}

func (s *fakeConfigSetter) UpgradedToVersion() version.Number {
	return s.Version
}

func (s *fakeConfigSetter) SetUpgradedToVersion(newVersion version.Number) {
	s.Version = newVersion
}

// NewFakeUpgradingMachineAgent returns a fakeUpgradingMachineAgent which
// implements the upgradingMachineAgent interface. This provides enough
// MachineAgent functionality to support upgrades.
func NewFakeUpgradingMachineAgent(confSetter agent.ConfigSetter) *fakeUpgradingMachineAgent {
	return &fakeUpgradingMachineAgent{
		config:  confSetter,
		DyingCh: make(chan struct{}),
	}
}

type fakeUpgradingMachineAgent struct {
	config             agent.ConfigSetter
	DyingCh            chan struct{}
	MachineStatusCalls []MachineStatusCall
}

type MachineStatusCall struct {
	Status params.Status
	Info   string
}

func (a *fakeUpgradingMachineAgent) setMachineStatus(_ *api.State, status params.Status, info string) error {
	// Record setMachineStatus calls for later inspection.
	a.MachineStatusCalls = append(a.MachineStatusCalls, MachineStatusCall{status, info})
	return nil
}

func (a *fakeUpgradingMachineAgent) ensureMongoServer(agent.Config) error {
	return nil
}

func (a *fakeUpgradingMachineAgent) CurrentConfig() agent.Config {
	return a.config
}

func (a *fakeUpgradingMachineAgent) ChangeConfig(mutate AgentConfigMutator) error {
	return mutate(a.config)
}

func (a *fakeUpgradingMachineAgent) Dying() <-chan struct{} {
	return a.DyingCh
}