// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package agent

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	jc "github.com/juju/testing/checkers"
	"github.com/juju/utils"
	pacman "github.com/juju/utils/packaging/manager"
	gc "gopkg.in/check.v1"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	"github.com/juju/juju/apiserver/params"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/constraints"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/environs/config"
	envtesting "github.com/juju/juju/environs/testing"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/state/watcher"
	coretesting "github.com/juju/juju/testing"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/upgrader"
)

// UpgradeSuite exercises the machine agent's upgrade-steps worker,
// both in isolation (with fake agents/configs) and functionally
// against real machine agents.
type UpgradeSuite struct {
	commonMachineSuite

	aptCmds         []*exec.Cmd    // package-install commands captured during upgrades; guarded by aptMutex
	oldVersion      version.Binary // version the agent appears to be upgrading from
	logWriter       loggo.TestWriter
	connectionDead  bool // when true, the patched ConnectionIsDead reports the API connection as dead
	machineIsMaster bool // value returned by the patched isMachineMaster
	aptMutex        sync.Mutex
}

var _ = gc.Suite(&UpgradeSuite{})

type exposedAPI bool

var (
	FullAPIExposed       exposedAPI = true
	RestrictedAPIExposed exposedAPI = false
)

// Readability aliases for the expectFail arguments of the
// makeExpectedStatusCalls/makeExpectedUpgradeLogs helpers.
const fails = true
const succeeds = false

// setAptCmds records a captured package command, or resets the
// captured list when cmd is nil. Safe for concurrent use.
func (s *UpgradeSuite) setAptCmds(cmd *exec.Cmd) {
	s.aptMutex.Lock()
	defer s.aptMutex.Unlock()
	if cmd == nil {
		s.aptCmds = nil
	} else {
		s.aptCmds = append(s.aptCmds, cmd)
	}
}

// getInstallCmds returns the package commands captured so far.
func (s *UpgradeSuite) getInstallCmds() []*exec.Cmd {
	s.aptMutex.Lock()
	defer s.aptMutex.Unlock()
	return s.aptCmds
}

func (s *UpgradeSuite) SetUpTest(c *gc.C) {
	s.commonMachineSuite.SetUpTest(c)

	// clear s.aptCmds
	s.setAptCmds(nil)

	// Capture all apt commands.
	aptCmds := s.AgentSuite.HookCommandOutput(&pacman.CommandOutput, nil, nil)
	go func() {
		for cmd := range aptCmds {
			s.setAptCmds(cmd)
		}
	}()

	// Pretend the agent was previously running 1.16.x tools.
	s.oldVersion = version.Current
	s.oldVersion.Major = 1
	s.oldVersion.Minor = 16

	// Don't wait so long in tests.
	s.PatchValue(&upgradeStartTimeoutMaster, time.Duration(time.Millisecond*50))
	s.PatchValue(&upgradeStartTimeoutSecondary, time.Duration(time.Millisecond*60))

	// Allow tests to make the API connection appear to be dead.
	s.connectionDead = false
	s.PatchValue(&cmdutil.ConnectionIsDead, func(loggo.Logger, cmdutil.Pinger) bool {
		return s.connectionDead
	})

	var fakeOpenStateForUpgrade = func(upgradingMachineAgent, agent.Config) (*state.State, error) {
		mongoInfo := s.State.MongoConnectionInfo()
		st, err := state.Open(s.State.EnvironTag(), mongoInfo, mongo.DefaultDialOpts(), environs.NewStatePolicy())
		c.Assert(err, jc.ErrorIsNil)
		return st, nil
	}
	s.PatchValue(&openStateForUpgrade, fakeOpenStateForUpgrade)

	s.machineIsMaster = true
	fakeIsMachineMaster := func(*state.State, string) (bool, error) {
		return s.machineIsMaster, nil
	}
	s.PatchValue(&isMachineMaster, fakeIsMachineMaster)
	// Most of these tests normally finish sub-second on a fast machine.
	// If any given test hits a minute, we have almost certainly become
	// wedged, so dump the logs.
	coretesting.DumpTestLogsAfter(time.Minute, c, s)
}

// captureLogs registers a log writer that records INFO-and-above
// messages for later matching in the tests.
func (s *UpgradeSuite) captureLogs(c *gc.C) {
	c.Assert(loggo.RegisterWriter("upgrade-tests", &s.logWriter, loggo.INFO), gc.IsNil)
	s.AddCleanup(func(*gc.C) {
		loggo.RemoveWriter("upgrade-tests")
		s.logWriter.Clear()
	})
}

// countUpgradeAttempts patches the upgrade-step runner to count calls
// and return upgradeErr; the returned pointer tracks the call count.
func (s *UpgradeSuite) countUpgradeAttempts(upgradeErr error) *int {
	count := 0
	s.PatchValue(&upgradesPerformUpgrade, func(version.Number, []upgrades.Target, upgrades.Context) error {
		count++
		return upgradeErr
	})
	return &count
}

func (s *UpgradeSuite) TestContextInitializeWhenNoUpgradeRequired(c *gc.C) {
	// Set the agent's initial upgradedToVersion to almost the same as
	// the current version. We want it to be different to
	// version.Current (so that we can see it change) but not to
	// trigger upgrade steps.
	config := NewFakeConfigSetter(names.NewMachineTag("0"), makeBumpedCurrentVersion().Number)
	agent := NewFakeUpgradingMachineAgent(config)

	context := NewUpgradeWorkerContext()
	context.InitializeUsingAgent(agent)

	select {
	case <-context.UpgradeComplete:
		// Success
	default:
		c.Fatal("UpgradeComplete channel should be closed because no upgrade is required")
	}
	// The agent's version should have been updated.
	c.Assert(config.Version, gc.Equals, version.Current.Number)

}

func (s *UpgradeSuite) TestContextInitializeWhenUpgradeRequired(c *gc.C) {
	// Set the agent's upgradedToVersion so that upgrade steps are required.
	initialVersion := version.MustParse("1.16.0")
	config := NewFakeConfigSetter(names.NewMachineTag("0"), initialVersion)
	agent := NewFakeUpgradingMachineAgent(config)

	context := NewUpgradeWorkerContext()
	context.InitializeUsingAgent(agent)

	select {
	case <-context.UpgradeComplete:
		c.Fatal("UpgradeComplete channel shouldn't be closed because upgrade is required")
	default:
		// Success
	}
	// The agent's version should NOT have been updated.
	c.Assert(config.Version, gc.Equals, initialVersion)
}

func (s *UpgradeSuite) TestRetryStrategy(c *gc.C) {
	retries := getUpgradeRetryStrategy()
	c.Assert(retries.Delay, gc.Equals, 2*time.Minute)
	c.Assert(retries.Min, gc.Equals, 5)
}

func (s *UpgradeSuite) TestIsUpgradeRunning(c *gc.C) {
	context := NewUpgradeWorkerContext()
	c.Assert(context.IsUpgradeRunning(), jc.IsTrue)

	close(context.UpgradeComplete)
	c.Assert(context.IsUpgradeRunning(), jc.IsFalse)
}

func (s *UpgradeSuite) TestNoUpgradeNecessary(c *gc.C) {
	attemptsP := s.countUpgradeAttempts(nil)
	s.captureLogs(c)
	s.oldVersion = version.Current // nothing to do

	workerErr, config, _, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, version.Current.Number)
	assertUpgradeComplete(c, context)
}

func (s *UpgradeSuite) TestUpgradeStepsFailure(c *gc.C) {
	// This test checks what happens when every upgrade attempt fails.
	// A number of retries should be observed and the agent should end
	// up in a state where it is still running but is reporting an
	// error and the upgrade is not flagged as having completed (which
	// prevents most of the agent's workers from running and keeps the
	// API in restricted mode).

	attemptsP := s.countUpgradeAttempts(errors.New("boom"))
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	// The worker shouldn't return an error so that the worker and
	// agent keep running.
	c.Check(workerErr, gc.IsNil)

	c.Check(*attemptsP, gc.Equals, maxUpgradeRetries)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't finish
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals,
		s.makeExpectedStatusCalls(maxUpgradeRetries-1, fails, "boom"))
	c.Assert(s.logWriter.Log(), jc.LogMatches,
		s.makeExpectedUpgradeLogs(maxUpgradeRetries-1, "hostMachine", fails, "boom"))
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestUpgradeStepsRetries(c *gc.C) {
	// This test checks what happens when the first upgrade attempt
	// fails but the following one succeeds. The final state should be
	// the same as a successful upgrade which worked first go.
	attempts := 0
	fail := true
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		attempts++
		if fail {
			fail = false
			return errors.New("boom")
		} else {
			return nil
		}
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.IsNil)
	c.Check(attempts, gc.Equals, 2)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade finished
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, s.makeExpectedStatusCalls(1, succeeds, "boom"))
	c.Assert(s.logWriter.Log(), jc.LogMatches, s.makeExpectedUpgradeLogs(1, "hostMachine", succeeds, "boom"))
	assertUpgradeComplete(c, context)
}

func (s *UpgradeSuite) TestOtherUpgradeRunFailure(c *gc.C) {
	// This test checks what happens when something other than the
	// upgrade steps themselves fails, ensuring the failure is logged
	// and the agent status is updated.

	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Delete UpgradeInfo for the upgrade so that finaliseUpgrade() will fail
		s.State.ClearUpgradeInfo()
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)
	s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade almost finished
	failReason := `upgrade done but: cannot set upgrade status to "finishing": ` +
		`Another status change may have occurred concurrently`
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals,
		s.makeExpectedStatusCalls(0, fails, failReason))
	c.Assert(s.logWriter.Log(), jc.LogMatches,
		s.makeExpectedUpgradeLogs(0, "databaseMaster", fails, failReason))
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestApiConnectionFailure(c *gc.C) {
	// This test checks what happens when an upgrade fails because the
	// connection to mongo has gone away. This will happen when the
	// mongo master changes. In this case we want the upgrade worker
	// to return immediately without further retries. The error should
	// be returned by the worker so that the agent will restart.

	attemptsP := s.countUpgradeAttempts(errors.New("boom"))
	s.connectionDead = true // Make the connection to state appear to be dead
	s.captureLogs(c)

	workerErr, config, _, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.ErrorMatches, "API connection lost during upgrade: boom")
	c.Check(*attemptsP, gc.Equals, 1)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't finish
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestAbortWhenOtherStateServerDoesntStartUpgrade(c *gc.C) {
	// This test checks when a state server is upgrading and one of
	// the other state servers doesn't signal it is ready in time.

	// The master state server in this scenario is functionally tested
	// elsewhere in this suite.
	s.machineIsMaster = false

	s.createUpgradingStateServers(c)
	s.captureLogs(c)
	attemptsP := s.countUpgradeAttempts(nil)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't happen
	assertUpgradeNotComplete(c, context)

	// The environment agent-version should still be the new version.
	// It's up to the master to trigger the rollback.
	s.assertEnvironAgentVersion(c, version.Current.Number)

	causeMsg := " timed out after 60ms"
	c.Assert(s.logWriter.Log(), jc.LogMatches, []jc.SimpleMessage{
		{loggo.INFO, "waiting for other state servers to be ready for upgrade"},
		{loggo.ERROR, "aborted wait for other state servers: timed out after 60ms"},
		{loggo.ERROR, `upgrade from .+ to .+ for "machine-0" failed \(giving up\): ` +
			"aborted wait for other state servers:" + causeMsg},
	})
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, []MachineStatusCall{{
		params.StatusError,
		fmt.Sprintf(
			"upgrade to %s failed (giving up): aborted wait for other state servers:"+causeMsg,
			version.Current.Number),
	}})
}

func (s *UpgradeSuite) TestWorkerAbortsIfAgentDies(c *gc.C) {
	s.machineIsMaster = false
	s.captureLogs(c)
	attemptsP := s.countUpgradeAttempts(nil)

	s.primeAgent(c, s.oldVersion, state.JobManageEnviron)

	config := s.makeFakeConfig()
	agent := NewFakeUpgradingMachineAgent(config)
	close(agent.DyingCh)
	workerErr, context := s.runUpgradeWorkerUsingAgent(c, agent, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't happen
	assertUpgradeNotComplete(c, context)
	c.Assert(s.logWriter.Log(), jc.LogMatches, []jc.SimpleMessage{
		{loggo.WARNING, "stopped waiting for other state servers: machine agent is terminating"},
	})
}

func (s *UpgradeSuite) TestSuccessMaster(c *gc.C) {
	// This test checks what happens when an upgrade works on the
	// first attempt on a master state server.
	s.machineIsMaster = true
	info := s.checkSuccess(c, "databaseMaster", func(*state.UpgradeInfo) {})
	c.Assert(info.Status(), gc.Equals, state.UpgradeFinishing)
}

func (s *UpgradeSuite) TestSuccessSecondary(c *gc.C) {
	// This test checks what happens when an upgrade works on the
	// first attempt on a secondary state server.
	s.machineIsMaster = false
	mungeInfo := func(info *state.UpgradeInfo) {
		// Indicate that the master is done
		err := info.SetStatus(state.UpgradeRunning)
		c.Assert(err, jc.ErrorIsNil)
		err = info.SetStatus(state.UpgradeFinishing)
		c.Assert(err, jc.ErrorIsNil)
	}
	s.checkSuccess(c, "stateServer", mungeInfo)
}

// checkSuccess runs the upgrade worker on a 3-state-server
// environment (after mungeInfo has adjusted the shared UpgradeInfo)
// and asserts that the upgrade completes successfully on machine 0.
func (s *UpgradeSuite) checkSuccess(c *gc.C, target string, mungeInfo func(*state.UpgradeInfo)) *state.UpgradeInfo {
	_, machineIdB, machineIdC := s.createUpgradingStateServers(c)

	// Indicate that machine B and C are ready to upgrade
	vPrevious := s.oldVersion.Number
	vNext := version.Current.Number
	info, err := s.State.EnsureUpgradeInfo(machineIdB, vPrevious, vNext)
	c.Assert(err, jc.ErrorIsNil)
	_, err = s.State.EnsureUpgradeInfo(machineIdC, vPrevious, vNext)
	c.Assert(err, jc.ErrorIsNil)

	mungeInfo(info)

	attemptsP := s.countUpgradeAttempts(nil)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 1)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade finished
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, s.makeExpectedStatusCalls(0, succeeds, ""))
	c.Assert(s.logWriter.Log(), jc.LogMatches, s.makeExpectedUpgradeLogs(0, target, succeeds, ""))
	assertUpgradeComplete(c, context)

	err = info.Refresh()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(info.StateServersDone(), jc.DeepEquals, []string{"0"})
	return info
}

func (s *UpgradeSuite) TestJobsToTargets(c *gc.C) {
	check := func(jobs []multiwatcher.MachineJob, isMaster bool, expectedTargets ...upgrades.Target) {
		c.Assert(jobsToTargets(jobs, isMaster), jc.SameContents, expectedTargets)
	}

	check([]multiwatcher.MachineJob{multiwatcher.JobHostUnits}, false, upgrades.HostMachine)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron}, false, upgrades.StateServer)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron}, true,
		upgrades.StateServer, upgrades.DatabaseMaster)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron, multiwatcher.JobHostUnits}, false,
		upgrades.StateServer, upgrades.HostMachine)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron, multiwatcher.JobHostUnits}, true,
		upgrades.StateServer, upgrades.DatabaseMaster, upgrades.HostMachine)
}

func (s *UpgradeSuite) TestUpgradeStepsStateServer(c *gc.C) {
	coretesting.SkipIfI386(c, "lp:1444576")
	coretesting.SkipIfPPC64EL(c, "lp:1444576")
	coretesting.SkipIfWindowsBug(c, "lp:1446885")
	s.setInstantRetryStrategy(c)
	// Upload tools to provider storage, so they can be migrated to environment storage.
	stor, err := environs.LegacyStorage(s.State)
	if !errors.IsNotSupported(err) {
		c.Assert(err, jc.ErrorIsNil)
		envtesting.AssertUploadFakeToolsVersions(
			c, stor, "releases", s.Environ.Config().AgentStream(), s.oldVersion)
	}
	s.assertUpgradeSteps(c, state.JobManageEnviron)
	s.assertStateServerUpgrades(c)
}

func (s *UpgradeSuite) TestUpgradeStepsHostMachine(c *gc.C) {
	coretesting.SkipIfPPC64EL(c, "lp:1444576")
	coretesting.SkipIfWindowsBug(c, "lp:1446885")
	s.setInstantRetryStrategy(c)
	// We need to first start up a state server that thinks it has already been upgraded.
	ss, _, _ := s.primeAgent(c, version.Current, state.JobManageEnviron)
	a := s.newAgent(c, ss)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	defer func() { c.Check(a.Stop(), gc.IsNil) }()
	// Now run the test.
	s.assertUpgradeSteps(c, state.JobHostUnits)
	s.assertHostUpgrades(c)
}

func (s *UpgradeSuite) TestLoginsDuringUpgrade(c *gc.C) {
	// Create machine agent to upgrade
	machine, machine0Conf, _ := s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	a := s.newAgent(c, machine)

	// Mock out upgrade logic, using a channel so that the test knows
	// when upgrades have started and can control when upgrades
	// should finish.
	upgradeCh := make(chan bool)
	abort := make(chan bool)
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Signal that upgrade has started.
		select {
		case upgradeCh <- true:
		case <-abort:
			return nil
		}

		// Wait for signal that upgrades should finish.
		select {
		case <-upgradeCh:
		case <-abort:
			return nil
		}
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)

	// Start the API server and upgrade-steps works just as the agent would.
	runner := worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant)
	defer func() {
		close(abort)
		runner.Kill()
		runner.Wait()
	}()
	certChangedChan := make(chan params.StateServingInfo)
	runner.StartWorker("apiserver", a.apiserverWorkerStarter(s.State, certChangedChan))
	runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(
		s.APIState,
		[]multiwatcher.MachineJob{multiwatcher.JobManageEnviron},
	))

	// Set up a second machine to log in as.
	// API logins are tested manually so there's no need to actually
	// start this machine.
	var machine1Conf agent.Config
	_, machine1Conf, _ = s.primeAgent(c, version.Current, state.JobHostUnits)

	c.Assert(waitForUpgradeToStart(upgradeCh), jc.IsTrue)

	// Only user and local logins are allowed during upgrade. Users get a restricted API.
	s.checkLoginToAPIAsUser(c, machine0Conf, RestrictedAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsFalse)

	close(upgradeCh) // Allow upgrade to complete

	waitForUpgradeToFinish(c, machine0Conf)

	// Only user and local logins are allowed even after upgrade steps because
	// agent upgrade not finished yet.
	s.checkLoginToAPIAsUser(c, machine0Conf, RestrictedAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsFalse)

	machineAPI := s.OpenAPIAsMachine(c, machine.Tag(), initialMachinePassword, agent.BootstrapNonce)
	runner.StartWorker("upgrader", a.agentUpgraderWorkerStarter(machineAPI.Upgrader(), machine0Conf))
	// Wait for agent upgrade worker to determine that no
	// agent upgrades are required.
	select {
	case <-a.initialAgentUpgradeCheckComplete:
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timeout waiting for upgrade check")
	}

	// All logins are allowed after upgrade
	s.checkLoginToAPIAsUser(c, machine0Conf, FullAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsTrue)
}

func (s *UpgradeSuite) TestUpgradeSkippedIfNoUpgradeRequired(c *gc.C) {
	attempts := 0
	upgradeCh := make(chan bool)
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Note: this shouldn't run.
		attempts++
		// If execution ends up here, wait so it can be detected (by
		// checking for restricted API).
		<-upgradeCh
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)

	// Set up machine agent running the current version.
	//
	// Set the agent's initial upgradedToVersion to be almost the same
	// as version.Current but not quite. We want it to be different to
	// version.Current (so that we can see it change) but not to
	// trigger upgrade steps.
	initialVersion := makeBumpedCurrentVersion()
	machine, agentConf, _ := s.primeAgent(c, initialVersion, state.JobManageEnviron)
	a := s.newAgent(c, machine)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	defer func() {
		close(upgradeCh)
		c.Check(a.Stop(), gc.IsNil)
	}()

	// Test that unrestricted API logins are possible (i.e. no
	// "upgrade mode" in force)
	s.checkLoginToAPIAsUser(c, agentConf, FullAPIExposed)
	c.Assert(attempts, gc.Equals, 0) // There should have been no attempt to upgrade.

	// Even though no upgrade was done upgradedToVersion should have been updated.
	c.Assert(a.CurrentConfig().UpgradedToVersion(), gc.Equals, version.Current.Number)
}

func (s *UpgradeSuite) TestDowngradeOnMasterWhenOtherStateServerDoesntStartUpgrade(c *gc.C) {
	coretesting.SkipIfWindowsBug(c, "lp:1446885")
	// This test checks that the master triggers a downgrade if one of
	// the other state server fails to signal it is ready for upgrade.
	//
	// This test is functional, ensuring that the upgrader worker
	// terminates the machine agent with the UpgradeReadyError which
	// makes the downgrade happen.

	// Speed up the watcher frequency to make the test much faster.
	s.PatchValue(&watcher.Period, 200*time.Millisecond)

	// Provide (fake) tools so that the upgrader has something to downgrade to.
	envtesting.AssertUploadFakeToolsVersions(
		c, s.DefaultToolsStorage, s.Environ.Config().AgentStream(), s.Environ.Config().AgentStream(), s.oldVersion)

	// Only the first machine is going to be ready for upgrade.
	machineIdA, machineIdB, _ := s.createUpgradingStateServers(c)

	// One of the other state servers is ready for upgrade (but machine C doesn't).
	info, err := s.State.EnsureUpgradeInfo(machineIdB, s.oldVersion.Number, version.Current.Number)
	c.Assert(err, jc.ErrorIsNil)

	agent := s.newAgentFromMachineId(c, machineIdA)
	defer agent.Stop()

	s.machineIsMaster = true

	var agentErr error
	agentDone := make(chan bool)
	go func() {
		agentErr = agent.Run(nil)
		close(agentDone)
	}()

	select {
	case <-agentDone:
		upgradeReadyErr, ok := agentErr.(*upgrader.UpgradeReadyError)
		if !ok {
			c.Fatalf("didn't see UpgradeReadyError, instead got: %v", agentErr)
		}
		// Confirm that the downgrade is back to the previous version.
		c.Assert(upgradeReadyErr.OldTools, gc.Equals, version.Current)
		c.Assert(upgradeReadyErr.NewTools, gc.Equals, s.oldVersion)

	case <-time.After(coretesting.LongWait):
		c.Fatal("machine agent did not exit as expected")
	}

	// UpgradeInfo doc should now be archived.
	err = info.Refresh()
	c.Assert(err, gc.ErrorMatches, "current upgrade info not found")
}

// Run just the upgrade-steps worker with a fake machine agent and
// fake agent config.
func (s *UpgradeSuite) runUpgradeWorker(c *gc.C, jobs ...multiwatcher.MachineJob) (
	error, *fakeConfigSetter, *fakeUpgradingMachineAgent, *upgradeWorkerContext,
) {
	config := s.makeFakeConfig()
	agent := NewFakeUpgradingMachineAgent(config)
	err, context := s.runUpgradeWorkerUsingAgent(c, agent, jobs...)
	return err, config, agent, context
}

// Run just the upgrade-steps worker with the fake machine agent
// provided.
655 func (s *UpgradeSuite) runUpgradeWorkerUsingAgent( 656 c *gc.C, 657 agent *fakeUpgradingMachineAgent, 658 jobs ...multiwatcher.MachineJob, 659 ) (error, *upgradeWorkerContext) { 660 s.setInstantRetryStrategy(c) 661 context := NewUpgradeWorkerContext() 662 worker := context.Worker(agent, nil, jobs) 663 return worker.Wait(), context 664 } 665 666 func (s *UpgradeSuite) makeFakeConfig() *fakeConfigSetter { 667 return NewFakeConfigSetter(names.NewMachineTag("0"), s.oldVersion.Number) 668 } 669 670 // Create 3 configured state servers that appear to be running tools 671 // with version s.oldVersion and return their ids. 672 func (s *UpgradeSuite) createUpgradingStateServers(c *gc.C) (machineIdA, machineIdB, machineIdC string) { 673 machine0, _, _ := s.primeAgent(c, s.oldVersion, state.JobManageEnviron) 674 machineIdA = machine0.Id() 675 676 changes, err := s.State.EnsureAvailability(3, constraints.Value{}, "quantal", nil) 677 c.Assert(err, jc.ErrorIsNil) 678 c.Assert(len(changes.Added), gc.Equals, 2) 679 machineIdB = changes.Added[0] 680 s.configureMachine(c, machineIdB, s.oldVersion) 681 machineIdC = changes.Added[1] 682 s.configureMachine(c, machineIdC, s.oldVersion) 683 684 return 685 } 686 687 func (s *UpgradeSuite) newAgentFromMachineId(c *gc.C, machineId string) *MachineAgent { 688 machine, err := s.State.Machine(machineId) 689 c.Assert(err, jc.ErrorIsNil) 690 return s.newAgent(c, machine) 691 } 692 693 // Return a version the same as the current software version, but with 694 // the build number bumped. 695 // 696 // The version Tag is also cleared so that upgrades.PerformUpgrade 697 // doesn't think it needs to run upgrade steps unnecessarily. 
698 func makeBumpedCurrentVersion() version.Binary { 699 v := version.Current 700 v.Build++ 701 v.Tag = "" 702 return v 703 } 704 705 func waitForUpgradeToStart(upgradeCh chan bool) bool { 706 select { 707 case <-upgradeCh: 708 return true 709 case <-time.After(coretesting.LongWait): 710 return false 711 } 712 } 713 714 const maxUpgradeRetries = 3 715 716 func (s *UpgradeSuite) setInstantRetryStrategy(c *gc.C) { 717 s.PatchValue(&getUpgradeRetryStrategy, func() utils.AttemptStrategy { 718 c.Logf("setting instant retry strategy for upgrade: retries=%d", maxUpgradeRetries) 719 return utils.AttemptStrategy{ 720 Delay: 0, 721 Min: maxUpgradeRetries, 722 } 723 }) 724 } 725 726 func (s *UpgradeSuite) makeExpectedStatusCalls(retryCount int, expectFail bool, failReason string) []MachineStatusCall { 727 calls := []MachineStatusCall{{ 728 params.StatusStarted, 729 fmt.Sprintf("upgrading to %s", version.Current.Number), 730 }} 731 for i := 0; i < retryCount; i++ { 732 calls = append(calls, MachineStatusCall{ 733 params.StatusError, 734 fmt.Sprintf("upgrade to %s failed (will retry): %s", version.Current.Number, failReason), 735 }) 736 } 737 if expectFail { 738 calls = append(calls, MachineStatusCall{ 739 params.StatusError, 740 fmt.Sprintf("upgrade to %s failed (giving up): %s", version.Current.Number, failReason), 741 }) 742 } else { 743 calls = append(calls, MachineStatusCall{params.StatusStarted, ""}) 744 } 745 return calls 746 } 747 748 func (s *UpgradeSuite) makeExpectedUpgradeLogs( 749 retryCount int, 750 target string, 751 expectFail bool, 752 failReason string, 753 ) []jc.SimpleMessage { 754 outLogs := []jc.SimpleMessage{} 755 756 if target == "databaseMaster" || target == "stateServer" { 757 outLogs = append(outLogs, jc.SimpleMessage{ 758 loggo.INFO, "waiting for other state servers to be ready for upgrade", 759 }) 760 var waitMsg string 761 switch target { 762 case "databaseMaster": 763 waitMsg = "all state servers are ready to run upgrade steps" 764 case 
"stateServer": 765 waitMsg = "the master has completed its upgrade steps" 766 } 767 outLogs = append(outLogs, jc.SimpleMessage{loggo.INFO, "finished waiting - " + waitMsg}) 768 } 769 770 outLogs = append(outLogs, jc.SimpleMessage{ 771 loggo.INFO, fmt.Sprintf( 772 `starting upgrade from %s to %s for "machine-0"`, 773 s.oldVersion.Number, version.Current.Number), 774 }) 775 776 failMessage := fmt.Sprintf( 777 `upgrade from %s to %s for "machine-0" failed \(%%s\): %s`, 778 s.oldVersion.Number, version.Current.Number, failReason) 779 780 for i := 0; i < retryCount; i++ { 781 outLogs = append(outLogs, jc.SimpleMessage{loggo.ERROR, fmt.Sprintf(failMessage, "will retry")}) 782 } 783 if expectFail { 784 outLogs = append(outLogs, jc.SimpleMessage{loggo.ERROR, fmt.Sprintf(failMessage, "giving up")}) 785 } else { 786 outLogs = append(outLogs, jc.SimpleMessage{loggo.INFO, 787 fmt.Sprintf(`upgrade to %s completed successfully.`, version.Current.Number)}) 788 } 789 return outLogs 790 } 791 792 func (s *UpgradeSuite) assertUpgradeSteps(c *gc.C, job state.MachineJob) { 793 agent, stopFunc := s.createAgentAndStartUpgrade(c, job) 794 defer stopFunc() 795 waitForUpgradeToFinish(c, agent.CurrentConfig()) 796 } 797 798 func (s *UpgradeSuite) keyFile() string { 799 return filepath.Join(s.DataDir(), "system-identity") 800 } 801 802 func (s *UpgradeSuite) assertCommonUpgrades(c *gc.C) { 803 // rsyslog-gnutls should have been installed. 
804 cmds := s.getInstallCmds() 805 c.Assert(cmds, gc.HasLen, 1) 806 args := cmds[0].Args 807 c.Assert(len(args), jc.GreaterThan, 1) 808 809 pm, err := coretesting.GetPackageManager() 810 c.Assert(err, jc.ErrorIsNil) 811 812 c.Assert(args[0], gc.Equals, pm.PackageManager) 813 814 c.Assert(args[len(args)-1], gc.Equals, "rsyslog-gnutls") 815 } 816 817 func (s *UpgradeSuite) assertStateServerUpgrades(c *gc.C) { 818 s.assertCommonUpgrades(c) 819 // System SSH key 820 c.Assert(s.keyFile(), jc.IsNonEmptyFile) 821 // Syslog port should have been updated 822 cfg, err := s.State.EnvironConfig() 823 c.Assert(err, jc.ErrorIsNil) 824 c.Assert(cfg.SyslogPort(), gc.Equals, config.DefaultSyslogPort) 825 // Deprecated attributes should have been deleted - just test a couple. 826 allAttrs := cfg.AllAttrs() 827 _, ok := allAttrs["public-bucket"] 828 c.Assert(ok, jc.IsFalse) 829 _, ok = allAttrs["public-bucket-region"] 830 c.Assert(ok, jc.IsFalse) 831 } 832 833 func (s *UpgradeSuite) assertHostUpgrades(c *gc.C) { 834 s.assertCommonUpgrades(c) 835 // Lock directory 836 // TODO(bogdanteleaga): Fix this on windows. Currently a bash script is 837 // used to create the directory which partially works on windows 8 but 838 // doesn't work on windows server. 839 lockdir := filepath.Join(s.DataDir(), "locks") 840 c.Assert(lockdir, jc.IsDirectory) 841 // SSH key file should not be generated for hosts. 842 _, err := os.Stat(s.keyFile()) 843 c.Assert(err, jc.Satisfies, os.IsNotExist) 844 // Syslog port should not have been updated 845 cfg, err := s.State.EnvironConfig() 846 c.Assert(err, jc.ErrorIsNil) 847 c.Assert(cfg.SyslogPort(), gc.Not(gc.Equals), config.DefaultSyslogPort) 848 // Add other checks as needed... 
}

// createAgentAndStartUpgrade primes a machine at s.oldVersion with the
// given job, starts its agent running in a background goroutine, and
// returns the agent together with a function that stops it.
func (s *UpgradeSuite) createAgentAndStartUpgrade(c *gc.C, job state.MachineJob) (*MachineAgent, func()) {
	machine, _, _ := s.primeAgent(c, s.oldVersion, job)
	a := s.newAgent(c, machine)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	return a, func() { c.Check(a.Stop(), gc.IsNil) }
}

// assertEnvironAgentVersion checks that the environment config records
// the expected agent version.
func (s *UpgradeSuite) assertEnvironAgentVersion(c *gc.C, expected version.Number) {
	envConfig, err := s.State.EnvironConfig()
	c.Assert(err, jc.ErrorIsNil)
	agentVersion, ok := envConfig.AgentVersion()
	c.Assert(ok, jc.IsTrue)
	c.Assert(agentVersion, gc.Equals, expected)
}

// waitForUpgradeToFinish polls the agent's on-disk config until its
// UpgradedToVersion matches the current version, failing the test if
// that doesn't happen within coretesting.LongAttempt.
func waitForUpgradeToFinish(c *gc.C, conf agent.Config) {
	success := false
	for attempt := coretesting.LongAttempt.Start(); attempt.Next(); {
		diskConf := readConfigFromDisk(c, conf.DataDir(), conf.Tag())
		success = diskConf.UpgradedToVersion() == version.Current.Number
		if success {
			break
		}
	}
	c.Assert(success, jc.IsTrue)
}

// readConfigFromDisk loads the agent config for the given tag from the
// data directory.
func readConfigFromDisk(c *gc.C, dir string, tag names.Tag) agent.Config {
	conf, err := agent.ReadConfig(agent.ConfigPath(dir, tag))
	c.Assert(err, jc.ErrorIsNil)
	return conf
}

// checkLoginToAPIAsUser verifies that a user login to the API sees
// either the full or the upgrade-restricted API, as expected.
func (s *UpgradeSuite) checkLoginToAPIAsUser(c *gc.C, conf agent.Config, expectFullApi exposedAPI) {
	var err error
	// Multiple attempts may be necessary because there is a small gap
	// between the post-upgrade version being written to the agent's
	// config (as observed by waitForUpgradeToFinish) and the end of
	// "upgrade mode" (i.e. when the agent's UpgradeComplete channel
	// is closed). Without this tests that call checkLoginToAPIAsUser
	// can occasionally fail.
	for a := coretesting.LongAttempt.Start(); a.Next(); {
		err = s.attemptRestrictedAPIAsUser(c, conf)
		switch expectFullApi {
		case FullAPIExposed:
			if err == nil {
				return
			}
		case RestrictedAPIExposed:
			if err != nil && strings.HasPrefix(err.Error(), "upgrade in progress") {
				return
			}
		}
	}
	c.Fatalf("timed out waiting for expected API behaviour. last error was: %v", err)
}

// attemptRestrictedAPIAsUser logs in to the API as the admin user,
// asserts that FullStatus (always permitted) works, and returns the
// error from WatchAll, which only succeeds when the API is not
// restricted by upgrade mode.
func (s *UpgradeSuite) attemptRestrictedAPIAsUser(c *gc.C, conf agent.Config) error {
	info := conf.APIInfo()
	info.Tag = s.AdminUserTag(c)
	info.Password = "dummy-secret"
	info.Nonce = ""

	apiState, err := api.Open(info, upgradeTestDialOpts)
	c.Assert(err, jc.ErrorIsNil)
	defer apiState.Close()

	// this call should always work
	var result params.FullStatus
	err = apiState.APICall("Client", 0, "", "FullStatus", nil, &result)
	c.Assert(err, jc.ErrorIsNil)

	// this call should only work if API is not restricted
	return apiState.APICall("Client", 0, "", "WatchAll", nil, nil)
}

// canLoginToAPIAsMachine reports whether the machine credentials in
// fromConf can open an API connection to the addresses in toConf.
func canLoginToAPIAsMachine(c *gc.C, fromConf, toConf agent.Config) bool {
	info := fromConf.APIInfo()
	info.Addrs = toConf.APIInfo().Addrs
	apiState, err := api.Open(info, upgradeTestDialOpts)
	if apiState != nil {
		apiState.Close()
	}
	return apiState != nil && err == nil
}

// upgradeTestDialOpts are the API dial options used by these tests:
// short retry/address intervals with an overall two minute timeout.
var upgradeTestDialOpts = api.DialOpts{
	Timeout:             2 * time.Minute,
	RetryDelay:          250 * time.Millisecond,
	DialAddressInterval: 50 * time.Millisecond,
}

// assertUpgradeComplete fails the test unless the context's
// UpgradeComplete channel has been closed (a closed channel is
// immediately ready in a select).
func assertUpgradeComplete(c *gc.C, context *upgradeWorkerContext) {
	select {
	case <-context.UpgradeComplete:
	default:
		c.Error("UpgradeComplete channel is open but shouldn't be")
	}
}

// assertUpgradeNotComplete fails the test if the context's
// UpgradeComplete channel has already been closed.
func assertUpgradeNotComplete(c *gc.C, context *upgradeWorkerContext) {
	select {
	case <-context.UpgradeComplete:
		c.Error("UpgradeComplete channel is closed but shouldn't be")
	default:
	}
}

// NewFakeConfigSetter returns a fakeConfigSetter which implements
// just enough of the agent.ConfigSetter interface to keep the upgrade
// steps worker happy.
func NewFakeConfigSetter(agentTag names.Tag, initialVersion version.Number) *fakeConfigSetter {
	return &fakeConfigSetter{
		AgentTag: agentTag,
		Version:  initialVersion,
	}
}

// fakeConfigSetter stubs agent.ConfigSetter, tracking only the agent
// tag and the upgraded-to version. All other ConfigSetter methods come
// from the embedded (nil) interface and will panic if called.
type fakeConfigSetter struct {
	agent.ConfigSetter
	AgentTag names.Tag
	Version  version.Number
}

// Tag returns the configured agent tag.
func (s *fakeConfigSetter) Tag() names.Tag {
	return s.AgentTag
}

// UpgradedToVersion returns the currently recorded version.
func (s *fakeConfigSetter) UpgradedToVersion() version.Number {
	return s.Version
}

// SetUpgradedToVersion records newVersion for later inspection.
func (s *fakeConfigSetter) SetUpgradedToVersion(newVersion version.Number) {
	s.Version = newVersion
}

// NewFakeUpgradingMachineAgent returns a fakeUpgradingMachineAgent which implements
// the upgradingMachineAgent interface. This provides enough
// MachineAgent functionality to support upgrades.
func NewFakeUpgradingMachineAgent(confSetter agent.ConfigSetter) *fakeUpgradingMachineAgent {
	return &fakeUpgradingMachineAgent{
		config:  confSetter,
		DyingCh: make(chan struct{}),
	}
}

// fakeUpgradingMachineAgent is a test double for the machine agent
// used by upgrade worker tests. It records machine status calls and
// exposes a DyingCh to simulate agent shutdown.
type fakeUpgradingMachineAgent struct {
	config             agent.ConfigSetter
	DyingCh            chan struct{}
	MachineStatusCalls []MachineStatusCall
}

// MachineStatusCall captures the arguments of one setMachineStatus call.
type MachineStatusCall struct {
	Status params.Status
	Info   string
}

func (a *fakeUpgradingMachineAgent) setMachineStatus(_ api.Connection, status params.Status, info string) error {
	// Record setMachineStatus calls for later inspection.
1010 a.MachineStatusCalls = append(a.MachineStatusCalls, MachineStatusCall{status, info}) 1011 return nil 1012 } 1013 1014 func (a *fakeUpgradingMachineAgent) ensureMongoServer(agent.Config) error { 1015 return nil 1016 } 1017 1018 func (a *fakeUpgradingMachineAgent) CurrentConfig() agent.Config { 1019 return a.config 1020 } 1021 1022 func (a *fakeUpgradingMachineAgent) ChangeConfig(mutate agent.ConfigMutator) error { 1023 return mutate(a.config) 1024 } 1025 1026 func (a *fakeUpgradingMachineAgent) Dying() <-chan struct{} { 1027 return a.DyingCh 1028 }