github.com/Pankov404/juju@v0.0.0-20150703034450-be266991dceb/cmd/jujud/agent/upgrade_test.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package agent

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	jc "github.com/juju/testing/checkers"
	"github.com/juju/utils"
	pacman "github.com/juju/utils/packaging/manager"
	gc "gopkg.in/check.v1"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	"github.com/juju/juju/apiserver/params"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/constraints"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/environs/config"
	envtesting "github.com/juju/juju/environs/testing"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/state/watcher"
	coretesting "github.com/juju/juju/testing"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/upgrader"
)

type UpgradeSuite struct {
	commonMachineSuite

	aptCmds         []*exec.Cmd
	oldVersion      version.Binary
	logWriter       loggo.TestWriter
	connectionDead  bool
	machineIsMaster bool
	aptMutex        sync.Mutex
}

var _ = gc.Suite(&UpgradeSuite{})

type exposedAPI bool

var (
	FullAPIExposed       exposedAPI = true
	RestrictedAPIExposed exposedAPI = false
)

const fails = true
const succeeds = false

func (s *UpgradeSuite) setAptCmds(cmd *exec.Cmd) {
	s.aptMutex.Lock()
	defer s.aptMutex.Unlock()
	if cmd == nil {
		s.aptCmds = nil
	} else {
		s.aptCmds = append(s.aptCmds, cmd)
	}
}

func (s *UpgradeSuite) getAptCmds() []*exec.Cmd {
	s.aptMutex.Lock()
	defer s.aptMutex.Unlock()
	return s.aptCmds
}

func (s *UpgradeSuite) SetUpTest(c *gc.C) {
	s.commonMachineSuite.SetUpTest(c)

	// Clear s.aptCmds.
	s.setAptCmds(nil)

	// Capture all apt commands.
	aptCmds := s.AgentSuite.HookCommandOutput(&pacman.CommandOutput, nil, nil)
	go func() {
		for cmd := range aptCmds {
			s.setAptCmds(cmd)
		}
	}()

	s.oldVersion = version.Current
	s.oldVersion.Major = 1
	s.oldVersion.Minor = 16

	// Don't wait so long in tests.
	s.PatchValue(&upgradeStartTimeoutMaster, time.Duration(time.Millisecond*50))
	s.PatchValue(&upgradeStartTimeoutSecondary, time.Duration(time.Millisecond*60))

	// Allow tests to make the API connection appear to be dead.
	s.connectionDead = false
	s.PatchValue(&cmdutil.ConnectionIsDead, func(loggo.Logger, cmdutil.Pinger) bool {
		return s.connectionDead
	})

	var fakeOpenStateForUpgrade = func(upgradingMachineAgent, agent.Config) (*state.State, error) {
		mongoInfo := s.State.MongoConnectionInfo()
		st, err := state.Open(mongoInfo, mongo.DefaultDialOpts(), environs.NewStatePolicy())
		c.Assert(err, jc.ErrorIsNil)
		return st, nil
	}
	s.PatchValue(&openStateForUpgrade, fakeOpenStateForUpgrade)

	s.machineIsMaster = true
	fakeIsMachineMaster := func(*state.State, string) (bool, error) {
		return s.machineIsMaster, nil
	}
	s.PatchValue(&isMachineMaster, fakeIsMachineMaster)
	// Most of these tests normally finish sub-second on a fast machine.
	// If any given test hits a minute, we have almost certainly become
	// wedged, so dump the logs.
	coretesting.DumpTestLogsAfter(time.Minute, c, s)
}

func (s *UpgradeSuite) captureLogs(c *gc.C) {
	c.Assert(loggo.RegisterWriter("upgrade-tests", &s.logWriter, loggo.INFO), gc.IsNil)
	s.AddCleanup(func(*gc.C) {
		loggo.RemoveWriter("upgrade-tests")
		s.logWriter.Clear()
	})
}

func (s *UpgradeSuite) countUpgradeAttempts(upgradeErr error) *int {
	count := 0
	s.PatchValue(&upgradesPerformUpgrade, func(version.Number, []upgrades.Target, upgrades.Context) error {
		count++
		return upgradeErr
	})
	return &count
}

func (s *UpgradeSuite) TestContextInitializeWhenNoUpgradeRequired(c *gc.C) {
	// Set the agent's initial upgradedToVersion to almost the same as
	// the current version. We want it to be different to
	// version.Current (so that we can see it change) but not to
	// trigger upgrade steps.
	config := NewFakeConfigSetter(names.NewMachineTag("0"), makeBumpedCurrentVersion().Number)
	agent := NewFakeUpgradingMachineAgent(config)

	context := NewUpgradeWorkerContext()
	context.InitializeUsingAgent(agent)

	select {
	case <-context.UpgradeComplete:
		// Success
	default:
		c.Fatal("UpgradeComplete channel should be closed because no upgrade is required")
	}
	// The agent's version should have been updated.
	c.Assert(config.Version, gc.Equals, version.Current.Number)
}

func (s *UpgradeSuite) TestContextInitializeWhenUpgradeRequired(c *gc.C) {
	// Set the agent's upgradedToVersion so that upgrade steps are required.
	initialVersion := version.MustParse("1.16.0")
	config := NewFakeConfigSetter(names.NewMachineTag("0"), initialVersion)
	agent := NewFakeUpgradingMachineAgent(config)

	context := NewUpgradeWorkerContext()
	context.InitializeUsingAgent(agent)

	select {
	case <-context.UpgradeComplete:
		c.Fatal("UpgradeComplete channel shouldn't be closed because upgrade is required")
	default:
		// Success
	}
	// The agent's version should NOT have been updated.
	c.Assert(config.Version, gc.Equals, initialVersion)
}

func (s *UpgradeSuite) TestRetryStrategy(c *gc.C) {
	retries := getUpgradeRetryStrategy()
	c.Assert(retries.Delay, gc.Equals, 2*time.Minute)
	c.Assert(retries.Min, gc.Equals, 5)
}

func (s *UpgradeSuite) TestIsUpgradeRunning(c *gc.C) {
	context := NewUpgradeWorkerContext()
	c.Assert(context.IsUpgradeRunning(), jc.IsTrue)

	close(context.UpgradeComplete)
	c.Assert(context.IsUpgradeRunning(), jc.IsFalse)
}

func (s *UpgradeSuite) TestNoUpgradeNecessary(c *gc.C) {
	attemptsP := s.countUpgradeAttempts(nil)
	s.captureLogs(c)
	s.oldVersion = version.Current // nothing to do

	workerErr, config, _, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, version.Current.Number)
	assertUpgradeComplete(c, context)
}

func (s *UpgradeSuite) TestUpgradeStepsFailure(c *gc.C) {
	// This test checks what happens when every upgrade attempt fails.
	// A number of retries should be observed and the agent should end
	// up in a state where it is still running but is reporting an
	// error and the upgrade is not flagged as having completed (which
	// prevents most of the agent's workers from running and keeps the
	// API in restricted mode).

	attemptsP := s.countUpgradeAttempts(errors.New("boom"))
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	// The worker shouldn't return an error so that the worker and
	// agent keep running.
	c.Check(workerErr, gc.IsNil)

	c.Check(*attemptsP, gc.Equals, maxUpgradeRetries)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't finish
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals,
		s.makeExpectedStatusCalls(maxUpgradeRetries-1, fails, "boom"))
	c.Assert(s.logWriter.Log(), jc.LogMatches,
		s.makeExpectedUpgradeLogs(maxUpgradeRetries-1, "hostMachine", fails, "boom"))
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestUpgradeStepsRetries(c *gc.C) {
	// This test checks what happens when the first upgrade attempt
	// fails but the following one succeeds. The final state should be
	// the same as a successful upgrade which worked first go.
	attempts := 0
	fail := true
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		attempts++
		if fail {
			fail = false
			return errors.New("boom")
		} else {
			return nil
		}
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.IsNil)
	c.Check(attempts, gc.Equals, 2)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade finished
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, s.makeExpectedStatusCalls(1, succeeds, "boom"))
	c.Assert(s.logWriter.Log(), jc.LogMatches, s.makeExpectedUpgradeLogs(1, "hostMachine", succeeds, "boom"))
	assertUpgradeComplete(c, context)
}

func (s *UpgradeSuite) TestOtherUpgradeRunFailure(c *gc.C) {
	// This test checks what happens when something other than the
	// upgrade steps themselves fails, ensuring the failure is logged
	// and the agent status is updated.

	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Delete UpgradeInfo for the upgrade so that finaliseUpgrade() will fail.
		s.State.ClearUpgradeInfo()
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)
	s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade almost finished
	failReason := `upgrade done but: cannot set upgrade status to "finishing": ` +
		`Another status change may have occurred concurrently`
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals,
		s.makeExpectedStatusCalls(0, fails, failReason))
	c.Assert(s.logWriter.Log(), jc.LogMatches,
		s.makeExpectedUpgradeLogs(0, "databaseMaster", fails, failReason))
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestApiConnectionFailure(c *gc.C) {
	// This test checks what happens when an upgrade fails because the
	// connection to mongo has gone away. This will happen when the
	// mongo master changes. In this case we want the upgrade worker
	// to return immediately without further retries. The error should
	// be returned by the worker so that the agent will restart.

	attemptsP := s.countUpgradeAttempts(errors.New("boom"))
	s.connectionDead = true // Make the connection to state appear to be dead
	s.captureLogs(c)

	workerErr, config, _, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.ErrorMatches, "API connection lost during upgrade: boom")
	c.Check(*attemptsP, gc.Equals, 1)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't finish
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestAbortWhenOtherStateServerDoesntStartUpgrade(c *gc.C) {
	// This test checks what happens when a state server is upgrading
	// and one of the other state servers doesn't signal that it is
	// ready in time.

	// The master state server in this scenario is functionally tested
	// elsewhere in this suite.
	s.machineIsMaster = false

	s.createUpgradingStateServers(c)
	s.captureLogs(c)
	attemptsP := s.countUpgradeAttempts(nil)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't happen
	assertUpgradeNotComplete(c, context)

	// The environment agent-version should still be the new version.
	// It's up to the master to trigger the rollback.
	s.assertEnvironAgentVersion(c, version.Current.Number)

	causeMsg := " timed out after 60ms"
	c.Assert(s.logWriter.Log(), jc.LogMatches, []jc.SimpleMessage{
		{loggo.INFO, "waiting for other state servers to be ready for upgrade"},
		{loggo.ERROR, "aborted wait for other state servers: timed out after 60ms"},
		{loggo.ERROR, `upgrade from .+ to .+ for "machine-0" failed \(giving up\): ` +
			"aborted wait for other state servers:" + causeMsg},
	})
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, []MachineStatusCall{{
		params.StatusError,
		fmt.Sprintf(
			"upgrade to %s failed (giving up): aborted wait for other state servers:"+causeMsg,
			version.Current.Number),
	}})
}

func (s *UpgradeSuite) TestWorkerAbortsIfAgentDies(c *gc.C) {
	s.machineIsMaster = false
	s.captureLogs(c)
	attemptsP := s.countUpgradeAttempts(nil)

	s.primeAgent(c, s.oldVersion, state.JobManageEnviron)

	config := s.makeFakeConfig()
	agent := NewFakeUpgradingMachineAgent(config)
	close(agent.DyingCh)
	workerErr, context := s.runUpgradeWorkerUsingAgent(c, agent, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't happen
	assertUpgradeNotComplete(c, context)
	c.Assert(s.logWriter.Log(), jc.LogMatches, []jc.SimpleMessage{
		{loggo.WARNING, "stopped waiting for other state servers: machine agent is terminating"},
	})
}

func (s *UpgradeSuite) TestSuccessMaster(c *gc.C) {
	// This test checks what happens when an upgrade works on the
	// first attempt on a master state server.
	s.machineIsMaster = true
	info := s.checkSuccess(c, "databaseMaster", func(*state.UpgradeInfo) {})
	c.Assert(info.Status(), gc.Equals, state.UpgradeFinishing)
}

func (s *UpgradeSuite) TestSuccessSecondary(c *gc.C) {
	// This test checks what happens when an upgrade works on the
	// first attempt on a secondary state server.
	s.machineIsMaster = false
	mungeInfo := func(info *state.UpgradeInfo) {
		// Indicate that the master is done.
		err := info.SetStatus(state.UpgradeRunning)
		c.Assert(err, jc.ErrorIsNil)
		err = info.SetStatus(state.UpgradeFinishing)
		c.Assert(err, jc.ErrorIsNil)
	}
	s.checkSuccess(c, "stateServer", mungeInfo)
}

func (s *UpgradeSuite) checkSuccess(c *gc.C, target string, mungeInfo func(*state.UpgradeInfo)) *state.UpgradeInfo {
	_, machineIdB, machineIdC := s.createUpgradingStateServers(c)

	// Indicate that machines B and C are ready to upgrade.
	vPrevious := s.oldVersion.Number
	vNext := version.Current.Number
	info, err := s.State.EnsureUpgradeInfo(machineIdB, vPrevious, vNext)
	c.Assert(err, jc.ErrorIsNil)
	_, err = s.State.EnsureUpgradeInfo(machineIdC, vPrevious, vNext)
	c.Assert(err, jc.ErrorIsNil)

	mungeInfo(info)

	attemptsP := s.countUpgradeAttempts(nil)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 1)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade finished
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, s.makeExpectedStatusCalls(0, succeeds, ""))
	c.Assert(s.logWriter.Log(), jc.LogMatches, s.makeExpectedUpgradeLogs(0, target, succeeds, ""))
	assertUpgradeComplete(c, context)

	err = info.Refresh()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(info.StateServersDone(), jc.DeepEquals, []string{"0"})
	return info
}

func (s *UpgradeSuite) TestJobsToTargets(c *gc.C) {
	check := func(jobs []multiwatcher.MachineJob, isMaster bool, expectedTargets ...upgrades.Target) {
		c.Assert(jobsToTargets(jobs, isMaster), jc.SameContents, expectedTargets)
	}

	check([]multiwatcher.MachineJob{multiwatcher.JobHostUnits}, false, upgrades.HostMachine)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron}, false, upgrades.StateServer)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron}, true,
		upgrades.StateServer, upgrades.DatabaseMaster)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron, multiwatcher.JobHostUnits}, false,
		upgrades.StateServer, upgrades.HostMachine)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron, multiwatcher.JobHostUnits}, true,
		upgrades.StateServer, upgrades.DatabaseMaster, upgrades.HostMachine)
}

func (s *UpgradeSuite) TestUpgradeStepsStateServer(c *gc.C) {
	coretesting.SkipIfI386(c, "lp:1444576")
	coretesting.SkipIfPPC64EL(c, "lp:1444576")
	coretesting.SkipIfWindowsBug(c, "lp:1446885")
	s.setInstantRetryStrategy(c)
	// Upload tools to provider storage, so they can be migrated to environment storage.
	stor, err := environs.LegacyStorage(s.State)
	if !errors.IsNotSupported(err) {
		c.Assert(err, jc.ErrorIsNil)
		envtesting.AssertUploadFakeToolsVersions(
			c, stor, "releases", s.Environ.Config().AgentStream(), s.oldVersion)
	}

	s.assertUpgradeSteps(c, state.JobManageEnviron)
	s.assertStateServerUpgrades(c)
}

func (s *UpgradeSuite) TestUpgradeStepsHostMachine(c *gc.C) {
	coretesting.SkipIfPPC64EL(c, "lp:1444576")
	coretesting.SkipIfWindowsBug(c, "lp:1446885")
	s.setInstantRetryStrategy(c)
	// We need to first start up a state server that thinks it has already been upgraded.
	ss, _, _ := s.primeAgent(c, version.Current, state.JobManageEnviron)
	a := s.newAgent(c, ss)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	defer func() { c.Check(a.Stop(), gc.IsNil) }()
	// Now run the test.
	s.assertUpgradeSteps(c, state.JobHostUnits)
	s.assertHostUpgrades(c)
}

func (s *UpgradeSuite) TestLoginsDuringUpgrade(c *gc.C) {
	// Create machine agent to upgrade.
	machine, machine0Conf, _ := s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	a := s.newAgent(c, machine)

	// Mock out upgrade logic, using a channel so that the test knows
	// when upgrades have started and can control when upgrades
	// should finish.
	upgradeCh := make(chan bool)
	abort := make(chan bool)
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Signal that the upgrade has started.
		select {
		case upgradeCh <- true:
		case <-abort:
			return nil
		}

		// Wait for the signal that upgrades should finish.
		select {
		case <-upgradeCh:
		case <-abort:
			return nil
		}
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)

	// Start the API server and upgrade-steps workers just as the agent would.
	runner := worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant)
	defer func() {
		close(abort)
		runner.Kill()
		runner.Wait()
	}()
	certChangedChan := make(chan params.StateServingInfo)
	runner.StartWorker("apiserver", a.apiserverWorkerStarter(s.State, certChangedChan))
	runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(
		s.APIState,
		[]multiwatcher.MachineJob{multiwatcher.JobManageEnviron},
	))

	// Set up a second machine to log in as.
	// API logins are tested manually so there's no need to actually
	// start this machine.
	var machine1Conf agent.Config
	_, machine1Conf, _ = s.primeAgent(c, version.Current, state.JobHostUnits)

	c.Assert(waitForUpgradeToStart(upgradeCh), jc.IsTrue)

	// Only user and local logins are allowed during upgrade. Users get a restricted API.
	s.checkLoginToAPIAsUser(c, machine0Conf, RestrictedAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsFalse)

	close(upgradeCh) // Allow upgrade to complete

	waitForUpgradeToFinish(c, machine0Conf)

	// Only user and local logins are allowed even after the upgrade steps
	// because the agent upgrade is not finished yet.
	s.checkLoginToAPIAsUser(c, machine0Conf, RestrictedAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsFalse)

	machineAPI := s.OpenAPIAsMachine(c, machine.Tag(), initialMachinePassword, agent.BootstrapNonce)
	runner.StartWorker("upgrader", a.agentUpgraderWorkerStarter(machineAPI.Upgrader(), machine0Conf))
	// Wait for the agent upgrade worker to determine that no
	// agent upgrades are required.
	select {
	case <-a.initialAgentUpgradeCheckComplete:
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timeout waiting for upgrade check")
	}

	// All logins are allowed after upgrade.
	s.checkLoginToAPIAsUser(c, machine0Conf, FullAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsTrue)
}

func (s *UpgradeSuite) TestUpgradeSkippedIfNoUpgradeRequired(c *gc.C) {
	attempts := 0
	upgradeCh := make(chan bool)
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Note: this shouldn't run.
		attempts++
		// If execution ends up here, wait so it can be detected (by
		// checking for the restricted API).
		<-upgradeCh
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)

	// Set up a machine agent running the current version.
	//
	// Set the agent's initial upgradedToVersion to be almost the same
	// as version.Current but not quite. We want it to be different to
	// version.Current (so that we can see it change) but not to
	// trigger upgrade steps.
	initialVersion := makeBumpedCurrentVersion()
	machine, agentConf, _ := s.primeAgent(c, initialVersion, state.JobManageEnviron)
	a := s.newAgent(c, machine)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	defer func() {
		close(upgradeCh)
		c.Check(a.Stop(), gc.IsNil)
	}()

	// Test that unrestricted API logins are possible (i.e. no
	// "upgrade mode" in force).
	s.checkLoginToAPIAsUser(c, agentConf, FullAPIExposed)
	c.Assert(attempts, gc.Equals, 0) // There should have been no attempt to upgrade.

	// Even though no upgrade was done, upgradedToVersion should have been updated.
	c.Assert(a.CurrentConfig().UpgradedToVersion(), gc.Equals, version.Current.Number)
}

func (s *UpgradeSuite) TestDowngradeOnMasterWhenOtherStateServerDoesntStartUpgrade(c *gc.C) {
	coretesting.SkipIfWindowsBug(c, "lp:1446885")
	// This test checks that the master triggers a downgrade if one of
	// the other state servers fails to signal that it is ready for upgrade.
	//
	// This test is functional, ensuring that the upgrader worker
	// terminates the machine agent with the UpgradeReadyError which
	// makes the downgrade happen.

	// Speed up the watcher frequency to make the test much faster.
	s.PatchValue(&watcher.Period, 200*time.Millisecond)

	// Provide (fake) tools so that the upgrader has something to downgrade to.
	envtesting.AssertUploadFakeToolsVersions(
		c, s.DefaultToolsStorage, s.Environ.Config().AgentStream(), s.Environ.Config().AgentStream(), s.oldVersion)

	// Only the first machine is going to be ready for upgrade.
	machineIdA, machineIdB, _ := s.createUpgradingStateServers(c)

	// One of the other state servers is ready for upgrade (but machine C isn't).
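	// EnsureUpgradeInfo records machine B as ready to upgrade; machine C
	// never reports in, which is what should force the master to give up
	// and downgrade.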
	info, err := s.State.EnsureUpgradeInfo(machineIdB, s.oldVersion.Number, version.Current.Number)
	c.Assert(err, jc.ErrorIsNil)

	agent := s.newAgentFromMachineId(c, machineIdA)
	defer agent.Stop()

	s.machineIsMaster = true

	var agentErr error
	agentDone := make(chan bool)
	go func() {
		agentErr = agent.Run(nil)
		close(agentDone)
	}()

	select {
	case <-agentDone:
		upgradeReadyErr, ok := agentErr.(*upgrader.UpgradeReadyError)
		if !ok {
			c.Fatalf("didn't see UpgradeReadyError, instead got: %v", agentErr)
		}
		// Confirm that the downgrade is back to the previous version.
		c.Assert(upgradeReadyErr.OldTools, gc.Equals, version.Current)
		c.Assert(upgradeReadyErr.NewTools, gc.Equals, s.oldVersion)

	case <-time.After(coretesting.LongWait):
		c.Fatal("machine agent did not exit as expected")
	}

	// The UpgradeInfo doc should now be archived.
	err = info.Refresh()
	c.Assert(err, gc.ErrorMatches, "current upgrade info not found")
}

// runUpgradeWorker runs just the upgrade-steps worker with a fake
// machine agent and fake agent config.
func (s *UpgradeSuite) runUpgradeWorker(c *gc.C, jobs ...multiwatcher.MachineJob) (
	error, *fakeConfigSetter, *fakeUpgradingMachineAgent, *upgradeWorkerContext,
) {
	config := s.makeFakeConfig()
	agent := NewFakeUpgradingMachineAgent(config)
	err, context := s.runUpgradeWorkerUsingAgent(c, agent, jobs...)
	return err, config, agent, context
}

// runUpgradeWorkerUsingAgent runs just the upgrade-steps worker with
// the fake machine agent provided.
func (s *UpgradeSuite) runUpgradeWorkerUsingAgent(
	c *gc.C,
	agent *fakeUpgradingMachineAgent,
	jobs ...multiwatcher.MachineJob,
) (error, *upgradeWorkerContext) {
	s.setInstantRetryStrategy(c)
	context := NewUpgradeWorkerContext()
	worker := context.Worker(agent, nil, jobs)
	return worker.Wait(), context
}

func (s *UpgradeSuite) makeFakeConfig() *fakeConfigSetter {
	return NewFakeConfigSetter(names.NewMachineTag("0"), s.oldVersion.Number)
}

// createUpgradingStateServers creates three configured state servers
// that appear to be running tools with version s.oldVersion and
// returns their ids.
func (s *UpgradeSuite) createUpgradingStateServers(c *gc.C) (machineIdA, machineIdB, machineIdC string) {
	machine0, _, _ := s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	machineIdA = machine0.Id()

	changes, err := s.State.EnsureAvailability(3, constraints.Value{}, "quantal", nil)
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(len(changes.Added), gc.Equals, 2)
	machineIdB = changes.Added[0]
	s.configureMachine(c, machineIdB, s.oldVersion)
	machineIdC = changes.Added[1]
	s.configureMachine(c, machineIdC, s.oldVersion)

	return
}

func (s *UpgradeSuite) newAgentFromMachineId(c *gc.C, machineId string) *MachineAgent {
	machine, err := s.State.Machine(machineId)
	c.Assert(err, jc.ErrorIsNil)
	return s.newAgent(c, machine)
}

// makeBumpedCurrentVersion returns a version the same as the current
// software version, but with the build number bumped.
//
// The version Tag is also cleared so that upgrades.PerformUpgrade
// doesn't think it needs to run upgrade steps unnecessarily.
func makeBumpedCurrentVersion() version.Binary {
	v := version.Current
	v.Build++
	v.Tag = ""
	return v
}

func waitForUpgradeToStart(upgradeCh chan bool) bool {
	select {
	case <-upgradeCh:
		return true
	case <-time.After(coretesting.LongWait):
		return false
	}
}

const maxUpgradeRetries = 3

func (s *UpgradeSuite) setInstantRetryStrategy(c *gc.C) {
	s.PatchValue(&getUpgradeRetryStrategy, func() utils.AttemptStrategy {
		c.Logf("setting instant retry strategy for upgrade: retries=%d", maxUpgradeRetries)
		return utils.AttemptStrategy{
			Delay: 0,
			Min:   maxUpgradeRetries,
		}
	})
}

func (s *UpgradeSuite) makeExpectedStatusCalls(retryCount int, expectFail bool, failReason string) []MachineStatusCall {
	calls := []MachineStatusCall{{
		params.StatusStarted,
		fmt.Sprintf("upgrading to %s", version.Current.Number),
	}}
	for i := 0; i < retryCount; i++ {
		calls = append(calls, MachineStatusCall{
			params.StatusError,
			fmt.Sprintf("upgrade to %s failed (will retry): %s", version.Current.Number, failReason),
		})
	}
	if expectFail {
		calls = append(calls, MachineStatusCall{
			params.StatusError,
			fmt.Sprintf("upgrade to %s failed (giving up): %s", version.Current.Number, failReason),
		})
	} else {
		calls = append(calls, MachineStatusCall{params.StatusStarted, ""})
	}
	return calls
}

func (s *UpgradeSuite) makeExpectedUpgradeLogs(
	retryCount int,
	target string,
	expectFail bool,
	failReason string,
) []jc.SimpleMessage {
	outLogs := []jc.SimpleMessage{}

	if target == "databaseMaster" || target == "stateServer" {
		outLogs = append(outLogs, jc.SimpleMessage{
			loggo.INFO, "waiting for other state servers to be ready for upgrade",
		})
		var waitMsg string
		switch target {
		case "databaseMaster":
			waitMsg = "all state servers are ready to run upgrade steps"
		case "stateServer":
			waitMsg = "the master has completed its upgrade steps"
		}
		outLogs = append(outLogs, jc.SimpleMessage{loggo.INFO, "finished waiting - " + waitMsg})
	}

	outLogs = append(outLogs, jc.SimpleMessage{
		loggo.INFO, fmt.Sprintf(
			`starting upgrade from %s to %s for "machine-0"`,
			s.oldVersion.Number, version.Current.Number),
	})

	failMessage := fmt.Sprintf(
		`upgrade from %s to %s for "machine-0" failed \(%%s\): %s`,
		s.oldVersion.Number, version.Current.Number, failReason)

	for i := 0; i < retryCount; i++ {
		outLogs = append(outLogs, jc.SimpleMessage{loggo.ERROR, fmt.Sprintf(failMessage, "will retry")})
	}
	if expectFail {
		outLogs = append(outLogs, jc.SimpleMessage{loggo.ERROR, fmt.Sprintf(failMessage, "giving up")})
	} else {
		outLogs = append(outLogs, jc.SimpleMessage{loggo.INFO,
			fmt.Sprintf(`upgrade to %s completed successfully.`, version.Current.Number)})
	}
	return outLogs
}

func (s *UpgradeSuite) assertUpgradeSteps(c *gc.C, job state.MachineJob) {
	agent, stopFunc := s.createAgentAndStartUpgrade(c, job)
	defer stopFunc()
	waitForUpgradeToFinish(c, agent.CurrentConfig())
}

func (s *UpgradeSuite) keyFile() string {
	return filepath.Join(s.DataDir(), "system-identity")
}

func (s *UpgradeSuite) assertCommonUpgrades(c *gc.C) {
	// rsyslog-gnutls should have been installed.
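	// The apt commands captured via HookCommandOutput in SetUpTest are
	// retrieved here and checked for the expected apt-get invocation.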
	cmds := s.getAptCmds()
	c.Assert(cmds, gc.HasLen, 1)
	args := cmds[0].Args
	c.Assert(len(args), jc.GreaterThan, 1)
	c.Assert(args[0], gc.Equals, "apt-get")
	c.Assert(args[len(args)-1], gc.Equals, "rsyslog-gnutls")
}

func (s *UpgradeSuite) assertStateServerUpgrades(c *gc.C) {
	s.assertCommonUpgrades(c)
	// System SSH key
	c.Assert(s.keyFile(), jc.IsNonEmptyFile)
	// Syslog port should have been updated.
	cfg, err := s.State.EnvironConfig()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(cfg.SyslogPort(), gc.Equals, config.DefaultSyslogPort)
	// Deprecated attributes should have been deleted - just test a couple.
	allAttrs := cfg.AllAttrs()
	_, ok := allAttrs["public-bucket"]
	c.Assert(ok, jc.IsFalse)
	_, ok = allAttrs["public-bucket-region"]
	c.Assert(ok, jc.IsFalse)
}

func (s *UpgradeSuite) assertHostUpgrades(c *gc.C) {
	s.assertCommonUpgrades(c)
	// Lock directory
	// TODO(bogdanteleaga): Fix this on windows. Currently a bash script is
	// used to create the directory which partially works on windows 8 but
	// doesn't work on windows server.
	lockdir := filepath.Join(s.DataDir(), "locks")
	c.Assert(lockdir, jc.IsDirectory)
	// SSH key file should not be generated for hosts.
	_, err := os.Stat(s.keyFile())
	c.Assert(err, jc.Satisfies, os.IsNotExist)
	// Syslog port should not have been updated.
	cfg, err := s.State.EnvironConfig()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(cfg.SyslogPort(), gc.Not(gc.Equals), config.DefaultSyslogPort)
	// Add other checks as needed...
}

func (s *UpgradeSuite) createAgentAndStartUpgrade(c *gc.C, job state.MachineJob) (*MachineAgent, func()) {
	machine, _, _ := s.primeAgent(c, s.oldVersion, job)
	a := s.newAgent(c, machine)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	return a, func() { c.Check(a.Stop(), gc.IsNil) }
}

func (s *UpgradeSuite) assertEnvironAgentVersion(c *gc.C, expected version.Number) {
	envConfig, err := s.State.EnvironConfig()
	c.Assert(err, jc.ErrorIsNil)
	agentVersion, ok := envConfig.AgentVersion()
	c.Assert(ok, jc.IsTrue)
	c.Assert(agentVersion, gc.Equals, expected)
}

func waitForUpgradeToFinish(c *gc.C, conf agent.Config) {
	success := false
	for attempt := coretesting.LongAttempt.Start(); attempt.Next(); {
		diskConf := readConfigFromDisk(c, conf.DataDir(), conf.Tag())
		success = diskConf.UpgradedToVersion() == version.Current.Number
		if success {
			break
		}
	}
	c.Assert(success, jc.IsTrue)
}

func readConfigFromDisk(c *gc.C, dir string, tag names.Tag) agent.Config {
	conf, err := agent.ReadConfig(agent.ConfigPath(dir, tag))
	c.Assert(err, jc.ErrorIsNil)
	return conf
}

func (s *UpgradeSuite) checkLoginToAPIAsUser(c *gc.C, conf agent.Config, expectFullApi exposedAPI) {
	var err error
	// Multiple attempts may be necessary because there is a small gap
	// between the post-upgrade version being written to the agent's
	// config (as observed by waitForUpgradeToFinish) and the end of
	// "upgrade mode" (i.e. when the agent's UpgradeComplete channel
	// is closed). Without this, tests that call checkLoginToAPIAsUser
	// can occasionally fail.
	for a := coretesting.LongAttempt.Start(); a.Next(); {
		err = s.attemptRestrictedAPIAsUser(c, conf)
		switch expectFullApi {
		case FullAPIExposed:
			if err == nil {
				return
			}
		case RestrictedAPIExposed:
			if err != nil && strings.HasPrefix(err.Error(), "upgrade in progress") {
				return
			}
		}
	}
	c.Fatalf("timed out waiting for expected API behaviour. last error was: %v", err)
}

func (s *UpgradeSuite) attemptRestrictedAPIAsUser(c *gc.C, conf agent.Config) error {
	info := conf.APIInfo()
	info.Tag = s.AdminUserTag(c)
	info.Password = "dummy-secret"
	info.Nonce = ""

	apiState, err := api.Open(info, upgradeTestDialOpts)
	c.Assert(err, jc.ErrorIsNil)
	defer apiState.Close()

	// This call should always work.
	var result api.Status
	err = apiState.APICall("Client", 0, "", "FullStatus", nil, &result)
	c.Assert(err, jc.ErrorIsNil)

	// This call should only work if the API is not restricted.
	return apiState.APICall("Client", 0, "", "WatchAll", nil, nil)
}

func canLoginToAPIAsMachine(c *gc.C, fromConf, toConf agent.Config) bool {
	info := fromConf.APIInfo()
	info.Addrs = toConf.APIInfo().Addrs
	apiState, err := api.Open(info, upgradeTestDialOpts)
	if apiState != nil {
		apiState.Close()
	}
	return apiState != nil && err == nil
}

var upgradeTestDialOpts = api.DialOpts{
	Timeout:             2 * time.Minute,
	RetryDelay:          250 * time.Millisecond,
	DialAddressInterval: 50 * time.Millisecond,
}

func assertUpgradeComplete(c *gc.C, context *upgradeWorkerContext) {
	select {
	case <-context.UpgradeComplete:
	default:
		c.Error("UpgradeComplete channel is open but shouldn't be")
	}
}

func assertUpgradeNotComplete(c *gc.C, context *upgradeWorkerContext) {
	select {
	case <-context.UpgradeComplete:
		c.Error("UpgradeComplete channel is closed but shouldn't be")
	default:
	}
}

// NewFakeConfigSetter returns a fakeConfigSetter which implements
// just enough of the agent.ConfigSetter interface to keep the upgrade
// steps worker happy.
func NewFakeConfigSetter(agentTag names.Tag, initialVersion version.Number) *fakeConfigSetter {
	return &fakeConfigSetter{
		AgentTag: agentTag,
		Version:  initialVersion,
	}
}

type fakeConfigSetter struct {
	agent.ConfigSetter
	AgentTag names.Tag
	Version  version.Number
}

func (s *fakeConfigSetter) Tag() names.Tag {
	return s.AgentTag
}

func (s *fakeConfigSetter) UpgradedToVersion() version.Number {
	return s.Version
}

func (s *fakeConfigSetter) SetUpgradedToVersion(newVersion version.Number) {
	s.Version = newVersion
}

// NewFakeUpgradingMachineAgent returns a fakeUpgradingMachineAgent which implements
// the upgradingMachineAgent interface. This provides enough
// MachineAgent functionality to support upgrades.
func NewFakeUpgradingMachineAgent(confSetter agent.ConfigSetter) *fakeUpgradingMachineAgent {
	return &fakeUpgradingMachineAgent{
		config:  confSetter,
		DyingCh: make(chan struct{}),
	}
}

type fakeUpgradingMachineAgent struct {
	config             agent.ConfigSetter
	DyingCh            chan struct{}
	MachineStatusCalls []MachineStatusCall
}

type MachineStatusCall struct {
	Status params.Status
	Info   string
}

func (a *fakeUpgradingMachineAgent) setMachineStatus(_ *api.State, status params.Status, info string) error {
	// Record setMachineStatus calls for later inspection.
	a.MachineStatusCalls = append(a.MachineStatusCalls, MachineStatusCall{status, info})
	return nil
}

func (a *fakeUpgradingMachineAgent) ensureMongoServer(agent.Config) error {
	return nil
}

func (a *fakeUpgradingMachineAgent) CurrentConfig() agent.Config {
	return a.config
}

func (a *fakeUpgradingMachineAgent) ChangeConfig(mutate agent.ConfigMutator) error {
	return mutate(a.config)
}

func (a *fakeUpgradingMachineAgent) Dying() <-chan struct{} {
	return a.DyingCh
}