github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/worker/peergrouper/worker_test.go

// Copyright 2014 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package peergrouper

import (
	"errors"
	"fmt"
	"net"
	"sort"
	"strconv"
	"time"

	"github.com/juju/clock/testclock"
	"github.com/juju/loggo"
	"github.com/juju/pubsub"
	"github.com/juju/replicaset"
	jc "github.com/juju/testing/checkers"
	"github.com/juju/utils/voyeur"
	"github.com/kr/pretty"
	gc "gopkg.in/check.v1"
	"gopkg.in/juju/worker.v1"
	"gopkg.in/juju/worker.v1/workertest"

	"github.com/juju/juju/core/status"
	"github.com/juju/juju/network"
	"github.com/juju/juju/pubsub/apiserver"
	"github.com/juju/juju/state"
	coretesting "github.com/juju/juju/testing"
)

type TestIPVersion struct {
	version     string
	formatHost  string
	extraHost   string
	addressType network.AddressType
}

var (
	testIPv4 = TestIPVersion{
		version:     "IPv4",
		formatHost:  "0.1.2.%d",
		extraHost:   "0.1.99.13",
		addressType: network.IPv4Address,
	}
	testIPv6 = TestIPVersion{
		version:     "IPv6",
		formatHost:  "2001:DB8::%d",
		extraHost:   "2001:DB8::99:13",
		addressType: network.IPv6Address,
	}
)

type workerSuite struct {
	coretesting.BaseSuite
	clock *testclock.Clock
	hub   Hub
}

var _ = gc.Suite(&workerSuite{})

func (s *workerSuite) SetUpTest(c *gc.C) {
	s.BaseSuite.SetUpTest(c)
	s.clock = testclock.NewClock(time.Now())
	s.hub = nopHub{}
	logger.SetLogLevel(loggo.TRACE)
}

type testSuite interface {
	SetUpTest(c *gc.C)
	TearDownTest(c *gc.C)
}

// DoTestForIPv4AndIPv6 runs the passed test for IPv4 and IPv6.
//
// TODO(axw) the type of address has little to do with the
// behaviour of this worker, so we should not need to run the
// tests for each address type. We can introduce a limited
// number (probably one) of feature tests to check that we
// handle both address types as expected.
func DoTestForIPv4AndIPv6(c *gc.C, s testSuite, t func(ipVersion TestIPVersion)) {
	t(testIPv4)
	s.TearDownTest(c)
	s.SetUpTest(c)
	t(testIPv6)
}

// InitState initializes the fake state with a single replica-set member and
// numMachines machines primed to vote.
func InitState(c *gc.C, st *fakeState, numMachines int, ipVersion TestIPVersion) {
	var ids []string
	for i := 10; i < 10+numMachines; i++ {
		id := fmt.Sprint(i)
		m := st.addMachine(id, true)
		m.setAddresses(network.NewAddress(fmt.Sprintf(ipVersion.formatHost, i)))
		ids = append(ids, id)
		c.Assert(m.Addresses(), gc.HasLen, 1)
	}
	st.setControllers(ids...)
	st.session.Set(mkMembers("0v", ipVersion))
	st.session.setStatus(mkStatuses("0p", ipVersion))
	st.machine("10").SetHasVote(true)
	st.setCheck(checkInvariants)
}

// ExpectedAPIHostPorts returns the expected addresses
// of the machines as created by InitState.
func ExpectedAPIHostPorts(n int, ipVersion TestIPVersion) [][]network.HostPort {
	servers := make([][]network.HostPort, n)
	for i := range servers {
		servers[i] = network.NewHostPorts(
			apiPort,
			fmt.Sprintf(ipVersion.formatHost, i+10),
		)
	}
	return servers
}

func (s *workerSuite) TestSetsAndUpdatesMembersIPv4(c *gc.C) {
	s.doTestSetAndUpdateMembers(c, testIPv4)
}

func (s *workerSuite) TestSetsAndUpdatesMembersIPv6(c *gc.C) {
	s.doTestSetAndUpdateMembers(c, testIPv6)
}

func (s *workerSuite) doTestSetAndUpdateMembers(c *gc.C, ipVersion TestIPVersion) {
	c.Logf("\n\nTestSetsAndUpdatesMembers: %s", ipVersion.version)
	st := NewFakeState()
	InitState(c, st, 3, ipVersion)
	memberWatcher := st.session.members.Watch()
	mustNext(c, memberWatcher, "init")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v", ipVersion))

	logger.Infof("starting worker")
	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
	defer workertest.CleanKill(c, w)

	// Due to the inherent complexity of the multiple goroutines running
	// and listening to different watchers, there is no way to manually
	// advance the testing clock in a controlled manner, as the clock.After
	// calls can be replaced in response to other watcher events. Hence
	// using the standard testing clock wait / advance method does not
	// work. So we use the real clock to advance the test clock for this
	// test.
	// Every 5ms we advance the testing clock by pollInterval (1min).
	done := make(chan struct{})
	clockAdvancerFinished := make(chan struct{})
	defer func() {
		close(done)
		select {
		case <-clockAdvancerFinished:
			return
		case <-time.After(coretesting.LongWait):
			c.Error("advancing goroutine didn't finish")
		}
	}()
	go func() {
		defer close(clockAdvancerFinished)
		for {
			select {
			case <-time.After(5 * time.Millisecond):
				s.clock.Advance(pollInterval)
			case <-done:
				return
			}
		}
	}()

	// Wait for the worker to set the initial members.
	mustNext(c, memberWatcher, "initial members")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1 2", ipVersion))

	// Update the status of the new members
	// and check that they become voting.
	c.Logf("\nupdating new member status")
	st.session.setStatus(mkStatuses("0s 1p 2s", ipVersion))
	mustNext(c, memberWatcher, "new member status")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1v 2v", ipVersion))

	c.Logf("\nadding another machine")
	m13 := st.addMachine("13", false)
	m13.setAddresses(network.NewAddress(fmt.Sprintf(ipVersion.formatHost, 13)))
	st.setControllers("10", "11", "12", "13")

	mustNext(c, memberWatcher, "waiting for new member to be added")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1v 2v 3", ipVersion))

	// Remove the vote from an existing member and give it to the new
	// machine. Also set the status of the new machine to healthy.
	c.Logf("\nremoving vote from machine 10 and adding it to machine 13")
	st.machine("10").setWantsVote(false)
	mustNext(c, memberWatcher, "waiting for vote switch")
	assertMembers(c, memberWatcher.Value(), mkMembers("0 1v 2 3", ipVersion))

	st.machine("13").setWantsVote(true)

	st.session.setStatus(mkStatuses("0s 1p 2s 3s", ipVersion))

	// Check that the new machine gets the vote and the
	// old machine loses it.
	mustNext(c, memberWatcher, "waiting for vote switch")
	assertMembers(c, memberWatcher.Value(), mkMembers("0 1v 2v 3v", ipVersion))

	c.Logf("\nremoving old machine")
	// Remove the old machine.
	st.removeMachine("10")
	st.setControllers("11", "12", "13")

	// Check that it's removed from the members.
	mustNext(c, memberWatcher, "waiting for removal")
	assertMembers(c, memberWatcher.Value(), mkMembers("1v 2v 3v", ipVersion))
}

func (s *workerSuite) TestHasVoteMaintainedEvenWhenReplicaSetFailsIPv4(c *gc.C) {
	s.doTestHasVoteMaintainsEvenWhenReplicaSetFails(c, testIPv4)
}

func (s *workerSuite) TestHasVoteMaintainedEvenWhenReplicaSetFailsIPv6(c *gc.C) {
	s.doTestHasVoteMaintainsEvenWhenReplicaSetFails(c, testIPv6)
}

func (s *workerSuite) doTestHasVoteMaintainsEvenWhenReplicaSetFails(c *gc.C, ipVersion TestIPVersion) {
	st := NewFakeState()

	// Simulate a state where we have four controllers,
	// one has gone down, and we're replacing it:
	// 0 - hasvote true, wantsvote false, down
	// 1 - hasvote true, wantsvote true
	// 2 - hasvote true, wantsvote true
	// 3 - hasvote false, wantsvote true
	//
	// When it starts, the worker should move the vote from
	// 0 to 3. We'll arrange things so that it will succeed in
	// setting the membership but fail setting the HasVote
	// to false.
	InitState(c, st, 4, ipVersion)
	st.machine("10").SetHasVote(true)
	st.machine("11").SetHasVote(true)
	st.machine("12").SetHasVote(true)
	st.machine("13").SetHasVote(false)

	st.machine("10").setWantsVote(false)
	st.machine("11").setWantsVote(true)
	st.machine("12").setWantsVote(true)
	st.machine("13").setWantsVote(true)

	st.session.Set(mkMembers("0v 1v 2v 3", ipVersion))
	st.session.setStatus(mkStatuses("0H 1p 2s 3s", ipVersion))

	// Make the worker fail to set HasVote to false
	// after changing the replica set membership.
	st.errors.setErrorFor("Machine.SetHasVote * false", errors.New("frood"))

	memberWatcher := st.session.members.Watch()
	mustNext(c, memberWatcher, "waiting for SetHasVote failure")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1v 2v 3", ipVersion))

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
	defer workertest.DirtyKill(c, w)

	// Wait for the worker to set the initial members.
	mustNext(c, memberWatcher, "initial members")
	assertMembers(c, memberWatcher.Value(), mkMembers("0 1v 2v 3v", ipVersion))

	// The worker should encounter an error setting the
	// has-vote status to false and exit.
	err := workertest.CheckKilled(c, w)
	c.Assert(err, gc.ErrorMatches, `removing non-voters: cannot set voting status of "[0-9]+" to false: frood`)

	// Start the worker again - although the membership should
	// not change, the HasVote status should be updated correctly.
	st.errors.resetErrors()
	w = s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
	defer workertest.CleanKill(c, w)

	// Watch all the machines for changes, so we can check
	// their has-vote status without polling.
	changed := make(chan struct{}, 1)
	for i := 10; i < 14; i++ {
		watcher := st.machine(fmt.Sprint(i)).val.Watch()
		defer watcher.Close()
		go func() {
			for watcher.Next() {
				select {
				case changed <- struct{}{}:
				default:
				}
			}
		}()
	}
	timeout := time.After(coretesting.LongWait)
loop:
	for {
		select {
		case <-changed:
			correct := true
			for i := 10; i < 14; i++ {
				hasVote := st.machine(fmt.Sprint(i)).HasVote()
				expectHasVote := i != 10
				if hasVote != expectHasVote {
					correct = false
				}
			}
			if correct {
				break loop
			}
		case <-timeout:
			c.Fatalf("timed out waiting for vote to be set")
		}
	}
}

func (s *workerSuite) TestAddressChange(c *gc.C) {
	DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
		st := NewFakeState()
		InitState(c, st, 3, ipVersion)

		memberWatcher := st.session.members.Watch()
		mustNext(c, memberWatcher, "init")
		assertMembers(c, memberWatcher.Value(), mkMembers("0v", ipVersion))

		logger.Infof("starting worker")
		w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
		defer workertest.CleanKill(c, w)

		// Wait for the worker to set the initial members.
		mustNext(c, memberWatcher, "initial members")
		assertMembers(c, memberWatcher.Value(), mkMembers("0v 1 2", ipVersion))

		// Change an address and wait for it to be changed in the
		// members.
		st.machine("11").setAddresses(network.NewAddress(ipVersion.extraHost))

		mustNext(c, memberWatcher, "waiting for new address")
		expectMembers := mkMembers("0v 1 2", ipVersion)
		expectMembers[1].Address = net.JoinHostPort(ipVersion.extraHost, fmt.Sprint(mongoPort))
		assertMembers(c, memberWatcher.Value(), expectMembers)
	})
}

var fatalErrorsTests = []struct {
	errPattern   string
	err          error
	expectErr    string
	advanceCount int
}{{
	errPattern: "State.ControllerInfo",
	expectErr:  "cannot get controller info: sample",
}, {
	errPattern:   "Machine.SetHasVote 11 true",
	expectErr:    `adding new voters: cannot set voting status of "11" to true: sample`,
	advanceCount: 2,
}, {
	errPattern: "Session.CurrentStatus",
	expectErr:  "creating peer group info: cannot get replica set status: sample",
}, {
	errPattern: "Session.CurrentMembers",
	expectErr:  "creating peer group info: cannot get replica set members: sample",
}, {
	errPattern: "State.Machine *",
	expectErr:  `cannot get machine "10": sample`,
}}

func (s *workerSuite) TestFatalErrors(c *gc.C) {
	DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
		s.PatchValue(&pollInterval, 5*time.Millisecond)
		for i, testCase := range fatalErrorsTests {
			c.Logf("\n(%s) test %d: %s -> %s", ipVersion.version, i, testCase.errPattern, testCase.expectErr)
			st := NewFakeState()
			st.session.InstantlyReady = true
			InitState(c, st, 3, ipVersion)
			st.errors.setErrorFor(testCase.errPattern, errors.New("sample"))

			w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
			defer workertest.DirtyKill(c, w)

			for j := 0; j < testCase.advanceCount; j++ {
				s.clock.WaitAdvance(pollInterval, coretesting.ShortWait, 1)
			}
			done := make(chan error)
			go func() {
				done <- w.Wait()
			}()
			select {
			case err := <-done:
				c.Assert(err, gc.ErrorMatches, testCase.expectErr)
			case <-time.After(coretesting.LongWait):
				c.Fatalf("timed out waiting for error")
			}
		}
	})
}

func (s *workerSuite) TestSetMembersErrorIsNotFatal(c *gc.C) {
	DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
		st := NewFakeState()
		InitState(c, st, 3, ipVersion)
		st.session.setStatus(mkStatuses("0p 1s 2s", ipVersion))
		called := make(chan error)
		setErr := errors.New("sample")
		st.errors.setErrorFuncFor("Session.Set", func() error {
			called <- setErr
			return setErr
		})

		w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
		defer workertest.CleanKill(c, w)

		// Just watch three error retries.
		retryInterval := initialRetryInterval
		for i := 0; i < 3; i++ {
			s.clock.WaitAdvance(retryInterval, coretesting.ShortWait, 1)
			retryInterval = scaleRetry(retryInterval)
			select {
			case err := <-called:
				c.Check(err, gc.Equals, setErr)
			case <-time.After(coretesting.LongWait):
				c.Fatalf("timed out waiting for loop #%d", i)
			}
		}
	})
}

type SetAPIHostPortsFunc func(apiServers [][]network.HostPort) error

func (f SetAPIHostPortsFunc) SetAPIHostPorts(apiServers [][]network.HostPort) error {
	return f(apiServers)
}

func (s *workerSuite) TestControllersArePublished(c *gc.C) {
	DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
		publishCh := make(chan [][]network.HostPort)
		publish := func(apiServers [][]network.HostPort) error {
			publishCh <- apiServers
			return nil
		}

		st := NewFakeState()
		InitState(c, st, 3, ipVersion)
		w := s.newWorker(c, st, st.session, SetAPIHostPortsFunc(publish))
		defer workertest.CleanKill(c, w)

		select {
		case servers := <-publishCh:
			AssertAPIHostPorts(c, servers, ExpectedAPIHostPorts(3, ipVersion))
		case <-time.After(coretesting.LongWait):
			c.Fatalf("timed out waiting for publish")
		}

		// If a config change wakes up the loop *after* the controller topology
		// is published, then we will get another call to setAPIHostPorts.
		select {
		case <-publishCh:
		case <-time.After(coretesting.ShortWait):
		}

		// Change one of the server API addresses and check that it is
		// published.
		newMachine10Addresses := network.NewAddresses(ipVersion.extraHost)
		st.machine("10").setAddresses(newMachine10Addresses...)
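		// The worker should publish the API host ports again, now
		// reporting machine 10 at its updated address.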
		select {
		case servers := <-publishCh:
			expected := ExpectedAPIHostPorts(3, ipVersion)
			expected[0] = network.AddressesWithPort(newMachine10Addresses, apiPort)
			AssertAPIHostPorts(c, servers, expected)
		case <-time.After(coretesting.LongWait):
			c.Fatalf("timed out waiting for publish")
		}
	})
}

func (s *workerSuite) TestControllersArePublishedOverHub(c *gc.C) {
	st := NewFakeState()
	InitState(c, st, 3, testIPv4)

	hub := pubsub.NewStructuredHub(nil)
	event := make(chan apiserver.Details)
	_, err := hub.Subscribe(apiserver.DetailsTopic, func(topic string, data apiserver.Details, err error) {
		c.Check(err, jc.ErrorIsNil)
		event <- data
	})
	c.Assert(err, jc.ErrorIsNil)
	s.hub = hub

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
	defer workertest.CleanKill(c, w)

	expected := apiserver.Details{
		Servers: map[string]apiserver.APIServer{
			"10": {ID: "10", Addresses: []string{"0.1.2.10:5678"}, InternalAddress: "0.1.2.10:5678"},
			"11": {ID: "11", Addresses: []string{"0.1.2.11:5678"}, InternalAddress: "0.1.2.11:5678"},
			"12": {ID: "12", Addresses: []string{"0.1.2.12:5678"}, InternalAddress: "0.1.2.12:5678"},
		},
		LocalOnly: true,
	}

	select {
	case obtained := <-event:
		c.Assert(obtained, jc.DeepEquals, expected)
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}
}

func (s *workerSuite) TestControllersPublishedWithControllerAPIPort(c *gc.C) {
	st := NewFakeState()
	InitState(c, st, 3, testIPv4)

	hub := pubsub.NewStructuredHub(nil)
	event := make(chan apiserver.Details)
	_, err := hub.Subscribe(apiserver.DetailsTopic, func(topic string, data apiserver.Details, err error) {
		c.Check(err, jc.ErrorIsNil)
		event <- data
	})
	c.Assert(err, jc.ErrorIsNil)
	s.hub = hub

	w := s.newWorkerWithConfig(c, Config{
		Clock:              s.clock,
		State:              st,
		MongoSession:       st.session,
		APIHostPortsSetter: nopAPIHostPortsSetter{},
		MongoPort:          mongoPort,
		APIPort:            apiPort,
		ControllerAPIPort:  controllerAPIPort,
		Hub:                s.hub,
	})
	defer workertest.CleanKill(c, w)

	expected := apiserver.Details{
		Servers: map[string]apiserver.APIServer{
			"10": {ID: "10", Addresses: []string{"0.1.2.10:5678"}, InternalAddress: "0.1.2.10:9876"},
			"11": {ID: "11", Addresses: []string{"0.1.2.11:5678"}, InternalAddress: "0.1.2.11:9876"},
			"12": {ID: "12", Addresses: []string{"0.1.2.12:5678"}, InternalAddress: "0.1.2.12:9876"},
		},
		LocalOnly: true,
	}

	select {
	case obtained := <-event:
		c.Assert(obtained, jc.DeepEquals, expected)
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}
}

func (s *workerSuite) TestControllersArePublishedOverHubWithNewVoters(c *gc.C) {
	st := NewFakeState()
	var ids []string
	for i := 10; i < 13; i++ {
		id := fmt.Sprint(i)
		m := st.addMachine(id, true)
		m.SetHasVote(true)
		m.setAddresses(network.NewAddress(fmt.Sprintf(testIPv4.formatHost, i)))
		ids = append(ids, id)
		c.Assert(m.Addresses(), gc.HasLen, 1)
	}
	st.setControllers(ids...)
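	// Only machine 10 starts as a voting replica-set member; machines 11
	// and 12 begin as non-voting members.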
	st.session.Set(mkMembers("0v 1 2", testIPv4))
	st.session.setStatus(mkStatuses("0p 1s 2s", testIPv4))
	st.setCheck(checkInvariants)

	hub := pubsub.NewStructuredHub(nil)
	event := make(chan apiserver.Details)
	_, err := hub.Subscribe(apiserver.DetailsTopic, func(topic string, data apiserver.Details, err error) {
		c.Check(err, jc.ErrorIsNil)
		event <- data
	})
	c.Assert(err, jc.ErrorIsNil)
	s.hub = hub

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
	defer workertest.CleanKill(c, w)

	expected := apiserver.Details{
		Servers: map[string]apiserver.APIServer{
			"10": {ID: "10", Addresses: []string{"0.1.2.10:5678"}, InternalAddress: "0.1.2.10:5678"},
			"11": {ID: "11", Addresses: []string{"0.1.2.11:5678"}, InternalAddress: "0.1.2.11:5678"},
			"12": {ID: "12", Addresses: []string{"0.1.2.12:5678"}, InternalAddress: "0.1.2.12:5678"},
		},
		LocalOnly: true,
	}

	select {
	case obtained := <-event:
		c.Assert(obtained, jc.DeepEquals, expected)
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}

	// And check that they can be republished on request.
	_, err = hub.Publish(apiserver.DetailsRequestTopic, apiserver.DetailsRequest{
		Requester: "dad",
		LocalOnly: true,
	})
	c.Assert(err, jc.ErrorIsNil)
	select {
	case obtained := <-event:
		c.Assert(obtained, jc.DeepEquals, expected)
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}
}

func haSpaceTestCommonSetup(c *gc.C, ipVersion TestIPVersion, members string) *fakeState {
	st := NewFakeState()
	InitState(c, st, 3, ipVersion)

	addrs := network.NewAddresses(
		fmt.Sprintf(ipVersion.formatHost, 1),
		fmt.Sprintf(ipVersion.formatHost, 2),
		fmt.Sprintf(ipVersion.formatHost, 3),
	)
	for i := range addrs {
		addrs[i].Scope = network.ScopeCloudLocal
	}

	spaces := []string{"one", "two", "three"}
	machines := []int{10, 11, 12}
	for _, id := range machines {
		machine := st.machine(strconv.Itoa(id))
		machine.SetHasVote(true)
		machine.setWantsVote(true)

		// Each machine gets 3 addresses in 3 different spaces.
		// On machine 10, the space "one" address ends with "10",
		// the space "two" address ends with "20" and the
		// space "three" address ends with "30".
		// On machine 11, the space "one" address ends with "11",
		// the space "two" address ends with "21",
		// ...
		addrs := make([]network.Address, 3)
		for i, name := range spaces {
			addr := network.NewAddressOnSpace(name, fmt.Sprintf(ipVersion.formatHost, i*10+id))
			addr.Scope = network.ScopeCloudLocal
			addrs[i] = addr
		}
		machine.setAddresses(addrs...)
	}

	st.session.Set(mkMembers(members, ipVersion))
	return st
}

func (s *workerSuite) TestUsesConfiguredHASpaceIPv4(c *gc.C) {
	s.doTestUsesConfiguredHASpace(c, testIPv4)
}

func (s *workerSuite) TestUsesConfiguredHASpaceIPv6(c *gc.C) {
	s.doTestUsesConfiguredHASpace(c, testIPv6)
}

func (s *workerSuite) doTestUsesConfiguredHASpace(c *gc.C, ipVersion TestIPVersion) {
	st := haSpaceTestCommonSetup(c, ipVersion, "0v 1v 2v")

	// Set one of the statuses to ensure it is cleared upon determination
	// of a new peer group.
	now := time.Now()
	err := st.machine("11").SetStatus(status.StatusInfo{
		Status:  status.Started,
		Message: "You said that would be bad, Egon",
		Since:   &now,
	})
	c.Assert(err, gc.IsNil)

	st.setHASpace("two")
	s.runUntilPublish(c, st, "")
	assertMemberAddresses(c, st, ipVersion.formatHost, 2)

	sInfo, err := st.machine("11").Status()
	c.Assert(err, gc.IsNil)
	c.Check(sInfo.Status, gc.Equals, status.Started)
	c.Check(sInfo.Message, gc.Equals, "")
}

// runUntilPublish runs a worker until addresses are published over the pub/sub
// hub. Note that the replica-set is updated earlier than the publish,
// so this sync can be used to check for those changes.
// If errMsg is not empty, it is used to check for a matching error.
func (s *workerSuite) runUntilPublish(c *gc.C, st *fakeState, errMsg string) {
	hub := pubsub.NewStructuredHub(nil)
	event := make(chan apiserver.Details)
	_, err := hub.Subscribe(apiserver.DetailsTopic, func(topic string, data apiserver.Details, err error) {
		c.Check(err, jc.ErrorIsNil)
		event <- data
	})
	c.Assert(err, jc.ErrorIsNil)
	s.hub = hub

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
	defer func() {
		if errMsg == "" {
			workertest.CleanKill(c, w)
		} else {
			err := workertest.CheckKill(c, w)
			c.Assert(err, gc.ErrorMatches, errMsg)
		}
	}()

	select {
	case <-event:
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}
}

func (s *workerSuite) TestDetectsAndUsesHASpaceChangeIPv4(c *gc.C) {
	s.doTestDetectsAndUsesHASpaceChange(c, testIPv4)
}

func (s *workerSuite) TestDetectsAndUsesHASpaceChangeIPv6(c *gc.C) {
	s.doTestDetectsAndUsesHASpaceChange(c, testIPv6)
}

func (s *workerSuite) doTestDetectsAndUsesHASpaceChange(c *gc.C, ipVersion TestIPVersion) {
	st := haSpaceTestCommonSetup(c, ipVersion, "0v 1v 2v")
	st.setHASpace("one")

	// Set up a hub and channel on which to receive notifications.
	hub := pubsub.NewStructuredHub(nil)
	event := make(chan apiserver.Details)
	_, err := hub.Subscribe(apiserver.DetailsTopic, func(topic string, data apiserver.Details, err error) {
		c.Check(err, jc.ErrorIsNil)
		event <- data
	})
	c.Assert(err, jc.ErrorIsNil)
	s.hub = hub

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
	defer workertest.CleanKill(c, w)

	select {
	case <-event:
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}
	assertMemberAddresses(c, st, ipVersion.formatHost, 1)

	// Changing the space does not change the API server details, so the
	// change will not be broadcast via the hub.
	// We watch the members collection, which *will* change.
	memberWatcher := st.session.members.Watch()
	mustNext(c, memberWatcher, "initial watch")

	// HA space config change should invoke the worker.
	// Replica set addresses should change to the new space.
	st.setHASpace("three")
	mustNext(c, memberWatcher, "waiting for members to be updated for space change")
	assertMemberAddresses(c, st, ipVersion.formatHost, 3)
}

func assertMemberAddresses(c *gc.C, st *fakeState, addrTemplate string, addrDesignator int) {
	members, _ := st.session.CurrentMembers()
	obtained := make([]string, 3)
	for i, m := range members {
		obtained[i] = m.Address
	}
	sort.Strings(obtained)

	expected := make([]string, 3)
	for i := 0; i < 3; i++ {
		expected[i] = net.JoinHostPort(fmt.Sprintf(addrTemplate, 10*addrDesignator+i), fmt.Sprint(mongoPort))
	}

	c.Check(obtained, gc.DeepEquals, expected)
}

func (s *workerSuite) TestErrorAndStatusForNewPeersAndNoHASpaceAndMachinesWithMultiAddrIPv4(c *gc.C) {
	s.doTestErrorAndStatusForNewPeersAndNoHASpaceAndMachinesWithMultiAddr(c, testIPv4)
}

func (s *workerSuite) TestErrorAndStatusForNewPeersAndNoHASpaceAndMachinesWithMultiAddrIPv6(c *gc.C) {
	s.doTestErrorAndStatusForNewPeersAndNoHASpaceAndMachinesWithMultiAddr(c, testIPv6)
}

func (s *workerSuite) doTestErrorAndStatusForNewPeersAndNoHASpaceAndMachinesWithMultiAddr(
	c *gc.C, ipVersion TestIPVersion,
) {
	st := haSpaceTestCommonSetup(c, ipVersion, "0v")
	err := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}).Wait()
	errMsg := `computing desired peer group: updating member addresses: ` +
		`juju-ha-space is not set and these machines have more than one usable address: 1[12], 1[12]` +
		"\nrun \"juju config juju-ha-space=<name>\" to set a space for Mongo peer communication"
	c.Check(err, gc.ErrorMatches, errMsg)

	for _, id := range []string{"11", "12"} {
		sInfo, err := st.machine(id).Status()
		c.Assert(err, gc.IsNil)
		c.Check(sInfo.Status, gc.Equals, status.Started)
		c.Check(sInfo.Message, gc.Not(gc.Equals), "")
	}
}

func (s *workerSuite) TestErrorAndStatusForHASpaceWithNoAddressesAddrIPv4(c *gc.C) {
	s.doTestErrorAndStatusForHASpaceWithNoAddresses(c, testIPv4)
}

func (s *workerSuite) TestErrorAndStatusForHASpaceWithNoAddressesAddrIPv6(c *gc.C) {
	s.doTestErrorAndStatusForHASpaceWithNoAddresses(c, testIPv6)
}

func (s *workerSuite) doTestErrorAndStatusForHASpaceWithNoAddresses(
	c *gc.C, ipVersion TestIPVersion,
) {
	st := haSpaceTestCommonSetup(c, ipVersion, "0v")
	st.setHASpace("nope")

	err := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}).Wait()
	errMsg := `computing desired peer group: updating member addresses: ` +
		`no usable Mongo addresses found in configured juju-ha-space "nope" for machines: 1[012], 1[012], 1[012]`
	c.Check(err, gc.ErrorMatches, errMsg)

	for _, id := range []string{"10", "11", "12"} {
		sInfo, err := st.machine(id).Status()
		c.Assert(err, gc.IsNil)
		c.Check(sInfo.Status, gc.Equals, status.Started)
		c.Check(sInfo.Message, gc.Not(gc.Equals), "")
	}
}

func (s *workerSuite) TestSamePeersAndNoHASpaceAndMachinesWithMultiAddrIPv4(c *gc.C) {
	s.doTestSamePeersAndNoHASpaceAndMachinesWithMultiAddr(c, testIPv4)
}

func (s *workerSuite) TestSamePeersAndNoHASpaceAndMachinesWithMultiAddrIPv6(c *gc.C) {
	s.doTestSamePeersAndNoHASpaceAndMachinesWithMultiAddr(c, testIPv6)
}

func (s *workerSuite) doTestSamePeersAndNoHASpaceAndMachinesWithMultiAddr(c *gc.C, ipVersion TestIPVersion) {
	st := haSpaceTestCommonSetup(c, ipVersion, "0v 1v 2v")
	s.runUntilPublish(c, st, "")
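	// With no HA space set and an unchanged peer group, the members
	// should keep their existing addresses (ending in 10-12).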
	assertMemberAddresses(c, st, ipVersion.formatHost, 1)
}

func (s *workerSuite) TestWorkerRetriesOnSetAPIHostPortsErrorIPv4(c *gc.C) {
	s.doTestWorkerRetriesOnSetAPIHostPortsError(c, testIPv4)
}

func (s *workerSuite) TestWorkerRetriesOnSetAPIHostPortsErrorIPv6(c *gc.C) {
	s.doTestWorkerRetriesOnSetAPIHostPortsError(c, testIPv6)
}

func (s *workerSuite) doTestWorkerRetriesOnSetAPIHostPortsError(c *gc.C, ipVersion TestIPVersion) {
	logger.SetLogLevel(loggo.TRACE)

	publishCh := make(chan [][]network.HostPort, 10)
	failedOnce := false
	publish := func(apiServers [][]network.HostPort) error {
		if !failedOnce {
			failedOnce = true
			return fmt.Errorf("publish error")
		}
		publishCh <- apiServers
		return nil
	}
	st := NewFakeState()
	InitState(c, st, 3, ipVersion)

	w := s.newWorker(c, st, st.session, SetAPIHostPortsFunc(publish))
	defer workertest.CleanKill(c, w)

	retryInterval := initialRetryInterval
	s.clock.WaitAdvance(retryInterval, coretesting.ShortWait, 1)
	select {
	case servers := <-publishCh:
		AssertAPIHostPorts(c, servers, ExpectedAPIHostPorts(3, ipVersion))
		break
	case <-time.After(coretesting.ShortWait):
		c.Fatal("APIHostPorts were not published")
	}
	// There isn't any point checking for additional publish calls, as we
	// are also racing against the config-changed event, which will also
	// call SetAPIHostPorts, and we may not get that extra call.
}

func (s *workerSuite) initialize3Voters(c *gc.C) (*fakeState, worker.Worker, *voyeur.Watcher) {
	st := NewFakeState()
	InitState(c, st, 1, testIPv4)
	st.machine("10").SetHasVote(true)
	st.session.setStatus(mkStatuses("0p", testIPv4))

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{})
	defer func() {
		if r := recover(); r != nil {
			// we aren't exiting cleanly, so kill the worker
			workertest.CleanKill(c, w)
			// but let the stack trace continue
			panic(r)
		}
	}()

	memberWatcher := st.session.members.Watch()
	mustNext(c, memberWatcher, "init")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v", testIPv4))
	// Now that machine 10 has come up successfully, bring in the next two.
	for i := 11; i < 13; i++ {
		id := fmt.Sprint(i)
		m := st.addMachine(id, true)
		m.setAddresses(network.NewAddress(fmt.Sprintf(testIPv4.formatHost, i)))
		c.Check(m.Addresses(), gc.HasLen, 1)
	}
	// Now that we've added two more, flag them as started and mark them as participating.
	st.session.setStatus(mkStatuses("0p 1 2", testIPv4))
	st.setControllers("10", "11", "12")
	mustNext(c, memberWatcher, "nonvoting members")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1 2", testIPv4))
	st.session.setStatus(mkStatuses("0p 1s 2s", testIPv4))
	c.Assert(s.clock.WaitAdvance(pollInterval, time.Second, 1), jc.ErrorIsNil)
	mustNext(c, memberWatcher, "status ok")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1v 2v", testIPv4))
	st.machine("11").SetHasVote(true)
	st.machine("12").SetHasVote(true)
	return st, w, memberWatcher
}

func (s *workerSuite) TestDyingMachinesAreRemoved(c *gc.C) {
	st, w, memberWatcher := s.initialize3Voters(c)
	defer workertest.CleanKill(c, w)
	// Now we have gotten to a prepared replicaset.

	// When we advance the lifecycle (aka machine.Destroy()), we should notice that the machine no longer wants a vote.
	// machine.Destroy() advances the machine to Dying and calls SetWantsVote(false).
	st.machine("11").advanceLifecycle(state.Dying, false)
	// We should notice that we want to remove the vote first.
	mustNext(c, memberWatcher, "removing vote")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1 2", testIPv4))
	// And once it no longer has the vote and we see the machine is Dying, we should remove it.
	mustNext(c, memberWatcher, "remove dying machine")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 2", testIPv4))

	// Now machine 12 no longer has the vote, but if we flag it as dying,
	// then it should also be progressed to dead.
	st.machine("12").advanceLifecycle(state.Dying, false)
	mustNext(c, memberWatcher, "removing dying machine")
	assertMembers(c, memberWatcher.Value(), mkMembers("0v", testIPv4))
}

func (s *workerSuite) TestRemovePrimaryValidSecondaries(c *gc.C) {
	st, w, memberWatcher := s.initialize3Voters(c)
	defer workertest.CleanKill(c, w)
	statusWatcher := st.session.status.Watch()
	status := mustNextStatus(c, statusWatcher, "init")
	c.Check(status.Members, gc.DeepEquals, mkStatuses("0p 1s 2s", testIPv4))
	primaryMemberIndex := 0

	st.machine("10").setWantsVote(false)
	// We should notice that the primary no longer wants the vote and call StepDownPrimary,
	// which should ultimately cause a change in the Status.
	status = mustNextStatus(c, statusWatcher, "stepping down primary")
	// Find out which one is primary; it should be only one of 1 or 2.
	c.Assert(status.Members, gc.HasLen, 3)
	c.Check(status.Members[0].State, gc.Equals, replicaset.MemberState(replicaset.SecondaryState))
	if status.Members[1].State == replicaset.PrimaryState {
		primaryMemberIndex = 1
		c.Check(status.Members[2].State, gc.Equals, replicaset.MemberState(replicaset.SecondaryState))
	} else {
		primaryMemberIndex = 2
		c.Check(status.Members[2].State, gc.Equals, replicaset.MemberState(replicaset.PrimaryState))
	}
	// Now we have to wait for time to advance for us to reevaluate the system.
	c.Assert(s.clock.WaitAdvance(2*pollInterval, coretesting.ShortWait, 2), jc.ErrorIsNil)
	mustNext(c, memberWatcher, "reevaluating member post-step-down")
	// We should now have switched the vote over to whoever became the primary.
	if primaryMemberIndex == 1 {
		assertMembers(c, memberWatcher.Value(), mkMembers("0 1v 2", testIPv4))
	} else {
		assertMembers(c, memberWatcher.Value(), mkMembers("0 1 2v", testIPv4))
	}
	// Now we ask the primary to step down again, and we should first reconfigure the group to include
	// the other secondary. We first unset the invariant checker, because we are intentionally going to an
	// even number of voters, which is not the normal condition.
	st.setCheck(nil)
	if primaryMemberIndex == 1 {
		st.machine("11").setWantsVote(false)
	} else {
		st.machine("12").setWantsVote(false)
	}
	// The member watcher must fire first.
	mustNext(c, memberWatcher, "observing member step down")
	assertMembers(c, memberWatcher.Value(), mkMembers("0 1v 2v", testIPv4))
	// As part of stepping down the only primary, we re-enable the vote for the other secondary; then we can
	// call StepDownPrimary and remove the old primary's vote.
	// Now we time out so that the system will notice we really do still want to step down the primary, and
	// ask it to revote.
	c.Assert(s.clock.WaitAdvance(2*pollInterval, coretesting.ShortWait, 1), jc.ErrorIsNil)
	status = mustNextStatus(c, statusWatcher, "stepping down new primary")
	if primaryMemberIndex == 1 {
		// 11 was the primary, now 12 is.
		c.Check(status.Members[1].State, gc.Equals, replicaset.MemberState(replicaset.SecondaryState))
		c.Check(status.Members[2].State, gc.Equals, replicaset.MemberState(replicaset.PrimaryState))
	} else {
		c.Check(status.Members[1].State, gc.Equals, replicaset.MemberState(replicaset.PrimaryState))
		c.Check(status.Members[2].State, gc.Equals, replicaset.MemberState(replicaset.SecondaryState))
	}
	// And then we again notice that the primary has been rescheduled and change the member votes again.
	c.Assert(s.clock.WaitAdvance(pollInterval, coretesting.ShortWait, 1), jc.ErrorIsNil)
	mustNext(c, memberWatcher, "reevaluating member post-step-down")
	if primaryMemberIndex == 1 {
		// Primary was 11, now it is 12 as the only voter.
		assertMembers(c, memberWatcher.Value(), mkMembers("0 1 2v", testIPv4))
	} else {
		// Primary was 12, now it is 11 as the only voter.
		assertMembers(c, memberWatcher.Value(), mkMembers("0 1v 2", testIPv4))
	}
}

// mustNext waits for w's value to be set and returns it.
func mustNext(c *gc.C, w *voyeur.Watcher, context string) (val interface{}) {
	type voyeurResult struct {
		ok  bool
		val interface{}
	}
	done := make(chan voyeurResult)
	go func() {
		c.Logf("mustNext %v", context)
		ok := w.Next()
		val = w.Value()
		if ok {
			members := val.([]replicaset.Member)
			val = "\n" + prettyReplicaSetMembersSlice(members)
		}
		c.Logf("mustNext %v done, ok: %v, val: %v", context, ok, val)
		done <- voyeurResult{ok, val}
	}()
	select {
	case result := <-done:
		c.Assert(result.ok, jc.IsTrue)
		return result.val
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for value to be set %v", context)
	}
	panic("unreachable")
}

func mustNextStatus(c *gc.C, w *voyeur.Watcher, context string) *replicaset.Status {
	type voyeurResult struct {
		ok  bool
		val *replicaset.Status
	}
	done := make(chan voyeurResult)
	go func() {
		c.Logf("mustNextStatus %v", context)
		var result voyeurResult
		result.ok = w.Next()
		if result.ok {
			val := w.Value()
			result.val = val.(*replicaset.Status)
		}
		c.Logf("mustNextStatus %v done, ok: %v, val: %v", context, result.ok, pretty.Sprint(result.val))
		done <- result
	}()
	select {
	case result := <-done:
		c.Assert(result.ok, jc.IsTrue)
		return result.val
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for value to be set %v", context)
	}
	panic("unreachable")
}

type nopAPIHostPortsSetter struct{}

func (nopAPIHostPortsSetter) SetAPIHostPorts(apiServers [][]network.HostPort) error {
	return nil
}

type nopHub struct{}

func (nopHub) Publish(topic string, data interface{}) (<-chan struct{}, error) {
	return nil, nil
}

func (nopHub) Subscribe(topic string, handler interface{}) (func(), error) {
	return func() {}, nil
}

func (s *workerSuite) newWorkerWithConfig(
	c *gc.C,
	config Config,
) worker.Worker {
	// We create a new clock for the worker so that we can wait on alarms even when
	// a single test exercises both IPv4 and IPv6 and therefore creates two workers.
	s.clock = testclock.NewClock(time.Now())
	config.Clock = s.clock
	w, err := New(config)
	c.Assert(err, jc.ErrorIsNil)
	s.AddCleanup(func(c *gc.C) { workertest.DirtyKill(c, w) })
	return w
}

func (s *workerSuite) newWorker(
	c *gc.C,
	st State,
	session MongoSession,
	apiHostPortsSetter APIHostPortsSetter,
) worker.Worker {
	return s.newWorkerWithConfig(c, Config{
		State:              st,
		MongoSession:       session,
		APIHostPortsSetter: apiHostPortsSetter,
		MongoPort:          mongoPort,
		APIPort:            apiPort,
		Hub:                s.hub,
	})
}