github.com/mattyw/juju@v0.0.0-20140610034352-732aecd63861/worker/peergrouper/worker_test.go

// Copyright 2014 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package peergrouper

import (
	"errors"
	"fmt"
	"time"

	jc "github.com/juju/testing/checkers"
	"github.com/juju/utils/voyeur"
	gc "launchpad.net/gocheck"

	"github.com/juju/juju/instance"
	"github.com/juju/juju/juju/testing"
	statetesting "github.com/juju/juju/state/testing"
	coretesting "github.com/juju/juju/testing"
	"github.com/juju/juju/worker"
)

type workerJujuConnSuite struct {
	testing.JujuConnSuite
}

var _ = gc.Suite(&workerJujuConnSuite{})

func (s *workerJujuConnSuite) TestStartStop(c *gc.C) {
	w, err := New(s.State)
	c.Assert(err, gc.IsNil)
	err = worker.Stop(w)
	c.Assert(err, gc.IsNil)
}

func (s *workerJujuConnSuite) TestPublisherSetsAPIHostPorts(c *gc.C) {
	st := newFakeState()
	initState(c, st, 3)

	watcher := s.State.WatchAPIHostPorts()
	cwatch := statetesting.NewNotifyWatcherC(c, s.State, watcher)
	cwatch.AssertOneChange()

	statePublish := newPublisher(s.State)

	// Wrap the publisher so that we can call StartSync immediately
	// after the publishAPIServers method is called.
	publish := func(apiServers [][]instance.HostPort, instanceIds []instance.Id) error {
		err := statePublish.publishAPIServers(apiServers, instanceIds)
		s.State.StartSync()
		return err
	}

	w := newWorker(st, publisherFunc(publish))
	defer func() {
		c.Check(worker.Stop(w), gc.IsNil)
	}()

	cwatch.AssertOneChange()
	hps, err := s.State.APIHostPorts()
	c.Assert(err, gc.IsNil)
	assertAPIHostPorts(c, hps, expectedAPIHostPorts(3))
}

type workerSuite struct {
	coretesting.BaseSuite
}

var _ = gc.Suite(&workerSuite{})

func (s *workerSuite) SetUpTest(c *gc.C) {
	s.BaseSuite.SetUpTest(c)
	resetErrors()
}

// initState initializes the fake state with a single
// replica set member and numMachines machines
// primed to vote.
func initState(c *gc.C, st *fakeState, numMachines int) {
	var ids []string
	for i := 10; i < 10+numMachines; i++ {
		id := fmt.Sprint(i)
		m := st.addMachine(id, true)
		m.setInstanceId(instance.Id("id-" + id))
		m.setStateHostPort(fmt.Sprintf("0.1.2.%d:%d", i, mongoPort))
		ids = append(ids, id)
		c.Assert(m.MongoHostPorts(), gc.HasLen, 1)

		m.setAPIHostPorts(addressesWithPort(apiPort, fmt.Sprintf("0.1.2.%d", i)))
	}
	st.machine("10").SetHasVote(true)
	st.setStateServers(ids...)
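	// mkMembers and mkStatuses (helpers defined elsewhere in this
	// package) build replica-set fixtures from a shorthand: each
	// token names a member by index, "v" marks a voting member, and,
	// as used in these tests, "p", "s" and "H" appear to mark
	// primary, secondary and unhealthy status respectively.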
	st.session.Set(mkMembers("0v"))
	st.session.setStatus(mkStatuses("0p"))
	st.check = checkInvariants
}

// expectedAPIHostPorts returns the expected addresses
// of the machines as created by initState.
func expectedAPIHostPorts(n int) [][]instance.HostPort {
	servers := make([][]instance.HostPort, n)
	for i := range servers {
		servers[i] = []instance.HostPort{{
			Address: instance.NewAddress(fmt.Sprintf("0.1.2.%d", i+10), instance.NetworkUnknown),
			Port:    apiPort,
		}}
	}
	return servers
}

func addressesWithPort(port int, addrs ...string) []instance.HostPort {
	return instance.AddressesWithPort(instance.NewAddresses(addrs...), port)
}

func (s *workerSuite) TestSetsAndUpdatesMembers(c *gc.C) {
	s.PatchValue(&pollInterval, 5*time.Millisecond)

	st := newFakeState()
	initState(c, st, 3)

	memberWatcher := st.session.members.Watch()
	mustNext(c, memberWatcher)
	assertMembers(c, memberWatcher.Value(), mkMembers("0v"))

	logger.Infof("starting worker")
	w := newWorker(st, noPublisher{})
	defer func() {
		c.Check(worker.Stop(w), gc.IsNil)
	}()

	// Wait for the worker to set the initial members.
	mustNext(c, memberWatcher)
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1 2"))

	// Update the status of the new members
	// and check that they become voting.
	c.Logf("updating new member status")
	st.session.setStatus(mkStatuses("0p 1s 2s"))
	mustNext(c, memberWatcher)
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1v 2v"))

	// Add another machine.
	c.Logf("adding another machine")
	m13 := st.addMachine("13", false)
	m13.setStateHostPort(fmt.Sprintf("0.1.2.%d:%d", 13, mongoPort))
	st.setStateServers("10", "11", "12", "13")

	c.Logf("waiting for new member to be added")
	mustNext(c, memberWatcher)
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1v 2v 3"))

	// Remove the vote from an existing member and give it to the
	// new machine. Also set the status of the new machine to
	// healthy.
	c.Logf("removing vote from machine 10 and adding it to machine 13")
	st.machine("10").setWantsVote(false)
	st.machine("13").setWantsVote(true)

	st.session.setStatus(mkStatuses("0p 1s 2s 3s"))

	// Check that the new machine gets the vote and the
	// old machine loses it.
	c.Logf("waiting for vote switch")
	mustNext(c, memberWatcher)
	assertMembers(c, memberWatcher.Value(), mkMembers("0 1v 2v 3v"))

	// Remove the old machine.
	c.Logf("removing old machine")
	st.removeMachine("10")
	st.setStateServers("11", "12", "13")

	// Check that it's removed from the members.
	c.Logf("waiting for removal")
	mustNext(c, memberWatcher)
	assertMembers(c, memberWatcher.Value(), mkMembers("1v 2v 3v"))
}

func (s *workerSuite) TestHasVoteMaintainedEvenWhenReplicaSetFails(c *gc.C) {
	st := newFakeState()

	// Simulate a state where we have four state servers,
	// one of which has gone down and is being replaced:
	// 0 - hasvote true, wantsvote false, down
	// 1 - hasvote true, wantsvote true
	// 2 - hasvote true, wantsvote true
	// 3 - hasvote false, wantsvote true
	//
	// When it starts, the worker should move the vote from
	// 0 to 3. We'll arrange things so that it will succeed in
	// setting the membership but fail when setting HasVote
	// to false.
	initState(c, st, 4)
	st.machine("10").SetHasVote(true)
	st.machine("11").SetHasVote(true)
	st.machine("12").SetHasVote(true)
	st.machine("13").SetHasVote(false)

	st.machine("10").setWantsVote(false)
	st.machine("11").setWantsVote(true)
	st.machine("12").setWantsVote(true)
	st.machine("13").setWantsVote(true)

	st.session.Set(mkMembers("0v 1v 2v 3"))
	st.session.setStatus(mkStatuses("0H 1p 2s 3s"))

	// Make the worker fail to set HasVote to false
	// after changing the replica set membership.
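	// (setErrorFor, setErrorFuncFor and resetErrors are
	// error-injection helpers defined alongside fakeState in this
	// package; the pattern names "Type.Method args...", with "*"
	// apparently matching any argument.)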
	setErrorFor("Machine.SetHasVote * false", errors.New("frood"))

	memberWatcher := st.session.members.Watch()
	mustNext(c, memberWatcher)
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1v 2v 3"))

	w := newWorker(st, noPublisher{})
	done := make(chan error)
	go func() {
		done <- w.Wait()
	}()

	// Wait for the worker to set the initial members.
	mustNext(c, memberWatcher)
	assertMembers(c, memberWatcher.Value(), mkMembers("0 1v 2v 3v"))

	// The worker should encounter an error setting the
	// has-vote status to false and exit.
	select {
	case err := <-done:
		c.Assert(err, gc.ErrorMatches, `cannot set voting status of "[0-9]+" to false: frood`)
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for worker to exit")
	}

	// Start the worker again. Although the membership should
	// not change, the HasVote status should be updated correctly.
	resetErrors()
	w = newWorker(st, noPublisher{})

	// Watch all the machines for changes, so we can check
	// their has-vote status without polling.
	changed := make(chan struct{}, 1)
	for i := 10; i < 14; i++ {
		watcher := st.machine(fmt.Sprint(i)).val.Watch()
		defer watcher.Close()
		go func() {
			for watcher.Next() {
				select {
				case changed <- struct{}{}:
				default:
				}
			}
		}()
	}
	timeout := time.After(coretesting.LongWait)
loop:
	for {
		select {
		case <-changed:
			// Machine 10 should lose its vote;
			// all the others should gain or keep theirs.
			correct := true
			for i := 10; i < 14; i++ {
				hasVote := st.machine(fmt.Sprint(i)).HasVote()
				expectHasVote := i != 10
				if hasVote != expectHasVote {
					correct = false
				}
			}
			if correct {
				break loop
			}
		case <-timeout:
			c.Fatalf("timed out waiting for vote to be set")
		}
	}
}

func (s *workerSuite) TestAddressChange(c *gc.C) {
	st := newFakeState()
	initState(c, st, 3)

	memberWatcher := st.session.members.Watch()
	mustNext(c, memberWatcher)
	assertMembers(c, memberWatcher.Value(), mkMembers("0v"))

	logger.Infof("starting worker")
	w := newWorker(st, noPublisher{})
	defer func() {
		c.Check(worker.Stop(w), gc.IsNil)
	}()

	// Wait for the worker to set the initial members.
	mustNext(c, memberWatcher)
	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1 2"))

	// Change an address and wait for it to be changed in the
	// members.
	st.machine("11").setStateHostPort("0.1.99.99:9876")

	mustNext(c, memberWatcher)
	expectMembers := mkMembers("0v 1 2")
	expectMembers[1].Address = "0.1.99.99:9876"
	assertMembers(c, memberWatcher.Value(), expectMembers)
}

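// fatalErrorsTests enumerates injected errors that the worker
// should treat as fatal, together with the error message it is
// expected to die with.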
var fatalErrorsTests = []struct {
	errPattern string
	err        error
	expectErr  string
}{{
	errPattern: "State.StateServerInfo",
	expectErr:  "cannot get state server info: sample",
}, {
	errPattern: "Machine.SetHasVote 11 true",
	expectErr:  `cannot set voting status of "11" to true: sample`,
}, {
	errPattern: "Session.CurrentStatus",
	expectErr:  "cannot get replica set status: sample",
}, {
	errPattern: "Session.CurrentMembers",
	expectErr:  "cannot get replica set members: sample",
}, {
	errPattern: "State.Machine *",
	expectErr:  `cannot get machine "10": sample`,
}, {
	errPattern: "Machine.InstanceId *",
	expectErr:  `cannot get API server info: sample`,
}}

func (s *workerSuite) TestFatalErrors(c *gc.C) {
	s.PatchValue(&pollInterval, 5*time.Millisecond)
	for i, test := range fatalErrorsTests {
		c.Logf("test %d: %s -> %s", i, test.errPattern, test.expectErr)
		resetErrors()
		st := newFakeState()
		st.session.InstantlyReady = true
		initState(c, st, 3)
		setErrorFor(test.errPattern, errors.New("sample"))
		w := newWorker(st, noPublisher{})
		done := make(chan error)
		go func() {
			done <- w.Wait()
		}()
		select {
		case err := <-done:
			c.Assert(err, gc.ErrorMatches, test.expectErr)
		case <-time.After(coretesting.LongWait):
			c.Fatalf("timed out waiting for error")
		}
	}
}

func (s *workerSuite) TestSetMembersErrorIsNotFatal(c *gc.C) {
	st := newFakeState()
	initState(c, st, 3)
	st.session.setStatus(mkStatuses("0p 1s 2s"))
	var isSet voyeur.Value
	count := 0
	setErrorFuncFor("Session.Set", func() error {
		isSet.Set(count)
		count++
		return errors.New("sample")
	})
	s.PatchValue(&initialRetryInterval, 10*time.Microsecond)
	s.PatchValue(&maxRetryInterval, coretesting.ShortWait/4)
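	// expectedIterations counts how many retries an exponential
	// backoff starting at initialRetryInterval can make before the
	// interval grows past twice maxRetryInterval; with the patched
	// values above, that is the most Session.Set attempts we should
	// observe while sleeping for maxRetryInterval*2 below.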
	expectedIterations := 0
	for d := initialRetryInterval; d < maxRetryInterval*2; d *= 2 {
		expectedIterations++
	}

	w := newWorker(st, noPublisher{})
	defer func() {
		c.Check(worker.Stop(w), gc.IsNil)
	}()
	isSetWatcher := isSet.Watch()

	n0 := mustNext(c, isSetWatcher).(int)
	time.Sleep(maxRetryInterval * 2)
	n1 := mustNext(c, isSetWatcher).(int)

	// The worker should have backed off exponentially...
	c.Assert(n1-n0, jc.LessThan, expectedIterations+1)
	c.Logf("actual iterations %d; expected iterations %d", n1-n0, expectedIterations)

	// ... but only up to the maximum retry interval.
	n0 = mustNext(c, isSetWatcher).(int)
	time.Sleep(maxRetryInterval * 2)
	n1 = mustNext(c, isSetWatcher).(int)

	c.Assert(n1-n0, jc.LessThan, 3)
}

// publisherFunc is an adapter that allows an ordinary function to be
// used as the publisher expected by newWorker.
type publisherFunc func(apiServers [][]instance.HostPort, instanceIds []instance.Id) error

func (f publisherFunc) publishAPIServers(apiServers [][]instance.HostPort, instanceIds []instance.Id) error {
	return f(apiServers, instanceIds)
}

func (s *workerSuite) TestStateServersArePublished(c *gc.C) {
	publishCh := make(chan [][]instance.HostPort)
	publish := func(apiServers [][]instance.HostPort, instanceIds []instance.Id) error {
		publishCh <- apiServers
		return nil
	}

	st := newFakeState()
	initState(c, st, 3)
	w := newWorker(st, publisherFunc(publish))
	defer func() {
		c.Check(worker.Stop(w), gc.IsNil)
	}()
	select {
	case servers := <-publishCh:
		assertAPIHostPorts(c, servers, expectedAPIHostPorts(3))
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for publish")
	}

	// Change one of the servers' API addresses and check that it's published.
	newMachine10APIHostPorts := addressesWithPort(apiPort, "0.2.8.124")
	st.machine("10").setAPIHostPorts(newMachine10APIHostPorts)
	select {
	case servers := <-publishCh:
		expected := expectedAPIHostPorts(3)
		expected[0] = newMachine10APIHostPorts
		assertAPIHostPorts(c, servers, expected)
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for publish")
	}
}

func (s *workerSuite) TestWorkerRetriesOnPublishError(c *gc.C) {
	s.PatchValue(&pollInterval, coretesting.LongWait+time.Second)
	s.PatchValue(&initialRetryInterval, 5*time.Millisecond)
	s.PatchValue(&maxRetryInterval, initialRetryInterval)

	publishCh := make(chan [][]instance.HostPort, 100)

	count := 0
	publish := func(apiServers [][]instance.HostPort, instanceIds []instance.Id) error {
		publishCh <- apiServers
		count++
		if count <= 3 {
			return fmt.Errorf("publish error")
		}
		return nil
	}
	st := newFakeState()
	initState(c, st, 3)

	w := newWorker(st, publisherFunc(publish))
	defer func() {
		c.Check(worker.Stop(w), gc.IsNil)
	}()

	// The worker should retry until the publisher stops returning
	// an error: three failures, then one success, then no more
	// publish attempts.
	for i := 0; i < 4; i++ {
		select {
		case servers := <-publishCh:
			assertAPIHostPorts(c, servers, expectedAPIHostPorts(3))
		case <-time.After(coretesting.LongWait):
			c.Fatalf("timed out waiting for publish #%d", i)
		}
	}
	select {
	case <-publishCh:
		c.Errorf("unexpected publish event")
	case <-time.After(coretesting.ShortWait):
	}
}

func (s *workerSuite) TestWorkerPublishesInstanceIds(c *gc.C) {
	s.PatchValue(&pollInterval, coretesting.LongWait+time.Second)
	s.PatchValue(&initialRetryInterval, 5*time.Millisecond)
	s.PatchValue(&maxRetryInterval, initialRetryInterval)

	publishCh := make(chan []instance.Id, 100)

	publish := func(apiServers [][]instance.HostPort, instanceIds []instance.Id) error {
		publishCh <- instanceIds
		return nil
	}
	st := newFakeState()
	initState(c, st, 3)

	w := newWorker(st, publisherFunc(publish))
	defer func() {
		c.Check(worker.Stop(w), gc.IsNil)
	}()

	select {
	case instanceIds := <-publishCh:
		c.Assert(instanceIds, jc.SameContents, []instance.Id{"id-10", "id-11", "id-12"})
	case <-time.After(coretesting.LongWait):
		c.Errorf("timed out waiting for publish")
	}
}

// mustNext waits for w's value to be set and returns it, failing the
// test if that doesn't happen within LongWait.
func mustNext(c *gc.C, w *voyeur.Watcher) (val interface{}) {
	done := make(chan bool)
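	// Run the blocking Next call in a goroutine so that a watcher
	// that never fires causes a test failure after LongWait rather
	// than hanging the test; on timeout the goroutine is simply
	// abandoned, which is acceptable within a test process.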
	go func() {
		c.Logf("mustNext %p", w)
		ok := w.Next()
		val = w.Value()
		c.Logf("mustNext done %p, ok %v", w, ok)
		done <- ok
	}()
	select {
	case ok := <-done:
		c.Assert(ok, jc.IsTrue)
		return
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for value to be set")
	}
	panic("unreachable")
}

// noPublisher is a publisher that does nothing, for tests that
// don't exercise publishing.
type noPublisher struct{}

func (noPublisher) publishAPIServers(apiServers [][]instance.HostPort, instanceIds []instance.Id) error {
	return nil
}