// github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/nomad/leader_test.go

package nomad

import (
	"fmt"
	"strconv"
	"testing"
	"time"

	"github.com/hashicorp/consul/sdk/testutil/retry"
	"github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-version"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestLeader_LeftServer(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS3()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}

	// Kill any non-leader server
	var peer *Server
	for _, s := range servers {
		if !s.IsLeader() {
			peer = s
			break
		}
	}
	if peer == nil {
		t.Fatalf("Should have a non-leader")
	}
	peer.Shutdown()
	name := fmt.Sprintf("%s.%s", peer.config.NodeName, peer.config.Region)

	testutil.WaitForResult(func() (bool, error) {
		for _, s := range servers {
			if s == peer {
				continue
			}

			// Force remove the non-leader (transition to left state)
			if err := s.RemoveFailedNode(name); err != nil {
				return false, err
			}

			peers, _ := s.numPeers()
			return peers == 2, fmt.Errorf("expected 2 peers, got %d", peers)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %s", err)
	})
}

func TestLeader_LeftLeader(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS3()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}

	// Kill the leader!
	var leader *Server
	for _, s := range servers {
		if s.IsLeader() {
			leader = s
			break
		}
	}
	if leader == nil {
		t.Fatalf("Should have a leader")
	}
	leader.Leave()
	leader.Shutdown()

	for _, s := range servers {
		if s == leader {
			continue
		}
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 2, fmt.Errorf("expected 2 peers, got %d", peers)
		}, func(err error) {
			t.Fatalf("should have 2 peers: %v", err)
		})
	}
}

func TestLeader_MultiBootstrap(t *testing.T) {
	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, nil)
	defer cleanupS2()
	servers := []*Server{s1, s2}
	TestJoin(t, s1, s2)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers := s.Members()
			return len(peers) == 2, nil
		}, func(err error) {
			t.Fatalf("should have 2 peers")
		})
	}

	// Ensure we don't have multiple raft peers
	for _, s := range servers {
		peers, err := s.numPeers()
		if err != nil {
			t.Fatalf("failed: %v", err)
		}
		if peers != 1 {
			t.Fatalf("should only have 1 raft peer! %v", peers)
		}
	}
}

func TestLeader_PlanQueue_Reset(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
	})
	defer cleanupS3()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)

	leader := waitForStableLeadership(t, servers)

	if !leader.planQueue.Enabled() {
		t.Fatalf("should enable plan queue")
	}

	for _, s := range servers {
		if !s.IsLeader() && s.planQueue.Enabled() {
			t.Fatalf("plan queue should not be enabled")
		}
	}

	// Kill the leader
	leader.Shutdown()
	time.Sleep(100 * time.Millisecond)

	// Wait for a new leader
	leader = nil
	testutil.WaitForResult(func() (bool, error) {
		for _, s := range servers {
			if s.IsLeader() {
				leader = s
				return true, nil
			}
		}
		return false, nil
	}, func(err error) {
		t.Fatalf("should have leader")
	})

	// Check that the new leader has the plan queue enabled
	testutil.WaitForResult(func() (bool, error) {
		return leader.planQueue.Enabled(), nil
	}, func(err error) {
		t.Fatalf("should enable plan queue")
	})
}

func TestLeader_EvalBroker_Reset(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.BootstrapExpect = 3
	})
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.BootstrapExpect = 3
	})
	defer cleanupS3()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)

	leader := waitForStableLeadership(t, servers)

	// Inject a pending eval
	req := structs.EvalUpdateRequest{
		Evals: []*structs.Evaluation{mock.Eval()},
	}
	_, _, err := leader.raftApply(structs.EvalUpdateRequestType, req)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Kill the leader
	leader.Shutdown()
	time.Sleep(100 * time.Millisecond)

	// Wait for a new leader
	leader = nil
	testutil.WaitForResult(func() (bool, error) {
		for _, s := range servers {
			if s.IsLeader() {
				leader = s
				return true, nil
			}
		}
		return false, nil
	}, func(err error) {
		t.Fatalf("should have leader")
	})

	// Check that the new leader has a pending evaluation
	testutil.WaitForResult(func() (bool, error) {
		stats := leader.evalBroker.Stats()
		return stats.TotalReady == 1, nil
	}, func(err error) {
		t.Fatalf("should have pending evaluation")
	})
}

func TestLeader_PeriodicDispatcher_Restore_Adds(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.BootstrapExpect = 3
	})
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.BootstrapExpect = 3
	})
	defer cleanupS3()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)

	leader := waitForStableLeadership(t, servers)

	// Inject a periodic job, a parameterized periodic job and a non-periodic job
	periodic := mock.PeriodicJob()
	nonPeriodic := mock.Job()
	parameterizedPeriodic := mock.PeriodicJob()
	parameterizedPeriodic.ParameterizedJob = &structs.ParameterizedJobConfig{}
	for _, job := range []*structs.Job{nonPeriodic, periodic, parameterizedPeriodic} {
		req := structs.JobRegisterRequest{
			Job: job,
			WriteRequest: structs.WriteRequest{
				Namespace: job.Namespace,
			},
		}
		_, _, err := leader.raftApply(structs.JobRegisterRequestType, req)
		if err != nil {
			t.Fatalf("err: %v", err)
		}
	}

	// Kill the leader
	leader.Shutdown()
	time.Sleep(100 * time.Millisecond)

	// Wait for a new leader
	leader = nil
	testutil.WaitForResult(func() (bool, error) {
		for _, s := range servers {
			if s.IsLeader() {
				leader = s
				return true, nil
			}
		}
		return false, nil
	}, func(err error) {
		t.Fatalf("should have leader")
	})

	tuplePeriodic := structs.NamespacedID{
		ID:        periodic.ID,
		Namespace: periodic.Namespace,
	}
	tupleNonPeriodic := structs.NamespacedID{
		ID:        nonPeriodic.ID,
		Namespace: nonPeriodic.Namespace,
	}
	tupleParameterized := structs.NamespacedID{
		ID:        parameterizedPeriodic.ID,
		Namespace: parameterizedPeriodic.Namespace,
	}

	// Check that the new leader is tracking the periodic job only
	testutil.WaitForResult(func() (bool, error) {
		leader.periodicDispatcher.l.Lock()
		defer leader.periodicDispatcher.l.Unlock()
		if _, tracked := leader.periodicDispatcher.tracked[tuplePeriodic]; !tracked {
			return false, fmt.Errorf("periodic job not tracked")
		}
		if _, tracked := leader.periodicDispatcher.tracked[tupleNonPeriodic]; tracked {
			return false, fmt.Errorf("non periodic job tracked")
		}
		if _, tracked := leader.periodicDispatcher.tracked[tupleParameterized]; tracked {
			return false, fmt.Errorf("parameterized periodic job tracked")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("%s", err)
	})
}

func TestLeader_PeriodicDispatcher_Restore_NoEvals(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// Inject a periodic job that will be triggered soon.
	launch := time.Now().Add(1 * time.Second)
	job := testPeriodicJob(launch)
	req := structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Namespace: job.Namespace,
		},
	}
	_, _, err := s1.raftApply(structs.JobRegisterRequestType, req)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Flush the periodic dispatcher, ensuring that no evals will be created.
	s1.periodicDispatcher.SetEnabled(false)

	// Get the current time to ensure the launch time is after this once we
	// restore.
	now := time.Now()

	// Sleep till after the job should have been launched.
	time.Sleep(3 * time.Second)

	// Restore the periodic dispatcher.
	s1.periodicDispatcher.SetEnabled(true)
	s1.restorePeriodicDispatcher()

	// Ensure the job is tracked.
	tuple := structs.NamespacedID{
		ID:        job.ID,
		Namespace: job.Namespace,
	}
	if _, tracked := s1.periodicDispatcher.tracked[tuple]; !tracked {
		t.Fatalf("periodic job not restored")
	}

	// Check that an eval was made.
	ws := memdb.NewWatchSet()
	last, err := s1.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
	if err != nil || last == nil {
		t.Fatalf("failed to get periodic launch time: %v", err)
	}

	if last.Launch.Before(now) {
		t.Fatalf("restorePeriodicDispatcher did not force launch: last %v; want after %v", last.Launch, now)
	}
}

func TestLeader_PeriodicDispatcher_Restore_Evals(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// Inject a periodic job that triggered once in the past, should trigger now
	// and once in the future.
	now := time.Now()
	past := now.Add(-1 * time.Second)
	future := now.Add(10 * time.Second)
	job := testPeriodicJob(past, now, future)
	req := structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Namespace: job.Namespace,
		},
	}
	_, _, err := s1.raftApply(structs.JobRegisterRequestType, req)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create an eval for the past launch.
	s1.periodicDispatcher.createEval(job, past)

	// Flush the periodic dispatcher, ensuring that no evals will be created.
	s1.periodicDispatcher.SetEnabled(false)

	// Sleep till after the job should have been launched.
	time.Sleep(3 * time.Second)

	// Restore the periodic dispatcher.
	s1.periodicDispatcher.SetEnabled(true)
	s1.restorePeriodicDispatcher()

	// Ensure the job is tracked.
	tuple := structs.NamespacedID{
		ID:        job.ID,
		Namespace: job.Namespace,
	}
	if _, tracked := s1.periodicDispatcher.tracked[tuple]; !tracked {
		t.Fatalf("periodic job not restored")
	}

	// Check that an eval was made.
	ws := memdb.NewWatchSet()
	last, err := s1.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
	if err != nil || last == nil {
		t.Fatalf("failed to get periodic launch time: %v", err)
	}
	if last.Launch == past {
		t.Fatalf("restorePeriodicDispatcher did not force launch")
	}
}

func TestLeader_PeriodicDispatch(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.EvalGCInterval = 5 * time.Millisecond
	})
	defer cleanupS1()

	// Wait for a periodic dispatch
	testutil.WaitForResult(func() (bool, error) {
		stats := s1.evalBroker.Stats()
		bySched, ok := stats.ByScheduler[structs.JobTypeCore]
		if !ok {
			return false, nil
		}
		return bySched.Ready > 0, nil
	}, func(err error) {
		t.Fatalf("should have a pending core job eval")
	})
}

func TestLeader_ReapFailedEval(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.EvalDeliveryLimit = 1
	})
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// Enqueue an evaluation that will be Nacked until it fails
	eval := mock.Eval()
	s1.evalBroker.Enqueue(eval)

	// Dequeue and Nack
	out, token, err := s1.evalBroker.Dequeue(defaultSched, time.Second)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	s1.evalBroker.Nack(out.ID, token)

	// Wait for an updated and followup evaluation
	state := s1.fsm.State()
	testutil.WaitForResult(func() (bool, error) {
		ws := memdb.NewWatchSet()
		out, err := state.EvalByID(ws, eval.ID)
		if err != nil {
			return false, err
		}
		if out == nil {
			return false, fmt.Errorf("expect original evaluation to exist")
		}
		if out.Status != structs.EvalStatusFailed {
			return false, fmt.Errorf("got status %v; want %v", out.Status, structs.EvalStatusFailed)
		}
		if out.NextEval == "" {
			return false, fmt.Errorf("got empty NextEval")
		}
		// See if there is a followup
		evals, err := state.EvalsByJob(ws, eval.Namespace, eval.JobID)
		if err != nil {
			return false, err
		}

		if l := len(evals); l != 2 {
			return false, fmt.Errorf("got %d evals, want 2", l)
		}

		for _, e := range evals {
			if e.ID == eval.ID {
				continue
			}

			if e.Status != structs.EvalStatusPending {
				return false, fmt.Errorf("follow up eval has status %v; want %v",
					e.Status, structs.EvalStatusPending)
			}

			if e.ID != out.NextEval {
				return false, fmt.Errorf("follow up eval id is %v; orig eval NextEval %v",
					e.ID, out.NextEval)
			}

			if e.Wait < s1.config.EvalFailedFollowupBaselineDelay ||
				e.Wait > s1.config.EvalFailedFollowupBaselineDelay+s1.config.EvalFailedFollowupDelayRange {
				return false, fmt.Errorf("bad wait: %v", e.Wait)
			}

			if e.TriggeredBy != structs.EvalTriggerFailedFollowUp {
				return false, fmt.Errorf("follow up eval TriggeredBy %v; want %v",
					e.TriggeredBy, structs.EvalTriggerFailedFollowUp)
			}
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestLeader_ReapDuplicateEval(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// Create a duplicate blocked eval
	eval := mock.Eval()
	eval.CreateIndex = 100
	eval2 := mock.Eval()
	eval2.JobID = eval.JobID
	eval2.CreateIndex = 102
	s1.blockedEvals.Block(eval)
	s1.blockedEvals.Block(eval2)

	// Wait for the evaluation to be marked as cancelled
	state := s1.fsm.State()
	testutil.WaitForResult(func() (bool, error) {
		ws := memdb.NewWatchSet()
		out, err := state.EvalByID(ws, eval.ID)
		if err != nil {
			return false, err
		}
		return out != nil && out.Status == structs.EvalStatusCancelled, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestLeader_revokeVaultAccessorsOnRestore(t *testing.T) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// Insert a vault accessor that should be revoked
	fsmState := s1.fsm.State()
	va := mock.VaultAccessor()
	if err := fsmState.UpsertVaultAccessor(100, []*structs.VaultAccessor{va}); err != nil {
		t.Fatalf("bad: %v", err)
	}

	// Swap the Vault client
	tvc := &TestVaultClient{}
	s1.vault = tvc

	// Do a restore
	if err := s1.revokeVaultAccessorsOnRestore(); err != nil {
		t.Fatalf("Failed to restore: %v", err)
	}

	if len(tvc.RevokedTokens) != 1 || tvc.RevokedTokens[0].Accessor != va.Accessor {
		t.Fatalf("Bad revoked accessors: %v", tvc.RevokedTokens)
	}
}

func TestLeader_revokeSITokenAccessorsOnRestore(t *testing.T) {
	t.Parallel()
	r := require.New(t)

	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	// replace consul ACLs api with a mock for tracking calls
	var consulACLsAPI mockConsulACLsAPI
	s1.consulACLs = &consulACLsAPI

	// Insert a SI token accessor that should be revoked
	fsmState := s1.fsm.State()
	accessor := mock.SITokenAccessor()
	err := fsmState.UpsertSITokenAccessors(100, []*structs.SITokenAccessor{accessor})
	r.NoError(err)

	// Do a restore
	err = s1.revokeSITokenAccessorsOnRestore()
	r.NoError(err)

	// Check the accessor was revoked
	exp := []revokeRequest{{
		accessorID: accessor.AccessorID,
		committed:  true,
	}}
	r.ElementsMatch(exp, consulACLsAPI.revokeRequests)
}

func TestLeader_ClusterID(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.Build = minClusterIDVersion.String()
	})
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	clusterID, err := s1.ClusterID()

	require.NoError(t, err)
	require.True(t, helper.IsUUID(clusterID))
}

func TestLeader_ClusterID_upgradePath(t *testing.T) {
	t.Parallel()

	before := version.Must(version.NewVersion("0.10.1")).String()
	after := minClusterIDVersion.String()

	type server struct {
		s       *Server
		cleanup func()
	}

	outdated := func() server {
		s, cleanup := TestServer(t, func(c *Config) {
			c.NumSchedulers = 0
			c.Build = before
			c.BootstrapExpect = 3
			c.Logger.SetLevel(hclog.Trace)
		})
		return server{s: s, cleanup: cleanup}
	}

	upgraded := func() server {
		s, cleanup := TestServer(t, func(c *Config) {
			c.NumSchedulers = 0
			c.Build = after
			c.BootstrapExpect = 0
			c.Logger.SetLevel(hclog.Trace)
		})
		return server{s: s, cleanup: cleanup}
	}

	servers := []server{outdated(), outdated(), outdated()}
	// fallback shutdown attempt in case testing fails
	defer servers[0].cleanup()
	defer servers[1].cleanup()
	defer servers[2].cleanup()

	upgrade := func(i int) {
		previous := servers[i]

		servers[i] = upgraded()
		TestJoin(t, servers[i].s, servers[(i+1)%3].s, servers[(i+2)%3].s)
		testutil.WaitForLeader(t, servers[i].s.RPC)

		require.NoError(t, previous.s.Leave())
		require.NoError(t, previous.s.Shutdown())
	}

	// Join the servers before doing anything
	TestJoin(t, servers[0].s, servers[1].s, servers[2].s)

	// Wait for servers to settle
	for i := 0; i < len(servers); i++ {
		testutil.WaitForLeader(t, servers[i].s.RPC)
	}

	// A check that ClusterID is not available yet
	noIDYet := func() {
		for _, s := range servers {
			retry.Run(t, func(r *retry.R) {
				if _, err := s.s.ClusterID(); err == nil {
					r.Error("expected error")
				}
			})
		}
	}

	// Replace first old server with new server
	upgrade(0)
	defer servers[0].cleanup()
	noIDYet() // ClusterID should not work yet, servers: [new, old, old]

	// Replace second old server with new server
	upgrade(1)
	defer servers[1].cleanup()
	noIDYet() // ClusterID should not work yet, servers: [new, new, old]

	// Replace third / final old server with new server
	upgrade(2)
	defer servers[2].cleanup()

	// Wait for old servers to really be gone
	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.s.numPeers()
			return peers == 3, nil
		}, func(_ error) {
			t.Fatalf("should have 3 peers")
		})
	}

	// Now we can tickle the leader into making a cluster ID
	leaderID := ""
	for _, s := range servers {
		if s.s.IsLeader() {
			id, err := s.s.ClusterID()
			require.NoError(t, err)
			leaderID = id
			break
		}
	}
	require.True(t, helper.IsUUID(leaderID))

	// Now every participating server has been upgraded, each one should be
	// able to get the cluster ID, having been plumbed all the way through.
	agreeClusterID(t, []*Server{servers[0].s, servers[1].s, servers[2].s})
}

func TestLeader_ClusterID_noUpgrade(t *testing.T) {
	t.Parallel()

	type server struct {
		s       *Server
		cleanup func()
	}

	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.Logger.SetLevel(hclog.Trace)
		c.NumSchedulers = 0
		c.Build = minClusterIDVersion.String()
		c.BootstrapExpect = 3
	})
	defer cleanupS1()
	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.Logger.SetLevel(hclog.Trace)
		c.NumSchedulers = 0
		c.Build = minClusterIDVersion.String()
		c.BootstrapExpect = 3
	})
	defer cleanupS2()
	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.Logger.SetLevel(hclog.Trace)
		c.NumSchedulers = 0
		c.Build = minClusterIDVersion.String()
		c.BootstrapExpect = 3
	})
	defer cleanupS3()

	servers := []*Server{s1, s2, s3}

	// Join the servers before doing anything
	TestJoin(t, servers[0], servers[1], servers[2])

	// Wait for servers to settle
	for i := 0; i < len(servers); i++ {
		testutil.WaitForLeader(t, servers[i].RPC)
	}

	// Each server started at the minimum version, check there should be only 1
	// cluster ID they all agree on.
	agreeClusterID(t, []*Server{servers[0], servers[1], servers[2]})
}

func agreeClusterID(t *testing.T, servers []*Server) {
	retries := &retry.Timer{Timeout: 60 * time.Second, Wait: 1 * time.Second}
	ids := make([]string, 3)
	for i, s := range servers {
		retry.RunWith(retries, t, func(r *retry.R) {
			id, err := s.ClusterID()
			if err != nil {
				r.Error(err.Error())
				return
			}
			if !helper.IsUUID(id) {
				r.Error("not a UUID")
				return
			}
			ids[i] = id
		})
	}
	require.True(t, ids[0] == ids[1] && ids[1] == ids[2], "ids[0] %s, ids[1] %s, ids[2] %s", ids[0], ids[1], ids[2])
}

func TestLeader_ReplicateACLPolicies(t *testing.T) {
	t.Parallel()

	s1, root, cleanupS1 := TestACLServer(t, func(c *Config) {
		c.Region = "region1"
		c.AuthoritativeRegion = "region1"
		c.ACLEnabled = true
	})
	defer cleanupS1()
	s2, _, cleanupS2 := TestACLServer(t, func(c *Config) {
		c.Region = "region2"
		c.AuthoritativeRegion = "region1"
		c.ACLEnabled = true
		c.ReplicationBackoff = 20 * time.Millisecond
		c.ReplicationToken = root.SecretID
	})
	defer cleanupS2()
	TestJoin(t, s1, s2)
	testutil.WaitForLeader(t, s1.RPC)
	testutil.WaitForLeader(t, s2.RPC)

	// Write a policy to the authoritative region
	p1 := mock.ACLPolicy()
	if err := s1.State().UpsertACLPolicies(100, []*structs.ACLPolicy{p1}); err != nil {
		t.Fatalf("bad: %v", err)
	}

	// Wait for the policy to replicate
	testutil.WaitForResult(func() (bool, error) {
		state := s2.State()
		out, err := state.ACLPolicyByName(nil, p1.Name)
		return out != nil, err
	}, func(err error) {
		t.Fatalf("should replicate policy")
	})
}

func TestLeader_DiffACLPolicies(t *testing.T) {
	t.Parallel()

	state := state.TestStateStore(t)

	// Populate the local state
	p1 := mock.ACLPolicy()
	p2 := mock.ACLPolicy()
	p3 := mock.ACLPolicy()
	assert.Nil(t, state.UpsertACLPolicies(100, []*structs.ACLPolicy{p1, p2, p3}))

	// Simulate a remote list
	p2Stub := p2.Stub()
	p2Stub.ModifyIndex = 50 // Ignored, same index
	p3Stub := p3.Stub()
	p3Stub.ModifyIndex = 100 // Updated, higher index
	p3Stub.Hash = []byte{0, 1, 2, 3}
	p4 := mock.ACLPolicy()
	remoteList := []*structs.ACLPolicyListStub{
		p2Stub,
		p3Stub,
		p4.Stub(),
	}
	delete, update := diffACLPolicies(state, 50, remoteList)

	// P1 does not exist on the remote side, should delete
	assert.Equal(t, []string{p1.Name}, delete)

	// P2 is un-modified - ignore. P3 modified, P4 new.
	assert.Equal(t, []string{p3.Name, p4.Name}, update)
}

func TestLeader_ReplicateACLTokens(t *testing.T) {
	t.Parallel()

	s1, root, cleanupS1 := TestACLServer(t, func(c *Config) {
		c.Region = "region1"
		c.AuthoritativeRegion = "region1"
		c.ACLEnabled = true
	})
	defer cleanupS1()
	s2, _, cleanupS2 := TestACLServer(t, func(c *Config) {
		c.Region = "region2"
		c.AuthoritativeRegion = "region1"
		c.ACLEnabled = true
		c.ReplicationBackoff = 20 * time.Millisecond
		c.ReplicationToken = root.SecretID
	})
	defer cleanupS2()
	TestJoin(t, s1, s2)
	testutil.WaitForLeader(t, s1.RPC)
	testutil.WaitForLeader(t, s2.RPC)

	// Write a token to the authoritative region
	p1 := mock.ACLToken()
	p1.Global = true
	if err := s1.State().UpsertACLTokens(100, []*structs.ACLToken{p1}); err != nil {
		t.Fatalf("bad: %v", err)
	}

	// Wait for the token to replicate
	testutil.WaitForResult(func() (bool, error) {
		state := s2.State()
		out, err := state.ACLTokenByAccessorID(nil, p1.AccessorID)
		return out != nil, err
	}, func(err error) {
		t.Fatalf("should replicate token")
	})
}

func TestLeader_DiffACLTokens(t *testing.T) {
	t.Parallel()

	state := state.TestStateStore(t)

	// Populate the local state
	p0 := mock.ACLToken()
	p1 := mock.ACLToken()
	p1.Global = true
	p2 := mock.ACLToken()
	p2.Global = true
	p3 := mock.ACLToken()
	p3.Global = true
	assert.Nil(t, state.UpsertACLTokens(100, []*structs.ACLToken{p0, p1, p2, p3}))

	// Simulate a remote list
	p2Stub := p2.Stub()
	p2Stub.ModifyIndex = 50 // Ignored, same index
	p3Stub := p3.Stub()
	p3Stub.ModifyIndex = 100 // Updated, higher index
	p3Stub.Hash = []byte{0, 1, 2, 3}
	p4 := mock.ACLToken()
	p4.Global = true
	remoteList := []*structs.ACLTokenListStub{
		p2Stub,
		p3Stub,
		p4.Stub(),
	}
	delete, update := diffACLTokens(state, 50, remoteList)

	// P0 is local and should be ignored
	// P1 does not exist on the remote side, should delete
	assert.Equal(t, []string{p1.AccessorID}, delete)

	// P2 is un-modified - ignore. P3 modified, P4 new.
	assert.Equal(t, []string{p3.AccessorID, p4.AccessorID}, update)
}

func TestLeader_UpgradeRaftVersion(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.Datacenter = "dc1"
		c.RaftConfig.ProtocolVersion = 2
	})
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = 1
	})
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = 2
	})
	defer cleanupS3()

	servers := []*Server{s1, s2, s3}

	// Try to join
	TestJoin(t, s1, s2, s3)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}

	// Kill the v1 server
	if err := s2.Leave(); err != nil {
		t.Fatal(err)
	}

	for _, s := range []*Server{s1, s3} {
		minVer, err := s.autopilot.MinRaftProtocol()
		if err != nil {
			t.Fatal(err)
		}
		if got, want := minVer, 2; got != want {
			t.Fatalf("got min raft version %d want %d", got, want)
		}
	}

	// Replace the dead server with one running raft protocol v3
	s4, cleanupS4 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.Datacenter = "dc1"
		c.RaftConfig.ProtocolVersion = 3
	})
	defer cleanupS4()
	TestJoin(t, s1, s4)
	servers[1] = s4

	// Make sure we're back to 3 total peers with the new one added via ID
	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			addrs := 0
			ids := 0
			future := s.raft.GetConfiguration()
			if err := future.Error(); err != nil {
				return false, err
			}
			for _, server := range future.Configuration().Servers {
				if string(server.ID) == string(server.Address) {
					addrs++
				} else {
					ids++
				}
			}
			if got, want := addrs, 2; got != want {
				return false, fmt.Errorf("got %d server addresses want %d", got, want)
			}
			if got, want := ids, 1; got != want {
				return false, fmt.Errorf("got %d server ids want %d", got, want)
			}

			return true, nil
		}, func(err error) {
			t.Fatal(err)
		})
	}
}

func TestLeader_Reelection(t *testing.T) {
	raftProtocols := []int{1, 2, 3}
	for _, p := range raftProtocols {
		t.Run("Leader Election - Protocol version "+strconv.Itoa(p), func(t *testing.T) {
			leaderElectionTest(t, raft.ProtocolVersion(p))
		})
	}
}

func leaderElectionTest(t *testing.T, raftProtocol raft.ProtocolVersion) {
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = raftProtocol
	})
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = raftProtocol
	})
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = raftProtocol
	})
	defer cleanupS3() // todo(shoenig) added this, should be here right??

	servers := []*Server{s1, s2, s3}

	// Try to join
	TestJoin(t, s1, s2, s3)
	testutil.WaitForLeader(t, s1.RPC)

	testutil.WaitForResult(func() (bool, error) {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			return false, err
		}

		for _, server := range future.Configuration().Servers {
			if server.Suffrage == raft.Nonvoter {
				return false, fmt.Errorf("non-voter %v", server)
			}
		}

		return true, nil
	}, func(err error) {
		t.Fatal(err)
	})

	var leader, nonLeader *Server
	for _, s := range servers {
		if s.IsLeader() {
			leader = s
		} else {
			nonLeader = s
		}
	}

	// Shutdown the leader
	leader.Shutdown()
	// Wait for new leader to elect
	testutil.WaitForLeader(t, nonLeader.RPC)
}

func TestLeader_RollRaftServer(t *testing.T) {
	t.Parallel()

	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.RaftConfig.ProtocolVersion = 2
	})
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = 2
	})
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = 2
	})
	defer cleanupS3()

	servers := []*Server{s1, s2, s3}

	// Try to join
	TestJoin(t, s1, s2, s3)

	for _, s := range servers {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}

	// Kill the first v2 server
	s1.Shutdown()

	for _, s := range []*Server{s1, s3} {
		retry.Run(t, func(r *retry.R) {
			minVer, err := s.autopilot.MinRaftProtocol()
			if err != nil {
				r.Fatal(err)
			}
			if got, want := minVer, 2; got != want {
				r.Fatalf("got min raft version %d want %d", got, want)
			}
		})
	}

	// Replace the dead server with one running raft protocol v3
	s4, cleanupS4 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = 3
	})
	defer cleanupS4()
	TestJoin(t, s4, s2)
	servers[0] = s4

	// Kill the second v2 server
	s2.Shutdown()

	for _, s := range []*Server{s3, s4} {
		retry.Run(t, func(r *retry.R) {
			minVer, err := s.autopilot.MinRaftProtocol()
			if err != nil {
				r.Fatal(err)
			}
			if got, want := minVer, 2; got != want {
				r.Fatalf("got min raft version %d want %d", got, want)
			}
		})
	}
	// Replace another dead server with one running raft protocol v3
	s5, cleanupS5 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = 3
	})
	defer cleanupS5()
	TestJoin(t, s5, s4)
	servers[1] = s5

	// Kill the last v2 server, now minRaftProtocol should be 3
	s3.Shutdown()

	for _, s := range []*Server{s4, s5} {
		retry.Run(t, func(r *retry.R) {
			minVer, err := s.autopilot.MinRaftProtocol()
			if err != nil {
				r.Fatal(err)
			}
			if got, want := minVer, 3; got != want {
				r.Fatalf("got min raft version %d want %d", got, want)
			}
		})
	}

	// Replace the last dead server with one running raft protocol v3
	s6, cleanupS6 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = 3
	})
	defer cleanupS6()
	TestJoin(t, s6, s4)
	servers[2] = s6

	// Make sure all the dead servers are removed and we're back to 3 total peers
	for _, s := range servers {
		retry.Run(t, func(r *retry.R) {
			addrs := 0
			ids := 0
			future := s.raft.GetConfiguration()
			if err := future.Error(); err != nil {
				r.Fatal(err)
			}
			for _, server := range future.Configuration().Servers {
				if string(server.ID) == string(server.Address) {
					addrs++
				} else {
					ids++
				}
			}
			if got, want := addrs, 0; got != want {
				r.Fatalf("got %d server addresses want %d", got, want)
			}
			if got, want := ids, 3; got != want {
				r.Fatalf("got %d server ids want %d", got, want)
			}
		})
	}
}

func TestLeader_RevokeLeadership_MultipleTimes(t *testing.T) {
	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	testutil.WaitForResult(func() (bool, error) {
		return s1.evalBroker.Enabled(), nil
	}, func(err error) {
		t.Fatalf("should have finished establish leader loop")
	})

	require.Nil(t, s1.revokeLeadership())
	require.Nil(t, s1.revokeLeadership())
	require.Nil(t, s1.revokeLeadership())
}

func TestLeader_TransitionsUpdateConsistencyRead(t *testing.T) {
	s1, cleanupS1 := TestServer(t, nil)
	defer cleanupS1()
	testutil.WaitForLeader(t, s1.RPC)

	testutil.WaitForResult(func() (bool, error) {
		return s1.isReadyForConsistentReads(), nil
	}, func(err error) {
		require.Fail(t, "should have finished establish leader loop")
	})

	require.Nil(t, s1.revokeLeadership())
	require.False(t, s1.isReadyForConsistentReads())

	ch := make(chan struct{})
	require.Nil(t, s1.establishLeadership(ch))
	require.True(t, s1.isReadyForConsistentReads())
}

// Test doing an inplace upgrade on a server from raft protocol 2 to 3.
// This verifies that removing the server and adding it back with a uuid works
// even if the server's address stays the same.
func TestServer_ReconcileMember(t *testing.T) {
	t.Parallel()

	// Create a three node cluster
	s1, cleanupS1 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 2
		c.RaftConfig.ProtocolVersion = 3
	})
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 2
		c.RaftConfig.ProtocolVersion = 3
	})
	defer cleanupS2()

	TestJoin(t, s1, s2)
	testutil.WaitForLeader(t, s1.RPC)

	// The test relies on s3 not being the leader, so add it only after
	// leadership has been established to reduce the chance of it winning
	// the election.
	s3, cleanupS3 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 0
		c.RaftConfig.ProtocolVersion = 2
	})
	defer cleanupS3()

	TestJoin(t, s1, s3)

	// Create a memberlist object for s3, with raft protocol upgraded to 3
	upgradedS3Member := serf.Member{
		Name:   s3.config.NodeName,
		Addr:   s3.config.RPCAddr.IP,
		Status: serf.StatusAlive,
		Tags:   make(map[string]string),
	}
	upgradedS3Member.Tags["role"] = "nomad"
	upgradedS3Member.Tags["id"] = s3.config.NodeID
	upgradedS3Member.Tags["region"] = s3.config.Region
	upgradedS3Member.Tags["dc"] = s3.config.Datacenter
	upgradedS3Member.Tags["rpc_addr"] = "127.0.0.1"
	upgradedS3Member.Tags["port"] = strconv.Itoa(s3.config.RPCAddr.Port)
	upgradedS3Member.Tags["build"] = "0.8.0"
	upgradedS3Member.Tags["vsn"] = "2"
	upgradedS3Member.Tags["mvn"] = "1"
	upgradedS3Member.Tags["raft_vsn"] = "3"

	findLeader := func(t *testing.T) *Server {
		t.Helper()
		for _, s := range []*Server{s1, s2, s3} {
			if s.IsLeader() {
				t.Logf("found leader: %v %v", s.config.NodeID, s.config.RPCAddr)
				return s
			}
		}

		t.Fatalf("no leader found")
		return nil
	}

	// Find the leader so that we can call reconcile member on it
	leader := findLeader(t)
	if err := leader.reconcileMember(upgradedS3Member); err != nil {
		t.Fatalf("failed to reconcile member: %v", err)
	}

	// This should remove s3 from the config and potentially cause a leader election
	testutil.WaitForLeader(t, s1.RPC)

	// Figure out the new leader and call reconcile again, this should add s3 with the new ID format
	leader = findLeader(t)
	if err := leader.reconcileMember(upgradedS3Member); err != nil {
		t.Fatalf("failed to reconcile member: %v", err)
	}

	testutil.WaitForLeader(t, s1.RPC)
	future := s2.raft.GetConfiguration()
	if err := future.Error(); err != nil {
		t.Fatal(err)
	}
	addrs := 0
	ids := 0
	for _, server := range future.Configuration().Servers {
		if string(server.ID) == string(server.Address) {
			addrs++
		} else {
			ids++
		}
	}
	// After this, all three servers should have IDs in raft
	if got, want := addrs, 0; got != want {
		t.Fatalf("got %d server addresses want %d", got, want)
	}
	if got, want := ids, 3; got != want {
		t.Fatalf("got %d server ids want %d: %#v", got, want, future.Configuration().Servers)
	}
}

// waitForStableLeadership waits until a leader is elected and all servers are
// promoted to voting members, then returns the leader.
func waitForStableLeadership(t *testing.T, servers []*Server) *Server {
	nPeers := len(servers)

	// wait for all servers to discover each other
	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == nPeers, fmt.Errorf("should find %d peers but found %d", nPeers, peers)
		}, func(err error) {
			require.NoError(t, err)
		})
	}

	// wait for leader
	var leader *Server
	testutil.WaitForResult(func() (bool, error) {
		for _, s := range servers {
			if s.IsLeader() {
				leader = s
				return true, nil
			}
		}

		return false, fmt.Errorf("no leader found")
	}, func(err error) {
		require.NoError(t, err)
	})

	// wait for all servers to get marked as voters
	testutil.WaitForResult(func() (bool, error) {
		future := leader.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			return false, fmt.Errorf("failed to get raft config: %v", err)
		}
		ss := future.Configuration().Servers
		if len(ss) != len(servers) {
			return false, fmt.Errorf("raft doesn't contain all servers. Expected %d but found %d", len(servers), len(ss))
		}

		for _, s := range ss {
			if s.Suffrage != raft.Voter {
				return false, fmt.Errorf("configuration has non voting server: %v", s)
			}
		}

		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	return leader
}