// github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/leader_test.go

package nomad

import (
	"fmt"
	"strconv"
	"testing"
	"time"

	"github.com/hashicorp/consul/testutil/retry"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestLeader_LeftServer(t *testing.T) {
	s1 := TestServer(t, nil)
	defer s1.Shutdown()

	s2 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
	})
	defer s2.Shutdown()

	s3 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
	})
	defer s3.Shutdown()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}

	// Kill any server
	var peer *Server
	for _, s := range servers {
		if !s.IsLeader() {
			peer = s
			break
		}
	}
	if peer == nil {
		t.Fatalf("Should have a non-leader")
	}
	peer.Shutdown()
	name := fmt.Sprintf("%s.%s", peer.config.NodeName, peer.config.Region)

	testutil.WaitForResult(func() (bool, error) {
		for _, s := range servers {
			if s == peer {
				continue
			}

			// Force remove the non-leader (transition to left state)
			if err := s.RemoveFailedNode(name); err != nil {
				return false, err
			}

			peers, _ := s.numPeers()
			return peers == 2, fmt.Errorf("%v", peers)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %s", err)
	})
}

func TestLeader_LeftLeader(t *testing.T) {
	s1 := TestServer(t, nil)
	defer s1.Shutdown()

	s2 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
	})
	defer s2.Shutdown()

	s3 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
	})
	defer s3.Shutdown()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}

	// Kill the leader!
	var leader *Server
	for _, s := range servers {
		if s.IsLeader() {
			leader = s
			break
		}
	}
	if leader == nil {
		t.Fatalf("Should have a leader")
	}
	leader.Leave()
	leader.Shutdown()

	for _, s := range servers {
		if s == leader {
			continue
		}
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 2, fmt.Errorf("%v", peers)
		}, func(err error) {
			t.Fatalf("should have 2 peers: %v", err)
		})
	}
}

func TestLeader_MultiBootstrap(t *testing.T) {
	s1 := TestServer(t, nil)
	defer s1.Shutdown()

	s2 := TestServer(t, nil)
	defer s2.Shutdown()
	servers := []*Server{s1, s2}
	TestJoin(t, s1, s2)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers := s.Members()
			return len(peers) == 2, nil
		}, func(err error) {
			t.Fatalf("should have 2 peers")
		})
	}

	// Ensure we don't have multiple raft peers
	for _, s := range servers {
		peers, _ := s.numPeers()
		if peers != 1 {
			t.Fatalf("should only have 1 raft peer!")
		}
	}
}

func TestLeader_PlanQueue_Reset(t *testing.T) {
	s1 := TestServer(t, nil)
	defer s1.Shutdown()

	s2 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
	})
	defer s2.Shutdown()

	s3 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
	})
	defer s3.Shutdown()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}

	var leader *Server
	for _, s := range servers {
		if s.IsLeader() {
			leader = s
			break
		}
	}
	if leader == nil {
		t.Fatalf("Should have a leader")
	}

	if !leader.planQueue.Enabled() {
		t.Fatalf("should enable plan queue")
	}

	for _, s := range servers {
		if !s.IsLeader() && s.planQueue.Enabled() {
			t.Fatalf("plan queue should not be enabled")
		}
	}

	// Kill the leader
	leader.Shutdown()
	time.Sleep(100 * time.Millisecond)

	// Wait for a new leader
	leader = nil
	testutil.WaitForResult(func() (bool, error) {
		for _, s := range servers {
			if s.IsLeader() {
				leader = s
				return true, nil
			}
		}
		return false, nil
	}, func(err error) {
		t.Fatalf("should have leader")
	})

	// Check that the new leader has the plan queue enabled
	testutil.WaitForResult(func() (bool, error) {
		return leader.planQueue.Enabled(), nil
	}, func(err error) {
		t.Fatalf("should enable plan queue")
	})
}

func TestLeader_EvalBroker_Reset(t *testing.T) {
	s1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer s1.Shutdown()

	s2 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.DevDisableBootstrap = true
	})
	defer s2.Shutdown()

	s3 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.DevDisableBootstrap = true
	})
	defer s3.Shutdown()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)
	testutil.WaitForLeader(t, s1.RPC)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}

	var leader *Server
	for _, s := range servers {
		if s.IsLeader() {
			leader = s
			break
		}
	}
	if leader == nil {
		t.Fatalf("Should have a leader")
	}

	// Inject a pending eval
	req := structs.EvalUpdateRequest{
		Evals: []*structs.Evaluation{mock.Eval()},
	}
	_, _, err := leader.raftApply(structs.EvalUpdateRequestType, req)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Kill the leader
	leader.Shutdown()
	time.Sleep(100 * time.Millisecond)

	// Wait for a new leader
	leader = nil
	testutil.WaitForResult(func() (bool, error) {
		for _, s := range servers {
			if s.IsLeader() {
				leader = s
				return true, nil
			}
		}
		return false, nil
	}, func(err error) {
		t.Fatalf("should have leader")
	})

	// Check that the new leader has a pending evaluation
	testutil.WaitForResult(func() (bool, error) {
		stats := leader.evalBroker.Stats()
		return stats.TotalReady == 1, nil
	}, func(err error) {
		t.Fatalf("should have pending evaluation")
	})
}

func TestLeader_PeriodicDispatcher_Restore_Adds(t *testing.T) {
	s1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer s1.Shutdown()

	s2 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.DevDisableBootstrap = true
	})
	defer s2.Shutdown()

	s3 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.DevDisableBootstrap = true
	})
	defer s3.Shutdown()
	servers := []*Server{s1, s2, s3}
	TestJoin(t, s1, s2, s3)
	testutil.WaitForLeader(t, s1.RPC)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}

	var leader *Server
	for _, s := range servers {
		if s.IsLeader() {
			leader = s
			break
		}
	}
	if leader == nil {
		t.Fatalf("Should have a leader")
	}

	// Inject a periodic job, a parameterized periodic job and a non-periodic job
	periodic := mock.PeriodicJob()
	nonPeriodic := mock.Job()
	parameterizedPeriodic := mock.PeriodicJob()
	parameterizedPeriodic.ParameterizedJob = &structs.ParameterizedJobConfig{}
	for _, job := range []*structs.Job{nonPeriodic, periodic, parameterizedPeriodic} {
		req := structs.JobRegisterRequest{
			Job: job,
			WriteRequest: structs.WriteRequest{
				Namespace: job.Namespace,
			},
		}
		_, _, err := leader.raftApply(structs.JobRegisterRequestType, req)
		if err != nil {
			t.Fatalf("err: %v", err)
		}
	}

	// Kill the leader
	leader.Shutdown()
	time.Sleep(100 * time.Millisecond)

	// Wait for a new leader
	leader = nil
	testutil.WaitForResult(func() (bool, error) {
		for _, s := range servers {
			if s.IsLeader() {
				leader = s
				return true, nil
			}
		}
		return false, nil
	}, func(err error) {
		t.Fatalf("should have leader")
	})

	tuplePeriodic := structs.NamespacedID{
		ID:        periodic.ID,
		Namespace: periodic.Namespace,
	}
	tupleNonPeriodic := structs.NamespacedID{
		ID:        nonPeriodic.ID,
		Namespace: nonPeriodic.Namespace,
	}
	tupleParameterized := structs.NamespacedID{
		ID:        parameterizedPeriodic.ID,
		Namespace: parameterizedPeriodic.Namespace,
	}

	// Check that the new leader is tracking the periodic job only
	testutil.WaitForResult(func() (bool, error) {
		if _, tracked := leader.periodicDispatcher.tracked[tuplePeriodic]; !tracked {
			return false, fmt.Errorf("periodic job not tracked")
		}
		if _, tracked := leader.periodicDispatcher.tracked[tupleNonPeriodic]; tracked {
			return false, fmt.Errorf("non periodic job tracked")
		}
		if _, tracked := leader.periodicDispatcher.tracked[tupleParameterized]; tracked {
			return false, fmt.Errorf("parameterized periodic job tracked")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf(err.Error())
	})
}

func TestLeader_PeriodicDispatcher_Restore_NoEvals(t *testing.T) {
	s1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer s1.Shutdown()
	testutil.WaitForLeader(t, s1.RPC)

	// Inject a periodic job that will be triggered soon.
	launch := time.Now().Add(1 * time.Second)
	job := testPeriodicJob(launch)
	req := structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Namespace: job.Namespace,
		},
	}
	_, _, err := s1.raftApply(structs.JobRegisterRequestType, req)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Flush the periodic dispatcher, ensuring that no evals will be created.
	s1.periodicDispatcher.SetEnabled(false)

	// Get the current time to ensure the launch time is after this once we
	// restore.
	now := time.Now()

	// Sleep till after the job should have been launched.
	time.Sleep(3 * time.Second)

	// Restore the periodic dispatcher.
	s1.periodicDispatcher.SetEnabled(true)
	s1.restorePeriodicDispatcher()

	// Ensure the job is tracked.
	tuple := structs.NamespacedID{
		ID:        job.ID,
		Namespace: job.Namespace,
	}
	if _, tracked := s1.periodicDispatcher.tracked[tuple]; !tracked {
		t.Fatalf("periodic job not restored")
	}

	// Check that an eval was made.
	ws := memdb.NewWatchSet()
	last, err := s1.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
	if err != nil || last == nil {
		t.Fatalf("failed to get periodic launch time: %v", err)
	}

	if last.Launch.Before(now) {
		t.Fatalf("restorePeriodicDispatcher did not force launch: last %v; want after %v", last.Launch, now)
	}
}

func TestLeader_PeriodicDispatcher_Restore_Evals(t *testing.T) {
	s1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer s1.Shutdown()
	testutil.WaitForLeader(t, s1.RPC)

	// Inject a periodic job that triggered once in the past, should trigger now
	// and once in the future.
	now := time.Now()
	past := now.Add(-1 * time.Second)
	future := now.Add(10 * time.Second)
	job := testPeriodicJob(past, now, future)
	req := structs.JobRegisterRequest{
		Job: job,
		WriteRequest: structs.WriteRequest{
			Namespace: job.Namespace,
		},
	}
	_, _, err := s1.raftApply(structs.JobRegisterRequestType, req)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create an eval for the past launch.
	s1.periodicDispatcher.createEval(job, past)

	// Flush the periodic dispatcher, ensuring that no evals will be created.
	s1.periodicDispatcher.SetEnabled(false)

	// Sleep till after the job should have been launched.
	time.Sleep(3 * time.Second)

	// Restore the periodic dispatcher.
	s1.periodicDispatcher.SetEnabled(true)
	s1.restorePeriodicDispatcher()

	// Ensure the job is tracked.
	tuple := structs.NamespacedID{
		ID:        job.ID,
		Namespace: job.Namespace,
	}
	if _, tracked := s1.periodicDispatcher.tracked[tuple]; !tracked {
		t.Fatalf("periodic job not restored")
	}

	// Check that an eval was made.
	ws := memdb.NewWatchSet()
	last, err := s1.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
	if err != nil || last == nil {
		t.Fatalf("failed to get periodic launch time: %v", err)
	}
	if last.Launch == past {
		t.Fatalf("restorePeriodicDispatcher did not force launch")
	}
}

func TestLeader_PeriodicDispatch(t *testing.T) {
	s1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.EvalGCInterval = 5 * time.Millisecond
	})
	defer s1.Shutdown()

	// Wait for a periodic dispatch
	testutil.WaitForResult(func() (bool, error) {
		stats := s1.evalBroker.Stats()
		bySched, ok := stats.ByScheduler[structs.JobTypeCore]
		if !ok {
			return false, nil
		}
		return bySched.Ready > 0, nil
	}, func(err error) {
		t.Fatalf("should have a pending job")
	})
}

func TestLeader_ReapFailedEval(t *testing.T) {
	s1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
		c.EvalDeliveryLimit = 1
	})
	defer s1.Shutdown()
	testutil.WaitForLeader(t, s1.RPC)

	// Enqueue an evaluation that will be Nacked until it fails
	eval := mock.Eval()
	s1.evalBroker.Enqueue(eval)

	// Dequeue and Nack
	out, token, err := s1.evalBroker.Dequeue(defaultSched, time.Second)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	s1.evalBroker.Nack(out.ID, token)

	// Wait for an updated and followup evaluation
	state := s1.fsm.State()
	testutil.WaitForResult(func() (bool, error) {
		ws := memdb.NewWatchSet()
		out, err := state.EvalByID(ws, eval.ID)
		if err != nil {
			return false, err
		}
		if out == nil {
			return false, fmt.Errorf("expect original evaluation to exist")
		}
		if out.Status != structs.EvalStatusFailed {
			return false, fmt.Errorf("got status %v; want %v", out.Status, structs.EvalStatusFailed)
		}

		// See if there is a followup
		evals, err := state.EvalsByJob(ws, eval.Namespace, eval.JobID)
		if err != nil {
			return false, err
		}

		if l := len(evals); l != 2 {
			return false, fmt.Errorf("got %d evals, want 2", l)
		}

		for _, e := range evals {
			if e.ID == eval.ID {
				continue
			}

			if e.Status != structs.EvalStatusPending {
				return false, fmt.Errorf("follow up eval has status %v; want %v",
					e.Status, structs.EvalStatusPending)
			}

			if e.Wait < s1.config.EvalFailedFollowupBaselineDelay ||
				e.Wait > s1.config.EvalFailedFollowupBaselineDelay+s1.config.EvalFailedFollowupDelayRange {
				return false, fmt.Errorf("bad wait: %v", e.Wait)
			}

			if e.TriggeredBy != structs.EvalTriggerFailedFollowUp {
				return false, fmt.Errorf("follow up eval TriggeredBy %v; want %v",
					e.TriggeredBy, structs.EvalTriggerFailedFollowUp)
			}
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestLeader_ReapDuplicateEval(t *testing.T) {
	s1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer s1.Shutdown()
	testutil.WaitForLeader(t, s1.RPC)

	// Create a duplicate blocked eval
	eval := mock.Eval()
	eval2 := mock.Eval()
	eval2.JobID = eval.JobID
	s1.blockedEvals.Block(eval)
	s1.blockedEvals.Block(eval2)

	// Wait for the evaluation to be marked as cancelled
	state := s1.fsm.State()
	testutil.WaitForResult(func() (bool, error) {
		ws := memdb.NewWatchSet()
		out, err := state.EvalByID(ws, eval2.ID)
		if err != nil {
			return false, err
		}
		return out != nil && out.Status == structs.EvalStatusCancelled, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestLeader_RestoreVaultAccessors(t *testing.T) {
	s1 := TestServer(t, func(c *Config) {
		c.NumSchedulers = 0
	})
	defer s1.Shutdown()
	testutil.WaitForLeader(t, s1.RPC)

	// Insert a vault accessor that should be revoked
	state := s1.fsm.State()
	va := mock.VaultAccessor()
	if err := state.UpsertVaultAccessor(100, []*structs.VaultAccessor{va}); err != nil {
		t.Fatalf("bad: %v", err)
	}

	// Swap the Vault client
	tvc := &TestVaultClient{}
	s1.vault = tvc

	// Do a restore
	if err := s1.restoreRevokingAccessors(); err != nil {
		t.Fatalf("Failed to restore: %v", err)
	}

	if len(tvc.RevokedTokens) != 1 || tvc.RevokedTokens[0].Accessor != va.Accessor {
		t.Fatalf("Bad revoked accessors: %v", tvc.RevokedTokens)
	}
}

func TestLeader_ReplicateACLPolicies(t *testing.T) {
	t.Parallel()
	s1, root := TestACLServer(t, func(c *Config) {
		c.Region = "region1"
		c.AuthoritativeRegion = "region1"
		c.ACLEnabled = true
	})
	defer s1.Shutdown()
	s2, _ := TestACLServer(t, func(c *Config) {
		c.Region = "region2"
		c.AuthoritativeRegion = "region1"
		c.ACLEnabled = true
		c.ReplicationBackoff = 20 * time.Millisecond
		c.ReplicationToken = root.SecretID
	})
	defer s2.Shutdown()
	TestJoin(t, s1, s2)
	testutil.WaitForLeader(t, s1.RPC)
	testutil.WaitForLeader(t, s2.RPC)

	// Write a policy to the authoritative region
	p1 := mock.ACLPolicy()
	if err := s1.State().UpsertACLPolicies(100, []*structs.ACLPolicy{p1}); err != nil {
		t.Fatalf("bad: %v", err)
	}

	// Wait for the policy to replicate
	testutil.WaitForResult(func() (bool, error) {
		state := s2.State()
		out, err := state.ACLPolicyByName(nil, p1.Name)
		return out != nil, err
	}, func(err error) {
		t.Fatalf("should replicate policy")
	})
}

func TestLeader_DiffACLPolicies(t *testing.T) {
	t.Parallel()

	state := state.TestStateStore(t)

	// Populate the local state
	p1 := mock.ACLPolicy()
	p2 := mock.ACLPolicy()
	p3 := mock.ACLPolicy()
	assert.Nil(t, state.UpsertACLPolicies(100, []*structs.ACLPolicy{p1, p2, p3}))

	// Simulate a remote list
	p2Stub := p2.Stub()
	p2Stub.ModifyIndex = 50 // Ignored, same index
	p3Stub := p3.Stub()
	p3Stub.ModifyIndex = 100 // Updated, higher index
	p3Stub.Hash = []byte{0, 1, 2, 3}
	p4 := mock.ACLPolicy()
	remoteList := []*structs.ACLPolicyListStub{
		p2Stub,
		p3Stub,
		p4.Stub(),
	}
	delete, update := diffACLPolicies(state, 50, remoteList)

	// P1 does not exist on the remote side, should delete
	assert.Equal(t, []string{p1.Name}, delete)

	// P2 is un-modified - ignore. P3 modified, P4 new.
	assert.Equal(t, []string{p3.Name, p4.Name}, update)
}

func TestLeader_ReplicateACLTokens(t *testing.T) {
	t.Parallel()
	s1, root := TestACLServer(t, func(c *Config) {
		c.Region = "region1"
		c.AuthoritativeRegion = "region1"
		c.ACLEnabled = true
	})
	defer s1.Shutdown()
	s2, _ := TestACLServer(t, func(c *Config) {
		c.Region = "region2"
		c.AuthoritativeRegion = "region1"
		c.ACLEnabled = true
		c.ReplicationBackoff = 20 * time.Millisecond
		c.ReplicationToken = root.SecretID
	})
	defer s2.Shutdown()
	TestJoin(t, s1, s2)
	testutil.WaitForLeader(t, s1.RPC)
	testutil.WaitForLeader(t, s2.RPC)

	// Write a token to the authoritative region
	p1 := mock.ACLToken()
	p1.Global = true
	if err := s1.State().UpsertACLTokens(100, []*structs.ACLToken{p1}); err != nil {
		t.Fatalf("bad: %v", err)
	}

	// Wait for the token to replicate
	testutil.WaitForResult(func() (bool, error) {
		state := s2.State()
		out, err := state.ACLTokenByAccessorID(nil, p1.AccessorID)
		return out != nil, err
	}, func(err error) {
		t.Fatalf("should replicate token")
	})
}

func TestLeader_DiffACLTokens(t *testing.T) {
	t.Parallel()

	state := state.TestStateStore(t)

	// Populate the local state
	p0 := mock.ACLToken()
	p1 := mock.ACLToken()
	p1.Global = true
	p2 := mock.ACLToken()
	p2.Global = true
	p3 := mock.ACLToken()
	p3.Global = true
	assert.Nil(t, state.UpsertACLTokens(100, []*structs.ACLToken{p0, p1, p2, p3}))

	// Simulate a remote list
	p2Stub := p2.Stub()
	p2Stub.ModifyIndex = 50 // Ignored, same index
	p3Stub := p3.Stub()
	p3Stub.ModifyIndex = 100 // Updated, higher index
	p3Stub.Hash = []byte{0, 1, 2, 3}
	p4 := mock.ACLToken()
	p4.Global = true
	remoteList := []*structs.ACLTokenListStub{
		p2Stub,
		p3Stub,
		p4.Stub(),
	}
	delete, update := diffACLTokens(state, 50, remoteList)

	// P0 is local and should be ignored
	// P1 does not exist on the remote side, should delete
	assert.Equal(t, []string{p1.AccessorID}, delete)

	// P2 is un-modified - ignore. P3 modified, P4 new.
	assert.Equal(t, []string{p3.AccessorID, p4.AccessorID}, update)
}

func TestLeader_UpgradeRaftVersion(t *testing.T) {
	t.Parallel()
	s1 := TestServer(t, func(c *Config) {
		c.Datacenter = "dc1"
		c.RaftConfig.ProtocolVersion = 2
	})
	defer s1.Shutdown()

	s2 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
		c.RaftConfig.ProtocolVersion = 1
	})
	defer s2.Shutdown()

	s3 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
		c.RaftConfig.ProtocolVersion = 2
	})
	defer s3.Shutdown()

	servers := []*Server{s1, s2, s3}

	// Try to join
	TestJoin(t, s1, s2, s3)

	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			peers, _ := s.numPeers()
			return peers == 3, nil
		}, func(err error) {
			t.Fatalf("should have 3 peers")
		})
	}

	// Kill the v1 server
	if err := s2.Leave(); err != nil {
		t.Fatal(err)
	}

	for _, s := range []*Server{s1, s3} {
		minVer, err := s.autopilot.MinRaftProtocol()
		if err != nil {
			t.Fatal(err)
		}
		if got, want := minVer, 2; got != want {
			t.Fatalf("got min raft version %d want %d", got, want)
		}
	}

	// Replace the dead server with one running raft protocol v3
	s4 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
		c.Datacenter = "dc1"
		c.RaftConfig.ProtocolVersion = 3
	})
	defer s4.Shutdown()
	TestJoin(t, s1, s4)
	servers[1] = s4

	// Make sure we're back to 3 total peers with the new one added via ID
	for _, s := range servers {
		testutil.WaitForResult(func() (bool, error) {
			addrs := 0
			ids := 0
			future := s.raft.GetConfiguration()
			if err := future.Error(); err != nil {
				return false, err
			}
			for _, server := range future.Configuration().Servers {
				if string(server.ID) == string(server.Address) {
					addrs++
				} else {
					ids++
				}
			}
			if got, want := addrs, 2; got != want {
				return false, fmt.Errorf("got %d server addresses want %d", got, want)
			}
			if got, want := ids, 1; got != want {
				return false, fmt.Errorf("got %d server ids want %d", got, want)
			}

			return true, nil
		}, func(err error) {
			t.Fatal(err)
		})
	}
}

func TestLeader_Reelection(t *testing.T) {
	raftProtocols := []int{1, 2, 3}
	for _, p := range raftProtocols {
		t.Run("Leader Election - Protocol version "+strconv.Itoa(p), func(t *testing.T) {
			leaderElectionTest(t, raft.ProtocolVersion(p))
		})
	}
}

func leaderElectionTest(t *testing.T, raftProtocol raft.ProtocolVersion) {
	s1 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = raftProtocol
	})
	defer s1.Shutdown()

	s2 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.DevDisableBootstrap = true
		c.RaftConfig.ProtocolVersion = raftProtocol
	})
	defer s2.Shutdown()

	s3 := TestServer(t, func(c *Config) {
		c.BootstrapExpect = 3
		c.DevDisableBootstrap = true
		c.RaftConfig.ProtocolVersion = raftProtocol
	})
	defer s3.Shutdown()

	servers := []*Server{s1, s2, s3}

	// Try to join
	TestJoin(t, s1, s2, s3)
	testutil.WaitForLeader(t, s1.RPC)

	testutil.WaitForResult(func() (bool, error) {
		future := s1.raft.GetConfiguration()
		if err := future.Error(); err != nil {
			return false, err
		}

		for _, server := range future.Configuration().Servers {
			if server.Suffrage == raft.Nonvoter {
				return false, fmt.Errorf("non-voter %v", server)
			}
		}

		return true, nil
	}, func(err error) {
		t.Fatal(err)
	})

	var leader, nonLeader *Server
	for _, s := range servers {
		if s.IsLeader() {
			leader = s
		} else {
			nonLeader = s
		}
	}

	// Shutdown the leader
	leader.Shutdown()
	// Wait for a new leader to be elected
	testutil.WaitForLeader(t, nonLeader.RPC)
}

func TestLeader_RollRaftServer(t *testing.T) {
	t.Parallel()
	s1 := TestServer(t, func(c *Config) {
		c.RaftConfig.ProtocolVersion = 2
	})
	defer s1.Shutdown()

	s2 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
		c.RaftConfig.ProtocolVersion = 2
	})
	defer s2.Shutdown()

	s3 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
		c.RaftConfig.ProtocolVersion = 2
	})
	defer s3.Shutdown()

	servers := []*Server{s1, s2, s3}

	// Try to join
	TestJoin(t, s1, s2, s3)

	for _, s := range servers {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}

	// Kill the first v2 server
	s1.Shutdown()

	for _, s := range []*Server{s1, s3} {
		retry.Run(t, func(r *retry.R) {
			minVer, err := s.autopilot.MinRaftProtocol()
			if err != nil {
				r.Fatal(err)
			}
			if got, want := minVer, 2; got != want {
				r.Fatalf("got min raft version %d want %d", got, want)
			}
		})
	}

	// Replace the dead server with one running raft protocol v3
	s4 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
		c.RaftConfig.ProtocolVersion = 3
	})
	defer s4.Shutdown()
	TestJoin(t, s4, s2)
	servers[0] = s4

	// Kill the second v2 server
	s2.Shutdown()

	for _, s := range []*Server{s3, s4} {
		retry.Run(t, func(r *retry.R) {
			minVer, err := s.autopilot.MinRaftProtocol()
			if err != nil {
				r.Fatal(err)
			}
			if got, want := minVer, 2; got != want {
				r.Fatalf("got min raft version %d want %d", got, want)
			}
		})
	}
	// Replace another dead server with one running raft protocol v3
	s5 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
		c.RaftConfig.ProtocolVersion = 3
	})
	defer s5.Shutdown()
	TestJoin(t, s5, s4)
	servers[1] = s5

	// Kill the last v2 server, now minRaftProtocol should be 3
	s3.Shutdown()

	for _, s := range []*Server{s4, s5} {
		retry.Run(t, func(r *retry.R) {
			minVer, err := s.autopilot.MinRaftProtocol()
			if err != nil {
				r.Fatal(err)
			}
			if got, want := minVer, 3; got != want {
				r.Fatalf("got min raft version %d want %d", got, want)
			}
		})
	}

	// Replace the last dead server with one running raft protocol v3
	s6 := TestServer(t, func(c *Config) {
		c.DevDisableBootstrap = true
		c.RaftConfig.ProtocolVersion = 3
	})
	defer s6.Shutdown()
	TestJoin(t, s6, s4)
	servers[2] = s6

	// Make sure all the dead servers are removed and we're back to 3 total peers
	for _, s := range servers {
		retry.Run(t, func(r *retry.R) {
			addrs := 0
			ids := 0
			future := s.raft.GetConfiguration()
			if err := future.Error(); err != nil {
				r.Fatal(err)
			}
			for _, server := range future.Configuration().Servers {
				if string(server.ID) == string(server.Address) {
					addrs++
				} else {
					ids++
				}
			}
			if got, want := addrs, 0; got != want {
r.Fatalf("got %d server addresses want %d", got, want) 1094 } 1095 if got, want := ids, 3; got != want { 1096 r.Fatalf("got %d server ids want %d", got, want) 1097 } 1098 }) 1099 } 1100 } 1101 1102 func TestLeader_RevokeLeadership_MultipleTimes(t *testing.T) { 1103 s1 := TestServer(t, nil) 1104 defer s1.Shutdown() 1105 testutil.WaitForLeader(t, s1.RPC) 1106 1107 testutil.WaitForResult(func() (bool, error) { 1108 return s1.evalBroker.Enabled(), nil 1109 }, func(err error) { 1110 t.Fatalf("should have finished establish leader loop") 1111 }) 1112 1113 require.Nil(t, s1.revokeLeadership()) 1114 require.Nil(t, s1.revokeLeadership()) 1115 require.Nil(t, s1.revokeLeadership()) 1116 } 1117 1118 // Test doing an inplace upgrade on a server from raft protocol 2 to 3 1119 // This verifies that removing the server and adding it back with a uuid works 1120 // even if the server's address stays the same. 1121 func TestServer_ReconcileMember(t *testing.T) { 1122 // Create a three node cluster 1123 t.Parallel() 1124 s1 := TestServer(t, func(c *Config) { 1125 c.DevDisableBootstrap = true 1126 c.RaftConfig.ProtocolVersion = 3 1127 }) 1128 defer s1.Shutdown() 1129 1130 s2 := TestServer(t, func(c *Config) { 1131 c.DevDisableBootstrap = true 1132 c.RaftConfig.ProtocolVersion = 3 1133 }) 1134 defer s2.Shutdown() 1135 1136 s3 := TestServer(t, func(c *Config) { 1137 c.DevDisableBootstrap = true 1138 c.RaftConfig.ProtocolVersion = 2 1139 }) 1140 defer s3.Shutdown() 1141 TestJoin(t, s1, s2, s3) 1142 testutil.WaitForLeader(t, s1.RPC) 1143 1144 // Create a memberlist object for s3, with raft protocol upgraded to 3 1145 upgradedS3Member := serf.Member{ 1146 Name: s3.config.NodeName, 1147 Addr: s3.config.RPCAddr.IP, 1148 Status: serf.StatusAlive, 1149 Tags: make(map[string]string), 1150 } 1151 upgradedS3Member.Tags["role"] = "nomad" 1152 upgradedS3Member.Tags["id"] = s3.config.NodeID 1153 upgradedS3Member.Tags["region"] = s3.config.Region 1154 upgradedS3Member.Tags["dc"] = s3.config.Datacenter 1155 upgradedS3Member.Tags["rpc_addr"] = "127.0.0.1" 1156 upgradedS3Member.Tags["port"] = strconv.Itoa(s3.config.RPCAddr.Port) 1157 upgradedS3Member.Tags["build"] = "0.8.0" 1158 upgradedS3Member.Tags["vsn"] = "2" 1159 upgradedS3Member.Tags["mvn"] = "1" 1160 upgradedS3Member.Tags["raft_vsn"] = "3" 1161 1162 // Find the leader so that we can call reconcile member on it 1163 var leader *Server 1164 for _, s := range []*Server{s1, s2, s3} { 1165 if s.IsLeader() { 1166 leader = s 1167 } 1168 } 1169 leader.reconcileMember(upgradedS3Member) 1170 // This should remove s3 from the config and potentially cause a leader election 1171 testutil.WaitForLeader(t, s1.RPC) 1172 1173 // Figure out the new leader and call reconcile again, this should add s3 with the new ID format 1174 for _, s := range []*Server{s1, s2, s3} { 1175 if s.IsLeader() { 1176 leader = s 1177 } 1178 } 1179 leader.reconcileMember(upgradedS3Member) 1180 testutil.WaitForLeader(t, s1.RPC) 1181 future := s2.raft.GetConfiguration() 1182 if err := future.Error(); err != nil { 1183 t.Fatal(err) 1184 } 1185 addrs := 0 1186 ids := 0 1187 for _, server := range future.Configuration().Servers { 1188 if string(server.ID) == string(server.Address) { 1189 addrs++ 1190 } else { 1191 ids++ 1192 } 1193 } 1194 // After this, all three servers should have IDs in raft 1195 if got, want := addrs, 0; got != want { 1196 t.Fatalf("got %d server addresses want %d", got, want) 1197 } 1198 if got, want := ids, 3; got != want { 1199 t.Fatalf("got %d server ids want %d", got, want) 1200 } 1201 }