package consul

import (
	"os"
	"reflect"
	"testing"
	"time"

	"github.com/hashicorp/consul/agent/connect"
	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/testrpc"
	"github.com/hashicorp/consul/testutil/retry"
	"github.com/hashicorp/net-rpc-msgpackrpc"
	"github.com/hashicorp/serf/serf"
	"github.com/stretchr/testify/require"
)

// TestLeader_RegisterMember verifies that when a client joins the LAN, the
// leader registers it in the catalog with a passing serf health check, and
// that the server registers itself along with the "consul" service.
func TestLeader_RegisterMember(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
		c.ACLDefaultPolicy = "deny"
		c.ACLEnforceVersion8 = true
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, c1 := testClient(t)
	defer os.RemoveAll(dir2)
	defer c1.Shutdown()

	// Try to join
	joinLAN(t, c1, s1)

	testrpc.WaitForLeader(t, s1.RPC, "dc1")

	// Client should be registered
	state := s1.fsm.State()
	retry.Run(t, func(r *retry.R) {
		_, node, err := state.GetNode(c1.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if node == nil {
			r.Fatal("client not registered")
		}
	})

	// Should have a check
	_, checks, err := state.NodeChecks(nil, c1.config.NodeName)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if len(checks) != 1 {
		t.Fatalf("client missing check")
	}
	if checks[0].CheckID != structs.SerfCheckID {
		t.Fatalf("bad check: %v", checks[0])
	}
	if checks[0].Name != structs.SerfCheckName {
		t.Fatalf("bad check: %v", checks[0])
	}
	if checks[0].Status != api.HealthPassing {
		t.Fatalf("bad check: %v", checks[0])
	}

	// Server should be registered
	_, node, err := state.GetNode(s1.config.NodeName)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if node == nil {
		t.Fatalf("server not registered")
	}

	// Service should be registered
	_, services, err := state.NodeServices(nil, s1.config.NodeName)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if _, ok := services.Services["consul"]; !ok {
		t.Fatalf("consul service not registered: %v", services)
	}
}

// TestLeader_FailedMember verifies that when a joined client dies (Shutdown
// without Leave), it stays registered in the catalog but its serf health
// check eventually flips to critical.
func TestLeader_FailedMember(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
		c.ACLDefaultPolicy = "deny"
		c.ACLEnforceVersion8 = true
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, c1 := testClient(t)
	defer os.RemoveAll(dir2)
	defer c1.Shutdown()

	testrpc.WaitForLeader(t, s1.RPC, "dc1")

	// Try to join
	joinLAN(t, c1, s1)

	// Fail the member
	c1.Shutdown()

	// Should be registered
	state := s1.fsm.State()
	retry.Run(t, func(r *retry.R) {
		_, node, err := state.GetNode(c1.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if node == nil {
			r.Fatal("client not registered")
		}
	})

	// Should have a check
	_, checks, err := state.NodeChecks(nil, c1.config.NodeName)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if len(checks) != 1 {
		t.Fatalf("client missing check")
	}
	if checks[0].CheckID != structs.SerfCheckID {
		t.Fatalf("bad check: %v", checks[0])
	}
	if checks[0].Name != structs.SerfCheckName {
		t.Fatalf("bad check: %v", checks[0])
	}

	// The serf health check should go critical once the failure is detected.
	retry.Run(t, func(r *retry.R) {
		_, checks, err = state.NodeChecks(nil, c1.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if got, want := checks[0].Status, api.HealthCritical; got != want {
			r.Fatalf("got status %q want %q", got, want)
		}
	})
}

// TestLeader_LeftMember verifies that a client which gracefully leaves the
// cluster is deregistered from the catalog by the leader.
func TestLeader_LeftMember(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
		c.ACLDefaultPolicy = "deny"
		c.ACLEnforceVersion8 = true
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, c1 := testClient(t)
	defer os.RemoveAll(dir2)
	defer c1.Shutdown()

	// Try to join
	joinLAN(t, c1, s1)

	state := s1.fsm.State()

	// Should be registered
	retry.Run(t, func(r *retry.R) {
		_, node, err := state.GetNode(c1.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if node == nil {
			r.Fatal("client not registered")
		}
	})

	// Node should leave
	c1.Leave()
	c1.Shutdown()

	// Should be deregistered
	retry.Run(t, func(r *retry.R) {
		_, node, err := state.GetNode(c1.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if node != nil {
			r.Fatal("client still registered")
		}
	})
}

// TestLeader_ReapMember verifies that when a member shows up on the reconcile
// channel with StatusReap, the leader deregisters its node from the catalog.
func TestLeader_ReapMember(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
		c.ACLDefaultPolicy = "deny"
		c.ACLEnforceVersion8 = true
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, c1 := testClient(t)
	defer os.RemoveAll(dir2)
	defer c1.Shutdown()

	// Try to join
	joinLAN(t, c1, s1)

	state := s1.fsm.State()

	// Should be registered
	retry.Run(t, func(r *retry.R) {
		_, node, err := state.GetNode(c1.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if node == nil {
			r.Fatal("client not registered")
		}
	})

	// Simulate a node reaping by copying the client's serf member and
	// flipping its status to StatusReap before feeding it to the leader.
	mems := s1.LANMembers()
	var c1mem serf.Member
	for _, m := range mems {
		if m.Name == c1.config.NodeName {
			c1mem = m
			c1mem.Status = StatusReap
			break
		}
	}
	s1.reconcileCh <- c1mem

	// Should be deregistered; we have to poll quickly here because
	// anti-entropy will put it back.
	// NOTE(review): this loop has no sleep, so it busy-spins for up to 5s
	// while polling the state store — consider a short sleep per iteration.
	reaped := false
	for start := time.Now(); time.Since(start) < 5*time.Second; {
		_, node, err := state.GetNode(c1.config.NodeName)
		if err != nil {
			t.Fatalf("err: %v", err)
		}
		if node == nil {
			reaped = true
			break
		}
	}
	if !reaped {
		t.Fatalf("client should not be registered")
	}
}

// TestLeader_ReapServer verifies that reconcileReaped deregisters a server
// node that is absent from the known-members map.
func TestLeader_ReapServer(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
		c.ACLDefaultPolicy = "allow"
		c.ACLEnforceVersion8 = true
		c.Bootstrap = true
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerWithConfig(t, func(c *Config) {
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
		c.ACLDefaultPolicy = "allow"
		c.ACLEnforceVersion8 = true
		c.Bootstrap = false
	})
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerWithConfig(t, func(c *Config) {
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
		c.ACLDefaultPolicy = "allow"
		c.ACLEnforceVersion8 = true
		c.Bootstrap = false
	})
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	// Try to join
	joinLAN(t, s1, s2)
	joinLAN(t, s1, s3)

	testrpc.WaitForLeader(t, s1.RPC, "dc1")
	testrpc.WaitForLeader(t, s2.RPC, "dc1")
	testrpc.WaitForLeader(t, s3.RPC, "dc1")
	state := s1.fsm.State()

	// s3 should be registered
	retry.Run(t, func(r *retry.R) {
		_, node, err := state.GetNode(s3.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if node == nil {
			r.Fatal("client not registered")
		}
	})

	// call reconcileReaped with a map that does not contain s3
	knownMembers := make(map[string]struct{})
	knownMembers[s1.config.NodeName] = struct{}{}
	knownMembers[s2.config.NodeName] = struct{}{}

	err := s1.reconcileReaped(knownMembers)

	if err != nil {
		t.Fatalf("Unexpected error :%v", err)
	}
	// s3 should be deregistered
	retry.Run(t, func(r *retry.R) {
		_, node, err := state.GetNode(s3.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if node != nil {
			r.Fatalf("server with id %v should not be registered", s3.config.NodeID)
		}
	})

}

// TestLeader_Reconcile_ReapMember verifies that reconcile() removes a catalog
// node that is not a live serf member (registered directly via RPC here).
func TestLeader_Reconcile_ReapMember(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
		c.ACLDefaultPolicy = "deny"
		c.ACLEnforceVersion8 = true
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	testrpc.WaitForLeader(t, s1.RPC, "dc1")

	// Register a non-existing member
	dead := structs.RegisterRequest{
		Datacenter: s1.config.Datacenter,
		Node:       "no-longer-around",
		Address:    "127.1.1.1",
		Check: &structs.HealthCheck{
			Node:    "no-longer-around",
			CheckID: structs.SerfCheckID,
			Name:    structs.SerfCheckName,
			Status:  api.HealthCritical,
		},
		WriteRequest: structs.WriteRequest{
			Token: "root",
		},
	}
	var out struct{}
	if err := s1.RPC("Catalog.Register", &dead, &out); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Force a reconciliation
	if err := s1.reconcile(); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Node should be gone
	state := s1.fsm.State()
	_, node, err := state.GetNode("no-longer-around")
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if node != nil {
		t.Fatalf("client registered")
	}
}

// TestLeader_Reconcile verifies that a member which joins before a leader is
// elected still ends up registered once reconciliation runs.
func TestLeader_Reconcile(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
		c.ACLDefaultPolicy = "deny"
		c.ACLEnforceVersion8 = true
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, c1 := testClient(t)
	defer os.RemoveAll(dir2)
	defer c1.Shutdown()

	// Join before we have a leader, this should cause a reconcile!
	joinLAN(t, c1, s1)

	// Should not be registered
	state := s1.fsm.State()
	_, node, err := state.GetNode(c1.config.NodeName)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if node != nil {
		t.Fatalf("client registered")
	}

	// Should be registered
	retry.Run(t, func(r *retry.R) {
		_, node, err := state.GetNode(c1.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if node == nil {
			r.Fatal("client not registered")
		}
	})
}

// TestLeader_Reconcile_Races verifies that reconciliation does not clobber
// node metadata written through the catalog, both when flipping the serf
// health check back to passing and after the member actually fails.
func TestLeader_Reconcile_Races(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServer(t)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	testrpc.WaitForLeader(t, s1.RPC, "dc1")

	dir2, c1 := testClient(t)
	defer os.RemoveAll(dir2)
	defer c1.Shutdown()

	joinLAN(t, c1, s1)

	// Wait for the server to reconcile the client and register it.
	state := s1.fsm.State()
	var nodeAddr string
	retry.Run(t, func(r *retry.R) {
		_, node, err := state.GetNode(c1.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if node == nil {
			r.Fatal("client not registered")
		}
		nodeAddr = node.Address
	})

	// Add in some metadata via the catalog (as if the agent synced it
	// there). We also set the serfHealth check to failing so the reconcile
	// will attempt to flip it back
	req := structs.RegisterRequest{
		Datacenter: s1.config.Datacenter,
		Node:       c1.config.NodeName,
		ID:         c1.config.NodeID,
		Address:    nodeAddr,
		NodeMeta:   map[string]string{"hello": "world"},
		Check: &structs.HealthCheck{
			Node:    c1.config.NodeName,
			CheckID: structs.SerfCheckID,
			Name:    structs.SerfCheckName,
			Status:  api.HealthCritical,
			Output:  "",
		},
	}
	var out struct{}
	if err := s1.RPC("Catalog.Register", &req, &out); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Force a reconcile and make sure the metadata stuck around.
	if err := s1.reconcile(); err != nil {
		t.Fatalf("err: %v", err)
	}
	_, node, err := state.GetNode(c1.config.NodeName)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if node == nil {
		t.Fatalf("bad")
	}
	if hello, ok := node.Meta["hello"]; !ok || hello != "world" {
		t.Fatalf("bad")
	}

	// Fail the member and wait for the health to go critical.
	c1.Shutdown()
	retry.Run(t, func(r *retry.R) {
		_, checks, err := state.NodeChecks(nil, c1.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if got, want := checks[0].Status, api.HealthCritical; got != want {
			r.Fatalf("got state %q want %q", got, want)
		}
	})

	// Make sure the metadata didn't get clobbered.
	_, node, err = state.GetNode(c1.config.NodeName)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if node == nil {
		t.Fatalf("bad")
	}
	if hello, ok := node.Meta["hello"]; !ok || hello != "world" {
		t.Fatalf("bad")
	}
}

// TestLeader_LeftServer verifies that force-removing a failed non-leader
// server shrinks the raft peer set on the remaining servers.
func TestLeader_LeftServer(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServer(t)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	// Put s1 last so we don't trigger a leader election.
	servers := []*Server{s2, s3, s1}

	// Try to join
	joinLAN(t, s2, s1)
	joinLAN(t, s3, s1)
	for _, s := range servers {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}

	// Kill any server
	servers[0].Shutdown()

	// Force remove the non-leader (transition to left state)
	if err := servers[1].RemoveFailedNode(servers[0].config.NodeName); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Wait until the remaining servers show only 2 peers.
	for _, s := range servers[1:] {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 2)) })
	}
	s1.Shutdown()
}

// TestLeader_LeftLeader verifies that when the leader leaves, it drops its
// consistent-read readiness, the remaining servers converge on 2 peers, and
// the old leader's node is deregistered from the catalog.
func TestLeader_LeftLeader(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServer(t)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()
	servers := []*Server{s1, s2, s3}

	// Try to join
	joinLAN(t, s2, s1)
	joinLAN(t, s3, s1)

	for _, s := range servers {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}

	// Kill the leader!
	var leader *Server
	for _, s := range servers {
		if s.IsLeader() {
			leader = s
			break
		}
	}
	if leader == nil {
		t.Fatalf("Should have a leader")
	}
	if !leader.isReadyForConsistentReads() {
		t.Fatalf("Expected leader to be ready for consistent reads ")
	}
	leader.Leave()
	if leader.isReadyForConsistentReads() {
		t.Fatalf("Expected consistent read state to be false ")
	}
	leader.Shutdown()
	time.Sleep(100 * time.Millisecond)

	var remain *Server
	for _, s := range servers {
		if s == leader {
			continue
		}
		remain = s
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 2)) })
	}

	// Verify the old leader is deregistered
	state := remain.fsm.State()
	retry.Run(t, func(r *retry.R) {
		_, node, err := state.GetNode(leader.config.NodeName)
		if err != nil {
			r.Fatalf("err: %v", err)
		}
		if node != nil {
			r.Fatal("leader should be deregistered")
		}
	})
}

// TestLeader_MultiBootstrap verifies that two servers both in bootstrap mode
// see each other over serf but do not merge into a single raft quorum.
func TestLeader_MultiBootstrap(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServer(t)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServer(t)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	servers := []*Server{s1, s2}

	// Try to join
	joinLAN(t, s2, s1)

	for _, s := range servers {
		retry.Run(t, func(r *retry.R) {
			if got, want := len(s.serfLAN.Members()), 2; got != want {
				r.Fatalf("got %d peers want %d", got, want)
			}
		})
	}

	// Ensure we don't have multiple raft peers
	for _, s := range servers {
		peers, _ := s.numPeers()
		if peers != 1 {
			t.Fatalf("should only have 1 raft peer!")
		}
	}
}

// TestLeader_TombstoneGC_Reset verifies that the tombstone GC's pending
// expiration follows leadership: the new leader after a failover also has a
// pending expiration.
func TestLeader_TombstoneGC_Reset(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServer(t)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()
	servers := []*Server{s1, s2, s3}

	// Try to join
	joinLAN(t, s2, s1)
	joinLAN(t, s3, s1)

	for _, s := range servers {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}

	var leader *Server
	for _, s := range servers {
		if s.IsLeader() {
			leader = s
			break
		}
	}
	if leader == nil {
		t.Fatalf("Should have a leader")
	}

	// Check that the leader has a pending GC expiration
	if !leader.tombstoneGC.PendingExpiration() {
		t.Fatalf("should have pending expiration")
	}

	// Kill the leader
	leader.Shutdown()
	time.Sleep(100 * time.Millisecond)

	// Wait for a new leader
	leader = nil
	retry.Run(t, func(r *retry.R) {
		for _, s := range servers {
			if s.IsLeader() {
				leader = s
				return
			}
		}
		r.Fatal("no leader")
	})

	retry.Run(t, func(r *retry.R) {
		if !leader.tombstoneGC.PendingExpiration() {
			r.Fatal("leader has no pending GC expiration")
		}
	})
}

// TestLeader_ReapTombstones verifies that a KV delete creates exactly one
// tombstone and that the leader's GC reaps it after the configured TTL.
func TestLeader_ReapTombstones(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
		c.ACLDefaultPolicy = "deny"
		c.TombstoneTTL = 50 * time.Millisecond
		c.TombstoneTTLGranularity = 10 * time.Millisecond
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)

	testrpc.WaitForLeader(t, s1.RPC, "dc1")

	// Create a KV entry
	arg := structs.KVSRequest{
		Datacenter: "dc1",
		Op:         api.KVSet,
		DirEnt: structs.DirEntry{
			Key:   "test",
			Value: []byte("test"),
		},
		WriteRequest: structs.WriteRequest{
			Token: "root",
		},
	}
	var out bool
	if err := msgpackrpc.CallWithCodec(codec, "KVS.Apply", &arg, &out); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Delete the KV entry (tombstoned).
	arg.Op = api.KVDelete
	if err := msgpackrpc.CallWithCodec(codec, "KVS.Apply", &arg, &out); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Make sure there's a tombstone.
	state := s1.fsm.State()
	func() {
		snap := state.Snapshot()
		defer snap.Close()
		stones, err := snap.Tombstones()
		if err != nil {
			t.Fatalf("err: %s", err)
		}
		if stones.Next() == nil {
			t.Fatalf("missing tombstones")
		}
		if stones.Next() != nil {
			t.Fatalf("unexpected extra tombstones")
		}
	}()

	// Check that the new leader has a pending GC expiration by
	// watching for the tombstone to get removed.
	retry.Run(t, func(r *retry.R) {
		snap := state.Snapshot()
		defer snap.Close()
		stones, err := snap.Tombstones()
		if err != nil {
			r.Fatal(err)
		}
		if stones.Next() != nil {
			r.Fatal("should have no tombstones")
		}
	})
}

// TestLeader_RollRaftServer verifies a rolling raft protocol upgrade: after
// the raft-protocol-v1 server dies and a v3 server replaces it, the raft
// configuration ends up with two address-identified and one ID-identified
// server entry.
func TestLeader_RollRaftServer(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.Bootstrap = true
		c.Datacenter = "dc1"
		c.RaftConfig.ProtocolVersion = 2
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerWithConfig(t, func(c *Config) {
		c.Bootstrap = false
		c.Datacenter = "dc1"
		c.RaftConfig.ProtocolVersion = 1
	})
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerWithConfig(t, func(c *Config) {
		c.Bootstrap = false
		c.Datacenter = "dc1"
		c.RaftConfig.ProtocolVersion = 2
	})
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	servers := []*Server{s1, s2, s3}

	// Try to join
	joinLAN(t, s2, s1)
	joinLAN(t, s3, s1)

	for _, s := range servers {
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}

	// Kill the v1 server
	s2.Shutdown()

	for _, s := range []*Server{s1, s3} {
		retry.Run(t, func(r *retry.R) {
			minVer, err := s.autopilot.MinRaftProtocol()
			if err != nil {
				r.Fatal(err)
			}
			if got, want := minVer, 2; got != want {
				r.Fatalf("got min raft version %d want %d", got, want)
			}
		})
	}

	// Replace the dead server with one running raft protocol v3
	dir4, s4 := testServerWithConfig(t, func(c *Config) {
		c.Bootstrap = false
		c.Datacenter = "dc1"
		c.RaftConfig.ProtocolVersion = 3
	})
	defer os.RemoveAll(dir4)
	defer s4.Shutdown()
	joinLAN(t, s4, s1)
	servers[1] = s4

	// Make sure the dead server is removed and we're back to 3 total peers
	for _, s := range servers {
		retry.Run(t, func(r *retry.R) {
			addrs := 0
			ids := 0
			future := s.raft.GetConfiguration()
			if err := future.Error(); err != nil {
				r.Fatal(err)
			}
			// Pre-v3 raft entries use the address as the ID; count each
			// kind to confirm the mix of old- and new-style servers.
			for _, server := range future.Configuration().Servers {
				if string(server.ID) == string(server.Address) {
					addrs++
				} else {
					ids++
				}
			}
			if got, want := addrs, 2; got != want {
				r.Fatalf("got %d server addresses want %d", got, want)
			}
			if got, want := ids, 1; got != want {
				r.Fatalf("got %d server ids want %d", got, want)
			}
		})
	}
}

// TestLeader_ChangeServerID verifies that a new server reusing a dead
// server's address (but with a different ID) can join and the cluster
// converges back to 3 healthy raft peers.
func TestLeader_ChangeServerID(t *testing.T) {
	t.Parallel()
	conf := func(c *Config) {
		c.Bootstrap = false
		c.BootstrapExpect = 3
		c.Datacenter = "dc1"
		c.RaftConfig.ProtocolVersion = 3
	}
	dir1, s1 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()

	dir2, s2 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerWithConfig(t, conf)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	servers := []*Server{s1, s2, s3}

	// Try to join and wait for all servers to get promoted
	joinLAN(t, s2, s1)
	joinLAN(t, s3, s1)
	for _, s := range servers {
		testrpc.WaitForTestAgent(t, s.RPC, "dc1")
		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
	}

	// Shut down a server, freeing up its address/port
	s3.Shutdown()

	retry.Run(t, func(r *retry.R) {
		alive := 0
		for _, m := range s1.LANMembers() {
			if m.Status == serf.StatusAlive {
				alive++
			}
		}
		if got, want := alive, 2; got != want {
			r.Fatalf("got %d alive members want %d", got, want)
		}
	})

	// Bring up a new server with s3's address that will get a different ID
	dir4, s4 := testServerWithConfig(t, func(c *Config) {
		c.Bootstrap = false
		c.BootstrapExpect = 3
		c.Datacenter = "dc1"
		c.RaftConfig.ProtocolVersion = 3
		c.SerfLANConfig.MemberlistConfig = s3.config.SerfLANConfig.MemberlistConfig
		c.RPCAddr = s3.config.RPCAddr
		c.RPCAdvertise = s3.config.RPCAdvertise
	})
	defer os.RemoveAll(dir4)
	defer s4.Shutdown()
	joinLAN(t, s4, s1)
	testrpc.WaitForTestAgent(t, s1.RPC, "dc1")
	testrpc.WaitForTestAgent(t, s4.RPC, "dc1")
	servers[2] = s4

	// While integrating #3327 it uncovered that this test was flaky. The
	// connection pool would use the same TCP connection to the old server
	// which would give EOF errors to the autopilot health check RPC call.
	// To make this more reliable we changed the connection pool to throw
	// away the connection if it sees an EOF error, since there's no way
	// that connection is going to work again. This made this test reliable
	// since it will make a new connection to s4.

	// Make sure the dead server is removed and we're back to 3 total peers
	retry.Run(t, func(r *retry.R) {
		r.Check(wantRaft(servers))
		for _, s := range servers {
			r.Check(wantPeers(s, 3))
		}
	})
}

// TestLeader_ACL_Initialization verifies ACL bootstrapping across version and
// master-token combinations: the master token (when set) and the anonymous
// token exist, bootstrap availability matches expectations, and the global
// management policy is created.
func TestLeader_ACL_Initialization(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name      string
		build     string
		master    string
		bootstrap bool
	}{
		{"old version, no master", "0.8.0", "", true},
		{"old version, master", "0.8.0", "root", false},
		{"new version, no master", "0.9.1", "", true},
		{"new version, master", "0.9.1", "root", false},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			conf := func(c *Config) {
				c.Build = tt.build
				c.Bootstrap = true
				c.Datacenter = "dc1"
				c.ACLDatacenter = "dc1"
				c.ACLsEnabled = true
				c.ACLMasterToken = tt.master
			}
			dir1, s1 := testServerWithConfig(t, conf)
			defer os.RemoveAll(dir1)
			defer s1.Shutdown()
			testrpc.WaitForTestAgent(t, s1.RPC, "dc1")

			if tt.master != "" {
				_, master, err := s1.fsm.State().ACLTokenGetBySecret(nil, tt.master)
				require.NoError(t, err)
				require.NotNil(t, master)
			}

			_, anon, err := s1.fsm.State().ACLTokenGetBySecret(nil, anonymousToken)
			require.NoError(t, err)
			require.NotNil(t, anon)

			canBootstrap, _, err := s1.fsm.State().CanBootstrapACLToken()
			require.NoError(t, err)
			require.Equal(t, tt.bootstrap, canBootstrap)

			_, policy, err := s1.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID)
			require.NoError(t, err)
			require.NotNil(t, policy)
		})
	}
}

// TestLeader_CARootPruning verifies that after a CA key rotation the stale
// root is pruned, leaving only the new active root.
func TestLeader_CARootPruning(t *testing.T) {
	t.Parallel()

	// Shorten the package-level prune interval so the test can observe
	// pruning within a couple of seconds.
	caRootPruneInterval = 200 * time.Millisecond

	require := require.New(t)
	dir1, s1 := testServer(t)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	defer codec.Close()

	testrpc.WaitForTestAgent(t, s1.RPC, "dc1")

	// Get the current root
	rootReq := &structs.DCSpecificRequest{
		Datacenter: "dc1",
	}
	var rootList structs.IndexedCARoots
	require.Nil(msgpackrpc.CallWithCodec(codec, "ConnectCA.Roots", rootReq, &rootList))
	require.Len(rootList.Roots, 1)
	oldRoot := rootList.Roots[0]

	// Update the provider config to use a new private key, which should
	// cause a rotation.
	_, newKey, err := connect.GeneratePrivateKey()
	require.NoError(err)
	newConfig := &structs.CAConfiguration{
		Provider: "consul",
		Config: map[string]interface{}{
			"LeafCertTTL":    "500ms",
			"PrivateKey":     newKey,
			"RootCert":       "",
			"RotationPeriod": "2160h",
			"SkipValidate":   true,
		},
	}
	{
		args := &structs.CARequest{
			Datacenter: "dc1",
			Config:     newConfig,
		}
		var reply interface{}

		require.NoError(msgpackrpc.CallWithCodec(codec, "ConnectCA.ConfigurationSet", args, &reply))
	}

	// Should have 2 roots now.
	_, roots, err := s1.fsm.State().CARoots(nil)
	require.NoError(err)
	require.Len(roots, 2)

	time.Sleep(2 * time.Second)

	// Now the old root should be pruned.
	_, roots, err = s1.fsm.State().CARoots(nil)
	require.NoError(err)
	require.Len(roots, 1)
	require.True(roots[0].Active)
	require.NotEqual(roots[0].ID, oldRoot.ID)
}

// TestLeader_PersistIntermediateCAs verifies that rotated CA provider state
// (including the intermediate cert) survives a leader change.
func TestLeader_PersistIntermediateCAs(t *testing.T) {
	t.Parallel()

	require := require.New(t)
	dir1, s1 := testServer(t)
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	defer codec.Close()

	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()

	dir3, s3 := testServerDCBootstrap(t, "dc1", false)
	defer os.RemoveAll(dir3)
	defer s3.Shutdown()

	joinLAN(t, s2, s1)
	joinLAN(t, s3, s1)

	testrpc.WaitForLeader(t, s1.RPC, "dc1")

	// Get the current root
	rootReq := &structs.DCSpecificRequest{
		Datacenter: "dc1",
	}
	var rootList structs.IndexedCARoots
	require.Nil(msgpackrpc.CallWithCodec(codec, "ConnectCA.Roots", rootReq, &rootList))
	require.Len(rootList.Roots, 1)

	// Update the provider config to use a new private key, which should
	// cause a rotation.
	_, newKey, err := connect.GeneratePrivateKey()
	require.NoError(err)
	newConfig := &structs.CAConfiguration{
		Provider: "consul",
		Config: map[string]interface{}{
			"PrivateKey":     newKey,
			"RootCert":       "",
			"RotationPeriod": 90 * 24 * time.Hour,
		},
	}
	{
		args := &structs.CARequest{
			Datacenter: "dc1",
			Config:     newConfig,
		}
		var reply interface{}

		require.NoError(msgpackrpc.CallWithCodec(codec, "ConnectCA.ConfigurationSet", args, &reply))
	}

	// Get the active root before leader change.
	_, root := s1.getCAProvider()
	require.Len(root.IntermediateCerts, 1)

	// Force a leader change and make sure the root CA values are preserved.
	s1.Leave()
	s1.Shutdown()

	retry.Run(t, func(r *retry.R) {
		var leader *Server
		for _, s := range []*Server{s2, s3} {
			if s.IsLeader() {
				leader = s
				break
			}
		}
		if leader == nil {
			r.Fatal("no leader")
		}

		_, newLeaderRoot := leader.getCAProvider()
		if !reflect.DeepEqual(newLeaderRoot, root) {
			r.Fatalf("got %v, want %v", newLeaderRoot, root)
		}
	})
}

// TestLeader_ACLUpgrade verifies that legacy ACLs created via ACL.Apply are
// upgraded in place: a management ACL gains the global management policy,
// while a client ACL keeps its rules and gets no policies.
func TestLeader_ACLUpgrade(t *testing.T) {
	t.Parallel()
	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()
	testrpc.WaitForTestAgent(t, s1.RPC, "dc1")
	codec := rpcClient(t, s1)
	defer codec.Close()

	// create a legacy management ACL
	mgmt := structs.ACLRequest{
		Datacenter: "dc1",
		Op:         structs.ACLSet,
		ACL: structs.ACL{
			Name: "Management token",
			Type: structs.ACLTokenTypeManagement,
		},
		WriteRequest: structs.WriteRequest{Token: "root"},
	}
	var mgmt_id string
	require.NoError(t, msgpackrpc.CallWithCodec(codec, "ACL.Apply", &mgmt, &mgmt_id))

	// wait for it to be upgraded
	retry.Run(t, func(t *retry.R) {
		_, token, err := s1.fsm.State().ACLTokenGetBySecret(nil, mgmt_id)
		require.NoError(t, err)
		require.NotNil(t, token)
		require.NotEqual(t, "", token.AccessorID)
		require.Equal(t, structs.ACLTokenTypeManagement, token.Type)
		require.Len(t, token.Policies, 1)
		require.Equal(t, structs.ACLPolicyGlobalManagementID, token.Policies[0].ID)
	})

	// create a legacy management ACL
	client := structs.ACLRequest{
		Datacenter: "dc1",
		Op:         structs.ACLSet,
		ACL: structs.ACL{
			Name:  "Management token",
			Type:  structs.ACLTokenTypeClient,
			Rules: `node "" { policy = "read"}`,
		},
		WriteRequest: structs.WriteRequest{Token: "root"},
	}
	var client_id string
	require.NoError(t, msgpackrpc.CallWithCodec(codec, "ACL.Apply", &client, &client_id))

	// wait for it to be upgraded
	retry.Run(t, func(t *retry.R) {
		_, token, err := s1.fsm.State().ACLTokenGetBySecret(nil, client_id)
		require.NoError(t, err)
		require.NotNil(t, token)
		require.NotEqual(t, "", token.AccessorID)
		require.Len(t, token.Policies, 0)
		require.Equal(t, structs.ACLTokenTypeClient, token.Type)
		require.Equal(t, client.ACL.Rules, token.Rules)
	})
}