github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/integration/integration_test.go

package integration

import (
	"bytes"
	"context"
	"flag"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"reflect"
	"runtime"
	"strings"
	"testing"
	"time"

	"github.com/cloudflare/cfssl/helpers"
	events "github.com/docker/go-events"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/ca"
	cautils "github.com/docker/swarmkit/ca/testutils"
	"github.com/docker/swarmkit/identity"
	"github.com/docker/swarmkit/manager"
	"github.com/docker/swarmkit/node"
	"github.com/docker/swarmkit/testutils"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"github.com/stretchr/testify/require"
)

var showTrace = flag.Bool("show-trace", false, "show stack trace after tests finish")

func printTrace() {
	var (
		buf       []byte
		stackSize int
	)
	bufferLen := 16384
	for stackSize == len(buf) {
		buf = make([]byte, bufferLen)
		stackSize = runtime.Stack(buf, true)
		bufferLen *= 2
	}
	buf = buf[:stackSize]
	logrus.Error("===========================STACK TRACE===========================")
	fmt.Println(string(buf))
	logrus.Error("===========================STACK TRACE END=======================")
}

func TestMain(m *testing.M) {
	ca.RenewTLSExponentialBackoff = events.ExponentialBackoffConfig{
		Factor: time.Millisecond * 500,
		Max:    time.Minute,
	}
	flag.Parse()
	res := m.Run()
	if *showTrace {
		printTrace()
	}
	os.Exit(res)
}

// newTestCluster creates a new cluster to which nodes can be added.
// The AcceptancePolicy is set to the most permissive mode on the first manager node added.
func newTestCluster(testname string, fips bool) *testCluster {
	ctx, cancel := context.WithCancel(context.Background())
	ctx = context.WithValue(ctx, testnameKey, testname)
	c := &testCluster{
		ctx:        ctx,
		cancel:     cancel,
		nodes:      make(map[string]*testNode),
		nodesOrder: make(map[string]int),
		errs:       make(chan error, 1024),
		fips:       fips,
	}
	c.api = &dummyAPI{c: c}
	return c
}

// pollClusterReady calls the control API until all of the following conditions are true:
// * all nodes are ready
// * all managers have membership == accepted
// * all managers have reachability == reachable
// * one node is the leader
// * the number of workers and managers equals the expected counts
func pollClusterReady(t *testing.T, c *testCluster, numWorker, numManager int) {
	pollFunc := func() error {
		res, err := c.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		var mCount int
		var leaderFound bool
		for _, n := range res.Nodes {
			if n.Status.State != api.NodeStatus_READY {
				return fmt.Errorf("node %s with desired role %s isn't ready, status %s, message %s", n.ID, n.Spec.DesiredRole, n.Status.State, n.Status.Message)
			}
			if n.Spec.Membership != api.NodeMembershipAccepted {
				return fmt.Errorf("node %s with desired role %s isn't accepted to cluster, membership %s", n.ID, n.Spec.DesiredRole, n.Spec.Membership)
			}
			if n.Certificate.Role != n.Spec.DesiredRole {
				return fmt.Errorf("node %s had different roles in spec and certificate, %s and %s respectively", n.ID, n.Spec.DesiredRole, n.Certificate.Role)
			}
			if n.Certificate.Status.State != api.IssuanceStateIssued {
				return fmt.Errorf("node %s with desired role %s has no issued certificate, issuance state %s", n.ID, n.Spec.DesiredRole, n.Certificate.Status.State)
			}
			if n.Role == api.NodeRoleManager {
				if n.ManagerStatus == nil {
					return fmt.Errorf("manager node %s has no ManagerStatus field", n.ID)
				}
				if n.ManagerStatus.Reachability != api.RaftMemberStatus_REACHABLE {
					return fmt.Errorf("manager node %s is not reachable, status: %s", n.ID, n.ManagerStatus.Reachability)
				}
				mCount++
				if n.ManagerStatus.Leader {
					leaderFound = true
				}
			} else {
				if n.ManagerStatus != nil {
					return fmt.Errorf("worker node %s should not have manager status, returned %s", n.ID, n.ManagerStatus)
				}
			}
			if n.Description.TLSInfo == nil {
				return fmt.Errorf("node %s has not reported its TLS info yet", n.ID)
			}
		}
		if !leaderFound {
			return fmt.Errorf("no leader found in the cluster")
		}
		wCount := len(res.Nodes) - mCount
		if mCount != numManager {
			return fmt.Errorf("unexpected number of managers: %d, expected %d", mCount, numManager)
		}
		if wCount != numWorker {
			return fmt.Errorf("unexpected number of workers: %d, expected %d", wCount, numWorker)
		}
		return nil
	}
	err := testutils.PollFuncWithTimeout(nil, pollFunc, opsTimeout)
	require.NoError(t, err)
}

func pollServiceReady(t *testing.T, c *testCluster, sid string, replicas int) {
	pollFunc := func() error {
		req := &api.ListTasksRequest{Filters: &api.ListTasksRequest_Filters{
			ServiceIDs: []string{sid},
		}}
		res, err := c.api.ListTasks(context.Background(), req)
		require.NoError(t, err)

		if len(res.Tasks) == 0 {
			return fmt.Errorf("task list is empty")
		}
		var running int
		var states []string
		for _, task := range res.Tasks {
			if task.Status.State == api.TaskStateRunning {
				running++
			}
			states = append(states, fmt.Sprintf("[task %s: %s]", task.ID, task.Status.State))
		}
		if running != replicas {
			return fmt.Errorf("only %d running tasks, but expecting %d replicas: %s", running, replicas, strings.Join(states, ", "))
		}

		return nil
	}
	require.NoError(t, testutils.PollFuncWithTimeout(nil, pollFunc, opsTimeout))
}

func newCluster(t *testing.T, numWorker, numManager int) *testCluster {
	cl := newTestCluster(t.Name(), false)
	for i := 0; i < numManager; i++ {
		require.NoError(t, cl.AddManager(false, nil), "manager number %d", i+1)
	}
	for i := 0; i < numWorker; i++ {
		require.NoError(t, cl.AddAgent(), "agent number %d", i+1)
	}

	pollClusterReady(t, cl, numWorker, numManager)
	return cl
}

func newClusterWithRootCA(t *testing.T, numWorker, numManager int, rootCA *ca.RootCA, fips bool) *testCluster {
	cl := newTestCluster(t.Name(), fips)
	for i := 0; i < numManager; i++ {
		require.NoError(t, cl.AddManager(false, rootCA), "manager number %d", i+1)
	}
	for i := 0; i < numWorker; i++ {
		require.NoError(t, cl.AddAgent(), "agent number %d", i+1)
	}

	pollClusterReady(t, cl, numWorker, numManager)
	return cl
}

func TestClusterCreate(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 0, 2
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
}

func TestServiceCreateLateBind(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 3, 3

	cl := newTestCluster(t.Name(), false)
	for i := 0; i < numManager; i++ {
		require.NoError(t, cl.AddManager(true, nil), "manager number %d", i+1)
	}
	for i := 0; i < numWorker; i++ {
		require.NoError(t, cl.AddAgent(), "agent number %d", i+1)
	}

	defer func() {
		require.NoError(t, cl.Stop())
	}()

	sid, err := cl.CreateService("test_service", 60)
	require.NoError(t, err)
	pollServiceReady(t, cl, sid, 60)
}

func TestServiceCreate(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 3, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	sid, err := cl.CreateService("test_service", 60)
	require.NoError(t, err)
	pollServiceReady(t, cl, sid, 60)
}

func TestNodeOps(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	// demote the leader
	leader, err := cl.Leader()
	require.NoError(t, err)
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleWorker))
	// agents 2, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)

	// remove a node
	var worker *testNode
	for _, n := range cl.nodes {
		if !n.IsManager() && n.node.NodeID() != leader.node.NodeID() {
			worker = n
			break
		}
	}
	require.NoError(t, cl.RemoveNode(worker.node.NodeID(), false))
	// agents 1, managers 2
	numWorker--
	// long wait for heartbeat expiration
	pollClusterReady(t, cl, numWorker, numManager)

	// promote the old leader back
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleManager))
	numWorker--
	numManager++
	// agents 0, managers 3
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestAutolockManagers(t *testing.T) {
	t.Parallel()

	// run this twice, once with FIPS set and once without FIPS set
	for _, fips := range []bool{true, false} {
		rootCA, err := ca.CreateRootCA("rootCN")
		require.NoError(t, err)
		numWorker, numManager := 1, 1
		cl := newClusterWithRootCA(t, numWorker, numManager, &rootCA, fips)
		defer func() {
			require.NoError(t, cl.Stop())
		}()

		// check that the cluster is not locked initially
		unlockKey, err := cl.GetUnlockKey()
		require.NoError(t, err)
		require.Equal(t, "SWMKEY-1-", unlockKey)

		// lock the cluster and make sure the unlock key is not empty
		require.NoError(t, cl.AutolockManagers(true))
		unlockKey, err = cl.GetUnlockKey()
		require.NoError(t, err)
		require.NotEqual(t, "SWMKEY-1-", unlockKey)

		// rotate the unlock key
		require.NoError(t, cl.RotateUnlockKey())
		newUnlockKey, err := cl.GetUnlockKey()
		require.NoError(t, err)
		require.NotEqual(t, "SWMKEY-1-", newUnlockKey)
		require.NotEqual(t, unlockKey, newUnlockKey)

		// unlock the cluster
		require.NoError(t, cl.AutolockManagers(false))
		unlockKey, err = cl.GetUnlockKey()
		require.NoError(t, err)
		require.Equal(t, "SWMKEY-1-", unlockKey)
	}
}

func TestDemotePromote(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)
	var manager *testNode
	for _, n := range cl.nodes {
		if n.IsManager() && n.node.NodeID() != leader.node.NodeID() {
			manager = n
			break
		}
	}
	require.NoError(t, cl.SetNodeRole(manager.node.NodeID(), api.NodeRoleWorker))
	// agents 2, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)

	// promote the same node
	require.NoError(t, cl.SetNodeRole(manager.node.NodeID(), api.NodeRoleManager))
	// agents 1, managers 3
	numWorker--
	numManager++
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestPromoteDemote(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	var worker *testNode
	for _, n := range cl.nodes {
		if !n.IsManager() {
			worker = n
			break
		}
	}
	require.NoError(t, cl.SetNodeRole(worker.node.NodeID(), api.NodeRoleManager))
	// agents 0, managers 4
	numWorker--
	numManager++
	pollClusterReady(t, cl, numWorker, numManager)

	// demote the same node
	require.NoError(t, cl.SetNodeRole(worker.node.NodeID(), api.NodeRoleWorker))
	// agents 1, managers 3
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestDemotePromoteLeader(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleWorker))
	// agents 2, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)

	// promote the former leader back
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleManager))
	// agents 1, managers 3
	numWorker--
	numManager++
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestDemoteToSingleManager(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleWorker))
	// agents 2, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)

	leader, err = cl.Leader()
	require.NoError(t, err)
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleWorker))
	// agents 3, managers 1
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestDemoteLeader(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleWorker))
	// agents 2, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestDemoteDownedManager(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 0, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)

	// Find a manager (not the leader) to demote. It must not be the third
	// manager we added, because there may not have been enough time for
	// that one to write anything to its WAL.
	var demotee *testNode
	for _, n := range cl.nodes {
		nodeID := n.node.NodeID()
		if n.IsManager() && nodeID != leader.node.NodeID() && cl.nodesOrder[nodeID] != 3 {
			demotee = n
			break
		}
	}

	nodeID := demotee.node.NodeID()

	resp, err := cl.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodeID})
	require.NoError(t, err)
	spec := resp.Node.Spec.Copy()
	spec.DesiredRole = api.NodeRoleWorker

	// stop the node, then demote it, and start it back up again so when it comes back up it has to realize
	// it's not running anymore
	require.NoError(t, demotee.Pause(false))

	// demote the node, but don't use SetNodeRole, which waits until it successfully becomes a worker, since
	// the node is currently down
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		_, err := cl.api.UpdateNode(context.Background(), &api.UpdateNodeRequest{
			NodeID:      nodeID,
			Spec:        spec,
			NodeVersion: &resp.Node.Meta.Version,
		})
		return err
	}, opsTimeout))

	// start it back up again
	require.NoError(t, cl.StartNode(nodeID))

	// wait for it to become a worker
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		if demotee.IsManager() {
			return fmt.Errorf("node is still not a worker")
		}
		return nil
	}, opsTimeout))

	// agents 1, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestRestartLeader(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 5, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	leader, err := cl.Leader()
	require.NoError(t, err)

	origLeaderID := leader.node.NodeID()

	require.NoError(t, leader.Pause(false))

	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		for _, node := range resp.Nodes {
			if node.ID == origLeaderID {
				continue
			}
			require.False(t, node.Status.State == api.NodeStatus_DOWN, "nodes shouldn't go down")
			if node.Status.State != api.NodeStatus_READY {
				return errors.Errorf("node %s is still not ready", node.ID)
			}
		}
		return nil
	}, opsTimeout))

	require.NoError(t, cl.StartNode(origLeaderID))

	pollClusterReady(t, cl, numWorker, numManager)
}

func TestForceNewCluster(t *testing.T) {
	t.Parallel()

	// create an external CA so that we can use it to generate expired certificates
	rootCA, err := ca.CreateRootCA("externalRoot")
	require.NoError(t, err)

	// start a new cluster with the external CA bootstrapped
	numWorker, numManager := 0, 1
	cl := newTestCluster(t.Name(), false)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	require.NoError(t, cl.AddManager(false, &rootCA), "manager number 1")
	pollClusterReady(t, cl, numWorker, numManager)

	leader, err := cl.Leader()
	require.NoError(t, err)

	sid, err := cl.CreateService("test_service", 2)
	require.NoError(t, err)
	pollServiceReady(t, cl, sid, 2)

	// generate an expired certificate
	managerCertFile := filepath.Join(leader.stateDir, "certificates", "swarm-node.crt")
	certBytes, err := ioutil.ReadFile(managerCertFile)
	require.NoError(t, err)
	now := time.Now()
	// we don't want it to be expired by too much, because the certificate can't have expired
	// before the root CA cert became valid
	rootSigner, err := rootCA.Signer()
	require.NoError(t, err)
	expiredCertPEM := cautils.ReDateCert(t, certBytes, rootSigner.Cert, rootSigner.Key, now.Add(-1*time.Hour), now.Add(-1*time.Second))

	// restart the node with an expired certificate while forcing a new cluster - it should start without error and the certificate should be renewed
	nodeID := leader.node.NodeID()
	require.NoError(t, leader.Pause(true))
	require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644))
	require.NoError(t, cl.StartNode(nodeID))
	pollClusterReady(t, cl, numWorker, numManager)
	pollServiceReady(t, cl, sid, 2)

	err = testutils.PollFuncWithTimeout(nil, func() error {
		certBytes, err := ioutil.ReadFile(managerCertFile)
		if err != nil {
			return err
		}
		managerCerts, err := helpers.ParseCertificatesPEM(certBytes)
		if err != nil {
			return err
		}
		if managerCerts[0].NotAfter.Before(time.Now()) {
			return errors.New("certificate hasn't been renewed yet")
		}
		return nil
	}, opsTimeout)
	require.NoError(t, err)

	// restart the node with an expired certificate without forcing a new cluster - it should error on start
	require.NoError(t, leader.Pause(true))
	require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644))
	require.Error(t, cl.StartNode(nodeID))
}

func pollRootRotationDone(t *testing.T, cl *testCluster) {
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		clusterInfo, err := cl.GetClusterInfo()
		if err != nil {
			return err
		}
		if clusterInfo.RootCA.RootRotation != nil {
			return errors.New("root rotation not done")
		}
		return nil
	}, opsTimeout))
}

func TestSuccessfulRootRotation(t *testing.T) {
	t.Parallel()

	// run this twice, once with FIPS set and once without
	for _, fips := range []bool{true, false} {
		rootCA, err := ca.CreateRootCA("rootCN")
		require.NoError(t, err)

		numWorker, numManager := 2, 3
		cl := newClusterWithRootCA(t, numWorker, numManager, &rootCA, fips)
		defer func() {
			require.NoError(t, cl.Stop())
		}()
		pollClusterReady(t, cl, numWorker, numManager)

		// Take down one of the managers and both workers, so we can't actually ever finish the root rotation.
		resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		require.NoError(t, err)
		var (
			downManagerID string
			downWorkerIDs []string
			oldTLSInfo    *api.NodeTLSInfo
		)
		for _, n := range resp.Nodes {
			if oldTLSInfo != nil {
				require.Equal(t, oldTLSInfo, n.Description.TLSInfo)
			} else {
				oldTLSInfo = n.Description.TLSInfo
			}
			if n.Role == api.NodeRoleManager {
				if !n.ManagerStatus.Leader && downManagerID == "" {
					downManagerID = n.ID
					require.NoError(t, cl.nodes[n.ID].Pause(false))
				}
				continue
			}
			downWorkerIDs = append(downWorkerIDs, n.ID)
			require.NoError(t, cl.nodes[n.ID].Pause(false))
		}

		// perform a root rotation, and wait until all the nodes that are up have newly issued certs
		newRootCert, newRootKey, err := cautils.CreateRootCertAndKey("newRootCN")
		require.NoError(t, err)
		require.NoError(t, cl.RotateRootCA(newRootCert, newRootKey))

		require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
			resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
			if err != nil {
				return err
			}
			for _, n := range resp.Nodes {
				isDown := n.ID == downManagerID || n.ID == downWorkerIDs[0] || n.ID == downWorkerIDs[1]
				if reflect.DeepEqual(n.Description.TLSInfo, oldTLSInfo) != isDown {
					return fmt.Errorf("expected TLS info to have changed: %v", !isDown)
				}
			}

			// root rotation isn't done
			clusterInfo, err := cl.GetClusterInfo()
			if err != nil {
				return err
			}
			require.NotNil(t, clusterInfo.RootCA.RootRotation) // if root rotation is already done, fail and finish the test here
			return nil
		}, opsTimeout))

		// Bring the other manager back. Also bring one worker back, kill the other worker,
		// and add a new worker - show that we can converge on a root rotation.
		require.NoError(t, cl.StartNode(downManagerID))
		require.NoError(t, cl.StartNode(downWorkerIDs[0]))
		require.NoError(t, cl.RemoveNode(downWorkerIDs[1], false))
		require.NoError(t, cl.AddAgent())

		// we can finish the root rotation even though the previous leader was down, because it had
		// already rotated its cert
		pollRootRotationDone(t, cl)

		// wait until all the nodes have gotten their new certs and trust roots
		require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
			resp, err = cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
			if err != nil {
				return err
			}
			var newTLSInfo *api.NodeTLSInfo
			for _, n := range resp.Nodes {
				if newTLSInfo == nil {
					newTLSInfo = n.Description.TLSInfo
					if bytes.Equal(newTLSInfo.CertIssuerPublicKey, oldTLSInfo.CertIssuerPublicKey) ||
						bytes.Equal(newTLSInfo.CertIssuerSubject, oldTLSInfo.CertIssuerSubject) {
						return errors.New("expecting the issuer to have changed")
					}
					if !bytes.Equal(newTLSInfo.TrustRoot, newRootCert) {
						return errors.New("expecting the root certificate to have changed")
					}
				} else if !reflect.DeepEqual(newTLSInfo, n.Description.TLSInfo) {
					return fmt.Errorf("the nodes have not converged yet, particularly %s", n.ID)
				}

				if n.Certificate.Status.State != api.IssuanceStateIssued {
					return errors.New("nodes have yet to finish renewing their TLS certificates")
				}
			}
			return nil
		}, opsTimeout))
	}
}

func TestRepeatedRootRotation(t *testing.T) {
	t.Parallel()
	numWorker, numManager := 3, 1
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	pollClusterReady(t, cl, numWorker, numManager)

	resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
	require.NoError(t, err)
	var oldTLSInfo *api.NodeTLSInfo
	for _, n := range resp.Nodes {
		if oldTLSInfo != nil {
			require.Equal(t, oldTLSInfo, n.Description.TLSInfo)
		} else {
			oldTLSInfo = n.Description.TLSInfo
		}
	}

	// perform multiple root rotations, waiting a second between each
	var newRootCert, newRootKey []byte
	for i := 0; i < 3; i++ {
		newRootCert, newRootKey, err = cautils.CreateRootCertAndKey("newRootCN")
		require.NoError(t, err)
		require.NoError(t, cl.RotateRootCA(newRootCert, newRootKey))
		time.Sleep(time.Second)
	}

	pollRootRotationDone(t, cl)

	// wait until all the nodes have stabilized on the latest issuer
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		resp, err = cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		for _, n := range resp.Nodes {
			if reflect.DeepEqual(n.Description.TLSInfo, oldTLSInfo) {
				return errors.New("nodes have not changed TLS info")
			}
			if n.Certificate.Status.State != api.IssuanceStateIssued {
				return errors.New("nodes have yet to finish renewing their TLS certificates")
			}
			if !bytes.Equal(n.Description.TLSInfo.TrustRoot, newRootCert) {
				return errors.New("nodes do not all trust the new root yet")
			}
		}
		return nil
	}, opsTimeout))
}

func TestNodeRejoins(t *testing.T) {
	t.Parallel()
	numWorker, numManager := 1, 1
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	pollClusterReady(t, cl, numWorker, numManager)

	clusterInfo, err := cl.GetClusterInfo()
	require.NoError(t, err)

	// find the worker
	var worker *testNode
	for _, n := range cl.nodes {
		if !n.IsManager() {
			worker = n
		}
	}

	// rejoining succeeds - (both because the certs are correct, and because node.Pause sets the JoinAddr to "")
	nodeID := worker.node.NodeID()
	require.NoError(t, worker.Pause(false))
	require.NoError(t, cl.StartNode(nodeID))
	pollClusterReady(t, cl, numWorker, numManager)

	// rejoining with the wrong certs will fail fast so long as the join address is passed, but will keep retrying
	// forever if the join address is not passed
	leader, err := cl.Leader()
	require.NoError(t, err)
	require.NoError(t, worker.Pause(false))

	// generate new certs with the same node ID, role, and cluster ID, but with the wrong CA
	paths := ca.NewConfigPaths(filepath.Join(worker.config.StateDir, "certificates"))
	newRootCA, err := ca.CreateRootCA("bad root CA")
	require.NoError(t, err)
	ca.SaveRootCA(newRootCA, paths.RootCA)
	krw := ca.NewKeyReadWriter(paths.Node, nil, &manager.RaftDEKData{}) // make sure the key headers are preserved
	_, _, err = krw.Read()
	require.NoError(t, err)
	_, _, err = newRootCA.IssueAndSaveNewCertificates(krw, nodeID, ca.WorkerRole, clusterInfo.ID)
	require.NoError(t, err)

	worker.config.JoinAddr, err = leader.node.RemoteAPIAddr()
	require.NoError(t, err)
	err = cl.StartNode(nodeID)
	require.Error(t, err)
	require.Contains(t, err.Error(), "certificate signed by unknown authority")
}

func TestNodeJoinWithWrongCerts(t *testing.T) {
	t.Parallel()
	numWorker, numManager := 1, 1
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	pollClusterReady(t, cl, numWorker, numManager)

	clusterInfo, err := cl.GetClusterInfo()
	require.NoError(t, err)

	joinAddr, err := cl.RandomManager().node.RemoteAPIAddr()
	require.NoError(t, err)

	tokens := map[string]string{
		ca.WorkerRole:  clusterInfo.RootCA.JoinTokens.Worker,
		ca.ManagerRole: clusterInfo.RootCA.JoinTokens.Manager,
	}

	rootCA, err := ca.CreateRootCA("rootCA")
	require.NoError(t, err)

	for role, token := range tokens {
		node, err := newTestNode(joinAddr, token, false, false)
		require.NoError(t, err)
		nodeID := identity.NewID()
		require.NoError(t,
			generateCerts(node.stateDir, &rootCA, nodeID, role, clusterInfo.ID, false))
		cl.counter++
		cl.nodes[nodeID] = node
		cl.nodesOrder[nodeID] = cl.counter

		err = cl.StartNode(nodeID)
		require.Error(t, err)
		require.Contains(t, err.Error(), "certificate signed by unknown authority")
	}
}

// If the cluster does not require FIPS, then any node can join and re-join
// regardless of FIPS mode.
func TestMixedFIPSClusterNonMandatoryFIPS(t *testing.T) {
	t.Parallel()

	cl := newTestCluster(t.Name(), false) // no fips
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	// create a cluster with a non-FIPS manager, then add another non-FIPS manager and a non-FIPS worker
	for i := 0; i < 2; i++ {
		require.NoError(t, cl.AddManager(false, nil))
	}
	require.NoError(t, cl.AddAgent())

	// add a FIPS manager and a FIPS worker
	joinAddr, err := cl.RandomManager().node.RemoteAPIAddr()
	require.NoError(t, err)
	clusterInfo, err := cl.GetClusterInfo()
	require.NoError(t, err)
	for _, token := range []string{clusterInfo.RootCA.JoinTokens.Worker, clusterInfo.RootCA.JoinTokens.Manager} {
		node, err := newTestNode(joinAddr, token, false, true)
		require.NoError(t, err)
		require.NoError(t, cl.AddNode(node))
	}

	pollClusterReady(t, cl, 2, 3)

	// switch which worker nodes are FIPS and which are not - all should start up just fine.
	// on managers, if we enable FIPS on a previously non-FIPS node, it won't be able to read
	// non-fernet raft logs
	for nodeID, n := range cl.nodes {
		if n.IsManager() {
			n.config.FIPS = false
		} else {
			n.config.FIPS = !n.config.FIPS
		}
		require.NoError(t, n.Pause(false))
		require.NoError(t, cl.StartNode(nodeID))
	}

	pollClusterReady(t, cl, 2, 3)
}

// If the cluster requires FIPS, then only FIPS nodes can join and re-join.
func TestMixedFIPSClusterMandatoryFIPS(t *testing.T) {
	t.Parallel()

	cl := newTestCluster(t.Name(), true)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	for i := 0; i < 3; i++ {
		require.NoError(t, cl.AddManager(false, nil))
	}
	require.NoError(t, cl.AddAgent())

	pollClusterReady(t, cl, 1, 3)

	// restart a manager and the worker in non-FIPS mode - both will fail, but restarting them in FIPS mode
	// will succeed
	leader, err := cl.Leader()
	require.NoError(t, err)
	var nonLeader, worker *testNode
	for _, n := range cl.nodes {
		if n == leader {
			continue
		}
		if nonLeader == nil && n.IsManager() {
			nonLeader = n
		}
		if worker == nil && !n.IsManager() {
			worker = n
		}
	}
	for _, n := range []*testNode{nonLeader, worker} {
		nodeID := n.node.NodeID()
		rAddr := ""
		if n.IsManager() {
			// make sure to save the old address, because if a node is stopped we can't get its address,
			// and it would get set to a completely new address, which would break raft in the case of a manager
			rAddr, err = n.node.RemoteAPIAddr()
			require.NoError(t, err)
		}
		require.NoError(t, n.Pause(false))
		n.config.FIPS = false
		require.Equal(t, node.ErrMandatoryFIPS, cl.StartNode(nodeID))

		require.NoError(t, n.Pause(false))
		n.config.FIPS = true
		n.config.ListenRemoteAPI = rAddr
		require.NoError(t, cl.StartNode(nodeID))
	}

	pollClusterReady(t, cl, 1, 3)

	// try to add a non-FIPS manager and a non-FIPS worker - it won't work
	joinAddr, err := cl.RandomManager().node.RemoteAPIAddr()
	require.NoError(t, err)
	clusterInfo, err := cl.GetClusterInfo()
	require.NoError(t, err)
	for _, token := range []string{clusterInfo.RootCA.JoinTokens.Worker, clusterInfo.RootCA.JoinTokens.Manager} {
		n, err := newTestNode(joinAddr, token, false, false)
		require.NoError(t, err)
		require.Equal(t, node.ErrMandatoryFIPS, cl.AddNode(n))
	}
}