github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/integration/integration_test.go

package integration

import (
	"bytes"
	"context"
	"flag"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"reflect"
	"runtime"
	"strings"
	"testing"
	"time"

	"github.com/cloudflare/cfssl/helpers"
	events "github.com/docker/go-events"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/ca"
	cautils "github.com/docker/swarmkit/ca/testutils"
	"github.com/docker/swarmkit/identity"
	"github.com/docker/swarmkit/manager"
	"github.com/docker/swarmkit/node"
	"github.com/docker/swarmkit/testutils"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"github.com/stretchr/testify/require"
)

var showTrace = flag.Bool("show-trace", false, "show stack trace after tests finish")

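// printTrace logs the stack traces of all goroutines. runtime.Stack truncates
// its output to fit the buffer it is given, so the buffer is doubled until the
// whole trace fits.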
func printTrace() {
	var (
		buf       []byte
		stackSize int
	)
	bufferLen := 16384
	for stackSize == len(buf) {
		buf = make([]byte, bufferLen)
		stackSize = runtime.Stack(buf, true)
		bufferLen *= 2
	}
	buf = buf[:stackSize]
	logrus.Error("===========================STACK TRACE===========================")
	fmt.Println(string(buf))
	logrus.Error("===========================STACK TRACE END=======================")
}

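// TestMain lowers the TLS renewal backoff so that certificate renewals happen
// quickly enough for these tests, runs the suite, and optionally dumps a stack
// trace of all goroutines afterwards.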
func TestMain(m *testing.M) {
	ca.RenewTLSExponentialBackoff = events.ExponentialBackoffConfig{
		Factor: time.Millisecond * 500,
		Max:    time.Minute,
	}
	flag.Parse()
	res := m.Run()
	if *showTrace {
		printTrace()
	}
	os.Exit(res)
}

// newTestCluster creates a new cluster to which nodes can be added.
// The AcceptancePolicy is set to the most permissive mode when the first
// manager node is added.
func newTestCluster(testname string, fips bool) *testCluster {
	ctx, cancel := context.WithCancel(context.Background())
	ctx = context.WithValue(ctx, testnameKey, testname)
	c := &testCluster{
		ctx:        ctx,
		cancel:     cancel,
		nodes:      make(map[string]*testNode),
		nodesOrder: make(map[string]int),
		errs:       make(chan error, 1024),
		fips:       fips,
	}
	c.api = &dummyAPI{c: c}
	return c
}

// pollClusterReady calls the control API until all of the following are true:
// * all nodes are ready
// * all managers have membership == accepted
// * all managers have reachability == reachable
// * one node is the leader
// * the number of workers and managers equals the expected counts
func pollClusterReady(t *testing.T, c *testCluster, numWorker, numManager int) {
	pollFunc := func() error {
		res, err := c.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		var mCount int
		var leaderFound bool
		for _, n := range res.Nodes {
			if n.Status.State != api.NodeStatus_READY {
				return fmt.Errorf("node %s with desired role %s isn't ready, status %s, message %s", n.ID, n.Spec.DesiredRole, n.Status.State, n.Status.Message)
			}
			if n.Spec.Membership != api.NodeMembershipAccepted {
				return fmt.Errorf("node %s with desired role %s isn't accepted to cluster, membership %s", n.ID, n.Spec.DesiredRole, n.Spec.Membership)
			}
			if n.Certificate.Role != n.Spec.DesiredRole {
				return fmt.Errorf("node %s had different roles in spec and certificate, %s and %s respectively", n.ID, n.Spec.DesiredRole, n.Certificate.Role)
			}
			if n.Certificate.Status.State != api.IssuanceStateIssued {
				return fmt.Errorf("node %s with desired role %s has no issued certificate, issuance state %s", n.ID, n.Spec.DesiredRole, n.Certificate.Status.State)
			}
			if n.Role == api.NodeRoleManager {
				if n.ManagerStatus == nil {
					return fmt.Errorf("manager node %s has no ManagerStatus field", n.ID)
				}
				if n.ManagerStatus.Reachability != api.RaftMemberStatus_REACHABLE {
					return fmt.Errorf("manager node %s is not reachable, reachability status: %s", n.ID, n.ManagerStatus.Reachability)
				}
				mCount++
				if n.ManagerStatus.Leader {
					leaderFound = true
				}
			} else {
				if n.ManagerStatus != nil {
					return fmt.Errorf("worker node %s should not have manager status, returned %s", n.ID, n.ManagerStatus)
				}
			}
			if n.Description.TLSInfo == nil {
				return fmt.Errorf("node %s has not reported its TLS info yet", n.ID)
			}
		}
		if !leaderFound {
			return fmt.Errorf("no leader was found in the cluster")
		}
		wCount := len(res.Nodes) - mCount
		if mCount != numManager {
			return fmt.Errorf("unexpected number of managers: %d, expected %d", mCount, numManager)
		}
		if wCount != numWorker {
			return fmt.Errorf("unexpected number of workers: %d, expected %d", wCount, numWorker)
		}
		return nil
	}
	err := testutils.PollFuncWithTimeout(nil, pollFunc, opsTimeout)
	require.NoError(t, err)
}

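// pollServiceReady polls the control API until exactly 'replicas' of the
// service's tasks are in the RUNNING state.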
func pollServiceReady(t *testing.T, c *testCluster, sid string, replicas int) {
	pollFunc := func() error {
		req := &api.ListTasksRequest{Filters: &api.ListTasksRequest_Filters{
			ServiceIDs: []string{sid},
		}}
		res, err := c.api.ListTasks(context.Background(), req)
		require.NoError(t, err)

		if len(res.Tasks) == 0 {
			return fmt.Errorf("tasks list is empty")
		}
		var running int
		var states []string
		for _, task := range res.Tasks {
			if task.Status.State == api.TaskStateRunning {
				running++
			}
			states = append(states, fmt.Sprintf("[task %s: %s]", task.ID, task.Status.State))
		}
		if running != replicas {
			return fmt.Errorf("only %d running tasks, but expecting %d replicas: %s", running, replicas, strings.Join(states, ", "))
		}

		return nil
	}
	require.NoError(t, testutils.PollFuncWithTimeout(nil, pollFunc, opsTimeout))
}

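// newCluster starts a non-FIPS test cluster with the given number of workers
// and managers and waits until every node is ready.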
func newCluster(t *testing.T, numWorker, numManager int) *testCluster {
	cl := newTestCluster(t.Name(), false)
	for i := 0; i < numManager; i++ {
		require.NoError(t, cl.AddManager(false, nil), "manager number %d", i+1)
	}
	for i := 0; i < numWorker; i++ {
		require.NoError(t, cl.AddAgent(), "agent number %d", i+1)
	}

	pollClusterReady(t, cl, numWorker, numManager)
	return cl
}

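// newClusterWithRootCA is like newCluster, but bootstraps the managers from
// the given root CA and can run the cluster in FIPS mode.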
func newClusterWithRootCA(t *testing.T, numWorker, numManager int, rootCA *ca.RootCA, fips bool) *testCluster {
	cl := newTestCluster(t.Name(), fips)
	for i := 0; i < numManager; i++ {
		require.NoError(t, cl.AddManager(false, rootCA), "manager number %d", i+1)
	}
	for i := 0; i < numWorker; i++ {
		require.NoError(t, cl.AddAgent(), "agent number %d", i+1)
	}

	pollClusterReady(t, cl, numWorker, numManager)
	return cl
}

func TestClusterCreate(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 0, 2
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
}

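// TestServiceCreateLateBind is like TestServiceCreate, but adds the managers
// with the late-bind option of AddManager (its first argument) and verifies
// that a service still gets all of its replicas running.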
func TestServiceCreateLateBind(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 3, 3

	cl := newTestCluster(t.Name(), false)
	for i := 0; i < numManager; i++ {
		require.NoError(t, cl.AddManager(true, nil), "manager number %d", i+1)
	}
	for i := 0; i < numWorker; i++ {
		require.NoError(t, cl.AddAgent(), "agent number %d", i+1)
	}

	defer func() {
		require.NoError(t, cl.Stop())
	}()

	sid, err := cl.CreateService("test_service", 60)
	require.NoError(t, err)
	pollServiceReady(t, cl, sid, 60)
}

func TestServiceCreate(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 3, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	sid, err := cl.CreateService("test_service", 60)
	require.NoError(t, err)
	pollServiceReady(t, cl, sid, 60)
}

func TestNodeOps(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	// demote leader
	leader, err := cl.Leader()
	require.NoError(t, err)
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleWorker))
	// agents 2, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)

	// remove node
	var worker *testNode
	for _, n := range cl.nodes {
		if !n.IsManager() && n.node.NodeID() != leader.node.NodeID() {
			worker = n
			break
		}
	}
	require.NoError(t, cl.RemoveNode(worker.node.NodeID(), false))
	// agents 1, managers 2
	numWorker--
	// long wait for heartbeat expiration
	pollClusterReady(t, cl, numWorker, numManager)

	// promote old leader back
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleManager))
	// agents 0, managers 3
	numWorker--
	numManager++
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestAutolockManagers(t *testing.T) {
	t.Parallel()

	// run this twice, once with FIPS set and once without FIPS set
	for _, fips := range []bool{true, false} {
		rootCA, err := ca.CreateRootCA("rootCN")
		require.NoError(t, err)
		numWorker, numManager := 1, 1
		cl := newClusterWithRootCA(t, numWorker, numManager, &rootCA, fips)
		defer func() {
			require.NoError(t, cl.Stop())
		}()

		// check that the cluster is not locked initially
		unlockKey, err := cl.GetUnlockKey()
		require.NoError(t, err)
		require.Equal(t, "SWMKEY-1-", unlockKey)

		// lock the cluster and make sure the unlock key is not empty
		require.NoError(t, cl.AutolockManagers(true))
		unlockKey, err = cl.GetUnlockKey()
		require.NoError(t, err)
		require.NotEqual(t, "SWMKEY-1-", unlockKey)

		// rotate unlock key
		require.NoError(t, cl.RotateUnlockKey())
		newUnlockKey, err := cl.GetUnlockKey()
		require.NoError(t, err)
		require.NotEqual(t, "SWMKEY-1-", newUnlockKey)
		require.NotEqual(t, unlockKey, newUnlockKey)

		// unlock the cluster
		require.NoError(t, cl.AutolockManagers(false))
		unlockKey, err = cl.GetUnlockKey()
		require.NoError(t, err)
		require.Equal(t, "SWMKEY-1-", unlockKey)
	}
}

func TestDemotePromote(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)
	var manager *testNode
	for _, n := range cl.nodes {
		if n.IsManager() && n.node.NodeID() != leader.node.NodeID() {
			manager = n
			break
		}
	}
	require.NoError(t, cl.SetNodeRole(manager.node.NodeID(), api.NodeRoleWorker))
	// agents 2, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)

	// promote same node
	require.NoError(t, cl.SetNodeRole(manager.node.NodeID(), api.NodeRoleManager))
	// agents 1, managers 3
	numWorker--
	numManager++
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestPromoteDemote(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	var worker *testNode
	for _, n := range cl.nodes {
		if !n.IsManager() {
			worker = n
			break
		}
	}
	require.NoError(t, cl.SetNodeRole(worker.node.NodeID(), api.NodeRoleManager))
	// agents 0, managers 4
	numWorker--
	numManager++
	pollClusterReady(t, cl, numWorker, numManager)

	// demote same node
	require.NoError(t, cl.SetNodeRole(worker.node.NodeID(), api.NodeRoleWorker))
	// agents 1, managers 3
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestDemotePromoteLeader(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleWorker))
	// agents 2, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)

	// promote former leader back
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleManager))
	// agents 1, managers 3
	numWorker--
	numManager++
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestDemoteToSingleManager(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleWorker))
	// agents 2, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)

	leader, err = cl.Leader()
	require.NoError(t, err)
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleWorker))
	// agents 3, managers 1
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestDemoteLeader(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 1, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)
	require.NoError(t, cl.SetNodeRole(leader.node.NodeID(), api.NodeRoleWorker))
	// agents 2, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestDemoteDownedManager(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 0, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()

	leader, err := cl.Leader()
	require.NoError(t, err)

	// Find a manager (not the leader) to demote. It must not be the third
	// manager we added, because there may not have been enough time for
	// that one to write anything to its WAL.
	var demotee *testNode
	for _, n := range cl.nodes {
		nodeID := n.node.NodeID()
		if n.IsManager() && nodeID != leader.node.NodeID() && cl.nodesOrder[nodeID] != 3 {
			demotee = n
			break
		}
	}

	nodeID := demotee.node.NodeID()

	resp, err := cl.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: nodeID})
	require.NoError(t, err)
	spec := resp.Node.Spec.Copy()
	spec.DesiredRole = api.NodeRoleWorker

	// stop the node, then demote it, and start it back up again, so that when
	// it comes back up it has to realize it is no longer a manager
	require.NoError(t, demotee.Pause(false))

	// demote the node directly through the API rather than with SetNodeRole,
	// which would wait until the node successfully becomes a worker - it
	// can't, since the node is currently down
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		_, err := cl.api.UpdateNode(context.Background(), &api.UpdateNodeRequest{
			NodeID:      nodeID,
			Spec:        spec,
			NodeVersion: &resp.Node.Meta.Version,
		})
		return err
	}, opsTimeout))

	// start it back up again
	require.NoError(t, cl.StartNode(nodeID))

	// wait for it to become a worker
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		if demotee.IsManager() {
			return fmt.Errorf("node is still a manager, not a worker")
		}
		return nil
	}, opsTimeout))

	// agents 1, managers 2
	numWorker++
	numManager--
	pollClusterReady(t, cl, numWorker, numManager)
}

func TestRestartLeader(t *testing.T) {
	t.Parallel()

	numWorker, numManager := 5, 3
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	leader, err := cl.Leader()
	require.NoError(t, err)

	origLeaderID := leader.node.NodeID()

	require.NoError(t, leader.Pause(false))

	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		for _, node := range resp.Nodes {
			if node.ID == origLeaderID {
				continue
			}
			require.False(t, node.Status.State == api.NodeStatus_DOWN, "nodes shouldn't go down")
			if node.Status.State != api.NodeStatus_READY {
				return errors.Errorf("node %s is still not ready", node.ID)
			}
		}
		return nil
	}, opsTimeout))

	require.NoError(t, cl.StartNode(origLeaderID))

	pollClusterReady(t, cl, numWorker, numManager)
}

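// TestForceNewCluster verifies that a manager whose TLS certificate has
// expired can start again when a new cluster is forced (in which case its
// certificate gets renewed), but errors out on start otherwise.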
func TestForceNewCluster(t *testing.T) {
	t.Parallel()

	// create an external CA so that we can use it to generate expired certificates
	rootCA, err := ca.CreateRootCA("externalRoot")
	require.NoError(t, err)

	// start a new cluster with the external CA bootstrapped
	numWorker, numManager := 0, 1
	cl := newTestCluster(t.Name(), false)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	require.NoError(t, cl.AddManager(false, &rootCA), "manager number 1")
	pollClusterReady(t, cl, numWorker, numManager)

	leader, err := cl.Leader()
	require.NoError(t, err)

	sid, err := cl.CreateService("test_service", 2)
	require.NoError(t, err)
	pollServiceReady(t, cl, sid, 2)

	// generate an expired certificate
	managerCertFile := filepath.Join(leader.stateDir, "certificates", "swarm-node.crt")
	certBytes, err := ioutil.ReadFile(managerCertFile)
	require.NoError(t, err)
	now := time.Now()
	// we don't want it to have expired too long ago, because the certificate
	// can't have expired before the root CA certificate became valid
	rootSigner, err := rootCA.Signer()
	require.NoError(t, err)
	expiredCertPEM := cautils.ReDateCert(t, certBytes, rootSigner.Cert, rootSigner.Key, now.Add(-1*time.Hour), now.Add(-1*time.Second))

	// restart the node with an expired certificate while forcing a new cluster - it should start without error and the certificate should be renewed
	nodeID := leader.node.NodeID()
	require.NoError(t, leader.Pause(true))
	require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644))
	require.NoError(t, cl.StartNode(nodeID))
	pollClusterReady(t, cl, numWorker, numManager)
	pollServiceReady(t, cl, sid, 2)

	err = testutils.PollFuncWithTimeout(nil, func() error {
		certBytes, err := ioutil.ReadFile(managerCertFile)
		if err != nil {
			return err
		}
		managerCerts, err := helpers.ParseCertificatesPEM(certBytes)
		if err != nil {
			return err
		}
		if managerCerts[0].NotAfter.Before(time.Now()) {
			return errors.New("certificate hasn't been renewed yet")
		}
		return nil
	}, opsTimeout)
	require.NoError(t, err)

	// restart the node with an expired certificate without forcing a new cluster - it should error on start
	require.NoError(t, leader.Pause(false))
	require.NoError(t, ioutil.WriteFile(managerCertFile, expiredCertPEM, 0644))
	require.Error(t, cl.StartNode(nodeID))
}

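// pollRootRotationDone polls the cluster info until no root CA rotation is in
// progress.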
func pollRootRotationDone(t *testing.T, cl *testCluster) {
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		clusterInfo, err := cl.GetClusterInfo()
		if err != nil {
			return err
		}
		if clusterInfo.RootCA.RootRotation != nil {
			return errors.New("root rotation not done")
		}
		return nil
	}, opsTimeout))
}

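// TestSuccessfulRootRotation rotates the cluster's root CA while one manager
// and both workers are down, verifies the rotation cannot finish until those
// nodes come back up or are removed, and then checks that all nodes converge
// on the new root.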
func TestSuccessfulRootRotation(t *testing.T) {
	t.Parallel()

	// run this twice, once with FIPS set and once without
	for _, fips := range []bool{true, false} {
		rootCA, err := ca.CreateRootCA("rootCN")
		require.NoError(t, err)

		numWorker, numManager := 2, 3
		cl := newClusterWithRootCA(t, numWorker, numManager, &rootCA, fips)
		defer func() {
			require.NoError(t, cl.Stop())
		}()
		pollClusterReady(t, cl, numWorker, numManager)

		// Take down one of the managers and both workers, so the root rotation can't actually finish.
		resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		require.NoError(t, err)
		var (
			downManagerID string
			downWorkerIDs []string
			oldTLSInfo    *api.NodeTLSInfo
		)
		for _, n := range resp.Nodes {
			if oldTLSInfo != nil {
				require.Equal(t, oldTLSInfo, n.Description.TLSInfo)
			} else {
				oldTLSInfo = n.Description.TLSInfo
			}
			if n.Role == api.NodeRoleManager {
				if !n.ManagerStatus.Leader && downManagerID == "" {
					downManagerID = n.ID
					require.NoError(t, cl.nodes[n.ID].Pause(false))
				}
				continue
			}
			downWorkerIDs = append(downWorkerIDs, n.ID)
			require.NoError(t, cl.nodes[n.ID].Pause(false))
		}

		// perform a root rotation, and wait until all the nodes that are up have newly issued certs
		newRootCert, newRootKey, err := cautils.CreateRootCertAndKey("newRootCN")
		require.NoError(t, err)
		require.NoError(t, cl.RotateRootCA(newRootCert, newRootKey))

		require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
			resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
			if err != nil {
				return err
			}
			for _, n := range resp.Nodes {
				isDown := n.ID == downManagerID || n.ID == downWorkerIDs[0] || n.ID == downWorkerIDs[1]
				if reflect.DeepEqual(n.Description.TLSInfo, oldTLSInfo) != isDown {
					return fmt.Errorf("expected TLS info of node %s to have changed: %v", n.ID, !isDown)
				}
			}

			// make sure the root rotation hasn't finished yet
			clusterInfo, err := cl.GetClusterInfo()
			if err != nil {
				return err
			}
			require.NotNil(t, clusterInfo.RootCA.RootRotation) // if root rotation is already done, fail and finish the test here
			return nil
		}, opsTimeout))

		// Bring the other manager back. Also bring one worker back, kill the other worker,
		// and add a new worker - show that we can converge on a root rotation.
		require.NoError(t, cl.StartNode(downManagerID))
		require.NoError(t, cl.StartNode(downWorkerIDs[0]))
		require.NoError(t, cl.RemoveNode(downWorkerIDs[1], false))
		require.NoError(t, cl.AddAgent())

		// the root rotation can now finish: the downed nodes are either back up
		// (and can get newly issued certs) or removed from the cluster
		pollRootRotationDone(t, cl)

		// wait until all the nodes have gotten their new certs and trust roots
		require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
			resp, err = cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
			if err != nil {
				return err
			}
			var newTLSInfo *api.NodeTLSInfo
			for _, n := range resp.Nodes {
				if newTLSInfo == nil {
					newTLSInfo = n.Description.TLSInfo
					if bytes.Equal(newTLSInfo.CertIssuerPublicKey, oldTLSInfo.CertIssuerPublicKey) ||
						bytes.Equal(newTLSInfo.CertIssuerSubject, oldTLSInfo.CertIssuerSubject) {
						return errors.New("expecting the issuer to have changed")
					}
					if !bytes.Equal(newTLSInfo.TrustRoot, newRootCert) {
						return errors.New("expecting the root certificate to have changed")
					}
				} else if !reflect.DeepEqual(newTLSInfo, n.Description.TLSInfo) {
					return fmt.Errorf("the nodes have not converged yet, particularly %s", n.ID)
				}

				if n.Certificate.Status.State != api.IssuanceStateIssued {
					return errors.New("nodes have yet to finish renewing their TLS certificates")
				}
			}
			return nil
		}, opsTimeout))
	}
}

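// TestRepeatedRootRotation performs several root CA rotations in quick
// succession and verifies that the cluster converges on the last new root.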
func TestRepeatedRootRotation(t *testing.T) {
	t.Parallel()
	numWorker, numManager := 3, 1
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	pollClusterReady(t, cl, numWorker, numManager)

	resp, err := cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
	require.NoError(t, err)
	var oldTLSInfo *api.NodeTLSInfo
	for _, n := range resp.Nodes {
		if oldTLSInfo != nil {
			require.Equal(t, oldTLSInfo, n.Description.TLSInfo)
		} else {
			oldTLSInfo = n.Description.TLSInfo
		}
	}

	// perform multiple root rotations, waiting a second between each
	var newRootCert, newRootKey []byte
	for i := 0; i < 3; i++ {
		newRootCert, newRootKey, err = cautils.CreateRootCertAndKey("newRootCN")
		require.NoError(t, err)
		require.NoError(t, cl.RotateRootCA(newRootCert, newRootKey))
		time.Sleep(time.Second)
	}

	pollRootRotationDone(t, cl)

	// wait until all the nodes have stabilized back on the latest issuer
	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
		resp, err = cl.api.ListNodes(context.Background(), &api.ListNodesRequest{})
		if err != nil {
			return err
		}
		for _, n := range resp.Nodes {
			if reflect.DeepEqual(n.Description.TLSInfo, oldTLSInfo) {
				return errors.New("nodes have not changed TLS info")
			}
			if n.Certificate.Status.State != api.IssuanceStateIssued {
				return errors.New("nodes have yet to finish renewing their TLS certificates")
			}
			if !bytes.Equal(n.Description.TLSInfo.TrustRoot, newRootCert) {
				return errors.New("nodes do not all trust the new root yet")
			}
		}
		return nil
	}, opsTimeout))
}

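// TestNodeRejoins checks that a worker can rejoin the cluster with its
// existing certificates, but is rejected if its certificates were issued by a
// different root CA.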
func TestNodeRejoins(t *testing.T) {
	t.Parallel()
	numWorker, numManager := 1, 1
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	pollClusterReady(t, cl, numWorker, numManager)

	clusterInfo, err := cl.GetClusterInfo()
	require.NoError(t, err)

	// find the worker
	var worker *testNode
	for _, n := range cl.nodes {
		if !n.IsManager() {
			worker = n
		}
	}

	// rejoining succeeds (both because the certs are correct, and because node.Pause sets the JoinAddr to "")
	nodeID := worker.node.NodeID()
	require.NoError(t, worker.Pause(false))
	require.NoError(t, cl.StartNode(nodeID))
	pollClusterReady(t, cl, numWorker, numManager)

	// rejoining with the wrong certs will fail fast so long as the join address is passed, but will keep retrying
	// forever if the join address is not passed
	leader, err := cl.Leader()
	require.NoError(t, err)
	require.NoError(t, worker.Pause(false))

	// generate new certs with the same node ID, role, and cluster ID, but with the wrong CA
	paths := ca.NewConfigPaths(filepath.Join(worker.config.StateDir, "certificates"))
	newRootCA, err := ca.CreateRootCA("bad root CA")
	require.NoError(t, err)
	require.NoError(t, ca.SaveRootCA(newRootCA, paths.RootCA))
	krw := ca.NewKeyReadWriter(paths.Node, nil, &manager.RaftDEKData{}) // make sure the key headers are preserved
	_, _, err = krw.Read()
	require.NoError(t, err)
	_, _, err = newRootCA.IssueAndSaveNewCertificates(krw, nodeID, ca.WorkerRole, clusterInfo.ID)
	require.NoError(t, err)

	worker.config.JoinAddr, err = leader.node.RemoteAPIAddr()
	require.NoError(t, err)
	err = cl.StartNode(nodeID)
	require.Error(t, err)
	require.Contains(t, err.Error(), "certificate signed by unknown authority")
}

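// TestNodeJoinWithWrongCerts checks that a new node whose certificates were
// issued by a foreign root CA cannot join, even with a valid join token.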
func TestNodeJoinWithWrongCerts(t *testing.T) {
	t.Parallel()
	numWorker, numManager := 1, 1
	cl := newCluster(t, numWorker, numManager)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	pollClusterReady(t, cl, numWorker, numManager)

	clusterInfo, err := cl.GetClusterInfo()
	require.NoError(t, err)

	joinAddr, err := cl.RandomManager().node.RemoteAPIAddr()
	require.NoError(t, err)

	tokens := map[string]string{
		ca.WorkerRole:  clusterInfo.RootCA.JoinTokens.Worker,
		ca.ManagerRole: clusterInfo.RootCA.JoinTokens.Manager,
	}

	rootCA, err := ca.CreateRootCA("rootCA")
	require.NoError(t, err)

	for role, token := range tokens {
		node, err := newTestNode(joinAddr, token, false, false)
		require.NoError(t, err)
		nodeID := identity.NewID()
		require.NoError(t,
			generateCerts(node.stateDir, &rootCA, nodeID, role, clusterInfo.ID, false))
		cl.counter++
		cl.nodes[nodeID] = node
		cl.nodesOrder[nodeID] = cl.counter

		err = cl.StartNode(nodeID)
		require.Error(t, err)
		require.Contains(t, err.Error(), "certificate signed by unknown authority")
	}
}

// If the cluster does not require FIPS, then any node can join and re-join
// regardless of FIPS mode.
func TestMixedFIPSClusterNonMandatoryFIPS(t *testing.T) {
	t.Parallel()

	cl := newTestCluster(t.Name(), false) // no fips
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	// create the cluster with a non-FIPS manager, then add another non-FIPS manager and a non-FIPS worker
	for i := 0; i < 2; i++ {
		require.NoError(t, cl.AddManager(false, nil))
	}
	require.NoError(t, cl.AddAgent())

	// add a FIPS manager and a FIPS worker
	joinAddr, err := cl.RandomManager().node.RemoteAPIAddr()
	require.NoError(t, err)
	clusterInfo, err := cl.GetClusterInfo()
	require.NoError(t, err)
	for _, token := range []string{clusterInfo.RootCA.JoinTokens.Worker, clusterInfo.RootCA.JoinTokens.Manager} {
		node, err := newTestNode(joinAddr, token, false, true)
		require.NoError(t, err)
		require.NoError(t, cl.AddNode(node))
	}

	pollClusterReady(t, cl, 2, 3)

	// Toggle FIPS mode on the workers - they should all start up just fine.
	// Force the managers to non-FIPS: enabling FIPS on a previously non-FIPS
	// manager would leave it unable to read its non-fernet raft logs.
	for nodeID, n := range cl.nodes {
		if n.IsManager() {
			n.config.FIPS = false
		} else {
			n.config.FIPS = !n.config.FIPS
		}
		require.NoError(t, n.Pause(false))
		require.NoError(t, cl.StartNode(nodeID))
	}

	pollClusterReady(t, cl, 2, 3)
}

// If the cluster requires FIPS, then only FIPS nodes can join and re-join.
func TestMixedFIPSClusterMandatoryFIPS(t *testing.T) {
	t.Parallel()

	cl := newTestCluster(t.Name(), true)
	defer func() {
		require.NoError(t, cl.Stop())
	}()
	for i := 0; i < 3; i++ {
		require.NoError(t, cl.AddManager(false, nil))
	}
	require.NoError(t, cl.AddAgent())

	pollClusterReady(t, cl, 1, 3)

	// restart a manager and the worker in non-FIPS mode - both will fail to start,
	// but restarting them in FIPS mode will succeed
	leader, err := cl.Leader()
	require.NoError(t, err)
	var nonLeader, worker *testNode
	for _, n := range cl.nodes {
		if n == leader {
			continue
		}
		if nonLeader == nil && n.IsManager() {
			nonLeader = n
		}
		if worker == nil && !n.IsManager() {
			worker = n
		}
	}
	for _, n := range []*testNode{nonLeader, worker} {
		nodeID := n.node.NodeID()
		rAddr := ""
		if n.IsManager() {
			// make sure to save the old address: once the node is stopped we can't
			// query its address, and it would otherwise restart on a completely new
			// address, which would break raft in the case of a manager
			rAddr, err = n.node.RemoteAPIAddr()
			require.NoError(t, err)
		}
		require.NoError(t, n.Pause(false))
		n.config.FIPS = false
		require.Equal(t, node.ErrMandatoryFIPS, cl.StartNode(nodeID))

		require.NoError(t, n.Pause(false))
		n.config.FIPS = true
		n.config.ListenRemoteAPI = rAddr
		require.NoError(t, cl.StartNode(nodeID))
	}

	pollClusterReady(t, cl, 1, 3)

	// try to add a non-FIPS manager and a non-FIPS worker - neither will work
	joinAddr, err := cl.RandomManager().node.RemoteAPIAddr()
	require.NoError(t, err)
	clusterInfo, err := cl.GetClusterInfo()
	require.NoError(t, err)
	for _, token := range []string{clusterInfo.RootCA.JoinTokens.Worker, clusterInfo.RootCA.JoinTokens.Manager} {
		n, err := newTestNode(joinAddr, token, false, false)
		require.NoError(t, err)
		require.Equal(t, node.ErrMandatoryFIPS, cl.AddNode(n))
	}
}