github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/manager_test.go (about)

     1  package manager
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"crypto/tls"
     7  	"encoding/pem"
     8  	"errors"
     9  	"fmt"
    10  	"io/ioutil"
    11  	"os"
    12  	"path/filepath"
    13  	"testing"
    14  	"time"
    15  
    16  	"google.golang.org/grpc"
    17  	"google.golang.org/grpc/credentials"
    18  
    19  	"github.com/docker/swarmkit/api"
    20  	"github.com/docker/swarmkit/ca"
    21  	"github.com/docker/swarmkit/ca/keyutils"
    22  	cautils "github.com/docker/swarmkit/ca/testutils"
    23  	"github.com/docker/swarmkit/manager/dispatcher"
    24  	"github.com/docker/swarmkit/manager/encryption"
    25  	"github.com/docker/swarmkit/manager/state/raft/storage"
    26  	"github.com/docker/swarmkit/manager/state/store"
    27  	"github.com/docker/swarmkit/testutils"
    28  	"github.com/stretchr/testify/require"
    29  )
    30  
    31  func TestManager(t *testing.T) {
    32  	temp, err := ioutil.TempFile("", "test-socket")
    33  	require.NoError(t, err)
    34  	require.NoError(t, temp.Close())
    35  	require.NoError(t, os.Remove(temp.Name()))
    36  
    37  	defer os.RemoveAll(temp.Name())
    38  
    39  	stateDir, err := ioutil.TempDir("", "test-raft")
    40  	require.NoError(t, err)
    41  	defer os.RemoveAll(stateDir)
    42  
    43  	tc := cautils.NewTestCA(t, func(p ca.CertPaths) *ca.KeyReadWriter {
    44  		return ca.NewKeyReadWriter(p, []byte("kek"), nil)
    45  	})
    46  	defer tc.Stop()
    47  
    48  	agentSecurityConfig, err := tc.NewNodeConfig(ca.WorkerRole)
    49  	require.NoError(t, err)
    50  	agentDiffOrgSecurityConfig, err := tc.NewNodeConfigOrg(ca.WorkerRole, "another-org")
    51  	require.NoError(t, err)
    52  	managerSecurityConfig, err := tc.NewNodeConfig(ca.ManagerRole)
    53  	require.NoError(t, err)
    54  
    55  	m, err := New(&Config{
    56  		RemoteAPI:        &RemoteAddrs{ListenAddr: "127.0.0.1:0"},
    57  		ControlAPI:       temp.Name(),
    58  		StateDir:         stateDir,
    59  		SecurityConfig:   managerSecurityConfig,
    60  		AutoLockManagers: true,
    61  		UnlockKey:        []byte("kek"),
    62  		RootCAPaths:      tc.Paths.RootCA,
    63  	})
    64  	require.NoError(t, err)
    65  	require.NotNil(t, m)
    66  
    67  	tcpAddr := m.Addr()
    68  
    69  	done := make(chan error)
    70  	defer close(done)
    71  	go func() {
    72  		done <- m.Run(tc.Context)
    73  	}()
    74  
    75  	opts := []grpc.DialOption{
    76  		grpc.WithTimeout(10 * time.Second),
    77  		grpc.WithTransportCredentials(agentSecurityConfig.ClientTLSCreds),
    78  	}
    79  
    80  	conn, err := grpc.Dial(tcpAddr, opts...)
    81  	require.NoError(t, err)
    82  	defer func() {
    83  		require.NoError(t, conn.Close())
    84  	}()
    85  
    86  	// We have to send a dummy request to verify if the connection is actually up.
    87  	client := api.NewDispatcherClient(conn)
    88  	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
    89  		_, err = client.Heartbeat(tc.Context, &api.HeartbeatRequest{})
    90  		if dispatcher.ErrNodeNotRegistered.Error() != testutils.ErrorDesc(err) {
    91  			return err
    92  		}
    93  		_, err = client.Session(tc.Context, &api.SessionRequest{})
    94  		return err
    95  	}, 1*time.Second))
    96  
    97  	// Try to have a client in a different org access this manager
    98  	opts = []grpc.DialOption{
    99  		grpc.WithTimeout(10 * time.Second),
   100  		grpc.WithTransportCredentials(agentDiffOrgSecurityConfig.ClientTLSCreds),
   101  	}
   102  
   103  	conn2, err := grpc.Dial(tcpAddr, opts...)
   104  	require.NoError(t, err)
   105  	defer func() {
   106  		require.NoError(t, conn2.Close())
   107  	}()
   108  
   109  	client = api.NewDispatcherClient(conn2)
   110  	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
   111  	require.Contains(t, testutils.ErrorDesc(err), "Permission denied: unauthorized peer role: rpc error: code = PermissionDenied desc = Permission denied: remote certificate not part of organization")
   112  
   113  	// Verify that requests to the various GRPC services running on TCP
   114  	// are rejected if they don't have certs.
   115  	opts = []grpc.DialOption{
   116  		grpc.WithTimeout(10 * time.Second),
   117  		grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true})),
   118  	}
   119  
   120  	noCertConn, err := grpc.Dial(tcpAddr, opts...)
   121  	require.NoError(t, err)
   122  	defer func() {
   123  		require.NoError(t, noCertConn.Close())
   124  	}()
   125  
   126  	client = api.NewDispatcherClient(noCertConn)
   127  	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
   128  	require.EqualError(t, err, "rpc error: code = PermissionDenied desc = Permission denied: unauthorized peer role: rpc error: code = PermissionDenied desc = no client certificates in request")
   129  
   130  	controlClient := api.NewControlClient(noCertConn)
   131  	_, err = controlClient.ListNodes(context.Background(), &api.ListNodesRequest{})
   132  	require.EqualError(t, err, "rpc error: code = PermissionDenied desc = Permission denied: unauthorized peer role: rpc error: code = PermissionDenied desc = no client certificates in request")
   133  
   134  	raftClient := api.NewRaftMembershipClient(noCertConn)
   135  	_, err = raftClient.Join(context.Background(), &api.JoinRequest{})
   136  	require.EqualError(t, err, "rpc error: code = PermissionDenied desc = Permission denied: unauthorized peer role: rpc error: code = PermissionDenied desc = no client certificates in request")
   137  
   138  	opts = []grpc.DialOption{
   139  		grpc.WithTimeout(10 * time.Second),
   140  		grpc.WithTransportCredentials(managerSecurityConfig.ClientTLSCreds),
   141  	}
   142  
   143  	controlConn, err := grpc.Dial(tcpAddr, opts...)
   144  	require.NoError(t, err)
   145  	defer func() {
   146  		require.NoError(t, controlConn.Close())
   147  	}()
   148  
   149  	// check that the kek is added to the config
   150  	var cluster api.Cluster
   151  	require.NoError(t, testutils.PollFunc(nil, func() error {
   152  		var (
   153  			err      error
   154  			clusters []*api.Cluster
   155  		)
   156  		m.raftNode.MemoryStore().View(func(tx store.ReadTx) {
   157  			clusters, err = store.FindClusters(tx, store.All)
   158  		})
   159  		if err != nil {
   160  			return err
   161  		}
   162  		if len(clusters) != 1 {
   163  			return errors.New("wrong number of clusters")
   164  		}
   165  		cluster = *clusters[0]
   166  		return nil
   167  
   168  	}))
   169  	require.NotNil(t, cluster)
   170  	require.Len(t, cluster.UnlockKeys, 1)
   171  	require.Equal(t, &api.EncryptionKey{
   172  		Subsystem: ca.ManagerRole,
   173  		Key:       []byte("kek"),
   174  	}, cluster.UnlockKeys[0])
   175  
   176  	// Test removal of the agent node
   177  	agentID := agentSecurityConfig.ClientTLSCreds.NodeID()
   178  	require.NoError(t, m.raftNode.MemoryStore().Update(func(tx store.Tx) error {
   179  		return store.CreateNode(tx,
   180  			&api.Node{
   181  				ID: agentID,
   182  				Certificate: api.Certificate{
   183  					Role: api.NodeRoleWorker,
   184  					CN:   agentID,
   185  				},
   186  			},
   187  		)
   188  	}))
   189  	controlClient = api.NewControlClient(controlConn)
   190  	_, err = controlClient.CreateNetwork(context.Background(), &api.CreateNetworkRequest{
   191  		Spec: &api.NetworkSpec{
   192  			Annotations: api.Annotations{
   193  				Name: "test-network-bad-driver",
   194  			},
   195  			DriverConfig: &api.Driver{
   196  				Name: "invalid-must-never-exist",
   197  			},
   198  		},
   199  	})
   200  	require.Error(t, err)
   201  
   202  	_, err = controlClient.RemoveNode(context.Background(),
   203  		&api.RemoveNodeRequest{
   204  			NodeID: agentID,
   205  			Force:  true,
   206  		},
   207  	)
   208  	require.NoError(t, err)
   209  
   210  	client = api.NewDispatcherClient(conn)
   211  	_, err = client.Heartbeat(context.Background(), &api.HeartbeatRequest{})
   212  	require.Contains(t, testutils.ErrorDesc(err), "removed from swarm")
   213  
   214  	m.Stop(tc.Context, false)
   215  
   216  	// After stopping we should MAY receive an error from ListenAndServe if
   217  	// all this happened before WaitForLeader completed, so don't check the
   218  	// error.
   219  	<-done
   220  }
   221  
   222  // Tests locking and unlocking the manager and key rotations
   223  func TestManagerLockUnlock(t *testing.T) {
   224  	temp, err := ioutil.TempFile("", "test-manager-lock")
   225  	require.NoError(t, err)
   226  	require.NoError(t, temp.Close())
   227  	require.NoError(t, os.Remove(temp.Name()))
   228  
   229  	defer os.RemoveAll(temp.Name())
   230  
   231  	stateDir, err := ioutil.TempDir("", "test-raft")
   232  	require.NoError(t, err)
   233  	defer os.RemoveAll(stateDir)
   234  
   235  	tc := cautils.NewTestCA(t)
   236  	defer tc.Stop()
   237  
   238  	managerSecurityConfig, err := tc.NewNodeConfig(ca.ManagerRole)
   239  	require.NoError(t, err)
   240  
   241  	_, _, err = managerSecurityConfig.KeyReader().Read()
   242  	require.NoError(t, err)
   243  
   244  	m, err := New(&Config{
   245  		RemoteAPI:      &RemoteAddrs{ListenAddr: "127.0.0.1:0"},
   246  		ControlAPI:     temp.Name(),
   247  		StateDir:       stateDir,
   248  		SecurityConfig: managerSecurityConfig,
   249  		RootCAPaths:    tc.Paths.RootCA,
   250  		// start off without any encryption
   251  	})
   252  	require.NoError(t, err)
   253  	require.NotNil(t, m)
   254  
   255  	done := make(chan error)
   256  	defer close(done)
   257  	go func() {
   258  		done <- m.Run(tc.Context)
   259  	}()
   260  
   261  	opts := []grpc.DialOption{
   262  		grpc.WithTimeout(10 * time.Second),
   263  		grpc.WithTransportCredentials(managerSecurityConfig.ClientTLSCreds),
   264  	}
   265  
   266  	conn, err := grpc.Dial(m.Addr(), opts...)
   267  	require.NoError(t, err)
   268  	defer func() {
   269  		require.NoError(t, conn.Close())
   270  	}()
   271  
   272  	// check that there is no kek currently - we are using the API because this
   273  	// lets us wait until the manager is up and listening, as well
   274  	var cluster *api.Cluster
   275  	client := api.NewControlClient(conn)
   276  
   277  	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
   278  		resp, err := client.ListClusters(tc.Context, &api.ListClustersRequest{})
   279  		if err != nil {
   280  			return err
   281  		}
   282  		if len(resp.Clusters) == 0 {
   283  			return fmt.Errorf("no clusters yet")
   284  		}
   285  		cluster = resp.Clusters[0]
   286  		return nil
   287  	}, 1*time.Second))
   288  
   289  	require.Nil(t, cluster.UnlockKeys)
   290  
   291  	// tls key is unencrypted, but there is a DEK
   292  	unencryptedKey, err := ioutil.ReadFile(tc.Paths.Node.Key)
   293  	require.NoError(t, err)
   294  	keyBlock, _ := pem.Decode(unencryptedKey)
   295  	require.NotNil(t, keyBlock)
   296  	require.False(t, keyutils.IsEncryptedPEMBlock(keyBlock))
   297  	require.Len(t, keyBlock.Headers, 2)
   298  	currentDEK, err := decodePEMHeaderValue(keyBlock.Headers[pemHeaderRaftDEK], nil, false)
   299  	require.NoError(t, err)
   300  	require.NotEmpty(t, currentDEK)
   301  
   302  	// update the lock key - this may fail due to update out of sequence errors, so try again
   303  	for {
   304  		getResp, err := client.GetCluster(tc.Context, &api.GetClusterRequest{ClusterID: cluster.ID})
   305  		require.NoError(t, err)
   306  		cluster = getResp.Cluster
   307  
   308  		spec := cluster.Spec.Copy()
   309  		spec.EncryptionConfig.AutoLockManagers = true
   310  		updateResp, err := client.UpdateCluster(tc.Context, &api.UpdateClusterRequest{
   311  			ClusterID:      cluster.ID,
   312  			ClusterVersion: &cluster.Meta.Version,
   313  			Spec:           spec,
   314  		})
   315  		if testutils.ErrorDesc(err) == "update out of sequence" {
   316  			continue
   317  		}
   318  		// if there is any other type of error, this should fail
   319  		if err == nil {
   320  			cluster = updateResp.Cluster
   321  		}
   322  		break
   323  	}
   324  	require.NoError(t, err)
   325  
   326  	caConn := api.NewCAClient(conn)
   327  	unlockKeyResp, err := caConn.GetUnlockKey(tc.Context, &api.GetUnlockKeyRequest{})
   328  	require.NoError(t, err)
   329  
   330  	// this should update the TLS key, rotate the DEK, and finish snapshotting
   331  	var encryptedKey []byte
   332  	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
   333  		encryptedKey, err = ioutil.ReadFile(tc.Paths.Node.Key)
   334  		require.NoError(t, err) // this should never error due to atomic writes
   335  
   336  		if bytes.Equal(unencryptedKey, encryptedKey) {
   337  			return fmt.Errorf("TLS key should have been re-encrypted at least")
   338  		}
   339  
   340  		keyBlock, _ = pem.Decode(encryptedKey)
   341  		require.NotNil(t, keyBlock) // this should never error due to atomic writes
   342  
   343  		if !keyutils.IsEncryptedPEMBlock(keyBlock) {
   344  			return fmt.Errorf("Key not encrypted")
   345  		}
   346  
   347  		// we don't check that the TLS key has been rotated, because that may take
   348  		// a little bit, and is best effort only
   349  		currentDEKString, ok := keyBlock.Headers[pemHeaderRaftDEK]
   350  		require.True(t, ok) // there should never NOT be a current header
   351  		nowCurrentDEK, err := decodePEMHeaderValue(currentDEKString, unlockKeyResp.UnlockKey, false)
   352  		require.NoError(t, err) // it should always be encrypted
   353  		if bytes.Equal(currentDEK, nowCurrentDEK) {
   354  			return fmt.Errorf("snapshot has not been finished yet")
   355  		}
   356  
   357  		currentDEK = nowCurrentDEK
   358  		return nil
   359  	}, 1*time.Second))
   360  
   361  	_, ok := keyBlock.Headers[pemHeaderRaftPendingDEK]
   362  	require.False(t, ok) // once the snapshot is done, the pending DEK should have been deleted
   363  
   364  	_, ok = keyBlock.Headers[pemHeaderRaftDEKNeedsRotation]
   365  	require.False(t, ok)
   366  
   367  	// verify that the snapshot is readable with the new DEK
   368  	encrypter, decrypter := encryption.Defaults(currentDEK, false)
   369  	// we can't use the raftLogger, because the WALs are still locked while the raft node is up.  And once we remove
   370  	// the manager, they'll be deleted.
   371  	snapshot, err := storage.NewSnapFactory(encrypter, decrypter).New(filepath.Join(stateDir, "raft", "snap-v3-encrypted")).Load()
   372  	require.NoError(t, err)
   373  	require.NotNil(t, snapshot)
   374  
   375  	// update the lock key to nil
   376  	for i := 0; i < 3; i++ {
   377  		getResp, err := client.GetCluster(tc.Context, &api.GetClusterRequest{ClusterID: cluster.ID})
   378  		require.NoError(t, err)
   379  		cluster = getResp.Cluster
   380  
   381  		spec := cluster.Spec.Copy()
   382  		spec.EncryptionConfig.AutoLockManagers = false
   383  		_, err = client.UpdateCluster(tc.Context, &api.UpdateClusterRequest{
   384  			ClusterID:      cluster.ID,
   385  			ClusterVersion: &cluster.Meta.Version,
   386  			Spec:           spec,
   387  		})
   388  		if testutils.ErrorDesc(err) == "update out of sequence" {
   389  			continue
   390  		}
   391  		require.NoError(t, err)
   392  	}
   393  
   394  	// this should update the TLS key
   395  	var unlockedKey []byte
   396  	require.NoError(t, testutils.PollFuncWithTimeout(nil, func() error {
   397  		unlockedKey, err = ioutil.ReadFile(tc.Paths.Node.Key)
   398  		if err != nil {
   399  			return err
   400  		}
   401  
   402  		if bytes.Equal(unlockedKey, encryptedKey) {
   403  			return fmt.Errorf("TLS key should have been rotated")
   404  		}
   405  
   406  		// Previously, we did not check that the TLS key got rotated after going from
   407  		// unlocked -> locked, because it might take a while for the snapshot to be done,
   408  		// and the rotation happens on a best effort basis.  However, that *could*
   409  		// have happened, in which case the encrypted key may have changed, so we have
   410  		// to poll to make sure that the key is eventually decrypted, rather than
   411  		// just waiting for it to look different.
   412  
   413  		// the new key should not be encrypted, and the DEK should also be unencrypted
   414  		keyBlock, _ = pem.Decode(unlockedKey)
   415  		if keyBlock == nil {
   416  			return fmt.Errorf("keyblock is nil")
   417  		}
   418  		if keyutils.IsEncryptedPEMBlock(keyBlock) {
   419  			return fmt.Errorf("key is still encrypted")
   420  		}
   421  		return nil
   422  	}, 1*time.Second))
   423  
   424  	// the new key should not be encrypted, and the DEK should also be unencrypted
   425  	// but not rotated
   426  	keyBlock, _ = pem.Decode(unlockedKey)
   427  	require.NotNil(t, keyBlock)
   428  	require.False(t, keyutils.IsEncryptedPEMBlock(keyBlock))
   429  
   430  	unencryptedDEK, err := decodePEMHeaderValue(keyBlock.Headers[pemHeaderRaftDEK], nil, false)
   431  	require.NoError(t, err)
   432  	require.NotNil(t, unencryptedDEK)
   433  	require.Equal(t, currentDEK, unencryptedDEK)
   434  
   435  	m.Stop(tc.Context, false)
   436  
   437  	// After stopping we should MAY receive an error from ListenAndServe if
   438  	// all this happened before WaitForLeader completed, so don't check the
   439  	// error.
   440  	<-done
   441  }