github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/integration/cluster.go

     1  package integration
     2  
     3  import (
     4  	"context"
     5  	"crypto/tls"
     6  	"fmt"
     7  	"math/rand"
     8  	"net"
     9  	"sync"
    10  	"time"
    11  
    12  	"google.golang.org/grpc"
    13  	"google.golang.org/grpc/credentials"
    14  
    15  	"github.com/docker/swarmkit/api"
    16  	"github.com/docker/swarmkit/ca"
    17  	"github.com/docker/swarmkit/identity"
    18  	"github.com/docker/swarmkit/log"
    19  	"github.com/docker/swarmkit/manager/encryption"
    20  	"github.com/docker/swarmkit/node"
    21  	"github.com/docker/swarmkit/testutils"
    22  	"github.com/sirupsen/logrus"
    23  )
    24  
    25  const opsTimeout = 64 * time.Second
    26  
    27  // testCluster is a representation of a cluster of connected nodes.
    28  type testCluster struct {
    29  	ctx        context.Context
    30  	cancel     context.CancelFunc
    31  	api        *dummyAPI
    32  	nodes      map[string]*testNode
    33  	nodesOrder map[string]int
    34  	errs       chan error
    35  	wg         sync.WaitGroup
    36  	counter    int
    37  	fips       bool
    38  }
    39  
    40  var testnameKey struct{}
    41  
    42  // Stop makes a best effort to stop all nodes and close the connections to them.
    43  func (c *testCluster) Stop() error {
    44  	c.cancel()
    45  	for _, n := range c.nodes {
    46  		if err := n.Stop(); err != nil {
    47  			return err
    48  		}
    49  	}
    50  	c.wg.Wait()
    51  	close(c.errs)
    52  	for err := range c.errs {
    53  		if err != nil {
    54  			return err
    55  		}
    56  	}
    57  	return nil
    58  }
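
// exampleClusterLifecycle is an illustrative sketch, not part of the original
// file: it shows the typical lifecycle of a testCluster in a test body. It
// assumes a *testCluster has already been constructed elsewhere in the
// package, and it always stops the cluster on exit so that the goroutines
// started by runNode are reaped.
func exampleClusterLifecycle(c *testCluster) (retErr error) {
	defer func() {
		if err := c.Stop(); err != nil && retErr == nil {
			retErr = err
		}
	}()
	// A single self-bootstrapping manager is enough for most control-API tests.
	if err := c.AddManager(false, nil); err != nil {
		return err
	}
	_, err := c.CreateService("lifecycle-svc", 1)
	return err
}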
    59  
    60  // RandomManager chooses a random manager from the cluster.
    61  func (c *testCluster) RandomManager() *testNode {
    62  	var managers []*testNode
    63  	for _, n := range c.nodes {
    64  		if n.IsManager() {
    65  			managers = append(managers, n)
    66  		}
    67  	}
    68  	idx := rand.Intn(len(managers))
    69  	return managers[idx]
    70  }
    71  
    72  // AddManager adds a node with the Manager role. The node will function as both
    73  // an agent and a manager. If lateBind is set, the manager is started before a
    74  // remote API port is bound. If rootCA is set, the manager is bootstrapped using
    75  // said root CA.  These settings only apply to the first manager.
    76  func (c *testCluster) AddManager(lateBind bool, rootCA *ca.RootCA) error {
    77  	// first node
    78  	var n *testNode
    79  	if len(c.nodes) == 0 {
    80  		node, err := newTestNode("", "", lateBind, c.fips)
    81  		if err != nil {
    82  			return err
    83  		}
    84  		// generate TLS certs for this manager for bootstrapping; otherwise the node will generate its own CA
    85  		if rootCA != nil {
    86  			if err := generateCerts(node.stateDir, rootCA, identity.NewID(), ca.ManagerRole, identity.NewID(), true); err != nil {
    87  				return err
    88  			}
    89  		}
    90  		n = node
    91  	} else {
    92  		lateBind = false
    93  		joinAddr, err := c.RandomManager().node.RemoteAPIAddr()
    94  		if err != nil {
    95  			return err
    96  		}
    97  		clusterInfo, err := c.GetClusterInfo()
    98  		if err != nil {
    99  			return err
   100  		}
   101  		node, err := newTestNode(joinAddr, clusterInfo.RootCA.JoinTokens.Manager, false, c.fips)
   102  		if err != nil {
   103  			return err
   104  		}
   105  		n = node
   106  	}
   107  
   108  	if err := c.AddNode(n); err != nil {
   109  		return err
   110  	}
   111  
   112  	if lateBind {
   113  		// Verify that the control API works
   114  		if _, err := c.GetClusterInfo(); err != nil {
   115  			return err
   116  		}
   117  		return n.node.BindRemote(context.Background(), "127.0.0.1:0", "")
   118  	}
   119  
   120  	return nil
   121  }
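
// exampleBootstrapWithExternalCA is an illustrative sketch, not part of the
// original file: it bootstraps the first manager from an externally created
// root CA and binds its remote API late. It assumes ca.CreateRootCA(rootCN)
// from the imported ca package; the root CN used here is arbitrary.
func exampleBootstrapWithExternalCA(c *testCluster) error {
	rootCA, err := ca.CreateRootCA("external-root-ca")
	if err != nil {
		return err
	}
	// lateBind == true: AddManager verifies the control API first and then
	// binds the remote API on 127.0.0.1:0 itself.
	return c.AddManager(true, &rootCA)
}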
   122  
   123  // AddAgent adds a node with the Agent role (it does not participate in the raft cluster).
   124  func (c *testCluster) AddAgent() error {
   125  	// an agent can only join an existing cluster, so at least one manager is required
   126  	if len(c.nodes) == 0 {
   127  		return fmt.Errorf("there are no manager nodes")
   128  	}
   129  	joinAddr, err := c.RandomManager().node.RemoteAPIAddr()
   130  	if err != nil {
   131  		return err
   132  	}
   133  	clusterInfo, err := c.GetClusterInfo()
   134  	if err != nil {
   135  		return err
   136  	}
   137  	node, err := newTestNode(joinAddr, clusterInfo.RootCA.JoinTokens.Worker, false, c.fips)
   138  	if err != nil {
   139  		return err
   140  	}
   141  	return c.AddNode(node)
   142  }
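
// exampleBootstrapCluster is an illustrative sketch, not part of the original
// file: it builds a small cluster with three managers and one worker using
// the helpers above. The first AddManager call self-bootstraps a root CA;
// later managers and the agent join through a randomly chosen manager.
func exampleBootstrapCluster(c *testCluster) error {
	for i := 0; i < 3; i++ {
		if err := c.AddManager(false, nil); err != nil {
			return err
		}
	}
	return c.AddAgent()
}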
   143  
   144  // AddNode adds a new node to the cluster.
   145  func (c *testCluster) AddNode(n *testNode) error {
   146  	c.counter++
   147  	if err := c.runNode(n, c.counter); err != nil {
   148  		c.counter--
   149  		return err
   150  	}
   151  	c.nodes[n.node.NodeID()] = n
   152  	c.nodesOrder[n.node.NodeID()] = c.counter
   153  	return nil
   154  }
   155  
   156  func (c *testCluster) runNode(n *testNode, nodeOrder int) error {
   157  	ctx := log.WithLogger(c.ctx, log.L.WithFields(
   158  		logrus.Fields{
   159  			"testnode": nodeOrder,
   160  			"testname": c.ctx.Value(testnameKey),
   161  		},
   162  	))
   163  
   164  	errCtx, cancel := context.WithCancel(context.Background())
   165  	done := make(chan error)
   166  	defer cancel()
   167  	defer close(done)
   168  
   169  	c.wg.Add(2)
   170  	go func() {
   171  		c.errs <- n.node.Start(ctx)
   172  		c.wg.Done()
   173  	}()
   174  	go func(n *node.Node) {
   175  		err := n.Err(errCtx)
   176  		select {
   177  		case <-errCtx.Done():
   178  		default:
   179  			done <- err
   180  		}
   181  		c.wg.Done()
   182  	}(n.node)
   183  
   184  	select {
   185  	case <-n.node.Ready():
   186  	case err := <-done:
   187  		return err
   188  	case <-time.After(opsTimeout):
   189  		return fmt.Errorf("node did not become ready in time")
   190  	}
   191  
   192  	return nil
   193  }
   194  
   195  // CreateService creates a dummy service.
   196  func (c *testCluster) CreateService(name string, instances int) (string, error) {
   197  	spec := &api.ServiceSpec{
   198  		Annotations: api.Annotations{Name: name},
   199  		Mode: &api.ServiceSpec_Replicated{
   200  			Replicated: &api.ReplicatedService{
   201  				Replicas: uint64(instances),
   202  			},
   203  		},
   204  		Task: api.TaskSpec{
   205  			Runtime: &api.TaskSpec_Container{
   206  				Container: &api.ContainerSpec{Image: "alpine", Command: []string{"sh"}},
   207  			},
   208  		},
   209  	}
   210  
   211  	resp, err := c.api.CreateService(context.Background(), &api.CreateServiceRequest{Spec: spec})
   212  	if err != nil {
   213  		return "", err
   214  	}
   215  	return resp.Service.ID, nil
   216  }
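
// exampleCreateServiceAndWait is an illustrative sketch, not part of the
// original file: it creates a dummy service and polls until the control API
// reports it. It assumes the dummyAPI also proxies GetService (not shown in
// this file) using the standard api.GetServiceRequest message.
func exampleCreateServiceAndWait(c *testCluster) error {
	id, err := c.CreateService("integration-svc", 2)
	if err != nil {
		return err
	}
	return testutils.PollFuncWithTimeout(nil, func() error {
		if _, err := c.api.GetService(context.Background(), &api.GetServiceRequest{ServiceID: id}); err != nil {
			return fmt.Errorf("service %s not visible yet: %v", id, err)
		}
		return nil
	}, opsTimeout)
}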
   217  
   218  // Leader returns the testNode for the cluster leader.
   219  func (c *testCluster) Leader() (*testNode, error) {
   220  	resp, err := c.api.ListNodes(context.Background(), &api.ListNodesRequest{
   221  		Filters: &api.ListNodesRequest_Filters{
   222  			Roles: []api.NodeRole{api.NodeRoleManager},
   223  		},
   224  	})
   225  	if err != nil {
   226  		return nil, err
   227  	}
   228  	for _, n := range resp.Nodes {
   229  		if n.ManagerStatus.Leader {
   230  			tn, ok := c.nodes[n.ID]
   231  			if !ok {
   232  				return nil, fmt.Errorf("leader ID is %s, but it was not found in the test cluster object", n.ID)
   233  			}
   234  			return tn, nil
   235  		}
   236  	}
   237  	return nil, fmt.Errorf("cluster leader not found in API response")
   238  }
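
// exampleLeaderFailover is an illustrative sketch, not part of the original
// file: it stops the current leader and polls until a different manager
// reports itself as leader. It assumes the cluster keeps a raft quorum after
// the leader stops (e.g. it has three or more managers).
func exampleLeaderFailover(c *testCluster) error {
	oldLeader, err := c.Leader()
	if err != nil {
		return err
	}
	oldID := oldLeader.node.NodeID()
	if err := oldLeader.Stop(); err != nil {
		return err
	}
	return testutils.PollFuncWithTimeout(nil, func() error {
		newLeader, err := c.Leader()
		if err != nil {
			return err
		}
		if newLeader.node.NodeID() == oldID {
			return fmt.Errorf("leadership has not moved off node %s yet", oldID)
		}
		return nil
	}, opsTimeout)
}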
   239  
   240  // RemoveNode removes a node entirely. If the node is a manager, it is demoted to a worker first.
   241  func (c *testCluster) RemoveNode(id string, graceful bool) error {
   242  	node, ok := c.nodes[id]
   243  	if !ok {
   244  		return fmt.Errorf("remove node: node %s not found", id)
   245  	}
   246  	// demote before removal
   247  	if node.IsManager() {
   248  		if err := c.SetNodeRole(id, api.NodeRoleWorker); err != nil {
   249  			return fmt.Errorf("demote manager: %v", err)
   250  		}
   251  
   252  	}
   253  	if err := node.Stop(); err != nil {
   254  		return err
   255  	}
   256  	delete(c.nodes, id)
   257  	if graceful {
   258  		if err := testutils.PollFuncWithTimeout(nil, func() error {
   259  			resp, err := c.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: id})
   260  			if err != nil {
   261  				return fmt.Errorf("get node: %v", err)
   262  			}
   263  			if resp.Node.Status.State != api.NodeStatus_DOWN {
   264  				return fmt.Errorf("node %s is still not down", id)
   265  			}
   266  			return nil
   267  		}, opsTimeout); err != nil {
   268  			return err
   269  		}
   270  	}
   271  	if _, err := c.api.RemoveNode(context.Background(), &api.RemoveNodeRequest{NodeID: id, Force: !graceful}); err != nil {
   272  		return fmt.Errorf("remove node: %v", err)
   273  	}
   274  	return nil
   275  }
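
// exampleRemoveRandomManager is an illustrative sketch, not part of the
// original file: it gracefully removes a randomly chosen manager. RemoveNode
// demotes it to a worker first and, because graceful is true, waits for the
// node to report DOWN before removing it through the control API.
func exampleRemoveRandomManager(c *testCluster) error {
	m := c.RandomManager()
	return c.RemoveNode(m.node.NodeID(), true)
}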
   276  
   277  // SetNodeRole sets the role of a node through the control API.
   278  func (c *testCluster) SetNodeRole(id string, role api.NodeRole) error {
   279  	node, ok := c.nodes[id]
   280  	if !ok {
   281  		return fmt.Errorf("set node role: node %s not found", id)
   282  	}
   283  	if node.IsManager() && role == api.NodeRoleManager {
   284  		return fmt.Errorf("node is already a manager")
   285  	}
   286  	if !node.IsManager() && role == api.NodeRoleWorker {
   287  		return fmt.Errorf("node is already a worker")
   288  	}
   289  
   290  	var initialTimeout time.Duration
   291  	// version might change between get and update, so retry
   292  	for i := 0; i < 5; i++ {
   293  		time.Sleep(initialTimeout)
   294  		initialTimeout += 500 * time.Millisecond
   295  		resp, err := c.api.GetNode(context.Background(), &api.GetNodeRequest{NodeID: id})
   296  		if err != nil {
   297  			return err
   298  		}
   299  		spec := resp.Node.Spec.Copy()
   300  		spec.DesiredRole = role
   301  		if _, err := c.api.UpdateNode(context.Background(), &api.UpdateNodeRequest{
   302  			NodeID:      id,
   303  			Spec:        spec,
   304  			NodeVersion: &resp.Node.Meta.Version,
   305  		}); err != nil {
   306  			// UpdateNode can fail here because the redirecting node or the
   307  			// leader might be shutting down
   308  			if testutils.ErrorDesc(err) == "update out of sequence" {
   309  				continue
   310  			}
   311  			return err
   312  		}
   313  		if role == api.NodeRoleManager {
   314  			// wait to become manager
   315  			return testutils.PollFuncWithTimeout(nil, func() error {
   316  				if !node.IsManager() {
   317  					return fmt.Errorf("node is still not a manager")
   318  				}
   319  				return nil
   320  			}, opsTimeout)
   321  		}
   322  		// wait to become worker
   323  		return testutils.PollFuncWithTimeout(nil, func() error {
   324  			if node.IsManager() {
   325  				return fmt.Errorf("node is still not a worker")
   326  			}
   327  			return nil
   328  		}, opsTimeout)
   329  	}
   330  	return fmt.Errorf("failed to set role %s for node %s: got an out-of-sequence error 5 times", role, id)
   331  }
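
// examplePromoteWorker is an illustrative sketch, not part of the original
// file: it promotes the first worker it finds to a manager, relying on
// SetNodeRole's built-in retry and readiness polling.
func examplePromoteWorker(c *testCluster) error {
	for id, n := range c.nodes {
		if !n.IsManager() {
			return c.SetNodeRole(id, api.NodeRoleManager)
		}
	}
	return fmt.Errorf("no worker node available to promote")
}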
   332  
   333  // StartNode starts a node from a stopped state.
   334  func (c *testCluster) StartNode(id string) error {
   335  	n, ok := c.nodes[id]
   336  	if !ok {
   337  		return fmt.Errorf("start node: node %s not found", id)
   338  	}
   339  	if err := c.runNode(n, c.nodesOrder[id]); err != nil {
   340  		return err
   341  	}
   342  	if n.node.NodeID() != id {
   343  		return fmt.Errorf("restarted node does not have the same ID")
   344  	}
   345  	return nil
   346  }
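
// exampleRestartNode is an illustrative sketch, not part of the original
// file: it stops a node and starts it again under the same ID. It assumes,
// as the StartNode comment above suggests, that stopping a testNode leaves
// its state directory intact so the node can rejoin with the same identity.
func exampleRestartNode(c *testCluster, id string) error {
	n, ok := c.nodes[id]
	if !ok {
		return fmt.Errorf("restart node: node %s not found", id)
	}
	if err := n.Stop(); err != nil {
		return err
	}
	return c.StartNode(id)
}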
   347  
   348  func (c *testCluster) GetClusterInfo() (*api.Cluster, error) {
   349  	clusterInfo, err := c.api.ListClusters(context.Background(), &api.ListClustersRequest{})
   350  	if err != nil {
   351  		return nil, err
   352  	}
   353  	if len(clusterInfo.Clusters) != 1 {
   354  		return nil, fmt.Errorf("number of clusters in storage: %d; expected 1", len(clusterInfo.Clusters))
   355  	}
   356  	return clusterInfo.Clusters[0], nil
   357  }
   358  
   359  func (c *testCluster) RotateRootCA(cert, key []byte) error {
   360  	// poll in case something else changes the cluster before we can update it
   361  	return testutils.PollFuncWithTimeout(nil, func() error {
   362  		clusterInfo, err := c.GetClusterInfo()
   363  		if err != nil {
   364  			return err
   365  		}
   366  		newSpec := clusterInfo.Spec.Copy()
   367  		newSpec.CAConfig.SigningCACert = cert
   368  		newSpec.CAConfig.SigningCAKey = key
   369  		_, err = c.api.UpdateCluster(context.Background(), &api.UpdateClusterRequest{
   370  			ClusterID:      clusterInfo.ID,
   371  			Spec:           newSpec,
   372  			ClusterVersion: &clusterInfo.Meta.Version,
   373  		})
   374  		return err
   375  	}, opsTimeout)
   376  }
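
// exampleRotateRootCA is an illustrative sketch, not part of the original
// file: it submits an externally generated signing cert/key pair and polls
// until the cluster object reports the new root certificate, assuming that
// the cluster's RootCA.CACert is updated once the rotation completes.
func exampleRotateRootCA(c *testCluster, newCert, newKey []byte) error {
	if err := c.RotateRootCA(newCert, newKey); err != nil {
		return err
	}
	return testutils.PollFuncWithTimeout(nil, func() error {
		clusterInfo, err := c.GetClusterInfo()
		if err != nil {
			return err
		}
		if string(clusterInfo.RootCA.CACert) != string(newCert) {
			return fmt.Errorf("root CA rotation has not completed yet")
		}
		return nil
	}, opsTimeout)
}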
   377  
   378  func (c *testCluster) RotateUnlockKey() error {
   379  	// poll in case something else changes the cluster before we can update it
   380  	return testutils.PollFuncWithTimeout(nil, func() error {
   381  		clusterInfo, err := c.GetClusterInfo()
   382  		if err != nil {
   383  			return err
   384  		}
   385  		_, err = c.api.UpdateCluster(context.Background(), &api.UpdateClusterRequest{
   386  			ClusterID:      clusterInfo.ID,
   387  			Spec:           &clusterInfo.Spec,
   388  			ClusterVersion: &clusterInfo.Meta.Version,
   389  			Rotation: api.KeyRotation{
   390  				ManagerUnlockKey: true,
   391  			},
   392  		})
   393  		return err
   394  	}, opsTimeout)
   395  }
   396  
   397  func (c *testCluster) AutolockManagers(autolock bool) error {
   398  	// poll in case something else changes the cluster before we can update it
   399  	return testutils.PollFuncWithTimeout(nil, func() error {
   400  		clusterInfo, err := c.GetClusterInfo()
   401  		if err != nil {
   402  			return err
   403  		}
   404  		newSpec := clusterInfo.Spec.Copy()
   405  		newSpec.EncryptionConfig.AutoLockManagers = autolock
   406  		_, err = c.api.UpdateCluster(context.Background(), &api.UpdateClusterRequest{
   407  			ClusterID:      clusterInfo.ID,
   408  			Spec:           newSpec,
   409  			ClusterVersion: &clusterInfo.Meta.Version,
   410  		})
   411  		return err
   412  	}, opsTimeout)
   413  }
   414  
   415  func (c *testCluster) GetUnlockKey() (string, error) {
   416  	opts := []grpc.DialOption{}
   417  	insecureCreds := credentials.NewTLS(&tls.Config{InsecureSkipVerify: true})
   418  	opts = append(opts, grpc.WithTransportCredentials(insecureCreds))
   419  	opts = append(opts, grpc.WithDialer(
   420  		func(addr string, timeout time.Duration) (net.Conn, error) {
   421  			return net.DialTimeout("unix", addr, timeout)
   422  		}))
   423  	conn, err := grpc.Dial(c.RandomManager().config.ListenControlAPI, opts...)
   424  	if err != nil {
   425  		return "", err
   426  	}
   427  
   428  	resp, err := api.NewCAClient(conn).GetUnlockKey(context.Background(), &api.GetUnlockKeyRequest{})
   429  	if err != nil {
   430  		return "", err
   431  	}
   432  
   433  	return encryption.HumanReadableKey(resp.UnlockKey), nil
   434  }
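
// exampleRotateUnlockKey is an illustrative sketch, not part of the original
// file: it enables manager autolock, reads the current unlock key, rotates
// it, and polls until GetUnlockKey returns a different value.
func exampleRotateUnlockKey(c *testCluster) error {
	if err := c.AutolockManagers(true); err != nil {
		return err
	}
	before, err := c.GetUnlockKey()
	if err != nil {
		return err
	}
	if err := c.RotateUnlockKey(); err != nil {
		return err
	}
	return testutils.PollFuncWithTimeout(nil, func() error {
		after, err := c.GetUnlockKey()
		if err != nil {
			return err
		}
		if after == before {
			return fmt.Errorf("unlock key has not been rotated yet")
		}
		return nil
	}, opsTimeout)
}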