github.com/Prakhar-Agarwal-byte/moby@v0.0.0-20231027092010-a14e3e8ab87e/daemon/cluster/swarm.go

package cluster // import "github.com/Prakhar-Agarwal-byte/moby/daemon/cluster"

import (
	"context"
	"fmt"
	"net"
	"strings"
	"time"

	apitypes "github.com/Prakhar-Agarwal-byte/moby/api/types"
	"github.com/Prakhar-Agarwal-byte/moby/api/types/container"
	"github.com/Prakhar-Agarwal-byte/moby/api/types/filters"
	types "github.com/Prakhar-Agarwal-byte/moby/api/types/swarm"
	"github.com/Prakhar-Agarwal-byte/moby/daemon/cluster/convert"
	"github.com/Prakhar-Agarwal-byte/moby/errdefs"
	"github.com/Prakhar-Agarwal-byte/moby/opts"
	"github.com/Prakhar-Agarwal-byte/moby/pkg/stack"
	"github.com/containerd/log"
	swarmapi "github.com/moby/swarmkit/v2/api"
	"github.com/moby/swarmkit/v2/manager/encryption"
	swarmnode "github.com/moby/swarmkit/v2/node"
	"github.com/pkg/errors"
	"google.golang.org/grpc"
)

// Init initializes a new cluster from a user-provided request.
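//
// A minimal caller-side request might look like the following sketch
// (values are illustrative, not defaults taken from this package):
//
//	req := types.InitRequest{
//		ListenAddr:    "0.0.0.0:2377",
//		AdvertiseAddr: "192.168.1.10:2377",
//	}
//	nodeID, err := c.Init(req)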
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	if c.nr != nil {
		if req.ForceNewCluster {

			// Take c.mu temporarily to wait for presently running
			// API handlers to finish before shutting down the node.
			c.mu.Lock()
			if !c.nr.nodeState.IsManager() {
				c.mu.Unlock()
				return "", errSwarmNotManager
			}
			c.mu.Unlock()

			if err := c.nr.Stop(); err != nil {
				return "", err
			}
		} else {
			return "", errSwarmExists
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		return "", errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return "", errdefs.InvalidParameter(err)
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		return "", err
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return "", err
	}

	localAddr := listenHost

	// If the local address is undetermined, the advertise address
	// will be used as local address, if it belongs to this system.
	// If the advertise address is not local, then we try to find
	// a system address to use as local address. If this fails,
	// we give up and ask the user to pass the listen address.
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)

		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}

		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				log.G(context.TODO()).Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}

	if err := validateDefaultAddrPool(req.DefaultAddrPool, req.SubnetSize); err != nil {
		return "", err
	}

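	// A DataPathPort of 0 selects the default VXLAN data path port (4789).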
	port, err := getDataPathPort(req.DataPathPort)
	if err != nil {
		return "", err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		forceNewCluster:    req.ForceNewCluster,
		autolock:           req.AutoLockManagers,
		LocalAddr:          localAddr,
		ListenAddr:         net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:      net.JoinHostPort(advertiseHost, advertisePort),
		DataPathAddr:       dataPathAddr,
		DefaultAddressPool: req.DefaultAddrPool,
		SubnetSize:         req.SubnetSize,
		availability:       req.Availability,
		DataPathPort:       port,
	})
	if err != nil {
		return "", err
	}
	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

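	// Block until the node has either started successfully or failed to start.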
	if err := <-nr.Ready(); err != nil {
		c.mu.Lock()
		c.nr = nil
		c.mu.Unlock()
		if !req.ForceNewCluster { // if failure on first attempt don't keep state
			if err := clearPersistentState(c.root); err != nil {
				return "", err
			}
		}
		return "", err
	}
	state := nr.State()
	if state.swarmNode == nil { // should never happen but protect from panic
		return "", errors.New("invalid cluster state for spec initialization")
	}
	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
		return "", err
	}
	return state.NodeID(), nil
}

// Join makes the current Cluster part of an existing swarm cluster.
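//
// A minimal caller-side request might look like the following sketch
// (values are illustrative; the join token comes from an existing manager):
//
//	err := c.Join(types.JoinRequest{
//		RemoteAddrs: []string{"192.168.1.10:2377"},
//		JoinToken:   "SWMTKN-1-...",
//	})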
func (c *Cluster) Join(req types.JoinRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	if c.nr != nil {
		c.mu.Unlock()
		return errors.WithStack(errSwarmExists)
	}
	c.mu.Unlock()

	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		return errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		DataPathAddr:  dataPathAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
		availability:  req.Availability,
	})
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

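	// Unlike Init, joining waits at most swarmConnectTimeout for the node to
	// become ready before giving up.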
	timeout := time.NewTimer(swarmConnectTimeout)
	defer timeout.Stop()

	select {
	case <-timeout.C:
		return errSwarmJoinTimeoutReached
	case err := <-nr.Ready():
		if err != nil {
			c.mu.Lock()
			c.nr = nil
			c.mu.Unlock()
			if err := clearPersistentState(c.root); err != nil {
				return err
			}
		}
		return err
	}
}

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	var swarm types.Swarm
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		s, err := c.inspect(ctx, state)
		if err != nil {
			return err
		}
		swarm = s
		return nil
	}); err != nil {
		return types.Swarm{}, err
	}
	return swarm, nil
}

func (c *Cluster) inspect(ctx context.Context, state nodeState) (types.Swarm, error) {
	s, err := getSwarm(ctx, state.controlClient)
	if err != nil {
		return types.Swarm{}, err
	}
	return convert.SwarmFromGRPC(*s), nil
}

// Update updates the configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	return c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		swarm, err := getSwarm(ctx, state.controlClient)
		if err != nil {
			return err
		}

		// Validate spec name.
		if spec.Annotations.Name == "" {
			spec.Annotations.Name = "default"
		} else if spec.Annotations.Name != "default" {
			return errdefs.InvalidParameter(errors.New(`swarm spec must be named "default"`))
		}

		// In Update, the client is expected to provide the complete spec of
		// the swarm, including Name and Labels. If a field is specified as
		// 0 or nil, the swarmkit default value will be used.
		clusterSpec, err := convert.SwarmSpecToGRPC(spec)
		if err != nil {
			return errdefs.InvalidParameter(err)
		}

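		// The caller-supplied version acts as an optimistic concurrency
		// check: swarmkit rejects the update if the cluster has changed
		// since that version was read.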
		_, err = state.controlClient.UpdateCluster(
			ctx,
			&swarmapi.UpdateClusterRequest{
				ClusterID: swarm.ID,
				Spec:      &clusterSpec,
				ClusterVersion: &swarmapi.Version{
					Index: version,
				},
				Rotation: swarmapi.KeyRotation{
					WorkerJoinToken:  flags.RotateWorkerToken,
					ManagerJoinToken: flags.RotateManagerToken,
					ManagerUnlockKey: flags.RotateManagerUnlockKey,
				},
			},
		)
		return err
	})
}

// GetUnlockKey returns the unlock key for the swarm.
func (c *Cluster) GetUnlockKey() (string, error) {
	var resp *swarmapi.GetUnlockKeyResponse
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		client := swarmapi.NewCAClient(state.grpcConn)

		r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
		if err != nil {
			return err
		}
		resp = r
		return nil
	}); err != nil {
		return "", err
	}
	if len(resp.UnlockKey) == 0 {
		// no key
		return "", nil
	}
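	// Return the key in its human-readable form, the same representation
	// that UnlockSwarm later parses with ParseHumanReadableKey.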
	return encryption.HumanReadableKey(resp.UnlockKey), nil
}

// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.RLock()
	state := c.currentNodeState()

	if !state.IsActiveManager() {
		// when the manager is not active, return the error unless the swarm
		// is merely locked, in which case we continue below to unlock it.
		if err := c.errNoManager(state); err != errSwarmLocked {
			c.mu.RUnlock()
			return err
		}
	} else {
		// when the manager is active, the swarm is not locked, so return a
		// "not locked" error.
		c.mu.RUnlock()
		return notLockedError{}
	}

	// execution only reaches this point when the swarm is locked
	nr := c.nr
	c.mu.RUnlock()

	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return errdefs.InvalidParameter(err)
	}

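	// Restart the node runner with the parsed key so the at-rest encrypted
	// state can be decrypted on startup.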
	config := nr.config
	config.lockKey = key
	if err := nr.Stop(); err != nil {
		return err
	}
	nr, err = c.newNodeRunner(config)
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		if errors.Is(err, errSwarmLocked) {
			return invalidUnlockKey{}
		}
		return errors.Errorf("swarm component could not be started: %v", err)
	}
	return nil
}

// Leave shuts down the Cluster and removes the current state.
func (c *Cluster) Leave(ctx context.Context, force bool) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.Lock()
	nr := c.nr
	if nr == nil {
		c.mu.Unlock()
		return errors.WithStack(errNoSwarm)
	}

	state := c.currentNodeState()

	c.mu.Unlock()

	if errors.Is(state.err, errSwarmLocked) && !force {
		// leaving a locked swarm without --force is not allowed
		return errors.WithStack(notAvailableError("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message."))
	}

	if state.IsManager() && !force {
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if state.IsActiveManager() {
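			// Check whether removing this manager would break the Raft
			// quorum (a majority of managers must remain reachable) so the
			// refusal message below can reflect the severity.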
			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
			if err == nil {
				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
					if isLastManager(reachable, unreachable) {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						return errors.WithStack(notAvailableError(msg))
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}

		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		return errors.WithStack(notAvailableError(msg))
	}
	// release readers in here
	if err := nr.Stop(); err != nil {
		log.G(ctx).Errorf("failed to shut down cluster node: %v", err)
		stack.Dump()
		return err
	}

	c.mu.Lock()
	c.nr = nil
	c.mu.Unlock()

	if nodeID := state.NodeID(); nodeID != "" {
		nodeContainers, err := c.listContainerForNode(ctx, nodeID)
		if err != nil {
			return err
		}
		for _, id := range nodeContainers {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				log.G(ctx).Errorf("error removing %v: %v", id, err)
			}
		}
	}

	// todo: cleanup optional?
	if err := clearPersistentState(c.root); err != nil {
		return err
	}
	c.config.Backend.DaemonLeavesCluster()
	return nil
}

// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	info.LocalNodeState = state.status
	if state.err != nil {
		info.Error = state.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if state.IsActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.inspect(ctx, state)
		if err != nil {
			info.Error = err.Error()
		}

		info.Cluster = &swarm.ClusterInfo

		if r, err := state.controlClient.ListNodes(
			ctx, &swarmapi.ListNodesRequest{},
			grpc.MaxCallRecvMsgSize(defaultRecvSizeForListResponse),
		); err != nil {
			info.Error = err.Error()
		} else {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}

		switch info.LocalNodeState {
		case types.LocalNodeStateInactive, types.LocalNodeStateLocked, types.LocalNodeStateError:
			// nothing to do
		default:
			if info.Managers == 2 {
				const warn string = `WARNING: Running Swarm in a two-manager configuration. This configuration provides
         no fault tolerance, and poses a high risk to lose control over the cluster.
         Refer to https://docs.docker.com/engine/swarm/admin_guide/ to configure the
         Swarm for fault-tolerance.`

				info.Warnings = append(info.Warnings, warn)
			}
		}
	}

	if state.swarmNode != nil {
		for _, r := range state.swarmNode.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = state.swarmNode.NodeID()
	}

	return info
}

// Status returns a textual representation of the node's swarm status and role (manager/worker)
func (c *Cluster) Status() string {
	c.mu.RLock()
	s := c.currentNodeState()
	c.mu.RUnlock()

	state := string(s.status)
	if s.status == types.LocalNodeStateActive {
		if s.IsActiveManager() || s.IsManager() {
			state += "/manager"
		} else {
			state += "/worker"
		}
	}
	return state
}

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	if req.Spec.Annotations.Name == "" {
		req.Spec.Annotations.Name = "default"
	} else if req.Spec.Annotations.Name != "default" {
		return errors.New(`swarm spec must be named "default"`)
	}

	return nil
}

func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return errors.New("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, errors.New("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		// TODO(thaJeztah) why are we ignoring the error here? Is this to allow "non-tcp" addresses?
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}

func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
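			// Poll for the cluster object to appear: retry up to 10 times at
			// 200ms intervals (roughly 2 seconds) before giving up.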
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return errors.New("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			// In Init, we take the initial default values from swarmkit and
			// merge any non-nil or non-zero value from spec into the gRPC
			// spec, leaving the remaining defaults alone.
			// Note that this differs from Update(), where the user is
			// expected to specify the complete spec of the cluster (as they
			// already know the existing one and which fields to update).
			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &clusterSpec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}

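// listContainerForNode returns the IDs of the containers on this daemon that
// are labeled with the given swarm node ID (com.docker.swarm.node.id).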
func (c *Cluster) listContainerForNode(ctx context.Context, nodeID string) ([]string, error) {
	var ids []string
	containers, err := c.config.Backend.Containers(ctx, &container.ListOptions{
		Filters: filters.NewArgs(filters.Arg("label", "com.docker.swarm.node.id="+nodeID)),
	})
	if err != nil {
		return []string{}, err
	}
	for _, c := range containers {
		ids = append(ids, c.ID)
	}
	return ids, nil
}