github.com/docker/docker@v299999999.0.0-20200612211812-aaf470eca7b5+incompatible/daemon/cluster/swarm.go (about)

     1  package cluster // import "github.com/docker/docker/daemon/cluster"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"net"
     7  	"strings"
     8  	"time"
     9  
    10  	apitypes "github.com/docker/docker/api/types"
    11  	"github.com/docker/docker/api/types/filters"
    12  	types "github.com/docker/docker/api/types/swarm"
    13  	"github.com/docker/docker/daemon/cluster/convert"
    14  	"github.com/docker/docker/errdefs"
    15  	"github.com/docker/docker/opts"
    16  	"github.com/docker/docker/pkg/signal"
    17  	swarmapi "github.com/docker/swarmkit/api"
    18  	"github.com/docker/swarmkit/manager/encryption"
    19  	swarmnode "github.com/docker/swarmkit/node"
    20  	"github.com/pkg/errors"
    21  	"github.com/sirupsen/logrus"
    22  	"google.golang.org/grpc"
    23  )
    24  
// Init initializes a new cluster from the user-provided request and returns
// the ID of the local node on success.
//
// If a node runner already exists, the call fails with errSwarmExists unless
// req.ForceNewCluster is set, in which case the existing node must be a
// manager and is stopped before re-initialization. c.controlMutex serializes
// init/join/leave/unlock operations for the whole call; c.mu is taken only in
// short critical sections around reads/writes of c.nr.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	if c.nr != nil {
		if req.ForceNewCluster {

			// Take c.mu temporarily to wait for presently running
			// API handlers to finish before shutting down the node.
			c.mu.Lock()
			if !c.nr.nodeState.IsManager() {
				c.mu.Unlock()
				return "", errSwarmNotManager
			}
			c.mu.Unlock()

			if err := c.nr.Stop(); err != nil {
				return "", err
			}
		} else {
			return "", errSwarmExists
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		return "", errdefs.InvalidParameter(err)
	}

	// Resolve listen, advertise, and data-path addresses up front so a bad
	// address fails the request before any node is started.
	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return "", err
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		return "", err
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return "", err
	}

	localAddr := listenHost

	// If the local address is undetermined, the advertise address
	// will be used as local address, if it belongs to this system.
	// If the advertise address is not local, then we try to find
	// a system address to use as local address. If this fails,
	// we give up and ask the user to pass the listen address.
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)

		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}

		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				logrus.Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}

	if err := validateDefaultAddrPool(req.DefaultAddrPool, req.SubnetSize); err != nil {
		return "", err
	}

	port, err := getDataPathPort(req.DataPathPort)
	if err != nil {
		return "", err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		forceNewCluster:    req.ForceNewCluster,
		autolock:           req.AutoLockManagers,
		LocalAddr:          localAddr,
		ListenAddr:         net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:      net.JoinHostPort(advertiseHost, advertisePort),
		DataPathAddr:       dataPathAddr,
		DefaultAddressPool: req.DefaultAddrPool,
		SubnetSize:         req.SubnetSize,
		availability:       req.Availability,
		DataPathPort:       port,
	})
	if err != nil {
		return "", err
	}
	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	// Block until the node reports ready or fails to start.
	if err := <-nr.Ready(); err != nil {
		c.mu.Lock()
		c.nr = nil
		c.mu.Unlock()
		if !req.ForceNewCluster { // if failure on first attempt don't keep state
			if err := clearPersistentState(c.root); err != nil {
				return "", err
			}
		}
		return "", err
	}
	state := nr.State()
	if state.swarmNode == nil { // should never happen but protect from panic
		return "", errors.New("invalid cluster state for spec initialization")
	}
	// Merge the user-supplied spec over swarmkit's freshly created defaults.
	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
		return "", err
	}
	return state.NodeID(), nil
}
   145  
// Join makes current Cluster part of an existing swarm cluster.
//
// The request is validated/sanitized, a node runner is started pointed at the
// first remote address, and the call blocks until the node reports ready or
// swarmConnectTimeout elapses. On timeout an error is returned but c.nr stays
// set — presumably the runner keeps attempting the join in the background
// (NOTE(review): confirm against nodeRunner behavior). If the ready channel
// yields an error, the runner is discarded and local swarm state is cleared.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	if c.nr != nil {
		c.mu.Unlock()
		return errors.WithStack(errSwarmExists)
	}
	c.mu.Unlock()

	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		return errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		DataPathAddr:  dataPathAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
		availability:  req.Availability,
	})
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	timeout := time.NewTimer(swarmConnectTimeout)
	defer timeout.Stop()

	select {
	case <-timeout.C:
		// Deliberately leaves c.nr in place; only the caller's wait times out.
		return errSwarmJoinTimeoutReached
	case err := <-nr.Ready():
		if err != nil {
			c.mu.Lock()
			c.nr = nil
			c.mu.Unlock()
			// Don't keep persistent state from a failed join attempt.
			if err := clearPersistentState(c.root); err != nil {
				return err
			}
		}
		return err
	}
}
   216  
   217  // Inspect retrieves the configuration properties of a managed swarm cluster.
   218  func (c *Cluster) Inspect() (types.Swarm, error) {
   219  	var swarm types.Swarm
   220  	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
   221  		s, err := c.inspect(ctx, state)
   222  		if err != nil {
   223  			return err
   224  		}
   225  		swarm = s
   226  		return nil
   227  	}); err != nil {
   228  		return types.Swarm{}, err
   229  	}
   230  	return swarm, nil
   231  }
   232  
   233  func (c *Cluster) inspect(ctx context.Context, state nodeState) (types.Swarm, error) {
   234  	s, err := getSwarm(ctx, state.controlClient)
   235  	if err != nil {
   236  		return types.Swarm{}, err
   237  	}
   238  	return convert.SwarmFromGRPC(*s), nil
   239  }
   240  
   241  // Update updates configuration of a managed swarm cluster.
   242  func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
   243  	return c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
   244  		swarm, err := getSwarm(ctx, state.controlClient)
   245  		if err != nil {
   246  			return err
   247  		}
   248  
   249  		// Validate spec name.
   250  		if spec.Annotations.Name == "" {
   251  			spec.Annotations.Name = "default"
   252  		} else if spec.Annotations.Name != "default" {
   253  			return errdefs.InvalidParameter(errors.New(`swarm spec must be named "default"`))
   254  		}
   255  
   256  		// In update, client should provide the complete spec of the swarm, including
   257  		// Name and Labels. If a field is specified with 0 or nil, then the default value
   258  		// will be used to swarmkit.
   259  		clusterSpec, err := convert.SwarmSpecToGRPC(spec)
   260  		if err != nil {
   261  			return errdefs.InvalidParameter(err)
   262  		}
   263  
   264  		_, err = state.controlClient.UpdateCluster(
   265  			ctx,
   266  			&swarmapi.UpdateClusterRequest{
   267  				ClusterID: swarm.ID,
   268  				Spec:      &clusterSpec,
   269  				ClusterVersion: &swarmapi.Version{
   270  					Index: version,
   271  				},
   272  				Rotation: swarmapi.KeyRotation{
   273  					WorkerJoinToken:  flags.RotateWorkerToken,
   274  					ManagerJoinToken: flags.RotateManagerToken,
   275  					ManagerUnlockKey: flags.RotateManagerUnlockKey,
   276  				},
   277  			},
   278  		)
   279  		return err
   280  	})
   281  }
   282  
   283  // GetUnlockKey returns the unlock key for the swarm.
   284  func (c *Cluster) GetUnlockKey() (string, error) {
   285  	var resp *swarmapi.GetUnlockKeyResponse
   286  	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
   287  		client := swarmapi.NewCAClient(state.grpcConn)
   288  
   289  		r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
   290  		if err != nil {
   291  			return err
   292  		}
   293  		resp = r
   294  		return nil
   295  	}); err != nil {
   296  		return "", err
   297  	}
   298  	if len(resp.UnlockKey) == 0 {
   299  		// no key
   300  		return "", nil
   301  	}
   302  	return encryption.HumanReadableKey(resp.UnlockKey), nil
   303  }
   304  
// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
//
// The flow is: verify the local node is actually in the locked state, parse
// the human-readable key, then stop the current node runner and start a new
// one with the lock key set in its start config, blocking until the restarted
// node reports ready. Returns notLockedError if the manager is active (i.e.
// not locked) and invalidUnlockKey if the node comes back still locked.
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.RLock()
	state := c.currentNodeState()

	if !state.IsActiveManager() {
		// when manager is not active,
		// unless it is locked, otherwise return error.
		if err := c.errNoManager(state); err != errSwarmLocked {
			c.mu.RUnlock()
			return err
		}
	} else {
		// when manager is active, return an error of "not locked"
		c.mu.RUnlock()
		return notLockedError{}
	}

	// only when swarm is locked, code running reaches here
	nr := c.nr
	c.mu.RUnlock()

	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return errdefs.InvalidParameter(err)
	}

	// Restart the node with the candidate unlock key in its start config.
	config := nr.config
	config.lockKey = key
	if err := nr.Stop(); err != nil {
		return err
	}
	nr, err = c.newNodeRunner(config)
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		// Coming back still locked means the supplied key was wrong.
		if errors.Is(err, errSwarmLocked) {
			return invalidUnlockKey{}
		}
		return errors.Errorf("swarm component could not be started: %v", err)
	}
	return nil
}
   357  
// Leave shuts down Cluster and removes current state.
//
// Without force, leaving is refused when the swarm is locked, or when this
// node is a manager whose departure could lose raft quorum (with a detailed
// message explaining the consequences). After the node runner is stopped,
// containers labeled with this node's ID are force-removed, persistent swarm
// state on disk is cleared, and the backend is notified that the daemon has
// left the cluster.
func (c *Cluster) Leave(force bool) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.Lock()
	nr := c.nr
	if nr == nil {
		c.mu.Unlock()
		return errors.WithStack(errNoSwarm)
	}

	state := c.currentNodeState()

	c.mu.Unlock()

	if errors.Is(state.err, errSwarmLocked) && !force {
		// leave a locked swarm without --force is not allowed
		return errors.WithStack(notAvailableError("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message."))
	}

	if state.IsManager() && !force {
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if state.IsActiveManager() {
			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
			// Best effort: if manager stats can't be fetched, fall through to
			// the generic warning below instead of failing the check.
			if err == nil {
				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
					if isLastManager(reachable, unreachable) {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						return errors.WithStack(notAvailableError(msg))
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}

		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		return errors.WithStack(notAvailableError(msg))
	}
	// release readers in here
	if err := nr.Stop(); err != nil {
		logrus.Errorf("failed to shut down cluster node: %v", err)
		signal.DumpStacks("")
		return err
	}

	c.mu.Lock()
	c.nr = nil
	c.mu.Unlock()

	// Remove containers that belonged to this swarm node; removal failures
	// are logged but do not abort the leave.
	if nodeID := state.NodeID(); nodeID != "" {
		nodeContainers, err := c.listContainerForNode(nodeID)
		if err != nil {
			return err
		}
		for _, id := range nodeContainers {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				logrus.Errorf("error removing %v: %v", id, err)
			}
		}
	}

	// todo: cleanup optional?
	if err := clearPersistentState(c.root); err != nil {
		return err
	}
	c.config.Backend.DaemonLeavesCluster()
	return nil
}
   429  
// Info returns information about the current cluster state.
//
// This is a best-effort snapshot taken under c.mu (read lock): individual
// query failures are recorded in info.Error rather than aborting. Note that
// when the node is an active manager, info.Cluster is assigned even if the
// inspect call failed — it then points at a zero-valued ClusterInfo.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	info.LocalNodeState = state.status
	if state.err != nil {
		info.Error = state.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if state.IsActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.inspect(ctx, state)
		if err != nil {
			info.Error = err.Error()
		}

		info.Cluster = &swarm.ClusterInfo

		// Count nodes and managers; a larger receive size is needed because
		// the node list can exceed the default gRPC message limit.
		if r, err := state.controlClient.ListNodes(
			ctx, &swarmapi.ListNodesRequest{},
			grpc.MaxCallRecvMsgSize(defaultRecvSizeForListResponse),
		); err != nil {
			info.Error = err.Error()
		} else {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}

		// Warn about two-manager setups: losing either manager loses quorum,
		// so this configuration is strictly worse than a single manager.
		switch info.LocalNodeState {
		case types.LocalNodeStateInactive, types.LocalNodeStateLocked, types.LocalNodeStateError:
			// nothing to do
		default:
			if info.Managers == 2 {
				const warn string = `WARNING: Running Swarm in a two-manager configuration. This configuration provides
         no fault tolerance, and poses a high risk to lose control over the cluster.
         Refer to https://docs.docker.com/engine/swarm/admin_guide/ to configure the
         Swarm for fault-tolerance.`

				info.Warnings = append(info.Warnings, warn)
			}
		}
	}

	if state.swarmNode != nil {
		for _, r := range state.swarmNode.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = state.swarmNode.NodeID()
	}

	return info
}
   494  
   495  func validateAndSanitizeInitRequest(req *types.InitRequest) error {
   496  	var err error
   497  	req.ListenAddr, err = validateAddr(req.ListenAddr)
   498  	if err != nil {
   499  		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
   500  	}
   501  
   502  	if req.Spec.Annotations.Name == "" {
   503  		req.Spec.Annotations.Name = "default"
   504  	} else if req.Spec.Annotations.Name != "default" {
   505  		return errors.New(`swarm spec must be named "default"`)
   506  	}
   507  
   508  	return nil
   509  }
   510  
   511  func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
   512  	var err error
   513  	req.ListenAddr, err = validateAddr(req.ListenAddr)
   514  	if err != nil {
   515  		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
   516  	}
   517  	if len(req.RemoteAddrs) == 0 {
   518  		return errors.New("at least 1 RemoteAddr is required to join")
   519  	}
   520  	for i := range req.RemoteAddrs {
   521  		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
   522  		if err != nil {
   523  			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
   524  		}
   525  	}
   526  	return nil
   527  }
   528  
// validateAddr rejects an empty address and otherwise attempts to normalize
// addr via opts.ParseTCPAddr with the cluster default, stripping the "tcp://"
// scheme from the result.
//
// NOTE(review): when ParseTCPAddr fails, the raw address is returned with a
// nil error — presumably so non-TCP values (e.g. a network interface name)
// pass through for later resolution. Confirm that intent before treating the
// swallowed error as a bug.
func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, errors.New("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}
   539  
// initClusterSpec applies the user-supplied spec to a freshly initialized
// cluster by merging it over the defaults swarmkit created. The whole
// operation is bounded by a 5-second timeout; listing clusters is retried up
// to 10 times, 200ms apart, to allow swarmkit time to publish the initial
// cluster object.
func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	// ListenControlSocket yields connection updates until ctx is done.
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return errors.New("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			// In init, we take the initial default values from swarmkit, and merge
			// any non nil or 0 value from spec to GRPC spec. This will leave the
			// default value alone.
			// Note that this is different from Update(), as in Update() we expect
			// user to specify the complete spec of the cluster (as they already know
			// the existing one and knows which field to update)
			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &clusterSpec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}
   588  
   589  func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
   590  	var ids []string
   591  	filters := filters.NewArgs()
   592  	filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
   593  	containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
   594  		Filters: filters,
   595  	})
   596  	if err != nil {
   597  		return []string{}, err
   598  	}
   599  	for _, c := range containers {
   600  		ids = append(ids, c.ID)
   601  	}
   602  	return ids, nil
   603  }