github.com/jwhonce/docker@v0.6.7-0.20190327063223-da823cf3a5a3/daemon/cluster/swarm.go

     1  package cluster // import "github.com/docker/docker/daemon/cluster"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"net"
     7  	"strings"
     8  	"time"
     9  
    10  	apitypes "github.com/docker/docker/api/types"
    11  	"github.com/docker/docker/api/types/filters"
    12  	types "github.com/docker/docker/api/types/swarm"
    13  	"github.com/docker/docker/daemon/cluster/convert"
    14  	"github.com/docker/docker/errdefs"
    15  	"github.com/docker/docker/opts"
    16  	"github.com/docker/docker/pkg/signal"
    17  	swarmapi "github.com/docker/swarmkit/api"
    18  	"github.com/docker/swarmkit/manager/encryption"
    19  	swarmnode "github.com/docker/swarmkit/node"
    20  	"github.com/pkg/errors"
    21  	"github.com/sirupsen/logrus"
    22  )
    23  
    24  // Init initializes a new cluster from the user-provided request.
    25  func (c *Cluster) Init(req types.InitRequest) (string, error) {
    26  	c.controlMutex.Lock()
    27  	defer c.controlMutex.Unlock()
    28  	if c.nr != nil {
    29  		if req.ForceNewCluster {
    30  
    31  			// Take c.mu temporarily to wait for presently running
    32  			// API handlers to finish before shutting down the node.
    33  			c.mu.Lock()
    34  			if !c.nr.nodeState.IsManager() {
        				c.mu.Unlock()
    35  				return "", errSwarmNotManager
    36  			}
    37  			c.mu.Unlock()
    38  
    39  			if err := c.nr.Stop(); err != nil {
    40  				return "", err
    41  			}
    42  		} else {
    43  			return "", errSwarmExists
    44  		}
    45  	}
    46  
    47  	if err := validateAndSanitizeInitRequest(&req); err != nil {
    48  		return "", errdefs.InvalidParameter(err)
    49  	}
    50  
    51  	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
    52  	if err != nil {
    53  		return "", err
    54  	}
    55  
    56  	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
    57  	if err != nil {
    58  		return "", err
    59  	}
    60  
    61  	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
    62  	if err != nil {
    63  		return "", err
    64  	}
    65  
    66  	localAddr := listenHost
    67  
    68  	// If the local address is undetermined, the advertise address
    69  	// will be used as local address, if it belongs to this system.
    70  	// If the advertise address is not local, then we try to find
    71  	// a system address to use as local address. If this fails,
    72  	// we give up and ask the user to pass the listen address.
    73  	if net.ParseIP(localAddr).IsUnspecified() {
    74  		advertiseIP := net.ParseIP(advertiseHost)
    75  
    76  		found := false
    77  		for _, systemIP := range listSystemIPs() {
    78  			if systemIP.Equal(advertiseIP) {
    79  				localAddr = advertiseIP.String()
    80  				found = true
    81  				break
    82  			}
    83  		}
    84  
    85  		if !found {
    86  			ip, err := c.resolveSystemAddr()
    87  			if err != nil {
    88  				logrus.Warnf("Could not find a local address: %v", err)
    89  				return "", errMustSpecifyListenAddr
    90  			}
    91  			localAddr = ip.String()
    92  		}
    93  	}
    94  
    95  	// Validate default address pool input
    96  	if err := validateDefaultAddrPool(req.DefaultAddrPool, req.SubnetSize); err != nil {
    97  		return "", err
    98  	}
    99  
   100  	port, err := getDataPathPort(req.DataPathPort)
   101  	if err != nil {
   102  		return "", err
   103  	}
   104  
   105  	nr, err := c.newNodeRunner(nodeStartConfig{
   106  		forceNewCluster:    req.ForceNewCluster,
   107  		autolock:           req.AutoLockManagers,
   108  		LocalAddr:          localAddr,
   109  		ListenAddr:         net.JoinHostPort(listenHost, listenPort),
   110  		AdvertiseAddr:      net.JoinHostPort(advertiseHost, advertisePort),
   111  		DataPathAddr:       dataPathAddr,
   112  		DefaultAddressPool: req.DefaultAddrPool,
   113  		SubnetSize:         req.SubnetSize,
   114  		availability:       req.Availability,
   115  		DataPathPort:       port,
   116  	})
   117  	if err != nil {
   118  		return "", err
   119  	}
   120  	c.mu.Lock()
   121  	c.nr = nr
   122  	c.mu.Unlock()
   123  
   124  	if err := <-nr.Ready(); err != nil {
   125  		c.mu.Lock()
   126  		c.nr = nil
   127  		c.mu.Unlock()
   128  		if !req.ForceNewCluster { // if failure on first attempt don't keep state
   129  			if err := clearPersistentState(c.root); err != nil {
   130  				return "", err
   131  			}
   132  		}
   133  		return "", err
   134  	}
   135  	state := nr.State()
   136  	if state.swarmNode == nil { // should never happen but protect from panic
   137  		return "", errors.New("invalid cluster state for spec initialization")
   138  	}
   139  	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
   140  		return "", err
   141  	}
   142  	return state.NodeID(), nil
   143  }
   144  
   145  // Join makes the current Cluster part of an existing swarm cluster.
   146  func (c *Cluster) Join(req types.JoinRequest) error {
   147  	c.controlMutex.Lock()
   148  	defer c.controlMutex.Unlock()
   149  	c.mu.Lock()
   150  	if c.nr != nil {
   151  		c.mu.Unlock()
   152  		return errors.WithStack(errSwarmExists)
   153  	}
   154  	c.mu.Unlock()
   155  
   156  	if err := validateAndSanitizeJoinRequest(&req); err != nil {
   157  		return errdefs.InvalidParameter(err)
   158  	}
   159  
   160  	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
   161  	if err != nil {
   162  		return err
   163  	}
   164  
   165  	var advertiseAddr string
   166  	if req.AdvertiseAddr != "" {
   167  		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
   168  		// For joining, we don't need to provide an advertise address,
   169  		// since the remote side can detect it.
   170  		if err == nil {
   171  			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
   172  		}
   173  	}
   174  
   175  	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
   176  	if err != nil {
   177  		return err
   178  	}
   179  
   180  	nr, err := c.newNodeRunner(nodeStartConfig{
   181  		RemoteAddr:    req.RemoteAddrs[0],
   182  		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
   183  		AdvertiseAddr: advertiseAddr,
   184  		DataPathAddr:  dataPathAddr,
   185  		joinAddr:      req.RemoteAddrs[0],
   186  		joinToken:     req.JoinToken,
   187  		availability:  req.Availability,
   188  	})
   189  	if err != nil {
   190  		return err
   191  	}
   192  
   193  	c.mu.Lock()
   194  	c.nr = nr
   195  	c.mu.Unlock()
   196  
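        	// Block until the node reports readiness or the join timeout elapses. If the
        	// node fails to start, the partially written swarm state is cleared so that a
        	// later join attempt starts from scratch.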
   197  	select {
   198  	case <-time.After(swarmConnectTimeout):
   199  		return errSwarmJoinTimeoutReached
   200  	case err := <-nr.Ready():
   201  		if err != nil {
   202  			c.mu.Lock()
   203  			c.nr = nil
   204  			c.mu.Unlock()
   205  			if err := clearPersistentState(c.root); err != nil {
   206  				return err
   207  			}
   208  		}
   209  		return err
   210  	}
   211  }
   212  
   213  // Inspect retrieves the configuration properties of a managed swarm cluster.
   214  func (c *Cluster) Inspect() (types.Swarm, error) {
   215  	var swarm types.Swarm
   216  	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
   217  		s, err := c.inspect(ctx, state)
   218  		if err != nil {
   219  			return err
   220  		}
   221  		swarm = s
   222  		return nil
   223  	}); err != nil {
   224  		return types.Swarm{}, err
   225  	}
   226  	return swarm, nil
   227  }
   228  
   229  func (c *Cluster) inspect(ctx context.Context, state nodeState) (types.Swarm, error) {
   230  	s, err := getSwarm(ctx, state.controlClient)
   231  	if err != nil {
   232  		return types.Swarm{}, err
   233  	}
   234  	return convert.SwarmFromGRPC(*s), nil
   235  }
   236  
   237  // Update updates the configuration of a managed swarm cluster.
   238  func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
   239  	return c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
   240  		swarm, err := getSwarm(ctx, state.controlClient)
   241  		if err != nil {
   242  			return err
   243  		}
   244  
   245  		// Validate spec name.
   246  		if spec.Annotations.Name == "" {
   247  			spec.Annotations.Name = "default"
   248  		} else if spec.Annotations.Name != "default" {
   249  			return errdefs.InvalidParameter(errors.New(`swarm spec must be named "default"`))
   250  		}
   251  
   252  		// In Update, the client is expected to provide the complete spec of the
   253  		// swarm, including Name and Labels. If a field is set to 0 or nil, swarmkit
   254  		// falls back to its default value for that field.
   255  		clusterSpec, err := convert.SwarmSpecToGRPC(spec)
   256  		if err != nil {
   257  			return errdefs.InvalidParameter(err)
   258  		}
   259  
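        		// The caller-supplied version is sent as ClusterVersion so that swarmkit
        		// rejects the update if the cluster object changed in the meantime
        		// (optimistic concurrency control).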
   260  		_, err = state.controlClient.UpdateCluster(
   261  			ctx,
   262  			&swarmapi.UpdateClusterRequest{
   263  				ClusterID: swarm.ID,
   264  				Spec:      &clusterSpec,
   265  				ClusterVersion: &swarmapi.Version{
   266  					Index: version,
   267  				},
   268  				Rotation: swarmapi.KeyRotation{
   269  					WorkerJoinToken:  flags.RotateWorkerToken,
   270  					ManagerJoinToken: flags.RotateManagerToken,
   271  					ManagerUnlockKey: flags.RotateManagerUnlockKey,
   272  				},
   273  			},
   274  		)
   275  		return err
   276  	})
   277  }
   278  
   279  // GetUnlockKey returns the unlock key for the swarm.
   280  func (c *Cluster) GetUnlockKey() (string, error) {
   281  	var resp *swarmapi.GetUnlockKeyResponse
   282  	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
   283  		client := swarmapi.NewCAClient(state.grpcConn)
   284  
   285  		r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
   286  		if err != nil {
   287  			return err
   288  		}
   289  		resp = r
   290  		return nil
   291  	}); err != nil {
   292  		return "", err
   293  	}
   294  	if len(resp.UnlockKey) == 0 {
   295  		// no key
   296  		return "", nil
   297  	}
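        	// Convert the raw key bytes into the human-readable form shown to users
        	// (typically a "SWMKEY-1-..." string).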
   298  	return encryption.HumanReadableKey(resp.UnlockKey), nil
   299  }
   300  
   301  // UnlockSwarm provides a key to decrypt data that is encrypted at rest.
   302  func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
   303  	c.controlMutex.Lock()
   304  	defer c.controlMutex.Unlock()
   305  
   306  	c.mu.RLock()
   307  	state := c.currentNodeState()
   308  
   309  	if !state.IsActiveManager() {
   310  		// The node is not an active manager. The only acceptable reason here is
   311  		// that the swarm is locked; any other error is returned as-is.
   312  		if err := c.errNoManager(state); err != errSwarmLocked {
   313  			c.mu.RUnlock()
   314  			return err
   315  		}
   316  	} else {
   317  		// The manager is active, so the swarm cannot be locked; return a "not locked" error.
   318  		c.mu.RUnlock()
   319  		return notLockedError{}
   320  	}
   321  
   322  	// Execution only reaches this point when the swarm is locked.
   323  	nr := c.nr
   324  	c.mu.RUnlock()
   325  
   326  	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
   327  	if err != nil {
   328  		return errdefs.InvalidParameter(err)
   329  	}
   330  
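        	// Restart the node runner with the supplied key. If the node still comes up
        	// locked, the key did not match and an invalid-key error is returned below.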
   331  	config := nr.config
   332  	config.lockKey = key
   333  	if err := nr.Stop(); err != nil {
   334  		return err
   335  	}
   336  	nr, err = c.newNodeRunner(config)
   337  	if err != nil {
   338  		return err
   339  	}
   340  
   341  	c.mu.Lock()
   342  	c.nr = nr
   343  	c.mu.Unlock()
   344  
   345  	if err := <-nr.Ready(); err != nil {
   346  		if errors.Cause(err) == errSwarmLocked {
   347  			return invalidUnlockKey{}
   348  		}
   349  		return errors.Errorf("swarm component could not be started: %v", err)
   350  	}
   351  	return nil
   352  }
   353  
   354  // Leave shuts down the Cluster and removes its current state.
   355  func (c *Cluster) Leave(force bool) error {
   356  	c.controlMutex.Lock()
   357  	defer c.controlMutex.Unlock()
   358  
   359  	c.mu.Lock()
   360  	nr := c.nr
   361  	if nr == nil {
   362  		c.mu.Unlock()
   363  		return errors.WithStack(errNoSwarm)
   364  	}
   365  
   366  	state := c.currentNodeState()
   367  
   368  	c.mu.Unlock()
   369  
   370  	if errors.Cause(state.err) == errSwarmLocked && !force {
   371  		// Leaving a locked swarm without --force is not allowed.
   372  		return errors.WithStack(notAvailableError("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message."))
   373  	}
   374  
   375  	if state.IsManager() && !force {
   376  		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
   377  		if state.IsActiveManager() {
   378  			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
   379  			if err == nil {
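        				// Raft needs a majority of all managers to be reachable. Removing this
        				// node drops the reachable count by one, so the messages below warn when
        				// that would cost the cluster its quorum or remove the last manager.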
   380  				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
   381  					if isLastManager(reachable, unreachable) {
   382  						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
   383  						return errors.WithStack(notAvailableError(msg))
   384  					}
   385  					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
   386  				}
   387  			}
   388  		} else {
   389  			msg += "Doing so may lose the consensus of your cluster. "
   390  		}
   391  
   392  		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
   393  		return errors.WithStack(notAvailableError(msg))
   394  	}
   395  	// release readers in here
   396  	if err := nr.Stop(); err != nil {
   397  		logrus.Errorf("failed to shut down cluster node: %v", err)
   398  		signal.DumpStacks("")
   399  		return err
   400  	}
   401  
   402  	c.mu.Lock()
   403  	c.nr = nil
   404  	c.mu.Unlock()
   405  
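        	// Clean up containers that belonged to tasks scheduled on this node; they
        	// are found via the com.docker.swarm.node.id label (see listContainerForNode).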
   406  	if nodeID := state.NodeID(); nodeID != "" {
   407  		nodeContainers, err := c.listContainerForNode(nodeID)
   408  		if err != nil {
   409  			return err
   410  		}
   411  		for _, id := range nodeContainers {
   412  			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
   413  				logrus.Errorf("error removing %v: %v", id, err)
   414  			}
   415  		}
   416  	}
   417  
   418  	// todo: cleanup optional?
   419  	if err := clearPersistentState(c.root); err != nil {
   420  		return err
   421  	}
   422  	c.config.Backend.DaemonLeavesCluster()
   423  	return nil
   424  }
   425  
   426  // Info returns information about the current cluster state.
   427  func (c *Cluster) Info() types.Info {
   428  	info := types.Info{
   429  		NodeAddr: c.GetAdvertiseAddress(),
   430  	}
   431  	c.mu.RLock()
   432  	defer c.mu.RUnlock()
   433  
   434  	state := c.currentNodeState()
   435  	info.LocalNodeState = state.status
   436  	if state.err != nil {
   437  		info.Error = state.err.Error()
   438  	}
   439  
   440  	ctx, cancel := c.getRequestContext()
   441  	defer cancel()
   442  
   443  	if state.IsActiveManager() {
   444  		info.ControlAvailable = true
   445  		swarm, err := c.inspect(ctx, state)
   446  		if err != nil {
   447  			info.Error = err.Error()
   448  		}
   449  
   450  		info.Cluster = &swarm.ClusterInfo
   451  
   452  		if r, err := state.controlClient.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err != nil {
   453  			info.Error = err.Error()
   454  		} else {
   455  			info.Nodes = len(r.Nodes)
   456  			for _, n := range r.Nodes {
   457  				if n.ManagerStatus != nil {
   458  					info.Managers++
   459  				}
   460  			}
   461  		}
   462  
   463  		switch info.LocalNodeState {
   464  		case types.LocalNodeStateInactive, types.LocalNodeStateLocked, types.LocalNodeStateError:
   465  			// nothing to do
   466  		default:
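        			// With N managers, Raft tolerates (N-1)/2 failures. For N=2 that is
        			// zero: losing either manager leaves the survivor without a majority,
        			// which is why a two-manager configuration provides no fault tolerance.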
   467  			if info.Managers == 2 {
   468  				const warn string = `WARNING: Running Swarm in a two-manager configuration. This configuration provides
   469           no fault tolerance, and poses a high risk of losing control over the cluster.
   470           Refer to https://docs.docker.com/engine/swarm/admin_guide/ to configure the
   471           Swarm for fault-tolerance.`
   472  
   473  				info.Warnings = append(info.Warnings, warn)
   474  			}
   475  		}
   476  	}
   477  
   478  	if state.swarmNode != nil {
   479  		for _, r := range state.swarmNode.Remotes() {
   480  			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
   481  		}
   482  		info.NodeID = state.swarmNode.NodeID()
   483  	}
   484  
   485  	return info
   486  }
   487  
   488  func validateAndSanitizeInitRequest(req *types.InitRequest) error {
   489  	var err error
   490  	req.ListenAddr, err = validateAddr(req.ListenAddr)
   491  	if err != nil {
   492  		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
   493  	}
   494  
   495  	if req.Spec.Annotations.Name == "" {
   496  		req.Spec.Annotations.Name = "default"
   497  	} else if req.Spec.Annotations.Name != "default" {
   498  		return errors.New(`swarm spec must be named "default"`)
   499  	}
   500  
   501  	return nil
   502  }
   503  
   504  func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
   505  	var err error
   506  	req.ListenAddr, err = validateAddr(req.ListenAddr)
   507  	if err != nil {
   508  		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
   509  	}
   510  	if len(req.RemoteAddrs) == 0 {
   511  		return errors.New("at least 1 RemoteAddr is required to join")
   512  	}
   513  	for i := range req.RemoteAddrs {
   514  		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
   515  		if err != nil {
   516  			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
   517  		}
   518  	}
   519  	return nil
   520  }
   521  
   522  func validateAddr(addr string) (string, error) {
   523  	if addr == "" {
   524  		return addr, errors.New("invalid empty address")
   525  	}
   526  	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
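        	// A parse failure is not surfaced as an error: the address is handed back
        	// unchanged so that other accepted forms (such as an interface name,
        	// e.g. "eth0:2377") can still be resolved by the caller.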
   527  	if err != nil {
   528  		return addr, nil
   529  	}
   530  	return strings.TrimPrefix(newaddr, "tcp://"), nil
   531  }
   532  
   533  func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
   534  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   535  	defer cancel()
   536  	for conn := range node.ListenControlSocket(ctx) {
   537  		if ctx.Err() != nil {
   538  			return ctx.Err()
   539  		}
   540  		if conn != nil {
   541  			client := swarmapi.NewControlClient(conn)
   542  			var cluster *swarmapi.Cluster
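        			// The cluster object may not be visible immediately after the node
        			// starts, so poll for it up to 10 times (roughly two seconds in total)
        			// before giving up.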
   543  			for i := 0; ; i++ {
   544  				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
   545  				if err != nil {
   546  					return fmt.Errorf("error on listing clusters: %v", err)
   547  				}
   548  				if len(lcr.Clusters) == 0 {
   549  					if i < 10 {
   550  						time.Sleep(200 * time.Millisecond)
   551  						continue
   552  					}
   553  					return errors.New("empty list of clusters was returned")
   554  				}
   555  				cluster = lcr.Clusters[0]
   556  				break
   557  			}
   558  			// In Init, we take the initial default values from swarmkit and merge
   559  			// any non-nil or non-zero value from spec into the GRPC spec, leaving
   560  			// the remaining defaults alone.
   561  			// Note that this is different from Update(), where we expect the user to
   562  			// specify the complete spec of the cluster (as they already know the
   563  			// existing one and which fields to update).
   564  			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
   565  			if err != nil {
   566  				return fmt.Errorf("error updating cluster settings: %v", err)
   567  			}
   568  			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
   569  				ClusterID:      cluster.ID,
   570  				ClusterVersion: &cluster.Meta.Version,
   571  				Spec:           &clusterSpec,
   572  			})
   573  			if err != nil {
   574  				return fmt.Errorf("error updating cluster settings: %v", err)
   575  			}
   576  			return nil
   577  		}
   578  	}
   579  	return ctx.Err()
   580  }
   581  
   582  func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
   583  	var ids []string
   584  	filters := filters.NewArgs()
   585  	filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
   586  	containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
   587  		Filters: filters,
   588  	})
   589  	if err != nil {
   590  		return []string{}, err
   591  	}
   592  	for _, c := range containers {
   593  		ids = append(ids, c.ID)
   594  	}
   595  	return ids, nil
   596  }