github.com/zhouyu0/docker-note@v0.0.0-20190722021225-b8d3825084db/daemon/cluster/swarm.go

package cluster // import "github.com/docker/docker/daemon/cluster"

import (
	"context"
	"fmt"
	"net"
	"strings"
	"time"

	apitypes "github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/filters"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/convert"
	"github.com/docker/docker/errdefs"
	"github.com/docker/docker/opts"
	"github.com/docker/docker/pkg/signal"
	swarmapi "github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/manager/encryption"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
)

// Init initializes a new cluster from a user-provided request.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	if c.nr != nil {
		if req.ForceNewCluster {

			// Take c.mu temporarily to wait for presently running
			// API handlers to finish before shutting down the node.
			c.mu.Lock()
			if !c.nr.nodeState.IsManager() {
				c.mu.Unlock()
				return "", errSwarmNotManager
			}
			c.mu.Unlock()

			if err := c.nr.Stop(); err != nil {
				return "", err
			}
		} else {
			return "", errSwarmExists
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		return "", errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return "", err
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		return "", err
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return "", err
	}

	localAddr := listenHost

	// If the local address is undetermined, the advertise address
	// will be used as local address, if it belongs to this system.
	// If the advertise address is not local, then we try to find
	// a system address to use as local address. If this fails,
	// we give up and ask the user to pass the listen address.
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)

		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}

		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				logrus.Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}

	// Validate the default address pool input.
	if err := validateDefaultAddrPool(req.DefaultAddrPool, req.SubnetSize); err != nil {
		return "", err
	}
	nr, err := c.newNodeRunner(nodeStartConfig{
		forceNewCluster:    req.ForceNewCluster,
		autolock:           req.AutoLockManagers,
		LocalAddr:          localAddr,
		ListenAddr:         net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:      net.JoinHostPort(advertiseHost, advertisePort),
		DataPathAddr:       dataPathAddr,
		DefaultAddressPool: req.DefaultAddrPool,
		SubnetSize:         req.SubnetSize,
		availability:       req.Availability,
	})
	if err != nil {
		return "", err
	}
	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		c.mu.Lock()
		c.nr = nil
		c.mu.Unlock()
		if !req.ForceNewCluster { // if the first attempt fails, don't keep state
			if err := clearPersistentState(c.root); err != nil {
				return "", err
			}
		}
		return "", err
	}
	state := nr.State()
	if state.swarmNode == nil { // should never happen but protect from panic
		return "", errors.New("invalid cluster state for spec initialization")
	}
	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
		return "", err
	}
	return state.NodeID(), nil
}
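
// Illustrative sketch (not part of the original swarm.go): roughly how a
// caller such as the swarm API router might build an InitRequest for a
// single-node swarm. The addresses below are placeholder assumptions for the
// example only.
func exampleInit(c *Cluster) (string, error) {
	req := types.InitRequest{
		ListenAddr:    "0.0.0.0:2377",    // bind on all interfaces
		AdvertiseAddr: "192.0.2.10:2377", // address other nodes should dial
	}
	// Init resolves and validates the addresses, starts a node runner, waits
	// for it to become ready, and applies req.Spec via initClusterSpec.
	return c.Init(req)
}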

// Join makes the current Cluster part of an existing swarm cluster.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	if c.nr != nil {
		c.mu.Unlock()
		return errors.WithStack(errSwarmExists)
	}
	c.mu.Unlock()

	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		return errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		DataPathAddr:  dataPathAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
		availability:  req.Availability,
	})
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	select {
	case <-time.After(swarmConnectTimeout):
		return errSwarmJoinTimeoutReached
	case err := <-nr.Ready():
		if err != nil {
			c.mu.Lock()
			c.nr = nil
			c.mu.Unlock()
			if err := clearPersistentState(c.root); err != nil {
				return err
			}
		}
		return err
	}
}
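
// Illustrative sketch (not part of the original swarm.go): joining an
// existing swarm as a worker. The remote address and token are placeholders;
// a real join token is issued by a manager (e.g. `docker swarm join-token worker`).
func exampleJoin(c *Cluster) error {
	return c.Join(types.JoinRequest{
		ListenAddr:  "0.0.0.0:2377",
		RemoteAddrs: []string{"192.0.2.10:2377"}, // an existing manager to contact
		JoinToken:   "SWMTKN-1-<placeholder>",    // placeholder join token
	})
}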

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	var swarm types.Swarm
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		s, err := c.inspect(ctx, state)
		if err != nil {
			return err
		}
		swarm = s
		return nil
	}); err != nil {
		return types.Swarm{}, err
	}
	return swarm, nil
}

func (c *Cluster) inspect(ctx context.Context, state nodeState) (types.Swarm, error) {
	s, err := getSwarm(ctx, state.controlClient)
	if err != nil {
		return types.Swarm{}, err
	}
	return convert.SwarmFromGRPC(*s), nil
}

// Update updates the configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	return c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		swarm, err := getSwarm(ctx, state.controlClient)
		if err != nil {
			return err
		}

		// Validate the spec name.
		if spec.Annotations.Name == "" {
			spec.Annotations.Name = "default"
		} else if spec.Annotations.Name != "default" {
			return errdefs.InvalidParameter(errors.New(`swarm spec must be named "default"`))
		}

		// In Update, the client should provide the complete spec of the swarm,
		// including Name and Labels. If a field is specified as 0 or nil, the
		// default value will be used by swarmkit.
		clusterSpec, err := convert.SwarmSpecToGRPC(spec)
		if err != nil {
			return errdefs.InvalidParameter(err)
		}

		_, err = state.controlClient.UpdateCluster(
			ctx,
			&swarmapi.UpdateClusterRequest{
				ClusterID: swarm.ID,
				Spec:      &clusterSpec,
				ClusterVersion: &swarmapi.Version{
					Index: version,
				},
				Rotation: swarmapi.KeyRotation{
					WorkerJoinToken:  flags.RotateWorkerToken,
					ManagerJoinToken: flags.RotateManagerToken,
					ManagerUnlockKey: flags.RotateManagerUnlockKey,
				},
			},
		)
		return err
	})
}
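
// Illustrative sketch (not part of the original swarm.go): the typical
// read-modify-write flow for Update. The caller inspects the swarm to obtain
// the current version index and full spec, then resubmits the complete spec
// with that version so swarmkit can reject concurrent updates.
func exampleRotateWorkerToken(c *Cluster) error {
	sw, err := c.Inspect()
	if err != nil {
		return err
	}
	// Resend the complete current spec (see the comment in Update above) and
	// only flip the rotation flag we care about.
	return c.Update(sw.Version.Index, sw.Spec, types.UpdateFlags{
		RotateWorkerToken: true,
	})
}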

// GetUnlockKey returns the unlock key for the swarm.
func (c *Cluster) GetUnlockKey() (string, error) {
	var resp *swarmapi.GetUnlockKeyResponse
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		client := swarmapi.NewCAClient(state.grpcConn)

		r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
		if err != nil {
			return err
		}
		resp = r
		return nil
	}); err != nil {
		return "", err
	}
	if len(resp.UnlockKey) == 0 {
		// no key
		return "", nil
	}
	return encryption.HumanReadableKey(resp.UnlockKey), nil
}
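
// Illustrative sketch (not part of the original swarm.go): GetUnlockKey
// returns an empty string when autolock is not enabled, so callers should
// distinguish the two cases before printing anything.
func examplePrintUnlockKey(c *Cluster) error {
	key, err := c.GetUnlockKey()
	if err != nil {
		return err
	}
	if key == "" {
		fmt.Println("autolock is not enabled; no unlock key")
		return nil
	}
	fmt.Println("unlock key:", key)
	return nil
}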

// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.RLock()
	state := c.currentNodeState()

	if !state.IsActiveManager() {
		// The manager is not active: return the error unless the swarm
		// is merely locked, which is the case handled below.
		if err := c.errNoManager(state); err != errSwarmLocked {
			c.mu.RUnlock()
			return err
		}
	} else {
		// The manager is active, so the swarm cannot be locked: return a
		// "not locked" error.
		c.mu.RUnlock()
		return notLockedError{}
	}

	// Execution only reaches this point when the swarm is locked.
	nr := c.nr
	c.mu.RUnlock()

	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return errdefs.InvalidParameter(err)
	}

	config := nr.config
	config.lockKey = key
	if err := nr.Stop(); err != nil {
		return err
	}
	nr, err = c.newNodeRunner(config)
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		if errors.Cause(err) == errSwarmLocked {
			return invalidUnlockKey{}
		}
		return errors.Errorf("swarm component could not be started: %v", err)
	}
	return nil
}
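
// Illustrative sketch (not part of the original swarm.go): the two halves of
// the autolock workflow. The key string is retrieved with GetUnlockKey while
// a manager is unlocked, stored somewhere safe, and later fed back through
// UnlockSwarm after a restart leaves the manager in the "locked" state.
func exampleUnlock(c *Cluster, savedKey string) error {
	// savedKey is assumed to be the human-readable "SWMKEY-1-..." string
	// previously returned by GetUnlockKey.
	return c.UnlockSwarm(types.UnlockRequest{UnlockKey: savedKey})
}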

// Leave shuts down the Cluster and removes its current state.
func (c *Cluster) Leave(force bool) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.Lock()
	nr := c.nr
	if nr == nil {
		c.mu.Unlock()
		return errors.WithStack(errNoSwarm)
	}

	state := c.currentNodeState()

	c.mu.Unlock()

	if errors.Cause(state.err) == errSwarmLocked && !force {
		// Leaving a locked swarm without --force is not allowed.
		return errors.WithStack(notAvailableError("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message."))
	}

	if state.IsManager() && !force {
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if state.IsActiveManager() {
			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
			if err == nil {
				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
					if isLastManager(reachable, unreachable) {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						return errors.WithStack(notAvailableError(msg))
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}

		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		return errors.WithStack(notAvailableError(msg))
	}
	// release readers in here
	if err := nr.Stop(); err != nil {
		logrus.Errorf("failed to shut down cluster node: %v", err)
		signal.DumpStacks("")
		return err
	}

	c.mu.Lock()
	c.nr = nil
	c.mu.Unlock()

	if nodeID := state.NodeID(); nodeID != "" {
		nodeContainers, err := c.listContainerForNode(nodeID)
		if err != nil {
			return err
		}
		for _, id := range nodeContainers {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				logrus.Errorf("error removing %v: %v", id, err)
			}
		}
	}

	// TODO: cleanup optional?
	if err := clearPersistentState(c.root); err != nil {
		return err
	}
	c.config.Backend.DaemonLeavesCluster()
	return nil
}
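
// Illustrative sketch (not part of the original swarm.go): leaving the swarm
// without force first, so the safety checks above can reject the request when
// this node is a manager whose removal would break quorum or when the swarm
// is locked. A caller could surface the returned message and retry with force.
func exampleLeave(c *Cluster) error {
	if err := c.Leave(false); err != nil {
		logrus.Warnf("refusing to leave swarm: %v", err)
		return err
	}
	return nil
}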

// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	info.LocalNodeState = state.status
	if state.err != nil {
		info.Error = state.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if state.IsActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.inspect(ctx, state)
		if err != nil {
			info.Error = err.Error()
		}

		info.Cluster = &swarm.ClusterInfo

		if r, err := state.controlClient.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err != nil {
			info.Error = err.Error()
		} else {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers++
				}
			}
		}
	}

	if state.swarmNode != nil {
		for _, r := range state.swarmNode.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = state.swarmNode.NodeID()
	}

	return info
}
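
// Illustrative sketch (not part of the original swarm.go): how a caller might
// read the snapshot returned by Info, e.g. when assembling `docker info`.
func exampleDescribeSwarmState(c *Cluster) string {
	info := c.Info()
	if info.LocalNodeState != types.LocalNodeStateActive {
		return fmt.Sprintf("swarm: %s", info.LocalNodeState)
	}
	if info.ControlAvailable {
		// Only an active manager can see the full node list.
		return fmt.Sprintf("swarm: active manager, %d managers / %d nodes", info.Managers, info.Nodes)
	}
	return "swarm: active worker"
}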

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	if req.Spec.Annotations.Name == "" {
		req.Spec.Annotations.Name = "default"
	} else if req.Spec.Annotations.Name != "default" {
		return errors.New(`swarm spec must be named "default"`)
	}

	return nil
}

func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return errors.New("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, errors.New("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		// If the address cannot be parsed as a TCP address, fall back to
		// returning it unchanged rather than failing.
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}
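
// Illustrative sketch (not part of the original swarm.go): what the
// normalization in validateAddr does in practice, assuming defaultAddr is the
// usual "0.0.0.0:2377" used elsewhere in this package. An address without a
// port is expected to get the default port appended, and any "tcp://" scheme
// is stripped.
func exampleNormalizeAddr() {
	addr, err := validateAddr("192.0.2.10") // expected result: "192.0.2.10:2377"
	if err != nil {
		logrus.Warnf("unexpected validation error: %v", err)
		return
	}
	fmt.Println(addr)
}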

func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			// Poll for the cluster object to appear: up to 10 attempts,
			// 200ms apart, bounded overall by the 5-second context above.
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return errors.New("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			// In init, we take the initial default values from swarmkit and merge
			// any non-nil, non-zero value from spec into the GRPC spec. This leaves
			// the swarmkit defaults alone.
			// Note that this is different from Update(): there we expect the user
			// to specify the complete spec of the cluster (since they already know
			// the existing one and which fields to update).
			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &clusterSpec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}

func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
	var ids []string
	filters := filters.NewArgs()
	filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
	containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
		Filters: filters,
	})
	if err != nil {
		return []string{}, err
	}
	for _, c := range containers {
		ids = append(ids, c.ID)
	}
	return ids, nil
}