github.com/fabiokung/docker@v0.11.2-0.20170222101415-4534dcd49497/daemon/cluster/swarm.go

     1  package cluster
     2  
     3  import (
     4  	"fmt"
     5  	"net"
     6  	"strings"
     7  	"time"
     8  
     9  	"github.com/Sirupsen/logrus"
    10  	apierrors "github.com/docker/docker/api/errors"
    11  	apitypes "github.com/docker/docker/api/types"
    12  	"github.com/docker/docker/api/types/filters"
    13  	types "github.com/docker/docker/api/types/swarm"
    14  	"github.com/docker/docker/daemon/cluster/convert"
    15  	"github.com/docker/docker/opts"
    16  	"github.com/docker/docker/pkg/signal"
    17  	swarmapi "github.com/docker/swarmkit/api"
    18  	"github.com/docker/swarmkit/manager/encryption"
    19  	swarmnode "github.com/docker/swarmkit/node"
    20  	"github.com/pkg/errors"
    21  	"golang.org/x/net/context"
    22  )
    23  
    24  // Init initializes a new cluster from a user-provided request.
    25  func (c *Cluster) Init(req types.InitRequest) (string, error) {
    26  	c.controlMutex.Lock()
    27  	defer c.controlMutex.Unlock()
    28  	c.mu.Lock()
    29  	if c.nr != nil {
    30  		if req.ForceNewCluster {
    31  			if err := c.nr.Stop(); err != nil {
    32  				c.mu.Unlock()
    33  				return "", err
    34  			}
    35  		} else {
    36  			c.mu.Unlock()
    37  			return "", errSwarmExists
    38  		}
    39  	}
    40  	c.mu.Unlock()
    41  
    42  	if err := validateAndSanitizeInitRequest(&req); err != nil {
    43  		return "", apierrors.NewBadRequestError(err)
    44  	}
    45  
    46  	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
    47  	if err != nil {
    48  		return "", err
    49  	}
    50  
    51  	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
    52  	if err != nil {
    53  		return "", err
    54  	}
    55  
    56  	localAddr := listenHost
    57  
    58  	// If the local address is undetermined, the advertise address
    59  	// is used as the local address, provided it belongs to this system.
    60  	// If the advertise address is not local, we try to find a system
    61  	// address to use as the local address. If that also fails, we give
    62  	// up and ask the user to pass the listen address explicitly.
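        	// For example (with illustrative addresses): with ListenAddr
        	// "0.0.0.0:2377" and AdvertiseAddr "192.168.1.10:2377", the local
        	// address becomes "192.168.1.10" when that IP is assigned to one of
        	// this host's interfaces; otherwise resolveSystemAddr is consulted.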
    63  	if net.ParseIP(localAddr).IsUnspecified() {
    64  		advertiseIP := net.ParseIP(advertiseHost)
    65  
    66  		found := false
    67  		for _, systemIP := range listSystemIPs() {
    68  			if systemIP.Equal(advertiseIP) {
    69  				localAddr = advertiseIP.String()
    70  				found = true
    71  				break
    72  			}
    73  		}
    74  
    75  		if !found {
    76  			ip, err := c.resolveSystemAddr()
    77  			if err != nil {
    78  				logrus.Warnf("Could not find a local address: %v", err)
    79  				return "", errMustSpecifyListenAddr
    80  			}
    81  			localAddr = ip.String()
    82  		}
    83  	}
    84  
    85  	if !req.ForceNewCluster {
    86  		clearPersistentState(c.root)
    87  	}
    88  
    89  	nr, err := c.newNodeRunner(nodeStartConfig{
    90  		forceNewCluster: req.ForceNewCluster,
    91  		autolock:        req.AutoLockManagers,
    92  		LocalAddr:       localAddr,
    93  		ListenAddr:      net.JoinHostPort(listenHost, listenPort),
    94  		AdvertiseAddr:   net.JoinHostPort(advertiseHost, advertisePort),
    95  		availability:    req.Availability,
    96  	})
    97  	if err != nil {
    98  		return "", err
    99  	}
   100  	c.mu.Lock()
   101  	c.nr = nr
   102  	c.mu.Unlock()
   103  
   104  	if err := <-nr.Ready(); err != nil {
   105  	if !req.ForceNewCluster { // if the first attempt fails, don't keep state
   106  			if err := clearPersistentState(c.root); err != nil {
   107  				return "", err
   108  			}
   109  		}
   110  		if err != nil {
   111  			c.mu.Lock()
   112  			c.nr = nil
   113  			c.mu.Unlock()
   114  		}
   115  		return "", err
   116  	}
   117  	state := nr.State()
   118  	if state.swarmNode == nil { // should never happen but protect from panic
   119  		return "", errors.New("invalid cluster state for spec initialization")
   120  	}
   121  	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
   122  		return "", err
   123  	}
   124  	return state.NodeID(), nil
   125  }
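        // A minimal, illustrative sketch of how a caller might invoke Init
        // (hypothetical addresses; error handling omitted):
        //
        //	nodeID, err := c.Init(types.InitRequest{
        //		ListenAddr:    "0.0.0.0:2377",
        //		AdvertiseAddr: "192.168.1.10:2377",
        //	})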
   126  
   127  // Join makes the current Cluster part of an existing swarm cluster.
   128  func (c *Cluster) Join(req types.JoinRequest) error {
   129  	c.controlMutex.Lock()
   130  	defer c.controlMutex.Unlock()
   131  	c.mu.Lock()
   132  	if c.nr != nil {
   133  		c.mu.Unlock()
   134  		return errSwarmExists
   135  	}
   136  	c.mu.Unlock()
   137  
   138  	if err := validateAndSanitizeJoinRequest(&req); err != nil {
   139  		return apierrors.NewBadRequestError(err)
   140  	}
   141  
   142  	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
   143  	if err != nil {
   144  		return err
   145  	}
   146  
   147  	var advertiseAddr string
   148  	if req.AdvertiseAddr != "" {
   149  		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
   150  		// For joining, we don't need to provide an advertise address,
   151  		// since the remote side can detect it.
   152  		if err == nil {
   153  			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
   154  		}
   155  	}
   156  
   157  	clearPersistentState(c.root)
   158  
   159  	nr, err := c.newNodeRunner(nodeStartConfig{
   160  		RemoteAddr:    req.RemoteAddrs[0],
   161  		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
   162  		AdvertiseAddr: advertiseAddr,
   163  		joinAddr:      req.RemoteAddrs[0],
   164  		joinToken:     req.JoinToken,
   165  		availability:  req.Availability,
   166  	})
   167  	if err != nil {
   168  		return err
   169  	}
   170  
   171  	c.mu.Lock()
   172  	c.nr = nr
   173  	c.mu.Unlock()
   174  
   175  	select {
   176  	case <-time.After(swarmConnectTimeout):
   177  		return errSwarmJoinTimeoutReached
   178  	case err := <-nr.Ready():
   179  		if err != nil {
   180  			c.mu.Lock()
   181  			c.nr = nil
   182  			c.mu.Unlock()
   183  		}
   184  		return err
   185  	}
   186  }
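        // An illustrative sketch of a join request (hypothetical addresses and
        // token; the listen and remote addresses are validated above):
        //
        //	err := c.Join(types.JoinRequest{
        //		ListenAddr:  "0.0.0.0:2377",
        //		RemoteAddrs: []string{"192.168.1.10:2377"},
        //		JoinToken:   "SWMTKN-1-...",
        //	})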
   187  
   188  // Inspect retrieves the configuration properties of a managed swarm cluster.
   189  func (c *Cluster) Inspect() (types.Swarm, error) {
   190  	c.mu.RLock()
   191  	defer c.mu.RUnlock()
   192  
   193  	state := c.currentNodeState()
   194  	if !state.IsActiveManager() {
   195  		return types.Swarm{}, c.errNoManager(state)
   196  	}
   197  
   198  	ctx, cancel := c.getRequestContext()
   199  	defer cancel()
   200  
   201  	swarm, err := getSwarm(ctx, state.controlClient)
   202  	if err != nil {
   203  		return types.Swarm{}, err
   204  	}
   205  
   206  	return convert.SwarmFromGRPC(*swarm), nil
   207  }
   208  
   209  // Update updates configuration of a managed swarm cluster.
   210  func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
   211  	c.mu.RLock()
   212  	defer c.mu.RUnlock()
   213  
   214  	state := c.currentNodeState()
   215  	if !state.IsActiveManager() {
   216  		return c.errNoManager(state)
   217  	}
   218  
   219  	ctx, cancel := c.getRequestContext()
   220  	defer cancel()
   221  
   222  	swarm, err := getSwarm(ctx, state.controlClient)
   223  	if err != nil {
   224  		return err
   225  	}
   226  
   227  	// In update, the client should provide the complete spec of the swarm, including
   228  	// Name and Labels. If a field is specified as 0 or nil, then swarmkit falls back
   229  	// to its default value for that field.
   230  	clusterSpec, err := convert.SwarmSpecToGRPC(spec)
   231  	if err != nil {
   232  		return apierrors.NewBadRequestError(err)
   233  	}
   234  
   235  	_, err = state.controlClient.UpdateCluster(
   236  		ctx,
   237  		&swarmapi.UpdateClusterRequest{
   238  			ClusterID: swarm.ID,
   239  			Spec:      &clusterSpec,
   240  			ClusterVersion: &swarmapi.Version{
   241  				Index: version,
   242  			},
   243  			Rotation: swarmapi.KeyRotation{
   244  				WorkerJoinToken:  flags.RotateWorkerToken,
   245  				ManagerJoinToken: flags.RotateManagerToken,
   246  				ManagerUnlockKey: flags.RotateManagerUnlockKey,
   247  			},
   248  		},
   249  	)
   250  	return err
   251  }
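        // An illustrative read-modify-write sequence (hypothetical caller); note
        // that the complete spec and the current version index are passed back:
        //
        //	sw, _ := c.Inspect()
        //	sw.Spec.Raft.SnapshotInterval = 5000
        //	err := c.Update(sw.Version.Index, sw.Spec, types.UpdateFlags{})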
   252  
   253  // GetUnlockKey returns the unlock key for the swarm.
   254  func (c *Cluster) GetUnlockKey() (string, error) {
   255  	c.mu.RLock()
   256  	defer c.mu.RUnlock()
   257  
   258  	state := c.currentNodeState()
   259  	if !state.IsActiveManager() {
   260  		return "", c.errNoManager(state)
   261  	}
   262  
   263  	ctx, cancel := c.getRequestContext()
   264  	defer cancel()
   265  
   266  	client := swarmapi.NewCAClient(state.grpcConn)
   267  
   268  	r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
   269  	if err != nil {
   270  		return "", err
   271  	}
   272  
   273  	if len(r.UnlockKey) == 0 {
   274  		// no key
   275  		return "", nil
   276  	}
   277  
   278  	return encryption.HumanReadableKey(r.UnlockKey), nil
   279  }
   280  
   281  // UnlockSwarm provides a key to decrypt data that is encrypted at rest.
   282  func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
   283  	c.controlMutex.Lock()
   284  	defer c.controlMutex.Unlock()
   285  
   286  	c.mu.RLock()
   287  	state := c.currentNodeState()
   288  
   289  	if !state.IsActiveManager() {
   290  		// when the manager is not active, return the error,
   291  		// unless the cause is that the swarm is locked.
   292  		if err := c.errNoManager(state); err != errSwarmLocked {
   293  			c.mu.RUnlock()
   294  			return err
   295  		}
   296  	} else {
   297  		// when the manager is active, the swarm cannot be locked, so return a "not locked" error
   298  		c.mu.RUnlock()
   299  		return errors.New("swarm is not locked")
   300  	}
   301  
   302  	// execution reaches this point only when the swarm is locked
   303  	nr := c.nr
   304  	c.mu.RUnlock()
   305  
   306  	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
   307  	if err != nil {
   308  		return err
   309  	}
   310  
   311  	config := nr.config
   312  	config.lockKey = key
   313  	if err := nr.Stop(); err != nil {
   314  		return err
   315  	}
   316  	nr, err = c.newNodeRunner(config)
   317  	if err != nil {
   318  		return err
   319  	}
   320  
   321  	c.mu.Lock()
   322  	c.nr = nr
   323  	c.mu.Unlock()
   324  
   325  	if err := <-nr.Ready(); err != nil {
   326  		if errors.Cause(err) == errSwarmLocked {
   327  			return errors.New("swarm could not be unlocked: invalid key provided")
   328  		}
   329  		return fmt.Errorf("swarm component could not be started: %v", err)
   330  	}
   331  	return nil
   332  }
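        // An illustrative unlock round trip (hypothetical caller). The key returned
        // by GetUnlockKey is in human-readable form (e.g. "SWMKEY-1-..."), which is
        // what encryption.ParseHumanReadableKey above expects:
        //
        //	key, _ := c.GetUnlockKey()
        //	err := c.UnlockSwarm(types.UnlockRequest{UnlockKey: key})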
   333  
   334  // Leave shuts down the Cluster and removes current state.
   335  func (c *Cluster) Leave(force bool) error {
   336  	c.controlMutex.Lock()
   337  	defer c.controlMutex.Unlock()
   338  
   339  	c.mu.Lock()
   340  	nr := c.nr
   341  	if nr == nil {
   342  		c.mu.Unlock()
   343  		return errNoSwarm
   344  	}
   345  
   346  	state := c.currentNodeState()
   347  
   348  	if errors.Cause(state.err) == errSwarmLocked && !force {
   349  		// leaving a locked swarm without --force is not allowed
   350  		c.mu.Unlock()
   351  		return errors.New("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message.")
   352  	}
   353  
   354  	if state.IsManager() && !force {
   355  		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
   356  		if state.IsActiveManager() {
   357  			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
   358  			if err == nil {
   359  				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
   360  					if isLastManager(reachable, unreachable) {
   361  						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
   362  						c.mu.Unlock()
   363  						return errors.New(msg)
   364  					}
   365  					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
   366  				}
   367  			}
   368  		} else {
   369  			msg += "Doing so may lose the consensus of your cluster. "
   370  		}
   371  
   372  		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
   373  		c.mu.Unlock()
   374  		return errors.New(msg)
   375  	}
   376  	// release readers here
   377  	if err := nr.Stop(); err != nil {
   378  		logrus.Errorf("failed to shut down cluster node: %v", err)
   379  		signal.DumpStacks("")
   380  		c.mu.Unlock()
   381  		return err
   382  	}
   383  	c.nr = nil
   384  	c.mu.Unlock()
   385  	if nodeID := state.NodeID(); nodeID != "" {
   386  		nodeContainers, err := c.listContainerForNode(nodeID)
   387  		if err != nil {
   388  			return err
   389  		}
   390  		for _, id := range nodeContainers {
   391  			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
   392  				logrus.Errorf("error removing %v: %v", id, err)
   393  			}
   394  		}
   395  	}
   396  
   397  	c.configEvent <- struct{}{}
   398  	// todo: cleanup optional?
   399  	if err := clearPersistentState(c.root); err != nil {
   400  		return err
   401  	}
   402  	c.config.Backend.DaemonLeavesCluster()
   403  	return nil
   404  }
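        // Quorum arithmetic behind the warnings above, for illustration: a Raft
        // quorum is a strict majority of managers. With 3 reachable managers and
        // none unreachable, removing one still leaves 2 of 3 and the swarm keeps
        // quorum; with 2 reachable and 1 unreachable, removing one leaves 1 of 3
        // and the swarm becomes inaccessible.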
   405  
   406  // Info returns information about the current cluster state.
   407  func (c *Cluster) Info() types.Info {
   408  	info := types.Info{
   409  		NodeAddr: c.GetAdvertiseAddress(),
   410  	}
   411  	c.mu.RLock()
   412  	defer c.mu.RUnlock()
   413  
   414  	state := c.currentNodeState()
   415  	info.LocalNodeState = state.status
   416  	if state.err != nil {
   417  		info.Error = state.err.Error()
   418  	}
   419  
   420  	ctx, cancel := c.getRequestContext()
   421  	defer cancel()
   422  
   423  	if state.IsActiveManager() {
   424  		info.ControlAvailable = true
   425  		swarm, err := c.Inspect()
   426  		if err != nil {
   427  			info.Error = err.Error()
   428  		}
   429  
   430  		// Strip JoinTokens
   431  		info.Cluster = swarm.ClusterInfo
   432  
   433  		if r, err := state.controlClient.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err != nil {
   434  			info.Error = err.Error()
   435  		} else {
   436  			info.Nodes = len(r.Nodes)
   437  			for _, n := range r.Nodes {
   438  				if n.ManagerStatus != nil {
   439  					info.Managers = info.Managers + 1
   440  				}
   441  			}
   442  		}
   443  	}
   444  
   445  	if state.swarmNode != nil {
   446  		for _, r := range state.swarmNode.Remotes() {
   447  			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
   448  		}
   449  		info.NodeID = state.swarmNode.NodeID()
   450  	}
   451  
   452  	return info
   453  }
   454  
   455  func validateAndSanitizeInitRequest(req *types.InitRequest) error {
   456  	var err error
   457  	req.ListenAddr, err = validateAddr(req.ListenAddr)
   458  	if err != nil {
   459  		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
   460  	}
   461  
   462  	if req.Spec.Annotations.Name == "" {
   463  		req.Spec.Annotations.Name = "default"
   464  	} else if req.Spec.Annotations.Name != "default" {
   465  		return errors.New(`swarm spec must be named "default"`)
   466  	}
   467  
   468  	return nil
   469  }
   470  
   471  func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
   472  	var err error
   473  	req.ListenAddr, err = validateAddr(req.ListenAddr)
   474  	if err != nil {
   475  		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
   476  	}
   477  	if len(req.RemoteAddrs) == 0 {
   478  		return errors.New("at least 1 RemoteAddr is required to join")
   479  	}
   480  	for i := range req.RemoteAddrs {
   481  		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
   482  		if err != nil {
   483  			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
   484  		}
   485  	}
   486  	return nil
   487  }
   488  
   489  func validateAddr(addr string) (string, error) {
   490  	if addr == "" {
   491  		return addr, errors.New("invalid empty address")
   492  	}
   493  	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
   494  	if err != nil {
   495  		return addr, nil // fall back to the original address if it cannot be parsed
   496  	}
   497  	return strings.TrimPrefix(newaddr, "tcp://"), nil
   498  }
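        // For illustration (assuming defaultAddr, defined elsewhere in this package,
        // carries the default swarm port 2377): "192.168.1.10" is returned as
        // "192.168.1.10:2377", while an address that opts.ParseTCPAddr rejects is
        // returned unchanged with a nil error.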
   499  
   500  func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
   501  	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
   502  	for conn := range node.ListenControlSocket(ctx) {
   503  		if ctx.Err() != nil {
   504  			return ctx.Err()
   505  		}
   506  		if conn != nil {
   507  			client := swarmapi.NewControlClient(conn)
   508  			var cluster *swarmapi.Cluster
   509  			for i := 0; ; i++ {
   510  				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
   511  				if err != nil {
   512  					return fmt.Errorf("error on listing clusters: %v", err)
   513  				}
   514  				if len(lcr.Clusters) == 0 {
   515  					if i < 10 {
   516  						time.Sleep(200 * time.Millisecond)
   517  						continue
   518  					}
   519  					return errors.New("empty list of clusters was returned")
   520  				}
   521  				cluster = lcr.Clusters[0]
   522  				break
   523  			}
   524  			// In init, we take the initial default values from swarmkit, and merge
   525  			// any non-nil or non-zero value from spec into the GRPC spec, leaving
   526  			// the remaining defaults alone.
   527  			// Note that this is different from Update(), where we expect the user
   528  			// to specify the complete spec of the cluster (since they already know
   529  			// the existing one and which fields to update).
   530  			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
   531  			if err != nil {
   532  				return fmt.Errorf("error updating cluster settings: %v", err)
   533  			}
   534  			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
   535  				ClusterID:      cluster.ID,
   536  				ClusterVersion: &cluster.Meta.Version,
   537  				Spec:           &clusterSpec,
   538  			})
   539  			if err != nil {
   540  				return fmt.Errorf("error updating cluster settings: %v", err)
   541  			}
   542  			return nil
   543  		}
   544  	}
   545  	return ctx.Err()
   546  }
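        // To illustrate the merge semantics above (hypothetical values): a Spec
        // passed at init time with only TaskDefaults set keeps swarmkit's defaults
        // for every other field (CA configuration, raft settings, and so on),
        // whereas Update() replaces the whole cluster spec with exactly what the
        // caller provides.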
   547  
   548  func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
   549  	var ids []string
   550  	filters := filters.NewArgs()
   551  	filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
   552  	containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
   553  		Filters: filters,
   554  	})
   555  	if err != nil {
   556  		return []string{}, err
   557  	}
   558  	for _, c := range containers {
   559  		ids = append(ids, c.ID)
   560  	}
   561  	return ids, nil
   562  }