github.com/noxiouz/docker@v0.7.3-0.20160629055221-3d231c78e8c5/daemon/cluster/cluster.go

     1  package cluster
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	"io/ioutil"
     7  	"os"
     8  	"path/filepath"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"google.golang.org/grpc"
    14  
    15  	"github.com/Sirupsen/logrus"
    16  	"github.com/docker/distribution/digest"
    17  	"github.com/docker/docker/daemon/cluster/convert"
    18  	executorpkg "github.com/docker/docker/daemon/cluster/executor"
    19  	"github.com/docker/docker/daemon/cluster/executor/container"
    20  	"github.com/docker/docker/errors"
    21  	"github.com/docker/docker/opts"
    22  	"github.com/docker/docker/pkg/ioutils"
    23  	"github.com/docker/docker/runconfig"
    24  	apitypes "github.com/docker/engine-api/types"
    25  	types "github.com/docker/engine-api/types/swarm"
    26  	swarmagent "github.com/docker/swarmkit/agent"
    27  	swarmapi "github.com/docker/swarmkit/api"
    28  	"golang.org/x/net/context"
    29  )
    30  
    31  const swarmDirName = "swarm"
    32  const controlSocket = "control.sock"
    33  const swarmConnectTimeout = 20 * time.Second
    34  const stateFile = "docker-state.json"
    35  const defaultAddr = "0.0.0.0:2377"
    36  
    37  const (
    38  	initialReconnectDelay = 100 * time.Millisecond
    39  	maxReconnectDelay     = 30 * time.Second
    40  )
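        // reconnectOnFailure doubles reconnectDelay before every restart attempt
        // and caps it at maxReconnectDelay; the delay is reset back to
        // initialReconnectDelay once a started node reports ready.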
    41  
    42  // ErrNoSwarm is returned on leaving a cluster that was never initialized
    43  var ErrNoSwarm = fmt.Errorf("This node is not part of Swarm")
    44  
    45  // ErrSwarmExists is returned on initialize or join request for a cluster that has already been activated
    46  var ErrSwarmExists = fmt.Errorf("This node is already part of a Swarm cluster. Use \"docker swarm leave\" to leave this cluster and join another one.")
    47  
    48  // ErrPendingSwarmExists is returned on initialize or join request for a cluster that is already processing a similar request but has not succeeded yet.
    49  var ErrPendingSwarmExists = fmt.Errorf("This node is processing an existing join request that has not succeeded yet. Use \"docker swarm leave\" to cancel the current request.")
    50  
    51  // ErrSwarmJoinTimeoutReached is returned when cluster join could not complete before timeout was reached.
    52  var ErrSwarmJoinTimeoutReached = fmt.Errorf("Timeout was reached before the node joined. The attempt to join the cluster will continue in the background. Use the \"docker info\" command to see the current Swarm status of your node.")
    53  
    54  // defaultSpec contains some sane defaults if cluster options are missing on init
    55  var defaultSpec = types.Spec{
    56  	Raft: types.RaftConfig{
    57  		SnapshotInterval:           10000,
    58  		KeepOldSnapshots:           0,
    59  		LogEntriesForSlowFollowers: 500,
    60  		HeartbeatTick:              1,
    61  		ElectionTick:               3,
    62  	},
    63  	CAConfig: types.CAConfig{
    64  		NodeCertExpiry: 90 * 24 * time.Hour,
    65  	},
    66  	Dispatcher: types.DispatcherConfig{
    67  		HeartbeatPeriod: uint64((5 * time.Second).Nanoseconds()),
    68  	},
    69  	Orchestration: types.OrchestrationConfig{
    70  		TaskHistoryRetentionLimit: 10,
    71  	},
    72  }
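        // These defaults are not applied wholesale: validateAndSanitizeInitRequest
        // copies individual fields from defaultSpec into an init request only when
        // the corresponding field in the request is left at its zero value.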
    73  
    74  type state struct {
    75  	ListenAddr string
    76  }
    77  
    78  // Config provides values for Cluster.
    79  type Config struct {
    80  	Root    string
    81  	Name    string
    82  	Backend executorpkg.Backend
    83  }
    84  
    85  // Cluster provides capabilities to participate in a cluster as a worker or a
    86  // manager.
    87  type Cluster struct {
    88  	sync.RWMutex
    89  	root           string
    90  	config         Config
    91  	configEvent    chan struct{} // todo: make this array and goroutine safe
    92  	node           *swarmagent.Node
    93  	conn           *grpc.ClientConn
    94  	client         swarmapi.ControlClient
    95  	ready          bool
    96  	listenAddr     string
    97  	err            error
    98  	reconnectDelay time.Duration
    99  	stop           bool
   100  	cancelDelay    func()
   101  }
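        // The embedded RWMutex guards all mutable fields above. configEvent is a
        // buffered channel (capacity 10, see New) that receives an empty struct
        // whenever cluster participation changes; see ListenClusterEvents.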
   102  
   103  // New creates a new Cluster instance using provided config.
   104  func New(config Config) (*Cluster, error) {
   105  	root := filepath.Join(config.Root, swarmDirName)
   106  	if err := os.MkdirAll(root, 0700); err != nil {
   107  		return nil, err
   108  	}
   109  	c := &Cluster{
   110  		root:           root,
   111  		config:         config,
   112  		configEvent:    make(chan struct{}, 10),
   113  		reconnectDelay: initialReconnectDelay,
   114  	}
   115  
   116  	st, err := c.loadState()
   117  	if err != nil {
   118  		if os.IsNotExist(err) {
   119  			return c, nil
   120  		}
   121  		return nil, err
   122  	}
   123  
   124  	n, ctx, err := c.startNewNode(false, st.ListenAddr, "", "", "", false)
   125  	if err != nil {
   126  		return nil, err
   127  	}
   128  
   129  	select {
   130  	case <-time.After(swarmConnectTimeout):
   131  		logrus.Errorf("swarm component could not be started before timeout was reached")
   132  	case <-n.Ready():
   133  	case <-ctx.Done():
   134  	}
   135  	if ctx.Err() != nil {
   136  		return nil, fmt.Errorf("swarm component could not be started")
   137  	}
   138  	go c.reconnectOnFailure(ctx)
   139  	return c, nil
   140  }
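        // A minimal sketch of how a caller might construct the Cluster. The
        // concrete daemon wiring lives outside this file; the root path, name and
        // backend below are illustrative assumptions only:
        //
        //	c, err := cluster.New(cluster.Config{
        //		Root:    "/var/lib/docker", // assumed daemon root directory
        //		Name:    "node-hostname",   // assumed node name
        //		Backend: daemonBackend,     // some executorpkg.Backend implementation
        //	})
        //	if err != nil {
        //		// handle startup error
        //	}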
   141  
   142  func (c *Cluster) loadState() (*state, error) {
   143  	dt, err := ioutil.ReadFile(filepath.Join(c.root, stateFile))
   144  	if err != nil {
   145  		return nil, err
   146  	}
   147  	// missing certificate means no actual state to restore from
   148  	if _, err := os.Stat(filepath.Join(c.root, "certificates/swarm-node.crt")); err != nil {
   149  		if os.IsNotExist(err) {
   150  			c.clearState()
   151  		}
   152  		return nil, err
   153  	}
   154  	var st state
   155  	if err := json.Unmarshal(dt, &st); err != nil {
   156  		return nil, err
   157  	}
   158  	return &st, nil
   159  }
   160  
   161  func (c *Cluster) saveState() error {
   162  	dt, err := json.Marshal(state{ListenAddr: c.listenAddr})
   163  	if err != nil {
   164  		return err
   165  	}
   166  	return ioutils.AtomicWriteFile(filepath.Join(c.root, stateFile), dt, 0600)
   167  }
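        // The persisted docker-state.json is just the state struct above, e.g.
        // (value shown for illustration only): {"ListenAddr":"0.0.0.0:2377"}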
   168  
   169  func (c *Cluster) reconnectOnFailure(ctx context.Context) {
   170  	for {
   171  		<-ctx.Done()
   172  		c.Lock()
   173  		if c.stop || c.node != nil {
   174  			c.Unlock()
   175  			return
   176  		}
   177  		c.reconnectDelay *= 2
   178  		if c.reconnectDelay > maxReconnectDelay {
   179  			c.reconnectDelay = maxReconnectDelay
   180  		}
   181  		logrus.Warnf("Restarting swarm in %.2f seconds", c.reconnectDelay.Seconds())
   182  		delayCtx, cancel := context.WithTimeout(context.Background(), c.reconnectDelay)
   183  		c.cancelDelay = cancel
   184  		c.Unlock()
   185  		<-delayCtx.Done()
   186  		if delayCtx.Err() != context.DeadlineExceeded {
   187  			return
   188  		}
   189  		c.Lock()
   190  		if c.node != nil {
   191  			c.Unlock()
   192  			return
   193  		}
   194  		var err error
   195  		_, ctx, err = c.startNewNode(false, c.listenAddr, c.getRemoteAddress(), "", "", false)
   196  		if err != nil {
   197  			c.err = err
   198  			ctx = delayCtx
   199  		}
   200  		c.Unlock()
   201  	}
   202  }
   203  
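        // startNewNode creates and starts a swarmkit node, stores it on the
        // Cluster and returns it together with a context that is cancelled when
        // the node exits. Background goroutines reset the reconnect delay and
        // readiness flag once the node is ready, and track the control socket
        // connection used for manager API calls. All callers except New invoke
        // it with the cluster lock held.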
   204  func (c *Cluster) startNewNode(forceNewCluster bool, listenAddr, joinAddr, secret, cahash string, ismanager bool) (*swarmagent.Node, context.Context, error) {
   205  	if err := c.config.Backend.IsSwarmCompatible(); err != nil {
   206  		return nil, nil, err
   207  	}
   208  	c.node = nil
   209  	c.cancelDelay = nil
   210  	node, err := swarmagent.NewNode(&swarmagent.NodeConfig{
   211  		Hostname:         c.config.Name,
   212  		ForceNewCluster:  forceNewCluster,
   213  		ListenControlAPI: filepath.Join(c.root, controlSocket),
   214  		ListenRemoteAPI:  listenAddr,
   215  		JoinAddr:         joinAddr,
   216  		StateDir:         c.root,
   217  		CAHash:           cahash,
   218  		Secret:           secret,
   219  		Executor:         container.NewExecutor(c.config.Backend),
   220  		HeartbeatTick:    1,
   221  		ElectionTick:     3,
   222  		IsManager:        ismanager,
   223  	})
   224  	if err != nil {
   225  		return nil, nil, err
   226  	}
   227  	ctx, cancel := context.WithCancel(context.Background())
   228  	if err := node.Start(ctx); err != nil {
   229  		return nil, nil, err
   230  	}
   231  
   232  	c.node = node
   233  	c.listenAddr = listenAddr
   234  	c.saveState()
   235  	c.config.Backend.SetClusterProvider(c)
   236  	go func() {
   237  		err := node.Err(ctx)
   238  		if err != nil {
   239  			logrus.Errorf("cluster exited with error: %v", err)
   240  		}
   241  		c.Lock()
   242  		c.conn = nil
   243  		c.client = nil
   244  		c.node = nil
   245  		c.ready = false
   246  		c.err = err
   247  		c.Unlock()
   248  		cancel()
   249  	}()
   250  
   251  	go func() {
   252  		select {
   253  		case <-node.Ready():
   254  			c.Lock()
   255  			c.reconnectDelay = initialReconnectDelay
   256  			c.Unlock()
   257  		case <-ctx.Done():
   258  		}
   259  		if ctx.Err() == nil {
   260  			c.Lock()
   261  			c.ready = true
   262  			c.err = nil
   263  			c.Unlock()
   264  		}
   265  		c.configEvent <- struct{}{}
   266  	}()
   267  
   268  	go func() {
   269  		for conn := range node.ListenControlSocket(ctx) {
   270  			c.Lock()
   271  			if c.conn != conn {
   272  				c.client = swarmapi.NewControlClient(conn)
   273  			}
   274  			if conn == nil {
   275  				c.client = nil
   276  			}
   277  			c.conn = conn
   278  			c.Unlock()
   279  			c.configEvent <- struct{}{}
   280  		}
   281  	}()
   282  
   283  	return node, ctx, nil
   284  }
   285  
   286  // Init initializes a new cluster from a user-provided request.
   287  func (c *Cluster) Init(req types.InitRequest) (string, error) {
   288  	c.Lock()
   289  	if node := c.node; node != nil {
   290  		c.Unlock()
   291  		if !req.ForceNewCluster {
   292  			return "", errSwarmExists(node)
   293  		}
   294  		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
   295  		defer cancel()
   296  		c.cancelReconnect()
   297  		if err := c.node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
   298  			return "", err
   299  		}
   300  		c.Lock()
   301  		c.node = nil
   302  		c.conn = nil
   303  		c.ready = false
   304  	}
   305  
   306  	if err := validateAndSanitizeInitRequest(&req); err != nil {
   307  		c.Unlock()
   308  		return "", err
   309  	}
   310  
   311  	// todo: check current state existing
   312  	n, ctx, err := c.startNewNode(req.ForceNewCluster, req.ListenAddr, "", "", "", false)
   313  	if err != nil {
   314  		c.Unlock()
   315  		return "", err
   316  	}
   317  	c.Unlock()
   318  
   319  	select {
   320  	case <-n.Ready():
   321  		if err := initClusterSpec(n, req.Spec); err != nil {
   322  			return "", err
   323  		}
   324  		go c.reconnectOnFailure(ctx)
   325  		return n.NodeID(), nil
   326  	case <-ctx.Done():
   327  		c.RLock()
   328  		defer c.RUnlock()
   329  		if c.err != nil {
   330  			if !req.ForceNewCluster { // if failure on first attempt don't keep state
   331  				if err := c.clearState(); err != nil {
   332  					return "", err
   333  				}
   334  			}
   335  			return "", c.err
   336  		}
   337  		return "", ctx.Err()
   338  	}
   339  }
   340  
   341  // Join makes the current Cluster part of an existing swarm cluster.
   342  func (c *Cluster) Join(req types.JoinRequest) error {
   343  	c.Lock()
   344  	if node := c.node; node != nil {
   345  		c.Unlock()
   346  		return errSwarmExists(node)
   347  	}
   348  	if err := validateAndSanitizeJoinRequest(&req); err != nil {
   349  		c.Unlock()
   350  		return err
   351  	}
   352  	// todo: check current state existing
   353  	n, ctx, err := c.startNewNode(false, req.ListenAddr, req.RemoteAddrs[0], req.Secret, req.CACertHash, req.Manager)
   354  	if err != nil {
   355  		c.Unlock()
   356  		return err
   357  	}
   358  	c.Unlock()
   359  
   360  	certificateRequested := n.CertificateRequested()
   361  	for {
   362  		select {
   363  		case <-certificateRequested:
   364  			if n.NodeMembership() == swarmapi.NodeMembershipPending {
   365  				return fmt.Errorf("Your node is in the process of joining the cluster but needs to be accepted by an existing cluster member.\nTo accept this node into the cluster, run \"docker node accept %v\" on an existing cluster manager. Use the \"docker info\" command to see the current Swarm status of your node.", n.NodeID())
   366  			}
   367  			certificateRequested = nil
   368  		case <-time.After(swarmConnectTimeout):
   369  			// the attempt to connect continues in the background and keeps reconnecting on failure
   370  			go c.reconnectOnFailure(ctx)
   371  			return ErrSwarmJoinTimeoutReached
   372  		case <-n.Ready():
   373  			go c.reconnectOnFailure(ctx)
   374  			return nil
   375  		case <-ctx.Done():
   376  			c.RLock()
   377  			defer c.RUnlock()
   378  			if c.err != nil {
   379  				return c.err
   380  			}
   381  			return ctx.Err()
   382  		}
   383  	}
   384  }
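        // A hedged sketch of the requests Init and Join consume. The real requests
        // are built by the API layer outside this file, so the literal values below
        // are illustrative assumptions only:
        //
        //	nodeID, err := c.Init(types.InitRequest{
        //		ListenAddr:      "0.0.0.0:2377",
        //		ForceNewCluster: false,
        //		Spec:            types.Spec{}, // zero fields are filled from defaultSpec
        //	})
        //
        //	err = c.Join(types.JoinRequest{
        //		ListenAddr:  "0.0.0.0:2377",
        //		RemoteAddrs: []string{"198.51.100.1:2377"},
        //		Manager:     false,
        //	})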
   385  
   386  func (c *Cluster) cancelReconnect() {
   387  	c.stop = true
   388  	if c.cancelDelay != nil {
   389  		c.cancelDelay()
   390  		c.cancelDelay = nil
   391  	}
   392  }
   393  
   394  // Leave shuts down Cluster and removes current state.
   395  func (c *Cluster) Leave(force bool) error {
   396  	c.Lock()
   397  	node := c.node
   398  	if node == nil {
   399  		c.Unlock()
   400  		return ErrNoSwarm
   401  	}
   402  
   403  	if node.Manager() != nil && !force {
   404  		msg := "You are attempting to leave the cluster on a node that is participating as a manager. "
   405  		if c.isActiveManager() {
   406  			active, reachable, unreachable, err := c.managerStats()
   407  			if err == nil {
   408  				if active && reachable-2 <= unreachable {
   409  					if reachable == 1 && unreachable == 0 {
   410  						msg += "Leaving the last manager will remove all current state of the cluster. Use `--force` to ignore this message. "
   411  						c.Unlock()
   412  						return fmt.Errorf(msg)
   413  					}
   414  					msg += fmt.Sprintf("Leaving the cluster will leave you with %v managers out of %v. This means Raft quorum will be lost and your cluster will become inaccessible. ", reachable-1, reachable+unreachable)
   415  				}
   416  			}
   417  		} else {
   418  			msg += "Doing so may lose the consensus of your cluster. "
   419  		}
   420  
   421  		msg += "The only way to restore a cluster that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to ignore this message."
   422  		c.Unlock()
   423  		return fmt.Errorf(msg)
   424  	}
   425  	c.cancelReconnect()
   426  	c.Unlock()
   427  
   428  	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
   429  	defer cancel()
   430  	if err := node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
   431  		return err
   432  	}
   433  	if nodeID := node.NodeID(); nodeID != "" {
   434  		for _, id := range c.config.Backend.ListContainersForNode(nodeID) {
   435  			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
   436  				logrus.Errorf("error removing %v: %v", id, err)
   437  			}
   438  		}
   439  	}
   440  	c.Lock()
   441  	defer c.Unlock()
   442  	c.node = nil
   443  	c.conn = nil
   444  	c.ready = false
   445  	c.configEvent <- struct{}{}
   446  	// todo: cleanup optional?
   447  	if err := c.clearState(); err != nil {
   448  		return err
   449  	}
   450  	return nil
   451  }
   452  
   453  func (c *Cluster) clearState() error {
   454  	// todo: backup this data instead of removing?
   455  	if err := os.RemoveAll(c.root); err != nil {
   456  		return err
   457  	}
   458  	if err := os.MkdirAll(c.root, 0700); err != nil {
   459  		return err
   460  	}
   461  	c.config.Backend.SetClusterProvider(nil)
   462  	return nil
   463  }
   464  
   465  func (c *Cluster) getRequestContext() context.Context { // TODO: not needed when requests don't block on quorum lost
   466  	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
   467  	return ctx
   468  }
   469  
   470  // Inspect retrieves the configuration properties of a managed swarm cluster.
   471  func (c *Cluster) Inspect() (types.Swarm, error) {
   472  	c.RLock()
   473  	defer c.RUnlock()
   474  
   475  	if !c.isActiveManager() {
   476  		return types.Swarm{}, c.errNoManager()
   477  	}
   478  
   479  	swarm, err := getSwarm(c.getRequestContext(), c.client)
   480  	if err != nil {
   481  		return types.Swarm{}, err
   482  	}
   487  
   488  	return convert.SwarmFromGRPC(*swarm), nil
   489  }
   490  
   491  // Update updates configuration of a managed swarm cluster.
   492  func (c *Cluster) Update(version uint64, spec types.Spec) error {
   493  	c.RLock()
   494  	defer c.RUnlock()
   495  
   496  	if !c.isActiveManager() {
   497  		return c.errNoManager()
   498  	}
   499  
   500  	swarm, err := getSwarm(c.getRequestContext(), c.client)
   501  	if err != nil {
   502  		return err
   503  	}
   504  
   505  	swarmSpec, err := convert.SwarmSpecToGRPCandMerge(spec, &swarm.Spec)
   506  	if err != nil {
   507  		return err
   508  	}
   509  
   510  	_, err = c.client.UpdateCluster(
   511  		c.getRequestContext(),
   512  		&swarmapi.UpdateClusterRequest{
   513  			ClusterID: swarm.ID,
   514  			Spec:      &swarmSpec,
   515  			ClusterVersion: &swarmapi.Version{
   516  				Index: version,
   517  			},
   518  		},
   519  	)
   520  	return err
   521  }
   522  
   523  // IsManager returns true if Cluster is participating as a manager.
   524  func (c *Cluster) IsManager() bool {
   525  	c.RLock()
   526  	defer c.RUnlock()
   527  	return c.isActiveManager()
   528  }
   529  
   530  // IsAgent returns true if Cluster is participating as a worker/agent.
   531  func (c *Cluster) IsAgent() bool {
   532  	c.RLock()
   533  	defer c.RUnlock()
   534  	return c.ready
   535  }
   536  
   537  // GetListenAddress returns the listening address for current manager's
   538  // consensus and dispatcher APIs.
   539  func (c *Cluster) GetListenAddress() string {
   540  	c.RLock()
   541  	defer c.RUnlock()
   542  	if c.conn != nil {
   543  		return c.listenAddr
   544  	}
   545  	return ""
   546  }
   547  
   548  // GetRemoteAddress returns a known advertise address of a remote manager if
   549  // available.
   550  // todo: change to array/connect with info
   551  func (c *Cluster) GetRemoteAddress() string {
   552  	c.RLock()
   553  	defer c.RUnlock()
   554  	return c.getRemoteAddress()
   555  }
   556  
   557  func (c *Cluster) getRemoteAddress() string {
   558  	if c.node == nil {
   559  		return ""
   560  	}
   561  	nodeID := c.node.NodeID()
   562  	for _, r := range c.node.Remotes() {
   563  		if r.NodeID != nodeID {
   564  			return r.Addr
   565  		}
   566  	}
   567  	return ""
   568  }
   569  
   570  // ListenClusterEvents returns a channel that receives messages on cluster
   571  // participation changes.
   572  // todo: make cancelable and accessible to multiple callers
   573  func (c *Cluster) ListenClusterEvents() <-chan struct{} {
   574  	return c.configEvent
   575  }
   576  
   577  // Info returns information about the current cluster state.
   578  func (c *Cluster) Info() types.Info {
   579  	var info types.Info
   580  	c.RLock()
   581  	defer c.RUnlock()
   582  
   583  	if c.node == nil {
   584  		info.LocalNodeState = types.LocalNodeStateInactive
   585  		if c.cancelDelay != nil {
   586  			info.LocalNodeState = types.LocalNodeStateError
   587  		}
   588  	} else {
   589  		info.LocalNodeState = types.LocalNodeStatePending
   590  		if c.ready {
   591  			info.LocalNodeState = types.LocalNodeStateActive
   592  		}
   593  	}
   594  	if c.err != nil {
   595  		info.Error = c.err.Error()
   596  	}
   597  
   598  	if c.isActiveManager() {
   599  		info.ControlAvailable = true
   600  		if r, err := c.client.ListNodes(c.getRequestContext(), &swarmapi.ListNodesRequest{}); err == nil {
   601  			info.Nodes = len(r.Nodes)
   602  			for _, n := range r.Nodes {
   603  				if n.ManagerStatus != nil {
   604  					info.Managers++
   605  				}
   606  			}
   607  		}
   608  
   609  		if swarm, err := getSwarm(c.getRequestContext(), c.client); err == nil && swarm != nil {
   610  			info.CACertHash = swarm.RootCA.CACertHash
   611  		}
   612  	}
   613  
   614  	if c.node != nil {
   615  		for _, r := range c.node.Remotes() {
   616  			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
   617  		}
   618  		info.NodeID = c.node.NodeID()
   619  	}
   620  
   621  	return info
   622  }
   623  
   624  // isActiveManager should not be called without a read lock
   625  func (c *Cluster) isActiveManager() bool {
   626  	return c.conn != nil
   627  }
   628  
   629  // errNoManager returns error describing why manager commands can't be used.
   630  // Call with read lock.
   631  func (c *Cluster) errNoManager() error {
   632  	if c.node == nil {
   633  		return fmt.Errorf("This node is not a Swarm manager. Use \"docker swarm init\" or \"docker swarm join --manager\" to connect this node to Swarm and try again.")
   634  	}
   635  	if c.node.Manager() != nil {
   636  		return fmt.Errorf("This node is not a Swarm manager. Manager is being prepared or has trouble connecting to the cluster.")
   637  	}
   638  	return fmt.Errorf("This node is not a Swarm manager. Worker nodes can't be used to view or modify cluster state. Please run this command on a manager node or promote the current node to a manager.")
   639  }
   640  
   641  // GetServices returns all services of a managed swarm cluster.
   642  func (c *Cluster) GetServices(options apitypes.ServiceListOptions) ([]types.Service, error) {
   643  	c.RLock()
   644  	defer c.RUnlock()
   645  
   646  	if !c.isActiveManager() {
   647  		return nil, c.errNoManager()
   648  	}
   649  
   650  	filters, err := newListServicesFilters(options.Filter)
   651  	if err != nil {
   652  		return nil, err
   653  	}
   654  	r, err := c.client.ListServices(
   655  		c.getRequestContext(),
   656  		&swarmapi.ListServicesRequest{Filters: filters})
   657  	if err != nil {
   658  		return nil, err
   659  	}
   660  
   661  	var services []types.Service
   662  
   663  	for _, service := range r.Services {
   664  		services = append(services, convert.ServiceFromGRPC(*service))
   665  	}
   666  
   667  	return services, nil
   668  }
   669  
   670  // CreateService creates a new service in a managed swarm cluster.
   671  func (c *Cluster) CreateService(s types.ServiceSpec) (string, error) {
   672  	c.RLock()
   673  	defer c.RUnlock()
   674  
   675  	if !c.isActiveManager() {
   676  		return "", c.errNoManager()
   677  	}
   678  
   679  	ctx := c.getRequestContext()
   680  
   681  	err := populateNetworkID(ctx, c.client, &s)
   682  	if err != nil {
   683  		return "", err
   684  	}
   685  
   686  	serviceSpec, err := convert.ServiceSpecToGRPC(s)
   687  	if err != nil {
   688  		return "", err
   689  	}
   690  	r, err := c.client.CreateService(ctx, &swarmapi.CreateServiceRequest{Spec: &serviceSpec})
   691  	if err != nil {
   692  		return "", err
   693  	}
   694  
   695  	return r.Service.ID, nil
   696  }
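        // A hedged sketch of a minimal spec a caller might pass to CreateService.
        // The nested field names follow the engine-api swarm types and are assumed
        // here for illustration only:
        //
        //	id, err := c.CreateService(types.ServiceSpec{
        //		Annotations: types.Annotations{Name: "web"},
        //		TaskTemplate: types.TaskSpec{
        //			ContainerSpec: types.ContainerSpec{Image: "nginx:alpine"},
        //		},
        //	})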
   697  
   698  // GetService returns a service based on an ID or name.
   699  func (c *Cluster) GetService(input string) (types.Service, error) {
   700  	c.RLock()
   701  	defer c.RUnlock()
   702  
   703  	if !c.isActiveManager() {
   704  		return types.Service{}, c.errNoManager()
   705  	}
   706  
   707  	service, err := getService(c.getRequestContext(), c.client, input)
   708  	if err != nil {
   709  		return types.Service{}, err
   710  	}
   711  	return convert.ServiceFromGRPC(*service), nil
   712  }
   713  
   714  // UpdateService updates an existing service to match new properties.
   715  func (c *Cluster) UpdateService(serviceID string, version uint64, spec types.ServiceSpec) error {
   716  	c.RLock()
   717  	defer c.RUnlock()
   718  
   719  	if !c.isActiveManager() {
   720  		return c.errNoManager()
   721  	}
   722  
   723  	serviceSpec, err := convert.ServiceSpecToGRPC(spec)
   724  	if err != nil {
   725  		return err
   726  	}
   727  
   728  	_, err = c.client.UpdateService(
   729  		c.getRequestContext(),
   730  		&swarmapi.UpdateServiceRequest{
   731  			ServiceID: serviceID,
   732  			Spec:      &serviceSpec,
   733  			ServiceVersion: &swarmapi.Version{
   734  				Index: version,
   735  			},
   736  		},
   737  	)
   738  	return err
   739  }
   740  
   741  // RemoveService removes a service from a managed swarm cluster.
   742  func (c *Cluster) RemoveService(input string) error {
   743  	c.RLock()
   744  	defer c.RUnlock()
   745  
   746  	if !c.isActiveManager() {
   747  		return c.errNoManager()
   748  	}
   749  
   750  	service, err := getService(c.getRequestContext(), c.client, input)
   751  	if err != nil {
   752  		return err
   753  	}
   754  
   755  	if _, err := c.client.RemoveService(c.getRequestContext(), &swarmapi.RemoveServiceRequest{ServiceID: service.ID}); err != nil {
   756  		return err
   757  	}
   758  	return nil
   759  }
   760  
   761  // GetNodes returns a list of all nodes known to a cluster.
   762  func (c *Cluster) GetNodes(options apitypes.NodeListOptions) ([]types.Node, error) {
   763  	c.RLock()
   764  	defer c.RUnlock()
   765  
   766  	if !c.isActiveManager() {
   767  		return nil, c.errNoManager()
   768  	}
   769  
   770  	filters, err := newListNodesFilters(options.Filter)
   771  	if err != nil {
   772  		return nil, err
   773  	}
   774  	r, err := c.client.ListNodes(
   775  		c.getRequestContext(),
   776  		&swarmapi.ListNodesRequest{Filters: filters})
   777  	if err != nil {
   778  		return nil, err
   779  	}
   780  
   781  	nodes := []types.Node{}
   782  
   783  	for _, node := range r.Nodes {
   784  		nodes = append(nodes, convert.NodeFromGRPC(*node))
   785  	}
   786  	return nodes, nil
   787  }
   788  
   789  // GetNode returns a node based on an ID or name.
   790  func (c *Cluster) GetNode(input string) (types.Node, error) {
   791  	c.RLock()
   792  	defer c.RUnlock()
   793  
   794  	if !c.isActiveManager() {
   795  		return types.Node{}, c.errNoManager()
   796  	}
   797  
   798  	node, err := getNode(c.getRequestContext(), c.client, input)
   799  	if err != nil {
   800  		return types.Node{}, err
   801  	}
   802  	return convert.NodeFromGRPC(*node), nil
   803  }
   804  
   805  // UpdateNode updates an existing node's properties.
   806  func (c *Cluster) UpdateNode(nodeID string, version uint64, spec types.NodeSpec) error {
   807  	c.RLock()
   808  	defer c.RUnlock()
   809  
   810  	if !c.isActiveManager() {
   811  		return c.errNoManager()
   812  	}
   813  
   814  	nodeSpec, err := convert.NodeSpecToGRPC(spec)
   815  	if err != nil {
   816  		return err
   817  	}
   818  
   819  	_, err = c.client.UpdateNode(
   820  		c.getRequestContext(),
   821  		&swarmapi.UpdateNodeRequest{
   822  			NodeID: nodeID,
   823  			Spec:   &nodeSpec,
   824  			NodeVersion: &swarmapi.Version{
   825  				Index: version,
   826  			},
   827  		},
   828  	)
   829  	return err
   830  }
   831  
   832  // RemoveNode removes a node from a cluster
   833  func (c *Cluster) RemoveNode(input string) error {
   834  	c.RLock()
   835  	defer c.RUnlock()
   836  
   837  	if !c.isActiveManager() {
   838  		return c.errNoManager()
   839  	}
   840  
   841  	ctx := c.getRequestContext()
   842  
   843  	node, err := getNode(ctx, c.client, input)
   844  	if err != nil {
   845  		return err
   846  	}
   847  
   848  	if _, err := c.client.RemoveNode(ctx, &swarmapi.RemoveNodeRequest{NodeID: node.ID}); err != nil {
   849  		return err
   850  	}
   851  	return nil
   852  }
   853  
   854  // GetTasks returns a list of tasks matching the filter options.
   855  func (c *Cluster) GetTasks(options apitypes.TaskListOptions) ([]types.Task, error) {
   856  	c.RLock()
   857  	defer c.RUnlock()
   858  
   859  	if !c.isActiveManager() {
   860  		return nil, c.errNoManager()
   861  	}
   862  
   863  	filters, err := newListTasksFilters(options.Filter)
   864  	if err != nil {
   865  		return nil, err
   866  	}
   867  	r, err := c.client.ListTasks(
   868  		c.getRequestContext(),
   869  		&swarmapi.ListTasksRequest{Filters: filters})
   870  	if err != nil {
   871  		return nil, err
   872  	}
   873  
   874  	tasks := []types.Task{}
   875  
   876  	for _, task := range r.Tasks {
   877  		tasks = append(tasks, convert.TaskFromGRPC(*task))
   878  	}
   879  	return tasks, nil
   880  }
   881  
   882  // GetTask returns a task by an ID.
   883  func (c *Cluster) GetTask(input string) (types.Task, error) {
   884  	c.RLock()
   885  	defer c.RUnlock()
   886  
   887  	if !c.isActiveManager() {
   888  		return types.Task{}, c.errNoManager()
   889  	}
   890  
   891  	task, err := getTask(c.getRequestContext(), c.client, input)
   892  	if err != nil {
   893  		return types.Task{}, err
   894  	}
   895  	return convert.TaskFromGRPC(*task), nil
   896  }
   897  
   898  // GetNetwork returns a cluster network by an ID.
   899  func (c *Cluster) GetNetwork(input string) (apitypes.NetworkResource, error) {
   900  	c.RLock()
   901  	defer c.RUnlock()
   902  
   903  	if !c.isActiveManager() {
   904  		return apitypes.NetworkResource{}, c.errNoManager()
   905  	}
   906  
   907  	network, err := getNetwork(c.getRequestContext(), c.client, input)
   908  	if err != nil {
   909  		return apitypes.NetworkResource{}, err
   910  	}
   911  	return convert.BasicNetworkFromGRPC(*network), nil
   912  }
   913  
   914  // GetNetworks returns all current cluster managed networks.
   915  func (c *Cluster) GetNetworks() ([]apitypes.NetworkResource, error) {
   916  	c.RLock()
   917  	defer c.RUnlock()
   918  
   919  	if !c.isActiveManager() {
   920  		return nil, c.errNoManager()
   921  	}
   922  
   923  	r, err := c.client.ListNetworks(c.getRequestContext(), &swarmapi.ListNetworksRequest{})
   924  	if err != nil {
   925  		return nil, err
   926  	}
   927  
   928  	var networks []apitypes.NetworkResource
   929  
   930  	for _, network := range r.Networks {
   931  		networks = append(networks, convert.BasicNetworkFromGRPC(*network))
   932  	}
   933  
   934  	return networks, nil
   935  }
   936  
   937  // CreateNetwork creates a new cluster managed network.
   938  func (c *Cluster) CreateNetwork(s apitypes.NetworkCreateRequest) (string, error) {
   939  	c.RLock()
   940  	defer c.RUnlock()
   941  
   942  	if !c.isActiveManager() {
   943  		return "", c.errNoManager()
   944  	}
   945  
   946  	if runconfig.IsPreDefinedNetwork(s.Name) {
   947  		err := fmt.Errorf("%s is a pre-defined network and cannot be created", s.Name)
   948  		return "", errors.NewRequestForbiddenError(err)
   949  	}
   950  
   951  	networkSpec := convert.BasicNetworkCreateToGRPC(s)
   952  	r, err := c.client.CreateNetwork(c.getRequestContext(), &swarmapi.CreateNetworkRequest{Spec: &networkSpec})
   953  	if err != nil {
   954  		return "", err
   955  	}
   956  
   957  	return r.Network.ID, nil
   958  }
   959  
   960  // RemoveNetwork removes a cluster network.
   961  func (c *Cluster) RemoveNetwork(input string) error {
   962  	c.RLock()
   963  	defer c.RUnlock()
   964  
   965  	if !c.isActiveManager() {
   966  		return c.errNoManager()
   967  	}
   968  
   969  	network, err := getNetwork(c.getRequestContext(), c.client, input)
   970  	if err != nil {
   971  		return err
   972  	}
   973  
   974  	if _, err := c.client.RemoveNetwork(c.getRequestContext(), &swarmapi.RemoveNetworkRequest{NetworkID: network.ID}); err != nil {
   975  		return err
   976  	}
   977  	return nil
   978  }
   979  
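        // populateNetworkID rewrites every network reference in the service spec
        // from a user-supplied name, ID or ID prefix to the network's full ID,
        // using the same resolution rules as getNetwork below.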
   980  func populateNetworkID(ctx context.Context, c swarmapi.ControlClient, s *types.ServiceSpec) error {
   981  	for i, n := range s.Networks {
   982  		apiNetwork, err := getNetwork(ctx, c, n.Target)
   983  		if err != nil {
   984  			return err
   985  		}
   986  		s.Networks[i].Target = apiNetwork.ID
   987  	}
   988  	return nil
   989  }
   990  
   991  func getNetwork(ctx context.Context, c swarmapi.ControlClient, input string) (*swarmapi.Network, error) {
   992  	// GetNetwork to match via full ID.
   993  	rg, err := c.GetNetwork(ctx, &swarmapi.GetNetworkRequest{NetworkID: input})
   994  	if err != nil {
   995  		// If any error (including NotFound), ListNetworks to match via ID prefix and full name.
   996  		rl, err := c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{Names: []string{input}}})
   997  		if err != nil || len(rl.Networks) == 0 {
   998  			rl, err = c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{IDPrefixes: []string{input}}})
   999  		}
  1000  
  1001  		if err != nil {
  1002  			return nil, err
  1003  		}
  1004  
  1005  		if len(rl.Networks) == 0 {
  1006  			return nil, fmt.Errorf("network %s not found", input)
  1007  		}
  1008  
  1009  		if l := len(rl.Networks); l > 1 {
  1010  			return nil, fmt.Errorf("network %s is ambiguous (%d matches found)", input, l)
  1011  		}
  1012  
  1013  		return rl.Networks[0], nil
  1014  	}
  1015  	return rg.Network, nil
  1016  }
  1017  
  1018  // Cleanup stops active swarm node. This is run before daemon shutdown.
  1019  func (c *Cluster) Cleanup() {
  1020  	c.Lock()
  1021  	node := c.node
  1022  	if node == nil {
  1023  		c.Unlock()
  1024  		return
  1025  	}
  1026  
  1027  	if c.isActiveManager() {
  1028  		active, reachable, unreachable, err := c.managerStats()
  1029  		if err == nil {
  1030  			singlenode := active && reachable == 1 && unreachable == 0
  1031  			if active && !singlenode && reachable-2 <= unreachable {
  1032  				logrus.Errorf("Leaving cluster with %v managers left out of %v. Raft quorum will be lost.", reachable-1, reachable+unreachable)
  1033  			}
  1034  		}
  1035  	}
  1036  	c.cancelReconnect()
  1037  	c.Unlock()
  1038  	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
  1039  	defer cancel()
  1040  	if err := node.Stop(ctx); err != nil {
  1041  		logrus.Errorf("error cleaning up cluster: %v", err)
  1042  	}
  1043  	c.Lock()
  1044  	c.node = nil
  1045  	c.ready = false
  1046  	c.conn = nil
  1047  	c.Unlock()
  1048  }
  1049  
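        // managerStats reports whether this node is a reachable manager and how
        // many managers are currently reachable and unreachable. It queries the
        // control API, so it is only meaningful while c.client is set; callers
        // hold the cluster lock.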
  1050  func (c *Cluster) managerStats() (current bool, reachable int, unreachable int, err error) {
  1051  	ctx, _ := context.WithTimeout(context.Background(), 3*time.Second)
  1052  	nodes, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{})
  1053  	if err != nil {
  1054  		return false, 0, 0, err
  1055  	}
  1056  	for _, n := range nodes.Nodes {
  1057  		if n.ManagerStatus != nil {
  1058  			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_REACHABLE {
  1059  				reachable++
  1060  				if n.ID == c.node.NodeID() {
  1061  					current = true
  1062  				}
  1063  			}
  1064  			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_UNREACHABLE {
  1065  				unreachable++
  1066  			}
  1067  		}
  1068  	}
  1069  	return
  1070  }
  1071  
  1072  func validateAndSanitizeInitRequest(req *types.InitRequest) error {
  1073  	var err error
  1074  	req.ListenAddr, err = validateAddr(req.ListenAddr)
  1075  	if err != nil {
  1076  		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
  1077  	}
  1078  
  1079  	spec := &req.Spec
  1080  	// provide sane defaults instead of erroring
  1081  	if spec.Name == "" {
  1082  		spec.Name = "default"
  1083  	}
  1084  	if spec.Raft.SnapshotInterval == 0 {
  1085  		spec.Raft.SnapshotInterval = defaultSpec.Raft.SnapshotInterval
  1086  	}
  1087  	if spec.Raft.LogEntriesForSlowFollowers == 0 {
  1088  		spec.Raft.LogEntriesForSlowFollowers = defaultSpec.Raft.LogEntriesForSlowFollowers
  1089  	}
  1090  	if spec.Raft.ElectionTick == 0 {
  1091  		spec.Raft.ElectionTick = defaultSpec.Raft.ElectionTick
  1092  	}
  1093  	if spec.Raft.HeartbeatTick == 0 {
  1094  		spec.Raft.HeartbeatTick = defaultSpec.Raft.HeartbeatTick
  1095  	}
  1096  	if spec.Dispatcher.HeartbeatPeriod == 0 {
  1097  		spec.Dispatcher.HeartbeatPeriod = defaultSpec.Dispatcher.HeartbeatPeriod
  1098  	}
  1099  	if spec.CAConfig.NodeCertExpiry == 0 {
  1100  		spec.CAConfig.NodeCertExpiry = defaultSpec.CAConfig.NodeCertExpiry
  1101  	}
  1102  	if spec.Orchestration.TaskHistoryRetentionLimit == 0 {
  1103  		spec.Orchestration.TaskHistoryRetentionLimit = defaultSpec.Orchestration.TaskHistoryRetentionLimit
  1104  	}
  1105  	return nil
  1106  }
  1107  
  1108  func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
  1109  	var err error
  1110  	req.ListenAddr, err = validateAddr(req.ListenAddr)
  1111  	if err != nil {
  1112  		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
  1113  	}
  1114  	if len(req.RemoteAddrs) == 0 {
  1115  		return fmt.Errorf("at least 1 RemoteAddr is required to join")
  1116  	}
  1117  	for i := range req.RemoteAddrs {
  1118  		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
  1119  		if err != nil {
  1120  			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
  1121  		}
  1122  	}
  1123  	if req.CACertHash != "" {
  1124  		if _, err := digest.ParseDigest(req.CACertHash); err != nil {
  1125  			return fmt.Errorf("invalid CACertHash %q, %v", req.CACertHash, err)
  1126  		}
  1127  	}
  1128  	return nil
  1129  }
  1130  
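        // validateAddr rejects empty addresses and normalizes the rest against
        // defaultAddr. Note that addresses opts.ParseTCPAddr cannot parse are
        // currently passed through unchanged rather than reported as errors.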
  1131  func validateAddr(addr string) (string, error) {
  1132  	if addr == "" {
  1133  		return addr, fmt.Errorf("invalid empty address")
  1134  	}
  1135  	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
  1136  	if err != nil {
  1137  		return addr, nil
  1138  	}
  1139  	return strings.TrimPrefix(newaddr, "tcp://"), nil
  1140  }
  1141  
  1142  func errSwarmExists(node *swarmagent.Node) error {
  1143  	if node.NodeMembership() != swarmapi.NodeMembershipAccepted {
  1144  		return ErrPendingSwarmExists
  1145  	}
  1146  	return ErrSwarmExists
  1147  }
  1148  
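        // initClusterSpec waits for the control socket of a freshly initialized
        // manager, polls until swarmkit has created its default cluster object
        // (up to ~2s in 200ms steps), then merges the user-provided spec into it
        // and updates the cluster.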
  1149  func initClusterSpec(node *swarmagent.Node, spec types.Spec) error {
  1150  	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
  1151  	for conn := range node.ListenControlSocket(ctx) {
  1152  		if ctx.Err() != nil {
  1153  			return ctx.Err()
  1154  		}
  1155  		if conn != nil {
  1156  			client := swarmapi.NewControlClient(conn)
  1157  			var cluster *swarmapi.Cluster
  1158  			for i := 0; ; i++ {
  1159  				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
  1160  				if err != nil {
  1161  					return fmt.Errorf("error on listing clusters: %v", err)
  1162  				}
  1163  				if len(lcr.Clusters) == 0 {
  1164  					if i < 10 {
  1165  						time.Sleep(200 * time.Millisecond)
  1166  						continue
  1167  					}
  1168  					return fmt.Errorf("empty list of clusters was returned")
  1169  				}
  1170  				cluster = lcr.Clusters[0]
  1171  				break
  1172  			}
  1173  			newspec, err := convert.SwarmSpecToGRPCandMerge(spec, &cluster.Spec)
  1174  			if err != nil {
  1175  				return fmt.Errorf("error updating cluster settings: %v", err)
  1176  			}
  1177  			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
  1178  				ClusterID:      cluster.ID,
  1179  				ClusterVersion: &cluster.Meta.Version,
  1180  				Spec:           &newspec,
  1181  			})
  1182  			if err != nil {
  1183  				return fmt.Errorf("error updating cluster settings: %v", err)
  1184  			}
  1185  			return nil
  1186  		}
  1187  	}
  1188  	return ctx.Err()
  1189  }