github.com/jwhonce/docker@v0.6.7-0.20190327063223-da823cf3a5a3/daemon/cluster/noderunner.go

package cluster // import "github.com/docker/docker/daemon/cluster"

import (
	"context"
	"fmt"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/executor/container"
	lncluster "github.com/docker/libnetwork/cluster"
	swarmapi "github.com/docker/swarmkit/api"
	swarmallocator "github.com/docker/swarmkit/manager/allocator/cnmallocator"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// nodeRunner implements a manager for a continuously running swarmkit node,
// restarting it with backoff delays if needed.
type nodeRunner struct {
	nodeState
	mu             sync.RWMutex
	done           chan struct{} // closed when swarmNode exits
	ready          chan struct{} // closed when swarmNode becomes active
	reconnectDelay time.Duration
	config         nodeStartConfig

	repeatedRun     bool
	cancelReconnect func()
	stopping        bool
	cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk in JSON. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr string
	// DataPathAddr is the address that has to be used for the data path.
	DataPathAddr string
	// DefaultAddressPool contains the list of subnets.
	DefaultAddressPool []string
	// SubnetSize contains the subnet size of DefaultAddressPool.
	SubnetSize uint32
	// DataPathPort contains the data path port (VXLAN UDP port) number that is used for data traffic.
	DataPathPort uint32
	// JoinInProgress is set to true if a join operation has started, but
	// not completed yet.
	JoinInProgress bool

	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
	availability    types.NodeAvailability
}

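// Ready returns a channel that is closed once the node either becomes active
// or exits. If the node exited before becoming active, the node's exit error
// (possibly nil) is sent on the channel before it is closed.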
func (n *nodeRunner) Ready() chan error {
	c := make(chan error, 1)
	n.mu.RLock()
	ready, done := n.ready, n.done
	n.mu.RUnlock()
	go func() {
		select {
		case <-ready:
		case <-done:
		}
		select {
		case <-ready:
		default:
			n.mu.RLock()
			c <- n.err
			n.mu.RUnlock()
		}
		close(c)
	}()
	return c
}

// Start starts a new swarmkit node with the given configuration, resetting
// the reconnect backoff delay.
func (n *nodeRunner) Start(conf nodeStartConfig) error {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.reconnectDelay = initialReconnectDelay

	return n.start(conf)
}

// start creates and starts the swarmkit node and launches the goroutines that
// track its control socket, readiness, and exit. It is called with n.mu held.
func (n *nodeRunner) start(conf nodeStartConfig) error {
	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
	}

	joinAddr := conf.joinAddr
	if joinAddr == "" && conf.JoinInProgress {
		// We must have been restarted while trying to join a cluster.
		// Continue trying to join instead of forming our own cluster.
		joinAddr = conf.RemoteAddr
	}

	// Hostname is not set here. Instead, it is obtained from
	// the node description that is reported periodically.
	swarmnodeConfig := swarmnode.Config{
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		NetworkConfig: &swarmallocator.NetworkConfig{
			DefaultAddrPool: conf.DefaultAddressPool,
			SubnetSize:      conf.SubnetSize,
			VXLANUDPPort:    conf.DataPathPort,
		},
		JoinAddr:  joinAddr,
		StateDir:  n.cluster.root,
		JoinToken: conf.joinToken,
		Executor: container.NewExecutor(
			n.cluster.config.Backend,
			n.cluster.config.PluginBackend,
			n.cluster.config.ImageBackend,
			n.cluster.config.VolumeBackend,
		),
		HeartbeatTick: n.cluster.config.RaftHeartbeatTick,
		// Recommended value in etcd/raft is 10 x (HeartbeatTick).
		// Lower values were seen to cause instability because of
		// frequent leader elections when running on flaky networks.
		ElectionTick:     n.cluster.config.RaftElectionTick,
		UnlockKey:        conf.lockKey,
		AutoLockManagers: conf.autolock,
		PluginGetter:     n.cluster.config.Backend.PluginGetter(),
	}
	if conf.availability != "" {
		avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
		if !ok {
			return fmt.Errorf("invalid Availability: %q", conf.availability)
		}
		swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
	}
	node, err := swarmnode.New(&swarmnodeConfig)
	if err != nil {
		return err
	}
	if err := node.Start(context.Background()); err != nil {
		return err
	}

	n.done = make(chan struct{})
	n.ready = make(chan struct{})
	n.swarmNode = node
	if conf.joinAddr != "" {
		conf.JoinInProgress = true
	}
	n.config = conf
	savePersistentState(n.cluster.root, conf)

	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		n.handleNodeExit(node)
		cancel()
	}()

	go n.handleReadyEvent(ctx, node, n.ready)
	go n.handleControlSocketChange(ctx, node)

	return nil
}

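// handleControlSocketChange updates the stored gRPC connection and clients
// whenever the node's control socket changes, starts a cluster-event watcher
// for each new connection, and notifies the cluster of the socket change.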
func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
	for conn := range node.ListenControlSocket(ctx) {
		n.mu.Lock()
		if n.grpcConn != conn {
			if conn == nil {
				n.controlClient = nil
				n.logsClient = nil
			} else {
				n.controlClient = swarmapi.NewControlClient(conn)
				n.logsClient = swarmapi.NewLogsClient(conn)
				// push store changes to daemon
				go n.watchClusterEvents(ctx, conn)
			}
		}
		n.grpcConn = conn
		n.mu.Unlock()
		n.cluster.SendClusterEvent(lncluster.EventSocketChange)
	}
}

// watchClusterEvents subscribes to swarmkit store changes for nodes,
// services, networks, secrets, and configs, and forwards them to the
// cluster's watch stream until the watch breaks or ctx is canceled.
func (n *nodeRunner) watchClusterEvents(ctx context.Context, conn *grpc.ClientConn) {
	client := swarmapi.NewWatchClient(conn)
	watch, err := client.Watch(ctx, &swarmapi.WatchRequest{
		Entries: []*swarmapi.WatchRequest_WatchEntry{
			{
				Kind:   "node",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "service",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "network",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "secret",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "config",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
		},
		IncludeOldObject: true,
	})
	if err != nil {
		logrus.WithError(err).Error("failed to watch cluster store")
		return
	}
	for {
		msg, err := watch.Recv()
		if err != nil {
			// store watch is broken
			errStatus, ok := status.FromError(err)
			if !ok || errStatus.Code() != codes.Canceled {
				logrus.WithError(err).Error("failed to receive changes from store watch API")
			}
			return
		}
		select {
		case <-ctx.Done():
			return
		case n.cluster.watchStream <- msg:
		}
	}
}

// handleReadyEvent waits for the node to become active or for ctx to be
// canceled, clears the persisted join-in-progress flag on success, closes
// ready, and then emits a node-ready cluster event.
func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
	select {
	case <-node.Ready():
		n.mu.Lock()
		n.err = nil
		if n.config.JoinInProgress {
			n.config.JoinInProgress = false
			savePersistentState(n.cluster.root, n.config)
		}
		n.mu.Unlock()
		close(ready)
	case <-ctx.Done():
	}
	n.cluster.SendClusterEvent(lncluster.EventNodeReady)
}

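// handleNodeExit blocks until the swarmkit node exits, records the exit
// error, closes the done channel, and arms the reconnect watcher when the
// node had previously become ready or had already been restarted.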
func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
	err := detectLockedError(node.Err(context.Background()))
	if err != nil {
		logrus.Errorf("cluster exited with error: %v", err)
	}
	n.mu.Lock()
	n.swarmNode = nil
	n.err = err
	close(n.done)
	select {
	case <-n.ready:
		n.enableReconnectWatcher()
	default:
		if n.repeatedRun {
			n.enableReconnectWatcher()
		}
	}
	n.repeatedRun = true
	n.mu.Unlock()
}

// Stop stops the current swarm node if it is running.
func (n *nodeRunner) Stop() error {
	n.mu.Lock()
	if n.cancelReconnect != nil { // between restarts
		n.cancelReconnect()
		n.cancelReconnect = nil
	}
	if n.swarmNode == nil {
		// even though the swarm node is nil we still may need
		// to send a node leave event to perform any cleanup required.
		if n.cluster != nil {
			n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
		}
		n.mu.Unlock()
		return nil
	}
	n.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	n.mu.Unlock()
	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		return err
	}
	n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
	<-n.done
	return nil
}

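// State returns a snapshot of the node's current state. The reported status
// is derived from the stored error, any pending reconnect, and whether the
// node has become ready.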
func (n *nodeRunner) State() nodeState {
	if n == nil {
		return nodeState{status: types.LocalNodeStateInactive}
	}
	n.mu.RLock()
	defer n.mu.RUnlock()

	ns := n.nodeState

	if ns.err != nil || n.cancelReconnect != nil {
		if errors.Cause(ns.err) == errSwarmLocked {
			ns.status = types.LocalNodeStateLocked
		} else {
			ns.status = types.LocalNodeStateError
		}
	} else {
		select {
		case <-n.ready:
			ns.status = types.LocalNodeStateActive
		default:
			ns.status = types.LocalNodeStatePending
		}
	}

	return ns
}

// enableReconnectWatcher schedules a restart of the swarm node after an
// exponentially increasing delay, capped at maxReconnectDelay. It is called
// with n.mu held.
func (n *nodeRunner) enableReconnectWatcher() {
	if n.stopping {
		return
	}
	n.reconnectDelay *= 2
	if n.reconnectDelay > maxReconnectDelay {
		n.reconnectDelay = maxReconnectDelay
	}
	logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
	delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
	n.cancelReconnect = cancel

	go func() {
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		n.mu.Lock()
		defer n.mu.Unlock()
		if n.stopping {
			return
		}

		if err := n.start(n.config); err != nil {
			n.err = err
		}
	}()
}

// nodeState represents information about the current state of the cluster and
// provides access to the grpc clients.
type nodeState struct {
	swarmNode       *swarmnode.Node
	grpcConn        *grpc.ClientConn
	controlClient   swarmapi.ControlClient
	logsClient      swarmapi.LogsClient
	status          types.LocalNodeState
	actualLocalAddr string
	err             error
}

// IsActiveManager returns true if node is a manager ready to accept control
// requests. It is safe to access the client properties if this returns true.
func (ns nodeState) IsActiveManager() bool {
	return ns.controlClient != nil
}

// IsManager returns true if node is a manager.
func (ns nodeState) IsManager() bool {
	return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns node's ID or empty string if node is inactive.
func (ns nodeState) NodeID() string {
	if ns.swarmNode != nil {
		return ns.swarmNode.NodeID()
	}
	return ""
}

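// Illustrative usage sketch: callers in the surrounding daemon/cluster
// package typically take a nodeState snapshot and check IsActiveManager
// before using the gRPC clients. The c.nr field and errNoManager helper are
// assumed names from elsewhere in the package, shown only to indicate the
// pattern, roughly:
//
//	ns := c.nr.State()
//	if !ns.IsActiveManager() {
//		return c.errNoManager(ns)
//	}
//	nodes, err := ns.controlClient.ListNodes(ctx, &swarmapi.ListNodesRequest{})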