github.com/rhatdan/docker@v0.7.7-0.20180119204836-47a0dcbcd20a/daemon/cluster/noderunner.go

package cluster

import (
    "fmt"
    "path/filepath"
    "runtime"
    "strings"
    "sync"
    "time"

    types "github.com/docker/docker/api/types/swarm"
    "github.com/docker/docker/daemon/cluster/executor/container"
    lncluster "github.com/docker/libnetwork/cluster"
    swarmapi "github.com/docker/swarmkit/api"
    swarmnode "github.com/docker/swarmkit/node"
    "github.com/pkg/errors"
    "github.com/sirupsen/logrus"
    "golang.org/x/net/context"
    "google.golang.org/grpc"
    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
)

// nodeRunner implements a manager for a continuously running swarmkit node,
// restarting it with backoff delays if needed.
type nodeRunner struct {
    nodeState
    mu             sync.RWMutex
    done           chan struct{} // closed when swarmNode exits
    ready          chan struct{} // closed when swarmNode becomes active
    reconnectDelay time.Duration
    config         nodeStartConfig

    repeatedRun     bool
    cancelReconnect func()
    stopping        bool
    cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk as JSON. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
    // LocalAddr is this machine's local IP or hostname, if specified.
    LocalAddr string
    // RemoteAddr is the address that was given to "swarm join". It is used
    // to find LocalAddr if necessary.
    RemoteAddr string
    // ListenAddr is the address we bind to, including a port.
    ListenAddr string
    // AdvertiseAddr is the address other nodes should connect to,
    // including a port.
    AdvertiseAddr string
    // DataPathAddr is the address that has to be used for the data path.
    DataPathAddr string
    // JoinInProgress is set to true if a join operation has started, but
    // not completed yet.
    JoinInProgress bool

    joinAddr        string
    forceNewCluster bool
    joinToken       string
    lockKey         []byte
    autolock        bool
    availability    types.NodeAvailability
}

func (n *nodeRunner) Ready() chan error {
    c := make(chan error, 1)
    n.mu.RLock()
    ready, done := n.ready, n.done
    n.mu.RUnlock()
    go func() {
        select {
        case <-ready:
        case <-done:
        }
        select {
        case <-ready:
        default:
            n.mu.RLock()
            c <- n.err
            n.mu.RUnlock()
        }
        close(c)
    }()
    return c
}

func (n *nodeRunner) Start(conf nodeStartConfig) error {
    n.mu.Lock()
    defer n.mu.Unlock()

    n.reconnectDelay = initialReconnectDelay

    return n.start(conf)
}

func (n *nodeRunner) start(conf nodeStartConfig) error {
    var control string
    if runtime.GOOS == "windows" {
        control = `\\.\pipe\` + controlSocket
    } else {
        control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
    }

    joinAddr := conf.joinAddr
    if joinAddr == "" && conf.JoinInProgress {
        // We must have been restarted while trying to join a cluster.
        // Continue trying to join instead of forming our own cluster.
        joinAddr = conf.RemoteAddr
    }

    // Hostname is not set here. Instead, it is obtained from
    // the node description that is reported periodically.
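    // HeartbeatTick and ElectionTick below tune the embedded raft consensus
    // (both are expressed in ticks); the remaining fields carry over the
    // persisted start configuration.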
    swarmnodeConfig := swarmnode.Config{
        ForceNewCluster:    conf.forceNewCluster,
        ListenControlAPI:   control,
        ListenRemoteAPI:    conf.ListenAddr,
        AdvertiseRemoteAPI: conf.AdvertiseAddr,
        JoinAddr:           joinAddr,
        StateDir:           n.cluster.root,
        JoinToken:          conf.joinToken,
        Executor:           container.NewExecutor(n.cluster.config.Backend, n.cluster.config.PluginBackend),
        HeartbeatTick:      1,
        ElectionTick:       3,
        UnlockKey:          conf.lockKey,
        AutoLockManagers:   conf.autolock,
        PluginGetter:       n.cluster.config.Backend.PluginGetter(),
    }
    if conf.availability != "" {
        avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
        if !ok {
            return fmt.Errorf("invalid Availability: %q", conf.availability)
        }
        swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
    }
    node, err := swarmnode.New(&swarmnodeConfig)
    if err != nil {
        return err
    }
    if err := node.Start(context.Background()); err != nil {
        return err
    }

    n.done = make(chan struct{})
    n.ready = make(chan struct{})
    n.swarmNode = node
    if conf.joinAddr != "" {
        conf.JoinInProgress = true
    }
    n.config = conf
    savePersistentState(n.cluster.root, conf)

    ctx, cancel := context.WithCancel(context.Background())

    go func() {
        n.handleNodeExit(node)
        cancel()
    }()

    go n.handleReadyEvent(ctx, node, n.ready)
    go n.handleControlSocketChange(ctx, node)

    return nil
}

func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
    for conn := range node.ListenControlSocket(ctx) {
        n.mu.Lock()
        if n.grpcConn != conn {
            if conn == nil {
                n.controlClient = nil
                n.logsClient = nil
            } else {
                n.controlClient = swarmapi.NewControlClient(conn)
                n.logsClient = swarmapi.NewLogsClient(conn)
                // push store changes to daemon
                go n.watchClusterEvents(ctx, conn)
            }
        }
        n.grpcConn = conn
        n.mu.Unlock()
        n.cluster.SendClusterEvent(lncluster.EventSocketChange)
    }
}

func (n *nodeRunner) watchClusterEvents(ctx context.Context, conn *grpc.ClientConn) {
    client := swarmapi.NewWatchClient(conn)
    watch, err := client.Watch(ctx, &swarmapi.WatchRequest{
        Entries: []*swarmapi.WatchRequest_WatchEntry{
            {
                Kind:   "node",
                Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
            },
            {
                Kind:   "service",
                Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
            },
            {
                Kind:   "network",
                Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
            },
            {
                Kind:   "secret",
                Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
            },
            {
                Kind:   "config",
                Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
            },
        },
        IncludeOldObject: true,
    })
    if err != nil {
        logrus.WithError(err).Error("failed to watch cluster store")
        return
    }
    for {
        msg, err := watch.Recv()
        if err != nil {
            // store watch is broken
            errStatus, ok := status.FromError(err)
            if !ok || errStatus.Code() != codes.Canceled {
                logrus.WithError(err).Error("failed to receive changes from store watch API")
            }
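            // Exit the receive loop on any error; a new watcher is started
            // by handleControlSocketChange when the control socket
            // connection is re-established.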
            return
        }
        select {
        case <-ctx.Done():
            return
        case n.cluster.watchStream <- msg:
        }
    }
}

func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
    select {
    case <-node.Ready():
        n.mu.Lock()
        n.err = nil
        if n.config.JoinInProgress {
            n.config.JoinInProgress = false
            savePersistentState(n.cluster.root, n.config)
        }
        n.mu.Unlock()
        close(ready)
    case <-ctx.Done():
    }
    n.cluster.SendClusterEvent(lncluster.EventNodeReady)
}

func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
    err := detectLockedError(node.Err(context.Background()))
    if err != nil {
        logrus.Errorf("cluster exited with error: %v", err)
    }
    n.mu.Lock()
    n.swarmNode = nil
    n.err = err
    close(n.done)
    select {
    case <-n.ready:
        n.enableReconnectWatcher()
    default:
        if n.repeatedRun {
            n.enableReconnectWatcher()
        }
    }
    n.repeatedRun = true
    n.mu.Unlock()
}

// Stop stops the current swarm node if it is running.
func (n *nodeRunner) Stop() error {
    n.mu.Lock()
    if n.cancelReconnect != nil { // between restarts
        n.cancelReconnect()
        n.cancelReconnect = nil
    }
    if n.swarmNode == nil {
        n.mu.Unlock()
        return nil
    }
    n.stopping = true
    ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
    defer cancel()
    n.mu.Unlock()
    if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
        return err
    }
    n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
    <-n.done
    return nil
}

func (n *nodeRunner) State() nodeState {
    if n == nil {
        return nodeState{status: types.LocalNodeStateInactive}
    }
    n.mu.RLock()
    defer n.mu.RUnlock()

    ns := n.nodeState

    if ns.err != nil || n.cancelReconnect != nil {
        if errors.Cause(ns.err) == errSwarmLocked {
            ns.status = types.LocalNodeStateLocked
        } else {
            ns.status = types.LocalNodeStateError
        }
    } else {
        select {
        case <-n.ready:
            ns.status = types.LocalNodeStateActive
        default:
            ns.status = types.LocalNodeStatePending
        }
    }

    return ns
}

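// enableReconnectWatcher schedules a restart of the swarm node after an
// exponentially growing delay, capped at maxReconnectDelay. The pending
// restart can be aborted through cancelReconnect (as Stop does). Callers
// must hold n.mu, since this mutates reconnectDelay and cancelReconnect.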
func (n *nodeRunner) enableReconnectWatcher() {
    if n.stopping {
        return
    }
    n.reconnectDelay *= 2
    if n.reconnectDelay > maxReconnectDelay {
        n.reconnectDelay = maxReconnectDelay
    }
    logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
    delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
    n.cancelReconnect = cancel

    go func() {
        <-delayCtx.Done()
        if delayCtx.Err() != context.DeadlineExceeded {
            return
        }
        n.mu.Lock()
        defer n.mu.Unlock()
        if n.stopping {
            return
        }

        if err := n.start(n.config); err != nil {
            n.err = err
        }
    }()
}

// nodeState represents information about the current state of the cluster and
// provides access to the gRPC clients.
type nodeState struct {
    swarmNode       *swarmnode.Node
    grpcConn        *grpc.ClientConn
    controlClient   swarmapi.ControlClient
    logsClient      swarmapi.LogsClient
    status          types.LocalNodeState
    actualLocalAddr string
    err             error
}

// IsActiveManager returns true if the node is a manager ready to accept
// control requests. It is safe to access the client properties if this
// returns true.
func (ns nodeState) IsActiveManager() bool {
    return ns.controlClient != nil
}

// IsManager returns true if the node is a manager.
func (ns nodeState) IsManager() bool {
    return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns the node's ID, or an empty string if the node is inactive.
func (ns nodeState) NodeID() string {
    if ns.swarmNode != nil {
        return ns.swarmNode.NodeID()
    }
    return ""
}
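// A minimal lifecycle sketch for this type (illustrative, not upstream code;
// c and conf are assumed to be a *Cluster and a nodeStartConfig prepared by
// the caller, as done elsewhere in this package):
//
//    nr := &nodeRunner{cluster: c}
//    if err := nr.Start(conf); err != nil {
//        // could not launch the swarmkit node
//    }
//    if err := <-nr.Ready(); err != nil {
//        // the node exited before becoming active
//    }
//    // ... later, on daemon shutdown or "swarm leave":
//    _ = nr.Stop()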