github.com/rita33cool1/iot-system-gateway@v0.0.0-20200911033302-e65bde238cc5/docker-engine/daemon/cluster/noderunner.go

package cluster // import "github.com/docker/docker/daemon/cluster"

import (
	"fmt"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/executor/container"
	lncluster "github.com/docker/libnetwork/cluster"
	swarmapi "github.com/docker/swarmkit/api"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// nodeRunner implements a manager for a continuously running swarmkit node,
// restarting it with backoff delays if needed.
type nodeRunner struct {
	nodeState
	mu             sync.RWMutex
	done           chan struct{} // closed when swarmNode exits
	ready          chan struct{} // closed when swarmNode becomes active
	reconnectDelay time.Duration
	config         nodeStartConfig

	repeatedRun     bool
	cancelReconnect func()
	stopping        bool
	cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk as JSON. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr string
	// DataPathAddr is the address that has to be used for the data path.
	DataPathAddr string
	// JoinInProgress is set to true if a join operation has started, but
	// not completed yet.
	JoinInProgress bool

	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
	availability    types.NodeAvailability
}

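// Ready returns a channel that conveys the outcome of node startup: it is
// closed once the node has either become active or exited, and receives the
// exit error if the node stopped before becoming active. A minimal usage
// sketch (illustrative only; nr and someTimeout are not part of this file):
//
//	select {
//	case err := <-nr.Ready():
//		// err is non-nil only if the node exited before becoming active
//	case <-time.After(someTimeout):
//		// give up waiting; the node keeps starting in the background
//	}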
func (n *nodeRunner) Ready() chan error {
	c := make(chan error, 1)
	n.mu.RLock()
	ready, done := n.ready, n.done
	n.mu.RUnlock()
	go func() {
		select {
		case <-ready:
		case <-done:
		}
		select {
		case <-ready:
		default:
			n.mu.RLock()
			c <- n.err
			n.mu.RUnlock()
		}
		close(c)
	}()
	return c
}

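// Start starts a new swarmkit node with the given configuration after
// resetting the reconnect delay to its initial value.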
func (n *nodeRunner) Start(conf nodeStartConfig) error {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.reconnectDelay = initialReconnectDelay

	return n.start(conf)
}

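// start builds the swarmkit node configuration, starts the node, persists the
// start configuration, and launches the goroutines that track the node's
// lifecycle. Callers must hold n.mu.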
func (n *nodeRunner) start(conf nodeStartConfig) error {
	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
	}

	joinAddr := conf.joinAddr
	if joinAddr == "" && conf.JoinInProgress {
		// We must have been restarted while trying to join a cluster.
		// Continue trying to join instead of forming our own cluster.
		joinAddr = conf.RemoteAddr
	}

	// Hostname is not set here. Instead, it is obtained from
	// the node description that is reported periodically.
	swarmnodeConfig := swarmnode.Config{
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		JoinAddr:           joinAddr,
		StateDir:           n.cluster.root,
		JoinToken:          conf.joinToken,
		Executor:           container.NewExecutor(n.cluster.config.Backend, n.cluster.config.PluginBackend),
		HeartbeatTick:      1,
		// Recommended value in etcd/raft is 10 x (HeartbeatTick).
		// Lower values were seen to cause instability because of
		// frequent leader elections when running on flaky networks.
		ElectionTick:     10,
		UnlockKey:        conf.lockKey,
		AutoLockManagers: conf.autolock,
		PluginGetter:     n.cluster.config.Backend.PluginGetter(),
	}
	if conf.availability != "" {
		avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
		if !ok {
			return fmt.Errorf("invalid Availability: %q", conf.availability)
		}
		swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
	}
	node, err := swarmnode.New(&swarmnodeConfig)
	if err != nil {
		return err
	}
	if err := node.Start(context.Background()); err != nil {
		return err
	}

	n.done = make(chan struct{})
	n.ready = make(chan struct{})
	n.swarmNode = node
	if conf.joinAddr != "" {
		conf.JoinInProgress = true
	}
	n.config = conf
	savePersistentState(n.cluster.root, conf)

	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		n.handleNodeExit(node)
		cancel()
	}()

	go n.handleReadyEvent(ctx, node, n.ready)
	go n.handleControlSocketChange(ctx, node)

	return nil
}

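// handleControlSocketChange updates the gRPC control and logs clients every
// time the connection to the manager control socket changes, starts watching
// cluster store events over new connections, and notifies libnetwork of the
// socket change.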
func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
	for conn := range node.ListenControlSocket(ctx) {
		n.mu.Lock()
		if n.grpcConn != conn {
			if conn == nil {
				n.controlClient = nil
				n.logsClient = nil
			} else {
				n.controlClient = swarmapi.NewControlClient(conn)
				n.logsClient = swarmapi.NewLogsClient(conn)
				// push store changes to daemon
				go n.watchClusterEvents(ctx, conn)
			}
		}
		n.grpcConn = conn
		n.mu.Unlock()
		n.cluster.SendClusterEvent(lncluster.EventSocketChange)
	}
}

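// watchClusterEvents subscribes to the swarmkit store watch API for node,
// service, network, secret and config objects and forwards every change to
// n.cluster.watchStream until the watch stream breaks or the context ends.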
func (n *nodeRunner) watchClusterEvents(ctx context.Context, conn *grpc.ClientConn) {
	client := swarmapi.NewWatchClient(conn)
	watch, err := client.Watch(ctx, &swarmapi.WatchRequest{
		Entries: []*swarmapi.WatchRequest_WatchEntry{
			{
				Kind:   "node",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "service",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "network",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "secret",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "config",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
		},
		IncludeOldObject: true,
	})
	if err != nil {
		logrus.WithError(err).Error("failed to watch cluster store")
		return
	}
	for {
		msg, err := watch.Recv()
		if err != nil {
			// store watch is broken
			errStatus, ok := status.FromError(err)
			if !ok || errStatus.Code() != codes.Canceled {
				logrus.WithError(err).Error("failed to receive changes from store watch API")
			}
			return
		}
		select {
		case <-ctx.Done():
			return
		case n.cluster.watchStream <- msg:
		}
	}
}

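// handleReadyEvent waits until the node becomes ready, then clears any
// previous error, marks a pending join as complete and persists that, and
// closes the ready channel. It emits a node-ready cluster event when it
// returns, even if the context was cancelled first.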
func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
	select {
	case <-node.Ready():
		n.mu.Lock()
		n.err = nil
		if n.config.JoinInProgress {
			n.config.JoinInProgress = false
			savePersistentState(n.cluster.root, n.config)
		}
		n.mu.Unlock()
		close(ready)
	case <-ctx.Done():
	}
	n.cluster.SendClusterEvent(lncluster.EventNodeReady)
}

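// handleNodeExit blocks until the swarmkit node exits, records the exit error
// (translating swarm-locked errors), and schedules a reconnect with backoff if
// the node had become ready or had already been restarted before.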
func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
	err := detectLockedError(node.Err(context.Background()))
	if err != nil {
		logrus.Errorf("cluster exited with error: %v", err)
	}
	n.mu.Lock()
	n.swarmNode = nil
	n.err = err
	close(n.done)
	select {
	case <-n.ready:
		n.enableReconnectWatcher()
	default:
		if n.repeatedRun {
			n.enableReconnectWatcher()
		}
	}
	n.repeatedRun = true
	n.mu.Unlock()
}

// Stop stops the current swarm node if it is running.
func (n *nodeRunner) Stop() error {
	n.mu.Lock()
	if n.cancelReconnect != nil { // between restarts
		n.cancelReconnect()
		n.cancelReconnect = nil
	}
	if n.swarmNode == nil {
		n.mu.Unlock()
		return nil
	}
	n.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	n.mu.Unlock()
	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		return err
	}
	n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
	<-n.done
	return nil
}

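// State returns a snapshot of the current node state, deriving the local node
// status (inactive, locked, error, pending or active) from the stored error,
// any pending reconnect, and the ready channel. It is safe to call on a nil
// receiver.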
func (n *nodeRunner) State() nodeState {
	if n == nil {
		return nodeState{status: types.LocalNodeStateInactive}
	}
	n.mu.RLock()
	defer n.mu.RUnlock()

	ns := n.nodeState

	if ns.err != nil || n.cancelReconnect != nil {
		if errors.Cause(ns.err) == errSwarmLocked {
			ns.status = types.LocalNodeStateLocked
		} else {
			ns.status = types.LocalNodeStateError
		}
	} else {
		select {
		case <-n.ready:
			ns.status = types.LocalNodeStateActive
		default:
			ns.status = types.LocalNodeStatePending
		}
	}

	return ns
}

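// enableReconnectWatcher doubles the reconnect delay (capping it at
// maxReconnectDelay), then schedules a restart of the node with the last used
// start configuration unless Stop has been called in the meantime. Callers
// must hold n.mu. As a purely illustrative sketch of the resulting backoff
// (the real delay constants are defined elsewhere in this package): if
// initialReconnectDelay were 1s and maxReconnectDelay 30s, successive restarts
// would wait 2s, 4s, 8s, 16s, 30s, 30s, and so on.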
func (n *nodeRunner) enableReconnectWatcher() {
	if n.stopping {
		return
	}
	n.reconnectDelay *= 2
	if n.reconnectDelay > maxReconnectDelay {
		n.reconnectDelay = maxReconnectDelay
	}
	logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
	delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
	n.cancelReconnect = cancel

	go func() {
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		n.mu.Lock()
		defer n.mu.Unlock()
		if n.stopping {
			return
		}

		if err := n.start(n.config); err != nil {
			n.err = err
		}
	}()
}

// nodeState represents information about the current state of the cluster and
// provides access to the gRPC clients.
type nodeState struct {
	swarmNode       *swarmnode.Node
	grpcConn        *grpc.ClientConn
	controlClient   swarmapi.ControlClient
	logsClient      swarmapi.LogsClient
	status          types.LocalNodeState
	actualLocalAddr string
	err             error
}

// IsActiveManager returns true if the node is a manager ready to accept
// control requests. It is safe to access the client properties if this
// returns true.
func (ns nodeState) IsActiveManager() bool {
	return ns.controlClient != nil
}

// IsManager returns true if the node is a manager.
func (ns nodeState) IsManager() bool {
	return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns the node's ID, or an empty string if the node is inactive.
func (ns nodeState) NodeID() string {
	if ns.swarmNode != nil {
		return ns.swarmNode.NodeID()
	}
	return ""
}