github.com/zhuohuang-hust/src-cbuild@v0.0.0-20230105071821-c7aab3e7c840/daemon/cluster/noderunner.go

package cluster

import (
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/executor/container"
	swarmapi "github.com/docker/swarmkit/api"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
)

// nodeRunner implements a manager for a continuously running swarmkit node,
// restarting it with backoff delays if needed.
type nodeRunner struct {
	nodeState
	mu             sync.RWMutex
	done           chan struct{} // closed when swarmNode exits
	ready          chan struct{} // closed when swarmNode becomes active
	reconnectDelay time.Duration
	config         nodeStartConfig

	repeatedRun     bool
	cancelReconnect func()
	stopping        bool
	cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds the configuration needed to start a new node. Exported
// fields of this structure are saved to disk as JSON. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr   string
	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
}

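// Ready returns a channel on which the caller can wait for the outcome of the
// node start-up: the channel yields a non-nil error if the swarm node exits
// before becoming active and is closed once the outcome is known, so a single
// receive is enough to wait for it. A typical caller (sketch, with nr being a
// *nodeRunner):
//
//	if err := <-nr.Ready(); err != nil {
//		// the node failed to become active
//	}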
func (n *nodeRunner) Ready() chan error {
	c := make(chan error, 1)
	n.mu.RLock()
	ready, done := n.ready, n.done
	n.mu.RUnlock()
	go func() {
		select {
		case <-ready:
		case <-done:
		}
		select {
		case <-ready:
		default:
			n.mu.RLock()
			c <- n.err
			n.mu.RUnlock()
		}
		close(c)
	}()
	return c
}

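// Start launches the swarm node described by conf and resets the reconnect
// backoff to its initial delay.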
func (n *nodeRunner) Start(conf nodeStartConfig) error {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.reconnectDelay = initialReconnectDelay

	return n.start(conf)
}

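// start creates and starts the underlying swarmkit node, persists the start
// configuration, and spawns the goroutines that track node exit, readiness and
// control-socket changes. Callers must hold n.mu.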
func (n *nodeRunner) start(conf nodeStartConfig) error {
	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
	}

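	// Build the swarmkit node configuration. HeartbeatTick and ElectionTick
	// are raft tuning parameters: heartbeats are sent every tick, and a new
	// leader election starts after ElectionTick ticks pass without a
	// heartbeat from the current leader.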
	node, err := swarmnode.New(&swarmnode.Config{
		Hostname:           n.cluster.config.Name,
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		JoinAddr:           conf.joinAddr,
		StateDir:           n.cluster.root,
		JoinToken:          conf.joinToken,
		Executor:           container.NewExecutor(n.cluster.config.Backend),
		HeartbeatTick:      1,
		ElectionTick:       3,
		UnlockKey:          conf.lockKey,
		AutoLockManagers:   conf.autolock,
	})
	if err != nil {
		return err
	}
	if err := node.Start(context.Background()); err != nil {
		return err
	}

	n.done = make(chan struct{})
	n.ready = make(chan struct{})
	n.swarmNode = node
	n.config = conf
	savePersistentState(n.cluster.root, conf)

	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		n.handleNodeExit(node)
		cancel()
	}()

	go n.handleReadyEvent(ctx, node, n.ready)
	go n.handleControlSocketChange(ctx, node)

	return nil
}

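// handleControlSocketChange watches the node's control socket and, whenever
// the underlying gRPC connection changes, swaps the control and logs clients
// accordingly before notifying the cluster through configEvent.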
func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
	for conn := range node.ListenControlSocket(ctx) {
		n.mu.Lock()
		if n.grpcConn != conn {
			if conn == nil {
				n.controlClient = nil
				n.logsClient = nil
			} else {
				n.controlClient = swarmapi.NewControlClient(conn)
				n.logsClient = swarmapi.NewLogsClient(conn)
			}
		}
		n.grpcConn = conn
		n.mu.Unlock()
		n.cluster.configEvent <- struct{}{}
	}
}

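// handleReadyEvent waits until the node becomes ready, in which case it clears
// any stored error and closes the ready channel, or until ctx is cancelled;
// either way it then notifies the cluster through configEvent.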
func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
	select {
	case <-node.Ready():
		n.mu.Lock()
		n.err = nil
		n.mu.Unlock()
		close(ready)
	case <-ctx.Done():
	}
	n.cluster.configEvent <- struct{}{}
}

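// handleNodeExit blocks until the swarm node exits, records the exit error,
// closes the done channel and, where appropriate, schedules a restart with
// backoff.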
func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
	err := detectLockedError(node.Err(context.Background()))
	if err != nil {
		logrus.Errorf("cluster exited with error: %v", err)
	}
	n.mu.Lock()
	n.swarmNode = nil
	n.err = err
	close(n.done)
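	// Only schedule an automatic restart if the node had become ready at
	// least once, or if this is already a repeated run; a node that fails on
	// its very first start is not retried here, the error is surfaced to the
	// Ready() waiter instead.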
	select {
	case <-n.ready:
		n.enableReconnectWatcher()
	default:
		if n.repeatedRun {
			n.enableReconnectWatcher()
		}
	}
	n.repeatedRun = true
	n.mu.Unlock()
}

// Stop stops the current swarm node if it is running. It also cancels any
// pending reconnect attempt and blocks until the node has exited.
func (n *nodeRunner) Stop() error {
	n.mu.Lock()
	if n.cancelReconnect != nil { // between restarts
		n.cancelReconnect()
		n.cancelReconnect = nil
	}
	if n.swarmNode == nil {
		n.mu.Unlock()
		return nil
	}
	n.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		n.mu.Unlock()
		return err
	}
	n.mu.Unlock()
	<-n.done
	return nil
}

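// State returns a snapshot of the node's current state. It is safe to call on
// a nil receiver, in which case an inactive state is reported.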
func (n *nodeRunner) State() nodeState {
	if n == nil {
		return nodeState{status: types.LocalNodeStateInactive}
	}
	n.mu.RLock()
	defer n.mu.RUnlock()

	ns := n.nodeState

	if ns.err != nil || n.cancelReconnect != nil {
		if errors.Cause(ns.err) == errSwarmLocked {
			ns.status = types.LocalNodeStateLocked
		} else {
			ns.status = types.LocalNodeStateError
		}
	} else {
		select {
		case <-n.ready:
			ns.status = types.LocalNodeStateActive
		default:
			ns.status = types.LocalNodeStatePending
		}
	}

	return ns
}

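// enableReconnectWatcher schedules a restart of the swarm node after the
// current reconnect delay, doubling that delay on every consecutive failure up
// to maxReconnectDelay; purely for illustration, an initial delay of 2s would
// grow as 4s, 8s, 16s, ... until the cap is reached (the actual constants are
// defined elsewhere in this package). Callers must hold n.mu.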
func (n *nodeRunner) enableReconnectWatcher() {
	if n.stopping {
		return
	}
	n.reconnectDelay *= 2
	if n.reconnectDelay > maxReconnectDelay {
		n.reconnectDelay = maxReconnectDelay
	}
	logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
	delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
	n.cancelReconnect = cancel

	config := n.config
	go func() {
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		n.mu.Lock()
		defer n.mu.Unlock()
		if n.stopping {
			return
		}
		config.RemoteAddr = n.cluster.getRemoteAddress()
		config.joinAddr = config.RemoteAddr
		if err := n.start(config); err != nil {
			n.err = err
		}
	}()
}

// nodeState represents information about the current state of the cluster and
// provides access to the grpc clients.
type nodeState struct {
	swarmNode       *swarmnode.Node
	grpcConn        *grpc.ClientConn
	controlClient   swarmapi.ControlClient
	logsClient      swarmapi.LogsClient
	status          types.LocalNodeState
	actualLocalAddr string
	err             error
}

// IsActiveManager returns true if the node is a manager that is ready to
// accept control requests. It is safe to access the client properties if this
// returns true.
func (ns nodeState) IsActiveManager() bool {
	return ns.controlClient != nil
}

// IsManager returns true if the node is a manager.
func (ns nodeState) IsManager() bool {
	return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns the node's ID, or an empty string if the node is inactive.
func (ns nodeState) NodeID() string {
	if ns.swarmNode != nil {
		return ns.swarmNode.NodeID()
	}
	return ""
}