github.com/fabiokung/docker@v0.11.2-0.20170222101415-4534dcd49497/daemon/cluster/noderunner.go

package cluster

import (
	"fmt"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/executor/container"
	swarmapi "github.com/docker/swarmkit/api"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
)

// nodeRunner implements a manager for a continuously running swarmkit node,
// restarting it with backoff delays if needed.
type nodeRunner struct {
	nodeState
	mu             sync.RWMutex
	done           chan struct{} // closed when swarmNode exits
	ready          chan struct{} // closed when swarmNode becomes active
	reconnectDelay time.Duration
	config         nodeStartConfig

	repeatedRun     bool
	cancelReconnect func()
	stopping        bool
	cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds the configuration needed to start a new node. Exported
// fields of this structure are saved to disk as JSON. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr   string
	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
	availability    types.NodeAvailability
}

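// Ready returns a channel that is closed once the node becomes active or
// exits. If the node exited before ever becoming active, its startup error
// (which may be nil) is sent on the channel before it is closed.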
func (n *nodeRunner) Ready() chan error {
	c := make(chan error, 1)
	n.mu.RLock()
	ready, done := n.ready, n.done
	n.mu.RUnlock()
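	// Wait until the node either becomes ready or exits; if it exited
	// without ever becoming ready, forward the exit error to the caller.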
	go func() {
		select {
		case <-ready:
		case <-done:
		}
		select {
		case <-ready:
		default:
			n.mu.RLock()
			c <- n.err
			n.mu.RUnlock()
		}
		close(c)
	}()
	return c
}

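// Start resets the reconnect delay and launches a new swarmkit node with the
// given start configuration.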
func (n *nodeRunner) Start(conf nodeStartConfig) error {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.reconnectDelay = initialReconnectDelay

	return n.start(conf)
}

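// start creates and launches the swarmkit node, persists the start
// configuration and wires up the handlers for the node's lifecycle events.
// Callers must hold n.mu.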
func (n *nodeRunner) start(conf nodeStartConfig) error {
	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
	}

	// Hostname is not set here. Instead, it is obtained from
	// the node description that is reported periodically
	swarmnodeConfig := swarmnode.Config{
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		JoinAddr:           conf.joinAddr,
		StateDir:           n.cluster.root,
		JoinToken:          conf.joinToken,
		Executor:           container.NewExecutor(n.cluster.config.Backend),
		HeartbeatTick:      1,
		ElectionTick:       3,
		UnlockKey:          conf.lockKey,
		AutoLockManagers:   conf.autolock,
		PluginGetter:       n.cluster.config.Backend.PluginGetter(),
	}
	if conf.availability != "" {
		avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
		if !ok {
			return fmt.Errorf("invalid Availability: %q", conf.availability)
		}
		swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
	}
	node, err := swarmnode.New(&swarmnodeConfig)
	if err != nil {
		return err
	}
	if err := node.Start(context.Background()); err != nil {
		return err
	}

	n.done = make(chan struct{})
	n.ready = make(chan struct{})
	n.swarmNode = node
	n.config = conf
	savePersistentState(n.cluster.root, conf)

	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		n.handleNodeExit(node)
		cancel()
	}()

	go n.handleReadyEvent(ctx, node, n.ready)
	go n.handleControlSocketChange(ctx, node)

	return nil
}

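// handleControlSocketChange recreates the control and logs gRPC clients
// whenever the node's control socket connection changes, and notifies the
// cluster of the configuration change.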
func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
	for conn := range node.ListenControlSocket(ctx) {
		n.mu.Lock()
		if n.grpcConn != conn {
			if conn == nil {
				n.controlClient = nil
				n.logsClient = nil
			} else {
				n.controlClient = swarmapi.NewControlClient(conn)
				n.logsClient = swarmapi.NewLogsClient(conn)
			}
		}
		n.grpcConn = conn
		n.mu.Unlock()
		n.cluster.configEvent <- struct{}{}
	}
}

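// handleReadyEvent clears any stored error and closes the ready channel once
// the node reports that it is ready, or returns when the context is
// cancelled. In either case it notifies the cluster of the configuration
// change.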
func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
	select {
	case <-node.Ready():
		n.mu.Lock()
		n.err = nil
		n.mu.Unlock()
		close(ready)
	case <-ctx.Done():
	}
	n.cluster.configEvent <- struct{}{}
}

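// handleNodeExit blocks until the swarmkit node exits, records its exit
// error, closes the done channel and, when appropriate, schedules a restart
// with backoff.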
func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
	err := detectLockedError(node.Err(context.Background()))
	if err != nil {
		logrus.Errorf("cluster exited with error: %v", err)
	}
	n.mu.Lock()
	n.swarmNode = nil
	n.err = err
	close(n.done)
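	// Schedule a reconnect only if the node had become ready at least once,
	// or if this was already a repeated run.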
	select {
	case <-n.ready:
		n.enableReconnectWatcher()
	default:
		if n.repeatedRun {
			n.enableReconnectWatcher()
		}
	}
	n.repeatedRun = true
	n.mu.Unlock()
}

// Stop stops the current swarm node if it is running.
func (n *nodeRunner) Stop() error {
	n.mu.Lock()
	if n.cancelReconnect != nil { // between restarts
		n.cancelReconnect()
		n.cancelReconnect = nil
	}
	if n.swarmNode == nil {
		n.mu.Unlock()
		return nil
	}
	n.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		n.mu.Unlock()
		return err
	}
	n.mu.Unlock()
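	// Wait for handleNodeExit to observe the shutdown and close the done
	// channel.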
	<-n.done
	return nil
}

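// State returns a snapshot of the current node state. It is safe to call on
// a nil receiver, in which case an inactive state is returned.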
func (n *nodeRunner) State() nodeState {
	if n == nil {
		return nodeState{status: types.LocalNodeStateInactive}
	}
	n.mu.RLock()
	defer n.mu.RUnlock()

	ns := n.nodeState

	if ns.err != nil || n.cancelReconnect != nil {
		if errors.Cause(ns.err) == errSwarmLocked {
			ns.status = types.LocalNodeStateLocked
		} else {
			ns.status = types.LocalNodeStateError
		}
	} else {
		select {
		case <-n.ready:
			ns.status = types.LocalNodeStateActive
		default:
			ns.status = types.LocalNodeStatePending
		}
	}

	return ns
}

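// enableReconnectWatcher doubles the reconnect delay, capped at
// maxReconnectDelay, and schedules a restart of the swarm node after that
// delay. It is a no-op while the runner is stopping. Callers must hold n.mu.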
func (n *nodeRunner) enableReconnectWatcher() {
	if n.stopping {
		return
	}
	n.reconnectDelay *= 2
	if n.reconnectDelay > maxReconnectDelay {
		n.reconnectDelay = maxReconnectDelay
	}
	logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
	delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
	n.cancelReconnect = cancel

	config := n.config
	go func() {
		<-delayCtx.Done()
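		// Restart only if the full delay elapsed; a cancelled context
		// (e.g. via Stop) aborts the reconnect.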
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		n.mu.Lock()
		defer n.mu.Unlock()
		if n.stopping {
			return
		}
		config.RemoteAddr = n.cluster.getRemoteAddress()
		config.joinAddr = config.RemoteAddr
		if err := n.start(config); err != nil {
			n.err = err
		}
	}()
}

// nodeState represents information about the current state of the cluster and
// provides access to the grpc clients.
type nodeState struct {
	swarmNode       *swarmnode.Node
	grpcConn        *grpc.ClientConn
	controlClient   swarmapi.ControlClient
	logsClient      swarmapi.LogsClient
	status          types.LocalNodeState
	actualLocalAddr string
	err             error
}

// IsActiveManager returns true if the node is a manager that is ready to
// accept control requests. It is safe to access the client properties if this
// returns true.
func (ns nodeState) IsActiveManager() bool {
	return ns.controlClient != nil
}

// IsManager returns true if the node is a manager.
func (ns nodeState) IsManager() bool {
	return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns the node's ID, or an empty string if the node is inactive.
func (ns nodeState) NodeID() string {
	if ns.swarmNode != nil {
		return ns.swarmNode.NodeID()
	}
	return ""
}