github.com/flavio/docker@v0.1.3-0.20170117145210-f63d1a6eec47/daemon/cluster/noderunner.go

package cluster

import (
	"fmt"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/executor/container"
	swarmapi "github.com/docker/swarmkit/api"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
)

// nodeRunner implements a manager for a continuously running swarmkit node, restarting it with backoff delays if needed.
type nodeRunner struct {
	nodeState
	mu             sync.RWMutex
	done           chan struct{} // closed when swarmNode exits
	ready          chan struct{} // closed when swarmNode becomes active
	reconnectDelay time.Duration
	config         nodeStartConfig

	repeatedRun     bool
	cancelReconnect func()
	stopping        bool
	cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk in json. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr   string
	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
	availability    types.NodeAvailability
}

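// Ready returns a channel that is closed once the swarm node has either
// become active or exited. If the node stopped before becoming active, its
// last error (which may be nil) is sent on the channel before it is closed.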
func (n *nodeRunner) Ready() chan error {
	c := make(chan error, 1)
	n.mu.RLock()
	ready, done := n.ready, n.done
	n.mu.RUnlock()
	go func() {
		select {
		case <-ready:
		case <-done:
		}
		select {
		case <-ready:
		default:
			n.mu.RLock()
			c <- n.err
			n.mu.RUnlock()
		}
		close(c)
	}()
	return c
}

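// Start resets the reconnect delay to its initial value and starts a new
// swarm node with the given start configuration.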
func (n *nodeRunner) Start(conf nodeStartConfig) error {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.reconnectDelay = initialReconnectDelay

	return n.start(conf)
}

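// start creates and starts a swarmkit node from conf, persists the start
// configuration, and spawns the goroutines that track the node's readiness,
// control-socket changes and exit. Callers must hold n.mu.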
func (n *nodeRunner) start(conf nodeStartConfig) error {
	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
	}

	swarmnodeConfig := swarmnode.Config{
		Hostname:           n.cluster.config.Name,
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		JoinAddr:           conf.joinAddr,
		StateDir:           n.cluster.root,
		JoinToken:          conf.joinToken,
		Executor:           container.NewExecutor(n.cluster.config.Backend),
		HeartbeatTick:      1,
		ElectionTick:       3,
		UnlockKey:          conf.lockKey,
		AutoLockManagers:   conf.autolock,
	}
	if conf.availability != "" {
		avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
		if !ok {
			return fmt.Errorf("invalid Availability: %q", conf.availability)
		}
		swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
	}
	node, err := swarmnode.New(&swarmnodeConfig)
	if err != nil {
		return err
	}
	if err := node.Start(context.Background()); err != nil {
		return err
	}

	n.done = make(chan struct{})
	n.ready = make(chan struct{})
	n.swarmNode = node
	n.config = conf
	savePersistentState(n.cluster.root, conf)

	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		n.handleNodeExit(node)
		cancel()
	}()

	go n.handleReadyEvent(ctx, node, n.ready)
	go n.handleControlSocketChange(ctx, node)

	return nil
}

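// handleControlSocketChange refreshes the cached grpc connection and the
// control and logs clients whenever the node's control socket changes, and
// notifies the cluster through configEvent.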
func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
	for conn := range node.ListenControlSocket(ctx) {
		n.mu.Lock()
		if n.grpcConn != conn {
			if conn == nil {
				n.controlClient = nil
				n.logsClient = nil
			} else {
				n.controlClient = swarmapi.NewControlClient(conn)
				n.logsClient = swarmapi.NewLogsClient(conn)
			}
		}
		n.grpcConn = conn
		n.mu.Unlock()
		n.cluster.configEvent <- struct{}{}
	}
}

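// handleReadyEvent clears any stored error and closes the ready channel once
// the swarm node reports that it is ready, then notifies the cluster through
// configEvent. If ctx is canceled first, ready is left open.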
func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
	select {
	case <-node.Ready():
		n.mu.Lock()
		n.err = nil
		n.mu.Unlock()
		close(ready)
	case <-ctx.Done():
	}
	n.cluster.configEvent <- struct{}{}
}

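// handleNodeExit blocks until the swarm node exits, records its error, and
// closes the done channel. A reconnect attempt is scheduled if the node had
// become ready before exiting, or if this was not its first run.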
func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
	err := detectLockedError(node.Err(context.Background()))
	if err != nil {
		logrus.Errorf("cluster exited with error: %v", err)
	}
	n.mu.Lock()
	n.swarmNode = nil
	n.err = err
	close(n.done)
	select {
	case <-n.ready:
		n.enableReconnectWatcher()
	default:
		if n.repeatedRun {
			n.enableReconnectWatcher()
		}
	}
	n.repeatedRun = true
	n.mu.Unlock()
}

// Stop stops the current swarm node if it is running.
func (n *nodeRunner) Stop() error {
	n.mu.Lock()
	if n.cancelReconnect != nil { // between restarts
		n.cancelReconnect()
		n.cancelReconnect = nil
	}
	if n.swarmNode == nil {
		n.mu.Unlock()
		return nil
	}
	n.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		n.mu.Unlock()
		return err
	}
	n.mu.Unlock()
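	// Wait for handleNodeExit to finish outside the lock: it needs to acquire
	// n.mu before it closes n.done.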
	<-n.done
	return nil
}

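// State returns a snapshot of the current swarm node state, deriving the
// local node status from the stored error, any pending reconnect, and the
// ready channel. It is safe to call on a nil receiver, which reports an
// inactive state.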
func (n *nodeRunner) State() nodeState {
	if n == nil {
		return nodeState{status: types.LocalNodeStateInactive}
	}
	n.mu.RLock()
	defer n.mu.RUnlock()

	ns := n.nodeState

	if ns.err != nil || n.cancelReconnect != nil {
		if errors.Cause(ns.err) == errSwarmLocked {
			ns.status = types.LocalNodeStateLocked
		} else {
			ns.status = types.LocalNodeStateError
		}
	} else {
		select {
		case <-n.ready:
			ns.status = types.LocalNodeStateActive
		default:
			ns.status = types.LocalNodeStatePending
		}
	}

	return ns
}

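// enableReconnectWatcher doubles the reconnect delay (capped at
// maxReconnectDelay) and schedules a restart of the swarm node with the last
// known configuration once that delay elapses. Callers must hold n.mu.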
func (n *nodeRunner) enableReconnectWatcher() {
	if n.stopping {
		return
	}
	n.reconnectDelay *= 2
	if n.reconnectDelay > maxReconnectDelay {
		n.reconnectDelay = maxReconnectDelay
	}
	logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
	delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
	n.cancelReconnect = cancel

	config := n.config
	go func() {
		<-delayCtx.Done()
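		// Only restart if the delay actually elapsed; a canceled context means
		// the pending reconnect was aborted (see cancelReconnect in Stop).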
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		n.mu.Lock()
		defer n.mu.Unlock()
		if n.stopping {
			return
		}
		config.RemoteAddr = n.cluster.getRemoteAddress()
		config.joinAddr = config.RemoteAddr
		if err := n.start(config); err != nil {
			n.err = err
		}
	}()
}

// nodeState represents information about the current state of the cluster and
// provides access to the grpc clients.
type nodeState struct {
	swarmNode       *swarmnode.Node
	grpcConn        *grpc.ClientConn
	controlClient   swarmapi.ControlClient
	logsClient      swarmapi.LogsClient
	status          types.LocalNodeState
	actualLocalAddr string
	err             error
}

// IsActiveManager returns true if node is a manager ready to accept control requests. It is safe to access the client properties if this returns true.
func (ns nodeState) IsActiveManager() bool {
	return ns.controlClient != nil
}

// IsManager returns true if node is a manager.
func (ns nodeState) IsManager() bool {
	return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns the node's ID, or an empty string if the node is inactive.
func (ns nodeState) NodeID() string {
	if ns.swarmNode != nil {
		return ns.swarmNode.NodeID()
	}
	return ""
}