github.com/flavio/docker@v0.1.3-0.20170117145210-f63d1a6eec47/daemon/cluster/noderunner.go (about) 1 package cluster 2 3 import ( 4 "fmt" 5 "path/filepath" 6 "runtime" 7 "strings" 8 "sync" 9 "time" 10 11 "github.com/Sirupsen/logrus" 12 types "github.com/docker/docker/api/types/swarm" 13 "github.com/docker/docker/daemon/cluster/executor/container" 14 swarmapi "github.com/docker/swarmkit/api" 15 swarmnode "github.com/docker/swarmkit/node" 16 "github.com/pkg/errors" 17 "golang.org/x/net/context" 18 "google.golang.org/grpc" 19 ) 20 21 // nodeRunner implements a manager for continuously running swarmkit node, restarting them with backoff delays if needed. 22 type nodeRunner struct { 23 nodeState 24 mu sync.RWMutex 25 done chan struct{} // closed when swarmNode exits 26 ready chan struct{} // closed when swarmNode becomes active 27 reconnectDelay time.Duration 28 config nodeStartConfig 29 30 repeatedRun bool 31 cancelReconnect func() 32 stopping bool 33 cluster *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct 34 } 35 36 // nodeStartConfig holds configuration needed to start a new node. Exported 37 // fields of this structure are saved to disk in json. Unexported fields 38 // contain data that shouldn't be persisted between daemon reloads. 39 type nodeStartConfig struct { 40 // LocalAddr is this machine's local IP or hostname, if specified. 41 LocalAddr string 42 // RemoteAddr is the address that was given to "swarm join". It is used 43 // to find LocalAddr if necessary. 44 RemoteAddr string 45 // ListenAddr is the address we bind to, including a port. 46 ListenAddr string 47 // AdvertiseAddr is the address other nodes should connect to, 48 // including a port. 49 AdvertiseAddr string 50 joinAddr string 51 forceNewCluster bool 52 joinToken string 53 lockKey []byte 54 autolock bool 55 availability types.NodeAvailability 56 } 57 58 func (n *nodeRunner) Ready() chan error { 59 c := make(chan error, 1) 60 n.mu.RLock() 61 ready, done := n.ready, n.done 62 n.mu.RUnlock() 63 go func() { 64 select { 65 case <-ready: 66 case <-done: 67 } 68 select { 69 case <-ready: 70 default: 71 n.mu.RLock() 72 c <- n.err 73 n.mu.RUnlock() 74 } 75 close(c) 76 }() 77 return c 78 } 79 80 func (n *nodeRunner) Start(conf nodeStartConfig) error { 81 n.mu.Lock() 82 defer n.mu.Unlock() 83 84 n.reconnectDelay = initialReconnectDelay 85 86 return n.start(conf) 87 } 88 89 func (n *nodeRunner) start(conf nodeStartConfig) error { 90 var control string 91 if runtime.GOOS == "windows" { 92 control = `\\.\pipe\` + controlSocket 93 } else { 94 control = filepath.Join(n.cluster.runtimeRoot, controlSocket) 95 } 96 97 swarmnodeConfig := swarmnode.Config{ 98 Hostname: n.cluster.config.Name, 99 ForceNewCluster: conf.forceNewCluster, 100 ListenControlAPI: control, 101 ListenRemoteAPI: conf.ListenAddr, 102 AdvertiseRemoteAPI: conf.AdvertiseAddr, 103 JoinAddr: conf.joinAddr, 104 StateDir: n.cluster.root, 105 JoinToken: conf.joinToken, 106 Executor: container.NewExecutor(n.cluster.config.Backend), 107 HeartbeatTick: 1, 108 ElectionTick: 3, 109 UnlockKey: conf.lockKey, 110 AutoLockManagers: conf.autolock, 111 } 112 if conf.availability != "" { 113 avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))] 114 if !ok { 115 return fmt.Errorf("invalid Availability: %q", conf.availability) 116 } 117 swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail) 118 } 119 node, err := swarmnode.New(&swarmnodeConfig) 120 if err != nil { 121 return err 122 } 123 if err := node.Start(context.Background()); err != nil { 124 return err 125 } 126 127 n.done = make(chan struct{}) 128 n.ready = make(chan struct{}) 129 n.swarmNode = node 130 n.config = conf 131 savePersistentState(n.cluster.root, conf) 132 133 ctx, cancel := context.WithCancel(context.Background()) 134 135 go func() { 136 n.handleNodeExit(node) 137 cancel() 138 }() 139 140 go n.handleReadyEvent(ctx, node, n.ready) 141 go n.handleControlSocketChange(ctx, node) 142 143 return nil 144 } 145 146 func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) { 147 for conn := range node.ListenControlSocket(ctx) { 148 n.mu.Lock() 149 if n.grpcConn != conn { 150 if conn == nil { 151 n.controlClient = nil 152 n.logsClient = nil 153 } else { 154 n.controlClient = swarmapi.NewControlClient(conn) 155 n.logsClient = swarmapi.NewLogsClient(conn) 156 } 157 } 158 n.grpcConn = conn 159 n.mu.Unlock() 160 n.cluster.configEvent <- struct{}{} 161 } 162 } 163 164 func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) { 165 select { 166 case <-node.Ready(): 167 n.mu.Lock() 168 n.err = nil 169 n.mu.Unlock() 170 close(ready) 171 case <-ctx.Done(): 172 } 173 n.cluster.configEvent <- struct{}{} 174 } 175 176 func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) { 177 err := detectLockedError(node.Err(context.Background())) 178 if err != nil { 179 logrus.Errorf("cluster exited with error: %v", err) 180 } 181 n.mu.Lock() 182 n.swarmNode = nil 183 n.err = err 184 close(n.done) 185 select { 186 case <-n.ready: 187 n.enableReconnectWatcher() 188 default: 189 if n.repeatedRun { 190 n.enableReconnectWatcher() 191 } 192 } 193 n.repeatedRun = true 194 n.mu.Unlock() 195 } 196 197 // Stop stops the current swarm node if it is running. 198 func (n *nodeRunner) Stop() error { 199 n.mu.Lock() 200 if n.cancelReconnect != nil { // between restarts 201 n.cancelReconnect() 202 n.cancelReconnect = nil 203 } 204 if n.swarmNode == nil { 205 n.mu.Unlock() 206 return nil 207 } 208 n.stopping = true 209 ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) 210 defer cancel() 211 if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") { 212 n.mu.Unlock() 213 return err 214 } 215 n.mu.Unlock() 216 <-n.done 217 return nil 218 } 219 220 func (n *nodeRunner) State() nodeState { 221 if n == nil { 222 return nodeState{status: types.LocalNodeStateInactive} 223 } 224 n.mu.RLock() 225 defer n.mu.RUnlock() 226 227 ns := n.nodeState 228 229 if ns.err != nil || n.cancelReconnect != nil { 230 if errors.Cause(ns.err) == errSwarmLocked { 231 ns.status = types.LocalNodeStateLocked 232 } else { 233 ns.status = types.LocalNodeStateError 234 } 235 } else { 236 select { 237 case <-n.ready: 238 ns.status = types.LocalNodeStateActive 239 default: 240 ns.status = types.LocalNodeStatePending 241 } 242 } 243 244 return ns 245 } 246 247 func (n *nodeRunner) enableReconnectWatcher() { 248 if n.stopping { 249 return 250 } 251 n.reconnectDelay *= 2 252 if n.reconnectDelay > maxReconnectDelay { 253 n.reconnectDelay = maxReconnectDelay 254 } 255 logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds()) 256 delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay) 257 n.cancelReconnect = cancel 258 259 config := n.config 260 go func() { 261 <-delayCtx.Done() 262 if delayCtx.Err() != context.DeadlineExceeded { 263 return 264 } 265 n.mu.Lock() 266 defer n.mu.Unlock() 267 if n.stopping { 268 return 269 } 270 config.RemoteAddr = n.cluster.getRemoteAddress() 271 config.joinAddr = config.RemoteAddr 272 if err := n.start(config); err != nil { 273 n.err = err 274 } 275 }() 276 } 277 278 // nodeState represents information about the current state of the cluster and 279 // provides access to the grpc clients. 280 type nodeState struct { 281 swarmNode *swarmnode.Node 282 grpcConn *grpc.ClientConn 283 controlClient swarmapi.ControlClient 284 logsClient swarmapi.LogsClient 285 status types.LocalNodeState 286 actualLocalAddr string 287 err error 288 } 289 290 // IsActiveManager returns true if node is a manager ready to accept control requests. It is safe to access the client properties if this returns true. 291 func (ns nodeState) IsActiveManager() bool { 292 return ns.controlClient != nil 293 } 294 295 // IsManager returns true if node is a manager. 296 func (ns nodeState) IsManager() bool { 297 return ns.swarmNode != nil && ns.swarmNode.Manager() != nil 298 } 299 300 // NodeID returns node's ID or empty string if node is inactive. 301 func (ns nodeState) NodeID() string { 302 if ns.swarmNode != nil { 303 return ns.swarmNode.NodeID() 304 } 305 return "" 306 }