github.com/fabiokung/docker@v0.11.2-0.20170222101415-4534dcd49497/daemon/cluster/noderunner.go

package cluster

import (
	"fmt"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/executor/container"
	swarmapi "github.com/docker/swarmkit/api"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
)

// nodeRunner implements a manager for a continuously running swarmkit node,
// restarting it with backoff delays if needed.
type nodeRunner struct {
	nodeState
	mu             sync.RWMutex
	done           chan struct{} // closed when swarmNode exits
	ready          chan struct{} // closed when swarmNode becomes active
	reconnectDelay time.Duration
	config         nodeStartConfig

	repeatedRun     bool
	cancelReconnect func()
	stopping        bool
	cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk in JSON. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr   string
	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
	availability    types.NodeAvailability
}
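// Illustrative sketch, not part of the original file: because Go's
// encoding/json only serializes exported fields, a round trip of
// nodeStartConfig keeps the addresses but silently drops runtime-only data
// such as joinToken and lockKey, which is the persistence contract described
// in the comment above. Assuming the persistence helper uses plain
// json.Marshal:
//
//	raw, err := json.Marshal(nodeStartConfig{
//		ListenAddr: "0.0.0.0:2377",
//		joinToken:  "secret", // unexported: omitted from raw
//	})
//	if err != nil {
//		panic(err)
//	}
//	fmt.Println(string(raw)) // ListenAddr is present; joinToken is not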
func (n *nodeRunner) Ready() chan error {
	c := make(chan error, 1)
	n.mu.RLock()
	ready, done := n.ready, n.done
	n.mu.RUnlock()
	go func() {
		select {
		case <-ready:
		case <-done:
		}
		select {
		case <-ready:
		default:
			n.mu.RLock()
			c <- n.err
			n.mu.RUnlock()
		}
		close(c)
	}()
	return c
}

func (n *nodeRunner) Start(conf nodeStartConfig) error {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.reconnectDelay = initialReconnectDelay

	return n.start(conf)
}

func (n *nodeRunner) start(conf nodeStartConfig) error {
	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
	}

	// Hostname is not set here. Instead, it is obtained from
	// the node description that is reported periodically.
	swarmnodeConfig := swarmnode.Config{
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		JoinAddr:           conf.joinAddr,
		StateDir:           n.cluster.root,
		JoinToken:          conf.joinToken,
		Executor:           container.NewExecutor(n.cluster.config.Backend),
		HeartbeatTick:      1,
		ElectionTick:       3,
		UnlockKey:          conf.lockKey,
		AutoLockManagers:   conf.autolock,
		PluginGetter:       n.cluster.config.Backend.PluginGetter(),
	}
	if conf.availability != "" {
		avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
		if !ok {
			return fmt.Errorf("invalid Availability: %q", conf.availability)
		}
		swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
	}
	node, err := swarmnode.New(&swarmnodeConfig)
	if err != nil {
		return err
	}
	if err := node.Start(context.Background()); err != nil {
		return err
	}

	n.done = make(chan struct{})
	n.ready = make(chan struct{})
	n.swarmNode = node
	n.config = conf
	savePersistentState(n.cluster.root, conf)

	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		n.handleNodeExit(node)
		cancel()
	}()

	go n.handleReadyEvent(ctx, node, n.ready)
	go n.handleControlSocketChange(ctx, node)

	return nil
}

func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
	for conn := range node.ListenControlSocket(ctx) {
		n.mu.Lock()
		if n.grpcConn != conn {
			if conn == nil {
				n.controlClient = nil
				n.logsClient = nil
			} else {
				n.controlClient = swarmapi.NewControlClient(conn)
				n.logsClient = swarmapi.NewLogsClient(conn)
			}
		}
		n.grpcConn = conn
		n.mu.Unlock()
		n.cluster.configEvent <- struct{}{}
	}
}

func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
	select {
	case <-node.Ready():
		n.mu.Lock()
		n.err = nil
		n.mu.Unlock()
		close(ready)
	case <-ctx.Done():
	}
	n.cluster.configEvent <- struct{}{}
}

func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
	err := detectLockedError(node.Err(context.Background()))
	if err != nil {
		logrus.Errorf("cluster exited with error: %v", err)
	}
	n.mu.Lock()
	n.swarmNode = nil
	n.err = err
	close(n.done)
	select {
	case <-n.ready:
		n.enableReconnectWatcher()
	default:
		if n.repeatedRun {
			n.enableReconnectWatcher()
		}
	}
	n.repeatedRun = true
	n.mu.Unlock()
}
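// Illustrative sketch, not part of the original file: a caller starts the
// runner and then blocks on Ready() to learn whether the node became active
// (nil is received once the channel is closed) or exited first (the stored
// error is delivered). The 30-second timeout is an assumption made for this
// example, not a value used by this file:
//
//	if err := n.Start(conf); err != nil {
//		return err
//	}
//	select {
//	case err := <-n.Ready():
//		if err != nil {
//			return err // node exited before becoming active
//		}
//		// node is active; State() now reports LocalNodeStateActive
//	case <-time.After(30 * time.Second):
//		return errors.New("timed out waiting for swarm node")
//	}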
// Stop stops the current swarm node if it is running.
func (n *nodeRunner) Stop() error {
	n.mu.Lock()
	if n.cancelReconnect != nil { // between restarts
		n.cancelReconnect()
		n.cancelReconnect = nil
	}
	if n.swarmNode == nil {
		n.mu.Unlock()
		return nil
	}
	n.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		n.mu.Unlock()
		return err
	}
	n.mu.Unlock()
	<-n.done
	return nil
}

func (n *nodeRunner) State() nodeState {
	if n == nil {
		return nodeState{status: types.LocalNodeStateInactive}
	}
	n.mu.RLock()
	defer n.mu.RUnlock()

	ns := n.nodeState

	if ns.err != nil || n.cancelReconnect != nil {
		if errors.Cause(ns.err) == errSwarmLocked {
			ns.status = types.LocalNodeStateLocked
		} else {
			ns.status = types.LocalNodeStateError
		}
	} else {
		select {
		case <-n.ready:
			ns.status = types.LocalNodeStateActive
		default:
			ns.status = types.LocalNodeStatePending
		}
	}

	return ns
}

func (n *nodeRunner) enableReconnectWatcher() {
	if n.stopping {
		return
	}
	n.reconnectDelay *= 2
	if n.reconnectDelay > maxReconnectDelay {
		n.reconnectDelay = maxReconnectDelay
	}
	logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
	delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
	n.cancelReconnect = cancel

	config := n.config
	go func() {
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		n.mu.Lock()
		defer n.mu.Unlock()
		if n.stopping {
			return
		}
		config.RemoteAddr = n.cluster.getRemoteAddress()
		config.joinAddr = config.RemoteAddr
		if err := n.start(config); err != nil {
			n.err = err
		}
	}()
}

// nodeState represents information about the current state of the cluster and
// provides access to the grpc clients.
type nodeState struct {
	swarmNode       *swarmnode.Node
	grpcConn        *grpc.ClientConn
	controlClient   swarmapi.ControlClient
	logsClient      swarmapi.LogsClient
	status          types.LocalNodeState
	actualLocalAddr string
	err             error
}

// IsActiveManager returns true if node is a manager ready to accept control
// requests. It is safe to access the client properties if this returns true.
func (ns nodeState) IsActiveManager() bool {
	return ns.controlClient != nil
}

// IsManager returns true if node is a manager.
func (ns nodeState) IsManager() bool {
	return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns node's ID or empty string if node is inactive.
func (ns nodeState) NodeID() string {
	if ns.swarmNode != nil {
		return ns.swarmNode.NodeID()
	}
	return ""
}
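// Illustrative sketch, not part of the original file: consumers are expected
// to take a snapshot via State() and gate any control-API call on
// IsActiveManager(), per the "safe to access the client properties" contract
// documented above. ListNodes is a standard swarmkit control RPC; the log
// line is an assumption for the example:
//
//	ns := n.State()
//	if !ns.IsActiveManager() {
//		return errors.New("this node is not a swarm manager")
//	}
//	resp, err := ns.controlClient.ListNodes(context.Background(), &swarmapi.ListNodesRequest{})
//	if err != nil {
//		return err
//	}
//	logrus.Infof("cluster has %d nodes", len(resp.Nodes))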