github.com/rhatdan/docker@v0.7.7-0.20180119204836-47a0dcbcd20a/daemon/cluster/noderunner.go

package cluster

import (
    "fmt"
    "path/filepath"
    "runtime"
    "strings"
    "sync"
    "time"

    types "github.com/docker/docker/api/types/swarm"
    "github.com/docker/docker/daemon/cluster/executor/container"
    lncluster "github.com/docker/libnetwork/cluster"
    swarmapi "github.com/docker/swarmkit/api"
    swarmnode "github.com/docker/swarmkit/node"
    "github.com/pkg/errors"
    "github.com/sirupsen/logrus"
    "golang.org/x/net/context"
    "google.golang.org/grpc"
    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
)

// nodeRunner implements a manager for a continuously running swarmkit node,
// restarting it with backoff delays if needed.
type nodeRunner struct {
    nodeState
    mu             sync.RWMutex
    done           chan struct{} // closed when swarmNode exits
    ready          chan struct{} // closed when swarmNode becomes active
    reconnectDelay time.Duration
    config         nodeStartConfig

    repeatedRun     bool
    cancelReconnect func()
    stopping        bool
    cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk as JSON. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
    // LocalAddr is this machine's local IP or hostname, if specified.
    LocalAddr string
    // RemoteAddr is the address that was given to "swarm join". It is used
    // to find LocalAddr if necessary.
    RemoteAddr string
    // ListenAddr is the address we bind to, including a port.
    ListenAddr string
    // AdvertiseAddr is the address other nodes should connect to,
    // including a port.
    AdvertiseAddr string
    // DataPathAddr is the address that has to be used for the data path.
    DataPathAddr string
    // JoinInProgress is set to true if a join operation has started, but
    // not completed yet.
    JoinInProgress bool

    joinAddr        string
    forceNewCluster bool
    joinToken       string
    lockKey         []byte
    autolock        bool
    availability    types.NodeAvailability
}

func (n *nodeRunner) Ready() chan error {
    c := make(chan error, 1)
    n.mu.RLock()
    ready, done := n.ready, n.done
    n.mu.RUnlock()
    go func() {
        select {
        case <-ready:
        case <-done:
        }
        select {
        case <-ready:
        default:
            n.mu.RLock()
            c <- n.err
            n.mu.RUnlock()
        }
        close(c)
    }()
    return c
}

func (n *nodeRunner) Start(conf nodeStartConfig) error {
    n.mu.Lock()
    defer n.mu.Unlock()

    n.reconnectDelay = initialReconnectDelay

    return n.start(conf)
}

func (n *nodeRunner) start(conf nodeStartConfig) error {
    var control string
    if runtime.GOOS == "windows" {
        control = `\\.\pipe\` + controlSocket
    } else {
        control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
    }

    joinAddr := conf.joinAddr
    if joinAddr == "" && conf.JoinInProgress {
        // We must have been restarted while trying to join a cluster.
        // Continue trying to join instead of forming our own cluster.
        joinAddr = conf.RemoteAddr
    }

    // Hostname is not set here. Instead, it is obtained from
    // the node description that is reported periodically.
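    // HeartbeatTick and ElectionTick below tune the embedded raft consensus
    // (both are expressed in ticks); the remaining fields carry over the
    // persisted start configuration.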
    swarmnodeConfig := swarmnode.Config{
        ForceNewCluster:    conf.forceNewCluster,
        ListenControlAPI:   control,
        ListenRemoteAPI:    conf.ListenAddr,
        AdvertiseRemoteAPI: conf.AdvertiseAddr,
        JoinAddr:           joinAddr,
        StateDir:           n.cluster.root,
        JoinToken:          conf.joinToken,
        Executor:           container.NewExecutor(n.cluster.config.Backend, n.cluster.config.PluginBackend),
        HeartbeatTick:      1,
        ElectionTick:       3,
        UnlockKey:          conf.lockKey,
        AutoLockManagers:   conf.autolock,
        PluginGetter:       n.cluster.config.Backend.PluginGetter(),
    }
    if conf.availability != "" {
        avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
        if !ok {
            return fmt.Errorf("invalid Availability: %q", conf.availability)
        }
        swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
    }
    node, err := swarmnode.New(&swarmnodeConfig)
    if err != nil {
        return err
    }
    if err := node.Start(context.Background()); err != nil {
        return err
    }

    n.done = make(chan struct{})
    n.ready = make(chan struct{})
    n.swarmNode = node
    if conf.joinAddr != "" {
        conf.JoinInProgress = true
    }
    n.config = conf
    savePersistentState(n.cluster.root, conf)

    ctx, cancel := context.WithCancel(context.Background())

    go func() {
        n.handleNodeExit(node)
        cancel()
    }()

    go n.handleReadyEvent(ctx, node, n.ready)
    go n.handleControlSocketChange(ctx, node)

    return nil
}

func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
    for conn := range node.ListenControlSocket(ctx) {
        n.mu.Lock()
        if n.grpcConn != conn {
            if conn == nil {
                n.controlClient = nil
                n.logsClient = nil
            } else {
                n.controlClient = swarmapi.NewControlClient(conn)
                n.logsClient = swarmapi.NewLogsClient(conn)
                // push store changes to daemon
                go n.watchClusterEvents(ctx, conn)
            }
        }
        n.grpcConn = conn
        n.mu.Unlock()
        n.cluster.SendClusterEvent(lncluster.EventSocketChange)
    }
}

func (n *nodeRunner) watchClusterEvents(ctx context.Context, conn *grpc.ClientConn) {
    client := swarmapi.NewWatchClient(conn)
    watch, err := client.Watch(ctx, &swarmapi.WatchRequest{
        Entries: []*swarmapi.WatchRequest_WatchEntry{
            {
                Kind:   "node",
                Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
            },
            {
                Kind:   "service",
                Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
            },
            {
                Kind:   "network",
                Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
            },
            {
                Kind:   "secret",
                Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
            },
            {
                Kind:   "config",
                Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
            },
        },
        IncludeOldObject: true,
    })
    if err != nil {
        logrus.WithError(err).Error("failed to watch cluster store")
        return
    }
    for {
        msg, err := watch.Recv()
        if err != nil {
            // store watch is broken
            errStatus, ok := status.FromError(err)
            if !ok || errStatus.Code() != codes.Canceled {
                logrus.WithError(err).Error("failed to receive changes from store watch API")
            }
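            // Exit the receive loop on any error; a new watcher is started
            // by handleControlSocketChange when the control socket
            // connection is re-established.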
            return
        }
        select {
        case <-ctx.Done():
            return
        case n.cluster.watchStream <- msg:
        }
    }
}

func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
    select {
    case <-node.Ready():
        n.mu.Lock()
        n.err = nil
        if n.config.JoinInProgress {
            n.config.JoinInProgress = false
            savePersistentState(n.cluster.root, n.config)
        }
        n.mu.Unlock()
        close(ready)
    case <-ctx.Done():
    }
    n.cluster.SendClusterEvent(lncluster.EventNodeReady)
}

func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
    err := detectLockedError(node.Err(context.Background()))
    if err != nil {
        logrus.Errorf("cluster exited with error: %v", err)
    }
    n.mu.Lock()
    n.swarmNode = nil
    n.err = err
    close(n.done)
    select {
    case <-n.ready:
        n.enableReconnectWatcher()
    default:
        if n.repeatedRun {
            n.enableReconnectWatcher()
        }
    }
    n.repeatedRun = true
    n.mu.Unlock()
}

// Stop stops the current swarm node if it is running.
func (n *nodeRunner) Stop() error {
    n.mu.Lock()
    if n.cancelReconnect != nil { // between restarts
        n.cancelReconnect()
        n.cancelReconnect = nil
    }
    if n.swarmNode == nil {
        n.mu.Unlock()
        return nil
    }
    n.stopping = true
    ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
    defer cancel()
    n.mu.Unlock()
    if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
        return err
    }
    n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
    <-n.done
    return nil
}

func (n *nodeRunner) State() nodeState {
    if n == nil {
        return nodeState{status: types.LocalNodeStateInactive}
    }
    n.mu.RLock()
    defer n.mu.RUnlock()

    ns := n.nodeState

    if ns.err != nil || n.cancelReconnect != nil {
        if errors.Cause(ns.err) == errSwarmLocked {
            ns.status = types.LocalNodeStateLocked
        } else {
            ns.status = types.LocalNodeStateError
        }
    } else {
        select {
        case <-n.ready:
            ns.status = types.LocalNodeStateActive
        default:
            ns.status = types.LocalNodeStatePending
        }
    }

    return ns
}

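// enableReconnectWatcher schedules a restart of the swarm node after an
// exponentially growing delay, capped at maxReconnectDelay. The pending
// restart can be aborted through cancelReconnect (as Stop does). Callers
// must hold n.mu, since this mutates reconnectDelay and cancelReconnect.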
func (n *nodeRunner) enableReconnectWatcher() {
    if n.stopping {
        return
    }
    n.reconnectDelay *= 2
    if n.reconnectDelay > maxReconnectDelay {
        n.reconnectDelay = maxReconnectDelay
    }
    logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
    delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
    n.cancelReconnect = cancel

    go func() {
        <-delayCtx.Done()
        if delayCtx.Err() != context.DeadlineExceeded {
            return
        }
        n.mu.Lock()
        defer n.mu.Unlock()
        if n.stopping {
            return
        }

        if err := n.start(n.config); err != nil {
            n.err = err
        }
    }()
}

// nodeState represents information about the current state of the cluster and
// provides access to the gRPC clients.
type nodeState struct {
    swarmNode       *swarmnode.Node
    grpcConn        *grpc.ClientConn
    controlClient   swarmapi.ControlClient
    logsClient      swarmapi.LogsClient
    status          types.LocalNodeState
    actualLocalAddr string
    err             error
}

// IsActiveManager returns true if the node is a manager ready to accept
// control requests. It is safe to access the client properties if this
// returns true.
func (ns nodeState) IsActiveManager() bool {
    return ns.controlClient != nil
}

// IsManager returns true if the node is a manager.
func (ns nodeState) IsManager() bool {
    return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns the node's ID, or an empty string if the node is inactive.
func (ns nodeState) NodeID() string {
    if ns.swarmNode != nil {
        return ns.swarmNode.NodeID()
    }
    return ""
}
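// A minimal lifecycle sketch for this type (illustrative, not upstream code;
// c and conf are assumed to be a *Cluster and a nodeStartConfig prepared by
// the caller, as done elsewhere in this package):
//
//    nr := &nodeRunner{cluster: c}
//    if err := nr.Start(conf); err != nil {
//        // could not launch the swarmkit node
//    }
//    if err := <-nr.Ready(); err != nil {
//        // the node exited before becoming active
//    }
//    // ... later, on daemon shutdown or "swarm leave":
//    _ = nr.Stop()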