github.com/rita33cool1/iot-system-gateway@v0.0.0-20200911033302-e65bde238cc5/docker-engine/daemon/cluster/noderunner.go

package cluster // import "github.com/docker/docker/daemon/cluster"

import (
	"fmt"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/executor/container"
	lncluster "github.com/docker/libnetwork/cluster"
	swarmapi "github.com/docker/swarmkit/api"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// nodeRunner implements a manager for a continuously running swarmkit node, restarting it with backoff delays if needed.
type nodeRunner struct {
	nodeState
	mu             sync.RWMutex
	done           chan struct{} // closed when swarmNode exits
	ready          chan struct{} // closed when swarmNode becomes active
	reconnectDelay time.Duration
	config         nodeStartConfig

	repeatedRun     bool
	cancelReconnect func()
	stopping        bool
	cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk as JSON. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr string
	// DataPathAddr is the address that has to be used for the data path.
	DataPathAddr string
	// JoinInProgress is set to true if a join operation has started, but
	// not completed yet.
	JoinInProgress bool

	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
	availability    types.NodeAvailability
}
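
// Ready returns a channel that is closed once the current swarm node has
// either become active or exited. If the node exited before becoming ready,
// the runner's last error (possibly nil) is sent on the channel first, so a
// non-nil value received from it indicates a failed start.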
func (n *nodeRunner) Ready() chan error {
	c := make(chan error, 1)
	n.mu.RLock()
	ready, done := n.ready, n.done
	n.mu.RUnlock()
	go func() {
		select {
		case <-ready:
		case <-done:
		}
		select {
		case <-ready:
		default:
			n.mu.RLock()
			c <- n.err
			n.mu.RUnlock()
		}
		close(c)
	}()
	return c
}

func (n *nodeRunner) Start(conf nodeStartConfig) error {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.reconnectDelay = initialReconnectDelay

	return n.start(conf)
}

func (n *nodeRunner) start(conf nodeStartConfig) error {
	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
	}

	joinAddr := conf.joinAddr
	if joinAddr == "" && conf.JoinInProgress {
		// We must have been restarted while trying to join a cluster.
		// Continue trying to join instead of forming our own cluster.
		joinAddr = conf.RemoteAddr
	}

	// Hostname is not set here. Instead, it is obtained from
	// the node description that is reported periodically
	swarmnodeConfig := swarmnode.Config{
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		JoinAddr:           joinAddr,
		StateDir:           n.cluster.root,
		JoinToken:          conf.joinToken,
		Executor:           container.NewExecutor(n.cluster.config.Backend, n.cluster.config.PluginBackend),
		HeartbeatTick:      1,
		// Recommended value in etcd/raft is 10 x (HeartbeatTick).
		// Lower values were seen to have caused instability because of
		// frequent leader elections when running on flakey networks.
		ElectionTick:     10,
		UnlockKey:        conf.lockKey,
		AutoLockManagers: conf.autolock,
		PluginGetter:     n.cluster.config.Backend.PluginGetter(),
	}
	if conf.availability != "" {
		avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
		if !ok {
			return fmt.Errorf("invalid Availability: %q", conf.availability)
		}
		swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
	}
	node, err := swarmnode.New(&swarmnodeConfig)
	if err != nil {
		return err
	}
	if err := node.Start(context.Background()); err != nil {
		return err
	}

	n.done = make(chan struct{})
	n.ready = make(chan struct{})
	n.swarmNode = node
	if conf.joinAddr != "" {
		conf.JoinInProgress = true
	}
	n.config = conf
	savePersistentState(n.cluster.root, conf)

	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		n.handleNodeExit(node)
		cancel()
	}()

	go n.handleReadyEvent(ctx, node, n.ready)
	go n.handleControlSocketChange(ctx, node)

	return nil
}

func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
	for conn := range node.ListenControlSocket(ctx) {
		n.mu.Lock()
		if n.grpcConn != conn {
			if conn == nil {
				n.controlClient = nil
				n.logsClient = nil
			} else {
				n.controlClient = swarmapi.NewControlClient(conn)
				n.logsClient = swarmapi.NewLogsClient(conn)
				// push store changes to daemon
				go n.watchClusterEvents(ctx, conn)
			}
		}
		n.grpcConn = conn
		n.mu.Unlock()
		n.cluster.SendClusterEvent(lncluster.EventSocketChange)
	}
}
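
// watchClusterEvents subscribes to swarmkit's store watch API over the given
// control connection and forwards create/update/remove events for nodes,
// services, networks, secrets and configs to the cluster's watchStream
// channel until the watch stream or the context ends.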
func (n *nodeRunner) watchClusterEvents(ctx context.Context, conn *grpc.ClientConn) {
	client := swarmapi.NewWatchClient(conn)
	watch, err := client.Watch(ctx, &swarmapi.WatchRequest{
		Entries: []*swarmapi.WatchRequest_WatchEntry{
			{
				Kind:   "node",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "service",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "network",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "secret",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
			{
				Kind:   "config",
				Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
			},
		},
		IncludeOldObject: true,
	})
	if err != nil {
		logrus.WithError(err).Error("failed to watch cluster store")
		return
	}
	for {
		msg, err := watch.Recv()
		if err != nil {
			// store watch is broken
			errStatus, ok := status.FromError(err)
			if !ok || errStatus.Code() != codes.Canceled {
				logrus.WithError(err).Error("failed to receive changes from store watch API")
			}
			return
		}
		select {
		case <-ctx.Done():
			return
		case n.cluster.watchStream <- msg:
		}
	}
}

func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
	select {
	case <-node.Ready():
		n.mu.Lock()
		n.err = nil
		if n.config.JoinInProgress {
			n.config.JoinInProgress = false
			savePersistentState(n.cluster.root, n.config)
		}
		n.mu.Unlock()
		close(ready)
	case <-ctx.Done():
	}
	n.cluster.SendClusterEvent(lncluster.EventNodeReady)
}

func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
	err := detectLockedError(node.Err(context.Background()))
	if err != nil {
		logrus.Errorf("cluster exited with error: %v", err)
	}
	n.mu.Lock()
	n.swarmNode = nil
	n.err = err
	close(n.done)
	select {
	case <-n.ready:
		n.enableReconnectWatcher()
	default:
		if n.repeatedRun {
			n.enableReconnectWatcher()
		}
	}
	n.repeatedRun = true
	n.mu.Unlock()
}

// Stop stops the current swarm node if it is running.
func (n *nodeRunner) Stop() error {
	n.mu.Lock()
	if n.cancelReconnect != nil { // between restarts
		n.cancelReconnect()
		n.cancelReconnect = nil
	}
	if n.swarmNode == nil {
		n.mu.Unlock()
		return nil
	}
	n.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	n.mu.Unlock()
	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		return err
	}
	n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
	<-n.done
	return nil
}

func (n *nodeRunner) State() nodeState {
	if n == nil {
		return nodeState{status: types.LocalNodeStateInactive}
	}
	n.mu.RLock()
	defer n.mu.RUnlock()

	ns := n.nodeState

	if ns.err != nil || n.cancelReconnect != nil {
		if errors.Cause(ns.err) == errSwarmLocked {
			ns.status = types.LocalNodeStateLocked
		} else {
			ns.status = types.LocalNodeStateError
		}
	} else {
		select {
		case <-n.ready:
			ns.status = types.LocalNodeStateActive
		default:
			ns.status = types.LocalNodeStatePending
		}
	}

	return ns
}
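
// enableReconnectWatcher schedules a restart of the swarm node after the
// current reconnect delay, doubling that delay (capped at maxReconnectDelay)
// on every call. The timeout context acts as a cancellable timer: Stop can
// abort a pending restart through n.cancelReconnect, in which case the
// deadline never expires and the goroutine returns without restarting.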
func (n *nodeRunner) enableReconnectWatcher() {
	if n.stopping {
		return
	}
	n.reconnectDelay *= 2
	if n.reconnectDelay > maxReconnectDelay {
		n.reconnectDelay = maxReconnectDelay
	}
	logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
	delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
	n.cancelReconnect = cancel

	go func() {
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		n.mu.Lock()
		defer n.mu.Unlock()
		if n.stopping {
			return
		}

		if err := n.start(n.config); err != nil {
			n.err = err
		}
	}()
}

// nodeState represents information about the current state of the cluster and
// provides access to the grpc clients.
type nodeState struct {
	swarmNode       *swarmnode.Node
	grpcConn        *grpc.ClientConn
	controlClient   swarmapi.ControlClient
	logsClient      swarmapi.LogsClient
	status          types.LocalNodeState
	actualLocalAddr string
	err             error
}

// IsActiveManager returns true if node is a manager ready to accept control requests. It is safe to access the client properties if this returns true.
func (ns nodeState) IsActiveManager() bool {
	return ns.controlClient != nil
}

// IsManager returns true if node is a manager.
func (ns nodeState) IsManager() bool {
	return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns node's ID or empty string if node is inactive.
func (ns nodeState) NodeID() string {
	if ns.swarmNode != nil {
		return ns.swarmNode.NodeID()
	}
	return ""
}
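
// The helper below is an illustrative sketch, not part of the original file:
// it shows how a caller might consume Ready() together with a timeout, which
// is roughly how the surrounding cluster code waits for a node during init
// and join. The function name and the timeout handling are assumptions made
// for the example only.
func waitForRunnerReady(nr *nodeRunner, timeout time.Duration) error {
	select {
	case err := <-nr.Ready():
		// A non-nil error means the node exited before becoming ready;
		// nil means it became active (or exited without recording an error).
		return err
	case <-time.After(timeout):
		return fmt.Errorf("swarm node did not become ready within %v", timeout)
	}
}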