github.com/zhuohuang-hust/src-cbuild@v0.0.0-20230105071821-c7aab3e7c840/daemon/cluster/noderunner.go

package cluster

import (
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/executor/container"
	swarmapi "github.com/docker/swarmkit/api"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
)

// nodeRunner implements a manager for a continuously running swarmkit node, restarting it with backoff delays if needed.
type nodeRunner struct {
	nodeState
	mu             sync.RWMutex
	done           chan struct{} // closed when swarmNode exits
	ready          chan struct{} // closed when swarmNode becomes active
	reconnectDelay time.Duration
	config         nodeStartConfig

	repeatedRun     bool
	cancelReconnect func()
	stopping        bool
	cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk in JSON. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr   string
	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
}

func (n *nodeRunner) Ready() chan error {
	c := make(chan error, 1)
	n.mu.RLock()
	ready, done := n.ready, n.done
	n.mu.RUnlock()
	go func() {
		select {
		case <-ready:
		case <-done:
		}
		select {
		case <-ready:
		default:
			n.mu.RLock()
			c <- n.err
			n.mu.RUnlock()
		}
		close(c)
	}()
	return c
}

func (n *nodeRunner) Start(conf nodeStartConfig) error {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.reconnectDelay = initialReconnectDelay

	return n.start(conf)
}

func (n *nodeRunner) start(conf nodeStartConfig) error {
	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
	}

	node, err := swarmnode.New(&swarmnode.Config{
		Hostname:           n.cluster.config.Name,
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		JoinAddr:           conf.joinAddr,
		StateDir:           n.cluster.root,
		JoinToken:          conf.joinToken,
		Executor:           container.NewExecutor(n.cluster.config.Backend),
		HeartbeatTick:      1,
		ElectionTick:       3,
		UnlockKey:          conf.lockKey,
		AutoLockManagers:   conf.autolock,
	})
	if err != nil {
		return err
	}
	if err := node.Start(context.Background()); err != nil {
		return err
	}

	n.done = make(chan struct{})
	n.ready = make(chan struct{})
	n.swarmNode = node
	n.config = conf
	savePersistentState(n.cluster.root, conf)

	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		n.handleNodeExit(node)
		cancel()
	}()
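
	// The goroutine above blocks until the swarmkit node exits, then cancels
	// ctx, which in turn stops the ready and control-socket watchers started below.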
	go n.handleReadyEvent(ctx, node, n.ready)
	go n.handleControlSocketChange(ctx, node)

	return nil
}

func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
	for conn := range node.ListenControlSocket(ctx) {
		n.mu.Lock()
		if n.grpcConn != conn {
			if conn == nil {
				n.controlClient = nil
				n.logsClient = nil
			} else {
				n.controlClient = swarmapi.NewControlClient(conn)
				n.logsClient = swarmapi.NewLogsClient(conn)
			}
		}
		n.grpcConn = conn
		n.mu.Unlock()
		n.cluster.configEvent <- struct{}{}
	}
}

func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
	select {
	case <-node.Ready():
		n.mu.Lock()
		n.err = nil
		n.mu.Unlock()
		close(ready)
	case <-ctx.Done():
	}
	n.cluster.configEvent <- struct{}{}
}

func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
	err := detectLockedError(node.Err(context.Background()))
	if err != nil {
		logrus.Errorf("cluster exited with error: %v", err)
	}
	n.mu.Lock()
	n.swarmNode = nil
	n.err = err
	close(n.done)
	select {
	case <-n.ready:
		n.enableReconnectWatcher()
	default:
		if n.repeatedRun {
			n.enableReconnectWatcher()
		}
	}
	n.repeatedRun = true
	n.mu.Unlock()
}

// Stop stops the current swarm node if it is running.
func (n *nodeRunner) Stop() error {
	n.mu.Lock()
	if n.cancelReconnect != nil { // between restarts
		n.cancelReconnect()
		n.cancelReconnect = nil
	}
	if n.swarmNode == nil {
		n.mu.Unlock()
		return nil
	}
	n.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		n.mu.Unlock()
		return err
	}
	n.mu.Unlock()
	<-n.done
	return nil
}

func (n *nodeRunner) State() nodeState {
	if n == nil {
		return nodeState{status: types.LocalNodeStateInactive}
	}
	n.mu.RLock()
	defer n.mu.RUnlock()

	ns := n.nodeState

	if ns.err != nil || n.cancelReconnect != nil {
		if errors.Cause(ns.err) == errSwarmLocked {
			ns.status = types.LocalNodeStateLocked
		} else {
			ns.status = types.LocalNodeStateError
		}
	} else {
		select {
		case <-n.ready:
			ns.status = types.LocalNodeStateActive
		default:
			ns.status = types.LocalNodeStatePending
		}
	}

	return ns
}

func (n *nodeRunner) enableReconnectWatcher() {
	if n.stopping {
		return
	}
	n.reconnectDelay *= 2
	if n.reconnectDelay > maxReconnectDelay {
		n.reconnectDelay = maxReconnectDelay
	}
	logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
	delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
	n.cancelReconnect = cancel

	config := n.config
	go func() {
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		n.mu.Lock()
		defer n.mu.Unlock()
		if n.stopping {
			return
		}
		config.RemoteAddr = n.cluster.getRemoteAddress()
		config.joinAddr = config.RemoteAddr
		if err := n.start(config); err != nil {
			n.err = err
		}
	}()
}

// nodeState represents information about the current state of the cluster and
// provides access to the grpc clients.
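// Callers should check IsActiveManager before using controlClient or logsClient:
// handleControlSocketChange only populates the clients while a control socket
// connection is live, and clears them when it goes away.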
type nodeState struct {
	swarmNode       *swarmnode.Node
	grpcConn        *grpc.ClientConn
	controlClient   swarmapi.ControlClient
	logsClient      swarmapi.LogsClient
	status          types.LocalNodeState
	actualLocalAddr string
	err             error
}

// IsActiveManager returns true if the node is a manager ready to accept control requests. It is safe to access the client properties if this returns true.
func (ns nodeState) IsActiveManager() bool {
	return ns.controlClient != nil
}

// IsManager returns true if the node is a manager.
func (ns nodeState) IsManager() bool {
	return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns the node's ID, or an empty string if the node is inactive.
func (ns nodeState) NodeID() string {
	if ns.swarmNode != nil {
		return ns.swarmNode.NodeID()
	}
	return ""
}
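
// Illustrative lifecycle sketch (not part of the original source): a caller such
// as Cluster is expected to drive a nodeRunner roughly as follows; the way the
// runner is constructed and the nodeStartConfig values shown are assumptions.
//
//	nr := &nodeRunner{cluster: c}
//	if err := nr.Start(nodeStartConfig{ListenAddr: "0.0.0.0:2377"}); err != nil {
//		return err
//	}
//	if err := <-nr.Ready(); err != nil { // nil once the swarmkit node reports ready
//		return err
//	}
//	if nr.State().IsActiveManager() {
//		// safe to use the control and logs clients here
//	}
//	defer nr.Stop() // cancels any pending reconnect and waits for the node to exit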