github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/role_manager.go (about) 1 package manager 2 3 import ( 4 "context" 5 "time" 6 7 "code.cloudfoundry.org/clock" 8 "github.com/docker/swarmkit/api" 9 "github.com/docker/swarmkit/log" 10 "github.com/docker/swarmkit/manager/state/raft" 11 "github.com/docker/swarmkit/manager/state/raft/membership" 12 "github.com/docker/swarmkit/manager/state/store" 13 ) 14 15 const ( 16 // roleReconcileInterval is how often to retry removing a node, if a reconciliation or 17 // removal failed 18 roleReconcileInterval = 5 * time.Second 19 20 // removalTimeout is how long to wait before a raft member removal fails to be applied 21 // to the store 22 removalTimeout = 5 * time.Second 23 ) 24 25 // roleManager reconciles the raft member list with desired role changes. 26 type roleManager struct { 27 ctx context.Context 28 cancel func() 29 30 store *store.MemoryStore 31 raft *raft.Node 32 doneChan chan struct{} 33 34 // pendingReconciliation contains changed nodes that have not yet been reconciled in 35 // the raft member list. 36 pendingReconciliation map[string]*api.Node 37 38 // pendingRemoval contains the IDs of nodes that have been deleted - if these correspond 39 // to members in the raft cluster, those members need to be removed from raft 40 pendingRemoval map[string]struct{} 41 42 // leave this nil except for tests which need to inject a fake time source 43 clocksource clock.Clock 44 } 45 46 // newRoleManager creates a new roleManager. 47 func newRoleManager(store *store.MemoryStore, raftNode *raft.Node) *roleManager { 48 ctx, cancel := context.WithCancel(context.Background()) 49 return &roleManager{ 50 ctx: ctx, 51 cancel: cancel, 52 store: store, 53 raft: raftNode, 54 doneChan: make(chan struct{}), 55 pendingReconciliation: make(map[string]*api.Node), 56 pendingRemoval: make(map[string]struct{}), 57 } 58 } 59 60 // getTicker returns a ticker based on the configured clock source 61 func (rm *roleManager) getTicker(interval time.Duration) clock.Ticker { 62 if rm.clocksource == nil { 63 return clock.NewClock().NewTicker(interval) 64 } 65 return rm.clocksource.NewTicker(interval) 66 67 } 68 69 // Run is roleManager's main loop. On startup, it looks at every node object in the cluster and 70 // attempts to reconcile the raft member list with all the nodes' desired roles. If any nodes 71 // need to be demoted or promoted, it will add them to a reconciliation queue, and if any raft 72 // members' node have been deleted, it will add them to a removal queue. 73 74 // These queues are processed immediately, and any nodes that failed to be processed are 75 // processed again in the next reconciliation interval, so that nodes will hopefully eventually 76 // be reconciled. As node updates come in, any promotions or demotions are also added to the 77 // reconciliation queue and reconciled. As node removals come in, they are added to the removal 78 // queue to be removed from the raft cluster. 79 80 // Removal from a raft cluster is idempotent (and it's the only raft cluster change that will occur 81 // during reconciliation or removal), so it's fine if a node is in both the removal and reconciliation 82 // queues. 83 84 // The ctx param is only used for logging. 85 func (rm *roleManager) Run(ctx context.Context) { 86 defer close(rm.doneChan) 87 88 var ( 89 nodes []*api.Node 90 91 // ticker and tickerCh are used to time the reconciliation interval, which will 92 // periodically attempt to re-reconcile nodes that failed to reconcile the first 93 // time through 94 ticker clock.Ticker 95 tickerCh <-chan time.Time 96 ) 97 98 watcher, cancelWatch, err := store.ViewAndWatch(rm.store, 99 func(readTx store.ReadTx) error { 100 var err error 101 nodes, err = store.FindNodes(readTx, store.All) 102 return err 103 }, 104 api.EventUpdateNode{}, 105 api.EventDeleteNode{}) 106 defer cancelWatch() 107 108 if err != nil { 109 log.G(ctx).WithError(err).Error("failed to check nodes for role changes") 110 } else { 111 // Assume all raft members have been deleted from the cluster, until the node list 112 // tells us otherwise. We can make this assumption because the node object must 113 // exist first before the raft member object. 114 115 // Background life-cycle for a manager: it joins the cluster, getting a new TLS 116 // certificate. To get a TLS certificate, it makes an RPC call to the CA server, 117 // which on successful join adds its information to the cluster node list and 118 // eventually generates a TLS certificate for it. Once it has a TLS certificate, 119 // it can contact the other nodes, and makes an RPC call to request to join the 120 // raft cluster. The node it contacts will add the node to the raft membership. 121 for _, member := range rm.raft.GetMemberlist() { 122 rm.pendingRemoval[member.NodeID] = struct{}{} 123 } 124 for _, node := range nodes { 125 // if the node exists, we don't want it removed from the raft membership cluster 126 // necessarily 127 delete(rm.pendingRemoval, node.ID) 128 129 // reconcile each existing node 130 rm.pendingReconciliation[node.ID] = node 131 rm.reconcileRole(ctx, node) 132 } 133 for nodeID := range rm.pendingRemoval { 134 rm.evictRemovedNode(ctx, nodeID) 135 } 136 // If any reconciliations or member removals failed, we want to try again, so 137 // make sure that we start the ticker so we can try again and again every 138 // roleReconciliationInterval seconds until the queues are both empty. 139 if len(rm.pendingReconciliation) != 0 || len(rm.pendingRemoval) != 0 { 140 ticker = rm.getTicker(roleReconcileInterval) 141 tickerCh = ticker.C() 142 } 143 } 144 145 for { 146 select { 147 case event := <-watcher: 148 switch ev := event.(type) { 149 case api.EventUpdateNode: 150 rm.pendingReconciliation[ev.Node.ID] = ev.Node 151 rm.reconcileRole(ctx, ev.Node) 152 case api.EventDeleteNode: 153 rm.pendingRemoval[ev.Node.ID] = struct{}{} 154 rm.evictRemovedNode(ctx, ev.Node.ID) 155 } 156 // If any reconciliations or member removals failed, we want to try again, so 157 // make sure that we start the ticker so we can try again and again every 158 // roleReconciliationInterval seconds until the queues are both empty. 159 if (len(rm.pendingReconciliation) != 0 || len(rm.pendingRemoval) != 0) && ticker == nil { 160 ticker = rm.getTicker(roleReconcileInterval) 161 tickerCh = ticker.C() 162 } 163 case <-tickerCh: 164 for _, node := range rm.pendingReconciliation { 165 rm.reconcileRole(ctx, node) 166 } 167 for nodeID := range rm.pendingRemoval { 168 rm.evictRemovedNode(ctx, nodeID) 169 } 170 if len(rm.pendingReconciliation) == 0 && len(rm.pendingRemoval) == 0 { 171 ticker.Stop() 172 ticker = nil 173 tickerCh = nil 174 } 175 case <-rm.ctx.Done(): 176 if ticker != nil { 177 ticker.Stop() 178 } 179 return 180 } 181 } 182 } 183 184 // evictRemovedNode evicts a removed node from the raft cluster membership. This is to cover an edge case in which 185 // a node might have been removed, but somehow the role was not reconciled (possibly a demotion and a removal happened 186 // in rapid succession before the raft membership configuration went through). 187 func (rm *roleManager) evictRemovedNode(ctx context.Context, nodeID string) { 188 // Check if the member still exists in the membership 189 member := rm.raft.GetMemberByNodeID(nodeID) 190 if member != nil { 191 // We first try to remove the raft node from the raft cluster. On the next tick, if the node 192 // has been removed from the cluster membership, we then delete it from the removed list 193 rm.removeMember(ctx, member) 194 return 195 } 196 delete(rm.pendingRemoval, nodeID) 197 } 198 199 // removeMember removes a member from the raft cluster membership 200 func (rm *roleManager) removeMember(ctx context.Context, member *membership.Member) { 201 // Quorum safeguard - quorum should have been checked before a node was allowed to be demoted, but if in the 202 // intervening time some other node disconnected, removing this node would result in a loss of cluster quorum. 203 // We leave it 204 if !rm.raft.CanRemoveMember(member.RaftID) { 205 // TODO(aaronl): Retry later 206 log.G(ctx).Debugf("can't demote node %s at this time: removing member from raft would result in a loss of quorum", member.NodeID) 207 return 208 } 209 210 rmCtx, rmCancel := context.WithTimeout(rm.ctx, removalTimeout) 211 defer rmCancel() 212 213 if member.RaftID == rm.raft.Config.ID { 214 // Don't use rmCtx, because we expect to lose 215 // leadership, which will cancel this context. 216 log.G(ctx).Info("demoted; transferring leadership") 217 err := rm.raft.TransferLeadership(context.Background()) 218 if err == nil { 219 return 220 } 221 log.G(ctx).WithError(err).Info("failed to transfer leadership") 222 } 223 if err := rm.raft.RemoveMember(rmCtx, member.RaftID); err != nil { 224 // TODO(aaronl): Retry later 225 log.G(ctx).WithError(err).Debugf("can't demote node %s at this time", member.NodeID) 226 } 227 } 228 229 // reconcileRole looks at the desired role for a node, and if it is being demoted or promoted, updates the 230 // node role accordingly. If the node is being demoted, it also removes the node from the raft cluster membership. 231 func (rm *roleManager) reconcileRole(ctx context.Context, node *api.Node) { 232 if node.Role == node.Spec.DesiredRole { 233 // Nothing to do. 234 delete(rm.pendingReconciliation, node.ID) 235 return 236 } 237 238 // Promotion can proceed right away. 239 if node.Spec.DesiredRole == api.NodeRoleManager && node.Role == api.NodeRoleWorker { 240 err := rm.store.Update(func(tx store.Tx) error { 241 updatedNode := store.GetNode(tx, node.ID) 242 if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role { 243 return nil 244 } 245 updatedNode.Role = api.NodeRoleManager 246 return store.UpdateNode(tx, updatedNode) 247 }) 248 if err != nil { 249 log.G(ctx).WithError(err).Errorf("failed to promote node %s", node.ID) 250 } else { 251 delete(rm.pendingReconciliation, node.ID) 252 } 253 } else if node.Spec.DesiredRole == api.NodeRoleWorker && node.Role == api.NodeRoleManager { 254 // Check for node in memberlist 255 member := rm.raft.GetMemberByNodeID(node.ID) 256 if member != nil { 257 // We first try to remove the raft node from the raft cluster. On the next tick, if the node 258 // has been removed from the cluster membership, we then update the store to reflect the fact 259 // that it has been successfully demoted, and if that works, remove it from the pending list. 260 rm.removeMember(ctx, member) 261 return 262 } 263 264 err := rm.store.Update(func(tx store.Tx) error { 265 updatedNode := store.GetNode(tx, node.ID) 266 if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role { 267 return nil 268 } 269 updatedNode.Role = api.NodeRoleWorker 270 271 return store.UpdateNode(tx, updatedNode) 272 }) 273 if err != nil { 274 log.G(ctx).WithError(err).Errorf("failed to demote node %s", node.ID) 275 } else { 276 delete(rm.pendingReconciliation, node.ID) 277 } 278 } 279 } 280 281 // Stop stops the roleManager and waits for the main loop to exit. 282 func (rm *roleManager) Stop() { 283 rm.cancel() 284 <-rm.doneChan 285 }