github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/nomad/drainer/watch_nodes.go

package drainer

import (
	"context"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/helper"

	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

// DrainingNodeWatcher is the interface for watching for draining nodes.
type DrainingNodeWatcher interface{}

// TrackedNodes returns the set of tracked nodes.
func (n *NodeDrainer) TrackedNodes() map[string]*structs.Node {
	n.l.RLock()
	defer n.l.RUnlock()

	t := make(map[string]*structs.Node, len(n.nodes))
	for n, d := range n.nodes {
		t[n] = d.GetNode()
	}

	return t
}

// Remove removes the given node from being tracked.
func (n *NodeDrainer) Remove(nodeID string) {
	n.l.Lock()
	defer n.l.Unlock()

	// TODO test the notifier is updated
	// Remove it from being tracked and remove it from the deadline notifier.
	delete(n.nodes, nodeID)
	n.deadlineNotifier.Remove(nodeID)
}

// Update updates the node, either updating the tracked version or starting to
// track the node.
func (n *NodeDrainer) Update(node *structs.Node) {
	n.l.Lock()
	defer n.l.Unlock()

	if node == nil {
		return
	}

	draining, ok := n.nodes[node.ID]
	if !ok {
		draining = NewDrainingNode(node, n.state)
		n.nodes[node.ID] = draining
	} else {
		// Update it
		draining.Update(node)
	}

	// TODO test the notifier is updated
	if inf, deadline := node.DrainStrategy.DeadlineTime(); !inf {
		n.deadlineNotifier.Watch(node.ID, deadline)
	} else {
		// There is an infinite deadline so it shouldn't be tracked for
		// deadlining.
		n.deadlineNotifier.Remove(node.ID)
	}

	// TODO Test this
	// Register interest in the draining jobs.
	jobs, err := draining.DrainingJobs()
	if err != nil {
		n.logger.Error("error retrieving draining jobs on node", "node_id", node.ID, "error", err)
		return
	}
	n.logger.Trace("node has draining jobs on it", "node_id", node.ID, "num_jobs", len(jobs))
	n.jobWatcher.RegisterJobs(jobs)

	// TODO Test at this layer as well that a node drain on a node without
	// allocs immediately gets unmarked as draining
	// Check whether the node is done draining so that, if an operator drains a
	// node with nothing on it, we unset drain immediately.
	done, err := draining.IsDone()
	if err != nil {
		n.logger.Error("failed to check if node is done draining", "node_id", node.ID, "error", err)
		return
	}

	if done {
		// Node is done draining. Stop remaining system allocs before
		// marking node as complete.
		remaining, err := draining.RemainingAllocs()
		if err != nil {
			n.logger.Error("error getting remaining allocs on drained node", "node_id", node.ID, "error", err)
		} else if len(remaining) > 0 {
			future := structs.NewBatchFuture()
			n.drainAllocs(future, remaining)
			if err := future.Wait(); err != nil {
				n.logger.Error("failed to drain remaining allocs from done node", "num_allocs", len(remaining), "node_id", node.ID, "error", err)
			}
		}

		// Create the node event
		event := structs.NewNodeEvent().
			SetSubsystem(structs.NodeEventSubsystemDrain).
			SetMessage(NodeDrainEventComplete)

		index, err := n.raft.NodesDrainComplete([]string{node.ID}, event)
		if err != nil {
			n.logger.Error("failed to unset drain for node", "node_id", node.ID, "error", err)
		} else {
			n.logger.Info("node completed draining at index", "node_id", node.ID, "index", index)
		}
	}
}

// nodeDrainWatcher is used to watch nodes that are entering, leaving or
// changing their drain strategy.
type nodeDrainWatcher struct {
	ctx    context.Context
	logger log.Logger

	// state is the state that is watched for state changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries.
	limiter *rate.Limiter

	// tracker is the object that is tracking the nodes and provides us with
	// the needed callbacks.
	tracker NodeTracker
}

// NewNodeDrainWatcher returns a new node drain watcher.
func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger, tracker NodeTracker) *nodeDrainWatcher {
	w := &nodeDrainWatcher{
		ctx:     ctx,
		limiter: limiter,
		logger:  logger.Named("node_watcher"),
		tracker: tracker,
		state:   state,
	}

	go w.watch()
	return w
}

// watch is the long-lived watching routine that detects node changes.
func (w *nodeDrainWatcher) watch() {
	timer, stop := helper.NewSafeTimer(stateReadErrorDelay)
	defer stop()

	nindex := uint64(1)

	for {
		timer.Reset(stateReadErrorDelay)
		nodes, index, err := w.getNodes(nindex)
		if err != nil {
			if err == context.Canceled {
				return
			}

			w.logger.Error("error watching node updates at index", "index", nindex, "error", err)
			select {
			case <-w.ctx.Done():
				return
			case <-timer.C:
				continue
			}
		}

		// Update the index for the next run.
		nindex = index

		tracked := w.tracker.TrackedNodes()
		for nodeID, node := range nodes {
			newDraining := node.DrainStrategy != nil
			currentNode, tracked := tracked[nodeID]

			switch {
			// If the node is tracked but not draining, untrack it.
			case tracked && !newDraining:
				w.tracker.Remove(nodeID)

			// If the node is not being tracked but is draining, track it.
			case !tracked && newDraining:
				w.tracker.Update(node)

			// If the node is being tracked but its drain strategy has changed,
			// update it.
			case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy):
				w.tracker.Update(node)
			default:
			}

			// TODO(schmichael) handle the case of a lost node
		}

		for nodeID := range tracked {
			if _, ok := nodes[nodeID]; !ok {
				w.tracker.Remove(nodeID)
			}
		}
	}
}

// getNodes returns all nodes, blocking until the nodes table index is greater
// than the given index.
func (w *nodeDrainWatcher) getNodes(minIndex uint64) (map[string]*structs.Node, uint64, error) {
	if err := w.limiter.Wait(w.ctx); err != nil {
		return nil, 0, err
	}

	resp, index, err := w.state.BlockingQuery(w.getNodesImpl, minIndex, w.ctx)
	if err != nil {
		return nil, 0, err
	}

	return resp.(map[string]*structs.Node), index, nil
}

// getNodesImpl is used to get nodes from the state store, returning the set of
// nodes and the index they represent.
func (w *nodeDrainWatcher) getNodesImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	iter, err := state.Nodes(ws)
	if err != nil {
		return nil, 0, err
	}

	index, err := state.Index("nodes")
	if err != nil {
		return nil, 0, err
	}

	var maxIndex uint64 = 0
	resp := make(map[string]*structs.Node, 64)
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		node := raw.(*structs.Node)
		resp[node.ID] = node
		if maxIndex < node.ModifyIndex {
			maxIndex = node.ModifyIndex
		}
	}

	// Prefer using the actual max index of affected nodes since it means less
	// unblocking.
	if maxIndex != 0 {
		index = maxIndex
	}

	return resp, index, nil
}
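A minimal usage sketch follows, assuming it lives in a test file inside the same drainer package (NewNodeDrainWatcher returns the unexported *nodeDrainWatcher, so the type can only be named from within the package). The logTracker type and newWatchedTracker helper are hypothetical, the NodeTracker interface is assumed to require only the TrackedNodes, Remove, and Update callbacks exercised by the watch loop above, and the rate limit is an arbitrary choice for illustration.

// Hypothetical test-only wiring for nodeDrainWatcher; not part of watch_nodes.go.
package drainer

import (
	"context"
	"sync"

	log "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

// logTracker is a hypothetical NodeTracker stand-in that records which nodes
// the watcher reports as draining; the production tracker is the NodeDrainer.
// If NodeTracker declares more methods than the three used above, this stub
// would need them as well.
type logTracker struct {
	mu     sync.RWMutex
	nodes  map[string]*structs.Node
	logger log.Logger
}

// TrackedNodes returns a copy of the currently tracked nodes.
func (t *logTracker) TrackedNodes() map[string]*structs.Node {
	t.mu.RLock()
	defer t.mu.RUnlock()

	out := make(map[string]*structs.Node, len(t.nodes))
	for id, node := range t.nodes {
		out[id] = node
	}
	return out
}

// Remove drops a node from the tracked set.
func (t *logTracker) Remove(nodeID string) {
	t.mu.Lock()
	defer t.mu.Unlock()

	delete(t.nodes, nodeID)
	t.logger.Info("stopped tracking node", "node_id", nodeID)
}

// Update tracks a node or refreshes its drain specification.
func (t *logTracker) Update(node *structs.Node) {
	t.mu.Lock()
	defer t.mu.Unlock()

	t.nodes[node.ID] = node
	t.logger.Info("tracking draining node", "node_id", node.ID)
}

// newWatchedTracker wires the tracker to a watcher against an existing state
// store; the store and logger are assumed to come from the surrounding setup.
func newWatchedTracker(ctx context.Context, store *state.StateStore, logger log.Logger) (*logTracker, *nodeDrainWatcher) {
	tracker := &logTracker{
		nodes:  make(map[string]*structs.Node),
		logger: logger.Named("tracker"),
	}

	// Arbitrary example rate: at most ~10 blocking queries per second.
	limiter := rate.NewLimiter(rate.Limit(10), 10)

	// NewNodeDrainWatcher starts the watch goroutine itself.
	watcher := NewNodeDrainWatcher(ctx, limiter, store, logger, tracker)
	return tracker, watcher
}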