github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/nomad/drainer/watch_nodes.go

package drainer

import (
	"context"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

// DrainingNodeWatcher is the interface for watching for draining nodes.
type DrainingNodeWatcher interface{}

// TrackedNodes returns the set of tracked nodes.
func (n *NodeDrainer) TrackedNodes() map[string]*structs.Node {
	n.l.RLock()
	defer n.l.RUnlock()

	t := make(map[string]*structs.Node, len(n.nodes))
	for n, d := range n.nodes {
		t[n] = d.GetNode()
	}

	return t
}

// Remove removes the given node from being tracked.
func (n *NodeDrainer) Remove(nodeID string) {
	n.l.Lock()
	defer n.l.Unlock()

	// TODO test the notifier is updated
	// Remove it from being tracked and remove it from the deadline notifier
	delete(n.nodes, nodeID)
	n.deadlineNotifier.Remove(nodeID)
}

// Update updates the node, either updating the tracked version or starting to
// track the node.
func (n *NodeDrainer) Update(node *structs.Node) {
	n.l.Lock()
	defer n.l.Unlock()

	if node == nil {
		return
	}

	draining, ok := n.nodes[node.ID]
	if !ok {
		draining = NewDrainingNode(node, n.state)
		n.nodes[node.ID] = draining
	} else {
		// Update it
		draining.Update(node)
	}

	// TODO test the notifier is updated
	if inf, deadline := node.DrainStrategy.DeadlineTime(); !inf {
		n.deadlineNotifier.Watch(node.ID, deadline)
	} else {
		// There is an infinite deadline so it shouldn't be tracked for
		// deadlining
		n.deadlineNotifier.Remove(node.ID)
	}

	// TODO Test this
	// Register interest in the draining jobs.
	jobs, err := draining.DrainingJobs()
	if err != nil {
		n.logger.Error("error retrieving draining jobs on node", "node_id", node.ID, "error", err)
		return
	}
	n.logger.Trace("node has draining jobs on it", "node_id", node.ID, "num_jobs", len(jobs))
	n.jobWatcher.RegisterJobs(jobs)

	// TODO Test at this layer as well that a node drain on a node without
	// allocs immediately gets unmarked as draining
	// Check whether the node is already done so that if an operator drains a
	// node with nothing on it, the drain is unset immediately
	done, err := draining.IsDone()
	if err != nil {
		n.logger.Error("failed to check if node is done draining", "node_id", node.ID, "error", err)
		return
	}

	if done {
		// Node is done draining. Stop remaining system allocs before
		// marking node as complete.
		remaining, err := draining.RemainingAllocs()
		if err != nil {
			n.logger.Error("error getting remaining allocs on drained node", "node_id", node.ID, "error", err)
		} else if len(remaining) > 0 {
			future := structs.NewBatchFuture()
			n.drainAllocs(future, remaining)
			if err := future.Wait(); err != nil {
				n.logger.Error("failed to drain remaining allocs from done node", "num_allocs", len(remaining), "node_id", node.ID, "error", err)
			}
		}

		// Create the node event
		event := structs.NewNodeEvent().
			SetSubsystem(structs.NodeEventSubsystemDrain).
			SetMessage(NodeDrainEventComplete)

		index, err := n.raft.NodesDrainComplete([]string{node.ID}, event)
		if err != nil {
			n.logger.Error("failed to unset drain for node", "node_id", node.ID, "error", err)
		} else {
			n.logger.Info("node completed draining at index", "node_id", node.ID, "index", index)
		}
	}
}

// nodeDrainWatcher is used to watch nodes that are entering, leaving or
// changing their drain strategy.
type nodeDrainWatcher struct {
	ctx    context.Context
	logger log.Logger

	// state is the state store that is watched for node changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries
	limiter *rate.Limiter

	// tracker is the object that is tracking the nodes and provides us with
	// the needed callbacks
	tracker NodeTracker
}

// NewNodeDrainWatcher returns a new node drain watcher.
func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger, tracker NodeTracker) *nodeDrainWatcher {
	w := &nodeDrainWatcher{
		ctx:     ctx,
		limiter: limiter,
		logger:  logger.Named("node_watcher"),
		tracker: tracker,
		state:   state,
	}

	go w.watch()
	return w
}

// watch is the long-lived watching routine that detects node changes.
func (w *nodeDrainWatcher) watch() {
	nindex := uint64(1)
	for {
		nodes, index, err := w.getNodes(nindex)
		if err != nil {
			if err == context.Canceled {
				return
			}

			w.logger.Error("error watching node updates at index", "index", nindex, "error", err)
			select {
			case <-w.ctx.Done():
				return
			case <-time.After(stateReadErrorDelay):
				continue
			}
		}

		// update index for next run
		nindex = index

		tracked := w.tracker.TrackedNodes()
		for nodeID, node := range nodes {
			newDraining := node.DrainStrategy != nil
			currentNode, tracked := tracked[nodeID]

			switch {
			// If the node is tracked but not draining, untrack
			case tracked && !newDraining:
				w.tracker.Remove(nodeID)

			// If the node is not being tracked but is draining, track
			case !tracked && newDraining:
				w.tracker.Update(node)

			// If the node is being tracked but its drain strategy has changed, update
			case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy):
				w.tracker.Update(node)
			default:
			}

			// TODO(schmichael) handle the case of a lost node
		}

		for nodeID := range tracked {
			if _, ok := nodes[nodeID]; !ok {
				w.tracker.Remove(nodeID)
			}
		}
	}
}

// getNodes returns all nodes, blocking until the node index is greater than
// the given index.
func (w *nodeDrainWatcher) getNodes(minIndex uint64) (map[string]*structs.Node, uint64, error) {
	if err := w.limiter.Wait(w.ctx); err != nil {
		return nil, 0, err
	}

	resp, index, err := w.state.BlockingQuery(w.getNodesImpl, minIndex, w.ctx)
	if err != nil {
		return nil, 0, err
	}

	return resp.(map[string]*structs.Node), index, nil
}

// getNodesImpl is used to get nodes from the state store, returning the set of
// nodes and the index at which they were retrieved.
func (w *nodeDrainWatcher) getNodesImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	iter, err := state.Nodes(ws)
	if err != nil {
		return nil, 0, err
	}

	index, err := state.Index("nodes")
	if err != nil {
		return nil, 0, err
	}

	var maxIndex uint64 = 0
	resp := make(map[string]*structs.Node, 64)
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		node := raw.(*structs.Node)
		resp[node.ID] = node
		if maxIndex < node.ModifyIndex {
			maxIndex = node.ModifyIndex
		}
	}

	// Prefer using the actual max index of affected nodes since it means less
	// unblocking
	if maxIndex != 0 {
		index = maxIndex
	}

	return resp, index, nil
}
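
// Example wiring (illustrative sketch, not part of the upstream file): the
// watcher is constructed with a rate limiter, the state store, and a
// NodeTracker, and starts its watch loop from the constructor goroutine.
// The function name exampleStartWatcher and the limiter rate/burst values
// below are hypothetical; only NewNodeDrainWatcher's signature comes from
// this file.
//
//	func exampleStartWatcher(ctx context.Context, store *state.StateStore, tracker NodeTracker) *nodeDrainWatcher {
//		// Hypothetical rate and burst for the blocking-query limiter.
//		limiter := rate.NewLimiter(rate.Limit(100), 100)
//		return NewNodeDrainWatcher(ctx, limiter, store, log.Default(), tracker)
//	}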