github.com/emate/nomad@v0.8.2-wo-binpacking/nomad/drainer/watch_nodes.go (about) 1 package drainer 2 3 import ( 4 "context" 5 "log" 6 "time" 7 8 memdb "github.com/hashicorp/go-memdb" 9 "github.com/hashicorp/nomad/nomad/state" 10 "github.com/hashicorp/nomad/nomad/structs" 11 "golang.org/x/time/rate" 12 ) 13 14 // DrainingNodeWatcher is the interface for watching for draining nodes. 15 type DrainingNodeWatcher interface{} 16 17 // TrackedNodes returns the set of tracked nodes 18 func (n *NodeDrainer) TrackedNodes() map[string]*structs.Node { 19 n.l.RLock() 20 defer n.l.RUnlock() 21 22 t := make(map[string]*structs.Node, len(n.nodes)) 23 for n, d := range n.nodes { 24 t[n] = d.GetNode() 25 } 26 27 return t 28 } 29 30 // Remove removes the given node from being tracked 31 func (n *NodeDrainer) Remove(nodeID string) { 32 n.l.Lock() 33 defer n.l.Unlock() 34 35 // TODO test the notifier is updated 36 // Remove it from being tracked and remove it from the dealiner 37 delete(n.nodes, nodeID) 38 n.deadlineNotifier.Remove(nodeID) 39 } 40 41 // Update updates the node, either updating the tracked version or starting to 42 // track the node. 43 func (n *NodeDrainer) Update(node *structs.Node) { 44 n.l.Lock() 45 defer n.l.Unlock() 46 47 if node == nil { 48 return 49 } 50 51 draining, ok := n.nodes[node.ID] 52 if !ok { 53 draining = NewDrainingNode(node, n.state) 54 n.nodes[node.ID] = draining 55 } else { 56 // Update it 57 draining.Update(node) 58 } 59 60 // TODO test the notifier is updated 61 if inf, deadline := node.DrainStrategy.DeadlineTime(); !inf { 62 n.deadlineNotifier.Watch(node.ID, deadline) 63 } else { 64 // There is an infinite deadline so it shouldn't be tracked for 65 // deadlining 66 n.deadlineNotifier.Remove(node.ID) 67 } 68 69 // TODO Test this 70 // Register interest in the draining jobs. 71 jobs, err := draining.DrainingJobs() 72 if err != nil { 73 n.logger.Printf("[ERR] nomad.drain: error retrieving draining jobs on node %q: %v", node.ID, err) 74 return 75 } 76 n.logger.Printf("[TRACE] nomad.drain: node %q has %d draining jobs on it", node.ID, len(jobs)) 77 n.jobWatcher.RegisterJobs(jobs) 78 79 // TODO Test at this layer as well that a node drain on a node without 80 // allocs immediately gets unmarked as draining 81 // Check if the node is done such that if an operator drains a node with 82 // nothing on it we unset drain 83 done, err := draining.IsDone() 84 if err != nil { 85 n.logger.Printf("[ERR] nomad.drain: failed to check if node %q is done draining: %v", node.ID, err) 86 return 87 } 88 89 if done { 90 // Node is done draining. Stop remaining system allocs before 91 // marking node as complete. 92 remaining, err := draining.RemainingAllocs() 93 if err != nil { 94 n.logger.Printf("[ERR] nomad.drain: error getting remaining allocs on drained node %q: %v", 95 node.ID, err) 96 } else if len(remaining) > 0 { 97 future := structs.NewBatchFuture() 98 n.drainAllocs(future, remaining) 99 if err := future.Wait(); err != nil { 100 n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done node %q: %v", 101 len(remaining), node.ID, err) 102 } 103 } 104 105 index, err := n.raft.NodesDrainComplete([]string{node.ID}) 106 if err != nil { 107 n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", node.ID, err) 108 } else { 109 n.logger.Printf("[INFO] nomad.drain: node %q completed draining at index %d", node.ID, index) 110 } 111 } 112 } 113 114 // nodeDrainWatcher is used to watch nodes that are entering, leaving or 115 // changing their drain strategy. 116 type nodeDrainWatcher struct { 117 ctx context.Context 118 logger *log.Logger 119 120 // state is the state that is watched for state changes. 121 state *state.StateStore 122 123 // limiter is used to limit the rate of blocking queries 124 limiter *rate.Limiter 125 126 // tracker is the object that is tracking the nodes and provides us with the 127 // needed callbacks 128 tracker NodeTracker 129 } 130 131 // NewNodeDrainWatcher returns a new node drain watcher. 132 func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) *nodeDrainWatcher { 133 w := &nodeDrainWatcher{ 134 ctx: ctx, 135 limiter: limiter, 136 logger: logger, 137 tracker: tracker, 138 state: state, 139 } 140 141 go w.watch() 142 return w 143 } 144 145 // watch is the long lived watching routine that detects node changes. 146 func (w *nodeDrainWatcher) watch() { 147 nindex := uint64(1) 148 for { 149 w.logger.Printf("[TRACE] nomad.drain.node_watcher: getting nodes at index %d", nindex) 150 nodes, index, err := w.getNodes(nindex) 151 w.logger.Printf("[TRACE] nomad.drain.node_watcher: got nodes %d at index %d: %v", len(nodes), nindex, err) 152 if err != nil { 153 if err == context.Canceled { 154 w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") 155 return 156 } 157 158 w.logger.Printf("[ERR] nomad.drain.node_watcher: error watching node updates at index %d: %v", nindex, err) 159 select { 160 case <-w.ctx.Done(): 161 w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") 162 return 163 case <-time.After(stateReadErrorDelay): 164 continue 165 } 166 } 167 168 // update index for next run 169 nindex = index 170 171 tracked := w.tracker.TrackedNodes() 172 for nodeID, node := range nodes { 173 newDraining := node.DrainStrategy != nil 174 currentNode, tracked := tracked[nodeID] 175 176 switch { 177 // If the node is tracked but not draining, untrack 178 case tracked && !newDraining: 179 w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer draining", nodeID) 180 w.tracker.Remove(nodeID) 181 182 // If the node is not being tracked but is draining, track 183 case !tracked && newDraining: 184 w.logger.Printf("[TRACE] nomad.drain.node_watcher: untracked node %q is draining", nodeID) 185 w.tracker.Update(node) 186 187 // If the node is being tracked but has changed, update: 188 case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy): 189 w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q has updated drain", nodeID) 190 w.tracker.Update(node) 191 default: 192 w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", nodeID, node.ModifyIndex, tracked, newDraining) 193 } 194 195 // TODO(schmichael) handle the case of a lost node 196 } 197 198 for nodeID := range tracked { 199 if _, ok := nodes[nodeID]; !ok { 200 w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer exists", nodeID) 201 w.tracker.Remove(nodeID) 202 } 203 } 204 } 205 } 206 207 // getNodes returns all nodes blocking until the nodes are after the given index. 208 func (w *nodeDrainWatcher) getNodes(minIndex uint64) (map[string]*structs.Node, uint64, error) { 209 if err := w.limiter.Wait(w.ctx); err != nil { 210 return nil, 0, err 211 } 212 213 resp, index, err := w.state.BlockingQuery(w.getNodesImpl, minIndex, w.ctx) 214 if err != nil { 215 return nil, 0, err 216 } 217 218 return resp.(map[string]*structs.Node), index, nil 219 } 220 221 // getNodesImpl is used to get nodes from the state store, returning the set of 222 // nodes and the given index. 223 func (w *nodeDrainWatcher) getNodesImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { 224 iter, err := state.Nodes(ws) 225 if err != nil { 226 return nil, 0, err 227 } 228 229 index, err := state.Index("nodes") 230 if err != nil { 231 return nil, 0, err 232 } 233 234 resp := make(map[string]*structs.Node, 64) 235 for { 236 raw := iter.Next() 237 if raw == nil { 238 break 239 } 240 241 node := raw.(*structs.Node) 242 resp[node.ID] = node 243 } 244 245 return resp, index, nil 246 }