github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/drainer/watch_nodes.go (about) 1 package drainer 2 3 import ( 4 "context" 5 "log" 6 "time" 7 8 memdb "github.com/hashicorp/go-memdb" 9 "github.com/hashicorp/nomad/nomad/state" 10 "github.com/hashicorp/nomad/nomad/structs" 11 "golang.org/x/time/rate" 12 ) 13 14 // DrainingNodeWatcher is the interface for watching for draining nodes. 15 type DrainingNodeWatcher interface{} 16 17 // TrackedNodes returns the set of tracked nodes 18 func (n *NodeDrainer) TrackedNodes() map[string]*structs.Node { 19 n.l.RLock() 20 defer n.l.RUnlock() 21 22 t := make(map[string]*structs.Node, len(n.nodes)) 23 for n, d := range n.nodes { 24 t[n] = d.GetNode() 25 } 26 27 return t 28 } 29 30 // Remove removes the given node from being tracked 31 func (n *NodeDrainer) Remove(nodeID string) { 32 n.l.Lock() 33 defer n.l.Unlock() 34 35 // TODO test the notifier is updated 36 // Remove it from being tracked and remove it from the dealiner 37 delete(n.nodes, nodeID) 38 n.deadlineNotifier.Remove(nodeID) 39 } 40 41 // Update updates the node, either updating the tracked version or starting to 42 // track the node. 43 func (n *NodeDrainer) Update(node *structs.Node) { 44 n.l.Lock() 45 defer n.l.Unlock() 46 47 if node == nil { 48 return 49 } 50 51 draining, ok := n.nodes[node.ID] 52 if !ok { 53 draining = NewDrainingNode(node, n.state) 54 n.nodes[node.ID] = draining 55 } else { 56 // Update it 57 draining.Update(node) 58 } 59 60 // TODO test the notifier is updated 61 if inf, deadline := node.DrainStrategy.DeadlineTime(); !inf { 62 n.deadlineNotifier.Watch(node.ID, deadline) 63 } else { 64 // There is an infinite deadline so it shouldn't be tracked for 65 // deadlining 66 n.deadlineNotifier.Remove(node.ID) 67 } 68 69 // TODO Test this 70 // Register interest in the draining jobs. 71 jobs, err := draining.DrainingJobs() 72 if err != nil { 73 n.logger.Printf("[ERR] nomad.drain: error retrieving draining jobs on node %q: %v", node.ID, err) 74 return 75 } 76 n.logger.Printf("[TRACE] nomad.drain: node %q has %d draining jobs on it", node.ID, len(jobs)) 77 n.jobWatcher.RegisterJobs(jobs) 78 79 // TODO Test at this layer as well that a node drain on a node without 80 // allocs immediately gets unmarked as draining 81 // Check if the node is done such that if an operator drains a node with 82 // nothing on it we unset drain 83 done, err := draining.IsDone() 84 if err != nil { 85 n.logger.Printf("[ERR] nomad.drain: failed to check if node %q is done draining: %v", node.ID, err) 86 return 87 } 88 89 if done { 90 // Node is done draining. Stop remaining system allocs before 91 // marking node as complete. 92 remaining, err := draining.RemainingAllocs() 93 if err != nil { 94 n.logger.Printf("[ERR] nomad.drain: error getting remaining allocs on drained node %q: %v", 95 node.ID, err) 96 } else if len(remaining) > 0 { 97 future := structs.NewBatchFuture() 98 n.drainAllocs(future, remaining) 99 if err := future.Wait(); err != nil { 100 n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done node %q: %v", 101 len(remaining), node.ID, err) 102 } 103 } 104 105 // Create the node event 106 event := structs.NewNodeEvent(). 107 SetSubsystem(structs.NodeEventSubsystemDrain). 108 SetMessage(NodeDrainEventComplete) 109 110 index, err := n.raft.NodesDrainComplete([]string{node.ID}, event) 111 if err != nil { 112 n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", node.ID, err) 113 } else { 114 n.logger.Printf("[INFO] nomad.drain: node %q completed draining at index %d", node.ID, index) 115 } 116 } 117 } 118 119 // nodeDrainWatcher is used to watch nodes that are entering, leaving or 120 // changing their drain strategy. 121 type nodeDrainWatcher struct { 122 ctx context.Context 123 logger *log.Logger 124 125 // state is the state that is watched for state changes. 126 state *state.StateStore 127 128 // limiter is used to limit the rate of blocking queries 129 limiter *rate.Limiter 130 131 // tracker is the object that is tracking the nodes and provides us with the 132 // needed callbacks 133 tracker NodeTracker 134 } 135 136 // NewNodeDrainWatcher returns a new node drain watcher. 137 func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) *nodeDrainWatcher { 138 w := &nodeDrainWatcher{ 139 ctx: ctx, 140 limiter: limiter, 141 logger: logger, 142 tracker: tracker, 143 state: state, 144 } 145 146 go w.watch() 147 return w 148 } 149 150 // watch is the long lived watching routine that detects node changes. 151 func (w *nodeDrainWatcher) watch() { 152 nindex := uint64(1) 153 for { 154 w.logger.Printf("[TRACE] nomad.drain.node_watcher: getting nodes at index %d", nindex) 155 nodes, index, err := w.getNodes(nindex) 156 w.logger.Printf("[TRACE] nomad.drain.node_watcher: got nodes %d at index %d: %v", len(nodes), nindex, err) 157 if err != nil { 158 if err == context.Canceled { 159 w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") 160 return 161 } 162 163 w.logger.Printf("[ERR] nomad.drain.node_watcher: error watching node updates at index %d: %v", nindex, err) 164 select { 165 case <-w.ctx.Done(): 166 w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") 167 return 168 case <-time.After(stateReadErrorDelay): 169 continue 170 } 171 } 172 173 // update index for next run 174 nindex = index 175 176 tracked := w.tracker.TrackedNodes() 177 for nodeID, node := range nodes { 178 newDraining := node.DrainStrategy != nil 179 currentNode, tracked := tracked[nodeID] 180 181 switch { 182 // If the node is tracked but not draining, untrack 183 case tracked && !newDraining: 184 w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer draining", nodeID) 185 w.tracker.Remove(nodeID) 186 187 // If the node is not being tracked but is draining, track 188 case !tracked && newDraining: 189 w.logger.Printf("[TRACE] nomad.drain.node_watcher: untracked node %q is draining", nodeID) 190 w.tracker.Update(node) 191 192 // If the node is being tracked but has changed, update: 193 case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy): 194 w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q has updated drain", nodeID) 195 w.tracker.Update(node) 196 default: 197 w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", nodeID, node.ModifyIndex, tracked, newDraining) 198 } 199 200 // TODO(schmichael) handle the case of a lost node 201 } 202 203 for nodeID := range tracked { 204 if _, ok := nodes[nodeID]; !ok { 205 w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer exists", nodeID) 206 w.tracker.Remove(nodeID) 207 } 208 } 209 } 210 } 211 212 // getNodes returns all nodes blocking until the nodes are after the given index. 213 func (w *nodeDrainWatcher) getNodes(minIndex uint64) (map[string]*structs.Node, uint64, error) { 214 if err := w.limiter.Wait(w.ctx); err != nil { 215 return nil, 0, err 216 } 217 218 resp, index, err := w.state.BlockingQuery(w.getNodesImpl, minIndex, w.ctx) 219 if err != nil { 220 return nil, 0, err 221 } 222 223 return resp.(map[string]*structs.Node), index, nil 224 } 225 226 // getNodesImpl is used to get nodes from the state store, returning the set of 227 // nodes and the given index. 228 func (w *nodeDrainWatcher) getNodesImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { 229 iter, err := state.Nodes(ws) 230 if err != nil { 231 return nil, 0, err 232 } 233 234 index, err := state.Index("nodes") 235 if err != nil { 236 return nil, 0, err 237 } 238 239 resp := make(map[string]*structs.Node, 64) 240 for { 241 raw := iter.Next() 242 if raw == nil { 243 break 244 } 245 246 node := raw.(*structs.Node) 247 resp[node.ID] = node 248 } 249 250 return resp, index, nil 251 }