github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/drainer/watch_nodes.go (about)

     1  package drainer
     2  
     3  import (
     4  	"context"
     5  	"log"
     6  	"time"
     7  
     8  	memdb "github.com/hashicorp/go-memdb"
     9  	"github.com/hashicorp/nomad/nomad/state"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  	"golang.org/x/time/rate"
    12  )
    13  
// DrainingNodeWatcher is the interface for watching for draining nodes.
// It currently declares no methods, so any value satisfies it.
type DrainingNodeWatcher interface{}
    16  
    17  // TrackedNodes returns the set of tracked nodes
    18  func (n *NodeDrainer) TrackedNodes() map[string]*structs.Node {
    19  	n.l.RLock()
    20  	defer n.l.RUnlock()
    21  
    22  	t := make(map[string]*structs.Node, len(n.nodes))
    23  	for n, d := range n.nodes {
    24  		t[n] = d.GetNode()
    25  	}
    26  
    27  	return t
    28  }
    29  
// Remove removes the given node from being tracked. It takes the drainer's
// write lock for the duration of the call.
func (n *NodeDrainer) Remove(nodeID string) {
	n.l.Lock()
	defer n.l.Unlock()

	// TODO test the notifier is updated
	// Remove it from being tracked and remove it from the deadline notifier
	// so no deadline fires for a node that is no longer tracked.
	delete(n.nodes, nodeID)
	n.deadlineNotifier.Remove(nodeID)
}
    40  
    41  // Update updates the node, either updating the tracked version or starting to
    42  // track the node.
    43  func (n *NodeDrainer) Update(node *structs.Node) {
    44  	n.l.Lock()
    45  	defer n.l.Unlock()
    46  
    47  	if node == nil {
    48  		return
    49  	}
    50  
    51  	draining, ok := n.nodes[node.ID]
    52  	if !ok {
    53  		draining = NewDrainingNode(node, n.state)
    54  		n.nodes[node.ID] = draining
    55  	} else {
    56  		// Update it
    57  		draining.Update(node)
    58  	}
    59  
    60  	// TODO test the notifier is updated
    61  	if inf, deadline := node.DrainStrategy.DeadlineTime(); !inf {
    62  		n.deadlineNotifier.Watch(node.ID, deadline)
    63  	} else {
    64  		// There is an infinite deadline so it shouldn't be tracked for
    65  		// deadlining
    66  		n.deadlineNotifier.Remove(node.ID)
    67  	}
    68  
    69  	// TODO Test this
    70  	// Register interest in the draining jobs.
    71  	jobs, err := draining.DrainingJobs()
    72  	if err != nil {
    73  		n.logger.Printf("[ERR] nomad.drain: error retrieving draining jobs on node %q: %v", node.ID, err)
    74  		return
    75  	}
    76  	n.logger.Printf("[TRACE] nomad.drain: node %q has %d draining jobs on it", node.ID, len(jobs))
    77  	n.jobWatcher.RegisterJobs(jobs)
    78  
    79  	// TODO Test at this layer as well that a node drain on a node without
    80  	// allocs immediately gets unmarked as draining
    81  	// Check if the node is done such that if an operator drains a node with
    82  	// nothing on it we unset drain
    83  	done, err := draining.IsDone()
    84  	if err != nil {
    85  		n.logger.Printf("[ERR] nomad.drain: failed to check if node %q is done draining: %v", node.ID, err)
    86  		return
    87  	}
    88  
    89  	if done {
    90  		// Node is done draining. Stop remaining system allocs before
    91  		// marking node as complete.
    92  		remaining, err := draining.RemainingAllocs()
    93  		if err != nil {
    94  			n.logger.Printf("[ERR] nomad.drain: error getting remaining allocs on drained node %q: %v",
    95  				node.ID, err)
    96  		} else if len(remaining) > 0 {
    97  			future := structs.NewBatchFuture()
    98  			n.drainAllocs(future, remaining)
    99  			if err := future.Wait(); err != nil {
   100  				n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done node %q: %v",
   101  					len(remaining), node.ID, err)
   102  			}
   103  		}
   104  
   105  		// Create the node event
   106  		event := structs.NewNodeEvent().
   107  			SetSubsystem(structs.NodeEventSubsystemDrain).
   108  			SetMessage(NodeDrainEventComplete)
   109  
   110  		index, err := n.raft.NodesDrainComplete([]string{node.ID}, event)
   111  		if err != nil {
   112  			n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", node.ID, err)
   113  		} else {
   114  			n.logger.Printf("[INFO] nomad.drain: node %q completed draining at index %d", node.ID, index)
   115  		}
   116  	}
   117  }
   118  
// nodeDrainWatcher is used to watch nodes that are entering, leaving or
// changing their drain strategy.
type nodeDrainWatcher struct {
	// ctx is checked for cancellation by the watch loop and is passed to the
	// rate limiter and to the blocking state query.
	ctx context.Context
	// logger receives the watcher's trace and error messages.
	logger *log.Logger

	// state is the state that is watched for state changes.
	state *state.StateStore

	// limiter is used to limit the rate of blocking queries
	limiter *rate.Limiter

	// tracker is the object that is tracking the nodes and provides us with the
	// needed callbacks
	tracker NodeTracker
}
   135  
   136  // NewNodeDrainWatcher returns a new node drain watcher.
   137  func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) *nodeDrainWatcher {
   138  	w := &nodeDrainWatcher{
   139  		ctx:     ctx,
   140  		limiter: limiter,
   141  		logger:  logger,
   142  		tracker: tracker,
   143  		state:   state,
   144  	}
   145  
   146  	go w.watch()
   147  	return w
   148  }
   149  
   150  // watch is the long lived watching routine that detects node changes.
   151  func (w *nodeDrainWatcher) watch() {
   152  	nindex := uint64(1)
   153  	for {
   154  		w.logger.Printf("[TRACE] nomad.drain.node_watcher: getting nodes at index %d", nindex)
   155  		nodes, index, err := w.getNodes(nindex)
   156  		w.logger.Printf("[TRACE] nomad.drain.node_watcher: got nodes %d at index %d: %v", len(nodes), nindex, err)
   157  		if err != nil {
   158  			if err == context.Canceled {
   159  				w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down")
   160  				return
   161  			}
   162  
   163  			w.logger.Printf("[ERR] nomad.drain.node_watcher: error watching node updates at index %d: %v", nindex, err)
   164  			select {
   165  			case <-w.ctx.Done():
   166  				w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down")
   167  				return
   168  			case <-time.After(stateReadErrorDelay):
   169  				continue
   170  			}
   171  		}
   172  
   173  		// update index for next run
   174  		nindex = index
   175  
   176  		tracked := w.tracker.TrackedNodes()
   177  		for nodeID, node := range nodes {
   178  			newDraining := node.DrainStrategy != nil
   179  			currentNode, tracked := tracked[nodeID]
   180  
   181  			switch {
   182  			// If the node is tracked but not draining, untrack
   183  			case tracked && !newDraining:
   184  				w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer draining", nodeID)
   185  				w.tracker.Remove(nodeID)
   186  
   187  				// If the node is not being tracked but is draining, track
   188  			case !tracked && newDraining:
   189  				w.logger.Printf("[TRACE] nomad.drain.node_watcher: untracked node %q is draining", nodeID)
   190  				w.tracker.Update(node)
   191  
   192  				// If the node is being tracked but has changed, update:
   193  			case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy):
   194  				w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q has updated drain", nodeID)
   195  				w.tracker.Update(node)
   196  			default:
   197  				w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", nodeID, node.ModifyIndex, tracked, newDraining)
   198  			}
   199  
   200  			// TODO(schmichael) handle the case of a lost node
   201  		}
   202  
   203  		for nodeID := range tracked {
   204  			if _, ok := nodes[nodeID]; !ok {
   205  				w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer exists", nodeID)
   206  				w.tracker.Remove(nodeID)
   207  			}
   208  		}
   209  	}
   210  }
   211  
   212  // getNodes returns all nodes blocking until the nodes are after the given index.
   213  func (w *nodeDrainWatcher) getNodes(minIndex uint64) (map[string]*structs.Node, uint64, error) {
   214  	if err := w.limiter.Wait(w.ctx); err != nil {
   215  		return nil, 0, err
   216  	}
   217  
   218  	resp, index, err := w.state.BlockingQuery(w.getNodesImpl, minIndex, w.ctx)
   219  	if err != nil {
   220  		return nil, 0, err
   221  	}
   222  
   223  	return resp.(map[string]*structs.Node), index, nil
   224  }
   225  
   226  // getNodesImpl is used to get nodes from the state store, returning the set of
   227  // nodes and the given index.
   228  func (w *nodeDrainWatcher) getNodesImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
   229  	iter, err := state.Nodes(ws)
   230  	if err != nil {
   231  		return nil, 0, err
   232  	}
   233  
   234  	index, err := state.Index("nodes")
   235  	if err != nil {
   236  		return nil, 0, err
   237  	}
   238  
   239  	resp := make(map[string]*structs.Node, 64)
   240  	for {
   241  		raw := iter.Next()
   242  		if raw == nil {
   243  			break
   244  		}
   245  
   246  		node := raw.(*structs.Node)
   247  		resp[node.ID] = node
   248  	}
   249  
   250  	return resp, index, nil
   251  }