github.com/djenriquez/nomad-1@v0.8.1/nomad/drainer/drainer.go (about)

     1  package drainer
     2  
     3  import (
     4  	"context"
     5  	"log"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/hashicorp/nomad/helper"
    10  	"github.com/hashicorp/nomad/helper/uuid"
    11  	"github.com/hashicorp/nomad/nomad/state"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  	"golang.org/x/time/rate"
    14  )
    15  
    16  var (
    17  	// stateReadErrorDelay is the delay to apply before retrying a state read
    18  	// when there is an error
    19  	stateReadErrorDelay = 1 * time.Second
    20  )
    21  
    22  const (
    23  	// LimitStateQueriesPerSecond is the number of state queries allowed per
    24  	// second
    25  	LimitStateQueriesPerSecond = 100.0
    26  
    27  	// BatchUpdateInterval is how long we wait to batch updates
    28  	BatchUpdateInterval = 1 * time.Second
    29  
    30  	// NodeDeadlineCoalesceWindow is the duration in which deadlining nodes will
    31  	// be coalesced together
    32  	NodeDeadlineCoalesceWindow = 5 * time.Second
    33  )
    34  
    35  // RaftApplier contains methods for applying the raft requests required by the
    36  // NodeDrainer.
    37  type RaftApplier interface {
    38  	AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error)
    39  	NodesDrainComplete(nodes []string) (uint64, error)
    40  }
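
        // Illustrative sketch (hypothetical, not part of the upstream API): any
        // type providing these two methods satisfies RaftApplier, e.g. a no-op
        // stub for tests:
        //
        //	type noopRaft struct{ index uint64 }
        //
        //	func (r *noopRaft) AllocUpdateDesiredTransition(
        //		allocs map[string]*structs.DesiredTransition,
        //		evals []*structs.Evaluation) (uint64, error) {
        //		r.index++
        //		return r.index, nil
        //	}
        //
        //	func (r *noopRaft) NodesDrainComplete(nodes []string) (uint64, error) {
        //		r.index++
        //		return r.index, nil
        //	}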
    41  
    42  // NodeTracker is the interface to notify an object that is tracking draining
    43  // nodes of changes
    44  type NodeTracker interface {
    45  	// TrackedNodes returns all the nodes that are currently tracked as
    46  	// draining.
    47  	TrackedNodes() map[string]*structs.Node
    48  
    49  	// Remove removes a node from the draining set.
    50  	Remove(nodeID string)
    51  
    52  	// Update either updates the specification of a draining node or tracks the
    53  	// node as draining.
    54  	Update(node *structs.Node)
    55  }
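
        // Note: *NodeDrainer itself satisfies NodeTracker (its TrackedNodes,
        // Remove and Update methods are defined elsewhere in this package),
        // which is why flush below passes the drainer as its own tracker:
        //
        //	n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n)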
    56  
    57  // DrainingJobWatcherFactory returns a new DrainingJobWatcher
    58  type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger) DrainingJobWatcher
    59  
    60  // DrainingNodeWatcherFactory returns a new DrainingNodeWatcher
    61  type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher
    62  
    63  // DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier
    64  type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier
    65  
    66  // GetDrainingJobWatcher returns a draining job watcher
    67  func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) DrainingJobWatcher {
    68  	return NewDrainingJobWatcher(ctx, limiter, state, logger)
    69  }
    70  
    71  // GetDeadlineNotifier returns a node deadline notifier with default coalescing.
    72  func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier {
    73  	return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow)
    74  }
    75  
    76  // GetNodeWatcherFactory returns a DrainingNodeWatcherFactory
    77  func GetNodeWatcherFactory() DrainingNodeWatcherFactory {
    78  	return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) DrainingNodeWatcher {
    79  		return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker)
    80  	}
    81  }
    82  
    83  // allocMigrateBatcher is used to batch allocation updates.
    84  type allocMigrateBatcher struct {
    85  	// updates holds allocations with pending desired transition updates
    86  	updates []*structs.Allocation
    87  
    88  	// updateFuture is used to wait for the pending batch update
    89  	// to complete. This may be nil if no batch is pending.
    90  	updateFuture *structs.BatchFuture
    91  
    92  	// updateTimer is the timer that will trigger the next batch
    93  	// update, and may be nil if there is no batch pending.
    94  	updateTimer *time.Timer
    95  
    96  	batchWindow time.Duration
    97  
    98  	// synchronizes access to the updates list, the future and the timer.
    99  	sync.Mutex
   100  }
   101  
   102  // NodeDrainerConfig is used to configure a new node drainer.
   103  type NodeDrainerConfig struct {
   104  	Logger               *log.Logger
   105  	Raft                 RaftApplier
   106  	JobFactory           DrainingJobWatcherFactory
   107  	NodeFactory          DrainingNodeWatcherFactory
   108  	DrainDeadlineFactory DrainDeadlineNotifierFactory
   109  
   110  	// StateQueriesPerSecond configures the query limit against the state store
   111  	// that is allowed by the node drainer.
   112  	StateQueriesPerSecond float64
   113  
   114  	// BatchUpdateInterval is the interval in which allocation updates are
   115  	// batched.
   116  	BatchUpdateInterval time.Duration
   117  }
   118  
   119  // NodeDrainer is used to orchestrate migrating allocations off of draining
   120  // nodes.
   121  type NodeDrainer struct {
   122  	enabled bool
   123  	logger  *log.Logger
   124  
   125  	// nodes is the set of draining nodes
   126  	nodes map[string]*drainingNode
   127  
   128  	// nodeWatcher watches for nodes to transition in and out of drain state.
   129  	nodeWatcher DrainingNodeWatcher
   130  	nodeFactory DrainingNodeWatcherFactory
   131  
   132  	// jobWatcher watches draining jobs and emits desired drains and notifies
   133  	// when migrations take place.
   134  	jobWatcher DrainingJobWatcher
   135  	jobFactory DrainingJobWatcherFactory
   136  
   137  	// deadlineNotifier notifies when nodes reach their drain deadline.
   138  	deadlineNotifier        DrainDeadlineNotifier
   139  	deadlineNotifierFactory DrainDeadlineNotifierFactory
   140  
   141  	// state is the state store that is watched for changes.
   142  	state *state.StateStore
   143  
   144  	// queryLimiter is used to limit the rate of blocking queries
   145  	queryLimiter *rate.Limiter
   146  
   147  	// raft is a shim around the raft messages necessary for draining
   148  	raft RaftApplier
   149  
   150  	// batcher is used to batch alloc migrations.
   151  	batcher allocMigrateBatcher
   152  
   153  	// ctx and exitFn are used to cancel the drainer's watchers and run loop
   154  	ctx    context.Context
   155  	exitFn context.CancelFunc
   156  
   157  	l sync.RWMutex
   158  }
   159  
   160  // NewNodeDrainer returns a new node drainer. The node drainer is
   161  // responsible for marking allocations on draining nodes with a desired
   162  // migration transition, updating the drain strategy on nodes when they are
   163  // complete and creating evaluations for the system to react to these changes.
   164  func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer {
   165  	return &NodeDrainer{
   166  		raft:                    c.Raft,
   167  		logger:                  c.Logger,
   168  		jobFactory:              c.JobFactory,
   169  		nodeFactory:             c.NodeFactory,
   170  		deadlineNotifierFactory: c.DrainDeadlineFactory,
   171  		queryLimiter:            rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100),
   172  		batcher: allocMigrateBatcher{
   173  			batchWindow: c.BatchUpdateInterval,
   174  		},
   175  	}
   176  }
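
        // Wiring sketch (hedged; the caller-side code lives outside this file):
        // a caller such as the Nomad server would typically construct the drainer
        // from the helpers and defaults defined above, roughly:
        //
        //	nd := NewNodeDrainer(&NodeDrainerConfig{
        //		Logger:                logger,   // *log.Logger (placeholder name)
        //		Raft:                  raftShim, // implements RaftApplier (placeholder name)
        //		JobFactory:            GetDrainingJobWatcher,
        //		NodeFactory:           GetNodeWatcherFactory(),
        //		DrainDeadlineFactory:  GetDeadlineNotifier,
        //		StateQueriesPerSecond: LimitStateQueriesPerSecond,
        //		BatchUpdateInterval:   BatchUpdateInterval,
        //	})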
   177  
   178  // SetEnabled will start or stop the node draining goroutine depending on the
   179  // enabled boolean.
   180  func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) {
   181  	n.l.Lock()
   182  	defer n.l.Unlock()
   183  
   184  	// If we are starting now or have a new state, init state and start the
   185  	// run loop
   186  	n.enabled = enabled
   187  	if enabled {
   188  		n.flush(state)
   189  		go n.run(n.ctx)
   190  	} else if !enabled && n.exitFn != nil {
   191  		n.exitFn()
   192  	}
   193  }
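
        // Usage sketch (hedged): the drainer is typically enabled with a state
        // store when this server gains leadership and disabled on loss, e.g.:
        //
        //	nd.SetEnabled(true, stateStore) // init state and start the run loop
        //	nd.SetEnabled(false, nil)       // cancel via exitFn; no flush occurs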
   194  
   195  // flush is used to clear the state of the watcher
   196  func (n *NodeDrainer) flush(state *state.StateStore) {
   197  	// Cancel anything that may be running.
   198  	if n.exitFn != nil {
   199  		n.exitFn()
   200  	}
   201  
   202  	// Store the new state
   203  	if state != nil {
   204  		n.state = state
   205  	}
   206  
   207  	n.ctx, n.exitFn = context.WithCancel(context.Background())
   208  	n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger)
   209  	n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n)
   210  	n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx)
   211  	n.nodes = make(map[string]*drainingNode, 32)
   212  }
   213  
   214  // run is a long-lived event handler that receives changes from the relevant
   215  // watchers and takes action based on them.
   216  func (n *NodeDrainer) run(ctx context.Context) {
   217  	for {
   218  		select {
   219  		case <-n.ctx.Done():
   220  			return
   221  		case nodes := <-n.deadlineNotifier.NextBatch():
   222  			n.handleDeadlinedNodes(nodes)
   223  		case req := <-n.jobWatcher.Drain():
   224  			n.handleJobAllocDrain(req)
   225  		case allocs := <-n.jobWatcher.Migrated():
   226  			n.handleMigratedAllocs(allocs)
   227  		}
   228  	}
   229  }
   230  
   231  // handleDeadlinedNodes handles a set of nodes reaching their drain deadline.
   232  // The handler detects the remaining allocations on the nodes and immediately
   233  // marks them for migration.
   234  func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) {
   235  	// Retrieve the set of allocations that will be force stopped.
   236  	var forceStop []*structs.Allocation
   237  	n.l.RLock()
   238  	for _, node := range nodes {
   239  		draining, ok := n.nodes[node]
   240  		if !ok {
   241  			n.logger.Printf("[DEBUG] nomad.drain: skipping untracked deadlined node %q", node)
   242  			continue
   243  		}
   244  
   245  		allocs, err := draining.RemainingAllocs()
   246  		if err != nil {
   247  			n.logger.Printf("[ERR] nomad.drain: failed to retrieve allocs on deadlined node %q: %v", node, err)
   248  			continue
   249  		}
   250  
   251  		n.logger.Printf("[DEBUG] nomad.drain: node %q deadlined causing %d allocs to be force stopped", node, len(allocs))
   252  		forceStop = append(forceStop, allocs...)
   253  	}
   254  	n.l.RUnlock()
   255  	n.batchDrainAllocs(forceStop)
   256  
   257  	// Submit the node transitions in a sharded form to ensure a reasonable
   258  	// Raft transaction size.
   259  	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, nodes) {
   260  		if _, err := n.raft.NodesDrainComplete(nodes); err != nil {
   261  			n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err)
   262  		}
   263  	}
   264  }
   265  
   266  // handleJobAllocDrain handles marking a set of allocations as having a desired
   267  // transition to drain. The handler blocks until the changes to the allocations
   268  // have occurred.
   269  func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) {
   270  	index, err := n.batchDrainAllocs(req.Allocs)
   271  	req.Resp.Respond(index, err)
   272  }
   273  
   274  // handleMigratedAllocs checks to see if any nodes can be considered done
   275  // draining based on the set of allocations that have migrated because of an
   276  // ongoing drain for a job.
   277  func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
   278  	// Determine the set of nodes that were affected
   279  	nodes := make(map[string]struct{})
   280  	for _, alloc := range allocs {
   281  		nodes[alloc.NodeID] = struct{}{}
   282  	}
   283  
   284  	var done []string
   285  	var remainingAllocs []*structs.Allocation
   286  
   287  	// For each node, check if it is now done
   288  	n.l.RLock()
   289  	for node := range nodes {
   290  		draining, ok := n.nodes[node]
   291  		if !ok {
   292  			continue
   293  		}
   294  
   295  		isDone, err := draining.IsDone()
   296  		if err != nil {
   297  			n.logger.Printf("[ERR] nomad.drain: error checking if node %q is done draining: %v", node, err)
   298  			continue
   299  		}
   300  
   301  		if !isDone {
   302  			continue
   303  		}
   304  
   305  		done = append(done, node)
   306  
   307  		remaining, err := draining.RemainingAllocs()
   308  		if err != nil {
   309  			n.logger.Printf("[ERR] nomad.drain: node %q is done draining but encountered an error getting remaining allocs: %v", node, err)
   310  			continue
   311  		}
   312  
   313  		remainingAllocs = append(remainingAllocs, remaining...)
   314  	}
   315  	n.l.RUnlock()
   316  
   317  	// Stop any running system jobs on otherwise done nodes
   318  	if len(remainingAllocs) > 0 {
   319  		future := structs.NewBatchFuture()
   320  		n.drainAllocs(future, remainingAllocs)
   321  		if err := future.Wait(); err != nil {
   322  			n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done nodes: %v",
   323  				len(remainingAllocs), err)
   324  		}
   325  	}
   326  
   327  	// Submit the node transitions in a sharded form to ensure a reasonable
   328  	// Raft transaction size.
   329  	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, done) {
   330  		if _, err := n.raft.NodesDrainComplete(nodes); err != nil {
   331  			n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err)
   332  		}
   333  	}
   334  }
   335  
   336  // batchDrainAllocs is used to batch the draining of allocations. It will block
   337  // until the batch is complete.
   338  func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) {
   339  	// Add this to the batch
   340  	n.batcher.Lock()
   341  	n.batcher.updates = append(n.batcher.updates, allocs...)
   342  
   343  	// Start a new batch if none
   344  	future := n.batcher.updateFuture
   345  	if future == nil {
   346  		future = structs.NewBatchFuture()
   347  		n.batcher.updateFuture = future
   348  		n.batcher.updateTimer = time.AfterFunc(n.batcher.batchWindow, func() {
   349  			// Get the pending updates
   350  			n.batcher.Lock()
   351  			updates := n.batcher.updates
   352  			future := n.batcher.updateFuture
   353  			n.batcher.updates = nil
   354  			n.batcher.updateFuture = nil
   355  			n.batcher.updateTimer = nil
   356  			n.batcher.Unlock()
   357  
   358  			// Perform the batch update
   359  			n.drainAllocs(future, updates)
   360  		})
   361  	}
   362  	n.batcher.Unlock()
   363  
   364  	if err := future.Wait(); err != nil {
   365  		return 0, err
   366  	}
   367  
   368  	return future.Index(), nil
   369  }
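
        // Batching note (informal sketch; allocsA/allocsB are placeholders):
        // calls that arrive within the same batchWindow append to one pending
        // batch and share one BatchFuture, so they block together and, on
        // success, observe the same Raft index:
        //
        //	var wg sync.WaitGroup
        //	var idxA, idxB uint64
        //	wg.Add(2)
        //	go func() { defer wg.Done(); idxA, _ = n.batchDrainAllocs(allocsA) }()
        //	go func() { defer wg.Done(); idxB, _ = n.batchDrainAllocs(allocsB) }()
        //	wg.Wait()
        //	// If both calls landed in the same window, idxA == idxB.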
   370  
   371  // drainAllocs is a non-batched method that marks the desired transition to
   372  // migrate for the given set of allocations. It also creates the necessary
   373  // evaluations for the affected jobs.
   374  func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) {
   375  	// Compute the affected jobs and make the transition map
   376  	jobs := make(map[string]*structs.Allocation, 4)
   377  	transitions := make(map[string]*structs.DesiredTransition, len(allocs))
   378  	for _, alloc := range allocs {
   379  		transitions[alloc.ID] = &structs.DesiredTransition{
   380  			Migrate: helper.BoolToPtr(true),
   381  		}
   382  		jobs[alloc.JobID] = alloc
   383  	}
   384  
   385  	evals := make([]*structs.Evaluation, 0, len(jobs))
   386  	for job, alloc := range jobs {
   387  		evals = append(evals, &structs.Evaluation{
   388  			ID:          uuid.Generate(),
   389  			Namespace:   alloc.Namespace,
   390  			Priority:    alloc.Job.Priority,
   391  			Type:        alloc.Job.Type,
   392  			TriggeredBy: structs.EvalTriggerNodeDrain,
   393  			JobID:       job,
   394  			Status:      structs.EvalStatusPending,
   395  		})
   396  	}
   397  
   398  	// Commit this update via Raft
   399  	var finalIndex uint64
   400  	for _, u := range partitionAllocDrain(defaultMaxIdsPerTxn, transitions, evals) {
   401  		index, err := n.raft.AllocUpdateDesiredTransition(u.Transitions, u.Evals)
   402  		if err != nil {
   403  			future.Respond(0, err)
   404  			return
   405  		}
   406  		finalIndex = index
   407  	}
   408  
   409  	future.Respond(finalIndex, nil)
   410  }