github.com/quite/nomad@v0.8.6/nomad/drainer/drainer.go

package drainer

import (
	"context"
	"log"
	"sync"
	"time"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/time/rate"
)

var (
	// stateReadErrorDelay is the delay to apply before retrying a state read
	// when there is an error.
	stateReadErrorDelay = 1 * time.Second
)

const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second.
	LimitStateQueriesPerSecond = 100.0

	// BatchUpdateInterval is how long we wait to batch updates.
	BatchUpdateInterval = 1 * time.Second

	// NodeDeadlineCoalesceWindow is the duration in which deadlining nodes will
	// be coalesced together.
	NodeDeadlineCoalesceWindow = 5 * time.Second

	// NodeDrainEventComplete is used to indicate that the node drain is
	// finished.
	NodeDrainEventComplete = "Node drain complete"

	// NodeDrainEventDetailDeadlined is the detail key used when the drain is
	// complete because a deadline was reached. The acceptable values are
	// "true" and "false".
	NodeDrainEventDetailDeadlined = "deadline_reached"
)

// RaftApplier contains methods for applying the raft requests required by the
// NodeDrainer.
type RaftApplier interface {
	AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error)
	NodesDrainComplete(nodes []string, event *structs.NodeEvent) (uint64, error)
}

// NodeTracker is the interface used to notify an object that tracks draining
// nodes of changes.
type NodeTracker interface {
	// TrackedNodes returns all the nodes that are currently tracked as
	// draining.
	TrackedNodes() map[string]*structs.Node

	// Remove removes a node from the draining set.
	Remove(nodeID string)

	// Update either updates the specification of a draining node or tracks the
	// node as draining.
	Update(node *structs.Node)
}

// DrainingJobWatcherFactory returns a new DrainingJobWatcher
type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger) DrainingJobWatcher

// DrainingNodeWatcherFactory returns a new DrainingNodeWatcher
type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher

// DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier
type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier

// GetDrainingJobWatcher returns a draining job watcher
func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) DrainingJobWatcher {
	return NewDrainingJobWatcher(ctx, limiter, state, logger)
}

// GetDeadlineNotifier returns a node deadline notifier with default coalescing.
func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier {
	return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow)
}

// GetNodeWatcherFactory returns a DrainingNodeWatcherFactory
func GetNodeWatcherFactory() DrainingNodeWatcherFactory {
	return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) DrainingNodeWatcher {
		return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker)
	}
}

// allocMigrateBatcher is used to batch allocation updates.
type allocMigrateBatcher struct {
	// updates holds allocations with pending desired transition (migrate)
	// updates.
	updates []*structs.Allocation

	// updateFuture is used to wait for the pending batch update
	// to complete. This may be nil if no batch is pending.
	updateFuture *structs.BatchFuture

	// updateTimer is the timer that will trigger the next batch
	// update, and may be nil if there is no batch pending.
	updateTimer *time.Timer

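	// batchWindow is how long the batcher waits before flushing the pending
	// updates in a single Raft transaction.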
	batchWindow time.Duration

	// synchronizes access to the updates list, the future and the timer.
	sync.Mutex
}

// NodeDrainerConfig is used to configure a new node drainer.
type NodeDrainerConfig struct {
	Logger               *log.Logger
	Raft                 RaftApplier
	JobFactory           DrainingJobWatcherFactory
	NodeFactory          DrainingNodeWatcherFactory
	DrainDeadlineFactory DrainDeadlineNotifierFactory

	// StateQueriesPerSecond configures the rate limit on queries the node
	// drainer may make against the state store.
	StateQueriesPerSecond float64

	// BatchUpdateInterval is the interval over which allocation updates are
	// batched.
	BatchUpdateInterval time.Duration
}

// NodeDrainer is used to orchestrate migrating allocations off of draining
// nodes.
type NodeDrainer struct {
	enabled bool
	logger  *log.Logger

	// nodes is the set of draining nodes
	nodes map[string]*drainingNode

	// nodeWatcher watches for nodes to transition in and out of drain state.
	nodeWatcher DrainingNodeWatcher
	nodeFactory DrainingNodeWatcherFactory

	// jobWatcher watches draining jobs, emitting the desired drains and
	// notifying when migrations take place.
	jobWatcher DrainingJobWatcher
	jobFactory DrainingJobWatcherFactory

	// deadlineNotifier notifies when nodes reach their drain deadline.
	deadlineNotifier        DrainDeadlineNotifier
	deadlineNotifierFactory DrainDeadlineNotifierFactory

	// state is the state store that is watched for changes.
	state *state.StateStore

	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// raft is a shim around the raft messages necessary for draining
	raft RaftApplier

	// batcher is used to batch alloc migrations.
	batcher allocMigrateBatcher

	// ctx and exitFn are used to cancel the run loop and watchers
	ctx    context.Context
	exitFn context.CancelFunc

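	// l synchronizes access to the drainer's mutable state, most notably
	// enabled, nodes, and the watchers reset by flush.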
	l sync.RWMutex
}

// NewNodeDrainer returns a new node drainer. The node drainer is responsible
// for marking allocations on draining nodes with a desired migration
// transition, clearing the drain strategy on nodes once they are done
// draining, and creating evaluations so the system reacts to these changes.
func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer {
	return &NodeDrainer{
		raft:                    c.Raft,
		logger:                  c.Logger,
		jobFactory:              c.JobFactory,
		nodeFactory:             c.NodeFactory,
		deadlineNotifierFactory: c.DrainDeadlineFactory,
		queryLimiter:            rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100),
		batcher: allocMigrateBatcher{
			batchWindow: c.BatchUpdateInterval,
		},
	}
}

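// The sketch below shows how a drainer might be wired up using the factory
// helpers in this package. It is illustrative only: applier, store, and
// logger are assumed placeholders for a RaftApplier implementation, a
// *state.StateStore, and a *log.Logger supplied by the caller.
//
//	drainer := NewNodeDrainer(&NodeDrainerConfig{
//		Logger:                logger,
//		Raft:                  applier,
//		JobFactory:            GetDrainingJobWatcher,
//		NodeFactory:           GetNodeWatcherFactory(),
//		DrainDeadlineFactory:  GetDeadlineNotifier,
//		StateQueriesPerSecond: LimitStateQueriesPerSecond,
//		BatchUpdateInterval:   BatchUpdateInterval,
//	})
//	drainer.SetEnabled(true, store)
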
// SetEnabled will start or stop the node draining goroutine depending on the
// enabled boolean.
func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) {
	n.l.Lock()
	defer n.l.Unlock()

	// If enabling, reset the internal state and start the run loop; if
	// disabling, cancel any running loop.
	n.enabled = enabled
	if enabled {
		n.flush(state)
		go n.run(n.ctx)
	} else if !enabled && n.exitFn != nil {
		n.exitFn()
	}
}

// flush resets the drainer's internal state, canceling any existing watchers
// and recreating them against the current state store.
func (n *NodeDrainer) flush(state *state.StateStore) {
	// Cancel anything that may be running.
	if n.exitFn != nil {
		n.exitFn()
	}

	// Store the new state
	if state != nil {
		n.state = state
	}

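	// Create a fresh context and rebuild the watchers and deadline notifier
	// against the (possibly updated) state store.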
	n.ctx, n.exitFn = context.WithCancel(context.Background())
	n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger)
	n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n)
	n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx)
	n.nodes = make(map[string]*drainingNode, 32)
}

// run is a long lived event handler that receives changes from the relevant
// watchers and takes action based on them.
func (n *NodeDrainer) run(ctx context.Context) {
	for {
		select {
		case <-n.ctx.Done():
			return
		case nodes := <-n.deadlineNotifier.NextBatch():
			n.handleDeadlinedNodes(nodes)
		case req := <-n.jobWatcher.Drain():
			n.handleJobAllocDrain(req)
		case allocs := <-n.jobWatcher.Migrated():
			n.handleMigratedAllocs(allocs)
		}
	}
}

// handleDeadlinedNodes handles a set of nodes reaching their drain deadline.
// The handler detects the remaining allocations on the nodes and immediately
// marks them for migration.
func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) {
	// Retrieve the set of allocations that will be force stopped.
	var forceStop []*structs.Allocation
	n.l.RLock()
	for _, node := range nodes {
		draining, ok := n.nodes[node]
		if !ok {
			n.logger.Printf("[DEBUG] nomad.drain: skipping untracked deadlined node %q", node)
			continue
		}

		allocs, err := draining.RemainingAllocs()
		if err != nil {
			n.logger.Printf("[ERR] nomad.drain: failed to retrieve allocs on deadlined node %q: %v", node, err)
			continue
		}

		n.logger.Printf("[DEBUG] nomad.drain: node %q deadlined causing %d allocs to be force stopped", node, len(allocs))
		forceStop = append(forceStop, allocs...)
	}
	n.l.RUnlock()
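	// Mark the remaining allocations for migration in a single batch.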
	n.batchDrainAllocs(forceStop)

	// Create the node event
	event := structs.NewNodeEvent().
		SetSubsystem(structs.NodeEventSubsystemDrain).
		SetMessage(NodeDrainEventComplete).
		AddDetail(NodeDrainEventDetailDeadlined, "true")

	// Submit the node transitions in a sharded form to ensure a reasonable
	// Raft transaction size.
	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, nodes) {
		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
			n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err)
		}
	}
}

// handleJobAllocDrain handles marking a set of allocations as having a desired
// transition to drain. The handler blocks until the changes to the allocations
// have been applied.
func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) {
	index, err := n.batchDrainAllocs(req.Allocs)
	req.Resp.Respond(index, err)
}

// handleMigratedAllocs checks to see if any nodes can be considered done
// draining based on the set of allocations that have migrated because of an
// ongoing drain for a job.
func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
	// Determine the set of nodes that were affected
	nodes := make(map[string]struct{})
	for _, alloc := range allocs {
		nodes[alloc.NodeID] = struct{}{}
	}

	var done []string
	var remainingAllocs []*structs.Allocation

	// For each node, check if it is now done
	n.l.RLock()
	for node := range nodes {
		draining, ok := n.nodes[node]
		if !ok {
			continue
		}

		isDone, err := draining.IsDone()
		if err != nil {
			n.logger.Printf("[ERR] nomad.drain: error checking if node %q is done draining: %v", node, err)
			continue
		}

		if !isDone {
			continue
		}

		done = append(done, node)

		remaining, err := draining.RemainingAllocs()
		if err != nil {
			n.logger.Printf("[ERR] nomad.drain: node %q is done draining but encountered an error getting remaining allocs: %v", node, err)
			continue
		}

		remainingAllocs = append(remainingAllocs, remaining...)
	}
	n.l.RUnlock()

	// Stop any running system jobs on otherwise done nodes
	if len(remainingAllocs) > 0 {
		future := structs.NewBatchFuture()
		n.drainAllocs(future, remainingAllocs)
		if err := future.Wait(); err != nil {
			n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done nodes: %v",
				len(remainingAllocs), err)
		}
	}

	// Create the node event
	event := structs.NewNodeEvent().
		SetSubsystem(structs.NodeEventSubsystemDrain).
		SetMessage(NodeDrainEventComplete)

	// Submit the node transitions in a sharded form to ensure a reasonable
	// Raft transaction size.
	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, done) {
		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
			n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err)
		}
	}
}

// batchDrainAllocs is used to batch the draining of allocations. It will block
// until the batch is complete.
func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) {
	// Add this set of allocations to the pending batch
	n.batcher.Lock()
	n.batcher.updates = append(n.batcher.updates, allocs...)

	// Start a new batch if none is pending; callers arriving within the batch
	// window share the same future.
	future := n.batcher.updateFuture
	if future == nil {
		future = structs.NewBatchFuture()
		n.batcher.updateFuture = future
		n.batcher.updateTimer = time.AfterFunc(n.batcher.batchWindow, func() {
			// Get the pending updates
			n.batcher.Lock()
			updates := n.batcher.updates
			future := n.batcher.updateFuture
			n.batcher.updates = nil
			n.batcher.updateFuture = nil
			n.batcher.updateTimer = nil
			n.batcher.Unlock()

			// Perform the batch update
			n.drainAllocs(future, updates)
		})
	}
	n.batcher.Unlock()

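	// Wait for the batch to be committed and return the resulting Raft index.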
	if err := future.Wait(); err != nil {
		return 0, err
	}

	return future.Index(), nil
}

// drainAllocs immediately marks the given allocations with a desired
// transition to migrate, without batching, and creates the necessary
// evaluations for the affected jobs.
func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) {
	// Compute the affected jobs and make the transition map
	jobs := make(map[string]*structs.Allocation, 4)
	transitions := make(map[string]*structs.DesiredTransition, len(allocs))
	for _, alloc := range allocs {
		transitions[alloc.ID] = &structs.DesiredTransition{
			Migrate: helper.BoolToPtr(true),
		}
		jobs[alloc.JobID] = alloc
	}

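	// Create one evaluation per affected job so the scheduler reacts to the
	// migrating allocations.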
	evals := make([]*structs.Evaluation, 0, len(jobs))
	for job, alloc := range jobs {
		evals = append(evals, &structs.Evaluation{
			ID:          uuid.Generate(),
			Namespace:   alloc.Namespace,
			Priority:    alloc.Job.Priority,
			Type:        alloc.Job.Type,
			TriggeredBy: structs.EvalTriggerNodeDrain,
			JobID:       job,
			Status:      structs.EvalStatusPending,
		})
	}

	// Commit this update via Raft
	var finalIndex uint64
	for _, u := range partitionAllocDrain(defaultMaxIdsPerTxn, transitions, evals) {
		index, err := n.raft.AllocUpdateDesiredTransition(u.Transitions, u.Evals)
		if err != nil {
			future.Respond(0, err)
			return
		}
		finalIndex = index
	}

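	// All partitions applied successfully; respond with the index of the last
	// Raft transaction.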
	future.Respond(finalIndex, nil)
}