github.com/hernad/nomad@v1.6.112/nomad/drainer/drainer.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package drainer
     5  
     6  import (
     7  	"context"
     8  	"sync"
     9  	"time"
    10  
    11  	log "github.com/hashicorp/go-hclog"
    12  
    13  	"github.com/hernad/nomad/helper/pointer"
    14  	"github.com/hernad/nomad/helper/uuid"
    15  	"github.com/hernad/nomad/nomad/state"
    16  	"github.com/hernad/nomad/nomad/structs"
    17  	"golang.org/x/time/rate"
    18  )
    19  
    20  var (
    21  	// stateReadErrorDelay is the delay to apply before retrying a state read
    22  	// when there is an error
    23  	stateReadErrorDelay = 1 * time.Second
    24  )
    25  
    26  const (
    27  	// LimitStateQueriesPerSecond is the number of state queries allowed per
    28  	// second
    29  	LimitStateQueriesPerSecond = 100.0
    30  
    31  	// BatchUpdateInterval is how long we wait to batch updates
    32  	BatchUpdateInterval = 1 * time.Second
    33  
    34  	// NodeDeadlineCoalesceWindow is the duration in which deadlining nodes will
    35  	// be coalesced together
    36  	NodeDeadlineCoalesceWindow = 5 * time.Second
    37  
    38  	// NodeDrainEventComplete is used to indicate that the node drain is
    39  	// finished.
    40  	NodeDrainEventComplete = "Node drain complete"
    41  
    42  	// NodeDrainEventDetailDeadlined is the key to use when the drain is
    43  	// complete because a deadline was reached. The acceptable values are "true" and "false".
    44  	NodeDrainEventDetailDeadlined = "deadline_reached"
    45  )
    46  
    47  // RaftApplier contains methods for applying the raft requests required by the
    48  // NodeDrainer.
    49  type RaftApplier interface {
    50  	AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error)
    51  	NodesDrainComplete(nodes []string, event *structs.NodeEvent) (uint64, error)
    52  }
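
        // A minimal sketch of a RaftApplier stub, e.g. for exercising the drainer in
        // tests. The noopRaftApplier name and its in-memory index are illustrative
        // and not part of this package.
        //
        //	type noopRaftApplier struct {
        //		mu    sync.Mutex
        //		index uint64
        //	}
        //
        //	func (a *noopRaftApplier) AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error) {
        //		a.mu.Lock()
        //		defer a.mu.Unlock()
        //		a.index++ // pretend the update was committed at the next Raft index
        //		return a.index, nil
        //	}
        //
        //	func (a *noopRaftApplier) NodesDrainComplete(nodes []string, event *structs.NodeEvent) (uint64, error) {
        //		a.mu.Lock()
        //		defer a.mu.Unlock()
        //		a.index++ // pretend the drain-complete transition was committed
        //		return a.index, nil
        //	}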
    53  
    54  // NodeTracker is the interface used to notify an object tracking draining
    55  // nodes about changes to those nodes.
    56  type NodeTracker interface {
    57  	// TrackedNodes returns all the nodes that are currently tracked as
    58  	// draining.
    59  	TrackedNodes() map[string]*structs.Node
    60  
    61  	// Remove removes a node from the draining set.
    62  	Remove(nodeID string)
    63  
    64  	// Update either updates the specification of a draining node or tracks the
    65  	// node as draining.
    66  	Update(node *structs.Node)
    67  }
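
        // Within this package the NodeDrainer itself serves as the NodeTracker: flush
        // below hands the drainer to the node watcher factory. For isolated tests, a
        // trivial stand-in could look like the following sketch; the nullTracker name
        // is illustrative only.
        //
        //	type nullTracker struct {
        //		nodes map[string]*structs.Node
        //	}
        //
        //	func (t *nullTracker) TrackedNodes() map[string]*structs.Node { return t.nodes }
        //	func (t *nullTracker) Remove(nodeID string)                   { delete(t.nodes, nodeID) }
        //	func (t *nullTracker) Update(node *structs.Node)              { t.nodes[node.ID] = node }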
    68  
    69  // DrainingJobWatcherFactory returns a new DrainingJobWatcher
    70  type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger) DrainingJobWatcher
    71  
    72  // DrainingNodeWatcherFactory returns a new DrainingNodeWatcher
    73  type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger, NodeTracker) DrainingNodeWatcher
    74  
    75  // DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier
    76  type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier
    77  
    78  // GetDrainingJobWatcher returns a draining job watcher
    79  func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) DrainingJobWatcher {
    80  	return NewDrainingJobWatcher(ctx, limiter, state, logger)
    81  }
    82  
    83  // GetDeadlineNotifier returns a node deadline notifier with default coalescing.
    84  func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier {
    85  	return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow)
    86  }
    87  
    88  // GetNodeWatcherFactory returns a DrainingNodeWatcherFactory
    89  func GetNodeWatcherFactory() DrainingNodeWatcherFactory {
    90  	return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger, tracker NodeTracker) DrainingNodeWatcher {
    91  		return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker)
    92  	}
    93  }
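
        // GetDrainingJobWatcher and GetDeadlineNotifier already match the factory
        // signatures expected by NodeDrainerConfig, and GetNodeWatcherFactory returns
        // one, so all three can be wired in directly. A sketch of that wiring (the
        // remaining fields are assumed to be filled in by the caller):
        //
        //	cfg := &NodeDrainerConfig{
        //		JobFactory:           GetDrainingJobWatcher,
        //		NodeFactory:          GetNodeWatcherFactory(),
        //		DrainDeadlineFactory: GetDeadlineNotifier,
        //		// Logger, Raft, StateQueriesPerSecond, and BatchUpdateInterval come
        //		// from the surrounding server setup.
        //	}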
    94  
    95  // allocMigrateBatcher is used to batch allocation updates.
    96  type allocMigrateBatcher struct {
    97  	// updates holds allocations with pending desired transition updates
    98  	updates []*structs.Allocation
    99  
   100  	// updateFuture is used to wait for the pending batch update
   101  	// to complete. This may be nil if no batch is pending.
   102  	updateFuture *structs.BatchFuture
   103  
   104  	// updateTimer is the timer that will trigger the next batch
   105  	// update, and may be nil if there is no batch pending.
   106  	updateTimer *time.Timer
   107  
   108  	batchWindow time.Duration
   109  
   110  	// synchronizes access to the updates list, the future and the timer.
   111  	sync.Mutex
   112  }
   113  
   114  // NodeDrainerConfig is used to configure a new node drainer.
   115  type NodeDrainerConfig struct {
   116  	Logger               log.Logger
   117  	Raft                 RaftApplier
   118  	JobFactory           DrainingJobWatcherFactory
   119  	NodeFactory          DrainingNodeWatcherFactory
   120  	DrainDeadlineFactory DrainDeadlineNotifierFactory
   121  
   122  	// StateQueriesPerSecond configures the rate limit on queries the node
   123  	// drainer is allowed to make against the state store.
   124  	StateQueriesPerSecond float64
   125  
   126  	// BatchUpdateInterval is the interval in which allocation updates are
   127  	// batched.
   128  	BatchUpdateInterval time.Duration
   129  }
   130  
   131  // NodeDrainer is used to orchestrate migrating allocations off of draining
   132  // nodes.
   133  type NodeDrainer struct {
   134  	enabled bool
   135  	logger  log.Logger
   136  
   137  	// nodes is the set of draining nodes
   138  	nodes map[string]*drainingNode
   139  
   140  	// nodeWatcher watches for nodes to transition in and out of drain state.
   141  	nodeWatcher DrainingNodeWatcher
   142  	nodeFactory DrainingNodeWatcherFactory
   143  
   144  	// jobWatcher watches draining jobs, emits the allocations to drain, and
   145  	// notifies when migrations take place.
   146  	jobWatcher DrainingJobWatcher
   147  	jobFactory DrainingJobWatcherFactory
   148  
   149  	// deadlineNotifier notifies when nodes reach their drain deadline.
   150  	deadlineNotifier        DrainDeadlineNotifier
   151  	deadlineNotifierFactory DrainDeadlineNotifierFactory
   152  
   153  	// state is the state store that is watched for changes.
   154  	state *state.StateStore
   155  
   156  	// queryLimiter is used to limit the rate of blocking queries
   157  	queryLimiter *rate.Limiter
   158  
   159  	// raft is a shim around the raft messages necessary for draining
   160  	raft RaftApplier
   161  
   162  	// batcher is used to batch alloc migrations.
   163  	batcher allocMigrateBatcher
   164  
   165  	// ctx and exitFn are used to cancel the watcher
   166  	ctx    context.Context
   167  	exitFn context.CancelFunc
   168  
   169  	l sync.RWMutex
   170  }
   171  
   172  // NewNodeDrainer returns a new node drainer. The node drainer is
   173  // responsible for marking allocations on draining nodes with a desired
   174  // migration transition, updating the drain strategy on nodes when they are
   175  // complete, and creating evaluations for the system to react to these changes.
   176  func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer {
   177  	return &NodeDrainer{
   178  		raft:                    c.Raft,
   179  		logger:                  c.Logger.Named("drain"),
   180  		jobFactory:              c.JobFactory,
   181  		nodeFactory:             c.NodeFactory,
   182  		deadlineNotifierFactory: c.DrainDeadlineFactory,
   183  		queryLimiter:            rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100),
   184  		batcher: allocMigrateBatcher{
   185  			batchWindow: c.BatchUpdateInterval,
   186  		},
   187  	}
   188  }
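
        // A minimal lifecycle sketch, reusing a config such as the one sketched after
        // GetNodeWatcherFactory above; stateStore is assumed to come from the
        // surrounding server setup and is not defined here:
        //
        //	drainer := NewNodeDrainer(cfg)
        //	drainer.SetEnabled(true, stateStore) // start tracking and draining nodes
        //	// ...
        //	drainer.SetEnabled(false, nil) // cancel the watchers and stop the run loop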
   189  
   190  // SetEnabled will start or stop the node draining goroutine depending on the
   191  // enabled boolean.
   192  func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) {
   193  	n.l.Lock()
   194  	defer n.l.Unlock()
   195  
   196  	// If we are starting now or have a new state, init state and start the
   197  	// run loop
   198  	n.enabled = enabled
   199  	if enabled {
   200  		n.flush(state)
   201  		go n.run(n.ctx)
   202  	} else if !enabled && n.exitFn != nil {
   203  		n.exitFn()
   204  	}
   205  }
   206  
   207  // flush is used to clear the state of the watcher
   208  func (n *NodeDrainer) flush(state *state.StateStore) {
   209  	// Cancel anything that may be running.
   210  	if n.exitFn != nil {
   211  		n.exitFn()
   212  	}
   213  
   214  	// Store the new state
   215  	if state != nil {
   216  		n.state = state
   217  	}
   218  
   219  	n.ctx, n.exitFn = context.WithCancel(context.Background())
   220  	n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger)
   221  	n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n)
   222  	n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx)
   223  	n.nodes = make(map[string]*drainingNode, 32)
   224  }
   225  
   226  // run is a long lived event handler that receives changes from the relevant
   227  // watchers and takes action based on them.
   228  func (n *NodeDrainer) run(ctx context.Context) {
   229  	for {
   230  		select {
   231  		case <-n.ctx.Done():
   232  			return
   233  		case nodes := <-n.deadlineNotifier.NextBatch():
   234  			n.handleDeadlinedNodes(nodes)
   235  		case req := <-n.jobWatcher.Drain():
   236  			n.handleJobAllocDrain(req)
   237  		case allocs := <-n.jobWatcher.Migrated():
   238  			n.handleMigratedAllocs(allocs)
   239  		}
   240  	}
   241  }
   242  
   243  // handleDeadlinedNodes handles a set of nodes reaching their drain deadline.
   244  // The handler detects the remaining allocations on the nodes and immediately
   245  // marks them for migration.
   246  func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) {
   247  	// Retrieve the set of allocations that will be force stopped.
   248  	var forceStop []*structs.Allocation
   249  	n.l.RLock()
   250  	for _, node := range nodes {
   251  		draining, ok := n.nodes[node]
   252  		if !ok {
   253  			n.logger.Debug("skipping untracked deadlined node", "node_id", node)
   254  			continue
   255  		}
   256  
   257  		allocs, err := draining.RemainingAllocs()
   258  		if err != nil {
   259  			n.logger.Error("failed to retrieve allocs on deadlined node", "node_id", node, "error", err)
   260  			continue
   261  		}
   262  
   263  		n.logger.Debug("node deadlined causing allocs to be force stopped", "node_id", node, "num_allocs", len(allocs))
   264  		forceStop = append(forceStop, allocs...)
   265  	}
   266  	n.l.RUnlock()
   267  	n.batchDrainAllocs(forceStop)
   268  
   269  	// Create the node event
   270  	event := structs.NewNodeEvent().
   271  		SetSubsystem(structs.NodeEventSubsystemDrain).
   272  		SetMessage(NodeDrainEventComplete).
   273  		AddDetail(NodeDrainEventDetailDeadlined, "true")
   274  
   275  	// Submit the node transitions in a sharded form to ensure a reasonable
   276  	// Raft transaction size.
   277  	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, nodes) {
   278  		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
   279  			n.logger.Error("failed to unset drain for nodes", "error", err)
   280  		}
   281  	}
   282  }
   283  
   284  // handleJobAllocDrain handles marking a set of allocations as having a desired
   285  // transition to drain. The handler blocks until the changes to the allocations
   286  // have been applied.
   287  func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) {
   288  	index, err := n.batchDrainAllocs(req.Allocs)
   289  	req.Resp.Respond(index, err)
   290  }
   291  
   292  // handleMigratedAllocs checks to see if any nodes can be considered done
   293  // draining based on the set of allocations that have migrated because of an
   294  // ongoing drain for a job.
   295  func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
   296  	// Determine the set of nodes that were affected
   297  	nodes := make(map[string]struct{})
   298  	for _, alloc := range allocs {
   299  		nodes[alloc.NodeID] = struct{}{}
   300  	}
   301  
   302  	var done []string
   303  	var remainingAllocs []*structs.Allocation
   304  
   305  	// For each node, check if it is now done
   306  	n.l.RLock()
   307  	for node := range nodes {
   308  		draining, ok := n.nodes[node]
   309  		if !ok {
   310  			continue
   311  		}
   312  
   313  		isDone, err := draining.IsDone()
   314  		if err != nil {
   315  			n.logger.Error("error checking if node is done draining", "node_id", node, "error", err)
   316  			continue
   317  		}
   318  
   319  		if !isDone {
   320  			continue
   321  		}
   322  
   323  		done = append(done, node)
   324  
   325  		remaining, err := draining.RemainingAllocs()
   326  		if err != nil {
   327  			n.logger.Error("node is done draining but encountered an error getting remaining allocs", "node_id", node, "error", err)
   328  			continue
   329  		}
   330  
   331  		remainingAllocs = append(remainingAllocs, remaining...)
   332  	}
   333  	n.l.RUnlock()
   334  
   335  	// Stop any running system jobs on otherwise done nodes
   336  	if len(remainingAllocs) > 0 {
   337  		future := structs.NewBatchFuture()
   338  		n.drainAllocs(future, remainingAllocs)
   339  		if err := future.Wait(); err != nil {
   340  			n.logger.Error("failed to drain remaining allocs from done nodes", "num_allocs", len(remainingAllocs), "error", err)
   341  		}
   342  	}
   343  
   344  	// Create the node event
   345  	event := structs.NewNodeEvent().
   346  		SetSubsystem(structs.NodeEventSubsystemDrain).
   347  		SetMessage(NodeDrainEventComplete)
   348  
   349  	// Submit the node transitions in a sharded form to ensure a reasonable
   350  	// Raft transaction size.
   351  	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, done) {
   352  		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
   353  			n.logger.Error("failed to unset drain for nodes", "error", err)
   354  		}
   355  	}
   356  }
   357  
   358  // batchDrainAllocs is used to batch the draining of allocations. It will block
   359  // until the batch is complete.
   360  func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) {
   361  	// Add this to the batch
   362  	n.batcher.Lock()
   363  	n.batcher.updates = append(n.batcher.updates, allocs...)
   364  
   365  	// Start a new batch if none
   366  	future := n.batcher.updateFuture
   367  	if future == nil {
   368  		future = structs.NewBatchFuture()
   369  		n.batcher.updateFuture = future
   370  		n.batcher.updateTimer = time.AfterFunc(n.batcher.batchWindow, func() {
   371  			// Get the pending updates
   372  			n.batcher.Lock()
   373  			updates := n.batcher.updates
   374  			future := n.batcher.updateFuture
   375  			n.batcher.updates = nil
   376  			n.batcher.updateFuture = nil
   377  			n.batcher.updateTimer = nil
   378  			n.batcher.Unlock()
   379  
   380  			// Perform the batch update
   381  			n.drainAllocs(future, updates)
   382  		})
   383  	}
   384  	n.batcher.Unlock()
   385  
   386  	if err := future.Wait(); err != nil {
   387  		return 0, err
   388  	}
   389  
   390  	return future.Index(), nil
   391  }
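
        // From a caller's perspective, requests that arrive within one batchWindow
        // share a single Raft apply and all observe the same resulting index. An
        // illustrative sketch (the alloc slices are assumed to exist):
        //
        //	go func() { _, _ = n.batchDrainAllocs(allocsFromJobA) }() // arms the batch timer
        //	go func() { _, _ = n.batchDrainAllocs(allocsFromJobB) }() // joins the pending batch
        //	// once batchWindow elapses, a single drainAllocs call commits both sets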
   392  
   393  // drainAllocs marks, without batching, the desired transition to migrate for
   394  // the set of allocations. It will also create the necessary evaluations for the
   395  // affected jobs.
   396  func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) {
   397  	// Compute the affected jobs and make the transition map
   398  	jobs := make(map[structs.NamespacedID]*structs.Allocation, 4)
   399  	transitions := make(map[string]*structs.DesiredTransition, len(allocs))
   400  	for _, alloc := range allocs {
   401  		transitions[alloc.ID] = &structs.DesiredTransition{
   402  			Migrate: pointer.Of(true),
   403  		}
   404  		jobs[alloc.JobNamespacedID()] = alloc
   405  	}
   406  
   407  	evals := make([]*structs.Evaluation, 0, len(jobs))
   408  	now := time.Now().UTC().UnixNano()
   409  	for _, alloc := range jobs {
   410  		evals = append(evals, &structs.Evaluation{
   411  			ID:          uuid.Generate(),
   412  			Namespace:   alloc.Namespace,
   413  			Priority:    alloc.Job.Priority,
   414  			Type:        alloc.Job.Type,
   415  			TriggeredBy: structs.EvalTriggerNodeDrain,
   416  			JobID:       alloc.JobID,
   417  			Status:      structs.EvalStatusPending,
   418  			CreateTime:  now,
   419  			ModifyTime:  now,
   420  		})
   421  	}
   422  
   423  	// Commit this update via Raft
   424  	var finalIndex uint64
   425  	for _, u := range partitionAllocDrain(defaultMaxIdsPerTxn, transitions, evals) {
   426  		index, err := n.raft.AllocUpdateDesiredTransition(u.Transitions, u.Evals)
   427  		if err != nil {
   428  			future.Respond(0, err)
   429  			return
   430  		}
   431  		finalIndex = index
   432  	}
   433  
   434  	future.Respond(finalIndex, nil)
   435  }