github.com/aminovpavel/nomad@v0.11.8/nomad/drainer/drainer.go

     1  package drainer
     2  
     3  import (
     4  	"context"
     5  	"sync"
     6  	"time"
     7  
     8  	log "github.com/hashicorp/go-hclog"
     9  
    10  	"github.com/hashicorp/nomad/helper"
    11  	"github.com/hashicorp/nomad/helper/uuid"
    12  	"github.com/hashicorp/nomad/nomad/state"
    13  	"github.com/hashicorp/nomad/nomad/structs"
    14  	"golang.org/x/time/rate"
    15  )
    16  
    17  var (
    18  	// stateReadErrorDelay is the delay to apply before retrying reading state
    19  	// when there is an error
    20  	stateReadErrorDelay = 1 * time.Second
    21  )
    22  
    23  const (
    24  	// LimitStateQueriesPerSecond is the number of state queries allowed per
    25  	// second
    26  	LimitStateQueriesPerSecond = 100.0
    27  
    28  	// BatchUpdateInterval is how long we wait to batch updates
    29  	BatchUpdateInterval = 1 * time.Second
    30  
    31  	// NodeDeadlineCoalesceWindow is the duration in which deadlining nodes will
    32  	// be coalesced together
    33  	NodeDeadlineCoalesceWindow = 5 * time.Second
    34  
    35  	// NodeDrainEventComplete is used to indicate that the node drain is
    36  	// finished.
    37  	NodeDrainEventComplete = "Node drain complete"
    38  
    39  	// NodeDrainEventDetailDeadlined is the key to use when the drain is
    40  	// complete because a deadline was reached. The acceptable values are "true" and "false".
    41  	NodeDrainEventDetailDeadlined = "deadline_reached"
    42  )
    43  
    44  // RaftApplier contains methods for applying the raft requests required by the
    45  // NodeDrainer.
    46  type RaftApplier interface {
    47  	AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error)
    48  	NodesDrainComplete(nodes []string, event *structs.NodeEvent) (uint64, error)
    49  }
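
        // Illustrative sketch (not part of the original source): a minimal
        // in-memory RaftApplier, e.g. for tests, that hands back increasing
        // indexes instead of applying Raft entries. The type and field names
        // here are assumptions.
        //
        //	type noopRaftApplier struct {
        //		index uint64
        //	}
        //
        //	func (r *noopRaftApplier) AllocUpdateDesiredTransition(
        //		allocs map[string]*structs.DesiredTransition,
        //		evals []*structs.Evaluation) (uint64, error) {
        //		r.index++
        //		return r.index, nil
        //	}
        //
        //	func (r *noopRaftApplier) NodesDrainComplete(
        //		nodes []string, event *structs.NodeEvent) (uint64, error) {
        //		r.index++
        //		return r.index, nil
        //	}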
    50  
    51  // NodeTracker is the interface for notifying an object that tracks draining
    52  // nodes of changes.
    53  type NodeTracker interface {
    54  	// TrackedNodes returns all the nodes that are currently tracked as
    55  	// draining.
    56  	TrackedNodes() map[string]*structs.Node
    57  
    58  	// Remove removes a node from the draining set.
    59  	Remove(nodeID string)
    60  
    61  	// Update either updates the specification of a draining node or tracks the
    62  	// node as draining.
    63  	Update(node *structs.Node)
    64  }
    65  
    66  // DrainingJobWatcherFactory returns a new DrainingJobWatcher
    67  type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger) DrainingJobWatcher
    68  
    69  // DrainingNodeWatcherFactory returns a new DrainingNodeWatcher
    70  type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger, NodeTracker) DrainingNodeWatcher
    71  
    72  // DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier
    73  type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier
    74  
    75  // GetDrainingJobWatcher returns a draining job watcher
    76  func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) DrainingJobWatcher {
    77  	return NewDrainingJobWatcher(ctx, limiter, state, logger)
    78  }
    79  
    80  // GetDeadlineNotifier returns a node deadline notifier with default coalescing.
    81  func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier {
    82  	return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow)
    83  }
    84  
    85  // GetNodeWatcherFactory returns a DrainingNodeWatcherFactory
    86  func GetNodeWatcherFactory() DrainingNodeWatcherFactory {
    87  	return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger, tracker NodeTracker) DrainingNodeWatcher {
    88  		return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker)
    89  	}
    90  }
    91  
    92  // allocMigrateBatcher is used to batch allocation updates.
    93  type allocMigrateBatcher struct {
    94  	// updates holds allocations pending a desired transition (migrate) update
    95  	updates []*structs.Allocation
    96  
    97  	// updateFuture is used to wait for the pending batch update
    98  	// to complete. This may be nil if no batch is pending.
    99  	updateFuture *structs.BatchFuture
   100  
   101  	// updateTimer is the timer that will trigger the next batch
   102  	// update, and may be nil if there is no batch pending.
   103  	updateTimer *time.Timer
   104  
   105  	batchWindow time.Duration
   106  
   107  	// synchronizes access to the updates list, the future and the timer.
   108  	sync.Mutex
   109  }
   110  
   111  // NodeDrainerConfig is used to configure a new node drainer.
   112  type NodeDrainerConfig struct {
   113  	Logger               log.Logger
   114  	Raft                 RaftApplier
   115  	JobFactory           DrainingJobWatcherFactory
   116  	NodeFactory          DrainingNodeWatcherFactory
   117  	DrainDeadlineFactory DrainDeadlineNotifierFactory
   118  
   119  	// StateQueriesPerSecond configures the query limit against the state store
   120  	// that is allowed by the node drainer.
   121  	StateQueriesPerSecond float64
   122  
   123  	// BatchUpdateInterval is the interval in which allocation updates are
   124  	// batched.
   125  	BatchUpdateInterval time.Duration
   126  }
   127  
   128  // NodeDrainer is used to orchestrate migrating allocations off of draining
   129  // nodes.
   130  type NodeDrainer struct {
   131  	enabled bool
   132  	logger  log.Logger
   133  
   134  	// nodes is the set of draining nodes
   135  	nodes map[string]*drainingNode
   136  
   137  	// nodeWatcher watches for nodes to transition in and out of drain state.
   138  	nodeWatcher DrainingNodeWatcher
   139  	nodeFactory DrainingNodeWatcherFactory
   140  
   141  	// jobWatcher watches draining jobs, emits the allocations that should be
   142  	// drained, and notifies when migrations take place.
   143  	jobWatcher DrainingJobWatcher
   144  	jobFactory DrainingJobWatcherFactory
   145  
   146  	// deadlineNotifier notifies when nodes reach their drain deadline.
   147  	deadlineNotifier        DrainDeadlineNotifier
   148  	deadlineNotifierFactory DrainDeadlineNotifierFactory
   149  
   150  	// state is the state store that is watched for changes.
   151  	state *state.StateStore
   152  
   153  	// queryLimiter is used to limit the rate of blocking queries
   154  	queryLimiter *rate.Limiter
   155  
   156  	// raft is a shim around the raft messages necessary for draining
   157  	raft RaftApplier
   158  
   159  	// batcher is used to batch alloc migrations.
   160  	batcher allocMigrateBatcher
   161  
   162  	// ctx and exitFn are used to cancel the watcher
   163  	ctx    context.Context
   164  	exitFn context.CancelFunc
   165  
   166  	l sync.RWMutex
   167  }
   168  
   169  // NewNodeDrainer returns a new node drainer. The node drainer is
   170  // responsible for marking allocations on draining nodes with a desired
   171  // migration transition, updating the drain strategy on nodes when they are
   172  // complete and creating evaluations for the system to react to these changes.
   173  func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer {
   174  	return &NodeDrainer{
   175  		raft:                    c.Raft,
   176  		logger:                  c.Logger.Named("drain"),
   177  		jobFactory:              c.JobFactory,
   178  		nodeFactory:             c.NodeFactory,
   179  		deadlineNotifierFactory: c.DrainDeadlineFactory,
   180  		queryLimiter:            rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100),
   181  		batcher: allocMigrateBatcher{
   182  			batchWindow: c.BatchUpdateInterval,
   183  		},
   184  	}
   185  }
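
        // Illustrative usage sketch (not part of the original source): wiring
        // a NodeDrainer with the defaults defined in this file. srvLogger,
        // raftShim, and stateStore stand in for values the Nomad server would
        // supply; those names are assumptions.
        //
        //	drainer := NewNodeDrainer(&NodeDrainerConfig{
        //		Logger:                srvLogger,
        //		Raft:                  raftShim,
        //		JobFactory:            GetDrainingJobWatcher,
        //		NodeFactory:           GetNodeWatcherFactory(),
        //		DrainDeadlineFactory:  GetDeadlineNotifier,
        //		StateQueriesPerSecond: LimitStateQueriesPerSecond,
        //		BatchUpdateInterval:   BatchUpdateInterval,
        //	})
        //	drainer.SetEnabled(true, stateStore)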
   186  
   187  // SetEnabled will start or stop the node draining goroutine depending on the
   188  // enabled boolean.
   189  func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) {
   190  	n.l.Lock()
   191  	defer n.l.Unlock()
   192  
   193  	// If we are starting now or have a new state, init state and start the
   194  	// run loop
   195  	n.enabled = enabled
   196  	if enabled {
   197  		n.flush(state)
   198  		go n.run(n.ctx)
   199  	} else if !enabled && n.exitFn != nil {
   200  		n.exitFn()
   201  	}
   202  }
   203  
   204  // flush is used to clear the state of the watcher
   205  func (n *NodeDrainer) flush(state *state.StateStore) {
   206  	// Cancel anything that may be running.
   207  	if n.exitFn != nil {
   208  		n.exitFn()
   209  	}
   210  
   211  	// Store the new state
   212  	if state != nil {
   213  		n.state = state
   214  	}
   215  
   216  	n.ctx, n.exitFn = context.WithCancel(context.Background())
   217  	n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger)
   218  	n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n)
   219  	n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx)
   220  	n.nodes = make(map[string]*drainingNode, 32)
   221  }
   222  
   223  // run is a long-lived event handler that receives changes from the relevant
   224  // watchers and takes action based on them.
   225  func (n *NodeDrainer) run(ctx context.Context) {
   226  	for {
   227  		select {
   228  		case <-n.ctx.Done():
   229  			return
   230  		case nodes := <-n.deadlineNotifier.NextBatch():
   231  			n.handleDeadlinedNodes(nodes)
   232  		case req := <-n.jobWatcher.Drain():
   233  			n.handleJobAllocDrain(req)
   234  		case allocs := <-n.jobWatcher.Migrated():
   235  			n.handleMigratedAllocs(allocs)
   236  		}
   237  	}
   238  }
   239  
   240  // handleDeadlinedNodes handles a set of nodes reaching their drain deadline.
   241  // The handler detects the remaining allocations on the nodes and immediately
   242  // marks them for migration.
   243  func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) {
   244  	// Retrieve the set of allocations that will be force stopped.
   245  	var forceStop []*structs.Allocation
   246  	n.l.RLock()
   247  	for _, node := range nodes {
   248  		draining, ok := n.nodes[node]
   249  		if !ok {
   250  			n.logger.Debug("skipping untracked deadlined node", "node_id", node)
   251  			continue
   252  		}
   253  
   254  		allocs, err := draining.RemainingAllocs()
   255  		if err != nil {
   256  			n.logger.Error("failed to retrieve allocs on deadlined node", "node_id", node, "error", err)
   257  			continue
   258  		}
   259  
   260  		n.logger.Debug("node deadlined causing allocs to be force stopped", "node_id", node, "num_allocs", len(allocs))
   261  		forceStop = append(forceStop, allocs...)
   262  	}
   263  	n.l.RUnlock()
   264  	n.batchDrainAllocs(forceStop)
   265  
   266  	// Create the node event
   267  	event := structs.NewNodeEvent().
   268  		SetSubsystem(structs.NodeEventSubsystemDrain).
   269  		SetMessage(NodeDrainEventComplete).
   270  		AddDetail(NodeDrainEventDetailDeadlined, "true")
   271  
   272  	// Submit the node transitions in a sharded form to ensure a reasonable
   273  	// Raft transaction size.
   274  	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, nodes) {
   275  		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
   276  			n.logger.Error("failed to unset drain for nodes", "error", err)
   277  		}
   278  	}
   279  }
   280  
   281  // handleJobAllocDrain handles marking a set of allocations as having a desired
   282  // transition to drain. The handler blocks until the changes to the allocations
   283  // have been applied.
   284  func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) {
   285  	index, err := n.batchDrainAllocs(req.Allocs)
   286  	req.Resp.Respond(index, err)
   287  }
   288  
   289  // handleMigratedAllocs checks to see if any nodes can be considered done
   290  // draining based on the set of allocations that have migrated because of an
   291  // ongoing drain for a job.
   292  func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) {
   293  	// Determine the set of nodes that were affected
   294  	nodes := make(map[string]struct{})
   295  	for _, alloc := range allocs {
   296  		nodes[alloc.NodeID] = struct{}{}
   297  	}
   298  
   299  	var done []string
   300  	var remainingAllocs []*structs.Allocation
   301  
   302  	// For each node, check if it is now done
   303  	n.l.RLock()
   304  	for node := range nodes {
   305  		draining, ok := n.nodes[node]
   306  		if !ok {
   307  			continue
   308  		}
   309  
   310  		isDone, err := draining.IsDone()
   311  		if err != nil {
   312  			n.logger.Error("error checking if node is done draining", "node_id", node, "error", err)
   313  			continue
   314  		}
   315  
   316  		if !isDone {
   317  			continue
   318  		}
   319  
   320  		done = append(done, node)
   321  
   322  		remaining, err := draining.RemainingAllocs()
   323  		if err != nil {
   324  			n.logger.Error("node is done draining but encountered an error getting remaining allocs", "node_id", node, "error", err)
   325  			continue
   326  		}
   327  
   328  		remainingAllocs = append(remainingAllocs, remaining...)
   329  	}
   330  	n.l.RUnlock()
   331  
   332  	// Stop any running system jobs on otherwise done nodes
   333  	if len(remainingAllocs) > 0 {
   334  		future := structs.NewBatchFuture()
   335  		n.drainAllocs(future, remainingAllocs)
   336  		if err := future.Wait(); err != nil {
   337  			n.logger.Error("failed to drain remaining allocs from done nodes", "num_allocs", len(remainingAllocs), "error", err)
   338  		}
   339  	}
   340  
   341  	// Create the node event
   342  	event := structs.NewNodeEvent().
   343  		SetSubsystem(structs.NodeEventSubsystemDrain).
   344  		SetMessage(NodeDrainEventComplete)
   345  
   346  	// Submit the node transitions in a sharded form to ensure a reasonable
   347  	// Raft transaction size.
   348  	for _, nodes := range partitionIds(defaultMaxIdsPerTxn, done) {
   349  		if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil {
   350  			n.logger.Error("failed to unset drain for nodes", "error", err)
   351  		}
   352  	}
   353  }
   354  
   355  // batchDrainAllocs is used to batch the draining of allocations. It will block
   356  // until the batch is complete.
   357  func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) {
   358  	// Add this to the batch
   359  	n.batcher.Lock()
   360  	n.batcher.updates = append(n.batcher.updates, allocs...)
   361  
   362  	// Start a new batch if none is pending
   363  	future := n.batcher.updateFuture
   364  	if future == nil {
   365  		future = structs.NewBatchFuture()
   366  		n.batcher.updateFuture = future
   367  		n.batcher.updateTimer = time.AfterFunc(n.batcher.batchWindow, func() {
   368  			// Get the pending updates
   369  			n.batcher.Lock()
   370  			updates := n.batcher.updates
   371  			future := n.batcher.updateFuture
   372  			n.batcher.updates = nil
   373  			n.batcher.updateFuture = nil
   374  			n.batcher.updateTimer = nil
   375  			n.batcher.Unlock()
   376  
   377  			// Perform the batch update
   378  			n.drainAllocs(future, updates)
   379  		})
   380  	}
   381  	n.batcher.Unlock()
   382  
   383  	if err := future.Wait(); err != nil {
   384  		return 0, err
   385  	}
   386  
   387  	return future.Index(), nil
   388  }
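
        // Batching note (illustrative, not part of the original source):
        // callers that arrive within the same batchWindow share one future and
        // therefore one drainAllocs pass. allocsA and allocsB below are
        // placeholder slices.
        //
        //	go func() { _, _ = n.batchDrainAllocs(allocsA) }() // opens the window
        //	go func() { _, _ = n.batchDrainAllocs(allocsB) }() // joins the same batch
        //	// After batchWindow elapses a single drainAllocs call handles both
        //	// sets, and both callers unblock with the same Raft index.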
   389  
   390  // drainAllocs is a non-batch marking of the desired transition to migrate for
   391  // the set of allocations. It will also create the necessary evaluations for the
   392  // affected jobs.
   393  func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) {
   394  	// Compute the affected jobs and make the transition map
   395  	jobs := make(map[string]*structs.Allocation, 4)
   396  	transitions := make(map[string]*structs.DesiredTransition, len(allocs))
   397  	for _, alloc := range allocs {
   398  		transitions[alloc.ID] = &structs.DesiredTransition{
   399  			Migrate: helper.BoolToPtr(true),
   400  		}
   401  		jobs[alloc.JobID] = alloc
   402  	}
   403  
   404  	evals := make([]*structs.Evaluation, 0, len(jobs))
   405  	now := time.Now().UTC().UnixNano()
   406  	for job, alloc := range jobs {
   407  		evals = append(evals, &structs.Evaluation{
   408  			ID:          uuid.Generate(),
   409  			Namespace:   alloc.Namespace,
   410  			Priority:    alloc.Job.Priority,
   411  			Type:        alloc.Job.Type,
   412  			TriggeredBy: structs.EvalTriggerNodeDrain,
   413  			JobID:       job,
   414  			Status:      structs.EvalStatusPending,
   415  			CreateTime:  now,
   416  			ModifyTime:  now,
   417  		})
   418  	}
   419  
   420  	// Commit this update via Raft
   421  	var finalIndex uint64
   422  	for _, u := range partitionAllocDrain(defaultMaxIdsPerTxn, transitions, evals) {
   423  		index, err := n.raft.AllocUpdateDesiredTransition(u.Transitions, u.Evals)
   424  		if err != nil {
   425  			future.Respond(0, err)
   426  			return
   427  		}
   428  		finalIndex = index
   429  	}
   430  
   431  	future.Respond(finalIndex, nil)
   432  }