github.com/hernad/nomad@v1.6.112/nomad/plan_apply.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package nomad
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"runtime"
    10  	"time"
    11  
    12  	metrics "github.com/armon/go-metrics"
    13  	log "github.com/hashicorp/go-hclog"
    14  	memdb "github.com/hashicorp/go-memdb"
    15  	multierror "github.com/hashicorp/go-multierror"
    16  	"github.com/hernad/nomad/helper/uuid"
    17  	"github.com/hernad/nomad/nomad/state"
    18  	"github.com/hernad/nomad/nomad/structs"
    19  	"github.com/hashicorp/raft"
    20  )
    21  
    22  // planner is used to manage the submitted allocation plans that are
    23  // waiting to be assessed by the leader.
    24  type planner struct {
    25  	*Server
    26  	log log.Logger
    27  
    28  	// planQueue is used to manage the submitted allocation
    29  	// plans that are waiting to be assessed by the leader
    30  	planQueue *PlanQueue
    31  
    32  	// badNodeTracker keeps a score for nodes that have plan rejections.
    33  	// Plan rejections are somewhat expected given Nomad's optimistic
    34  	// scheduling, but repeated rejections for the same node may indicate an
    35  	// undetected issue, so we need to track rejection history.
    36  	badNodeTracker BadNodeTracker
    37  }
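// What follows is a minimal, hypothetical sketch of the window/threshold idea
// behind plan-rejection tracking, added purely for illustration. It is not the
// real CachedBadNodeTracker used below; the type and field names here are
// invented.
//
//	type simpleRejectionTracker struct {
//		window     time.Duration
//		threshold  int
//		rejections map[string][]time.Time
//	}
//
//	// Add records a rejection for the node and reports whether the node has
//	// exceeded the threshold within the window.
//	func (t *simpleRejectionTracker) Add(nodeID string) bool {
//		if t.rejections == nil {
//			t.rejections = make(map[string][]time.Time)
//		}
//		now := time.Now()
//		cutoff := now.Add(-t.window)
//		kept := t.rejections[nodeID][:0]
//		for _, ts := range t.rejections[nodeID] {
//			if ts.After(cutoff) {
//				kept = append(kept, ts)
//			}
//		}
//		kept = append(kept, now)
//		t.rejections[nodeID] = kept
//		return len(kept) > t.threshold
//	}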
    38  
    39  // newPlanner returns a new planner to be used for managing allocation plans.
    40  func newPlanner(s *Server) (*planner, error) {
    41  	log := s.logger.Named("planner")
    42  
    43  	// Create a plan queue
    44  	planQueue, err := NewPlanQueue()
    45  	if err != nil {
    46  		return nil, err
    47  	}
    48  
    49  	// Create the bad node tracker.
    50  	var badNodeTracker BadNodeTracker
    51  	if s.config.NodePlanRejectionEnabled {
    52  		config := DefaultCachedBadNodeTrackerConfig()
    53  
    54  		config.Window = s.config.NodePlanRejectionWindow
    55  		config.Threshold = s.config.NodePlanRejectionThreshold
    56  
    57  		badNodeTracker, err = NewCachedBadNodeTracker(log, config)
    58  		if err != nil {
    59  			return nil, err
    60  		}
    61  	} else {
    62  		badNodeTracker = &NoopBadNodeTracker{}
    63  	}
    64  
    65  	return &planner{
    66  		Server:         s,
    67  		log:            log,
    68  		planQueue:      planQueue,
    69  		badNodeTracker: badNodeTracker,
    70  	}, nil
    71  }
    72  
    73  // planApply is a long-lived goroutine that reads plan allocations from
    74  // the plan queue, determines if they can be applied safely and applies
    75  // them via Raft.
    76  //
    77  // Naively, we could simply dequeue a plan, verify, apply and then respond.
    78  // However, the plan application is bounded by the Raft apply time and
    79  // subject to some latency. This creates a stall condition, where we are
    80  // not evaluating, but simply waiting for a transaction to apply.
    81  //
    82  // To avoid this, we overlap verification with apply. This means once
    83  // we've verified plan N we attempt to apply it. However, while waiting
    84  // for apply, we begin to verify plan N+1 under the assumption that plan
    85  // N has succeeded.
    86  //
    87  // In this sense, we track two parallel versions of the world. One is
    88  // the pessimistic one driven by the Raft log which is replicated. The
    89  // other is optimistic and assumes our transactions will succeed. In the
    90  // happy path, this lets us do productive work during the latency of
    91  // apply.
    92  //
    93  // In the unhappy path (Raft transaction fails), we have effectively only
    94  // wasted work during a time we would have been waiting anyway. However,
    95  // in anticipation of this case we cannot respond to the plan until
    96  // the Raft log is updated. This means our schedulers will stall,
    97  // but there are many of those and only a single plan verifier.
    98  func (p *planner) planApply() {
    99  	// planIndexCh is used to track an outstanding application and receive
   100  	// its committed index while snap holds an optimistic state which
   101  	// includes that plan application.
   102  	var planIndexCh chan uint64
   103  	var snap *state.StateSnapshot
   104  
   105  	// prevPlanResultIndex is the index when the last PlanResult was
   106  	// committed. Since only the last plan is optimistically applied to the
   107  	// snapshot, it's possible the current snapshot's and plan's indexes
   108  	// are less than the index the previous plan result was committed at.
   109  	// prevPlanResultIndex also guards against the previous plan committing
   110  	// during Dequeue, thus causing the snapshot containing the optimistic
   111  	// commit to be discarded and potentially evaluating the current plan
   112  	// against an index older than the previous plan was committed at.
   113  	var prevPlanResultIndex uint64
   114  
   115  	// Set up a worker pool with half the cores, and at least one worker
   116  	poolSize := runtime.NumCPU() / 2
   117  	if poolSize == 0 {
   118  		poolSize = 1
   119  	}
   120  	pool := NewEvaluatePool(poolSize, workerPoolBufferSize)
   121  	defer pool.Shutdown()
   122  
   123  	for {
   124  		// Pull the next pending plan, exit if we are no longer leader
   125  		pending, err := p.planQueue.Dequeue(0)
   126  		if err != nil {
   127  			return
   128  		}
   129  
   130  		// If the last plan has completed, get a new snapshot
   131  		select {
   132  		case idx := <-planIndexCh:
   133  			// Previous plan committed. Discard snapshot and ensure
   134  			// future snapshots include this plan. idx may be 0 if
   135  			// plan failed to apply, so use max(prev, idx)
   136  			prevPlanResultIndex = max(prevPlanResultIndex, idx)
   137  			planIndexCh = nil
   138  			snap = nil
   139  		default:
   140  		}
   141  
   142  		if snap != nil {
   143  		// If the snapshot doesn't contain both the previous plan
   144  		// result's index and the current plan's snapshot index,
   145  		// discard it and get a new one below.
   146  			minIndex := max(prevPlanResultIndex, pending.plan.SnapshotIndex)
   147  			if idx, err := snap.LatestIndex(); err != nil || idx < minIndex {
   148  				snap = nil
   149  			}
   150  		}
   151  
   152  		// Snapshot the state so that we have a consistent view of the world
   153  		// if no snapshot is available.
   154  		//  - planIndexCh will be nil if the previous plan result applied
   155  		//    during Dequeue
   156  		//  - snap will be nil if its index < max(prevIndex, curIndex)
   157  		if planIndexCh == nil || snap == nil {
   158  			snap, err = p.snapshotMinIndex(prevPlanResultIndex, pending.plan.SnapshotIndex)
   159  			if err != nil {
   160  				p.logger.Error("failed to snapshot state", "error", err)
   161  				pending.respond(nil, err)
   162  				continue
   163  			}
   164  		}
   165  
   166  		// Evaluate the plan
   167  		result, err := evaluatePlan(pool, snap, pending.plan, p.logger)
   168  		if err != nil {
   169  			p.logger.Error("failed to evaluate plan", "error", err)
   170  			pending.respond(nil, err)
   171  			continue
   172  		}
   173  
   174  		// Check if any of the rejected nodes should be made ineligible.
   175  		for _, nodeID := range result.RejectedNodes {
   176  			if p.badNodeTracker.Add(nodeID) {
   177  				result.IneligibleNodes = append(result.IneligibleNodes, nodeID)
   178  			}
   179  		}
   180  
   181  		// Fast-path the response if there is nothing to do
   182  		if result.IsNoOp() {
   183  			pending.respond(result, nil)
   184  			continue
   185  		}
   186  
   187  		// Ensure any parallel apply is complete before starting the next one.
   188  		// This also limits how out of date our snapshot can be.
   189  		if planIndexCh != nil {
   190  			idx := <-planIndexCh
   191  			planIndexCh = nil
   192  			prevPlanResultIndex = max(prevPlanResultIndex, idx)
   193  			snap, err = p.snapshotMinIndex(prevPlanResultIndex, pending.plan.SnapshotIndex)
   194  			if err != nil {
   195  				p.logger.Error("failed to update snapshot state", "error", err)
   196  				pending.respond(nil, err)
   197  				continue
   198  			}
   199  		}
   200  
   201  		// Dispatch the Raft transaction for the plan
   202  		future, err := p.applyPlan(pending.plan, result, snap)
   203  		if err != nil {
   204  			p.logger.Error("failed to submit plan", "error", err)
   205  			pending.respond(nil, err)
   206  			continue
   207  		}
   208  
   209  		// Respond to the plan asynchronously; receive the plan's committed index via chan
   210  		planIndexCh = make(chan uint64, 1)
   211  		go p.asyncPlanWait(planIndexCh, future, result, pending)
   212  	}
   213  }
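// The verify/apply overlap described above reduces to the following
// simplified, hypothetical sketch (illustration only; the real loop above also
// manages snapshots, rejection tracking, and error paths, and dequeue,
// evaluate, and apply are invented stand-ins for the plan queue, evaluatePlan,
// and applyPlan):
//
//	var pendingIdx chan uint64 // outstanding Raft apply, nil if none
//	for {
//		plan := dequeue()
//
//		// Verify plan N+1 while plan N may still be applying.
//		result := evaluate(plan)
//
//		// Ensure only one apply is ever outstanding before submitting.
//		if pendingIdx != nil {
//			<-pendingIdx
//		}
//
//		ch := make(chan uint64, 1)
//		pendingIdx = ch
//		go func() {
//			ch <- apply(result) // Raft apply latency happens here
//		}()
//	}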
   214  
   215  // snapshotMinIndex wraps StateStore.SnapshotMinIndex with a 10s timeout and
   216  // converts timeout errors to a more descriptive error message. The returned
   217  // snapshot is guaranteed to include both the previous plan result and all
   218  // objects referenced by the plan, or an error is returned.
   219  func (p *planner) snapshotMinIndex(prevPlanResultIndex, planSnapshotIndex uint64) (*state.StateSnapshot, error) {
   220  	defer metrics.MeasureSince([]string{"nomad", "plan", "wait_for_index"}, time.Now())
   221  
   222  	// Minimum index the snapshot must include is the max of the previous
   223  	// plan result's and current plan's snapshot index.
   224  	minIndex := max(prevPlanResultIndex, planSnapshotIndex)
   225  
   226  	// This timeout creates backpressure where any concurrent
   227  	// Plan.Submit RPCs will block waiting for results. This sheds
   228  	// load across all servers and gives raft some CPU to catch up,
   229  	// because schedulers won't dequeue more work while waiting.
   230  	const timeout = 10 * time.Second
   231  	ctx, cancel := context.WithTimeout(context.Background(), timeout)
   232  	snap, err := p.fsm.State().SnapshotMinIndex(ctx, minIndex)
   233  	cancel()
   234  	if err == context.DeadlineExceeded {
   235  		return nil, fmt.Errorf("timed out after %s waiting for index=%d (previous plan result index=%d; plan snapshot index=%d)",
   236  			timeout, minIndex, prevPlanResultIndex, planSnapshotIndex)
   237  	}
   238  
   239  	return snap, err
   240  }
   241  
   242  // applyPlan is used to apply the plan result and to return the alloc index
   243  func (p *planner) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
   244  	now := time.Now().UTC().UnixNano()
   245  
   246  	// Setup the update request
   247  	req := structs.ApplyPlanResultsRequest{
   248  		AllocUpdateRequest: structs.AllocUpdateRequest{
   249  			Job: plan.Job,
   250  		},
   251  		Deployment:        result.Deployment,
   252  		DeploymentUpdates: result.DeploymentUpdates,
   253  		IneligibleNodes:   result.IneligibleNodes,
   254  		EvalID:            plan.EvalID,
   255  		UpdatedAt:         now,
   256  	}
   257  
   258  	preemptedJobIDs := make(map[structs.NamespacedID]struct{})
   259  
   260  	if ServersMeetMinimumVersion(p.Members(), p.Region(), MinVersionPlanNormalization, true) {
   261  		// Initialize the allocs request using the new optimized log entry format.
   262  		// Determine the minimum number of updates; there could be more if
   263  		// there are multiple updates per node.
   264  		req.AllocsStopped = make([]*structs.AllocationDiff, 0, len(result.NodeUpdate))
   265  		req.AllocsUpdated = make([]*structs.Allocation, 0, len(result.NodeAllocation))
   266  		req.AllocsPreempted = make([]*structs.AllocationDiff, 0, len(result.NodePreemptions))
   267  
   268  		for _, updateList := range result.NodeUpdate {
   269  			for _, stoppedAlloc := range updateList {
   270  				req.AllocsStopped = append(req.AllocsStopped, normalizeStoppedAlloc(stoppedAlloc, now))
   271  			}
   272  		}
   273  
   274  		for _, allocList := range result.NodeAllocation {
   275  			req.AllocsUpdated = append(req.AllocsUpdated, allocList...)
   276  		}
   277  
   278  		// Set the time the alloc was applied for the first time. This can be used
   279  		// to approximate the scheduling time.
   280  		updateAllocTimestamps(req.AllocsUpdated, now)
   281  
   282  		err := p.signAllocIdentities(plan.Job, req.AllocsUpdated)
   283  		if err != nil {
   284  			return nil, err
   285  		}
   286  
   287  		for _, preemptions := range result.NodePreemptions {
   288  			for _, preemptedAlloc := range preemptions {
   289  				req.AllocsPreempted = append(req.AllocsPreempted, normalizePreemptedAlloc(preemptedAlloc, now))
   290  
   291  				// Gather jobids to create follow up evals
   292  				appendNamespacedJobID(preemptedJobIDs, preemptedAlloc)
   293  			}
   294  		}
   295  	} else {
   296  		// COMPAT 0.11: This branch is deprecated and will only be used to support
   297  		// application of older log entries. Expected to be removed in a future version.
   298  
   299  		// Determine the minimum number of updates; there could be more if
   300  		// there are multiple updates per node.
   301  		minUpdates := len(result.NodeUpdate)
   302  		minUpdates += len(result.NodeAllocation)
   303  
   304  		// Initialize using the older log entry format for Alloc and NodePreemptions
   305  		req.Alloc = make([]*structs.Allocation, 0, minUpdates)
   306  		req.NodePreemptions = make([]*structs.Allocation, 0, len(result.NodePreemptions))
   307  
   308  		for _, updateList := range result.NodeUpdate {
   309  			req.Alloc = append(req.Alloc, updateList...)
   310  		}
   311  		for _, allocList := range result.NodeAllocation {
   312  			req.Alloc = append(req.Alloc, allocList...)
   313  		}
   314  
   315  		for _, preemptions := range result.NodePreemptions {
   316  			req.NodePreemptions = append(req.NodePreemptions, preemptions...)
   317  		}
   318  
   319  		// Set the time the alloc was applied for the first time. This can be used
   320  		// to approximate the scheduling time.
   321  		updateAllocTimestamps(req.Alloc, now)
   322  
   323  		// Set modify time for preempted allocs if any
   324  		// Also gather jobids to create follow up evals
   325  		for _, alloc := range req.NodePreemptions {
   326  			alloc.ModifyTime = now
   327  			appendNamespacedJobID(preemptedJobIDs, alloc)
   328  		}
   329  	}
   330  
   331  	var evals []*structs.Evaluation
   332  	for preemptedJobID := range preemptedJobIDs {
   333  		job, _ := p.State().JobByID(nil, preemptedJobID.Namespace, preemptedJobID.ID)
   334  		if job != nil {
   335  			eval := &structs.Evaluation{
   336  				ID:          uuid.Generate(),
   337  				Namespace:   job.Namespace,
   338  				TriggeredBy: structs.EvalTriggerPreemption,
   339  				JobID:       job.ID,
   340  				Type:        job.Type,
   341  				Priority:    job.Priority,
   342  				Status:      structs.EvalStatusPending,
   343  				CreateTime:  now,
   344  				ModifyTime:  now,
   345  			}
   346  			evals = append(evals, eval)
   347  		}
   348  	}
   349  	req.PreemptionEvals = evals
   350  
   351  	// Dispatch the Raft transaction
   352  	future, err := p.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req)
   353  	if err != nil {
   354  		return nil, err
   355  	}
   356  
   357  	// Optimistically apply to our state view
   358  	if snap != nil {
   359  		nextIdx := p.raft.AppliedIndex() + 1
   360  		if err := snap.UpsertPlanResults(structs.ApplyPlanResultsRequestType, nextIdx, &req); err != nil {
   361  			return future, err
   362  		}
   363  	}
   364  	return future, nil
   365  }
   366  
   367  // normalizePreemptedAlloc removes redundant fields from a preempted allocation and
   368  // returns AllocationDiff. Since a preempted allocation is always an existing allocation,
   369  // the struct returned by this method contains only the differential, which can be
   370  // applied to an existing allocation to yield the updated struct.
   371  func normalizePreemptedAlloc(preemptedAlloc *structs.Allocation, now int64) *structs.AllocationDiff {
   372  	return &structs.AllocationDiff{
   373  		ID:                    preemptedAlloc.ID,
   374  		PreemptedByAllocation: preemptedAlloc.PreemptedByAllocation,
   375  		ModifyTime:            now,
   376  	}
   377  }
   378  
   379  // normalizeStoppedAlloc removes redundant fields from a stopped allocation and
   380  // returns AllocationDiff. Since a stopped allocation is always an existing allocation,
   381  // the struct returned by this method contains only the differential, which can be
   382  // applied to an existing allocation to yield the updated struct.
   383  func normalizeStoppedAlloc(stoppedAlloc *structs.Allocation, now int64) *structs.AllocationDiff {
   384  	return &structs.AllocationDiff{
   385  		ID:                 stoppedAlloc.ID,
   386  		DesiredDescription: stoppedAlloc.DesiredDescription,
   387  		ClientStatus:       stoppedAlloc.ClientStatus,
   388  		ModifyTime:         now,
   389  		FollowupEvalID:     stoppedAlloc.FollowupEvalID,
   390  	}
   391  }
   392  
   393  // appendNamespacedJobID appends the namespaced Job ID for the alloc to the jobIDs set
   394  func appendNamespacedJobID(jobIDs map[structs.NamespacedID]struct{}, alloc *structs.Allocation) {
   395  	id := structs.NamespacedID{Namespace: alloc.Namespace, ID: alloc.JobID}
   396  	if _, ok := jobIDs[id]; !ok {
   397  		jobIDs[id] = struct{}{}
   398  	}
   399  }
   400  
   401  // updateAllocTimestamps sets the CreateTime and ModifyTime for the allocations
   402  // to the timestamp provided
   403  func updateAllocTimestamps(allocations []*structs.Allocation, timestamp int64) {
   404  	for _, alloc := range allocations {
   405  		if alloc.CreateTime == 0 {
   406  			alloc.CreateTime = timestamp
   407  		}
   408  		alloc.ModifyTime = timestamp
   409  	}
   410  }
   411  
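// signAllocIdentities signs the identity claims for each task of each
// allocation and stores the resulting tokens in the allocation's
// SignedIdentities map, recording the ID of the signing key.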
   412  func (p *planner) signAllocIdentities(job *structs.Job, allocations []*structs.Allocation) error {
   413  
   414  	encrypter := p.Server.encrypter
   415  
   416  	for _, alloc := range allocations {
   417  		alloc.SignedIdentities = map[string]string{}
   418  		tg := job.LookupTaskGroup(alloc.TaskGroup)
   419  		for _, task := range tg.Tasks {
   420  			claims := alloc.ToTaskIdentityClaims(job, task.Name)
   421  			token, keyID, err := encrypter.SignClaims(claims)
   422  			if err != nil {
   423  				return err
   424  			}
   425  			alloc.SignedIdentities[task.Name] = token
   426  			alloc.SigningKeyID = keyID
   427  		}
   428  	}
   429  	return nil
   430  }
   431  
   432  // asyncPlanWait is used to wait for a plan to be applied and to respond to it
   433  // asynchronously. On successful commit the plan's index will be sent on the
   434  // chan. On error the chan will be closed without an index being sent.
   435  func (p *planner) asyncPlanWait(indexCh chan<- uint64, future raft.ApplyFuture,
   436  	result *structs.PlanResult, pending *pendingPlan) {
   437  	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
   438  	defer close(indexCh)
   439  
   440  	// Wait for the plan to apply
   441  	if err := future.Error(); err != nil {
   442  		p.logger.Error("failed to apply plan", "error", err)
   443  		pending.respond(nil, err)
   444  		return
   445  	}
   446  
   447  	// Respond to the plan
   448  	index := future.Index()
   449  	result.AllocIndex = index
   450  
   451  	// If this is a partial plan application, we need to ensure the scheduler
   452  	// at least has visibility into any placements it made to avoid double placement.
   453  	// The RefreshIndex computed by evaluatePlan may be stale due to evaluation
   454  	// against an optimistic copy of the state.
   455  	if result.RefreshIndex != 0 {
   456  		result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex)
   457  	}
   458  	pending.respond(result, nil)
   459  	indexCh <- index
   460  }
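// Note on the channel contract above: on failure indexCh is closed without an
// index ever being sent, so the receive in planApply yields the zero value 0;
// that is why planApply folds the result in with max(prevPlanResultIndex, idx).
// A minimal sketch of the same contract (illustration only; doApply and
// committedIndex are invented stand-ins):
//
//	ch := make(chan uint64, 1)
//	go func() {
//		defer close(ch)
//		if err := doApply(); err != nil {
//			return // closed with no send: the receiver sees 0
//		}
//		ch <- committedIndex()
//	}()
//
//	if idx := <-ch; idx != 0 {
//		// the plan committed at index idx
//	}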
   461  
   462  // evaluatePlan is used to determine what portions of a plan can be applied,
   463  // if any. It returns the plan result to apply, which may be partial, or an
   464  // error.
   465  func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger log.Logger) (*structs.PlanResult, error) {
   466  	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())
   467  
   468  	logger.Trace("evaluating plan", "plan", log.Fmt("%#v", plan))
   469  
   470  	// Denormalize without the job
   471  	err := snap.DenormalizeAllocationsMap(plan.NodeUpdate)
   472  	if err != nil {
   473  		return nil, err
   474  	}
   475  	// Denormalize without the job
   476  	err = snap.DenormalizeAllocationsMap(plan.NodePreemptions)
   477  	if err != nil {
   478  		return nil, err
   479  	}
   480  
   481  	// Check if the plan exceeds quota
   482  	overQuota, err := evaluatePlanQuota(snap, plan)
   483  	if err != nil {
   484  		return nil, err
   485  	}
   486  
   487  	// Reject the plan and force the scheduler to refresh
   488  	if overQuota {
   489  		index, err := refreshIndex(snap)
   490  		if err != nil {
   491  			return nil, err
   492  		}
   493  
   494  		logger.Debug("plan for evaluation exceeds quota limit. Forcing state refresh", "eval_id", plan.EvalID, "refresh_index", index)
   495  		return &structs.PlanResult{RefreshIndex: index}, nil
   496  	}
   497  
   498  	return evaluatePlanPlacements(pool, snap, plan, logger)
   499  }
   500  
   501  // evaluatePlanPlacements is used to determine what portions of a plan can be
   502  // applied, if any, looking for node overcommitment. It returns the plan result
   503  // to apply, which may be partial, or an error.
   504  func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger log.Logger) (*structs.PlanResult, error) {
   505  	// Create a result holder for the plan
   506  	result := &structs.PlanResult{
   507  		NodeUpdate:        make(map[string][]*structs.Allocation),
   508  		NodeAllocation:    make(map[string][]*structs.Allocation),
   509  		Deployment:        plan.Deployment.Copy(),
   510  		DeploymentUpdates: plan.DeploymentUpdates,
   511  		NodePreemptions:   make(map[string][]*structs.Allocation),
   512  	}
   513  
   514  	// Collect all the nodeIDs
   515  	nodeIDs := make(map[string]struct{})
   516  	nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation))
   517  	for nodeID := range plan.NodeUpdate {
   518  		if _, ok := nodeIDs[nodeID]; !ok {
   519  			nodeIDs[nodeID] = struct{}{}
   520  			nodeIDList = append(nodeIDList, nodeID)
   521  		}
   522  	}
   523  	for nodeID := range plan.NodeAllocation {
   524  		if _, ok := nodeIDs[nodeID]; !ok {
   525  			nodeIDs[nodeID] = struct{}{}
   526  			nodeIDList = append(nodeIDList, nodeID)
   527  		}
   528  	}
   529  
   530  	// Set up a multierror to collect the potentially many errors,
   531  	// since we are processing in parallel.
   532  	var mErr multierror.Error
   533  	partialCommit := false
   534  	rejectedNodes := make(map[string]struct{}, 0)
   535  
   536  	// handleResult is used to process the result of evaluateNodePlan
   537  	handleResult := func(nodeID string, fit bool, reason string, err error) (cancel bool) {
   538  		// Evaluate the plan for this node
   539  		if err != nil {
   540  			mErr.Errors = append(mErr.Errors, err)
   541  			return true
   542  		}
   543  		if !fit {
   544  			metrics.IncrCounterWithLabels([]string{"nomad", "plan", "node_rejected"}, 1, []metrics.Label{{Name: "node_id", Value: nodeID}})
   545  
   546  			// Log the reason why the node's allocations could not be made
   547  			if reason != "" {
   548  				//TODO This was debug level and should return
   549  				//to debug level in the future. However until
   550  				//https://github.com/hernad/nomad/issues/9506
   551  				//is resolved this log line is the only way to
   552  				//monitor the disagreement between workers and
   553  				//the plan applier.
   554  				logger.Info("plan for node rejected, refer to https://www.nomadproject.io/s/port-plan-failure for more information",
   555  					"node_id", nodeID, "reason", reason, "eval_id", plan.EvalID,
   556  					"namespace", plan.Job.Namespace)
   557  			}
   558  			// Set that this is a partial commit and store the node that was
   559  			// rejected so the plan applier can detect repeated plan rejections
   560  			// for the same node.
   561  			partialCommit = true
   562  			rejectedNodes[nodeID] = struct{}{}
   563  
   564  			// If we require all-at-once scheduling, there is no point
   565  			// in continuing the evaluation, as we have already failed.
   566  			if plan.AllAtOnce {
   567  				result.NodeUpdate = nil
   568  				result.NodeAllocation = nil
   569  				result.DeploymentUpdates = nil
   570  				result.Deployment = nil
   571  				result.NodePreemptions = nil
   572  				return true
   573  			}
   574  
   575  			// Skip this node, since it cannot be used.
   576  			return
   577  		}
   578  
   579  		// Add this to the plan result
   580  		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
   581  			result.NodeUpdate[nodeID] = nodeUpdate
   582  		}
   583  		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
   584  			result.NodeAllocation[nodeID] = nodeAlloc
   585  		}
   586  
   587  		if nodePreemptions := plan.NodePreemptions[nodeID]; nodePreemptions != nil {
   588  
   589  			// Do a pass over preempted allocs in the plan to check
   590  			// whether the alloc is already in a terminal state
   591  			var filteredNodePreemptions []*structs.Allocation
   592  			for _, preemptedAlloc := range nodePreemptions {
   593  				alloc, err := snap.AllocByID(nil, preemptedAlloc.ID)
   594  				if err != nil {
   595  					mErr.Errors = append(mErr.Errors, err)
   596  					continue
   597  				}
   598  				if alloc != nil && !alloc.TerminalStatus() {
   599  					filteredNodePreemptions = append(filteredNodePreemptions, preemptedAlloc)
   600  				}
   601  			}
   602  
   603  			result.NodePreemptions[nodeID] = filteredNodePreemptions
   604  		}
   605  
   606  		return
   607  	}
   608  
   609  	// Get the pool channels
   610  	req := pool.RequestCh()
   611  	resp := pool.ResultCh()
   612  	outstanding := 0
   613  	didCancel := false
   614  
   615  	// Evaluate each node in the plan, handling results as they are ready to
   616  	// avoid blocking.
   617  OUTER:
   618  	for len(nodeIDList) > 0 {
   619  		nodeID := nodeIDList[0]
   620  		select {
   621  		case req <- evaluateRequest{snap, plan, nodeID}:
   622  			outstanding++
   623  			nodeIDList = nodeIDList[1:]
   624  		case r := <-resp:
   625  			outstanding--
   626  
   627  			// Handle a result that allows us to cancel evaluation,
   628  			// which may save time processing additional entries.
   629  			if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
   630  				didCancel = true
   631  				break OUTER
   632  			}
   633  		}
   634  	}
   635  
   636  	// Drain the remaining results
   637  	for outstanding > 0 {
   638  		r := <-resp
   639  		if !didCancel {
   640  			if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
   641  				didCancel = true
   642  			}
   643  		}
   644  		outstanding--
   645  	}
   646  
   647  	// If the plan resulted in a partial commit, we need to determine
   648  	// a minimum refresh index to force the scheduler to work on a more
   649  	// up-to-date state to avoid the failures.
   650  	if partialCommit {
   651  		index, err := refreshIndex(snap)
   652  		if err != nil {
   653  			mErr.Errors = append(mErr.Errors, err)
   654  		}
   655  		result.RefreshIndex = index
   656  
   657  		if result.RefreshIndex == 0 {
   658  			err := fmt.Errorf("partialCommit with RefreshIndex of 0")
   659  			mErr.Errors = append(mErr.Errors, err)
   660  		}
   661  
   662  		// If there was a partial commit and we are operating within a
   663  		// deployment, correct for any canary that was meant to be placed
   664  		// but was not actually placed.
   665  		correctDeploymentCanaries(result)
   666  	}
   667  
   668  	for n := range rejectedNodes {
   669  		result.RejectedNodes = append(result.RejectedNodes, n)
   670  	}
   671  	return result, mErr.ErrorOrNil()
   672  }
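// The OUTER select loop above is a general pattern for feeding a bounded
// worker pool without deadlocking when its request channel fills up: keep
// draining results while submitting, then drain whatever is still outstanding.
// A simplified sketch of the same pattern (illustration only; workCh,
// resultCh, handle, and the pending slice are invented):
//
//	outstanding := 0
//	for len(pending) > 0 {
//		select {
//		case workCh <- pending[0]:
//			pending = pending[1:]
//			outstanding++
//		case r := <-resultCh: // keep draining so workers never block
//			outstanding--
//			handle(r)
//		}
//	}
//	for outstanding > 0 { // collect results for work already submitted
//		handle(<-resultCh)
//		outstanding--
//	}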
   673  
   674  // correctDeploymentCanaries ensures that the deployment object doesn't list any
   675  // canaries as placed if they didn't actually get placed. This could happen if
   676  // the plan had a partial commit.
   677  func correctDeploymentCanaries(result *structs.PlanResult) {
   678  	// Hot path
   679  	if result.Deployment == nil || !result.Deployment.HasPlacedCanaries() {
   680  		return
   681  	}
   682  
   683  	// Build a set of all the allocation IDs that were placed
   684  	placedAllocs := make(map[string]struct{}, len(result.NodeAllocation))
   685  	for _, placed := range result.NodeAllocation {
   686  		for _, alloc := range placed {
   687  			placedAllocs[alloc.ID] = struct{}{}
   688  		}
   689  	}
   690  
   691  	// Go through all the canaries and ensure that the result list only contains
   692  	// those that have been placed
   693  	for _, group := range result.Deployment.TaskGroups {
   694  		canaries := group.PlacedCanaries
   695  		if len(canaries) == 0 {
   696  			continue
   697  		}
   698  
   699  		// Prune the canaries in place to avoid allocating an extra slice
   700  		i := 0
   701  		for _, canaryID := range canaries {
   702  			if _, ok := placedAllocs[canaryID]; ok {
   703  				canaries[i] = canaryID
   704  				i++
   705  			}
   706  		}
   707  
   708  		group.PlacedCanaries = canaries[:i]
   709  	}
   710  }
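// The index-write loop above uses the standard Go idiom for filtering a slice
// in place without allocating a second slice. A generic sketch of the idiom
// (illustration only):
//
//	func filterInPlace(ids []string, keep func(string) bool) []string {
//		n := 0
//		for _, id := range ids {
//			if keep(id) {
//				ids[n] = id
//				n++
//			}
//		}
//		return ids[:n]
//	}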
   711  
   712  // evaluateNodePlan is used to evaluate the plan for a single node, returning
   713  // whether the plan fits, the reason if it does not, and any error encountered.
   714  func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, string, error) {
   715  	// If this is an evict-only plan, it always 'fits' since we are removing things.
   716  	if len(plan.NodeAllocation[nodeID]) == 0 {
   717  		return true, "", nil
   718  	}
   719  
   720  	// Get the node itself
   721  	ws := memdb.NewWatchSet()
   722  	node, err := snap.NodeByID(ws, nodeID)
   723  	if err != nil {
   724  		return false, "", fmt.Errorf("failed to get node '%s': %v", nodeID, err)
   725  	}
   726  
   727  	// If the node does not exist or is not ready for scheduling, it is not fit.
   728  	// XXX: There is a potential race between when we do this check and when
   729  	// the Raft commit happens.
   730  	if node == nil {
   731  		return false, "node does not exist", nil
   732  	} else if node.Status == structs.NodeStatusDisconnected {
   733  		if isValidForDisconnectedNode(plan, node.ID) {
   734  			return true, "", nil
   735  		}
   736  		return false, "node is disconnected and contains invalid updates", nil
   737  	} else if node.Status != structs.NodeStatusReady {
   738  		return false, "node is not ready for placements", nil
   739  	}
   740  
   741  	// Get the existing allocations that are non-terminal
   742  	existingAlloc, err := snap.AllocsByNodeTerminal(ws, nodeID, false)
   743  	if err != nil {
   744  		return false, "", fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
   745  	}
   746  
   747  	// If the plan's node allocations are a subset of the existing allocations, we
   748  	// can continue even if the node is not eligible, as only in-place updates or stop/evict are performed
   749  	if structs.AllocSubset(existingAlloc, plan.NodeAllocation[nodeID]) {
   750  		return true, "", nil
   751  	}
   752  	if node.SchedulingEligibility == structs.NodeSchedulingIneligible {
   753  		return false, "node is not eligible", nil
   754  	}
   755  
   756  	// Determine the proposed allocation by first removing allocations
   757  	// that are planned evictions and adding the new allocations.
   758  	var remove []*structs.Allocation
   759  	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
   760  		remove = append(remove, update...)
   761  	}
   762  
   763  	// Remove any preempted allocs
   764  	if preempted := plan.NodePreemptions[nodeID]; len(preempted) > 0 {
   765  		remove = append(remove, preempted...)
   766  	}
   767  
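	// Remove any allocations that are being updated so they are not counted
	// twice when the plan's allocations are appended to the proposed set below.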
   768  	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
   769  		remove = append(remove, updated...)
   770  	}
   771  	proposed := structs.RemoveAllocs(existingAlloc, remove)
   772  	proposed = append(proposed, plan.NodeAllocation[nodeID]...)
   773  
   774  	// Check if these allocations fit
   775  	fit, reason, _, err := structs.AllocsFit(node, proposed, nil, true)
   776  	return fit, reason, err
   777  }
   778  
   779  // isValidForDisconnectedNode returns true only if the plan for the given node
   780  // contains nothing but updates that mark allocations as unknown.
   781  func isValidForDisconnectedNode(plan *structs.Plan, nodeID string) bool {
   782  	for _, alloc := range plan.NodeAllocation[nodeID] {
   783  		if alloc.ClientStatus != structs.AllocClientStatusUnknown {
   784  			return false
   785  		}
   786  	}
   787  
   788  	return true
   789  }
   790  
   791  func max(a, b uint64) uint64 {
   792  	if a > b {
   793  		return a
   794  	}
   795  	return b
   796  }