gopkg.in/hashicorp/nomad.v0@v0.11.8/nomad/plan_apply.go

package nomad

import (
	"context"
	"fmt"
	"runtime"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
)

// planner is used to manage the submitted allocation plans that are waiting
// to be assessed by the leader
type planner struct {
	*Server
	log log.Logger

	// planQueue is used to manage the submitted allocation
	// plans that are waiting to be assessed by the leader
	planQueue *PlanQueue
}

// newPlanner returns a new planner to be used for managing allocation plans.
func newPlanner(s *Server) (*planner, error) {
	// Create a plan queue
	planQueue, err := NewPlanQueue()
	if err != nil {
		return nil, err
	}

	return &planner{
		Server:    s,
		log:       s.logger.Named("planner"),
		planQueue: planQueue,
	}, nil
}

// planApply is a long-lived goroutine that reads plan allocations from
// the plan queue, determines if they can be applied safely and applies
// them via Raft.
//
// Naively, we could simply dequeue a plan, verify it, apply it and then
// respond. However, the plan application is bounded by the Raft apply time
// and subject to some latency. This creates a stall condition, where we are
// not evaluating, but simply waiting for a transaction to apply.
//
// To avoid this, we overlap verification with apply. This means once
// we've verified plan N we attempt to apply it. However, while waiting
// for apply, we begin to verify plan N+1 under the assumption that plan
// N has succeeded.
//
// In this sense, we track two parallel versions of the world. One is
// the pessimistic one driven by the Raft log which is replicated. The
// other is optimistic and assumes our transactions will succeed. In the
// happy path, this lets us do productive work during the latency of
// apply.
//
// In the unhappy path (Raft transaction fails), we have effectively only
// wasted work during a time we would have been waiting anyway. However,
// in anticipation of this case we cannot respond to the plan until
// the Raft log is updated. This means our schedulers will stall,
// but there are many of those and only a single plan verifier.
//
func (p *planner) planApply() {
	// planIndexCh is used to track an outstanding application and receive
	// its committed index while snap holds an optimistic state which
	// includes that plan application.
	var planIndexCh chan uint64
	var snap *state.StateSnapshot

	// prevPlanResultIndex is the index at which the last PlanResult was
	// committed. Since only the last plan is optimistically applied to the
	// snapshot, it's possible the current snapshot's and plan's indexes
	// are less than the index the previous plan result was committed at.
	// prevPlanResultIndex also guards against the previous plan committing
	// during Dequeue, thus causing the snapshot containing the optimistic
	// commit to be discarded and potentially evaluating the current plan
	// against an index older than the previous plan was committed at.
	var prevPlanResultIndex uint64

	// Set up a worker pool with half the cores, with at least 1
	poolSize := runtime.NumCPU() / 2
	if poolSize == 0 {
		poolSize = 1
	}
	pool := NewEvaluatePool(poolSize, workerPoolBufferSize)
	defer pool.Shutdown()

	for {
		// Pull the next pending plan, exit if we are no longer leader
		pending, err := p.planQueue.Dequeue(0)
		if err != nil {
			return
		}

		// If the last plan has completed, get a new snapshot
		select {
		case idx := <-planIndexCh:
			// The previous plan committed. Discard the snapshot and
			// ensure future snapshots include this plan. idx may be 0
			// if the plan failed to apply, so use max(prev, idx).
			prevPlanResultIndex = max(prevPlanResultIndex, idx)
			planIndexCh = nil
			snap = nil
		default:
		}

		if snap != nil {
			// If the snapshot doesn't contain both the previous plan
			// result's index and the current plan's snapshot index,
			// discard it and get a new one below.
			minIndex := max(prevPlanResultIndex, pending.plan.SnapshotIndex)
			if idx, err := snap.LatestIndex(); err != nil || idx < minIndex {
				snap = nil
			}
		}

		// If no usable snapshot is available, snapshot the state so that
		// we have a consistent view of the world:
		//  - planIndexCh will be nil if the previous plan result applied
		//    during Dequeue
		//  - snap will be nil if its index < max(prevIndex, curIndex)
		if planIndexCh == nil || snap == nil {
			snap, err = p.snapshotMinIndex(prevPlanResultIndex, pending.plan.SnapshotIndex)
			if err != nil {
				p.logger.Error("failed to snapshot state", "error", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Evaluate the plan
		result, err := evaluatePlan(pool, snap, pending.plan, p.logger)
		if err != nil {
			p.logger.Error("failed to evaluate plan", "error", err)
			pending.respond(nil, err)
			continue
		}

		// Fast-path the response if there is nothing to do
		if result.IsNoOp() {
			pending.respond(result, nil)
			continue
		}

		// Ensure any parallel apply is complete before starting the next one.
		// This also limits how out of date our snapshot can be.
		if planIndexCh != nil {
			idx := <-planIndexCh
			prevPlanResultIndex = max(prevPlanResultIndex, idx)
			snap, err = p.snapshotMinIndex(prevPlanResultIndex, pending.plan.SnapshotIndex)
			if err != nil {
				p.logger.Error("failed to update snapshot state", "error", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Dispatch the Raft transaction for the plan
		future, err := p.applyPlan(pending.plan, result, snap)
		if err != nil {
			p.logger.Error("failed to submit plan", "error", err)
			pending.respond(nil, err)
			continue
		}

		// Respond to the plan asynchronously; receive the plan's committed
		// index via the channel
		planIndexCh = make(chan uint64, 1)
		go p.asyncPlanWait(planIndexCh, future, result, pending)
	}
}
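
// The loop above amounts to the following minimal sketch of the
// verify/apply overlap (illustrative only; verify, apply, and plans are
// hypothetical stand-ins for plan evaluation, the Raft commit, and the
// dequeued plan stream):
//
//	var pendingApply chan uint64 // outstanding apply; nil if none
//	for plan := range plans {
//		result := verify(plan) // evaluate plan N+1 against the optimistic snapshot
//		if pendingApply != nil {
//			<-pendingApply // bound staleness: wait for plan N to commit
//		}
//		pendingApply = make(chan uint64, 1)
//		go func() { pendingApply <- apply(result) }() // commit via Raft
//	}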

// snapshotMinIndex wraps StateStore.SnapshotMinIndex with a 5s timeout and
// converts timeout errors to a more descriptive error message. The returned
// snapshot is guaranteed to include both the previous plan result and all
// objects referenced by the plan; otherwise an error is returned.
func (p *planner) snapshotMinIndex(prevPlanResultIndex, planSnapshotIndex uint64) (*state.StateSnapshot, error) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "wait_for_index"}, time.Now())

	// The minimum index the snapshot must include is the max of the
	// previous plan result's and the current plan's snapshot index.
	minIndex := max(prevPlanResultIndex, planSnapshotIndex)

	const timeout = 5 * time.Second
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	snap, err := p.fsm.State().SnapshotMinIndex(ctx, minIndex)
	cancel()
	if err == context.DeadlineExceeded {
		return nil, fmt.Errorf("timed out after %s waiting for index=%d (previous plan result index=%d; plan snapshot index=%d)",
			timeout, minIndex, prevPlanResultIndex, planSnapshotIndex)
	}

	return snap, err
}
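
// For example (illustrative numbers): with prevPlanResultIndex=10 and
// planSnapshotIndex=7, minIndex is 10, and SnapshotMinIndex blocks until
// the state store has applied Raft index 10 so the previous plan's
// optimistic commit is visible before the next plan is evaluated.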

// applyPlan is used to dispatch the Raft transaction applying the plan
// result and to optimistically update the state snapshot. It returns a
// future that yields the plan's committed index.
func (p *planner) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
	// Set up the update request
	req := structs.ApplyPlanResultsRequest{
		AllocUpdateRequest: structs.AllocUpdateRequest{
			Job: plan.Job,
		},
		Deployment:        result.Deployment,
		DeploymentUpdates: result.DeploymentUpdates,
		EvalID:            plan.EvalID,
	}

	preemptedJobIDs := make(map[structs.NamespacedID]struct{})
	now := time.Now().UTC().UnixNano()

	if ServersMeetMinimumVersion(p.Members(), MinVersionPlanNormalization, true) {
		// Initialize the allocs request using the new optimized log entry
		// format. The capacities are the minimum number of updates; there
		// could be more if there are multiple updates per node.
		req.AllocsStopped = make([]*structs.AllocationDiff, 0, len(result.NodeUpdate))
		req.AllocsUpdated = make([]*structs.Allocation, 0, len(result.NodeAllocation))
		req.AllocsPreempted = make([]*structs.AllocationDiff, 0, len(result.NodePreemptions))

		for _, updateList := range result.NodeUpdate {
			for _, stoppedAlloc := range updateList {
				req.AllocsStopped = append(req.AllocsStopped, normalizeStoppedAlloc(stoppedAlloc, now))
			}
		}

		for _, allocList := range result.NodeAllocation {
			req.AllocsUpdated = append(req.AllocsUpdated, allocList...)
		}

		// Set the time the alloc was applied for the first time. This can
		// be used to approximate the scheduling time.
		updateAllocTimestamps(req.AllocsUpdated, now)

		for _, preemptions := range result.NodePreemptions {
			for _, preemptedAlloc := range preemptions {
				req.AllocsPreempted = append(req.AllocsPreempted, normalizePreemptedAlloc(preemptedAlloc, now))

				// Gather job IDs to create follow-up evals
				appendNamespacedJobID(preemptedJobIDs, preemptedAlloc)
			}
		}
	} else {
		// COMPAT 0.11: This branch is deprecated and will only be used to
		// support application of older log entries. Expected to be removed
		// in a future version.

		// Determine the minimum number of updates; there could be more if
		// there are multiple updates per node
		minUpdates := len(result.NodeUpdate)
		minUpdates += len(result.NodeAllocation)

		// Initialize using the older log entry format for Alloc and NodePreemptions
		req.Alloc = make([]*structs.Allocation, 0, minUpdates)
		req.NodePreemptions = make([]*structs.Allocation, 0, len(result.NodePreemptions))

		for _, updateList := range result.NodeUpdate {
			req.Alloc = append(req.Alloc, updateList...)
		}
		for _, allocList := range result.NodeAllocation {
			req.Alloc = append(req.Alloc, allocList...)
		}

		for _, preemptions := range result.NodePreemptions {
			req.NodePreemptions = append(req.NodePreemptions, preemptions...)
		}

		// Set the time the alloc was applied for the first time. This can
		// be used to approximate the scheduling time.
		updateAllocTimestamps(req.Alloc, now)

		// Set the modify time for preempted allocs, if any, and gather
		// job IDs to create follow-up evals
		for _, alloc := range req.NodePreemptions {
			alloc.ModifyTime = now
			appendNamespacedJobID(preemptedJobIDs, alloc)
		}
	}

	var evals []*structs.Evaluation
	for preemptedJobID := range preemptedJobIDs {
		job, _ := p.State().JobByID(nil, preemptedJobID.Namespace, preemptedJobID.ID)
		if job != nil {
			eval := &structs.Evaluation{
				ID:          uuid.Generate(),
				Namespace:   job.Namespace,
				TriggeredBy: structs.EvalTriggerPreemption,
				JobID:       job.ID,
				Type:        job.Type,
				Priority:    job.Priority,
				Status:      structs.EvalStatusPending,
				CreateTime:  now,
				ModifyTime:  now,
			}
			evals = append(evals, eval)
		}
	}
	req.PreemptionEvals = evals

	// Dispatch the Raft transaction
	future, err := p.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req)
	if err != nil {
		return nil, err
	}

	// Optimistically apply to our state view
	if snap != nil {
		nextIdx := p.raft.AppliedIndex() + 1
		if err := snap.UpsertPlanResults(nextIdx, &req); err != nil {
			return future, err
		}
	}
	return future, nil
}
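
// To illustrate the difference between the two log entry formats above
// (values are hypothetical): stopping alloc "a1" is encoded in the
// normalized format as a minimal diff,
//
//	req.AllocsStopped = []*structs.AllocationDiff{{ID: "a1", ModifyTime: now}}
//
// whereas the deprecated format appends the full Allocation struct to
// req.Alloc, re-shipping fields the followers already have in their state
// stores.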

// normalizePreemptedAlloc removes redundant fields from a preempted
// allocation and returns an AllocationDiff. Since a preempted allocation is
// always an existing allocation, the struct returned by this method contains
// only the differential, which can be applied to the existing allocation to
// yield the updated struct.
func normalizePreemptedAlloc(preemptedAlloc *structs.Allocation, now int64) *structs.AllocationDiff {
	return &structs.AllocationDiff{
		ID:                    preemptedAlloc.ID,
		PreemptedByAllocation: preemptedAlloc.PreemptedByAllocation,
		ModifyTime:            now,
	}
}

// normalizeStoppedAlloc removes redundant fields from a stopped allocation
// and returns an AllocationDiff. Since a stopped allocation is always an
// existing allocation, the struct returned by this method contains only the
// differential, which can be applied to the existing allocation to yield the
// updated struct.
func normalizeStoppedAlloc(stoppedAlloc *structs.Allocation, now int64) *structs.AllocationDiff {
	return &structs.AllocationDiff{
		ID:                 stoppedAlloc.ID,
		DesiredDescription: stoppedAlloc.DesiredDescription,
		ClientStatus:       stoppedAlloc.ClientStatus,
		ModifyTime:         now,
		FollowupEvalID:     stoppedAlloc.FollowupEvalID,
	}
}
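
// For example (hypothetical values): a stopped allocation that carries a
// full job definition, resources, and task states is reduced to the diff
//
//	&structs.AllocationDiff{
//		ID:                 "a1",
//		DesiredDescription: "alloc is not needed",
//		ModifyTime:         now,
//	}
//
// which followers merge onto the copy of the allocation already in their
// state stores.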

// appendNamespacedJobID adds the namespaced Job ID for the alloc to the jobIDs set
func appendNamespacedJobID(jobIDs map[structs.NamespacedID]struct{}, alloc *structs.Allocation) {
	id := structs.NamespacedID{Namespace: alloc.Namespace, ID: alloc.JobID}
	if _, ok := jobIDs[id]; !ok {
		jobIDs[id] = struct{}{}
	}
}

// updateAllocTimestamps sets the ModifyTime for the allocations to the
// timestamp provided, and the CreateTime as well if it is unset
func updateAllocTimestamps(allocations []*structs.Allocation, timestamp int64) {
	for _, alloc := range allocations {
		if alloc.CreateTime == 0 {
			alloc.CreateTime = timestamp
		}
		alloc.ModifyTime = timestamp
	}
}
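
// For example (illustrative): an allocation placed for the first time ends
// up with CreateTime == ModifyTime == timestamp, while an update to an
// existing allocation keeps its original CreateTime and only advances its
// ModifyTime.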

// asyncPlanWait is used to apply and respond to a plan asynchronously. On
// successful commit the plan's index will be sent on the chan. On error the
// chan will be closed.
func (p *planner) asyncPlanWait(indexCh chan<- uint64, future raft.ApplyFuture,
	result *structs.PlanResult, pending *pendingPlan) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())

	// Wait for the plan to apply
	if err := future.Error(); err != nil {
		p.logger.Error("failed to apply plan", "error", err)
		pending.respond(nil, err)

		// Close indexCh on error
		close(indexCh)
		return
	}

	// Respond to the plan
	index := future.Index()
	result.AllocIndex = index

	// If this is a partial plan application, we need to ensure the
	// scheduler at least has visibility into any placements it made to
	// avoid double placement. The RefreshIndex computed by evaluatePlan may
	// be stale due to evaluation against an optimistic copy of the state.
	if result.RefreshIndex != 0 {
		result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex)
	}
	pending.respond(result, nil)
	indexCh <- index
}
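
// For example (illustrative numbers): if evaluatePlan computed
// RefreshIndex=50 from the optimistic snapshot but this partial plan
// committed at Raft index 62, the scheduler is told to refresh to at least
// index 62 so it can see the placements this plan just made.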

// evaluatePlan is used to determine what portions of a plan can be applied,
// if any. It returns the plan result, which may be partial, or an error.
func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger log.Logger) (*structs.PlanResult, error) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())

	// Denormalize the stopped allocations without the job
	err := snap.DenormalizeAllocationsMap(plan.NodeUpdate)
	if err != nil {
		return nil, err
	}
	// Denormalize the preempted allocations without the job
	err = snap.DenormalizeAllocationsMap(plan.NodePreemptions)
	if err != nil {
		return nil, err
	}

	// Check if the plan exceeds quota
	overQuota, err := evaluatePlanQuota(snap, plan)
	if err != nil {
		return nil, err
	}

	// Reject the plan and force the scheduler to refresh
	if overQuota {
		index, err := refreshIndex(snap)
		if err != nil {
			return nil, err
		}

		logger.Debug("plan for evaluation exceeds quota limit. Forcing state refresh", "eval_id", plan.EvalID, "refresh_index", index)
		return &structs.PlanResult{RefreshIndex: index}, nil
	}

	return evaluatePlanPlacements(pool, snap, plan, logger)
}

// evaluatePlanPlacements is used to determine what portions of a plan can be
// applied, if any, looking for node overcommitment. It returns the plan
// result, which may be partial, or an error.
func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger log.Logger) (*structs.PlanResult, error) {
	// Create a result holder for the plan
	result := &structs.PlanResult{
		NodeUpdate:        make(map[string][]*structs.Allocation),
		NodeAllocation:    make(map[string][]*structs.Allocation),
		Deployment:        plan.Deployment.Copy(),
		DeploymentUpdates: plan.DeploymentUpdates,
		NodePreemptions:   make(map[string][]*structs.Allocation),
	}

	// Collect all the nodeIDs
	nodeIDs := make(map[string]struct{})
	nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation))
	for nodeID := range plan.NodeUpdate {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}
	for nodeID := range plan.NodeAllocation {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}

	// Set up a multierror to handle the potentially many errors that can
	// arise since we are processing in parallel.
	var mErr multierror.Error
	partialCommit := false

	// handleResult is used to process the result of evaluateNodePlan
	handleResult := func(nodeID string, fit bool, reason string, err error) (cancel bool) {
		// An error is fatal; record it and cancel the evaluation
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
			return true
		}
		if !fit {
			// Log the reason why the node's allocations could not be made
			if reason != "" {
				logger.Debug("plan for node rejected", "node_id", nodeID, "reason", reason, "eval_id", plan.EvalID)
			}
			// Set that this is a partial commit
			partialCommit = true

			// If we require all-at-once scheduling, there is no point
			// in continuing the evaluation, as we've already failed.
			if plan.AllAtOnce {
				result.NodeUpdate = nil
				result.NodeAllocation = nil
				result.DeploymentUpdates = nil
				result.Deployment = nil
				result.NodePreemptions = nil
				return true
			}

			// Skip this node, since it cannot be used.
			return
		}

		// Add this to the plan result
		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
			result.NodeUpdate[nodeID] = nodeUpdate
		}
		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
			result.NodeAllocation[nodeID] = nodeAlloc
		}

		if nodePreemptions := plan.NodePreemptions[nodeID]; nodePreemptions != nil {

			// Do a pass over the preempted allocs in the plan to filter
			// out any alloc that is already in a terminal state
			var filteredNodePreemptions []*structs.Allocation
			for _, preemptedAlloc := range nodePreemptions {
				alloc, err := snap.AllocByID(nil, preemptedAlloc.ID)
				if err != nil {
					mErr.Errors = append(mErr.Errors, err)
					continue
				}
				if alloc != nil && !alloc.TerminalStatus() {
					filteredNodePreemptions = append(filteredNodePreemptions, preemptedAlloc)
				}
			}

			result.NodePreemptions[nodeID] = filteredNodePreemptions
		}

		return
	}

	// Get the pool channels
	req := pool.RequestCh()
	resp := pool.ResultCh()
	outstanding := 0
	didCancel := false

	// Evaluate each node in the plan, handling results as they are ready to
	// avoid blocking.
OUTER:
	for len(nodeIDList) > 0 {
		nodeID := nodeIDList[0]
		select {
		case req <- evaluateRequest{snap, plan, nodeID}:
			outstanding++
			nodeIDList = nodeIDList[1:]
		case r := <-resp:
			outstanding--

			// Handle a result that allows us to cancel evaluation,
			// which may save time processing additional entries.
			if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
				didCancel = true
				break OUTER
			}
		}
	}

	// Drain the remaining results
	for outstanding > 0 {
		r := <-resp
		if !didCancel {
			if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
				didCancel = true
			}
		}
		outstanding--
	}

	// If the plan resulted in a partial commit, we need to determine a
	// minimum refresh index to force the scheduler to work on a more
	// up-to-date state to avoid these failures.
	if partialCommit {
		index, err := refreshIndex(snap)
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
		result.RefreshIndex = index

		if result.RefreshIndex == 0 {
			err := fmt.Errorf("partialCommit with RefreshIndex of 0")
			mErr.Errors = append(mErr.Errors, err)
		}

		// Since there was a partial commit and we may be operating within
		// a deployment, correct for any canary that was desired to be
		// placed but wasn't actually placed
		correctDeploymentCanaries(result)
	}
	return result, mErr.ErrorOrNil()
}
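
// The send-or-receive select in the OUTER loop above is a general pattern
// for feeding a bounded worker pool without deadlocking on a full request
// channel. A minimal sketch (illustrative only; work and handle are
// hypothetical):
//
//	for len(work) > 0 {
//		select {
//		case req <- work[0]: // submit while the pool has capacity
//			outstanding++
//			work = work[1:]
//		case r := <-resp: // otherwise make room by consuming a result
//			outstanding--
//			handle(r)
//		}
//	}
//	for outstanding > 0 { // drain whatever is still in flight
//		handle(<-resp)
//		outstanding--
//	}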

// correctDeploymentCanaries ensures that the deployment object doesn't list
// any canaries as placed if they didn't actually get placed. This could
// happen if the plan had a partial commit.
func correctDeploymentCanaries(result *structs.PlanResult) {
	// Hot path
	if result.Deployment == nil || !result.Deployment.HasPlacedCanaries() {
		return
	}

	// Build a set of all the allocation IDs that were placed
	placedAllocs := make(map[string]struct{}, len(result.NodeAllocation))
	for _, placed := range result.NodeAllocation {
		for _, alloc := range placed {
			placedAllocs[alloc.ID] = struct{}{}
		}
	}

	// Go through all the canaries and ensure that the result list only
	// contains those that have been placed
	for _, group := range result.Deployment.TaskGroups {
		canaries := group.PlacedCanaries
		if len(canaries) == 0 {
			continue
		}

		// Prune the canaries in place to avoid allocating an extra slice
		i := 0
		for _, canaryID := range canaries {
			if _, ok := placedAllocs[canaryID]; ok {
				canaries[i] = canaryID
				i++
			}
		}

		group.PlacedCanaries = canaries[:i]
	}
}
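
// For example (hypothetical IDs): with PlacedCanaries = ["c1", "c2", "c3"]
// and only "c1" and "c3" present in the plan result's NodeAllocation, the
// in-place prune rewrites the slice to ["c1", "c3"] without allocating a
// new one.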

// evaluateNodePlan is used to evaluate the plan for a single node, returning
// whether the plan fits, a reason if it does not, and any error encountered
func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, string, error) {
	// If this is an evict-only plan, it always 'fits' since we are removing things.
	if len(plan.NodeAllocation[nodeID]) == 0 {
		return true, "", nil
	}

	// Get the node itself
	ws := memdb.NewWatchSet()
	node, err := snap.NodeByID(ws, nodeID)
	if err != nil {
		return false, "", fmt.Errorf("failed to get node '%s': %v", nodeID, err)
	}

	// If the node does not exist or is not ready for scheduling, the plan
	// does not fit.
	// XXX: There is a potential race between when we do this check and when
	// the Raft commit happens.
	if node == nil {
		return false, "node does not exist", nil
	} else if node.Status != structs.NodeStatusReady {
		return false, "node is not ready for placements", nil
	} else if node.SchedulingEligibility == structs.NodeSchedulingIneligible {
		return false, "node is not eligible for placements", nil
	} else if node.Drain {
		// Deprecated in favor of scheduling eligibility; remove post-0.8
		return false, "node is draining", nil
	}

	// Get the existing allocations that are non-terminal
	existingAlloc, err := snap.AllocsByNodeTerminal(ws, nodeID, false)
	if err != nil {
		return false, "", fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
	}

	// Determine the proposed allocations by first removing allocations
	// that are planned evictions and then adding the new allocations.
	var remove []*structs.Allocation
	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
		remove = append(remove, update...)
	}

	// Remove any preempted allocs
	if preempted := plan.NodePreemptions[nodeID]; len(preempted) > 0 {
		remove = append(remove, preempted...)
	}

	// Remove any allocs being updated; they are re-added from
	// plan.NodeAllocation below so each is counted exactly once
	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
		remove = append(remove, updated...)
	}
	proposed := structs.RemoveAllocs(existingAlloc, remove)
	proposed = append(proposed, plan.NodeAllocation[nodeID]...)

	// Check if these allocations fit
	fit, reason, _, err := structs.AllocsFit(node, proposed, nil, true)
	return fit, reason, err
}
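
// For example (hypothetical allocs): if a node currently runs [a, b] and
// the plan evicts a while placing c, then remove = [a] and
//
//	proposed = RemoveAllocs([a, b], [a]) + [c] = [b, c]
//
// so AllocsFit checks the node against exactly the set of allocations that
// would exist if the plan were committed.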

// max returns the larger of two uint64s.
func max(a, b uint64) uint64 {
	if a > b {
		return a
	}
	return b
}