github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/plan_apply.go (about)

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"runtime"
     7  	"time"
     8  
     9  	"github.com/armon/go-metrics"
    10  	memdb "github.com/hashicorp/go-memdb"
    11  	"github.com/hashicorp/go-multierror"
    12  	"github.com/hashicorp/nomad/nomad/state"
    13  	"github.com/hashicorp/nomad/nomad/structs"
    14  	"github.com/hashicorp/raft"
    15  )
    16  
    17  // planApply is a long lived goroutine that reads plan allocations from
    18  // the plan queue, determines if they can be applied safely and applies
    19  // them via Raft.
    20  //
    21  // Naively, we could simply dequeue a plan, verify, apply and then respond.
    22  // However, the plan application is bounded by the Raft apply time and
    23  // subject to some latency. This creates a stall condition, where we are
    24  // not evaluating, but simply waiting for a transaction to apply.
    25  //
    26  // To avoid this, we overlap verification with apply. This means once
    27  // we've verified plan N we attempt to apply it. However, while waiting
    28  // for apply, we begin to verify plan N+1 under the assumption that plan
    29  // N has succeeded.
    30  //
    31  // In this sense, we track two parallel versions of the world. One is
    32  // the pessimistic one driven by the Raft log which is replicated. The
    33  // other is optimistic and assumes our transactions will succeed. In the
    34  // happy path, this lets us do productive work during the latency of
    35  // apply.
    36  //
    37  // In the unhappy path (Raft transaction fails), effectively we only
    38  // wasted work during a time we would have been waiting anyways. However,
    39  // in anticipation of this case we cannot respond to the plan until
    40  // the Raft log is updated. This means our schedulers will stall,
    41  // but there are many of those and only a single plan verifier.
    42  //
    43  func (s *Server) planApply() {
    44  	// waitCh is used to track an outstanding application while snap
    45  	// holds an optimistic state which includes that plan application.
    46  	var waitCh chan struct{}
    47  	var snap *state.StateSnapshot
    48  
    49  	// Setup a worker pool with half the cores, with at least 1
    50  	poolSize := runtime.NumCPU() / 2
    51  	if poolSize == 0 {
    52  		poolSize = 1
    53  	}
    54  	pool := NewEvaluatePool(poolSize, workerPoolBufferSize)
    55  	defer pool.Shutdown()
    56  
    57  	for {
    58  		// Pull the next pending plan, exit if we are no longer leader
    59  		pending, err := s.planQueue.Dequeue(0)
    60  		if err != nil {
    61  			return
    62  		}
    63  
    64  		// Check if out last plan has completed
    65  		select {
    66  		case <-waitCh:
    67  			waitCh = nil
    68  			snap = nil
    69  		default:
    70  		}
    71  
    72  		// Snapshot the state so that we have a consistent view of the world
    73  		// if no snapshot is available
    74  		if waitCh == nil || snap == nil {
    75  			snap, err = s.fsm.State().Snapshot()
    76  			if err != nil {
    77  				s.logger.Printf("[ERR] nomad.planner: failed to snapshot state: %v", err)
    78  				pending.respond(nil, err)
    79  				continue
    80  			}
    81  		}
    82  
    83  		// Evaluate the plan
    84  		result, err := evaluatePlan(pool, snap, pending.plan, s.logger)
    85  		if err != nil {
    86  			s.logger.Printf("[ERR] nomad.planner: failed to evaluate plan: %v", err)
    87  			pending.respond(nil, err)
    88  			continue
    89  		}
    90  
    91  		// Fast-path the response if there is nothing to do
    92  		if result.IsNoOp() {
    93  			pending.respond(result, nil)
    94  			continue
    95  		}
    96  
    97  		// Ensure any parallel apply is complete before starting the next one.
    98  		// This also limits how out of date our snapshot can be.
    99  		if waitCh != nil {
   100  			<-waitCh
   101  			snap, err = s.fsm.State().Snapshot()
   102  			if err != nil {
   103  				s.logger.Printf("[ERR] nomad.planner: failed to snapshot state: %v", err)
   104  				pending.respond(nil, err)
   105  				continue
   106  			}
   107  		}
   108  
   109  		// Dispatch the Raft transaction for the plan
   110  		future, err := s.applyPlan(pending.plan, result, snap)
   111  		if err != nil {
   112  			s.logger.Printf("[ERR] nomad.planner: failed to submit plan: %v", err)
   113  			pending.respond(nil, err)
   114  			continue
   115  		}
   116  
   117  		// Respond to the plan in async
   118  		waitCh = make(chan struct{})
   119  		go s.asyncPlanWait(waitCh, future, result, pending)
   120  	}
   121  }
   122  
   123  // applyPlan is used to apply the plan result and to return the alloc index
   124  func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
   125  	// Determine the minimum number of updates, could be more if there
   126  	// are multiple updates per node
   127  	minUpdates := len(result.NodeUpdate)
   128  	minUpdates += len(result.NodeAllocation)
   129  
   130  	// Setup the update request
   131  	req := structs.ApplyPlanResultsRequest{
   132  		AllocUpdateRequest: structs.AllocUpdateRequest{
   133  			Job:   plan.Job,
   134  			Alloc: make([]*structs.Allocation, 0, minUpdates),
   135  		},
   136  		Deployment:        result.Deployment,
   137  		DeploymentUpdates: result.DeploymentUpdates,
   138  		EvalID:            plan.EvalID,
   139  	}
   140  	for _, updateList := range result.NodeUpdate {
   141  		req.Alloc = append(req.Alloc, updateList...)
   142  	}
   143  	for _, allocList := range result.NodeAllocation {
   144  		req.Alloc = append(req.Alloc, allocList...)
   145  	}
   146  
   147  	// Set the time the alloc was applied for the first time. This can be used
   148  	// to approximate the scheduling time.
   149  	now := time.Now().UTC().UnixNano()
   150  	for _, alloc := range req.Alloc {
   151  		if alloc.CreateTime == 0 {
   152  			alloc.CreateTime = now
   153  		}
   154  		alloc.ModifyTime = now
   155  	}
   156  
   157  	// Dispatch the Raft transaction
   158  	future, err := s.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req)
   159  	if err != nil {
   160  		return nil, err
   161  	}
   162  
   163  	// Optimistically apply to our state view
   164  	if snap != nil {
   165  		nextIdx := s.raft.AppliedIndex() + 1
   166  		if err := snap.UpsertPlanResults(nextIdx, &req); err != nil {
   167  			return future, err
   168  		}
   169  	}
   170  	return future, nil
   171  }
   172  
   173  // asyncPlanWait is used to apply and respond to a plan async
   174  func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
   175  	result *structs.PlanResult, pending *pendingPlan) {
   176  	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
   177  	defer close(waitCh)
   178  
   179  	// Wait for the plan to apply
   180  	if err := future.Error(); err != nil {
   181  		s.logger.Printf("[ERR] nomad.planner: failed to apply plan: %v", err)
   182  		pending.respond(nil, err)
   183  		return
   184  	}
   185  
   186  	// Respond to the plan
   187  	result.AllocIndex = future.Index()
   188  
   189  	// If this is a partial plan application, we need to ensure the scheduler
   190  	// at least has visibility into any placements it made to avoid double placement.
   191  	// The RefreshIndex computed by evaluatePlan may be stale due to evaluation
   192  	// against an optimistic copy of the state.
   193  	if result.RefreshIndex != 0 {
   194  		result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex)
   195  	}
   196  	pending.respond(result, nil)
   197  }
   198  
   199  // evaluatePlan is used to determine what portions of a plan
   200  // can be applied if any. Returns if there should be a plan application
   201  // which may be partial or if there was an error
   202  func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger *log.Logger) (*structs.PlanResult, error) {
   203  	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())
   204  
   205  	// Check if the plan exceeds quota
   206  	overQuota, err := evaluatePlanQuota(snap, plan)
   207  	if err != nil {
   208  		return nil, err
   209  	}
   210  
   211  	// Reject the plan and force the scheduler to refresh
   212  	if overQuota {
   213  		index, err := refreshIndex(snap)
   214  		if err != nil {
   215  			return nil, err
   216  		}
   217  
   218  		logger.Printf("[DEBUG] nomad.planner: plan for evaluation %q exceeds quota limit. Forcing refresh to %d", plan.EvalID, index)
   219  		return &structs.PlanResult{RefreshIndex: index}, nil
   220  	}
   221  
   222  	return evaluatePlanPlacements(pool, snap, plan, logger)
   223  }
   224  
   225  // evaluatePlanPlacements is used to determine what portions of a plan can be
   226  // applied if any, looking for node over commitment. Returns if there should be
   227  // a plan application which may be partial or if there was an error
   228  func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger *log.Logger) (*structs.PlanResult, error) {
   229  	// Create a result holder for the plan
   230  	result := &structs.PlanResult{
   231  		NodeUpdate:        make(map[string][]*structs.Allocation),
   232  		NodeAllocation:    make(map[string][]*structs.Allocation),
   233  		Deployment:        plan.Deployment.Copy(),
   234  		DeploymentUpdates: plan.DeploymentUpdates,
   235  	}
   236  
   237  	// Collect all the nodeIDs
   238  	nodeIDs := make(map[string]struct{})
   239  	nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation))
   240  	for nodeID := range plan.NodeUpdate {
   241  		if _, ok := nodeIDs[nodeID]; !ok {
   242  			nodeIDs[nodeID] = struct{}{}
   243  			nodeIDList = append(nodeIDList, nodeID)
   244  		}
   245  	}
   246  	for nodeID := range plan.NodeAllocation {
   247  		if _, ok := nodeIDs[nodeID]; !ok {
   248  			nodeIDs[nodeID] = struct{}{}
   249  			nodeIDList = append(nodeIDList, nodeID)
   250  		}
   251  	}
   252  
   253  	// Setup a multierror to handle potentially getting many
   254  	// errors since we are processing in parallel.
   255  	var mErr multierror.Error
   256  	partialCommit := false
   257  
   258  	// handleResult is used to process the result of evaluateNodePlan
   259  	handleResult := func(nodeID string, fit bool, reason string, err error) (cancel bool) {
   260  		// Evaluate the plan for this node
   261  		if err != nil {
   262  			mErr.Errors = append(mErr.Errors, err)
   263  			return true
   264  		}
   265  		if !fit {
   266  			// Log the reason why the node's allocations could not be made
   267  			if reason != "" {
   268  				logger.Printf("[DEBUG] nomad.planner: plan for node %q rejected because: %v", nodeID, reason)
   269  			}
   270  			// Set that this is a partial commit
   271  			partialCommit = true
   272  
   273  			// If we require all-at-once scheduling, there is no point
   274  			// to continue the evaluation, as we've already failed.
   275  			if plan.AllAtOnce {
   276  				result.NodeUpdate = nil
   277  				result.NodeAllocation = nil
   278  				result.DeploymentUpdates = nil
   279  				result.Deployment = nil
   280  				return true
   281  			}
   282  
   283  			// Skip this node, since it cannot be used.
   284  			return
   285  		}
   286  
   287  		// Add this to the plan result
   288  		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
   289  			result.NodeUpdate[nodeID] = nodeUpdate
   290  		}
   291  		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
   292  			result.NodeAllocation[nodeID] = nodeAlloc
   293  		}
   294  		return
   295  	}
   296  
   297  	// Get the pool channels
   298  	req := pool.RequestCh()
   299  	resp := pool.ResultCh()
   300  	outstanding := 0
   301  	didCancel := false
   302  
   303  	// Evaluate each node in the plan, handling results as they are ready to
   304  	// avoid blocking.
   305  OUTER:
   306  	for len(nodeIDList) > 0 {
   307  		nodeID := nodeIDList[0]
   308  		select {
   309  		case req <- evaluateRequest{snap, plan, nodeID}:
   310  			outstanding++
   311  			nodeIDList = nodeIDList[1:]
   312  		case r := <-resp:
   313  			outstanding--
   314  
   315  			// Handle a result that allows us to cancel evaluation,
   316  			// which may save time processing additional entries.
   317  			if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
   318  				didCancel = true
   319  				break OUTER
   320  			}
   321  		}
   322  	}
   323  
   324  	// Drain the remaining results
   325  	for outstanding > 0 {
   326  		r := <-resp
   327  		if !didCancel {
   328  			if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel {
   329  				didCancel = true
   330  			}
   331  		}
   332  		outstanding--
   333  	}
   334  
   335  	// If the plan resulted in a partial commit, we need to determine
   336  	// a minimum refresh index to force the scheduler to work on a more
   337  	// up-to-date state to avoid the failures.
   338  	if partialCommit {
   339  		index, err := refreshIndex(snap)
   340  		if err != nil {
   341  			mErr.Errors = append(mErr.Errors, err)
   342  		}
   343  		result.RefreshIndex = index
   344  
   345  		if result.RefreshIndex == 0 {
   346  			err := fmt.Errorf("partialCommit with RefreshIndex of 0")
   347  			mErr.Errors = append(mErr.Errors, err)
   348  		}
   349  
   350  		// If there was a partial commit and we are operating within a
   351  		// deployment correct for any canary that may have been desired to be
   352  		// placed but wasn't actually placed
   353  		correctDeploymentCanaries(result)
   354  	}
   355  	return result, mErr.ErrorOrNil()
   356  }
   357  
   358  // correctDeploymentCanaries ensures that the deployment object doesn't list any
   359  // canaries as placed if they didn't actually get placed. This could happen if
   360  // the plan had a partial commit.
   361  func correctDeploymentCanaries(result *structs.PlanResult) {
   362  	// Hot path
   363  	if result.Deployment == nil || !result.Deployment.HasPlacedCanaries() {
   364  		return
   365  	}
   366  
   367  	// Build a set of all the allocations IDs that were placed
   368  	placedAllocs := make(map[string]struct{}, len(result.NodeAllocation))
   369  	for _, placed := range result.NodeAllocation {
   370  		for _, alloc := range placed {
   371  			placedAllocs[alloc.ID] = struct{}{}
   372  		}
   373  	}
   374  
   375  	// Go through all the canaries and ensure that the result list only contains
   376  	// those that have been placed
   377  	for _, group := range result.Deployment.TaskGroups {
   378  		canaries := group.PlacedCanaries
   379  		if len(canaries) == 0 {
   380  			continue
   381  		}
   382  
   383  		// Prune the canaries in place to avoid allocating an extra slice
   384  		i := 0
   385  		for _, canaryID := range canaries {
   386  			if _, ok := placedAllocs[canaryID]; ok {
   387  				canaries[i] = canaryID
   388  				i++
   389  			}
   390  		}
   391  
   392  		group.PlacedCanaries = canaries[:i]
   393  	}
   394  }
   395  
   396  // evaluateNodePlan is used to evaluate the plan for a single node,
   397  // returning if the plan is valid or if an error is encountered
   398  func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, string, error) {
   399  	// If this is an evict-only plan, it always 'fits' since we are removing things.
   400  	if len(plan.NodeAllocation[nodeID]) == 0 {
   401  		return true, "", nil
   402  	}
   403  
   404  	// Get the node itself
   405  	ws := memdb.NewWatchSet()
   406  	node, err := snap.NodeByID(ws, nodeID)
   407  	if err != nil {
   408  		return false, "", fmt.Errorf("failed to get node '%s': %v", nodeID, err)
   409  	}
   410  
   411  	// If the node does not exist or is not ready for scheduling it is not fit
   412  	// XXX: There is a potential race between when we do this check and when
   413  	// the Raft commit happens.
   414  	if node == nil {
   415  		return false, "node does not exist", nil
   416  	} else if node.Status != structs.NodeStatusReady {
   417  		return false, "node is not ready for placements", nil
   418  	} else if node.SchedulingEligibility == structs.NodeSchedulingIneligible {
   419  		return false, "node is not eligible for draining", nil
   420  	} else if node.Drain {
   421  		// Deprecate in favor of scheduling eligibility and remove post-0.8
   422  		return false, "node is draining", nil
   423  	}
   424  
   425  	// Get the existing allocations that are non-terminal
   426  	existingAlloc, err := snap.AllocsByNodeTerminal(ws, nodeID, false)
   427  	if err != nil {
   428  		return false, "", fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
   429  	}
   430  
   431  	// Determine the proposed allocation by first removing allocations
   432  	// that are planned evictions and adding the new allocations.
   433  	var remove []*structs.Allocation
   434  	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
   435  		remove = append(remove, update...)
   436  	}
   437  	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
   438  		for _, alloc := range updated {
   439  			remove = append(remove, alloc)
   440  		}
   441  	}
   442  	proposed := structs.RemoveAllocs(existingAlloc, remove)
   443  	proposed = append(proposed, plan.NodeAllocation[nodeID]...)
   444  
   445  	// Check if these allocations fit
   446  	fit, reason, _, err := structs.AllocsFit(node, proposed, nil)
   447  	return fit, reason, err
   448  }