github.com/mattyr/nomad@v0.3.3-0.20160919021406-3485a065154a/nomad/plan_apply.go

github.com/mattyr/nomad@v0.3.3-0.20160919021406-3485a065154a/nomad/plan_apply.go (about)

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"runtime"
     6  	"time"
     7  
     8  	"github.com/armon/go-metrics"
     9  	"github.com/hashicorp/go-multierror"
    10  	"github.com/hashicorp/nomad/nomad/state"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  	"github.com/hashicorp/raft"
    13  )
    14  
    15  // planApply is a long lived goroutine that reads plan allocations from
    16  // the plan queue, determines if they can be applied safely and applies
    17  // them via Raft.
    18  //
    19  // Naively, we could simply dequeue a plan, verify, apply and then respond.
    20  // However, the plan application is bounded by the Raft apply time and
    21  // subject to some latency. This creates a stall condition, where we are
    22  // not evaluating, but simply waiting for a transaction to apply.
    23  //
    24  // To avoid this, we overlap verification with apply. This means once
    25  // we've verified plan N we attempt to apply it. However, while waiting
    26  // for apply, we begin to verify plan N+1 under the assumption that plan
    27  // N has succeeded.
    28  //
    29  // In this sense, we track two parallel versions of the world. One is
    30  // the pessimistic one driven by the Raft log which is replicated. The
    31  // other is optimistic and assumes our transactions will succeed. In the
    32  // happy path, this lets us do productive work during the latency of
    33  // apply.
    34  //
    35  // In the unhappy path (Raft transaction fails), effectively we only
    36  // wasted work during a time we would have been waiting anyways. However,
    37  // in anticipation of this case we cannot respond to the plan until
    38  // the Raft log is updated. This means our schedulers will stall,
    39  // but there are many of those and only a single plan verifier.
    40  //
    41  func (s *Server) planApply() {
    42  	// waitCh is used to track an outstanding application while snap
    43  	// holds an optimistic state which includes that plan application.
    44  	var waitCh chan struct{}
    45  	var snap *state.StateSnapshot
    46  
    47  	// Setup a worker pool with half the cores, with at least 1
    48  	poolSize := runtime.NumCPU() / 2
    49  	if poolSize == 0 {
    50  		poolSize = 1
    51  	}
    52  	pool := NewEvaluatePool(poolSize, workerPoolBufferSize)
    53  	defer pool.Shutdown()
    54  
    55  	for {
    56  		// Pull the next pending plan, exit if we are no longer leader
    57  		pending, err := s.planQueue.Dequeue(0)
    58  		if err != nil {
    59  			return
    60  		}
    61  
    62  		// Check if out last plan has completed
    63  		select {
    64  		case <-waitCh:
    65  			waitCh = nil
    66  			snap = nil
    67  		default:
    68  		}
    69  
    70  		// Snapshot the state so that we have a consistent view of the world
    71  		// if no snapshot is available
    72  		if waitCh == nil || snap == nil {
    73  			snap, err = s.fsm.State().Snapshot()
    74  			if err != nil {
    75  				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
    76  				pending.respond(nil, err)
    77  				continue
    78  			}
    79  		}
    80  
    81  		// Evaluate the plan
    82  		result, err := evaluatePlan(pool, snap, pending.plan)
    83  		if err != nil {
    84  			s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err)
    85  			pending.respond(nil, err)
    86  			continue
    87  		}
    88  
    89  		// Fast-path the response if there is nothing to do
    90  		if result.IsNoOp() {
    91  			pending.respond(result, nil)
    92  			continue
    93  		}
    94  
    95  		// Ensure any parallel apply is complete before starting the next one.
    96  		// This also limits how out of date our snapshot can be.
    97  		if waitCh != nil {
    98  			<-waitCh
    99  			snap, err = s.fsm.State().Snapshot()
   100  			if err != nil {
   101  				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
   102  				pending.respond(nil, err)
   103  				continue
   104  			}
   105  		}
   106  
   107  		// Dispatch the Raft transaction for the plan
   108  		future, err := s.applyPlan(pending.plan.Job, result, snap)
   109  		if err != nil {
   110  			s.logger.Printf("[ERR] nomad: failed to submit plan: %v", err)
   111  			pending.respond(nil, err)
   112  			continue
   113  		}
   114  
   115  		// Respond to the plan in async
   116  		waitCh = make(chan struct{})
   117  		go s.asyncPlanWait(waitCh, future, result, pending)
   118  	}
   119  }
   120  
   121  // applyPlan is used to apply the plan result and to return the alloc index
   122  func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
   123  	// Determine the miniumum number of updates, could be more if there
   124  	// are multiple updates per node
   125  	minUpdates := len(result.NodeUpdate)
   126  	minUpdates += len(result.NodeAllocation)
   127  
   128  	// Setup the update request
   129  	req := structs.AllocUpdateRequest{
   130  		Job:   job,
   131  		Alloc: make([]*structs.Allocation, 0, minUpdates),
   132  	}
   133  	for _, updateList := range result.NodeUpdate {
   134  		req.Alloc = append(req.Alloc, updateList...)
   135  	}
   136  	for _, allocList := range result.NodeAllocation {
   137  		req.Alloc = append(req.Alloc, allocList...)
   138  	}
   139  
   140  	// Set the time the alloc was applied for the first time. This can be used
   141  	// to approximate the scheduling time.
   142  	now := time.Now().UTC().UnixNano()
   143  	for _, alloc := range req.Alloc {
   144  		if alloc.CreateTime == 0 {
   145  			alloc.CreateTime = now
   146  		}
   147  	}
   148  
   149  	// Dispatch the Raft transaction
   150  	future, err := s.raftApplyFuture(structs.AllocUpdateRequestType, &req)
   151  	if err != nil {
   152  		return nil, err
   153  	}
   154  
   155  	// Optimistically apply to our state view
   156  	if snap != nil {
   157  		nextIdx := s.raft.AppliedIndex() + 1
   158  		if err := snap.UpsertAllocs(nextIdx, req.Alloc); err != nil {
   159  			return future, err
   160  		}
   161  	}
   162  	return future, nil
   163  }
   164  
   165  // asyncPlanWait is used to apply and respond to a plan async
   166  func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
   167  	result *structs.PlanResult, pending *pendingPlan) {
   168  	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
   169  	defer close(waitCh)
   170  
   171  	// Wait for the plan to apply
   172  	if err := future.Error(); err != nil {
   173  		s.logger.Printf("[ERR] nomad: failed to apply plan: %v", err)
   174  		pending.respond(nil, err)
   175  		return
   176  	}
   177  
   178  	// Respond to the plan
   179  	result.AllocIndex = future.Index()
   180  
   181  	// If this is a partial plan application, we need to ensure the scheduler
   182  	// at least has visibility into any placements it made to avoid double placement.
   183  	// The RefreshIndex computed by evaluatePlan may be stale due to evaluation
   184  	// against an optimistic copy of the state.
   185  	if result.RefreshIndex != 0 {
   186  		result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex)
   187  	}
   188  	pending.respond(result, nil)
   189  }
   190  
   191  // evaluatePlan is used to determine what portions of a plan
   192  // can be applied if any. Returns if there should be a plan application
   193  // which may be partial or if there was an error
   194  func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan) (*structs.PlanResult, error) {
   195  	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())
   196  
   197  	// Create a result holder for the plan
   198  	result := &structs.PlanResult{
   199  		NodeUpdate:     make(map[string][]*structs.Allocation),
   200  		NodeAllocation: make(map[string][]*structs.Allocation),
   201  	}
   202  
   203  	// Collect all the nodeIDs
   204  	nodeIDs := make(map[string]struct{})
   205  	nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation))
   206  	for nodeID := range plan.NodeUpdate {
   207  		if _, ok := nodeIDs[nodeID]; !ok {
   208  			nodeIDs[nodeID] = struct{}{}
   209  			nodeIDList = append(nodeIDList, nodeID)
   210  		}
   211  	}
   212  	for nodeID := range plan.NodeAllocation {
   213  		if _, ok := nodeIDs[nodeID]; !ok {
   214  			nodeIDs[nodeID] = struct{}{}
   215  			nodeIDList = append(nodeIDList, nodeID)
   216  		}
   217  	}
   218  
   219  	// Setup a multierror to handle potentially getting many
   220  	// errors since we are processing in parallel.
   221  	var mErr multierror.Error
   222  	partialCommit := false
   223  
   224  	// handleResult is used to process the result of evaluateNodePlan
   225  	handleResult := func(nodeID string, fit bool, err error) (cancel bool) {
   226  		// Evaluate the plan for this node
   227  		if err != nil {
   228  			mErr.Errors = append(mErr.Errors, err)
   229  			return true
   230  		}
   231  		if !fit {
   232  			// Set that this is a partial commit
   233  			partialCommit = true
   234  
   235  			// If we require all-at-once scheduling, there is no point
   236  			// to continue the evaluation, as we've already failed.
   237  			if plan.AllAtOnce {
   238  				result.NodeUpdate = nil
   239  				result.NodeAllocation = nil
   240  				return true
   241  			}
   242  
   243  			// Skip this node, since it cannot be used.
   244  			return
   245  		}
   246  
   247  		// Add this to the plan result
   248  		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
   249  			result.NodeUpdate[nodeID] = nodeUpdate
   250  		}
   251  		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
   252  			result.NodeAllocation[nodeID] = nodeAlloc
   253  		}
   254  		return
   255  	}
   256  
   257  	// Get the pool channels
   258  	req := pool.RequestCh()
   259  	resp := pool.ResultCh()
   260  	outstanding := 0
   261  	didCancel := false
   262  
   263  	// Evalute each node in the plan, handling results as they are ready to
   264  	// avoid blocking.
   265  	for len(nodeIDList) > 0 {
   266  		nodeID := nodeIDList[0]
   267  		select {
   268  		case req <- evaluateRequest{snap, plan, nodeID}:
   269  			outstanding++
   270  			nodeIDList = nodeIDList[1:]
   271  		case r := <-resp:
   272  			outstanding--
   273  
   274  			// Handle a result that allows us to cancel evaluation,
   275  			// which may save time processing additional entries.
   276  			if cancel := handleResult(r.nodeID, r.fit, r.err); cancel {
   277  				didCancel = true
   278  				break
   279  			}
   280  		}
   281  	}
   282  
   283  	// Drain the remaining results
   284  	for outstanding > 0 {
   285  		r := <-resp
   286  		if !didCancel {
   287  			if cancel := handleResult(r.nodeID, r.fit, r.err); cancel {
   288  				didCancel = true
   289  			}
   290  		}
   291  		outstanding--
   292  	}
   293  
   294  	// If the plan resulted in a partial commit, we need to determine
   295  	// a minimum refresh index to force the scheduler to work on a more
   296  	// up-to-date state to avoid the failures.
   297  	if partialCommit {
   298  		allocIndex, err := snap.Index("allocs")
   299  		if err != nil {
   300  			mErr.Errors = append(mErr.Errors, err)
   301  		}
   302  		nodeIndex, err := snap.Index("nodes")
   303  		if err != nil {
   304  			mErr.Errors = append(mErr.Errors, err)
   305  		}
   306  		result.RefreshIndex = maxUint64(nodeIndex, allocIndex)
   307  
   308  		if result.RefreshIndex == 0 {
   309  			err := fmt.Errorf("partialCommit with RefreshIndex of 0 (%d node, %d alloc)", nodeIndex, allocIndex)
   310  			mErr.Errors = append(mErr.Errors, err)
   311  		}
   312  	}
   313  	return result, mErr.ErrorOrNil()
   314  }
   315  
   316  // evaluateNodePlan is used to evalute the plan for a single node,
   317  // returning if the plan is valid or if an error is encountered
   318  func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, error) {
   319  	// If this is an evict-only plan, it always 'fits' since we are removing things.
   320  	if len(plan.NodeAllocation[nodeID]) == 0 {
   321  		return true, nil
   322  	}
   323  
   324  	// Get the node itself
   325  	node, err := snap.NodeByID(nodeID)
   326  	if err != nil {
   327  		return false, fmt.Errorf("failed to get node '%s': %v", nodeID, err)
   328  	}
   329  
   330  	// If the node does not exist or is not ready for schduling it is not fit
   331  	// XXX: There is a potential race between when we do this check and when
   332  	// the Raft commit happens.
   333  	if node == nil || node.Status != structs.NodeStatusReady || node.Drain {
   334  		return false, nil
   335  	}
   336  
   337  	// Get the existing allocations that are non-terminal
   338  	existingAlloc, err := snap.AllocsByNodeTerminal(nodeID, false)
   339  	if err != nil {
   340  		return false, fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
   341  	}
   342  
   343  	// Determine the proposed allocation by first removing allocations
   344  	// that are planned evictions and adding the new allocations.
   345  	proposed := existingAlloc
   346  	var remove []*structs.Allocation
   347  	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
   348  		remove = append(remove, update...)
   349  	}
   350  	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
   351  		for _, alloc := range updated {
   352  			remove = append(remove, alloc)
   353  		}
   354  	}
   355  	proposed = structs.RemoveAllocs(existingAlloc, remove)
   356  	proposed = append(proposed, plan.NodeAllocation[nodeID]...)
   357  
   358  	// Check if these allocations fit
   359  	fit, _, _, err := structs.AllocsFit(node, proposed, nil)
   360  	return fit, err
   361  }