github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/nomad/plan_apply.go

package nomad

import (
	"fmt"
	"runtime"
	"time"

	"github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
)

// planApply is a long lived goroutine that reads plan allocations from
// the plan queue, determines if they can be applied safely and applies
// them via Raft.
//
// Naively, we could simply dequeue a plan, verify, apply and then respond.
// However, the plan application is bounded by the Raft apply time and
// subject to some latency. This creates a stall condition, where we are
// not evaluating, but simply waiting for a transaction to apply.
//
// To avoid this, we overlap verification with apply. This means once
// we've verified plan N we attempt to apply it. However, while waiting
// for apply, we begin to verify plan N+1 under the assumption that plan
// N has succeeded.
//
// In this sense, we track two parallel versions of the world. One is
// the pessimistic one driven by the Raft log which is replicated. The
// other is optimistic and assumes our transactions will succeed. In the
// happy path, this lets us do productive work during the latency of
// apply.
//
// In the unhappy path (Raft transaction fails), effectively we only
// wasted work during a time we would have been waiting anyways. However,
// in anticipation of this case we cannot respond to the plan until
// the Raft log is updated. This means our schedulers will stall,
// but there are many of those and only a single plan verifier.
func (s *Server) planApply() {
	// waitCh is used to track an outstanding application while snap
	// holds an optimistic state which includes that plan application.
	var waitCh chan struct{}
	var snap *state.StateSnapshot

	// Setup a worker pool with half the cores, with at least 1
	poolSize := runtime.NumCPU() / 2
	if poolSize == 0 {
		poolSize = 1
	}
	pool := NewEvaluatePool(poolSize, workerPoolBufferSize)
	defer pool.Shutdown()

	for {
		// Pull the next pending plan, exit if we are no longer leader
		pending, err := s.planQueue.Dequeue(0)
		if err != nil {
			return
		}

		// Check if our last plan has completed
		select {
		case <-waitCh:
			waitCh = nil
			snap = nil
		default:
		}

		// Snapshot the state so that we have a consistent view of the world
		// if no optimistic snapshot is available
		if waitCh == nil || snap == nil {
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Evaluate the plan
		result, err := evaluatePlan(pool, snap, pending.plan)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Fast-path the response if there is nothing to do
		if result.IsNoOp() {
			pending.respond(result, nil)
			continue
		}

		// Ensure any parallel apply is complete before starting the next one.
		// This also limits how out of date our snapshot can be.
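		// NOTE: once the outstanding apply finishes we re-snapshot below
		// rather than reusing the optimistic snapshot, so the next plan is
		// dispatched against state the cluster has actually committed.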
		if waitCh != nil {
			<-waitCh
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Dispatch the Raft transaction for the plan
		future, err := s.applyPlan(pending.plan, result, snap)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to submit plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Respond to the plan asynchronously
		waitCh = make(chan struct{})
		go s.asyncPlanWait(waitCh, future, result, pending)
	}
}

// applyPlan is used to apply the plan result and to return the alloc index
func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
	// Determine the minimum number of updates; there could be more if there
	// are multiple updates per node
	minUpdates := len(result.NodeUpdate)
	minUpdates += len(result.NodeAllocation)

	// Grab the job
	job := plan.Job

	// Setup the update request
	req := structs.ApplyPlanResultsRequest{
		AllocUpdateRequest: structs.AllocUpdateRequest{
			Job:   job,
			Alloc: make([]*structs.Allocation, 0, minUpdates),
		},
		CreatedDeployment: plan.CreatedDeployment,
		DeploymentUpdates: plan.DeploymentUpdates,
	}
	for _, updateList := range result.NodeUpdate {
		req.Alloc = append(req.Alloc, updateList...)
	}
	for _, allocList := range result.NodeAllocation {
		req.Alloc = append(req.Alloc, allocList...)
	}

	// Set the time the alloc was applied for the first time. This can be used
	// to approximate the scheduling time.
	now := time.Now().UTC().UnixNano()
	for _, alloc := range req.Alloc {
		if alloc.CreateTime == 0 {
			alloc.CreateTime = now
		}
	}

	// Dispatch the Raft transaction
	future, err := s.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req)
	if err != nil {
		return nil, err
	}

	// Optimistically apply to our state view
	if snap != nil {
		nextIdx := s.raft.AppliedIndex() + 1
		if err := snap.UpsertPlanResults(nextIdx, &req); err != nil {
			return future, err
		}
	}
	return future, nil
}

// asyncPlanWait is used to apply and respond to a plan asynchronously
func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
	result *structs.PlanResult, pending *pendingPlan) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
	defer close(waitCh)

	// Wait for the plan to apply
	if err := future.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to apply plan: %v", err)
		pending.respond(nil, err)
		return
	}

	// Respond to the plan
	result.AllocIndex = future.Index()

	// If this is a partial plan application, we need to ensure the scheduler
	// at least has visibility into any placements it made to avoid double placement.
	// The RefreshIndex computed by evaluatePlan may be stale due to evaluation
	// against an optimistic copy of the state.
	if result.RefreshIndex != 0 {
		result.RefreshIndex = maxUint64(result.RefreshIndex, result.AllocIndex)
	}
	pending.respond(result, nil)
}
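
// The evaluate pool request/response types are defined alongside
// NewEvaluatePool elsewhere in this package. Their assumed shapes,
// inferred from the call sites in evaluatePlan below, are roughly:
//
//	type evaluateRequest struct {
//		snap   *state.StateSnapshot
//		plan   *structs.Plan
//		nodeID string
//	}
//
//	type evaluateResult struct {
//		nodeID string
//		fit    bool
//		err    error
//	}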

// evaluatePlan is used to determine what portions of a plan, if any,
// can be applied. It returns the plan result, which may be a partial
// application, or an error.
func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan) (*structs.PlanResult, error) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())

	// Create a result holder for the plan
	result := &structs.PlanResult{
		NodeUpdate:     make(map[string][]*structs.Allocation),
		NodeAllocation: make(map[string][]*structs.Allocation),
	}

	// Collect all the nodeIDs
	nodeIDs := make(map[string]struct{})
	nodeIDList := make([]string, 0, len(plan.NodeUpdate)+len(plan.NodeAllocation))
	for nodeID := range plan.NodeUpdate {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}
	for nodeID := range plan.NodeAllocation {
		if _, ok := nodeIDs[nodeID]; !ok {
			nodeIDs[nodeID] = struct{}{}
			nodeIDList = append(nodeIDList, nodeID)
		}
	}

	// Setup a multierror to handle potentially getting many
	// errors since we are processing in parallel.
	var mErr multierror.Error
	partialCommit := false

	// handleResult is used to process the result of evaluateNodePlan
	handleResult := func(nodeID string, fit bool, err error) (cancel bool) {
		// Evaluate the plan for this node
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
			return true
		}
		if !fit {
			// Set that this is a partial commit
			partialCommit = true

			// If we require all-at-once scheduling, there is no point
			// to continue the evaluation, as we've already failed.
			if plan.AllAtOnce {
				result.NodeUpdate = nil
				result.NodeAllocation = nil
				return true
			}

			// Skip this node, since it cannot be used.
			return
		}

		// Add this to the plan result
		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
			result.NodeUpdate[nodeID] = nodeUpdate
		}
		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
			result.NodeAllocation[nodeID] = nodeAlloc
		}
		return
	}

	// Get the pool channels
	req := pool.RequestCh()
	resp := pool.ResultCh()
	outstanding := 0
	didCancel := false

	// Evaluate each node in the plan, handling results as they are ready to
	// avoid blocking.
OUTER:
	for len(nodeIDList) > 0 {
		nodeID := nodeIDList[0]
		select {
		case req <- evaluateRequest{snap, plan, nodeID}:
			outstanding++
			nodeIDList = nodeIDList[1:]
		case r := <-resp:
			outstanding--

			// Handle a result that allows us to cancel evaluation,
			// which may save time processing additional entries.
			if cancel := handleResult(r.nodeID, r.fit, r.err); cancel {
				didCancel = true
				break OUTER
			}
		}
	}

	// Drain the remaining results
	for outstanding > 0 {
		r := <-resp
		if !didCancel {
			if cancel := handleResult(r.nodeID, r.fit, r.err); cancel {
				didCancel = true
			}
		}
		outstanding--
	}

	// If the plan resulted in a partial commit, we need to determine
	// a minimum refresh index to force the scheduler to work on a more
	// up-to-date state to avoid the failures.
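	// The refresh index comes from the snapshot's node and alloc table
	// indexes, so a retrying scheduler is forced to wait until its own
	// state is at least as fresh as the state this evaluation ran against.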
	if partialCommit {
		allocIndex, err := snap.Index("allocs")
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
		nodeIndex, err := snap.Index("nodes")
		if err != nil {
			mErr.Errors = append(mErr.Errors, err)
		}
		result.RefreshIndex = maxUint64(nodeIndex, allocIndex)

		if result.RefreshIndex == 0 {
			err := fmt.Errorf("partialCommit with RefreshIndex of 0 (%d node, %d alloc)", nodeIndex, allocIndex)
			mErr.Errors = append(mErr.Errors, err)
		}
	}
	return result, mErr.ErrorOrNil()
}

// evaluateNodePlan is used to evaluate the plan for a single node,
// returning if the plan is valid or if an error is encountered
func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, error) {
	// If this is an evict-only plan, it always 'fits' since we are removing things.
	if len(plan.NodeAllocation[nodeID]) == 0 {
		return true, nil
	}

	// Get the node itself
	ws := memdb.NewWatchSet()
	node, err := snap.NodeByID(ws, nodeID)
	if err != nil {
		return false, fmt.Errorf("failed to get node '%s': %v", nodeID, err)
	}

	// If the node does not exist or is not ready for scheduling it is not fit
	// XXX: There is a potential race between when we do this check and when
	// the Raft commit happens.
	if node == nil || node.Status != structs.NodeStatusReady || node.Drain {
		return false, nil
	}

	// Get the existing allocations that are non-terminal
	existingAlloc, err := snap.AllocsByNodeTerminal(ws, nodeID, false)
	if err != nil {
		return false, fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
	}

	// Determine the proposed allocation by first removing allocations
	// that are planned evictions or being updated, then adding the new
	// allocations.
	var remove []*structs.Allocation
	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
		remove = append(remove, update...)
	}
	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
		// Updated allocations replace any existing version of themselves.
		remove = append(remove, updated...)
	}
	proposed := structs.RemoveAllocs(existingAlloc, remove)
	proposed = append(proposed, plan.NodeAllocation[nodeID]...)

	// Check if these allocations fit
	fit, _, _, err := structs.AllocsFit(node, proposed, nil)
	return fit, err
}
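
// maxUint64 is not defined in this file; in Nomad it lives in a shared
// util file within this package. Its assumed shape, consistent with the
// call sites above, is roughly:
//
//	func maxUint64(a, b uint64) uint64 {
//		if a >= b {
//			return a
//		}
//		return b
//	}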