github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/plan_apply.go

github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/plan_apply.go (about)

     1  package nomad
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"github.com/armon/go-metrics"
     8  	"github.com/hashicorp/nomad/nomad/state"
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  	"github.com/hashicorp/raft"
    11  )
    12  
    13  // planApply is a long lived goroutine that reads plan allocations from
    14  // the plan queue, determines if they can be applied safely and applies
    15  // them via Raft.
    16  //
    17  // Naively, we could simply dequeue a plan, verify, apply and then respond.
    18  // However, the plan application is bounded by the Raft apply time and
    19  // subject to some latency. This creates a stall condition, where we are
    20  // not evaluating, but simply waiting for a transaction to apply.
    21  //
    22  // To avoid this, we overlap verification with apply. This means once
    23  // we've verified plan N we attempt to apply it. However, while waiting
    24  // for apply, we begin to verify plan N+1 under the assumption that plan
    25  // N has succeeded.
    26  //
    27  // In this sense, we track two parallel versions of the world. One is
    28  // the pessimistic one driven by the Raft log which is replicated. The
    29  // other is optimistic and assumes our transactions will succeed. In the
    30  // happy path, this lets us do productive work during the latency of
    31  // apply.
    32  //
    33  // In the unhappy path (Raft transaction fails), effectively we only
    34  // wasted work during a time we would have been waiting anyways. However,
    35  // in anticipation of this case we cannot respond to the plan until
    36  // the Raft log is updated. This means our schedulers will stall,
    37  // but there are many of those and only a single plan verifier.
    38  //
    39  func (s *Server) planApply() {
    40  	// waitCh is used to track an outstanding application while snap
    41  	// holds an optimistic state which includes that plan application.
    42  	var waitCh chan struct{}
    43  	var snap *state.StateSnapshot
    44  
    45  	for {
    46  		// Pull the next pending plan, exit if we are no longer leader
    47  		pending, err := s.planQueue.Dequeue(0)
    48  		if err != nil {
    49  			return
    50  		}
    51  
    52  		// Verify the evaluation is outstanding, and that the tokens match.
    53  		token, ok := s.evalBroker.Outstanding(pending.plan.EvalID)
    54  		if !ok {
    55  			s.logger.Printf("[ERR] nomad: plan received for non-outstanding evaluation %s",
    56  				pending.plan.EvalID)
    57  			pending.respond(nil, fmt.Errorf("evaluation is not outstanding"))
    58  			continue
    59  		}
    60  		if pending.plan.EvalToken != token {
    61  			s.logger.Printf("[ERR] nomad: plan received for evaluation %s with wrong token",
    62  				pending.plan.EvalID)
    63  			pending.respond(nil, fmt.Errorf("evaluation token does not match"))
    64  			continue
    65  		}
    66  
    67  		// Check if out last plan has completed
    68  		select {
    69  		case <-waitCh:
    70  			waitCh = nil
    71  			snap = nil
    72  		default:
    73  		}
    74  
    75  		// Snapshot the state so that we have a consistent view of the world
    76  		// if no snapshot is available
    77  		if waitCh == nil || snap == nil {
    78  			snap, err = s.fsm.State().Snapshot()
    79  			if err != nil {
    80  				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
    81  				pending.respond(nil, err)
    82  				continue
    83  			}
    84  		}
    85  
    86  		// Evaluate the plan
    87  		result, err := evaluatePlan(snap, pending.plan)
    88  		if err != nil {
    89  			s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err)
    90  			pending.respond(nil, err)
    91  			continue
    92  		}
    93  
    94  		// Fast-path the response if there is nothing to do
    95  		if result.IsNoOp() {
    96  			pending.respond(result, nil)
    97  			continue
    98  		}
    99  
   100  		// Ensure any parallel apply is complete before starting the next one.
   101  		// This also limits how out of date our snapshot can be.
   102  		if waitCh != nil {
   103  			<-waitCh
   104  			snap, err = s.fsm.State().Snapshot()
   105  			if err != nil {
   106  				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
   107  				pending.respond(nil, err)
   108  				continue
   109  			}
   110  		}
   111  
   112  		// Dispatch the Raft transaction for the plan
   113  		future, err := s.applyPlan(result, snap)
   114  		if err != nil {
   115  			s.logger.Printf("[ERR] nomad: failed to submit plan: %v", err)
   116  			pending.respond(nil, err)
   117  			continue
   118  		}
   119  
   120  		// Respond to the plan in async
   121  		waitCh = make(chan struct{})
   122  		go s.asyncPlanWait(waitCh, future, result, pending)
   123  	}
   124  }
   125  
   126  // applyPlan is used to apply the plan result and to return the alloc index
   127  func (s *Server) applyPlan(result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
   128  	req := structs.AllocUpdateRequest{}
   129  	for _, updateList := range result.NodeUpdate {
   130  		req.Alloc = append(req.Alloc, updateList...)
   131  	}
   132  	for _, allocList := range result.NodeAllocation {
   133  		req.Alloc = append(req.Alloc, allocList...)
   134  	}
   135  	req.Alloc = append(req.Alloc, result.FailedAllocs...)
   136  
   137  	// Dispatch the Raft transaction
   138  	future, err := s.raftApplyFuture(structs.AllocUpdateRequestType, &req)
   139  	if err != nil {
   140  		return nil, err
   141  	}
   142  
   143  	// Optimistically apply to our state view
   144  	if snap != nil {
   145  		nextIdx := s.raft.AppliedIndex() + 1
   146  		if err := snap.UpsertAllocs(nextIdx, req.Alloc); err != nil {
   147  			return future, err
   148  		}
   149  	}
   150  	return future, nil
   151  }
   152  
   153  // asyncPlanWait is used to apply and respond to a plan async
   154  func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
   155  	result *structs.PlanResult, pending *pendingPlan) {
   156  	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
   157  	defer close(waitCh)
   158  
   159  	// Wait for the plan to apply
   160  	if err := future.Error(); err != nil {
   161  		s.logger.Printf("[ERR] nomad: failed to apply plan: %v", err)
   162  		pending.respond(nil, err)
   163  		return
   164  	}
   165  
   166  	// Respond to the plan
   167  	result.AllocIndex = future.Index()
   168  	pending.respond(result, nil)
   169  }
   170  
   171  // evaluatePlan is used to determine what portions of a plan
   172  // can be applied if any. Returns if there should be a plan application
   173  // which may be partial or if there was an error
   174  func evaluatePlan(snap *state.StateSnapshot, plan *structs.Plan) (*structs.PlanResult, error) {
   175  	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())
   176  
   177  	// Create a result holder for the plan
   178  	result := &structs.PlanResult{
   179  		NodeUpdate:     make(map[string][]*structs.Allocation),
   180  		NodeAllocation: make(map[string][]*structs.Allocation),
   181  		FailedAllocs:   plan.FailedAllocs,
   182  	}
   183  
   184  	// Collect all the nodeIDs
   185  	nodeIDs := make(map[string]struct{})
   186  	for nodeID := range plan.NodeUpdate {
   187  		nodeIDs[nodeID] = struct{}{}
   188  	}
   189  	for nodeID := range plan.NodeAllocation {
   190  		nodeIDs[nodeID] = struct{}{}
   191  	}
   192  
   193  	// Check each allocation to see if it should be allowed
   194  	for nodeID := range nodeIDs {
   195  		// Evaluate the plan for this node
   196  		fit, err := evaluateNodePlan(snap, plan, nodeID)
   197  		if err != nil {
   198  			return nil, err
   199  		}
   200  		if !fit {
   201  			// Scheduler must have stale data, RefreshIndex should force
   202  			// the latest view of allocations and nodes
   203  			allocIndex, err := snap.Index("allocs")
   204  			if err != nil {
   205  				return nil, err
   206  			}
   207  			nodeIndex, err := snap.Index("nodes")
   208  			if err != nil {
   209  				return nil, err
   210  			}
   211  			result.RefreshIndex = maxUint64(nodeIndex, allocIndex)
   212  
   213  			// If we require all-at-once scheduling, there is no point
   214  			// to continue the evaluation, as we've already failed.
   215  			if plan.AllAtOnce {
   216  				result.NodeUpdate = nil
   217  				result.NodeAllocation = nil
   218  				return result, nil
   219  			}
   220  
   221  			// Skip this node, since it cannot be used.
   222  			continue
   223  		}
   224  
   225  		// Add this to the plan result
   226  		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
   227  			result.NodeUpdate[nodeID] = nodeUpdate
   228  		}
   229  		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
   230  			result.NodeAllocation[nodeID] = nodeAlloc
   231  		}
   232  	}
   233  	return result, nil
   234  }
   235  
   236  // evaluateNodePlan is used to evalute the plan for a single node,
   237  // returning if the plan is valid or if an error is encountered
   238  func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, error) {
   239  	// If this is an evict-only plan, it always 'fits' since we are removing things.
   240  	if len(plan.NodeAllocation[nodeID]) == 0 {
   241  		return true, nil
   242  	}
   243  
   244  	// Get the node itself
   245  	node, err := snap.NodeByID(nodeID)
   246  	if err != nil {
   247  		return false, fmt.Errorf("failed to get node '%s': %v", nodeID, err)
   248  	}
   249  
   250  	// If the node does not exist or is not ready for schduling it is not fit
   251  	// XXX: There is a potential race between when we do this check and when
   252  	// the Raft commit happens.
   253  	if node == nil || node.Status != structs.NodeStatusReady || node.Drain {
   254  		return false, nil
   255  	}
   256  
   257  	// Get the existing allocations
   258  	existingAlloc, err := snap.AllocsByNode(nodeID)
   259  	if err != nil {
   260  		return false, fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
   261  	}
   262  
   263  	// Filter on alloc state
   264  	existingAlloc = structs.FilterTerminalAllocs(existingAlloc)
   265  
   266  	// Determine the proposed allocation by first removing allocations
   267  	// that are planned evictions and adding the new allocations.
   268  	proposed := existingAlloc
   269  	var remove []*structs.Allocation
   270  	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
   271  		remove = append(remove, update...)
   272  	}
   273  	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
   274  		for _, alloc := range updated {
   275  			remove = append(remove, alloc)
   276  		}
   277  	}
   278  	proposed = structs.RemoveAllocs(existingAlloc, remove)
   279  	proposed = append(proposed, plan.NodeAllocation[nodeID]...)
   280  
   281  	// Check if these allocations fit
   282  	fit, _, _, err := structs.AllocsFit(node, proposed, nil)
   283  	return fit, err
   284  }