github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/plan_apply.go

package nomad

import (
	"fmt"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/raft"
)

// planApply is a long-lived goroutine that reads plan allocations from
// the plan queue, determines if they can be applied safely, and applies
// them via Raft.
//
// Naively, we could simply dequeue a plan, verify it, apply it, and then
// respond. However, plan application is bounded by the Raft apply time and
// subject to some latency. This creates a stall condition, where we are
// not evaluating, but simply waiting for a transaction to apply.
//
// To avoid this, we overlap verification with apply. This means once
// we've verified plan N we attempt to apply it. However, while waiting
// for the apply, we begin to verify plan N+1 under the assumption that
// plan N has succeeded.
//
// In this sense, we track two parallel versions of the world. One is
// the pessimistic one driven by the Raft log, which is replicated. The
// other is optimistic and assumes our transactions will succeed. In the
// happy path, this lets us do productive work during the latency of
// the apply.
//
// In the unhappy path (the Raft transaction fails), we have effectively
// only wasted work during a time we would have been waiting anyway.
// However, in anticipation of this case we cannot respond to the plan
// until the Raft log is updated. This means our schedulers will stall,
// but there are many of those and only a single plan verifier.
func (s *Server) planApply() {
	// waitCh is used to track an outstanding application while snap
	// holds an optimistic state which includes that plan application.
	var waitCh chan struct{}
	var snap *state.StateSnapshot

	for {
		// Pull the next pending plan, exit if we are no longer leader
		pending, err := s.planQueue.Dequeue(0)
		if err != nil {
			return
		}

		// Verify the evaluation is outstanding, and that the tokens match.
		if err := s.evalBroker.OutstandingReset(pending.plan.EvalID, pending.plan.EvalToken); err != nil {
			s.logger.Printf("[ERR] nomad: plan rejected for evaluation %s: %v",
				pending.plan.EvalID, err)
			pending.respond(nil, err)
			continue
		}

		// Check if our last plan has completed
		select {
		case <-waitCh:
			waitCh = nil
			snap = nil
		default:
		}

		// Snapshot the state so that we have a consistent view of the world,
		// unless an optimistic snapshot is already available
		if waitCh == nil || snap == nil {
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Evaluate the plan
		result, err := evaluatePlan(snap, pending.plan)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Fast-path the response if there is nothing to do
		if result.IsNoOp() {
			pending.respond(result, nil)
			continue
		}

		// Ensure any parallel apply is complete before starting the next one.
		// This also limits how out of date our snapshot can be.
		if waitCh != nil {
			<-waitCh
			snap, err = s.fsm.State().Snapshot()
			if err != nil {
				s.logger.Printf("[ERR] nomad: failed to snapshot state: %v", err)
				pending.respond(nil, err)
				continue
			}
		}

		// Dispatch the Raft transaction for the plan
		future, err := s.applyPlan(result, snap)
		if err != nil {
			s.logger.Printf("[ERR] nomad: failed to submit plan: %v", err)
			pending.respond(nil, err)
			continue
		}

		// Respond to the plan asynchronously
		waitCh = make(chan struct{})
		go s.asyncPlanWait(waitCh, future, result, pending)
	}
}
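// The loop above pipelines verification with application: plan N+1 is
// evaluated while plan N is still committing through Raft. What follows is a
// minimal, self-contained sketch of that pattern for illustration only; the
// pipelineSketch name and its verify/apply callbacks are hypothetical
// stand-ins, not part of this package's API.
func pipelineSketch(plans <-chan int, verify func(plan int) bool, apply func(plan int) error) {
	// waitCh tracks the outstanding apply, mirroring planApply above.
	var waitCh chan struct{}
	for plan := range plans {
		// Verify optimistically, assuming the outstanding apply succeeds.
		if !verify(plan) {
			continue
		}
		// Wait for the previous apply before dispatching a new one; this
		// bounds how stale the optimistic view of state can become.
		if waitCh != nil {
			<-waitCh
		}
		done := make(chan struct{})
		waitCh = done
		go func(p int) {
			// The Raft apply latency is hidden here: the next loop
			// iteration verifies plan N+1 while this goroutine blocks.
			defer close(done)
			_ = apply(p) // a real caller must surface this error
		}(plan)
	}
}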
// applyPlan is used to apply the plan result and to return the alloc index
func (s *Server) applyPlan(result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) {
	// Flatten the per-node updates and allocations into a single request
	req := structs.AllocUpdateRequest{}
	for _, updateList := range result.NodeUpdate {
		req.Alloc = append(req.Alloc, updateList...)
	}
	for _, allocList := range result.NodeAllocation {
		req.Alloc = append(req.Alloc, allocList...)
	}
	req.Alloc = append(req.Alloc, result.FailedAllocs...)

	// Dispatch the Raft transaction
	future, err := s.raftApplyFuture(structs.AllocUpdateRequestType, &req)
	if err != nil {
		return nil, err
	}

	// Optimistically apply to our state view at the index the transaction
	// is expected to commit at
	if snap != nil {
		nextIdx := s.raft.AppliedIndex() + 1
		if err := snap.UpsertAllocs(nextIdx, req.Alloc); err != nil {
			return future, err
		}
	}
	return future, nil
}

// asyncPlanWait is used to wait for the Raft apply of a plan and to respond
// to it asynchronously
func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture,
	result *structs.PlanResult, pending *pendingPlan) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now())
	defer close(waitCh)

	// Wait for the plan to apply
	if err := future.Error(); err != nil {
		s.logger.Printf("[ERR] nomad: failed to apply plan: %v", err)
		pending.respond(nil, err)
		return
	}

	// Respond to the plan
	result.AllocIndex = future.Index()
	pending.respond(result, nil)
}
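// asyncPlanWait responds only after future.Error() returns, so a plan is
// never acknowledged before the Raft log reflects it. Below is an
// illustrative sketch of that contract; applyFutureSketch is a hypothetical
// stand-in for the subset of raft.ApplyFuture used here.
type applyFutureSketch interface {
	Error() error  // blocks until the apply completes or fails
	Index() uint64 // log index the entry was committed at
}

func waitAndRespondSketch(f applyFutureSketch, respond func(index uint64, err error)) {
	if err := f.Error(); err != nil {
		// The optimistic snapshot turned out to be wrong; the scheduler
		// must retry against refreshed state rather than trust it.
		respond(0, err)
		return
	}
	// Safe to acknowledge: the allocations are durable in the Raft log.
	respond(f.Index(), nil)
}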
// evaluatePlan is used to determine what portions of a plan, if any, can be
// applied. It returns the (possibly partial) plan application that should be
// made, or an error.
func evaluatePlan(snap *state.StateSnapshot, plan *structs.Plan) (*structs.PlanResult, error) {
	defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now())

	// Create a result holder for the plan
	result := &structs.PlanResult{
		NodeUpdate:     make(map[string][]*structs.Allocation),
		NodeAllocation: make(map[string][]*structs.Allocation),
		FailedAllocs:   plan.FailedAllocs,
	}

	// Collect all the nodeIDs
	nodeIDs := make(map[string]struct{})
	for nodeID := range plan.NodeUpdate {
		nodeIDs[nodeID] = struct{}{}
	}
	for nodeID := range plan.NodeAllocation {
		nodeIDs[nodeID] = struct{}{}
	}

	// Check each allocation to see if it should be allowed
	for nodeID := range nodeIDs {
		// Evaluate the plan for this node
		fit, err := evaluateNodePlan(snap, plan, nodeID)
		if err != nil {
			return nil, err
		}
		if !fit {
			// The scheduler must have stale data; RefreshIndex should force
			// the latest view of allocations and nodes
			allocIndex, err := snap.Index("allocs")
			if err != nil {
				return nil, err
			}
			nodeIndex, err := snap.Index("nodes")
			if err != nil {
				return nil, err
			}
			result.RefreshIndex = maxUint64(nodeIndex, allocIndex)

			// If we require all-at-once scheduling, there is no point in
			// continuing the evaluation, as we've already failed.
			if plan.AllAtOnce {
				result.NodeUpdate = nil
				result.NodeAllocation = nil
				return result, nil
			}

			// Skip this node, since it cannot be used.
			continue
		}

		// Add this node to the plan result
		if nodeUpdate := plan.NodeUpdate[nodeID]; len(nodeUpdate) > 0 {
			result.NodeUpdate[nodeID] = nodeUpdate
		}
		if nodeAlloc := plan.NodeAllocation[nodeID]; len(nodeAlloc) > 0 {
			result.NodeAllocation[nodeID] = nodeAlloc
		}
	}
	return result, nil
}
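// evaluatePlan admits each node independently unless AllAtOnce is set, so
// the result may be a strict subset of the submitted plan. Below is a toy
// sketch of that per-node filtering shape over plain maps; the
// filterFeasibleSketch name and its fits callback are hypothetical, not part
// of this package.
func filterFeasibleSketch(requested map[string][]string, allAtOnce bool, fits func(nodeID string) bool) map[string][]string {
	admitted := make(map[string][]string, len(requested))
	for nodeID, allocs := range requested {
		if !fits(nodeID) {
			if allAtOnce {
				// All-or-nothing plans fail outright on the first
				// infeasible node.
				return nil
			}
			// Otherwise skip only this node; the rest still applies.
			continue
		}
		admitted[nodeID] = allocs
	}
	return admitted
}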
// evaluateNodePlan is used to evaluate the plan for a single node, returning
// whether the plan is valid for that node, or an error if one is encountered
func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, error) {
	// If this is an evict-only plan, it always 'fits' since we are removing things.
	if len(plan.NodeAllocation[nodeID]) == 0 {
		return true, nil
	}

	// Get the node itself
	node, err := snap.NodeByID(nodeID)
	if err != nil {
		return false, fmt.Errorf("failed to get node '%s': %v", nodeID, err)
	}

	// If the node does not exist or is not ready for scheduling, it is not fit
	// XXX: There is a potential race between when we do this check and when
	// the Raft commit happens.
	if node == nil || node.Status != structs.NodeStatusReady || node.Drain {
		return false, nil
	}

	// Get the existing allocations
	existingAlloc, err := snap.AllocsByNode(nodeID)
	if err != nil {
		return false, fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err)
	}

	// Filter on alloc state
	existingAlloc = structs.FilterTerminalAllocs(existingAlloc)

	// Determine the proposed allocations by first removing allocations that
	// are planned evictions (as well as the old versions of any updated
	// allocations) and then adding the new allocations.
	var remove []*structs.Allocation
	if update := plan.NodeUpdate[nodeID]; len(update) > 0 {
		remove = append(remove, update...)
	}
	if updated := plan.NodeAllocation[nodeID]; len(updated) > 0 {
		remove = append(remove, updated...)
	}
	proposed := structs.RemoveAllocs(existingAlloc, remove)
	proposed = append(proposed, plan.NodeAllocation[nodeID]...)

	// Check if these allocations fit
	fit, _, _, err := structs.AllocsFit(node, proposed, nil)
	return fit, err
}
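// evaluateNodePlan builds the proposed node state by dropping planned
// evictions and prior versions of updated allocations before appending the
// plan's new allocations. Here is a self-contained sketch of that set
// arithmetic over plain IDs; proposeSketch is hypothetical and assumes
// removal matches on allocation ID, as structs.RemoveAllocs does.
func proposeSketch(existing, evict, place []string) []string {
	drop := make(map[string]struct{}, len(evict)+len(place))
	for _, id := range evict {
		drop[id] = struct{}{}
	}
	for _, id := range place {
		// Updated allocations appear in both the removal set and the new
		// set, so the plan's version replaces the existing one.
		drop[id] = struct{}{}
	}
	proposed := make([]string, 0, len(existing)+len(place))
	for _, id := range existing {
		if _, ok := drop[id]; !ok {
			proposed = append(proposed, id)
		}
	}
	return append(proposed, place...)
}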