github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/scheduler/generic_sched.go

package scheduler

import (
	"fmt"
	"log"
	"time"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocLost is the status used when an allocation is lost
	allocLost = "alloc is lost since its node is down"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"

	// blockedEvalMaxPlanDesc is the description used for blocked evals that are
	// a result of hitting the max number of plan attempts
	blockedEvalMaxPlanDesc = "created due to placement conflicts"

	// blockedEvalFailedPlacements is the description used for blocked evals
	// that are a result of failing to place all allocations.
	blockedEvalFailedPlacements = "created to place remaining allocations"
)
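
// Note: the alloc* strings above are human-readable status descriptions; they
// are attached elsewhere in the scheduler package as the DesiredDescription of
// allocations that are stopped, migrated, updated, or marked lost, while the
// blockedEval* strings become the StatusDescription of the blocked evaluations
// this scheduler creates below.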

// SetStatusError wraps an error along with the evaluation status that should
// be set when that error is returned
type SetStatusError struct {
	Err        error
	EvalStatus string
}

func (s *SetStatusError) Error() string {
	return s.Err.Error()
}
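
// A *SetStatusError is returned when the retry loop below gives up without
// finishing the evaluation; Process type-asserts on it so the evaluation can
// be marked with the intended terminal status rather than a generic failure.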

// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
type GenericScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner
	batch   bool

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *GenericStack

	followupEvalWait time.Duration
	nextEval         *structs.Evaluation

	deployment *structs.Deployment

	blocked        *structs.Evaluation
	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewServiceScheduler is a factory function to instantiate a new service scheduler
func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   false,
	}
	return s
}

// NewBatchScheduler is a factory function to instantiate a new batch scheduler
func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   true,
	}
	return s
}
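
// A minimal usage sketch (illustrative only, not how the Nomad servers wire the
// scheduler up verbatim): given implementations of State and Planner and an
// evaluation to work on, a caller constructs a scheduler and processes evals
// one at a time. The stateStore, planner, and eval variables below are assumed
// to already exist.
//
//	sched := NewServiceScheduler(log.New(os.Stderr, "", log.LstdFlags), stateStore, planner)
//	if err := sched.Process(eval); err != nil {
//		// the evaluation has been marked failed or blocked as appropriate;
//		// the error is returned so the caller can log or retry it
//	}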

// Process is used to handle a single evaluation
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
		structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans,
		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
			s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs,
			s.deployment.GetID())
	}

	// Retry up to the schedule attempt limit and reset the attempt count if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	limit := maxServiceScheduleAttempts
	if s.batch {
		limit = maxBatchScheduleAttempts
	}
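	// retryMax (a helper in this package) calls s.process up to `limit` times;
	// between attempts it consults the progress closure and, when the previous
	// plan result shows forward progress, resets the attempt counter so a
	// slowly converging schedule is not cut off prematurely.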
	if err := retryMax(limit, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			// Scheduling was tried but made no forward progress so create a
			// blocked eval to retry once resources become available.
			var mErr multierror.Error
			if err := s.createBlockedEval(true); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
				s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, s.deployment.GetID()); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			return mErr.ErrorOrNil()
		}
		return err
	}

	// If the current evaluation is a blocked evaluation and we didn't place
	// everything, do not update the status to complete.
	if s.eval.Status == structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 {
		e := s.ctx.Eligibility()
		newEval := s.eval.Copy()
		newEval.EscapedComputedClass = e.HasEscaped()
		newEval.ClassEligibility = e.GetClasses()
		newEval.QuotaLimitReached = e.QuotaLimitReached()
		return s.planner.ReblockEval(newEval)
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
		s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs,
		s.deployment.GetID())
}

// createBlockedEval creates a blocked eval and submits it to the planner. If
// planFailure is true, the eval's trigger reason and description reflect that
// the maximum number of plan attempts was hit.
func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached())
	if planFailure {
		s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
		s.blocked.StatusDescription = blockedEvalMaxPlanDesc
	} else {
		s.blocked.StatusDescription = blockedEvalFailedPlacements
	}

	return s.planner.CreateEval(s.blocked)
}
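
// A blocked eval created here is not retried by this scheduler directly; the
// servers hold it and re-dispatch it once cluster state changes (for example
// when node updates free capacity), at which point Process either places the
// remaining allocations or reblocks the eval.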

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
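// The boolean result follows the retryMax contract used by Process: true means
// the evaluation has been fully handled, false requests another attempt (for
// example after a forced state refresh or a partial plan commit), and a
// non-nil error aborts scheduling.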
func (s *GenericScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job %q: %v", s.eval.JobID, err)
	}

	numTaskGroups := 0
	stopped := s.job.Stopped()
	if !stopped {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	if !s.batch {
		// Get any existing deployment
		s.deployment, err = s.state.LatestDeploymentByJobID(ws, s.eval.Namespace, s.eval.JobID)
		if err != nil {
			return false, fmt.Errorf("failed to get job deployment %q: %v", s.eval.JobID, err)
		}
	}

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewGenericStack(s.batch, s.ctx)
	if !s.job.Stopped() {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If there are failed allocations, we need to create a blocked evaluation
	// to place the failed allocations when resources become available. If the
	// current evaluation is already a blocked eval, we reuse it.
	if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil {
		if err := s.createBlockedEval(false); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID)
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If we need a followup eval and we haven't created one, do so.
	if s.followupEvalWait != 0 && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.followupEvalWait)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling migration: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling migration limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan and store the results.
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		if newState == nil {
			return false, fmt.Errorf("missing state refresh after partial commit")
		}
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *GenericScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	reconciler := NewAllocReconciler(s.ctx.Logger(),
		genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID),
		s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted)
	results := reconciler.Compute()
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, results)
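
	// The reconciler results drive everything below: desiredTGUpdates feed the
	// plan annotations, deployment and deploymentUpdates are copied onto the
	// plan, stop and inplaceUpdate are applied directly, and place and
	// destructiveUpdate are handed to computePlacements to find nodes for the
	// new allocations.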

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: results.desiredTGUpdates,
		}
	}

	// Add the deployment changes to the plan
	s.plan.Deployment = results.deployment
	s.plan.DeploymentUpdates = results.deploymentUpdates

	// Store the follow-up eval wait duration. If set, this will trigger a
	// follow-up eval to handle node draining.
	s.followupEvalWait = results.followupEvalWait

	// Update the stored deployment
	if results.deployment != nil {
		s.deployment = results.deployment
	}

	// Handle the stop
	for _, stop := range results.stop {
		s.plan.AppendUpdate(stop.alloc, structs.AllocDesiredStatusStop, stop.statusDescription, stop.clientStatus)
	}

	// Handle the in-place updates
	for _, update := range results.inplaceUpdate {
		if update.DeploymentID != s.deployment.GetID() {
			update.DeploymentID = s.deployment.GetID()
			update.DeploymentStatus = nil
		}
		s.ctx.Plan().AppendAlloc(update)
	}

	// Nothing remaining to do if placement is not required
	if len(results.place)+len(results.destructiveUpdate) == 0 {
		if !s.job.Stopped() {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per Task Group
	for _, place := range results.place {
		s.queuedAllocs[place.taskGroup.Name] += 1
	}
	for _, destructive := range results.destructiveUpdate {
		s.queuedAllocs[destructive.placeTaskGroup.Name] += 1
	}

	// Compute the placements
	place := make([]placementResult, 0, len(results.place))
	for _, p := range results.place {
		place = append(place, p)
	}

	destructive := make([]placementResult, 0, len(results.destructiveUpdate))
	for _, p := range results.destructiveUpdate {
		destructive = append(destructive, p)
	}
	return s.computePlacements(destructive, place)
}

// computePlacements computes placements for allocations. It is given the set of
// destructive updates to place and the set of new placements to place.
func (s *GenericScheduler) computePlacements(destructive, place []placementResult) error {
	// Get the base nodes
	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	var deploymentID string
	if s.deployment != nil {
		deploymentID = s.deployment.ID
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	// Destructive changes have to be handled first because their resources
	// need to be discounted from the nodes. To understand this, imagine the
	// resources were reduced and the count was scaled up.
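	// For example, if an update halves each task's memory but doubles the group
	// count, the destructive updates must be handled (and their old allocations
	// stopped) before the extra placements are scored, otherwise a node can
	// appear to lack room that is actually about to be freed.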
	for _, results := range [][]placementResult{destructive, place} {
		for _, missing := range results {
			// Get the task group
			tg := missing.TaskGroup()

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[tg.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}

			// Find the preferred node
			preferredNode, err := s.findPreferredNode(missing)
			if err != nil {
				return err
			}

			// Check if we should stop the previous allocation upon successful
			// placement of its replacement. This allows atomic placements/stops. We
			// stop the allocation before trying to find a replacement because this
			// frees the resources currently used by the previous allocation.
			stopPrevAlloc, stopPrevAllocDesc := missing.StopPreviousAlloc()
			prevAllocation := missing.PreviousAllocation()
			if stopPrevAlloc {
				s.plan.AppendUpdate(prevAllocation, structs.AllocDesiredStatusStop, stopPrevAllocDesc, "")
			}

			// Compute penalty nodes for rescheduled allocs
			selectOptions := getSelectOptions(prevAllocation, preferredNode)
			option, _ := s.stack.Select(tg, selectOptions)

			// Store the available nodes by datacenter
			s.ctx.Metrics().NodesAvailable = byDC

			// Set fields based on if we found an allocation option
			if option != nil {
				// Create an allocation for this
				alloc := &structs.Allocation{
					ID:            uuid.Generate(),
					Namespace:     s.job.Namespace,
					EvalID:        s.eval.ID,
					Name:          missing.Name(),
					JobID:         s.job.ID,
					TaskGroup:     tg.Name,
					Metrics:       s.ctx.Metrics(),
					NodeID:        option.Node.ID,
					DeploymentID:  deploymentID,
					TaskResources: option.TaskResources,
					DesiredStatus: structs.AllocDesiredStatusRun,
					ClientStatus:  structs.AllocClientStatusPending,

					SharedResources: &structs.Resources{
						DiskMB: tg.EphemeralDisk.SizeMB,
					},
				}

				// If the new allocation is replacing an older allocation then we record
				// the older allocation's ID so that the two are chained together
				if prevAllocation != nil {
					alloc.PreviousAllocation = prevAllocation.ID
					if missing.IsRescheduling() {
						updateRescheduleTracker(alloc, prevAllocation)
					}
				}

				// If we are placing a canary and we found a match, add the canary
				// to the deployment state object.
				if missing.Canary() {
					if state, ok := s.deployment.TaskGroups[tg.Name]; ok {
						state.PlacedCanaries = append(state.PlacedCanaries, alloc.ID)
					}
				}

				// Track the placement
				s.plan.AppendAlloc(alloc)

			} else {
				// Lazy initialize the failed map
				if s.failedTGAllocs == nil {
					s.failedTGAllocs = make(map[string]*structs.AllocMetric)
				}

				// Track the fact that we didn't find a placement
				s.failedTGAllocs[tg.Name] = s.ctx.Metrics()

				// If we weren't able to find a replacement for the allocation, back
				// out the fact that we asked to stop the allocation.
				if stopPrevAlloc {
					s.plan.PopUpdate(prevAllocation)
				}
			}

		}
	}

	return nil
}

// getSelectOptions sets up preferred nodes and penalty nodes
func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs.Node) *SelectOptions {
	selectOptions := &SelectOptions{}
	if prevAllocation != nil {
		penaltyNodes := make(map[string]struct{})
		penaltyNodes[prevAllocation.NodeID] = struct{}{}
		if prevAllocation.RescheduleTracker != nil {
			for _, reschedEvent := range prevAllocation.RescheduleTracker.Events {
				penaltyNodes[reschedEvent.PrevNodeID] = struct{}{}
			}
		}
		selectOptions.PenaltyNodeIDs = penaltyNodes
	}
	if preferredNode != nil {
		selectOptions.PreferredNodes = []*structs.Node{preferredNode}
	}
	return selectOptions
}
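
// The penalty node IDs gathered above are consulted by the placement stack when
// scoring candidate nodes, so a rescheduled allocation is steered away from
// nodes it already failed on, while a preferred node (sticky ephemeral disk) is
// tried first when one is supplied.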

// updateRescheduleTracker carries over the previous allocation's reschedule
// attempts and appends an event for the reschedule happening now
func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation) {
	var rescheduleEvents []*structs.RescheduleEvent
	if prev.RescheduleTracker != nil {
		for _, reschedEvent := range prev.RescheduleTracker.Events {
			rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
		}
	}
	rescheduleEvent := structs.NewRescheduleEvent(time.Now().UTC().UnixNano(), prev.ID, prev.NodeID)
	rescheduleEvents = append(rescheduleEvents, rescheduleEvent)
	alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents}
}
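
// For example, if the previous allocation had already been rescheduled once,
// the new allocation's tracker contains two events: a copy of the earlier one
// plus a new event recording the previous allocation's ID, its node, and the
// current wall-clock time.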

// findPreferredNode finds the preferred node for an allocation
func (s *GenericScheduler) findPreferredNode(place placementResult) (node *structs.Node, err error) {
	if prev := place.PreviousAllocation(); prev != nil && place.TaskGroup().EphemeralDisk.Sticky {
		var preferredNode *structs.Node
		ws := memdb.NewWatchSet()
		preferredNode, err = s.state.NodeByID(ws, prev.NodeID)
		if err != nil {
			return nil, err
		}
		if preferredNode != nil && preferredNode.Ready() {
			node = preferredNode
		}
	}
	return
}
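
// Sticky ephemeral disks are the only case with a preferred node: the task
// group asked for its data to stay in place, so the scheduler favors the node
// that ran the previous allocation as long as that node is still ready.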