github.com/ncodes/nomad@v0.5.7-0.20170403112158-97adf4a74fb3/scheduler/generic_sched.go

package scheduler

import (
	"fmt"
	"log"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/go-multierror"
	"github.com/ncodes/nomad/nomad/structs"
)

const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocLost is the status used when an allocation is lost
	allocLost = "alloc is lost since its node is down"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"

	// blockedEvalMaxPlanDesc is the description used for blocked evals that are
	// a result of hitting the max number of plan attempts
	blockedEvalMaxPlanDesc = "created due to placement conflicts"

	// blockedEvalFailedPlacements is the description used for blocked evals
	// that are a result of failing to place all allocations.
	blockedEvalFailedPlacements = "created to place remaining allocations"
)

// SetStatusError wraps an error with the evaluation status that should be set
// when the error is returned.
type SetStatusError struct {
	Err        error
	EvalStatus string
}

func (s *SetStatusError) Error() string {
	return s.Err.Error()
}

// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
type GenericScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner
	batch   bool

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *GenericStack

	// limitReached is set when the rolling update limit is hit, and nextEval
	// is the follow-up evaluation created to continue after the stagger period.
	limitReached bool
	nextEval     *structs.Evaluation

	// blocked is the eval created to hold allocations that could not be placed,
	// failedTGAllocs records the placement failure metrics per task group, and
	// queuedAllocs counts the allocations still waiting to be placed per task group.
	blocked        *structs.Evaluation
	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewServiceScheduler is a factory function to instantiate a new service scheduler
func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   false,
	}
	return s
}

// NewBatchScheduler is a factory function to instantiate a new batch scheduler
func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   true,
	}
	return s
}
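
// A minimal usage sketch (illustrative only; the logger, state store, planner,
// and evaluation wiring shown here are assumptions, not part of this file):
//
//	sched := NewServiceScheduler(logger, stateStore, planner)
//	if err := sched.Process(eval); err != nil {
//		// the evaluation could not be processed
//	}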

// Process is used to handle a single evaluation
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
		structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
			s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs)
	}

	// Retry up to the maxScheduleAttempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	limit := maxServiceScheduleAttempts
	if s.batch {
		limit = maxBatchScheduleAttempts
	}
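	// retryMax (defined elsewhere in this package) re-invokes s.process until
	// it reports that no further work remains, returns a hard error, or the
	// attempt limit is exhausted; the progress check resets the attempt count
	// whenever the previous plan submission made forward progress.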
	if err := retryMax(limit, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			// Scheduling was tried but made no forward progress so create a
			// blocked eval to retry once resources become available.
			var mErr multierror.Error
			if err := s.createBlockedEval(true); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
				s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			return mErr.ErrorOrNil()
		}
		return err
	}

	// If the current evaluation is a blocked evaluation and we didn't place
	// everything, do not update the status to complete.
	if s.eval.Status == structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 {
		e := s.ctx.Eligibility()
		newEval := s.eval.Copy()
		newEval.EscapedComputedClass = e.HasEscaped()
		newEval.ClassEligibility = e.GetClasses()
		return s.planner.ReblockEval(newEval)
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
		s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs)
}

// createBlockedEval creates a blocked eval and submits it to the planner. If
// planFailure is set to true, the eval's trigger reason and description reflect
// that the maximum number of plan attempts was hit.
func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped)
	if planFailure {
		s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
		s.blocked.StatusDescription = blockedEvalMaxPlanDesc
	} else {
		s.blocked.StatusDescription = blockedEvalFailedPlacements
	}

	return s.planner.CreateEval(s.blocked)
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts. It returns true
// when no further attempts are required, along with any error that occurred.
func (s *GenericScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}
	numTaskGroups := 0
	if s.job != nil {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewGenericStack(s.batch, s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If there are failed allocations, we need to create a blocked evaluation
	// to place the failed allocations when resources become available. If the
	// current evaluation is already a blocked eval, we reuse it.
	if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil {
		if err := s.createBlockedEval(false); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID)
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan and store the results.
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		if newState == nil {
			return false, fmt.Errorf("missing state refresh after partial commit")
		}
		return false, nil
	}

	// Success!
	return true, nil
}

// filterCompleteAllocs filters out allocations that are terminal and should be
// re-placed, returning the remaining allocations along with a map of the most
// recent terminal allocation for each allocation name.
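// For batch jobs, for example, if an allocation named "job.web[0]" has failed
// and a newer allocation with the same name completed successfully, the failed
// allocation is moved into the terminal map while the completed one is kept,
// so the task group is not re-placed. (This is an illustrative reading of the
// filter below, not additional behavior.)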
func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) ([]*structs.Allocation, map[string]*structs.Allocation) {
	filter := func(a *structs.Allocation) bool {
		if s.batch {
			// Allocs from batch jobs should be filtered when the desired status
			// is terminal and the client did not finish or when the client
			// status is failed so that they will be replaced. If they are
			// complete but not failed, they shouldn't be replaced.
			switch a.DesiredStatus {
			case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
				return !a.RanSuccessfully()
			default:
			}

			switch a.ClientStatus {
			case structs.AllocClientStatusFailed:
				return true
			default:
				return false
			}
		}

		// Filter terminal, non-batch allocations
		return a.TerminalStatus()
	}

	terminalAllocsByName := make(map[string]*structs.Allocation)
	n := len(allocs)
	for i := 0; i < n; i++ {
		if filter(allocs[i]) {

			// Add the allocation to the terminal allocs map if it's not already
			// present or if it has a higher create index than the one currently
			// stored.
			alloc, ok := terminalAllocsByName[allocs[i].Name]
			if !ok || alloc.CreateIndex < allocs[i].CreateIndex {
				terminalAllocsByName[allocs[i].Name] = allocs[i]
			}

			// Remove the allocation
			allocs[i], allocs[n-1] = allocs[n-1], nil
			i--
			n--
		}
	}

	// If the job is batch, we want to filter allocations that have been
	// replaced by a newer version for the same task group.
	filtered := allocs[:n]
	if s.batch {
		byTG := make(map[string]*structs.Allocation)
		for _, alloc := range filtered {
			existing := byTG[alloc.Name]
			if existing == nil || existing.CreateIndex < alloc.CreateIndex {
				byTG[alloc.Name] = alloc
			}
		}

		filtered = make([]*structs.Allocation, 0, len(byTG))
		for _, alloc := range byTG {
			filtered = append(filtered, alloc)
		}
	}

	return filtered, terminalAllocsByName
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *GenericScheduler) computeJobAllocs() error {
	// Materialize all the task groups, job could be missing if deregistered
	var groups map[string]*structs.TaskGroup
	if s.job != nil {
		groups = materializeTaskGroups(s.job)
	}

	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	// Filter out the allocations in a terminal state
	allocs, terminalAllocs := s.filterCompleteAllocs(allocs)

	// Diff the required and existing allocations
	diff := diffAllocs(s.job, tainted, groups, allocs, terminalAllocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded, "")
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update) + len(diff.migrate) + len(diff.lost)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}
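
	// evictAndPlace and markLostAndPlace (defined elsewhere in this package)
	// consume from the shared limit computed above and report whether it was
	// exhausted; once limitReached is set, process() creates the follow-up
	// evaluation that continues the rolling update after the stagger period.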

	// Treat migrations as an eviction and a new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.migrate, allocMigrating, &limit)

	// Treat non in-place updates as an eviction and new placement.
	s.limitReached = s.limitReached || evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Lost allocations should be transitioned to a desired status of stop and a
	// client status of lost, and a new placement should be made.
	s.limitReached = s.limitReached || markLostAndPlace(s.ctx, diff, diff.lost, allocLost, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		if s.job != nil {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, allocTuple := range diff.place {
		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *GenericScheduler) computePlacements(place []allocTuple) error {
	// Get the base nodes
	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	for _, missing := range place {
		// Check if this task group has already failed
		if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
			metric.CoalescedFailures += 1
			continue
		}

		// Find the preferred node
		preferredNode, err := s.findPreferredNode(&missing)
		if err != nil {
			return err
		}

		// Attempt to match the task group
		var option *RankedNode
		if preferredNode != nil {
			option, _ = s.stack.SelectPreferringNodes(missing.TaskGroup, []*structs.Node{preferredNode})
		} else {
			option, _ = s.stack.Select(missing.TaskGroup)
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = byDC

		// Set fields based on whether we found an allocation option
		if option != nil {
			// Create an allocation for this
			alloc := &structs.Allocation{
				ID:            structs.GenerateUUID(),
				EvalID:        s.eval.ID,
				Name:          missing.Name,
				JobID:         s.job.ID,
				TaskGroup:     missing.TaskGroup.Name,
				Metrics:       s.ctx.Metrics(),
				NodeID:        option.Node.ID,
				TaskResources: option.TaskResources,
				DesiredStatus: structs.AllocDesiredStatusRun,
				ClientStatus:  structs.AllocClientStatusPending,

				SharedResources: &structs.Resources{
					DiskMB: missing.TaskGroup.EphemeralDisk.SizeMB,
				},
			}

			// If the new allocation is replacing an older allocation then we
			// record the older allocation's ID so that the two are chained.
			if missing.Alloc != nil {
				alloc.PreviousAllocation = missing.Alloc.ID
			}

			s.plan.AppendAlloc(alloc)
		} else {
			// Lazy initialize the failed map
			if s.failedTGAllocs == nil {
				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
		}
	}

	return nil
}

// findPreferredNode finds the preferred node for an allocation; when the task
// group uses sticky ephemeral disks, this is the node of the previous
// allocation, provided that node is still ready.
func (s *GenericScheduler) findPreferredNode(allocTuple *allocTuple) (node *structs.Node, err error) {
	if allocTuple.Alloc != nil {
		taskGroup := allocTuple.Alloc.Job.LookupTaskGroup(allocTuple.Alloc.TaskGroup)
		if taskGroup == nil {
			err = fmt.Errorf("can't find task group of existing allocation %q", allocTuple.Alloc.ID)
			return
		}
		if taskGroup.EphemeralDisk.Sticky {
			var preferredNode *structs.Node
			ws := memdb.NewWatchSet()
			preferredNode, err = s.state.NodeByID(ws, allocTuple.Alloc.NodeID)
			// Guard against a lookup error or a missing node before
			// dereferencing the result.
			if err != nil {
				return
			}
			if preferredNode != nil && preferredNode.Ready() {
				node = preferredNode
			}
		}
	}
	return
}