github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/scheduler/generic_sched.go

package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocLost is the status used when an allocation is lost
	allocLost = "alloc is lost since its node is down"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"

	// blockedEvalMaxPlanDesc is the description used for blocked evals that are
	// a result of hitting the max number of plan attempts
	blockedEvalMaxPlanDesc = "created due to placement conflicts"

	// blockedEvalFailedPlacements is the description used for blocked evals
	// that are a result of failing to place all allocations.
	blockedEvalFailedPlacements = "created to place remaining allocations"
)

// SetStatusError is used to set the status of the evaluation to the given error
type SetStatusError struct {
	Err        error
	EvalStatus string
}

func (s *SetStatusError) Error() string {
	return s.Err.Error()
}

// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
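// The same struct serves both modes: the batch field, set by the
// NewServiceScheduler and NewBatchScheduler factories below, selects the
// retry limit and how the placement stack and terminal-alloc filtering behave.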
type GenericScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner
	batch   bool

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *GenericStack

	limitReached bool
	nextEval     *structs.Evaluation

	blocked        *structs.Evaluation
	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewServiceScheduler is a factory function to instantiate a new service scheduler
func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   false,
	}
	return s
}

// NewBatchScheduler is a factory function to instantiate a new batch scheduler
func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   true,
	}
	return s
}

// Process is used to handle a single evaluation
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
		structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
			s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs)
	}

	// Retry up to the maxScheduleAttempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	limit := maxServiceScheduleAttempts
	if s.batch {
		limit = maxBatchScheduleAttempts
	}
	if err := retryMax(limit, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			// Scheduling was tried but made no forward progress so create a
			// blocked eval to retry once resources become available.
			var mErr multierror.Error
			if err := s.createBlockedEval(true); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
				s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			return mErr.ErrorOrNil()
		}
		return err
	}

	// If the current evaluation is a blocked evaluation and we didn't place
	// everything, do not update the status to complete.
	if s.eval.Status == structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 {
		e := s.ctx.Eligibility()
		newEval := s.eval.Copy()
		newEval.EscapedComputedClass = e.HasEscaped()
		newEval.ClassEligibility = e.GetClasses()
		return s.planner.ReblockEval(newEval)
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked,
		s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs)
}

// createBlockedEval creates a blocked eval and submits it to the planner. If
// failure is set to true, the eval's trigger reason reflects that.
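// The blocked eval carries the computed-class eligibility (or the fact that
// the eval escaped computed classes) so the blocked-evals tracker can judge
// when newly available capacity makes retrying worthwhile.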
func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped)
	if planFailure {
		s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
		s.blocked.StatusDescription = blockedEvalMaxPlanDesc
	} else {
		s.blocked.StatusDescription = blockedEvalFailedPlacements
	}

	return s.planner.CreateEval(s.blocked)
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *GenericScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}
	numTaskGroups := 0
	if s.job != nil {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewGenericStack(s.batch, s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If there are failed allocations, we need to create a blocked evaluation
	// to place the failed allocations when resources become available. If the
	// current evaluation is already a blocked eval, we reuse it.
	if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil {
		if err := s.createBlockedEval(false); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID)
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the plan
	// anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan and store the results.
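	// SubmitPlan hands the plan to the planner for application and returns the
	// applied result; if the scheduler was working from a stale state snapshot,
	// a refreshed State is also returned so the next attempt uses current data.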
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		if newState == nil {
			return false, fmt.Errorf("missing state refresh after partial commit")
		}
		return false, nil
	}

	// Success!
	return true, nil
}

// filterCompleteAllocs filters out allocations that are terminal so they can
// be replaced, returning the remaining allocations and the terminal
// allocations indexed by name.
func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) ([]*structs.Allocation, map[string]*structs.Allocation) {
	filter := func(a *structs.Allocation) bool {
		if s.batch {
			// Allocs from batch jobs should be filtered when the desired status
			// is terminal and the client did not finish or when the client
			// status is failed so that they will be replaced. If they are
			// complete but not failed, they shouldn't be replaced.
			switch a.DesiredStatus {
			case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
				return !a.RanSuccessfully()
			default:
			}

			switch a.ClientStatus {
			case structs.AllocClientStatusFailed:
				return true
			default:
				return false
			}
		}

		// Filter terminal, non-batch allocations
		return a.TerminalStatus()
	}

	terminalAllocsByName := make(map[string]*structs.Allocation)
	n := len(allocs)
	for i := 0; i < n; i++ {
		if filter(allocs[i]) {

			// Add the allocation to the terminal allocs map if it's not already
			// present or if it has a higher create index than the one currently
			// stored.
			alloc, ok := terminalAllocsByName[allocs[i].Name]
			if !ok || alloc.CreateIndex < allocs[i].CreateIndex {
				terminalAllocsByName[allocs[i].Name] = allocs[i]
			}

			// Remove the allocation
			allocs[i], allocs[n-1] = allocs[n-1], nil
			i--
			n--
		}
	}

	// If the job is batch, we want to filter allocations that have been
	// replaced by a newer version for the same task group.
	filtered := allocs[:n]
	if s.batch {
		byTG := make(map[string]*structs.Allocation)
		for _, alloc := range filtered {
			existing := byTG[alloc.Name]
			if existing == nil || existing.CreateIndex < alloc.CreateIndex {
				byTG[alloc.Name] = alloc
			}
		}

		filtered = make([]*structs.Allocation, 0, len(byTG))
		for _, alloc := range byTG {
			filtered = append(filtered, alloc)
		}
	}

	return filtered, terminalAllocsByName
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
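// It stops allocations that are no longer needed, attempts in-place updates,
// treats migrations, destructive updates and lost allocations as an
// evict-and-place, and finally computes placements for anything left to run.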
func (s *GenericScheduler) computeJobAllocs() error {
	// Materialize all the task groups, job could be missing if deregistered
	var groups map[string]*structs.TaskGroup
	if s.job != nil {
		groups = materializeTaskGroups(s.job)
	}

	// Lookup the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	// Filter out the allocations in a terminal state
	allocs, terminalAllocs := s.filterCompleteAllocs(allocs)

	// Diff the required and existing allocations
	diff := diffAllocs(s.job, tainted, groups, allocs, terminalAllocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded, "")
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update) + len(diff.migrate) + len(diff.lost)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat migrations as an eviction and a new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.migrate, allocMigrating, &limit)

	// Treat non in-place updates as an eviction and new placement.
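	// evictAndPlace decrements the shared limit as it queues evictions and
	// placements, returning true once the limit is exhausted so that a
	// follow-up rolling-update eval is created in process().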
	s.limitReached = s.limitReached || evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Lost allocations should be transitioned to desired status stop and client
	// status lost and a new placement should be made
	s.limitReached = s.limitReached || markLostAndPlace(s.ctx, diff, diff.lost, allocLost, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		if s.job != nil {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per Task Group
	for _, allocTuple := range diff.place {
		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *GenericScheduler) computePlacements(place []allocTuple) error {
	// Get the base nodes
	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	for _, missing := range place {
		// Check if this task group has already failed
		if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
			metric.CoalescedFailures += 1
			continue
		}

		// Find the preferred node
		preferredNode, err := s.findPreferredNode(&missing)
		if err != nil {
			return err
		}

		// Attempt to match the task group
		var option *RankedNode
		if preferredNode != nil {
			option, _ = s.stack.SelectPreferringNodes(missing.TaskGroup, []*structs.Node{preferredNode})
		} else {
			option, _ = s.stack.Select(missing.TaskGroup)
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = byDC

		// Set fields based on whether we found an allocation option
		if option != nil {
			// Create an allocation for this
			alloc := &structs.Allocation{
				ID:            structs.GenerateUUID(),
				EvalID:        s.eval.ID,
				Name:          missing.Name,
				JobID:         s.job.ID,
				TaskGroup:     missing.TaskGroup.Name,
				Metrics:       s.ctx.Metrics(),
				NodeID:        option.Node.ID,
				TaskResources: option.TaskResources,
				DesiredStatus: structs.AllocDesiredStatusRun,
				ClientStatus:  structs.AllocClientStatusPending,

				SharedResources: &structs.Resources{
					DiskMB: missing.TaskGroup.EphemeralDisk.SizeMB,
				},
			}

			// If the new allocation is replacing an older allocation then we
			// record the older allocation's ID so that they are chained
			if missing.Alloc != nil {
				alloc.PreviousAllocation = missing.Alloc.ID
			}

			s.plan.AppendAlloc(alloc)
		} else {
			// Lazy initialize the failed map
			if s.failedTGAllocs == nil {
				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
		}
	}

	return nil
}

// findPreferredNode finds the preferred node for an allocation
func (s *GenericScheduler) findPreferredNode(allocTuple *allocTuple) (node *structs.Node, err error) {
	if allocTuple.Alloc != nil {
		taskGroup := allocTuple.Alloc.Job.LookupTaskGroup(allocTuple.Alloc.TaskGroup)
		if taskGroup == nil {
			err = fmt.Errorf("can't find task group of existing allocation %q", allocTuple.Alloc.ID)
			return
		}
		if taskGroup.EphemeralDisk.Sticky {
			var preferredNode *structs.Node
			preferredNode, err = s.state.NodeByID(allocTuple.Alloc.NodeID)
			if err != nil {
				return
			}
			if preferredNode != nil && preferredNode.Ready() {
				node = preferredNode
			}
		}
	}
	return
}
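// Usage sketch: a minimal, illustrative wiring of the service scheduler. The
// State snapshot and the Planner are assumed to be supplied by the surrounding
// Nomad server (the worker implements Planner); the names below are for
// illustration only and are not part of this package.
//
//	sched := NewServiceScheduler(logger, stateSnapshot, worker)
//	if err := sched.Process(eval); err != nil {
//		logger.Printf("[ERR] sched: failed to process eval %q: %v", eval.ID, err)
//	}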