github.com/ranjib/nomad@v0.1.1-0.20160225204057-97751b02f70b/scheduler/generic_sched.go

package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"
)

// SetStatusError is used to set the status of the evaluation to the given error
type SetStatusError struct {
	Err        error
	EvalStatus string
}

func (s *SetStatusError) Error() string {
	return s.Err.Error()
}

// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
type GenericScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner
	batch   bool

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *GenericStack

	limitReached bool
	nextEval     *structs.Evaluation

	blocked *structs.Evaluation
}

// NewServiceScheduler is a factory function to instantiate a new service scheduler
func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   false,
	}
	return s
}

// NewBatchScheduler is a factory function to instantiate a new batch scheduler
func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   true,
	}
	return s
}

// Process is used to handle a single evaluation
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
		structs.EvalTriggerPeriodicJob:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusFailed, desc)
	}

	// Retry up to the maxScheduleAttempts and reset if progress is made.
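	// (retryMax and progressMade are helpers defined elsewhere in this
	// package; the assumption here is that each call to s.process counts as
	// one attempt, and the attempt counter resets whenever the previous plan
	// result shows forward progress.)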
	progress := func() bool { return progressMade(s.planResult) }
	limit := maxServiceScheduleAttempts
	if s.batch {
		limit = maxBatchScheduleAttempts
	}
	if err := retryMax(limit, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			// Scheduling was tried but made no forward progress so create a
			// blocked eval to retry once resources become available.
			var mErr multierror.Error
			if err := s.createBlockedEval(); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, statusErr.EvalStatus, err.Error()); err != nil {
				mErr.Errors = append(mErr.Errors, err)
			}
			return mErr.ErrorOrNil()
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusComplete, "")
}

// createBlockedEval creates a blocked eval and stores it.
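// The blocked eval records which node classes were eligible unless the
// eval's constraints escaped class-level tracking, presumably so that it is
// only unblocked once capacity that could actually satisfy it becomes
// available. Eligibility and HasEscaped come from the eval context and are
// defined elsewhere in this package.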
func (s *GenericScheduler) createBlockedEval() error {
	e := s.ctx.Eligibility()
	escaped := e.HasEscaped()

	// Only store the eligible classes if the eval hasn't escaped.
	var classEligibility map[string]bool
	if !escaped {
		classEligibility = e.GetClasses()
	}

	s.blocked = s.eval.BlockedEval(classEligibility, escaped)
	return s.planner.CreateEval(s.blocked)
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
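// It returns true when no further scheduling work remains for this
// evaluation, and false when another attempt should be made, optionally with
// an error describing why the attempt failed.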
func (s *GenericScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewGenericStack(s.batch, s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail
	if s.plan.IsNoOp() {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
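	// (NextRollingEval is assumed to produce a follow-up evaluation delayed
	// by the job's update stagger, so the remaining rolling updates continue
	// later rather than all at once.)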
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// If there are failed allocations, we need to create a blocked evaluation
	// to place the failed allocations when resources become available.
	if len(s.plan.FailedAllocs) != 0 && s.blocked == nil {
		if err := s.createBlockedEval(); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID)
	}

	// Submit the plan and store the results.
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// If we got a state refresh, try again since we have stale data
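	// (SubmitPlan is assumed to return a refreshed state snapshot when the
	// plan was evaluated against newer cluster state than this scheduler was
	// using.)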
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
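	// (FullCommit presumably compares the placements requested in the plan
	// with those the planner actually accepted; a shortfall indicates a
	// concurrent plan won some of the targeted nodes.)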
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		if newState == nil {
			return false, fmt.Errorf("missing state refresh after partial commit")
		}
		return false, nil
	}

	// Success!
	return true, nil
}

// filterCompleteAllocs filters out the terminal allocations that should be
// re-placed. For batch jobs only failed allocations are filtered out, so
// completed allocations are not run again; for other jobs every terminal
// allocation is filtered out.
func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []*structs.Allocation {
	filter := func(a *structs.Allocation) bool {
		// Allocs from batch jobs should be filtered when their status is failed so that
		// they will be replaced. If they are dead but not failed, they
		// shouldn't be replaced.
		if s.batch {
			return a.ClientStatus == structs.AllocClientStatusFailed
		}

		// Filter terminal, non batch allocations
		return a.TerminalStatus()
	}

	n := len(allocs)
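	// Filter in place: move the last element into the slot of each filtered
	// allocation, shrink the window by one, and re-check the swapped-in
	// element before moving on.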
	for i := 0; i < n; i++ {
		if filter(allocs[i]) {
			allocs[i], allocs[n-1] = allocs[n-1], nil
			i--
			n--
		}
	}
	return allocs[:n]
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *GenericScheduler) computeJobAllocs() error {
	// Materialize all the task groups, job could be missing if deregistered
	var groups map[string]*structs.TaskGroup
	if s.job != nil {
		groups = materializeTaskGroups(s.job)
	}

	// Lookup the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Filter out the allocations in a terminal state
	allocs = s.filterCompleteAllocs(allocs)

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Diff the required and existing allocations
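	// (diffAllocs is assumed to bucket the work into allocations to place,
	// update, migrate, and stop, based on the materialized task groups, the
	// existing allocations, and the tainted nodes.)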
	diff := diffAllocs(s.job, tainted, groups, allocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded)
	}

	// Attempt to do the upgrades in place
	diff.update = inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)

	// Check if a rolling upgrade strategy is being used
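	// When the job defines a rolling update, MaxParallel caps how many
	// destructive updates and migrations this pass may perform; otherwise the
	// limit covers everything that needs to change.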
	limit := len(diff.update) + len(diff.migrate)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat migrations as an eviction and a new placement.
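	// (evictAndPlace is assumed to append a stop for each evicted allocation
	// and queue a replacement placement, decrementing limit as it goes; it
	// returns true when the limit stopped it from handling everything, which
	// is what causes process to create the follow-up rolling evaluation.)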
	s.limitReached = evictAndPlace(s.ctx, diff, diff.migrate, allocMigrating, &limit)

	// Treat non in-place updates as an eviction and new placement.
	s.limitReached = s.limitReached || evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		return nil
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *GenericScheduler) computePlacements(place []allocTuple) error {
	// Get the base nodes
	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	// Track the failed task groups so that we can coalesce
	// the failures together to avoid creating many failed allocs.
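	// Only one failed allocation is recorded per task group; later failures
	// for the same group just increment its CoalescedFailures metric.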
	failedTG := make(map[*structs.TaskGroup]*structs.Allocation)

	for _, missing := range place {
		// Check if this task group has already failed
		if alloc, ok := failedTG[missing.TaskGroup]; ok {
			alloc.Metrics.CoalescedFailures += 1
			continue
		}

		// Attempt to match the task group
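		// (Select is assumed to run the feasibility and ranking stack for
		// this task group, returning the chosen node option, or nil when no
		// node can hold it, along with the resources the group requires.)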
		option, size := s.stack.Select(missing.TaskGroup)

		// Create an allocation for this placement
		alloc := &structs.Allocation{
			ID:        structs.GenerateUUID(),
			EvalID:    s.eval.ID,
			Name:      missing.Name,
			JobID:     s.job.ID,
			TaskGroup: missing.TaskGroup.Name,
			Resources: size,
			Metrics:   s.ctx.Metrics(),
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = byDC

		// Set fields based on whether we found an allocation option
		if option != nil {
			// Generate service IDs for the tasks in this allocation
			alloc.PopulateServiceIDs(missing.TaskGroup)

			alloc.NodeID = option.Node.ID
			alloc.TaskResources = option.TaskResources
			alloc.DesiredStatus = structs.AllocDesiredStatusRun
			alloc.ClientStatus = structs.AllocClientStatusPending
			alloc.TaskStates = initTaskState(missing.TaskGroup, structs.TaskStatePending)
			s.plan.AppendAlloc(alloc)
		} else {
			alloc.DesiredStatus = structs.AllocDesiredStatusFailed
			alloc.DesiredDescription = "failed to find a node for placement"
			alloc.ClientStatus = structs.AllocClientStatusFailed
			alloc.TaskStates = initTaskState(missing.TaskGroup, structs.TaskStateDead)
			s.plan.AppendFailed(alloc)
			failedTG[missing.TaskGroup] = alloc
		}
	}

	return nil
}