github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/scheduler/generic_sched.go

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  
     7  	"github.com/davecgh/go-spew/spew"
     8  	"github.com/hashicorp/go-multierror"
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  )
    11  
    12  const (
    13  	// maxServiceScheduleAttempts is used to limit the number of times
    14  	// we will attempt to schedule if we continue to hit conflicts for services.
    15  	maxServiceScheduleAttempts = 5
    16  
    17  	// maxBatchScheduleAttempts is used to limit the number of times
    18  	// we will attempt to schedule if we continue to hit conflicts for batch.
    19  	maxBatchScheduleAttempts = 2
    20  
    21  	// allocNotNeeded is the status used when a job no longer requires an allocation
    22  	allocNotNeeded = "alloc not needed due to job update"
    23  
    24  	// allocMigrating is the status used when we must migrate an allocation
    25  	allocMigrating = "alloc is being migrated"
    26  
    27  	// allocUpdating is the status used when a job requires an update
    28  	allocUpdating = "alloc is being updated due to job update"
    29  
    30  	// allocInPlace is the status used when speculating on an in-place update
    31  	allocInPlace = "alloc updating in-place"
    32  )
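
        // The alloc* strings above are used as the human-readable reason recorded
        // on allocations whose desired state the scheduler changes, either directly
        // via plan.AppendUpdate or through helpers such as evictAndPlace (see
        // computeJobAllocs below).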
    33  
    34  // SetStatusError wraps an error along with the evaluation status that should be set as a result
    35  type SetStatusError struct {
    36  	Err        error
    37  	EvalStatus string
    38  }
    39  
    40  func (s *SetStatusError) Error() string {
    41  	return s.Err.Error()
    42  }
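
        // Process type-asserts errors returned by the retry loop to *SetStatusError
        // so that the evaluation can be marked with the intended status (for
        // example, failed) rather than with a generic error.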
    43  
    44  // GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
    45  // designed for long-lived services, and as such spends more time attempting
    46  // to make a high quality placement. This is the primary scheduler for
    47  // most workloads. It also supports a 'batch' mode to optimize for fast decision
    48  // making at the cost of quality.
    49  type GenericScheduler struct {
    50  	logger  *log.Logger
    51  	state   State
    52  	planner Planner
    53  	batch   bool
    54  
    55  	eval       *structs.Evaluation
    56  	job        *structs.Job
    57  	plan       *structs.Plan
    58  	planResult *structs.PlanResult
    59  	ctx        *EvalContext
    60  	stack      *GenericStack
    61  
    62  	limitReached bool
    63  	nextEval     *structs.Evaluation
    64  
    65  	blocked *structs.Evaluation
    66  }
    67  
    68  // NewServiceScheduler is a factory function to instantiate a new service scheduler
    69  func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
    70  	s := &GenericScheduler{
    71  		logger:  logger,
    72  		state:   state,
    73  		planner: planner,
    74  		batch:   false,
    75  	}
    76  	return s
    77  }
    78  
    79  // NewBatchScheduler is a factory function to instantiate a new batch scheduler
    80  func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
    81  	s := &GenericScheduler{
    82  		logger:  logger,
    83  		state:   state,
    84  		planner: planner,
    85  		batch:   true,
    86  	}
    87  	return s
    88  }
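
        // The two constructors differ only in the batch flag, which selects the
        // retry limit (maxBatchScheduleAttempts vs. maxServiceScheduleAttempts),
        // changes how terminal allocations are filtered in filterCompleteAllocs,
        // and is passed through to the placement stack.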
    89  
    90  // Process is used to handle a single evaluation
    91  func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
    92  	// Store the evaluation
    93  	s.eval = eval
    94  
    95  	// Verify the evaluation trigger reason is understood
    96  	switch eval.TriggeredBy {
    97  	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
    98  		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
    99  		structs.EvalTriggerPeriodicJob:
   100  	default:
   101  		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
   102  			eval.TriggeredBy)
   103  		return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, structs.EvalStatusFailed, desc)
   104  	}
   105  
   106  	// Retry up to the maximum number of schedule attempts, resetting the attempt count if progress is made.
   107  	progress := func() bool { return progressMade(s.planResult) }
   108  	limit := maxServiceScheduleAttempts
   109  	if s.batch {
   110  		limit = maxBatchScheduleAttempts
   111  	}
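
        	// retryMax (defined elsewhere in this package) calls s.process up to
        	// limit times; the progress callback resets the attempt count whenever
        	// the previous plan submission made progress, so only consecutive
        	// fruitless attempts count against the limit.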
   112  	if err := retryMax(limit, s.process, progress); err != nil {
   113  		if statusErr, ok := err.(*SetStatusError); ok {
   114  			// Scheduling was tried but made no forward progress, so create a
   115  			// blocked eval to retry once resources become available.
   116  			var mErr multierror.Error
   117  			if err := s.createBlockedEval(true); err != nil {
   118  				mErr.Errors = append(mErr.Errors, err)
   119  			}
   120  			if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, statusErr.EvalStatus, err.Error()); err != nil {
   121  				mErr.Errors = append(mErr.Errors, err)
   122  			}
   123  			return mErr.ErrorOrNil()
   124  		}
   125  		return err
   126  	}
   127  
   128  	// If the current evaluation is a blocked evaluation and we didn't place
   129  	// everything, do not update the status to complete.
   130  	if s.eval.Status == structs.EvalStatusBlocked && len(s.eval.FailedTGAllocs) != 0 {
   131  		return s.planner.ReblockEval(s.eval)
   132  	}
   133  
   134  	// Update the status to complete
   135  	return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, structs.EvalStatusComplete, "")
   136  }
   137  
   138  // createBlockedEval creates a blocked eval and submits it to the planner. If
   139  // planFailure is set to true, the eval's trigger reason reflects that.
   140  func (s *GenericScheduler) createBlockedEval(planFailure bool) error {
   141  	e := s.ctx.Eligibility()
   142  	escaped := e.HasEscaped()
   143  
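        	// A blocked eval is re-evaluated once node capacity changes. Recording
        	// which computed node classes were eligible allows it to be unblocked
        	// only when relevant capacity appears; if feasibility checking escaped
        	// class-level tracking, the per-class map is omitted and the eval can be
        	// unblocked by any change in capacity.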
   144  	// Only store the eligible classes if the eval hasn't escaped.
   145  	var classEligibility map[string]bool
   146  	if !escaped {
   147  		classEligibility = e.GetClasses()
   148  	}
   149  
   150  	s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped)
   151  	if planFailure {
   152  		s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans
   153  	}
   154  
   155  	return s.planner.CreateEval(s.blocked)
   156  }
   157  
   158  // process is wrapped in retryMax to iteratively run the handler until we have no
   159  // further work or we've made the maximum number of attempts.
   160  func (s *GenericScheduler) process() (bool, error) {
   161  	// Lookup the Job by ID
   162  	var err error
   163  	s.job, err = s.state.JobByID(s.eval.JobID)
   164  	if err != nil {
   165  		return false, fmt.Errorf("failed to get job '%s': %v",
   166  			s.eval.JobID, err)
   167  	}
   168  
   169  	// Create a plan
   170  	s.plan = s.eval.MakePlan(s.job)
   171  
   172  	// Reset the failed allocations
   173  	s.eval.FailedTGAllocs = nil
   174  
   175  	// Create an evaluation context
   176  	s.ctx = NewEvalContext(s.state, s.plan, s.logger)
   177  
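        	// The generic stack performs node feasibility checking and ranking for
        	// each task group; the batch flag configures it to favor scheduling speed
        	// over placement quality (see the GenericScheduler comment above).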
   178  	// Construct the placement stack
   179  	s.stack = NewGenericStack(s.batch, s.ctx)
   180  	if s.job != nil {
   181  		s.stack.SetJob(s.job)
   182  	}
   183  
   184  	// Compute the target job allocations
   185  	if err := s.computeJobAllocs(); err != nil {
   186  		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
   187  		return false, err
   188  	}
   189  
   190  	// If there are failed allocations, we need to create a blocked evaluation
   191  	// to place the failed allocations when resources become available. If the
   192  	// current evaluation is already a blocked eval, we reuse it.
   193  	if s.eval.Status != structs.EvalStatusBlocked && len(s.eval.FailedTGAllocs) != 0 && s.blocked == nil {
   194  		if err := s.createBlockedEval(false); err != nil {
   195  			s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err)
   196  			return false, err
   197  		}
   198  		s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID)
   199  	}
   200  
   201  	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the plan
   202  	// anyway to get the annotations.
   203  	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
   204  		return true, nil
   205  	}
   206  
   207  	// If the limit of placements was reached, we need to create an evaluation
   208  	// to pick up from here after the stagger period.
   209  	if s.limitReached && s.nextEval == nil {
   210  		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
   211  		if err := s.planner.CreateEval(s.nextEval); err != nil {
   212  			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
   213  			return false, err
   214  		}
   215  		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
   216  	}
   217  
   218  	// Submit the plan and store the results.
   219  	result, newState, err := s.planner.SubmitPlan(s.plan)
   220  	s.planResult = result
   221  	if err != nil {
   222  		return false, err
   223  	}
   224  
   225  	// If we got a state refresh, try again since we have stale data
   226  	if newState != nil {
   227  		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
   228  		s.state = newState
   229  		return false, nil
   230  	}
   231  
   232  	// Try again if the plan was not fully committed, potential conflict
   233  	fullCommit, expected, actual := result.FullCommit(s.plan)
   234  	if !fullCommit {
   235  		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
   236  			s.eval, expected, actual)
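        		// newState is always nil at this point (a non-nil refresh returns
        		// above), so a partial commit without a state refresh is surfaced as
        		// an error rather than silently retried.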
   237  		if newState == nil {
   238  			return false, fmt.Errorf("missing state refresh after partial commit")
   239  		}
   240  		return false, nil
   241  	}
   242  
   243  	// Success!
   244  	return true, nil
   245  }
   246  
   247  // filterCompleteAllocs filters out allocations that are terminal and should be
   248  // re-placed, so that they are no longer counted as existing allocations.
   249  func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []*structs.Allocation {
   250  	filter := func(a *structs.Allocation) bool {
   251  		if s.batch {
   252  			// Allocs from batch jobs should be filtered when the desired status
   253  			// is terminal and the client did not finish or when the client
   254  			// status is failed so that they will be replaced. If they are
   255  			// complete but not failed, they shouldn't be replaced.
   256  			switch a.DesiredStatus {
   257  			case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict, structs.AllocDesiredStatusFailed:
   258  				return !a.RanSuccessfully()
   259  			default:
   260  			}
   261  
   262  			switch a.ClientStatus {
   263  			case structs.AllocClientStatusFailed:
   264  				return true
   265  			default:
   266  				return false
   267  			}
   268  		}
   269  
   270  		// For non-batch jobs, filter out terminal allocations
   271  		return a.TerminalStatus()
   272  	}
   273  
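        	// Compact the slice in place: allocations selected by the filter are
        	// swapped to the end (and nil'd out so they can be garbage collected)
        	// and the slice is truncated to the survivors.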
   274  	n := len(allocs)
   275  	for i := 0; i < n; i++ {
   276  		if filter(allocs[i]) {
   277  			allocs[i], allocs[n-1] = allocs[n-1], nil
   278  			i--
   279  			n--
   280  		}
   281  	}
   282  	return allocs[:n]
   283  }
   284  
   285  // computeJobAllocs is used to reconcile differences between the job,
   286  // existing allocations and node status to update the allocations.
   287  func (s *GenericScheduler) computeJobAllocs() error {
   288  	// Materialize all the task groups; the job could be missing if it was deregistered
   289  	var groups map[string]*structs.TaskGroup
   290  	if s.job != nil {
   291  		groups = materializeTaskGroups(s.job)
   292  	}
   293  
   294  	// Lookup the allocations by JobID
   295  	allocs, err := s.state.AllocsByJob(s.eval.JobID)
   296  	if err != nil {
   297  		return fmt.Errorf("failed to get allocs for job '%s': %v",
   298  			s.eval.JobID, err)
   299  	}
   300  
   301  	// Filter out the allocations in a terminal state
   302  	allocs = s.filterCompleteAllocs(allocs)
   303  
   304  	// Determine the tainted nodes containing job allocs
   305  	tainted, err := taintedNodes(s.state, allocs)
   306  	if err != nil {
   307  		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
   308  			s.eval.JobID, err)
   309  	}
   310  
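        	// diffAllocs buckets allocations into place (new allocations needed),
        	// update (existing allocations whose job definition changed), migrate
        	// (allocations on tainted nodes), stop (allocations no longer needed)
        	// and ignore (allocations that are already up to date).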
   311  	// Diff the required and existing allocations
   312  	diff := diffAllocs(s.job, tainted, groups, allocs)
   313  	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)
   314  
   315  	// XXX: For debugging purposes only. An issue was observed where a job had a
   316  	// task group with count > 0 that produced a diff where no action would be
   317  	// taken (every slice was empty). Below we dump debug information if this
   318  	// condition is hit.
   319  	diffSum := len(diff.stop) + len(diff.place) + len(diff.ignore) +
   320  		len(diff.update) + len(diff.migrate)
   321  	if diffSum == 0 && len(groups) != 0 {
   322  		s.logger.Printf("[ERR] sched: %d tasks to schedule but scheduler believes there is no work", len(groups))
   323  
   324  		// Get the original set of allocations for the job.
   325  		jobAllocs, err := s.state.AllocsByJob(s.eval.JobID)
   326  		if err != nil {
   327  			return fmt.Errorf("failed to get allocs for job '%s': %v", s.eval.JobID, err)
   328  		}
   329  		s.logger.Printf("[DEBUG] sched: job: %s", spew.Sdump(s.job))
   330  		s.logger.Printf("[DEBUG] sched: materializeTaskGroups() returned: %s", spew.Sdump(groups))
   331  		s.logger.Printf("[DEBUG] sched: AllocsByJob(%q) returned: %s", s.eval.JobID, spew.Sdump(jobAllocs))
   332  		s.logger.Printf("[DEBUG] sched: filterCompleteAllocs(): %s", spew.Sdump(allocs))
   333  		s.logger.Printf("[DEBUG] sched: taintedNodes(): %s", spew.Sdump(tainted))
   334  	}
   335  
   336  	// Add all the allocs to stop
   337  	for _, e := range diff.stop {
   338  		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded)
   339  	}
   340  
   341  	// Attempt to do the upgrades in place; updates that cannot be done in place come back as destructive updates and are handled below as an evict and re-place
   342  	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
   343  	diff.update = destructiveUpdates
   344  
   345  	if s.eval.AnnotatePlan {
   346  		s.plan.Annotations = &structs.PlanAnnotations{
   347  			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
   348  		}
   349  	}
   350  
   351  	// Check if a rolling upgrade strategy is being used
   352  	limit := len(diff.update) + len(diff.migrate)
   353  	if s.job != nil && s.job.Update.Rolling() {
   354  		limit = s.job.Update.MaxParallel
   355  	}
   356  
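        	// evictAndPlace stops and re-places up to limit of the given allocations
        	// and reports whether the limit was hit; if it was, a follow-up rolling
        	// update eval is created after the stagger period (see process above).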
   357  	// Treat migrations as an eviction and a new placement.
   358  	s.limitReached = evictAndPlace(s.ctx, diff, diff.migrate, allocMigrating, &limit)
   359  
   360  	// Treat non in-place updates as an eviction and new placement.
   361  	s.limitReached = s.limitReached || evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)
   362  
   363  	// Nothing remaining to do if placement is not required
   364  	if len(diff.place) == 0 {
   365  		return nil
   366  	}
   367  
   368  	// Compute the placements
   369  	return s.computePlacements(diff.place)
   370  }
   371  
   372  // computePlacements computes placements for allocations
   373  func (s *GenericScheduler) computePlacements(place []allocTuple) error {
   374  	// Get the base nodes
   375  	nodes, byDC, err := readyNodesInDCs(s.state, s.job.Datacenters)
   376  	if err != nil {
   377  		return err
   378  	}
   379  
   380  	// Update the set of placement nodes
   381  	s.stack.SetNodes(nodes)
   382  
   383  	for _, missing := range place {
   384  		// Check if this task group has already failed; if so, skip the placement attempt and coalesce the failure into the existing metric
   385  		if metric, ok := s.eval.FailedTGAllocs[missing.TaskGroup.Name]; ok {
   386  			metric.CoalescedFailures += 1
   387  			continue
   388  		}
   389  
   390  		// Attempt to match the task group
   391  		option, _ := s.stack.Select(missing.TaskGroup)
   392  
   393  		// Store the available nodes by datacenter
   394  		s.ctx.Metrics().NodesAvailable = byDC
   395  
   396  		// Set fields based on whether we found an allocation option
   397  		if option != nil {
   398  			// Create an allocation for this
   399  			alloc := &structs.Allocation{
   400  				ID:            structs.GenerateUUID(),
   401  				EvalID:        s.eval.ID,
   402  				Name:          missing.Name,
   403  				JobID:         s.job.ID,
   404  				TaskGroup:     missing.TaskGroup.Name,
   405  				Metrics:       s.ctx.Metrics(),
   406  				NodeID:        option.Node.ID,
   407  				TaskResources: option.TaskResources,
   408  				DesiredStatus: structs.AllocDesiredStatusRun,
   409  				ClientStatus:  structs.AllocClientStatusPending,
   410  			}
   411  
   412  			// Generate service IDs for the tasks in this allocation
   413  			// COMPAT - This is no longer required and will be removed in v0.4
   414  			alloc.PopulateServiceIDs(missing.TaskGroup)
   415  
   416  			s.plan.AppendAlloc(alloc)
   417  		} else {
   418  			// Lazy initialize the failed map
   419  			if s.eval.FailedTGAllocs == nil {
   420  				s.eval.FailedTGAllocs = make(map[string]*structs.AllocMetric)
   421  			}
   422  
   423  			s.eval.FailedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
   424  		}
   425  	}
   426  
   427  	return nil
   428  }