github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/scheduler/generic_sched.go

package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxServiceScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for services.
	maxServiceScheduleAttempts = 5

	// maxBatchScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for batch.
	maxBatchScheduleAttempts = 2

	// allocNotNeeded is the status used when a job no longer requires an allocation
	allocNotNeeded = "alloc not needed due to job update"

	// allocMigrating is the status used when we must migrate an allocation
	allocMigrating = "alloc is being migrated"

	// allocUpdating is the status used when a job requires an update
	allocUpdating = "alloc is being updated due to job update"

	// allocInPlace is the status used when speculating on an in-place update
	allocInPlace = "alloc updating in-place"
)

// SetStatusError is used to set the status of the evaluation to the given
// EvalStatus while surfacing the wrapped error.
type SetStatusError struct {
	Err        error
	EvalStatus string
}

func (s *SetStatusError) Error() string {
	return s.Err.Error()
}

// GenericScheduler is used for 'service' and 'batch' type jobs. This scheduler is
// designed for long-lived services, and as such spends more time attempting
// to make a high quality placement. This is the primary scheduler for
// most workloads. It also supports a 'batch' mode to optimize for fast decision
// making at the cost of quality.
type GenericScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner
	batch   bool // true for batch jobs, false for long-lived services

	eval  *structs.Evaluation
	job   *structs.Job
	plan  *structs.Plan
	ctx   *EvalContext
	stack *GenericStack

	// limitReached records that a rolling update hit its placement limit;
	// nextEval is the follow-up evaluation created to continue the update.
	limitReached bool
	nextEval     *structs.Evaluation
}

// NewServiceScheduler is a factory function to instantiate a new service scheduler
func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   false,
	}
	return s
}

// NewBatchScheduler is a factory function to instantiate a new batch scheduler
func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	s := &GenericScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
		batch:   true,
	}
	return s
}
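
// A minimal usage sketch (illustrative only; it assumes the caller already
// holds a *log.Logger, a State implementation, a Planner, and an evaluation
// to process):
//
//	sched := NewServiceScheduler(logger, state, planner)
//	if err := sched.Process(eval); err != nil {
//		// handle the scheduling error
//	}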

// Process is used to handle a single evaluation
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusFailed, desc)
	}

	// Retry up to the schedule attempt limit for the job type
	limit := maxServiceScheduleAttempts
	if s.batch {
		limit = maxBatchScheduleAttempts
	}
	if err := retryMax(limit, s.process); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, statusErr.EvalStatus, err.Error())
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusComplete, "")
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *GenericScheduler) process() (bool, error) {
	// Look up the Job by ID
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewGenericStack(s.batch, s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail
	if s.plan.IsNoOp() {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	if err != nil {
		return false, err
	}

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed; this indicates a potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *GenericScheduler) computeJobAllocs() error {
	// Materialize all the task groups; the job could be missing if it was deregistered
	var groups map[string]*structs.TaskGroup
	if s.job != nil {
		groups = materializeTaskGroups(s.job)
	}

	// Look up the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Filter out the allocations in a terminal state
	allocs = structs.FilterTerminalAllocs(allocs)

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Diff the required and existing allocations
	diff := diffAllocs(s.job, tainted, groups, allocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded)
	}

	// Attempt to do the upgrades in place
	diff.update = inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update) + len(diff.migrate)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

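	// Both evictAndPlace passes below draw down the shared limit; once it is
	// exhausted, limitReached is set so that process (above) creates the
	// follow-up evaluation after the rolling-update stagger.
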
	// Treat migrations as an eviction and a new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.migrate, allocMigrating, &limit)

	// Treat non in-place updates as an eviction and new placement, preserving
	// any limit already reached by the migrations above.
	s.limitReached = s.limitReached || evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		return nil
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *GenericScheduler) computePlacements(place []allocTuple) error {
	// Get the base nodes
	nodes, err := readyNodesInDCs(s.state, s.job.Datacenters)
	if err != nil {
		return err
	}

	// Update the set of placement nodes
	s.stack.SetNodes(nodes)

	// Track the failed task groups so that we can coalesce
	// the failures together to avoid creating many failed allocs.
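	// For example, if a task group of count 10 has no feasible nodes, the loop
	// below emits a single failed allocation whose Metrics records nine
	// coalesced failures rather than ten separate failed allocations.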
	failedTG := make(map[*structs.TaskGroup]*structs.Allocation)

	for _, missing := range place {
		// Check if this task group has already failed
		if alloc, ok := failedTG[missing.TaskGroup]; ok {
			alloc.Metrics.CoalescedFailures += 1
			continue
		}

		// Attempt to match the task group
		option, size := s.stack.Select(missing.TaskGroup)

		// Create an allocation for this task group
		alloc := &structs.Allocation{
			ID:        structs.GenerateUUID(),
			EvalID:    s.eval.ID,
			Name:      missing.Name,
			JobID:     s.job.ID,
			Job:       s.job,
			TaskGroup: missing.TaskGroup.Name,
			Resources: size,
			Metrics:   s.ctx.Metrics(),
		}

		// Set fields based on whether we found an allocation option
		if option != nil {
			alloc.NodeID = option.Node.ID
			alloc.TaskResources = option.TaskResources
			alloc.DesiredStatus = structs.AllocDesiredStatusRun
			alloc.ClientStatus = structs.AllocClientStatusPending
			s.plan.AppendAlloc(alloc)
		} else {
			alloc.DesiredStatus = structs.AllocDesiredStatusFailed
			alloc.DesiredDescription = "failed to find a node for placement"
			alloc.ClientStatus = structs.AllocClientStatusFailed
			s.plan.AppendFailed(alloc)
			failedTG[missing.TaskGroup] = alloc
		}
	}
	return nil
}