github.com/quite/nomad@v0.8.6/scheduler/system_sched.go

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  
     7  	memdb "github.com/hashicorp/go-memdb"
     8  	"github.com/hashicorp/nomad/helper/uuid"
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  )
    11  
    12  const (
    13  	// maxSystemScheduleAttempts is used to limit the number of times
    14  	// we will attempt to schedule if we continue to hit conflicts for system
    15  	// jobs.
    16  	maxSystemScheduleAttempts = 5
    17  )
    18  
    19  // SystemScheduler is used for 'system' jobs. This scheduler is
    20  // designed for services that should be run on every client.
    21  type SystemScheduler struct {
    22  	logger  *log.Logger
    23  	state   State
    24  	planner Planner
    25  
    26  	eval       *structs.Evaluation
    27  	job        *structs.Job
    28  	plan       *structs.Plan
    29  	planResult *structs.PlanResult
    30  	ctx        *EvalContext
    31  	stack      *SystemStack
    32  	nodes      []*structs.Node
    33  	nodesByDC  map[string]int
    34  
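         	// limitReached is set when the rolling update limit stops further
         	// placements; nextEval is the follow-up evaluation created to continue
         	// after the stagger period.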
    35  	limitReached bool
    36  	nextEval     *structs.Evaluation
    37  
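         	// failedTGAllocs records allocation metrics for task groups that could
         	// not be placed; queuedAllocs tracks the number of pending placements
         	// per task group.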
    38  	failedTGAllocs map[string]*structs.AllocMetric
    39  	queuedAllocs   map[string]int
    40  }
    41  
    42  // NewSystemScheduler is a factory function to instantiate a new system
    43  // scheduler.
    44  func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
    45  	return &SystemScheduler{
    46  		logger:  logger,
    47  		state:   state,
    48  		planner: planner,
    49  	}
    50  }
    51  
    52  // Process is used to handle a single evaluation.
    53  func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
    54  	// Store the evaluation
    55  	s.eval = eval
    56  
    57  	// Verify the evaluation trigger reason is understood
    58  	switch eval.TriggeredBy {
    59  	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerFailedFollowUp,
    60  		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
    61  		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain:
    62  	default:
    63  		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
    64  			eval.TriggeredBy)
    65  		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc,
    66  			s.queuedAllocs, "")
    67  	}
    68  
     69  	// Retry up to maxSystemScheduleAttempts times, resetting the count whenever progress is made.
    70  	progress := func() bool { return progressMade(s.planResult) }
    71  	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
    72  		if statusErr, ok := err.(*SetStatusError); ok {
    73  			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
    74  				s.queuedAllocs, "")
    75  		}
    76  		return err
    77  	}
    78  
    79  	// Update the status to complete
    80  	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusComplete, "",
    81  		s.queuedAllocs, "")
    82  }
    83  
    84  // process is wrapped in retryMax to iteratively run the handler until we have no
    85  // further work or we've made the maximum number of attempts.
    86  func (s *SystemScheduler) process() (bool, error) {
     87  	// Look up the job by ID
    88  	var err error
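         	// The watch set is required by the state store API; it is not used for
         	// blocking here.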
    89  	ws := memdb.NewWatchSet()
    90  	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
    91  	if err != nil {
    92  		return false, fmt.Errorf("failed to get job '%s': %v",
    93  			s.eval.JobID, err)
    94  	}
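         	// Size queuedAllocs based on the job's task groups; a stopped job has
         	// nothing left to place.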
    95  	numTaskGroups := 0
    96  	if !s.job.Stopped() {
    97  		numTaskGroups = len(s.job.TaskGroups)
    98  	}
    99  	s.queuedAllocs = make(map[string]int, numTaskGroups)
   100  
   101  	// Get the ready nodes in the required datacenters
   102  	if !s.job.Stopped() {
   103  		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
   104  		if err != nil {
   105  			return false, fmt.Errorf("failed to get ready nodes: %v", err)
   106  		}
   107  	}
   108  
   109  	// Create a plan
   110  	s.plan = s.eval.MakePlan(s.job)
   111  
   112  	// Reset the failed allocations
   113  	s.failedTGAllocs = nil
   114  
   115  	// Create an evaluation context
   116  	s.ctx = NewEvalContext(s.state, s.plan, s.logger)
   117  
   118  	// Construct the placement stack
   119  	s.stack = NewSystemStack(s.ctx)
   120  	if !s.job.Stopped() {
   121  		s.stack.SetJob(s.job)
   122  	}
   123  
   124  	// Compute the target job allocations
   125  	if err := s.computeJobAllocs(); err != nil {
   126  		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
   127  		return false, err
   128  	}
   129  
    130  	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
    131  	// plan anyway to get the annotations.
   132  	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
   133  		return true, nil
   134  	}
   135  
    136  	// If the limit of placements was reached, we need to create an evaluation
    137  	// to pick up from here after the stagger period.
   138  	if s.limitReached && s.nextEval == nil {
   139  		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
   140  		if err := s.planner.CreateEval(s.nextEval); err != nil {
   141  			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
   142  			return false, err
   143  		}
   144  		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
   145  	}
   146  
   147  	// Submit the plan
   148  	result, newState, err := s.planner.SubmitPlan(s.plan)
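         	// Keep the result so the progress check in Process can detect whether
         	// progress was made.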
   149  	s.planResult = result
   150  	if err != nil {
   151  		return false, err
   152  	}
   153  
   154  	// Decrement the number of allocations pending per task group based on the
   155  	// number of allocations successfully placed
   156  	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)
   157  
   158  	// If we got a state refresh, try again since we have stale data
   159  	if newState != nil {
   160  		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
   161  		s.state = newState
   162  		return false, nil
   163  	}
   164  
    165  	// Try again if the plan was not fully committed; this indicates a potential conflict
   166  	fullCommit, expected, actual := result.FullCommit(s.plan)
   167  	if !fullCommit {
   168  		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
   169  			s.eval, expected, actual)
   170  		return false, nil
   171  	}
   172  
   173  	// Success!
   174  	return true, nil
   175  }
   176  
    177  // computeJobAllocs is used to reconcile differences between the job, the
    178  // existing allocations, and node status in order to update the allocations.
   179  func (s *SystemScheduler) computeJobAllocs() error {
    180  	// Look up the allocations by JobID
   181  	ws := memdb.NewWatchSet()
   182  	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
   183  	if err != nil {
   184  		return fmt.Errorf("failed to get allocs for job '%s': %v",
   185  			s.eval.JobID, err)
   186  	}
   187  
   188  	// Determine the tainted nodes containing job allocs
   189  	tainted, err := taintedNodes(s.state, allocs)
   190  	if err != nil {
   191  		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
   192  			s.eval.JobID, err)
   193  	}
   194  
    195  	// Mark allocations that are in a pending/running state on tainted nodes
    196  	// as lost
   197  	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)
   198  
   199  	// Filter out the allocations in a terminal state
   200  	allocs, terminalAllocs := structs.FilterTerminalAllocs(allocs)
   201  
   202  	// Diff the required and existing allocations
   203  	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs)
   204  	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)
   205  
   206  	// Add all the allocs to stop
   207  	for _, e := range diff.stop {
   208  		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded, "")
   209  	}
   210  
   211  	// Add all the allocs to migrate
   212  	for _, e := range diff.migrate {
   213  		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNodeTainted, "")
   214  	}
   215  
   216  	// Lost allocations should be transitioned to desired status stop and client
   217  	// status lost.
   218  	for _, e := range diff.lost {
   219  		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
   220  	}
   221  
   222  	// Attempt to do the upgrades in place
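         	// Updates that cannot be done in place are returned as destructive
         	// updates and are treated below as an evict-and-place.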
   223  	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
   224  	diff.update = destructiveUpdates
   225  
   226  	if s.eval.AnnotatePlan {
   227  		s.plan.Annotations = &structs.PlanAnnotations{
   228  			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
   229  		}
   230  	}
   231  
   232  	// Check if a rolling upgrade strategy is being used
   233  	limit := len(diff.update)
   234  	if !s.job.Stopped() && s.job.Update.Rolling() {
   235  		limit = s.job.Update.MaxParallel
   236  	}
   237  
    238  	// Treat non-in-place updates as an eviction and a new placement.
   239  	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)
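         	// When the rolling update limit stops further placements, limitReached
         	// causes process to create a follow-up evaluation after the stagger period.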
   240  
   241  	// Nothing remaining to do if placement is not required
   242  	if len(diff.place) == 0 {
   243  		if !s.job.Stopped() {
   244  			for _, tg := range s.job.TaskGroups {
   245  				s.queuedAllocs[tg.Name] = 0
   246  			}
   247  		}
   248  		return nil
   249  	}
   250  
    251  	// Record the number of allocations that need to be placed per task group
   252  	for _, allocTuple := range diff.place {
   253  		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
   254  	}
   255  
   256  	// Compute the placements
   257  	return s.computePlacements(diff.place)
   258  }
   259  
   260  // computePlacements computes placements for allocations
   261  func (s *SystemScheduler) computePlacements(place []allocTuple) error {
   262  	nodeByID := make(map[string]*structs.Node, len(s.nodes))
   263  	for _, node := range s.nodes {
   264  		nodeByID[node.ID] = node
   265  	}
   266  
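         	// Reuse a single-element slice: each system placement is evaluated
         	// against one specific node.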
   267  	nodes := make([]*structs.Node, 1)
   268  	for _, missing := range place {
   269  		node, ok := nodeByID[missing.Alloc.NodeID]
   270  		if !ok {
   271  			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
   272  		}
   273  
   274  		// Update the set of placement nodes
   275  		nodes[0] = node
   276  		s.stack.SetNodes(nodes)
   277  
   278  		// Attempt to match the task group
   279  		option, _ := s.stack.Select(missing.TaskGroup, nil)
   280  
   281  		if option == nil {
    282  			// If nodes were filtered because of constraint mismatches and we
    283  			// could not create an allocation, then decrement the number queued
    284  			// for that task group
   285  			if s.ctx.metrics.NodesFiltered > 0 {
   286  				s.queuedAllocs[missing.TaskGroup.Name] -= 1
   287  
    288  				// If we are annotating the plan, then decrement the desired
    289  				// placements, since the node did not meet the constraints
   290  				if s.eval.AnnotatePlan && s.plan.Annotations != nil &&
   291  					s.plan.Annotations.DesiredTGUpdates != nil {
   292  					desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
   293  					desired.Place -= 1
   294  				}
   295  			}
   296  
   297  			// Check if this task group has already failed
   298  			if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
   299  				metric.CoalescedFailures += 1
   300  				continue
   301  			}
   302  		}
   303  
    304  		// Store the count of available nodes by datacenter in the metrics
   305  		s.ctx.Metrics().NodesAvailable = s.nodesByDC
   306  
    307  		// Set fields based on whether we found an allocation option
   308  		if option != nil {
    309  			// Create an allocation for this node
   310  			alloc := &structs.Allocation{
   311  				ID:            uuid.Generate(),
   312  				Namespace:     s.job.Namespace,
   313  				EvalID:        s.eval.ID,
   314  				Name:          missing.Name,
   315  				JobID:         s.job.ID,
   316  				TaskGroup:     missing.TaskGroup.Name,
   317  				Metrics:       s.ctx.Metrics(),
   318  				NodeID:        option.Node.ID,
   319  				TaskResources: option.TaskResources,
   320  				DesiredStatus: structs.AllocDesiredStatusRun,
   321  				ClientStatus:  structs.AllocClientStatusPending,
   322  
   323  				SharedResources: &structs.Resources{
   324  					DiskMB: missing.TaskGroup.EphemeralDisk.SizeMB,
   325  				},
   326  			}
   327  
    328  			// If the new allocation is replacing an older allocation, record the
    329  			// older allocation's ID so that the two are chained
   330  			if missing.Alloc != nil {
   331  				alloc.PreviousAllocation = missing.Alloc.ID
   332  			}
   333  
   334  			s.plan.AppendAlloc(alloc)
   335  		} else {
   336  			// Lazy initialize the failed map
   337  			if s.failedTGAllocs == nil {
   338  				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
   339  			}
   340  
   341  			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
   342  		}
   343  	}
   344  
   345  	return nil
   346  }