github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/scheduler/system_sched.go

package scheduler

import (
	"fmt"
	"log"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "alloc not needed as node is tainted"
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

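	// limitReached is set when the rolling update limit (MaxParallel) stops
	// further destructive updates in a pass; nextEval is the follow-up
	// evaluation created to continue the rolling update after the stagger.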
	limitReached bool
	nextEval     *structs.Evaluation

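	// failedTGAllocs holds placement failure metrics per task group, and
	// queuedAllocs counts the allocations still waiting to be placed per task
	// group; both are reported back on the evaluation via setStatus.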
	failedTGAllocs map[string]*structs.AllocMetric
	queuedAllocs   map[string]int
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
		structs.EvalTriggerDeploymentWatcher:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusFailed, desc,
			s.queuedAllocs, "")
	}

	// Retry up to the maxSystemScheduleAttempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, statusErr.EvalStatus, err.Error(),
				s.queuedAllocs, "")
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, structs.EvalStatusComplete, "",
		s.queuedAllocs, "")
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	ws := memdb.NewWatchSet()
	s.job, err = s.state.JobByID(ws, s.eval.Namespace, s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}
	numTaskGroups := 0
	if !s.job.Stopped() {
		numTaskGroups = len(s.job.TaskGroups)
	}
	s.queuedAllocs = make(map[string]int, numTaskGroups)

	// Get the ready nodes in the required datacenters
	if !s.job.Stopped() {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.failedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if !s.job.Stopped() {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached, we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// Decrement the number of allocations pending per task group based on the
	// number of allocations successfully placed
	adjustQueuedAllocations(s.logger, result, s.queuedAllocs)

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed; this indicates a potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	allocs, err := s.state.AllocsByJob(ws, s.eval.Namespace, s.eval.JobID, true)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Update the allocations which are in pending/running state on tainted
	// nodes to lost
	updateNonTerminalAllocsToLost(s.plan, tainted, allocs)

	// Filter out the allocations in a terminal state
	allocs, terminalAllocs := structs.FilterTerminalAllocs(allocs)
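	// The latest terminal allocation for each alloc name is kept separately so
	// that new placements can be chained to the allocation they replace (see
	// the PreviousAllocation handling in computePlacements).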

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

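	// The diff groups existing allocations into those to stop, those lost on
	// tainted nodes, those that need an update, and the new placements required
	// on nodes that do not yet have an allocation for this job.
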
	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded, "")
	}

	// Lost allocations should be transitioned to desired status stop and client
	// status lost.
	for _, e := range diff.lost {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if !s.job.Stopped() && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non in-place updates as an eviction and new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)
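	// evictAndPlace reports true when the rolling-update limit capped the
	// number of destructive updates this pass; process() reacts by creating a
	// follow-up evaluation after the stagger period.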

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		if !s.job.Stopped() {
			for _, tg := range s.job.TaskGroups {
				s.queuedAllocs[tg.Name] = 0
			}
		}
		return nil
	}

	// Record the number of allocations that need to be placed per task group
	for _, allocTuple := range diff.place {
		s.queuedAllocs[allocTuple.TaskGroup.Name] += 1
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

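	// System placements each target one specific node, so the stack is fed a
	// single-element node slice that is reused across iterations.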
	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, _ := s.stack.Select(missing.TaskGroup, nil)

		if option == nil {
			// If nodes were filtered because of constraint mismatches and we
			// couldn't create an allocation, decrement the queued count for
			// that task group
			if s.ctx.metrics.NodesFiltered > 0 {
				s.queuedAllocs[missing.TaskGroup.Name] -= 1

				// If we are annotating the plan, then decrement the desired
				// placements based on whether the node meets the constraints
				if s.eval.AnnotatePlan && s.plan.Annotations != nil &&
					s.plan.Annotations.DesiredTGUpdates != nil {
					desired := s.plan.Annotations.DesiredTGUpdates[missing.TaskGroup.Name]
					desired.Place -= 1
				}
			}

			// Check if this task group has already failed
			if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}
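
			// Otherwise fall through so the placement metrics for this task
			// group are recorded in failedTGAllocs below.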
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Set fields based on whether we found an allocation option
		if option != nil {
			// Create an allocation for this
			alloc := &structs.Allocation{
				ID:            uuid.Generate(),
				Namespace:     s.job.Namespace,
				EvalID:        s.eval.ID,
				Name:          missing.Name,
				JobID:         s.job.ID,
				TaskGroup:     missing.TaskGroup.Name,
				Metrics:       s.ctx.Metrics(),
				NodeID:        option.Node.ID,
				TaskResources: option.TaskResources,
				DesiredStatus: structs.AllocDesiredStatusRun,
				ClientStatus:  structs.AllocClientStatusPending,

				SharedResources: &structs.Resources{
					DiskMB: missing.TaskGroup.EphemeralDisk.SizeMB,
				},
			}

			// If the new allocation is replacing an older allocation then we
			// record the older allocation's ID so that the two are chained
			if missing.Alloc != nil {
				alloc.PreviousAllocation = missing.Alloc.ID
			}

			s.plan.AppendAlloc(alloc)
		} else {
			// Lazy initialize the failed map
			if s.failedTGAllocs == nil {
				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
		}
	}

	return nil
}