github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/scheduler/system_sched.go

package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "system alloc not needed as node is tainted"
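	// Other status strings referenced below (allocNotNeeded, allocUpdating)
	// are defined elsewhere in this package.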
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

	limitReached bool
	nextEval     *structs.Evaluation
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, structs.EvalStatusFailed, desc)
	}

	// Retry up to the maxSystemScheduleAttempts and reset if progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, statusErr.EvalStatus, err.Error())
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, structs.EvalStatusComplete, "")
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}

	// Get the ready nodes in the required datacenters
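	// The job may be nil (for example after a deregister), so only query for
	// nodes when it still exists.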
	if s.job != nil {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Reset the failed allocations
	s.eval.FailedTGAllocs = nil

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail. If AnnotatePlan is set, submit the
	// plan anyway to get the annotations.
	if s.plan.IsNoOp() && !s.eval.AnnotatePlan {
		return true, nil
	}

	// If the limit of placements was reached, we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
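	// Keep the result around so the progress check in Process can tell whether
	// this attempt placed or updated anything before deciding to retry.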
	s.planResult = result
	if err != nil {
		return false, err
	}

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed; this indicates a potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Filter out the allocations in a terminal state
	allocs = structs.FilterTerminalAllocs(allocs)

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Diff the required and existing allocations
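	// diffSystemAllocs works per node: every ready node should run the job's
	// task groups, so each resulting place, update, and stop entry is tied to
	// a specific node.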
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded)
	}

	// Attempt to do the upgrades in place
	destructiveUpdates, inplaceUpdates := inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)
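	// Anything that could not be updated in place is left as a destructive
	// update and handled below via evict-and-place.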
	diff.update = destructiveUpdates

	if s.eval.AnnotatePlan {
		s.plan.Annotations = &structs.PlanAnnotations{
			DesiredTGUpdates: desiredUpdates(diff, inplaceUpdates, destructiveUpdates),
		}
	}

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non-in-place updates as an eviction and a new placement.
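	// If the rolling-update limit is hit, limitReached causes process() to
	// create a follow-up evaluation after the stagger period.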
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		return nil
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

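	// System allocations are pinned to a particular node, so the stack is fed
	// a single-element slice that is reused for each placement.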
	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, _ := s.stack.Select(missing.TaskGroup)

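		// If no node option was found, record the failure once per task group
		// and coalesce any further failures into that metric.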
		if option == nil {
			// Check if this task group has already failed
			if metric, ok := s.eval.FailedTGAllocs[missing.TaskGroup.Name]; ok {
				metric.CoalescedFailures += 1
				continue
			}
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Set fields based on whether we found an allocation option
		if option != nil {
			// Create an allocation for this
			alloc := &structs.Allocation{
				ID:            structs.GenerateUUID(),
				EvalID:        s.eval.ID,
				Name:          missing.Name,
				JobID:         s.job.ID,
				TaskGroup:     missing.TaskGroup.Name,
				Metrics:       s.ctx.Metrics(),
				NodeID:        option.Node.ID,
				TaskResources: option.TaskResources,
				DesiredStatus: structs.AllocDesiredStatusRun,
				ClientStatus:  structs.AllocClientStatusPending,
			}

			// Generate service IDs for the tasks in this allocation
			// COMPAT - This is no longer required and will be removed in v0.4
			alloc.PopulateServiceIDs(missing.TaskGroup)

			s.plan.AppendAlloc(alloc)
		} else {
			// Lazy initialize the failed map
			if s.eval.FailedTGAllocs == nil {
				s.eval.FailedTGAllocs = make(map[string]*structs.AllocMetric)
			}

			s.eval.FailedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
		}
	}

	return nil
}