github.com/ranjib/nomad@v0.1.1-0.20160225204057-97751b02f70b/scheduler/system_sched.go

package scheduler

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// maxSystemScheduleAttempts is used to limit the number of times
	// we will attempt to schedule if we continue to hit conflicts for system
	// jobs.
	maxSystemScheduleAttempts = 5

	// allocNodeTainted is the status used when stopping an alloc because its
	// node is tainted.
	allocNodeTainted = "system alloc not needed as node is tainted"
)

// SystemScheduler is used for 'system' jobs. This scheduler is
// designed for services that should be run on every client.
type SystemScheduler struct {
	logger  *log.Logger
	state   State
	planner Planner

	eval       *structs.Evaluation
	job        *structs.Job
	plan       *structs.Plan
	planResult *structs.PlanResult
	ctx        *EvalContext
	stack      *SystemStack
	nodes      []*structs.Node
	nodesByDC  map[string]int

	limitReached bool
	nextEval     *structs.Evaluation
}

// NewSystemScheduler is a factory function to instantiate a new system
// scheduler.
func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler {
	return &SystemScheduler{
		logger:  logger,
		state:   state,
		planner: planner,
	}
}

// Process is used to handle a single evaluation.
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
	// Store the evaluation
	s.eval = eval

	// Verify the evaluation trigger reason is understood
	switch eval.TriggeredBy {
	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate:
	default:
		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
			eval.TriggeredBy)
		return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusFailed, desc)
	}

	// Retry up to maxSystemScheduleAttempts, resetting the count whenever
	// progress is made.
	progress := func() bool { return progressMade(s.planResult) }
	if err := retryMax(maxSystemScheduleAttempts, s.process, progress); err != nil {
		if statusErr, ok := err.(*SetStatusError); ok {
			return setStatus(s.logger, s.planner, s.eval, s.nextEval, statusErr.EvalStatus, err.Error())
		}
		return err
	}

	// Update the status to complete
	return setStatus(s.logger, s.planner, s.eval, s.nextEval, structs.EvalStatusComplete, "")
}

// process is wrapped in retryMax to iteratively run the handler until we have no
// further work or we've made the maximum number of attempts.
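// It returns true when there is nothing left to do (the plan was a no-op or
// was fully committed), and false when another attempt is needed, e.g. after
// a forced state refresh or a partially committed plan.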
func (s *SystemScheduler) process() (bool, error) {
	// Lookup the Job by ID
	var err error
	s.job, err = s.state.JobByID(s.eval.JobID)
	if err != nil {
		return false, fmt.Errorf("failed to get job '%s': %v",
			s.eval.JobID, err)
	}

	// Get the ready nodes in the required datacenters
	if s.job != nil {
		s.nodes, s.nodesByDC, err = readyNodesInDCs(s.state, s.job.Datacenters)
		if err != nil {
			return false, fmt.Errorf("failed to get ready nodes: %v", err)
		}
	}

	// Create a plan
	s.plan = s.eval.MakePlan(s.job)

	// Create an evaluation context
	s.ctx = NewEvalContext(s.state, s.plan, s.logger)

	// Construct the placement stack
	s.stack = NewSystemStack(s.ctx)
	if s.job != nil {
		s.stack.SetJob(s.job)
	}

	// Compute the target job allocations
	if err := s.computeJobAllocs(); err != nil {
		s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err)
		return false, err
	}

	// If the plan is a no-op, we can bail
	if s.plan.IsNoOp() {
		return true, nil
	}

	// If the limit of placements was reached we need to create an evaluation
	// to pick up from here after the stagger period.
	if s.limitReached && s.nextEval == nil {
		s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger)
		if err := s.planner.CreateEval(s.nextEval); err != nil {
			s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err)
			return false, err
		}
		s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID)
	}

	// Submit the plan
	result, newState, err := s.planner.SubmitPlan(s.plan)
	s.planResult = result
	if err != nil {
		return false, err
	}

	// If we got a state refresh, try again since we have stale data
	if newState != nil {
		s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval)
		s.state = newState
		return false, nil
	}

	// Try again if the plan was not fully committed, potential conflict
	fullCommit, expected, actual := result.FullCommit(s.plan)
	if !fullCommit {
		s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed",
			s.eval, expected, actual)
		return false, nil
	}

	// Success!
	return true, nil
}

// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
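// The diff produced here is split into allocations to place, to update
// (in place or via evict-and-replace), and to stop; placements are then
// delegated to computePlacements.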
func (s *SystemScheduler) computeJobAllocs() error {
	// Lookup the allocations by JobID
	allocs, err := s.state.AllocsByJob(s.eval.JobID)
	if err != nil {
		return fmt.Errorf("failed to get allocs for job '%s': %v",
			s.eval.JobID, err)
	}

	// Filter out the allocations in a terminal state
	allocs = structs.FilterTerminalAllocs(allocs)

	// Determine the tainted nodes containing job allocs
	tainted, err := taintedNodes(s.state, allocs)
	if err != nil {
		return fmt.Errorf("failed to get tainted nodes for job '%s': %v",
			s.eval.JobID, err)
	}

	// Diff the required and existing allocations
	diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs)
	s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff)

	// Add all the allocs to stop
	for _, e := range diff.stop {
		s.plan.AppendUpdate(e.Alloc, structs.AllocDesiredStatusStop, allocNotNeeded)
	}

	// Attempt to do the upgrades in place
	diff.update = inplaceUpdate(s.ctx, s.eval, s.job, s.stack, diff.update)

	// Check if a rolling upgrade strategy is being used
	limit := len(diff.update)
	if s.job != nil && s.job.Update.Rolling() {
		limit = s.job.Update.MaxParallel
	}

	// Treat non in-place updates as an eviction and new placement.
	s.limitReached = evictAndPlace(s.ctx, diff, diff.update, allocUpdating, &limit)

	// Nothing remaining to do if placement is not required
	if len(diff.place) == 0 {
		return nil
	}

	// Compute the placements
	return s.computePlacements(diff.place)
}

// computePlacements computes placements for allocations
func (s *SystemScheduler) computePlacements(place []allocTuple) error {
	nodeByID := make(map[string]*structs.Node, len(s.nodes))
	for _, node := range s.nodes {
		nodeByID[node.ID] = node
	}

	// Track the failed task groups so that we can coalesce
	// the failures together to avoid creating many failed allocs.
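	// Only the first failure for a task group produces a failed allocation;
	// subsequent failures for the same group increment its CoalescedFailures
	// counter instead of creating additional allocations.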
	failedTG := make(map[*structs.TaskGroup]*structs.Allocation)

	nodes := make([]*structs.Node, 1)
	for _, missing := range place {
		node, ok := nodeByID[missing.Alloc.NodeID]
		if !ok {
			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
		}

		// Update the set of placement nodes
		nodes[0] = node
		s.stack.SetNodes(nodes)

		// Attempt to match the task group
		option, size := s.stack.Select(missing.TaskGroup)

		if option == nil {
			// Check if this task group has already failed
			if alloc, ok := failedTG[missing.TaskGroup]; ok {
				alloc.Metrics.CoalescedFailures += 1
				continue
			}
		}

		// Create an allocation for this
		alloc := &structs.Allocation{
			ID:        structs.GenerateUUID(),
			EvalID:    s.eval.ID,
			Name:      missing.Name,
			JobID:     s.job.ID,
			TaskGroup: missing.TaskGroup.Name,
			Resources: size,
			Metrics:   s.ctx.Metrics(),
		}

		// Store the available nodes by datacenter
		s.ctx.Metrics().NodesAvailable = s.nodesByDC

		// Set fields based on if we found an allocation option
		if option != nil {
			// Generate service IDs for the tasks in this allocation
			alloc.PopulateServiceIDs(missing.TaskGroup)

			alloc.NodeID = option.Node.ID
			alloc.TaskResources = option.TaskResources
			alloc.DesiredStatus = structs.AllocDesiredStatusRun
			alloc.ClientStatus = structs.AllocClientStatusPending
			alloc.TaskStates = initTaskState(missing.TaskGroup, structs.TaskStatePending)
			s.plan.AppendAlloc(alloc)
		} else {
			alloc.DesiredStatus = structs.AllocDesiredStatusFailed
			alloc.DesiredDescription = "failed to find a node for placement"
			alloc.ClientStatus = structs.AllocClientStatusFailed
			alloc.TaskStates = initTaskState(missing.TaskGroup, structs.TaskStateDead)
			s.plan.AppendFailed(alloc)
			failedTG[missing.TaskGroup] = alloc
		}
	}
	return nil
}
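// Illustrative sketch (not part of the original source): how a caller might
// drive this scheduler for a single evaluation. The logger, state store,
// planner, and eval values are assumed to be supplied by the surrounding
// scheduling worker; only NewSystemScheduler and Process are defined in this
// file.
//
//	sched := NewSystemScheduler(logger, stateStore, planner)
//	if err := sched.Process(eval); err != nil {
//		logger.Printf("[ERR] sched: eval %q failed: %v", eval.ID, err)
//	}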