github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/scheduler/stack.go

package scheduler

import (
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// skipScoreThreshold is a threshold used in the limit iterator to skip nodes
	// whose score is lower than this. -1 is the lowest possible score for a
	// node with penalties (based on job anti-affinity and node rescheduling
	// penalties).
	skipScoreThreshold = 0.0

	// maxSkip limits the number of nodes that can be skipped in the limit iterator
	maxSkip = 3
)

// Stack is a chained collection of iterators. The stack is used to
// make placement decisions. Different schedulers may customize the
// stack they use to vary the way placements are made.
type Stack interface {
	// SetNodes is used to set the base set of potential nodes
	SetNodes([]*structs.Node)

	// SetJob is used to set the job for selection
	SetJob(job *structs.Job)

	// Select is used to select a node for the task group
	Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources)
}
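
// selectForGroup is an illustrative sketch, not part of the original source,
// showing the contract implied by the Stack interface: the node set and the
// job must be provided before Select is called for a task group. The helper
// name and the empty SelectOptions are assumptions made for this example.
func selectForGroup(stack Stack, nodes []*structs.Node, job *structs.Job, tg *structs.TaskGroup) (*RankedNode, *structs.Resources) {
	// Provide the candidate nodes and the job that placement is for.
	stack.SetNodes(nodes)
	stack.SetJob(job)

	// Ask the stack to pick the best feasible node for this task group.
	return stack.Select(tg, &SelectOptions{})
}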

type SelectOptions struct {
	PenaltyNodeIDs map[string]struct{}
	PreferredNodes []*structs.Node
}

// GenericStack is the Stack used for the Generic scheduler. It is
// designed to make better placement decisions at the cost of performance.
type GenericStack struct {
	batch  bool
	ctx    Context
	source *StaticIterator

	wrappedChecks       *FeasibilityWrapper
	quota               FeasibleIterator
	jobConstraint       *ConstraintChecker
	taskGroupDrivers    *DriverChecker
	taskGroupConstraint *ConstraintChecker

	distinctHostsConstraint    *DistinctHostsIterator
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	jobAntiAff                 *JobAntiAffinityIterator
	nodeReschedulingPenalty    *NodeReschedulingPenaltyIterator
	limit                      *LimitIterator
	maxScore                   *MaxScoreIterator
	nodeAffinity               *NodeAffinityIterator
	spread                     *SpreadIterator
	scoreNorm                  *ScoreNormalizationIterator
}

// NewGenericStack constructs a stack used for selecting service and batch
// placements
func NewGenericStack(batch bool, ctx Context) *GenericStack {
	// Create a new stack
	s := &GenericStack{
		batch: batch,
		ctx:   ctx,
	}

	// Create the source iterator. We randomize the order we visit nodes
	// to reduce collisions between schedulers and to provide basic load
	// balancing across eligible nodes.
	s.source = NewRandomIterator(ctx, nil)

	// Create the quota iterator to determine if placements would cause the
	// quota attached to the job's namespace to be exceeded.
	s.quota = NewQuotaIterator(ctx, s.source)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster to evaluate
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Create the feasibility wrapper, which wraps all feasibility checks so
	// that checking can be skipped if the computed node class has previously
	// been marked as eligible or ineligible. Generally these are checks that
	// only need to examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)

	// Filter on distinct host constraints.
	s.distinctHostsConstraint = NewDistinctHostsIterator(ctx, s.wrappedChecks)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.distinctHostsConstraint)

	// Upgrade from feasible to rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply bin packing; this depends on the resources needed by a particular
	// task group. Only enable eviction for the service scheduler as that
	// logic is expensive.
	evict := !batch
	s.binPack = NewBinPackIterator(ctx, rankSource, evict, 0)

	// Apply the job anti-affinity iterator. This is to avoid placing
	// multiple allocations on the same node for this job.
	s.jobAntiAff = NewJobAntiAffinityIterator(ctx, s.binPack, "")

	s.nodeReschedulingPenalty = NewNodeReschedulingPenaltyIterator(ctx, s.jobAntiAff)

	s.nodeAffinity = NewNodeAffinityIterator(ctx, s.nodeReschedulingPenalty)

	s.spread = NewSpreadIterator(ctx, s.nodeAffinity)

	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.spread)

	// Apply a limit function. This is to avoid scanning *every* possible node.
	s.limit = NewLimitIterator(ctx, s.scoreNorm, 2, skipScoreThreshold, maxSkip)

	// Select the node with the maximum score for placement
	s.maxScore = NewMaxScoreIterator(ctx, s.limit)
	return s
}
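
// exampleGenericStackUsage is an illustrative sketch, not part of the original
// source, of how a generic (service or batch) scheduler might drive this
// stack. It assumes the caller already has a Context, the set of candidate
// nodes, and the job; the function name and the placement of only the first
// task group are assumptions made for this example.
func exampleGenericStackUsage(ctx Context, nodes []*structs.Node, job *structs.Job) (*RankedNode, *structs.Resources) {
	// batch=false builds the service variant: eviction is enabled in bin
	// packing and the selection limit scales with the number of nodes.
	stack := NewGenericStack(false, ctx)

	// The node set and job must be set before any Select call.
	stack.SetNodes(nodes)
	stack.SetJob(job)

	// Select a node for the first task group with no penalty or preferred nodes.
	return stack.Select(job.TaskGroups[0], &SelectOptions{})
}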

func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
	// Shuffle base nodes
	shuffleNodes(baseNodes)

	// Update the set of base nodes
	s.source.SetNodes(baseNodes)

	// Apply a limit function. This is to avoid scanning *every* possible node.
	// For batch jobs we only need to evaluate 2 options and rely on the
	// power of two choices. For service jobs we need to visit "enough" nodes.
	// Using the log of the total number of nodes is a good restriction, with
	// at least 2 as the floor.
	limit := 2
	if n := len(baseNodes); !s.batch && n > 0 {
		logLimit := int(math.Ceil(math.Log2(float64(n))))
		if logLimit > limit {
			limit = logLimit
		}
	}
	s.limit.SetLimit(limit)
}
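
// serviceSelectionLimit is an illustrative helper, not part of the original
// source, that mirrors the limit computation in SetNodes for service jobs:
// ceil(log2(n)) with a floor of 2. For example, 8 eligible nodes yield a
// limit of 3 and 100 nodes yield a limit of 7. The helper name is an
// assumption made for this example.
func serviceSelectionLimit(n int) int {
	limit := 2
	if n > 0 {
		if logLimit := int(math.Ceil(math.Log2(float64(n)))); logLimit > limit {
			limit = logLimit
		}
	}
	return limit
}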

func (s *GenericStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctHostsConstraint.SetJob(job)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetPriority(job.Priority)
	s.jobAntiAff.SetJob(job)
	s.nodeAffinity.SetJob(job)
	s.spread.SetJob(job)
	s.ctx.Eligibility().SetJob(job)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) {

	// This block tries to select from the preferred nodes if the options
	// specify any. It restores the original node set before returning.
	if options != nil && len(options.PreferredNodes) > 0 {
		originalNodes := s.source.nodes
		s.source.SetNodes(options.PreferredNodes)
		optionsNew := *options
		optionsNew.PreferredNodes = nil
		if option, resources := s.Select(tg, &optionsNew); option != nil {
			s.source.SetNodes(originalNodes)
			return option, resources
		}
		s.source.SetNodes(originalNodes)
		return s.Select(tg, &optionsNew)
	}

	// Reset the max selector and context
	s.maxScore.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.distinctHostsConstraint.SetTaskGroup(tg)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.binPack.SetTaskGroup(tg)
	s.jobAntiAff.SetTaskGroup(tg)
	if options != nil {
		s.nodeReschedulingPenalty.SetPenaltyNodes(options.PenaltyNodeIDs)
	}
	s.nodeAffinity.SetTaskGroup(tg)
	s.spread.SetTaskGroup(tg)

	if s.nodeAffinity.hasAffinities() || s.spread.hasSpreads() {
		s.limit.SetLimit(math.MaxInt32)
	}

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Find the node with the max score
	option := s.maxScore.Next()

	// Ensure that the task resources were specified
	if option != nil && len(option.TaskResources) != len(tg.Tasks) {
		for _, task := range tg.Tasks {
			option.SetTaskResources(task, task.Resources)
		}
	}

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option, tgConstr.size
}
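
// exampleSelectWithOptions is an illustrative sketch, not part of the original
// source, showing how SelectOptions influence Select: nodes listed in
// PenaltyNodeIDs receive a rescheduling penalty, and PreferredNodes are tried
// before falling back to the full node set. The function name and prevNodeID
// parameter are assumptions made for this example.
func exampleSelectWithOptions(s *GenericStack, tg *structs.TaskGroup, prevNodeID string, preferred []*structs.Node) (*RankedNode, *structs.Resources) {
	opts := &SelectOptions{
		// Penalize the node the allocation previously ran on.
		PenaltyNodeIDs: map[string]struct{}{prevNodeID: {}},
		// Try these nodes first; Select falls back to the full set if none fit.
		PreferredNodes: preferred,
	}
	return s.Select(tg, opts)
}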

// SystemStack is the Stack used for the System scheduler. It is designed to
// attempt to make placements on all nodes.
type SystemStack struct {
	ctx                        Context
	source                     *StaticIterator
	wrappedChecks              *FeasibilityWrapper
	quota                      FeasibleIterator
	jobConstraint              *ConstraintChecker
	taskGroupDrivers           *DriverChecker
	taskGroupConstraint        *ConstraintChecker
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	scoreNorm                  *ScoreNormalizationIterator
}

// NewSystemStack constructs a stack used for selecting system job placements
func NewSystemStack(ctx Context) *SystemStack {
	// Create a new stack
	s := &SystemStack{ctx: ctx}

	// Create the source iterator. We visit nodes in a linear order because we
	// have to evaluate every node.
	s.source = NewStaticIterator(ctx, nil)

	// Create the quota iterator to determine if placements would cause the
	// quota attached to the job's namespace to be exceeded.
	s.quota = NewQuotaIterator(ctx, s.source)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster to evaluate
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Create the feasibility wrapper, which wraps all feasibility checks so
	// that checking can be skipped if the computed node class has previously
	// been marked as eligible or ineligible. Generally these are checks that
	// only need to examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.wrappedChecks)

	// Upgrade from feasible to rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply bin packing; this depends on the resources needed by a particular
	// task group. Enable eviction as system jobs are high priority.
	s.binPack = NewBinPackIterator(ctx, rankSource, true, 0)

	// Apply score normalization
	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.binPack)
	return s
}
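
// exampleSystemStackUsage is an illustrative sketch, not part of the original
// source, of driving a SystemStack. It assumes the caller supplies the node
// (or nodes) under consideration and the system job; the function name is an
// assumption made for this example.
func exampleSystemStackUsage(ctx Context, nodes []*structs.Node, job *structs.Job, tg *structs.TaskGroup) (*RankedNode, *structs.Resources) {
	stack := NewSystemStack(ctx)

	// Unlike the generic stack, nodes are visited in order and no selection
	// limit is applied, since system jobs must be evaluated against all nodes.
	stack.SetNodes(nodes)
	stack.SetJob(job)
	return stack.Select(tg, &SelectOptions{})
}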

func (s *SystemStack) SetNodes(baseNodes []*structs.Node) {
	// Update the set of base nodes
	s.source.SetNodes(baseNodes)
}

func (s *SystemStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetPriority(job.Priority)
	s.ctx.Eligibility().SetJob(job)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) {
	// Reset the binpack selector and context
	s.scoreNorm.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.binPack.SetTaskGroup(tg)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Get the next option that satisfies the constraints.
	option := s.scoreNorm.Next()

	// Ensure that the task resources were specified
	if option != nil && len(option.TaskResources) != len(tg.Tasks) {
		for _, task := range tg.Tasks {
			option.SetTaskResources(task, task.Resources)
		}
	}

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option, tgConstr.size
}