github.com/smithx10/nomad@v0.9.1-rc1/scheduler/stack.go

     1  package scheduler
     2  
     3  import (
     4  	"math"
     5  	"time"
     6  
     7  	"github.com/hashicorp/nomad/nomad/structs"
     8  )
     9  
    10  const (
    11  	// skipScoreThreshold is a threshold used in the limit iterator to skip nodes
    12  	// that have a score lower than this. -1 is the lowest possible score for a
    13  	// node with penalties (based on job anti-affinity and node rescheduling penalties).
    14  	skipScoreThreshold = 0.0
    15  
    16  	// maxSkip limits the number of nodes that can be skipped in the limit iterator
    17  	maxSkip = 3
    18  )
    19  
    20  // Stack is a chained collection of iterators. The stack is used to
    21  // make placement decisions. Different schedulers may customize the
    22  // stack they use to vary the way placements are made.
    23  type Stack interface {
    24  	// SetNodes is used to set the base set of potential nodes
    25  	SetNodes([]*structs.Node)
    26  
    27  	// SetJob is used to set the job for selection
    28  	SetJob(job *structs.Job)
    29  
    30  	// Select is used to select a node for the task group
    31  	Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode
    32  }
    33  
    34  type SelectOptions struct {
    35  	PenaltyNodeIDs map[string]struct{}
    36  	PreferredNodes []*structs.Node
    37  }
    38  
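// selectOnce is a hypothetical helper (a usage sketch, not part of the
// scheduler) showing how a Stack is typically driven: the candidate nodes
// and the job are set first, then Select is called once per task group.
func selectOnce(stack Stack, nodes []*structs.Node, job *structs.Job, tg *structs.TaskGroup) *RankedNode {
	stack.SetNodes(nodes)
	stack.SetJob(job)
	return stack.Select(tg, &SelectOptions{})
}
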
    39  // GenericStack is the Stack used for the Generic scheduler. It is
    40  // designed to make better placement decisions at the cost of performance.
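// The iterators built by NewGenericStack are chained in this order: a
// randomized source feeds the quota and feasibility checks (job constraints,
// drivers, task group constraints, devices), followed by the distinct-hosts
// and distinct-property constraints; surviving nodes are scored by bin
// packing, job anti-affinity, rescheduling penalties, node affinities and
// spreads, normalized, and finally narrowed by the limit and max-score
// iterators.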
    41  type GenericStack struct {
    42  	batch  bool
    43  	ctx    Context
    44  	source *StaticIterator
    45  
    46  	wrappedChecks       *FeasibilityWrapper
    47  	quota               FeasibleIterator
    48  	jobConstraint       *ConstraintChecker
    49  	taskGroupDrivers    *DriverChecker
    50  	taskGroupConstraint *ConstraintChecker
    51  	taskGroupDevices    *DeviceChecker
    52  
    53  	distinctHostsConstraint    *DistinctHostsIterator
    54  	distinctPropertyConstraint *DistinctPropertyIterator
    55  	binPack                    *BinPackIterator
    56  	jobAntiAff                 *JobAntiAffinityIterator
    57  	nodeReschedulingPenalty    *NodeReschedulingPenaltyIterator
    58  	limit                      *LimitIterator
    59  	maxScore                   *MaxScoreIterator
    60  	nodeAffinity               *NodeAffinityIterator
    61  	spread                     *SpreadIterator
    62  	scoreNorm                  *ScoreNormalizationIterator
    63  }
    64  
    65  // NewGenericStack constructs a stack used for selecting service and batch job placements
    66  func NewGenericStack(batch bool, ctx Context) *GenericStack {
    67  	// Create a new stack
    68  	s := &GenericStack{
    69  		batch: batch,
    70  		ctx:   ctx,
    71  	}
    72  
    73  	// Create the source iterator. We randomize the order we visit nodes
    74  	// to reduce collisions between schedulers and to provide basic load
    75  	// balancing across eligible nodes.
    76  	s.source = NewRandomIterator(ctx, nil)
    77  
    78  	// Create the quota iterator to determine if placements would cause the
    79  	// quota attached to the job's namespace to be exceeded.
    80  	s.quota = NewQuotaIterator(ctx, s.source)
    81  
    82  	// Attach the job constraints. The job is filled in later.
    83  	s.jobConstraint = NewConstraintChecker(ctx, nil)
    84  
    85  	// Filter on task group drivers first, as this check is faster
    86  	s.taskGroupDrivers = NewDriverChecker(ctx, nil)
    87  
    88  	// Filter on task group constraints second
    89  	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)
    90  
    91  	// Filter on task group devices
    92  	s.taskGroupDevices = NewDeviceChecker(ctx)
    93  
    94  	// Create the feasibility wrapper, which wraps all feasibility checks so
    95  	// that feasibility checking can be skipped if the computed node class has
    96  	// previously been marked as eligible or ineligible. Generally these are
    97  	// checks that only need to examine a single node to determine feasibility.
    98  	jobs := []FeasibilityChecker{s.jobConstraint}
    99  	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint, s.taskGroupDevices}
   100  	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)
   101  
   102  	// Filter on distinct host constraints.
   103  	s.distinctHostsConstraint = NewDistinctHostsIterator(ctx, s.wrappedChecks)
   104  
   105  	// Filter on distinct property constraints.
   106  	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.distinctHostsConstraint)
   107  
   108  	// Upgrade from feasible to rank iterator
   109  	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)
   110  
   111  	// Apply the bin packing; this depends on the resources needed
   112  	// by a particular task group.
   113  
   114  	s.binPack = NewBinPackIterator(ctx, rankSource, false, 0)
   115  
   116  	// Apply the job anti-affinity iterator. This is to avoid placing
   117  	// multiple allocations on the same node for this job.
   118  	s.jobAntiAff = NewJobAntiAffinityIterator(ctx, s.binPack, "")
   119  
   120  	s.nodeReschedulingPenalty = NewNodeReschedulingPenaltyIterator(ctx, s.jobAntiAff)
   121  
   122  	s.nodeAffinity = NewNodeAffinityIterator(ctx, s.nodeReschedulingPenalty)
   123  
   124  	s.spread = NewSpreadIterator(ctx, s.nodeAffinity)
   125  
   126  	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.spread)
   127  
   128  	// Apply a limit function. This is to avoid scanning *every* possible node.
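	// The initial limit of 2 is raised later in SetNodes based on the number
	// of eligible nodes.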
   129  	s.limit = NewLimitIterator(ctx, s.scoreNorm, 2, skipScoreThreshold, maxSkip)
   130  
   131  	// Select the node with the maximum score for placement
   132  	s.maxScore = NewMaxScoreIterator(ctx, s.limit)
   133  	return s
   134  }
   135  
   136  func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
   137  	// Shuffle base nodes
   138  	shuffleNodes(baseNodes)
   139  
   140  	// Update the set of base nodes
   141  	s.source.SetNodes(baseNodes)
   142  
   143  	// Apply a limit function. This is to avoid scanning *every* possible node.
   144  	// For batch jobs we only need to evaluate 2 options and depend on the
   145  	// power of two choices. For service jobs we need to visit "enough".
   146  	// Using a log of the total number of nodes is a good restriction, with
   147  	// at least 2 as the floor.
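	// For example, with 100 eligible nodes ceil(log2(100)) = 7, so up to seven
	// options are considered, while a 3-node cluster keeps the floor of 2.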
   148  	limit := 2
   149  	if n := len(baseNodes); !s.batch && n > 0 {
   150  		logLimit := int(math.Ceil(math.Log2(float64(n))))
   151  		if logLimit > limit {
   152  			limit = logLimit
   153  		}
   154  	}
   155  	s.limit.SetLimit(limit)
   156  }
   157  
   158  func (s *GenericStack) SetJob(job *structs.Job) {
   159  	s.jobConstraint.SetConstraints(job.Constraints)
   160  	s.distinctHostsConstraint.SetJob(job)
   161  	s.distinctPropertyConstraint.SetJob(job)
   162  	s.binPack.SetPriority(job.Priority)
   163  	s.jobAntiAff.SetJob(job)
   164  	s.nodeAffinity.SetJob(job)
   165  	s.spread.SetJob(job)
   166  	s.ctx.Eligibility().SetJob(job)
   167  
   168  	if contextual, ok := s.quota.(ContextualIterator); ok {
   169  		contextual.SetJob(job)
   170  	}
   171  }
   172  
   173  func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode {
   174  
   175  	// If the options specify preferred nodes, try to select from them first.
   176  	// The source is reset to the original node set before returning.
   177  	if options != nil && len(options.PreferredNodes) > 0 {
   178  		originalNodes := s.source.nodes
   179  		s.source.SetNodes(options.PreferredNodes)
   180  		optionsNew := *options
   181  		optionsNew.PreferredNodes = nil
   182  		if option := s.Select(tg, &optionsNew); option != nil {
   183  			s.source.SetNodes(originalNodes)
   184  			return option
   185  		}
   186  		s.source.SetNodes(originalNodes)
   187  		return s.Select(tg, &optionsNew)
   188  	}
   189  
   190  	// Reset the max selector and context
   191  	s.maxScore.Reset()
   192  	s.ctx.Reset()
   193  	start := time.Now()
   194  
   195  	// Get the task group's constraints.
   196  	tgConstr := taskGroupConstraints(tg)
   197  
   198  	// Update the parameters of iterators
   199  	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
   200  	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
   201  	s.taskGroupDevices.SetTaskGroup(tg)
   202  	s.distinctHostsConstraint.SetTaskGroup(tg)
   203  	s.distinctPropertyConstraint.SetTaskGroup(tg)
   204  	s.wrappedChecks.SetTaskGroup(tg.Name)
   205  	s.binPack.SetTaskGroup(tg)
   206  	s.jobAntiAff.SetTaskGroup(tg)
   207  	if options != nil {
   208  		s.nodeReschedulingPenalty.SetPenaltyNodes(options.PenaltyNodeIDs)
   209  	}
   210  	s.nodeAffinity.SetTaskGroup(tg)
   211  	s.spread.SetTaskGroup(tg)
   212  
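	// When the job uses affinities or spreads, lift the node-visit limit so
	// that every feasible node is scored before one is chosen.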
   213  	if s.nodeAffinity.hasAffinities() || s.spread.hasSpreads() {
   214  		s.limit.SetLimit(math.MaxInt32)
   215  	}
   216  
   217  	if contextual, ok := s.quota.(ContextualIterator); ok {
   218  		contextual.SetTaskGroup(tg)
   219  	}
   220  
   221  	// Find the node with the max score
   222  	option := s.maxScore.Next()
   223  
   224  	// Store the compute time
   225  	s.ctx.Metrics().AllocationTime = time.Since(start)
   226  	return option
   227  }
   228  
   229  // SystemStack is the Stack used for the System scheduler. It is designed to
   230  // attempt to make placements on all nodes.
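// The chain built by NewSystemStack is shorter than the generic one: a static
// source feeds the quota, feasibility, and distinct-property checks, and the
// surviving nodes are scored only by bin packing (optionally with preemption)
// and score normalization.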
   231  type SystemStack struct {
   232  	ctx    Context
   233  	source *StaticIterator
   234  
   235  	wrappedChecks       *FeasibilityWrapper
   236  	quota               FeasibleIterator
   237  	jobConstraint       *ConstraintChecker
   238  	taskGroupDrivers    *DriverChecker
   239  	taskGroupConstraint *ConstraintChecker
   240  	taskGroupDevices    *DeviceChecker
   241  
   242  	distinctPropertyConstraint *DistinctPropertyIterator
   243  	binPack                    *BinPackIterator
   244  	scoreNorm                  *ScoreNormalizationIterator
   245  }
   246  
   247  // NewSystemStack constructs a stack used for selecting system job placements
   248  func NewSystemStack(ctx Context) *SystemStack {
   249  	// Create a new stack
   250  	s := &SystemStack{ctx: ctx}
   251  
   252  	// Create the source iterator. We visit nodes in a linear order because we
   253  	// have to evaluate every node.
   254  	s.source = NewStaticIterator(ctx, nil)
   255  
   256  	// Create the quota iterator to determine if placements would cause the
   257  	// quota attached to the job's namespace to be exceeded.
   258  	s.quota = NewQuotaIterator(ctx, s.source)
   259  
   260  	// Attach the job constraints. The job is filled in later.
   261  	s.jobConstraint = NewConstraintChecker(ctx, nil)
   262  
   263  	// Filter on task group drivers first, as this check is faster
   264  	s.taskGroupDrivers = NewDriverChecker(ctx, nil)
   265  
   266  	// Filter on task group constraints second
   267  	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)
   268  
   269  	// Filter on task group devices
   270  	s.taskGroupDevices = NewDeviceChecker(ctx)
   271  
   272  	// Create the feasibility wrapper, which wraps all feasibility checks so
   273  	// that feasibility checking can be skipped if the computed node class has
   274  	// previously been marked as eligible or ineligible. Generally these are
   275  	// checks that only need to examine a single node to determine feasibility.
   276  	jobs := []FeasibilityChecker{s.jobConstraint}
   277  	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint, s.taskGroupDevices}
   278  	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)
   279  
   280  	// Filter on distinct property constraints.
   281  	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.wrappedChecks)
   282  
   283  	// Upgrade from feasible to rank iterator
   284  	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)
   285  
   286  	// Apply the bin packing; this depends on the resources needed
   287  	// by a particular task group. Preemption is enabled because system jobs
   288  	// are high priority, unless the scheduler configuration disables it.
   289  	_, schedConfig, _ := s.ctx.State().SchedulerConfig()
   290  	enablePreemption := true
   291  	if schedConfig != nil {
   292  		enablePreemption = schedConfig.PreemptionConfig.SystemSchedulerEnabled
   293  	}
   294  	s.binPack = NewBinPackIterator(ctx, rankSource, enablePreemption, 0)
   295  
   296  	// Apply score normalization
   297  	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.binPack)
   298  	return s
   299  }
   300  
   301  func (s *SystemStack) SetNodes(baseNodes []*structs.Node) {
   302  	// Update the set of base nodes
   303  	s.source.SetNodes(baseNodes)
   304  }
   305  
   306  func (s *SystemStack) SetJob(job *structs.Job) {
   307  	s.jobConstraint.SetConstraints(job.Constraints)
   308  	s.distinctPropertyConstraint.SetJob(job)
   309  	s.binPack.SetPriority(job.Priority)
   310  	s.ctx.Eligibility().SetJob(job)
   311  
   312  	if contextual, ok := s.quota.(ContextualIterator); ok {
   313  		contextual.SetJob(job)
   314  	}
   315  }
   316  
   317  func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode {
   318  	// Reset the score normalization iterator and the context
   319  	s.scoreNorm.Reset()
   320  	s.ctx.Reset()
   321  	start := time.Now()
   322  
   323  	// Get the task group's constraints.
   324  	tgConstr := taskGroupConstraints(tg)
   325  
   326  	// Update the parameters of iterators
   327  	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
   328  	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
   329  	s.taskGroupDevices.SetTaskGroup(tg)
   330  	s.wrappedChecks.SetTaskGroup(tg.Name)
   331  	s.distinctPropertyConstraint.SetTaskGroup(tg)
   332  	s.binPack.SetTaskGroup(tg)
   333  
   334  	if contextual, ok := s.quota.(ContextualIterator); ok {
   335  		contextual.SetTaskGroup(tg)
   336  	}
   337  
   338  	// Get the next option that satisfies the constraints.
   339  	option := s.scoreNorm.Next()
   340  
   341  	// Store the compute time
   342  	s.ctx.Metrics().AllocationTime = time.Since(start)
   343  	return option
   344  }