github.com/rohankumardubey/nomad@v0.11.8/scheduler/stack.go

package scheduler

import (
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// skipScoreThreshold is a threshold used in the limit iterator to skip nodes
	// that have a score lower than this. -1 is the lowest possible score for a
	// node with penalties (based on job anti-affinity and node rescheduling
	// penalties).
	skipScoreThreshold = 0.0

	// maxSkip limits the number of nodes that can be skipped in the limit iterator
	maxSkip = 3
)

// Stack is a chained collection of iterators. The stack is used to
// make placement decisions. Different schedulers may customize the
// stack they use to vary the way placements are made.
type Stack interface {
	// SetNodes is used to set the base set of potential nodes
	SetNodes([]*structs.Node)

	// SetJob is used to set the job for selection
	SetJob(job *structs.Job)

	// Select is used to select a node for the task group
	Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode
}

type SelectOptions struct {
	PenaltyNodeIDs map[string]struct{}
	PreferredNodes []*structs.Node
	Preempt        bool
}
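
// Usage sketch (illustrative only, not part of the original file): a
// scheduler drives a Stack by seeding it with nodes and a job, then calling
// Select once per task group; Select pulls a single RankedNode through the
// whole iterator chain. The ctx, nodes, job, and tg values below are assumed
// to already exist.
//
//	stack := NewGenericStack(false, ctx) // batch=false for service-style jobs
//	stack.SetNodes(nodes)
//	stack.SetJob(job)
//	option := stack.Select(tg, &SelectOptions{})
//	if option == nil {
//		// no feasible node satisfied the constraints
//	}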

// GenericStack is the Stack used for the Generic scheduler. It is
// designed to make better placement decisions at the cost of performance.
type GenericStack struct {
	batch  bool
	ctx    Context
	source *StaticIterator

	wrappedChecks        *FeasibilityWrapper
	quota                FeasibleIterator
	jobConstraint        *ConstraintChecker
	taskGroupDrivers     *DriverChecker
	taskGroupConstraint  *ConstraintChecker
	taskGroupDevices     *DeviceChecker
	taskGroupHostVolumes *HostVolumeChecker
	taskGroupCSIVolumes  *CSIVolumeChecker

	distinctHostsConstraint    *DistinctHostsIterator
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	jobAntiAff                 *JobAntiAffinityIterator
	nodeReschedulingPenalty    *NodeReschedulingPenaltyIterator
	limit                      *LimitIterator
	maxScore                   *MaxScoreIterator
	nodeAffinity               *NodeAffinityIterator
	spread                     *SpreadIterator
	scoreNorm                  *ScoreNormalizationIterator
}

func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
	// Shuffle base nodes
	shuffleNodes(baseNodes)

	// Update the set of base nodes
	s.source.SetNodes(baseNodes)

	// Apply a limit function. This is to avoid scanning *every* possible node.
	// For batch jobs we only need to evaluate 2 options and depend on the
	// power of two choices. For service jobs we need to visit "enough".
	// Using a log of the total number of nodes is a good restriction, with
	// at least 2 as the floor.
	limit := 2
	if n := len(baseNodes); !s.batch && n > 0 {
		logLimit := int(math.Ceil(math.Log2(float64(n))))
		if logLimit > limit {
			limit = logLimit
		}
	}
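	// Worked example (illustration only): for a service job over 1,000
	// candidate nodes, ceil(log2(1000)) = 10, so the limit iterator yields at
	// most 10 ranked options; with 3 nodes, ceil(log2(3)) = 2 and the floor
	// of 2 applies.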
	s.limit.SetLimit(limit)
}

func (s *GenericStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctHostsConstraint.SetJob(job)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetJob(job)
	s.jobAntiAff.SetJob(job)
	s.nodeAffinity.SetJob(job)
	s.spread.SetJob(job)
	s.ctx.Eligibility().SetJob(job)
	s.taskGroupCSIVolumes.SetNamespace(job.Namespace)
	s.taskGroupCSIVolumes.SetJobID(job.ID)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode {

	// If the options specify preferred nodes, try selecting from those first,
	// restoring the original node set before returning or falling back.
	if options != nil && len(options.PreferredNodes) > 0 {
		originalNodes := s.source.nodes
		s.source.SetNodes(options.PreferredNodes)
		optionsNew := *options
		optionsNew.PreferredNodes = nil
		if option := s.Select(tg, &optionsNew); option != nil {
			s.source.SetNodes(originalNodes)
			return option
		}
		s.source.SetNodes(originalNodes)
		return s.Select(tg, &optionsNew)
	}

	// Reset the max selector and context
	s.maxScore.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.taskGroupDevices.SetTaskGroup(tg)
	s.taskGroupHostVolumes.SetVolumes(tg.Volumes)
	s.taskGroupCSIVolumes.SetVolumes(tg.Volumes)
	s.distinctHostsConstraint.SetTaskGroup(tg)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.binPack.SetTaskGroup(tg)
	if options != nil {
		s.binPack.evict = options.Preempt
	}
	s.jobAntiAff.SetTaskGroup(tg)
	if options != nil {
		s.nodeReschedulingPenalty.SetPenaltyNodes(options.PenaltyNodeIDs)
	}
	s.nodeAffinity.SetTaskGroup(tg)
	s.spread.SetTaskGroup(tg)

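	// Affinities and spreads are soft, score-based preferences rather than
	// hard constraints, so scoring cannot safely stop early; effectively
	// disable the limit so every feasible option is considered.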
	if s.nodeAffinity.hasAffinities() || s.spread.hasSpreads() {
		s.limit.SetLimit(math.MaxInt32)
	}

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Find the node with the max score
	option := s.maxScore.Next()

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option
}

// SystemStack is the Stack used for the System scheduler. It is designed to
// attempt to make placements on all nodes.
type SystemStack struct {
	ctx    Context
	source *StaticIterator

	wrappedChecks        *FeasibilityWrapper
	quota                FeasibleIterator
	jobConstraint        *ConstraintChecker
	taskGroupDrivers     *DriverChecker
	taskGroupConstraint  *ConstraintChecker
	taskGroupDevices     *DeviceChecker
	taskGroupHostVolumes *HostVolumeChecker
	taskGroupCSIVolumes  *CSIVolumeChecker

	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	scoreNorm                  *ScoreNormalizationIterator
}

// NewSystemStack constructs a stack used for selecting system job placements.
func NewSystemStack(ctx Context) *SystemStack {
	// Create a new stack
	s := &SystemStack{ctx: ctx}

	// Create the source iterator. We visit nodes in a linear order because we
	// have to evaluate on all nodes.
	s.source = NewStaticIterator(ctx, nil)

	// Create the quota iterator to determine if placements would cause the
	// quota attached to the job's namespace to be exceeded.
	s.quota = NewQuotaIterator(ctx, s.source)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group host volumes
	s.taskGroupHostVolumes = NewHostVolumeChecker(ctx)

	// Filter on available, healthy CSI plugins
	s.taskGroupCSIVolumes = NewCSIVolumeChecker(ctx)

	// Filter on task group devices
	s.taskGroupDevices = NewDeviceChecker(ctx)

	// Create the feasibility wrapper, which allows feasibility checking to be
	// skipped if the computed node class has previously been marked as
	// eligible or ineligible. Generally these are checks that only need to
	// examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint,
		s.taskGroupHostVolumes,
		s.taskGroupDevices}
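	// The CSI volume checker is grouped separately below, presumably because
	// plugin and volume health is dynamic per-node state whose result should
	// not be reused via the computed node class cache.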
	avail := []FeasibilityChecker{s.taskGroupCSIVolumes}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs, avail)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.wrappedChecks)

	// Upgrade from feasible to rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply the bin packing; this depends on the resources needed by a
	// particular task group. Enable eviction as system jobs are high priority.
	_, schedConfig, _ := s.ctx.State().SchedulerConfig()
	schedulerAlgorithm := schedConfig.EffectiveSchedulerAlgorithm()
	enablePreemption := true
	if schedConfig != nil {
		enablePreemption = schedConfig.PreemptionConfig.SystemSchedulerEnabled
	}

	s.binPack = NewBinPackIterator(ctx, rankSource, enablePreemption, 0, schedulerAlgorithm)

	// Apply score normalization
	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.binPack)
	return s
}
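
// Usage sketch (illustrative only, not part of the original file): the system
// scheduler evaluates candidate nodes one at a time, so a SystemStack is
// driven roughly like this, assuming ctx, job, tg, and nodes already exist.
//
//	stack := NewSystemStack(ctx)
//	stack.SetJob(job)
//	for _, node := range nodes {
//		stack.SetNodes([]*structs.Node{node})
//		if option := stack.Select(tg, &SelectOptions{}); option != nil {
//			// tg can be placed on node
//		}
//	}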

func (s *SystemStack) SetNodes(baseNodes []*structs.Node) {
	// Update the set of base nodes
	s.source.SetNodes(baseNodes)
}

func (s *SystemStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetJob(job)
	s.ctx.Eligibility().SetJob(job)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode {
	// Reset the binpack selector and context
	s.scoreNorm.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.taskGroupDevices.SetTaskGroup(tg)
	s.taskGroupHostVolumes.SetVolumes(tg.Volumes)
	s.taskGroupCSIVolumes.SetVolumes(tg.Volumes)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.binPack.SetTaskGroup(tg)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Get the next option that satisfies the constraints.
	option := s.scoreNorm.Next()

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option
}