github.com/rohankumardubey/nomad@v0.11.8/scheduler/stack.go

package scheduler

import (
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// skipScoreThreshold is a threshold used in the limit iterator to skip nodes
	// that have a score lower than this. -1 is the lowest possible score for a
	// node with penalties (based on job anti-affinity and node rescheduling
	// penalties).
	skipScoreThreshold = 0.0

	// maxSkip limits the number of nodes that can be skipped in the limit iterator
	maxSkip = 3
)

// Stack is a chained collection of iterators. The stack is used to
// make placement decisions. Different schedulers may customize the
// stack they use to vary the way placements are made.
type Stack interface {
	// SetNodes is used to set the base set of potential nodes
	SetNodes([]*structs.Node)

	// SetJob is used to set the job for selection
	SetJob(job *structs.Job)

	// Select is used to select a node for the task group
	Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode
}

type SelectOptions struct {
	PenaltyNodeIDs map[string]struct{}
	PreferredNodes []*structs.Node
	Preempt        bool
}

// GenericStack is the Stack used for the Generic scheduler. It is
// designed to make better placement decisions at the cost of performance.
type GenericStack struct {
	batch  bool
	ctx    Context
	source *StaticIterator

	wrappedChecks        *FeasibilityWrapper
	quota                FeasibleIterator
	jobConstraint        *ConstraintChecker
	taskGroupDrivers     *DriverChecker
	taskGroupConstraint  *ConstraintChecker
	taskGroupDevices     *DeviceChecker
	taskGroupHostVolumes *HostVolumeChecker
	taskGroupCSIVolumes  *CSIVolumeChecker

	distinctHostsConstraint    *DistinctHostsIterator
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	jobAntiAff                 *JobAntiAffinityIterator
	nodeReschedulingPenalty    *NodeReschedulingPenaltyIterator
	limit                      *LimitIterator
	maxScore                   *MaxScoreIterator
	nodeAffinity               *NodeAffinityIterator
	spread                     *SpreadIterator
	scoreNorm                  *ScoreNormalizationIterator
}

func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
	// Shuffle base nodes
	shuffleNodes(baseNodes)

	// Update the set of base nodes
	s.source.SetNodes(baseNodes)

	// Apply a limit function. This is to avoid scanning *every* possible node.
	// For batch jobs we only need to evaluate 2 options and depend on the
	// power of two choices. For service jobs we need to visit "enough".
	// Using a log of the total number of nodes is a good restriction, with
	// at least 2 as the floor.
	limit := 2
	if n := len(baseNodes); !s.batch && n > 0 {
		logLimit := int(math.Ceil(math.Log2(float64(n))))
		if logLimit > limit {
			limit = logLimit
		}
	}
	s.limit.SetLimit(limit)
}
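// Editorial sketch, not part of the original file: evalLimitForNodeCount is a
// hypothetical helper that mirrors the limit calculation in SetNodes above to
// make the behavior concrete. Batch jobs always evaluate 2 candidates (the
// power of two choices); service jobs evaluate ceil(log2(n)) candidates with
// a floor of 2, so for example 100 nodes yields a limit of 7 and 5000 nodes a
// limit of 13.
func evalLimitForNodeCount(batch bool, n int) int {
	limit := 2
	if !batch && n > 0 {
		if logLimit := int(math.Ceil(math.Log2(float64(n)))); logLimit > limit {
			limit = logLimit
		}
	}
	return limit
}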
func (s *GenericStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctHostsConstraint.SetJob(job)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetJob(job)
	s.jobAntiAff.SetJob(job)
	s.nodeAffinity.SetJob(job)
	s.spread.SetJob(job)
	s.ctx.Eligibility().SetJob(job)
	s.taskGroupCSIVolumes.SetNamespace(job.Namespace)
	s.taskGroupCSIVolumes.SetJobID(job.ID)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode {
	// This block handles trying to select from preferred nodes if the options
	// specify them. It also restores the original node set afterwards.
	if options != nil && len(options.PreferredNodes) > 0 {
		originalNodes := s.source.nodes
		s.source.SetNodes(options.PreferredNodes)
		optionsNew := *options
		optionsNew.PreferredNodes = nil
		if option := s.Select(tg, &optionsNew); option != nil {
			s.source.SetNodes(originalNodes)
			return option
		}
		s.source.SetNodes(originalNodes)
		return s.Select(tg, &optionsNew)
	}

	// Reset the max selector and context
	s.maxScore.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.taskGroupDevices.SetTaskGroup(tg)
	s.taskGroupHostVolumes.SetVolumes(tg.Volumes)
	s.taskGroupCSIVolumes.SetVolumes(tg.Volumes)
	s.distinctHostsConstraint.SetTaskGroup(tg)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.binPack.SetTaskGroup(tg)
	if options != nil {
		s.binPack.evict = options.Preempt
	}
	s.jobAntiAff.SetTaskGroup(tg)
	if options != nil {
		s.nodeReschedulingPenalty.SetPenaltyNodes(options.PenaltyNodeIDs)
	}
	s.nodeAffinity.SetTaskGroup(tg)
	s.spread.SetTaskGroup(tg)

	// Affinities and spreads need to score every feasible node, so lift the
	// limit set in SetNodes.
	if s.nodeAffinity.hasAffinities() || s.spread.hasSpreads() {
		s.limit.SetLimit(math.MaxInt32)
	}

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Find the node with the max score
	option := s.maxScore.Next()

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option
}
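// Editorial usage sketch, not part of the original file: exampleGenericSelect
// is hypothetical and shows how a caller drives the stack. The candidate
// nodes and job are fixed once, then Select is called per task group. When
// PreferredNodes is set, Select first tries only those nodes and falls back
// to the full node set, as implemented above.
func exampleGenericSelect(stack *GenericStack, nodes []*structs.Node,
	job *structs.Job, preferred []*structs.Node) *RankedNode {

	stack.SetNodes(nodes)
	stack.SetJob(job)
	return stack.Select(job.TaskGroups[0], &SelectOptions{
		PreferredNodes: preferred,
	})
}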
// SystemStack is the Stack used for the System scheduler. It is designed to
// attempt to make placements on all nodes.
type SystemStack struct {
	ctx    Context
	source *StaticIterator

	wrappedChecks        *FeasibilityWrapper
	quota                FeasibleIterator
	jobConstraint        *ConstraintChecker
	taskGroupDrivers     *DriverChecker
	taskGroupConstraint  *ConstraintChecker
	taskGroupDevices     *DeviceChecker
	taskGroupHostVolumes *HostVolumeChecker
	taskGroupCSIVolumes  *CSIVolumeChecker

	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	scoreNorm                  *ScoreNormalizationIterator
}

// NewSystemStack constructs a stack used for selecting system job placements.
func NewSystemStack(ctx Context) *SystemStack {
	// Create a new stack
	s := &SystemStack{ctx: ctx}

	// Create the source iterator. We visit nodes in a linear order because we
	// have to evaluate on all nodes.
	s.source = NewStaticIterator(ctx, nil)

	// Create the quota iterator to determine if placements would cause the
	// quota attached to the job's namespace to be exceeded.
	s.quota = NewQuotaIterator(ctx, s.source)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group host volumes
	s.taskGroupHostVolumes = NewHostVolumeChecker(ctx)

	// Filter on available, healthy CSI plugins
	s.taskGroupCSIVolumes = NewCSIVolumeChecker(ctx)

	// Filter on task group devices
	s.taskGroupDevices = NewDeviceChecker(ctx)

	// Create the feasibility wrapper, which wraps all feasibility checks so
	// that feasibility checking can be skipped if the computed node class has
	// previously been marked as eligible or ineligible. Generally these are
	// checks that only need to examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint,
		s.taskGroupHostVolumes,
		s.taskGroupDevices}
	avail := []FeasibilityChecker{s.taskGroupCSIVolumes}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs, avail)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.wrappedChecks)

	// Upgrade from feasible to rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply the bin packing; this depends on the resources needed by a
	// particular task group. Enable preemption as system jobs are high
	// priority. EffectiveSchedulerAlgorithm tolerates a nil schedConfig.
	_, schedConfig, _ := s.ctx.State().SchedulerConfig()
	schedulerAlgorithm := schedConfig.EffectiveSchedulerAlgorithm()
	enablePreemption := true
	if schedConfig != nil {
		enablePreemption = schedConfig.PreemptionConfig.SystemSchedulerEnabled
	}

	s.binPack = NewBinPackIterator(ctx, rankSource, enablePreemption, 0, schedulerAlgorithm)

	// Apply score normalization
	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.binPack)
	return s
}
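// For reference (editorial note, not part of the original file), the
// constructor above chains the iterators in this order, from node source to
// final score:
//
//	source -> quota -> wrappedChecks{jobConstraint; taskGroupDrivers,
//	taskGroupConstraint, taskGroupHostVolumes, taskGroupDevices;
//	taskGroupCSIVolumes} -> distinctPropertyConstraint ->
//	FeasibleRankIterator -> binPack -> scoreNorm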
func (s *SystemStack) SetNodes(baseNodes []*structs.Node) {
	// Update the set of base nodes
	s.source.SetNodes(baseNodes)
}

func (s *SystemStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetJob(job)
	s.ctx.Eligibility().SetJob(job)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode {
	// Reset the binpack selector and context
	s.scoreNorm.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.taskGroupDevices.SetTaskGroup(tg)
	s.taskGroupHostVolumes.SetVolumes(tg.Volumes)
	s.taskGroupCSIVolumes.SetVolumes(tg.Volumes)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.binPack.SetTaskGroup(tg)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Get the next option that satisfies the constraints.
	option := s.scoreNorm.Next()

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option
}
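// Editorial usage sketch, not part of the original file: exampleSystemSelect
// is hypothetical. Because the system scheduler must consider every node,
// SystemStack has no shuffle, limit, or max-score stage; Select simply
// returns the next feasible, score-normalized option for the node set.
func exampleSystemSelect(ctx Context, nodes []*structs.Node, job *structs.Job) *RankedNode {
	stack := NewSystemStack(ctx)
	stack.SetNodes(nodes)
	stack.SetJob(job)
	return stack.Select(job.TaskGroups[0], &SelectOptions{})
}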