github.com/smithx10/nomad@v0.9.1-rc1/scheduler/stack.go

package scheduler

import (
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// skipScoreThreshold is a threshold used in the limit iterator to skip
	// nodes that have a score lower than this. -1 is the lowest possible
	// score for a node with penalties (based on job anti-affinity and node
	// rescheduling penalties).
	skipScoreThreshold = 0.0

	// maxSkip limits the number of nodes that can be skipped in the limit
	// iterator
	maxSkip = 3
)

// Stack is a chained collection of iterators. The stack is used to
// make placement decisions. Different schedulers may customize the
// stack they use to vary the way placements are made.
type Stack interface {
	// SetNodes is used to set the base set of potential nodes
	SetNodes([]*structs.Node)

	// SetJob is used to set the job for selection
	SetJob(job *structs.Job)

	// Select is used to select a node for the task group
	Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode
}

type SelectOptions struct {
	PenaltyNodeIDs map[string]struct{}
	PreferredNodes []*structs.Node
}

// GenericStack is the Stack used for the Generic scheduler. It is
// designed to make better placement decisions at the cost of performance.
type GenericStack struct {
	batch  bool
	ctx    Context
	source *StaticIterator

	wrappedChecks       *FeasibilityWrapper
	quota               FeasibleIterator
	jobConstraint       *ConstraintChecker
	taskGroupDrivers    *DriverChecker
	taskGroupConstraint *ConstraintChecker
	taskGroupDevices    *DeviceChecker

	distinctHostsConstraint    *DistinctHostsIterator
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	jobAntiAff                 *JobAntiAffinityIterator
	nodeReschedulingPenalty    *NodeReschedulingPenaltyIterator
	limit                      *LimitIterator
	maxScore                   *MaxScoreIterator
	nodeAffinity               *NodeAffinityIterator
	spread                     *SpreadIterator
	scoreNorm                  *ScoreNormalizationIterator
}

// NewGenericStack constructs a stack used for selecting service placements
func NewGenericStack(batch bool, ctx Context) *GenericStack {
	// Create a new stack
	s := &GenericStack{
		batch: batch,
		ctx:   ctx,
	}

	// Create the source iterator. We randomize the order we visit nodes
	// to reduce collisions between schedulers and to do basic load
	// balancing across eligible nodes.
	s.source = NewRandomIterator(ctx, nil)

	// Create the quota iterator to determine if placements would cause the
	// quota attached to the namespace of the job to be exceeded.
	s.quota = NewQuotaIterator(ctx, s.source)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group devices
	s.taskGroupDevices = NewDeviceChecker(ctx)

	// Create the feasibility wrapper, which wraps all feasibility checks so
	// that checking can be skipped if the computed node class has previously
	// been marked as eligible or ineligible. Generally these are checks that
	// only need to examine a single node to determine feasibility.
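	// The wrapper distinguishes job-level checks, evaluated once per job,
	// from task-group-level checks, which are re-parameterized per task
	// group in Select below, so that eligibility can be cached per
	// computed node class at the right granularity.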
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint, s.taskGroupDevices}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)

	// Filter on distinct host constraints.
	s.distinctHostsConstraint = NewDistinctHostsIterator(ctx, s.wrappedChecks)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.distinctHostsConstraint)

	// Upgrade from feasible to rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply the bin packing; this depends on the resources needed
	// by a particular task group.
	s.binPack = NewBinPackIterator(ctx, rankSource, false, 0)

	// Apply the job anti-affinity iterator. This is to avoid placing
	// multiple allocations on the same node for this job.
	s.jobAntiAff = NewJobAntiAffinityIterator(ctx, s.binPack, "")

	s.nodeReschedulingPenalty = NewNodeReschedulingPenaltyIterator(ctx, s.jobAntiAff)

	s.nodeAffinity = NewNodeAffinityIterator(ctx, s.nodeReschedulingPenalty)

	s.spread = NewSpreadIterator(ctx, s.nodeAffinity)

	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.spread)

	// Apply a limit function. This is to avoid scanning *every* possible node.
	s.limit = NewLimitIterator(ctx, s.scoreNorm, 2, skipScoreThreshold, maxSkip)

	// Select the node with the maximum score for placement
	s.maxScore = NewMaxScoreIterator(ctx, s.limit)
	return s
}

func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
	// Shuffle base nodes
	shuffleNodes(baseNodes)

	// Update the set of base nodes
	s.source.SetNodes(baseNodes)

	// Apply a limit function. This is to avoid scanning *every* possible node.
	// For batch jobs we only need to evaluate 2 options and depend on the
	// power of two choices. For service jobs we need to visit "enough".
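	// ("Power of two choices" refers to the load-balancing result that
	// picking the better of two random candidates is nearly as good as
	// examining every option, which is why two suffice for batch work.)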
	// Using a log of the total number of nodes is a good restriction, with
	// at least 2 as the floor (for example, 100 eligible nodes yields a
	// limit of ceil(log2(100)) = 7).
	limit := 2
	if n := len(baseNodes); !s.batch && n > 0 {
		logLimit := int(math.Ceil(math.Log2(float64(n))))
		if logLimit > limit {
			limit = logLimit
		}
	}
	s.limit.SetLimit(limit)
}

func (s *GenericStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctHostsConstraint.SetJob(job)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetPriority(job.Priority)
	s.jobAntiAff.SetJob(job)
	s.nodeAffinity.SetJob(job)
	s.spread.SetJob(job)
	s.ctx.Eligibility().SetJob(job)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode {
	// This block tries to select from the preferred nodes if the options
	// specify any, restoring the original node set afterwards.
	if options != nil && len(options.PreferredNodes) > 0 {
		originalNodes := s.source.nodes
		s.source.SetNodes(options.PreferredNodes)
		optionsNew := *options
		optionsNew.PreferredNodes = nil
		if option := s.Select(tg, &optionsNew); option != nil {
			s.source.SetNodes(originalNodes)
			return option
		}
		s.source.SetNodes(originalNodes)
		return s.Select(tg, &optionsNew)
	}

	// Reset the max selector and context
	s.maxScore.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.taskGroupDevices.SetTaskGroup(tg)
	s.distinctHostsConstraint.SetTaskGroup(tg)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.binPack.SetTaskGroup(tg)
	s.jobAntiAff.SetTaskGroup(tg)
	if options != nil {
		s.nodeReschedulingPenalty.SetPenaltyNodes(options.PenaltyNodeIDs)
	}
	s.nodeAffinity.SetTaskGroup(tg)
	s.spread.SetTaskGroup(tg)

	// Affinities and spreads must score every feasible node, so lift the
	// limit when either is present.
	if s.nodeAffinity.hasAffinities() || s.spread.hasSpreads() {
		s.limit.SetLimit(math.MaxInt32)
	}

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Find the node with the max score
	option := s.maxScore.Next()

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option
}

// SystemStack is the Stack used for the System scheduler. It is designed to
// attempt to make placements on all nodes.
type SystemStack struct {
	ctx    Context
	source *StaticIterator

	wrappedChecks       *FeasibilityWrapper
	quota               FeasibleIterator
	jobConstraint       *ConstraintChecker
	taskGroupDrivers    *DriverChecker
	taskGroupConstraint *ConstraintChecker
	taskGroupDevices    *DeviceChecker

	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	scoreNorm                  *ScoreNormalizationIterator
}

// NewSystemStack constructs a stack used for selecting system job placements
func NewSystemStack(ctx Context) *SystemStack {
	// Create a new stack
	s := &SystemStack{ctx: ctx}

	// Create the source iterator. We visit nodes in a linear order because we
	// have to evaluate on all nodes.
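	// (Contrast with GenericStack, which shuffles the node set and applies
	// a limit; a system job must be considered against every eligible
	// node, so neither shuffling nor limiting is applied here.)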
	s.source = NewStaticIterator(ctx, nil)

	// Create the quota iterator to determine if placements would cause the
	// quota attached to the namespace of the job to be exceeded.
	s.quota = NewQuotaIterator(ctx, s.source)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group devices
	s.taskGroupDevices = NewDeviceChecker(ctx)

	// Create the feasibility wrapper, which wraps all feasibility checks so
	// that checking can be skipped if the computed node class has previously
	// been marked as eligible or ineligible. Generally these are checks that
	// only need to examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint, s.taskGroupDevices}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.wrappedChecks)

	// Upgrade from feasible to rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply the bin packing; this depends on the resources needed by a
	// particular task group. Enable preemption, unless the scheduler
	// configuration disables it, as system jobs are high priority.
	_, schedConfig, _ := s.ctx.State().SchedulerConfig()
	enablePreemption := true
	if schedConfig != nil {
		enablePreemption = schedConfig.PreemptionConfig.SystemSchedulerEnabled
	}
	s.binPack = NewBinPackIterator(ctx, rankSource, enablePreemption, 0)

	// Apply score normalization
	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.binPack)
	return s
}

func (s *SystemStack) SetNodes(baseNodes []*structs.Node) {
	// Update the set of base nodes
	s.source.SetNodes(baseNodes)
}

func (s *SystemStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetPriority(job.Priority)
	s.ctx.Eligibility().SetJob(job)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *RankedNode {
	// Reset the score normalizer and context
	s.scoreNorm.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.taskGroupDevices.SetTaskGroup(tg)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.binPack.SetTaskGroup(tg)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Get the next option that satisfies the constraints.
	option := s.scoreNorm.Next()

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option
}
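
// The sketch below is not part of the original file; it is a minimal
// illustration of the intended calling pattern for a Stack, assuming the
// caller has already obtained a node set and job from elsewhere: fix the
// nodes and job once, then call Select per task group. The helper name
// selectForGroups is hypothetical.
func selectForGroups(stack Stack, nodes []*structs.Node, job *structs.Job) map[string]*RankedNode {
	stack.SetNodes(nodes)
	stack.SetJob(job)

	picks := make(map[string]*RankedNode, len(job.TaskGroups))
	for _, tg := range job.TaskGroups {
		// Empty options are tolerated by both stacks; penalty nodes and
		// preferred nodes are only consulted when set.
		if option := stack.Select(tg, &SelectOptions{}); option != nil {
			picks[tg.Name] = option
		}
	}
	return picks
}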