github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/scheduler/stack.go

package scheduler

import (
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// skipScoreThreshold is a threshold used in the limit iterator to skip nodes
	// that have a score lower than this. -1 is the lowest possible score for a
	// node with penalties (based on job anti-affinity and node rescheduling penalties).
	skipScoreThreshold = 0.0

	// maxSkip limits the number of nodes that can be skipped in the limit iterator
	maxSkip = 3
)

// Stack is a chained collection of iterators. The stack is used to
// make placement decisions. Different schedulers may customize the
// stack they use to vary the way placements are made.
type Stack interface {
	// SetNodes is used to set the base set of potential nodes
	SetNodes([]*structs.Node)

	// SetJob is used to set the job for selection
	SetJob(job *structs.Job)

	// Select is used to select a node for the task group
	Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources)
}

type SelectOptions struct {
	PenaltyNodeIDs map[string]struct{}
	PreferredNodes []*structs.Node
}

// GenericStack is the Stack used for the Generic scheduler. It is
// designed to make better placement decisions at the cost of performance.
type GenericStack struct {
	batch  bool
	ctx    Context
	source *StaticIterator

	wrappedChecks       *FeasibilityWrapper
	quota               FeasibleIterator
	jobConstraint       *ConstraintChecker
	taskGroupDrivers    *DriverChecker
	taskGroupConstraint *ConstraintChecker

	distinctHostsConstraint    *DistinctHostsIterator
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	jobAntiAff                 *JobAntiAffinityIterator
	nodeReschedulingPenalty    *NodeReschedulingPenaltyIterator
	limit                      *LimitIterator
	maxScore                   *MaxScoreIterator
	nodeAffinity               *NodeAffinityIterator
	spread                     *SpreadIterator
	scoreNorm                  *ScoreNormalizationIterator
}
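
// The sketch below is not part of the original file; it illustrates how a
// scheduler is expected to drive a Stack: the candidate nodes and job are set
// once per evaluation, then Select is called once per task group. The helper
// name and the empty SelectOptions are illustrative assumptions.
func selectForTaskGroup(stack Stack, nodes []*structs.Node, job *structs.Job, tg *structs.TaskGroup) (*RankedNode, *structs.Resources) {
	stack.SetNodes(nodes) // candidate nodes for this evaluation
	stack.SetJob(job)     // job-level constraints and priority
	return stack.Select(tg, &SelectOptions{})
}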

// NewGenericStack constructs a stack used for selecting service placements
func NewGenericStack(batch bool, ctx Context) *GenericStack {
	// Create a new stack
	s := &GenericStack{
		batch: batch,
		ctx:   ctx,
	}

	// Create the source iterator. We randomize the order we visit nodes
	// to reduce collisions between schedulers and to do a basic load
	// balancing across eligible nodes.
	s.source = NewRandomIterator(ctx, nil)

	// Create the quota iterator to determine if placements would cause the
	// quota attached to the job's namespace to be exceeded.
	s.quota = NewQuotaIterator(ctx, s.source)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Create the feasibility wrapper, which wraps all feasibility checks so
	// that checking can be skipped if the computed node class has previously
	// been marked as eligible or ineligible. Generally these are checks that
	// only need to examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)

	// Filter on distinct host constraints.
	s.distinctHostsConstraint = NewDistinctHostsIterator(ctx, s.wrappedChecks)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.distinctHostsConstraint)

	// Upgrade from feasible to rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply the bin packing; this depends on the resources needed
	// by a particular task group. Only enable eviction for the service
	// scheduler as that logic is expensive.
	evict := !batch
	s.binPack = NewBinPackIterator(ctx, rankSource, evict, 0)

	// Apply the job anti-affinity iterator. This is to avoid placing
	// multiple allocations on the same node for this job.
	s.jobAntiAff = NewJobAntiAffinityIterator(ctx, s.binPack, "")

	s.nodeReschedulingPenalty = NewNodeReschedulingPenaltyIterator(ctx, s.jobAntiAff)

	s.nodeAffinity = NewNodeAffinityIterator(ctx, s.nodeReschedulingPenalty)

	s.spread = NewSpreadIterator(ctx, s.nodeAffinity)

	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.spread)

	// Apply a limit function. This is to avoid scanning *every* possible node.
	s.limit = NewLimitIterator(ctx, s.scoreNorm, 2, skipScoreThreshold, maxSkip)

	// Select the node with the maximum score for placement
	s.maxScore = NewMaxScoreIterator(ctx, s.limit)
	return s
}
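
// For reference, the generic stack chains its iterators in the following
// order (outermost consumer last). This summary is derived from the
// constructor above and is not part of the original file:
//
//	source (random) -> quota -> feasibility wrapper (job, then task group)
//	  -> distinct_hosts -> distinct_property -> rank -> bin pack
//	  -> job anti-affinity -> rescheduling penalty -> node affinity
//	  -> spread -> score normalization -> limit -> max score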

func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
	// Shuffle base nodes
	shuffleNodes(baseNodes)

	// Update the set of base nodes
	s.source.SetNodes(baseNodes)

	// Apply a limit function. This is to avoid scanning *every* possible node.
	// For batch jobs we only need to evaluate 2 options and depend on the
	// power of two choices. For service jobs we need to visit "enough".
	// Using the log of the total number of nodes is a good restriction, with
	// at least 2 as the floor.
	limit := 2
	if n := len(baseNodes); !s.batch && n > 0 {
		logLimit := int(math.Ceil(math.Log2(float64(n))))
		if logLimit > limit {
			limit = logLimit
		}
	}
	s.limit.SetLimit(limit)
}

func (s *GenericStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctHostsConstraint.SetJob(job)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetPriority(job.Priority)
	s.jobAntiAff.SetJob(job)
	s.nodeAffinity.SetJob(job)
	s.spread.SetJob(job)
	s.ctx.Eligibility().SetJob(job)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) {

	// If the options specify preferred nodes, try to select from them first;
	// the original node set is restored before returning or falling back.
	if options != nil && len(options.PreferredNodes) > 0 {
		originalNodes := s.source.nodes
		s.source.SetNodes(options.PreferredNodes)
		optionsNew := *options
		optionsNew.PreferredNodes = nil
		if option, resources := s.Select(tg, &optionsNew); option != nil {
			s.source.SetNodes(originalNodes)
			return option, resources
		}
		s.source.SetNodes(originalNodes)
		return s.Select(tg, &optionsNew)
	}

	// Reset the max selector and context
	s.maxScore.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.distinctHostsConstraint.SetTaskGroup(tg)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.binPack.SetTaskGroup(tg)
	s.jobAntiAff.SetTaskGroup(tg)
	if options != nil {
		s.nodeReschedulingPenalty.SetPenaltyNodes(options.PenaltyNodeIDs)
	}
	s.nodeAffinity.SetTaskGroup(tg)
	s.spread.SetTaskGroup(tg)

	if s.nodeAffinity.hasAffinities() || s.spread.hasSpreads() {
		s.limit.SetLimit(math.MaxInt32)
	}

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Find the node with the max score
	option := s.maxScore.Next()

	// Ensure that the task resources were specified
	if option != nil && len(option.TaskResources) != len(tg.Tasks) {
		for _, task := range tg.Tasks {
			option.SetTaskResources(task, task.Resources)
		}
	}

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option, tgConstr.size
}
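
// The sketch below is illustrative and not part of the original file. It
// shows how a caller such as a reschedule path might bias Select: nodes the
// allocation previously ran on are penalized via PenaltyNodeIDs, and any
// preferred nodes (e.g. nodes holding sticky ephemeral disk data) are tried
// first. The helper name and variable names are assumptions.
func selectForReschedule(s *GenericStack, tg *structs.TaskGroup, prevNodeID string, preferred []*structs.Node) (*RankedNode, *structs.Resources) {
	opts := &SelectOptions{
		PenaltyNodeIDs: map[string]struct{}{prevNodeID: {}}, // score penalty for the previous node
		PreferredNodes: preferred,                           // attempted before the full node set
	}
	return s.Select(tg, opts)
}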

// SystemStack is the Stack used for the System scheduler. It is designed to
// attempt to make placements on all nodes.
type SystemStack struct {
	ctx                        Context
	source                     *StaticIterator
	wrappedChecks              *FeasibilityWrapper
	quota                      FeasibleIterator
	jobConstraint              *ConstraintChecker
	taskGroupDrivers           *DriverChecker
	taskGroupConstraint        *ConstraintChecker
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	scoreNorm                  *ScoreNormalizationIterator
}

// NewSystemStack constructs a stack used for selecting system job placements
func NewSystemStack(ctx Context) *SystemStack {
	// Create a new stack
	s := &SystemStack{ctx: ctx}

	// Create the source iterator. We visit nodes in a linear order because we
	// have to evaluate every node.
	s.source = NewStaticIterator(ctx, nil)

	// Create the quota iterator to determine if placements would cause the
	// quota attached to the job's namespace to be exceeded.
	s.quota = NewQuotaIterator(ctx, s.source)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Create the feasibility wrapper, which wraps all feasibility checks so
	// that checking can be skipped if the computed node class has previously
	// been marked as eligible or ineligible. Generally these are checks that
	// only need to examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.wrappedChecks)

	// Upgrade from feasible to rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply the bin packing; this depends on the resources needed
	// by a particular task group. Enable eviction as system jobs are high
	// priority.
	s.binPack = NewBinPackIterator(ctx, rankSource, true, 0)

	// Apply score normalization
	s.scoreNorm = NewScoreNormalizationIterator(ctx, s.binPack)
	return s
}

func (s *SystemStack) SetNodes(baseNodes []*structs.Node) {
	// Update the set of base nodes
	s.source.SetNodes(baseNodes)
}

func (s *SystemStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetPriority(job.Priority)
	s.ctx.Eligibility().SetJob(job)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}
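
// For reference, the system stack's iterator chain (derived from the
// constructor above, not part of the original file) is much shorter than the
// generic stack's: every node must be considered, and no ranking tie-breakers
// beyond bin packing are applied:
//
//	source (static) -> quota -> feasibility wrapper (job, then task group)
//	  -> distinct_property -> rank -> bin pack -> score normalization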

func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) {
	// Reset the score normalization iterator and context
	s.scoreNorm.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.binPack.SetTaskGroup(tg)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Get the next option that satisfies the constraints.
	option := s.scoreNorm.Next()

	// Ensure that the task resources were specified
	if option != nil && len(option.TaskResources) != len(tg.Tasks) {
		for _, task := range tg.Tasks {
			option.SetTaskResources(task, task.Resources)
		}
	}

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option, tgConstr.size
}
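
// The sketch below is illustrative and not part of the original file. It
// shows one way a system-style scheduler could drive this stack: the job is
// set once, then each candidate node is offered to the stack one at a time
// and Select decides whether the task group fits there. The helper name and
// the one-node-per-pass pattern are assumptions for illustration.
func placeSystemTaskGroup(s *SystemStack, job *structs.Job, tg *structs.TaskGroup, nodes []*structs.Node) map[string]*RankedNode {
	s.SetJob(job)
	placements := make(map[string]*RankedNode, len(nodes))
	for _, node := range nodes {
		s.SetNodes([]*structs.Node{node}) // evaluate a single node per pass
		if option, _ := s.Select(tg, nil); option != nil {
			placements[node.ID] = option
		}
	}
	return placements
}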