github.com/djenriquez/nomad-1@v0.8.1/scheduler/stack.go

package scheduler

import (
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// serviceJobAntiAffinityPenalty is the penalty applied
	// to the score for placing an alloc on a node that
	// already has an alloc for this job.
	serviceJobAntiAffinityPenalty = 20.0

	// batchJobAntiAffinityPenalty is the same as the
	// serviceJobAntiAffinityPenalty but for batch type jobs.
	batchJobAntiAffinityPenalty = 10.0

	// previousFailedAllocNodePenalty is a scoring penalty for nodes
	// on which a failed allocation was previously run.
	previousFailedAllocNodePenalty = 50.0

	// skipScoreThreshold is a threshold used in the limit iterator to skip
	// nodes that have a score lower than this. -10 is the highest possible
	// score for a penalized node (based on batchJobAntiAffinityPenalty).
	skipScoreThreshold = -10.0

	// maxSkip limits the number of nodes that can be skipped in the limit
	// iterator.
	maxSkip = 3
)

// Stack is a chained collection of iterators. The stack is used to
// make placement decisions. Different schedulers may customize the
// stack they use to vary the way placements are made.
type Stack interface {
	// SetNodes is used to set the base set of potential nodes
	SetNodes([]*structs.Node)

	// SetJob is used to set the job for selection
	SetJob(job *structs.Job)

	// Select is used to select a node for the task group
	Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources)
}

// SelectOptions tunes a single Select invocation: nodes in PenaltyNodeIDs
// are scored down, and PreferredNodes are attempted before the full node set.
type SelectOptions struct {
	PenaltyNodeIDs map[string]struct{}
	PreferredNodes []*structs.Node
}

// GenericStack is the Stack used for the Generic scheduler. It is
// designed to make better placement decisions at the cost of performance.
type GenericStack struct {
	batch  bool
	ctx    Context
	source *StaticIterator

	wrappedChecks       *FeasibilityWrapper
	quota               FeasibleIterator
	jobConstraint       *ConstraintChecker
	taskGroupDrivers    *DriverChecker
	taskGroupConstraint *ConstraintChecker

	distinctHostsConstraint    *DistinctHostsIterator
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	jobAntiAff                 *JobAntiAffinityIterator
	nodeAntiAff                *NodeAntiAffinityIterator
	limit                      *LimitIterator
	maxScore                   *MaxScoreIterator
}

// NewGenericStack constructs a stack used for selecting service placements
func NewGenericStack(batch bool, ctx Context) *GenericStack {
	// Create a new stack
	s := &GenericStack{
		batch: batch,
		ctx:   ctx,
	}

	// Create the source iterator. We randomize the order we visit nodes
	// to reduce collisions between schedulers and to do basic load
	// balancing across eligible nodes.
	s.source = NewRandomIterator(ctx, nil)

	// Create the quota iterator to determine if a placement would cause the
	// quota attached to the job's namespace to be exceeded.
	s.quota = NewQuotaIterator(ctx, s.source)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Create the feasibility wrapper, which allows feasibility checking to be
	// skipped when the computed node class has previously been marked as
	// eligible or ineligible. Generally these are checks that only need to
	// examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)

	// Filter on distinct host constraints.
	s.distinctHostsConstraint = NewDistinctHostsIterator(ctx, s.wrappedChecks)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.distinctHostsConstraint)

	// Upgrade from a feasibility iterator to a rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply bin packing; this depends on the resources needed by a particular
	// task group. Only enable eviction for the service scheduler as that
	// logic is expensive.
	evict := !batch
	s.binPack = NewBinPackIterator(ctx, rankSource, evict, 0)

	// Apply the job anti-affinity iterator. This is to avoid placing
	// multiple allocations on the same node for this job. The penalty
	// is lower for batch jobs as co-location matters less for them.
	penalty := serviceJobAntiAffinityPenalty
	if batch {
		penalty = batchJobAntiAffinityPenalty
	}
	s.jobAntiAff = NewJobAntiAffinityIterator(ctx, s.binPack, penalty, "")

	// Penalize nodes on which a previous allocation for this job failed.
	s.nodeAntiAff = NewNodeAntiAffinityIterator(ctx, s.jobAntiAff, previousFailedAllocNodePenalty)

	// Apply a limit function. This is to avoid scanning *every* possible node.
	s.limit = NewLimitIterator(ctx, s.nodeAntiAff, 2, skipScoreThreshold, maxSkip)

	// Select the node with the maximum score for placement
	s.maxScore = NewMaxScoreIterator(ctx, s.limit)
	return s
}
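
// exampleGenericStackUsage is a minimal, illustrative sketch of how a
// scheduler might drive the stack above; it is not part of the scheduler
// itself. The ctx, nodes, and job parameters are assumed to be supplied by
// the caller (e.g. fetched from state), and the function name is
// hypothetical.
func exampleGenericStackUsage(ctx Context, nodes []*structs.Node, job *structs.Job) {
	// false selects the service scheduler behavior; pass true for batch.
	stack := NewGenericStack(false, ctx)
	stack.SetNodes(nodes) // shuffles nodes and derives the selection limit
	stack.SetJob(job)     // pushes job constraints into the iterator chain

	for _, tg := range job.TaskGroups {
		option, size := stack.Select(tg, &SelectOptions{})
		if option == nil {
			continue // no feasible node; a real caller would record the failure
		}
		_ = option.Node // the chosen node for the placement
		_ = size        // the resources required by the task group
	}
}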

func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
	// Shuffle the base nodes
	shuffleNodes(baseNodes)

	// Update the set of base nodes
	s.source.SetNodes(baseNodes)

	// Apply a limit function. This is to avoid scanning *every* possible
	// node. For batch jobs we only need to evaluate 2 options and depend on
	// the power of two choices. For service jobs we need to visit "enough".
	// Using the log of the total number of nodes is a good restriction, with
	// at least 2 as the floor.
	limit := 2
	if n := len(baseNodes); !s.batch && n > 0 {
		logLimit := int(math.Ceil(math.Log2(float64(n))))
		if logLimit > limit {
			limit = logLimit
		}
	}
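
	// For example (node counts here are illustrative, not taken from this
	// file): a batch job always gets limit = 2 (power of two choices); a
	// service job over 16 nodes gets ceil(log2(16)) = 4; over 100 nodes,
	// ceil(log2(100)) = 7; and over a single node, ceil(log2(1)) = 0, so
	// the floor of 2 still applies.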
	s.limit.SetLimit(limit)
}

func (s *GenericStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctHostsConstraint.SetJob(job)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetPriority(job.Priority)
	s.jobAntiAff.SetJob(job.ID)
	s.ctx.Eligibility().SetJob(job)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) {
	// If the options specify preferred nodes, try selecting from those
	// first, then restore the original node set before returning or falling
	// back to a normal selection.
	if options != nil && len(options.PreferredNodes) > 0 {
		originalNodes := s.source.nodes
		s.source.SetNodes(options.PreferredNodes)
		optionsNew := *options
		optionsNew.PreferredNodes = nil
		if option, resources := s.Select(tg, &optionsNew); option != nil {
			s.source.SetNodes(originalNodes)
			return option, resources
		}
		s.source.SetNodes(originalNodes)
		return s.Select(tg, &optionsNew)
	}

	// Reset the max selector and context
	s.maxScore.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of the iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.distinctHostsConstraint.SetTaskGroup(tg)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.binPack.SetTaskGroup(tg)
	if options != nil {
		s.nodeAntiAff.SetPenaltyNodes(options.PenaltyNodeIDs)
	}

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Find the node with the max score
	option := s.maxScore.Next()

	// Ensure that the task resources were specified
	if option != nil && len(option.TaskResources) != len(tg.Tasks) {
		for _, task := range tg.Tasks {
			option.SetTaskResources(task, task.Resources)
		}
	}

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option, tgConstr.size
}
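
// exampleSelectWithPenalties is an illustrative sketch (not part of this
// file's API) of how SelectOptions steers Select: node IDs in PenaltyNodeIDs
// are scored down by the node anti-affinity iterator, and PreferredNodes are
// tried on their own before falling back to the full node set. The
// failedNodeID and preferred parameters are assumptions supplied by the
// caller.
func exampleSelectWithPenalties(s *GenericStack, tg *structs.TaskGroup, failedNodeID string, preferred *structs.Node) *RankedNode {
	opts := &SelectOptions{
		PenaltyNodeIDs: map[string]struct{}{failedNodeID: {}},
		PreferredNodes: []*structs.Node{preferred},
	}
	option, _ := s.Select(tg, opts)
	return option
}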

// SystemStack is the Stack used for the System scheduler. It is designed to
// attempt to make placements on all nodes.
type SystemStack struct {
	ctx                        Context
	source                     *StaticIterator
	wrappedChecks              *FeasibilityWrapper
	quota                      FeasibleIterator
	jobConstraint              *ConstraintChecker
	taskGroupDrivers           *DriverChecker
	taskGroupConstraint        *ConstraintChecker
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
}

// NewSystemStack constructs a stack used for selecting system placements
func NewSystemStack(ctx Context) *SystemStack {
	// Create a new stack
	s := &SystemStack{ctx: ctx}

	// Create the source iterator. We visit nodes in a linear order because we
	// have to evaluate on all nodes.
	s.source = NewStaticIterator(ctx, nil)

	// Create the quota iterator to determine if a placement would cause the
	// quota attached to the job's namespace to be exceeded.
	s.quota = NewQuotaIterator(ctx, s.source)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Create the feasibility wrapper, which allows feasibility checking to be
	// skipped when the computed node class has previously been marked as
	// eligible or ineligible. Generally these are checks that only need to
	// examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.quota, jobs, tgs)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.wrappedChecks)

	// Upgrade from a feasibility iterator to a rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply bin packing; this depends on the resources needed by a particular
	// task group. Enable eviction as system jobs are high priority.
	s.binPack = NewBinPackIterator(ctx, rankSource, true, 0)
	return s
}

func (s *SystemStack) SetNodes(baseNodes []*structs.Node) {
	// Update the set of base nodes
	s.source.SetNodes(baseNodes)
}

func (s *SystemStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetPriority(job.Priority)
	s.ctx.Eligibility().SetJob(job)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetJob(job)
	}
}

func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) {
	// Reset the binpack selector and context
	s.binPack.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of the iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.binPack.SetTaskGroup(tg)

	if contextual, ok := s.quota.(ContextualIterator); ok {
		contextual.SetTaskGroup(tg)
	}

	// Get the next option that satisfies the constraints.
	option := s.binPack.Next()

	// Ensure that the task resources were specified
	if option != nil && len(option.TaskResources) != len(tg.Tasks) {
		for _, task := range tg.Tasks {
			option.SetTaskResources(task, task.Resources)
		}
	}

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option, tgConstr.size
}
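
// exampleSystemStackUsage is a hedged sketch of how a caller might drive
// SystemStack. Because system jobs attempt a placement on every feasible
// node, the caller is assumed to feed nodes one at a time rather than
// letting the stack pick a single winner; the function name and all
// parameter names here are illustrative, not part of this file.
func exampleSystemStackUsage(ctx Context, nodes []*structs.Node, job *structs.Job, tg *structs.TaskGroup) {
	stack := NewSystemStack(ctx)
	stack.SetJob(job)

	for _, node := range nodes {
		stack.SetNodes([]*structs.Node{node})
		if option, _ := stack.Select(tg, nil); option != nil {
			_ = option.Node // place the task group's allocation on this node
		}
	}
}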