github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/scheduler/stack.go

package scheduler

import (
	"math"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// serviceJobAntiAffinityPenalty is the penalty applied
	// to the score for placing an alloc on a node that
	// already has an alloc for this job.
	serviceJobAntiAffinityPenalty = 20.0

	// batchJobAntiAffinityPenalty is the same as the
	// serviceJobAntiAffinityPenalty but for batch type jobs.
	batchJobAntiAffinityPenalty = 10.0
)

// Stack is a chained collection of iterators. The stack is used to
// make placement decisions. Different schedulers may customize the
// stack they use to vary the way placements are made.
type Stack interface {
	// SetNodes is used to set the base set of potential nodes
	SetNodes([]*structs.Node)

	// SetJob is used to set the job for selection
	SetJob(job *structs.Job)

	// Select is used to select a node for the task group
	Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources)
}

// GenericStack is the Stack used for the Generic scheduler. It is
// designed to make better placement decisions at the cost of performance.
type GenericStack struct {
	batch  bool
	ctx    Context
	source *StaticIterator

	wrappedChecks       *FeasibilityWrapper
	jobConstraint       *ConstraintChecker
	taskGroupDrivers    *DriverChecker
	taskGroupConstraint *ConstraintChecker

	distinctHostsConstraint    *DistinctHostsIterator
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
	jobAntiAff                 *JobAntiAffinityIterator
	limit                      *LimitIterator
	maxScore                   *MaxScoreIterator
}

// NewGenericStack constructs a stack used for selecting service placements
func NewGenericStack(batch bool, ctx Context) *GenericStack {
	// Create a new stack
	s := &GenericStack{
		batch: batch,
		ctx:   ctx,
	}

	// Create the source iterator. We randomize the order we visit nodes
	// to reduce collisions between schedulers and to do basic load
	// balancing across eligible nodes.
	s.source = NewRandomIterator(ctx, nil)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Create the feasibility wrapper, which allows feasibility checking to
	// be skipped if the computed node class has previously been marked as
	// eligible or ineligible. Generally these are checks that only need to
	// examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.source, jobs, tgs)

	// Filter on distinct host constraints.
	s.distinctHostsConstraint = NewDistinctHostsIterator(ctx, s.wrappedChecks)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.distinctHostsConstraint)

	// Upgrade from a feasibility iterator to a rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply bin packing; this depends on the resources needed by a
	// particular task group. Only enable eviction for the service
	// scheduler as that logic is expensive.
	evict := !batch
	s.binPack = NewBinPackIterator(ctx, rankSource, evict, 0)

	// Apply the job anti-affinity iterator to avoid placing multiple
	// allocations for the same job on one node. The penalty is lower for
	// batch jobs because co-location matters less for them.
	penalty := serviceJobAntiAffinityPenalty
	if batch {
		penalty = batchJobAntiAffinityPenalty
	}
	s.jobAntiAff = NewJobAntiAffinityIterator(ctx, s.binPack, penalty, "")

	// Apply a limit function. This is to avoid scanning *every* possible node.
	s.limit = NewLimitIterator(ctx, s.jobAntiAff, 2)

	// Select the node with the maximum score for placement
	s.maxScore = NewMaxScoreIterator(ctx, s.limit)
	return s
}
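// Taken together, the constructor above builds the following chain; each
// iterator consumes the output of the previous one, so cheap feasibility
// filtering runs before any scoring work:
//
//	source (random) -> wrappedChecks (job + task group feasibility)
//	    -> distinctHostsConstraint -> distinctPropertyConstraint
//	    -> rank upgrade -> binPack -> jobAntiAff -> limit -> maxScore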
func (s *GenericStack) SetNodes(baseNodes []*structs.Node) {
	// Shuffle base nodes
	shuffleNodes(baseNodes)

	// Update the set of base nodes
	s.source.SetNodes(baseNodes)

	// Apply a limit function. This is to avoid scanning *every* possible node.
	// For batch jobs we only need to evaluate 2 options and depend on the
	// power of two choices. For service jobs we need to visit "enough".
	// Using the log of the total number of nodes is a good restriction, with
	// at least 2 as the floor (e.g. 100 nodes gives ceil(log2(100)) = 7).
	limit := 2
	if n := len(baseNodes); !s.batch && n > 0 {
		logLimit := int(math.Ceil(math.Log2(float64(n))))
		if logLimit > limit {
			limit = logLimit
		}
	}
	s.limit.SetLimit(limit)
}

func (s *GenericStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctHostsConstraint.SetJob(job)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetPriority(job.Priority)
	s.jobAntiAff.SetJob(job.ID)
	s.ctx.Eligibility().SetJob(job)
}

func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) {
	// Reset the max selector and context
	s.maxScore.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of the iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.distinctHostsConstraint.SetTaskGroup(tg)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.binPack.SetTaskGroup(tg)

	// Find the node with the max score
	option := s.maxScore.Next()

	// Ensure that the task resources were specified
	if option != nil && len(option.TaskResources) != len(tg.Tasks) {
		for _, task := range tg.Tasks {
			option.SetTaskResources(task, task.Resources)
		}
	}

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option, tgConstr.size
}

// SelectPreferringNodes returns a node where an allocation of the task group
// can be placed; the nodes passed in are preferred over the other available
// nodes.
func (s *GenericStack) SelectPreferringNodes(tg *structs.TaskGroup, nodes []*structs.Node) (*RankedNode, *structs.Resources) {
	originalNodes := s.source.nodes
	s.source.SetNodes(nodes)
	if option, resources := s.Select(tg); option != nil {
		s.source.SetNodes(originalNodes)
		return option, resources
	}
	s.source.SetNodes(originalNodes)
	return s.Select(tg)
}
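// selectServicePlacements is a hypothetical helper (it does not exist in this
// package) sketching how a generic scheduler might drive the stack above:
// prime it with the eligible nodes and the job, then run Select once per
// task group.
func selectServicePlacements(ctx Context, job *structs.Job, nodes []*structs.Node) map[string]*RankedNode {
	stack := NewGenericStack(job.Type == structs.JobTypeBatch, ctx)
	stack.SetNodes(nodes) // shuffles the nodes and sets the visit limit
	stack.SetJob(job)

	placements := make(map[string]*RankedNode, len(job.TaskGroups))
	for _, tg := range job.TaskGroups {
		option, _ := stack.Select(tg) // option is nil when no node is feasible
		placements[tg.Name] = option
	}
	return placements
}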
// SystemStack is the Stack used for the System scheduler. It is designed to
// attempt to make placements on all nodes.
type SystemStack struct {
	ctx                        Context
	source                     *StaticIterator
	wrappedChecks              *FeasibilityWrapper
	jobConstraint              *ConstraintChecker
	taskGroupDrivers           *DriverChecker
	taskGroupConstraint        *ConstraintChecker
	distinctPropertyConstraint *DistinctPropertyIterator
	binPack                    *BinPackIterator
}

// NewSystemStack constructs a stack used for selecting system job placements
func NewSystemStack(ctx Context) *SystemStack {
	// Create a new stack
	s := &SystemStack{ctx: ctx}

	// Create the source iterator. We visit nodes in a linear order because we
	// have to evaluate on all nodes.
	s.source = NewStaticIterator(ctx, nil)

	// Attach the job constraints. The job is filled in later.
	s.jobConstraint = NewConstraintChecker(ctx, nil)

	// Filter on task group drivers first as they are faster
	s.taskGroupDrivers = NewDriverChecker(ctx, nil)

	// Filter on task group constraints second
	s.taskGroupConstraint = NewConstraintChecker(ctx, nil)

	// Create the feasibility wrapper, which allows feasibility checking to
	// be skipped if the computed node class has previously been marked as
	// eligible or ineligible. Generally these are checks that only need to
	// examine a single node to determine feasibility.
	jobs := []FeasibilityChecker{s.jobConstraint}
	tgs := []FeasibilityChecker{s.taskGroupDrivers, s.taskGroupConstraint}
	s.wrappedChecks = NewFeasibilityWrapper(ctx, s.source, jobs, tgs)

	// Filter on distinct property constraints.
	s.distinctPropertyConstraint = NewDistinctPropertyIterator(ctx, s.wrappedChecks)

	// Upgrade from a feasibility iterator to a rank iterator
	rankSource := NewFeasibleRankIterator(ctx, s.distinctPropertyConstraint)

	// Apply bin packing; this depends on the resources needed by a
	// particular task group. Enable eviction as system jobs are high
	// priority.
	s.binPack = NewBinPackIterator(ctx, rankSource, true, 0)
	return s
}

func (s *SystemStack) SetNodes(baseNodes []*structs.Node) {
	// Update the set of base nodes
	s.source.SetNodes(baseNodes)
}

func (s *SystemStack) SetJob(job *structs.Job) {
	s.jobConstraint.SetConstraints(job.Constraints)
	s.distinctPropertyConstraint.SetJob(job)
	s.binPack.SetPriority(job.Priority)
	s.ctx.Eligibility().SetJob(job)
}

func (s *SystemStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) {
	// Reset the binpack selector and context
	s.binPack.Reset()
	s.ctx.Reset()
	start := time.Now()

	// Get the task group's constraints.
	tgConstr := taskGroupConstraints(tg)

	// Update the parameters of the iterators
	s.taskGroupDrivers.SetDrivers(tgConstr.drivers)
	s.taskGroupConstraint.SetConstraints(tgConstr.constraints)
	s.wrappedChecks.SetTaskGroup(tg.Name)
	s.distinctPropertyConstraint.SetTaskGroup(tg)
	s.binPack.SetTaskGroup(tg)

	// Get the next option that satisfies the constraints.
	option := s.binPack.Next()

	// Ensure that the task resources were specified
	if option != nil && len(option.TaskResources) != len(tg.Tasks) {
		for _, task := range tg.Tasks {
			option.SetTaskResources(task, task.Resources)
		}
	}

	// Store the compute time
	s.ctx.Metrics().AllocationTime = time.Since(start)
	return option, tgConstr.size
}
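// placeSystemJob is a hypothetical helper (it does not exist in this package)
// sketching how a system scheduler might drive SystemStack: because system
// jobs attempt a placement on every node, the stack is pointed at one
// candidate node at a time and Select is run per task group.
func placeSystemJob(ctx Context, job *structs.Job, nodes []*structs.Node) {
	stack := NewSystemStack(ctx)
	stack.SetJob(job)

	for _, node := range nodes {
		// Evaluate a single node; the static iterator keeps the given order.
		stack.SetNodes([]*structs.Node{node})
		for _, tg := range job.TaskGroups {
			option, _ := stack.Select(tg)
			if option == nil {
				continue // node is infeasible for this task group
			}
			// A real scheduler would record the placement of tg on node here.
		}
	}
}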