volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/capacity/capacity.go (about) 1 /* 2 Copyright 2024 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package capacity 18 19 import ( 20 "math" 21 22 v1 "k8s.io/api/core/v1" 23 "k8s.io/klog/v2" 24 25 "volcano.sh/apis/pkg/apis/scheduling" 26 "volcano.sh/volcano/pkg/scheduler/api" 27 "volcano.sh/volcano/pkg/scheduler/api/helpers" 28 "volcano.sh/volcano/pkg/scheduler/framework" 29 "volcano.sh/volcano/pkg/scheduler/metrics" 30 "volcano.sh/volcano/pkg/scheduler/plugins/util" 31 ) 32 33 const ( 34 PluginName = "capacity" 35 ) 36 37 type capacityPlugin struct { 38 totalResource *api.Resource 39 totalGuarantee *api.Resource 40 41 queueOpts map[api.QueueID]*queueAttr 42 // Arguments given for the plugin 43 pluginArguments framework.Arguments 44 } 45 46 type queueAttr struct { 47 queueID api.QueueID 48 name string 49 share float64 50 51 deserved *api.Resource 52 allocated *api.Resource 53 request *api.Resource 54 // elastic represents the sum of job's elastic resource, job's elastic = job.allocated - job.minAvailable 55 elastic *api.Resource 56 // inqueue represents the resource request of the inqueue job 57 inqueue *api.Resource 58 capability *api.Resource 59 // realCapability represents the resource limit of the queue, LessEqual capability 60 realCapability *api.Resource 61 guarantee *api.Resource 62 } 63 64 // New return capacityPlugin action 65 func New(arguments framework.Arguments) framework.Plugin { 66 return &capacityPlugin{ 67 totalResource: api.EmptyResource(), 68 totalGuarantee: api.EmptyResource(), 69 queueOpts: map[api.QueueID]*queueAttr{}, 70 pluginArguments: arguments, 71 } 72 } 73 74 func (cp *capacityPlugin) Name() string { 75 return PluginName 76 } 77 78 func (cp *capacityPlugin) OnSessionOpen(ssn *framework.Session) { 79 // Prepare scheduling data for this session. 80 cp.totalResource.Add(ssn.TotalResource) 81 82 klog.V(4).Infof("The total resource is <%v>", cp.totalResource) 83 for _, queue := range ssn.Queues { 84 if len(queue.Queue.Spec.Guarantee.Resource) == 0 { 85 continue 86 } 87 guarantee := api.NewResource(queue.Queue.Spec.Guarantee.Resource) 88 cp.totalGuarantee.Add(guarantee) 89 } 90 klog.V(4).Infof("The total guarantee resource is <%v>", cp.totalGuarantee) 91 // Build attributes for Queues. 92 for _, job := range ssn.Jobs { 93 klog.V(4).Infof("Considering Job <%s/%s>.", job.Namespace, job.Name) 94 if _, found := cp.queueOpts[job.Queue]; !found { 95 queue := ssn.Queues[job.Queue] 96 attr := &queueAttr{ 97 queueID: queue.UID, 98 name: queue.Name, 99 100 deserved: api.NewResource(queue.Queue.Spec.Deserved), 101 allocated: api.EmptyResource(), 102 request: api.EmptyResource(), 103 elastic: api.EmptyResource(), 104 inqueue: api.EmptyResource(), 105 guarantee: api.EmptyResource(), 106 } 107 if len(queue.Queue.Spec.Capability) != 0 { 108 attr.capability = api.NewResource(queue.Queue.Spec.Capability) 109 if attr.capability.MilliCPU <= 0 { 110 attr.capability.MilliCPU = math.MaxFloat64 111 } 112 if attr.capability.Memory <= 0 { 113 attr.capability.Memory = math.MaxFloat64 114 } 115 } 116 if len(queue.Queue.Spec.Guarantee.Resource) != 0 { 117 attr.guarantee = api.NewResource(queue.Queue.Spec.Guarantee.Resource) 118 } 119 realCapability := cp.totalResource.Clone().Sub(cp.totalGuarantee).Add(attr.guarantee) 120 if attr.capability == nil { 121 attr.realCapability = realCapability 122 } else { 123 realCapability.MinDimensionResource(attr.capability, api.Infinity) 124 attr.realCapability = realCapability 125 } 126 cp.queueOpts[job.Queue] = attr 127 klog.V(4).Infof("Added Queue <%s> attributes.", job.Queue) 128 } 129 130 attr := cp.queueOpts[job.Queue] 131 for status, tasks := range job.TaskStatusIndex { 132 if api.AllocatedStatus(status) { 133 for _, t := range tasks { 134 attr.allocated.Add(t.Resreq) 135 attr.request.Add(t.Resreq) 136 } 137 } else if status == api.Pending { 138 for _, t := range tasks { 139 attr.request.Add(t.Resreq) 140 } 141 } 142 } 143 144 if job.PodGroup.Status.Phase == scheduling.PodGroupInqueue { 145 attr.inqueue.Add(job.GetMinResources()) 146 } 147 148 // calculate inqueue resource for running jobs 149 // the judgement 'job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember' will work on cases such as the following condition: 150 // Considering a Spark job is completed(driver pod is completed) while the podgroup keeps running, the allocated resource will be reserved again if without the judgement. 151 if job.PodGroup.Status.Phase == scheduling.PodGroupRunning && 152 job.PodGroup.Spec.MinResources != nil && 153 int32(util.CalculateAllocatedTaskNum(job)) >= job.PodGroup.Spec.MinMember { 154 inqueued := util.GetInqueueResource(job, job.Allocated) 155 attr.inqueue.Add(inqueued) 156 } 157 attr.elastic.Add(job.GetElasticResources()) 158 klog.V(5).Infof("Queue %s allocated <%s> request <%s> inqueue <%s> elastic <%s>", 159 attr.name, attr.allocated.String(), attr.request.String(), attr.inqueue.String(), attr.elastic.String()) 160 } 161 162 for _, attr := range cp.queueOpts { 163 if attr.realCapability != nil { 164 attr.deserved.MinDimensionResource(attr.realCapability, api.Infinity) 165 } 166 // When scalar resource not specified in deserved such as "pods", we should skip it and consider deserved resource as infinity. 167 attr.deserved.MinDimensionResource(attr.request, api.Infinity) 168 169 attr.deserved = helpers.Max(attr.deserved, attr.guarantee) 170 cp.updateShare(attr) 171 klog.V(4).Infof("The attributes of queue <%s> in capacity: deserved <%v>, realCapability <%v>, allocate <%v>, request <%v>, elastic <%v>, share <%0.2f>", 172 attr.name, attr.deserved, attr.realCapability, attr.allocated, attr.request, attr.elastic, attr.share) 173 } 174 175 // Record metrics 176 for queueID, queueInfo := range ssn.Queues { 177 queue := ssn.Queues[queueID] 178 if attr, ok := cp.queueOpts[queueID]; ok { 179 metrics.UpdateQueueDeserved(attr.name, attr.deserved.MilliCPU, attr.deserved.Memory) 180 metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory) 181 metrics.UpdateQueueRequest(attr.name, attr.request.MilliCPU, attr.request.Memory) 182 metrics.UpdateQueuePodGroupInqueueCount(attr.name, queue.Queue.Status.Inqueue) 183 metrics.UpdateQueuePodGroupPendingCount(attr.name, queue.Queue.Status.Pending) 184 metrics.UpdateQueuePodGroupRunningCount(attr.name, queue.Queue.Status.Running) 185 metrics.UpdateQueuePodGroupUnknownCount(attr.name, queue.Queue.Status.Unknown) 186 continue 187 } 188 deservedCPU, deservedMem := 0.0, 0.0 189 if queue.Queue.Spec.Deserved != nil { 190 deservedCPU = float64(queue.Queue.Spec.Deserved.Cpu().MilliValue()) 191 deservedMem = float64(queue.Queue.Spec.Deserved.Memory().Value()) 192 } 193 metrics.UpdateQueueDeserved(queueInfo.Name, deservedCPU, deservedMem) 194 metrics.UpdateQueueAllocated(queueInfo.Name, 0, 0) 195 metrics.UpdateQueueRequest(queueInfo.Name, 0, 0) 196 metrics.UpdateQueuePodGroupInqueueCount(queueInfo.Name, 0) 197 metrics.UpdateQueuePodGroupPendingCount(queueInfo.Name, 0) 198 metrics.UpdateQueuePodGroupRunningCount(queueInfo.Name, 0) 199 metrics.UpdateQueuePodGroupUnknownCount(queueInfo.Name, 0) 200 } 201 202 ssn.AddQueueOrderFn(cp.Name(), func(l, r interface{}) int { 203 lv := l.(*api.QueueInfo) 204 rv := r.(*api.QueueInfo) 205 206 if cp.queueOpts[lv.UID].share == cp.queueOpts[rv.UID].share { 207 return 0 208 } 209 210 if cp.queueOpts[lv.UID].share < cp.queueOpts[rv.UID].share { 211 return -1 212 } 213 214 return 1 215 }) 216 217 ssn.AddReclaimableFn(cp.Name(), func(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) ([]*api.TaskInfo, int) { 218 var victims []*api.TaskInfo 219 allocations := map[api.QueueID]*api.Resource{} 220 221 for _, reclaimee := range reclaimees { 222 job := ssn.Jobs[reclaimee.Job] 223 attr := cp.queueOpts[job.Queue] 224 225 if _, found := allocations[job.Queue]; !found { 226 allocations[job.Queue] = attr.allocated.Clone() 227 } 228 allocated := allocations[job.Queue] 229 if allocated.LessPartly(reclaimer.Resreq, api.Zero) { 230 klog.V(3).Infof("Failed to allocate resource for Task <%s/%s> in Queue <%s>, not enough resource.", 231 reclaimee.Namespace, reclaimee.Name, job.Queue) 232 continue 233 } 234 235 exceptReclaimee := allocated.Clone().Sub(reclaimee.Resreq) 236 // When scalar resource not specified in deserved such as "pods", we should skip it and consider it as infinity, 237 // so the following first condition will be true and the current queue will not be reclaimed. 238 if allocated.LessEqual(attr.deserved, api.Infinity) || !attr.guarantee.LessEqual(exceptReclaimee, api.Zero) { 239 continue 240 } 241 allocated.Sub(reclaimee.Resreq) 242 victims = append(victims, reclaimee) 243 } 244 klog.V(4).InfoS("Victims from capacity plugin", "victims", victims, "reclaimer", reclaimer) 245 return victims, util.Permit 246 }) 247 248 ssn.AddPreemptiveFn(cp.Name(), func(obj interface{}) bool { 249 queue := obj.(*api.QueueInfo) 250 attr := cp.queueOpts[queue.UID] 251 252 overused := attr.deserved.LessEqual(attr.allocated, api.Zero) 253 metrics.UpdateQueueOverused(attr.name, overused) 254 if overused { 255 klog.V(3).Infof("Queue <%v> can not reclaim, deserved <%v>, allocated <%v>, share <%v>", 256 queue.Name, attr.deserved, attr.allocated, attr.share) 257 } 258 259 return !overused 260 }) 261 262 ssn.AddAllocatableFn(cp.Name(), func(queue *api.QueueInfo, candidate *api.TaskInfo) bool { 263 attr := cp.queueOpts[queue.UID] 264 265 free, _ := attr.realCapability.Diff(attr.allocated, api.Zero) 266 allocatable := candidate.Resreq.LessEqual(free, api.Zero) 267 if !allocatable { 268 klog.V(3).Infof("Queue <%v>: realCapability <%v>, allocated <%v>; Candidate <%v>: resource request <%v>", 269 queue.Name, attr.realCapability, attr.allocated, candidate.Name, candidate.Resreq) 270 } 271 272 return allocatable 273 }) 274 275 ssn.AddJobEnqueueableFn(cp.Name(), func(obj interface{}) int { 276 job := obj.(*api.JobInfo) 277 queueID := job.Queue 278 attr := cp.queueOpts[queueID] 279 queue := ssn.Queues[queueID] 280 // If no capability is set, always enqueue the job. 281 if attr.realCapability == nil { 282 klog.V(4).Infof("Capability of queue <%s> was not set, allow job <%s/%s> to Inqueue.", 283 queue.Name, job.Namespace, job.Name) 284 return util.Permit 285 } 286 287 if job.PodGroup.Spec.MinResources == nil { 288 klog.V(4).Infof("job %s MinResources is null.", job.Name) 289 return util.Permit 290 } 291 minReq := job.GetMinResources() 292 293 klog.V(5).Infof("job %s min resource <%s>, queue %s capability <%s> allocated <%s> inqueue <%s> elastic <%s>", 294 job.Name, minReq.String(), queue.Name, attr.realCapability.String(), attr.allocated.String(), attr.inqueue.String(), attr.elastic.String()) 295 // The queue resource quota limit has not reached 296 r := minReq.Add(attr.allocated).Add(attr.inqueue).Sub(attr.elastic) 297 rr := attr.realCapability.Clone() 298 299 for name := range rr.ScalarResources { 300 if _, ok := r.ScalarResources[name]; !ok { 301 delete(rr.ScalarResources, name) 302 } 303 } 304 305 inqueue := r.LessEqual(rr, api.Infinity) 306 klog.V(5).Infof("job %s inqueue %v", job.Name, inqueue) 307 if inqueue { 308 attr.inqueue.Add(job.GetMinResources()) 309 return util.Permit 310 } 311 ssn.RecordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupUnschedulableType), "queue resource quota insufficient") 312 return util.Reject 313 }) 314 315 // Register event handlers. 316 ssn.AddEventHandler(&framework.EventHandler{ 317 AllocateFunc: func(event *framework.Event) { 318 job := ssn.Jobs[event.Task.Job] 319 attr := cp.queueOpts[job.Queue] 320 attr.allocated.Add(event.Task.Resreq) 321 metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory) 322 323 cp.updateShare(attr) 324 325 klog.V(4).Infof("Capacity AllocateFunc: task <%v/%v>, resreq <%v>, share <%v>", 326 event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share) 327 }, 328 DeallocateFunc: func(event *framework.Event) { 329 job := ssn.Jobs[event.Task.Job] 330 attr := cp.queueOpts[job.Queue] 331 attr.allocated.Sub(event.Task.Resreq) 332 metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory) 333 334 cp.updateShare(attr) 335 336 klog.V(4).Infof("Capacity EvictFunc: task <%v/%v>, resreq <%v>, share <%v>", 337 event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share) 338 }, 339 }) 340 } 341 342 func (cp *capacityPlugin) OnSessionClose(ssn *framework.Session) { 343 cp.totalResource = nil 344 cp.totalGuarantee = nil 345 cp.queueOpts = nil 346 } 347 348 func (cp *capacityPlugin) updateShare(attr *queueAttr) { 349 res := float64(0) 350 351 for _, rn := range attr.deserved.ResourceNames() { 352 share := helpers.Share(attr.allocated.Get(rn), attr.deserved.Get(rn)) 353 if share > res { 354 res = share 355 } 356 } 357 358 attr.share = res 359 metrics.UpdateQueueShare(attr.name, attr.share) 360 }