volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/capacity/capacity.go (about)

     1  /*
     2  Copyright 2024 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package capacity
    18  
    19  import (
    20  	"math"
    21  
    22  	v1 "k8s.io/api/core/v1"
    23  	"k8s.io/klog/v2"
    24  
    25  	"volcano.sh/apis/pkg/apis/scheduling"
    26  	"volcano.sh/volcano/pkg/scheduler/api"
    27  	"volcano.sh/volcano/pkg/scheduler/api/helpers"
    28  	"volcano.sh/volcano/pkg/scheduler/framework"
    29  	"volcano.sh/volcano/pkg/scheduler/metrics"
    30  	"volcano.sh/volcano/pkg/scheduler/plugins/util"
    31  )
    32  
    33  const (
    34  	PluginName = "capacity"
    35  )
    36  
    37  type capacityPlugin struct {
    38  	totalResource  *api.Resource
    39  	totalGuarantee *api.Resource
    40  
    41  	queueOpts map[api.QueueID]*queueAttr
    42  	// Arguments given for the plugin
    43  	pluginArguments framework.Arguments
    44  }
    45  
    46  type queueAttr struct {
    47  	queueID api.QueueID
    48  	name    string
    49  	share   float64
    50  
    51  	deserved  *api.Resource
    52  	allocated *api.Resource
    53  	request   *api.Resource
    54  	// elastic represents the sum of job's elastic resource, job's elastic = job.allocated - job.minAvailable
    55  	elastic *api.Resource
    56  	// inqueue represents the resource request of the inqueue job
    57  	inqueue    *api.Resource
    58  	capability *api.Resource
    59  	// realCapability represents the resource limit of the queue, LessEqual capability
    60  	realCapability *api.Resource
    61  	guarantee      *api.Resource
    62  }
    63  
    64  // New return capacityPlugin action
    65  func New(arguments framework.Arguments) framework.Plugin {
    66  	return &capacityPlugin{
    67  		totalResource:   api.EmptyResource(),
    68  		totalGuarantee:  api.EmptyResource(),
    69  		queueOpts:       map[api.QueueID]*queueAttr{},
    70  		pluginArguments: arguments,
    71  	}
    72  }
    73  
    74  func (cp *capacityPlugin) Name() string {
    75  	return PluginName
    76  }
    77  
// OnSessionOpen prepares the plugin for one scheduling session. It totals the
// session's resources and the queues' guarantees, builds a queueAttr for every
// queue that has at least one job, publishes queue metrics, and registers the
// plugin's queue-order, reclaimable, preemptive, allocatable and
// job-enqueueable callbacks together with allocate/deallocate event handlers
// on ssn.
func (cp *capacityPlugin) OnSessionOpen(ssn *framework.Session) {
	// Prepare scheduling data for this session.
	cp.totalResource.Add(ssn.TotalResource)

	klog.V(4).Infof("The total resource is <%v>", cp.totalResource)
	// Sum the guarantee of every queue that declares one.
	for _, queue := range ssn.Queues {
		if len(queue.Queue.Spec.Guarantee.Resource) == 0 {
			continue
		}
		guarantee := api.NewResource(queue.Queue.Spec.Guarantee.Resource)
		cp.totalGuarantee.Add(guarantee)
	}
	klog.V(4).Infof("The total guarantee resource is <%v>", cp.totalGuarantee)
	// Build attributes for Queues.
	// Attributes are created the first time a job of the queue is seen, so a
	// queue without any job gets no entry in cp.queueOpts.
	for _, job := range ssn.Jobs {
		klog.V(4).Infof("Considering Job <%s/%s>.", job.Namespace, job.Name)
		if _, found := cp.queueOpts[job.Queue]; !found {
			// NOTE(review): assumes ssn.Queues contains job.Queue; a missing
			// entry would make queue nil here — confirm against the session builder.
			queue := ssn.Queues[job.Queue]
			attr := &queueAttr{
				queueID: queue.UID,
				name:    queue.Name,

				deserved:  api.NewResource(queue.Queue.Spec.Deserved),
				allocated: api.EmptyResource(),
				request:   api.EmptyResource(),
				elastic:   api.EmptyResource(),
				inqueue:   api.EmptyResource(),
				guarantee: api.EmptyResource(),
			}
			// A non-positive CPU or memory value in the declared capability is
			// treated as unlimited for that dimension.
			if len(queue.Queue.Spec.Capability) != 0 {
				attr.capability = api.NewResource(queue.Queue.Spec.Capability)
				if attr.capability.MilliCPU <= 0 {
					attr.capability.MilliCPU = math.MaxFloat64
				}
				if attr.capability.Memory <= 0 {
					attr.capability.Memory = math.MaxFloat64
				}
			}
			if len(queue.Queue.Spec.Guarantee.Resource) != 0 {
				attr.guarantee = api.NewResource(queue.Queue.Spec.Guarantee.Resource)
			}
			// The queue may use the total resource minus the total guarantee
			// plus its own guarantee, i.e. whatever is not guaranteed to other queues.
			realCapability := cp.totalResource.Clone().Sub(cp.totalGuarantee).Add(attr.guarantee)
			if attr.capability == nil {
				attr.realCapability = realCapability
			} else {
				// Presumably limits each dimension to the smaller of the two values;
				// confirm against api.Resource.MinDimensionResource.
				realCapability.MinDimensionResource(attr.capability, api.Infinity)
				attr.realCapability = realCapability
			}
			cp.queueOpts[job.Queue] = attr
			klog.V(4).Infof("Added Queue <%s> attributes.", job.Queue)
		}

		attr := cp.queueOpts[job.Queue]
		// Tasks in an allocated status count towards both allocated and request;
		// pending tasks count towards request only.
		for status, tasks := range job.TaskStatusIndex {
			if api.AllocatedStatus(status) {
				for _, t := range tasks {
					attr.allocated.Add(t.Resreq)
					attr.request.Add(t.Resreq)
				}
			} else if status == api.Pending {
				for _, t := range tasks {
					attr.request.Add(t.Resreq)
				}
			}
		}

		// A job whose pod group is already Inqueue reserves its minimum resources.
		if job.PodGroup.Status.Phase == scheduling.PodGroupInqueue {
			attr.inqueue.Add(job.GetMinResources())
		}

		// calculate inqueue resource for running jobs
		// the judgement 'job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember' will work on cases such as the following condition:
		// Considering a Spark job is completed(driver pod is completed) while the podgroup keeps running, the allocated resource will be reserved again if without the judgement.
		if job.PodGroup.Status.Phase == scheduling.PodGroupRunning &&
			job.PodGroup.Spec.MinResources != nil &&
			int32(util.CalculateAllocatedTaskNum(job)) >= job.PodGroup.Spec.MinMember {
			inqueued := util.GetInqueueResource(job, job.Allocated)
			attr.inqueue.Add(inqueued)
		}
		attr.elastic.Add(job.GetElasticResources())
		klog.V(5).Infof("Queue %s allocated <%s> request <%s> inqueue <%s> elastic <%s>",
			attr.name, attr.allocated.String(), attr.request.String(), attr.inqueue.String(), attr.elastic.String())
	}

	// Finalize deserved for every queue that has attributes: limit it by
	// realCapability and by request, then raise it to at least the guarantee,
	// and compute the initial share from the result.
	for _, attr := range cp.queueOpts {
		if attr.realCapability != nil {
			attr.deserved.MinDimensionResource(attr.realCapability, api.Infinity)
		}
		// When scalar resource not specified in deserved such as "pods", we should skip it and consider deserved resource as infinity.
		attr.deserved.MinDimensionResource(attr.request, api.Infinity)

		attr.deserved = helpers.Max(attr.deserved, attr.guarantee)
		cp.updateShare(attr)
		klog.V(4).Infof("The attributes of queue <%s> in capacity: deserved <%v>, realCapability <%v>, allocate <%v>, request <%v>, elastic <%v>, share <%0.2f>",
			attr.name, attr.deserved, attr.realCapability, attr.allocated, attr.request, attr.elastic, attr.share)
	}

	// Record metrics
	// Queues with attributes publish the computed values and the pod group
	// counts from the queue status; queues without attributes (no jobs) publish
	// their declared deserved CPU/memory and zero for everything else.
	for queueID, queueInfo := range ssn.Queues {
		// NOTE(review): queue is the same map entry as queueInfo; the second lookup looks redundant.
		queue := ssn.Queues[queueID]
		if attr, ok := cp.queueOpts[queueID]; ok {
			metrics.UpdateQueueDeserved(attr.name, attr.deserved.MilliCPU, attr.deserved.Memory)
			metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory)
			metrics.UpdateQueueRequest(attr.name, attr.request.MilliCPU, attr.request.Memory)
			metrics.UpdateQueuePodGroupInqueueCount(attr.name, queue.Queue.Status.Inqueue)
			metrics.UpdateQueuePodGroupPendingCount(attr.name, queue.Queue.Status.Pending)
			metrics.UpdateQueuePodGroupRunningCount(attr.name, queue.Queue.Status.Running)
			metrics.UpdateQueuePodGroupUnknownCount(attr.name, queue.Queue.Status.Unknown)
			continue
		}
		deservedCPU, deservedMem := 0.0, 0.0
		if queue.Queue.Spec.Deserved != nil {
			deservedCPU = float64(queue.Queue.Spec.Deserved.Cpu().MilliValue())
			deservedMem = float64(queue.Queue.Spec.Deserved.Memory().Value())
		}
		metrics.UpdateQueueDeserved(queueInfo.Name, deservedCPU, deservedMem)
		metrics.UpdateQueueAllocated(queueInfo.Name, 0, 0)
		metrics.UpdateQueueRequest(queueInfo.Name, 0, 0)
		metrics.UpdateQueuePodGroupInqueueCount(queueInfo.Name, 0)
		metrics.UpdateQueuePodGroupPendingCount(queueInfo.Name, 0)
		metrics.UpdateQueuePodGroupRunningCount(queueInfo.Name, 0)
		metrics.UpdateQueuePodGroupUnknownCount(queueInfo.Name, 0)
	}

	// Queue order: compare queues by share; the queue with the smaller share
	// compares as less (-1), equal shares compare as 0.
	// NOTE(review): both queues are assumed to have an entry in cp.queueOpts;
	// a queue without jobs has none and would dereference nil here.
	ssn.AddQueueOrderFn(cp.Name(), func(l, r interface{}) int {
		lv := l.(*api.QueueInfo)
		rv := r.(*api.QueueInfo)

		if cp.queueOpts[lv.UID].share == cp.queueOpts[rv.UID].share {
			return 0
		}

		if cp.queueOpts[lv.UID].share < cp.queueOpts[rv.UID].share {
			return -1
		}

		return 1
	})

	// Reclaimable: choose, among reclaimees, the tasks that may be evicted for reclaimer.
	ssn.AddReclaimableFn(cp.Name(), func(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) ([]*api.TaskInfo, int) {
		var victims []*api.TaskInfo
		// Working copy of each queue's allocated resource, so that victims
		// already chosen in this call are subtracted before the next check
		// without touching the real attributes.
		allocations := map[api.QueueID]*api.Resource{}

		for _, reclaimee := range reclaimees {
			job := ssn.Jobs[reclaimee.Job]
			attr := cp.queueOpts[job.Queue]

			if _, found := allocations[job.Queue]; !found {
				allocations[job.Queue] = attr.allocated.Clone()
			}
			allocated := allocations[job.Queue]
			// NOTE(review): the message names the reclaimee, although the
			// comparison is against the reclaimer's request — confirm intent.
			if allocated.LessPartly(reclaimer.Resreq, api.Zero) {
				klog.V(3).Infof("Failed to allocate resource for Task <%s/%s> in Queue <%s>, not enough resource.",
					reclaimee.Namespace, reclaimee.Name, job.Queue)
				continue
			}

			// What the queue would still hold if this reclaimee were evicted.
			exceptReclaimee := allocated.Clone().Sub(reclaimee.Resreq)
			// When scalar resource not specified in deserved such as "pods", we should skip it and consider it as infinity,
			// so the following first condition will be true and the current queue will not be reclaimed.
			// The reclaimee is also kept when evicting it would leave the queue
			// below its guarantee.
			if allocated.LessEqual(attr.deserved, api.Infinity) || !attr.guarantee.LessEqual(exceptReclaimee, api.Zero) {
				continue
			}
			allocated.Sub(reclaimee.Resreq)
			victims = append(victims, reclaimee)
		}
		klog.V(4).InfoS("Victims from capacity plugin", "victims", victims, "reclaimer", reclaimer)
		return victims, util.Permit
	})

	// Preemptive: a queue may reclaim only while it is not overused, i.e. while
	// deserved.LessEqual(allocated) does not hold. The overused flag is also
	// published as a metric.
	ssn.AddPreemptiveFn(cp.Name(), func(obj interface{}) bool {
		queue := obj.(*api.QueueInfo)
		attr := cp.queueOpts[queue.UID]

		overused := attr.deserved.LessEqual(attr.allocated, api.Zero)
		metrics.UpdateQueueOverused(attr.name, overused)
		if overused {
			klog.V(3).Infof("Queue <%v> can not reclaim, deserved <%v>, allocated <%v>, share <%v>",
				queue.Name, attr.deserved, attr.allocated, attr.share)
		}

		return !overused
	})

	// Allocatable: the candidate fits when its request is within what remains
	// of realCapability after the queue's current allocation.
	ssn.AddAllocatableFn(cp.Name(), func(queue *api.QueueInfo, candidate *api.TaskInfo) bool {
		attr := cp.queueOpts[queue.UID]

		// Only the first result of Diff is used; presumably it is the part of
		// realCapability that exceeds allocated — confirm against api.Resource.Diff.
		free, _ := attr.realCapability.Diff(attr.allocated, api.Zero)
		allocatable := candidate.Resreq.LessEqual(free, api.Zero)
		if !allocatable {
			klog.V(3).Infof("Queue <%v>: realCapability <%v>, allocated <%v>; Candidate <%v>: resource request <%v>",
				queue.Name, attr.realCapability, attr.allocated, candidate.Name, candidate.Resreq)
		}

		return allocatable
	})

	// JobEnqueueable: permit a job when its minimum resources still fit within
	// the queue's realCapability, otherwise reject it and record a pod group event.
	ssn.AddJobEnqueueableFn(cp.Name(), func(obj interface{}) int {
		job := obj.(*api.JobInfo)
		queueID := job.Queue
		attr := cp.queueOpts[queueID]
		queue := ssn.Queues[queueID]
		// If no capability is set, always enqueue the job.
		// NOTE(review): realCapability is assigned for every attr created in
		// OnSessionOpen, so this branch looks unreachable — confirm.
		if attr.realCapability == nil {
			klog.V(4).Infof("Capability of queue <%s> was not set, allow job <%s/%s> to Inqueue.",
				queue.Name, job.Namespace, job.Name)
			return util.Permit
		}

		if job.PodGroup.Spec.MinResources == nil {
			klog.V(4).Infof("job %s MinResources is null.", job.Name)
			return util.Permit
		}
		minReq := job.GetMinResources()

		klog.V(5).Infof("job %s min resource <%s>, queue %s capability <%s> allocated <%s> inqueue <%s> elastic <%s>",
			job.Name, minReq.String(), queue.Name, attr.realCapability.String(), attr.allocated.String(), attr.inqueue.String(), attr.elastic.String())
		// The queue resource quota limit has not reached
		// r = job minimum + allocated + inqueue - elastic. Add and Sub modify
		// their receiver, so minReq no longer holds the job's minimum after this line.
		r := minReq.Add(attr.allocated).Add(attr.inqueue).Sub(attr.elastic)
		rr := attr.realCapability.Clone()

		// Drop from the capability copy every scalar resource that r does not
		// carry, so that the comparison below only looks at scalars present in r.
		for name := range rr.ScalarResources {
			if _, ok := r.ScalarResources[name]; !ok {
				delete(rr.ScalarResources, name)
			}
		}

		inqueue := r.LessEqual(rr, api.Infinity)
		klog.V(5).Infof("job %s inqueue %v", job.Name, inqueue)
		if inqueue {
			// Reserve the job's minimum resources so later jobs evaluated in
			// this session see them; GetMinResources is called again because
			// minReq was modified above.
			attr.inqueue.Add(job.GetMinResources())
			return util.Permit
		}
		ssn.RecordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupUnschedulableType), "queue resource quota insufficient")
		return util.Reject
	})

	// Register event handlers.
	// They keep the queue's allocated resource, its allocated metric and its
	// share in step with every task allocation and deallocation.
	ssn.AddEventHandler(&framework.EventHandler{
		AllocateFunc: func(event *framework.Event) {
			job := ssn.Jobs[event.Task.Job]
			attr := cp.queueOpts[job.Queue]
			attr.allocated.Add(event.Task.Resreq)
			metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory)

			cp.updateShare(attr)

			klog.V(4).Infof("Capacity AllocateFunc: task <%v/%v>, resreq <%v>,  share <%v>",
				event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share)
		},
		DeallocateFunc: func(event *framework.Event) {
			job := ssn.Jobs[event.Task.Job]
			attr := cp.queueOpts[job.Queue]
			attr.allocated.Sub(event.Task.Resreq)
			metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory)

			cp.updateShare(attr)

			// NOTE(review): the log label says "EvictFunc" although this is the
			// DeallocateFunc handler.
			klog.V(4).Infof("Capacity EvictFunc: task <%v/%v>, resreq <%v>,  share <%v>",
				event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share)
		},
	})
}
   341  
   342  func (cp *capacityPlugin) OnSessionClose(ssn *framework.Session) {
   343  	cp.totalResource = nil
   344  	cp.totalGuarantee = nil
   345  	cp.queueOpts = nil
   346  }
   347  
   348  func (cp *capacityPlugin) updateShare(attr *queueAttr) {
   349  	res := float64(0)
   350  
   351  	for _, rn := range attr.deserved.ResourceNames() {
   352  		share := helpers.Share(attr.allocated.Get(rn), attr.deserved.Get(rn))
   353  		if share > res {
   354  			res = share
   355  		}
   356  	}
   357  
   358  	attr.share = res
   359  	metrics.UpdateQueueShare(attr.name, attr.share)
   360  }