volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/proportion/proportion.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package proportion
    18  
    19  import (
    20  	"math"
    21  	"reflect"
    22  
    23  	v1 "k8s.io/api/core/v1"
    24  	"k8s.io/klog/v2"
    25  
    26  	"volcano.sh/apis/pkg/apis/scheduling"
    27  	"volcano.sh/volcano/pkg/scheduler/api"
    28  	"volcano.sh/volcano/pkg/scheduler/api/helpers"
    29  	"volcano.sh/volcano/pkg/scheduler/framework"
    30  	"volcano.sh/volcano/pkg/scheduler/metrics"
    31  	"volcano.sh/volcano/pkg/scheduler/plugins/util"
    32  )
    33  
    34  // PluginName indicates name of volcano scheduler plugin.
    35  const PluginName = "proportion"
    36  
    37  type proportionPlugin struct {
    38  	totalResource  *api.Resource
    39  	totalGuarantee *api.Resource
    40  	queueOpts      map[api.QueueID]*queueAttr
    41  	// Arguments given for the plugin
    42  	pluginArguments framework.Arguments
    43  }
    44  
    45  type queueAttr struct {
    46  	queueID api.QueueID
    47  	name    string
    48  	weight  int32
    49  	share   float64
    50  
    51  	deserved  *api.Resource
    52  	allocated *api.Resource
    53  	request   *api.Resource
    54  	// elastic represents the sum of job's elastic resource, job's elastic = job.allocated - job.minAvailable
    55  	elastic *api.Resource
    56  	// inqueue represents the resource request of the inqueue job
    57  	inqueue    *api.Resource
    58  	capability *api.Resource
    59  	// realCapability represents the resource limit of the queue, LessEqual capability
    60  	realCapability *api.Resource
    61  	guarantee      *api.Resource
    62  }
    63  
    64  // New return proportion action
    65  func New(arguments framework.Arguments) framework.Plugin {
    66  	return &proportionPlugin{
    67  		totalResource:   api.EmptyResource(),
    68  		totalGuarantee:  api.EmptyResource(),
    69  		queueOpts:       map[api.QueueID]*queueAttr{},
    70  		pluginArguments: arguments,
    71  	}
    72  }
    73  
    74  func (pp *proportionPlugin) Name() string {
    75  	return PluginName
    76  }
    77  
    78  func (pp *proportionPlugin) OnSessionOpen(ssn *framework.Session) {
    79  	// Prepare scheduling data for this session.
    80  	pp.totalResource.Add(ssn.TotalResource)
    81  
    82  	klog.V(4).Infof("The total resource is <%v>", pp.totalResource)
    83  	for _, queue := range ssn.Queues {
    84  		if len(queue.Queue.Spec.Guarantee.Resource) == 0 {
    85  			continue
    86  		}
    87  		guarantee := api.NewResource(queue.Queue.Spec.Guarantee.Resource)
    88  		pp.totalGuarantee.Add(guarantee)
    89  	}
    90  	klog.V(4).Infof("The total guarantee resource is <%v>", pp.totalGuarantee)
    91  	// Build attributes for Queues.
    92  	for _, job := range ssn.Jobs {
    93  		klog.V(4).Infof("Considering Job <%s/%s>.", job.Namespace, job.Name)
    94  		if _, found := pp.queueOpts[job.Queue]; !found {
    95  			queue := ssn.Queues[job.Queue]
    96  			attr := &queueAttr{
    97  				queueID: queue.UID,
    98  				name:    queue.Name,
    99  				weight:  queue.Weight,
   100  
   101  				deserved:  api.EmptyResource(),
   102  				allocated: api.EmptyResource(),
   103  				request:   api.EmptyResource(),
   104  				elastic:   api.EmptyResource(),
   105  				inqueue:   api.EmptyResource(),
   106  				guarantee: api.EmptyResource(),
   107  			}
   108  			if len(queue.Queue.Spec.Capability) != 0 {
   109  				attr.capability = api.NewResource(queue.Queue.Spec.Capability)
   110  				if attr.capability.MilliCPU <= 0 {
   111  					attr.capability.MilliCPU = math.MaxFloat64
   112  				}
   113  				if attr.capability.Memory <= 0 {
   114  					attr.capability.Memory = math.MaxFloat64
   115  				}
   116  			}
   117  			if len(queue.Queue.Spec.Guarantee.Resource) != 0 {
   118  				attr.guarantee = api.NewResource(queue.Queue.Spec.Guarantee.Resource)
   119  			}
   120  			realCapability := pp.totalResource.Clone().Sub(pp.totalGuarantee).Add(attr.guarantee)
   121  			if attr.capability == nil {
   122  				attr.realCapability = realCapability
   123  			} else {
   124  				realCapability.MinDimensionResource(attr.capability, api.Infinity)
   125  				attr.realCapability = realCapability
   126  			}
   127  			pp.queueOpts[job.Queue] = attr
   128  			klog.V(4).Infof("Added Queue <%s> attributes.", job.Queue)
   129  		}
   130  
   131  		attr := pp.queueOpts[job.Queue]
   132  		for status, tasks := range job.TaskStatusIndex {
   133  			if api.AllocatedStatus(status) {
   134  				for _, t := range tasks {
   135  					attr.allocated.Add(t.Resreq)
   136  					attr.request.Add(t.Resreq)
   137  				}
   138  			} else if status == api.Pending {
   139  				for _, t := range tasks {
   140  					attr.request.Add(t.Resreq)
   141  				}
   142  			}
   143  		}
   144  
   145  		if job.PodGroup.Status.Phase == scheduling.PodGroupInqueue {
   146  			attr.inqueue.Add(job.GetMinResources())
   147  		}
   148  
   149  		// calculate inqueue resource for running jobs
   150  		// the judgement 'job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember' will work on cases such as the following condition:
   151  		// Considering a Spark job is completed(driver pod is completed) while the podgroup keeps running, the allocated resource will be reserved again if without the judgement.
   152  		if job.PodGroup.Status.Phase == scheduling.PodGroupRunning &&
   153  			job.PodGroup.Spec.MinResources != nil &&
   154  			int32(util.CalculateAllocatedTaskNum(job)) >= job.PodGroup.Spec.MinMember {
   155  			inqueued := util.GetInqueueResource(job, job.Allocated)
   156  			attr.inqueue.Add(inqueued)
   157  		}
   158  		attr.elastic.Add(job.GetElasticResources())
   159  		klog.V(5).Infof("Queue %s allocated <%s> request <%s> inqueue <%s> elastic <%s>",
   160  			attr.name, attr.allocated.String(), attr.request.String(), attr.inqueue.String(), attr.elastic.String())
   161  	}
   162  
   163  	// Record metrics
   164  	for queueID, queueInfo := range ssn.Queues {
   165  		if attr, ok := pp.queueOpts[queueID]; ok {
   166  			metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory)
   167  			metrics.UpdateQueueRequest(attr.name, attr.request.MilliCPU, attr.request.Memory)
   168  			metrics.UpdateQueueWeight(attr.name, attr.weight)
   169  			queue := ssn.Queues[attr.queueID]
   170  			metrics.UpdateQueuePodGroupInqueueCount(attr.name, queue.Queue.Status.Inqueue)
   171  			metrics.UpdateQueuePodGroupPendingCount(attr.name, queue.Queue.Status.Pending)
   172  			metrics.UpdateQueuePodGroupRunningCount(attr.name, queue.Queue.Status.Running)
   173  			metrics.UpdateQueuePodGroupUnknownCount(attr.name, queue.Queue.Status.Unknown)
   174  			continue
   175  		}
   176  		metrics.UpdateQueueAllocated(queueInfo.Name, 0, 0)
   177  		metrics.UpdateQueueRequest(queueInfo.Name, 0, 0)
   178  		metrics.UpdateQueuePodGroupInqueueCount(queueInfo.Name, 0)
   179  		metrics.UpdateQueuePodGroupPendingCount(queueInfo.Name, 0)
   180  		metrics.UpdateQueuePodGroupRunningCount(queueInfo.Name, 0)
   181  		metrics.UpdateQueuePodGroupUnknownCount(queueInfo.Name, 0)
   182  	}
   183  
   184  	remaining := pp.totalResource.Clone()
   185  	meet := map[api.QueueID]struct{}{}
   186  	for {
   187  		totalWeight := int32(0)
   188  		for _, attr := range pp.queueOpts {
   189  			if _, found := meet[attr.queueID]; found {
   190  				continue
   191  			}
   192  			totalWeight += attr.weight
   193  		}
   194  
   195  		// If no queues, break
   196  		if totalWeight == 0 {
   197  			klog.V(4).Infof("Exiting when total weight is 0")
   198  			break
   199  		}
   200  
   201  		oldRemaining := remaining.Clone()
   202  		// Calculates the deserved of each Queue.
   203  		// increasedDeserved is the increased value for attr.deserved of processed queues
   204  		// decreasedDeserved is the decreased value for attr.deserved of processed queues
   205  		increasedDeserved := api.EmptyResource()
   206  		decreasedDeserved := api.EmptyResource()
   207  		for _, attr := range pp.queueOpts {
   208  			klog.V(4).Infof("Considering Queue <%s>: weight <%d>, total weight <%d>.",
   209  				attr.name, attr.weight, totalWeight)
   210  			if _, found := meet[attr.queueID]; found {
   211  				continue
   212  			}
   213  
   214  			oldDeserved := attr.deserved.Clone()
   215  			attr.deserved.Add(remaining.Clone().Multi(float64(attr.weight) / float64(totalWeight)))
   216  
   217  			if attr.realCapability != nil {
   218  				attr.deserved.MinDimensionResource(attr.realCapability, api.Infinity)
   219  			}
   220  			attr.deserved.MinDimensionResource(attr.request, api.Zero)
   221  
   222  			attr.deserved = helpers.Max(attr.deserved, attr.guarantee)
   223  			pp.updateShare(attr)
   224  			klog.V(4).Infof("Format queue <%s> deserved resource to <%v>", attr.name, attr.deserved)
   225  
   226  			if attr.request.LessEqual(attr.deserved, api.Zero) {
   227  				meet[attr.queueID] = struct{}{}
   228  				klog.V(4).Infof("queue <%s> is meet", attr.name)
   229  			} else if reflect.DeepEqual(attr.deserved, oldDeserved) {
   230  				meet[attr.queueID] = struct{}{}
   231  				klog.V(4).Infof("queue <%s> is meet cause of the capability", attr.name)
   232  			}
   233  
   234  			klog.V(4).Infof("The attributes of queue <%s> in proportion: deserved <%v>, realCapability <%v>, allocate <%v>, request <%v>, elastic <%v>, share <%0.2f>",
   235  				attr.name, attr.deserved, attr.realCapability, attr.allocated, attr.request, attr.elastic, attr.share)
   236  
   237  			increased, decreased := attr.deserved.Diff(oldDeserved, api.Zero)
   238  			increasedDeserved.Add(increased)
   239  			decreasedDeserved.Add(decreased)
   240  
   241  			// Record metrics
   242  			metrics.UpdateQueueDeserved(attr.name, attr.deserved.MilliCPU, attr.deserved.Memory)
   243  		}
   244  
   245  		remaining.Sub(increasedDeserved).Add(decreasedDeserved)
   246  		klog.V(4).Infof("Remaining resource is  <%s>", remaining)
   247  		if remaining.IsEmpty() || reflect.DeepEqual(remaining, oldRemaining) {
   248  			klog.V(4).Infof("Exiting when remaining is empty or no queue has more resource request:  <%v>", remaining)
   249  			break
   250  		}
   251  	}
   252  
   253  	ssn.AddQueueOrderFn(pp.Name(), func(l, r interface{}) int {
   254  		lv := l.(*api.QueueInfo)
   255  		rv := r.(*api.QueueInfo)
   256  
   257  		if pp.queueOpts[lv.UID].share == pp.queueOpts[rv.UID].share {
   258  			return 0
   259  		}
   260  
   261  		if pp.queueOpts[lv.UID].share < pp.queueOpts[rv.UID].share {
   262  			return -1
   263  		}
   264  
   265  		return 1
   266  	})
   267  
   268  	ssn.AddReclaimableFn(pp.Name(), func(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) ([]*api.TaskInfo, int) {
   269  		var victims []*api.TaskInfo
   270  		allocations := map[api.QueueID]*api.Resource{}
   271  
   272  		for _, reclaimee := range reclaimees {
   273  			job := ssn.Jobs[reclaimee.Job]
   274  			attr := pp.queueOpts[job.Queue]
   275  
   276  			if _, found := allocations[job.Queue]; !found {
   277  				allocations[job.Queue] = attr.allocated.Clone()
   278  			}
   279  			allocated := allocations[job.Queue]
   280  			if allocated.LessPartly(reclaimer.Resreq, api.Zero) {
   281  				klog.V(3).Infof("Failed to allocate resource for Task <%s/%s> in Queue <%s>, not enough resource.",
   282  					reclaimee.Namespace, reclaimee.Name, job.Queue)
   283  				continue
   284  			}
   285  
   286  			if !allocated.LessEqual(attr.deserved, api.Zero) {
   287  				allocated.Sub(reclaimee.Resreq)
   288  				victims = append(victims, reclaimee)
   289  			}
   290  		}
   291  		klog.V(4).Infof("Victims from proportion plugins are %+v", victims)
   292  		return victims, util.Permit
   293  	})
   294  
   295  	ssn.AddOverusedFn(pp.Name(), func(obj interface{}) bool {
   296  		queue := obj.(*api.QueueInfo)
   297  		attr := pp.queueOpts[queue.UID]
   298  
   299  		overused := attr.deserved.LessEqual(attr.allocated, api.Zero)
   300  		metrics.UpdateQueueOverused(attr.name, overused)
   301  		if overused {
   302  			klog.V(3).Infof("Queue <%v>: deserved <%v>, allocated <%v>, share <%v>",
   303  				queue.Name, attr.deserved, attr.allocated, attr.share)
   304  		}
   305  
   306  		return overused
   307  	})
   308  
   309  	ssn.AddAllocatableFn(pp.Name(), func(queue *api.QueueInfo, candidate *api.TaskInfo) bool {
   310  		attr := pp.queueOpts[queue.UID]
   311  
   312  		free, _ := attr.deserved.Diff(attr.allocated, api.Zero)
   313  		allocatable := candidate.Resreq.LessEqual(free, api.Zero)
   314  		if !allocatable {
   315  			klog.V(3).Infof("Queue <%v>: deserved <%v>, allocated <%v>; Candidate <%v>: resource request <%v>",
   316  				queue.Name, attr.deserved, attr.allocated, candidate.Name, candidate.Resreq)
   317  		}
   318  
   319  		return allocatable
   320  	})
   321  
   322  	ssn.AddJobEnqueueableFn(pp.Name(), func(obj interface{}) int {
   323  		job := obj.(*api.JobInfo)
   324  		queueID := job.Queue
   325  		attr := pp.queueOpts[queueID]
   326  		queue := ssn.Queues[queueID]
   327  		// If no capability is set, always enqueue the job.
   328  		if attr.realCapability == nil {
   329  			klog.V(4).Infof("Capability of queue <%s> was not set, allow job <%s/%s> to Inqueue.",
   330  				queue.Name, job.Namespace, job.Name)
   331  			return util.Permit
   332  		}
   333  
   334  		if job.PodGroup.Spec.MinResources == nil {
   335  			klog.V(4).Infof("job %s MinResources is null.", job.Name)
   336  			return util.Permit
   337  		}
   338  		minReq := job.GetMinResources()
   339  
   340  		klog.V(5).Infof("job %s min resource <%s>, queue %s capability <%s> allocated <%s> inqueue <%s> elastic <%s>",
   341  			job.Name, minReq.String(), queue.Name, attr.realCapability.String(), attr.allocated.String(), attr.inqueue.String(), attr.elastic.String())
   342  		// The queue resource quota limit has not reached
   343  		r := minReq.Add(attr.allocated).Add(attr.inqueue).Sub(attr.elastic)
   344  		rr := attr.realCapability.Clone()
   345  
   346  		for name := range rr.ScalarResources {
   347  			if _, ok := r.ScalarResources[name]; !ok {
   348  				delete(rr.ScalarResources, name)
   349  			}
   350  		}
   351  
   352  		inqueue := r.LessEqual(rr, api.Infinity)
   353  		klog.V(5).Infof("job %s inqueue %v", job.Name, inqueue)
   354  		if inqueue {
   355  			attr.inqueue.Add(job.GetMinResources())
   356  			return util.Permit
   357  		}
   358  		ssn.RecordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupUnschedulableType), "queue resource quota insufficient")
   359  		return util.Reject
   360  	})
   361  
   362  	// Register event handlers.
   363  	ssn.AddEventHandler(&framework.EventHandler{
   364  		AllocateFunc: func(event *framework.Event) {
   365  			job := ssn.Jobs[event.Task.Job]
   366  			attr := pp.queueOpts[job.Queue]
   367  			attr.allocated.Add(event.Task.Resreq)
   368  			metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory)
   369  
   370  			pp.updateShare(attr)
   371  
   372  			klog.V(4).Infof("Proportion AllocateFunc: task <%v/%v>, resreq <%v>,  share <%v>",
   373  				event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share)
   374  		},
   375  		DeallocateFunc: func(event *framework.Event) {
   376  			job := ssn.Jobs[event.Task.Job]
   377  			attr := pp.queueOpts[job.Queue]
   378  			attr.allocated.Sub(event.Task.Resreq)
   379  			metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory)
   380  
   381  			pp.updateShare(attr)
   382  
   383  			klog.V(4).Infof("Proportion EvictFunc: task <%v/%v>, resreq <%v>,  share <%v>",
   384  				event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share)
   385  		},
   386  	})
   387  }
   388  
   389  func (pp *proportionPlugin) OnSessionClose(ssn *framework.Session) {
   390  	pp.totalResource = nil
   391  	pp.totalGuarantee = nil
   392  	pp.queueOpts = nil
   393  }
   394  
   395  func (pp *proportionPlugin) updateShare(attr *queueAttr) {
   396  	res := float64(0)
   397  
   398  	// TODO(k82cn): how to handle fragment issues?
   399  	for _, rn := range attr.deserved.ResourceNames() {
   400  		share := helpers.Share(attr.allocated.Get(rn), attr.deserved.Get(rn))
   401  		if share > res {
   402  			res = share
   403  		}
   404  	}
   405  
   406  	attr.share = res
   407  	metrics.UpdateQueueShare(attr.name, attr.share)
   408  }