volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/drf/drf.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package drf
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"strconv"
    23  	"strings"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/klog/v2"
    27  
    28  	"volcano.sh/volcano/pkg/scheduler/api"
    29  	"volcano.sh/volcano/pkg/scheduler/api/helpers"
    30  	"volcano.sh/volcano/pkg/scheduler/framework"
    31  	"volcano.sh/volcano/pkg/scheduler/metrics"
    32  	"volcano.sh/volcano/pkg/scheduler/plugins/util"
    33  )
    34  
// PluginName indicates name of volcano scheduler plugin.
const PluginName = "drf"

// shareDelta is the tolerance used when comparing two dominant shares:
// shares whose absolute difference is within shareDelta are treated as equal
// by the preemption callback registered in OnSessionOpen.
var shareDelta = 0.000001
    39  
// hierarchicalNode represents the node hierarchy
// and the corresponding weight and drf attribute
type hierarchicalNode struct {
	// parent is the node one level up in the tree.
	parent *hierarchicalNode
	// attr holds the allocated resource and the dominant share of this node.
	attr *drfAttr
	// If the node is a leaf node,
	// request represents the request of the job.
	request *api.Resource
	// weight divides the share when two nodes are compared in compareQueues.
	weight float64
	// saturated is computed by resourceSaturated for a leaf; an inner node is
	// saturated only when all of its children are.
	saturated bool
	// hierarchy is the name of this node: a segment of the queue hierarchy
	// path for inner nodes, the job UID for leaves.
	hierarchy string
	// children maps a child's name to the child node; it is nil for leaves.
	children map[string]*hierarchicalNode
}
    53  
    54  func (node *hierarchicalNode) Clone(parent *hierarchicalNode) *hierarchicalNode {
    55  	newNode := &hierarchicalNode{
    56  		parent: parent,
    57  		attr: &drfAttr{
    58  			share:            node.attr.share,
    59  			dominantResource: node.attr.dominantResource,
    60  			allocated:        node.attr.allocated.Clone(),
    61  		},
    62  		request:   node.request.Clone(),
    63  		weight:    node.weight,
    64  		saturated: node.saturated,
    65  		hierarchy: node.hierarchy,
    66  		children:  nil,
    67  	}
    68  	if node.children != nil {
    69  		newNode.children = map[string]*hierarchicalNode{}
    70  		for _, child := range node.children {
    71  			newNode.children[child.hierarchy] = child.Clone(newNode)
    72  		}
    73  	}
    74  	return newNode
    75  }
    76  
    77  // resourceSaturated returns true if any resource of the job is saturated or the job demands fully allocated resource
    78  func resourceSaturated(allocated *api.Resource,
    79  	jobRequest *api.Resource, demandingResources map[v1.ResourceName]bool) bool {
    80  	for _, rn := range allocated.ResourceNames() {
    81  		if allocated.Get(rn) != 0 && jobRequest.Get(rn) != 0 &&
    82  			allocated.Get(rn) >= jobRequest.Get(rn) {
    83  			return true
    84  		}
    85  		if !demandingResources[rn] && jobRequest.Get(rn) != 0 {
    86  			return true
    87  		}
    88  	}
    89  	return false
    90  }
    91  
// drfAttr carries the DRF state of one entity (a job or a hierarchy node).
type drfAttr struct {
	// share is the largest per-resource share of allocated against the total
	// resource, as computed by calculateShare.
	share float64
	// dominantResource is the name of the resource that produced share.
	dominantResource string
	// allocated is the resource currently accounted to the entity.
	allocated *api.Resource
}
    97  
    98  func (attr *drfAttr) String() string {
    99  	return fmt.Sprintf("dominant resource <%s>, dominant share %f, allocated %s",
   100  		attr.dominantResource, attr.share, attr.allocated)
   101  }
   102  
// drfPlugin holds the per-session state of the DRF scheduler plugin.
type drfPlugin struct {
	// totalResource is the total resource of the session; shares are computed
	// against it.
	totalResource *api.Resource
	// totalAllocated is the sum of job allocations; it is only maintained when
	// hierarchy is enabled.
	totalAllocated *api.Resource

	// Key is Job ID
	jobAttrs map[api.JobID]*drfAttr

	// map[namespaceName]->attr
	// NOTE(review): initialised in New; no other use is visible in this file.
	namespaceOpts map[string]*drfAttr

	// hierarchical tree root
	hierarchicalRoot *hierarchicalNode

	// Arguments given for the plugin
	pluginArguments framework.Arguments
}
   119  
   120  // New return drf plugin
   121  func New(arguments framework.Arguments) framework.Plugin {
   122  	return &drfPlugin{
   123  		totalResource:  api.EmptyResource(),
   124  		totalAllocated: api.EmptyResource(),
   125  		jobAttrs:       map[api.JobID]*drfAttr{},
   126  		namespaceOpts:  map[string]*drfAttr{},
   127  		hierarchicalRoot: &hierarchicalNode{
   128  			attr:      &drfAttr{allocated: api.EmptyResource()},
   129  			request:   api.EmptyResource(),
   130  			hierarchy: "root",
   131  			weight:    1,
   132  			children:  map[string]*hierarchicalNode{},
   133  		},
   134  		pluginArguments: arguments,
   135  	}
   136  }
   137  
// Name returns the name of the plugin, PluginName.
func (drf *drfPlugin) Name() string {
	return PluginName
}
   141  
   142  // HierarchyEnabled returns if hierarchy is enabled
   143  func (drf *drfPlugin) HierarchyEnabled(ssn *framework.Session) bool {
   144  	for _, tier := range ssn.Tiers {
   145  		for _, plugin := range tier.Plugins {
   146  			if plugin.Name != PluginName {
   147  				continue
   148  			}
   149  			return plugin.EnabledHierarchy != nil && *plugin.EnabledHierarchy
   150  		}
   151  	}
   152  	return false
   153  }
   154  
   155  func (drf *drfPlugin) compareQueues(root *hierarchicalNode, lqueue *api.QueueInfo, rqueue *api.QueueInfo) float64 {
   156  	lnode := root
   157  	lpaths := strings.Split(lqueue.Hierarchy, "/")
   158  	rnode := root
   159  	rpaths := strings.Split(rqueue.Hierarchy, "/")
   160  	depth := 0
   161  	if len(lpaths) < len(rpaths) {
   162  		depth = len(lpaths)
   163  	} else {
   164  		depth = len(rpaths)
   165  	}
   166  	for i := 0; i < depth; i++ {
   167  		// Saturated nodes have minumun prioirty,
   168  		// so that demanding nodes will be poped first.
   169  		if !lnode.saturated && rnode.saturated {
   170  			return -1
   171  		}
   172  		if lnode.saturated && !rnode.saturated {
   173  			return 1
   174  		}
   175  		if lnode.attr.share/lnode.weight == rnode.attr.share/rnode.weight {
   176  			if i < depth-1 {
   177  				lnode = lnode.children[lpaths[i+1]]
   178  				rnode = rnode.children[rpaths[i+1]]
   179  			}
   180  		} else {
   181  			return lnode.attr.share/lnode.weight - rnode.attr.share/rnode.weight
   182  		}
   183  	}
   184  	return 0
   185  }
   186  
   187  func (drf *drfPlugin) OnSessionOpen(ssn *framework.Session) {
   188  	// Prepare scheduling data for this session.
   189  	drf.totalResource.Add(ssn.TotalResource)
   190  
   191  	klog.V(4).Infof("Total Allocatable %s", drf.totalResource)
   192  
   193  	hierarchyEnabled := drf.HierarchyEnabled(ssn)
   194  
   195  	for _, job := range ssn.Jobs {
   196  		attr := &drfAttr{
   197  			allocated: api.EmptyResource(),
   198  		}
   199  
   200  		for status, tasks := range job.TaskStatusIndex {
   201  			if api.AllocatedStatus(status) {
   202  				for _, t := range tasks {
   203  					attr.allocated.Add(t.Resreq)
   204  				}
   205  			}
   206  		}
   207  
   208  		// Calculate the init share of Job
   209  		drf.updateJobShare(job.Namespace, job.Name, attr)
   210  
   211  		drf.jobAttrs[job.UID] = attr
   212  
   213  		if hierarchyEnabled {
   214  			queue := ssn.Queues[job.Queue]
   215  			drf.totalAllocated.Add(attr.allocated)
   216  			drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
   217  		}
   218  	}
   219  
   220  	preemptableFn := func(preemptor *api.TaskInfo, preemptees []*api.TaskInfo) ([]*api.TaskInfo, int) {
   221  		var victims []*api.TaskInfo
   222  
   223  		addVictim := func(candidate *api.TaskInfo) {
   224  			victims = append(victims, candidate)
   225  		}
   226  
   227  		latt := drf.jobAttrs[preemptor.Job]
   228  		lalloc := latt.allocated.Clone().Add(preemptor.Resreq)
   229  		_, ls := drf.calculateShare(lalloc, drf.totalResource)
   230  
   231  		allocations := map[api.JobID]*api.Resource{}
   232  
   233  		for _, preemptee := range preemptees {
   234  			if _, found := allocations[preemptee.Job]; !found {
   235  				ratt := drf.jobAttrs[preemptee.Job]
   236  				allocations[preemptee.Job] = ratt.allocated.Clone()
   237  			}
   238  			ralloc := allocations[preemptee.Job].Sub(preemptee.Resreq)
   239  			_, rs := drf.calculateShare(ralloc, drf.totalResource)
   240  
   241  			if ls < rs || math.Abs(ls-rs) <= shareDelta {
   242  				addVictim(preemptee)
   243  			}
   244  		}
   245  
   246  		klog.V(4).Infof("Victims from DRF plugins are %+v", victims)
   247  
   248  		return victims, util.Permit
   249  	}
   250  
   251  	ssn.AddPreemptableFn(drf.Name(), preemptableFn)
   252  
   253  	if hierarchyEnabled {
   254  		queueOrderFn := func(l interface{}, r interface{}) int {
   255  			lv := l.(*api.QueueInfo)
   256  			rv := r.(*api.QueueInfo)
   257  			ret := drf.compareQueues(drf.hierarchicalRoot, lv, rv)
   258  			if ret < 0 {
   259  				return -1
   260  			}
   261  			if ret > 0 {
   262  				return 1
   263  			}
   264  			return 0
   265  		}
   266  		ssn.AddQueueOrderFn(drf.Name(), queueOrderFn)
   267  
   268  		reclaimFn := func(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) ([]*api.TaskInfo, int) {
   269  			var victims []*api.TaskInfo
   270  			// clone hdrf tree
   271  			totalAllocated := drf.totalAllocated.Clone()
   272  			root := drf.hierarchicalRoot.Clone(nil)
   273  
   274  			//  update reclaimer hdrf
   275  			ljob := ssn.Jobs[reclaimer.Job]
   276  			lqueue := ssn.Queues[ljob.Queue]
   277  			ljob = ljob.Clone()
   278  			attr := drf.jobAttrs[ljob.UID]
   279  			lattr := &drfAttr{
   280  				allocated: attr.allocated.Clone(),
   281  			}
   282  			lattr.allocated.Add(reclaimer.Resreq)
   283  			totalAllocated.Add(reclaimer.Resreq)
   284  			drf.updateShare(lattr)
   285  			drf.UpdateHierarchicalShare(root, totalAllocated, ljob, lattr, lqueue.Hierarchy, lqueue.Weights)
   286  
   287  			for _, preemptee := range reclaimees {
   288  				rjob := ssn.Jobs[preemptee.Job]
   289  				rqueue := ssn.Queues[rjob.Queue]
   290  
   291  				// update hdrf of reclaimee job
   292  				totalAllocated.Sub(preemptee.Resreq)
   293  				rjob = rjob.Clone()
   294  				attr := drf.jobAttrs[rjob.UID]
   295  				rattr := &drfAttr{
   296  					allocated: attr.allocated.Clone(),
   297  				}
   298  				rattr.allocated.Sub(preemptee.Resreq)
   299  				drf.updateShare(rattr)
   300  				drf.UpdateHierarchicalShare(root, totalAllocated, rjob, rattr, rqueue.Hierarchy, rqueue.Weights)
   301  
   302  				// compare hdrf of queues
   303  				ret := drf.compareQueues(root, lqueue, rqueue)
   304  
   305  				// resume hdrf of reclaimee job
   306  				totalAllocated.Add(preemptee.Resreq)
   307  				rattr.allocated.Add(preemptee.Resreq)
   308  				drf.updateShare(rattr)
   309  				drf.UpdateHierarchicalShare(root, totalAllocated, rjob, rattr, rqueue.Hierarchy, rqueue.Weights)
   310  
   311  				if ret < 0 {
   312  					victims = append(victims, preemptee)
   313  				}
   314  
   315  				if ret > shareDelta {
   316  					continue
   317  				}
   318  			}
   319  
   320  			klog.V(4).Infof("Victims from HDRF plugins are %+v", victims)
   321  
   322  			return victims, util.Permit
   323  		}
   324  		ssn.AddReclaimableFn(drf.Name(), reclaimFn)
   325  	}
   326  
   327  	jobOrderFn := func(l interface{}, r interface{}) int {
   328  		lv := l.(*api.JobInfo)
   329  		rv := r.(*api.JobInfo)
   330  
   331  		klog.V(4).Infof("DRF JobOrderFn: <%v/%v> share state: %v, <%v/%v> share state: %v",
   332  			lv.Namespace, lv.Name, drf.jobAttrs[lv.UID].share, rv.Namespace, rv.Name, drf.jobAttrs[rv.UID].share)
   333  
   334  		if drf.jobAttrs[lv.UID].share == drf.jobAttrs[rv.UID].share {
   335  			return 0
   336  		}
   337  
   338  		if drf.jobAttrs[lv.UID].share < drf.jobAttrs[rv.UID].share {
   339  			return -1
   340  		}
   341  
   342  		return 1
   343  	}
   344  
   345  	ssn.AddJobOrderFn(drf.Name(), jobOrderFn)
   346  
   347  	// Register event handlers.
   348  	ssn.AddEventHandler(&framework.EventHandler{
   349  		AllocateFunc: func(event *framework.Event) {
   350  			attr := drf.jobAttrs[event.Task.Job]
   351  			attr.allocated.Add(event.Task.Resreq)
   352  
   353  			job := ssn.Jobs[event.Task.Job]
   354  			drf.updateJobShare(job.Namespace, job.Name, attr)
   355  
   356  			nsShare := -1.0
   357  			if hierarchyEnabled {
   358  				queue := ssn.Queues[job.Queue]
   359  
   360  				drf.totalAllocated.Add(event.Task.Resreq)
   361  				drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
   362  			}
   363  
   364  			klog.V(4).Infof("DRF AllocateFunc: task <%v/%v>, resreq <%v>,  share <%v>, namespace share <%v>",
   365  				event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share, nsShare)
   366  		},
   367  		DeallocateFunc: func(event *framework.Event) {
   368  			attr := drf.jobAttrs[event.Task.Job]
   369  			attr.allocated.Sub(event.Task.Resreq)
   370  
   371  			job := ssn.Jobs[event.Task.Job]
   372  			drf.updateJobShare(job.Namespace, job.Name, attr)
   373  
   374  			nsShare := -1.0
   375  
   376  			if hierarchyEnabled {
   377  				queue := ssn.Queues[job.Queue]
   378  				drf.totalAllocated.Sub(event.Task.Resreq)
   379  				drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
   380  			}
   381  
   382  			klog.V(4).Infof("DRF EvictFunc: task <%v/%v>, resreq <%v>,  share <%v>, namespace share <%v>",
   383  				event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share, nsShare)
   384  		},
   385  	})
   386  }
   387  
   388  // build hierarchy if the node does not exist
   389  func (drf *drfPlugin) buildHierarchy(root *hierarchicalNode, job *api.JobInfo, attr *drfAttr,
   390  	hierarchy, hierarchicalWeights string) {
   391  	inode := root
   392  	paths := strings.Split(hierarchy, "/")
   393  	weights := strings.Split(hierarchicalWeights, "/")
   394  
   395  	for i := 1; i < len(paths); i++ {
   396  		if child, ok := inode.children[paths[i]]; ok {
   397  			inode = child
   398  		} else {
   399  			fweight, _ := strconv.ParseFloat(weights[i], 64)
   400  			if fweight < 1 {
   401  				fweight = 1
   402  			}
   403  			child = &hierarchicalNode{
   404  				weight:    fweight,
   405  				hierarchy: paths[i],
   406  				request:   api.EmptyResource(),
   407  				attr: &drfAttr{
   408  					allocated: api.EmptyResource(),
   409  				},
   410  				children: make(map[string]*hierarchicalNode),
   411  			}
   412  			klog.V(4).Infof("Node %s added to %s, weight %f",
   413  				child.hierarchy, inode.hierarchy, fweight)
   414  			inode.children[paths[i]] = child
   415  			child.parent = inode
   416  			inode = child
   417  		}
   418  	}
   419  
   420  	child := &hierarchicalNode{
   421  		weight:    1,
   422  		attr:      attr,
   423  		hierarchy: string(job.UID),
   424  		request:   job.TotalRequest.Clone(),
   425  		children:  nil,
   426  	}
   427  	inode.children[string(job.UID)] = child
   428  	// update drf attribute bottom up
   429  	klog.V(4).Infof("Job <%s/%s> added to %s, weights %s, attr %v, total request: %s",
   430  		job.Namespace, job.Name, inode.hierarchy, hierarchicalWeights, child.attr, job.TotalRequest)
   431  }
   432  
   433  // updateHierarchicalShare updates the node attribute recursively
   434  func (drf *drfPlugin) updateHierarchicalShare(node *hierarchicalNode,
   435  	demandingResources map[v1.ResourceName]bool) {
   436  	if node.children == nil {
   437  		node.saturated = resourceSaturated(node.attr.allocated,
   438  			node.request, demandingResources)
   439  		klog.V(4).Infof("Update hierarchical node %s, share %f, dominant %s, resource %v, saturated: %t",
   440  			node.hierarchy, node.attr.share, node.attr.dominantResource, node.attr.allocated, node.saturated)
   441  	} else {
   442  		var mdr float64 = 1
   443  		// get minimun dominant resource share
   444  		for _, child := range node.children {
   445  			drf.updateHierarchicalShare(child, demandingResources)
   446  			// skip empty child and saturated child
   447  			if child.attr.share != 0 && !child.saturated {
   448  				_, resShare := drf.calculateShare(child.attr.allocated, drf.totalResource)
   449  				if resShare < mdr {
   450  					mdr = resShare
   451  				}
   452  			}
   453  		}
   454  
   455  		node.attr.allocated = api.EmptyResource()
   456  		saturated := true
   457  		for _, child := range node.children {
   458  			if !child.saturated {
   459  				saturated = false
   460  			}
   461  			// only consider non-empty children
   462  			if child.attr.share != 0 {
   463  				// saturated child is not scaled
   464  				if child.saturated {
   465  					t := child.attr.allocated
   466  					node.attr.allocated.Add(t)
   467  				} else {
   468  					t := child.attr.allocated.Clone().Multi(mdr / child.attr.share)
   469  					node.attr.allocated.Add(t)
   470  				}
   471  			}
   472  		}
   473  		node.attr.dominantResource, node.attr.share = drf.calculateShare(
   474  			node.attr.allocated, drf.totalResource)
   475  		node.saturated = saturated
   476  		klog.V(4).Infof("Update hierarchical node %s, share %f, dominant resource %s, resource %v, saturated: %t",
   477  			node.hierarchy, node.attr.share, node.attr.dominantResource, node.attr.allocated, node.saturated)
   478  	}
   479  }
   480  
   481  func (drf *drfPlugin) UpdateHierarchicalShare(root *hierarchicalNode, totalAllocated *api.Resource, job *api.JobInfo, attr *drfAttr, hierarchy, hierarchicalWeights string) {
   482  	// filter out demanding resources
   483  	demandingResources := map[v1.ResourceName]bool{}
   484  	for _, rn := range drf.totalResource.ResourceNames() {
   485  		if totalAllocated.Get(rn) < drf.totalResource.Get(rn) {
   486  			demandingResources[rn] = true
   487  		}
   488  	}
   489  	drf.buildHierarchy(root, job, attr, hierarchy, hierarchicalWeights)
   490  	drf.updateHierarchicalShare(root, demandingResources)
   491  }
   492  
   493  func (drf *drfPlugin) updateJobShare(jobNs, jobName string, attr *drfAttr) {
   494  	drf.updateShare(attr)
   495  	metrics.UpdateJobShare(jobNs, jobName, attr.share)
   496  }
   497  
   498  func (drf *drfPlugin) updateShare(attr *drfAttr) {
   499  	attr.dominantResource, attr.share = drf.calculateShare(attr.allocated, drf.totalResource)
   500  }
   501  
   502  func (drf *drfPlugin) calculateShare(allocated, totalResource *api.Resource) (string, float64) {
   503  	res := float64(0)
   504  	dominantResource := ""
   505  	for _, rn := range totalResource.ResourceNames() {
   506  		share := helpers.Share(allocated.Get(rn), totalResource.Get(rn))
   507  		if share > res {
   508  			res = share
   509  			dominantResource = string(rn)
   510  		}
   511  	}
   512  
   513  	return dominantResource, res
   514  }
   515  
   516  func (drf *drfPlugin) OnSessionClose(session *framework.Session) {
   517  	// Clean schedule data.
   518  	drf.totalResource = api.EmptyResource()
   519  	drf.totalAllocated = api.EmptyResource()
   520  	drf.jobAttrs = map[api.JobID]*drfAttr{}
   521  }