volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/numaaware/numaaware.go (about)

     1  /*
     2  Copyright 2021 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package numaaware
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  
    24  	v1 "k8s.io/api/core/v1"
    25  	"k8s.io/client-go/util/workqueue"
    26  	"k8s.io/klog/v2"
    27  	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    28  	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
    29  	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
    30  	"k8s.io/utils/cpuset"
    31  
    32  	nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1"
    33  
    34  	"volcano.sh/volcano/pkg/scheduler/api"
    35  	"volcano.sh/volcano/pkg/scheduler/framework"
    36  	"volcano.sh/volcano/pkg/scheduler/plugins/numaaware/policy"
    37  	"volcano.sh/volcano/pkg/scheduler/plugins/numaaware/provider/cpumanager"
    38  	"volcano.sh/volcano/pkg/scheduler/plugins/util"
    39  )
    40  
const (
	// PluginName is the identifier of this scheduler plugin. It is what
	// Name() returns and the key under which the predicate and batch
	// node-order callbacks are registered with the session.
	PluginName = "numa-aware"
	// NumaTopoWeight is the plugin-argument key that calculateWeight reads
	// to obtain the integer multiplier applied to this plugin's node scores.
	NumaTopoWeight = "weight"
)
    47  
// numaPlugin holds the per-session state of the numa-aware plugin.
type numaPlugin struct {
	// Mutex is held by the predicate callback while it records a task's
	// tentative assignment in assignRes.
	sync.Mutex
	// Arguments given for the plugin
	pluginArguments framework.Arguments
	// hintProviders are consulted for every container when computing
	// topology hints and allocations; New registers the cpumanager provider.
	hintProviders []policy.HintProvider
	// assignRes records, for each task and each node that passed the
	// predicate, the resource sets the task would receive on that node.
	assignRes map[api.TaskID]map[string]api.ResNumaSets // map[taskUID]map[nodename][resourceName]cpuset.CPUSet
	// nodeResSets is rebuilt in OnSessionOpen from the session's nodes and is
	// updated by the allocate/deallocate event handlers.
	nodeResSets map[string]api.ResNumaSets // map[nodename][resourceName]cpuset.CPUSet
	// taskBindNodeMap maps a task to the node it was allocated on; entries are
	// added on allocate events, removed on deallocate events, and consumed in
	// OnSessionClose.
	taskBindNodeMap map[api.TaskID]string
}
    57  
    58  // New function returns prioritize plugin object.
    59  func New(arguments framework.Arguments) framework.Plugin {
    60  	plugin := &numaPlugin{
    61  		pluginArguments: arguments,
    62  		assignRes:       make(map[api.TaskID]map[string]api.ResNumaSets),
    63  		taskBindNodeMap: make(map[api.TaskID]string),
    64  	}
    65  
    66  	plugin.hintProviders = append(plugin.hintProviders, cpumanager.NewProvider())
    67  	return plugin
    68  }
    69  
// Name returns PluginName, the identifier this plugin uses when it registers
// its callbacks with the session.
func (pp *numaPlugin) Name() string {
	return PluginName
}
    73  
    74  func calculateWeight(args framework.Arguments) int {
    75  	weight := 1
    76  	args.GetInt(&weight, NumaTopoWeight)
    77  	return weight
    78  }
    79  
    80  func (pp *numaPlugin) OnSessionOpen(ssn *framework.Session) {
    81  	weight := calculateWeight(pp.pluginArguments)
    82  	numaNodes := api.GenerateNumaNodes(ssn.Nodes)
    83  	pp.nodeResSets = api.GenerateNodeResNumaSets(ssn.Nodes)
    84  
    85  	ssn.AddEventHandler(&framework.EventHandler{
    86  		AllocateFunc: func(event *framework.Event) {
    87  			node := pp.nodeResSets[event.Task.NodeName]
    88  			if _, ok := pp.assignRes[event.Task.UID]; !ok {
    89  				return
    90  			}
    91  
    92  			resNumaSets, ok := pp.assignRes[event.Task.UID][event.Task.NodeName]
    93  			if !ok {
    94  				return
    95  			}
    96  
    97  			node.Allocate(resNumaSets)
    98  			pp.taskBindNodeMap[event.Task.UID] = event.Task.NodeName
    99  		},
   100  		DeallocateFunc: func(event *framework.Event) {
   101  			node := pp.nodeResSets[event.Task.NodeName]
   102  			if _, ok := pp.assignRes[event.Task.UID]; !ok {
   103  				return
   104  			}
   105  
   106  			resNumaSets, ok := pp.assignRes[event.Task.UID][event.Task.NodeName]
   107  			if !ok {
   108  				return
   109  			}
   110  
   111  			delete(pp.taskBindNodeMap, event.Task.UID)
   112  			node.Release(resNumaSets)
   113  		},
   114  	})
   115  
   116  	predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) {
   117  		predicateStatus := make([]*api.Status, 0)
   118  		numaStatus := &api.Status{}
   119  		if v1qos.GetPodQOS(task.Pod) != v1.PodQOSGuaranteed {
   120  			klog.V(3).Infof("task %s isn't Guaranteed pod", task.Name)
   121  			return predicateStatus, nil
   122  		}
   123  
   124  		if fit, err := filterNodeByPolicy(task, node, pp.nodeResSets); !fit {
   125  			return predicateStatus, err
   126  		}
   127  
   128  		resNumaSets := pp.nodeResSets[node.Name].Clone()
   129  
   130  		taskPolicy := policy.GetPolicy(node, numaNodes[node.Name])
   131  		allResAssignMap := make(map[string]cpuset.CPUSet)
   132  		for _, container := range task.Pod.Spec.Containers {
   133  			providersHints := policy.AccumulateProvidersHints(&container, node.NumaSchedulerInfo, resNumaSets, pp.hintProviders)
   134  			hit, admit := taskPolicy.Predicate(providersHints)
   135  			if !admit {
   136  				numaStatus.Code = api.UnschedulableAndUnresolvable
   137  				numaStatus.Reason = fmt.Sprintf("plugin %s predicates failed for task %s container %s on node %s",
   138  					pp.Name(), task.Name, container.Name, node.Name)
   139  				predicateStatus = append(predicateStatus, numaStatus)
   140  				return predicateStatus, fmt.Errorf("plugin %s predicates failed for task %s container %s on node %s",
   141  					pp.Name(), task.Name, container.Name, node.Name)
   142  			}
   143  
   144  			klog.V(4).Infof("[numaaware] hits for task %s container '%v': %v on node %s, besthit: %v",
   145  				task.Name, container.Name, providersHints, node.Name, hit)
   146  			resAssignMap := policy.Allocate(&container, &hit, node.NumaSchedulerInfo, resNumaSets, pp.hintProviders)
   147  			for resName, assign := range resAssignMap {
   148  				allResAssignMap[resName] = allResAssignMap[resName].Union(assign)
   149  				resNumaSets[resName] = resNumaSets[resName].Difference(assign)
   150  			}
   151  		}
   152  
   153  		pp.Lock()
   154  		defer pp.Unlock()
   155  		if _, ok := pp.assignRes[task.UID]; !ok {
   156  			pp.assignRes[task.UID] = make(map[string]api.ResNumaSets)
   157  		}
   158  
   159  		pp.assignRes[task.UID][node.Name] = allResAssignMap
   160  
   161  		klog.V(4).Infof(" task %s's on node<%s> resAssignMap: %v",
   162  			task.Name, node.Name, pp.assignRes[task.UID][node.Name])
   163  
   164  		numaStatus.Code = api.Success
   165  		predicateStatus = append(predicateStatus, numaStatus)
   166  		return predicateStatus, nil
   167  	}
   168  
   169  	ssn.AddPredicateFn(pp.Name(), predicateFn)
   170  
   171  	batchNodeOrderFn := func(task *api.TaskInfo, nodeInfo []*api.NodeInfo) (map[string]float64, error) {
   172  		nodeScores := make(map[string]float64, len(nodeInfo))
   173  		if task.NumaInfo == nil || task.NumaInfo.Policy == "" || task.NumaInfo.Policy == "none" {
   174  			return nodeScores, nil
   175  		}
   176  
   177  		if _, found := pp.assignRes[task.UID]; !found {
   178  			return nodeScores, nil
   179  		}
   180  
   181  		scoreList := getNodeNumaNumForTask(nodeInfo, pp.assignRes[task.UID])
   182  		util.NormalizeScore(api.DefaultMaxNodeScore, true, scoreList)
   183  
   184  		for idx, scoreNode := range scoreList {
   185  			scoreNode.Score *= int64(weight)
   186  			nodeName := nodeInfo[idx].Name
   187  			nodeScores[nodeName] = float64(scoreNode.Score)
   188  		}
   189  
   190  		klog.V(4).Infof("numa-aware plugin Score for task %s/%s is: %v",
   191  			task.Namespace, task.Name, nodeScores)
   192  		return nodeScores, nil
   193  	}
   194  
   195  	ssn.AddBatchNodeOrderFn(pp.Name(), batchNodeOrderFn)
   196  }
   197  
   198  func filterNodeByPolicy(task *api.TaskInfo, node *api.NodeInfo, nodeResSets map[string]api.ResNumaSets) (fit bool, err error) {
   199  	if !(task.NumaInfo == nil || task.NumaInfo.Policy == "" || task.NumaInfo.Policy == "none") {
   200  		if node.NumaSchedulerInfo == nil {
   201  			return false, fmt.Errorf("numa info is empty")
   202  		}
   203  
   204  		if node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.CPUManagerPolicy] != "static" {
   205  			return false, fmt.Errorf("cpu manager policy isn't static")
   206  		}
   207  
   208  		if task.NumaInfo.Policy != node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy] {
   209  			return false, fmt.Errorf("task topology polocy[%s] is different with node[%s]",
   210  				task.NumaInfo.Policy, node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy])
   211  		}
   212  
   213  		if _, ok := nodeResSets[node.Name]; !ok {
   214  			return false, fmt.Errorf("no topo information")
   215  		}
   216  
   217  		if nodeResSets[node.Name][string(v1.ResourceCPU)].Size() == 0 {
   218  			return false, fmt.Errorf("cpu allocatable map is empty")
   219  		}
   220  	} else {
   221  		if node.NumaSchedulerInfo == nil {
   222  			return false, nil
   223  		}
   224  
   225  		if node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.CPUManagerPolicy] != "static" {
   226  			return false, nil
   227  		}
   228  
   229  		if (node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy] == "none") ||
   230  			(node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy] == "") {
   231  			return false, nil
   232  		}
   233  	}
   234  
   235  	return true, nil
   236  }
   237  
   238  func getNodeNumaNumForTask(nodeInfo []*api.NodeInfo, resAssignMap map[string]api.ResNumaSets) []api.ScoredNode {
   239  	nodeNumaCnts := make([]api.ScoredNode, len(nodeInfo))
   240  	workqueue.ParallelizeUntil(context.TODO(), 16, len(nodeInfo), func(index int) {
   241  		node := nodeInfo[index]
   242  		assignCpus := resAssignMap[node.Name][string(v1.ResourceCPU)]
   243  		nodeNumaCnts[index] = api.ScoredNode{
   244  			NodeName: node.Name,
   245  			Score:    int64(getNumaNodeCntForCPUID(assignCpus, node.NumaSchedulerInfo.CPUDetail)),
   246  		}
   247  	})
   248  
   249  	return nodeNumaCnts
   250  }
   251  
   252  func getNumaNodeCntForCPUID(cpus cpuset.CPUSet, cpuDetails topology.CPUDetails) int {
   253  	mask, _ := bitmask.NewBitMask()
   254  	s := cpus.List()
   255  
   256  	for _, cpuID := range s {
   257  		mask.Add(cpuDetails[cpuID].NUMANodeID)
   258  	}
   259  
   260  	return mask.Count()
   261  }
   262  
   263  func (pp *numaPlugin) OnSessionClose(ssn *framework.Session) {
   264  	if len(pp.taskBindNodeMap) == 0 {
   265  		return
   266  	}
   267  
   268  	allocatedResSet := make(map[string]api.ResNumaSets)
   269  	for taskID, nodeName := range pp.taskBindNodeMap {
   270  		if _, existed := pp.assignRes[taskID]; !existed {
   271  			continue
   272  		}
   273  
   274  		if _, existed := pp.assignRes[taskID][nodeName]; !existed {
   275  			continue
   276  		}
   277  
   278  		if _, existed := allocatedResSet[nodeName]; !existed {
   279  			allocatedResSet[nodeName] = make(api.ResNumaSets)
   280  		}
   281  
   282  		resSet := pp.assignRes[taskID][nodeName]
   283  		for resName, set := range resSet {
   284  			if _, existed := allocatedResSet[nodeName][resName]; !existed {
   285  				allocatedResSet[nodeName][resName] = cpuset.New()
   286  			}
   287  
   288  			allocatedResSet[nodeName][resName] = allocatedResSet[nodeName][resName].Union(set)
   289  		}
   290  	}
   291  
   292  	klog.V(4).Infof("[numaPlugin]allocatedResSet: %v", allocatedResSet)
   293  	ssn.UpdateSchedulerNumaInfo(allocatedResSet)
   294  }