volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/usage/usage_test.go (about)

     1  /*
     2  Copyright 2023 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package usage
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"reflect"
    23  	"testing"
    24  	"time"
    25  
    26  	"github.com/agiledragon/gomonkey/v2"
    27  	v1 "k8s.io/api/core/v1"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/client-go/tools/record"
    30  
    31  	schedulingv1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    32  	"volcano.sh/volcano/pkg/scheduler/api"
    33  	"volcano.sh/volcano/pkg/scheduler/cache"
    34  	"volcano.sh/volcano/pkg/scheduler/conf"
    35  	"volcano.sh/volcano/pkg/scheduler/framework"
    36  	"volcano.sh/volcano/pkg/scheduler/metrics/source"
    37  	"volcano.sh/volcano/pkg/scheduler/util"
    38  )
    39  
    40  const (
    41  	eps = 1e-8
    42  )
    43  
    44  type predicateResult struct {
    45  	predicateStatus []*api.Status
    46  	err             error
    47  }
    48  
    49  func buildNodeUsage(cpuAvgUsage map[string]float64, memAvgUsage map[string]float64, metricsTime time.Time) *api.NodeUsage {
    50  	return &api.NodeUsage{
    51  		MetricsTime: metricsTime,
    52  		CPUUsageAvg: cpuAvgUsage,
    53  		MEMUsageAvg: memAvgUsage,
    54  	}
    55  }
    56  
    57  func updateNodeUsage(nodesInfo map[string]*api.NodeInfo, nodesUsage map[string]*api.NodeUsage) {
    58  	for nodeName, nodeInfo := range nodesInfo {
    59  		if nodeUsage, ok := nodesUsage[nodeName]; ok {
    60  			nodeInfo.ResourceUsage = nodeUsage
    61  		}
    62  	}
    63  }
    64  
    65  func TestUsage_predicateFn(t *testing.T) {
    66  	var tmp *cache.SchedulerCache
    67  	patchUpdateQueueStatus := gomonkey.ApplyMethod(reflect.TypeOf(tmp), "UpdateQueueStatus", func(scCache *cache.SchedulerCache, queue *api.QueueInfo) error {
    68  		return nil
    69  	})
    70  	defer patchUpdateQueueStatus.Reset()
    71  
    72  	framework.RegisterPluginBuilder(PluginName, New)
    73  	defer framework.CleanupPluginBuilders()
    74  
    75  	p1 := util.BuildPod("c1", "p1", "", v1.PodPending, api.BuildResourceList("1", "1Gi"), "pg1", make(map[string]string), make(map[string]string))
    76  	p2 := util.BuildPod("c1", "p2", "", v1.PodPending, api.BuildResourceList("1", "1Gi"), "pg1", make(map[string]string), make(map[string]string))
    77  
    78  	n1 := util.BuildNode("n1", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string))
    79  	n2 := util.BuildNode("n2", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string))
    80  	n3 := util.BuildNode("n3", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string))
    81  	n4 := util.BuildNode("n4", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string))
    82  	n5 := util.BuildNode("n5", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string))
    83  
    84  	nodesUsage := make(map[string]*api.NodeUsage)
    85  	timeNow := time.Now()
    86  	// The CPU load of the node exceeds the upper limit.
    87  	// The node cannot be scheduled.
    88  	nodesUsage[n1.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 81}, map[string]float64{source.NODE_METRICS_PERIOD: 60}, timeNow)
    89  	// The memory load of the node exceeds the upper limit.
    90  	// The node cannot be scheduled.
    91  	nodesUsage[n2.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 60}, map[string]float64{source.NODE_METRICS_PERIOD: 81}, timeNow)
    92  	// The CPU usage and memory usage do not exceed the upper limit.
    93  	// The node can be scheduled.
    94  	nodesUsage[n3.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 80}, map[string]float64{source.NODE_METRICS_PERIOD: 79}, timeNow)
    95  	// The memory and memory load of the node exceeds the upper limit.
    96  	// However, the metrics are not updated in the latest 5 minutes, and the usage function is invalid.
    97  	// The node can schedule pods.
    98  	nodesUsage[n4.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 90}, map[string]float64{source.NODE_METRICS_PERIOD: 81}, timeNow.Add(-6*time.Minute))
    99  	// The memory and memory load of the node exceeds the upper limit.
   100  	// However, the metric time is in the initial state, and the usage function is invalid.
   101  	// The node can schedule pods.
   102  	nodesUsage[n5.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 90}, map[string]float64{source.NODE_METRICS_PERIOD: 81}, time.Time{})
   103  
   104  	pg1 := &schedulingv1.PodGroup{
   105  		ObjectMeta: metav1.ObjectMeta{
   106  			Name:      "pg1",
   107  			Namespace: "c1",
   108  		},
   109  		Spec: schedulingv1.PodGroupSpec{
   110  			Queue: "q1",
   111  		},
   112  	}
   113  	queue1 := &schedulingv1.Queue{
   114  		ObjectMeta: metav1.ObjectMeta{
   115  			Name: "q1",
   116  		},
   117  		Spec: schedulingv1.QueueSpec{
   118  			Weight: 1,
   119  		},
   120  	}
   121  
   122  	tests := []struct {
   123  		name          string
   124  		podGroups     []*schedulingv1.PodGroup
   125  		queues        []*schedulingv1.Queue
   126  		pods          []*v1.Pod
   127  		nodes         []*v1.Node
   128  		nodesUsageMap map[string]*api.NodeUsage
   129  		arguments     framework.Arguments
   130  		expected      predicateResult
   131  	}{
   132  		{
   133  			name: "The node cannot be scheduled, because of the CPU load of the node exceeds the upper limit.",
   134  			podGroups: []*schedulingv1.PodGroup{
   135  				pg1,
   136  			},
   137  			queues: []*schedulingv1.Queue{
   138  				queue1,
   139  			},
   140  			pods: []*v1.Pod{
   141  				p1, p2,
   142  			},
   143  			nodes: []*v1.Node{
   144  				n1,
   145  			},
   146  			nodesUsageMap: nodesUsage,
   147  			arguments: framework.Arguments{
   148  				"usage.weight":  5,
   149  				"cpu.weight":    1,
   150  				"memory.weight": 1,
   151  				"thresholds": map[interface{}]interface{}{
   152  					"cpu": 80,
   153  					"mem": 80,
   154  				},
   155  			},
   156  			expected: predicateResult{
   157  				predicateStatus: []*api.Status{
   158  					{
   159  						Code:   api.UnschedulableAndUnresolvable,
   160  						Reason: NodeUsageCPUExtend,
   161  					},
   162  				},
   163  				err: fmt.Errorf("Plugin %s predicates failed, because of %s", PluginName, NodeUsageCPUExtend),
   164  			},
   165  		},
   166  		{
   167  			name: "The node cannot be scheduled, because of the memory load of the node exceeds the upper limit.",
   168  			podGroups: []*schedulingv1.PodGroup{
   169  				pg1,
   170  			},
   171  			queues: []*schedulingv1.Queue{
   172  				queue1,
   173  			},
   174  			pods: []*v1.Pod{
   175  				p1, p2,
   176  			},
   177  			nodes: []*v1.Node{
   178  				n2,
   179  			},
   180  			nodesUsageMap: nodesUsage,
   181  			arguments: framework.Arguments{
   182  				"usage.weight":  5,
   183  				"cpu.weight":    1,
   184  				"memory.weight": 1,
   185  				"thresholds": map[interface{}]interface{}{
   186  					"cpu": 80,
   187  					"mem": 80,
   188  				},
   189  			},
   190  			expected: predicateResult{
   191  				predicateStatus: []*api.Status{
   192  					{
   193  						Code:   api.UnschedulableAndUnresolvable,
   194  						Reason: NodeUsageMemoryExtend,
   195  					},
   196  				},
   197  				err: fmt.Errorf("Plugin %s predicates failed, because of %s", PluginName, NodeUsageMemoryExtend),
   198  			},
   199  		},
   200  		{
   201  			name: "The node can be scheduled, because of the CPU usage and memory usage do not exceed the upper limit.",
   202  			podGroups: []*schedulingv1.PodGroup{
   203  				pg1,
   204  			},
   205  			queues: []*schedulingv1.Queue{
   206  				queue1,
   207  			},
   208  			pods: []*v1.Pod{
   209  				p1, p2,
   210  			},
   211  			nodes: []*v1.Node{
   212  				n3,
   213  			},
   214  			nodesUsageMap: nodesUsage,
   215  			arguments: framework.Arguments{
   216  				"usage.weight":  5,
   217  				"cpu.weight":    1,
   218  				"memory.weight": 1,
   219  				"thresholds": map[interface{}]interface{}{
   220  					"cpu": 80,
   221  					"mem": 80,
   222  				},
   223  			},
   224  			expected: predicateResult{
   225  				predicateStatus: []*api.Status{
   226  					{
   227  						Code:   api.Success,
   228  						Reason: "",
   229  					},
   230  				},
   231  				err: nil,
   232  			},
   233  		},
   234  		{
   235  			name: "The node can be scheduled, because of the metrics are not updated in the latest 5 minutes, and the usage function is invalid.",
   236  			podGroups: []*schedulingv1.PodGroup{
   237  				pg1,
   238  			},
   239  			queues: []*schedulingv1.Queue{
   240  				queue1,
   241  			},
   242  			pods: []*v1.Pod{
   243  				p1, p2,
   244  			},
   245  			nodes: []*v1.Node{
   246  				n4,
   247  			},
   248  			nodesUsageMap: nodesUsage,
   249  			arguments: framework.Arguments{
   250  				"usage.weight":  5,
   251  				"cpu.weight":    1,
   252  				"memory.weight": 1,
   253  				"thresholds": map[interface{}]interface{}{
   254  					"cpu": 80,
   255  					"mem": 80,
   256  				},
   257  			},
   258  			expected: predicateResult{
   259  				predicateStatus: []*api.Status{
   260  					{
   261  						Code:   api.Success,
   262  						Reason: "",
   263  					},
   264  				},
   265  				err: nil,
   266  			},
   267  		},
   268  		{
   269  			name: "The node can be scheduled, because of the metric time is in the initial state, and the usage function is invalid.",
   270  			podGroups: []*schedulingv1.PodGroup{
   271  				pg1,
   272  			},
   273  			queues: []*schedulingv1.Queue{
   274  				queue1,
   275  			},
   276  			pods: []*v1.Pod{
   277  				p1, p2,
   278  			},
   279  			nodes: []*v1.Node{
   280  				n5,
   281  			},
   282  			nodesUsageMap: nodesUsage,
   283  			arguments: framework.Arguments{
   284  				"usage.weight":  5,
   285  				"cpu.weight":    1,
   286  				"memory.weight": 1,
   287  				"thresholds": map[interface{}]interface{}{
   288  					"cpu": 80,
   289  					"mem": 80,
   290  				},
   291  			},
   292  			expected: predicateResult{
   293  				predicateStatus: []*api.Status{
   294  					{
   295  						Code:   api.Success,
   296  						Reason: "",
   297  					},
   298  				},
   299  				err: nil,
   300  			},
   301  		},
   302  	}
   303  
   304  	for i, test := range tests {
   305  		t.Run(test.name, func(t *testing.T) {
   306  			schedulerCache := &cache.SchedulerCache{
   307  				Nodes:         make(map[string]*api.NodeInfo),
   308  				Jobs:          make(map[api.JobID]*api.JobInfo),
   309  				Queues:        make(map[api.QueueID]*api.QueueInfo),
   310  				StatusUpdater: &util.FakeStatusUpdater{},
   311  				VolumeBinder:  &util.FakeVolumeBinder{},
   312  
   313  				Recorder: record.NewFakeRecorder(100),
   314  			}
   315  
   316  			for _, node := range test.nodes {
   317  				schedulerCache.AddOrUpdateNode(node)
   318  			}
   319  			for _, pod := range test.pods {
   320  				schedulerCache.AddPod(pod)
   321  			}
   322  			for _, ss := range test.podGroups {
   323  				schedulerCache.AddPodGroupV1beta1(ss)
   324  			}
   325  			for _, q := range test.queues {
   326  				schedulerCache.AddQueueV1beta1(q)
   327  			}
   328  			updateNodeUsage(schedulerCache.Nodes, nodesUsage)
   329  
   330  			trueValue := true
   331  			ssn := framework.OpenSession(schedulerCache, []conf.Tier{
   332  				{
   333  					Plugins: []conf.PluginOption{
   334  						{
   335  							Name:             PluginName,
   336  							EnabledPredicate: &trueValue,
   337  							Arguments:        test.arguments,
   338  						},
   339  					},
   340  				},
   341  			}, nil)
   342  			defer framework.CloseSession(ssn)
   343  
   344  			for _, job := range ssn.Jobs {
   345  				for _, task := range job.Tasks {
   346  					taskID := fmt.Sprintf("%s/%s", task.Namespace, task.Name)
   347  					for _, node := range ssn.Nodes {
   348  						predicateStatus, err := ssn.PredicateFn(task, node)
   349  						if (test.expected.err == nil || err == nil) && test.expected.err != err {
   350  							t.Errorf("case%d: task %s on node %s has error, expect: %v, actual: %v",
   351  								i, taskID, node.Name, test.expected.err, err)
   352  							continue
   353  						}
   354  						if test.expected.err != nil && test.expected.err.Error() != err.Error() {
   355  							t.Errorf("case%d: task %s on node %s has error, expect: %v, actual: %v",
   356  								i, taskID, node.Name, test.expected.err, err)
   357  							continue
   358  						}
   359  
   360  						for index := range predicateStatus {
   361  							if predicateStatus[index].Code != test.expected.predicateStatus[index].Code ||
   362  								predicateStatus[index].Reason != test.expected.predicateStatus[index].Reason {
   363  								t.Errorf("case%d: task %s on node %s has error, expect: %v, actual: %v",
   364  									i, taskID, node.Name, test.expected.predicateStatus[index], predicateStatus[index])
   365  								continue
   366  							}
   367  						}
   368  					}
   369  				}
   370  			}
   371  		})
   372  	}
   373  }
   374  
   375  func TestUsage_nodeOrderFn(t *testing.T) {
   376  	var tmp *cache.SchedulerCache
   377  	patchUpdateQueueStatus := gomonkey.ApplyMethod(reflect.TypeOf(tmp), "UpdateQueueStatus", func(scCache *cache.SchedulerCache, queue *api.QueueInfo) error {
   378  		return nil
   379  	})
   380  	defer patchUpdateQueueStatus.Reset()
   381  
   382  	framework.RegisterPluginBuilder(PluginName, New)
   383  	defer framework.CleanupPluginBuilders()
   384  
   385  	p1 := util.BuildPod("c1", "p1", "", v1.PodPending, api.BuildResourceList("1", "1Gi"), "pg1", make(map[string]string), make(map[string]string))
   386  
   387  	n1 := util.BuildNode("n1", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string))
   388  	n2 := util.BuildNode("n2", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string))
   389  	n3 := util.BuildNode("n3", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string))
   390  	n4 := util.BuildNode("n4", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string))
   391  	n5 := util.BuildNode("n5", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string))
   392  
   393  	nodesUsage := make(map[string]*api.NodeUsage)
   394  	timeNow := time.Now()
   395  	nodesUsage[n1.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 30}, map[string]float64{source.NODE_METRICS_PERIOD: 50}, timeNow)
   396  	nodesUsage[n2.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 60}, map[string]float64{source.NODE_METRICS_PERIOD: 50}, timeNow)
   397  	nodesUsage[n3.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 60}, map[string]float64{source.NODE_METRICS_PERIOD: 80}, timeNow)
   398  	// The metrics are not updated in the latest 5 minutes, and the usage function is invalid.
   399  	// The node score is 0.
   400  	nodesUsage[n4.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 10}, map[string]float64{source.NODE_METRICS_PERIOD: 20}, timeNow.Add(-6*time.Minute))
   401  	// The metric time is in the initial state, and the usage function is invalid.
   402  	// The node score is 0.
   403  	nodesUsage[n5.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 0}, map[string]float64{source.NODE_METRICS_PERIOD: 0}, time.Time{})
   404  
   405  	pg1 := &schedulingv1.PodGroup{
   406  		ObjectMeta: metav1.ObjectMeta{
   407  			Name:      "pg1",
   408  			Namespace: "c1",
   409  		},
   410  		Spec: schedulingv1.PodGroupSpec{
   411  			Queue: "q1",
   412  		},
   413  	}
   414  	queue1 := &schedulingv1.Queue{
   415  		ObjectMeta: metav1.ObjectMeta{
   416  			Name: "q1",
   417  		},
   418  		Spec: schedulingv1.QueueSpec{
   419  			Weight: 1,
   420  		},
   421  	}
   422  
   423  	tests := []struct {
   424  		name          string
   425  		podGroups     []*schedulingv1.PodGroup
   426  		queues        []*schedulingv1.Queue
   427  		pods          []*v1.Pod
   428  		nodes         []*v1.Node
   429  		nodesUsageMap map[string]*api.NodeUsage
   430  		arguments     framework.Arguments
   431  		expected      map[string]map[string]float64
   432  	}{
   433  		{
   434  			name: "Node scoring in the default weight configuration scenario.",
   435  			podGroups: []*schedulingv1.PodGroup{
   436  				pg1,
   437  			},
   438  			queues: []*schedulingv1.Queue{
   439  				queue1,
   440  			},
   441  			pods: []*v1.Pod{
   442  				p1,
   443  			},
   444  			nodes: []*v1.Node{
   445  				n1, n2, n3, n4, n5,
   446  			},
   447  			nodesUsageMap: nodesUsage,
   448  			arguments: framework.Arguments{
   449  				"usage.weight":  5,
   450  				"cpu.weight":    1,
   451  				"memory.weight": 1,
   452  				"thresholds": map[interface{}]interface{}{
   453  					"cpu": 80,
   454  					"mem": 80,
   455  				},
   456  			},
   457  			expected: map[string]map[string]float64{
   458  				"c1/p1": {
   459  					"n1": 300,
   460  					"n2": 225,
   461  					"n3": 150,
   462  					"n4": 0,
   463  					"n5": 0,
   464  				},
   465  			},
   466  		},
   467  		{
   468  			name: "Node scoring gives priority to memory resources",
   469  			podGroups: []*schedulingv1.PodGroup{
   470  				pg1,
   471  			},
   472  			queues: []*schedulingv1.Queue{
   473  				queue1,
   474  			},
   475  			pods: []*v1.Pod{
   476  				p1,
   477  			},
   478  			nodes: []*v1.Node{
   479  				n1, n2, n3, n4, n5,
   480  			},
   481  			nodesUsageMap: nodesUsage,
   482  			arguments: framework.Arguments{
   483  				"usage.weight":  5,
   484  				"cpu.weight":    2,
   485  				"memory.weight": 8,
   486  				"thresholds": map[interface{}]interface{}{
   487  					"cpu": 80,
   488  					"mem": 80,
   489  				},
   490  			},
   491  			expected: map[string]map[string]float64{
   492  				"c1/p1": {
   493  					"n1": 270,
   494  					"n2": 240,
   495  					"n3": 120,
   496  					"n4": 0,
   497  					"n5": 0,
   498  				},
   499  			},
   500  		},
   501  	}
   502  
   503  	for i, test := range tests {
   504  		t.Run(test.name, func(t *testing.T) {
   505  			schedulerCache := &cache.SchedulerCache{
   506  				Nodes:         make(map[string]*api.NodeInfo),
   507  				Jobs:          make(map[api.JobID]*api.JobInfo),
   508  				Queues:        make(map[api.QueueID]*api.QueueInfo),
   509  				StatusUpdater: &util.FakeStatusUpdater{},
   510  				VolumeBinder:  &util.FakeVolumeBinder{},
   511  
   512  				Recorder: record.NewFakeRecorder(100),
   513  			}
   514  
   515  			for _, node := range test.nodes {
   516  				schedulerCache.AddOrUpdateNode(node)
   517  			}
   518  			for _, pod := range test.pods {
   519  				schedulerCache.AddPod(pod)
   520  			}
   521  			for _, ss := range test.podGroups {
   522  				schedulerCache.AddPodGroupV1beta1(ss)
   523  			}
   524  			for _, q := range test.queues {
   525  				schedulerCache.AddQueueV1beta1(q)
   526  			}
   527  			updateNodeUsage(schedulerCache.Nodes, nodesUsage)
   528  
   529  			trueValue := true
   530  			ssn := framework.OpenSession(schedulerCache, []conf.Tier{
   531  				{
   532  					Plugins: []conf.PluginOption{
   533  						{
   534  							Name:             PluginName,
   535  							EnabledNodeOrder: &trueValue,
   536  							Arguments:        test.arguments,
   537  						},
   538  					},
   539  				},
   540  			}, nil)
   541  			defer framework.CloseSession(ssn)
   542  
   543  			for _, job := range ssn.Jobs {
   544  				for _, task := range job.Tasks {
   545  					taskID := fmt.Sprintf("%s/%s", task.Namespace, task.Name)
   546  					for _, node := range ssn.Nodes {
   547  						score, err := ssn.NodeOrderFn(task, node)
   548  						if err != nil {
   549  							t.Errorf("case%d: task %s on node %s has err %v", i, taskID, node.Name, err)
   550  							continue
   551  						}
   552  						if expectScore := test.expected[taskID][node.Name]; math.Abs(expectScore-score) > eps {
   553  							t.Errorf("case%d: task %s on node %s expect have score %v, but get %v", i, taskID, node.Name, expectScore, score)
   554  						}
   555  					}
   556  				}
   557  			}
   558  		})
   559  	}
   560  }