volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/usage/usage_test.go (about) 1 /* 2 Copyright 2023 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package usage 18 19 import ( 20 "fmt" 21 "math" 22 "reflect" 23 "testing" 24 "time" 25 26 "github.com/agiledragon/gomonkey/v2" 27 v1 "k8s.io/api/core/v1" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 "k8s.io/client-go/tools/record" 30 31 schedulingv1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" 32 "volcano.sh/volcano/pkg/scheduler/api" 33 "volcano.sh/volcano/pkg/scheduler/cache" 34 "volcano.sh/volcano/pkg/scheduler/conf" 35 "volcano.sh/volcano/pkg/scheduler/framework" 36 "volcano.sh/volcano/pkg/scheduler/metrics/source" 37 "volcano.sh/volcano/pkg/scheduler/util" 38 ) 39 40 const ( 41 eps = 1e-8 42 ) 43 44 type predicateResult struct { 45 predicateStatus []*api.Status 46 err error 47 } 48 49 func buildNodeUsage(cpuAvgUsage map[string]float64, memAvgUsage map[string]float64, metricsTime time.Time) *api.NodeUsage { 50 return &api.NodeUsage{ 51 MetricsTime: metricsTime, 52 CPUUsageAvg: cpuAvgUsage, 53 MEMUsageAvg: memAvgUsage, 54 } 55 } 56 57 func updateNodeUsage(nodesInfo map[string]*api.NodeInfo, nodesUsage map[string]*api.NodeUsage) { 58 for nodeName, nodeInfo := range nodesInfo { 59 if nodeUsage, ok := nodesUsage[nodeName]; ok { 60 nodeInfo.ResourceUsage = nodeUsage 61 } 62 } 63 } 64 65 func TestUsage_predicateFn(t *testing.T) { 66 var tmp *cache.SchedulerCache 67 patchUpdateQueueStatus := gomonkey.ApplyMethod(reflect.TypeOf(tmp), "UpdateQueueStatus", func(scCache *cache.SchedulerCache, queue *api.QueueInfo) error { 68 return nil 69 }) 70 defer patchUpdateQueueStatus.Reset() 71 72 framework.RegisterPluginBuilder(PluginName, New) 73 defer framework.CleanupPluginBuilders() 74 75 p1 := util.BuildPod("c1", "p1", "", v1.PodPending, api.BuildResourceList("1", "1Gi"), "pg1", make(map[string]string), make(map[string]string)) 76 p2 := util.BuildPod("c1", "p2", "", v1.PodPending, api.BuildResourceList("1", "1Gi"), "pg1", make(map[string]string), make(map[string]string)) 77 78 n1 := util.BuildNode("n1", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)) 79 n2 := util.BuildNode("n2", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)) 80 n3 := util.BuildNode("n3", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)) 81 n4 := util.BuildNode("n4", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)) 82 n5 := util.BuildNode("n5", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)) 83 84 nodesUsage := make(map[string]*api.NodeUsage) 85 timeNow := time.Now() 86 // The CPU load of the node exceeds the upper limit. 87 // The node cannot be scheduled. 88 nodesUsage[n1.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 81}, map[string]float64{source.NODE_METRICS_PERIOD: 60}, timeNow) 89 // The memory load of the node exceeds the upper limit. 90 // The node cannot be scheduled. 91 nodesUsage[n2.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 60}, map[string]float64{source.NODE_METRICS_PERIOD: 81}, timeNow) 92 // The CPU usage and memory usage do not exceed the upper limit. 93 // The node can be scheduled. 94 nodesUsage[n3.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 80}, map[string]float64{source.NODE_METRICS_PERIOD: 79}, timeNow) 95 // The memory and memory load of the node exceeds the upper limit. 96 // However, the metrics are not updated in the latest 5 minutes, and the usage function is invalid. 97 // The node can schedule pods. 98 nodesUsage[n4.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 90}, map[string]float64{source.NODE_METRICS_PERIOD: 81}, timeNow.Add(-6*time.Minute)) 99 // The memory and memory load of the node exceeds the upper limit. 100 // However, the metric time is in the initial state, and the usage function is invalid. 101 // The node can schedule pods. 102 nodesUsage[n5.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 90}, map[string]float64{source.NODE_METRICS_PERIOD: 81}, time.Time{}) 103 104 pg1 := &schedulingv1.PodGroup{ 105 ObjectMeta: metav1.ObjectMeta{ 106 Name: "pg1", 107 Namespace: "c1", 108 }, 109 Spec: schedulingv1.PodGroupSpec{ 110 Queue: "q1", 111 }, 112 } 113 queue1 := &schedulingv1.Queue{ 114 ObjectMeta: metav1.ObjectMeta{ 115 Name: "q1", 116 }, 117 Spec: schedulingv1.QueueSpec{ 118 Weight: 1, 119 }, 120 } 121 122 tests := []struct { 123 name string 124 podGroups []*schedulingv1.PodGroup 125 queues []*schedulingv1.Queue 126 pods []*v1.Pod 127 nodes []*v1.Node 128 nodesUsageMap map[string]*api.NodeUsage 129 arguments framework.Arguments 130 expected predicateResult 131 }{ 132 { 133 name: "The node cannot be scheduled, because of the CPU load of the node exceeds the upper limit.", 134 podGroups: []*schedulingv1.PodGroup{ 135 pg1, 136 }, 137 queues: []*schedulingv1.Queue{ 138 queue1, 139 }, 140 pods: []*v1.Pod{ 141 p1, p2, 142 }, 143 nodes: []*v1.Node{ 144 n1, 145 }, 146 nodesUsageMap: nodesUsage, 147 arguments: framework.Arguments{ 148 "usage.weight": 5, 149 "cpu.weight": 1, 150 "memory.weight": 1, 151 "thresholds": map[interface{}]interface{}{ 152 "cpu": 80, 153 "mem": 80, 154 }, 155 }, 156 expected: predicateResult{ 157 predicateStatus: []*api.Status{ 158 { 159 Code: api.UnschedulableAndUnresolvable, 160 Reason: NodeUsageCPUExtend, 161 }, 162 }, 163 err: fmt.Errorf("Plugin %s predicates failed, because of %s", PluginName, NodeUsageCPUExtend), 164 }, 165 }, 166 { 167 name: "The node cannot be scheduled, because of the memory load of the node exceeds the upper limit.", 168 podGroups: []*schedulingv1.PodGroup{ 169 pg1, 170 }, 171 queues: []*schedulingv1.Queue{ 172 queue1, 173 }, 174 pods: []*v1.Pod{ 175 p1, p2, 176 }, 177 nodes: []*v1.Node{ 178 n2, 179 }, 180 nodesUsageMap: nodesUsage, 181 arguments: framework.Arguments{ 182 "usage.weight": 5, 183 "cpu.weight": 1, 184 "memory.weight": 1, 185 "thresholds": map[interface{}]interface{}{ 186 "cpu": 80, 187 "mem": 80, 188 }, 189 }, 190 expected: predicateResult{ 191 predicateStatus: []*api.Status{ 192 { 193 Code: api.UnschedulableAndUnresolvable, 194 Reason: NodeUsageMemoryExtend, 195 }, 196 }, 197 err: fmt.Errorf("Plugin %s predicates failed, because of %s", PluginName, NodeUsageMemoryExtend), 198 }, 199 }, 200 { 201 name: "The node can be scheduled, because of the CPU usage and memory usage do not exceed the upper limit.", 202 podGroups: []*schedulingv1.PodGroup{ 203 pg1, 204 }, 205 queues: []*schedulingv1.Queue{ 206 queue1, 207 }, 208 pods: []*v1.Pod{ 209 p1, p2, 210 }, 211 nodes: []*v1.Node{ 212 n3, 213 }, 214 nodesUsageMap: nodesUsage, 215 arguments: framework.Arguments{ 216 "usage.weight": 5, 217 "cpu.weight": 1, 218 "memory.weight": 1, 219 "thresholds": map[interface{}]interface{}{ 220 "cpu": 80, 221 "mem": 80, 222 }, 223 }, 224 expected: predicateResult{ 225 predicateStatus: []*api.Status{ 226 { 227 Code: api.Success, 228 Reason: "", 229 }, 230 }, 231 err: nil, 232 }, 233 }, 234 { 235 name: "The node can be scheduled, because of the metrics are not updated in the latest 5 minutes, and the usage function is invalid.", 236 podGroups: []*schedulingv1.PodGroup{ 237 pg1, 238 }, 239 queues: []*schedulingv1.Queue{ 240 queue1, 241 }, 242 pods: []*v1.Pod{ 243 p1, p2, 244 }, 245 nodes: []*v1.Node{ 246 n4, 247 }, 248 nodesUsageMap: nodesUsage, 249 arguments: framework.Arguments{ 250 "usage.weight": 5, 251 "cpu.weight": 1, 252 "memory.weight": 1, 253 "thresholds": map[interface{}]interface{}{ 254 "cpu": 80, 255 "mem": 80, 256 }, 257 }, 258 expected: predicateResult{ 259 predicateStatus: []*api.Status{ 260 { 261 Code: api.Success, 262 Reason: "", 263 }, 264 }, 265 err: nil, 266 }, 267 }, 268 { 269 name: "The node can be scheduled, because of the metric time is in the initial state, and the usage function is invalid.", 270 podGroups: []*schedulingv1.PodGroup{ 271 pg1, 272 }, 273 queues: []*schedulingv1.Queue{ 274 queue1, 275 }, 276 pods: []*v1.Pod{ 277 p1, p2, 278 }, 279 nodes: []*v1.Node{ 280 n5, 281 }, 282 nodesUsageMap: nodesUsage, 283 arguments: framework.Arguments{ 284 "usage.weight": 5, 285 "cpu.weight": 1, 286 "memory.weight": 1, 287 "thresholds": map[interface{}]interface{}{ 288 "cpu": 80, 289 "mem": 80, 290 }, 291 }, 292 expected: predicateResult{ 293 predicateStatus: []*api.Status{ 294 { 295 Code: api.Success, 296 Reason: "", 297 }, 298 }, 299 err: nil, 300 }, 301 }, 302 } 303 304 for i, test := range tests { 305 t.Run(test.name, func(t *testing.T) { 306 schedulerCache := &cache.SchedulerCache{ 307 Nodes: make(map[string]*api.NodeInfo), 308 Jobs: make(map[api.JobID]*api.JobInfo), 309 Queues: make(map[api.QueueID]*api.QueueInfo), 310 StatusUpdater: &util.FakeStatusUpdater{}, 311 VolumeBinder: &util.FakeVolumeBinder{}, 312 313 Recorder: record.NewFakeRecorder(100), 314 } 315 316 for _, node := range test.nodes { 317 schedulerCache.AddOrUpdateNode(node) 318 } 319 for _, pod := range test.pods { 320 schedulerCache.AddPod(pod) 321 } 322 for _, ss := range test.podGroups { 323 schedulerCache.AddPodGroupV1beta1(ss) 324 } 325 for _, q := range test.queues { 326 schedulerCache.AddQueueV1beta1(q) 327 } 328 updateNodeUsage(schedulerCache.Nodes, nodesUsage) 329 330 trueValue := true 331 ssn := framework.OpenSession(schedulerCache, []conf.Tier{ 332 { 333 Plugins: []conf.PluginOption{ 334 { 335 Name: PluginName, 336 EnabledPredicate: &trueValue, 337 Arguments: test.arguments, 338 }, 339 }, 340 }, 341 }, nil) 342 defer framework.CloseSession(ssn) 343 344 for _, job := range ssn.Jobs { 345 for _, task := range job.Tasks { 346 taskID := fmt.Sprintf("%s/%s", task.Namespace, task.Name) 347 for _, node := range ssn.Nodes { 348 predicateStatus, err := ssn.PredicateFn(task, node) 349 if (test.expected.err == nil || err == nil) && test.expected.err != err { 350 t.Errorf("case%d: task %s on node %s has error, expect: %v, actual: %v", 351 i, taskID, node.Name, test.expected.err, err) 352 continue 353 } 354 if test.expected.err != nil && test.expected.err.Error() != err.Error() { 355 t.Errorf("case%d: task %s on node %s has error, expect: %v, actual: %v", 356 i, taskID, node.Name, test.expected.err, err) 357 continue 358 } 359 360 for index := range predicateStatus { 361 if predicateStatus[index].Code != test.expected.predicateStatus[index].Code || 362 predicateStatus[index].Reason != test.expected.predicateStatus[index].Reason { 363 t.Errorf("case%d: task %s on node %s has error, expect: %v, actual: %v", 364 i, taskID, node.Name, test.expected.predicateStatus[index], predicateStatus[index]) 365 continue 366 } 367 } 368 } 369 } 370 } 371 }) 372 } 373 } 374 375 func TestUsage_nodeOrderFn(t *testing.T) { 376 var tmp *cache.SchedulerCache 377 patchUpdateQueueStatus := gomonkey.ApplyMethod(reflect.TypeOf(tmp), "UpdateQueueStatus", func(scCache *cache.SchedulerCache, queue *api.QueueInfo) error { 378 return nil 379 }) 380 defer patchUpdateQueueStatus.Reset() 381 382 framework.RegisterPluginBuilder(PluginName, New) 383 defer framework.CleanupPluginBuilders() 384 385 p1 := util.BuildPod("c1", "p1", "", v1.PodPending, api.BuildResourceList("1", "1Gi"), "pg1", make(map[string]string), make(map[string]string)) 386 387 n1 := util.BuildNode("n1", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)) 388 n2 := util.BuildNode("n2", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)) 389 n3 := util.BuildNode("n3", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)) 390 n4 := util.BuildNode("n4", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)) 391 n5 := util.BuildNode("n5", api.BuildResourceList("4", "8Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)) 392 393 nodesUsage := make(map[string]*api.NodeUsage) 394 timeNow := time.Now() 395 nodesUsage[n1.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 30}, map[string]float64{source.NODE_METRICS_PERIOD: 50}, timeNow) 396 nodesUsage[n2.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 60}, map[string]float64{source.NODE_METRICS_PERIOD: 50}, timeNow) 397 nodesUsage[n3.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 60}, map[string]float64{source.NODE_METRICS_PERIOD: 80}, timeNow) 398 // The metrics are not updated in the latest 5 minutes, and the usage function is invalid. 399 // The node score is 0. 400 nodesUsage[n4.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 10}, map[string]float64{source.NODE_METRICS_PERIOD: 20}, timeNow.Add(-6*time.Minute)) 401 // The metric time is in the initial state, and the usage function is invalid. 402 // The node score is 0. 403 nodesUsage[n5.Name] = buildNodeUsage(map[string]float64{source.NODE_METRICS_PERIOD: 0}, map[string]float64{source.NODE_METRICS_PERIOD: 0}, time.Time{}) 404 405 pg1 := &schedulingv1.PodGroup{ 406 ObjectMeta: metav1.ObjectMeta{ 407 Name: "pg1", 408 Namespace: "c1", 409 }, 410 Spec: schedulingv1.PodGroupSpec{ 411 Queue: "q1", 412 }, 413 } 414 queue1 := &schedulingv1.Queue{ 415 ObjectMeta: metav1.ObjectMeta{ 416 Name: "q1", 417 }, 418 Spec: schedulingv1.QueueSpec{ 419 Weight: 1, 420 }, 421 } 422 423 tests := []struct { 424 name string 425 podGroups []*schedulingv1.PodGroup 426 queues []*schedulingv1.Queue 427 pods []*v1.Pod 428 nodes []*v1.Node 429 nodesUsageMap map[string]*api.NodeUsage 430 arguments framework.Arguments 431 expected map[string]map[string]float64 432 }{ 433 { 434 name: "Node scoring in the default weight configuration scenario.", 435 podGroups: []*schedulingv1.PodGroup{ 436 pg1, 437 }, 438 queues: []*schedulingv1.Queue{ 439 queue1, 440 }, 441 pods: []*v1.Pod{ 442 p1, 443 }, 444 nodes: []*v1.Node{ 445 n1, n2, n3, n4, n5, 446 }, 447 nodesUsageMap: nodesUsage, 448 arguments: framework.Arguments{ 449 "usage.weight": 5, 450 "cpu.weight": 1, 451 "memory.weight": 1, 452 "thresholds": map[interface{}]interface{}{ 453 "cpu": 80, 454 "mem": 80, 455 }, 456 }, 457 expected: map[string]map[string]float64{ 458 "c1/p1": { 459 "n1": 300, 460 "n2": 225, 461 "n3": 150, 462 "n4": 0, 463 "n5": 0, 464 }, 465 }, 466 }, 467 { 468 name: "Node scoring gives priority to memory resources", 469 podGroups: []*schedulingv1.PodGroup{ 470 pg1, 471 }, 472 queues: []*schedulingv1.Queue{ 473 queue1, 474 }, 475 pods: []*v1.Pod{ 476 p1, 477 }, 478 nodes: []*v1.Node{ 479 n1, n2, n3, n4, n5, 480 }, 481 nodesUsageMap: nodesUsage, 482 arguments: framework.Arguments{ 483 "usage.weight": 5, 484 "cpu.weight": 2, 485 "memory.weight": 8, 486 "thresholds": map[interface{}]interface{}{ 487 "cpu": 80, 488 "mem": 80, 489 }, 490 }, 491 expected: map[string]map[string]float64{ 492 "c1/p1": { 493 "n1": 270, 494 "n2": 240, 495 "n3": 120, 496 "n4": 0, 497 "n5": 0, 498 }, 499 }, 500 }, 501 } 502 503 for i, test := range tests { 504 t.Run(test.name, func(t *testing.T) { 505 schedulerCache := &cache.SchedulerCache{ 506 Nodes: make(map[string]*api.NodeInfo), 507 Jobs: make(map[api.JobID]*api.JobInfo), 508 Queues: make(map[api.QueueID]*api.QueueInfo), 509 StatusUpdater: &util.FakeStatusUpdater{}, 510 VolumeBinder: &util.FakeVolumeBinder{}, 511 512 Recorder: record.NewFakeRecorder(100), 513 } 514 515 for _, node := range test.nodes { 516 schedulerCache.AddOrUpdateNode(node) 517 } 518 for _, pod := range test.pods { 519 schedulerCache.AddPod(pod) 520 } 521 for _, ss := range test.podGroups { 522 schedulerCache.AddPodGroupV1beta1(ss) 523 } 524 for _, q := range test.queues { 525 schedulerCache.AddQueueV1beta1(q) 526 } 527 updateNodeUsage(schedulerCache.Nodes, nodesUsage) 528 529 trueValue := true 530 ssn := framework.OpenSession(schedulerCache, []conf.Tier{ 531 { 532 Plugins: []conf.PluginOption{ 533 { 534 Name: PluginName, 535 EnabledNodeOrder: &trueValue, 536 Arguments: test.arguments, 537 }, 538 }, 539 }, 540 }, nil) 541 defer framework.CloseSession(ssn) 542 543 for _, job := range ssn.Jobs { 544 for _, task := range job.Tasks { 545 taskID := fmt.Sprintf("%s/%s", task.Namespace, task.Name) 546 for _, node := range ssn.Nodes { 547 score, err := ssn.NodeOrderFn(task, node) 548 if err != nil { 549 t.Errorf("case%d: task %s on node %s has err %v", i, taskID, node.Name, err) 550 continue 551 } 552 if expectScore := test.expected[taskID][node.Name]; math.Abs(expectScore-score) > eps { 553 t.Errorf("case%d: task %s on node %s expect have score %v, but get %v", i, taskID, node.Name, expectScore, score) 554 } 555 } 556 } 557 } 558 }) 559 } 560 }