volcano.sh/volcano@v1.9.0/test/e2e/schedulingbase/job_scheduling.go (about) 1 /* 2 Copyright 2021 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package schedulingbase 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 "time" 24 25 . "github.com/onsi/ginkgo/v2" 26 . "github.com/onsi/gomega" 27 28 batchv1 "k8s.io/api/batch/v1" 29 v1 "k8s.io/api/core/v1" 30 "k8s.io/apimachinery/pkg/api/errors" 31 "k8s.io/apimachinery/pkg/api/resource" 32 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 "k8s.io/apimachinery/pkg/util/wait" 34 35 vcbatch "volcano.sh/apis/pkg/apis/batch/v1alpha1" 36 vcscheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1" 37 schedulingapi "volcano.sh/volcano/pkg/scheduler/api" 38 e2eutil "volcano.sh/volcano/test/e2e/util" 39 ) 40 41 var _ = Describe("Job E2E Test", func() { 42 It("Schedule Job", func() { 43 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 44 defer e2eutil.CleanupTestContext(ctx) 45 rep := e2eutil.ClusterSize(ctx, e2eutil.OneCPU) 46 47 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 48 Name: "qj-1", 49 Tasks: []e2eutil.TaskSpec{ 50 { 51 Img: e2eutil.DefaultBusyBoxImage, 52 Req: e2eutil.OneCPU, 53 Min: 2, 54 Rep: rep, 55 }, 56 }, 57 }) 58 59 err := e2eutil.WaitJobReady(ctx, job) 60 Expect(err).NotTo(HaveOccurred()) 61 }) 62 63 It("Schedule Multiple Jobs", func() { 64 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 65 defer e2eutil.CleanupTestContext(ctx) 66 67 rep := e2eutil.ClusterSize(ctx, e2eutil.OneCPU) 68 69 job := &e2eutil.JobSpec{ 70 Tasks: []e2eutil.TaskSpec{ 71 { 72 Img: e2eutil.DefaultBusyBoxImage, 73 Req: e2eutil.OneCPU, 74 Min: 2, 75 Rep: rep, 76 }, 77 }, 78 } 79 80 job.Name = "mqj-1" 81 job1 := e2eutil.CreateJob(ctx, job) 82 job.Name = "mqj-2" 83 job2 := e2eutil.CreateJob(ctx, job) 84 job.Name = "mqj-3" 85 job3 := e2eutil.CreateJob(ctx, job) 86 87 err := e2eutil.WaitJobReady(ctx, job1) 88 Expect(err).NotTo(HaveOccurred()) 89 90 err = e2eutil.WaitJobReady(ctx, job2) 91 Expect(err).NotTo(HaveOccurred()) 92 93 err = e2eutil.WaitJobReady(ctx, job3) 94 Expect(err).NotTo(HaveOccurred()) 95 }) 96 97 It("Gang scheduling", func() { 98 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 99 defer e2eutil.CleanupTestContext(ctx) 100 rep := e2eutil.ClusterSize(ctx, e2eutil.OneCPU)/2 + 1 101 102 replicaset := e2eutil.CreateReplicaSet(ctx, "rs-1", rep, e2eutil.DefaultNginxImage, e2eutil.OneCPU) 103 err := e2eutil.WaitReplicaSetReady(ctx, replicaset.Name) 104 Expect(err).NotTo(HaveOccurred()) 105 106 jobSpec := &e2eutil.JobSpec{ 107 Name: "gang-qj", 108 Namespace: ctx.Namespace, 109 Tasks: []e2eutil.TaskSpec{ 110 { 111 Img: e2eutil.DefaultBusyBoxImage, 112 Req: e2eutil.OneCPU, 113 Min: rep, 114 Rep: rep, 115 Command: "sleep 10s", 116 }, 117 }, 118 } 119 120 job := e2eutil.CreateJob(ctx, jobSpec) 121 err = e2eutil.WaitJobStatePending(ctx, job) 122 Expect(err).NotTo(HaveOccurred()) 123 124 err = e2eutil.WaitJobUnschedulable(ctx, job) 125 Expect(err).NotTo(HaveOccurred()) 126 127 err = e2eutil.DeleteReplicaSet(ctx, replicaset.Name) 128 Expect(err).NotTo(HaveOccurred()) 129 130 err = e2eutil.WaitJobReady(ctx, job) 131 Expect(err).NotTo(HaveOccurred()) 132 }) 133 134 It("Gang scheduling: Full Occupied", func() { 135 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 136 defer e2eutil.CleanupTestContext(ctx) 137 rep := e2eutil.ClusterSize(ctx, e2eutil.OneCPU) 138 139 job := &e2eutil.JobSpec{ 140 Namespace: ctx.Namespace, 141 Tasks: []e2eutil.TaskSpec{ 142 { 143 Img: e2eutil.DefaultNginxImage, 144 Req: e2eutil.OneCPU, 145 Min: rep, 146 Rep: rep, 147 }, 148 }, 149 } 150 151 job.Name = "gang-fq-qj1" 152 job1 := e2eutil.CreateJob(ctx, job) 153 err := e2eutil.WaitJobReady(ctx, job1) 154 Expect(err).NotTo(HaveOccurred()) 155 156 job.Name = "gang-fq-qj2" 157 job2 := e2eutil.CreateJob(ctx, job) 158 err = e2eutil.WaitJobStatePending(ctx, job2) 159 Expect(err).NotTo(HaveOccurred()) 160 161 err = e2eutil.WaitJobReady(ctx, job1) 162 Expect(err).NotTo(HaveOccurred()) 163 }) 164 165 It("Gang scheduling: Contains both best-effort pod and non-best-effort pod", func() { 166 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 167 defer e2eutil.CleanupTestContext(ctx) 168 rep := e2eutil.ClusterSize(ctx, e2eutil.OneCPU) 169 170 if rep < 2 { 171 fmt.Println("Skip e2e test for insufficient resources.") 172 return 173 } 174 175 jobSpec := &e2eutil.JobSpec{ 176 Name: "gang-both-best-effort-non-best-effort-pods", 177 Namespace: ctx.Namespace, 178 Tasks: []e2eutil.TaskSpec{ 179 { 180 Name: "best-effort", 181 Img: e2eutil.DefaultNginxImage, 182 Req: e2eutil.OneCPU, 183 Min: rep / 2, 184 Rep: rep / 2, 185 }, 186 { 187 Name: "non-best-effort", 188 Img: e2eutil.DefaultNginxImage, 189 Min: rep - rep/2, 190 Rep: rep - rep/2, 191 }, 192 }, 193 } 194 195 job := e2eutil.CreateJob(ctx, jobSpec) 196 err := e2eutil.WaitJobReady(ctx, job) 197 Expect(err).NotTo(HaveOccurred()) 198 }) 199 200 It("Schedule BestEffort Job", func() { 201 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 202 defer e2eutil.CleanupTestContext(ctx) 203 204 slot := e2eutil.OneCPU 205 rep := e2eutil.ClusterSize(ctx, slot) 206 207 spec := &e2eutil.JobSpec{ 208 Name: "test", 209 Tasks: []e2eutil.TaskSpec{ 210 { 211 Img: e2eutil.DefaultNginxImage, 212 Req: slot, 213 Min: 2, 214 Rep: rep, 215 }, 216 { 217 Img: e2eutil.DefaultNginxImage, 218 Min: 2, 219 Rep: rep / 2, 220 }, 221 }, 222 } 223 224 job := e2eutil.CreateJob(ctx, spec) 225 226 err := e2eutil.WaitJobReady(ctx, job) 227 Expect(err).NotTo(HaveOccurred()) 228 }) 229 230 It("Statement", func() { 231 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 232 defer e2eutil.CleanupTestContext(ctx) 233 234 slot := e2eutil.OneCPU 235 rep := e2eutil.ClusterSize(ctx, slot) 236 237 spec := &e2eutil.JobSpec{ 238 Namespace: ctx.Namespace, 239 Tasks: []e2eutil.TaskSpec{ 240 { 241 Img: e2eutil.DefaultNginxImage, 242 Req: slot, 243 Min: rep, 244 Rep: rep, 245 }, 246 }, 247 } 248 249 spec.Name = "st-qj-1" 250 job1 := e2eutil.CreateJob(ctx, spec) 251 err := e2eutil.WaitJobReady(ctx, job1) 252 Expect(err).NotTo(HaveOccurred()) 253 254 now := time.Now() 255 256 spec.Name = "st-qj-2" 257 job2 := e2eutil.CreateJob(ctx, spec) 258 err = e2eutil.WaitJobUnschedulable(ctx, job2) 259 Expect(err).NotTo(HaveOccurred()) 260 261 // No preemption event 262 evicted, err := e2eutil.JobEvicted(ctx, job1, now)() 263 Expect(err).NotTo(HaveOccurred()) 264 Expect(evicted).NotTo(BeTrue()) 265 }) 266 267 It("support binpack policy", func() { 268 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 269 defer e2eutil.CleanupTestContext(ctx) 270 271 slot := e2eutil.OneCPU 272 273 By("create base job") 274 spec := &e2eutil.JobSpec{ 275 Name: "binpack-base-1", 276 Namespace: ctx.Namespace, 277 Tasks: []e2eutil.TaskSpec{ 278 { 279 Img: e2eutil.DefaultNginxImage, 280 Req: slot, 281 Min: 1, 282 Rep: 1, 283 }, 284 }, 285 } 286 287 baseJob := e2eutil.CreateJob(ctx, spec) 288 err := e2eutil.WaitJobReady(ctx, baseJob) 289 Expect(err).NotTo(HaveOccurred()) 290 291 basePods := e2eutil.GetTasksOfJob(ctx, baseJob) 292 basePod := basePods[0] 293 baseNodeName := basePod.Spec.NodeName 294 295 node, err := ctx.Kubeclient.CoreV1().Nodes().Get(context.TODO(), baseNodeName, metav1.GetOptions{}) 296 Expect(err).NotTo(HaveOccurred()) 297 298 clusterPods, err := ctx.Kubeclient.CoreV1().Pods(v1.NamespaceAll).List(context.TODO(), metav1.ListOptions{}) 299 Expect(err).NotTo(HaveOccurred()) 300 301 alloc := schedulingapi.NewResource(node.Status.Allocatable) 302 for _, pod := range clusterPods.Items { 303 nodeName := pod.Spec.NodeName 304 if nodeName != baseNodeName || len(nodeName) == 0 || pod.DeletionTimestamp != nil { 305 continue 306 } 307 308 if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed { 309 continue 310 } 311 312 for _, c := range pod.Spec.Containers { 313 req := schedulingapi.NewResource(c.Resources.Requests) 314 alloc.Sub(req) 315 } 316 } 317 318 need := schedulingapi.NewResource(v1.ResourceList{"cpu": resource.MustParse("500m")}) 319 var count int32 320 for need.LessEqual(alloc, schedulingapi.Zero) { 321 count++ 322 alloc.Sub(need) 323 } 324 325 By(fmt.Sprintf("create test job with %d pods", count)) 326 spec = &e2eutil.JobSpec{ 327 Name: "binpack-test-1", 328 Namespace: ctx.Namespace, 329 Tasks: []e2eutil.TaskSpec{ 330 { 331 Img: e2eutil.DefaultNginxImage, 332 Req: e2eutil.HalfCPU, 333 Min: count, 334 Rep: count, 335 }, 336 }, 337 } 338 job := e2eutil.CreateJob(ctx, spec) 339 err = e2eutil.WaitJobReady(ctx, job) 340 Expect(err).NotTo(HaveOccurred()) 341 342 pods := e2eutil.GetTasksOfJob(ctx, baseJob) 343 for _, pod := range pods { 344 nodeName := pod.Spec.NodeName 345 Expect(nodeName).Should(Equal(baseNodeName), 346 fmt.Sprintf("Pod %s/%s should assign to node %s, but not %s", pod.Namespace, pod.Name, baseNodeName, nodeName)) 347 } 348 }) 349 350 It("Schedule v1.Job type using Volcano scheduler", func() { 351 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 352 defer e2eutil.CleanupTestContext(ctx) 353 parallel := int32(2) 354 355 job := &batchv1.Job{ 356 ObjectMeta: metav1.ObjectMeta{ 357 Name: "job1", 358 Namespace: ctx.Namespace, 359 }, 360 Spec: batchv1.JobSpec{ 361 Parallelism: ¶llel, 362 Template: v1.PodTemplateSpec{ 363 Spec: v1.PodSpec{ 364 RestartPolicy: v1.RestartPolicyNever, 365 SchedulerName: e2eutil.SchedulerName, 366 Containers: []v1.Container{ 367 { 368 Name: "test-container", 369 Image: "nginx", 370 }, 371 }, 372 }, 373 }, 374 }, 375 } 376 377 //create job 378 job, err := ctx.Kubeclient.BatchV1().Jobs(ctx.Namespace).Create(context.TODO(), job, metav1.CreateOptions{}) 379 Expect(err).NotTo(HaveOccurred()) 380 381 err = e2eutil.WaitJobPhaseReady(ctx, job) 382 Expect(err).NotTo(HaveOccurred()) 383 }) 384 385 It("Schedule v1.Job type using Volcano scheduler with error case", func() { 386 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 387 defer e2eutil.CleanupTestContext(ctx) 388 parallel := int32(2) 389 390 errorJob := &batchv1.Job{ 391 ObjectMeta: metav1.ObjectMeta{ 392 Name: "job1", 393 Namespace: ctx.Namespace, 394 }, 395 Spec: batchv1.JobSpec{ 396 Parallelism: ¶llel, 397 Template: v1.PodTemplateSpec{ 398 Spec: v1.PodSpec{ 399 SchedulerName: e2eutil.SchedulerName, 400 Containers: []v1.Container{ 401 { 402 Name: "test-container", 403 Image: "nginx", 404 }, 405 }, 406 }, 407 }, 408 }, 409 } 410 411 job := &batchv1.Job{ 412 ObjectMeta: metav1.ObjectMeta{ 413 Name: "job1", 414 Namespace: ctx.Namespace, 415 }, 416 Spec: batchv1.JobSpec{ 417 Parallelism: ¶llel, 418 Template: v1.PodTemplateSpec{ 419 Spec: v1.PodSpec{ 420 RestartPolicy: v1.RestartPolicyNever, 421 SchedulerName: e2eutil.SchedulerName, 422 Containers: []v1.Container{ 423 { 424 Name: "test-container", 425 Image: "nginx", 426 }, 427 }, 428 }, 429 }, 430 }, 431 } 432 433 //create error job 434 _, err := ctx.Kubeclient.BatchV1().Jobs(ctx.Namespace).Create(context.TODO(), errorJob, metav1.CreateOptions{}) 435 Expect(err).To(HaveOccurred()) 436 437 //create job 438 job, err = ctx.Kubeclient.BatchV1().Jobs(ctx.Namespace).Create(context.TODO(), job, metav1.CreateOptions{}) 439 Expect(err).NotTo(HaveOccurred()) 440 441 err = e2eutil.WaitJobPhaseReady(ctx, job) 442 Expect(err).NotTo(HaveOccurred()) 443 }) 444 445 It("Queue Fair Share", func() { 446 Skip("Failed when add yaml, test case may fail in some condition") 447 q1, q2 := "q1", "q2" 448 ctx := e2eutil.InitTestContext(e2eutil.Options{ 449 Queues: []string{q1, q2}, 450 }) 451 defer e2eutil.CleanupTestContext(ctx) 452 453 slot := e2eutil.HalfCPU 454 rep := e2eutil.ClusterSize(ctx, slot) 455 456 createJobToQueue := func(queue string, index int, replica int32) *vcbatch.Job { 457 spec := &e2eutil.JobSpec{ 458 Name: fmt.Sprintf("queue-fair-share-%s-%d", queue, index), 459 Namespace: ctx.Namespace, 460 Queue: queue, 461 Tasks: []e2eutil.TaskSpec{ 462 { 463 Img: e2eutil.DefaultNginxImage, 464 Command: "sleep 10000", 465 Req: slot, 466 Min: 2, 467 Rep: replica, 468 }, 469 }, 470 } 471 job := e2eutil.CreateJob(ctx, spec) 472 return job 473 } 474 475 By("occupy all cluster resources") 476 occupiedJob := createJobToQueue("default", 123, rep*2) 477 err := e2eutil.WaitJobReady(ctx, occupiedJob) 478 Expect(err).NotTo(HaveOccurred()) 479 480 for i := 0; i < int(rep); i++ { 481 createJobToQueue(q1, i, 2) 482 createJobToQueue(q2, i, 2) 483 } 484 485 By(fmt.Sprintf("release occupied cluster resources, %s/%s", occupiedJob.Namespace, occupiedJob.Name)) 486 deleteForeground := metav1.DeletePropagationBackground 487 err = ctx.Vcclient.BatchV1alpha1().Jobs(occupiedJob.Namespace).Delete(context.TODO(), 488 occupiedJob.Name, 489 metav1.DeleteOptions{ 490 PropagationPolicy: &deleteForeground, 491 }) 492 Expect(err).NotTo(HaveOccurred()) 493 494 By("wait occupied cluster resources releasing") 495 err = e2eutil.WaitJobCleanedUp(ctx, occupiedJob) 496 Expect(err).NotTo(HaveOccurred()) 497 498 By("wait pod in queue q1/q2 scheduled") 499 q1ScheduledPod := 0 500 q2ScheduledPod := 0 501 expectPod := int(rep) 502 if expectPod%1 == 1 { 503 expectPod-- 504 } 505 err = wait.Poll(100*time.Millisecond, e2eutil.FiveMinute, func() (bool, error) { 506 q1ScheduledPod = 0 507 q2ScheduledPod = 0 508 509 pods, err := ctx.Kubeclient.CoreV1().Pods(ctx.Namespace).List(context.TODO(), metav1.ListOptions{}) 510 if err != nil { 511 return false, err 512 } 513 for _, pod := range pods.Items { 514 if !e2eutil.IsPodScheduled(&pod) { 515 continue 516 } 517 jobName := pod.Annotations[vcbatch.JobNameKey] 518 if strings.Contains(jobName, "queue-fair-share-"+q1) { 519 q1ScheduledPod++ 520 } 521 if strings.Contains(jobName, "queue-fair-share-"+q2) { 522 q2ScheduledPod++ 523 } 524 } 525 526 if q2ScheduledPod+q1ScheduledPod == expectPod { 527 return true, nil 528 } 529 530 return false, nil 531 }) 532 Expect(err).NotTo(HaveOccurred()) 533 Expect(q2ScheduledPod).Should(BeNumerically(">=", expectPod/2-1), 534 fmt.Sprintf("expectPod %d, q1ScheduledPod %d, q2ScheduledPod %d", expectPod, q1ScheduledPod, q2ScheduledPod)) 535 536 Expect(q2ScheduledPod).Should(BeNumerically("<=", expectPod/2+1), 537 fmt.Sprintf("expectPod %d, q1ScheduledPod %d, q2ScheduledPod %d", expectPod, q1ScheduledPod, q2ScheduledPod)) 538 }) 539 540 It("PodGroup's Count change with Deployment's Request change", func() { 541 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 542 defer e2eutil.CleanupTestContext(ctx) 543 rep := e2eutil.ClusterSize(ctx, e2eutil.OneCPU)/2 + 1 544 545 d := e2eutil.CreateDeployment(ctx, "d-1", rep, e2eutil.DefaultNginxImage, e2eutil.OneCPU) 546 err := e2eutil.WaitDeploymentReady(ctx, d.Name) 547 Expect(err).NotTo(HaveOccurred()) 548 549 pgs, err := ctx.Vcclient.SchedulingV1beta1().PodGroups(ctx.Namespace).List(context.TODO(), metav1.ListOptions{}) 550 Expect(err).NotTo(HaveOccurred(), "failed to list podGroups in namespace %s", ctx.Namespace) 551 Expect(len(pgs.Items)).To(Equal(1), "this test need a clean cluster") 552 oldOne := &pgs.Items[0] 553 554 d.ResourceVersion = "" 555 d.Spec.Template.Spec.Containers[0].Resources.Requests = e2eutil.HalfCPU 556 d, err = ctx.Kubeclient.AppsV1().Deployments(ctx.Namespace).Update(context.TODO(), d, metav1.UpdateOptions{}) 557 Expect(err).NotTo(HaveOccurred(), "failed to update deployment(%s) in namespace %s", d.Name, ctx.Namespace) 558 err = e2eutil.WaitDeploymentReady(ctx, d.Name) 559 Expect(err).NotTo(HaveOccurred()) 560 561 wait.Poll(time.Second, time.Minute, func() (bool, error) { 562 oldOne, err = ctx.Vcclient.SchedulingV1beta1().PodGroups(ctx.Namespace).Get(context.TODO(), oldOne.Name, metav1.GetOptions{}) 563 if err != nil { 564 return true, nil 565 } 566 return false, nil 567 }) 568 Expect(errors.IsNotFound(err)).To(BeTrue(), "old pg(%s) should not found", oldOne.Name) 569 570 pgs, err = ctx.Vcclient.SchedulingV1beta1().PodGroups(ctx.Namespace).List(context.TODO(), metav1.ListOptions{}) 571 Expect(err).NotTo(HaveOccurred(), "failed to list podGroups in namespace %s", ctx.Namespace) 572 Expect(len(pgs.Items)).To(Equal(1), "only one podGroup should be exists") 573 }) 574 575 It("PodGroup's Phase with k8s Job in Completed", func() { 576 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 577 defer e2eutil.CleanupTestContext(ctx) 578 579 jb := e2eutil.CreateSampleK8sJob(ctx, "job1", e2eutil.DefaultNginxImage, e2eutil.OneCPU) 580 err := e2eutil.Waitk8sJobCompleted(ctx, jb.Name) 581 Expect(err).NotTo(HaveOccurred()) 582 583 var pgPhase vcscheduling.PodGroupPhase 584 wait.Poll(time.Second, time.Second*30, func() (bool, error) { 585 pgs, err := ctx.Vcclient.SchedulingV1beta1().PodGroups(ctx.Namespace).List(context.TODO(), metav1.ListOptions{}) 586 Expect(err).NotTo(HaveOccurred(), "failed to list podGroups in namespace %s", ctx.Namespace) 587 Expect(len(pgs.Items)).To(Equal(1), "this test need a clean cluster") 588 pgPhase = pgs.Items[0].Status.Phase 589 if pgPhase != vcscheduling.PodGroupRunning { 590 return true, nil 591 } 592 return false, nil 593 }) 594 Expect(pgPhase).To(Equal(vcscheduling.PodGroupCompleted), "podGroup Phase is %s, should be %s", 595 ctx.Namespace, vcscheduling.PodGroupCompleted) 596 }) 597 })