k8s.io/kubernetes@v1.29.3/test/e2e/autoscaling/cluster_size_autoscaling.go (about) 1 /* 2 Copyright 2016 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package autoscaling 18 19 import ( 20 "context" 21 "fmt" 22 "io" 23 "math" 24 "net/http" 25 "os" 26 "os/exec" 27 "regexp" 28 "strconv" 29 "strings" 30 "time" 31 32 v1 "k8s.io/api/core/v1" 33 policyv1 "k8s.io/api/policy/v1" 34 schedulingv1 "k8s.io/api/scheduling/v1" 35 apierrors "k8s.io/apimachinery/pkg/api/errors" 36 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 37 "k8s.io/apimachinery/pkg/fields" 38 "k8s.io/apimachinery/pkg/labels" 39 utilerrors "k8s.io/apimachinery/pkg/util/errors" 40 "k8s.io/apimachinery/pkg/util/intstr" 41 "k8s.io/apimachinery/pkg/util/sets" 42 "k8s.io/apimachinery/pkg/util/uuid" 43 "k8s.io/apimachinery/pkg/util/wait" 44 clientset "k8s.io/client-go/kubernetes" 45 "k8s.io/klog/v2" 46 "k8s.io/kubernetes/test/e2e/feature" 47 "k8s.io/kubernetes/test/e2e/framework" 48 e2ekubectl "k8s.io/kubernetes/test/e2e/framework/kubectl" 49 e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest" 50 e2enetwork "k8s.io/kubernetes/test/e2e/framework/network" 51 e2enode "k8s.io/kubernetes/test/e2e/framework/node" 52 e2epv "k8s.io/kubernetes/test/e2e/framework/pv" 53 e2erc "k8s.io/kubernetes/test/e2e/framework/rc" 54 e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" 55 "k8s.io/kubernetes/test/e2e/scheduling" 56 testutils "k8s.io/kubernetes/test/utils" 57 imageutils "k8s.io/kubernetes/test/utils/image" 58 admissionapi "k8s.io/pod-security-admission/api" 59 60 "github.com/onsi/ginkgo/v2" 61 "github.com/onsi/gomega" 62 ) 63 64 const ( 65 defaultTimeout = 3 * time.Minute 66 resizeTimeout = 5 * time.Minute 67 manualResizeTimeout = 6 * time.Minute 68 scaleUpTimeout = 5 * time.Minute 69 scaleUpTriggerTimeout = 2 * time.Minute 70 scaleDownTimeout = 20 * time.Minute 71 podTimeout = 2 * time.Minute 72 nodesRecoverTimeout = 5 * time.Minute 73 rcCreationRetryTimeout = 4 * time.Minute 74 rcCreationRetryDelay = 20 * time.Second 75 makeSchedulableTimeout = 10 * time.Minute 76 makeSchedulableDelay = 20 * time.Second 77 freshStatusLimit = 20 * time.Second 78 79 gkeUpdateTimeout = 15 * time.Minute 80 gkeNodepoolNameKey = "cloud.google.com/gke-nodepool" 81 82 disabledTaint = "DisabledForAutoscalingTest" 83 criticalAddonsOnlyTaint = "CriticalAddonsOnly" 84 newNodesForScaledownTests = 2 85 unhealthyClusterThreshold = 4 86 87 caNoScaleUpStatus = "NoActivity" 88 caOngoingScaleUpStatus = "InProgress" 89 timestampFormat = "2006-01-02 15:04:05 -0700 MST" 90 91 expendablePriorityClassName = "expendable-priority" 92 highPriorityClassName = "high-priority" 93 94 gpuLabel = "cloud.google.com/gke-accelerator" 95 ) 96 97 var _ = SIGDescribe("Cluster size autoscaling", framework.WithSlow(), func() { 98 f := framework.NewDefaultFramework("autoscaling") 99 f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged 100 var c clientset.Interface 101 var nodeCount int 102 var memAllocatableMb int 103 var originalSizes map[string]int 104 105 ginkgo.BeforeEach(func(ctx context.Context) { 106 c = f.ClientSet 107 e2eskipper.SkipUnlessProviderIs("gce", "gke") 108 109 originalSizes = make(map[string]int) 110 sum := 0 111 for _, mig := range strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") { 112 size, err := framework.GroupSize(mig) 113 framework.ExpectNoError(err) 114 ginkgo.By(fmt.Sprintf("Initial size of %s: %d", mig, size)) 115 originalSizes[mig] = size 116 sum += size 117 } 118 // Give instances time to spin up 119 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, sum, scaleUpTimeout)) 120 121 nodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet) 122 framework.ExpectNoError(err) 123 nodeCount = len(nodes.Items) 124 ginkgo.By(fmt.Sprintf("Initial number of schedulable nodes: %v", nodeCount)) 125 framework.ExpectNotEqual(nodeCount, 0) 126 mem := nodes.Items[0].Status.Allocatable[v1.ResourceMemory] 127 memAllocatableMb = int((&mem).Value() / 1024 / 1024) 128 129 gomega.Expect(nodeCount).To(gomega.Equal(sum)) 130 131 if framework.ProviderIs("gke") { 132 val, err := isAutoscalerEnabled(5) 133 framework.ExpectNoError(err) 134 if !val { 135 err = enableAutoscaler("default-pool", 3, 5) 136 framework.ExpectNoError(err) 137 } 138 } 139 }) 140 141 ginkgo.AfterEach(func(ctx context.Context) { 142 e2eskipper.SkipUnlessProviderIs("gce", "gke") 143 ginkgo.By(fmt.Sprintf("Restoring initial size of the cluster")) 144 setMigSizes(originalSizes) 145 expectedNodes := 0 146 for _, size := range originalSizes { 147 expectedNodes += size 148 } 149 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, expectedNodes, scaleDownTimeout)) 150 nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) 151 framework.ExpectNoError(err) 152 153 s := time.Now() 154 makeSchedulableLoop: 155 for start := time.Now(); time.Since(start) < makeSchedulableTimeout; time.Sleep(makeSchedulableDelay) { 156 for _, n := range nodes.Items { 157 err = makeNodeSchedulable(ctx, c, &n, true) 158 switch err.(type) { 159 case CriticalAddonsOnlyError: 160 continue makeSchedulableLoop 161 default: 162 framework.ExpectNoError(err) 163 } 164 } 165 break 166 } 167 klog.Infof("Made nodes schedulable again in %v", time.Since(s).String()) 168 }) 169 170 f.It("shouldn't increase cluster size if pending pod is too large", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 171 ginkgo.By("Creating unschedulable pod") 172 ReserveMemory(ctx, f, "memory-reservation", 1, int(1.1*float64(memAllocatableMb)), false, defaultTimeout) 173 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "memory-reservation") 174 175 ginkgo.By("Waiting for scale up hoping it won't happen") 176 // Verify that the appropriate event was generated 177 eventFound := false 178 EventsLoop: 179 for start := time.Now(); time.Since(start) < scaleUpTimeout; time.Sleep(20 * time.Second) { 180 ginkgo.By("Waiting for NotTriggerScaleUp event") 181 events, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) 182 framework.ExpectNoError(err) 183 184 for _, e := range events.Items { 185 if e.InvolvedObject.Kind == "Pod" && e.Reason == "NotTriggerScaleUp" { 186 ginkgo.By("NotTriggerScaleUp event found") 187 eventFound = true 188 break EventsLoop 189 } 190 } 191 } 192 if !eventFound { 193 framework.Failf("Expected event with kind 'Pod' and reason 'NotTriggerScaleUp' not found.") 194 } 195 // Verify that cluster size is not changed 196 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 197 func(size int) bool { return size <= nodeCount }, time.Second)) 198 }) 199 200 simpleScaleUpTest := func(ctx context.Context, unready int) { 201 ReserveMemory(ctx, f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second) 202 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "memory-reservation") 203 204 // Verify that cluster size is increased 205 framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(ctx, f.ClientSet, 206 func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout, unready)) 207 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 208 } 209 210 f.It("should increase cluster size if pending pods are small", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 211 simpleScaleUpTest(ctx, 0) 212 }) 213 214 gpuType := os.Getenv("TESTED_GPU_TYPE") 215 216 f.It(fmt.Sprintf("Should scale up GPU pool from 0 [GpuType:%s]", gpuType), feature.ClusterSizeAutoscalingGpu, func(ctx context.Context) { 217 e2eskipper.SkipUnlessProviderIs("gke") 218 if gpuType == "" { 219 framework.Failf("TEST_GPU_TYPE not defined") 220 return 221 } 222 223 const gpuPoolName = "gpu-pool" 224 addGpuNodePool(gpuPoolName, gpuType, 1, 0) 225 defer deleteNodePool(gpuPoolName) 226 227 installNvidiaDriversDaemonSet(ctx, f) 228 229 ginkgo.By("Enable autoscaler") 230 framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1)) 231 defer disableAutoscaler(gpuPoolName, 0, 1) 232 gomega.Expect(getPoolNodes(ctx, f, gpuPoolName)).To(gomega.BeEmpty()) 233 234 ginkgo.By("Schedule a pod which requires GPU") 235 framework.ExpectNoError(ScheduleAnySingleGpuPod(ctx, f, "gpu-pod-rc")) 236 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "gpu-pod-rc") 237 238 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 239 func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout)) 240 gomega.Expect(getPoolNodes(ctx, f, gpuPoolName)).To(gomega.HaveLen(1)) 241 }) 242 243 f.It(fmt.Sprintf("Should scale up GPU pool from 1 [GpuType:%s]", gpuType), feature.ClusterSizeAutoscalingGpu, func(ctx context.Context) { 244 e2eskipper.SkipUnlessProviderIs("gke") 245 if gpuType == "" { 246 framework.Failf("TEST_GPU_TYPE not defined") 247 return 248 } 249 250 const gpuPoolName = "gpu-pool" 251 addGpuNodePool(gpuPoolName, gpuType, 1, 1) 252 defer deleteNodePool(gpuPoolName) 253 254 installNvidiaDriversDaemonSet(ctx, f) 255 256 ginkgo.By("Schedule a single pod which requires GPU") 257 framework.ExpectNoError(ScheduleAnySingleGpuPod(ctx, f, "gpu-pod-rc")) 258 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "gpu-pod-rc") 259 260 ginkgo.By("Enable autoscaler") 261 framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2)) 262 defer disableAutoscaler(gpuPoolName, 0, 2) 263 gomega.Expect(getPoolNodes(ctx, f, gpuPoolName)).To(gomega.HaveLen(1)) 264 265 ginkgo.By("Scale GPU deployment") 266 e2erc.ScaleRC(ctx, f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, true) 267 268 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 269 func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout)) 270 gomega.Expect(getPoolNodes(ctx, f, gpuPoolName)).To(gomega.HaveLen(2)) 271 }) 272 273 f.It(fmt.Sprintf("Should not scale GPU pool up if pod does not require GPUs [GpuType:%s]", gpuType), feature.ClusterSizeAutoscalingGpu, func(ctx context.Context) { 274 e2eskipper.SkipUnlessProviderIs("gke") 275 if gpuType == "" { 276 framework.Failf("TEST_GPU_TYPE not defined") 277 return 278 } 279 280 const gpuPoolName = "gpu-pool" 281 addGpuNodePool(gpuPoolName, gpuType, 1, 0) 282 defer deleteNodePool(gpuPoolName) 283 284 installNvidiaDriversDaemonSet(ctx, f) 285 286 ginkgo.By("Enable autoscaler") 287 framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1)) 288 defer disableAutoscaler(gpuPoolName, 0, 1) 289 gomega.Expect(getPoolNodes(ctx, f, gpuPoolName)).To(gomega.BeEmpty()) 290 291 ginkgo.By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs") 292 ReserveMemory(ctx, f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second) 293 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "memory-reservation") 294 // Verify that cluster size is increased 295 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 296 func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout)) 297 298 // Expect gpu pool to stay intact 299 gomega.Expect(getPoolNodes(ctx, f, gpuPoolName)).To(gomega.BeEmpty()) 300 }) 301 302 f.It(fmt.Sprintf("Should scale down GPU pool from 1 [GpuType:%s]", gpuType), feature.ClusterSizeAutoscalingGpu, func(ctx context.Context) { 303 e2eskipper.SkipUnlessProviderIs("gke") 304 if gpuType == "" { 305 framework.Failf("TEST_GPU_TYPE not defined") 306 return 307 } 308 309 const gpuPoolName = "gpu-pool" 310 addGpuNodePool(gpuPoolName, gpuType, 1, 1) 311 defer deleteNodePool(gpuPoolName) 312 313 installNvidiaDriversDaemonSet(ctx, f) 314 315 ginkgo.By("Schedule a single pod which requires GPU") 316 framework.ExpectNoError(ScheduleAnySingleGpuPod(ctx, f, "gpu-pod-rc")) 317 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "gpu-pod-rc") 318 319 ginkgo.By("Enable autoscaler") 320 framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1)) 321 defer disableAutoscaler(gpuPoolName, 0, 1) 322 gomega.Expect(getPoolNodes(ctx, f, gpuPoolName)).To(gomega.HaveLen(1)) 323 324 ginkgo.By("Remove the only POD requiring GPU") 325 e2erc.DeleteRCAndWaitForGC(ctx, f.ClientSet, f.Namespace.Name, "gpu-pod-rc") 326 327 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 328 func(size int) bool { return size == nodeCount }, scaleDownTimeout)) 329 gomega.Expect(getPoolNodes(ctx, f, gpuPoolName)).To(gomega.BeEmpty()) 330 }) 331 332 f.It("should increase cluster size if pending pods are small and one node is broken", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 333 e2enetwork.TestUnderTemporaryNetworkFailure(ctx, c, "default", getAnyNode(ctx, c), func(ctx context.Context) { simpleScaleUpTest(ctx, 1) }) 334 }) 335 336 f.It("shouldn't trigger additional scale-ups during processing scale-up", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 337 // Wait for the situation to stabilize - CA should be running and have up-to-date node readiness info. 338 status, err := waitForScaleUpStatus(ctx, c, func(s *scaleUpStatus) bool { 339 return s.ready == s.target && s.ready <= nodeCount 340 }, scaleUpTriggerTimeout) 341 framework.ExpectNoError(err) 342 343 unmanagedNodes := nodeCount - status.ready 344 345 ginkgo.By("Schedule more pods than can fit and wait for cluster to scale-up") 346 ReserveMemory(ctx, f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second) 347 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "memory-reservation") 348 349 status, err = waitForScaleUpStatus(ctx, c, func(s *scaleUpStatus) bool { 350 return s.status == caOngoingScaleUpStatus 351 }, scaleUpTriggerTimeout) 352 framework.ExpectNoError(err) 353 target := status.target 354 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 355 356 ginkgo.By("Expect no more scale-up to be happening after all pods are scheduled") 357 358 // wait for a while until scale-up finishes; we cannot read CA status immediately 359 // after pods are scheduled as status config map is updated by CA once every loop iteration 360 status, err = waitForScaleUpStatus(ctx, c, func(s *scaleUpStatus) bool { 361 return s.status == caNoScaleUpStatus 362 }, 2*freshStatusLimit) 363 framework.ExpectNoError(err) 364 365 if status.target != target { 366 klog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target) 367 } 368 gomega.Expect(status.timestamp.Add(freshStatusLimit)).To(gomega.BeTemporally(">=", time.Now())) 369 gomega.Expect(status.status).To(gomega.Equal(caNoScaleUpStatus)) 370 gomega.Expect(status.ready).To(gomega.Equal(status.target)) 371 nodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet) 372 framework.ExpectNoError(err) 373 gomega.Expect(nodes.Items).To(gomega.HaveLen(status.target + unmanagedNodes)) 374 }) 375 376 f.It("should increase cluster size if pending pods are small and there is another node pool that is not autoscaled", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 377 e2eskipper.SkipUnlessProviderIs("gke") 378 379 ginkgo.By("Creating new node-pool with n1-standard-4 machines") 380 const extraPoolName = "extra-pool" 381 addNodePool(extraPoolName, "n1-standard-4", 1) 382 defer deleteNodePool(extraPoolName) 383 extraNodes := getPoolInitialSize(extraPoolName) 384 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount+extraNodes, resizeTimeout)) 385 // We wait for nodes to become schedulable to make sure the new nodes 386 // will be returned by getPoolNodes below. 387 framework.ExpectNoError(e2enode.WaitForAllNodesSchedulable(ctx, c, resizeTimeout)) 388 klog.Infof("Not enabling cluster autoscaler for the node pool (on purpose).") 389 390 ginkgo.By("Getting memory available on new nodes, so we can account for it when creating RC") 391 nodes := getPoolNodes(ctx, f, extraPoolName) 392 gomega.Expect(nodes).To(gomega.HaveLen(extraNodes)) 393 extraMemMb := 0 394 for _, node := range nodes { 395 mem := node.Status.Allocatable[v1.ResourceMemory] 396 extraMemMb += int((&mem).Value() / 1024 / 1024) 397 } 398 399 ginkgo.By("Reserving 0.1x more memory than the cluster holds to trigger scale up") 400 totalMemoryReservation := int(1.1 * float64(nodeCount*memAllocatableMb+extraMemMb)) 401 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "memory-reservation") 402 ReserveMemory(ctx, f, "memory-reservation", 100, totalMemoryReservation, false, defaultTimeout) 403 404 // Verify, that cluster size is increased 405 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 406 func(size int) bool { return size >= nodeCount+extraNodes+1 }, scaleUpTimeout)) 407 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 408 }) 409 410 f.It("should disable node pool autoscaling", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 411 e2eskipper.SkipUnlessProviderIs("gke") 412 413 ginkgo.By("Creating new node-pool with n1-standard-4 machines") 414 const extraPoolName = "extra-pool" 415 addNodePool(extraPoolName, "n1-standard-4", 1) 416 defer deleteNodePool(extraPoolName) 417 extraNodes := getPoolInitialSize(extraPoolName) 418 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount+extraNodes, resizeTimeout)) 419 framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2)) 420 framework.ExpectNoError(disableAutoscaler(extraPoolName, 1, 2)) 421 }) 422 423 f.It("should increase cluster size if pods are pending due to host port conflict", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 424 scheduling.CreateHostPortPods(ctx, f, "host-port", nodeCount+2, false) 425 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "host-port") 426 427 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 428 func(size int) bool { return size >= nodeCount+2 }, scaleUpTimeout)) 429 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 430 }) 431 432 f.It("should increase cluster size if pods are pending due to pod anti-affinity", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 433 pods := nodeCount 434 newPods := 2 435 labels := map[string]string{ 436 "anti-affinity": "yes", 437 } 438 ginkgo.By("starting a pod with anti-affinity on each node") 439 framework.ExpectNoError(runAntiAffinityPods(ctx, f, f.Namespace.Name, pods, "some-pod", labels, labels)) 440 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "some-pod") 441 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 442 443 ginkgo.By("scheduling extra pods with anti-affinity to existing ones") 444 framework.ExpectNoError(runAntiAffinityPods(ctx, f, f.Namespace.Name, newPods, "extra-pod", labels, labels)) 445 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "extra-pod") 446 447 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 448 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount+newPods, scaleUpTimeout)) 449 }) 450 451 f.It("should increase cluster size if pod requesting EmptyDir volume is pending", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 452 ginkgo.By("creating pods") 453 pods := nodeCount 454 newPods := 1 455 labels := map[string]string{ 456 "anti-affinity": "yes", 457 } 458 framework.ExpectNoError(runAntiAffinityPods(ctx, f, f.Namespace.Name, pods, "some-pod", labels, labels)) 459 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "some-pod") 460 461 ginkgo.By("waiting for all pods before triggering scale up") 462 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 463 464 ginkgo.By("creating a pod requesting EmptyDir") 465 framework.ExpectNoError(runVolumeAntiAffinityPods(ctx, f, f.Namespace.Name, newPods, "extra-pod", labels, labels, emptyDirVolumes)) 466 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "extra-pod") 467 468 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 469 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount+newPods, scaleUpTimeout)) 470 }) 471 472 f.It("should increase cluster size if pod requesting volume is pending", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 473 e2eskipper.SkipUnlessProviderIs("gce", "gke") 474 475 volumeLabels := labels.Set{ 476 e2epv.VolumeSelectorKey: f.Namespace.Name, 477 } 478 selector := metav1.SetAsLabelSelector(volumeLabels) 479 480 ginkgo.By("creating volume & pvc") 481 diskName, err := e2epv.CreatePDWithRetry(ctx) 482 framework.ExpectNoError(err) 483 pvConfig := e2epv.PersistentVolumeConfig{ 484 NamePrefix: "gce-", 485 Labels: volumeLabels, 486 PVSource: v1.PersistentVolumeSource{ 487 GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{ 488 PDName: diskName, 489 FSType: "ext3", 490 ReadOnly: false, 491 }, 492 }, 493 Prebind: nil, 494 } 495 emptyStorageClass := "" 496 pvcConfig := e2epv.PersistentVolumeClaimConfig{ 497 Selector: selector, 498 StorageClassName: &emptyStorageClass, 499 } 500 501 pv, pvc, err := e2epv.CreatePVPVC(ctx, c, f.Timeouts, pvConfig, pvcConfig, f.Namespace.Name, false) 502 framework.ExpectNoError(err) 503 framework.ExpectNoError(e2epv.WaitOnPVandPVC(ctx, c, f.Timeouts, f.Namespace.Name, pv, pvc)) 504 505 defer func() { 506 errs := e2epv.PVPVCCleanup(ctx, c, f.Namespace.Name, pv, pvc) 507 if len(errs) > 0 { 508 framework.Failf("failed to delete PVC and/or PV. Errors: %v", utilerrors.NewAggregate(errs)) 509 } 510 pv, pvc = nil, nil 511 if diskName != "" { 512 framework.ExpectNoError(e2epv.DeletePDWithRetry(ctx, diskName)) 513 } 514 }() 515 516 ginkgo.By("creating pods") 517 pods := nodeCount 518 labels := map[string]string{ 519 "anti-affinity": "yes", 520 } 521 framework.ExpectNoError(runAntiAffinityPods(ctx, f, f.Namespace.Name, pods, "some-pod", labels, labels)) 522 ginkgo.DeferCleanup(func(ctx context.Context) { 523 e2erc.DeleteRCAndWaitForGC(ctx, f.ClientSet, f.Namespace.Name, "some-pod") 524 klog.Infof("RC and pods not using volume deleted") 525 }) 526 527 ginkgo.By("waiting for all pods before triggering scale up") 528 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 529 530 ginkgo.By("creating a pod requesting PVC") 531 pvcPodName := "pvc-pod" 532 newPods := 1 533 volumes := buildVolumes(pv, pvc) 534 framework.ExpectNoError(runVolumeAntiAffinityPods(ctx, f, f.Namespace.Name, newPods, pvcPodName, labels, labels, volumes)) 535 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, pvcPodName) 536 ginkgo.DeferCleanup(waitForAllCaPodsReadyInNamespace, f, c) 537 538 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 539 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount+newPods, scaleUpTimeout)) 540 }) 541 542 f.It("should add node to the particular mig", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 543 labelKey := "cluster-autoscaling-test.special-node" 544 labelValue := "true" 545 546 ginkgo.By("Finding the smallest MIG") 547 minMig := "" 548 minSize := nodeCount 549 for mig, size := range originalSizes { 550 if size <= minSize { 551 minMig = mig 552 minSize = size 553 } 554 } 555 556 if minSize == 0 { 557 newSizes := make(map[string]int) 558 for mig, size := range originalSizes { 559 newSizes[mig] = size 560 } 561 newSizes[minMig] = 1 562 setMigSizes(newSizes) 563 } 564 565 removeLabels := func(nodesToClean sets.String) { 566 ginkgo.By("Removing labels from nodes") 567 for node := range nodesToClean { 568 e2enode.RemoveLabelOffNode(c, node, labelKey) 569 } 570 } 571 572 nodes, err := framework.GetGroupNodes(minMig) 573 framework.ExpectNoError(err) 574 nodesSet := sets.NewString(nodes...) 575 defer removeLabels(nodesSet) 576 ginkgo.By(fmt.Sprintf("Annotating nodes of the smallest MIG(%s): %v", minMig, nodes)) 577 578 for node := range nodesSet { 579 e2enode.AddOrUpdateLabelOnNode(c, node, labelKey, labelValue) 580 } 581 582 err = scheduling.CreateNodeSelectorPods(ctx, f, "node-selector", minSize+1, map[string]string{labelKey: labelValue}, false) 583 framework.ExpectNoError(err) 584 ginkgo.By("Waiting for new node to appear and annotating it") 585 framework.WaitForGroupSize(minMig, int32(minSize+1)) 586 // Verify that cluster size is increased 587 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 588 func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout)) 589 590 newNodes, err := framework.GetGroupNodes(minMig) 591 framework.ExpectNoError(err) 592 newNodesSet := sets.NewString(newNodes...) 593 newNodesSet.Delete(nodes...) 594 if len(newNodesSet) > 1 { 595 ginkgo.By(fmt.Sprintf("Spotted following new nodes in %s: %v", minMig, newNodesSet)) 596 klog.Infof("Usually only 1 new node is expected, investigating") 597 klog.Infof("Kubectl:%s\n", e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "get", "nodes", "-o", "json")) 598 if output, err := exec.Command("gcloud", "compute", "instances", "list", 599 "--project="+framework.TestContext.CloudConfig.ProjectID, 600 "--zone="+framework.TestContext.CloudConfig.Zone).Output(); err == nil { 601 klog.Infof("Gcloud compute instances list: %s", output) 602 } else { 603 klog.Errorf("Failed to get instances list: %v", err) 604 } 605 606 for newNode := range newNodesSet { 607 if output, err := execCmd("gcloud", "compute", "instances", "describe", 608 newNode, 609 "--project="+framework.TestContext.CloudConfig.ProjectID, 610 "--zone="+framework.TestContext.CloudConfig.Zone).Output(); err == nil { 611 klog.Infof("Gcloud compute instances describe: %s", output) 612 } else { 613 klog.Errorf("Failed to get instances describe: %v", err) 614 } 615 } 616 617 // TODO: possibly remove broken node from newNodesSet to prevent removeLabel from crashing. 618 // However at this moment we DO WANT it to crash so that we don't check all test runs for the 619 // rare behavior, but only the broken ones. 620 } 621 ginkgo.By(fmt.Sprintf("New nodes: %v\n", newNodesSet)) 622 registeredNodes := sets.NewString() 623 for nodeName := range newNodesSet { 624 node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) 625 if err == nil && node != nil { 626 registeredNodes.Insert(nodeName) 627 } else { 628 klog.Errorf("Failed to get node %v: %v", nodeName, err) 629 } 630 } 631 ginkgo.By(fmt.Sprintf("Setting labels for registered new nodes: %v", registeredNodes.List())) 632 for node := range registeredNodes { 633 e2enode.AddOrUpdateLabelOnNode(c, node, labelKey, labelValue) 634 } 635 636 defer removeLabels(registeredNodes) 637 638 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 639 framework.ExpectNoError(e2erc.DeleteRCAndWaitForGC(ctx, f.ClientSet, f.Namespace.Name, "node-selector")) 640 }) 641 642 f.It("should scale up correct target pool", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 643 e2eskipper.SkipUnlessProviderIs("gke") 644 645 ginkgo.By("Creating new node-pool with n1-standard-4 machines") 646 const extraPoolName = "extra-pool" 647 addNodePool(extraPoolName, "n1-standard-4", 1) 648 defer deleteNodePool(extraPoolName) 649 extraNodes := getPoolInitialSize(extraPoolName) 650 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount+extraNodes, resizeTimeout)) 651 framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2)) 652 defer disableAutoscaler(extraPoolName, 1, 2) 653 654 extraPods := extraNodes + 1 655 totalMemoryReservation := int(float64(extraPods) * 1.5 * float64(memAllocatableMb)) 656 ginkgo.By(fmt.Sprintf("Creating rc with %v pods too big to fit default-pool but fitting extra-pool", extraPods)) 657 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "memory-reservation") 658 ReserveMemory(ctx, f, "memory-reservation", extraPods, totalMemoryReservation, false, defaultTimeout) 659 660 // Apparently GKE master is restarted couple minutes after the node pool is added 661 // resetting all the timers in scale down code. Adding 5 extra minutes to workaround 662 // this issue. 663 // TODO: Remove the extra time when GKE restart is fixed. 664 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount+extraNodes+1, scaleUpTimeout+5*time.Minute)) 665 }) 666 667 simpleScaleDownTest := func(ctx context.Context, unready int) { 668 err := addKubeSystemPdbs(ctx, f) 669 framework.ExpectNoError(err) 670 671 ginkgo.By("Manually increase cluster size") 672 increasedSize := 0 673 newSizes := make(map[string]int) 674 for key, val := range originalSizes { 675 newSizes[key] = val + 2 + unready 676 increasedSize += val + 2 + unready 677 } 678 setMigSizes(newSizes) 679 framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(ctx, f.ClientSet, 680 func(size int) bool { return size >= increasedSize }, manualResizeTimeout, unready)) 681 682 ginkgo.By("Some node should be removed") 683 framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(ctx, f.ClientSet, 684 func(size int) bool { return size < increasedSize }, scaleDownTimeout, unready)) 685 } 686 687 f.It("should correctly scale down after a node is not needed", feature.ClusterSizeAutoscalingScaleDown, 688 func(ctx context.Context) { simpleScaleDownTest(ctx, 0) }) 689 690 f.It("should correctly scale down after a node is not needed and one node is broken", feature.ClusterSizeAutoscalingScaleDown, func(ctx context.Context) { 691 e2eskipper.SkipUnlessSSHKeyPresent() 692 e2enetwork.TestUnderTemporaryNetworkFailure(ctx, c, "default", getAnyNode(ctx, c), func(ctx context.Context) { simpleScaleDownTest(ctx, 1) }) 693 }) 694 695 f.It("should correctly scale down after a node is not needed when there is non autoscaled pool", feature.ClusterSizeAutoscalingScaleDown, func(ctx context.Context) { 696 e2eskipper.SkipUnlessProviderIs("gke") 697 698 increasedSize := manuallyIncreaseClusterSize(ctx, f, originalSizes) 699 700 const extraPoolName = "extra-pool" 701 addNodePool(extraPoolName, "n1-standard-1", 3) 702 defer deleteNodePool(extraPoolName) 703 extraNodes := getPoolInitialSize(extraPoolName) 704 705 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 706 func(size int) bool { return size >= increasedSize+extraNodes }, scaleUpTimeout)) 707 708 ginkgo.By("Some node should be removed") 709 // Apparently GKE master is restarted couple minutes after the node pool is added 710 // resetting all the timers in scale down code. Adding 10 extra minutes to workaround 711 // this issue. 712 // TODO: Remove the extra time when GKE restart is fixed. 713 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 714 func(size int) bool { return size < increasedSize+extraNodes }, scaleDownTimeout+10*time.Minute)) 715 }) 716 717 f.It("should be able to scale down when rescheduling a pod is required and pdb allows for it", feature.ClusterSizeAutoscalingScaleDown, func(ctx context.Context) { 718 runDrainTest(ctx, f, originalSizes, f.Namespace.Name, 1, 1, func(increasedSize int) { 719 ginkgo.By("Some node should be removed") 720 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 721 func(size int) bool { return size < increasedSize }, scaleDownTimeout)) 722 }) 723 }) 724 725 f.It("shouldn't be able to scale down when rescheduling a pod is required, but pdb doesn't allow drain", feature.ClusterSizeAutoscalingScaleDown, func(ctx context.Context) { 726 runDrainTest(ctx, f, originalSizes, f.Namespace.Name, 1, 0, func(increasedSize int) { 727 ginkgo.By("No nodes should be removed") 728 time.Sleep(scaleDownTimeout) 729 nodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet) 730 framework.ExpectNoError(err) 731 gomega.Expect(nodes.Items).To(gomega.HaveLen(increasedSize)) 732 }) 733 }) 734 735 f.It("should be able to scale down by draining multiple pods one by one as dictated by pdb", feature.ClusterSizeAutoscalingScaleDown, func(ctx context.Context) { 736 runDrainTest(ctx, f, originalSizes, f.Namespace.Name, 2, 1, func(increasedSize int) { 737 ginkgo.By("Some node should be removed") 738 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 739 func(size int) bool { return size < increasedSize }, scaleDownTimeout)) 740 }) 741 }) 742 743 f.It("should be able to scale down by draining system pods with pdb", feature.ClusterSizeAutoscalingScaleDown, func(ctx context.Context) { 744 runDrainTest(ctx, f, originalSizes, "kube-system", 2, 1, func(increasedSize int) { 745 ginkgo.By("Some node should be removed") 746 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 747 func(size int) bool { return size < increasedSize }, scaleDownTimeout)) 748 }) 749 }) 750 751 f.It("Should be able to scale a node group up from 0", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 752 // Provider-specific setup 753 if framework.ProviderIs("gke") { 754 // GKE-specific setup 755 ginkgo.By("Add a new node pool with 0 nodes and min size 0") 756 const extraPoolName = "extra-pool" 757 addNodePool(extraPoolName, "n1-standard-4", 0) 758 defer deleteNodePool(extraPoolName) 759 framework.ExpectNoError(enableAutoscaler(extraPoolName, 0, 1)) 760 defer disableAutoscaler(extraPoolName, 0, 1) 761 } else { 762 // on GCE, run only if there are already at least 2 node groups 763 e2eskipper.SkipUnlessAtLeast(len(originalSizes), 2, "At least 2 node groups are needed for scale-to-0 tests") 764 765 ginkgo.By("Manually scale smallest node group to 0") 766 minMig := "" 767 minSize := nodeCount 768 for mig, size := range originalSizes { 769 if size <= minSize { 770 minMig = mig 771 minSize = size 772 } 773 } 774 framework.ExpectNoError(framework.ResizeGroup(minMig, int32(0))) 775 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount-minSize, resizeTimeout)) 776 } 777 778 ginkgo.By("Make remaining nodes unschedulable") 779 nodes, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{FieldSelector: fields.Set{ 780 "spec.unschedulable": "false", 781 }.AsSelector().String()}) 782 framework.ExpectNoError(err) 783 784 for _, node := range nodes.Items { 785 err = makeNodeUnschedulable(ctx, f.ClientSet, &node) 786 787 n := node 788 ginkgo.DeferCleanup(makeNodeSchedulable, f.ClientSet, &n, false) 789 790 framework.ExpectNoError(err) 791 } 792 793 ginkgo.By("Run a scale-up test") 794 ReserveMemory(ctx, f, "memory-reservation", 1, 100, false, 1*time.Second) 795 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "memory-reservation") 796 797 // Verify that cluster size is increased 798 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 799 func(size int) bool { return size >= len(nodes.Items)+1 }, scaleUpTimeout)) 800 framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(ctx, f, c)) 801 }) 802 803 // Scale to 0 test is split into two functions (for GKE & GCE.) 804 // The reason for it is that scenario is exactly the same, 805 // but setup & verification use different APIs. 806 // 807 // Scenario: 808 // (GKE only) add an extra node pool with size 1 & enable autoscaling for it 809 // (GCE only) find the smallest MIG & resize it to 1 810 // manually drain the single node from this node pool/MIG 811 // wait for cluster size to decrease 812 // verify the targeted node pool/MIG is of size 0 813 gkeScaleToZero := func(ctx context.Context) { 814 // GKE-specific setup 815 ginkgo.By("Add a new node pool with size 1 and min size 0") 816 const extraPoolName = "extra-pool" 817 addNodePool(extraPoolName, "n1-standard-4", 1) 818 defer deleteNodePool(extraPoolName) 819 extraNodes := getPoolInitialSize(extraPoolName) 820 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount+extraNodes, resizeTimeout)) 821 framework.ExpectNoError(enableAutoscaler(extraPoolName, 0, 1)) 822 defer disableAutoscaler(extraPoolName, 0, 1) 823 824 ngNodes := getPoolNodes(ctx, f, extraPoolName) 825 gomega.Expect(ngNodes).To(gomega.HaveLen(extraNodes)) 826 for _, node := range ngNodes { 827 ginkgo.By(fmt.Sprintf("Target node for scale-down: %s", node.Name)) 828 } 829 830 for _, node := range ngNodes { 831 drainNode(ctx, f, node) 832 } 833 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 834 func(size int) bool { return size <= nodeCount }, scaleDownTimeout)) 835 836 // GKE-specific check 837 newSize := getPoolSize(ctx, f, extraPoolName) 838 gomega.Expect(newSize).To(gomega.BeEmpty()) 839 } 840 841 gceScaleToZero := func(ctx context.Context) { 842 // non-GKE only 843 ginkgo.By("Find smallest node group and manually scale it to a single node") 844 minMig := "" 845 minSize := nodeCount 846 for mig, size := range originalSizes { 847 if size <= minSize { 848 minMig = mig 849 minSize = size 850 } 851 } 852 framework.ExpectNoError(framework.ResizeGroup(minMig, int32(1))) 853 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, nodeCount-minSize+1, resizeTimeout)) 854 ngNodes, err := framework.GetGroupNodes(minMig) 855 framework.ExpectNoError(err) 856 if len(ngNodes) != 1 { 857 framework.Failf("Expected one node, got instead: %v", ngNodes) 858 } 859 node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, ngNodes[0], metav1.GetOptions{}) 860 ginkgo.By(fmt.Sprintf("Target node for scale-down: %s", node.Name)) 861 framework.ExpectNoError(err) 862 863 // this part is identical 864 drainNode(ctx, f, node) 865 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 866 func(size int) bool { return size < nodeCount-minSize+1 }, scaleDownTimeout)) 867 868 // non-GKE only 869 newSize, err := framework.GroupSize(minMig) 870 framework.ExpectNoError(err) 871 gomega.Expect(newSize).To(gomega.BeEmpty()) 872 } 873 874 f.It("Should be able to scale a node group down to 0", feature.ClusterSizeAutoscalingScaleDown, func(ctx context.Context) { 875 if framework.ProviderIs("gke") { // In GKE, we can just add a node pool 876 gkeScaleToZero(ctx) 877 } else if len(originalSizes) >= 2 { 878 gceScaleToZero(ctx) 879 } else { 880 e2eskipper.Skipf("At least 2 node groups are needed for scale-to-0 tests") 881 } 882 }) 883 884 f.It("Shouldn't perform scale up operation and should list unhealthy status if most of the cluster is broken", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 885 e2eskipper.SkipUnlessSSHKeyPresent() 886 887 clusterSize := nodeCount 888 for clusterSize < unhealthyClusterThreshold+1 { 889 clusterSize = manuallyIncreaseClusterSize(ctx, f, originalSizes) 890 } 891 892 // If new nodes are disconnected too soon, they'll be considered not started 893 // instead of unready, and cluster won't be considered unhealthy. 894 // 895 // More precisely, Cluster Autoscaler will never consider a 896 // node to be unhealthy unless it was created more than 15m 897 // ago. Within that 15m window, it'll assume node is just 898 // starting and not unhealthy. 899 // 900 // However, waiting for 15m would allow scale down to kick in 901 // and remove recently added nodes, so here we just wait 2m for 902 // nodes to come up (1m should be enough, another 1m added as 903 // an extra buffer. Then, we break connectivity to a subset of 904 // nodes and only after that we wait for 15m, since scale down 905 // shouldn't happen when the cluster is unhealthy. 906 time.Sleep(2 * time.Minute) 907 908 ginkgo.By("Block network connectivity to some nodes to simulate unhealthy cluster") 909 nodesToBreakCount := int(math.Ceil(math.Max(float64(unhealthyClusterThreshold), 0.5*float64(clusterSize)))) 910 nodes, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{FieldSelector: fields.Set{ 911 "spec.unschedulable": "false", 912 }.AsSelector().String()}) 913 framework.ExpectNoError(err) 914 if nodesToBreakCount > len(nodes.Items) { 915 framework.Failf("Expected at most %d nodes to break, got %d", len(nodes.Items), nodesToBreakCount) 916 } 917 nodesToBreak := nodes.Items[:nodesToBreakCount] 918 919 // TestUnderTemporaryNetworkFailure only removes connectivity to a single node, 920 // and accepts func() callback. This is expanding the loop to recursive call 921 // to avoid duplicating TestUnderTemporaryNetworkFailure 922 var testFunction func(ctx context.Context) 923 testFunction = func(ctx context.Context) { 924 if len(nodesToBreak) > 0 { 925 ntb := &nodesToBreak[0] 926 nodesToBreak = nodesToBreak[1:] 927 e2enetwork.TestUnderTemporaryNetworkFailure(ctx, c, "default", ntb, testFunction) 928 } else { 929 ReserveMemory(ctx, f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, defaultTimeout) 930 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, f.Namespace.Name, "memory-reservation") 931 // Wait for 15m to ensure Cluster Autoscaler won't consider broken nodes as still starting. 932 time.Sleep(15 * time.Minute) 933 currentNodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet) 934 framework.ExpectNoError(err) 935 framework.Logf("Currently available nodes: %v, nodes available at the start of test: %v, disabled nodes: %v", len(currentNodes.Items), len(nodes.Items), nodesToBreakCount) 936 gomega.Expect(currentNodes.Items).To(gomega.HaveLen(len(nodes.Items) - nodesToBreakCount)) 937 status, err := getClusterwideStatus(ctx, c) 938 framework.Logf("Clusterwide status: %v", status) 939 framework.ExpectNoError(err) 940 gomega.Expect(status).To(gomega.Equal("Unhealthy")) 941 } 942 } 943 testFunction(ctx) 944 // Give nodes time to recover from network failure 945 framework.ExpectNoError(e2enode.WaitForReadyNodes(ctx, c, len(nodes.Items), nodesRecoverTimeout)) 946 }) 947 948 f.It("shouldn't scale up when expendable pod is created", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 949 createPriorityClasses(ctx, f) 950 // Create nodesCountAfterResize+1 pods allocating 0.7 allocatable on present nodes. One more node will have to be created. 951 ginkgo.DeferCleanup(ReserveMemoryWithPriority, f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), false, time.Second, expendablePriorityClassName) 952 ginkgo.By(fmt.Sprintf("Waiting for scale up hoping it won't happen, sleep for %s", scaleUpTimeout.String())) 953 time.Sleep(scaleUpTimeout) 954 // Verify that cluster size is not changed 955 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 956 func(size int) bool { return size == nodeCount }, time.Second)) 957 }) 958 959 f.It("should scale up when non expendable pod is created", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 960 createPriorityClasses(ctx, f) 961 // Create nodesCountAfterResize+1 pods allocating 0.7 allocatable on present nodes. One more node will have to be created. 962 cleanupFunc := ReserveMemoryWithPriority(ctx, f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, highPriorityClassName) 963 defer cleanupFunc() 964 // Verify that cluster size is not changed 965 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 966 func(size int) bool { return size > nodeCount }, time.Second)) 967 }) 968 969 f.It("shouldn't scale up when expendable pod is preempted", feature.ClusterSizeAutoscalingScaleUp, func(ctx context.Context) { 970 createPriorityClasses(ctx, f) 971 // Create nodesCountAfterResize pods allocating 0.7 allocatable on present nodes - one pod per node. 972 cleanupFunc1 := ReserveMemoryWithPriority(ctx, f, "memory-reservation1", nodeCount, int(float64(nodeCount)*float64(0.7)*float64(memAllocatableMb)), true, defaultTimeout, expendablePriorityClassName) 973 defer cleanupFunc1() 974 // Create nodesCountAfterResize pods allocating 0.7 allocatable on present nodes - one pod per node. Pods created here should preempt pods created above. 975 cleanupFunc2 := ReserveMemoryWithPriority(ctx, f, "memory-reservation2", nodeCount, int(float64(nodeCount)*float64(0.7)*float64(memAllocatableMb)), true, defaultTimeout, highPriorityClassName) 976 defer cleanupFunc2() 977 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 978 func(size int) bool { return size == nodeCount }, time.Second)) 979 }) 980 981 f.It("should scale down when expendable pod is running", feature.ClusterSizeAutoscalingScaleDown, func(ctx context.Context) { 982 createPriorityClasses(ctx, f) 983 increasedSize := manuallyIncreaseClusterSize(ctx, f, originalSizes) 984 // Create increasedSize pods allocating 0.7 allocatable on present nodes - one pod per node. 985 cleanupFunc := ReserveMemoryWithPriority(ctx, f, "memory-reservation", increasedSize, int(float64(increasedSize)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, expendablePriorityClassName) 986 defer cleanupFunc() 987 ginkgo.By("Waiting for scale down") 988 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 989 func(size int) bool { return size == nodeCount }, scaleDownTimeout)) 990 }) 991 992 f.It("shouldn't scale down when non expendable pod is running", feature.ClusterSizeAutoscalingScaleDown, func(ctx context.Context) { 993 createPriorityClasses(ctx, f) 994 increasedSize := manuallyIncreaseClusterSize(ctx, f, originalSizes) 995 // Create increasedSize pods allocating 0.7 allocatable on present nodes - one pod per node. 996 cleanupFunc := ReserveMemoryWithPriority(ctx, f, "memory-reservation", increasedSize, int(float64(increasedSize)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, highPriorityClassName) 997 defer cleanupFunc() 998 ginkgo.By(fmt.Sprintf("Waiting for scale down hoping it won't happen, sleep for %s", scaleDownTimeout.String())) 999 time.Sleep(scaleDownTimeout) 1000 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, 1001 func(size int) bool { return size == increasedSize }, time.Second)) 1002 }) 1003 }) 1004 1005 func installNvidiaDriversDaemonSet(ctx context.Context, f *framework.Framework) { 1006 ginkgo.By("Add daemonset which installs nvidia drivers") 1007 1008 dsYamlURL := "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml" 1009 framework.Logf("Using %v", dsYamlURL) 1010 // Creates the DaemonSet that installs Nvidia Drivers. 1011 ds, err := e2emanifest.DaemonSetFromURL(ctx, dsYamlURL) 1012 framework.ExpectNoError(err) 1013 ds.Namespace = f.Namespace.Name 1014 1015 _, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{}) 1016 framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset") 1017 } 1018 1019 func execCmd(args ...string) *exec.Cmd { 1020 klog.Infof("Executing: %s", strings.Join(args, " ")) 1021 return exec.Command(args[0], args[1:]...) 1022 } 1023 1024 func runDrainTest(ctx context.Context, f *framework.Framework, migSizes map[string]int, namespace string, podsPerNode, pdbSize int, verifyFunction func(int)) { 1025 increasedSize := manuallyIncreaseClusterSize(ctx, f, migSizes) 1026 1027 nodes, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{FieldSelector: fields.Set{ 1028 "spec.unschedulable": "false", 1029 }.AsSelector().String()}) 1030 framework.ExpectNoError(err) 1031 numPods := len(nodes.Items) * podsPerNode 1032 testID := string(uuid.NewUUID()) // So that we can label and find pods 1033 labelMap := map[string]string{"test_id": testID} 1034 framework.ExpectNoError(runReplicatedPodOnEachNode(ctx, f, nodes.Items, namespace, podsPerNode, "reschedulable-pods", labelMap, 0)) 1035 1036 ginkgo.DeferCleanup(e2erc.DeleteRCAndWaitForGC, f.ClientSet, namespace, "reschedulable-pods") 1037 1038 ginkgo.By("Create a PodDisruptionBudget") 1039 minAvailable := intstr.FromInt32(int32(numPods - pdbSize)) 1040 pdb := &policyv1.PodDisruptionBudget{ 1041 ObjectMeta: metav1.ObjectMeta{ 1042 Name: "test_pdb", 1043 Namespace: namespace, 1044 }, 1045 Spec: policyv1.PodDisruptionBudgetSpec{ 1046 Selector: &metav1.LabelSelector{MatchLabels: labelMap}, 1047 MinAvailable: &minAvailable, 1048 }, 1049 } 1050 _, err = f.ClientSet.PolicyV1().PodDisruptionBudgets(namespace).Create(ctx, pdb, metav1.CreateOptions{}) 1051 1052 ginkgo.DeferCleanup(framework.IgnoreNotFound(f.ClientSet.PolicyV1().PodDisruptionBudgets(namespace).Delete), pdb.Name, metav1.DeleteOptions{}) 1053 1054 framework.ExpectNoError(err) 1055 verifyFunction(increasedSize) 1056 } 1057 1058 func getGkeAPIEndpoint() string { 1059 gkeAPIEndpoint := os.Getenv("CLOUDSDK_API_ENDPOINT_OVERRIDES_CONTAINER") 1060 if gkeAPIEndpoint == "" { 1061 gkeAPIEndpoint = "https://test-container.sandbox.googleapis.com" 1062 } 1063 if strings.HasSuffix(gkeAPIEndpoint, "/") { 1064 gkeAPIEndpoint = gkeAPIEndpoint[:len(gkeAPIEndpoint)-1] 1065 } 1066 return gkeAPIEndpoint 1067 } 1068 1069 func getGKEURL(apiVersion string, suffix string) string { 1070 out, err := execCmd("gcloud", "auth", "print-access-token").Output() 1071 framework.ExpectNoError(err) 1072 token := strings.Replace(string(out), "\n", "", -1) 1073 1074 return fmt.Sprintf("%s/%s/%s?access_token=%s", 1075 getGkeAPIEndpoint(), 1076 apiVersion, 1077 suffix, 1078 token) 1079 } 1080 1081 func getGKEClusterURL(apiVersion string) string { 1082 if isRegionalCluster() { 1083 // TODO(bskiba): Use locations API for all clusters once it's graduated to v1. 1084 return getGKEURL(apiVersion, fmt.Sprintf("projects/%s/locations/%s/clusters/%s", 1085 framework.TestContext.CloudConfig.ProjectID, 1086 framework.TestContext.CloudConfig.Region, 1087 framework.TestContext.CloudConfig.Cluster)) 1088 } 1089 return getGKEURL(apiVersion, fmt.Sprintf("projects/%s/zones/%s/clusters/%s", 1090 framework.TestContext.CloudConfig.ProjectID, 1091 framework.TestContext.CloudConfig.Zone, 1092 framework.TestContext.CloudConfig.Cluster)) 1093 } 1094 1095 func getCluster(apiVersion string) (string, error) { 1096 resp, err := http.Get(getGKEClusterURL(apiVersion)) 1097 if err != nil { 1098 return "", err 1099 } 1100 defer resp.Body.Close() 1101 body, err := io.ReadAll(resp.Body) 1102 if err != nil { 1103 return "", err 1104 } 1105 if resp.StatusCode != http.StatusOK { 1106 return "", fmt.Errorf("error: %s %s", resp.Status, body) 1107 } 1108 1109 return string(body), nil 1110 } 1111 1112 func isAutoscalerEnabled(expectedMaxNodeCountInTargetPool int) (bool, error) { 1113 apiVersion := "v1" 1114 if isRegionalCluster() { 1115 apiVersion = "v1beta1" 1116 } 1117 strBody, err := getCluster(apiVersion) 1118 if err != nil { 1119 return false, err 1120 } 1121 if strings.Contains(strBody, "\"maxNodeCount\": "+strconv.Itoa(expectedMaxNodeCountInTargetPool)) { 1122 return true, nil 1123 } 1124 return false, nil 1125 } 1126 1127 func getClusterLocation() string { 1128 if isRegionalCluster() { 1129 return "--region=" + framework.TestContext.CloudConfig.Region 1130 } 1131 return "--zone=" + framework.TestContext.CloudConfig.Zone 1132 } 1133 1134 func getGcloudCommandFromTrack(commandTrack string, args []string) []string { 1135 command := []string{"gcloud"} 1136 if commandTrack == "beta" || commandTrack == "alpha" { 1137 command = append(command, commandTrack) 1138 } 1139 command = append(command, args...) 1140 command = append(command, getClusterLocation()) 1141 command = append(command, "--project="+framework.TestContext.CloudConfig.ProjectID) 1142 return command 1143 } 1144 1145 func getGcloudCommand(args []string) []string { 1146 track := "" 1147 if isRegionalCluster() { 1148 track = "beta" 1149 } 1150 return getGcloudCommandFromTrack(track, args) 1151 } 1152 1153 func isRegionalCluster() bool { 1154 // TODO(bskiba): Use an appropriate indicator that the cluster is regional. 1155 return framework.TestContext.CloudConfig.MultiZone 1156 } 1157 1158 func enableAutoscaler(nodePool string, minCount, maxCount int) error { 1159 klog.Infof("Using gcloud to enable autoscaling for pool %s", nodePool) 1160 1161 args := []string{"container", "clusters", "update", framework.TestContext.CloudConfig.Cluster, 1162 "--enable-autoscaling", 1163 "--min-nodes=" + strconv.Itoa(minCount), 1164 "--max-nodes=" + strconv.Itoa(maxCount), 1165 "--node-pool=" + nodePool} 1166 output, err := execCmd(getGcloudCommand(args)...).CombinedOutput() 1167 1168 if err != nil { 1169 klog.Errorf("Failed config update result: %s", output) 1170 return fmt.Errorf("Failed to enable autoscaling: %w", err) 1171 } 1172 klog.Infof("Config update result: %s", output) 1173 1174 var finalErr error 1175 for startTime := time.Now(); startTime.Add(gkeUpdateTimeout).After(time.Now()); time.Sleep(30 * time.Second) { 1176 val, err := isAutoscalerEnabled(maxCount) 1177 if err == nil && val { 1178 return nil 1179 } 1180 finalErr = err 1181 } 1182 return fmt.Errorf("autoscaler not enabled, last error: %v", finalErr) 1183 } 1184 1185 func disableAutoscaler(nodePool string, minCount, maxCount int) error { 1186 klog.Infof("Using gcloud to disable autoscaling for pool %s", nodePool) 1187 args := []string{"container", "clusters", "update", framework.TestContext.CloudConfig.Cluster, 1188 "--no-enable-autoscaling", 1189 "--node-pool=" + nodePool} 1190 output, err := execCmd(getGcloudCommand(args)...).CombinedOutput() 1191 1192 if err != nil { 1193 klog.Errorf("Failed config update result: %s", output) 1194 return fmt.Errorf("Failed to disable autoscaling: %w", err) 1195 } 1196 klog.Infof("Config update result: %s", output) 1197 1198 var finalErr error 1199 for startTime := time.Now(); startTime.Add(gkeUpdateTimeout).After(time.Now()); time.Sleep(30 * time.Second) { 1200 val, err := isAutoscalerEnabled(maxCount) 1201 if err == nil && !val { 1202 return nil 1203 } 1204 finalErr = err 1205 } 1206 return fmt.Errorf("autoscaler still enabled, last error: %v", finalErr) 1207 } 1208 1209 func addNodePool(name string, machineType string, numNodes int) { 1210 args := []string{"container", "node-pools", "create", name, "--quiet", 1211 "--machine-type=" + machineType, 1212 "--num-nodes=" + strconv.Itoa(numNodes), 1213 "--cluster=" + framework.TestContext.CloudConfig.Cluster} 1214 output, err := execCmd(getGcloudCommand(args)...).CombinedOutput() 1215 klog.Infof("Creating node-pool %s: %s", name, output) 1216 framework.ExpectNoError(err, string(output)) 1217 } 1218 1219 func addGpuNodePool(name string, gpuType string, gpuCount int, numNodes int) { 1220 args := []string{"beta", "container", "node-pools", "create", name, "--quiet", 1221 "--accelerator", "type=" + gpuType + ",count=" + strconv.Itoa(gpuCount), 1222 "--num-nodes=" + strconv.Itoa(numNodes), 1223 "--cluster=" + framework.TestContext.CloudConfig.Cluster} 1224 output, err := execCmd(getGcloudCommand(args)...).CombinedOutput() 1225 klog.Infof("Creating node-pool %s: %s", name, output) 1226 framework.ExpectNoError(err, string(output)) 1227 } 1228 1229 func deleteNodePool(name string) { 1230 klog.Infof("Deleting node pool %s", name) 1231 args := []string{"container", "node-pools", "delete", name, "--quiet", 1232 "--cluster=" + framework.TestContext.CloudConfig.Cluster} 1233 err := wait.ExponentialBackoff( 1234 wait.Backoff{Duration: 1 * time.Minute, Factor: float64(3), Steps: 3}, 1235 func() (bool, error) { 1236 output, err := execCmd(getGcloudCommand(args)...).CombinedOutput() 1237 if err != nil { 1238 klog.Warningf("Error deleting nodegroup - error:%v, output: %s", err, output) 1239 return false, nil 1240 } 1241 klog.Infof("Node-pool deletion output: %s", output) 1242 return true, nil 1243 }) 1244 framework.ExpectNoError(err) 1245 } 1246 1247 func getPoolNodes(ctx context.Context, f *framework.Framework, poolName string) []*v1.Node { 1248 nodes := make([]*v1.Node, 0, 1) 1249 nodeList, err := e2enode.GetReadyNodesIncludingTainted(ctx, f.ClientSet) 1250 if err != nil { 1251 framework.Logf("Unexpected error occurred: %v", err) 1252 } 1253 framework.ExpectNoErrorWithOffset(0, err) 1254 for _, node := range nodeList.Items { 1255 if node.Labels[gkeNodepoolNameKey] == poolName { 1256 node := node 1257 nodes = append(nodes, &node) 1258 } 1259 } 1260 return nodes 1261 } 1262 1263 // getPoolInitialSize returns the initial size of the node pool taking into 1264 // account that it may span multiple zones. In that case, node pool consists of 1265 // multiple migs all containing initialNodeCount nodes. 1266 func getPoolInitialSize(poolName string) int { 1267 // get initial node count 1268 args := []string{"container", "node-pools", "describe", poolName, "--quiet", 1269 "--cluster=" + framework.TestContext.CloudConfig.Cluster, 1270 "--format=value(initialNodeCount)"} 1271 output, err := execCmd(getGcloudCommand(args)...).CombinedOutput() 1272 klog.Infof("Node-pool initial size: %s", output) 1273 framework.ExpectNoError(err, string(output)) 1274 fields := strings.Fields(string(output)) 1275 gomega.Expect(fields).To(gomega.HaveLen(1)) 1276 size, err := strconv.ParseInt(fields[0], 10, 64) 1277 framework.ExpectNoError(err) 1278 1279 // get number of node pools 1280 args = []string{"container", "node-pools", "describe", poolName, "--quiet", 1281 "--cluster=" + framework.TestContext.CloudConfig.Cluster, 1282 "--format=value(instanceGroupUrls)"} 1283 output, err = execCmd(getGcloudCommand(args)...).CombinedOutput() 1284 framework.ExpectNoError(err, string(output)) 1285 nodeGroupCount := len(strings.Split(string(output), ";")) 1286 return int(size) * nodeGroupCount 1287 } 1288 1289 func getPoolSize(ctx context.Context, f *framework.Framework, poolName string) int { 1290 size := 0 1291 nodeList, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet) 1292 framework.ExpectNoError(err) 1293 for _, node := range nodeList.Items { 1294 if node.Labels[gkeNodepoolNameKey] == poolName { 1295 size++ 1296 } 1297 } 1298 return size 1299 } 1300 1301 func reserveMemory(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string, tolerations []v1.Toleration, priorityClassName string) func() error { 1302 ginkgo.By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes)) 1303 request := int64(1024 * 1024 * megabytes / replicas) 1304 config := &testutils.RCConfig{ 1305 Client: f.ClientSet, 1306 Name: id, 1307 Namespace: f.Namespace.Name, 1308 Timeout: timeout, 1309 Image: imageutils.GetPauseImageName(), 1310 Replicas: replicas, 1311 MemRequest: request, 1312 NodeSelector: selector, 1313 Tolerations: tolerations, 1314 PriorityClassName: priorityClassName, 1315 } 1316 for start := time.Now(); time.Since(start) < rcCreationRetryTimeout; time.Sleep(rcCreationRetryDelay) { 1317 err := e2erc.RunRC(ctx, *config) 1318 if err != nil && strings.Contains(err.Error(), "Error creating replication controller") { 1319 klog.Warningf("Failed to create memory reservation: %v", err) 1320 continue 1321 } 1322 if expectRunning { 1323 framework.ExpectNoError(err) 1324 } 1325 return func() error { 1326 return e2erc.DeleteRCAndWaitForGC(ctx, f.ClientSet, f.Namespace.Name, id) 1327 } 1328 } 1329 framework.Failf("Failed to reserve memory within timeout") 1330 return nil 1331 } 1332 1333 // ReserveMemoryWithPriority creates a replication controller with pods with priority that, in summation, 1334 // request the specified amount of memory. 1335 func ReserveMemoryWithPriority(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, priorityClassName string) func() error { 1336 return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, nil, nil, priorityClassName) 1337 } 1338 1339 // ReserveMemoryWithSelectorAndTolerations creates a replication controller with pods with node selector that, in summation, 1340 // request the specified amount of memory. 1341 func ReserveMemoryWithSelectorAndTolerations(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string, tolerations []v1.Toleration) func() error { 1342 return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, selector, tolerations, "") 1343 } 1344 1345 // ReserveMemory creates a replication controller with pods that, in summation, 1346 // request the specified amount of memory. 1347 func ReserveMemory(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration) func() error { 1348 return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, nil, nil, "") 1349 } 1350 1351 // WaitForClusterSizeFunc waits until the cluster size matches the given function. 1352 func WaitForClusterSizeFunc(ctx context.Context, c clientset.Interface, sizeFunc func(int) bool, timeout time.Duration) error { 1353 return WaitForClusterSizeFuncWithUnready(ctx, c, sizeFunc, timeout, 0) 1354 } 1355 1356 // WaitForClusterSizeFuncWithUnready waits until the cluster size matches the given function and assumes some unready nodes. 1357 func WaitForClusterSizeFuncWithUnready(ctx context.Context, c clientset.Interface, sizeFunc func(int) bool, timeout time.Duration, expectedUnready int) error { 1358 for start := time.Now(); time.Since(start) < timeout && ctx.Err() == nil; time.Sleep(20 * time.Second) { 1359 nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{FieldSelector: fields.Set{ 1360 "spec.unschedulable": "false", 1361 }.AsSelector().String()}) 1362 if err != nil { 1363 klog.Warningf("Failed to list nodes: %v", err) 1364 continue 1365 } 1366 numNodes := len(nodes.Items) 1367 1368 // Filter out not-ready nodes. 1369 e2enode.Filter(nodes, func(node v1.Node) bool { 1370 return e2enode.IsConditionSetAsExpected(&node, v1.NodeReady, true) 1371 }) 1372 numReady := len(nodes.Items) 1373 1374 if numNodes == numReady+expectedUnready && sizeFunc(numNodes) { 1375 klog.Infof("Cluster has reached the desired size") 1376 return nil 1377 } 1378 klog.Infof("Waiting for cluster with func, current size %d, not ready nodes %d", numNodes, numNodes-numReady) 1379 } 1380 return fmt.Errorf("timeout waiting %v for appropriate cluster size", timeout) 1381 } 1382 1383 func waitForCaPodsReadyInNamespace(ctx context.Context, f *framework.Framework, c clientset.Interface, tolerateUnreadyCount int) error { 1384 var notready []string 1385 for start := time.Now(); time.Now().Before(start.Add(scaleUpTimeout)) && ctx.Err() == nil; time.Sleep(20 * time.Second) { 1386 pods, err := c.CoreV1().Pods(f.Namespace.Name).List(ctx, metav1.ListOptions{}) 1387 if err != nil { 1388 return fmt.Errorf("failed to get pods: %w", err) 1389 } 1390 notready = make([]string, 0) 1391 for _, pod := range pods.Items { 1392 ready := false 1393 for _, c := range pod.Status.Conditions { 1394 if c.Type == v1.PodReady && c.Status == v1.ConditionTrue { 1395 ready = true 1396 } 1397 } 1398 // Failed pods in this context generally mean that they have been 1399 // double scheduled onto a node, but then failed a constraint check. 1400 if pod.Status.Phase == v1.PodFailed { 1401 klog.Warningf("Pod has failed: %v", pod) 1402 } 1403 if !ready && pod.Status.Phase != v1.PodFailed { 1404 notready = append(notready, pod.Name) 1405 } 1406 } 1407 if len(notready) <= tolerateUnreadyCount { 1408 klog.Infof("sufficient number of pods ready. Tolerating %d unready", tolerateUnreadyCount) 1409 return nil 1410 } 1411 klog.Infof("Too many pods are not ready yet: %v", notready) 1412 } 1413 klog.Info("Timeout on waiting for pods being ready") 1414 klog.Info(e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "get", "pods", "-o", "json", "--all-namespaces")) 1415 klog.Info(e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "get", "nodes", "-o", "json")) 1416 1417 // Some pods are still not running. 1418 return fmt.Errorf("Too many pods are still not running: %v", notready) 1419 } 1420 1421 func waitForAllCaPodsReadyInNamespace(ctx context.Context, f *framework.Framework, c clientset.Interface) error { 1422 return waitForCaPodsReadyInNamespace(ctx, f, c, 0) 1423 } 1424 1425 func getAnyNode(ctx context.Context, c clientset.Interface) *v1.Node { 1426 nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{FieldSelector: fields.Set{ 1427 "spec.unschedulable": "false", 1428 }.AsSelector().String()}) 1429 if err != nil { 1430 klog.Errorf("Failed to get node list: %v", err) 1431 return nil 1432 } 1433 if len(nodes.Items) == 0 { 1434 klog.Errorf("No nodes") 1435 return nil 1436 } 1437 return &nodes.Items[0] 1438 } 1439 1440 func setMigSizes(sizes map[string]int) bool { 1441 madeChanges := false 1442 for mig, desiredSize := range sizes { 1443 currentSize, err := framework.GroupSize(mig) 1444 framework.ExpectNoError(err) 1445 if desiredSize != currentSize { 1446 ginkgo.By(fmt.Sprintf("Setting size of %s to %d", mig, desiredSize)) 1447 err = framework.ResizeGroup(mig, int32(desiredSize)) 1448 framework.ExpectNoError(err) 1449 madeChanges = true 1450 } 1451 } 1452 return madeChanges 1453 } 1454 1455 func drainNode(ctx context.Context, f *framework.Framework, node *v1.Node) { 1456 ginkgo.By("Make the single node unschedulable") 1457 framework.ExpectNoError(makeNodeUnschedulable(ctx, f.ClientSet, node)) 1458 1459 ginkgo.By("Manually drain the single node") 1460 podOpts := metav1.ListOptions{FieldSelector: fields.OneTermEqualSelector("spec.nodeName", node.Name).String()} 1461 pods, err := f.ClientSet.CoreV1().Pods(metav1.NamespaceAll).List(ctx, podOpts) 1462 framework.ExpectNoError(err) 1463 for _, pod := range pods.Items { 1464 err = f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, *metav1.NewDeleteOptions(0)) 1465 framework.ExpectNoError(err) 1466 } 1467 } 1468 1469 func makeNodeUnschedulable(ctx context.Context, c clientset.Interface, node *v1.Node) error { 1470 ginkgo.By(fmt.Sprintf("Taint node %s", node.Name)) 1471 for j := 0; j < 3; j++ { 1472 freshNode, err := c.CoreV1().Nodes().Get(ctx, node.Name, metav1.GetOptions{}) 1473 if err != nil { 1474 return err 1475 } 1476 for _, taint := range freshNode.Spec.Taints { 1477 if taint.Key == disabledTaint { 1478 return nil 1479 } 1480 } 1481 freshNode.Spec.Taints = append(freshNode.Spec.Taints, v1.Taint{ 1482 Key: disabledTaint, 1483 Value: "DisabledForTest", 1484 Effect: v1.TaintEffectNoSchedule, 1485 }) 1486 _, err = c.CoreV1().Nodes().Update(ctx, freshNode, metav1.UpdateOptions{}) 1487 if err == nil { 1488 return nil 1489 } 1490 if !apierrors.IsConflict(err) { 1491 return err 1492 } 1493 klog.Warningf("Got 409 conflict when trying to taint node, retries left: %v", 3-j) 1494 } 1495 return fmt.Errorf("Failed to taint node in allowed number of retries") 1496 } 1497 1498 // CriticalAddonsOnlyError implements the `error` interface, and signifies the 1499 // presence of the `CriticalAddonsOnly` taint on the node. 1500 type CriticalAddonsOnlyError struct{} 1501 1502 func (CriticalAddonsOnlyError) Error() string { 1503 return fmt.Sprintf("CriticalAddonsOnly taint found on node") 1504 } 1505 1506 func makeNodeSchedulable(ctx context.Context, c clientset.Interface, node *v1.Node, failOnCriticalAddonsOnly bool) error { 1507 ginkgo.By(fmt.Sprintf("Remove taint from node %s", node.Name)) 1508 for j := 0; j < 3; j++ { 1509 freshNode, err := c.CoreV1().Nodes().Get(ctx, node.Name, metav1.GetOptions{}) 1510 if err != nil { 1511 return err 1512 } 1513 var newTaints []v1.Taint 1514 for _, taint := range freshNode.Spec.Taints { 1515 if failOnCriticalAddonsOnly && taint.Key == criticalAddonsOnlyTaint { 1516 return CriticalAddonsOnlyError{} 1517 } 1518 if taint.Key != disabledTaint { 1519 newTaints = append(newTaints, taint) 1520 } 1521 } 1522 1523 if len(newTaints) == len(freshNode.Spec.Taints) { 1524 return nil 1525 } 1526 freshNode.Spec.Taints = newTaints 1527 _, err = c.CoreV1().Nodes().Update(ctx, freshNode, metav1.UpdateOptions{}) 1528 if err == nil { 1529 return nil 1530 } 1531 if !apierrors.IsConflict(err) { 1532 return err 1533 } 1534 klog.Warningf("Got 409 conflict when trying to taint node, retries left: %v", 3-j) 1535 } 1536 return fmt.Errorf("Failed to remove taint from node in allowed number of retries") 1537 } 1538 1539 // ScheduleAnySingleGpuPod schedules a pod which requires single GPU of any type 1540 func ScheduleAnySingleGpuPod(ctx context.Context, f *framework.Framework, id string) error { 1541 return ScheduleGpuPod(ctx, f, id, "", 1) 1542 } 1543 1544 // ScheduleGpuPod schedules a pod which requires a given number of gpus of given type 1545 func ScheduleGpuPod(ctx context.Context, f *framework.Framework, id string, gpuType string, gpuLimit int64) error { 1546 config := &testutils.RCConfig{ 1547 Client: f.ClientSet, 1548 Name: id, 1549 Namespace: f.Namespace.Name, 1550 Timeout: 3 * scaleUpTimeout, // spinning up GPU node is slow 1551 Image: imageutils.GetPauseImageName(), 1552 Replicas: 1, 1553 GpuLimit: gpuLimit, 1554 Labels: map[string]string{"requires-gpu": "yes"}, 1555 } 1556 1557 if gpuType != "" { 1558 config.NodeSelector = map[string]string{gpuLabel: gpuType} 1559 } 1560 1561 err := e2erc.RunRC(ctx, *config) 1562 if err != nil { 1563 return err 1564 } 1565 return nil 1566 } 1567 1568 // Create an RC running a given number of pods with anti-affinity 1569 func runAntiAffinityPods(ctx context.Context, f *framework.Framework, namespace string, pods int, id string, podLabels, antiAffinityLabels map[string]string) error { 1570 config := &testutils.RCConfig{ 1571 Affinity: buildAntiAffinity(antiAffinityLabels), 1572 Client: f.ClientSet, 1573 Name: id, 1574 Namespace: namespace, 1575 Timeout: scaleUpTimeout, 1576 Image: imageutils.GetPauseImageName(), 1577 Replicas: pods, 1578 Labels: podLabels, 1579 } 1580 err := e2erc.RunRC(ctx, *config) 1581 if err != nil { 1582 return err 1583 } 1584 _, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(ctx, id, metav1.GetOptions{}) 1585 if err != nil { 1586 return err 1587 } 1588 return nil 1589 } 1590 1591 func runVolumeAntiAffinityPods(ctx context.Context, f *framework.Framework, namespace string, pods int, id string, podLabels, antiAffinityLabels map[string]string, volumes []v1.Volume) error { 1592 config := &testutils.RCConfig{ 1593 Affinity: buildAntiAffinity(antiAffinityLabels), 1594 Volumes: volumes, 1595 Client: f.ClientSet, 1596 Name: id, 1597 Namespace: namespace, 1598 Timeout: scaleUpTimeout, 1599 Image: imageutils.GetPauseImageName(), 1600 Replicas: pods, 1601 Labels: podLabels, 1602 } 1603 err := e2erc.RunRC(ctx, *config) 1604 if err != nil { 1605 return err 1606 } 1607 _, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(ctx, id, metav1.GetOptions{}) 1608 if err != nil { 1609 return err 1610 } 1611 return nil 1612 } 1613 1614 var emptyDirVolumes = []v1.Volume{ 1615 { 1616 Name: "empty-volume", 1617 VolumeSource: v1.VolumeSource{ 1618 EmptyDir: &v1.EmptyDirVolumeSource{}, 1619 }, 1620 }, 1621 } 1622 1623 func buildVolumes(pv *v1.PersistentVolume, pvc *v1.PersistentVolumeClaim) []v1.Volume { 1624 return []v1.Volume{ 1625 { 1626 Name: pv.Name, 1627 VolumeSource: v1.VolumeSource{ 1628 PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ 1629 ClaimName: pvc.Name, 1630 ReadOnly: false, 1631 }, 1632 }, 1633 }, 1634 } 1635 } 1636 1637 func buildAntiAffinity(labels map[string]string) *v1.Affinity { 1638 return &v1.Affinity{ 1639 PodAntiAffinity: &v1.PodAntiAffinity{ 1640 RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ 1641 { 1642 LabelSelector: &metav1.LabelSelector{ 1643 MatchLabels: labels, 1644 }, 1645 TopologyKey: "kubernetes.io/hostname", 1646 }, 1647 }, 1648 }, 1649 } 1650 } 1651 1652 // Create an RC running a given number of pods on each node without adding any constraint forcing 1653 // such pod distribution. This is meant to create a bunch of underutilized (but not unused) nodes 1654 // with pods that can be rescheduled on different nodes. 1655 // This is achieved using the following method: 1656 // 1. disable scheduling on each node 1657 // 2. create an empty RC 1658 // 3. for each node: 1659 // 3a. enable scheduling on that node 1660 // 3b. increase number of replicas in RC by podsPerNode 1661 func runReplicatedPodOnEachNode(ctx context.Context, f *framework.Framework, nodes []v1.Node, namespace string, podsPerNode int, id string, labels map[string]string, memRequest int64) error { 1662 ginkgo.By("Run a pod on each node") 1663 for _, node := range nodes { 1664 err := makeNodeUnschedulable(ctx, f.ClientSet, &node) 1665 1666 n := node 1667 ginkgo.DeferCleanup(makeNodeSchedulable, f.ClientSet, &n, false) 1668 1669 if err != nil { 1670 return err 1671 } 1672 } 1673 config := &testutils.RCConfig{ 1674 Client: f.ClientSet, 1675 Name: id, 1676 Namespace: namespace, 1677 Timeout: defaultTimeout, 1678 Image: imageutils.GetPauseImageName(), 1679 Replicas: 0, 1680 Labels: labels, 1681 MemRequest: memRequest, 1682 } 1683 err := e2erc.RunRC(ctx, *config) 1684 if err != nil { 1685 return err 1686 } 1687 rc, err := f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(ctx, id, metav1.GetOptions{}) 1688 if err != nil { 1689 return err 1690 } 1691 for i, node := range nodes { 1692 err = makeNodeSchedulable(ctx, f.ClientSet, &node, false) 1693 if err != nil { 1694 return err 1695 } 1696 1697 // Update replicas count, to create new pods that will be allocated on node 1698 // (we retry 409 errors in case rc reference got out of sync) 1699 for j := 0; j < 3; j++ { 1700 *rc.Spec.Replicas = int32((i + 1) * podsPerNode) 1701 rc, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Update(ctx, rc, metav1.UpdateOptions{}) 1702 if err == nil { 1703 break 1704 } 1705 if !apierrors.IsConflict(err) { 1706 return err 1707 } 1708 klog.Warningf("Got 409 conflict when trying to scale RC, retries left: %v", 3-j) 1709 rc, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(ctx, id, metav1.GetOptions{}) 1710 if err != nil { 1711 return err 1712 } 1713 } 1714 1715 err = wait.PollImmediate(5*time.Second, podTimeout, func() (bool, error) { 1716 rc, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(ctx, id, metav1.GetOptions{}) 1717 if err != nil || rc.Status.ReadyReplicas < int32((i+1)*podsPerNode) { 1718 return false, nil 1719 } 1720 return true, nil 1721 }) 1722 if err != nil { 1723 return fmt.Errorf("failed to coerce RC into spawning a pod on node %s within timeout", node.Name) 1724 } 1725 err = makeNodeUnschedulable(ctx, f.ClientSet, &node) 1726 if err != nil { 1727 return err 1728 } 1729 } 1730 return nil 1731 } 1732 1733 // Increase cluster size by newNodesForScaledownTests to create some unused nodes 1734 // that can be later removed by cluster autoscaler. 1735 func manuallyIncreaseClusterSize(ctx context.Context, f *framework.Framework, originalSizes map[string]int) int { 1736 ginkgo.By("Manually increase cluster size") 1737 increasedSize := 0 1738 newSizes := make(map[string]int) 1739 for key, val := range originalSizes { 1740 newSizes[key] = val + newNodesForScaledownTests 1741 increasedSize += val + newNodesForScaledownTests 1742 } 1743 setMigSizes(newSizes) 1744 1745 checkClusterSize := func(size int) bool { 1746 if size >= increasedSize { 1747 return true 1748 } 1749 resized := setMigSizes(newSizes) 1750 if resized { 1751 klog.Warning("Unexpected node group size while waiting for cluster resize. Setting size to target again.") 1752 } 1753 return false 1754 } 1755 1756 framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet, checkClusterSize, manualResizeTimeout)) 1757 return increasedSize 1758 } 1759 1760 // Try to get clusterwide health from CA status configmap. 1761 // Status configmap is not parsing-friendly, so evil regexpery follows. 1762 func getClusterwideStatus(ctx context.Context, c clientset.Interface) (string, error) { 1763 configMap, err := c.CoreV1().ConfigMaps("kube-system").Get(ctx, "cluster-autoscaler-status", metav1.GetOptions{}) 1764 if err != nil { 1765 return "", err 1766 } 1767 status, ok := configMap.Data["status"] 1768 if !ok { 1769 return "", fmt.Errorf("Status information not found in configmap") 1770 } 1771 matcher, err := regexp.Compile("Cluster-wide:\\s*\n\\s*Health:\\s*([A-Za-z]+)") 1772 if err != nil { 1773 return "", err 1774 } 1775 result := matcher.FindStringSubmatch(status) 1776 if len(result) < 2 { 1777 return "", fmt.Errorf("Failed to parse CA status configmap, raw status: %v", status) 1778 } 1779 return result[1], nil 1780 } 1781 1782 type scaleUpStatus struct { 1783 status string 1784 ready int 1785 target int 1786 timestamp time.Time 1787 } 1788 1789 // Try to get timestamp from status. 1790 // Status configmap is not parsing-friendly, so evil regexpery follows. 1791 func getStatusTimestamp(status string) (time.Time, error) { 1792 timestampMatcher, err := regexp.Compile("Cluster-autoscaler status at \\s*([0-9\\-]+ [0-9]+:[0-9]+:[0-9]+\\.[0-9]+ \\+[0-9]+ [A-Za-z]+)") 1793 if err != nil { 1794 return time.Time{}, err 1795 } 1796 1797 timestampMatch := timestampMatcher.FindStringSubmatch(status) 1798 if len(timestampMatch) < 2 { 1799 return time.Time{}, fmt.Errorf("Failed to parse CA status timestamp, raw status: %v", status) 1800 } 1801 1802 timestamp, err := time.Parse(timestampFormat, timestampMatch[1]) 1803 if err != nil { 1804 return time.Time{}, err 1805 } 1806 return timestamp, nil 1807 } 1808 1809 // Try to get scaleup statuses of all node groups. 1810 // Status configmap is not parsing-friendly, so evil regexpery follows. 1811 func getScaleUpStatus(ctx context.Context, c clientset.Interface) (*scaleUpStatus, error) { 1812 configMap, err := c.CoreV1().ConfigMaps("kube-system").Get(ctx, "cluster-autoscaler-status", metav1.GetOptions{}) 1813 if err != nil { 1814 return nil, err 1815 } 1816 status, ok := configMap.Data["status"] 1817 if !ok { 1818 return nil, fmt.Errorf("Status information not found in configmap") 1819 } 1820 1821 timestamp, err := getStatusTimestamp(status) 1822 if err != nil { 1823 return nil, err 1824 } 1825 1826 matcher, err := regexp.Compile("s*ScaleUp:\\s*([A-Za-z]+)\\s*\\(ready=([0-9]+)\\s*cloudProviderTarget=([0-9]+)\\s*\\)") 1827 if err != nil { 1828 return nil, err 1829 } 1830 matches := matcher.FindAllStringSubmatch(status, -1) 1831 if len(matches) < 1 { 1832 return nil, fmt.Errorf("Failed to parse CA status configmap, raw status: %v", status) 1833 } 1834 1835 result := scaleUpStatus{ 1836 status: caNoScaleUpStatus, 1837 ready: 0, 1838 target: 0, 1839 timestamp: timestamp, 1840 } 1841 for _, match := range matches { 1842 if match[1] == caOngoingScaleUpStatus { 1843 result.status = caOngoingScaleUpStatus 1844 } 1845 newReady, err := strconv.Atoi(match[2]) 1846 if err != nil { 1847 return nil, err 1848 } 1849 result.ready += newReady 1850 newTarget, err := strconv.Atoi(match[3]) 1851 if err != nil { 1852 return nil, err 1853 } 1854 result.target += newTarget 1855 } 1856 klog.Infof("Cluster-Autoscaler scale-up status: %v (%v, %v)", result.status, result.ready, result.target) 1857 return &result, nil 1858 } 1859 1860 func waitForScaleUpStatus(ctx context.Context, c clientset.Interface, cond func(s *scaleUpStatus) bool, timeout time.Duration) (*scaleUpStatus, error) { 1861 var finalErr error 1862 var status *scaleUpStatus 1863 err := wait.PollUntilContextTimeout(ctx, 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) { 1864 status, finalErr = getScaleUpStatus(ctx, c) 1865 if finalErr != nil { 1866 return false, nil 1867 } 1868 if status.timestamp.Add(freshStatusLimit).Before(time.Now()) { 1869 // stale status 1870 finalErr = fmt.Errorf("Status too old") 1871 return false, nil 1872 } 1873 return cond(status), nil 1874 }) 1875 if err != nil { 1876 err = fmt.Errorf("Failed to find expected scale up status: %v, last status: %v, final err: %v", err, status, finalErr) 1877 } 1878 return status, err 1879 } 1880 1881 // This is a temporary fix to allow CA to migrate some kube-system pods 1882 // TODO: Remove this when the PDB is added for some of those components 1883 func addKubeSystemPdbs(ctx context.Context, f *framework.Framework) error { 1884 ginkgo.By("Create PodDisruptionBudgets for kube-system components, so they can be migrated if required") 1885 1886 var newPdbs []string 1887 cleanup := func(ctx context.Context) { 1888 var finalErr error 1889 for _, newPdbName := range newPdbs { 1890 ginkgo.By(fmt.Sprintf("Delete PodDisruptionBudget %v", newPdbName)) 1891 err := f.ClientSet.PolicyV1().PodDisruptionBudgets("kube-system").Delete(ctx, newPdbName, metav1.DeleteOptions{}) 1892 if err != nil { 1893 // log error, but attempt to remove other pdbs 1894 klog.Errorf("Failed to delete PodDisruptionBudget %v, err: %v", newPdbName, err) 1895 finalErr = err 1896 } 1897 } 1898 if finalErr != nil { 1899 framework.Failf("Error during PodDisruptionBudget cleanup: %v", finalErr) 1900 } 1901 } 1902 ginkgo.DeferCleanup(cleanup) 1903 1904 type pdbInfo struct { 1905 label string 1906 minAvailable int 1907 } 1908 pdbsToAdd := []pdbInfo{ 1909 {label: "kube-dns", minAvailable: 1}, 1910 {label: "kube-dns-autoscaler", minAvailable: 0}, 1911 {label: "metrics-server", minAvailable: 0}, 1912 {label: "kubernetes-dashboard", minAvailable: 0}, 1913 {label: "glbc", minAvailable: 0}, 1914 } 1915 for _, pdbData := range pdbsToAdd { 1916 ginkgo.By(fmt.Sprintf("Create PodDisruptionBudget for %v", pdbData.label)) 1917 labelMap := map[string]string{"k8s-app": pdbData.label} 1918 pdbName := fmt.Sprintf("test-pdb-for-%v", pdbData.label) 1919 minAvailable := intstr.FromInt32(int32(pdbData.minAvailable)) 1920 pdb := &policyv1.PodDisruptionBudget{ 1921 ObjectMeta: metav1.ObjectMeta{ 1922 Name: pdbName, 1923 Namespace: "kube-system", 1924 }, 1925 Spec: policyv1.PodDisruptionBudgetSpec{ 1926 Selector: &metav1.LabelSelector{MatchLabels: labelMap}, 1927 MinAvailable: &minAvailable, 1928 }, 1929 } 1930 _, err := f.ClientSet.PolicyV1().PodDisruptionBudgets("kube-system").Create(ctx, pdb, metav1.CreateOptions{}) 1931 newPdbs = append(newPdbs, pdbName) 1932 1933 if err != nil { 1934 return err 1935 } 1936 } 1937 return nil 1938 } 1939 1940 func createPriorityClasses(ctx context.Context, f *framework.Framework) { 1941 priorityClasses := map[string]int32{ 1942 expendablePriorityClassName: -15, 1943 highPriorityClassName: 1000, 1944 } 1945 for className, priority := range priorityClasses { 1946 _, err := f.ClientSet.SchedulingV1().PriorityClasses().Create(ctx, &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: className}, Value: priority}, metav1.CreateOptions{}) 1947 if err != nil { 1948 klog.Errorf("Error creating priority class: %v", err) 1949 } 1950 if err != nil && !apierrors.IsAlreadyExists(err) { 1951 framework.Failf("unexpected error while creating priority class: %v", err) 1952 } 1953 } 1954 1955 ginkgo.DeferCleanup(func(ctx context.Context) { 1956 for className := range priorityClasses { 1957 err := f.ClientSet.SchedulingV1().PriorityClasses().Delete(ctx, className, metav1.DeleteOptions{}) 1958 if err != nil { 1959 klog.Errorf("Error deleting priority class: %v", err) 1960 } 1961 } 1962 }) 1963 }