k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/e2e.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"k8s.io/klog/v2"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"

	appsv1 "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/component-base/logs"
	"k8s.io/component-base/version"
	commontest "k8s.io/kubernetes/test/e2e/common"
	"k8s.io/kubernetes/test/e2e/framework"
	"k8s.io/kubernetes/test/e2e/framework/daemonset"
	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
	e2ekubectl "k8s.io/kubernetes/test/e2e/framework/kubectl"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2ereporters "k8s.io/kubernetes/test/e2e/reporters"
	utilnet "k8s.io/utils/net"

	clientset "k8s.io/client-go/kubernetes"
	// ensure auth plugins are loaded
	_ "k8s.io/client-go/plugin/pkg/client/auth"

	// Ensure that logging flags are part of the command line.
	_ "k8s.io/component-base/logs/testinit"
)

const (
	// namespaceCleanupTimeout is how long to wait for the namespace to be deleted.
	// If there are any orphaned namespaces to clean up, this test is running
	// on a long-lived cluster. A long wait here is preferable to spurious test
	// failures caused by leaked resources from a previous test run.
	namespaceCleanupTimeout = 15 * time.Minute
)

var progressReporter = &e2ereporters.ProgressReporter{}

var _ = ginkgo.SynchronizedBeforeSuite(func(ctx context.Context) []byte {
	// Reference common test to make the import valid.
	commontest.CurrentSuite = commontest.E2E
	progressReporter.SetStartMsg()
	setupSuite(ctx)
	return nil
}, func(ctx context.Context, data []byte) {
	// Run on all Ginkgo nodes
	setupSuitePerGinkgoNode(ctx)
})

var _ = ginkgo.SynchronizedAfterSuite(func() {
	progressReporter.SetEndMsg()
}, func(ctx context.Context) {
	AfterSuiteActions(ctx)
})

// RunE2ETests checks configuration parameters (specified through flags) and then runs
// E2E tests using the Ginkgo runner.
// If a "report directory" is specified, one or more JUnit test reports will be
// generated in this directory, and cluster logs will also be saved.
// This function is called on each Ginkgo node in parallel mode.
func RunE2ETests(t *testing.T) {
	// InitLogs disables contextual logging, without a way to enable it again
	// in the E2E test suite because it has no feature gates. It used to have a
	// misleading --feature-gates parameter but that didn't do what users
	// and developers expected (define which features the cluster supports)
	// and therefore got removed.
	//
	// Because contextual logging is useful and should get tested, it gets
	// re-enabled here unconditionally.
	logs.InitLogs()
	defer logs.FlushLogs()
	klog.EnableContextualLogging(true)

	progressReporter = e2ereporters.NewProgressReporter(framework.TestContext.ProgressReportURL)
	gomega.RegisterFailHandler(framework.Fail)

	// Run tests through the Ginkgo runner with output to console + JUnit for Jenkins
	suiteConfig, reporterConfig := framework.CreateGinkgoConfig()
	klog.Infof("Starting e2e run %q on Ginkgo node %d", framework.RunID, suiteConfig.ParallelProcess)
	ginkgo.RunSpecs(t, "Kubernetes e2e suite", suiteConfig, reporterConfig)
}
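// A minimal sketch of how a test binary hands control to this suite. The file
// name and the omitted TestMain/flag handling are assumptions based on the
// conventional layout (compare test/e2e/e2e_test.go in this repository):
//
//	// e2e_test.go (illustrative)
//	func TestE2E(t *testing.T) {
//		RunE2ETests(t)
//	}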
// getDefaultClusterIPFamily obtains the default IP family of the cluster
// using the ClusterIP address of the kubernetes service created in the default namespace.
// This unequivocally identifies the default IP family because services are single-family.
// TODO: dual-stack may support multiple families per service,
// but we can detect if a cluster is dual-stack because pods have two addresses (one per family).
func getDefaultClusterIPFamily(ctx context.Context, c clientset.Interface) string {
	// Get the ClusterIP of the kubernetes service created in the default namespace
	svc, err := c.CoreV1().Services(metav1.NamespaceDefault).Get(ctx, "kubernetes", metav1.GetOptions{})
	if err != nil {
		framework.Failf("Failed to get kubernetes service ClusterIP: %v", err)
	}

	if utilnet.IsIPv6String(svc.Spec.ClusterIP) {
		return "ipv6"
	}
	return "ipv4"
}
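// A minimal sketch (not part of the suite) of exercising getDefaultClusterIPFamily
// in isolation; it assumes the fake clientset from k8s.io/client-go/kubernetes/fake
// and an arbitrary IPv6 ClusterIP:
//
//	svc := &v1.Service{
//		ObjectMeta: metav1.ObjectMeta{Name: "kubernetes", Namespace: metav1.NamespaceDefault},
//		Spec:       v1.ServiceSpec{ClusterIP: "fd00:10:96::1"},
//	}
//	c := fake.NewSimpleClientset(svc)
//	family := getDefaultClusterIPFamily(context.Background(), c) // "ipv6"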
// waitForDaemonSets waits for all daemonsets in the given namespace to be ready
// (defined as all but 'allowedNotReadyNodes' pods associated with that
// daemonset are ready).
//
// If allowedNotReadyNodes is -1, this method returns immediately without waiting.
func waitForDaemonSets(ctx context.Context, c clientset.Interface, ns string, allowedNotReadyNodes int32, timeout time.Duration) error {
	if allowedNotReadyNodes == -1 {
		return nil
	}

	start := time.Now()
	framework.Logf("Waiting up to %v for all daemonsets in namespace '%s' to start",
		timeout, ns)

	return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, true, func(ctx context.Context) (bool, error) {
		dsList, err := c.AppsV1().DaemonSets(ns).List(ctx, metav1.ListOptions{})
		if err != nil {
			framework.Logf("Error getting daemonsets in namespace: '%s': %v", ns, err)
			return false, err
		}
		var notReadyDaemonSets []string
		for _, ds := range dsList.Items {
			framework.Logf("%d / %d pods ready in namespace '%s' in daemonset '%s' (%d seconds elapsed)", ds.Status.NumberReady, ds.Status.DesiredNumberScheduled, ns, ds.ObjectMeta.Name, int(time.Since(start).Seconds()))
			if ds.Status.DesiredNumberScheduled-ds.Status.NumberReady > allowedNotReadyNodes {
				notReadyDaemonSets = append(notReadyDaemonSets, ds.ObjectMeta.Name)
			}
		}

		if len(notReadyDaemonSets) > 0 {
			framework.Logf("there are not ready daemonsets: %v", notReadyDaemonSets)
			return false, nil
		}

		return true, nil
	})
}

// setupSuite is the boilerplate that can be used to set up Ginkgo test suites in the SynchronizedBeforeSuite step.
// There are certain operations we only want to run once per overall test invocation
// (such as deleting old namespaces, or verifying that all system pods are running).
// Because of the way Ginkgo runs tests in parallel, we must use SynchronizedBeforeSuite
// to ensure that these operations only run on the first parallel Ginkgo node.
//
// SynchronizedBeforeSuite takes two functions: one which runs only on the first Ginkgo node,
// returning an opaque byte array, and a second function which runs on all Ginkgo nodes,
// accepting that byte array.
func setupSuite(ctx context.Context) {
	// Run only on Ginkgo node 1

	switch framework.TestContext.Provider {
	case "gce", "gke":
		logClusterImageSources()
	}

	c, err := framework.LoadClientset()
	framework.ExpectNoError(err, "Error loading client")

	// Delete any namespaces except those created by the system. This ensures no
	// lingering resources are left over from a previous test run.
	if framework.TestContext.CleanStart {
		deleted, err := framework.DeleteNamespaces(ctx, c, nil, /* deleteFilter */
			[]string{
				metav1.NamespaceSystem,
				metav1.NamespaceDefault,
				metav1.NamespacePublic,
				v1.NamespaceNodeLease,
			})
		if err != nil {
			framework.Failf("Error deleting orphaned namespaces: %v", err)
		}
		if err := framework.WaitForNamespacesDeleted(ctx, c, deleted, namespaceCleanupTimeout); err != nil {
			framework.Failf("Failed to delete orphaned namespaces %v: %v", deleted, err)
		}
	}

	timeouts := framework.NewTimeoutContext()

	// In large clusters we may get to this point but still have a bunch
	// of nodes without Routes created. Since this would make a node
	// unschedulable, we need to wait until all of them are schedulable.
	framework.ExpectNoError(e2enode.WaitForAllNodesSchedulable(ctx, c, timeouts.NodeSchedulable))

	// If NumNodes is not specified then auto-detect how many are schedulable and not tainted
	if framework.TestContext.CloudConfig.NumNodes == framework.DefaultNumNodes {
		nodes, err := e2enode.GetReadySchedulableNodes(ctx, c)
		framework.ExpectNoError(err)
		framework.TestContext.CloudConfig.NumNodes = len(nodes.Items)
	}

	// Ensure all pods are running and ready before starting tests (otherwise,
	// cluster infrastructure pods that are being pulled or started can block
	// test pods from running, and tests that ensure all pods are running and
	// ready will fail).
	//
	// TODO: In large clusters, we often observe non-starting pods due to
	// #41007. To avoid those pods blocking the whole test run (and just
	// wasting the whole run), we allow for some not-ready pods (with the
	// number equal to the number of allowed not-ready nodes).
	if err := e2epod.WaitForAlmostAllPodsReady(ctx, c, metav1.NamespaceSystem, framework.TestContext.MinStartupPods, framework.TestContext.AllowedNotReadyNodes, timeouts.SystemPodsStartup); err != nil {
		e2edebug.DumpAllNamespaceInfo(ctx, c, metav1.NamespaceSystem)
		e2ekubectl.LogFailedContainers(ctx, c, metav1.NamespaceSystem, framework.Logf)
		framework.Failf("Error waiting for all pods to be running and ready: %v", err)
	}

	if err := waitForDaemonSets(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemDaemonsetStartup); err != nil {
		framework.Logf("WARNING: Waiting for all daemonsets to be ready failed: %v", err)
	}

	if framework.TestContext.PrepullImages {
		framework.Logf("Pre-pulling images so that they are cached for the tests.")
		prepullImages(ctx, c)
	}

	// Log the version of the server and this client.
	framework.Logf("e2e test version: %s", version.Get().GitVersion)

	dc := c.DiscoveryClient

	serverVersion, serverErr := dc.ServerVersion()
	if serverErr != nil {
		framework.Logf("Unexpected server error retrieving version: %v", serverErr)
	}
	if serverVersion != nil {
		framework.Logf("kube-apiserver version: %s", serverVersion.GitVersion)
	}

	if framework.TestContext.NodeKiller.Enabled {
		nodeKiller := e2enode.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider)
		go nodeKiller.Run(framework.TestContext.NodeKiller.NodeKillerStopCtx)
	}
}

// logClusterImageSources writes out cluster image sources.
func logClusterImageSources() {
	controlPlaneNodeImg, workerNodeImg, err := lookupClusterImageSources()
	if err != nil {
		framework.Logf("Cluster image sources lookup failed: %v\n", err)
		return
	}
	framework.Logf("cluster-control-plane-node-image: %s", controlPlaneNodeImg)
	framework.Logf("cluster-worker-node-image: %s", workerNodeImg)

	images := map[string]string{
		"control_plane_node_os_image": controlPlaneNodeImg,
		"worker_node_os_image":        workerNodeImg,
	}

	outputBytes, _ := json.MarshalIndent(images, "", " ")
	filePath := filepath.Join(framework.TestContext.ReportDir, "images.json")
	if err := os.WriteFile(filePath, outputBytes, 0644); err != nil {
		framework.Logf("cluster images sources, could not write to %q: %v", filePath, err)
	}
}

// TODO: These should really just use the GCE API client library or at least use
// better formatted output from the --format flag.

// lookupClusterImageSources returns the control plane node & worker node image strings, or an error.
func lookupClusterImageSources() (string, string, error) {
	// Given args for a gcloud compute command, run it (scoped to the test project and
	// zone/region) and return its output lines, splitting on newlines, commas, or semicolons.
	gcloudf := func(argv ...string) ([]string, error) {
		args := []string{"compute"}
		args = append(args, argv...)
		args = append(args, "--project", framework.TestContext.CloudConfig.ProjectID)
		if framework.TestContext.CloudConfig.MultiMaster {
			args = append(args, "--region", framework.TestContext.CloudConfig.Region)
		} else {
			args = append(args, "--zone", framework.TestContext.CloudConfig.Zone)
		}
		outputBytes, err := exec.Command("gcloud", args...).CombinedOutput()
		str := strings.Replace(string(outputBytes), ",", "\n", -1)
		str = strings.Replace(str, ";", "\n", -1)
		lines := strings.Split(str, "\n")
		if err != nil {
			framework.Logf("lookupDiskImageSources: gcloud error with [%#v]; err:%v", argv, err)
			for _, l := range lines {
				framework.Logf(" > %s", l)
			}
		}
		return lines, err
	}

	// Given a GCE instance, look through its disks, finding one that has a sourceImage
	host2image := func(instance string) (string, error) {
		// gcloud compute instances describe {INSTANCE} --format="get(disks[].source)"
		// gcloud compute disks describe {DISKURL} --format="get(sourceImage)"
		disks, err := gcloudf("instances", "describe", instance, "--format=get(disks[].source)")
		if err != nil {
			return "", err
		} else if len(disks) == 0 {
			return "", fmt.Errorf("instance %q had no findable disks", instance)
		}
		// Loop over disks, looking for the boot disk
		for _, disk := range disks {
			lines, err := gcloudf("disks", "describe", disk, "--format=get(sourceImage)")
			if err != nil {
				return "", err
			} else if len(lines) > 0 && lines[0] != "" {
				return lines[0], nil // break, we're done
			}
		}
		return "", fmt.Errorf("instance %q had no disk with a sourceImage", instance)
	}

	// gcloud compute instance-groups list-instances {GROUPNAME} --format="get(instance)"
	workerNodeName := ""
	instGroupName := strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",")[0]
	if lines, err := gcloudf("instance-groups", "list-instances", instGroupName, "--format=get(instance)"); err != nil {
		return "", "", err
	} else if len(lines) == 0 {
		return "", "", fmt.Errorf("no instances inside instance-group %q", instGroupName)
	} else {
		workerNodeName = lines[0]
	}

	workerNodeImg, err := host2image(workerNodeName)
	if err != nil {
		return "", "", err
	}
	frags := strings.Split(workerNodeImg, "/")
	workerNodeImg = frags[len(frags)-1]

	// For GKE clusters, controlPlaneNodeName will not be defined; we just leave controlPlaneNodeImg blank.
	controlPlaneNodeImg := ""
	if controlPlaneNodeName := framework.TestContext.CloudConfig.MasterName; controlPlaneNodeName != "" {
		img, err := host2image(controlPlaneNodeName)
		if err != nil {
			return "", "", err
		}
		frags = strings.Split(img, "/")
		controlPlaneNodeImg = frags[len(frags)-1]
	}

	return controlPlaneNodeImg, workerNodeImg, nil
}

// setupSuitePerGinkgoNode is the boilerplate that can be used to set up Ginkgo test suites in the SynchronizedBeforeSuite step.
// There are certain operations we only want to run once per overall test invocation on each Ginkgo node,
// such as making some global variables accessible to all parallel executions.
// Because of the way Ginkgo runs tests in parallel, we must use SynchronizedBeforeSuite.
// Ref: https://onsi.github.io/ginkgo/#parallel-specs
func setupSuitePerGinkgoNode(ctx context.Context) {
	// Obtain the default IP family of the cluster.
	// Some e2e tests are designed to work on IPv4 only; this global variable
	// allows adapting those tests to work on both IPv4 and IPv6.
	// TODO: dual-stack
	// Dual-stack clusters can be ipv4-ipv6 or ipv6-ipv4; order matters,
	// and services use the primary IP family by default.
	c, err := framework.LoadClientset()
	framework.ExpectNoError(err, "Error loading client")
	framework.TestContext.IPFamily = getDefaultClusterIPFamily(ctx, c)
	framework.Logf("Cluster IP family: %s", framework.TestContext.IPFamily)
}

func prepullImages(ctx context.Context, c clientset.Interface) {
	namespace, err := framework.CreateTestingNS(ctx, "img-puller", c, map[string]string{
		"e2e-framework": "img-puller",
	})
	framework.ExpectNoError(err)
	ns := namespace.Name
	ginkgo.DeferCleanup(c.CoreV1().Namespaces().Delete, ns, metav1.DeleteOptions{})

	images := commontest.PrePulledImages
	if framework.NodeOSDistroIs("windows") {
		images = commontest.WindowsPrePulledImages
	}

	label := map[string]string{"app": "prepull-daemonset"}
	var imgPullers []*appsv1.DaemonSet
	for _, img := range images.List() {
		dsName := fmt.Sprintf("img-pull-%s", strings.ReplaceAll(strings.ReplaceAll(img, "/", "-"), ":", "-"))

		dsSpec := daemonset.NewDaemonSet(dsName, img, label, nil, nil, nil)
		ds, err := c.AppsV1().DaemonSets(ns).Create(ctx, dsSpec, metav1.CreateOptions{})
		framework.ExpectNoError(err)
		imgPullers = append(imgPullers, ds)
	}

	// This should not be a multiple of 5, because node status updates
	// every 5 seconds. See https://github.com/kubernetes/kubernetes/pull/14915.
	dsRetryPeriod := 9 * time.Second
	dsRetryTimeout := 5 * time.Minute

	for _, imgPuller := range imgPullers {
		checkDaemonset := func(ctx context.Context) (bool, error) {
			return daemonset.CheckPresentOnNodes(ctx, c, imgPuller, ns, framework.TestContext.CloudConfig.NumNodes)
		}
		framework.Logf("Waiting for %s", imgPuller.Name)
		err := wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, checkDaemonset)
		framework.ExpectNoError(err, "error waiting for image to be pulled")
	}
}