k8s.io/kubernetes@v1.29.3/test/e2e/e2e.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"k8s.io/klog/v2"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"

	appsv1 "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/component-base/logs"
	"k8s.io/component-base/version"
	commontest "k8s.io/kubernetes/test/e2e/common"
	"k8s.io/kubernetes/test/e2e/framework"
	"k8s.io/kubernetes/test/e2e/framework/daemonset"
	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
	e2ekubectl "k8s.io/kubernetes/test/e2e/framework/kubectl"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2ereporters "k8s.io/kubernetes/test/e2e/reporters"
	utilnet "k8s.io/utils/net"

	clientset "k8s.io/client-go/kubernetes"
	// ensure auth plugins are loaded
	_ "k8s.io/client-go/plugin/pkg/client/auth"

	// ensure that cloud providers are loaded
	_ "k8s.io/kubernetes/test/e2e/framework/providers/aws"
	_ "k8s.io/kubernetes/test/e2e/framework/providers/azure"
	_ "k8s.io/kubernetes/test/e2e/framework/providers/gce"
	_ "k8s.io/kubernetes/test/e2e/framework/providers/kubemark"
	_ "k8s.io/kubernetes/test/e2e/framework/providers/openstack"
	_ "k8s.io/kubernetes/test/e2e/framework/providers/vsphere"

	// Ensure that logging flags are part of the command line.
	_ "k8s.io/component-base/logs/testinit"
)

const (
	// namespaceCleanupTimeout is how long to wait for the namespace to be deleted.
	// If there are any orphaned namespaces to clean up, this test is running
	// on a long-lived cluster. A long wait here is preferable to spurious test
	// failures caused by leaked resources from a previous test run.
	namespaceCleanupTimeout = 15 * time.Minute
)

var progressReporter = &e2ereporters.ProgressReporter{}

var _ = ginkgo.SynchronizedBeforeSuite(func(ctx context.Context) []byte {
	// Reference common test to make the import valid.
	commontest.CurrentSuite = commontest.E2E
	progressReporter.SetStartMsg()
	setupSuite(ctx)
	return nil
}, func(ctx context.Context, data []byte) {
	// Run on all Ginkgo nodes
	setupSuitePerGinkgoNode(ctx)
})

var _ = ginkgo.SynchronizedAfterSuite(func() {
	progressReporter.SetEndMsg()
}, func(ctx context.Context) {
	AfterSuiteActions(ctx)
})

// RunE2ETests checks configuration parameters (specified through flags) and then runs
// E2E tests using the Ginkgo runner.
// If a "report directory" is specified, one or more JUnit test reports will be
// generated in this directory, and cluster logs will also be saved.
// This function is called on each Ginkgo node in parallel mode.
func RunE2ETests(t *testing.T) {
	// InitLogs disables contextual logging, without a way to enable it again
	// in the E2E test suite because it has no feature gates. It used to have a
	// misleading --feature-gates parameter but that didn't do what users
	// and developers expected (define which features the cluster supports)
	// and therefore got removed.
	//
	// Because contextual logging is useful and should get tested, it gets
	// re-enabled here unconditionally.
	logs.InitLogs()
	defer logs.FlushLogs()
	klog.EnableContextualLogging(true)

	progressReporter = e2ereporters.NewProgressReporter(framework.TestContext.ProgressReportURL)
	gomega.RegisterFailHandler(framework.Fail)

	// Run tests through the Ginkgo runner with output to console + JUnit for Jenkins
	suiteConfig, reporterConfig := framework.CreateGinkgoConfig()
	klog.Infof("Starting e2e run %q on Ginkgo node %d", framework.RunID, suiteConfig.ParallelProcess)
	ginkgo.RunSpecs(t, "Kubernetes e2e suite", suiteConfig, reporterConfig)
}

// getDefaultClusterIPFamily obtains the default IP family of the cluster
// using the ClusterIP address of the kubernetes service created in the default namespace.
// This unequivocally identifies the default IP family because services are single-family.
// TODO: dual-stack may support multiple families per service,
// but we can detect if a cluster is dual-stack because pods have two addresses (one per family).
func getDefaultClusterIPFamily(ctx context.Context, c clientset.Interface) string {
	// Get the ClusterIP of the kubernetes service created in the default namespace
	svc, err := c.CoreV1().Services(metav1.NamespaceDefault).Get(ctx, "kubernetes", metav1.GetOptions{})
	if err != nil {
		framework.Failf("Failed to get kubernetes service ClusterIP: %v", err)
	}

	if utilnet.IsIPv6String(svc.Spec.ClusterIP) {
		return "ipv6"
	}
	return "ipv4"
}

// waitForDaemonSets waits for all daemonsets in the given namespace to be ready
// (defined as all but 'allowedNotReadyNodes' pods associated with that
// daemonset are ready).
//
// If allowedNotReadyNodes is -1, this method returns immediately without waiting.
func waitForDaemonSets(ctx context.Context, c clientset.Interface, ns string, allowedNotReadyNodes int32, timeout time.Duration) error {
	if allowedNotReadyNodes == -1 {
		return nil
	}

	start := time.Now()
	framework.Logf("Waiting up to %v for all daemonsets in namespace '%s' to start",
		timeout, ns)

	return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, true, func(ctx context.Context) (bool, error) {
		dsList, err := c.AppsV1().DaemonSets(ns).List(ctx, metav1.ListOptions{})
		if err != nil {
			framework.Logf("Error getting daemonsets in namespace: '%s': %v", ns, err)
			return false, err
		}
		var notReadyDaemonSets []string
		for _, ds := range dsList.Items {
			framework.Logf("%d / %d pods ready in namespace '%s' in daemonset '%s' (%d seconds elapsed)", ds.Status.NumberReady, ds.Status.DesiredNumberScheduled, ns, ds.ObjectMeta.Name, int(time.Since(start).Seconds()))
			if ds.Status.DesiredNumberScheduled-ds.Status.NumberReady > allowedNotReadyNodes {
				notReadyDaemonSets = append(notReadyDaemonSets, ds.ObjectMeta.Name)
			}
		}

		if len(notReadyDaemonSets) > 0 {
			framework.Logf("there are not ready daemonsets: %v", notReadyDaemonSets)
			return false, nil
		}

		return true, nil
	})
}

// setupSuite is the boilerplate that can be used to set up ginkgo test suites, on the SynchronizedBeforeSuite step.
// There are certain operations we only want to run once per overall test invocation
// (such as deleting old namespaces, or verifying that all system pods are running).
// Because of the way Ginkgo runs tests in parallel, we must use SynchronizedBeforeSuite
// to ensure that these operations only run on the first parallel Ginkgo node.
//
// This function takes two parameters: one function which runs on only the first Ginkgo node,
// returning an opaque byte array, and then a second function which runs on all Ginkgo nodes,
// accepting the byte array.
func setupSuite(ctx context.Context) {
	// Run only on Ginkgo node 1

	switch framework.TestContext.Provider {
	case "gce", "gke":
		logClusterImageSources()
	}

	c, err := framework.LoadClientset()
	if err != nil {
		klog.Fatal("Error loading client: ", err)
	}

	// Delete any namespaces except those created by the system. This ensures no
	// lingering resources are left over from a previous test run.
	if framework.TestContext.CleanStart {
		deleted, err := framework.DeleteNamespaces(ctx, c, nil, /* deleteFilter */
			[]string{
				metav1.NamespaceSystem,
				metav1.NamespaceDefault,
				metav1.NamespacePublic,
				v1.NamespaceNodeLease,
			})
		if err != nil {
			framework.Failf("Error deleting orphaned namespaces: %v", err)
		}
		if err := framework.WaitForNamespacesDeleted(ctx, c, deleted, namespaceCleanupTimeout); err != nil {
			framework.Failf("Failed to delete orphaned namespaces %v: %v", deleted, err)
		}
	}

	timeouts := framework.NewTimeoutContext()

	// In large clusters we may get to this point but still have a bunch
	// of nodes without Routes created. Since this would make a node
	// unschedulable, we need to wait until all of them are schedulable.
	framework.ExpectNoError(e2enode.WaitForAllNodesSchedulable(ctx, c, timeouts.NodeSchedulable))

	// If NumNodes is not specified then auto-detect how many are schedulable and not tainted
	if framework.TestContext.CloudConfig.NumNodes == framework.DefaultNumNodes {
		nodes, err := e2enode.GetReadySchedulableNodes(ctx, c)
		framework.ExpectNoError(err)
		framework.TestContext.CloudConfig.NumNodes = len(nodes.Items)
	}

	// Ensure all pods are running and ready before starting tests (otherwise,
	// cluster infrastructure pods that are being pulled or started can block
	// test pods from running, and tests that ensure all pods are running and
	// ready will fail).
	//
	// TODO: In large clusters, we often observe non-starting pods due to
	// #41007. To avoid those pods preventing the whole test run (and just
	// wasting the whole run), we allow for some not-ready pods (with the
	// number equal to the number of allowed not-ready nodes).
	if err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemPodsStartup); err != nil {
		e2edebug.DumpAllNamespaceInfo(ctx, c, metav1.NamespaceSystem)
		e2ekubectl.LogFailedContainers(ctx, c, metav1.NamespaceSystem, framework.Logf)
		framework.Failf("Error waiting for all pods to be running and ready: %v", err)
	}

	if err := waitForDaemonSets(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemDaemonsetStartup); err != nil {
		framework.Logf("WARNING: Waiting for all daemonsets to be ready failed: %v", err)
	}

	if framework.TestContext.PrepullImages {
		framework.Logf("Pre-pulling images so that they are cached for the tests.")
		prepullImages(ctx, c)
	}

	// Log the version of the server and this client.
	framework.Logf("e2e test version: %s", version.Get().GitVersion)

	dc := c.DiscoveryClient

	serverVersion, serverErr := dc.ServerVersion()
	if serverErr != nil {
		framework.Logf("Unexpected server error retrieving version: %v", serverErr)
	}
	if serverVersion != nil {
		framework.Logf("kube-apiserver version: %s", serverVersion.GitVersion)
	}

	if framework.TestContext.NodeKiller.Enabled {
		nodeKiller := e2enode.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider)
		go nodeKiller.Run(framework.TestContext.NodeKiller.NodeKillerStopCtx)
	}
}

// logClusterImageSources writes out cluster image sources.
func logClusterImageSources() {
	controlPlaneNodeImg, workerNodeImg, err := lookupClusterImageSources()
	if err != nil {
		framework.Logf("Cluster image sources lookup failed: %v\n", err)
		return
	}
	framework.Logf("cluster-control-plane-node-image: %s", controlPlaneNodeImg)
	framework.Logf("cluster-worker-node-image: %s", workerNodeImg)

	images := map[string]string{
		"control_plane_node_os_image": controlPlaneNodeImg,
		"worker_node_os_image":        workerNodeImg,
	}

	outputBytes, _ := json.MarshalIndent(images, "", " ")
	filePath := filepath.Join(framework.TestContext.ReportDir, "images.json")
	if err := os.WriteFile(filePath, outputBytes, 0644); err != nil {
		framework.Logf("cluster image sources: could not write to %q: %v", filePath, err)
	}
}

// TODO: These should really just use the GCE API client library or at least use
// better formatted output from the --format flag.

// lookupClusterImageSources returns the control plane node & worker node image strings, or an error.
func lookupClusterImageSources() (string, string, error) {
	// Given args for a gcloud compute command, run it with the other args, and return the values,
	// whether separated by newlines, commas or semicolons.
	gcloudf := func(argv ...string) ([]string, error) {
		args := []string{"compute"}
		args = append(args, argv...)
		args = append(args, "--project", framework.TestContext.CloudConfig.ProjectID)
		if framework.TestContext.CloudConfig.MultiMaster {
			args = append(args, "--region", framework.TestContext.CloudConfig.Region)
		} else {
			args = append(args, "--zone", framework.TestContext.CloudConfig.Zone)
		}
		outputBytes, err := exec.Command("gcloud", args...).CombinedOutput()
		str := strings.Replace(string(outputBytes), ",", "\n", -1)
		str = strings.Replace(str, ";", "\n", -1)
		lines := strings.Split(str, "\n")
		if err != nil {
			framework.Logf("lookupDiskImageSources: gcloud error with [%#v]; err:%v", argv, err)
			for _, l := range lines {
				framework.Logf(" > %s", l)
			}
		}
		return lines, err
	}

	// Given a GCE instance, look through its disks, finding one that has a sourceImage
	host2image := func(instance string) (string, error) {
		// gcloud compute instances describe {INSTANCE} --format="get(disks[].source)"
		// gcloud compute disks describe {DISKURL} --format="get(sourceImage)"
		disks, err := gcloudf("instances", "describe", instance, "--format=get(disks[].source)")
		if err != nil {
			return "", err
		} else if len(disks) == 0 {
			return "", fmt.Errorf("instance %q had no findable disks", instance)
		}
		// Loop over disks, looking for the boot disk
		for _, disk := range disks {
			lines, err := gcloudf("disks", "describe", disk, "--format=get(sourceImage)")
			if err != nil {
				return "", err
			} else if len(lines) > 0 && lines[0] != "" {
				return lines[0], nil // break, we're done
			}
		}
		return "", fmt.Errorf("instance %q had no disk with a sourceImage", instance)
	}

	// gcloud compute instance-groups list-instances {GROUPNAME} --format="get(instance)"
	workerNodeName := ""
	instGroupName := strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",")[0]
	if lines, err := gcloudf("instance-groups", "list-instances", instGroupName, "--format=get(instance)"); err != nil {
		return "", "", err
	} else if len(lines) == 0 {
		return "", "", fmt.Errorf("no instances inside instance-group %q", instGroupName)
	} else {
		workerNodeName = lines[0]
	}

	workerNodeImg, err := host2image(workerNodeName)
	if err != nil {
		return "", "", err
	}
	frags := strings.Split(workerNodeImg, "/")
	workerNodeImg = frags[len(frags)-1]

	// For GKE clusters, controlPlaneNodeName will not be defined; we just leave controlPlaneNodeImg blank.
	controlPlaneNodeImg := ""
	if controlPlaneNodeName := framework.TestContext.CloudConfig.MasterName; controlPlaneNodeName != "" {
		img, err := host2image(controlPlaneNodeName)
		if err != nil {
			return "", "", err
		}
		frags = strings.Split(img, "/")
		controlPlaneNodeImg = frags[len(frags)-1]
	}

	return controlPlaneNodeImg, workerNodeImg, nil
}

// setupSuitePerGinkgoNode is the boilerplate that can be used to set up ginkgo test suites, on the SynchronizedBeforeSuite step.
// There are certain operations we only want to run once per overall test invocation on each Ginkgo node,
// such as making some global variables accessible to all parallel executions.
// Because of the way Ginkgo runs tests in parallel, we must use SynchronizedBeforeSuite.
// Ref: https://onsi.github.io/ginkgo/#parallel-specs
func setupSuitePerGinkgoNode(ctx context.Context) {
	// Obtain the default IP family of the cluster.
	// Some e2e tests are designed to work on IPv4 only; this global variable
	// allows adapting those tests to work on both IPv4 and IPv6.
	// TODO: dual-stack
	// dual-stack clusters can be ipv4-ipv6 or ipv6-ipv4, order matters,
	// and services use the primary IP family by default.
	c, err := framework.LoadClientset()
	if err != nil {
		klog.Fatal("Error loading client: ", err)
	}
	framework.TestContext.IPFamily = getDefaultClusterIPFamily(ctx, c)
	framework.Logf("Cluster IP family: %s", framework.TestContext.IPFamily)
}

func prepullImages(ctx context.Context, c clientset.Interface) {
	namespace, err := framework.CreateTestingNS(ctx, "img-puller", c, map[string]string{
		"e2e-framework": "img-puller",
	})
	framework.ExpectNoError(err)
	ns := namespace.Name
	ginkgo.DeferCleanup(c.CoreV1().Namespaces().Delete, ns, metav1.DeleteOptions{})

	images := commontest.PrePulledImages
	if framework.NodeOSDistroIs("windows") {
		images = commontest.WindowsPrePulledImages
	}

	label := map[string]string{"app": "prepull-daemonset"}
	var imgPullers []*appsv1.DaemonSet
	for _, img := range images.List() {
		dsName := fmt.Sprintf("img-pull-%s", strings.ReplaceAll(strings.ReplaceAll(img, "/", "-"), ":", "-"))

		dsSpec := daemonset.NewDaemonSet(dsName, img, label, nil, nil, nil)
		ds, err := c.AppsV1().DaemonSets(ns).Create(ctx, dsSpec, metav1.CreateOptions{})
		framework.ExpectNoError(err)
		imgPullers = append(imgPullers, ds)
	}

	// This should not be a multiple of 5, because node status updates
	// every 5 seconds. See https://github.com/kubernetes/kubernetes/pull/14915.
	dsRetryPeriod := 9 * time.Second
	dsRetryTimeout := 5 * time.Minute

	for _, imgPuller := range imgPullers {
		checkDaemonset := func(ctx context.Context) (bool, error) {
			return daemonset.CheckPresentOnNodes(ctx, c, imgPuller, ns, framework.TestContext.CloudConfig.NumNodes)
		}
		framework.Logf("Waiting for %s", imgPuller.Name)
		err := wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, checkDaemonset)
		framework.ExpectNoError(err, "error waiting for image to be pulled")
	}
}
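
// Usage note (illustrative sketch, not part of the upstream file): RunE2ETests
// is normally invoked from a separate *_test.go file in the suite's package so
// that `go test` drives the Ginkgo run. A minimal sketch, assuming flags have
// already been registered and parsed (e.g. in TestMain) before the test runs:
//
//	func TestE2E(t *testing.T) {
//		RunE2ETests(t)
//	}
//
// The exact wiring (flag registration, framework.AfterReadingAllFlags, etc.)
// lives in the suite's own e2e_test.go and may differ between releases.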