k8s.io/kubernetes@v1.29.3/test/e2e/e2e.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"k8s.io/klog/v2"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"

	appsv1 "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/component-base/logs"
	"k8s.io/component-base/version"
	commontest "k8s.io/kubernetes/test/e2e/common"
	"k8s.io/kubernetes/test/e2e/framework"
	"k8s.io/kubernetes/test/e2e/framework/daemonset"
	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
	e2ekubectl "k8s.io/kubernetes/test/e2e/framework/kubectl"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2ereporters "k8s.io/kubernetes/test/e2e/reporters"
	utilnet "k8s.io/utils/net"

	clientset "k8s.io/client-go/kubernetes"
	// ensure auth plugins are loaded
	_ "k8s.io/client-go/plugin/pkg/client/auth"

	// ensure that cloud providers are loaded
	_ "k8s.io/kubernetes/test/e2e/framework/providers/aws"
	_ "k8s.io/kubernetes/test/e2e/framework/providers/azure"
	_ "k8s.io/kubernetes/test/e2e/framework/providers/gce"
	_ "k8s.io/kubernetes/test/e2e/framework/providers/kubemark"
	_ "k8s.io/kubernetes/test/e2e/framework/providers/openstack"
	_ "k8s.io/kubernetes/test/e2e/framework/providers/vsphere"

	// Ensure that logging flags are part of the command line.
	_ "k8s.io/component-base/logs/testinit"
)

const (
	// namespaceCleanupTimeout is how long to wait for the namespace to be deleted.
	// If there are any orphaned namespaces to clean up, this test is running
	// on a long-lived cluster. A long wait here is preferable to spurious test
	// failures caused by leaked resources from a previous test run.
	namespaceCleanupTimeout = 15 * time.Minute
)

var progressReporter = &e2ereporters.ProgressReporter{}

var _ = ginkgo.SynchronizedBeforeSuite(func(ctx context.Context) []byte {
	// Reference common test to make the import valid.
	commontest.CurrentSuite = commontest.E2E
	progressReporter.SetStartMsg()
	setupSuite(ctx)
	return nil
}, func(ctx context.Context, data []byte) {
	// Run on all Ginkgo nodes
	setupSuitePerGinkgoNode(ctx)
})

var _ = ginkgo.SynchronizedAfterSuite(func() {
	progressReporter.SetEndMsg()
}, func(ctx context.Context) {
	AfterSuiteActions(ctx)
})

// RunE2ETests checks configuration parameters (specified through flags) and then runs
// E2E tests using the Ginkgo runner.
// If a "report directory" is specified, one or more JUnit test reports will be
// generated in this directory, and cluster logs will also be saved.
// This function is called on each Ginkgo node in parallel mode.
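//
// A minimal caller is a single Go test that hands control to Ginkgo. This is a
// sketch of the usual pattern (in the Kubernetes tree the entry point lives in
// test/e2e/e2e_test.go):
//
//	func TestE2E(t *testing.T) {
//		RunE2ETests(t)
//	}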
func RunE2ETests(t *testing.T) {
	// InitLogs disables contextual logging, without a way to enable it again
	// in the E2E test suite because it has no feature gates. It used to have a
	// misleading --feature-gates parameter but that didn't do what users
	// and developers expected (define which features the cluster supports)
	// and therefore got removed.
	//
	// Because contextual logging is useful and should get tested, it gets
	// re-enabled here unconditionally.
	logs.InitLogs()
	defer logs.FlushLogs()
	klog.EnableContextualLogging(true)

	progressReporter = e2ereporters.NewProgressReporter(framework.TestContext.ProgressReportURL)
	gomega.RegisterFailHandler(framework.Fail)

	// Run tests through the Ginkgo runner with output to console + JUnit for Jenkins
	suiteConfig, reporterConfig := framework.CreateGinkgoConfig()
	klog.Infof("Starting e2e run %q on Ginkgo node %d", framework.RunID, suiteConfig.ParallelProcess)
	ginkgo.RunSpecs(t, "Kubernetes e2e suite", suiteConfig, reporterConfig)
}

// getDefaultClusterIPFamily obtains the default IP family of the cluster
// using the ClusterIP address of the kubernetes service created in the default namespace.
// This unequivocally identifies the default IP family because services are single-family.
// TODO: dual-stack may support multiple families per service,
// but we can detect if a cluster is dual-stack because pods have two addresses (one per family).
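// For example, a cluster whose "kubernetes" Service has ClusterIP 10.96.0.1 is
// reported as "ipv4", while one with ClusterIP fd00:10:96::1 is "ipv6"
// (illustrative addresses; the actual values depend on the cluster's service CIDR).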
func getDefaultClusterIPFamily(ctx context.Context, c clientset.Interface) string {
	// Get the ClusterIP of the kubernetes service created in the default namespace
	svc, err := c.CoreV1().Services(metav1.NamespaceDefault).Get(ctx, "kubernetes", metav1.GetOptions{})
	if err != nil {
		framework.Failf("Failed to get kubernetes service ClusterIP: %v", err)
	}

	if utilnet.IsIPv6String(svc.Spec.ClusterIP) {
		return "ipv6"
	}
	return "ipv4"
}

// waitForDaemonSets waits for all daemonsets in the given namespace to be ready
// (defined as all but 'allowedNotReadyNodes' of the pods associated with each
// daemonset being ready).
//
// If allowedNotReadyNodes is -1, this method returns immediately without waiting.
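//
// A rough usage sketch, mirroring the call in setupSuite below:
//
//	err := waitForDaemonSets(ctx, c, metav1.NamespaceSystem,
//		int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemDaemonsetStartup)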
func waitForDaemonSets(ctx context.Context, c clientset.Interface, ns string, allowedNotReadyNodes int32, timeout time.Duration) error {
	if allowedNotReadyNodes == -1 {
		return nil
	}

	start := time.Now()
	framework.Logf("Waiting up to %v for all daemonsets in namespace '%s' to start",
		timeout, ns)

	return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, true, func(ctx context.Context) (bool, error) {
		dsList, err := c.AppsV1().DaemonSets(ns).List(ctx, metav1.ListOptions{})
		if err != nil {
			framework.Logf("Error getting daemonsets in namespace: '%s': %v", ns, err)
			return false, err
		}
		var notReadyDaemonSets []string
		for _, ds := range dsList.Items {
			framework.Logf("%d / %d pods ready in namespace '%s' in daemonset '%s' (%d seconds elapsed)", ds.Status.NumberReady, ds.Status.DesiredNumberScheduled, ns, ds.ObjectMeta.Name, int(time.Since(start).Seconds()))
			if ds.Status.DesiredNumberScheduled-ds.Status.NumberReady > allowedNotReadyNodes {
				notReadyDaemonSets = append(notReadyDaemonSets, ds.ObjectMeta.Name)
			}
		}

		if len(notReadyDaemonSets) > 0 {
			framework.Logf("the following daemonsets are not ready: %v", notReadyDaemonSets)
			return false, nil
		}

		return true, nil
	})
}

// setupSuite is the boilerplate that can be used to set up Ginkgo test suites in the SynchronizedBeforeSuite step.
// There are certain operations we only want to run once per overall test invocation
// (such as deleting old namespaces, or verifying that all system pods are running).
// Because of the way Ginkgo runs tests in parallel, we must use SynchronizedBeforeSuite
// to ensure that these operations only run on the first parallel Ginkgo node.
//
// SynchronizedBeforeSuite itself takes two functions: the first runs only on the
// first Ginkgo node and returns an opaque byte slice, and the second runs on all
// Ginkgo nodes and receives that byte slice; see the wiring sketch below.
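//
// The registration at the top of this file follows that shape (trimmed here to
// the essential calls):
//
//	var _ = ginkgo.SynchronizedBeforeSuite(func(ctx context.Context) []byte {
//		setupSuite(ctx) // first Ginkgo process only
//		return nil
//	}, func(ctx context.Context, data []byte) {
//		setupSuitePerGinkgoNode(ctx) // every Ginkgo process
//	})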
func setupSuite(ctx context.Context) {
	// Run only on Ginkgo node 1

	switch framework.TestContext.Provider {
	case "gce", "gke":
		logClusterImageSources()
	}

	c, err := framework.LoadClientset()
	if err != nil {
		klog.Fatal("Error loading client: ", err)
	}

	// Delete any namespaces except those created by the system. This ensures no
	// lingering resources are left over from a previous test run.
	if framework.TestContext.CleanStart {
		deleted, err := framework.DeleteNamespaces(ctx, c, nil, /* deleteFilter */
			[]string{
				metav1.NamespaceSystem,
				metav1.NamespaceDefault,
				metav1.NamespacePublic,
				v1.NamespaceNodeLease,
			})
		if err != nil {
			framework.Failf("Error deleting orphaned namespaces: %v", err)
		}
		if err := framework.WaitForNamespacesDeleted(ctx, c, deleted, namespaceCleanupTimeout); err != nil {
			framework.Failf("Failed to delete orphaned namespaces %v: %v", deleted, err)
		}
	}

	timeouts := framework.NewTimeoutContext()

	// In large clusters we may get to this point but still have a bunch
	// of nodes without Routes created. Since this would make a node
	// unschedulable, we need to wait until all of them are schedulable.
	framework.ExpectNoError(e2enode.WaitForAllNodesSchedulable(ctx, c, timeouts.NodeSchedulable))

	// If NumNodes is not specified then auto-detect how many are schedulable and not tainted
	if framework.TestContext.CloudConfig.NumNodes == framework.DefaultNumNodes {
		nodes, err := e2enode.GetReadySchedulableNodes(ctx, c)
		framework.ExpectNoError(err)
		framework.TestContext.CloudConfig.NumNodes = len(nodes.Items)
	}

	// Ensure all pods are running and ready before starting tests (otherwise,
	// cluster infrastructure pods that are being pulled or started can block
	// test pods from running, and tests that ensure all pods are running and
	// ready will fail).
	//
	// TODO: In large clusters, we often observe non-starting pods due to
	// #41007. To keep those pods from blocking (and thus wasting) the whole
	// test run, we allow for some not-ready pods (with the number equal to
	// the number of allowed not-ready nodes).
	if err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemPodsStartup); err != nil {
		e2edebug.DumpAllNamespaceInfo(ctx, c, metav1.NamespaceSystem)
		e2ekubectl.LogFailedContainers(ctx, c, metav1.NamespaceSystem, framework.Logf)
		framework.Failf("Error waiting for all pods to be running and ready: %v", err)
	}

	if err := waitForDaemonSets(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemDaemonsetStartup); err != nil {
		framework.Logf("WARNING: Waiting for all daemonsets to be ready failed: %v", err)
	}

	if framework.TestContext.PrepullImages {
		framework.Logf("Pre-pulling images so that they are cached for the tests.")
		prepullImages(ctx, c)
	}

	// Log the version of the server and this client.
	framework.Logf("e2e test version: %s", version.Get().GitVersion)

	dc := c.DiscoveryClient

	serverVersion, serverErr := dc.ServerVersion()
	if serverErr != nil {
		framework.Logf("Unexpected server error retrieving version: %v", serverErr)
	}
	if serverVersion != nil {
		framework.Logf("kube-apiserver version: %s", serverVersion.GitVersion)
	}

	if framework.TestContext.NodeKiller.Enabled {
		nodeKiller := e2enode.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider)
		go nodeKiller.Run(framework.TestContext.NodeKiller.NodeKillerStopCtx)
	}
}

// logClusterImageSources writes out cluster image sources.
func logClusterImageSources() {
	controlPlaneNodeImg, workerNodeImg, err := lookupClusterImageSources()
	if err != nil {
		framework.Logf("Cluster image sources lookup failed: %v\n", err)
		return
	}
	framework.Logf("cluster-control-plane-node-image: %s", controlPlaneNodeImg)
	framework.Logf("cluster-worker-node-image: %s", workerNodeImg)

	images := map[string]string{
		"control_plane_node_os_image": controlPlaneNodeImg,
		"worker_node_os_image":        workerNodeImg,
	}

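	// A sketch of the resulting images.json content (values are illustrative,
	// not taken from a real cluster):
	//
	//	{
	//	  "control_plane_node_os_image": "cos-109-17800-0-45",
	//	  "worker_node_os_image": "cos-109-17800-0-45"
	//	}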
	outputBytes, _ := json.MarshalIndent(images, "", "  ")
	filePath := filepath.Join(framework.TestContext.ReportDir, "images.json")
	if err := os.WriteFile(filePath, outputBytes, 0644); err != nil {
		framework.Logf("cluster image sources: could not write to %q: %v", filePath, err)
	}
}

// TODO: These should really just use the GCE API client library or at least use
// better formatted output from the --format flag.

// lookupClusterImageSources returns the control plane node and worker node image strings, or an error.
func lookupClusterImageSources() (string, string, error) {
	// Given args for a gcloud compute command, run it (with project and zone/region
	// appended) and return the output values, whether separated by newlines, commas or semicolons.
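	// For example (mirroring the calls below):
	//
	//	gcloudf("instances", "describe", instance, "--format=get(disks[].source)")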
	gcloudf := func(argv ...string) ([]string, error) {
		args := []string{"compute"}
		args = append(args, argv...)
		args = append(args, "--project", framework.TestContext.CloudConfig.ProjectID)
		if framework.TestContext.CloudConfig.MultiMaster {
			args = append(args, "--region", framework.TestContext.CloudConfig.Region)
		} else {
			args = append(args, "--zone", framework.TestContext.CloudConfig.Zone)
		}
		outputBytes, err := exec.Command("gcloud", args...).CombinedOutput()
		str := strings.Replace(string(outputBytes), ",", "\n", -1)
		str = strings.Replace(str, ";", "\n", -1)
		lines := strings.Split(str, "\n")
		if err != nil {
			framework.Logf("lookupClusterImageSources: gcloud error with [%#v]; err:%v", argv, err)
			for _, l := range lines {
				framework.Logf(" > %s", l)
			}
		}
		return lines, err
	}

	// Given a GCE instance, look through its disks, finding one that has a sourceImage
	host2image := func(instance string) (string, error) {
		// gcloud compute instances describe {INSTANCE} --format="get(disks[].source)"
		// gcloud compute disks describe {DISKURL} --format="get(sourceImage)"
		disks, err := gcloudf("instances", "describe", instance, "--format=get(disks[].source)")
		if err != nil {
			return "", err
		} else if len(disks) == 0 {
			return "", fmt.Errorf("instance %q had no findable disks", instance)
		}
		// Loop over disks, looking for the boot disk
		for _, disk := range disks {
			lines, err := gcloudf("disks", "describe", disk, "--format=get(sourceImage)")
			if err != nil {
				return "", err
			} else if len(lines) > 0 && lines[0] != "" {
				return lines[0], nil // break, we're done
			}
		}
		return "", fmt.Errorf("instance %q had no disk with a sourceImage", instance)
	}

	// gcloud compute instance-groups list-instances {GROUPNAME} --format="get(instance)"
	workerNodeName := ""
	instGroupName := strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",")[0]
	if lines, err := gcloudf("instance-groups", "list-instances", instGroupName, "--format=get(instance)"); err != nil {
		return "", "", err
	} else if len(lines) == 0 {
		return "", "", fmt.Errorf("no instances inside instance-group %q", instGroupName)
	} else {
		workerNodeName = lines[0]
	}

	workerNodeImg, err := host2image(workerNodeName)
	if err != nil {
		return "", "", err
	}
	frags := strings.Split(workerNodeImg, "/")
	workerNodeImg = frags[len(frags)-1]

	// For GKE clusters, controlPlaneNodeName will not be defined; we just leave controlPlaneNodeImg blank.
	controlPlaneNodeImg := ""
	if controlPlaneNodeName := framework.TestContext.CloudConfig.MasterName; controlPlaneNodeName != "" {
		img, err := host2image(controlPlaneNodeName)
		if err != nil {
			return "", "", err
		}
		frags = strings.Split(img, "/")
		controlPlaneNodeImg = frags[len(frags)-1]
	}

	return controlPlaneNodeImg, workerNodeImg, nil
}

// setupSuitePerGinkgoNode is the boilerplate that can be used to set up Ginkgo test suites in the SynchronizedBeforeSuite step.
// There are certain operations we only want to run once per overall test invocation on each Ginkgo node,
// such as making some global variables accessible to all parallel executions.
// Because of the way Ginkgo runs tests in parallel, we must use SynchronizedBeforeSuite to do this on every node.
// Ref: https://onsi.github.io/ginkgo/#parallel-specs
func setupSuitePerGinkgoNode(ctx context.Context) {
	// Obtain the default IP family of the cluster.
	// Some e2e tests are designed to work on IPv4 only; this global variable
	// allows adapting those tests to work on both IPv4 and IPv6.
	// TODO: dual-stack
	// dual-stack clusters can be ipv4-ipv6 or ipv6-ipv4; order matters,
	// and services use the primary IP family by default.
	c, err := framework.LoadClientset()
	if err != nil {
		klog.Fatal("Error loading client: ", err)
	}
	framework.TestContext.IPFamily = getDefaultClusterIPFamily(ctx, c)
	framework.Logf("Cluster IP family: %s", framework.TestContext.IPFamily)
}

func prepullImages(ctx context.Context, c clientset.Interface) {
	namespace, err := framework.CreateTestingNS(ctx, "img-puller", c, map[string]string{
		"e2e-framework": "img-puller",
	})
	framework.ExpectNoError(err)
	ns := namespace.Name
	ginkgo.DeferCleanup(c.CoreV1().Namespaces().Delete, ns, metav1.DeleteOptions{})

	images := commontest.PrePulledImages
	if framework.NodeOSDistroIs("windows") {
		images = commontest.WindowsPrePulledImages
	}

	label := map[string]string{"app": "prepull-daemonset"}
	var imgPullers []*appsv1.DaemonSet
	for _, img := range images.List() {
		dsName := fmt.Sprintf("img-pull-%s", strings.ReplaceAll(strings.ReplaceAll(img, "/", "-"), ":", "-"))
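		// e.g. image "registry.k8s.io/pause:3.9" (an illustrative name) yields
		// the DaemonSet name "img-pull-registry.k8s.io-pause-3.9".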

		dsSpec := daemonset.NewDaemonSet(dsName, img, label, nil, nil, nil)
		ds, err := c.AppsV1().DaemonSets(ns).Create(ctx, dsSpec, metav1.CreateOptions{})
		framework.ExpectNoError(err)
		imgPullers = append(imgPullers, ds)
	}

	// this should not be a multiple of 5, because node status updates
	// every 5 seconds. See https://github.com/kubernetes/kubernetes/pull/14915.
	dsRetryPeriod := 9 * time.Second
	dsRetryTimeout := 5 * time.Minute

	for _, imgPuller := range imgPullers {
		checkDaemonset := func(ctx context.Context) (bool, error) {
			return daemonset.CheckPresentOnNodes(ctx, c, imgPuller, ns, framework.TestContext.CloudConfig.NumNodes)
		}
		framework.Logf("Waiting for %s", imgPuller.Name)
		err := wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, checkDaemonset)
		framework.ExpectNoError(err, "error waiting for image to be pulled")
	}
}