k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e_node/util.go

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package e2enode
    18  
    19  import (
    20  	"context"
    21  	"crypto/tls"
    22  	"encoding/json"
    23  	"flag"
    24  	"fmt"
    25  	"io"
    26  	"net"
    27  	"net/http"
    28  	"os"
    29  	"os/exec"
    30  	"regexp"
    31  	"strconv"
    32  	"strings"
    33  	"time"
    34  
    35  	"k8s.io/kubernetes/pkg/util/procfs"
    36  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    37  
    38  	"go.opentelemetry.io/otel/trace/noop"
    39  
    40  	v1 "k8s.io/api/core/v1"
    41  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    42  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    43  	"k8s.io/apimachinery/pkg/util/runtime"
    44  	"k8s.io/apimachinery/pkg/util/sets"
    45  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    46  	clientset "k8s.io/client-go/kubernetes"
    47  	"k8s.io/component-base/featuregate"
    48  	internalapi "k8s.io/cri-api/pkg/apis"
    49  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    50  	remote "k8s.io/cri-client/pkg"
    51  	"k8s.io/klog/v2"
    52  	kubeletpodresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
    53  	kubeletpodresourcesv1alpha1 "k8s.io/kubelet/pkg/apis/podresources/v1alpha1"
    54  	stats "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
    55  	"k8s.io/kubelet/pkg/types"
    56  	"k8s.io/kubernetes/pkg/cluster/ports"
    57  	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
    58  	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
    59  	"k8s.io/kubernetes/pkg/kubelet/cm"
    60  	kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
    61  	"k8s.io/kubernetes/pkg/kubelet/util"
    62  
    63  	"github.com/coreos/go-systemd/v22/dbus"
    64  	"k8s.io/kubernetes/test/e2e/framework"
    65  	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
    66  	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
    67  	e2enodekubelet "k8s.io/kubernetes/test/e2e_node/kubeletconfig"
    68  	imageutils "k8s.io/kubernetes/test/utils/image"
    69  
    70  	"github.com/onsi/ginkgo/v2"
    71  	"github.com/onsi/gomega"
    72  )
    73  
    74  var startServices = flag.Bool("start-services", true, "If true, start local node services")
    75  var stopServices = flag.Bool("stop-services", true, "If true, stop local node services after running tests")
    76  var busyboxImage = imageutils.GetE2EImage(imageutils.BusyBox)
    77  var agnhostImage = imageutils.GetE2EImage(imageutils.Agnhost)
    78  
    79  const (
    80  	// Kubelet internal cgroup name for node allocatable cgroup.
    81  	defaultNodeAllocatableCgroup = "kubepods"
    82  	// defaultPodResourcesPath is the path to the local endpoint serving the podresources GRPC service.
    83  	defaultPodResourcesPath    = "/var/lib/kubelet/pod-resources"
    84  	defaultPodResourcesTimeout = 10 * time.Second
    85  	defaultPodResourcesMaxSize = 1024 * 1024 * 16 // 16 MiB
    86  	// state files
    87  	cpuManagerStateFile    = "/var/lib/kubelet/cpu_manager_state"
    88  	memoryManagerStateFile = "/var/lib/kubelet/memory_manager_state"
    89  )
    90  
    91  var (
    92  	kubeletHealthCheckURL    = fmt.Sprintf("http://127.0.0.1:%d/healthz", ports.KubeletHealthzPort)
    93  	containerRuntimeUnitName = ""
    94  	// kubeletCfg is the kubelet configuration the test is running against.
    95  	kubeletCfg *kubeletconfig.KubeletConfiguration
    96  )
    97  
    98  func getNodeSummary(ctx context.Context) (*stats.Summary, error) {
    99  	kubeletConfig, err := getCurrentKubeletConfig(ctx)
   100  	if err != nil {
   101  		return nil, fmt.Errorf("failed to get current kubelet config")
   102  	}
   103  	req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("http://%s/stats/summary", net.JoinHostPort(kubeletConfig.Address, strconv.Itoa(int(kubeletConfig.ReadOnlyPort)))), nil)
   104  	if err != nil {
   105  		return nil, fmt.Errorf("failed to build http request: %w", err)
   106  	}
   107  	req.Header.Add("Accept", "application/json")
   108  
   109  	client := &http.Client{}
   110  	resp, err := client.Do(req)
   111  	if err != nil {
   112  		return nil, fmt.Errorf("failed to get /stats/summary: %w", err)
   113  	}
   114  
   115  	defer resp.Body.Close()
   116  	contentsBytes, err := io.ReadAll(resp.Body)
   117  	if err != nil {
   118  		return nil, fmt.Errorf("failed to read /stats/summary: %+v", resp)
   119  	}
   120  
   121  	decoder := json.NewDecoder(strings.NewReader(string(contentsBytes)))
   122  	summary := stats.Summary{}
   123  	err = decoder.Decode(&summary)
   124  	if err != nil {
   125  		return nil, fmt.Errorf("failed to parse /stats/summary to go struct: %+v", resp)
   126  	}
   127  	return &summary, nil
   128  }
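
        // exampleLogNodeWorkingSet is an illustrative sketch (not called by any test) of how the
        // /stats/summary helper above is typically consumed: fetch the summary from the kubelet
        // read-only port and log the node-level memory working set. Field names follow
        // k8s.io/kubelet/pkg/apis/stats/v1alpha1.
        //nolint:unused // kept only as an illustrative example
        func exampleLogNodeWorkingSet(ctx context.Context) {
        	summary, err := getNodeSummary(ctx)
        	framework.ExpectNoError(err, "getting node /stats/summary")
        	if summary.Node.Memory != nil && summary.Node.Memory.WorkingSetBytes != nil {
        		framework.Logf("node %q memory working set: %d bytes", summary.Node.NodeName, *summary.Node.Memory.WorkingSetBytes)
        	}
        }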
   129  
   130  func getV1alpha1NodeDevices(ctx context.Context) (*kubeletpodresourcesv1alpha1.ListPodResourcesResponse, error) {
   131  	endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
   132  	if err != nil {
   133  		return nil, fmt.Errorf("Error getting local endpoint: %w", err)
   134  	}
   135  	client, conn, err := podresources.GetV1alpha1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
   136  	if err != nil {
   137  		return nil, fmt.Errorf("Error getting grpc client: %w", err)
   138  	}
   139  	defer conn.Close()
   140  	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
   141  	defer cancel()
   142  	resp, err := client.List(ctx, &kubeletpodresourcesv1alpha1.ListPodResourcesRequest{})
   143  	if err != nil {
   144  		return nil, fmt.Errorf("%v.Get(_) = _, %v", client, err)
   145  	}
   146  	return resp, nil
   147  }
   148  
   149  func getV1NodeDevices(ctx context.Context) (*kubeletpodresourcesv1.ListPodResourcesResponse, error) {
   150  	endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
   151  	if err != nil {
   152  		return nil, fmt.Errorf("Error getting local endpoint: %w", err)
   153  	}
   154  	client, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
   155  	if err != nil {
   156  		return nil, fmt.Errorf("Error getting gRPC client: %w", err)
   157  	}
   158  	defer conn.Close()
   159  	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
   160  	defer cancel()
   161  	resp, err := client.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
   162  	if err != nil {
   163  		return nil, fmt.Errorf("%v.Get(_) = _, %v", client, err)
   164  	}
   165  	return resp, nil
   166  }
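
        // exampleLogPodResourceDevices is an illustrative sketch (not called by any test) of how the
        // podresources v1 List response returned by getV1NodeDevices above is typically walked to log
        // the devices assigned to each container.
        //nolint:unused // kept only as an illustrative example
        func exampleLogPodResourceDevices(ctx context.Context) {
        	resp, err := getV1NodeDevices(ctx)
        	framework.ExpectNoError(err, "listing pod resources")
        	for _, podRes := range resp.GetPodResources() {
        		for _, cnt := range podRes.GetContainers() {
        			for _, dev := range cnt.GetDevices() {
        				framework.Logf("pod %s/%s container %q: resource %q devices %v",
        					podRes.GetNamespace(), podRes.GetName(), cnt.GetName(), dev.GetResourceName(), dev.GetDeviceIds())
        			}
        		}
        	}
        }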
   167  
   168  // getCurrentKubeletConfig returns the KubeletConfiguration currently used by the running kubelet.
   169  func getCurrentKubeletConfig(ctx context.Context) (*kubeletconfig.KubeletConfiguration, error) {
   170  	// namespace only relevant if useProxy==true, so we don't bother
   171  	return e2enodekubelet.GetCurrentKubeletConfig(ctx, framework.TestContext.NodeName, "", false, framework.TestContext.StandaloneMode)
   172  }
   173  
   174  func cleanupPods(f *framework.Framework) {
   175  	ginkgo.AfterEach(func(ctx context.Context) {
   176  		ginkgo.By("Deleting any Pods created by the test in namespace: " + f.Namespace.Name)
   177  		l, err := e2epod.NewPodClient(f).List(ctx, metav1.ListOptions{})
   178  		framework.ExpectNoError(err)
   179  		for _, p := range l.Items {
   180  			if p.Namespace != f.Namespace.Name {
   181  				continue
   182  			}
   183  			framework.Logf("Deleting pod: %s", p.Name)
   184  			e2epod.NewPodClient(f).DeleteSync(ctx, p.Name, metav1.DeleteOptions{}, 2*time.Minute)
   185  		}
   186  	})
   187  }
   188  
   189  // tempSetCurrentKubeletConfig must be called within a ginkgo Context. It lets updateFunction modify the
   190  // KubeletConfiguration in the BeforeEach of that context; the change is reverted in the AfterEach.
   191  // The update is skipped if updateFunction leaves the configuration unchanged.
   192  func tempSetCurrentKubeletConfig(f *framework.Framework, updateFunction func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration)) {
   193  	var oldCfg *kubeletconfig.KubeletConfiguration
   194  
   195  	ginkgo.BeforeEach(func(ctx context.Context) {
   196  		var err error
   197  		oldCfg, err = getCurrentKubeletConfig(ctx)
   198  		framework.ExpectNoError(err)
   199  
   200  		newCfg := oldCfg.DeepCopy()
   201  		updateFunction(ctx, newCfg)
   202  		if apiequality.Semantic.DeepEqual(*newCfg, *oldCfg) {
   203  			return
   204  		}
   205  
   206  		updateKubeletConfig(ctx, f, newCfg, true)
   207  	})
   208  
   209  	ginkgo.AfterEach(func(ctx context.Context) {
   210  		if oldCfg != nil {
   211  			// Update the Kubelet configuration.
   212  			updateKubeletConfig(ctx, f, oldCfg, true)
   213  		}
   214  	})
   215  }
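
        // An illustrative usage sketch (the CPU manager policy and feature-gate name below are
        // placeholder values, not taken from a real test): a serial test tweaks the kubelet
        // configuration for the duration of its ginkgo context like this:
        //
        //	tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
        //		initialConfig.CPUManagerPolicy = "static"
        //		initialConfig.FeatureGates = map[string]bool{"SomeAlphaFeature": true}
        //	})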
   216  
   217  func updateKubeletConfig(ctx context.Context, f *framework.Framework, kubeletConfig *kubeletconfig.KubeletConfiguration, deleteStateFiles bool) {
   218  	// Update the Kubelet configuration.
   219  	ginkgo.By("Stopping the kubelet")
   220  	startKubelet := stopKubelet()
   221  
   222  	// wait until the kubelet health check fails
   223  	gomega.Eventually(ctx, func() bool {
   224  		return kubeletHealthCheck(kubeletHealthCheckURL)
   225  	}, time.Minute, time.Second).Should(gomega.BeFalse())
   226  
   227  	// Delete the CPU and memory manager state files to make sure they do not prevent the kubelet from restarting
   228  	if deleteStateFiles {
   229  		deleteStateFile(cpuManagerStateFile)
   230  		deleteStateFile(memoryManagerStateFile)
   231  	}
   232  
   233  	framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(kubeletConfig))
   234  
   235  	ginkgo.By("Starting the kubelet")
   236  	startKubelet()
   237  	waitForKubeletToStart(ctx, f)
   238  }
   239  
   240  func waitForKubeletToStart(ctx context.Context, f *framework.Framework) {
   241  	// wait until the kubelet health check succeeds
   242  	gomega.Eventually(ctx, func() bool {
   243  		return kubeletHealthCheck(kubeletHealthCheckURL)
   244  	}, 2*time.Minute, 5*time.Second).Should(gomega.BeTrue())
   245  
   246  	// Wait for the Kubelet to be ready.
   247  	gomega.Eventually(ctx, func(ctx context.Context) bool {
   248  		nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
   249  		framework.ExpectNoError(err)
   250  		return nodes == 1
   251  	}, time.Minute, time.Second).Should(gomega.BeTrue())
   252  }
   253  
   254  func deleteStateFile(stateFileName string) {
   255  	err := exec.Command("/bin/sh", "-c", fmt.Sprintf("rm -f %s", stateFileName)).Run()
   256  	framework.ExpectNoError(err, "failed to delete the state file")
   257  }
   258  
   259  // listNamespaceEvents lists the events in the given namespace.
   260  func listNamespaceEvents(ctx context.Context, c clientset.Interface, ns string) error {
   261  	ls, err := c.CoreV1().Events(ns).List(ctx, metav1.ListOptions{})
   262  	if err != nil {
   263  		return err
   264  	}
   265  	for _, event := range ls.Items {
   266  		klog.Infof("Event(%#v): type: '%v' reason: '%v' %v", event.InvolvedObject, event.Type, event.Reason, event.Message)
   267  	}
   268  	return nil
   269  }
   270  
   271  func logPodEvents(ctx context.Context, f *framework.Framework) {
   272  	framework.Logf("Summary of pod events during the test:")
   273  	err := listNamespaceEvents(ctx, f.ClientSet, f.Namespace.Name)
   274  	framework.ExpectNoError(err)
   275  }
   276  
   277  func logNodeEvents(ctx context.Context, f *framework.Framework) {
   278  	framework.Logf("Summary of node events during the test:")
   279  	err := listNamespaceEvents(ctx, f.ClientSet, "")
   280  	framework.ExpectNoError(err)
   281  }
   282  
   283  func getLocalNode(ctx context.Context, f *framework.Framework) *v1.Node {
   284  	nodeList, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
   285  	framework.ExpectNoError(err)
   286  	gomega.Expect(nodeList.Items).Should(gomega.HaveLen(1), "Unexpected number of node objects for node e2e. Expects only one node.")
   287  	return &nodeList.Items[0]
   288  }
   289  
   290  // getLocalTestNode fetches the node object describing the local worker node set up by the e2e_node infra, along with its ready state.
   291  // getLocalTestNode is a variant of `getLocalNode` which reports, but does not enforce, the node readiness and schedulability,
   292  // letting the caller decide. The check is intentionally done the same way `getLocalNode` does it.
   293  // Note that `getLocalNode` implicitly aborts the test (via its gomega expectations) if the worker node is not ready.
   294  func getLocalTestNode(ctx context.Context, f *framework.Framework) (*v1.Node, bool) {
   295  	node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, framework.TestContext.NodeName, metav1.GetOptions{})
   296  	framework.ExpectNoError(err)
   297  	ready := e2enode.IsNodeReady(node)
   298  	schedulable := e2enode.IsNodeSchedulable(node)
   299  	framework.Logf("node %q ready=%v schedulable=%v", node.Name, ready, schedulable)
   300  	return node, ready && schedulable
   301  }
   302  
   303  // logKubeletLatencyMetrics logs KubeletLatencyMetrics computed from the Prometheus
   304  // metrics exposed on the current node and identified by the metricNames.
   305  // The Kubelet subsystem prefix is automatically prepended to these metric names.
   306  func logKubeletLatencyMetrics(ctx context.Context, metricNames ...string) {
   307  	metricSet := sets.NewString()
   308  	for _, key := range metricNames {
   309  		metricSet.Insert(kubeletmetrics.KubeletSubsystem + "_" + key)
   310  	}
   311  	metric, err := e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, fmt.Sprintf("%s:%d", nodeNameOrIP(), ports.KubeletReadOnlyPort), "/metrics")
   312  	if err != nil {
   313  		framework.Logf("Error getting kubelet metrics: %v", err)
   314  	} else {
   315  		framework.Logf("Kubelet Metrics: %+v", e2emetrics.GetKubeletLatencyMetrics(metric, metricSet))
   316  	}
   317  }
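
        // A typical invocation (illustrative sketch) logs the pod start and pod worker duration latencies
        // after a test, using the metric name constants from pkg/kubelet/metrics:
        //
        //	logKubeletLatencyMetrics(ctx, kubeletmetrics.PodStartDurationKey, kubeletmetrics.PodWorkerDurationKey)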
   318  
   319  // getCRIClient connects to the CRI endpoint and returns the CRI runtime service and image service clients.
   320  func getCRIClient() (internalapi.RuntimeService, internalapi.ImageManagerService, error) {
   321  	// connection timeout for CRI service connection
   322  	logger := klog.Background()
   323  	const connectionTimeout = 2 * time.Minute
   324  	runtimeEndpoint := framework.TestContext.ContainerRuntimeEndpoint
   325  	r, err := remote.NewRemoteRuntimeService(runtimeEndpoint, connectionTimeout, noop.NewTracerProvider(), &logger)
   326  	if err != nil {
   327  		return nil, nil, err
   328  	}
   329  	imageManagerEndpoint := runtimeEndpoint
   330  	if framework.TestContext.ImageServiceEndpoint != "" {
   331  		// ImageServiceEndpoint is the same as ContainerRuntimeEndpoint if not
   332  		// explicitly specified
   333  		imageManagerEndpoint = framework.TestContext.ImageServiceEndpoint
   334  	}
   335  	i, err := remote.NewRemoteImageService(imageManagerEndpoint, connectionTimeout, noop.NewTracerProvider(), &logger)
   336  	if err != nil {
   337  		return nil, nil, err
   338  	}
   339  	return r, i, nil
   340  }
   341  
   342  // findKubeletServiceName searches for the kubelet unit name among the services known to systemd.
   343  // If the `running` parameter is true, the search is restricted to currently running services;
   344  // otherwise stopped, failed, and exited (non-running in general) services are considered as well.
   345  // TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
   346  func findKubeletServiceName(running bool) string {
   347  	cmdLine := []string{
   348  		"systemctl", "list-units", "*kubelet*",
   349  	}
   350  	if running {
   351  		cmdLine = append(cmdLine, "--state=running")
   352  	}
   353  	stdout, err := exec.Command("sudo", cmdLine...).CombinedOutput()
   354  	framework.ExpectNoError(err)
   355  	regex := regexp.MustCompile(`(kubelet-\w+)`)
   356  	matches := regex.FindStringSubmatch(string(stdout))
   357  	gomega.Expect(matches).ToNot(gomega.BeEmpty(), "No kubelet service found in systemd output: %q", stdout)
   358  	kubeletServiceName := matches[0]
   359  	framework.Logf("Get running kubelet with systemctl: %v, %v", string(stdout), kubeletServiceName)
   360  	return kubeletServiceName
   361  }
   362  
   363  func findContainerRuntimeServiceName() (string, error) {
   364  	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
   365  	defer cancel()
   366  
   367  	conn, err := dbus.NewWithContext(ctx)
   368  	framework.ExpectNoError(err, "Failed to setup dbus connection")
   369  	defer conn.Close()
   370  
   371  	runtimePids, err := getPidsForProcess(framework.TestContext.ContainerRuntimeProcessName, framework.TestContext.ContainerRuntimePidFile)
   372  	framework.ExpectNoError(err, "failed to get list of container runtime pids")
   373  	gomega.Expect(runtimePids).To(gomega.HaveLen(1), "Unexpected number of container runtime pids. Expected 1 but got %v", len(runtimePids))
   374  
   375  	containerRuntimePid := runtimePids[0]
   376  
   377  	unitName, err := conn.GetUnitNameByPID(ctx, uint32(containerRuntimePid))
   378  	framework.ExpectNoError(err, "Failed to get container runtime unit name")
   379  
   380  	return unitName, nil
   381  }
   382  
   383  type containerRuntimeUnitOp int
   384  
   385  const (
   386  	startContainerRuntimeUnitOp containerRuntimeUnitOp = iota
   387  	stopContainerRuntimeUnitOp
   388  )
   389  
   390  func performContainerRuntimeUnitOp(op containerRuntimeUnitOp) error {
   391  	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
   392  	defer cancel()
   393  
   394  	conn, err := dbus.NewWithContext(ctx)
   395  	framework.ExpectNoError(err, "Failed to setup dbus connection")
   396  	defer conn.Close()
   397  
   398  	if containerRuntimeUnitName == "" {
   399  		containerRuntimeUnitName, err = findContainerRuntimeServiceName()
   400  		framework.ExpectNoError(err, "Failed to find container runtime name")
   401  	}
   402  
   403  	reschan := make(chan string)
   404  
   405  	switch op {
   406  	case startContainerRuntimeUnitOp:
   407  		_, err = conn.StartUnitContext(ctx, containerRuntimeUnitName, "replace", reschan)
   408  	case stopContainerRuntimeUnitOp:
   409  		_, err = conn.StopUnitContext(ctx, containerRuntimeUnitName, "replace", reschan)
   410  	default:
   411  		framework.Failf("Unexpected container runtime op: %v", op)
   412  	}
   413  	framework.ExpectNoError(err, "dbus connection error")
   414  
   415  	job := <-reschan
   416  	gomega.Expect(job).To(gomega.Equal("done"), "Expected job to complete with done")
   417  
   418  	return nil
   419  }
   420  
   421  func stopContainerRuntime() error {
   422  	return performContainerRuntimeUnitOp(stopContainerRuntimeUnitOp)
   423  }
   424  
   425  func startContainerRuntime() error {
   426  	return performContainerRuntimeUnitOp(startContainerRuntimeUnitOp)
   427  }
   428  
   429  // restartKubelet restarts the current kubelet service.
   430  // the "current" kubelet service is the instance managed by the current e2e_node test run.
   431  // If `running` is true, restarts only if the current kubelet is actually running. In some cases,
   432  // the kubelet may have exited or can be stopped, typically because it was intentionally stopped
   433  // earlier during a test, or, sometimes, because it just crashed.
   434  // Warning: the "current" kubelet is poorly defined. The "current" kubelet is assumed to be the most
   435  // recent kubelet service unit, IOW there is not a unique ID we use to bind explicitly a kubelet
   436  // instance to a test run.
   437  func restartKubelet(running bool) {
   438  	kubeletServiceName := findKubeletServiceName(running)
   439  	// reset the kubelet service start-limit-hit
   440  	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
   441  	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))
   442  
   443  	stdout, err = exec.Command("sudo", "systemctl", "restart", kubeletServiceName).CombinedOutput()
   444  	framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout))
   445  }
   446  
   447  // stopKubelet kills the running kubelet and returns a func that restarts it.
   448  func stopKubelet() func() {
   449  	kubeletServiceName := findKubeletServiceName(true)
   450  
   451  	// reset the kubelet service start-limit-hit
   452  	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
   453  	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))
   454  
   455  	stdout, err = exec.Command("sudo", "systemctl", "kill", kubeletServiceName).CombinedOutput()
   456  	framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %s", err, string(stdout))
   457  
   458  	return func() {
   459  		// we should restart the service, otherwise the transient service start will fail
   460  		stdout, err := exec.Command("sudo", "systemctl", "restart", kubeletServiceName).CombinedOutput()
   461  		framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout))
   462  	}
   463  }
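
        // Illustrative usage sketch: stop the kubelet, mutate node state while it is down, then restart it
        // and wait for it to become healthy again (this mirrors what updateKubeletConfig above does):
        //
        //	restartKubeletFn := stopKubelet()
        //	// ... mutate node state while the kubelet is down ...
        //	restartKubeletFn()
        //	waitForKubeletToStart(ctx, f)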
   464  
   465  func kubeletHealthCheck(url string) bool {
   466  	insecureTransport := http.DefaultTransport.(*http.Transport).Clone()
   467  	insecureTransport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
   468  	insecureHTTPClient := &http.Client{
   469  		Transport: insecureTransport,
   470  	}
   471  
   472  	req, err := http.NewRequest("HEAD", url, nil)
   473  	if err != nil {
   474  		return false
   475  	}
   476  	req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", framework.TestContext.BearerToken))
   477  	resp, err := insecureHTTPClient.Do(req)
   478  	if err != nil {
   479  		klog.Warningf("Health check on %q failed, error=%v", url, err)
   480  	} else if resp.StatusCode != http.StatusOK {
   481  		klog.Warningf("Health check on %q failed, status=%d", url, resp.StatusCode)
   482  	}
   483  	return err == nil && resp.StatusCode == http.StatusOK
   484  }
   485  
   486  func toCgroupFsName(cgroupName cm.CgroupName) string {
   487  	if kubeletCfg.CgroupDriver == "systemd" {
   488  		return cgroupName.ToSystemd()
   489  	}
   490  	return cgroupName.ToCgroupfs()
   491  }
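
        // For example (illustrative, assuming the default "kubepods" node allocatable cgroup): toCgroupFsName
        // returns "/kubepods.slice" with the systemd cgroup driver and "/kubepods" with the cgroupfs driver:
        //
        //	name := toCgroupFsName(cm.NewCgroupName(cm.RootCgroupName, defaultNodeAllocatableCgroup))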
   492  
   493  // reduceAllocatableMemoryUsageIfCgroupv1 uses memory.force_empty (https://lwn.net/Articles/432224/)
   494  // to make the kernel reclaim memory in the allocatable cgroup.
   495  // The time needed to reduce the pressure may be unbounded, but it usually finishes within a second.
   496  // memory.force_empty is not supported in cgroup v2.
   497  func reduceAllocatableMemoryUsageIfCgroupv1() {
   498  	if !IsCgroup2UnifiedMode() {
   499  		cmd := fmt.Sprintf("echo 0 > /sys/fs/cgroup/memory/%s/memory.force_empty", toCgroupFsName(cm.NewCgroupName(cm.RootCgroupName, defaultNodeAllocatableCgroup)))
   500  		_, err := exec.Command("sudo", "sh", "-c", cmd).CombinedOutput()
   501  		framework.ExpectNoError(err)
   502  	}
   503  }
   504  
   505  // withFeatureGate is the equivalent of featuregatetesting.SetFeatureGateDuringTest,
   506  // which can't be used here because we're not in a testing context.
   507  // This must be in a non-"_test" file to pass
   508  // `make verify WHAT=test-featuregates`.
   509  func withFeatureGate(feature featuregate.Feature, desired bool) func() {
   510  	current := utilfeature.DefaultFeatureGate.Enabled(feature)
   511  	utilfeature.DefaultMutableFeatureGate.Set(fmt.Sprintf("%s=%v", string(feature), desired))
   512  	return func() {
   513  		utilfeature.DefaultMutableFeatureGate.Set(fmt.Sprintf("%s=%v", string(feature), current))
   514  	}
   515  }
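
        // Illustrative usage sketch (the feature name is a placeholder): flip a gate for the duration of
        // a test and restore the previous value through the returned cleanup func:
        //
        //	cleanup := withFeatureGate(featuregate.Feature("SomeAlphaFeature"), true)
        //	defer cleanup()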
   516  
   517  // waitForAllContainerRemoval waits until all the containers on a given pod are really gone.
   518  // This is needed by the e2e tests which involve exclusive resource allocation (CPU manager, topology manager, podresources, etc.).
   519  // In these cases, we need to make sure the tests clean up after themselves so that each test runs in
   520  // a pristine environment. The only way known so far to do that is to introduce this wait.
   521  // Worth noting, however, that this makes the test runtime much longer.
   522  func waitForAllContainerRemoval(ctx context.Context, podName, podNS string) {
   523  	rs, _, err := getCRIClient()
   524  	framework.ExpectNoError(err)
   525  	gomega.Eventually(ctx, func(ctx context.Context) error {
   526  		containers, err := rs.ListContainers(ctx, &runtimeapi.ContainerFilter{
   527  			LabelSelector: map[string]string{
   528  				types.KubernetesPodNameLabel:      podName,
   529  				types.KubernetesPodNamespaceLabel: podNS,
   530  			},
   531  		})
   532  		if err != nil {
   533  			return fmt.Errorf("got error waiting for all containers to be removed from CRI: %v", err)
   534  		}
   535  
   536  		if len(containers) > 0 {
   537  			return fmt.Errorf("expected all containers to be removed from CRI but %v containers still remain. Containers: %+v", len(containers), containers)
   538  		}
   539  		return nil
   540  	}, 2*time.Minute, 1*time.Second).Should(gomega.Succeed())
   541  }
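
        // Illustrative usage sketch: a test that allocated exclusive resources deletes its pod and then
        // blocks until the CRI no longer reports any of the pod's containers:
        //
        //	e2epod.NewPodClient(f).DeleteSync(ctx, pod.Name, metav1.DeleteOptions{}, 2*time.Minute)
        //	waitForAllContainerRemoval(ctx, pod.Name, pod.Namespace)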
   542  
   543  func getPidsForProcess(name, pidFile string) ([]int, error) {
   544  	if len(pidFile) > 0 {
   545  		pid, err := getPidFromPidFile(pidFile)
   546  		if err == nil {
   547  			return []int{pid}, nil
   548  		}
   549  		// log the error and fall back to pidof
   550  		runtime.HandleError(err)
   551  	}
   552  	return procfs.PidOf(name)
   553  }
   554  
   555  func getPidFromPidFile(pidFile string) (int, error) {
   556  	file, err := os.Open(pidFile)
   557  	if err != nil {
   558  		return 0, fmt.Errorf("error opening pid file %s: %v", pidFile, err)
   559  	}
   560  	defer file.Close()
   561  
   562  	data, err := io.ReadAll(file)
   563  	if err != nil {
   564  		return 0, fmt.Errorf("error reading pid file %s: %v", pidFile, err)
   565  	}
   566  
   567  	pid, err := strconv.Atoi(string(data))
   568  	if err != nil {
   569  		return 0, fmt.Errorf("error parsing %s as a number: %v", string(data), err)
   570  	}
   571  
   572  	return pid, nil
   573  }
   574  
   575  // WaitForPodInitContainerRestartCount waits for the given Pod init container
   576  // to achieve at least a given restartCount
   577  // TODO: eventually look at moving to test/e2e/framework/pod
   578  func WaitForPodInitContainerRestartCount(ctx context.Context, c clientset.Interface, namespace, podName string, initContainerIndex int, desiredRestartCount int32, timeout time.Duration) error {
   579  	conditionDesc := fmt.Sprintf("init container %d started", initContainerIndex)
   580  	return e2epod.WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
   581  		if initContainerIndex > len(pod.Status.InitContainerStatuses)-1 {
   582  			return false, nil
   583  		}
   584  		containerStatus := pod.Status.InitContainerStatuses[initContainerIndex]
   585  		return containerStatus.RestartCount >= desiredRestartCount, nil
   586  	})
   587  }
   588  
   589  // WaitForPodContainerRestartCount waits for the given Pod container to achieve at least a given restartCount
   590  // TODO: eventually look at moving to test/e2e/framework/pod
   591  func WaitForPodContainerRestartCount(ctx context.Context, c clientset.Interface, namespace, podName string, containerIndex int, desiredRestartCount int32, timeout time.Duration) error {
   592  	conditionDesc := fmt.Sprintf("container %d started", containerIndex)
   593  	return e2epod.WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
   594  		if containerIndex > len(pod.Status.ContainerStatuses)-1 {
   595  			return false, nil
   596  		}
   597  		containerStatus := pod.Status.ContainerStatuses[containerIndex]
   598  		return containerStatus.RestartCount >= desiredRestartCount, nil
   599  	})
   600  }
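
        // Illustrative usage sketch: wait up to two minutes for the first app container of a pod to have
        // restarted at least once:
        //
        //	err := WaitForPodContainerRestartCount(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 1, 2*time.Minute)
        //	framework.ExpectNoError(err)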
   601  
   602  // WaitForPodInitContainerToFail waits for the given Pod init container to fail with the given reason, specifically due to
   603  // invalid container configuration. In this case, the container remains in a waiting state whose
   604  // reason should match the given reason.
   605  // TODO: eventually look at moving to test/e2e/framework/pod
   606  func WaitForPodInitContainerToFail(ctx context.Context, c clientset.Interface, namespace, podName string, containerIndex int, reason string, timeout time.Duration) error {
   607  	conditionDesc := fmt.Sprintf("container %d failed with reason %s", containerIndex, reason)
   608  	return e2epod.WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
   609  		switch pod.Status.Phase {
   610  		case v1.PodPending:
   611  			if len(pod.Status.InitContainerStatuses) == 0 {
   612  				return false, nil
   613  			}
   614  			containerStatus := pod.Status.InitContainerStatuses[containerIndex]
   615  			if containerStatus.State.Waiting != nil && containerStatus.State.Waiting.Reason == reason {
   616  				return true, nil
   617  			}
   618  			return false, nil
   619  		case v1.PodFailed, v1.PodRunning, v1.PodSucceeded:
   620  			return false, fmt.Errorf("pod was expected to be pending, but it is in the state: %s", pod.Status.Phase)
   621  		}
   622  		return false, nil
   623  	})
   624  }
   625  
   626  func nodeNameOrIP() string {
   627  	return "localhost"
   628  }