k8s.io/kubernetes@v1.29.3/test/e2e_node/util.go

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package e2enode
    18  
    19  import (
    20  	"context"
    21  	"crypto/tls"
    22  	"encoding/json"
    23  	"flag"
    24  	"fmt"
    25  	"io"
    26  	"net"
    27  	"net/http"
    28  	"os"
    29  	"os/exec"
    30  	"regexp"
    31  	"strconv"
    32  	"strings"
    33  	"time"
    34  
    35  	"k8s.io/kubernetes/pkg/util/procfs"
    36  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    37  
    38  	oteltrace "go.opentelemetry.io/otel/trace"
    39  
    40  	v1 "k8s.io/api/core/v1"
    41  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    42  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    43  	"k8s.io/apimachinery/pkg/util/runtime"
    44  	"k8s.io/apimachinery/pkg/util/sets"
    45  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    46  	clientset "k8s.io/client-go/kubernetes"
    47  	"k8s.io/component-base/featuregate"
    48  	internalapi "k8s.io/cri-api/pkg/apis"
    49  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    50  	"k8s.io/klog/v2"
    51  	kubeletpodresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
    52  	kubeletpodresourcesv1alpha1 "k8s.io/kubelet/pkg/apis/podresources/v1alpha1"
    53  	stats "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
    54  	"k8s.io/kubelet/pkg/types"
    55  	"k8s.io/kubernetes/pkg/cluster/ports"
    56  	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
    57  	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
    58  	"k8s.io/kubernetes/pkg/kubelet/cm"
    59  	"k8s.io/kubernetes/pkg/kubelet/cri/remote"
    60  	kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
    61  	"k8s.io/kubernetes/pkg/kubelet/util"
    62  
    63  	"github.com/coreos/go-systemd/v22/dbus"
    64  	"k8s.io/kubernetes/test/e2e/framework"
    65  	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
    66  	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
    67  	e2enodekubelet "k8s.io/kubernetes/test/e2e_node/kubeletconfig"
    68  	imageutils "k8s.io/kubernetes/test/utils/image"
    69  
    70  	"github.com/onsi/ginkgo/v2"
    71  	"github.com/onsi/gomega"
    72  )
    73  
    74  var startServices = flag.Bool("start-services", true, "If true, start local node services")
    75  var stopServices = flag.Bool("stop-services", true, "If true, stop local node services after running tests")
    76  var busyboxImage = imageutils.GetE2EImage(imageutils.BusyBox)
    77  
    78  const (
    79  	// Kubelet internal cgroup name for node allocatable cgroup.
    80  	defaultNodeAllocatableCgroup = "kubepods"
    81  	// defaultPodResourcesPath is the path to the local endpoint serving the podresources GRPC service.
    82  	defaultPodResourcesPath    = "/var/lib/kubelet/pod-resources"
    83  	defaultPodResourcesTimeout = 10 * time.Second
    84  	defaultPodResourcesMaxSize = 1024 * 1024 * 16 // 16 MiB
    85  	// state files
    86  	cpuManagerStateFile    = "/var/lib/kubelet/cpu_manager_state"
    87  	memoryManagerStateFile = "/var/lib/kubelet/memory_manager_state"
    88  )
    89  
    90  var (
    91  	kubeletHealthCheckURL    = fmt.Sprintf("http://127.0.0.1:%d/healthz", ports.KubeletHealthzPort)
    92  	containerRuntimeUnitName = ""
    93  	// kubeletCfg is the kubelet configuration the test is running against.
    94  	kubeletCfg *kubeletconfig.KubeletConfiguration
    95  )
    96  
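        // getNodeSummary fetches /stats/summary from the kubelet's read-only port and decodes it into a stats.Summary.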
    97  func getNodeSummary(ctx context.Context) (*stats.Summary, error) {
    98  	kubeletConfig, err := getCurrentKubeletConfig(ctx)
    99  	if err != nil {
   100  		return nil, fmt.Errorf("failed to get current kubelet config: %w", err)
   101  	}
   102  	req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("http://%s/stats/summary", net.JoinHostPort(kubeletConfig.Address, strconv.Itoa(int(kubeletConfig.ReadOnlyPort)))), nil)
   103  	if err != nil {
   104  		return nil, fmt.Errorf("failed to build http request: %w", err)
   105  	}
   106  	req.Header.Add("Accept", "application/json")
   107  
   108  	client := &http.Client{}
   109  	resp, err := client.Do(req)
   110  	if err != nil {
   111  		return nil, fmt.Errorf("failed to get /stats/summary: %w", err)
   112  	}
   113  
   114  	defer resp.Body.Close()
   115  	contentsBytes, err := io.ReadAll(resp.Body)
   116  	if err != nil {
   117  		return nil, fmt.Errorf("failed to read /stats/summary response %+v: %w", resp, err)
   118  	}
   119  
   120  	decoder := json.NewDecoder(strings.NewReader(string(contentsBytes)))
   121  	summary := stats.Summary{}
   122  	err = decoder.Decode(&summary)
   123  	if err != nil {
   124  		return nil, fmt.Errorf("failed to parse /stats/summary response %+v into a stats.Summary: %w", resp, err)
   125  	}
   126  	return &summary, nil
   127  }
   128  
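        // getV1alpha1NodeDevices lists pod resources through the v1alpha1 podresources API served on the kubelet's local socket.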
   129  func getV1alpha1NodeDevices(ctx context.Context) (*kubeletpodresourcesv1alpha1.ListPodResourcesResponse, error) {
   130  	endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
   131  	if err != nil {
   132  		return nil, fmt.Errorf("Error getting local endpoint: %w", err)
   133  	}
   134  	client, conn, err := podresources.GetV1alpha1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
   135  	if err != nil {
   136  		return nil, fmt.Errorf("Error getting grpc client: %w", err)
   137  	}
   138  	defer conn.Close()
   139  	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
   140  	defer cancel()
   141  	resp, err := client.List(ctx, &kubeletpodresourcesv1alpha1.ListPodResourcesRequest{})
   142  	if err != nil {
   143  		return nil, fmt.Errorf("%v.List(_) = _, %v", client, err)
   144  	}
   145  	return resp, nil
   146  }
   147  
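        // getV1NodeDevices lists pod resources through the v1 podresources API served on the kubelet's local socket.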
   148  func getV1NodeDevices(ctx context.Context) (*kubeletpodresourcesv1.ListPodResourcesResponse, error) {
   149  	endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
   150  	if err != nil {
   151  		return nil, fmt.Errorf("Error getting local endpoint: %w", err)
   152  	}
   153  	client, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
   154  	if err != nil {
   155  		return nil, fmt.Errorf("Error getting gRPC client: %w", err)
   156  	}
   157  	defer conn.Close()
   158  	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
   159  	defer cancel()
   160  	resp, err := client.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
   161  	if err != nil {
   162  		return nil, fmt.Errorf("%v.List(_) = _, %v", client, err)
   163  	}
   164  	return resp, nil
   165  }
   166  
   167  // getCurrentKubeletConfig returns the current KubeletConfiguration of the node under test.
   168  func getCurrentKubeletConfig(ctx context.Context) (*kubeletconfig.KubeletConfiguration, error) {
   169  	// namespace only relevant if useProxy==true, so we don't bother
   170  	return e2enodekubelet.GetCurrentKubeletConfig(ctx, framework.TestContext.NodeName, "", false, framework.TestContext.StandaloneMode)
   171  }
   172  
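        // cleanupPods registers an AfterEach that deletes every pod the test created in the framework's namespace.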
   173  func cleanupPods(f *framework.Framework) {
   174  	ginkgo.AfterEach(func(ctx context.Context) {
   175  		ginkgo.By("Deleting any Pods created by the test in namespace: " + f.Namespace.Name)
   176  		l, err := e2epod.NewPodClient(f).List(ctx, metav1.ListOptions{})
   177  		framework.ExpectNoError(err)
   178  		for _, p := range l.Items {
   179  			if p.Namespace != f.Namespace.Name {
   180  				continue
   181  			}
   182  			framework.Logf("Deleting pod: %s", p.Name)
   183  			e2epod.NewPodClient(f).DeleteSync(ctx, p.Name, metav1.DeleteOptions{}, 2*time.Minute)
   184  		}
   185  	})
   186  }
   187  
   188  // tempSetCurrentKubeletConfig must be called within a ginkgo Context. It allows updateFunction to modify the
   189  // KubeletConfiguration during the BeforeEach of the context; the change is reverted in the AfterEach of the context.
   190  // If the updated configuration is semantically equal to the current one, the kubelet is left untouched.
   191  func tempSetCurrentKubeletConfig(f *framework.Framework, updateFunction func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration)) {
   192  	var oldCfg *kubeletconfig.KubeletConfiguration
   193  
   194  	ginkgo.BeforeEach(func(ctx context.Context) {
   195  		var err error
   196  		oldCfg, err = getCurrentKubeletConfig(ctx)
   197  		framework.ExpectNoError(err)
   198  
   199  		newCfg := oldCfg.DeepCopy()
   200  		updateFunction(ctx, newCfg)
   201  		if apiequality.Semantic.DeepEqual(*newCfg, *oldCfg) {
   202  			return
   203  		}
   204  
   205  		updateKubeletConfig(ctx, f, newCfg, true)
   206  	})
   207  
   208  	ginkgo.AfterEach(func(ctx context.Context) {
   209  		if oldCfg != nil {
   210  			// Restore the original Kubelet configuration.
   211  			updateKubeletConfig(ctx, f, oldCfg, true)
   212  		}
   213  	})
   214  }
   215  
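        // updateKubeletConfig stops the kubelet, optionally deletes the CPU and memory manager state files, writes the given
        // configuration, then restarts the kubelet and waits for it to become healthy and for the node to be Ready.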
   216  func updateKubeletConfig(ctx context.Context, f *framework.Framework, kubeletConfig *kubeletconfig.KubeletConfiguration, deleteStateFiles bool) {
   217  	// Update the Kubelet configuration.
   218  	ginkgo.By("Stopping the kubelet")
   219  	startKubelet := stopKubelet()
   220  
   221  	// wait until the kubelet health check fails
   222  	gomega.Eventually(ctx, func() bool {
   223  		return kubeletHealthCheck(kubeletHealthCheckURL)
   224  	}, time.Minute, time.Second).Should(gomega.BeFalse())
   225  
   226  	// Delete the CPU and memory manager state files so that stale state does not prevent the kubelet from restarting
   227  	if deleteStateFiles {
   228  		deleteStateFile(cpuManagerStateFile)
   229  		deleteStateFile(memoryManagerStateFile)
   230  	}
   231  
   232  	framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(kubeletConfig))
   233  
   234  	ginkgo.By("Starting the kubelet")
   235  	startKubelet()
   236  
   237  	// wait until the kubelet health check succeeds
   238  	gomega.Eventually(ctx, func() bool {
   239  		return kubeletHealthCheck(kubeletHealthCheckURL)
   240  	}, 2*time.Minute, 5*time.Second).Should(gomega.BeTrue())
   241  
   242  	// Wait for the Kubelet to be ready.
   243  	gomega.Eventually(ctx, func(ctx context.Context) bool {
   244  		nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
   245  		framework.ExpectNoError(err)
   246  		return nodes == 1
   247  	}, time.Minute, time.Second).Should(gomega.BeTrue())
   248  }
   249  
   250  func deleteStateFile(stateFileName string) {
   251  	err := exec.Command("/bin/sh", "-c", fmt.Sprintf("rm -f %s", stateFileName)).Run()
   252  	framework.ExpectNoError(err, "failed to delete the state file")
   253  }
   254  
   255  // listNamespaceEvents lists the events in the given namespace.
   256  func listNamespaceEvents(ctx context.Context, c clientset.Interface, ns string) error {
   257  	ls, err := c.CoreV1().Events(ns).List(ctx, metav1.ListOptions{})
   258  	if err != nil {
   259  		return err
   260  	}
   261  	for _, event := range ls.Items {
   262  		klog.Infof("Event(%#v): type: '%v' reason: '%v' %v", event.InvolvedObject, event.Type, event.Reason, event.Message)
   263  	}
   264  	return nil
   265  }
   266  
   267  func logPodEvents(ctx context.Context, f *framework.Framework) {
   268  	framework.Logf("Summary of pod events during the test:")
   269  	err := listNamespaceEvents(ctx, f.ClientSet, f.Namespace.Name)
   270  	framework.ExpectNoError(err)
   271  }
   272  
   273  func logNodeEvents(ctx context.Context, f *framework.Framework) {
   274  	framework.Logf("Summary of node events during the test:")
   275  	err := listNamespaceEvents(ctx, f.ClientSet, "")
   276  	framework.ExpectNoError(err)
   277  }
   278  
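        // getLocalNode returns the single ready and schedulable node the node e2e suite runs against.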
   279  func getLocalNode(ctx context.Context, f *framework.Framework) *v1.Node {
   280  	nodeList, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
   281  	framework.ExpectNoError(err)
   282  	gomega.Expect(nodeList.Items).Should(gomega.HaveLen(1), "Unexpected number of node objects for node e2e. Expected exactly one node.")
   283  	return &nodeList.Items[0]
   284  }
   285  
   286  // getLocalTestNode fetches the node object describing the local worker node set up by the e2e_node infra, along with its ready state.
   287  // getLocalTestNode is a variant of `getLocalNode` which reports the node readiness state but does not require it, letting
   288  // the caller decide. The check is intentionally done the same way `getLocalNode` does it.
   289  // Note that `getLocalNode` implicitly aborts the test (via gomega.Expect) if the worker node is not ready.
   290  func getLocalTestNode(ctx context.Context, f *framework.Framework) (*v1.Node, bool) {
   291  	node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, framework.TestContext.NodeName, metav1.GetOptions{})
   292  	framework.ExpectNoError(err)
   293  	ready := e2enode.IsNodeReady(node)
   294  	schedulable := e2enode.IsNodeSchedulable(node)
   295  	framework.Logf("node %q ready=%v schedulable=%v", node.Name, ready, schedulable)
   296  	return node, ready && schedulable
   297  }
   298  
   299  // logKubeletLatencyMetrics logs KubeletLatencyMetrics computed from the Prometheus
   300  // metrics exposed on the current node and identified by the metricNames.
   301  // The Kubelet subsystem prefix is automatically prepended to these metric names.
   302  func logKubeletLatencyMetrics(ctx context.Context, metricNames ...string) {
   303  	metricSet := sets.NewString()
   304  	for _, key := range metricNames {
   305  		metricSet.Insert(kubeletmetrics.KubeletSubsystem + "_" + key)
   306  	}
   307  	metric, err := e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, fmt.Sprintf("%s:%d", nodeNameOrIP(), ports.KubeletReadOnlyPort), "/metrics")
   308  	if err != nil {
   309  		framework.Logf("Error getting kubelet metrics: %v", err)
   310  	} else {
   311  		framework.Logf("Kubelet Metrics: %+v", e2emetrics.GetKubeletLatencyMetrics(metric, metricSet))
   312  	}
   313  }
   314  
   315  // runCommand runs the cmd and returns the combined stdout and stderr, or an
   316  // error if the command failed.
   317  func runCommand(cmd ...string) (string, error) {
   318  	output, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
   319  	if err != nil {
   320  		return "", fmt.Errorf("failed to run %q: %s (%s)", strings.Join(cmd, " "), err, output)
   321  	}
   322  	return string(output), nil
   323  }
   324  
   325  // getCRIClient connects to the CRI and returns the CRI runtime service and image service clients.
   326  func getCRIClient() (internalapi.RuntimeService, internalapi.ImageManagerService, error) {
   327  	// connection timeout for CRI service connection
   328  	const connectionTimeout = 2 * time.Minute
   329  	runtimeEndpoint := framework.TestContext.ContainerRuntimeEndpoint
   330  	r, err := remote.NewRemoteRuntimeService(runtimeEndpoint, connectionTimeout, oteltrace.NewNoopTracerProvider())
   331  	if err != nil {
   332  		return nil, nil, err
   333  	}
   334  	imageManagerEndpoint := runtimeEndpoint
   335  	if framework.TestContext.ImageServiceEndpoint != "" {
   336  		// ImageServiceEndpoint is the same as ContainerRuntimeEndpoint if not
   337  		// explicitly specified
   338  		imageManagerEndpoint = framework.TestContext.ImageServiceEndpoint
   339  	}
   340  	i, err := remote.NewRemoteImageService(imageManagerEndpoint, connectionTimeout, oteltrace.NewNoopTracerProvider())
   341  	if err != nil {
   342  		return nil, nil, err
   343  	}
   344  	return r, i, nil
   345  }
   346  
   347  // findKubeletServiceName searches for the kubelet unit name among the services known to systemd.
   348  // If the `running` parameter is true, the search is restricted to currently running services;
   349  // otherwise stopped, failed, and exited (non-running in general) services are considered as well.
   350  // TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
   351  func findKubeletServiceName(running bool) string {
   352  	cmdLine := []string{
   353  		"systemctl", "list-units", "*kubelet*",
   354  	}
   355  	if running {
   356  		cmdLine = append(cmdLine, "--state=running")
   357  	}
   358  	stdout, err := exec.Command("sudo", cmdLine...).CombinedOutput()
   359  	framework.ExpectNoError(err)
   360  	regex := regexp.MustCompile("(kubelet-\\w+)")
   361  	matches := regex.FindStringSubmatch(string(stdout))
   362  	gomega.Expect(matches).ToNot(gomega.BeEmpty(), "No kubelet service unit found in systemd units: %q", stdout)
   363  	kubeletServiceName := matches[0]
   364  	framework.Logf("Get running kubelet with systemctl: %v, %v", string(stdout), kubeletServiceName)
   365  	return kubeletServiceName
   366  }
   367  
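        // findContainerRuntimeServiceName returns the systemd unit name that owns the container runtime process,
        // resolved over D-Bus from the runtime's PID.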
   368  func findContainerRuntimeServiceName() (string, error) {
   369  	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
   370  	defer cancel()
   371  
   372  	conn, err := dbus.NewWithContext(ctx)
   373  	framework.ExpectNoError(err, "Failed to setup dbus connection")
   374  	defer conn.Close()
   375  
   376  	runtimePids, err := getPidsForProcess(framework.TestContext.ContainerRuntimeProcessName, framework.TestContext.ContainerRuntimePidFile)
   377  	framework.ExpectNoError(err, "failed to get list of container runtime pids")
   378  	gomega.Expect(runtimePids).To(gomega.HaveLen(1), "Unexpected number of container runtime pids. Expected 1 but got %v", len(runtimePids))
   379  
   380  	containerRuntimePid := runtimePids[0]
   381  
   382  	unitName, err := conn.GetUnitNameByPID(ctx, uint32(containerRuntimePid))
   383  	framework.ExpectNoError(err, "Failed to get container runtime unit name")
   384  
   385  	return unitName, nil
   386  }
   387  
   388  type containerRuntimeUnitOp int
   389  
   390  const (
   391  	startContainerRuntimeUnitOp containerRuntimeUnitOp = iota
   392  	stopContainerRuntimeUnitOp
   393  )
   394  
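        // performContainerRuntimeUnitOp starts or stops the container runtime systemd unit, resolving the unit name on
        // first use, and waits for the systemd job to report "done".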
   395  func performContainerRuntimeUnitOp(op containerRuntimeUnitOp) error {
   396  	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
   397  	defer cancel()
   398  
   399  	conn, err := dbus.NewWithContext(ctx)
   400  	framework.ExpectNoError(err, "Failed to setup dbus connection")
   401  	defer conn.Close()
   402  
   403  	if containerRuntimeUnitName == "" {
   404  		containerRuntimeUnitName, err = findContainerRuntimeServiceName()
   405  		framework.ExpectNoError(err, "Failed to find container runtime name")
   406  	}
   407  
   408  	reschan := make(chan string)
   409  
   410  	switch op {
   411  	case startContainerRuntimeUnitOp:
   412  		_, err = conn.StartUnitContext(ctx, containerRuntimeUnitName, "replace", reschan)
   413  	case stopContainerRuntimeUnitOp:
   414  		_, err = conn.StopUnitContext(ctx, containerRuntimeUnitName, "replace", reschan)
   415  	default:
   416  		framework.Failf("Unexpected container runtime op: %v", op)
   417  	}
   418  	framework.ExpectNoError(err, "dbus connection error")
   419  
   420  	job := <-reschan
   421  	gomega.Expect(job).To(gomega.Equal("done"), "Expected job to complete with done")
   422  
   423  	return nil
   424  }
   425  
   426  func stopContainerRuntime() error {
   427  	return performContainerRuntimeUnitOp(stopContainerRuntimeUnitOp)
   428  }
   429  
   430  func startContainerRuntime() error {
   431  	return performContainerRuntimeUnitOp(startContainerRuntimeUnitOp)
   432  }
   433  
   434  // restartKubelet restarts the current kubelet service.
   435  // The "current" kubelet service is the instance managed by the current e2e_node test run.
   436  // If `running` is true, the kubelet is restarted only if it is actually running. In some cases
   437  // the kubelet may have exited or been stopped, typically because it was intentionally stopped
   438  // earlier during a test, or, sometimes, because it just crashed.
   439  // Warning: the "current" kubelet is poorly defined. It is assumed to be the most recent
   440  // kubelet service unit; in other words, there is no unique ID we use to explicitly bind a kubelet
   441  // instance to a test run.
   442  func restartKubelet(running bool) {
   443  	kubeletServiceName := findKubeletServiceName(running)
   444  	// reset the kubelet service start-limit-hit
   445  	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
   446  	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))
   447  
   448  	stdout, err = exec.Command("sudo", "systemctl", "restart", kubeletServiceName).CombinedOutput()
   449  	framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout))
   450  }
   451  
   452  // stopKubelet will kill the running kubelet and returns a func that restarts it
   453  func stopKubelet() func() {
   454  	kubeletServiceName := findKubeletServiceName(true)
   455  
   456  	// reset the kubelet service start-limit-hit
   457  	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
   458  	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))
   459  
   460  	stdout, err = exec.Command("sudo", "systemctl", "kill", kubeletServiceName).CombinedOutput()
   461  	framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %s", err, string(stdout))
   462  
   463  	return func() {
   464  		// we should restart the service, otherwise the transient service start will fail
   465  		stdout, err := exec.Command("sudo", "systemctl", "restart", kubeletServiceName).CombinedOutput()
   466  		framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout)
   467  	}
   468  }
   469  
   470  // killKubelet sends a signal (SIGINT, SIGSTOP, SIGTERM...) to the running kubelet
   471  func killKubelet(sig string) {
   472  	kubeletServiceName := findKubeletServiceName(true)
   473  
   474  	// reset the kubelet service start-limit-hit
   475  	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
   476  	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %v", err, stdout)
   477  
   478  	stdout, err = exec.Command("sudo", "systemctl", "kill", "-s", sig, kubeletServiceName).CombinedOutput()
   479  	framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %v", err, stdout)
   480  }
   481  
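        // kubeletHealthCheck sends an authenticated HEAD request to the given kubelet healthz URL and reports whether it
        // returned 200 OK.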
   482  func kubeletHealthCheck(url string) bool {
   483  	insecureTransport := http.DefaultTransport.(*http.Transport).Clone()
   484  	insecureTransport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
   485  	insecureHTTPClient := &http.Client{
   486  		Transport: insecureTransport,
   487  	}
   488  
   489  	req, err := http.NewRequest("HEAD", url, nil)
   490  	if err != nil {
   491  		return false
   492  	}
   493  	req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", framework.TestContext.BearerToken))
   494  	resp, err := insecureHTTPClient.Do(req)
   495  	if err != nil {
   496  		klog.Warningf("Health check on %q failed, error=%v", url, err)
   497  	} else if resp.StatusCode != http.StatusOK {
   498  		klog.Warningf("Health check on %q failed, status=%d", url, resp.StatusCode)
   499  	}
   500  	return err == nil && resp.StatusCode == http.StatusOK
   501  }
   502  
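        // toCgroupFsName renders a cgroup name for the configured cgroup driver: systemd notation when the driver is
        // systemd, plain cgroupfs paths otherwise.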
   503  func toCgroupFsName(cgroupName cm.CgroupName) string {
   504  	if kubeletCfg.CgroupDriver == "systemd" {
   505  		return cgroupName.ToSystemd()
   506  	}
   507  	return cgroupName.ToCgroupfs()
   508  }
   509  
   510  // reduceAllocatableMemoryUsageIfCgroupv1 uses memory.force_empty (https://lwn.net/Articles/432224/)
   511  // to make the kernel reclaim memory in the allocatable cgroup.
   512  // The time to reduce pressure may be unbounded, but this usually finishes within a second.
   513  // memory.force_empty is not supported in cgroup v2.
   514  func reduceAllocatableMemoryUsageIfCgroupv1() {
   515  	if !IsCgroup2UnifiedMode() {
   516  		cmd := fmt.Sprintf("echo 0 > /sys/fs/cgroup/memory/%s/memory.force_empty", toCgroupFsName(cm.NewCgroupName(cm.RootCgroupName, defaultNodeAllocatableCgroup)))
   517  		_, err := exec.Command("sudo", "sh", "-c", cmd).CombinedOutput()
   518  		framework.ExpectNoError(err)
   519  	}
   520  }
   521  
   522  // withFeatureGate is the equivalent of featuregatetesting.SetFeatureGateDuringTest,
   523  // which can't be used here because we're not in a Testing context.
   524  // This must be in a non-"_test" file to pass
   525  // make verify WHAT=test-featuregates
   526  func withFeatureGate(feature featuregate.Feature, desired bool) func() {
   527  	current := utilfeature.DefaultFeatureGate.Enabled(feature)
   528  	utilfeature.DefaultMutableFeatureGate.Set(fmt.Sprintf("%s=%v", string(feature), desired))
   529  	return func() {
   530  		utilfeature.DefaultMutableFeatureGate.Set(fmt.Sprintf("%s=%v", string(feature), current))
   531  	}
   532  }
   533  
   534  // waitForAllContainerRemoval waits until all the containers on a given pod are actually removed from the CRI.
   535  // This is needed by the e2e tests which involve exclusive resource allocation (CPU manager, topology manager, podresources, etc.),
   536  // where we need to make sure the tests clean up after themselves so that each test runs in
   537  // a pristine environment. The only way known so far to do that is to introduce this wait.
   538  // Worth noting, however, that this significantly increases the test runtime.
   539  func waitForAllContainerRemoval(ctx context.Context, podName, podNS string) {
   540  	rs, _, err := getCRIClient()
   541  	framework.ExpectNoError(err)
   542  	gomega.Eventually(ctx, func(ctx context.Context) error {
   543  		containers, err := rs.ListContainers(ctx, &runtimeapi.ContainerFilter{
   544  			LabelSelector: map[string]string{
   545  				types.KubernetesPodNameLabel:      podName,
   546  				types.KubernetesPodNamespaceLabel: podNS,
   547  			},
   548  		})
   549  		if err != nil {
   550  			return fmt.Errorf("got error waiting for all containers to be removed from CRI: %v", err)
   551  		}
   552  
   553  		if len(containers) > 0 {
   554  			return fmt.Errorf("expected all containers to be removed from CRI but %v containers still remain. Containers: %+v", len(containers), containers)
   555  		}
   556  		return nil
   557  	}, 2*time.Minute, 1*time.Second).Should(gomega.Succeed())
   558  }
   559  
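        // getPidsForProcess returns the PIDs of the named process, preferring the PID file when one is given and falling
        // back to scanning /proc by process name.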
   560  func getPidsForProcess(name, pidFile string) ([]int, error) {
   561  	if len(pidFile) > 0 {
   562  		pid, err := getPidFromPidFile(pidFile)
   563  		if err == nil {
   564  			return []int{pid}, nil
   565  		}
   566  		// log the error and fall back to pidof
   567  		runtime.HandleError(err)
   568  	}
   569  	return procfs.PidOf(name)
   570  }
   571  
   572  func getPidFromPidFile(pidFile string) (int, error) {
   573  	file, err := os.Open(pidFile)
   574  	if err != nil {
   575  		return 0, fmt.Errorf("error opening pid file %s: %v", pidFile, err)
   576  	}
   577  	defer file.Close()
   578  
   579  	data, err := io.ReadAll(file)
   580  	if err != nil {
   581  		return 0, fmt.Errorf("error reading pid file %s: %v", pidFile, err)
   582  	}
   583  
   584  	pid, err := strconv.Atoi(string(data))
   585  	if err != nil {
   586  		return 0, fmt.Errorf("error parsing %s as a number: %v", string(data), err)
   587  	}
   588  
   589  	return pid, nil
   590  }
   591  
   592  // WaitForPodInitContainerRestartCount waits for the given Pod init container
   593  // to achieve at least a given restartCount
   594  // TODO: eventually look at moving to test/e2e/framework/pod
   595  func WaitForPodInitContainerRestartCount(ctx context.Context, c clientset.Interface, namespace, podName string, initContainerIndex int, desiredRestartCount int32, timeout time.Duration) error {
   596  	conditionDesc := fmt.Sprintf("init container %d started", initContainerIndex)
   597  	return e2epod.WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
   598  		if initContainerIndex > len(pod.Status.InitContainerStatuses)-1 {
   599  			return false, nil
   600  		}
   601  		containerStatus := pod.Status.InitContainerStatuses[initContainerIndex]
   602  		return containerStatus.RestartCount >= desiredRestartCount, nil
   603  	})
   604  }
   605  
   606  // WaitForPodContainerRestartCount waits for the given Pod container to achieve at least a given restartCount
   607  // TODO: eventually look at moving to test/e2e/framework/pod
   608  func WaitForPodContainerRestartCount(ctx context.Context, c clientset.Interface, namespace, podName string, containerIndex int, desiredRestartCount int32, timeout time.Duration) error {
   609  	conditionDesc := fmt.Sprintf("container %d started", containerIndex)
   610  	return e2epod.WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
   611  		if containerIndex > len(pod.Status.ContainerStatuses)-1 {
   612  			return false, nil
   613  		}
   614  		containerStatus := pod.Status.ContainerStatuses[containerIndex]
   615  		return containerStatus.RestartCount >= desiredRestartCount, nil
   616  	})
   617  }
   618  
   619  // WaitForPodInitContainerToFail waits for the given Pod init container to fail with the given reason, specifically due to
   620  // invalid container configuration. In this case, the container will remain in a waiting state with a specific
   621  // reason set, which should match the given reason.
   622  // TODO: eventually look at moving to test/e2e/framework/pod
   623  func WaitForPodInitContainerToFail(ctx context.Context, c clientset.Interface, namespace, podName string, containerIndex int, reason string, timeout time.Duration) error {
   624  	conditionDesc := fmt.Sprintf("container %d failed with reason %s", containerIndex, reason)
   625  	return e2epod.WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
   626  		switch pod.Status.Phase {
   627  		case v1.PodPending:
   628  			if containerIndex >= len(pod.Status.InitContainerStatuses) {
   629  				return false, nil
   630  			}
   631  			containerStatus := pod.Status.InitContainerStatuses[containerIndex]
   632  			if containerStatus.State.Waiting != nil && containerStatus.State.Waiting.Reason == reason {
   633  				return true, nil
   634  			}
   635  			return false, nil
   636  		case v1.PodFailed, v1.PodRunning, v1.PodSucceeded:
   637  			return false, fmt.Errorf("pod was expected to be pending, but it is in the state: %s", pod.Status.Phase)
   638  		}
   639  		return false, nil
   640  	})
   641  }
   642  
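        // nodeNameOrIP returns the address used to reach the local kubelet; node e2e tests always target localhost.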
   643  func nodeNameOrIP() string {
   644  	return "localhost"
   645  }