k8s.io/kubernetes@v1.29.3/test/e2e_node/util.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"crypto/tls"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"net"
	"net/http"
	"os"
	"os/exec"
	"regexp"
	"strconv"
	"strings"
	"time"

	"k8s.io/kubernetes/pkg/util/procfs"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"

	oteltrace "go.opentelemetry.io/otel/trace"

	v1 "k8s.io/api/core/v1"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/component-base/featuregate"
	internalapi "k8s.io/cri-api/pkg/apis"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	kubeletpodresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
	kubeletpodresourcesv1alpha1 "k8s.io/kubelet/pkg/apis/podresources/v1alpha1"
	stats "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
	"k8s.io/kubelet/pkg/types"
	"k8s.io/kubernetes/pkg/cluster/ports"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	"k8s.io/kubernetes/pkg/kubelet/cri/remote"
	kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/kubelet/util"

	"github.com/coreos/go-systemd/v22/dbus"
	"k8s.io/kubernetes/test/e2e/framework"
	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2enodekubelet "k8s.io/kubernetes/test/e2e_node/kubeletconfig"
	imageutils "k8s.io/kubernetes/test/utils/image"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

var startServices = flag.Bool("start-services", true, "If true, start local node services")
var stopServices = flag.Bool("stop-services", true, "If true, stop local node services after running tests")
var busyboxImage = imageutils.GetE2EImage(imageutils.BusyBox)

const (
	// defaultNodeAllocatableCgroup is the kubelet-internal cgroup name for the node allocatable cgroup.
	defaultNodeAllocatableCgroup = "kubepods"
	// defaultPodResourcesPath is the path to the local endpoint serving the podresources gRPC service.
	defaultPodResourcesPath    = "/var/lib/kubelet/pod-resources"
	defaultPodResourcesTimeout = 10 * time.Second
	defaultPodResourcesMaxSize = 1024 * 1024 * 16 // 16 MiB
	// state files
	cpuManagerStateFile    = "/var/lib/kubelet/cpu_manager_state"
	memoryManagerStateFile = "/var/lib/kubelet/memory_manager_state"
)

var (
	kubeletHealthCheckURL    = fmt.Sprintf("http://127.0.0.1:%d/healthz", ports.KubeletHealthzPort)
	containerRuntimeUnitName = ""
	// kubeletCfg is the kubelet configuration the tests are running against.
	kubeletCfg *kubeletconfig.KubeletConfiguration
)
func getNodeSummary(ctx context.Context) (*stats.Summary, error) {
	kubeletConfig, err := getCurrentKubeletConfig(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to get current kubelet config: %w", err)
	}
	req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("http://%s/stats/summary", net.JoinHostPort(kubeletConfig.Address, strconv.Itoa(int(kubeletConfig.ReadOnlyPort)))), nil)
	if err != nil {
		return nil, fmt.Errorf("failed to build http request: %w", err)
	}
	req.Header.Add("Accept", "application/json")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to get /stats/summary: %w", err)
	}

	defer resp.Body.Close()
	contentsBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read /stats/summary: %w", err)
	}

	summary := stats.Summary{}
	if err := json.Unmarshal(contentsBytes, &summary); err != nil {
		return nil, fmt.Errorf("failed to parse /stats/summary to go struct: %w", err)
	}
	return &summary, nil
}

func getV1alpha1NodeDevices(ctx context.Context) (*kubeletpodresourcesv1alpha1.ListPodResourcesResponse, error) {
	endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
	if err != nil {
		return nil, fmt.Errorf("error getting local endpoint: %w", err)
	}
	client, conn, err := podresources.GetV1alpha1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
	if err != nil {
		return nil, fmt.Errorf("error getting gRPC client: %w", err)
	}
	defer conn.Close()
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	resp, err := client.List(ctx, &kubeletpodresourcesv1alpha1.ListPodResourcesRequest{})
	if err != nil {
		return nil, fmt.Errorf("%v.List(_) = _, %w", client, err)
	}
	return resp, nil
}

func getV1NodeDevices(ctx context.Context) (*kubeletpodresourcesv1.ListPodResourcesResponse, error) {
	endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
	if err != nil {
		return nil, fmt.Errorf("error getting local endpoint: %w", err)
	}
	client, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
	if err != nil {
		return nil, fmt.Errorf("error getting gRPC client: %w", err)
	}
	defer conn.Close()
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	resp, err := client.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
	if err != nil {
		return nil, fmt.Errorf("%v.List(_) = _, %w", client, err)
	}
	return resp, nil
}

// getCurrentKubeletConfig returns the current KubeletConfiguration.
func getCurrentKubeletConfig(ctx context.Context) (*kubeletconfig.KubeletConfiguration, error) {
	// namespace is only relevant if useProxy==true, so we don't bother
	return e2enodekubelet.GetCurrentKubeletConfig(ctx, framework.TestContext.NodeName, "", false, framework.TestContext.StandaloneMode)
}
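// Illustrative sketch (not used elsewhere in this file): a typical consumer of
// getV1NodeDevices walks the response to log exclusively allocated devices.
// The field accessors follow the podresources v1 API:
//
//	resp, err := getV1NodeDevices(ctx)
//	framework.ExpectNoError(err)
//	for _, podRes := range resp.GetPodResources() {
//		for _, cnt := range podRes.GetContainers() {
//			for _, dev := range cnt.GetDevices() {
//				framework.Logf("pod %s/%s container %s: %s -> %v",
//					podRes.GetNamespace(), podRes.GetName(), cnt.GetName(),
//					dev.GetResourceName(), dev.GetDeviceIds())
//			}
//		}
//	}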
func cleanupPods(f *framework.Framework) {
	ginkgo.AfterEach(func(ctx context.Context) {
		ginkgo.By("Deleting any Pods created by the test in namespace: " + f.Namespace.Name)
		l, err := e2epod.NewPodClient(f).List(ctx, metav1.ListOptions{})
		framework.ExpectNoError(err)
		for _, p := range l.Items {
			if p.Namespace != f.Namespace.Name {
				continue
			}
			framework.Logf("Deleting pod: %s", p.Name)
			e2epod.NewPodClient(f).DeleteSync(ctx, p.Name, metav1.DeleteOptions{}, 2*time.Minute)
		}
	})
}

// tempSetCurrentKubeletConfig must be called within a ginkgo Context. It allows the given function
// to modify the KubeletConfiguration during the BeforeEach of the context.
// The change is reverted in the AfterEach of the context.
func tempSetCurrentKubeletConfig(f *framework.Framework, updateFunction func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration)) {
	var oldCfg *kubeletconfig.KubeletConfiguration

	ginkgo.BeforeEach(func(ctx context.Context) {
		var err error
		oldCfg, err = getCurrentKubeletConfig(ctx)
		framework.ExpectNoError(err)

		newCfg := oldCfg.DeepCopy()
		updateFunction(ctx, newCfg)
		if apiequality.Semantic.DeepEqual(*newCfg, *oldCfg) {
			return
		}

		updateKubeletConfig(ctx, f, newCfg, true)
	})

	ginkgo.AfterEach(func(ctx context.Context) {
		if oldCfg != nil {
			// Restore the original Kubelet configuration.
			updateKubeletConfig(ctx, f, oldCfg, true)
		}
	})
}

func updateKubeletConfig(ctx context.Context, f *framework.Framework, kubeletConfig *kubeletconfig.KubeletConfiguration, deleteStateFiles bool) {
	// Update the Kubelet configuration.
	ginkgo.By("Stopping the kubelet")
	startKubelet := stopKubelet()

	// wait until the kubelet health check fails
	gomega.Eventually(ctx, func() bool {
		return kubeletHealthCheck(kubeletHealthCheckURL)
	}, time.Minute, time.Second).Should(gomega.BeFalse())

	// Delete the CPU and memory manager state files to be sure they will not prevent the kubelet restart
	if deleteStateFiles {
		deleteStateFile(cpuManagerStateFile)
		deleteStateFile(memoryManagerStateFile)
	}

	framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(kubeletConfig))

	ginkgo.By("Starting the kubelet")
	startKubelet()

	// wait until the kubelet health check succeeds
	gomega.Eventually(ctx, func() bool {
		return kubeletHealthCheck(kubeletHealthCheckURL)
	}, 2*time.Minute, 5*time.Second).Should(gomega.BeTrue())

	// Wait for the Kubelet to be ready.
	gomega.Eventually(ctx, func(ctx context.Context) bool {
		nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
		framework.ExpectNoError(err)
		return nodes == 1
	}, time.Minute, time.Second).Should(gomega.BeTrue())
}

func deleteStateFile(stateFileName string) {
	err := exec.Command("/bin/sh", "-c", fmt.Sprintf("rm -f %s", stateFileName)).Run()
	framework.ExpectNoError(err, "failed to delete the state file")
}
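// Illustrative sketch: tempSetCurrentKubeletConfig is meant to be called from
// inside a ginkgo Context, mutating only the fields under test. The
// CPUManagerPolicy tweak below is just an example of a commonly toggled field:
//
//	tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
//		initialConfig.CPUManagerPolicy = "static"
//	})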
// listNamespaceEvents lists the events in the given namespace.
func listNamespaceEvents(ctx context.Context, c clientset.Interface, ns string) error {
	ls, err := c.CoreV1().Events(ns).List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}
	for _, event := range ls.Items {
		klog.Infof("Event(%#v): type: '%v' reason: '%v' %v", event.InvolvedObject, event.Type, event.Reason, event.Message)
	}
	return nil
}

func logPodEvents(ctx context.Context, f *framework.Framework) {
	framework.Logf("Summary of pod events during the test:")
	err := listNamespaceEvents(ctx, f.ClientSet, f.Namespace.Name)
	framework.ExpectNoError(err)
}

func logNodeEvents(ctx context.Context, f *framework.Framework) {
	framework.Logf("Summary of node events during the test:")
	err := listNamespaceEvents(ctx, f.ClientSet, "")
	framework.ExpectNoError(err)
}

func getLocalNode(ctx context.Context, f *framework.Framework) *v1.Node {
	nodeList, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
	framework.ExpectNoError(err)
	gomega.Expect(nodeList.Items).Should(gomega.HaveLen(1), "Unexpected number of node objects for node e2e. Expected only one node.")
	return &nodeList.Items[0]
}

// getLocalTestNode fetches the node object describing the local worker node set up by the e2e_node infra, along with its ready state.
// getLocalTestNode is a variant of `getLocalNode` which reports the node readiness state but does not set any requirement about it,
// letting the caller decide. The check is intentionally done the same way `getLocalNode` does it.
// Note that `getLocalNode` implicitly aborts the test (via gomega.Expect) if the worker node is not ready.
func getLocalTestNode(ctx context.Context, f *framework.Framework) (*v1.Node, bool) {
	node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, framework.TestContext.NodeName, metav1.GetOptions{})
	framework.ExpectNoError(err)
	ready := e2enode.IsNodeReady(node)
	schedulable := e2enode.IsNodeSchedulable(node)
	framework.Logf("node %q ready=%v schedulable=%v", node.Name, ready, schedulable)
	return node, ready && schedulable
}

// logKubeletLatencyMetrics logs KubeletLatencyMetrics computed from the Prometheus
// metrics exposed on the current node and identified by the metricNames.
// The Kubelet subsystem prefix is automatically prepended to these metric names.
func logKubeletLatencyMetrics(ctx context.Context, metricNames ...string) {
	metricSet := sets.NewString()
	for _, key := range metricNames {
		metricSet.Insert(kubeletmetrics.KubeletSubsystem + "_" + key)
	}
	metric, err := e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, fmt.Sprintf("%s:%d", nodeNameOrIP(), ports.KubeletReadOnlyPort), "/metrics")
	if err != nil {
		framework.Logf("Error getting kubelet metrics: %v", err)
	} else {
		framework.Logf("Kubelet Metrics: %+v", e2emetrics.GetKubeletLatencyMetrics(metric, metricSet))
	}
}

// runCommand runs the cmd and returns the combined stdout and stderr, or an
// error if the command failed.
func runCommand(cmd ...string) (string, error) {
	output, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("failed to run %q: %s (%s)", strings.Join(cmd, " "), err, output)
	}
	return string(output), nil
}
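// Illustrative sketch: runCommand takes the argv as separate arguments (the
// exact command below is only an example):
//
//	out, err := runCommand("systemctl", "status", "kubelet")
//	framework.ExpectNoError(err)
//	framework.Logf("kubelet status: %s", out)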
// getCRIClient connects to the CRI and returns the CRI runtime service client and image service client.
func getCRIClient() (internalapi.RuntimeService, internalapi.ImageManagerService, error) {
	// connection timeout for CRI service connection
	const connectionTimeout = 2 * time.Minute
	runtimeEndpoint := framework.TestContext.ContainerRuntimeEndpoint
	r, err := remote.NewRemoteRuntimeService(runtimeEndpoint, connectionTimeout, oteltrace.NewNoopTracerProvider())
	if err != nil {
		return nil, nil, err
	}
	imageManagerEndpoint := runtimeEndpoint
	if framework.TestContext.ImageServiceEndpoint != "" {
		// ImageServiceEndpoint is the same as ContainerRuntimeEndpoint if not
		// explicitly specified
		imageManagerEndpoint = framework.TestContext.ImageServiceEndpoint
	}
	i, err := remote.NewRemoteImageService(imageManagerEndpoint, connectionTimeout, oteltrace.NewNoopTracerProvider())
	if err != nil {
		return nil, nil, err
	}
	return r, i, nil
}

// findKubeletServiceName searches for the kubelet unit name among the services known to systemd.
// If the `running` parameter is true, the search is restricted to currently running services;
// otherwise, stopped, failed, exited (non-running in general) services are also considered.
// TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
func findKubeletServiceName(running bool) string {
	cmdLine := []string{
		"systemctl", "list-units", "*kubelet*",
	}
	if running {
		cmdLine = append(cmdLine, "--state=running")
	}
	stdout, err := exec.Command("sudo", cmdLine...).CombinedOutput()
	framework.ExpectNoError(err)
	regex := regexp.MustCompile(`(kubelet-\w+)`)
	matches := regex.FindStringSubmatch(string(stdout))
	gomega.Expect(matches).ToNot(gomega.BeEmpty(), "No kubelet service found in systemd output: %q", stdout)
	kubeletServiceName := matches[0]
	framework.Logf("Found kubelet service with systemctl: %v, %v", string(stdout), kubeletServiceName)
	return kubeletServiceName
}
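// For reference, the e2e_node harness runs the kubelet as a transient systemd
// unit whose name matches the `kubelet-\w+` pattern above. A hypothetical
// `systemctl list-units *kubelet* --state=running` output line such as
//
//	kubelet-20230601T120000.service loaded active running
//
// would yield the service name "kubelet-20230601T120000".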
func findContainerRuntimeServiceName() (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	conn, err := dbus.NewWithContext(ctx)
	framework.ExpectNoError(err, "Failed to setup dbus connection")
	defer conn.Close()

	runtimePids, err := getPidsForProcess(framework.TestContext.ContainerRuntimeProcessName, framework.TestContext.ContainerRuntimePidFile)
	framework.ExpectNoError(err, "failed to get list of container runtime pids")
	gomega.Expect(runtimePids).To(gomega.HaveLen(1), "Unexpected number of container runtime pids. Expected 1 but got %v", len(runtimePids))

	containerRuntimePid := runtimePids[0]

	unitName, err := conn.GetUnitNameByPID(ctx, uint32(containerRuntimePid))
	framework.ExpectNoError(err, "Failed to get container runtime unit name")

	return unitName, nil
}

type containerRuntimeUnitOp int

const (
	startContainerRuntimeUnitOp containerRuntimeUnitOp = iota
	stopContainerRuntimeUnitOp
)

func performContainerRuntimeUnitOp(op containerRuntimeUnitOp) error {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()

	conn, err := dbus.NewWithContext(ctx)
	framework.ExpectNoError(err, "Failed to setup dbus connection")
	defer conn.Close()

	if containerRuntimeUnitName == "" {
		containerRuntimeUnitName, err = findContainerRuntimeServiceName()
		framework.ExpectNoError(err, "Failed to find container runtime name")
	}

	reschan := make(chan string)

	switch op {
	case startContainerRuntimeUnitOp:
		_, err = conn.StartUnitContext(ctx, containerRuntimeUnitName, "replace", reschan)
	case stopContainerRuntimeUnitOp:
		_, err = conn.StopUnitContext(ctx, containerRuntimeUnitName, "replace", reschan)
	default:
		framework.Failf("Unexpected container runtime op: %v", op)
	}
	framework.ExpectNoError(err, "dbus connection error")

	job := <-reschan
	gomega.Expect(job).To(gomega.Equal("done"), "Expected job to complete with done")

	return nil
}

func stopContainerRuntime() error {
	return performContainerRuntimeUnitOp(stopContainerRuntimeUnitOp)
}

func startContainerRuntime() error {
	return performContainerRuntimeUnitOp(startContainerRuntimeUnitOp)
}
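// Illustrative sketch: tests that need the container runtime to be down pair
// the two helpers around their assertions:
//
//	framework.ExpectNoError(stopContainerRuntime())
//	// ... exercise kubelet behavior while the runtime is unavailable ...
//	framework.ExpectNoError(startContainerRuntime())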
// restartKubelet restarts the current kubelet service.
// The "current" kubelet service is the instance managed by the current e2e_node test run.
// If `running` is true, restarts only if the current kubelet is actually running. In some cases,
// the kubelet may have exited or been stopped, typically because it was intentionally stopped
// earlier during a test, or, sometimes, because it just crashed.
// Warning: the "current" kubelet is poorly defined. The "current" kubelet is assumed to be the most
// recent kubelet service unit; in other words, there is no unique ID we use to bind explicitly a kubelet
// instance to a test run.
func restartKubelet(running bool) {
	kubeletServiceName := findKubeletServiceName(running)
	// reset the kubelet service start-limit-hit
	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))

	stdout, err = exec.Command("sudo", "systemctl", "restart", kubeletServiceName).CombinedOutput()
	framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout))
}

// stopKubelet will kill the running kubelet, and returns a func that will restart the process again
func stopKubelet() func() {
	kubeletServiceName := findKubeletServiceName(true)

	// reset the kubelet service start-limit-hit
	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))

	stdout, err = exec.Command("sudo", "systemctl", "kill", kubeletServiceName).CombinedOutput()
	framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %s", err, string(stdout))

	return func() {
		// we should restart the service, otherwise the transient service start will fail
		stdout, err := exec.Command("sudo", "systemctl", "restart", kubeletServiceName).CombinedOutput()
		framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout))
	}
}

// killKubelet sends a signal (SIGINT, SIGSTOP, SIGTERM...) to the running kubelet
func killKubelet(sig string) {
	kubeletServiceName := findKubeletServiceName(true)

	// reset the kubelet service start-limit-hit
	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))

	stdout, err = exec.Command("sudo", "systemctl", "kill", "-s", sig, kubeletServiceName).CombinedOutput()
	framework.ExpectNoError(err, "Failed to kill kubelet with systemctl: %v, %s", err, string(stdout))
}

func kubeletHealthCheck(url string) bool {
	insecureTransport := http.DefaultTransport.(*http.Transport).Clone()
	insecureTransport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
	insecureHTTPClient := &http.Client{
		Transport: insecureTransport,
	}

	req, err := http.NewRequest("HEAD", url, nil)
	if err != nil {
		return false
	}
	req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", framework.TestContext.BearerToken))
	resp, err := insecureHTTPClient.Do(req)
	if err != nil {
		klog.Warningf("Health check on %q failed, error=%v", url, err)
		return false
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		klog.Warningf("Health check on %q failed, status=%d", url, resp.StatusCode)
		return false
	}
	return true
}

func toCgroupFsName(cgroupName cm.CgroupName) string {
	if kubeletCfg.CgroupDriver == "systemd" {
		return cgroupName.ToSystemd()
	}
	return cgroupName.ToCgroupfs()
}
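// Illustrative sketch, mirroring what updateKubeletConfig does above: stop the
// kubelet, wait for the health check to fail, then restart it and wait for the
// health check to pass again:
//
//	startKubelet := stopKubelet()
//	gomega.Eventually(ctx, func() bool {
//		return kubeletHealthCheck(kubeletHealthCheckURL)
//	}, time.Minute, time.Second).Should(gomega.BeFalse())
//	startKubelet()
//	gomega.Eventually(ctx, func() bool {
//		return kubeletHealthCheck(kubeletHealthCheckURL)
//	}, 2*time.Minute, 5*time.Second).Should(gomega.BeTrue())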
// reduceAllocatableMemoryUsageIfCgroupv1 uses memory.force_empty (https://lwn.net/Articles/432224/)
// to make the kernel reclaim memory in the allocatable cgroup.
// The time to reduce pressure may be unbounded, but it usually finishes within a second.
// memory.force_empty is not supported in cgroupv2.
func reduceAllocatableMemoryUsageIfCgroupv1() {
	if !IsCgroup2UnifiedMode() {
		cmd := fmt.Sprintf("echo 0 > /sys/fs/cgroup/memory/%s/memory.force_empty", toCgroupFsName(cm.NewCgroupName(cm.RootCgroupName, defaultNodeAllocatableCgroup)))
		_, err := exec.Command("sudo", "sh", "-c", cmd).CombinedOutput()
		framework.ExpectNoError(err)
	}
}

// withFeatureGate is the equivalent of featuregatetesting.SetFeatureGateDuringTest,
// which can't be used here because we're not in a testing context.
// This must be in a non-"_test" file to pass
// make verify WHAT=test-featuregates
func withFeatureGate(feature featuregate.Feature, desired bool) func() {
	current := utilfeature.DefaultFeatureGate.Enabled(feature)
	framework.ExpectNoError(utilfeature.DefaultMutableFeatureGate.Set(fmt.Sprintf("%s=%v", string(feature), desired)))
	return func() {
		framework.ExpectNoError(utilfeature.DefaultMutableFeatureGate.Set(fmt.Sprintf("%s=%v", string(feature), current)))
	}
}
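// Illustrative sketch: withFeatureGate returns a restore func, so the usual
// pattern is to defer it ("SomeFeature" is a placeholder, not a real gate):
//
//	cleanup := withFeatureGate(featuregate.Feature("SomeFeature"), true)
//	defer cleanup()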
// waitForAllContainerRemoval waits until all the containers of a given pod are really gone.
// This is needed by the e2e tests which involve exclusive resource allocation (cpu, topology manager; podresources; etc.)
// In these cases, we need to make sure the tests clean up after themselves so that each test runs in
// a pristine environment. The only way known so far to do that is to introduce this wait.
// Worth noting, however, that this makes the test runtime significantly longer.
func waitForAllContainerRemoval(ctx context.Context, podName, podNS string) {
	rs, _, err := getCRIClient()
	framework.ExpectNoError(err)
	gomega.Eventually(ctx, func(ctx context.Context) error {
		containers, err := rs.ListContainers(ctx, &runtimeapi.ContainerFilter{
			LabelSelector: map[string]string{
				types.KubernetesPodNameLabel:      podName,
				types.KubernetesPodNamespaceLabel: podNS,
			},
		})
		if err != nil {
			return fmt.Errorf("got error waiting for all containers to be removed from CRI: %w", err)
		}

		if len(containers) > 0 {
			return fmt.Errorf("expected all containers to be removed from CRI but %v containers still remain. Containers: %+v", len(containers), containers)
		}
		return nil
	}, 2*time.Minute, 1*time.Second).Should(gomega.Succeed())
}

func getPidsForProcess(name, pidFile string) ([]int, error) {
	if len(pidFile) > 0 {
		pid, err := getPidFromPidFile(pidFile)
		if err == nil {
			return []int{pid}, nil
		}
		// log the error and fall back to pidof
		runtime.HandleError(err)
	}
	return procfs.PidOf(name)
}

func getPidFromPidFile(pidFile string) (int, error) {
	file, err := os.Open(pidFile)
	if err != nil {
		return 0, fmt.Errorf("error opening pid file %s: %w", pidFile, err)
	}
	defer file.Close()

	data, err := io.ReadAll(file)
	if err != nil {
		return 0, fmt.Errorf("error reading pid file %s: %w", pidFile, err)
	}

	pid, err := strconv.Atoi(strings.TrimSpace(string(data)))
	if err != nil {
		return 0, fmt.Errorf("error parsing %s as a number: %w", string(data), err)
	}

	return pid, nil
}

// WaitForPodInitContainerRestartCount waits for the given Pod init container
// to achieve at least a given restartCount.
// TODO: eventually look at moving to test/e2e/framework/pod
func WaitForPodInitContainerRestartCount(ctx context.Context, c clientset.Interface, namespace, podName string, initContainerIndex int, desiredRestartCount int32, timeout time.Duration) error {
	conditionDesc := fmt.Sprintf("init container %d started", initContainerIndex)
	return e2epod.WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
		if initContainerIndex > len(pod.Status.InitContainerStatuses)-1 {
			return false, nil
		}
		containerStatus := pod.Status.InitContainerStatuses[initContainerIndex]
		return containerStatus.RestartCount >= desiredRestartCount, nil
	})
}

// WaitForPodContainerRestartCount waits for the given Pod container to achieve at least a given restartCount.
// TODO: eventually look at moving to test/e2e/framework/pod
func WaitForPodContainerRestartCount(ctx context.Context, c clientset.Interface, namespace, podName string, containerIndex int, desiredRestartCount int32, timeout time.Duration) error {
	conditionDesc := fmt.Sprintf("container %d started", containerIndex)
	return e2epod.WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
		if containerIndex > len(pod.Status.ContainerStatuses)-1 {
			return false, nil
		}
		containerStatus := pod.Status.ContainerStatuses[containerIndex]
		return containerStatus.RestartCount >= desiredRestartCount, nil
	})
}
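// Illustrative sketch: waiting for the first app container of a pod to restart
// at least once within five minutes:
//
//	err := WaitForPodContainerRestartCount(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 1, 5*time.Minute)
//	framework.ExpectNoError(err)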
// WaitForPodInitContainerToFail waits for the given Pod init container to fail with the given reason, specifically due to
// invalid container configuration. In this case, the container will remain in a waiting state with a specific
// reason set, which should match the given reason.
// TODO: eventually look at moving to test/e2e/framework/pod
func WaitForPodInitContainerToFail(ctx context.Context, c clientset.Interface, namespace, podName string, containerIndex int, reason string, timeout time.Duration) error {
	conditionDesc := fmt.Sprintf("container %d failed with reason %s", containerIndex, reason)
	return e2epod.WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
		switch pod.Status.Phase {
		case v1.PodPending:
			if containerIndex >= len(pod.Status.InitContainerStatuses) {
				return false, nil
			}
			containerStatus := pod.Status.InitContainerStatuses[containerIndex]
			if containerStatus.State.Waiting != nil && containerStatus.State.Waiting.Reason == reason {
				return true, nil
			}
			return false, nil
		case v1.PodFailed, v1.PodRunning, v1.PodSucceeded:
			return false, fmt.Errorf("pod was expected to be pending, but it is in the state: %s", pod.Status.Phase)
		}
		return false, nil
	})
}

// nodeNameOrIP returns the address used to reach the local node's kubelet endpoints.
// The node e2e suite always runs against the local host.
func nodeNameOrIP() string {
	return "localhost"
}