k8s.io/kubernetes@v1.29.3/test/e2e_node/node_perf_test.go

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	admissionapi "k8s.io/pod-security-admission/api"

	"k8s.io/kubernetes/test/e2e/framework"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2enodekubelet "k8s.io/kubernetes/test/e2e_node/kubeletconfig"
	"k8s.io/kubernetes/test/e2e_node/perf/workloads"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

// makeNodePerfPod returns a pod with the information provided from the workload.
func makeNodePerfPod(w workloads.NodePerfWorkload) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: fmt.Sprintf("%s-pod", w.Name()),
		},
		Spec: w.PodSpec(),
	}
}

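// setKubeletConfig stops the kubelet, writes the given configuration, restarts
// the kubelet, and waits until the node reports Ready again. A nil cfg skips
// the restart and only waits for node readiness.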
func setKubeletConfig(ctx context.Context, f *framework.Framework, cfg *kubeletconfig.KubeletConfiguration) {
	if cfg != nil {
		// Update the Kubelet configuration.
		ginkgo.By("Stopping the kubelet")
		startKubelet := stopKubelet()

		// Wait until the kubelet health check fails.
		gomega.Eventually(ctx, func() bool {
			return kubeletHealthCheck(kubeletHealthCheckURL)
		}, time.Minute, time.Second).Should(gomega.BeFalse())

		framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(cfg))

		ginkgo.By("Starting the kubelet")
		startKubelet()

		// Wait until the kubelet health check succeeds.
		gomega.Eventually(ctx, func() bool {
			return kubeletHealthCheck(kubeletHealthCheckURL)
		}, 2*time.Minute, 5*time.Second).Should(gomega.BeTrue())
	}

	// Wait for the Kubelet to be ready.
	gomega.Eventually(ctx, func(ctx context.Context) bool {
		nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
		framework.ExpectNoError(err)
		return nodes == 1
	}, time.Minute, time.Second).Should(gomega.BeTrue())
}

// Serial because the test updates kubelet configuration.
// Slow by design.
var _ = SIGDescribe("Node Performance Testing", framework.WithSerial(), framework.WithSlow(), func() {
	f := framework.NewDefaultFramework("node-performance-testing")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
	var (
		wl     workloads.NodePerfWorkload
		oldCfg *kubeletconfig.KubeletConfiguration
		newCfg *kubeletconfig.KubeletConfiguration
		pod    *v1.Pod
	)
	ginkgo.JustBeforeEach(func(ctx context.Context) {
		err := wl.PreTestExec()
		framework.ExpectNoError(err)
		oldCfg, err = getCurrentKubeletConfig(ctx)
		framework.ExpectNoError(err)
		newCfg, err = wl.KubeletConfig(oldCfg)
		framework.ExpectNoError(err)
		setKubeletConfig(ctx, f, newCfg)
	})

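	// cleanup deletes the workload pod, runs the workload's post-test hook, and
	// restores the original kubelet configuration.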
	cleanup := func(ctx context.Context) {
		gp := int64(0)
		delOpts := metav1.DeleteOptions{
			GracePeriodSeconds: &gp,
		}
		e2epod.NewPodClient(f).DeleteSync(ctx, pod.Name, delOpts, e2epod.DefaultPodDeletionTimeout)

		// Give the CPU manager some extra time to finish any cleanup it needs to
		// do now that the pod has been deleted. Otherwise we may hit a race in
		// which PostTestExec deletes the CPU manager's checkpoint file while the
		// CPU manager is still doing work, leaving a new checkpoint file behind
		// after PostTestExec has finished. That would cause the kubelet to panic
		// when we try to set the kubelet config below.
		time.Sleep(15 * time.Second)
		ginkgo.By("running the post test exec from the workload")
		err := wl.PostTestExec()
		framework.ExpectNoError(err)
		setKubeletConfig(ctx, f, oldCfg)
	}

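	// runWorkload creates the pod for the current workload, waits for it to reach
	// a terminal phase, and extracts the performance measurement from its logs.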
	runWorkload := func(ctx context.Context) {
		ginkgo.By("running the workload and waiting for success")
		// Make the pod for the workload.
		pod = makeNodePerfPod(wl)
		// Create the pod.
		pod = e2epod.NewPodClient(f).CreateSync(ctx, pod)
		// Wait for the pod to succeed, but avoid WaitForSuccess because we want
		// the container logs upon failure (#109295).
		podErr := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, fmt.Sprintf("%s or %s", v1.PodSucceeded, v1.PodFailed), wl.Timeout(),
			func(pod *v1.Pod) (bool, error) {
				switch pod.Status.Phase {
				case v1.PodFailed:
					return true, fmt.Errorf("pod %q failed with reason: %q, message: %q", pod.Name, pod.Status.Reason, pod.Status.Message)
				case v1.PodSucceeded:
					return true, nil
				default:
					return false, nil
				}
			},
		)
		podLogs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name)
		framework.ExpectNoError(err)
		if podErr != nil {
			framework.Logf("dumping pod logs due to pod error detected: \n%s", podLogs)
			framework.Failf("pod error: %v", podErr)
		}
		perf, err := wl.ExtractPerformanceFromLogs(podLogs)
		framework.ExpectNoError(err)
		framework.Logf("Time to complete workload %s: %v", wl.Name(), perf)
		// Using framework.ExpectNoError here for consistency would change the output format.
		gomega.Expect(podErr).To(gomega.Succeed(), "wait for pod %q to succeed", pod.Name)
	}

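	// The benchmark workloads need a relatively large node; skip the suite when
	// the local node does not have enough allocatable CPU and memory.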
	ginkgo.BeforeEach(func(ctx context.Context) {
		ginkgo.By("ensure environment has enough CPU + Memory to run")
		minimumRequiredCPU := resource.MustParse("15")
		minimumRequiredMemory := resource.MustParse("48Gi")
		localNodeCap := getLocalNode(ctx, f).Status.Allocatable
		cpuCap := localNodeCap[v1.ResourceCPU]
		memCap := localNodeCap[v1.ResourceMemory]
		if cpuCap.Cmp(minimumRequiredCPU) == -1 {
			e2eskipper.Skipf("Skipping Node Performance Tests due to lack of CPU. Required %v is more than allocatable capacity %v.", minimumRequiredCPU, cpuCap)
		}
		if memCap.Cmp(minimumRequiredMemory) == -1 {
			e2eskipper.Skipf("Skipping Node Performance Tests due to lack of memory. Required %v is more than allocatable capacity %v.", minimumRequiredMemory, memCap)
		}
	})

	ginkgo.Context("Run node performance testing with pre-defined workloads", func() {
		ginkgo.BeforeEach(func() {
			wl = workloads.NodePerfWorkloads[0]
		})
		ginkgo.It("NAS parallel benchmark (NPB) suite - Integer Sort (IS) workload", func(ctx context.Context) {
			ginkgo.DeferCleanup(cleanup)
			runWorkload(ctx)
		})
	})
	ginkgo.Context("Run node performance testing with pre-defined workloads", func() {
		ginkgo.BeforeEach(func() {
			wl = workloads.NodePerfWorkloads[1]
		})
		ginkgo.It("NAS parallel benchmark (NPB) suite - Embarrassingly Parallel (EP) workload", func(ctx context.Context) {
			ginkgo.DeferCleanup(cleanup)
			runWorkload(ctx)
		})
	})
	ginkgo.Context("Run node performance testing with pre-defined workloads", func() {
		ginkgo.BeforeEach(func() {
			wl = workloads.NodePerfWorkloads[2]
		})
		ginkgo.It("TensorFlow workload", func(ctx context.Context) {
			ginkgo.DeferCleanup(cleanup)
			runWorkload(ctx)
		})
	})
})