k8s.io/kubernetes@v1.29.3/test/e2e_node/mirror_pod_grace_period_test.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package e2enode
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"os"
    23  	"strings"
    24  	"time"
    25  
    26  	"github.com/onsi/ginkgo/v2"
    27  	"github.com/onsi/gomega"
    28  	"github.com/onsi/gomega/gstruct"
    29  	"github.com/prometheus/common/model"
    30  	v1 "k8s.io/api/core/v1"
    31  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    32  	"k8s.io/apimachinery/pkg/types"
    33  	"k8s.io/apimachinery/pkg/util/uuid"
    34  	clientset "k8s.io/client-go/kubernetes"
    35  	"k8s.io/kubernetes/test/e2e/framework"
    36  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    37  	imageutils "k8s.io/kubernetes/test/utils/image"
    38  	admissionapi "k8s.io/pod-security-admission/api"
    39  )
    40  
    41  var _ = SIGDescribe("MirrorPodWithGracePeriod", func() {
    42  	f := framework.NewDefaultFramework("mirror-pod-with-grace-period")
    43  	f.NamespacePodSecurityLevel = admissionapi.LevelBaseline
    44  	ginkgo.Context("when create a mirror pod ", func() {
    45  		var ns, podPath, staticPodName, mirrorPodName string
    46  		ginkgo.BeforeEach(func(ctx context.Context) {
    47  			ns = f.Namespace.Name
    48  			staticPodName = "graceful-pod-" + string(uuid.NewUUID())
    49  			mirrorPodName = staticPodName + "-" + framework.TestContext.NodeName
    50  
    51  			podPath = kubeletCfg.StaticPodPath
    52  
    53  			ginkgo.By("create the static pod")
    54  			err := createStaticPodWithGracePeriod(podPath, staticPodName, ns)
    55  			framework.ExpectNoError(err)
    56  
    57  			ginkgo.By("wait for the mirror pod to be running")
    58  			gomega.Eventually(ctx, func(ctx context.Context) error {
    59  				return checkMirrorPodRunning(ctx, f.ClientSet, mirrorPodName, ns)
    60  			}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
    61  		})
    62  
    63  		f.It("mirror pod termination should satisfy grace period when static pod is deleted", f.WithNodeConformance(), func(ctx context.Context) {
    64  			ginkgo.By("get mirror pod uid")
    65  			pod, err := f.ClientSet.CoreV1().Pods(ns).Get(ctx, mirrorPodName, metav1.GetOptions{})
    66  			framework.ExpectNoError(err)
    67  			uid := pod.UID
    68  
    69  			ginkgo.By("delete the static pod")
    70  			file := staticPodPath(podPath, staticPodName, ns)
    71  			framework.Logf("deleting static pod manifest %q", file)
    72  			err = os.Remove(file)
    73  			framework.ExpectNoError(err)
    74  
    75  			ginkgo.By("wait for the mirror pod to be running for grace period")
    76  			gomega.Consistently(ctx, func(ctx context.Context) error {
    77  				return checkMirrorPodRunningWithUID(ctx, f.ClientSet, mirrorPodName, ns, uid)
    78  			}, 19*time.Second, 200*time.Millisecond).Should(gomega.BeNil())
    79  		})
    80  
    81  		f.It("mirror pod termination should satisfy grace period when static pod is updated", f.WithNodeConformance(), func(ctx context.Context) {
    82  			ginkgo.By("get mirror pod uid")
    83  			pod, err := f.ClientSet.CoreV1().Pods(ns).Get(ctx, mirrorPodName, metav1.GetOptions{})
    84  			framework.ExpectNoError(err)
    85  			uid := pod.UID
    86  
    87  			ginkgo.By("update the static pod container image")
    88  			image := imageutils.GetPauseImageName()
    89  			err = createStaticPod(podPath, staticPodName, ns, image, v1.RestartPolicyAlways)
    90  			framework.ExpectNoError(err)
    91  
    92  			ginkgo.By("wait for the mirror pod to be running for grace period")
    93  			gomega.Consistently(ctx, func(ctx context.Context) error {
    94  				return checkMirrorPodRunningWithUID(ctx, f.ClientSet, mirrorPodName, ns, uid)
    95  			}, 19*time.Second, 200*time.Millisecond).Should(gomega.BeNil())
    96  
    97  			ginkgo.By("wait for the mirror pod to be updated")
    98  			gomega.Eventually(ctx, func(ctx context.Context) error {
    99  				return checkMirrorPodRecreatedAndRunning(ctx, f.ClientSet, mirrorPodName, ns, uid)
   100  			}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
   101  
   102  			ginkgo.By("check the mirror pod container image is updated")
   103  			pod, err = f.ClientSet.CoreV1().Pods(ns).Get(ctx, mirrorPodName, metav1.GetOptions{})
   104  			framework.ExpectNoError(err)
   105  			gomega.Expect(pod.Spec.Containers).To(gomega.HaveLen(1))
   106  			gomega.Expect(pod.Spec.Containers[0].Image).To(gomega.Equal(image))
   107  		})
   108  
   109  		f.It("should update a static pod when the static pod is updated multiple times during the graceful termination period", f.WithNodeConformance(), func(ctx context.Context) {
   110  			ginkgo.By("get mirror pod uid")
   111  			pod, err := f.ClientSet.CoreV1().Pods(ns).Get(ctx, mirrorPodName, metav1.GetOptions{})
   112  			framework.ExpectNoError(err)
   113  			uid := pod.UID
   114  
   115  			ginkgo.By("update the pod manifest multiple times during the graceful termination period")
   116  			for i := 0; i < 300; i++ {
   117  				err = createStaticPod(podPath, staticPodName, ns,
   118  					fmt.Sprintf("image-%d", i), v1.RestartPolicyAlways)
   119  				framework.ExpectNoError(err)
   120  				time.Sleep(100 * time.Millisecond)
   121  			}
   122  			image := imageutils.GetPauseImageName()
   123  			err = createStaticPod(podPath, staticPodName, ns, image, v1.RestartPolicyAlways)
   124  			framework.ExpectNoError(err)
   125  
   126  			ginkgo.By("wait for the mirror pod to be updated")
   127  			gomega.Eventually(ctx, func(ctx context.Context) error {
   128  				return checkMirrorPodRecreatedAndRunning(ctx, f.ClientSet, mirrorPodName, ns, uid)
   129  			}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
   130  
   131  			ginkgo.By("check the mirror pod container image is updated")
   132  			pod, err = f.ClientSet.CoreV1().Pods(ns).Get(ctx, mirrorPodName, metav1.GetOptions{})
   133  			framework.ExpectNoError(err)
   134  			gomega.Expect(pod.Spec.Containers).To(gomega.HaveLen(1))
   135  			gomega.Expect(pod.Spec.Containers[0].Image).To(gomega.Equal(image))
   136  		})
   137  
   138  		f.Context("and the container runtime is temporarily down during pod termination", f.WithNodeConformance(), f.WithSerial(), f.WithDisruptive(), func() {
   139  			ginkgo.BeforeEach(func(ctx context.Context) {
   140  				// Ensure that prior to the test starting, no other pods are running or in the process of being terminated other than the mirror pod.
   141  				// This is necessary as the test verifies metrics that assume that there is only one pod (the static pod) being run, and all other pods have been terminated.
   142  				gomega.Eventually(ctx, func(ctx context.Context) error {
   143  					podList, err := e2epod.NewPodClient(f).List(ctx, metav1.ListOptions{})
   144  					if err != nil {
   145  						return fmt.Errorf("failed listing pods while waiting for all pods to be terminated: %v", err)
   146  					}
   147  					var remainingPods []string
   148  
   149  					for _, pod := range podList.Items {
   150  						// The mirror pod is the only expected pod to be running
   151  						if pod.Name == mirrorPodName && pod.Namespace == ns {
   152  							continue
   153  						}
   154  						remainingPods = append(remainingPods, fmt.Sprintf("(%s/%s)", pod.Namespace, pod.Name))
   155  					}
   156  
   157  					if len(remainingPods) > 0 {
   158  						return fmt.Errorf("not all pods are terminated yet prior to starting mirror pod test: %v pods that still exist: %v", len(remainingPods), strings.Join(remainingPods, ","))
   159  					}
   160  					return nil
   161  				}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.Succeed())
   162  			})
   163  			ginkgo.It("the mirror pod should terminate successfully", func(ctx context.Context) {
   164  				ginkgo.By("verifying the pod is described as syncing in metrics")
   165  				gomega.Eventually(ctx, getKubeletMetrics, 5*time.Second, time.Second).Should(gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
   166  					"kubelet_working_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   167  						`kubelet_working_pods{config="desired", lifecycle="sync", static=""}`:                    timelessSample(0),
   168  						`kubelet_working_pods{config="desired", lifecycle="sync", static="true"}`:                timelessSample(1),
   169  						`kubelet_working_pods{config="orphan", lifecycle="sync", static=""}`:                     timelessSample(0),
   170  						`kubelet_working_pods{config="orphan", lifecycle="sync", static="true"}`:                 timelessSample(0),
   171  						`kubelet_working_pods{config="runtime_only", lifecycle="sync", static="unknown"}`:        timelessSample(0),
   172  						`kubelet_working_pods{config="desired", lifecycle="terminating", static=""}`:             timelessSample(0),
   173  						`kubelet_working_pods{config="desired", lifecycle="terminating", static="true"}`:         timelessSample(0),
   174  						`kubelet_working_pods{config="orphan", lifecycle="terminating", static=""}`:              timelessSample(0),
   175  						`kubelet_working_pods{config="orphan", lifecycle="terminating", static="true"}`:          timelessSample(0),
   176  						`kubelet_working_pods{config="runtime_only", lifecycle="terminating", static="unknown"}`: timelessSample(0),
   177  						`kubelet_working_pods{config="desired", lifecycle="terminated", static=""}`:              timelessSample(0),
   178  						`kubelet_working_pods{config="desired", lifecycle="terminated", static="true"}`:          timelessSample(0),
   179  						`kubelet_working_pods{config="orphan", lifecycle="terminated", static=""}`:               timelessSample(0),
   180  						`kubelet_working_pods{config="orphan", lifecycle="terminated", static="true"}`:           timelessSample(0),
   181  						`kubelet_working_pods{config="runtime_only", lifecycle="terminated", static="unknown"}`:  timelessSample(0),
   182  					}),
   183  					"kubelet_mirror_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   184  						`kubelet_mirror_pods`: timelessSample(1),
   185  					}),
   186  					"kubelet_active_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   187  						`kubelet_active_pods{static=""}`:     timelessSample(0),
   188  						`kubelet_active_pods{static="true"}`: timelessSample(1),
   189  					}),
   190  					"kubelet_desired_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   191  						`kubelet_desired_pods{static=""}`:     timelessSample(0),
   192  						`kubelet_desired_pods{static="true"}`: timelessSample(1),
   193  					}),
   194  				}))
   195  
   196  				ginkgo.By("delete the static pod")
   197  				err := deleteStaticPod(podPath, staticPodName, ns)
   198  				framework.ExpectNoError(err)
   199  
   200  				// Note it is important we have a small delay here as we would like to reproduce https://issues.k8s.io/113091 which requires a failure in syncTerminatingPod()
   201  				// This requires waiting a small period between the static pod being deleted so that syncTerminatingPod() will attempt to run
   202  				ginkgo.By("sleeping before stopping the container runtime")
   203  				time.Sleep(2 * time.Second)
   204  
   205  				ginkgo.By("stop the container runtime")
   206  				err = stopContainerRuntime()
   207  				framework.ExpectNoError(err, "expected no error stopping the container runtime")
   208  
   209  				ginkgo.By("waiting for the container runtime to be stopped")
   210  				gomega.Eventually(ctx, func(ctx context.Context) error {
   211  					_, _, err := getCRIClient()
   212  					return err
   213  				}, 2*time.Minute, time.Second*5).ShouldNot(gomega.Succeed())
   214  
   215  				ginkgo.By("verifying the mirror pod is running")
   216  				gomega.Consistently(ctx, func(ctx context.Context) error {
   217  					return checkMirrorPodRunning(ctx, f.ClientSet, mirrorPodName, ns)
   218  				}, 19*time.Second, 200*time.Millisecond).Should(gomega.BeNil())
   219  
   220  				ginkgo.By("verifying the pod is described as terminating in metrics")
   221  				gomega.Eventually(ctx, getKubeletMetrics, 5*time.Second, time.Second).Should(gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
   222  					"kubelet_working_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   223  						`kubelet_working_pods{config="desired", lifecycle="sync", static=""}`:                    timelessSample(0),
   224  						`kubelet_working_pods{config="desired", lifecycle="sync", static="true"}`:                timelessSample(0),
   225  						`kubelet_working_pods{config="orphan", lifecycle="sync", static=""}`:                     timelessSample(0),
   226  						`kubelet_working_pods{config="orphan", lifecycle="sync", static="true"}`:                 timelessSample(0),
   227  						`kubelet_working_pods{config="runtime_only", lifecycle="sync", static="unknown"}`:        timelessSample(0),
   228  						`kubelet_working_pods{config="desired", lifecycle="terminating", static=""}`:             timelessSample(0),
   229  						`kubelet_working_pods{config="desired", lifecycle="terminating", static="true"}`:         timelessSample(0),
   230  						`kubelet_working_pods{config="orphan", lifecycle="terminating", static=""}`:              timelessSample(0),
   231  						`kubelet_working_pods{config="orphan", lifecycle="terminating", static="true"}`:          timelessSample(1),
   232  						`kubelet_working_pods{config="runtime_only", lifecycle="terminating", static="unknown"}`: timelessSample(0),
   233  						`kubelet_working_pods{config="desired", lifecycle="terminated", static=""}`:              timelessSample(0),
   234  						`kubelet_working_pods{config="desired", lifecycle="terminated", static="true"}`:          timelessSample(0),
   235  						`kubelet_working_pods{config="orphan", lifecycle="terminated", static=""}`:               timelessSample(0),
   236  						`kubelet_working_pods{config="orphan", lifecycle="terminated", static="true"}`:           timelessSample(0),
   237  						`kubelet_working_pods{config="runtime_only", lifecycle="terminated", static="unknown"}`:  timelessSample(0),
   238  					}),
   239  					"kubelet_mirror_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   240  						`kubelet_mirror_pods`: timelessSample(1),
   241  					}),
   242  					"kubelet_active_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   243  						`kubelet_active_pods{static=""}`: timelessSample(0),
   244  						// TODO: the pod is still running and consuming resources, it should be considered in
   245  						// admission https://github.com/kubernetes/kubernetes/issues/104824 for static pods at
   246  						// least, which means it should be 1
   247  						`kubelet_active_pods{static="true"}`: timelessSample(0),
   248  					}),
   249  					"kubelet_desired_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   250  						`kubelet_desired_pods{static=""}`:     timelessSample(0),
   251  						`kubelet_desired_pods{static="true"}`: timelessSample(0),
   252  					})}))
   253  
   254  				ginkgo.By("start the container runtime")
   255  				err = startContainerRuntime()
   256  				framework.ExpectNoError(err, "expected no error starting the container runtime")
   257  				ginkgo.By("waiting for the container runtime to start")
   258  				gomega.Eventually(ctx, func(ctx context.Context) error {
   259  					r, _, err := getCRIClient()
   260  					if err != nil {
   261  						return fmt.Errorf("error getting CRI client: %w", err)
   262  					}
   263  					status, err := r.Status(ctx, true)
   264  					if err != nil {
   265  						return fmt.Errorf("error checking CRI status: %w", err)
   266  					}
   267  					framework.Logf("Runtime started: %#v", status)
   268  					return nil
   269  				}, 2*time.Minute, time.Second*5).Should(gomega.Succeed())
   270  
   271  				ginkgo.By(fmt.Sprintf("verifying that the mirror pod (%s/%s) stops running after about 30s", ns, mirrorPodName))
   272  				// from the time the container runtime starts, it should take a maximum of:
   273  				// 20s (grace period) + 2 sync transitions * 1s + 2s between housekeeping + 3s to detect CRI up +
   274  				//   2s overhead
   275  				// which we calculate here as "about 30s", so we try a bit longer than that but verify that it is
   276  				// tightly bounded by not waiting longer (we want to catch regressions to shutdown)
   277  				time.Sleep(30 * time.Second)
   278  				gomega.Eventually(ctx, func(ctx context.Context) error {
   279  					return checkMirrorPodDisappear(ctx, f.ClientSet, mirrorPodName, ns)
   280  				}, time.Second*3, time.Second).Should(gomega.Succeed())
   281  
   282  				ginkgo.By("verifying the pod finishes terminating and is removed from metrics")
   283  				gomega.Eventually(ctx, getKubeletMetrics, 15*time.Second, time.Second).Should(gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
   284  					"kubelet_working_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   285  						`kubelet_working_pods{config="desired", lifecycle="sync", static=""}`:                    timelessSample(0),
   286  						`kubelet_working_pods{config="desired", lifecycle="sync", static="true"}`:                timelessSample(0),
   287  						`kubelet_working_pods{config="orphan", lifecycle="sync", static=""}`:                     timelessSample(0),
   288  						`kubelet_working_pods{config="orphan", lifecycle="sync", static="true"}`:                 timelessSample(0),
   289  						`kubelet_working_pods{config="runtime_only", lifecycle="sync", static="unknown"}`:        timelessSample(0),
   290  						`kubelet_working_pods{config="desired", lifecycle="terminating", static=""}`:             timelessSample(0),
   291  						`kubelet_working_pods{config="desired", lifecycle="terminating", static="true"}`:         timelessSample(0),
   292  						`kubelet_working_pods{config="orphan", lifecycle="terminating", static=""}`:              timelessSample(0),
   293  						`kubelet_working_pods{config="orphan", lifecycle="terminating", static="true"}`:          timelessSample(0),
   294  						`kubelet_working_pods{config="runtime_only", lifecycle="terminating", static="unknown"}`: timelessSample(0),
   295  						`kubelet_working_pods{config="desired", lifecycle="terminated", static=""}`:              timelessSample(0),
   296  						`kubelet_working_pods{config="desired", lifecycle="terminated", static="true"}`:          timelessSample(0),
   297  						`kubelet_working_pods{config="orphan", lifecycle="terminated", static=""}`:               timelessSample(0),
   298  						`kubelet_working_pods{config="orphan", lifecycle="terminated", static="true"}`:           timelessSample(0),
   299  						`kubelet_working_pods{config="runtime_only", lifecycle="terminated", static="unknown"}`:  timelessSample(0),
   300  					}),
   301  					"kubelet_mirror_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   302  						`kubelet_mirror_pods`: timelessSample(0),
   303  					}),
   304  					"kubelet_active_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   305  						`kubelet_active_pods{static=""}`:     timelessSample(0),
   306  						`kubelet_active_pods{static="true"}`: timelessSample(0),
   307  					}),
   308  					"kubelet_desired_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
   309  						`kubelet_desired_pods{static=""}`:     timelessSample(0),
   310  						`kubelet_desired_pods{static="true"}`: timelessSample(0),
   311  					}),
   312  				}))
   313  			})
   314  
   315  			ginkgo.AfterEach(func(ctx context.Context) {
   316  				ginkgo.By("starting the container runtime")
   317  				err := startContainerRuntime()
   318  				framework.ExpectNoError(err, "expected no error starting the container runtime")
   319  				ginkgo.By("waiting for the container runtime to start")
   320  				gomega.Eventually(ctx, func(ctx context.Context) error {
   321  					_, _, err := getCRIClient()
   322  					if err != nil {
   323  						return fmt.Errorf("error getting cri client: %v", err)
   324  					}
   325  					return nil
   326  				}, 2*time.Minute, time.Second*5).Should(gomega.Succeed())
   327  			})
   328  		})
   329  
   330  		ginkgo.AfterEach(func(ctx context.Context) {
   331  			ginkgo.By("delete the static pod")
   332  			err := deleteStaticPod(podPath, staticPodName, ns)
   333  			if !os.IsNotExist(err) {
   334  				framework.ExpectNoError(err)
   335  			}
   336  
   337  			ginkgo.By("wait for the mirror pod to disappear")
   338  			gomega.Eventually(ctx, func(ctx context.Context) error {
   339  				return checkMirrorPodDisappear(ctx, f.ClientSet, mirrorPodName, ns)
   340  			}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
   341  		})
   342  	})
   343  })
   344  
   345  func createStaticPodWithGracePeriod(dir, name, namespace string) error {
   346  	template := `
   347  apiVersion: v1
   348  kind: Pod
   349  metadata:
   350    name: %s
   351    namespace: %s
   352  spec:
   353    terminationGracePeriodSeconds: 20
   354    containers:
   355    - name: m-test
   356      image: %s
   357      command:
   358        - /bin/sh
   359      args:
   360        - '-c'
   361        - |
   362          _term() {
   363          echo "Caught SIGTERM signal!"
   364          sleep 100
   365          }
   366          trap _term SIGTERM
   367          sleep 1000
   368  `
   369  	file := staticPodPath(dir, name, namespace)
   370  	podYaml := fmt.Sprintf(template, name, namespace, imageutils.GetE2EImage(imageutils.BusyBox))
   371  
   372  	f, err := os.OpenFile(file, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0666)
   373  	if err != nil {
   374  		return err
   375  	}
   376  	defer f.Close()
   377  
   378  	_, err = f.WriteString(podYaml)
   379  	framework.Logf("has written %v", file)
   380  	return err
   381  }
   382  
   383  func checkMirrorPodRunningWithUID(ctx context.Context, cl clientset.Interface, name, namespace string, oUID types.UID) error {
   384  	pod, err := cl.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
   385  	if err != nil {
   386  		return fmt.Errorf("expected the mirror pod %q to appear: %w", name, err)
   387  	}
   388  	if pod.UID != oUID {
   389  		return fmt.Errorf("expected the uid of mirror pod %q to be same, got %q", name, pod.UID)
   390  	}
   391  	if pod.Status.Phase != v1.PodRunning {
   392  		return fmt.Errorf("expected the mirror pod %q to be running, got %q", name, pod.Status.Phase)
   393  	}
   394  	return nil
   395  }
   396  
   397  func sampleLabelID(element interface{}) string {
   398  	el := element.(*model.Sample)
   399  	return el.Metric.String()
   400  }