k8s.io/kubernetes@v1.29.3/test/e2e_node/garbage_collector_test.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"fmt"
	"strconv"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	internalapi "k8s.io/cri-api/pkg/apis"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/kubelet/pkg/types"
	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	"k8s.io/kubernetes/test/e2e/nodefeature"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

const (
	// TODO(dashpole): Once dynamic config is possible, test different values for maxPerPodContainer and maxTotalContainers.
	// Currently using the default values for maxPerPodContainer and maxTotalContainers.
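	// These defaults are assumed to match the kubelet's --maximum-dead-containers-per-container (1)
	// and --maximum-dead-containers (-1, i.e. unlimited) settings.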
	maxPerPodContainer = 1
	maxTotalContainers = -1

	garbageCollectDuration = 3 * time.Minute
	setupDuration          = 10 * time.Minute
	runtimePollInterval    = 10 * time.Second
)

type testPodSpec struct {
	podName string
	// containerPrefix must be unique for each pod, and cannot end in a number.
	// containerPrefix is used to identify which containers belong to which pod in the test.
	containerPrefix string
	// the number of times each container should restart
	restartCount int32
	// the number of containers in the test pod
	numContainers int
	// a function that returns the names of this pod's containers currently on the node (including dead containers).
	getContainerNames func() ([]string, error)
}

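// getContainerName returns the name of the i-th container in the pod,
// formed by appending the container index to the pod's containerPrefix.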
func (pod *testPodSpec) getContainerName(containerNumber int) string {
	return fmt.Sprintf("%s%d", pod.containerPrefix, containerNumber)
}

type testRun struct {
	// Name for logging purposes
	testName string
	// Pod specs for the test
	testPods []*testPodSpec
}

// GarbageCollect tests that the Kubelet conforms to the Kubelet Garbage Collection Policy, found here:
// http://kubernetes.io/docs/admin/garbage-collection/
var _ = SIGDescribe("GarbageCollect", framework.WithSerial(), nodefeature.GarbageCollect, func() {
	f := framework.NewDefaultFramework("garbage-collect-test")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
	containerNamePrefix := "gc-test-container-"
	podNamePrefix := "gc-test-pod-"

	// These suffixes are appended to pod and container names.
	// They differentiate pods from one another, and allow filtering
	// by name to identify which containers belong to which pods.
	// They must be unique, and must not end in a number.
	firstSuffix := "one-container-no-restarts"
	secondSuffix := "many-containers-many-restarts-one-pod"
	thirdSuffix := "many-containers-many-restarts-"
	tests := []testRun{
		{
			testName: "One Non-restarting Container",
			testPods: []*testPodSpec{
				{
					podName:         podNamePrefix + firstSuffix,
					containerPrefix: containerNamePrefix + firstSuffix,
					restartCount:    0,
					numContainers:   1,
				},
			},
		},
		{
			testName: "Many Restarting Containers",
			testPods: []*testPodSpec{
				{
					podName:         podNamePrefix + secondSuffix,
					containerPrefix: containerNamePrefix + secondSuffix,
					restartCount:    4,
					numContainers:   4,
				},
			},
		},
		{
			testName: "Many Pods with Many Restarting Containers",
			testPods: []*testPodSpec{
				{
					podName:         podNamePrefix + thirdSuffix + "one",
					containerPrefix: containerNamePrefix + thirdSuffix + "one",
					restartCount:    3,
					numContainers:   4,
				},
				{
					podName:         podNamePrefix + thirdSuffix + "two",
					containerPrefix: containerNamePrefix + thirdSuffix + "two",
					restartCount:    2,
					numContainers:   6,
				},
				{
					podName:         podNamePrefix + thirdSuffix + "three",
					containerPrefix: containerNamePrefix + thirdSuffix + "three",
					restartCount:    3,
					numContainers:   5,
				},
			},
		},
	}
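	// Register one spec per test run; containerGCTest sets up its own Context with
	// BeforeEach/It/AfterEach nodes for each run.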
	for _, test := range tests {
		containerGCTest(f, test)
	}
})

// Tests the following:
//
//	pods are created, and all containers restart the specified number of times
//	while containers are running, the number of copies of a single container does not exceed maxPerPodContainer
//	while containers are running, the total number of containers does not exceed maxTotalContainers
//	while containers are running, if not constrained by maxPerPodContainer or maxTotalContainers, keep an extra copy of each container
//	once pods are killed, all containers are eventually cleaned up
func containerGCTest(f *framework.Framework, test testRun) {
	var runtime internalapi.RuntimeService
	ginkgo.BeforeEach(func() {
		var err error
		runtime, _, err = getCRIClient()
		framework.ExpectNoError(err)
	})
	for _, pod := range test.testPods {
		pod := pod // capture the range variable so each closure binds to its own pod spec (needed under pre-Go 1.22 loop semantics)
		// Initialize the getContainerNames function to use CRI runtime client.
		pod.getContainerNames = func() ([]string, error) {
			relevantContainers := []string{}
			containers, err := runtime.ListContainers(context.Background(), &runtimeapi.ContainerFilter{
				LabelSelector: map[string]string{
					types.KubernetesPodNameLabel:      pod.podName,
					types.KubernetesPodNamespaceLabel: f.Namespace.Name,
				},
			})
			if err != nil {
				return relevantContainers, err
			}
			for _, container := range containers {
				relevantContainers = append(relevantContainers, container.Labels[types.KubernetesContainerNameLabel])
			}
			return relevantContainers, nil
		}
	}

	ginkgo.Context(fmt.Sprintf("Garbage Collection Test: %s", test.testName), func() {
		ginkgo.BeforeEach(func(ctx context.Context) {
			realPods := getPods(test.testPods)
			e2epod.NewPodClient(f).CreateBatch(ctx, realPods)
			ginkgo.By("Making sure all containers restart the specified number of times")
			gomega.Eventually(ctx, func(ctx context.Context) error {
				for _, podSpec := range test.testPods {
					err := verifyPodRestartCount(ctx, f, podSpec.podName, podSpec.numContainers, podSpec.restartCount)
					if err != nil {
						return err
					}
				}
				return nil
			}, setupDuration, runtimePollInterval).Should(gomega.BeNil())
		})

		ginkgo.It("Should eventually garbage collect containers when we exceed the number of dead containers per container", func(ctx context.Context) {
			totalContainers := 0
			for _, pod := range test.testPods {
				totalContainers += pod.numContainers*2 + 1
			}
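			// Wait for the kubelet's container GC to prune dead containers: for each container,
			// at most maxPerPodContainer dead copies should remain alongside the running copy.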
			gomega.Eventually(ctx, func() error {
				total := 0
				for _, pod := range test.testPods {
					containerNames, err := pod.getContainerNames()
					if err != nil {
						return err
					}
					total += len(containerNames)
					// Check maxPerPodContainer for each container in the pod
					for i := 0; i < pod.numContainers; i++ {
						containerCount := 0
						for _, containerName := range containerNames {
							if containerName == pod.getContainerName(i) {
								containerCount++
							}
						}
						if containerCount > maxPerPodContainer+1 {
							return fmt.Errorf("expected number of copies of container %s (including the running copy) to be <= %d; list of containers: %v",
								pod.getContainerName(i), maxPerPodContainer+1, containerNames)
						}
					}
				}
				// Check maxTotalContainers. Currently, the default is -1, so this will never happen until we can configure maxTotalContainers.
				if maxTotalContainers > 0 && totalContainers <= maxTotalContainers && total > maxTotalContainers {
					return fmt.Errorf("expected total number of containers: %v, to be <= maxTotalContainers: %v", total, maxTotalContainers)
				}
				return nil
			}, garbageCollectDuration, runtimePollInterval).Should(gomega.BeNil())

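			// With the defaults above (maxPerPodContainer=1, maxTotalContainers=-1) this branch is skipped;
			// it only runs when at least two dead copies per container may be kept and there is no total cap
			// that could force their removal.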
			if maxPerPodContainer >= 2 && maxTotalContainers < 0 { // make sure constraints wouldn't make us gc old containers
				ginkgo.By("Making sure the kubelet consistently keeps around an extra copy of each container.")
				gomega.Consistently(ctx, func() error {
					for _, pod := range test.testPods {
						containerNames, err := pod.getContainerNames()
						if err != nil {
							return err
						}
						for i := 0; i < pod.numContainers; i++ {
							containerCount := 0
							for _, containerName := range containerNames {
								if containerName == pod.getContainerName(i) {
									containerCount++
								}
							}
							if pod.restartCount > 0 && containerCount < maxPerPodContainer+1 {
								return fmt.Errorf("expected pod %v to have extra copies of old containers", pod.podName)
							}
						}
					}
					return nil
				}, garbageCollectDuration, runtimePollInterval).Should(gomega.BeNil())
			}
		})

		ginkgo.AfterEach(func(ctx context.Context) {
			for _, pod := range test.testPods {
				ginkgo.By(fmt.Sprintf("Deleting Pod %v", pod.podName))
				e2epod.NewPodClient(f).DeleteSync(ctx, pod.podName, metav1.DeleteOptions{}, e2epod.DefaultPodDeletionTimeout)
			}

			ginkgo.By("Making sure all containers get cleaned up")
			gomega.Eventually(ctx, func() error {
				for _, pod := range test.testPods {
					containerNames, err := pod.getContainerNames()
					if err != nil {
						return err
					}
					if len(containerNames) > 0 {
						return fmt.Errorf("%v containers still remain", containerNames)
					}
				}
				return nil
			}, garbageCollectDuration, runtimePollInterval).Should(gomega.BeNil())

			if ginkgo.CurrentSpecReport().Failed() && framework.TestContext.DumpLogsOnFailure {
				logNodeEvents(ctx, f)
				logPodEvents(ctx, f)
			}
		})
	})
}

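// getPods builds the test pods for the given specs: each pod runs numContainers busybox containers that
// share an emptyDir volume and restart restartCount times before sleeping forever.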
func getPods(specs []*testPodSpec) (pods []*v1.Pod) {
	for _, spec := range specs {
		ginkgo.By(fmt.Sprintf("Creating %v containers with restartCount: %v", spec.numContainers, spec.restartCount))
		containers := []v1.Container{}
		for i := 0; i < spec.numContainers; i++ {
			containers = append(containers, v1.Container{
				Image:   busyboxImage,
				Name:    spec.getContainerName(i),
				Command: getRestartingContainerCommand("/test-empty-dir-mnt", i, spec.restartCount, ""),
				VolumeMounts: []v1.VolumeMount{
					{MountPath: "/test-empty-dir-mnt", Name: "test-empty-dir"},
				},
			})
		}
		pods = append(pods, &v1.Pod{
			ObjectMeta: metav1.ObjectMeta{Name: spec.podName},
			Spec: v1.PodSpec{
				RestartPolicy: v1.RestartPolicyAlways,
				Containers:    containers,
				Volumes: []v1.Volume{
					{Name: "test-empty-dir", VolumeSource: v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}},
				},
			},
		})
	}
	return
}

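// getRestartingContainerCommand returns a shell command that appends to a per-container count file on the
// shared volume and exits (triggering a restart) until the container has started restarts+1 times, after
// which it runs loopingCommand and sleeps forever.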
func getRestartingContainerCommand(path string, containerNum int, restarts int32, loopingCommand string) []string {
	return []string{
		"sh",
		"-c",
		fmt.Sprintf(`
			f=%s/countfile%s
			count=$(echo 'hello' >> $f ; wc -l $f | awk {'print $1'})
			if [ $count -lt %d ]; then
				exit 0
			fi
			while true; do %s sleep 1; done`,
			path, strconv.Itoa(containerNum), restarts+1, loopingCommand),
	}
}

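// verifyPodRestartCount checks that the named pod reports the expected number of container statuses and
// that every container has reached the expected restart count.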
func verifyPodRestartCount(ctx context.Context, f *framework.Framework, podName string, expectedNumContainers int, expectedRestartCount int32) error {
	updatedPod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, podName, metav1.GetOptions{})
	if err != nil {
		return err
	}
	if len(updatedPod.Status.ContainerStatuses) != expectedNumContainers {
		return fmt.Errorf("expected pod %s to have %d containers, actual: %d",
			updatedPod.Name, expectedNumContainers, len(updatedPod.Status.ContainerStatuses))
	}
	for _, containerStatus := range updatedPod.Status.ContainerStatuses {
		if containerStatus.RestartCount != expectedRestartCount {
			return fmt.Errorf("pod %s had container with restart count %d; expected %d",
				updatedPod.Name, containerStatus.RestartCount, expectedRestartCount)
		}
	}
	return nil
}