k8s.io/kubernetes@v1.29.3/test/e2e_node/eviction_test.go

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package e2enode
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"path/filepath"
    23  	"strconv"
    24  	"strings"
    25  	"time"
    26  
    27  	v1 "k8s.io/api/core/v1"
    28  	schedulingv1 "k8s.io/api/scheduling/v1"
    29  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    30  	"k8s.io/apimachinery/pkg/api/resource"
    31  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    32  	"k8s.io/apimachinery/pkg/fields"
    33  	kubeletstatsv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
    34  	"k8s.io/kubernetes/pkg/features"
    35  	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
    36  	"k8s.io/kubernetes/pkg/kubelet/eviction"
    37  	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
    38  	kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
    39  	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
    40  	"k8s.io/kubernetes/test/e2e/feature"
    41  	"k8s.io/kubernetes/test/e2e/framework"
    42  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    43  	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
    44  	"k8s.io/kubernetes/test/e2e/nodefeature"
    45  	testutils "k8s.io/kubernetes/test/utils"
    46  	imageutils "k8s.io/kubernetes/test/utils/image"
    47  	admissionapi "k8s.io/pod-security-admission/api"
    48  
    49  	"github.com/onsi/ginkgo/v2"
    50  	"github.com/onsi/gomega"
    51  )
    52  
    53  // Eviction Policy is described here:
    54  // https://github.com/kubernetes/design-proposals-archive/blob/main/node/kubelet-eviction.md
    55  
    56  const (
    57  	postTestConditionMonitoringPeriod = 1 * time.Minute
    58  	evictionPollInterval              = 2 * time.Second
    59  	pressureDisappearTimeout          = 10 * time.Minute
     60  	// Pressure conditions often surface only after the evictions themselves, because the kubelet
     61  	// updates node conditions periodically.
     62  	// We wait this long after evictions to make sure that delay has passed.
    63  	pressureDelay     = 20 * time.Second
    64  	testContextFmt    = "when we run containers that should cause %s"
    65  	noPressure        = v1.NodeConditionType("NoPressure")
     66  	lotsOfDisk        = 10240      // 10 GiB, expressed in MiB
    67  	lotsOfFiles       = 1000000000 // 1 billion
    68  	resourceInodes    = v1.ResourceName("inodes")
    69  	noStarvedResource = v1.ResourceName("none")
    70  )
    71  
    72  // InodeEviction tests that the node responds to node disk pressure by evicting only responsible pods.
    73  // Node disk pressure is induced by consuming all inodes on the node.
    74  var _ = SIGDescribe("InodeEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), nodefeature.Eviction, func() {
    75  	f := framework.NewDefaultFramework("inode-eviction-test")
    76  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
    77  	expectedNodeCondition := v1.NodeDiskPressure
    78  	expectedStarvedResource := resourceInodes
    79  	pressureTimeout := 15 * time.Minute
    80  	inodesConsumed := uint64(200000)
    81  	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
    82  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
    83  			// Set the eviction threshold to inodesFree - inodesConsumed, so that using inodesConsumed causes an eviction.
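         			// For illustration (hypothetical numbers): if the node reports 3,200,000 free inodes and
         			// inodesConsumed is 200,000, the hard threshold becomes nodefs.inodesFree<3,000,000, so a pod
         			// that creates roughly 200,000 files is enough to cross it.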
    84  			summary := eventuallyGetSummary(ctx)
    85  			inodesFree := *summary.Node.Fs.InodesFree
    86  			if inodesFree <= inodesConsumed {
    87  				e2eskipper.Skipf("Too few inodes free on the host for the InodeEviction test to run")
    88  			}
    89  			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesConsumed)}
    90  			initialConfig.EvictionMinimumReclaim = map[string]string{}
    91  		})
    92  		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logInodeMetrics, []podEvictSpec{
    93  			{
    94  				evictionPriority: 1,
    95  				pod:              inodeConsumingPod("container-inode-hog", lotsOfFiles, nil),
    96  			},
    97  			{
    98  				evictionPriority: 1,
    99  				pod:              inodeConsumingPod("volume-inode-hog", lotsOfFiles, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}}),
   100  			},
   101  			{
   102  				evictionPriority: 0,
   103  				pod:              innocentPod(),
   104  			},
   105  		})
   106  	})
   107  })
   108  
    109  // ImageGCNoEviction tests that the node does not evict pods when inode pressure can be relieved by image garbage collection.
    110  // Disk pressure is induced by consuming inodes; image GC is expected to reclaim enough of them to avoid evictions.
   111  var _ = SIGDescribe("ImageGCNoEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), nodefeature.Eviction, func() {
   112  	f := framework.NewDefaultFramework("image-gc-eviction-test")
   113  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   114  	pressureTimeout := 10 * time.Minute
   115  	expectedNodeCondition := v1.NodeDiskPressure
   116  	expectedStarvedResource := resourceInodes
   117  	inodesConsumed := uint64(100000)
   118  	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
   119  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
   120  			// Set the eviction threshold to inodesFree - inodesConsumed, so that using inodesConsumed causes an eviction.
   121  			summary := eventuallyGetSummary(ctx)
   122  			inodesFree := *summary.Node.Fs.InodesFree
   123  			if inodesFree <= inodesConsumed {
    124  				e2eskipper.Skipf("Too few inodes free on the host for the ImageGCNoEviction test to run")
   125  			}
   126  			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesConsumed)}
   127  			initialConfig.EvictionMinimumReclaim = map[string]string{}
   128  		})
   129  		// Consume enough inodes to induce disk pressure,
   130  		// but expect that image garbage collection can reduce it enough to avoid an eviction
   131  		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{
   132  			{
   133  				evictionPriority: 0,
   134  				pod:              inodeConsumingPod("container-inode", 110000, nil),
   135  			},
   136  		})
   137  	})
   138  })
   139  
   140  // MemoryAllocatableEviction tests that the node responds to node memory pressure by evicting only responsible pods.
   141  // Node memory pressure is only encountered because we reserve the majority of the node's capacity via kube-reserved.
   142  var _ = SIGDescribe("MemoryAllocatableEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), nodefeature.Eviction, func() {
   143  	f := framework.NewDefaultFramework("memory-allocatable-eviction-test")
   144  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   145  	expectedNodeCondition := v1.NodeMemoryPressure
   146  	expectedStarvedResource := v1.ResourceMemory
   147  	pressureTimeout := 10 * time.Minute
   148  	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
   149  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
   150  			// Set large system and kube reserved values to trigger allocatable thresholds far before hard eviction thresholds.
   151  			kubeReserved := getNodeCPUAndMemoryCapacity(ctx, f)[v1.ResourceMemory]
    152  			// The default hard eviction threshold is 250Mi, so Allocatable = Capacity - Reserved - 250Mi.
    153  			// We want Allocatable = 50Mi, so set Reserved = Capacity - Allocatable - 250Mi = Capacity - 300Mi.
   154  			kubeReserved.Sub(resource.MustParse("300Mi"))
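         			// Worked example (hypothetical capacity): on an 8Gi node, kubeReserved becomes
         			// 8Gi - 300Mi = 7892Mi, so Allocatable = 8192Mi - 7892Mi - 250Mi (hard threshold) = 50Mi,
         			// and the memory-hog pod quickly pushes allocatable usage past that.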
   155  			initialConfig.KubeReserved = map[string]string{
   156  				string(v1.ResourceMemory): kubeReserved.String(),
   157  			}
   158  			initialConfig.EnforceNodeAllocatable = []string{kubetypes.NodeAllocatableEnforcementKey}
   159  			initialConfig.CgroupsPerQOS = true
   160  		})
   161  		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logMemoryMetrics, []podEvictSpec{
   162  			{
   163  				evictionPriority: 1,
   164  				pod:              getMemhogPod("memory-hog-pod", "memory-hog", v1.ResourceRequirements{}),
   165  			},
   166  			{
   167  				evictionPriority: 0,
   168  				pod:              innocentPod(),
   169  			},
   170  		})
   171  	})
   172  })
   173  
   174  // LocalStorageEviction tests that the node responds to node disk pressure by evicting only responsible pods
   175  // Disk pressure is induced by running pods which consume disk space.
   176  var _ = SIGDescribe("LocalStorageEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), nodefeature.Eviction, func() {
   177  	f := framework.NewDefaultFramework("localstorage-eviction-test")
   178  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   179  	pressureTimeout := 15 * time.Minute
   180  	expectedNodeCondition := v1.NodeDiskPressure
   181  	expectedStarvedResource := v1.ResourceEphemeralStorage
   182  	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
   183  
   184  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
   185  			summary := eventuallyGetSummary(ctx)
   186  
   187  			diskConsumedByTest := resource.MustParse("4Gi")
   188  			availableBytesOnSystem := *(summary.Node.Fs.AvailableBytes)
   189  			evictionThreshold := strconv.FormatUint(availableBytesOnSystem-uint64(diskConsumedByTest.Value()), 10)
   190  
   191  			if availableBytesOnSystem <= uint64(diskConsumedByTest.Value()) {
   192  				e2eskipper.Skipf("Too little disk free on the host for the LocalStorageEviction test to run")
   193  			}
   194  
   195  			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): evictionThreshold}
   196  			initialConfig.EvictionMinimumReclaim = map[string]string{}
   197  		})
   198  
   199  		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{
   200  			{
   201  				evictionPriority: 1,
   202  				pod:              diskConsumingPod("container-disk-hog", lotsOfDisk, nil, v1.ResourceRequirements{}),
   203  			},
   204  			{
   205  				evictionPriority: 0,
   206  				pod:              innocentPod(),
   207  			},
   208  		})
   209  	})
   210  })
   211  
    212  // LocalStorageSoftEviction tests that the node responds to node disk pressure by evicting only responsible pods.
    213  // Disk pressure is induced by running pods which consume enough disk space to exceed the soft eviction threshold.
    214  // Note: this test's purpose is to exercise soft evictions; local storage was chosen because it is the least costly way to induce them.
   215  var _ = SIGDescribe("LocalStorageSoftEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), nodefeature.Eviction, func() {
   216  	f := framework.NewDefaultFramework("localstorage-eviction-test")
   217  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   218  	pressureTimeout := 10 * time.Minute
   219  	expectedNodeCondition := v1.NodeDiskPressure
   220  	expectedStarvedResource := v1.ResourceEphemeralStorage
   221  	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
   222  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
   223  			diskConsumed := resource.MustParse("4Gi")
   224  			summary := eventuallyGetSummary(ctx)
   225  			availableBytes := *(summary.Node.Fs.AvailableBytes)
   226  			if availableBytes <= uint64(diskConsumed.Value()) {
   227  				e2eskipper.Skipf("Too little disk free on the host for the LocalStorageSoftEviction test to run")
   228  			}
   229  			initialConfig.EvictionSoft = map[string]string{string(evictionapi.SignalNodeFsAvailable): fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
   230  			initialConfig.EvictionSoftGracePeriod = map[string]string{string(evictionapi.SignalNodeFsAvailable): "1m"}
    231  			// Allow soft evictions to use up to the pod default grace period (30s)
   232  			initialConfig.EvictionMaxPodGracePeriod = 30
   233  			initialConfig.EvictionMinimumReclaim = map[string]string{}
    234  			// Ensure that pods are not evicted because of the hard eviction threshold.
    235  			// Setting a threshold to 0% disables it; a non-empty map is needed to override the default value (due to omitempty).
   236  			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalMemoryAvailable): "0%"}
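         			// The resulting kubelet configuration is roughly equivalent to the following fragment
         			// (the nodefs.available value is illustrative; the test writes availableBytes - 4Gi as a raw byte count):
         			//
         			//	evictionSoft:
         			//	  nodefs.available: "6442450944"
         			//	evictionSoftGracePeriod:
         			//	  nodefs.available: "1m"
         			//	evictionMaxPodGracePeriod: 30
         			//	evictionHard:
         			//	  memory.available: "0%"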
   237  		})
   238  		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{
   239  			{
   240  				evictionPriority: 1,
   241  				pod:              diskConsumingPod("container-disk-hog", lotsOfDisk, nil, v1.ResourceRequirements{}),
   242  			},
   243  			{
   244  				evictionPriority: 0,
   245  				pod:              innocentPod(),
   246  			},
   247  		})
   248  	})
   249  })
   250  
    251  // This test validates that pods backed by in-memory EmptyDirs are evicted when the kubelet does
    252  // not have SizeMemoryBackedVolumes enabled. When sized memory volumes are enabled, it is
    253  // not possible to exhaust the volume's quota, so no eviction occurs.
   254  var _ = SIGDescribe("LocalStorageCapacityIsolationMemoryBackedVolumeEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.LocalStorageCapacityIsolation, nodefeature.Eviction, func() {
   255  	f := framework.NewDefaultFramework("localstorage-eviction-test")
   256  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   257  	evictionTestTimeout := 7 * time.Minute
   258  	ginkgo.Context(fmt.Sprintf(testContextFmt, "evictions due to pod local storage violations"), func() {
   259  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
    260  			// Setting a threshold to 0% disables it; a non-empty map is needed to override the default value (due to omitempty).
   261  			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalMemoryAvailable): "0%"}
   262  			if initialConfig.FeatureGates == nil {
   263  				initialConfig.FeatureGates = make(map[string]bool)
   264  			}
   265  			initialConfig.FeatureGates["SizeMemoryBackedVolumes"] = false
   266  		})
   267  
   268  		sizeLimit := resource.MustParse("100Mi")
   269  		useOverLimit := 200 /* Mb */
   270  		useUnderLimit := 80 /* Mb */
   271  		containerLimit := v1.ResourceList{v1.ResourceEphemeralStorage: sizeLimit}
   272  
   273  		runEvictionTest(f, evictionTestTimeout, noPressure, noStarvedResource, logDiskMetrics, []podEvictSpec{
   274  			{
   275  				evictionPriority: 1, // Should be evicted due to disk limit
   276  				pod: diskConsumingPod("emptydir-memory-over-volume-sizelimit", useOverLimit, &v1.VolumeSource{
   277  					EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory", SizeLimit: &sizeLimit},
   278  				}, v1.ResourceRequirements{}),
   279  			},
   280  			{
   281  				evictionPriority: 0, // Should not be evicted, as container limits do not account for memory backed volumes
   282  				pod: diskConsumingPod("emptydir-memory-over-container-sizelimit", useOverLimit, &v1.VolumeSource{
   283  					EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory"},
   284  				}, v1.ResourceRequirements{Limits: containerLimit}),
   285  			},
   286  			{
   287  				evictionPriority: 0,
   288  				pod: diskConsumingPod("emptydir-memory-innocent", useUnderLimit, &v1.VolumeSource{
   289  					EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory", SizeLimit: &sizeLimit},
   290  				}, v1.ResourceRequirements{}),
   291  			},
   292  		})
   293  	})
   294  })
   295  
   296  // LocalStorageCapacityIsolationEviction tests that container and volume local storage limits are enforced through evictions
   297  var _ = SIGDescribe("LocalStorageCapacityIsolationEviction", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), feature.LocalStorageCapacityIsolation, nodefeature.Eviction, func() {
   298  	f := framework.NewDefaultFramework("localstorage-eviction-test")
   299  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   300  	evictionTestTimeout := 10 * time.Minute
   301  	ginkgo.Context(fmt.Sprintf(testContextFmt, "evictions due to pod local storage violations"), func() {
   302  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
    303  			// Setting a threshold to 0% disables it; a non-empty map is needed to override the default value (due to omitempty).
   304  			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalMemoryAvailable): "0%"}
   305  		})
   306  		sizeLimit := resource.MustParse("100Mi")
   307  		useOverLimit := 101 /* Mb */
   308  		useUnderLimit := 99 /* Mb */
   309  		containerLimit := v1.ResourceList{v1.ResourceEphemeralStorage: sizeLimit}
   310  
   311  		runEvictionTest(f, evictionTestTimeout, noPressure, noStarvedResource, logDiskMetrics, []podEvictSpec{
   312  			{
    313  				evictionPriority: 1, // This pod should be evicted because of an emptyDir (default storage type) usage violation
   314  				pod: diskConsumingPod("emptydir-disk-sizelimit", useOverLimit, &v1.VolumeSource{
   315  					EmptyDir: &v1.EmptyDirVolumeSource{SizeLimit: &sizeLimit},
   316  				}, v1.ResourceRequirements{}),
   317  			},
   318  			{
   319  				evictionPriority: 1, // This pod should cross the container limit by writing to its writable layer.
   320  				pod:              diskConsumingPod("container-disk-limit", useOverLimit, nil, v1.ResourceRequirements{Limits: containerLimit}),
   321  			},
   322  			{
   323  				evictionPriority: 1, // This pod should hit the container limit by writing to an emptydir
   324  				pod: diskConsumingPod("container-emptydir-disk-limit", useOverLimit, &v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}},
   325  					v1.ResourceRequirements{Limits: containerLimit}),
   326  			},
   327  			{
    328  				evictionPriority: 0, // This pod should not be evicted because memory-backed volumes cannot use more space than is allocated to them when SizeMemoryBackedVolumes is enabled
   329  				pod: diskConsumingPod("emptydir-memory-sizelimit", useOverLimit, &v1.VolumeSource{
   330  					EmptyDir: &v1.EmptyDirVolumeSource{Medium: "Memory", SizeLimit: &sizeLimit},
   331  				}, v1.ResourceRequirements{}),
   332  			},
   333  			{
   334  				evictionPriority: 0, // This pod should not be evicted because it uses less than its limit
   335  				pod: diskConsumingPod("emptydir-disk-below-sizelimit", useUnderLimit, &v1.VolumeSource{
   336  					EmptyDir: &v1.EmptyDirVolumeSource{SizeLimit: &sizeLimit},
   337  				}, v1.ResourceRequirements{}),
   338  			},
   339  			{
   340  				evictionPriority: 0, // This pod should not be evicted because it uses less than its limit
   341  				pod:              diskConsumingPod("container-disk-below-sizelimit", useUnderLimit, nil, v1.ResourceRequirements{Limits: containerLimit}),
   342  			},
   343  		})
   344  	})
   345  })
   346  
   347  // PriorityMemoryEvictionOrdering tests that the node responds to node memory pressure by evicting pods.
    348  // It verifies that the guaranteed pod is never evicted, and that the lower-priority pod is evicted before
    349  // the higher-priority pod.
   350  var _ = SIGDescribe("PriorityMemoryEvictionOrdering", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), nodefeature.Eviction, func() {
   351  	f := framework.NewDefaultFramework("priority-memory-eviction-ordering-test")
   352  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   353  	expectedNodeCondition := v1.NodeMemoryPressure
   354  	expectedStarvedResource := v1.ResourceMemory
   355  	pressureTimeout := 10 * time.Minute
   356  
   357  	highPriorityClassName := f.BaseName + "-high-priority"
   358  	highPriority := int32(999999999)
   359  
   360  	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
   361  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
   362  			memoryConsumed := resource.MustParse("600Mi")
   363  			summary := eventuallyGetSummary(ctx)
   364  			availableBytes := *(summary.Node.Memory.AvailableBytes)
   365  			if availableBytes <= uint64(memoryConsumed.Value()) {
   366  				e2eskipper.Skipf("Too little memory free on the host for the PriorityMemoryEvictionOrdering test to run")
   367  			}
   368  			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalMemoryAvailable): fmt.Sprintf("%d", availableBytes-uint64(memoryConsumed.Value()))}
   369  			initialConfig.EvictionMinimumReclaim = map[string]string{}
   370  		})
   371  		ginkgo.BeforeEach(func(ctx context.Context) {
   372  			_, err := f.ClientSet.SchedulingV1().PriorityClasses().Create(ctx, &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: highPriorityClassName}, Value: highPriority}, metav1.CreateOptions{})
   373  			if err != nil && !apierrors.IsAlreadyExists(err) {
   374  				framework.ExpectNoError(err, "failed to create priority class")
   375  			}
   376  		})
   377  		ginkgo.AfterEach(func(ctx context.Context) {
   378  			err := f.ClientSet.SchedulingV1().PriorityClasses().Delete(ctx, highPriorityClassName, metav1.DeleteOptions{})
   379  			framework.ExpectNoError(err)
   380  		})
   381  		specs := []podEvictSpec{
   382  			{
   383  				evictionPriority: 2,
   384  				pod:              getMemhogPod("memory-hog-pod", "memory-hog", v1.ResourceRequirements{}),
   385  			},
   386  			{
   387  				evictionPriority: 1,
   388  				pod:              getMemhogPod("high-priority-memory-hog-pod", "high-priority-memory-hog", v1.ResourceRequirements{}),
   389  			},
   390  			{
   391  				evictionPriority: 0,
   392  				pod: getMemhogPod("guaranteed-pod", "guaranteed-pod", v1.ResourceRequirements{
   393  					Requests: v1.ResourceList{
   394  						v1.ResourceMemory: resource.MustParse("300Mi"),
   395  					},
   396  					Limits: v1.ResourceList{
   397  						v1.ResourceMemory: resource.MustParse("300Mi"),
   398  					},
   399  				}),
   400  			},
   401  		}
   402  		specs[1].pod.Spec.PriorityClassName = highPriorityClassName
   403  		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logMemoryMetrics, specs)
   404  	})
   405  })
   406  
   407  // PriorityLocalStorageEvictionOrdering tests that the node responds to node disk pressure by evicting pods.
    408  // It verifies that the guaranteed pod is never evicted, and that the lower-priority pod is evicted before
    409  // the higher-priority pod.
   410  var _ = SIGDescribe("PriorityLocalStorageEvictionOrdering", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), nodefeature.Eviction, func() {
   411  	f := framework.NewDefaultFramework("priority-disk-eviction-ordering-test")
   412  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   413  	expectedNodeCondition := v1.NodeDiskPressure
   414  	expectedStarvedResource := v1.ResourceEphemeralStorage
   415  	pressureTimeout := 15 * time.Minute
   416  
   417  	highPriorityClassName := f.BaseName + "-high-priority"
   418  	highPriority := int32(999999999)
   419  
   420  	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
   421  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
   422  			diskConsumed := resource.MustParse("4Gi")
   423  			summary := eventuallyGetSummary(ctx)
   424  			availableBytes := *(summary.Node.Fs.AvailableBytes)
   425  			if availableBytes <= uint64(diskConsumed.Value()) {
   426  				e2eskipper.Skipf("Too little disk free on the host for the PriorityLocalStorageEvictionOrdering test to run")
   427  			}
   428  			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
   429  			initialConfig.EvictionMinimumReclaim = map[string]string{}
   430  		})
   431  		ginkgo.BeforeEach(func(ctx context.Context) {
   432  			_, err := f.ClientSet.SchedulingV1().PriorityClasses().Create(ctx, &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: highPriorityClassName}, Value: highPriority}, metav1.CreateOptions{})
   433  			if err != nil && !apierrors.IsAlreadyExists(err) {
   434  				framework.ExpectNoError(err, "failed to create priority class")
   435  			}
   436  		})
   437  		ginkgo.AfterEach(func(ctx context.Context) {
   438  			err := f.ClientSet.SchedulingV1().PriorityClasses().Delete(ctx, highPriorityClassName, metav1.DeleteOptions{})
   439  			framework.ExpectNoError(err)
   440  		})
   441  		specs := []podEvictSpec{
   442  			{
   443  				evictionPriority: 2,
   444  				pod:              diskConsumingPod("best-effort-disk", lotsOfDisk, nil, v1.ResourceRequirements{}),
   445  			},
   446  			{
   447  				evictionPriority: 1,
   448  				pod:              diskConsumingPod("high-priority-disk", lotsOfDisk, nil, v1.ResourceRequirements{}),
   449  			},
   450  			{
   451  				evictionPriority: 0,
    452  				// Only require 99% accuracy (297/300 MB) because on some OS distributions the file itself (excluding contents) consumes disk space.
   453  				pod: diskConsumingPod("guaranteed-disk", 297 /* Mb */, nil, v1.ResourceRequirements{
   454  					Requests: v1.ResourceList{
   455  						v1.ResourceEphemeralStorage: resource.MustParse("300Mi"),
   456  					},
   457  					Limits: v1.ResourceList{
   458  						v1.ResourceEphemeralStorage: resource.MustParse("300Mi"),
   459  					},
   460  				}),
   461  			},
   462  		}
   463  		specs[1].pod.Spec.PriorityClassName = highPriorityClassName
   464  		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, specs)
   465  	})
   466  })
   467  
   468  // PriorityPidEvictionOrdering tests that the node emits pid pressure in response to a fork bomb, and evicts pods by priority
   469  var _ = SIGDescribe("PriorityPidEvictionOrdering", framework.WithSlow(), framework.WithSerial(), framework.WithDisruptive(), nodefeature.Eviction, func() {
   470  	f := framework.NewDefaultFramework("pidpressure-eviction-test")
   471  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   472  	pressureTimeout := 10 * time.Minute
   473  	expectedNodeCondition := v1.NodePIDPressure
   474  	expectedStarvedResource := noStarvedResource
   475  
   476  	highPriorityClassName := f.BaseName + "-high-priority"
   477  	highPriority := int32(999999999)
   478  
   479  	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
   480  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
   481  			pidsConsumed := int64(10000)
   482  			summary := eventuallyGetSummary(ctx)
   483  			availablePids := *(summary.Node.Rlimit.MaxPID) - *(summary.Node.Rlimit.NumOfRunningProcesses)
   484  			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalPIDAvailable): fmt.Sprintf("%d", availablePids-pidsConsumed)}
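         			// For illustration (hypothetical numbers): with MaxPID=32768 and 1000 running processes,
         			// availablePids is 31768 and the hard threshold becomes pid.available<21768, so a fork bomb
         			// spawning ~12000 processes crosses it.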
   485  			initialConfig.EvictionMinimumReclaim = map[string]string{}
   486  		})
   487  		ginkgo.BeforeEach(func(ctx context.Context) {
   488  			_, err := f.ClientSet.SchedulingV1().PriorityClasses().Create(ctx, &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: highPriorityClassName}, Value: highPriority}, metav1.CreateOptions{})
   489  			if err != nil && !apierrors.IsAlreadyExists(err) {
   490  				framework.ExpectNoError(err, "failed to create priority class")
   491  			}
   492  		})
   493  		ginkgo.AfterEach(func(ctx context.Context) {
   494  			err := f.ClientSet.SchedulingV1().PriorityClasses().Delete(ctx, highPriorityClassName, metav1.DeleteOptions{})
   495  			framework.ExpectNoError(err)
   496  		})
   497  		specs := []podEvictSpec{
   498  			{
   499  				evictionPriority: 2,
   500  				pod:              pidConsumingPod("fork-bomb-container-with-low-priority", 12000),
   501  			},
   502  			{
   503  				evictionPriority: 0,
   504  				pod:              innocentPod(),
   505  			},
   506  			{
   507  				evictionPriority: 1,
   508  				pod:              pidConsumingPod("fork-bomb-container-with-high-priority", 12000),
   509  			},
   510  		}
   511  		specs[1].pod.Spec.PriorityClassName = highPriorityClassName
   512  		specs[2].pod.Spec.PriorityClassName = highPriorityClassName
   513  		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logPidMetrics, specs)
   514  	})
   515  
   516  	f.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition)+"; PodDisruptionConditions enabled", nodefeature.PodDisruptionConditions, func() {
   517  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
   518  			pidsConsumed := int64(10000)
   519  			summary := eventuallyGetSummary(ctx)
   520  			availablePids := *(summary.Node.Rlimit.MaxPID) - *(summary.Node.Rlimit.NumOfRunningProcesses)
   521  			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalPIDAvailable): fmt.Sprintf("%d", availablePids-pidsConsumed)}
   522  			initialConfig.EvictionMinimumReclaim = map[string]string{}
   523  			initialConfig.FeatureGates = map[string]bool{
   524  				string(features.PodDisruptionConditions): true,
   525  			}
   526  		})
   527  		disruptionTarget := v1.DisruptionTarget
   528  		specs := []podEvictSpec{
   529  			{
   530  				evictionPriority:           1,
   531  				pod:                        pidConsumingPod("fork-bomb-container", 30000),
   532  				wantPodDisruptionCondition: &disruptionTarget,
   533  			},
   534  		}
   535  		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logPidMetrics, specs)
   536  	})
   537  })
   538  
    539  // podEvictSpec is used by runEvictionTest to specify a pod and when it should be evicted relative to the other pods.
   540  type podEvictSpec struct {
    541  	// P0 should never be evicted; P1 should not be evicted before P2, etc.
    542  	// If two are ranked at P1, either is permitted to fail before the other.
    543  	// The test ends when all pods other than P0 have been evicted.
   544  	evictionPriority           int
   545  	pod                        *v1.Pod
   546  	wantPodDisruptionCondition *v1.PodConditionType
   547  }
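         // For orientation, the tests above build slices such as
         // []podEvictSpec{{evictionPriority: 2, pod: ...}, {evictionPriority: 1, pod: ...}, {evictionPriority: 0, pod: innocentPod()}};
         // the priority-2 pod must fail before (or in the same round as) the priority-1 pod, and the priority-0 pod must never be evicted.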
   548  
   549  // runEvictionTest sets up a testing environment given the provided pods, and checks a few things:
   550  //
   551  //	It ensures that the desired expectedNodeCondition is actually triggered.
   552  //	It ensures that evictionPriority 0 pods are not evicted
   553  //	It ensures that lower evictionPriority pods are always evicted before higher evictionPriority pods (2 evicted before 1, etc.)
   554  //	It ensures that all pods with non-zero evictionPriority are eventually evicted.
   555  //
   556  // runEvictionTest then cleans up the testing environment by deleting provided pods, and ensures that expectedNodeCondition no longer exists
   557  func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expectedNodeCondition v1.NodeConditionType, expectedStarvedResource v1.ResourceName, logFunc func(ctx context.Context), testSpecs []podEvictSpec) {
   558  	// Place the remainder of the test within a context so that the kubelet config is set before and after the test.
   559  	ginkgo.Context("", func() {
   560  		ginkgo.BeforeEach(func(ctx context.Context) {
   561  			// reduce memory usage in the allocatable cgroup to ensure we do not have MemoryPressure
   562  			reduceAllocatableMemoryUsageIfCgroupv1()
   563  			// Nodes do not immediately report local storage capacity
   564  			// Sleep so that pods requesting local storage do not fail to schedule
   565  			time.Sleep(30 * time.Second)
   566  			ginkgo.By("setting up pods to be used by tests")
   567  			pods := []*v1.Pod{}
   568  			for _, spec := range testSpecs {
   569  				pods = append(pods, spec.pod)
   570  			}
   571  			e2epod.NewPodClient(f).CreateBatch(ctx, pods)
   572  		})
   573  
   574  		ginkgo.It("should eventually evict all of the correct pods", func(ctx context.Context) {
   575  			ginkgo.By(fmt.Sprintf("Waiting for node to have NodeCondition: %s", expectedNodeCondition))
   576  			gomega.Eventually(ctx, func(ctx context.Context) error {
   577  				logFunc(ctx)
   578  				if expectedNodeCondition == noPressure || hasNodeCondition(ctx, f, expectedNodeCondition) {
   579  					return nil
   580  				}
   581  				return fmt.Errorf("NodeCondition: %s not encountered", expectedNodeCondition)
   582  			}, pressureTimeout, evictionPollInterval).Should(gomega.BeNil())
   583  
   584  			ginkgo.By("Waiting for evictions to occur")
   585  			gomega.Eventually(ctx, func(ctx context.Context) error {
   586  				if expectedNodeCondition != noPressure {
   587  					if hasNodeCondition(ctx, f, expectedNodeCondition) {
   588  						framework.Logf("Node has %s", expectedNodeCondition)
   589  					} else {
   590  						framework.Logf("Node does NOT have %s", expectedNodeCondition)
   591  					}
   592  				}
   593  				logKubeletLatencyMetrics(ctx, kubeletmetrics.EvictionStatsAgeKey)
   594  				logFunc(ctx)
   595  				return verifyEvictionOrdering(ctx, f, testSpecs)
   596  			}, pressureTimeout, evictionPollInterval).Should(gomega.Succeed())
   597  
   598  			ginkgo.By("checking for the expected pod conditions for evicted pods")
   599  			verifyPodConditions(ctx, f, testSpecs)
   600  
    601  			// We observe pressure from the API server.  The eviction manager observes pressure from the kubelet's internal stats,
    602  			// so it will see pressure before we do, creating a delay between when the eviction manager evicts a pod
    603  			// and when we observe the pressure by querying the API server.  Add a delay here to account for this lag.
   604  			ginkgo.By("making sure pressure from test has surfaced before continuing")
   605  			time.Sleep(pressureDelay)
   606  
   607  			ginkgo.By(fmt.Sprintf("Waiting for NodeCondition: %s to no longer exist on the node", expectedNodeCondition))
   608  			gomega.Eventually(ctx, func(ctx context.Context) error {
   609  				logFunc(ctx)
   610  				logKubeletLatencyMetrics(ctx, kubeletmetrics.EvictionStatsAgeKey)
   611  				if expectedNodeCondition != noPressure && hasNodeCondition(ctx, f, expectedNodeCondition) {
   612  					return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition)
   613  				}
   614  				return nil
   615  			}, pressureDisappearTimeout, evictionPollInterval).Should(gomega.BeNil())
   616  
   617  			ginkgo.By("checking for stable, pressure-free condition without unexpected pod failures")
   618  			gomega.Consistently(ctx, func(ctx context.Context) error {
   619  				if expectedNodeCondition != noPressure && hasNodeCondition(ctx, f, expectedNodeCondition) {
   620  					return fmt.Errorf("%s disappeared and then reappeared", expectedNodeCondition)
   621  				}
   622  				logFunc(ctx)
   623  				logKubeletLatencyMetrics(ctx, kubeletmetrics.EvictionStatsAgeKey)
   624  				return verifyEvictionOrdering(ctx, f, testSpecs)
   625  			}, postTestConditionMonitoringPeriod, evictionPollInterval).Should(gomega.Succeed())
   626  
   627  			ginkgo.By("checking for correctly formatted eviction events")
   628  			verifyEvictionEvents(ctx, f, testSpecs, expectedStarvedResource)
   629  		})
   630  
   631  		ginkgo.AfterEach(func(ctx context.Context) {
    632  			prePullImagesIfNecessary := func() {
    633  				if expectedNodeCondition == v1.NodeDiskPressure && framework.TestContext.PrepullImages {
    634  					// The disk eviction test may cause the prepulled images to be evicted.
    635  					// Prepull those images again so that this test does not affect the tests that follow.
    636  					PrePullAllImages()
    637  				}
    638  			}
    639  			// Run prePull using a defer to make sure it is executed even when the assertions below fail.
    640  			defer prePullImagesIfNecessary()
   641  
   642  			ginkgo.By("deleting pods")
   643  			for _, spec := range testSpecs {
   644  				ginkgo.By(fmt.Sprintf("deleting pod: %s", spec.pod.Name))
   645  				e2epod.NewPodClient(f).DeleteSync(ctx, spec.pod.Name, metav1.DeleteOptions{}, 10*time.Minute)
   646  			}
   647  
    648  			// In case a test fails before verifying that the NodeCondition no longer exists on the node,
   649  			// we should wait for the NodeCondition to disappear
   650  			ginkgo.By(fmt.Sprintf("making sure NodeCondition %s no longer exists on the node", expectedNodeCondition))
   651  			gomega.Eventually(ctx, func(ctx context.Context) error {
   652  				if expectedNodeCondition != noPressure && hasNodeCondition(ctx, f, expectedNodeCondition) {
   653  					return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition)
   654  				}
   655  				return nil
   656  			}, pressureDisappearTimeout, evictionPollInterval).Should(gomega.BeNil())
   657  
   658  			reduceAllocatableMemoryUsageIfCgroupv1()
   659  			ginkgo.By("making sure we have all the required images for testing")
    660  			prePullImagesIfNecessary()
   661  
   662  			// Ensure that the NodeCondition hasn't returned after pulling images
   663  			ginkgo.By(fmt.Sprintf("making sure NodeCondition %s doesn't exist again after pulling images", expectedNodeCondition))
   664  			gomega.Eventually(ctx, func(ctx context.Context) error {
   665  				if expectedNodeCondition != noPressure && hasNodeCondition(ctx, f, expectedNodeCondition) {
   666  					return fmt.Errorf("Conditions haven't returned to normal, node still has %s", expectedNodeCondition)
   667  				}
   668  				return nil
   669  			}, pressureDisappearTimeout, evictionPollInterval).Should(gomega.BeNil())
   670  
   671  			ginkgo.By("making sure we can start a new pod after the test")
   672  			podName := "test-admit-pod"
   673  			e2epod.NewPodClient(f).CreateSync(ctx, &v1.Pod{
   674  				ObjectMeta: metav1.ObjectMeta{
   675  					Name: podName,
   676  				},
   677  				Spec: v1.PodSpec{
   678  					RestartPolicy: v1.RestartPolicyNever,
   679  					Containers: []v1.Container{
   680  						{
   681  							Image: imageutils.GetPauseImageName(),
   682  							Name:  podName,
   683  						},
   684  					},
   685  				},
   686  			})
   687  
   688  			if ginkgo.CurrentSpecReport().Failed() {
   689  				if framework.TestContext.DumpLogsOnFailure {
   690  					logPodEvents(ctx, f)
   691  					logNodeEvents(ctx, f)
   692  				}
   693  			}
   694  		})
   695  	})
   696  }
   697  
    698  // verifyEvictionOrdering returns an error unless all non-zero-priority pods have been evicted, nil otherwise.
    699  // This function fails the test (via gomega.Expect) if eviction ordering is violated, or if a priority-zero pod fails.
   700  func verifyEvictionOrdering(ctx context.Context, f *framework.Framework, testSpecs []podEvictSpec) error {
   701  	// Gather current information
   702  	updatedPodList, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).List(ctx, metav1.ListOptions{})
   703  	if err != nil {
   704  		return err
   705  	}
   706  	updatedPods := updatedPodList.Items
   707  	for _, p := range updatedPods {
   708  		framework.Logf("fetching pod %s; phase= %v", p.Name, p.Status.Phase)
   709  	}
   710  
   711  	ginkgo.By("checking eviction ordering and ensuring important pods don't fail")
   712  	done := true
   713  	pendingPods := []string{}
   714  	for _, priorityPodSpec := range testSpecs {
   715  		var priorityPod v1.Pod
   716  		for _, p := range updatedPods {
   717  			if p.Name == priorityPodSpec.pod.Name {
   718  				priorityPod = p
   719  			}
   720  		}
   721  		gomega.Expect(priorityPod).NotTo(gomega.BeNil())
   722  		gomega.Expect(priorityPod.Status.Phase).ToNot(gomega.Equal(v1.PodSucceeded),
   723  			fmt.Sprintf("pod: %s succeeded unexpectedly", priorityPod.Name))
   724  
   725  		// Check eviction ordering.
   726  		// Note: it is alright for a priority 1 and priority 2 pod (for example) to fail in the same round,
   727  		// but never alright for a priority 1 pod to fail while the priority 2 pod is still running
   728  		for _, lowPriorityPodSpec := range testSpecs {
   729  			var lowPriorityPod v1.Pod
   730  			for _, p := range updatedPods {
   731  				if p.Name == lowPriorityPodSpec.pod.Name {
   732  					lowPriorityPod = p
   733  				}
   734  			}
   735  			gomega.Expect(lowPriorityPod).NotTo(gomega.BeNil())
   736  			if priorityPodSpec.evictionPriority < lowPriorityPodSpec.evictionPriority && lowPriorityPod.Status.Phase == v1.PodRunning {
   737  				gomega.Expect(priorityPod.Status.Phase).ToNot(gomega.Equal(v1.PodFailed),
   738  					fmt.Sprintf("priority %d pod: %s failed before priority %d pod: %s",
   739  						priorityPodSpec.evictionPriority, priorityPodSpec.pod.Name, lowPriorityPodSpec.evictionPriority, lowPriorityPodSpec.pod.Name))
   740  			}
   741  		}
   742  
   743  		if priorityPod.Status.Phase == v1.PodFailed {
   744  			gomega.Expect(priorityPod.Status.Reason).To(gomega.Equal(eviction.Reason), "pod %s failed; expected Status.Reason to be %s, but got %s",
   745  				priorityPod.Name, eviction.Reason, priorityPod.Status.Reason)
   746  		}
   747  
   748  		// EvictionPriority 0 pods should not fail
   749  		if priorityPodSpec.evictionPriority == 0 {
   750  			gomega.Expect(priorityPod.Status.Phase).ToNot(gomega.Equal(v1.PodFailed),
   751  				fmt.Sprintf("priority 0 pod: %s failed", priorityPod.Name))
   752  		}
   753  
   754  		// If a pod that is not evictionPriority 0 has not been evicted, we are not done
   755  		if priorityPodSpec.evictionPriority != 0 && priorityPod.Status.Phase != v1.PodFailed {
   756  			pendingPods = append(pendingPods, priorityPod.ObjectMeta.Name)
   757  			done = false
   758  		}
   759  	}
   760  	if done {
   761  		return nil
   762  	}
   763  	return fmt.Errorf("pods that should be evicted are still running: %#v", pendingPods)
   764  }
   765  
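         // verifyPodConditions checks that every pod spec that declares a wantPodDisruptionCondition
         // actually carries that condition in its pod status.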
   766  func verifyPodConditions(ctx context.Context, f *framework.Framework, testSpecs []podEvictSpec) {
   767  	for _, spec := range testSpecs {
   768  		if spec.wantPodDisruptionCondition != nil {
   769  			pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, spec.pod.Name, metav1.GetOptions{})
   770  			framework.ExpectNoError(err, "Failed to get the recent pod object for name: %q", pod.Name)
   771  
   772  			cType := *spec.wantPodDisruptionCondition
   773  			podDisruptionCondition := e2epod.FindPodConditionByType(&pod.Status, cType)
   774  			if podDisruptionCondition == nil {
   775  				framework.Failf("pod %q should have the condition: %q, pod status: %v", pod.Name, cType, pod.Status)
   776  			}
   777  		}
   778  	}
   779  }
   780  
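         // verifyEvictionEvents checks that each pod expected to be evicted produced exactly one eviction
         // event, annotated with the starved resource and, for memory evictions, with the offending
         // container and its usage (via the eviction.StarvedResourceKey, OffendingContainersKey and
         // OffendingContainersUsageKey annotation keys).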
   781  func verifyEvictionEvents(ctx context.Context, f *framework.Framework, testSpecs []podEvictSpec, expectedStarvedResource v1.ResourceName) {
   782  	for _, spec := range testSpecs {
   783  		pod := spec.pod
   784  		if spec.evictionPriority != 0 {
   785  			selector := fields.Set{
   786  				"involvedObject.kind":      "Pod",
   787  				"involvedObject.name":      pod.Name,
   788  				"involvedObject.namespace": f.Namespace.Name,
   789  				"reason":                   eviction.Reason,
   790  			}.AsSelector().String()
   791  			podEvictEvents, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{FieldSelector: selector})
   792  			framework.ExpectNoError(err, "getting events")
   793  			gomega.Expect(podEvictEvents.Items).To(gomega.HaveLen(1), "Expected to find 1 eviction event for pod %s, got %d", pod.Name, len(podEvictEvents.Items))
   794  			event := podEvictEvents.Items[0]
   795  
   796  			if expectedStarvedResource != noStarvedResource {
   797  				// Check the eviction.StarvedResourceKey
   798  				starved, found := event.Annotations[eviction.StarvedResourceKey]
   799  				if !found {
   800  					framework.Failf("Expected to find an annotation on the eviction event for pod %s containing the starved resource %s, but it was not found",
   801  						pod.Name, expectedStarvedResource)
   802  				}
   803  				starvedResource := v1.ResourceName(starved)
   804  				gomega.Expect(starvedResource).To(gomega.Equal(expectedStarvedResource), "Expected to the starved_resource annotation on pod %s to contain %s, but got %s instead",
   805  					pod.Name, expectedStarvedResource, starvedResource)
   806  
   807  				// We only check these keys for memory, because ephemeral storage evictions may be due to volume usage, in which case these values are not present
   808  				if expectedStarvedResource == v1.ResourceMemory {
   809  					// Check the eviction.OffendingContainersKey
   810  					offendersString, found := event.Annotations[eviction.OffendingContainersKey]
   811  					if !found {
   812  						framework.Failf("Expected to find an annotation on the eviction event for pod %s containing the offending containers, but it was not found",
   813  							pod.Name)
   814  					}
   815  					offendingContainers := strings.Split(offendersString, ",")
   816  					gomega.Expect(offendingContainers).To(gomega.HaveLen(1), "Expected to find the offending container's usage in the %s annotation, but no container was found",
   817  						eviction.OffendingContainersKey)
   818  					gomega.Expect(offendingContainers[0]).To(gomega.Equal(pod.Spec.Containers[0].Name), "Expected to find the offending container: %s's usage in the %s annotation, but found %s instead",
   819  						pod.Spec.Containers[0].Name, eviction.OffendingContainersKey, offendingContainers[0])
   820  
   821  					// Check the eviction.OffendingContainersUsageKey
   822  					offendingUsageString, found := event.Annotations[eviction.OffendingContainersUsageKey]
   823  					if !found {
   824  						framework.Failf("Expected to find an annotation on the eviction event for pod %s containing the offending containers' usage, but it was not found",
   825  							pod.Name)
   826  					}
   827  					offendingContainersUsage := strings.Split(offendingUsageString, ",")
   828  					gomega.Expect(offendingContainersUsage).To(gomega.HaveLen(1), "Expected to find the offending container's usage in the %s annotation, but found %+v",
   829  						eviction.OffendingContainersUsageKey, offendingContainersUsage)
   830  					usageQuantity, err := resource.ParseQuantity(offendingContainersUsage[0])
   831  					framework.ExpectNoError(err, "parsing pod %s's %s annotation as a quantity", pod.Name, eviction.OffendingContainersUsageKey)
   832  					request := pod.Spec.Containers[0].Resources.Requests[starvedResource]
   833  					gomega.Expect(usageQuantity.Cmp(request)).To(gomega.Equal(1), "Expected usage of offending container: %s in pod %s to exceed its request %s",
   834  						usageQuantity.String(), pod.Name, request.String())
   835  				}
   836  			}
   837  		}
   838  	}
   839  }
   840  
    841  // hasNodeCondition returns true if the node has the given node condition, false otherwise.
   842  func hasNodeCondition(ctx context.Context, f *framework.Framework, expectedNodeCondition v1.NodeConditionType) bool {
   843  	localNodeStatus := getLocalNode(ctx, f).Status
   844  	_, actualNodeCondition := testutils.GetNodeCondition(&localNodeStatus, expectedNodeCondition)
   845  	gomega.Expect(actualNodeCondition).NotTo(gomega.BeNil())
   846  	return actualNodeCondition.Status == v1.ConditionTrue
   847  }
   848  
   849  func logInodeMetrics(ctx context.Context) {
   850  	summary, err := getNodeSummary(ctx)
   851  	if err != nil {
   852  		framework.Logf("Error getting summary: %v", err)
   853  		return
   854  	}
   855  	if summary.Node.Runtime != nil && summary.Node.Runtime.ImageFs != nil && summary.Node.Runtime.ImageFs.Inodes != nil && summary.Node.Runtime.ImageFs.InodesFree != nil {
   856  		framework.Logf("imageFsInfo.Inodes: %d, imageFsInfo.InodesFree: %d", *summary.Node.Runtime.ImageFs.Inodes, *summary.Node.Runtime.ImageFs.InodesFree)
   857  	}
   858  	if summary.Node.Fs != nil && summary.Node.Fs.Inodes != nil && summary.Node.Fs.InodesFree != nil {
   859  		framework.Logf("rootFsInfo.Inodes: %d, rootFsInfo.InodesFree: %d", *summary.Node.Fs.Inodes, *summary.Node.Fs.InodesFree)
   860  	}
   861  	for _, pod := range summary.Pods {
   862  		framework.Logf("Pod: %s", pod.PodRef.Name)
   863  		for _, container := range pod.Containers {
   864  			if container.Rootfs != nil && container.Rootfs.InodesUsed != nil {
   865  				framework.Logf("--- summary Container: %s inodeUsage: %d", container.Name, *container.Rootfs.InodesUsed)
   866  			}
   867  		}
   868  		for _, volume := range pod.VolumeStats {
   869  			if volume.FsStats.InodesUsed != nil {
   870  				framework.Logf("--- summary Volume: %s inodeUsage: %d", volume.Name, *volume.FsStats.InodesUsed)
   871  			}
   872  		}
   873  	}
   874  }
   875  
   876  func logDiskMetrics(ctx context.Context) {
   877  	summary, err := getNodeSummary(ctx)
   878  	if err != nil {
   879  		framework.Logf("Error getting summary: %v", err)
   880  		return
   881  	}
   882  	if summary.Node.Runtime != nil && summary.Node.Runtime.ImageFs != nil && summary.Node.Runtime.ImageFs.CapacityBytes != nil && summary.Node.Runtime.ImageFs.AvailableBytes != nil {
   883  		framework.Logf("imageFsInfo.CapacityBytes: %d, imageFsInfo.AvailableBytes: %d", *summary.Node.Runtime.ImageFs.CapacityBytes, *summary.Node.Runtime.ImageFs.AvailableBytes)
   884  	}
   885  	if summary.Node.Fs != nil && summary.Node.Fs.CapacityBytes != nil && summary.Node.Fs.AvailableBytes != nil {
   886  		framework.Logf("rootFsInfo.CapacityBytes: %d, rootFsInfo.AvailableBytes: %d", *summary.Node.Fs.CapacityBytes, *summary.Node.Fs.AvailableBytes)
   887  	}
   888  	for _, pod := range summary.Pods {
   889  		framework.Logf("Pod: %s", pod.PodRef.Name)
   890  		for _, container := range pod.Containers {
   891  			if container.Rootfs != nil && container.Rootfs.UsedBytes != nil {
   892  				framework.Logf("--- summary Container: %s UsedBytes: %d", container.Name, *container.Rootfs.UsedBytes)
   893  			}
   894  		}
   895  		for _, volume := range pod.VolumeStats {
    896  			if volume.FsStats.UsedBytes != nil {
   897  				framework.Logf("--- summary Volume: %s UsedBytes: %d", volume.Name, *volume.FsStats.UsedBytes)
   898  			}
   899  		}
   900  	}
   901  }
   902  
   903  func logMemoryMetrics(ctx context.Context) {
   904  	summary, err := getNodeSummary(ctx)
   905  	if err != nil {
   906  		framework.Logf("Error getting summary: %v", err)
   907  		return
   908  	}
   909  	if summary.Node.Memory != nil && summary.Node.Memory.WorkingSetBytes != nil && summary.Node.Memory.AvailableBytes != nil {
   910  		framework.Logf("Node.Memory.WorkingSetBytes: %d, Node.Memory.AvailableBytes: %d", *summary.Node.Memory.WorkingSetBytes, *summary.Node.Memory.AvailableBytes)
   911  	}
   912  	for _, sysContainer := range summary.Node.SystemContainers {
   913  		if sysContainer.Name == kubeletstatsv1alpha1.SystemContainerPods && sysContainer.Memory != nil && sysContainer.Memory.WorkingSetBytes != nil && sysContainer.Memory.AvailableBytes != nil {
   914  			framework.Logf("Allocatable.Memory.WorkingSetBytes: %d, Allocatable.Memory.AvailableBytes: %d", *sysContainer.Memory.WorkingSetBytes, *sysContainer.Memory.AvailableBytes)
   915  		}
   916  	}
   917  	for _, pod := range summary.Pods {
   918  		framework.Logf("Pod: %s", pod.PodRef.Name)
   919  		for _, container := range pod.Containers {
   920  			if container.Memory != nil && container.Memory.WorkingSetBytes != nil {
   921  				framework.Logf("--- summary Container: %s WorkingSetBytes: %d", container.Name, *container.Memory.WorkingSetBytes)
   922  			}
   923  		}
   924  	}
   925  }
   926  
   927  func logPidMetrics(ctx context.Context) {
   928  	summary, err := getNodeSummary(ctx)
   929  	if err != nil {
   930  		framework.Logf("Error getting summary: %v", err)
   931  		return
   932  	}
   933  	if summary.Node.Rlimit != nil && summary.Node.Rlimit.MaxPID != nil && summary.Node.Rlimit.NumOfRunningProcesses != nil {
   934  		framework.Logf("Node.Rlimit.MaxPID: %d, Node.Rlimit.RunningProcesses: %d", *summary.Node.Rlimit.MaxPID, *summary.Node.Rlimit.NumOfRunningProcesses)
   935  	}
   936  }
   937  
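         // eventuallyGetSummary polls the node stats summary until the node filesystem stats needed by
         // the eviction tests (inodes free and available bytes) are populated.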
   938  func eventuallyGetSummary(ctx context.Context) (s *kubeletstatsv1alpha1.Summary) {
   939  	gomega.Eventually(ctx, func() error {
   940  		summary, err := getNodeSummary(ctx)
   941  		if err != nil {
   942  			return err
   943  		}
   944  		if summary == nil || summary.Node.Fs == nil || summary.Node.Fs.InodesFree == nil || summary.Node.Fs.AvailableBytes == nil {
   945  			return fmt.Errorf("some part of data is nil")
   946  		}
   947  		s = summary
   948  		return nil
   949  	}, time.Minute, evictionPollInterval).Should(gomega.BeNil())
   950  	return
   951  }
   952  
    953  // innocentPod returns a pod that does not use any resources.
   954  func innocentPod() *v1.Pod {
    955  	// Due to https://github.com/kubernetes/kubernetes/issues/115819,
    956  	// a grace period of 0 used to fall back to the default setting (30 seconds) when evictionHard is used.
    957  	// Using a grace period of 1 helps with flakiness, since SIGTERM is sent right away.
   958  	var gracePeriod int64 = 1
   959  	return &v1.Pod{
   960  		ObjectMeta: metav1.ObjectMeta{Name: "innocent-pod"},
   961  		Spec: v1.PodSpec{
   962  			RestartPolicy:                 v1.RestartPolicyNever,
   963  			TerminationGracePeriodSeconds: &gracePeriod,
   964  			Containers: []v1.Container{
   965  				{
   966  					Image: busyboxImage,
   967  					Name:  "innocent-container",
   968  					Command: []string{
   969  						"sh",
   970  						"-c",
   971  						"while true; do sleep 5; done",
   972  					},
   973  				},
   974  			},
   975  		},
   976  	}
   977  }
   978  
   979  const (
   980  	volumeMountPath = "/test-mnt"
   981  	volumeName      = "test-volume"
   982  )
   983  
   984  func inodeConsumingPod(name string, numFiles int, volumeSource *v1.VolumeSource) *v1.Pod {
   985  	path := ""
   986  	if volumeSource != nil {
   987  		path = volumeMountPath
   988  	}
   989  	// Each iteration creates an empty file
   990  	return podWithCommand(volumeSource, v1.ResourceRequirements{}, numFiles, name, fmt.Sprintf("touch %s${i}.txt; sleep 0.001;", filepath.Join(path, "file")))
   991  }
   992  
   993  func diskConsumingPod(name string, diskConsumedMB int, volumeSource *v1.VolumeSource, resources v1.ResourceRequirements) *v1.Pod {
   994  	path := ""
   995  	if volumeSource != nil {
   996  		path = volumeMountPath
   997  	}
    998  	// Each iteration writes 1 MiB, so do diskConsumedMB iterations.
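         	// For example (an illustrative invocation), diskConsumingPod("disk-hog", 3, nil, v1.ResourceRequirements{}) yields a container command of roughly:
         	//	i=0; while [ $i -lt 3 ]; do dd if=/dev/urandom of=file${i} bs=1048576 count=1 2>/dev/null; sleep .1; i=$(($i+1)); done; while true; do sleep 5; done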
   999  	return podWithCommand(volumeSource, resources, diskConsumedMB, name, fmt.Sprintf("dd if=/dev/urandom of=%s${i} bs=1048576 count=1 2>/dev/null; sleep .1;", filepath.Join(path, "file")))
  1000  }
  1001  
  1002  func pidConsumingPod(name string, numProcesses int) *v1.Pod {
  1003  	// Each iteration forks once, but creates two processes
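         	// numProcesses/2 iterations therefore yield roughly numProcesses processes
         	// (each background subshell plus its current sleep child).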
  1004  	return podWithCommand(nil, v1.ResourceRequirements{}, numProcesses/2, name, "(while true; do /bin/sleep 5; done)&")
  1005  }
  1006  
   1007  // podWithCommand returns a pod that runs the given command for the given number of iterations, with the provided volumeSource and resourceRequirements.
  1008  func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirements, iterations int, name, command string) *v1.Pod {
   1009  	// Due to https://github.com/kubernetes/kubernetes/issues/115819,
   1010  	// a grace period of 0 used to fall back to the default setting (30 seconds) when evictionHard is used.
   1011  	// Using a grace period of 1 helps with flakiness, since SIGTERM is sent right away.
  1012  	var gracePeriod int64 = 1
  1013  	volumeMounts := []v1.VolumeMount{}
  1014  	volumes := []v1.Volume{}
  1015  	if volumeSource != nil {
  1016  		volumeMounts = []v1.VolumeMount{{MountPath: volumeMountPath, Name: volumeName}}
  1017  		volumes = []v1.Volume{{Name: volumeName, VolumeSource: *volumeSource}}
  1018  	}
  1019  	return &v1.Pod{
  1020  		ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-pod", name)},
  1021  		Spec: v1.PodSpec{
  1022  			RestartPolicy:                 v1.RestartPolicyNever,
  1023  			TerminationGracePeriodSeconds: &gracePeriod,
  1024  			Containers: []v1.Container{
  1025  				{
  1026  					Image: busyboxImage,
  1027  					Name:  fmt.Sprintf("%s-container", name),
  1028  					Command: []string{
  1029  						"sh",
  1030  						"-c",
  1031  						fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s i=$(($i+1)); done; while true; do sleep 5; done", iterations, command),
  1032  					},
  1033  					Resources:    resources,
  1034  					VolumeMounts: volumeMounts,
  1035  				},
  1036  			},
  1037  			Volumes: volumes,
  1038  		},
  1039  	}
  1040  }
  1041  
  1042  func getMemhogPod(podName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
   1043  	// Due to https://github.com/kubernetes/kubernetes/issues/115819,
   1044  	// a grace period of 0 used to fall back to the default setting (30 seconds) when evictionHard is used.
   1045  	// Using a grace period of 1 helps with flakiness, since SIGTERM is sent right away.
  1046  	var gracePeriod int64 = 1
  1047  	env := []v1.EnvVar{
  1048  		{
  1049  			Name: "MEMORY_LIMIT",
  1050  			ValueFrom: &v1.EnvVarSource{
  1051  				ResourceFieldRef: &v1.ResourceFieldSelector{
  1052  					Resource: "limits.memory",
  1053  				},
  1054  			},
  1055  		},
  1056  	}
  1057  
   1058  	// If there is a memory limit specified, pass 80% of it for --mem-total; otherwise use the downward API
   1059  	// to pass limits.memory, which will be the total memory available.
   1060  	// This helps prevent a guaranteed pod from triggering an OOM kill due to its low memory limit,
   1061  	// which would cause the test to fail inappropriately.
  1062  	var memLimit string
  1063  	if limit, ok := res.Limits[v1.ResourceMemory]; ok {
  1064  		memLimit = strconv.Itoa(int(
  1065  			float64(limit.Value()) * 0.8))
  1066  	} else {
  1067  		memLimit = "$(MEMORY_LIMIT)"
  1068  	}
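         	// For example (illustrative), a 300Mi limit yields --mem-total 251658240 (80% of 314572800 bytes);
         	// without a limit, MEMORY_LIMIT resolves via the downward API, which for an unlimited container
         	// reports the node's allocatable memory.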
  1069  
  1070  	return &v1.Pod{
  1071  		ObjectMeta: metav1.ObjectMeta{
  1072  			Name: podName,
  1073  		},
  1074  		Spec: v1.PodSpec{
  1075  			RestartPolicy:                 v1.RestartPolicyNever,
  1076  			TerminationGracePeriodSeconds: &gracePeriod,
  1077  			Containers: []v1.Container{
  1078  				{
  1079  					Name:            ctnName,
  1080  					Image:           imageutils.GetE2EImage(imageutils.Agnhost),
  1081  					ImagePullPolicy: "Always",
  1082  					Env:             env,
   1083  					// 60 min timeout * 60 s / 10 s per tick = 360 ticks before timeout => ~11.11Mi/tick
   1084  					// needed to fill ~4Gi of memory, so use 12Mi/tick as an initial ballpark.
   1085  					// We might see flakes due to timeout if the total memory on the nodes increases.
  1086  					Args:      []string{"stress", "--mem-alloc-size", "12Mi", "--mem-alloc-sleep", "10s", "--mem-total", memLimit},
  1087  					Resources: res,
  1088  				},
  1089  			},
  1090  		},
  1091  	}
  1092  }