k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/eviction/eviction_manager.go

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package eviction
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sort"
    23  	"sync"
    24  	"time"
    25  
    26  	"k8s.io/klog/v2"
    27  
    28  	v1 "k8s.io/api/core/v1"
    29  	"k8s.io/apimachinery/pkg/api/resource"
    30  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    31  	"k8s.io/client-go/tools/record"
    32  	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
    33  	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
    34  	"k8s.io/utils/clock"
    35  
    36  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    37  	resourcehelper "k8s.io/kubernetes/pkg/api/v1/resource"
    38  	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    39  	"k8s.io/kubernetes/pkg/features"
    40  	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
    41  	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
    42  	"k8s.io/kubernetes/pkg/kubelet/metrics"
    43  	"k8s.io/kubernetes/pkg/kubelet/server/stats"
    44  	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
    45  )
    46  
    47  const (
    48  	podCleanupTimeout  = 30 * time.Second
    49  	podCleanupPollFreq = time.Second
    50  )
    51  
    52  const (
    53  	// signalEphemeralContainerFsLimit is the signal used when a container exceeds its ephemeral storage limit
    54  	signalEphemeralContainerFsLimit string = "ephemeralcontainerfs.limit"
    55  	// signalEphemeralPodFsLimit is the signal used when a pod exceeds its ephemeral storage limit
    56  	signalEphemeralPodFsLimit string = "ephemeralpodfs.limit"
    57  	// signalEmptyDirFsLimit is the signal used when an emptyDir volume exceeds its size limit
    58  	signalEmptyDirFsLimit string = "emptydirfs.limit"
    59  	// immediateEvictionGracePeriodSeconds is how long we give pods to shut down when we
    60  	// need to evict them quickly due to resource pressure
    61  	immediateEvictionGracePeriodSeconds = 1
    62  )
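        // These signal names are not configurable eviction signals; they are used only as
        // labels on the evictions metric when a pod is evicted for exceeding a local-storage
        // limit, and immediateEvictionGracePeriodSeconds is the grace period applied to those
        // evictions and to hard-threshold evictions.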
    63  
    64  // managerImpl implements Manager
    65  type managerImpl struct {
    66  	// clock is used to track time
    67  	clock clock.WithTicker
    68  	// config is how the manager is configured
    69  	config Config
    70  	// the function to invoke to kill a pod
    71  	killPodFunc KillPodFunc
    72  	// the interface that knows how to do image gc
    73  	imageGC ImageGC
    74  	// the interface that knows how to do container gc
    75  	containerGC ContainerGC
    76  	// protects access to internal state
    77  	sync.RWMutex
    78  	// node conditions are the set of conditions present
    79  	nodeConditions []v1.NodeConditionType
    80  	// captures when a node condition was last observed based on a threshold being met
    81  	nodeConditionsLastObservedAt nodeConditionsObservedAt
    82  	// nodeRef is a reference to the node
    83  	nodeRef *v1.ObjectReference
    84  	// used to record events about the node
    85  	recorder record.EventRecorder
    86  	// used to measure usage stats on the system
    87  	summaryProvider stats.SummaryProvider
    88  	// records when a threshold was first observed
    89  	thresholdsFirstObservedAt thresholdsObservedAt
    90  	// records the set of thresholds that have been met (including grace period) but not yet resolved
    91  	thresholdsMet []evictionapi.Threshold
    92  	// signalToRankFunc maps a signal to the ranking function for that resource.
    93  	signalToRankFunc map[evictionapi.Signal]rankFunc
    94  	// signalToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
    95  	signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs
    96  	// last observations from synchronize
    97  	lastObservations signalObservations
    98  	// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
    99  	dedicatedImageFs *bool
   100  	// splitContainerImageFs indicates if containerfs is on a separate device from imagefs
   101  	splitContainerImageFs *bool
   102  	// thresholdNotifiers is a list of memory threshold notifiers which each notify for a memory eviction threshold
   103  	thresholdNotifiers []ThresholdNotifier
   104  	// thresholdsLastUpdated is the last time the thresholdNotifiers were updated.
   105  	thresholdsLastUpdated time.Time
   106  	// whether local storage capacity isolation is supported
   107  	localStorageCapacityIsolation bool
   108  }
   109  
   110  // ensure it implements the required interface
   111  var _ Manager = &managerImpl{}
   112  
   113  // NewManager returns a configured Manager and an associated admission handler to enforce eviction configuration.
   114  func NewManager(
   115  	summaryProvider stats.SummaryProvider,
   116  	config Config,
   117  	killPodFunc KillPodFunc,
   118  	imageGC ImageGC,
   119  	containerGC ContainerGC,
   120  	recorder record.EventRecorder,
   121  	nodeRef *v1.ObjectReference,
   122  	clock clock.WithTicker,
   123  	localStorageCapacityIsolation bool,
   124  ) (Manager, lifecycle.PodAdmitHandler) {
   125  	manager := &managerImpl{
   126  		clock:                         clock,
   127  		killPodFunc:                   killPodFunc,
   128  		imageGC:                       imageGC,
   129  		containerGC:                   containerGC,
   130  		config:                        config,
   131  		recorder:                      recorder,
   132  		summaryProvider:               summaryProvider,
   133  		nodeRef:                       nodeRef,
   134  		nodeConditionsLastObservedAt:  nodeConditionsObservedAt{},
   135  		thresholdsFirstObservedAt:     thresholdsObservedAt{},
   136  		dedicatedImageFs:              nil,
   137  		splitContainerImageFs:         nil,
   138  		thresholdNotifiers:            []ThresholdNotifier{},
   139  		localStorageCapacityIsolation: localStorageCapacityIsolation,
   140  	}
   141  	return manager, manager
   142  }
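        // managerImpl intentionally backs both return values: it satisfies Manager and
        // lifecycle.PodAdmitHandler. As a rough sketch (not upstream wiring code), a caller
        // is expected to register and start it along the lines of:
        //
        //	evictionManager, evictionAdmitHandler := eviction.NewManager(summaryProvider, config,
        //		killPodFunc, imageGC, containerGC, recorder, nodeRef, clock, localStorageCapacityIsolation)
        //	admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)
        //	evictionManager.Start(diskInfoProvider, activePodsFunc, podCleanedUpFunc, monitoringInterval)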
   143  
   144  // Admit rejects a pod if it is not safe to admit for node stability.
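        // Under memory pressure alone, Guaranteed and Burstable pods are admitted, and
        // BestEffort pods are admitted only if they tolerate the memory-pressure NoSchedule
        // taint; any other pressure condition rejects all non-critical pods.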
   145  func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
   146  	m.RLock()
   147  	defer m.RUnlock()
   148  	if len(m.nodeConditions) == 0 {
   149  		return lifecycle.PodAdmitResult{Admit: true}
   150  	}
   151  	// Admit Critical pods even under resource pressure since they are required for system stability.
   152  	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
   153  	if kubelettypes.IsCriticalPod(attrs.Pod) {
   154  		return lifecycle.PodAdmitResult{Admit: true}
   155  	}
   156  
   157  	// Any node condition other than memory pressure rejects all pods
   158  	nodeOnlyHasMemoryPressureCondition := hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) && len(m.nodeConditions) == 1
   159  	if nodeOnlyHasMemoryPressureCondition {
   160  		notBestEffort := v1.PodQOSBestEffort != v1qos.GetPodQOS(attrs.Pod)
   161  		if notBestEffort {
   162  			return lifecycle.PodAdmitResult{Admit: true}
   163  		}
   164  
   165  		// When the node has memory pressure, check the BestEffort pod's tolerations:
   166  		// admit it if it tolerates the memory-pressure taint; otherwise it falls through and is rejected.
   167  		if corev1helpers.TolerationsTolerateTaint(attrs.Pod.Spec.Tolerations, &v1.Taint{
   168  			Key:    v1.TaintNodeMemoryPressure,
   169  			Effect: v1.TaintEffectNoSchedule,
   170  		}) {
   171  			return lifecycle.PodAdmitResult{Admit: true}
   172  		}
   173  	}
   174  
   175  	// reject pods when under memory pressure (if the pod is BestEffort and does not tolerate it), or when under any other pressure condition.
   176  	klog.InfoS("Failed to admit pod to node", "pod", klog.KObj(attrs.Pod), "nodeCondition", m.nodeConditions)
   177  	return lifecycle.PodAdmitResult{
   178  		Admit:   false,
   179  		Reason:  Reason,
   180  		Message: fmt.Sprintf(nodeConditionMessageFmt, m.nodeConditions),
   181  	}
   182  }
   183  
   184  // Start starts the control loop to observe and respond to low compute resources.
   185  func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
   186  	thresholdHandler := func(message string) {
   187  		klog.InfoS(message)
   188  		m.synchronize(diskInfoProvider, podFunc)
   189  	}
   190  	if m.config.KernelMemcgNotification {
   191  		for _, threshold := range m.config.Thresholds {
   192  			if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable {
   193  				notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler)
   194  				if err != nil {
   195  					klog.InfoS("Eviction manager: failed to create memory threshold notifier", "err", err)
   196  				} else {
   197  					go notifier.Start()
   198  					m.thresholdNotifiers = append(m.thresholdNotifiers, notifier)
   199  				}
   200  			}
   201  		}
   202  	}
   203  	// start the eviction manager monitoring
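        	// The loop below sleeps for monitoringInterval only when nothing was evicted (or on
        	// error); after a successful eviction it waits for the evicted pods to be cleaned up
        	// and then synchronizes again immediately.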
   204  	go func() {
   205  		for {
   206  			evictedPods, err := m.synchronize(diskInfoProvider, podFunc)
   207  			if evictedPods != nil && err == nil {
   208  				klog.InfoS("Eviction manager: pods evicted, waiting for pod to be cleaned up", "pods", klog.KObjSlice(evictedPods))
   209  				m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
   210  			} else {
   211  				if err != nil {
   212  					klog.ErrorS(err, "Eviction manager: failed to synchronize")
   213  				}
   214  				time.Sleep(monitoringInterval)
   215  			}
   216  		}
   217  	}()
   218  }
   219  
   220  // IsUnderMemoryPressure returns true if the node is under memory pressure.
   221  func (m *managerImpl) IsUnderMemoryPressure() bool {
   222  	m.RLock()
   223  	defer m.RUnlock()
   224  	return hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure)
   225  }
   226  
   227  // IsUnderDiskPressure returns true if the node is under disk pressure.
   228  func (m *managerImpl) IsUnderDiskPressure() bool {
   229  	m.RLock()
   230  	defer m.RUnlock()
   231  	return hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure)
   232  }
   233  
   234  // IsUnderPIDPressure returns true if the node is under PID pressure.
   235  func (m *managerImpl) IsUnderPIDPressure() bool {
   236  	m.RLock()
   237  	defer m.RUnlock()
   238  	return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure)
   239  }
   240  
   241  // synchronize is the main control loop that enforces eviction thresholds.
   242  // Returns the pods that were evicted, or nil if no pods were evicted.
   243  func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) ([]*v1.Pod, error) {
   244  	ctx := context.Background()
   245  	// if we have nothing to do, just return
   246  	thresholds := m.config.Thresholds
   247  	if len(thresholds) == 0 && !m.localStorageCapacityIsolation {
   248  		return nil, nil
   249  	}
   250  
   251  	klog.V(3).InfoS("Eviction manager: synchronize housekeeping")
   252  	// build the ranking functions (if not yet known)
   253  	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
   254  	if m.dedicatedImageFs == nil {
   255  		hasImageFs, splitDiskError := diskInfoProvider.HasDedicatedImageFs(ctx)
   256  		if splitDiskError != nil {
   257  			klog.ErrorS(splitDiskError, "Eviction manager: failed to get HasDedicatedImageFs")
   258  			return nil, fmt.Errorf("eviction manager: failed to get HasDedicatedImageFs: %v", splitDiskError)
   259  		}
   260  		m.dedicatedImageFs = &hasImageFs
   261  		splitContainerImageFs := m.containerGC.IsContainerFsSeparateFromImageFs(ctx)
   262  
   263  		// If the container filesystem is split from the image filesystem but the
   264  		// KubeletSeparateDiskGC feature gate is disabled, return an error:
   265  		// this is an unsupported configuration.
   266  		if !utilfeature.DefaultFeatureGate.Enabled(features.KubeletSeparateDiskGC) && splitContainerImageFs {
   267  			splitDiskError := fmt.Errorf("KubeletSeparateDiskGC is turned off but we still have a split filesystem")
   268  			return nil, splitDiskError
   269  		}
   270  		thresholds, err := UpdateContainerFsThresholds(m.config.Thresholds, hasImageFs, splitContainerImageFs)
   271  		m.config.Thresholds = thresholds
   272  		if err != nil {
   273  			klog.ErrorS(err, "eviction manager: found conflicting containerfs eviction. Ignoring.")
   274  		}
   275  		m.splitContainerImageFs = &splitContainerImageFs
   276  		m.signalToRankFunc = buildSignalToRankFunc(hasImageFs, splitContainerImageFs)
   277  		m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs, splitContainerImageFs)
   278  	}
   279  
   280  	klog.V(3).InfoS("FileSystem detection", "DedicatedImageFs", m.dedicatedImageFs, "SplitImageFs", m.splitContainerImageFs)
   281  	activePods := podFunc()
   282  	updateStats := true
   283  	summary, err := m.summaryProvider.Get(ctx, updateStats)
   284  	if err != nil {
   285  		klog.ErrorS(err, "Eviction manager: failed to get summary stats")
   286  		return nil, nil
   287  	}
   288  
   289  	if m.clock.Since(m.thresholdsLastUpdated) > notifierRefreshInterval {
   290  		m.thresholdsLastUpdated = m.clock.Now()
   291  		for _, notifier := range m.thresholdNotifiers {
   292  			if err := notifier.UpdateThreshold(summary); err != nil {
   293  				klog.InfoS("Eviction manager: failed to update notifier", "notifier", notifier.Description(), "err", err)
   294  			}
   295  		}
   296  	}
   297  
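        	// From here, synchronize is a pipeline: observe signals, compute the thresholds that
        	// are currently met, merge in previously met thresholds that have not yet satisfied
        	// min-reclaim, derive node conditions (honoring the pressure transition period), drop
        	// thresholds whose grace period has not elapsed or whose stats are stale, and only
        	// then attempt node-level reclaim and, as a last resort, pod eviction.
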
   298  	// make observations and get a function to derive pod usage stats relative to those observations.
   299  	observations, statsFunc := makeSignalObservations(summary)
   300  	debugLogObservations("observations", observations)
   301  
   302  	// determine the set of thresholds met independent of grace period
   303  	thresholds = thresholdsMet(thresholds, observations, false)
   304  	debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)
   305  
   306  	// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
   307  	if len(m.thresholdsMet) > 0 {
   308  		thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
   309  		thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
   310  	}
   311  	debugLogThresholdsWithObservation("thresholds - reclaim not satisfied", thresholds, observations)
   312  
   313  	// track when a threshold was first observed
   314  	now := m.clock.Now()
   315  	thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
   316  
   317  	// the set of node conditions that are triggered by currently observed thresholds
   318  	nodeConditions := nodeConditions(thresholds)
   319  	if len(nodeConditions) > 0 {
   320  		klog.V(3).InfoS("Eviction manager: node conditions - observed", "nodeCondition", nodeConditions)
   321  	}
   322  
   323  	// track when a node condition was last observed
   324  	nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)
   325  
   326  	// node conditions report true if they have been observed within the transition period window
   327  	nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
   328  	if len(nodeConditions) > 0 {
   329  		klog.V(3).InfoS("Eviction manager: node conditions - transition period not met", "nodeCondition", nodeConditions)
   330  	}
   331  
   332  	// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
   333  	thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
   334  	debugLogThresholdsWithObservation("thresholds - grace periods satisfied", thresholds, observations)
   335  
   336  	// update internal state
   337  	m.Lock()
   338  	m.nodeConditions = nodeConditions
   339  	m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
   340  	m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
   341  	m.thresholdsMet = thresholds
   342  
   343  	// determine the set of thresholds whose stats have been updated since the last sync
   344  	thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
   345  	debugLogThresholdsWithObservation("thresholds - updated stats", thresholds, observations)
   346  
   347  	m.lastObservations = observations
   348  	m.Unlock()
   349  
   350  	// evict pods if there is a resource usage violation from local volume temporary storage.
   351  	// If any pods were evicted by localStorageEviction, skip the rest of the eviction logic for this interval.
   352  	if m.localStorageCapacityIsolation {
   353  		if evictedPods := m.localStorageEviction(activePods, statsFunc); len(evictedPods) > 0 {
   354  			return evictedPods, nil
   355  		}
   356  	}
   357  
   358  	if len(thresholds) == 0 {
   359  		klog.V(3).InfoS("Eviction manager: no resources are starved")
   360  		return nil, nil
   361  	}
   362  
   363  	// rank the thresholds by eviction priority
   364  	sort.Sort(byEvictionPriority(thresholds))
   365  	thresholdToReclaim, resourceToReclaim, foundAny := getReclaimableThreshold(thresholds)
   366  	if !foundAny {
   367  		return nil, nil
   368  	}
   369  	klog.InfoS("Eviction manager: attempting to reclaim", "resourceName", resourceToReclaim)
   370  
   371  	// record an event about the resources we are now attempting to reclaim via eviction
   372  	m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
   373  
   374  	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
   375  	if m.reclaimNodeLevelResources(ctx, thresholdToReclaim.Signal, resourceToReclaim) {
   376  		klog.InfoS("Eviction manager: able to reduce resource pressure without evicting pods.", "resourceName", resourceToReclaim)
   377  		return nil, nil
   378  	}
   379  
   380  	klog.InfoS("Eviction manager: must evict pod(s) to reclaim", "resourceName", resourceToReclaim)
   381  
   382  	// rank the pods for eviction
   383  	rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
   384  	if !ok {
   385  		klog.ErrorS(nil, "Eviction manager: no ranking function for signal", "threshold", thresholdToReclaim.Signal)
   386  		return nil, nil
   387  	}
   388  
   389  	// the only viable eviction candidates are pods that have something running.
   390  	if len(activePods) == 0 {
   391  		klog.ErrorS(nil, "Eviction manager: eviction thresholds have been met, but no pods are active to evict")
   392  		return nil, nil
   393  	}
   394  
   395  	// rank the running pods for eviction for the specified resource
   396  	rank(activePods, statsFunc)
   397  
   398  	klog.InfoS("Eviction manager: pods ranked for eviction", "pods", klog.KObjSlice(activePods))
   399  
   400  	// record the age of metrics for met thresholds that we are using for evictions.
   401  	for _, t := range thresholds {
   402  		timeObserved := observations[t.Signal].time
   403  		if !timeObserved.IsZero() {
   404  			metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInSeconds(timeObserved.Time))
   405  		}
   406  	}
   407  
   408  	// we kill at most a single pod during each eviction interval
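        	// Hard-threshold evictions use the immediate (one second) grace period; soft-threshold
        	// evictions honor the configured MaxPodGracePeriodSeconds.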
   409  	for i := range activePods {
   410  		pod := activePods[i]
   411  		gracePeriodOverride := int64(immediateEvictionGracePeriodSeconds)
   412  		if !isHardEvictionThreshold(thresholdToReclaim) {
   413  			gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
   414  		}
   415  		message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc, thresholds, observations)
   416  		var condition *v1.PodCondition
   417  		if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
   418  			condition = &v1.PodCondition{
   419  				Type:    v1.DisruptionTarget,
   420  				Status:  v1.ConditionTrue,
   421  				Reason:  v1.PodReasonTerminationByKubelet,
   422  				Message: message,
   423  			}
   424  		}
   425  		if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) {
   426  			metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc()
   427  			return []*v1.Pod{pod}, nil
   428  		}
   429  	}
   430  	klog.InfoS("Eviction manager: unable to evict any pods from the node")
   431  	return nil, nil
   432  }
   433  
   434  func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
   435  	timeout := m.clock.NewTimer(podCleanupTimeout)
   436  	defer timeout.Stop()
   437  	ticker := m.clock.NewTicker(podCleanupPollFreq)
   438  	defer ticker.Stop()
   439  	for {
   440  		select {
   441  		case <-timeout.C():
   442  			klog.InfoS("Eviction manager: timed out waiting for pods to be cleaned up", "pods", klog.KObjSlice(pods))
   443  			return
   444  		case <-ticker.C():
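        			// On each tick, scan the evicted pods in order; stop at the first pod that has not
        			// been cleaned up yet, and return only once every pod in the list is gone.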
   445  			for i, pod := range pods {
   446  				if !podCleanedUpFunc(pod) {
   447  					break
   448  				}
   449  				if i == len(pods)-1 {
   450  					klog.InfoS("Eviction manager: pods successfully cleaned up", "pods", klog.KObjSlice(pods))
   451  					return
   452  				}
   453  			}
   454  		}
   455  	}
   456  }
   457  
   458  // reclaimNodeLevelResources attempts to reclaim node-level resources. It returns true if thresholds were satisfied and no pod eviction is required.
   459  func (m *managerImpl) reclaimNodeLevelResources(ctx context.Context, signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool {
   460  	nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim]
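        	// Reclaim functions run in the order they were registered for this signal (for
        	// disk-pressure signals this is typically container garbage collection followed by
        	// image garbage collection); errors are logged but do not abort the remaining attempts.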
   461  	for _, nodeReclaimFunc := range nodeReclaimFuncs {
   462  		// attempt to reclaim the pressured resource.
   463  		if err := nodeReclaimFunc(ctx); err != nil {
   464  			klog.InfoS("Eviction manager: unexpected error when attempting to reduce resource pressure", "resourceName", resourceToReclaim, "err", err)
   465  		}
   466  
   467  	}
   468  	if len(nodeReclaimFuncs) > 0 {
   469  		summary, err := m.summaryProvider.Get(ctx, true)
   470  		if err != nil {
   471  			klog.ErrorS(err, "Eviction manager: failed to get summary stats after resource reclaim")
   472  			return false
   473  		}
   474  
   475  		// make observations and get a function to derive pod usage stats relative to those observations.
   476  		observations, _ := makeSignalObservations(summary)
   477  		debugLogObservations("observations after resource reclaim", observations)
   478  
   479  		// evaluate all thresholds independently of their grace period to see if, with
   480  		// the new observations, we think we have met min-reclaim goals
   481  		thresholds := thresholdsMet(m.config.Thresholds, observations, true)
   482  		debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)
   483  
   484  		if len(thresholds) == 0 {
   485  			return true
   486  		}
   487  	}
   488  	return false
   489  }
   490  
   491  // localStorageEviction checks the emptyDir volume usage for each pod and determines whether it exceeds the specified limit and needs
   492  // to be evicted. It also checks every container in the pod; if a container's overlay usage exceeds the limit, the pod is evicted too.
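        // Unlike threshold-driven eviction, which evicts at most one pod per interval, this
        // check can evict multiple pods in a single pass.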
   493  func (m *managerImpl) localStorageEviction(pods []*v1.Pod, statsFunc statsFunc) []*v1.Pod {
   494  	evicted := []*v1.Pod{}
   495  	for _, pod := range pods {
   496  		podStats, ok := statsFunc(pod)
   497  		if !ok {
   498  			continue
   499  		}
   500  
   501  		if m.emptyDirLimitEviction(podStats, pod) {
   502  			evicted = append(evicted, pod)
   503  			continue
   504  		}
   505  
   506  		if m.podEphemeralStorageLimitEviction(podStats, pod) {
   507  			evicted = append(evicted, pod)
   508  			continue
   509  		}
   510  
   511  		if m.containerEphemeralStorageLimitEviction(podStats, pod) {
   512  			evicted = append(evicted, pod)
   513  		}
   514  	}
   515  
   516  	return evicted
   517  }
   518  
   519  func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
   520  	podVolumeUsed := make(map[string]*resource.Quantity)
   521  	for _, volume := range podStats.VolumeStats {
   522  		podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI)
   523  	}
   524  	for i := range pod.Spec.Volumes {
   525  		source := &pod.Spec.Volumes[i].VolumeSource
   526  		if source.EmptyDir != nil {
   527  			size := source.EmptyDir.SizeLimit
   528  			used := podVolumeUsed[pod.Spec.Volumes[i].Name]
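        			// A nil or non-positive SizeLimit means no emptyDir limit is enforced for this volume.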
   529  			if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
   530  				// the emptyDir usage exceeds the size limit, so evict the pod
   531  				if m.evictPod(pod, immediateEvictionGracePeriodSeconds, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
   532  					metrics.Evictions.WithLabelValues(signalEmptyDirFsLimit).Inc()
   533  					return true
   534  				}
   535  				return false
   536  			}
   537  		}
   538  	}
   539  
   540  	return false
   541  }
   542  
   543  func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
   544  	podLimits := resourcehelper.PodLimits(pod, resourcehelper.PodResourcesOptions{})
   545  	_, found := podLimits[v1.ResourceEphemeralStorage]
   546  	if !found {
   547  		return false
   548  	}
   549  
   550  	// the pod stats API summarizes ephemeral storage usage (containers, emptyDir volumes, and host paths such as etc-hosts and logs)
   551  	podEphemeralStorageTotalUsage := &resource.Quantity{}
   552  	if podStats.EphemeralStorage != nil && podStats.EphemeralStorage.UsedBytes != nil {
   553  		podEphemeralStorageTotalUsage = resource.NewQuantity(int64(*podStats.EphemeralStorage.UsedBytes), resource.BinarySI)
   554  	}
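        	// If the stats provider reports no ephemeral storage usage, usage is treated as zero
        	// and the pod is not evicted here.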
   555  	podEphemeralStorageLimit := podLimits[v1.ResourceEphemeralStorage]
   556  	if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
   557  		// the pod's total usage exceeds the aggregate ephemeral storage limit of its containers, so evict the pod
   558  		message := fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String())
   559  		if m.evictPod(pod, immediateEvictionGracePeriodSeconds, message, nil, nil) {
   560  			metrics.Evictions.WithLabelValues(signalEphemeralPodFsLimit).Inc()
   561  			return true
   562  		}
   563  		return false
   564  	}
   565  	return false
   566  }
   567  
   568  func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
   569  	thresholdsMap := make(map[string]*resource.Quantity)
   570  	for _, container := range pod.Spec.Containers {
   571  		ephemeralLimit := container.Resources.Limits.StorageEphemeral()
   572  		if ephemeralLimit != nil && ephemeralLimit.Value() != 0 {
   573  			thresholdsMap[container.Name] = ephemeralLimit
   574  		}
   575  	}
   576  
   577  	for _, containerStat := range podStats.Containers {
   578  		containerUsed := diskUsage(containerStat.Logs)
   579  		if !*m.dedicatedImageFs {
   580  			containerUsed.Add(*diskUsage(containerStat.Rootfs))
   581  		}
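        		// containerUsed now covers the container's log usage, plus its writable-layer
        		// (rootfs) usage when there is no dedicated image filesystem.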
   582  
   583  		if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
   584  			if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
   585  				if m.evictPod(pod, immediateEvictionGracePeriodSeconds, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
   586  					metrics.Evictions.WithLabelValues(signalEphemeralContainerFsLimit).Inc()
   587  					return true
   588  				}
   589  				return false
   590  			}
   591  		}
   592  	}
   593  	return false
   594  }
   595  
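        // evictPod records an eviction event and asks killPodFunc to terminate the pod with
        // the given grace period override. It returns true whenever a kill was attempted (even
        // if the kill ultimately failed), and false only for critical pods, which are never
        // evicted by this manager.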
   596  func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string, annotations map[string]string, condition *v1.PodCondition) bool {
   597  	// If the pod is marked as critical, do not evict it. Static pods in particular
   598  	// are not re-admitted after evictions.
   599  	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
   600  	if kubelettypes.IsCriticalPod(pod) {
   601  		klog.ErrorS(nil, "Eviction manager: cannot evict a critical pod", "pod", klog.KObj(pod))
   602  		return false
   603  	}
   604  	// record that we are evicting the pod
   605  	m.recorder.AnnotatedEventf(pod, annotations, v1.EventTypeWarning, Reason, evictMsg)
   606  	// this is a blocking call and should only return when the pod and its containers are killed.
   607  	klog.V(3).InfoS("Evicting pod", "pod", klog.KObj(pod), "podUID", pod.UID, "message", evictMsg)
   608  	err := m.killPodFunc(pod, true, &gracePeriodOverride, func(status *v1.PodStatus) {
   609  		status.Phase = v1.PodFailed
   610  		status.Reason = Reason
   611  		status.Message = evictMsg
   612  		if condition != nil {
   613  			podutil.UpdatePodCondition(status, condition)
   614  		}
   615  	})
   616  	if err != nil {
   617  		klog.ErrorS(err, "Eviction manager: pod failed to evict", "pod", klog.KObj(pod))
   618  	} else {
   619  		klog.InfoS("Eviction manager: pod is evicted successfully", "pod", klog.KObj(pod))
   620  	}
   621  	return true
   622  }