k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/eviction/eviction_manager.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package eviction

import (
	"context"
	"fmt"
	"sort"
	"sync"
	"time"

	"k8s.io/klog/v2"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/client-go/tools/record"
	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
	"k8s.io/utils/clock"

	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	resourcehelper "k8s.io/kubernetes/pkg/api/v1/resource"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	"k8s.io/kubernetes/pkg/features"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/kubelet/server/stats"
	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

const (
	podCleanupTimeout  = 30 * time.Second
	podCleanupPollFreq = time.Second
)

const (
	// signalEphemeralContainerFsLimit is the amount of storage available on the filesystem requested by the container
	signalEphemeralContainerFsLimit string = "ephemeralcontainerfs.limit"
	// signalEphemeralPodFsLimit is the amount of storage available on the filesystem requested by the pod
	signalEphemeralPodFsLimit string = "ephemeralpodfs.limit"
	// signalEmptyDirFsLimit is the amount of storage available on the filesystem requested by an emptyDir
	signalEmptyDirFsLimit string = "emptydirfs.limit"
	// immediateEvictionGracePeriodSeconds is how long we give pods to shut down when we
	// need them to evict quickly due to resource pressure
	immediateEvictionGracePeriodSeconds = 1
)

// managerImpl implements Manager
type managerImpl struct {
	// used to track time
	clock clock.WithTicker
	// config is how the manager is configured
	config Config
	// the function to invoke to kill a pod
	killPodFunc KillPodFunc
	// the interface that knows how to do image gc
	imageGC ImageGC
	// the interface that knows how to do container gc
	containerGC ContainerGC
	// protects access to internal state
	sync.RWMutex
	// node conditions are the set of conditions present
	nodeConditions []v1.NodeConditionType
	// captures when a node condition was last observed based on a threshold being met
	nodeConditionsLastObservedAt nodeConditionsObservedAt
	// nodeRef is a reference to the node
	nodeRef *v1.ObjectReference
	// used to record events about the node
	recorder record.EventRecorder
	// used to measure usage stats on system
	summaryProvider stats.SummaryProvider
	// records when a threshold was first observed
	thresholdsFirstObservedAt thresholdsObservedAt
	// records the set of thresholds that have been met (including grace period) but not yet resolved
	thresholdsMet []evictionapi.Threshold
	// signalToRankFunc maps a resource to a ranking function for that resource.
	signalToRankFunc map[evictionapi.Signal]rankFunc
	// signalToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
	signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs
	// last observations from synchronize
	lastObservations signalObservations
	// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
	dedicatedImageFs *bool
	// splitContainerImageFs indicates if containerfs is on a separate device from imagefs
	splitContainerImageFs *bool
	// thresholdNotifiers is a list of memory threshold notifiers which each notify for a memory eviction threshold
	thresholdNotifiers []ThresholdNotifier
	// thresholdsLastUpdated is the last time the thresholdNotifiers were updated.
	thresholdsLastUpdated time.Time
	// whether the kubelet supports local storage capacity isolation
	localStorageCapacityIsolation bool
}

// ensure it implements the required interface
var _ Manager = &managerImpl{}

// NewManager returns a configured Manager and an associated admission handler to enforce eviction configuration.
func NewManager(
	summaryProvider stats.SummaryProvider,
	config Config,
	killPodFunc KillPodFunc,
	imageGC ImageGC,
	containerGC ContainerGC,
	recorder record.EventRecorder,
	nodeRef *v1.ObjectReference,
	clock clock.WithTicker,
	localStorageCapacityIsolation bool,
) (Manager, lifecycle.PodAdmitHandler) {
	manager := &managerImpl{
		clock:                         clock,
		killPodFunc:                   killPodFunc,
		imageGC:                       imageGC,
		containerGC:                   containerGC,
		config:                        config,
		recorder:                      recorder,
		summaryProvider:               summaryProvider,
		nodeRef:                       nodeRef,
		nodeConditionsLastObservedAt:  nodeConditionsObservedAt{},
		thresholdsFirstObservedAt:     thresholdsObservedAt{},
		dedicatedImageFs:              nil,
		splitContainerImageFs:         nil,
		thresholdNotifiers:            []ThresholdNotifier{},
		localStorageCapacityIsolation: localStorageCapacityIsolation,
	}
	return manager, manager
}

// Admit rejects a pod if it is not safe to admit for node stability.
func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
	m.RLock()
	defer m.RUnlock()
	if len(m.nodeConditions) == 0 {
		return lifecycle.PodAdmitResult{Admit: true}
	}
	// Admit Critical pods even under resource pressure since they are required for system stability.
	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
	if kubelettypes.IsCriticalPod(attrs.Pod) {
		return lifecycle.PodAdmitResult{Admit: true}
	}

	// Conditions other than memory pressure reject all pods
	nodeOnlyHasMemoryPressureCondition := hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) && len(m.nodeConditions) == 1
	if nodeOnlyHasMemoryPressureCondition {
		notBestEffort := v1.PodQOSBestEffort != v1qos.GetPodQOS(attrs.Pod)
		if notBestEffort {
			return lifecycle.PodAdmitResult{Admit: true}
		}

		// When the node has memory pressure, check the BestEffort pod's tolerations:
		// admit it if it tolerates the memory pressure taint; tolerations for other taints, e.g. DiskPressure, do not count.
		if corev1helpers.TolerationsTolerateTaint(attrs.Pod.Spec.Tolerations, &v1.Taint{
			Key:    v1.TaintNodeMemoryPressure,
			Effect: v1.TaintEffectNoSchedule,
		}) {
			return lifecycle.PodAdmitResult{Admit: true}
		}
	}

	// reject pods when under memory pressure (if pod is best effort), or if under disk pressure.
	klog.InfoS("Failed to admit pod to node", "pod", klog.KObj(attrs.Pod), "nodeCondition", m.nodeConditions)
	return lifecycle.PodAdmitResult{
		Admit:   false,
		Reason:  Reason,
		Message: fmt.Sprintf(nodeConditionMessageFmt, m.nodeConditions),
	}
}

// Start starts the control loop to observe and respond to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
	thresholdHandler := func(message string) {
		klog.InfoS(message)
		m.synchronize(diskInfoProvider, podFunc)
	}
	if m.config.KernelMemcgNotification {
		for _, threshold := range m.config.Thresholds {
			if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable {
				notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler)
				if err != nil {
					klog.InfoS("Eviction manager: failed to create memory threshold notifier", "err", err)
				} else {
					go notifier.Start()
					m.thresholdNotifiers = append(m.thresholdNotifiers, notifier)
				}
			}
		}
	}
	// start the eviction manager monitoring
	go func() {
		for {
			evictedPods, err := m.synchronize(diskInfoProvider, podFunc)
			if evictedPods != nil && err == nil {
				klog.InfoS("Eviction manager: pods evicted, waiting for pod to be cleaned up", "pods", klog.KObjSlice(evictedPods))
				m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
			} else {
				if err != nil {
					klog.ErrorS(err, "Eviction manager: failed to synchronize")
				}
				time.Sleep(monitoringInterval)
			}
		}
	}()
}

// IsUnderMemoryPressure returns true if the node is under memory pressure.
func (m *managerImpl) IsUnderMemoryPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure)
}

// IsUnderDiskPressure returns true if the node is under disk pressure.
func (m *managerImpl) IsUnderDiskPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure)
}

// IsUnderPIDPressure returns true if the node is under PID pressure.
func (m *managerImpl) IsUnderPIDPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure)
}

// synchronize is the main control loop that enforces eviction thresholds.
// Returns the pods that were killed, or nil if no pod was killed.
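// Each pass refreshes filesystem detection and the ranking/reclaim functions on
// first run, collects a stats summary, derives signal observations, determines
// which thresholds are met (honoring grace periods and min-reclaim), updates
// node conditions, evicts pods exceeding their local storage limits (when local
// storage capacity isolation is enabled), attempts node-level reclaim such as
// image and container garbage collection, and only then ranks the active pods
// and evicts at most one of them for the most pressing signal.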
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) ([]*v1.Pod, error) {
	ctx := context.Background()
	// if we have nothing to do, just return
	thresholds := m.config.Thresholds
	if len(thresholds) == 0 && !m.localStorageCapacityIsolation {
		return nil, nil
	}

	klog.V(3).InfoS("Eviction manager: synchronize housekeeping")
	// build the ranking functions (if not yet known)
	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
	if m.dedicatedImageFs == nil {
		hasImageFs, splitDiskError := diskInfoProvider.HasDedicatedImageFs(ctx)
		if splitDiskError != nil {
			klog.ErrorS(splitDiskError, "Eviction manager: failed to get HasDedicatedImageFs")
			return nil, fmt.Errorf("eviction manager: failed to get HasDedicatedImageFs: %v", splitDiskError)
		}
		m.dedicatedImageFs = &hasImageFs
		splitContainerImageFs := m.containerGC.IsContainerFsSeparateFromImageFs(ctx)

		// If we are a split filesystem but the feature is turned off
		// we should return an error.
		// This is a bad state.
		if !utilfeature.DefaultFeatureGate.Enabled(features.KubeletSeparateDiskGC) && splitContainerImageFs {
			splitDiskError := fmt.Errorf("KubeletSeparateDiskGC is turned off but we still have a split filesystem")
			return nil, splitDiskError
		}
		thresholds, err := UpdateContainerFsThresholds(m.config.Thresholds, hasImageFs, splitContainerImageFs)
		m.config.Thresholds = thresholds
		if err != nil {
			klog.ErrorS(err, "eviction manager: found conflicting containerfs eviction. Ignoring.")
		}
		m.splitContainerImageFs = &splitContainerImageFs
		m.signalToRankFunc = buildSignalToRankFunc(hasImageFs, splitContainerImageFs)
		m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs, splitContainerImageFs)
	}

	klog.V(3).InfoS("FileSystem detection", "DedicatedImageFs", m.dedicatedImageFs, "SplitImageFs", m.splitContainerImageFs)
	activePods := podFunc()
	updateStats := true
	summary, err := m.summaryProvider.Get(ctx, updateStats)
	if err != nil {
		klog.ErrorS(err, "Eviction manager: failed to get summary stats")
		return nil, nil
	}

	if m.clock.Since(m.thresholdsLastUpdated) > notifierRefreshInterval {
		m.thresholdsLastUpdated = m.clock.Now()
		for _, notifier := range m.thresholdNotifiers {
			if err := notifier.UpdateThreshold(summary); err != nil {
				klog.InfoS("Eviction manager: failed to update notifier", "notifier", notifier.Description(), "err", err)
			}
		}
	}

	// make observations and get a function to derive pod usage stats relative to those observations.
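	// The returned statsFunc serves per-pod usage from the same summary snapshot,
	// so the ranking functions and eviction messages below read data that is
	// consistent with these signal observations.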
	observations, statsFunc := makeSignalObservations(summary)
	debugLogObservations("observations", observations)

	// determine the set of thresholds met independent of grace period
	thresholds = thresholdsMet(thresholds, observations, false)
	debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)

	// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
	if len(m.thresholdsMet) > 0 {
		thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
		thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
	}
	debugLogThresholdsWithObservation("thresholds - reclaim not satisfied", thresholds, observations)

	// track when a threshold was first observed
	now := m.clock.Now()
	thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)

	// the set of node conditions that are triggered by currently observed thresholds
	nodeConditions := nodeConditions(thresholds)
	if len(nodeConditions) > 0 {
		klog.V(3).InfoS("Eviction manager: node conditions - observed", "nodeCondition", nodeConditions)
	}

	// track when a node condition was last observed
	nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)

	// node conditions report true if they have been observed within the transition period window
	nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
	if len(nodeConditions) > 0 {
		klog.V(3).InfoS("Eviction manager: node conditions - transition period not met", "nodeCondition", nodeConditions)
	}

	// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
	thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
	debugLogThresholdsWithObservation("thresholds - grace periods satisfied", thresholds, observations)

	// update internal state
	m.Lock()
	m.nodeConditions = nodeConditions
	m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
	m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
	m.thresholdsMet = thresholds

	// determine the set of thresholds whose stats have been updated since the last sync
	thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
	debugLogThresholdsWithObservation("thresholds - updated stats", thresholds, observations)

	m.lastObservations = observations
	m.Unlock()

	// evict pods if there is a resource usage violation from local volume temporary storage
	// If eviction happens in localStorageEviction function, skip the rest of eviction action
	if m.localStorageCapacityIsolation {
		if evictedPods := m.localStorageEviction(activePods, statsFunc); len(evictedPods) > 0 {
			return evictedPods, nil
		}
	}

	if len(thresholds) == 0 {
		klog.V(3).InfoS("Eviction manager: no resources are starved")
		return nil, nil
	}

	// rank the thresholds by eviction priority
	sort.Sort(byEvictionPriority(thresholds))
	thresholdToReclaim, resourceToReclaim, foundAny := getReclaimableThreshold(thresholds)
	if !foundAny {
		return nil, nil
	}
	klog.InfoS("Eviction manager: attempting to reclaim", "resourceName", resourceToReclaim)

	// record an event about the resources we are now attempting to reclaim via eviction
	m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)

	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
	if m.reclaimNodeLevelResources(ctx, thresholdToReclaim.Signal, resourceToReclaim) {
		klog.InfoS("Eviction manager: able to reduce resource pressure without evicting pods.", "resourceName", resourceToReclaim)
		return nil, nil
	}

	klog.InfoS("Eviction manager: must evict pod(s) to reclaim", "resourceName", resourceToReclaim)

	// rank the pods for eviction
	rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
	if !ok {
		klog.ErrorS(nil, "Eviction manager: no ranking function for signal", "threshold", thresholdToReclaim.Signal)
		return nil, nil
	}

	// the only candidates viable for eviction are those pods that had anything running.
	if len(activePods) == 0 {
		klog.ErrorS(nil, "Eviction manager: eviction thresholds have been met, but no pods are active to evict")
		return nil, nil
	}

	// rank the running pods for eviction for the specified resource
	rank(activePods, statsFunc)

	klog.InfoS("Eviction manager: pods ranked for eviction", "pods", klog.KObjSlice(activePods))

	// record age of metrics for met thresholds that we are using for evictions.
	for _, t := range thresholds {
		timeObserved := observations[t.Signal].time
		if !timeObserved.IsZero() {
			metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInSeconds(timeObserved.Time))
		}
	}

	// we kill at most a single pod during each eviction interval
	for i := range activePods {
		pod := activePods[i]
		gracePeriodOverride := int64(immediateEvictionGracePeriodSeconds)
		if !isHardEvictionThreshold(thresholdToReclaim) {
			gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
		}
		message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc, thresholds, observations)
		var condition *v1.PodCondition
		if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
			condition = &v1.PodCondition{
				Type:    v1.DisruptionTarget,
				Status:  v1.ConditionTrue,
				Reason:  v1.PodReasonTerminationByKubelet,
				Message: message,
			}
		}
		if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) {
			metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc()
			return []*v1.Pod{pod}, nil
		}
	}
	klog.InfoS("Eviction manager: unable to evict any pods from the node")
	return nil, nil
}

func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
	timeout := m.clock.NewTimer(podCleanupTimeout)
	defer timeout.Stop()
	ticker := m.clock.NewTicker(podCleanupPollFreq)
	defer ticker.Stop()
	for {
		select {
		case <-timeout.C():
			klog.InfoS("Eviction manager: timed out waiting for pods to be cleaned up", "pods", klog.KObjSlice(pods))
			return
		case <-ticker.C():
			for i, pod := range pods {
				if !podCleanedUpFunc(pod) {
					break
				}
				if i == len(pods)-1 {
					klog.InfoS("Eviction manager: pods successfully cleaned up", "pods", klog.KObjSlice(pods))
					return
				}
			}
		}
	}
}

// reclaimNodeLevelResources attempts to reclaim node-level resources. Returns true if thresholds were satisfied and no pod eviction is required.
func (m *managerImpl) reclaimNodeLevelResources(ctx context.Context, signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool {
	nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim]
	for _, nodeReclaimFunc := range nodeReclaimFuncs {
		// attempt to reclaim the pressured resource.
		if err := nodeReclaimFunc(ctx); err != nil {
			klog.InfoS("Eviction manager: unexpected error when attempting to reduce resource pressure", "resourceName", resourceToReclaim, "err", err)
		}
	}
	if len(nodeReclaimFuncs) > 0 {
		summary, err := m.summaryProvider.Get(ctx, true)
		if err != nil {
			klog.ErrorS(err, "Eviction manager: failed to get summary stats after resource reclaim")
			return false
		}

		// make observations and get a function to derive pod usage stats relative to those observations.
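		// The summary was re-fetched above, so these observations reflect usage after
		// the node-level reclaim functions (e.g. image or container GC) have run.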
		observations, _ := makeSignalObservations(summary)
		debugLogObservations("observations after resource reclaim", observations)

		// evaluate all thresholds independently of their grace period to see if with
		// the new observations, we think we have met min reclaim goals
		thresholds := thresholdsMet(m.config.Thresholds, observations, true)
		debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)

		if len(thresholds) == 0 {
			return true
		}
	}
	return false
}

// localStorageEviction checks the emptyDir volume usage for each pod and determines whether it exceeds the specified limit
// and the pod needs to be evicted. It also checks every container in the pod; if the container overlay usage exceeds the limit,
// the pod will be evicted too.
func (m *managerImpl) localStorageEviction(pods []*v1.Pod, statsFunc statsFunc) []*v1.Pod {
	evicted := []*v1.Pod{}
	for _, pod := range pods {
		podStats, ok := statsFunc(pod)
		if !ok {
			continue
		}

		if m.emptyDirLimitEviction(podStats, pod) {
			evicted = append(evicted, pod)
			continue
		}

		if m.podEphemeralStorageLimitEviction(podStats, pod) {
			evicted = append(evicted, pod)
			continue
		}

		if m.containerEphemeralStorageLimitEviction(podStats, pod) {
			evicted = append(evicted, pod)
		}
	}

	return evicted
}

func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
	podVolumeUsed := make(map[string]*resource.Quantity)
	for _, volume := range podStats.VolumeStats {
		podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI)
	}
	for i := range pod.Spec.Volumes {
		source := &pod.Spec.Volumes[i].VolumeSource
		if source.EmptyDir != nil {
			size := source.EmptyDir.SizeLimit
			used := podVolumeUsed[pod.Spec.Volumes[i].Name]
			if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
				// the emptyDir usage exceeds the size limit, evict the pod
				if m.evictPod(pod, immediateEvictionGracePeriodSeconds, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
					metrics.Evictions.WithLabelValues(signalEmptyDirFsLimit).Inc()
					return true
				}
				return false
			}
		}
	}

	return false
}

func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
	podLimits := resourcehelper.PodLimits(pod, resourcehelper.PodResourcesOptions{})
	_, found := podLimits[v1.ResourceEphemeralStorage]
	if !found {
		return false
	}

	// pod stats api summarizes ephemeral storage usage (container, emptyDir, host[etc-hosts, logs])
	podEphemeralStorageTotalUsage := &resource.Quantity{}
	if podStats.EphemeralStorage != nil && podStats.EphemeralStorage.UsedBytes != nil {
		podEphemeralStorageTotalUsage = resource.NewQuantity(int64(*podStats.EphemeralStorage.UsedBytes), resource.BinarySI)
	}
	podEphemeralStorageLimit := podLimits[v1.ResourceEphemeralStorage]
	if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
		// the total usage of the pod exceeds its ephemeral storage limit, evict the pod
		message := fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String())
		if m.evictPod(pod, immediateEvictionGracePeriodSeconds, message, nil, nil) {
			metrics.Evictions.WithLabelValues(signalEphemeralPodFsLimit).Inc()
			return true
		}
		return false
	}
	return false
}

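// containerEphemeralStorageLimitEviction evicts the pod if any of its containers
// exceeds that container's ephemeral-storage limit. A container's usage is its
// log usage plus, when there is no dedicated imagefs, its writable rootfs layer.
// Returns true if an eviction was triggered.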
func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
	thresholdsMap := make(map[string]*resource.Quantity)
	for _, container := range pod.Spec.Containers {
		ephemeralLimit := container.Resources.Limits.StorageEphemeral()
		if ephemeralLimit != nil && ephemeralLimit.Value() != 0 {
			thresholdsMap[container.Name] = ephemeralLimit
		}
	}

	for _, containerStat := range podStats.Containers {
		containerUsed := diskUsage(containerStat.Logs)
		if !*m.dedicatedImageFs {
			containerUsed.Add(*diskUsage(containerStat.Rootfs))
		}

		if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
			if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
				if m.evictPod(pod, immediateEvictionGracePeriodSeconds, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
					metrics.Evictions.WithLabelValues(signalEphemeralContainerFsLimit).Inc()
					return true
				}
				return false
			}
		}
	}
	return false
}

func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string, annotations map[string]string, condition *v1.PodCondition) bool {
	// If the pod is marked as critical, do not evict it: critical pods are required for
	// system stability, and static pods are not re-admitted after eviction.
	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
	if kubelettypes.IsCriticalPod(pod) {
		klog.ErrorS(nil, "Eviction manager: cannot evict a critical pod", "pod", klog.KObj(pod))
		return false
	}
	// record that we are evicting the pod
	m.recorder.AnnotatedEventf(pod, annotations, v1.EventTypeWarning, Reason, evictMsg)
	// this is a blocking call and should only return when the pod and its containers are killed.
	klog.V(3).InfoS("Evicting pod", "pod", klog.KObj(pod), "podUID", pod.UID, "message", evictMsg)
	err := m.killPodFunc(pod, true, &gracePeriodOverride, func(status *v1.PodStatus) {
		status.Phase = v1.PodFailed
		status.Reason = Reason
		status.Message = evictMsg
		if condition != nil {
			podutil.UpdatePodCondition(status, condition)
		}
	})
	if err != nil {
		klog.ErrorS(err, "Eviction manager: pod failed to evict", "pod", klog.KObj(pod))
	} else {
		klog.InfoS("Eviction manager: pod is evicted successfully", "pod", klog.KObj(pod))
	}
	return true
}