k8s.io/kubernetes@v1.29.3/pkg/kubelet/eviction/eviction_manager.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package eviction

import (
	"context"
	"fmt"
	"sort"
	"sync"
	"time"

	"k8s.io/klog/v2"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/client-go/tools/record"
	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
	"k8s.io/utils/clock"

	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	resourcehelper "k8s.io/kubernetes/pkg/api/v1/resource"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	"k8s.io/kubernetes/pkg/features"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/kubelet/server/stats"
	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

const (
	podCleanupTimeout  = 30 * time.Second
	podCleanupPollFreq = time.Second
)

const (
	// signalEphemeralContainerFsLimit is amount of storage available on filesystem requested by the container
	signalEphemeralContainerFsLimit string = "ephemeralcontainerfs.limit"
	// signalEphemeralPodFsLimit is amount of storage available on filesystem requested by the pod
	signalEphemeralPodFsLimit string = "ephemeralpodfs.limit"
	// signalEmptyDirFsLimit is amount of storage available on filesystem requested by an emptyDir
	signalEmptyDirFsLimit string = "emptydirfs.limit"
)

// managerImpl implements Manager
type managerImpl struct {
	// used to track time
	clock clock.WithTicker
	// config is how the manager is configured
	config Config
	// the function to invoke to kill a pod
	killPodFunc KillPodFunc
	// the interface that knows how to do image gc
	imageGC ImageGC
	// the interface that knows how to do container gc
	containerGC ContainerGC
	// protects access to internal state
	sync.RWMutex
	// node conditions are the set of conditions present
	nodeConditions []v1.NodeConditionType
	// captures when a node condition was last observed based on a threshold being met
	nodeConditionsLastObservedAt nodeConditionsObservedAt
	// nodeRef is a reference to the node
	nodeRef *v1.ObjectReference
	// used to record events about the node
	recorder record.EventRecorder
	// used to measure usage stats on system
	summaryProvider stats.SummaryProvider
	// records when a threshold was first observed
	thresholdsFirstObservedAt thresholdsObservedAt
	// records the set of thresholds that have been met (including grace period) but not yet resolved
	thresholdsMet []evictionapi.Threshold
	// signalToRankFunc maps a resource to ranking function for that resource.
	signalToRankFunc map[evictionapi.Signal]rankFunc
	// signalToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
	signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs
	// last observations from synchronize
	lastObservations signalObservations
	// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
	dedicatedImageFs *bool
	// splitContainerImageFs indicates if containerfs is on a separate device from imagefs
	splitContainerImageFs *bool
	// thresholdNotifiers is a list of memory threshold notifiers which each notify for a memory eviction threshold
	thresholdNotifiers []ThresholdNotifier
	// thresholdsLastUpdated is the last time the thresholdNotifiers were updated.
	thresholdsLastUpdated time.Time
	// whether can support local storage capacity isolation
	localStorageCapacityIsolation bool
}

// ensure it implements the required interface
var _ Manager = &managerImpl{}

// NewManager returns a configured Manager and an associated admission handler to enforce eviction configuration.
func NewManager(
	summaryProvider stats.SummaryProvider,
	config Config,
	killPodFunc KillPodFunc,
	imageGC ImageGC,
	containerGC ContainerGC,
	recorder record.EventRecorder,
	nodeRef *v1.ObjectReference,
	clock clock.WithTicker,
	localStorageCapacityIsolation bool,
) (Manager, lifecycle.PodAdmitHandler) {
	manager := &managerImpl{
		clock:                         clock,
		killPodFunc:                   killPodFunc,
		imageGC:                       imageGC,
		containerGC:                   containerGC,
		config:                        config,
		recorder:                      recorder,
		summaryProvider:               summaryProvider,
		nodeRef:                       nodeRef,
		nodeConditionsLastObservedAt:  nodeConditionsObservedAt{},
		thresholdsFirstObservedAt:     thresholdsObservedAt{},
		dedicatedImageFs:              nil,
		splitContainerImageFs:         nil,
		thresholdNotifiers:            []ThresholdNotifier{},
		localStorageCapacityIsolation: localStorageCapacityIsolation,
	}
	return manager, manager
}

// Admit rejects a pod if it's not safe to admit for node stability.
func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
	m.RLock()
	defer m.RUnlock()
	if len(m.nodeConditions) == 0 {
		return lifecycle.PodAdmitResult{Admit: true}
	}
	// Admit Critical pods even under resource pressure since they are required for system stability.
	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
	if kubelettypes.IsCriticalPod(attrs.Pod) {
		return lifecycle.PodAdmitResult{Admit: true}
	}

	// Conditions other than memory pressure reject all pods
	nodeOnlyHasMemoryPressureCondition := hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) && len(m.nodeConditions) == 1
	if nodeOnlyHasMemoryPressureCondition {
		notBestEffort := v1.PodQOSBestEffort != v1qos.GetPodQOS(attrs.Pod)
		if notBestEffort {
			return lifecycle.PodAdmitResult{Admit: true}
		}

		// When the node has memory pressure, check the BestEffort Pod's tolerations:
		// admit it if it tolerates the memory pressure taint; fail for other tolerations, e.g. DiskPressure.
		if corev1helpers.TolerationsTolerateTaint(attrs.Pod.Spec.Tolerations, &v1.Taint{
			Key:    v1.TaintNodeMemoryPressure,
			Effect: v1.TaintEffectNoSchedule,
		}) {
			return lifecycle.PodAdmitResult{Admit: true}
		}
	}

	// reject pods when under memory pressure (if pod is best effort), or if under disk pressure.
	klog.InfoS("Failed to admit pod to node", "pod", klog.KObj(attrs.Pod), "nodeCondition", m.nodeConditions)
	return lifecycle.PodAdmitResult{
		Admit:   false,
		Reason:  Reason,
		Message: fmt.Sprintf(nodeConditionMessageFmt, m.nodeConditions),
	}
}

// Start starts the control loop to observe and respond to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
	thresholdHandler := func(message string) {
		klog.InfoS(message)
		m.synchronize(diskInfoProvider, podFunc)
	}
	if m.config.KernelMemcgNotification {
		for _, threshold := range m.config.Thresholds {
			if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable {
				notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler)
				if err != nil {
					klog.InfoS("Eviction manager: failed to create memory threshold notifier", "err", err)
				} else {
					go notifier.Start()
					m.thresholdNotifiers = append(m.thresholdNotifiers, notifier)
				}
			}
		}
	}
	// start the eviction manager monitoring
	go func() {
		for {
			evictedPods, err := m.synchronize(diskInfoProvider, podFunc)
			if evictedPods != nil && err == nil {
				klog.InfoS("Eviction manager: pods evicted, waiting for pod to be cleaned up", "pods", klog.KObjSlice(evictedPods))
				m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
			} else {
				if err != nil {
					klog.ErrorS(err, "Eviction manager: failed to synchronize")
				}
				time.Sleep(monitoringInterval)
			}
		}
	}()
}

// IsUnderMemoryPressure returns true if the node is under memory pressure.
func (m *managerImpl) IsUnderMemoryPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure)
}

// IsUnderDiskPressure returns true if the node is under disk pressure.
func (m *managerImpl) IsUnderDiskPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure)
}

// IsUnderPIDPressure returns true if the node is under PID pressure.
func (m *managerImpl) IsUnderPIDPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure)
}
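
// For illustration, synchronize below evaluates the thresholds held in
// Config.Thresholds. A hard eviction threshold roughly equivalent to the
// kubelet setting "memory.available<100Mi" would look like the following
// sketch (the actual defaults are configured outside this file):
//
//	evictionapi.Threshold{
//		Signal:   evictionapi.SignalMemoryAvailable,
//		Operator: evictionapi.OpLessThan,
//		Value:    evictionapi.ThresholdValue{Quantity: resource.NewQuantity(100*1024*1024, resource.BinarySI)},
//	}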

// synchronize is the main control loop that enforces eviction thresholds.
// Returns the pods that were killed, or nil if no pods were killed.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) ([]*v1.Pod, error) {
	ctx := context.Background()
	// if we have nothing to do, just return
	thresholds := m.config.Thresholds
	if len(thresholds) == 0 && !m.localStorageCapacityIsolation {
		return nil, nil
	}

	klog.V(3).InfoS("Eviction manager: synchronize housekeeping")
	// build the ranking functions (if not yet known)
	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
	if m.dedicatedImageFs == nil {
		hasImageFs, splitDiskError := diskInfoProvider.HasDedicatedImageFs(ctx)
		if splitDiskError != nil {
			klog.ErrorS(splitDiskError, "Eviction manager: failed to get HasDedicatedImageFs")
			return nil, fmt.Errorf("eviction manager: failed to get HasDedicatedImageFs: %v", splitDiskError)
		}
		m.dedicatedImageFs = &hasImageFs
		splitContainerImageFs := m.containerGC.IsContainerFsSeparateFromImageFs(ctx)

		// If we are a split filesystem but the feature is turned off
		// we should return an error.
		// This is a bad state.
		if !utilfeature.DefaultFeatureGate.Enabled(features.KubeletSeparateDiskGC) && splitContainerImageFs {
			splitDiskError := fmt.Errorf("KubeletSeparateDiskGC is turned off but we still have a split filesystem")
			return nil, splitDiskError
		}
		thresholds, err := UpdateContainerFsThresholds(m.config.Thresholds, hasImageFs, splitContainerImageFs)
		m.config.Thresholds = thresholds
		if err != nil {
			klog.ErrorS(err, "eviction manager: found conflicting containerfs eviction. Ignoring.")
		}
		m.splitContainerImageFs = &splitContainerImageFs
		m.signalToRankFunc = buildSignalToRankFunc(hasImageFs, splitContainerImageFs)
		m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs, splitContainerImageFs)
	}

	klog.V(3).InfoS("FileSystem detection", "DedicatedImageFs", m.dedicatedImageFs, "SplitImageFs", m.splitContainerImageFs)
	activePods := podFunc()
	updateStats := true
	summary, err := m.summaryProvider.Get(ctx, updateStats)
	if err != nil {
		klog.ErrorS(err, "Eviction manager: failed to get summary stats")
		return nil, nil
	}

	if m.clock.Since(m.thresholdsLastUpdated) > notifierRefreshInterval {
		m.thresholdsLastUpdated = m.clock.Now()
		for _, notifier := range m.thresholdNotifiers {
			if err := notifier.UpdateThreshold(summary); err != nil {
				klog.InfoS("Eviction manager: failed to update notifier", "notifier", notifier.Description(), "err", err)
			}
		}
	}

	// make observations and get a function to derive pod usage stats relative to those observations.
	observations, statsFunc := makeSignalObservations(summary)
	debugLogObservations("observations", observations)

	// determine the set of thresholds met independent of grace period
	thresholds = thresholdsMet(thresholds, observations, false)
	debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)

	// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
	if len(m.thresholdsMet) > 0 {
		thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
		thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
	}
	debugLogThresholdsWithObservation("thresholds - reclaim not satisfied", thresholds, observations)

	// track when a threshold was first observed
	now := m.clock.Now()
	thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)

	// the set of node conditions that are triggered by currently observed thresholds
	nodeConditions := nodeConditions(thresholds)
	if len(nodeConditions) > 0 {
		klog.V(3).InfoS("Eviction manager: node conditions - observed", "nodeCondition", nodeConditions)
	}

	// track when a node condition was last observed
	nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)

	// node conditions report true if it has been observed within the transition period window
	nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
	if len(nodeConditions) > 0 {
		klog.V(3).InfoS("Eviction manager: node conditions - transition period not met", "nodeCondition", nodeConditions)
	}

	// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
	thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
	debugLogThresholdsWithObservation("thresholds - grace periods satisfied", thresholds, observations)

	// update internal state
	m.Lock()
	m.nodeConditions = nodeConditions
	m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
	m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
	m.thresholdsMet = thresholds

	// determine the set of thresholds whose stats have been updated since the last sync
	thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
	debugLogThresholdsWithObservation("thresholds - updated stats", thresholds, observations)

	m.lastObservations = observations
	m.Unlock()

	// evict pods if there is a resource usage violation from local volume temporary storage
	// If eviction happens in localStorageEviction function, skip the rest of eviction action
	if m.localStorageCapacityIsolation {
		if evictedPods := m.localStorageEviction(activePods, statsFunc); len(evictedPods) > 0 {
			return evictedPods, nil
		}
	}

	if len(thresholds) == 0 {
		klog.V(3).InfoS("Eviction manager: no resources are starved")
		return nil, nil
	}

	// rank the thresholds by eviction priority
	sort.Sort(byEvictionPriority(thresholds))
	thresholdToReclaim, resourceToReclaim, foundAny := getReclaimableThreshold(thresholds)
	if !foundAny {
		return nil, nil
	}
	klog.InfoS("Eviction manager: attempting to reclaim", "resourceName", resourceToReclaim)

	// record an event about the resources we are now attempting to reclaim via eviction
	m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)

	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
	if m.reclaimNodeLevelResources(ctx, thresholdToReclaim.Signal, resourceToReclaim) {
		klog.InfoS("Eviction manager: able to reduce resource pressure without evicting pods.", "resourceName", resourceToReclaim)
		return nil, nil
	}

	klog.InfoS("Eviction manager: must evict pod(s) to reclaim", "resourceName", resourceToReclaim)

	// rank the pods for eviction
	rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
	if !ok {
		klog.ErrorS(nil, "Eviction manager: no ranking function for signal", "threshold", thresholdToReclaim.Signal)
		return nil, nil
	}

	// the only candidates viable for eviction are those pods that had anything running.
	if len(activePods) == 0 {
		klog.ErrorS(nil, "Eviction manager: eviction thresholds have been met, but no pods are active to evict")
		return nil, nil
	}

	// rank the running pods for eviction for the specified resource
	rank(activePods, statsFunc)

	klog.InfoS("Eviction manager: pods ranked for eviction", "pods", klog.KObjSlice(activePods))
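
	// For reference, the ranking above orders candidates roughly as follows for memory
	// pressure: pods whose usage exceeds their requests first, then by lower priority,
	// then by the largest usage above request; disk pressure ranks analogously on disk usage.
	// The exact ordering lives in the rank functions built by buildSignalToRankFunc.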

	// record age of metrics for met thresholds that we are using for evictions.
	for _, t := range thresholds {
		timeObserved := observations[t.Signal].time
		if !timeObserved.IsZero() {
			metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInSeconds(timeObserved.Time))
		}
	}

	// we kill at most a single pod during each eviction interval
	for i := range activePods {
		pod := activePods[i]
		gracePeriodOverride := int64(0)
		if !isHardEvictionThreshold(thresholdToReclaim) {
			gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
		}
		message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc, thresholds, observations)
		var condition *v1.PodCondition
		if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
			condition = &v1.PodCondition{
				Type:    v1.DisruptionTarget,
				Status:  v1.ConditionTrue,
				Reason:  v1.PodReasonTerminationByKubelet,
				Message: message,
			}
		}
		if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) {
			metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc()
			return []*v1.Pod{pod}, nil
		}
	}
	klog.InfoS("Eviction manager: unable to evict any pods from the node")
	return nil, nil
}

func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
	timeout := m.clock.NewTimer(podCleanupTimeout)
	defer timeout.Stop()
	ticker := m.clock.NewTicker(podCleanupPollFreq)
	defer ticker.Stop()
	for {
		select {
		case <-timeout.C():
			klog.InfoS("Eviction manager: timed out waiting for pods to be cleaned up", "pods", klog.KObjSlice(pods))
			return
		case <-ticker.C():
			for i, pod := range pods {
				if !podCleanedUpFunc(pod) {
					break
				}
				if i == len(pods)-1 {
					klog.InfoS("Eviction manager: pods successfully cleaned up", "pods", klog.KObjSlice(pods))
					return
				}
			}
		}
	}
}

// reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.
func (m *managerImpl) reclaimNodeLevelResources(ctx context.Context, signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool {
	nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim]
	for _, nodeReclaimFunc := range nodeReclaimFuncs {
		// attempt to reclaim the pressured resource.
		if err := nodeReclaimFunc(ctx); err != nil {
			klog.InfoS("Eviction manager: unexpected error when attempting to reduce resource pressure", "resourceName", resourceToReclaim, "err", err)
		}
	}
	if len(nodeReclaimFuncs) > 0 {
		summary, err := m.summaryProvider.Get(ctx, true)
		if err != nil {
			klog.ErrorS(err, "Eviction manager: failed to get summary stats after resource reclaim")
			return false
		}

		// make observations and get a function to derive pod usage stats relative to those observations.
		observations, _ := makeSignalObservations(summary)
		debugLogObservations("observations after resource reclaim", observations)

		// evaluate all thresholds independently of their grace period to see if with
		// the new observations, we think we have met min reclaim goals
		thresholds := thresholdsMet(m.config.Thresholds, observations, true)
		debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)

		if len(thresholds) == 0 {
			return true
		}
	}
	return false
}

// localStorageEviction checks the EmptyDir volume usage for each pod and determines whether it exceeds the specified limit and needs
// to be evicted. It also checks every container in the pod; if the container overlay usage exceeds the limit, the pod will be evicted too.
func (m *managerImpl) localStorageEviction(pods []*v1.Pod, statsFunc statsFunc) []*v1.Pod {
	evicted := []*v1.Pod{}
	for _, pod := range pods {
		podStats, ok := statsFunc(pod)
		if !ok {
			continue
		}

		if m.emptyDirLimitEviction(podStats, pod) {
			evicted = append(evicted, pod)
			continue
		}

		if m.podEphemeralStorageLimitEviction(podStats, pod) {
			evicted = append(evicted, pod)
			continue
		}

		if m.containerEphemeralStorageLimitEviction(podStats, pod) {
			evicted = append(evicted, pod)
		}
	}

	return evicted
}

func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
	podVolumeUsed := make(map[string]*resource.Quantity)
	for _, volume := range podStats.VolumeStats {
		podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI)
	}
	for i := range pod.Spec.Volumes {
		source := &pod.Spec.Volumes[i].VolumeSource
		if source.EmptyDir != nil {
			size := source.EmptyDir.SizeLimit
			used := podVolumeUsed[pod.Spec.Volumes[i].Name]
			if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
				// the emptyDir usage exceeds the size limit, evict the pod
				if m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
					metrics.Evictions.WithLabelValues(signalEmptyDirFsLimit).Inc()
					return true
				}
				return false
			}
		}
	}

	return false
}

func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
	podLimits := resourcehelper.PodLimits(pod, resourcehelper.PodResourcesOptions{})
	_, found := podLimits[v1.ResourceEphemeralStorage]
	if !found {
		return false
	}

	// pod stats api summarizes ephemeral storage usage (container, emptyDir, host[etc-hosts, logs])
	podEphemeralStorageTotalUsage := &resource.Quantity{}
	if podStats.EphemeralStorage != nil && podStats.EphemeralStorage.UsedBytes != nil {
		podEphemeralStorageTotalUsage = resource.NewQuantity(int64(*podStats.EphemeralStorage.UsedBytes), resource.BinarySI)
	}
	podEphemeralStorageLimit := podLimits[v1.ResourceEphemeralStorage]
	if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
		// the total usage of pod exceeds the total size limit of containers, evict the pod
		message := fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String())
		if m.evictPod(pod, 0, message, nil, nil) {
			metrics.Evictions.WithLabelValues(signalEphemeralPodFsLimit).Inc()
			return true
		}
		return false
	}
	return false
}
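
// containerEphemeralStorageLimitEviction evicts the pod when any single container's log usage
// (plus its writable-layer usage when the node has no dedicated image filesystem) exceeds that
// container's ephemeral-storage limit. For illustration, such a limit is declared in the pod
// spec roughly as:
//
//	resources:
//	  limits:
//	    ephemeral-storage: "1Gi"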
func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
	thresholdsMap := make(map[string]*resource.Quantity)
	for _, container := range pod.Spec.Containers {
		ephemeralLimit := container.Resources.Limits.StorageEphemeral()
		if ephemeralLimit != nil && ephemeralLimit.Value() != 0 {
			thresholdsMap[container.Name] = ephemeralLimit
		}
	}

	for _, containerStat := range podStats.Containers {
		containerUsed := diskUsage(containerStat.Logs)
		if !*m.dedicatedImageFs {
			containerUsed.Add(*diskUsage(containerStat.Rootfs))
		}

		if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
			if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
				if m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
					metrics.Evictions.WithLabelValues(signalEphemeralContainerFsLimit).Inc()
					return true
				}
				return false
			}
		}
	}
	return false
}

func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string, annotations map[string]string, condition *v1.PodCondition) bool {
	// If the pod is marked as critical and static, and support for critical pod annotations is enabled,
	// do not evict such pods. Static pods are not re-admitted after evictions.
	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
	if kubelettypes.IsCriticalPod(pod) {
		klog.ErrorS(nil, "Eviction manager: cannot evict a critical pod", "pod", klog.KObj(pod))
		return false
	}
	// record that we are evicting the pod
	m.recorder.AnnotatedEventf(pod, annotations, v1.EventTypeWarning, Reason, evictMsg)
	// this is a blocking call and should only return when the pod and its containers are killed.
	klog.V(3).InfoS("Evicting pod", "pod", klog.KObj(pod), "podUID", pod.UID, "message", evictMsg)
	err := m.killPodFunc(pod, true, &gracePeriodOverride, func(status *v1.PodStatus) {
		status.Phase = v1.PodFailed
		status.Reason = Reason
		status.Message = evictMsg
		if condition != nil {
			podutil.UpdatePodCondition(status, condition)
		}
	})
	if err != nil {
		klog.ErrorS(err, "Eviction manager: pod failed to evict", "pod", klog.KObj(pod))
	} else {
		klog.InfoS("Eviction manager: pod is evicted successfully", "pod", klog.KObj(pod))
	}
	return true
}
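
// For illustration, a minimal sketch of how a caller typically wires up the two values
// returned by NewManager (names such as admitHandlers, activePodsFunc, podCleanedUpFunc,
// and diskInfoProvider are placeholders for the caller's own implementations):
//
//	evictionManager, evictionAdmitHandler := NewManager(summaryProvider, evictionConfig,
//		killPodFunc, imageGC, containerGC, recorder, nodeRef, clock.RealClock{}, true)
//	// register the admission handler so new pods are checked against current node conditions,
//	// then start the control loop that observes stats and evicts pods when thresholds are met.
//	admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)
//	evictionManager.Start(diskInfoProvider, activePodsFunc, podCleanedUpFunc, monitoringInterval)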