k8s.io/kubernetes@v1.29.3/pkg/kubelet/eviction/helpers.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package eviction

import (
	"errors"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
	"k8s.io/klog/v2"
	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	v1resource "k8s.io/kubernetes/pkg/api/v1/resource"
	"k8s.io/kubernetes/pkg/features"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	volumeutils "k8s.io/kubernetes/pkg/volume/util"
)

const (
	unsupportedEvictionSignal = "unsupported eviction signal %v"
	// Reason is the reason reported back in status.
	Reason = "Evicted"
	// nodeLowMessageFmt is the message for evictions due to resource pressure.
	nodeLowMessageFmt = "The node was low on resource: %v. "
	// nodeConditionMessageFmt is the message for evictions due to a node condition.
	nodeConditionMessageFmt = "The node had condition: %v. "
	// containerMessageFmt provides additional information for containers exceeding requests
	containerMessageFmt = "Container %s was using %s, request is %s, has larger consumption of %v. "
	// containerEphemeralStorageMessageFmt provides additional information for containers which have exceeded their ES limit
	containerEphemeralStorageMessageFmt = "Container %s exceeded its local ephemeral storage limit %q. "
	// podEphemeralStorageMessageFmt provides additional information for pods which have exceeded their ES limit
	podEphemeralStorageMessageFmt = "Pod ephemeral local storage usage exceeds the total limit of containers %s. "
	// emptyDirMessageFmt provides additional information for empty-dir volumes which have exceeded their size limit
	emptyDirMessageFmt = "Usage of EmptyDir volume %q exceeds the limit %q. "
	// resourceInodes, number. internal to this module, used to account for local disk inode consumption.
	resourceInodes v1.ResourceName = "inodes"
	// resourcePids, number. internal to this module, used to account for local pid consumption.
	resourcePids v1.ResourceName = "pids"
	// OffendingContainersKey is the key in eviction event annotations for the list of container names which exceeded their requests
	OffendingContainersKey = "offending_containers"
	// OffendingContainersUsageKey is the key in eviction event annotations for the list of usage of containers which exceeded their requests
	OffendingContainersUsageKey = "offending_containers_usage"
	// StarvedResourceKey is the key for the starved resource in eviction event annotations
	StarvedResourceKey = "starved_resource"
	// thresholdMetMessageFmt is the message reporting the threshold quantity that was met.
	thresholdMetMessageFmt = "Threshold quantity: %v, available: %v. "
)

var (
	// signalToNodeCondition maps a signal to the node condition to report if threshold is met.
	signalToNodeCondition map[evictionapi.Signal]v1.NodeConditionType
	// signalToResource maps a Signal to its associated Resource.
	signalToResource map[evictionapi.Signal]v1.ResourceName
)

func init() {
	// map eviction signals to node conditions
	signalToNodeCondition = map[evictionapi.Signal]v1.NodeConditionType{}
	signalToNodeCondition[evictionapi.SignalMemoryAvailable] = v1.NodeMemoryPressure
	signalToNodeCondition[evictionapi.SignalAllocatableMemoryAvailable] = v1.NodeMemoryPressure
	signalToNodeCondition[evictionapi.SignalImageFsAvailable] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalContainerFsAvailable] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalContainerFsInodesFree] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalPIDAvailable] = v1.NodePIDPressure

	// map signals to resources
	signalToResource = map[evictionapi.Signal]v1.ResourceName{}
	signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
	signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
	signalToResource[evictionapi.SignalImageFsAvailable] = v1.ResourceEphemeralStorage
	signalToResource[evictionapi.SignalImageFsInodesFree] = resourceInodes
	signalToResource[evictionapi.SignalContainerFsAvailable] = v1.ResourceEphemeralStorage
	signalToResource[evictionapi.SignalContainerFsInodesFree] = resourceInodes
	signalToResource[evictionapi.SignalNodeFsAvailable] = v1.ResourceEphemeralStorage
	signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceInodes
	signalToResource[evictionapi.SignalPIDAvailable] = resourcePids
}

// validSignal returns true if the signal is supported.
func validSignal(signal evictionapi.Signal) bool {
	_, found := signalToResource[signal]
	return found
}

// getReclaimableThreshold finds the threshold and resource to reclaim.
func getReclaimableThreshold(thresholds []evictionapi.Threshold) (evictionapi.Threshold, v1.ResourceName, bool) {
	for _, thresholdToReclaim := range thresholds {
		if resourceToReclaim, ok := signalToResource[thresholdToReclaim.Signal]; ok {
			return thresholdToReclaim, resourceToReclaim, true
		}
		klog.V(3).InfoS("Eviction manager: threshold was crossed, but reclaim is not implemented for this threshold.", "threshold", thresholdToReclaim.Signal)
	}
	return evictionapi.Threshold{}, "", false
}
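
// Illustrative sketch (not part of the upstream file): a crossed threshold
// maps to its reclaimable resource via signalToResource. The threshold
// literal below is hypothetical.
//
//	t := evictionapi.Threshold{
//		Signal:   evictionapi.SignalMemoryAvailable,
//		Operator: evictionapi.OpLessThan,
//		Value:    evictionapi.ThresholdValue{Quantity: resource.NewQuantity(100<<20, resource.BinarySI)},
//	}
//	_, resourceToReclaim, ok := getReclaimableThreshold([]evictionapi.Threshold{t})
//	// ok == true, resourceToReclaim == v1.ResourceMemory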

// ParseThresholdConfig parses the flags for thresholds.
func ParseThresholdConfig(allocatableConfig []string, evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim map[string]string) ([]evictionapi.Threshold, error) {
	results := []evictionapi.Threshold{}
	hardThresholds, err := parseThresholdStatements(evictionHard)
	if err != nil {
		return nil, err
	}
	results = append(results, hardThresholds...)
	softThresholds, err := parseThresholdStatements(evictionSoft)
	if err != nil {
		return nil, err
	}
	gracePeriods, err := parseGracePeriods(evictionSoftGracePeriod)
	if err != nil {
		return nil, err
	}
	minReclaims, err := parseMinimumReclaims(evictionMinimumReclaim)
	if err != nil {
		return nil, err
	}
	for i := range softThresholds {
		signal := softThresholds[i].Signal
		period, found := gracePeriods[signal]
		if !found {
			return nil, fmt.Errorf("grace period must be specified for the soft eviction threshold %v", signal)
		}
		softThresholds[i].GracePeriod = period
	}
	results = append(results, softThresholds...)
	for i := range results {
		if minReclaim, ok := minReclaims[results[i].Signal]; ok {
			results[i].MinReclaim = &minReclaim
		}
	}
	for _, key := range allocatableConfig {
		if key == kubetypes.NodeAllocatableEnforcementKey {
			results = addAllocatableThresholds(results)
			break
		}
	}
	return results, nil
}
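
// Illustrative sketch (not part of the upstream file): how kubelet-style flag
// maps feed ParseThresholdConfig. The literal values are hypothetical, not
// kubelet defaults.
//
//	thresholds, err := ParseThresholdConfig(
//		nil, // allocatableConfig
//		map[string]string{"memory.available": "100Mi"}, // evictionHard
//		map[string]string{"nodefs.available": "10%"},   // evictionSoft
//		map[string]string{"nodefs.available": "1m30s"}, // evictionSoftGracePeriod
//		map[string]string{"nodefs.available": "500Mi"}, // evictionMinimumReclaim
//	)
//	// err == nil; one hard memory threshold plus one soft nodefs threshold
//	// with GracePeriod=1m30s and MinReclaim=500Mi.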

func addAllocatableThresholds(thresholds []evictionapi.Threshold) []evictionapi.Threshold {
	additionalThresholds := []evictionapi.Threshold{}
	for _, threshold := range thresholds {
		if threshold.Signal == evictionapi.SignalMemoryAvailable && isHardEvictionThreshold(threshold) {
			// Copy the SignalMemoryAvailable to SignalAllocatableMemoryAvailable
			additionalThresholds = append(additionalThresholds, evictionapi.Threshold{
				Signal:     evictionapi.SignalAllocatableMemoryAvailable,
				Operator:   threshold.Operator,
				Value:      threshold.Value,
				MinReclaim: threshold.MinReclaim,
			})
		}
	}
	return append(append([]evictionapi.Threshold{}, thresholds...), additionalThresholds...)
}

// UpdateContainerFsThresholds will add containerfs eviction hard/soft
// settings based on container runtime settings.
// Thresholds are initially parsed from the evictionHard and evictionSoft limits,
// so any containerfs entries found there are overridden here.
// If there is a single filesystem, then containerfs settings are the same as nodefs.
// If there is a separate image filesystem for both containers and images, then containerfs settings are the same as imagefs.
func UpdateContainerFsThresholds(thresholds []evictionapi.Threshold, imageFs, separateContainerImageFs bool) ([]evictionapi.Threshold, error) {
	hardNodeFsDisk := evictionapi.Threshold{}
	softNodeFsDisk := evictionapi.Threshold{}
	hardNodeINodeDisk := evictionapi.Threshold{}
	softNodeINodeDisk := evictionapi.Threshold{}
	hardImageFsDisk := evictionapi.Threshold{}
	softImageFsDisk := evictionapi.Threshold{}
	hardImageINodeDisk := evictionapi.Threshold{}
	softImageINodeDisk := evictionapi.Threshold{}

	hardContainerFsDisk := -1
	softContainerFsDisk := -1
	hardContainerFsINodes := -1
	softContainerFsINodes := -1
	// Find the imagefs and nodefs thresholds
	var err error
	for idx, threshold := range thresholds {
		if threshold.Signal == evictionapi.SignalImageFsAvailable && isHardEvictionThreshold(threshold) {
			hardImageFsDisk = threshold
		}
		if threshold.Signal == evictionapi.SignalImageFsAvailable && !isHardEvictionThreshold(threshold) {
			softImageFsDisk = threshold
		}
		if threshold.Signal == evictionapi.SignalImageFsInodesFree && isHardEvictionThreshold(threshold) {
			hardImageINodeDisk = threshold
		}
		if threshold.Signal == evictionapi.SignalImageFsInodesFree && !isHardEvictionThreshold(threshold) {
			softImageINodeDisk = threshold
		}
		if threshold.Signal == evictionapi.SignalNodeFsAvailable && isHardEvictionThreshold(threshold) {
			hardNodeFsDisk = threshold
		}
		if threshold.Signal == evictionapi.SignalNodeFsAvailable && !isHardEvictionThreshold(threshold) {
			softNodeFsDisk = threshold
		}
		if threshold.Signal == evictionapi.SignalNodeFsInodesFree && isHardEvictionThreshold(threshold) {
			hardNodeINodeDisk = threshold
		}
		if threshold.Signal == evictionapi.SignalNodeFsInodesFree && !isHardEvictionThreshold(threshold) {
			softNodeINodeDisk = threshold
		}
		// We log a warning and override these settings: a separate container
		// filesystem is not supported here, so containerfs limits must match
		// either nodefs or imagefs.
		if threshold.Signal == evictionapi.SignalContainerFsAvailable && isHardEvictionThreshold(threshold) {
			// join with err so earlier warnings are not dropped
			err = errors.Join(err, fmt.Errorf("found containerfs.available for hard eviction. ignoring"))
			hardContainerFsDisk = idx
		}
		if threshold.Signal == evictionapi.SignalContainerFsAvailable && !isHardEvictionThreshold(threshold) {
			err = errors.Join(err, fmt.Errorf("found containerfs.available for soft eviction. ignoring"))
			softContainerFsDisk = idx
		}
		if threshold.Signal == evictionapi.SignalContainerFsInodesFree && isHardEvictionThreshold(threshold) {
			err = errors.Join(err, fmt.Errorf("found containerfs.inodesFree for hard eviction. ignoring"))
			hardContainerFsINodes = idx
		}
		if threshold.Signal == evictionapi.SignalContainerFsInodesFree && !isHardEvictionThreshold(threshold) {
			err = errors.Join(err, fmt.Errorf("found containerfs.inodesFree for soft eviction. ignoring"))
			softContainerFsINodes = idx
		}
	}
ignoring")) 243 softContainerFsINodes = idx 244 } 245 } 246 // Either split disk case (containerfs=nodefs) or single filesystem 247 if (imageFs && separateContainerImageFs) || (!imageFs && !separateContainerImageFs) { 248 if hardContainerFsDisk != -1 { 249 thresholds[hardContainerFsDisk] = evictionapi.Threshold{ 250 Signal: evictionapi.SignalContainerFsAvailable, Operator: hardNodeFsDisk.Operator, Value: hardNodeFsDisk.Value, MinReclaim: hardNodeFsDisk.MinReclaim, 251 } 252 } else { 253 thresholds = append(thresholds, evictionapi.Threshold{ 254 Signal: evictionapi.SignalContainerFsAvailable, 255 Operator: hardNodeFsDisk.Operator, 256 Value: hardNodeFsDisk.Value, 257 MinReclaim: hardNodeFsDisk.MinReclaim, 258 }) 259 } 260 if softContainerFsDisk != -1 { 261 thresholds[softContainerFsDisk] = evictionapi.Threshold{ 262 Signal: evictionapi.SignalContainerFsAvailable, GracePeriod: softNodeFsDisk.GracePeriod, Operator: softNodeFsDisk.Operator, Value: softNodeFsDisk.Value, MinReclaim: softNodeFsDisk.MinReclaim, 263 } 264 } else { 265 thresholds = append(thresholds, evictionapi.Threshold{ 266 Signal: evictionapi.SignalContainerFsAvailable, 267 Operator: softNodeFsDisk.Operator, 268 Value: softNodeFsDisk.Value, 269 MinReclaim: softNodeFsDisk.MinReclaim, 270 GracePeriod: softNodeFsDisk.GracePeriod, 271 }) 272 } 273 if hardContainerFsINodes != -1 { 274 thresholds[hardContainerFsINodes] = evictionapi.Threshold{ 275 Signal: evictionapi.SignalContainerFsInodesFree, Operator: hardNodeINodeDisk.Operator, Value: hardNodeINodeDisk.Value, MinReclaim: hardNodeINodeDisk.MinReclaim, 276 } 277 } else { 278 thresholds = append(thresholds, evictionapi.Threshold{ 279 Signal: evictionapi.SignalContainerFsInodesFree, 280 Operator: hardNodeINodeDisk.Operator, 281 Value: hardNodeINodeDisk.Value, 282 MinReclaim: hardNodeINodeDisk.MinReclaim, 283 }) 284 } 285 if softContainerFsINodes != -1 { 286 thresholds[softContainerFsINodes] = evictionapi.Threshold{ 287 Signal: evictionapi.SignalContainerFsInodesFree, GracePeriod: softNodeINodeDisk.GracePeriod, Operator: softNodeINodeDisk.Operator, Value: softNodeINodeDisk.Value, MinReclaim: softNodeINodeDisk.MinReclaim, 288 } 289 } else { 290 thresholds = append(thresholds, evictionapi.Threshold{ 291 Signal: evictionapi.SignalContainerFsInodesFree, 292 Operator: softNodeINodeDisk.Operator, 293 Value: softNodeINodeDisk.Value, 294 MinReclaim: softNodeINodeDisk.MinReclaim, 295 GracePeriod: softNodeINodeDisk.GracePeriod, 296 }) 297 } 298 } 299 // Separate image filesystem case 300 if imageFs && !separateContainerImageFs { 301 if hardContainerFsDisk != -1 { 302 thresholds[hardContainerFsDisk] = evictionapi.Threshold{ 303 Signal: evictionapi.SignalContainerFsAvailable, Operator: hardImageFsDisk.Operator, Value: hardImageFsDisk.Value, MinReclaim: hardImageFsDisk.MinReclaim, 304 } 305 } else { 306 thresholds = append(thresholds, evictionapi.Threshold{ 307 Signal: evictionapi.SignalContainerFsAvailable, 308 Operator: hardImageFsDisk.Operator, 309 Value: hardImageFsDisk.Value, 310 MinReclaim: hardImageFsDisk.MinReclaim, 311 }) 312 } 313 if softContainerFsDisk != -1 { 314 thresholds[softContainerFsDisk] = evictionapi.Threshold{ 315 Signal: evictionapi.SignalContainerFsAvailable, GracePeriod: softImageFsDisk.GracePeriod, Operator: softImageFsDisk.Operator, Value: softImageFsDisk.Value, MinReclaim: softImageFsDisk.MinReclaim, 316 } 317 } else { 318 thresholds = append(thresholds, evictionapi.Threshold{ 319 Signal: evictionapi.SignalContainerFsAvailable, 320 Operator: softImageFsDisk.Operator, 321 

// parseThresholdStatements parses the input statements into a list of Threshold objects.
func parseThresholdStatements(statements map[string]string) ([]evictionapi.Threshold, error) {
	if len(statements) == 0 {
		return nil, nil
	}
	results := []evictionapi.Threshold{}
	for signal, val := range statements {
		result, err := parseThresholdStatement(evictionapi.Signal(signal), val)
		if err != nil {
			return nil, err
		}
		if result != nil {
			results = append(results, *result)
		}
	}
	return results, nil
}

// parseThresholdStatement parses a threshold statement and returns a threshold,
// or nil if the threshold should be ignored.
func parseThresholdStatement(signal evictionapi.Signal, val string) (*evictionapi.Threshold, error) {
	if !validSignal(signal) {
		return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
	}
	operator := evictionapi.OpForSignal[signal]
	if strings.HasSuffix(val, "%") {
		// ignore 0% and 100%
		if val == "0%" || val == "100%" {
			return nil, nil
		}
		percentage, err := parsePercentage(val)
		if err != nil {
			return nil, err
		}
		if percentage < 0 {
			return nil, fmt.Errorf("eviction percentage threshold %v must be >= 0%%: %s", signal, val)
		}
		// percentage is a float and should not be greater than 1 (100%)
		if percentage > 1 {
			return nil, fmt.Errorf("eviction percentage threshold %v must be <= 100%%: %s", signal, val)
		}
		return &evictionapi.Threshold{
			Signal:   signal,
			Operator: operator,
			Value: evictionapi.ThresholdValue{
				Percentage: percentage,
			},
		}, nil
	}
	quantity, err := resource.ParseQuantity(val)
	if err != nil {
		return nil, err
	}
	if quantity.Sign() < 0 || quantity.IsZero() {
		return nil, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity)
	}
	return &evictionapi.Threshold{
		Signal:   signal,
		Operator: operator,
		Value: evictionapi.ThresholdValue{
			Quantity: &quantity,
		},
	}, nil
}

// parsePercentage parses a string representing a percentage value
func parsePercentage(input string) (float32, error) {
	value, err := strconv.ParseFloat(strings.TrimRight(input, "%"), 32)
	if err != nil {
		return 0, err
	}
	return float32(value) / 100, nil
}
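
// Illustrative sketch (not part of the upstream file):
//
//	p, _ := parsePercentage("10%") // p == 0.1
//	// "0%" and "100%" never reach here; parseThresholdStatement filters them out.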

// parseGracePeriods parses the grace period statements
func parseGracePeriods(statements map[string]string) (map[evictionapi.Signal]time.Duration, error) {
	if len(statements) == 0 {
		return nil, nil
	}
	results := map[evictionapi.Signal]time.Duration{}
	for signal, val := range statements {
		signal := evictionapi.Signal(signal)
		if !validSignal(signal) {
			return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
		}
		gracePeriod, err := time.ParseDuration(val)
		if err != nil {
			return nil, err
		}
		if gracePeriod < 0 {
			return nil, fmt.Errorf("invalid eviction grace period specified: %v, must be a positive value", val)
		}
		results[signal] = gracePeriod
	}
	return results, nil
}

// parseMinimumReclaims parses the minimum reclaim statements
func parseMinimumReclaims(statements map[string]string) (map[evictionapi.Signal]evictionapi.ThresholdValue, error) {
	if len(statements) == 0 {
		return nil, nil
	}
	results := map[evictionapi.Signal]evictionapi.ThresholdValue{}
	for signal, val := range statements {
		signal := evictionapi.Signal(signal)
		if !validSignal(signal) {
			return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
		}
		if strings.HasSuffix(val, "%") {
			percentage, err := parsePercentage(val)
			if err != nil {
				return nil, err
			}
			if percentage <= 0 {
				return nil, fmt.Errorf("eviction percentage minimum reclaim %v must be positive: %s", signal, val)
			}
			results[signal] = evictionapi.ThresholdValue{
				Percentage: percentage,
			}
			continue
		}
		quantity, err := resource.ParseQuantity(val)
		if err != nil {
			return nil, err
		}
		if quantity.Sign() < 0 {
			return nil, fmt.Errorf("negative eviction minimum reclaim specified for %v", signal)
		}
		results[signal] = evictionapi.ThresholdValue{
			Quantity: &quantity,
		}
	}
	return results, nil
}
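
// Illustrative sketch (not part of the upstream file): parsing the two
// auxiliary statement maps. The inputs are hypothetical.
//
//	gp, _ := parseGracePeriods(map[string]string{"memory.available": "30s"})
//	// gp[evictionapi.SignalMemoryAvailable] == 30 * time.Second
//
//	mr, _ := parseMinimumReclaims(map[string]string{"nodefs.available": "5%"})
//	// mr[evictionapi.SignalNodeFsAvailable].Percentage == 0.05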
Quantity: &quantity, 485 } 486 } 487 return results, nil 488 } 489 490 // diskUsage converts used bytes into a resource quantity. 491 func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity { 492 if fsStats == nil || fsStats.UsedBytes == nil { 493 return &resource.Quantity{Format: resource.BinarySI} 494 } 495 usage := int64(*fsStats.UsedBytes) 496 return resource.NewQuantity(usage, resource.BinarySI) 497 } 498 499 // inodeUsage converts inodes consumed into a resource quantity. 500 func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity { 501 if fsStats == nil || fsStats.InodesUsed == nil { 502 return &resource.Quantity{Format: resource.DecimalSI} 503 } 504 usage := int64(*fsStats.InodesUsed) 505 return resource.NewQuantity(usage, resource.DecimalSI) 506 } 507 508 // memoryUsage converts working set into a resource quantity. 509 func memoryUsage(memStats *statsapi.MemoryStats) *resource.Quantity { 510 if memStats == nil || memStats.WorkingSetBytes == nil { 511 return &resource.Quantity{Format: resource.BinarySI} 512 } 513 usage := int64(*memStats.WorkingSetBytes) 514 return resource.NewQuantity(usage, resource.BinarySI) 515 } 516 517 // processUsage converts working set into a process count. 518 func processUsage(processStats *statsapi.ProcessStats) uint64 { 519 if processStats == nil || processStats.ProcessCount == nil { 520 return 0 521 } 522 usage := uint64(*processStats.ProcessCount) 523 return usage 524 } 525 526 // localVolumeNames returns the set of volumes for the pod that are local 527 // TODO: summary API should report what volumes consume local storage rather than hard-code here. 528 func localVolumeNames(pod *v1.Pod) []string { 529 result := []string{} 530 for _, volume := range pod.Spec.Volumes { 531 if volume.HostPath != nil || 532 volumeutils.IsLocalEphemeralVolume(volume) { 533 result = append(result, volume.Name) 534 } 535 } 536 return result 537 } 538 539 // containerUsage aggregates container disk usage and inode consumption for the specified stats to measure. 540 func containerUsage(podStats statsapi.PodStats, statsToMeasure []fsStatsType) v1.ResourceList { 541 disk := resource.Quantity{Format: resource.BinarySI} 542 inodes := resource.Quantity{Format: resource.DecimalSI} 543 for _, container := range podStats.Containers { 544 if hasFsStatsType(statsToMeasure, fsStatsRoot) { 545 disk.Add(*diskUsage(container.Rootfs)) 546 inodes.Add(*inodeUsage(container.Rootfs)) 547 } 548 if hasFsStatsType(statsToMeasure, fsStatsLogs) { 549 disk.Add(*diskUsage(container.Logs)) 550 inodes.Add(*inodeUsage(container.Logs)) 551 } 552 } 553 return v1.ResourceList{ 554 v1.ResourceEphemeralStorage: disk, 555 resourceInodes: inodes, 556 } 557 } 558 559 // podLocalVolumeUsage aggregates pod local volumes disk usage and inode consumption for the specified stats to measure. 560 func podLocalVolumeUsage(volumeNames []string, podStats statsapi.PodStats) v1.ResourceList { 561 disk := resource.Quantity{Format: resource.BinarySI} 562 inodes := resource.Quantity{Format: resource.DecimalSI} 563 for _, volumeName := range volumeNames { 564 for _, volumeStats := range podStats.VolumeStats { 565 if volumeStats.Name == volumeName { 566 disk.Add(*diskUsage(&volumeStats.FsStats)) 567 inodes.Add(*inodeUsage(&volumeStats.FsStats)) 568 break 569 } 570 } 571 } 572 return v1.ResourceList{ 573 v1.ResourceEphemeralStorage: disk, 574 resourceInodes: inodes, 575 } 576 } 577 578 // podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure. 

// podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure.
func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
	disk := resource.Quantity{Format: resource.BinarySI}
	inodes := resource.Quantity{Format: resource.DecimalSI}

	containerUsageList := containerUsage(podStats, statsToMeasure)
	disk.Add(containerUsageList[v1.ResourceEphemeralStorage])
	inodes.Add(containerUsageList[resourceInodes])

	if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
		volumeNames := localVolumeNames(pod)
		podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats)
		disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage])
		inodes.Add(podLocalVolumeUsageList[resourceInodes])
	}
	return v1.ResourceList{
		v1.ResourceEphemeralStorage: disk,
		resourceInodes:              inodes,
	}, nil
}

// formatThreshold formats a threshold for logging.
func formatThreshold(threshold evictionapi.Threshold) string {
	return fmt.Sprintf("threshold(signal=%v, operator=%v, value=%v, gracePeriod=%v)", threshold.Signal, threshold.Operator, evictionapi.ThresholdValue(threshold.Value), threshold.GracePeriod)
}

// cachedStatsFunc returns a statsFunc based on the provided pod stats.
func cachedStatsFunc(podStats []statsapi.PodStats) statsFunc {
	uid2PodStats := map[string]statsapi.PodStats{}
	for i := range podStats {
		uid2PodStats[podStats[i].PodRef.UID] = podStats[i]
	}
	return func(pod *v1.Pod) (statsapi.PodStats, bool) {
		stats, found := uid2PodStats[string(pod.UID)]
		return stats, found
	}
}

// cmpFunc compares p1 and p2 and returns:
//
//	-1 if p1 < p2
//	 0 if p1 == p2
//	+1 if p1 > p2
type cmpFunc func(p1, p2 *v1.Pod) int

// multiSorter implements the Sort interface, sorting the pods within.
type multiSorter struct {
	pods []*v1.Pod
	cmp  []cmpFunc
}

// Sort sorts the argument slice according to the less functions passed to orderedBy.
func (ms *multiSorter) Sort(pods []*v1.Pod) {
	ms.pods = pods
	sort.Sort(ms)
}

// orderedBy returns a Sorter that sorts using the cmp functions, in order.
// Call its Sort method to sort the data.
func orderedBy(cmp ...cmpFunc) *multiSorter {
	return &multiSorter{
		cmp: cmp,
	}
}

// Len is part of sort.Interface.
func (ms *multiSorter) Len() int {
	return len(ms.pods)
}

// Swap is part of sort.Interface.
func (ms *multiSorter) Swap(i, j int) {
	ms.pods[i], ms.pods[j] = ms.pods[j], ms.pods[i]
}

// Less is part of sort.Interface.
func (ms *multiSorter) Less(i, j int) bool {
	p1, p2 := ms.pods[i], ms.pods[j]
	var k int
	for k = 0; k < len(ms.cmp)-1; k++ {
		cmpResult := ms.cmp[k](p1, p2)
		// p1 is less than p2
		if cmpResult < 0 {
			return true
		}
		// p1 is greater than p2
		if cmpResult > 0 {
			return false
		}
		// we don't know yet
	}
	// the last cmp func is the final decider
	return ms.cmp[k](p1, p2) < 0
}
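
// Illustrative sketch (not part of the upstream file): composing comparators
// with orderedBy, as the rank functions below do. A tie on one comparator
// falls through to the next.
//
//	orderedBy(exceedMemoryRequests(stats), priority, memory(stats)).Sort(pods)
//	// pods[0] is now the strongest eviction candidate.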

// priority compares pods by their Priority values.
func priority(p1, p2 *v1.Pod) int {
	priority1 := corev1helpers.PodPriority(p1)
	priority2 := corev1helpers.PodPriority(p2)
	if priority1 == priority2 {
		return 0
	}
	if priority1 > priority2 {
		return 1
	}
	return -1
}

// exceedMemoryRequests compares whether or not pods' memory usage exceeds their requests
func exceedMemoryRequests(stats statsFunc) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}

		p1Memory := memoryUsage(p1Stats.Memory)
		p2Memory := memoryUsage(p2Stats.Memory)
		p1ExceedsRequests := p1Memory.Cmp(v1resource.GetResourceRequestQuantity(p1, v1.ResourceMemory)) == 1
		p2ExceedsRequests := p2Memory.Cmp(v1resource.GetResourceRequestQuantity(p2, v1.ResourceMemory)) == 1
		// prioritize evicting the pod which exceeds its requests
		return cmpBool(p1ExceedsRequests, p2ExceedsRequests)
	}
}

// memory compares pods by largest consumer of memory relative to request.
func memory(stats statsFunc) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}

		// adjust p1, p2 usage relative to the request (if any)
		p1Memory := memoryUsage(p1Stats.Memory)
		p1Request := v1resource.GetResourceRequestQuantity(p1, v1.ResourceMemory)
		p1Memory.Sub(p1Request)

		p2Memory := memoryUsage(p2Stats.Memory)
		p2Request := v1resource.GetResourceRequestQuantity(p2, v1.ResourceMemory)
		p2Memory.Sub(p2Request)

		// prioritize evicting the pod which has the larger consumption of memory
		return p2Memory.Cmp(*p1Memory)
	}
}

// process compares pods by largest consumer of processes.
func process(stats statsFunc) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}

		p1Process := processUsage(p1Stats.ProcessStats)
		p2Process := processUsage(p2Stats.ProcessStats)
		// prioritize evicting the pod which has the larger process count
		// (compare explicitly rather than subtracting, which would rely on
		// unsigned wraparound when p1Process > p2Process)
		switch {
		case p1Process > p2Process:
			return -1
		case p1Process < p2Process:
			return 1
		default:
			return 0
		}
	}
}

// exceedDiskRequests compares whether or not pods' disk usage exceeds their requests
func exceedDiskRequests(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}

		p1Usage, p1Err := podDiskUsage(p1Stats, p1, fsStatsToMeasure)
		p2Usage, p2Err := podDiskUsage(p2Stats, p2, fsStatsToMeasure)
		if p1Err != nil || p2Err != nil {
			// prioritize evicting the pod which had an error getting stats
			return cmpBool(p1Err != nil, p2Err != nil)
		}

		p1Disk := p1Usage[diskResource]
		p2Disk := p2Usage[diskResource]
		p1ExceedsRequests := p1Disk.Cmp(v1resource.GetResourceRequestQuantity(p1, diskResource)) == 1
		p2ExceedsRequests := p2Disk.Cmp(v1resource.GetResourceRequestQuantity(p2, diskResource)) == 1
		// prioritize evicting the pod which exceeds its requests
		return cmpBool(p1ExceedsRequests, p2ExceedsRequests)
	}
}

// disk compares pods by largest consumer of disk relative to request for the specified disk resource.
func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}
		p1Usage, p1Err := podDiskUsage(p1Stats, p1, fsStatsToMeasure)
		p2Usage, p2Err := podDiskUsage(p2Stats, p2, fsStatsToMeasure)
		if p1Err != nil || p2Err != nil {
			// prioritize evicting the pod which had an error getting stats
			return cmpBool(p1Err != nil, p2Err != nil)
		}

		// adjust p1, p2 usage relative to the request (if any)
		p1Disk := p1Usage[diskResource]
		p2Disk := p2Usage[diskResource]
		p1Request := v1resource.GetResourceRequestQuantity(p1, v1.ResourceEphemeralStorage)
		p1Disk.Sub(p1Request)
		p2Request := v1resource.GetResourceRequestQuantity(p2, v1.ResourceEphemeralStorage)
		p2Disk.Sub(p2Request)
		// prioritize evicting the pod which has the larger consumption of disk
		return p2Disk.Cmp(p1Disk)
	}
}

// cmpBool compares booleans, placing true before false
func cmpBool(a, b bool) int {
	if a == b {
		return 0
	}
	if !b {
		return -1
	}
	return 1
}
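
// Illustrative sketch (not part of the upstream file): cmpBool places true
// before false, so the "worse" pod sorts first.
//
//	cmpBool(true, false) // -1: first pod ranks earlier (evicted sooner)
//	cmpBool(false, true) // +1: second pod ranks earlier
//	cmpBool(true, true)  //  0: defer to the next comparator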

// rankMemoryPressure orders the input pods for eviction in response to memory pressure.
// It ranks by whether or not the pod's usage exceeds its requests, then by priority, and
// finally by memory usage above requests.
func rankMemoryPressure(pods []*v1.Pod, stats statsFunc) {
	orderedBy(exceedMemoryRequests(stats), priority, memory(stats)).Sort(pods)
}

// rankPIDPressure orders the input pods by priority, then by process usage, in response to PID pressure.
func rankPIDPressure(pods []*v1.Pod, stats statsFunc) {
	orderedBy(priority, process(stats)).Sort(pods)
}

// rankDiskPressureFunc returns a rankFunc that measures the specified fs stats.
func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) rankFunc {
	return func(pods []*v1.Pod, stats statsFunc) {
		orderedBy(exceedDiskRequests(stats, fsStatsToMeasure, diskResource), priority, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods)
	}
}

// byEvictionPriority implements sort.Interface for []evictionapi.Threshold.
type byEvictionPriority []evictionapi.Threshold

func (a byEvictionPriority) Len() int      { return len(a) }
func (a byEvictionPriority) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

// Less ranks memory before all other resources, and ranks thresholds with no resource to reclaim last
func (a byEvictionPriority) Less(i, j int) bool {
	_, jSignalHasResource := signalToResource[a[j].Signal]
	return a[i].Signal == evictionapi.SignalMemoryAvailable || a[i].Signal == evictionapi.SignalAllocatableMemoryAvailable || !jSignalHasResource
}

// makeSignalObservations derives observations using the specified summary provider.
func makeSignalObservations(summary *statsapi.Summary) (signalObservations, statsFunc) {
	// build the function to work against for pod stats
	statsFunc := cachedStatsFunc(summary.Pods)
	// build an evaluation context for current eviction signals
	result := signalObservations{}

	if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
		result[evictionapi.SignalMemoryAvailable] = signalObservation{
			available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
			capacity:  resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
			time:      memory.Time,
		}
	}
	if allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods); err != nil {
		klog.ErrorS(err, "Eviction manager: failed to construct signal", "signal", evictionapi.SignalAllocatableMemoryAvailable)
	} else {
		if memory := allocatableContainer.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
			result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{
				available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
				capacity:  resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
				time:      memory.Time,
			}
		}
	}
	if nodeFs := summary.Node.Fs; nodeFs != nil {
		if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil {
			result[evictionapi.SignalNodeFsAvailable] = signalObservation{
				available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI),
				capacity:  resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI),
				time:      nodeFs.Time,
			}
		}
		if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
			result[evictionapi.SignalNodeFsInodesFree] = signalObservation{
				available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.DecimalSI),
				capacity:  resource.NewQuantity(int64(*nodeFs.Inodes), resource.DecimalSI),
				time:      nodeFs.Time,
			}
		}
	}
	if summary.Node.Runtime != nil {
		if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil {
			if imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil {
				result[evictionapi.SignalImageFsAvailable] = signalObservation{
					available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI),
					capacity:  resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI),
					time:      imageFs.Time,
				}
			}
			if imageFs.InodesFree != nil && imageFs.Inodes != nil {
				result[evictionapi.SignalImageFsInodesFree] = signalObservation{
					available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.DecimalSI),
					capacity:  resource.NewQuantity(int64(*imageFs.Inodes), resource.DecimalSI),
					time:      imageFs.Time,
				}
			}
		}
		if containerFs := summary.Node.Runtime.ContainerFs; containerFs != nil {
			if containerFs.AvailableBytes != nil && containerFs.CapacityBytes != nil {
				result[evictionapi.SignalContainerFsAvailable] = signalObservation{
					available: resource.NewQuantity(int64(*containerFs.AvailableBytes), resource.BinarySI),
					capacity:  resource.NewQuantity(int64(*containerFs.CapacityBytes), resource.BinarySI),
					time:      containerFs.Time,
				}
			}
			if containerFs.InodesFree != nil && containerFs.Inodes != nil {
				result[evictionapi.SignalContainerFsInodesFree] = signalObservation{
					available: resource.NewQuantity(int64(*containerFs.InodesFree), resource.DecimalSI),
					capacity:  resource.NewQuantity(int64(*containerFs.Inodes), resource.DecimalSI),
					time:      containerFs.Time,
				}
			}
		}
	}
	if rlimit := summary.Node.Rlimit; rlimit != nil {
		if rlimit.NumOfRunningProcesses != nil && rlimit.MaxPID != nil {
			available := int64(*rlimit.MaxPID) - int64(*rlimit.NumOfRunningProcesses)
			result[evictionapi.SignalPIDAvailable] = signalObservation{
				available: resource.NewQuantity(available, resource.DecimalSI),
				capacity:  resource.NewQuantity(int64(*rlimit.MaxPID), resource.DecimalSI),
				time:      rlimit.Time,
			}
		}
	}
	return result, statsFunc
}

func getSysContainer(sysContainers []statsapi.ContainerStats, name string) (*statsapi.ContainerStats, error) {
	for _, cont := range sysContainers {
		if cont.Name == name {
			return &cont, nil
		}
	}
	return nil, fmt.Errorf("system container %q not found in metrics", name)
}
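
// Illustrative sketch (not part of the upstream file): reading one signal out
// of the observations built above; summary is a hypothetical *statsapi.Summary.
//
//	observations, statsFn := makeSignalObservations(summary)
//	if obs, ok := observations[evictionapi.SignalMemoryAvailable]; ok {
//		fmt.Printf("available=%s capacity=%s\n", obs.available, obs.capacity)
//	}
//	_ = statsFn // per-pod stats lookup used by the ranking functions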

// thresholdsMet returns the set of thresholds that were met independent of grace period
func thresholdsMet(thresholds []evictionapi.Threshold, observations signalObservations, enforceMinReclaim bool) []evictionapi.Threshold {
	results := []evictionapi.Threshold{}
	for i := range thresholds {
		threshold := thresholds[i]
		observed, found := observations[threshold.Signal]
		if !found {
			klog.InfoS("Eviction manager: no observation found for eviction signal", "signal", threshold.Signal)
			continue
		}
		// determine if we have met the specified threshold
		thresholdMet := false
		quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
		// if enforceMinReclaim is specified, we compare relative to value - minreclaim
		if enforceMinReclaim && threshold.MinReclaim != nil {
			quantity.Add(*evictionapi.GetThresholdQuantity(*threshold.MinReclaim, observed.capacity))
		}
		thresholdResult := quantity.Cmp(*observed.available)
		switch threshold.Operator {
		case evictionapi.OpLessThan:
			thresholdMet = thresholdResult > 0
		}
		if thresholdMet {
			results = append(results, threshold)
		}
	}
	return results
}

func debugLogObservations(logPrefix string, observations signalObservations) {
	klogV := klog.V(3)
	if !klogV.Enabled() {
		return
	}
	for k, v := range observations {
		if !v.time.IsZero() {
			klogV.InfoS("Eviction manager:", "log", logPrefix, "signal", k, "resourceName", signalToResource[k], "available", v.available, "capacity", v.capacity, "time", v.time)
		} else {
			klogV.InfoS("Eviction manager:", "log", logPrefix, "signal", k, "resourceName", signalToResource[k], "available", v.available, "capacity", v.capacity)
		}
	}
}

func debugLogThresholdsWithObservation(logPrefix string, thresholds []evictionapi.Threshold, observations signalObservations) {
	klogV := klog.V(3)
	if !klogV.Enabled() {
		return
	}
	for i := range thresholds {
		threshold := thresholds[i]
		observed, found := observations[threshold.Signal]
		if found {
			quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
			klogV.InfoS("Eviction manager: threshold observed resource", "log", logPrefix, "signal", threshold.Signal, "resourceName", signalToResource[threshold.Signal], "quantity", quantity, "available", observed.available)
		} else {
			klogV.InfoS("Eviction manager: threshold had no observation", "log", logPrefix, "signal", threshold.Signal)
		}
	}
}
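
// Illustrative sketch (not part of the upstream file): for thresholdsMet
// above, a memory.available threshold of 100Mi is met when only 50Mi is
// observed available; with enforceMinReclaim and MinReclaim=200Mi the
// effective cutoff becomes 300Mi.
//
//	met := thresholdsMet(thresholds, observations, false)
//	// met contains only thresholds whose operator is currently satisfied.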

func thresholdsUpdatedStats(thresholds []evictionapi.Threshold, observations, lastObservations signalObservations) []evictionapi.Threshold {
	results := []evictionapi.Threshold{}
	for i := range thresholds {
		threshold := thresholds[i]
		observed, found := observations[threshold.Signal]
		if !found {
			klog.InfoS("Eviction manager: no observation found for eviction signal", "signal", threshold.Signal)
			continue
		}
		last, found := lastObservations[threshold.Signal]
		if !found || observed.time.IsZero() || observed.time.After(last.time.Time) {
			results = append(results, threshold)
		}
	}
	return results
}

// thresholdsFirstObservedAt merges the input set of thresholds with the previous observation to determine when the active set of thresholds was initially met.
func thresholdsFirstObservedAt(thresholds []evictionapi.Threshold, lastObservedAt thresholdsObservedAt, now time.Time) thresholdsObservedAt {
	results := thresholdsObservedAt{}
	for i := range thresholds {
		observedAt, found := lastObservedAt[thresholds[i]]
		if !found {
			observedAt = now
		}
		results[thresholds[i]] = observedAt
	}
	return results
}

// thresholdsMetGracePeriod returns the set of thresholds that have satisfied their associated grace period
func thresholdsMetGracePeriod(observedAt thresholdsObservedAt, now time.Time) []evictionapi.Threshold {
	results := []evictionapi.Threshold{}
	for threshold, at := range observedAt {
		duration := now.Sub(at)
		if duration < threshold.GracePeriod {
			klog.V(2).InfoS("Eviction manager: eviction criteria not yet met", "threshold", formatThreshold(threshold), "duration", duration)
			continue
		}
		results = append(results, threshold)
	}
	return results
}
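
// Illustrative sketch (not part of the upstream file): tracking when a soft
// threshold was first seen, then enforcing its grace period.
//
//	observedAt := thresholdsFirstObservedAt(met, lastObservedAt, time.Now())
//	ready := thresholdsMetGracePeriod(observedAt, time.Now())
//	// ready omits thresholds that have been met for less than their GracePeriod.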

// nodeConditions returns the set of node conditions associated with a threshold
func nodeConditions(thresholds []evictionapi.Threshold) []v1.NodeConditionType {
	results := []v1.NodeConditionType{}
	for _, threshold := range thresholds {
		if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found {
			if !hasNodeCondition(results, nodeCondition) {
				results = append(results, nodeCondition)
			}
		}
	}
	return results
}

// nodeConditionsLastObservedAt merges the input with the previous observation to determine when a condition was most recently met.
func nodeConditionsLastObservedAt(nodeConditions []v1.NodeConditionType, lastObservedAt nodeConditionsObservedAt, now time.Time) nodeConditionsObservedAt {
	results := nodeConditionsObservedAt{}
	// the input conditions were observed "now"
	for i := range nodeConditions {
		results[nodeConditions[i]] = now
	}
	// the conditions that were not observed now are merged in with their old time
	for key, value := range lastObservedAt {
		_, found := results[key]
		if !found {
			results[key] = value
		}
	}
	return results
}

// nodeConditionsObservedSince returns the set of conditions that have been observed within the specified period
func nodeConditionsObservedSince(observedAt nodeConditionsObservedAt, period time.Duration, now time.Time) []v1.NodeConditionType {
	results := []v1.NodeConditionType{}
	for nodeCondition, at := range observedAt {
		duration := now.Sub(at)
		if duration < period {
			results = append(results, nodeCondition)
		}
	}
	return results
}

// hasFsStatsType returns true if the fsStat is in the input list
func hasFsStatsType(inputs []fsStatsType, item fsStatsType) bool {
	for _, input := range inputs {
		if input == item {
			return true
		}
	}
	return false
}

// hasNodeCondition returns true if the node condition is in the input list
func hasNodeCondition(inputs []v1.NodeConditionType, item v1.NodeConditionType) bool {
	for _, input := range inputs {
		if input == item {
			return true
		}
	}
	return false
}

// mergeThresholds will merge both threshold lists eliminating duplicates.
func mergeThresholds(inputsA []evictionapi.Threshold, inputsB []evictionapi.Threshold) []evictionapi.Threshold {
	results := inputsA
	for _, threshold := range inputsB {
		if !hasThreshold(results, threshold) {
			results = append(results, threshold)
		}
	}
	return results
}

// hasThreshold returns true if the threshold is in the input list
func hasThreshold(inputs []evictionapi.Threshold, item evictionapi.Threshold) bool {
	for _, input := range inputs {
		if input.GracePeriod == item.GracePeriod && input.Operator == item.Operator && input.Signal == item.Signal && compareThresholdValue(input.Value, item.Value) {
			return true
		}
	}
	return false
}

// compareThresholdValue returns true if the two thresholdValue objects are logically the same
func compareThresholdValue(a evictionapi.ThresholdValue, b evictionapi.ThresholdValue) bool {
	if a.Quantity != nil {
		if b.Quantity == nil {
			return false
		}
		return a.Quantity.Cmp(*b.Quantity) == 0
	}
	if b.Quantity != nil {
		return false
	}
	return a.Percentage == b.Percentage
}

// isHardEvictionThreshold returns true if eviction should immediately occur
func isHardEvictionThreshold(threshold evictionapi.Threshold) bool {
	return threshold.GracePeriod == time.Duration(0)
}

func isAllocatableEvictionThreshold(threshold evictionapi.Threshold) bool {
	return threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable
}

// buildSignalToRankFunc returns ranking functions associated with resources
func buildSignalToRankFunc(withImageFs bool, imageContainerSplitFs bool) map[evictionapi.Signal]rankFunc {
	signalToRankFunc := map[evictionapi.Signal]rankFunc{
		evictionapi.SignalMemoryAvailable:            rankMemoryPressure,
		evictionapi.SignalAllocatableMemoryAvailable: rankMemoryPressure,
		evictionapi.SignalPIDAvailable:               rankPIDPressure,
	}
	// usage of an imagefs is optional.
	// If we have a dedicated imagefs (images and containers on the same disk),
	// we treat it as a single separate imagefs.
	if withImageFs && !imageContainerSplitFs {
		// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
		signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
		// with an imagefs, imagefs pod rank func for eviction includes rootfs and images
		signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsImages}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsImages}, resourceInodes)
		signalToRankFunc[evictionapi.SignalContainerFsAvailable] = signalToRankFunc[evictionapi.SignalImageFsAvailable]
		signalToRankFunc[evictionapi.SignalContainerFsInodesFree] = signalToRankFunc[evictionapi.SignalImageFsInodesFree]

		// If imagefs and containerfs are on separate disks,
		// we track the writeable layer in the containerfs signals.
	} else if withImageFs && imageContainerSplitFs {
		// with a split filesystem, nodefs pod rank func for eviction includes logs, local volumes, and the writeable layer (rootfs)
		signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource, fsStatsRoot}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource, fsStatsRoot}, resourceInodes)
		signalToRankFunc[evictionapi.SignalContainerFsAvailable] = signalToRankFunc[evictionapi.SignalNodeFsAvailable]
		signalToRankFunc[evictionapi.SignalContainerFsInodesFree] = signalToRankFunc[evictionapi.SignalNodeFsInodesFree]
		// with a split filesystem, imagefs pod rank func for eviction only includes images
		signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsImages}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsImages}, resourceInodes)
		// If imagefs shares the root disk, fall through to the shared case below.
	} else {
		// without an imagefs, nodefs pod rank func for eviction looks at all fs stats.
		// since imagefs and nodefs share a common device, they share common ranking functions.
		signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
		signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
		signalToRankFunc[evictionapi.SignalContainerFsAvailable] = signalToRankFunc[evictionapi.SignalNodeFsAvailable]
		signalToRankFunc[evictionapi.SignalContainerFsInodesFree] = signalToRankFunc[evictionapi.SignalNodeFsInodesFree]
	}
	return signalToRankFunc
}

// PodIsEvicted returns true if the reported pod status is due to an eviction.
func PodIsEvicted(podStatus v1.PodStatus) bool {
	return podStatus.Phase == v1.PodFailed && podStatus.Reason == Reason
}
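
// Illustrative sketch (not part of the upstream file): selecting ranking
// functions for a node with a dedicated imagefs and no split containerfs.
//
//	ranks := buildSignalToRankFunc(true, false)
//	ranks[evictionapi.SignalNodeFsAvailable](pods, statsFn)
//	// nodefs ranking measured only logs and local volumes in this configuration.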

// buildSignalToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildSignalToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool, splitContainerImageFs bool) map[evictionapi.Signal]nodeReclaimFuncs {
	signalToReclaimFunc := map[evictionapi.Signal]nodeReclaimFuncs{}
	// usage of an imagefs is optional
	if withImageFs && !splitContainerImageFs {
		// with an imagefs, there is nothing to reclaim at the node level for
		// nodefs pressure; only pod eviction frees nodefs
		signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{}
		signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{}
		// with an imagefs, imagefs pressure should delete unused images
		signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		// usage of imagefs and containerfs on separate disks:
		// containers gc on containerfs pressure
		// image gc on imagefs pressure
	} else if withImageFs && splitContainerImageFs {
		// with an imagefs, imagefs pressure should delete unused images
		signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{imageGC.DeleteUnusedImages}
		// with a split fs and imagefs, containerfs pressure should delete unused containers
		signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers}
		signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers}
	} else {
		// without an imagefs, nodefs pressure should delete unused containers and unused images.
		// since imagefs, containerfs and nodefs share a common device, they share common reclaim functions
		signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
	}
	return signalToReclaimFunc
}
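
// Illustrative sketch (not part of the upstream file): on a single-filesystem
// node every disk signal reclaims both unused containers and unused images.
//
//	reclaim := buildSignalToNodeReclaimFuncs(imageGC, containerGC, false, false)
//	// len(reclaim[evictionapi.SignalNodeFsAvailable]) == 2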

// evictionMessage constructs a useful message about why an eviction occurred, and annotations to provide metadata about the eviction
func evictionMessage(resourceToReclaim v1.ResourceName, pod *v1.Pod, stats statsFunc, thresholds []evictionapi.Threshold, observations signalObservations) (message string, annotations map[string]string) {
	annotations = make(map[string]string)
	message = fmt.Sprintf(nodeLowMessageFmt, resourceToReclaim)
	quantity, available := getThresholdMetInfo(resourceToReclaim, thresholds, observations)
	if quantity != nil && available != nil {
		message += fmt.Sprintf(thresholdMetMessageFmt, quantity, available)
	}
	containers := []string{}
	containerUsage := []string{}
	podStats, ok := stats(pod)
	if !ok {
		return
	}
	for _, containerStats := range podStats.Containers {
		for _, container := range pod.Spec.Containers {
			if container.Name == containerStats.Name {
				requests := container.Resources.Requests[resourceToReclaim]
				if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) &&
					(resourceToReclaim == v1.ResourceMemory || resourceToReclaim == v1.ResourceCPU) {
					if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
						requests = cs.AllocatedResources[resourceToReclaim]
					}
				}
				var usage *resource.Quantity
				switch resourceToReclaim {
				case v1.ResourceEphemeralStorage:
					if containerStats.Rootfs != nil && containerStats.Rootfs.UsedBytes != nil && containerStats.Logs != nil && containerStats.Logs.UsedBytes != nil {
						usage = resource.NewQuantity(int64(*containerStats.Rootfs.UsedBytes+*containerStats.Logs.UsedBytes), resource.BinarySI)
					}
				case v1.ResourceMemory:
					if containerStats.Memory != nil && containerStats.Memory.WorkingSetBytes != nil {
						usage = resource.NewQuantity(int64(*containerStats.Memory.WorkingSetBytes), resource.BinarySI)
					}
				}
				if usage != nil && usage.Cmp(requests) > 0 {
					message += fmt.Sprintf(containerMessageFmt, container.Name, usage.String(), requests.String(), resourceToReclaim)
					containers = append(containers, container.Name)
					containerUsage = append(containerUsage, usage.String())
				}
			}
		}
	}
	annotations[OffendingContainersKey] = strings.Join(containers, ",")
	annotations[OffendingContainersUsageKey] = strings.Join(containerUsage, ",")
	annotations[StarvedResourceKey] = string(resourceToReclaim)
	return
}

// getThresholdMetInfo gets the threshold quantity and the available quantity for the resource to reclaim
func getThresholdMetInfo(resourceToReclaim v1.ResourceName, thresholds []evictionapi.Threshold, observations signalObservations) (quantity *resource.Quantity, available *resource.Quantity) {
	for i := range thresholds {
		threshold := thresholds[i]
		if signalToResource[threshold.Signal] == resourceToReclaim {
			observed, found := observations[threshold.Signal]
			if found {
				quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
				return quantity, observed.available
			}
		}
	}
	return nil, nil
}
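
// Illustrative sketch (not part of the upstream file): a typical assembled
// message (values hypothetical):
//
//	msg, annotations := evictionMessage(v1.ResourceMemory, pod, statsFn, thresholds, observations)
//	// msg: "The node was low on resource: memory. Threshold quantity: 100Mi,
//	// available: 50Mi. Container app was using 1Gi, request is 512Mi, has
//	// larger consumption of memory. "
//	// annotations: offending_containers=app, offending_containers_usage=1Gi,
//	// starved_resource=memory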