k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/cm/container_manager_linux.go

//go:build linux
// +build linux

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	"bytes"
	"context"
	"fmt"
	"os"
	"path"
	"strings"
	"sync"
	"time"

	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/cgroups/manager"
	"github.com/opencontainers/runc/libcontainer/configs"
	"k8s.io/klog/v2"
	"k8s.io/mount-utils"
	utilpath "k8s.io/utils/path"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/apimachinery/pkg/types"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/record"
	utilsysctl "k8s.io/component-helpers/node/util/sysctl"
	internalapi "k8s.io/cri-api/pkg/apis"
	podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
	kubefeatures "k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/cadvisor"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
	"k8s.io/kubernetes/pkg/kubelet/cm/dra"
	"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager"
	memorymanagerstate "k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
	cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
	"k8s.io/kubernetes/pkg/kubelet/config"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
	"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
	"k8s.io/kubernetes/pkg/kubelet/status"
	"k8s.io/kubernetes/pkg/kubelet/userns/inuserns"
	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/util/oom"
)

// A non-user container tracked by the Kubelet.
type systemContainer struct {
	// Absolute name of the container.
	name string

	// CPU limit in millicores.
	cpuMillicores int64

	// Function that ensures the state of the container.
	// m is the cgroup manager for the specified container.
	ensureStateFunc func(m cgroups.Manager) error

	// Manager for the cgroups of the external container.
	manager cgroups.Manager
}

func newSystemCgroups(containerName string) (*systemContainer, error) {
	manager, err := createManager(containerName)
	if err != nil {
		return nil, err
	}
	return &systemContainer{
		name:    containerName,
		manager: manager,
	}, nil
}

type containerManagerImpl struct {
	sync.RWMutex
	cadvisorInterface cadvisor.Interface
	mountUtil         mount.Interface
	NodeConfig
	status Status
	// External containers being managed.
	systemContainers []*systemContainer
	// Tasks that are run periodically
	periodicTasks []func()
	// Holds all the mounted cgroup subsystems
	subsystems *CgroupSubsystems
	nodeInfo   *v1.Node
	// Interface for cgroup management
	cgroupManager CgroupManager
	// Capacity of this node.
	capacity v1.ResourceList
	// Capacity of this node, including internal resources.
	internalCapacity v1.ResourceList
	// Absolute cgroupfs path to a cgroup that Kubelet needs to place all pods under.
	// This path includes a top level container for enforcing Node Allocatable.
	cgroupRoot CgroupName
	// Event recorder interface.
	recorder record.EventRecorder
	// Interface for QoS cgroup management
	qosContainerManager QOSContainerManager
	// Interface for exporting and allocating devices reported by device plugins.
	deviceManager devicemanager.Manager
	// Interface for CPU affinity management.
	cpuManager cpumanager.Manager
	// Interface for memory affinity management.
	memoryManager memorymanager.Manager
	// Interface for Topology resource coordination.
	topologyManager topologymanager.Manager
	// Interface for Dynamic Resource Allocation management.
	draManager dra.Manager
}

type features struct {
	cpuHardcapping bool
}

var _ ContainerManager = &containerManagerImpl{}

// validateSystemRequirements checks if the required cgroup subsystems are mounted.
// As of now, only 'cpu' and 'memory' are required.
// cpu quota is a soft requirement.
func validateSystemRequirements(mountUtil mount.Interface) (features, error) {
	const (
		cgroupMountType = "cgroup"
		localErr        = "system validation failed"
	)
	var (
		cpuMountPoint string
		f             features
	)
	mountPoints, err := mountUtil.List()
	if err != nil {
		return f, fmt.Errorf("%s - %v", localErr, err)
	}

	if cgroups.IsCgroup2UnifiedMode() {
		f.cpuHardcapping = true
		return f, nil
	}

	expectedCgroups := sets.New("cpu", "cpuacct", "cpuset", "memory")
	for _, mountPoint := range mountPoints {
		if mountPoint.Type == cgroupMountType {
			for _, opt := range mountPoint.Opts {
				if expectedCgroups.Has(opt) {
					expectedCgroups.Delete(opt)
				}
				if opt == "cpu" {
					cpuMountPoint = mountPoint.Path
				}
			}
		}
	}

	if expectedCgroups.Len() > 0 {
		return f, fmt.Errorf("%s - Following Cgroup subsystem not mounted: %v", localErr, sets.List(expectedCgroups))
	}

	// Check if cpu quota is available.
	// The CPU cgroup is required, so it is expected to be mounted at this point.
	periodExists, err := utilpath.Exists(utilpath.CheckFollowSymlink, path.Join(cpuMountPoint, "cpu.cfs_period_us"))
	if err != nil {
		klog.ErrorS(err, "Failed to detect if CPU cgroup cpu.cfs_period_us is available")
	}
	quotaExists, err := utilpath.Exists(utilpath.CheckFollowSymlink, path.Join(cpuMountPoint, "cpu.cfs_quota_us"))
	if err != nil {
		klog.ErrorS(err, "Failed to detect if CPU cgroup cpu.cfs_quota_us is available")
	}
	if quotaExists && periodExists {
		f.cpuHardcapping = true
	}
	return f, nil
}

// TODO(vmarmol): Add limits to the system containers.
// NewContainerManager takes the absolute name of the specified containers.
// An empty container name disables use of the specified container.
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, recorder record.EventRecorder, kubeClient clientset.Interface) (ContainerManager, error) {
	subsystems, err := GetCgroupSubsystems()
	if err != nil {
		return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err)
	}

	if failSwapOn {
		// Check whether swap is enabled. The Kubelet does not support running with swap enabled.
		swapFile := "/proc/swaps"
		swapData, err := os.ReadFile(swapFile)
		if err != nil {
			if os.IsNotExist(err) {
				klog.InfoS("File does not exist, assuming that swap is disabled", "path", swapFile)
			} else {
				return nil, err
			}
		} else {
			swapData = bytes.TrimSpace(swapData) // extra trailing \n
			swapLines := strings.Split(string(swapData), "\n")

			// If there is more than one line (table headers) in /proc/swaps, swap is enabled and we should
			// error out unless --fail-swap-on is set to false.
			if len(swapLines) > 1 {
				return nil, fmt.Errorf("running with swap on is not supported, please disable swap! or set --fail-swap-on flag to false. /proc/swaps contained: %v", swapLines)
			}
		}
	}

	var internalCapacity = v1.ResourceList{}
	// It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because
	// machine info is computed and cached once as part of cAdvisor object creation.
	// But `RootFsInfo` and `ImagesFsInfo` are not available at this moment, so they will be called later when the manager starts.
	machineInfo, err := cadvisorInterface.MachineInfo()
	if err != nil {
		return nil, err
	}
	capacity := cadvisor.CapacityFromMachineInfo(machineInfo)
	for k, v := range capacity {
		internalCapacity[k] = v
	}
	pidlimits, err := pidlimit.Stats()
	if err == nil && pidlimits != nil && pidlimits.MaxPID != nil {
		internalCapacity[pidlimit.PIDs] = *resource.NewQuantity(
			int64(*pidlimits.MaxPID),
			resource.DecimalSI)
	}

	// Turn CgroupRoot from a string (in cgroupfs path format) to internal CgroupName
	cgroupRoot := ParseCgroupfsToCgroupName(nodeConfig.CgroupRoot)
	cgroupManager := NewCgroupManager(subsystems, nodeConfig.CgroupDriver)
	// Check if Cgroup-root actually exists on the node
	if nodeConfig.CgroupsPerQOS {
		// this does default to / when enabled, but this tests against regressions.
		if nodeConfig.CgroupRoot == "" {
			return nil, fmt.Errorf("invalid configuration: cgroups-per-qos was specified and cgroup-root was not specified. To enable the QoS cgroup hierarchy you need to specify a valid cgroup-root")
		}

		// we need to check that the cgroup root actually exists for each subsystem
		// of note, we always use the cgroupfs driver when performing this check since
		// the input is provided in that format.
		// this is important because we do not want any name conversion to occur.
		if err := cgroupManager.Validate(cgroupRoot); err != nil {
			return nil, fmt.Errorf("invalid configuration: %w", err)
		}
		klog.InfoS("Container manager verified user specified cgroup-root exists", "cgroupRoot", cgroupRoot)
		// Include the top level cgroup for enforcing node allocatable into cgroup-root.
		// This way, all sub modules can avoid having to understand the concept of node allocatable.
		cgroupRoot = NewCgroupName(cgroupRoot, defaultNodeAllocatableCgroupName)
	}
	klog.InfoS("Creating Container Manager object based on Node Config", "nodeConfig", nodeConfig)

	qosContainerManager, err := NewQOSContainerManager(subsystems, cgroupRoot, nodeConfig, cgroupManager)
	if err != nil {
		return nil, err
	}

	cm := &containerManagerImpl{
		cadvisorInterface:   cadvisorInterface,
		mountUtil:           mountUtil,
		NodeConfig:          nodeConfig,
		subsystems:          subsystems,
		cgroupManager:       cgroupManager,
		capacity:            capacity,
		internalCapacity:    internalCapacity,
		cgroupRoot:          cgroupRoot,
		recorder:            recorder,
		qosContainerManager: qosContainerManager,
	}

	cm.topologyManager, err = topologymanager.NewManager(
		machineInfo.Topology,
		nodeConfig.TopologyManagerPolicy,
		nodeConfig.TopologyManagerScope,
		nodeConfig.TopologyManagerPolicyOptions,
	)

	if err != nil {
		return nil, err
	}

	klog.InfoS("Creating device plugin manager")
	cm.deviceManager, err = devicemanager.NewManagerImpl(machineInfo.Topology, cm.topologyManager)
	if err != nil {
		return nil, err
	}
	cm.topologyManager.AddHintProvider(cm.deviceManager)

	// Initialize DRA manager
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
		klog.InfoS("Creating Dynamic Resource Allocation (DRA) manager")
		cm.draManager, err = dra.NewManagerImpl(kubeClient, nodeConfig.KubeletRootDir, nodeConfig.NodeName)
		if err != nil {
			return nil, err
		}
	}

	// Initialize CPU manager
	cm.cpuManager, err = cpumanager.NewManager(
		nodeConfig.CPUManagerPolicy,
		nodeConfig.CPUManagerPolicyOptions,
		nodeConfig.CPUManagerReconcilePeriod,
		machineInfo,
		nodeConfig.NodeAllocatableConfig.ReservedSystemCPUs,
		cm.GetNodeAllocatableReservation(),
		nodeConfig.KubeletRootDir,
		cm.topologyManager,
	)
	if err != nil {
		klog.ErrorS(err, "Failed to initialize cpu manager")
		return nil, err
	}
	cm.topologyManager.AddHintProvider(cm.cpuManager)

	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryManager) {
		cm.memoryManager, err = memorymanager.NewManager(
			nodeConfig.ExperimentalMemoryManagerPolicy,
			machineInfo,
			cm.GetNodeAllocatableReservation(),
			nodeConfig.ExperimentalMemoryManagerReservedMemory,
			nodeConfig.KubeletRootDir,
			cm.topologyManager,
		)
		if err != nil {
			klog.ErrorS(err, "Failed to initialize memory manager")
			return nil, err
		}
		cm.topologyManager.AddHintProvider(cm.memoryManager)
	}

	return cm, nil
}

// NewPodContainerManager is a factory method that returns a PodContainerManager object.
// If qosCgroups are enabled then it returns the general pod container manager,
// otherwise it returns a no-op manager which essentially does nothing.
func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
	if cm.NodeConfig.CgroupsPerQOS {
		return &podContainerManagerImpl{
			qosContainersInfo: cm.GetQOSContainersInfo(),
			subsystems:        cm.subsystems,
			cgroupManager:     cm.cgroupManager,
			podPidsLimit:      cm.PodPidsLimit,
			enforceCPULimits:  cm.EnforceCPULimits,
			// cpuCFSQuotaPeriod is in microseconds. NodeConfig.CPUCFSQuotaPeriod is a time.Duration (measured in nanoseconds).
			// Convert cm.CPUCFSQuotaPeriod [nanoseconds] / time.Microsecond (1000) to get cpuCFSQuotaPeriod in microseconds.
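			// For example, a CPUCFSQuotaPeriod of 100ms (100 * time.Millisecond = 1e8 ns) divided by time.Microsecond yields 100000 µs.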
			cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond),
		}
	}
	return &podContainerManagerNoop{
		cgroupRoot: cm.cgroupRoot,
	}
}

func (cm *containerManagerImpl) InternalContainerLifecycle() InternalContainerLifecycle {
	return &internalContainerLifecycleImpl{cm.cpuManager, cm.memoryManager, cm.topologyManager}
}

// Create a cgroup container manager.
func createManager(containerName string) (cgroups.Manager, error) {
	cg := &configs.Cgroup{
		Parent: "/",
		Name:   containerName,
		Resources: &configs.Resources{
			SkipDevices: true,
		},
		Systemd: false,
	}

	return manager.New(cg)
}

type KernelTunableBehavior string

const (
	KernelTunableWarn   KernelTunableBehavior = "warn"
	KernelTunableError  KernelTunableBehavior = "error"
	KernelTunableModify KernelTunableBehavior = "modify"
)

// setupKernelTunables validates that kernel tunable flags are set as expected.
// Depending on the specified option, it will either warn, error, or modify the kernel tunable flags.
func setupKernelTunables(option KernelTunableBehavior) error {
	desiredState := map[string]int{
		utilsysctl.VMOvercommitMemory: utilsysctl.VMOvercommitMemoryAlways,
		utilsysctl.VMPanicOnOOM:       utilsysctl.VMPanicOnOOMInvokeOOMKiller,
		utilsysctl.KernelPanic:        utilsysctl.KernelPanicRebootTimeout,
		utilsysctl.KernelPanicOnOops:  utilsysctl.KernelPanicOnOopsAlways,
		utilsysctl.RootMaxKeys:        utilsysctl.RootMaxKeysSetting,
		utilsysctl.RootMaxBytes:       utilsysctl.RootMaxBytesSetting,
	}

	sysctl := utilsysctl.New()

	errList := []error{}
	for flag, expectedValue := range desiredState {
		val, err := sysctl.GetSysctl(flag)
		if err != nil {
			errList = append(errList, err)
			continue
		}
		if val == expectedValue {
			continue
		}

		switch option {
		case KernelTunableError:
			errList = append(errList, fmt.Errorf("invalid kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val))
		case KernelTunableWarn:
			klog.V(2).InfoS("Invalid kernel flag", "flag", flag, "expectedValue", expectedValue, "actualValue", val)
		case KernelTunableModify:
			klog.V(2).InfoS("Updating kernel flag", "flag", flag, "expectedValue", expectedValue, "actualValue", val)
			err = sysctl.SetSysctl(flag, expectedValue)
			if err != nil {
				if inuserns.RunningInUserNS() {
					if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.KubeletInUserNamespace) {
						klog.V(2).InfoS("Updating kernel flag failed (running in UserNS, ignoring)", "flag", flag, "err", err)
						continue
					}
					klog.ErrorS(err, "Updating kernel flag failed (Hint: enable KubeletInUserNamespace feature flag to ignore the error)", "flag", flag)
				}
				errList = append(errList, err)
			}
		}
	}
	return utilerrors.NewAggregate(errList)
}

func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
	f, err := validateSystemRequirements(cm.mountUtil)
	if err != nil {
		return err
	}
	if !f.cpuHardcapping {
		cm.status.SoftRequirements = fmt.Errorf("CPU hardcapping unsupported")
	}
	b := KernelTunableModify
	if cm.GetNodeConfig().ProtectKernelDefaults {
		b = KernelTunableError
	}
	if err := setupKernelTunables(b); err != nil {
		return err
	}

	// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
	if cm.NodeConfig.CgroupsPerQOS {
		if err := cm.createNodeAllocatableCgroups(); err != nil {
			return err
		}
		err = cm.qosContainerManager.Start(cm.GetNodeAllocatableAbsolute, activePods)
		if err != nil {
			return fmt.Errorf("failed to initialize top level QOS containers: %v", err)
		}
	}

	// Enforce Node Allocatable (if required)
	if err := cm.enforceNodeAllocatableCgroups(); err != nil {
		return err
	}

	systemContainers := []*systemContainer{}

	if cm.SystemCgroupsName != "" {
		if cm.SystemCgroupsName == "/" {
			return fmt.Errorf("system container cannot be root (\"/\")")
		}
		cont, err := newSystemCgroups(cm.SystemCgroupsName)
		if err != nil {
			return err
		}
		cont.ensureStateFunc = func(manager cgroups.Manager) error {
			return ensureSystemCgroups("/", manager)
		}
		systemContainers = append(systemContainers, cont)
	}

	if cm.KubeletCgroupsName != "" {
		cont, err := newSystemCgroups(cm.KubeletCgroupsName)
		if err != nil {
			return err
		}

		cont.ensureStateFunc = func(_ cgroups.Manager) error {
			return ensureProcessInContainerWithOOMScore(os.Getpid(), int(cm.KubeletOOMScoreAdj), cont.manager)
		}
		systemContainers = append(systemContainers, cont)
	} else {
		cm.periodicTasks = append(cm.periodicTasks, func() {
			if err := ensureProcessInContainerWithOOMScore(os.Getpid(), int(cm.KubeletOOMScoreAdj), nil); err != nil {
				klog.ErrorS(err, "Failed to ensure process in container with oom score")
				return
			}
			cont, err := getContainer(os.Getpid())
			if err != nil {
				klog.ErrorS(err, "Failed to find cgroups of kubelet")
				return
			}
			cm.Lock()
			defer cm.Unlock()

			cm.KubeletCgroupsName = cont
		})
	}

	cm.systemContainers = systemContainers
	return nil
}

func (cm *containerManagerImpl) GetNodeConfig() NodeConfig {
	cm.RLock()
	defer cm.RUnlock()
	return cm.NodeConfig
}

// GetPodCgroupRoot returns the literal cgroupfs value for the cgroup containing all pods.
func (cm *containerManagerImpl) GetPodCgroupRoot() string {
	return cm.cgroupManager.Name(cm.cgroupRoot)
}

func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
	return cm.subsystems
}

func (cm *containerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
	return cm.qosContainerManager.GetQOSContainersInfo()
}

func (cm *containerManagerImpl) UpdateQOSCgroups() error {
	return cm.qosContainerManager.UpdateCgroups()
}

func (cm *containerManagerImpl) Status() Status {
	cm.RLock()
	defer cm.RUnlock()
	return cm.status
}

// Start starts the sub-managers (DRA, CPU, memory, device), performs node-level setup,
// and kicks off the background tasks that keep system containers in their desired state.
func (cm *containerManagerImpl) Start(node *v1.Node,
	activePods ActivePodsFunc,
	sourcesReady config.SourcesReady,
	podStatusProvider status.PodStatusProvider,
	runtimeService internalapi.RuntimeService,
	localStorageCapacityIsolation bool) error {
	ctx := context.Background()

	containerMap, containerRunningSet := buildContainerMapAndRunningSetFromRuntime(ctx, runtimeService)

	// Initialize DRA manager
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
		err := cm.draManager.Start(dra.ActivePodsFunc(activePods), sourcesReady)
		if err != nil {
			return fmt.Errorf("start dra manager error: %w", err)
		}
	}

	// Initialize CPU manager
	err := cm.cpuManager.Start(cpumanager.ActivePodsFunc(activePods), sourcesReady, podStatusProvider, runtimeService, containerMap)
	if err != nil {
		return fmt.Errorf("start cpu manager error: %v", err)
	}

	// Initialize memory manager
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryManager) {
		containerMap, _ := buildContainerMapAndRunningSetFromRuntime(ctx, runtimeService)
		err := cm.memoryManager.Start(memorymanager.ActivePodsFunc(activePods), sourcesReady, podStatusProvider, runtimeService, containerMap)
		if err != nil {
			return fmt.Errorf("start memory manager error: %v", err)
		}
	}

	// cache the node Info including resource capacity and
	// allocatable of the node
	cm.nodeInfo = node

	if localStorageCapacityIsolation {
		rootfs, err := cm.cadvisorInterface.RootFsInfo()
		if err != nil {
			return fmt.Errorf("failed to get rootfs info: %v", err)
		}
		for rName, rCap := range cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs) {
			cm.capacity[rName] = rCap
		}
	}

	// Ensure that node allocatable configuration is valid.
	if err := cm.validateNodeAllocatable(); err != nil {
		return err
	}

	// Setup the node
	if err := cm.setupNode(activePods); err != nil {
		return err
	}

	// Don't run a background thread if there are no ensureStateFuncs.
	hasEnsureStateFuncs := false
	for _, cont := range cm.systemContainers {
		if cont.ensureStateFunc != nil {
			hasEnsureStateFuncs = true
			break
		}
	}
	if hasEnsureStateFuncs {
		// Run ensure state functions every minute.
		go wait.Until(func() {
			for _, cont := range cm.systemContainers {
				if cont.ensureStateFunc != nil {
					if err := cont.ensureStateFunc(cont.manager); err != nil {
						klog.InfoS("Failed to ensure state", "containerName", cont.name, "err", err)
					}
				}
			}
		}, time.Minute, wait.NeverStop)
	}

	if len(cm.periodicTasks) > 0 {
		go wait.Until(func() {
			for _, task := range cm.periodicTasks {
				if task != nil {
					task()
				}
			}
		}, 5*time.Minute, wait.NeverStop)
	}

	// Start the device manager.
	if err := cm.deviceManager.Start(devicemanager.ActivePodsFunc(activePods), sourcesReady, containerMap, containerRunningSet); err != nil {
		return err
	}

	return nil
}

func (cm *containerManagerImpl) GetPluginRegistrationHandler() cache.PluginHandler {
	return cm.deviceManager.GetWatcherHandler()
}

// TODO: move the GetResources logic to PodContainerManager.
func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
	opts := &kubecontainer.RunContainerOptions{}
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
		resOpts, err := cm.draManager.GetResources(pod, container)
		if err != nil {
			return nil, err
		}
		// NOTE: Passing CDI device names as annotations is a temporary solution.
		// It will be removed after all runtimes are updated
		// to get CDI device names from the ContainerConfig.CDIDevices field.
		opts.Annotations = append(opts.Annotations, resOpts.Annotations...)
		opts.CDIDevices = append(opts.CDIDevices, resOpts.CDIDevices...)
	}
	// Allocate should already be called during predicateAdmitHandler.Admit(),
	// so just try to fetch device runtime information from the cached state here.
	devOpts, err := cm.deviceManager.GetDeviceRunContainerOptions(pod, container)
	if err != nil {
		return nil, err
	} else if devOpts == nil {
		return opts, nil
	}
	opts.Devices = append(opts.Devices, devOpts.Devices...)
	opts.Mounts = append(opts.Mounts, devOpts.Mounts...)
	opts.Envs = append(opts.Envs, devOpts.Envs...)
	opts.Annotations = append(opts.Annotations, devOpts.Annotations...)
	opts.CDIDevices = append(opts.CDIDevices, devOpts.CDIDevices...)
	return opts, nil
}

func (cm *containerManagerImpl) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
	return cm.deviceManager.UpdatePluginResources(node, attrs)
}

func (cm *containerManagerImpl) GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler {
	return cm.topologyManager
}

func (cm *containerManagerImpl) SystemCgroupsLimit() v1.ResourceList {
	cpuLimit := int64(0)

	// Sum up resources of all external containers.
	for _, cont := range cm.systemContainers {
		cpuLimit += cont.cpuMillicores
	}

	return v1.ResourceList{
		v1.ResourceCPU: *resource.NewMilliQuantity(
			cpuLimit,
			resource.DecimalSI),
	}
}

// isProcessRunningInHost returns true if the process with the given pid shares the PID namespace of init (pid 1).
func isProcessRunningInHost(pid int) (bool, error) {
	// Get init pid namespace.
	initPidNs, err := os.Readlink("/proc/1/ns/pid")
	if err != nil {
		return false, fmt.Errorf("failed to find pid namespace of init process")
	}
	klog.V(10).InfoS("Found init PID namespace", "namespace", initPidNs)
	processPidNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", pid))
	if err != nil {
		return false, fmt.Errorf("failed to find pid namespace of process %q", pid)
	}
	klog.V(10).InfoS("Process info", "pid", pid, "namespace", processPidNs)
	return initPidNs == processPidNs, nil
}

func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager cgroups.Manager) error {
	if runningInHost, err := isProcessRunningInHost(pid); err != nil {
		// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
		return err
	} else if !runningInHost {
		// Process is running inside a container. Don't touch that.
		klog.V(2).InfoS("PID is not running in the host namespace", "pid", pid)
		return nil
	}

	var errs []error
	if manager != nil {
		cont, err := getContainer(pid)
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
		}

		name := ""
		cgroups, err := manager.GetCgroups()
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to get cgroups for %d: %v", pid, err))
		} else {
			name = cgroups.Name
		}

		if cont != name {
			err = manager.Apply(pid)
			if err != nil {
				errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, name, err))
			}
		}
	}

	// Also apply oom-score-adj to processes
	oomAdjuster := oom.NewOOMAdjuster()
	klog.V(5).InfoS("Attempting to apply oom_score_adj to process", "oomScoreAdj", oomScoreAdj, "pid", pid)
	if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
		klog.V(3).InfoS("Failed to apply oom_score_adj to process", "oomScoreAdj", oomScoreAdj, "pid", pid, "err", err)
		errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
	}
	return utilerrors.NewAggregate(errs)
}

// getContainer returns the cgroup associated with the specified pid.
// It enforces a unified hierarchy for memory and cpu cgroups.
// On systemd environments, it uses the name=systemd cgroup for the specified pid.
func getContainer(pid int) (string, error) {
	cgs, err := cgroups.ParseCgroupFile(fmt.Sprintf("/proc/%d/cgroup", pid))
	if err != nil {
		return "", err
	}

	if cgroups.IsCgroup2UnifiedMode() {
		c, found := cgs[""]
		if !found {
			return "", cgroups.NewNotFoundError("unified")
		}
		return c, nil
	}

	cpu, found := cgs["cpu"]
	if !found {
		return "", cgroups.NewNotFoundError("cpu")
	}
	memory, found := cgs["memory"]
	if !found {
		return "", cgroups.NewNotFoundError("memory")
	}

	// since we use this container for accounting, we need to ensure it's a unified hierarchy.
	if cpu != memory {
		return "", fmt.Errorf("cpu and memory cgroup hierarchy not unified. cpu: %s, memory: %s", cpu, memory)
	}

	// on systemd, every pid is in a unified cgroup hierarchy (name=systemd as seen in systemd-cgls).
	// cpu and memory accounting is off by default; users may choose to enable it per unit or globally.
	// users could enable CPU and memory accounting globally via /etc/systemd/system.conf (DefaultCPUAccounting=true DefaultMemoryAccounting=true).
	// users could also enable CPU and memory accounting per unit via CPUAccounting=true and MemoryAccounting=true.
	// we only warn if accounting is not enabled for CPU or memory so as to not break local development flows where kubelet is launched in a terminal.
	// for example, the cgroup for the user session will be something like /user.slice/user-X.slice/session-X.scope, but the cpu and memory
	// cgroup will be the closest ancestor where accounting is performed (most likely /) on systems that launch docker containers.
	// as a result, on those systems, you will not get cpu or memory accounting statistics for kubelet.
	// in addition, you would not get memory or cpu accounting for the runtime unless accounting was enabled on its unit (or globally).
	if systemd, found := cgs["name=systemd"]; found {
		if systemd != cpu {
			klog.InfoS("CPUAccounting not enabled for process", "pid", pid)
		}
		if systemd != memory {
			klog.InfoS("MemoryAccounting not enabled for process", "pid", pid)
		}
		return systemd, nil
	}

	return cpu, nil
}

// Ensures the system container is created and all non-kernel threads and process 1
// without a container are moved to it.
//
// The reason for leaving kernel threads in the root cgroup is that we don't want to tie the
// execution of these threads to the to-be-defined /system quota and create priority inversions.
func ensureSystemCgroups(rootCgroupPath string, manager cgroups.Manager) error {
	// Move non-kernel PIDs to the system container.
	// Only the errors from the latest attempt are kept.
	var finalErr error
	for i := 0; i <= 10; i++ {
		allPids, err := cmutil.GetPids(rootCgroupPath)
		if err != nil {
			finalErr = fmt.Errorf("failed to list PIDs for root: %v", err)
			continue
		}

		// Remove kernel pids and other protected PIDs (pid 1, PIDs already in system & kubelet containers)
		pids := make([]int, 0, len(allPids))
		for _, pid := range allPids {
			if pid == 1 || isKernelPid(pid) {
				continue
			}

			pids = append(pids, pid)
		}

		// Check if we have moved all the non-kernel PIDs.
		if len(pids) == 0 {
			return nil
		}

		klog.V(3).InfoS("Moving non-kernel processes", "pids", pids)
		for _, pid := range pids {
			err := manager.Apply(pid)
			if err != nil {
				name := ""
				cgroups, err := manager.GetCgroups()
				if err == nil {
					name = cgroups.Name
				}

				finalErr = fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, name, err)
			}
		}
	}

	return finalErr
}

// Determines whether the specified PID is a kernel PID.
func isKernelPid(pid int) bool {
	// Kernel threads have no associated executable.
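	// Reading /proc/<pid>/exe for a kernel thread therefore fails with ENOENT, which os.IsNotExist reports as true.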
	_, err := os.Readlink(fmt.Sprintf("/proc/%d/exe", pid))
	return err != nil && os.IsNotExist(err)
}

// GetCapacity returns node capacity data for "cpu", "memory", "ephemeral-storage", and "huge-pages*".
// At present this method is only invoked when introspecting ephemeral storage.
func (cm *containerManagerImpl) GetCapacity(localStorageCapacityIsolation bool) v1.ResourceList {
	if localStorageCapacityIsolation {
		// We store allocatable ephemeral-storage in the capacity property once we Start() the container manager.
		if _, ok := cm.capacity[v1.ResourceEphemeralStorage]; !ok {
			// If we haven't yet stored the capacity for ephemeral-storage, we can try to fetch it directly from cAdvisor.
			if cm.cadvisorInterface != nil {
				rootfs, err := cm.cadvisorInterface.RootFsInfo()
				if err != nil {
					klog.ErrorS(err, "Unable to get rootfs data from cAdvisor interface")
					// If the rootfs info retrieval from cAdvisor fails for any reason, fall back to returning the capacity property with no ephemeral storage data.
					return cm.capacity
				}
				// We don't want to mutate cm.capacity here, so we manually construct a v1.ResourceList from it
				// and add ephemeral-storage.
				capacityWithEphemeralStorage := v1.ResourceList{}
				for rName, rQuant := range cm.capacity {
					capacityWithEphemeralStorage[rName] = rQuant
				}
				capacityWithEphemeralStorage[v1.ResourceEphemeralStorage] = cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs)[v1.ResourceEphemeralStorage]
				return capacityWithEphemeralStorage
			}
		}
	}
	return cm.capacity
}

func (cm *containerManagerImpl) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
	return cm.deviceManager.GetCapacity()
}

func (cm *containerManagerImpl) GetDevices(podUID, containerName string) []*podresourcesapi.ContainerDevices {
	return containerDevicesFromResourceDeviceInstances(cm.deviceManager.GetDevices(podUID, containerName))
}

func (cm *containerManagerImpl) GetAllocatableDevices() []*podresourcesapi.ContainerDevices {
	return containerDevicesFromResourceDeviceInstances(cm.deviceManager.GetAllocatableDevices())
}

// int64Slice converts a slice of int into a slice of int64.
func int64Slice(in []int) []int64 {
	out := make([]int64, len(in))
	for i := range in {
		out[i] = int64(in[i])
	}
	return out
}

func (cm *containerManagerImpl) GetCPUs(podUID, containerName string) []int64 {
	if cm.cpuManager != nil {
		return int64Slice(cm.cpuManager.GetExclusiveCPUs(podUID, containerName).UnsortedList())
	}
	return []int64{}
}

func (cm *containerManagerImpl) GetAllocatableCPUs() []int64 {
	if cm.cpuManager != nil {
		return int64Slice(cm.cpuManager.GetAllocatableCPUs().UnsortedList())
	}
	return []int64{}
}

func (cm *containerManagerImpl) GetMemory(podUID, containerName string) []*podresourcesapi.ContainerMemory {
	if cm.memoryManager == nil {
		return []*podresourcesapi.ContainerMemory{}
	}

	return containerMemoryFromBlock(cm.memoryManager.GetMemory(podUID, containerName))
}

func (cm *containerManagerImpl) GetAllocatableMemory() []*podresourcesapi.ContainerMemory {
	if cm.memoryManager == nil {
		return []*podresourcesapi.ContainerMemory{}
	}

	return containerMemoryFromBlock(cm.memoryManager.GetAllocatableMemory())
}

func (cm *containerManagerImpl) GetDynamicResources(pod *v1.Pod, container *v1.Container) []*podresourcesapi.DynamicResource {
	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
		return []*podresourcesapi.DynamicResource{}
	}

	var containerDynamicResources []*podresourcesapi.DynamicResource
	containerClaimInfos, err := cm.draManager.GetContainerClaimInfos(pod, container)
	if err != nil {
		klog.ErrorS(err, "Unable to get container claim info state")
		return []*podresourcesapi.DynamicResource{}
	}
	for _, containerClaimInfo := range containerClaimInfos {
		var claimResources []*podresourcesapi.ClaimResource
		// TODO: Currently we maintain a list of ClaimResources, each of which contains
		// a set of CDIDevices from a different kubelet plugin. In the future we may want to
		// include the name of the kubelet plugin and/or other types of resources that are
		// not CDIDevices (assuming the DRAmanager supports this).
		for _, klPluginCdiDevices := range containerClaimInfo.CDIDevices {
			var cdiDevices []*podresourcesapi.CDIDevice
			for _, cdiDevice := range klPluginCdiDevices {
				cdiDevices = append(cdiDevices, &podresourcesapi.CDIDevice{Name: cdiDevice})
			}
			claimResources = append(claimResources, &podresourcesapi.ClaimResource{CDIDevices: cdiDevices})
		}
		containerDynamicResource := podresourcesapi.DynamicResource{
			ClassName:      containerClaimInfo.ClassName,
			ClaimName:      containerClaimInfo.ClaimName,
			ClaimNamespace: containerClaimInfo.Namespace,
			ClaimResources: claimResources,
		}
		containerDynamicResources = append(containerDynamicResources, &containerDynamicResource)
	}
	return containerDynamicResources
}

func (cm *containerManagerImpl) ShouldResetExtendedResourceCapacity() bool {
	return cm.deviceManager.ShouldResetExtendedResourceCapacity()
}

func (cm *containerManagerImpl) UpdateAllocatedDevices() {
	cm.deviceManager.UpdateAllocatedDevices()
}

func containerMemoryFromBlock(blocks []memorymanagerstate.Block) []*podresourcesapi.ContainerMemory {
	var containerMemories []*podresourcesapi.ContainerMemory

	for _, b := range blocks {
		containerMemory := podresourcesapi.ContainerMemory{
			MemoryType: string(b.Type),
			Size_:      b.Size,
			Topology: &podresourcesapi.TopologyInfo{
				Nodes: []*podresourcesapi.NUMANode{},
			},
		}

		for _, numaNodeID := range b.NUMAAffinity {
			containerMemory.Topology.Nodes = append(containerMemory.Topology.Nodes, &podresourcesapi.NUMANode{ID: int64(numaNodeID)})
		}

		containerMemories = append(containerMemories, &containerMemory)
	}

	return containerMemories
}

func (cm *containerManagerImpl) PrepareDynamicResources(pod *v1.Pod) error {
	return cm.draManager.PrepareResources(pod)
}

func (cm *containerManagerImpl) UnprepareDynamicResources(pod *v1.Pod) error {
	return cm.draManager.UnprepareResources(pod)
}

func (cm *containerManagerImpl) PodMightNeedToUnprepareResources(UID types.UID) bool {
	return cm.draManager.PodMightNeedToUnprepareResources(UID)
}