k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/container_manager_linux.go
//go:build linux
// +build linux

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	"bytes"
	"context"
	"fmt"
	"os"
	"path"
	"strings"
	"sync"
	"time"

	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/cgroups/manager"
	"github.com/opencontainers/runc/libcontainer/configs"
	"k8s.io/klog/v2"
	"k8s.io/mount-utils"
	utilpath "k8s.io/utils/path"

	libcontaineruserns "github.com/opencontainers/runc/libcontainer/userns"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/apimachinery/pkg/types"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/record"
	utilsysctl "k8s.io/component-helpers/node/util/sysctl"
	internalapi "k8s.io/cri-api/pkg/apis"
	podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
	kubefeatures "k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/cadvisor"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
	"k8s.io/kubernetes/pkg/kubelet/cm/dra"
	"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager"
	memorymanagerstate "k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
	cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
	"k8s.io/kubernetes/pkg/kubelet/config"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
	"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
	"k8s.io/kubernetes/pkg/kubelet/status"
	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/util/oom"
)

// A non-user container tracked by the Kubelet.
type systemContainer struct {
	// Absolute name of the container.
	name string

	// CPU limit in millicores.
	cpuMillicores int64

	// Function that ensures the state of the container.
	// m is the cgroup manager for the specified container.
	ensureStateFunc func(m cgroups.Manager) error

	// Manager for the cgroups of the external container.
	manager cgroups.Manager
}

func newSystemCgroups(containerName string) (*systemContainer, error) {
	manager, err := createManager(containerName)
	if err != nil {
		return nil, err
	}
	return &systemContainer{
		name:    containerName,
		manager: manager,
	}, nil
}

type containerManagerImpl struct {
	sync.RWMutex
	cadvisorInterface cadvisor.Interface
	mountUtil         mount.Interface
	NodeConfig
	status Status
	// External containers being managed.
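	// Typically these are the optional cgroups configured via --system-cgroups and --kubelet-cgroups.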
	systemContainers []*systemContainer
	// Tasks that are run periodically
	periodicTasks []func()
	// Holds all the mounted cgroup subsystems
	subsystems *CgroupSubsystems
	nodeInfo   *v1.Node
	// Interface for cgroup management
	cgroupManager CgroupManager
	// Capacity of this node.
	capacity v1.ResourceList
	// Capacity of this node, including internal resources.
	internalCapacity v1.ResourceList
	// Absolute cgroupfs path to a cgroup that Kubelet needs to place all pods under.
	// This path includes a top level container for enforcing Node Allocatable.
	cgroupRoot CgroupName
	// Event recorder interface.
	recorder record.EventRecorder
	// Interface for QoS cgroup management
	qosContainerManager QOSContainerManager
	// Interface for exporting and allocating devices reported by device plugins.
	deviceManager devicemanager.Manager
	// Interface for CPU affinity management.
	cpuManager cpumanager.Manager
	// Interface for memory affinity management.
	memoryManager memorymanager.Manager
	// Interface for Topology resource co-ordination
	topologyManager topologymanager.Manager
	// Interface for Dynamic Resource Allocation management.
	draManager dra.Manager
}

type features struct {
	cpuHardcapping bool
}

var _ ContainerManager = &containerManagerImpl{}

// validateSystemRequirements checks if the required cgroup subsystems are mounted.
// As of now, only 'cpu' and 'memory' are required.
// cpu quota is a soft requirement.
func validateSystemRequirements(mountUtil mount.Interface) (features, error) {
	const (
		cgroupMountType = "cgroup"
		localErr        = "system validation failed"
	)
	var (
		cpuMountPoint string
		f             features
	)
	mountPoints, err := mountUtil.List()
	if err != nil {
		return f, fmt.Errorf("%s - %v", localErr, err)
	}

	if cgroups.IsCgroup2UnifiedMode() {
		f.cpuHardcapping = true
		return f, nil
	}

	expectedCgroups := sets.New("cpu", "cpuacct", "cpuset", "memory")
	for _, mountPoint := range mountPoints {
		if mountPoint.Type == cgroupMountType {
			for _, opt := range mountPoint.Opts {
				if expectedCgroups.Has(opt) {
					expectedCgroups.Delete(opt)
				}
				if opt == "cpu" {
					cpuMountPoint = mountPoint.Path
				}
			}
		}
	}

	if expectedCgroups.Len() > 0 {
		return f, fmt.Errorf("%s - Following Cgroup subsystem not mounted: %v", localErr, sets.List(expectedCgroups))
	}

	// Check if cpu quota is available.
	// The CPU cgroup is required and so it is expected to be mounted at this point.
	periodExists, err := utilpath.Exists(utilpath.CheckFollowSymlink, path.Join(cpuMountPoint, "cpu.cfs_period_us"))
	if err != nil {
		klog.ErrorS(err, "Failed to detect if CPU cgroup cpu.cfs_period_us is available")
	}
	quotaExists, err := utilpath.Exists(utilpath.CheckFollowSymlink, path.Join(cpuMountPoint, "cpu.cfs_quota_us"))
	if err != nil {
		klog.ErrorS(err, "Failed to detect if CPU cgroup cpu.cfs_quota_us is available")
	}
	if quotaExists && periodExists {
		f.cpuHardcapping = true
	}
	return f, nil
}

// TODO(vmarmol): Add limits to the system containers.
// Takes the absolute name of the specified containers.
// Empty container name disables use of the specified container.
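// NewContainerManager validates the node's swap and cgroup configuration, reads machine
// info from cAdvisor, and wires up the topology, device, DRA (when the feature gate is
// enabled), CPU, and memory managers; the sub-managers are started later via Start.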
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, recorder record.EventRecorder, kubeClient clientset.Interface) (ContainerManager, error) {
	subsystems, err := GetCgroupSubsystems()
	if err != nil {
		return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err)
	}

	if failSwapOn {
		// Check whether swap is enabled. The Kubelet does not support running with swap enabled.
		swapFile := "/proc/swaps"
		swapData, err := os.ReadFile(swapFile)
		if err != nil {
			if os.IsNotExist(err) {
				klog.InfoS("File does not exist, assuming that swap is disabled", "path", swapFile)
			} else {
				return nil, err
			}
		} else {
			swapData = bytes.TrimSpace(swapData) // extra trailing \n
			swapLines := strings.Split(string(swapData), "\n")

			// If there is more than one line (table headers) in /proc/swaps, swap is enabled and we should
			// error out unless --fail-swap-on is set to false.
			if len(swapLines) > 1 {
				return nil, fmt.Errorf("running with swap on is not supported, please disable swap! or set --fail-swap-on flag to false. /proc/swaps contained: %v", swapLines)
			}
		}
	}

	var internalCapacity = v1.ResourceList{}
	// It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because
	// machine info is computed and cached once as part of cAdvisor object creation.
	// But `RootFsInfo` and `ImagesFsInfo` are not available at this moment, so they will be called later when the manager starts.
	machineInfo, err := cadvisorInterface.MachineInfo()
	if err != nil {
		return nil, err
	}
	capacity := cadvisor.CapacityFromMachineInfo(machineInfo)
	for k, v := range capacity {
		internalCapacity[k] = v
	}
	pidlimits, err := pidlimit.Stats()
	if err == nil && pidlimits != nil && pidlimits.MaxPID != nil {
		internalCapacity[pidlimit.PIDs] = *resource.NewQuantity(
			int64(*pidlimits.MaxPID),
			resource.DecimalSI)
	}

	// Turn CgroupRoot from a string (in cgroupfs path format) to internal CgroupName
	cgroupRoot := ParseCgroupfsToCgroupName(nodeConfig.CgroupRoot)
	cgroupManager := NewCgroupManager(subsystems, nodeConfig.CgroupDriver)
	// Check if Cgroup-root actually exists on the node
	if nodeConfig.CgroupsPerQOS {
		// this does default to / when enabled, but this tests against regressions.
		if nodeConfig.CgroupRoot == "" {
			return nil, fmt.Errorf("invalid configuration: cgroups-per-qos was specified and cgroup-root was not specified. To enable the QoS cgroup hierarchy you need to specify a valid cgroup-root")
		}

		// we need to check that the cgroup root actually exists for each subsystem
		// of note, we always use the cgroupfs driver when performing this check since
		// the input is provided in that format.
		// this is important because we do not want any name conversion to occur.
		if err := cgroupManager.Validate(cgroupRoot); err != nil {
			return nil, fmt.Errorf("invalid configuration: %w", err)
		}
		klog.InfoS("Container manager verified user specified cgroup-root exists", "cgroupRoot", cgroupRoot)
		// Include the top level cgroup for enforcing node allocatable into cgroup-root.
		// This way, all sub modules can avoid having to understand the concept of node allocatable.
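		// defaultNodeAllocatableCgroupName is "kubepods", so with the default cgroup-root of "/"
		// pods typically land under /kubepods (cgroupfs driver) or kubepods.slice (systemd driver).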
		cgroupRoot = NewCgroupName(cgroupRoot, defaultNodeAllocatableCgroupName)
	}
	klog.InfoS("Creating Container Manager object based on Node Config", "nodeConfig", nodeConfig)

	qosContainerManager, err := NewQOSContainerManager(subsystems, cgroupRoot, nodeConfig, cgroupManager)
	if err != nil {
		return nil, err
	}

	cm := &containerManagerImpl{
		cadvisorInterface:   cadvisorInterface,
		mountUtil:           mountUtil,
		NodeConfig:          nodeConfig,
		subsystems:          subsystems,
		cgroupManager:       cgroupManager,
		capacity:            capacity,
		internalCapacity:    internalCapacity,
		cgroupRoot:          cgroupRoot,
		recorder:            recorder,
		qosContainerManager: qosContainerManager,
	}

	cm.topologyManager, err = topologymanager.NewManager(
		machineInfo.Topology,
		nodeConfig.TopologyManagerPolicy,
		nodeConfig.TopologyManagerScope,
		nodeConfig.TopologyManagerPolicyOptions,
	)

	if err != nil {
		return nil, err
	}

	klog.InfoS("Creating device plugin manager")
	cm.deviceManager, err = devicemanager.NewManagerImpl(machineInfo.Topology, cm.topologyManager)
	if err != nil {
		return nil, err
	}
	cm.topologyManager.AddHintProvider(cm.deviceManager)

	// initialize DRA manager
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
		klog.InfoS("Creating Dynamic Resource Allocation (DRA) manager")
		cm.draManager, err = dra.NewManagerImpl(kubeClient, nodeConfig.KubeletRootDir)
		if err != nil {
			return nil, err
		}
	}

	// Initialize CPU manager
	cm.cpuManager, err = cpumanager.NewManager(
		nodeConfig.CPUManagerPolicy,
		nodeConfig.CPUManagerPolicyOptions,
		nodeConfig.CPUManagerReconcilePeriod,
		machineInfo,
		nodeConfig.NodeAllocatableConfig.ReservedSystemCPUs,
		cm.GetNodeAllocatableReservation(),
		nodeConfig.KubeletRootDir,
		cm.topologyManager,
	)
	if err != nil {
		klog.ErrorS(err, "Failed to initialize cpu manager")
		return nil, err
	}
	cm.topologyManager.AddHintProvider(cm.cpuManager)

	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryManager) {
		cm.memoryManager, err = memorymanager.NewManager(
			nodeConfig.ExperimentalMemoryManagerPolicy,
			machineInfo,
			cm.GetNodeAllocatableReservation(),
			nodeConfig.ExperimentalMemoryManagerReservedMemory,
			nodeConfig.KubeletRootDir,
			cm.topologyManager,
		)
		if err != nil {
			klog.ErrorS(err, "Failed to initialize memory manager")
			return nil, err
		}
		cm.topologyManager.AddHintProvider(cm.memoryManager)
	}

	return cm, nil
}

// NewPodContainerManager is a factory method that returns a PodContainerManager object.
// If qosCgroups are enabled then it returns the general pod container manager,
// otherwise it returns a no-op manager which essentially does nothing.
func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
	if cm.NodeConfig.CgroupsPerQOS {
		return &podContainerManagerImpl{
			qosContainersInfo: cm.GetQOSContainersInfo(),
			subsystems:        cm.subsystems,
			cgroupManager:     cm.cgroupManager,
			podPidsLimit:      cm.PodPidsLimit,
			enforceCPULimits:  cm.EnforceCPULimits,
			// cpuCFSQuotaPeriod is in microseconds. NodeConfig.CPUCFSQuotaPeriod is a time.Duration (measured in nanoseconds).
			// Dividing cm.CPUCFSQuotaPeriod (nanoseconds) by time.Microsecond (1000) yields cpuCFSQuotaPeriod in microseconds.
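			// For example, with the default 100ms CPUCFSQuotaPeriod: 100ms / time.Microsecond = 100000,
			// which is the value written to cpu.cfs_period_us.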
			cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond),
		}
	}
	return &podContainerManagerNoop{
		cgroupRoot: cm.cgroupRoot,
	}
}

func (cm *containerManagerImpl) InternalContainerLifecycle() InternalContainerLifecycle {
	return &internalContainerLifecycleImpl{cm.cpuManager, cm.memoryManager, cm.topologyManager}
}

// Create a cgroup container manager.
func createManager(containerName string) (cgroups.Manager, error) {
	cg := &configs.Cgroup{
		Parent: "/",
		Name:   containerName,
		Resources: &configs.Resources{
			SkipDevices: true,
		},
		Systemd: false,
	}

	return manager.New(cg)
}

type KernelTunableBehavior string

const (
	KernelTunableWarn   KernelTunableBehavior = "warn"
	KernelTunableError  KernelTunableBehavior = "error"
	KernelTunableModify KernelTunableBehavior = "modify"
)

// setupKernelTunables validates that kernel tunable flags are set as expected.
// Depending upon the specified option, it will either warn, error, or modify the kernel tunable flags.
func setupKernelTunables(option KernelTunableBehavior) error {
	desiredState := map[string]int{
		utilsysctl.VMOvercommitMemory: utilsysctl.VMOvercommitMemoryAlways,
		utilsysctl.VMPanicOnOOM:       utilsysctl.VMPanicOnOOMInvokeOOMKiller,
		utilsysctl.KernelPanic:        utilsysctl.KernelPanicRebootTimeout,
		utilsysctl.KernelPanicOnOops:  utilsysctl.KernelPanicOnOopsAlways,
		utilsysctl.RootMaxKeys:        utilsysctl.RootMaxKeysSetting,
		utilsysctl.RootMaxBytes:       utilsysctl.RootMaxBytesSetting,
	}

	sysctl := utilsysctl.New()

	errList := []error{}
	for flag, expectedValue := range desiredState {
		val, err := sysctl.GetSysctl(flag)
		if err != nil {
			errList = append(errList, err)
			continue
		}
		if val == expectedValue {
			continue
		}

		switch option {
		case KernelTunableError:
			errList = append(errList, fmt.Errorf("invalid kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val))
		case KernelTunableWarn:
			klog.V(2).InfoS("Invalid kernel flag", "flag", flag, "expectedValue", expectedValue, "actualValue", val)
		case KernelTunableModify:
			klog.V(2).InfoS("Updating kernel flag", "flag", flag, "expectedValue", expectedValue, "actualValue", val)
			err = sysctl.SetSysctl(flag, expectedValue)
			if err != nil {
				if libcontaineruserns.RunningInUserNS() {
					if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.KubeletInUserNamespace) {
						klog.V(2).InfoS("Updating kernel flag failed (running in UserNS, ignoring)", "flag", flag, "err", err)
						continue
					}
					klog.ErrorS(err, "Updating kernel flag failed (Hint: enable KubeletInUserNamespace feature flag to ignore the error)", "flag", flag)
				}
				errList = append(errList, err)
			}
		}
	}
	return utilerrors.NewAggregate(errList)
}

func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
	f, err := validateSystemRequirements(cm.mountUtil)
	if err != nil {
		return err
	}
	if !f.cpuHardcapping {
		cm.status.SoftRequirements = fmt.Errorf("CPU hardcapping unsupported")
	}
	b := KernelTunableModify
	if cm.GetNodeConfig().ProtectKernelDefaults {
		b = KernelTunableError
	}
	if err := setupKernelTunables(b); err != nil {
		return err
	}

	// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
	if cm.NodeConfig.CgroupsPerQOS {
		if err := cm.createNodeAllocatableCgroups(); err != nil {
			return err
		}
		err = cm.qosContainerManager.Start(cm.GetNodeAllocatableAbsolute, activePods)
		if err != nil {
			return fmt.Errorf("failed to initialize top level QOS containers: %v", err)
		}
	}

	// Enforce Node Allocatable (if required)
	if err := cm.enforceNodeAllocatableCgroups(); err != nil {
		return err
	}

	systemContainers := []*systemContainer{}

	if cm.SystemCgroupsName != "" {
		if cm.SystemCgroupsName == "/" {
			return fmt.Errorf("system container cannot be root (\"/\")")
		}
		cont, err := newSystemCgroups(cm.SystemCgroupsName)
		if err != nil {
			return err
		}
		cont.ensureStateFunc = func(manager cgroups.Manager) error {
			return ensureSystemCgroups("/", manager)
		}
		systemContainers = append(systemContainers, cont)
	}

	if cm.KubeletCgroupsName != "" {
		cont, err := newSystemCgroups(cm.KubeletCgroupsName)
		if err != nil {
			return err
		}

		cont.ensureStateFunc = func(_ cgroups.Manager) error {
			return ensureProcessInContainerWithOOMScore(os.Getpid(), int(cm.KubeletOOMScoreAdj), cont.manager)
		}
		systemContainers = append(systemContainers, cont)
	} else {
		cm.periodicTasks = append(cm.periodicTasks, func() {
			if err := ensureProcessInContainerWithOOMScore(os.Getpid(), int(cm.KubeletOOMScoreAdj), nil); err != nil {
				klog.ErrorS(err, "Failed to ensure process in container with oom score")
				return
			}
			cont, err := getContainer(os.Getpid())
			if err != nil {
				klog.ErrorS(err, "Failed to find cgroups of kubelet")
				return
			}
			cm.Lock()
			defer cm.Unlock()

			cm.KubeletCgroupsName = cont
		})
	}

	cm.systemContainers = systemContainers
	return nil
}

func (cm *containerManagerImpl) GetNodeConfig() NodeConfig {
	cm.RLock()
	defer cm.RUnlock()
	return cm.NodeConfig
}

// GetPodCgroupRoot returns the literal cgroupfs value for the cgroup containing all pods.
func (cm *containerManagerImpl) GetPodCgroupRoot() string {
	return cm.cgroupManager.Name(cm.cgroupRoot)
}

func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
	return cm.subsystems
}

func (cm *containerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
	return cm.qosContainerManager.GetQOSContainersInfo()
}

func (cm *containerManagerImpl) UpdateQOSCgroups() error {
	return cm.qosContainerManager.UpdateCgroups()
}

func (cm *containerManagerImpl) Status() Status {
	cm.RLock()
	defer cm.RUnlock()
	return cm.status
}

func (cm *containerManagerImpl) Start(node *v1.Node,
	activePods ActivePodsFunc,
	sourcesReady config.SourcesReady,
	podStatusProvider status.PodStatusProvider,
	runtimeService internalapi.RuntimeService,
	localStorageCapacityIsolation bool) error {
	ctx := context.Background()

	containerMap, containerRunningSet := buildContainerMapAndRunningSetFromRuntime(ctx, runtimeService)

	// Initialize CPU manager
	err := cm.cpuManager.Start(cpumanager.ActivePodsFunc(activePods), sourcesReady, podStatusProvider, runtimeService, containerMap)
	if err != nil {
		return fmt.Errorf("start cpu manager error: %v", err)
	}

	// Initialize memory manager
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryManager) {
		containerMap, _ := buildContainerMapAndRunningSetFromRuntime(ctx, runtimeService)
		err := cm.memoryManager.Start(memorymanager.ActivePodsFunc(activePods), sourcesReady, podStatusProvider, runtimeService, containerMap)
		if err != nil {
			return fmt.Errorf("start memory manager error: %v", err)
		}
	}

	// Cache the node info, including the resource capacity and
	// allocatable of the node.
	cm.nodeInfo = node

	if localStorageCapacityIsolation {
		rootfs, err := cm.cadvisorInterface.RootFsInfo()
		if err != nil {
			return fmt.Errorf("failed to get rootfs info: %v", err)
		}
		for rName, rCap := range cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs) {
			cm.capacity[rName] = rCap
		}
	}

	// Ensure that node allocatable configuration is valid.
	if err := cm.validateNodeAllocatable(); err != nil {
		return err
	}

	// Setup the node
	if err := cm.setupNode(activePods); err != nil {
		return err
	}

	// Don't run a background thread if there are no ensureStateFuncs.
	hasEnsureStateFuncs := false
	for _, cont := range cm.systemContainers {
		if cont.ensureStateFunc != nil {
			hasEnsureStateFuncs = true
			break
		}
	}
	if hasEnsureStateFuncs {
		// Run ensure state functions every minute.
		go wait.Until(func() {
			for _, cont := range cm.systemContainers {
				if cont.ensureStateFunc != nil {
					if err := cont.ensureStateFunc(cont.manager); err != nil {
						klog.InfoS("Failed to ensure state", "containerName", cont.name, "err", err)
					}
				}
			}
		}, time.Minute, wait.NeverStop)

	}

	if len(cm.periodicTasks) > 0 {
		go wait.Until(func() {
			for _, task := range cm.periodicTasks {
				if task != nil {
					task()
				}
			}
		}, 5*time.Minute, wait.NeverStop)
	}

	// Starts device manager.
	if err := cm.deviceManager.Start(devicemanager.ActivePodsFunc(activePods), sourcesReady, containerMap, containerRunningSet); err != nil {
		return err
	}

	return nil
}

func (cm *containerManagerImpl) GetPluginRegistrationHandler() cache.PluginHandler {
	return cm.deviceManager.GetWatcherHandler()
}

// TODO: move the GetResources logic to PodContainerManager.
func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
	opts := &kubecontainer.RunContainerOptions{}
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
		resOpts, err := cm.draManager.GetResources(pod, container)
		if err != nil {
			return nil, err
		}
		// NOTE: Passing CDI device names as annotations is a temporary solution.
		// It will be removed after all runtimes are updated
		// to get CDI device names from the ContainerConfig.CDIDevices field.
		opts.Annotations = append(opts.Annotations, resOpts.Annotations...)
		opts.CDIDevices = append(opts.CDIDevices, resOpts.CDIDevices...)
	}
	// Allocate should already be called during predicateAdmitHandler.Admit(),
	// so just try to fetch device runtime information from the cached state here.
	devOpts, err := cm.deviceManager.GetDeviceRunContainerOptions(pod, container)
	if err != nil {
		return nil, err
	} else if devOpts == nil {
		return opts, nil
	}
	opts.Devices = append(opts.Devices, devOpts.Devices...)
	opts.Mounts = append(opts.Mounts, devOpts.Mounts...)
	opts.Envs = append(opts.Envs, devOpts.Envs...)
	opts.Annotations = append(opts.Annotations, devOpts.Annotations...)
	opts.CDIDevices = append(opts.CDIDevices, devOpts.CDIDevices...)
	return opts, nil
}

func (cm *containerManagerImpl) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
	return cm.deviceManager.UpdatePluginResources(node, attrs)
}

func (cm *containerManagerImpl) GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler {
	return cm.topologyManager
}

func (cm *containerManagerImpl) SystemCgroupsLimit() v1.ResourceList {
	cpuLimit := int64(0)

	// Sum up resources of all external containers.
	for _, cont := range cm.systemContainers {
		cpuLimit += cont.cpuMillicores
	}

	return v1.ResourceList{
		v1.ResourceCPU: *resource.NewMilliQuantity(
			cpuLimit,
			resource.DecimalSI),
	}
}

func isProcessRunningInHost(pid int) (bool, error) {
	// Get init pid namespace.
	initPidNs, err := os.Readlink("/proc/1/ns/pid")
	if err != nil {
		return false, fmt.Errorf("failed to find pid namespace of init process")
	}
	klog.V(10).InfoS("Found init PID namespace", "namespace", initPidNs)
	processPidNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", pid))
	if err != nil {
		return false, fmt.Errorf("failed to find pid namespace of process %q", pid)
	}
	klog.V(10).InfoS("Process info", "pid", pid, "namespace", processPidNs)
	return initPidNs == processPidNs, nil
}

func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager cgroups.Manager) error {
	if runningInHost, err := isProcessRunningInHost(pid); err != nil {
		// Err on the side of caution. Avoid moving the process unless we are able to identify its context.
		return err
	} else if !runningInHost {
		// The process is running inside a container; don't touch it.
		klog.V(2).InfoS("PID is not running in the host namespace", "pid", pid)
		return nil
	}

	var errs []error
	if manager != nil {
		cont, err := getContainer(pid)
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
		}

		name := ""
		cgroups, err := manager.GetCgroups()
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to get cgroups for %d: %v", pid, err))
		} else {
			name = cgroups.Name
		}

		if cont != name {
			err = manager.Apply(pid)
			if err != nil {
				errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, name, err))
			}
		}
	}

	// Also apply oom-score-adj to processes
	oomAdjuster := oom.NewOOMAdjuster()
	klog.V(5).InfoS("Attempting to apply oom_score_adj to process", "oomScoreAdj", oomScoreAdj, "pid", pid)
	if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
		klog.V(3).InfoS("Failed to apply oom_score_adj to process", "oomScoreAdj", oomScoreAdj, "pid", pid, "err", err)
		errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
	}
	return utilerrors.NewAggregate(errs)
}

// getContainer returns the cgroup associated with the specified pid.
// It enforces a unified hierarchy for memory and cpu cgroups.
// On systemd environments, it uses the name=systemd cgroup for the specified pid.
func getContainer(pid int) (string, error) {
	cgs, err := cgroups.ParseCgroupFile(fmt.Sprintf("/proc/%d/cgroup", pid))
	if err != nil {
		return "", err
	}

	if cgroups.IsCgroup2UnifiedMode() {
		c, found := cgs[""]
		if !found {
			return "", cgroups.NewNotFoundError("unified")
		}
		return c, nil
	}

	cpu, found := cgs["cpu"]
	if !found {
		return "", cgroups.NewNotFoundError("cpu")
	}
	memory, found := cgs["memory"]
	if !found {
		return "", cgroups.NewNotFoundError("memory")
	}

	// Since we use this container for accounting, we need to ensure it's a unified hierarchy.
	if cpu != memory {
		return "", fmt.Errorf("cpu and memory cgroup hierarchy not unified. cpu: %s, memory: %s", cpu, memory)
	}

	// On systemd, every pid is in a unified cgroup hierarchy (name=systemd, as seen in systemd-cgls).
	// CPU and memory accounting is off by default; users may choose to enable it per unit or globally.
	// Users can enable CPU and memory accounting globally via /etc/systemd/system.conf (DefaultCPUAccounting=true DefaultMemoryAccounting=true),
	// or per unit via CPUAccounting=true and MemoryAccounting=true.
	// We only warn if accounting is not enabled for CPU or memory so as not to break local development flows where the kubelet is launched in a terminal.
	// For example, the cgroup for the user session will be something like /user.slice/user-X.slice/session-X.scope, but the cpu and memory
	// cgroup will be the closest ancestor where accounting is performed (most likely /) on systems that launch docker containers.
	// As a result, on those systems, you will not get cpu or memory accounting statistics for the kubelet.
	// In addition, you would not get memory or cpu accounting for the runtime unless accounting was enabled on its unit (or globally).
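	// Illustrative /proc/<pid>/cgroup content on a cgroup v1 host running under systemd (hypothetical paths):
	//   4:memory:/user.slice
	//   3:cpu,cpuacct:/user.slice
	//   1:name=systemd:/user.slice/user-1000.slice/session-2.scope
	// In that case the name=systemd path is returned and both accounting warnings below are logged.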
	if systemd, found := cgs["name=systemd"]; found {
		if systemd != cpu {
			klog.InfoS("CPUAccounting not enabled for process", "pid", pid)
		}
		if systemd != memory {
			klog.InfoS("MemoryAccounting not enabled for process", "pid", pid)
		}
		return systemd, nil
	}

	return cpu, nil
}

// Ensures the system container is created and all non-kernel threads and process 1
// without a container are moved to it.
//
// The reason for leaving kernel threads in the root cgroup is that we don't want to tie the
// execution of these threads to the to-be-defined /system quota and create priority inversions.
func ensureSystemCgroups(rootCgroupPath string, manager cgroups.Manager) error {
	// Move non-kernel PIDs to the system container.
	// Only keep errors from the latest attempt.
	var finalErr error
	for i := 0; i <= 10; i++ {
		allPids, err := cmutil.GetPids(rootCgroupPath)
		if err != nil {
			finalErr = fmt.Errorf("failed to list PIDs for root: %v", err)
			continue
		}

		// Remove kernel pids and other protected PIDs (pid 1, PIDs already in system & kubelet containers)
		pids := make([]int, 0, len(allPids))
		for _, pid := range allPids {
			if pid == 1 || isKernelPid(pid) {
				continue
			}

			pids = append(pids, pid)
		}

		// Check if we have moved all the non-kernel PIDs.
		if len(pids) == 0 {
			return nil
		}

		klog.V(3).InfoS("Moving non-kernel processes", "pids", pids)
		for _, pid := range pids {
			err := manager.Apply(pid)
			if err != nil {
				name := ""
				cgroups, err := manager.GetCgroups()
				if err == nil {
					name = cgroups.Name
				}

				finalErr = fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, name, err)
			}
		}

	}

	return finalErr
}

// Determines whether the specified PID is a kernel PID.
func isKernelPid(pid int) bool {
	// Kernel threads have no associated executable.
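	// For them, readlink(2) on /proc/<pid>/exe fails with ENOENT, which is what this check relies on.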
	_, err := os.Readlink(fmt.Sprintf("/proc/%d/exe", pid))
	return err != nil && os.IsNotExist(err)
}

// GetCapacity returns node capacity data for "cpu", "memory", "ephemeral-storage", and "huge-pages*".
// At present this method is only invoked when introspecting ephemeral storage.
func (cm *containerManagerImpl) GetCapacity(localStorageCapacityIsolation bool) v1.ResourceList {
	if localStorageCapacityIsolation {
		// We store allocatable ephemeral-storage in the capacity property once we Start() the container manager.
		if _, ok := cm.capacity[v1.ResourceEphemeralStorage]; !ok {
			// If we haven't yet stored the capacity for ephemeral-storage, we can try to fetch it directly from cAdvisor.
			if cm.cadvisorInterface != nil {
				rootfs, err := cm.cadvisorInterface.RootFsInfo()
				if err != nil {
					klog.ErrorS(err, "Unable to get rootfs data from cAdvisor interface")
					// If the rootfs info retrieval from cAdvisor fails for any reason, fall back to returning the capacity property with no ephemeral storage data.
					return cm.capacity
				}
				// We don't want to mutate cm.capacity here, so we manually construct a v1.ResourceList from it
				// and add ephemeral-storage.
				capacityWithEphemeralStorage := v1.ResourceList{}
				for rName, rQuant := range cm.capacity {
					capacityWithEphemeralStorage[rName] = rQuant
				}
				capacityWithEphemeralStorage[v1.ResourceEphemeralStorage] = cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs)[v1.ResourceEphemeralStorage]
				return capacityWithEphemeralStorage
			}
		}
	}
	return cm.capacity
}

func (cm *containerManagerImpl) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
	return cm.deviceManager.GetCapacity()
}

func (cm *containerManagerImpl) GetDevices(podUID, containerName string) []*podresourcesapi.ContainerDevices {
	return containerDevicesFromResourceDeviceInstances(cm.deviceManager.GetDevices(podUID, containerName))
}

func (cm *containerManagerImpl) GetAllocatableDevices() []*podresourcesapi.ContainerDevices {
	return containerDevicesFromResourceDeviceInstances(cm.deviceManager.GetAllocatableDevices())
}

func int64Slice(in []int) []int64 {
	out := make([]int64, len(in))
	for i := range in {
		out[i] = int64(in[i])
	}
	return out
}

func (cm *containerManagerImpl) GetCPUs(podUID, containerName string) []int64 {
	if cm.cpuManager != nil {
		return int64Slice(cm.cpuManager.GetExclusiveCPUs(podUID, containerName).UnsortedList())
	}
	return []int64{}
}

func (cm *containerManagerImpl) GetAllocatableCPUs() []int64 {
	if cm.cpuManager != nil {
		return int64Slice(cm.cpuManager.GetAllocatableCPUs().UnsortedList())
	}
	return []int64{}
}

func (cm *containerManagerImpl) GetMemory(podUID, containerName string) []*podresourcesapi.ContainerMemory {
	if cm.memoryManager == nil {
		return []*podresourcesapi.ContainerMemory{}
	}

	return containerMemoryFromBlock(cm.memoryManager.GetMemory(podUID, containerName))
}

func (cm *containerManagerImpl) GetAllocatableMemory() []*podresourcesapi.ContainerMemory {
	if cm.memoryManager == nil {
		return []*podresourcesapi.ContainerMemory{}
	}

	return containerMemoryFromBlock(cm.memoryManager.GetAllocatableMemory())
}

func (cm *containerManagerImpl) GetDynamicResources(pod *v1.Pod, container *v1.Container) []*podresourcesapi.DynamicResource {
	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DynamicResourceAllocation) {
		return []*podresourcesapi.DynamicResource{}
	}

	var containerDynamicResources []*podresourcesapi.DynamicResource
	containerClaimInfos, err := cm.draManager.GetContainerClaimInfos(pod, container)
	if err != nil {
		klog.ErrorS(err, "Unable to get container claim info state")
		return []*podresourcesapi.DynamicResource{}
	}
	for _, containerClaimInfo := range containerClaimInfos {
		var claimResources []*podresourcesapi.ClaimResource
		containerClaimInfo.RLock()
		// TODO: Currently we maintain a list of ClaimResources, each of which contains
		// a set of CDIDevices from a different kubelet plugin. In the future we may want to
		// include the name of the kubelet plugin and/or other types of resources that are
		// not CDIDevices (assuming the DRAmanager supports this).
		for _, klPluginCdiDevices := range containerClaimInfo.CDIDevices {
			var cdiDevices []*podresourcesapi.CDIDevice
			for _, cdiDevice := range klPluginCdiDevices {
				cdiDevices = append(cdiDevices, &podresourcesapi.CDIDevice{Name: cdiDevice})
			}
			claimResources = append(claimResources, &podresourcesapi.ClaimResource{CDIDevices: cdiDevices})
		}
		containerClaimInfo.RUnlock()
		containerDynamicResource := podresourcesapi.DynamicResource{
			ClassName:      containerClaimInfo.ClassName,
			ClaimName:      containerClaimInfo.ClaimName,
			ClaimNamespace: containerClaimInfo.Namespace,
			ClaimResources: claimResources,
		}
		containerDynamicResources = append(containerDynamicResources, &containerDynamicResource)
	}
	return containerDynamicResources
}

func (cm *containerManagerImpl) ShouldResetExtendedResourceCapacity() bool {
	return cm.deviceManager.ShouldResetExtendedResourceCapacity()
}

func (cm *containerManagerImpl) UpdateAllocatedDevices() {
	cm.deviceManager.UpdateAllocatedDevices()
}

func containerMemoryFromBlock(blocks []memorymanagerstate.Block) []*podresourcesapi.ContainerMemory {
	var containerMemories []*podresourcesapi.ContainerMemory

	for _, b := range blocks {
		containerMemory := podresourcesapi.ContainerMemory{
			MemoryType: string(b.Type),
			Size_:      b.Size,
			Topology: &podresourcesapi.TopologyInfo{
				Nodes: []*podresourcesapi.NUMANode{},
			},
		}

		for _, numaNodeID := range b.NUMAAffinity {
			containerMemory.Topology.Nodes = append(containerMemory.Topology.Nodes, &podresourcesapi.NUMANode{ID: int64(numaNodeID)})
		}

		containerMemories = append(containerMemories, &containerMemory)
	}

	return containerMemories
}

func (cm *containerManagerImpl) PrepareDynamicResources(pod *v1.Pod) error {
	return cm.draManager.PrepareResources(pod)
}

func (cm *containerManagerImpl) UnprepareDynamicResources(pod *v1.Pod) error {
	return cm.draManager.UnprepareResources(pod)
}

func (cm *containerManagerImpl) PodMightNeedToUnprepareResources(UID types.UID) bool {
	return cm.draManager.PodMightNeedToUnprepareResources(UID)
}