k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/pod_container_manager_linux.go (about) 1 /* 2 Copyright 2016 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package cm 18 19 import ( 20 "errors" 21 "fmt" 22 "os" 23 "path" 24 "strings" 25 26 libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" 27 v1 "k8s.io/api/core/v1" 28 "k8s.io/apimachinery/pkg/types" 29 utilerrors "k8s.io/apimachinery/pkg/util/errors" 30 utilfeature "k8s.io/apiserver/pkg/util/feature" 31 "k8s.io/klog/v2" 32 v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" 33 kubefeatures "k8s.io/kubernetes/pkg/features" 34 ) 35 36 const ( 37 podCgroupNamePrefix = "pod" 38 ) 39 40 // podContainerManagerImpl implements podContainerManager interface. 41 // It is the general implementation which allows pod level container 42 // management if qos Cgroup is enabled. 43 type podContainerManagerImpl struct { 44 // qosContainersInfo hold absolute paths of the top level qos containers 45 qosContainersInfo QOSContainersInfo 46 // Stores the mounted cgroup subsystems 47 subsystems *CgroupSubsystems 48 // cgroupManager is the cgroup Manager Object responsible for managing all 49 // pod cgroups. 50 cgroupManager CgroupManager 51 // Maximum number of pids in a pod 52 podPidsLimit int64 53 // enforceCPULimits controls whether cfs quota is enforced or not 54 enforceCPULimits bool 55 // cpuCFSQuotaPeriod is the cfs period value, cfs_period_us, setting per 56 // node for all containers in usec 57 cpuCFSQuotaPeriod uint64 58 } 59 60 // Make sure that podContainerManagerImpl implements the PodContainerManager interface 61 var _ PodContainerManager = &podContainerManagerImpl{} 62 63 // Exists checks if the pod's cgroup already exists 64 func (m *podContainerManagerImpl) Exists(pod *v1.Pod) bool { 65 podContainerName, _ := m.GetPodContainerName(pod) 66 return m.cgroupManager.Exists(podContainerName) 67 } 68 69 // EnsureExists takes a pod as argument and makes sure that 70 // pod cgroup exists if qos cgroup hierarchy flag is enabled. 71 // If the pod level container doesn't already exist it is created. 72 func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error { 73 // check if container already exist 74 alreadyExists := m.Exists(pod) 75 if !alreadyExists { 76 enforceMemoryQoS := false 77 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && 78 libcontainercgroups.IsCgroup2UnifiedMode() { 79 enforceMemoryQoS = true 80 } 81 // Create the pod container 82 podContainerName, _ := m.GetPodContainerName(pod) 83 containerConfig := &CgroupConfig{ 84 Name: podContainerName, 85 ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS), 86 } 87 if m.podPidsLimit > 0 { 88 containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit 89 } 90 if enforceMemoryQoS { 91 klog.V(4).InfoS("MemoryQoS config for pod", "pod", klog.KObj(pod), "unified", containerConfig.ResourceParameters.Unified) 92 } 93 if err := m.cgroupManager.Create(containerConfig); err != nil { 94 return fmt.Errorf("failed to create container for %v : %v", podContainerName, err) 95 } 96 } 97 return nil 98 } 99 100 // GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host. 101 func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) { 102 podQOS := v1qos.GetPodQOS(pod) 103 // Get the parent QOS container name 104 var parentContainer CgroupName 105 switch podQOS { 106 case v1.PodQOSGuaranteed: 107 parentContainer = m.qosContainersInfo.Guaranteed 108 case v1.PodQOSBurstable: 109 parentContainer = m.qosContainersInfo.Burstable 110 case v1.PodQOSBestEffort: 111 parentContainer = m.qosContainersInfo.BestEffort 112 } 113 podContainer := GetPodCgroupNameSuffix(pod.UID) 114 115 // Get the absolute path of the cgroup 116 cgroupName := NewCgroupName(parentContainer, podContainer) 117 // Get the literal cgroupfs name 118 cgroupfsName := m.cgroupManager.Name(cgroupName) 119 120 return cgroupName, cgroupfsName 121 } 122 123 func (m *podContainerManagerImpl) GetPodCgroupMemoryUsage(pod *v1.Pod) (uint64, error) { 124 podCgroupName, _ := m.GetPodContainerName(pod) 125 memUsage, err := m.cgroupManager.MemoryUsage(podCgroupName) 126 if err != nil { 127 return 0, err 128 } 129 return uint64(memUsage), nil 130 } 131 132 func (m *podContainerManagerImpl) GetPodCgroupConfig(pod *v1.Pod, resource v1.ResourceName) (*ResourceConfig, error) { 133 podCgroupName, _ := m.GetPodContainerName(pod) 134 return m.cgroupManager.GetCgroupConfig(podCgroupName, resource) 135 } 136 137 func (m *podContainerManagerImpl) SetPodCgroupConfig(pod *v1.Pod, resource v1.ResourceName, resourceConfig *ResourceConfig) error { 138 podCgroupName, _ := m.GetPodContainerName(pod) 139 return m.cgroupManager.SetCgroupConfig(podCgroupName, resource, resourceConfig) 140 } 141 142 // Kill one process ID 143 func (m *podContainerManagerImpl) killOnePid(pid int) error { 144 // os.FindProcess never returns an error on POSIX 145 // https://go-review.googlesource.com/c/go/+/19093 146 p, _ := os.FindProcess(pid) 147 if err := p.Kill(); err != nil { 148 // If the process already exited, that's fine. 149 if errors.Is(err, os.ErrProcessDone) { 150 klog.V(3).InfoS("Process no longer exists", "pid", pid) 151 return nil 152 } 153 return err 154 } 155 return nil 156 } 157 158 // Scan through the whole cgroup directory and kill all processes either 159 // attached to the pod cgroup or to a container cgroup under the pod cgroup 160 func (m *podContainerManagerImpl) tryKillingCgroupProcesses(podCgroup CgroupName) error { 161 pidsToKill := m.cgroupManager.Pids(podCgroup) 162 // No pids charged to the terminated pod cgroup return 163 if len(pidsToKill) == 0 { 164 return nil 165 } 166 167 var errlist []error 168 // os.Kill often errors out, 169 // We try killing all the pids multiple times 170 removed := map[int]bool{} 171 for i := 0; i < 5; i++ { 172 if i != 0 { 173 klog.V(3).InfoS("Attempt failed to kill all unwanted process from cgroup, retrying", "attempt", i, "cgroupName", podCgroup) 174 } 175 errlist = []error{} 176 for _, pid := range pidsToKill { 177 if _, ok := removed[pid]; ok { 178 continue 179 } 180 klog.V(3).InfoS("Attempting to kill process from cgroup", "pid", pid, "cgroupName", podCgroup) 181 if err := m.killOnePid(pid); err != nil { 182 klog.V(3).InfoS("Failed to kill process from cgroup", "pid", pid, "cgroupName", podCgroup, "err", err) 183 errlist = append(errlist, err) 184 } else { 185 removed[pid] = true 186 } 187 } 188 if len(errlist) == 0 { 189 klog.V(3).InfoS("Successfully killed all unwanted processes from cgroup", "cgroupName", podCgroup) 190 return nil 191 } 192 } 193 return utilerrors.NewAggregate(errlist) 194 } 195 196 // Destroy destroys the pod container cgroup paths 197 func (m *podContainerManagerImpl) Destroy(podCgroup CgroupName) error { 198 // Try killing all the processes attached to the pod cgroup 199 if err := m.tryKillingCgroupProcesses(podCgroup); err != nil { 200 klog.InfoS("Failed to kill all the processes attached to cgroup", "cgroupName", podCgroup, "err", err) 201 return fmt.Errorf("failed to kill all the processes attached to the %v cgroups : %v", podCgroup, err) 202 } 203 204 // Now its safe to remove the pod's cgroup 205 containerConfig := &CgroupConfig{ 206 Name: podCgroup, 207 ResourceParameters: &ResourceConfig{}, 208 } 209 if err := m.cgroupManager.Destroy(containerConfig); err != nil { 210 klog.InfoS("Failed to delete cgroup paths", "cgroupName", podCgroup, "err", err) 211 return fmt.Errorf("failed to delete cgroup paths for %v : %v", podCgroup, err) 212 } 213 return nil 214 } 215 216 // ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares. 217 func (m *podContainerManagerImpl) ReduceCPULimits(podCgroup CgroupName) error { 218 return m.cgroupManager.ReduceCPULimits(podCgroup) 219 } 220 221 // IsPodCgroup returns true if the literal cgroupfs name corresponds to a pod 222 func (m *podContainerManagerImpl) IsPodCgroup(cgroupfs string) (bool, types.UID) { 223 // convert the literal cgroupfs form to the driver specific value 224 cgroupName := m.cgroupManager.CgroupName(cgroupfs) 225 qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed} 226 basePath := "" 227 for _, qosContainerName := range qosContainersList { 228 // a pod cgroup is a direct child of a qos node, so check if its a match 229 if len(cgroupName) == len(qosContainerName)+1 { 230 basePath = cgroupName[len(qosContainerName)] 231 } 232 } 233 if basePath == "" { 234 return false, types.UID("") 235 } 236 if !strings.HasPrefix(basePath, podCgroupNamePrefix) { 237 return false, types.UID("") 238 } 239 parts := strings.Split(basePath, podCgroupNamePrefix) 240 if len(parts) != 2 { 241 return false, types.UID("") 242 } 243 return true, types.UID(parts[1]) 244 } 245 246 // GetAllPodsFromCgroups scans through all the subsystems of pod cgroups 247 // Get list of pods whose cgroup still exist on the cgroup mounts 248 func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) { 249 // Map for storing all the found pods on the disk 250 foundPods := make(map[types.UID]CgroupName) 251 qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed} 252 // Scan through all the subsystem mounts 253 // and through each QoS cgroup directory for each subsystem mount 254 // If a pod cgroup exists in even a single subsystem mount 255 // we will attempt to delete it 256 for _, val := range m.subsystems.MountPoints { 257 for _, qosContainerName := range qosContainersList { 258 // get the subsystems QoS cgroup absolute name 259 qcConversion := m.cgroupManager.Name(qosContainerName) 260 qc := path.Join(val, qcConversion) 261 dirInfo, err := os.ReadDir(qc) 262 if err != nil { 263 if os.IsNotExist(err) { 264 continue 265 } 266 return nil, fmt.Errorf("failed to read the cgroup directory %v : %v", qc, err) 267 } 268 for i := range dirInfo { 269 // its not a directory, so continue on... 270 if !dirInfo[i].IsDir() { 271 continue 272 } 273 // convert the concrete cgroupfs name back to an internal identifier 274 // this is needed to handle path conversion for systemd environments. 275 // we pass the fully qualified path so decoding can work as expected 276 // since systemd encodes the path in each segment. 277 cgroupfsPath := path.Join(qcConversion, dirInfo[i].Name()) 278 internalPath := m.cgroupManager.CgroupName(cgroupfsPath) 279 // we only care about base segment of the converted path since that 280 // is what we are reading currently to know if it is a pod or not. 281 basePath := internalPath[len(internalPath)-1] 282 if !strings.Contains(basePath, podCgroupNamePrefix) { 283 continue 284 } 285 // we then split the name on the pod prefix to determine the uid 286 parts := strings.Split(basePath, podCgroupNamePrefix) 287 // the uid is missing, so we log the unexpected cgroup not of form pod<uid> 288 if len(parts) != 2 { 289 klog.InfoS("Pod cgroup manager ignored unexpected cgroup because it is not a pod", "path", cgroupfsPath) 290 continue 291 } 292 podUID := parts[1] 293 foundPods[types.UID(podUID)] = internalPath 294 } 295 } 296 } 297 return foundPods, nil 298 } 299 300 // podContainerManagerNoop implements podContainerManager interface. 301 // It is a no-op implementation and basically does nothing 302 // podContainerManagerNoop is used in case the QoS cgroup Hierarchy is not 303 // enabled, so Exists() returns true always as the cgroupRoot 304 // is expected to always exist. 305 type podContainerManagerNoop struct { 306 cgroupRoot CgroupName 307 } 308 309 // Make sure that podContainerManagerStub implements the PodContainerManager interface 310 var _ PodContainerManager = &podContainerManagerNoop{} 311 312 func (m *podContainerManagerNoop) Exists(_ *v1.Pod) bool { 313 return true 314 } 315 316 func (m *podContainerManagerNoop) EnsureExists(_ *v1.Pod) error { 317 return nil 318 } 319 320 func (m *podContainerManagerNoop) GetPodContainerName(_ *v1.Pod) (CgroupName, string) { 321 return m.cgroupRoot, "" 322 } 323 324 func (m *podContainerManagerNoop) GetPodContainerNameForDriver(_ *v1.Pod) string { 325 return "" 326 } 327 328 // Destroy destroys the pod container cgroup paths 329 func (m *podContainerManagerNoop) Destroy(_ CgroupName) error { 330 return nil 331 } 332 333 func (m *podContainerManagerNoop) ReduceCPULimits(_ CgroupName) error { 334 return nil 335 } 336 337 func (m *podContainerManagerNoop) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) { 338 return nil, nil 339 } 340 341 func (m *podContainerManagerNoop) IsPodCgroup(cgroupfs string) (bool, types.UID) { 342 return false, types.UID("") 343 } 344 345 func (m *podContainerManagerNoop) GetPodCgroupMemoryUsage(_ *v1.Pod) (uint64, error) { 346 return 0, nil 347 } 348 349 func (m *podContainerManagerNoop) GetPodCgroupConfig(_ *v1.Pod, _ v1.ResourceName) (*ResourceConfig, error) { 350 return nil, nil 351 } 352 353 func (m *podContainerManagerNoop) SetPodCgroupConfig(_ *v1.Pod, _ v1.ResourceName, _ *ResourceConfig) error { 354 return nil 355 }