k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/kubelet_pods.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kubelet

import (
	"bytes"
	"context"
	goerrors "errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"os/user"
	"path/filepath"
	"runtime"
	"sort"
	"strconv"
	"strings"

	"github.com/google/go-cmp/cmp"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	utilvalidation "k8s.io/apimachinery/pkg/util/validation"
	"k8s.io/apimachinery/pkg/util/version"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	"k8s.io/kubelet/pkg/cri/streaming/portforward"
	remotecommandserver "k8s.io/kubelet/pkg/cri/streaming/remotecommand"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/api/v1/resource"
	podshelper "k8s.io/kubernetes/pkg/apis/core/pods"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/fieldpath"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/envvars"
	"k8s.io/kubernetes/pkg/kubelet/images"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/kubelet/status"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/kubernetes/pkg/kubelet/util"
	utilfs "k8s.io/kubernetes/pkg/util/filesystem"
	utilkernel "k8s.io/kubernetes/pkg/util/kernel"
	utilpod "k8s.io/kubernetes/pkg/util/pod"
	volumeutil "k8s.io/kubernetes/pkg/volume/util"
	"k8s.io/kubernetes/pkg/volume/util/hostutil"
	"k8s.io/kubernetes/pkg/volume/util/subpath"
	"k8s.io/kubernetes/pkg/volume/util/volumepathhandler"
	volumevalidation "k8s.io/kubernetes/pkg/volume/validation"
	"k8s.io/kubernetes/third_party/forked/golang/expansion"
	utilnet "k8s.io/utils/net"
)

const (
	managedHostsHeader                = "# Kubernetes-managed hosts file.\n"
	managedHostsHeaderWithHostNetwork = "# Kubernetes-managed hosts file (host network).\n"
)

// Container state reason list
const (
	PodInitializing   = "PodInitializing"
	ContainerCreating = "ContainerCreating"

	kubeletUser = "kubelet"
)

// parseGetSubIdsOutput parses the output from the `getsubids` tool, which is used to query
// subordinate user or group ID ranges for a given user or group. getsubids produces a line
// for each mapping configured. Here we expect that there is a single mapping, and that the
// same values are used for the subordinate user and group ID ranges.
// The output is something like:
//
//	$ getsubids kubelet
//	0: kubelet 65536 2147483648
//	$ getsubids -g kubelet
//	0: kubelet 65536 2147483648
func parseGetSubIdsOutput(input string) (uint32, uint32, error) {
	lines := strings.Split(strings.Trim(input, "\n"), "\n")
	if len(lines) != 1 {
		return 0, 0, fmt.Errorf("error parsing line %q: it must contain only one line", input)
	}

	parts := strings.Fields(lines[0])
	if len(parts) != 4 {
		return 0, 0, fmt.Errorf("invalid line %q", input)
	}

	// Parse the numeric fields: the start of the range and its length.
	num1, err := strconv.ParseUint(parts[2], 10, 32)
	if err != nil {
		return 0, 0, fmt.Errorf("error parsing line %q: %w", input, err)
	}

	num2, err := strconv.ParseUint(parts[3], 10, 32)
	if err != nil {
		return 0, 0, fmt.Errorf("error parsing line %q: %w", input, err)
	}

	return uint32(num1), uint32(num2), nil
}

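// A minimal usage sketch, assuming the documented getsubids output above
// (the values are the illustrative ones from the doc comment):
//
//	firstID, rangeLen, err := parseGetSubIdsOutput("0: kubelet 65536 2147483648\n")
//	// firstID == 65536, rangeLen == 2147483648, err == nil
//
//	// Output with more than one mapping line is rejected:
//	_, _, err = parseGetSubIdsOutput("0: kubelet 65536 1000\n1: kubelet 70000 1000\n")
//	// err != nil
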
// getKubeletMappings returns the range of IDs that can be used to configure user namespaces.
// If subordinate user or group ID ranges are specified for the kubelet user and the getsubids
// tool is installed, then the single mapping specified both for user and group IDs will be used.
// If the tool is not installed, or there are no IDs configured, the default mapping is returned.
// The default mapping includes the entire IDs range except IDs below 65536.
func (kl *Kubelet) getKubeletMappings() (uint32, uint32, error) {
	// default mappings to return if there is no specific configuration
	const defaultFirstID = 1 << 16
	const defaultLen = 1<<32 - defaultFirstID

	if !utilfeature.DefaultFeatureGate.Enabled(features.UserNamespacesSupport) {
		return defaultFirstID, defaultLen, nil
	} else {
		kernelVersion, err := utilkernel.GetVersion()
		if err != nil {
			return 0, 0, fmt.Errorf("failed to get kernel version, unable to determine if feature %s can be supported: %w",
				features.UserNamespacesSupport, err)
		}
		if kernelVersion != nil && !kernelVersion.AtLeast(version.MustParseGeneric(utilkernel.UserNamespacesSupportKernelVersion)) {
			klog.InfoS("WARNING: the kernel version is incompatible with the feature gate, which requires a minimum kernel version",
				"kernelVersion", kernelVersion, "feature", features.UserNamespacesSupport, "minKernelVersion", utilkernel.UserNamespacesSupportKernelVersion)
		}
	}

	_, err := user.Lookup(kubeletUser)
	if err != nil {
		var unknownUserErr user.UnknownUserError
		if goerrors.As(err, &unknownUserErr) {
			// if the user is not found, we assume that the user is not configured
			return defaultFirstID, defaultLen, nil
		}
		return 0, 0, err
	}

	execName := "getsubids"
	cmd, err := exec.LookPath(execName)
	if err != nil {
		if os.IsNotExist(err) {
			klog.V(2).InfoS("Could not find executable, default mappings will be used for the user namespaces", "executable", execName, "err", err)
			return defaultFirstID, defaultLen, nil
		}
		return 0, 0, err
	}
	outUids, err := exec.Command(cmd, kubeletUser).Output()
	if err != nil {
		return 0, 0, fmt.Errorf("error retrieving additional ids for user %q", kubeletUser)
	}
	outGids, err := exec.Command(cmd, "-g", kubeletUser).Output()
	if err != nil {
		return 0, 0, fmt.Errorf("error retrieving additional gids for user %q", kubeletUser)
	}
	if string(outUids) != string(outGids) {
		return 0, 0, fmt.Errorf("mismatched subuids and subgids for user %q", kubeletUser)
	}

	return parseGetSubIdsOutput(string(outUids))
}

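// For reference, when the feature gate is disabled, the kubelet user does not
// exist, or getsubids is not installed, the default mapping returned above is:
//
//	firstID == 65536      (1 << 16)
//	length  == 4294901760 (1<<32 - 1<<16)
//
// i.e. the whole 32-bit ID space except the host-reserved IDs below 65536.
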
// Get a list of pods that have data directories.
func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) {
	podInfos, err := os.ReadDir(kl.getPodsDir())
	if err != nil {
		return nil, err
	}
	pods := []types.UID{}
	for i := range podInfos {
		if podInfos[i].IsDir() {
			pods = append(pods, types.UID(podInfos[i].Name()))
		}
	}
	return pods, nil
}

// GetActivePods returns pods that have been admitted to the kubelet that
// are not fully terminated. This is mapped to the "desired state" of the
// kubelet - what pods should be running.
//
// WARNING: Currently this list does not include pods that have been force
// deleted but may still be terminating, which means resources assigned to
// those pods during admission may still be in use. See
// https://github.com/kubernetes/kubernetes/issues/104824
func (kl *Kubelet) GetActivePods() []*v1.Pod {
	allPods := kl.podManager.GetPods()
	activePods := kl.filterOutInactivePods(allPods)
	return activePods
}

// makeBlockVolumes maps the raw block devices specified in the path of the container
// Experimental
func (kl *Kubelet) makeBlockVolumes(pod *v1.Pod, container *v1.Container, podVolumes kubecontainer.VolumeMap, blkutil volumepathhandler.BlockVolumePathHandler) ([]kubecontainer.DeviceInfo, error) {
	var devices []kubecontainer.DeviceInfo
	for _, device := range container.VolumeDevices {
		// check path is absolute
		if !utilfs.IsAbs(device.DevicePath) {
			return nil, fmt.Errorf("error DevicePath `%s` must be an absolute path", device.DevicePath)
		}
		vol, ok := podVolumes[device.Name]
		if !ok || vol.BlockVolumeMapper == nil {
			klog.ErrorS(nil, "Block volume cannot be satisfied for container, because the volume is missing or the volume mapper is nil", "containerName", container.Name, "device", device)
			return nil, fmt.Errorf("cannot find volume %q to pass into container %q", device.Name, container.Name)
		}
		// Get a symbolic link associated to a block device under pod device path
		dirPath, volName := vol.BlockVolumeMapper.GetPodDeviceMapPath()
		symlinkPath := filepath.Join(dirPath, volName)
		if islinkExist, checkErr := blkutil.IsSymlinkExist(symlinkPath); checkErr != nil {
			return nil, checkErr
		} else if islinkExist {
			// Check readOnly in PVCVolumeSource and set read only permission if it's true.
			permission := "mrw"
			if vol.ReadOnly {
				permission = "r"
			}
			klog.V(4).InfoS("Device will be attached to container in the corresponding path on host", "containerName", container.Name, "path", symlinkPath)
			devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: symlinkPath, PathInContainer: device.DevicePath, Permissions: permission})
		}
	}

	return devices, nil
}

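// To make the result concrete: for a container declaring a volumeDevice
// {Name: "data", DevicePath: "/dev/xvda"} whose mapper resolves the symlink
// under the pod device path (the paths here are hypothetical), the entry
// appended above would look roughly like:
//
//	kubecontainer.DeviceInfo{
//		PathOnHost:      "/var/lib/kubelet/pods/<uid>/volumeDevices/kubernetes.io~csi/data",
//		PathInContainer: "/dev/xvda",
//		Permissions:     "mrw", // "r" when vol.ReadOnly is true
//	}
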
// shouldMountHostsFile checks if the node's /etc/hosts should be mounted.
// Kubernetes only mounts /etc/hosts if:
//   - the container is not an infrastructure (pause) container, and
//   - the container is not already mounting on /etc/hosts.
//
// Kubernetes will not mount /etc/hosts if:
//   - the Pod sandbox is still being created and its IP is unknown (PodIP not yet set), or
//   - a Windows pod contains a hostProcess container.
func shouldMountHostsFile(pod *v1.Pod, podIPs []string) bool {
	shouldMount := len(podIPs) > 0
	if runtime.GOOS == "windows" {
		return shouldMount && !kubecontainer.HasWindowsHostProcessContainer(pod)
	}
	return shouldMount
}

// makeMounts determines the mount points for the given container.
func makeMounts(pod *v1.Pod, podDir string, container *v1.Container, hostName, hostDomain string, podIPs []string, podVolumes kubecontainer.VolumeMap, hu hostutil.HostUtils, subpather subpath.Interface, expandEnvs []kubecontainer.EnvVar, supportsRRO bool) ([]kubecontainer.Mount, func(), error) {
	mountEtcHostsFile := shouldMountHostsFile(pod, podIPs)
	klog.V(3).InfoS("Creating hosts mount for container", "pod", klog.KObj(pod), "containerName", container.Name, "podIPs", podIPs, "path", mountEtcHostsFile)
	mounts := []kubecontainer.Mount{}
	var cleanupAction func()
	for i, mount := range container.VolumeMounts {
		// do not mount /etc/hosts if container is already mounting on the path
		mountEtcHostsFile = mountEtcHostsFile && (mount.MountPath != etcHostsPath)
		vol, ok := podVolumes[mount.Name]
		if !ok || vol.Mounter == nil {
			klog.ErrorS(nil, "Mount cannot be satisfied for the container, because the volume is missing or the volume mounter (vol.Mounter) is nil",
				"containerName", container.Name, "ok", ok, "volumeMounter", mount)
			return nil, cleanupAction, fmt.Errorf("cannot find volume %q to mount into container %q", mount.Name, container.Name)
		}

		relabelVolume := false
		// If the volume supports SELinux and it has not been
		// relabeled already and it is not a read-only volume,
		// relabel it and mark it as labeled
		if vol.Mounter.GetAttributes().Managed && vol.Mounter.GetAttributes().SELinuxRelabel && !vol.SELinuxLabeled {
			vol.SELinuxLabeled = true
			relabelVolume = true
		}
		hostPath, err := volumeutil.GetPath(vol.Mounter)
		if err != nil {
			return nil, cleanupAction, err
		}

		subPath := mount.SubPath
		if mount.SubPathExpr != "" {
			subPath, err = kubecontainer.ExpandContainerVolumeMounts(mount, expandEnvs)
			if err != nil {
				return nil, cleanupAction, err
			}
		}

		if subPath != "" {
			if utilfs.IsAbs(subPath) {
				return nil, cleanupAction, fmt.Errorf("error SubPath `%s` must not be an absolute path", subPath)
			}

			err = volumevalidation.ValidatePathNoBacksteps(subPath)
			if err != nil {
				return nil, cleanupAction, fmt.Errorf("unable to provision SubPath `%s`: %v", subPath, err)
			}

			volumePath := hostPath
			hostPath = filepath.Join(volumePath, subPath)

			if subPathExists, err := hu.PathExists(hostPath); err != nil {
				klog.ErrorS(nil, "Could not determine if subPath exists, will not attempt to change its permissions", "path", hostPath)
			} else if !subPathExists {
				// Create the sub path now because if it's auto-created later when referenced, it may have an
				// incorrect ownership and mode. For example, the sub path directory must have at least g+rwx
				// when the pod specifies an fsGroup, and if the directory is not created here, Docker will
				// later auto-create it with the incorrect mode 0750.
				// Take extra care not to escape the volume!
				perm, err := hu.GetMode(volumePath)
				if err != nil {
					return nil, cleanupAction, err
				}
				if err := subpather.SafeMakeDir(subPath, volumePath, perm); err != nil {
					// Don't pass detailed error back to the user because it could give information about host filesystem
					klog.ErrorS(err, "Failed to create subPath directory for volumeMount of the container", "containerName", container.Name, "volumeMountName", mount.Name)
					return nil, cleanupAction, fmt.Errorf("failed to create subPath directory for volumeMount %q of container %q", mount.Name, container.Name)
				}
			}
			hostPath, cleanupAction, err = subpather.PrepareSafeSubpath(subpath.Subpath{
				VolumeMountIndex: i,
				Path:             hostPath,
				VolumeName:       vol.InnerVolumeSpecName,
				VolumePath:       volumePath,
				PodDir:           podDir,
				ContainerName:    container.Name,
			})
			if err != nil {
				// Don't pass detailed error back to the user because it could give information about host filesystem
				klog.ErrorS(err, "Failed to prepare subPath for volumeMount of the container", "containerName", container.Name, "volumeMountName", mount.Name)
				return nil, cleanupAction, fmt.Errorf("failed to prepare subPath for volumeMount %q of container %q", mount.Name, container.Name)
			}
		}

		// Docker Volume Mounts fail on Windows if it is not of the form C:/
		if volumeutil.IsWindowsLocalPath(runtime.GOOS, hostPath) {
			hostPath = volumeutil.MakeAbsolutePath(runtime.GOOS, hostPath)
		}

		containerPath := mount.MountPath
		// IsAbs returns false for UNC path/SMB shares/named pipes in Windows. So check for those specifically and skip MakeAbsolutePath
		if !volumeutil.IsWindowsUNCPath(runtime.GOOS, containerPath) && !utilfs.IsAbs(containerPath) {
			containerPath = volumeutil.MakeAbsolutePath(runtime.GOOS, containerPath)
		}

		propagation, err := translateMountPropagation(mount.MountPropagation)
		if err != nil {
			return nil, cleanupAction, err
		}
		klog.V(5).InfoS("Mount has propagation", "pod", klog.KObj(pod), "containerName", container.Name, "volumeMountName", mount.Name, "propagation", propagation)
		mustMountRO := vol.Mounter.GetAttributes().ReadOnly

		rro, err := resolveRecursiveReadOnly(mount, supportsRRO)
		if err != nil {
			return nil, cleanupAction, fmt.Errorf("failed to resolve recursive read-only mode: %w", err)
		}
		if rro && !utilfeature.DefaultFeatureGate.Enabled(features.RecursiveReadOnlyMounts) {
			return nil, cleanupAction, fmt.Errorf("recursive read-only mount needs feature gate %q to be enabled", features.RecursiveReadOnlyMounts)
		}

		mounts = append(mounts, kubecontainer.Mount{
			Name:              mount.Name,
			ContainerPath:     containerPath,
			HostPath:          hostPath,
			ReadOnly:          mount.ReadOnly || mustMountRO,
			RecursiveReadOnly: rro,
			SELinuxRelabel:    relabelVolume,
			Propagation:       propagation,
		})
	}
	if mountEtcHostsFile {
		hostAliases := pod.Spec.HostAliases
		hostsMount, err := makeHostsMount(podDir, podIPs, hostName, hostDomain, hostAliases, pod.Spec.HostNetwork)
		if err != nil {
			return nil, cleanupAction, err
		}
		mounts = append(mounts, *hostsMount)
	}
	return mounts, cleanupAction, nil
}

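// The subPath flow above, summarized with hypothetical values: for a volume
// mounted at volumePath /var/lib/kubelet/pods/<uid>/volumes/kubernetes.io~configmap/cfg
// and mount.SubPath "app/settings":
//
//  1. "app/settings" must be relative and free of ".." backsteps.
//  2. hostPath becomes <volumePath>/app/settings.
//  3. If that directory is missing, SafeMakeDir creates it with the volume
//     root's mode so fsGroup-driven permissions apply.
//  4. PrepareSafeSubpath then pins the resolved path (on Linux via a bind
//     mount under the pod dir) so symlinks inside the volume cannot escape it.
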
// translateMountPropagation transforms v1.MountPropagationMode to
// runtimeapi.MountPropagation.
func translateMountPropagation(mountMode *v1.MountPropagationMode) (runtimeapi.MountPropagation, error) {
	if runtime.GOOS == "windows" {
		// Windows containers don't support mount propagation, use private for it.
		// Refer https://docs.docker.com/storage/bind-mounts/#configure-bind-propagation.
		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
	}

	switch {
	case mountMode == nil:
		// PRIVATE is the default
		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
	case *mountMode == v1.MountPropagationHostToContainer:
		return runtimeapi.MountPropagation_PROPAGATION_HOST_TO_CONTAINER, nil
	case *mountMode == v1.MountPropagationBidirectional:
		return runtimeapi.MountPropagation_PROPAGATION_BIDIRECTIONAL, nil
	case *mountMode == v1.MountPropagationNone:
		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
	default:
		return 0, fmt.Errorf("invalid MountPropagation mode: %q", *mountMode)
	}
}

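// The resulting mapping, for reference:
//
//	(nil)                              -> PROPAGATION_PRIVATE (the default)
//	v1.MountPropagationNone            -> PROPAGATION_PRIVATE
//	v1.MountPropagationHostToContainer -> PROPAGATION_HOST_TO_CONTAINER
//	v1.MountPropagationBidirectional   -> PROPAGATION_BIDIRECTIONAL
//	anything else                      -> error
//
// On Windows every mode collapses to PROPAGATION_PRIVATE.
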
// getEtcHostsPath returns the full host-side path to a pod's generated /etc/hosts file
func getEtcHostsPath(podDir string) string {
	hostsFilePath := filepath.Join(podDir, "etc-hosts")
	// Volume Mounts fail on Windows if it is not of the form C:/
	return volumeutil.MakeAbsolutePath(runtime.GOOS, hostsFilePath)
}

// makeHostsMount makes the mountpoint for the hosts file that the containers
// in a pod are injected with. podIPs is provided instead of podIP as podIPs
// are present even if the dual-stack feature flag is not enabled.
func makeHostsMount(podDir string, podIPs []string, hostName, hostDomainName string, hostAliases []v1.HostAlias, useHostNetwork bool) (*kubecontainer.Mount, error) {
	hostsFilePath := getEtcHostsPath(podDir)
	if err := ensureHostsFile(hostsFilePath, podIPs, hostName, hostDomainName, hostAliases, useHostNetwork); err != nil {
		return nil, err
	}
	return &kubecontainer.Mount{
		Name:           "k8s-managed-etc-hosts",
		ContainerPath:  etcHostsPath,
		HostPath:       hostsFilePath,
		ReadOnly:       false,
		SELinuxRelabel: true,
	}, nil
}

// ensureHostsFile ensures that the given host file has an up-to-date ip, host
// name, and domain name.
func ensureHostsFile(fileName string, hostIPs []string, hostName, hostDomainName string, hostAliases []v1.HostAlias, useHostNetwork bool) error {
	var hostsFileContent []byte
	var err error

	if useHostNetwork {
		// if Pod is using host network, read hosts file from the node's filesystem.
		// `etcHostsPath` references the location of the hosts file on the node.
		// `/etc/hosts` for *nix systems.
		hostsFileContent, err = nodeHostsFileContent(etcHostsPath, hostAliases)
		if err != nil {
			return err
		}
	} else {
		// if Pod is not using host network, create a managed hosts file with Pod IP and other information.
		hostsFileContent = managedHostsFileContent(hostIPs, hostName, hostDomainName, hostAliases)
	}

	hostsFilePerm := os.FileMode(0644)
	if err := os.WriteFile(fileName, hostsFileContent, hostsFilePerm); err != nil {
		return err
	}
	return os.Chmod(fileName, hostsFilePerm)
}

// nodeHostsFileContent reads the content of the node's hosts file.
func nodeHostsFileContent(hostsFilePath string, hostAliases []v1.HostAlias) ([]byte, error) {
	hostsFileContent, err := os.ReadFile(hostsFilePath)
	if err != nil {
		return nil, err
	}
	var buffer bytes.Buffer
	buffer.WriteString(managedHostsHeaderWithHostNetwork)
	buffer.Write(hostsFileContent)
	buffer.Write(hostsEntriesFromHostAliases(hostAliases))
	return buffer.Bytes(), nil
}

// managedHostsFileContent generates the content of the managed etc hosts based on Pod IPs and other
// information.
func managedHostsFileContent(hostIPs []string, hostName, hostDomainName string, hostAliases []v1.HostAlias) []byte {
	var buffer bytes.Buffer
	buffer.WriteString(managedHostsHeader)
	buffer.WriteString("127.0.0.1\tlocalhost\n")                      // ipv4 localhost
	buffer.WriteString("::1\tlocalhost ip6-localhost ip6-loopback\n") // ipv6 localhost
	buffer.WriteString("fe00::0\tip6-localnet\n")
	buffer.WriteString("fe00::0\tip6-mcastprefix\n")
	buffer.WriteString("fe00::1\tip6-allnodes\n")
	buffer.WriteString("fe00::2\tip6-allrouters\n")
	if len(hostDomainName) > 0 {
		// a host entry is generated for all IPs in podIPs;
		// the podIPs field is populated even if the dual-stack
		// feature flag is not enabled.
		for _, hostIP := range hostIPs {
			buffer.WriteString(fmt.Sprintf("%s\t%s.%s\t%s\n", hostIP, hostName, hostDomainName, hostName))
		}
	} else {
		for _, hostIP := range hostIPs {
			buffer.WriteString(fmt.Sprintf("%s\t%s\n", hostIP, hostName))
		}
	}
	buffer.Write(hostsEntriesFromHostAliases(hostAliases))
	return buffer.Bytes()
}

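// For a pod with IP 10.0.0.5, hostname "web-0", and domain
// "web.default.svc.cluster.local" (illustrative values), managedHostsFileContent
// produces:
//
//	# Kubernetes-managed hosts file.
//	127.0.0.1	localhost
//	::1	localhost ip6-localhost ip6-loopback
//	fe00::0	ip6-localnet
//	fe00::0	ip6-mcastprefix
//	fe00::1	ip6-allnodes
//	fe00::2	ip6-allrouters
//	10.0.0.5	web-0.web.default.svc.cluster.local	web-0
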
func hostsEntriesFromHostAliases(hostAliases []v1.HostAlias) []byte {
	if len(hostAliases) == 0 {
		return []byte{}
	}

	var buffer bytes.Buffer
	buffer.WriteString("\n")
	buffer.WriteString("# Entries added by HostAliases.\n")
	// for each IP, write all aliases onto a single line in the hosts file
	for _, hostAlias := range hostAliases {
		buffer.WriteString(fmt.Sprintf("%s\t%s\n", hostAlias.IP, strings.Join(hostAlias.Hostnames, "\t")))
	}
	return buffer.Bytes()
}

// truncatePodHostnameIfNeeded truncates the pod hostname if it's longer than 63 chars.
func truncatePodHostnameIfNeeded(podName, hostname string) (string, error) {
	// Cap hostname at 63 chars (the specification is 64 bytes, which is 63 chars plus the null terminating char).
	const hostnameMaxLen = 63
	if len(hostname) <= hostnameMaxLen {
		return hostname, nil
	}
	truncated := hostname[:hostnameMaxLen]
	klog.ErrorS(nil, "Hostname for pod was too long, truncated it", "podName", podName, "hostnameMaxLen", hostnameMaxLen, "truncatedHostname", truncated)
	// hostname should not end with '-' or '.'
	truncated = strings.TrimRight(truncated, "-.")
	if len(truncated) == 0 {
		// This should never happen.
		return "", fmt.Errorf("hostname for pod %q was invalid: %q", podName, hostname)
	}
	return truncated, nil
}

// GetOrCreateUserNamespaceMappings returns the configuration for the sandbox user namespace
func (kl *Kubelet) GetOrCreateUserNamespaceMappings(pod *v1.Pod, runtimeHandler string) (*runtimeapi.UserNamespace, error) {
	return kl.usernsManager.GetOrCreateUserNamespaceMappings(pod, runtimeHandler)
}

// GeneratePodHostNameAndDomain creates a hostname and domain name for a pod,
// given that pod's spec and annotations, or returns an error.
func (kl *Kubelet) GeneratePodHostNameAndDomain(pod *v1.Pod) (string, string, error) {
	clusterDomain := kl.dnsConfigurer.ClusterDomain

	hostname := pod.Name
	if len(pod.Spec.Hostname) > 0 {
		if msgs := utilvalidation.IsDNS1123Label(pod.Spec.Hostname); len(msgs) != 0 {
			return "", "", fmt.Errorf("pod Hostname %q is not a valid DNS label: %s", pod.Spec.Hostname, strings.Join(msgs, ";"))
		}
		hostname = pod.Spec.Hostname
	}

	hostname, err := truncatePodHostnameIfNeeded(pod.Name, hostname)
	if err != nil {
		return "", "", err
	}

	hostDomain := ""
	if len(pod.Spec.Subdomain) > 0 {
		if msgs := utilvalidation.IsDNS1123Label(pod.Spec.Subdomain); len(msgs) != 0 {
			return "", "", fmt.Errorf("pod Subdomain %q is not a valid DNS label: %s", pod.Spec.Subdomain, strings.Join(msgs, ";"))
		}
		hostDomain = fmt.Sprintf("%s.%s.svc.%s", pod.Spec.Subdomain, pod.Namespace, clusterDomain)
	}

	return hostname, hostDomain, nil
}

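// Illustrative results, assuming a cluster domain of "cluster.local":
//
//	pod name "frontend-abc12", no spec.hostname/subdomain
//	-> ("frontend-abc12", "")
//
//	spec.hostname "web-0", spec.subdomain "web", namespace "prod"
//	-> ("web-0", "web.prod.svc.cluster.local")
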
// GetPodCgroupParent gets pod cgroup parent from container manager.
func (kl *Kubelet) GetPodCgroupParent(pod *v1.Pod) string {
	pcm := kl.containerManager.NewPodContainerManager()
	_, cgroupParent := pcm.GetPodContainerName(pod)
	return cgroupParent
}

// GenerateRunContainerOptions generates the RunContainerOptions, which can be used by
// the container runtime to set parameters for launching a container.
func (kl *Kubelet) GenerateRunContainerOptions(ctx context.Context, pod *v1.Pod, container *v1.Container, podIP string, podIPs []string) (*kubecontainer.RunContainerOptions, func(), error) {
	supportsRRO := kl.runtimeClassSupportsRecursiveReadOnlyMounts(pod)

	opts, err := kl.containerManager.GetResources(pod, container)
	if err != nil {
		return nil, nil, err
	}
	// The value of hostname is the short host name and it is sent to makeMounts to create the /etc/hosts file.
	hostname, hostDomainName, err := kl.GeneratePodHostNameAndDomain(pod)
	if err != nil {
		return nil, nil, err
	}
	// nodename will be equal to hostname if SetHostnameAsFQDN is nil or false. If SetHostnameAsFQDN
	// is true and hostDomainName is defined, nodename will be the FQDN (hostname.hostDomainName).
	nodename, err := util.GetNodenameForKernel(hostname, hostDomainName, pod.Spec.SetHostnameAsFQDN)
	if err != nil {
		return nil, nil, err
	}
	opts.Hostname = nodename
	podName := volumeutil.GetUniquePodName(pod)
	volumes := kl.volumeManager.GetMountedVolumesForPod(podName)

	blkutil := volumepathhandler.NewBlockVolumePathHandler()
	blkVolumes, err := kl.makeBlockVolumes(pod, container, volumes, blkutil)
	if err != nil {
		return nil, nil, err
	}
	opts.Devices = append(opts.Devices, blkVolumes...)

	envs, err := kl.makeEnvironmentVariables(pod, container, podIP, podIPs)
	if err != nil {
		return nil, nil, err
	}
	opts.Envs = append(opts.Envs, envs...)

	// only podIPs is sent to makeMounts, as podIPs is populated even if the dual-stack feature flag is not enabled.
	mounts, cleanupAction, err := makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIPs, volumes, kl.hostutil, kl.subpather, opts.Envs, supportsRRO)
	if err != nil {
		return nil, cleanupAction, err
	}
	opts.Mounts = append(opts.Mounts, mounts...)

	// adding TerminationMessagePath on Windows is only allowed if ContainerD is used. Individual files cannot
	// be mounted as volumes using Docker for Windows.
	if len(container.TerminationMessagePath) != 0 {
		p := kl.getPodContainerDir(pod.UID, container.Name)
		if err := os.MkdirAll(p, 0750); err != nil {
			klog.ErrorS(err, "Error on creating dir", "path", p)
		} else {
			opts.PodContainerDir = p
		}
	}

	return opts, cleanupAction, nil
}

var masterServices = sets.NewString("kubernetes")

// getServiceEnvVarMap makes a map[string]string of env vars for services a
// pod in namespace ns should see.
func (kl *Kubelet) getServiceEnvVarMap(ns string, enableServiceLinks bool) (map[string]string, error) {
	var (
		serviceMap = make(map[string]*v1.Service)
		m          = make(map[string]string)
	)

	// Get all service resources from the master (via a cache),
	// and populate them into service environment variables.
	if kl.serviceLister == nil {
		// Kubelets without masters (e.g. plain GCE ContainerVM) don't set env vars.
		return m, nil
	}
	services, err := kl.serviceLister.List(labels.Everything())
	if err != nil {
		return m, fmt.Errorf("failed to list services when setting up env vars")
	}

	// project the services in namespace ns onto the master services
	for i := range services {
		service := services[i]
		// ignore services where ClusterIP is "None" or empty
		if !v1helper.IsServiceIPSet(service) {
			continue
		}
		serviceName := service.Name

		// We always want to add environment variables for master services
		// from the default namespace, even if enableServiceLinks is false.
		// We also add environment variables for other services in the same
		// namespace, if enableServiceLinks is true.
		if service.Namespace == metav1.NamespaceDefault && masterServices.Has(serviceName) {
			if _, exists := serviceMap[serviceName]; !exists {
				serviceMap[serviceName] = service
			}
		} else if service.Namespace == ns && enableServiceLinks {
			serviceMap[serviceName] = service
		}
	}

	mappedServices := []*v1.Service{}
	for key := range serviceMap {
		mappedServices = append(mappedServices, serviceMap[key])
	}

	for _, e := range envvars.FromServices(mappedServices) {
		m[e.Name] = e.Value
	}
	return m, nil
}

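// The variables themselves are generated by envvars.FromServices using the
// legacy Docker-links naming scheme. For a service "redis-primary" with
// cluster IP 10.0.0.11 and TCP port 6379 (hypothetical values), a pod would
// see roughly:
//
//	REDIS_PRIMARY_SERVICE_HOST=10.0.0.11
//	REDIS_PRIMARY_SERVICE_PORT=6379
//	REDIS_PRIMARY_PORT=tcp://10.0.0.11:6379
//	REDIS_PRIMARY_PORT_6379_TCP=tcp://10.0.0.11:6379
//	REDIS_PRIMARY_PORT_6379_TCP_PROTO=tcp
//	REDIS_PRIMARY_PORT_6379_TCP_PORT=6379
//	REDIS_PRIMARY_PORT_6379_TCP_ADDR=10.0.0.11
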
// Make the environment variables for a pod in the given namespace.
func (kl *Kubelet) makeEnvironmentVariables(pod *v1.Pod, container *v1.Container, podIP string, podIPs []string) ([]kubecontainer.EnvVar, error) {
	if pod.Spec.EnableServiceLinks == nil {
		return nil, fmt.Errorf("nil pod.spec.enableServiceLinks encountered, cannot construct envvars")
	}

	// If the pod originates from the kube-api, then we know that the kube-apiserver is responding and the kubelet's credentials are valid.
	// Knowing this, it is reasonable to wait until the service lister has synchronized at least once before attempting to build
	// a service env var map. This doesn't prevent the race below from happening entirely, but it does prevent the "obvious"
	// failure case of services simply not having completed a list operation that can reasonably be expected to succeed.
	// One common case this prevents is a kubelet restart reading pods before services and some pod not having the
	// KUBERNETES_SERVICE_HOST injected because we didn't wait a short time for services to sync before proceeding.
	// The KUBERNETES_SERVICE_HOST link is special because it is unconditionally injected into pods and is read by the
	// in-cluster-config for pod clients.
	if !kubetypes.IsStaticPod(pod) && !kl.serviceHasSynced() {
		return nil, fmt.Errorf("services have not yet been read at least once, cannot construct envvars")
	}

	var result []kubecontainer.EnvVar
	// Note: These are added to the docker Config, but are not included in the checksum computed
	// by kubecontainer.HashContainer(...). That way, we can still determine whether a
	// v1.Container is already running by its hash. (We don't want to restart a container just
	// because some service changed.)
	//
	// Note that there is a race between Kubelet seeing the pod and kubelet seeing the service.
	// To avoid this users can: (1) wait between starting a service and starting pods that use it; or (2) detect
	// missing service env vars and exit and be restarted; or (3) use DNS instead of env vars
	// and keep trying to resolve the DNS name of the service (recommended).
	serviceEnv, err := kl.getServiceEnvVarMap(pod.Namespace, *pod.Spec.EnableServiceLinks)
	if err != nil {
		return result, err
	}

	var (
		configMaps = make(map[string]*v1.ConfigMap)
		secrets    = make(map[string]*v1.Secret)
		tmpEnv     = make(map[string]string)
	)

	// Env will override EnvFrom variables.
	// Process EnvFrom first then allow Env to replace existing values.
	for _, envFrom := range container.EnvFrom {
		switch {
		case envFrom.ConfigMapRef != nil:
			cm := envFrom.ConfigMapRef
			name := cm.Name
			configMap, ok := configMaps[name]
			if !ok {
				if kl.kubeClient == nil {
					return result, fmt.Errorf("couldn't get configMap %v/%v, no kubeClient defined", pod.Namespace, name)
				}
				optional := cm.Optional != nil && *cm.Optional
				configMap, err = kl.configMapManager.GetConfigMap(pod.Namespace, name)
				if err != nil {
					if errors.IsNotFound(err) && optional {
						// ignore error when marked optional
						continue
					}
					return result, err
				}
				configMaps[name] = configMap
			}

			for k, v := range configMap.Data {
				if len(envFrom.Prefix) > 0 {
					k = envFrom.Prefix + k
				}

				tmpEnv[k] = v
			}
		case envFrom.SecretRef != nil:
			s := envFrom.SecretRef
			name := s.Name
			secret, ok := secrets[name]
			if !ok {
				if kl.kubeClient == nil {
					return result, fmt.Errorf("couldn't get secret %v/%v, no kubeClient defined", pod.Namespace, name)
				}
				optional := s.Optional != nil && *s.Optional
				secret, err = kl.secretManager.GetSecret(pod.Namespace, name)
				if err != nil {
					if errors.IsNotFound(err) && optional {
						// ignore error when marked optional
						continue
					}
					return result, err
				}
				secrets[name] = secret
			}

			for k, v := range secret.Data {
				if len(envFrom.Prefix) > 0 {
					k = envFrom.Prefix + k
				}

				tmpEnv[k] = string(v)
			}
		}
	}

	// Determine the final values of variables:
	//
	// 1. Determine the final value of each variable:
	//    a. If the variable's Value is set, expand the `$(var)` references to other
	//       variables in the .Value field; the sources of variables are the declared
	//       variables of the container and the service environment variables
	//    b. If a source is defined for an environment variable, resolve the source
	// 2. Create the container's environment in the order variables are declared
	// 3. Add remaining service environment vars
	var (
		mappingFunc = expansion.MappingFuncFor(tmpEnv, serviceEnv)
	)
	for _, envVar := range container.Env {
		runtimeVal := envVar.Value
		if runtimeVal != "" {
			// Step 1a: expand variable references
			runtimeVal = expansion.Expand(runtimeVal, mappingFunc)
		} else if envVar.ValueFrom != nil {
			// Step 1b: resolve alternate env var sources
			switch {
			case envVar.ValueFrom.FieldRef != nil:
				runtimeVal, err = kl.podFieldSelectorRuntimeValue(envVar.ValueFrom.FieldRef, pod, podIP, podIPs)
				if err != nil {
					return result, err
				}
			case envVar.ValueFrom.ResourceFieldRef != nil:
				defaultedPod, defaultedContainer, err := kl.defaultPodLimitsForDownwardAPI(pod, container)
				if err != nil {
					return result, err
				}
				runtimeVal, err = containerResourceRuntimeValue(envVar.ValueFrom.ResourceFieldRef, defaultedPod, defaultedContainer)
				if err != nil {
					return result, err
				}
			case envVar.ValueFrom.ConfigMapKeyRef != nil:
				cm := envVar.ValueFrom.ConfigMapKeyRef
				name := cm.Name
				key := cm.Key
				optional := cm.Optional != nil && *cm.Optional
				configMap, ok := configMaps[name]
				if !ok {
					if kl.kubeClient == nil {
						return result, fmt.Errorf("couldn't get configMap %v/%v, no kubeClient defined", pod.Namespace, name)
					}
					configMap, err = kl.configMapManager.GetConfigMap(pod.Namespace, name)
					if err != nil {
						if errors.IsNotFound(err) && optional {
							// ignore error when marked optional
							continue
						}
						return result, err
					}
					configMaps[name] = configMap
				}
				runtimeVal, ok = configMap.Data[key]
				if !ok {
					if optional {
						continue
					}
					return result, fmt.Errorf("couldn't find key %v in ConfigMap %v/%v", key, pod.Namespace, name)
				}
			case envVar.ValueFrom.SecretKeyRef != nil:
				s := envVar.ValueFrom.SecretKeyRef
				name := s.Name
				key := s.Key
				optional := s.Optional != nil && *s.Optional
				secret, ok := secrets[name]
				if !ok {
					if kl.kubeClient == nil {
						return result, fmt.Errorf("couldn't get secret %v/%v, no kubeClient defined", pod.Namespace, name)
					}
					secret, err = kl.secretManager.GetSecret(pod.Namespace, name)
					if err != nil {
						if errors.IsNotFound(err) && optional {
							// ignore error when marked optional
							continue
						}
						return result, err
					}
					secrets[name] = secret
				}
				runtimeValBytes, ok := secret.Data[key]
				if !ok {
					if optional {
						continue
					}
					return result, fmt.Errorf("couldn't find key %v in Secret %v/%v", key, pod.Namespace, name)
				}
				runtimeVal = string(runtimeValBytes)
			}
		}

		tmpEnv[envVar.Name] = runtimeVal
	}

	// Append the env vars
	for k, v := range tmpEnv {
		result = append(result, kubecontainer.EnvVar{Name: k, Value: v})
	}

	// Append remaining service env vars.
	for k, v := range serviceEnv {
		// Accesses apiserver+Pods.
		// So, the master may set service env vars, or kubelet may. In case both are doing
		// it, we skip the key from the kubelet-generated ones so we don't have duplicate
		// env vars.
		// TODO: remove this next line once all platforms use apiserver+Pods.
		if _, present := tmpEnv[k]; !present {
			result = append(result, kubecontainer.EnvVar{Name: k, Value: v})
		}
	}
	return result, nil
}

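// Precedence, in short: envFrom entries are loaded first, env entries then
// overwrite any name they share, and service environment variables are
// appended last only for names not already set. For a hypothetical container
// whose envFrom provides {LOG_LEVEL: "info", MODE: "batch"} and whose env
// declares MODE="stream", the final environment contains LOG_LEVEL=info and
// MODE=stream, plus the service variables.
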
// podFieldSelectorRuntimeValue returns the runtime value of the given
// selector for a pod.
func (kl *Kubelet) podFieldSelectorRuntimeValue(fs *v1.ObjectFieldSelector, pod *v1.Pod, podIP string, podIPs []string) (string, error) {
	internalFieldPath, _, err := podshelper.ConvertDownwardAPIFieldLabel(fs.APIVersion, fs.FieldPath, "")
	if err != nil {
		return "", err
	}

	// make podIPs order match node IP family preference #97979
	podIPs = kl.sortPodIPs(podIPs)
	if len(podIPs) > 0 {
		podIP = podIPs[0]
	}

	switch internalFieldPath {
	case "spec.nodeName":
		return pod.Spec.NodeName, nil
	case "spec.serviceAccountName":
		return pod.Spec.ServiceAccountName, nil
	case "status.hostIP":
		hostIPs, err := kl.getHostIPsAnyWay()
		if err != nil {
			return "", err
		}
		return hostIPs[0].String(), nil
	case "status.hostIPs":
		if !utilfeature.DefaultFeatureGate.Enabled(features.PodHostIPs) {
			return "", nil
		}
		hostIPs, err := kl.getHostIPsAnyWay()
		if err != nil {
			return "", err
		}
		ips := make([]string, 0, len(hostIPs))
		for _, ip := range hostIPs {
			ips = append(ips, ip.String())
		}
		return strings.Join(ips, ","), nil
	case "status.podIP":
		return podIP, nil
	case "status.podIPs":
		return strings.Join(podIPs, ","), nil
	}
	return fieldpath.ExtractFieldPathAsString(pod, internalFieldPath)
}

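// Example resolutions for the selectors handled above (values hypothetical):
//
//	fieldRef "spec.nodeName" -> "node-1"
//	fieldRef "status.podIP"  -> "10.0.0.5"
//	fieldRef "status.podIPs" -> "10.0.0.5,fd00::5" (ordered by node IP family preference)
//	fieldRef "metadata.name" -> resolved by fieldpath.ExtractFieldPathAsString
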
// containerResourceRuntimeValue returns the value of the provided container resource
func containerResourceRuntimeValue(fs *v1.ResourceFieldSelector, pod *v1.Pod, container *v1.Container) (string, error) {
	containerName := fs.ContainerName
	if len(containerName) == 0 {
		return resource.ExtractContainerResourceValue(fs, container)
	}
	return resource.ExtractResourceValueByContainerName(fs, pod, containerName)
}

// killPod instructs the container runtime to kill the pod. This method requires that
// the pod status contains the result of the last syncPod, otherwise it may fail to
// terminate newly created containers and sandboxes.
func (kl *Kubelet) killPod(ctx context.Context, pod *v1.Pod, p kubecontainer.Pod, gracePeriodOverride *int64) error {
	// Call the container runtime KillPod method which stops all known running containers of the pod
	if err := kl.containerRuntime.KillPod(ctx, pod, p, gracePeriodOverride); err != nil {
		return err
	}
	if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
		klog.V(2).InfoS("Failed to update QoS cgroups while killing pod", "err", err)
	}
	return nil
}

// makePodDataDirs creates the dirs for the pod data.
func (kl *Kubelet) makePodDataDirs(pod *v1.Pod) error {
	uid := pod.UID
	if err := os.MkdirAll(kl.getPodDir(uid), 0750); err != nil && !os.IsExist(err) {
		return err
	}
	if err := os.MkdirAll(kl.getPodVolumesDir(uid), 0750); err != nil && !os.IsExist(err) {
		return err
	}
	if err := os.MkdirAll(kl.getPodPluginsDir(uid), 0750); err != nil && !os.IsExist(err) {
		return err
	}
	return nil
}

// getPullSecretsForPod inspects the Pod and retrieves the referenced pull
// secrets.
func (kl *Kubelet) getPullSecretsForPod(pod *v1.Pod) []v1.Secret {
	pullSecrets := []v1.Secret{}
	failedPullSecrets := []string{}

	for _, secretRef := range pod.Spec.ImagePullSecrets {
		if len(secretRef.Name) == 0 {
			// API validation permitted entries with empty names (https://issue.k8s.io/99454#issuecomment-787838112).
			// Ignore to avoid unnecessary warnings.
			continue
		}
		secret, err := kl.secretManager.GetSecret(pod.Namespace, secretRef.Name)
		if err != nil {
			klog.InfoS("Unable to retrieve pull secret, the image pull may not succeed.", "pod", klog.KObj(pod), "secret", klog.KObj(secret), "err", err)
			failedPullSecrets = append(failedPullSecrets, secretRef.Name)
			continue
		}

		pullSecrets = append(pullSecrets, *secret)
	}

	if len(failedPullSecrets) > 0 {
		kl.recorder.Eventf(pod, v1.EventTypeWarning, "FailedToRetrieveImagePullSecret", "Unable to retrieve some image pull secrets (%s); attempting to pull the image may not succeed.", strings.Join(failedPullSecrets, ", "))
	}

	return pullSecrets
}

// PodCouldHaveRunningContainers returns true if the pod with the given UID could still have running
// containers. This returns false if the pod has not yet been started or the pod is unknown.
func (kl *Kubelet) PodCouldHaveRunningContainers(pod *v1.Pod) bool {
	if kl.podWorkers.CouldHaveRunningContainers(pod.UID) {
		return true
	}

	// Check if pod might need to unprepare resources before termination
	// NOTE: This is a temporary solution. This call is here to avoid changing
	// status manager and its tests.
	// TODO: extend PodDeletionSafetyProvider interface and implement it
	// in a separate Kubelet method.
	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
		if kl.containerManager.PodMightNeedToUnprepareResources(pod.UID) {
			return true
		}
	}
	return false
}

// PodIsFinished returns true if SyncTerminatedPod is finished, i.e.
// all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet.
func (kl *Kubelet) PodIsFinished(pod *v1.Pod) bool {
	return kl.podWorkers.ShouldPodBeFinished(pod.UID)
}

// filterOutInactivePods returns pods that are neither in a terminal phase
// nor known to be fully terminated. This method should only be used
// when the set of pods being filtered is upstream of the pod worker, i.e.
// the pods the pod manager is aware of.
func (kl *Kubelet) filterOutInactivePods(pods []*v1.Pod) []*v1.Pod {
	filteredPods := make([]*v1.Pod, 0, len(pods))
	for _, p := range pods {
		// if a pod is fully terminated by UID, it should be excluded from the
		// list of pods
		if kl.podWorkers.IsPodKnownTerminated(p.UID) {
			continue
		}

		// terminal pods are considered inactive UNLESS they are actively terminating
		if kl.isAdmittedPodTerminal(p) && !kl.podWorkers.IsPodTerminationRequested(p.UID) {
			continue
		}

		filteredPods = append(filteredPods, p)
	}
	return filteredPods
}

// isAdmittedPodTerminal returns true if the provided config source pod is in
// a terminal phase, or if the Kubelet has already indicated the pod has reached
// a terminal phase but the config source has not accepted it yet. This method
// should only be used within the pod configuration loops that notify the pod
// worker; other components should treat the pod worker as authoritative.
func (kl *Kubelet) isAdmittedPodTerminal(pod *v1.Pod) bool {
	// pods are considered inactive if the config source has observed a
	// terminal phase (if the Kubelet recorded that the pod reached a terminal
	// phase the pod should never be restarted)
	if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
		return true
	}
	// a pod that has been marked terminal within the Kubelet is considered
	// inactive (may have been rejected by Kubelet admission)
	if status, ok := kl.statusManager.GetPodStatus(pod.UID); ok {
		if status.Phase == v1.PodSucceeded || status.Phase == v1.PodFailed {
			return true
		}
	}
	return false
}

// removeOrphanedPodStatuses removes obsolete entries in podStatus where
// the pod is no longer considered bound to this node.
func (kl *Kubelet) removeOrphanedPodStatuses(pods []*v1.Pod, mirrorPods []*v1.Pod) {
	podUIDs := make(map[types.UID]bool)
	for _, pod := range pods {
		podUIDs[pod.UID] = true
	}
	for _, pod := range mirrorPods {
		podUIDs[pod.UID] = true
	}
	kl.statusManager.RemoveOrphanedStatuses(podUIDs)
}

// HandlePodCleanups performs a series of cleanup work, including terminating
// pod workers, killing unwanted pods, and removing orphaned volumes/pod
// directories. No config changes are sent to pod workers while this method
// is executing which means no new pods can appear. After this method completes
// the desired state of the kubelet should be reconciled with the actual state
// in the pod worker and other pod-related components.
//
// This function is executed by the main sync loop, so it must execute quickly
// and all nested calls should be asynchronous. Any slow reconciliation actions
// should be performed by other components (like the volume manager). The duration
// of this call is the minimum latency for static pods to be restarted if they
// are updated with a fixed UID (most should use a dynamic UID), and no config
// updates are delivered to the pod workers while this method is running.
func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
	// The kubelet lacks checkpointing, so we need to introspect the set of pods
	// in the cgroup tree prior to inspecting the set of pods in our pod manager.
	// this ensures our view of the cgroup tree does not mistakenly observe pods
	// that are added after the fact...
	var (
		cgroupPods map[types.UID]cm.CgroupName
		err        error
	)
	if kl.cgroupsPerQOS {
		pcm := kl.containerManager.NewPodContainerManager()
		cgroupPods, err = pcm.GetAllPodsFromCgroups()
		if err != nil {
			return fmt.Errorf("failed to get list of pods that still exist on cgroup mounts: %v", err)
		}
	}

	allPods, mirrorPods, orphanedMirrorPodFullnames := kl.podManager.GetPodsAndMirrorPods()

	// Pod phase progresses monotonically. Once a pod has reached a final state,
	// it should never leave regardless of the restart policy. The statuses
	// of such pods should not be changed, and there is no need to sync them.
	// TODO: the logic here does not handle two cases:
	//   1. If the containers were removed immediately after they died, kubelet
	//      may fail to generate correct statuses, let alone filtering correctly.
	//   2. If kubelet restarted before writing the terminated status for a pod
	//      to the apiserver, it could still restart the terminated pod (even
	//      though the pod was not considered terminated by the apiserver).
	// These two conditions could be alleviated by checkpointing kubelet.

	// Stop the workers for terminated pods not in the config source
	klog.V(3).InfoS("Clean up pod workers for terminated pods")
	workingPods := kl.podWorkers.SyncKnownPods(allPods)

	// Reconcile: At this point the pod workers have been pruned to the set of
	// desired pods. Pods that must be restarted due to UID reuse, or leftover
	// pods from previous runs, are not known to the pod worker.

	allPodsByUID := make(map[types.UID]*v1.Pod)
	for _, pod := range allPods {
		allPodsByUID[pod.UID] = pod
	}

	// Identify the set of pods that have workers, which should be all pods
	// from config that are not terminated, as well as any terminating pods
	// that have already been removed from config. Pods that are terminating
	// will be added to possiblyRunningPods, to prevent overly aggressive
	// cleanup of pod cgroups.
	stringIfTrue := func(t bool) string {
		if t {
			return "true"
		}
		return ""
	}
	runningPods := make(map[types.UID]sets.Empty)
	possiblyRunningPods := make(map[types.UID]sets.Empty)
	for uid, sync := range workingPods {
		switch sync.State {
		case SyncPod:
			runningPods[uid] = struct{}{}
			possiblyRunningPods[uid] = struct{}{}
		case TerminatingPod:
			possiblyRunningPods[uid] = struct{}{}
		default:
		}
	}

	// Retrieve the list of running containers from the runtime to perform cleanup.
	// We need the latest state to avoid delaying restarts of static pods that reuse
	// a UID.
	if err := kl.runtimeCache.ForceUpdateIfOlder(ctx, kl.clock.Now()); err != nil {
		klog.ErrorS(err, "Error listing containers")
		return err
	}
	runningRuntimePods, err := kl.runtimeCache.GetPods(ctx)
	if err != nil {
		klog.ErrorS(err, "Error listing containers")
		return err
	}

	// Stop probing pods that are not running
	klog.V(3).InfoS("Clean up probes for terminated pods")
	kl.probeManager.CleanupPods(possiblyRunningPods)

	// Remove orphaned pod statuses not in the total list of known config pods
	klog.V(3).InfoS("Clean up orphaned pod statuses")
	kl.removeOrphanedPodStatuses(allPods, mirrorPods)

	// Remove orphaned pod user namespace allocations (if any).
	klog.V(3).InfoS("Clean up orphaned pod user namespace allocations")
	if err = kl.usernsManager.CleanupOrphanedPodUsernsAllocations(allPods, runningRuntimePods); err != nil {
		klog.ErrorS(err, "Failed cleaning up orphaned pod user namespaces allocations")
	}

	// Remove orphaned volumes from pods that are known not to have any
	// containers. Note that we pass all pods (including terminated pods) to
	// the function, so that we don't remove volumes associated with terminated
	// but not yet deleted pods.
	// TODO: this method could more aggressively cleanup terminated pods
	// in the future (volumes, mount dirs, logs, and containers could all be
	// better separated)
	klog.V(3).InfoS("Clean up orphaned pod directories")
	err = kl.cleanupOrphanedPodDirs(allPods, runningRuntimePods)
	if err != nil {
		// We want all cleanup tasks to be run even if one of them failed. So
		// we just log an error here and continue other cleanup tasks.
		// This also applies to the other clean up tasks.
		klog.ErrorS(err, "Failed cleaning up orphaned pod directories")
	}

	// Remove any orphaned mirror pods (mirror pods are tracked by name via the
	// pod worker)
	klog.V(3).InfoS("Clean up orphaned mirror pods")
	for _, podFullname := range orphanedMirrorPodFullnames {
		if !kl.podWorkers.IsPodForMirrorPodTerminatingByFullName(podFullname) {
			_, err := kl.mirrorPodClient.DeleteMirrorPod(podFullname, nil)
			if err != nil {
				klog.ErrorS(err, "Encountered error when deleting mirror pod", "podName", podFullname)
			} else {
				klog.V(3).InfoS("Deleted mirror pod", "podName", podFullname)
			}
		}
	}

	// After pruning pod workers for terminated pods get the list of active pods for
	// metrics and to determine restarts.
	activePods := kl.filterOutInactivePods(allPods)
	allRegularPods, allStaticPods := splitPodsByStatic(allPods)
	activeRegularPods, activeStaticPods := splitPodsByStatic(activePods)
	metrics.DesiredPodCount.WithLabelValues("").Set(float64(len(allRegularPods)))
	metrics.DesiredPodCount.WithLabelValues("true").Set(float64(len(allStaticPods)))
	metrics.ActivePodCount.WithLabelValues("").Set(float64(len(activeRegularPods)))
	metrics.ActivePodCount.WithLabelValues("true").Set(float64(len(activeStaticPods)))
	metrics.MirrorPodCount.Set(float64(len(mirrorPods)))

	// At this point, the pod worker is aware of which pods are not desired (SyncKnownPods).
	// We now look through the set of active pods for those that the pod worker is not aware of
	// and deliver an update. The most common reason a pod is not known is because the pod was
	// deleted and recreated with the same UID while the pod worker was driving its lifecycle (very
	// very rare for API pods, common for static pods with fixed UIDs). Containers that may still
	// be running from a previous execution must be reconciled by the pod worker's sync method.
	// We must use active pods because that is the set of admitted pods (podManager includes pods
	// that will never be run, and statusManager tracks already rejected pods).
	var restartCount, restartCountStatic int
	for _, desiredPod := range activePods {
		if _, knownPod := workingPods[desiredPod.UID]; knownPod {
			continue
		}

		klog.V(3).InfoS("Pod will be restarted because it is in the desired set and not known to the pod workers (likely due to UID reuse)", "podUID", desiredPod.UID)
		isStatic := kubetypes.IsStaticPod(desiredPod)
		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(desiredPod)
		if pod == nil || wasMirror {
			klog.V(2).InfoS("Programmer error, restartable pod was a mirror pod but activePods should never contain a mirror pod", "podUID", desiredPod.UID)
			continue
		}
		kl.podWorkers.UpdatePod(UpdatePodOptions{
			UpdateType: kubetypes.SyncPodCreate,
			Pod:        pod,
			MirrorPod:  mirrorPod,
		})

		// the desired pod is now known as well
		workingPods[desiredPod.UID] = PodWorkerSync{State: SyncPod, HasConfig: true, Static: isStatic}
		if isStatic {
			// restartable static pods are the normal case
			restartCountStatic++
		} else {
			// almost certainly means shenanigans, as API pods should never have the same UID after being deleted and recreated
			// unless there is a major API violation
			restartCount++
		}
	}
	metrics.RestartedPodTotal.WithLabelValues("true").Add(float64(restartCountStatic))
	metrics.RestartedPodTotal.WithLabelValues("").Add(float64(restartCount))

	// Complete termination of deleted pods that are not runtime pods (don't have
	// running containers), are terminal, and are not known to pod workers.
	// An example is pods rejected during kubelet admission that have never
	// started before (i.e. do not have an orphaned pod).
	// Adding the pods with SyncPodKill to pod workers allows us to proceed with
	// force-deletion of such pods, while preventing re-entry of the routine in the
	// next invocation of HandlePodCleanups.
	for _, pod := range kl.filterTerminalPodsToDelete(allPods, runningRuntimePods, workingPods) {
		klog.V(3).InfoS("Handling termination and deletion of the pod to pod workers", "pod", klog.KObj(pod), "podUID", pod.UID)
		kl.podWorkers.UpdatePod(UpdatePodOptions{
			UpdateType: kubetypes.SyncPodKill,
			Pod:        pod,
		})
	}

	// Finally, terminate any pods that are observed in the runtime but not present in the list of
	// known running pods from config. If we do terminate running runtime pods that will happen
	// asynchronously in the background and those will be processed in the next invocation of
	// HandlePodCleanups.
	var orphanCount int
	for _, runningPod := range runningRuntimePods {
		// If there are orphaned pod resources in CRI that are unknown to the pod worker, terminate them
		// now. Since housekeeping is exclusive to other pod worker updates, we know that no pods have
		// been added to the pod worker in the meantime. Note that pods that are not visible in the runtime
		// but which were previously known are terminated by SyncKnownPods().
		_, knownPod := workingPods[runningPod.ID]
		if !knownPod {
			one := int64(1)
			killPodOptions := &KillPodOptions{
				PodTerminationGracePeriodSecondsOverride: &one,
			}
			klog.V(2).InfoS("Clean up containers for orphaned pod we had not seen before", "podUID", runningPod.ID, "killPodOptions", killPodOptions)
			kl.podWorkers.UpdatePod(UpdatePodOptions{
				UpdateType:     kubetypes.SyncPodKill,
				RunningPod:     runningPod,
				KillPodOptions: killPodOptions,
			})

			// the running pod is now known as well
			workingPods[runningPod.ID] = PodWorkerSync{State: TerminatingPod, Orphan: true}
			orphanCount++
		}
	}
	metrics.OrphanedRuntimePodTotal.Add(float64(orphanCount))

	// Now that we have recorded any terminating pods, and added new pods that should be running,
	// record a summary here. Not all possible combinations of PodWorkerSync values are valid.
	counts := make(map[PodWorkerSync]int)
	for _, sync := range workingPods {
		counts[sync]++
	}
	for validSync, configState := range map[PodWorkerSync]string{
		{HasConfig: true, Static: true}:                "desired",
		{HasConfig: true, Static: false}:               "desired",
		{Orphan: true, HasConfig: true, Static: true}:  "orphan",
		{Orphan: true, HasConfig: true, Static: false}: "orphan",
		{Orphan: true, HasConfig: false}:               "runtime_only",
	} {
		for _, state := range []PodWorkerState{SyncPod, TerminatingPod, TerminatedPod} {
			validSync.State = state
			count := counts[validSync]
			delete(counts, validSync)
			staticString := stringIfTrue(validSync.Static)
			if !validSync.HasConfig {
				staticString = "unknown"
			}
			metrics.WorkingPodCount.WithLabelValues(state.String(), configState, staticString).Set(float64(count))
		}
	}
	if len(counts) > 0 {
		// in case a combination is lost
		klog.V(3).InfoS("Programmer error, did not report a kubelet_working_pods metric for a value returned by SyncKnownPods", "counts", counts)
	}

	// Remove any cgroups in the hierarchy for pods that are definitely no longer
	// running (not in the container runtime).
	if kl.cgroupsPerQOS {
		pcm := kl.containerManager.NewPodContainerManager()
		klog.V(3).InfoS("Clean up orphaned pod cgroups")
		kl.cleanupOrphanedPodCgroups(pcm, cgroupPods, possiblyRunningPods)
	}

	// Cleanup any backoff entries.
	kl.backOff.GC()
	return nil
}

// filterTerminalPodsToDelete returns terminal pods which are ready to be
// deleted by the status manager, but are not in pod workers.
// First, the check for deletionTimestamp is a performance optimization as we
// don't need to do anything with terminal pods without a deletionTimestamp.
// Second, the check for terminal pods is to avoid race conditions of triggering
// deletion on Pending pods which are not yet added to pod workers.
// Third, the check to skip pods known to pod workers is that the lifecycle of
// such pods is already handled by pod workers.
// Finally, we skip runtime pods as their termination is handled separately in
// the HandlePodCleanups routine.
1386 func (kl *Kubelet) filterTerminalPodsToDelete(allPods []*v1.Pod, runningRuntimePods []*kubecontainer.Pod, workingPods map[types.UID]PodWorkerSync) map[types.UID]*v1.Pod { 1387 terminalPodsToDelete := make(map[types.UID]*v1.Pod) 1388 for _, pod := range allPods { 1389 if pod.DeletionTimestamp == nil { 1390 // skip pods which don't have a deletion timestamp 1391 continue 1392 } 1393 if !podutil.IsPodPhaseTerminal(pod.Status.Phase) { 1394 // skip the non-terminal pods 1395 continue 1396 } 1397 if _, knownPod := workingPods[pod.UID]; knownPod { 1398 // skip pods known to pod workers 1399 continue 1400 } 1401 terminalPodsToDelete[pod.UID] = pod 1402 } 1403 for _, runningRuntimePod := range runningRuntimePods { 1404 // skip running runtime pods - they are handled by a dedicated routine 1405 // which terminates the containers 1406 delete(terminalPodsToDelete, runningRuntimePod.ID) 1407 } 1408 return terminalPodsToDelete 1409 } 1410 1411 // splitPodsByStatic separates a list of desired pods from the pod manager into 1412 // regular or static pods. Mirror pods are not valid config sources (a mirror pod 1413 // being created cannot cause the Kubelet to start running a static pod) and are 1414 // excluded. 1415 func splitPodsByStatic(pods []*v1.Pod) (regular, static []*v1.Pod) { 1416 regular, static = make([]*v1.Pod, 0, len(pods)), make([]*v1.Pod, 0, len(pods)) 1417 for _, pod := range pods { 1418 if kubetypes.IsMirrorPod(pod) { 1419 continue 1420 } 1421 if kubetypes.IsStaticPod(pod) { 1422 static = append(static, pod) 1423 } else { 1424 regular = append(regular, pod) 1425 } 1426 } 1427 return regular, static 1428 } 1429 1430 // validateContainerLogStatus returns the container ID for the desired container to retrieve logs for, based on the state 1431 // of the container. The previous flag will only return the logs for the last terminated container, otherwise, the current 1432 // running container is preferred over a previous termination. If info about the container is not available then a specific 1433 // error is returned to the end user. 1434 func (kl *Kubelet) validateContainerLogStatus(podName string, podStatus *v1.PodStatus, containerName string, previous bool) (containerID kubecontainer.ContainerID, err error) { 1435 var cID string 1436 1437 cStatus, found := podutil.GetContainerStatus(podStatus.ContainerStatuses, containerName) 1438 if !found { 1439 cStatus, found = podutil.GetContainerStatus(podStatus.InitContainerStatuses, containerName) 1440 } 1441 if !found { 1442 cStatus, found = podutil.GetContainerStatus(podStatus.EphemeralContainerStatuses, containerName) 1443 } 1444 if !found { 1445 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is not available", containerName, podName) 1446 } 1447 lastState := cStatus.LastTerminationState 1448 waiting, running, terminated := cStatus.State.Waiting, cStatus.State.Running, cStatus.State.Terminated 1449 1450 switch { 1451 case previous: 1452 if lastState.Terminated == nil || lastState.Terminated.ContainerID == "" { 1453 return kubecontainer.ContainerID{}, fmt.Errorf("previous terminated container %q in pod %q not found", containerName, podName) 1454 } 1455 cID = lastState.Terminated.ContainerID 1456 1457 case running != nil: 1458 cID = cStatus.ContainerID 1459 1460 case terminated != nil: 1461 // in cases where the next container didn't start, terminated.ContainerID will be empty, so get logs from the lastState.Terminated. 
1462 if terminated.ContainerID == "" { 1463 if lastState.Terminated != nil && lastState.Terminated.ContainerID != "" { 1464 cID = lastState.Terminated.ContainerID 1465 } else { 1466 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is terminated", containerName, podName) 1467 } 1468 } else { 1469 cID = terminated.ContainerID 1470 } 1471 1472 case lastState.Terminated != nil: 1473 if lastState.Terminated.ContainerID == "" { 1474 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is terminated", containerName, podName) 1475 } 1476 cID = lastState.Terminated.ContainerID 1477 1478 case waiting != nil: 1479 // output some info for the most common pending failures 1480 switch reason := waiting.Reason; reason { 1481 case images.ErrImagePull.Error(): 1482 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: image can't be pulled", containerName, podName) 1483 case images.ErrImagePullBackOff.Error(): 1484 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: trying and failing to pull image", containerName, podName) 1485 default: 1486 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: %v", containerName, podName, reason) 1487 } 1488 default: 1489 // unrecognized state 1490 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start - no logs yet", containerName, podName) 1491 } 1492 1493 return kubecontainer.ParseContainerID(cID), nil 1494 } 1495 1496 // GetKubeletContainerLogs returns logs from the container 1497 // TODO: this method is returning logs of random container attempts, when it should be returning the most recent attempt 1498 // or all of them. 1499 func (kl *Kubelet) GetKubeletContainerLogs(ctx context.Context, podFullName, containerName string, logOptions *v1.PodLogOptions, stdout, stderr io.Writer) error { 1500 // Pod workers periodically write status to statusManager. If status is not 1501 // cached there, something is wrong (or kubelet just restarted and hasn't 1502 // caught up yet). Just assume the pod is not ready yet. 1503 name, namespace, err := kubecontainer.ParsePodFullName(podFullName) 1504 if err != nil { 1505 return fmt.Errorf("unable to parse pod full name %q: %v", podFullName, err) 1506 } 1507 1508 pod, ok := kl.GetPodByName(namespace, name) 1509 if !ok { 1510 return fmt.Errorf("pod %q cannot be found - no logs available", name) 1511 } 1512 1513 // TODO: this should be using the podWorker's pod store as authoritative, since 1514 // the mirrorPod might still exist, the pod may have been force deleted but 1515 // is still terminating (users should be able to view logs of force deleted static pods 1516 // based on full name). 1517 var podUID types.UID 1518 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 1519 if wasMirror { 1520 if pod == nil { 1521 return fmt.Errorf("mirror pod %q does not have a corresponding pod", name) 1522 } 1523 podUID = mirrorPod.UID 1524 } else { 1525 podUID = pod.UID 1526 } 1527 1528 podStatus, found := kl.statusManager.GetPodStatus(podUID) 1529 if !found { 1530 // If there is no cached status, use the status from the 1531 // config source (apiserver). This is useful if kubelet 1532 // has recently been restarted. 
1533 podStatus = pod.Status 1534 } 1535 1536 // TODO: Consolidate the logic here with kuberuntime.GetContainerLogs, here we convert container name to containerID, 1537 // but inside kuberuntime we convert container id back to container name and restart count. 1538 // TODO: After separate container log lifecycle management, we should get log based on the existing log files 1539 // instead of container status. 1540 containerID, err := kl.validateContainerLogStatus(pod.Name, &podStatus, containerName, logOptions.Previous) 1541 if err != nil { 1542 return err 1543 } 1544 1545 // Do a zero-byte write to stdout before handing off to the container runtime. 1546 // This ensures at least one Write call is made to the writer when copying starts, 1547 // even if we then block waiting for log output from the container. 1548 if _, err := stdout.Write([]byte{}); err != nil { 1549 return err 1550 } 1551 1552 return kl.containerRuntime.GetContainerLogs(ctx, pod, containerID, logOptions, stdout, stderr) 1553 } 1554 1555 // getPhase returns the phase of a pod given its container info. 1556 func getPhase(pod *v1.Pod, info []v1.ContainerStatus, podIsTerminal bool) v1.PodPhase { 1557 spec := pod.Spec 1558 pendingInitialization := 0 1559 failedInitialization := 0 1560 1561 // regular init containers 1562 for _, container := range spec.InitContainers { 1563 if kubetypes.IsRestartableInitContainer(&container) { 1564 // Skip the restartable init containers here to handle them separately as 1565 // they are slightly different from the init containers in terms of the 1566 // pod phase. 1567 continue 1568 } 1569 1570 containerStatus, ok := podutil.GetContainerStatus(info, container.Name) 1571 if !ok { 1572 pendingInitialization++ 1573 continue 1574 } 1575 1576 switch { 1577 case containerStatus.State.Running != nil: 1578 pendingInitialization++ 1579 case containerStatus.State.Terminated != nil: 1580 if containerStatus.State.Terminated.ExitCode != 0 { 1581 failedInitialization++ 1582 } 1583 case containerStatus.State.Waiting != nil: 1584 if containerStatus.LastTerminationState.Terminated != nil { 1585 if containerStatus.LastTerminationState.Terminated.ExitCode != 0 { 1586 failedInitialization++ 1587 } 1588 } else { 1589 pendingInitialization++ 1590 } 1591 default: 1592 pendingInitialization++ 1593 } 1594 } 1595 1596 // counters for restartable init and regular containers 1597 unknown := 0 1598 running := 0 1599 waiting := 0 1600 stopped := 0 1601 succeeded := 0 1602 1603 // restartable init containers 1604 for _, container := range spec.InitContainers { 1605 if !kubetypes.IsRestartableInitContainer(&container) { 1606 // Skip the regular init containers, as they have been handled above. 1607 continue 1608 } 1609 containerStatus, ok := podutil.GetContainerStatus(info, container.Name) 1610 if !ok { 1611 unknown++ 1612 continue 1613 } 1614 1615 switch { 1616 case containerStatus.State.Running != nil: 1617 if containerStatus.Started == nil || !*containerStatus.Started { 1618 pendingInitialization++ 1619 } 1620 running++ 1621 case containerStatus.State.Terminated != nil: 1622 // Do nothing here, as terminated restartable init containers are not 1623 // taken into account for the pod phase. 1624 case containerStatus.State.Waiting != nil: 1625 if containerStatus.LastTerminationState.Terminated != nil { 1626 // Do nothing here, as terminated restartable init containers are not 1627 // taken into account for the pod phase. 
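					// Illustrative example (not upstream text): a restartable init
					// container sitting in CrashLoopBackOff after having run at
					// least once lands here with a recorded last termination, so
					// it neither counts as pending initialization nor as waiting.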
1628 } else { 1629 pendingInitialization++ 1630 waiting++ 1631 } 1632 default: 1633 pendingInitialization++ 1634 unknown++ 1635 } 1636 } 1637 1638 for _, container := range spec.Containers { 1639 containerStatus, ok := podutil.GetContainerStatus(info, container.Name) 1640 if !ok { 1641 unknown++ 1642 continue 1643 } 1644 1645 switch { 1646 case containerStatus.State.Running != nil: 1647 running++ 1648 case containerStatus.State.Terminated != nil: 1649 stopped++ 1650 if containerStatus.State.Terminated.ExitCode == 0 { 1651 succeeded++ 1652 } 1653 case containerStatus.State.Waiting != nil: 1654 if containerStatus.LastTerminationState.Terminated != nil { 1655 stopped++ 1656 } else { 1657 waiting++ 1658 } 1659 default: 1660 unknown++ 1661 } 1662 } 1663 1664 if failedInitialization > 0 && spec.RestartPolicy == v1.RestartPolicyNever { 1665 return v1.PodFailed 1666 } 1667 1668 switch { 1669 case pendingInitialization > 0 && 1670 // This is needed to handle the case where the pod has been initialized but 1671 // the restartable init containers are restarting and the pod should not be 1672 // placed back into v1.PodPending since the regular containers have run. 1673 !kubecontainer.HasAnyRegularContainerStarted(&spec, info): 1674 fallthrough 1675 case waiting > 0: 1676 klog.V(5).InfoS("Pod waiting > 0, pending") 1677 // One or more containers has not been started 1678 return v1.PodPending 1679 case running > 0 && unknown == 0: 1680 // All containers have been started, and at least 1681 // one container is running 1682 return v1.PodRunning 1683 case running == 0 && stopped > 0 && unknown == 0: 1684 // The pod is terminal so its containers won't be restarted regardless 1685 // of the restart policy. 1686 if podIsTerminal { 1687 // TODO(#116484): Also assign terminal phase to static pods. 1688 if !kubetypes.IsStaticPod(pod) { 1689 // All regular containers are terminated in success and all restartable 1690 // init containers are stopped. 1691 if stopped == succeeded { 1692 return v1.PodSucceeded 1693 } 1694 // There is at least one failure 1695 return v1.PodFailed 1696 } 1697 } 1698 // All containers are terminated 1699 if spec.RestartPolicy == v1.RestartPolicyAlways { 1700 // All containers are in the process of restarting 1701 return v1.PodRunning 1702 } 1703 if stopped == succeeded { 1704 // RestartPolicy is not Always, all containers are terminated in success 1705 // and all restartable init containers are stopped. 
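			// Illustrative example (not upstream text): a restartPolicy=OnFailure
			// Job pod whose two containers both exited 0 reaches this point with
			// stopped == 2 and succeeded == 2, so it reports Succeeded; had one
			// container exited non-zero, stopped would exceed succeeded and the
			// OnFailure case below would report Running while it restarts.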
1706 return v1.PodSucceeded 1707 } 1708 if spec.RestartPolicy == v1.RestartPolicyNever { 1709 // RestartPolicy is Never, and all containers are 1710 // terminated with at least one in failure 1711 return v1.PodFailed 1712 } 1713 // RestartPolicy is OnFailure, and at least one in failure 1714 // and in the process of restarting 1715 return v1.PodRunning 1716 default: 1717 klog.V(5).InfoS("Pod default case, pending") 1718 return v1.PodPending 1719 } 1720 } 1721 1722 func deleteCustomResourceFromResourceRequirements(target *v1.ResourceRequirements) { 1723 for resource := range target.Limits { 1724 if resource != v1.ResourceCPU && resource != v1.ResourceMemory && resource != v1.ResourceEphemeralStorage { 1725 delete(target.Limits, resource) 1726 } 1727 } 1728 for resource := range target.Requests { 1729 if resource != v1.ResourceCPU && resource != v1.ResourceMemory && resource != v1.ResourceEphemeralStorage { 1730 delete(target.Requests, resource) 1731 } 1732 } 1733 } 1734 1735 func (kl *Kubelet) determinePodResizeStatus(pod *v1.Pod, podStatus *v1.PodStatus) v1.PodResizeStatus { 1736 var podResizeStatus v1.PodResizeStatus 1737 specStatusDiffer := false 1738 for _, c := range pod.Spec.Containers { 1739 if cs, ok := podutil.GetContainerStatus(podStatus.ContainerStatuses, c.Name); ok { 1740 cResourceCopy := c.Resources.DeepCopy() 1741 // for both requests and limits, we only compare the cpu, memory and ephemeralstorage 1742 // which are included in convertToAPIContainerStatuses 1743 deleteCustomResourceFromResourceRequirements(cResourceCopy) 1744 csResourceCopy := cs.Resources.DeepCopy() 1745 if csResourceCopy != nil && !cmp.Equal(*cResourceCopy, *csResourceCopy) { 1746 specStatusDiffer = true 1747 break 1748 } 1749 } 1750 } 1751 if !specStatusDiffer { 1752 // Clear last resize state from checkpoint 1753 if err := kl.statusManager.SetPodResizeStatus(pod.UID, ""); err != nil { 1754 klog.ErrorS(err, "SetPodResizeStatus failed", "pod", pod.Name) 1755 } 1756 } else { 1757 if resizeStatus, found := kl.statusManager.GetPodResizeStatus(string(pod.UID)); found { 1758 podResizeStatus = resizeStatus 1759 } 1760 } 1761 return podResizeStatus 1762 } 1763 1764 // generateAPIPodStatus creates the final API pod status for a pod, given the 1765 // internal pod status. This method should only be called from within sync*Pod methods. 1766 func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus, podIsTerminal bool) v1.PodStatus { 1767 klog.V(3).InfoS("Generating pod status", "podIsTerminal", podIsTerminal, "pod", klog.KObj(pod)) 1768 // use the previous pod status, or the api status, as the basis for this pod 1769 oldPodStatus, found := kl.statusManager.GetPodStatus(pod.UID) 1770 if !found { 1771 oldPodStatus = pod.Status 1772 } 1773 s := kl.convertStatusToAPIStatus(pod, podStatus, oldPodStatus) 1774 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { 1775 s.Resize = kl.determinePodResizeStatus(pod, s) 1776 } 1777 // calculate the next phase and preserve reason 1778 allStatus := append(append([]v1.ContainerStatus{}, s.ContainerStatuses...), s.InitContainerStatuses...) 1779 s.Phase = getPhase(pod, allStatus, podIsTerminal) 1780 klog.V(4).InfoS("Got phase for pod", "pod", klog.KObj(pod), "oldPhase", oldPodStatus.Phase, "phase", s.Phase) 1781 1782 // Perform a three-way merge between the statuses from the status manager, 1783 // runtime, and generated status to ensure terminal status is correctly set. 
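	// Illustrative precedence (not upstream text): if the freshly generated phase
	// is Running but the status manager last reported Failed, the pod stays
	// Failed; likewise a terminal phase already recorded in the API is preserved.
	// Terminal phases are sticky by design.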
1784 	if s.Phase != v1.PodFailed && s.Phase != v1.PodSucceeded {
1785 		switch {
1786 		case oldPodStatus.Phase == v1.PodFailed || oldPodStatus.Phase == v1.PodSucceeded:
1787 			klog.V(4).InfoS("Status manager phase was terminal, updating phase to match", "pod", klog.KObj(pod), "phase", oldPodStatus.Phase)
1788 			s.Phase = oldPodStatus.Phase
1789 		case pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded:
1790 			klog.V(4).InfoS("API phase was terminal, updating phase to match", "pod", klog.KObj(pod), "phase", pod.Status.Phase)
1791 			s.Phase = pod.Status.Phase
1792 		}
1793 	}
1794 
1795 	if s.Phase == oldPodStatus.Phase {
1796 		// preserve the reason and message which is associated with the phase
1797 		s.Reason = oldPodStatus.Reason
1798 		s.Message = oldPodStatus.Message
1799 		if len(s.Reason) == 0 {
1800 			s.Reason = pod.Status.Reason
1801 		}
1802 		if len(s.Message) == 0 {
1803 			s.Message = pod.Status.Message
1804 		}
1805 	}
1806 
1807 	// check if an internal module has requested that the pod be evicted; if so, override the reason and message
1808 	for _, podSyncHandler := range kl.PodSyncHandlers {
1809 		if result := podSyncHandler.ShouldEvict(pod); result.Evict {
1810 			s.Phase = v1.PodFailed
1811 			s.Reason = result.Reason
1812 			s.Message = result.Message
1813 			break
1814 		}
1815 	}
1816 
1817 	// pods are not allowed to transition out of terminal phases
1818 	if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
1819 		// API server shows terminal phase; transitions are not allowed
1820 		if s.Phase != pod.Status.Phase {
1821 			klog.ErrorS(nil, "Pod attempted illegal phase transition", "pod", klog.KObj(pod), "originalStatusPhase", pod.Status.Phase, "apiStatusPhase", s.Phase, "apiStatus", s)
1822 			// Force back to phase from the API server
1823 			s.Phase = pod.Status.Phase
1824 		}
1825 	}
1826 
1827 	// ensure the probe managers have up-to-date status for containers
1828 	kl.probeManager.UpdatePodStatus(pod, s)
1829 
1830 	// preserve all conditions not owned by the kubelet
1831 	s.Conditions = make([]v1.PodCondition, 0, len(pod.Status.Conditions)+1)
1832 	for _, c := range pod.Status.Conditions {
1833 		if !kubetypes.PodConditionByKubelet(c.Type) {
1834 			s.Conditions = append(s.Conditions, c)
1835 		}
1836 	}
1837 
1838 	if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
1839 		// copy over the pod disruption conditions from state, which is already
1840 		// updated during eviction (due to either node resource pressure or
1841 		// node graceful shutdown). We do not re-generate the conditions based
1842 		// on the container statuses, as they are added based on one-time events.
1843 		cType := v1.DisruptionTarget
1844 		if _, condition := podutil.GetPodConditionFromList(oldPodStatus.Conditions, cType); condition != nil {
1845 			s.Conditions = utilpod.ReplaceOrAppendPodCondition(s.Conditions, condition)
1846 		}
1847 	}
1848 
1849 	// set all Kubelet-owned conditions
1850 	if utilfeature.DefaultFeatureGate.Enabled(features.PodReadyToStartContainersCondition) {
1851 		s.Conditions = append(s.Conditions, status.GeneratePodReadyToStartContainersCondition(pod, podStatus))
1852 	}
1853 	allContainerStatuses := append(s.InitContainerStatuses, s.ContainerStatuses...)
1854 	s.Conditions = append(s.Conditions, status.GeneratePodInitializedCondition(&pod.Spec, allContainerStatuses, s.Phase))
1855 	s.Conditions = append(s.Conditions, status.GeneratePodReadyCondition(&pod.Spec, s.Conditions, allContainerStatuses, s.Phase))
1856 	s.Conditions = append(s.Conditions, status.GenerateContainersReadyCondition(&pod.Spec, allContainerStatuses, s.Phase))
1857 	s.Conditions = append(s.Conditions, v1.PodCondition{
1858 		Type:   v1.PodScheduled,
1859 		Status: v1.ConditionTrue,
1860 	})
1861 	// set HostIP/HostIPs and initialize PodIP/PodIPs for host network pods
1862 	if kl.kubeClient != nil {
1863 		hostIPs, err := kl.getHostIPsAnyWay()
1864 		if err != nil {
1865 			klog.V(4).InfoS("Cannot get host IPs", "err", err)
1866 		} else {
1867 			if s.HostIP != "" {
1868 				if utilnet.IPFamilyOfString(s.HostIP) != utilnet.IPFamilyOf(hostIPs[0]) {
1869 					kl.recorder.Eventf(pod, v1.EventTypeWarning, "HostIPsIPFamilyMismatch",
1870 						"Kubelet detected an IPv%s node IP (%s), but the cloud provider selected an IPv%s node IP (%s); pass an explicit `--node-ip` to kubelet to fix this.",
1871 						utilnet.IPFamilyOfString(s.HostIP), s.HostIP, utilnet.IPFamilyOf(hostIPs[0]), hostIPs[0].String())
1872 				}
1873 			}
1874 			s.HostIP = hostIPs[0].String()
1875 			if utilfeature.DefaultFeatureGate.Enabled(features.PodHostIPs) {
1876 				s.HostIPs = []v1.HostIP{{IP: s.HostIP}}
1877 				if len(hostIPs) == 2 {
1878 					s.HostIPs = append(s.HostIPs, v1.HostIP{IP: hostIPs[1].String()})
1879 				}
1880 			}
1881 
1882 			// HostNetwork Pods inherit the node IPs as PodIPs. They are immutable once set,
1883 			// except that if the node becomes dual-stack, we add the secondary IP.
1884 			if kubecontainer.IsHostNetworkPod(pod) {
1885 				// Primary IP is not set
1886 				if s.PodIP == "" {
1887 					s.PodIP = hostIPs[0].String()
1888 					s.PodIPs = []v1.PodIP{{IP: s.PodIP}}
1889 				}
1890 				// Secondary IP is not set #105320
1891 				if len(hostIPs) == 2 && len(s.PodIPs) == 1 {
1892 					if utilnet.IPFamilyOfString(s.PodIPs[0].IP) != utilnet.IPFamilyOf(hostIPs[1]) {
1893 						s.PodIPs = append(s.PodIPs, v1.PodIP{IP: hostIPs[1].String()})
1894 					}
1895 				}
1896 			}
1897 		}
1898 	}
1899 
1900 	return *s
1901 }
1902 
1903 // sortPodIPs returns the PodIPs sorted and truncated by the cluster IP family preference.
1904 // The runtime pod status may have an arbitrary number of IPs, in an arbitrary order.
1905 // PodIPs are obtained by: func (m *kubeGenericRuntimeManager) determinePodSandboxIPs()
1906 // We pick out the first returned IP of the same IP family as the node IP,
1907 // followed by the first IP of the opposite IP family (if any),
1908 // and use them for Pod.Status.PodIPs and the Downward API environment variables.
1909 func (kl *Kubelet) sortPodIPs(podIPs []string) []string {
1910 	ips := make([]string, 0, 2)
1911 	var validPrimaryIP, validSecondaryIP func(ip string) bool
1912 	if len(kl.nodeIPs) == 0 || utilnet.IsIPv4(kl.nodeIPs[0]) {
1913 		validPrimaryIP = utilnet.IsIPv4String
1914 		validSecondaryIP = utilnet.IsIPv6String
1915 	} else {
1916 		validPrimaryIP = utilnet.IsIPv6String
1917 		validSecondaryIP = utilnet.IsIPv4String
1918 	}
1919 	for _, ip := range podIPs {
1920 		if validPrimaryIP(ip) {
1921 			ips = append(ips, ip)
1922 			break
1923 		}
1924 	}
1925 	for _, ip := range podIPs {
1926 		if validSecondaryIP(ip) {
1927 			ips = append(ips, ip)
1928 			break
1929 		}
1930 	}
1931 	return ips
1932 }
1933 
1934 // convertStatusToAPIStatus initializes an API PodStatus for the given pod from
1935 // the given internal pod status and the previous state of the pod from the API.
1936 // It is purely transformative and does not alter the kubelet state at all. 1937 func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus, oldPodStatus v1.PodStatus) *v1.PodStatus { 1938 var apiPodStatus v1.PodStatus 1939 1940 // copy pod status IPs to avoid race conditions with PodStatus #102806 1941 podIPs := make([]string, len(podStatus.IPs)) 1942 copy(podIPs, podStatus.IPs) 1943 1944 // make podIPs order match node IP family preference #97979 1945 podIPs = kl.sortPodIPs(podIPs) 1946 for _, ip := range podIPs { 1947 apiPodStatus.PodIPs = append(apiPodStatus.PodIPs, v1.PodIP{IP: ip}) 1948 } 1949 if len(apiPodStatus.PodIPs) > 0 { 1950 apiPodStatus.PodIP = apiPodStatus.PodIPs[0].IP 1951 } 1952 1953 // set status for Pods created on versions of kube older than 1.6 1954 apiPodStatus.QOSClass = v1qos.GetPodQOS(pod) 1955 1956 apiPodStatus.ContainerStatuses = kl.convertToAPIContainerStatuses( 1957 pod, podStatus, 1958 oldPodStatus.ContainerStatuses, 1959 pod.Spec.Containers, 1960 len(pod.Spec.InitContainers) > 0, 1961 false, 1962 ) 1963 apiPodStatus.InitContainerStatuses = kl.convertToAPIContainerStatuses( 1964 pod, podStatus, 1965 oldPodStatus.InitContainerStatuses, 1966 pod.Spec.InitContainers, 1967 len(pod.Spec.InitContainers) > 0, 1968 true, 1969 ) 1970 var ecSpecs []v1.Container 1971 for i := range pod.Spec.EphemeralContainers { 1972 ecSpecs = append(ecSpecs, v1.Container(pod.Spec.EphemeralContainers[i].EphemeralContainerCommon)) 1973 } 1974 1975 // #80875: By now we've iterated podStatus 3 times. We could refactor this to make a single 1976 // pass through podStatus.ContainerStatuses 1977 apiPodStatus.EphemeralContainerStatuses = kl.convertToAPIContainerStatuses( 1978 pod, podStatus, 1979 oldPodStatus.EphemeralContainerStatuses, 1980 ecSpecs, 1981 len(pod.Spec.InitContainers) > 0, 1982 false, 1983 ) 1984 1985 return &apiPodStatus 1986 } 1987 1988 // convertToAPIContainerStatuses converts the given internal container 1989 // statuses into API container statuses. 1990 func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecontainer.PodStatus, previousStatus []v1.ContainerStatus, containers []v1.Container, hasInitContainers, isInitContainer bool) []v1.ContainerStatus { 1991 convertContainerStatus := func(cs *kubecontainer.Status, oldStatus *v1.ContainerStatus) *v1.ContainerStatus { 1992 cid := cs.ID.String() 1993 status := &v1.ContainerStatus{ 1994 Name: cs.Name, 1995 RestartCount: int32(cs.RestartCount), 1996 Image: cs.Image, 1997 // Converting the digested image ref to the Kubernetes public 1998 // ContainerStatus.ImageID is historically intentional and should 1999 // not change. 
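		// (Illustrative note, not upstream text: cs.ImageRef is the digested image
		// reference reported by the CRI, e.g. "registry.example.com/app@sha256:<digest>",
		// which is why it maps onto ImageID below.)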
2000 			ImageID:      cs.ImageRef,
2001 			ContainerID:  cid,
2002 		}
2003 		if oldStatus != nil {
2004 			status.VolumeMounts = oldStatus.VolumeMounts // immutable
2005 		}
2006 		switch {
2007 		case cs.State == kubecontainer.ContainerStateRunning:
2008 			status.State.Running = &v1.ContainerStateRunning{StartedAt: metav1.NewTime(cs.StartedAt)}
2009 		case cs.State == kubecontainer.ContainerStateCreated:
2010 			// containers that are created but not running are "waiting to be running"
2011 			status.State.Waiting = &v1.ContainerStateWaiting{}
2012 		case cs.State == kubecontainer.ContainerStateExited:
2013 			status.State.Terminated = &v1.ContainerStateTerminated{
2014 				ExitCode:    int32(cs.ExitCode),
2015 				Reason:      cs.Reason,
2016 				Message:     cs.Message,
2017 				StartedAt:   metav1.NewTime(cs.StartedAt),
2018 				FinishedAt:  metav1.NewTime(cs.FinishedAt),
2019 				ContainerID: cid,
2020 			}
2021 
2022 		case cs.State == kubecontainer.ContainerStateUnknown &&
2023 			oldStatus != nil && // we have an old status
2024 			oldStatus.State.Running != nil: // our previous status was running
2025 			// If this happens, then we know that this container was previously running and isn't anymore (assuming the CRI isn't failing to return running containers).
2026 			// You can imagine this happening in cases where a container failed and the kubelet didn't ask about it in time to see the result.
2027 			// In this case, the container should not go into the waiting state immediately, because that can make cases like run-once pods actually run
2028 			// twice. "Container never ran" is different from "container ran and failed"; this is handled differently in the kubelet
2029 			// and in higher-order logic like crashloop detection and handling.
2030 			status.State.Terminated = &v1.ContainerStateTerminated{
2031 				Reason:   "ContainerStatusUnknown",
2032 				Message:  "The container could not be located when the pod was terminated",
2033 				ExitCode: 137, // this code indicates an error
2034 			}
2035 			// the restart count normally comes from the CRI (see near the top of this method), but since this is being added explicitly
2036 			// for the case where the CRI did not return a status, we need to manually increment the restart count to be accurate.
2037 			status.RestartCount = oldStatus.RestartCount + 1
2038 
2039 		default:
2040 			// This collapses any unknown state to container waiting. If any container is waiting, then the pod status moves to pending even if it is running.
2041 			// If I'm reading this correctly, then any failure to read status on any container results in the entire pod going pending even if the containers
2042 			// are actually running.
2043 			// see https://github.com/kubernetes/kubernetes/blob/5d1b3e26af73dde33ecb6a3e69fb5876ceab192f/pkg/kubelet/kuberuntime/kuberuntime_container.go#L497 to
2044 			// https://github.com/kubernetes/kubernetes/blob/8976e3620f8963e72084971d9d4decbd026bf49f/pkg/kubelet/kuberuntime/helpers.go#L58-L71
2045 			// and interpreted here https://github.com/kubernetes/kubernetes/blob/b27e78f590a0d43e4a23ca3b2bf1739ca4c6e109/pkg/kubelet/kubelet_pods.go#L1434-L1439
2046 			status.State.Waiting = &v1.ContainerStateWaiting{}
2047 		}
2048 		return status
2049 	}
2050 
2051 	convertContainerStatusResources := func(cName string, status *v1.ContainerStatus, cStatus *kubecontainer.Status, oldStatuses map[string]v1.ContainerStatus) *v1.ResourceRequirements {
2052 		var requests, limits v1.ResourceList
2053 		// oldStatus should always exist if the container is running
2054 		oldStatus, oldStatusFound := oldStatuses[cName]
2055 		// Initialize limits/requests from the container's spec upon transition to the Running state.
2056 		// For cpu & memory, values queried from the runtime via the CRI always supersede spec values.
2057 		// For ephemeral-storage, a running container's status.limit/request equals spec.limit/request.
2058 		determineResource := func(rName v1.ResourceName, v1ContainerResource, oldStatusResource, resource v1.ResourceList) {
2059 			if oldStatusFound {
2060 				if oldStatus.State.Running == nil || status.ContainerID != oldStatus.ContainerID {
2061 					if r, exists := v1ContainerResource[rName]; exists {
2062 						resource[rName] = r.DeepCopy()
2063 					}
2064 				} else {
2065 					if oldStatusResource != nil {
2066 						if r, exists := oldStatusResource[rName]; exists {
2067 							resource[rName] = r.DeepCopy()
2068 						}
2069 					}
2070 				}
2071 			}
2072 		}
2073 		container := kubecontainer.GetContainerSpec(pod, cName)
2074 		// AllocatedResources values come from the checkpoint, which is the source of truth.
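		// Illustrative (not upstream text): after an in-place resize of a
		// container's CPU request from 100m to 200m has been admitted, the
		// checkpoint stores 200m, so AllocatedResources reports 200m even
		// before the runtime has applied the new value.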
2075 found := false 2076 status.AllocatedResources, found = kl.statusManager.GetContainerResourceAllocation(string(pod.UID), cName) 2077 if !(container.Resources.Requests == nil && container.Resources.Limits == nil) && !found { 2078 // Log error and fallback to AllocatedResources in oldStatus if it exists 2079 klog.ErrorS(nil, "resource allocation not found in checkpoint store", "pod", pod.Name, "container", cName) 2080 if oldStatusFound { 2081 status.AllocatedResources = oldStatus.AllocatedResources 2082 } 2083 } 2084 if oldStatus.Resources == nil { 2085 oldStatus.Resources = &v1.ResourceRequirements{} 2086 } 2087 // Convert Limits 2088 if container.Resources.Limits != nil { 2089 limits = make(v1.ResourceList) 2090 if cStatus.Resources != nil && cStatus.Resources.CPULimit != nil { 2091 limits[v1.ResourceCPU] = cStatus.Resources.CPULimit.DeepCopy() 2092 } else { 2093 determineResource(v1.ResourceCPU, container.Resources.Limits, oldStatus.Resources.Limits, limits) 2094 } 2095 if cStatus.Resources != nil && cStatus.Resources.MemoryLimit != nil { 2096 limits[v1.ResourceMemory] = cStatus.Resources.MemoryLimit.DeepCopy() 2097 } else { 2098 determineResource(v1.ResourceMemory, container.Resources.Limits, oldStatus.Resources.Limits, limits) 2099 } 2100 if ephemeralStorage, found := container.Resources.Limits[v1.ResourceEphemeralStorage]; found { 2101 limits[v1.ResourceEphemeralStorage] = ephemeralStorage.DeepCopy() 2102 } 2103 } 2104 // Convert Requests 2105 if status.AllocatedResources != nil { 2106 requests = make(v1.ResourceList) 2107 if cStatus.Resources != nil && cStatus.Resources.CPURequest != nil { 2108 requests[v1.ResourceCPU] = cStatus.Resources.CPURequest.DeepCopy() 2109 } else { 2110 determineResource(v1.ResourceCPU, status.AllocatedResources, oldStatus.Resources.Requests, requests) 2111 } 2112 if memory, found := status.AllocatedResources[v1.ResourceMemory]; found { 2113 requests[v1.ResourceMemory] = memory.DeepCopy() 2114 } 2115 if ephemeralStorage, found := status.AllocatedResources[v1.ResourceEphemeralStorage]; found { 2116 requests[v1.ResourceEphemeralStorage] = ephemeralStorage.DeepCopy() 2117 } 2118 } 2119 //TODO(vinaykul,derekwaynecarr,InPlacePodVerticalScaling): Update this to include extended resources in 2120 // addition to CPU, memory, ephemeral storage. Add test case for extended resources. 2121 resources := &v1.ResourceRequirements{ 2122 Limits: limits, 2123 Requests: requests, 2124 } 2125 return resources 2126 } 2127 2128 // Fetch old containers statuses from old pod status. 2129 oldStatuses := make(map[string]v1.ContainerStatus, len(containers)) 2130 for _, status := range previousStatus { 2131 oldStatuses[status.Name] = status 2132 } 2133 2134 // Set all container statuses to default waiting state 2135 statuses := make(map[string]*v1.ContainerStatus, len(containers)) 2136 defaultWaitingState := v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: ContainerCreating}} 2137 if hasInitContainers { 2138 defaultWaitingState = v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: PodInitializing}} 2139 } 2140 2141 supportsRRO := kl.runtimeClassSupportsRecursiveReadOnlyMounts(pod) 2142 2143 for _, container := range containers { 2144 status := &v1.ContainerStatus{ 2145 Name: container.Name, 2146 Image: container.Image, 2147 State: defaultWaitingState, 2148 } 2149 // status.VolumeMounts cannot be propagated from kubecontainer.Status 2150 // because the CRI API is unaware of the volume names. 
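		// Illustrative (not upstream text): a read-only mount declared with
		// RecursiveReadOnly: IfPossible surfaces below as RecursiveReadOnlyEnabled
		// when the runtime handler advertises support, and as
		// RecursiveReadOnlyDisabled otherwise; resolveRecursiveReadOnly at the
		// bottom of this file performs that resolution.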
2151 		if utilfeature.DefaultFeatureGate.Enabled(features.RecursiveReadOnlyMounts) {
2152 			for _, vol := range container.VolumeMounts {
2153 				volStatus := v1.VolumeMountStatus{
2154 					Name:      vol.Name,
2155 					MountPath: vol.MountPath,
2156 					ReadOnly:  vol.ReadOnly,
2157 				}
2158 				if vol.ReadOnly {
2159 					rroMode := v1.RecursiveReadOnlyDisabled
2160 					if b, err := resolveRecursiveReadOnly(vol, supportsRRO); err != nil {
2161 						klog.ErrorS(err, "failed to resolve recursive read-only mode", "mode", *vol.RecursiveReadOnly)
2162 					} else if b {
2163 						if utilfeature.DefaultFeatureGate.Enabled(features.RecursiveReadOnlyMounts) {
2164 							rroMode = v1.RecursiveReadOnlyEnabled
2165 						} else {
2166 							klog.ErrorS(nil, "recursive read-only mount needs feature gate to be enabled",
2167 								"featureGate", features.RecursiveReadOnlyMounts)
2168 						}
2169 					}
2170 					volStatus.RecursiveReadOnly = &rroMode // Disabled or Enabled
2171 				}
2172 				status.VolumeMounts = append(status.VolumeMounts, volStatus)
2173 			}
2174 		}
2175 		oldStatus, found := oldStatuses[container.Name]
2176 		if found {
2177 			if oldStatus.State.Terminated != nil {
2178 				status = &oldStatus
2179 			} else {
2180 				// Apply some values from the old statuses as the default values.
2181 				status.RestartCount = oldStatus.RestartCount
2182 				status.LastTerminationState = oldStatus.LastTerminationState
2183 			}
2184 		}
2185 		statuses[container.Name] = status
2186 	}
2187 
2188 	for _, container := range containers {
2189 		found := false
2190 		for _, cStatus := range podStatus.ContainerStatuses {
2191 			if container.Name == cStatus.Name {
2192 				found = true
2193 				break
2194 			}
2195 		}
2196 		if found {
2197 			continue
2198 		}
2199 		// If no container is found, then assuming it should be waiting seems plausible, but the status-generation code requires
2200 		// that a previous termination be present. If we're offline long enough or something removed the container, then
2201 		// the previous termination may not be present. This next code block ensures that if the container was previously running
2202 		// and its status has since disappeared, we can infer that it terminated, even if we don't know the exit code.
2203 		// By setting the last termination state we are able to leave the container status as waiting and present more accurate
2204 		// data via the API.
2205 
2206 		oldStatus, ok := oldStatuses[container.Name]
2207 		if !ok {
2208 			continue
2209 		}
2210 		if oldStatus.State.Terminated != nil {
2211 			// if the old container status was terminated, the last termination status is correct
2212 			continue
2213 		}
2214 		if oldStatus.State.Running == nil {
2215 			// if the old container status isn't running, then waiting is an appropriate status and we have nothing to do
2216 			continue
2217 		}
2218 
2219 		// If we're here, we know the container was previously running but doesn't have a terminated status. We will check now to
2220 		// see if it's in a pending state.
2221 		status := statuses[container.Name]
2222 		// If the status we're about to write still indicates the default Waiting state, it would force this pod back into Pending,
2223 		// which isn't accurate: we know the container was previously running.
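		// Illustrative scenario (not upstream text): the kubelet was down long
		// enough for the runtime to garbage-collect an exited container. On
		// restart the container has no runtime status, but the old API status
		// shows Running, so the code below synthesizes a 137
		// ("ContainerStatusUnknown") termination instead of presenting the pod
		// as Pending again.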
2224 		isDefaultWaitingStatus := status.State.Waiting != nil && status.State.Waiting.Reason == ContainerCreating
2225 		if hasInitContainers {
2226 			isDefaultWaitingStatus = status.State.Waiting != nil && status.State.Waiting.Reason == PodInitializing
2227 		}
2228 		if !isDefaultWaitingStatus {
2229 			// the status was written, don't override
2230 			continue
2231 		}
2232 		if status.LastTerminationState.Terminated != nil {
2233 			// if we already have a termination state, nothing to do
2234 			continue
2235 		}
2236 
2237 		// setting this value ensures that we show as stopped here, not as waiting:
2238 		// https://github.com/kubernetes/kubernetes/blob/90c9f7b3e198e82a756a68ffeac978a00d606e55/pkg/kubelet/kubelet_pods.go#L1440-L1445
2239 		// This prevents the pod from becoming pending
2240 		status.LastTerminationState.Terminated = &v1.ContainerStateTerminated{
2241 			Reason:   "ContainerStatusUnknown",
2242 			Message:  "The container could not be located when the pod was deleted. The container used to be Running",
2243 			ExitCode: 137,
2244 		}
2245 
2246 		// If the pod was not deleted, then it's been restarted. Increment restart count.
2247 		if pod.DeletionTimestamp == nil {
2248 			status.RestartCount += 1
2249 		}
2250 
2251 		statuses[container.Name] = status
2252 	}
2253 
2254 	// Copy the slice before sorting it
2255 	containerStatusesCopy := make([]*kubecontainer.Status, len(podStatus.ContainerStatuses))
2256 	copy(containerStatusesCopy, podStatus.ContainerStatuses)
2257 
2258 	// Make the latest container status come first.
2259 	sort.Sort(sort.Reverse(kubecontainer.SortContainerStatusesByCreationTime(containerStatusesCopy)))
2260 	// Set container statuses according to the statuses seen in pod status
2261 	containerSeen := map[string]int{}
2262 	for _, cStatus := range containerStatusesCopy {
2263 		cName := cStatus.Name
2264 		if _, ok := statuses[cName]; !ok {
2265 			// This would also ignore the infra container.
2266 			continue
2267 		}
2268 		if containerSeen[cName] >= 2 {
2269 			continue
2270 		}
2271 		var oldStatusPtr *v1.ContainerStatus
2272 		if oldStatus, ok := oldStatuses[cName]; ok {
2273 			oldStatusPtr = &oldStatus
2274 		}
2275 		status := convertContainerStatus(cStatus, oldStatusPtr)
2276 		if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
2277 			if status.State.Running != nil {
2278 				status.Resources = convertContainerStatusResources(cName, status, cStatus, oldStatuses)
2279 			}
2280 		}
2281 		if containerSeen[cName] == 0 {
2282 			statuses[cName] = status
2283 		} else {
2284 			statuses[cName].LastTerminationState = status.State
2285 		}
2286 		containerSeen[cName] = containerSeen[cName] + 1
2287 	}
2288 
2289 	// Handle the containers that failed to start, which should be in the Waiting state.
2290 	for _, container := range containers {
2291 		if isInitContainer {
2292 			// If the init container is terminated with exit code 0, it won't be restarted.
2293 			// TODO(random-liu): Handle this in a cleaner way.
2294 			s := podStatus.FindContainerStatusByName(container.Name)
2295 			if s != nil && s.State == kubecontainer.ContainerStateExited && s.ExitCode == 0 {
2296 				continue
2297 			}
2298 		}
2299 		// If a container should be restarted in the next syncPod, it is *Waiting*.
2300 		if !kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) {
2301 			continue
2302 		}
2303 		status := statuses[container.Name]
2304 		reason, ok := kl.reasonCache.Get(pod.UID, container.Name)
2305 		if !ok {
2306 			// In fact, we could also apply Waiting state here, but it is less informative,
2307 			// and the container will be restarted soon, so we prefer the original state here.
2308 			// Note that with the current implementation of ShouldContainerBeRestarted the original state here
2309 			// could be:
2310 			// * Waiting: There is no associated historical container and start failure reason record.
2311 			// * Terminated: The container is terminated.
2312 			continue
2313 		}
2314 		if status.State.Terminated != nil {
2315 			status.LastTerminationState = status.State
2316 		}
2317 		status.State = v1.ContainerState{
2318 			Waiting: &v1.ContainerStateWaiting{
2319 				Reason:  reason.Err.Error(),
2320 				Message: reason.Message,
2321 			},
2322 		}
2323 		statuses[container.Name] = status
2324 	}
2325 
2326 	// Sort the container statuses since clients of this interface expect the list
2327 	// of containers in a pod to have a deterministic order.
2328 	if isInitContainer {
2329 		return kubetypes.SortStatusesOfInitContainers(pod, statuses)
2330 	}
2331 	containerStatuses := make([]v1.ContainerStatus, 0, len(statuses))
2332 	for _, status := range statuses {
2333 		containerStatuses = append(containerStatuses, *status)
2334 	}
2335 
2336 	sort.Sort(kubetypes.SortedContainerStatuses(containerStatuses))
2337 	return containerStatuses
2338 }
2339 
2340 // ServeLogs returns logs of the current machine.
2341 func (kl *Kubelet) ServeLogs(w http.ResponseWriter, req *http.Request) {
2342 	// TODO: allowlist logs we are willing to serve
2343 	kl.logServer.ServeHTTP(w, req)
2344 }
2345 
2346 // findContainer finds and returns the container with the given pod ID, full name, and container name.
2347 // It returns nil if not found.
2348 func (kl *Kubelet) findContainer(ctx context.Context, podFullName string, podUID types.UID, containerName string) (*kubecontainer.Container, error) {
2349 	pods, err := kl.containerRuntime.GetPods(ctx, false)
2350 	if err != nil {
2351 		return nil, err
2352 	}
2353 	// Resolve and type convert back again.
2354 	// We need the static pod UID but the kubecontainer API works with types.UID.
2355 	podUID = types.UID(kl.podManager.TranslatePodUID(podUID))
2356 	pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID)
2357 	return pod.FindContainerByName(containerName), nil
2358 }
2359 
2360 // RunInContainer runs a command in a container and returns the combined stdout and stderr as a byte slice.
2361 func (kl *Kubelet) RunInContainer(ctx context.Context, podFullName string, podUID types.UID, containerName string, cmd []string) ([]byte, error) {
2362 	container, err := kl.findContainer(ctx, podFullName, podUID, containerName)
2363 	if err != nil {
2364 		return nil, err
2365 	}
2366 	if container == nil {
2367 		return nil, fmt.Errorf("container not found (%q)", containerName)
2368 	}
2369 	// TODO(tallclair): Pass a proper timeout value.
2370 	return kl.runner.RunInContainer(ctx, container.ID, cmd, 0)
2371 }
2372 
2373 // GetExec gets the URL the exec will be served from, or nil if the Kubelet will serve it.
2374 func (kl *Kubelet) GetExec(ctx context.Context, podFullName string, podUID types.UID, containerName string, cmd []string, streamOpts remotecommandserver.Options) (*url.URL, error) {
2375 	container, err := kl.findContainer(ctx, podFullName, podUID, containerName)
2376 	if err != nil {
2377 		return nil, err
2378 	}
2379 	if container == nil {
2380 		return nil, fmt.Errorf("container not found (%q)", containerName)
2381 	}
2382 	return kl.streamingRuntime.GetExec(ctx, container.ID, cmd, streamOpts.Stdin, streamOpts.Stdout, streamOpts.Stderr, streamOpts.TTY)
2383 }
2384 
2385 // GetAttach gets the URL the attach will be served from, or nil if the Kubelet will serve it.
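// (Illustrative note, not upstream text: with a CRI runtime that runs its own
// streaming server, the returned URL points at the runtime's streaming endpoint
// and the API server proxies the attach stream directly to it; when the Kubelet
// serves the stream itself, the returned URL is nil.)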
2386 func (kl *Kubelet) GetAttach(ctx context.Context, podFullName string, podUID types.UID, containerName string, streamOpts remotecommandserver.Options) (*url.URL, error) {
2387 	container, err := kl.findContainer(ctx, podFullName, podUID, containerName)
2388 	if err != nil {
2389 		return nil, err
2390 	}
2391 	if container == nil {
2392 		return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName)
2393 	}
2394 
2395 	// The TTY setting for attach must match the TTY setting in the initial container configuration,
2396 	// since whether the process is running in a TTY cannot be changed after it has started. We
2397 	// need the api.Pod to get the TTY status.
2398 	pod, found := kl.GetPodByFullName(podFullName)
2399 	if !found || (string(podUID) != "" && pod.UID != podUID) {
2400 		return nil, fmt.Errorf("pod %s not found", podFullName)
2401 	}
2402 	containerSpec := kubecontainer.GetContainerSpec(pod, containerName)
2403 	if containerSpec == nil {
2404 		return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName)
2405 	}
2406 	tty := containerSpec.TTY
2407 
2408 	return kl.streamingRuntime.GetAttach(ctx, container.ID, streamOpts.Stdin, streamOpts.Stdout, streamOpts.Stderr, tty)
2409 }
2410 
2411 // GetPortForward gets the URL the port-forward will be served from, or nil if the Kubelet will serve it.
2412 func (kl *Kubelet) GetPortForward(ctx context.Context, podName, podNamespace string, podUID types.UID, portForwardOpts portforward.V4Options) (*url.URL, error) {
2413 	pods, err := kl.containerRuntime.GetPods(ctx, false)
2414 	if err != nil {
2415 		return nil, err
2416 	}
2417 	// Resolve and type convert back again.
2418 	// We need the static pod UID but the kubecontainer API works with types.UID.
2419 	podUID = types.UID(kl.podManager.TranslatePodUID(podUID))
2420 	podFullName := kubecontainer.BuildPodFullName(podName, podNamespace)
2421 	pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID)
2422 	if pod.IsEmpty() {
2423 		return nil, fmt.Errorf("pod not found (%q)", podFullName)
2424 	}
2425 
2426 	return kl.streamingRuntime.GetPortForward(ctx, podName, podNamespace, podUID, portForwardOpts.Ports)
2427 }
2428 
2429 // cleanupOrphanedPodCgroups removes cgroups that should no longer exist.
2430 // It reconciles the cached state of cgroupPods with the specified set of possibly running pods.
2431 func (kl *Kubelet) cleanupOrphanedPodCgroups(pcm cm.PodContainerManager, cgroupPods map[types.UID]cm.CgroupName, possiblyRunningPods map[types.UID]sets.Empty) {
2432 	// Iterate over all the found pods to verify whether they should be running
2433 	for uid, val := range cgroupPods {
2434 		// if the pod is in the running set, it's not a candidate for cleanup
2435 		if _, ok := possiblyRunningPods[uid]; ok {
2436 			continue
2437 		}
2438 
2439 		// If volumes have not been unmounted/detached, do not delete the cgroup,
2440 		// so any memory-backed volumes don't have their charges propagated to the
2441 		// parent cgroup. If the volumes still exist, reduce the cpu shares for any
2442 		// process in the cgroup to the minimum value while we wait.
2443 		if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist {
2444 			klog.V(3).InfoS("Orphaned pod found, but volumes not yet removed. Reducing cpu to minimum", "podUID", uid)
2445 			if err := pcm.ReduceCPULimits(val); err != nil {
2446 				klog.InfoS("Failed to reduce cpu time for pod pending volume cleanup", "podUID", uid, "err", err)
2447 			}
2448 			continue
2449 		}
2450 		klog.V(3).InfoS("Orphaned pod found, removing pod cgroups", "podUID", uid)
2451 		// Destroy all cgroups of the pod that should not be running,
2452 		// by first killing all the processes attached to these cgroups.
2453 		// We ignore errors returned by the method, as the housekeeping loop will
2454 		// try to delete these unwanted pod cgroups again.
2455 		go pcm.Destroy(val)
2456 	}
2457 }
2458 
2459 func (kl *Kubelet) runtimeClassSupportsRecursiveReadOnlyMounts(pod *v1.Pod) bool {
2460 	if kl.runtimeClassManager == nil {
2461 		return false
2462 	}
2463 	runtimeHandlerName, err := kl.runtimeClassManager.LookupRuntimeHandler(pod.Spec.RuntimeClassName)
2464 	if err != nil {
2465 		klog.ErrorS(err, "failed to look up the runtime handler", "runtimeClassName", pod.Spec.RuntimeClassName)
2466 		return false
2467 	}
2468 	runtimeHandlers := kl.runtimeState.runtimeHandlers()
2469 	return runtimeHandlerSupportsRecursiveReadOnlyMounts(runtimeHandlerName, runtimeHandlers)
2470 }
2471 
2472 // runtimeHandlerSupportsRecursiveReadOnlyMounts checks whether the runtime handler supports recursive read-only mounts.
2473 // The kubelet feature gate is not checked here.
2474 func runtimeHandlerSupportsRecursiveReadOnlyMounts(runtimeHandlerName string, runtimeHandlers []kubecontainer.RuntimeHandler) bool {
2475 	if len(runtimeHandlers) == 0 {
2476 		// The runtime does not support returning the handler list.
2477 		// No need to print a warning here.
2478 		return false
2479 	}
2480 	for _, h := range runtimeHandlers {
2481 		if h.Name == runtimeHandlerName {
2482 			return h.SupportsRecursiveReadOnlyMounts
2483 		}
2484 	}
2485 	klog.ErrorS(nil, "Unknown runtime handler", "runtimeHandlerName", runtimeHandlerName)
2486 	return false
2487 }
2488 
2489 // resolveRecursiveReadOnly resolves the recursive read-only mount mode.
2490 func resolveRecursiveReadOnly(m v1.VolumeMount, runtimeSupportsRRO bool) (bool, error) {
2491 	if m.RecursiveReadOnly == nil || *m.RecursiveReadOnly == v1.RecursiveReadOnlyDisabled {
2492 		return false, nil
2493 	}
2494 	if !m.ReadOnly {
2495 		return false, fmt.Errorf("volume %q requested recursive read-only mode, but it is not read-only", m.Name)
2496 	}
2497 	if m.MountPropagation != nil && *m.MountPropagation != v1.MountPropagationNone {
2498 		return false, fmt.Errorf("volume %q requested recursive read-only mode, but it is not compatible with propagation %q",
2499 			m.Name, *m.MountPropagation)
2500 	}
2501 	switch rroMode := *m.RecursiveReadOnly; rroMode {
2502 	case v1.RecursiveReadOnlyIfPossible:
2503 		return runtimeSupportsRRO, nil
2504 	case v1.RecursiveReadOnlyEnabled:
2505 		if !runtimeSupportsRRO {
2506 			return false, fmt.Errorf("volume %q requested recursive read-only mode, but it is not supported by the runtime", m.Name)
2507 		}
2508 		return true, nil
2509 	default:
2510 		return false, fmt.Errorf("unknown recursive read-only mode %q", rroMode)
2511 	}
2512 }
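// The sketch below is illustrative and not part of upstream kubelet; it shows
// how a runtime handler's advertised capability feeds resolveRecursiveReadOnly.
// The handler name "runc" and the function name exampleResolveRRO are
// hypothetical.
//
//	func exampleResolveRRO() {
//		rro := v1.RecursiveReadOnlyEnabled
//		m := v1.VolumeMount{Name: "data", MountPath: "/data", ReadOnly: true, RecursiveReadOnly: &rro}
//		handlers := []kubecontainer.RuntimeHandler{
//			{Name: "runc", SupportsRecursiveReadOnlyMounts: true},
//		}
//		supported := runtimeHandlerSupportsRecursiveReadOnlyMounts("runc", handlers)
//		if ok, err := resolveRecursiveReadOnly(m, supported); err == nil && ok {
//			fmt.Println("mount will be applied recursively read-only")
//		}
//	}
//
// With SupportsRecursiveReadOnlyMounts set to false instead, the Enabled mode
// would yield an error, while IfPossible would quietly fall back to a plain
// read-only mount.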