k8s.io/kubernetes@v1.29.3/pkg/kubelet/kubelet_pods.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kubelet

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"runtime"
	"sort"
	"strings"

	"github.com/google/go-cmp/cmp"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	utilvalidation "k8s.io/apimachinery/pkg/util/validation"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	"k8s.io/kubelet/pkg/cri/streaming/portforward"
	remotecommandserver "k8s.io/kubelet/pkg/cri/streaming/remotecommand"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/api/v1/resource"
	podshelper "k8s.io/kubernetes/pkg/apis/core/pods"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/fieldpath"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/envvars"
	"k8s.io/kubernetes/pkg/kubelet/images"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/kubelet/status"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/kubernetes/pkg/kubelet/util"
	utilpod "k8s.io/kubernetes/pkg/util/pod"
	volumeutil "k8s.io/kubernetes/pkg/volume/util"
	"k8s.io/kubernetes/pkg/volume/util/hostutil"
	"k8s.io/kubernetes/pkg/volume/util/subpath"
	"k8s.io/kubernetes/pkg/volume/util/volumepathhandler"
	volumevalidation "k8s.io/kubernetes/pkg/volume/validation"
	"k8s.io/kubernetes/third_party/forked/golang/expansion"
	utilnet "k8s.io/utils/net"
)

const (
	managedHostsHeader                = "# Kubernetes-managed hosts file.\n"
	managedHostsHeaderWithHostNetwork = "# Kubernetes-managed hosts file (host network).\n"
)

// Container state reason list
const (
	PodInitializing   = "PodInitializing"
	ContainerCreating = "ContainerCreating"
)

// Get a list of pods that have data directories.
func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) {
	podInfos, err := os.ReadDir(kl.getPodsDir())
	if err != nil {
		return nil, err
	}
	pods := []types.UID{}
	for i := range podInfos {
		if podInfos[i].IsDir() {
			pods = append(pods, types.UID(podInfos[i].Name()))
		}
	}
	return pods, nil
}

// GetActivePods returns pods that have been admitted to the kubelet that
// are not fully terminated. This is mapped to the "desired state" of the
// kubelet - what pods should be running.
//
// WARNING: Currently this list does not include pods that have been force
// deleted but may still be terminating, which means resources assigned to
// those pods during admission may still be in use. See
// https://github.com/kubernetes/kubernetes/issues/104824
func (kl *Kubelet) GetActivePods() []*v1.Pod {
	allPods := kl.podManager.GetPods()
	activePods := kl.filterOutInactivePods(allPods)
	return activePods
}

// makeBlockVolumes maps the raw block devices specified in the path of the container
// Experimental
func (kl *Kubelet) makeBlockVolumes(pod *v1.Pod, container *v1.Container, podVolumes kubecontainer.VolumeMap, blkutil volumepathhandler.BlockVolumePathHandler) ([]kubecontainer.DeviceInfo, error) {
	var devices []kubecontainer.DeviceInfo
	for _, device := range container.VolumeDevices {
		// check path is absolute
		if !filepath.IsAbs(device.DevicePath) {
			return nil, fmt.Errorf("error DevicePath `%s` must be an absolute path", device.DevicePath)
		}
		vol, ok := podVolumes[device.Name]
		if !ok || vol.BlockVolumeMapper == nil {
			klog.ErrorS(nil, "Block volume cannot be satisfied for container, because the volume is missing or the volume mapper is nil", "containerName", container.Name, "device", device)
			return nil, fmt.Errorf("cannot find volume %q to pass into container %q", device.Name, container.Name)
		}
		// Get a symbolic link associated to a block device under pod device path
		dirPath, volName := vol.BlockVolumeMapper.GetPodDeviceMapPath()
		symlinkPath := filepath.Join(dirPath, volName)
		if islinkExist, checkErr := blkutil.IsSymlinkExist(symlinkPath); checkErr != nil {
			return nil, checkErr
		} else if islinkExist {
			// Check readOnly in PVCVolumeSource and set read only permission if it's true.
			permission := "mrw"
			if vol.ReadOnly {
				permission = "r"
			}
			klog.V(4).InfoS("Device will be attached to container in the corresponding path on host", "containerName", container.Name, "path", symlinkPath)
			devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: symlinkPath, PathInContainer: device.DevicePath, Permissions: permission})
		}
	}

	return devices, nil
}

// shouldMountHostsFile checks if the node's /etc/hosts should be mounted.
// Kubernetes only mounts on /etc/hosts if:
// - container is not an infrastructure (pause) container
// - container is not already mounting on /etc/hosts
// Kubernetes will not mount /etc/hosts if:
// - when the Pod sandbox is being created, its IP is still unknown. Hence, PodIP will not have been set.
// - Windows pod contains a hostProcess container
func shouldMountHostsFile(pod *v1.Pod, podIPs []string) bool {
	shouldMount := len(podIPs) > 0
	if runtime.GOOS == "windows" {
		return shouldMount && !kubecontainer.HasWindowsHostProcessContainer(pod)
	}
	return shouldMount
}

// makeMounts determines the mount points for the given container.
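// It returns the container's mounts, an optional cleanup func that the caller
// must invoke once the mounts are no longer needed (it tears down any subPath
// bind mounts prepared below), and an error.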
func makeMounts(pod *v1.Pod, podDir string, container *v1.Container, hostName, hostDomain string, podIPs []string, podVolumes kubecontainer.VolumeMap, hu hostutil.HostUtils, subpather subpath.Interface, expandEnvs []kubecontainer.EnvVar) ([]kubecontainer.Mount, func(), error) {
	mountEtcHostsFile := shouldMountHostsFile(pod, podIPs)
	klog.V(3).InfoS("Creating hosts mount for container", "pod", klog.KObj(pod), "containerName", container.Name, "podIPs", podIPs, "path", mountEtcHostsFile)
	mounts := []kubecontainer.Mount{}
	var cleanupAction func()
	for i, mount := range container.VolumeMounts {
		// do not mount /etc/hosts if container is already mounting on the path
		mountEtcHostsFile = mountEtcHostsFile && (mount.MountPath != etcHostsPath)
		vol, ok := podVolumes[mount.Name]
		if !ok || vol.Mounter == nil {
			klog.ErrorS(nil, "Mount cannot be satisfied for the container, because the volume is missing or the volume mounter (vol.Mounter) is nil",
				"containerName", container.Name, "ok", ok, "volumeMounter", mount)
			return nil, cleanupAction, fmt.Errorf("cannot find volume %q to mount into container %q", mount.Name, container.Name)
		}

		relabelVolume := false
		// If the volume supports SELinux and it has not been
		// relabeled already and it is not a read-only volume,
		// relabel it and mark it as labeled
		if vol.Mounter.GetAttributes().Managed && vol.Mounter.GetAttributes().SELinuxRelabel && !vol.SELinuxLabeled {
			vol.SELinuxLabeled = true
			relabelVolume = true
		}
		hostPath, err := volumeutil.GetPath(vol.Mounter)
		if err != nil {
			return nil, cleanupAction, err
		}

		subPath := mount.SubPath
		if mount.SubPathExpr != "" {
			subPath, err = kubecontainer.ExpandContainerVolumeMounts(mount, expandEnvs)

			if err != nil {
				return nil, cleanupAction, err
			}
		}

		if subPath != "" {
			if filepath.IsAbs(subPath) {
				return nil, cleanupAction, fmt.Errorf("error SubPath `%s` must not be an absolute path", subPath)
			}

			err = volumevalidation.ValidatePathNoBacksteps(subPath)
			if err != nil {
				return nil, cleanupAction, fmt.Errorf("unable to provision SubPath `%s`: %v", subPath, err)
			}

			volumePath := hostPath
			hostPath = filepath.Join(volumePath, subPath)

			if subPathExists, err := hu.PathExists(hostPath); err != nil {
				klog.ErrorS(nil, "Could not determine if subPath exists, will not attempt to change its permissions", "path", hostPath)
			} else if !subPathExists {
				// Create the sub path now because if it's auto-created later when referenced, it may have an
				// incorrect ownership and mode. For example, the sub path directory must have at least g+rwx
				// when the pod specifies an fsGroup, and if the directory is not created here, Docker will
				// later auto-create it with the incorrect mode 0750.
				// Take extra care not to escape the volume!
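				// Read the volume root's mode first so that the subPath
				// directories created below inherit the same permissions.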
				perm, err := hu.GetMode(volumePath)
				if err != nil {
					return nil, cleanupAction, err
				}
				if err := subpather.SafeMakeDir(subPath, volumePath, perm); err != nil {
					// Don't pass detailed error back to the user because it could give information about host filesystem
					klog.ErrorS(err, "Failed to create subPath directory for volumeMount of the container", "containerName", container.Name, "volumeMountName", mount.Name)
					return nil, cleanupAction, fmt.Errorf("failed to create subPath directory for volumeMount %q of container %q", mount.Name, container.Name)
				}
			}
			hostPath, cleanupAction, err = subpather.PrepareSafeSubpath(subpath.Subpath{
				VolumeMountIndex: i,
				Path:             hostPath,
				VolumeName:       vol.InnerVolumeSpecName,
				VolumePath:       volumePath,
				PodDir:           podDir,
				ContainerName:    container.Name,
			})
			if err != nil {
				// Don't pass detailed error back to the user because it could give information about host filesystem
				klog.ErrorS(err, "Failed to prepare subPath for volumeMount of the container", "containerName", container.Name, "volumeMountName", mount.Name)
				return nil, cleanupAction, fmt.Errorf("failed to prepare subPath for volumeMount %q of container %q", mount.Name, container.Name)
			}
		}

		// Docker Volume Mounts fail on Windows if it is not of the form C:/
		if volumeutil.IsWindowsLocalPath(runtime.GOOS, hostPath) {
			hostPath = volumeutil.MakeAbsolutePath(runtime.GOOS, hostPath)
		}

		containerPath := mount.MountPath
		// IsAbs returns false for UNC path/SMB shares/named pipes in Windows. So check for those specifically and skip MakeAbsolutePath
		if !volumeutil.IsWindowsUNCPath(runtime.GOOS, containerPath) && !filepath.IsAbs(containerPath) {
			containerPath = volumeutil.MakeAbsolutePath(runtime.GOOS, containerPath)
		}

		propagation, err := translateMountPropagation(mount.MountPropagation)
		if err != nil {
			return nil, cleanupAction, err
		}
		klog.V(5).InfoS("Mount has propagation", "pod", klog.KObj(pod), "containerName", container.Name, "volumeMountName", mount.Name, "propagation", propagation)
		mustMountRO := vol.Mounter.GetAttributes().ReadOnly

		mounts = append(mounts, kubecontainer.Mount{
			Name:           mount.Name,
			ContainerPath:  containerPath,
			HostPath:       hostPath,
			ReadOnly:       mount.ReadOnly || mustMountRO,
			SELinuxRelabel: relabelVolume,
			Propagation:    propagation,
		})
	}
	if mountEtcHostsFile {
		hostAliases := pod.Spec.HostAliases
		hostsMount, err := makeHostsMount(podDir, podIPs, hostName, hostDomain, hostAliases, pod.Spec.HostNetwork)
		if err != nil {
			return nil, cleanupAction, err
		}
		mounts = append(mounts, *hostsMount)
	}
	return mounts, cleanupAction, nil
}

// translateMountPropagation transforms v1.MountPropagationMode to
// runtimeapi.MountPropagation.
func translateMountPropagation(mountMode *v1.MountPropagationMode) (runtimeapi.MountPropagation, error) {
	if runtime.GOOS == "windows" {
		// Windows containers don't support mount propagation, use private for it.
		// Refer https://docs.docker.com/storage/bind-mounts/#configure-bind-propagation.
		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
	}

	switch {
	case mountMode == nil:
		// PRIVATE is the default
		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
	case *mountMode == v1.MountPropagationHostToContainer:
		return runtimeapi.MountPropagation_PROPAGATION_HOST_TO_CONTAINER, nil
	case *mountMode == v1.MountPropagationBidirectional:
		return runtimeapi.MountPropagation_PROPAGATION_BIDIRECTIONAL, nil
	case *mountMode == v1.MountPropagationNone:
		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
	default:
		return 0, fmt.Errorf("invalid MountPropagation mode: %q", *mountMode)
	}
}

// getEtcHostsPath returns the full host-side path to a pod's generated /etc/hosts file
func getEtcHostsPath(podDir string) string {
	hostsFilePath := filepath.Join(podDir, "etc-hosts")
	// Volume Mounts fail on Windows if it is not of the form C:/
	return volumeutil.MakeAbsolutePath(runtime.GOOS, hostsFilePath)
}

// makeHostsMount makes the mountpoint for the hosts file that the containers
// in a pod are injected with. podIPs is provided instead of podIP as podIPs
// are present even if the dual-stack feature flag is not enabled.
func makeHostsMount(podDir string, podIPs []string, hostName, hostDomainName string, hostAliases []v1.HostAlias, useHostNetwork bool) (*kubecontainer.Mount, error) {
	hostsFilePath := getEtcHostsPath(podDir)
	if err := ensureHostsFile(hostsFilePath, podIPs, hostName, hostDomainName, hostAliases, useHostNetwork); err != nil {
		return nil, err
	}
	return &kubecontainer.Mount{
		Name:           "k8s-managed-etc-hosts",
		ContainerPath:  etcHostsPath,
		HostPath:       hostsFilePath,
		ReadOnly:       false,
		SELinuxRelabel: true,
	}, nil
}

// ensureHostsFile ensures that the given host file has an up-to-date ip, host
// name, and domain name.
func ensureHostsFile(fileName string, hostIPs []string, hostName, hostDomainName string, hostAliases []v1.HostAlias, useHostNetwork bool) error {
	var hostsFileContent []byte
	var err error

	if useHostNetwork {
		// if Pod is using host network, read hosts file from the node's filesystem.
		// `etcHostsPath` references the location of the hosts file on the node.
		// `/etc/hosts` for *nix systems.
		hostsFileContent, err = nodeHostsFileContent(etcHostsPath, hostAliases)
		if err != nil {
			return err
		}
	} else {
		// if Pod is not using host network, create a managed hosts file with Pod IP and other information.
		hostsFileContent = managedHostsFileContent(hostIPs, hostName, hostDomainName, hostAliases)
	}

	hostsFilePerm := os.FileMode(0644)
	if err := os.WriteFile(fileName, hostsFileContent, hostsFilePerm); err != nil {
		return err
	}
	return os.Chmod(fileName, hostsFilePerm)
}

// nodeHostsFileContent reads the content of node's hosts file.
func nodeHostsFileContent(hostsFilePath string, hostAliases []v1.HostAlias) ([]byte, error) {
	hostsFileContent, err := os.ReadFile(hostsFilePath)
	if err != nil {
		return nil, err
	}
	var buffer bytes.Buffer
	buffer.WriteString(managedHostsHeaderWithHostNetwork)
	buffer.Write(hostsFileContent)
	buffer.Write(hostsEntriesFromHostAliases(hostAliases))
	return buffer.Bytes(), nil
}

// managedHostsFileContent generates the content of the managed etc hosts based on Pod IPs and other
// information.
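//
// For illustration only (the IP, hostname, and domain here are hypothetical),
// a pod with IP 10.0.0.5, hostname "web-0", and domain
// "nginx.default.svc.cluster.local" would produce:
//
//	# Kubernetes-managed hosts file.
//	127.0.0.1	localhost
//	::1	localhost ip6-localhost ip6-loopback
//	fe00::0	ip6-localnet
//	fe00::0	ip6-mcastprefix
//	fe00::1	ip6-allnodes
//	fe00::2	ip6-allrouters
//	10.0.0.5	web-0.nginx.default.svc.cluster.local	web-0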
func managedHostsFileContent(hostIPs []string, hostName, hostDomainName string, hostAliases []v1.HostAlias) []byte {
	var buffer bytes.Buffer
	buffer.WriteString(managedHostsHeader)
	buffer.WriteString("127.0.0.1\tlocalhost\n")                      // ipv4 localhost
	buffer.WriteString("::1\tlocalhost ip6-localhost ip6-loopback\n") // ipv6 localhost
	buffer.WriteString("fe00::0\tip6-localnet\n")
	buffer.WriteString("fe00::0\tip6-mcastprefix\n")
	buffer.WriteString("fe00::1\tip6-allnodes\n")
	buffer.WriteString("fe00::2\tip6-allrouters\n")
	if len(hostDomainName) > 0 {
		// host entry generated for all IPs in podIPs
		// podIPs field is populated for clusters even if the
		// dual-stack feature flag is not enabled.
		for _, hostIP := range hostIPs {
			buffer.WriteString(fmt.Sprintf("%s\t%s.%s\t%s\n", hostIP, hostName, hostDomainName, hostName))
		}
	} else {
		for _, hostIP := range hostIPs {
			buffer.WriteString(fmt.Sprintf("%s\t%s\n", hostIP, hostName))
		}
	}
	buffer.Write(hostsEntriesFromHostAliases(hostAliases))
	return buffer.Bytes()
}

func hostsEntriesFromHostAliases(hostAliases []v1.HostAlias) []byte {
	if len(hostAliases) == 0 {
		return []byte{}
	}

	var buffer bytes.Buffer
	buffer.WriteString("\n")
	buffer.WriteString("# Entries added by HostAliases.\n")
	// for each IP, write all aliases onto a single line in the hosts file
	for _, hostAlias := range hostAliases {
		buffer.WriteString(fmt.Sprintf("%s\t%s\n", hostAlias.IP, strings.Join(hostAlias.Hostnames, "\t")))
	}
	return buffer.Bytes()
}

// truncatePodHostnameIfNeeded truncates the pod hostname if it's longer than 63 chars.
func truncatePodHostnameIfNeeded(podName, hostname string) (string, error) {
	// Cap hostname at 63 chars (the specification is 64 bytes, which is 63 chars and the null terminating char).
	const hostnameMaxLen = 63
	if len(hostname) <= hostnameMaxLen {
		return hostname, nil
	}
	truncated := hostname[:hostnameMaxLen]
	klog.ErrorS(nil, "Hostname for pod was too long, truncated it", "podName", podName, "hostnameMaxLen", hostnameMaxLen, "truncatedHostname", truncated)
	// hostname should not end with '-' or '.'
	truncated = strings.TrimRight(truncated, "-.")
	if len(truncated) == 0 {
		// This should never happen.
		return "", fmt.Errorf("hostname for pod %q was invalid: %q", podName, hostname)
	}
	return truncated, nil
}

// GetOrCreateUserNamespaceMappings returns the configuration for the sandbox user namespace
func (kl *Kubelet) GetOrCreateUserNamespaceMappings(pod *v1.Pod) (*runtimeapi.UserNamespace, error) {
	return kl.usernsManager.GetOrCreateUserNamespaceMappings(pod)
}

// GeneratePodHostNameAndDomain creates a hostname and domain name for a pod,
// given that pod's spec and annotations, or returns an error.
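// For example (hypothetical values), a pod named "web-0" with no explicit
// Spec.Hostname, Spec.Subdomain "nginx", namespace "default", and cluster
// domain "cluster.local" yields ("web-0", "nginx.default.svc.cluster.local").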
func (kl *Kubelet) GeneratePodHostNameAndDomain(pod *v1.Pod) (string, string, error) {
	clusterDomain := kl.dnsConfigurer.ClusterDomain

	hostname := pod.Name
	if len(pod.Spec.Hostname) > 0 {
		if msgs := utilvalidation.IsDNS1123Label(pod.Spec.Hostname); len(msgs) != 0 {
			return "", "", fmt.Errorf("pod Hostname %q is not a valid DNS label: %s", pod.Spec.Hostname, strings.Join(msgs, ";"))
		}
		hostname = pod.Spec.Hostname
	}

	hostname, err := truncatePodHostnameIfNeeded(pod.Name, hostname)
	if err != nil {
		return "", "", err
	}

	hostDomain := ""
	if len(pod.Spec.Subdomain) > 0 {
		if msgs := utilvalidation.IsDNS1123Label(pod.Spec.Subdomain); len(msgs) != 0 {
			return "", "", fmt.Errorf("pod Subdomain %q is not a valid DNS label: %s", pod.Spec.Subdomain, strings.Join(msgs, ";"))
		}
		hostDomain = fmt.Sprintf("%s.%s.svc.%s", pod.Spec.Subdomain, pod.Namespace, clusterDomain)
	}

	return hostname, hostDomain, nil
}

// GetPodCgroupParent gets the pod cgroup parent from the container manager.
func (kl *Kubelet) GetPodCgroupParent(pod *v1.Pod) string {
	pcm := kl.containerManager.NewPodContainerManager()
	_, cgroupParent := pcm.GetPodContainerName(pod)
	return cgroupParent
}

// GenerateRunContainerOptions generates the RunContainerOptions, which can be used by
// the container runtime to set parameters for launching a container.
func (kl *Kubelet) GenerateRunContainerOptions(ctx context.Context, pod *v1.Pod, container *v1.Container, podIP string, podIPs []string) (*kubecontainer.RunContainerOptions, func(), error) {
	opts, err := kl.containerManager.GetResources(pod, container)
	if err != nil {
		return nil, nil, err
	}
	// The value of hostname is the short host name and it is sent to makeMounts to create the /etc/hosts file.
	hostname, hostDomainName, err := kl.GeneratePodHostNameAndDomain(pod)
	if err != nil {
		return nil, nil, err
	}
	// nodename will be equal to hostname if SetHostnameAsFQDN is nil or false. If SetHostnameAsFQDN
	// is true and hostDomainName is defined, nodename will be the FQDN (hostname.hostDomainName).
	nodename, err := util.GetNodenameForKernel(hostname, hostDomainName, pod.Spec.SetHostnameAsFQDN)
	if err != nil {
		return nil, nil, err
	}
	opts.Hostname = nodename
	podName := volumeutil.GetUniquePodName(pod)
	volumes := kl.volumeManager.GetMountedVolumesForPod(podName)

	blkutil := volumepathhandler.NewBlockVolumePathHandler()
	blkVolumes, err := kl.makeBlockVolumes(pod, container, volumes, blkutil)
	if err != nil {
		return nil, nil, err
	}
	opts.Devices = append(opts.Devices, blkVolumes...)

	envs, err := kl.makeEnvironmentVariables(pod, container, podIP, podIPs)
	if err != nil {
		return nil, nil, err
	}
	opts.Envs = append(opts.Envs, envs...)

	// only podIPs is sent to makeMounts, as podIPs is populated even if the dual-stack feature flag is not enabled.
	mounts, cleanupAction, err := makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIPs, volumes, kl.hostutil, kl.subpather, opts.Envs)
	if err != nil {
		return nil, cleanupAction, err
	}
	opts.Mounts = append(opts.Mounts, mounts...)

	// adding TerminationMessagePath on Windows is only allowed if ContainerD is used. Individual files cannot
	// be mounted as volumes using Docker for Windows.
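	// Pre-create the per-container directory that backs TerminationMessagePath;
	// on failure we only log and leave PodContainerDir unset rather than
	// failing container start.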
	if len(container.TerminationMessagePath) != 0 {
		p := kl.getPodContainerDir(pod.UID, container.Name)
		if err := os.MkdirAll(p, 0750); err != nil {
			klog.ErrorS(err, "Error on creating dir", "path", p)
		} else {
			opts.PodContainerDir = p
		}
	}

	return opts, cleanupAction, nil
}

var masterServices = sets.NewString("kubernetes")

// getServiceEnvVarMap makes a map[string]string of env vars for services a
// pod in namespace ns should see.
func (kl *Kubelet) getServiceEnvVarMap(ns string, enableServiceLinks bool) (map[string]string, error) {
	var (
		serviceMap = make(map[string]*v1.Service)
		m          = make(map[string]string)
	)

	// Get all service resources from the master (via a cache),
	// and populate them into service environment variables.
	if kl.serviceLister == nil {
		// Kubelets without masters (e.g. plain GCE ContainerVM) don't set env vars.
		return m, nil
	}
	services, err := kl.serviceLister.List(labels.Everything())
	if err != nil {
		return m, fmt.Errorf("failed to list services when setting up env vars")
	}

	// project the services in namespace ns onto the master services
	for i := range services {
		service := services[i]
		// ignore services where ClusterIP is "None" or empty
		if !v1helper.IsServiceIPSet(service) {
			continue
		}
		serviceName := service.Name

		// We always want to add environment variables for master services
		// from the default namespace, even if enableServiceLinks is false.
		// We also add environment variables for other services in the same
		// namespace, if enableServiceLinks is true.
		if service.Namespace == metav1.NamespaceDefault && masterServices.Has(serviceName) {
			if _, exists := serviceMap[serviceName]; !exists {
				serviceMap[serviceName] = service
			}
		} else if service.Namespace == ns && enableServiceLinks {
			serviceMap[serviceName] = service
		}
	}

	mappedServices := []*v1.Service{}
	for key := range serviceMap {
		mappedServices = append(mappedServices, serviceMap[key])
	}

	for _, e := range envvars.FromServices(mappedServices) {
		m[e.Name] = e.Value
	}
	return m, nil
}

// Make the environment variables for a pod in the given namespace.
func (kl *Kubelet) makeEnvironmentVariables(pod *v1.Pod, container *v1.Container, podIP string, podIPs []string) ([]kubecontainer.EnvVar, error) {
	if pod.Spec.EnableServiceLinks == nil {
		return nil, fmt.Errorf("nil pod.spec.enableServiceLinks encountered, cannot construct envvars")
	}

	// If the pod originates from the kube-api, then we know that the kube-apiserver is responding and the kubelet's credentials are valid.
	// Knowing this, it is reasonable to wait until the service lister has synchronized at least once before attempting to build
	// a service env var map. This doesn't prevent the race below from happening entirely, but it does prevent the "obvious"
	// failure case of services simply not having completed a list operation that can reasonably be expected to succeed.
	// One common case this prevents is a kubelet restart reading pods before services and some pod not having the
	// KUBERNETES_SERVICE_HOST injected because we didn't wait a short time for services to sync before proceeding.
	// The KUBERNETES_SERVICE_HOST link is special because it is unconditionally injected into pods and is read by the
	// in-cluster-config for pod clients.
	if !kubetypes.IsStaticPod(pod) && !kl.serviceHasSynced() {
		return nil, fmt.Errorf("services have not yet been read at least once, cannot construct envvars")
	}

	var result []kubecontainer.EnvVar
	// Note: These are added to the docker Config, but are not included in the checksum computed
	// by kubecontainer.HashContainer(...). That way, we can still determine whether a
	// v1.Container is already running by its hash. (We don't want to restart a container just
	// because some service changed.)
	//
	// Note that there is a race between Kubelet seeing the pod and kubelet seeing the service.
	// To avoid this users can: (1) wait between starting a service and starting the pods that
	// use it; or (2) detect missing service env var and exit and be restarted; or (3) use DNS
	// instead of env vars and keep trying to resolve the DNS name of the service (recommended).
	serviceEnv, err := kl.getServiceEnvVarMap(pod.Namespace, *pod.Spec.EnableServiceLinks)
	if err != nil {
		return result, err
	}

	var (
		configMaps = make(map[string]*v1.ConfigMap)
		secrets    = make(map[string]*v1.Secret)
		tmpEnv     = make(map[string]string)
	)

	// Env will override EnvFrom variables.
	// Process EnvFrom first then allow Env to replace existing values.
	for _, envFrom := range container.EnvFrom {
		switch {
		case envFrom.ConfigMapRef != nil:
			cm := envFrom.ConfigMapRef
			name := cm.Name
			configMap, ok := configMaps[name]
			if !ok {
				if kl.kubeClient == nil {
					return result, fmt.Errorf("couldn't get configMap %v/%v, no kubeClient defined", pod.Namespace, name)
				}
				optional := cm.Optional != nil && *cm.Optional
				configMap, err = kl.configMapManager.GetConfigMap(pod.Namespace, name)
				if err != nil {
					if errors.IsNotFound(err) && optional {
						// ignore error when marked optional
						continue
					}
					return result, err
				}
				configMaps[name] = configMap
			}

			invalidKeys := []string{}
			for k, v := range configMap.Data {
				if len(envFrom.Prefix) > 0 {
					k = envFrom.Prefix + k
				}
				if errMsgs := utilvalidation.IsEnvVarName(k); len(errMsgs) != 0 {
					invalidKeys = append(invalidKeys, k)
					continue
				}
				tmpEnv[k] = v
			}
			if len(invalidKeys) > 0 {
				sort.Strings(invalidKeys)
				kl.recorder.Eventf(pod, v1.EventTypeWarning, "InvalidEnvironmentVariableNames", "Keys [%s] from the EnvFrom configMap %s/%s were skipped since they are considered invalid environment variable names.", strings.Join(invalidKeys, ", "), pod.Namespace, name)
			}
		case envFrom.SecretRef != nil:
			s := envFrom.SecretRef
			name := s.Name
			secret, ok := secrets[name]
			if !ok {
				if kl.kubeClient == nil {
					return result, fmt.Errorf("couldn't get secret %v/%v, no kubeClient defined", pod.Namespace, name)
				}
				optional := s.Optional != nil && *s.Optional
				secret, err = kl.secretManager.GetSecret(pod.Namespace, name)
				if err != nil {
					if errors.IsNotFound(err) && optional {
						// ignore error when marked optional
						continue
					}
					return result, err
				}
				secrets[name] = secret
			}

			invalidKeys := []string{}
			for k, v := range secret.Data {
				if len(envFrom.Prefix) > 0 {
					k = envFrom.Prefix + k
				}
				if errMsgs := utilvalidation.IsEnvVarName(k); len(errMsgs) != 0 {
					invalidKeys = append(invalidKeys, k)
					continue
				}
				tmpEnv[k] = string(v)
			}
			if len(invalidKeys) > 0 {
				sort.Strings(invalidKeys)
				kl.recorder.Eventf(pod, v1.EventTypeWarning, "InvalidEnvironmentVariableNames", "Keys [%s] from the EnvFrom secret %s/%s were skipped since they are considered invalid environment variable names.", strings.Join(invalidKeys, ", "), pod.Namespace, name)
			}
		}
	}

	// Determine the final values of variables:
	//
	// 1. Determine the final value of each variable:
	//    a. If the variable's Value is set, expand the `$(var)` references to other
	//       variables in the .Value field; the sources of variables are the declared
	//       variables of the container and the service environment variables
	//    b. If a source is defined for an environment variable, resolve the source
	// 2. Create the container's environment in the order variables are declared
	// 3. Add remaining service environment vars
	var (
		mappingFunc = expansion.MappingFuncFor(tmpEnv, serviceEnv)
	)
	for _, envVar := range container.Env {
		runtimeVal := envVar.Value
		if runtimeVal != "" {
			// Step 1a: expand variable references
			runtimeVal = expansion.Expand(runtimeVal, mappingFunc)
		} else if envVar.ValueFrom != nil {
			// Step 1b: resolve alternate env var sources
			switch {
			case envVar.ValueFrom.FieldRef != nil:
				runtimeVal, err = kl.podFieldSelectorRuntimeValue(envVar.ValueFrom.FieldRef, pod, podIP, podIPs)
				if err != nil {
					return result, err
				}
			case envVar.ValueFrom.ResourceFieldRef != nil:
				defaultedPod, defaultedContainer, err := kl.defaultPodLimitsForDownwardAPI(pod, container)
				if err != nil {
					return result, err
				}
				runtimeVal, err = containerResourceRuntimeValue(envVar.ValueFrom.ResourceFieldRef, defaultedPod, defaultedContainer)
				if err != nil {
					return result, err
				}
			case envVar.ValueFrom.ConfigMapKeyRef != nil:
				cm := envVar.ValueFrom.ConfigMapKeyRef
				name := cm.Name
				key := cm.Key
				optional := cm.Optional != nil && *cm.Optional
				configMap, ok := configMaps[name]
				if !ok {
					if kl.kubeClient == nil {
						return result, fmt.Errorf("couldn't get configMap %v/%v, no kubeClient defined", pod.Namespace, name)
					}
					configMap, err = kl.configMapManager.GetConfigMap(pod.Namespace, name)
					if err != nil {
						if errors.IsNotFound(err) && optional {
							// ignore error when marked optional
							continue
						}
						return result, err
					}
					configMaps[name] = configMap
				}
				runtimeVal, ok = configMap.Data[key]
				if !ok {
					if optional {
						continue
					}
					return result, fmt.Errorf("couldn't find key %v in ConfigMap %v/%v", key, pod.Namespace, name)
				}
			case envVar.ValueFrom.SecretKeyRef != nil:
				s := envVar.ValueFrom.SecretKeyRef
				name := s.Name
				key := s.Key
				optional := s.Optional != nil && *s.Optional
				secret, ok := secrets[name]
				if !ok {
					if kl.kubeClient == nil {
						return result, fmt.Errorf("couldn't get secret %v/%v, no kubeClient defined", pod.Namespace, name)
					}
					secret, err = kl.secretManager.GetSecret(pod.Namespace, name)
					if err != nil {
						if errors.IsNotFound(err) && optional {
							// ignore error when marked optional
							continue
						}
						return result, err
					}
					secrets[name] = secret
				}
				runtimeValBytes, ok := secret.Data[key]
				if !ok {
					if optional {
						continue
					}
					return result, fmt.Errorf("couldn't find key %v in Secret %v/%v", key, pod.Namespace, name)
				}
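				// Secret values are stored as raw bytes; convert to a string
				// for use as an environment variable value.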
				runtimeVal = string(runtimeValBytes)
			}
		}

		tmpEnv[envVar.Name] = runtimeVal
	}

	// Append the env vars
	for k, v := range tmpEnv {
		result = append(result, kubecontainer.EnvVar{Name: k, Value: v})
	}

	// Append remaining service env vars.
	for k, v := range serviceEnv {
		// Accesses apiserver+Pods.
		// So, the master may set service env vars, or kubelet may. In case both are doing
		// it, we skip the key from the kubelet-generated ones so we don't have duplicate
		// env vars.
		// TODO: remove this next line once all platforms use apiserver+Pods.
		if _, present := tmpEnv[k]; !present {
			result = append(result, kubecontainer.EnvVar{Name: k, Value: v})
		}
	}
	return result, nil
}

// podFieldSelectorRuntimeValue returns the runtime value of the given
// selector for a pod.
func (kl *Kubelet) podFieldSelectorRuntimeValue(fs *v1.ObjectFieldSelector, pod *v1.Pod, podIP string, podIPs []string) (string, error) {
	internalFieldPath, _, err := podshelper.ConvertDownwardAPIFieldLabel(fs.APIVersion, fs.FieldPath, "")
	if err != nil {
		return "", err
	}

	// make podIPs order match node IP family preference #97979
	podIPs = kl.sortPodIPs(podIPs)
	if len(podIPs) > 0 {
		podIP = podIPs[0]
	}

	switch internalFieldPath {
	case "spec.nodeName":
		return pod.Spec.NodeName, nil
	case "spec.serviceAccountName":
		return pod.Spec.ServiceAccountName, nil
	case "status.hostIP":
		hostIPs, err := kl.getHostIPsAnyWay()
		if err != nil {
			return "", err
		}
		return hostIPs[0].String(), nil
	case "status.hostIPs":
		if !utilfeature.DefaultFeatureGate.Enabled(features.PodHostIPs) {
			return "", nil
		}
		hostIPs, err := kl.getHostIPsAnyWay()
		if err != nil {
			return "", err
		}
		ips := make([]string, 0, len(hostIPs))
		for _, ip := range hostIPs {
			ips = append(ips, ip.String())
		}
		return strings.Join(ips, ","), nil
	case "status.podIP":
		return podIP, nil
	case "status.podIPs":
		return strings.Join(podIPs, ","), nil
	}
	return fieldpath.ExtractFieldPathAsString(pod, internalFieldPath)
}

// containerResourceRuntimeValue returns the value of the provided container resource
func containerResourceRuntimeValue(fs *v1.ResourceFieldSelector, pod *v1.Pod, container *v1.Container) (string, error) {
	containerName := fs.ContainerName
	if len(containerName) == 0 {
		return resource.ExtractContainerResourceValue(fs, container)
	}
	return resource.ExtractResourceValueByContainerName(fs, pod, containerName)
}

// killPod instructs the container runtime to kill the pod. This method requires that
// the pod status contains the result of the last syncPod, otherwise it may fail to
// terminate newly created containers and sandboxes.
func (kl *Kubelet) killPod(ctx context.Context, pod *v1.Pod, p kubecontainer.Pod, gracePeriodOverride *int64) error {
	// Call the container runtime KillPod method which stops all known running containers of the pod
	if err := kl.containerRuntime.KillPod(ctx, pod, p, gracePeriodOverride); err != nil {
		return err
	}
	if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
		klog.V(2).InfoS("Failed to update QoS cgroups while killing pod", "err", err)
	}
	return nil
}

// makePodDataDirs creates the data directories for the pod.
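// Specifically, it ensures that the pod directory, the pod volumes directory,
// and the pod plugins directory all exist (each created with mode 0750).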
func (kl *Kubelet) makePodDataDirs(pod *v1.Pod) error {
	uid := pod.UID
	if err := os.MkdirAll(kl.getPodDir(uid), 0750); err != nil && !os.IsExist(err) {
		return err
	}
	if err := os.MkdirAll(kl.getPodVolumesDir(uid), 0750); err != nil && !os.IsExist(err) {
		return err
	}
	if err := os.MkdirAll(kl.getPodPluginsDir(uid), 0750); err != nil && !os.IsExist(err) {
		return err
	}
	return nil
}

// getPullSecretsForPod inspects the Pod and retrieves the referenced pull
// secrets.
func (kl *Kubelet) getPullSecretsForPod(pod *v1.Pod) []v1.Secret {
	pullSecrets := []v1.Secret{}
	failedPullSecrets := []string{}

	for _, secretRef := range pod.Spec.ImagePullSecrets {
		if len(secretRef.Name) == 0 {
			// API validation permitted entries with empty names (https://issue.k8s.io/99454#issuecomment-787838112).
			// Ignore to avoid unnecessary warnings.
			continue
		}
		secret, err := kl.secretManager.GetSecret(pod.Namespace, secretRef.Name)
		if err != nil {
			klog.InfoS("Unable to retrieve pull secret, the image pull may not succeed.", "pod", klog.KObj(pod), "secret", klog.KObj(secret), "err", err)
			failedPullSecrets = append(failedPullSecrets, secretRef.Name)
			continue
		}

		pullSecrets = append(pullSecrets, *secret)
	}

	if len(failedPullSecrets) > 0 {
		kl.recorder.Eventf(pod, v1.EventTypeWarning, "FailedToRetrieveImagePullSecret", "Unable to retrieve some image pull secrets (%s); attempting to pull the image may not succeed.", strings.Join(failedPullSecrets, ", "))
	}

	return pullSecrets
}

// PodCouldHaveRunningContainers returns true if the pod with the given UID could still have running
// containers. This returns false if the pod has not yet been started or the pod is unknown.
func (kl *Kubelet) PodCouldHaveRunningContainers(pod *v1.Pod) bool {
	if kl.podWorkers.CouldHaveRunningContainers(pod.UID) {
		return true
	}

	// Check if pod might need to unprepare resources before termination
	// NOTE: This is a temporary solution. This call is here to avoid changing
	// status manager and its tests.
	// TODO: extend PodDeletionSafetyProvider interface and implement it
	// in a separate Kubelet method.
	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
		if kl.containerManager.PodMightNeedToUnprepareResources(pod.UID) {
			return true
		}
	}
	return false
}

// PodIsFinished returns true if SyncTerminatedPod is finished, i.e.
// all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet.
func (kl *Kubelet) PodIsFinished(pod *v1.Pod) bool {
	return kl.podWorkers.ShouldPodBeFinished(pod.UID)
}

// filterOutInactivePods returns pods that are not in a terminal phase
// or are known to be fully terminated. This method should only be used
// when the set of pods being filtered is upstream of the pod worker, i.e.
// the pods the pod manager is aware of.
func (kl *Kubelet) filterOutInactivePods(pods []*v1.Pod) []*v1.Pod {
	filteredPods := make([]*v1.Pod, 0, len(pods))
	for _, p := range pods {
		// if a pod is fully terminated by UID, it should be excluded from the
		// list of pods
		if kl.podWorkers.IsPodKnownTerminated(p.UID) {
			continue
		}

		// terminal pods are considered inactive UNLESS they are actively terminating
		if kl.isAdmittedPodTerminal(p) && !kl.podWorkers.IsPodTerminationRequested(p.UID) {
			continue
		}

		filteredPods = append(filteredPods, p)
	}
	return filteredPods
}

// isAdmittedPodTerminal returns true if the provided config source pod is in
// a terminal phase, or if the Kubelet has already indicated the pod has reached
// a terminal phase but the config source has not accepted it yet. This method
// should only be used within the pod configuration loops that notify the pod
// worker; other components should treat the pod worker as authoritative.
func (kl *Kubelet) isAdmittedPodTerminal(pod *v1.Pod) bool {
	// pods are considered inactive if the config source has observed a
	// terminal phase (if the Kubelet recorded that the pod reached a terminal
	// phase the pod should never be restarted)
	if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
		return true
	}
	// a pod that has been marked terminal within the Kubelet is considered
	// inactive (may have been rejected by Kubelet admission)
	if status, ok := kl.statusManager.GetPodStatus(pod.UID); ok {
		if status.Phase == v1.PodSucceeded || status.Phase == v1.PodFailed {
			return true
		}
	}
	return false
}

// removeOrphanedPodStatuses removes obsolete entries in podStatus where
// the pod is no longer considered bound to this node.
func (kl *Kubelet) removeOrphanedPodStatuses(pods []*v1.Pod, mirrorPods []*v1.Pod) {
	podUIDs := make(map[types.UID]bool)
	for _, pod := range pods {
		podUIDs[pod.UID] = true
	}
	for _, pod := range mirrorPods {
		podUIDs[pod.UID] = true
	}
	kl.statusManager.RemoveOrphanedStatuses(podUIDs)
}

// HandlePodCleanups performs a series of cleanup work, including terminating
// pod workers, killing unwanted pods, and removing orphaned volumes/pod
// directories. No config changes are sent to pod workers while this method
// is executing which means no new pods can appear. After this method completes
// the desired state of the kubelet should be reconciled with the actual state
// in the pod worker and other pod-related components.
//
// This function is executed by the main sync loop, so it must execute quickly
// and all nested calls should be asynchronous. Any slow reconciliation actions
// should be performed by other components (like the volume manager). The duration
// of this call is the minimum latency for static pods to be restarted if they
// are updated with a fixed UID (most should use a dynamic UID), and no config
// updates are delivered to the pod workers while this method is running.
func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
	// The kubelet lacks checkpointing, so we need to introspect the set of pods
	// in the cgroup tree prior to inspecting the set of pods in our pod manager.
	// This ensures our view of the cgroup tree does not mistakenly observe pods
	// that are added after the fact...
	var (
		cgroupPods map[types.UID]cm.CgroupName
		err        error
	)
	if kl.cgroupsPerQOS {
		pcm := kl.containerManager.NewPodContainerManager()
		cgroupPods, err = pcm.GetAllPodsFromCgroups()
		if err != nil {
			return fmt.Errorf("failed to get list of pods that still exist on cgroup mounts: %v", err)
		}
	}

	allPods, mirrorPods, orphanedMirrorPodFullnames := kl.podManager.GetPodsAndMirrorPods()

	// Pod phase progresses monotonically. Once a pod has reached a final state,
	// it should never leave regardless of the restart policy. The statuses
	// of such pods should not be changed, and there is no need to sync them.
	// TODO: the logic here does not handle two cases:
	//  1. If the containers were removed immediately after they died, kubelet
	//     may fail to generate correct statuses, let alone filtering correctly.
	//  2. If kubelet restarted before writing the terminated status for a pod
	//     to the apiserver, it could still restart the terminated pod (even
	//     though the pod was not considered terminated by the apiserver).
	// These two conditions could be alleviated by checkpointing kubelet.

	// Stop the workers for terminated pods not in the config source
	klog.V(3).InfoS("Clean up pod workers for terminated pods")
	workingPods := kl.podWorkers.SyncKnownPods(allPods)

	// Reconcile: At this point the pod workers have been pruned to the set of
	// desired pods. Pods that must be restarted due to UID reuse, or leftover
	// pods from previous runs, are not known to the pod worker.

	allPodsByUID := make(map[types.UID]*v1.Pod)
	for _, pod := range allPods {
		allPodsByUID[pod.UID] = pod
	}

	// Identify the set of pods that have workers, which should be all pods
	// from config that are not terminated, as well as any terminating pods
	// that have already been removed from config. Pods that are terminating
	// will be added to possiblyRunningPods, to prevent overly aggressive
	// cleanup of pod cgroups.
	stringIfTrue := func(t bool) string {
		if t {
			return "true"
		}
		return ""
	}
	runningPods := make(map[types.UID]sets.Empty)
	possiblyRunningPods := make(map[types.UID]sets.Empty)
	for uid, sync := range workingPods {
		switch sync.State {
		case SyncPod:
			runningPods[uid] = struct{}{}
			possiblyRunningPods[uid] = struct{}{}
		case TerminatingPod:
			possiblyRunningPods[uid] = struct{}{}
		default:
		}
	}

	// Retrieve the list of running containers from the runtime to perform cleanup.
	// We need the latest state to avoid delaying restarts of static pods that reuse
	// a UID.
	if err := kl.runtimeCache.ForceUpdateIfOlder(ctx, kl.clock.Now()); err != nil {
		klog.ErrorS(err, "Error listing containers")
		return err
	}
	runningRuntimePods, err := kl.runtimeCache.GetPods(ctx)
	if err != nil {
		klog.ErrorS(err, "Error listing containers")
		return err
	}

	// Stop probing pods that are not running
	klog.V(3).InfoS("Clean up probes for terminated pods")
	kl.probeManager.CleanupPods(possiblyRunningPods)

	// Remove orphaned pod statuses not in the total list of known config pods
	klog.V(3).InfoS("Clean up orphaned pod statuses")
	kl.removeOrphanedPodStatuses(allPods, mirrorPods)

	// Remove orphaned pod user namespace allocations (if any).
	klog.V(3).InfoS("Clean up orphaned pod user namespace allocations")
	if err = kl.usernsManager.CleanupOrphanedPodUsernsAllocations(allPods, runningRuntimePods); err != nil {
		klog.ErrorS(err, "Failed cleaning up orphaned pod user namespaces allocations")
	}

	// Remove orphaned volumes from pods that are known not to have any
	// containers. Note that we pass all pods (including terminated pods) to
	// the function, so that we don't remove volumes associated with terminated
	// but not yet deleted pods.
	// TODO: this method could more aggressively cleanup terminated pods
	// in the future (volumes, mount dirs, logs, and containers could all be
	// better separated)
	klog.V(3).InfoS("Clean up orphaned pod directories")
	err = kl.cleanupOrphanedPodDirs(allPods, runningRuntimePods)
	if err != nil {
		// We want all cleanup tasks to be run even if one of them failed. So
		// we just log an error here and continue other cleanup tasks.
		// This also applies to the other clean up tasks.
		klog.ErrorS(err, "Failed cleaning up orphaned pod directories")
	}

	// Remove any orphaned mirror pods (mirror pods are tracked by name via the
	// pod worker)
	klog.V(3).InfoS("Clean up orphaned mirror pods")
	for _, podFullname := range orphanedMirrorPodFullnames {
		if !kl.podWorkers.IsPodForMirrorPodTerminatingByFullName(podFullname) {
			_, err := kl.mirrorPodClient.DeleteMirrorPod(podFullname, nil)
			if err != nil {
				klog.ErrorS(err, "Encountered error when deleting mirror pod", "podName", podFullname)
			} else {
				klog.V(3).InfoS("Deleted mirror pod", "podName", podFullname)
			}
		}
	}

	// After pruning pod workers for terminated pods get the list of active pods for
	// metrics and to determine restarts.
	activePods := kl.filterOutInactivePods(allPods)
	allRegularPods, allStaticPods := splitPodsByStatic(allPods)
	activeRegularPods, activeStaticPods := splitPodsByStatic(activePods)
	metrics.DesiredPodCount.WithLabelValues("").Set(float64(len(allRegularPods)))
	metrics.DesiredPodCount.WithLabelValues("true").Set(float64(len(allStaticPods)))
	metrics.ActivePodCount.WithLabelValues("").Set(float64(len(activeRegularPods)))
	metrics.ActivePodCount.WithLabelValues("true").Set(float64(len(activeStaticPods)))
	metrics.MirrorPodCount.Set(float64(len(mirrorPods)))

	// At this point, the pod worker is aware of which pods are not desired (SyncKnownPods).
	// We now look through the set of active pods for those that the pod worker is not aware of
	// and deliver an update. The most common reason a pod is not known is because the pod was
	// deleted and recreated with the same UID while the pod worker was driving its lifecycle (very
	// very rare for API pods, common for static pods with fixed UIDs). Containers that may still
	// be running from a previous execution must be reconciled by the pod worker's sync method.
	// We must use active pods because that is the set of admitted pods (podManager includes pods
	// that will never be run, and statusManager tracks already rejected pods).
	var restartCount, restartCountStatic int
	for _, desiredPod := range activePods {
		if _, knownPod := workingPods[desiredPod.UID]; knownPod {
			continue
		}

		klog.V(3).InfoS("Pod will be restarted because it is in the desired set and not known to the pod workers (likely due to UID reuse)", "podUID", desiredPod.UID)
		isStatic := kubetypes.IsStaticPod(desiredPod)
		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(desiredPod)
		if pod == nil || wasMirror {
			klog.V(2).InfoS("Programmer error, restartable pod was a mirror pod but activePods should never contain a mirror pod", "podUID", desiredPod.UID)
			continue
		}
		kl.podWorkers.UpdatePod(UpdatePodOptions{
			UpdateType: kubetypes.SyncPodCreate,
			Pod:        pod,
			MirrorPod:  mirrorPod,
		})

		// the desired pod is now known as well
		workingPods[desiredPod.UID] = PodWorkerSync{State: SyncPod, HasConfig: true, Static: isStatic}
		if isStatic {
			// restartable static pods are the normal case
			restartCountStatic++
		} else {
			// almost certainly means shenanigans, as API pods should never have the same UID after being deleted and recreated
			// unless there is a major API violation
			restartCount++
		}
	}
	metrics.RestartedPodTotal.WithLabelValues("true").Add(float64(restartCountStatic))
	metrics.RestartedPodTotal.WithLabelValues("").Add(float64(restartCount))

	// Complete termination of deleted pods that are not runtime pods (don't have
	// running containers), are terminal, and are not known to pod workers.
	// An example is pods rejected during kubelet admission that have never
	// started before (i.e. do not have an orphaned pod).
	// Adding the pods with SyncPodKill to pod workers allows the kubelet to proceed
	// with force-deletion of such pods, yet prevents re-entry of the routine in the
	// next invocation of HandlePodCleanups.
	for _, pod := range kl.filterTerminalPodsToDelete(allPods, runningRuntimePods, workingPods) {
		klog.V(3).InfoS("Handling termination and deletion of the pod to pod workers", "pod", klog.KObj(pod), "podUID", pod.UID)
		kl.podWorkers.UpdatePod(UpdatePodOptions{
			UpdateType: kubetypes.SyncPodKill,
			Pod:        pod,
		})
	}

	// Finally, terminate any pods that are observed in the runtime but not present in the list of
	// known running pods from config. If we do terminate running runtime pods that will happen
	// asynchronously in the background and those will be processed in the next invocation of
	// HandlePodCleanups.
	var orphanCount int
	for _, runningPod := range runningRuntimePods {
		// If there are orphaned pod resources in CRI that are unknown to the pod worker, terminate them
		// now. Since housekeeping is exclusive to other pod worker updates, we know that no pods have
		// been added to the pod worker in the meantime. Note that pods that are not visible in the runtime
		// but which were previously known are terminated by SyncKnownPods().
		_, knownPod := workingPods[runningPod.ID]
		if !knownPod {
			one := int64(1)
			killPodOptions := &KillPodOptions{
				PodTerminationGracePeriodSecondsOverride: &one,
			}
			klog.V(2).InfoS("Clean up containers for orphaned pod we had not seen before", "podUID", runningPod.ID, "killPodOptions", killPodOptions)
			kl.podWorkers.UpdatePod(UpdatePodOptions{
				UpdateType:     kubetypes.SyncPodKill,
				RunningPod:     runningPod,
				KillPodOptions: killPodOptions,
			})

			// the running pod is now known as well
			workingPods[runningPod.ID] = PodWorkerSync{State: TerminatingPod, Orphan: true}
			orphanCount++
		}
	}
	metrics.OrphanedRuntimePodTotal.Add(float64(orphanCount))

	// Now that we have recorded any terminating pods, and added new pods that should be running,
	// record a summary here. Not all possible combinations of PodWorkerSync values are valid.
	counts := make(map[PodWorkerSync]int)
	for _, sync := range workingPods {
		counts[sync]++
	}
	for validSync, configState := range map[PodWorkerSync]string{
		{HasConfig: true, Static: true}:                "desired",
		{HasConfig: true, Static: false}:               "desired",
		{Orphan: true, HasConfig: true, Static: true}:  "orphan",
		{Orphan: true, HasConfig: true, Static: false}: "orphan",
		{Orphan: true, HasConfig: false}:               "runtime_only",
	} {
		for _, state := range []PodWorkerState{SyncPod, TerminatingPod, TerminatedPod} {
			validSync.State = state
			count := counts[validSync]
			delete(counts, validSync)
			staticString := stringIfTrue(validSync.Static)
			if !validSync.HasConfig {
				staticString = "unknown"
			}
			metrics.WorkingPodCount.WithLabelValues(state.String(), configState, staticString).Set(float64(count))
		}
	}
	if len(counts) > 0 {
		// in case a combination is lost
		klog.V(3).InfoS("Programmer error, did not report a kubelet_working_pods metric for a value returned by SyncKnownPods", "counts", counts)
	}

	// Remove any cgroups in the hierarchy for pods that are definitely no longer
	// running (not in the container runtime).
	if kl.cgroupsPerQOS {
		pcm := kl.containerManager.NewPodContainerManager()
		klog.V(3).InfoS("Clean up orphaned pod cgroups")
		kl.cleanupOrphanedPodCgroups(pcm, cgroupPods, possiblyRunningPods)
	}

	// Cleanup any backoff entries.
	kl.backOff.GC()
	return nil
}

// filterTerminalPodsToDelete returns terminal pods which are ready to be
// deleted by the status manager, but are not in pod workers.
// First, the check for deletionTimestamp is a performance optimization as we
// don't need to do anything with terminal pods without deletionTimestamp.
// Second, the check for terminal pods is to avoid race conditions of triggering
// deletion on Pending pods which are not yet added to pod workers.
// Third, the check to skip pods known to pod workers is that the lifecycle of
// such pods is already handled by pod workers.
// Finally, we skip runtime pods as their termination is handled separately in
// the HandlePodCleanups routine.
func (kl *Kubelet) filterTerminalPodsToDelete(allPods []*v1.Pod, runningRuntimePods []*kubecontainer.Pod, workingPods map[types.UID]PodWorkerSync) map[types.UID]*v1.Pod {
	terminalPodsToDelete := make(map[types.UID]*v1.Pod)
	for _, pod := range allPods {
		if pod.DeletionTimestamp == nil {
			// skip pods which don't have a deletion timestamp
			continue
		}
		if !podutil.IsPodPhaseTerminal(pod.Status.Phase) {
			// skip the non-terminal pods
			continue
		}
		if _, knownPod := workingPods[pod.UID]; knownPod {
			// skip pods known to pod workers
			continue
		}
		terminalPodsToDelete[pod.UID] = pod
	}
	for _, runningRuntimePod := range runningRuntimePods {
		// skip running runtime pods - they are handled by a dedicated routine
		// which terminates the containers
		delete(terminalPodsToDelete, runningRuntimePod.ID)
	}
	return terminalPodsToDelete
}

// splitPodsByStatic separates a list of desired pods from the pod manager into
// regular or static pods. Mirror pods are not valid config sources (a mirror pod
// being created cannot cause the Kubelet to start running a static pod) and are
// excluded.
func splitPodsByStatic(pods []*v1.Pod) (regular, static []*v1.Pod) {
	regular, static = make([]*v1.Pod, 0, len(pods)), make([]*v1.Pod, 0, len(pods))
	for _, pod := range pods {
		if kubetypes.IsMirrorPod(pod) {
			continue
		}
		if kubetypes.IsStaticPod(pod) {
			static = append(static, pod)
		} else {
			regular = append(regular, pod)
		}
	}
	return regular, static
}

// validateContainerLogStatus returns the container ID for the desired container to retrieve logs for, based on the state
// of the container. The previous flag will only return the logs for the last terminated container, otherwise, the current
// running container is preferred over a previous termination. If info about the container is not available then a specific
// error is returned to the end user.
func (kl *Kubelet) validateContainerLogStatus(podName string, podStatus *v1.PodStatus, containerName string, previous bool) (containerID kubecontainer.ContainerID, err error) {
	var cID string

	cStatus, found := podutil.GetContainerStatus(podStatus.ContainerStatuses, containerName)
	if !found {
		cStatus, found = podutil.GetContainerStatus(podStatus.InitContainerStatuses, containerName)
	}
	if !found {
		cStatus, found = podutil.GetContainerStatus(podStatus.EphemeralContainerStatuses, containerName)
	}
	if !found {
		return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is not available", containerName, podName)
	}
	lastState := cStatus.LastTerminationState
	waiting, running, terminated := cStatus.State.Waiting, cStatus.State.Running, cStatus.State.Terminated

	switch {
	case previous:
		if lastState.Terminated == nil || lastState.Terminated.ContainerID == "" {
			return kubecontainer.ContainerID{}, fmt.Errorf("previous terminated container %q in pod %q not found", containerName, podName)
		}
		cID = lastState.Terminated.ContainerID

	case running != nil:
		cID = cStatus.ContainerID

	case terminated != nil:
		// in cases where the next container didn't start, terminated.ContainerID will be empty, so get logs from the lastState.Terminated.
1368 if terminated.ContainerID == "" { 1369 if lastState.Terminated != nil && lastState.Terminated.ContainerID != "" { 1370 cID = lastState.Terminated.ContainerID 1371 } else { 1372 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is terminated", containerName, podName) 1373 } 1374 } else { 1375 cID = terminated.ContainerID 1376 } 1377 1378 case lastState.Terminated != nil: 1379 if lastState.Terminated.ContainerID == "" { 1380 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is terminated", containerName, podName) 1381 } 1382 cID = lastState.Terminated.ContainerID 1383 1384 case waiting != nil: 1385 // output some info for the most common pending failures 1386 switch reason := waiting.Reason; reason { 1387 case images.ErrImagePull.Error(): 1388 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: image can't be pulled", containerName, podName) 1389 case images.ErrImagePullBackOff.Error(): 1390 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: trying and failing to pull image", containerName, podName) 1391 default: 1392 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: %v", containerName, podName, reason) 1393 } 1394 default: 1395 // unrecognized state 1396 return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start - no logs yet", containerName, podName) 1397 } 1398 1399 return kubecontainer.ParseContainerID(cID), nil 1400 } 1401 1402 // GetKubeletContainerLogs returns logs from the container 1403 // TODO: this method is returning logs of random container attempts, when it should be returning the most recent attempt 1404 // or all of them. 1405 func (kl *Kubelet) GetKubeletContainerLogs(ctx context.Context, podFullName, containerName string, logOptions *v1.PodLogOptions, stdout, stderr io.Writer) error { 1406 // Pod workers periodically write status to statusManager. If status is not 1407 // cached there, something is wrong (or kubelet just restarted and hasn't 1408 // caught up yet). Just assume the pod is not ready yet. 1409 name, namespace, err := kubecontainer.ParsePodFullName(podFullName) 1410 if err != nil { 1411 return fmt.Errorf("unable to parse pod full name %q: %v", podFullName, err) 1412 } 1413 1414 pod, ok := kl.GetPodByName(namespace, name) 1415 if !ok { 1416 return fmt.Errorf("pod %q cannot be found - no logs available", name) 1417 } 1418 1419 // TODO: this should be using the podWorker's pod store as authoritative, since 1420 // the mirrorPod might still exist, the pod may have been force deleted but 1421 // is still terminating (users should be able to view logs of force deleted static pods 1422 // based on full name). 1423 var podUID types.UID 1424 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 1425 if wasMirror { 1426 if pod == nil { 1427 return fmt.Errorf("mirror pod %q does not have a corresponding pod", name) 1428 } 1429 podUID = mirrorPod.UID 1430 } else { 1431 podUID = pod.UID 1432 } 1433 1434 podStatus, found := kl.statusManager.GetPodStatus(podUID) 1435 if !found { 1436 // If there is no cached status, use the status from the 1437 // config source (apiserver). This is useful if kubelet 1438 // has recently been restarted. 
1439 podStatus = pod.Status 1440 } 1441 1442 // TODO: Consolidate the logic here with kuberuntime.GetContainerLogs, here we convert container name to containerID, 1443 // but inside kuberuntime we convert container id back to container name and restart count. 1444 // TODO: After separate container log lifecycle management, we should get log based on the existing log files 1445 // instead of container status. 1446 containerID, err := kl.validateContainerLogStatus(pod.Name, &podStatus, containerName, logOptions.Previous) 1447 if err != nil { 1448 return err 1449 } 1450 1451 // Do a zero-byte write to stdout before handing off to the container runtime. 1452 // This ensures at least one Write call is made to the writer when copying starts, 1453 // even if we then block waiting for log output from the container. 1454 if _, err := stdout.Write([]byte{}); err != nil { 1455 return err 1456 } 1457 1458 return kl.containerRuntime.GetContainerLogs(ctx, pod, containerID, logOptions, stdout, stderr) 1459 } 1460 1461 // getPhase returns the phase of a pod given its container info. 1462 func getPhase(pod *v1.Pod, info []v1.ContainerStatus, podIsTerminal bool) v1.PodPhase { 1463 spec := pod.Spec 1464 pendingInitialization := 0 1465 failedInitialization := 0 1466 1467 // regular init containers 1468 for _, container := range spec.InitContainers { 1469 if kubetypes.IsRestartableInitContainer(&container) { 1470 // Skip the restartable init containers here to handle them separately as 1471 // they are slightly different from the init containers in terms of the 1472 // pod phase. 1473 continue 1474 } 1475 1476 containerStatus, ok := podutil.GetContainerStatus(info, container.Name) 1477 if !ok { 1478 pendingInitialization++ 1479 continue 1480 } 1481 1482 switch { 1483 case containerStatus.State.Running != nil: 1484 pendingInitialization++ 1485 case containerStatus.State.Terminated != nil: 1486 if containerStatus.State.Terminated.ExitCode != 0 { 1487 failedInitialization++ 1488 } 1489 case containerStatus.State.Waiting != nil: 1490 if containerStatus.LastTerminationState.Terminated != nil { 1491 if containerStatus.LastTerminationState.Terminated.ExitCode != 0 { 1492 failedInitialization++ 1493 } 1494 } else { 1495 pendingInitialization++ 1496 } 1497 default: 1498 pendingInitialization++ 1499 } 1500 } 1501 1502 // counters for restartable init and regular containers 1503 unknown := 0 1504 running := 0 1505 waiting := 0 1506 stopped := 0 1507 succeeded := 0 1508 1509 // restartable init containers 1510 for _, container := range spec.InitContainers { 1511 if !kubetypes.IsRestartableInitContainer(&container) { 1512 // Skip the regular init containers, as they have been handled above. 1513 continue 1514 } 1515 containerStatus, ok := podutil.GetContainerStatus(info, container.Name) 1516 if !ok { 1517 unknown++ 1518 continue 1519 } 1520 1521 switch { 1522 case containerStatus.State.Running != nil: 1523 if containerStatus.Started == nil || !*containerStatus.Started { 1524 pendingInitialization++ 1525 } 1526 running++ 1527 case containerStatus.State.Terminated != nil: 1528 // Do nothing here, as terminated restartable init containers are not 1529 // taken into account for the pod phase. 1530 case containerStatus.State.Waiting != nil: 1531 if containerStatus.LastTerminationState.Terminated != nil { 1532 // Do nothing here, as terminated restartable init containers are not 1533 // taken into account for the pod phase. 
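// A waiting restartable init container with a previous termination is
// treated as restarting: it contributes to neither pendingInitialization
// nor the waiting counter.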
1534 } else {
1535 pendingInitialization++
1536 waiting++
1537 }
1538 default:
1539 pendingInitialization++
1540 unknown++
1541 }
1542 }
1543 
1544 for _, container := range spec.Containers {
1545 containerStatus, ok := podutil.GetContainerStatus(info, container.Name)
1546 if !ok {
1547 unknown++
1548 continue
1549 }
1550 
1551 switch {
1552 case containerStatus.State.Running != nil:
1553 running++
1554 case containerStatus.State.Terminated != nil:
1555 stopped++
1556 if containerStatus.State.Terminated.ExitCode == 0 {
1557 succeeded++
1558 }
1559 case containerStatus.State.Waiting != nil:
1560 if containerStatus.LastTerminationState.Terminated != nil {
1561 stopped++
1562 } else {
1563 waiting++
1564 }
1565 default:
1566 unknown++
1567 }
1568 }
1569 
1570 if failedInitialization > 0 && spec.RestartPolicy == v1.RestartPolicyNever {
1571 return v1.PodFailed
1572 }
1573 
1574 switch {
1575 case pendingInitialization > 0 &&
1576 // This is needed to handle the case where the pod has been initialized but
1577 // the restartable init containers are restarting; the pod should not be
1578 // placed back into v1.PodPending since the regular containers have run.
1579 !kubecontainer.HasAnyRegularContainerStarted(&spec, info):
1580 fallthrough
1581 case waiting > 0:
1582 klog.V(5).InfoS("Pod waiting > 0, pending")
1583 // One or more containers have not been started
1584 return v1.PodPending
1585 case running > 0 && unknown == 0:
1586 // All containers have been started, and at least
1587 // one container is running
1588 return v1.PodRunning
1589 case running == 0 && stopped > 0 && unknown == 0:
1590 // The pod is terminal so its containers won't be restarted regardless
1591 // of the restart policy.
1592 if podIsTerminal {
1593 // TODO(#116484): Also assign terminal phase to static pods.
1594 if !kubetypes.IsStaticPod(pod) {
1595 // All regular containers are terminated in success and all restartable
1596 // init containers are stopped.
1597 if stopped == succeeded {
1598 return v1.PodSucceeded
1599 }
1600 // There is at least one failure
1601 return v1.PodFailed
1602 }
1603 }
1604 // All containers are terminated
1605 if spec.RestartPolicy == v1.RestartPolicyAlways {
1606 // All containers are in the process of restarting
1607 return v1.PodRunning
1608 }
1609 if stopped == succeeded {
1610 // RestartPolicy is not Always, all containers are terminated in success
1611 // and all restartable init containers are stopped.
1612 return v1.PodSucceeded
1613 }
1614 if spec.RestartPolicy == v1.RestartPolicyNever {
1615 // RestartPolicy is Never, and all containers are
1616 // terminated with at least one in failure
1617 return v1.PodFailed
1618 }
1619 // RestartPolicy is OnFailure; at least one container failed
1620 // and is in the process of restarting
1621 return v1.PodRunning
1622 default:
1623 klog.V(5).InfoS("Pod default case, pending")
1624 return v1.PodPending
1625 }
1626 }
1627 
1628 func deleteCustomResourceFromResourceRequirements(target *v1.ResourceRequirements) {
1629 for resource := range target.Limits {
1630 if resource != v1.ResourceCPU && resource != v1.ResourceMemory && resource != v1.ResourceEphemeralStorage {
1631 delete(target.Limits, resource)
1632 }
1633 }
1634 for resource := range target.Requests {
1635 if resource != v1.ResourceCPU && resource != v1.ResourceMemory && resource != v1.ResourceEphemeralStorage {
1636 delete(target.Requests, resource)
1637 }
1638 }
1639 }
1640 
1641 func (kl *Kubelet) determinePodResizeStatus(pod *v1.Pod, podStatus *v1.PodStatus) v1.PodResizeStatus {
1642 var podResizeStatus v1.PodResizeStatus
1643 specStatusDiffer := false
1644 for _, c := range pod.Spec.Containers {
1645 if cs, ok := podutil.GetContainerStatus(podStatus.ContainerStatuses, c.Name); ok {
1646 cResourceCopy := c.Resources.DeepCopy()
1647 // For both requests and limits, we only compare cpu, memory and ephemeral-storage,
1648 // which are the resources included in convertToAPIContainerStatuses
1649 deleteCustomResourceFromResourceRequirements(cResourceCopy)
1650 csResourceCopy := cs.Resources.DeepCopy()
1651 if csResourceCopy != nil && !cmp.Equal(*cResourceCopy, *csResourceCopy) {
1652 specStatusDiffer = true
1653 break
1654 }
1655 }
1656 }
1657 if !specStatusDiffer {
1658 // Clear last resize state from checkpoint
1659 if err := kl.statusManager.SetPodResizeStatus(pod.UID, ""); err != nil {
1660 klog.ErrorS(err, "SetPodResizeStatus failed", "pod", pod.Name)
1661 }
1662 } else {
1663 if resizeStatus, found := kl.statusManager.GetPodResizeStatus(string(pod.UID)); found {
1664 podResizeStatus = resizeStatus
1665 }
1666 }
1667 return podResizeStatus
1668 }
1669 
1670 // generateAPIPodStatus creates the final API pod status for a pod, given the
1671 // internal pod status. This method should only be called from within sync*Pod methods.
1672 func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus, podIsTerminal bool) v1.PodStatus {
1673 klog.V(3).InfoS("Generating pod status", "podIsTerminal", podIsTerminal, "pod", klog.KObj(pod))
1674 // use the previous pod status, or the api status, as the basis for this pod
1675 oldPodStatus, found := kl.statusManager.GetPodStatus(pod.UID)
1676 if !found {
1677 oldPodStatus = pod.Status
1678 }
1679 s := kl.convertStatusToAPIStatus(pod, podStatus, oldPodStatus)
1680 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
1681 s.Resize = kl.determinePodResizeStatus(pod, s)
1682 }
1683 // calculate the next phase and preserve reason
1684 allStatus := append(append([]v1.ContainerStatus{}, s.ContainerStatuses...), s.InitContainerStatuses...)
1685 s.Phase = getPhase(pod, allStatus, podIsTerminal)
1686 klog.V(4).InfoS("Got phase for pod", "pod", klog.KObj(pod), "oldPhase", oldPodStatus.Phase, "phase", s.Phase)
1687 
1688 // Perform a three-way merge between the statuses from the status manager,
1689 // runtime, and generated status to ensure terminal status is correctly set.
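// Precedence when the freshly generated phase is non-terminal: a terminal
// phase recorded by the status manager wins first, then a terminal phase
// already published in the API; pods never move back out of a terminal phase.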
1690 if s.Phase != v1.PodFailed && s.Phase != v1.PodSucceeded {
1691 switch {
1692 case oldPodStatus.Phase == v1.PodFailed || oldPodStatus.Phase == v1.PodSucceeded:
1693 klog.V(4).InfoS("Status manager phase was terminal, updating phase to match", "pod", klog.KObj(pod), "phase", oldPodStatus.Phase)
1694 s.Phase = oldPodStatus.Phase
1695 case pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded:
1696 klog.V(4).InfoS("API phase was terminal, updating phase to match", "pod", klog.KObj(pod), "phase", pod.Status.Phase)
1697 s.Phase = pod.Status.Phase
1698 }
1699 }
1700 
1701 if s.Phase == oldPodStatus.Phase {
1702 // preserve the reason and message which is associated with the phase
1703 s.Reason = oldPodStatus.Reason
1704 s.Message = oldPodStatus.Message
1705 if len(s.Reason) == 0 {
1706 s.Reason = pod.Status.Reason
1707 }
1708 if len(s.Message) == 0 {
1709 s.Message = pod.Status.Message
1710 }
1711 }
1712 
1713 // check if an internal module has requested that the pod be evicted; if so, override the reason and message
1714 for _, podSyncHandler := range kl.PodSyncHandlers {
1715 if result := podSyncHandler.ShouldEvict(pod); result.Evict {
1716 s.Phase = v1.PodFailed
1717 s.Reason = result.Reason
1718 s.Message = result.Message
1719 break
1720 }
1721 }
1722 
1723 // pods are not allowed to transition out of terminal phases
1724 if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
1725 // API server shows terminal phase; transitions are not allowed
1726 if s.Phase != pod.Status.Phase {
1727 klog.ErrorS(nil, "Pod attempted illegal phase transition", "pod", klog.KObj(pod), "originalStatusPhase", pod.Status.Phase, "apiStatusPhase", s.Phase, "apiStatus", s)
1728 // Force back to phase from the API server
1729 s.Phase = pod.Status.Phase
1730 }
1731 }
1732 
1733 // ensure the probe managers have up-to-date status for containers
1734 kl.probeManager.UpdatePodStatus(pod, s)
1735 
1736 // preserve all conditions not owned by the kubelet
1737 s.Conditions = make([]v1.PodCondition, 0, len(pod.Status.Conditions)+1)
1738 for _, c := range pod.Status.Conditions {
1739 if !kubetypes.PodConditionByKubelet(c.Type) {
1740 s.Conditions = append(s.Conditions, c)
1741 }
1742 }
1743 
1744 if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
1745 // copy over the pod disruption conditions from state which is already
1746 // updated during the eviction (due to either node resource pressure or
1747 // node graceful shutdown). We do not re-generate the conditions based
1748 // on the container statuses as they are added based on one-time events.
1749 cType := v1.DisruptionTarget
1750 if _, condition := podutil.GetPodConditionFromList(oldPodStatus.Conditions, cType); condition != nil {
1751 s.Conditions = utilpod.ReplaceOrAppendPodCondition(s.Conditions, condition)
1752 }
1753 }
1754 
1755 // set all Kubelet-owned conditions
1756 if utilfeature.DefaultFeatureGate.Enabled(features.PodReadyToStartContainersCondition) {
1757 s.Conditions = append(s.Conditions, status.GeneratePodReadyToStartContainersCondition(pod, podStatus))
1758 }
1759 allContainerStatuses := append(s.InitContainerStatuses, s.ContainerStatuses...)
1760 s.Conditions = append(s.Conditions, status.GeneratePodInitializedCondition(&pod.Spec, allContainerStatuses, s.Phase))
1761 s.Conditions = append(s.Conditions, status.GeneratePodReadyCondition(&pod.Spec, s.Conditions, allContainerStatuses, s.Phase))
1762 s.Conditions = append(s.Conditions, status.GenerateContainersReadyCondition(&pod.Spec, allContainerStatuses, s.Phase))
1763 s.Conditions = append(s.Conditions, v1.PodCondition{
1764 Type: v1.PodScheduled,
1765 Status: v1.ConditionTrue,
1766 })
1767 // set HostIP/HostIPs and initialize PodIP/PodIPs for host network pods
1768 if kl.kubeClient != nil {
1769 hostIPs, err := kl.getHostIPsAnyWay()
1770 if err != nil {
1771 klog.V(4).InfoS("Cannot get host IPs", "err", err)
1772 } else {
1773 if s.HostIP != "" {
1774 if utilnet.IPFamilyOfString(s.HostIP) != utilnet.IPFamilyOf(hostIPs[0]) {
1775 kl.recorder.Eventf(pod, v1.EventTypeWarning, "HostIPsIPFamilyMismatch",
1776 "Kubelet detected an IPv%s node IP (%s), but the cloud provider selected an IPv%s node IP (%s); pass an explicit `--node-ip` to kubelet to fix this.",
1777 utilnet.IPFamilyOfString(s.HostIP), s.HostIP, utilnet.IPFamilyOf(hostIPs[0]), hostIPs[0].String())
1778 }
1779 }
1780 s.HostIP = hostIPs[0].String()
1781 if utilfeature.DefaultFeatureGate.Enabled(features.PodHostIPs) {
1782 s.HostIPs = []v1.HostIP{{IP: s.HostIP}}
1783 if len(hostIPs) == 2 {
1784 s.HostIPs = append(s.HostIPs, v1.HostIP{IP: hostIPs[1].String()})
1785 }
1786 }
1787 
1788 // HostNetwork Pods inherit the node IPs as PodIPs. They are immutable once set;
1789 // the only change allowed is adding a secondary IP if the node becomes dual-stack.
1790 if kubecontainer.IsHostNetworkPod(pod) {
1791 // Primary IP is not set
1792 if s.PodIP == "" {
1793 s.PodIP = hostIPs[0].String()
1794 s.PodIPs = []v1.PodIP{{IP: s.PodIP}}
1795 }
1796 // Secondary IP is not set #105320
1797 if len(hostIPs) == 2 && len(s.PodIPs) == 1 {
1798 if utilnet.IPFamilyOfString(s.PodIPs[0].IP) != utilnet.IPFamilyOf(hostIPs[1]) {
1799 s.PodIPs = append(s.PodIPs, v1.PodIP{IP: hostIPs[1].String()})
1800 }
1801 }
1802 }
1803 }
1804 }
1805 
1806 return *s
1807 }
1808 
1809 // sortPodIPs returns the PodIPs sorted and truncated by the cluster IP family preference.
1810 // The runtime pod status may have an arbitrary number of IPs, in an arbitrary order.
1811 // PodIPs are obtained by: func (m *kubeGenericRuntimeManager) determinePodSandboxIPs()
1812 // Pick out the first returned IP of the same IP family as the node IP
1813 // first, followed by the first IP of the opposite IP family (if any),
1814 // and use them for the Pod.Status.PodIPs and the Downward API environment variables
1815 func (kl *Kubelet) sortPodIPs(podIPs []string) []string {
1816 ips := make([]string, 0, 2)
1817 var validPrimaryIP, validSecondaryIP func(ip string) bool
1818 if len(kl.nodeIPs) == 0 || utilnet.IsIPv4(kl.nodeIPs[0]) {
1819 validPrimaryIP = utilnet.IsIPv4String
1820 validSecondaryIP = utilnet.IsIPv6String
1821 } else {
1822 validPrimaryIP = utilnet.IsIPv6String
1823 validSecondaryIP = utilnet.IsIPv4String
1824 }
1825 for _, ip := range podIPs {
1826 if validPrimaryIP(ip) {
1827 ips = append(ips, ip)
1828 break
1829 }
1830 }
1831 for _, ip := range podIPs {
1832 if validSecondaryIP(ip) {
1833 ips = append(ips, ip)
1834 break
1835 }
1836 }
1837 return ips
1838 }
1839 
1840 // convertStatusToAPIStatus initializes an API PodStatus for the given pod from
1841 // the given internal pod status and the previous state of the pod from the API.
1842 // It is purely transformative and does not alter the kubelet state at all. 1843 func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus, oldPodStatus v1.PodStatus) *v1.PodStatus { 1844 var apiPodStatus v1.PodStatus 1845 1846 // copy pod status IPs to avoid race conditions with PodStatus #102806 1847 podIPs := make([]string, len(podStatus.IPs)) 1848 copy(podIPs, podStatus.IPs) 1849 1850 // make podIPs order match node IP family preference #97979 1851 podIPs = kl.sortPodIPs(podIPs) 1852 for _, ip := range podIPs { 1853 apiPodStatus.PodIPs = append(apiPodStatus.PodIPs, v1.PodIP{IP: ip}) 1854 } 1855 if len(apiPodStatus.PodIPs) > 0 { 1856 apiPodStatus.PodIP = apiPodStatus.PodIPs[0].IP 1857 } 1858 1859 // set status for Pods created on versions of kube older than 1.6 1860 apiPodStatus.QOSClass = v1qos.GetPodQOS(pod) 1861 1862 apiPodStatus.ContainerStatuses = kl.convertToAPIContainerStatuses( 1863 pod, podStatus, 1864 oldPodStatus.ContainerStatuses, 1865 pod.Spec.Containers, 1866 len(pod.Spec.InitContainers) > 0, 1867 false, 1868 ) 1869 apiPodStatus.InitContainerStatuses = kl.convertToAPIContainerStatuses( 1870 pod, podStatus, 1871 oldPodStatus.InitContainerStatuses, 1872 pod.Spec.InitContainers, 1873 len(pod.Spec.InitContainers) > 0, 1874 true, 1875 ) 1876 var ecSpecs []v1.Container 1877 for i := range pod.Spec.EphemeralContainers { 1878 ecSpecs = append(ecSpecs, v1.Container(pod.Spec.EphemeralContainers[i].EphemeralContainerCommon)) 1879 } 1880 1881 // #80875: By now we've iterated podStatus 3 times. We could refactor this to make a single 1882 // pass through podStatus.ContainerStatuses 1883 apiPodStatus.EphemeralContainerStatuses = kl.convertToAPIContainerStatuses( 1884 pod, podStatus, 1885 oldPodStatus.EphemeralContainerStatuses, 1886 ecSpecs, 1887 len(pod.Spec.InitContainers) > 0, 1888 false, 1889 ) 1890 1891 return &apiPodStatus 1892 } 1893 1894 // convertToAPIContainerStatuses converts the given internal container 1895 // statuses into API container statuses. 
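// At a high level the conversion proceeds in stages: seed every container
// with a default waiting status (carrying over restart counts and the last
// termination state from the previous status), infer a terminal state for
// containers that were previously running but have vanished from the
// runtime, overlay the two most recent runtime attempts per container
// (the newest becomes State, the next newest becomes LastTerminationState),
// surface start failures recorded in the reason cache as waiting states, and
// finally sort the result into a deterministic order.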
1896 func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecontainer.PodStatus, previousStatus []v1.ContainerStatus, containers []v1.Container, hasInitContainers, isInitContainer bool) []v1.ContainerStatus {
1897 convertContainerStatus := func(cs *kubecontainer.Status, oldStatus *v1.ContainerStatus) *v1.ContainerStatus {
1898 cid := cs.ID.String()
1899 status := &v1.ContainerStatus{
1900 Name: cs.Name,
1901 RestartCount: int32(cs.RestartCount),
1902 Image: cs.Image,
1903 ImageID: cs.ImageID,
1904 ContainerID: cid,
1905 }
1906 switch {
1907 case cs.State == kubecontainer.ContainerStateRunning:
1908 status.State.Running = &v1.ContainerStateRunning{StartedAt: metav1.NewTime(cs.StartedAt)}
1909 case cs.State == kubecontainer.ContainerStateCreated:
1910 // containers that are created but not running are "waiting to be running"
1911 status.State.Waiting = &v1.ContainerStateWaiting{}
1912 case cs.State == kubecontainer.ContainerStateExited:
1913 status.State.Terminated = &v1.ContainerStateTerminated{
1914 ExitCode: int32(cs.ExitCode),
1915 Reason: cs.Reason,
1916 Message: cs.Message,
1917 StartedAt: metav1.NewTime(cs.StartedAt),
1918 FinishedAt: metav1.NewTime(cs.FinishedAt),
1919 ContainerID: cid,
1920 }
1921 
1922 case cs.State == kubecontainer.ContainerStateUnknown &&
1923 oldStatus != nil && // we have an old status
1924 oldStatus.State.Running != nil: // our previous status was running
1925 // if this happens, then we know that this container was previously running and isn't anymore (assuming the CRI isn't failing to return running containers).
1926 // you can imagine this happening in cases where a container failed and the kubelet didn't ask about it in time to see the result.
1927 // in this case, the container should not go into the waiting state immediately, because that could make cases like run-once pods actually run
1928 // twice. "container never ran" is different from "container ran and failed". The two are handled differently in the kubelet
1929 // and in higher-order logic like crashloop detection and handling
1930 status.State.Terminated = &v1.ContainerStateTerminated{
1931 Reason: "ContainerStatusUnknown",
1932 Message: "The container could not be located when the pod was terminated",
1933 ExitCode: 137, // this code indicates an error
1934 }
1935 // the restart count normally comes from the CRI (see near the top of this method), but since this is being added explicitly
1936 // for the case where the CRI did not return a status, we need to manually increment the restart count to be accurate.
1937 status.RestartCount = oldStatus.RestartCount + 1
1938 
1939 default:
1940 // this collapses any unknown state to container waiting. If any container is waiting, then the pod status moves to pending even if it is running.
1941 // If I'm reading this correctly, then any failure to read status on any container results in the entire pod going pending even if the containers
1942 // are actually running.
1943 // see https://github.com/kubernetes/kubernetes/blob/5d1b3e26af73dde33ecb6a3e69fb5876ceab192f/pkg/kubelet/kuberuntime/kuberuntime_container.go#L497 to
1944 // https://github.com/kubernetes/kubernetes/blob/8976e3620f8963e72084971d9d4decbd026bf49f/pkg/kubelet/kuberuntime/helpers.go#L58-L71
1945 // and interpreted here https://github.com/kubernetes/kubernetes/blob/b27e78f590a0d43e4a23ca3b2bf1739ca4c6e109/pkg/kubelet/kubelet_pods.go#L1434-L1439
1946 status.State.Waiting = &v1.ContainerStateWaiting{}
1947 }
1948 return status
1949 }
1950 
1951 convertContainerStatusResources := func(cName string, status *v1.ContainerStatus, cStatus *kubecontainer.Status, oldStatuses map[string]v1.ContainerStatus) *v1.ResourceRequirements {
1952 var requests, limits v1.ResourceList
1953 // oldStatus should always exist if the container is running
1954 oldStatus, oldStatusFound := oldStatuses[cName]
1955 // Initialize limits/requests from container's spec upon transition to Running state
1956 // For cpu & memory, values queried from the runtime via CRI always supersede spec values
1957 // For ephemeral-storage, a running container's status.limit/request equals spec.limit/request
1958 determineResource := func(rName v1.ResourceName, v1ContainerResource, oldStatusResource, resource v1.ResourceList) {
1959 if oldStatusFound {
1960 if oldStatus.State.Running == nil || status.ContainerID != oldStatus.ContainerID {
1961 if r, exists := v1ContainerResource[rName]; exists {
1962 resource[rName] = r.DeepCopy()
1963 }
1964 } else {
1965 if oldStatusResource != nil {
1966 if r, exists := oldStatusResource[rName]; exists {
1967 resource[rName] = r.DeepCopy()
1968 }
1969 }
1970 }
1971 }
1972 }
1973 container := kubecontainer.GetContainerSpec(pod, cName)
1974 // AllocatedResources values come from checkpoint. It is the source-of-truth.
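// If the checkpoint store has no allocation record for this container (for
// example after an upgrade or a lost checkpoint file), the code below logs
// an error and falls back to the AllocatedResources previously reported in
// oldStatus, when one exists.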
1975 found := false 1976 status.AllocatedResources, found = kl.statusManager.GetContainerResourceAllocation(string(pod.UID), cName) 1977 if !(container.Resources.Requests == nil && container.Resources.Limits == nil) && !found { 1978 // Log error and fallback to AllocatedResources in oldStatus if it exists 1979 klog.ErrorS(nil, "resource allocation not found in checkpoint store", "pod", pod.Name, "container", cName) 1980 if oldStatusFound { 1981 status.AllocatedResources = oldStatus.AllocatedResources 1982 } 1983 } 1984 if oldStatus.Resources == nil { 1985 oldStatus.Resources = &v1.ResourceRequirements{} 1986 } 1987 // Convert Limits 1988 if container.Resources.Limits != nil { 1989 limits = make(v1.ResourceList) 1990 if cStatus.Resources != nil && cStatus.Resources.CPULimit != nil { 1991 limits[v1.ResourceCPU] = cStatus.Resources.CPULimit.DeepCopy() 1992 } else { 1993 determineResource(v1.ResourceCPU, container.Resources.Limits, oldStatus.Resources.Limits, limits) 1994 } 1995 if cStatus.Resources != nil && cStatus.Resources.MemoryLimit != nil { 1996 limits[v1.ResourceMemory] = cStatus.Resources.MemoryLimit.DeepCopy() 1997 } else { 1998 determineResource(v1.ResourceMemory, container.Resources.Limits, oldStatus.Resources.Limits, limits) 1999 } 2000 if ephemeralStorage, found := container.Resources.Limits[v1.ResourceEphemeralStorage]; found { 2001 limits[v1.ResourceEphemeralStorage] = ephemeralStorage.DeepCopy() 2002 } 2003 } 2004 // Convert Requests 2005 if status.AllocatedResources != nil { 2006 requests = make(v1.ResourceList) 2007 if cStatus.Resources != nil && cStatus.Resources.CPURequest != nil { 2008 requests[v1.ResourceCPU] = cStatus.Resources.CPURequest.DeepCopy() 2009 } else { 2010 determineResource(v1.ResourceCPU, status.AllocatedResources, oldStatus.Resources.Requests, requests) 2011 } 2012 if memory, found := status.AllocatedResources[v1.ResourceMemory]; found { 2013 requests[v1.ResourceMemory] = memory.DeepCopy() 2014 } 2015 if ephemeralStorage, found := status.AllocatedResources[v1.ResourceEphemeralStorage]; found { 2016 requests[v1.ResourceEphemeralStorage] = ephemeralStorage.DeepCopy() 2017 } 2018 } 2019 //TODO(vinaykul,derekwaynecarr,InPlacePodVerticalScaling): Update this to include extended resources in 2020 // addition to CPU, memory, ephemeral storage. Add test case for extended resources. 2021 resources := &v1.ResourceRequirements{ 2022 Limits: limits, 2023 Requests: requests, 2024 } 2025 return resources 2026 } 2027 2028 // Fetch old containers statuses from old pod status. 2029 oldStatuses := make(map[string]v1.ContainerStatus, len(containers)) 2030 for _, status := range previousStatus { 2031 oldStatuses[status.Name] = status 2032 } 2033 2034 // Set all container statuses to default waiting state 2035 statuses := make(map[string]*v1.ContainerStatus, len(containers)) 2036 defaultWaitingState := v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: ContainerCreating}} 2037 if hasInitContainers { 2038 defaultWaitingState = v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: PodInitializing}} 2039 } 2040 2041 for _, container := range containers { 2042 status := &v1.ContainerStatus{ 2043 Name: container.Name, 2044 Image: container.Image, 2045 State: defaultWaitingState, 2046 } 2047 oldStatus, found := oldStatuses[container.Name] 2048 if found { 2049 if oldStatus.State.Terminated != nil { 2050 status = &oldStatus 2051 } else { 2052 // Apply some values from the old statuses as the default values. 
2053 status.RestartCount = oldStatus.RestartCount
2054 status.LastTerminationState = oldStatus.LastTerminationState
2055 }
2056 }
2057 statuses[container.Name] = status
2058 }
2059 
2060 for _, container := range containers {
2061 found := false
2062 for _, cStatus := range podStatus.ContainerStatuses {
2063 if container.Name == cStatus.Name {
2064 found = true
2065 break
2066 }
2067 }
2068 if found {
2069 continue
2070 }
2071 // if no container is found, then assuming it should be waiting seems plausible, but the status code requires
2072 // that a previous termination be present. If we're offline long enough or something removed the container, then
2073 // the previous termination may not be present. This next code block ensures that if the container was previously running
2074 // then when that container status disappears, we can infer that it terminated even if we don't know the status code.
2075 // By setting the LastTerminationState we are able to leave the container status waiting and present more accurate
2076 // data via the API.
2077 
2078 oldStatus, ok := oldStatuses[container.Name]
2079 if !ok {
2080 continue
2081 }
2082 if oldStatus.State.Terminated != nil {
2083 // if the old container status was terminated, the last termination status is correct
2084 continue
2085 }
2086 if oldStatus.State.Running == nil {
2087 // if the old container status isn't running, then waiting is an appropriate status and we have nothing to do
2088 continue
2089 }
2090 
2091 // If we're here, we know the pod was previously running, but doesn't have a terminated status. We will check now to
2092 // see if it's in a pending state.
2093 status := statuses[container.Name]
2094 // If the status we're about to write indicates the default, the Waiting status will force this pod back into Pending.
2095 // That isn't true; we know the pod was previously running.
2096 isDefaultWaitingStatus := status.State.Waiting != nil && status.State.Waiting.Reason == ContainerCreating
2097 if hasInitContainers {
2098 isDefaultWaitingStatus = status.State.Waiting != nil && status.State.Waiting.Reason == PodInitializing
2099 }
2100 if !isDefaultWaitingStatus {
2101 // the status was written, don't override
2102 continue
2103 }
2104 if status.LastTerminationState.Terminated != nil {
2105 // if we already have a termination state, nothing to do
2106 continue
2107 }
2108 
2109 // setting this value ensures that we show as stopped here, not as waiting:
2110 // https://github.com/kubernetes/kubernetes/blob/90c9f7b3e198e82a756a68ffeac978a00d606e55/pkg/kubelet/kubelet_pods.go#L1440-L1445
2111 // This prevents the pod from becoming pending
2112 status.LastTerminationState.Terminated = &v1.ContainerStateTerminated{
2113 Reason: "ContainerStatusUnknown",
2114 Message: "The container could not be located when the pod was deleted. The container used to be Running",
2115 ExitCode: 137,
2116 }
2117 
2118 // If the pod was not deleted, then it's been restarted. Increment restart count.
2119 if pod.DeletionTimestamp == nil {
2120 status.RestartCount += 1
2121 }
2122 
2123 statuses[container.Name] = status
2124 }
2125 
2126 // Copy the slice before sorting it
2127 containerStatusesCopy := make([]*kubecontainer.Status, len(podStatus.ContainerStatuses))
2128 copy(containerStatusesCopy, podStatus.ContainerStatuses)
2129 
2130 // Make the latest container status come first.
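// Because the copy is sorted newest-first, the first status seen for a name
// below becomes the container's current State and the second newest becomes
// its LastTerminationState; anything older is ignored once two attempts
// have been seen.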
2131 sort.Sort(sort.Reverse(kubecontainer.SortContainerStatusesByCreationTime(containerStatusesCopy)))
2132 // Set container statuses according to the statuses seen in pod status
2133 containerSeen := map[string]int{}
2134 for _, cStatus := range containerStatusesCopy {
2135 cName := cStatus.Name
2136 if _, ok := statuses[cName]; !ok {
2137 // This would also ignore the infra container.
2138 continue
2139 }
2140 if containerSeen[cName] >= 2 {
2141 continue
2142 }
2143 var oldStatusPtr *v1.ContainerStatus
2144 if oldStatus, ok := oldStatuses[cName]; ok {
2145 oldStatusPtr = &oldStatus
2146 }
2147 status := convertContainerStatus(cStatus, oldStatusPtr)
2148 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
2149 if status.State.Running != nil {
2150 status.Resources = convertContainerStatusResources(cName, status, cStatus, oldStatuses)
2151 }
2152 }
2153 if containerSeen[cName] == 0 {
2154 statuses[cName] = status
2155 } else {
2156 statuses[cName].LastTerminationState = status.State
2157 }
2158 containerSeen[cName] = containerSeen[cName] + 1
2159 }
2160 
2161 // Handle the containers that failed to start, which should be in the Waiting state.
2162 for _, container := range containers {
2163 if isInitContainer {
2164 // If the init container is terminated with exit code 0, it won't be restarted.
2165 // TODO(random-liu): Handle this in a cleaner way.
2166 s := podStatus.FindContainerStatusByName(container.Name)
2167 if s != nil && s.State == kubecontainer.ContainerStateExited && s.ExitCode == 0 {
2168 continue
2169 }
2170 }
2171 // If a container should be restarted in the next syncPod, it is *Waiting*.
2172 if !kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) {
2173 continue
2174 }
2175 status := statuses[container.Name]
2176 reason, ok := kl.reasonCache.Get(pod.UID, container.Name)
2177 if !ok {
2178 // In fact, we could also apply Waiting state here, but it is less informative,
2179 // and the container will be restarted soon, so we prefer the original state here.
2180 // Note that with the current implementation of ShouldContainerBeRestarted the original state here
2181 // could be:
2182 // * Waiting: There is no associated historical container and start failure reason record.
2183 // * Terminated: The container is terminated.
2184 continue
2185 }
2186 if status.State.Terminated != nil {
2187 status.LastTerminationState = status.State
2188 }
2189 status.State = v1.ContainerState{
2190 Waiting: &v1.ContainerStateWaiting{
2191 Reason: reason.Err.Error(),
2192 Message: reason.Message,
2193 },
2194 }
2195 statuses[container.Name] = status
2196 }
2197 
2198 // Sort the container statuses since clients of this interface expect the list
2199 // of containers in a pod to have a deterministic order.
2200 if isInitContainer {
2201 return kubetypes.SortStatusesOfInitContainers(pod, statuses)
2202 }
2203 containerStatuses := make([]v1.ContainerStatus, 0, len(statuses))
2204 for _, status := range statuses {
2205 containerStatuses = append(containerStatuses, *status)
2206 }
2207 
2208 sort.Sort(kubetypes.SortedContainerStatuses(containerStatuses))
2209 return containerStatuses
2210 }
2211 
2212 // ServeLogs returns logs of the current machine.
2213 func (kl *Kubelet) ServeLogs(w http.ResponseWriter, req *http.Request) {
2214 // TODO: allowlist logs we are willing to serve
2215 kl.logServer.ServeHTTP(w, req)
2216 }
2217 
2218 // findContainer finds and returns the container with the given pod ID, full name, and container name.
2219 // It returns nil if not found.
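// Note that the supplied UID is translated first, so a request addressed to
// a static pod's mirror pod UID resolves to the static pod known to the
// runtime.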
2220 func (kl *Kubelet) findContainer(ctx context.Context, podFullName string, podUID types.UID, containerName string) (*kubecontainer.Container, error) { 2221 pods, err := kl.containerRuntime.GetPods(ctx, false) 2222 if err != nil { 2223 return nil, err 2224 } 2225 // Resolve and type convert back again. 2226 // We need the static pod UID but the kubecontainer API works with types.UID. 2227 podUID = types.UID(kl.podManager.TranslatePodUID(podUID)) 2228 pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID) 2229 return pod.FindContainerByName(containerName), nil 2230 } 2231 2232 // RunInContainer runs a command in a container, returns the combined stdout, stderr as an array of bytes 2233 func (kl *Kubelet) RunInContainer(ctx context.Context, podFullName string, podUID types.UID, containerName string, cmd []string) ([]byte, error) { 2234 container, err := kl.findContainer(ctx, podFullName, podUID, containerName) 2235 if err != nil { 2236 return nil, err 2237 } 2238 if container == nil { 2239 return nil, fmt.Errorf("container not found (%q)", containerName) 2240 } 2241 // TODO(tallclair): Pass a proper timeout value. 2242 return kl.runner.RunInContainer(ctx, container.ID, cmd, 0) 2243 } 2244 2245 // GetExec gets the URL the exec will be served from, or nil if the Kubelet will serve it. 2246 func (kl *Kubelet) GetExec(ctx context.Context, podFullName string, podUID types.UID, containerName string, cmd []string, streamOpts remotecommandserver.Options) (*url.URL, error) { 2247 container, err := kl.findContainer(ctx, podFullName, podUID, containerName) 2248 if err != nil { 2249 return nil, err 2250 } 2251 if container == nil { 2252 return nil, fmt.Errorf("container not found (%q)", containerName) 2253 } 2254 return kl.streamingRuntime.GetExec(ctx, container.ID, cmd, streamOpts.Stdin, streamOpts.Stdout, streamOpts.Stderr, streamOpts.TTY) 2255 } 2256 2257 // GetAttach gets the URL the attach will be served from, or nil if the Kubelet will serve it. 2258 func (kl *Kubelet) GetAttach(ctx context.Context, podFullName string, podUID types.UID, containerName string, streamOpts remotecommandserver.Options) (*url.URL, error) { 2259 container, err := kl.findContainer(ctx, podFullName, podUID, containerName) 2260 if err != nil { 2261 return nil, err 2262 } 2263 if container == nil { 2264 return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName) 2265 } 2266 2267 // The TTY setting for attach must match the TTY setting in the initial container configuration, 2268 // since whether the process is running in a TTY cannot be changed after it has started. We 2269 // need the api.Pod to get the TTY status. 2270 pod, found := kl.GetPodByFullName(podFullName) 2271 if !found || (string(podUID) != "" && pod.UID != podUID) { 2272 return nil, fmt.Errorf("pod %s not found", podFullName) 2273 } 2274 containerSpec := kubecontainer.GetContainerSpec(pod, containerName) 2275 if containerSpec == nil { 2276 return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName) 2277 } 2278 tty := containerSpec.TTY 2279 2280 return kl.streamingRuntime.GetAttach(ctx, container.ID, streamOpts.Stdin, streamOpts.Stdout, streamOpts.Stderr, tty) 2281 } 2282 2283 // GetPortForward gets the URL the port-forward will be served from, or nil if the Kubelet will serve it. 
2284 func (kl *Kubelet) GetPortForward(ctx context.Context, podName, podNamespace string, podUID types.UID, portForwardOpts portforward.V4Options) (*url.URL, error) {
2285 pods, err := kl.containerRuntime.GetPods(ctx, false)
2286 if err != nil {
2287 return nil, err
2288 }
2289 // Resolve and type convert back again.
2290 // We need the static pod UID but the kubecontainer API works with types.UID.
2291 podUID = types.UID(kl.podManager.TranslatePodUID(podUID))
2292 podFullName := kubecontainer.BuildPodFullName(podName, podNamespace)
2293 pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID)
2294 if pod.IsEmpty() {
2295 return nil, fmt.Errorf("pod not found (%q)", podFullName)
2296 }
2297 
2298 return kl.streamingRuntime.GetPortForward(ctx, podName, podNamespace, podUID, portForwardOpts.Ports)
2299 }
2300 
2301 // cleanupOrphanedPodCgroups removes cgroups that should no longer exist.
2302 // It reconciles the cached state of cgroupPods with the specified list of runningPods.
2303 func (kl *Kubelet) cleanupOrphanedPodCgroups(pcm cm.PodContainerManager, cgroupPods map[types.UID]cm.CgroupName, possiblyRunningPods map[types.UID]sets.Empty) {
2304 // Iterate over all the found pods to verify if they should be running
2305 for uid, val := range cgroupPods {
2306 // if the pod is in the running set, it's not a candidate for cleanup
2307 if _, ok := possiblyRunningPods[uid]; ok {
2308 continue
2309 }
2310 
2311 // If volumes have not been unmounted/detached, do not delete the cgroup
2312 // so any memory-backed volumes don't have their charges propagated to the
2313 // parent cgroup. If the volumes still exist, reduce the cpu shares for any
2314 // process in the cgroup to the minimum value while we wait. If the kubelet
2315 // is configured to keep terminated volumes, we will delete the cgroup and not block.
2316 if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist && !kl.keepTerminatedPodVolumes {
2317 klog.V(3).InfoS("Orphaned pod found, but volumes not yet removed. Reducing cpu to minimum", "podUID", uid)
2318 if err := pcm.ReduceCPULimits(val); err != nil {
2319 klog.InfoS("Failed to reduce cpu time for pod pending volume cleanup", "podUID", uid, "err", err)
2320 }
2321 continue
2322 }
2323 klog.V(3).InfoS("Orphaned pod found, removing pod cgroups", "podUID", uid)
2324 // Destroy all cgroups of the pod that should not be running,
2325 // by first killing all the attached processes to these cgroups.
2326 // We ignore errors returned by the method, as the housekeeping loop will
2327 // try again to delete these unwanted pod cgroups.
2328 go pcm.Destroy(val)
2329 }
2330 }
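// The sketch below is an illustrative addition, not part of the upstream
// kubelet source: it shows how getPhase and splitPodsByStatic behave for a
// couple of simple inputs. The pod and container names are hypothetical.
func examplePodHelpers() {
	// A single-container pod with RestartPolicy=Never whose container
	// terminated with a non-zero exit code: running == 0, stopped == 1,
	// succeeded == 0, so getPhase reports v1.PodFailed.
	failed := &v1.Pod{
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers:    []v1.Container{{Name: "app"}},
		},
	}
	statuses := []v1.ContainerStatus{{
		Name:  "app",
		State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ExitCode: 1}},
	}}
	fmt.Printf("phase=%s\n", getPhase(failed, statuses, true)) // phase=Failed

	// splitPodsByStatic drops mirror pods and separates file-sourced (static)
	// pods from regular ones.
	mirror := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:        "web-node1",
		Annotations: map[string]string{kubetypes.ConfigMirrorAnnotationKey: "mirror"},
	}}
	static := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:        "web",
		Annotations: map[string]string{kubetypes.ConfigSourceAnnotationKey: kubetypes.FileSource},
	}}
	regular := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "api"}}
	r, s := splitPodsByStatic([]*v1.Pod{mirror, static, regular})
	fmt.Printf("regular=%d static=%d\n", len(r), len(s)) // regular=1 static=1
}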