k8s.io/kubernetes@v1.29.3/pkg/kubelet/kubelet_pods.go (about)

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package kubelet
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"io"
    24  	"net/http"
    25  	"net/url"
    26  	"os"
    27  	"path/filepath"
    28  	"runtime"
    29  	"sort"
    30  	"strings"
    31  
    32  	"github.com/google/go-cmp/cmp"
    33  	v1 "k8s.io/api/core/v1"
    34  	"k8s.io/apimachinery/pkg/api/errors"
    35  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    36  	"k8s.io/apimachinery/pkg/labels"
    37  	"k8s.io/apimachinery/pkg/types"
    38  	"k8s.io/apimachinery/pkg/util/sets"
    39  	utilvalidation "k8s.io/apimachinery/pkg/util/validation"
    40  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    41  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    42  	"k8s.io/klog/v2"
    43  	"k8s.io/kubelet/pkg/cri/streaming/portforward"
    44  	remotecommandserver "k8s.io/kubelet/pkg/cri/streaming/remotecommand"
    45  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    46  	"k8s.io/kubernetes/pkg/api/v1/resource"
    47  	podshelper "k8s.io/kubernetes/pkg/apis/core/pods"
    48  	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
    49  	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    50  	"k8s.io/kubernetes/pkg/features"
    51  	"k8s.io/kubernetes/pkg/fieldpath"
    52  	"k8s.io/kubernetes/pkg/kubelet/cm"
    53  	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    54  	"k8s.io/kubernetes/pkg/kubelet/envvars"
    55  	"k8s.io/kubernetes/pkg/kubelet/images"
    56  	"k8s.io/kubernetes/pkg/kubelet/metrics"
    57  	"k8s.io/kubernetes/pkg/kubelet/status"
    58  	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
    59  	"k8s.io/kubernetes/pkg/kubelet/util"
    60  	utilpod "k8s.io/kubernetes/pkg/util/pod"
    61  	volumeutil "k8s.io/kubernetes/pkg/volume/util"
    62  	"k8s.io/kubernetes/pkg/volume/util/hostutil"
    63  	"k8s.io/kubernetes/pkg/volume/util/subpath"
    64  	"k8s.io/kubernetes/pkg/volume/util/volumepathhandler"
    65  	volumevalidation "k8s.io/kubernetes/pkg/volume/validation"
    66  	"k8s.io/kubernetes/third_party/forked/golang/expansion"
    67  	utilnet "k8s.io/utils/net"
    68  )
    69  
const (
	// managedHostsHeader is the header line written at the top of the hosts
	// file the kubelet generates for a pod.
	managedHostsHeader = "# Kubernetes-managed hosts file.\n"
	// managedHostsHeaderWithHostNetwork is the header used instead when the
	// pod uses host networking and the node's own hosts file content is
	// copied in (see nodeHostsFileContent).
	managedHostsHeaderWithHostNetwork = "# Kubernetes-managed hosts file (host network).\n"
)

// Container state reason list
const (
	// PodInitializing is a container state reason string.
	PodInitializing = "PodInitializing"
	// ContainerCreating is a container state reason string.
	ContainerCreating = "ContainerCreating"
)
    80  
    81  // Get a list of pods that have data directories.
    82  func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) {
    83  	podInfos, err := os.ReadDir(kl.getPodsDir())
    84  	if err != nil {
    85  		return nil, err
    86  	}
    87  	pods := []types.UID{}
    88  	for i := range podInfos {
    89  		if podInfos[i].IsDir() {
    90  			pods = append(pods, types.UID(podInfos[i].Name()))
    91  		}
    92  	}
    93  	return pods, nil
    94  }
    95  
    96  // GetActivePods returns pods that have been admitted to the kubelet that
    97  // are not fully terminated. This is mapped to the "desired state" of the
    98  // kubelet - what pods should be running.
    99  //
   100  // WARNING: Currently this list does not include pods that have been force
   101  // deleted but may still be terminating, which means resources assigned to
   102  // those pods during admission may still be in use. See
   103  // https://github.com/kubernetes/kubernetes/issues/104824
   104  func (kl *Kubelet) GetActivePods() []*v1.Pod {
   105  	allPods := kl.podManager.GetPods()
   106  	activePods := kl.filterOutInactivePods(allPods)
   107  	return activePods
   108  }
   109  
   110  // makeBlockVolumes maps the raw block devices specified in the path of the container
   111  // Experimental
   112  func (kl *Kubelet) makeBlockVolumes(pod *v1.Pod, container *v1.Container, podVolumes kubecontainer.VolumeMap, blkutil volumepathhandler.BlockVolumePathHandler) ([]kubecontainer.DeviceInfo, error) {
   113  	var devices []kubecontainer.DeviceInfo
   114  	for _, device := range container.VolumeDevices {
   115  		// check path is absolute
   116  		if !filepath.IsAbs(device.DevicePath) {
   117  			return nil, fmt.Errorf("error DevicePath `%s` must be an absolute path", device.DevicePath)
   118  		}
   119  		vol, ok := podVolumes[device.Name]
   120  		if !ok || vol.BlockVolumeMapper == nil {
   121  			klog.ErrorS(nil, "Block volume cannot be satisfied for container, because the volume is missing or the volume mapper is nil", "containerName", container.Name, "device", device)
   122  			return nil, fmt.Errorf("cannot find volume %q to pass into container %q", device.Name, container.Name)
   123  		}
   124  		// Get a symbolic link associated to a block device under pod device path
   125  		dirPath, volName := vol.BlockVolumeMapper.GetPodDeviceMapPath()
   126  		symlinkPath := filepath.Join(dirPath, volName)
   127  		if islinkExist, checkErr := blkutil.IsSymlinkExist(symlinkPath); checkErr != nil {
   128  			return nil, checkErr
   129  		} else if islinkExist {
   130  			// Check readOnly in PVCVolumeSource and set read only permission if it's true.
   131  			permission := "mrw"
   132  			if vol.ReadOnly {
   133  				permission = "r"
   134  			}
   135  			klog.V(4).InfoS("Device will be attached to container in the corresponding path on host", "containerName", container.Name, "path", symlinkPath)
   136  			devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: symlinkPath, PathInContainer: device.DevicePath, Permissions: permission})
   137  		}
   138  	}
   139  
   140  	return devices, nil
   141  }
   142  
   143  // shouldMountHostsFile checks if the nodes /etc/hosts should be mounted
   144  // Kubernetes only mounts on /etc/hosts if:
   145  // - container is not an infrastructure (pause) container
   146  // - container is not already mounting on /etc/hosts
   147  // Kubernetes will not mount /etc/hosts if:
   148  // - when the Pod sandbox is being created, its IP is still unknown. Hence, PodIP will not have been set.
   149  // - Windows pod contains a hostProcess container
   150  func shouldMountHostsFile(pod *v1.Pod, podIPs []string) bool {
   151  	shouldMount := len(podIPs) > 0
   152  	if runtime.GOOS == "windows" {
   153  		return shouldMount && !kubecontainer.HasWindowsHostProcessContainer(pod)
   154  	}
   155  	return shouldMount
   156  }
   157  
// makeMounts determines the mount points for the given container.
//
// For each entry in container.VolumeMounts it resolves the backing volume
// from podVolumes, expands and safely prepares any subPath, normalizes
// Windows paths, and translates the requested mount propagation. If the
// pod's managed /etc/hosts file should be injected (see shouldMountHostsFile)
// and no explicit volumeMount already covers etcHostsPath, a hosts-file
// mount is appended at the end.
//
// The returned cleanup function may be nil; it comes from
// subpather.PrepareSafeSubpath and is returned to the caller even on error
// so prepared subpath resources can be released.
func makeMounts(pod *v1.Pod, podDir string, container *v1.Container, hostName, hostDomain string, podIPs []string, podVolumes kubecontainer.VolumeMap, hu hostutil.HostUtils, subpather subpath.Interface, expandEnvs []kubecontainer.EnvVar) ([]kubecontainer.Mount, func(), error) {
	mountEtcHostsFile := shouldMountHostsFile(pod, podIPs)
	klog.V(3).InfoS("Creating hosts mount for container", "pod", klog.KObj(pod), "containerName", container.Name, "podIPs", podIPs, "path", mountEtcHostsFile)
	mounts := []kubecontainer.Mount{}
	var cleanupAction func()
	for i, mount := range container.VolumeMounts {
		// do not mount /etc/hosts if container is already mounting on the path
		mountEtcHostsFile = mountEtcHostsFile && (mount.MountPath != etcHostsPath)
		vol, ok := podVolumes[mount.Name]
		if !ok || vol.Mounter == nil {
			klog.ErrorS(nil, "Mount cannot be satisfied for the container, because the volume is missing or the volume mounter (vol.Mounter) is nil",
				"containerName", container.Name, "ok", ok, "volumeMounter", mount)
			return nil, cleanupAction, fmt.Errorf("cannot find volume %q to mount into container %q", mount.Name, container.Name)
		}

		relabelVolume := false
		// If the volume supports SELinux and it has not been
		// relabeled already and it is not a read-only volume,
		// relabel it and mark it as labeled
		if vol.Mounter.GetAttributes().Managed && vol.Mounter.GetAttributes().SELinuxRelabel && !vol.SELinuxLabeled {
			vol.SELinuxLabeled = true
			relabelVolume = true
		}
		hostPath, err := volumeutil.GetPath(vol.Mounter)
		if err != nil {
			return nil, cleanupAction, err
		}

		// SubPathExpr takes precedence over SubPath; it is expanded against
		// the container's environment variables.
		subPath := mount.SubPath
		if mount.SubPathExpr != "" {
			subPath, err = kubecontainer.ExpandContainerVolumeMounts(mount, expandEnvs)

			if err != nil {
				return nil, cleanupAction, err
			}
		}

		if subPath != "" {
			// A subPath must stay inside the volume: no absolute paths and
			// no ".." backsteps.
			if filepath.IsAbs(subPath) {
				return nil, cleanupAction, fmt.Errorf("error SubPath `%s` must not be an absolute path", subPath)
			}

			err = volumevalidation.ValidatePathNoBacksteps(subPath)
			if err != nil {
				return nil, cleanupAction, fmt.Errorf("unable to provision SubPath `%s`: %v", subPath, err)
			}

			volumePath := hostPath
			hostPath = filepath.Join(volumePath, subPath)

			if subPathExists, err := hu.PathExists(hostPath); err != nil {
				klog.ErrorS(nil, "Could not determine if subPath exists, will not attempt to change its permissions", "path", hostPath)
			} else if !subPathExists {
				// Create the sub path now because if it's auto-created later when referenced, it may have an
				// incorrect ownership and mode. For example, the sub path directory must have at least g+rwx
				// when the pod specifies an fsGroup, and if the directory is not created here, Docker will
				// later auto-create it with the incorrect mode 0750
				// Make extra care not to escape the volume!
				perm, err := hu.GetMode(volumePath)
				if err != nil {
					return nil, cleanupAction, err
				}
				if err := subpather.SafeMakeDir(subPath, volumePath, perm); err != nil {
					// Don't pass detailed error back to the user because it could give information about host filesystem
					klog.ErrorS(err, "Failed to create subPath directory for volumeMount of the container", "containerName", container.Name, "volumeMountName", mount.Name)
					return nil, cleanupAction, fmt.Errorf("failed to create subPath directory for volumeMount %q of container %q", mount.Name, container.Name)
				}
			}
			// Re-resolve hostPath through the safe-subpath machinery; this may
			// also hand back a cleanup action that the caller must run later.
			hostPath, cleanupAction, err = subpather.PrepareSafeSubpath(subpath.Subpath{
				VolumeMountIndex: i,
				Path:             hostPath,
				VolumeName:       vol.InnerVolumeSpecName,
				VolumePath:       volumePath,
				PodDir:           podDir,
				ContainerName:    container.Name,
			})
			if err != nil {
				// Don't pass detailed error back to the user because it could give information about host filesystem
				klog.ErrorS(err, "Failed to prepare subPath for volumeMount of the container", "containerName", container.Name, "volumeMountName", mount.Name)
				return nil, cleanupAction, fmt.Errorf("failed to prepare subPath for volumeMount %q of container %q", mount.Name, container.Name)
			}
		}

		// Docker Volume Mounts fail on Windows if it is not of the form C:/
		if volumeutil.IsWindowsLocalPath(runtime.GOOS, hostPath) {
			hostPath = volumeutil.MakeAbsolutePath(runtime.GOOS, hostPath)
		}

		containerPath := mount.MountPath
		// IsAbs returns false for UNC path/SMB shares/named pipes in Windows. So check for those specifically and skip MakeAbsolutePath
		if !volumeutil.IsWindowsUNCPath(runtime.GOOS, containerPath) && !filepath.IsAbs(containerPath) {
			containerPath = volumeutil.MakeAbsolutePath(runtime.GOOS, containerPath)
		}

		propagation, err := translateMountPropagation(mount.MountPropagation)
		if err != nil {
			return nil, cleanupAction, err
		}
		klog.V(5).InfoS("Mount has propagation", "pod", klog.KObj(pod), "containerName", container.Name, "volumeMountName", mount.Name, "propagation", propagation)
		// A volume whose mounter is read-only forces the mount read-only even
		// if the volumeMount did not request it.
		mustMountRO := vol.Mounter.GetAttributes().ReadOnly

		mounts = append(mounts, kubecontainer.Mount{
			Name:           mount.Name,
			ContainerPath:  containerPath,
			HostPath:       hostPath,
			ReadOnly:       mount.ReadOnly || mustMountRO,
			SELinuxRelabel: relabelVolume,
			Propagation:    propagation,
		})
	}
	if mountEtcHostsFile {
		hostAliases := pod.Spec.HostAliases
		hostsMount, err := makeHostsMount(podDir, podIPs, hostName, hostDomain, hostAliases, pod.Spec.HostNetwork)
		if err != nil {
			return nil, cleanupAction, err
		}
		mounts = append(mounts, *hostsMount)
	}
	return mounts, cleanupAction, nil
}
   279  
   280  // translateMountPropagation transforms v1.MountPropagationMode to
   281  // runtimeapi.MountPropagation.
   282  func translateMountPropagation(mountMode *v1.MountPropagationMode) (runtimeapi.MountPropagation, error) {
   283  	if runtime.GOOS == "windows" {
   284  		// Windows containers doesn't support mount propagation, use private for it.
   285  		// Refer https://docs.docker.com/storage/bind-mounts/#configure-bind-propagation.
   286  		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
   287  	}
   288  
   289  	switch {
   290  	case mountMode == nil:
   291  		// PRIVATE is the default
   292  		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
   293  	case *mountMode == v1.MountPropagationHostToContainer:
   294  		return runtimeapi.MountPropagation_PROPAGATION_HOST_TO_CONTAINER, nil
   295  	case *mountMode == v1.MountPropagationBidirectional:
   296  		return runtimeapi.MountPropagation_PROPAGATION_BIDIRECTIONAL, nil
   297  	case *mountMode == v1.MountPropagationNone:
   298  		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
   299  	default:
   300  		return 0, fmt.Errorf("invalid MountPropagation mode: %q", *mountMode)
   301  	}
   302  }
   303  
   304  // getEtcHostsPath returns the full host-side path to a pod's generated /etc/hosts file
   305  func getEtcHostsPath(podDir string) string {
   306  	hostsFilePath := filepath.Join(podDir, "etc-hosts")
   307  	// Volume Mounts fail on Windows if it is not of the form C:/
   308  	return volumeutil.MakeAbsolutePath(runtime.GOOS, hostsFilePath)
   309  }
   310  
   311  // makeHostsMount makes the mountpoint for the hosts file that the containers
   312  // in a pod are injected with. podIPs is provided instead of podIP as podIPs
   313  // are present even if dual-stack feature flag is not enabled.
   314  func makeHostsMount(podDir string, podIPs []string, hostName, hostDomainName string, hostAliases []v1.HostAlias, useHostNetwork bool) (*kubecontainer.Mount, error) {
   315  	hostsFilePath := getEtcHostsPath(podDir)
   316  	if err := ensureHostsFile(hostsFilePath, podIPs, hostName, hostDomainName, hostAliases, useHostNetwork); err != nil {
   317  		return nil, err
   318  	}
   319  	return &kubecontainer.Mount{
   320  		Name:           "k8s-managed-etc-hosts",
   321  		ContainerPath:  etcHostsPath,
   322  		HostPath:       hostsFilePath,
   323  		ReadOnly:       false,
   324  		SELinuxRelabel: true,
   325  	}, nil
   326  }
   327  
   328  // ensureHostsFile ensures that the given host file has an up-to-date ip, host
   329  // name, and domain name.
   330  func ensureHostsFile(fileName string, hostIPs []string, hostName, hostDomainName string, hostAliases []v1.HostAlias, useHostNetwork bool) error {
   331  	var hostsFileContent []byte
   332  	var err error
   333  
   334  	if useHostNetwork {
   335  		// if Pod is using host network, read hosts file from the node's filesystem.
   336  		// `etcHostsPath` references the location of the hosts file on the node.
   337  		// `/etc/hosts` for *nix systems.
   338  		hostsFileContent, err = nodeHostsFileContent(etcHostsPath, hostAliases)
   339  		if err != nil {
   340  			return err
   341  		}
   342  	} else {
   343  		// if Pod is not using host network, create a managed hosts file with Pod IP and other information.
   344  		hostsFileContent = managedHostsFileContent(hostIPs, hostName, hostDomainName, hostAliases)
   345  	}
   346  
   347  	hostsFilePerm := os.FileMode(0644)
   348  	if err := os.WriteFile(fileName, hostsFileContent, hostsFilePerm); err != nil {
   349  		return err
   350  	}
   351  	return os.Chmod(fileName, hostsFilePerm)
   352  }
   353  
   354  // nodeHostsFileContent reads the content of node's hosts file.
   355  func nodeHostsFileContent(hostsFilePath string, hostAliases []v1.HostAlias) ([]byte, error) {
   356  	hostsFileContent, err := os.ReadFile(hostsFilePath)
   357  	if err != nil {
   358  		return nil, err
   359  	}
   360  	var buffer bytes.Buffer
   361  	buffer.WriteString(managedHostsHeaderWithHostNetwork)
   362  	buffer.Write(hostsFileContent)
   363  	buffer.Write(hostsEntriesFromHostAliases(hostAliases))
   364  	return buffer.Bytes(), nil
   365  }
   366  
   367  // managedHostsFileContent generates the content of the managed etc hosts based on Pod IPs and other
   368  // information.
   369  func managedHostsFileContent(hostIPs []string, hostName, hostDomainName string, hostAliases []v1.HostAlias) []byte {
   370  	var buffer bytes.Buffer
   371  	buffer.WriteString(managedHostsHeader)
   372  	buffer.WriteString("127.0.0.1\tlocalhost\n")                      // ipv4 localhost
   373  	buffer.WriteString("::1\tlocalhost ip6-localhost ip6-loopback\n") // ipv6 localhost
   374  	buffer.WriteString("fe00::0\tip6-localnet\n")
   375  	buffer.WriteString("fe00::0\tip6-mcastprefix\n")
   376  	buffer.WriteString("fe00::1\tip6-allnodes\n")
   377  	buffer.WriteString("fe00::2\tip6-allrouters\n")
   378  	if len(hostDomainName) > 0 {
   379  		// host entry generated for all IPs in podIPs
   380  		// podIPs field is populated for clusters even
   381  		// dual-stack feature flag is not enabled.
   382  		for _, hostIP := range hostIPs {
   383  			buffer.WriteString(fmt.Sprintf("%s\t%s.%s\t%s\n", hostIP, hostName, hostDomainName, hostName))
   384  		}
   385  	} else {
   386  		for _, hostIP := range hostIPs {
   387  			buffer.WriteString(fmt.Sprintf("%s\t%s\n", hostIP, hostName))
   388  		}
   389  	}
   390  	buffer.Write(hostsEntriesFromHostAliases(hostAliases))
   391  	return buffer.Bytes()
   392  }
   393  
   394  func hostsEntriesFromHostAliases(hostAliases []v1.HostAlias) []byte {
   395  	if len(hostAliases) == 0 {
   396  		return []byte{}
   397  	}
   398  
   399  	var buffer bytes.Buffer
   400  	buffer.WriteString("\n")
   401  	buffer.WriteString("# Entries added by HostAliases.\n")
   402  	// for each IP, write all aliases onto single line in hosts file
   403  	for _, hostAlias := range hostAliases {
   404  		buffer.WriteString(fmt.Sprintf("%s\t%s\n", hostAlias.IP, strings.Join(hostAlias.Hostnames, "\t")))
   405  	}
   406  	return buffer.Bytes()
   407  }
   408  
   409  // truncatePodHostnameIfNeeded truncates the pod hostname if it's longer than 63 chars.
   410  func truncatePodHostnameIfNeeded(podName, hostname string) (string, error) {
   411  	// Cap hostname at 63 chars (specification is 64bytes which is 63 chars and the null terminating char).
   412  	const hostnameMaxLen = 63
   413  	if len(hostname) <= hostnameMaxLen {
   414  		return hostname, nil
   415  	}
   416  	truncated := hostname[:hostnameMaxLen]
   417  	klog.ErrorS(nil, "Hostname for pod was too long, truncated it", "podName", podName, "hostnameMaxLen", hostnameMaxLen, "truncatedHostname", truncated)
   418  	// hostname should not end with '-' or '.'
   419  	truncated = strings.TrimRight(truncated, "-.")
   420  	if len(truncated) == 0 {
   421  		// This should never happen.
   422  		return "", fmt.Errorf("hostname for pod %q was invalid: %q", podName, hostname)
   423  	}
   424  	return truncated, nil
   425  }
   426  
// GetOrCreateUserNamespaceMappings returns the configuration for the sandbox user namespace
func (kl *Kubelet) GetOrCreateUserNamespaceMappings(pod *v1.Pod) (*runtimeapi.UserNamespace, error) {
	// Thin delegate: all logic lives in the kubelet's userns manager.
	return kl.usernsManager.GetOrCreateUserNamespaceMappings(pod)
}
   431  
   432  // GeneratePodHostNameAndDomain creates a hostname and domain name for a pod,
   433  // given that pod's spec and annotations or returns an error.
   434  func (kl *Kubelet) GeneratePodHostNameAndDomain(pod *v1.Pod) (string, string, error) {
   435  	clusterDomain := kl.dnsConfigurer.ClusterDomain
   436  
   437  	hostname := pod.Name
   438  	if len(pod.Spec.Hostname) > 0 {
   439  		if msgs := utilvalidation.IsDNS1123Label(pod.Spec.Hostname); len(msgs) != 0 {
   440  			return "", "", fmt.Errorf("pod Hostname %q is not a valid DNS label: %s", pod.Spec.Hostname, strings.Join(msgs, ";"))
   441  		}
   442  		hostname = pod.Spec.Hostname
   443  	}
   444  
   445  	hostname, err := truncatePodHostnameIfNeeded(pod.Name, hostname)
   446  	if err != nil {
   447  		return "", "", err
   448  	}
   449  
   450  	hostDomain := ""
   451  	if len(pod.Spec.Subdomain) > 0 {
   452  		if msgs := utilvalidation.IsDNS1123Label(pod.Spec.Subdomain); len(msgs) != 0 {
   453  			return "", "", fmt.Errorf("pod Subdomain %q is not a valid DNS label: %s", pod.Spec.Subdomain, strings.Join(msgs, ";"))
   454  		}
   455  		hostDomain = fmt.Sprintf("%s.%s.svc.%s", pod.Spec.Subdomain, pod.Namespace, clusterDomain)
   456  	}
   457  
   458  	return hostname, hostDomain, nil
   459  }
   460  
   461  // GetPodCgroupParent gets pod cgroup parent from container manager.
   462  func (kl *Kubelet) GetPodCgroupParent(pod *v1.Pod) string {
   463  	pcm := kl.containerManager.NewPodContainerManager()
   464  	_, cgroupParent := pcm.GetPodContainerName(pod)
   465  	return cgroupParent
   466  }
   467  
// GenerateRunContainerOptions generates the RunContainerOptions, which can be used by
// the container runtime to set parameters for launching a container.
//
// It assembles, in order: resource options from the container manager, the
// kernel hostname (FQDN when spec.setHostnameAsFQDN requests it), raw block
// device mappings, environment variables, and volume mounts. The returned
// cleanup function (possibly nil) comes from makeMounts' subpath preparation
// and is returned even when makeMounts fails, so callers can release any
// prepared subpath resources.
func (kl *Kubelet) GenerateRunContainerOptions(ctx context.Context, pod *v1.Pod, container *v1.Container, podIP string, podIPs []string) (*kubecontainer.RunContainerOptions, func(), error) {
	opts, err := kl.containerManager.GetResources(pod, container)
	if err != nil {
		return nil, nil, err
	}
	// The value of hostname is the short host name and it is sent to makeMounts to create /etc/hosts file.
	hostname, hostDomainName, err := kl.GeneratePodHostNameAndDomain(pod)
	if err != nil {
		return nil, nil, err
	}
	// nodename will be equal to hostname if SetHostnameAsFQDN is nil or false. If SetHostnameFQDN
	// is true and hostDomainName is defined, nodename will be the FQDN (hostname.hostDomainName)
	nodename, err := util.GetNodenameForKernel(hostname, hostDomainName, pod.Spec.SetHostnameAsFQDN)
	if err != nil {
		return nil, nil, err
	}
	opts.Hostname = nodename
	podName := volumeutil.GetUniquePodName(pod)
	volumes := kl.volumeManager.GetMountedVolumesForPod(podName)

	// Raw block devices requested via the container's volumeDevices.
	blkutil := volumepathhandler.NewBlockVolumePathHandler()
	blkVolumes, err := kl.makeBlockVolumes(pod, container, volumes, blkutil)
	if err != nil {
		return nil, nil, err
	}
	opts.Devices = append(opts.Devices, blkVolumes...)

	envs, err := kl.makeEnvironmentVariables(pod, container, podIP, podIPs)
	if err != nil {
		return nil, nil, err
	}
	opts.Envs = append(opts.Envs, envs...)

	// only podIPs is sent to makeMounts, as podIPs is populated even if dual-stack feature flag is not enabled.
	mounts, cleanupAction, err := makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIPs, volumes, kl.hostutil, kl.subpather, opts.Envs)
	if err != nil {
		return nil, cleanupAction, err
	}
	opts.Mounts = append(opts.Mounts, mounts...)

	// adding TerminationMessagePath on Windows is only allowed if ContainerD is used. Individual files cannot
	// be mounted as volumes using Docker for Windows.
	if len(container.TerminationMessagePath) != 0 {
		p := kl.getPodContainerDir(pod.UID, container.Name)
		if err := os.MkdirAll(p, 0750); err != nil {
			// Best effort: log and continue without a per-container dir.
			klog.ErrorS(err, "Error on creating dir", "path", p)
		} else {
			opts.PodContainerDir = p
		}
	}

	return opts, cleanupAction, nil
}
   523  
   524  var masterServices = sets.NewString("kubernetes")
   525  
   526  // getServiceEnvVarMap makes a map[string]string of env vars for services a
   527  // pod in namespace ns should see.
   528  func (kl *Kubelet) getServiceEnvVarMap(ns string, enableServiceLinks bool) (map[string]string, error) {
   529  	var (
   530  		serviceMap = make(map[string]*v1.Service)
   531  		m          = make(map[string]string)
   532  	)
   533  
   534  	// Get all service resources from the master (via a cache),
   535  	// and populate them into service environment variables.
   536  	if kl.serviceLister == nil {
   537  		// Kubelets without masters (e.g. plain GCE ContainerVM) don't set env vars.
   538  		return m, nil
   539  	}
   540  	services, err := kl.serviceLister.List(labels.Everything())
   541  	if err != nil {
   542  		return m, fmt.Errorf("failed to list services when setting up env vars")
   543  	}
   544  
   545  	// project the services in namespace ns onto the master services
   546  	for i := range services {
   547  		service := services[i]
   548  		// ignore services where ClusterIP is "None" or empty
   549  		if !v1helper.IsServiceIPSet(service) {
   550  			continue
   551  		}
   552  		serviceName := service.Name
   553  
   554  		// We always want to add environment variabled for master services
   555  		// from the default namespace, even if enableServiceLinks is false.
   556  		// We also add environment variables for other services in the same
   557  		// namespace, if enableServiceLinks is true.
   558  		if service.Namespace == metav1.NamespaceDefault && masterServices.Has(serviceName) {
   559  			if _, exists := serviceMap[serviceName]; !exists {
   560  				serviceMap[serviceName] = service
   561  			}
   562  		} else if service.Namespace == ns && enableServiceLinks {
   563  			serviceMap[serviceName] = service
   564  		}
   565  	}
   566  
   567  	mappedServices := []*v1.Service{}
   568  	for key := range serviceMap {
   569  		mappedServices = append(mappedServices, serviceMap[key])
   570  	}
   571  
   572  	for _, e := range envvars.FromServices(mappedServices) {
   573  		m[e.Name] = e.Value
   574  	}
   575  	return m, nil
   576  }
   577  
   578  // Make the environment variables for a pod in the given namespace.
   579  func (kl *Kubelet) makeEnvironmentVariables(pod *v1.Pod, container *v1.Container, podIP string, podIPs []string) ([]kubecontainer.EnvVar, error) {
   580  	if pod.Spec.EnableServiceLinks == nil {
   581  		return nil, fmt.Errorf("nil pod.spec.enableServiceLinks encountered, cannot construct envvars")
   582  	}
   583  
   584  	// If the pod originates from the kube-api, when we know that the kube-apiserver is responding and the kubelet's credentials are valid.
   585  	// Knowing this, it is reasonable to wait until the service lister has synchronized at least once before attempting to build
   586  	// a service env var map.  This doesn't present the race below from happening entirely, but it does prevent the "obvious"
   587  	// failure case of services simply not having completed a list operation that can reasonably be expected to succeed.
   588  	// One common case this prevents is a kubelet restart reading pods before services and some pod not having the
   589  	// KUBERNETES_SERVICE_HOST injected because we didn't wait a short time for services to sync before proceeding.
   590  	// The KUBERNETES_SERVICE_HOST link is special because it is unconditionally injected into pods and is read by the
   591  	// in-cluster-config for pod clients
   592  	if !kubetypes.IsStaticPod(pod) && !kl.serviceHasSynced() {
   593  		return nil, fmt.Errorf("services have not yet been read at least once, cannot construct envvars")
   594  	}
   595  
   596  	var result []kubecontainer.EnvVar
   597  	// Note:  These are added to the docker Config, but are not included in the checksum computed
   598  	// by kubecontainer.HashContainer(...).  That way, we can still determine whether an
   599  	// v1.Container is already running by its hash. (We don't want to restart a container just
   600  	// because some service changed.)
   601  	//
   602  	// Note that there is a race between Kubelet seeing the pod and kubelet seeing the service.
   603  	// To avoid this users can: (1) wait between starting a service and starting; or (2) detect
   604  	// missing service env var and exit and be restarted; or (3) use DNS instead of env vars
   605  	// and keep trying to resolve the DNS name of the service (recommended).
   606  	serviceEnv, err := kl.getServiceEnvVarMap(pod.Namespace, *pod.Spec.EnableServiceLinks)
   607  	if err != nil {
   608  		return result, err
   609  	}
   610  
   611  	var (
   612  		configMaps = make(map[string]*v1.ConfigMap)
   613  		secrets    = make(map[string]*v1.Secret)
   614  		tmpEnv     = make(map[string]string)
   615  	)
   616  
   617  	// Env will override EnvFrom variables.
   618  	// Process EnvFrom first then allow Env to replace existing values.
   619  	for _, envFrom := range container.EnvFrom {
   620  		switch {
   621  		case envFrom.ConfigMapRef != nil:
   622  			cm := envFrom.ConfigMapRef
   623  			name := cm.Name
   624  			configMap, ok := configMaps[name]
   625  			if !ok {
   626  				if kl.kubeClient == nil {
   627  					return result, fmt.Errorf("couldn't get configMap %v/%v, no kubeClient defined", pod.Namespace, name)
   628  				}
   629  				optional := cm.Optional != nil && *cm.Optional
   630  				configMap, err = kl.configMapManager.GetConfigMap(pod.Namespace, name)
   631  				if err != nil {
   632  					if errors.IsNotFound(err) && optional {
   633  						// ignore error when marked optional
   634  						continue
   635  					}
   636  					return result, err
   637  				}
   638  				configMaps[name] = configMap
   639  			}
   640  
   641  			invalidKeys := []string{}
   642  			for k, v := range configMap.Data {
   643  				if len(envFrom.Prefix) > 0 {
   644  					k = envFrom.Prefix + k
   645  				}
   646  				if errMsgs := utilvalidation.IsEnvVarName(k); len(errMsgs) != 0 {
   647  					invalidKeys = append(invalidKeys, k)
   648  					continue
   649  				}
   650  				tmpEnv[k] = v
   651  			}
   652  			if len(invalidKeys) > 0 {
   653  				sort.Strings(invalidKeys)
   654  				kl.recorder.Eventf(pod, v1.EventTypeWarning, "InvalidEnvironmentVariableNames", "Keys [%s] from the EnvFrom configMap %s/%s were skipped since they are considered invalid environment variable names.", strings.Join(invalidKeys, ", "), pod.Namespace, name)
   655  			}
   656  		case envFrom.SecretRef != nil:
   657  			s := envFrom.SecretRef
   658  			name := s.Name
   659  			secret, ok := secrets[name]
   660  			if !ok {
   661  				if kl.kubeClient == nil {
   662  					return result, fmt.Errorf("couldn't get secret %v/%v, no kubeClient defined", pod.Namespace, name)
   663  				}
   664  				optional := s.Optional != nil && *s.Optional
   665  				secret, err = kl.secretManager.GetSecret(pod.Namespace, name)
   666  				if err != nil {
   667  					if errors.IsNotFound(err) && optional {
   668  						// ignore error when marked optional
   669  						continue
   670  					}
   671  					return result, err
   672  				}
   673  				secrets[name] = secret
   674  			}
   675  
   676  			invalidKeys := []string{}
   677  			for k, v := range secret.Data {
   678  				if len(envFrom.Prefix) > 0 {
   679  					k = envFrom.Prefix + k
   680  				}
   681  				if errMsgs := utilvalidation.IsEnvVarName(k); len(errMsgs) != 0 {
   682  					invalidKeys = append(invalidKeys, k)
   683  					continue
   684  				}
   685  				tmpEnv[k] = string(v)
   686  			}
   687  			if len(invalidKeys) > 0 {
   688  				sort.Strings(invalidKeys)
   689  				kl.recorder.Eventf(pod, v1.EventTypeWarning, "InvalidEnvironmentVariableNames", "Keys [%s] from the EnvFrom secret %s/%s were skipped since they are considered invalid environment variable names.", strings.Join(invalidKeys, ", "), pod.Namespace, name)
   690  			}
   691  		}
   692  	}
   693  
   694  	// Determine the final values of variables:
   695  	//
   696  	// 1.  Determine the final value of each variable:
   697  	//     a.  If the variable's Value is set, expand the `$(var)` references to other
   698  	//         variables in the .Value field; the sources of variables are the declared
   699  	//         variables of the container and the service environment variables
   700  	//     b.  If a source is defined for an environment variable, resolve the source
   701  	// 2.  Create the container's environment in the order variables are declared
   702  	// 3.  Add remaining service environment vars
   703  	var (
   704  		mappingFunc = expansion.MappingFuncFor(tmpEnv, serviceEnv)
   705  	)
   706  	for _, envVar := range container.Env {
   707  		runtimeVal := envVar.Value
   708  		if runtimeVal != "" {
   709  			// Step 1a: expand variable references
   710  			runtimeVal = expansion.Expand(runtimeVal, mappingFunc)
   711  		} else if envVar.ValueFrom != nil {
   712  			// Step 1b: resolve alternate env var sources
   713  			switch {
   714  			case envVar.ValueFrom.FieldRef != nil:
   715  				runtimeVal, err = kl.podFieldSelectorRuntimeValue(envVar.ValueFrom.FieldRef, pod, podIP, podIPs)
   716  				if err != nil {
   717  					return result, err
   718  				}
   719  			case envVar.ValueFrom.ResourceFieldRef != nil:
   720  				defaultedPod, defaultedContainer, err := kl.defaultPodLimitsForDownwardAPI(pod, container)
   721  				if err != nil {
   722  					return result, err
   723  				}
   724  				runtimeVal, err = containerResourceRuntimeValue(envVar.ValueFrom.ResourceFieldRef, defaultedPod, defaultedContainer)
   725  				if err != nil {
   726  					return result, err
   727  				}
   728  			case envVar.ValueFrom.ConfigMapKeyRef != nil:
   729  				cm := envVar.ValueFrom.ConfigMapKeyRef
   730  				name := cm.Name
   731  				key := cm.Key
   732  				optional := cm.Optional != nil && *cm.Optional
   733  				configMap, ok := configMaps[name]
   734  				if !ok {
   735  					if kl.kubeClient == nil {
   736  						return result, fmt.Errorf("couldn't get configMap %v/%v, no kubeClient defined", pod.Namespace, name)
   737  					}
   738  					configMap, err = kl.configMapManager.GetConfigMap(pod.Namespace, name)
   739  					if err != nil {
   740  						if errors.IsNotFound(err) && optional {
   741  							// ignore error when marked optional
   742  							continue
   743  						}
   744  						return result, err
   745  					}
   746  					configMaps[name] = configMap
   747  				}
   748  				runtimeVal, ok = configMap.Data[key]
   749  				if !ok {
   750  					if optional {
   751  						continue
   752  					}
   753  					return result, fmt.Errorf("couldn't find key %v in ConfigMap %v/%v", key, pod.Namespace, name)
   754  				}
   755  			case envVar.ValueFrom.SecretKeyRef != nil:
   756  				s := envVar.ValueFrom.SecretKeyRef
   757  				name := s.Name
   758  				key := s.Key
   759  				optional := s.Optional != nil && *s.Optional
   760  				secret, ok := secrets[name]
   761  				if !ok {
   762  					if kl.kubeClient == nil {
   763  						return result, fmt.Errorf("couldn't get secret %v/%v, no kubeClient defined", pod.Namespace, name)
   764  					}
   765  					secret, err = kl.secretManager.GetSecret(pod.Namespace, name)
   766  					if err != nil {
   767  						if errors.IsNotFound(err) && optional {
   768  							// ignore error when marked optional
   769  							continue
   770  						}
   771  						return result, err
   772  					}
   773  					secrets[name] = secret
   774  				}
   775  				runtimeValBytes, ok := secret.Data[key]
   776  				if !ok {
   777  					if optional {
   778  						continue
   779  					}
   780  					return result, fmt.Errorf("couldn't find key %v in Secret %v/%v", key, pod.Namespace, name)
   781  				}
   782  				runtimeVal = string(runtimeValBytes)
   783  			}
   784  		}
   785  
   786  		tmpEnv[envVar.Name] = runtimeVal
   787  	}
   788  
   789  	// Append the env vars
   790  	for k, v := range tmpEnv {
   791  		result = append(result, kubecontainer.EnvVar{Name: k, Value: v})
   792  	}
   793  
   794  	// Append remaining service env vars.
   795  	for k, v := range serviceEnv {
   796  		// Accesses apiserver+Pods.
   797  		// So, the master may set service env vars, or kubelet may.  In case both are doing
   798  		// it, we skip the key from the kubelet-generated ones so we don't have duplicate
   799  		// env vars.
   800  		// TODO: remove this next line once all platforms use apiserver+Pods.
   801  		if _, present := tmpEnv[k]; !present {
   802  			result = append(result, kubecontainer.EnvVar{Name: k, Value: v})
   803  		}
   804  	}
   805  	return result, nil
   806  }
   807  
   808  // podFieldSelectorRuntimeValue returns the runtime value of the given
   809  // selector for a pod.
   810  func (kl *Kubelet) podFieldSelectorRuntimeValue(fs *v1.ObjectFieldSelector, pod *v1.Pod, podIP string, podIPs []string) (string, error) {
   811  	internalFieldPath, _, err := podshelper.ConvertDownwardAPIFieldLabel(fs.APIVersion, fs.FieldPath, "")
   812  	if err != nil {
   813  		return "", err
   814  	}
   815  
   816  	// make podIPs order match node IP family preference #97979
   817  	podIPs = kl.sortPodIPs(podIPs)
   818  	if len(podIPs) > 0 {
   819  		podIP = podIPs[0]
   820  	}
   821  
   822  	switch internalFieldPath {
   823  	case "spec.nodeName":
   824  		return pod.Spec.NodeName, nil
   825  	case "spec.serviceAccountName":
   826  		return pod.Spec.ServiceAccountName, nil
   827  	case "status.hostIP":
   828  		hostIPs, err := kl.getHostIPsAnyWay()
   829  		if err != nil {
   830  			return "", err
   831  		}
   832  		return hostIPs[0].String(), nil
   833  	case "status.hostIPs":
   834  		if !utilfeature.DefaultFeatureGate.Enabled(features.PodHostIPs) {
   835  			return "", nil
   836  		}
   837  		hostIPs, err := kl.getHostIPsAnyWay()
   838  		if err != nil {
   839  			return "", err
   840  		}
   841  		ips := make([]string, 0, len(hostIPs))
   842  		for _, ip := range hostIPs {
   843  			ips = append(ips, ip.String())
   844  		}
   845  		return strings.Join(ips, ","), nil
   846  	case "status.podIP":
   847  		return podIP, nil
   848  	case "status.podIPs":
   849  		return strings.Join(podIPs, ","), nil
   850  	}
   851  	return fieldpath.ExtractFieldPathAsString(pod, internalFieldPath)
   852  }
   853  
   854  // containerResourceRuntimeValue returns the value of the provided container resource
   855  func containerResourceRuntimeValue(fs *v1.ResourceFieldSelector, pod *v1.Pod, container *v1.Container) (string, error) {
   856  	containerName := fs.ContainerName
   857  	if len(containerName) == 0 {
   858  		return resource.ExtractContainerResourceValue(fs, container)
   859  	}
   860  	return resource.ExtractResourceValueByContainerName(fs, pod, containerName)
   861  }
   862  
   863  // killPod instructs the container runtime to kill the pod. This method requires that
   864  // the pod status contains the result of the last syncPod, otherwise it may fail to
   865  // terminate newly created containers and sandboxes.
   866  func (kl *Kubelet) killPod(ctx context.Context, pod *v1.Pod, p kubecontainer.Pod, gracePeriodOverride *int64) error {
   867  	// Call the container runtime KillPod method which stops all known running containers of the pod
   868  	if err := kl.containerRuntime.KillPod(ctx, pod, p, gracePeriodOverride); err != nil {
   869  		return err
   870  	}
   871  	if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
   872  		klog.V(2).InfoS("Failed to update QoS cgroups while killing pod", "err", err)
   873  	}
   874  	return nil
   875  }
   876  
   877  // makePodDataDirs creates the dirs for the pod datas.
   878  func (kl *Kubelet) makePodDataDirs(pod *v1.Pod) error {
   879  	uid := pod.UID
   880  	if err := os.MkdirAll(kl.getPodDir(uid), 0750); err != nil && !os.IsExist(err) {
   881  		return err
   882  	}
   883  	if err := os.MkdirAll(kl.getPodVolumesDir(uid), 0750); err != nil && !os.IsExist(err) {
   884  		return err
   885  	}
   886  	if err := os.MkdirAll(kl.getPodPluginsDir(uid), 0750); err != nil && !os.IsExist(err) {
   887  		return err
   888  	}
   889  	return nil
   890  }
   891  
   892  // getPullSecretsForPod inspects the Pod and retrieves the referenced pull
   893  // secrets.
   894  func (kl *Kubelet) getPullSecretsForPod(pod *v1.Pod) []v1.Secret {
   895  	pullSecrets := []v1.Secret{}
   896  	failedPullSecrets := []string{}
   897  
   898  	for _, secretRef := range pod.Spec.ImagePullSecrets {
   899  		if len(secretRef.Name) == 0 {
   900  			// API validation permitted entries with empty names (https://issue.k8s.io/99454#issuecomment-787838112).
   901  			// Ignore to avoid unnecessary warnings.
   902  			continue
   903  		}
   904  		secret, err := kl.secretManager.GetSecret(pod.Namespace, secretRef.Name)
   905  		if err != nil {
   906  			klog.InfoS("Unable to retrieve pull secret, the image pull may not succeed.", "pod", klog.KObj(pod), "secret", klog.KObj(secret), "err", err)
   907  			failedPullSecrets = append(failedPullSecrets, secretRef.Name)
   908  			continue
   909  		}
   910  
   911  		pullSecrets = append(pullSecrets, *secret)
   912  	}
   913  
   914  	if len(failedPullSecrets) > 0 {
   915  		kl.recorder.Eventf(pod, v1.EventTypeWarning, "FailedToRetrieveImagePullSecret", "Unable to retrieve some image pull secrets (%s); attempting to pull the image may not succeed.", strings.Join(failedPullSecrets, ", "))
   916  	}
   917  
   918  	return pullSecrets
   919  }
   920  
   921  // PodCouldHaveRunningContainers returns true if the pod with the given UID could still have running
   922  // containers. This returns false if the pod has not yet been started or the pod is unknown.
   923  func (kl *Kubelet) PodCouldHaveRunningContainers(pod *v1.Pod) bool {
   924  	if kl.podWorkers.CouldHaveRunningContainers(pod.UID) {
   925  		return true
   926  	}
   927  
   928  	// Check if pod might need to unprepare resources before termination
   929  	// NOTE: This is a temporary solution. This call is here to avoid changing
   930  	// status manager and its tests.
   931  	// TODO: extend PodDeletionSafetyProvider interface and implement it
   932  	// in a separate Kubelet method.
   933  	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
   934  		if kl.containerManager.PodMightNeedToUnprepareResources(pod.UID) {
   935  			return true
   936  		}
   937  	}
   938  	return false
   939  }
   940  
// PodIsFinished returns true if SyncTerminatedPod is finished, ie.
// all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet.
// The decision is delegated entirely to the pod workers, which track
// per-UID lifecycle state.
func (kl *Kubelet) PodIsFinished(pod *v1.Pod) bool {
	return kl.podWorkers.ShouldPodBeFinished(pod.UID)
}
   947  
   948  // filterOutInactivePods returns pods that are not in a terminal phase
   949  // or are known to be fully terminated. This method should only be used
   950  // when the set of pods being filtered is upstream of the pod worker, i.e.
   951  // the pods the pod manager is aware of.
   952  func (kl *Kubelet) filterOutInactivePods(pods []*v1.Pod) []*v1.Pod {
   953  	filteredPods := make([]*v1.Pod, 0, len(pods))
   954  	for _, p := range pods {
   955  		// if a pod is fully terminated by UID, it should be excluded from the
   956  		// list of pods
   957  		if kl.podWorkers.IsPodKnownTerminated(p.UID) {
   958  			continue
   959  		}
   960  
   961  		// terminal pods are considered inactive UNLESS they are actively terminating
   962  		if kl.isAdmittedPodTerminal(p) && !kl.podWorkers.IsPodTerminationRequested(p.UID) {
   963  			continue
   964  		}
   965  
   966  		filteredPods = append(filteredPods, p)
   967  	}
   968  	return filteredPods
   969  }
   970  
   971  // isAdmittedPodTerminal returns true if the provided config source pod is in
   972  // a terminal phase, or if the Kubelet has already indicated the pod has reached
   973  // a terminal phase but the config source has not accepted it yet. This method
   974  // should only be used within the pod configuration loops that notify the pod
   975  // worker, other components should treat the pod worker as authoritative.
   976  func (kl *Kubelet) isAdmittedPodTerminal(pod *v1.Pod) bool {
   977  	// pods are considered inactive if the config source has observed a
   978  	// terminal phase (if the Kubelet recorded that the pod reached a terminal
   979  	// phase the pod should never be restarted)
   980  	if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
   981  		return true
   982  	}
   983  	// a pod that has been marked terminal within the Kubelet is considered
   984  	// inactive (may have been rejected by Kubelet admission)
   985  	if status, ok := kl.statusManager.GetPodStatus(pod.UID); ok {
   986  		if status.Phase == v1.PodSucceeded || status.Phase == v1.PodFailed {
   987  			return true
   988  		}
   989  	}
   990  	return false
   991  }
   992  
   993  // removeOrphanedPodStatuses removes obsolete entries in podStatus where
   994  // the pod is no longer considered bound to this node.
   995  func (kl *Kubelet) removeOrphanedPodStatuses(pods []*v1.Pod, mirrorPods []*v1.Pod) {
   996  	podUIDs := make(map[types.UID]bool)
   997  	for _, pod := range pods {
   998  		podUIDs[pod.UID] = true
   999  	}
  1000  	for _, pod := range mirrorPods {
  1001  		podUIDs[pod.UID] = true
  1002  	}
  1003  	kl.statusManager.RemoveOrphanedStatuses(podUIDs)
  1004  }
  1005  
// HandlePodCleanups performs a series of cleanup work, including terminating
// pod workers, killing unwanted pods, and removing orphaned volumes/pod
// directories. No config changes are sent to pod workers while this method
// is executing which means no new pods can appear. After this method completes
// the desired state of the kubelet should be reconciled with the actual state
// in the pod worker and other pod-related components.
//
// This function is executed by the main sync loop, so it must execute quickly
// and all nested calls should be asynchronous. Any slow reconciliation actions
// should be performed by other components (like the volume manager). The duration
// of this call is the minimum latency for static pods to be restarted if they
// are updated with a fixed UID (most should use a dynamic UID), and no config
// updates are delivered to the pod workers while this method is running.
//
// The individual cleanup steps below are intentionally ordered; most failures
// after the initial cgroup/runtime listing are logged and tolerated so that
// later cleanup tasks still run.
func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
	// The kubelet lacks checkpointing, so we need to introspect the set of pods
	// in the cgroup tree prior to inspecting the set of pods in our pod manager.
	// this ensures our view of the cgroup tree does not mistakenly observe pods
	// that are added after the fact...
	var (
		cgroupPods map[types.UID]cm.CgroupName
		err        error
	)
	if kl.cgroupsPerQOS {
		pcm := kl.containerManager.NewPodContainerManager()
		cgroupPods, err = pcm.GetAllPodsFromCgroups()
		if err != nil {
			return fmt.Errorf("failed to get list of pods that still exist on cgroup mounts: %v", err)
		}
	}

	allPods, mirrorPods, orphanedMirrorPodFullnames := kl.podManager.GetPodsAndMirrorPods()

	// Pod phase progresses monotonically. Once a pod has reached a final state,
	// it should never leave regardless of the restart policy. The statuses
	// of such pods should not be changed, and there is no need to sync them.
	// TODO: the logic here does not handle two cases:
	//   1. If the containers were removed immediately after they died, kubelet
	//      may fail to generate correct statuses, let alone filtering correctly.
	//   2. If kubelet restarted before writing the terminated status for a pod
	//      to the apiserver, it could still restart the terminated pod (even
	//      though the pod was not considered terminated by the apiserver).
	// These two conditions could be alleviated by checkpointing kubelet.

	// Stop the workers for terminated pods not in the config source
	klog.V(3).InfoS("Clean up pod workers for terminated pods")
	workingPods := kl.podWorkers.SyncKnownPods(allPods)

	// Reconcile: At this point the pod workers have been pruned to the set of
	// desired pods. Pods that must be restarted due to UID reuse, or leftover
	// pods from previous runs, are not known to the pod worker.

	// NOTE(review): allPodsByUID is populated but never read later in this
	// function — looks like a leftover; confirm against history before removing.
	allPodsByUID := make(map[types.UID]*v1.Pod)
	for _, pod := range allPods {
		allPodsByUID[pod.UID] = pod
	}

	// Identify the set of pods that have workers, which should be all pods
	// from config that are not terminated, as well as any terminating pods
	// that have already been removed from config. Pods that are terminating
	// will be added to possiblyRunningPods, to prevent overly aggressive
	// cleanup of pod cgroups.
	// stringIfTrue maps the static-pod bool onto the metric label value
	// used by the WorkingPodCount metric below.
	stringIfTrue := func(t bool) string {
		if t {
			return "true"
		}
		return ""
	}
	runningPods := make(map[types.UID]sets.Empty)
	possiblyRunningPods := make(map[types.UID]sets.Empty)
	for uid, sync := range workingPods {
		switch sync.State {
		case SyncPod:
			runningPods[uid] = struct{}{}
			possiblyRunningPods[uid] = struct{}{}
		case TerminatingPod:
			possiblyRunningPods[uid] = struct{}{}
		default:
		}
	}

	// Retrieve the list of running containers from the runtime to perform cleanup.
	// We need the latest state to avoid delaying restarts of static pods that reuse
	// a UID.
	if err := kl.runtimeCache.ForceUpdateIfOlder(ctx, kl.clock.Now()); err != nil {
		klog.ErrorS(err, "Error listing containers")
		return err
	}
	runningRuntimePods, err := kl.runtimeCache.GetPods(ctx)
	if err != nil {
		klog.ErrorS(err, "Error listing containers")
		return err
	}

	// Stop probing pods that are not running
	klog.V(3).InfoS("Clean up probes for terminated pods")
	kl.probeManager.CleanupPods(possiblyRunningPods)

	// Remove orphaned pod statuses not in the total list of known config pods
	klog.V(3).InfoS("Clean up orphaned pod statuses")
	kl.removeOrphanedPodStatuses(allPods, mirrorPods)

	// Remove orphaned pod user namespace allocations (if any).
	klog.V(3).InfoS("Clean up orphaned pod user namespace allocations")
	if err = kl.usernsManager.CleanupOrphanedPodUsernsAllocations(allPods, runningRuntimePods); err != nil {
		klog.ErrorS(err, "Failed cleaning up orphaned pod user namespaces allocations")
	}

	// Remove orphaned volumes from pods that are known not to have any
	// containers. Note that we pass all pods (including terminated pods) to
	// the function, so that we don't remove volumes associated with terminated
	// but not yet deleted pods.
	// TODO: this method could more aggressively cleanup terminated pods
	// in the future (volumes, mount dirs, logs, and containers could all be
	// better separated)
	klog.V(3).InfoS("Clean up orphaned pod directories")
	err = kl.cleanupOrphanedPodDirs(allPods, runningRuntimePods)
	if err != nil {
		// We want all cleanup tasks to be run even if one of them failed. So
		// we just log an error here and continue other cleanup tasks.
		// This also applies to the other clean up tasks.
		klog.ErrorS(err, "Failed cleaning up orphaned pod directories")
	}

	// Remove any orphaned mirror pods (mirror pods are tracked by name via the
	// pod worker)
	klog.V(3).InfoS("Clean up orphaned mirror pods")
	for _, podFullname := range orphanedMirrorPodFullnames {
		if !kl.podWorkers.IsPodForMirrorPodTerminatingByFullName(podFullname) {
			_, err := kl.mirrorPodClient.DeleteMirrorPod(podFullname, nil)
			if err != nil {
				klog.ErrorS(err, "Encountered error when deleting mirror pod", "podName", podFullname)
			} else {
				klog.V(3).InfoS("Deleted mirror pod", "podName", podFullname)
			}
		}
	}

	// After pruning pod workers for terminated pods get the list of active pods for
	// metrics and to determine restarts.
	activePods := kl.filterOutInactivePods(allPods)
	allRegularPods, allStaticPods := splitPodsByStatic(allPods)
	activeRegularPods, activeStaticPods := splitPodsByStatic(activePods)
	metrics.DesiredPodCount.WithLabelValues("").Set(float64(len(allRegularPods)))
	metrics.DesiredPodCount.WithLabelValues("true").Set(float64(len(allStaticPods)))
	metrics.ActivePodCount.WithLabelValues("").Set(float64(len(activeRegularPods)))
	metrics.ActivePodCount.WithLabelValues("true").Set(float64(len(activeStaticPods)))
	metrics.MirrorPodCount.Set(float64(len(mirrorPods)))

	// At this point, the pod worker is aware of which pods are not desired (SyncKnownPods).
	// We now look through the set of active pods for those that the pod worker is not aware of
	// and deliver an update. The most common reason a pod is not known is because the pod was
	// deleted and recreated with the same UID while the pod worker was driving its lifecycle (very
	// very rare for API pods, common for static pods with fixed UIDs). Containers that may still
	// be running from a previous execution must be reconciled by the pod worker's sync method.
	// We must use active pods because that is the set of admitted pods (podManager includes pods
	// that will never be run, and statusManager tracks already rejected pods).
	var restartCount, restartCountStatic int
	for _, desiredPod := range activePods {
		if _, knownPod := workingPods[desiredPod.UID]; knownPod {
			continue
		}

		klog.V(3).InfoS("Pod will be restarted because it is in the desired set and not known to the pod workers (likely due to UID reuse)", "podUID", desiredPod.UID)
		isStatic := kubetypes.IsStaticPod(desiredPod)
		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(desiredPod)
		if pod == nil || wasMirror {
			klog.V(2).InfoS("Programmer error, restartable pod was a mirror pod but activePods should never contain a mirror pod", "podUID", desiredPod.UID)
			continue
		}
		kl.podWorkers.UpdatePod(UpdatePodOptions{
			UpdateType: kubetypes.SyncPodCreate,
			Pod:        pod,
			MirrorPod:  mirrorPod,
		})

		// the desired pod is now known as well
		workingPods[desiredPod.UID] = PodWorkerSync{State: SyncPod, HasConfig: true, Static: isStatic}
		if isStatic {
			// restartable static pods are the normal case
			restartCountStatic++
		} else {
			// almost certainly means shenanigans, as API pods should never have the same UID after being deleted and recreated
			// unless there is a major API violation
			restartCount++
		}
	}
	metrics.RestartedPodTotal.WithLabelValues("true").Add(float64(restartCountStatic))
	metrics.RestartedPodTotal.WithLabelValues("").Add(float64(restartCount))

	// Complete termination of deleted pods that are not runtime pods (don't have
	// running containers), are terminal, and are not known to pod workers.
	// An example is pods rejected during kubelet admission that have never
	// started before (i.e. does not have an orphaned pod).
	// Adding the pods with SyncPodKill to pod workers allows to proceed with
	// force-deletion of such pods, yet preventing re-entry of the routine in the
	// next invocation of HandlePodCleanups.
	for _, pod := range kl.filterTerminalPodsToDelete(allPods, runningRuntimePods, workingPods) {
		klog.V(3).InfoS("Handling termination and deletion of the pod to pod workers", "pod", klog.KObj(pod), "podUID", pod.UID)
		kl.podWorkers.UpdatePod(UpdatePodOptions{
			UpdateType: kubetypes.SyncPodKill,
			Pod:        pod,
		})
	}

	// Finally, terminate any pods that are observed in the runtime but not present in the list of
	// known running pods from config. If we do terminate running runtime pods that will happen
	// asynchronously in the background and those will be processed in the next invocation of
	// HandlePodCleanups.
	var orphanCount int
	for _, runningPod := range runningRuntimePods {
		// If there are orphaned pod resources in CRI that are unknown to the pod worker, terminate them
		// now. Since housekeeping is exclusive to other pod worker updates, we know that no pods have
		// been added to the pod worker in the meantime. Note that pods that are not visible in the runtime
		// but which were previously known are terminated by SyncKnownPods().
		_, knownPod := workingPods[runningPod.ID]
		if !knownPod {
			// Use a one-second grace period for orphans we have no config for.
			one := int64(1)
			killPodOptions := &KillPodOptions{
				PodTerminationGracePeriodSecondsOverride: &one,
			}
			klog.V(2).InfoS("Clean up containers for orphaned pod we had not seen before", "podUID", runningPod.ID, "killPodOptions", killPodOptions)
			kl.podWorkers.UpdatePod(UpdatePodOptions{
				UpdateType:     kubetypes.SyncPodKill,
				RunningPod:     runningPod,
				KillPodOptions: killPodOptions,
			})

			// the running pod is now known as well
			workingPods[runningPod.ID] = PodWorkerSync{State: TerminatingPod, Orphan: true}
			orphanCount++
		}
	}
	metrics.OrphanedRuntimePodTotal.Add(float64(orphanCount))

	// Now that we have recorded any terminating pods, and added new pods that should be running,
	// record a summary here. Not all possible combinations of PodWorkerSync values are valid.
	counts := make(map[PodWorkerSync]int)
	for _, sync := range workingPods {
		counts[sync]++
	}
	// Emit one metric sample per valid (state, config, static) combination and
	// drain the counted entries; anything left over is an unexpected combination.
	for validSync, configState := range map[PodWorkerSync]string{
		{HasConfig: true, Static: true}:                "desired",
		{HasConfig: true, Static: false}:               "desired",
		{Orphan: true, HasConfig: true, Static: true}:  "orphan",
		{Orphan: true, HasConfig: true, Static: false}: "orphan",
		{Orphan: true, HasConfig: false}:               "runtime_only",
	} {
		for _, state := range []PodWorkerState{SyncPod, TerminatingPod, TerminatedPod} {
			validSync.State = state
			count := counts[validSync]
			delete(counts, validSync)
			staticString := stringIfTrue(validSync.Static)
			if !validSync.HasConfig {
				staticString = "unknown"
			}
			metrics.WorkingPodCount.WithLabelValues(state.String(), configState, staticString).Set(float64(count))
		}
	}
	if len(counts) > 0 {
		// in case a combination is lost
		klog.V(3).InfoS("Programmer error, did not report a kubelet_working_pods metric for a value returned by SyncKnownPods", "counts", counts)
	}

	// Remove any cgroups in the hierarchy for pods that are definitely no longer
	// running (not in the container runtime).
	if kl.cgroupsPerQOS {
		pcm := kl.containerManager.NewPodContainerManager()
		klog.V(3).InfoS("Clean up orphaned pod cgroups")
		kl.cleanupOrphanedPodCgroups(pcm, cgroupPods, possiblyRunningPods)
	}

	// Cleanup any backoff entries.
	kl.backOff.GC()
	return nil
}
  1281  
  1282  // filterTerminalPodsToDelete returns terminal pods which are ready to be
  1283  // deleted by the status manager, but are not in pod workers.
  1284  // First, the check for deletionTimestamp is a performance optimization as we
  1285  // don't need to do anything with terminal pods without deletionTimestamp.
  1286  // Second, the check for terminal pods is to avoid race conditions of triggering
  1287  // deletion on Pending pods which are not yet added to pod workers.
  1288  // Third, the check to skip pods known to pod workers is that the lifecycle of
  1289  // such pods is already handled by pod workers.
  1290  // Finally, we skip runtime pods as their termination is handled separately in
  1291  // the HandlePodCleanups routine.
  1292  func (kl *Kubelet) filterTerminalPodsToDelete(allPods []*v1.Pod, runningRuntimePods []*kubecontainer.Pod, workingPods map[types.UID]PodWorkerSync) map[types.UID]*v1.Pod {
  1293  	terminalPodsToDelete := make(map[types.UID]*v1.Pod)
  1294  	for _, pod := range allPods {
  1295  		if pod.DeletionTimestamp == nil {
  1296  			// skip pods which don't have a deletion timestamp
  1297  			continue
  1298  		}
  1299  		if !podutil.IsPodPhaseTerminal(pod.Status.Phase) {
  1300  			// skip the non-terminal pods
  1301  			continue
  1302  		}
  1303  		if _, knownPod := workingPods[pod.UID]; knownPod {
  1304  			// skip pods known to pod workers
  1305  			continue
  1306  		}
  1307  		terminalPodsToDelete[pod.UID] = pod
  1308  	}
  1309  	for _, runningRuntimePod := range runningRuntimePods {
  1310  		// skip running runtime pods - they are handled by a dedicated routine
  1311  		// which terminates the containers
  1312  		delete(terminalPodsToDelete, runningRuntimePod.ID)
  1313  	}
  1314  	return terminalPodsToDelete
  1315  }
  1316  
  1317  // splitPodsByStatic separates a list of desired pods from the pod manager into
  1318  // regular or static pods. Mirror pods are not valid config sources (a mirror pod
  1319  // being created cannot cause the Kubelet to start running a static pod) and are
  1320  // excluded.
  1321  func splitPodsByStatic(pods []*v1.Pod) (regular, static []*v1.Pod) {
  1322  	regular, static = make([]*v1.Pod, 0, len(pods)), make([]*v1.Pod, 0, len(pods))
  1323  	for _, pod := range pods {
  1324  		if kubetypes.IsMirrorPod(pod) {
  1325  			continue
  1326  		}
  1327  		if kubetypes.IsStaticPod(pod) {
  1328  			static = append(static, pod)
  1329  		} else {
  1330  			regular = append(regular, pod)
  1331  		}
  1332  	}
  1333  	return regular, static
  1334  }
  1335  
  1336  // validateContainerLogStatus returns the container ID for the desired container to retrieve logs for, based on the state
  1337  // of the container. The previous flag will only return the logs for the last terminated container, otherwise, the current
  1338  // running container is preferred over a previous termination. If info about the container is not available then a specific
  1339  // error is returned to the end user.
  1340  func (kl *Kubelet) validateContainerLogStatus(podName string, podStatus *v1.PodStatus, containerName string, previous bool) (containerID kubecontainer.ContainerID, err error) {
  1341  	var cID string
  1342  
  1343  	cStatus, found := podutil.GetContainerStatus(podStatus.ContainerStatuses, containerName)
  1344  	if !found {
  1345  		cStatus, found = podutil.GetContainerStatus(podStatus.InitContainerStatuses, containerName)
  1346  	}
  1347  	if !found {
  1348  		cStatus, found = podutil.GetContainerStatus(podStatus.EphemeralContainerStatuses, containerName)
  1349  	}
  1350  	if !found {
  1351  		return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is not available", containerName, podName)
  1352  	}
  1353  	lastState := cStatus.LastTerminationState
  1354  	waiting, running, terminated := cStatus.State.Waiting, cStatus.State.Running, cStatus.State.Terminated
  1355  
  1356  	switch {
  1357  	case previous:
  1358  		if lastState.Terminated == nil || lastState.Terminated.ContainerID == "" {
  1359  			return kubecontainer.ContainerID{}, fmt.Errorf("previous terminated container %q in pod %q not found", containerName, podName)
  1360  		}
  1361  		cID = lastState.Terminated.ContainerID
  1362  
  1363  	case running != nil:
  1364  		cID = cStatus.ContainerID
  1365  
  1366  	case terminated != nil:
  1367  		// in cases where the next container didn't start, terminated.ContainerID will be empty, so get logs from the lastState.Terminated.
  1368  		if terminated.ContainerID == "" {
  1369  			if lastState.Terminated != nil && lastState.Terminated.ContainerID != "" {
  1370  				cID = lastState.Terminated.ContainerID
  1371  			} else {
  1372  				return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is terminated", containerName, podName)
  1373  			}
  1374  		} else {
  1375  			cID = terminated.ContainerID
  1376  		}
  1377  
  1378  	case lastState.Terminated != nil:
  1379  		if lastState.Terminated.ContainerID == "" {
  1380  			return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is terminated", containerName, podName)
  1381  		}
  1382  		cID = lastState.Terminated.ContainerID
  1383  
  1384  	case waiting != nil:
  1385  		// output some info for the most common pending failures
  1386  		switch reason := waiting.Reason; reason {
  1387  		case images.ErrImagePull.Error():
  1388  			return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: image can't be pulled", containerName, podName)
  1389  		case images.ErrImagePullBackOff.Error():
  1390  			return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: trying and failing to pull image", containerName, podName)
  1391  		default:
  1392  			return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start: %v", containerName, podName, reason)
  1393  		}
  1394  	default:
  1395  		// unrecognized state
  1396  		return kubecontainer.ContainerID{}, fmt.Errorf("container %q in pod %q is waiting to start - no logs yet", containerName, podName)
  1397  	}
  1398  
  1399  	return kubecontainer.ParseContainerID(cID), nil
  1400  }
  1401  
  1402  // GetKubeletContainerLogs returns logs from the container
  1403  // TODO: this method is returning logs of random container attempts, when it should be returning the most recent attempt
  1404  // or all of them.
  1405  func (kl *Kubelet) GetKubeletContainerLogs(ctx context.Context, podFullName, containerName string, logOptions *v1.PodLogOptions, stdout, stderr io.Writer) error {
  1406  	// Pod workers periodically write status to statusManager. If status is not
  1407  	// cached there, something is wrong (or kubelet just restarted and hasn't
  1408  	// caught up yet). Just assume the pod is not ready yet.
  1409  	name, namespace, err := kubecontainer.ParsePodFullName(podFullName)
  1410  	if err != nil {
  1411  		return fmt.Errorf("unable to parse pod full name %q: %v", podFullName, err)
  1412  	}
  1413  
  1414  	pod, ok := kl.GetPodByName(namespace, name)
  1415  	if !ok {
  1416  		return fmt.Errorf("pod %q cannot be found - no logs available", name)
  1417  	}
  1418  
  1419  	// TODO: this should be using the podWorker's pod store as authoritative, since
  1420  	// the mirrorPod might still exist, the pod may have been force deleted but
  1421  	// is still terminating (users should be able to view logs of force deleted static pods
  1422  	// based on full name).
  1423  	var podUID types.UID
  1424  	pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  1425  	if wasMirror {
  1426  		if pod == nil {
  1427  			return fmt.Errorf("mirror pod %q does not have a corresponding pod", name)
  1428  		}
  1429  		podUID = mirrorPod.UID
  1430  	} else {
  1431  		podUID = pod.UID
  1432  	}
  1433  
  1434  	podStatus, found := kl.statusManager.GetPodStatus(podUID)
  1435  	if !found {
  1436  		// If there is no cached status, use the status from the
  1437  		// config source (apiserver). This is useful if kubelet
  1438  		// has recently been restarted.
  1439  		podStatus = pod.Status
  1440  	}
  1441  
  1442  	// TODO: Consolidate the logic here with kuberuntime.GetContainerLogs, here we convert container name to containerID,
  1443  	// but inside kuberuntime we convert container id back to container name and restart count.
  1444  	// TODO: After separate container log lifecycle management, we should get log based on the existing log files
  1445  	// instead of container status.
  1446  	containerID, err := kl.validateContainerLogStatus(pod.Name, &podStatus, containerName, logOptions.Previous)
  1447  	if err != nil {
  1448  		return err
  1449  	}
  1450  
  1451  	// Do a zero-byte write to stdout before handing off to the container runtime.
  1452  	// This ensures at least one Write call is made to the writer when copying starts,
  1453  	// even if we then block waiting for log output from the container.
  1454  	if _, err := stdout.Write([]byte{}); err != nil {
  1455  		return err
  1456  	}
  1457  
  1458  	return kl.containerRuntime.GetContainerLogs(ctx, pod, containerID, logOptions, stdout, stderr)
  1459  }
  1460  
// getPhase returns the phase of a pod given its container info.
// The phase is derived from three groups of containers, counted separately:
// regular init containers, restartable (sidecar-style) init containers, and
// regular containers. podIsTerminal indicates the pod workers consider the pod
// terminal, which allows a Succeeded/Failed phase to be assigned.
func getPhase(pod *v1.Pod, info []v1.ContainerStatus, podIsTerminal bool) v1.PodPhase {
	spec := pod.Spec
	// Counts for regular (non-restartable) init containers.
	pendingInitialization := 0
	failedInitialization := 0

	// regular init containers
	for _, container := range spec.InitContainers {
		if kubetypes.IsRestartableInitContainer(&container) {
			// Skip the restartable init containers here to handle them separately as
			// they are slightly different from the init containers in terms of the
			// pod phase.
			continue
		}

		containerStatus, ok := podutil.GetContainerStatus(info, container.Name)
		if !ok {
			// No status reported yet: the init container has not run.
			pendingInitialization++
			continue
		}

		switch {
		case containerStatus.State.Running != nil:
			// Still executing, so initialization is not complete.
			pendingInitialization++
		case containerStatus.State.Terminated != nil:
			if containerStatus.State.Terminated.ExitCode != 0 {
				failedInitialization++
			}
		case containerStatus.State.Waiting != nil:
			// A waiting init container that previously terminated non-zero is a
			// failure; otherwise it is still pending.
			if containerStatus.LastTerminationState.Terminated != nil {
				if containerStatus.LastTerminationState.Terminated.ExitCode != 0 {
					failedInitialization++
				}
			} else {
				pendingInitialization++
			}
		default:
			pendingInitialization++
		}
	}

	// counters for restartable init and regular containers
	unknown := 0
	running := 0
	waiting := 0
	stopped := 0
	succeeded := 0

	// restartable init containers
	for _, container := range spec.InitContainers {
		if !kubetypes.IsRestartableInitContainer(&container) {
			// Skip the regular init containers, as they have been handled above.
			continue
		}
		containerStatus, ok := podutil.GetContainerStatus(info, container.Name)
		if !ok {
			unknown++
			continue
		}

		switch {
		case containerStatus.State.Running != nil:
			// Running but not yet Started (startup probe not passed) still
			// counts toward pending initialization.
			if containerStatus.Started == nil || !*containerStatus.Started {
				pendingInitialization++
			}
			running++
		case containerStatus.State.Terminated != nil:
			// Do nothing here, as terminated restartable init containers are not
			// taken into account for the pod phase.
		case containerStatus.State.Waiting != nil:
			if containerStatus.LastTerminationState.Terminated != nil {
				// Do nothing here, as terminated restartable init containers are not
				// taken into account for the pod phase.
			} else {
				pendingInitialization++
				waiting++
			}
		default:
			pendingInitialization++
			unknown++
		}
	}

	// regular containers contribute to running/waiting/stopped/succeeded/unknown.
	for _, container := range spec.Containers {
		containerStatus, ok := podutil.GetContainerStatus(info, container.Name)
		if !ok {
			unknown++
			continue
		}

		switch {
		case containerStatus.State.Running != nil:
			running++
		case containerStatus.State.Terminated != nil:
			stopped++
			if containerStatus.State.Terminated.ExitCode == 0 {
				succeeded++
			}
		case containerStatus.State.Waiting != nil:
			// A waiting container that ran before counts as stopped (it may be
			// restarting); one that never ran counts as waiting.
			if containerStatus.LastTerminationState.Terminated != nil {
				stopped++
			} else {
				waiting++
			}
		default:
			unknown++
		}
	}

	// A failed init container with RestartPolicy Never fails the whole pod.
	if failedInitialization > 0 && spec.RestartPolicy == v1.RestartPolicyNever {
		return v1.PodFailed
	}

	switch {
	case pendingInitialization > 0 &&
		// This is needed to handle the case where the pod has been initialized but
		// the restartable init containers are restarting and the pod should not be
		// placed back into v1.PodPending since the regular containers have run.
		!kubecontainer.HasAnyRegularContainerStarted(&spec, info):
		fallthrough
	case waiting > 0:
		klog.V(5).InfoS("Pod waiting > 0, pending")
		// One or more containers has not been started
		return v1.PodPending
	case running > 0 && unknown == 0:
		// All containers have been started, and at least
		// one container is running
		return v1.PodRunning
	case running == 0 && stopped > 0 && unknown == 0:
		// The pod is terminal so its containers won't be restarted regardless
		// of the restart policy.
		if podIsTerminal {
			// TODO(#116484): Also assign terminal phase to static pods.
			if !kubetypes.IsStaticPod(pod) {
				// All regular containers are terminated in success and all restartable
				// init containers are stopped.
				if stopped == succeeded {
					return v1.PodSucceeded
				}
				// There is at least one failure
				return v1.PodFailed
			}
		}
		// All containers are terminated
		if spec.RestartPolicy == v1.RestartPolicyAlways {
			// All containers are in the process of restarting
			return v1.PodRunning
		}
		if stopped == succeeded {
			// RestartPolicy is not Always, all containers are terminated in success
			// and all restartable init containers are stopped.
			return v1.PodSucceeded
		}
		if spec.RestartPolicy == v1.RestartPolicyNever {
			// RestartPolicy is Never, and all containers are
			// terminated with at least one in failure
			return v1.PodFailed
		}
		// RestartPolicy is OnFailure, and at least one in failure
		// and in the process of restarting
		return v1.PodRunning
	default:
		klog.V(5).InfoS("Pod default case, pending")
		return v1.PodPending
	}
}
  1627  
  1628  func deleteCustomResourceFromResourceRequirements(target *v1.ResourceRequirements) {
  1629  	for resource := range target.Limits {
  1630  		if resource != v1.ResourceCPU && resource != v1.ResourceMemory && resource != v1.ResourceEphemeralStorage {
  1631  			delete(target.Limits, resource)
  1632  		}
  1633  	}
  1634  	for resource := range target.Requests {
  1635  		if resource != v1.ResourceCPU && resource != v1.ResourceMemory && resource != v1.ResourceEphemeralStorage {
  1636  			delete(target.Requests, resource)
  1637  		}
  1638  	}
  1639  }
  1640  
  1641  func (kl *Kubelet) determinePodResizeStatus(pod *v1.Pod, podStatus *v1.PodStatus) v1.PodResizeStatus {
  1642  	var podResizeStatus v1.PodResizeStatus
  1643  	specStatusDiffer := false
  1644  	for _, c := range pod.Spec.Containers {
  1645  		if cs, ok := podutil.GetContainerStatus(podStatus.ContainerStatuses, c.Name); ok {
  1646  			cResourceCopy := c.Resources.DeepCopy()
  1647  			// for both requests and limits, we only compare the cpu, memory and ephemeralstorage
  1648  			// which are included in convertToAPIContainerStatuses
  1649  			deleteCustomResourceFromResourceRequirements(cResourceCopy)
  1650  			csResourceCopy := cs.Resources.DeepCopy()
  1651  			if csResourceCopy != nil && !cmp.Equal(*cResourceCopy, *csResourceCopy) {
  1652  				specStatusDiffer = true
  1653  				break
  1654  			}
  1655  		}
  1656  	}
  1657  	if !specStatusDiffer {
  1658  		// Clear last resize state from checkpoint
  1659  		if err := kl.statusManager.SetPodResizeStatus(pod.UID, ""); err != nil {
  1660  			klog.ErrorS(err, "SetPodResizeStatus failed", "pod", pod.Name)
  1661  		}
  1662  	} else {
  1663  		if resizeStatus, found := kl.statusManager.GetPodResizeStatus(string(pod.UID)); found {
  1664  			podResizeStatus = resizeStatus
  1665  		}
  1666  	}
  1667  	return podResizeStatus
  1668  }
  1669  
// generateAPIPodStatus creates the final API pod status for a pod, given the
// internal pod status. This method should only be called from within sync*Pod methods.
// It merges the runtime-derived status with the previous status from the
// status manager and the API server, computes the pod phase, applies eviction
// overrides, rebuilds kubelet-owned conditions, and fills in host/pod IPs.
func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus, podIsTerminal bool) v1.PodStatus {
	klog.V(3).InfoS("Generating pod status", "podIsTerminal", podIsTerminal, "pod", klog.KObj(pod))
	// use the previous pod status, or the api status, as the basis for this pod
	oldPodStatus, found := kl.statusManager.GetPodStatus(pod.UID)
	if !found {
		oldPodStatus = pod.Status
	}
	s := kl.convertStatusToAPIStatus(pod, podStatus, oldPodStatus)
	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
		s.Resize = kl.determinePodResizeStatus(pod, s)
	}
	// calculate the next phase and preserve reason
	allStatus := append(append([]v1.ContainerStatus{}, s.ContainerStatuses...), s.InitContainerStatuses...)
	s.Phase = getPhase(pod, allStatus, podIsTerminal)
	klog.V(4).InfoS("Got phase for pod", "pod", klog.KObj(pod), "oldPhase", oldPodStatus.Phase, "phase", s.Phase)

	// Perform a three-way merge between the statuses from the status manager,
	// runtime, and generated status to ensure terminal status is correctly set.
	if s.Phase != v1.PodFailed && s.Phase != v1.PodSucceeded {
		switch {
		case oldPodStatus.Phase == v1.PodFailed || oldPodStatus.Phase == v1.PodSucceeded:
			klog.V(4).InfoS("Status manager phase was terminal, updating phase to match", "pod", klog.KObj(pod), "phase", oldPodStatus.Phase)
			s.Phase = oldPodStatus.Phase
		case pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded:
			klog.V(4).InfoS("API phase was terminal, updating phase to match", "pod", klog.KObj(pod), "phase", pod.Status.Phase)
			s.Phase = pod.Status.Phase
		}
	}

	if s.Phase == oldPodStatus.Phase {
		// preserve the reason and message which is associated with the phase
		s.Reason = oldPodStatus.Reason
		s.Message = oldPodStatus.Message
		if len(s.Reason) == 0 {
			s.Reason = pod.Status.Reason
		}
		if len(s.Message) == 0 {
			s.Message = pod.Status.Message
		}
	}

	// check if an internal module has requested the pod is evicted and override the reason and message
	for _, podSyncHandler := range kl.PodSyncHandlers {
		if result := podSyncHandler.ShouldEvict(pod); result.Evict {
			s.Phase = v1.PodFailed
			s.Reason = result.Reason
			s.Message = result.Message
			break
		}
	}

	// pods are not allowed to transition out of terminal phases
	if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
		// API server shows terminal phase; transitions are not allowed
		if s.Phase != pod.Status.Phase {
			klog.ErrorS(nil, "Pod attempted illegal phase transition", "pod", klog.KObj(pod), "originalStatusPhase", pod.Status.Phase, "apiStatusPhase", s.Phase, "apiStatus", s)
			// Force back to phase from the API server
			s.Phase = pod.Status.Phase
		}
	}

	// ensure the probe managers have up to date status for containers
	kl.probeManager.UpdatePodStatus(pod, s)

	// preserve all conditions not owned by the kubelet
	s.Conditions = make([]v1.PodCondition, 0, len(pod.Status.Conditions)+1)
	for _, c := range pod.Status.Conditions {
		if !kubetypes.PodConditionByKubelet(c.Type) {
			s.Conditions = append(s.Conditions, c)
		}
	}

	if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
		// copy over the pod disruption conditions from state which is already
		// updated during the eviciton (due to either node resource pressure or
		// node graceful shutdown). We do not re-generate the conditions based
		// on the container statuses as they are added based on one-time events.
		cType := v1.DisruptionTarget
		if _, condition := podutil.GetPodConditionFromList(oldPodStatus.Conditions, cType); condition != nil {
			s.Conditions = utilpod.ReplaceOrAppendPodCondition(s.Conditions, condition)
		}
	}

	// set all Kubelet-owned conditions
	if utilfeature.DefaultFeatureGate.Enabled(features.PodReadyToStartContainersCondition) {
		s.Conditions = append(s.Conditions, status.GeneratePodReadyToStartContainersCondition(pod, podStatus))
	}
	allContainerStatuses := append(s.InitContainerStatuses, s.ContainerStatuses...)
	s.Conditions = append(s.Conditions, status.GeneratePodInitializedCondition(&pod.Spec, allContainerStatuses, s.Phase))
	s.Conditions = append(s.Conditions, status.GeneratePodReadyCondition(&pod.Spec, s.Conditions, allContainerStatuses, s.Phase))
	s.Conditions = append(s.Conditions, status.GenerateContainersReadyCondition(&pod.Spec, allContainerStatuses, s.Phase))
	s.Conditions = append(s.Conditions, v1.PodCondition{
		Type:   v1.PodScheduled,
		Status: v1.ConditionTrue,
	})
	// set HostIP/HostIPs and initialize PodIP/PodIPs for host network pods
	if kl.kubeClient != nil {
		hostIPs, err := kl.getHostIPsAnyWay()
		if err != nil {
			klog.V(4).InfoS("Cannot get host IPs", "err", err)
		} else {
			if s.HostIP != "" {
				// Warn if the node IP family flipped (e.g. cloud provider chose
				// a different family than kubelet previously reported).
				if utilnet.IPFamilyOfString(s.HostIP) != utilnet.IPFamilyOf(hostIPs[0]) {
					kl.recorder.Eventf(pod, v1.EventTypeWarning, "HostIPsIPFamilyMismatch",
						"Kubelet detected an IPv%s node IP (%s), but the cloud provider selected an IPv%s node IP (%s); pass an explicit `--node-ip` to kubelet to fix this.",
						utilnet.IPFamilyOfString(s.HostIP), s.HostIP, utilnet.IPFamilyOf(hostIPs[0]), hostIPs[0].String())
				}
			}
			s.HostIP = hostIPs[0].String()
			if utilfeature.DefaultFeatureGate.Enabled(features.PodHostIPs) {
				s.HostIPs = []v1.HostIP{{IP: s.HostIP}}
				if len(hostIPs) == 2 {
					s.HostIPs = append(s.HostIPs, v1.HostIP{IP: hostIPs[1].String()})
				}
			}

			// HostNetwork Pods inherit the node IPs as PodIPs. They are immutable once set,
			// other than that if the node becomes dual-stack, we add the secondary IP.
			if kubecontainer.IsHostNetworkPod(pod) {
				// Primary IP is not set
				if s.PodIP == "" {
					s.PodIP = hostIPs[0].String()
					s.PodIPs = []v1.PodIP{{IP: s.PodIP}}
				}
				// Secondary IP is not set #105320
				if len(hostIPs) == 2 && len(s.PodIPs) == 1 {
					if utilnet.IPFamilyOfString(s.PodIPs[0].IP) != utilnet.IPFamilyOf(hostIPs[1]) {
						s.PodIPs = append(s.PodIPs, v1.PodIP{IP: hostIPs[1].String()})
					}
				}
			}
		}
	}

	return *s
}
  1808  
  1809  // sortPodIPs return the PodIPs sorted and truncated by the cluster IP family preference.
  1810  // The runtime pod status may have an arbitrary number of IPs, in an arbitrary order.
  1811  // PodIPs are obtained by: func (m *kubeGenericRuntimeManager) determinePodSandboxIPs()
  1812  // Pick out the first returned IP of the same IP family as the node IP
  1813  // first, followed by the first IP of the opposite IP family (if any)
  1814  // and use them for the Pod.Status.PodIPs and the Downward API environment variables
  1815  func (kl *Kubelet) sortPodIPs(podIPs []string) []string {
  1816  	ips := make([]string, 0, 2)
  1817  	var validPrimaryIP, validSecondaryIP func(ip string) bool
  1818  	if len(kl.nodeIPs) == 0 || utilnet.IsIPv4(kl.nodeIPs[0]) {
  1819  		validPrimaryIP = utilnet.IsIPv4String
  1820  		validSecondaryIP = utilnet.IsIPv6String
  1821  	} else {
  1822  		validPrimaryIP = utilnet.IsIPv6String
  1823  		validSecondaryIP = utilnet.IsIPv4String
  1824  	}
  1825  	for _, ip := range podIPs {
  1826  		if validPrimaryIP(ip) {
  1827  			ips = append(ips, ip)
  1828  			break
  1829  		}
  1830  	}
  1831  	for _, ip := range podIPs {
  1832  		if validSecondaryIP(ip) {
  1833  			ips = append(ips, ip)
  1834  			break
  1835  		}
  1836  	}
  1837  	return ips
  1838  }
  1839  
  1840  // convertStatusToAPIStatus initialize an api PodStatus for the given pod from
  1841  // the given internal pod status and the previous state of the pod from the API.
  1842  // It is purely transformative and does not alter the kubelet state at all.
  1843  func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus, oldPodStatus v1.PodStatus) *v1.PodStatus {
  1844  	var apiPodStatus v1.PodStatus
  1845  
  1846  	// copy pod status IPs to avoid race conditions with PodStatus #102806
  1847  	podIPs := make([]string, len(podStatus.IPs))
  1848  	copy(podIPs, podStatus.IPs)
  1849  
  1850  	// make podIPs order match node IP family preference #97979
  1851  	podIPs = kl.sortPodIPs(podIPs)
  1852  	for _, ip := range podIPs {
  1853  		apiPodStatus.PodIPs = append(apiPodStatus.PodIPs, v1.PodIP{IP: ip})
  1854  	}
  1855  	if len(apiPodStatus.PodIPs) > 0 {
  1856  		apiPodStatus.PodIP = apiPodStatus.PodIPs[0].IP
  1857  	}
  1858  
  1859  	// set status for Pods created on versions of kube older than 1.6
  1860  	apiPodStatus.QOSClass = v1qos.GetPodQOS(pod)
  1861  
  1862  	apiPodStatus.ContainerStatuses = kl.convertToAPIContainerStatuses(
  1863  		pod, podStatus,
  1864  		oldPodStatus.ContainerStatuses,
  1865  		pod.Spec.Containers,
  1866  		len(pod.Spec.InitContainers) > 0,
  1867  		false,
  1868  	)
  1869  	apiPodStatus.InitContainerStatuses = kl.convertToAPIContainerStatuses(
  1870  		pod, podStatus,
  1871  		oldPodStatus.InitContainerStatuses,
  1872  		pod.Spec.InitContainers,
  1873  		len(pod.Spec.InitContainers) > 0,
  1874  		true,
  1875  	)
  1876  	var ecSpecs []v1.Container
  1877  	for i := range pod.Spec.EphemeralContainers {
  1878  		ecSpecs = append(ecSpecs, v1.Container(pod.Spec.EphemeralContainers[i].EphemeralContainerCommon))
  1879  	}
  1880  
  1881  	// #80875: By now we've iterated podStatus 3 times. We could refactor this to make a single
  1882  	// pass through podStatus.ContainerStatuses
  1883  	apiPodStatus.EphemeralContainerStatuses = kl.convertToAPIContainerStatuses(
  1884  		pod, podStatus,
  1885  		oldPodStatus.EphemeralContainerStatuses,
  1886  		ecSpecs,
  1887  		len(pod.Spec.InitContainers) > 0,
  1888  		false,
  1889  	)
  1890  
  1891  	return &apiPodStatus
  1892  }
  1893  
  1894  // convertToAPIContainerStatuses converts the given internal container
  1895  // statuses into API container statuses.
  1896  func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecontainer.PodStatus, previousStatus []v1.ContainerStatus, containers []v1.Container, hasInitContainers, isInitContainer bool) []v1.ContainerStatus {
  1897  	convertContainerStatus := func(cs *kubecontainer.Status, oldStatus *v1.ContainerStatus) *v1.ContainerStatus {
  1898  		cid := cs.ID.String()
  1899  		status := &v1.ContainerStatus{
  1900  			Name:         cs.Name,
  1901  			RestartCount: int32(cs.RestartCount),
  1902  			Image:        cs.Image,
  1903  			ImageID:      cs.ImageID,
  1904  			ContainerID:  cid,
  1905  		}
  1906  		switch {
  1907  		case cs.State == kubecontainer.ContainerStateRunning:
  1908  			status.State.Running = &v1.ContainerStateRunning{StartedAt: metav1.NewTime(cs.StartedAt)}
  1909  		case cs.State == kubecontainer.ContainerStateCreated:
  1910  			// containers that are created but not running are "waiting to be running"
  1911  			status.State.Waiting = &v1.ContainerStateWaiting{}
  1912  		case cs.State == kubecontainer.ContainerStateExited:
  1913  			status.State.Terminated = &v1.ContainerStateTerminated{
  1914  				ExitCode:    int32(cs.ExitCode),
  1915  				Reason:      cs.Reason,
  1916  				Message:     cs.Message,
  1917  				StartedAt:   metav1.NewTime(cs.StartedAt),
  1918  				FinishedAt:  metav1.NewTime(cs.FinishedAt),
  1919  				ContainerID: cid,
  1920  			}
  1921  
  1922  		case cs.State == kubecontainer.ContainerStateUnknown &&
  1923  			oldStatus != nil && // we have an old status
  1924  			oldStatus.State.Running != nil: // our previous status was running
  1925  			// if this happens, then we know that this container was previously running and isn't anymore (assuming the CRI isn't failing to return running containers).
  1926  			// you can imagine this happening in cases where a container failed and the kubelet didn't ask about it in time to see the result.
  1927  			// in this case, the container should not to into waiting state immediately because that can make cases like runonce pods actually run
  1928  			// twice. "container never ran" is different than "container ran and failed".  This is handled differently in the kubelet
  1929  			// and it is handled differently in higher order logic like crashloop detection and handling
  1930  			status.State.Terminated = &v1.ContainerStateTerminated{
  1931  				Reason:   "ContainerStatusUnknown",
  1932  				Message:  "The container could not be located when the pod was terminated",
  1933  				ExitCode: 137, // this code indicates an error
  1934  			}
  1935  			// the restart count normally comes from the CRI (see near the top of this method), but since this is being added explicitly
  1936  			// for the case where the CRI did not return a status, we need to manually increment the restart count to be accurate.
  1937  			status.RestartCount = oldStatus.RestartCount + 1
  1938  
  1939  		default:
  1940  			// this collapses any unknown state to container waiting.  If any container is waiting, then the pod status moves to pending even if it is running.
  1941  			// if I'm reading this correctly, then any failure to read status on any container results in the entire pod going pending even if the containers
  1942  			// are actually running.
  1943  			// see https://github.com/kubernetes/kubernetes/blob/5d1b3e26af73dde33ecb6a3e69fb5876ceab192f/pkg/kubelet/kuberuntime/kuberuntime_container.go#L497 to
  1944  			// https://github.com/kubernetes/kubernetes/blob/8976e3620f8963e72084971d9d4decbd026bf49f/pkg/kubelet/kuberuntime/helpers.go#L58-L71
  1945  			// and interpreted here https://github.com/kubernetes/kubernetes/blob/b27e78f590a0d43e4a23ca3b2bf1739ca4c6e109/pkg/kubelet/kubelet_pods.go#L1434-L1439
  1946  			status.State.Waiting = &v1.ContainerStateWaiting{}
  1947  		}
  1948  		return status
  1949  	}
  1950  
  1951  	convertContainerStatusResources := func(cName string, status *v1.ContainerStatus, cStatus *kubecontainer.Status, oldStatuses map[string]v1.ContainerStatus) *v1.ResourceRequirements {
  1952  		var requests, limits v1.ResourceList
  1953  		// oldStatus should always exist if container is running
  1954  		oldStatus, oldStatusFound := oldStatuses[cName]
  1955  		// Initialize limits/requests from container's spec upon transition to Running state
  1956  		// For cpu & memory, values queried from runtime via CRI always supercedes spec values
  1957  		// For ephemeral-storage, a running container's status.limit/request equals spec.limit/request
  1958  		determineResource := func(rName v1.ResourceName, v1ContainerResource, oldStatusResource, resource v1.ResourceList) {
  1959  			if oldStatusFound {
  1960  				if oldStatus.State.Running == nil || status.ContainerID != oldStatus.ContainerID {
  1961  					if r, exists := v1ContainerResource[rName]; exists {
  1962  						resource[rName] = r.DeepCopy()
  1963  					}
  1964  				} else {
  1965  					if oldStatusResource != nil {
  1966  						if r, exists := oldStatusResource[rName]; exists {
  1967  							resource[rName] = r.DeepCopy()
  1968  						}
  1969  					}
  1970  				}
  1971  			}
  1972  		}
  1973  		container := kubecontainer.GetContainerSpec(pod, cName)
  1974  		// AllocatedResources values come from checkpoint. It is the source-of-truth.
  1975  		found := false
  1976  		status.AllocatedResources, found = kl.statusManager.GetContainerResourceAllocation(string(pod.UID), cName)
  1977  		if !(container.Resources.Requests == nil && container.Resources.Limits == nil) && !found {
  1978  			// Log error and fallback to AllocatedResources in oldStatus if it exists
  1979  			klog.ErrorS(nil, "resource allocation not found in checkpoint store", "pod", pod.Name, "container", cName)
  1980  			if oldStatusFound {
  1981  				status.AllocatedResources = oldStatus.AllocatedResources
  1982  			}
  1983  		}
  1984  		if oldStatus.Resources == nil {
  1985  			oldStatus.Resources = &v1.ResourceRequirements{}
  1986  		}
  1987  		// Convert Limits
  1988  		if container.Resources.Limits != nil {
  1989  			limits = make(v1.ResourceList)
  1990  			if cStatus.Resources != nil && cStatus.Resources.CPULimit != nil {
  1991  				limits[v1.ResourceCPU] = cStatus.Resources.CPULimit.DeepCopy()
  1992  			} else {
  1993  				determineResource(v1.ResourceCPU, container.Resources.Limits, oldStatus.Resources.Limits, limits)
  1994  			}
  1995  			if cStatus.Resources != nil && cStatus.Resources.MemoryLimit != nil {
  1996  				limits[v1.ResourceMemory] = cStatus.Resources.MemoryLimit.DeepCopy()
  1997  			} else {
  1998  				determineResource(v1.ResourceMemory, container.Resources.Limits, oldStatus.Resources.Limits, limits)
  1999  			}
  2000  			if ephemeralStorage, found := container.Resources.Limits[v1.ResourceEphemeralStorage]; found {
  2001  				limits[v1.ResourceEphemeralStorage] = ephemeralStorage.DeepCopy()
  2002  			}
  2003  		}
  2004  		// Convert Requests
  2005  		if status.AllocatedResources != nil {
  2006  			requests = make(v1.ResourceList)
  2007  			if cStatus.Resources != nil && cStatus.Resources.CPURequest != nil {
  2008  				requests[v1.ResourceCPU] = cStatus.Resources.CPURequest.DeepCopy()
  2009  			} else {
  2010  				determineResource(v1.ResourceCPU, status.AllocatedResources, oldStatus.Resources.Requests, requests)
  2011  			}
  2012  			if memory, found := status.AllocatedResources[v1.ResourceMemory]; found {
  2013  				requests[v1.ResourceMemory] = memory.DeepCopy()
  2014  			}
  2015  			if ephemeralStorage, found := status.AllocatedResources[v1.ResourceEphemeralStorage]; found {
  2016  				requests[v1.ResourceEphemeralStorage] = ephemeralStorage.DeepCopy()
  2017  			}
  2018  		}
  2019  		//TODO(vinaykul,derekwaynecarr,InPlacePodVerticalScaling): Update this to include extended resources in
  2020  		// addition to CPU, memory, ephemeral storage. Add test case for extended resources.
  2021  		resources := &v1.ResourceRequirements{
  2022  			Limits:   limits,
  2023  			Requests: requests,
  2024  		}
  2025  		return resources
  2026  	}
  2027  
  2028  	// Fetch old containers statuses from old pod status.
  2029  	oldStatuses := make(map[string]v1.ContainerStatus, len(containers))
  2030  	for _, status := range previousStatus {
  2031  		oldStatuses[status.Name] = status
  2032  	}
  2033  
  2034  	// Set all container statuses to default waiting state
  2035  	statuses := make(map[string]*v1.ContainerStatus, len(containers))
  2036  	defaultWaitingState := v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: ContainerCreating}}
  2037  	if hasInitContainers {
  2038  		defaultWaitingState = v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: PodInitializing}}
  2039  	}
  2040  
  2041  	for _, container := range containers {
  2042  		status := &v1.ContainerStatus{
  2043  			Name:  container.Name,
  2044  			Image: container.Image,
  2045  			State: defaultWaitingState,
  2046  		}
  2047  		oldStatus, found := oldStatuses[container.Name]
  2048  		if found {
  2049  			if oldStatus.State.Terminated != nil {
  2050  				status = &oldStatus
  2051  			} else {
  2052  				// Apply some values from the old statuses as the default values.
  2053  				status.RestartCount = oldStatus.RestartCount
  2054  				status.LastTerminationState = oldStatus.LastTerminationState
  2055  			}
  2056  		}
  2057  		statuses[container.Name] = status
  2058  	}
  2059  
  2060  	for _, container := range containers {
  2061  		found := false
  2062  		for _, cStatus := range podStatus.ContainerStatuses {
  2063  			if container.Name == cStatus.Name {
  2064  				found = true
  2065  				break
  2066  			}
  2067  		}
  2068  		if found {
  2069  			continue
  2070  		}
  2071  		// if no container is found, then assuming it should be waiting seems plausible, but the status code requires
  2072  		// that a previous termination be present.  If we're offline long enough or something removed the container, then
  2073  		// the previous termination may not be present.  This next code block ensures that if the container was previously running
  2074  		// then when that container status disappears, we can infer that it terminated even if we don't know the status code.
  2075  		// By setting the lasttermination state we are able to leave the container status waiting and present more accurate
  2076  		// data via the API.
  2077  
  2078  		oldStatus, ok := oldStatuses[container.Name]
  2079  		if !ok {
  2080  			continue
  2081  		}
  2082  		if oldStatus.State.Terminated != nil {
  2083  			// if the old container status was terminated, the lasttermination status is correct
  2084  			continue
  2085  		}
  2086  		if oldStatus.State.Running == nil {
  2087  			// if the old container status isn't running, then waiting is an appropriate status and we have nothing to do
  2088  			continue
  2089  		}
  2090  
  2091  		// If we're here, we know the pod was previously running, but doesn't have a terminated status. We will check now to
  2092  		// see if it's in a pending state.
  2093  		status := statuses[container.Name]
  2094  		// If the status we're about to write indicates the default, the Waiting status will force this pod back into Pending.
  2095  		// That isn't true, we know the pod was previously running.
  2096  		isDefaultWaitingStatus := status.State.Waiting != nil && status.State.Waiting.Reason == ContainerCreating
  2097  		if hasInitContainers {
  2098  			isDefaultWaitingStatus = status.State.Waiting != nil && status.State.Waiting.Reason == PodInitializing
  2099  		}
  2100  		if !isDefaultWaitingStatus {
  2101  			// the status was written, don't override
  2102  			continue
  2103  		}
  2104  		if status.LastTerminationState.Terminated != nil {
  2105  			// if we already have a termination state, nothing to do
  2106  			continue
  2107  		}
  2108  
  2109  		// setting this value ensures that we show as stopped here, not as waiting:
  2110  		// https://github.com/kubernetes/kubernetes/blob/90c9f7b3e198e82a756a68ffeac978a00d606e55/pkg/kubelet/kubelet_pods.go#L1440-L1445
  2111  		// This prevents the pod from becoming pending
  2112  		status.LastTerminationState.Terminated = &v1.ContainerStateTerminated{
  2113  			Reason:   "ContainerStatusUnknown",
  2114  			Message:  "The container could not be located when the pod was deleted.  The container used to be Running",
  2115  			ExitCode: 137,
  2116  		}
  2117  
  2118  		// If the pod was not deleted, then it's been restarted. Increment restart count.
  2119  		if pod.DeletionTimestamp == nil {
  2120  			status.RestartCount += 1
  2121  		}
  2122  
  2123  		statuses[container.Name] = status
  2124  	}
  2125  
  2126  	// Copy the slice before sorting it
  2127  	containerStatusesCopy := make([]*kubecontainer.Status, len(podStatus.ContainerStatuses))
  2128  	copy(containerStatusesCopy, podStatus.ContainerStatuses)
  2129  
  2130  	// Make the latest container status comes first.
  2131  	sort.Sort(sort.Reverse(kubecontainer.SortContainerStatusesByCreationTime(containerStatusesCopy)))
  2132  	// Set container statuses according to the statuses seen in pod status
  2133  	containerSeen := map[string]int{}
  2134  	for _, cStatus := range containerStatusesCopy {
  2135  		cName := cStatus.Name
  2136  		if _, ok := statuses[cName]; !ok {
  2137  			// This would also ignore the infra container.
  2138  			continue
  2139  		}
  2140  		if containerSeen[cName] >= 2 {
  2141  			continue
  2142  		}
  2143  		var oldStatusPtr *v1.ContainerStatus
  2144  		if oldStatus, ok := oldStatuses[cName]; ok {
  2145  			oldStatusPtr = &oldStatus
  2146  		}
  2147  		status := convertContainerStatus(cStatus, oldStatusPtr)
  2148  		if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
  2149  			if status.State.Running != nil {
  2150  				status.Resources = convertContainerStatusResources(cName, status, cStatus, oldStatuses)
  2151  			}
  2152  		}
  2153  		if containerSeen[cName] == 0 {
  2154  			statuses[cName] = status
  2155  		} else {
  2156  			statuses[cName].LastTerminationState = status.State
  2157  		}
  2158  		containerSeen[cName] = containerSeen[cName] + 1
  2159  	}
  2160  
  2161  	// Handle the containers failed to be started, which should be in Waiting state.
  2162  	for _, container := range containers {
  2163  		if isInitContainer {
  2164  			// If the init container is terminated with exit code 0, it won't be restarted.
  2165  			// TODO(random-liu): Handle this in a cleaner way.
  2166  			s := podStatus.FindContainerStatusByName(container.Name)
  2167  			if s != nil && s.State == kubecontainer.ContainerStateExited && s.ExitCode == 0 {
  2168  				continue
  2169  			}
  2170  		}
  2171  		// If a container should be restarted in next syncpod, it is *Waiting*.
  2172  		if !kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) {
  2173  			continue
  2174  		}
  2175  		status := statuses[container.Name]
  2176  		reason, ok := kl.reasonCache.Get(pod.UID, container.Name)
  2177  		if !ok {
  2178  			// In fact, we could also apply Waiting state here, but it is less informative,
  2179  			// and the container will be restarted soon, so we prefer the original state here.
  2180  			// Note that with the current implementation of ShouldContainerBeRestarted the original state here
  2181  			// could be:
  2182  			//   * Waiting: There is no associated historical container and start failure reason record.
  2183  			//   * Terminated: The container is terminated.
  2184  			continue
  2185  		}
  2186  		if status.State.Terminated != nil {
  2187  			status.LastTerminationState = status.State
  2188  		}
  2189  		status.State = v1.ContainerState{
  2190  			Waiting: &v1.ContainerStateWaiting{
  2191  				Reason:  reason.Err.Error(),
  2192  				Message: reason.Message,
  2193  			},
  2194  		}
  2195  		statuses[container.Name] = status
  2196  	}
  2197  
  2198  	// Sort the container statuses since clients of this interface expect the list
  2199  	// of containers in a pod has a deterministic order.
  2200  	if isInitContainer {
  2201  		return kubetypes.SortStatusesOfInitContainers(pod, statuses)
  2202  	}
  2203  	containerStatuses := make([]v1.ContainerStatus, 0, len(statuses))
  2204  	for _, status := range statuses {
  2205  		containerStatuses = append(containerStatuses, *status)
  2206  	}
  2207  
  2208  	sort.Sort(kubetypes.SortedContainerStatuses(containerStatuses))
  2209  	return containerStatuses
  2210  }
  2211  
// ServeLogs returns logs of current machine.
// It delegates the HTTP request directly to the kubelet's log file server,
// which exposes node-level log files over the kubelet API.
func (kl *Kubelet) ServeLogs(w http.ResponseWriter, req *http.Request) {
	// TODO: allowlist logs we are willing to serve
	kl.logServer.ServeHTTP(w, req)
}
  2217  
  2218  // findContainer finds and returns the container with the given pod ID, full name, and container name.
  2219  // It returns nil if not found.
  2220  func (kl *Kubelet) findContainer(ctx context.Context, podFullName string, podUID types.UID, containerName string) (*kubecontainer.Container, error) {
  2221  	pods, err := kl.containerRuntime.GetPods(ctx, false)
  2222  	if err != nil {
  2223  		return nil, err
  2224  	}
  2225  	// Resolve and type convert back again.
  2226  	// We need the static pod UID but the kubecontainer API works with types.UID.
  2227  	podUID = types.UID(kl.podManager.TranslatePodUID(podUID))
  2228  	pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID)
  2229  	return pod.FindContainerByName(containerName), nil
  2230  }
  2231  
  2232  // RunInContainer runs a command in a container, returns the combined stdout, stderr as an array of bytes
  2233  func (kl *Kubelet) RunInContainer(ctx context.Context, podFullName string, podUID types.UID, containerName string, cmd []string) ([]byte, error) {
  2234  	container, err := kl.findContainer(ctx, podFullName, podUID, containerName)
  2235  	if err != nil {
  2236  		return nil, err
  2237  	}
  2238  	if container == nil {
  2239  		return nil, fmt.Errorf("container not found (%q)", containerName)
  2240  	}
  2241  	// TODO(tallclair): Pass a proper timeout value.
  2242  	return kl.runner.RunInContainer(ctx, container.ID, cmd, 0)
  2243  }
  2244  
  2245  // GetExec gets the URL the exec will be served from, or nil if the Kubelet will serve it.
  2246  func (kl *Kubelet) GetExec(ctx context.Context, podFullName string, podUID types.UID, containerName string, cmd []string, streamOpts remotecommandserver.Options) (*url.URL, error) {
  2247  	container, err := kl.findContainer(ctx, podFullName, podUID, containerName)
  2248  	if err != nil {
  2249  		return nil, err
  2250  	}
  2251  	if container == nil {
  2252  		return nil, fmt.Errorf("container not found (%q)", containerName)
  2253  	}
  2254  	return kl.streamingRuntime.GetExec(ctx, container.ID, cmd, streamOpts.Stdin, streamOpts.Stdout, streamOpts.Stderr, streamOpts.TTY)
  2255  }
  2256  
  2257  // GetAttach gets the URL the attach will be served from, or nil if the Kubelet will serve it.
  2258  func (kl *Kubelet) GetAttach(ctx context.Context, podFullName string, podUID types.UID, containerName string, streamOpts remotecommandserver.Options) (*url.URL, error) {
  2259  	container, err := kl.findContainer(ctx, podFullName, podUID, containerName)
  2260  	if err != nil {
  2261  		return nil, err
  2262  	}
  2263  	if container == nil {
  2264  		return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName)
  2265  	}
  2266  
  2267  	// The TTY setting for attach must match the TTY setting in the initial container configuration,
  2268  	// since whether the process is running in a TTY cannot be changed after it has started.  We
  2269  	// need the api.Pod to get the TTY status.
  2270  	pod, found := kl.GetPodByFullName(podFullName)
  2271  	if !found || (string(podUID) != "" && pod.UID != podUID) {
  2272  		return nil, fmt.Errorf("pod %s not found", podFullName)
  2273  	}
  2274  	containerSpec := kubecontainer.GetContainerSpec(pod, containerName)
  2275  	if containerSpec == nil {
  2276  		return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName)
  2277  	}
  2278  	tty := containerSpec.TTY
  2279  
  2280  	return kl.streamingRuntime.GetAttach(ctx, container.ID, streamOpts.Stdin, streamOpts.Stdout, streamOpts.Stderr, tty)
  2281  }
  2282  
  2283  // GetPortForward gets the URL the port-forward will be served from, or nil if the Kubelet will serve it.
  2284  func (kl *Kubelet) GetPortForward(ctx context.Context, podName, podNamespace string, podUID types.UID, portForwardOpts portforward.V4Options) (*url.URL, error) {
  2285  	pods, err := kl.containerRuntime.GetPods(ctx, false)
  2286  	if err != nil {
  2287  		return nil, err
  2288  	}
  2289  	// Resolve and type convert back again.
  2290  	// We need the static pod UID but the kubecontainer API works with types.UID.
  2291  	podUID = types.UID(kl.podManager.TranslatePodUID(podUID))
  2292  	podFullName := kubecontainer.BuildPodFullName(podName, podNamespace)
  2293  	pod := kubecontainer.Pods(pods).FindPod(podFullName, podUID)
  2294  	if pod.IsEmpty() {
  2295  		return nil, fmt.Errorf("pod not found (%q)", podFullName)
  2296  	}
  2297  
  2298  	return kl.streamingRuntime.GetPortForward(ctx, podName, podNamespace, podUID, portForwardOpts.Ports)
  2299  }
  2300  
  2301  // cleanupOrphanedPodCgroups removes cgroups that should no longer exist.
  2302  // it reconciles the cached state of cgroupPods with the specified list of runningPods
  2303  func (kl *Kubelet) cleanupOrphanedPodCgroups(pcm cm.PodContainerManager, cgroupPods map[types.UID]cm.CgroupName, possiblyRunningPods map[types.UID]sets.Empty) {
  2304  	// Iterate over all the found pods to verify if they should be running
  2305  	for uid, val := range cgroupPods {
  2306  		// if the pod is in the running set, its not a candidate for cleanup
  2307  		if _, ok := possiblyRunningPods[uid]; ok {
  2308  			continue
  2309  		}
  2310  
  2311  		// If volumes have not been unmounted/detached, do not delete the cgroup
  2312  		// so any memory backed volumes don't have their charges propagated to the
  2313  		// parent croup.  If the volumes still exist, reduce the cpu shares for any
  2314  		// process in the cgroup to the minimum value while we wait.  if the kubelet
  2315  		// is configured to keep terminated volumes, we will delete the cgroup and not block.
  2316  		if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist && !kl.keepTerminatedPodVolumes {
  2317  			klog.V(3).InfoS("Orphaned pod found, but volumes not yet removed.  Reducing cpu to minimum", "podUID", uid)
  2318  			if err := pcm.ReduceCPULimits(val); err != nil {
  2319  				klog.InfoS("Failed to reduce cpu time for pod pending volume cleanup", "podUID", uid, "err", err)
  2320  			}
  2321  			continue
  2322  		}
  2323  		klog.V(3).InfoS("Orphaned pod found, removing pod cgroups", "podUID", uid)
  2324  		// Destroy all cgroups of pod that should not be running,
  2325  		// by first killing all the attached processes to these cgroups.
  2326  		// We ignore errors thrown by the method, as the housekeeping loop would
  2327  		// again try to delete these unwanted pod cgroups
  2328  		go pcm.Destroy(val)
  2329  	}
  2330  }