k8s.io/kubernetes@v1.29.3/pkg/kubelet/kubelet.go

k8s.io/kubernetes@v1.29.3/pkg/kubelet/kubelet.go (about)

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package kubelet
    18  
    19  import (
    20  	"context"
    21  	"crypto/tls"
    22  	"fmt"
    23  	"math"
    24  	"net"
    25  	"net/http"
    26  	"os"
    27  	"path/filepath"
    28  	sysruntime "runtime"
    29  	"sort"
    30  	"sync"
    31  	"sync/atomic"
    32  	"time"
    33  
    34  	cadvisorapi "github.com/google/cadvisor/info/v1"
    35  	"github.com/google/go-cmp/cmp"
    36  	libcontaineruserns "github.com/opencontainers/runc/libcontainer/userns"
    37  	"github.com/opencontainers/selinux/go-selinux"
    38  	"go.opentelemetry.io/otel/attribute"
    39  	semconv "go.opentelemetry.io/otel/semconv/v1.12.0"
    40  	"go.opentelemetry.io/otel/trace"
    41  	"k8s.io/client-go/informers"
    42  
    43  	"k8s.io/mount-utils"
    44  	"k8s.io/utils/integer"
    45  	netutils "k8s.io/utils/net"
    46  
    47  	v1 "k8s.io/api/core/v1"
    48  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    49  	"k8s.io/apimachinery/pkg/fields"
    50  	"k8s.io/apimachinery/pkg/labels"
    51  	"k8s.io/apimachinery/pkg/types"
    52  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    53  	"k8s.io/apimachinery/pkg/util/sets"
    54  	"k8s.io/apimachinery/pkg/util/wait"
    55  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    56  	clientset "k8s.io/client-go/kubernetes"
    57  	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
    58  	corelisters "k8s.io/client-go/listers/core/v1"
    59  	"k8s.io/client-go/tools/cache"
    60  	"k8s.io/client-go/tools/record"
    61  	"k8s.io/client-go/util/certificate"
    62  	"k8s.io/client-go/util/flowcontrol"
    63  	cloudprovider "k8s.io/cloud-provider"
    64  	"k8s.io/component-helpers/apimachinery/lease"
    65  	internalapi "k8s.io/cri-api/pkg/apis"
    66  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    67  	"k8s.io/klog/v2"
    68  	pluginwatcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
    69  	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
    70  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    71  	"k8s.io/kubernetes/pkg/api/v1/resource"
    72  	"k8s.io/kubernetes/pkg/features"
    73  	kubeletconfiginternal "k8s.io/kubernetes/pkg/kubelet/apis/config"
    74  	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
    75  	"k8s.io/kubernetes/pkg/kubelet/cadvisor"
    76  	kubeletcertificate "k8s.io/kubernetes/pkg/kubelet/certificate"
    77  	"k8s.io/kubernetes/pkg/kubelet/cloudresource"
    78  	"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
    79  	"k8s.io/kubernetes/pkg/kubelet/cm"
    80  	draplugin "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
    81  	"k8s.io/kubernetes/pkg/kubelet/config"
    82  	"k8s.io/kubernetes/pkg/kubelet/configmap"
    83  	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    84  	"k8s.io/kubernetes/pkg/kubelet/cri/remote"
    85  	"k8s.io/kubernetes/pkg/kubelet/events"
    86  	"k8s.io/kubernetes/pkg/kubelet/eviction"
    87  	"k8s.io/kubernetes/pkg/kubelet/images"
    88  	"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
    89  	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
    90  	"k8s.io/kubernetes/pkg/kubelet/logs"
    91  	"k8s.io/kubernetes/pkg/kubelet/metrics"
    92  	"k8s.io/kubernetes/pkg/kubelet/metrics/collectors"
    93  	"k8s.io/kubernetes/pkg/kubelet/network/dns"
    94  	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
    95  	oomwatcher "k8s.io/kubernetes/pkg/kubelet/oom"
    96  	"k8s.io/kubernetes/pkg/kubelet/pleg"
    97  	"k8s.io/kubernetes/pkg/kubelet/pluginmanager"
    98  	plugincache "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
    99  	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
   100  	"k8s.io/kubernetes/pkg/kubelet/preemption"
   101  	"k8s.io/kubernetes/pkg/kubelet/prober"
   102  	proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
   103  	"k8s.io/kubernetes/pkg/kubelet/runtimeclass"
   104  	"k8s.io/kubernetes/pkg/kubelet/secret"
   105  	"k8s.io/kubernetes/pkg/kubelet/server"
   106  	servermetrics "k8s.io/kubernetes/pkg/kubelet/server/metrics"
   107  	serverstats "k8s.io/kubernetes/pkg/kubelet/server/stats"
   108  	"k8s.io/kubernetes/pkg/kubelet/stats"
   109  	"k8s.io/kubernetes/pkg/kubelet/status"
   110  	"k8s.io/kubernetes/pkg/kubelet/sysctl"
   111  	"k8s.io/kubernetes/pkg/kubelet/token"
   112  	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
   113  	"k8s.io/kubernetes/pkg/kubelet/userns"
   114  	"k8s.io/kubernetes/pkg/kubelet/util"
   115  	"k8s.io/kubernetes/pkg/kubelet/util/manager"
   116  	"k8s.io/kubernetes/pkg/kubelet/util/queue"
   117  	"k8s.io/kubernetes/pkg/kubelet/util/sliceutils"
   118  	"k8s.io/kubernetes/pkg/kubelet/volumemanager"
   119  	httpprobe "k8s.io/kubernetes/pkg/probe/http"
   120  	"k8s.io/kubernetes/pkg/security/apparmor"
   121  	"k8s.io/kubernetes/pkg/util/oom"
   122  	"k8s.io/kubernetes/pkg/volume"
   123  	"k8s.io/kubernetes/pkg/volume/csi"
   124  	"k8s.io/kubernetes/pkg/volume/util/hostutil"
   125  	"k8s.io/kubernetes/pkg/volume/util/subpath"
   126  	"k8s.io/kubernetes/pkg/volume/util/volumepathhandler"
   127  	"k8s.io/utils/clock"
   128  )
   129  
const (
	// maxWaitForContainerRuntime is the max amount of time to wait for the
	// container runtime to come up.
	maxWaitForContainerRuntime = 30 * time.Second

	// nodeStatusUpdateRetry specifies how many times kubelet retries when posting node status failed.
	nodeStatusUpdateRetry = 5

	// nodeReadyGracePeriod is the period to allow for before fast status update is
	// terminated and container runtime not being ready is logged without verbosity guard.
	nodeReadyGracePeriod = 120 * time.Second

	// DefaultContainerLogsDir is the location of container logs.
	DefaultContainerLogsDir = "/var/log/containers"

	// MaxContainerBackOff is the max backoff period, exported for the e2e test.
	MaxContainerBackOff = 300 * time.Second

	// housekeepingPeriod is the period for performing global cleanup tasks.
	housekeepingPeriod = time.Second * 2

	// housekeepingWarningDuration is the duration at which housekeeping failed
	// to satisfy the invariant that housekeeping should be fast to avoid
	// blocking pod config (while housekeeping is running no new pods are
	// started or deleted).
	housekeepingWarningDuration = time.Second * 1

	// runtimeCacheRefreshPeriod is the period after which the runtime cache
	// expires - set to slightly longer than the expected length between
	// housekeeping periods, which explicitly refreshes the cache.
	runtimeCacheRefreshPeriod = housekeepingPeriod + housekeepingWarningDuration

	// evictionMonitoringPeriod is the period for performing eviction monitoring.
	// Ensure this is kept in sync with internal cadvisor housekeeping.
	evictionMonitoringPeriod = time.Second * 10

	// The paths in containers' filesystems where the hosts file is mounted;
	// getContainerEtcHostsPath chooses between them based on the operating system.
	linuxEtcHostsPath   = "/etc/hosts"
	windowsEtcHostsPath = "C:\\Windows\\System32\\drivers\\etc\\hosts"

	// plegChannelCapacity is the capacity of the channel for receiving pod
	// lifecycle events. This number is a bit arbitrary and may be adjusted in
	// the future.
	plegChannelCapacity = 1000

	// Generic PLEG relies on relisting for discovering container events.
	// A longer period means that kubelet will take longer to detect container
	// changes and to update pod status. On the other hand, a shorter period
	// will cause more frequent relisting (e.g., container runtime operations),
	// leading to higher cpu usage.
	// Note that even though we set the period to 1s, the relisting itself can
	// take more than 1s to finish if the container runtime responds slowly
	// and/or when there are many container changes in one cycle.
	genericPlegRelistPeriod    = time.Second * 1
	genericPlegRelistThreshold = time.Minute * 3

	// Generic PLEG relist period and threshold when used with Evented PLEG.
	// NOTE(review): eventedPlegMaxStreamRetries presumably bounds reconnection
	// attempts of the evented PLEG stream - confirm against its user.
	eventedPlegRelistPeriod     = time.Second * 300
	eventedPlegRelistThreshold  = time.Minute * 10
	eventedPlegMaxStreamRetries = 5

	// backOffPeriod is the period to back off when pod syncing results in an
	// error. It is also used as the base period for the exponential backoff
	// of container restarts and image pulls.
	backOffPeriod = time.Second * 10

	// ContainerGCPeriod is the period for performing container garbage collection.
	ContainerGCPeriod = time.Minute
	// ImageGCPeriod is the period for performing image garbage collection.
	ImageGCPeriod = 5 * time.Minute

	// minDeadContainerInPod is the minimum number of dead containers to keep in a pod.
	minDeadContainerInPod = 1

	// nodeLeaseRenewIntervalFraction is the fraction of lease duration to renew the lease
	nodeLeaseRenewIntervalFraction = 0.25

	// instrumentationScope is the name of OpenTelemetry instrumentation scope
	instrumentationScope = "k8s.io/kubernetes/pkg/kubelet"
)
   207  
// Package-level paths. etcHostsPath is initialized from
// getContainerEtcHostsPath, which picks the Windows or Linux hosts-file path.
var (
	// ContainerLogsDir is the container logs directory; it starts out as
	// DefaultContainerLogsDir and can be overwritten for testing usage.
	ContainerLogsDir = DefaultContainerLogsDir
	etcHostsPath     = getContainerEtcHostsPath()
)
   213  
   214  func getContainerEtcHostsPath() string {
   215  	if sysruntime.GOOS == "windows" {
   216  		return windowsEtcHostsPath
   217  	}
   218  	return linuxEtcHostsPath
   219  }
   220  
// SyncHandler is an interface implemented by Kubelet, for testability.
// Every method except HandlePodCleanups receives a batch of pods; the method
// name indicates which kind of pod change the batch represents.
type SyncHandler interface {
	HandlePodAdditions(pods []*v1.Pod)
	HandlePodUpdates(pods []*v1.Pod)
	HandlePodRemoves(pods []*v1.Pod)
	HandlePodReconcile(pods []*v1.Pod)
	HandlePodSyncs(pods []*v1.Pod)
	// HandlePodCleanups takes a context instead of a pod list and is the only
	// handler that reports an error.
	HandlePodCleanups(ctx context.Context) error
}
   230  
// Option is a functional option type for Kubelet: a function that is handed
// the *Kubelet it should adjust. Options are carried in Dependencies.Options.
type Option func(*Kubelet)
   233  
// Bootstrap is a bootstrapping interface for kubelet, targets the initialization protocol.
// It groups the entry points used to bring a kubelet up: reading its
// configuration, starting garbage collection, serving its endpoints, and
// running against a channel of pod updates.
type Bootstrap interface {
	GetConfiguration() kubeletconfiginternal.KubeletConfiguration
	BirthCry()
	StartGarbageCollection()
	ListenAndServe(kubeCfg *kubeletconfiginternal.KubeletConfiguration, tlsOptions *server.TLSOptions, auth server.AuthInterface, tp trace.TracerProvider)
	ListenAndServeReadOnly(address net.IP, port uint)
	ListenAndServePodResources()
	// Run and RunOnce both consume pod updates from the given channel; only
	// RunOnce returns per-pod results and an error.
	Run(<-chan kubetypes.PodUpdate)
	RunOnce(<-chan kubetypes.PodUpdate) ([]RunPodResult, error)
}
   245  
// Dependencies is a bin for things we might consider "injected dependencies" -- objects constructed
// at runtime that are necessary for running the Kubelet. This is a temporary solution for grouping
// these objects while we figure out a more comprehensive dependency injection story for the Kubelet.
type Dependencies struct {
	// Options holds the functional options (see Option) supplied for the Kubelet.
	Options []Option

	// Injected Dependencies
	//
	// KubeClient may be nil, in which case the kubelet runs in standalone mode
	// (no API server). RemoteRuntimeService and RemoteImageService are filled
	// in by PreInitRuntimeService.
	Auth                      server.AuthInterface
	CAdvisorInterface         cadvisor.Interface
	Cloud                     cloudprovider.Interface
	ContainerManager          cm.ContainerManager
	EventClient               v1core.EventsGetter
	HeartbeatClient           clientset.Interface
	OnHeartbeatFailure        func()
	KubeClient                clientset.Interface
	Mounter                   mount.Interface
	HostUtil                  hostutil.HostUtils
	OOMAdjuster               *oom.OOMAdjuster
	OSInterface               kubecontainer.OSInterface
	PodConfig                 *config.PodConfig
	ProbeManager              prober.Manager
	Recorder                  record.EventRecorder
	Subpather                 subpath.Interface
	TracerProvider            trace.TracerProvider
	VolumePlugins             []volume.VolumePlugin
	DynamicPluginProber       volume.DynamicPluginProber
	TLSOptions                *server.TLSOptions
	RemoteRuntimeService      internalapi.RuntimeService
	RemoteImageService        internalapi.ImageManagerService
	PodStartupLatencyTracker  util.PodStartupLatencyTracker
	NodeStartupLatencyTracker util.NodeStartupLatencyTracker
	// useLegacyCadvisorStats caches the result of
	// cadvisor.UsingLegacyCadvisorStats for the configured container runtime
	// endpoint (set in PreInitRuntimeService). Remove it once
	// cadvisor.UsingLegacyCadvisorStats is dropped.
	useLegacyCadvisorStats bool
}
   280  
   281  // makePodSourceConfig creates a config.PodConfig from the given
   282  // KubeletConfiguration or returns an error.
   283  func makePodSourceConfig(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies, nodeName types.NodeName, nodeHasSynced func() bool) (*config.PodConfig, error) {
   284  	manifestURLHeader := make(http.Header)
   285  	if len(kubeCfg.StaticPodURLHeader) > 0 {
   286  		for k, v := range kubeCfg.StaticPodURLHeader {
   287  			for i := range v {
   288  				manifestURLHeader.Add(k, v[i])
   289  			}
   290  		}
   291  	}
   292  
   293  	// source of all configuration
   294  	cfg := config.NewPodConfig(config.PodConfigNotificationIncremental, kubeDeps.Recorder, kubeDeps.PodStartupLatencyTracker)
   295  
   296  	// TODO:  it needs to be replaced by a proper context in the future
   297  	ctx := context.TODO()
   298  
   299  	// define file config source
   300  	if kubeCfg.StaticPodPath != "" {
   301  		klog.InfoS("Adding static pod path", "path", kubeCfg.StaticPodPath)
   302  		config.NewSourceFile(kubeCfg.StaticPodPath, nodeName, kubeCfg.FileCheckFrequency.Duration, cfg.Channel(ctx, kubetypes.FileSource))
   303  	}
   304  
   305  	// define url config source
   306  	if kubeCfg.StaticPodURL != "" {
   307  		klog.InfoS("Adding pod URL with HTTP header", "URL", kubeCfg.StaticPodURL, "header", manifestURLHeader)
   308  		config.NewSourceURL(kubeCfg.StaticPodURL, manifestURLHeader, nodeName, kubeCfg.HTTPCheckFrequency.Duration, cfg.Channel(ctx, kubetypes.HTTPSource))
   309  	}
   310  
   311  	if kubeDeps.KubeClient != nil {
   312  		klog.InfoS("Adding apiserver pod source")
   313  		config.NewSourceApiserver(kubeDeps.KubeClient, nodeName, nodeHasSynced, cfg.Channel(ctx, kubetypes.ApiserverSource))
   314  	}
   315  	return cfg, nil
   316  }
   317  
   318  // PreInitRuntimeService will init runtime service before RunKubelet.
   319  func PreInitRuntimeService(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies) error {
   320  	remoteImageEndpoint := kubeCfg.ImageServiceEndpoint
   321  	if remoteImageEndpoint == "" && kubeCfg.ContainerRuntimeEndpoint != "" {
   322  		remoteImageEndpoint = kubeCfg.ContainerRuntimeEndpoint
   323  	}
   324  	var err error
   325  	if kubeDeps.RemoteRuntimeService, err = remote.NewRemoteRuntimeService(kubeCfg.ContainerRuntimeEndpoint, kubeCfg.RuntimeRequestTimeout.Duration, kubeDeps.TracerProvider); err != nil {
   326  		return err
   327  	}
   328  	if kubeDeps.RemoteImageService, err = remote.NewRemoteImageService(remoteImageEndpoint, kubeCfg.RuntimeRequestTimeout.Duration, kubeDeps.TracerProvider); err != nil {
   329  		return err
   330  	}
   331  
   332  	kubeDeps.useLegacyCadvisorStats = cadvisor.UsingLegacyCadvisorStats(kubeCfg.ContainerRuntimeEndpoint)
   333  
   334  	return nil
   335  }
   336  
   337  // NewMainKubelet instantiates a new Kubelet object along with all the required internal modules.
   338  // No initialization of Kubelet and its modules should happen here.
   339  func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
   340  	kubeDeps *Dependencies,
   341  	crOptions *config.ContainerRuntimeOptions,
   342  	hostname string,
   343  	hostnameOverridden bool,
   344  	nodeName types.NodeName,
   345  	nodeIPs []net.IP,
   346  	providerID string,
   347  	cloudProvider string,
   348  	certDirectory string,
   349  	rootDirectory string,
   350  	imageCredentialProviderConfigFile string,
   351  	imageCredentialProviderBinDir string,
   352  	registerNode bool,
   353  	registerWithTaints []v1.Taint,
   354  	allowedUnsafeSysctls []string,
   355  	experimentalMounterPath string,
   356  	kernelMemcgNotification bool,
   357  	experimentalNodeAllocatableIgnoreEvictionThreshold bool,
   358  	minimumGCAge metav1.Duration,
   359  	maxPerPodContainerCount int32,
   360  	maxContainerCount int32,
   361  	registerSchedulable bool,
   362  	keepTerminatedPodVolumes bool,
   363  	nodeLabels map[string]string,
   364  	nodeStatusMaxImages int32,
   365  	seccompDefault bool,
   366  ) (*Kubelet, error) {
   367  	ctx := context.Background()
   368  	logger := klog.TODO()
   369  
   370  	if rootDirectory == "" {
   371  		return nil, fmt.Errorf("invalid root directory %q", rootDirectory)
   372  	}
   373  	if kubeCfg.SyncFrequency.Duration <= 0 {
   374  		return nil, fmt.Errorf("invalid sync frequency %d", kubeCfg.SyncFrequency.Duration)
   375  	}
   376  
   377  	if utilfeature.DefaultFeatureGate.Enabled(features.DisableCloudProviders) && cloudprovider.IsDeprecatedInternal(cloudProvider) {
   378  		cloudprovider.DisableWarningForProvider(cloudProvider)
   379  		return nil, fmt.Errorf("cloud provider %q was specified, but built-in cloud providers are disabled. Please set --cloud-provider=external and migrate to an external cloud provider", cloudProvider)
   380  	}
   381  
   382  	var nodeHasSynced cache.InformerSynced
   383  	var nodeLister corelisters.NodeLister
   384  
   385  	// If kubeClient == nil, we are running in standalone mode (i.e. no API servers)
   386  	// If not nil, we are running as part of a cluster and should sync w/API
   387  	if kubeDeps.KubeClient != nil {
   388  		kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0, informers.WithTweakListOptions(func(options *metav1.ListOptions) {
   389  			options.FieldSelector = fields.Set{metav1.ObjectNameField: string(nodeName)}.String()
   390  		}))
   391  		nodeLister = kubeInformers.Core().V1().Nodes().Lister()
   392  		nodeHasSynced = func() bool {
   393  			return kubeInformers.Core().V1().Nodes().Informer().HasSynced()
   394  		}
   395  		kubeInformers.Start(wait.NeverStop)
   396  		klog.InfoS("Attempting to sync node with API server")
   397  	} else {
   398  		// we don't have a client to sync!
   399  		nodeIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{})
   400  		nodeLister = corelisters.NewNodeLister(nodeIndexer)
   401  		nodeHasSynced = func() bool { return true }
   402  		klog.InfoS("Kubelet is running in standalone mode, will skip API server sync")
   403  	}
   404  
   405  	if kubeDeps.PodConfig == nil {
   406  		var err error
   407  		kubeDeps.PodConfig, err = makePodSourceConfig(kubeCfg, kubeDeps, nodeName, nodeHasSynced)
   408  		if err != nil {
   409  			return nil, err
   410  		}
   411  	}
   412  
   413  	containerGCPolicy := kubecontainer.GCPolicy{
   414  		MinAge:             minimumGCAge.Duration,
   415  		MaxPerPodContainer: int(maxPerPodContainerCount),
   416  		MaxContainers:      int(maxContainerCount),
   417  	}
   418  
   419  	daemonEndpoints := &v1.NodeDaemonEndpoints{
   420  		KubeletEndpoint: v1.DaemonEndpoint{Port: kubeCfg.Port},
   421  	}
   422  
   423  	imageGCPolicy := images.ImageGCPolicy{
   424  		MinAge:               kubeCfg.ImageMinimumGCAge.Duration,
   425  		HighThresholdPercent: int(kubeCfg.ImageGCHighThresholdPercent),
   426  		LowThresholdPercent:  int(kubeCfg.ImageGCLowThresholdPercent),
   427  	}
   428  
   429  	if utilfeature.DefaultFeatureGate.Enabled(features.ImageMaximumGCAge) {
   430  		imageGCPolicy.MaxAge = kubeCfg.ImageMaximumGCAge.Duration
   431  	} else if kubeCfg.ImageMaximumGCAge.Duration != 0 {
   432  		klog.InfoS("ImageMaximumGCAge flag enabled, but corresponding feature gate is not enabled. Ignoring flag.")
   433  	}
   434  
   435  	enforceNodeAllocatable := kubeCfg.EnforceNodeAllocatable
   436  	if experimentalNodeAllocatableIgnoreEvictionThreshold {
   437  		// Do not provide kubeCfg.EnforceNodeAllocatable to eviction threshold parsing if we are not enforcing Evictions
   438  		enforceNodeAllocatable = []string{}
   439  	}
   440  	thresholds, err := eviction.ParseThresholdConfig(enforceNodeAllocatable, kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim)
   441  	if err != nil {
   442  		return nil, err
   443  	}
   444  	evictionConfig := eviction.Config{
   445  		PressureTransitionPeriod: kubeCfg.EvictionPressureTransitionPeriod.Duration,
   446  		MaxPodGracePeriodSeconds: int64(kubeCfg.EvictionMaxPodGracePeriod),
   447  		Thresholds:               thresholds,
   448  		KernelMemcgNotification:  kernelMemcgNotification,
   449  		PodCgroupRoot:            kubeDeps.ContainerManager.GetPodCgroupRoot(),
   450  	}
   451  
   452  	var serviceLister corelisters.ServiceLister
   453  	var serviceHasSynced cache.InformerSynced
   454  	if kubeDeps.KubeClient != nil {
   455  		kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0)
   456  		serviceLister = kubeInformers.Core().V1().Services().Lister()
   457  		serviceHasSynced = kubeInformers.Core().V1().Services().Informer().HasSynced
   458  		kubeInformers.Start(wait.NeverStop)
   459  	} else {
   460  		serviceIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
   461  		serviceLister = corelisters.NewServiceLister(serviceIndexer)
   462  		serviceHasSynced = func() bool { return true }
   463  	}
   464  
   465  	// construct a node reference used for events
   466  	nodeRef := &v1.ObjectReference{
   467  		Kind:      "Node",
   468  		Name:      string(nodeName),
   469  		UID:       types.UID(nodeName),
   470  		Namespace: "",
   471  	}
   472  
   473  	oomWatcher, err := oomwatcher.NewWatcher(kubeDeps.Recorder)
   474  	if err != nil {
   475  		if libcontaineruserns.RunningInUserNS() {
   476  			if utilfeature.DefaultFeatureGate.Enabled(features.KubeletInUserNamespace) {
   477  				// oomwatcher.NewWatcher returns "open /dev/kmsg: operation not permitted" error,
   478  				// when running in a user namespace with sysctl value `kernel.dmesg_restrict=1`.
   479  				klog.V(2).InfoS("Failed to create an oomWatcher (running in UserNS, ignoring)", "err", err)
   480  				oomWatcher = nil
   481  			} else {
   482  				klog.ErrorS(err, "Failed to create an oomWatcher (running in UserNS, Hint: enable KubeletInUserNamespace feature flag to ignore the error)")
   483  				return nil, err
   484  			}
   485  		} else {
   486  			return nil, err
   487  		}
   488  	}
   489  
   490  	clusterDNS := make([]net.IP, 0, len(kubeCfg.ClusterDNS))
   491  	for _, ipEntry := range kubeCfg.ClusterDNS {
   492  		ip := netutils.ParseIPSloppy(ipEntry)
   493  		if ip == nil {
   494  			klog.InfoS("Invalid clusterDNS IP", "IP", ipEntry)
   495  		} else {
   496  			clusterDNS = append(clusterDNS, ip)
   497  		}
   498  	}
   499  
   500  	// A TLS transport is needed to make HTTPS-based container lifecycle requests,
   501  	// but we do not have the information necessary to do TLS verification.
   502  	//
   503  	// This client must not be modified to include credentials, because it is
   504  	// critical that credentials not leak from the client to arbitrary hosts.
   505  	insecureContainerLifecycleHTTPClient := &http.Client{}
   506  	if utilfeature.DefaultFeatureGate.Enabled(features.ConsistentHTTPGetHandlers) {
   507  		insecureTLSTransport := &http.Transport{
   508  			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
   509  		}
   510  		insecureContainerLifecycleHTTPClient.Transport = insecureTLSTransport
   511  		insecureContainerLifecycleHTTPClient.CheckRedirect = httpprobe.RedirectChecker(false)
   512  	}
   513  
   514  	tracer := kubeDeps.TracerProvider.Tracer(instrumentationScope)
   515  
   516  	klet := &Kubelet{
   517  		hostname:                       hostname,
   518  		hostnameOverridden:             hostnameOverridden,
   519  		nodeName:                       nodeName,
   520  		kubeClient:                     kubeDeps.KubeClient,
   521  		heartbeatClient:                kubeDeps.HeartbeatClient,
   522  		onRepeatedHeartbeatFailure:     kubeDeps.OnHeartbeatFailure,
   523  		rootDirectory:                  filepath.Clean(rootDirectory),
   524  		resyncInterval:                 kubeCfg.SyncFrequency.Duration,
   525  		sourcesReady:                   config.NewSourcesReady(kubeDeps.PodConfig.SeenAllSources),
   526  		registerNode:                   registerNode,
   527  		registerWithTaints:             registerWithTaints,
   528  		registerSchedulable:            registerSchedulable,
   529  		dnsConfigurer:                  dns.NewConfigurer(kubeDeps.Recorder, nodeRef, nodeIPs, clusterDNS, kubeCfg.ClusterDomain, kubeCfg.ResolverConfig),
   530  		serviceLister:                  serviceLister,
   531  		serviceHasSynced:               serviceHasSynced,
   532  		nodeLister:                     nodeLister,
   533  		nodeHasSynced:                  nodeHasSynced,
   534  		streamingConnectionIdleTimeout: kubeCfg.StreamingConnectionIdleTimeout.Duration,
   535  		recorder:                       kubeDeps.Recorder,
   536  		cadvisor:                       kubeDeps.CAdvisorInterface,
   537  		cloud:                          kubeDeps.Cloud,
   538  		externalCloudProvider:          cloudprovider.IsExternal(cloudProvider),
   539  		providerID:                     providerID,
   540  		nodeRef:                        nodeRef,
   541  		nodeLabels:                     nodeLabels,
   542  		nodeStatusUpdateFrequency:      kubeCfg.NodeStatusUpdateFrequency.Duration,
   543  		nodeStatusReportFrequency:      kubeCfg.NodeStatusReportFrequency.Duration,
   544  		os:                             kubeDeps.OSInterface,
   545  		oomWatcher:                     oomWatcher,
   546  		cgroupsPerQOS:                  kubeCfg.CgroupsPerQOS,
   547  		cgroupRoot:                     kubeCfg.CgroupRoot,
   548  		mounter:                        kubeDeps.Mounter,
   549  		hostutil:                       kubeDeps.HostUtil,
   550  		subpather:                      kubeDeps.Subpather,
   551  		maxPods:                        int(kubeCfg.MaxPods),
   552  		podsPerCore:                    int(kubeCfg.PodsPerCore),
   553  		syncLoopMonitor:                atomic.Value{},
   554  		daemonEndpoints:                daemonEndpoints,
   555  		containerManager:               kubeDeps.ContainerManager,
   556  		nodeIPs:                        nodeIPs,
   557  		nodeIPValidator:                validateNodeIP,
   558  		clock:                          clock.RealClock{},
   559  		enableControllerAttachDetach:   kubeCfg.EnableControllerAttachDetach,
   560  		makeIPTablesUtilChains:         kubeCfg.MakeIPTablesUtilChains,
   561  		keepTerminatedPodVolumes:       keepTerminatedPodVolumes,
   562  		nodeStatusMaxImages:            nodeStatusMaxImages,
   563  		tracer:                         tracer,
   564  		nodeStartupLatencyTracker:      kubeDeps.NodeStartupLatencyTracker,
   565  	}
   566  
   567  	if klet.cloud != nil {
   568  		klet.cloudResourceSyncManager = cloudresource.NewSyncManager(klet.cloud, nodeName, klet.nodeStatusUpdateFrequency)
   569  	}
   570  
   571  	var secretManager secret.Manager
   572  	var configMapManager configmap.Manager
   573  	if klet.kubeClient != nil {
   574  		switch kubeCfg.ConfigMapAndSecretChangeDetectionStrategy {
   575  		case kubeletconfiginternal.WatchChangeDetectionStrategy:
   576  			secretManager = secret.NewWatchingSecretManager(klet.kubeClient, klet.resyncInterval)
   577  			configMapManager = configmap.NewWatchingConfigMapManager(klet.kubeClient, klet.resyncInterval)
   578  		case kubeletconfiginternal.TTLCacheChangeDetectionStrategy:
   579  			secretManager = secret.NewCachingSecretManager(
   580  				klet.kubeClient, manager.GetObjectTTLFromNodeFunc(klet.GetNode))
   581  			configMapManager = configmap.NewCachingConfigMapManager(
   582  				klet.kubeClient, manager.GetObjectTTLFromNodeFunc(klet.GetNode))
   583  		case kubeletconfiginternal.GetChangeDetectionStrategy:
   584  			secretManager = secret.NewSimpleSecretManager(klet.kubeClient)
   585  			configMapManager = configmap.NewSimpleConfigMapManager(klet.kubeClient)
   586  		default:
   587  			return nil, fmt.Errorf("unknown configmap and secret manager mode: %v", kubeCfg.ConfigMapAndSecretChangeDetectionStrategy)
   588  		}
   589  
   590  		klet.secretManager = secretManager
   591  		klet.configMapManager = configMapManager
   592  	}
   593  
   594  	machineInfo, err := klet.cadvisor.MachineInfo()
   595  	if err != nil {
   596  		return nil, err
   597  	}
   598  	// Avoid collector collects it as a timestamped metric
   599  	// See PR #95210 and #97006 for more details.
   600  	machineInfo.Timestamp = time.Time{}
   601  	klet.setCachedMachineInfo(machineInfo)
   602  
   603  	imageBackOff := flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff)
   604  
   605  	klet.livenessManager = proberesults.NewManager()
   606  	klet.readinessManager = proberesults.NewManager()
   607  	klet.startupManager = proberesults.NewManager()
   608  	klet.podCache = kubecontainer.NewCache()
   609  
   610  	klet.mirrorPodClient = kubepod.NewBasicMirrorClient(klet.kubeClient, string(nodeName), nodeLister)
   611  	klet.podManager = kubepod.NewBasicPodManager()
   612  
   613  	klet.statusManager = status.NewManager(klet.kubeClient, klet.podManager, klet, kubeDeps.PodStartupLatencyTracker, klet.getRootDir())
   614  
   615  	klet.resourceAnalyzer = serverstats.NewResourceAnalyzer(klet, kubeCfg.VolumeStatsAggPeriod.Duration, kubeDeps.Recorder)
   616  
   617  	klet.runtimeService = kubeDeps.RemoteRuntimeService
   618  
   619  	if kubeDeps.KubeClient != nil {
   620  		klet.runtimeClassManager = runtimeclass.NewManager(kubeDeps.KubeClient)
   621  	}
   622  
   623  	// setup containerLogManager for CRI container runtime
   624  	containerLogManager, err := logs.NewContainerLogManager(
   625  		klet.runtimeService,
   626  		kubeDeps.OSInterface,
   627  		kubeCfg.ContainerLogMaxSize,
   628  		int(kubeCfg.ContainerLogMaxFiles),
   629  	)
   630  	if err != nil {
   631  		return nil, fmt.Errorf("failed to initialize container log manager: %v", err)
   632  	}
   633  	klet.containerLogManager = containerLogManager
   634  
   635  	klet.reasonCache = NewReasonCache()
   636  	klet.workQueue = queue.NewBasicWorkQueue(klet.clock)
   637  	klet.podWorkers = newPodWorkers(
   638  		klet,
   639  		kubeDeps.Recorder,
   640  		klet.workQueue,
   641  		klet.resyncInterval,
   642  		backOffPeriod,
   643  		klet.podCache,
   644  	)
   645  
   646  	runtime, err := kuberuntime.NewKubeGenericRuntimeManager(
   647  		kubecontainer.FilterEventRecorder(kubeDeps.Recorder),
   648  		klet.livenessManager,
   649  		klet.readinessManager,
   650  		klet.startupManager,
   651  		rootDirectory,
   652  		machineInfo,
   653  		klet.podWorkers,
   654  		kubeDeps.OSInterface,
   655  		klet,
   656  		insecureContainerLifecycleHTTPClient,
   657  		imageBackOff,
   658  		kubeCfg.SerializeImagePulls,
   659  		kubeCfg.MaxParallelImagePulls,
   660  		float32(kubeCfg.RegistryPullQPS),
   661  		int(kubeCfg.RegistryBurst),
   662  		imageCredentialProviderConfigFile,
   663  		imageCredentialProviderBinDir,
   664  		kubeCfg.CPUCFSQuota,
   665  		kubeCfg.CPUCFSQuotaPeriod,
   666  		kubeDeps.RemoteRuntimeService,
   667  		kubeDeps.RemoteImageService,
   668  		kubeDeps.ContainerManager,
   669  		klet.containerLogManager,
   670  		klet.runtimeClassManager,
   671  		seccompDefault,
   672  		kubeCfg.MemorySwap.SwapBehavior,
   673  		kubeDeps.ContainerManager.GetNodeAllocatableAbsolute,
   674  		*kubeCfg.MemoryThrottlingFactor,
   675  		kubeDeps.PodStartupLatencyTracker,
   676  		kubeDeps.TracerProvider,
   677  	)
   678  	if err != nil {
   679  		return nil, err
   680  	}
   681  	klet.containerRuntime = runtime
   682  	klet.streamingRuntime = runtime
   683  	klet.runner = runtime
   684  
   685  	runtimeCache, err := kubecontainer.NewRuntimeCache(klet.containerRuntime, runtimeCacheRefreshPeriod)
   686  	if err != nil {
   687  		return nil, err
   688  	}
   689  	klet.runtimeCache = runtimeCache
   690  
   691  	// common provider to get host file system usage associated with a pod managed by kubelet
   692  	hostStatsProvider := stats.NewHostStatsProvider(kubecontainer.RealOS{}, func(podUID types.UID) string {
   693  		return getEtcHostsPath(klet.getPodDir(podUID))
   694  	})
   695  	if kubeDeps.useLegacyCadvisorStats {
   696  		klet.StatsProvider = stats.NewCadvisorStatsProvider(
   697  			klet.cadvisor,
   698  			klet.resourceAnalyzer,
   699  			klet.podManager,
   700  			klet.runtimeCache,
   701  			klet.containerRuntime,
   702  			klet.statusManager,
   703  			hostStatsProvider)
   704  	} else {
   705  		klet.StatsProvider = stats.NewCRIStatsProvider(
   706  			klet.cadvisor,
   707  			klet.resourceAnalyzer,
   708  			klet.podManager,
   709  			klet.runtimeCache,
   710  			kubeDeps.RemoteRuntimeService,
   711  			kubeDeps.RemoteImageService,
   712  			hostStatsProvider,
   713  			utilfeature.DefaultFeatureGate.Enabled(features.PodAndContainerStatsFromCRI))
   714  	}
   715  
   716  	eventChannel := make(chan *pleg.PodLifecycleEvent, plegChannelCapacity)
   717  
   718  	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
   719  		// adjust Generic PLEG relisting period and threshold to higher value when Evented PLEG is turned on
   720  		genericRelistDuration := &pleg.RelistDuration{
   721  			RelistPeriod:    eventedPlegRelistPeriod,
   722  			RelistThreshold: eventedPlegRelistThreshold,
   723  		}
   724  		klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{})
   725  		// In case Evented PLEG has to fall back on Generic PLEG due to an error,
   726  		// Evented PLEG should be able to reset the Generic PLEG relisting duration
   727  		// to the default value.
   728  		eventedRelistDuration := &pleg.RelistDuration{
   729  			RelistPeriod:    genericPlegRelistPeriod,
   730  			RelistThreshold: genericPlegRelistThreshold,
   731  		}
   732  		klet.eventedPleg, err = pleg.NewEventedPLEG(klet.containerRuntime, klet.runtimeService, eventChannel,
   733  			klet.podCache, klet.pleg, eventedPlegMaxStreamRetries, eventedRelistDuration, clock.RealClock{})
   734  		if err != nil {
   735  			return nil, err
   736  		}
   737  	} else {
   738  		genericRelistDuration := &pleg.RelistDuration{
   739  			RelistPeriod:    genericPlegRelistPeriod,
   740  			RelistThreshold: genericPlegRelistThreshold,
   741  		}
   742  		klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{})
   743  	}
   744  
   745  	klet.runtimeState = newRuntimeState(maxWaitForContainerRuntime)
   746  	klet.runtimeState.addHealthCheck("PLEG", klet.pleg.Healthy)
   747  	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
   748  		klet.runtimeState.addHealthCheck("EventedPLEG", klet.eventedPleg.Healthy)
   749  	}
   750  	if _, err := klet.updatePodCIDR(ctx, kubeCfg.PodCIDR); err != nil {
   751  		klog.ErrorS(err, "Pod CIDR update failed")
   752  	}
   753  
   754  	// setup containerGC
   755  	containerGC, err := kubecontainer.NewContainerGC(klet.containerRuntime, containerGCPolicy, klet.sourcesReady)
   756  	if err != nil {
   757  		return nil, err
   758  	}
   759  	klet.containerGC = containerGC
   760  	klet.containerDeletor = newPodContainerDeletor(klet.containerRuntime, integer.IntMax(containerGCPolicy.MaxPerPodContainer, minDeadContainerInPod))
   761  
   762  	// setup imageManager
   763  	imageManager, err := images.NewImageGCManager(klet.containerRuntime, klet.StatsProvider, kubeDeps.Recorder, nodeRef, imageGCPolicy, kubeDeps.TracerProvider)
   764  	if err != nil {
   765  		return nil, fmt.Errorf("failed to initialize image manager: %v", err)
   766  	}
   767  	klet.imageManager = imageManager
   768  
   769  	if kubeCfg.ServerTLSBootstrap && kubeDeps.TLSOptions != nil && utilfeature.DefaultFeatureGate.Enabled(features.RotateKubeletServerCertificate) {
   770  		klet.serverCertificateManager, err = kubeletcertificate.NewKubeletServerCertificateManager(klet.kubeClient, kubeCfg, klet.nodeName, klet.getLastObservedNodeAddresses, certDirectory)
   771  		if err != nil {
   772  			return nil, fmt.Errorf("failed to initialize certificate manager: %v", err)
   773  		}
   774  		kubeDeps.TLSOptions.Config.GetCertificate = func(*tls.ClientHelloInfo) (*tls.Certificate, error) {
   775  			cert := klet.serverCertificateManager.Current()
   776  			if cert == nil {
   777  				return nil, fmt.Errorf("no serving certificate available for the kubelet")
   778  			}
   779  			return cert, nil
   780  		}
   781  	}
   782  
   783  	if kubeDeps.ProbeManager != nil {
   784  		klet.probeManager = kubeDeps.ProbeManager
   785  	} else {
   786  		klet.probeManager = prober.NewManager(
   787  			klet.statusManager,
   788  			klet.livenessManager,
   789  			klet.readinessManager,
   790  			klet.startupManager,
   791  			klet.runner,
   792  			kubeDeps.Recorder)
   793  	}
   794  
   795  	tokenManager := token.NewManager(kubeDeps.KubeClient)
   796  
   797  	var clusterTrustBundleManager clustertrustbundle.Manager
   798  	if kubeDeps.KubeClient != nil && utilfeature.DefaultFeatureGate.Enabled(features.ClusterTrustBundleProjection) {
   799  		kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0)
   800  		clusterTrustBundleManager, err = clustertrustbundle.NewInformerManager(kubeInformers.Certificates().V1alpha1().ClusterTrustBundles(), 2*int(kubeCfg.MaxPods), 5*time.Minute)
   801  		if err != nil {
   802  			return nil, fmt.Errorf("while starting informer-based ClusterTrustBundle manager: %w", err)
   803  		}
   804  		kubeInformers.Start(wait.NeverStop)
   805  		klog.InfoS("Started ClusterTrustBundle informer")
   806  	} else {
   807  		// In static kubelet mode, use a no-op manager.
   808  		clusterTrustBundleManager = &clustertrustbundle.NoopManager{}
   809  		klog.InfoS("Not starting ClusterTrustBundle informer because we are in static kubelet mode")
   810  	}
   811  
   812  	// NewInitializedVolumePluginMgr initializes some storageErrors on the Kubelet runtimeState (in csi_plugin.go init)
   813  	// which affects node ready status. This function must be called before Kubelet is initialized so that the Node
   814  	// ReadyState is accurate with the storage state.
   815  	klet.volumePluginMgr, err =
   816  		NewInitializedVolumePluginMgr(klet, secretManager, configMapManager, tokenManager, clusterTrustBundleManager, kubeDeps.VolumePlugins, kubeDeps.DynamicPluginProber)
   817  	if err != nil {
   818  		return nil, err
   819  	}
   820  	klet.pluginManager = pluginmanager.NewPluginManager(
   821  		klet.getPluginsRegistrationDir(), /* sockDir */
   822  		kubeDeps.Recorder,
   823  	)
   824  
   825  	// If the experimentalMounterPathFlag is set, we do not want to
   826  	// check node capabilities since the mount path is not the default
   827  	if len(experimentalMounterPath) != 0 {
   828  		// Replace the nameserver in containerized-mounter's rootfs/etc/resolv.conf with kubelet.ClusterDNS
   829  		// so that service name could be resolved
   830  		klet.dnsConfigurer.SetupDNSinContainerizedMounter(experimentalMounterPath)
   831  	}
   832  
   833  	// setup volumeManager
   834  	klet.volumeManager = volumemanager.NewVolumeManager(
   835  		kubeCfg.EnableControllerAttachDetach,
   836  		nodeName,
   837  		klet.podManager,
   838  		klet.podWorkers,
   839  		klet.kubeClient,
   840  		klet.volumePluginMgr,
   841  		klet.containerRuntime,
   842  		kubeDeps.Mounter,
   843  		kubeDeps.HostUtil,
   844  		klet.getPodsDir(),
   845  		kubeDeps.Recorder,
   846  		keepTerminatedPodVolumes,
   847  		volumepathhandler.NewBlockVolumePathHandler())
   848  
   849  	klet.backOff = flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff)
   850  
   851  	// setup eviction manager
   852  	evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig,
   853  		killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock, kubeCfg.LocalStorageCapacityIsolation)
   854  
   855  	klet.evictionManager = evictionManager
   856  	klet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)
   857  
   858  	// Safe, allowed sysctls can always be used as unsafe sysctls in the spec.
   859  	// Hence, we concatenate those two lists.
   860  	safeAndUnsafeSysctls := append(sysctl.SafeSysctlAllowlist(), allowedUnsafeSysctls...)
   861  	sysctlsAllowlist, err := sysctl.NewAllowlist(safeAndUnsafeSysctls)
   862  	if err != nil {
   863  		return nil, err
   864  	}
   865  	klet.admitHandlers.AddPodAdmitHandler(sysctlsAllowlist)
   866  
   867  	// enable active deadline handler
   868  	activeDeadlineHandler, err := newActiveDeadlineHandler(klet.statusManager, kubeDeps.Recorder, klet.clock)
   869  	if err != nil {
   870  		return nil, err
   871  	}
   872  	klet.AddPodSyncLoopHandler(activeDeadlineHandler)
   873  	klet.AddPodSyncHandler(activeDeadlineHandler)
   874  
   875  	klet.admitHandlers.AddPodAdmitHandler(klet.containerManager.GetAllocateResourcesPodAdmitHandler())
   876  
   877  	criticalPodAdmissionHandler := preemption.NewCriticalPodAdmissionHandler(klet.GetActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeDeps.Recorder)
   878  	klet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(klet.getNodeAnyWay, criticalPodAdmissionHandler, klet.containerManager.UpdatePluginResources))
   879  	// apply functional Option's
   880  	for _, opt := range kubeDeps.Options {
   881  		opt(klet)
   882  	}
   883  
   884  	if sysruntime.GOOS == "linux" {
   885  		// AppArmor is a Linux kernel security module and it does not support other operating systems.
   886  		klet.appArmorValidator = apparmor.NewValidator()
   887  		klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator))
   888  	}
   889  
   890  	leaseDuration := time.Duration(kubeCfg.NodeLeaseDurationSeconds) * time.Second
   891  	renewInterval := time.Duration(float64(leaseDuration) * nodeLeaseRenewIntervalFraction)
   892  	klet.nodeLeaseController = lease.NewController(
   893  		klet.clock,
   894  		klet.heartbeatClient,
   895  		string(klet.nodeName),
   896  		kubeCfg.NodeLeaseDurationSeconds,
   897  		klet.onRepeatedHeartbeatFailure,
   898  		renewInterval,
   899  		string(klet.nodeName),
   900  		v1.NamespaceNodeLease,
   901  		util.SetNodeOwnerFunc(klet.heartbeatClient, string(klet.nodeName)))
   902  
   903  	// setup node shutdown manager
   904  	shutdownManager, shutdownAdmitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{
   905  		Logger:                           logger,
   906  		ProbeManager:                     klet.probeManager,
   907  		Recorder:                         kubeDeps.Recorder,
   908  		NodeRef:                          nodeRef,
   909  		GetPodsFunc:                      klet.GetActivePods,
   910  		KillPodFunc:                      killPodNow(klet.podWorkers, kubeDeps.Recorder),
   911  		SyncNodeStatusFunc:               klet.syncNodeStatus,
   912  		ShutdownGracePeriodRequested:     kubeCfg.ShutdownGracePeriod.Duration,
   913  		ShutdownGracePeriodCriticalPods:  kubeCfg.ShutdownGracePeriodCriticalPods.Duration,
   914  		ShutdownGracePeriodByPodPriority: kubeCfg.ShutdownGracePeriodByPodPriority,
   915  		StateDirectory:                   rootDirectory,
   916  	})
   917  	klet.shutdownManager = shutdownManager
   918  	klet.usernsManager, err = userns.MakeUserNsManager(klet)
   919  	if err != nil {
   920  		return nil, err
   921  	}
   922  	klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler)
   923  
   924  	// Finally, put the most recent version of the config on the Kubelet, so
   925  	// people can see how it was configured.
   926  	klet.kubeletConfiguration = *kubeCfg
   927  
   928  	// Generating the status funcs should be the last thing we do,
   929  	// since this relies on the rest of the Kubelet having been constructed.
   930  	klet.setNodeStatusFuncs = klet.defaultNodeStatusFuncs()
   931  
   932  	return klet, nil
   933  }
   934  
   935  type serviceLister interface {
        	// serviceLister is the narrow, unexported listing interface used as the type
        	// of the Kubelet struct's serviceLister field. It declares a single method.
        	//
        	// List takes a label selector and returns a slice of *v1.Service together
        	// with an error. Presumably the result is limited to the Services matching
        	// the selector; confirm against the concrete lister supplied by the caller.
   936  	List(labels.Selector) ([]*v1.Service, error)
   937  }
   938  
   939  // Kubelet is the main kubelet implementation.
   940  type Kubelet struct {
   941  	kubeletConfiguration kubeletconfiginternal.KubeletConfiguration
   942  
   943  	// hostname is the hostname the kubelet detected or was given via flag/config
   944  	hostname string
   945  	// hostnameOverridden indicates the hostname was overridden via flag/config
   946  	hostnameOverridden bool
   947  
   948  	nodeName        types.NodeName
   949  	runtimeCache    kubecontainer.RuntimeCache
   950  	kubeClient      clientset.Interface
   951  	heartbeatClient clientset.Interface
   952  	// mirrorPodClient is used to create and delete mirror pods in the API for static
   953  	// pods.
   954  	mirrorPodClient kubepod.MirrorClient
   955  
   956  	rootDirectory string
   957  
   958  	lastObservedNodeAddressesMux sync.RWMutex
   959  	lastObservedNodeAddresses    []v1.NodeAddress
   960  
   961  	// onRepeatedHeartbeatFailure is called when a heartbeat operation fails more than once. optional.
   962  	onRepeatedHeartbeatFailure func()
   963  
   964  	// podManager stores the desired set of admitted pods and mirror pods that the kubelet should be
   965  	// running. The actual set of running pods is stored on the podWorkers. The manager is populated
   966  	// by the kubelet config loops which abstracts receiving configuration from many different sources
   967  	// (api for regular pods, local filesystem or http for static pods). The manager may be consulted
   968  	// by other components that need to see the set of desired pods. Note that not all desired pods are
   969  	// running, and not all running pods are in the podManager - for instance, force deleting a pod
   970  	// from the apiserver will remove it from the podManager, but the pod may still be terminating and
   971  	// tracked by the podWorkers. Components that need to know the actual consumed resources of the
   972  	// node or are driven by podWorkers and the sync*Pod methods (status, volume, stats) should also
   973  	// consult the podWorkers when reconciling.
   974  	//
   975  	// TODO: review all kubelet components that need the actual set of pods (vs the desired set)
   976  	// and update them to use podWorkers instead of podManager. This may introduce latency in some
   977  	// methods, but avoids race conditions and correctly accounts for terminating pods that have
   978  	// been force deleted or static pods that have been updated.
   979  	// https://github.com/kubernetes/kubernetes/issues/116970
   980  	podManager kubepod.Manager
   981  
   982  	// podWorkers is responsible for driving the lifecycle state machine of each pod. The worker is
   983  	// notified of config changes, updates, periodic reconciliation, container runtime updates, and
   984  	// evictions of all desired pods and will invoke reconciliation methods per pod in separate
   985  	// goroutines. The podWorkers are authoritative in the kubelet for what pods are actually being
   986  	// run and their current state:
   987  	//
   988  	// * syncing: pod should be running (syncPod)
   989  	// * terminating: pod should be stopped (syncTerminatingPod)
   990  	// * terminated: pod should have all resources cleaned up (syncTerminatedPod)
   991  	//
   992  	// and invoke the handler methods that correspond to each state. Components within the
   993  	// kubelet that need to know the phase of the pod in order to correctly set up or tear down
   994  	// resources must consult the podWorkers.
   995  	//
   996  	// Once a pod has been accepted by the pod workers, no other pod with that same UID (and
   997  	// name+namespace, for static pods) will be started until the first pod has fully terminated
   998  	// and been cleaned up by SyncKnownPods. This means a pod may be desired (in API), admitted
   999  	// (in pod manager), and requested (by invoking UpdatePod) but not start for an arbitrarily
  1000  	// long interval because a prior pod is still terminating.
  1001  	//
  1002  	// As an event-driven (by UpdatePod) controller, the podWorkers must periodically be resynced
  1003  	// by the kubelet invoking SyncKnownPods with the desired state (admitted pods in podManager).
  1004  	// Since the podManager may be unaware of some running pods due to force deletion, the
  1005  	// podWorkers are responsible for triggering a sync of pods that are no longer desired but
  1006  	// must still run to completion.
  1007  	podWorkers PodWorkers
  1008  
  1009  	// evictionManager observes the state of the node for situations that could impact node stability
  1010  	// and evicts pods (sets to phase Failed with reason Evicted) to reduce resource pressure. The
  1011  	// eviction manager acts on the actual state of the node and considers the podWorker to be
  1012  	// authoritative.
  1013  	evictionManager eviction.Manager
  1014  
  1015  	// probeManager tracks the set of running pods and ensures any user-defined periodic checks are
  1016  	// run to introspect the state of each pod.  The probe manager acts on the actual state of the node
  1017  	// and is notified of pods by the podWorker. The probe manager is the authoritative source of the
  1018  	// most recent probe status and is responsible for notifying the status manager, which
  1019  	// synthesizes them into the overall pod status.
  1020  	probeManager prober.Manager
  1021  
  1022  	// secretManager caches the set of secrets used by running pods on this node. The podWorkers
  1023  	// notify the secretManager when pods are started and terminated, and the secretManager must
  1024  	// then keep the needed secrets up-to-date as they change.
  1025  	secretManager secret.Manager
  1026  
  1027  	// configMapManager caches the set of config maps used by running pods on this node. The
  1028  	// podWorkers notify the configMapManager when pods are started and terminated, and the
  1029  	// configMapManager must then keep the needed config maps up-to-date as they change.
  1030  	configMapManager configmap.Manager
  1031  
  1032  	// volumeManager observes the set of running pods and is responsible for attaching, mounting,
  1033  	// unmounting, and detaching as those pods move through their lifecycle. It periodically
  1034  	// synchronizes the set of known volumes to the set of actually desired volumes and cleans up
  1035  	// any orphaned volumes. The volume manager considers the podWorker to be authoritative for
  1036  	// which pods are running.
  1037  	volumeManager volumemanager.VolumeManager
  1038  
  1039  	// statusManager receives updated pod status updates from the podWorker and updates the API
  1040  	// status of those pods to match. The statusManager is authoritative for the synthesized
  1041  	// status of the pod from the kubelet's perspective (other components own the individual
  1042  	// elements of status) and should be consulted by components in preference to assembling
  1043  	// that status themselves. Note that the status manager is downstream of the pod worker
  1044  	// and components that need to check whether a pod is still running should instead directly
  1045  	// consult the pod worker.
  1046  	statusManager status.Manager
  1047  
  1048  	// resyncInterval is the interval between periodic full reconciliations of
  1049  	// pods on this node.
  1050  	resyncInterval time.Duration
  1051  
  1052  	// sourcesReady records the sources seen by the kubelet, it is thread-safe.
  1053  	sourcesReady config.SourcesReady
  1054  
  1055  	// Optional, defaults to /logs/ from /var/log
  1056  	logServer http.Handler
  1057  	// Optional, defaults to simple Docker implementation
  1058  	runner kubecontainer.CommandRunner
  1059  
  1060  	// cAdvisor used for container information.
  1061  	cadvisor cadvisor.Interface
  1062  
  1063  	// Set to true to have the node register itself with the apiserver.
  1064  	registerNode bool
  1065  	// List of taints to add to a node object when the kubelet registers itself.
  1066  	registerWithTaints []v1.Taint
  1067  	// Set to true to have the node register itself as schedulable.
  1068  	registerSchedulable bool
  1069  	// for internal book keeping; access only from within registerWithApiserver
  1070  	registrationCompleted bool
  1071  
  1072  	// dnsConfigurer is used for setting up DNS resolver configuration when launching pods.
  1073  	dnsConfigurer *dns.Configurer
  1074  
  1075  	// serviceLister knows how to list services
  1076  	serviceLister serviceLister
  1077  	// serviceHasSynced indicates whether services have been sync'd at least once.
  1078  	// Check this before trusting a response from the lister.
  1079  	serviceHasSynced cache.InformerSynced
  1080  	// nodeLister knows how to list nodes
  1081  	nodeLister corelisters.NodeLister
  1082  	// nodeHasSynced indicates whether nodes have been sync'd at least once.
  1083  	// Check this before trusting a response from the node lister.
  1084  	nodeHasSynced cache.InformerSynced
  1085  	// a list of node labels to register
  1086  	nodeLabels map[string]string
  1087  
  1088  	// Last timestamp when runtime responded on ping.
  1089  	// Mutex is used to protect this value.
  1090  	runtimeState *runtimeState
  1091  
  1092  	// Volume plugins.
  1093  	volumePluginMgr *volume.VolumePluginMgr
  1094  
  1095  	// Manages container health check results.
  1096  	livenessManager  proberesults.Manager
  1097  	readinessManager proberesults.Manager
  1098  	startupManager   proberesults.Manager
  1099  
  1100  	// How long to keep idle streaming command execution/port forwarding
  1101  	// connections open before terminating them
  1102  	streamingConnectionIdleTimeout time.Duration
  1103  
  1104  	// The EventRecorder to use
  1105  	recorder record.EventRecorder
  1106  
  1107  	// Policy for handling garbage collection of dead containers.
  1108  	containerGC kubecontainer.GC
  1109  
  1110  	// Manager for image garbage collection.
  1111  	imageManager images.ImageGCManager
  1112  
  1113  	// Manager for container logs.
  1114  	containerLogManager logs.ContainerLogManager
  1115  
  1116  	// Cached MachineInfo returned by cadvisor.
  1117  	machineInfoLock sync.RWMutex
  1118  	machineInfo     *cadvisorapi.MachineInfo
  1119  
  1120  	// Handles certificate rotations.
  1121  	serverCertificateManager certificate.Manager
  1122  
  1123  	// Cloud provider interface.
  1124  	cloud cloudprovider.Interface
  1125  	// Handles requests to cloud provider with timeout
  1126  	cloudResourceSyncManager cloudresource.SyncManager
  1127  
  1128  	// Indicates that the node initialization happens in an external cloud controller
  1129  	externalCloudProvider bool
  1130  	// Reference to this node.
  1131  	nodeRef *v1.ObjectReference
  1132  
  1133  	// Container runtime.
  1134  	containerRuntime kubecontainer.Runtime
  1135  
  1136  	// Streaming runtime handles container streaming.
  1137  	streamingRuntime kubecontainer.StreamingRuntime
  1138  
  1139  	// Container runtime service (needed by container runtime Start()).
  1140  	runtimeService internalapi.RuntimeService
  1141  
  1142  	// reasonCache caches the failure reason of the last creation of all containers, which is
  1143  	// used for generating ContainerStatus.
  1144  	reasonCache *ReasonCache
  1145  
  1146  	// containerRuntimeReadyExpected indicates whether container runtime being ready is expected
  1147  	// so errors are logged without verbosity guard, to avoid excessive error logs at node startup.
  1148  	// It's false during the node initialization period of nodeReadyGracePeriod, and after that
  1149  	// it's set to true by fastStatusUpdateOnce when it exits.
  1150  	containerRuntimeReadyExpected bool
  1151  
  1152  	// nodeStatusUpdateFrequency specifies how often kubelet computes node status. If node lease
  1153  	// feature is not enabled, it is also the frequency that kubelet posts node status to master.
  1154  	// In that case, be cautious when changing the constant, it must work with nodeMonitorGracePeriod
  1155  	// in nodecontroller. There are several constraints:
  1156  	// 1. nodeMonitorGracePeriod must be N times more than nodeStatusUpdateFrequency, where
  1157  	//    N means number of retries allowed for kubelet to post node status. It is pointless
  1158  	//    to make nodeMonitorGracePeriod be less than nodeStatusUpdateFrequency, since there
  1159  	//    will only be fresh values from Kubelet at an interval of nodeStatusUpdateFrequency.
  1160  	//    The constant must be less than podEvictionTimeout.
  1161  	// 2. nodeStatusUpdateFrequency needs to be large enough for kubelet to generate node
  1162  	//    status. Kubelet may fail to update node status reliably if the value is too small,
  1163  	//    as it takes time to gather all necessary node information.
  1164  	nodeStatusUpdateFrequency time.Duration
  1165  
  1166  	// nodeStatusReportFrequency is the frequency that kubelet posts node
  1167  	// status to master. It is only used when node lease feature is enabled.
  1168  	nodeStatusReportFrequency time.Duration
  1169  
  1170  	// lastStatusReportTime is the time when node status was last reported.
  1171  	lastStatusReportTime time.Time
  1172  
  1173  	// syncNodeStatusMux is a lock on updating the node status, because this path is not thread-safe.
  1174  	// This lock is used by Kubelet.syncNodeStatus and Kubelet.fastNodeStatusUpdate functions and shouldn't be used anywhere else.
  1175  	syncNodeStatusMux sync.Mutex
  1176  
  1177  	// updatePodCIDRMux is a lock on updating pod CIDR, because this path is not thread-safe.
  1178  	// This lock is used by Kubelet.updatePodCIDR function and shouldn't be used anywhere else.
  1179  	updatePodCIDRMux sync.Mutex
  1180  
  1181  	// updateRuntimeMux is a lock on updating runtime, because this path is not thread-safe.
  1182  	// This lock is used by Kubelet.updateRuntimeUp and Kubelet.fastNodeStatusUpdate functions and shouldn't be used anywhere else.
  1183  	updateRuntimeMux sync.Mutex
  1184  
  1185  	// nodeLeaseController claims and renews the node lease for this Kubelet
  1186  	nodeLeaseController lease.Controller
  1187  
  1188  	// pleg observes the state of the container runtime and notifies the kubelet of changes to containers, which
  1189  	// notifies the podWorkers to reconcile the state of the pod (for instance, if a container dies and needs to
  1190  	// be restarted).
  1191  	pleg pleg.PodLifecycleEventGenerator
  1192  
  1193  	// eventedPleg supplements the pleg to deliver edge-driven container changes with low-latency.
  1194  	eventedPleg pleg.PodLifecycleEventGenerator
  1195  
  1196  	// Store kubecontainer.PodStatus for all pods.
  1197  	podCache kubecontainer.Cache
  1198  
  1199  	// os is a facade for various syscalls that need to be mocked during testing.
  1200  	os kubecontainer.OSInterface
  1201  
  1202  	// Watcher of out of memory events.
  1203  	oomWatcher oomwatcher.Watcher
  1204  
  1205  	// Monitor resource usage
  1206  	resourceAnalyzer serverstats.ResourceAnalyzer
  1207  
  1208  	// Whether or not we should have the QOS cgroup hierarchy for resource management
  1209  	cgroupsPerQOS bool
  1210  
  1211  	// If non-empty, pass this to the container runtime as the root cgroup.
  1212  	cgroupRoot string
  1213  
  1214  	// Mounter to use for volumes.
  1215  	mounter mount.Interface
  1216  
  1217  	// hostutil to interact with filesystems
  1218  	hostutil hostutil.HostUtils
  1219  
  1220  	// subpather to execute subpath actions
  1221  	subpather subpath.Interface
  1222  
  1223  	// Manager of non-Runtime containers.
  1224  	containerManager cm.ContainerManager
  1225  
  1226  	// Maximum Number of Pods which can be run by this Kubelet
  1227  	maxPods int
  1228  
  1229  	// Monitor Kubelet's sync loop
  1230  	syncLoopMonitor atomic.Value
  1231  
  1232  	// Container restart Backoff
  1233  	backOff *flowcontrol.Backoff
  1234  
  1235  	// Information about the ports which are opened by daemons on Node running this Kubelet server.
  1236  	daemonEndpoints *v1.NodeDaemonEndpoints
  1237  
  1238  	// A queue used to trigger pod workers.
  1239  	workQueue queue.WorkQueue
  1240  
  1241  	// oneTimeInitializer is used to initialize modules that are dependent on the runtime to be up.
  1242  	oneTimeInitializer sync.Once
  1243  
  1244  	// If set, use this IP address or addresses for the node
  1245  	nodeIPs []net.IP
  1246  
  1247  	// use this function to validate the kubelet nodeIP
  1248  	nodeIPValidator func(net.IP) error
  1249  
  1250  	// If non-nil, this is a unique identifier for the node in an external database, eg. cloudprovider
  1251  	providerID string
  1252  
  1253  	// clock is an interface that provides time related functionality in a way that makes it
  1254  	// easy to test the code.
  1255  	clock clock.WithTicker
  1256  
  1257  	// handlers called during the tryUpdateNodeStatus cycle
  1258  	setNodeStatusFuncs []func(context.Context, *v1.Node) error
  1259  
  1260  	lastNodeUnschedulableLock sync.Mutex
  1261  	// maintains Node.Spec.Unschedulable value from previous run of tryUpdateNodeStatus()
  1262  	lastNodeUnschedulable bool
  1263  
  1264  	// the list of handlers to call during pod admission.
  1265  	admitHandlers lifecycle.PodAdmitHandlers
  1266  
  1267  	// softAdmitHandlers are applied to the pod after it is admitted by the Kubelet, but before it is
  1268  	// run. A pod rejected by a softAdmitHandler will be left in a Pending state indefinitely. If a
  1269  	// rejected pod should not be recreated, or the scheduler is not aware of the rejection rule, the
  1270  	// admission rule should be applied by a softAdmitHandler.
  1271  	softAdmitHandlers lifecycle.PodAdmitHandlers
  1272  
  1273  	// the list of handlers to call during pod sync loop.
  1274  	lifecycle.PodSyncLoopHandlers
  1275  
  1276  	// the list of handlers to call during pod sync.
  1277  	lifecycle.PodSyncHandlers
  1278  
  1279  	// the number of allowed pods per core
  1280  	podsPerCore int
  1281  
  1282  	// enableControllerAttachDetach indicates the Attach/Detach controller
  1283  	// should manage attachment/detachment of volumes scheduled to this node,
  1284  	// and disable kubelet from executing any attach/detach operations
  1285  	enableControllerAttachDetach bool
  1286  
  1287  	// trigger deleting containers in a pod
  1288  	containerDeletor *podContainerDeletor
  1289  
  1290  	// config iptables util rules
  1291  	makeIPTablesUtilChains bool
  1292  
  1293  	// The AppArmor validator for checking whether AppArmor is supported.
  1294  	appArmorValidator apparmor.Validator
  1295  
  1296  	// StatsProvider provides the node and the container stats.
  1297  	StatsProvider *stats.Provider
  1298  
  1299  	// This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node.
  1300  	// This can be useful for debugging volume related issues.
  1301  	keepTerminatedPodVolumes bool // DEPRECATED
  1302  
  1303  	// pluginmanager runs a set of asynchronous loops that figure out which
  1304  	// plugins need to be registered/unregistered based on this node and makes it so.
  1305  	pluginManager pluginmanager.PluginManager
  1306  
  1307  	// This flag sets a maximum number of images to report in the node status.
  1308  	nodeStatusMaxImages int32
  1309  
  1310  	// Handles RuntimeClass objects for the Kubelet.
  1311  	runtimeClassManager *runtimeclass.Manager
  1312  
  1313  	// Handles node shutdown events for the Node.
  1314  	shutdownManager nodeshutdown.Manager
  1315  
  1316  	// Manage user namespaces
  1317  	usernsManager *userns.UsernsManager
  1318  
  1319  	// Mutex to serialize new pod admission and existing pod resizing
  1320  	podResizeMutex sync.Mutex
  1321  
  1322  	// OpenTelemetry Tracer
  1323  	tracer trace.Tracer
  1324  
  1325  	// Track node startup latencies
  1326  	nodeStartupLatencyTracker util.NodeStartupLatencyTracker
  1327  }
  1328  
  1329  // ListPodStats is delegated to StatsProvider, which implements stats.Provider interface
  1330  func (kl *Kubelet) ListPodStats(ctx context.Context) ([]statsapi.PodStats, error) {
  1331  	return kl.StatsProvider.ListPodStats(ctx)
  1332  }
  1333  
  1334  // ListPodCPUAndMemoryStats is delegated to StatsProvider, which implements stats.Provider interface
  1335  func (kl *Kubelet) ListPodCPUAndMemoryStats(ctx context.Context) ([]statsapi.PodStats, error) {
  1336  	return kl.StatsProvider.ListPodCPUAndMemoryStats(ctx)
  1337  }
  1338  
  1339  // ListPodStatsAndUpdateCPUNanoCoreUsage is delegated to StatsProvider, which implements stats.Provider interface
  1340  func (kl *Kubelet) ListPodStatsAndUpdateCPUNanoCoreUsage(ctx context.Context) ([]statsapi.PodStats, error) {
  1341  	return kl.StatsProvider.ListPodStatsAndUpdateCPUNanoCoreUsage(ctx)
  1342  }
  1343  
  1344  // ImageFsStats is delegated to StatsProvider, which implements stats.Provider interface
  1345  func (kl *Kubelet) ImageFsStats(ctx context.Context) (*statsapi.FsStats, *statsapi.FsStats, error) {
  1346  	return kl.StatsProvider.ImageFsStats(ctx)
  1347  }
  1348  
  1349  // GetCgroupStats is delegated to StatsProvider, which implements stats.Provider interface
  1350  func (kl *Kubelet) GetCgroupStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, *statsapi.NetworkStats, error) {
  1351  	return kl.StatsProvider.GetCgroupStats(cgroupName, updateStats)
  1352  }
  1353  
  1354  // GetCgroupCPUAndMemoryStats is delegated to StatsProvider, which implements stats.Provider interface
  1355  func (kl *Kubelet) GetCgroupCPUAndMemoryStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, error) {
  1356  	return kl.StatsProvider.GetCgroupCPUAndMemoryStats(cgroupName, updateStats)
  1357  }
  1358  
  1359  // RootFsStats is delegated to StatsProvider, which implements stats.Provider interface
  1360  func (kl *Kubelet) RootFsStats() (*statsapi.FsStats, error) {
  1361  	return kl.StatsProvider.RootFsStats()
  1362  }
  1363  
  1364  // GetContainerInfo is delegated to StatsProvider, which implements stats.Provider interface
  1365  func (kl *Kubelet) GetContainerInfo(ctx context.Context, podFullName string, uid types.UID, containerName string, req *cadvisorapi.ContainerInfoRequest) (*cadvisorapi.ContainerInfo, error) {
  1366  	return kl.StatsProvider.GetContainerInfo(ctx, podFullName, uid, containerName, req)
  1367  }
  1368  
  1369  // GetRawContainerInfo is delegated to StatsProvider, which implements stats.Provider interface
  1370  func (kl *Kubelet) GetRawContainerInfo(containerName string, req *cadvisorapi.ContainerInfoRequest, subcontainers bool) (map[string]*cadvisorapi.ContainerInfo, error) {
  1371  	return kl.StatsProvider.GetRawContainerInfo(containerName, req, subcontainers)
  1372  }
  1373  
  1374  // RlimitStats is delegated to StatsProvider, which implements stats.Provider interface
  1375  func (kl *Kubelet) RlimitStats() (*statsapi.RlimitStats, error) {
  1376  	return kl.StatsProvider.RlimitStats()
  1377  }
  1378  
  1379  // setupDataDirs creates:
  1380  // 1.  the root directory
  1381  // 2.  the pods directory
  1382  // 3.  the plugins directory
  1383  // 4.  the pod-resources directory
  1384  // 5.  the checkpoint directory
  1385  func (kl *Kubelet) setupDataDirs() error {
  1386  	if cleanedRoot := filepath.Clean(kl.rootDirectory); cleanedRoot != kl.rootDirectory {
  1387  		return fmt.Errorf("rootDirectory not in canonical form: expected %s, was %s", cleanedRoot, kl.rootDirectory)
  1388  	}
  1389  	pluginRegistrationDir := kl.getPluginsRegistrationDir()
  1390  	pluginsDir := kl.getPluginsDir()
  1391  	if err := os.MkdirAll(kl.getRootDir(), 0750); err != nil {
  1392  		return fmt.Errorf("error creating root directory: %v", err)
  1393  	}
  1394  	if err := kl.hostutil.MakeRShared(kl.getRootDir()); err != nil {
  1395  		return fmt.Errorf("error configuring root directory: %v", err)
  1396  	}
  1397  	if err := os.MkdirAll(kl.getPodsDir(), 0750); err != nil {
  1398  		return fmt.Errorf("error creating pods directory: %v", err)
  1399  	}
  1400  	if err := os.MkdirAll(kl.getPluginsDir(), 0750); err != nil {
  1401  		return fmt.Errorf("error creating plugins directory: %v", err)
  1402  	}
  1403  	if err := os.MkdirAll(kl.getPluginsRegistrationDir(), 0750); err != nil {
  1404  		return fmt.Errorf("error creating plugins registry directory: %v", err)
  1405  	}
  1406  	if err := os.MkdirAll(kl.getPodResourcesDir(), 0750); err != nil {
  1407  		return fmt.Errorf("error creating podresources directory: %v", err)
  1408  	}
  1409  	if utilfeature.DefaultFeatureGate.Enabled(features.ContainerCheckpoint) {
  1410  		if err := os.MkdirAll(kl.getCheckpointsDir(), 0700); err != nil {
  1411  			return fmt.Errorf("error creating checkpoint directory: %v", err)
  1412  		}
  1413  	}
  1414  	if selinux.GetEnabled() {
  1415  		err := selinux.SetFileLabel(pluginRegistrationDir, config.KubeletPluginsDirSELinuxLabel)
  1416  		if err != nil {
  1417  			klog.InfoS("Unprivileged containerized plugins might not work, could not set selinux context on plugin registration dir", "path", pluginRegistrationDir, "err", err)
  1418  		}
  1419  		err = selinux.SetFileLabel(pluginsDir, config.KubeletPluginsDirSELinuxLabel)
  1420  		if err != nil {
  1421  			klog.InfoS("Unprivileged containerized plugins might not work, could not set selinux context on plugins dir", "path", pluginsDir, "err", err)
  1422  		}
  1423  	}
  1424  	return nil
  1425  }
  1426  
  1427  // StartGarbageCollection starts garbage collection threads.
  1428  func (kl *Kubelet) StartGarbageCollection() {
  1429  	loggedContainerGCFailure := false
  1430  	go wait.Until(func() {
  1431  		ctx := context.Background()
  1432  		if err := kl.containerGC.GarbageCollect(ctx); err != nil {
  1433  			klog.ErrorS(err, "Container garbage collection failed")
  1434  			kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ContainerGCFailed, err.Error())
  1435  			loggedContainerGCFailure = true
  1436  		} else {
  1437  			var vLevel klog.Level = 4
  1438  			if loggedContainerGCFailure {
  1439  				vLevel = 1
  1440  				loggedContainerGCFailure = false
  1441  			}
  1442  
  1443  			klog.V(vLevel).InfoS("Container garbage collection succeeded")
  1444  		}
  1445  	}, ContainerGCPeriod, wait.NeverStop)
  1446  
  1447  	// when the high threshold is set to 100, stub the image GC manager
  1448  	if kl.kubeletConfiguration.ImageGCHighThresholdPercent == 100 {
  1449  		klog.V(2).InfoS("ImageGCHighThresholdPercent is set 100, Disable image GC")
  1450  		return
  1451  	}
  1452  
  1453  	prevImageGCFailed := false
  1454  	go wait.Until(func() {
  1455  		ctx := context.Background()
  1456  		if err := kl.imageManager.GarbageCollect(ctx); err != nil {
  1457  			if prevImageGCFailed {
  1458  				klog.ErrorS(err, "Image garbage collection failed multiple times in a row")
  1459  				// Only create an event for repeated failures
  1460  				kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ImageGCFailed, err.Error())
  1461  			} else {
  1462  				klog.ErrorS(err, "Image garbage collection failed once. Stats initialization may not have completed yet")
  1463  			}
  1464  			prevImageGCFailed = true
  1465  		} else {
  1466  			var vLevel klog.Level = 4
  1467  			if prevImageGCFailed {
  1468  				vLevel = 1
  1469  				prevImageGCFailed = false
  1470  			}
  1471  
  1472  			klog.V(vLevel).InfoS("Image garbage collection succeeded")
  1473  		}
  1474  	}, ImageGCPeriod, wait.NeverStop)
  1475  }
  1476  
  1477  // initializeModules will initialize internal modules that do not require the container runtime to be up.
  1478  // Note that the modules here must not depend on modules that are not initialized here.
  1479  func (kl *Kubelet) initializeModules() error {
  1480  	// Prometheus metrics.
  1481  	metrics.Register(
  1482  		collectors.NewVolumeStatsCollector(kl),
  1483  		collectors.NewLogMetricsCollector(kl.StatsProvider.ListPodStats),
  1484  	)
  1485  	metrics.SetNodeName(kl.nodeName)
  1486  	servermetrics.Register()
  1487  
  1488  	// Setup filesystem directories.
  1489  	if err := kl.setupDataDirs(); err != nil {
  1490  		return err
  1491  	}
  1492  
  1493  	// If the container logs directory does not exist, create it.
  1494  	if _, err := os.Stat(ContainerLogsDir); err != nil {
  1495  		if err := kl.os.MkdirAll(ContainerLogsDir, 0755); err != nil {
  1496  			return fmt.Errorf("failed to create directory %q: %v", ContainerLogsDir, err)
  1497  		}
  1498  	}
  1499  
  1500  	// Start the image manager.
  1501  	kl.imageManager.Start()
  1502  
  1503  	// Start the certificate manager if it was enabled.
  1504  	if kl.serverCertificateManager != nil {
  1505  		kl.serverCertificateManager.Start()
  1506  	}
  1507  
  1508  	// Start out of memory watcher.
  1509  	if kl.oomWatcher != nil {
  1510  		if err := kl.oomWatcher.Start(kl.nodeRef); err != nil {
  1511  			return fmt.Errorf("failed to start OOM watcher: %w", err)
  1512  		}
  1513  	}
  1514  
  1515  	// Start resource analyzer
  1516  	kl.resourceAnalyzer.Start()
  1517  
  1518  	return nil
  1519  }
  1520  
  1521  // initializeRuntimeDependentModules will initialize internal modules that require the container runtime to be up.
  1522  func (kl *Kubelet) initializeRuntimeDependentModules() {
  1523  	if err := kl.cadvisor.Start(); err != nil {
  1524  		// Fail kubelet and rely on the babysitter to retry starting kubelet.
  1525  		klog.ErrorS(err, "Failed to start cAdvisor")
  1526  		os.Exit(1)
  1527  	}
  1528  
  1529  	// trigger on-demand stats collection once so that we have capacity information for ephemeral storage.
  1530  	// ignore any errors, since if stats collection is not successful, the container manager will fail to start below.
  1531  	kl.StatsProvider.GetCgroupStats("/", true)
  1532  	// Start container manager.
  1533  	node, err := kl.getNodeAnyWay()
  1534  	if err != nil {
  1535  		// Fail kubelet and rely on the babysitter to retry starting kubelet.
  1536  		klog.ErrorS(err, "Kubelet failed to get node info")
  1537  		os.Exit(1)
  1538  	}
  1539  	// containerManager must start after cAdvisor because it needs filesystem capacity information
  1540  	if err := kl.containerManager.Start(node, kl.GetActivePods, kl.sourcesReady, kl.statusManager, kl.runtimeService, kl.supportLocalStorageCapacityIsolation()); err != nil {
  1541  		// Fail kubelet and rely on the babysitter to retry starting kubelet.
  1542  		klog.ErrorS(err, "Failed to start ContainerManager")
  1543  		os.Exit(1)
  1544  	}
  1545  	// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
  1546  	kl.evictionManager.Start(kl.StatsProvider, kl.GetActivePods, kl.PodIsFinished, evictionMonitoringPeriod)
  1547  
  1548  	// container log manager must start after container runtime is up to retrieve information from container runtime
  1549  	// and inform container to reopen log file after log rotation.
  1550  	kl.containerLogManager.Start()
  1551  	// Adding Registration Callback function for CSI Driver
  1552  	kl.pluginManager.AddHandler(pluginwatcherapi.CSIPlugin, plugincache.PluginHandler(csi.PluginHandler))
  1553  	// Adding Registration Callback function for DRA Plugin
  1554  	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
  1555  		kl.pluginManager.AddHandler(pluginwatcherapi.DRAPlugin, plugincache.PluginHandler(draplugin.NewRegistrationHandler()))
  1556  	}
  1557  	// Adding Registration Callback function for Device Manager
  1558  	kl.pluginManager.AddHandler(pluginwatcherapi.DevicePlugin, kl.containerManager.GetPluginRegistrationHandler())
  1559  
  1560  	// Start the plugin manager
  1561  	klog.V(4).InfoS("Starting plugin manager")
  1562  	go kl.pluginManager.Run(kl.sourcesReady, wait.NeverStop)
  1563  
  1564  	err = kl.shutdownManager.Start()
  1565  	if err != nil {
  1566  		// The shutdown manager is not critical for kubelet, so log failure, but don't block Kubelet startup if there was a failure starting it.
  1567  		klog.ErrorS(err, "Failed to start node shutdown manager")
  1568  	}
  1569  }
  1570  
  1571  // Run starts the kubelet reacting to config updates
  1572  func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) {
  1573  	ctx := context.Background()
  1574  	if kl.logServer == nil {
  1575  		file := http.FileServer(http.Dir(nodeLogDir))
  1576  		if utilfeature.DefaultFeatureGate.Enabled(features.NodeLogQuery) && kl.kubeletConfiguration.EnableSystemLogQuery {
  1577  			kl.logServer = http.StripPrefix("/logs/", http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
  1578  				if nlq, errs := newNodeLogQuery(req.URL.Query()); len(errs) > 0 {
  1579  					http.Error(w, errs.ToAggregate().Error(), http.StatusBadRequest)
  1580  					return
  1581  				} else if nlq != nil {
  1582  					if req.URL.Path != "/" && req.URL.Path != "" {
  1583  						http.Error(w, "path not allowed in query mode", http.StatusNotAcceptable)
  1584  						return
  1585  					}
  1586  					if errs := nlq.validate(); len(errs) > 0 {
  1587  						http.Error(w, errs.ToAggregate().Error(), http.StatusNotAcceptable)
  1588  						return
  1589  					}
  1590  					// Validation ensures that the request does not query services and files at the same time
  1591  					if len(nlq.Services) > 0 {
  1592  						journal.ServeHTTP(w, req)
  1593  						return
  1594  					}
  1595  					// Validation ensures that the request does not explicitly query multiple files at the same time
  1596  					if len(nlq.Files) == 1 {
  1597  						// Account for the \ being used on Windows clients
  1598  						req.URL.Path = filepath.ToSlash(nlq.Files[0])
  1599  					}
  1600  				}
  1601  				// Fall back in case the caller is directly trying to query a file
  1602  				// Example: kubectl get --raw /api/v1/nodes/$name/proxy/logs/foo.log
  1603  				file.ServeHTTP(w, req)
  1604  			}))
  1605  		} else {
  1606  			kl.logServer = http.StripPrefix("/logs/", file)
  1607  		}
  1608  	}
  1609  	if kl.kubeClient == nil {
  1610  		klog.InfoS("No API server defined - no node status update will be sent")
  1611  	}
  1612  
  1613  	// Start the cloud provider sync manager
  1614  	if kl.cloudResourceSyncManager != nil {
  1615  		go kl.cloudResourceSyncManager.Run(wait.NeverStop)
  1616  	}
  1617  
  1618  	if err := kl.initializeModules(); err != nil {
  1619  		kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.KubeletSetupFailed, err.Error())
  1620  		klog.ErrorS(err, "Failed to initialize internal modules")
  1621  		os.Exit(1)
  1622  	}
  1623  
  1624  	// Start volume manager
  1625  	go kl.volumeManager.Run(kl.sourcesReady, wait.NeverStop)
  1626  
  1627  	if kl.kubeClient != nil {
  1628  		// Start two go-routines to update the status.
  1629  		//
  1630  		// The first will report to the apiserver every nodeStatusUpdateFrequency and is aimed to provide regular status intervals,
  1631  		// while the second is used to provide a more timely status update during initialization and runs an one-shot update to the apiserver
  1632  		// once the node becomes ready, then exits afterwards.
  1633  		//
  1634  		// Introduce some small jittering to ensure that over time the requests won't start
  1635  		// accumulating at approximately the same time from the set of nodes due to priority and
  1636  		// fairness effect.
  1637  		go wait.JitterUntil(kl.syncNodeStatus, kl.nodeStatusUpdateFrequency, 0.04, true, wait.NeverStop)
  1638  		go kl.fastStatusUpdateOnce()
  1639  
  1640  		// start syncing lease
  1641  		go kl.nodeLeaseController.Run(context.Background())
  1642  	}
  1643  	go wait.Until(kl.updateRuntimeUp, 5*time.Second, wait.NeverStop)
  1644  
  1645  	// Set up iptables util rules
  1646  	if kl.makeIPTablesUtilChains {
  1647  		kl.initNetworkUtil()
  1648  	}
  1649  
  1650  	// Start component sync loops.
  1651  	kl.statusManager.Start()
  1652  
  1653  	// Start syncing RuntimeClasses if enabled.
  1654  	if kl.runtimeClassManager != nil {
  1655  		kl.runtimeClassManager.Start(wait.NeverStop)
  1656  	}
  1657  
  1658  	// Start the pod lifecycle event generator.
  1659  	kl.pleg.Start()
  1660  
  1661  	// Start eventedPLEG only if EventedPLEG feature gate is enabled.
  1662  	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
  1663  		kl.eventedPleg.Start()
  1664  	}
  1665  
  1666  	kl.syncLoop(ctx, updates, kl)
  1667  }
  1668  
  1669  // SyncPod is the transaction script for the sync of a single pod (setting up)
  1670  // a pod. This method is reentrant and expected to converge a pod towards the
  1671  // desired state of the spec. The reverse (teardown) is handled in
  1672  // SyncTerminatingPod and SyncTerminatedPod. If SyncPod exits without error,
  1673  // then the pod runtime state is in sync with the desired configuration state
  1674  // (pod is running). If SyncPod exits with a transient error, the next
  1675  // invocation of SyncPod is expected to make progress towards reaching the
  1676  // desired state. SyncPod exits with isTerminal when the pod was detected to
  1677  // have reached a terminal lifecycle phase due to container exits (for
  1678  // RestartNever or RestartOnFailure) and the next method invoked will be
  1679  // SyncTerminatingPod. If the pod terminates for any other reason, SyncPod
  1680  // will receive a context cancellation and should exit as soon as possible.
  1681  //
  1682  // Arguments:
  1683  //
  1684  // updateType - whether this is a create (first time) or an update, should
  1685  // only be used for metrics since this method must be reentrant
  1686  //
  1687  // pod - the pod that is being set up
  1688  //
  1689  // mirrorPod - the mirror pod known to the kubelet for this pod, if any
  1690  //
  1691  // podStatus - the most recent pod status observed for this pod which can
  1692  // be used to determine the set of actions that should be taken during
  1693  // this loop of SyncPod
  1694  //
  1695  // The workflow is:
  1696  //   - If the pod is being created, record pod worker start latency
  1697  //   - Call generateAPIPodStatus to prepare an v1.PodStatus for the pod
  1698  //   - If the pod is being seen as running for the first time, record pod
  1699  //     start latency
  1700  //   - Update the status of the pod in the status manager
  1701  //   - Stop the pod's containers if it should not be running due to soft
  1702  //     admission
  1703  //   - Ensure any background tracking for a runnable pod is started
  1704  //   - Create a mirror pod if the pod is a static pod, and does not
  1705  //     already have a mirror pod
  1706  //   - Create the data directories for the pod if they do not exist
  1707  //   - Wait for volumes to attach/mount
  1708  //   - Fetch the pull secrets for the pod
  1709  //   - Call the container runtime's SyncPod callback
  1710  //   - Update the traffic shaping for the pod's ingress and egress limits
  1711  //
  1712  // If any step of this workflow errors, the error is returned, and is repeated
  1713  // on the next SyncPod call.
  1714  //
  1715  // This operation writes all events that are dispatched in order to provide
  1716  // the most accurate information possible about an error situation to aid debugging.
  1717  // Callers should not write an event if this operation returns an error.
  1718  func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (isTerminal bool, err error) {
  1719  	ctx, otelSpan := kl.tracer.Start(ctx, "syncPod", trace.WithAttributes(
  1720  		semconv.K8SPodUIDKey.String(string(pod.UID)),
  1721  		attribute.String("k8s.pod", klog.KObj(pod).String()),
  1722  		semconv.K8SPodNameKey.String(pod.Name),
  1723  		attribute.String("k8s.pod.update_type", updateType.String()),
  1724  		semconv.K8SNamespaceNameKey.String(pod.Namespace),
  1725  	))
  1726  	klog.V(4).InfoS("SyncPod enter", "pod", klog.KObj(pod), "podUID", pod.UID)
  1727  	defer func() {
  1728  		klog.V(4).InfoS("SyncPod exit", "pod", klog.KObj(pod), "podUID", pod.UID, "isTerminal", isTerminal)
  1729  		otelSpan.End()
  1730  	}()
  1731  
  1732  	// Latency measurements for the main workflow are relative to the
  1733  	// first time the pod was seen by kubelet.
  1734  	var firstSeenTime time.Time
  1735  	if firstSeenTimeStr, ok := pod.Annotations[kubetypes.ConfigFirstSeenAnnotationKey]; ok {
  1736  		firstSeenTime = kubetypes.ConvertToTimestamp(firstSeenTimeStr).Get()
  1737  	}
  1738  
  1739  	// Record pod worker start latency if being created
  1740  	// TODO: make pod workers record their own latencies
  1741  	if updateType == kubetypes.SyncPodCreate {
  1742  		if !firstSeenTime.IsZero() {
  1743  			// This is the first time we are syncing the pod. Record the latency
  1744  			// since kubelet first saw the pod if firstSeenTime is set.
  1745  			metrics.PodWorkerStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime))
  1746  		} else {
  1747  			klog.V(3).InfoS("First seen time not recorded for pod",
  1748  				"podUID", pod.UID,
  1749  				"pod", klog.KObj(pod))
  1750  		}
  1751  	}
  1752  
  1753  	// Generate final API pod status with pod and status manager status
  1754  	apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, false)
  1755  	// The pod IP may be changed in generateAPIPodStatus if the pod is using host network. (See #24576)
  1756  	// TODO(random-liu): After writing pod spec into container labels, check whether pod is using host network, and
  1757  	// set pod IP to hostIP directly in runtime.GetPodStatus
  1758  	podStatus.IPs = make([]string, 0, len(apiPodStatus.PodIPs))
  1759  	for _, ipInfo := range apiPodStatus.PodIPs {
  1760  		podStatus.IPs = append(podStatus.IPs, ipInfo.IP)
  1761  	}
  1762  	if len(podStatus.IPs) == 0 && len(apiPodStatus.PodIP) > 0 {
  1763  		podStatus.IPs = []string{apiPodStatus.PodIP}
  1764  	}
  1765  
  1766  	// If the pod is terminal, we don't need to continue to setup the pod
  1767  	if apiPodStatus.Phase == v1.PodSucceeded || apiPodStatus.Phase == v1.PodFailed {
  1768  		kl.statusManager.SetPodStatus(pod, apiPodStatus)
  1769  		isTerminal = true
  1770  		return isTerminal, nil
  1771  	}
  1772  
  1773  	// If the pod should not be running, we request the pod's containers be stopped. This is not the same
  1774  	// as termination (we want to stop the pod, but potentially restart it later if soft admission allows
  1775  	// it later). Set the status and phase appropriately
  1776  	runnable := kl.canRunPod(pod)
  1777  	if !runnable.Admit {
  1778  		// Pod is not runnable; and update the Pod and Container statuses to why.
  1779  		if apiPodStatus.Phase != v1.PodFailed && apiPodStatus.Phase != v1.PodSucceeded {
  1780  			apiPodStatus.Phase = v1.PodPending
  1781  		}
  1782  		apiPodStatus.Reason = runnable.Reason
  1783  		apiPodStatus.Message = runnable.Message
  1784  		// Waiting containers are not creating.
  1785  		const waitingReason = "Blocked"
  1786  		for _, cs := range apiPodStatus.InitContainerStatuses {
  1787  			if cs.State.Waiting != nil {
  1788  				cs.State.Waiting.Reason = waitingReason
  1789  			}
  1790  		}
  1791  		for _, cs := range apiPodStatus.ContainerStatuses {
  1792  			if cs.State.Waiting != nil {
  1793  				cs.State.Waiting.Reason = waitingReason
  1794  			}
  1795  		}
  1796  	}
  1797  
  1798  	// Record the time it takes for the pod to become running
  1799  	// since kubelet first saw the pod if firstSeenTime is set.
  1800  	existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID)
  1801  	if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning &&
  1802  		!firstSeenTime.IsZero() {
  1803  		metrics.PodStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime))
  1804  	}
  1805  
  1806  	kl.statusManager.SetPodStatus(pod, apiPodStatus)
  1807  
  1808  	// Pods that are not runnable must be stopped - return a typed error to the pod worker
  1809  	if !runnable.Admit {
  1810  		klog.V(2).InfoS("Pod is not runnable and must have running containers stopped", "pod", klog.KObj(pod), "podUID", pod.UID, "message", runnable.Message)
  1811  		var syncErr error
  1812  		p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
  1813  		if err := kl.killPod(ctx, pod, p, nil); err != nil {
  1814  			if !wait.Interrupted(err) {
  1815  				kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
  1816  				syncErr = fmt.Errorf("error killing pod: %w", err)
  1817  				utilruntime.HandleError(syncErr)
  1818  			}
  1819  		} else {
  1820  			// There was no error killing the pod, but the pod cannot be run.
  1821  			// Return an error to signal that the sync loop should back off.
  1822  			syncErr = fmt.Errorf("pod cannot be run: %v", runnable.Message)
  1823  		}
  1824  		return false, syncErr
  1825  	}
  1826  
  1827  	// If the network plugin is not ready, only start the pod if it uses the host network
  1828  	if err := kl.runtimeState.networkErrors(); err != nil && !kubecontainer.IsHostNetworkPod(pod) {
  1829  		kl.recorder.Eventf(pod, v1.EventTypeWarning, events.NetworkNotReady, "%s: %v", NetworkNotReadyErrorMsg, err)
  1830  		return false, fmt.Errorf("%s: %v", NetworkNotReadyErrorMsg, err)
  1831  	}
  1832  
  1833  	// ensure the kubelet knows about referenced secrets or configmaps used by the pod
  1834  	if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {
  1835  		if kl.secretManager != nil {
  1836  			kl.secretManager.RegisterPod(pod)
  1837  		}
  1838  		if kl.configMapManager != nil {
  1839  			kl.configMapManager.RegisterPod(pod)
  1840  		}
  1841  	}
  1842  
  1843  	// Create Cgroups for the pod and apply resource parameters
  1844  	// to them if cgroups-per-qos flag is enabled.
  1845  	pcm := kl.containerManager.NewPodContainerManager()
  1846  	// If pod has already been terminated then we need not create
  1847  	// or update the pod's cgroup
  1848  	// TODO: once context cancellation is added this check can be removed
  1849  	if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {
  1850  		// When the kubelet is restarted with the cgroups-per-qos
  1851  		// flag enabled, all the pod's running containers
  1852  		// should be killed intermittently and brought back up
  1853  		// under the qos cgroup hierarchy.
  1854  		// Check if this is the pod's first sync
  1855  		firstSync := true
  1856  		for _, containerStatus := range apiPodStatus.ContainerStatuses {
  1857  			if containerStatus.State.Running != nil {
  1858  				firstSync = false
  1859  				break
  1860  			}
  1861  		}
  1862  		// Don't kill containers in pod if pod's cgroups already
  1863  		// exists or the pod is running for the first time
  1864  		podKilled := false
  1865  		if !pcm.Exists(pod) && !firstSync {
  1866  			p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
  1867  			if err := kl.killPod(ctx, pod, p, nil); err == nil {
  1868  				if wait.Interrupted(err) {
  1869  					return false, err
  1870  				}
  1871  				podKilled = true
  1872  			} else {
  1873  				klog.ErrorS(err, "KillPod failed", "pod", klog.KObj(pod), "podStatus", podStatus)
  1874  			}
  1875  		}
  1876  		// Create and Update pod's Cgroups
  1877  		// Don't create cgroups for run once pod if it was killed above
  1878  		// The current policy is not to restart the run once pods when
  1879  		// the kubelet is restarted with the new flag as run once pods are
  1880  		// expected to run only once and if the kubelet is restarted then
  1881  		// they are not expected to run again.
  1882  		// We don't create and apply updates to cgroup if its a run once pod and was killed above
  1883  		if !(podKilled && pod.Spec.RestartPolicy == v1.RestartPolicyNever) {
  1884  			if !pcm.Exists(pod) {
  1885  				if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
  1886  					klog.V(2).InfoS("Failed to update QoS cgroups while syncing pod", "pod", klog.KObj(pod), "err", err)
  1887  				}
  1888  				if err := pcm.EnsureExists(pod); err != nil {
  1889  					kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToCreatePodContainer, "unable to ensure pod container exists: %v", err)
  1890  					return false, fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err)
  1891  				}
  1892  			}
  1893  		}
  1894  	}
  1895  
  1896  	// Create Mirror Pod for Static Pod if it doesn't already exist
  1897  	if kubetypes.IsStaticPod(pod) {
  1898  		deleted := false
  1899  		if mirrorPod != nil {
  1900  			if mirrorPod.DeletionTimestamp != nil || !kubepod.IsMirrorPodOf(mirrorPod, pod) {
  1901  				// The mirror pod is semantically different from the static pod. Remove
  1902  				// it. The mirror pod will get recreated later.
  1903  				klog.InfoS("Trying to delete pod", "pod", klog.KObj(pod), "podUID", mirrorPod.ObjectMeta.UID)
  1904  				podFullName := kubecontainer.GetPodFullName(pod)
  1905  				var err error
  1906  				deleted, err = kl.mirrorPodClient.DeleteMirrorPod(podFullName, &mirrorPod.ObjectMeta.UID)
  1907  				if deleted {
  1908  					klog.InfoS("Deleted mirror pod because it is outdated", "pod", klog.KObj(mirrorPod))
  1909  				} else if err != nil {
  1910  					klog.ErrorS(err, "Failed deleting mirror pod", "pod", klog.KObj(mirrorPod))
  1911  				}
  1912  			}
  1913  		}
  1914  		if mirrorPod == nil || deleted {
  1915  			node, err := kl.GetNode()
  1916  			if err != nil || node.DeletionTimestamp != nil {
  1917  				klog.V(4).InfoS("No need to create a mirror pod, since node has been removed from the cluster", "node", klog.KRef("", string(kl.nodeName)))
  1918  			} else {
  1919  				klog.V(4).InfoS("Creating a mirror pod for static pod", "pod", klog.KObj(pod))
  1920  				if err := kl.mirrorPodClient.CreateMirrorPod(pod); err != nil {
  1921  					klog.ErrorS(err, "Failed creating a mirror pod for", "pod", klog.KObj(pod))
  1922  				}
  1923  			}
  1924  		}
  1925  	}
  1926  
  1927  	// Make data directories for the pod
  1928  	if err := kl.makePodDataDirs(pod); err != nil {
  1929  		kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToMakePodDataDirectories, "error making pod data directories: %v", err)
  1930  		klog.ErrorS(err, "Unable to make pod data directories for pod", "pod", klog.KObj(pod))
  1931  		return false, err
  1932  	}
  1933  
  1934  	// Wait for volumes to attach/mount
  1935  	if err := kl.volumeManager.WaitForAttachAndMount(ctx, pod); err != nil {
  1936  		if !wait.Interrupted(err) {
  1937  			kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedMountVolume, "Unable to attach or mount volumes: %v", err)
  1938  			klog.ErrorS(err, "Unable to attach or mount volumes for pod; skipping pod", "pod", klog.KObj(pod))
  1939  		}
  1940  		return false, err
  1941  	}
  1942  
  1943  	// Fetch the pull secrets for the pod
  1944  	pullSecrets := kl.getPullSecretsForPod(pod)
  1945  
  1946  	// Ensure the pod is being probed
  1947  	kl.probeManager.AddPod(pod)
  1948  
  1949  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
  1950  		// Handle pod resize here instead of doing it in HandlePodUpdates because
  1951  		// this conveniently retries any Deferred resize requests
  1952  		// TODO(vinaykul,InPlacePodVerticalScaling): Investigate doing this in HandlePodUpdates + periodic SyncLoop scan
  1953  		//     See: https://github.com/kubernetes/kubernetes/pull/102884#discussion_r663160060
  1954  		if kl.podWorkers.CouldHaveRunningContainers(pod.UID) && !kubetypes.IsStaticPod(pod) {
  1955  			pod = kl.handlePodResourcesResize(pod)
  1956  		}
  1957  	}
  1958  
  1959  	// TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker.
  1960  	// Currently, using that context causes test failures. To remove this todoCtx, any wait.Interrupted
  1961  	// errors need to be filtered from result and bypass the reasonCache - cancelling the context for
  1962  	// SyncPod is a known and deliberate error, not a generic error.
  1963  	todoCtx := context.TODO()
  1964  	// Call the container runtime's SyncPod callback
  1965  	result := kl.containerRuntime.SyncPod(todoCtx, pod, podStatus, pullSecrets, kl.backOff)
  1966  	kl.reasonCache.Update(pod.UID, result)
  1967  	if err := result.Error(); err != nil {
  1968  		// Do not return error if the only failures were pods in backoff
  1969  		for _, r := range result.SyncResults {
  1970  			if r.Error != kubecontainer.ErrCrashLoopBackOff && r.Error != images.ErrImagePullBackOff {
  1971  				// Do not record an event here, as we keep all event logging for sync pod failures
  1972  				// local to container runtime, so we get better errors.
  1973  				return false, err
  1974  			}
  1975  		}
  1976  
  1977  		return false, nil
  1978  	}
  1979  
  1980  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && isPodResizeInProgress(pod, &apiPodStatus) {
  1981  		// While resize is in progress, periodically call PLEG to update pod cache
  1982  		runningPod := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
  1983  		if err, _ := kl.pleg.UpdateCache(&runningPod, pod.UID); err != nil {
  1984  			klog.ErrorS(err, "Failed to update pod cache", "pod", klog.KObj(pod))
  1985  			return false, err
  1986  		}
  1987  	}
  1988  
  1989  	return false, nil
  1990  }
  1991  
  1992  // SyncTerminatingPod is expected to terminate all running containers in a pod. Once this method
  1993  // returns without error, the pod is considered to be terminated and it will be safe to clean up any
  1994  // pod state that is tied to the lifetime of running containers. The next method invoked will be
  1995  // SyncTerminatedPod. This method is expected to return with the grace period provided and the
  1996  // provided context may be cancelled if the duration is exceeded. The method may also be interrupted
  1997  // with a context cancellation if the grace period is shortened by the user or the kubelet (such as
  1998  // during eviction). This method is not guaranteed to be called if a pod is force deleted from the
  1999  // configuration and the kubelet is restarted - SyncTerminatingRuntimePod handles those orphaned
  2000  // pods.
  2001  func (kl *Kubelet) SyncTerminatingPod(_ context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error {
  2002  	// TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker.
  2003  	// Currently, using that context causes test failures.
  2004  	ctx, otelSpan := kl.tracer.Start(context.Background(), "syncTerminatingPod", trace.WithAttributes(
  2005  		semconv.K8SPodUIDKey.String(string(pod.UID)),
  2006  		attribute.String("k8s.pod", klog.KObj(pod).String()),
  2007  		semconv.K8SPodNameKey.String(pod.Name),
  2008  		semconv.K8SNamespaceNameKey.String(pod.Namespace),
  2009  	))
  2010  	defer otelSpan.End()
  2011  	klog.V(4).InfoS("SyncTerminatingPod enter", "pod", klog.KObj(pod), "podUID", pod.UID)
  2012  	defer klog.V(4).InfoS("SyncTerminatingPod exit", "pod", klog.KObj(pod), "podUID", pod.UID)
  2013  
  2014  	apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, false)
  2015  	if podStatusFn != nil {
  2016  		podStatusFn(&apiPodStatus)
  2017  	}
  2018  	kl.statusManager.SetPodStatus(pod, apiPodStatus)
  2019  
  2020  	if gracePeriod != nil {
  2021  		klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", *gracePeriod)
  2022  	} else {
  2023  		klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", nil)
  2024  	}
  2025  
  2026  	kl.probeManager.StopLivenessAndStartup(pod)
  2027  
  2028  	p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
  2029  	if err := kl.killPod(ctx, pod, p, gracePeriod); err != nil {
  2030  		kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
  2031  		// there was an error killing the pod, so we return that error directly
  2032  		utilruntime.HandleError(err)
  2033  		return err
  2034  	}
  2035  
  2036  	// Once the containers are stopped, we can stop probing for liveness and readiness.
  2037  	// TODO: once a pod is terminal, certain probes (liveness exec) could be stopped immediately after
  2038  	//   the detection of a container shutdown or (for readiness) after the first failure. Tracked as
  2039  	//   https://github.com/kubernetes/kubernetes/issues/107894 although may not be worth optimizing.
  2040  	kl.probeManager.RemovePod(pod)
  2041  
  2042  	// Guard against consistency issues in KillPod implementations by checking that there are no
  2043  	// running containers. This method is invoked infrequently so this is effectively free and can
  2044  	// catch race conditions introduced by callers updating pod status out of order.
  2045  	// TODO: have KillPod return the terminal status of stopped containers and write that into the
  2046  	//  cache immediately
  2047  	podStatus, err := kl.containerRuntime.GetPodStatus(ctx, pod.UID, pod.Name, pod.Namespace)
  2048  	if err != nil {
  2049  		klog.ErrorS(err, "Unable to read pod status prior to final pod termination", "pod", klog.KObj(pod), "podUID", pod.UID)
  2050  		return err
  2051  	}
  2052  	var runningContainers []string
  2053  	type container struct {
  2054  		Name       string
  2055  		State      string
  2056  		ExitCode   int
  2057  		FinishedAt string
  2058  	}
  2059  	var containers []container
  2060  	klogV := klog.V(4)
  2061  	klogVEnabled := klogV.Enabled()
  2062  	for _, s := range podStatus.ContainerStatuses {
  2063  		if s.State == kubecontainer.ContainerStateRunning {
  2064  			runningContainers = append(runningContainers, s.ID.String())
  2065  		}
  2066  		if klogVEnabled {
  2067  			containers = append(containers, container{Name: s.Name, State: string(s.State), ExitCode: s.ExitCode, FinishedAt: s.FinishedAt.UTC().Format(time.RFC3339Nano)})
  2068  		}
  2069  	}
  2070  	if klogVEnabled {
  2071  		sort.Slice(containers, func(i, j int) bool { return containers[i].Name < containers[j].Name })
  2072  		klog.V(4).InfoS("Post-termination container state", "pod", klog.KObj(pod), "podUID", pod.UID, "containers", containers)
  2073  	}
  2074  	if len(runningContainers) > 0 {
  2075  		return fmt.Errorf("detected running containers after a successful KillPod, CRI violation: %v", runningContainers)
  2076  	}
  2077  
  2078  	// NOTE: resources must be unprepared AFTER all containers have stopped
  2079  	// and BEFORE the pod status is changed on the API server
  2080  	// to avoid race conditions with the resource deallocation code in kubernetes core.
  2081  	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
  2082  		if err := kl.UnprepareDynamicResources(pod); err != nil {
  2083  			return err
  2084  		}
  2085  	}
  2086  
  2087  	// Compute and update the status in cache once the pods are no longer running.
  2088  	// The computation is done here to ensure the pod status used for it contains
  2089  	// information about the container end states (including exit codes) - when
  2090  	// SyncTerminatedPod is called the containers may already be removed.
  2091  	apiPodStatus = kl.generateAPIPodStatus(pod, podStatus, true)
  2092  	kl.statusManager.SetPodStatus(pod, apiPodStatus)
  2093  
  2094  	// we have successfully stopped all containers, the pod is terminating, our status is "done"
  2095  	klog.V(4).InfoS("Pod termination stopped all running containers", "pod", klog.KObj(pod), "podUID", pod.UID)
  2096  
  2097  	return nil
  2098  }
  2099  
  2100  // SyncTerminatingRuntimePod is expected to terminate running containers in a pod that we have no
  2101  // configuration for. Once this method returns without error, any remaining local state can be safely
  2102  // cleaned up by background processes in each subsystem. Unlike syncTerminatingPod, we lack
  2103  // knowledge of the full pod spec and so cannot perform lifecycle related operations, only ensure
  2104  // that the remnant of the running pod is terminated and allow garbage collection to proceed. We do
  2105  // not update the status of the pod because with the source of configuration removed, we have no
  2106  // place to send that status.
  2107  func (kl *Kubelet) SyncTerminatingRuntimePod(_ context.Context, runningPod *kubecontainer.Pod) error {
  2108  	// TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker.
  2109  	// Currently, using that context causes test failures.
  2110  	ctx := context.Background()
  2111  	pod := runningPod.ToAPIPod()
  2112  	klog.V(4).InfoS("SyncTerminatingRuntimePod enter", "pod", klog.KObj(pod), "podUID", pod.UID)
  2113  	defer klog.V(4).InfoS("SyncTerminatingRuntimePod exit", "pod", klog.KObj(pod), "podUID", pod.UID)
  2114  
  2115  	// we kill the pod directly since we have lost all other information about the pod.
  2116  	klog.V(4).InfoS("Orphaned running pod terminating without grace period", "pod", klog.KObj(pod), "podUID", pod.UID)
  2117  	// TODO: this should probably be zero, to bypass any waiting (needs fixes in container runtime)
  2118  	gracePeriod := int64(1)
  2119  	if err := kl.killPod(ctx, pod, *runningPod, &gracePeriod); err != nil {
  2120  		kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
  2121  		// there was an error killing the pod, so we return that error directly
  2122  		utilruntime.HandleError(err)
  2123  		return err
  2124  	}
  2125  	klog.V(4).InfoS("Pod termination stopped all running orphaned containers", "pod", klog.KObj(pod), "podUID", pod.UID)
  2126  	return nil
  2127  }
  2128  
  2129  // SyncTerminatedPod cleans up a pod that has terminated (has no running containers).
  2130  // The invocations in this call are expected to tear down all pod resources.
  2131  // When this method exits the pod is expected to be ready for cleanup. This method
  2132  // reduces the latency of pod cleanup but is not guaranteed to get called in all scenarios.
  2133  //
  2134  // Because the kubelet has no local store of information, all actions in this method that modify
  2135  // on-disk state must be reentrant and be garbage collected by HandlePodCleanups or a separate loop.
  2136  // This typically occurs when a pod is force deleted from configuration (local disk or API) and the
  2137  // kubelet restarts in the middle of the action.
  2138  func (kl *Kubelet) SyncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error {
  2139  	ctx, otelSpan := kl.tracer.Start(ctx, "syncTerminatedPod", trace.WithAttributes(
  2140  		semconv.K8SPodUIDKey.String(string(pod.UID)),
  2141  		attribute.String("k8s.pod", klog.KObj(pod).String()),
  2142  		semconv.K8SPodNameKey.String(pod.Name),
  2143  		semconv.K8SNamespaceNameKey.String(pod.Namespace),
  2144  	))
  2145  	defer otelSpan.End()
  2146  	klog.V(4).InfoS("SyncTerminatedPod enter", "pod", klog.KObj(pod), "podUID", pod.UID)
  2147  	defer klog.V(4).InfoS("SyncTerminatedPod exit", "pod", klog.KObj(pod), "podUID", pod.UID)
  2148  
  2149  	// generate the final status of the pod
  2150  	// TODO: should we simply fold this into TerminatePod? that would give a single pod update
  2151  	apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, true)
  2152  
  2153  	kl.statusManager.SetPodStatus(pod, apiPodStatus)
  2154  
  2155  	// volumes are unmounted after the pod worker reports ShouldPodRuntimeBeRemoved (which is satisfied
  2156  	// before syncTerminatedPod is invoked)
  2157  	if err := kl.volumeManager.WaitForUnmount(ctx, pod); err != nil {
  2158  		return err
  2159  	}
  2160  	klog.V(4).InfoS("Pod termination unmounted volumes", "pod", klog.KObj(pod), "podUID", pod.UID)
  2161  
  2162  	if !kl.keepTerminatedPodVolumes {
  2163  		// This waiting loop relies on the background cleanup which starts after pod workers respond
  2164  		// true for ShouldPodRuntimeBeRemoved, which happens after `SyncTerminatingPod` is completed.
  2165  		if err := wait.PollUntilContextCancel(ctx, 100*time.Millisecond, true, func(ctx context.Context) (bool, error) {
  2166  			volumesExist := kl.podVolumesExist(pod.UID)
  2167  			if volumesExist {
  2168  				klog.V(3).InfoS("Pod is terminated, but some volumes have not been cleaned up", "pod", klog.KObj(pod), "podUID", pod.UID)
  2169  			}
  2170  			return !volumesExist, nil
  2171  		}); err != nil {
  2172  			return err
  2173  		}
  2174  		klog.V(3).InfoS("Pod termination cleaned up volume paths", "pod", klog.KObj(pod), "podUID", pod.UID)
  2175  	}
  2176  
  2177  	// After volume unmount is complete, let the secret and configmap managers know we're done with this pod
  2178  	if kl.secretManager != nil {
  2179  		kl.secretManager.UnregisterPod(pod)
  2180  	}
  2181  	if kl.configMapManager != nil {
  2182  		kl.configMapManager.UnregisterPod(pod)
  2183  	}
  2184  
  2185  	// Note: we leave pod containers to be reclaimed in the background since dockershim requires the
  2186  	// container for retrieving logs and we want to make sure logs are available until the pod is
  2187  	// physically deleted.
  2188  
  2189  	// remove any cgroups in the hierarchy for pods that are no longer running.
  2190  	if kl.cgroupsPerQOS {
  2191  		pcm := kl.containerManager.NewPodContainerManager()
  2192  		name, _ := pcm.GetPodContainerName(pod)
  2193  		if err := pcm.Destroy(name); err != nil {
  2194  			return err
  2195  		}
  2196  		klog.V(4).InfoS("Pod termination removed cgroups", "pod", klog.KObj(pod), "podUID", pod.UID)
  2197  	}
  2198  
  2199  	kl.usernsManager.Release(pod.UID)
  2200  
  2201  	// mark the final pod status
  2202  	kl.statusManager.TerminatePod(pod)
  2203  	klog.V(4).InfoS("Pod is terminated and will need no more status updates", "pod", klog.KObj(pod), "podUID", pod.UID)
  2204  
  2205  	return nil
  2206  }
  2207  
  2208  // Get pods which should be resynchronized. Currently, the following pod should be resynchronized:
  2209  //   - pod whose work is ready.
  2210  //   - internal modules that request sync of a pod.
  2211  //
  2212  // This method does not return orphaned pods (those known only to the pod worker that may have
  2213  // been deleted from configuration). Those pods are synced by HandlePodCleanups as a consequence
  2214  // of driving the state machine to completion.
  2215  //
  2216  // TODO: Consider synchronizing all pods which have not recently been acted on to be resilient
  2217  // to bugs that might prevent updates from being delivered (such as the previous bug with
  2218  // orphaned pods). Instead of asking the work queue for pending work, consider asking the
  2219  // PodWorker which pods should be synced.
  2220  func (kl *Kubelet) getPodsToSync() []*v1.Pod {
  2221  	allPods := kl.podManager.GetPods()
  2222  	podUIDs := kl.workQueue.GetWork()
  2223  	podUIDSet := sets.NewString()
  2224  	for _, podUID := range podUIDs {
  2225  		podUIDSet.Insert(string(podUID))
  2226  	}
  2227  	var podsToSync []*v1.Pod
  2228  	for _, pod := range allPods {
  2229  		if podUIDSet.Has(string(pod.UID)) {
  2230  			// The work of the pod is ready
  2231  			podsToSync = append(podsToSync, pod)
  2232  			continue
  2233  		}
  2234  		for _, podSyncLoopHandler := range kl.PodSyncLoopHandlers {
  2235  			if podSyncLoopHandler.ShouldSync(pod) {
  2236  				podsToSync = append(podsToSync, pod)
  2237  				break
  2238  			}
  2239  		}
  2240  	}
  2241  	return podsToSync
  2242  }
  2243  
  2244  // deletePod deletes the pod from the internal state of the kubelet by:
  2245  // 1.  stopping the associated pod worker asynchronously
  2246  // 2.  signaling to kill the pod by sending on the podKillingCh channel
  2247  //
  2248  // deletePod returns an error if not all sources are ready or the pod is not
  2249  // found in the runtime cache.
  2250  func (kl *Kubelet) deletePod(pod *v1.Pod) error {
  2251  	if pod == nil {
  2252  		return fmt.Errorf("deletePod does not allow nil pod")
  2253  	}
  2254  	if !kl.sourcesReady.AllReady() {
  2255  		// If the sources aren't ready, skip deletion, as we may accidentally delete pods
  2256  		// for sources that haven't reported yet.
  2257  		return fmt.Errorf("skipping delete because sources aren't ready yet")
  2258  	}
  2259  	klog.V(3).InfoS("Pod has been deleted and must be killed", "pod", klog.KObj(pod), "podUID", pod.UID)
  2260  	kl.podWorkers.UpdatePod(UpdatePodOptions{
  2261  		Pod:        pod,
  2262  		UpdateType: kubetypes.SyncPodKill,
  2263  	})
  2264  	// We leave the volume/directory cleanup to the periodic cleanup routine.
  2265  	return nil
  2266  }
  2267  
  2268  // rejectPod records an event about the pod with the given reason and message,
  2269  // and updates the pod to the failed phase in the status manager.
  2270  func (kl *Kubelet) rejectPod(pod *v1.Pod, reason, message string) {
  2271  	kl.recorder.Eventf(pod, v1.EventTypeWarning, reason, message)
  2272  	kl.statusManager.SetPodStatus(pod, v1.PodStatus{
  2273  		Phase:   v1.PodFailed,
  2274  		Reason:  reason,
  2275  		Message: "Pod was rejected: " + message})
  2276  }
  2277  
  2278  // canAdmitPod determines if a pod can be admitted, and gives a reason if it
  2279  // cannot. "pod" is new pod, while "pods" are all admitted pods
  2280  // The function returns a boolean value indicating whether the pod
  2281  // can be admitted, a brief single-word reason and a message explaining why
  2282  // the pod cannot be admitted.
  2283  func (kl *Kubelet) canAdmitPod(pods []*v1.Pod, pod *v1.Pod) (bool, string, string) {
  2284  	// the kubelet will invoke each pod admit handler in sequence
  2285  	// if any handler rejects, the pod is rejected.
  2286  	// TODO: move out of disk check into a pod admitter
  2287  	// TODO: out of resource eviction should have a pod admitter call-out
  2288  	attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: pods}
  2289  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
  2290  		// Use allocated resources values from checkpoint store (source of truth) to determine fit
  2291  		otherPods := make([]*v1.Pod, 0, len(pods))
  2292  		for _, p := range pods {
  2293  			op := p.DeepCopy()
  2294  			kl.updateContainerResourceAllocation(op)
  2295  
  2296  			otherPods = append(otherPods, op)
  2297  		}
  2298  		attrs.OtherPods = otherPods
  2299  	}
  2300  	for _, podAdmitHandler := range kl.admitHandlers {
  2301  		if result := podAdmitHandler.Admit(attrs); !result.Admit {
  2302  			return false, result.Reason, result.Message
  2303  		}
  2304  	}
  2305  
  2306  	return true, "", ""
  2307  }
  2308  
  2309  func (kl *Kubelet) canRunPod(pod *v1.Pod) lifecycle.PodAdmitResult {
  2310  	attrs := &lifecycle.PodAdmitAttributes{Pod: pod}
  2311  	// Get "OtherPods". Rejected pods are failed, so only include admitted pods that are alive.
  2312  	attrs.OtherPods = kl.GetActivePods()
  2313  
  2314  	for _, handler := range kl.softAdmitHandlers {
  2315  		if result := handler.Admit(attrs); !result.Admit {
  2316  			return result
  2317  		}
  2318  	}
  2319  
  2320  	return lifecycle.PodAdmitResult{Admit: true}
  2321  }
  2322  
  2323  // syncLoop is the main loop for processing changes. It watches for changes from
  2324  // three channels (file, apiserver, and http) and creates a union of them. For
  2325  // any new change seen, will run a sync against desired state and running state. If
  2326  // no changes are seen to the configuration, will synchronize the last known desired
  2327  // state every sync-frequency seconds. Never returns.
  2328  func (kl *Kubelet) syncLoop(ctx context.Context, updates <-chan kubetypes.PodUpdate, handler SyncHandler) {
  2329  	klog.InfoS("Starting kubelet main sync loop")
  2330  	// The syncTicker wakes up kubelet to checks if there are any pod workers
  2331  	// that need to be sync'd. A one-second period is sufficient because the
  2332  	// sync interval is defaulted to 10s.
  2333  	syncTicker := time.NewTicker(time.Second)
  2334  	defer syncTicker.Stop()
  2335  	housekeepingTicker := time.NewTicker(housekeepingPeriod)
  2336  	defer housekeepingTicker.Stop()
  2337  	plegCh := kl.pleg.Watch()
  2338  	const (
  2339  		base   = 100 * time.Millisecond
  2340  		max    = 5 * time.Second
  2341  		factor = 2
  2342  	)
  2343  	duration := base
  2344  	// Responsible for checking limits in resolv.conf
  2345  	// The limits do not have anything to do with individual pods
  2346  	// Since this is called in syncLoop, we don't need to call it anywhere else
  2347  	if kl.dnsConfigurer != nil && kl.dnsConfigurer.ResolverConfig != "" {
  2348  		kl.dnsConfigurer.CheckLimitsForResolvConf()
  2349  	}
  2350  
  2351  	for {
  2352  		if err := kl.runtimeState.runtimeErrors(); err != nil {
  2353  			klog.ErrorS(err, "Skipping pod synchronization")
  2354  			// exponential backoff
  2355  			time.Sleep(duration)
  2356  			duration = time.Duration(math.Min(float64(max), factor*float64(duration)))
  2357  			continue
  2358  		}
  2359  		// reset backoff if we have a success
  2360  		duration = base
  2361  
  2362  		kl.syncLoopMonitor.Store(kl.clock.Now())
  2363  		if !kl.syncLoopIteration(ctx, updates, handler, syncTicker.C, housekeepingTicker.C, plegCh) {
  2364  			break
  2365  		}
  2366  		kl.syncLoopMonitor.Store(kl.clock.Now())
  2367  	}
  2368  }
  2369  
  2370  // syncLoopIteration reads from various channels and dispatches pods to the
  2371  // given handler.
  2372  //
  2373  // Arguments:
  2374  // 1.  configCh:       a channel to read config events from
  2375  // 2.  handler:        the SyncHandler to dispatch pods to
  2376  // 3.  syncCh:         a channel to read periodic sync events from
  2377  // 4.  housekeepingCh: a channel to read housekeeping events from
  2378  // 5.  plegCh:         a channel to read PLEG updates from
  2379  //
  2380  // Events are also read from the kubelet liveness manager's update channel.
  2381  //
  2382  // The workflow is to read from one of the channels, handle that event, and
  2383  // update the timestamp in the sync loop monitor.
  2384  //
  2385  // Here is an appropriate place to note that despite the syntactical
  2386  // similarity to the switch statement, the case statements in a select are
  2387  // evaluated in a pseudorandom order if there are multiple channels ready to
  2388  // read from when the select is evaluated.  In other words, case statements
  2389  // are evaluated in random order, and you can not assume that the case
  2390  // statements evaluate in order if multiple channels have events.
  2391  //
  2392  // With that in mind, in truly no particular order, the different channels
  2393  // are handled as follows:
  2394  //
  2395  //   - configCh: dispatch the pods for the config change to the appropriate
  2396  //     handler callback for the event type
  2397  //   - plegCh: update the runtime cache; sync pod
  2398  //   - syncCh: sync all pods waiting for sync
  2399  //   - housekeepingCh: trigger cleanup of pods
  2400  //   - health manager: sync pods that have failed or in which one or more
  2401  //     containers have failed health checks
  2402  func (kl *Kubelet) syncLoopIteration(ctx context.Context, configCh <-chan kubetypes.PodUpdate, handler SyncHandler,
  2403  	syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool {
  2404  	select {
  2405  	case u, open := <-configCh:
  2406  		// Update from a config source; dispatch it to the right handler
  2407  		// callback.
  2408  		if !open {
  2409  			klog.ErrorS(nil, "Update channel is closed, exiting the sync loop")
  2410  			return false
  2411  		}
  2412  
  2413  		switch u.Op {
  2414  		case kubetypes.ADD:
  2415  			klog.V(2).InfoS("SyncLoop ADD", "source", u.Source, "pods", klog.KObjSlice(u.Pods))
  2416  			// After restarting, kubelet will get all existing pods through
  2417  			// ADD as if they are new pods. These pods will then go through the
  2418  			// admission process and *may* be rejected. This can be resolved
  2419  			// once we have checkpointing.
  2420  			handler.HandlePodAdditions(u.Pods)
  2421  		case kubetypes.UPDATE:
  2422  			klog.V(2).InfoS("SyncLoop UPDATE", "source", u.Source, "pods", klog.KObjSlice(u.Pods))
  2423  			handler.HandlePodUpdates(u.Pods)
  2424  		case kubetypes.REMOVE:
  2425  			klog.V(2).InfoS("SyncLoop REMOVE", "source", u.Source, "pods", klog.KObjSlice(u.Pods))
  2426  			handler.HandlePodRemoves(u.Pods)
  2427  		case kubetypes.RECONCILE:
  2428  			klog.V(4).InfoS("SyncLoop RECONCILE", "source", u.Source, "pods", klog.KObjSlice(u.Pods))
  2429  			handler.HandlePodReconcile(u.Pods)
  2430  		case kubetypes.DELETE:
  2431  			klog.V(2).InfoS("SyncLoop DELETE", "source", u.Source, "pods", klog.KObjSlice(u.Pods))
  2432  			// DELETE is treated as a UPDATE because of graceful deletion.
  2433  			handler.HandlePodUpdates(u.Pods)
  2434  		case kubetypes.SET:
  2435  			// TODO: Do we want to support this?
  2436  			klog.ErrorS(nil, "Kubelet does not support snapshot update")
  2437  		default:
  2438  			klog.ErrorS(nil, "Invalid operation type received", "operation", u.Op)
  2439  		}
  2440  
  2441  		kl.sourcesReady.AddSource(u.Source)
  2442  
  2443  	case e := <-plegCh:
  2444  		if isSyncPodWorthy(e) {
  2445  			// PLEG event for a pod; sync it.
  2446  			if pod, ok := kl.podManager.GetPodByUID(e.ID); ok {
  2447  				klog.V(2).InfoS("SyncLoop (PLEG): event for pod", "pod", klog.KObj(pod), "event", e)
  2448  				handler.HandlePodSyncs([]*v1.Pod{pod})
  2449  			} else {
  2450  				// If the pod no longer exists, ignore the event.
  2451  				klog.V(4).InfoS("SyncLoop (PLEG): pod does not exist, ignore irrelevant event", "event", e)
  2452  			}
  2453  		}
  2454  
  2455  		if e.Type == pleg.ContainerDied {
  2456  			if containerID, ok := e.Data.(string); ok {
  2457  				kl.cleanUpContainersInPod(e.ID, containerID)
  2458  			}
  2459  		}
  2460  	case <-syncCh:
  2461  		// Sync pods waiting for sync
  2462  		podsToSync := kl.getPodsToSync()
  2463  		if len(podsToSync) == 0 {
  2464  			break
  2465  		}
  2466  		klog.V(4).InfoS("SyncLoop (SYNC) pods", "total", len(podsToSync), "pods", klog.KObjSlice(podsToSync))
  2467  		handler.HandlePodSyncs(podsToSync)
  2468  	case update := <-kl.livenessManager.Updates():
  2469  		if update.Result == proberesults.Failure {
  2470  			handleProbeSync(kl, update, handler, "liveness", "unhealthy")
  2471  		}
  2472  	case update := <-kl.readinessManager.Updates():
  2473  		ready := update.Result == proberesults.Success
  2474  		kl.statusManager.SetContainerReadiness(update.PodUID, update.ContainerID, ready)
  2475  
  2476  		status := ""
  2477  		if ready {
  2478  			status = "ready"
  2479  		}
  2480  		handleProbeSync(kl, update, handler, "readiness", status)
  2481  	case update := <-kl.startupManager.Updates():
  2482  		started := update.Result == proberesults.Success
  2483  		kl.statusManager.SetContainerStartup(update.PodUID, update.ContainerID, started)
  2484  
  2485  		status := "unhealthy"
  2486  		if started {
  2487  			status = "started"
  2488  		}
  2489  		handleProbeSync(kl, update, handler, "startup", status)
  2490  	case <-housekeepingCh:
  2491  		if !kl.sourcesReady.AllReady() {
  2492  			// If the sources aren't ready or volume manager has not yet synced the states,
  2493  			// skip housekeeping, as we may accidentally delete pods from unready sources.
  2494  			klog.V(4).InfoS("SyncLoop (housekeeping, skipped): sources aren't ready yet")
  2495  		} else {
  2496  			start := time.Now()
  2497  			klog.V(4).InfoS("SyncLoop (housekeeping)")
  2498  			if err := handler.HandlePodCleanups(ctx); err != nil {
  2499  				klog.ErrorS(err, "Failed cleaning pods")
  2500  			}
  2501  			duration := time.Since(start)
  2502  			if duration > housekeepingWarningDuration {
  2503  				klog.ErrorS(fmt.Errorf("housekeeping took too long"), "Housekeeping took longer than expected", "expected", housekeepingWarningDuration, "actual", duration.Round(time.Millisecond))
  2504  			}
  2505  			klog.V(4).InfoS("SyncLoop (housekeeping) end", "duration", duration.Round(time.Millisecond))
  2506  		}
  2507  	}
  2508  	return true
  2509  }
  2510  
  2511  func handleProbeSync(kl *Kubelet, update proberesults.Update, handler SyncHandler, probe, status string) {
  2512  	// We should not use the pod from manager, because it is never updated after initialization.
  2513  	pod, ok := kl.podManager.GetPodByUID(update.PodUID)
  2514  	if !ok {
  2515  		// If the pod no longer exists, ignore the update.
  2516  		klog.V(4).InfoS("SyncLoop (probe): ignore irrelevant update", "probe", probe, "status", status, "update", update)
  2517  		return
  2518  	}
  2519  	klog.V(1).InfoS("SyncLoop (probe)", "probe", probe, "status", status, "pod", klog.KObj(pod))
  2520  	handler.HandlePodSyncs([]*v1.Pod{pod})
  2521  }
  2522  
  2523  // HandlePodAdditions is the callback in SyncHandler for pods being added from
  2524  // a config source.
  2525  func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
  2526  	start := kl.clock.Now()
  2527  	sort.Sort(sliceutils.PodsByCreationTime(pods))
  2528  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
  2529  		kl.podResizeMutex.Lock()
  2530  		defer kl.podResizeMutex.Unlock()
  2531  	}
  2532  	for _, pod := range pods {
  2533  		existingPods := kl.podManager.GetPods()
  2534  		// Always add the pod to the pod manager. Kubelet relies on the pod
  2535  		// manager as the source of truth for the desired state. If a pod does
  2536  		// not exist in the pod manager, it means that it has been deleted in
  2537  		// the apiserver and no action (other than cleanup) is required.
  2538  		kl.podManager.AddPod(pod)
  2539  
  2540  		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  2541  		if wasMirror {
  2542  			if pod == nil {
  2543  				klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID)
  2544  				continue
  2545  			}
  2546  			kl.podWorkers.UpdatePod(UpdatePodOptions{
  2547  				Pod:        pod,
  2548  				MirrorPod:  mirrorPod,
  2549  				UpdateType: kubetypes.SyncPodUpdate,
  2550  				StartTime:  start,
  2551  			})
  2552  			continue
  2553  		}
  2554  
  2555  		// Only go through the admission process if the pod is not requested
  2556  		// for termination by another part of the kubelet. If the pod is already
  2557  		// using resources (previously admitted), the pod worker is going to be
  2558  		// shutting it down. If the pod hasn't started yet, we know that when
  2559  		// the pod worker is invoked it will also avoid setting up the pod, so
  2560  		// we simply avoid doing any work.
  2561  		if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {
  2562  			// We failed pods that we rejected, so activePods include all admitted
  2563  			// pods that are alive.
  2564  			activePods := kl.filterOutInactivePods(existingPods)
  2565  
  2566  			if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
  2567  				// To handle kubelet restarts, test pod admissibility using AllocatedResources values
  2568  				// (for cpu & memory) from checkpoint store. If found, that is the source of truth.
  2569  				podCopy := pod.DeepCopy()
  2570  				kl.updateContainerResourceAllocation(podCopy)
  2571  
  2572  				// Check if we can admit the pod; if not, reject it.
  2573  				if ok, reason, message := kl.canAdmitPod(activePods, podCopy); !ok {
  2574  					kl.rejectPod(pod, reason, message)
  2575  					continue
  2576  				}
  2577  				// For new pod, checkpoint the resource values at which the Pod has been admitted
  2578  				if err := kl.statusManager.SetPodAllocation(podCopy); err != nil {
  2579  					//TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate
  2580  					klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(pod))
  2581  				}
  2582  			} else {
  2583  				// Check if we can admit the pod; if not, reject it.
  2584  				if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok {
  2585  					kl.rejectPod(pod, reason, message)
  2586  					continue
  2587  				}
  2588  			}
  2589  		}
  2590  		kl.podWorkers.UpdatePod(UpdatePodOptions{
  2591  			Pod:        pod,
  2592  			MirrorPod:  mirrorPod,
  2593  			UpdateType: kubetypes.SyncPodCreate,
  2594  			StartTime:  start,
  2595  		})
  2596  	}
  2597  }
  2598  
  2599  // updateContainerResourceAllocation updates AllocatedResources values
  2600  // (for cpu & memory) from checkpoint store
  2601  func (kl *Kubelet) updateContainerResourceAllocation(pod *v1.Pod) {
  2602  	for _, c := range pod.Spec.Containers {
  2603  		allocatedResources, found := kl.statusManager.GetContainerResourceAllocation(string(pod.UID), c.Name)
  2604  		if c.Resources.Requests != nil && found {
  2605  			if _, ok := allocatedResources[v1.ResourceCPU]; ok {
  2606  				c.Resources.Requests[v1.ResourceCPU] = allocatedResources[v1.ResourceCPU]
  2607  			}
  2608  			if _, ok := allocatedResources[v1.ResourceMemory]; ok {
  2609  				c.Resources.Requests[v1.ResourceMemory] = allocatedResources[v1.ResourceMemory]
  2610  			}
  2611  		}
  2612  	}
  2613  }
  2614  
  2615  // HandlePodUpdates is the callback in the SyncHandler interface for pods
  2616  // being updated from a config source.
  2617  func (kl *Kubelet) HandlePodUpdates(pods []*v1.Pod) {
  2618  	start := kl.clock.Now()
  2619  	for _, pod := range pods {
  2620  		kl.podManager.UpdatePod(pod)
  2621  
  2622  		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  2623  		if wasMirror {
  2624  			if pod == nil {
  2625  				klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID)
  2626  				continue
  2627  			}
  2628  		}
  2629  
  2630  		kl.podWorkers.UpdatePod(UpdatePodOptions{
  2631  			Pod:        pod,
  2632  			MirrorPod:  mirrorPod,
  2633  			UpdateType: kubetypes.SyncPodUpdate,
  2634  			StartTime:  start,
  2635  		})
  2636  	}
  2637  }
  2638  
  2639  // HandlePodRemoves is the callback in the SyncHandler interface for pods
  2640  // being removed from a config source.
  2641  func (kl *Kubelet) HandlePodRemoves(pods []*v1.Pod) {
  2642  	start := kl.clock.Now()
  2643  	for _, pod := range pods {
  2644  		kl.podManager.RemovePod(pod)
  2645  
  2646  		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  2647  		if wasMirror {
  2648  			if pod == nil {
  2649  				klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID)
  2650  				continue
  2651  			}
  2652  			kl.podWorkers.UpdatePod(UpdatePodOptions{
  2653  				Pod:        pod,
  2654  				MirrorPod:  mirrorPod,
  2655  				UpdateType: kubetypes.SyncPodUpdate,
  2656  				StartTime:  start,
  2657  			})
  2658  			continue
  2659  		}
  2660  
  2661  		// Deletion is allowed to fail because the periodic cleanup routine
  2662  		// will trigger deletion again.
  2663  		if err := kl.deletePod(pod); err != nil {
  2664  			klog.V(2).InfoS("Failed to delete pod", "pod", klog.KObj(pod), "err", err)
  2665  		}
  2666  	}
  2667  }
  2668  
  2669  // HandlePodReconcile is the callback in the SyncHandler interface for pods
  2670  // that should be reconciled. Pods are reconciled when only the status of the
  2671  // pod is updated in the API.
  2672  func (kl *Kubelet) HandlePodReconcile(pods []*v1.Pod) {
  2673  	start := kl.clock.Now()
  2674  	for _, pod := range pods {
  2675  		// Update the pod in pod manager, status manager will do periodically reconcile according
  2676  		// to the pod manager.
  2677  		kl.podManager.UpdatePod(pod)
  2678  
  2679  		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  2680  		if wasMirror {
  2681  			if pod == nil {
  2682  				klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID)
  2683  				continue
  2684  			}
  2685  			// Static pods should be reconciled the same way as regular pods
  2686  		}
  2687  
  2688  		// TODO: reconcile being calculated in the config manager is questionable, and avoiding
  2689  		// extra syncs may no longer be necessary. Reevaluate whether Reconcile and Sync can be
  2690  		// merged (after resolving the next two TODOs).
  2691  
  2692  		// Reconcile Pod "Ready" condition if necessary. Trigger sync pod for reconciliation.
  2693  		// TODO: this should be unnecessary today - determine what is the cause for this to
  2694  		// be different than Sync, or if there is a better place for it. For instance, we have
  2695  		// needsReconcile in kubelet/config, here, and in status_manager.
  2696  		if status.NeedToReconcilePodReadiness(pod) {
  2697  			kl.podWorkers.UpdatePod(UpdatePodOptions{
  2698  				Pod:        pod,
  2699  				MirrorPod:  mirrorPod,
  2700  				UpdateType: kubetypes.SyncPodSync,
  2701  				StartTime:  start,
  2702  			})
  2703  		}
  2704  
  2705  		// After an evicted pod is synced, all dead containers in the pod can be removed.
  2706  		// TODO: this is questionable - status read is async and during eviction we already
  2707  		// expect to not have some container info. The pod worker knows whether a pod has
  2708  		// been evicted, so if this is about minimizing the time to react to an eviction we
  2709  		// can do better. If it's about preserving pod status info we can also do better.
  2710  		if eviction.PodIsEvicted(pod.Status) {
  2711  			if podStatus, err := kl.podCache.Get(pod.UID); err == nil {
  2712  				kl.containerDeletor.deleteContainersInPod("", podStatus, true)
  2713  			}
  2714  		}
  2715  	}
  2716  }
  2717  
  2718  // HandlePodSyncs is the callback in the syncHandler interface for pods
  2719  // that should be dispatched to pod workers for sync.
  2720  func (kl *Kubelet) HandlePodSyncs(pods []*v1.Pod) {
  2721  	start := kl.clock.Now()
  2722  	for _, pod := range pods {
  2723  		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  2724  		if wasMirror {
  2725  			if pod == nil {
  2726  				klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID)
  2727  				continue
  2728  			}
  2729  			// Syncing a mirror pod is a programmer error since the intent of sync is to
  2730  			// batch notify all pending work. We should make it impossible to double sync,
  2731  			// but for now log a programmer error to prevent accidental introduction.
  2732  			klog.V(3).InfoS("Programmer error, HandlePodSyncs does not expect to receive mirror pods", "podUID", pod.UID, "mirrorPodUID", mirrorPod.UID)
  2733  			continue
  2734  		}
  2735  		kl.podWorkers.UpdatePod(UpdatePodOptions{
  2736  			Pod:        pod,
  2737  			MirrorPod:  mirrorPod,
  2738  			UpdateType: kubetypes.SyncPodSync,
  2739  			StartTime:  start,
  2740  		})
  2741  	}
  2742  }
  2743  
  2744  func isPodResizeInProgress(pod *v1.Pod, podStatus *v1.PodStatus) bool {
  2745  	for _, c := range pod.Spec.Containers {
  2746  		if cs, ok := podutil.GetContainerStatus(podStatus.ContainerStatuses, c.Name); ok {
  2747  			if cs.Resources == nil {
  2748  				continue
  2749  			}
  2750  			if !cmp.Equal(c.Resources.Limits, cs.Resources.Limits) || !cmp.Equal(cs.AllocatedResources, cs.Resources.Requests) {
  2751  				return true
  2752  			}
  2753  		}
  2754  	}
  2755  	return false
  2756  }
  2757  
  2758  func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus) {
  2759  	var otherActivePods []*v1.Pod
  2760  
  2761  	node, err := kl.getNodeAnyWay()
  2762  	if err != nil {
  2763  		klog.ErrorS(err, "getNodeAnyway function failed")
  2764  		return false, nil, ""
  2765  	}
  2766  	podCopy := pod.DeepCopy()
  2767  	cpuAvailable := node.Status.Allocatable.Cpu().MilliValue()
  2768  	memAvailable := node.Status.Allocatable.Memory().Value()
  2769  	cpuRequests := resource.GetResourceRequest(podCopy, v1.ResourceCPU)
  2770  	memRequests := resource.GetResourceRequest(podCopy, v1.ResourceMemory)
  2771  	if cpuRequests > cpuAvailable || memRequests > memAvailable {
  2772  		klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", podCopy.Name)
  2773  		return false, podCopy, v1.PodResizeStatusInfeasible
  2774  	}
  2775  
  2776  	// Treat the existing pod needing resize as a new pod with desired resources seeking admit.
  2777  	// If desired resources don't fit, pod continues to run with currently allocated resources.
  2778  	activePods := kl.GetActivePods()
  2779  	for _, p := range activePods {
  2780  		if p.UID != pod.UID {
  2781  			otherActivePods = append(otherActivePods, p)
  2782  		}
  2783  	}
  2784  
  2785  	if ok, failReason, failMessage := kl.canAdmitPod(otherActivePods, podCopy); !ok {
  2786  		// Log reason and return. Let the next sync iteration retry the resize
  2787  		klog.V(3).InfoS("Resize cannot be accommodated", "pod", podCopy.Name, "reason", failReason, "message", failMessage)
  2788  		return false, podCopy, v1.PodResizeStatusDeferred
  2789  	}
  2790  
  2791  	for _, container := range podCopy.Spec.Containers {
  2792  		idx, found := podutil.GetIndexOfContainerStatus(podCopy.Status.ContainerStatuses, container.Name)
  2793  		if found {
  2794  			for rName, rQuantity := range container.Resources.Requests {
  2795  				podCopy.Status.ContainerStatuses[idx].AllocatedResources[rName] = rQuantity
  2796  			}
  2797  		}
  2798  	}
  2799  	return true, podCopy, v1.PodResizeStatusInProgress
  2800  }
  2801  
  2802  func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) *v1.Pod {
  2803  	if pod.Status.Phase != v1.PodRunning {
  2804  		return pod
  2805  	}
  2806  	podResized := false
  2807  	for _, container := range pod.Spec.Containers {
  2808  		if len(container.Resources.Requests) == 0 {
  2809  			continue
  2810  		}
  2811  		containerStatus, found := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name)
  2812  		if !found {
  2813  			klog.V(5).InfoS("ContainerStatus not found", "pod", pod.Name, "container", container.Name)
  2814  			break
  2815  		}
  2816  		if len(containerStatus.AllocatedResources) != len(container.Resources.Requests) {
  2817  			klog.V(5).InfoS("ContainerStatus.AllocatedResources length mismatch", "pod", pod.Name, "container", container.Name)
  2818  			break
  2819  		}
  2820  		if !cmp.Equal(container.Resources.Requests, containerStatus.AllocatedResources) {
  2821  			podResized = true
  2822  			break
  2823  		}
  2824  	}
  2825  	if !podResized {
  2826  		return pod
  2827  	}
  2828  
  2829  	kl.podResizeMutex.Lock()
  2830  	defer kl.podResizeMutex.Unlock()
  2831  	fit, updatedPod, resizeStatus := kl.canResizePod(pod)
  2832  	if updatedPod == nil {
  2833  		return pod
  2834  	}
  2835  	if fit {
  2836  		// Update pod resource allocation checkpoint
  2837  		if err := kl.statusManager.SetPodAllocation(updatedPod); err != nil {
  2838  			//TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate
  2839  			klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(updatedPod))
  2840  			return pod
  2841  		}
  2842  	}
  2843  	if resizeStatus != "" {
  2844  		// Save resize decision to checkpoint
  2845  		if err := kl.statusManager.SetPodResizeStatus(updatedPod.UID, resizeStatus); err != nil {
  2846  			//TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate
  2847  			klog.ErrorS(err, "SetPodResizeStatus failed", "pod", klog.KObj(updatedPod))
  2848  			return pod
  2849  		}
  2850  		updatedPod.Status.Resize = resizeStatus
  2851  	}
  2852  	kl.podManager.UpdatePod(updatedPod)
  2853  	kl.statusManager.SetPodStatus(updatedPod, updatedPod.Status)
  2854  	return updatedPod
  2855  }
  2856  
  2857  // LatestLoopEntryTime returns the last time in the sync loop monitor.
  2858  func (kl *Kubelet) LatestLoopEntryTime() time.Time {
  2859  	val := kl.syncLoopMonitor.Load()
  2860  	if val == nil {
  2861  		return time.Time{}
  2862  	}
  2863  	return val.(time.Time)
  2864  }
  2865  
  2866  // updateRuntimeUp calls the container runtime status callback, initializing
  2867  // the runtime dependent modules when the container runtime first comes up,
  2868  // and returns an error if the status check fails.  If the status check is OK,
  2869  // update the container runtime uptime in the kubelet runtimeState.
  2870  func (kl *Kubelet) updateRuntimeUp() {
  2871  	kl.updateRuntimeMux.Lock()
  2872  	defer kl.updateRuntimeMux.Unlock()
  2873  	ctx := context.Background()
  2874  
  2875  	s, err := kl.containerRuntime.Status(ctx)
  2876  	if err != nil {
  2877  		klog.ErrorS(err, "Container runtime sanity check failed")
  2878  		return
  2879  	}
  2880  	if s == nil {
  2881  		klog.ErrorS(nil, "Container runtime status is nil")
  2882  		return
  2883  	}
  2884  	// Periodically log the whole runtime status for debugging.
  2885  	klog.V(4).InfoS("Container runtime status", "status", s)
  2886  	klogErrorS := klog.ErrorS
  2887  	if !kl.containerRuntimeReadyExpected {
  2888  		klogErrorS = klog.V(4).ErrorS
  2889  	}
  2890  	networkReady := s.GetRuntimeCondition(kubecontainer.NetworkReady)
  2891  	if networkReady == nil || !networkReady.Status {
  2892  		klogErrorS(nil, "Container runtime network not ready", "networkReady", networkReady)
  2893  		kl.runtimeState.setNetworkState(fmt.Errorf("container runtime network not ready: %v", networkReady))
  2894  	} else {
  2895  		// Set nil if the container runtime network is ready.
  2896  		kl.runtimeState.setNetworkState(nil)
  2897  	}
  2898  	// information in RuntimeReady condition will be propagated to NodeReady condition.
  2899  	runtimeReady := s.GetRuntimeCondition(kubecontainer.RuntimeReady)
  2900  	// If RuntimeReady is not set or is false, report an error.
  2901  	if runtimeReady == nil || !runtimeReady.Status {
  2902  		klogErrorS(nil, "Container runtime not ready", "runtimeReady", runtimeReady)
  2903  		kl.runtimeState.setRuntimeState(fmt.Errorf("container runtime not ready: %v", runtimeReady))
  2904  		return
  2905  	}
  2906  	kl.runtimeState.setRuntimeState(nil)
  2907  	kl.oneTimeInitializer.Do(kl.initializeRuntimeDependentModules)
  2908  	kl.runtimeState.setRuntimeSync(kl.clock.Now())
  2909  }
  2910  
// GetConfiguration returns the KubeletConfiguration used to configure the kubelet.
// The configuration is returned by value, not as a pointer to the kubelet's field.
func (kl *Kubelet) GetConfiguration() kubeletconfiginternal.KubeletConfiguration {
	return kl.kubeletConfiguration
}
  2915  
// BirthCry sends an event that the kubelet has started up.
// The event is recorded against the kubelet's node reference with type Normal
// and reason events.StartingKubelet.
func (kl *Kubelet) BirthCry() {
	// Make an event that kubelet restarted.
	kl.recorder.Eventf(kl.nodeRef, v1.EventTypeNormal, events.StartingKubelet, "Starting kubelet.")
}
  2921  
// ResyncInterval returns the interval used for periodic syncs, as stored in
// the kubelet's resyncInterval field.
func (kl *Kubelet) ResyncInterval() time.Duration {
	return kl.resyncInterval
}
  2926  
// ListenAndServe runs the kubelet HTTP server.
// It delegates to server.ListenAndServeKubeletServer, passing the kubelet
// itself and its resource analyzer together with the supplied configuration,
// TLS options, auth interface and tracer provider.
func (kl *Kubelet) ListenAndServe(kubeCfg *kubeletconfiginternal.KubeletConfiguration, tlsOptions *server.TLSOptions,
	auth server.AuthInterface, tp trace.TracerProvider) {
	server.ListenAndServeKubeletServer(kl, kl.resourceAnalyzer, kubeCfg, tlsOptions, auth, tp)
}
  2932  
// ListenAndServeReadOnly runs the kubelet HTTP server in read-only mode.
// It delegates to server.ListenAndServeKubeletReadOnlyServer with the kubelet,
// its resource analyzer, and the given address and port.
func (kl *Kubelet) ListenAndServeReadOnly(address net.IP, port uint) {
	server.ListenAndServeKubeletReadOnlyServer(kl, kl.resourceAnalyzer, address, port)
}
  2937  
  2938  // ListenAndServePodResources runs the kubelet podresources grpc service
  2939  func (kl *Kubelet) ListenAndServePodResources() {
  2940  	endpoint, err := util.LocalEndpoint(kl.getPodResourcesDir(), podresources.Socket)
  2941  	if err != nil {
  2942  		klog.V(2).InfoS("Failed to get local endpoint for PodResources endpoint", "err", err)
  2943  		return
  2944  	}
  2945  
  2946  	providers := podresources.PodResourcesProviders{
  2947  		Pods:             kl.podManager,
  2948  		Devices:          kl.containerManager,
  2949  		Cpus:             kl.containerManager,
  2950  		Memory:           kl.containerManager,
  2951  		DynamicResources: kl.containerManager,
  2952  	}
  2953  
  2954  	server.ListenAndServePodResources(endpoint, providers)
  2955  }
  2956  
  2957  // Delete the eligible dead container instances in a pod. Depending on the configuration, the latest dead containers may be kept around.
  2958  func (kl *Kubelet) cleanUpContainersInPod(podID types.UID, exitedContainerID string) {
  2959  	if podStatus, err := kl.podCache.Get(podID); err == nil {
  2960  		// When an evicted or deleted pod has already synced, all containers can be removed.
  2961  		removeAll := kl.podWorkers.ShouldPodContentBeRemoved(podID)
  2962  		kl.containerDeletor.deleteContainersInPod(exitedContainerID, podStatus, removeAll)
  2963  	}
  2964  }
  2965  
  2966  // fastStatusUpdateOnce starts a loop that checks if the current state of kubelet + container runtime
  2967  // would be able to turn the node ready, and sync the ready state to the apiserver as soon as possible.
  2968  // Function returns after the node status update after such event, or when the node is already ready.
  2969  // Function is executed only during Kubelet start which improves latency to ready node by updating
  2970  // kubelet state, runtime status and node statuses ASAP.
  2971  func (kl *Kubelet) fastStatusUpdateOnce() {
  2972  	ctx := context.Background()
  2973  	start := kl.clock.Now()
  2974  	stopCh := make(chan struct{})
  2975  
  2976  	// Keep trying to make fast node status update until either timeout is reached or an update is successful.
  2977  	wait.Until(func() {
  2978  		// fastNodeStatusUpdate returns true when it succeeds or when the grace period has expired
  2979  		// (status was not updated within nodeReadyGracePeriod and the second argument below gets true),
  2980  		// then we close the channel and abort the loop.
  2981  		if kl.fastNodeStatusUpdate(ctx, kl.clock.Since(start) >= nodeReadyGracePeriod) {
  2982  			close(stopCh)
  2983  		}
  2984  	}, 100*time.Millisecond, stopCh)
  2985  }
  2986  
  2987  // CheckpointContainer tries to checkpoint a container. The parameters are used to
  2988  // look up the specified container. If the container specified by the given parameters
  2989  // cannot be found an error is returned. If the container is found the container
  2990  // engine will be asked to checkpoint the given container into the kubelet's default
  2991  // checkpoint directory.
  2992  func (kl *Kubelet) CheckpointContainer(
  2993  	ctx context.Context,
  2994  	podUID types.UID,
  2995  	podFullName,
  2996  	containerName string,
  2997  	options *runtimeapi.CheckpointContainerRequest,
  2998  ) error {
  2999  	container, err := kl.findContainer(ctx, podFullName, podUID, containerName)
  3000  	if err != nil {
  3001  		return err
  3002  	}
  3003  	if container == nil {
  3004  		return fmt.Errorf("container %v not found", containerName)
  3005  	}
  3006  
  3007  	options.Location = filepath.Join(
  3008  		kl.getCheckpointsDir(),
  3009  		fmt.Sprintf(
  3010  			"checkpoint-%s-%s-%s.tar",
  3011  			podFullName,
  3012  			containerName,
  3013  			time.Now().Format(time.RFC3339),
  3014  		),
  3015  	)
  3016  
  3017  	options.ContainerId = string(container.ID.ID)
  3018  
  3019  	if err := kl.containerRuntime.CheckpointContainer(ctx, options); err != nil {
  3020  		return err
  3021  	}
  3022  
  3023  	return nil
  3024  }
  3025  
// ListMetricDescriptors gets the descriptors for the metrics that will be returned in ListPodSandboxMetrics.
// The call is forwarded unchanged to the container runtime.
func (kl *Kubelet) ListMetricDescriptors(ctx context.Context) ([]*runtimeapi.MetricDescriptor, error) {
	return kl.containerRuntime.ListMetricDescriptors(ctx)
}
  3030  
// ListPodSandboxMetrics retrieves the metrics for all pod sandboxes.
// The call is forwarded unchanged to the container runtime.
func (kl *Kubelet) ListPodSandboxMetrics(ctx context.Context) ([]*runtimeapi.PodSandboxMetrics, error) {
	return kl.containerRuntime.ListPodSandboxMetrics(ctx)
}
  3035  
// supportLocalStorageCapacityIsolation returns the value of the
// LocalStorageCapacityIsolation field of the kubelet configuration.
func (kl *Kubelet) supportLocalStorageCapacityIsolation() bool {
	return kl.GetConfiguration().LocalStorageCapacityIsolation
}
  3039  
// isSyncPodWorthy filters out events that are not worthy of pod syncing.
// It returns false only for events of type pleg.ContainerRemoved and true for
// every other event type.
func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool {
	// ContainerRemoved doesn't affect pod state
	return event.Type != pleg.ContainerRemoved
}
  3045  
// PrepareDynamicResources calls the container Manager PrepareDynamicResources API
// for the given pod and returns its error unchanged.
// This method implements the RuntimeHelper interface
func (kl *Kubelet) PrepareDynamicResources(pod *v1.Pod) error {
	return kl.containerManager.PrepareDynamicResources(pod)
}
  3051  
// UnprepareDynamicResources calls the container Manager UnprepareDynamicResources API
// for the given pod and returns its error unchanged.
// This method implements the RuntimeHelper interface
func (kl *Kubelet) UnprepareDynamicResources(pod *v1.Pod) error {
	return kl.containerManager.UnprepareDynamicResources(pod)
}