k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/kubelet.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kubelet

import (
	"context"
	"crypto/tls"
	"errors"
	"fmt"
	"math"
	"net"
	"net/http"
	"os"
	"path/filepath"
	sysruntime "runtime"
	"sort"
	"sync"
	"sync/atomic"
	"time"

	cadvisorapi "github.com/google/cadvisor/info/v1"
	"github.com/google/go-cmp/cmp"
	"github.com/opencontainers/selinux/go-selinux"
	"go.opentelemetry.io/otel/attribute"
	semconv "go.opentelemetry.io/otel/semconv/v1.12.0"
	"go.opentelemetry.io/otel/trace"
	"k8s.io/client-go/informers"

	"k8s.io/mount-utils"
	netutils "k8s.io/utils/net"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	clientset "k8s.io/client-go/kubernetes"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/certificate"
	"k8s.io/client-go/util/flowcontrol"
	cloudprovider "k8s.io/cloud-provider"
	"k8s.io/component-helpers/apimachinery/lease"
	internalapi "k8s.io/cri-api/pkg/apis"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	remote "k8s.io/cri-client/pkg"
	"k8s.io/klog/v2"
	pluginwatcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/api/v1/resource"
	"k8s.io/kubernetes/pkg/features"
	kubeletconfiginternal "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
	"k8s.io/kubernetes/pkg/kubelet/cadvisor"
	kubeletcertificate "k8s.io/kubernetes/pkg/kubelet/certificate"
	"k8s.io/kubernetes/pkg/kubelet/cloudresource"
	"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	draplugin "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
	"k8s.io/kubernetes/pkg/kubelet/config"
	"k8s.io/kubernetes/pkg/kubelet/configmap"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
	"k8s.io/kubernetes/pkg/kubelet/images"
	"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/logs"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/kubelet/metrics/collectors"
	"k8s.io/kubernetes/pkg/kubelet/network/dns"
	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
	oomwatcher "k8s.io/kubernetes/pkg/kubelet/oom"
	"k8s.io/kubernetes/pkg/kubelet/pleg"
	"k8s.io/kubernetes/pkg/kubelet/pluginmanager"
	plugincache "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
	"k8s.io/kubernetes/pkg/kubelet/preemption"
	"k8s.io/kubernetes/pkg/kubelet/prober"
	proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
	"k8s.io/kubernetes/pkg/kubelet/runtimeclass"
	"k8s.io/kubernetes/pkg/kubelet/secret"
	"k8s.io/kubernetes/pkg/kubelet/server"
	servermetrics "k8s.io/kubernetes/pkg/kubelet/server/metrics"
	serverstats "k8s.io/kubernetes/pkg/kubelet/server/stats"
	"k8s.io/kubernetes/pkg/kubelet/stats"
	"k8s.io/kubernetes/pkg/kubelet/status"
	"k8s.io/kubernetes/pkg/kubelet/sysctl"
	"k8s.io/kubernetes/pkg/kubelet/token"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/kubernetes/pkg/kubelet/userns"
	"k8s.io/kubernetes/pkg/kubelet/userns/inuserns"
	"k8s.io/kubernetes/pkg/kubelet/util"
	"k8s.io/kubernetes/pkg/kubelet/util/manager"
	"k8s.io/kubernetes/pkg/kubelet/util/queue"
	"k8s.io/kubernetes/pkg/kubelet/util/sliceutils"
	"k8s.io/kubernetes/pkg/kubelet/volumemanager"
	httpprobe "k8s.io/kubernetes/pkg/probe/http"
	"k8s.io/kubernetes/pkg/security/apparmor"
	"k8s.io/kubernetes/pkg/util/oom"
	"k8s.io/kubernetes/pkg/volume"
	"k8s.io/kubernetes/pkg/volume/csi"
	"k8s.io/kubernetes/pkg/volume/util/hostutil"
	"k8s.io/kubernetes/pkg/volume/util/subpath"
	"k8s.io/kubernetes/pkg/volume/util/volumepathhandler"
	"k8s.io/utils/clock"
)

const (
	// Max amount of time to wait for the container runtime to come up.
	maxWaitForContainerRuntime = 30 * time.Second

	// nodeStatusUpdateRetry specifies how many times the kubelet retries when posting the node status fails.
	nodeStatusUpdateRetry = 5

	// nodeReadyGracePeriod is the period to allow for before fast status update is
	// terminated and container runtime not being ready is logged without verbosity guard.
	nodeReadyGracePeriod = 120 * time.Second

	// DefaultContainerLogsDir is the location of container logs.
	DefaultContainerLogsDir = "/var/log/containers"

	// MaxContainerBackOff is the max backoff period, exported for the e2e test
	MaxContainerBackOff = 300 * time.Second

	// Period for performing global cleanup tasks.
	housekeepingPeriod = time.Second * 2

	// Duration after which a housekeeping pass is considered to have violated the
	// invariant that housekeeping should be fast, to avoid blocking pod config
	// (no new pods are started or deleted while housekeeping is running).
	housekeepingWarningDuration = time.Second * 1

	// Period after which the runtime cache expires - set to slightly longer than
	// the expected length between housekeeping periods, which explicitly refreshes
	// the cache.
	runtimeCacheRefreshPeriod = housekeepingPeriod + housekeepingWarningDuration

	// Period for performing eviction monitoring.
	// ensure this is kept in sync with internal cadvisor housekeeping.
	evictionMonitoringPeriod = time.Second * 10

	// The path in containers' filesystems where the hosts file is mounted.
	linuxEtcHostsPath   = "/etc/hosts"
	windowsEtcHostsPath = "C:\\Windows\\System32\\drivers\\etc\\hosts"

	// Capacity of the channel for receiving pod lifecycle events. This number
	// is a bit arbitrary and may be adjusted in the future.
	plegChannelCapacity = 1000

	// Generic PLEG relies on relisting for discovering container events.
	// A longer period means that kubelet will take longer to detect container
	// changes and to update pod status. On the other hand, a shorter period
	// will cause more frequent relisting (e.g., container runtime operations),
	// leading to higher cpu usage.
	// Note that even though we set the period to 1s, the relisting itself can
	// take more than 1s to finish if the container runtime responds slowly
	// and/or when there are many container changes in one cycle.
	genericPlegRelistPeriod    = time.Second * 1
	genericPlegRelistThreshold = time.Minute * 3

	// Generic PLEG relist period and threshold when used with Evented PLEG.
	eventedPlegRelistPeriod     = time.Second * 300
	eventedPlegRelistThreshold  = time.Minute * 10
	eventedPlegMaxStreamRetries = 5

	// backOffPeriod is the period to back off when pod syncing results in an
	// error. It is also used as the base period for the exponential backoff of
	// container restarts and image pulls.
	backOffPeriod = time.Second * 10

	// ContainerGCPeriod is the period for performing container garbage collection.
	ContainerGCPeriod = time.Minute
	// ImageGCPeriod is the period for performing image garbage collection.
	ImageGCPeriod = 5 * time.Minute

	// Minimum number of dead containers to keep in a pod
	minDeadContainerInPod = 1

	// nodeLeaseRenewIntervalFraction is the fraction of lease duration to renew the lease
	nodeLeaseRenewIntervalFraction = 0.25

	// instrumentationScope is the name of OpenTelemetry instrumentation scope
	instrumentationScope = "k8s.io/kubernetes/pkg/kubelet"
)

var (
	// ContainerLogsDir can be overwritten for testing usage
	ContainerLogsDir = DefaultContainerLogsDir
	etcHostsPath     = getContainerEtcHostsPath()
)

func getContainerEtcHostsPath() string {
	if sysruntime.GOOS == "windows" {
		return windowsEtcHostsPath
	}
	return linuxEtcHostsPath
}

// SyncHandler is an interface implemented by Kubelet, for testability
type SyncHandler interface {
	HandlePodAdditions(pods []*v1.Pod)
	HandlePodUpdates(pods []*v1.Pod)
	HandlePodRemoves(pods []*v1.Pod)
	HandlePodReconcile(pods []*v1.Pod)
	HandlePodSyncs(pods []*v1.Pod)
	HandlePodCleanups(ctx context.Context) error
}

// Option is a functional option type for Kubelet
type Option func(*Kubelet)

// Bootstrap is a bootstrapping interface for kubelet, targets the initialization protocol
type Bootstrap interface {
	GetConfiguration() kubeletconfiginternal.KubeletConfiguration
	BirthCry()
	StartGarbageCollection()
	ListenAndServe(kubeCfg *kubeletconfiginternal.KubeletConfiguration, tlsOptions *server.TLSOptions, auth server.AuthInterface, tp trace.TracerProvider)
	ListenAndServeReadOnly(address net.IP, port uint, tp trace.TracerProvider)
	ListenAndServePodResources()
	Run(<-chan kubetypes.PodUpdate)
	RunOnce(<-chan kubetypes.PodUpdate) ([]RunPodResult, error)
}

// Dependencies is a bin for things we might consider "injected dependencies" -- objects constructed
// at runtime that are necessary for running the Kubelet. This is a temporary solution for grouping
// these objects while we figure out a more comprehensive dependency injection story for the Kubelet.
type Dependencies struct {
	Options []Option

	// Injected Dependencies
	Auth                      server.AuthInterface
	CAdvisorInterface         cadvisor.Interface
	Cloud                     cloudprovider.Interface
	ContainerManager          cm.ContainerManager
	EventClient               v1core.EventsGetter
	HeartbeatClient           clientset.Interface
	OnHeartbeatFailure        func()
	KubeClient                clientset.Interface
	Mounter                   mount.Interface
	HostUtil                  hostutil.HostUtils
	OOMAdjuster               *oom.OOMAdjuster
	OSInterface               kubecontainer.OSInterface
	PodConfig                 *config.PodConfig
	ProbeManager              prober.Manager
	Recorder                  record.EventRecorder
	Subpather                 subpath.Interface
	TracerProvider            trace.TracerProvider
	VolumePlugins             []volume.VolumePlugin
	DynamicPluginProber       volume.DynamicPluginProber
	TLSOptions                *server.TLSOptions
	RemoteRuntimeService      internalapi.RuntimeService
	RemoteImageService        internalapi.ImageManagerService
	PodStartupLatencyTracker  util.PodStartupLatencyTracker
	NodeStartupLatencyTracker util.NodeStartupLatencyTracker
	// Remove this field after cadvisor.UsingLegacyCadvisorStats is dropped.
	useLegacyCadvisorStats bool
}

// makePodSourceConfig creates a config.PodConfig from the given
// KubeletConfiguration or returns an error.
func makePodSourceConfig(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies, nodeName types.NodeName, nodeHasSynced func() bool) (*config.PodConfig, error) {
	manifestURLHeader := make(http.Header)
	if len(kubeCfg.StaticPodURLHeader) > 0 {
		for k, v := range kubeCfg.StaticPodURLHeader {
			for i := range v {
				manifestURLHeader.Add(k, v[i])
			}
		}
	}

	// source of all configuration
	cfg := config.NewPodConfig(config.PodConfigNotificationIncremental, kubeDeps.Recorder, kubeDeps.PodStartupLatencyTracker)

	// TODO: it needs to be replaced by a proper context in the future
	ctx := context.TODO()

	// define file config source
	if kubeCfg.StaticPodPath != "" {
		klog.InfoS("Adding static pod path", "path", kubeCfg.StaticPodPath)
		config.NewSourceFile(kubeCfg.StaticPodPath, nodeName, kubeCfg.FileCheckFrequency.Duration, cfg.Channel(ctx, kubetypes.FileSource))
	}

	// define url config source
	if kubeCfg.StaticPodURL != "" {
		klog.InfoS("Adding pod URL with HTTP header", "URL", kubeCfg.StaticPodURL, "header", manifestURLHeader)
		config.NewSourceURL(kubeCfg.StaticPodURL, manifestURLHeader, nodeName, kubeCfg.HTTPCheckFrequency.Duration, cfg.Channel(ctx, kubetypes.HTTPSource))
	}

	if kubeDeps.KubeClient != nil {
		klog.InfoS("Adding apiserver pod source")
		config.NewSourceApiserver(kubeDeps.KubeClient, nodeName, nodeHasSynced, cfg.Channel(ctx, kubetypes.ApiserverSource))
	}
	return cfg, nil
}

// PreInitRuntimeService will init runtime service before RunKubelet.
func PreInitRuntimeService(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies) error {
	remoteImageEndpoint := kubeCfg.ImageServiceEndpoint
	if remoteImageEndpoint == "" && kubeCfg.ContainerRuntimeEndpoint != "" {
		remoteImageEndpoint = kubeCfg.ContainerRuntimeEndpoint
	}
	var err error

	var tp trace.TracerProvider
	if utilfeature.DefaultFeatureGate.Enabled(features.KubeletTracing) {
		tp = kubeDeps.TracerProvider
	}

	logger := klog.Background()
	if kubeDeps.RemoteRuntimeService, err = remote.NewRemoteRuntimeService(kubeCfg.ContainerRuntimeEndpoint, kubeCfg.RuntimeRequestTimeout.Duration, tp, &logger); err != nil {
		return err
	}
	if kubeDeps.RemoteImageService, err = remote.NewRemoteImageService(remoteImageEndpoint, kubeCfg.RuntimeRequestTimeout.Duration, tp, &logger); err != nil {
		return err
	}

	kubeDeps.useLegacyCadvisorStats = cadvisor.UsingLegacyCadvisorStats(kubeCfg.ContainerRuntimeEndpoint)

	return nil
}

// NewMainKubelet instantiates a new Kubelet object along with all the required internal modules.
// No initialization of Kubelet and its modules should happen here.
func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
	kubeDeps *Dependencies,
	crOptions *config.ContainerRuntimeOptions,
	hostname string,
	hostnameOverridden bool,
	nodeName types.NodeName,
	nodeIPs []net.IP,
	providerID string,
	cloudProvider string,
	certDirectory string,
	rootDirectory string,
	podLogsDirectory string,
	imageCredentialProviderConfigFile string,
	imageCredentialProviderBinDir string,
	registerNode bool,
	registerWithTaints []v1.Taint,
	allowedUnsafeSysctls []string,
	experimentalMounterPath string,
	kernelMemcgNotification bool,
	experimentalNodeAllocatableIgnoreEvictionThreshold bool,
	minimumGCAge metav1.Duration,
	maxPerPodContainerCount int32,
	maxContainerCount int32,
	registerSchedulable bool,
	nodeLabels map[string]string,
	nodeStatusMaxImages int32,
	seccompDefault bool,
) (*Kubelet, error) {
	ctx := context.Background()
	logger := klog.TODO()

	if rootDirectory == "" {
		return nil, fmt.Errorf("invalid root directory %q", rootDirectory)
	}
	if podLogsDirectory == "" {
		return nil, errors.New("pod logs root directory is empty")
	}
	if kubeCfg.SyncFrequency.Duration <= 0 {
		return nil, fmt.Errorf("invalid sync frequency %d", kubeCfg.SyncFrequency.Duration)
	}

	if utilfeature.DefaultFeatureGate.Enabled(features.DisableCloudProviders) && cloudprovider.IsDeprecatedInternal(cloudProvider) {
		cloudprovider.DisableWarningForProvider(cloudProvider)
		return nil, fmt.Errorf("cloud provider %q was specified, but built-in cloud providers are disabled. Please set --cloud-provider=external and migrate to an external cloud provider", cloudProvider)
	}

	var nodeHasSynced cache.InformerSynced
	var nodeLister corelisters.NodeLister

	// If kubeClient == nil, we are running in standalone mode (i.e. no API servers)
	// If not nil, we are running as part of a cluster and should sync w/API
	if kubeDeps.KubeClient != nil {
		kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0, informers.WithTweakListOptions(func(options *metav1.ListOptions) {
			options.FieldSelector = fields.Set{metav1.ObjectNameField: string(nodeName)}.String()
		}))
		nodeLister = kubeInformers.Core().V1().Nodes().Lister()
		nodeHasSynced = func() bool {
			return kubeInformers.Core().V1().Nodes().Informer().HasSynced()
		}
		kubeInformers.Start(wait.NeverStop)
		klog.InfoS("Attempting to sync node with API server")
	} else {
		// we don't have a client to sync!
		nodeIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{})
		nodeLister = corelisters.NewNodeLister(nodeIndexer)
		nodeHasSynced = func() bool { return true }
		klog.InfoS("Kubelet is running in standalone mode, will skip API server sync")
	}

	if kubeDeps.PodConfig == nil {
		var err error
		kubeDeps.PodConfig, err = makePodSourceConfig(kubeCfg, kubeDeps, nodeName, nodeHasSynced)
		if err != nil {
			return nil, err
		}
	}

	containerGCPolicy := kubecontainer.GCPolicy{
		MinAge:             minimumGCAge.Duration,
		MaxPerPodContainer: int(maxPerPodContainerCount),
		MaxContainers:      int(maxContainerCount),
	}

	daemonEndpoints := &v1.NodeDaemonEndpoints{
		KubeletEndpoint: v1.DaemonEndpoint{Port: kubeCfg.Port},
	}

	imageGCPolicy := images.ImageGCPolicy{
		MinAge:               kubeCfg.ImageMinimumGCAge.Duration,
		HighThresholdPercent: int(kubeCfg.ImageGCHighThresholdPercent),
		LowThresholdPercent:  int(kubeCfg.ImageGCLowThresholdPercent),
	}

	if utilfeature.DefaultFeatureGate.Enabled(features.ImageMaximumGCAge) {
		imageGCPolicy.MaxAge = kubeCfg.ImageMaximumGCAge.Duration
	} else if kubeCfg.ImageMaximumGCAge.Duration != 0 {
		klog.InfoS("ImageMaximumGCAge flag enabled, but corresponding feature gate is not enabled. Ignoring flag.")
	}

	enforceNodeAllocatable := kubeCfg.EnforceNodeAllocatable
	if experimentalNodeAllocatableIgnoreEvictionThreshold {
		// Do not provide kubeCfg.EnforceNodeAllocatable to eviction threshold parsing if we are not enforcing Evictions
		enforceNodeAllocatable = []string{}
	}
	thresholds, err := eviction.ParseThresholdConfig(enforceNodeAllocatable, kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim)
	if err != nil {
		return nil, err
	}
	evictionConfig := eviction.Config{
		PressureTransitionPeriod: kubeCfg.EvictionPressureTransitionPeriod.Duration,
		MaxPodGracePeriodSeconds: int64(kubeCfg.EvictionMaxPodGracePeriod),
		Thresholds:               thresholds,
		KernelMemcgNotification:  kernelMemcgNotification,
		PodCgroupRoot:            kubeDeps.ContainerManager.GetPodCgroupRoot(),
	}

	var serviceLister corelisters.ServiceLister
	var serviceHasSynced cache.InformerSynced
	if kubeDeps.KubeClient != nil {
		// don't watch headless services, they are not needed since this informer is only used to create the environment variables for pods.
		// See https://issues.k8s.io/122394
		kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0, informers.WithTweakListOptions(func(options *metav1.ListOptions) {
			options.FieldSelector = fields.OneTermNotEqualSelector("spec.clusterIP", v1.ClusterIPNone).String()
		}))
		serviceLister = kubeInformers.Core().V1().Services().Lister()
		serviceHasSynced = kubeInformers.Core().V1().Services().Informer().HasSynced
		kubeInformers.Start(wait.NeverStop)
	} else {
		serviceIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
		serviceLister = corelisters.NewServiceLister(serviceIndexer)
		serviceHasSynced = func() bool { return true }
	}

	// construct a node reference used for events
	nodeRef := &v1.ObjectReference{
		Kind:      "Node",
		Name:      string(nodeName),
		UID:       types.UID(nodeName),
		Namespace: "",
	}

	oomWatcher, err := oomwatcher.NewWatcher(kubeDeps.Recorder)
	if err != nil {
		if inuserns.RunningInUserNS() {
			if utilfeature.DefaultFeatureGate.Enabled(features.KubeletInUserNamespace) {
				// oomwatcher.NewWatcher returns "open /dev/kmsg: operation not permitted" error,
				// when running in a user namespace with sysctl value `kernel.dmesg_restrict=1`.
				klog.V(2).InfoS("Failed to create an oomWatcher (running in UserNS, ignoring)", "err", err)
				oomWatcher = nil
			} else {
				klog.ErrorS(err, "Failed to create an oomWatcher (running in UserNS, Hint: enable KubeletInUserNamespace feature flag to ignore the error)")
				return nil, err
			}
		} else {
			return nil, err
		}
	}

	clusterDNS := make([]net.IP, 0, len(kubeCfg.ClusterDNS))
	for _, ipEntry := range kubeCfg.ClusterDNS {
		ip := netutils.ParseIPSloppy(ipEntry)
		if ip == nil {
			klog.InfoS("Invalid clusterDNS IP", "IP", ipEntry)
		} else {
			clusterDNS = append(clusterDNS, ip)
		}
	}

	// A TLS transport is needed to make HTTPS-based container lifecycle requests,
	// but we do not have the information necessary to do TLS verification.
	//
	// This client must not be modified to include credentials, because it is
	// critical that credentials not leak from the client to arbitrary hosts.
	insecureContainerLifecycleHTTPClient := &http.Client{
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		},
		CheckRedirect: httpprobe.RedirectChecker(false),
	}

	tracer := kubeDeps.TracerProvider.Tracer(instrumentationScope)

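	// Construct the Kubelet instance. Only fields that come straight from the
	// configuration, the injected dependencies, and the arguments above are set
	// here; the remaining managers and caches are wired up below.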
	klet := &Kubelet{
		hostname:                       hostname,
		hostnameOverridden:             hostnameOverridden,
		nodeName:                       nodeName,
		kubeClient:                     kubeDeps.KubeClient,
		heartbeatClient:                kubeDeps.HeartbeatClient,
		onRepeatedHeartbeatFailure:     kubeDeps.OnHeartbeatFailure,
		rootDirectory:                  filepath.Clean(rootDirectory),
		podLogsDirectory:               podLogsDirectory,
		resyncInterval:                 kubeCfg.SyncFrequency.Duration,
		sourcesReady:                   config.NewSourcesReady(kubeDeps.PodConfig.SeenAllSources),
		registerNode:                   registerNode,
		registerWithTaints:             registerWithTaints,
		registerSchedulable:            registerSchedulable,
		dnsConfigurer:                  dns.NewConfigurer(kubeDeps.Recorder, nodeRef, nodeIPs, clusterDNS, kubeCfg.ClusterDomain, kubeCfg.ResolverConfig),
		serviceLister:                  serviceLister,
		serviceHasSynced:               serviceHasSynced,
		nodeLister:                     nodeLister,
		nodeHasSynced:                  nodeHasSynced,
		streamingConnectionIdleTimeout: kubeCfg.StreamingConnectionIdleTimeout.Duration,
		recorder:                       kubeDeps.Recorder,
		cadvisor:                       kubeDeps.CAdvisorInterface,
		cloud:                          kubeDeps.Cloud,
		externalCloudProvider:          cloudprovider.IsExternal(cloudProvider),
		providerID:                     providerID,
		nodeRef:                        nodeRef,
		nodeLabels:                     nodeLabels,
		nodeStatusUpdateFrequency:      kubeCfg.NodeStatusUpdateFrequency.Duration,
		nodeStatusReportFrequency:      kubeCfg.NodeStatusReportFrequency.Duration,
		os:                             kubeDeps.OSInterface,
		oomWatcher:                     oomWatcher,
		cgroupsPerQOS:                  kubeCfg.CgroupsPerQOS,
		cgroupRoot:                     kubeCfg.CgroupRoot,
		mounter:                        kubeDeps.Mounter,
		hostutil:                       kubeDeps.HostUtil,
		subpather:                      kubeDeps.Subpather,
		maxPods:                        int(kubeCfg.MaxPods),
		podsPerCore:                    int(kubeCfg.PodsPerCore),
		syncLoopMonitor:                atomic.Value{},
		daemonEndpoints:                daemonEndpoints,
		containerManager:               kubeDeps.ContainerManager,
		nodeIPs:                        nodeIPs,
		nodeIPValidator:                validateNodeIP,
		clock:                          clock.RealClock{},
		enableControllerAttachDetach:   kubeCfg.EnableControllerAttachDetach,
		makeIPTablesUtilChains:         kubeCfg.MakeIPTablesUtilChains,
		nodeStatusMaxImages:            nodeStatusMaxImages,
		tracer:                         tracer,
		nodeStartupLatencyTracker:      kubeDeps.NodeStartupLatencyTracker,
	}

	if klet.cloud != nil {
		klet.cloudResourceSyncManager = cloudresource.NewSyncManager(klet.cloud, nodeName, klet.nodeStatusUpdateFrequency)
	}

	var secretManager secret.Manager
	var configMapManager configmap.Manager
	if klet.kubeClient != nil {
		switch kubeCfg.ConfigMapAndSecretChangeDetectionStrategy {
		case kubeletconfiginternal.WatchChangeDetectionStrategy:
			secretManager = secret.NewWatchingSecretManager(klet.kubeClient, klet.resyncInterval)
			configMapManager = configmap.NewWatchingConfigMapManager(klet.kubeClient, klet.resyncInterval)
		case kubeletconfiginternal.TTLCacheChangeDetectionStrategy:
			secretManager = secret.NewCachingSecretManager(
				klet.kubeClient, manager.GetObjectTTLFromNodeFunc(klet.GetNode))
			configMapManager = configmap.NewCachingConfigMapManager(
				klet.kubeClient, manager.GetObjectTTLFromNodeFunc(klet.GetNode))
		case kubeletconfiginternal.GetChangeDetectionStrategy:
			secretManager = secret.NewSimpleSecretManager(klet.kubeClient)
			configMapManager = configmap.NewSimpleConfigMapManager(klet.kubeClient)
		default:
			return nil, fmt.Errorf("unknown configmap and secret manager mode: %v", kubeCfg.ConfigMapAndSecretChangeDetectionStrategy)
		}

		klet.secretManager = secretManager
		klet.configMapManager = configMapManager
	}

	machineInfo, err := klet.cadvisor.MachineInfo()
	if err != nil {
		return nil, err
	}
	// Avoid the collector treating it as a timestamped metric
	// See PR #95210 and #97006 for more details.
	machineInfo.Timestamp = time.Time{}
	klet.setCachedMachineInfo(machineInfo)

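	// imageBackOff is the exponential backoff the runtime manager applies to failed image pulls.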
	imageBackOff := flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff)

	klet.livenessManager = proberesults.NewManager()
	klet.readinessManager = proberesults.NewManager()
	klet.startupManager = proberesults.NewManager()
	klet.podCache = kubecontainer.NewCache()

	klet.mirrorPodClient = kubepod.NewBasicMirrorClient(klet.kubeClient, string(nodeName), nodeLister)
	klet.podManager = kubepod.NewBasicPodManager()

	klet.statusManager = status.NewManager(klet.kubeClient, klet.podManager, klet, kubeDeps.PodStartupLatencyTracker, klet.getRootDir())

	klet.resourceAnalyzer = serverstats.NewResourceAnalyzer(klet, kubeCfg.VolumeStatsAggPeriod.Duration, kubeDeps.Recorder)

	klet.runtimeService = kubeDeps.RemoteRuntimeService

	if kubeDeps.KubeClient != nil {
		klet.runtimeClassManager = runtimeclass.NewManager(kubeDeps.KubeClient)
	}

	// setup containerLogManager for CRI container runtime
	containerLogManager, err := logs.NewContainerLogManager(
		klet.runtimeService,
		kubeDeps.OSInterface,
		kubeCfg.ContainerLogMaxSize,
		int(kubeCfg.ContainerLogMaxFiles),
		int(kubeCfg.ContainerLogMaxWorkers),
		kubeCfg.ContainerLogMonitorInterval,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to initialize container log manager: %v", err)
	}
	klet.containerLogManager = containerLogManager

	klet.reasonCache = NewReasonCache()
	klet.workQueue = queue.NewBasicWorkQueue(klet.clock)
	klet.podWorkers = newPodWorkers(
		klet,
		kubeDeps.Recorder,
		klet.workQueue,
		klet.resyncInterval,
		backOffPeriod,
		klet.podCache,
	)

	runtime, err := kuberuntime.NewKubeGenericRuntimeManager(
		kubecontainer.FilterEventRecorder(kubeDeps.Recorder),
		klet.livenessManager,
		klet.readinessManager,
		klet.startupManager,
		rootDirectory,
		podLogsDirectory,
		machineInfo,
		klet.podWorkers,
		kubeDeps.OSInterface,
		klet,
		insecureContainerLifecycleHTTPClient,
		imageBackOff,
		kubeCfg.SerializeImagePulls,
		kubeCfg.MaxParallelImagePulls,
		float32(kubeCfg.RegistryPullQPS),
		int(kubeCfg.RegistryBurst),
		imageCredentialProviderConfigFile,
		imageCredentialProviderBinDir,
		kubeCfg.CPUCFSQuota,
		kubeCfg.CPUCFSQuotaPeriod,
		kubeDeps.RemoteRuntimeService,
		kubeDeps.RemoteImageService,
		kubeDeps.ContainerManager,
		klet.containerLogManager,
		klet.runtimeClassManager,
		seccompDefault,
		kubeCfg.MemorySwap.SwapBehavior,
		kubeDeps.ContainerManager.GetNodeAllocatableAbsolute,
		*kubeCfg.MemoryThrottlingFactor,
		kubeDeps.PodStartupLatencyTracker,
		kubeDeps.TracerProvider,
	)
	if err != nil {
		return nil, err
	}
	klet.containerRuntime = runtime
	klet.streamingRuntime = runtime
	klet.runner = runtime

	runtimeCache, err := kubecontainer.NewRuntimeCache(klet.containerRuntime, runtimeCacheRefreshPeriod)
	if err != nil {
		return nil, err
	}
	klet.runtimeCache = runtimeCache

	// common provider to get host file system usage associated with a pod managed by kubelet
	hostStatsProvider := stats.NewHostStatsProvider(kubecontainer.RealOS{}, func(podUID types.UID) string {
		return getEtcHostsPath(klet.getPodDir(podUID))
	}, podLogsDirectory)
	if kubeDeps.useLegacyCadvisorStats {
		klet.StatsProvider = stats.NewCadvisorStatsProvider(
			klet.cadvisor,
			klet.resourceAnalyzer,
			klet.podManager,
			klet.runtimeCache,
			klet.containerRuntime,
			klet.statusManager,
			hostStatsProvider)
	} else {
		klet.StatsProvider = stats.NewCRIStatsProvider(
			klet.cadvisor,
			klet.resourceAnalyzer,
			klet.podManager,
			klet.runtimeCache,
			kubeDeps.RemoteRuntimeService,
			kubeDeps.RemoteImageService,
			hostStatsProvider,
			utilfeature.DefaultFeatureGate.Enabled(features.PodAndContainerStatsFromCRI))
	}

	eventChannel := make(chan *pleg.PodLifecycleEvent, plegChannelCapacity)

	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
		// adjust Generic PLEG relisting period and threshold to higher value when Evented PLEG is turned on
		genericRelistDuration := &pleg.RelistDuration{
			RelistPeriod:    eventedPlegRelistPeriod,
			RelistThreshold: eventedPlegRelistThreshold,
		}
		klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{})
		// In case Evented PLEG has to fall back on Generic PLEG due to an error,
		// Evented PLEG should be able to reset the Generic PLEG relisting duration
		// to the default value.
		eventedRelistDuration := &pleg.RelistDuration{
			RelistPeriod:    genericPlegRelistPeriod,
			RelistThreshold: genericPlegRelistThreshold,
		}
		klet.eventedPleg, err = pleg.NewEventedPLEG(klet.containerRuntime, klet.runtimeService, eventChannel,
			klet.podCache, klet.pleg, eventedPlegMaxStreamRetries, eventedRelistDuration, clock.RealClock{})
		if err != nil {
			return nil, err
		}
	} else {
		genericRelistDuration := &pleg.RelistDuration{
			RelistPeriod:    genericPlegRelistPeriod,
			RelistThreshold: genericPlegRelistThreshold,
		}
		klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{})
	}

	klet.runtimeState = newRuntimeState(maxWaitForContainerRuntime)
	klet.runtimeState.addHealthCheck("PLEG", klet.pleg.Healthy)
	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
		klet.runtimeState.addHealthCheck("EventedPLEG", klet.eventedPleg.Healthy)
	}
	if _, err := klet.updatePodCIDR(ctx, kubeCfg.PodCIDR); err != nil {
		klog.ErrorS(err, "Pod CIDR update failed")
	}

	// setup containerGC
	containerGC, err := kubecontainer.NewContainerGC(klet.containerRuntime, containerGCPolicy, klet.sourcesReady)
	if err != nil {
		return nil, err
	}
	klet.containerGC = containerGC
	klet.containerDeletor = newPodContainerDeletor(klet.containerRuntime, max(containerGCPolicy.MaxPerPodContainer, minDeadContainerInPod))

	// setup imageManager
	imageManager, err := images.NewImageGCManager(klet.containerRuntime, klet.StatsProvider, kubeDeps.Recorder, nodeRef, imageGCPolicy, kubeDeps.TracerProvider)
	if err != nil {
		return nil, fmt.Errorf("failed to initialize image manager: %v", err)
	}
	klet.imageManager = imageManager

	if kubeCfg.ServerTLSBootstrap && kubeDeps.TLSOptions != nil && utilfeature.DefaultFeatureGate.Enabled(features.RotateKubeletServerCertificate) {
		klet.serverCertificateManager, err = kubeletcertificate.NewKubeletServerCertificateManager(klet.kubeClient, kubeCfg, klet.nodeName, klet.getLastObservedNodeAddresses, certDirectory)
		if err != nil {
			return nil, fmt.Errorf("failed to initialize certificate manager: %v", err)
		}
		kubeDeps.TLSOptions.Config.GetCertificate = func(*tls.ClientHelloInfo) (*tls.Certificate, error) {
			cert := klet.serverCertificateManager.Current()
			if cert == nil {
				return nil, fmt.Errorf("no serving certificate available for the kubelet")
			}
			return cert, nil
		}
	}

	if kubeDeps.ProbeManager != nil {
		klet.probeManager = kubeDeps.ProbeManager
	} else {
		klet.probeManager = prober.NewManager(
			klet.statusManager,
			klet.livenessManager,
			klet.readinessManager,
			klet.startupManager,
			klet.runner,
			kubeDeps.Recorder)
	}

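	// tokenManager requests and caches service account tokens on behalf of pods
	// (for example, for projected service account token volumes).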
	tokenManager := token.NewManager(kubeDeps.KubeClient)

	var clusterTrustBundleManager clustertrustbundle.Manager
	if kubeDeps.KubeClient != nil && utilfeature.DefaultFeatureGate.Enabled(features.ClusterTrustBundleProjection) {
		kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0)
		clusterTrustBundleManager, err = clustertrustbundle.NewInformerManager(kubeInformers.Certificates().V1alpha1().ClusterTrustBundles(), 2*int(kubeCfg.MaxPods), 5*time.Minute)
		if err != nil {
			return nil, fmt.Errorf("while starting informer-based ClusterTrustBundle manager: %w", err)
		}
		kubeInformers.Start(wait.NeverStop)
		klog.InfoS("Started ClusterTrustBundle informer")
	} else {
		// In static kubelet mode, use a no-op manager.
		clusterTrustBundleManager = &clustertrustbundle.NoopManager{}
		klog.InfoS("Not starting ClusterTrustBundle informer because we are in static kubelet mode")
	}

	// NewInitializedVolumePluginMgr initializes some storageErrors on the Kubelet runtimeState (in csi_plugin.go init)
	// which affects node ready status. This function must be called before Kubelet is initialized so that the Node
	// ReadyState is accurate with the storage state.
	klet.volumePluginMgr, err =
		NewInitializedVolumePluginMgr(klet, secretManager, configMapManager, tokenManager, clusterTrustBundleManager, kubeDeps.VolumePlugins, kubeDeps.DynamicPluginProber)
	if err != nil {
		return nil, err
	}
	klet.pluginManager = pluginmanager.NewPluginManager(
		klet.getPluginsRegistrationDir(), /* sockDir */
		kubeDeps.Recorder,
	)

	// If the experimentalMounterPathFlag is set, we do not want to
	// check node capabilities since the mount path is not the default
	if len(experimentalMounterPath) != 0 {
		// Replace the nameserver in containerized-mounter's rootfs/etc/resolv.conf with kubelet.ClusterDNS
		// so that service name could be resolved
		klet.dnsConfigurer.SetupDNSinContainerizedMounter(experimentalMounterPath)
	}

	// setup volumeManager
	klet.volumeManager = volumemanager.NewVolumeManager(
		kubeCfg.EnableControllerAttachDetach,
		nodeName,
		klet.podManager,
		klet.podWorkers,
		klet.kubeClient,
		klet.volumePluginMgr,
		klet.containerRuntime,
		kubeDeps.Mounter,
		kubeDeps.HostUtil,
		klet.getPodsDir(),
		kubeDeps.Recorder,
		volumepathhandler.NewBlockVolumePathHandler())

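	// backOff is the shared exponential backoff used when restarting containers that previously failed.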
	klet.backOff = flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff)

	// setup eviction manager
	evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig,
		killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock, kubeCfg.LocalStorageCapacityIsolation)

	klet.evictionManager = evictionManager
	klet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)

	// Safe, allowed sysctls can always be used as unsafe sysctls in the spec.
	// Hence, we concatenate those two lists.
	safeAndUnsafeSysctls := append(sysctl.SafeSysctlAllowlist(), allowedUnsafeSysctls...)
	sysctlsAllowlist, err := sysctl.NewAllowlist(safeAndUnsafeSysctls)
	if err != nil {
		return nil, err
	}
	klet.admitHandlers.AddPodAdmitHandler(sysctlsAllowlist)

	// enable active deadline handler
	activeDeadlineHandler, err := newActiveDeadlineHandler(klet.statusManager, kubeDeps.Recorder, klet.clock)
	if err != nil {
		return nil, err
	}
	klet.AddPodSyncLoopHandler(activeDeadlineHandler)
	klet.AddPodSyncHandler(activeDeadlineHandler)

	klet.admitHandlers.AddPodAdmitHandler(klet.containerManager.GetAllocateResourcesPodAdmitHandler())

	criticalPodAdmissionHandler := preemption.NewCriticalPodAdmissionHandler(klet.GetActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeDeps.Recorder)
	klet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(klet.getNodeAnyWay, criticalPodAdmissionHandler, klet.containerManager.UpdatePluginResources))
	// apply functional Option's
	for _, opt := range kubeDeps.Options {
		opt(klet)
	}

	if sysruntime.GOOS == "linux" {
		// AppArmor is a Linux kernel security module and it does not support other operating systems.
		klet.appArmorValidator = apparmor.NewValidator()
		klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator))
	}

	leaseDuration := time.Duration(kubeCfg.NodeLeaseDurationSeconds) * time.Second
	renewInterval := time.Duration(float64(leaseDuration) * nodeLeaseRenewIntervalFraction)
	klet.nodeLeaseController = lease.NewController(
		klet.clock,
		klet.heartbeatClient,
		string(klet.nodeName),
		kubeCfg.NodeLeaseDurationSeconds,
		klet.onRepeatedHeartbeatFailure,
		renewInterval,
		string(klet.nodeName),
		v1.NamespaceNodeLease,
		util.SetNodeOwnerFunc(klet.heartbeatClient, string(klet.nodeName)))

	// setup node shutdown manager
	shutdownManager, shutdownAdmitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{
		Logger:                           logger,
		ProbeManager:                     klet.probeManager,
		Recorder:                         kubeDeps.Recorder,
		NodeRef:                          nodeRef,
		GetPodsFunc:                      klet.GetActivePods,
		KillPodFunc:                      killPodNow(klet.podWorkers, kubeDeps.Recorder),
		SyncNodeStatusFunc:               klet.syncNodeStatus,
		ShutdownGracePeriodRequested:     kubeCfg.ShutdownGracePeriod.Duration,
		ShutdownGracePeriodCriticalPods:  kubeCfg.ShutdownGracePeriodCriticalPods.Duration,
		ShutdownGracePeriodByPodPriority: kubeCfg.ShutdownGracePeriodByPodPriority,
		StateDirectory:                   rootDirectory,
	})
	klet.shutdownManager = shutdownManager
	klet.usernsManager, err = userns.MakeUserNsManager(klet)
	if err != nil {
		return nil, err
	}
	klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler)

	// Finally, put the most recent version of the config on the Kubelet, so
	// people can see how it was configured.
	klet.kubeletConfiguration = *kubeCfg

	// Generating the status funcs should be the last thing we do,
	// since this relies on the rest of the Kubelet having been constructed.
	klet.setNodeStatusFuncs = klet.defaultNodeStatusFuncs()

	return klet, nil
}

type serviceLister interface {
	List(labels.Selector) ([]*v1.Service, error)
}

// Kubelet is the main kubelet implementation.
type Kubelet struct {
	kubeletConfiguration kubeletconfiginternal.KubeletConfiguration

	// hostname is the hostname the kubelet detected or was given via flag/config
	hostname string
	// hostnameOverridden indicates the hostname was overridden via flag/config
	hostnameOverridden bool

	nodeName        types.NodeName
	runtimeCache    kubecontainer.RuntimeCache
	kubeClient      clientset.Interface
	heartbeatClient clientset.Interface
	// mirrorPodClient is used to create and delete mirror pods in the API for static
	// pods.
	mirrorPodClient kubepod.MirrorClient

	rootDirectory    string
	podLogsDirectory string

	lastObservedNodeAddressesMux sync.RWMutex
	lastObservedNodeAddresses    []v1.NodeAddress

	// onRepeatedHeartbeatFailure is called when a heartbeat operation fails more than once. optional.
	onRepeatedHeartbeatFailure func()

	// podManager stores the desired set of admitted pods and mirror pods that the kubelet should be
	// running. The actual set of running pods is stored on the podWorkers. The manager is populated
	// by the kubelet config loops which abstracts receiving configuration from many different sources
	// (api for regular pods, local filesystem or http for static pods). The manager may be consulted
	// by other components that need to see the set of desired pods. Note that not all desired pods are
	// running, and not all running pods are in the podManager - for instance, force deleting a pod
	// from the apiserver will remove it from the podManager, but the pod may still be terminating and
	// tracked by the podWorkers. Components that need to know the actual consumed resources of the
	// node or are driven by podWorkers and the sync*Pod methods (status, volume, stats) should also
	// consult the podWorkers when reconciling.
	//
	// TODO: review all kubelet components that need the actual set of pods (vs the desired set)
	// and update them to use podWorkers instead of podManager. This may introduce latency in some
	// methods, but avoids race conditions and correctly accounts for terminating pods that have
	// been force deleted or static pods that have been updated.
	// https://github.com/kubernetes/kubernetes/issues/116970
	podManager kubepod.Manager

	// podWorkers is responsible for driving the lifecycle state machine of each pod. The worker is
	// notified of config changes, updates, periodic reconciliation, container runtime updates, and
	// evictions of all desired pods and will invoke reconciliation methods per pod in separate
	// goroutines. The podWorkers are authoritative in the kubelet for what pods are actually being
	// run and their current state:
	//
	// * syncing: pod should be running (syncPod)
	// * terminating: pod should be stopped (syncTerminatingPod)
	// * terminated: pod should have all resources cleaned up (syncTerminatedPod)
	//
	// and invoke the handler methods that correspond to each state. Components within the
	// kubelet that need to know the phase of the pod in order to correctly set up or tear down
	// resources must consult the podWorkers.
	//
	// Once a pod has been accepted by the pod workers, no other pod with that same UID (and
	// name+namespace, for static pods) will be started until the first pod has fully terminated
	// and been cleaned up by SyncKnownPods. This means a pod may be desired (in API), admitted
	// (in pod manager), and requested (by invoking UpdatePod) but not start for an arbitrarily
	// long interval because a prior pod is still terminating.
	//
	// As an event-driven (by UpdatePod) controller, the podWorkers must periodically be resynced
	// by the kubelet invoking SyncKnownPods with the desired state (admitted pods in podManager).
	// Since the podManager may be unaware of some running pods due to force deletion, the
	// podWorkers are responsible for triggering a sync of pods that are no longer desired but
	// must still run to completion.
	podWorkers PodWorkers

	// evictionManager observes the state of the node for situations that could impact node stability
	// and evicts pods (sets to phase Failed with reason Evicted) to reduce resource pressure. The
	// eviction manager acts on the actual state of the node and considers the podWorker to be
	// authoritative.
	evictionManager eviction.Manager

	// probeManager tracks the set of running pods and ensures any user-defined periodic checks are
	// run to introspect the state of each pod.  The probe manager acts on the actual state of the node
	// and is notified of pods by the podWorker. The probe manager is the authoritative source of the
	// most recent probe status and is responsible for notifying the status manager, which
	// synthesizes them into the overall pod status.
	probeManager prober.Manager

	// secretManager caches the set of secrets used by running pods on this node. The podWorkers
	// notify the secretManager when pods are started and terminated, and the secretManager must
	// then keep the needed secrets up-to-date as they change.
	secretManager secret.Manager

	// configMapManager caches the set of config maps used by running pods on this node. The
	// podWorkers notify the configMapManager when pods are started and terminated, and the
	// configMapManager must then keep the needed config maps up-to-date as they change.
	configMapManager configmap.Manager

	// volumeManager observes the set of running pods and is responsible for attaching, mounting,
	// unmounting, and detaching as those pods move through their lifecycle. It periodically
	// synchronizes the set of known volumes to the set of actually desired volumes and cleans up
	// any orphaned volumes. The volume manager considers the podWorker to be authoritative for
	// which pods are running.
	volumeManager volumemanager.VolumeManager

	// statusManager receives updated pod status updates from the podWorker and updates the API
	// status of those pods to match. The statusManager is authoritative for the synthesized
	// status of the pod from the kubelet's perspective (other components own the individual
	// elements of status) and should be consulted by components in preference to assembling
	// that status themselves. Note that the status manager is downstream of the pod worker
	// and components that need to check whether a pod is still running should instead directly
	// consult the pod worker.
	statusManager status.Manager

	// resyncInterval is the interval between periodic full reconciliations of
	// pods on this node.
	resyncInterval time.Duration

	// sourcesReady records the sources seen by the kubelet, it is thread-safe.
	sourcesReady config.SourcesReady

	// Optional, defaults to /logs/ from /var/log
	logServer http.Handler
	// Optional, defaults to simple Docker implementation
	runner kubecontainer.CommandRunner

	// cAdvisor used for container information.
	cadvisor cadvisor.Interface

	// Set to true to have the node register itself with the apiserver.
	registerNode bool
	// List of taints to add to a node object when the kubelet registers itself.
	registerWithTaints []v1.Taint
	// Set to true to have the node register itself as schedulable.
	registerSchedulable bool
	// for internal bookkeeping; access only from within registerWithApiserver
  1085  	registrationCompleted bool
  1086  
  1087  	// dnsConfigurer is used for setting up DNS resolver configuration when launching pods.
  1088  	dnsConfigurer *dns.Configurer
  1089  
  1090  	// serviceLister knows how to list services
  1091  	serviceLister serviceLister
  1092  	// serviceHasSynced indicates whether services have been sync'd at least once.
  1093  	// Check this before trusting a response from the lister.
  1094  	serviceHasSynced cache.InformerSynced
  1095  	// nodeLister knows how to list nodes
  1096  	nodeLister corelisters.NodeLister
  1097  	// nodeHasSynced indicates whether nodes have been sync'd at least once.
  1098  	// Check this before trusting a response from the node lister.
  1099  	nodeHasSynced cache.InformerSynced
  1100  	// a list of node labels to register
  1101  	nodeLabels map[string]string
  1102  
  1103  	// Last timestamp when runtime responded on ping.
  1104  	// Mutex is used to protect this value.
  1105  	runtimeState *runtimeState
  1106  
  1107  	// Volume plugins.
  1108  	volumePluginMgr *volume.VolumePluginMgr
  1109  
  1110  	// Manages container health check results.
  1111  	livenessManager  proberesults.Manager
  1112  	readinessManager proberesults.Manager
  1113  	startupManager   proberesults.Manager
  1114  
  1115  	// How long to keep idle streaming command execution/port forwarding
  1116  	// connections open before terminating them
  1117  	streamingConnectionIdleTimeout time.Duration
  1118  
  1119  	// The EventRecorder to use
  1120  	recorder record.EventRecorder
  1121  
  1122  	// Policy for handling garbage collection of dead containers.
  1123  	containerGC kubecontainer.GC
  1124  
  1125  	// Manager for image garbage collection.
  1126  	imageManager images.ImageGCManager
  1127  
  1128  	// Manager for container logs.
  1129  	containerLogManager logs.ContainerLogManager
  1130  
  1131  	// Cached MachineInfo returned by cadvisor.
  1132  	machineInfoLock sync.RWMutex
  1133  	machineInfo     *cadvisorapi.MachineInfo
  1134  
  1135  	// Handles certificate rotations.
  1136  	serverCertificateManager certificate.Manager
  1137  
  1138  	// Cloud provider interface.
  1139  	cloud cloudprovider.Interface
  1140  	// Handles requests to cloud provider with timeout
  1141  	cloudResourceSyncManager cloudresource.SyncManager
  1142  
  1143  	// Indicates that the node initialization happens in an external cloud controller
  1144  	externalCloudProvider bool
  1145  	// Reference to this node.
  1146  	nodeRef *v1.ObjectReference
  1147  
  1148  	// Container runtime.
  1149  	containerRuntime kubecontainer.Runtime
  1150  
  1151  	// Streaming runtime handles container streaming.
  1152  	streamingRuntime kubecontainer.StreamingRuntime
  1153  
  1154  	// Container runtime service (needed by container runtime Start()).
  1155  	runtimeService internalapi.RuntimeService
  1156  
  1157  	// reasonCache caches the failure reason of the last creation of all containers, which is
  1158  	// used for generating ContainerStatus.
  1159  	reasonCache *ReasonCache
  1160  
  1161  	// containerRuntimeReadyExpected indicates whether container runtime being ready is expected
  1162  	// so errors are logged without verbosity guard, to avoid excessive error logs at node startup.
  1163  	// It's false during the node initialization period of nodeReadyGracePeriod, and after that
  1164  	// it's set to true by fastStatusUpdateOnce when it exits.
  1165  	containerRuntimeReadyExpected bool
  1166  
  1167  	// nodeStatusUpdateFrequency specifies how often kubelet computes node status. If node lease
  1168  	// feature is not enabled, it is also the frequency that kubelet posts node status to master.
  1169  	// In that case, be cautious when changing the constant, it must work with nodeMonitorGracePeriod
  1170  	// in nodecontroller. There are several constraints:
  1171  	// 1. nodeMonitorGracePeriod must be N times more than nodeStatusUpdateFrequency, where
  1172  	//    N means number of retries allowed for kubelet to post node status. It is pointless
  1173  	//    to make nodeMonitorGracePeriod be less than nodeStatusUpdateFrequency, since there
  1174  	//    will only be fresh values from Kubelet at an interval of nodeStatusUpdateFrequency.
  1175  	//    The constant must be less than podEvictionTimeout.
  1176  	// 2. nodeStatusUpdateFrequency needs to be large enough for kubelet to generate node
  1177  	//    status. Kubelet may fail to update node status reliably if the value is too small,
  1178  	//    as it takes time to gather all necessary node information.
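        	// For illustration only (these defaults are set elsewhere, not in this file): with the
        	// commonly documented defaults of nodeStatusUpdateFrequency=10s and nodeMonitorGracePeriod=40s,
        	// the node controller tolerates roughly four missed status posts before treating the node as unhealthy.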
  1179  	nodeStatusUpdateFrequency time.Duration
  1180  
  1181  	// nodeStatusReportFrequency is the frequency that kubelet posts node
  1182  	// status to master. It is only used when node lease feature is enabled.
  1183  	nodeStatusReportFrequency time.Duration
  1184  
  1185  	// lastStatusReportTime is the time when node status was last reported.
  1186  	lastStatusReportTime time.Time
  1187  
  1188  	// syncNodeStatusMux is a lock on updating the node status, because this path is not thread-safe.
  1189  	// This lock is used by Kubelet.syncNodeStatus and Kubelet.fastNodeStatusUpdate functions and shouldn't be used anywhere else.
  1190  	syncNodeStatusMux sync.Mutex
  1191  
  1192  	// updatePodCIDRMux is a lock on updating pod CIDR, because this path is not thread-safe.
  1193  	// This lock is used by Kubelet.updatePodCIDR function and shouldn't be used anywhere else.
  1194  	updatePodCIDRMux sync.Mutex
  1195  
  1196  	// updateRuntimeMux is a lock on updating runtime, because this path is not thread-safe.
  1197  	// This lock is used by Kubelet.updateRuntimeUp, Kubelet.fastNodeStatusUpdate and
  1198  	// Kubelet.HandlerSupportsUserNamespaces functions and shouldn't be used anywhere else.
  1199  	updateRuntimeMux sync.Mutex
  1200  
  1201  	// nodeLeaseController claims and renews the node lease for this Kubelet
  1202  	nodeLeaseController lease.Controller
  1203  
  1204  	// pleg observes the state of the container runtime and notifies the kubelet of changes to containers, which
  1205  	// notifies the podWorkers to reconcile the state of the pod (for instance, if a container dies and needs to
  1206  	// be restarted).
  1207  	pleg pleg.PodLifecycleEventGenerator
  1208  
  1209  	// eventedPleg supplements the pleg to deliver edge-driven container changes with low latency.
  1210  	eventedPleg pleg.PodLifecycleEventGenerator
  1211  
  1212  	// Store kubecontainer.PodStatus for all pods.
  1213  	podCache kubecontainer.Cache
  1214  
  1215  	// os is a facade for various syscalls that need to be mocked during testing.
  1216  	os kubecontainer.OSInterface
  1217  
  1218  	// Watcher of out of memory events.
  1219  	oomWatcher oomwatcher.Watcher
  1220  
  1221  	// Monitor resource usage
  1222  	resourceAnalyzer serverstats.ResourceAnalyzer
  1223  
  1224  	// Whether or not we should have the QOS cgroup hierarchy for resource management
  1225  	cgroupsPerQOS bool
  1226  
  1227  	// If non-empty, pass this to the container runtime as the root cgroup.
  1228  	cgroupRoot string
  1229  
  1230  	// Mounter to use for volumes.
  1231  	mounter mount.Interface
  1232  
  1233  	// hostutil to interact with filesystems
  1234  	hostutil hostutil.HostUtils
  1235  
  1236  	// subpather to execute subpath actions
  1237  	subpather subpath.Interface
  1238  
  1239  	// Manager of non-Runtime containers.
  1240  	containerManager cm.ContainerManager
  1241  
  1242  	// Maximum Number of Pods which can be run by this Kubelet
  1243  	maxPods int
  1244  
  1245  	// Monitor Kubelet's sync loop
  1246  	syncLoopMonitor atomic.Value
  1247  
  1248  	// Container restart Backoff
  1249  	backOff *flowcontrol.Backoff
  1250  
  1251  	// Information about the ports which are opened by daemons on Node running this Kubelet server.
  1252  	daemonEndpoints *v1.NodeDaemonEndpoints
  1253  
  1254  	// A queue used to trigger pod workers.
  1255  	workQueue queue.WorkQueue
  1256  
  1257  	// oneTimeInitializer is used to initialize modules that are dependent on the runtime to be up.
  1258  	oneTimeInitializer sync.Once
  1259  
  1260  	// If set, use this IP address or addresses for the node
  1261  	nodeIPs []net.IP
  1262  
  1263  	// use this function to validate the kubelet nodeIP
  1264  	nodeIPValidator func(net.IP) error
  1265  
  1266  	// If non-empty, this is a unique identifier for the node in an external database, e.g. a cloud provider
  1267  	providerID string
  1268  
  1269  	// clock is an interface that provides time related functionality in a way that makes it
  1270  	// easy to test the code.
  1271  	clock clock.WithTicker
  1272  
  1273  	// handlers called during the tryUpdateNodeStatus cycle
  1274  	setNodeStatusFuncs []func(context.Context, *v1.Node) error
  1275  
  1276  	lastNodeUnschedulableLock sync.Mutex
  1277  	// maintains Node.Spec.Unschedulable value from previous run of tryUpdateNodeStatus()
  1278  	lastNodeUnschedulable bool
  1279  
  1280  	// the list of handlers to call during pod admission.
  1281  	admitHandlers lifecycle.PodAdmitHandlers
  1282  
  1283  	// softAdmitHandlers are applied to the pod after it is admitted by the Kubelet, but before it is
  1284  	// run. A pod rejected by a softAdmitHandler will be left in a Pending state indefinitely. If a
  1285  	// rejected pod should not be recreated, or the scheduler is not aware of the rejection rule, the
  1286  	// admission rule should be applied by a softAdmitHandler.
  1287  	softAdmitHandlers lifecycle.PodAdmitHandlers
  1288  
  1289  	// the list of handlers to call during pod sync loop.
  1290  	lifecycle.PodSyncLoopHandlers
  1291  
  1292  	// the list of handlers to call during pod sync.
  1293  	lifecycle.PodSyncHandlers
  1294  
  1295  	// the number of allowed pods per core
  1296  	podsPerCore int
  1297  
  1298  	// enableControllerAttachDetach indicates the Attach/Detach controller
  1299  	// should manage attachment/detachment of volumes scheduled to this node,
  1300  	// and disable kubelet from executing any attach/detach operations
  1301  	enableControllerAttachDetach bool
  1302  
  1303  	// trigger deleting containers in a pod
  1304  	containerDeletor *podContainerDeletor
  1305  
  1306  	// config iptables util rules
  1307  	makeIPTablesUtilChains bool
  1308  
  1309  	// The AppArmor validator for checking whether AppArmor is supported.
  1310  	appArmorValidator apparmor.Validator
  1311  
  1312  	// StatsProvider provides the node and the container stats.
  1313  	StatsProvider *stats.Provider
  1314  
  1315  	// pluginmanager runs a set of asynchronous loops that figure out which
  1316  	// plugins need to be registered/unregistered based on this node and makes it so.
  1317  	pluginManager pluginmanager.PluginManager
  1318  
  1319  	// This flag sets a maximum number of images to report in the node status.
  1320  	nodeStatusMaxImages int32
  1321  
  1322  	// Handles RuntimeClass objects for the Kubelet.
  1323  	runtimeClassManager *runtimeclass.Manager
  1324  
  1325  	// Handles node shutdown events for the Node.
  1326  	shutdownManager nodeshutdown.Manager
  1327  
  1328  	// Manage user namespaces
  1329  	usernsManager *userns.UsernsManager
  1330  
  1331  	// Mutex to serialize new pod admission and existing pod resizing
  1332  	podResizeMutex sync.Mutex
  1333  
  1334  	// OpenTelemetry Tracer
  1335  	tracer trace.Tracer
  1336  
  1337  	// Track node startup latencies
  1338  	nodeStartupLatencyTracker util.NodeStartupLatencyTracker
  1339  }
  1340  
  1341  // ListPodStats is delegated to StatsProvider, which implements stats.Provider interface
  1342  func (kl *Kubelet) ListPodStats(ctx context.Context) ([]statsapi.PodStats, error) {
  1343  	return kl.StatsProvider.ListPodStats(ctx)
  1344  }
  1345  
  1346  // ListPodCPUAndMemoryStats is delegated to StatsProvider, which implements stats.Provider interface
  1347  func (kl *Kubelet) ListPodCPUAndMemoryStats(ctx context.Context) ([]statsapi.PodStats, error) {
  1348  	return kl.StatsProvider.ListPodCPUAndMemoryStats(ctx)
  1349  }
  1350  
  1351  // ListPodStatsAndUpdateCPUNanoCoreUsage is delegated to StatsProvider, which implements stats.Provider interface
  1352  func (kl *Kubelet) ListPodStatsAndUpdateCPUNanoCoreUsage(ctx context.Context) ([]statsapi.PodStats, error) {
  1353  	return kl.StatsProvider.ListPodStatsAndUpdateCPUNanoCoreUsage(ctx)
  1354  }
  1355  
  1356  // ImageFsStats is delegated to StatsProvider, which implements stats.Provider interface
  1357  func (kl *Kubelet) ImageFsStats(ctx context.Context) (*statsapi.FsStats, *statsapi.FsStats, error) {
  1358  	return kl.StatsProvider.ImageFsStats(ctx)
  1359  }
  1360  
  1361  // GetCgroupStats is delegated to StatsProvider, which implements stats.Provider interface
  1362  func (kl *Kubelet) GetCgroupStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, *statsapi.NetworkStats, error) {
  1363  	return kl.StatsProvider.GetCgroupStats(cgroupName, updateStats)
  1364  }
  1365  
  1366  // GetCgroupCPUAndMemoryStats is delegated to StatsProvider, which implements stats.Provider interface
  1367  func (kl *Kubelet) GetCgroupCPUAndMemoryStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, error) {
  1368  	return kl.StatsProvider.GetCgroupCPUAndMemoryStats(cgroupName, updateStats)
  1369  }
  1370  
  1371  // RootFsStats is delegated to StatsProvider, which implements stats.Provider interface
  1372  func (kl *Kubelet) RootFsStats() (*statsapi.FsStats, error) {
  1373  	return kl.StatsProvider.RootFsStats()
  1374  }
  1375  
  1376  // RlimitStats is delegated to StatsProvider, which implements stats.Provider interface
  1377  func (kl *Kubelet) RlimitStats() (*statsapi.RlimitStats, error) {
  1378  	return kl.StatsProvider.RlimitStats()
  1379  }
  1380  
  1381  // setupDataDirs creates:
  1382  // 1.  the root directory
  1383  // 2.  the pods directory
  1384  // 3.  the plugins directory
  1385  // 4.  the pod-resources directory
  1386  // 5.  the checkpoint directory
  1387  // 6.  the pod logs root directory
  1388  func (kl *Kubelet) setupDataDirs() error {
  1389  	if cleanedRoot := filepath.Clean(kl.rootDirectory); cleanedRoot != kl.rootDirectory {
  1390  		return fmt.Errorf("rootDirectory not in canonical form: expected %s, was %s", cleanedRoot, kl.rootDirectory)
  1391  	}
  1392  	pluginRegistrationDir := kl.getPluginsRegistrationDir()
  1393  	pluginsDir := kl.getPluginsDir()
  1394  	if err := os.MkdirAll(kl.getRootDir(), 0750); err != nil {
  1395  		return fmt.Errorf("error creating root directory: %v", err)
  1396  	}
  1397  	if err := os.MkdirAll(kl.getPodLogsDir(), 0750); err != nil {
  1398  		return fmt.Errorf("error creating pod logs root directory %q: %w", kl.getPodLogsDir(), err)
  1399  	}
  1400  	if err := kl.hostutil.MakeRShared(kl.getRootDir()); err != nil {
  1401  		return fmt.Errorf("error configuring root directory: %v", err)
  1402  	}
  1403  	if err := os.MkdirAll(kl.getPodsDir(), 0750); err != nil {
  1404  		return fmt.Errorf("error creating pods directory: %v", err)
  1405  	}
  1406  	if err := os.MkdirAll(kl.getPluginsDir(), 0750); err != nil {
  1407  		return fmt.Errorf("error creating plugins directory: %v", err)
  1408  	}
  1409  	if err := os.MkdirAll(kl.getPluginsRegistrationDir(), 0750); err != nil {
  1410  		return fmt.Errorf("error creating plugins registry directory: %v", err)
  1411  	}
  1412  	if err := os.MkdirAll(kl.getPodResourcesDir(), 0750); err != nil {
  1413  		return fmt.Errorf("error creating podresources directory: %v", err)
  1414  	}
  1415  	if utilfeature.DefaultFeatureGate.Enabled(features.ContainerCheckpoint) {
  1416  		if err := os.MkdirAll(kl.getCheckpointsDir(), 0700); err != nil {
  1417  			return fmt.Errorf("error creating checkpoint directory: %v", err)
  1418  		}
  1419  	}
  1420  	if selinux.GetEnabled() {
  1421  		err := selinux.SetFileLabel(pluginRegistrationDir, config.KubeletPluginsDirSELinuxLabel)
  1422  		if err != nil {
  1423  			klog.InfoS("Unprivileged containerized plugins might not work, could not set selinux context on plugin registration dir", "path", pluginRegistrationDir, "err", err)
  1424  		}
  1425  		err = selinux.SetFileLabel(pluginsDir, config.KubeletPluginsDirSELinuxLabel)
  1426  		if err != nil {
  1427  			klog.InfoS("Unprivileged containerized plugins might not work, could not set selinux context on plugins dir", "path", pluginsDir, "err", err)
  1428  		}
  1429  	}
  1430  	return nil
  1431  }
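
        // Illustrative note (not part of the upstream source): under a typical root directory of
        // /var/lib/kubelet, the directories created above usually end up looking like:
        //
        //	/var/lib/kubelet/pods               // per-pod data directories
        //	/var/lib/kubelet/plugins            // plugins dir
        //	/var/lib/kubelet/plugins_registry   // plugin registration sockets
        //	/var/lib/kubelet/pod-resources      // pod-resources API socket
        //	/var/lib/kubelet/checkpoints        // only when ContainerCheckpoint is enabled
        //	/var/log/pods                       // pod logs root directory
        //
        // The exact paths come from the getter helpers used above (getRootDir, getPodsDir, and so on).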
  1432  
  1433  // StartGarbageCollection starts garbage collection threads.
  1434  func (kl *Kubelet) StartGarbageCollection() {
  1435  	loggedContainerGCFailure := false
  1436  	go wait.Until(func() {
  1437  		ctx := context.Background()
  1438  		if err := kl.containerGC.GarbageCollect(ctx); err != nil {
  1439  			klog.ErrorS(err, "Container garbage collection failed")
  1440  			kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ContainerGCFailed, err.Error())
  1441  			loggedContainerGCFailure = true
  1442  		} else {
  1443  			var vLevel klog.Level = 4
  1444  			if loggedContainerGCFailure {
  1445  				vLevel = 1
  1446  				loggedContainerGCFailure = false
  1447  			}
  1448  
  1449  			klog.V(vLevel).InfoS("Container garbage collection succeeded")
  1450  		}
  1451  	}, ContainerGCPeriod, wait.NeverStop)
  1452  
  1453  	// when the high threshold is set to 100, and the max age is 0 (or the max age feature is disabled)
  1454  	// stub the image GC manager
  1455  	if kl.kubeletConfiguration.ImageGCHighThresholdPercent == 100 &&
  1456  		(!utilfeature.DefaultFeatureGate.Enabled(features.ImageMaximumGCAge) || kl.kubeletConfiguration.ImageMaximumGCAge.Duration == 0) {
  1457  		klog.V(2).InfoS("ImageGCHighThresholdPercent is set to 100 and ImageMaximumGCAge is 0, disabling image GC")
  1458  		return
  1459  	}
  1460  
  1461  	prevImageGCFailed := false
  1462  	beganGC := time.Now()
  1463  	go wait.Until(func() {
  1464  		ctx := context.Background()
  1465  		if err := kl.imageManager.GarbageCollect(ctx, beganGC); err != nil {
  1466  			if prevImageGCFailed {
  1467  				klog.ErrorS(err, "Image garbage collection failed multiple times in a row")
  1468  				// Only create an event for repeated failures
  1469  				kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ImageGCFailed, err.Error())
  1470  			} else {
  1471  				klog.ErrorS(err, "Image garbage collection failed once. Stats initialization may not have completed yet")
  1472  			}
  1473  			prevImageGCFailed = true
  1474  		} else {
  1475  			var vLevel klog.Level = 4
  1476  			if prevImageGCFailed {
  1477  				vLevel = 1
  1478  				prevImageGCFailed = false
  1479  			}
  1480  
  1481  			klog.V(vLevel).InfoS("Image garbage collection succeeded")
  1482  		}
  1483  	}, ImageGCPeriod, wait.NeverStop)
  1484  }
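
        // Both collection loops above follow the standard wait.Until pattern; a minimal sketch of that
        // pattern (illustrative only, using a hypothetical doWork function) looks like:
        //
        //	go wait.Until(func() {
        //		if err := doWork(context.Background()); err != nil {
        //			klog.ErrorS(err, "work failed")
        //		}
        //	}, period, wait.NeverStop)
        //
        // Passing wait.NeverStop means the loops run for the lifetime of the process.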
  1485  
  1486  // initializeModules will initialize internal modules that do not require the container runtime to be up.
  1487  // Note that the modules here must not depend on modules that are not initialized here.
  1488  func (kl *Kubelet) initializeModules() error {
  1489  	// Prometheus metrics.
  1490  	metrics.Register(
  1491  		collectors.NewVolumeStatsCollector(kl),
  1492  		collectors.NewLogMetricsCollector(kl.StatsProvider.ListPodStats),
  1493  	)
  1494  	metrics.SetNodeName(kl.nodeName)
  1495  	servermetrics.Register()
  1496  
  1497  	// Setup filesystem directories.
  1498  	if err := kl.setupDataDirs(); err != nil {
  1499  		return err
  1500  	}
  1501  
  1502  	// If the container logs directory does not exist, create it.
  1503  	if _, err := os.Stat(ContainerLogsDir); err != nil {
  1504  		if err := kl.os.MkdirAll(ContainerLogsDir, 0755); err != nil {
  1505  			return fmt.Errorf("failed to create directory %q: %v", ContainerLogsDir, err)
  1506  		}
  1507  	}
  1508  
  1509  	// Start the image manager.
  1510  	kl.imageManager.Start()
  1511  
  1512  	// Start the certificate manager if it was enabled.
  1513  	if kl.serverCertificateManager != nil {
  1514  		kl.serverCertificateManager.Start()
  1515  	}
  1516  
  1517  	// Start out of memory watcher.
  1518  	if kl.oomWatcher != nil {
  1519  		if err := kl.oomWatcher.Start(kl.nodeRef); err != nil {
  1520  			return fmt.Errorf("failed to start OOM watcher: %w", err)
  1521  		}
  1522  	}
  1523  
  1524  	// Start resource analyzer
  1525  	kl.resourceAnalyzer.Start()
  1526  
  1527  	return nil
  1528  }
  1529  
  1530  // initializeRuntimeDependentModules will initialize internal modules that require the container runtime to be up.
  1531  func (kl *Kubelet) initializeRuntimeDependentModules() {
  1532  	if err := kl.cadvisor.Start(); err != nil {
  1533  		// Fail kubelet and rely on the babysitter to retry starting kubelet.
  1534  		klog.ErrorS(err, "Failed to start cAdvisor")
  1535  		os.Exit(1)
  1536  	}
  1537  
  1538  	// trigger on-demand stats collection once so that we have capacity information for ephemeral storage.
  1539  	// ignore any errors, since if stats collection is not successful, the container manager will fail to start below.
  1540  	kl.StatsProvider.GetCgroupStats("/", true)
  1541  	// Start container manager.
  1542  	node, err := kl.getNodeAnyWay()
  1543  	if err != nil {
  1544  		// Fail kubelet and rely on the babysitter to retry starting kubelet.
  1545  		klog.ErrorS(err, "Kubelet failed to get node info")
  1546  		os.Exit(1)
  1547  	}
  1548  	// containerManager must start after cAdvisor because it needs filesystem capacity information
  1549  	if err := kl.containerManager.Start(node, kl.GetActivePods, kl.sourcesReady, kl.statusManager, kl.runtimeService, kl.supportLocalStorageCapacityIsolation()); err != nil {
  1550  		// Fail kubelet and rely on the babysitter to retry starting kubelet.
  1551  		klog.ErrorS(err, "Failed to start ContainerManager")
  1552  		os.Exit(1)
  1553  	}
  1554  	// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
  1555  	kl.evictionManager.Start(kl.StatsProvider, kl.GetActivePods, kl.PodIsFinished, evictionMonitoringPeriod)
  1556  
  1557  	// container log manager must start after container runtime is up to retrieve information from container runtime
  1558  	// and inform container to reopen log file after log rotation.
  1559  	kl.containerLogManager.Start()
  1560  	// Adding Registration Callback function for CSI Driver
  1561  	kl.pluginManager.AddHandler(pluginwatcherapi.CSIPlugin, plugincache.PluginHandler(csi.PluginHandler))
  1562  	// Adding Registration Callback function for DRA Plugin
  1563  	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
  1564  		kl.pluginManager.AddHandler(pluginwatcherapi.DRAPlugin, plugincache.PluginHandler(draplugin.NewRegistrationHandler(kl.kubeClient, kl.getNodeAnyWay)))
  1565  	}
  1566  	// Adding Registration Callback function for Device Manager
  1567  	kl.pluginManager.AddHandler(pluginwatcherapi.DevicePlugin, kl.containerManager.GetPluginRegistrationHandler())
  1568  
  1569  	// Start the plugin manager
  1570  	klog.V(4).InfoS("Starting plugin manager")
  1571  	go kl.pluginManager.Run(kl.sourcesReady, wait.NeverStop)
  1572  
  1573  	err = kl.shutdownManager.Start()
  1574  	if err != nil {
  1575  		// The shutdown manager is not critical for kubelet, so log failure, but don't block Kubelet startup if there was a failure starting it.
  1576  		klog.ErrorS(err, "Failed to start node shutdown manager")
  1577  	}
  1578  }
  1579  
  1580  // Run starts the kubelet reacting to config updates
  1581  func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) {
  1582  	ctx := context.Background()
  1583  	if kl.logServer == nil {
  1584  		file := http.FileServer(http.Dir(nodeLogDir))
  1585  		if utilfeature.DefaultFeatureGate.Enabled(features.NodeLogQuery) && kl.kubeletConfiguration.EnableSystemLogQuery {
  1586  			kl.logServer = http.StripPrefix("/logs/", http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
  1587  				if nlq, errs := newNodeLogQuery(req.URL.Query()); len(errs) > 0 {
  1588  					http.Error(w, errs.ToAggregate().Error(), http.StatusBadRequest)
  1589  					return
  1590  				} else if nlq != nil {
  1591  					if req.URL.Path != "/" && req.URL.Path != "" {
  1592  						http.Error(w, "path not allowed in query mode", http.StatusNotAcceptable)
  1593  						return
  1594  					}
  1595  					if errs := nlq.validate(); len(errs) > 0 {
  1596  						http.Error(w, errs.ToAggregate().Error(), http.StatusNotAcceptable)
  1597  						return
  1598  					}
  1599  					// Validation ensures that the request does not query services and files at the same time
  1600  					if len(nlq.Services) > 0 {
  1601  						journal.ServeHTTP(w, req)
  1602  						return
  1603  					}
  1604  					// Validation ensures that the request does not explicitly query multiple files at the same time
  1605  					if len(nlq.Files) == 1 {
  1606  						// Account for the \ being used on Windows clients
  1607  						req.URL.Path = filepath.ToSlash(nlq.Files[0])
  1608  					}
  1609  				}
  1610  				// Fall back in case the caller is directly trying to query a file
  1611  				// Example: kubectl get --raw /api/v1/nodes/$name/proxy/logs/foo.log
  1612  				file.ServeHTTP(w, req)
  1613  			}))
  1614  		} else {
  1615  			kl.logServer = http.StripPrefix("/logs/", file)
  1616  		}
  1617  	}
  1618  	if kl.kubeClient == nil {
  1619  		klog.InfoS("No API server defined - no node status update will be sent")
  1620  	}
  1621  
  1622  	// Start the cloud provider sync manager
  1623  	if kl.cloudResourceSyncManager != nil {
  1624  		go kl.cloudResourceSyncManager.Run(wait.NeverStop)
  1625  	}
  1626  
  1627  	if err := kl.initializeModules(); err != nil {
  1628  		kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.KubeletSetupFailed, err.Error())
  1629  		klog.ErrorS(err, "Failed to initialize internal modules")
  1630  		os.Exit(1)
  1631  	}
  1632  
  1633  	// Start volume manager
  1634  	go kl.volumeManager.Run(kl.sourcesReady, wait.NeverStop)
  1635  
  1636  	if kl.kubeClient != nil {
  1637  		// Start two go-routines to update the status.
  1638  		//
  1639  		// The first reports to the apiserver every nodeStatusUpdateFrequency and aims to provide regular status intervals,
  1640  		// while the second provides a more timely status update during initialization and runs a one-shot update to the apiserver
  1641  		// once the node becomes ready, then exits.
  1642  		//
  1643  		// Introduce some small jitter to ensure that over time the requests won't start
  1644  		// accumulating at approximately the same time across the set of nodes due to the priority and
  1645  		// fairness effect.
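        		// As a rough, illustrative calculation (not normative): with a 10s nodeStatusUpdateFrequency
        		// and the 0.04 jitter factor below, each status sync is scheduled roughly every 10.0-10.4s.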
  1646  		go wait.JitterUntil(kl.syncNodeStatus, kl.nodeStatusUpdateFrequency, 0.04, true, wait.NeverStop)
  1647  		go kl.fastStatusUpdateOnce()
  1648  
  1649  		// start syncing lease
  1650  		go kl.nodeLeaseController.Run(context.Background())
  1651  	}
  1652  	go wait.Until(kl.updateRuntimeUp, 5*time.Second, wait.NeverStop)
  1653  
  1654  	// Set up iptables util rules
  1655  	if kl.makeIPTablesUtilChains {
  1656  		kl.initNetworkUtil()
  1657  	}
  1658  
  1659  	// Start component sync loops.
  1660  	kl.statusManager.Start()
  1661  
  1662  	// Start syncing RuntimeClasses if enabled.
  1663  	if kl.runtimeClassManager != nil {
  1664  		kl.runtimeClassManager.Start(wait.NeverStop)
  1665  	}
  1666  
  1667  	// Start the pod lifecycle event generator.
  1668  	kl.pleg.Start()
  1669  
  1670  	// Start eventedPLEG only if EventedPLEG feature gate is enabled.
  1671  	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
  1672  		kl.eventedPleg.Start()
  1673  	}
  1674  
  1675  	kl.syncLoop(ctx, updates, kl)
  1676  }
  1677  
  1678  // SyncPod is the transaction script for the sync of a single pod (setting up
  1679  // a pod). This method is reentrant and expected to converge a pod towards the
  1680  // desired state of the spec. The reverse (teardown) is handled in
  1681  // SyncTerminatingPod and SyncTerminatedPod. If SyncPod exits without error,
  1682  // then the pod runtime state is in sync with the desired configuration state
  1683  // (pod is running). If SyncPod exits with a transient error, the next
  1684  // invocation of SyncPod is expected to make progress towards reaching the
  1685  // desired state. SyncPod exits with isTerminal when the pod was detected to
  1686  // have reached a terminal lifecycle phase due to container exits (for
  1687  // RestartNever or RestartOnFailure) and the next method invoked will be
  1688  // SyncTerminatingPod. If the pod terminates for any other reason, SyncPod
  1689  // will receive a context cancellation and should exit as soon as possible.
  1690  //
  1691  // Arguments:
  1692  //
  1693  // updateType - whether this is a create (first time) or an update, should
  1694  // only be used for metrics since this method must be reentrant
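        // (updateType is a kubetypes.SyncPodType; for example, kubetypes.SyncPodCreate marks the first
        // sync of a pod and is used below to record pod worker start latency)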
  1695  //
  1696  // pod - the pod that is being set up
  1697  //
  1698  // mirrorPod - the mirror pod known to the kubelet for this pod, if any
  1699  //
  1700  // podStatus - the most recent pod status observed for this pod which can
  1701  // be used to determine the set of actions that should be taken during
  1702  // this loop of SyncPod
  1703  //
  1704  // The workflow is:
  1705  //   - If the pod is being created, record pod worker start latency
  1706  //   - Call generateAPIPodStatus to prepare a v1.PodStatus for the pod
  1707  //   - If the pod is being seen as running for the first time, record pod
  1708  //     start latency
  1709  //   - Update the status of the pod in the status manager
  1710  //   - Stop the pod's containers if it should not be running due to soft
  1711  //     admission
  1712  //   - Ensure any background tracking for a runnable pod is started
  1713  //   - Create a mirror pod if the pod is a static pod, and does not
  1714  //     already have a mirror pod
  1715  //   - Create the data directories for the pod if they do not exist
  1716  //   - Wait for volumes to attach/mount
  1717  //   - Fetch the pull secrets for the pod
  1718  //   - Call the container runtime's SyncPod callback
  1719  //   - Update the traffic shaping for the pod's ingress and egress limits
  1720  //
  1721  // If any step of this workflow errors, the error is returned, and is repeated
  1722  // on the next SyncPod call.
  1723  //
  1724  // This operation writes all events that are dispatched in order to provide
  1725  // the most accurate information possible about an error situation to aid debugging.
  1726  // Callers should not write an event if this operation returns an error.
  1727  func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (isTerminal bool, err error) {
  1728  	ctx, otelSpan := kl.tracer.Start(ctx, "syncPod", trace.WithAttributes(
  1729  		semconv.K8SPodUIDKey.String(string(pod.UID)),
  1730  		attribute.String("k8s.pod", klog.KObj(pod).String()),
  1731  		semconv.K8SPodNameKey.String(pod.Name),
  1732  		attribute.String("k8s.pod.update_type", updateType.String()),
  1733  		semconv.K8SNamespaceNameKey.String(pod.Namespace),
  1734  	))
  1735  	klog.V(4).InfoS("SyncPod enter", "pod", klog.KObj(pod), "podUID", pod.UID)
  1736  	defer func() {
  1737  		klog.V(4).InfoS("SyncPod exit", "pod", klog.KObj(pod), "podUID", pod.UID, "isTerminal", isTerminal)
  1738  		otelSpan.End()
  1739  	}()
  1740  
  1741  	// Latency measurements for the main workflow are relative to the
  1742  	// first time the pod was seen by kubelet.
  1743  	var firstSeenTime time.Time
  1744  	if firstSeenTimeStr, ok := pod.Annotations[kubetypes.ConfigFirstSeenAnnotationKey]; ok {
  1745  		firstSeenTime = kubetypes.ConvertToTimestamp(firstSeenTimeStr).Get()
  1746  	}
  1747  
  1748  	// Record pod worker start latency if being created
  1749  	// TODO: make pod workers record their own latencies
  1750  	if updateType == kubetypes.SyncPodCreate {
  1751  		if !firstSeenTime.IsZero() {
  1752  			// This is the first time we are syncing the pod. Record the latency
  1753  			// since kubelet first saw the pod if firstSeenTime is set.
  1754  			metrics.PodWorkerStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime))
  1755  		} else {
  1756  			klog.V(3).InfoS("First seen time not recorded for pod",
  1757  				"podUID", pod.UID,
  1758  				"pod", klog.KObj(pod))
  1759  		}
  1760  	}
  1761  
  1762  	// Generate final API pod status with pod and status manager status
  1763  	apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, false)
  1764  	// The pod IP may be changed in generateAPIPodStatus if the pod is using host network. (See #24576)
  1765  	// TODO(random-liu): After writing pod spec into container labels, check whether pod is using host network, and
  1766  	// set pod IP to hostIP directly in runtime.GetPodStatus
  1767  	podStatus.IPs = make([]string, 0, len(apiPodStatus.PodIPs))
  1768  	for _, ipInfo := range apiPodStatus.PodIPs {
  1769  		podStatus.IPs = append(podStatus.IPs, ipInfo.IP)
  1770  	}
  1771  	if len(podStatus.IPs) == 0 && len(apiPodStatus.PodIP) > 0 {
  1772  		podStatus.IPs = []string{apiPodStatus.PodIP}
  1773  	}
  1774  
  1775  	// If the pod is terminal, we don't need to continue to set up the pod
  1776  	if apiPodStatus.Phase == v1.PodSucceeded || apiPodStatus.Phase == v1.PodFailed {
  1777  		kl.statusManager.SetPodStatus(pod, apiPodStatus)
  1778  		isTerminal = true
  1779  		return isTerminal, nil
  1780  	}
  1781  
  1782  	// If the pod should not be running, we request the pod's containers be stopped. This is not the same
  1783  	// as termination (we want to stop the pod, but potentially restart it later if soft admission
  1784  	// allows it). Set the status and phase appropriately.
  1785  	runnable := kl.canRunPod(pod)
  1786  	if !runnable.Admit {
  1787  		// Pod is not runnable; update the pod and container statuses to explain why.
  1788  		if apiPodStatus.Phase != v1.PodFailed && apiPodStatus.Phase != v1.PodSucceeded {
  1789  			apiPodStatus.Phase = v1.PodPending
  1790  		}
  1791  		apiPodStatus.Reason = runnable.Reason
  1792  		apiPodStatus.Message = runnable.Message
  1793  		// Containers that are still waiting are not being created; mark them as blocked.
  1794  		const waitingReason = "Blocked"
  1795  		for _, cs := range apiPodStatus.InitContainerStatuses {
  1796  			if cs.State.Waiting != nil {
  1797  				cs.State.Waiting.Reason = waitingReason
  1798  			}
  1799  		}
  1800  		for _, cs := range apiPodStatus.ContainerStatuses {
  1801  			if cs.State.Waiting != nil {
  1802  				cs.State.Waiting.Reason = waitingReason
  1803  			}
  1804  		}
  1805  	}
  1806  
  1807  	// Record the time it takes for the pod to become running
  1808  	// since kubelet first saw the pod if firstSeenTime is set.
  1809  	existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID)
  1810  	if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning &&
  1811  		!firstSeenTime.IsZero() {
  1812  		metrics.PodStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime))
  1813  	}
  1814  
  1815  	kl.statusManager.SetPodStatus(pod, apiPodStatus)
  1816  
  1817  	// Pods that are not runnable must be stopped - return a typed error to the pod worker
  1818  	if !runnable.Admit {
  1819  		klog.V(2).InfoS("Pod is not runnable and must have running containers stopped", "pod", klog.KObj(pod), "podUID", pod.UID, "message", runnable.Message)
  1820  		var syncErr error
  1821  		p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
  1822  		if err := kl.killPod(ctx, pod, p, nil); err != nil {
  1823  			if !wait.Interrupted(err) {
  1824  				kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
  1825  				syncErr = fmt.Errorf("error killing pod: %w", err)
  1826  				utilruntime.HandleError(syncErr)
  1827  			}
  1828  		} else {
  1829  			// There was no error killing the pod, but the pod cannot be run.
  1830  			// Return an error to signal that the sync loop should back off.
  1831  			syncErr = fmt.Errorf("pod cannot be run: %v", runnable.Message)
  1832  		}
  1833  		return false, syncErr
  1834  	}
  1835  
  1836  	// If the network plugin is not ready, only start the pod if it uses the host network
  1837  	if err := kl.runtimeState.networkErrors(); err != nil && !kubecontainer.IsHostNetworkPod(pod) {
  1838  		kl.recorder.Eventf(pod, v1.EventTypeWarning, events.NetworkNotReady, "%s: %v", NetworkNotReadyErrorMsg, err)
  1839  		return false, fmt.Errorf("%s: %v", NetworkNotReadyErrorMsg, err)
  1840  	}
  1841  
  1842  	// ensure the kubelet knows about referenced secrets or configmaps used by the pod
  1843  	if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {
  1844  		if kl.secretManager != nil {
  1845  			kl.secretManager.RegisterPod(pod)
  1846  		}
  1847  		if kl.configMapManager != nil {
  1848  			kl.configMapManager.RegisterPod(pod)
  1849  		}
  1850  	}
  1851  
  1852  	// Create Cgroups for the pod and apply resource parameters
  1853  	// to them if cgroups-per-qos flag is enabled.
  1854  	pcm := kl.containerManager.NewPodContainerManager()
  1855  	// If pod has already been terminated then we need not create
  1856  	// or update the pod's cgroup
  1857  	// TODO: once context cancellation is added this check can be removed
  1858  	if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {
  1859  		// When the kubelet is restarted with the cgroups-per-qos
  1860  		// flag enabled, all the pod's running containers
  1861  		// should be killed intermittently and brought back up
  1862  		// under the qos cgroup hierarchy.
  1863  		// Check if this is the pod's first sync
  1864  		firstSync := true
  1865  		for _, containerStatus := range apiPodStatus.ContainerStatuses {
  1866  			if containerStatus.State.Running != nil {
  1867  				firstSync = false
  1868  				break
  1869  			}
  1870  		}
  1871  		// Don't kill containers in the pod if the pod's cgroups already
  1872  		// exist or the pod is running for the first time
  1873  		podKilled := false
  1874  		if !pcm.Exists(pod) && !firstSync {
  1875  			p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
  1876  			if err := kl.killPod(ctx, pod, p, nil); err == nil {
  1877  				if wait.Interrupted(err) {
  1878  					return false, err
  1879  				}
  1880  				podKilled = true
  1881  			} else {
  1882  				klog.ErrorS(err, "KillPod failed", "pod", klog.KObj(pod), "podStatus", podStatus)
  1883  			}
  1884  		}
  1885  		// Create and update the pod's cgroups.
  1886  		// Don't create cgroups for a run-once pod if it was killed above.
  1887  		// The current policy is not to restart run-once pods when
  1888  		// the kubelet is restarted with the new flag, as run-once pods are
  1889  		// expected to run only once; if the kubelet is restarted, then
  1890  		// they are not expected to run again.
  1891  		// We don't create and apply cgroup updates if it's a run-once pod that was killed above.
  1892  		if !(podKilled && pod.Spec.RestartPolicy == v1.RestartPolicyNever) {
  1893  			if !pcm.Exists(pod) {
  1894  				if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
  1895  					klog.V(2).InfoS("Failed to update QoS cgroups while syncing pod", "pod", klog.KObj(pod), "err", err)
  1896  				}
  1897  				if err := pcm.EnsureExists(pod); err != nil {
  1898  					kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToCreatePodContainer, "unable to ensure pod container exists: %v", err)
  1899  					return false, fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err)
  1900  				}
  1901  			}
  1902  		}
  1903  	}
  1904  
  1905  	// Create Mirror Pod for Static Pod if it doesn't already exist
  1906  	if kubetypes.IsStaticPod(pod) {
  1907  		deleted := false
  1908  		if mirrorPod != nil {
  1909  			if mirrorPod.DeletionTimestamp != nil || !kubepod.IsMirrorPodOf(mirrorPod, pod) {
  1910  				// The mirror pod is semantically different from the static pod. Remove
  1911  				// it. The mirror pod will get recreated later.
  1912  				klog.InfoS("Trying to delete pod", "pod", klog.KObj(pod), "podUID", mirrorPod.ObjectMeta.UID)
  1913  				podFullName := kubecontainer.GetPodFullName(pod)
  1914  				var err error
  1915  				deleted, err = kl.mirrorPodClient.DeleteMirrorPod(podFullName, &mirrorPod.ObjectMeta.UID)
  1916  				if deleted {
  1917  					klog.InfoS("Deleted mirror pod because it is outdated", "pod", klog.KObj(mirrorPod))
  1918  				} else if err != nil {
  1919  					klog.ErrorS(err, "Failed deleting mirror pod", "pod", klog.KObj(mirrorPod))
  1920  				}
  1921  			}
  1922  		}
  1923  		if mirrorPod == nil || deleted {
  1924  			node, err := kl.GetNode()
  1925  			if err != nil {
  1926  				klog.V(4).ErrorS(err, "No need to create a mirror pod, since we failed to get node info from the cluster", "node", klog.KRef("", string(kl.nodeName)))
  1927  			} else if node.DeletionTimestamp != nil {
  1928  				klog.V(4).InfoS("No need to create a mirror pod, since node has been removed from the cluster", "node", klog.KRef("", string(kl.nodeName)))
  1929  			} else {
  1930  				klog.V(4).InfoS("Creating a mirror pod for static pod", "pod", klog.KObj(pod))
  1931  				if err := kl.mirrorPodClient.CreateMirrorPod(pod); err != nil {
  1932  					klog.ErrorS(err, "Failed creating a mirror pod for", "pod", klog.KObj(pod))
  1933  				}
  1934  			}
  1935  		}
  1936  	}
  1937  
  1938  	// Make data directories for the pod
  1939  	if err := kl.makePodDataDirs(pod); err != nil {
  1940  		kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToMakePodDataDirectories, "error making pod data directories: %v", err)
  1941  		klog.ErrorS(err, "Unable to make pod data directories for pod", "pod", klog.KObj(pod))
  1942  		return false, err
  1943  	}
  1944  
  1945  	// Wait for volumes to attach/mount
  1946  	if err := kl.volumeManager.WaitForAttachAndMount(ctx, pod); err != nil {
  1947  		if !wait.Interrupted(err) {
  1948  			kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedMountVolume, "Unable to attach or mount volumes: %v", err)
  1949  			klog.ErrorS(err, "Unable to attach or mount volumes for pod; skipping pod", "pod", klog.KObj(pod))
  1950  		}
  1951  		return false, err
  1952  	}
  1953  
  1954  	// Fetch the pull secrets for the pod
  1955  	pullSecrets := kl.getPullSecretsForPod(pod)
  1956  
  1957  	// Ensure the pod is being probed
  1958  	kl.probeManager.AddPod(pod)
  1959  
  1960  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
  1961  		// Handle pod resize here instead of doing it in HandlePodUpdates because
  1962  		// this conveniently retries any Deferred resize requests
  1963  		// TODO(vinaykul,InPlacePodVerticalScaling): Investigate doing this in HandlePodUpdates + periodic SyncLoop scan
  1964  		//     See: https://github.com/kubernetes/kubernetes/pull/102884#discussion_r663160060
  1965  		if kl.podWorkers.CouldHaveRunningContainers(pod.UID) && !kubetypes.IsStaticPod(pod) {
  1966  			pod = kl.handlePodResourcesResize(pod)
  1967  		}
  1968  	}
  1969  
  1970  	// TODO(#113606): use cancellation from the incoming context parameter, which comes from the pod worker.
  1971  	// Currently, using cancellation from that context causes test failures. To remove this WithoutCancel,
  1972  	// any wait.Interrupted errors need to be filtered from result and bypass the reasonCache - cancelling
  1973  	// the context for SyncPod is a known and deliberate error, not a generic error.
  1974  	// Use WithoutCancel instead of a new context.TODO() to propagate trace context
  1975  	// Call the container runtime's SyncPod callback
  1976  	sctx := context.WithoutCancel(ctx)
  1977  	result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.backOff)
  1978  	kl.reasonCache.Update(pod.UID, result)
  1979  	if err := result.Error(); err != nil {
  1980  		// Do not return error if the only failures were pods in backoff
  1981  		for _, r := range result.SyncResults {
  1982  			if r.Error != kubecontainer.ErrCrashLoopBackOff && r.Error != images.ErrImagePullBackOff {
  1983  				// Do not record an event here, as we keep all event logging for sync pod failures
  1984  				// local to container runtime, so we get better errors.
  1985  				return false, err
  1986  			}
  1987  		}
  1988  
  1989  		return false, nil
  1990  	}
  1991  
  1992  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && isPodResizeInProgress(pod, &apiPodStatus) {
  1993  		// While resize is in progress, periodically call PLEG to update pod cache
  1994  		runningPod := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
  1995  		if err, _ := kl.pleg.UpdateCache(&runningPod, pod.UID); err != nil {
  1996  			klog.ErrorS(err, "Failed to update pod cache", "pod", klog.KObj(pod))
  1997  			return false, err
  1998  		}
  1999  	}
  2000  
  2001  	return false, nil
  2002  }
  2003  
  2004  // SyncTerminatingPod is expected to terminate all running containers in a pod. Once this method
  2005  // returns without error, the pod is considered to be terminated and it will be safe to clean up any
  2006  // pod state that is tied to the lifetime of running containers. The next method invoked will be
  2007  // SyncTerminatedPod. This method is expected to return within the grace period provided, and the
  2008  // provided context may be cancelled if the duration is exceeded. The method may also be interrupted
  2009  // with a context cancellation if the grace period is shortened by the user or the kubelet (such as
  2010  // during eviction). This method is not guaranteed to be called if a pod is force deleted from the
  2011  // configuration and the kubelet is restarted - SyncTerminatingRuntimePod handles those orphaned
  2012  // pods.
  2013  func (kl *Kubelet) SyncTerminatingPod(_ context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error {
  2014  	// TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker.
  2015  	// Currently, using that context causes test failures.
  2016  	ctx, otelSpan := kl.tracer.Start(context.Background(), "syncTerminatingPod", trace.WithAttributes(
  2017  		semconv.K8SPodUIDKey.String(string(pod.UID)),
  2018  		attribute.String("k8s.pod", klog.KObj(pod).String()),
  2019  		semconv.K8SPodNameKey.String(pod.Name),
  2020  		semconv.K8SNamespaceNameKey.String(pod.Namespace),
  2021  	))
  2022  	defer otelSpan.End()
  2023  	klog.V(4).InfoS("SyncTerminatingPod enter", "pod", klog.KObj(pod), "podUID", pod.UID)
  2024  	defer klog.V(4).InfoS("SyncTerminatingPod exit", "pod", klog.KObj(pod), "podUID", pod.UID)
  2025  
  2026  	apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, false)
  2027  	if podStatusFn != nil {
  2028  		podStatusFn(&apiPodStatus)
  2029  	}
  2030  	kl.statusManager.SetPodStatus(pod, apiPodStatus)
  2031  
  2032  	if gracePeriod != nil {
  2033  		klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", *gracePeriod)
  2034  	} else {
  2035  		klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", nil)
  2036  	}
  2037  
  2038  	kl.probeManager.StopLivenessAndStartup(pod)
  2039  
  2040  	p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
  2041  	if err := kl.killPod(ctx, pod, p, gracePeriod); err != nil {
  2042  		kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
  2043  		// there was an error killing the pod, so we return that error directly
  2044  		utilruntime.HandleError(err)
  2045  		return err
  2046  	}
  2047  
  2048  	// Once the containers are stopped, we can stop probing for liveness and readiness.
  2049  	// TODO: once a pod is terminal, certain probes (liveness exec) could be stopped immediately after
  2050  	//   the detection of a container shutdown or (for readiness) after the first failure. Tracked as
  2051  	//   https://github.com/kubernetes/kubernetes/issues/107894 although may not be worth optimizing.
  2052  	kl.probeManager.RemovePod(pod)
  2053  
  2054  	// Guard against consistency issues in KillPod implementations by checking that there are no
  2055  	// running containers. This method is invoked infrequently so this is effectively free and can
  2056  	// catch race conditions introduced by callers updating pod status out of order.
  2057  	// TODO: have KillPod return the terminal status of stopped containers and write that into the
  2058  	//  cache immediately
  2059  	podStatus, err := kl.containerRuntime.GetPodStatus(ctx, pod.UID, pod.Name, pod.Namespace)
  2060  	if err != nil {
  2061  		klog.ErrorS(err, "Unable to read pod status prior to final pod termination", "pod", klog.KObj(pod), "podUID", pod.UID)
  2062  		return err
  2063  	}
  2064  	var runningContainers []string
  2065  	type container struct {
  2066  		Name       string
  2067  		State      string
  2068  		ExitCode   int
  2069  		FinishedAt string
  2070  	}
  2071  	var containers []container
  2072  	klogV := klog.V(4)
  2073  	klogVEnabled := klogV.Enabled()
  2074  	for _, s := range podStatus.ContainerStatuses {
  2075  		if s.State == kubecontainer.ContainerStateRunning {
  2076  			runningContainers = append(runningContainers, s.ID.String())
  2077  		}
  2078  		if klogVEnabled {
  2079  			containers = append(containers, container{Name: s.Name, State: string(s.State), ExitCode: s.ExitCode, FinishedAt: s.FinishedAt.UTC().Format(time.RFC3339Nano)})
  2080  		}
  2081  	}
  2082  	if klogVEnabled {
  2083  		sort.Slice(containers, func(i, j int) bool { return containers[i].Name < containers[j].Name })
  2084  		klog.V(4).InfoS("Post-termination container state", "pod", klog.KObj(pod), "podUID", pod.UID, "containers", containers)
  2085  	}
  2086  	if len(runningContainers) > 0 {
  2087  		return fmt.Errorf("detected running containers after a successful KillPod, CRI violation: %v", runningContainers)
  2088  	}
  2089  
  2090  	// NOTE: resources must be unprepared AFTER all containers have stopped
  2091  	// and BEFORE the pod status is changed on the API server
  2092  	// to avoid race conditions with the resource deallocation code in kubernetes core.
  2093  	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
  2094  		if err := kl.UnprepareDynamicResources(pod); err != nil {
  2095  			return err
  2096  		}
  2097  	}
  2098  
  2099  	// Compute and update the status in cache once the pods are no longer running.
  2100  	// The computation is done here to ensure the pod status used for it contains
  2101  	// information about the container end states (including exit codes) - when
  2102  	// SyncTerminatedPod is called the containers may already be removed.
  2103  	apiPodStatus = kl.generateAPIPodStatus(pod, podStatus, true)
  2104  	kl.statusManager.SetPodStatus(pod, apiPodStatus)
  2105  
  2106  	// we have successfully stopped all containers, the pod is terminating, our status is "done"
  2107  	klog.V(4).InfoS("Pod termination stopped all running containers", "pod", klog.KObj(pod), "podUID", pod.UID)
  2108  
  2109  	return nil
  2110  }
  2111  
  2112  // SyncTerminatingRuntimePod is expected to terminate running containers in a pod that we have no
  2113  // configuration for. Once this method returns without error, any remaining local state can be safely
  2114  // cleaned up by background processes in each subsystem. Unlike syncTerminatingPod, we lack
  2115  // knowledge of the full pod spec and so cannot perform lifecycle related operations, only ensure
  2116  // that the remnant of the running pod is terminated and allow garbage collection to proceed. We do
  2117  // not update the status of the pod because with the source of configuration removed, we have no
  2118  // place to send that status.
  2119  func (kl *Kubelet) SyncTerminatingRuntimePod(_ context.Context, runningPod *kubecontainer.Pod) error {
  2120  	// TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker.
  2121  	// Currently, using that context causes test failures.
  2122  	ctx := context.Background()
  2123  	pod := runningPod.ToAPIPod()
  2124  	klog.V(4).InfoS("SyncTerminatingRuntimePod enter", "pod", klog.KObj(pod), "podUID", pod.UID)
  2125  	defer klog.V(4).InfoS("SyncTerminatingRuntimePod exit", "pod", klog.KObj(pod), "podUID", pod.UID)
  2126  
  2127  	// we kill the pod directly since we have lost all other information about the pod.
  2128  	klog.V(4).InfoS("Orphaned running pod terminating without grace period", "pod", klog.KObj(pod), "podUID", pod.UID)
  2129  	// TODO: this should probably be zero, to bypass any waiting (needs fixes in container runtime)
  2130  	gracePeriod := int64(1)
  2131  	if err := kl.killPod(ctx, pod, *runningPod, &gracePeriod); err != nil {
  2132  		kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
  2133  		// there was an error killing the pod, so we return that error directly
  2134  		utilruntime.HandleError(err)
  2135  		return err
  2136  	}
  2137  	klog.V(4).InfoS("Pod termination stopped all running orphaned containers", "pod", klog.KObj(pod), "podUID", pod.UID)
  2138  	return nil
  2139  }
  2140  
  2141  // SyncTerminatedPod cleans up a pod that has terminated (has no running containers).
  2142  // The invocations in this call are expected to tear down all pod resources.
  2143  // When this method exits the pod is expected to be ready for cleanup. This method
  2144  // reduces the latency of pod cleanup but is not guaranteed to get called in all scenarios.
  2145  //
  2146  // Because the kubelet has no local store of information, all actions in this method that modify
  2147  // on-disk state must be reentrant and be garbage collected by HandlePodCleanups or a separate loop.
  2148  // This typically occurs when a pod is force deleted from configuration (local disk or API) and the
  2149  // kubelet restarts in the middle of the action.
  2150  func (kl *Kubelet) SyncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error {
  2151  	ctx, otelSpan := kl.tracer.Start(ctx, "syncTerminatedPod", trace.WithAttributes(
  2152  		semconv.K8SPodUIDKey.String(string(pod.UID)),
  2153  		attribute.String("k8s.pod", klog.KObj(pod).String()),
  2154  		semconv.K8SPodNameKey.String(pod.Name),
  2155  		semconv.K8SNamespaceNameKey.String(pod.Namespace),
  2156  	))
  2157  	defer otelSpan.End()
  2158  	klog.V(4).InfoS("SyncTerminatedPod enter", "pod", klog.KObj(pod), "podUID", pod.UID)
  2159  	defer klog.V(4).InfoS("SyncTerminatedPod exit", "pod", klog.KObj(pod), "podUID", pod.UID)
  2160  
  2161  	// generate the final status of the pod
  2162  	// TODO: should we simply fold this into TerminatePod? that would give a single pod update
  2163  	apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, true)
  2164  
  2165  	kl.statusManager.SetPodStatus(pod, apiPodStatus)
  2166  
  2167  	// volumes are unmounted after the pod worker reports ShouldPodRuntimeBeRemoved (which is satisfied
  2168  	// before syncTerminatedPod is invoked)
  2169  	if err := kl.volumeManager.WaitForUnmount(ctx, pod); err != nil {
  2170  		return err
  2171  	}
  2172  	klog.V(4).InfoS("Pod termination unmounted volumes", "pod", klog.KObj(pod), "podUID", pod.UID)
  2173  
  2174  	// This waiting loop relies on the background cleanup which starts after pod workers respond
  2175  	// true for ShouldPodRuntimeBeRemoved, which happens after `SyncTerminatingPod` is completed.
  2176  	if err := wait.PollUntilContextCancel(ctx, 100*time.Millisecond, true, func(ctx context.Context) (bool, error) {
  2177  		volumesExist := kl.podVolumesExist(pod.UID)
  2178  		if volumesExist {
  2179  			klog.V(3).InfoS("Pod is terminated, but some volumes have not been cleaned up", "pod", klog.KObj(pod), "podUID", pod.UID)
  2180  		}
  2181  		return !volumesExist, nil
  2182  	}); err != nil {
  2183  		return err
  2184  	}
  2185  	klog.V(3).InfoS("Pod termination cleaned up volume paths", "pod", klog.KObj(pod), "podUID", pod.UID)
  2186  
  2187  	// After volume unmount is complete, let the secret and configmap managers know we're done with this pod
  2188  	if kl.secretManager != nil {
  2189  		kl.secretManager.UnregisterPod(pod)
  2190  	}
  2191  	if kl.configMapManager != nil {
  2192  		kl.configMapManager.UnregisterPod(pod)
  2193  	}
  2194  
  2195  	// Note: we leave pod containers to be reclaimed in the background since dockershim requires the
  2196  	// container for retrieving logs and we want to make sure logs are available until the pod is
  2197  	// physically deleted.
  2198  
  2199  	// remove any cgroups in the hierarchy for pods that are no longer running.
  2200  	if kl.cgroupsPerQOS {
  2201  		pcm := kl.containerManager.NewPodContainerManager()
  2202  		name, _ := pcm.GetPodContainerName(pod)
  2203  		if err := pcm.Destroy(name); err != nil {
  2204  			return err
  2205  		}
  2206  		klog.V(4).InfoS("Pod termination removed cgroups", "pod", klog.KObj(pod), "podUID", pod.UID)
  2207  	}
  2208  
  2209  	kl.usernsManager.Release(pod.UID)
  2210  
  2211  	// mark the final pod status
  2212  	kl.statusManager.TerminatePod(pod)
  2213  	klog.V(4).InfoS("Pod is terminated and will need no more status updates", "pod", klog.KObj(pod), "podUID", pod.UID)
  2214  
  2215  	return nil
  2216  }
  2217  
  2218  // Get pods which should be resynchronized. Currently, the following pods should be resynchronized:
  2219  //   - pods whose work is ready.
  2220  //   - pods for which internal modules have requested a sync.
  2221  //
  2222  // This method does not return orphaned pods (those known only to the pod worker that may have
  2223  // been deleted from configuration). Those pods are synced by HandlePodCleanups as a consequence
  2224  // of driving the state machine to completion.
  2225  //
  2226  // TODO: Consider synchronizing all pods which have not recently been acted on to be resilient
  2227  // to bugs that might prevent updates from being delivered (such as the previous bug with
  2228  // orphaned pods). Instead of asking the work queue for pending work, consider asking the
  2229  // PodWorker which pods should be synced.
  2230  func (kl *Kubelet) getPodsToSync() []*v1.Pod {
  2231  	allPods := kl.podManager.GetPods()
  2232  	podUIDs := kl.workQueue.GetWork()
  2233  	podUIDSet := sets.NewString()
  2234  	for _, podUID := range podUIDs {
  2235  		podUIDSet.Insert(string(podUID))
  2236  	}
  2237  	var podsToSync []*v1.Pod
  2238  	for _, pod := range allPods {
  2239  		if podUIDSet.Has(string(pod.UID)) {
  2240  			// The work of the pod is ready
  2241  			podsToSync = append(podsToSync, pod)
  2242  			continue
  2243  		}
  2244  		for _, podSyncLoopHandler := range kl.PodSyncLoopHandlers {
  2245  			if podSyncLoopHandler.ShouldSync(pod) {
  2246  				podsToSync = append(podsToSync, pod)
  2247  				break
  2248  			}
  2249  		}
  2250  	}
  2251  	return podsToSync
  2252  }
  2253  
  2254  // deletePod deletes the pod from the internal state of the kubelet by
  2255  // signaling the associated pod worker to kill the pod asynchronously
  2256  // (an UpdatePod call with the SyncPodKill update type).
  2257  //
  2258  // deletePod returns an error if the pod is nil or if not all sources are
  2259  // ready yet.
  2260  func (kl *Kubelet) deletePod(pod *v1.Pod) error {
  2261  	if pod == nil {
  2262  		return fmt.Errorf("deletePod does not allow nil pod")
  2263  	}
  2264  	if !kl.sourcesReady.AllReady() {
  2265  		// If the sources aren't ready, skip deletion, as we may accidentally delete pods
  2266  		// for sources that haven't reported yet.
  2267  		return fmt.Errorf("skipping delete because sources aren't ready yet")
  2268  	}
  2269  	klog.V(3).InfoS("Pod has been deleted and must be killed", "pod", klog.KObj(pod), "podUID", pod.UID)
  2270  	kl.podWorkers.UpdatePod(UpdatePodOptions{
  2271  		Pod:        pod,
  2272  		UpdateType: kubetypes.SyncPodKill,
  2273  	})
  2274  	// We leave the volume/directory cleanup to the periodic cleanup routine.
  2275  	return nil
  2276  }
  2277  
  2278  // rejectPod records an event about the pod with the given reason and message,
  2279  // and updates the pod to the failed phase in the status manager.
  2280  func (kl *Kubelet) rejectPod(pod *v1.Pod, reason, message string) {
  2281  	kl.recorder.Eventf(pod, v1.EventTypeWarning, reason, message)
  2282  	kl.statusManager.SetPodStatus(pod, v1.PodStatus{
  2283  		Phase:   v1.PodFailed,
  2284  		Reason:  reason,
  2285  		Message: "Pod was rejected: " + message})
  2286  }
  2287  
  2288  // canAdmitPod determines if a pod can be admitted, and gives a reason if it
  2289  // cannot. "pod" is the new pod, while "pods" are all the admitted pods.
  2290  // The function returns a boolean value indicating whether the pod
  2291  // can be admitted, a brief single-word reason, and a message explaining why
  2292  // the pod cannot be admitted.
  2293  func (kl *Kubelet) canAdmitPod(pods []*v1.Pod, pod *v1.Pod) (bool, string, string) {
  2294  	// The kubelet will invoke each pod admit handler in sequence;
  2295  	// if any handler rejects, the pod is rejected.
  2296  	// TODO: move out of disk check into a pod admitter
  2297  	// TODO: out of resource eviction should have a pod admitter call-out
  2298  	attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: pods}
  2299  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
  2300  		// Use allocated resources values from checkpoint store (source of truth) to determine fit
  2301  		otherPods := make([]*v1.Pod, 0, len(pods))
  2302  		for _, p := range pods {
  2303  			op := p.DeepCopy()
  2304  			kl.updateContainerResourceAllocation(op)
  2305  
  2306  			otherPods = append(otherPods, op)
  2307  		}
  2308  		attrs.OtherPods = otherPods
  2309  	}
  2310  	for _, podAdmitHandler := range kl.admitHandlers {
  2311  		if result := podAdmitHandler.Admit(attrs); !result.Admit {
  2312  			return false, result.Reason, result.Message
  2313  		}
  2314  	}
  2315  
  2316  	return true, "", ""
  2317  }
  2318  
  2319  func (kl *Kubelet) canRunPod(pod *v1.Pod) lifecycle.PodAdmitResult {
  2320  	attrs := &lifecycle.PodAdmitAttributes{Pod: pod}
  2321  	// Get "OtherPods". Rejected pods are failed, so only include admitted pods that are alive.
  2322  	attrs.OtherPods = kl.GetActivePods()
  2323  
  2324  	for _, handler := range kl.softAdmitHandlers {
  2325  		if result := handler.Admit(attrs); !result.Admit {
  2326  			return result
  2327  		}
  2328  	}
  2329  
  2330  	return lifecycle.PodAdmitResult{Admit: true}
  2331  }
  2332  
  2333  // syncLoop is the main loop for processing changes. It watches for changes from
  2334  // three channels (file, apiserver, and http) and creates a union of them. For
  2335  // any new change seen, it runs a sync against the desired state and the running
  2336  // state. If no changes are seen to the configuration, it synchronizes the last
  2337  // known desired state every sync-frequency seconds. It never returns.
  2338  func (kl *Kubelet) syncLoop(ctx context.Context, updates <-chan kubetypes.PodUpdate, handler SyncHandler) {
  2339  	klog.InfoS("Starting kubelet main sync loop")
  2340  	// The syncTicker wakes up the kubelet to check if there are any pod workers
  2341  	// that need to be synced. A one-second period is sufficient because the
  2342  	// sync interval is defaulted to 10s.
  2343  	syncTicker := time.NewTicker(time.Second)
  2344  	defer syncTicker.Stop()
  2345  	housekeepingTicker := time.NewTicker(housekeepingPeriod)
  2346  	defer housekeepingTicker.Stop()
  2347  	plegCh := kl.pleg.Watch()
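        	// Back off pod synchronization exponentially when the runtime reports errors:
        	// start at base, double the wait on each failure, and cap it at max (see the loop below).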
  2348  	const (
  2349  		base   = 100 * time.Millisecond
  2350  		max    = 5 * time.Second
  2351  		factor = 2
  2352  	)
  2353  	duration := base
  2354  	// Responsible for checking limits in resolv.conf.
  2355  	// The limits do not have anything to do with individual pods.
  2356  	// Since this is called in syncLoop, we don't need to call it anywhere else.
  2357  	if kl.dnsConfigurer != nil && kl.dnsConfigurer.ResolverConfig != "" {
  2358  		kl.dnsConfigurer.CheckLimitsForResolvConf()
  2359  	}
  2360  
  2361  	for {
  2362  		if err := kl.runtimeState.runtimeErrors(); err != nil {
  2363  			klog.ErrorS(err, "Skipping pod synchronization")
  2364  			// exponential backoff
  2365  			time.Sleep(duration)
  2366  			duration = time.Duration(math.Min(float64(max), factor*float64(duration)))
  2367  			continue
  2368  		}
  2369  		// reset backoff if we have a success
  2370  		duration = base
  2371  
  2372  		kl.syncLoopMonitor.Store(kl.clock.Now())
  2373  		if !kl.syncLoopIteration(ctx, updates, handler, syncTicker.C, housekeepingTicker.C, plegCh) {
  2374  			break
  2375  		}
  2376  		kl.syncLoopMonitor.Store(kl.clock.Now())
  2377  	}
  2378  }
  2379  
  2380  // syncLoopIteration reads from various channels and dispatches pods to the
  2381  // given handler.
  2382  //
  2383  // Arguments:
  2384  // 1.  configCh:       a channel to read config events from
  2385  // 2.  handler:        the SyncHandler to dispatch pods to
  2386  // 3.  syncCh:         a channel to read periodic sync events from
  2387  // 4.  housekeepingCh: a channel to read housekeeping events from
  2388  // 5.  plegCh:         a channel to read PLEG updates from
  2389  //
  2390  // Events are also read from the kubelet liveness manager's update channel.
  2391  //
  2392  // The workflow is to read from one of the channels, handle that event, and
  2393  // update the timestamp in the sync loop monitor.
  2394  //
  2395  // Here is an appropriate place to note that despite the syntactical
  2396  // similarity to the switch statement, the case statements in a select are
  2397  // evaluated in a pseudorandom order if there are multiple channels ready to
  2398  // read from when the select is evaluated.  In other words, case statements
  2399  // are evaluated in random order, and you cannot assume that the case
  2400  // statements evaluate in order if multiple channels have events.
  2401  //
  2402  // With that in mind, in truly no particular order, the different channels
  2403  // are handled as follows:
  2404  //
  2405  //   - configCh: dispatch the pods for the config change to the appropriate
  2406  //     handler callback for the event type
  2407  //   - plegCh: update the runtime cache; sync pod
  2408  //   - syncCh: sync all pods waiting for sync
  2409  //   - housekeepingCh: trigger cleanup of pods
  2410  //   - health manager: sync pods that have failed or in which one or more
  2411  //     containers have failed health checks
  2412  func (kl *Kubelet) syncLoopIteration(ctx context.Context, configCh <-chan kubetypes.PodUpdate, handler SyncHandler,
  2413  	syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool {
  2414  	select {
  2415  	case u, open := <-configCh:
  2416  		// Update from a config source; dispatch it to the right handler
  2417  		// callback.
  2418  		if !open {
  2419  			klog.ErrorS(nil, "Update channel is closed, exiting the sync loop")
  2420  			return false
  2421  		}
  2422  
  2423  		switch u.Op {
  2424  		case kubetypes.ADD:
  2425  			klog.V(2).InfoS("SyncLoop ADD", "source", u.Source, "pods", klog.KObjSlice(u.Pods))
  2426  			// After restarting, kubelet will get all existing pods through
  2427  			// ADD as if they are new pods. These pods will then go through the
  2428  			// admission process and *may* be rejected. This can be resolved
  2429  			// once we have checkpointing.
  2430  			handler.HandlePodAdditions(u.Pods)
  2431  		case kubetypes.UPDATE:
  2432  			klog.V(2).InfoS("SyncLoop UPDATE", "source", u.Source, "pods", klog.KObjSlice(u.Pods))
  2433  			handler.HandlePodUpdates(u.Pods)
  2434  		case kubetypes.REMOVE:
  2435  			klog.V(2).InfoS("SyncLoop REMOVE", "source", u.Source, "pods", klog.KObjSlice(u.Pods))
  2436  			handler.HandlePodRemoves(u.Pods)
  2437  		case kubetypes.RECONCILE:
  2438  			klog.V(4).InfoS("SyncLoop RECONCILE", "source", u.Source, "pods", klog.KObjSlice(u.Pods))
  2439  			handler.HandlePodReconcile(u.Pods)
  2440  		case kubetypes.DELETE:
  2441  			klog.V(2).InfoS("SyncLoop DELETE", "source", u.Source, "pods", klog.KObjSlice(u.Pods))
  2442  			// DELETE is treated as an UPDATE because of graceful deletion.
  2443  			handler.HandlePodUpdates(u.Pods)
  2444  		case kubetypes.SET:
  2445  			// TODO: Do we want to support this?
  2446  			klog.ErrorS(nil, "Kubelet does not support snapshot update")
  2447  		default:
  2448  			klog.ErrorS(nil, "Invalid operation type received", "operation", u.Op)
  2449  		}
  2450  
  2451  		kl.sourcesReady.AddSource(u.Source)
  2452  
  2453  	case e := <-plegCh:
  2454  		if isSyncPodWorthy(e) {
  2455  			// PLEG event for a pod; sync it.
  2456  			if pod, ok := kl.podManager.GetPodByUID(e.ID); ok {
  2457  				klog.V(2).InfoS("SyncLoop (PLEG): event for pod", "pod", klog.KObj(pod), "event", e)
  2458  				handler.HandlePodSyncs([]*v1.Pod{pod})
  2459  			} else {
  2460  				// If the pod no longer exists, ignore the event.
  2461  				klog.V(4).InfoS("SyncLoop (PLEG): pod does not exist, ignore irrelevant event", "event", e)
  2462  			}
  2463  		}
  2464  
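        		// A ContainerDied event carries the exited container's ID; use it to prune dead
        		// containers for the pod, subject to the configured retention policy.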
  2465  		if e.Type == pleg.ContainerDied {
  2466  			if containerID, ok := e.Data.(string); ok {
  2467  				kl.cleanUpContainersInPod(e.ID, containerID)
  2468  			}
  2469  		}
  2470  	case <-syncCh:
  2471  		// Sync pods waiting for sync
  2472  		podsToSync := kl.getPodsToSync()
  2473  		if len(podsToSync) == 0 {
  2474  			break
  2475  		}
  2476  		klog.V(4).InfoS("SyncLoop (SYNC) pods", "total", len(podsToSync), "pods", klog.KObjSlice(podsToSync))
  2477  		handler.HandlePodSyncs(podsToSync)
  2478  	case update := <-kl.livenessManager.Updates():
  2479  		if update.Result == proberesults.Failure {
  2480  			handleProbeSync(kl, update, handler, "liveness", "unhealthy")
  2481  		}
  2482  	case update := <-kl.readinessManager.Updates():
  2483  		ready := update.Result == proberesults.Success
  2484  		kl.statusManager.SetContainerReadiness(update.PodUID, update.ContainerID, ready)
  2485  
  2486  		status := ""
  2487  		if ready {
  2488  			status = "ready"
  2489  		}
  2490  		handleProbeSync(kl, update, handler, "readiness", status)
  2491  	case update := <-kl.startupManager.Updates():
  2492  		started := update.Result == proberesults.Success
  2493  		kl.statusManager.SetContainerStartup(update.PodUID, update.ContainerID, started)
  2494  
  2495  		status := "unhealthy"
  2496  		if started {
  2497  			status = "started"
  2498  		}
  2499  		handleProbeSync(kl, update, handler, "startup", status)
  2500  	case <-housekeepingCh:
  2501  		if !kl.sourcesReady.AllReady() {
  2502  			// If the sources aren't ready, skip housekeeping, as we may accidentally
  2503  			// delete pods from unready sources.
  2504  			klog.V(4).InfoS("SyncLoop (housekeeping, skipped): sources aren't ready yet")
  2505  		} else {
  2506  			start := time.Now()
  2507  			klog.V(4).InfoS("SyncLoop (housekeeping)")
  2508  			if err := handler.HandlePodCleanups(ctx); err != nil {
  2509  				klog.ErrorS(err, "Failed cleaning pods")
  2510  			}
  2511  			duration := time.Since(start)
  2512  			if duration > housekeepingWarningDuration {
  2513  				klog.ErrorS(fmt.Errorf("housekeeping took too long"), "Housekeeping took longer than expected", "expected", housekeepingWarningDuration, "actual", duration.Round(time.Millisecond))
  2514  			}
  2515  			klog.V(4).InfoS("SyncLoop (housekeeping) end", "duration", duration.Round(time.Millisecond))
  2516  		}
  2517  	}
  2518  	return true
  2519  }
  2520  
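        // handleProbeSync looks up the pod referenced by a probe result update and, if it
        // still exists, dispatches it to the handler for an immediate sync. The probe and
        // status arguments are used only for logging.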
  2521  func handleProbeSync(kl *Kubelet, update proberesults.Update, handler SyncHandler, probe, status string) {
  2522  	// We should not use the pod cached by the probe manager, because it is never updated after initialization.
  2523  	pod, ok := kl.podManager.GetPodByUID(update.PodUID)
  2524  	if !ok {
  2525  		// If the pod no longer exists, ignore the update.
  2526  		klog.V(4).InfoS("SyncLoop (probe): ignore irrelevant update", "probe", probe, "status", status, "update", update)
  2527  		return
  2528  	}
  2529  	klog.V(1).InfoS("SyncLoop (probe)", "probe", probe, "status", status, "pod", klog.KObj(pod))
  2530  	handler.HandlePodSyncs([]*v1.Pod{pod})
  2531  }
  2532  
  2533  // HandlePodAdditions is the callback in SyncHandler for pods being added from
  2534  // a config source.
  2535  func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
  2536  	start := kl.clock.Now()
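        	// Process the pods in creation order so that older pods are admitted first.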
  2537  	sort.Sort(sliceutils.PodsByCreationTime(pods))
  2538  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
  2539  		kl.podResizeMutex.Lock()
  2540  		defer kl.podResizeMutex.Unlock()
  2541  	}
  2542  	for _, pod := range pods {
  2543  		existingPods := kl.podManager.GetPods()
  2544  		// Always add the pod to the pod manager. Kubelet relies on the pod
  2545  		// manager as the source of truth for the desired state. If a pod does
  2546  		// not exist in the pod manager, it means that it has been deleted in
  2547  		// the apiserver and no action (other than cleanup) is required.
  2548  		kl.podManager.AddPod(pod)
  2549  
  2550  		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  2551  		if wasMirror {
  2552  			if pod == nil {
  2553  				klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID)
  2554  				continue
  2555  			}
  2556  			kl.podWorkers.UpdatePod(UpdatePodOptions{
  2557  				Pod:        pod,
  2558  				MirrorPod:  mirrorPod,
  2559  				UpdateType: kubetypes.SyncPodUpdate,
  2560  				StartTime:  start,
  2561  			})
  2562  			continue
  2563  		}
  2564  
  2565  		// Only go through the admission process if the pod is not requested
  2566  		// for termination by another part of the kubelet. If the pod is already
  2567  		// using resources (previously admitted), the pod worker is going to be
  2568  		// shutting it down. If the pod hasn't started yet, we know that when
  2569  		// the pod worker is invoked it will also avoid setting up the pod, so
  2570  		// we simply avoid doing any work.
  2571  		if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {
  2572  			// We failed the pods that we rejected, so activePods includes all admitted
  2573  			// pods that are alive.
  2574  			activePods := kl.filterOutInactivePods(existingPods)
  2575  
  2576  			if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
  2577  				// To handle kubelet restarts, test pod admissibility using AllocatedResources values
  2578  				// (for cpu & memory) from checkpoint store. If found, that is the source of truth.
  2579  				podCopy := pod.DeepCopy()
  2580  				kl.updateContainerResourceAllocation(podCopy)
  2581  
  2582  				// Check if we can admit the pod; if not, reject it.
  2583  				if ok, reason, message := kl.canAdmitPod(activePods, podCopy); !ok {
  2584  					kl.rejectPod(pod, reason, message)
  2585  					continue
  2586  				}
  2587  				// For new pod, checkpoint the resource values at which the Pod has been admitted
  2588  				if err := kl.statusManager.SetPodAllocation(podCopy); err != nil {
  2589  					//TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate
  2590  					klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(pod))
  2591  				}
  2592  			} else {
  2593  				// Check if we can admit the pod; if not, reject it.
  2594  				if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok {
  2595  					kl.rejectPod(pod, reason, message)
  2596  					continue
  2597  				}
  2598  			}
  2599  		}
  2600  		kl.podWorkers.UpdatePod(UpdatePodOptions{
  2601  			Pod:        pod,
  2602  			MirrorPod:  mirrorPod,
  2603  			UpdateType: kubetypes.SyncPodCreate,
  2604  			StartTime:  start,
  2605  		})
  2606  	}
  2607  }
  2608  
  2609  // updateContainerResourceAllocation overwrites the container resource requests
  2610  // (for cpu & memory) with the allocated values recorded in the checkpoint store.
  2611  func (kl *Kubelet) updateContainerResourceAllocation(pod *v1.Pod) {
  2612  	for _, c := range pod.Spec.Containers {
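        		// c is a copy of the container spec, but Resources.Requests is a map, so the
        		// assignments below update the pod's spec in place.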
  2613  		allocatedResources, found := kl.statusManager.GetContainerResourceAllocation(string(pod.UID), c.Name)
  2614  		if c.Resources.Requests != nil && found {
  2615  			if _, ok := allocatedResources[v1.ResourceCPU]; ok {
  2616  				c.Resources.Requests[v1.ResourceCPU] = allocatedResources[v1.ResourceCPU]
  2617  			}
  2618  			if _, ok := allocatedResources[v1.ResourceMemory]; ok {
  2619  				c.Resources.Requests[v1.ResourceMemory] = allocatedResources[v1.ResourceMemory]
  2620  			}
  2621  		}
  2622  	}
  2623  }
  2624  
  2625  // HandlePodUpdates is the callback in the SyncHandler interface for pods
  2626  // being updated from a config source.
  2627  func (kl *Kubelet) HandlePodUpdates(pods []*v1.Pod) {
  2628  	start := kl.clock.Now()
  2629  	for _, pod := range pods {
  2630  		kl.podManager.UpdatePod(pod)
  2631  
  2632  		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  2633  		if wasMirror {
  2634  			if pod == nil {
  2635  				klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID)
  2636  				continue
  2637  			}
  2638  		}
  2639  
  2640  		kl.podWorkers.UpdatePod(UpdatePodOptions{
  2641  			Pod:        pod,
  2642  			MirrorPod:  mirrorPod,
  2643  			UpdateType: kubetypes.SyncPodUpdate,
  2644  			StartTime:  start,
  2645  		})
  2646  	}
  2647  }
  2648  
  2649  // HandlePodRemoves is the callback in the SyncHandler interface for pods
  2650  // being removed from a config source.
  2651  func (kl *Kubelet) HandlePodRemoves(pods []*v1.Pod) {
  2652  	start := kl.clock.Now()
  2653  	for _, pod := range pods {
  2654  		kl.podManager.RemovePod(pod)
  2655  
  2656  		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  2657  		if wasMirror {
  2658  			if pod == nil {
  2659  				klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID)
  2660  				continue
  2661  			}
  2662  			kl.podWorkers.UpdatePod(UpdatePodOptions{
  2663  				Pod:        pod,
  2664  				MirrorPod:  mirrorPod,
  2665  				UpdateType: kubetypes.SyncPodUpdate,
  2666  				StartTime:  start,
  2667  			})
  2668  			continue
  2669  		}
  2670  
  2671  		// Deletion is allowed to fail because the periodic cleanup routine
  2672  		// will trigger deletion again.
  2673  		if err := kl.deletePod(pod); err != nil {
  2674  			klog.V(2).InfoS("Failed to delete pod", "pod", klog.KObj(pod), "err", err)
  2675  		}
  2676  	}
  2677  }
  2678  
  2679  // HandlePodReconcile is the callback in the SyncHandler interface for pods
  2680  // that should be reconciled. Pods are reconciled when only the status of the
  2681  // pod is updated in the API.
  2682  func (kl *Kubelet) HandlePodReconcile(pods []*v1.Pod) {
  2683  	start := kl.clock.Now()
  2684  	for _, pod := range pods {
  2685  		// Update the pod in the pod manager; the status manager will periodically reconcile
  2686  		// based on the pod manager's state.
  2687  		kl.podManager.UpdatePod(pod)
  2688  
  2689  		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  2690  		if wasMirror {
  2691  			if pod == nil {
  2692  				klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID)
  2693  				continue
  2694  			}
  2695  			// Static pods should be reconciled the same way as regular pods
  2696  		}
  2697  
  2698  		// TODO: reconcile being calculated in the config manager is questionable, and avoiding
  2699  		// extra syncs may no longer be necessary. Reevaluate whether Reconcile and Sync can be
  2700  		// merged (after resolving the next two TODOs).
  2701  
  2702  		// Reconcile Pod "Ready" condition if necessary. Trigger sync pod for reconciliation.
  2703  		// TODO: this should be unnecessary today - determine why this differs from Sync,
  2704  		// or whether there is a better place for it. For instance, we have
  2705  		// needsReconcile in kubelet/config, here, and in status_manager.
  2706  		if status.NeedToReconcilePodReadiness(pod) {
  2707  			kl.podWorkers.UpdatePod(UpdatePodOptions{
  2708  				Pod:        pod,
  2709  				MirrorPod:  mirrorPod,
  2710  				UpdateType: kubetypes.SyncPodSync,
  2711  				StartTime:  start,
  2712  			})
  2713  		}
  2714  
  2715  		// After an evicted pod is synced, all dead containers in the pod can be removed.
  2716  		// TODO: this is questionable - status read is async and during eviction we already
  2717  		// expect to not have some container info. The pod worker knows whether a pod has
  2718  		// been evicted, so if this is about minimizing the time to react to an eviction we
  2719  		// can do better. If it's about preserving pod status info we can also do better.
  2720  		if eviction.PodIsEvicted(pod.Status) {
  2721  			if podStatus, err := kl.podCache.Get(pod.UID); err == nil {
  2722  				kl.containerDeletor.deleteContainersInPod("", podStatus, true)
  2723  			}
  2724  		}
  2725  	}
  2726  }
  2727  
  2728  // HandlePodSyncs is the callback in the syncHandler interface for pods
  2729  // that should be dispatched to pod workers for sync.
  2730  func (kl *Kubelet) HandlePodSyncs(pods []*v1.Pod) {
  2731  	start := kl.clock.Now()
  2732  	for _, pod := range pods {
  2733  		pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
  2734  		if wasMirror {
  2735  			if pod == nil {
  2736  				klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID)
  2737  				continue
  2738  			}
  2739  			// Syncing a mirror pod is a programmer error since the intent of sync is to
  2740  			// batch notify all pending work. We should make it impossible to double sync,
  2741  			// but for now log a programmer error to prevent accidental introduction.
  2742  			klog.V(3).InfoS("Programmer error, HandlePodSyncs does not expect to receive mirror pods", "podUID", pod.UID, "mirrorPodUID", mirrorPod.UID)
  2743  			continue
  2744  		}
  2745  		kl.podWorkers.UpdatePod(UpdatePodOptions{
  2746  			Pod:        pod,
  2747  			MirrorPod:  mirrorPod,
  2748  			UpdateType: kubetypes.SyncPodSync,
  2749  			StartTime:  start,
  2750  		})
  2751  	}
  2752  }
  2753  
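        // isPodResizeInProgress reports whether any container's desired resources still
        // differ from what the runtime reports as actuated: either the spec limits do not
        // match the actuated limits, or the allocated resources do not match the actuated requests.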
  2754  func isPodResizeInProgress(pod *v1.Pod, podStatus *v1.PodStatus) bool {
  2755  	for _, c := range pod.Spec.Containers {
  2756  		if cs, ok := podutil.GetContainerStatus(podStatus.ContainerStatuses, c.Name); ok {
  2757  			if cs.Resources == nil {
  2758  				continue
  2759  			}
  2760  			if !cmp.Equal(c.Resources.Limits, cs.Resources.Limits) || !cmp.Equal(cs.AllocatedResources, cs.Resources.Requests) {
  2761  				return true
  2762  			}
  2763  		}
  2764  	}
  2765  	return false
  2766  }
  2767  
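        // canResizePod checks whether a requested resize fits on the node. It returns
        // whether the resize can be actuated, a copy of the pod updated with the newly
        // allocated resources, and the resulting resize status (Infeasible when the request
        // exceeds the node's allocatable resources, Deferred when admission fails, or
        // InProgress when the resize is accepted).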
  2768  func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus) {
  2769  	var otherActivePods []*v1.Pod
  2770  
  2771  	node, err := kl.getNodeAnyWay()
  2772  	if err != nil {
  2773  		klog.ErrorS(err, "getNodeAnyWay function failed")
  2774  		return false, nil, ""
  2775  	}
  2776  	podCopy := pod.DeepCopy()
  2777  	cpuAvailable := node.Status.Allocatable.Cpu().MilliValue()
  2778  	memAvailable := node.Status.Allocatable.Memory().Value()
  2779  	cpuRequests := resource.GetResourceRequest(podCopy, v1.ResourceCPU)
  2780  	memRequests := resource.GetResourceRequest(podCopy, v1.ResourceMemory)
  2781  	if cpuRequests > cpuAvailable || memRequests > memAvailable {
  2782  		klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", podCopy.Name)
  2783  		return false, podCopy, v1.PodResizeStatusInfeasible
  2784  	}
  2785  
  2786  	// Treat the existing pod needing resize as a new pod with the desired resources seeking admission.
  2787  	// If the desired resources don't fit, the pod continues to run with its currently allocated resources.
  2788  	activePods := kl.GetActivePods()
  2789  	for _, p := range activePods {
  2790  		if p.UID != pod.UID {
  2791  			otherActivePods = append(otherActivePods, p)
  2792  		}
  2793  	}
  2794  
  2795  	if ok, failReason, failMessage := kl.canAdmitPod(otherActivePods, podCopy); !ok {
  2796  		// Log reason and return. Let the next sync iteration retry the resize
  2797  		klog.V(3).InfoS("Resize cannot be accommodated", "pod", podCopy.Name, "reason", failReason, "message", failMessage)
  2798  		return false, podCopy, v1.PodResizeStatusDeferred
  2799  	}
  2800  
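        	// The resize was admitted: record the desired requests as the allocated
        	// resources in the pod copy's container statuses.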
  2801  	for _, container := range podCopy.Spec.Containers {
  2802  		idx, found := podutil.GetIndexOfContainerStatus(podCopy.Status.ContainerStatuses, container.Name)
  2803  		if found {
  2804  			for rName, rQuantity := range container.Resources.Requests {
  2805  				podCopy.Status.ContainerStatuses[idx].AllocatedResources[rName] = rQuantity
  2806  			}
  2807  		}
  2808  	}
  2809  	return true, podCopy, v1.PodResizeStatusInProgress
  2810  }
  2811  
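        // handlePodResourcesResize detects whether a running pod's container requests differ
        // from the resources recorded as allocated, and if so attempts the resize: the new
        // allocation and resize status are checkpointed via the status manager and the updated
        // pod copy is returned; otherwise the original pod is returned unchanged.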
  2812  func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) *v1.Pod {
  2813  	if pod.Status.Phase != v1.PodRunning {
  2814  		return pod
  2815  	}
  2816  	podResized := false
  2817  	for _, container := range pod.Spec.Containers {
  2818  		if len(container.Resources.Requests) == 0 {
  2819  			continue
  2820  		}
  2821  		containerStatus, found := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name)
  2822  		if !found {
  2823  			klog.V(5).InfoS("ContainerStatus not found", "pod", pod.Name, "container", container.Name)
  2824  			break
  2825  		}
  2826  		if len(containerStatus.AllocatedResources) != len(container.Resources.Requests) {
  2827  			klog.V(5).InfoS("ContainerStatus.AllocatedResources length mismatch", "pod", pod.Name, "container", container.Name)
  2828  			break
  2829  		}
  2830  		if !cmp.Equal(container.Resources.Requests, containerStatus.AllocatedResources) {
  2831  			podResized = true
  2832  			break
  2833  		}
  2834  	}
  2835  	if !podResized {
  2836  		return pod
  2837  	}
  2838  
  2839  	kl.podResizeMutex.Lock()
  2840  	defer kl.podResizeMutex.Unlock()
  2841  	fit, updatedPod, resizeStatus := kl.canResizePod(pod)
  2842  	if updatedPod == nil {
  2843  		return pod
  2844  	}
  2845  	if fit {
  2846  		// Update pod resource allocation checkpoint
  2847  		if err := kl.statusManager.SetPodAllocation(updatedPod); err != nil {
  2848  			//TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate
  2849  			klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(updatedPod))
  2850  			return pod
  2851  		}
  2852  	}
  2853  	if resizeStatus != "" {
  2854  		// Save resize decision to checkpoint
  2855  		if err := kl.statusManager.SetPodResizeStatus(updatedPod.UID, resizeStatus); err != nil {
  2856  			//TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate
  2857  			klog.ErrorS(err, "SetPodResizeStatus failed", "pod", klog.KObj(updatedPod))
  2858  			return pod
  2859  		}
  2860  		updatedPod.Status.Resize = resizeStatus
  2861  	}
  2862  	kl.podManager.UpdatePod(updatedPod)
  2863  	kl.statusManager.SetPodStatus(updatedPod, updatedPod.Status)
  2864  	return updatedPod
  2865  }
  2866  
  2867  // LatestLoopEntryTime returns the last time in the sync loop monitor.
  2868  func (kl *Kubelet) LatestLoopEntryTime() time.Time {
  2869  	val := kl.syncLoopMonitor.Load()
  2870  	if val == nil {
  2871  		return time.Time{}
  2872  	}
  2873  	return val.(time.Time)
  2874  }
  2875  
  2876  // updateRuntimeUp calls the container runtime status callback, initializing
  2877  // the runtime dependent modules when the container runtime first comes up,
  2878  // and logs an error if the status check fails.  If the status check is OK,
  2879  // it updates the container runtime uptime in the kubelet runtimeState.
  2880  func (kl *Kubelet) updateRuntimeUp() {
  2881  	kl.updateRuntimeMux.Lock()
  2882  	defer kl.updateRuntimeMux.Unlock()
  2883  	ctx := context.Background()
  2884  
  2885  	s, err := kl.containerRuntime.Status(ctx)
  2886  	if err != nil {
  2887  		klog.ErrorS(err, "Container runtime sanity check failed")
  2888  		return
  2889  	}
  2890  	if s == nil {
  2891  		klog.ErrorS(nil, "Container runtime status is nil")
  2892  		return
  2893  	}
  2894  	// Periodically log the whole runtime status for debugging.
  2895  	klog.V(4).InfoS("Container runtime status", "status", s)
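        	// Before the runtime is expected to be ready, demote "not ready" errors to
        	// V(4) to avoid noisy logs during kubelet startup.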
  2896  	klogErrorS := klog.ErrorS
  2897  	if !kl.containerRuntimeReadyExpected {
  2898  		klogErrorS = klog.V(4).ErrorS
  2899  	}
  2900  	networkReady := s.GetRuntimeCondition(kubecontainer.NetworkReady)
  2901  	if networkReady == nil || !networkReady.Status {
  2902  		klogErrorS(nil, "Container runtime network not ready", "networkReady", networkReady)
  2903  		kl.runtimeState.setNetworkState(fmt.Errorf("container runtime network not ready: %v", networkReady))
  2904  	} else {
  2905  		// Set nil if the container runtime network is ready.
  2906  		kl.runtimeState.setNetworkState(nil)
  2907  	}
  2908  	// information in RuntimeReady condition will be propagated to NodeReady condition.
  2909  	runtimeReady := s.GetRuntimeCondition(kubecontainer.RuntimeReady)
  2910  	// If RuntimeReady is not set or is false, report an error.
  2911  	if runtimeReady == nil || !runtimeReady.Status {
  2912  		klogErrorS(nil, "Container runtime not ready", "runtimeReady", runtimeReady)
  2913  		kl.runtimeState.setRuntimeState(fmt.Errorf("container runtime not ready: %v", runtimeReady))
  2914  		return
  2915  	}
  2916  
  2917  	kl.runtimeState.setRuntimeState(nil)
  2918  	kl.runtimeState.setRuntimeHandlers(s.Handlers)
  2919  	kl.oneTimeInitializer.Do(kl.initializeRuntimeDependentModules)
  2920  	kl.runtimeState.setRuntimeSync(kl.clock.Now())
  2921  }
  2922  
  2923  // GetConfiguration returns the KubeletConfiguration used to configure the kubelet.
  2924  func (kl *Kubelet) GetConfiguration() kubeletconfiginternal.KubeletConfiguration {
  2925  	return kl.kubeletConfiguration
  2926  }
  2927  
  2928  // BirthCry sends an event that the kubelet has started up.
  2929  func (kl *Kubelet) BirthCry() {
  2930  	// Make an event that kubelet restarted.
  2931  	kl.recorder.Eventf(kl.nodeRef, v1.EventTypeNormal, events.StartingKubelet, "Starting kubelet.")
  2932  }
  2933  
  2934  // ResyncInterval returns the interval used for periodic syncs.
  2935  func (kl *Kubelet) ResyncInterval() time.Duration {
  2936  	return kl.resyncInterval
  2937  }
  2938  
  2939  // ListenAndServe runs the kubelet HTTP server.
  2940  func (kl *Kubelet) ListenAndServe(kubeCfg *kubeletconfiginternal.KubeletConfiguration, tlsOptions *server.TLSOptions,
  2941  	auth server.AuthInterface, tp trace.TracerProvider) {
  2942  	server.ListenAndServeKubeletServer(kl, kl.resourceAnalyzer, kubeCfg, tlsOptions, auth, tp)
  2943  }
  2944  
  2945  // ListenAndServeReadOnly runs the kubelet HTTP server in read-only mode.
  2946  func (kl *Kubelet) ListenAndServeReadOnly(address net.IP, port uint, tp trace.TracerProvider) {
  2947  	server.ListenAndServeKubeletReadOnlyServer(kl, kl.resourceAnalyzer, address, port, tp)
  2948  }
  2949  
  2950  // ListenAndServePodResources runs the kubelet podresources gRPC service.
  2951  func (kl *Kubelet) ListenAndServePodResources() {
  2952  	endpoint, err := util.LocalEndpoint(kl.getPodResourcesDir(), podresources.Socket)
  2953  	if err != nil {
  2954  		klog.V(2).InfoS("Failed to get local endpoint for PodResources endpoint", "err", err)
  2955  		return
  2956  	}
  2957  
  2958  	providers := podresources.PodResourcesProviders{
  2959  		Pods:             kl.podManager,
  2960  		Devices:          kl.containerManager,
  2961  		Cpus:             kl.containerManager,
  2962  		Memory:           kl.containerManager,
  2963  		DynamicResources: kl.containerManager,
  2964  	}
  2965  
  2966  	server.ListenAndServePodResources(endpoint, providers)
  2967  }
  2968  
  2969  // Delete the eligible dead container instances in a pod. Depending on the configuration, the latest dead containers may be kept around.
  2970  func (kl *Kubelet) cleanUpContainersInPod(podID types.UID, exitedContainerID string) {
  2971  	if podStatus, err := kl.podCache.Get(podID); err == nil {
  2972  		// When an evicted or deleted pod has already synced, all containers can be removed.
  2973  		removeAll := kl.podWorkers.ShouldPodContentBeRemoved(podID)
  2974  		kl.containerDeletor.deleteContainersInPod(exitedContainerID, podStatus, removeAll)
  2975  	}
  2976  }
  2977  
  2978  // fastStatusUpdateOnce starts a loop that checks if the current state of kubelet + container runtime
  2979  // would be able to turn the node ready, and syncs the ready state to the apiserver as soon as possible.
  2980  // The function returns after the node status has been updated following such an event, or when the
  2981  // node is already ready. It is executed only during kubelet startup, which improves the latency to a
  2982  // ready node by updating the kubelet state, runtime status, and node status ASAP.
  2983  func (kl *Kubelet) fastStatusUpdateOnce() {
  2984  	ctx := context.Background()
  2985  	start := kl.clock.Now()
  2986  	stopCh := make(chan struct{})
  2987  
  2988  	// Keep trying to make a fast node status update until either the timeout is reached or an update succeeds.
  2989  	wait.Until(func() {
  2990  		// fastNodeStatusUpdate returns true when it succeeds or when the grace period has expired
  2991  		// (status was not updated within nodeReadyGracePeriod and the second argument below gets true),
  2992  		// then we close the channel and abort the loop.
  2993  		if kl.fastNodeStatusUpdate(ctx, kl.clock.Since(start) >= nodeReadyGracePeriod) {
  2994  			close(stopCh)
  2995  		}
  2996  	}, 100*time.Millisecond, stopCh)
  2997  }
  2998  
  2999  // CheckpointContainer tries to checkpoint a container. The parameters are used to
  3000  // look up the specified container. If the container specified by the given parameters
  3001  // cannot be found, an error is returned. If the container is found, the container
  3002  // engine will be asked to checkpoint the given container into the kubelet's default
  3003  // checkpoint directory.
  3004  func (kl *Kubelet) CheckpointContainer(
  3005  	ctx context.Context,
  3006  	podUID types.UID,
  3007  	podFullName,
  3008  	containerName string,
  3009  	options *runtimeapi.CheckpointContainerRequest,
  3010  ) error {
  3011  	container, err := kl.findContainer(ctx, podFullName, podUID, containerName)
  3012  	if err != nil {
  3013  		return err
  3014  	}
  3015  	if container == nil {
  3016  		return fmt.Errorf("container %v not found", containerName)
  3017  	}
  3018  
  3019  	options.Location = filepath.Join(
  3020  		kl.getCheckpointsDir(),
  3021  		fmt.Sprintf(
  3022  			"checkpoint-%s-%s-%s.tar",
  3023  			podFullName,
  3024  			containerName,
  3025  			time.Now().Format(time.RFC3339),
  3026  		),
  3027  	)
  3028  
  3029  	options.ContainerId = string(container.ID.ID)
  3030  
  3031  	if err := kl.containerRuntime.CheckpointContainer(ctx, options); err != nil {
  3032  		return err
  3033  	}
  3034  
  3035  	return nil
  3036  }
  3037  
  3038  // ListMetricDescriptors gets the descriptors for the metrics that will be returned in ListPodSandboxMetrics.
  3039  func (kl *Kubelet) ListMetricDescriptors(ctx context.Context) ([]*runtimeapi.MetricDescriptor, error) {
  3040  	return kl.containerRuntime.ListMetricDescriptors(ctx)
  3041  }
  3042  
  3043  // ListPodSandboxMetrics retrieves the metrics for all pod sandboxes.
  3044  func (kl *Kubelet) ListPodSandboxMetrics(ctx context.Context) ([]*runtimeapi.PodSandboxMetrics, error) {
  3045  	return kl.containerRuntime.ListPodSandboxMetrics(ctx)
  3046  }
  3047  
  3048  func (kl *Kubelet) supportLocalStorageCapacityIsolation() bool {
  3049  	return kl.GetConfiguration().LocalStorageCapacityIsolation
  3050  }
  3051  
  3052  // isSyncPodWorthy filters out events that are not worthy of pod syncing
  3053  func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool {
  3054  	// ContainerRemoved doesn't affect pod state
  3055  	return event.Type != pleg.ContainerRemoved
  3056  }
  3057  
  3058  // PrepareDynamicResources calls the container Manager PrepareDynamicResources API
  3059  // This method implements the RuntimeHelper interface
  3060  func (kl *Kubelet) PrepareDynamicResources(pod *v1.Pod) error {
  3061  	return kl.containerManager.PrepareDynamicResources(pod)
  3062  }
  3063  
  3064  // UnprepareDynamicResources calls the container Manager UnprepareDynamicResources API
  3065  // This method implements the RuntimeHelper interface
  3066  func (kl *Kubelet) UnprepareDynamicResources(pod *v1.Pod) error {
  3067  	return kl.containerManager.UnprepareDynamicResources(pod)
  3068  }