/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kubelet

import (
	"context"
	"crypto/tls"
	"fmt"
	"math"
	"net"
	"net/http"
	"os"
	"path/filepath"
	sysruntime "runtime"
	"sort"
	"sync"
	"sync/atomic"
	"time"

	cadvisorapi "github.com/google/cadvisor/info/v1"
	"github.com/google/go-cmp/cmp"
	libcontaineruserns "github.com/opencontainers/runc/libcontainer/userns"
	"github.com/opencontainers/selinux/go-selinux"
	"go.opentelemetry.io/otel/attribute"
	semconv "go.opentelemetry.io/otel/semconv/v1.12.0"
	"go.opentelemetry.io/otel/trace"
	"k8s.io/client-go/informers"

	"k8s.io/mount-utils"
	"k8s.io/utils/integer"
	netutils "k8s.io/utils/net"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	clientset "k8s.io/client-go/kubernetes"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/certificate"
	"k8s.io/client-go/util/flowcontrol"
	cloudprovider "k8s.io/cloud-provider"
	"k8s.io/component-helpers/apimachinery/lease"
	internalapi "k8s.io/cri-api/pkg/apis"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	pluginwatcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/api/v1/resource"
	"k8s.io/kubernetes/pkg/features"
	kubeletconfiginternal "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
	"k8s.io/kubernetes/pkg/kubelet/cadvisor"
	kubeletcertificate "k8s.io/kubernetes/pkg/kubelet/certificate"
	"k8s.io/kubernetes/pkg/kubelet/cloudresource"
	"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	draplugin "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
	"k8s.io/kubernetes/pkg/kubelet/config"
	"k8s.io/kubernetes/pkg/kubelet/configmap"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/cri/remote"
	"k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
	"k8s.io/kubernetes/pkg/kubelet/images"
	"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/logs"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/kubelet/metrics/collectors"
	"k8s.io/kubernetes/pkg/kubelet/network/dns"
	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
	oomwatcher "k8s.io/kubernetes/pkg/kubelet/oom"
	"k8s.io/kubernetes/pkg/kubelet/pleg"
	"k8s.io/kubernetes/pkg/kubelet/pluginmanager"
	plugincache "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
	"k8s.io/kubernetes/pkg/kubelet/preemption"
	"k8s.io/kubernetes/pkg/kubelet/prober"
	proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
	"k8s.io/kubernetes/pkg/kubelet/runtimeclass"
	"k8s.io/kubernetes/pkg/kubelet/secret"
	"k8s.io/kubernetes/pkg/kubelet/server"
	servermetrics "k8s.io/kubernetes/pkg/kubelet/server/metrics"
	serverstats "k8s.io/kubernetes/pkg/kubelet/server/stats"
	"k8s.io/kubernetes/pkg/kubelet/stats"
	"k8s.io/kubernetes/pkg/kubelet/status"
	"k8s.io/kubernetes/pkg/kubelet/sysctl"
	"k8s.io/kubernetes/pkg/kubelet/token"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/kubernetes/pkg/kubelet/userns"
	"k8s.io/kubernetes/pkg/kubelet/util"
	"k8s.io/kubernetes/pkg/kubelet/util/manager"
	"k8s.io/kubernetes/pkg/kubelet/util/queue"
	"k8s.io/kubernetes/pkg/kubelet/util/sliceutils"
	"k8s.io/kubernetes/pkg/kubelet/volumemanager"
	httpprobe "k8s.io/kubernetes/pkg/probe/http"
	"k8s.io/kubernetes/pkg/security/apparmor"
	"k8s.io/kubernetes/pkg/util/oom"
	"k8s.io/kubernetes/pkg/volume"
	"k8s.io/kubernetes/pkg/volume/csi"
	"k8s.io/kubernetes/pkg/volume/util/hostutil"
	"k8s.io/kubernetes/pkg/volume/util/subpath"
	"k8s.io/kubernetes/pkg/volume/util/volumepathhandler"
	"k8s.io/utils/clock"
)

const (
	// maxWaitForContainerRuntime is the max amount of time to wait for the
	// container runtime to come up. NewMainKubelet passes it to newRuntimeState.
	maxWaitForContainerRuntime = 30 * time.Second

	// nodeStatusUpdateRetry specifies how many times kubelet retries when posting node status failed.
	nodeStatusUpdateRetry = 5

	// nodeReadyGracePeriod is the period to allow for before fast status update is
	// terminated and container runtime not being ready is logged without verbosity guard.
	nodeReadyGracePeriod = 120 * time.Second

	// DefaultContainerLogsDir is the location of container logs.
	DefaultContainerLogsDir = "/var/log/containers"

	// MaxContainerBackOff is the max backoff period, exported for the e2e test.
	// NewMainKubelet uses it as the cap of the backoffs it creates with
	// flowcontrol.NewBackOff.
	MaxContainerBackOff = 300 * time.Second

	// housekeepingPeriod is the period for performing global cleanup tasks.
	housekeepingPeriod = time.Second * 2

	// housekeepingWarningDuration is the duration at which housekeeping failed
	// to satisfy the invariant that housekeeping should be fast to avoid
	// blocking pod config (while housekeeping is running no new pods are
	// started or deleted).
	housekeepingWarningDuration = time.Second * 1

	// runtimeCacheRefreshPeriod is the period after which the runtime cache
	// expires - set to slightly longer than the expected length between
	// housekeeping periods, which explicitly refreshes the cache. With the
	// values above it evaluates to 3 seconds.
	runtimeCacheRefreshPeriod = housekeepingPeriod + housekeepingWarningDuration

	// evictionMonitoringPeriod is the period for performing eviction monitoring.
	// Ensure this is kept in sync with internal cadvisor housekeeping.
	evictionMonitoringPeriod = time.Second * 10

	// The path in containers' filesystems where the hosts file is mounted,
	// one constant per operating system; getContainerEtcHostsPath picks
	// between them.
	linuxEtcHostsPath   = "/etc/hosts"
	windowsEtcHostsPath = "C:\\Windows\\System32\\drivers\\etc\\hosts"

	// plegChannelCapacity is the capacity of the channel for receiving pod
	// lifecycle events. This number is a bit arbitrary and may be adjusted in
	// the future.
	plegChannelCapacity = 1000

	// Generic PLEG relies on relisting for discovering container events.
	// A longer period means that kubelet will take longer to detect container
	// changes and to update pod status. On the other hand, a shorter period
	// will cause more frequent relisting (e.g., container runtime operations),
	// leading to higher cpu usage.
	// Note that even though we set the period to 1s, the relisting itself can
	// take more than 1s to finish if the container runtime responds slowly
	// and/or when there are many container changes in one cycle.
	genericPlegRelistPeriod    = time.Second * 1
	genericPlegRelistThreshold = time.Minute * 3

	// Generic PLEG relist period and threshold when used with Evented PLEG,
	// plus the retry count that NewMainKubelet hands to pleg.NewEventedPLEG.
	eventedPlegRelistPeriod     = time.Second * 300
	eventedPlegRelistThreshold  = time.Minute * 10
	eventedPlegMaxStreamRetries = 5

	// backOffPeriod is the period to back off when pod syncing results in an
	// error. It is also used as the base period for the exponential backoff
	// container restarts and image pulls.
	backOffPeriod = time.Second * 10

	// ContainerGCPeriod is the period for performing container garbage collection.
	ContainerGCPeriod = time.Minute
	// ImageGCPeriod is the period for performing image garbage collection.
	ImageGCPeriod = 5 * time.Minute

	// minDeadContainerInPod is the minimum number of dead containers to keep
	// in a pod. NewMainKubelet uses it as the lower bound for the pod
	// container deletor's per-pod limit.
	minDeadContainerInPod = 1

	// nodeLeaseRenewIntervalFraction is the fraction of lease duration to renew the lease.
	nodeLeaseRenewIntervalFraction = 0.25

	// instrumentationScope is the name of OpenTelemetry instrumentation scope.
	instrumentationScope = "k8s.io/kubernetes/pkg/kubelet"
)

var (
	// ContainerLogsDir can be overwritten for testing usage.
	ContainerLogsDir = DefaultContainerLogsDir
	// etcHostsPath is the hosts-file path inside containers, chosen once at
	// package initialization by getContainerEtcHostsPath.
	etcHostsPath = getContainerEtcHostsPath()
)

// getContainerEtcHostsPath returns windowsEtcHostsPath when sysruntime.GOOS is
// "windows" and linuxEtcHostsPath for every other operating system.
func getContainerEtcHostsPath() string {
	if sysruntime.GOOS == "windows" {
		return windowsEtcHostsPath
	}
	return linuxEtcHostsPath
}

// SyncHandler is an interface implemented by Kubelet, for testability.
// Every method except HandlePodCleanups receives a batch of pods;
// HandlePodCleanups takes a context and is the only method that reports an
// error.
type SyncHandler interface {
	HandlePodAdditions(pods []*v1.Pod)
	HandlePodUpdates(pods []*v1.Pod)
	HandlePodRemoves(pods []*v1.Pod)
	HandlePodReconcile(pods []*v1.Pod)
	HandlePodSyncs(pods []*v1.Pod)
	HandlePodCleanups(ctx context.Context) error
}

// Option is a functional option type for Kubelet. NewMainKubelet applies the
// options listed in Dependencies.Options to the Kubelet it builds.
type Option func(*Kubelet)

// Bootstrap is a bootstrapping interface for kubelet, targets the initialization protocol.
type Bootstrap interface {
	GetConfiguration() kubeletconfiginternal.KubeletConfiguration
	BirthCry()
	StartGarbageCollection()
	ListenAndServe(kubeCfg *kubeletconfiginternal.KubeletConfiguration, tlsOptions *server.TLSOptions, auth server.AuthInterface, tp trace.TracerProvider)
	ListenAndServeReadOnly(address net.IP, port uint)
	ListenAndServePodResources()
	Run(<-chan kubetypes.PodUpdate)
	RunOnce(<-chan kubetypes.PodUpdate) ([]RunPodResult, error)
}

// Dependencies is a bin for things we might consider "injected dependencies" -- objects constructed
// at runtime that are necessary for running the Kubelet. This is a temporary solution for grouping
// these objects while we figure out a more comprehensive dependency injection story for the Kubelet.
//
// Notes on how the constructor code in this file treats some fields:
//   - KubeClient may be nil; NewMainKubelet then runs in standalone mode
//     (no API server).
//   - PodConfig is built by makePodSourceConfig when it is left nil.
//   - ProbeManager falls back to prober.NewManager when it is left nil.
//   - RemoteRuntimeService, RemoteImageService and useLegacyCadvisorStats are
//     filled in by PreInitRuntimeService.
type Dependencies struct {
	Options []Option

	// Injected Dependencies
	Auth                      server.AuthInterface
	CAdvisorInterface         cadvisor.Interface
	Cloud                     cloudprovider.Interface
	ContainerManager          cm.ContainerManager
	EventClient               v1core.EventsGetter
	HeartbeatClient           clientset.Interface
	OnHeartbeatFailure        func()
	KubeClient                clientset.Interface
	Mounter                   mount.Interface
	HostUtil                  hostutil.HostUtils
	OOMAdjuster               *oom.OOMAdjuster
	OSInterface               kubecontainer.OSInterface
	PodConfig                 *config.PodConfig
	ProbeManager              prober.Manager
	Recorder                  record.EventRecorder
	Subpather                 subpath.Interface
	TracerProvider            trace.TracerProvider
	VolumePlugins             []volume.VolumePlugin
	DynamicPluginProber       volume.DynamicPluginProber
	TLSOptions                *server.TLSOptions
	RemoteRuntimeService      internalapi.RuntimeService
	RemoteImageService        internalapi.ImageManagerService
	PodStartupLatencyTracker  util.PodStartupLatencyTracker
	NodeStartupLatencyTracker util.NodeStartupLatencyTracker
	// useLegacyCadvisorStats caches the result of
	// cadvisor.UsingLegacyCadvisorStats for the configured runtime endpoint.
	// Remove it after cadvisor.UsingLegacyCadvisorStats is dropped.
	useLegacyCadvisorStats bool
}

// makePodSourceConfig creates a config.PodConfig from the given
// KubeletConfiguration or returns an error.
283 func makePodSourceConfig(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies, nodeName types.NodeName, nodeHasSynced func() bool) (*config.PodConfig, error) { 284 manifestURLHeader := make(http.Header) 285 if len(kubeCfg.StaticPodURLHeader) > 0 { 286 for k, v := range kubeCfg.StaticPodURLHeader { 287 for i := range v { 288 manifestURLHeader.Add(k, v[i]) 289 } 290 } 291 } 292 293 // source of all configuration 294 cfg := config.NewPodConfig(config.PodConfigNotificationIncremental, kubeDeps.Recorder, kubeDeps.PodStartupLatencyTracker) 295 296 // TODO: it needs to be replaced by a proper context in the future 297 ctx := context.TODO() 298 299 // define file config source 300 if kubeCfg.StaticPodPath != "" { 301 klog.InfoS("Adding static pod path", "path", kubeCfg.StaticPodPath) 302 config.NewSourceFile(kubeCfg.StaticPodPath, nodeName, kubeCfg.FileCheckFrequency.Duration, cfg.Channel(ctx, kubetypes.FileSource)) 303 } 304 305 // define url config source 306 if kubeCfg.StaticPodURL != "" { 307 klog.InfoS("Adding pod URL with HTTP header", "URL", kubeCfg.StaticPodURL, "header", manifestURLHeader) 308 config.NewSourceURL(kubeCfg.StaticPodURL, manifestURLHeader, nodeName, kubeCfg.HTTPCheckFrequency.Duration, cfg.Channel(ctx, kubetypes.HTTPSource)) 309 } 310 311 if kubeDeps.KubeClient != nil { 312 klog.InfoS("Adding apiserver pod source") 313 config.NewSourceApiserver(kubeDeps.KubeClient, nodeName, nodeHasSynced, cfg.Channel(ctx, kubetypes.ApiserverSource)) 314 } 315 return cfg, nil 316 } 317 318 // PreInitRuntimeService will init runtime service before RunKubelet. 
319 func PreInitRuntimeService(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies) error { 320 remoteImageEndpoint := kubeCfg.ImageServiceEndpoint 321 if remoteImageEndpoint == "" && kubeCfg.ContainerRuntimeEndpoint != "" { 322 remoteImageEndpoint = kubeCfg.ContainerRuntimeEndpoint 323 } 324 var err error 325 if kubeDeps.RemoteRuntimeService, err = remote.NewRemoteRuntimeService(kubeCfg.ContainerRuntimeEndpoint, kubeCfg.RuntimeRequestTimeout.Duration, kubeDeps.TracerProvider); err != nil { 326 return err 327 } 328 if kubeDeps.RemoteImageService, err = remote.NewRemoteImageService(remoteImageEndpoint, kubeCfg.RuntimeRequestTimeout.Duration, kubeDeps.TracerProvider); err != nil { 329 return err 330 } 331 332 kubeDeps.useLegacyCadvisorStats = cadvisor.UsingLegacyCadvisorStats(kubeCfg.ContainerRuntimeEndpoint) 333 334 return nil 335 } 336 337 // NewMainKubelet instantiates a new Kubelet object along with all the required internal modules. 338 // No initialization of Kubelet and its modules should happen here. 
339 func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, 340 kubeDeps *Dependencies, 341 crOptions *config.ContainerRuntimeOptions, 342 hostname string, 343 hostnameOverridden bool, 344 nodeName types.NodeName, 345 nodeIPs []net.IP, 346 providerID string, 347 cloudProvider string, 348 certDirectory string, 349 rootDirectory string, 350 imageCredentialProviderConfigFile string, 351 imageCredentialProviderBinDir string, 352 registerNode bool, 353 registerWithTaints []v1.Taint, 354 allowedUnsafeSysctls []string, 355 experimentalMounterPath string, 356 kernelMemcgNotification bool, 357 experimentalNodeAllocatableIgnoreEvictionThreshold bool, 358 minimumGCAge metav1.Duration, 359 maxPerPodContainerCount int32, 360 maxContainerCount int32, 361 registerSchedulable bool, 362 keepTerminatedPodVolumes bool, 363 nodeLabels map[string]string, 364 nodeStatusMaxImages int32, 365 seccompDefault bool, 366 ) (*Kubelet, error) { 367 ctx := context.Background() 368 logger := klog.TODO() 369 370 if rootDirectory == "" { 371 return nil, fmt.Errorf("invalid root directory %q", rootDirectory) 372 } 373 if kubeCfg.SyncFrequency.Duration <= 0 { 374 return nil, fmt.Errorf("invalid sync frequency %d", kubeCfg.SyncFrequency.Duration) 375 } 376 377 if utilfeature.DefaultFeatureGate.Enabled(features.DisableCloudProviders) && cloudprovider.IsDeprecatedInternal(cloudProvider) { 378 cloudprovider.DisableWarningForProvider(cloudProvider) 379 return nil, fmt.Errorf("cloud provider %q was specified, but built-in cloud providers are disabled. Please set --cloud-provider=external and migrate to an external cloud provider", cloudProvider) 380 } 381 382 var nodeHasSynced cache.InformerSynced 383 var nodeLister corelisters.NodeLister 384 385 // If kubeClient == nil, we are running in standalone mode (i.e. 
no API servers) 386 // If not nil, we are running as part of a cluster and should sync w/API 387 if kubeDeps.KubeClient != nil { 388 kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0, informers.WithTweakListOptions(func(options *metav1.ListOptions) { 389 options.FieldSelector = fields.Set{metav1.ObjectNameField: string(nodeName)}.String() 390 })) 391 nodeLister = kubeInformers.Core().V1().Nodes().Lister() 392 nodeHasSynced = func() bool { 393 return kubeInformers.Core().V1().Nodes().Informer().HasSynced() 394 } 395 kubeInformers.Start(wait.NeverStop) 396 klog.InfoS("Attempting to sync node with API server") 397 } else { 398 // we don't have a client to sync! 399 nodeIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) 400 nodeLister = corelisters.NewNodeLister(nodeIndexer) 401 nodeHasSynced = func() bool { return true } 402 klog.InfoS("Kubelet is running in standalone mode, will skip API server sync") 403 } 404 405 if kubeDeps.PodConfig == nil { 406 var err error 407 kubeDeps.PodConfig, err = makePodSourceConfig(kubeCfg, kubeDeps, nodeName, nodeHasSynced) 408 if err != nil { 409 return nil, err 410 } 411 } 412 413 containerGCPolicy := kubecontainer.GCPolicy{ 414 MinAge: minimumGCAge.Duration, 415 MaxPerPodContainer: int(maxPerPodContainerCount), 416 MaxContainers: int(maxContainerCount), 417 } 418 419 daemonEndpoints := &v1.NodeDaemonEndpoints{ 420 KubeletEndpoint: v1.DaemonEndpoint{Port: kubeCfg.Port}, 421 } 422 423 imageGCPolicy := images.ImageGCPolicy{ 424 MinAge: kubeCfg.ImageMinimumGCAge.Duration, 425 HighThresholdPercent: int(kubeCfg.ImageGCHighThresholdPercent), 426 LowThresholdPercent: int(kubeCfg.ImageGCLowThresholdPercent), 427 } 428 429 if utilfeature.DefaultFeatureGate.Enabled(features.ImageMaximumGCAge) { 430 imageGCPolicy.MaxAge = kubeCfg.ImageMaximumGCAge.Duration 431 } else if kubeCfg.ImageMaximumGCAge.Duration != 0 { 432 klog.InfoS("ImageMaximumGCAge flag enabled, but corresponding feature 
gate is not enabled. Ignoring flag.") 433 } 434 435 enforceNodeAllocatable := kubeCfg.EnforceNodeAllocatable 436 if experimentalNodeAllocatableIgnoreEvictionThreshold { 437 // Do not provide kubeCfg.EnforceNodeAllocatable to eviction threshold parsing if we are not enforcing Evictions 438 enforceNodeAllocatable = []string{} 439 } 440 thresholds, err := eviction.ParseThresholdConfig(enforceNodeAllocatable, kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim) 441 if err != nil { 442 return nil, err 443 } 444 evictionConfig := eviction.Config{ 445 PressureTransitionPeriod: kubeCfg.EvictionPressureTransitionPeriod.Duration, 446 MaxPodGracePeriodSeconds: int64(kubeCfg.EvictionMaxPodGracePeriod), 447 Thresholds: thresholds, 448 KernelMemcgNotification: kernelMemcgNotification, 449 PodCgroupRoot: kubeDeps.ContainerManager.GetPodCgroupRoot(), 450 } 451 452 var serviceLister corelisters.ServiceLister 453 var serviceHasSynced cache.InformerSynced 454 if kubeDeps.KubeClient != nil { 455 kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0) 456 serviceLister = kubeInformers.Core().V1().Services().Lister() 457 serviceHasSynced = kubeInformers.Core().V1().Services().Informer().HasSynced 458 kubeInformers.Start(wait.NeverStop) 459 } else { 460 serviceIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) 461 serviceLister = corelisters.NewServiceLister(serviceIndexer) 462 serviceHasSynced = func() bool { return true } 463 } 464 465 // construct a node reference used for events 466 nodeRef := &v1.ObjectReference{ 467 Kind: "Node", 468 Name: string(nodeName), 469 UID: types.UID(nodeName), 470 Namespace: "", 471 } 472 473 oomWatcher, err := oomwatcher.NewWatcher(kubeDeps.Recorder) 474 if err != nil { 475 if libcontaineruserns.RunningInUserNS() { 476 if utilfeature.DefaultFeatureGate.Enabled(features.KubeletInUserNamespace) { 
477 // oomwatcher.NewWatcher returns "open /dev/kmsg: operation not permitted" error, 478 // when running in a user namespace with sysctl value `kernel.dmesg_restrict=1`. 479 klog.V(2).InfoS("Failed to create an oomWatcher (running in UserNS, ignoring)", "err", err) 480 oomWatcher = nil 481 } else { 482 klog.ErrorS(err, "Failed to create an oomWatcher (running in UserNS, Hint: enable KubeletInUserNamespace feature flag to ignore the error)") 483 return nil, err 484 } 485 } else { 486 return nil, err 487 } 488 } 489 490 clusterDNS := make([]net.IP, 0, len(kubeCfg.ClusterDNS)) 491 for _, ipEntry := range kubeCfg.ClusterDNS { 492 ip := netutils.ParseIPSloppy(ipEntry) 493 if ip == nil { 494 klog.InfoS("Invalid clusterDNS IP", "IP", ipEntry) 495 } else { 496 clusterDNS = append(clusterDNS, ip) 497 } 498 } 499 500 // A TLS transport is needed to make HTTPS-based container lifecycle requests, 501 // but we do not have the information necessary to do TLS verification. 502 // 503 // This client must not be modified to include credentials, because it is 504 // critical that credentials not leak from the client to arbitrary hosts. 
505 insecureContainerLifecycleHTTPClient := &http.Client{} 506 if utilfeature.DefaultFeatureGate.Enabled(features.ConsistentHTTPGetHandlers) { 507 insecureTLSTransport := &http.Transport{ 508 TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, 509 } 510 insecureContainerLifecycleHTTPClient.Transport = insecureTLSTransport 511 insecureContainerLifecycleHTTPClient.CheckRedirect = httpprobe.RedirectChecker(false) 512 } 513 514 tracer := kubeDeps.TracerProvider.Tracer(instrumentationScope) 515 516 klet := &Kubelet{ 517 hostname: hostname, 518 hostnameOverridden: hostnameOverridden, 519 nodeName: nodeName, 520 kubeClient: kubeDeps.KubeClient, 521 heartbeatClient: kubeDeps.HeartbeatClient, 522 onRepeatedHeartbeatFailure: kubeDeps.OnHeartbeatFailure, 523 rootDirectory: filepath.Clean(rootDirectory), 524 resyncInterval: kubeCfg.SyncFrequency.Duration, 525 sourcesReady: config.NewSourcesReady(kubeDeps.PodConfig.SeenAllSources), 526 registerNode: registerNode, 527 registerWithTaints: registerWithTaints, 528 registerSchedulable: registerSchedulable, 529 dnsConfigurer: dns.NewConfigurer(kubeDeps.Recorder, nodeRef, nodeIPs, clusterDNS, kubeCfg.ClusterDomain, kubeCfg.ResolverConfig), 530 serviceLister: serviceLister, 531 serviceHasSynced: serviceHasSynced, 532 nodeLister: nodeLister, 533 nodeHasSynced: nodeHasSynced, 534 streamingConnectionIdleTimeout: kubeCfg.StreamingConnectionIdleTimeout.Duration, 535 recorder: kubeDeps.Recorder, 536 cadvisor: kubeDeps.CAdvisorInterface, 537 cloud: kubeDeps.Cloud, 538 externalCloudProvider: cloudprovider.IsExternal(cloudProvider), 539 providerID: providerID, 540 nodeRef: nodeRef, 541 nodeLabels: nodeLabels, 542 nodeStatusUpdateFrequency: kubeCfg.NodeStatusUpdateFrequency.Duration, 543 nodeStatusReportFrequency: kubeCfg.NodeStatusReportFrequency.Duration, 544 os: kubeDeps.OSInterface, 545 oomWatcher: oomWatcher, 546 cgroupsPerQOS: kubeCfg.CgroupsPerQOS, 547 cgroupRoot: kubeCfg.CgroupRoot, 548 mounter: kubeDeps.Mounter, 549 hostutil: 
kubeDeps.HostUtil, 550 subpather: kubeDeps.Subpather, 551 maxPods: int(kubeCfg.MaxPods), 552 podsPerCore: int(kubeCfg.PodsPerCore), 553 syncLoopMonitor: atomic.Value{}, 554 daemonEndpoints: daemonEndpoints, 555 containerManager: kubeDeps.ContainerManager, 556 nodeIPs: nodeIPs, 557 nodeIPValidator: validateNodeIP, 558 clock: clock.RealClock{}, 559 enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach, 560 makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains, 561 keepTerminatedPodVolumes: keepTerminatedPodVolumes, 562 nodeStatusMaxImages: nodeStatusMaxImages, 563 tracer: tracer, 564 nodeStartupLatencyTracker: kubeDeps.NodeStartupLatencyTracker, 565 } 566 567 if klet.cloud != nil { 568 klet.cloudResourceSyncManager = cloudresource.NewSyncManager(klet.cloud, nodeName, klet.nodeStatusUpdateFrequency) 569 } 570 571 var secretManager secret.Manager 572 var configMapManager configmap.Manager 573 if klet.kubeClient != nil { 574 switch kubeCfg.ConfigMapAndSecretChangeDetectionStrategy { 575 case kubeletconfiginternal.WatchChangeDetectionStrategy: 576 secretManager = secret.NewWatchingSecretManager(klet.kubeClient, klet.resyncInterval) 577 configMapManager = configmap.NewWatchingConfigMapManager(klet.kubeClient, klet.resyncInterval) 578 case kubeletconfiginternal.TTLCacheChangeDetectionStrategy: 579 secretManager = secret.NewCachingSecretManager( 580 klet.kubeClient, manager.GetObjectTTLFromNodeFunc(klet.GetNode)) 581 configMapManager = configmap.NewCachingConfigMapManager( 582 klet.kubeClient, manager.GetObjectTTLFromNodeFunc(klet.GetNode)) 583 case kubeletconfiginternal.GetChangeDetectionStrategy: 584 secretManager = secret.NewSimpleSecretManager(klet.kubeClient) 585 configMapManager = configmap.NewSimpleConfigMapManager(klet.kubeClient) 586 default: 587 return nil, fmt.Errorf("unknown configmap and secret manager mode: %v", kubeCfg.ConfigMapAndSecretChangeDetectionStrategy) 588 } 589 590 klet.secretManager = secretManager 591 klet.configMapManager = 
configMapManager 592 } 593 594 machineInfo, err := klet.cadvisor.MachineInfo() 595 if err != nil { 596 return nil, err 597 } 598 // Avoid collector collects it as a timestamped metric 599 // See PR #95210 and #97006 for more details. 600 machineInfo.Timestamp = time.Time{} 601 klet.setCachedMachineInfo(machineInfo) 602 603 imageBackOff := flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff) 604 605 klet.livenessManager = proberesults.NewManager() 606 klet.readinessManager = proberesults.NewManager() 607 klet.startupManager = proberesults.NewManager() 608 klet.podCache = kubecontainer.NewCache() 609 610 klet.mirrorPodClient = kubepod.NewBasicMirrorClient(klet.kubeClient, string(nodeName), nodeLister) 611 klet.podManager = kubepod.NewBasicPodManager() 612 613 klet.statusManager = status.NewManager(klet.kubeClient, klet.podManager, klet, kubeDeps.PodStartupLatencyTracker, klet.getRootDir()) 614 615 klet.resourceAnalyzer = serverstats.NewResourceAnalyzer(klet, kubeCfg.VolumeStatsAggPeriod.Duration, kubeDeps.Recorder) 616 617 klet.runtimeService = kubeDeps.RemoteRuntimeService 618 619 if kubeDeps.KubeClient != nil { 620 klet.runtimeClassManager = runtimeclass.NewManager(kubeDeps.KubeClient) 621 } 622 623 // setup containerLogManager for CRI container runtime 624 containerLogManager, err := logs.NewContainerLogManager( 625 klet.runtimeService, 626 kubeDeps.OSInterface, 627 kubeCfg.ContainerLogMaxSize, 628 int(kubeCfg.ContainerLogMaxFiles), 629 ) 630 if err != nil { 631 return nil, fmt.Errorf("failed to initialize container log manager: %v", err) 632 } 633 klet.containerLogManager = containerLogManager 634 635 klet.reasonCache = NewReasonCache() 636 klet.workQueue = queue.NewBasicWorkQueue(klet.clock) 637 klet.podWorkers = newPodWorkers( 638 klet, 639 kubeDeps.Recorder, 640 klet.workQueue, 641 klet.resyncInterval, 642 backOffPeriod, 643 klet.podCache, 644 ) 645 646 runtime, err := kuberuntime.NewKubeGenericRuntimeManager( 647 
kubecontainer.FilterEventRecorder(kubeDeps.Recorder), 648 klet.livenessManager, 649 klet.readinessManager, 650 klet.startupManager, 651 rootDirectory, 652 machineInfo, 653 klet.podWorkers, 654 kubeDeps.OSInterface, 655 klet, 656 insecureContainerLifecycleHTTPClient, 657 imageBackOff, 658 kubeCfg.SerializeImagePulls, 659 kubeCfg.MaxParallelImagePulls, 660 float32(kubeCfg.RegistryPullQPS), 661 int(kubeCfg.RegistryBurst), 662 imageCredentialProviderConfigFile, 663 imageCredentialProviderBinDir, 664 kubeCfg.CPUCFSQuota, 665 kubeCfg.CPUCFSQuotaPeriod, 666 kubeDeps.RemoteRuntimeService, 667 kubeDeps.RemoteImageService, 668 kubeDeps.ContainerManager, 669 klet.containerLogManager, 670 klet.runtimeClassManager, 671 seccompDefault, 672 kubeCfg.MemorySwap.SwapBehavior, 673 kubeDeps.ContainerManager.GetNodeAllocatableAbsolute, 674 *kubeCfg.MemoryThrottlingFactor, 675 kubeDeps.PodStartupLatencyTracker, 676 kubeDeps.TracerProvider, 677 ) 678 if err != nil { 679 return nil, err 680 } 681 klet.containerRuntime = runtime 682 klet.streamingRuntime = runtime 683 klet.runner = runtime 684 685 runtimeCache, err := kubecontainer.NewRuntimeCache(klet.containerRuntime, runtimeCacheRefreshPeriod) 686 if err != nil { 687 return nil, err 688 } 689 klet.runtimeCache = runtimeCache 690 691 // common provider to get host file system usage associated with a pod managed by kubelet 692 hostStatsProvider := stats.NewHostStatsProvider(kubecontainer.RealOS{}, func(podUID types.UID) string { 693 return getEtcHostsPath(klet.getPodDir(podUID)) 694 }) 695 if kubeDeps.useLegacyCadvisorStats { 696 klet.StatsProvider = stats.NewCadvisorStatsProvider( 697 klet.cadvisor, 698 klet.resourceAnalyzer, 699 klet.podManager, 700 klet.runtimeCache, 701 klet.containerRuntime, 702 klet.statusManager, 703 hostStatsProvider) 704 } else { 705 klet.StatsProvider = stats.NewCRIStatsProvider( 706 klet.cadvisor, 707 klet.resourceAnalyzer, 708 klet.podManager, 709 klet.runtimeCache, 710 kubeDeps.RemoteRuntimeService, 711 
kubeDeps.RemoteImageService, 712 hostStatsProvider, 713 utilfeature.DefaultFeatureGate.Enabled(features.PodAndContainerStatsFromCRI)) 714 } 715 716 eventChannel := make(chan *pleg.PodLifecycleEvent, plegChannelCapacity) 717 718 if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) { 719 // adjust Generic PLEG relisting period and threshold to higher value when Evented PLEG is turned on 720 genericRelistDuration := &pleg.RelistDuration{ 721 RelistPeriod: eventedPlegRelistPeriod, 722 RelistThreshold: eventedPlegRelistThreshold, 723 } 724 klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{}) 725 // In case Evented PLEG has to fall back on Generic PLEG due to an error, 726 // Evented PLEG should be able to reset the Generic PLEG relisting duration 727 // to the default value. 728 eventedRelistDuration := &pleg.RelistDuration{ 729 RelistPeriod: genericPlegRelistPeriod, 730 RelistThreshold: genericPlegRelistThreshold, 731 } 732 klet.eventedPleg, err = pleg.NewEventedPLEG(klet.containerRuntime, klet.runtimeService, eventChannel, 733 klet.podCache, klet.pleg, eventedPlegMaxStreamRetries, eventedRelistDuration, clock.RealClock{}) 734 if err != nil { 735 return nil, err 736 } 737 } else { 738 genericRelistDuration := &pleg.RelistDuration{ 739 RelistPeriod: genericPlegRelistPeriod, 740 RelistThreshold: genericPlegRelistThreshold, 741 } 742 klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{}) 743 } 744 745 klet.runtimeState = newRuntimeState(maxWaitForContainerRuntime) 746 klet.runtimeState.addHealthCheck("PLEG", klet.pleg.Healthy) 747 if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) { 748 klet.runtimeState.addHealthCheck("EventedPLEG", klet.eventedPleg.Healthy) 749 } 750 if _, err := klet.updatePodCIDR(ctx, kubeCfg.PodCIDR); err != nil { 751 klog.ErrorS(err, "Pod CIDR update failed") 752 } 753 754 // setup 
containerGC 755 containerGC, err := kubecontainer.NewContainerGC(klet.containerRuntime, containerGCPolicy, klet.sourcesReady) 756 if err != nil { 757 return nil, err 758 } 759 klet.containerGC = containerGC 760 klet.containerDeletor = newPodContainerDeletor(klet.containerRuntime, integer.IntMax(containerGCPolicy.MaxPerPodContainer, minDeadContainerInPod)) 761 762 // setup imageManager 763 imageManager, err := images.NewImageGCManager(klet.containerRuntime, klet.StatsProvider, kubeDeps.Recorder, nodeRef, imageGCPolicy, kubeDeps.TracerProvider) 764 if err != nil { 765 return nil, fmt.Errorf("failed to initialize image manager: %v", err) 766 } 767 klet.imageManager = imageManager 768 769 if kubeCfg.ServerTLSBootstrap && kubeDeps.TLSOptions != nil && utilfeature.DefaultFeatureGate.Enabled(features.RotateKubeletServerCertificate) { 770 klet.serverCertificateManager, err = kubeletcertificate.NewKubeletServerCertificateManager(klet.kubeClient, kubeCfg, klet.nodeName, klet.getLastObservedNodeAddresses, certDirectory) 771 if err != nil { 772 return nil, fmt.Errorf("failed to initialize certificate manager: %v", err) 773 } 774 kubeDeps.TLSOptions.Config.GetCertificate = func(*tls.ClientHelloInfo) (*tls.Certificate, error) { 775 cert := klet.serverCertificateManager.Current() 776 if cert == nil { 777 return nil, fmt.Errorf("no serving certificate available for the kubelet") 778 } 779 return cert, nil 780 } 781 } 782 783 if kubeDeps.ProbeManager != nil { 784 klet.probeManager = kubeDeps.ProbeManager 785 } else { 786 klet.probeManager = prober.NewManager( 787 klet.statusManager, 788 klet.livenessManager, 789 klet.readinessManager, 790 klet.startupManager, 791 klet.runner, 792 kubeDeps.Recorder) 793 } 794 795 tokenManager := token.NewManager(kubeDeps.KubeClient) 796 797 var clusterTrustBundleManager clustertrustbundle.Manager 798 if kubeDeps.KubeClient != nil && utilfeature.DefaultFeatureGate.Enabled(features.ClusterTrustBundleProjection) { 799 kubeInformers := 
informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0) 800 clusterTrustBundleManager, err = clustertrustbundle.NewInformerManager(kubeInformers.Certificates().V1alpha1().ClusterTrustBundles(), 2*int(kubeCfg.MaxPods), 5*time.Minute) 801 if err != nil { 802 return nil, fmt.Errorf("while starting informer-based ClusterTrustBundle manager: %w", err) 803 } 804 kubeInformers.Start(wait.NeverStop) 805 klog.InfoS("Started ClusterTrustBundle informer") 806 } else { 807 // In static kubelet mode, use a no-op manager. 808 clusterTrustBundleManager = &clustertrustbundle.NoopManager{} 809 klog.InfoS("Not starting ClusterTrustBundle informer because we are in static kubelet mode") 810 } 811 812 // NewInitializedVolumePluginMgr initializes some storageErrors on the Kubelet runtimeState (in csi_plugin.go init) 813 // which affects node ready status. This function must be called before Kubelet is initialized so that the Node 814 // ReadyState is accurate with the storage state. 815 klet.volumePluginMgr, err = 816 NewInitializedVolumePluginMgr(klet, secretManager, configMapManager, tokenManager, clusterTrustBundleManager, kubeDeps.VolumePlugins, kubeDeps.DynamicPluginProber) 817 if err != nil { 818 return nil, err 819 } 820 klet.pluginManager = pluginmanager.NewPluginManager( 821 klet.getPluginsRegistrationDir(), /* sockDir */ 822 kubeDeps.Recorder, 823 ) 824 825 // If the experimentalMounterPathFlag is set, we do not want to 826 // check node capabilities since the mount path is not the default 827 if len(experimentalMounterPath) != 0 { 828 // Replace the nameserver in containerized-mounter's rootfs/etc/resolv.conf with kubelet.ClusterDNS 829 // so that service name could be resolved 830 klet.dnsConfigurer.SetupDNSinContainerizedMounter(experimentalMounterPath) 831 } 832 833 // setup volumeManager 834 klet.volumeManager = volumemanager.NewVolumeManager( 835 kubeCfg.EnableControllerAttachDetach, 836 nodeName, 837 klet.podManager, 838 klet.podWorkers, 839 
klet.kubeClient, 840 klet.volumePluginMgr, 841 klet.containerRuntime, 842 kubeDeps.Mounter, 843 kubeDeps.HostUtil, 844 klet.getPodsDir(), 845 kubeDeps.Recorder, 846 keepTerminatedPodVolumes, 847 volumepathhandler.NewBlockVolumePathHandler()) 848 849 klet.backOff = flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff) 850 851 // setup eviction manager 852 evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, 853 killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock, kubeCfg.LocalStorageCapacityIsolation) 854 855 klet.evictionManager = evictionManager 856 klet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler) 857 858 // Safe, allowed sysctls can always be used as unsafe sysctls in the spec. 859 // Hence, we concatenate those two lists. 860 safeAndUnsafeSysctls := append(sysctl.SafeSysctlAllowlist(), allowedUnsafeSysctls...) 861 sysctlsAllowlist, err := sysctl.NewAllowlist(safeAndUnsafeSysctls) 862 if err != nil { 863 return nil, err 864 } 865 klet.admitHandlers.AddPodAdmitHandler(sysctlsAllowlist) 866 867 // enable active deadline handler 868 activeDeadlineHandler, err := newActiveDeadlineHandler(klet.statusManager, kubeDeps.Recorder, klet.clock) 869 if err != nil { 870 return nil, err 871 } 872 klet.AddPodSyncLoopHandler(activeDeadlineHandler) 873 klet.AddPodSyncHandler(activeDeadlineHandler) 874 875 klet.admitHandlers.AddPodAdmitHandler(klet.containerManager.GetAllocateResourcesPodAdmitHandler()) 876 877 criticalPodAdmissionHandler := preemption.NewCriticalPodAdmissionHandler(klet.GetActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeDeps.Recorder) 878 klet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(klet.getNodeAnyWay, criticalPodAdmissionHandler, klet.containerManager.UpdatePluginResources)) 879 // apply functional Option's 880 for _, opt := range kubeDeps.Options { 881 opt(klet) 882 } 883 884 if 
sysruntime.GOOS == "linux" { 885 // AppArmor is a Linux kernel security module and it does not support other operating systems. 886 klet.appArmorValidator = apparmor.NewValidator() 887 klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator)) 888 } 889 890 leaseDuration := time.Duration(kubeCfg.NodeLeaseDurationSeconds) * time.Second 891 renewInterval := time.Duration(float64(leaseDuration) * nodeLeaseRenewIntervalFraction) 892 klet.nodeLeaseController = lease.NewController( 893 klet.clock, 894 klet.heartbeatClient, 895 string(klet.nodeName), 896 kubeCfg.NodeLeaseDurationSeconds, 897 klet.onRepeatedHeartbeatFailure, 898 renewInterval, 899 string(klet.nodeName), 900 v1.NamespaceNodeLease, 901 util.SetNodeOwnerFunc(klet.heartbeatClient, string(klet.nodeName))) 902 903 // setup node shutdown manager 904 shutdownManager, shutdownAdmitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{ 905 Logger: logger, 906 ProbeManager: klet.probeManager, 907 Recorder: kubeDeps.Recorder, 908 NodeRef: nodeRef, 909 GetPodsFunc: klet.GetActivePods, 910 KillPodFunc: killPodNow(klet.podWorkers, kubeDeps.Recorder), 911 SyncNodeStatusFunc: klet.syncNodeStatus, 912 ShutdownGracePeriodRequested: kubeCfg.ShutdownGracePeriod.Duration, 913 ShutdownGracePeriodCriticalPods: kubeCfg.ShutdownGracePeriodCriticalPods.Duration, 914 ShutdownGracePeriodByPodPriority: kubeCfg.ShutdownGracePeriodByPodPriority, 915 StateDirectory: rootDirectory, 916 }) 917 klet.shutdownManager = shutdownManager 918 klet.usernsManager, err = userns.MakeUserNsManager(klet) 919 if err != nil { 920 return nil, err 921 } 922 klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler) 923 924 // Finally, put the most recent version of the config on the Kubelet, so 925 // people can see how it was configured. 
klet.kubeletConfiguration = *kubeCfg

// Generating the status funcs should be the last thing we do,
// since this relies on the rest of the Kubelet having been constructed.
klet.setNodeStatusFuncs = klet.defaultNodeStatusFuncs()

return klet, nil
}

// serviceLister is the minimal listing interface the kubelet needs for
// Services: it returns the Services matching a label selector.
type serviceLister interface {
	List(labels.Selector) ([]*v1.Service, error)
}

// Kubelet is the main kubelet implementation.
type Kubelet struct {
	// kubeletConfiguration is a copy of the configuration the kubelet was
	// constructed with; it is stored so the effective configuration can be inspected.
	kubeletConfiguration kubeletconfiginternal.KubeletConfiguration

	// hostname is the hostname the kubelet detected or was given via flag/config
	hostname string
	// hostnameOverridden indicates the hostname was overridden via flag/config
	hostnameOverridden bool

	nodeName        types.NodeName
	runtimeCache    kubecontainer.RuntimeCache
	kubeClient      clientset.Interface
	heartbeatClient clientset.Interface
	// mirrorPodClient is used to create and delete mirror pods in the API for static
	// pods.
	mirrorPodClient kubepod.MirrorClient

	rootDirectory string

	lastObservedNodeAddressesMux sync.RWMutex
	lastObservedNodeAddresses    []v1.NodeAddress

	// onRepeatedHeartbeatFailure is called when a heartbeat operation fails more than once. Optional.
	onRepeatedHeartbeatFailure func()

	// podManager stores the desired set of admitted pods and mirror pods that the kubelet should be
	// running. The actual set of running pods is stored on the podWorkers. The manager is populated
	// by the kubelet config loops which abstracts receiving configuration from many different sources
	// (api for regular pods, local filesystem or http for static pods). The manager may be consulted
	// by other components that need to see the set of desired pods. Note that not all desired pods are
	// running, and not all running pods are in the podManager - for instance, force deleting a pod
	// from the apiserver will remove it from the podManager, but the pod may still be terminating and
	// tracked by the podWorkers. Components that need to know the actual consumed resources of the
	// node or are driven by podWorkers and the sync*Pod methods (status, volume, stats) should also
	// consult the podWorkers when reconciling.
	//
	// TODO: review all kubelet components that need the actual set of pods (vs the desired set)
	// and update them to use podWorkers instead of podManager. This may introduce latency in some
	// methods, but avoids race conditions and correctly accounts for terminating pods that have
	// been force deleted or static pods that have been updated.
	// https://github.com/kubernetes/kubernetes/issues/116970
	podManager kubepod.Manager

	// podWorkers is responsible for driving the lifecycle state machine of each pod. The worker is
	// notified of config changes, updates, periodic reconciliation, container runtime updates, and
	// evictions of all desired pods and will invoke reconciliation methods per pod in separate
	// goroutines. The podWorkers are authoritative in the kubelet for what pods are actually being
	// run and their current state:
	//
	// * syncing: pod should be running (syncPod)
	// * terminating: pod should be stopped (syncTerminatingPod)
	// * terminated: pod should have all resources cleaned up (syncTerminatedPod)
	//
	// and invoke the handler methods that correspond to each state. Components within the
	// kubelet that need to know the phase of the pod in order to correctly set up or tear down
	// resources must consult the podWorkers.
	//
	// Once a pod has been accepted by the pod workers, no other pod with that same UID (and
	// name+namespace, for static pods) will be started until the first pod has fully terminated
	// and been cleaned up by SyncKnownPods. This means a pod may be desired (in API), admitted
	// (in pod manager), and requested (by invoking UpdatePod) but not start for an arbitrarily
	// long interval because a prior pod is still terminating.
	//
	// As an event-driven (by UpdatePod) controller, the podWorkers must periodically be resynced
	// by the kubelet invoking SyncKnownPods with the desired state (admitted pods in podManager).
	// Since the podManager may be unaware of some running pods due to force deletion, the
	// podWorkers are responsible for triggering a sync of pods that are no longer desired but
	// must still run to completion.
	podWorkers PodWorkers

	// evictionManager observes the state of the node for situations that could impact node stability
	// and evicts pods (sets to phase Failed with reason Evicted) to reduce resource pressure. The
	// eviction manager acts on the actual state of the node and considers the podWorker to be
	// authoritative.
	evictionManager eviction.Manager

	// probeManager tracks the set of running pods and ensures any user-defined periodic checks are
	// run to introspect the state of each pod. The probe manager acts on the actual state of the node
	// and is notified of pods by the podWorker. The probe manager is the authoritative source of the
	// most recent probe status and is responsible for notifying the status manager, which
	// synthesizes them into the overall pod status.
	probeManager prober.Manager

	// secretManager caches the set of secrets used by running pods on this node. The podWorkers
	// notify the secretManager when pods are started and terminated, and the secretManager must
	// then keep the needed secrets up-to-date as they change.
	secretManager secret.Manager

	// configMapManager caches the set of config maps used by running pods on this node. The
	// podWorkers notify the configMapManager when pods are started and terminated, and the
	// configMapManager must then keep the needed config maps up-to-date as they change.
	configMapManager configmap.Manager

	// volumeManager observes the set of running pods and is responsible for attaching, mounting,
	// unmounting, and detaching as those pods move through their lifecycle. It periodically
	// synchronizes the set of known volumes to the set of actually desired volumes and cleans up
	// any orphaned volumes. The volume manager considers the podWorker to be authoritative for
	// which pods are running.
	volumeManager volumemanager.VolumeManager

	// statusManager receives updated pod status updates from the podWorker and updates the API
	// status of those pods to match. The statusManager is authoritative for the synthesized
	// status of the pod from the kubelet's perspective (other components own the individual
	// elements of status) and should be consulted by components in preference to assembling
	// that status themselves. Note that the status manager is downstream of the pod worker
	// and components that need to check whether a pod is still running should instead directly
	// consult the pod worker.
	statusManager status.Manager

	// resyncInterval is the interval between periodic full reconciliations of
	// pods on this node.
	resyncInterval time.Duration

	// sourcesReady records the sources seen by the kubelet, it is thread-safe.
	sourcesReady config.SourcesReady

	// Optional, defaults to /logs/ from /var/log
	logServer http.Handler
	// Optional, defaults to simple Docker implementation
	runner kubecontainer.CommandRunner

	// cAdvisor used for container information.
	cadvisor cadvisor.Interface

	// Set to true to have the node register itself with the apiserver.
	registerNode bool
	// List of taints to add to a node object when the kubelet registers itself.
	registerWithTaints []v1.Taint
	// Set to true to have the node register itself as schedulable.
	registerSchedulable bool
	// for internal book keeping; access only from within registerWithApiserver
	registrationCompleted bool

	// dnsConfigurer is used for setting up DNS resolver configuration when launching pods.
	dnsConfigurer *dns.Configurer

	// serviceLister knows how to list services
	serviceLister serviceLister
	// serviceHasSynced indicates whether services have been sync'd at least once.
	// Check this before trusting a response from the lister.
	serviceHasSynced cache.InformerSynced
	// nodeLister knows how to list nodes
	nodeLister corelisters.NodeLister
	// nodeHasSynced indicates whether nodes have been sync'd at least once.
	// Check this before trusting a response from the node lister.
	nodeHasSynced cache.InformerSynced
	// a list of node labels to register
	nodeLabels map[string]string

	// Last timestamp when runtime responded on ping.
	// Mutex is used to protect this value.
	runtimeState *runtimeState

	// Volume plugins.
	volumePluginMgr *volume.VolumePluginMgr

	// Manages container health check results.
	livenessManager  proberesults.Manager
	readinessManager proberesults.Manager
	startupManager   proberesults.Manager

	// How long to keep idle streaming command execution/port forwarding
	// connections open before terminating them
	streamingConnectionIdleTimeout time.Duration

	// The EventRecorder to use
	recorder record.EventRecorder

	// Policy for handling garbage collection of dead containers.
	containerGC kubecontainer.GC

	// Manager for image garbage collection.
	imageManager images.ImageGCManager

	// Manager for container logs.
	containerLogManager logs.ContainerLogManager

	// Cached MachineInfo returned by cadvisor.
	machineInfoLock sync.RWMutex
	machineInfo     *cadvisorapi.MachineInfo

	// Handles certificate rotations.
	serverCertificateManager certificate.Manager

	// Cloud provider interface.
	cloud cloudprovider.Interface
	// Handles requests to cloud provider with timeout
	cloudResourceSyncManager cloudresource.SyncManager

	// Indicates that the node initialization happens in an external cloud controller
	externalCloudProvider bool
	// Reference to this node.
	nodeRef *v1.ObjectReference

	// Container runtime.
	containerRuntime kubecontainer.Runtime

	// Streaming runtime handles container streaming.
	streamingRuntime kubecontainer.StreamingRuntime

	// Container runtime service (needed by container runtime Start()).
	runtimeService internalapi.RuntimeService

	// reasonCache caches the failure reason of the last creation of all containers, which is
	// used for generating ContainerStatus.
	reasonCache *ReasonCache

	// containerRuntimeReadyExpected indicates whether container runtime being ready is expected
	// so errors are logged without verbosity guard, to avoid excessive error logs at node startup.
	// It's false during the node initialization period of nodeReadyGracePeriod, and after that
	// it's set to true by fastStatusUpdateOnce when it exits.
	containerRuntimeReadyExpected bool

	// nodeStatusUpdateFrequency specifies how often kubelet computes node status. If node lease
	// feature is not enabled, it is also the frequency that kubelet posts node status to master.
	// In that case, be cautious when changing the constant, it must work with nodeMonitorGracePeriod
	// in nodecontroller. There are several constraints:
	// 1. nodeMonitorGracePeriod must be N times more than nodeStatusUpdateFrequency, where
	//    N means number of retries allowed for kubelet to post node status. It is pointless
	//    to make nodeMonitorGracePeriod be less than nodeStatusUpdateFrequency, since there
	//    will only be fresh values from Kubelet at an interval of nodeStatusUpdateFrequency.
	//    The constant must be less than podEvictionTimeout.
	// 2. nodeStatusUpdateFrequency needs to be large enough for kubelet to generate node
	//    status. Kubelet may fail to update node status reliably if the value is too small,
	//    as it takes time to gather all necessary node information.
	nodeStatusUpdateFrequency time.Duration

	// nodeStatusReportFrequency is the frequency that kubelet posts node
	// status to master. It is only used when node lease feature is enabled.
	nodeStatusReportFrequency time.Duration

	// lastStatusReportTime is the time when node status was last reported.
	lastStatusReportTime time.Time

	// syncNodeStatusMux is a lock on updating the node status, because this path is not thread-safe.
	// This lock is used by Kubelet.syncNodeStatus and Kubelet.fastNodeStatusUpdate functions and shouldn't be used anywhere else.
	syncNodeStatusMux sync.Mutex

	// updatePodCIDRMux is a lock on updating pod CIDR, because this path is not thread-safe.
	// This lock is used by Kubelet.updatePodCIDR function and shouldn't be used anywhere else.
	updatePodCIDRMux sync.Mutex

	// updateRuntimeMux is a lock on updating runtime, because this path is not thread-safe.
	// This lock is used by Kubelet.updateRuntimeUp and Kubelet.fastNodeStatusUpdate functions and shouldn't be used anywhere else.
	updateRuntimeMux sync.Mutex

	// nodeLeaseController claims and renews the node lease for this Kubelet
	nodeLeaseController lease.Controller

	// pleg observes the state of the container runtime and notifies the kubelet of changes to containers, which
	// notifies the podWorkers to reconcile the state of the pod (for instance, if a container dies and needs to
	// be restarted).
	pleg pleg.PodLifecycleEventGenerator

	// eventedPleg supplements the pleg to deliver edge-driven container changes with low-latency.
	eventedPleg pleg.PodLifecycleEventGenerator

	// Store kubecontainer.PodStatus for all pods.
	podCache kubecontainer.Cache

	// os is a facade for various syscalls that need to be mocked during testing.
	os kubecontainer.OSInterface

	// Watcher of out of memory events.
	oomWatcher oomwatcher.Watcher

	// Monitor resource usage
	resourceAnalyzer serverstats.ResourceAnalyzer

	// Whether or not we should have the QOS cgroup hierarchy for resource management
	cgroupsPerQOS bool

	// If non-empty, pass this to the container runtime as the root cgroup.
	cgroupRoot string

	// Mounter to use for volumes.
	mounter mount.Interface

	// hostutil to interact with filesystems
	hostutil hostutil.HostUtils

	// subpather to execute subpath actions
	subpather subpath.Interface

	// Manager of non-Runtime containers.
	containerManager cm.ContainerManager

	// Maximum Number of Pods which can be run by this Kubelet
	maxPods int

	// Monitor Kubelet's sync loop
	syncLoopMonitor atomic.Value

	// Container restart Backoff
	backOff *flowcontrol.Backoff

	// Information about the ports which are opened by daemons on Node running this Kubelet server.
	daemonEndpoints *v1.NodeDaemonEndpoints

	// A queue used to trigger pod workers.
	workQueue queue.WorkQueue

	// oneTimeInitializer is used to initialize modules that are dependent on the runtime to be up.
	oneTimeInitializer sync.Once

	// If set, use this IP address or addresses for the node
	nodeIPs []net.IP

	// use this function to validate the kubelet nodeIP
	nodeIPValidator func(net.IP) error

	// If non-empty, this is a unique identifier for the node in an external database, eg. cloudprovider
	providerID string

	// clock is an interface that provides time related functionality in a way that makes it
	// easy to test the code.
	clock clock.WithTicker

	// handlers called during the tryUpdateNodeStatus cycle
	setNodeStatusFuncs []func(context.Context, *v1.Node) error

	lastNodeUnschedulableLock sync.Mutex
	// maintains Node.Spec.Unschedulable value from previous run of tryUpdateNodeStatus()
	lastNodeUnschedulable bool

	// the list of handlers to call during pod admission.
	admitHandlers lifecycle.PodAdmitHandlers

	// softAdmitHandlers are applied to the pod after it is admitted by the Kubelet, but before it is
	// run. A pod rejected by a softAdmitHandler will be left in a Pending state indefinitely. If a
	// rejected pod should not be recreated, or the scheduler is not aware of the rejection rule, the
	// admission rule should be applied by a softAdmitHandler.
	softAdmitHandlers lifecycle.PodAdmitHandlers

	// the list of handlers to call during pod sync loop.
	lifecycle.PodSyncLoopHandlers

	// the list of handlers to call during pod sync.
	lifecycle.PodSyncHandlers

	// the number of allowed pods per core
	podsPerCore int

	// enableControllerAttachDetach indicates the Attach/Detach controller
	// should manage attachment/detachment of volumes scheduled to this node,
	// and disable kubelet from executing any attach/detach operations
	enableControllerAttachDetach bool

	// trigger deleting containers in a pod
	containerDeletor *podContainerDeletor

	// config iptables util rules
	makeIPTablesUtilChains bool

	// The AppArmor validator for checking whether AppArmor is supported.
	appArmorValidator apparmor.Validator

	// StatsProvider provides the node and the container stats.
	StatsProvider *stats.Provider

	// This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node.
	// This can be useful for debugging volume related issues.
	keepTerminatedPodVolumes bool // DEPRECATED

	// pluginmanager runs a set of asynchronous loops that figure out which
	// plugins need to be registered/unregistered based on this node and makes it so.
	pluginManager pluginmanager.PluginManager

	// This flag sets a maximum number of images to report in the node status.
	nodeStatusMaxImages int32

	// Handles RuntimeClass objects for the Kubelet.
	runtimeClassManager *runtimeclass.Manager

	// Handles node shutdown events for the Node.
	shutdownManager nodeshutdown.Manager

	// Manage user namespaces
	usernsManager *userns.UsernsManager

	// Mutex to serialize new pod admission and existing pod resizing
	podResizeMutex sync.Mutex

	// OpenTelemetry Tracer
	tracer trace.Tracer

	// Track node startup latencies
	nodeStartupLatencyTracker util.NodeStartupLatencyTracker
}

// ListPodStats is delegated to StatsProvider, which implements stats.Provider interface
func (kl *Kubelet) ListPodStats(ctx context.Context) ([]statsapi.PodStats, error) {
	return kl.StatsProvider.ListPodStats(ctx)
}

// ListPodCPUAndMemoryStats is delegated to StatsProvider, which implements stats.Provider interface
func (kl *Kubelet) ListPodCPUAndMemoryStats(ctx context.Context) ([]statsapi.PodStats, error) {
	return kl.StatsProvider.ListPodCPUAndMemoryStats(ctx)
}

// ListPodStatsAndUpdateCPUNanoCoreUsage is delegated to StatsProvider, which implements stats.Provider interface
func (kl *Kubelet) ListPodStatsAndUpdateCPUNanoCoreUsage(ctx context.Context) ([]statsapi.PodStats, error) {
	return kl.StatsProvider.ListPodStatsAndUpdateCPUNanoCoreUsage(ctx)
}

// ImageFsStats is delegated to StatsProvider, which implements stats.Provider interface
func (kl *Kubelet) ImageFsStats(ctx context.Context) (*statsapi.FsStats, *statsapi.FsStats, error) {
	return kl.StatsProvider.ImageFsStats(ctx)
}

// GetCgroupStats is delegated to StatsProvider, which implements stats.Provider interface
func (kl *Kubelet) GetCgroupStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, *statsapi.NetworkStats, error) {
	return kl.StatsProvider.GetCgroupStats(cgroupName, updateStats)
}

// GetCgroupCPUAndMemoryStats is delegated to StatsProvider, which implements stats.Provider interface
func (kl *Kubelet) GetCgroupCPUAndMemoryStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, error) {
	return kl.StatsProvider.GetCgroupCPUAndMemoryStats(cgroupName, updateStats)
}

// RootFsStats is delegated to StatsProvider, which implements stats.Provider interface
func (kl *Kubelet) RootFsStats() (*statsapi.FsStats, error) {
	return kl.StatsProvider.RootFsStats()
}

// GetContainerInfo is delegated to StatsProvider, which implements stats.Provider interface
func (kl *Kubelet) GetContainerInfo(ctx context.Context, podFullName string, uid types.UID, containerName string, req *cadvisorapi.ContainerInfoRequest) (*cadvisorapi.ContainerInfo, error) {
	return kl.StatsProvider.GetContainerInfo(ctx, podFullName, uid, containerName, req)
}

// GetRawContainerInfo is delegated to StatsProvider, which implements stats.Provider interface
func (kl *Kubelet) GetRawContainerInfo(containerName string, req *cadvisorapi.ContainerInfoRequest, subcontainers bool) (map[string]*cadvisorapi.ContainerInfo, error) {
	return kl.StatsProvider.GetRawContainerInfo(containerName, req, subcontainers)
}

// RlimitStats is delegated to StatsProvider, which implements stats.Provider interface
func (kl *Kubelet) RlimitStats() (*statsapi.RlimitStats, error) {
	return kl.StatsProvider.RlimitStats()
}

// setupDataDirs creates:
// 1. the root directory
// 2. the pods directory
// 3. the plugins directory
// 4. the pod-resources directory
// 5.
the checkpoint directory 1385 func (kl *Kubelet) setupDataDirs() error { 1386 if cleanedRoot := filepath.Clean(kl.rootDirectory); cleanedRoot != kl.rootDirectory { 1387 return fmt.Errorf("rootDirectory not in canonical form: expected %s, was %s", cleanedRoot, kl.rootDirectory) 1388 } 1389 pluginRegistrationDir := kl.getPluginsRegistrationDir() 1390 pluginsDir := kl.getPluginsDir() 1391 if err := os.MkdirAll(kl.getRootDir(), 0750); err != nil { 1392 return fmt.Errorf("error creating root directory: %v", err) 1393 } 1394 if err := kl.hostutil.MakeRShared(kl.getRootDir()); err != nil { 1395 return fmt.Errorf("error configuring root directory: %v", err) 1396 } 1397 if err := os.MkdirAll(kl.getPodsDir(), 0750); err != nil { 1398 return fmt.Errorf("error creating pods directory: %v", err) 1399 } 1400 if err := os.MkdirAll(kl.getPluginsDir(), 0750); err != nil { 1401 return fmt.Errorf("error creating plugins directory: %v", err) 1402 } 1403 if err := os.MkdirAll(kl.getPluginsRegistrationDir(), 0750); err != nil { 1404 return fmt.Errorf("error creating plugins registry directory: %v", err) 1405 } 1406 if err := os.MkdirAll(kl.getPodResourcesDir(), 0750); err != nil { 1407 return fmt.Errorf("error creating podresources directory: %v", err) 1408 } 1409 if utilfeature.DefaultFeatureGate.Enabled(features.ContainerCheckpoint) { 1410 if err := os.MkdirAll(kl.getCheckpointsDir(), 0700); err != nil { 1411 return fmt.Errorf("error creating checkpoint directory: %v", err) 1412 } 1413 } 1414 if selinux.GetEnabled() { 1415 err := selinux.SetFileLabel(pluginRegistrationDir, config.KubeletPluginsDirSELinuxLabel) 1416 if err != nil { 1417 klog.InfoS("Unprivileged containerized plugins might not work, could not set selinux context on plugin registration dir", "path", pluginRegistrationDir, "err", err) 1418 } 1419 err = selinux.SetFileLabel(pluginsDir, config.KubeletPluginsDirSELinuxLabel) 1420 if err != nil { 1421 klog.InfoS("Unprivileged containerized plugins might not work, could not 
set selinux context on plugins dir", "path", pluginsDir, "err", err) 1422 } 1423 } 1424 return nil 1425 } 1426 1427 // StartGarbageCollection starts garbage collection threads. 1428 func (kl *Kubelet) StartGarbageCollection() { 1429 loggedContainerGCFailure := false 1430 go wait.Until(func() { 1431 ctx := context.Background() 1432 if err := kl.containerGC.GarbageCollect(ctx); err != nil { 1433 klog.ErrorS(err, "Container garbage collection failed") 1434 kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ContainerGCFailed, err.Error()) 1435 loggedContainerGCFailure = true 1436 } else { 1437 var vLevel klog.Level = 4 1438 if loggedContainerGCFailure { 1439 vLevel = 1 1440 loggedContainerGCFailure = false 1441 } 1442 1443 klog.V(vLevel).InfoS("Container garbage collection succeeded") 1444 } 1445 }, ContainerGCPeriod, wait.NeverStop) 1446 1447 // when the high threshold is set to 100, stub the image GC manager 1448 if kl.kubeletConfiguration.ImageGCHighThresholdPercent == 100 { 1449 klog.V(2).InfoS("ImageGCHighThresholdPercent is set 100, Disable image GC") 1450 return 1451 } 1452 1453 prevImageGCFailed := false 1454 go wait.Until(func() { 1455 ctx := context.Background() 1456 if err := kl.imageManager.GarbageCollect(ctx); err != nil { 1457 if prevImageGCFailed { 1458 klog.ErrorS(err, "Image garbage collection failed multiple times in a row") 1459 // Only create an event for repeated failures 1460 kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ImageGCFailed, err.Error()) 1461 } else { 1462 klog.ErrorS(err, "Image garbage collection failed once. 
Stats initialization may not have completed yet") 1463 } 1464 prevImageGCFailed = true 1465 } else { 1466 var vLevel klog.Level = 4 1467 if prevImageGCFailed { 1468 vLevel = 1 1469 prevImageGCFailed = false 1470 } 1471 1472 klog.V(vLevel).InfoS("Image garbage collection succeeded") 1473 } 1474 }, ImageGCPeriod, wait.NeverStop) 1475 } 1476 1477 // initializeModules will initialize internal modules that do not require the container runtime to be up. 1478 // Note that the modules here must not depend on modules that are not initialized here. 1479 func (kl *Kubelet) initializeModules() error { 1480 // Prometheus metrics. 1481 metrics.Register( 1482 collectors.NewVolumeStatsCollector(kl), 1483 collectors.NewLogMetricsCollector(kl.StatsProvider.ListPodStats), 1484 ) 1485 metrics.SetNodeName(kl.nodeName) 1486 servermetrics.Register() 1487 1488 // Setup filesystem directories. 1489 if err := kl.setupDataDirs(); err != nil { 1490 return err 1491 } 1492 1493 // If the container logs directory does not exist, create it. 1494 if _, err := os.Stat(ContainerLogsDir); err != nil { 1495 if err := kl.os.MkdirAll(ContainerLogsDir, 0755); err != nil { 1496 return fmt.Errorf("failed to create directory %q: %v", ContainerLogsDir, err) 1497 } 1498 } 1499 1500 // Start the image manager. 1501 kl.imageManager.Start() 1502 1503 // Start the certificate manager if it was enabled. 1504 if kl.serverCertificateManager != nil { 1505 kl.serverCertificateManager.Start() 1506 } 1507 1508 // Start out of memory watcher. 1509 if kl.oomWatcher != nil { 1510 if err := kl.oomWatcher.Start(kl.nodeRef); err != nil { 1511 return fmt.Errorf("failed to start OOM watcher: %w", err) 1512 } 1513 } 1514 1515 // Start resource analyzer 1516 kl.resourceAnalyzer.Start() 1517 1518 return nil 1519 } 1520 1521 // initializeRuntimeDependentModules will initialize internal modules that require the container runtime to be up. 
1522 func (kl *Kubelet) initializeRuntimeDependentModules() { 1523 if err := kl.cadvisor.Start(); err != nil { 1524 // Fail kubelet and rely on the babysitter to retry starting kubelet. 1525 klog.ErrorS(err, "Failed to start cAdvisor") 1526 os.Exit(1) 1527 } 1528 1529 // trigger on-demand stats collection once so that we have capacity information for ephemeral storage. 1530 // ignore any errors, since if stats collection is not successful, the container manager will fail to start below. 1531 kl.StatsProvider.GetCgroupStats("/", true) 1532 // Start container manager. 1533 node, err := kl.getNodeAnyWay() 1534 if err != nil { 1535 // Fail kubelet and rely on the babysitter to retry starting kubelet. 1536 klog.ErrorS(err, "Kubelet failed to get node info") 1537 os.Exit(1) 1538 } 1539 // containerManager must start after cAdvisor because it needs filesystem capacity information 1540 if err := kl.containerManager.Start(node, kl.GetActivePods, kl.sourcesReady, kl.statusManager, kl.runtimeService, kl.supportLocalStorageCapacityIsolation()); err != nil { 1541 // Fail kubelet and rely on the babysitter to retry starting kubelet. 1542 klog.ErrorS(err, "Failed to start ContainerManager") 1543 os.Exit(1) 1544 } 1545 // eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs 1546 kl.evictionManager.Start(kl.StatsProvider, kl.GetActivePods, kl.PodIsFinished, evictionMonitoringPeriod) 1547 1548 // container log manager must start after container runtime is up to retrieve information from container runtime 1549 // and inform container to reopen log file after log rotation. 
1550 kl.containerLogManager.Start() 1551 // Adding Registration Callback function for CSI Driver 1552 kl.pluginManager.AddHandler(pluginwatcherapi.CSIPlugin, plugincache.PluginHandler(csi.PluginHandler)) 1553 // Adding Registration Callback function for DRA Plugin 1554 if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) { 1555 kl.pluginManager.AddHandler(pluginwatcherapi.DRAPlugin, plugincache.PluginHandler(draplugin.NewRegistrationHandler())) 1556 } 1557 // Adding Registration Callback function for Device Manager 1558 kl.pluginManager.AddHandler(pluginwatcherapi.DevicePlugin, kl.containerManager.GetPluginRegistrationHandler()) 1559 1560 // Start the plugin manager 1561 klog.V(4).InfoS("Starting plugin manager") 1562 go kl.pluginManager.Run(kl.sourcesReady, wait.NeverStop) 1563 1564 err = kl.shutdownManager.Start() 1565 if err != nil { 1566 // The shutdown manager is not critical for kubelet, so log failure, but don't block Kubelet startup if there was a failure starting it. 
1567 klog.ErrorS(err, "Failed to start node shutdown manager") 1568 } 1569 } 1570 1571 // Run starts the kubelet reacting to config updates 1572 func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) { 1573 ctx := context.Background() 1574 if kl.logServer == nil { 1575 file := http.FileServer(http.Dir(nodeLogDir)) 1576 if utilfeature.DefaultFeatureGate.Enabled(features.NodeLogQuery) && kl.kubeletConfiguration.EnableSystemLogQuery { 1577 kl.logServer = http.StripPrefix("/logs/", http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { 1578 if nlq, errs := newNodeLogQuery(req.URL.Query()); len(errs) > 0 { 1579 http.Error(w, errs.ToAggregate().Error(), http.StatusBadRequest) 1580 return 1581 } else if nlq != nil { 1582 if req.URL.Path != "/" && req.URL.Path != "" { 1583 http.Error(w, "path not allowed in query mode", http.StatusNotAcceptable) 1584 return 1585 } 1586 if errs := nlq.validate(); len(errs) > 0 { 1587 http.Error(w, errs.ToAggregate().Error(), http.StatusNotAcceptable) 1588 return 1589 } 1590 // Validation ensures that the request does not query services and files at the same time 1591 if len(nlq.Services) > 0 { 1592 journal.ServeHTTP(w, req) 1593 return 1594 } 1595 // Validation ensures that the request does not explicitly query multiple files at the same time 1596 if len(nlq.Files) == 1 { 1597 // Account for the \ being used on Windows clients 1598 req.URL.Path = filepath.ToSlash(nlq.Files[0]) 1599 } 1600 } 1601 // Fall back in case the caller is directly trying to query a file 1602 // Example: kubectl get --raw /api/v1/nodes/$name/proxy/logs/foo.log 1603 file.ServeHTTP(w, req) 1604 })) 1605 } else { 1606 kl.logServer = http.StripPrefix("/logs/", file) 1607 } 1608 } 1609 if kl.kubeClient == nil { 1610 klog.InfoS("No API server defined - no node status update will be sent") 1611 } 1612 1613 // Start the cloud provider sync manager 1614 if kl.cloudResourceSyncManager != nil { 1615 go kl.cloudResourceSyncManager.Run(wait.NeverStop) 1616 } 1617 
1618 if err := kl.initializeModules(); err != nil { 1619 kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.KubeletSetupFailed, err.Error()) 1620 klog.ErrorS(err, "Failed to initialize internal modules") 1621 os.Exit(1) 1622 } 1623 1624 // Start volume manager 1625 go kl.volumeManager.Run(kl.sourcesReady, wait.NeverStop) 1626 1627 if kl.kubeClient != nil { 1628 // Start two go-routines to update the status. 1629 // 1630 // The first will report to the apiserver every nodeStatusUpdateFrequency and is aimed to provide regular status intervals, 1631 // while the second is used to provide a more timely status update during initialization and runs an one-shot update to the apiserver 1632 // once the node becomes ready, then exits afterwards. 1633 // 1634 // Introduce some small jittering to ensure that over time the requests won't start 1635 // accumulating at approximately the same time from the set of nodes due to priority and 1636 // fairness effect. 1637 go wait.JitterUntil(kl.syncNodeStatus, kl.nodeStatusUpdateFrequency, 0.04, true, wait.NeverStop) 1638 go kl.fastStatusUpdateOnce() 1639 1640 // start syncing lease 1641 go kl.nodeLeaseController.Run(context.Background()) 1642 } 1643 go wait.Until(kl.updateRuntimeUp, 5*time.Second, wait.NeverStop) 1644 1645 // Set up iptables util rules 1646 if kl.makeIPTablesUtilChains { 1647 kl.initNetworkUtil() 1648 } 1649 1650 // Start component sync loops. 1651 kl.statusManager.Start() 1652 1653 // Start syncing RuntimeClasses if enabled. 1654 if kl.runtimeClassManager != nil { 1655 kl.runtimeClassManager.Start(wait.NeverStop) 1656 } 1657 1658 // Start the pod lifecycle event generator. 1659 kl.pleg.Start() 1660 1661 // Start eventedPLEG only if EventedPLEG feature gate is enabled. 
1662 if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) { 1663 kl.eventedPleg.Start() 1664 } 1665 1666 kl.syncLoop(ctx, updates, kl) 1667 } 1668 1669 // SyncPod is the transaction script for the sync of a single pod (setting up) 1670 // a pod. This method is reentrant and expected to converge a pod towards the 1671 // desired state of the spec. The reverse (teardown) is handled in 1672 // SyncTerminatingPod and SyncTerminatedPod. If SyncPod exits without error, 1673 // then the pod runtime state is in sync with the desired configuration state 1674 // (pod is running). If SyncPod exits with a transient error, the next 1675 // invocation of SyncPod is expected to make progress towards reaching the 1676 // desired state. SyncPod exits with isTerminal when the pod was detected to 1677 // have reached a terminal lifecycle phase due to container exits (for 1678 // RestartNever or RestartOnFailure) and the next method invoked will be 1679 // SyncTerminatingPod. If the pod terminates for any other reason, SyncPod 1680 // will receive a context cancellation and should exit as soon as possible. 
1681 // 1682 // Arguments: 1683 // 1684 // updateType - whether this is a create (first time) or an update, should 1685 // only be used for metrics since this method must be reentrant 1686 // 1687 // pod - the pod that is being set up 1688 // 1689 // mirrorPod - the mirror pod known to the kubelet for this pod, if any 1690 // 1691 // podStatus - the most recent pod status observed for this pod which can 1692 // be used to determine the set of actions that should be taken during 1693 // this loop of SyncPod 1694 // 1695 // The workflow is: 1696 // - If the pod is being created, record pod worker start latency 1697 // - Call generateAPIPodStatus to prepare an v1.PodStatus for the pod 1698 // - If the pod is being seen as running for the first time, record pod 1699 // start latency 1700 // - Update the status of the pod in the status manager 1701 // - Stop the pod's containers if it should not be running due to soft 1702 // admission 1703 // - Ensure any background tracking for a runnable pod is started 1704 // - Create a mirror pod if the pod is a static pod, and does not 1705 // already have a mirror pod 1706 // - Create the data directories for the pod if they do not exist 1707 // - Wait for volumes to attach/mount 1708 // - Fetch the pull secrets for the pod 1709 // - Call the container runtime's SyncPod callback 1710 // - Update the traffic shaping for the pod's ingress and egress limits 1711 // 1712 // If any step of this workflow errors, the error is returned, and is repeated 1713 // on the next SyncPod call. 1714 // 1715 // This operation writes all events that are dispatched in order to provide 1716 // the most accurate information possible about an error situation to aid debugging. 1717 // Callers should not write an event if this operation returns an error. 
1718 func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (isTerminal bool, err error) { 1719 ctx, otelSpan := kl.tracer.Start(ctx, "syncPod", trace.WithAttributes( 1720 semconv.K8SPodUIDKey.String(string(pod.UID)), 1721 attribute.String("k8s.pod", klog.KObj(pod).String()), 1722 semconv.K8SPodNameKey.String(pod.Name), 1723 attribute.String("k8s.pod.update_type", updateType.String()), 1724 semconv.K8SNamespaceNameKey.String(pod.Namespace), 1725 )) 1726 klog.V(4).InfoS("SyncPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) 1727 defer func() { 1728 klog.V(4).InfoS("SyncPod exit", "pod", klog.KObj(pod), "podUID", pod.UID, "isTerminal", isTerminal) 1729 otelSpan.End() 1730 }() 1731 1732 // Latency measurements for the main workflow are relative to the 1733 // first time the pod was seen by kubelet. 1734 var firstSeenTime time.Time 1735 if firstSeenTimeStr, ok := pod.Annotations[kubetypes.ConfigFirstSeenAnnotationKey]; ok { 1736 firstSeenTime = kubetypes.ConvertToTimestamp(firstSeenTimeStr).Get() 1737 } 1738 1739 // Record pod worker start latency if being created 1740 // TODO: make pod workers record their own latencies 1741 if updateType == kubetypes.SyncPodCreate { 1742 if !firstSeenTime.IsZero() { 1743 // This is the first time we are syncing the pod. Record the latency 1744 // since kubelet first saw the pod if firstSeenTime is set. 1745 metrics.PodWorkerStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) 1746 } else { 1747 klog.V(3).InfoS("First seen time not recorded for pod", 1748 "podUID", pod.UID, 1749 "pod", klog.KObj(pod)) 1750 } 1751 } 1752 1753 // Generate final API pod status with pod and status manager status 1754 apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, false) 1755 // The pod IP may be changed in generateAPIPodStatus if the pod is using host network. 
(See #24576) 1756 // TODO(random-liu): After writing pod spec into container labels, check whether pod is using host network, and 1757 // set pod IP to hostIP directly in runtime.GetPodStatus 1758 podStatus.IPs = make([]string, 0, len(apiPodStatus.PodIPs)) 1759 for _, ipInfo := range apiPodStatus.PodIPs { 1760 podStatus.IPs = append(podStatus.IPs, ipInfo.IP) 1761 } 1762 if len(podStatus.IPs) == 0 && len(apiPodStatus.PodIP) > 0 { 1763 podStatus.IPs = []string{apiPodStatus.PodIP} 1764 } 1765 1766 // If the pod is terminal, we don't need to continue to setup the pod 1767 if apiPodStatus.Phase == v1.PodSucceeded || apiPodStatus.Phase == v1.PodFailed { 1768 kl.statusManager.SetPodStatus(pod, apiPodStatus) 1769 isTerminal = true 1770 return isTerminal, nil 1771 } 1772 1773 // If the pod should not be running, we request the pod's containers be stopped. This is not the same 1774 // as termination (we want to stop the pod, but potentially restart it later if soft admission allows 1775 // it later). Set the status and phase appropriately 1776 runnable := kl.canRunPod(pod) 1777 if !runnable.Admit { 1778 // Pod is not runnable; and update the Pod and Container statuses to why. 1779 if apiPodStatus.Phase != v1.PodFailed && apiPodStatus.Phase != v1.PodSucceeded { 1780 apiPodStatus.Phase = v1.PodPending 1781 } 1782 apiPodStatus.Reason = runnable.Reason 1783 apiPodStatus.Message = runnable.Message 1784 // Waiting containers are not creating. 1785 const waitingReason = "Blocked" 1786 for _, cs := range apiPodStatus.InitContainerStatuses { 1787 if cs.State.Waiting != nil { 1788 cs.State.Waiting.Reason = waitingReason 1789 } 1790 } 1791 for _, cs := range apiPodStatus.ContainerStatuses { 1792 if cs.State.Waiting != nil { 1793 cs.State.Waiting.Reason = waitingReason 1794 } 1795 } 1796 } 1797 1798 // Record the time it takes for the pod to become running 1799 // since kubelet first saw the pod if firstSeenTime is set. 
1800 existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID) 1801 if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning && 1802 !firstSeenTime.IsZero() { 1803 metrics.PodStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) 1804 } 1805 1806 kl.statusManager.SetPodStatus(pod, apiPodStatus) 1807 1808 // Pods that are not runnable must be stopped - return a typed error to the pod worker 1809 if !runnable.Admit { 1810 klog.V(2).InfoS("Pod is not runnable and must have running containers stopped", "pod", klog.KObj(pod), "podUID", pod.UID, "message", runnable.Message) 1811 var syncErr error 1812 p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) 1813 if err := kl.killPod(ctx, pod, p, nil); err != nil { 1814 if !wait.Interrupted(err) { 1815 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) 1816 syncErr = fmt.Errorf("error killing pod: %w", err) 1817 utilruntime.HandleError(syncErr) 1818 } 1819 } else { 1820 // There was no error killing the pod, but the pod cannot be run. 1821 // Return an error to signal that the sync loop should back off. 
1822 syncErr = fmt.Errorf("pod cannot be run: %v", runnable.Message) 1823 } 1824 return false, syncErr 1825 } 1826 1827 // If the network plugin is not ready, only start the pod if it uses the host network 1828 if err := kl.runtimeState.networkErrors(); err != nil && !kubecontainer.IsHostNetworkPod(pod) { 1829 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.NetworkNotReady, "%s: %v", NetworkNotReadyErrorMsg, err) 1830 return false, fmt.Errorf("%s: %v", NetworkNotReadyErrorMsg, err) 1831 } 1832 1833 // ensure the kubelet knows about referenced secrets or configmaps used by the pod 1834 if !kl.podWorkers.IsPodTerminationRequested(pod.UID) { 1835 if kl.secretManager != nil { 1836 kl.secretManager.RegisterPod(pod) 1837 } 1838 if kl.configMapManager != nil { 1839 kl.configMapManager.RegisterPod(pod) 1840 } 1841 } 1842 1843 // Create Cgroups for the pod and apply resource parameters 1844 // to them if cgroups-per-qos flag is enabled. 1845 pcm := kl.containerManager.NewPodContainerManager() 1846 // If pod has already been terminated then we need not create 1847 // or update the pod's cgroup 1848 // TODO: once context cancellation is added this check can be removed 1849 if !kl.podWorkers.IsPodTerminationRequested(pod.UID) { 1850 // When the kubelet is restarted with the cgroups-per-qos 1851 // flag enabled, all the pod's running containers 1852 // should be killed intermittently and brought back up 1853 // under the qos cgroup hierarchy. 
1854 // Check if this is the pod's first sync 1855 firstSync := true 1856 for _, containerStatus := range apiPodStatus.ContainerStatuses { 1857 if containerStatus.State.Running != nil { 1858 firstSync = false 1859 break 1860 } 1861 } 1862 // Don't kill containers in pod if pod's cgroups already 1863 // exists or the pod is running for the first time 1864 podKilled := false 1865 if !pcm.Exists(pod) && !firstSync { 1866 p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) 1867 if err := kl.killPod(ctx, pod, p, nil); err == nil { 1868 if wait.Interrupted(err) { 1869 return false, err 1870 } 1871 podKilled = true 1872 } else { 1873 klog.ErrorS(err, "KillPod failed", "pod", klog.KObj(pod), "podStatus", podStatus) 1874 } 1875 } 1876 // Create and Update pod's Cgroups 1877 // Don't create cgroups for run once pod if it was killed above 1878 // The current policy is not to restart the run once pods when 1879 // the kubelet is restarted with the new flag as run once pods are 1880 // expected to run only once and if the kubelet is restarted then 1881 // they are not expected to run again. 
1882 // We don't create and apply updates to cgroup if its a run once pod and was killed above 1883 if !(podKilled && pod.Spec.RestartPolicy == v1.RestartPolicyNever) { 1884 if !pcm.Exists(pod) { 1885 if err := kl.containerManager.UpdateQOSCgroups(); err != nil { 1886 klog.V(2).InfoS("Failed to update QoS cgroups while syncing pod", "pod", klog.KObj(pod), "err", err) 1887 } 1888 if err := pcm.EnsureExists(pod); err != nil { 1889 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToCreatePodContainer, "unable to ensure pod container exists: %v", err) 1890 return false, fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err) 1891 } 1892 } 1893 } 1894 } 1895 1896 // Create Mirror Pod for Static Pod if it doesn't already exist 1897 if kubetypes.IsStaticPod(pod) { 1898 deleted := false 1899 if mirrorPod != nil { 1900 if mirrorPod.DeletionTimestamp != nil || !kubepod.IsMirrorPodOf(mirrorPod, pod) { 1901 // The mirror pod is semantically different from the static pod. Remove 1902 // it. The mirror pod will get recreated later. 
1903 klog.InfoS("Trying to delete pod", "pod", klog.KObj(pod), "podUID", mirrorPod.ObjectMeta.UID) 1904 podFullName := kubecontainer.GetPodFullName(pod) 1905 var err error 1906 deleted, err = kl.mirrorPodClient.DeleteMirrorPod(podFullName, &mirrorPod.ObjectMeta.UID) 1907 if deleted { 1908 klog.InfoS("Deleted mirror pod because it is outdated", "pod", klog.KObj(mirrorPod)) 1909 } else if err != nil { 1910 klog.ErrorS(err, "Failed deleting mirror pod", "pod", klog.KObj(mirrorPod)) 1911 } 1912 } 1913 } 1914 if mirrorPod == nil || deleted { 1915 node, err := kl.GetNode() 1916 if err != nil || node.DeletionTimestamp != nil { 1917 klog.V(4).InfoS("No need to create a mirror pod, since node has been removed from the cluster", "node", klog.KRef("", string(kl.nodeName))) 1918 } else { 1919 klog.V(4).InfoS("Creating a mirror pod for static pod", "pod", klog.KObj(pod)) 1920 if err := kl.mirrorPodClient.CreateMirrorPod(pod); err != nil { 1921 klog.ErrorS(err, "Failed creating a mirror pod for", "pod", klog.KObj(pod)) 1922 } 1923 } 1924 } 1925 } 1926 1927 // Make data directories for the pod 1928 if err := kl.makePodDataDirs(pod); err != nil { 1929 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToMakePodDataDirectories, "error making pod data directories: %v", err) 1930 klog.ErrorS(err, "Unable to make pod data directories for pod", "pod", klog.KObj(pod)) 1931 return false, err 1932 } 1933 1934 // Wait for volumes to attach/mount 1935 if err := kl.volumeManager.WaitForAttachAndMount(ctx, pod); err != nil { 1936 if !wait.Interrupted(err) { 1937 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedMountVolume, "Unable to attach or mount volumes: %v", err) 1938 klog.ErrorS(err, "Unable to attach or mount volumes for pod; skipping pod", "pod", klog.KObj(pod)) 1939 } 1940 return false, err 1941 } 1942 1943 // Fetch the pull secrets for the pod 1944 pullSecrets := kl.getPullSecretsForPod(pod) 1945 1946 // Ensure the pod is being probed 1947 kl.probeManager.AddPod(pod) 
1948 1949 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { 1950 // Handle pod resize here instead of doing it in HandlePodUpdates because 1951 // this conveniently retries any Deferred resize requests 1952 // TODO(vinaykul,InPlacePodVerticalScaling): Investigate doing this in HandlePodUpdates + periodic SyncLoop scan 1953 // See: https://github.com/kubernetes/kubernetes/pull/102884#discussion_r663160060 1954 if kl.podWorkers.CouldHaveRunningContainers(pod.UID) && !kubetypes.IsStaticPod(pod) { 1955 pod = kl.handlePodResourcesResize(pod) 1956 } 1957 } 1958 1959 // TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker. 1960 // Currently, using that context causes test failures. To remove this todoCtx, any wait.Interrupted 1961 // errors need to be filtered from result and bypass the reasonCache - cancelling the context for 1962 // SyncPod is a known and deliberate error, not a generic error. 1963 todoCtx := context.TODO() 1964 // Call the container runtime's SyncPod callback 1965 result := kl.containerRuntime.SyncPod(todoCtx, pod, podStatus, pullSecrets, kl.backOff) 1966 kl.reasonCache.Update(pod.UID, result) 1967 if err := result.Error(); err != nil { 1968 // Do not return error if the only failures were pods in backoff 1969 for _, r := range result.SyncResults { 1970 if r.Error != kubecontainer.ErrCrashLoopBackOff && r.Error != images.ErrImagePullBackOff { 1971 // Do not record an event here, as we keep all event logging for sync pod failures 1972 // local to container runtime, so we get better errors. 
1973 return false, err 1974 } 1975 } 1976 1977 return false, nil 1978 } 1979 1980 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && isPodResizeInProgress(pod, &apiPodStatus) { 1981 // While resize is in progress, periodically call PLEG to update pod cache 1982 runningPod := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) 1983 if err, _ := kl.pleg.UpdateCache(&runningPod, pod.UID); err != nil { 1984 klog.ErrorS(err, "Failed to update pod cache", "pod", klog.KObj(pod)) 1985 return false, err 1986 } 1987 } 1988 1989 return false, nil 1990 } 1991 1992 // SyncTerminatingPod is expected to terminate all running containers in a pod. Once this method 1993 // returns without error, the pod is considered to be terminated and it will be safe to clean up any 1994 // pod state that is tied to the lifetime of running containers. The next method invoked will be 1995 // SyncTerminatedPod. This method is expected to return with the grace period provided and the 1996 // provided context may be cancelled if the duration is exceeded. The method may also be interrupted 1997 // with a context cancellation if the grace period is shortened by the user or the kubelet (such as 1998 // during eviction). This method is not guaranteed to be called if a pod is force deleted from the 1999 // configuration and the kubelet is restarted - SyncTerminatingRuntimePod handles those orphaned 2000 // pods. 2001 func (kl *Kubelet) SyncTerminatingPod(_ context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { 2002 // TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker. 2003 // Currently, using that context causes test failures. 
2004 ctx, otelSpan := kl.tracer.Start(context.Background(), "syncTerminatingPod", trace.WithAttributes( 2005 semconv.K8SPodUIDKey.String(string(pod.UID)), 2006 attribute.String("k8s.pod", klog.KObj(pod).String()), 2007 semconv.K8SPodNameKey.String(pod.Name), 2008 semconv.K8SNamespaceNameKey.String(pod.Namespace), 2009 )) 2010 defer otelSpan.End() 2011 klog.V(4).InfoS("SyncTerminatingPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) 2012 defer klog.V(4).InfoS("SyncTerminatingPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) 2013 2014 apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, false) 2015 if podStatusFn != nil { 2016 podStatusFn(&apiPodStatus) 2017 } 2018 kl.statusManager.SetPodStatus(pod, apiPodStatus) 2019 2020 if gracePeriod != nil { 2021 klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", *gracePeriod) 2022 } else { 2023 klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", nil) 2024 } 2025 2026 kl.probeManager.StopLivenessAndStartup(pod) 2027 2028 p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) 2029 if err := kl.killPod(ctx, pod, p, gracePeriod); err != nil { 2030 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) 2031 // there was an error killing the pod, so we return that error directly 2032 utilruntime.HandleError(err) 2033 return err 2034 } 2035 2036 // Once the containers are stopped, we can stop probing for liveness and readiness. 2037 // TODO: once a pod is terminal, certain probes (liveness exec) could be stopped immediately after 2038 // the detection of a container shutdown or (for readiness) after the first failure. Tracked as 2039 // https://github.com/kubernetes/kubernetes/issues/107894 although may not be worth optimizing. 
2040 kl.probeManager.RemovePod(pod) 2041 2042 // Guard against consistency issues in KillPod implementations by checking that there are no 2043 // running containers. This method is invoked infrequently so this is effectively free and can 2044 // catch race conditions introduced by callers updating pod status out of order. 2045 // TODO: have KillPod return the terminal status of stopped containers and write that into the 2046 // cache immediately 2047 podStatus, err := kl.containerRuntime.GetPodStatus(ctx, pod.UID, pod.Name, pod.Namespace) 2048 if err != nil { 2049 klog.ErrorS(err, "Unable to read pod status prior to final pod termination", "pod", klog.KObj(pod), "podUID", pod.UID) 2050 return err 2051 } 2052 var runningContainers []string 2053 type container struct { 2054 Name string 2055 State string 2056 ExitCode int 2057 FinishedAt string 2058 } 2059 var containers []container 2060 klogV := klog.V(4) 2061 klogVEnabled := klogV.Enabled() 2062 for _, s := range podStatus.ContainerStatuses { 2063 if s.State == kubecontainer.ContainerStateRunning { 2064 runningContainers = append(runningContainers, s.ID.String()) 2065 } 2066 if klogVEnabled { 2067 containers = append(containers, container{Name: s.Name, State: string(s.State), ExitCode: s.ExitCode, FinishedAt: s.FinishedAt.UTC().Format(time.RFC3339Nano)}) 2068 } 2069 } 2070 if klogVEnabled { 2071 sort.Slice(containers, func(i, j int) bool { return containers[i].Name < containers[j].Name }) 2072 klog.V(4).InfoS("Post-termination container state", "pod", klog.KObj(pod), "podUID", pod.UID, "containers", containers) 2073 } 2074 if len(runningContainers) > 0 { 2075 return fmt.Errorf("detected running containers after a successful KillPod, CRI violation: %v", runningContainers) 2076 } 2077 2078 // NOTE: resources must be unprepared AFTER all containers have stopped 2079 // and BEFORE the pod status is changed on the API server 2080 // to avoid race conditions with the resource deallocation code in kubernetes core. 
2081 if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) { 2082 if err := kl.UnprepareDynamicResources(pod); err != nil { 2083 return err 2084 } 2085 } 2086 2087 // Compute and update the status in cache once the pods are no longer running. 2088 // The computation is done here to ensure the pod status used for it contains 2089 // information about the container end states (including exit codes) - when 2090 // SyncTerminatedPod is called the containers may already be removed. 2091 apiPodStatus = kl.generateAPIPodStatus(pod, podStatus, true) 2092 kl.statusManager.SetPodStatus(pod, apiPodStatus) 2093 2094 // we have successfully stopped all containers, the pod is terminating, our status is "done" 2095 klog.V(4).InfoS("Pod termination stopped all running containers", "pod", klog.KObj(pod), "podUID", pod.UID) 2096 2097 return nil 2098 } 2099 2100 // SyncTerminatingRuntimePod is expected to terminate running containers in a pod that we have no 2101 // configuration for. Once this method returns without error, any remaining local state can be safely 2102 // cleaned up by background processes in each subsystem. Unlike syncTerminatingPod, we lack 2103 // knowledge of the full pod spec and so cannot perform lifecycle related operations, only ensure 2104 // that the remnant of the running pod is terminated and allow garbage collection to proceed. We do 2105 // not update the status of the pod because with the source of configuration removed, we have no 2106 // place to send that status. 2107 func (kl *Kubelet) SyncTerminatingRuntimePod(_ context.Context, runningPod *kubecontainer.Pod) error { 2108 // TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker. 2109 // Currently, using that context causes test failures. 
2110 ctx := context.Background() 2111 pod := runningPod.ToAPIPod() 2112 klog.V(4).InfoS("SyncTerminatingRuntimePod enter", "pod", klog.KObj(pod), "podUID", pod.UID) 2113 defer klog.V(4).InfoS("SyncTerminatingRuntimePod exit", "pod", klog.KObj(pod), "podUID", pod.UID) 2114 2115 // we kill the pod directly since we have lost all other information about the pod. 2116 klog.V(4).InfoS("Orphaned running pod terminating without grace period", "pod", klog.KObj(pod), "podUID", pod.UID) 2117 // TODO: this should probably be zero, to bypass any waiting (needs fixes in container runtime) 2118 gracePeriod := int64(1) 2119 if err := kl.killPod(ctx, pod, *runningPod, &gracePeriod); err != nil { 2120 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) 2121 // there was an error killing the pod, so we return that error directly 2122 utilruntime.HandleError(err) 2123 return err 2124 } 2125 klog.V(4).InfoS("Pod termination stopped all running orphaned containers", "pod", klog.KObj(pod), "podUID", pod.UID) 2126 return nil 2127 } 2128 2129 // SyncTerminatedPod cleans up a pod that has terminated (has no running containers). 2130 // The invocations in this call are expected to tear down all pod resources. 2131 // When this method exits the pod is expected to be ready for cleanup. This method 2132 // reduces the latency of pod cleanup but is not guaranteed to get called in all scenarios. 2133 // 2134 // Because the kubelet has no local store of information, all actions in this method that modify 2135 // on-disk state must be reentrant and be garbage collected by HandlePodCleanups or a separate loop. 2136 // This typically occurs when a pod is force deleted from configuration (local disk or API) and the 2137 // kubelet restarts in the middle of the action. 
2138 func (kl *Kubelet) SyncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { 2139 ctx, otelSpan := kl.tracer.Start(ctx, "syncTerminatedPod", trace.WithAttributes( 2140 semconv.K8SPodUIDKey.String(string(pod.UID)), 2141 attribute.String("k8s.pod", klog.KObj(pod).String()), 2142 semconv.K8SPodNameKey.String(pod.Name), 2143 semconv.K8SNamespaceNameKey.String(pod.Namespace), 2144 )) 2145 defer otelSpan.End() 2146 klog.V(4).InfoS("SyncTerminatedPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) 2147 defer klog.V(4).InfoS("SyncTerminatedPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) 2148 2149 // generate the final status of the pod 2150 // TODO: should we simply fold this into TerminatePod? that would give a single pod update 2151 apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, true) 2152 2153 kl.statusManager.SetPodStatus(pod, apiPodStatus) 2154 2155 // volumes are unmounted after the pod worker reports ShouldPodRuntimeBeRemoved (which is satisfied 2156 // before syncTerminatedPod is invoked) 2157 if err := kl.volumeManager.WaitForUnmount(ctx, pod); err != nil { 2158 return err 2159 } 2160 klog.V(4).InfoS("Pod termination unmounted volumes", "pod", klog.KObj(pod), "podUID", pod.UID) 2161 2162 if !kl.keepTerminatedPodVolumes { 2163 // This waiting loop relies on the background cleanup which starts after pod workers respond 2164 // true for ShouldPodRuntimeBeRemoved, which happens after `SyncTerminatingPod` is completed. 
2165 if err := wait.PollUntilContextCancel(ctx, 100*time.Millisecond, true, func(ctx context.Context) (bool, error) { 2166 volumesExist := kl.podVolumesExist(pod.UID) 2167 if volumesExist { 2168 klog.V(3).InfoS("Pod is terminated, but some volumes have not been cleaned up", "pod", klog.KObj(pod), "podUID", pod.UID) 2169 } 2170 return !volumesExist, nil 2171 }); err != nil { 2172 return err 2173 } 2174 klog.V(3).InfoS("Pod termination cleaned up volume paths", "pod", klog.KObj(pod), "podUID", pod.UID) 2175 } 2176 2177 // After volume unmount is complete, let the secret and configmap managers know we're done with this pod 2178 if kl.secretManager != nil { 2179 kl.secretManager.UnregisterPod(pod) 2180 } 2181 if kl.configMapManager != nil { 2182 kl.configMapManager.UnregisterPod(pod) 2183 } 2184 2185 // Note: we leave pod containers to be reclaimed in the background since dockershim requires the 2186 // container for retrieving logs and we want to make sure logs are available until the pod is 2187 // physically deleted. 2188 2189 // remove any cgroups in the hierarchy for pods that are no longer running. 2190 if kl.cgroupsPerQOS { 2191 pcm := kl.containerManager.NewPodContainerManager() 2192 name, _ := pcm.GetPodContainerName(pod) 2193 if err := pcm.Destroy(name); err != nil { 2194 return err 2195 } 2196 klog.V(4).InfoS("Pod termination removed cgroups", "pod", klog.KObj(pod), "podUID", pod.UID) 2197 } 2198 2199 kl.usernsManager.Release(pod.UID) 2200 2201 // mark the final pod status 2202 kl.statusManager.TerminatePod(pod) 2203 klog.V(4).InfoS("Pod is terminated and will need no more status updates", "pod", klog.KObj(pod), "podUID", pod.UID) 2204 2205 return nil 2206 } 2207 2208 // Get pods which should be resynchronized. Currently, the following pod should be resynchronized: 2209 // - pod whose work is ready. 2210 // - internal modules that request sync of a pod. 
2211 // 2212 // This method does not return orphaned pods (those known only to the pod worker that may have 2213 // been deleted from configuration). Those pods are synced by HandlePodCleanups as a consequence 2214 // of driving the state machine to completion. 2215 // 2216 // TODO: Consider synchronizing all pods which have not recently been acted on to be resilient 2217 // to bugs that might prevent updates from being delivered (such as the previous bug with 2218 // orphaned pods). Instead of asking the work queue for pending work, consider asking the 2219 // PodWorker which pods should be synced. 2220 func (kl *Kubelet) getPodsToSync() []*v1.Pod { 2221 allPods := kl.podManager.GetPods() 2222 podUIDs := kl.workQueue.GetWork() 2223 podUIDSet := sets.NewString() 2224 for _, podUID := range podUIDs { 2225 podUIDSet.Insert(string(podUID)) 2226 } 2227 var podsToSync []*v1.Pod 2228 for _, pod := range allPods { 2229 if podUIDSet.Has(string(pod.UID)) { 2230 // The work of the pod is ready 2231 podsToSync = append(podsToSync, pod) 2232 continue 2233 } 2234 for _, podSyncLoopHandler := range kl.PodSyncLoopHandlers { 2235 if podSyncLoopHandler.ShouldSync(pod) { 2236 podsToSync = append(podsToSync, pod) 2237 break 2238 } 2239 } 2240 } 2241 return podsToSync 2242 } 2243 2244 // deletePod deletes the pod from the internal state of the kubelet by: 2245 // 1. stopping the associated pod worker asynchronously 2246 // 2. signaling to kill the pod by sending on the podKillingCh channel 2247 // 2248 // deletePod returns an error if not all sources are ready or the pod is not 2249 // found in the runtime cache. 2250 func (kl *Kubelet) deletePod(pod *v1.Pod) error { 2251 if pod == nil { 2252 return fmt.Errorf("deletePod does not allow nil pod") 2253 } 2254 if !kl.sourcesReady.AllReady() { 2255 // If the sources aren't ready, skip deletion, as we may accidentally delete pods 2256 // for sources that haven't reported yet. 
2257 return fmt.Errorf("skipping delete because sources aren't ready yet") 2258 } 2259 klog.V(3).InfoS("Pod has been deleted and must be killed", "pod", klog.KObj(pod), "podUID", pod.UID) 2260 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2261 Pod: pod, 2262 UpdateType: kubetypes.SyncPodKill, 2263 }) 2264 // We leave the volume/directory cleanup to the periodic cleanup routine. 2265 return nil 2266 } 2267 2268 // rejectPod records an event about the pod with the given reason and message, 2269 // and updates the pod to the failed phase in the status manager. 2270 func (kl *Kubelet) rejectPod(pod *v1.Pod, reason, message string) { 2271 kl.recorder.Eventf(pod, v1.EventTypeWarning, reason, message) 2272 kl.statusManager.SetPodStatus(pod, v1.PodStatus{ 2273 Phase: v1.PodFailed, 2274 Reason: reason, 2275 Message: "Pod was rejected: " + message}) 2276 } 2277 2278 // canAdmitPod determines if a pod can be admitted, and gives a reason if it 2279 // cannot. "pod" is new pod, while "pods" are all admitted pods 2280 // The function returns a boolean value indicating whether the pod 2281 // can be admitted, a brief single-word reason and a message explaining why 2282 // the pod cannot be admitted. 2283 func (kl *Kubelet) canAdmitPod(pods []*v1.Pod, pod *v1.Pod) (bool, string, string) { 2284 // the kubelet will invoke each pod admit handler in sequence 2285 // if any handler rejects, the pod is rejected. 
2286 // TODO: move out of disk check into a pod admitter 2287 // TODO: out of resource eviction should have a pod admitter call-out 2288 attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: pods} 2289 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { 2290 // Use allocated resources values from checkpoint store (source of truth) to determine fit 2291 otherPods := make([]*v1.Pod, 0, len(pods)) 2292 for _, p := range pods { 2293 op := p.DeepCopy() 2294 kl.updateContainerResourceAllocation(op) 2295 2296 otherPods = append(otherPods, op) 2297 } 2298 attrs.OtherPods = otherPods 2299 } 2300 for _, podAdmitHandler := range kl.admitHandlers { 2301 if result := podAdmitHandler.Admit(attrs); !result.Admit { 2302 return false, result.Reason, result.Message 2303 } 2304 } 2305 2306 return true, "", "" 2307 } 2308 2309 func (kl *Kubelet) canRunPod(pod *v1.Pod) lifecycle.PodAdmitResult { 2310 attrs := &lifecycle.PodAdmitAttributes{Pod: pod} 2311 // Get "OtherPods". Rejected pods are failed, so only include admitted pods that are alive. 2312 attrs.OtherPods = kl.GetActivePods() 2313 2314 for _, handler := range kl.softAdmitHandlers { 2315 if result := handler.Admit(attrs); !result.Admit { 2316 return result 2317 } 2318 } 2319 2320 return lifecycle.PodAdmitResult{Admit: true} 2321 } 2322 2323 // syncLoop is the main loop for processing changes. It watches for changes from 2324 // three channels (file, apiserver, and http) and creates a union of them. For 2325 // any new change seen, will run a sync against desired state and running state. If 2326 // no changes are seen to the configuration, will synchronize the last known desired 2327 // state every sync-frequency seconds. Never returns. 
2328 func (kl *Kubelet) syncLoop(ctx context.Context, updates <-chan kubetypes.PodUpdate, handler SyncHandler) { 2329 klog.InfoS("Starting kubelet main sync loop") 2330 // The syncTicker wakes up kubelet to checks if there are any pod workers 2331 // that need to be sync'd. A one-second period is sufficient because the 2332 // sync interval is defaulted to 10s. 2333 syncTicker := time.NewTicker(time.Second) 2334 defer syncTicker.Stop() 2335 housekeepingTicker := time.NewTicker(housekeepingPeriod) 2336 defer housekeepingTicker.Stop() 2337 plegCh := kl.pleg.Watch() 2338 const ( 2339 base = 100 * time.Millisecond 2340 max = 5 * time.Second 2341 factor = 2 2342 ) 2343 duration := base 2344 // Responsible for checking limits in resolv.conf 2345 // The limits do not have anything to do with individual pods 2346 // Since this is called in syncLoop, we don't need to call it anywhere else 2347 if kl.dnsConfigurer != nil && kl.dnsConfigurer.ResolverConfig != "" { 2348 kl.dnsConfigurer.CheckLimitsForResolvConf() 2349 } 2350 2351 for { 2352 if err := kl.runtimeState.runtimeErrors(); err != nil { 2353 klog.ErrorS(err, "Skipping pod synchronization") 2354 // exponential backoff 2355 time.Sleep(duration) 2356 duration = time.Duration(math.Min(float64(max), factor*float64(duration))) 2357 continue 2358 } 2359 // reset backoff if we have a success 2360 duration = base 2361 2362 kl.syncLoopMonitor.Store(kl.clock.Now()) 2363 if !kl.syncLoopIteration(ctx, updates, handler, syncTicker.C, housekeepingTicker.C, plegCh) { 2364 break 2365 } 2366 kl.syncLoopMonitor.Store(kl.clock.Now()) 2367 } 2368 } 2369 2370 // syncLoopIteration reads from various channels and dispatches pods to the 2371 // given handler. 2372 // 2373 // Arguments: 2374 // 1. configCh: a channel to read config events from 2375 // 2. handler: the SyncHandler to dispatch pods to 2376 // 3. syncCh: a channel to read periodic sync events from 2377 // 4. housekeepingCh: a channel to read housekeeping events from 2378 // 5. 
plegCh: a channel to read PLEG updates from 2379 // 2380 // Events are also read from the kubelet liveness manager's update channel. 2381 // 2382 // The workflow is to read from one of the channels, handle that event, and 2383 // update the timestamp in the sync loop monitor. 2384 // 2385 // Here is an appropriate place to note that despite the syntactical 2386 // similarity to the switch statement, the case statements in a select are 2387 // evaluated in a pseudorandom order if there are multiple channels ready to 2388 // read from when the select is evaluated. In other words, case statements 2389 // are evaluated in random order, and you can not assume that the case 2390 // statements evaluate in order if multiple channels have events. 2391 // 2392 // With that in mind, in truly no particular order, the different channels 2393 // are handled as follows: 2394 // 2395 // - configCh: dispatch the pods for the config change to the appropriate 2396 // handler callback for the event type 2397 // - plegCh: update the runtime cache; sync pod 2398 // - syncCh: sync all pods waiting for sync 2399 // - housekeepingCh: trigger cleanup of pods 2400 // - health manager: sync pods that have failed or in which one or more 2401 // containers have failed health checks 2402 func (kl *Kubelet) syncLoopIteration(ctx context.Context, configCh <-chan kubetypes.PodUpdate, handler SyncHandler, 2403 syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool { 2404 select { 2405 case u, open := <-configCh: 2406 // Update from a config source; dispatch it to the right handler 2407 // callback. 2408 if !open { 2409 klog.ErrorS(nil, "Update channel is closed, exiting the sync loop") 2410 return false 2411 } 2412 2413 switch u.Op { 2414 case kubetypes.ADD: 2415 klog.V(2).InfoS("SyncLoop ADD", "source", u.Source, "pods", klog.KObjSlice(u.Pods)) 2416 // After restarting, kubelet will get all existing pods through 2417 // ADD as if they are new pods. 
These pods will then go through the 2418 // admission process and *may* be rejected. This can be resolved 2419 // once we have checkpointing. 2420 handler.HandlePodAdditions(u.Pods) 2421 case kubetypes.UPDATE: 2422 klog.V(2).InfoS("SyncLoop UPDATE", "source", u.Source, "pods", klog.KObjSlice(u.Pods)) 2423 handler.HandlePodUpdates(u.Pods) 2424 case kubetypes.REMOVE: 2425 klog.V(2).InfoS("SyncLoop REMOVE", "source", u.Source, "pods", klog.KObjSlice(u.Pods)) 2426 handler.HandlePodRemoves(u.Pods) 2427 case kubetypes.RECONCILE: 2428 klog.V(4).InfoS("SyncLoop RECONCILE", "source", u.Source, "pods", klog.KObjSlice(u.Pods)) 2429 handler.HandlePodReconcile(u.Pods) 2430 case kubetypes.DELETE: 2431 klog.V(2).InfoS("SyncLoop DELETE", "source", u.Source, "pods", klog.KObjSlice(u.Pods)) 2432 // DELETE is treated as a UPDATE because of graceful deletion. 2433 handler.HandlePodUpdates(u.Pods) 2434 case kubetypes.SET: 2435 // TODO: Do we want to support this? 2436 klog.ErrorS(nil, "Kubelet does not support snapshot update") 2437 default: 2438 klog.ErrorS(nil, "Invalid operation type received", "operation", u.Op) 2439 } 2440 2441 kl.sourcesReady.AddSource(u.Source) 2442 2443 case e := <-plegCh: 2444 if isSyncPodWorthy(e) { 2445 // PLEG event for a pod; sync it. 2446 if pod, ok := kl.podManager.GetPodByUID(e.ID); ok { 2447 klog.V(2).InfoS("SyncLoop (PLEG): event for pod", "pod", klog.KObj(pod), "event", e) 2448 handler.HandlePodSyncs([]*v1.Pod{pod}) 2449 } else { 2450 // If the pod no longer exists, ignore the event. 
2451 klog.V(4).InfoS("SyncLoop (PLEG): pod does not exist, ignore irrelevant event", "event", e) 2452 } 2453 } 2454 2455 if e.Type == pleg.ContainerDied { 2456 if containerID, ok := e.Data.(string); ok { 2457 kl.cleanUpContainersInPod(e.ID, containerID) 2458 } 2459 } 2460 case <-syncCh: 2461 // Sync pods waiting for sync 2462 podsToSync := kl.getPodsToSync() 2463 if len(podsToSync) == 0 { 2464 break 2465 } 2466 klog.V(4).InfoS("SyncLoop (SYNC) pods", "total", len(podsToSync), "pods", klog.KObjSlice(podsToSync)) 2467 handler.HandlePodSyncs(podsToSync) 2468 case update := <-kl.livenessManager.Updates(): 2469 if update.Result == proberesults.Failure { 2470 handleProbeSync(kl, update, handler, "liveness", "unhealthy") 2471 } 2472 case update := <-kl.readinessManager.Updates(): 2473 ready := update.Result == proberesults.Success 2474 kl.statusManager.SetContainerReadiness(update.PodUID, update.ContainerID, ready) 2475 2476 status := "" 2477 if ready { 2478 status = "ready" 2479 } 2480 handleProbeSync(kl, update, handler, "readiness", status) 2481 case update := <-kl.startupManager.Updates(): 2482 started := update.Result == proberesults.Success 2483 kl.statusManager.SetContainerStartup(update.PodUID, update.ContainerID, started) 2484 2485 status := "unhealthy" 2486 if started { 2487 status = "started" 2488 } 2489 handleProbeSync(kl, update, handler, "startup", status) 2490 case <-housekeepingCh: 2491 if !kl.sourcesReady.AllReady() { 2492 // If the sources aren't ready or volume manager has not yet synced the states, 2493 // skip housekeeping, as we may accidentally delete pods from unready sources. 
2494 klog.V(4).InfoS("SyncLoop (housekeeping, skipped): sources aren't ready yet") 2495 } else { 2496 start := time.Now() 2497 klog.V(4).InfoS("SyncLoop (housekeeping)") 2498 if err := handler.HandlePodCleanups(ctx); err != nil { 2499 klog.ErrorS(err, "Failed cleaning pods") 2500 } 2501 duration := time.Since(start) 2502 if duration > housekeepingWarningDuration { 2503 klog.ErrorS(fmt.Errorf("housekeeping took too long"), "Housekeeping took longer than expected", "expected", housekeepingWarningDuration, "actual", duration.Round(time.Millisecond)) 2504 } 2505 klog.V(4).InfoS("SyncLoop (housekeeping) end", "duration", duration.Round(time.Millisecond)) 2506 } 2507 } 2508 return true 2509 } 2510 2511 func handleProbeSync(kl *Kubelet, update proberesults.Update, handler SyncHandler, probe, status string) { 2512 // We should not use the pod from manager, because it is never updated after initialization. 2513 pod, ok := kl.podManager.GetPodByUID(update.PodUID) 2514 if !ok { 2515 // If the pod no longer exists, ignore the update. 2516 klog.V(4).InfoS("SyncLoop (probe): ignore irrelevant update", "probe", probe, "status", status, "update", update) 2517 return 2518 } 2519 klog.V(1).InfoS("SyncLoop (probe)", "probe", probe, "status", status, "pod", klog.KObj(pod)) 2520 handler.HandlePodSyncs([]*v1.Pod{pod}) 2521 } 2522 2523 // HandlePodAdditions is the callback in SyncHandler for pods being added from 2524 // a config source. 2525 func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) { 2526 start := kl.clock.Now() 2527 sort.Sort(sliceutils.PodsByCreationTime(pods)) 2528 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { 2529 kl.podResizeMutex.Lock() 2530 defer kl.podResizeMutex.Unlock() 2531 } 2532 for _, pod := range pods { 2533 existingPods := kl.podManager.GetPods() 2534 // Always add the pod to the pod manager. Kubelet relies on the pod 2535 // manager as the source of truth for the desired state. 
If a pod does 2536 // not exist in the pod manager, it means that it has been deleted in 2537 // the apiserver and no action (other than cleanup) is required. 2538 kl.podManager.AddPod(pod) 2539 2540 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 2541 if wasMirror { 2542 if pod == nil { 2543 klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID) 2544 continue 2545 } 2546 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2547 Pod: pod, 2548 MirrorPod: mirrorPod, 2549 UpdateType: kubetypes.SyncPodUpdate, 2550 StartTime: start, 2551 }) 2552 continue 2553 } 2554 2555 // Only go through the admission process if the pod is not requested 2556 // for termination by another part of the kubelet. If the pod is already 2557 // using resources (previously admitted), the pod worker is going to be 2558 // shutting it down. If the pod hasn't started yet, we know that when 2559 // the pod worker is invoked it will also avoid setting up the pod, so 2560 // we simply avoid doing any work. 2561 if !kl.podWorkers.IsPodTerminationRequested(pod.UID) { 2562 // We failed pods that we rejected, so activePods include all admitted 2563 // pods that are alive. 2564 activePods := kl.filterOutInactivePods(existingPods) 2565 2566 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { 2567 // To handle kubelet restarts, test pod admissibility using AllocatedResources values 2568 // (for cpu & memory) from checkpoint store. If found, that is the source of truth. 2569 podCopy := pod.DeepCopy() 2570 kl.updateContainerResourceAllocation(podCopy) 2571 2572 // Check if we can admit the pod; if not, reject it. 
2573 if ok, reason, message := kl.canAdmitPod(activePods, podCopy); !ok { 2574 kl.rejectPod(pod, reason, message) 2575 continue 2576 } 2577 // For new pod, checkpoint the resource values at which the Pod has been admitted 2578 if err := kl.statusManager.SetPodAllocation(podCopy); err != nil { 2579 //TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate 2580 klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(pod)) 2581 } 2582 } else { 2583 // Check if we can admit the pod; if not, reject it. 2584 if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok { 2585 kl.rejectPod(pod, reason, message) 2586 continue 2587 } 2588 } 2589 } 2590 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2591 Pod: pod, 2592 MirrorPod: mirrorPod, 2593 UpdateType: kubetypes.SyncPodCreate, 2594 StartTime: start, 2595 }) 2596 } 2597 } 2598 2599 // updateContainerResourceAllocation updates AllocatedResources values 2600 // (for cpu & memory) from checkpoint store 2601 func (kl *Kubelet) updateContainerResourceAllocation(pod *v1.Pod) { 2602 for _, c := range pod.Spec.Containers { 2603 allocatedResources, found := kl.statusManager.GetContainerResourceAllocation(string(pod.UID), c.Name) 2604 if c.Resources.Requests != nil && found { 2605 if _, ok := allocatedResources[v1.ResourceCPU]; ok { 2606 c.Resources.Requests[v1.ResourceCPU] = allocatedResources[v1.ResourceCPU] 2607 } 2608 if _, ok := allocatedResources[v1.ResourceMemory]; ok { 2609 c.Resources.Requests[v1.ResourceMemory] = allocatedResources[v1.ResourceMemory] 2610 } 2611 } 2612 } 2613 } 2614 2615 // HandlePodUpdates is the callback in the SyncHandler interface for pods 2616 // being updated from a config source. 
2617 func (kl *Kubelet) HandlePodUpdates(pods []*v1.Pod) { 2618 start := kl.clock.Now() 2619 for _, pod := range pods { 2620 kl.podManager.UpdatePod(pod) 2621 2622 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 2623 if wasMirror { 2624 if pod == nil { 2625 klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID) 2626 continue 2627 } 2628 } 2629 2630 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2631 Pod: pod, 2632 MirrorPod: mirrorPod, 2633 UpdateType: kubetypes.SyncPodUpdate, 2634 StartTime: start, 2635 }) 2636 } 2637 } 2638 2639 // HandlePodRemoves is the callback in the SyncHandler interface for pods 2640 // being removed from a config source. 2641 func (kl *Kubelet) HandlePodRemoves(pods []*v1.Pod) { 2642 start := kl.clock.Now() 2643 for _, pod := range pods { 2644 kl.podManager.RemovePod(pod) 2645 2646 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 2647 if wasMirror { 2648 if pod == nil { 2649 klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID) 2650 continue 2651 } 2652 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2653 Pod: pod, 2654 MirrorPod: mirrorPod, 2655 UpdateType: kubetypes.SyncPodUpdate, 2656 StartTime: start, 2657 }) 2658 continue 2659 } 2660 2661 // Deletion is allowed to fail because the periodic cleanup routine 2662 // will trigger deletion again. 2663 if err := kl.deletePod(pod); err != nil { 2664 klog.V(2).InfoS("Failed to delete pod", "pod", klog.KObj(pod), "err", err) 2665 } 2666 } 2667 } 2668 2669 // HandlePodReconcile is the callback in the SyncHandler interface for pods 2670 // that should be reconciled. Pods are reconciled when only the status of the 2671 // pod is updated in the API. 
2672 func (kl *Kubelet) HandlePodReconcile(pods []*v1.Pod) { 2673 start := kl.clock.Now() 2674 for _, pod := range pods { 2675 // Update the pod in pod manager, status manager will do periodically reconcile according 2676 // to the pod manager. 2677 kl.podManager.UpdatePod(pod) 2678 2679 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 2680 if wasMirror { 2681 if pod == nil { 2682 klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID) 2683 continue 2684 } 2685 // Static pods should be reconciled the same way as regular pods 2686 } 2687 2688 // TODO: reconcile being calculated in the config manager is questionable, and avoiding 2689 // extra syncs may no longer be necessary. Reevaluate whether Reconcile and Sync can be 2690 // merged (after resolving the next two TODOs). 2691 2692 // Reconcile Pod "Ready" condition if necessary. Trigger sync pod for reconciliation. 2693 // TODO: this should be unnecessary today - determine what is the cause for this to 2694 // be different than Sync, or if there is a better place for it. For instance, we have 2695 // needsReconcile in kubelet/config, here, and in status_manager. 2696 if status.NeedToReconcilePodReadiness(pod) { 2697 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2698 Pod: pod, 2699 MirrorPod: mirrorPod, 2700 UpdateType: kubetypes.SyncPodSync, 2701 StartTime: start, 2702 }) 2703 } 2704 2705 // After an evicted pod is synced, all dead containers in the pod can be removed. 2706 // TODO: this is questionable - status read is async and during eviction we already 2707 // expect to not have some container info. The pod worker knows whether a pod has 2708 // been evicted, so if this is about minimizing the time to react to an eviction we 2709 // can do better. If it's about preserving pod status info we can also do better. 
2710 if eviction.PodIsEvicted(pod.Status) { 2711 if podStatus, err := kl.podCache.Get(pod.UID); err == nil { 2712 kl.containerDeletor.deleteContainersInPod("", podStatus, true) 2713 } 2714 } 2715 } 2716 } 2717 2718 // HandlePodSyncs is the callback in the syncHandler interface for pods 2719 // that should be dispatched to pod workers for sync. 2720 func (kl *Kubelet) HandlePodSyncs(pods []*v1.Pod) { 2721 start := kl.clock.Now() 2722 for _, pod := range pods { 2723 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 2724 if wasMirror { 2725 if pod == nil { 2726 klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID) 2727 continue 2728 } 2729 // Syncing a mirror pod is a programmer error since the intent of sync is to 2730 // batch notify all pending work. We should make it impossible to double sync, 2731 // but for now log a programmer error to prevent accidental introduction. 2732 klog.V(3).InfoS("Programmer error, HandlePodSyncs does not expect to receive mirror pods", "podUID", pod.UID, "mirrorPodUID", mirrorPod.UID) 2733 continue 2734 } 2735 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2736 Pod: pod, 2737 MirrorPod: mirrorPod, 2738 UpdateType: kubetypes.SyncPodSync, 2739 StartTime: start, 2740 }) 2741 } 2742 } 2743 2744 func isPodResizeInProgress(pod *v1.Pod, podStatus *v1.PodStatus) bool { 2745 for _, c := range pod.Spec.Containers { 2746 if cs, ok := podutil.GetContainerStatus(podStatus.ContainerStatuses, c.Name); ok { 2747 if cs.Resources == nil { 2748 continue 2749 } 2750 if !cmp.Equal(c.Resources.Limits, cs.Resources.Limits) || !cmp.Equal(cs.AllocatedResources, cs.Resources.Requests) { 2751 return true 2752 } 2753 } 2754 } 2755 return false 2756 } 2757 2758 func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus) { 2759 var otherActivePods []*v1.Pod 2760 2761 node, err := kl.getNodeAnyWay() 2762 if err != nil { 2763 klog.ErrorS(err, "getNodeAnyway 
function failed") 2764 return false, nil, "" 2765 } 2766 podCopy := pod.DeepCopy() 2767 cpuAvailable := node.Status.Allocatable.Cpu().MilliValue() 2768 memAvailable := node.Status.Allocatable.Memory().Value() 2769 cpuRequests := resource.GetResourceRequest(podCopy, v1.ResourceCPU) 2770 memRequests := resource.GetResourceRequest(podCopy, v1.ResourceMemory) 2771 if cpuRequests > cpuAvailable || memRequests > memAvailable { 2772 klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", podCopy.Name) 2773 return false, podCopy, v1.PodResizeStatusInfeasible 2774 } 2775 2776 // Treat the existing pod needing resize as a new pod with desired resources seeking admit. 2777 // If desired resources don't fit, pod continues to run with currently allocated resources. 2778 activePods := kl.GetActivePods() 2779 for _, p := range activePods { 2780 if p.UID != pod.UID { 2781 otherActivePods = append(otherActivePods, p) 2782 } 2783 } 2784 2785 if ok, failReason, failMessage := kl.canAdmitPod(otherActivePods, podCopy); !ok { 2786 // Log reason and return. 
Let the next sync iteration retry the resize 2787 klog.V(3).InfoS("Resize cannot be accommodated", "pod", podCopy.Name, "reason", failReason, "message", failMessage) 2788 return false, podCopy, v1.PodResizeStatusDeferred 2789 } 2790 2791 for _, container := range podCopy.Spec.Containers { 2792 idx, found := podutil.GetIndexOfContainerStatus(podCopy.Status.ContainerStatuses, container.Name) 2793 if found { 2794 for rName, rQuantity := range container.Resources.Requests { 2795 podCopy.Status.ContainerStatuses[idx].AllocatedResources[rName] = rQuantity 2796 } 2797 } 2798 } 2799 return true, podCopy, v1.PodResizeStatusInProgress 2800 } 2801 2802 func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) *v1.Pod { 2803 if pod.Status.Phase != v1.PodRunning { 2804 return pod 2805 } 2806 podResized := false 2807 for _, container := range pod.Spec.Containers { 2808 if len(container.Resources.Requests) == 0 { 2809 continue 2810 } 2811 containerStatus, found := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name) 2812 if !found { 2813 klog.V(5).InfoS("ContainerStatus not found", "pod", pod.Name, "container", container.Name) 2814 break 2815 } 2816 if len(containerStatus.AllocatedResources) != len(container.Resources.Requests) { 2817 klog.V(5).InfoS("ContainerStatus.AllocatedResources length mismatch", "pod", pod.Name, "container", container.Name) 2818 break 2819 } 2820 if !cmp.Equal(container.Resources.Requests, containerStatus.AllocatedResources) { 2821 podResized = true 2822 break 2823 } 2824 } 2825 if !podResized { 2826 return pod 2827 } 2828 2829 kl.podResizeMutex.Lock() 2830 defer kl.podResizeMutex.Unlock() 2831 fit, updatedPod, resizeStatus := kl.canResizePod(pod) 2832 if updatedPod == nil { 2833 return pod 2834 } 2835 if fit { 2836 // Update pod resource allocation checkpoint 2837 if err := kl.statusManager.SetPodAllocation(updatedPod); err != nil { 2838 //TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? 
Investigate 2839 klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(updatedPod)) 2840 return pod 2841 } 2842 } 2843 if resizeStatus != "" { 2844 // Save resize decision to checkpoint 2845 if err := kl.statusManager.SetPodResizeStatus(updatedPod.UID, resizeStatus); err != nil { 2846 //TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate 2847 klog.ErrorS(err, "SetPodResizeStatus failed", "pod", klog.KObj(updatedPod)) 2848 return pod 2849 } 2850 updatedPod.Status.Resize = resizeStatus 2851 } 2852 kl.podManager.UpdatePod(updatedPod) 2853 kl.statusManager.SetPodStatus(updatedPod, updatedPod.Status) 2854 return updatedPod 2855 } 2856 2857 // LatestLoopEntryTime returns the last time in the sync loop monitor. 2858 func (kl *Kubelet) LatestLoopEntryTime() time.Time { 2859 val := kl.syncLoopMonitor.Load() 2860 if val == nil { 2861 return time.Time{} 2862 } 2863 return val.(time.Time) 2864 } 2865 2866 // updateRuntimeUp calls the container runtime status callback, initializing 2867 // the runtime dependent modules when the container runtime first comes up, 2868 // and returns an error if the status check fails. If the status check is OK, 2869 // update the container runtime uptime in the kubelet runtimeState. 2870 func (kl *Kubelet) updateRuntimeUp() { 2871 kl.updateRuntimeMux.Lock() 2872 defer kl.updateRuntimeMux.Unlock() 2873 ctx := context.Background() 2874 2875 s, err := kl.containerRuntime.Status(ctx) 2876 if err != nil { 2877 klog.ErrorS(err, "Container runtime sanity check failed") 2878 return 2879 } 2880 if s == nil { 2881 klog.ErrorS(nil, "Container runtime status is nil") 2882 return 2883 } 2884 // Periodically log the whole runtime status for debugging. 
2885 klog.V(4).InfoS("Container runtime status", "status", s) 2886 klogErrorS := klog.ErrorS 2887 if !kl.containerRuntimeReadyExpected { 2888 klogErrorS = klog.V(4).ErrorS 2889 } 2890 networkReady := s.GetRuntimeCondition(kubecontainer.NetworkReady) 2891 if networkReady == nil || !networkReady.Status { 2892 klogErrorS(nil, "Container runtime network not ready", "networkReady", networkReady) 2893 kl.runtimeState.setNetworkState(fmt.Errorf("container runtime network not ready: %v", networkReady)) 2894 } else { 2895 // Set nil if the container runtime network is ready. 2896 kl.runtimeState.setNetworkState(nil) 2897 } 2898 // information in RuntimeReady condition will be propagated to NodeReady condition. 2899 runtimeReady := s.GetRuntimeCondition(kubecontainer.RuntimeReady) 2900 // If RuntimeReady is not set or is false, report an error. 2901 if runtimeReady == nil || !runtimeReady.Status { 2902 klogErrorS(nil, "Container runtime not ready", "runtimeReady", runtimeReady) 2903 kl.runtimeState.setRuntimeState(fmt.Errorf("container runtime not ready: %v", runtimeReady)) 2904 return 2905 } 2906 kl.runtimeState.setRuntimeState(nil) 2907 kl.oneTimeInitializer.Do(kl.initializeRuntimeDependentModules) 2908 kl.runtimeState.setRuntimeSync(kl.clock.Now()) 2909 } 2910 2911 // GetConfiguration returns the KubeletConfiguration used to configure the kubelet. 2912 func (kl *Kubelet) GetConfiguration() kubeletconfiginternal.KubeletConfiguration { 2913 return kl.kubeletConfiguration 2914 } 2915 2916 // BirthCry sends an event that the kubelet has started up. 2917 func (kl *Kubelet) BirthCry() { 2918 // Make an event that kubelet restarted. 2919 kl.recorder.Eventf(kl.nodeRef, v1.EventTypeNormal, events.StartingKubelet, "Starting kubelet.") 2920 } 2921 2922 // ResyncInterval returns the interval used for periodic syncs. 2923 func (kl *Kubelet) ResyncInterval() time.Duration { 2924 return kl.resyncInterval 2925 } 2926 2927 // ListenAndServe runs the kubelet HTTP server. 
2928 func (kl *Kubelet) ListenAndServe(kubeCfg *kubeletconfiginternal.KubeletConfiguration, tlsOptions *server.TLSOptions, 2929 auth server.AuthInterface, tp trace.TracerProvider) { 2930 server.ListenAndServeKubeletServer(kl, kl.resourceAnalyzer, kubeCfg, tlsOptions, auth, tp) 2931 } 2932 2933 // ListenAndServeReadOnly runs the kubelet HTTP server in read-only mode. 2934 func (kl *Kubelet) ListenAndServeReadOnly(address net.IP, port uint) { 2935 server.ListenAndServeKubeletReadOnlyServer(kl, kl.resourceAnalyzer, address, port) 2936 } 2937 2938 // ListenAndServePodResources runs the kubelet podresources grpc service 2939 func (kl *Kubelet) ListenAndServePodResources() { 2940 endpoint, err := util.LocalEndpoint(kl.getPodResourcesDir(), podresources.Socket) 2941 if err != nil { 2942 klog.V(2).InfoS("Failed to get local endpoint for PodResources endpoint", "err", err) 2943 return 2944 } 2945 2946 providers := podresources.PodResourcesProviders{ 2947 Pods: kl.podManager, 2948 Devices: kl.containerManager, 2949 Cpus: kl.containerManager, 2950 Memory: kl.containerManager, 2951 DynamicResources: kl.containerManager, 2952 } 2953 2954 server.ListenAndServePodResources(endpoint, providers) 2955 } 2956 2957 // Delete the eligible dead container instances in a pod. Depending on the configuration, the latest dead containers may be kept around. 2958 func (kl *Kubelet) cleanUpContainersInPod(podID types.UID, exitedContainerID string) { 2959 if podStatus, err := kl.podCache.Get(podID); err == nil { 2960 // When an evicted or deleted pod has already synced, all containers can be removed. 2961 removeAll := kl.podWorkers.ShouldPodContentBeRemoved(podID) 2962 kl.containerDeletor.deleteContainersInPod(exitedContainerID, podStatus, removeAll) 2963 } 2964 } 2965 2966 // fastStatusUpdateOnce starts a loop that checks if the current state of kubelet + container runtime 2967 // would be able to turn the node ready, and sync the ready state to the apiserver as soon as possible. 
2968 // Function returns after the node status update after such event, or when the node is already ready. 2969 // Function is executed only during Kubelet start which improves latency to ready node by updating 2970 // kubelet state, runtime status and node statuses ASAP. 2971 func (kl *Kubelet) fastStatusUpdateOnce() { 2972 ctx := context.Background() 2973 start := kl.clock.Now() 2974 stopCh := make(chan struct{}) 2975 2976 // Keep trying to make fast node status update until either timeout is reached or an update is successful. 2977 wait.Until(func() { 2978 // fastNodeStatusUpdate returns true when it succeeds or when the grace period has expired 2979 // (status was not updated within nodeReadyGracePeriod and the second argument below gets true), 2980 // then we close the channel and abort the loop. 2981 if kl.fastNodeStatusUpdate(ctx, kl.clock.Since(start) >= nodeReadyGracePeriod) { 2982 close(stopCh) 2983 } 2984 }, 100*time.Millisecond, stopCh) 2985 } 2986 2987 // CheckpointContainer tries to checkpoint a container. The parameters are used to 2988 // look up the specified container. If the container specified by the given parameters 2989 // cannot be found an error is returned. If the container is found the container 2990 // engine will be asked to checkpoint the given container into the kubelet's default 2991 // checkpoint directory. 
2992 func (kl *Kubelet) CheckpointContainer( 2993 ctx context.Context, 2994 podUID types.UID, 2995 podFullName, 2996 containerName string, 2997 options *runtimeapi.CheckpointContainerRequest, 2998 ) error { 2999 container, err := kl.findContainer(ctx, podFullName, podUID, containerName) 3000 if err != nil { 3001 return err 3002 } 3003 if container == nil { 3004 return fmt.Errorf("container %v not found", containerName) 3005 } 3006 3007 options.Location = filepath.Join( 3008 kl.getCheckpointsDir(), 3009 fmt.Sprintf( 3010 "checkpoint-%s-%s-%s.tar", 3011 podFullName, 3012 containerName, 3013 time.Now().Format(time.RFC3339), 3014 ), 3015 ) 3016 3017 options.ContainerId = string(container.ID.ID) 3018 3019 if err := kl.containerRuntime.CheckpointContainer(ctx, options); err != nil { 3020 return err 3021 } 3022 3023 return nil 3024 } 3025 3026 // ListMetricDescriptors gets the descriptors for the metrics that will be returned in ListPodSandboxMetrics. 3027 func (kl *Kubelet) ListMetricDescriptors(ctx context.Context) ([]*runtimeapi.MetricDescriptor, error) { 3028 return kl.containerRuntime.ListMetricDescriptors(ctx) 3029 } 3030 3031 // ListPodSandboxMetrics retrieves the metrics for all pod sandboxes. 
3032 func (kl *Kubelet) ListPodSandboxMetrics(ctx context.Context) ([]*runtimeapi.PodSandboxMetrics, error) { 3033 return kl.containerRuntime.ListPodSandboxMetrics(ctx) 3034 } 3035 3036 func (kl *Kubelet) supportLocalStorageCapacityIsolation() bool { 3037 return kl.GetConfiguration().LocalStorageCapacityIsolation 3038 } 3039 3040 // isSyncPodWorthy filters out events that are not worthy of pod syncing 3041 func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool { 3042 // ContainerRemoved doesn't affect pod state 3043 return event.Type != pleg.ContainerRemoved 3044 } 3045 3046 // PrepareDynamicResources calls the container Manager PrepareDynamicResources API 3047 // This method implements the RuntimeHelper interface 3048 func (kl *Kubelet) PrepareDynamicResources(pod *v1.Pod) error { 3049 return kl.containerManager.PrepareDynamicResources(pod) 3050 } 3051 3052 // UnprepareDynamicResources calls the container Manager UnprepareDynamicResources API 3053 // This method implements the RuntimeHelper interface 3054 func (kl *Kubelet) UnprepareDynamicResources(pod *v1.Pod) error { 3055 return kl.containerManager.UnprepareDynamicResources(pod) 3056 }