k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/kubelet.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package kubelet 18 19 import ( 20 "context" 21 "crypto/tls" 22 "errors" 23 "fmt" 24 "math" 25 "net" 26 "net/http" 27 "os" 28 "path/filepath" 29 sysruntime "runtime" 30 "sort" 31 "sync" 32 "sync/atomic" 33 "time" 34 35 cadvisorapi "github.com/google/cadvisor/info/v1" 36 "github.com/google/go-cmp/cmp" 37 "github.com/opencontainers/selinux/go-selinux" 38 "go.opentelemetry.io/otel/attribute" 39 semconv "go.opentelemetry.io/otel/semconv/v1.12.0" 40 "go.opentelemetry.io/otel/trace" 41 "k8s.io/client-go/informers" 42 43 "k8s.io/mount-utils" 44 netutils "k8s.io/utils/net" 45 46 v1 "k8s.io/api/core/v1" 47 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 48 "k8s.io/apimachinery/pkg/fields" 49 "k8s.io/apimachinery/pkg/labels" 50 "k8s.io/apimachinery/pkg/types" 51 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 52 "k8s.io/apimachinery/pkg/util/sets" 53 "k8s.io/apimachinery/pkg/util/wait" 54 utilfeature "k8s.io/apiserver/pkg/util/feature" 55 clientset "k8s.io/client-go/kubernetes" 56 v1core "k8s.io/client-go/kubernetes/typed/core/v1" 57 corelisters "k8s.io/client-go/listers/core/v1" 58 "k8s.io/client-go/tools/cache" 59 "k8s.io/client-go/tools/record" 60 "k8s.io/client-go/util/certificate" 61 "k8s.io/client-go/util/flowcontrol" 62 cloudprovider "k8s.io/cloud-provider" 63 "k8s.io/component-helpers/apimachinery/lease" 64 internalapi "k8s.io/cri-api/pkg/apis" 65 runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" 66 remote "k8s.io/cri-client/pkg" 67 "k8s.io/klog/v2" 68 pluginwatcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1" 69 statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" 70 podutil "k8s.io/kubernetes/pkg/api/v1/pod" 71 "k8s.io/kubernetes/pkg/api/v1/resource" 72 "k8s.io/kubernetes/pkg/features" 73 kubeletconfiginternal "k8s.io/kubernetes/pkg/kubelet/apis/config" 74 "k8s.io/kubernetes/pkg/kubelet/apis/podresources" 75 "k8s.io/kubernetes/pkg/kubelet/cadvisor" 76 kubeletcertificate "k8s.io/kubernetes/pkg/kubelet/certificate" 77 "k8s.io/kubernetes/pkg/kubelet/cloudresource" 78 "k8s.io/kubernetes/pkg/kubelet/clustertrustbundle" 79 "k8s.io/kubernetes/pkg/kubelet/cm" 80 draplugin "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin" 81 "k8s.io/kubernetes/pkg/kubelet/config" 82 "k8s.io/kubernetes/pkg/kubelet/configmap" 83 kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" 84 "k8s.io/kubernetes/pkg/kubelet/events" 85 "k8s.io/kubernetes/pkg/kubelet/eviction" 86 "k8s.io/kubernetes/pkg/kubelet/images" 87 "k8s.io/kubernetes/pkg/kubelet/kuberuntime" 88 "k8s.io/kubernetes/pkg/kubelet/lifecycle" 89 "k8s.io/kubernetes/pkg/kubelet/logs" 90 "k8s.io/kubernetes/pkg/kubelet/metrics" 91 "k8s.io/kubernetes/pkg/kubelet/metrics/collectors" 92 "k8s.io/kubernetes/pkg/kubelet/network/dns" 93 "k8s.io/kubernetes/pkg/kubelet/nodeshutdown" 94 oomwatcher "k8s.io/kubernetes/pkg/kubelet/oom" 95 "k8s.io/kubernetes/pkg/kubelet/pleg" 96 "k8s.io/kubernetes/pkg/kubelet/pluginmanager" 97 plugincache "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache" 98 kubepod "k8s.io/kubernetes/pkg/kubelet/pod" 99 "k8s.io/kubernetes/pkg/kubelet/preemption" 100 "k8s.io/kubernetes/pkg/kubelet/prober" 101 proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results" 102 "k8s.io/kubernetes/pkg/kubelet/runtimeclass" 103 "k8s.io/kubernetes/pkg/kubelet/secret" 104 "k8s.io/kubernetes/pkg/kubelet/server" 105 servermetrics "k8s.io/kubernetes/pkg/kubelet/server/metrics" 106 serverstats "k8s.io/kubernetes/pkg/kubelet/server/stats" 107 "k8s.io/kubernetes/pkg/kubelet/stats" 108 "k8s.io/kubernetes/pkg/kubelet/status" 109 "k8s.io/kubernetes/pkg/kubelet/sysctl" 110 "k8s.io/kubernetes/pkg/kubelet/token" 111 kubetypes "k8s.io/kubernetes/pkg/kubelet/types" 112 "k8s.io/kubernetes/pkg/kubelet/userns" 113 "k8s.io/kubernetes/pkg/kubelet/userns/inuserns" 114 "k8s.io/kubernetes/pkg/kubelet/util" 115 "k8s.io/kubernetes/pkg/kubelet/util/manager" 116 "k8s.io/kubernetes/pkg/kubelet/util/queue" 117 "k8s.io/kubernetes/pkg/kubelet/util/sliceutils" 118 "k8s.io/kubernetes/pkg/kubelet/volumemanager" 119 httpprobe "k8s.io/kubernetes/pkg/probe/http" 120 "k8s.io/kubernetes/pkg/security/apparmor" 121 "k8s.io/kubernetes/pkg/util/oom" 122 "k8s.io/kubernetes/pkg/volume" 123 "k8s.io/kubernetes/pkg/volume/csi" 124 "k8s.io/kubernetes/pkg/volume/util/hostutil" 125 "k8s.io/kubernetes/pkg/volume/util/subpath" 126 "k8s.io/kubernetes/pkg/volume/util/volumepathhandler" 127 "k8s.io/utils/clock" 128 ) 129 130 const ( 131 // Max amount of time to wait for the container runtime to come up. 132 maxWaitForContainerRuntime = 30 * time.Second 133 134 // nodeStatusUpdateRetry specifies how many times kubelet retries when posting node status failed. 135 nodeStatusUpdateRetry = 5 136 137 // nodeReadyGracePeriod is the period to allow for before fast status update is 138 // terminated and container runtime not being ready is logged without verbosity guard. 139 nodeReadyGracePeriod = 120 * time.Second 140 141 // DefaultContainerLogsDir is the location of container logs. 142 DefaultContainerLogsDir = "/var/log/containers" 143 144 // MaxContainerBackOff is the max backoff period, exported for the e2e test 145 MaxContainerBackOff = 300 * time.Second 146 147 // Period for performing global cleanup tasks. 148 housekeepingPeriod = time.Second * 2 149 150 // Duration at which housekeeping failed to satisfy the invariant that 151 // housekeeping should be fast to avoid blocking pod config (while 152 // housekeeping is running no new pods are started or deleted). 153 housekeepingWarningDuration = time.Second * 1 154 155 // Period after which the runtime cache expires - set to slightly longer than 156 // the expected length between housekeeping periods, which explicitly refreshes 157 // the cache. 158 runtimeCacheRefreshPeriod = housekeepingPeriod + housekeepingWarningDuration 159 160 // Period for performing eviction monitoring. 161 // ensure this is kept in sync with internal cadvisor housekeeping. 162 evictionMonitoringPeriod = time.Second * 10 163 164 // The path in containers' filesystems where the hosts file is mounted. 165 linuxEtcHostsPath = "/etc/hosts" 166 windowsEtcHostsPath = "C:\\Windows\\System32\\drivers\\etc\\hosts" 167 168 // Capacity of the channel for receiving pod lifecycle events. This number 169 // is a bit arbitrary and may be adjusted in the future. 170 plegChannelCapacity = 1000 171 172 // Generic PLEG relies on relisting for discovering container events. 173 // A longer period means that kubelet will take longer to detect container 174 // changes and to update pod status. On the other hand, a shorter period 175 // will cause more frequent relisting (e.g., container runtime operations), 176 // leading to higher cpu usage. 177 // Note that even though we set the period to 1s, the relisting itself can 178 // take more than 1s to finish if the container runtime responds slowly 179 // and/or when there are many container changes in one cycle. 180 genericPlegRelistPeriod = time.Second * 1 181 genericPlegRelistThreshold = time.Minute * 3 182 183 // Generic PLEG relist period and threshold when used with Evented PLEG. 184 eventedPlegRelistPeriod = time.Second * 300 185 eventedPlegRelistThreshold = time.Minute * 10 186 eventedPlegMaxStreamRetries = 5 187 188 // backOffPeriod is the period to back off when pod syncing results in an 189 // error. It is also used as the base period for the exponential backoff 190 // container restarts and image pulls. 191 backOffPeriod = time.Second * 10 192 193 // ContainerGCPeriod is the period for performing container garbage collection. 194 ContainerGCPeriod = time.Minute 195 // ImageGCPeriod is the period for performing image garbage collection. 196 ImageGCPeriod = 5 * time.Minute 197 198 // Minimum number of dead containers to keep in a pod 199 minDeadContainerInPod = 1 200 201 // nodeLeaseRenewIntervalFraction is the fraction of lease duration to renew the lease 202 nodeLeaseRenewIntervalFraction = 0.25 203 204 // instrumentationScope is the name of OpenTelemetry instrumentation scope 205 instrumentationScope = "k8s.io/kubernetes/pkg/kubelet" 206 ) 207 208 var ( 209 // ContainerLogsDir can be overwritten for testing usage 210 ContainerLogsDir = DefaultContainerLogsDir 211 etcHostsPath = getContainerEtcHostsPath() 212 ) 213 214 func getContainerEtcHostsPath() string { 215 if sysruntime.GOOS == "windows" { 216 return windowsEtcHostsPath 217 } 218 return linuxEtcHostsPath 219 } 220 221 // SyncHandler is an interface implemented by Kubelet, for testability 222 type SyncHandler interface { 223 HandlePodAdditions(pods []*v1.Pod) 224 HandlePodUpdates(pods []*v1.Pod) 225 HandlePodRemoves(pods []*v1.Pod) 226 HandlePodReconcile(pods []*v1.Pod) 227 HandlePodSyncs(pods []*v1.Pod) 228 HandlePodCleanups(ctx context.Context) error 229 } 230 231 // Option is a functional option type for Kubelet 232 type Option func(*Kubelet) 233 234 // Bootstrap is a bootstrapping interface for kubelet, targets the initialization protocol 235 type Bootstrap interface { 236 GetConfiguration() kubeletconfiginternal.KubeletConfiguration 237 BirthCry() 238 StartGarbageCollection() 239 ListenAndServe(kubeCfg *kubeletconfiginternal.KubeletConfiguration, tlsOptions *server.TLSOptions, auth server.AuthInterface, tp trace.TracerProvider) 240 ListenAndServeReadOnly(address net.IP, port uint, tp trace.TracerProvider) 241 ListenAndServePodResources() 242 Run(<-chan kubetypes.PodUpdate) 243 RunOnce(<-chan kubetypes.PodUpdate) ([]RunPodResult, error) 244 } 245 246 // Dependencies is a bin for things we might consider "injected dependencies" -- objects constructed 247 // at runtime that are necessary for running the Kubelet. This is a temporary solution for grouping 248 // these objects while we figure out a more comprehensive dependency injection story for the Kubelet. 249 type Dependencies struct { 250 Options []Option 251 252 // Injected Dependencies 253 Auth server.AuthInterface 254 CAdvisorInterface cadvisor.Interface 255 Cloud cloudprovider.Interface 256 ContainerManager cm.ContainerManager 257 EventClient v1core.EventsGetter 258 HeartbeatClient clientset.Interface 259 OnHeartbeatFailure func() 260 KubeClient clientset.Interface 261 Mounter mount.Interface 262 HostUtil hostutil.HostUtils 263 OOMAdjuster *oom.OOMAdjuster 264 OSInterface kubecontainer.OSInterface 265 PodConfig *config.PodConfig 266 ProbeManager prober.Manager 267 Recorder record.EventRecorder 268 Subpather subpath.Interface 269 TracerProvider trace.TracerProvider 270 VolumePlugins []volume.VolumePlugin 271 DynamicPluginProber volume.DynamicPluginProber 272 TLSOptions *server.TLSOptions 273 RemoteRuntimeService internalapi.RuntimeService 274 RemoteImageService internalapi.ImageManagerService 275 PodStartupLatencyTracker util.PodStartupLatencyTracker 276 NodeStartupLatencyTracker util.NodeStartupLatencyTracker 277 // remove it after cadvisor.UsingLegacyCadvisorStats dropped. 278 useLegacyCadvisorStats bool 279 } 280 281 // makePodSourceConfig creates a config.PodConfig from the given 282 // KubeletConfiguration or returns an error. 283 func makePodSourceConfig(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies, nodeName types.NodeName, nodeHasSynced func() bool) (*config.PodConfig, error) { 284 manifestURLHeader := make(http.Header) 285 if len(kubeCfg.StaticPodURLHeader) > 0 { 286 for k, v := range kubeCfg.StaticPodURLHeader { 287 for i := range v { 288 manifestURLHeader.Add(k, v[i]) 289 } 290 } 291 } 292 293 // source of all configuration 294 cfg := config.NewPodConfig(config.PodConfigNotificationIncremental, kubeDeps.Recorder, kubeDeps.PodStartupLatencyTracker) 295 296 // TODO: it needs to be replaced by a proper context in the future 297 ctx := context.TODO() 298 299 // define file config source 300 if kubeCfg.StaticPodPath != "" { 301 klog.InfoS("Adding static pod path", "path", kubeCfg.StaticPodPath) 302 config.NewSourceFile(kubeCfg.StaticPodPath, nodeName, kubeCfg.FileCheckFrequency.Duration, cfg.Channel(ctx, kubetypes.FileSource)) 303 } 304 305 // define url config source 306 if kubeCfg.StaticPodURL != "" { 307 klog.InfoS("Adding pod URL with HTTP header", "URL", kubeCfg.StaticPodURL, "header", manifestURLHeader) 308 config.NewSourceURL(kubeCfg.StaticPodURL, manifestURLHeader, nodeName, kubeCfg.HTTPCheckFrequency.Duration, cfg.Channel(ctx, kubetypes.HTTPSource)) 309 } 310 311 if kubeDeps.KubeClient != nil { 312 klog.InfoS("Adding apiserver pod source") 313 config.NewSourceApiserver(kubeDeps.KubeClient, nodeName, nodeHasSynced, cfg.Channel(ctx, kubetypes.ApiserverSource)) 314 } 315 return cfg, nil 316 } 317 318 // PreInitRuntimeService will init runtime service before RunKubelet. 319 func PreInitRuntimeService(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies) error { 320 remoteImageEndpoint := kubeCfg.ImageServiceEndpoint 321 if remoteImageEndpoint == "" && kubeCfg.ContainerRuntimeEndpoint != "" { 322 remoteImageEndpoint = kubeCfg.ContainerRuntimeEndpoint 323 } 324 var err error 325 326 var tp trace.TracerProvider 327 if utilfeature.DefaultFeatureGate.Enabled(features.KubeletTracing) { 328 tp = kubeDeps.TracerProvider 329 } 330 331 logger := klog.Background() 332 if kubeDeps.RemoteRuntimeService, err = remote.NewRemoteRuntimeService(kubeCfg.ContainerRuntimeEndpoint, kubeCfg.RuntimeRequestTimeout.Duration, tp, &logger); err != nil { 333 return err 334 } 335 if kubeDeps.RemoteImageService, err = remote.NewRemoteImageService(remoteImageEndpoint, kubeCfg.RuntimeRequestTimeout.Duration, tp, &logger); err != nil { 336 return err 337 } 338 339 kubeDeps.useLegacyCadvisorStats = cadvisor.UsingLegacyCadvisorStats(kubeCfg.ContainerRuntimeEndpoint) 340 341 return nil 342 } 343 344 // NewMainKubelet instantiates a new Kubelet object along with all the required internal modules. 345 // No initialization of Kubelet and its modules should happen here. 346 func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, 347 kubeDeps *Dependencies, 348 crOptions *config.ContainerRuntimeOptions, 349 hostname string, 350 hostnameOverridden bool, 351 nodeName types.NodeName, 352 nodeIPs []net.IP, 353 providerID string, 354 cloudProvider string, 355 certDirectory string, 356 rootDirectory string, 357 podLogsDirectory string, 358 imageCredentialProviderConfigFile string, 359 imageCredentialProviderBinDir string, 360 registerNode bool, 361 registerWithTaints []v1.Taint, 362 allowedUnsafeSysctls []string, 363 experimentalMounterPath string, 364 kernelMemcgNotification bool, 365 experimentalNodeAllocatableIgnoreEvictionThreshold bool, 366 minimumGCAge metav1.Duration, 367 maxPerPodContainerCount int32, 368 maxContainerCount int32, 369 registerSchedulable bool, 370 nodeLabels map[string]string, 371 nodeStatusMaxImages int32, 372 seccompDefault bool, 373 ) (*Kubelet, error) { 374 ctx := context.Background() 375 logger := klog.TODO() 376 377 if rootDirectory == "" { 378 return nil, fmt.Errorf("invalid root directory %q", rootDirectory) 379 } 380 if podLogsDirectory == "" { 381 return nil, errors.New("pod logs root directory is empty") 382 } 383 if kubeCfg.SyncFrequency.Duration <= 0 { 384 return nil, fmt.Errorf("invalid sync frequency %d", kubeCfg.SyncFrequency.Duration) 385 } 386 387 if utilfeature.DefaultFeatureGate.Enabled(features.DisableCloudProviders) && cloudprovider.IsDeprecatedInternal(cloudProvider) { 388 cloudprovider.DisableWarningForProvider(cloudProvider) 389 return nil, fmt.Errorf("cloud provider %q was specified, but built-in cloud providers are disabled. Please set --cloud-provider=external and migrate to an external cloud provider", cloudProvider) 390 } 391 392 var nodeHasSynced cache.InformerSynced 393 var nodeLister corelisters.NodeLister 394 395 // If kubeClient == nil, we are running in standalone mode (i.e. no API servers) 396 // If not nil, we are running as part of a cluster and should sync w/API 397 if kubeDeps.KubeClient != nil { 398 kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0, informers.WithTweakListOptions(func(options *metav1.ListOptions) { 399 options.FieldSelector = fields.Set{metav1.ObjectNameField: string(nodeName)}.String() 400 })) 401 nodeLister = kubeInformers.Core().V1().Nodes().Lister() 402 nodeHasSynced = func() bool { 403 return kubeInformers.Core().V1().Nodes().Informer().HasSynced() 404 } 405 kubeInformers.Start(wait.NeverStop) 406 klog.InfoS("Attempting to sync node with API server") 407 } else { 408 // we don't have a client to sync! 409 nodeIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) 410 nodeLister = corelisters.NewNodeLister(nodeIndexer) 411 nodeHasSynced = func() bool { return true } 412 klog.InfoS("Kubelet is running in standalone mode, will skip API server sync") 413 } 414 415 if kubeDeps.PodConfig == nil { 416 var err error 417 kubeDeps.PodConfig, err = makePodSourceConfig(kubeCfg, kubeDeps, nodeName, nodeHasSynced) 418 if err != nil { 419 return nil, err 420 } 421 } 422 423 containerGCPolicy := kubecontainer.GCPolicy{ 424 MinAge: minimumGCAge.Duration, 425 MaxPerPodContainer: int(maxPerPodContainerCount), 426 MaxContainers: int(maxContainerCount), 427 } 428 429 daemonEndpoints := &v1.NodeDaemonEndpoints{ 430 KubeletEndpoint: v1.DaemonEndpoint{Port: kubeCfg.Port}, 431 } 432 433 imageGCPolicy := images.ImageGCPolicy{ 434 MinAge: kubeCfg.ImageMinimumGCAge.Duration, 435 HighThresholdPercent: int(kubeCfg.ImageGCHighThresholdPercent), 436 LowThresholdPercent: int(kubeCfg.ImageGCLowThresholdPercent), 437 } 438 439 if utilfeature.DefaultFeatureGate.Enabled(features.ImageMaximumGCAge) { 440 imageGCPolicy.MaxAge = kubeCfg.ImageMaximumGCAge.Duration 441 } else if kubeCfg.ImageMaximumGCAge.Duration != 0 { 442 klog.InfoS("ImageMaximumGCAge flag enabled, but corresponding feature gate is not enabled. Ignoring flag.") 443 } 444 445 enforceNodeAllocatable := kubeCfg.EnforceNodeAllocatable 446 if experimentalNodeAllocatableIgnoreEvictionThreshold { 447 // Do not provide kubeCfg.EnforceNodeAllocatable to eviction threshold parsing if we are not enforcing Evictions 448 enforceNodeAllocatable = []string{} 449 } 450 thresholds, err := eviction.ParseThresholdConfig(enforceNodeAllocatable, kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim) 451 if err != nil { 452 return nil, err 453 } 454 evictionConfig := eviction.Config{ 455 PressureTransitionPeriod: kubeCfg.EvictionPressureTransitionPeriod.Duration, 456 MaxPodGracePeriodSeconds: int64(kubeCfg.EvictionMaxPodGracePeriod), 457 Thresholds: thresholds, 458 KernelMemcgNotification: kernelMemcgNotification, 459 PodCgroupRoot: kubeDeps.ContainerManager.GetPodCgroupRoot(), 460 } 461 462 var serviceLister corelisters.ServiceLister 463 var serviceHasSynced cache.InformerSynced 464 if kubeDeps.KubeClient != nil { 465 // don't watch headless services, they are not needed since this informer is only used to create the environment variables for pods. 466 // See https://issues.k8s.io/122394 467 kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0, informers.WithTweakListOptions(func(options *metav1.ListOptions) { 468 options.FieldSelector = fields.OneTermNotEqualSelector("spec.clusterIP", v1.ClusterIPNone).String() 469 })) 470 serviceLister = kubeInformers.Core().V1().Services().Lister() 471 serviceHasSynced = kubeInformers.Core().V1().Services().Informer().HasSynced 472 kubeInformers.Start(wait.NeverStop) 473 } else { 474 serviceIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) 475 serviceLister = corelisters.NewServiceLister(serviceIndexer) 476 serviceHasSynced = func() bool { return true } 477 } 478 479 // construct a node reference used for events 480 nodeRef := &v1.ObjectReference{ 481 Kind: "Node", 482 Name: string(nodeName), 483 UID: types.UID(nodeName), 484 Namespace: "", 485 } 486 487 oomWatcher, err := oomwatcher.NewWatcher(kubeDeps.Recorder) 488 if err != nil { 489 if inuserns.RunningInUserNS() { 490 if utilfeature.DefaultFeatureGate.Enabled(features.KubeletInUserNamespace) { 491 // oomwatcher.NewWatcher returns "open /dev/kmsg: operation not permitted" error, 492 // when running in a user namespace with sysctl value `kernel.dmesg_restrict=1`. 493 klog.V(2).InfoS("Failed to create an oomWatcher (running in UserNS, ignoring)", "err", err) 494 oomWatcher = nil 495 } else { 496 klog.ErrorS(err, "Failed to create an oomWatcher (running in UserNS, Hint: enable KubeletInUserNamespace feature flag to ignore the error)") 497 return nil, err 498 } 499 } else { 500 return nil, err 501 } 502 } 503 504 clusterDNS := make([]net.IP, 0, len(kubeCfg.ClusterDNS)) 505 for _, ipEntry := range kubeCfg.ClusterDNS { 506 ip := netutils.ParseIPSloppy(ipEntry) 507 if ip == nil { 508 klog.InfoS("Invalid clusterDNS IP", "IP", ipEntry) 509 } else { 510 clusterDNS = append(clusterDNS, ip) 511 } 512 } 513 514 // A TLS transport is needed to make HTTPS-based container lifecycle requests, 515 // but we do not have the information necessary to do TLS verification. 516 // 517 // This client must not be modified to include credentials, because it is 518 // critical that credentials not leak from the client to arbitrary hosts. 519 insecureContainerLifecycleHTTPClient := &http.Client{ 520 Transport: &http.Transport{ 521 TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, 522 }, 523 CheckRedirect: httpprobe.RedirectChecker(false), 524 } 525 526 tracer := kubeDeps.TracerProvider.Tracer(instrumentationScope) 527 528 klet := &Kubelet{ 529 hostname: hostname, 530 hostnameOverridden: hostnameOverridden, 531 nodeName: nodeName, 532 kubeClient: kubeDeps.KubeClient, 533 heartbeatClient: kubeDeps.HeartbeatClient, 534 onRepeatedHeartbeatFailure: kubeDeps.OnHeartbeatFailure, 535 rootDirectory: filepath.Clean(rootDirectory), 536 podLogsDirectory: podLogsDirectory, 537 resyncInterval: kubeCfg.SyncFrequency.Duration, 538 sourcesReady: config.NewSourcesReady(kubeDeps.PodConfig.SeenAllSources), 539 registerNode: registerNode, 540 registerWithTaints: registerWithTaints, 541 registerSchedulable: registerSchedulable, 542 dnsConfigurer: dns.NewConfigurer(kubeDeps.Recorder, nodeRef, nodeIPs, clusterDNS, kubeCfg.ClusterDomain, kubeCfg.ResolverConfig), 543 serviceLister: serviceLister, 544 serviceHasSynced: serviceHasSynced, 545 nodeLister: nodeLister, 546 nodeHasSynced: nodeHasSynced, 547 streamingConnectionIdleTimeout: kubeCfg.StreamingConnectionIdleTimeout.Duration, 548 recorder: kubeDeps.Recorder, 549 cadvisor: kubeDeps.CAdvisorInterface, 550 cloud: kubeDeps.Cloud, 551 externalCloudProvider: cloudprovider.IsExternal(cloudProvider), 552 providerID: providerID, 553 nodeRef: nodeRef, 554 nodeLabels: nodeLabels, 555 nodeStatusUpdateFrequency: kubeCfg.NodeStatusUpdateFrequency.Duration, 556 nodeStatusReportFrequency: kubeCfg.NodeStatusReportFrequency.Duration, 557 os: kubeDeps.OSInterface, 558 oomWatcher: oomWatcher, 559 cgroupsPerQOS: kubeCfg.CgroupsPerQOS, 560 cgroupRoot: kubeCfg.CgroupRoot, 561 mounter: kubeDeps.Mounter, 562 hostutil: kubeDeps.HostUtil, 563 subpather: kubeDeps.Subpather, 564 maxPods: int(kubeCfg.MaxPods), 565 podsPerCore: int(kubeCfg.PodsPerCore), 566 syncLoopMonitor: atomic.Value{}, 567 daemonEndpoints: daemonEndpoints, 568 containerManager: kubeDeps.ContainerManager, 569 nodeIPs: nodeIPs, 570 nodeIPValidator: validateNodeIP, 571 clock: clock.RealClock{}, 572 enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach, 573 makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains, 574 nodeStatusMaxImages: nodeStatusMaxImages, 575 tracer: tracer, 576 nodeStartupLatencyTracker: kubeDeps.NodeStartupLatencyTracker, 577 } 578 579 if klet.cloud != nil { 580 klet.cloudResourceSyncManager = cloudresource.NewSyncManager(klet.cloud, nodeName, klet.nodeStatusUpdateFrequency) 581 } 582 583 var secretManager secret.Manager 584 var configMapManager configmap.Manager 585 if klet.kubeClient != nil { 586 switch kubeCfg.ConfigMapAndSecretChangeDetectionStrategy { 587 case kubeletconfiginternal.WatchChangeDetectionStrategy: 588 secretManager = secret.NewWatchingSecretManager(klet.kubeClient, klet.resyncInterval) 589 configMapManager = configmap.NewWatchingConfigMapManager(klet.kubeClient, klet.resyncInterval) 590 case kubeletconfiginternal.TTLCacheChangeDetectionStrategy: 591 secretManager = secret.NewCachingSecretManager( 592 klet.kubeClient, manager.GetObjectTTLFromNodeFunc(klet.GetNode)) 593 configMapManager = configmap.NewCachingConfigMapManager( 594 klet.kubeClient, manager.GetObjectTTLFromNodeFunc(klet.GetNode)) 595 case kubeletconfiginternal.GetChangeDetectionStrategy: 596 secretManager = secret.NewSimpleSecretManager(klet.kubeClient) 597 configMapManager = configmap.NewSimpleConfigMapManager(klet.kubeClient) 598 default: 599 return nil, fmt.Errorf("unknown configmap and secret manager mode: %v", kubeCfg.ConfigMapAndSecretChangeDetectionStrategy) 600 } 601 602 klet.secretManager = secretManager 603 klet.configMapManager = configMapManager 604 } 605 606 machineInfo, err := klet.cadvisor.MachineInfo() 607 if err != nil { 608 return nil, err 609 } 610 // Avoid collector collects it as a timestamped metric 611 // See PR #95210 and #97006 for more details. 612 machineInfo.Timestamp = time.Time{} 613 klet.setCachedMachineInfo(machineInfo) 614 615 imageBackOff := flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff) 616 617 klet.livenessManager = proberesults.NewManager() 618 klet.readinessManager = proberesults.NewManager() 619 klet.startupManager = proberesults.NewManager() 620 klet.podCache = kubecontainer.NewCache() 621 622 klet.mirrorPodClient = kubepod.NewBasicMirrorClient(klet.kubeClient, string(nodeName), nodeLister) 623 klet.podManager = kubepod.NewBasicPodManager() 624 625 klet.statusManager = status.NewManager(klet.kubeClient, klet.podManager, klet, kubeDeps.PodStartupLatencyTracker, klet.getRootDir()) 626 627 klet.resourceAnalyzer = serverstats.NewResourceAnalyzer(klet, kubeCfg.VolumeStatsAggPeriod.Duration, kubeDeps.Recorder) 628 629 klet.runtimeService = kubeDeps.RemoteRuntimeService 630 631 if kubeDeps.KubeClient != nil { 632 klet.runtimeClassManager = runtimeclass.NewManager(kubeDeps.KubeClient) 633 } 634 635 // setup containerLogManager for CRI container runtime 636 containerLogManager, err := logs.NewContainerLogManager( 637 klet.runtimeService, 638 kubeDeps.OSInterface, 639 kubeCfg.ContainerLogMaxSize, 640 int(kubeCfg.ContainerLogMaxFiles), 641 int(kubeCfg.ContainerLogMaxWorkers), 642 kubeCfg.ContainerLogMonitorInterval, 643 ) 644 if err != nil { 645 return nil, fmt.Errorf("failed to initialize container log manager: %v", err) 646 } 647 klet.containerLogManager = containerLogManager 648 649 klet.reasonCache = NewReasonCache() 650 klet.workQueue = queue.NewBasicWorkQueue(klet.clock) 651 klet.podWorkers = newPodWorkers( 652 klet, 653 kubeDeps.Recorder, 654 klet.workQueue, 655 klet.resyncInterval, 656 backOffPeriod, 657 klet.podCache, 658 ) 659 660 runtime, err := kuberuntime.NewKubeGenericRuntimeManager( 661 kubecontainer.FilterEventRecorder(kubeDeps.Recorder), 662 klet.livenessManager, 663 klet.readinessManager, 664 klet.startupManager, 665 rootDirectory, 666 podLogsDirectory, 667 machineInfo, 668 klet.podWorkers, 669 kubeDeps.OSInterface, 670 klet, 671 insecureContainerLifecycleHTTPClient, 672 imageBackOff, 673 kubeCfg.SerializeImagePulls, 674 kubeCfg.MaxParallelImagePulls, 675 float32(kubeCfg.RegistryPullQPS), 676 int(kubeCfg.RegistryBurst), 677 imageCredentialProviderConfigFile, 678 imageCredentialProviderBinDir, 679 kubeCfg.CPUCFSQuota, 680 kubeCfg.CPUCFSQuotaPeriod, 681 kubeDeps.RemoteRuntimeService, 682 kubeDeps.RemoteImageService, 683 kubeDeps.ContainerManager, 684 klet.containerLogManager, 685 klet.runtimeClassManager, 686 seccompDefault, 687 kubeCfg.MemorySwap.SwapBehavior, 688 kubeDeps.ContainerManager.GetNodeAllocatableAbsolute, 689 *kubeCfg.MemoryThrottlingFactor, 690 kubeDeps.PodStartupLatencyTracker, 691 kubeDeps.TracerProvider, 692 ) 693 if err != nil { 694 return nil, err 695 } 696 klet.containerRuntime = runtime 697 klet.streamingRuntime = runtime 698 klet.runner = runtime 699 700 runtimeCache, err := kubecontainer.NewRuntimeCache(klet.containerRuntime, runtimeCacheRefreshPeriod) 701 if err != nil { 702 return nil, err 703 } 704 klet.runtimeCache = runtimeCache 705 706 // common provider to get host file system usage associated with a pod managed by kubelet 707 hostStatsProvider := stats.NewHostStatsProvider(kubecontainer.RealOS{}, func(podUID types.UID) string { 708 return getEtcHostsPath(klet.getPodDir(podUID)) 709 }, podLogsDirectory) 710 if kubeDeps.useLegacyCadvisorStats { 711 klet.StatsProvider = stats.NewCadvisorStatsProvider( 712 klet.cadvisor, 713 klet.resourceAnalyzer, 714 klet.podManager, 715 klet.runtimeCache, 716 klet.containerRuntime, 717 klet.statusManager, 718 hostStatsProvider) 719 } else { 720 klet.StatsProvider = stats.NewCRIStatsProvider( 721 klet.cadvisor, 722 klet.resourceAnalyzer, 723 klet.podManager, 724 klet.runtimeCache, 725 kubeDeps.RemoteRuntimeService, 726 kubeDeps.RemoteImageService, 727 hostStatsProvider, 728 utilfeature.DefaultFeatureGate.Enabled(features.PodAndContainerStatsFromCRI)) 729 } 730 731 eventChannel := make(chan *pleg.PodLifecycleEvent, plegChannelCapacity) 732 733 if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) { 734 // adjust Generic PLEG relisting period and threshold to higher value when Evented PLEG is turned on 735 genericRelistDuration := &pleg.RelistDuration{ 736 RelistPeriod: eventedPlegRelistPeriod, 737 RelistThreshold: eventedPlegRelistThreshold, 738 } 739 klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{}) 740 // In case Evented PLEG has to fall back on Generic PLEG due to an error, 741 // Evented PLEG should be able to reset the Generic PLEG relisting duration 742 // to the default value. 743 eventedRelistDuration := &pleg.RelistDuration{ 744 RelistPeriod: genericPlegRelistPeriod, 745 RelistThreshold: genericPlegRelistThreshold, 746 } 747 klet.eventedPleg, err = pleg.NewEventedPLEG(klet.containerRuntime, klet.runtimeService, eventChannel, 748 klet.podCache, klet.pleg, eventedPlegMaxStreamRetries, eventedRelistDuration, clock.RealClock{}) 749 if err != nil { 750 return nil, err 751 } 752 } else { 753 genericRelistDuration := &pleg.RelistDuration{ 754 RelistPeriod: genericPlegRelistPeriod, 755 RelistThreshold: genericPlegRelistThreshold, 756 } 757 klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{}) 758 } 759 760 klet.runtimeState = newRuntimeState(maxWaitForContainerRuntime) 761 klet.runtimeState.addHealthCheck("PLEG", klet.pleg.Healthy) 762 if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) { 763 klet.runtimeState.addHealthCheck("EventedPLEG", klet.eventedPleg.Healthy) 764 } 765 if _, err := klet.updatePodCIDR(ctx, kubeCfg.PodCIDR); err != nil { 766 klog.ErrorS(err, "Pod CIDR update failed") 767 } 768 769 // setup containerGC 770 containerGC, err := kubecontainer.NewContainerGC(klet.containerRuntime, containerGCPolicy, klet.sourcesReady) 771 if err != nil { 772 return nil, err 773 } 774 klet.containerGC = containerGC 775 klet.containerDeletor = newPodContainerDeletor(klet.containerRuntime, max(containerGCPolicy.MaxPerPodContainer, minDeadContainerInPod)) 776 777 // setup imageManager 778 imageManager, err := images.NewImageGCManager(klet.containerRuntime, klet.StatsProvider, kubeDeps.Recorder, nodeRef, imageGCPolicy, kubeDeps.TracerProvider) 779 if err != nil { 780 return nil, fmt.Errorf("failed to initialize image manager: %v", err) 781 } 782 klet.imageManager = imageManager 783 784 if kubeCfg.ServerTLSBootstrap && kubeDeps.TLSOptions != nil && utilfeature.DefaultFeatureGate.Enabled(features.RotateKubeletServerCertificate) { 785 klet.serverCertificateManager, err = kubeletcertificate.NewKubeletServerCertificateManager(klet.kubeClient, kubeCfg, klet.nodeName, klet.getLastObservedNodeAddresses, certDirectory) 786 if err != nil { 787 return nil, fmt.Errorf("failed to initialize certificate manager: %v", err) 788 } 789 kubeDeps.TLSOptions.Config.GetCertificate = func(*tls.ClientHelloInfo) (*tls.Certificate, error) { 790 cert := klet.serverCertificateManager.Current() 791 if cert == nil { 792 return nil, fmt.Errorf("no serving certificate available for the kubelet") 793 } 794 return cert, nil 795 } 796 } 797 798 if kubeDeps.ProbeManager != nil { 799 klet.probeManager = kubeDeps.ProbeManager 800 } else { 801 klet.probeManager = prober.NewManager( 802 klet.statusManager, 803 klet.livenessManager, 804 klet.readinessManager, 805 klet.startupManager, 806 klet.runner, 807 kubeDeps.Recorder) 808 } 809 810 tokenManager := token.NewManager(kubeDeps.KubeClient) 811 812 var clusterTrustBundleManager clustertrustbundle.Manager 813 if kubeDeps.KubeClient != nil && utilfeature.DefaultFeatureGate.Enabled(features.ClusterTrustBundleProjection) { 814 kubeInformers := informers.NewSharedInformerFactoryWithOptions(kubeDeps.KubeClient, 0) 815 clusterTrustBundleManager, err = clustertrustbundle.NewInformerManager(kubeInformers.Certificates().V1alpha1().ClusterTrustBundles(), 2*int(kubeCfg.MaxPods), 5*time.Minute) 816 if err != nil { 817 return nil, fmt.Errorf("while starting informer-based ClusterTrustBundle manager: %w", err) 818 } 819 kubeInformers.Start(wait.NeverStop) 820 klog.InfoS("Started ClusterTrustBundle informer") 821 } else { 822 // In static kubelet mode, use a no-op manager. 823 clusterTrustBundleManager = &clustertrustbundle.NoopManager{} 824 klog.InfoS("Not starting ClusterTrustBundle informer because we are in static kubelet mode") 825 } 826 827 // NewInitializedVolumePluginMgr initializes some storageErrors on the Kubelet runtimeState (in csi_plugin.go init) 828 // which affects node ready status. This function must be called before Kubelet is initialized so that the Node 829 // ReadyState is accurate with the storage state. 830 klet.volumePluginMgr, err = 831 NewInitializedVolumePluginMgr(klet, secretManager, configMapManager, tokenManager, clusterTrustBundleManager, kubeDeps.VolumePlugins, kubeDeps.DynamicPluginProber) 832 if err != nil { 833 return nil, err 834 } 835 klet.pluginManager = pluginmanager.NewPluginManager( 836 klet.getPluginsRegistrationDir(), /* sockDir */ 837 kubeDeps.Recorder, 838 ) 839 840 // If the experimentalMounterPathFlag is set, we do not want to 841 // check node capabilities since the mount path is not the default 842 if len(experimentalMounterPath) != 0 { 843 // Replace the nameserver in containerized-mounter's rootfs/etc/resolv.conf with kubelet.ClusterDNS 844 // so that service name could be resolved 845 klet.dnsConfigurer.SetupDNSinContainerizedMounter(experimentalMounterPath) 846 } 847 848 // setup volumeManager 849 klet.volumeManager = volumemanager.NewVolumeManager( 850 kubeCfg.EnableControllerAttachDetach, 851 nodeName, 852 klet.podManager, 853 klet.podWorkers, 854 klet.kubeClient, 855 klet.volumePluginMgr, 856 klet.containerRuntime, 857 kubeDeps.Mounter, 858 kubeDeps.HostUtil, 859 klet.getPodsDir(), 860 kubeDeps.Recorder, 861 volumepathhandler.NewBlockVolumePathHandler()) 862 863 klet.backOff = flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff) 864 865 // setup eviction manager 866 evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, 867 killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock, kubeCfg.LocalStorageCapacityIsolation) 868 869 klet.evictionManager = evictionManager 870 klet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler) 871 872 // Safe, allowed sysctls can always be used as unsafe sysctls in the spec. 873 // Hence, we concatenate those two lists. 874 safeAndUnsafeSysctls := append(sysctl.SafeSysctlAllowlist(), allowedUnsafeSysctls...) 875 sysctlsAllowlist, err := sysctl.NewAllowlist(safeAndUnsafeSysctls) 876 if err != nil { 877 return nil, err 878 } 879 klet.admitHandlers.AddPodAdmitHandler(sysctlsAllowlist) 880 881 // enable active deadline handler 882 activeDeadlineHandler, err := newActiveDeadlineHandler(klet.statusManager, kubeDeps.Recorder, klet.clock) 883 if err != nil { 884 return nil, err 885 } 886 klet.AddPodSyncLoopHandler(activeDeadlineHandler) 887 klet.AddPodSyncHandler(activeDeadlineHandler) 888 889 klet.admitHandlers.AddPodAdmitHandler(klet.containerManager.GetAllocateResourcesPodAdmitHandler()) 890 891 criticalPodAdmissionHandler := preemption.NewCriticalPodAdmissionHandler(klet.GetActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeDeps.Recorder) 892 klet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(klet.getNodeAnyWay, criticalPodAdmissionHandler, klet.containerManager.UpdatePluginResources)) 893 // apply functional Option's 894 for _, opt := range kubeDeps.Options { 895 opt(klet) 896 } 897 898 if sysruntime.GOOS == "linux" { 899 // AppArmor is a Linux kernel security module and it does not support other operating systems. 900 klet.appArmorValidator = apparmor.NewValidator() 901 klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator)) 902 } 903 904 leaseDuration := time.Duration(kubeCfg.NodeLeaseDurationSeconds) * time.Second 905 renewInterval := time.Duration(float64(leaseDuration) * nodeLeaseRenewIntervalFraction) 906 klet.nodeLeaseController = lease.NewController( 907 klet.clock, 908 klet.heartbeatClient, 909 string(klet.nodeName), 910 kubeCfg.NodeLeaseDurationSeconds, 911 klet.onRepeatedHeartbeatFailure, 912 renewInterval, 913 string(klet.nodeName), 914 v1.NamespaceNodeLease, 915 util.SetNodeOwnerFunc(klet.heartbeatClient, string(klet.nodeName))) 916 917 // setup node shutdown manager 918 shutdownManager, shutdownAdmitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{ 919 Logger: logger, 920 ProbeManager: klet.probeManager, 921 Recorder: kubeDeps.Recorder, 922 NodeRef: nodeRef, 923 GetPodsFunc: klet.GetActivePods, 924 KillPodFunc: killPodNow(klet.podWorkers, kubeDeps.Recorder), 925 SyncNodeStatusFunc: klet.syncNodeStatus, 926 ShutdownGracePeriodRequested: kubeCfg.ShutdownGracePeriod.Duration, 927 ShutdownGracePeriodCriticalPods: kubeCfg.ShutdownGracePeriodCriticalPods.Duration, 928 ShutdownGracePeriodByPodPriority: kubeCfg.ShutdownGracePeriodByPodPriority, 929 StateDirectory: rootDirectory, 930 }) 931 klet.shutdownManager = shutdownManager 932 klet.usernsManager, err = userns.MakeUserNsManager(klet) 933 if err != nil { 934 return nil, err 935 } 936 klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler) 937 938 // Finally, put the most recent version of the config on the Kubelet, so 939 // people can see how it was configured. 940 klet.kubeletConfiguration = *kubeCfg 941 942 // Generating the status funcs should be the last thing we do, 943 // since this relies on the rest of the Kubelet having been constructed. 944 klet.setNodeStatusFuncs = klet.defaultNodeStatusFuncs() 945 946 return klet, nil 947 } 948 949 type serviceLister interface { 950 List(labels.Selector) ([]*v1.Service, error) 951 } 952 953 // Kubelet is the main kubelet implementation. 954 type Kubelet struct { 955 kubeletConfiguration kubeletconfiginternal.KubeletConfiguration 956 957 // hostname is the hostname the kubelet detected or was given via flag/config 958 hostname string 959 // hostnameOverridden indicates the hostname was overridden via flag/config 960 hostnameOverridden bool 961 962 nodeName types.NodeName 963 runtimeCache kubecontainer.RuntimeCache 964 kubeClient clientset.Interface 965 heartbeatClient clientset.Interface 966 // mirrorPodClient is used to create and delete mirror pods in the API for static 967 // pods. 968 mirrorPodClient kubepod.MirrorClient 969 970 rootDirectory string 971 podLogsDirectory string 972 973 lastObservedNodeAddressesMux sync.RWMutex 974 lastObservedNodeAddresses []v1.NodeAddress 975 976 // onRepeatedHeartbeatFailure is called when a heartbeat operation fails more than once. optional. 977 onRepeatedHeartbeatFailure func() 978 979 // podManager stores the desired set of admitted pods and mirror pods that the kubelet should be 980 // running. The actual set of running pods is stored on the podWorkers. The manager is populated 981 // by the kubelet config loops which abstracts receiving configuration from many different sources 982 // (api for regular pods, local filesystem or http for static pods). The manager may be consulted 983 // by other components that need to see the set of desired pods. Note that not all desired pods are 984 // running, and not all running pods are in the podManager - for instance, force deleting a pod 985 // from the apiserver will remove it from the podManager, but the pod may still be terminating and 986 // tracked by the podWorkers. Components that need to know the actual consumed resources of the 987 // node or are driven by podWorkers and the sync*Pod methods (status, volume, stats) should also 988 // consult the podWorkers when reconciling. 989 // 990 // TODO: review all kubelet components that need the actual set of pods (vs the desired set) 991 // and update them to use podWorkers instead of podManager. This may introduce latency in some 992 // methods, but avoids race conditions and correctly accounts for terminating pods that have 993 // been force deleted or static pods that have been updated. 994 // https://github.com/kubernetes/kubernetes/issues/116970 995 podManager kubepod.Manager 996 997 // podWorkers is responsible for driving the lifecycle state machine of each pod. The worker is 998 // notified of config changes, updates, periodic reconciliation, container runtime updates, and 999 // evictions of all desired pods and will invoke reconciliation methods per pod in separate 1000 // goroutines. The podWorkers are authoritative in the kubelet for what pods are actually being 1001 // run and their current state: 1002 // 1003 // * syncing: pod should be running (syncPod) 1004 // * terminating: pod should be stopped (syncTerminatingPod) 1005 // * terminated: pod should have all resources cleaned up (syncTerminatedPod) 1006 // 1007 // and invoke the handler methods that correspond to each state. Components within the 1008 // kubelet that need to know the phase of the pod in order to correctly set up or tear down 1009 // resources must consult the podWorkers. 1010 // 1011 // Once a pod has been accepted by the pod workers, no other pod with that same UID (and 1012 // name+namespace, for static pods) will be started until the first pod has fully terminated 1013 // and been cleaned up by SyncKnownPods. This means a pod may be desired (in API), admitted 1014 // (in pod manager), and requested (by invoking UpdatePod) but not start for an arbitrarily 1015 // long interval because a prior pod is still terminating. 1016 // 1017 // As an event-driven (by UpdatePod) controller, the podWorkers must periodically be resynced 1018 // by the kubelet invoking SyncKnownPods with the desired state (admitted pods in podManager). 1019 // Since the podManager may be unaware of some running pods due to force deletion, the 1020 // podWorkers are responsible for triggering a sync of pods that are no longer desired but 1021 // must still run to completion. 1022 podWorkers PodWorkers 1023 1024 // evictionManager observes the state of the node for situations that could impact node stability 1025 // and evicts pods (sets to phase Failed with reason Evicted) to reduce resource pressure. The 1026 // eviction manager acts on the actual state of the node and considers the podWorker to be 1027 // authoritative. 1028 evictionManager eviction.Manager 1029 1030 // probeManager tracks the set of running pods and ensures any user-defined periodic checks are 1031 // run to introspect the state of each pod. The probe manager acts on the actual state of the node 1032 // and is notified of pods by the podWorker. The probe manager is the authoritative source of the 1033 // most recent probe status and is responsible for notifying the status manager, which 1034 // synthesizes them into the overall pod status. 1035 probeManager prober.Manager 1036 1037 // secretManager caches the set of secrets used by running pods on this node. The podWorkers 1038 // notify the secretManager when pods are started and terminated, and the secretManager must 1039 // then keep the needed secrets up-to-date as they change. 1040 secretManager secret.Manager 1041 1042 // configMapManager caches the set of config maps used by running pods on this node. The 1043 // podWorkers notify the configMapManager when pods are started and terminated, and the 1044 // configMapManager must then keep the needed config maps up-to-date as they change. 1045 configMapManager configmap.Manager 1046 1047 // volumeManager observes the set of running pods and is responsible for attaching, mounting, 1048 // unmounting, and detaching as those pods move through their lifecycle. It periodically 1049 // synchronizes the set of known volumes to the set of actually desired volumes and cleans up 1050 // any orphaned volumes. The volume manager considers the podWorker to be authoritative for 1051 // which pods are running. 1052 volumeManager volumemanager.VolumeManager 1053 1054 // statusManager receives updated pod status updates from the podWorker and updates the API 1055 // status of those pods to match. The statusManager is authoritative for the synthesized 1056 // status of the pod from the kubelet's perspective (other components own the individual 1057 // elements of status) and should be consulted by components in preference to assembling 1058 // that status themselves. Note that the status manager is downstream of the pod worker 1059 // and components that need to check whether a pod is still running should instead directly 1060 // consult the pod worker. 1061 statusManager status.Manager 1062 1063 // resyncInterval is the interval between periodic full reconciliations of 1064 // pods on this node. 1065 resyncInterval time.Duration 1066 1067 // sourcesReady records the sources seen by the kubelet, it is thread-safe. 1068 sourcesReady config.SourcesReady 1069 1070 // Optional, defaults to /logs/ from /var/log 1071 logServer http.Handler 1072 // Optional, defaults to simple Docker implementation 1073 runner kubecontainer.CommandRunner 1074 1075 // cAdvisor used for container information. 1076 cadvisor cadvisor.Interface 1077 1078 // Set to true to have the node register itself with the apiserver. 1079 registerNode bool 1080 // List of taints to add to a node object when the kubelet registers itself. 1081 registerWithTaints []v1.Taint 1082 // Set to true to have the node register itself as schedulable. 1083 registerSchedulable bool 1084 // for internal book keeping; access only from within registerWithApiserver 1085 registrationCompleted bool 1086 1087 // dnsConfigurer is used for setting up DNS resolver configuration when launching pods. 1088 dnsConfigurer *dns.Configurer 1089 1090 // serviceLister knows how to list services 1091 serviceLister serviceLister 1092 // serviceHasSynced indicates whether services have been sync'd at least once. 1093 // Check this before trusting a response from the lister. 1094 serviceHasSynced cache.InformerSynced 1095 // nodeLister knows how to list nodes 1096 nodeLister corelisters.NodeLister 1097 // nodeHasSynced indicates whether nodes have been sync'd at least once. 1098 // Check this before trusting a response from the node lister. 1099 nodeHasSynced cache.InformerSynced 1100 // a list of node labels to register 1101 nodeLabels map[string]string 1102 1103 // Last timestamp when runtime responded on ping. 1104 // Mutex is used to protect this value. 1105 runtimeState *runtimeState 1106 1107 // Volume plugins. 1108 volumePluginMgr *volume.VolumePluginMgr 1109 1110 // Manages container health check results. 1111 livenessManager proberesults.Manager 1112 readinessManager proberesults.Manager 1113 startupManager proberesults.Manager 1114 1115 // How long to keep idle streaming command execution/port forwarding 1116 // connections open before terminating them 1117 streamingConnectionIdleTimeout time.Duration 1118 1119 // The EventRecorder to use 1120 recorder record.EventRecorder 1121 1122 // Policy for handling garbage collection of dead containers. 1123 containerGC kubecontainer.GC 1124 1125 // Manager for image garbage collection. 1126 imageManager images.ImageGCManager 1127 1128 // Manager for container logs. 1129 containerLogManager logs.ContainerLogManager 1130 1131 // Cached MachineInfo returned by cadvisor. 1132 machineInfoLock sync.RWMutex 1133 machineInfo *cadvisorapi.MachineInfo 1134 1135 // Handles certificate rotations. 1136 serverCertificateManager certificate.Manager 1137 1138 // Cloud provider interface. 1139 cloud cloudprovider.Interface 1140 // Handles requests to cloud provider with timeout 1141 cloudResourceSyncManager cloudresource.SyncManager 1142 1143 // Indicates that the node initialization happens in an external cloud controller 1144 externalCloudProvider bool 1145 // Reference to this node. 1146 nodeRef *v1.ObjectReference 1147 1148 // Container runtime. 1149 containerRuntime kubecontainer.Runtime 1150 1151 // Streaming runtime handles container streaming. 1152 streamingRuntime kubecontainer.StreamingRuntime 1153 1154 // Container runtime service (needed by container runtime Start()). 1155 runtimeService internalapi.RuntimeService 1156 1157 // reasonCache caches the failure reason of the last creation of all containers, which is 1158 // used for generating ContainerStatus. 1159 reasonCache *ReasonCache 1160 1161 // containerRuntimeReadyExpected indicates whether container runtime being ready is expected 1162 // so errors are logged without verbosity guard, to avoid excessive error logs at node startup. 1163 // It's false during the node initialization period of nodeReadyGracePeriod, and after that 1164 // it's set to true by fastStatusUpdateOnce when it exits. 1165 containerRuntimeReadyExpected bool 1166 1167 // nodeStatusUpdateFrequency specifies how often kubelet computes node status. If node lease 1168 // feature is not enabled, it is also the frequency that kubelet posts node status to master. 1169 // In that case, be cautious when changing the constant, it must work with nodeMonitorGracePeriod 1170 // in nodecontroller. There are several constraints: 1171 // 1. nodeMonitorGracePeriod must be N times more than nodeStatusUpdateFrequency, where 1172 // N means number of retries allowed for kubelet to post node status. It is pointless 1173 // to make nodeMonitorGracePeriod be less than nodeStatusUpdateFrequency, since there 1174 // will only be fresh values from Kubelet at an interval of nodeStatusUpdateFrequency. 1175 // The constant must be less than podEvictionTimeout. 1176 // 2. nodeStatusUpdateFrequency needs to be large enough for kubelet to generate node 1177 // status. Kubelet may fail to update node status reliably if the value is too small, 1178 // as it takes time to gather all necessary node information. 1179 nodeStatusUpdateFrequency time.Duration 1180 1181 // nodeStatusReportFrequency is the frequency that kubelet posts node 1182 // status to master. It is only used when node lease feature is enabled. 1183 nodeStatusReportFrequency time.Duration 1184 1185 // lastStatusReportTime is the time when node status was last reported. 1186 lastStatusReportTime time.Time 1187 1188 // syncNodeStatusMux is a lock on updating the node status, because this path is not thread-safe. 1189 // This lock is used by Kubelet.syncNodeStatus and Kubelet.fastNodeStatusUpdate functions and shouldn't be used anywhere else. 1190 syncNodeStatusMux sync.Mutex 1191 1192 // updatePodCIDRMux is a lock on updating pod CIDR, because this path is not thread-safe. 1193 // This lock is used by Kubelet.updatePodCIDR function and shouldn't be used anywhere else. 1194 updatePodCIDRMux sync.Mutex 1195 1196 // updateRuntimeMux is a lock on updating runtime, because this path is not thread-safe. 1197 // This lock is used by Kubelet.updateRuntimeUp, Kubelet.fastNodeStatusUpdate and 1198 // Kubelet.HandlerSupportsUserNamespaces functions and shouldn't be used anywhere else. 1199 updateRuntimeMux sync.Mutex 1200 1201 // nodeLeaseController claims and renews the node lease for this Kubelet 1202 nodeLeaseController lease.Controller 1203 1204 // pleg observes the state of the container runtime and notifies the kubelet of changes to containers, which 1205 // notifies the podWorkers to reconcile the state of the pod (for instance, if a container dies and needs to 1206 // be restarted). 1207 pleg pleg.PodLifecycleEventGenerator 1208 1209 // eventedPleg supplements the pleg to deliver edge-driven container changes with low-latency. 1210 eventedPleg pleg.PodLifecycleEventGenerator 1211 1212 // Store kubecontainer.PodStatus for all pods. 1213 podCache kubecontainer.Cache 1214 1215 // os is a facade for various syscalls that need to be mocked during testing. 1216 os kubecontainer.OSInterface 1217 1218 // Watcher of out of memory events. 1219 oomWatcher oomwatcher.Watcher 1220 1221 // Monitor resource usage 1222 resourceAnalyzer serverstats.ResourceAnalyzer 1223 1224 // Whether or not we should have the QOS cgroup hierarchy for resource management 1225 cgroupsPerQOS bool 1226 1227 // If non-empty, pass this to the container runtime as the root cgroup. 1228 cgroupRoot string 1229 1230 // Mounter to use for volumes. 1231 mounter mount.Interface 1232 1233 // hostutil to interact with filesystems 1234 hostutil hostutil.HostUtils 1235 1236 // subpather to execute subpath actions 1237 subpather subpath.Interface 1238 1239 // Manager of non-Runtime containers. 1240 containerManager cm.ContainerManager 1241 1242 // Maximum Number of Pods which can be run by this Kubelet 1243 maxPods int 1244 1245 // Monitor Kubelet's sync loop 1246 syncLoopMonitor atomic.Value 1247 1248 // Container restart Backoff 1249 backOff *flowcontrol.Backoff 1250 1251 // Information about the ports which are opened by daemons on Node running this Kubelet server. 1252 daemonEndpoints *v1.NodeDaemonEndpoints 1253 1254 // A queue used to trigger pod workers. 1255 workQueue queue.WorkQueue 1256 1257 // oneTimeInitializer is used to initialize modules that are dependent on the runtime to be up. 1258 oneTimeInitializer sync.Once 1259 1260 // If set, use this IP address or addresses for the node 1261 nodeIPs []net.IP 1262 1263 // use this function to validate the kubelet nodeIP 1264 nodeIPValidator func(net.IP) error 1265 1266 // If non-nil, this is a unique identifier for the node in an external database, eg. cloudprovider 1267 providerID string 1268 1269 // clock is an interface that provides time related functionality in a way that makes it 1270 // easy to test the code. 1271 clock clock.WithTicker 1272 1273 // handlers called during the tryUpdateNodeStatus cycle 1274 setNodeStatusFuncs []func(context.Context, *v1.Node) error 1275 1276 lastNodeUnschedulableLock sync.Mutex 1277 // maintains Node.Spec.Unschedulable value from previous run of tryUpdateNodeStatus() 1278 lastNodeUnschedulable bool 1279 1280 // the list of handlers to call during pod admission. 1281 admitHandlers lifecycle.PodAdmitHandlers 1282 1283 // softAdmithandlers are applied to the pod after it is admitted by the Kubelet, but before it is 1284 // run. A pod rejected by a softAdmitHandler will be left in a Pending state indefinitely. If a 1285 // rejected pod should not be recreated, or the scheduler is not aware of the rejection rule, the 1286 // admission rule should be applied by a softAdmitHandler. 1287 softAdmitHandlers lifecycle.PodAdmitHandlers 1288 1289 // the list of handlers to call during pod sync loop. 1290 lifecycle.PodSyncLoopHandlers 1291 1292 // the list of handlers to call during pod sync. 1293 lifecycle.PodSyncHandlers 1294 1295 // the number of allowed pods per core 1296 podsPerCore int 1297 1298 // enableControllerAttachDetach indicates the Attach/Detach controller 1299 // should manage attachment/detachment of volumes scheduled to this node, 1300 // and disable kubelet from executing any attach/detach operations 1301 enableControllerAttachDetach bool 1302 1303 // trigger deleting containers in a pod 1304 containerDeletor *podContainerDeletor 1305 1306 // config iptables util rules 1307 makeIPTablesUtilChains bool 1308 1309 // The AppArmor validator for checking whether AppArmor is supported. 1310 appArmorValidator apparmor.Validator 1311 1312 // StatsProvider provides the node and the container stats. 1313 StatsProvider *stats.Provider 1314 1315 // pluginmanager runs a set of asynchronous loops that figure out which 1316 // plugins need to be registered/unregistered based on this node and makes it so. 1317 pluginManager pluginmanager.PluginManager 1318 1319 // This flag sets a maximum number of images to report in the node status. 1320 nodeStatusMaxImages int32 1321 1322 // Handles RuntimeClass objects for the Kubelet. 1323 runtimeClassManager *runtimeclass.Manager 1324 1325 // Handles node shutdown events for the Node. 1326 shutdownManager nodeshutdown.Manager 1327 1328 // Manage user namespaces 1329 usernsManager *userns.UsernsManager 1330 1331 // Mutex to serialize new pod admission and existing pod resizing 1332 podResizeMutex sync.Mutex 1333 1334 // OpenTelemetry Tracer 1335 tracer trace.Tracer 1336 1337 // Track node startup latencies 1338 nodeStartupLatencyTracker util.NodeStartupLatencyTracker 1339 } 1340 1341 // ListPodStats is delegated to StatsProvider, which implements stats.Provider interface 1342 func (kl *Kubelet) ListPodStats(ctx context.Context) ([]statsapi.PodStats, error) { 1343 return kl.StatsProvider.ListPodStats(ctx) 1344 } 1345 1346 // ListPodCPUAndMemoryStats is delegated to StatsProvider, which implements stats.Provider interface 1347 func (kl *Kubelet) ListPodCPUAndMemoryStats(ctx context.Context) ([]statsapi.PodStats, error) { 1348 return kl.StatsProvider.ListPodCPUAndMemoryStats(ctx) 1349 } 1350 1351 // ListPodStatsAndUpdateCPUNanoCoreUsage is delegated to StatsProvider, which implements stats.Provider interface 1352 func (kl *Kubelet) ListPodStatsAndUpdateCPUNanoCoreUsage(ctx context.Context) ([]statsapi.PodStats, error) { 1353 return kl.StatsProvider.ListPodStatsAndUpdateCPUNanoCoreUsage(ctx) 1354 } 1355 1356 // ImageFsStats is delegated to StatsProvider, which implements stats.Provider interface 1357 func (kl *Kubelet) ImageFsStats(ctx context.Context) (*statsapi.FsStats, *statsapi.FsStats, error) { 1358 return kl.StatsProvider.ImageFsStats(ctx) 1359 } 1360 1361 // GetCgroupStats is delegated to StatsProvider, which implements stats.Provider interface 1362 func (kl *Kubelet) GetCgroupStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, *statsapi.NetworkStats, error) { 1363 return kl.StatsProvider.GetCgroupStats(cgroupName, updateStats) 1364 } 1365 1366 // GetCgroupCPUAndMemoryStats is delegated to StatsProvider, which implements stats.Provider interface 1367 func (kl *Kubelet) GetCgroupCPUAndMemoryStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, error) { 1368 return kl.StatsProvider.GetCgroupCPUAndMemoryStats(cgroupName, updateStats) 1369 } 1370 1371 // RootFsStats is delegated to StatsProvider, which implements stats.Provider interface 1372 func (kl *Kubelet) RootFsStats() (*statsapi.FsStats, error) { 1373 return kl.StatsProvider.RootFsStats() 1374 } 1375 1376 // RlimitStats is delegated to StatsProvider, which implements stats.Provider interface 1377 func (kl *Kubelet) RlimitStats() (*statsapi.RlimitStats, error) { 1378 return kl.StatsProvider.RlimitStats() 1379 } 1380 1381 // setupDataDirs creates: 1382 // 1. the root directory 1383 // 2. the pods directory 1384 // 3. the plugins directory 1385 // 4. the pod-resources directory 1386 // 5. the checkpoint directory 1387 // 6. the pod logs root directory 1388 func (kl *Kubelet) setupDataDirs() error { 1389 if cleanedRoot := filepath.Clean(kl.rootDirectory); cleanedRoot != kl.rootDirectory { 1390 return fmt.Errorf("rootDirectory not in canonical form: expected %s, was %s", cleanedRoot, kl.rootDirectory) 1391 } 1392 pluginRegistrationDir := kl.getPluginsRegistrationDir() 1393 pluginsDir := kl.getPluginsDir() 1394 if err := os.MkdirAll(kl.getRootDir(), 0750); err != nil { 1395 return fmt.Errorf("error creating root directory: %v", err) 1396 } 1397 if err := os.MkdirAll(kl.getPodLogsDir(), 0750); err != nil { 1398 return fmt.Errorf("error creating pod logs root directory %q: %w", kl.getPodLogsDir(), err) 1399 } 1400 if err := kl.hostutil.MakeRShared(kl.getRootDir()); err != nil { 1401 return fmt.Errorf("error configuring root directory: %v", err) 1402 } 1403 if err := os.MkdirAll(kl.getPodsDir(), 0750); err != nil { 1404 return fmt.Errorf("error creating pods directory: %v", err) 1405 } 1406 if err := os.MkdirAll(kl.getPluginsDir(), 0750); err != nil { 1407 return fmt.Errorf("error creating plugins directory: %v", err) 1408 } 1409 if err := os.MkdirAll(kl.getPluginsRegistrationDir(), 0750); err != nil { 1410 return fmt.Errorf("error creating plugins registry directory: %v", err) 1411 } 1412 if err := os.MkdirAll(kl.getPodResourcesDir(), 0750); err != nil { 1413 return fmt.Errorf("error creating podresources directory: %v", err) 1414 } 1415 if utilfeature.DefaultFeatureGate.Enabled(features.ContainerCheckpoint) { 1416 if err := os.MkdirAll(kl.getCheckpointsDir(), 0700); err != nil { 1417 return fmt.Errorf("error creating checkpoint directory: %v", err) 1418 } 1419 } 1420 if selinux.GetEnabled() { 1421 err := selinux.SetFileLabel(pluginRegistrationDir, config.KubeletPluginsDirSELinuxLabel) 1422 if err != nil { 1423 klog.InfoS("Unprivileged containerized plugins might not work, could not set selinux context on plugin registration dir", "path", pluginRegistrationDir, "err", err) 1424 } 1425 err = selinux.SetFileLabel(pluginsDir, config.KubeletPluginsDirSELinuxLabel) 1426 if err != nil { 1427 klog.InfoS("Unprivileged containerized plugins might not work, could not set selinux context on plugins dir", "path", pluginsDir, "err", err) 1428 } 1429 } 1430 return nil 1431 } 1432 1433 // StartGarbageCollection starts garbage collection threads. 1434 func (kl *Kubelet) StartGarbageCollection() { 1435 loggedContainerGCFailure := false 1436 go wait.Until(func() { 1437 ctx := context.Background() 1438 if err := kl.containerGC.GarbageCollect(ctx); err != nil { 1439 klog.ErrorS(err, "Container garbage collection failed") 1440 kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ContainerGCFailed, err.Error()) 1441 loggedContainerGCFailure = true 1442 } else { 1443 var vLevel klog.Level = 4 1444 if loggedContainerGCFailure { 1445 vLevel = 1 1446 loggedContainerGCFailure = false 1447 } 1448 1449 klog.V(vLevel).InfoS("Container garbage collection succeeded") 1450 } 1451 }, ContainerGCPeriod, wait.NeverStop) 1452 1453 // when the high threshold is set to 100, and the max age is 0 (or the max age feature is disabled) 1454 // stub the image GC manager 1455 if kl.kubeletConfiguration.ImageGCHighThresholdPercent == 100 && 1456 (!utilfeature.DefaultFeatureGate.Enabled(features.ImageMaximumGCAge) || kl.kubeletConfiguration.ImageMaximumGCAge.Duration == 0) { 1457 klog.V(2).InfoS("ImageGCHighThresholdPercent is set 100 and ImageMaximumGCAge is 0, Disable image GC") 1458 return 1459 } 1460 1461 prevImageGCFailed := false 1462 beganGC := time.Now() 1463 go wait.Until(func() { 1464 ctx := context.Background() 1465 if err := kl.imageManager.GarbageCollect(ctx, beganGC); err != nil { 1466 if prevImageGCFailed { 1467 klog.ErrorS(err, "Image garbage collection failed multiple times in a row") 1468 // Only create an event for repeated failures 1469 kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ImageGCFailed, err.Error()) 1470 } else { 1471 klog.ErrorS(err, "Image garbage collection failed once. Stats initialization may not have completed yet") 1472 } 1473 prevImageGCFailed = true 1474 } else { 1475 var vLevel klog.Level = 4 1476 if prevImageGCFailed { 1477 vLevel = 1 1478 prevImageGCFailed = false 1479 } 1480 1481 klog.V(vLevel).InfoS("Image garbage collection succeeded") 1482 } 1483 }, ImageGCPeriod, wait.NeverStop) 1484 } 1485 1486 // initializeModules will initialize internal modules that do not require the container runtime to be up. 1487 // Note that the modules here must not depend on modules that are not initialized here. 1488 func (kl *Kubelet) initializeModules() error { 1489 // Prometheus metrics. 1490 metrics.Register( 1491 collectors.NewVolumeStatsCollector(kl), 1492 collectors.NewLogMetricsCollector(kl.StatsProvider.ListPodStats), 1493 ) 1494 metrics.SetNodeName(kl.nodeName) 1495 servermetrics.Register() 1496 1497 // Setup filesystem directories. 1498 if err := kl.setupDataDirs(); err != nil { 1499 return err 1500 } 1501 1502 // If the container logs directory does not exist, create it. 1503 if _, err := os.Stat(ContainerLogsDir); err != nil { 1504 if err := kl.os.MkdirAll(ContainerLogsDir, 0755); err != nil { 1505 return fmt.Errorf("failed to create directory %q: %v", ContainerLogsDir, err) 1506 } 1507 } 1508 1509 // Start the image manager. 1510 kl.imageManager.Start() 1511 1512 // Start the certificate manager if it was enabled. 1513 if kl.serverCertificateManager != nil { 1514 kl.serverCertificateManager.Start() 1515 } 1516 1517 // Start out of memory watcher. 1518 if kl.oomWatcher != nil { 1519 if err := kl.oomWatcher.Start(kl.nodeRef); err != nil { 1520 return fmt.Errorf("failed to start OOM watcher: %w", err) 1521 } 1522 } 1523 1524 // Start resource analyzer 1525 kl.resourceAnalyzer.Start() 1526 1527 return nil 1528 } 1529 1530 // initializeRuntimeDependentModules will initialize internal modules that require the container runtime to be up. 1531 func (kl *Kubelet) initializeRuntimeDependentModules() { 1532 if err := kl.cadvisor.Start(); err != nil { 1533 // Fail kubelet and rely on the babysitter to retry starting kubelet. 1534 klog.ErrorS(err, "Failed to start cAdvisor") 1535 os.Exit(1) 1536 } 1537 1538 // trigger on-demand stats collection once so that we have capacity information for ephemeral storage. 1539 // ignore any errors, since if stats collection is not successful, the container manager will fail to start below. 1540 kl.StatsProvider.GetCgroupStats("/", true) 1541 // Start container manager. 1542 node, err := kl.getNodeAnyWay() 1543 if err != nil { 1544 // Fail kubelet and rely on the babysitter to retry starting kubelet. 1545 klog.ErrorS(err, "Kubelet failed to get node info") 1546 os.Exit(1) 1547 } 1548 // containerManager must start after cAdvisor because it needs filesystem capacity information 1549 if err := kl.containerManager.Start(node, kl.GetActivePods, kl.sourcesReady, kl.statusManager, kl.runtimeService, kl.supportLocalStorageCapacityIsolation()); err != nil { 1550 // Fail kubelet and rely on the babysitter to retry starting kubelet. 1551 klog.ErrorS(err, "Failed to start ContainerManager") 1552 os.Exit(1) 1553 } 1554 // eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs 1555 kl.evictionManager.Start(kl.StatsProvider, kl.GetActivePods, kl.PodIsFinished, evictionMonitoringPeriod) 1556 1557 // container log manager must start after container runtime is up to retrieve information from container runtime 1558 // and inform container to reopen log file after log rotation. 1559 kl.containerLogManager.Start() 1560 // Adding Registration Callback function for CSI Driver 1561 kl.pluginManager.AddHandler(pluginwatcherapi.CSIPlugin, plugincache.PluginHandler(csi.PluginHandler)) 1562 // Adding Registration Callback function for DRA Plugin 1563 if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) { 1564 kl.pluginManager.AddHandler(pluginwatcherapi.DRAPlugin, plugincache.PluginHandler(draplugin.NewRegistrationHandler(kl.kubeClient, kl.getNodeAnyWay))) 1565 } 1566 // Adding Registration Callback function for Device Manager 1567 kl.pluginManager.AddHandler(pluginwatcherapi.DevicePlugin, kl.containerManager.GetPluginRegistrationHandler()) 1568 1569 // Start the plugin manager 1570 klog.V(4).InfoS("Starting plugin manager") 1571 go kl.pluginManager.Run(kl.sourcesReady, wait.NeverStop) 1572 1573 err = kl.shutdownManager.Start() 1574 if err != nil { 1575 // The shutdown manager is not critical for kubelet, so log failure, but don't block Kubelet startup if there was a failure starting it. 1576 klog.ErrorS(err, "Failed to start node shutdown manager") 1577 } 1578 } 1579 1580 // Run starts the kubelet reacting to config updates 1581 func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) { 1582 ctx := context.Background() 1583 if kl.logServer == nil { 1584 file := http.FileServer(http.Dir(nodeLogDir)) 1585 if utilfeature.DefaultFeatureGate.Enabled(features.NodeLogQuery) && kl.kubeletConfiguration.EnableSystemLogQuery { 1586 kl.logServer = http.StripPrefix("/logs/", http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { 1587 if nlq, errs := newNodeLogQuery(req.URL.Query()); len(errs) > 0 { 1588 http.Error(w, errs.ToAggregate().Error(), http.StatusBadRequest) 1589 return 1590 } else if nlq != nil { 1591 if req.URL.Path != "/" && req.URL.Path != "" { 1592 http.Error(w, "path not allowed in query mode", http.StatusNotAcceptable) 1593 return 1594 } 1595 if errs := nlq.validate(); len(errs) > 0 { 1596 http.Error(w, errs.ToAggregate().Error(), http.StatusNotAcceptable) 1597 return 1598 } 1599 // Validation ensures that the request does not query services and files at the same time 1600 if len(nlq.Services) > 0 { 1601 journal.ServeHTTP(w, req) 1602 return 1603 } 1604 // Validation ensures that the request does not explicitly query multiple files at the same time 1605 if len(nlq.Files) == 1 { 1606 // Account for the \ being used on Windows clients 1607 req.URL.Path = filepath.ToSlash(nlq.Files[0]) 1608 } 1609 } 1610 // Fall back in case the caller is directly trying to query a file 1611 // Example: kubectl get --raw /api/v1/nodes/$name/proxy/logs/foo.log 1612 file.ServeHTTP(w, req) 1613 })) 1614 } else { 1615 kl.logServer = http.StripPrefix("/logs/", file) 1616 } 1617 } 1618 if kl.kubeClient == nil { 1619 klog.InfoS("No API server defined - no node status update will be sent") 1620 } 1621 1622 // Start the cloud provider sync manager 1623 if kl.cloudResourceSyncManager != nil { 1624 go kl.cloudResourceSyncManager.Run(wait.NeverStop) 1625 } 1626 1627 if err := kl.initializeModules(); err != nil { 1628 kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.KubeletSetupFailed, err.Error()) 1629 klog.ErrorS(err, "Failed to initialize internal modules") 1630 os.Exit(1) 1631 } 1632 1633 // Start volume manager 1634 go kl.volumeManager.Run(kl.sourcesReady, wait.NeverStop) 1635 1636 if kl.kubeClient != nil { 1637 // Start two go-routines to update the status. 1638 // 1639 // The first will report to the apiserver every nodeStatusUpdateFrequency and is aimed to provide regular status intervals, 1640 // while the second is used to provide a more timely status update during initialization and runs an one-shot update to the apiserver 1641 // once the node becomes ready, then exits afterwards. 1642 // 1643 // Introduce some small jittering to ensure that over time the requests won't start 1644 // accumulating at approximately the same time from the set of nodes due to priority and 1645 // fairness effect. 1646 go wait.JitterUntil(kl.syncNodeStatus, kl.nodeStatusUpdateFrequency, 0.04, true, wait.NeverStop) 1647 go kl.fastStatusUpdateOnce() 1648 1649 // start syncing lease 1650 go kl.nodeLeaseController.Run(context.Background()) 1651 } 1652 go wait.Until(kl.updateRuntimeUp, 5*time.Second, wait.NeverStop) 1653 1654 // Set up iptables util rules 1655 if kl.makeIPTablesUtilChains { 1656 kl.initNetworkUtil() 1657 } 1658 1659 // Start component sync loops. 1660 kl.statusManager.Start() 1661 1662 // Start syncing RuntimeClasses if enabled. 1663 if kl.runtimeClassManager != nil { 1664 kl.runtimeClassManager.Start(wait.NeverStop) 1665 } 1666 1667 // Start the pod lifecycle event generator. 1668 kl.pleg.Start() 1669 1670 // Start eventedPLEG only if EventedPLEG feature gate is enabled. 1671 if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) { 1672 kl.eventedPleg.Start() 1673 } 1674 1675 kl.syncLoop(ctx, updates, kl) 1676 } 1677 1678 // SyncPod is the transaction script for the sync of a single pod (setting up) 1679 // a pod. This method is reentrant and expected to converge a pod towards the 1680 // desired state of the spec. The reverse (teardown) is handled in 1681 // SyncTerminatingPod and SyncTerminatedPod. If SyncPod exits without error, 1682 // then the pod runtime state is in sync with the desired configuration state 1683 // (pod is running). If SyncPod exits with a transient error, the next 1684 // invocation of SyncPod is expected to make progress towards reaching the 1685 // desired state. SyncPod exits with isTerminal when the pod was detected to 1686 // have reached a terminal lifecycle phase due to container exits (for 1687 // RestartNever or RestartOnFailure) and the next method invoked will be 1688 // SyncTerminatingPod. If the pod terminates for any other reason, SyncPod 1689 // will receive a context cancellation and should exit as soon as possible. 1690 // 1691 // Arguments: 1692 // 1693 // updateType - whether this is a create (first time) or an update, should 1694 // only be used for metrics since this method must be reentrant 1695 // 1696 // pod - the pod that is being set up 1697 // 1698 // mirrorPod - the mirror pod known to the kubelet for this pod, if any 1699 // 1700 // podStatus - the most recent pod status observed for this pod which can 1701 // be used to determine the set of actions that should be taken during 1702 // this loop of SyncPod 1703 // 1704 // The workflow is: 1705 // - If the pod is being created, record pod worker start latency 1706 // - Call generateAPIPodStatus to prepare an v1.PodStatus for the pod 1707 // - If the pod is being seen as running for the first time, record pod 1708 // start latency 1709 // - Update the status of the pod in the status manager 1710 // - Stop the pod's containers if it should not be running due to soft 1711 // admission 1712 // - Ensure any background tracking for a runnable pod is started 1713 // - Create a mirror pod if the pod is a static pod, and does not 1714 // already have a mirror pod 1715 // - Create the data directories for the pod if they do not exist 1716 // - Wait for volumes to attach/mount 1717 // - Fetch the pull secrets for the pod 1718 // - Call the container runtime's SyncPod callback 1719 // - Update the traffic shaping for the pod's ingress and egress limits 1720 // 1721 // If any step of this workflow errors, the error is returned, and is repeated 1722 // on the next SyncPod call. 1723 // 1724 // This operation writes all events that are dispatched in order to provide 1725 // the most accurate information possible about an error situation to aid debugging. 1726 // Callers should not write an event if this operation returns an error. 1727 func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (isTerminal bool, err error) { 1728 ctx, otelSpan := kl.tracer.Start(ctx, "syncPod", trace.WithAttributes( 1729 semconv.K8SPodUIDKey.String(string(pod.UID)), 1730 attribute.String("k8s.pod", klog.KObj(pod).String()), 1731 semconv.K8SPodNameKey.String(pod.Name), 1732 attribute.String("k8s.pod.update_type", updateType.String()), 1733 semconv.K8SNamespaceNameKey.String(pod.Namespace), 1734 )) 1735 klog.V(4).InfoS("SyncPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) 1736 defer func() { 1737 klog.V(4).InfoS("SyncPod exit", "pod", klog.KObj(pod), "podUID", pod.UID, "isTerminal", isTerminal) 1738 otelSpan.End() 1739 }() 1740 1741 // Latency measurements for the main workflow are relative to the 1742 // first time the pod was seen by kubelet. 1743 var firstSeenTime time.Time 1744 if firstSeenTimeStr, ok := pod.Annotations[kubetypes.ConfigFirstSeenAnnotationKey]; ok { 1745 firstSeenTime = kubetypes.ConvertToTimestamp(firstSeenTimeStr).Get() 1746 } 1747 1748 // Record pod worker start latency if being created 1749 // TODO: make pod workers record their own latencies 1750 if updateType == kubetypes.SyncPodCreate { 1751 if !firstSeenTime.IsZero() { 1752 // This is the first time we are syncing the pod. Record the latency 1753 // since kubelet first saw the pod if firstSeenTime is set. 1754 metrics.PodWorkerStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) 1755 } else { 1756 klog.V(3).InfoS("First seen time not recorded for pod", 1757 "podUID", pod.UID, 1758 "pod", klog.KObj(pod)) 1759 } 1760 } 1761 1762 // Generate final API pod status with pod and status manager status 1763 apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, false) 1764 // The pod IP may be changed in generateAPIPodStatus if the pod is using host network. (See #24576) 1765 // TODO(random-liu): After writing pod spec into container labels, check whether pod is using host network, and 1766 // set pod IP to hostIP directly in runtime.GetPodStatus 1767 podStatus.IPs = make([]string, 0, len(apiPodStatus.PodIPs)) 1768 for _, ipInfo := range apiPodStatus.PodIPs { 1769 podStatus.IPs = append(podStatus.IPs, ipInfo.IP) 1770 } 1771 if len(podStatus.IPs) == 0 && len(apiPodStatus.PodIP) > 0 { 1772 podStatus.IPs = []string{apiPodStatus.PodIP} 1773 } 1774 1775 // If the pod is terminal, we don't need to continue to setup the pod 1776 if apiPodStatus.Phase == v1.PodSucceeded || apiPodStatus.Phase == v1.PodFailed { 1777 kl.statusManager.SetPodStatus(pod, apiPodStatus) 1778 isTerminal = true 1779 return isTerminal, nil 1780 } 1781 1782 // If the pod should not be running, we request the pod's containers be stopped. This is not the same 1783 // as termination (we want to stop the pod, but potentially restart it later if soft admission allows 1784 // it later). Set the status and phase appropriately 1785 runnable := kl.canRunPod(pod) 1786 if !runnable.Admit { 1787 // Pod is not runnable; and update the Pod and Container statuses to why. 1788 if apiPodStatus.Phase != v1.PodFailed && apiPodStatus.Phase != v1.PodSucceeded { 1789 apiPodStatus.Phase = v1.PodPending 1790 } 1791 apiPodStatus.Reason = runnable.Reason 1792 apiPodStatus.Message = runnable.Message 1793 // Waiting containers are not creating. 1794 const waitingReason = "Blocked" 1795 for _, cs := range apiPodStatus.InitContainerStatuses { 1796 if cs.State.Waiting != nil { 1797 cs.State.Waiting.Reason = waitingReason 1798 } 1799 } 1800 for _, cs := range apiPodStatus.ContainerStatuses { 1801 if cs.State.Waiting != nil { 1802 cs.State.Waiting.Reason = waitingReason 1803 } 1804 } 1805 } 1806 1807 // Record the time it takes for the pod to become running 1808 // since kubelet first saw the pod if firstSeenTime is set. 1809 existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID) 1810 if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning && 1811 !firstSeenTime.IsZero() { 1812 metrics.PodStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) 1813 } 1814 1815 kl.statusManager.SetPodStatus(pod, apiPodStatus) 1816 1817 // Pods that are not runnable must be stopped - return a typed error to the pod worker 1818 if !runnable.Admit { 1819 klog.V(2).InfoS("Pod is not runnable and must have running containers stopped", "pod", klog.KObj(pod), "podUID", pod.UID, "message", runnable.Message) 1820 var syncErr error 1821 p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) 1822 if err := kl.killPod(ctx, pod, p, nil); err != nil { 1823 if !wait.Interrupted(err) { 1824 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) 1825 syncErr = fmt.Errorf("error killing pod: %w", err) 1826 utilruntime.HandleError(syncErr) 1827 } 1828 } else { 1829 // There was no error killing the pod, but the pod cannot be run. 1830 // Return an error to signal that the sync loop should back off. 1831 syncErr = fmt.Errorf("pod cannot be run: %v", runnable.Message) 1832 } 1833 return false, syncErr 1834 } 1835 1836 // If the network plugin is not ready, only start the pod if it uses the host network 1837 if err := kl.runtimeState.networkErrors(); err != nil && !kubecontainer.IsHostNetworkPod(pod) { 1838 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.NetworkNotReady, "%s: %v", NetworkNotReadyErrorMsg, err) 1839 return false, fmt.Errorf("%s: %v", NetworkNotReadyErrorMsg, err) 1840 } 1841 1842 // ensure the kubelet knows about referenced secrets or configmaps used by the pod 1843 if !kl.podWorkers.IsPodTerminationRequested(pod.UID) { 1844 if kl.secretManager != nil { 1845 kl.secretManager.RegisterPod(pod) 1846 } 1847 if kl.configMapManager != nil { 1848 kl.configMapManager.RegisterPod(pod) 1849 } 1850 } 1851 1852 // Create Cgroups for the pod and apply resource parameters 1853 // to them if cgroups-per-qos flag is enabled. 1854 pcm := kl.containerManager.NewPodContainerManager() 1855 // If pod has already been terminated then we need not create 1856 // or update the pod's cgroup 1857 // TODO: once context cancellation is added this check can be removed 1858 if !kl.podWorkers.IsPodTerminationRequested(pod.UID) { 1859 // When the kubelet is restarted with the cgroups-per-qos 1860 // flag enabled, all the pod's running containers 1861 // should be killed intermittently and brought back up 1862 // under the qos cgroup hierarchy. 1863 // Check if this is the pod's first sync 1864 firstSync := true 1865 for _, containerStatus := range apiPodStatus.ContainerStatuses { 1866 if containerStatus.State.Running != nil { 1867 firstSync = false 1868 break 1869 } 1870 } 1871 // Don't kill containers in pod if pod's cgroups already 1872 // exists or the pod is running for the first time 1873 podKilled := false 1874 if !pcm.Exists(pod) && !firstSync { 1875 p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) 1876 if err := kl.killPod(ctx, pod, p, nil); err == nil { 1877 if wait.Interrupted(err) { 1878 return false, err 1879 } 1880 podKilled = true 1881 } else { 1882 klog.ErrorS(err, "KillPod failed", "pod", klog.KObj(pod), "podStatus", podStatus) 1883 } 1884 } 1885 // Create and Update pod's Cgroups 1886 // Don't create cgroups for run once pod if it was killed above 1887 // The current policy is not to restart the run once pods when 1888 // the kubelet is restarted with the new flag as run once pods are 1889 // expected to run only once and if the kubelet is restarted then 1890 // they are not expected to run again. 1891 // We don't create and apply updates to cgroup if its a run once pod and was killed above 1892 if !(podKilled && pod.Spec.RestartPolicy == v1.RestartPolicyNever) { 1893 if !pcm.Exists(pod) { 1894 if err := kl.containerManager.UpdateQOSCgroups(); err != nil { 1895 klog.V(2).InfoS("Failed to update QoS cgroups while syncing pod", "pod", klog.KObj(pod), "err", err) 1896 } 1897 if err := pcm.EnsureExists(pod); err != nil { 1898 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToCreatePodContainer, "unable to ensure pod container exists: %v", err) 1899 return false, fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err) 1900 } 1901 } 1902 } 1903 } 1904 1905 // Create Mirror Pod for Static Pod if it doesn't already exist 1906 if kubetypes.IsStaticPod(pod) { 1907 deleted := false 1908 if mirrorPod != nil { 1909 if mirrorPod.DeletionTimestamp != nil || !kubepod.IsMirrorPodOf(mirrorPod, pod) { 1910 // The mirror pod is semantically different from the static pod. Remove 1911 // it. The mirror pod will get recreated later. 1912 klog.InfoS("Trying to delete pod", "pod", klog.KObj(pod), "podUID", mirrorPod.ObjectMeta.UID) 1913 podFullName := kubecontainer.GetPodFullName(pod) 1914 var err error 1915 deleted, err = kl.mirrorPodClient.DeleteMirrorPod(podFullName, &mirrorPod.ObjectMeta.UID) 1916 if deleted { 1917 klog.InfoS("Deleted mirror pod because it is outdated", "pod", klog.KObj(mirrorPod)) 1918 } else if err != nil { 1919 klog.ErrorS(err, "Failed deleting mirror pod", "pod", klog.KObj(mirrorPod)) 1920 } 1921 } 1922 } 1923 if mirrorPod == nil || deleted { 1924 node, err := kl.GetNode() 1925 if err != nil { 1926 klog.V(4).ErrorS(err, "No need to create a mirror pod, since failed to get node info from the cluster", "node", klog.KRef("", string(kl.nodeName))) 1927 } else if node.DeletionTimestamp != nil { 1928 klog.V(4).InfoS("No need to create a mirror pod, since node has been removed from the cluster", "node", klog.KRef("", string(kl.nodeName))) 1929 } else { 1930 klog.V(4).InfoS("Creating a mirror pod for static pod", "pod", klog.KObj(pod)) 1931 if err := kl.mirrorPodClient.CreateMirrorPod(pod); err != nil { 1932 klog.ErrorS(err, "Failed creating a mirror pod for", "pod", klog.KObj(pod)) 1933 } 1934 } 1935 } 1936 } 1937 1938 // Make data directories for the pod 1939 if err := kl.makePodDataDirs(pod); err != nil { 1940 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToMakePodDataDirectories, "error making pod data directories: %v", err) 1941 klog.ErrorS(err, "Unable to make pod data directories for pod", "pod", klog.KObj(pod)) 1942 return false, err 1943 } 1944 1945 // Wait for volumes to attach/mount 1946 if err := kl.volumeManager.WaitForAttachAndMount(ctx, pod); err != nil { 1947 if !wait.Interrupted(err) { 1948 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedMountVolume, "Unable to attach or mount volumes: %v", err) 1949 klog.ErrorS(err, "Unable to attach or mount volumes for pod; skipping pod", "pod", klog.KObj(pod)) 1950 } 1951 return false, err 1952 } 1953 1954 // Fetch the pull secrets for the pod 1955 pullSecrets := kl.getPullSecretsForPod(pod) 1956 1957 // Ensure the pod is being probed 1958 kl.probeManager.AddPod(pod) 1959 1960 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { 1961 // Handle pod resize here instead of doing it in HandlePodUpdates because 1962 // this conveniently retries any Deferred resize requests 1963 // TODO(vinaykul,InPlacePodVerticalScaling): Investigate doing this in HandlePodUpdates + periodic SyncLoop scan 1964 // See: https://github.com/kubernetes/kubernetes/pull/102884#discussion_r663160060 1965 if kl.podWorkers.CouldHaveRunningContainers(pod.UID) && !kubetypes.IsStaticPod(pod) { 1966 pod = kl.handlePodResourcesResize(pod) 1967 } 1968 } 1969 1970 // TODO(#113606): use cancellation from the incoming context parameter, which comes from the pod worker. 1971 // Currently, using cancellation from that context causes test failures. To remove this WithoutCancel, 1972 // any wait.Interrupted errors need to be filtered from result and bypass the reasonCache - cancelling 1973 // the context for SyncPod is a known and deliberate error, not a generic error. 1974 // Use WithoutCancel instead of a new context.TODO() to propagate trace context 1975 // Call the container runtime's SyncPod callback 1976 sctx := context.WithoutCancel(ctx) 1977 result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.backOff) 1978 kl.reasonCache.Update(pod.UID, result) 1979 if err := result.Error(); err != nil { 1980 // Do not return error if the only failures were pods in backoff 1981 for _, r := range result.SyncResults { 1982 if r.Error != kubecontainer.ErrCrashLoopBackOff && r.Error != images.ErrImagePullBackOff { 1983 // Do not record an event here, as we keep all event logging for sync pod failures 1984 // local to container runtime, so we get better errors. 1985 return false, err 1986 } 1987 } 1988 1989 return false, nil 1990 } 1991 1992 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && isPodResizeInProgress(pod, &apiPodStatus) { 1993 // While resize is in progress, periodically call PLEG to update pod cache 1994 runningPod := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) 1995 if err, _ := kl.pleg.UpdateCache(&runningPod, pod.UID); err != nil { 1996 klog.ErrorS(err, "Failed to update pod cache", "pod", klog.KObj(pod)) 1997 return false, err 1998 } 1999 } 2000 2001 return false, nil 2002 } 2003 2004 // SyncTerminatingPod is expected to terminate all running containers in a pod. Once this method 2005 // returns without error, the pod is considered to be terminated and it will be safe to clean up any 2006 // pod state that is tied to the lifetime of running containers. The next method invoked will be 2007 // SyncTerminatedPod. This method is expected to return with the grace period provided and the 2008 // provided context may be cancelled if the duration is exceeded. The method may also be interrupted 2009 // with a context cancellation if the grace period is shortened by the user or the kubelet (such as 2010 // during eviction). This method is not guaranteed to be called if a pod is force deleted from the 2011 // configuration and the kubelet is restarted - SyncTerminatingRuntimePod handles those orphaned 2012 // pods. 2013 func (kl *Kubelet) SyncTerminatingPod(_ context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { 2014 // TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker. 2015 // Currently, using that context causes test failures. 2016 ctx, otelSpan := kl.tracer.Start(context.Background(), "syncTerminatingPod", trace.WithAttributes( 2017 semconv.K8SPodUIDKey.String(string(pod.UID)), 2018 attribute.String("k8s.pod", klog.KObj(pod).String()), 2019 semconv.K8SPodNameKey.String(pod.Name), 2020 semconv.K8SNamespaceNameKey.String(pod.Namespace), 2021 )) 2022 defer otelSpan.End() 2023 klog.V(4).InfoS("SyncTerminatingPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) 2024 defer klog.V(4).InfoS("SyncTerminatingPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) 2025 2026 apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, false) 2027 if podStatusFn != nil { 2028 podStatusFn(&apiPodStatus) 2029 } 2030 kl.statusManager.SetPodStatus(pod, apiPodStatus) 2031 2032 if gracePeriod != nil { 2033 klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", *gracePeriod) 2034 } else { 2035 klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", nil) 2036 } 2037 2038 kl.probeManager.StopLivenessAndStartup(pod) 2039 2040 p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) 2041 if err := kl.killPod(ctx, pod, p, gracePeriod); err != nil { 2042 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) 2043 // there was an error killing the pod, so we return that error directly 2044 utilruntime.HandleError(err) 2045 return err 2046 } 2047 2048 // Once the containers are stopped, we can stop probing for liveness and readiness. 2049 // TODO: once a pod is terminal, certain probes (liveness exec) could be stopped immediately after 2050 // the detection of a container shutdown or (for readiness) after the first failure. Tracked as 2051 // https://github.com/kubernetes/kubernetes/issues/107894 although may not be worth optimizing. 2052 kl.probeManager.RemovePod(pod) 2053 2054 // Guard against consistency issues in KillPod implementations by checking that there are no 2055 // running containers. This method is invoked infrequently so this is effectively free and can 2056 // catch race conditions introduced by callers updating pod status out of order. 2057 // TODO: have KillPod return the terminal status of stopped containers and write that into the 2058 // cache immediately 2059 podStatus, err := kl.containerRuntime.GetPodStatus(ctx, pod.UID, pod.Name, pod.Namespace) 2060 if err != nil { 2061 klog.ErrorS(err, "Unable to read pod status prior to final pod termination", "pod", klog.KObj(pod), "podUID", pod.UID) 2062 return err 2063 } 2064 var runningContainers []string 2065 type container struct { 2066 Name string 2067 State string 2068 ExitCode int 2069 FinishedAt string 2070 } 2071 var containers []container 2072 klogV := klog.V(4) 2073 klogVEnabled := klogV.Enabled() 2074 for _, s := range podStatus.ContainerStatuses { 2075 if s.State == kubecontainer.ContainerStateRunning { 2076 runningContainers = append(runningContainers, s.ID.String()) 2077 } 2078 if klogVEnabled { 2079 containers = append(containers, container{Name: s.Name, State: string(s.State), ExitCode: s.ExitCode, FinishedAt: s.FinishedAt.UTC().Format(time.RFC3339Nano)}) 2080 } 2081 } 2082 if klogVEnabled { 2083 sort.Slice(containers, func(i, j int) bool { return containers[i].Name < containers[j].Name }) 2084 klog.V(4).InfoS("Post-termination container state", "pod", klog.KObj(pod), "podUID", pod.UID, "containers", containers) 2085 } 2086 if len(runningContainers) > 0 { 2087 return fmt.Errorf("detected running containers after a successful KillPod, CRI violation: %v", runningContainers) 2088 } 2089 2090 // NOTE: resources must be unprepared AFTER all containers have stopped 2091 // and BEFORE the pod status is changed on the API server 2092 // to avoid race conditions with the resource deallocation code in kubernetes core. 2093 if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) { 2094 if err := kl.UnprepareDynamicResources(pod); err != nil { 2095 return err 2096 } 2097 } 2098 2099 // Compute and update the status in cache once the pods are no longer running. 2100 // The computation is done here to ensure the pod status used for it contains 2101 // information about the container end states (including exit codes) - when 2102 // SyncTerminatedPod is called the containers may already be removed. 2103 apiPodStatus = kl.generateAPIPodStatus(pod, podStatus, true) 2104 kl.statusManager.SetPodStatus(pod, apiPodStatus) 2105 2106 // we have successfully stopped all containers, the pod is terminating, our status is "done" 2107 klog.V(4).InfoS("Pod termination stopped all running containers", "pod", klog.KObj(pod), "podUID", pod.UID) 2108 2109 return nil 2110 } 2111 2112 // SyncTerminatingRuntimePod is expected to terminate running containers in a pod that we have no 2113 // configuration for. Once this method returns without error, any remaining local state can be safely 2114 // cleaned up by background processes in each subsystem. Unlike syncTerminatingPod, we lack 2115 // knowledge of the full pod spec and so cannot perform lifecycle related operations, only ensure 2116 // that the remnant of the running pod is terminated and allow garbage collection to proceed. We do 2117 // not update the status of the pod because with the source of configuration removed, we have no 2118 // place to send that status. 2119 func (kl *Kubelet) SyncTerminatingRuntimePod(_ context.Context, runningPod *kubecontainer.Pod) error { 2120 // TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker. 2121 // Currently, using that context causes test failures. 2122 ctx := context.Background() 2123 pod := runningPod.ToAPIPod() 2124 klog.V(4).InfoS("SyncTerminatingRuntimePod enter", "pod", klog.KObj(pod), "podUID", pod.UID) 2125 defer klog.V(4).InfoS("SyncTerminatingRuntimePod exit", "pod", klog.KObj(pod), "podUID", pod.UID) 2126 2127 // we kill the pod directly since we have lost all other information about the pod. 2128 klog.V(4).InfoS("Orphaned running pod terminating without grace period", "pod", klog.KObj(pod), "podUID", pod.UID) 2129 // TODO: this should probably be zero, to bypass any waiting (needs fixes in container runtime) 2130 gracePeriod := int64(1) 2131 if err := kl.killPod(ctx, pod, *runningPod, &gracePeriod); err != nil { 2132 kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) 2133 // there was an error killing the pod, so we return that error directly 2134 utilruntime.HandleError(err) 2135 return err 2136 } 2137 klog.V(4).InfoS("Pod termination stopped all running orphaned containers", "pod", klog.KObj(pod), "podUID", pod.UID) 2138 return nil 2139 } 2140 2141 // SyncTerminatedPod cleans up a pod that has terminated (has no running containers). 2142 // The invocations in this call are expected to tear down all pod resources. 2143 // When this method exits the pod is expected to be ready for cleanup. This method 2144 // reduces the latency of pod cleanup but is not guaranteed to get called in all scenarios. 2145 // 2146 // Because the kubelet has no local store of information, all actions in this method that modify 2147 // on-disk state must be reentrant and be garbage collected by HandlePodCleanups or a separate loop. 2148 // This typically occurs when a pod is force deleted from configuration (local disk or API) and the 2149 // kubelet restarts in the middle of the action. 2150 func (kl *Kubelet) SyncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { 2151 ctx, otelSpan := kl.tracer.Start(ctx, "syncTerminatedPod", trace.WithAttributes( 2152 semconv.K8SPodUIDKey.String(string(pod.UID)), 2153 attribute.String("k8s.pod", klog.KObj(pod).String()), 2154 semconv.K8SPodNameKey.String(pod.Name), 2155 semconv.K8SNamespaceNameKey.String(pod.Namespace), 2156 )) 2157 defer otelSpan.End() 2158 klog.V(4).InfoS("SyncTerminatedPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) 2159 defer klog.V(4).InfoS("SyncTerminatedPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) 2160 2161 // generate the final status of the pod 2162 // TODO: should we simply fold this into TerminatePod? that would give a single pod update 2163 apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, true) 2164 2165 kl.statusManager.SetPodStatus(pod, apiPodStatus) 2166 2167 // volumes are unmounted after the pod worker reports ShouldPodRuntimeBeRemoved (which is satisfied 2168 // before syncTerminatedPod is invoked) 2169 if err := kl.volumeManager.WaitForUnmount(ctx, pod); err != nil { 2170 return err 2171 } 2172 klog.V(4).InfoS("Pod termination unmounted volumes", "pod", klog.KObj(pod), "podUID", pod.UID) 2173 2174 // This waiting loop relies on the background cleanup which starts after pod workers respond 2175 // true for ShouldPodRuntimeBeRemoved, which happens after `SyncTerminatingPod` is completed. 2176 if err := wait.PollUntilContextCancel(ctx, 100*time.Millisecond, true, func(ctx context.Context) (bool, error) { 2177 volumesExist := kl.podVolumesExist(pod.UID) 2178 if volumesExist { 2179 klog.V(3).InfoS("Pod is terminated, but some volumes have not been cleaned up", "pod", klog.KObj(pod), "podUID", pod.UID) 2180 } 2181 return !volumesExist, nil 2182 }); err != nil { 2183 return err 2184 } 2185 klog.V(3).InfoS("Pod termination cleaned up volume paths", "pod", klog.KObj(pod), "podUID", pod.UID) 2186 2187 // After volume unmount is complete, let the secret and configmap managers know we're done with this pod 2188 if kl.secretManager != nil { 2189 kl.secretManager.UnregisterPod(pod) 2190 } 2191 if kl.configMapManager != nil { 2192 kl.configMapManager.UnregisterPod(pod) 2193 } 2194 2195 // Note: we leave pod containers to be reclaimed in the background since dockershim requires the 2196 // container for retrieving logs and we want to make sure logs are available until the pod is 2197 // physically deleted. 2198 2199 // remove any cgroups in the hierarchy for pods that are no longer running. 2200 if kl.cgroupsPerQOS { 2201 pcm := kl.containerManager.NewPodContainerManager() 2202 name, _ := pcm.GetPodContainerName(pod) 2203 if err := pcm.Destroy(name); err != nil { 2204 return err 2205 } 2206 klog.V(4).InfoS("Pod termination removed cgroups", "pod", klog.KObj(pod), "podUID", pod.UID) 2207 } 2208 2209 kl.usernsManager.Release(pod.UID) 2210 2211 // mark the final pod status 2212 kl.statusManager.TerminatePod(pod) 2213 klog.V(4).InfoS("Pod is terminated and will need no more status updates", "pod", klog.KObj(pod), "podUID", pod.UID) 2214 2215 return nil 2216 } 2217 2218 // Get pods which should be resynchronized. Currently, the following pod should be resynchronized: 2219 // - pod whose work is ready. 2220 // - internal modules that request sync of a pod. 2221 // 2222 // This method does not return orphaned pods (those known only to the pod worker that may have 2223 // been deleted from configuration). Those pods are synced by HandlePodCleanups as a consequence 2224 // of driving the state machine to completion. 2225 // 2226 // TODO: Consider synchronizing all pods which have not recently been acted on to be resilient 2227 // to bugs that might prevent updates from being delivered (such as the previous bug with 2228 // orphaned pods). Instead of asking the work queue for pending work, consider asking the 2229 // PodWorker which pods should be synced. 2230 func (kl *Kubelet) getPodsToSync() []*v1.Pod { 2231 allPods := kl.podManager.GetPods() 2232 podUIDs := kl.workQueue.GetWork() 2233 podUIDSet := sets.NewString() 2234 for _, podUID := range podUIDs { 2235 podUIDSet.Insert(string(podUID)) 2236 } 2237 var podsToSync []*v1.Pod 2238 for _, pod := range allPods { 2239 if podUIDSet.Has(string(pod.UID)) { 2240 // The work of the pod is ready 2241 podsToSync = append(podsToSync, pod) 2242 continue 2243 } 2244 for _, podSyncLoopHandler := range kl.PodSyncLoopHandlers { 2245 if podSyncLoopHandler.ShouldSync(pod) { 2246 podsToSync = append(podsToSync, pod) 2247 break 2248 } 2249 } 2250 } 2251 return podsToSync 2252 } 2253 2254 // deletePod deletes the pod from the internal state of the kubelet by: 2255 // 1. stopping the associated pod worker asynchronously 2256 // 2. signaling to kill the pod by sending on the podKillingCh channel 2257 // 2258 // deletePod returns an error if not all sources are ready or the pod is not 2259 // found in the runtime cache. 2260 func (kl *Kubelet) deletePod(pod *v1.Pod) error { 2261 if pod == nil { 2262 return fmt.Errorf("deletePod does not allow nil pod") 2263 } 2264 if !kl.sourcesReady.AllReady() { 2265 // If the sources aren't ready, skip deletion, as we may accidentally delete pods 2266 // for sources that haven't reported yet. 2267 return fmt.Errorf("skipping delete because sources aren't ready yet") 2268 } 2269 klog.V(3).InfoS("Pod has been deleted and must be killed", "pod", klog.KObj(pod), "podUID", pod.UID) 2270 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2271 Pod: pod, 2272 UpdateType: kubetypes.SyncPodKill, 2273 }) 2274 // We leave the volume/directory cleanup to the periodic cleanup routine. 2275 return nil 2276 } 2277 2278 // rejectPod records an event about the pod with the given reason and message, 2279 // and updates the pod to the failed phase in the status manager. 2280 func (kl *Kubelet) rejectPod(pod *v1.Pod, reason, message string) { 2281 kl.recorder.Eventf(pod, v1.EventTypeWarning, reason, message) 2282 kl.statusManager.SetPodStatus(pod, v1.PodStatus{ 2283 Phase: v1.PodFailed, 2284 Reason: reason, 2285 Message: "Pod was rejected: " + message}) 2286 } 2287 2288 // canAdmitPod determines if a pod can be admitted, and gives a reason if it 2289 // cannot. "pod" is new pod, while "pods" are all admitted pods 2290 // The function returns a boolean value indicating whether the pod 2291 // can be admitted, a brief single-word reason and a message explaining why 2292 // the pod cannot be admitted. 2293 func (kl *Kubelet) canAdmitPod(pods []*v1.Pod, pod *v1.Pod) (bool, string, string) { 2294 // the kubelet will invoke each pod admit handler in sequence 2295 // if any handler rejects, the pod is rejected. 2296 // TODO: move out of disk check into a pod admitter 2297 // TODO: out of resource eviction should have a pod admitter call-out 2298 attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: pods} 2299 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { 2300 // Use allocated resources values from checkpoint store (source of truth) to determine fit 2301 otherPods := make([]*v1.Pod, 0, len(pods)) 2302 for _, p := range pods { 2303 op := p.DeepCopy() 2304 kl.updateContainerResourceAllocation(op) 2305 2306 otherPods = append(otherPods, op) 2307 } 2308 attrs.OtherPods = otherPods 2309 } 2310 for _, podAdmitHandler := range kl.admitHandlers { 2311 if result := podAdmitHandler.Admit(attrs); !result.Admit { 2312 return false, result.Reason, result.Message 2313 } 2314 } 2315 2316 return true, "", "" 2317 } 2318 2319 func (kl *Kubelet) canRunPod(pod *v1.Pod) lifecycle.PodAdmitResult { 2320 attrs := &lifecycle.PodAdmitAttributes{Pod: pod} 2321 // Get "OtherPods". Rejected pods are failed, so only include admitted pods that are alive. 2322 attrs.OtherPods = kl.GetActivePods() 2323 2324 for _, handler := range kl.softAdmitHandlers { 2325 if result := handler.Admit(attrs); !result.Admit { 2326 return result 2327 } 2328 } 2329 2330 return lifecycle.PodAdmitResult{Admit: true} 2331 } 2332 2333 // syncLoop is the main loop for processing changes. It watches for changes from 2334 // three channels (file, apiserver, and http) and creates a union of them. For 2335 // any new change seen, will run a sync against desired state and running state. If 2336 // no changes are seen to the configuration, will synchronize the last known desired 2337 // state every sync-frequency seconds. Never returns. 2338 func (kl *Kubelet) syncLoop(ctx context.Context, updates <-chan kubetypes.PodUpdate, handler SyncHandler) { 2339 klog.InfoS("Starting kubelet main sync loop") 2340 // The syncTicker wakes up kubelet to checks if there are any pod workers 2341 // that need to be sync'd. A one-second period is sufficient because the 2342 // sync interval is defaulted to 10s. 2343 syncTicker := time.NewTicker(time.Second) 2344 defer syncTicker.Stop() 2345 housekeepingTicker := time.NewTicker(housekeepingPeriod) 2346 defer housekeepingTicker.Stop() 2347 plegCh := kl.pleg.Watch() 2348 const ( 2349 base = 100 * time.Millisecond 2350 max = 5 * time.Second 2351 factor = 2 2352 ) 2353 duration := base 2354 // Responsible for checking limits in resolv.conf 2355 // The limits do not have anything to do with individual pods 2356 // Since this is called in syncLoop, we don't need to call it anywhere else 2357 if kl.dnsConfigurer != nil && kl.dnsConfigurer.ResolverConfig != "" { 2358 kl.dnsConfigurer.CheckLimitsForResolvConf() 2359 } 2360 2361 for { 2362 if err := kl.runtimeState.runtimeErrors(); err != nil { 2363 klog.ErrorS(err, "Skipping pod synchronization") 2364 // exponential backoff 2365 time.Sleep(duration) 2366 duration = time.Duration(math.Min(float64(max), factor*float64(duration))) 2367 continue 2368 } 2369 // reset backoff if we have a success 2370 duration = base 2371 2372 kl.syncLoopMonitor.Store(kl.clock.Now()) 2373 if !kl.syncLoopIteration(ctx, updates, handler, syncTicker.C, housekeepingTicker.C, plegCh) { 2374 break 2375 } 2376 kl.syncLoopMonitor.Store(kl.clock.Now()) 2377 } 2378 } 2379 2380 // syncLoopIteration reads from various channels and dispatches pods to the 2381 // given handler. 2382 // 2383 // Arguments: 2384 // 1. configCh: a channel to read config events from 2385 // 2. handler: the SyncHandler to dispatch pods to 2386 // 3. syncCh: a channel to read periodic sync events from 2387 // 4. housekeepingCh: a channel to read housekeeping events from 2388 // 5. plegCh: a channel to read PLEG updates from 2389 // 2390 // Events are also read from the kubelet liveness manager's update channel. 2391 // 2392 // The workflow is to read from one of the channels, handle that event, and 2393 // update the timestamp in the sync loop monitor. 2394 // 2395 // Here is an appropriate place to note that despite the syntactical 2396 // similarity to the switch statement, the case statements in a select are 2397 // evaluated in a pseudorandom order if there are multiple channels ready to 2398 // read from when the select is evaluated. In other words, case statements 2399 // are evaluated in random order, and you can not assume that the case 2400 // statements evaluate in order if multiple channels have events. 2401 // 2402 // With that in mind, in truly no particular order, the different channels 2403 // are handled as follows: 2404 // 2405 // - configCh: dispatch the pods for the config change to the appropriate 2406 // handler callback for the event type 2407 // - plegCh: update the runtime cache; sync pod 2408 // - syncCh: sync all pods waiting for sync 2409 // - housekeepingCh: trigger cleanup of pods 2410 // - health manager: sync pods that have failed or in which one or more 2411 // containers have failed health checks 2412 func (kl *Kubelet) syncLoopIteration(ctx context.Context, configCh <-chan kubetypes.PodUpdate, handler SyncHandler, 2413 syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool { 2414 select { 2415 case u, open := <-configCh: 2416 // Update from a config source; dispatch it to the right handler 2417 // callback. 2418 if !open { 2419 klog.ErrorS(nil, "Update channel is closed, exiting the sync loop") 2420 return false 2421 } 2422 2423 switch u.Op { 2424 case kubetypes.ADD: 2425 klog.V(2).InfoS("SyncLoop ADD", "source", u.Source, "pods", klog.KObjSlice(u.Pods)) 2426 // After restarting, kubelet will get all existing pods through 2427 // ADD as if they are new pods. These pods will then go through the 2428 // admission process and *may* be rejected. This can be resolved 2429 // once we have checkpointing. 2430 handler.HandlePodAdditions(u.Pods) 2431 case kubetypes.UPDATE: 2432 klog.V(2).InfoS("SyncLoop UPDATE", "source", u.Source, "pods", klog.KObjSlice(u.Pods)) 2433 handler.HandlePodUpdates(u.Pods) 2434 case kubetypes.REMOVE: 2435 klog.V(2).InfoS("SyncLoop REMOVE", "source", u.Source, "pods", klog.KObjSlice(u.Pods)) 2436 handler.HandlePodRemoves(u.Pods) 2437 case kubetypes.RECONCILE: 2438 klog.V(4).InfoS("SyncLoop RECONCILE", "source", u.Source, "pods", klog.KObjSlice(u.Pods)) 2439 handler.HandlePodReconcile(u.Pods) 2440 case kubetypes.DELETE: 2441 klog.V(2).InfoS("SyncLoop DELETE", "source", u.Source, "pods", klog.KObjSlice(u.Pods)) 2442 // DELETE is treated as a UPDATE because of graceful deletion. 2443 handler.HandlePodUpdates(u.Pods) 2444 case kubetypes.SET: 2445 // TODO: Do we want to support this? 2446 klog.ErrorS(nil, "Kubelet does not support snapshot update") 2447 default: 2448 klog.ErrorS(nil, "Invalid operation type received", "operation", u.Op) 2449 } 2450 2451 kl.sourcesReady.AddSource(u.Source) 2452 2453 case e := <-plegCh: 2454 if isSyncPodWorthy(e) { 2455 // PLEG event for a pod; sync it. 2456 if pod, ok := kl.podManager.GetPodByUID(e.ID); ok { 2457 klog.V(2).InfoS("SyncLoop (PLEG): event for pod", "pod", klog.KObj(pod), "event", e) 2458 handler.HandlePodSyncs([]*v1.Pod{pod}) 2459 } else { 2460 // If the pod no longer exists, ignore the event. 2461 klog.V(4).InfoS("SyncLoop (PLEG): pod does not exist, ignore irrelevant event", "event", e) 2462 } 2463 } 2464 2465 if e.Type == pleg.ContainerDied { 2466 if containerID, ok := e.Data.(string); ok { 2467 kl.cleanUpContainersInPod(e.ID, containerID) 2468 } 2469 } 2470 case <-syncCh: 2471 // Sync pods waiting for sync 2472 podsToSync := kl.getPodsToSync() 2473 if len(podsToSync) == 0 { 2474 break 2475 } 2476 klog.V(4).InfoS("SyncLoop (SYNC) pods", "total", len(podsToSync), "pods", klog.KObjSlice(podsToSync)) 2477 handler.HandlePodSyncs(podsToSync) 2478 case update := <-kl.livenessManager.Updates(): 2479 if update.Result == proberesults.Failure { 2480 handleProbeSync(kl, update, handler, "liveness", "unhealthy") 2481 } 2482 case update := <-kl.readinessManager.Updates(): 2483 ready := update.Result == proberesults.Success 2484 kl.statusManager.SetContainerReadiness(update.PodUID, update.ContainerID, ready) 2485 2486 status := "" 2487 if ready { 2488 status = "ready" 2489 } 2490 handleProbeSync(kl, update, handler, "readiness", status) 2491 case update := <-kl.startupManager.Updates(): 2492 started := update.Result == proberesults.Success 2493 kl.statusManager.SetContainerStartup(update.PodUID, update.ContainerID, started) 2494 2495 status := "unhealthy" 2496 if started { 2497 status = "started" 2498 } 2499 handleProbeSync(kl, update, handler, "startup", status) 2500 case <-housekeepingCh: 2501 if !kl.sourcesReady.AllReady() { 2502 // If the sources aren't ready or volume manager has not yet synced the states, 2503 // skip housekeeping, as we may accidentally delete pods from unready sources. 2504 klog.V(4).InfoS("SyncLoop (housekeeping, skipped): sources aren't ready yet") 2505 } else { 2506 start := time.Now() 2507 klog.V(4).InfoS("SyncLoop (housekeeping)") 2508 if err := handler.HandlePodCleanups(ctx); err != nil { 2509 klog.ErrorS(err, "Failed cleaning pods") 2510 } 2511 duration := time.Since(start) 2512 if duration > housekeepingWarningDuration { 2513 klog.ErrorS(fmt.Errorf("housekeeping took too long"), "Housekeeping took longer than expected", "expected", housekeepingWarningDuration, "actual", duration.Round(time.Millisecond)) 2514 } 2515 klog.V(4).InfoS("SyncLoop (housekeeping) end", "duration", duration.Round(time.Millisecond)) 2516 } 2517 } 2518 return true 2519 } 2520 2521 func handleProbeSync(kl *Kubelet, update proberesults.Update, handler SyncHandler, probe, status string) { 2522 // We should not use the pod from manager, because it is never updated after initialization. 2523 pod, ok := kl.podManager.GetPodByUID(update.PodUID) 2524 if !ok { 2525 // If the pod no longer exists, ignore the update. 2526 klog.V(4).InfoS("SyncLoop (probe): ignore irrelevant update", "probe", probe, "status", status, "update", update) 2527 return 2528 } 2529 klog.V(1).InfoS("SyncLoop (probe)", "probe", probe, "status", status, "pod", klog.KObj(pod)) 2530 handler.HandlePodSyncs([]*v1.Pod{pod}) 2531 } 2532 2533 // HandlePodAdditions is the callback in SyncHandler for pods being added from 2534 // a config source. 2535 func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) { 2536 start := kl.clock.Now() 2537 sort.Sort(sliceutils.PodsByCreationTime(pods)) 2538 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { 2539 kl.podResizeMutex.Lock() 2540 defer kl.podResizeMutex.Unlock() 2541 } 2542 for _, pod := range pods { 2543 existingPods := kl.podManager.GetPods() 2544 // Always add the pod to the pod manager. Kubelet relies on the pod 2545 // manager as the source of truth for the desired state. If a pod does 2546 // not exist in the pod manager, it means that it has been deleted in 2547 // the apiserver and no action (other than cleanup) is required. 2548 kl.podManager.AddPod(pod) 2549 2550 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 2551 if wasMirror { 2552 if pod == nil { 2553 klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID) 2554 continue 2555 } 2556 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2557 Pod: pod, 2558 MirrorPod: mirrorPod, 2559 UpdateType: kubetypes.SyncPodUpdate, 2560 StartTime: start, 2561 }) 2562 continue 2563 } 2564 2565 // Only go through the admission process if the pod is not requested 2566 // for termination by another part of the kubelet. If the pod is already 2567 // using resources (previously admitted), the pod worker is going to be 2568 // shutting it down. If the pod hasn't started yet, we know that when 2569 // the pod worker is invoked it will also avoid setting up the pod, so 2570 // we simply avoid doing any work. 2571 if !kl.podWorkers.IsPodTerminationRequested(pod.UID) { 2572 // We failed pods that we rejected, so activePods include all admitted 2573 // pods that are alive. 2574 activePods := kl.filterOutInactivePods(existingPods) 2575 2576 if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) { 2577 // To handle kubelet restarts, test pod admissibility using AllocatedResources values 2578 // (for cpu & memory) from checkpoint store. If found, that is the source of truth. 2579 podCopy := pod.DeepCopy() 2580 kl.updateContainerResourceAllocation(podCopy) 2581 2582 // Check if we can admit the pod; if not, reject it. 2583 if ok, reason, message := kl.canAdmitPod(activePods, podCopy); !ok { 2584 kl.rejectPod(pod, reason, message) 2585 continue 2586 } 2587 // For new pod, checkpoint the resource values at which the Pod has been admitted 2588 if err := kl.statusManager.SetPodAllocation(podCopy); err != nil { 2589 //TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate 2590 klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(pod)) 2591 } 2592 } else { 2593 // Check if we can admit the pod; if not, reject it. 2594 if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok { 2595 kl.rejectPod(pod, reason, message) 2596 continue 2597 } 2598 } 2599 } 2600 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2601 Pod: pod, 2602 MirrorPod: mirrorPod, 2603 UpdateType: kubetypes.SyncPodCreate, 2604 StartTime: start, 2605 }) 2606 } 2607 } 2608 2609 // updateContainerResourceAllocation updates AllocatedResources values 2610 // (for cpu & memory) from checkpoint store 2611 func (kl *Kubelet) updateContainerResourceAllocation(pod *v1.Pod) { 2612 for _, c := range pod.Spec.Containers { 2613 allocatedResources, found := kl.statusManager.GetContainerResourceAllocation(string(pod.UID), c.Name) 2614 if c.Resources.Requests != nil && found { 2615 if _, ok := allocatedResources[v1.ResourceCPU]; ok { 2616 c.Resources.Requests[v1.ResourceCPU] = allocatedResources[v1.ResourceCPU] 2617 } 2618 if _, ok := allocatedResources[v1.ResourceMemory]; ok { 2619 c.Resources.Requests[v1.ResourceMemory] = allocatedResources[v1.ResourceMemory] 2620 } 2621 } 2622 } 2623 } 2624 2625 // HandlePodUpdates is the callback in the SyncHandler interface for pods 2626 // being updated from a config source. 2627 func (kl *Kubelet) HandlePodUpdates(pods []*v1.Pod) { 2628 start := kl.clock.Now() 2629 for _, pod := range pods { 2630 kl.podManager.UpdatePod(pod) 2631 2632 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 2633 if wasMirror { 2634 if pod == nil { 2635 klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID) 2636 continue 2637 } 2638 } 2639 2640 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2641 Pod: pod, 2642 MirrorPod: mirrorPod, 2643 UpdateType: kubetypes.SyncPodUpdate, 2644 StartTime: start, 2645 }) 2646 } 2647 } 2648 2649 // HandlePodRemoves is the callback in the SyncHandler interface for pods 2650 // being removed from a config source. 2651 func (kl *Kubelet) HandlePodRemoves(pods []*v1.Pod) { 2652 start := kl.clock.Now() 2653 for _, pod := range pods { 2654 kl.podManager.RemovePod(pod) 2655 2656 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 2657 if wasMirror { 2658 if pod == nil { 2659 klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID) 2660 continue 2661 } 2662 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2663 Pod: pod, 2664 MirrorPod: mirrorPod, 2665 UpdateType: kubetypes.SyncPodUpdate, 2666 StartTime: start, 2667 }) 2668 continue 2669 } 2670 2671 // Deletion is allowed to fail because the periodic cleanup routine 2672 // will trigger deletion again. 2673 if err := kl.deletePod(pod); err != nil { 2674 klog.V(2).InfoS("Failed to delete pod", "pod", klog.KObj(pod), "err", err) 2675 } 2676 } 2677 } 2678 2679 // HandlePodReconcile is the callback in the SyncHandler interface for pods 2680 // that should be reconciled. Pods are reconciled when only the status of the 2681 // pod is updated in the API. 2682 func (kl *Kubelet) HandlePodReconcile(pods []*v1.Pod) { 2683 start := kl.clock.Now() 2684 for _, pod := range pods { 2685 // Update the pod in pod manager, status manager will do periodically reconcile according 2686 // to the pod manager. 2687 kl.podManager.UpdatePod(pod) 2688 2689 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 2690 if wasMirror { 2691 if pod == nil { 2692 klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID) 2693 continue 2694 } 2695 // Static pods should be reconciled the same way as regular pods 2696 } 2697 2698 // TODO: reconcile being calculated in the config manager is questionable, and avoiding 2699 // extra syncs may no longer be necessary. Reevaluate whether Reconcile and Sync can be 2700 // merged (after resolving the next two TODOs). 2701 2702 // Reconcile Pod "Ready" condition if necessary. Trigger sync pod for reconciliation. 2703 // TODO: this should be unnecessary today - determine what is the cause for this to 2704 // be different than Sync, or if there is a better place for it. For instance, we have 2705 // needsReconcile in kubelet/config, here, and in status_manager. 2706 if status.NeedToReconcilePodReadiness(pod) { 2707 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2708 Pod: pod, 2709 MirrorPod: mirrorPod, 2710 UpdateType: kubetypes.SyncPodSync, 2711 StartTime: start, 2712 }) 2713 } 2714 2715 // After an evicted pod is synced, all dead containers in the pod can be removed. 2716 // TODO: this is questionable - status read is async and during eviction we already 2717 // expect to not have some container info. The pod worker knows whether a pod has 2718 // been evicted, so if this is about minimizing the time to react to an eviction we 2719 // can do better. If it's about preserving pod status info we can also do better. 2720 if eviction.PodIsEvicted(pod.Status) { 2721 if podStatus, err := kl.podCache.Get(pod.UID); err == nil { 2722 kl.containerDeletor.deleteContainersInPod("", podStatus, true) 2723 } 2724 } 2725 } 2726 } 2727 2728 // HandlePodSyncs is the callback in the syncHandler interface for pods 2729 // that should be dispatched to pod workers for sync. 2730 func (kl *Kubelet) HandlePodSyncs(pods []*v1.Pod) { 2731 start := kl.clock.Now() 2732 for _, pod := range pods { 2733 pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod) 2734 if wasMirror { 2735 if pod == nil { 2736 klog.V(2).InfoS("Unable to find pod for mirror pod, skipping", "mirrorPod", klog.KObj(mirrorPod), "mirrorPodUID", mirrorPod.UID) 2737 continue 2738 } 2739 // Syncing a mirror pod is a programmer error since the intent of sync is to 2740 // batch notify all pending work. We should make it impossible to double sync, 2741 // but for now log a programmer error to prevent accidental introduction. 2742 klog.V(3).InfoS("Programmer error, HandlePodSyncs does not expect to receive mirror pods", "podUID", pod.UID, "mirrorPodUID", mirrorPod.UID) 2743 continue 2744 } 2745 kl.podWorkers.UpdatePod(UpdatePodOptions{ 2746 Pod: pod, 2747 MirrorPod: mirrorPod, 2748 UpdateType: kubetypes.SyncPodSync, 2749 StartTime: start, 2750 }) 2751 } 2752 } 2753 2754 func isPodResizeInProgress(pod *v1.Pod, podStatus *v1.PodStatus) bool { 2755 for _, c := range pod.Spec.Containers { 2756 if cs, ok := podutil.GetContainerStatus(podStatus.ContainerStatuses, c.Name); ok { 2757 if cs.Resources == nil { 2758 continue 2759 } 2760 if !cmp.Equal(c.Resources.Limits, cs.Resources.Limits) || !cmp.Equal(cs.AllocatedResources, cs.Resources.Requests) { 2761 return true 2762 } 2763 } 2764 } 2765 return false 2766 } 2767 2768 func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, *v1.Pod, v1.PodResizeStatus) { 2769 var otherActivePods []*v1.Pod 2770 2771 node, err := kl.getNodeAnyWay() 2772 if err != nil { 2773 klog.ErrorS(err, "getNodeAnyway function failed") 2774 return false, nil, "" 2775 } 2776 podCopy := pod.DeepCopy() 2777 cpuAvailable := node.Status.Allocatable.Cpu().MilliValue() 2778 memAvailable := node.Status.Allocatable.Memory().Value() 2779 cpuRequests := resource.GetResourceRequest(podCopy, v1.ResourceCPU) 2780 memRequests := resource.GetResourceRequest(podCopy, v1.ResourceMemory) 2781 if cpuRequests > cpuAvailable || memRequests > memAvailable { 2782 klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", podCopy.Name) 2783 return false, podCopy, v1.PodResizeStatusInfeasible 2784 } 2785 2786 // Treat the existing pod needing resize as a new pod with desired resources seeking admit. 2787 // If desired resources don't fit, pod continues to run with currently allocated resources. 2788 activePods := kl.GetActivePods() 2789 for _, p := range activePods { 2790 if p.UID != pod.UID { 2791 otherActivePods = append(otherActivePods, p) 2792 } 2793 } 2794 2795 if ok, failReason, failMessage := kl.canAdmitPod(otherActivePods, podCopy); !ok { 2796 // Log reason and return. Let the next sync iteration retry the resize 2797 klog.V(3).InfoS("Resize cannot be accommodated", "pod", podCopy.Name, "reason", failReason, "message", failMessage) 2798 return false, podCopy, v1.PodResizeStatusDeferred 2799 } 2800 2801 for _, container := range podCopy.Spec.Containers { 2802 idx, found := podutil.GetIndexOfContainerStatus(podCopy.Status.ContainerStatuses, container.Name) 2803 if found { 2804 for rName, rQuantity := range container.Resources.Requests { 2805 podCopy.Status.ContainerStatuses[idx].AllocatedResources[rName] = rQuantity 2806 } 2807 } 2808 } 2809 return true, podCopy, v1.PodResizeStatusInProgress 2810 } 2811 2812 func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod) *v1.Pod { 2813 if pod.Status.Phase != v1.PodRunning { 2814 return pod 2815 } 2816 podResized := false 2817 for _, container := range pod.Spec.Containers { 2818 if len(container.Resources.Requests) == 0 { 2819 continue 2820 } 2821 containerStatus, found := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name) 2822 if !found { 2823 klog.V(5).InfoS("ContainerStatus not found", "pod", pod.Name, "container", container.Name) 2824 break 2825 } 2826 if len(containerStatus.AllocatedResources) != len(container.Resources.Requests) { 2827 klog.V(5).InfoS("ContainerStatus.AllocatedResources length mismatch", "pod", pod.Name, "container", container.Name) 2828 break 2829 } 2830 if !cmp.Equal(container.Resources.Requests, containerStatus.AllocatedResources) { 2831 podResized = true 2832 break 2833 } 2834 } 2835 if !podResized { 2836 return pod 2837 } 2838 2839 kl.podResizeMutex.Lock() 2840 defer kl.podResizeMutex.Unlock() 2841 fit, updatedPod, resizeStatus := kl.canResizePod(pod) 2842 if updatedPod == nil { 2843 return pod 2844 } 2845 if fit { 2846 // Update pod resource allocation checkpoint 2847 if err := kl.statusManager.SetPodAllocation(updatedPod); err != nil { 2848 //TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate 2849 klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(updatedPod)) 2850 return pod 2851 } 2852 } 2853 if resizeStatus != "" { 2854 // Save resize decision to checkpoint 2855 if err := kl.statusManager.SetPodResizeStatus(updatedPod.UID, resizeStatus); err != nil { 2856 //TODO(vinaykul,InPlacePodVerticalScaling): Can we recover from this in some way? Investigate 2857 klog.ErrorS(err, "SetPodResizeStatus failed", "pod", klog.KObj(updatedPod)) 2858 return pod 2859 } 2860 updatedPod.Status.Resize = resizeStatus 2861 } 2862 kl.podManager.UpdatePod(updatedPod) 2863 kl.statusManager.SetPodStatus(updatedPod, updatedPod.Status) 2864 return updatedPod 2865 } 2866 2867 // LatestLoopEntryTime returns the last time in the sync loop monitor. 2868 func (kl *Kubelet) LatestLoopEntryTime() time.Time { 2869 val := kl.syncLoopMonitor.Load() 2870 if val == nil { 2871 return time.Time{} 2872 } 2873 return val.(time.Time) 2874 } 2875 2876 // updateRuntimeUp calls the container runtime status callback, initializing 2877 // the runtime dependent modules when the container runtime first comes up, 2878 // and returns an error if the status check fails. If the status check is OK, 2879 // update the container runtime uptime in the kubelet runtimeState. 2880 func (kl *Kubelet) updateRuntimeUp() { 2881 kl.updateRuntimeMux.Lock() 2882 defer kl.updateRuntimeMux.Unlock() 2883 ctx := context.Background() 2884 2885 s, err := kl.containerRuntime.Status(ctx) 2886 if err != nil { 2887 klog.ErrorS(err, "Container runtime sanity check failed") 2888 return 2889 } 2890 if s == nil { 2891 klog.ErrorS(nil, "Container runtime status is nil") 2892 return 2893 } 2894 // Periodically log the whole runtime status for debugging. 2895 klog.V(4).InfoS("Container runtime status", "status", s) 2896 klogErrorS := klog.ErrorS 2897 if !kl.containerRuntimeReadyExpected { 2898 klogErrorS = klog.V(4).ErrorS 2899 } 2900 networkReady := s.GetRuntimeCondition(kubecontainer.NetworkReady) 2901 if networkReady == nil || !networkReady.Status { 2902 klogErrorS(nil, "Container runtime network not ready", "networkReady", networkReady) 2903 kl.runtimeState.setNetworkState(fmt.Errorf("container runtime network not ready: %v", networkReady)) 2904 } else { 2905 // Set nil if the container runtime network is ready. 2906 kl.runtimeState.setNetworkState(nil) 2907 } 2908 // information in RuntimeReady condition will be propagated to NodeReady condition. 2909 runtimeReady := s.GetRuntimeCondition(kubecontainer.RuntimeReady) 2910 // If RuntimeReady is not set or is false, report an error. 2911 if runtimeReady == nil || !runtimeReady.Status { 2912 klogErrorS(nil, "Container runtime not ready", "runtimeReady", runtimeReady) 2913 kl.runtimeState.setRuntimeState(fmt.Errorf("container runtime not ready: %v", runtimeReady)) 2914 return 2915 } 2916 2917 kl.runtimeState.setRuntimeState(nil) 2918 kl.runtimeState.setRuntimeHandlers(s.Handlers) 2919 kl.oneTimeInitializer.Do(kl.initializeRuntimeDependentModules) 2920 kl.runtimeState.setRuntimeSync(kl.clock.Now()) 2921 } 2922 2923 // GetConfiguration returns the KubeletConfiguration used to configure the kubelet. 2924 func (kl *Kubelet) GetConfiguration() kubeletconfiginternal.KubeletConfiguration { 2925 return kl.kubeletConfiguration 2926 } 2927 2928 // BirthCry sends an event that the kubelet has started up. 2929 func (kl *Kubelet) BirthCry() { 2930 // Make an event that kubelet restarted. 2931 kl.recorder.Eventf(kl.nodeRef, v1.EventTypeNormal, events.StartingKubelet, "Starting kubelet.") 2932 } 2933 2934 // ResyncInterval returns the interval used for periodic syncs. 2935 func (kl *Kubelet) ResyncInterval() time.Duration { 2936 return kl.resyncInterval 2937 } 2938 2939 // ListenAndServe runs the kubelet HTTP server. 2940 func (kl *Kubelet) ListenAndServe(kubeCfg *kubeletconfiginternal.KubeletConfiguration, tlsOptions *server.TLSOptions, 2941 auth server.AuthInterface, tp trace.TracerProvider) { 2942 server.ListenAndServeKubeletServer(kl, kl.resourceAnalyzer, kubeCfg, tlsOptions, auth, tp) 2943 } 2944 2945 // ListenAndServeReadOnly runs the kubelet HTTP server in read-only mode. 2946 func (kl *Kubelet) ListenAndServeReadOnly(address net.IP, port uint, tp trace.TracerProvider) { 2947 server.ListenAndServeKubeletReadOnlyServer(kl, kl.resourceAnalyzer, address, port, tp) 2948 } 2949 2950 // ListenAndServePodResources runs the kubelet podresources grpc service 2951 func (kl *Kubelet) ListenAndServePodResources() { 2952 endpoint, err := util.LocalEndpoint(kl.getPodResourcesDir(), podresources.Socket) 2953 if err != nil { 2954 klog.V(2).InfoS("Failed to get local endpoint for PodResources endpoint", "err", err) 2955 return 2956 } 2957 2958 providers := podresources.PodResourcesProviders{ 2959 Pods: kl.podManager, 2960 Devices: kl.containerManager, 2961 Cpus: kl.containerManager, 2962 Memory: kl.containerManager, 2963 DynamicResources: kl.containerManager, 2964 } 2965 2966 server.ListenAndServePodResources(endpoint, providers) 2967 } 2968 2969 // Delete the eligible dead container instances in a pod. Depending on the configuration, the latest dead containers may be kept around. 2970 func (kl *Kubelet) cleanUpContainersInPod(podID types.UID, exitedContainerID string) { 2971 if podStatus, err := kl.podCache.Get(podID); err == nil { 2972 // When an evicted or deleted pod has already synced, all containers can be removed. 2973 removeAll := kl.podWorkers.ShouldPodContentBeRemoved(podID) 2974 kl.containerDeletor.deleteContainersInPod(exitedContainerID, podStatus, removeAll) 2975 } 2976 } 2977 2978 // fastStatusUpdateOnce starts a loop that checks if the current state of kubelet + container runtime 2979 // would be able to turn the node ready, and sync the ready state to the apiserver as soon as possible. 2980 // Function returns after the node status update after such event, or when the node is already ready. 2981 // Function is executed only during Kubelet start which improves latency to ready node by updating 2982 // kubelet state, runtime status and node statuses ASAP. 2983 func (kl *Kubelet) fastStatusUpdateOnce() { 2984 ctx := context.Background() 2985 start := kl.clock.Now() 2986 stopCh := make(chan struct{}) 2987 2988 // Keep trying to make fast node status update until either timeout is reached or an update is successful. 2989 wait.Until(func() { 2990 // fastNodeStatusUpdate returns true when it succeeds or when the grace period has expired 2991 // (status was not updated within nodeReadyGracePeriod and the second argument below gets true), 2992 // then we close the channel and abort the loop. 2993 if kl.fastNodeStatusUpdate(ctx, kl.clock.Since(start) >= nodeReadyGracePeriod) { 2994 close(stopCh) 2995 } 2996 }, 100*time.Millisecond, stopCh) 2997 } 2998 2999 // CheckpointContainer tries to checkpoint a container. The parameters are used to 3000 // look up the specified container. If the container specified by the given parameters 3001 // cannot be found an error is returned. If the container is found the container 3002 // engine will be asked to checkpoint the given container into the kubelet's default 3003 // checkpoint directory. 3004 func (kl *Kubelet) CheckpointContainer( 3005 ctx context.Context, 3006 podUID types.UID, 3007 podFullName, 3008 containerName string, 3009 options *runtimeapi.CheckpointContainerRequest, 3010 ) error { 3011 container, err := kl.findContainer(ctx, podFullName, podUID, containerName) 3012 if err != nil { 3013 return err 3014 } 3015 if container == nil { 3016 return fmt.Errorf("container %v not found", containerName) 3017 } 3018 3019 options.Location = filepath.Join( 3020 kl.getCheckpointsDir(), 3021 fmt.Sprintf( 3022 "checkpoint-%s-%s-%s.tar", 3023 podFullName, 3024 containerName, 3025 time.Now().Format(time.RFC3339), 3026 ), 3027 ) 3028 3029 options.ContainerId = string(container.ID.ID) 3030 3031 if err := kl.containerRuntime.CheckpointContainer(ctx, options); err != nil { 3032 return err 3033 } 3034 3035 return nil 3036 } 3037 3038 // ListMetricDescriptors gets the descriptors for the metrics that will be returned in ListPodSandboxMetrics. 3039 func (kl *Kubelet) ListMetricDescriptors(ctx context.Context) ([]*runtimeapi.MetricDescriptor, error) { 3040 return kl.containerRuntime.ListMetricDescriptors(ctx) 3041 } 3042 3043 // ListPodSandboxMetrics retrieves the metrics for all pod sandboxes. 3044 func (kl *Kubelet) ListPodSandboxMetrics(ctx context.Context) ([]*runtimeapi.PodSandboxMetrics, error) { 3045 return kl.containerRuntime.ListPodSandboxMetrics(ctx) 3046 } 3047 3048 func (kl *Kubelet) supportLocalStorageCapacityIsolation() bool { 3049 return kl.GetConfiguration().LocalStorageCapacityIsolation 3050 } 3051 3052 // isSyncPodWorthy filters out events that are not worthy of pod syncing 3053 func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool { 3054 // ContainerRemoved doesn't affect pod state 3055 return event.Type != pleg.ContainerRemoved 3056 } 3057 3058 // PrepareDynamicResources calls the container Manager PrepareDynamicResources API 3059 // This method implements the RuntimeHelper interface 3060 func (kl *Kubelet) PrepareDynamicResources(pod *v1.Pod) error { 3061 return kl.containerManager.PrepareDynamicResources(pod) 3062 } 3063 3064 // UnprepareDynamicResources calls the container Manager UnprepareDynamicResources API 3065 // This method implements the RuntimeHelper interface 3066 func (kl *Kubelet) UnprepareDynamicResources(pod *v1.Pod) error { 3067 return kl.containerManager.UnprepareDynamicResources(pod) 3068 }