k8s.io/kubernetes@v1.29.3/pkg/kubelet/pleg/generic.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package pleg

import (
	"context"
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/features"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/utils/clock"
)

// GenericPLEG is an extremely simple generic PLEG that relies solely on
// periodic listing to discover container changes. It should be used as a
// temporary replacement for container runtimes that do not support a proper
// event generator yet.
//
// Note that GenericPLEG assumes that a container would not be created,
// terminated, and garbage collected within one relist period. If such an
// incident happens, GenericPLEG would miss all events regarding this
// container. In the case of relisting failure, the window may become longer.
// Note that this assumption is not unique -- many kubelet internal components
// rely on terminated containers as tombstones for bookkeeping purposes. The
// garbage collector is implemented to work with such situations. However, to
// guarantee that kubelet can handle missing container events, it is
// recommended to set the relist period short and have an auxiliary, longer
// periodic sync in kubelet as the safety net.
type GenericPLEG struct {
	// The container runtime.
	runtime kubecontainer.Runtime
	// The channel from which the subscriber listens for events.
	eventChannel chan *PodLifecycleEvent
	// The internal cache for pod/container information.
	podRecords podRecords
	// Time of the last relisting.
	relistTime atomic.Value
	// Cache for storing the runtime states required for syncing pods.
	cache kubecontainer.Cache
	// For testability.
	clock clock.Clock
	// Pods that failed to have their status retrieved during a relist. These pods will be
	// retried during the next relisting.
	podsToReinspect map[types.UID]*kubecontainer.Pod
	// Stop the Generic PLEG by closing the channel.
	stopCh chan struct{}
	// Locks the relisting of the Generic PLEG.
	relistLock sync.Mutex
	// Indicates whether the Generic PLEG is running or not.
	isRunning bool
	// Locks the start/stop operation of the Generic PLEG.
	runningMu sync.Mutex
	// Relisting-related parameters (period and threshold).
	relistDuration *RelistDuration
	// Mutex to serialize updateCache calls made by relist vs the UpdateCache interface.
	podCacheMutex sync.Mutex
}
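
// exampleConsumeGenericPLEG is an illustrative sketch, not part of the upstream
// flow: it shows how a subscriber might construct, start, and drain a GenericPLEG.
// The runtime and cache are assumed to be supplied by the caller, and the
// RelistDuration values are placeholder assumptions rather than recommended settings.
func exampleConsumeGenericPLEG(runtime kubecontainer.Runtime, cache kubecontainer.Cache) {
	// A buffered channel so that a slow subscriber does not immediately force
	// Relist to discard events.
	events := make(chan *PodLifecycleEvent, 100)
	g := NewGenericPLEG(runtime, events, &RelistDuration{
		RelistPeriod:    time.Second,
		RelistThreshold: 3 * time.Minute,
	}, cache, clock.RealClock{})
	g.Start()
	// A subscriber typically reacts to each event by triggering a pod sync.
	for e := range g.Watch() {
		klog.V(4).InfoS("Observed pod lifecycle event", "podUID", e.ID, "type", e.Type)
	}
}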
// plegContainerState has a one-to-one mapping to the
// kubecontainer.State except for the non-existent state. This state
// is introduced here to complete the state transition scenarios.
type plegContainerState string

const (
	plegContainerRunning     plegContainerState = "running"
	plegContainerExited      plegContainerState = "exited"
	plegContainerUnknown     plegContainerState = "unknown"
	plegContainerNonExistent plegContainerState = "non-existent"
)

func convertState(state kubecontainer.State) plegContainerState {
	switch state {
	case kubecontainer.ContainerStateCreated:
		// kubelet doesn't use the "created" state yet, hence convert it to "unknown".
		return plegContainerUnknown
	case kubecontainer.ContainerStateRunning:
		return plegContainerRunning
	case kubecontainer.ContainerStateExited:
		return plegContainerExited
	case kubecontainer.ContainerStateUnknown:
		return plegContainerUnknown
	default:
		panic(fmt.Sprintf("unrecognized container state: %v", state))
	}
}

type podRecord struct {
	old     *kubecontainer.Pod
	current *kubecontainer.Pod
}

type podRecords map[types.UID]*podRecord

// NewGenericPLEG instantiates a new GenericPLEG object and returns it.
func NewGenericPLEG(runtime kubecontainer.Runtime, eventChannel chan *PodLifecycleEvent,
	relistDuration *RelistDuration, cache kubecontainer.Cache,
	clock clock.Clock) PodLifecycleEventGenerator {
	return &GenericPLEG{
		relistDuration: relistDuration,
		runtime:        runtime,
		eventChannel:   eventChannel,
		podRecords:     make(podRecords),
		cache:          cache,
		clock:          clock,
	}
}

// Watch returns a channel from which the subscriber can receive PodLifecycleEvent
// events.
// TODO: support multiple subscribers.
func (g *GenericPLEG) Watch() chan *PodLifecycleEvent {
	return g.eventChannel
}

// Start spawns a goroutine to relist periodically.
func (g *GenericPLEG) Start() {
	g.runningMu.Lock()
	defer g.runningMu.Unlock()
	if !g.isRunning {
		g.isRunning = true
		g.stopCh = make(chan struct{})
		go wait.Until(g.Relist, g.relistDuration.RelistPeriod, g.stopCh)
	}
}

// Stop stops the periodic relisting by closing the stop channel.
func (g *GenericPLEG) Stop() {
	g.runningMu.Lock()
	defer g.runningMu.Unlock()
	if g.isRunning {
		close(g.stopCh)
		g.isRunning = false
	}
}

// Update replaces the relisting parameters.
func (g *GenericPLEG) Update(relistDuration *RelistDuration) {
	g.relistDuration = relistDuration
}
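
// exampleConvertStateMapping is an illustrative sketch, not part of the PLEG API:
// it restates the mapping performed by convertState as a small self-check, so the
// expected conversions are visible at a glance.
func exampleConvertStateMapping() {
	expected := map[kubecontainer.State]plegContainerState{
		kubecontainer.ContainerStateCreated: plegContainerUnknown,
		kubecontainer.ContainerStateRunning: plegContainerRunning,
		kubecontainer.ContainerStateExited:  plegContainerExited,
		kubecontainer.ContainerStateUnknown: plegContainerUnknown,
	}
	for in, want := range expected {
		if got := convertState(in); got != want {
			panic(fmt.Sprintf("convertState(%v) = %v, want %v", in, got, want))
		}
	}
}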
// Healthy checks whether the PLEG is working properly.
// relistThreshold is the maximum interval allowed between two relists.
func (g *GenericPLEG) Healthy() (bool, error) {
	relistTime := g.getRelistTime()
	if relistTime.IsZero() {
		return false, fmt.Errorf("pleg has yet to be successful")
	}
	// Expose as metric so you can alert on `time()-pleg_last_seen_seconds > nn`
	metrics.PLEGLastSeen.Set(float64(relistTime.Unix()))
	elapsed := g.clock.Since(relistTime)
	if elapsed > g.relistDuration.RelistThreshold {
		return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, g.relistDuration.RelistThreshold)
	}
	return true, nil
}

func generateEvents(podID types.UID, cid string, oldState, newState plegContainerState) []*PodLifecycleEvent {
	if newState == oldState {
		return nil
	}

	klog.V(4).InfoS("GenericPLEG", "podUID", podID, "containerID", cid, "oldState", oldState, "newState", newState)
	switch newState {
	case plegContainerRunning:
		return []*PodLifecycleEvent{{ID: podID, Type: ContainerStarted, Data: cid}}
	case plegContainerExited:
		return []*PodLifecycleEvent{{ID: podID, Type: ContainerDied, Data: cid}}
	case plegContainerUnknown:
		return []*PodLifecycleEvent{{ID: podID, Type: ContainerChanged, Data: cid}}
	case plegContainerNonExistent:
		switch oldState {
		case plegContainerExited:
			// We already reported that the container died before.
			return []*PodLifecycleEvent{{ID: podID, Type: ContainerRemoved, Data: cid}}
		default:
			return []*PodLifecycleEvent{{ID: podID, Type: ContainerDied, Data: cid}, {ID: podID, Type: ContainerRemoved, Data: cid}}
		}
	default:
		panic(fmt.Sprintf("unrecognized container state: %v", newState))
	}
}

func (g *GenericPLEG) getRelistTime() time.Time {
	val := g.relistTime.Load()
	if val == nil {
		return time.Time{}
	}
	return val.(time.Time)
}

func (g *GenericPLEG) updateRelistTime(timestamp time.Time) {
	g.relistTime.Store(timestamp)
}
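
// exampleGenerateEventsTransitions is an illustrative sketch: it shows the events
// generateEvents emits for a few representative state transitions. The pod UID and
// container ID below are made-up values used only for illustration.
func exampleGenerateEventsTransitions() {
	podID := types.UID("example-pod-uid")
	cid := "example-container-id"
	// running -> exited yields a single ContainerDied event.
	died := generateEvents(podID, cid, plegContainerRunning, plegContainerExited)
	// exited -> non-existent yields only ContainerRemoved, because ContainerDied
	// was already reported when the container exited.
	removed := generateEvents(podID, cid, plegContainerExited, plegContainerNonExistent)
	// running -> non-existent yields both ContainerDied and ContainerRemoved.
	diedAndRemoved := generateEvents(podID, cid, plegContainerRunning, plegContainerNonExistent)
	klog.V(4).InfoS("GenericPLEG example transitions", "died", len(died), "removed", len(removed), "diedAndRemoved", len(diedAndRemoved))
}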
// Relist queries the container runtime for a list of pods/containers, compares
// it with the internal pods/containers, and generates events accordingly.
func (g *GenericPLEG) Relist() {
	g.relistLock.Lock()
	defer g.relistLock.Unlock()

	ctx := context.Background()
	klog.V(5).InfoS("GenericPLEG: Relisting")

	if lastRelistTime := g.getRelistTime(); !lastRelistTime.IsZero() {
		metrics.PLEGRelistInterval.Observe(metrics.SinceInSeconds(lastRelistTime))
	}

	timestamp := g.clock.Now()
	defer func() {
		metrics.PLEGRelistDuration.Observe(metrics.SinceInSeconds(timestamp))
	}()

	// Get all the pods.
	podList, err := g.runtime.GetPods(ctx, true)
	if err != nil {
		klog.ErrorS(err, "GenericPLEG: Unable to retrieve pods")
		return
	}

	g.updateRelistTime(timestamp)

	pods := kubecontainer.Pods(podList)
	// Update the running pod and container counts.
	updateRunningPodAndContainerMetrics(pods)
	g.podRecords.setCurrent(pods)

	// Compare the old and the current pods, and generate events.
	eventsByPodID := map[types.UID][]*PodLifecycleEvent{}
	for pid := range g.podRecords {
		oldPod := g.podRecords.getOld(pid)
		pod := g.podRecords.getCurrent(pid)
		// Get all containers in the old and the new pod.
		allContainers := getContainersFromPods(oldPod, pod)
		for _, container := range allContainers {
			events := computeEvents(oldPod, pod, &container.ID)
			for _, e := range events {
				updateEvents(eventsByPodID, e)
			}
		}
	}

	var needsReinspection map[types.UID]*kubecontainer.Pod
	if g.cacheEnabled() {
		needsReinspection = make(map[types.UID]*kubecontainer.Pod)
	}

	// If there are events associated with a pod, we should update the
	// podCache.
	for pid, events := range eventsByPodID {
		pod := g.podRecords.getCurrent(pid)
		if g.cacheEnabled() {
			// updateCache() will inspect the pod and update the cache. If an
			// error occurs during the inspection, we want PLEG to retry again
			// in the next relist. To achieve this, we do not update the
			// associated podRecord of the pod, so that the change will be
			// detected again in the next relist.
			// TODO: If many pods changed during the same relist period,
			// inspecting the pod and getting the PodStatus to update the cache
			// serially may take a while. We should be aware of this and
			// parallelize if needed.
			if err, updated := g.updateCache(ctx, pod, pid); err != nil {
				// Rely on updateCache calling GetPodStatus to log the actual error.
				klog.V(4).ErrorS(err, "PLEG: Ignoring events for pod", "pod", klog.KRef(pod.Namespace, pod.Name))

				// Make sure we try to reinspect the pod during the next relisting.
				needsReinspection[pid] = pod

				continue
			} else {
				// This pod was in the list to reinspect and we did so because it had events, so remove it
				// from the list (we don't want the reinspection code below to inspect it a second time in
				// this relist execution).
				delete(g.podsToReinspect, pid)
				if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
					if !updated {
						continue
					}
				}
			}
		}
		// Update the internal storage and send out the events.
		g.podRecords.update(pid)

		// Map from container ID to exit code; used as a temporary cache for lookups.
		containerExitCode := make(map[string]int)

		for i := range events {
			// Filter out events that are not reliable and that no other components use yet.
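			// ContainerChanged comes from the "unknown" state transition in
			// generateEvents; dropping it here keeps these noisy, currently unused
			// notifications off the event channel.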
			if events[i].Type == ContainerChanged {
				continue
			}
			select {
			case g.eventChannel <- events[i]:
			default:
				metrics.PLEGDiscardEvents.Inc()
				klog.ErrorS(nil, "Event channel is full, discard this relist() cycle event")
			}
			// Log the exit code of containers when they finish.
			if events[i].Type == ContainerDied {
				// Populate the containerExitCode map the first time a ContainerDied event is seen for this pod.
				if len(containerExitCode) == 0 && pod != nil && g.cache != nil {
					// Get the updated podStatus.
					status, err := g.cache.Get(pod.ID)
					if err == nil {
						for _, containerStatus := range status.ContainerStatuses {
							containerExitCode[containerStatus.ID.ID] = containerStatus.ExitCode
						}
					}
				}
				if containerID, ok := events[i].Data.(string); ok {
					if exitCode, ok := containerExitCode[containerID]; ok && pod != nil {
						klog.V(2).InfoS("Generic (PLEG): container finished", "podID", pod.ID, "containerID", containerID, "exitCode", exitCode)
					}
				}
			}
		}
	}

	if g.cacheEnabled() {
		// Reinspect any pods that failed inspection during the previous relist.
		if len(g.podsToReinspect) > 0 {
			klog.V(5).InfoS("GenericPLEG: Reinspecting pods that previously failed inspection")
			for pid, pod := range g.podsToReinspect {
				if err, _ := g.updateCache(ctx, pod, pid); err != nil {
					// Rely on updateCache calling GetPodStatus to log the actual error.
					klog.V(5).ErrorS(err, "PLEG: pod failed reinspection", "pod", klog.KRef(pod.Namespace, pod.Name))
					needsReinspection[pid] = pod
				}
			}
		}

		// Update the cache timestamp. This needs to happen *after*
		// all pods have been properly updated in the cache.
		g.cache.UpdateTime(timestamp)
	}

	// Make sure we retain the list of pods that need reinspecting the next time relist is called.
	g.podsToReinspect = needsReinspection
}
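
// A successful Relist records its start timestamp, which Healthy later compares
// against RelistThreshold. exampleHealthCheck is an illustrative sketch of how a
// health checker might consume that; the kubelet's real wiring lives outside this file.
func exampleHealthCheck(g *GenericPLEG) error {
	healthy, err := g.Healthy()
	if !healthy {
		// err reports how long ago the PLEG was last seen active and the
		// configured threshold.
		return fmt.Errorf("PLEG is not healthy: %w", err)
	}
	return nil
}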
func getContainersFromPods(pods ...*kubecontainer.Pod) []*kubecontainer.Container {
	cidSet := sets.NewString()
	var containers []*kubecontainer.Container
	fillCidSet := func(cs []*kubecontainer.Container) {
		for _, c := range cs {
			cid := c.ID.ID
			if cidSet.Has(cid) {
				continue
			}
			cidSet.Insert(cid)
			containers = append(containers, c)
		}
	}

	for _, p := range pods {
		if p == nil {
			continue
		}
		fillCidSet(p.Containers)
		// Treat sandboxes as containers.
		// TODO: keep track of sandboxes explicitly.
		fillCidSet(p.Sandboxes)
	}
	return containers
}

func computeEvents(oldPod, newPod *kubecontainer.Pod, cid *kubecontainer.ContainerID) []*PodLifecycleEvent {
	var pid types.UID
	if oldPod != nil {
		pid = oldPod.ID
	} else if newPod != nil {
		pid = newPod.ID
	}
	oldState := getContainerState(oldPod, cid)
	newState := getContainerState(newPod, cid)
	return generateEvents(pid, cid.ID, oldState, newState)
}

func (g *GenericPLEG) cacheEnabled() bool {
	return g.cache != nil
}

// getPodIPs preserves an older cached status' pod IPs if the new status has no pod IPs
// and its sandboxes have exited.
func (g *GenericPLEG) getPodIPs(pid types.UID, status *kubecontainer.PodStatus) []string {
	if len(status.IPs) != 0 {
		return status.IPs
	}

	oldStatus, err := g.cache.Get(pid)
	if err != nil || len(oldStatus.IPs) == 0 {
		return nil
	}

	for _, sandboxStatus := range status.SandboxStatuses {
		// If at least one sandbox is ready, then use this status update's pod IPs.
		if sandboxStatus.State == runtimeapi.PodSandboxState_SANDBOX_READY {
			return status.IPs
		}
	}

	// For pods with no ready containers or sandboxes (like exited pods),
	// use the old status' pod IPs.
	return oldStatus.IPs
}
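
// exampleGetPodIPsPreservation is an illustrative sketch of the IP-preservation
// behavior above. The statuses below are fabricated; in practice they come from the
// runtime, and g is assumed to have a populated cache.
func exampleGetPodIPsPreservation(g *GenericPLEG, pid types.UID) {
	// A status that already carries IPs always wins.
	withIPs := &kubecontainer.PodStatus{IPs: []string{"10.0.0.5"}}
	_ = g.getPodIPs(pid, withIPs) // returns []string{"10.0.0.5"}

	// A status with no IPs and no ready sandbox falls back to the cached IPs,
	// so a completed pod keeps reporting the IPs it had while running.
	exited := &kubecontainer.PodStatus{}
	_ = g.getPodIPs(pid, exited) // returns the previously cached IPs, if any
}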
// updateCache tries to update the pod status in the kubelet cache and returns true if the
// pod status was actually updated in the cache. It will return false if the pod status
// was ignored by the cache.
func (g *GenericPLEG) updateCache(ctx context.Context, pod *kubecontainer.Pod, pid types.UID) (error, bool) {
	if pod == nil {
		// The pod is missing in the current relist. This means that
		// the pod has no visible (active or inactive) containers.
		klog.V(4).InfoS("PLEG: Delete status for pod", "podUID", string(pid))
		g.cache.Delete(pid)
		return nil, true
	}

	g.podCacheMutex.Lock()
	defer g.podCacheMutex.Unlock()
	timestamp := g.clock.Now()

	status, err := g.runtime.GetPodStatus(ctx, pod.ID, pod.Name, pod.Namespace)
	if err != nil {
		// nolint:logcheck // Not using the result of klog.V inside the
		// if branch is okay, we just use it to determine whether the
		// additional "podStatus" key and its value should be added.
		if klog.V(6).Enabled() {
			klog.ErrorS(err, "PLEG: Write status", "pod", klog.KRef(pod.Namespace, pod.Name), "podStatus", status)
		} else {
			klog.ErrorS(err, "PLEG: Write status", "pod", klog.KRef(pod.Namespace, pod.Name))
		}
	} else {
		if klogV := klog.V(6); klogV.Enabled() {
			klogV.InfoS("PLEG: Write status", "pod", klog.KRef(pod.Namespace, pod.Name), "podStatus", status)
		} else {
			klog.V(4).InfoS("PLEG: Write status", "pod", klog.KRef(pod.Namespace, pod.Name))
		}
		// Preserve the pod IP across cache updates if the new IP is empty.
		// When a pod is torn down, kubelet may race with PLEG and retrieve
		// a pod status after network teardown, but the kubernetes API expects
		// the completed pod's IP to be available after the pod is dead.
		status.IPs = g.getPodIPs(pid, status)
	}

	// When only the Generic PLEG is in use, the PodStatus is saved in the cache without
	// any validation of the existing status against the current timestamp.
	// This works well when the Generic PLEG is the only writer of PodStatus to the cache.
	// However, with multiple writers, such as the Evented PLEG, we may run into racy
	// timestamps, because each writer computes its timestamp in its own execution flow.
	// While the Generic PLEG computes this timestamp before fetching the PodStatus, the
	// Evented PLEG can only compute the corresponding timestamp after the event has been
	// received by the kubelet.
	// For more details refer to:
	// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/3386-kubelet-evented-pleg#timestamp-of-the-pod-status
	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) && isEventedPLEGInUse() {
		timestamp = status.TimeStamp
	}

	return err, g.cache.Set(pod.ID, status, err, timestamp)
}

// UpdateCache inspects the given pod and, if the pod cache is enabled, updates the
// cached PodStatus for it. The boolean reports whether the cache accepted the write.
func (g *GenericPLEG) UpdateCache(pod *kubecontainer.Pod, pid types.UID) (error, bool) {
	ctx := context.Background()
	if !g.cacheEnabled() {
		return fmt.Errorf("pod cache disabled"), false
	}
	if pod == nil {
		return fmt.Errorf("pod cannot be nil"), false
	}
	return g.updateCache(ctx, pod, pid)
}

func updateEvents(eventsByPodID map[types.UID][]*PodLifecycleEvent, e *PodLifecycleEvent) {
	if e == nil {
		return
	}
	eventsByPodID[e.ID] = append(eventsByPodID[e.ID], e)
}
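
// exampleUpdateCacheCaller is an illustrative sketch of how a caller outside the
// relist loop might use the exported UpdateCache. The caller and its error handling
// are assumptions made for illustration only.
func exampleUpdateCacheCaller(g *GenericPLEG, pod *kubecontainer.Pod) {
	if err, updated := g.UpdateCache(pod, pod.ID); err != nil {
		klog.ErrorS(err, "Failed to update pod cache", "podUID", pod.ID)
	} else if !updated {
		// The cache ignored this write (see the timestamp discussion in updateCache above).
		klog.V(4).InfoS("Pod cache write was ignored", "podUID", pod.ID)
	}
}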
func getContainerState(pod *kubecontainer.Pod, cid *kubecontainer.ContainerID) plegContainerState {
	// Default to the non-existent state.
	state := plegContainerNonExistent
	if pod == nil {
		return state
	}
	c := pod.FindContainerByID(*cid)
	if c != nil {
		return convertState(c.State)
	}
	// Search through sandboxes too.
	c = pod.FindSandboxByID(*cid)
	if c != nil {
		return convertState(c.State)
	}

	return state
}

func updateRunningPodAndContainerMetrics(pods []*kubecontainer.Pod) {
	runningSandboxNum := 0
	// Intermediate map to store the count of each "container_state".
	containerStateCount := make(map[string]int)

	for _, pod := range pods {
		containers := pod.Containers
		for _, container := range containers {
			// Update the corresponding "container_state" count used to set the gauge vector metric.
			containerStateCount[string(container.State)]++
		}

		sandboxes := pod.Sandboxes

		for _, sandbox := range sandboxes {
			if sandbox.State == kubecontainer.ContainerStateRunning {
				runningSandboxNum++
				// Every pod should only have one running sandbox.
				break
			}
		}
	}
	for key, value := range containerStateCount {
		metrics.RunningContainerCount.WithLabelValues(key).Set(float64(value))
	}

	// Set the number of running pods in the metric.
	metrics.RunningPodCount.Set(float64(runningSandboxNum))
}

func (pr podRecords) getOld(id types.UID) *kubecontainer.Pod {
	r, ok := pr[id]
	if !ok {
		return nil
	}
	return r.old
}

func (pr podRecords) getCurrent(id types.UID) *kubecontainer.Pod {
	r, ok := pr[id]
	if !ok {
		return nil
	}
	return r.current
}

func (pr podRecords) setCurrent(pods []*kubecontainer.Pod) {
	for i := range pr {
		pr[i].current = nil
	}
	for _, pod := range pods {
		if r, ok := pr[pod.ID]; ok {
			r.current = pod
		} else {
			pr[pod.ID] = &podRecord{current: pod}
		}
	}
}

func (pr podRecords) update(id types.UID) {
	r, ok := pr[id]
	if !ok {
		return
	}
	pr.updateInternal(id, r)
}

func (pr podRecords) updateInternal(id types.UID, r *podRecord) {
	if r.current == nil {
		// Pod no longer exists; delete the entry.
		delete(pr, id)
		return
	}
	r.old = r.current
	r.current = nil
}
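
// examplePodRecordsLifecycle is an illustrative sketch of the podRecords bookkeeping
// that Relist relies on. The pod below is fabricated; in the kubelet, pods come from
// the container runtime.
func examplePodRecordsLifecycle() {
	records := make(podRecords)
	pod := &kubecontainer.Pod{ID: types.UID("example-pod-uid")}

	// A relist stores the freshly listed pods as "current".
	records.setCurrent([]*kubecontainer.Pod{pod})
	// Events are computed by comparing getOld (previous relist) with getCurrent.
	_ = records.getOld(pod.ID)     // nil on the first relist
	_ = records.getCurrent(pod.ID) // the freshly listed pod

	// After the pod's events are processed, update promotes "current" to "old".
	records.update(pod.ID)

	// If the pod disappears in the next relist, setCurrent leaves "current" nil
	// and update drops the record entirely.
	records.setCurrent(nil)
	records.update(pod.ID)
}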