k8s.io/kubernetes@v1.29.3/pkg/kubelet/pleg/evented.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package pleg

import (
	"fmt"
	"sync"
	"time"

	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
	internalapi "k8s.io/cri-api/pkg/apis"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/utils/clock"
)

// The period at which the global timestamp of the cache is updated. If pod
// workers get stuck in a cache.GetNewerThan call, they will be unblocked
// after this period.
const globalCacheUpdatePeriod = 5 * time.Second

var (
	eventedPLEGUsage   = false
	eventedPLEGUsageMu = sync.RWMutex{}
)

// isEventedPLEGInUse indicates whether the Evented PLEG is in use. Even after
// enabling the Evented PLEG feature gate, there could be several reasons it may
// not be in use, e.g. streaming data issues from the runtime or the runtime not
// implementing the container events stream.
func isEventedPLEGInUse() bool {
	eventedPLEGUsageMu.RLock()
	defer eventedPLEGUsageMu.RUnlock()
	return eventedPLEGUsage
}

// setEventedPLEGUsage should only be called from Start/Stop of the Evented PLEG.
func setEventedPLEGUsage(enable bool) {
	eventedPLEGUsageMu.Lock()
	defer eventedPLEGUsageMu.Unlock()
	eventedPLEGUsage = enable
}

type EventedPLEG struct {
	// The container runtime.
	runtime kubecontainer.Runtime
	// The runtime service.
	runtimeService internalapi.RuntimeService
	// The channel from which the subscriber receives events.
	eventChannel chan *PodLifecycleEvent
	// Cache for storing the runtime states required for syncing pods.
	cache kubecontainer.Cache
	// For testability.
	clock clock.Clock
	// GenericPLEG is used to force a relist when required.
	genericPleg podLifecycleEventGeneratorHandler
	// The maximum number of retries when getting container events from the runtime.
	eventedPlegMaxStreamRetries int
	// Relisting-related parameters (period and threshold).
	relistDuration *RelistDuration
	// The Evented PLEG is stopped by closing this channel.
	stopCh chan struct{}
	// Stops the periodic update of the cache's global timestamp.
	stopCacheUpdateCh chan struct{}
	// Locks the start/stop operations of the Evented PLEG.
	runningMu sync.Mutex
}

// NewEventedPLEG instantiates a new EventedPLEG object and returns it.
func NewEventedPLEG(runtime kubecontainer.Runtime, runtimeService internalapi.RuntimeService, eventChannel chan *PodLifecycleEvent,
	cache kubecontainer.Cache, genericPleg PodLifecycleEventGenerator, eventedPlegMaxStreamRetries int,
	relistDuration *RelistDuration, clock clock.Clock) (PodLifecycleEventGenerator, error) {
	handler, ok := genericPleg.(podLifecycleEventGeneratorHandler)
	if !ok {
		return nil, fmt.Errorf("%v doesn't implement podLifecycleEventGeneratorHandler interface", genericPleg)
	}
	return &EventedPLEG{
		runtime:                     runtime,
		runtimeService:              runtimeService,
		eventChannel:                eventChannel,
		cache:                       cache,
		genericPleg:                 handler,
		eventedPlegMaxStreamRetries: eventedPlegMaxStreamRetries,
		relistDuration:              relistDuration,
		clock:                       clock,
	}, nil
}

// Watch returns a channel from which the subscriber can receive PodLifecycleEvent events.
func (e *EventedPLEG) Watch() chan *PodLifecycleEvent {
	return e.eventChannel
}

// Relist relists all containers using the Generic PLEG.
func (e *EventedPLEG) Relist() {
	e.genericPleg.Relist()
}

// Start starts the Evented PLEG.
func (e *EventedPLEG) Start() {
	e.runningMu.Lock()
	defer e.runningMu.Unlock()
	if isEventedPLEGInUse() {
		return
	}
	setEventedPLEGUsage(true)
	e.stopCh = make(chan struct{})
	e.stopCacheUpdateCh = make(chan struct{})
	go wait.Until(e.watchEventsChannel, 0, e.stopCh)
	go wait.Until(e.updateGlobalCache, globalCacheUpdatePeriod, e.stopCacheUpdateCh)
}

// Stop stops the Evented PLEG.
func (e *EventedPLEG) Stop() {
	e.runningMu.Lock()
	defer e.runningMu.Unlock()
	if !isEventedPLEGInUse() {
		return
	}
	setEventedPLEGUsage(false)
	close(e.stopCh)
	close(e.stopCacheUpdateCh)
}

// In case the Evented PLEG experiences undetectable issues in the underlying
// gRPC connection, there is a remote chance a pod might get stuck in a given
// state while it has progressed in its life cycle. This function is called
// periodically to update the global timestamp of the cache so that pods stuck
// at GetNewerThan in pod workers get unstuck.
func (e *EventedPLEG) updateGlobalCache() {
	e.cache.UpdateTime(time.Now())
}

// Update updates the relisting period and threshold.
func (e *EventedPLEG) Update(relistDuration *RelistDuration) {
	e.genericPleg.Update(relistDuration)
}

// Healthy checks if the PLEG works properly.
func (e *EventedPLEG) Healthy() (bool, error) {
	// The Generic PLEG is declared unhealthy when the relisting time exceeds
	// the relistThreshold. When the Evented PLEG is turned on, the relisting
	// period and threshold are adjusted to higher values, so the health check
	// of the Generic PLEG should check those adjusted values.

	// The Evented PLEG is declared unhealthy only if the eventChannel is out of capacity.
	if len(e.eventChannel) == cap(e.eventChannel) {
		return false, fmt.Errorf("EventedPLEG: pleg event channel capacity is full with %v events", len(e.eventChannel))
	}

	timestamp := e.clock.Now()
	metrics.PLEGLastSeen.Set(float64(timestamp.Unix()))
	return true, nil
}

func (e *EventedPLEG) watchEventsChannel() {
	containerEventsResponseCh := make(chan *runtimeapi.ContainerEventResponse, cap(e.eventChannel))
	defer close(containerEventsResponseCh)

	// Get the container events from the runtime.
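	// Note on the retry/fallback strategy: numAttempts counts failed calls to
	// GetContainerEvents. Once the retry budget (eventedPlegMaxStreamRetries)
	// is exhausted and the Evented PLEG is still marked as in use, the
	// goroutine below stops the Evented PLEG and restarts the Generic PLEG
	// with its default (shorter) relisting duration.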
	go func() {
		numAttempts := 0
		for {
			if numAttempts >= e.eventedPlegMaxStreamRetries {
				if isEventedPLEGInUse() {
					// Fall back to Generic PLEG relisting since the Evented PLEG is not working.
					klog.V(4).InfoS("Fall back to Generic PLEG relisting since Evented PLEG is not working")
					e.Stop()
					e.genericPleg.Stop()       // Stop the existing Generic PLEG, which runs with a longer relisting period while the Evented PLEG is in use.
					e.Update(e.relistDuration) // Update the relisting period to the default value for the Generic PLEG.
					e.genericPleg.Start()
					break
				}
			}

			err := e.runtimeService.GetContainerEvents(containerEventsResponseCh)
			if err != nil {
				metrics.EventedPLEGConnErr.Inc()
				numAttempts++
				e.Relist() // Force a relist to get the latest containers and running-pod metrics.
				klog.V(4).InfoS("Evented PLEG: Failed to get container events, retrying: ", "err", err)
			}
		}
	}()

	if isEventedPLEGInUse() {
		e.processCRIEvents(containerEventsResponseCh)
	}
}

func (e *EventedPLEG) processCRIEvents(containerEventsResponseCh chan *runtimeapi.ContainerEventResponse) {
	for event := range containerEventsResponseCh {
		// Ignore the event if PodSandboxStatus is nil.
		// This might happen under some race condition where the podSandbox has
		// been deleted, and therefore the container runtime couldn't find the
		// podSandbox for the container when generating the event.
		// It is safe to ignore because
		// a) an event would have been received for the sandbox deletion,
		// b) in the worst case, a relist will eventually sync the pod status.
		// TODO(#114371): Figure out a way to handle this case instead of ignoring.
		if event.PodSandboxStatus == nil || event.PodSandboxStatus.Metadata == nil {
			klog.ErrorS(nil, "Evented PLEG: received ContainerEventResponse with nil PodSandboxStatus or PodSandboxStatus.Metadata", "containerEventResponse", event)
			continue
		}

		podID := types.UID(event.PodSandboxStatus.Metadata.Uid)
		shouldSendPLEGEvent := false

		status, err := e.runtime.GeneratePodStatus(event)
		if err != nil {
			// nolint:logcheck // Not using the result of klog.V inside the
			// if branch is okay, we just use it to determine whether the
			// additional "podStatus" key and its value should be added.
			if klog.V(6).Enabled() {
				klog.ErrorS(err, "Evented PLEG: error generating pod status from the received event", "podUID", podID, "podStatus", status)
			} else {
				klog.ErrorS(err, "Evented PLEG: error generating pod status from the received event", "podUID", podID)
			}
		} else {
			if klogV := klog.V(6); klogV.Enabled() {
				klogV.InfoS("Evented PLEG: Generated pod status from the received event", "podUID", podID, "podStatus", status)
			} else {
				klog.V(4).InfoS("Evented PLEG: Generated pod status from the received event", "podUID", podID)
			}
			// Preserve the pod IP across cache updates if the new IP is empty.
			// When a pod is torn down, the kubelet may race with the PLEG and retrieve
			// a pod status after network teardown, but the Kubernetes API expects
			// the completed pod's IP to be available after the pod is dead.
			status.IPs = e.getPodIPs(podID, status)
		}

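		// The gauges below are maintained incrementally: the freshly generated
		// status is compared against the cached status so that only the delta is
		// applied to the running-pod and running-container metrics, and the
		// connection latency metric is derived from the event's CreatedAt timestamp.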
		e.updateRunningPodMetric(status)
		e.updateRunningContainerMetric(status)
		e.updateLatencyMetric(event)

		if event.ContainerEventType == runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT {
			for _, sandbox := range status.SandboxStatuses {
				if sandbox.Id == event.ContainerId {
					// When the kubelet receives the CONTAINER_DELETED_EVENT, the
					// runtime has removed the container, so it must be removed
					// from the kubelet's cache as well.
					e.cache.Delete(podID)
				}
			}
			shouldSendPLEGEvent = true
		} else {
			if e.cache.Set(podID, status, err, time.Unix(event.GetCreatedAt(), 0)) {
				shouldSendPLEGEvent = true
			}
		}

		if shouldSendPLEGEvent {
			e.processCRIEvent(event)
		}
	}
}

func (e *EventedPLEG) processCRIEvent(event *runtimeapi.ContainerEventResponse) {
	switch event.ContainerEventType {
	case runtimeapi.ContainerEventType_CONTAINER_STOPPED_EVENT:
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerDied, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Stopped Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_CREATED_EVENT:
		// We only need to update the pod status on container creation; no
		// PodLifecycleEvent has to be generated. Container-creation events are
		// ignored by the existing Generic PLEG as well.
		// https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L88 and
		// https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L273
		klog.V(4).InfoS("Received Container Created Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_STARTED_EVENT:
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerStarted, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Started Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT:
		// In case the pod is deleted, it is safe to generate both ContainerDied and
		// ContainerRemoved events, just like the Generic PLEG does.
		// https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L169
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerDied, Data: event.ContainerId})
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerRemoved, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Deleted Event", "event", event)
	}
}

func (e *EventedPLEG) getPodIPs(pid types.UID, status *kubecontainer.PodStatus) []string {
	if len(status.IPs) != 0 {
		return status.IPs
	}

	oldStatus, err := e.cache.Get(pid)
	if err != nil || len(oldStatus.IPs) == 0 {
		return nil
	}

	for _, sandboxStatus := range status.SandboxStatuses {
		// If at least one sandbox is ready, then use this status update's pod IP.
		if sandboxStatus.State == runtimeapi.PodSandboxState_SANDBOX_READY {
			return status.IPs
		}
	}

	// For pods with no ready containers or sandboxes (like exited pods),
	// use the old status's pod IP.
	return oldStatus.IPs
}

func (e *EventedPLEG) sendPodLifecycleEvent(event *PodLifecycleEvent) {
	select {
	case e.eventChannel <- event:
	default:
		// Record how many events were discarded because the channel ran out of capacity.
		metrics.PLEGDiscardEvents.Inc()
		klog.ErrorS(nil, "Evented PLEG: Event channel is full, discarded pod lifecycle event")
	}
}

func getPodSandboxState(podStatus *kubecontainer.PodStatus) kubecontainer.State {
	// increase running pod count when cache doesn't contain podID
	var sandboxId string
	for _, sandbox := range podStatus.SandboxStatuses {
		sandboxId = sandbox.Id
		// A pod must contain only one sandbox.
		break
	}

	for _, containerStatus := range podStatus.ContainerStatuses {
		if containerStatus.ID.ID == sandboxId {
			if containerStatus.State == kubecontainer.ContainerStateRunning {
				return containerStatus.State
			}
		}
	}
	return kubecontainer.ContainerStateExited
}

func (e *EventedPLEG) updateRunningPodMetric(podStatus *kubecontainer.PodStatus) {
	cachedPodStatus, err := e.cache.Get(podStatus.ID)
	if err != nil {
		klog.ErrorS(err, "Evented PLEG: Get cache", "podID", podStatus.ID)
	}
	// Cache miss condition: the pod status object will have an empty state if it was missed in the cache.
	if len(cachedPodStatus.SandboxStatuses) < 1 {
		sandboxState := getPodSandboxState(podStatus)
		if sandboxState == kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Inc()
		}
	} else {
		oldSandboxState := getPodSandboxState(cachedPodStatus)
		currentSandboxState := getPodSandboxState(podStatus)

		if oldSandboxState == kubecontainer.ContainerStateRunning && currentSandboxState != kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Dec()
		} else if oldSandboxState != kubecontainer.ContainerStateRunning && currentSandboxState == kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Inc()
		}
	}
}

func getContainerStateCount(podStatus *kubecontainer.PodStatus) map[kubecontainer.State]int {
	containerStateCount := make(map[kubecontainer.State]int)
	for _, container := range podStatus.ContainerStatuses {
		containerStateCount[container.State]++
	}
	return containerStateCount
}
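
// For illustration only (not part of the kubelet): if the cached status counted
// {running: 2} and the new status counts {running: 1, exited: 1}, then
// updateRunningContainerMetric below applies the per-state differences
// {running: -1, exited: +1} to the RunningContainerCount gauge, so the absolute
// counts stay correct without relisting every container on the node.
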
func (e *EventedPLEG) updateRunningContainerMetric(podStatus *kubecontainer.PodStatus) {
	cachedPodStatus, err := e.cache.Get(podStatus.ID)
	if err != nil {
		klog.ErrorS(err, "Evented PLEG: Get cache", "podID", podStatus.ID)
	}

	// Cache miss condition: the pod status object will have an empty state if it was missed in the cache.
	if len(cachedPodStatus.SandboxStatuses) < 1 {
		containerStateCount := getContainerStateCount(podStatus)
		for state, count := range containerStateCount {
			// Add the currently obtained count.
			metrics.RunningContainerCount.WithLabelValues(string(state)).Add(float64(count))
		}
	} else {
		oldContainerStateCount := getContainerStateCount(cachedPodStatus)
		currentContainerStateCount := getContainerStateCount(podStatus)

		// The old and new sets of container states may differ;
		// build a unique set of container states combining both.
		containerStates := make(map[kubecontainer.State]bool)
		for state := range oldContainerStateCount {
			containerStates[state] = true
		}
		for state := range currentContainerStateCount {
			containerStates[state] = true
		}

		// Update the metric using the difference between the old and current counts.
		for state := range containerStates {
			diff := currentContainerStateCount[state] - oldContainerStateCount[state]
			metrics.RunningContainerCount.WithLabelValues(string(state)).Add(float64(diff))
		}
	}
}

func (e *EventedPLEG) updateLatencyMetric(event *runtimeapi.ContainerEventResponse) {
	duration := time.Duration(time.Now().UnixNano()-event.CreatedAt) * time.Nanosecond
	metrics.EventedPLEGConnLatency.Observe(duration.Seconds())
}

// UpdateCache is not implemented by the Evented PLEG.
func (e *EventedPLEG) UpdateCache(pod *kubecontainer.Pod, pid types.UID) (error, bool) {
	return fmt.Errorf("not implemented"), false
}
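
// The sketch below is illustrative only and is not part of this file; it shows,
// under assumed inputs (runtime, runtimeService, cache, genericPleg and
// relistDuration supplied by the caller, and an example retry budget of 5),
// how a subscriber might wire up the Evented PLEG using NewEventedPLEG, Start
// and Watch defined above:
//
//	eventChannel := make(chan *PodLifecycleEvent, 100)
//	evented, err := NewEventedPLEG(runtime, runtimeService, eventChannel,
//		cache, genericPleg, 5, relistDuration, clock.RealClock{})
//	if err != nil {
//		// NewEventedPLEG fails when genericPleg does not implement
//		// podLifecycleEventGeneratorHandler.
//		return err
//	}
//	evented.Start()
//	for event := range evented.Watch() {
//		// Hand the PodLifecycleEvent to the pod workers / sync loop.
//		_ = event
//	}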