k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/pleg/evented.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package pleg

import (
	"fmt"
	"sync"
	"time"

	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
	internalapi "k8s.io/cri-api/pkg/apis"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/utils/clock"
)

// globalCacheUpdatePeriod is the period with which the global timestamp of the
// cache is updated. If pod workers get stuck in a cache.GetNewerThan call, they
// are unblocked after this period at the latest.
const globalCacheUpdatePeriod = 5 * time.Second

var (
	eventedPLEGUsage   = false
	eventedPLEGUsageMu = sync.RWMutex{}
)

// isEventedPLEGInUse indicates whether Evented PLEG is in use. Even after enabling
// the Evented PLEG feature gate, there could be several reasons it may not be in use,
// e.g. streaming data issues from the runtime, or the runtime does not implement the
// container events stream.
func isEventedPLEGInUse() bool {
	eventedPLEGUsageMu.RLock()
	defer eventedPLEGUsageMu.RUnlock()
	return eventedPLEGUsage
}

// setEventedPLEGUsage should only be accessed from
// Start/Stop of Evented PLEG.
func setEventedPLEGUsage(enable bool) {
	eventedPLEGUsageMu.Lock()
	defer eventedPLEGUsageMu.Unlock()
	eventedPLEGUsage = enable
}

type EventedPLEG struct {
	// The container runtime.
	runtime kubecontainer.Runtime
	// The runtime service.
	runtimeService internalapi.RuntimeService
	// The channel from which the subscriber listens for events.
	eventChannel chan *PodLifecycleEvent
	// Cache for storing the runtime states required for syncing pods.
	cache kubecontainer.Cache
	// For testability.
	clock clock.Clock
	// GenericPLEG is used to force relist when required.
	genericPleg podLifecycleEventGeneratorHandler
	// The maximum number of retries when getting container events from the runtime.
	eventedPlegMaxStreamRetries int
	// Indicates relisting related parameters.
	relistDuration *RelistDuration
	// Stop the Evented PLEG by closing the channel.
	stopCh chan struct{}
	// Stops the periodic update of the cache global timestamp.
	stopCacheUpdateCh chan struct{}
	// Locks the start/stop operation of the Evented PLEG.
	runningMu sync.Mutex
}

// NewEventedPLEG instantiates a new EventedPLEG object and returns it.
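//
// A minimal, illustrative call site (not from this file; the runtime, runtimeService,
// eventChannel, cache and genericPleg variables are assumed to be supplied by the
// kubelet, and the retry count and relist durations are hypothetical values):
//
//	eventedPleg, err := NewEventedPLEG(runtime, runtimeService, eventChannel,
//		cache, genericPleg, 5, /* max stream retries */
//		&RelistDuration{RelistPeriod: 5 * time.Minute, RelistThreshold: 10 * time.Minute},
//		clock.RealClock{})
//	if err != nil {
//		// genericPleg does not implement podLifecycleEventGeneratorHandler;
//		// keep using the Generic PLEG on its own.
//	}
//	eventedPleg.Start()
//	defer eventedPleg.Stop()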
func NewEventedPLEG(runtime kubecontainer.Runtime, runtimeService internalapi.RuntimeService, eventChannel chan *PodLifecycleEvent,
	cache kubecontainer.Cache, genericPleg PodLifecycleEventGenerator, eventedPlegMaxStreamRetries int,
	relistDuration *RelistDuration, clock clock.Clock) (PodLifecycleEventGenerator, error) {
	handler, ok := genericPleg.(podLifecycleEventGeneratorHandler)
	if !ok {
		return nil, fmt.Errorf("%v doesn't implement podLifecycleEventGeneratorHandler interface", genericPleg)
	}
	return &EventedPLEG{
		runtime:                     runtime,
		runtimeService:              runtimeService,
		eventChannel:                eventChannel,
		cache:                       cache,
		genericPleg:                 handler,
		eventedPlegMaxStreamRetries: eventedPlegMaxStreamRetries,
		relistDuration:              relistDuration,
		clock:                       clock,
	}, nil
}

// Watch returns a channel from which the subscriber can receive PodLifecycleEvent events.
func (e *EventedPLEG) Watch() chan *PodLifecycleEvent {
	return e.eventChannel
}

// Relist relists all containers using GenericPLEG.
func (e *EventedPLEG) Relist() {
	e.genericPleg.Relist()
}

// Start starts the Evented PLEG.
func (e *EventedPLEG) Start() {
	e.runningMu.Lock()
	defer e.runningMu.Unlock()
	if isEventedPLEGInUse() {
		return
	}
	setEventedPLEGUsage(true)
	e.stopCh = make(chan struct{})
	e.stopCacheUpdateCh = make(chan struct{})
	go wait.Until(e.watchEventsChannel, 0, e.stopCh)
	go wait.Until(e.updateGlobalCache, globalCacheUpdatePeriod, e.stopCacheUpdateCh)
}

// Stop stops the Evented PLEG.
func (e *EventedPLEG) Stop() {
	e.runningMu.Lock()
	defer e.runningMu.Unlock()
	if !isEventedPLEGInUse() {
		return
	}
	setEventedPLEGUsage(false)
	close(e.stopCh)
	close(e.stopCacheUpdateCh)
}

// In case the Evented PLEG experiences undetectable issues in the underlying
// GRPC connection, there is a remote chance a pod might get stuck in a
// given state while it has progressed in its life cycle. This function is
// called periodically to update the global timestamp of the cache so that
// pods stuck at GetNewerThan in pod workers get unstuck.
func (e *EventedPLEG) updateGlobalCache() {
	e.cache.UpdateTime(time.Now())
}

// Update updates the relisting period and threshold.
func (e *EventedPLEG) Update(relistDuration *RelistDuration) {
	e.genericPleg.Update(relistDuration)
}

// Healthy checks whether the PLEG is working properly.
func (e *EventedPLEG) Healthy() (bool, error) {
	// GenericPLEG is declared unhealthy when relisting time is more
	// than the relistThreshold. In case EventedPLEG is turned on,
	// relistingPeriod and relistingThreshold are adjusted to higher
	// values. So the health check of Generic PLEG should check
	// the adjusted values of relistingPeriod and relistingThreshold.

	// EventedPLEG is declared unhealthy only if eventChannel is out of capacity.
	if len(e.eventChannel) == cap(e.eventChannel) {
		return false, fmt.Errorf("EventedPLEG: pleg event channel capacity is full with %v events", len(e.eventChannel))
	}

	timestamp := e.clock.Now()
	metrics.PLEGLastSeen.Set(float64(timestamp.Unix()))
	return true, nil
}

func (e *EventedPLEG) watchEventsChannel() {
	containerEventsResponseCh := make(chan *runtimeapi.ContainerEventResponse, cap(e.eventChannel))
	defer close(containerEventsResponseCh)

	// Get the container events from the runtime.
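	// If the stream cannot be (re)established after eventedPlegMaxStreamRetries
	// attempts, the goroutine below stops the Evented PLEG and restarts the
	// Generic PLEG with the default (shorter) relist duration, so pod statuses
	// keep being refreshed via relisting.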
	go func() {
		numAttempts := 0
		for {
			if numAttempts >= e.eventedPlegMaxStreamRetries {
				if isEventedPLEGInUse() {
					// Fall back to Generic PLEG relisting since Evented PLEG is not working.
					klog.V(4).InfoS("Fall back to Generic PLEG relisting since Evented PLEG is not working")
					e.Stop()
					e.genericPleg.Stop()       // Stop the existing Generic PLEG which runs with longer relisting period when Evented PLEG is in use.
					e.Update(e.relistDuration) // Update the relisting period to the default value for the Generic PLEG.
					e.genericPleg.Start()
					break
				}
			}

			err := e.runtimeService.GetContainerEvents(containerEventsResponseCh, func(runtimeapi.RuntimeService_GetContainerEventsClient) {
				metrics.EventedPLEGConn.Inc()
			})
			if err != nil {
				metrics.EventedPLEGConnErr.Inc()
				numAttempts++
				e.Relist() // Force a relist to get the latest container and pods running metric.
				klog.V(4).InfoS("Evented PLEG: Failed to get container events, retrying", "err", err)
			}
		}
	}()

	if isEventedPLEGInUse() {
		e.processCRIEvents(containerEventsResponseCh)
	}
}

func (e *EventedPLEG) processCRIEvents(containerEventsResponseCh chan *runtimeapi.ContainerEventResponse) {
	for event := range containerEventsResponseCh {
		// Ignore the event if PodSandboxStatus is nil.
		// This might happen under some race condition where the podSandbox has
		// been deleted, and therefore the container runtime couldn't find the
		// podSandbox for the container when generating the event.
		// It is safe to ignore because
		// a) an event would have been received for the sandbox deletion,
		// b) in the worst case, a relist will eventually sync the pod status.
		// TODO(#114371): Figure out a way to handle this case instead of ignoring.
		if event.PodSandboxStatus == nil || event.PodSandboxStatus.Metadata == nil {
			klog.ErrorS(nil, "Evented PLEG: received ContainerEventResponse with nil PodSandboxStatus or PodSandboxStatus.Metadata", "containerEventResponse", event)
			continue
		}

		podID := types.UID(event.PodSandboxStatus.Metadata.Uid)
		shouldSendPLEGEvent := false

		status, err := e.runtime.GeneratePodStatus(event)
		if err != nil {
			// nolint:logcheck // Not using the result of klog.V inside the
			// if branch is okay, we just use it to determine whether the
			// additional "podStatus" key and its value should be added.
			if klog.V(6).Enabled() {
				klog.ErrorS(err, "Evented PLEG: error generating pod status from the received event", "podUID", podID, "podStatus", status)
			} else {
				klog.ErrorS(err, "Evented PLEG: error generating pod status from the received event", "podUID", podID)
			}
		} else {
			if klogV := klog.V(6); klogV.Enabled() {
				klogV.InfoS("Evented PLEG: Generated pod status from the received event", "podUID", podID, "podStatus", status)
			} else {
				klog.V(4).InfoS("Evented PLEG: Generated pod status from the received event", "podUID", podID)
			}
			// Preserve the pod IP across cache updates if the new IP is empty.
			// When a pod is torn down, kubelet may race with PLEG and retrieve
			// a pod status after network teardown, but the kubernetes API expects
			// the completed pod's IP to be available after the pod is dead.
			status.IPs = e.getPodIPs(podID, status)
		}

		e.updateRunningPodMetric(status)
		e.updateRunningContainerMetric(status)
		e.updateLatencyMetric(event)

		if event.ContainerEventType == runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT {
			for _, sandbox := range status.SandboxStatuses {
				if sandbox.Id == event.ContainerId {
					// When a CONTAINER_DELETED_EVENT is received, the runtime has
					// removed the container, so it must be removed from the
					// kubelet's cache as well.
					e.cache.Delete(podID)
				}
			}
			shouldSendPLEGEvent = true
		} else {
			if e.cache.Set(podID, status, err, time.Unix(event.GetCreatedAt(), 0)) {
				shouldSendPLEGEvent = true
			}
		}

		if shouldSendPLEGEvent {
			e.processCRIEvent(event)
		}
	}
}

func (e *EventedPLEG) processCRIEvent(event *runtimeapi.ContainerEventResponse) {
	switch event.ContainerEventType {
	case runtimeapi.ContainerEventType_CONTAINER_STOPPED_EVENT:
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerDied, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Stopped Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_CREATED_EVENT:
		// We only need to update the pod status on container create.
		// We don't have to generate any PodLifeCycleEvent; container creation related
		// PodLifeCycleEvents are ignored by the existing Generic PLEG as well.
		// https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L88 and
		// https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L273
		klog.V(4).InfoS("Received Container Created Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_STARTED_EVENT:
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerStarted, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Started Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT:
		// In case the pod is deleted, it is safe to generate both ContainerDied and ContainerRemoved events, just like in the case of
		// Generic PLEG.
		// https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L169
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerDied, Data: event.ContainerId})
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerRemoved, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Deleted Event", "event", event)
	}
}

func (e *EventedPLEG) getPodIPs(pid types.UID, status *kubecontainer.PodStatus) []string {
	if len(status.IPs) != 0 {
		return status.IPs
	}

	oldStatus, err := e.cache.Get(pid)
	if err != nil || len(oldStatus.IPs) == 0 {
		return nil
	}

	for _, sandboxStatus := range status.SandboxStatuses {
		// If at least one sandbox is ready, then use this status update's pod IP
		if sandboxStatus.State == runtimeapi.PodSandboxState_SANDBOX_READY {
			return status.IPs
		}
	}

	// For pods with no ready containers or sandboxes (like exited pods)
	// use the old status' pod IP
	return oldStatus.IPs
}

func (e *EventedPLEG) sendPodLifecycleEvent(event *PodLifecycleEvent) {
	select {
	case e.eventChannel <- event:
	default:
		// record how many events were discarded due to channel out of capacity
		metrics.PLEGDiscardEvents.Inc()
		klog.ErrorS(nil, "Evented PLEG: Event channel is full, discarded pod lifecycle event")
	}
}

func getPodSandboxState(podStatus *kubecontainer.PodStatus) kubecontainer.State {
	// increase running pod count when cache doesn't contain podID
	var sandboxId string
	for _, sandbox := range podStatus.SandboxStatuses {
		sandboxId = sandbox.Id
		// pod must contain only one sandbox
		break
	}

	for _, containerStatus := range podStatus.ContainerStatuses {
		if containerStatus.ID.ID == sandboxId {
			if containerStatus.State == kubecontainer.ContainerStateRunning {
				return containerStatus.State
			}
		}
	}
	return kubecontainer.ContainerStateExited
}

func (e *EventedPLEG) updateRunningPodMetric(podStatus *kubecontainer.PodStatus) {
	cachedPodStatus, err := e.cache.Get(podStatus.ID)
	if err != nil {
		klog.ErrorS(err, "Evented PLEG: Get cache", "podID", podStatus.ID)
	}
	// cache miss condition: The pod status object will have empty state if missed in cache
	if len(cachedPodStatus.SandboxStatuses) < 1 {
		sandboxState := getPodSandboxState(podStatus)
		if sandboxState == kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Inc()
		}
	} else {
		oldSandboxState := getPodSandboxState(cachedPodStatus)
		currentSandboxState := getPodSandboxState(podStatus)

		if oldSandboxState == kubecontainer.ContainerStateRunning && currentSandboxState != kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Dec()
		} else if oldSandboxState != kubecontainer.ContainerStateRunning && currentSandboxState == kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Inc()
		}
	}
}

func getContainerStateCount(podStatus *kubecontainer.PodStatus) map[kubecontainer.State]int {
	containerStateCount := make(map[kubecontainer.State]int)
	for _, container := range podStatus.ContainerStatuses {
		containerStateCount[container.State]++
	}
	return containerStateCount
}

func (e *EventedPLEG) updateRunningContainerMetric(podStatus *kubecontainer.PodStatus) {
	cachedPodStatus, err := e.cache.Get(podStatus.ID)
	if err != nil {
		klog.ErrorS(err, "Evented PLEG: Get cache", "podID", podStatus.ID)
	}

	// cache miss condition: The pod status object will have empty state if missed in cache
	if len(cachedPodStatus.SandboxStatuses) < 1 {
		containerStateCount := getContainerStateCount(podStatus)
		for state, count := range containerStateCount {
			// add currently obtained count
			metrics.RunningContainerCount.WithLabelValues(string(state)).Add(float64(count))
		}
	} else {
		oldContainerStateCount := getContainerStateCount(cachedPodStatus)
		currentContainerStateCount := getContainerStateCount(podStatus)

		// old and new set of container states may vary;
		// get a unique set of container states combining both
		containerStates := make(map[kubecontainer.State]bool)
		for state := range oldContainerStateCount {
			containerStates[state] = true
		}
		for state := range currentContainerStateCount {
			containerStates[state] = true
		}

		// update the metric via difference of old and current counts
		for state := range containerStates {
			diff := currentContainerStateCount[state] - oldContainerStateCount[state]
			metrics.RunningContainerCount.WithLabelValues(string(state)).Add(float64(diff))
		}
	}
}

func (e *EventedPLEG) updateLatencyMetric(event *runtimeapi.ContainerEventResponse) {
	duration := time.Duration(time.Now().UnixNano()-event.CreatedAt) * time.Nanosecond
	metrics.EventedPLEGConnLatency.Observe(duration.Seconds())
}

func (e *EventedPLEG) UpdateCache(pod *kubecontainer.Pod, pid types.UID) (error, bool) {
	return fmt.Errorf("not implemented"), false
}
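
// Illustrative example (not part of the upstream file) of the state-count diffing
// performed in updateRunningContainerMetric above, using hypothetical counts:
//
//	old (cached) status:   map[running:2 exited:1]
//	new (event)  status:   map[running:1 exited:2]
//	union of states:       {running, exited}
//	metric adjustments:    running -1, exited +1
//
// The union of old and new states ensures that a state which disappeared entirely
// from the new status still gets its count decremented.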