/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package pleg

import (
	"fmt"
	"sync"
	"time"

	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
	internalapi "k8s.io/cri-api/pkg/apis"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/utils/clock"
)

// The frequency at which the global timestamp of the cache is
// updated periodically. If pod workers get stuck at a cache.GetNewerThan
// call, they will be unblocked after this period.
const globalCacheUpdatePeriod = 5 * time.Second

var (
	eventedPLEGUsage   = false
	eventedPLEGUsageMu = sync.RWMutex{}
)

// isEventedPLEGInUse indicates whether the Evented PLEG is in use. Even after
// enabling the Evented PLEG feature gate, there are several reasons it may not
// be in use, e.g. streaming data issues from the runtime, or the runtime not
// implementing the container events stream.
func isEventedPLEGInUse() bool {
	eventedPLEGUsageMu.RLock()
	defer eventedPLEGUsageMu.RUnlock()
	return eventedPLEGUsage
}

// setEventedPLEGUsage should only be called from
// Start/Stop of the Evented PLEG.
func setEventedPLEGUsage(enable bool) {
	eventedPLEGUsageMu.Lock()
	defer eventedPLEGUsageMu.Unlock()
	eventedPLEGUsage = enable
}

type EventedPLEG struct {
	// The container runtime.
	runtime kubecontainer.Runtime
	// The runtime service.
	runtimeService internalapi.RuntimeService
	// The channel from which the subscriber listens for events.
	eventChannel chan *PodLifecycleEvent
	// Cache for storing the runtime states required for syncing pods.
	cache kubecontainer.Cache
	// For testability.
	clock clock.Clock
	// GenericPLEG is used to force a relist when required.
	genericPleg podLifecycleEventGeneratorHandler
	// The maximum number of retries when getting container events from the runtime.
	eventedPlegMaxStreamRetries int
	// Relisting-related parameters.
	relistDuration *RelistDuration
	// Stop the Evented PLEG by closing the channel.
	stopCh chan struct{}
	// Stops the periodic update of the cache global timestamp.
	stopCacheUpdateCh chan struct{}
	// Locks the start/stop operations of the Evented PLEG.
	runningMu sync.Mutex
}

// NewEventedPLEG instantiates a new EventedPLEG object and returns it.
func NewEventedPLEG(runtime kubecontainer.Runtime, runtimeService internalapi.RuntimeService, eventChannel chan *PodLifecycleEvent,
	cache kubecontainer.Cache, genericPleg PodLifecycleEventGenerator, eventedPlegMaxStreamRetries int,
	relistDuration *RelistDuration, clock clock.Clock) (PodLifecycleEventGenerator, error) {
	handler, ok := genericPleg.(podLifecycleEventGeneratorHandler)
	if !ok {
		return nil, fmt.Errorf("%v doesn't implement podLifecycleEventGeneratorHandler interface", genericPleg)
	}
	return &EventedPLEG{
		runtime:                     runtime,
		runtimeService:              runtimeService,
		eventChannel:                eventChannel,
		cache:                       cache,
		genericPleg:                 handler,
		eventedPlegMaxStreamRetries: eventedPlegMaxStreamRetries,
		relistDuration:              relistDuration,
		clock:                       clock,
	}, nil
}
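
// A rough construction-and-consumption sketch (the variable names, the retry
// count, and the handle helper below are illustrative assumptions, not the
// kubelet's actual wiring):
//
//	eventedPleg, err := NewEventedPLEG(runtime, runtimeService, eventCh,
//		cache, genericPleg, 5 /* assumed max stream retries */,
//		relistDuration, clock.RealClock{})
//	if err != nil {
//		// e.g. genericPleg does not implement podLifecycleEventGeneratorHandler.
//		klog.ErrorS(err, "Evented PLEG could not be created")
//		return
//	}
//	eventedPleg.Start()
//	for event := range eventedPleg.Watch() {
//		handle(event) // hand each PodLifecycleEvent to the pod workers (hypothetical helper)
//	}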

// Watch returns a channel from which the subscriber can receive PodLifecycleEvent events.
func (e *EventedPLEG) Watch() chan *PodLifecycleEvent {
	return e.eventChannel
}

// Relist relists all containers using GenericPLEG.
func (e *EventedPLEG) Relist() {
	e.genericPleg.Relist()
}

// Start starts the Evented PLEG.
func (e *EventedPLEG) Start() {
	e.runningMu.Lock()
	defer e.runningMu.Unlock()
	if isEventedPLEGInUse() {
		return
	}
	setEventedPLEGUsage(true)
	e.stopCh = make(chan struct{})
	e.stopCacheUpdateCh = make(chan struct{})
	go wait.Until(e.watchEventsChannel, 0, e.stopCh)
	go wait.Until(e.updateGlobalCache, globalCacheUpdatePeriod, e.stopCacheUpdateCh)
}

// Stop stops the Evented PLEG.
func (e *EventedPLEG) Stop() {
	e.runningMu.Lock()
	defer e.runningMu.Unlock()
	if !isEventedPLEGInUse() {
		return
	}
	setEventedPLEGUsage(false)
	close(e.stopCh)
	close(e.stopCacheUpdateCh)
}

// In case the Evented PLEG experiences undetectable issues in the underlying
// gRPC connection, there is a remote chance the pod might get stuck in a
// given state while it has progressed in its life cycle. This function is
// called periodically to update the global timestamp of the cache so that
// pods stuck at GetNewerThan in pod workers will get unstuck.
func (e *EventedPLEG) updateGlobalCache() {
	e.cache.UpdateTime(time.Now())
}
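
// For intuition, a pod worker blocked like this (a sketch, assuming only the
// kubecontainer.Cache semantics described above):
//
//	// Blocks until the cache holds data for podUID newer than lastSyncTime.
//	status, err := cache.GetNewerThan(podUID, lastSyncTime)
//
// is released at the next updateGlobalCache tick even if no new CRI event
// arrives for that pod, because UpdateTime advances the cache's global
// timestamp.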

// Update updates the relisting period and threshold.
func (e *EventedPLEG) Update(relistDuration *RelistDuration) {
	e.genericPleg.Update(relistDuration)
}

// Healthy checks if the PLEG is working properly.
func (e *EventedPLEG) Healthy() (bool, error) {
	// GenericPLEG is declared unhealthy when the relisting time exceeds
	// the relistThreshold. When EventedPLEG is turned on,
	// relistingPeriod and relistingThreshold are adjusted to higher
	// values, so the health check of the Generic PLEG should check
	// the adjusted values of relistingPeriod and relistingThreshold.

	// EventedPLEG is declared unhealthy only if eventChannel is out of capacity.
	if len(e.eventChannel) == cap(e.eventChannel) {
		return false, fmt.Errorf("EventedPLEG: pleg event channel capacity is full with %v events", len(e.eventChannel))
	}

	timestamp := e.clock.Now()
	metrics.PLEGLastSeen.Set(float64(timestamp.Unix()))
	return true, nil
}

func (e *EventedPLEG) watchEventsChannel() {
	containerEventsResponseCh := make(chan *runtimeapi.ContainerEventResponse, cap(e.eventChannel))
	defer close(containerEventsResponseCh)

	// Get the container events from the runtime.
	go func() {
		numAttempts := 0
		for {
			if numAttempts >= e.eventedPlegMaxStreamRetries {
				if isEventedPLEGInUse() {
					// Fall back to Generic PLEG relisting since the Evented PLEG is not working.
					klog.V(4).InfoS("Fall back to Generic PLEG relisting since Evented PLEG is not working")
					e.Stop()
					e.genericPleg.Stop()       // Stop the existing Generic PLEG, which runs with a longer relisting period while the Evented PLEG is in use.
					e.Update(e.relistDuration) // Update the relisting period to the default value for the Generic PLEG.
					e.genericPleg.Start()
					break
				}
			}

			err := e.runtimeService.GetContainerEvents(containerEventsResponseCh, func(runtimeapi.RuntimeService_GetContainerEventsClient) {
				metrics.EventedPLEGConn.Inc()
			})
			if err != nil {
				metrics.EventedPLEGConnErr.Inc()
				numAttempts++
				e.Relist() // Force a relist to get the latest container and pods running metric.
				klog.V(4).InfoS("Evented PLEG: Failed to get container events, retrying", "err", err)
			}
		}
	}()

	if isEventedPLEGInUse() {
		e.processCRIEvents(containerEventsResponseCh)
	}
}
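
// The retry/fallback flow above, in short: each failed GetContainerEvents
// call increments numAttempts and forces a relist; once numAttempts reaches
// eventedPlegMaxStreamRetries, the Evented PLEG stops itself and restarts
// the Generic PLEG with its default (shorter) relist period.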

func (e *EventedPLEG) processCRIEvents(containerEventsResponseCh chan *runtimeapi.ContainerEventResponse) {
	for event := range containerEventsResponseCh {
		// Ignore the event if PodSandboxStatus is nil.
		// This might happen under some race condition where the podSandbox has
		// been deleted, and therefore the container runtime couldn't find the
		// podSandbox for the container when generating the event.
		// It is safe to ignore because
		// a) an event would have been received for the sandbox deletion,
		// b) in the worst case, a relist will eventually sync the pod status.
		// TODO(#114371): Figure out a way to handle this case instead of ignoring.
		if event.PodSandboxStatus == nil || event.PodSandboxStatus.Metadata == nil {
			klog.ErrorS(nil, "Evented PLEG: received ContainerEventResponse with nil PodSandboxStatus or PodSandboxStatus.Metadata", "containerEventResponse", event)
			continue
		}

		podID := types.UID(event.PodSandboxStatus.Metadata.Uid)
		shouldSendPLEGEvent := false

		status, err := e.runtime.GeneratePodStatus(event)
		if err != nil {
			// nolint:logcheck // Not using the result of klog.V inside the
			// if branch is okay, we just use it to determine whether the
			// additional "podStatus" key and its value should be added.
			if klog.V(6).Enabled() {
				klog.ErrorS(err, "Evented PLEG: error generating pod status from the received event", "podUID", podID, "podStatus", status)
			} else {
				klog.ErrorS(err, "Evented PLEG: error generating pod status from the received event", "podUID", podID)
			}
		} else {
			if klogV := klog.V(6); klogV.Enabled() {
				klogV.InfoS("Evented PLEG: Generated pod status from the received event", "podUID", podID, "podStatus", status)
			} else {
				klog.V(4).InfoS("Evented PLEG: Generated pod status from the received event", "podUID", podID)
			}
			// Preserve the pod IP across cache updates if the new IP is empty.
			// When a pod is torn down, kubelet may race with PLEG and retrieve
			// a pod status after network teardown, but the kubernetes API expects
			// the completed pod's IP to be available after the pod is dead.
			status.IPs = e.getPodIPs(podID, status)
		}

		e.updateRunningPodMetric(status)
		e.updateRunningContainerMetric(status)
		e.updateLatencyMetric(event)

		if event.ContainerEventType == runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT {
			for _, sandbox := range status.SandboxStatuses {
				if sandbox.Id == event.ContainerId {
					// When the CONTAINER_DELETED_EVENT is received by the kubelet,
					// the runtime has indicated that the container has been removed
					// by the runtime and hence, it must be removed from the cache
					// of the kubelet too.
					e.cache.Delete(podID)
				}
			}
			shouldSendPLEGEvent = true
		} else {
			if e.cache.Set(podID, status, err, time.Unix(event.GetCreatedAt(), 0)) {
				shouldSendPLEGEvent = true
			}
		}

		if shouldSendPLEGEvent {
			e.processCRIEvent(event)
		}
	}
}

func (e *EventedPLEG) processCRIEvent(event *runtimeapi.ContainerEventResponse) {
	switch event.ContainerEventType {
	case runtimeapi.ContainerEventType_CONTAINER_STOPPED_EVENT:
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerDied, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Stopped Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_CREATED_EVENT:
		// We only need to update the pod status on container create.
		// But we don't have to generate any PodLifeCycleEvent. Container creation related
		// PodLifeCycleEvent is ignored by the existing Generic PLEG as well.
		// https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L88 and
		// https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L273
		klog.V(4).InfoS("Received Container Created Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_STARTED_EVENT:
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerStarted, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Started Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT:
		// In case the pod is deleted it is safe to generate both ContainerDied and ContainerRemoved events, just like in the case of
		// Generic PLEG. https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L169
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerDied, Data: event.ContainerId})
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerRemoved, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Deleted Event", "event", event.String())
	}
}
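
// In summary, the CRI-to-PLEG event mapping implemented above is:
//
//	CONTAINER_CREATED_EVENT -> no PodLifecycleEvent (cache update only)
//	CONTAINER_STARTED_EVENT -> ContainerStarted
//	CONTAINER_STOPPED_EVENT -> ContainerDied
//	CONTAINER_DELETED_EVENT -> ContainerDied followed by ContainerRemoved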

func (e *EventedPLEG) getPodIPs(pid types.UID, status *kubecontainer.PodStatus) []string {
	if len(status.IPs) != 0 {
		return status.IPs
	}

	oldStatus, err := e.cache.Get(pid)
	if err != nil || len(oldStatus.IPs) == 0 {
		return nil
	}

	for _, sandboxStatus := range status.SandboxStatuses {
		// If at least one sandbox is ready, then use this status update's pod IP.
		if sandboxStatus.State == runtimeapi.PodSandboxState_SANDBOX_READY {
			return status.IPs
		}
	}

	// For pods with no ready containers or sandboxes (like exited pods),
	// use the old status' pod IP.
	return oldStatus.IPs
}
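
// For example: while a pod is being torn down, an event may arrive whose
// generated status has empty IPs and no sandbox in SANDBOX_READY state; the
// IPs from the previously cached status are returned instead, so the
// completed pod's IP stays visible through the API.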

func (e *EventedPLEG) sendPodLifecycleEvent(event *PodLifecycleEvent) {
	select {
	case e.eventChannel <- event:
	default:
		// Record how many events were discarded because the channel ran out of capacity.
		metrics.PLEGDiscardEvents.Inc()
		klog.ErrorS(nil, "Evented PLEG: Event channel is full, discarded pod lifecycle event")
	}
}

// getPodSandboxState returns the running/exited state of a pod, derived from
// the state of its sandbox container.
func getPodSandboxState(podStatus *kubecontainer.PodStatus) kubecontainer.State {
	var sandboxId string
	for _, sandbox := range podStatus.SandboxStatuses {
		sandboxId = sandbox.Id
		// A pod must contain only one sandbox.
		break
	}

	for _, containerStatus := range podStatus.ContainerStatuses {
		if containerStatus.ID.ID == sandboxId {
			if containerStatus.State == kubecontainer.ContainerStateRunning {
				return containerStatus.State
			}
		}
	}
	return kubecontainer.ContainerStateExited
}

func (e *EventedPLEG) updateRunningPodMetric(podStatus *kubecontainer.PodStatus) {
	cachedPodStatus, err := e.cache.Get(podStatus.ID)
	if err != nil {
		klog.ErrorS(err, "Evented PLEG: Get cache", "podID", podStatus.ID)
	}
	// Cache miss condition: the cached pod status object will have an empty state if the pod was missed in the cache.
	if len(cachedPodStatus.SandboxStatuses) < 1 {
		sandboxState := getPodSandboxState(podStatus)
		if sandboxState == kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Inc()
		}
	} else {
		oldSandboxState := getPodSandboxState(cachedPodStatus)
		currentSandboxState := getPodSandboxState(podStatus)

		if oldSandboxState == kubecontainer.ContainerStateRunning && currentSandboxState != kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Dec()
		} else if oldSandboxState != kubecontainer.ContainerStateRunning && currentSandboxState == kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Inc()
		}
	}
}

// getContainerStateCount returns the number of containers in each state.
func getContainerStateCount(podStatus *kubecontainer.PodStatus) map[kubecontainer.State]int {
	containerStateCount := make(map[kubecontainer.State]int)
	for _, container := range podStatus.ContainerStatuses {
		containerStateCount[container.State]++
	}
	return containerStateCount
}

func (e *EventedPLEG) updateRunningContainerMetric(podStatus *kubecontainer.PodStatus) {
	cachedPodStatus, err := e.cache.Get(podStatus.ID)
	if err != nil {
		klog.ErrorS(err, "Evented PLEG: Get cache", "podID", podStatus.ID)
	}

	// Cache miss condition: the cached pod status object will have an empty state if the pod was missed in the cache.
	if len(cachedPodStatus.SandboxStatuses) < 1 {
		containerStateCount := getContainerStateCount(podStatus)
		for state, count := range containerStateCount {
			// Add the currently obtained count.
			metrics.RunningContainerCount.WithLabelValues(string(state)).Add(float64(count))
		}
	} else {
		oldContainerStateCount := getContainerStateCount(cachedPodStatus)
		currentContainerStateCount := getContainerStateCount(podStatus)

		// The old and new sets of container states may vary;
		// get a unique set of container states combining both.
		containerStates := make(map[kubecontainer.State]bool)
		for state := range oldContainerStateCount {
			containerStates[state] = true
		}
		for state := range currentContainerStateCount {
			containerStates[state] = true
		}

		// Update the metric via the difference between the old and current counts.
		for state := range containerStates {
			diff := currentContainerStateCount[state] - oldContainerStateCount[state]
			metrics.RunningContainerCount.WithLabelValues(string(state)).Add(float64(diff))
		}
	}
}
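
// A worked example of the diff-based update above (the counts are
// illustrative): if the cached status has {running: 2, exited: 1} and the
// current status has {running: 1, exited: 2}, the gauges are adjusted by
// Add(-1) for "running" and Add(+1) for "exited", keeping per-state totals
// accurate without a full recount.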

func (e *EventedPLEG) updateLatencyMetric(event *runtimeapi.ContainerEventResponse) {
	duration := time.Duration(time.Now().UnixNano() - event.CreatedAt)
	metrics.EventedPLEGConnLatency.Observe(duration.Seconds())
}

// UpdateCache is not implemented for the Evented PLEG.
func (e *EventedPLEG) UpdateCache(pod *kubecontainer.Pod, pid types.UID) (error, bool) {
	return fmt.Errorf("not implemented"), false
}