k8s.io/kubernetes@v1.29.3/pkg/kubelet/pleg/evented.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package pleg

import (
	"fmt"
	"sync"
	"time"

	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
	internalapi "k8s.io/cri-api/pkg/apis"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/utils/clock"
)

// globalCacheUpdatePeriod is the period at which the global timestamp of the
// cache is updated. If pod workers get stuck in a cache.GetNewerThan call,
// they will be unblocked after this period.
const globalCacheUpdatePeriod = 5 * time.Second

var (
	eventedPLEGUsage   = false
	eventedPLEGUsageMu = sync.RWMutex{}
)

// isEventedPLEGInUse indicates whether the Evented PLEG is in use. Even after
// enabling the Evented PLEG feature gate, there could be several reasons it may
// not be in use, e.g. streaming data issues from the runtime, or the runtime not
// implementing the container events stream.
func isEventedPLEGInUse() bool {
	eventedPLEGUsageMu.RLock()
	defer eventedPLEGUsageMu.RUnlock()
	return eventedPLEGUsage
}

// setEventedPLEGUsage should only be called from
// Start/Stop of the Evented PLEG.
func setEventedPLEGUsage(enable bool) {
	eventedPLEGUsageMu.Lock()
	defer eventedPLEGUsageMu.Unlock()
	eventedPLEGUsage = enable
}

type EventedPLEG struct {
	// The container runtime.
	runtime kubecontainer.Runtime
	// The runtime service.
	runtimeService internalapi.RuntimeService
	// The channel from which the subscriber listens for events.
	eventChannel chan *PodLifecycleEvent
	// Cache for storing the runtime states required for syncing pods.
	cache kubecontainer.Cache
	// For testability.
	clock clock.Clock
	// GenericPLEG is used to force a relist when required.
	genericPleg podLifecycleEventGeneratorHandler
	// The maximum number of retries when getting container events from the runtime.
	eventedPlegMaxStreamRetries int
	// Relisting-related parameters (period and threshold).
	relistDuration *RelistDuration
	// Stop the Evented PLEG by closing the channel.
	stopCh chan struct{}
	// Stops the periodic update of the cache global timestamp.
	stopCacheUpdateCh chan struct{}
	// Locks the start/stop operation of the Evented PLEG.
	runningMu sync.Mutex
}

// NewEventedPLEG instantiates a new EventedPLEG object and returns it.
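//
// The sketch below is for illustration only; it is not the kubelet's actual
// wiring, and it assumes the caller already holds a runtime, runtime service,
// cache, Generic PLEG and relist duration. The channel capacity and retry
// count shown here are assumptions rather than the values the kubelet uses:
//
//	events := make(chan *PodLifecycleEvent, 1000) // capacity is an assumption
//	evented, err := NewEventedPLEG(runtime, runtimeService, events, cache,
//		genericPleg, 5 /* assumed max stream retries */, relistDuration, clock.RealClock{})
//	if err != nil {
//		// genericPleg does not implement podLifecycleEventGeneratorHandler.
//	}
//	evented.Start()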
func NewEventedPLEG(runtime kubecontainer.Runtime, runtimeService internalapi.RuntimeService, eventChannel chan *PodLifecycleEvent,
	cache kubecontainer.Cache, genericPleg PodLifecycleEventGenerator, eventedPlegMaxStreamRetries int,
	relistDuration *RelistDuration, clock clock.Clock) (PodLifecycleEventGenerator, error) {
	handler, ok := genericPleg.(podLifecycleEventGeneratorHandler)
	if !ok {
		return nil, fmt.Errorf("%v doesn't implement podLifecycleEventGeneratorHandler interface", genericPleg)
	}
	return &EventedPLEG{
		runtime:                     runtime,
		runtimeService:              runtimeService,
		eventChannel:                eventChannel,
		cache:                       cache,
		genericPleg:                 handler,
		eventedPlegMaxStreamRetries: eventedPlegMaxStreamRetries,
		relistDuration:              relistDuration,
		clock:                       clock,
	}, nil
}

// Watch returns a channel from which the subscriber can receive PodLifecycleEvent events.
func (e *EventedPLEG) Watch() chan *PodLifecycleEvent {
	return e.eventChannel
}

// Relist relists all containers using GenericPLEG
func (e *EventedPLEG) Relist() {
	e.genericPleg.Relist()
}

// Start starts the Evented PLEG
func (e *EventedPLEG) Start() {
	e.runningMu.Lock()
	defer e.runningMu.Unlock()
	if isEventedPLEGInUse() {
		return
	}
	setEventedPLEGUsage(true)
	e.stopCh = make(chan struct{})
	e.stopCacheUpdateCh = make(chan struct{})
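	// Restart watchEventsChannel immediately whenever it returns (period 0) and
	// refresh the cache's global timestamp every globalCacheUpdatePeriod, until
	// the corresponding stop channels are closed.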
	go wait.Until(e.watchEventsChannel, 0, e.stopCh)
	go wait.Until(e.updateGlobalCache, globalCacheUpdatePeriod, e.stopCacheUpdateCh)
}

// Stop stops the Evented PLEG
func (e *EventedPLEG) Stop() {
	e.runningMu.Lock()
	defer e.runningMu.Unlock()
	if !isEventedPLEGInUse() {
		return
	}
	setEventedPLEGUsage(false)
	close(e.stopCh)
	close(e.stopCacheUpdateCh)
}

// In case the Evented PLEG experiences undetectable issues in the underlying
// gRPC connection, there is a remote chance the pod might get stuck in a
// given state while it has progressed in its life cycle. This function will be
// called periodically to update the global timestamp of the cache so that those
// pods stuck at GetNewerThan in pod workers will get unstuck.
func (e *EventedPLEG) updateGlobalCache() {
	e.cache.UpdateTime(time.Now())
}

// Update updates the relisting period and threshold.
func (e *EventedPLEG) Update(relistDuration *RelistDuration) {
	e.genericPleg.Update(relistDuration)
}

// Healthy checks if the PLEG is working properly.
func (e *EventedPLEG) Healthy() (bool, error) {
	// GenericPLEG is declared unhealthy when relisting time is more
	// than the relistThreshold. In case EventedPLEG is turned on,
	// relistingPeriod and relistingThreshold are adjusted to higher
	// values. So the health check of Generic PLEG should check
	// the adjusted values of relistingPeriod and relistingThreshold.

	// EventedPLEG is declared unhealthy only if eventChannel is out of capacity.
	if len(e.eventChannel) == cap(e.eventChannel) {
		return false, fmt.Errorf("EventedPLEG: pleg event channel capacity is full with %v events", len(e.eventChannel))
	}

	timestamp := e.clock.Now()
	metrics.PLEGLastSeen.Set(float64(timestamp.Unix()))
	return true, nil
}

func (e *EventedPLEG) watchEventsChannel() {
	containerEventsResponseCh := make(chan *runtimeapi.ContainerEventResponse, cap(e.eventChannel))
	defer close(containerEventsResponseCh)

	// Get the container events from the runtime.
	go func() {
		numAttempts := 0
		for {
			if numAttempts >= e.eventedPlegMaxStreamRetries {
				if isEventedPLEGInUse() {
					// Fall back to Generic PLEG relisting since Evented PLEG is not working.
					klog.V(4).InfoS("Fall back to Generic PLEG relisting since Evented PLEG is not working")
					e.Stop()
					e.genericPleg.Stop()       // Stop the existing Generic PLEG which runs with longer relisting period when Evented PLEG is in use.
					e.Update(e.relistDuration) // Update the relisting period to the default value for the Generic PLEG.
					e.genericPleg.Start()
					break
				}
			}

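			// GetContainerEvents streams CRI container events from the runtime
			// into containerEventsResponseCh and is expected to return only when
			// the stream breaks, so a non-nil error here means the connection to
			// the runtime was lost and another attempt is needed.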
			err := e.runtimeService.GetContainerEvents(containerEventsResponseCh)
			if err != nil {
				metrics.EventedPLEGConnErr.Inc()
				numAttempts++
				e.Relist() // Force a relist to get the latest container and pods running metric.
				klog.V(4).InfoS("Evented PLEG: Failed to get container events, retrying", "err", err)
			}
		}
	}()

	if isEventedPLEGInUse() {
		e.processCRIEvents(containerEventsResponseCh)
	}
}

func (e *EventedPLEG) processCRIEvents(containerEventsResponseCh chan *runtimeapi.ContainerEventResponse) {
	for event := range containerEventsResponseCh {
		// Ignore the event if PodSandboxStatus is nil.
		// This might happen under some race condition where the podSandbox has
		// been deleted, and therefore the container runtime couldn't find the
		// podSandbox for the container when generating the event.
		// It is safe to ignore because
		// a) an event would have been received for the sandbox deletion,
		// b) in the worst case, a relist will eventually sync the pod status.
		// TODO(#114371): Figure out a way to handle this case instead of ignoring.
		if event.PodSandboxStatus == nil || event.PodSandboxStatus.Metadata == nil {
			klog.ErrorS(nil, "Evented PLEG: received ContainerEventResponse with nil PodSandboxStatus or PodSandboxStatus.Metadata", "containerEventResponse", event)
			continue
		}

		podID := types.UID(event.PodSandboxStatus.Metadata.Uid)
		shouldSendPLEGEvent := false

		status, err := e.runtime.GeneratePodStatus(event)
		if err != nil {
			// nolint:logcheck // Not using the result of klog.V inside the
			// if branch is okay, we just use it to determine whether the
			// additional "podStatus" key and its value should be added.
			if klog.V(6).Enabled() {
				klog.ErrorS(err, "Evented PLEG: error generating pod status from the received event", "podUID", podID, "podStatus", status)
			} else {
				klog.ErrorS(err, "Evented PLEG: error generating pod status from the received event", "podUID", podID)
			}
		} else {
			if klogV := klog.V(6); klogV.Enabled() {
				klogV.InfoS("Evented PLEG: Generated pod status from the received event", "podUID", podID, "podStatus", status)
			} else {
				klog.V(4).InfoS("Evented PLEG: Generated pod status from the received event", "podUID", podID)
			}
			// Preserve the pod IP across cache updates if the new IP is empty.
			// When a pod is torn down, the kubelet may race with PLEG and retrieve
			// a pod status after network teardown, but the Kubernetes API expects
			// the completed pod's IP to be available after the pod is dead.
			status.IPs = e.getPodIPs(podID, status)
		}

		e.updateRunningPodMetric(status)
		e.updateRunningContainerMetric(status)
		e.updateLatencyMetric(event)

		if event.ContainerEventType == runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT {
			for _, sandbox := range status.SandboxStatuses {
				if sandbox.Id == event.ContainerId {
					// When the CONTAINER_DELETED_EVENT is received by the kubelet,
					// the runtime has indicated that the container has been removed
					// and hence it must be removed from the kubelet's cache too.
					e.cache.Delete(podID)
				}
			}
			shouldSendPLEGEvent = true
		} else {
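			// For any other event type, update the cache entry using the event's
			// own timestamp; a PLEG event is sent to the subscriber only when
			// cache.Set reports that the cached status was actually updated.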
			if e.cache.Set(podID, status, err, time.Unix(event.GetCreatedAt(), 0)) {
				shouldSendPLEGEvent = true
			}
		}

		if shouldSendPLEGEvent {
			e.processCRIEvent(event)
		}
	}
}

func (e *EventedPLEG) processCRIEvent(event *runtimeapi.ContainerEventResponse) {
	switch event.ContainerEventType {
	case runtimeapi.ContainerEventType_CONTAINER_STOPPED_EVENT:
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerDied, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Stopped Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_CREATED_EVENT:
		// We only need to update the pod status on container create, but we
		// don't have to generate any PodLifecycleEvent. Container-creation-related
		// PodLifecycleEvents are ignored by the existing Generic PLEG as well.
		// https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L88 and
		// https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L273
		klog.V(4).InfoS("Received Container Created Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_STARTED_EVENT:
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerStarted, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Started Event", "event", event.String())
	case runtimeapi.ContainerEventType_CONTAINER_DELETED_EVENT:
		// In case the pod is deleted, it is safe to generate both ContainerDied and ContainerRemoved events, just like in the case of
		// the Generic PLEG. https://github.com/kubernetes/kubernetes/blob/24753aa8a4df8d10bfd6330e0f29186000c018be/pkg/kubelet/pleg/generic.go#L169
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerDied, Data: event.ContainerId})
		e.sendPodLifecycleEvent(&PodLifecycleEvent{ID: types.UID(event.PodSandboxStatus.Metadata.Uid), Type: ContainerRemoved, Data: event.ContainerId})
		klog.V(4).InfoS("Received Container Deleted Event", "event", event)
	}
}

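// getPodIPs returns the IPs for the pod described by status. If the new status
// carries no IPs and no sandbox is ready, it falls back to the IPs previously
// stored in the cache (e.g. for exited pods whose network has already been
// torn down), so that the completed pod's IP remains available.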
func (e *EventedPLEG) getPodIPs(pid types.UID, status *kubecontainer.PodStatus) []string {
	if len(status.IPs) != 0 {
		return status.IPs
	}

	oldStatus, err := e.cache.Get(pid)
	if err != nil || len(oldStatus.IPs) == 0 {
		return nil
	}

	for _, sandboxStatus := range status.SandboxStatuses {
		// If at least one sandbox is ready, then use this status update's pod IP.
		if sandboxStatus.State == runtimeapi.PodSandboxState_SANDBOX_READY {
			return status.IPs
		}
	}

	// For pods with no ready containers or sandboxes (like exited pods),
	// use the old status' pod IP.
	return oldStatus.IPs
}

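// sendPodLifecycleEvent forwards the event to the subscriber with a
// non-blocking send: if the event channel is already full, the event is
// dropped and counted in the discard metric rather than blocking the
// event-processing loop.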
func (e *EventedPLEG) sendPodLifecycleEvent(event *PodLifecycleEvent) {
	select {
	case e.eventChannel <- event:
	default:
		// Record how many events were discarded due to the channel being out of capacity.
		metrics.PLEGDiscardEvents.Inc()
		klog.ErrorS(nil, "Evented PLEG: Event channel is full, discarded pod lifecycle event")
	}
}

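// getPodSandboxState derives the sandbox state of the pod: it takes the pod's
// (single) sandbox ID, looks for a container status entry with that ID, and
// reports running if that entry is in the running state; otherwise the sandbox
// is treated as exited.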
func getPodSandboxState(podStatus *kubecontainer.PodStatus) kubecontainer.State {
	var sandboxId string
	for _, sandbox := range podStatus.SandboxStatuses {
		sandboxId = sandbox.Id
		// A pod must contain only one sandbox.
		break
	}

	for _, containerStatus := range podStatus.ContainerStatuses {
		if containerStatus.ID.ID == sandboxId {
			if containerStatus.State == kubecontainer.ContainerStateRunning {
				return containerStatus.State
			}
		}
	}
	return kubecontainer.ContainerStateExited
}

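// updateRunningPodMetric keeps the running-pod count metric in sync: on a
// cache miss it counts the pod if its sandbox is running, otherwise it adjusts
// the count based on the transition between the cached sandbox state and the
// state derived from the newly generated status.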
func (e *EventedPLEG) updateRunningPodMetric(podStatus *kubecontainer.PodStatus) {
	cachedPodStatus, err := e.cache.Get(podStatus.ID)
	if err != nil {
		klog.ErrorS(err, "Evented PLEG: Get cache", "podID", podStatus.ID)
	}
	// Cache miss condition: the pod status object will have an empty state if it is missing from the cache.
	if len(cachedPodStatus.SandboxStatuses) < 1 {
		sandboxState := getPodSandboxState(podStatus)
		if sandboxState == kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Inc()
		}
	} else {
		oldSandboxState := getPodSandboxState(cachedPodStatus)
		currentSandboxState := getPodSandboxState(podStatus)

		if oldSandboxState == kubecontainer.ContainerStateRunning && currentSandboxState != kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Dec()
		} else if oldSandboxState != kubecontainer.ContainerStateRunning && currentSandboxState == kubecontainer.ContainerStateRunning {
			metrics.RunningPodCount.Inc()
		}
	}
}

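// getContainerStateCount tallies the pod's containers by their current state.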
func getContainerStateCount(podStatus *kubecontainer.PodStatus) map[kubecontainer.State]int {
	containerStateCount := make(map[kubecontainer.State]int)
	for _, container := range podStatus.ContainerStatuses {
		containerStateCount[container.State]++
	}
	return containerStateCount
}

func (e *EventedPLEG) updateRunningContainerMetric(podStatus *kubecontainer.PodStatus) {
	cachedPodStatus, err := e.cache.Get(podStatus.ID)
	if err != nil {
		klog.ErrorS(err, "Evented PLEG: Get cache", "podID", podStatus.ID)
	}

	// Cache miss condition: the pod status object will have an empty state if it is missing from the cache.
	if len(cachedPodStatus.SandboxStatuses) < 1 {
		containerStateCount := getContainerStateCount(podStatus)
		for state, count := range containerStateCount {
			// Add the currently obtained count.
			metrics.RunningContainerCount.WithLabelValues(string(state)).Add(float64(count))
		}
	} else {
		oldContainerStateCount := getContainerStateCount(cachedPodStatus)
		currentContainerStateCount := getContainerStateCount(podStatus)

		// The old and new sets of container states may differ;
		// get a unique set of container states combining both.
		containerStates := make(map[kubecontainer.State]bool)
		for state := range oldContainerStateCount {
			containerStates[state] = true
		}
		for state := range currentContainerStateCount {
			containerStates[state] = true
		}

		// Update the metric via the difference between the old and current counts.
		for state := range containerStates {
			diff := currentContainerStateCount[state] - oldContainerStateCount[state]
			metrics.RunningContainerCount.WithLabelValues(string(state)).Add(float64(diff))
		}
	}
}

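// updateLatencyMetric records how long the CRI event took to reach the kubelet,
// assuming event.CreatedAt is a nanosecond timestamp set by the runtime when
// the event was generated.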
func (e *EventedPLEG) updateLatencyMetric(event *runtimeapi.ContainerEventResponse) {
	duration := time.Duration(time.Now().UnixNano()-event.CreatedAt) * time.Nanosecond
	metrics.EventedPLEGConnLatency.Observe(duration.Seconds())
}

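// UpdateCache is not supported by the Evented PLEG; targeted, per-pod cache
// updates are only performed by the Generic PLEG, so this presumably exists
// just to satisfy the shared PLEG handler interface.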
func (e *EventedPLEG) UpdateCache(pod *kubecontainer.Pod, pid types.UID) (error, bool) {
	return fmt.Errorf("not implemented"), false
}