k8s.io/kubernetes@v1.29.3/pkg/kubelet/util/pod_startup_latency_tracker.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package util
    18  
    19  import (
    20  	"sync"
    21  	"time"
    22  
    23  	v1 "k8s.io/api/core/v1"
    24  	"k8s.io/apimachinery/pkg/types"
    25  	"k8s.io/klog/v2"
    26  	"k8s.io/kubernetes/pkg/kubelet/metrics"
    27  	"k8s.io/utils/clock"
    28  )
    29  
    30  // PodStartupLatencyTracker records key moments for startup latency calculation,
    31  // e.g. image pulling or pod observed running on watch.
    32  type PodStartupLatencyTracker interface {
    33  	ObservedPodOnWatch(pod *v1.Pod, when time.Time)
    34  	RecordImageStartedPulling(podUID types.UID)
    35  	RecordImageFinishedPulling(podUID types.UID)
    36  	RecordStatusUpdated(pod *v1.Pod)
    37  	DeletePodStartupState(podUID types.UID)
    38  }
    39  
    40  type basicPodStartupLatencyTracker struct {
    41  	// protect against concurrent read and write on pods map
    42  	lock sync.Mutex
    43  	pods map[types.UID]*perPodState
    44  	// For testability
    45  	clock clock.Clock
    46  }
    47  
    48  type perPodState struct {
    49  	firstStartedPulling time.Time
    50  	lastFinishedPulling time.Time
    51  	// first time, when pod status changed into Running
    52  	observedRunningTime time.Time
    53  	// log, if pod latency was already Observed
    54  	metricRecorded bool
    55  }
    56  
    57  // NewPodStartupLatencyTracker creates an instance of PodStartupLatencyTracker
    58  func NewPodStartupLatencyTracker() PodStartupLatencyTracker {
    59  	return &basicPodStartupLatencyTracker{
    60  		pods:  map[types.UID]*perPodState{},
    61  		clock: clock.RealClock{},
    62  	}
    63  }
    64  
    65  func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when time.Time) {
    66  	p.lock.Lock()
    67  	defer p.lock.Unlock()
    68  
    69  	// if the pod is terminal, we do not have to track it anymore for startup
    70  	if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
    71  		delete(p.pods, pod.UID)
    72  		return
    73  	}
    74  
    75  	state := p.pods[pod.UID]
    76  	if state == nil {
    77  		// create a new record for pod, only if it was not yet acknowledged by the Kubelet
    78  		// this is required, as we want to log metric only for those pods, that where scheduled
    79  		// after Kubelet started
    80  		if pod.Status.StartTime.IsZero() {
    81  			p.pods[pod.UID] = &perPodState{}
    82  		}
    83  
    84  		return
    85  	}
    86  
    87  	if state.observedRunningTime.IsZero() {
    88  		// skip, pod didn't start yet
    89  		return
    90  	}
    91  
    92  	if state.metricRecorded {
    93  		// skip, pod's latency already recorded
    94  		return
    95  	}
    96  
    97  	if hasPodStartedSLO(pod) {
    98  		podStartingDuration := when.Sub(pod.CreationTimestamp.Time)
    99  		imagePullingDuration := state.lastFinishedPulling.Sub(state.firstStartedPulling)
   100  		podStartSLOduration := (podStartingDuration - imagePullingDuration).Seconds()
   101  
   102  		klog.InfoS("Observed pod startup duration",
   103  			"pod", klog.KObj(pod),
   104  			"podStartSLOduration", podStartSLOduration,
   105  			"podStartE2EDuration", podStartingDuration,
   106  			"podCreationTimestamp", pod.CreationTimestamp.Time,
   107  			"firstStartedPulling", state.firstStartedPulling,
   108  			"lastFinishedPulling", state.lastFinishedPulling,
   109  			"observedRunningTime", state.observedRunningTime,
   110  			"watchObservedRunningTime", when)
   111  
   112  		metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration)
   113  		metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds())
   114  		state.metricRecorded = true
   115  	}
   116  }
   117  
   118  func (p *basicPodStartupLatencyTracker) RecordImageStartedPulling(podUID types.UID) {
   119  	p.lock.Lock()
   120  	defer p.lock.Unlock()
   121  
   122  	state := p.pods[podUID]
   123  	if state == nil {
   124  		return
   125  	}
   126  
   127  	if state.firstStartedPulling.IsZero() {
   128  		state.firstStartedPulling = p.clock.Now()
   129  	}
   130  }
   131  
   132  func (p *basicPodStartupLatencyTracker) RecordImageFinishedPulling(podUID types.UID) {
   133  	p.lock.Lock()
   134  	defer p.lock.Unlock()
   135  
   136  	state := p.pods[podUID]
   137  	if state == nil {
   138  		return
   139  	}
   140  
   141  	state.lastFinishedPulling = p.clock.Now() // Now is always grater than values from the past.
   142  }
   143  
   144  func (p *basicPodStartupLatencyTracker) RecordStatusUpdated(pod *v1.Pod) {
   145  	p.lock.Lock()
   146  	defer p.lock.Unlock()
   147  
   148  	state := p.pods[pod.UID]
   149  	if state == nil {
   150  		return
   151  	}
   152  
   153  	if state.metricRecorded {
   154  		// skip, pod latency already recorded
   155  		return
   156  	}
   157  
   158  	if !state.observedRunningTime.IsZero() {
   159  		// skip, pod already started
   160  		return
   161  	}
   162  
   163  	if hasPodStartedSLO(pod) {
   164  		klog.V(3).InfoS("Mark when the pod was running for the first time", "pod", klog.KObj(pod), "rv", pod.ResourceVersion)
   165  		state.observedRunningTime = p.clock.Now()
   166  	}
   167  }
   168  
   169  // hasPodStartedSLO, check if for given pod, each container has been started at least once
   170  //
   171  // This should reflect "Pod startup latency SLI" definition
   172  // ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md
   173  func hasPodStartedSLO(pod *v1.Pod) bool {
   174  	for _, cs := range pod.Status.ContainerStatuses {
   175  		if cs.State.Running == nil || cs.State.Running.StartedAt.IsZero() {
   176  			return false
   177  		}
   178  	}
   179  
   180  	return true
   181  }
   182  
   183  func (p *basicPodStartupLatencyTracker) DeletePodStartupState(podUID types.UID) {
   184  	p.lock.Lock()
   185  	defer p.lock.Unlock()
   186  
   187  	delete(p.pods, podUID)
   188  }