k8s.io/kubernetes@v1.29.3/pkg/kubelet/util/pod_startup_latency_tracker.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package util 18 19 import ( 20 "sync" 21 "time" 22 23 v1 "k8s.io/api/core/v1" 24 "k8s.io/apimachinery/pkg/types" 25 "k8s.io/klog/v2" 26 "k8s.io/kubernetes/pkg/kubelet/metrics" 27 "k8s.io/utils/clock" 28 ) 29 30 // PodStartupLatencyTracker records key moments for startup latency calculation, 31 // e.g. image pulling or pod observed running on watch. 32 type PodStartupLatencyTracker interface { 33 ObservedPodOnWatch(pod *v1.Pod, when time.Time) 34 RecordImageStartedPulling(podUID types.UID) 35 RecordImageFinishedPulling(podUID types.UID) 36 RecordStatusUpdated(pod *v1.Pod) 37 DeletePodStartupState(podUID types.UID) 38 } 39 40 type basicPodStartupLatencyTracker struct { 41 // protect against concurrent read and write on pods map 42 lock sync.Mutex 43 pods map[types.UID]*perPodState 44 // For testability 45 clock clock.Clock 46 } 47 48 type perPodState struct { 49 firstStartedPulling time.Time 50 lastFinishedPulling time.Time 51 // first time, when pod status changed into Running 52 observedRunningTime time.Time 53 // log, if pod latency was already Observed 54 metricRecorded bool 55 } 56 57 // NewPodStartupLatencyTracker creates an instance of PodStartupLatencyTracker 58 func NewPodStartupLatencyTracker() PodStartupLatencyTracker { 59 return &basicPodStartupLatencyTracker{ 60 pods: map[types.UID]*perPodState{}, 61 clock: clock.RealClock{}, 62 } 63 } 64 65 func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when time.Time) { 66 p.lock.Lock() 67 defer p.lock.Unlock() 68 69 // if the pod is terminal, we do not have to track it anymore for startup 70 if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded { 71 delete(p.pods, pod.UID) 72 return 73 } 74 75 state := p.pods[pod.UID] 76 if state == nil { 77 // create a new record for pod, only if it was not yet acknowledged by the Kubelet 78 // this is required, as we want to log metric only for those pods, that where scheduled 79 // after Kubelet started 80 if pod.Status.StartTime.IsZero() { 81 p.pods[pod.UID] = &perPodState{} 82 } 83 84 return 85 } 86 87 if state.observedRunningTime.IsZero() { 88 // skip, pod didn't start yet 89 return 90 } 91 92 if state.metricRecorded { 93 // skip, pod's latency already recorded 94 return 95 } 96 97 if hasPodStartedSLO(pod) { 98 podStartingDuration := when.Sub(pod.CreationTimestamp.Time) 99 imagePullingDuration := state.lastFinishedPulling.Sub(state.firstStartedPulling) 100 podStartSLOduration := (podStartingDuration - imagePullingDuration).Seconds() 101 102 klog.InfoS("Observed pod startup duration", 103 "pod", klog.KObj(pod), 104 "podStartSLOduration", podStartSLOduration, 105 "podStartE2EDuration", podStartingDuration, 106 "podCreationTimestamp", pod.CreationTimestamp.Time, 107 "firstStartedPulling", state.firstStartedPulling, 108 "lastFinishedPulling", state.lastFinishedPulling, 109 "observedRunningTime", state.observedRunningTime, 110 "watchObservedRunningTime", when) 111 112 metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration) 113 metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds()) 114 state.metricRecorded = true 115 } 116 } 117 118 func (p *basicPodStartupLatencyTracker) RecordImageStartedPulling(podUID types.UID) { 119 p.lock.Lock() 120 defer p.lock.Unlock() 121 122 state := p.pods[podUID] 123 if state == nil { 124 return 125 } 126 127 if state.firstStartedPulling.IsZero() { 128 state.firstStartedPulling = p.clock.Now() 129 } 130 } 131 132 func (p *basicPodStartupLatencyTracker) RecordImageFinishedPulling(podUID types.UID) { 133 p.lock.Lock() 134 defer p.lock.Unlock() 135 136 state := p.pods[podUID] 137 if state == nil { 138 return 139 } 140 141 state.lastFinishedPulling = p.clock.Now() // Now is always grater than values from the past. 142 } 143 144 func (p *basicPodStartupLatencyTracker) RecordStatusUpdated(pod *v1.Pod) { 145 p.lock.Lock() 146 defer p.lock.Unlock() 147 148 state := p.pods[pod.UID] 149 if state == nil { 150 return 151 } 152 153 if state.metricRecorded { 154 // skip, pod latency already recorded 155 return 156 } 157 158 if !state.observedRunningTime.IsZero() { 159 // skip, pod already started 160 return 161 } 162 163 if hasPodStartedSLO(pod) { 164 klog.V(3).InfoS("Mark when the pod was running for the first time", "pod", klog.KObj(pod), "rv", pod.ResourceVersion) 165 state.observedRunningTime = p.clock.Now() 166 } 167 } 168 169 // hasPodStartedSLO, check if for given pod, each container has been started at least once 170 // 171 // This should reflect "Pod startup latency SLI" definition 172 // ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md 173 func hasPodStartedSLO(pod *v1.Pod) bool { 174 for _, cs := range pod.Status.ContainerStatuses { 175 if cs.State.Running == nil || cs.State.Running.StartedAt.IsZero() { 176 return false 177 } 178 } 179 180 return true 181 } 182 183 func (p *basicPodStartupLatencyTracker) DeletePodStartupState(podUID types.UID) { 184 p.lock.Lock() 185 defer p.lock.Unlock() 186 187 delete(p.pods, podUID) 188 }