k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/util/pod_startup_latency_tracker.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package util 18 19 import ( 20 "sync" 21 "time" 22 23 v1 "k8s.io/api/core/v1" 24 "k8s.io/apimachinery/pkg/types" 25 "k8s.io/klog/v2" 26 "k8s.io/kubernetes/pkg/kubelet/metrics" 27 "k8s.io/utils/clock" 28 ) 29 30 // PodStartupLatencyTracker records key moments for startup latency calculation, 31 // e.g. image pulling or pod observed running on watch. 32 type PodStartupLatencyTracker interface { 33 ObservedPodOnWatch(pod *v1.Pod, when time.Time) 34 RecordImageStartedPulling(podUID types.UID) 35 RecordImageFinishedPulling(podUID types.UID) 36 RecordStatusUpdated(pod *v1.Pod) 37 DeletePodStartupState(podUID types.UID) 38 } 39 40 type basicPodStartupLatencyTracker struct { 41 // protect against concurrent read and write on pods map 42 lock sync.Mutex 43 pods map[types.UID]*perPodState 44 // metrics for the first network pod only 45 firstNetworkPodSeen bool 46 // For testability 47 clock clock.Clock 48 } 49 50 type perPodState struct { 51 firstStartedPulling time.Time 52 lastFinishedPulling time.Time 53 // first time, when pod status changed into Running 54 observedRunningTime time.Time 55 // log, if pod latency was already Observed 56 metricRecorded bool 57 } 58 59 // NewPodStartupLatencyTracker creates an instance of PodStartupLatencyTracker 60 func NewPodStartupLatencyTracker() PodStartupLatencyTracker { 61 return &basicPodStartupLatencyTracker{ 62 pods: map[types.UID]*perPodState{}, 63 clock: clock.RealClock{}, 64 } 65 } 66 67 func (p *basicPodStartupLatencyTracker) ObservedPodOnWatch(pod *v1.Pod, when time.Time) { 68 p.lock.Lock() 69 defer p.lock.Unlock() 70 71 // if the pod is terminal, we do not have to track it anymore for startup 72 if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded { 73 delete(p.pods, pod.UID) 74 return 75 } 76 77 state := p.pods[pod.UID] 78 if state == nil { 79 // create a new record for pod, only if it was not yet acknowledged by the Kubelet 80 // this is required, as we want to log metric only for those pods, that where scheduled 81 // after Kubelet started 82 if pod.Status.StartTime.IsZero() { 83 p.pods[pod.UID] = &perPodState{} 84 } 85 86 return 87 } 88 89 if state.observedRunningTime.IsZero() { 90 // skip, pod didn't start yet 91 return 92 } 93 94 if state.metricRecorded { 95 // skip, pod's latency already recorded 96 return 97 } 98 99 if hasPodStartedSLO(pod) { 100 podStartingDuration := when.Sub(pod.CreationTimestamp.Time) 101 imagePullingDuration := state.lastFinishedPulling.Sub(state.firstStartedPulling) 102 podStartSLOduration := (podStartingDuration - imagePullingDuration).Seconds() 103 104 klog.InfoS("Observed pod startup duration", 105 "pod", klog.KObj(pod), 106 "podStartSLOduration", podStartSLOduration, 107 "podStartE2EDuration", podStartingDuration, 108 "podCreationTimestamp", pod.CreationTimestamp.Time, 109 "firstStartedPulling", state.firstStartedPulling, 110 "lastFinishedPulling", state.lastFinishedPulling, 111 "observedRunningTime", state.observedRunningTime, 112 "watchObservedRunningTime", when) 113 114 metrics.PodStartSLIDuration.WithLabelValues().Observe(podStartSLOduration) 115 metrics.PodStartTotalDuration.WithLabelValues().Observe(podStartingDuration.Seconds()) 116 state.metricRecorded = true 117 // if is the first Pod with network track the start values 118 // these metrics will help to identify problems with the CNI plugin 119 if !pod.Spec.HostNetwork && !p.firstNetworkPodSeen { 120 metrics.FirstNetworkPodStartSLIDuration.Set(podStartSLOduration) 121 p.firstNetworkPodSeen = true 122 } 123 } 124 } 125 126 func (p *basicPodStartupLatencyTracker) RecordImageStartedPulling(podUID types.UID) { 127 p.lock.Lock() 128 defer p.lock.Unlock() 129 130 state := p.pods[podUID] 131 if state == nil { 132 return 133 } 134 135 if state.firstStartedPulling.IsZero() { 136 state.firstStartedPulling = p.clock.Now() 137 } 138 } 139 140 func (p *basicPodStartupLatencyTracker) RecordImageFinishedPulling(podUID types.UID) { 141 p.lock.Lock() 142 defer p.lock.Unlock() 143 144 state := p.pods[podUID] 145 if state == nil { 146 return 147 } 148 149 state.lastFinishedPulling = p.clock.Now() // Now is always grater than values from the past. 150 } 151 152 func (p *basicPodStartupLatencyTracker) RecordStatusUpdated(pod *v1.Pod) { 153 p.lock.Lock() 154 defer p.lock.Unlock() 155 156 state := p.pods[pod.UID] 157 if state == nil { 158 return 159 } 160 161 if state.metricRecorded { 162 // skip, pod latency already recorded 163 return 164 } 165 166 if !state.observedRunningTime.IsZero() { 167 // skip, pod already started 168 return 169 } 170 171 if hasPodStartedSLO(pod) { 172 klog.V(3).InfoS("Mark when the pod was running for the first time", "pod", klog.KObj(pod), "rv", pod.ResourceVersion) 173 state.observedRunningTime = p.clock.Now() 174 } 175 } 176 177 // hasPodStartedSLO, check if for given pod, each container has been started at least once 178 // 179 // This should reflect "Pod startup latency SLI" definition 180 // ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md 181 func hasPodStartedSLO(pod *v1.Pod) bool { 182 for _, cs := range pod.Status.ContainerStatuses { 183 if cs.State.Running == nil || cs.State.Running.StartedAt.IsZero() { 184 return false 185 } 186 } 187 188 return true 189 } 190 191 func (p *basicPodStartupLatencyTracker) DeletePodStartupState(podUID types.UID) { 192 p.lock.Lock() 193 defer p.lock.Unlock() 194 195 delete(p.pods, podUID) 196 }