k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/slos/pod_startup_latency.go

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package slos

import (
	"context"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/watch"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"
	"k8s.io/perf-tests/clusterloader2/pkg/errors"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer"
	"k8s.io/perf-tests/clusterloader2/pkg/util"
)

const (
	defaultPodStartupLatencyThreshold = 5 * time.Second
	defaultSchedulerName              = corev1.DefaultSchedulerName
	podStartupLatencyMeasurementName  = "PodStartupLatency"
	informerSyncTimeout               = time.Minute

	createPhase   = "create"
	schedulePhase = "schedule"
	runPhase      = "run"
	watchPhase    = "watch"
)

func init() {
	if err := measurement.Register(podStartupLatencyMeasurementName, createPodStartupLatencyMeasurement); err != nil {
		klog.Fatalf("cannot register %s: %v", podStartupLatencyMeasurementName, err)
	}
}

func createPodStartupLatencyMeasurement() measurement.Measurement {
	return &podStartupLatencyMeasurement{
		selector:          util.NewObjectSelector(),
		podStartupEntries: measurementutil.NewObjectTransitionTimes(podStartupLatencyMeasurementName),
		podMetadata:       measurementutil.NewPodsMetadata(podStartupLatencyMeasurementName),
		eventQueue:        workqueue.New(),
	}
}

type eventData struct {
	obj      interface{}
	recvTime time.Time
}

type podStartupLatencyMeasurement struct {
	selector  *util.ObjectSelector
	isRunning bool
	stopCh    chan struct{}
	// This queue can potentially grow indefinitely, because we put all changes here.
	// Usually this is not a recommended pattern, but we need it for measuring PodStartupLatency.
	eventQueue        *workqueue.Type
	podStartupEntries *measurementutil.ObjectTransitionTimes
	podMetadata       *measurementutil.PodsMetadata
	threshold         time.Duration
}

// Execute supports two actions:
// - start - Starts to observe pods and pod events.
// - gather - Gathers and prints current pod startup latency data.
// Does NOT support concurrency. Multiple calls to this measurement
// shouldn't be done within one step.
func (p *podStartupLatencyMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
	action, err := util.GetString(config.Params, "action")
	if err != nil {
		return nil, err
	}

	switch action {
	case "start":
		if err := p.selector.Parse(config.Params); err != nil {
			return nil, err
		}
		p.threshold, err = util.GetDurationOrDefault(config.Params, "threshold", defaultPodStartupLatencyThreshold)
		if err != nil {
			return nil, err
		}
		return nil, p.start(config.ClusterFramework.GetClientSets().GetClient())
	case "gather":
		schedulerName, err := util.GetStringOrDefault(config.Params, "schedulerName", defaultSchedulerName)
		if err != nil {
			return nil, err
		}
		return p.gather(config.ClusterFramework.GetClientSets().GetClient(), config.Identifier, schedulerName)
	default:
		return nil, fmt.Errorf("unknown action %v", action)
	}
}
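
// Illustrative sketch (not part of the original file): a ClusterLoader2 test
// config typically drives the two actions above with a "start" step before the
// pods are created and a "gather" step at the end. The identifier, selector,
// and threshold values below are assumptions made for the example, roughly:
//
//	- measurements:
//	  - Identifier: PodStartupLatency
//	    Method: PodStartupLatency
//	    Params:
//	      action: start
//	      labelSelector: group = test-pods
//	      threshold: 20s
//	# ... steps creating and running the pods ...
//	- measurements:
//	  - Identifier: PodStartupLatency
//	    Method: PodStartupLatency
//	    Params:
//	      action: gather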

// Dispose cleans up after the measurement.
func (p *podStartupLatencyMeasurement) Dispose() {
	p.stop()
}

// String returns string representation of this measurement.
func (p *podStartupLatencyMeasurement) String() string {
	return podStartupLatencyMeasurementName + ": " + p.selector.String()
}

func (p *podStartupLatencyMeasurement) start(c clientset.Interface) error {
	if p.isRunning {
		klog.V(2).Infof("%s: pod startup latency measurement already running", p)
		return nil
	}
	klog.V(2).Infof("%s: starting pod startup latency measurement...", p)
	p.isRunning = true
	p.stopCh = make(chan struct{})
	i := informer.NewInformer(
		&cache.ListWatch{
			ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
				p.selector.ApplySelectors(&options)
				return c.CoreV1().Pods(p.selector.Namespace).List(context.TODO(), options)
			},
			WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
				p.selector.ApplySelectors(&options)
				return c.CoreV1().Pods(p.selector.Namespace).Watch(context.TODO(), options)
			},
		},
		p.addEvent,
	)
	go p.processEvents()
	return informer.StartAndSync(i, p.stopCh, informerSyncTimeout)
}

func (p *podStartupLatencyMeasurement) addEvent(_, obj interface{}) {
	event := &eventData{obj: obj, recvTime: time.Now()}
	p.eventQueue.Add(event)
}

func (p *podStartupLatencyMeasurement) processEvents() {
	for p.processNextWorkItem() {
	}
}

func (p *podStartupLatencyMeasurement) processNextWorkItem() bool {
	item, quit := p.eventQueue.Get()
	if quit {
		return false
	}
	defer p.eventQueue.Done(item)

	event, ok := item.(*eventData)
	if !ok {
		klog.Warningf("Couldn't convert work item to eventData: %v", item)
		return true
	}
	p.processEvent(event)
	return true
}

func (p *podStartupLatencyMeasurement) stop() {
	if p.isRunning {
		p.isRunning = false
		close(p.stopCh)
		p.eventQueue.ShutDown()
	}
}

var podStartupTransitions = map[string]measurementutil.Transition{
	"create_to_schedule": {
		From: createPhase,
		To:   schedulePhase,
	},
	"schedule_to_run": {
		From: schedulePhase,
		To:   runPhase,
	},
	"run_to_watch": {
		From: runPhase,
		To:   watchPhase,
	},
	"schedule_to_watch": {
		From: schedulePhase,
		To:   watchPhase,
	},
	"pod_startup": {
		From: createPhase,
		To:   watchPhase,
	},
}
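
// Worked example (illustrative, with made-up timestamps): if a pod has
// creationTimestamp T, its scheduler event is recorded at T+1s, its containers
// report Running at T+3s, and the watch above observes the Running pod at
// T+3.2s, then the transitions computed from the recorded phases are:
//
//	create_to_schedule = 1s, schedule_to_run = 2s, run_to_watch = 0.2s,
//	schedule_to_watch = 2.2s, pod_startup = 3.2s
//
// Only pod_startup is checked against the configured threshold; see
// podStartupTransitionsWithThreshold below.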

func podStartupTransitionsWithThreshold(threshold time.Duration) map[string]measurementutil.Transition {
	result := make(map[string]measurementutil.Transition)
	for key, value := range podStartupTransitions {
		result[key] = value
	}
	podStartupTransition := result["pod_startup"]
	podStartupTransition.Threshold = threshold
	result["pod_startup"] = podStartupTransition
	return result
}

type podStartupLatencyCheck struct {
	namePrefix string
	filter     measurementutil.KeyFilterFunc
}

func (p *podStartupLatencyMeasurement) gather(c clientset.Interface, identifier string, schedulerName string) ([]measurement.Summary, error) {
	klog.V(2).Infof("%s: gathering pod startup latency measurement...", p)
	if !p.isRunning {
		return nil, fmt.Errorf("metric %s has not been started", podStartupLatencyMeasurementName)
	}

	p.stop()

	if err := p.gatherScheduleTimes(c, schedulerName); err != nil {
		return nil, err
	}

	checks := []podStartupLatencyCheck{
		{
			namePrefix: "",
			filter:     measurementutil.MatchAll,
		},
		{
			namePrefix: "Stateless",
			filter:     p.podMetadata.FilterStateless,
		},
		{
			namePrefix: "Stateful",
			filter:     p.podMetadata.FilterStateful,
		},
	}

	var summaries []measurement.Summary
	var err error
	for _, check := range checks {
		transitions := podStartupTransitionsWithThreshold(p.threshold)
		podStartupLatency := p.podStartupEntries.CalculateTransitionsLatency(transitions, check.filter)

		if slosErr := podStartupLatency["pod_startup"].VerifyThreshold(p.threshold); slosErr != nil {
			err = errors.NewMetricViolationError("pod startup", slosErr.Error())
			klog.Errorf("%s%s: %v", check.namePrefix, p, err)
		}

		content, jsonErr := util.PrettyPrintJSON(measurementutil.LatencyMapToPerfData(podStartupLatency))
		if jsonErr != nil {
			return nil, jsonErr
		}
		summaryName := fmt.Sprintf("%s%s_%s", check.namePrefix, podStartupLatencyMeasurementName, identifier)
		summaries = append(summaries, measurement.CreateSummary(summaryName, "json", content))
	}
	return summaries, err
}
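
// Output sketch (illustrative, not copied from the repo): each of the three
// checks above yields a summary named "<prefix>PodStartupLatency_<identifier>",
// e.g. "PodStartupLatency_load", "StatelessPodStartupLatency_load",
// "StatefulPodStartupLatency_load" (the "load" identifier is made up here).
// The JSON content is the perf-data form of the latency map, with one data
// item per transition, roughly of this shape (values invented):
//
//	{
//	  "dataItems": [
//	    {
//	      "data": {"Perc50": 1234.5, "Perc90": 2345.6, "Perc99": 3456.7},
//	      "unit": "ms",
//	      "labels": {"Metric": "pod_startup"}
//	    }
//	  ]
//	}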

// TODO(#2006): gatherScheduleTimes is currently listing events at the end of the test.
//
// Given that events by default have a 1h TTL, for measurements across longer periods
// it just returns incomplete results.
// Given that we don't need 100% accuracy, we should switch to a mechanism similar
// to the one that slo-monitor is using (added in #1477).
func (p *podStartupLatencyMeasurement) gatherScheduleTimes(c clientset.Interface, schedulerName string) error {
	selector := fields.Set{
		"involvedObject.kind": "Pod",
		"source":              schedulerName,
	}.AsSelector().String()
	options := metav1.ListOptions{FieldSelector: selector}
	schedEvents, err := c.CoreV1().Events(p.selector.Namespace).List(context.TODO(), options)
	if err != nil {
		return err
	}
	for _, event := range schedEvents.Items {
		key := createMetaNamespaceKey(event.InvolvedObject.Namespace, event.InvolvedObject.Name)
		if _, exists := p.podStartupEntries.Get(key, createPhase); exists {
			if !event.EventTime.IsZero() {
				p.podStartupEntries.Set(key, schedulePhase, event.EventTime.Time)
			} else {
				p.podStartupEntries.Set(key, schedulePhase, event.FirstTimestamp.Time)
			}
		}
	}
	return nil
}

func (p *podStartupLatencyMeasurement) processEvent(event *eventData) {
	obj, recvTime := event.obj, event.recvTime
	if obj == nil {
		return
	}
	pod, ok := obj.(*corev1.Pod)
	if !ok {
		return
	}

	key := createMetaNamespaceKey(pod.Namespace, pod.Name)
	p.podMetadata.SetStateless(key, isPodStateless(pod))

	if pod.Status.Phase == corev1.PodRunning {
		if _, found := p.podStartupEntries.Get(key, createPhase); !found {
			p.podStartupEntries.Set(key, watchPhase, recvTime)
			p.podStartupEntries.Set(key, createPhase, pod.CreationTimestamp.Time)
			var startTime metav1.Time
			for _, cs := range pod.Status.ContainerStatuses {
				if cs.State.Running != nil {
					if startTime.Before(&cs.State.Running.StartedAt) {
						startTime = cs.State.Running.StartedAt
					}
				}
			}
			if startTime != metav1.NewTime(time.Time{}) {
				p.podStartupEntries.Set(key, runPhase, startTime.Time)
			} else {
				klog.Errorf("%s: pod %v (%v) is reported to be running, but none of its containers is", p, pod.Name, pod.Namespace)
			}
		}
	}
}

func createMetaNamespaceKey(namespace, name string) string {
	return namespace + "/" + name
}

func isPodStateless(pod *corev1.Pod) bool {
	for _, volume := range pod.Spec.Volumes {
		if volume.EmptyDir != nil || volume.DownwardAPI != nil || volume.ConfigMap != nil || volume.Secret != nil || volume.Projected != nil {
			continue
		}
		klog.V(4).Infof("pod %s/%s classified as stateful", pod.Namespace, pod.Name)
		return false
	}
	return true
}
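
// Classification example (illustrative): a pod whose only volumes are an
// emptyDir and a configMap is counted as stateless, while a pod that mounts,
// for instance, a persistentVolumeClaim (or any other volume type not listed
// in the check above) is counted as stateful and feeds the "Stateful"
// summaries produced by gather.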