k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/util/wait_for_pods.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package util 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 "time" 24 25 v1 "k8s.io/api/core/v1" 26 "k8s.io/klog/v2" 27 ) 28 29 type scalingFormat int 30 31 const ( 32 uninitialized scalingFormat = iota 33 up 34 down 35 none 36 ) 37 38 // WaitForPodOptions is an options used by WaitForPods methods. 39 type WaitForPodOptions struct { 40 DesiredPodCount func() int 41 CountErrorMargin int 42 CallerName string 43 WaitForPodsInterval time.Duration 44 45 // IsPodUpdated can be used to detect which pods have been already updated. 46 // nil value means all pods are updated. 47 IsPodUpdated func(*v1.Pod) error 48 } 49 50 // PodLister is an interface around listing pods. 51 type PodLister interface { 52 List() ([]*v1.Pod, error) 53 String() string 54 } 55 56 // WaitForPods waits till desired number of pods is running. 57 // The current set of pods are fetched by calling List() on the provided PodStore. 58 // In the case of failure returns list of pods that were in unexpected state 59 func WaitForPods(ctx context.Context, ps PodLister, options *WaitForPodOptions) (*PodsStatus, error) { 60 var timeout time.Duration 61 if deadline, hasDeadline := ctx.Deadline(); hasDeadline { 62 timeout = time.Until(deadline) 63 } 64 klog.V(2).Infof("%s: %s: starting with timeout: %v", options.CallerName, ps.String(), timeout) 65 oldPods, err := ps.List() 66 if err != nil { 67 return nil, fmt.Errorf("failed to list pods: %w", err) 68 } 69 scaling := uninitialized 70 var oldPodsStatus PodsStartupStatus 71 var lastIsPodUpdatedError error 72 73 for { 74 select { 75 case <-ctx.Done(): 76 pods := ComputePodsStatus(oldPods) 77 if ctx.Err() == context.DeadlineExceeded { 78 desiredPodCount := options.DesiredPodCount() 79 80 klog.V(2).Infof("%s: %s: expected %d pods, got %d pods (not RunningAndReady pods: %v)", options.CallerName, ps.String(), desiredPodCount, len(oldPods), pods.NotRunningAndReady()) 81 klog.V(2).Infof("%s: %s: all pods: %v", options.CallerName, ps.String(), pods) 82 klog.V(2).Infof("%s: %s: last IsPodUpdated error: %v", options.CallerName, ps.String(), lastIsPodUpdatedError) 83 // In case of scaling down we expect unhealth pods to be in TERMINATING state 84 // If we end up with more than expected pods and they are all in RunningAndReady state 85 // we won't report them to the user 86 return pods.NotRunningAndReady(), fmt.Errorf("got %w while waiting for %d pods to be running in %s - summary of pods : %s", ctx.Err(), 87 desiredPodCount, ps.String(), oldPodsStatus.String()) 88 } 89 return pods.NotRunningAndReady(), ctx.Err() 90 91 case <-time.After(options.WaitForPodsInterval): 92 desiredPodCount := options.DesiredPodCount() 93 94 switch { 95 case len(oldPods) == desiredPodCount: 96 scaling = none 97 case len(oldPods) < desiredPodCount: 98 scaling = up 99 case len(oldPods) > desiredPodCount: 100 scaling = down 101 } 102 103 pods, err := ps.List() 104 if err != nil { 105 return nil, fmt.Errorf("failed to list pods: %w", err) 106 } 107 podsStatus := ComputePodsStartupStatus(pods, desiredPodCount, options.IsPodUpdated) 108 if podsStatus.LastIsPodUpdatedError != nil { 109 lastIsPodUpdatedError = podsStatus.LastIsPodUpdatedError 110 } 111 112 diff := DiffPods(oldPods, pods) 113 deletedPods := diff.DeletedPods() 114 if scaling == up && len(deletedPods) > 0 { 115 klog.Warningf("%s: %s: %d pods disappeared: %v", options.CallerName, ps.String(), len(deletedPods), strings.Join(deletedPods, ", ")) 116 } 117 addedPods := diff.AddedPods() 118 if scaling == down && len(addedPods) > 0 { 119 klog.Warningf("%s: %s: %d pods appeared: %v", options.CallerName, ps.String(), len(addedPods), strings.Join(addedPods, ", ")) 120 } 121 if podsStatus.String() != oldPodsStatus.String() { 122 klog.V(2).Infof("%s: %s: %s", options.CallerName, ps.String(), podsStatus.String()) 123 } 124 // We allow inactive pods (e.g. eviction happened). 125 // We wait until there is a desired number of pods running and all other pods are inactive. 126 if len(pods) == (podsStatus.Running+podsStatus.Inactive) && podsStatus.Running == podsStatus.RunningUpdated && podsStatus.RunningUpdated == desiredPodCount { 127 return nil, nil 128 } 129 // When using preemptibles on large scale, number of ready nodes is not stable and reaching DesiredPodCount could take a very long time. 130 // Overall number of pods (especially Inactive pods) should not grow unchecked. 131 if options.CountErrorMargin > 0 && podsStatus.RunningUpdated >= desiredPodCount-options.CountErrorMargin && len(pods)-podsStatus.Inactive <= desiredPodCount && podsStatus.Inactive <= options.CountErrorMargin { 132 return nil, nil 133 } 134 oldPods = pods 135 oldPodsStatus = podsStatus 136 } 137 } 138 }