k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/util/wait_for_pods.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package util
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  	"time"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/klog/v2"
    27  )
    28  
    29  type scalingFormat int
    30  
    31  const (
    32  	uninitialized scalingFormat = iota
    33  	up
    34  	down
    35  	none
    36  )
    37  
// WaitForPodOptions holds the options used by the WaitForPods method.
//
//   - DesiredPodCount returns the number of pods to wait for. WaitForPods calls
//     it on every poll and once more when reporting a timeout.
//   - CountErrorMargin, when greater than zero, lets WaitForPods succeed while
//     up to that many updated running pods are still missing, provided the
//     number of inactive pods does not exceed the margin either.
//   - CallerName is used as a prefix of the log lines emitted by WaitForPods.
//   - WaitForPodsInterval is the delay between two consecutive pod listings.
type WaitForPodOptions struct {
	DesiredPodCount     func() int
	CountErrorMargin    int
	CallerName          string
	WaitForPodsInterval time.Duration

	// IsPodUpdated can be used to detect which pods have been already updated.
	// nil value means all pods are updated.
	IsPodUpdated func(*v1.Pod) error
}
    49  
// PodLister is an interface around listing pods.
type PodLister interface {
	// List returns the current set of pods, or an error if listing failed.
	List() ([]*v1.Pod, error)
	// String returns a description of the lister; WaitForPods includes it in
	// its log lines and in the timeout error message.
	String() string
}
    55  
    56  // WaitForPods waits till desired number of pods is running.
    57  // The current set of pods are fetched by calling List() on the provided PodStore.
    58  // In the case of failure returns list of pods that were in unexpected state
    59  func WaitForPods(ctx context.Context, ps PodLister, options *WaitForPodOptions) (*PodsStatus, error) {
    60  	var timeout time.Duration
    61  	if deadline, hasDeadline := ctx.Deadline(); hasDeadline {
    62  		timeout = time.Until(deadline)
    63  	}
    64  	klog.V(2).Infof("%s: %s: starting with timeout: %v", options.CallerName, ps.String(), timeout)
    65  	oldPods, err := ps.List()
    66  	if err != nil {
    67  		return nil, fmt.Errorf("failed to list pods: %w", err)
    68  	}
    69  	scaling := uninitialized
    70  	var oldPodsStatus PodsStartupStatus
    71  	var lastIsPodUpdatedError error
    72  
    73  	for {
    74  		select {
    75  		case <-ctx.Done():
    76  			pods := ComputePodsStatus(oldPods)
    77  			if ctx.Err() == context.DeadlineExceeded {
    78  				desiredPodCount := options.DesiredPodCount()
    79  
    80  				klog.V(2).Infof("%s: %s: expected %d pods, got %d pods (not RunningAndReady pods: %v)", options.CallerName, ps.String(), desiredPodCount, len(oldPods), pods.NotRunningAndReady())
    81  				klog.V(2).Infof("%s: %s: all pods: %v", options.CallerName, ps.String(), pods)
    82  				klog.V(2).Infof("%s: %s: last IsPodUpdated error: %v", options.CallerName, ps.String(), lastIsPodUpdatedError)
    83  				// In case of scaling down we expect unhealth pods to be in TERMINATING state
    84  				// If we end up with more than expected pods and they are all in RunningAndReady state
    85  				// we won't report them to the user
    86  				return pods.NotRunningAndReady(), fmt.Errorf("got %w while waiting for %d pods to be running in %s - summary of pods : %s", ctx.Err(),
    87  					desiredPodCount, ps.String(), oldPodsStatus.String())
    88  			}
    89  			return pods.NotRunningAndReady(), ctx.Err()
    90  
    91  		case <-time.After(options.WaitForPodsInterval):
    92  			desiredPodCount := options.DesiredPodCount()
    93  
    94  			switch {
    95  			case len(oldPods) == desiredPodCount:
    96  				scaling = none
    97  			case len(oldPods) < desiredPodCount:
    98  				scaling = up
    99  			case len(oldPods) > desiredPodCount:
   100  				scaling = down
   101  			}
   102  
   103  			pods, err := ps.List()
   104  			if err != nil {
   105  				return nil, fmt.Errorf("failed to list pods: %w", err)
   106  			}
   107  			podsStatus := ComputePodsStartupStatus(pods, desiredPodCount, options.IsPodUpdated)
   108  			if podsStatus.LastIsPodUpdatedError != nil {
   109  				lastIsPodUpdatedError = podsStatus.LastIsPodUpdatedError
   110  			}
   111  
   112  			diff := DiffPods(oldPods, pods)
   113  			deletedPods := diff.DeletedPods()
   114  			if scaling == up && len(deletedPods) > 0 {
   115  				klog.Warningf("%s: %s: %d pods disappeared: %v", options.CallerName, ps.String(), len(deletedPods), strings.Join(deletedPods, ", "))
   116  			}
   117  			addedPods := diff.AddedPods()
   118  			if scaling == down && len(addedPods) > 0 {
   119  				klog.Warningf("%s: %s: %d pods appeared: %v", options.CallerName, ps.String(), len(addedPods), strings.Join(addedPods, ", "))
   120  			}
   121  			if podsStatus.String() != oldPodsStatus.String() {
   122  				klog.V(2).Infof("%s: %s: %s", options.CallerName, ps.String(), podsStatus.String())
   123  			}
   124  			// We allow inactive pods (e.g. eviction happened).
   125  			// We wait until there is a desired number of pods running and all other pods are inactive.
   126  			if len(pods) == (podsStatus.Running+podsStatus.Inactive) && podsStatus.Running == podsStatus.RunningUpdated && podsStatus.RunningUpdated == desiredPodCount {
   127  				return nil, nil
   128  			}
   129  			// When using preemptibles on large scale, number of ready nodes is not stable and reaching DesiredPodCount could take a very long time.
   130  			// Overall number of pods (especially Inactive pods) should not grow unchecked.
   131  			if options.CountErrorMargin > 0 && podsStatus.RunningUpdated >= desiredPodCount-options.CountErrorMargin && len(pods)-podsStatus.Inactive <= desiredPodCount && podsStatus.Inactive <= options.CountErrorMargin {
   132  				return nil, nil
   133  			}
   134  			oldPods = pods
   135  			oldPodsStatus = podsStatus
   136  		}
   137  	}
   138  }