sigs.k8s.io/prow@v0.0.0-20240503223140-c5e374dc7eb1/pkg/plank/reconciler.go

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package plank
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"errors"
    23  	"fmt"
    24  	"strings"
    25  	"sync"
    26  	"time"
    27  
    28  	"github.com/sirupsen/logrus"
    29  	corev1 "k8s.io/api/core/v1"
    30  	kerrors "k8s.io/apimachinery/pkg/api/errors"
    31  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    32  	"k8s.io/apimachinery/pkg/labels"
    33  	"k8s.io/apimachinery/pkg/types"
    34  	"k8s.io/apimachinery/pkg/util/sets"
    35  	"k8s.io/apimachinery/pkg/util/wait"
    36  	authorizationv1 "k8s.io/client-go/kubernetes/typed/authorization/v1"
    37  	"k8s.io/client-go/rest"
    38  	"k8s.io/utils/clock"
    39  	controllerruntime "sigs.k8s.io/controller-runtime"
    40  	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
    41  	"sigs.k8s.io/controller-runtime/pkg/controller"
    42  	"sigs.k8s.io/controller-runtime/pkg/handler"
    43  	"sigs.k8s.io/controller-runtime/pkg/manager"
    44  	"sigs.k8s.io/controller-runtime/pkg/predicate"
    45  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    46  	"sigs.k8s.io/controller-runtime/pkg/source"
    47  
    48  	prowv1 "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
    49  	"sigs.k8s.io/prow/pkg/config"
    50  	kubernetesreporterapi "sigs.k8s.io/prow/pkg/crier/reporters/gcs/kubernetes/api"
    51  	"sigs.k8s.io/prow/pkg/flagutil"
    52  	"sigs.k8s.io/prow/pkg/io"
    53  	"sigs.k8s.io/prow/pkg/io/providers"
    54  	"sigs.k8s.io/prow/pkg/kube"
    55  	"sigs.k8s.io/prow/pkg/pjutil"
    56  	"sigs.k8s.io/prow/pkg/pod-utils/decorate"
    57  	"sigs.k8s.io/prow/pkg/version"
    58  )
    59  
    60  const ControllerName = "plank"
    61  
    62  // PodStatus constants
    63  const (
    64  	Evicted = "Evicted"
    65  )
    66  
    67  // NodeStatus constants
    68  const (
    69  	// NodeUnreachablePodReason is the reason set on a pod when its state cannot be confirmed because the
    70  	// kubelet is unresponsive on the node it is (or was) running on.
    71  	NodeUnreachablePodReason = "NodeLost"
    72  )
    73  
    74  // RequiredTestPodVerbs returns the list of verbs that we expect to have
    75  // permission to perform when interacting with the test pods. It is used
    76  // during startup to check that we have the necessary authorizations on
    77  // build clusters.
    78  //
    79  // NOTE: Setting up build cluster managers is tricky: if we don't have the
    80  // required permissions, the controller manager setup machinery (library
    81  // code, not our code) can return an error, which essentially results in a
    82  // fatal error and a crash loop on startup. Although other components such
    83  // as crier, deck, and hook also need to talk to build clusters, we only
    84  // perform this preemptive RequiredTestPodVerbs check for PCM and sinker,
    85  // because only those components make use of the BuildClusterManagers() call.
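        //
        // For illustration only, the startup check boils down to roughly one
        // SelfSubjectAccessReview per verb against the build cluster. The real
        // implementation lives in flagutil.CheckAuthorizations; podNamespace and
        // ssarClient below are placeholders, and authzapi stands for
        // k8s.io/api/authorization/v1:
        //
        //	for _, verb := range RequiredTestPodVerbs() {
        //		sar := &authzapi.SelfSubjectAccessReview{
        //			Spec: authzapi.SelfSubjectAccessReviewSpec{
        //				ResourceAttributes: &authzapi.ResourceAttributes{
        //					Namespace: podNamespace,
        //					Verb:      verb,
        //					Resource:  "pods",
        //				},
        //			},
        //		}
        //		// ssarClient.Create(ctx, sar, metav1.CreateOptions{}) reports whether
        //		// the verb is allowed via the returned review's Status.Allowed field.
        //	}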
    86  func RequiredTestPodVerbs() []string {
    87  	return []string{
    88  		"create",
    89  		"delete",
    90  		"list",
    91  		"watch",
    92  		"get",
    93  		"patch",
    94  	}
    95  }
    96  
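        // Add wires the plank controller into the given controller-runtime manager: it indexes
        // ProwJobs, watches pods in every build cluster manager, and registers the metrics and
        // cluster status runnables.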
    97  func Add(
    98  	mgr controllerruntime.Manager,
    99  	buildMgrs map[string]controllerruntime.Manager,
   100  	knownClusters map[string]rest.Config,
   101  	cfg config.Getter,
   102  	opener io.Opener,
   103  	totURL string,
   104  	additionalSelector string,
   105  ) error {
   106  	return add(mgr, buildMgrs, knownClusters, cfg, opener, totURL, additionalSelector, nil, nil, 10)
   107  }
   108  
   109  func add(
   110  	mgr controllerruntime.Manager,
   111  	buildMgrs map[string]controllerruntime.Manager,
   112  	knownClusters map[string]rest.Config,
   113  	cfg config.Getter,
   114  	opener io.Opener,
   115  	totURL string,
   116  	additionalSelector string,
   117  	overwriteReconcile reconcile.Func,
   118  	predicateCallback func(bool),
   119  	numWorkers int,
   120  ) error {
   121  	predicate, err := predicates(additionalSelector, predicateCallback)
   122  	if err != nil {
   123  		return fmt.Errorf("failed to construct predicate: %w", err)
   124  	}
   125  
   126  	ctx := context.Background()
   127  	if err := mgr.GetFieldIndexer().IndexField(ctx, &prowv1.ProwJob{}, prowJobIndexName, prowJobIndexer(cfg().ProwJobNamespace)); err != nil {
   128  		return fmt.Errorf("failed to add indexer: %w", err)
   129  	}
   130  
   131  	blder := controllerruntime.NewControllerManagedBy(mgr).
   132  		Named(ControllerName).
   133  		For(&prowv1.ProwJob{}).
   134  		WithEventFilter(predicate).
   135  		WithOptions(controller.Options{MaxConcurrentReconciles: numWorkers})
   136  
   137  	r := newReconciler(ctx, mgr.GetClient(), overwriteReconcile, cfg, opener, totURL)
   138  	for buildCluster, buildClusterMgr := range buildMgrs {
   139  		r.log.WithFields(logrus.Fields{
   140  			"buildCluster": buildCluster,
   141  			"host":         buildClusterMgr.GetConfig().Host,
   142  		}).Debug("creating client")
   143  		blder = blder.Watches(
   144  			source.NewKindWithCache(&corev1.Pod{}, buildClusterMgr.GetCache()),
   145  			podEventRequestMapper(cfg().ProwJobNamespace))
   146  		bc := buildClient{
   147  			Client: buildClusterMgr.GetClient()}
   148  		if restConfig, ok := knownClusters[buildCluster]; ok {
   149  			authzClient, err := authorizationv1.NewForConfig(&restConfig)
   150  			if err != nil {
   151  				return fmt.Errorf("failed to construct authz client: %w", err)
   152  			}
   153  			bc.ssar = authzClient.SelfSubjectAccessReviews()
   154  		}
   155  		r.buildClients[buildCluster] = bc
   156  	}
   157  
   158  	if err := blder.Complete(r); err != nil {
   159  		return fmt.Errorf("failed to build controller: %w", err)
   160  	}
   161  
   162  	if err := mgr.Add(manager.RunnableFunc(r.syncMetrics)); err != nil {
   163  		return fmt.Errorf("failed to add metrics runnable to manager: %w", err)
   164  	}
   165  
   166  	if err := mgr.Add(manager.RunnableFunc(r.syncClusterStatus(time.Minute, knownClusters))); err != nil {
   167  		return fmt.Errorf("failed to add cluster status runnable to manager: %w", err)
   168  	}
   169  
   170  	return nil
   171  }
   172  
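        // newReconciler constructs a reconciler with an empty build client map; add() fills in the
        // per-cluster clients.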
   173  func newReconciler(ctx context.Context, pjClient ctrlruntimeclient.Client, overwriteReconcile reconcile.Func, cfg config.Getter, opener io.Opener, totURL string) *reconciler {
   174  	return &reconciler{
   175  		pjClient:           pjClient,
   176  		buildClients:       map[string]buildClient{},
   177  		overwriteReconcile: overwriteReconcile,
   178  		log:                logrus.NewEntry(logrus.StandardLogger()).WithField("controller", ControllerName),
   179  		config:             cfg,
   180  		opener:             opener,
   181  		totURL:             totURL,
   182  		clock:              clock.RealClock{},
   183  		maxConcurrencySerializationLocks: &shardedLock{
   184  			mapLock: &sync.Mutex{},
   185  			locks:   map[string]*sync.Mutex{},
   186  		},
   187  		jobQueueSerializationLocks: &shardedLock{
   188  			mapLock: &sync.Mutex{},
   189  			locks:   map[string]*sync.Mutex{},
   190  		},
   191  	}
   192  }
   193  
   194  type reconciler struct {
   195  	pjClient           ctrlruntimeclient.Client
   196  	buildClients       map[string]buildClient
   197  	overwriteReconcile reconcile.Func
   198  	log                *logrus.Entry
   199  	config             config.Getter
   200  	opener             io.Opener
   201  	totURL             string
   202  	clock              clock.WithTickerAndDelayedExecution
   203  	/* maxConcurrencySerializationLocks and jobQueueSerializationLocks are used to serialize
   204  	   reconciliation of ProwJobs that have concurrency limits that might affect each other.
   205  
   206  	   The concurrency management strategy has 3 basic parts. Each part is skipped if the ProwJob
   207  	   does not specify a MaxConcurrency or JobQueueName.
   208  
   209  	   1. Serialize per the job and/or queue name as needed using these locks. This prevents
   210  	      concurrent reconciliation threads from triggering jobs beyond the concurrency limit.
   211  	   2. Compare against the ProwJob index to see how many jobs there are for the job and job queue
   212  	      and only trigger the job if it won't exceed the concurrency limit(s).
   213  	   3. Once the ProwJob is updated, wait until we see it updated in our cache before completing
   214  	      processing and releasing the serialization lock(s) acquired in step 1. This is necessary
   215  	      to prevent reconciliation threads from processing subsequent jobs before the ProwJob index
   216  	      used in step 2 is up to date.
   217  	*/
   218  	maxConcurrencySerializationLocks *shardedLock
   219  	jobQueueSerializationLocks       *shardedLock
   220  }
   221  
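        // shardedLock lazily maintains one mutex per key so that reconciliations which share a job
        // name or a job queue name can be serialized independently of each other.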
   222  type shardedLock struct {
   223  	mapLock *sync.Mutex
   224  	locks   map[string]*sync.Mutex
   225  }
   226  
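        // buildClient bundles the controller-runtime client for a build cluster with an optional
        // SelfSubjectAccessReview client that is used for the permission checks in syncClusterStatus.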
   227  type buildClient struct {
   228  	ctrlruntimeclient.Client
   229  	ssar authorizationv1.SelfSubjectAccessReviewInterface
   230  }
   231  
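        // getLock returns the mutex for the given key, creating it on first use.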
   232  func (s *shardedLock) getLock(key string) *sync.Mutex {
   233  	s.mapLock.Lock()
   234  	defer s.mapLock.Unlock()
   235  	if _, exists := s.locks[key]; !exists {
   236  		s.locks[key] = &sync.Mutex{}
   237  	}
   238  	return s.locks[key]
   239  }
   240  
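        // syncMetrics lists all ProwJobs every 30 seconds and feeds them into the prowjob metrics
        // until the context is cancelled. It is registered as a manager runnable.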
   241  func (r *reconciler) syncMetrics(ctx context.Context) error {
   242  	ticker := time.NewTicker(30 * time.Second)
   243  	defer ticker.Stop()
   244  	for {
   245  		select {
   246  		case <-ctx.Done():
   247  			return nil
   248  		case <-ticker.C:
   249  			pjs := &prowv1.ProwJobList{}
   250  			if err := r.pjClient.List(ctx, pjs, optAllProwJobs()); err != nil {
   251  				r.log.WithError(err).Error("failed to list prowjobs for metrics")
   252  				continue
   253  			}
   254  			kube.GatherProwJobMetrics(r.log, pjs.Items)
   255  		}
   256  	}
   257  }
   258  
   259  type ClusterStatus string
   260  
   261  const (
   262  	ClusterStatusReachable          ClusterStatus = "Reachable"
   263  	ClusterStatusNoManager          ClusterStatus = "No-Manager"
   264  	ClusterStatusError              ClusterStatus = "Error"
   265  	ClusterStatusMissingPermissions ClusterStatus = "MissingPermissions"
   266  )
   267  
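        // syncClusterStatus returns a runnable that periodically determines the status of every
        // known build cluster (reachable, missing a manager, missing permissions, or erroring) and
        // writes the result to the configured Plank.BuildClusterStatusFile, if one is set.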
   268  func (r *reconciler) syncClusterStatus(
   269  	interval time.Duration,
   270  	knownClusters map[string]rest.Config,
   271  ) func(context.Context) error {
   272  	return func(ctx context.Context) error {
   273  		ticker := time.NewTicker(interval)
   274  		defer ticker.Stop()
   275  		for {
   276  			select {
   277  			case <-ctx.Done():
   278  				return nil
   279  			case <-ticker.C:
   280  				location := r.config().Plank.BuildClusterStatusFile
   281  				if location == "" {
   282  					continue
   283  				}
   284  				parsedPath, err := prowv1.ParsePath(location)
   285  				if err != nil {
   286  					r.log.WithError(err).Errorf("Failed to parse cluster status file location: %q.", location)
   287  					continue
   288  				}
   289  				// prowv1.ParsePath prefixes `Path` with a leading `/`; trim it here.
   290  				bucket, subPath := parsedPath.Bucket(), strings.TrimPrefix(parsedPath.Path, "/")
   291  
   292  				clusters := map[string]ClusterStatus{}
   293  				for cluster := range knownClusters {
   294  					status := ClusterStatusReachable
   295  					client, ok := r.buildClients[cluster]
   296  					if !ok {
   297  						status = ClusterStatusNoManager
   298  					} else {
   299  						// Check for pod verbs.
   300  						if err := flagutil.CheckAuthorizations(client.ssar, r.config().PodNamespace, RequiredTestPodVerbs()); err != nil {
   301  							r.log.WithField("cluster", cluster).WithError(err).Warn("Error checking pod verbs to check for build cluster usability.")
   302  							if errors.Is(err, flagutil.MissingPermissions) {
   303  								status = ClusterStatusMissingPermissions
   304  							} else {
   305  								status = ClusterStatusError
   306  							}
   307  						}
   308  					}
   309  					clusters[cluster] = status
   310  				}
   311  				payload, err := json.Marshal(clusters)
   312  				if err != nil {
   313  					r.log.WithError(err).Error("Error marshaling cluster status info.")
   314  					continue
   315  				}
   316  				noCache := "no-cache"
   317  				fullStoragePath, err := providers.StoragePath(bucket, subPath)
   318  				if err != nil {
   319  					r.log.WithError(err).Error("Failed to resolve storage path.")
   320  					continue
   321  				}
   322  				if err := io.WriteContent(ctx, r.log, r.opener, fullStoragePath, payload, io.WriterOptions{CacheControl: &noCache}); err != nil {
   323  					r.log.WithError(err).Error("Error writing cluster status info.")
   324  				}
   325  			}
   326  		}
   327  	}
   328  }
   329  
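        // Reconcile delegates to overwriteReconcile when one was injected (e.g. by tests) and to
        // defaultReconcile otherwise.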
   330  func (r *reconciler) Reconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) {
   331  	if r.overwriteReconcile != nil {
   332  		return r.overwriteReconcile(ctx, request)
   333  	}
   334  	return r.defaultReconcile(ctx, request)
   335  }
   336  
   337  func (r *reconciler) defaultReconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) {
   338  	pj := &prowv1.ProwJob{}
   339  	if err := r.pjClient.Get(ctx, request.NamespacedName, pj); err != nil {
   340  		if !kerrors.IsNotFound(err) {
   341  			return reconcile.Result{}, fmt.Errorf("failed to get prowjob %s: %w", request.Name, err)
   342  		}
   343  
   344  		// Objects can be deleted from the API while being in our workqueue
   345  		return reconcile.Result{}, nil
   346  	}
   347  	originalPJ := pj.DeepCopy()
   348  
   349  	res, err := r.serializeIfNeeded(ctx, pj)
   350  	if IsTerminalError(err) {
   351  		// Unfixable cases like missing build clusters; do not return an error, to prevent requeuing.
   352  		log := r.log.WithError(err).WithFields(pjutil.ProwJobFields(pj))
   353  		log.Error("Reconciliation failed with terminal error and will not be requeued")
   354  		if !pj.Complete() {
   355  			pj.SetComplete()
   356  			pj.Status.State = prowv1.ErrorState
   357  			pj.Status.Description = fmt.Sprintf("Terminal error: %v.", err)
   358  			if err := r.pjClient.Patch(ctx, pj, ctrlruntimeclient.MergeFrom(originalPJ)); err != nil {
   359  				// If we fail to complete and mark the job as errored, we will try again on the next sync loop.
   360  				log.Errorf("Error marking job with terminal failure as errored: %v.", err)
   361  			} else {
   362  				log.Info("Marked job with terminal failure as errored.")
   363  			}
   364  		}
   365  		return reconcile.Result{}, nil
   366  	}
   367  	if res == nil {
   368  		res = &reconcile.Result{}
   369  	}
   370  	if err != nil {
   371  		r.log.WithError(err).WithField("name", request.Name).Error("Reconciliation failed")
   372  	}
   373  	return *res, err
   374  }
   375  
   376  // serializeIfNeeded serializes the reconciliation of Jobs that have a MaxConcurrency or a JobQueueName set, otherwise
   377  // multiple reconciliations of the same job or queue may race and not properly respect that setting.
   378  func (r *reconciler) serializeIfNeeded(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
   379  	if pj.Spec.MaxConcurrency > 0 {
   380  		// We need to serialize handling of this job name.
   381  		lock := r.maxConcurrencySerializationLocks.getLock(pj.Spec.Job)
   382  		// Use TryLock to avoid blocking workers that are waiting for the lock.
   383  		if !lock.TryLock() {
   384  			return &reconcile.Result{RequeueAfter: time.Second}, nil
   385  		}
   386  		defer lock.Unlock()
   387  	}
   388  
   389  	if pj.Spec.JobQueueName != "" {
   390  		// We need to serialize handling of this job queue.
   391  		lock := r.jobQueueSerializationLocks.getLock(pj.Spec.JobQueueName)
   392  		// Use TryLock to avoid blocking workers that are waiting for the lock.
   393  		if !lock.TryLock() {
   394  			return &reconcile.Result{RequeueAfter: time.Second}, nil
   395  		}
   396  		defer lock.Unlock()
   397  	}
   398  	return r.reconcile(ctx, pj)
   399  }
   400  
   401  func (r *reconciler) reconcile(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
   402  	// terminateDupes first, as that might reduce cluster load and prevent us
   403  	// from doing pointless work.
   404  	if err := r.terminateDupes(ctx, pj); err != nil {
   405  		return nil, fmt.Errorf("terminateDupes failed: %w", err)
   406  	}
   407  
   408  	switch pj.Status.State {
   409  	case prowv1.PendingState:
   410  		return r.syncPendingJob(ctx, pj)
   411  	case prowv1.TriggeredState:
   412  		return r.syncTriggeredJob(ctx, pj)
   413  	case prowv1.AbortedState:
   414  		return nil, r.syncAbortedJob(ctx, pj)
   415  	}
   416  
   417  	return nil, nil
   418  }
   419  
   420  func (r *reconciler) terminateDupes(ctx context.Context, pj *prowv1.ProwJob) error {
   421  	pjs := &prowv1.ProwJobList{}
   422  	if err := r.pjClient.List(ctx, pjs, optPendingTriggeredJobsNamed(pj.Spec.Job)); err != nil {
   423  		return fmt.Errorf("failed to list prowjobs: %w", err)
   424  	}
   425  
   426  	return pjutil.TerminateOlderJobs(r.pjClient, r.log, pjs.Items)
   427  }
   428  
   429  // syncPendingJob syncs jobs for which we already created the test workload
   430  func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
   431  	prevPJ := pj.DeepCopy()
   432  
   433  	pod, podExists, err := r.pod(ctx, pj)
   434  	if err != nil {
   435  		return nil, err
   436  	}
   437  
   438  	if !podExists {
   439  		// Pod is missing. This can happen if the previous pod was deleted manually or by
   440  		// a rescheduler. Start a new pod.
   441  		id, pn, err := r.startPod(ctx, pj)
   442  		if err != nil {
   443  			if !isRequestError(err) {
   444  				return nil, fmt.Errorf("error starting pod for PJ %s: %w", pj.Name, err)
   445  			}
   446  			pj.Status.State = prowv1.ErrorState
   447  			pj.SetComplete()
   448  			pj.Status.Description = fmt.Sprintf("Pod can not be created: %v", err)
   449  			r.log.WithFields(pjutil.ProwJobFields(pj)).WithError(err).Warning("Unprocessable pod.")
   450  		} else {
   451  			pj.Status.BuildID = id
   452  			pj.Status.PodName = pn
   453  			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
   454  		}
   455  	} else if pod.Status.Reason == Evicted {
   456  		// Pod was evicted.
   457  		if pj.Spec.ErrorOnEviction {
   458  			// ErrorOnEviction is enabled, complete the PJ and mark it as
   459  			// errored.
   460  			r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pod got evicted, failing job.")
   461  			pj.SetComplete()
   462  			pj.Status.State = prowv1.ErrorState
   463  			pj.Status.Description = "Job pod was evicted by the cluster."
   464  		} else {
   465  			// ErrorOnEviction is disabled. Delete the pod now and recreate it in
   466  			// the next resync.
   467  			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod got evicted, deleting it & the next sync loop will restart it")
   468  			client, ok := r.buildClients[pj.ClusterAlias()]
   469  			if !ok {
   470  				return nil, TerminalError(fmt.Errorf("evicted pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
   471  			}
   472  			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
   473  				// We don't want the end user to see this, so we have to remove the finalizer; otherwise the pod hangs.
   474  				oldPod := pod.DeepCopy()
   475  				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
   476  				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
   477  					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
   478  				}
   479  			}
   480  			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
   481  			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
   482  		}
   483  	} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
   484  		// This can happen in any phase and means the pod was marked for deletion because its node became
   485  		// unresponsive. Delete the finalizer so the pod vanishes and we will silently re-create it in the next iteration.
   486  		r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod's node got lost, deleting pod & the next sync loop will restart it")
   487  		client, ok := r.buildClients[pj.ClusterAlias()]
   488  		if !ok {
   489  			return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
   490  		}
   491  
   492  		if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
   493  			// We don't want the end user to see this, so we have to remove the finalizer; otherwise the pod hangs.
   494  			oldPod := pod.DeepCopy()
   495  			pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
   496  			if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
   497  				return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
   498  			}
   499  		}
   500  
   501  		return nil, nil
   502  	} else {
   503  		switch pod.Status.Phase {
   504  		case corev1.PodUnknown:
   505  			// Pod is in Unknown state. This can happen if there is a problem with
   506  			// the node. Delete the old pod, this will fire an event that triggers
   507  			// a new reconciliation in which we will re-create the pod.
   508  			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is in unknown state, deleting & restarting pod")
   509  			client, ok := r.buildClients[pj.ClusterAlias()]
   510  			if !ok {
   511  				return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
   512  			}
   513  
   514  			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
   515  				// We don't want the end user to see this, so we have to remove the finalizer; otherwise the pod hangs.
   516  				oldPod := pod.DeepCopy()
   517  				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
   518  				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
   519  					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
   520  				}
   521  			}
   522  			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
   523  			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
   524  
   525  		case corev1.PodSucceeded:
   526  			pj.SetComplete()
   527  			// There were bugs around this in the past, so be paranoid and verify each container;
   528  			// https://github.com/kubernetes/kubernetes/issues/58711 is only fixed in 1.18+.
   529  			if didPodSucceed(pod) {
   530  				// Pod succeeded. Update ProwJob and talk to GitHub.
   531  				pj.Status.State = prowv1.SuccessState
   532  				pj.Status.Description = "Job succeeded."
   533  			} else {
   534  				pj.Status.State = prowv1.ErrorState
   535  				pj.Status.Description = "Pod was in succeeded phase but some containers didn't finish"
   536  			}
   537  
   538  		case corev1.PodFailed:
   539  			// Pod failed. Update ProwJob, talk to GitHub.
   540  			pj.SetComplete()
   541  			pj.Status.State = prowv1.FailureState
   542  			pj.Status.Description = "Job failed."
   543  
   544  		case corev1.PodPending:
   545  			var requeueAfter time.Duration
   546  			maxPodPending := r.config().Plank.PodPendingTimeout.Duration
   547  			if pj.Spec.DecorationConfig != nil && pj.Spec.DecorationConfig.PodPendingTimeout != nil {
   548  				maxPodPending = pj.Spec.DecorationConfig.PodPendingTimeout.Duration
   549  			}
   550  			maxPodUnscheduled := r.config().Plank.PodUnscheduledTimeout.Duration
   551  			if pj.Spec.DecorationConfig != nil && pj.Spec.DecorationConfig.PodUnscheduledTimeout != nil {
   552  				maxPodUnscheduled = pj.Spec.DecorationConfig.PodUnscheduledTimeout.Duration
   553  			}
   554  			if pod.Status.StartTime.IsZero() {
   555  				if time.Since(pod.CreationTimestamp.Time) >= maxPodUnscheduled {
   556  					// Pod has been stuck in an unscheduled state for longer than maxPodUnscheduled;
   557  					// abort the job and talk to GitHub.
   558  					pj.SetComplete()
   559  					pj.Status.State = prowv1.ErrorState
   560  					pj.Status.Description = "Pod scheduling timeout."
   561  					r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Marked job for stale unscheduled pod as errored.")
   562  					if err := r.deletePod(ctx, pj); err != nil {
   563  						return nil, fmt.Errorf("failed to delete pod %s/%s in cluster %s: %w", pod.Namespace, pod.Name, pj.ClusterAlias(), err)
   564  					}
   565  					break
   566  				} else {
   567  					// We have to re-check on the pod once we reached maxPodUnscheduled to
   568  					// be able to fail the job if it didn't get scheduled by then.
   569  					requeueAfter = maxPodUnscheduled - time.Since(pod.CreationTimestamp.Time)
   570  				}
   571  			} else {
   572  				if time.Since(pod.Status.StartTime.Time) >= maxPodPending {
   573  					// Pod has been stuck in the pending state for longer than maxPodPending;
   574  					// abort the job and talk to GitHub.
   575  					pj.SetComplete()
   576  					pj.Status.State = prowv1.ErrorState
   577  					pj.Status.Description = "Pod pending timeout."
   578  					r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Marked job for stale pending pod as errored.")
   579  					if err := r.deletePod(ctx, pj); err != nil {
   580  						return nil, fmt.Errorf("failed to delete pod %s/%s in cluster %s: %w", pod.Namespace, pod.Name, pj.ClusterAlias(), err)
   581  					}
   582  					break
   583  				} else {
   584  					// We have to re-check on the pod once we reached maxPodPending to
   585  					// be able to fail the job if it didn't start running by then.
   586  					requeueAfter = maxPodPending - time.Since(pod.Status.StartTime.Time)
   587  				}
   588  			}
   589  			// The pod hasn't started yet but also hasn't hit the scheduling or pending timeout;
   590  			// do nothing now and check on it again once the timeout is reached.
   591  			if pod.DeletionTimestamp == nil {
   592  				return &reconcile.Result{RequeueAfter: requeueAfter}, nil
   593  			}
   594  		case corev1.PodRunning:
   595  			if pod.DeletionTimestamp != nil {
   596  				break
   597  			}
   598  			maxPodRunning := r.config().Plank.PodRunningTimeout.Duration
   599  			if pj.Spec.DecorationConfig != nil && pj.Spec.DecorationConfig.PodRunningTimeout != nil {
   600  				maxPodRunning = pj.Spec.DecorationConfig.PodRunningTimeout.Duration
   601  			}
   602  			if pod.Status.StartTime.IsZero() || time.Since(pod.Status.StartTime.Time) < maxPodRunning {
   603  				// Pod is still running. Do nothing.
   604  				return nil, nil
   605  			}
   606  
   607  			// Pod has been stuck in the running state for longer than maxPodRunning;
   608  			// abort the job and talk to GitHub.
   609  			pj.SetComplete()
   610  			pj.Status.State = prowv1.AbortedState
   611  			pj.Status.Description = "Pod running timeout."
   612  			if err := r.deletePod(ctx, pj); err != nil {
   613  				return nil, fmt.Errorf("failed to delete pod %s/%s in cluster %s: %w", pod.Namespace, pod.Name, pj.ClusterAlias(), err)
   614  			}
   615  		default:
   616  			if pod.DeletionTimestamp == nil {
   617  				// other states, ignore
   618  				return nil, nil
   619  			}
   620  		}
   621  	}
   622  
   623  	// If a pod gets deleted unexpectedly, it might be in any phase and will stick around until
   624  	// we complete the job if the kubernetes reporter is used, because it sets a finalizer.
   625  	if !pj.Complete() && pod != nil && pod.DeletionTimestamp != nil {
   626  		pj.SetComplete()
   627  		pj.Status.State = prowv1.ErrorState
   628  		pj.Status.Description = "Pod got deleted unexpectedly"
   629  	}
   630  
   631  	pj.Status.URL, err = pjutil.JobURL(r.config().Plank, *pj, r.log)
   632  	if err != nil {
   633  		r.log.WithFields(pjutil.ProwJobFields(pj)).WithError(err).Warn("failed to get jobURL")
   634  	}
   635  
   636  	if prevPJ.Status.State != pj.Status.State {
   637  		r.log.WithFields(pjutil.ProwJobFields(pj)).
   638  			WithField("from", prevPJ.Status.State).
   639  			WithField("to", pj.Status.State).Info("Transitioning states.")
   640  	}
   641  
   642  	if err := r.pjClient.Patch(ctx, pj.DeepCopy(), ctrlruntimeclient.MergeFrom(prevPJ)); err != nil {
   643  		return nil, fmt.Errorf("patching prowjob: %w", err)
   644  	}
   645  
   646  	// If the ProwJob state has changed, we must ensure that the update reaches the cache before
   647  	// processing the key again. Without this we might accidentally replace intentionally deleted pods
   648  	// or otherwise incorrectly react to stale ProwJob state.
   649  	state := pj.Status.State
   650  	if prevPJ.Status.State == state {
   651  		return nil, nil
   652  	}
   653  	nn := types.NamespacedName{Namespace: pj.Namespace, Name: pj.Name}
   654  	if err := wait.Poll(100*time.Millisecond, 2*time.Second, func() (bool, error) {
   655  		if err := r.pjClient.Get(ctx, nn, pj); err != nil {
   656  			return false, fmt.Errorf("failed to get prowjob: %w", err)
   657  		}
   658  		return pj.Status.State == state, nil
   659  	}); err != nil {
   660  		return nil, fmt.Errorf("failed to wait for cached prowjob %s to get into state %s: %w", nn.String(), state, err)
   661  	}
   662  
   663  	return nil, nil
   664  }
   665  
   666  // syncTriggeredJob syncs jobs that do not yet have an associated test workload running
   667  func (r *reconciler) syncTriggeredJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
   668  	prevPJ := pj.DeepCopy()
   669  
   670  	var id, pn string
   671  
   672  	pod, podExists, err := r.pod(ctx, pj)
   673  	if err != nil {
   674  		return nil, err
   675  	}
   676  	// We may end up in a state where the pod exists but the prowjob was not
   677  	// updated to pending, if we successfully created a new pod in a previous
   678  	// sync but the prowjob update failed. In that case, skip creating a new
   679  	// pod and just retry the prowjob update.
   680  	if podExists {
   681  		id = getPodBuildID(pod)
   682  		pn = pod.ObjectMeta.Name
   683  	} else {
   684  		// Do not start more jobs than specified and check again later.
   685  		canExecuteConcurrently, err := r.canExecuteConcurrently(ctx, pj)
   686  		if err != nil {
   687  			return nil, fmt.Errorf("canExecuteConcurrently: %w", err)
   688  		}
   689  		if !canExecuteConcurrently {
   690  			return &reconcile.Result{RequeueAfter: 10 * time.Second}, nil
   691  		}
   692  		// We haven't started the pod yet. Do so.
   693  		id, pn, err = r.startPod(ctx, pj)
   694  		if err != nil {
   695  			if !isRequestError(err) {
   696  				return nil, fmt.Errorf("error starting pod: %w", err)
   697  			}
   698  			pj.Status.State = prowv1.ErrorState
   699  			pj.SetComplete()
   700  			pj.Status.Description = fmt.Sprintf("Pod can not be created: %v", err)
   701  			logrus.WithField("job", pj.Spec.Job).WithError(err).Warning("Unprocessable pod.")
   702  		}
   703  	}
   704  
   705  	if pj.Status.State == prowv1.TriggeredState {
   706  		// BuildID needs to be set before we execute the job url template.
   707  		pj.Status.BuildID = id
   708  		now := metav1.NewTime(r.clock.Now())
   709  		pj.Status.PendingTime = &now
   710  		pj.Status.State = prowv1.PendingState
   711  		pj.Status.PodName = pn
   712  		pj.Status.Description = "Job triggered."
   713  		pj.Status.URL, err = pjutil.JobURL(r.config().Plank, *pj, r.log)
   714  		if err != nil {
   715  			r.log.WithFields(pjutil.ProwJobFields(pj)).WithError(err).Warn("failed to get jobURL")
   716  		}
   717  	}
   718  
   719  	if prevPJ.Status.State != pj.Status.State {
   720  		r.log.WithFields(pjutil.ProwJobFields(pj)).
   721  			WithField("from", prevPJ.Status.State).
   722  			WithField("to", pj.Status.State).Info("Transitioning states.")
   723  	}
   724  	if err := r.pjClient.Patch(ctx, pj.DeepCopy(), ctrlruntimeclient.MergeFrom(prevPJ)); err != nil {
   725  		return nil, fmt.Errorf("patch prowjob: %w", err)
   726  	}
   727  
   728  	// If the job has either MaxConcurrency or JobQueueName configured, we must block here until we observe the state transition in our cache,
   729  	// otherwise subsequent reconciliations for a different run of the same job might incorrectly conclude that they
   730  	// can run because that decision is made based on the data in the cache.
   731  	if pj.Spec.MaxConcurrency == 0 && pj.Spec.JobQueueName == "" {
   732  		return nil, nil
   733  	}
   734  	nn := types.NamespacedName{Namespace: pj.Namespace, Name: pj.Name}
   735  	state := pj.Status.State
   736  	if err := wait.Poll(100*time.Millisecond, 2*time.Second, func() (bool, error) {
   737  		if err := r.pjClient.Get(ctx, nn, pj); err != nil {
   738  			return false, fmt.Errorf("failed to get prowjob: %w", err)
   739  		}
   740  		return pj.Status.State == state, nil
   741  	}); err != nil {
   742  		return nil, fmt.Errorf("failed to wait for cached prowjob %s to get into state %s: %w", nn.String(), state, err)
   743  	}
   744  
   745  	return nil, nil
   746  }
   747  
   748  // syncAbortedJob syncs jobs that got aborted because their result isn't needed anymore,
   749  // for example because of a new push or because a pull request got closed.
   750  func (r *reconciler) syncAbortedJob(ctx context.Context, pj *prowv1.ProwJob) error {
   751  
   752  	buildClient, ok := r.buildClients[pj.ClusterAlias()]
   753  	if !ok {
   754  		return TerminalError(fmt.Errorf("no build client available for cluster %s", pj.ClusterAlias()))
   755  	}
   756  
   757  	// Just optimistically delete and swallow the potential 404
   758  	pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{
   759  		Name:      pj.Name,
   760  		Namespace: r.config().PodNamespace,
   761  	}}
   762  	if err := ctrlruntimeclient.IgnoreNotFound(buildClient.Delete(ctx, pod)); err != nil {
   763  		return fmt.Errorf("failed to delete pod %s/%s in cluster %s: %w", pod.Namespace, pod.Name, pj.ClusterAlias(), err)
   764  	}
   765  
   766  	originalPJ := pj.DeepCopy()
   767  	pj.SetComplete()
   768  	return r.pjClient.Patch(ctx, pj, ctrlruntimeclient.MergeFrom(originalPJ))
   769  }
   770  
   771  // pod gets the pod for a pj; it returns the pod, whether the pod exists, and an error.
   772  func (r *reconciler) pod(ctx context.Context, pj *prowv1.ProwJob) (*corev1.Pod, bool, error) {
   773  	buildClient, buildClientExists := r.buildClients[pj.ClusterAlias()]
   774  	if !buildClientExists {
   775  		return nil, false, TerminalError(fmt.Errorf("no build client found for cluster %q", pj.ClusterAlias()))
   776  	}
   777  
   778  	pod := &corev1.Pod{}
   779  	name := types.NamespacedName{
   780  		Namespace: r.config().PodNamespace,
   781  		Name:      pj.Name,
   782  	}
   783  
   784  	if err := buildClient.Get(ctx, name, pod); err != nil {
   785  		if kerrors.IsNotFound(err) {
   786  			return nil, false, nil
   787  		}
   788  		return nil, false, fmt.Errorf("failed to get pod: %w", err)
   789  	}
   790  
   791  	return pod, true, nil
   792  }
   793  
   794  func (r *reconciler) deletePod(ctx context.Context, pj *prowv1.ProwJob) error {
   795  	buildClient, buildClientExists := r.buildClients[pj.ClusterAlias()]
   796  	if !buildClientExists {
   797  		return TerminalError(fmt.Errorf("no build client found for cluster %q", pj.ClusterAlias()))
   798  	}
   799  
   800  	pod := &corev1.Pod{
   801  		ObjectMeta: metav1.ObjectMeta{
   802  			Namespace: r.config().PodNamespace,
   803  			Name:      pj.Name,
   804  		},
   805  	}
   806  
   807  	if err := ctrlruntimeclient.IgnoreNotFound(buildClient.Delete(ctx, pod)); err != nil {
   808  		return fmt.Errorf("failed to delete pod: %w", err)
   809  	}
   810  
   811  	r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Deleted stale running pod.")
   812  	return nil
   813  }
   814  
   815  func (r *reconciler) startPod(ctx context.Context, pj *prowv1.ProwJob) (string, string, error) {
   816  	buildID, err := r.getBuildID(pj.Spec.Job)
   817  	if err != nil {
   818  		return "", "", fmt.Errorf("error getting build ID: %w", err)
   819  	}
   820  
   821  	pj.Status.BuildID = buildID
   822  	pod, err := decorate.ProwJobToPod(*pj)
   823  	if err != nil {
   824  		return "", "", err
   825  	}
   826  	pod.Namespace = r.config().PodNamespace
   827  	// Add the prow version as a label to make debugging prowjobs easier.
   828  	pod.ObjectMeta.Labels[kube.PlankVersionLabel] = version.Version
   829  	podName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}
   830  
   831  	client, ok := r.buildClients[pj.ClusterAlias()]
   832  	if !ok {
   833  		return "", "", TerminalError(fmt.Errorf("unknown cluster alias %q", pj.ClusterAlias()))
   834  	}
   835  	err = client.Create(ctx, pod)
   836  	r.log.WithFields(pjutil.ProwJobFields(pj)).Debug("Create Pod.")
   837  	if err != nil {
   838  		return "", "", fmt.Errorf("create pod %s in cluster %s: %w", podName.String(), pj.ClusterAlias(), err)
   839  	}
   840  
   841  	// We must block until we see the pod, otherwise a new reconciliation may be triggered that tries to create
   842  	// the pod because it's not in the cache yet, errors with IsAlreadyExists, and sets the prowjob to failed.
   843  	if err := wait.Poll(100*time.Millisecond, 10*time.Second, func() (bool, error) {
   844  		if err := client.Get(ctx, podName, pod); err != nil {
   845  			if kerrors.IsNotFound(err) {
   846  				return false, nil
   847  			}
   848  			return false, fmt.Errorf("failed to get pod %s in cluster %s: %w", podName.String(), pj.ClusterAlias(), err)
   849  		}
   850  		return true, nil
   851  	}); err != nil {
   852  		return "", "", fmt.Errorf("failed waiting for new pod %s in cluster %s to appear in cache: %w", podName.String(), pj.ClusterAlias(), err)
   853  	}
   854  
   855  	return buildID, pod.Name, nil
   856  }
   857  
   858  func (r *reconciler) getBuildID(name string) (string, error) {
   859  	return pjutil.GetBuildID(name, r.totURL)
   860  }
   861  
   862  // canExecuteConcurrently determines if the concurrency settings allow our job
   863  // to be started. We start jobs with a limited concurrency in order, oldest
   864  // first. This allows us to get away without any global locking by just looking
   865  // at the jobs in the cluster.
   866  func (r *reconciler) canExecuteConcurrently(ctx context.Context, pj *prowv1.ProwJob) (bool, error) {
   867  
   868  	if max := r.config().Plank.MaxConcurrency; max > 0 {
   869  		pjs := &prowv1.ProwJobList{}
   870  		if err := r.pjClient.List(ctx, pjs, optPendingProwJobs()); err != nil {
   871  			return false, fmt.Errorf("failed to list prowjobs: %w", err)
   872  		}
   873  		// The list contains our own ProwJob
   874  		running := len(pjs.Items) - 1
   875  		if running >= max {
   876  			r.log.WithFields(pjutil.ProwJobFields(pj)).Infof("Not starting another job, already %d running.", running)
   877  			return false, nil
   878  		}
   879  	}
   880  
   881  	if canExecute, err := r.canExecuteConcurrentlyPerJob(ctx, pj); err != nil || !canExecute {
   882  		return canExecute, err
   883  	}
   884  
   885  	return r.canExecuteConcurrentlyPerQueue(ctx, pj)
   886  }
   887  
   888  func (r *reconciler) canExecuteConcurrentlyPerJob(ctx context.Context, pj *prowv1.ProwJob) (bool, error) {
   889  	if pj.Spec.MaxConcurrency == 0 {
   890  		return true, nil
   891  	}
   892  
   893  	pjs := &prowv1.ProwJobList{}
   894  	if err := r.pjClient.List(ctx, pjs, optPendingTriggeredJobsNamed(pj.Spec.Job)); err != nil {
   895  		return false, fmt.Errorf("failed listing prowjobs: %w", err)
   896  	}
   897  	r.log.Infof("got %d not-completed prowjobs with the same name", len(pjs.Items))
   898  
   899  	pendingOrOlderMatchingPJs := countPendingOrOlderTriggeredMatchingPJs(*pj, pjs.Items)
   900  	if pendingOrOlderMatchingPJs >= pj.Spec.MaxConcurrency {
   901  		r.log.WithFields(pjutil.ProwJobFields(pj)).
   902  			Debugf("Not starting another instance of %s, have %d instances that are pending or older, %d is the limit",
   903  				pj.Spec.Job, pendingOrOlderMatchingPJs, pj.Spec.MaxConcurrency)
   904  		return false, nil
   905  	}
   906  
   907  	return true, nil
   908  }
   909  
   910  func (r *reconciler) canExecuteConcurrentlyPerQueue(ctx context.Context, pj *prowv1.ProwJob) (bool, error) {
   911  	queueName := pj.Spec.JobQueueName
   912  	if queueName == "" {
   913  		return true, nil
   914  	}
   915  
   916  	queueConcurrency, queueDefined := r.config().Plank.JobQueueCapacities[queueName]
   917  	if !queueDefined {
   918  		return false, fmt.Errorf("failed to match queue name '%s' with Plank configuration", queueName)
   919  	}
   920  	if queueConcurrency == 0 {
   921  		return false, nil
   922  	}
   923  	if queueConcurrency < 0 {
   924  		return true, nil
   925  	}
   926  
   927  	pjs := &prowv1.ProwJobList{}
   928  	if err := r.pjClient.List(ctx, pjs, optPendingTriggeredJobsInQueue(queueName)); err != nil {
   929  		return false, fmt.Errorf("failed listing prowjobs in queue %s: %w", queueName, err)
   930  	}
   931  	r.log.Infof("got %d not-completed prowjobs in queue %s", len(pjs.Items), queueName)
   932  
   933  	pendingOrOlderMatchingPJs := countPendingOrOlderTriggeredMatchingPJs(*pj, pjs.Items)
   934  	if pendingOrOlderMatchingPJs >= queueConcurrency {
   935  		r.log.WithFields(pjutil.ProwJobFields(pj)).
   936  			Debugf("Not starting another instance of %s, have %d instances in queue %s that are pending or older, %d is the limit",
   937  				pj.Spec.Job, pendingOrOlderMatchingPJs, queueName, queueConcurrency)
   938  		return false, nil
   939  	}
   940  
   941  	return true, nil
   942  }
   943  
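        // predicates builds the event filter for the controller: pods are kept when they match the
        // created-by-prow label selector (plus any additionalSelector), and ProwJobs are kept when
        // they are incomplete, use the Kubernetes agent, and are not in SchedulingState. The optional
        // callback is invoked with every filter decision.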
   944  func predicates(additionalSelector string, callback func(bool)) (predicate.Predicate, error) {
   945  	rawSelector := fmt.Sprintf("%s=true", kube.CreatedByProw)
   946  	if additionalSelector != "" {
   947  		rawSelector = fmt.Sprintf("%s,%s", rawSelector, additionalSelector)
   948  	}
   949  	selector, err := labels.Parse(rawSelector)
   950  	if err != nil {
   951  		return nil, fmt.Errorf("failed to parse label selector %s: %w", rawSelector, err)
   952  	}
   953  
   954  	return predicate.NewPredicateFuncs(func(o ctrlruntimeclient.Object) bool {
   955  		result := func() bool {
   956  			pj, ok := o.(*prowv1.ProwJob)
   957  			if !ok {
   958  				// We ignore pods that do not match our selector
   959  				return selector.Matches(labels.Set(o.GetLabels()))
   960  			}
   961  
   962  			// We can ignore completed prowjobs
   963  			if pj.Complete() {
   964  				return false
   965  			}
   966  
   967  			return pj.Spec.Agent == prowv1.KubernetesAgent && pj.Status.State != prowv1.SchedulingState
   968  		}()
   969  		if callback != nil {
   970  			callback(result)
   971  		}
   972  		return result
   973  	}), nil
   974  }
   975  
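        // podEventRequestMapper maps a pod event to a reconcile.Request for the ProwJob of the same
        // name in the ProwJob namespace.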
   976  func podEventRequestMapper(prowJobNamespace string) handler.EventHandler {
   977  	return handler.EnqueueRequestsFromMapFunc(func(o ctrlruntimeclient.Object) []reconcile.Request {
   978  		return []reconcile.Request{{NamespacedName: ctrlruntimeclient.ObjectKey{
   979  			Namespace: prowJobNamespace,
   980  			Name:      o.GetName(),
   981  		}}}
   982  	})
   983  }
   984  
   985  const (
   986  	// prowJobIndexName is the name of an index that
   987  	// holds all ProwJobs that are in the correct namespace
   988  	// and use the Kubernetes agent
   989  	prowJobIndexName = "plank-prow-jobs"
   990  	// prowJobIndexKeyAll is the indexKey for all ProwJobs
   991  	prowJobIndexKeyAll = "all"
   992  	// prowJobIndexKeyPending is the indexKey for prowjobs
   993  	// that are currently pending, i.e. a corresponding pod
   994  	// exists but hasn't finished yet
   995  	prowJobIndexKeyPending = "pending"
   996  )
   997  
   998  func pendingTriggeredIndexKeyByName(jobName string) string {
   999  	return fmt.Sprintf("pending-triggered-named-%s", jobName)
  1000  }
  1001  
  1002  func pendingTriggeredIndexKeyByJobQueueName(jobQueueName string) string {
  1003  	return fmt.Sprintf("pending-triggered-with-job-queue-name-%s", jobQueueName)
  1004  }
  1005  
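        // prowJobIndexer indexes ProwJobs that live in the ProwJob namespace and use the Kubernetes
        // agent under the "all", "pending", and pending/triggered per-job and per-queue keys above.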
  1006  func prowJobIndexer(prowJobNamespace string) ctrlruntimeclient.IndexerFunc {
  1007  	return func(o ctrlruntimeclient.Object) []string {
  1008  		pj := o.(*prowv1.ProwJob)
  1009  		if pj.Namespace != prowJobNamespace || pj.Spec.Agent != prowv1.KubernetesAgent {
  1010  			return nil
  1011  		}
  1012  
  1013  		indexes := []string{prowJobIndexKeyAll}
  1014  
  1015  		if pj.Status.State == prowv1.PendingState {
  1016  			indexes = append(indexes, prowJobIndexKeyPending)
  1017  		}
  1018  
  1019  		if pj.Status.State == prowv1.PendingState || pj.Status.State == prowv1.TriggeredState {
  1020  			indexes = append(indexes, pendingTriggeredIndexKeyByName(pj.Spec.Job))
  1021  
  1022  			if pj.Spec.JobQueueName != "" {
  1023  				indexes = append(indexes, pendingTriggeredIndexKeyByJobQueueName(pj.Spec.JobQueueName))
  1024  			}
  1025  		}
  1026  
  1027  		return indexes
  1028  	}
  1029  }
  1030  
  1031  func optAllProwJobs() ctrlruntimeclient.ListOption {
  1032  	return ctrlruntimeclient.MatchingFields{prowJobIndexName: prowJobIndexKeyAll}
  1033  }
  1034  
  1035  func optPendingProwJobs() ctrlruntimeclient.ListOption {
  1036  	return ctrlruntimeclient.MatchingFields{prowJobIndexName: prowJobIndexKeyPending}
  1037  }
  1038  
  1039  func optPendingTriggeredJobsNamed(name string) ctrlruntimeclient.ListOption {
  1040  	return ctrlruntimeclient.MatchingFields{prowJobIndexName: pendingTriggeredIndexKeyByName(name)}
  1041  }
  1042  
  1043  func optPendingTriggeredJobsInQueue(queueName string) ctrlruntimeclient.ListOption {
  1044  	return ctrlruntimeclient.MatchingFields{prowJobIndexName: pendingTriggeredIndexKeyByJobQueueName(queueName)}
  1045  }
  1046  
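        // didPodSucceed returns true only if the pod phase is Succeeded and every container,
        // including init containers, terminated with exit code 0 and a non-zero finish time.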
  1047  func didPodSucceed(p *corev1.Pod) bool {
  1048  	if p.Status.Phase != corev1.PodSucceeded {
  1049  		return false
  1050  	}
  1051  	for _, container := range append(p.Status.ContainerStatuses, p.Status.InitContainerStatuses...) {
  1052  		if container.State.Terminated == nil || container.State.Terminated.ExitCode != 0 || container.State.Terminated.FinishedAt.IsZero() {
  1053  			return false
  1054  		}
  1055  	}
  1056  
  1057  	return true
  1058  }
  1059  
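        // getPodBuildID returns the pod's build ID, preferring the build ID label and falling back
        // to the BUILD_ID environment variable of the first container for older pods.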
  1060  func getPodBuildID(pod *corev1.Pod) string {
  1061  	if buildID, ok := pod.ObjectMeta.Labels[kube.ProwBuildIDLabel]; ok && buildID != "" {
  1062  		return buildID
  1063  	}
  1064  
  1065  	// For backwards compatibility: existing pods may not have the buildID label.
  1066  	for _, env := range pod.Spec.Containers[0].Env {
  1067  		if env.Name == "BUILD_ID" {
  1068  			return env.Value
  1069  		}
  1070  	}
  1071  
  1072  	logrus.Warningf("BUILD_ID was not found in pod %q: streaming logs from deck will not work", pod.ObjectMeta.Name)
  1073  	return ""
  1074  }
  1075  
  1076  // isRequestError extracts an HTTP status code from a kerrors.APIStatus and
  1077  // returns true if it is a 4xx error.
  1078  func isRequestError(err error) bool {
  1079  	var code int32 = 500 // This is what kerrors.ReasonForError() defaults to.
  1080  	if status := kerrors.APIStatus(nil); errors.As(err, &status) {
  1081  		code = status.Status().Code
  1082  	}
  1083  	return 400 <= code && code < 500
  1084  }
  1085  
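        // countPendingOrOlderTriggeredMatchingPJs counts, excluding pj itself, the prowjobs in pjs
        // that are pending or that are triggered and older than pj; those take precedence for free
        // concurrency slots so that jobs run in creation order.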
  1086  func countPendingOrOlderTriggeredMatchingPJs(pj prowv1.ProwJob, pjs []prowv1.ProwJob) int {
  1087  	var pendingOrOlderTriggeredMatchingPJs int
  1088  
  1089  	for _, foundPJ := range pjs {
  1090  		// Ignore self here.
  1091  		if foundPJ.UID == pj.UID {
  1092  			continue
  1093  		}
  1094  		if foundPJ.Status.State == prowv1.PendingState {
  1095  			pendingOrOlderTriggeredMatchingPJs++
  1096  			continue
  1097  		}
  1098  
  1099  		// At this point, if foundPJ is older than our prowjob it gets
  1100  		// prioritized to make sure we execute jobs in creation order.
  1101  		if foundPJ.Status.State == prowv1.TriggeredState &&
  1102  			foundPJ.CreationTimestamp.Before(&pj.CreationTimestamp) {
  1103  			pendingOrOlderTriggeredMatchingPJs++
  1104  		}
  1105  	}
  1106  
  1107  	return pendingOrOlderTriggeredMatchingPJs
  1108  }