sigs.k8s.io/prow@v0.0.0-20240503223140-c5e374dc7eb1/pkg/plank/reconciler.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package plank

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/sirupsen/logrus"
	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	authorizationv1 "k8s.io/client-go/kubernetes/typed/authorization/v1"
	"k8s.io/client-go/rest"
	"k8s.io/utils/clock"
	controllerruntime "sigs.k8s.io/controller-runtime"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/source"

	prowv1 "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
	"sigs.k8s.io/prow/pkg/config"
	kubernetesreporterapi "sigs.k8s.io/prow/pkg/crier/reporters/gcs/kubernetes/api"
	"sigs.k8s.io/prow/pkg/flagutil"
	"sigs.k8s.io/prow/pkg/io"
	"sigs.k8s.io/prow/pkg/io/providers"
	"sigs.k8s.io/prow/pkg/kube"
	"sigs.k8s.io/prow/pkg/pjutil"
	"sigs.k8s.io/prow/pkg/pod-utils/decorate"
	"sigs.k8s.io/prow/pkg/version"
)

const ControllerName = "plank"

// PodStatus constants
const (
	Evicted = "Evicted"
)

// NodeStatus constants
const (
	// NodeUnreachablePodReason is the reason on a pod when its state cannot be confirmed as kubelet is unresponsive
	// on the node it is (was) running on.
	NodeUnreachablePodReason = "NodeLost"
)

// RequiredTestPodVerbs returns the list of verbs that we expect to have
// permission to use when interacting with test pods. It is used during
// startup to check that we have the necessary authorizations on build clusters.
//
// NOTE: Setting up build cluster managers is tricky because if we don't
// have the required permissions, the controller manager setup machinery
// (library code, not our code) can return an error, which essentially
// becomes fatal and results in a crash loop on startup. Although other
// components such as crier, deck, and hook also need to talk to build
// clusters, we only perform this preemptive RequiredTestPodVerbs check for
// PCM and sinker because only these components make use of the
// BuildClusterManagers() call.
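//
// A minimal sketch of how these verbs are consumed, assuming
// flagutil.CheckAuthorizations issues one SelfSubjectAccessReview per verb
// against the pod resource in the configured namespace (the names below are
// illustrative only, not the actual implementation):
//
//	for _, verb := range RequiredTestPodVerbs() {
//		review := &authzapi.SelfSubjectAccessReview{
//			Spec: authzapi.SelfSubjectAccessReviewSpec{
//				ResourceAttributes: &authzapi.ResourceAttributes{
//					Namespace: podNamespace,
//					Verb:      verb,
//					Resource:  "pods",
//				},
//			},
//		}
//		// ssar is a SelfSubjectAccessReviewInterface for the build cluster.
//		result, err := ssar.Create(ctx, review, metav1.CreateOptions{})
//		// A result with Status.Allowed == false counts as missing permissions.
//		_, _ = result, err
//	}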
func RequiredTestPodVerbs() []string {
	return []string{
		"create",
		"delete",
		"list",
		"watch",
		"get",
		"patch",
	}
}

func Add(
	mgr controllerruntime.Manager,
	buildMgrs map[string]controllerruntime.Manager,
	knownClusters map[string]rest.Config,
	cfg config.Getter,
	opener io.Opener,
	totURL string,
	additionalSelector string,
) error {
	return add(mgr, buildMgrs, knownClusters, cfg, opener, totURL, additionalSelector, nil, nil, 10)
}

func add(
	mgr controllerruntime.Manager,
	buildMgrs map[string]controllerruntime.Manager,
	knownClusters map[string]rest.Config,
	cfg config.Getter,
	opener io.Opener,
	totURL string,
	additionalSelector string,
	overwriteReconcile reconcile.Func,
	predicateCallback func(bool),
	numWorkers int,
) error {
	predicate, err := predicates(additionalSelector, predicateCallback)
	if err != nil {
		return fmt.Errorf("failed to construct predicate: %w", err)
	}

	ctx := context.Background()
	if err := mgr.GetFieldIndexer().IndexField(ctx, &prowv1.ProwJob{}, prowJobIndexName, prowJobIndexer(cfg().ProwJobNamespace)); err != nil {
		return fmt.Errorf("failed to add indexer: %w", err)
	}

	blder := controllerruntime.NewControllerManagedBy(mgr).
		Named(ControllerName).
		For(&prowv1.ProwJob{}).
		WithEventFilter(predicate).
		WithOptions(controller.Options{MaxConcurrentReconciles: numWorkers})

	r := newReconciler(ctx, mgr.GetClient(), overwriteReconcile, cfg, opener, totURL)
	for buildCluster, buildClusterMgr := range buildMgrs {
		r.log.WithFields(logrus.Fields{
			"buildCluster": buildCluster,
			"host":         buildClusterMgr.GetConfig().Host,
		}).Debug("creating client")
		blder = blder.Watches(
			source.NewKindWithCache(&corev1.Pod{}, buildClusterMgr.GetCache()),
			podEventRequestMapper(cfg().ProwJobNamespace))
		bc := buildClient{
			Client: buildClusterMgr.GetClient()}
		if restConfig, ok := knownClusters[buildCluster]; ok {
			authzClient, err := authorizationv1.NewForConfig(&restConfig)
			if err != nil {
				return fmt.Errorf("failed to construct authz client: %w", err)
			}
			bc.ssar = authzClient.SelfSubjectAccessReviews()
		}
		r.buildClients[buildCluster] = bc
	}

	if err := blder.Complete(r); err != nil {
		return fmt.Errorf("failed to build controller: %w", err)
	}

	if err := mgr.Add(manager.RunnableFunc(r.syncMetrics)); err != nil {
		return fmt.Errorf("failed to add metrics runnable to manager: %w", err)
	}

	if err := mgr.Add(manager.RunnableFunc(r.syncClusterStatus(time.Minute, knownClusters))); err != nil {
		return fmt.Errorf("failed to add cluster status runnable to manager: %w", err)
	}

	return nil
}

func newReconciler(ctx context.Context, pjClient ctrlruntimeclient.Client, overwriteReconcile reconcile.Func, cfg config.Getter, opener io.Opener, totURL string) *reconciler {
	return &reconciler{
		pjClient:           pjClient,
		buildClients:       map[string]buildClient{},
		overwriteReconcile: overwriteReconcile,
		log:                logrus.NewEntry(logrus.StandardLogger()).WithField("controller", ControllerName),
		config:             cfg,
		opener:             opener,
		totURL:             totURL,
		clock:              clock.RealClock{},
		maxConcurrencySerializationLocks: &shardedLock{
			mapLock: &sync.Mutex{},
			locks:   map[string]*sync.Mutex{},
		},
		jobQueueSerializationLocks: &shardedLock{
			mapLock: &sync.Mutex{},
			locks:   map[string]*sync.Mutex{},
		},
	}
}

type reconciler struct {
	pjClient           ctrlruntimeclient.Client
	buildClients       map[string]buildClient
	overwriteReconcile reconcile.Func
	log                *logrus.Entry
	config             config.Getter
	opener             io.Opener
	totURL             string
	clock              clock.WithTickerAndDelayedExecution
	/* maxConcurrencySerializationLocks and jobQueueSerializationLocks are used to serialize
	reconciliation of ProwJobs that have concurrency limits that might affect each other.

	The concurrency management strategy has 3 basic parts. Each part is skipped if the ProwJob
	does not specify a MaxConcurrency or JobQueueName.

	1. Serialize per the job and/or queue name as needed using these locks. This prevents
	   concurrent reconciliation threads from triggering jobs beyond the concurrency limit.
	2. Compare against the ProwJob index to see how many jobs there are for the job and job queue
	   and only trigger the job if it won't exceed the concurrency limit(s).
	3. Once the ProwJob is updated, wait until we see it updated in our cache before completing
	   processing and releasing the serialization lock(s) acquired in step 1. This is necessary
	   to prevent reconciliation threads from processing subsequent jobs before the ProwJob index
	   used in step 2 is up to date.
	*/
	maxConcurrencySerializationLocks *shardedLock
	jobQueueSerializationLocks       *shardedLock
}

type shardedLock struct {
	mapLock *sync.Mutex
	locks   map[string]*sync.Mutex
}

type buildClient struct {
	ctrlruntimeclient.Client
	ssar authorizationv1.SelfSubjectAccessReviewInterface
}

func (s *shardedLock) getLock(key string) *sync.Mutex {
	s.mapLock.Lock()
	defer s.mapLock.Unlock()
	if _, exists := s.locks[key]; !exists {
		s.locks[key] = &sync.Mutex{}
	}
	return s.locks[key]
}

func (r *reconciler) syncMetrics(ctx context.Context) error {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
			pjs := &prowv1.ProwJobList{}
			if err := r.pjClient.List(ctx, pjs, optAllProwJobs()); err != nil {
				r.log.WithError(err).Error("failed to list prowjobs for metrics")
				continue
			}
			kube.GatherProwJobMetrics(r.log, pjs.Items)
		}
	}
}

type ClusterStatus string

const (
	ClusterStatusReachable          ClusterStatus = "Reachable"
	ClusterStatusNoManager          ClusterStatus = "No-Manager"
	ClusterStatusError              ClusterStatus = "Error"
	ClusterStatusMissingPermissions ClusterStatus = "MissingPermissions"
)

func (r *reconciler) syncClusterStatus(
	interval time.Duration,
	knownClusters map[string]rest.Config,
) func(context.Context) error {
	return func(ctx context.Context) error {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return nil
			case <-ticker.C:
				location := r.config().Plank.BuildClusterStatusFile
				if location == "" {
					continue
				}
				parsedPath, err := prowv1.ParsePath(location)
				if err != nil {
					r.log.WithError(err).Errorf("Failed to parse cluster status file location: %q.", location)
					continue
				}
				// prowv1.ParsePath prepends `Path` with `/`, trim it
				bucket, subPath := parsedPath.Bucket(), strings.TrimPrefix(parsedPath.Path, "/")

				clusters := map[string]ClusterStatus{}
				for cluster := range knownClusters {
					status := ClusterStatusReachable
					client, ok := r.buildClients[cluster]
					if !ok {
						status = ClusterStatusNoManager
					} else {
						// Check for pod verbs.
						if err := flagutil.CheckAuthorizations(client.ssar, r.config().PodNamespace, RequiredTestPodVerbs()); err != nil {
							r.log.WithField("cluster", cluster).WithError(err).Warn("Error checking pod verbs to determine build cluster usability.")
							if errors.Is(err, flagutil.MissingPermissions) {
								status = ClusterStatusMissingPermissions
							} else {
								status = ClusterStatusError
							}
						}
					}
					clusters[cluster] = status
				}
				payload, err := json.Marshal(clusters)
				if err != nil {
					r.log.WithError(err).Error("Error marshaling cluster status info.")
					continue
				}
				noCache := "no-cache"
				fullStoragePath, err := providers.StoragePath(bucket, subPath)
				if err != nil {
					r.log.WithError(err).Error("Failed to resolve storage path.")
					continue
				}
				if err := io.WriteContent(ctx, r.log, r.opener, fullStoragePath, payload, io.WriterOptions{CacheControl: &noCache}); err != nil {
					r.log.WithError(err).Error("Error writing cluster status info.")
				}
			}
		}
	}
}

func (r *reconciler) Reconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) {
	if r.overwriteReconcile != nil {
		return r.overwriteReconcile(ctx, request)
	}
	return r.defaultReconcile(ctx, request)
}

func (r *reconciler) defaultReconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) {
	pj := &prowv1.ProwJob{}
	if err := r.pjClient.Get(ctx, request.NamespacedName, pj); err != nil {
		if !kerrors.IsNotFound(err) {
			return reconcile.Result{}, fmt.Errorf("failed to get prowjob %s: %w", request.Name, err)
		}

		// Objects can be deleted from the API while being in our workqueue
		return reconcile.Result{}, nil
	}
	originalPJ := pj.DeepCopy()

	res, err := r.serializeIfNeeded(ctx, pj)
	if IsTerminalError(err) {
		// Unfixable cases like missing build clusters; do not return an error to prevent requeuing.
		log := r.log.WithError(err).WithFields(pjutil.ProwJobFields(pj))
		log.Error("Reconciliation failed with terminal error and will not be requeued")
		if !pj.Complete() {
			pj.SetComplete()
			pj.Status.State = prowv1.ErrorState
			pj.Status.Description = fmt.Sprintf("Terminal error: %v.", err)
			if err := r.pjClient.Patch(ctx, pj, ctrlruntimeclient.MergeFrom(originalPJ)); err != nil {
				// If we fail to complete and mark the job as errored we will try again on the next sync loop.
				log.Errorf("Error marking job with terminal failure as errored: %v.", err)
			} else {
				log.Info("Marked job with terminal failure as errored.")
			}
		}
		return reconcile.Result{}, nil
	}
	if res == nil {
		res = &reconcile.Result{}
	}
	if err != nil {
		r.log.WithError(err).WithField("name", request.Name).Error("Reconciliation failed")
	}
	return *res, err
}

// serializeIfNeeded serializes the reconciliation of Jobs that have a MaxConcurrency or a JobQueueName set, otherwise
// multiple reconciliations of the same job or queue may race and not properly respect that setting.
func (r *reconciler) serializeIfNeeded(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
	if pj.Spec.MaxConcurrency > 0 {
		// We need to serialize handling of this job name.
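		// Note: sync.Mutex.TryLock (available since Go 1.18) returns immediately instead
		// of blocking, so a worker that loses the race requeues after a second rather
		// than stalling one of the shared reconcile workers.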
		lock := r.maxConcurrencySerializationLocks.getLock(pj.Spec.Job)
		// Use TryAcquire to avoid blocking workers waiting for the lock
		if !lock.TryLock() {
			return &reconcile.Result{RequeueAfter: time.Second}, nil
		}
		defer lock.Unlock()
	}

	if pj.Spec.JobQueueName != "" {
		// We need to serialize handling of this job queue.
		lock := r.jobQueueSerializationLocks.getLock(pj.Spec.JobQueueName)
		// Use TryAcquire to avoid blocking workers waiting for the lock
		if !lock.TryLock() {
			return &reconcile.Result{RequeueAfter: time.Second}, nil
		}
		defer lock.Unlock()
	}
	return r.reconcile(ctx, pj)
}

func (r *reconciler) reconcile(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
	// terminateDupes first, as that might reduce cluster load and prevent us
	// from doing pointless work.
	if err := r.terminateDupes(ctx, pj); err != nil {
		return nil, fmt.Errorf("terminateDupes failed: %w", err)
	}

	switch pj.Status.State {
	case prowv1.PendingState:
		return r.syncPendingJob(ctx, pj)
	case prowv1.TriggeredState:
		return r.syncTriggeredJob(ctx, pj)
	case prowv1.AbortedState:
		return nil, r.syncAbortedJob(ctx, pj)
	}

	return nil, nil
}

func (r *reconciler) terminateDupes(ctx context.Context, pj *prowv1.ProwJob) error {
	pjs := &prowv1.ProwJobList{}
	if err := r.pjClient.List(ctx, pjs, optPendingTriggeredJobsNamed(pj.Spec.Job)); err != nil {
		return fmt.Errorf("failed to list prowjobs: %w", err)
	}

	return pjutil.TerminateOlderJobs(r.pjClient, r.log, pjs.Items)
}

// syncPendingJob syncs jobs for which we already created the test workload
func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
	prevPJ := pj.DeepCopy()

	pod, podExists, err := r.pod(ctx, pj)
	if err != nil {
		return nil, err
	}

	if !podExists {
		// Pod is missing. This can happen in case the previous pod was deleted manually or by
		// a rescheduler. Start a new pod.
		id, pn, err := r.startPod(ctx, pj)
		if err != nil {
			if !isRequestError(err) {
				return nil, fmt.Errorf("error starting pod for PJ %s: %w", pj.Name, err)
			}
			pj.Status.State = prowv1.ErrorState
			pj.SetComplete()
			pj.Status.Description = fmt.Sprintf("Pod can not be created: %v", err)
			r.log.WithFields(pjutil.ProwJobFields(pj)).WithError(err).Warning("Unprocessable pod.")
		} else {
			pj.Status.BuildID = id
			pj.Status.PodName = pn
			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
		}
	} else if pod.Status.Reason == Evicted {
		// Pod was evicted.
		if pj.Spec.ErrorOnEviction {
			// ErrorOnEviction is enabled, complete the PJ and mark it as
			// errored.
			r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pod got evicted, failing job.")
			pj.SetComplete()
			pj.Status.State = prowv1.ErrorState
			pj.Status.Description = "Job pod was evicted by the cluster."
		} else {
			// ErrorOnEviction is disabled. Delete the pod now and recreate it in
			// the next resync.
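			// Before deleting, drop the kubernetes reporter finalizer (kubernetesreporterapi.FinalizerName)
			// if it is present so the pod does not hang in Terminating; the NodeLost and
			// PodUnknown branches below follow the same pattern.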
			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod got evicted, deleting & next sync loop will restart pod")
			client, ok := r.buildClients[pj.ClusterAlias()]
			if !ok {
				return nil, TerminalError(fmt.Errorf("evicted pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
			}
			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
				// We don't want the end user to see this, so we have to remove the finalizer, otherwise the pod hangs.
				oldPod := pod.DeepCopy()
				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
				}
			}
			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))
		}
	} else if pod.DeletionTimestamp != nil && pod.Status.Reason == NodeUnreachablePodReason {
		// This can happen in any phase and means the node got evicted after it became unresponsive. Delete the finalizer so the pod
		// vanishes and we will silently re-create it in the next iteration.
		r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod's node got lost, deleting & next sync loop will restart pod")
		client, ok := r.buildClients[pj.ClusterAlias()]
		if !ok {
			return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
		}

		if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
			// We don't want the end user to see this, so we have to remove the finalizer, otherwise the pod hangs.
			oldPod := pod.DeepCopy()
			pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
			if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
				return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
			}
		}

		return nil, nil
	} else {
		switch pod.Status.Phase {
		case corev1.PodUnknown:
			// Pod is in Unknown state. This can happen if there is a problem with
			// the node. Delete the old pod, this will fire an event that triggers
			// a new reconciliation in which we will re-create the pod.
			r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is in unknown state, deleting & restarting pod")
			client, ok := r.buildClients[pj.ClusterAlias()]
			if !ok {
				return nil, TerminalError(fmt.Errorf("unknown pod %s: unknown cluster alias %q", pod.Name, pj.ClusterAlias()))
			}

			if finalizers := sets.New[string](pod.Finalizers...); finalizers.Has(kubernetesreporterapi.FinalizerName) {
				// We don't want the end user to see this, so we have to remove the finalizer, otherwise the pod hangs.
				oldPod := pod.DeepCopy()
				pod.Finalizers = finalizers.Delete(kubernetesreporterapi.FinalizerName).UnsortedList()
				if err := client.Patch(ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
					return nil, fmt.Errorf("failed to patch pod trying to remove %s finalizer: %w", kubernetesreporterapi.FinalizerName, err)
				}
			}
			r.log.WithField("name", pj.ObjectMeta.Name).Debug("Delete Pod.")
			return nil, ctrlruntimeclient.IgnoreNotFound(client.Delete(ctx, pod))

		case corev1.PodSucceeded:
			pj.SetComplete()
			// There were bugs around this in the past so be paranoid and verify each container.
			// https://github.com/kubernetes/kubernetes/issues/58711 is only fixed in 1.18+
			if didPodSucceed(pod) {
				// Pod succeeded. Update ProwJob and talk to GitHub.
				pj.Status.State = prowv1.SuccessState
				pj.Status.Description = "Job succeeded."
			} else {
				pj.Status.State = prowv1.ErrorState
				pj.Status.Description = "Pod was in succeeded phase but some containers didn't finish"
			}

		case corev1.PodFailed:
			// Pod failed. Update ProwJob, talk to GitHub.
			pj.SetComplete()
			pj.Status.State = prowv1.FailureState
			pj.Status.Description = "Job failed."

		case corev1.PodPending:
			var requeueAfter time.Duration
			maxPodPending := r.config().Plank.PodPendingTimeout.Duration
			if pj.Spec.DecorationConfig != nil && pj.Spec.DecorationConfig.PodPendingTimeout != nil {
				maxPodPending = pj.Spec.DecorationConfig.PodPendingTimeout.Duration
			}
			maxPodUnscheduled := r.config().Plank.PodUnscheduledTimeout.Duration
			if pj.Spec.DecorationConfig != nil && pj.Spec.DecorationConfig.PodUnscheduledTimeout != nil {
				maxPodUnscheduled = pj.Spec.DecorationConfig.PodUnscheduledTimeout.Duration
			}
			if pod.Status.StartTime.IsZero() {
				if time.Since(pod.CreationTimestamp.Time) >= maxPodUnscheduled {
					// Pod is stuck in unscheduled state longer than maxPodUnscheduled:
					// abort the job, and talk to GitHub.
					pj.SetComplete()
					pj.Status.State = prowv1.ErrorState
					pj.Status.Description = "Pod scheduling timeout."
					r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Marked job for stale unscheduled pod as errored.")
					if err := r.deletePod(ctx, pj); err != nil {
						return nil, fmt.Errorf("failed to delete pod %s/%s in cluster %s: %w", pod.Namespace, pod.Name, pj.ClusterAlias(), err)
					}
					break
				} else {
					// We have to re-check on the pod once we reached maxPodUnscheduled to
					// be able to fail the job if it didn't get scheduled by then.
					requeueAfter = maxPodUnscheduled - time.Since(pod.CreationTimestamp.Time)
				}
			} else {
				if time.Since(pod.Status.StartTime.Time) >= maxPodPending {
					// Pod is stuck in pending state longer than maxPodPending:
					// abort the job, and talk to GitHub.
					pj.SetComplete()
					pj.Status.State = prowv1.ErrorState
					pj.Status.Description = "Pod pending timeout."
					r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Marked job for stale pending pod as errored.")
					if err := r.deletePod(ctx, pj); err != nil {
						return nil, fmt.Errorf("failed to delete pod %s/%s in cluster %s: %w", pod.Namespace, pod.Name, pj.ClusterAlias(), err)
					}
					break
				} else {
					// We have to re-check on the pod once we reached maxPodPending to
					// be able to fail the job if it didn't start running by then.
					requeueAfter = maxPodPending - time.Since(pod.Status.StartTime.Time)
				}
			}
			// Pod didn't start but didn't reach the scheduling or pending timeout yet,
			// do nothing but check on it again once the timeout is reached.
			if pod.DeletionTimestamp == nil {
				return &reconcile.Result{RequeueAfter: requeueAfter}, nil
			}
		case corev1.PodRunning:
			if pod.DeletionTimestamp != nil {
				break
			}
			maxPodRunning := r.config().Plank.PodRunningTimeout.Duration
			if pj.Spec.DecorationConfig != nil && pj.Spec.DecorationConfig.PodRunningTimeout != nil {
				maxPodRunning = pj.Spec.DecorationConfig.PodRunningTimeout.Duration
			}
			if pod.Status.StartTime.IsZero() || time.Since(pod.Status.StartTime.Time) < maxPodRunning {
				// Pod is still running. Do nothing.
				return nil, nil
			}

			// Pod is stuck in running state longer than maxPodRunning:
			// abort the job, and talk to GitHub.
			pj.SetComplete()
			pj.Status.State = prowv1.AbortedState
			pj.Status.Description = "Pod running timeout."
			if err := r.deletePod(ctx, pj); err != nil {
				return nil, fmt.Errorf("failed to delete pod %s/%s in cluster %s: %w", pod.Namespace, pod.Name, pj.ClusterAlias(), err)
			}
		default:
			if pod.DeletionTimestamp == nil {
				// other states, ignore
				return nil, nil
			}
		}
	}

	// If the kubernetes reporter is used it sets a finalizer, so a pod that gets deleted
	// unexpectedly might be in any phase and will stick around until we complete the job.
	if !pj.Complete() && pod != nil && pod.DeletionTimestamp != nil {
		pj.SetComplete()
		pj.Status.State = prowv1.ErrorState
		pj.Status.Description = "Pod got deleted unexpectedly"
	}

	pj.Status.URL, err = pjutil.JobURL(r.config().Plank, *pj, r.log)
	if err != nil {
		r.log.WithFields(pjutil.ProwJobFields(pj)).WithError(err).Warn("failed to get jobURL")
	}

	if prevPJ.Status.State != pj.Status.State {
		r.log.WithFields(pjutil.ProwJobFields(pj)).
			WithField("from", prevPJ.Status.State).
			WithField("to", pj.Status.State).Info("Transitioning states.")
	}

	if err := r.pjClient.Patch(ctx, pj.DeepCopy(), ctrlruntimeclient.MergeFrom(prevPJ)); err != nil {
		return nil, fmt.Errorf("patching prowjob: %w", err)
	}

	// If the ProwJob state has changed, we must ensure that the update reaches the cache before
	// processing the key again. Without this we might accidentally replace intentionally deleted pods
	// or otherwise incorrectly react to stale ProwJob state.
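	// The wait below polls the informer cache every 100ms for up to 2s until the cached
	// ProwJob reflects the state we just patched; if it never does, returning the error
	// requeues this ProwJob rather than acting on stale data.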
	state := pj.Status.State
	if prevPJ.Status.State == state {
		return nil, nil
	}
	nn := types.NamespacedName{Namespace: pj.Namespace, Name: pj.Name}
	if err := wait.Poll(100*time.Millisecond, 2*time.Second, func() (bool, error) {
		if err := r.pjClient.Get(ctx, nn, pj); err != nil {
			return false, fmt.Errorf("failed to get prowjob: %w", err)
		}
		return pj.Status.State == state, nil
	}); err != nil {
		return nil, fmt.Errorf("failed to wait for cached prowjob %s to get into state %s: %w", nn.String(), state, err)
	}

	return nil, nil
}

// syncTriggeredJob syncs jobs that do not yet have an associated test workload running
func (r *reconciler) syncTriggeredJob(ctx context.Context, pj *prowv1.ProwJob) (*reconcile.Result, error) {
	prevPJ := pj.DeepCopy()

	var id, pn string

	pod, podExists, err := r.pod(ctx, pj)
	if err != nil {
		return nil, err
	}
	// We may end up in a state where the pod exists but the prowjob is not
	// updated to pending if we successfully created a new pod in a previous
	// sync but the prowjob update failed. In that case, skip creating a new
	// pod and just rerun the prowjob update.
	if podExists {
		id = getPodBuildID(pod)
		pn = pod.ObjectMeta.Name
	} else {
		// Do not start more jobs than specified and check again later.
		canExecuteConcurrently, err := r.canExecuteConcurrently(ctx, pj)
		if err != nil {
			return nil, fmt.Errorf("canExecuteConcurrently: %w", err)
		}
		if !canExecuteConcurrently {
			return &reconcile.Result{RequeueAfter: 10 * time.Second}, nil
		}
		// We haven't started the pod yet. Do so.
		id, pn, err = r.startPod(ctx, pj)
		if err != nil {
			if !isRequestError(err) {
				return nil, fmt.Errorf("error starting pod: %w", err)
			}
			pj.Status.State = prowv1.ErrorState
			pj.SetComplete()
			pj.Status.Description = fmt.Sprintf("Pod can not be created: %v", err)
			logrus.WithField("job", pj.Spec.Job).WithError(err).Warning("Unprocessable pod.")
		}
	}

	if pj.Status.State == prowv1.TriggeredState {
		// BuildID needs to be set before we execute the job URL template.
		pj.Status.BuildID = id
		now := metav1.NewTime(r.clock.Now())
		pj.Status.PendingTime = &now
		pj.Status.State = prowv1.PendingState
		pj.Status.PodName = pn
		pj.Status.Description = "Job triggered."
		pj.Status.URL, err = pjutil.JobURL(r.config().Plank, *pj, r.log)
		if err != nil {
			r.log.WithFields(pjutil.ProwJobFields(pj)).WithError(err).Warn("failed to get jobURL")
		}
	}

	if prevPJ.Status.State != pj.Status.State {
		r.log.WithFields(pjutil.ProwJobFields(pj)).
			WithField("from", prevPJ.Status.State).
			WithField("to", pj.Status.State).Info("Transitioning states.")
	}
	if err := r.pjClient.Patch(ctx, pj.DeepCopy(), ctrlruntimeclient.MergeFrom(prevPJ)); err != nil {
		return nil, fmt.Errorf("patch prowjob: %w", err)
	}

	// If the job has either MaxConcurrency or JobQueueName configured, we must block here until we observe the state transition in our cache,
	// otherwise subsequent reconciliations for a different run of the same job might incorrectly conclude that they
	// can run because that decision is made based on the data in the cache.
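	// This is step 3 of the concurrency strategy described on the reconciler struct: the
	// serialization lock acquired in serializeIfNeeded is only released (via defer) after
	// this wait returns, so the next lock holder sees an up-to-date ProwJob index.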
	if pj.Spec.MaxConcurrency == 0 && pj.Spec.JobQueueName == "" {
		return nil, nil
	}
	nn := types.NamespacedName{Namespace: pj.Namespace, Name: pj.Name}
	state := pj.Status.State
	if err := wait.Poll(100*time.Millisecond, 2*time.Second, func() (bool, error) {
		if err := r.pjClient.Get(ctx, nn, pj); err != nil {
			return false, fmt.Errorf("failed to get prowjob: %w", err)
		}
		return pj.Status.State == state, nil
	}); err != nil {
		return nil, fmt.Errorf("failed to wait for cached prowjob %s to get into state %s: %w", nn.String(), state, err)
	}

	return nil, nil
}

// syncAbortedJob syncs jobs that got aborted because their result isn't needed anymore,
// for example because of a new push or because a pull request got closed.
func (r *reconciler) syncAbortedJob(ctx context.Context, pj *prowv1.ProwJob) error {

	buildClient, ok := r.buildClients[pj.ClusterAlias()]
	if !ok {
		return TerminalError(fmt.Errorf("no build client available for cluster %s", pj.ClusterAlias()))
	}

	// Just optimistically delete and swallow the potential 404
	pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:      pj.Name,
		Namespace: r.config().PodNamespace,
	}}
	if err := ctrlruntimeclient.IgnoreNotFound(buildClient.Delete(ctx, pod)); err != nil {
		return fmt.Errorf("failed to delete pod %s/%s in cluster %s: %w", pod.Namespace, pod.Name, pj.ClusterAlias(), err)
	}

	originalPJ := pj.DeepCopy()
	pj.SetComplete()
	return r.pjClient.Patch(ctx, pj, ctrlruntimeclient.MergeFrom(originalPJ))
}

// pod gets the pod for a ProwJob; it returns the pod, whether the pod exists, and an error.
func (r *reconciler) pod(ctx context.Context, pj *prowv1.ProwJob) (*corev1.Pod, bool, error) {
	buildClient, buildClientExists := r.buildClients[pj.ClusterAlias()]
	if !buildClientExists {
		return nil, false, TerminalError(fmt.Errorf("no build client found for cluster %q", pj.ClusterAlias()))
	}

	pod := &corev1.Pod{}
	name := types.NamespacedName{
		Namespace: r.config().PodNamespace,
		Name:      pj.Name,
	}

	if err := buildClient.Get(ctx, name, pod); err != nil {
		if kerrors.IsNotFound(err) {
			return nil, false, nil
		}
		return nil, false, fmt.Errorf("failed to get pod: %w", err)
	}

	return pod, true, nil
}

func (r *reconciler) deletePod(ctx context.Context, pj *prowv1.ProwJob) error {
	buildClient, buildClientExists := r.buildClients[pj.ClusterAlias()]
	if !buildClientExists {
		return TerminalError(fmt.Errorf("no build client found for cluster %q", pj.ClusterAlias()))
	}

	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: r.config().PodNamespace,
			Name:      pj.Name,
		},
	}

	if err := ctrlruntimeclient.IgnoreNotFound(buildClient.Delete(ctx, pod)); err != nil {
		return fmt.Errorf("failed to delete pod: %w", err)
	}

	r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Deleted stale running pod.")
	return nil
}

func (r *reconciler) startPod(ctx context.Context, pj *prowv1.ProwJob) (string, string, error) {
	buildID, err := r.getBuildID(pj.Spec.Job)
	if err != nil {
		return "", "", fmt.Errorf("error getting build ID: %w", err)
	}

	pj.Status.BuildID = buildID
	pod, err := decorate.ProwJobToPod(*pj)
	if err != nil {
		return "", "", err
	}
	pod.Namespace = r.config().PodNamespace
	// Add prow version as a label for better debugging prowjobs.
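	// decorate.ProwJobToPod is assumed to have populated pod.ObjectMeta.Labels (it derives
	// labels from the ProwJob); otherwise the assignment below would panic on a nil map.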
	pod.ObjectMeta.Labels[kube.PlankVersionLabel] = version.Version
	podName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}

	client, ok := r.buildClients[pj.ClusterAlias()]
	if !ok {
		return "", "", TerminalError(fmt.Errorf("unknown cluster alias %q", pj.ClusterAlias()))
	}
	err = client.Create(ctx, pod)
	r.log.WithFields(pjutil.ProwJobFields(pj)).Debug("Create Pod.")
	if err != nil {
		return "", "", fmt.Errorf("create pod %s in cluster %s: %w", podName.String(), pj.ClusterAlias(), err)
	}

	// We must block until we see the pod, otherwise a new reconciliation may be triggered that tries to create
	// the pod because it's not in the cache yet, errors with IsAlreadyExists and sets the prowjob to failed.
	if err := wait.Poll(100*time.Millisecond, 10*time.Second, func() (bool, error) {
		if err := client.Get(ctx, podName, pod); err != nil {
			if kerrors.IsNotFound(err) {
				return false, nil
			}
			return false, fmt.Errorf("failed to get pod %s in cluster %s: %w", podName.String(), pj.ClusterAlias(), err)
		}
		return true, nil
	}); err != nil {
		return "", "", fmt.Errorf("failed waiting for new pod %s in cluster %s to appear in cache: %w", podName.String(), pj.ClusterAlias(), err)
	}

	return buildID, pod.Name, nil
}

func (r *reconciler) getBuildID(name string) (string, error) {
	return pjutil.GetBuildID(name, r.totURL)
}

// canExecuteConcurrently determines if the concurrency settings allow our job
// to be started. We start jobs with a limited concurrency in order, oldest
// first. This allows us to get away without any global locking by just looking
// at the jobs in the cluster.
func (r *reconciler) canExecuteConcurrently(ctx context.Context, pj *prowv1.ProwJob) (bool, error) {

	if max := r.config().Plank.MaxConcurrency; max > 0 {
		pjs := &prowv1.ProwJobList{}
		if err := r.pjClient.List(ctx, pjs, optPendingProwJobs()); err != nil {
			return false, fmt.Errorf("failed to list prowjobs: %w", err)
		}
		// The list contains our own ProwJob
		running := len(pjs.Items) - 1
		if running >= max {
			r.log.WithFields(pjutil.ProwJobFields(pj)).Infof("Not starting another job, already %d running.", running)
			return false, nil
		}
	}

	if canExecute, err := r.canExecuteConcurrentlyPerJob(ctx, pj); err != nil || !canExecute {
		return canExecute, err
	}

	return r.canExecuteConcurrentlyPerQueue(ctx, pj)
}

func (r *reconciler) canExecuteConcurrentlyPerJob(ctx context.Context, pj *prowv1.ProwJob) (bool, error) {
	if pj.Spec.MaxConcurrency == 0 {
		return true, nil
	}

	pjs := &prowv1.ProwJobList{}
	if err := r.pjClient.List(ctx, pjs, optPendingTriggeredJobsNamed(pj.Spec.Job)); err != nil {
		return false, fmt.Errorf("failed listing prowjobs: %w", err)
	}
	r.log.Infof("got %d not completed with same name", len(pjs.Items))

	pendingOrOlderMatchingPJs := countPendingOrOlderTriggeredMatchingPJs(*pj, pjs.Items)
	if pendingOrOlderMatchingPJs >= pj.Spec.MaxConcurrency {
		r.log.WithFields(pjutil.ProwJobFields(pj)).
			Debugf("Not starting another instance of %s, have %d instances that are pending or older, %d is the limit",
				pj.Spec.Job, pendingOrOlderMatchingPJs, pj.Spec.MaxConcurrency)
		return false, nil
	}

	return true, nil
}

func (r *reconciler) canExecuteConcurrentlyPerQueue(ctx context.Context, pj *prowv1.ProwJob) (bool, error) {
	queueName := pj.Spec.JobQueueName
	if queueName == "" {
		return true, nil
	}

	queueConcurrency, queueDefined := r.config().Plank.JobQueueCapacities[queueName]
	if !queueDefined {
		return false, fmt.Errorf("failed to match queue name '%s' with Plank configuration", queueName)
	}
	if queueConcurrency == 0 {
		return false, nil
	}
	if queueConcurrency < 0 {
		return true, nil
	}

	pjs := &prowv1.ProwJobList{}
	if err := r.pjClient.List(ctx, pjs, optPendingTriggeredJobsInQueue(queueName)); err != nil {
		return false, fmt.Errorf("failed listing prowjobs in queue %s: %w", queueName, err)
	}
	r.log.Infof("got %d not completed within queue %s", len(pjs.Items), queueName)

	pendingOrOlderMatchingPJs := countPendingOrOlderTriggeredMatchingPJs(*pj, pjs.Items)
	if pendingOrOlderMatchingPJs >= queueConcurrency {
		r.log.WithFields(pjutil.ProwJobFields(pj)).
			Debugf("Not starting another instance of %s, have %d instances in queue %s that are pending or older, %d is the limit",
				pj.Spec.Job, pendingOrOlderMatchingPJs, queueName, queueConcurrency)
		return false, nil
	}

	return true, nil
}

func predicates(additionalSelector string, callback func(bool)) (predicate.Predicate, error) {
	rawSelector := fmt.Sprintf("%s=true", kube.CreatedByProw)
	if additionalSelector != "" {
		rawSelector = fmt.Sprintf("%s,%s", rawSelector, additionalSelector)
	}
	selector, err := labels.Parse(rawSelector)
	if err != nil {
		return nil, fmt.Errorf("failed to parse label selector %s: %w", rawSelector, err)
	}

	return predicate.NewPredicateFuncs(func(o ctrlruntimeclient.Object) bool {
		result := func() bool {
			pj, ok := o.(*prowv1.ProwJob)
			if !ok {
				// We ignore pods that do not match our selector
				return selector.Matches(labels.Set(o.GetLabels()))
			}

			// We can ignore completed prowjobs
			if pj.Complete() {
				return false
			}

			return pj.Spec.Agent == prowv1.KubernetesAgent && pj.Status.State != prowv1.SchedulingState
		}()
		if callback != nil {
			callback(result)
		}
		return result
	}), nil
}

func podEventRequestMapper(prowJobNamespace string) handler.EventHandler {
	return handler.EnqueueRequestsFromMapFunc(func(o ctrlruntimeclient.Object) []reconcile.Request {
		return []reconcile.Request{{NamespacedName: ctrlruntimeclient.ObjectKey{
			Namespace: prowJobNamespace,
			Name:      o.GetName(),
		}}}
	})
}

const (
	// prowJobIndexName is the name of an index that
	// holds all ProwJobs that are in the correct namespace
	// and use the Kubernetes agent
	prowJobIndexName = "plank-prow-jobs"
	// prowJobIndexKeyAll is the indexKey for all ProwJobs
	prowJobIndexKeyAll = "all"
	// prowJobIndexKeyPending is the indexKey for prowjobs
	// that are currently pending AKA a corresponding pod
	// exists but didn't yet finish
	prowJobIndexKeyPending = "pending"
)

func pendingTriggeredIndexKeyByName(jobName string) string {
	return fmt.Sprintf("pending-triggered-named-%s", jobName)
}

func pendingTriggeredIndexKeyByJobQueueName(jobQueueName string) string {
	return fmt.Sprintf("pending-triggered-with-job-queue-name-%s", jobQueueName)
}

func prowJobIndexer(prowJobNamespace string) ctrlruntimeclient.IndexerFunc {
	return func(o ctrlruntimeclient.Object) []string {
		pj := o.(*prowv1.ProwJob)
		if pj.Namespace != prowJobNamespace || pj.Spec.Agent != prowv1.KubernetesAgent {
			return nil
		}

		indexes := []string{prowJobIndexKeyAll}

		if pj.Status.State == prowv1.PendingState {
			indexes = append(indexes, prowJobIndexKeyPending)
		}

		if pj.Status.State == prowv1.PendingState || pj.Status.State == prowv1.TriggeredState {
			indexes = append(indexes, pendingTriggeredIndexKeyByName(pj.Spec.Job))

			if pj.Spec.JobQueueName != "" {
				indexes = append(indexes, pendingTriggeredIndexKeyByJobQueueName(pj.Spec.JobQueueName))
			}
		}

		return indexes
	}
}

func optAllProwJobs() ctrlruntimeclient.ListOption {
	return ctrlruntimeclient.MatchingFields{prowJobIndexName: prowJobIndexKeyAll}
}

func optPendingProwJobs() ctrlruntimeclient.ListOption {
	return ctrlruntimeclient.MatchingFields{prowJobIndexName: prowJobIndexKeyPending}
}

func optPendingTriggeredJobsNamed(name string) ctrlruntimeclient.ListOption {
	return ctrlruntimeclient.MatchingFields{prowJobIndexName: pendingTriggeredIndexKeyByName(name)}
}

func optPendingTriggeredJobsInQueue(queueName string) ctrlruntimeclient.ListOption {
	return ctrlruntimeclient.MatchingFields{prowJobIndexName: pendingTriggeredIndexKeyByJobQueueName(queueName)}
}

func didPodSucceed(p *corev1.Pod) bool {
	if p.Status.Phase != corev1.PodSucceeded {
		return false
	}
	for _, container := range append(p.Status.ContainerStatuses, p.Status.InitContainerStatuses...) {
		if container.State.Terminated == nil || container.State.Terminated.ExitCode != 0 || container.State.Terminated.FinishedAt.IsZero() {
			return false
		}
	}

	return true
}

func getPodBuildID(pod *corev1.Pod) string {
	if buildID, ok := pod.ObjectMeta.Labels[kube.ProwBuildIDLabel]; ok && buildID != "" {
		return buildID
	}

	// For backwards compatibility: existing pods may not have the buildID label.
	for _, env := range pod.Spec.Containers[0].Env {
		if env.Name == "BUILD_ID" {
			return env.Value
		}
	}

	logrus.Warningf("BUILD_ID was not found in pod %q: streaming logs from deck will not work", pod.ObjectMeta.Name)
	return ""
}

// isRequestError extracts an HTTP status code from a kerrors.APIStatus and
// returns true if it is a 4xx error.
func isRequestError(err error) bool {
	var code int32 = 500 // This is what kerrors.ReasonForError() defaults to.
	if status := kerrors.APIStatus(nil); errors.As(err, &status) {
		code = status.Status().Code
	}
	return 400 <= code && code < 500
}

func countPendingOrOlderTriggeredMatchingPJs(pj prowv1.ProwJob, pjs []prowv1.ProwJob) int {
	var pendingOrOlderTriggeredMatchingPJs int

	for _, foundPJ := range pjs {
		// Ignore self here.
		if foundPJ.UID == pj.UID {
			continue
		}
		if foundPJ.Status.State == prowv1.PendingState {
			pendingOrOlderTriggeredMatchingPJs++
			continue
		}

		// At this point, if foundPJ is older than our prowjob it gets
		// prioritized to make sure we execute jobs in creation order.
		if foundPJ.Status.State == prowv1.TriggeredState &&
			foundPJ.CreationTimestamp.Before(&pj.CreationTimestamp) {
			pendingOrOlderTriggeredMatchingPJs++
		}
	}

	return pendingOrOlderTriggeredMatchingPJs
}