sigs.k8s.io/prow@v0.0.0-20240503223140-c5e374dc7eb1/cmd/sinker/main.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"io/fs"
	"os"
	"path"
	"path/filepath"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"
	corev1api "k8s.io/api/core/v1"
	k8serrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
	ctrlruntimelog "sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/prow/pkg/pjutil/pprof"

	prowapi "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
	"sigs.k8s.io/prow/pkg/config"
	kubernetesreporterapi "sigs.k8s.io/prow/pkg/crier/reporters/gcs/kubernetes/api"
	"sigs.k8s.io/prow/pkg/flagutil"
	configflagutil "sigs.k8s.io/prow/pkg/flagutil/config"
	"sigs.k8s.io/prow/pkg/interrupts"
	"sigs.k8s.io/prow/pkg/kube"
	"sigs.k8s.io/prow/pkg/logrusutil"
	"sigs.k8s.io/prow/pkg/metrics"
	"sigs.k8s.io/prow/pkg/pjutil"
	_ "sigs.k8s.io/prow/pkg/version"
)

// options holds the command-line configuration for sinker.
type options struct {
	runOnce                bool
	config                 configflagutil.ConfigOptions
	dryRun                 bool
	kubernetes             flagutil.KubernetesOptions
	instrumentationOptions flagutil.InstrumentationOptions
}

// Deletion reasons, used as the "reason" label value on the cleanup metrics.
const (
	reasonPodAged     = "aged"
	reasonPodOrphaned = "orphaned"
	reasonPodTTLed    = "ttled"

	reasonProwJobAged         = "aged"
	reasonProwJobAgedPeriodic = "aged-periodic"
)

func gatherOptions(fs *flag.FlagSet, args ...string) options {
	o := options{}
	fs.BoolVar(&o.runOnce, "run-once", false, "If true, run only once then quit.")

	fs.BoolVar(&o.dryRun, "dry-run", true, "Whether or not to make mutating API calls to Kubernetes.")

	o.config.AddFlags(fs)
	o.kubernetes.AddFlags(fs)
	o.instrumentationOptions.AddFlags(fs)
	fs.Parse(args)
	return o
}

func (o *options) Validate() error {
	if err := o.kubernetes.Validate(o.dryRun); err != nil {
		return err
	}

	if err := o.config.Validate(o.dryRun); err != nil {
		return err
	}

	return nil
}

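// Example invocation (illustrative only; --config-path and --job-config-path
// are registered by the shared config flag group, and further flags come from
// the kubernetes and instrumentation flag groups, so consult --help for the
// authoritative set):
//
//	sinker \
//		--config-path=/etc/config/config.yaml \
//		--job-config-path=/etc/job-config \
//		--dry-run=false
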
func main() {
	logrusutil.ComponentInit()

	o := gatherOptions(flag.NewFlagSet(os.Args[0], flag.ExitOnError), os.Args[1:]...)
	if err := o.Validate(); err != nil {
		logrus.WithError(err).Fatal("Invalid options")
	}

	defer interrupts.WaitForGracefulShutdown()

	pprof.Instrument(o.instrumentationOptions)

	configAgent, err := o.config.ConfigAgent()
	if err != nil {
		logrus.WithError(err).Fatal("Error starting config agent.")
	}
	cfg := configAgent.Config
	o.kubernetes.SetDisabledClusters(sets.New[string](cfg().DisabledClusters...))

	if o.config.JobConfigPath != "" {
		go jobConfigMapMonitor(5*time.Minute, o.config.JobConfigPath)
	}

	metrics.ExposeMetrics("sinker", cfg().PushGateway, o.instrumentationOptions.MetricsPort)

	ctrlruntimelog.SetLogger(zap.New(zap.JSONEncoder()))

	infrastructureClusterConfig, err := o.kubernetes.InfrastructureClusterConfig(o.dryRun)
	if err != nil {
		logrus.WithError(err).Fatal("Error getting config for infrastructure cluster")
	}

	// The watch apimachinery doesn't support restarts, so just exit the binary if a kubeconfig changes
	// to make the kubelet restart us.
	if err := o.kubernetes.AddKubeconfigChangeCallback(func() {
		logrus.Info("Kubeconfig changed, exiting to trigger a restart")
		interrupts.Terminate()
	}); err != nil {
		logrus.WithError(err).Fatal("Failed to register kubeconfig change callback")
	}

	opts := manager.Options{
		MetricsBindAddress:            "0",
		Namespace:                     cfg().ProwJobNamespace,
		LeaderElection:                true,
		LeaderElectionNamespace:       configAgent.Config().ProwJobNamespace,
		LeaderElectionID:              "prow-sinker-leaderlock",
		LeaderElectionReleaseOnCancel: true,
	}
	mgr, err := manager.New(infrastructureClusterConfig, opts)
	if err != nil {
		logrus.WithError(err).Fatal("Error creating manager")
	}

	// The watch apimachinery doesn't support restarts, so just exit the
	// binary if a build cluster that initially failed to connect becomes
	// reachable later.
	callBack := func() {
		logrus.Info("Build cluster that initially failed to connect is now reachable, exiting to trigger a restart.")
		interrupts.Terminate()
	}

	// We require operating on test pods in build clusters with the following
	// verbs. This is used during startup to check that we have the necessary
	// authorizations on build clusters.
	requiredTestPodVerbs := []string{
		"delete",
		"list",
		"watch",
		"get",
		"patch",
	}

	buildManagers, err := o.kubernetes.BuildClusterManagers(o.dryRun,
		requiredTestPodVerbs,
		// Exit the binary so the kubelet restarts us if a build cluster
		// becomes reachable later (see the comment on callBack above).
		callBack,
		func(o *manager.Options) {
			o.Namespace = cfg().PodNamespace
		},
	)
	if err != nil {
		logrus.WithError(err).Error("Failed to construct build cluster managers. Is there a bad entry in the kubeconfig secret?")
	}

	buildClusterClients := map[string]ctrlruntimeclient.Client{}
	for clusterName, buildManager := range buildManagers {
		if err := mgr.Add(buildManager); err != nil {
			logrus.WithError(err).Fatal("Failed to add build cluster manager to main manager")
		}
		buildClusterClients[clusterName] = buildManager.GetClient()
	}

	c := controller{
		ctx:           context.Background(),
		logger:        logrus.NewEntry(logrus.StandardLogger()),
		prowJobClient: mgr.GetClient(),
		podClients:    buildClusterClients,
		config:        cfg,
		runOnce:       o.runOnce,
	}
	if err := mgr.Add(&c); err != nil {
		logrus.WithError(err).Fatal("Failed to add controller to manager")
	}
	if err := mgr.Start(interrupts.Context()); err != nil {
		logrus.WithError(err).Fatal("Failed to start manager")
	}
	logrus.Info("Manager ended gracefully")
}

// controller performs the periodic cleanup passes over ProwJobs in the
// infrastructure cluster and test pods in the build clusters.
type controller struct {
	ctx           context.Context
	logger        *logrus.Entry
	prowJobClient ctrlruntimeclient.Client
	podClients    map[string]ctrlruntimeclient.Client
	config        config.Getter
	runOnce       bool
}

// Start runs cleanup passes until the context is canceled, which makes the
// controller usable as a manager.Runnable.
func (c *controller) Start(ctx context.Context) error {
	runChan := make(chan struct{})

	// We want to be able to dynamically adjust to changed config values, hence we can't use a time.Ticker.
	go func() {
		for {
			runChan <- struct{}{}
			time.Sleep(c.config().Sinker.ResyncPeriod.Duration)
		}
	}()

	for {
		select {
		case <-ctx.Done():
			c.logger.Info("stop signal received, quitting")
			return nil
		case <-runChan:
			start := time.Now()
			c.clean()
			c.logger.Infof("Sync time: %v", time.Since(start))
			if c.runOnce {
				return nil
			}
		}
	}
}

// sinkerReconciliationMetrics accumulates counts for a single cleanup pass.
// Note: podsCreated and prowJobsCreated record the number of pods/ProwJobs
// that existed when the pass started, matching the *_existing gauges below.
type sinkerReconciliationMetrics struct {
	podsCreated            int
	startAt                time.Time
	finishedAt             time.Time
	podsRemoved            map[string]int
	podRemovalErrors       map[string]int
	prowJobsCreated        int
	prowJobsCleaned        map[string]int
	prowJobsCleaningErrors map[string]int
}

// Prometheus metrics.
var (
	sinkerMetrics = struct {
		podsCreated            prometheus.Gauge
		timeUsed               prometheus.Gauge
		podsRemoved            *prometheus.GaugeVec
		podRemovalErrors       *prometheus.GaugeVec
		prowJobsCreated        prometheus.Gauge
		prowJobsCleaned        *prometheus.GaugeVec
		prowJobsCleaningErrors *prometheus.GaugeVec
		jobConfigMapSize       *prometheus.GaugeVec
	}{
		podsCreated: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "sinker_pods_existing",
			Help: "Number of the existing pods in each sinker cleaning.",
		}),
		timeUsed: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "sinker_loop_duration_seconds",
			Help: "Time used in each sinker cleaning.",
		}),
		podsRemoved: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_pods_removed",
			Help: "Number of pods removed in each sinker cleaning.",
		}, []string{
			"reason",
		}),
		podRemovalErrors: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_pod_removal_errors",
			Help: "Number of errors which occurred in each sinker pod cleaning.",
		}, []string{
			"reason",
		}),
		prowJobsCreated: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "sinker_prow_jobs_existing",
			Help: "Number of the existing prow jobs in each sinker cleaning.",
		}),
		prowJobsCleaned: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_prow_jobs_cleaned",
			Help: "Number of prow jobs cleaned in each sinker cleaning.",
		}, []string{
			"reason",
		}),
		prowJobsCleaningErrors: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_prow_jobs_cleaning_errors",
			Help: "Number of errors which occurred in each sinker prow job cleaning.",
		}, []string{
			"reason",
		}),
		jobConfigMapSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "job_configmap_size",
			Help: "Size of ConfigMap storing central job configuration files (gzipped) in bytes.",
		}, []string{
			"name",
		}),
	}
)
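
// These are gauges that clean() overwrites once per cleanup pass. A labeled
// series (for example, a particular removal reason) that does not occur in a
// given pass keeps the value from the last pass that set it, since gauges,
// unlike counters, only change when explicitly Set.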

func init() {
	prometheus.MustRegister(sinkerMetrics.podsCreated)
	prometheus.MustRegister(sinkerMetrics.timeUsed)
	prometheus.MustRegister(sinkerMetrics.podsRemoved)
	prometheus.MustRegister(sinkerMetrics.podRemovalErrors)
	prometheus.MustRegister(sinkerMetrics.prowJobsCreated)
	prometheus.MustRegister(sinkerMetrics.prowJobsCleaned)
	prometheus.MustRegister(sinkerMetrics.prowJobsCleaningErrors)
	prometheus.MustRegister(sinkerMetrics.jobConfigMapSize)
}

func (m *sinkerReconciliationMetrics) getTimeUsed() time.Duration {
	return m.finishedAt.Sub(m.startAt)
}

func (c *controller) clean() {
	metrics := sinkerReconciliationMetrics{
		startAt:                time.Now(),
		podsRemoved:            map[string]int{},
		podRemovalErrors:       map[string]int{},
		prowJobsCleaned:        map[string]int{},
		prowJobsCleaningErrors: map[string]int{},
	}

	// Clean up old prow jobs first.
	prowJobs := &prowapi.ProwJobList{}
	if err := c.prowJobClient.List(c.ctx, prowJobs, ctrlruntimeclient.InNamespace(c.config().ProwJobNamespace)); err != nil {
		c.logger.WithError(err).Error("Error listing prow jobs.")
		return
	}
	metrics.prowJobsCreated = len(prowJobs.Items)

	// Only delete a pod if its prowjob is marked as finished.
	pjMap := map[string]*prowapi.ProwJob{}
	isFinished := sets.New[string]()

	maxProwJobAge := c.config().Sinker.MaxProwJobAge.Duration
	for i, prowJob := range prowJobs.Items {
		pjMap[prowJob.ObjectMeta.Name] = &prowJobs.Items[i]
		// Handle periodics separately.
		if prowJob.Spec.Type == prowapi.PeriodicJob {
			continue
		}
		if !prowJob.Complete() {
			continue
		}
		isFinished.Insert(prowJob.ObjectMeta.Name)
		if time.Since(prowJob.Status.StartTime.Time) <= maxProwJobAge {
			continue
		}
		if err := c.prowJobClient.Delete(c.ctx, &prowJob); err == nil {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).Info("Deleted prowjob.")
			metrics.prowJobsCleaned[reasonProwJobAged]++
		} else {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).WithError(err).Error("Error deleting prowjob.")
			metrics.prowJobsCleaningErrors[string(k8serrors.ReasonForError(err))]++
		}
	}

	// Keep track of which periodic jobs are in the config so we do not
	// clean up their last prowjob.
	isActivePeriodic := make(map[string]bool)
	for _, p := range c.config().Periodics {
		isActivePeriodic[p.Name] = true
	}

	// Get the jobs that we need to retain so horologium can continue working
	// as intended.
	latestPeriodics := pjutil.GetLatestProwJobs(prowJobs.Items, prowapi.PeriodicJob)
	for _, prowJob := range prowJobs.Items {
		if prowJob.Spec.Type != prowapi.PeriodicJob {
			continue
		}

		if !prowJob.Complete() {
			continue
		}
		isFinished.Insert(prowJob.ObjectMeta.Name)
		latestPJ := latestPeriodics[prowJob.Spec.Job]
		if isActivePeriodic[prowJob.Spec.Job] && prowJob.ObjectMeta.Name == latestPJ.ObjectMeta.Name {
			// Do not delete the latest run of an active periodic.
			continue
		}
		if time.Since(prowJob.Status.StartTime.Time) <= maxProwJobAge {
			continue
		}
		if err := c.prowJobClient.Delete(c.ctx, &prowJob); err == nil {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).Info("Deleted prowjob.")
			metrics.prowJobsCleaned[reasonProwJobAgedPeriodic]++
		} else {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).WithError(err).Error("Error deleting prowjob.")
			metrics.prowJobsCleaningErrors[string(k8serrors.ReasonForError(err))]++
		}
	}

	// Now clean up old pods.
	for cluster, client := range c.podClients {
		log := c.logger.WithField("cluster", cluster)
		var isClusterExcluded bool
		for _, excludeCluster := range c.config().Sinker.ExcludeClusters {
			if excludeCluster == cluster {
				isClusterExcluded = true
				break
			}
		}
		if isClusterExcluded {
			log.Debugf("Cluster %q is excluded, skipping pod deletion.", cluster)
			continue
		}
		var pods corev1api.PodList
		if err := client.List(c.ctx, &pods, ctrlruntimeclient.MatchingLabels{kube.CreatedByProw: "true"}, ctrlruntimeclient.InNamespace(c.config().PodNamespace)); err != nil {
			log.WithError(err).Error("Error listing pods.")
			continue
		}
		log.WithField("pod-count", len(pods.Items)).Debug("Successfully listed pods.")
		metrics.podsCreated += len(pods.Items)
		maxPodAge := c.config().Sinker.MaxPodAge.Duration
		terminatedPodTTL := c.config().Sinker.TerminatedPodTTL.Duration
		for _, pod := range pods.Items {
			reason := ""
			clean := false

			// By default, use the pod name as the key to match the associated prow job.
			// This is to support legacy plank in case the kube.ProwJobIDLabel label is not set.
			podJobName := pod.ObjectMeta.Name
			// If the pod has the kube.ProwJobIDLabel label, use it instead of the pod name.
			if value, ok := pod.ObjectMeta.Labels[kube.ProwJobIDLabel]; ok {
				podJobName = value
			}
			log = log.WithField("pj", podJobName)
			terminationTime := time.Time{}
			if pj, ok := pjMap[podJobName]; ok && pj.Complete() {
				terminationTime = pj.Status.CompletionTime.Time
			}

			if podNeedsKubernetesFinalizerCleanup(log, pjMap[podJobName], &pod) {
				if err := c.cleanupKubernetesFinalizer(&pod, client); err != nil {
					log.WithError(err).Error("Failed to remove kubernetesreporter finalizer")
				}
			}

			switch {
			case !pod.Status.StartTime.IsZero() && time.Since(pod.Status.StartTime.Time) > maxPodAge:
				clean = true
				reason = reasonPodAged
			case !terminationTime.IsZero() && time.Since(terminationTime) > terminatedPodTTL:
				clean = true
				reason = reasonPodTTLed
			}

			if !isFinished.Has(podJobName) {
				// The prowjob exists and is not marked as completed yet;
				// deleting the pod now would result in plank creating a brand new pod.
				clean = false
			}

			if c.isPodOrphaned(log, &pod, podJobName) {
				// The prowjob is gone, so we want to clean up orphaned pods
				// regardless of their state; this overrides the veto above.
				reason = reasonPodOrphaned
				clean = true
			}

			if !clean {
				continue
			}

			c.deletePod(log, &pod, reason, client, &metrics)
		}
	}

	metrics.finishedAt = time.Now()
	sinkerMetrics.podsCreated.Set(float64(metrics.podsCreated))
	sinkerMetrics.timeUsed.Set(float64(metrics.getTimeUsed().Seconds()))
	for k, v := range metrics.podsRemoved {
		sinkerMetrics.podsRemoved.WithLabelValues(k).Set(float64(v))
	}
	for k, v := range metrics.podRemovalErrors {
		sinkerMetrics.podRemovalErrors.WithLabelValues(k).Set(float64(v))
	}
	sinkerMetrics.prowJobsCreated.Set(float64(metrics.prowJobsCreated))
	for k, v := range metrics.prowJobsCleaned {
		sinkerMetrics.prowJobsCleaned.WithLabelValues(k).Set(float64(v))
	}
	for k, v := range metrics.prowJobsCleaningErrors {
		sinkerMetrics.prowJobsCleaningErrors.WithLabelValues(k).Set(float64(v))
	}
	c.logger.Info("Sinker reconciliation complete.")
}

// cleanupKubernetesFinalizer removes the kubernetes reporter's finalizer from
// the pod via a merge patch so the pod can actually be deleted.
func (c *controller) cleanupKubernetesFinalizer(pod *corev1api.Pod, client ctrlruntimeclient.Client) error {
	oldPod := pod.DeepCopy()
	pod.Finalizers = sets.List(sets.New[string](pod.Finalizers...).Delete(kubernetesreporterapi.FinalizerName))

	if err := client.Patch(c.ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
		return fmt.Errorf("failed to patch pod: %w", err)
	}

	return nil
}

func (c *controller) deletePod(log *logrus.Entry, pod *corev1api.Pod, reason string, client ctrlruntimeclient.Client, m *sinkerReconciliationMetrics) {
	name := pod.Name
	// Delete old finished or orphan pods. Don't quit if we fail to delete one.
	if err := client.Delete(c.ctx, pod); err == nil {
		log.WithFields(logrus.Fields{"pod": name, "reason": reason}).Info("Deleted old completed pod.")
		m.podsRemoved[reason]++
	} else {
		m.podRemovalErrors[string(k8serrors.ReasonForError(err))]++
		if k8serrors.IsNotFound(err) {
			log.WithField("pod", name).WithError(err).Info("Could not delete missing pod.")
		} else {
			log.WithField("pod", name).WithError(err).Error("Error deleting pod.")
		}
	}
}

func (c *controller) isPodOrphaned(log *logrus.Entry, pod *corev1api.Pod, prowJobName string) bool {
	// ProwJobs are cached and the cache may lag a bit behind, so never consider
	// pods that are less than 30 seconds old as orphaned.
	if !pod.CreationTimestamp.Before(&metav1.Time{Time: time.Now().Add(-30 * time.Second)}) {
		return false
	}

	// We do a list at the very beginning of our processing. By the time we reach this check, that
	// list might be outdated, so do another GET here before declaring the pod orphaned.
	pjName := types.NamespacedName{Namespace: c.config().ProwJobNamespace, Name: prowJobName}
	if err := c.prowJobClient.Get(c.ctx, pjName, &prowapi.ProwJob{}); err != nil {
		if k8serrors.IsNotFound(err) {
			return true
		}
		log.WithError(err).Error("Failed to get prowjob")
	}

	return false
}

func podNeedsKubernetesFinalizerCleanup(log *logrus.Entry, pj *prowapi.ProwJob, pod *corev1api.Pod) bool {
	// Can happen if someone deletes the prowjob before it finishes.
	if pj == nil {
		return true
	}
	// This is always a bug.
	if pj.Complete() && pj.Status.PrevReportStates[kubernetesreporterapi.ReporterName] == pj.Status.State && sets.New[string](pod.Finalizers...).Has(kubernetesreporterapi.FinalizerName) {
		log.WithField("pj", pj.Name).Errorf("BUG: Pod for prowjob still had the %s finalizer after completing and being successfully reported by the %s reporter", kubernetesreporterapi.FinalizerName, kubernetesreporterapi.ReporterName)

		return true
	}

	return false
}

// jobConfigMapMonitor reports metrics for the size of the ConfigMap(s) found
// under the directory specified with --job-config-path (example:
// "--job-config-path=/etc/job-config"). There are two possibilities: either
// the job ConfigMap is mounted directly at that path, or the ConfigMap was
// partitioned (see https://github.com/kubernetes/test-infra/pull/28835) and
// there are multiple subdirs underneath this one.
func jobConfigMapMonitor(interval time.Duration, jobConfigPath string) {
	logger := logrus.WithField("sync-loop", "job-configmap-monitor")
	ticker := time.NewTicker(interval)

	// Run once immediately, then once per tick.
	for ; true; <-ticker.C {
		dirs, err := getConfigMapDirs(jobConfigPath)
		if err != nil {
			logger.WithField("dir", jobConfigPath).WithError(err).Error("could not resolve ConfigMap dirs")
			continue
		}
		for _, dir := range dirs {
			bytes, err := getConfigMapSize(dir)
			if err != nil {
				logger.WithField("dir", dir).WithError(err).Error("Failed to get configmap metrics")
				continue
			}
			sinkerMetrics.jobConfigMapSize.WithLabelValues(dir).Set(float64(bytes))
		}
	}
}

// getDataDir returns the path of the "..data" symlink, which points to a
// timestamped directory. See the comment on getConfigMapSize() for details.
func getDataDir(toplevel string) string {
	return path.Join(toplevel, "..data")
}
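
// Illustrative layouts (the paths are hypothetical):
//
//	unpartitioned: /etc/job-config/..data -> ..2024_01_11_22_52_09.1709975282
//	partitioned:   /etc/job-config/part-1/..data, /etc/job-config/part-2/..data
//
// getConfigMapDirs distinguishes the two cases by checking whether the
// "..data" symlink exists directly under the given path.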

func getConfigMapDirs(toplevel string) ([]string, error) {
	dataDir := getDataDir(toplevel)
	dirs := []string{}

	// If the data dir (symlink) does not exist directly, then assume that this
	// path is a partition holding multiple ConfigMap-mounted dirs. We use
	// os.Stat(), which means that both the "..data" symlink and its target
	// folder must exist. Of course, nothing stops the folder from having
	// "..data" as a folder or regular file, which would count as false
	// positives, but we ignore these cases because exhaustive checking here is
	// not our concern. We just want metrics.
	if _, err := os.Stat(dataDir); errors.Is(err, os.ErrNotExist) {
		files, err := os.ReadDir(toplevel)
		if err != nil {
			return nil, err
		}

		for _, file := range files {
			if !file.IsDir() {
				continue
			}
			dirs = append(dirs, filepath.Join(toplevel, file.Name()))
		}
	} else {
		dirs = append(dirs, toplevel)
	}

	return dirs, nil
}

// getConfigMapSize expects a path to the filesystem where a Kubernetes
// ConfigMap has been mounted. It iterates over every key (file) found in that
// directory, adding up the sizes of the files as reported by their directory
// entries (an lstat-style size lookup).
//
// When a ConfigMap is mounted to disk, each of its keys becomes a file
// and the value (data) for each key becomes the contents of the respective
// file. A special symlink, `..data`, will also be at the same level
// as the keys, and this symlink will point to yet another folder at the same
// level like `..2024_01_11_22_52_09.1709975282`. This timestamped folder is
// where the actual files are located. So the layout looks like:
//
//	folder-named-after-configmap-name
//	folder-named-after-configmap-name/..2024_01_11_22_52_09.1709975282
//	folder-named-after-configmap-name/..data (symlinked to ..2024_01_11... above)
//	folder-named-after-configmap-name/key1 (symlinked to ..data/key1)
//	folder-named-after-configmap-name/key2 (symlinked to ..data/key2)
//
// The above layout with the timestamped folder and the "..data" symlink is a
// Kubernetes construct, and is applicable to every ConfigMap mounted to disk by
// Kubernetes.
//
// For our purposes the exact details of this don't matter too much: the walk
// below descends into the timestamped folder and sums the sizes of the regular
// files it finds there. What we do care about is the existence of the `..data`
// and top-level `key1`/`key2` symlinks: we have to skip them, because
// otherwise we'd be double counting.
func getConfigMapSize(configmapDir string) (int64, error) {
	var total int64

	// Look into the "..data" symlinked folder, which should contain the actual
	// files where each file is a key in the ConfigMap.
	dataDir := getDataDir(configmapDir)
	if _, err := os.Stat(dataDir); errors.Is(err, os.ErrNotExist) {
		return 0, fmt.Errorf("%q is not a ConfigMap-mounted dir", configmapDir)
	}

	logger := logrus.NewEntry(logrus.StandardLogger())

	var walkDirFunc = func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		// Don't process directories (that is, only process files). We don't
		// expect any directories to exist at this level, but it doesn't hurt to
		// skip any we encounter.
		if d.IsDir() {
			return nil
		}
		// Skip any symbolic links (the top-level key symlinks and "..data");
		// the real files are counted when the walk descends into the
		// timestamped directory.
		if d.Type() == fs.ModeSymlink {
			return nil
		}

		info, err := d.Info()
		if err != nil {
			return err
		}
		logger.Infof("file %q is %v bytes", path, info.Size())
		total += info.Size()
		return nil
	}

	if err := filepath.WalkDir(configmapDir, walkDirFunc); err != nil {
		return 0, err
	}

	return total, nil
}