sigs.k8s.io/prow@v0.0.0-20240503223140-c5e374dc7eb1/cmd/sinker/main.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"io/fs"
	"os"
	"path"
	"path/filepath"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"
	corev1api "k8s.io/api/core/v1"
	k8serrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
	ctrlruntimelog "sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/prow/pkg/pjutil/pprof"

	prowapi "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
	"sigs.k8s.io/prow/pkg/config"
	kubernetesreporterapi "sigs.k8s.io/prow/pkg/crier/reporters/gcs/kubernetes/api"
	"sigs.k8s.io/prow/pkg/flagutil"
	configflagutil "sigs.k8s.io/prow/pkg/flagutil/config"
	"sigs.k8s.io/prow/pkg/interrupts"
	"sigs.k8s.io/prow/pkg/kube"
	"sigs.k8s.io/prow/pkg/logrusutil"
	"sigs.k8s.io/prow/pkg/metrics"
	"sigs.k8s.io/prow/pkg/pjutil"
	_ "sigs.k8s.io/prow/pkg/version"
)

type options struct {
	runOnce                bool
	config                 configflagutil.ConfigOptions
	dryRun                 bool
	kubernetes             flagutil.KubernetesOptions
	instrumentationOptions flagutil.InstrumentationOptions
}

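// Reasons recorded as the "reason" metric label when sinker deletes pods and prowjobs.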
const (
	reasonPodAged     = "aged"
	reasonPodOrphaned = "orphaned"
	reasonPodTTLed    = "ttled"

	reasonProwJobAged         = "aged"
	reasonProwJobAgedPeriodic = "aged-periodic"
)

func gatherOptions(fs *flag.FlagSet, args ...string) options {
	o := options{}
	fs.BoolVar(&o.runOnce, "run-once", false, "If true, run only once then quit.")

	fs.BoolVar(&o.dryRun, "dry-run", true, "Whether or not to make mutating API calls to Kubernetes.")

	o.config.AddFlags(fs)
	o.kubernetes.AddFlags(fs)
	o.instrumentationOptions.AddFlags(fs)
	fs.Parse(args)
	return o
}

func (o *options) Validate() error {
	if err := o.kubernetes.Validate(o.dryRun); err != nil {
		return err
	}

	if err := o.config.Validate(o.dryRun); err != nil {
		return err
	}

	return nil
}

func main() {
	logrusutil.ComponentInit()

	o := gatherOptions(flag.NewFlagSet(os.Args[0], flag.ExitOnError), os.Args[1:]...)
	if err := o.Validate(); err != nil {
		logrus.WithError(err).Fatal("Invalid options")
	}

	defer interrupts.WaitForGracefulShutdown()

	pprof.Instrument(o.instrumentationOptions)

	configAgent, err := o.config.ConfigAgent()
	if err != nil {
		logrus.WithError(err).Fatal("Error starting config agent.")
	}
	cfg := configAgent.Config
	o.kubernetes.SetDisabledClusters(sets.New[string](cfg().DisabledClusters...))

	if o.config.JobConfigPath != "" {
		go jobConfigMapMonitor(5*time.Minute, o.config.JobConfigPath)
	}

	metrics.ExposeMetrics("sinker", cfg().PushGateway, o.instrumentationOptions.MetricsPort)

	ctrlruntimelog.SetLogger(zap.New(zap.JSONEncoder()))

	infrastructureClusterConfig, err := o.kubernetes.InfrastructureClusterConfig(o.dryRun)
	if err != nil {
		logrus.WithError(err).Fatal("Error getting config for infrastructure cluster")
	}

	// The watch apimachinery doesn't support restarts, so just exit the binary if a kubeconfig changes
	// to make the kubelet restart us.
	if err := o.kubernetes.AddKubeconfigChangeCallback(func() {
		logrus.Info("Kubeconfig changed, exiting to trigger a restart")
		interrupts.Terminate()
	}); err != nil {
		logrus.WithError(err).Fatal("Failed to register kubeconfig change callback")
	}

	opts := manager.Options{
		MetricsBindAddress:            "0",
		Namespace:                     cfg().ProwJobNamespace,
		LeaderElection:                true,
		LeaderElectionNamespace:       configAgent.Config().ProwJobNamespace,
		LeaderElectionID:              "prow-sinker-leaderlock",
		LeaderElectionReleaseOnCancel: true,
	}
	mgr, err := manager.New(infrastructureClusterConfig, opts)
	if err != nil {
		logrus.WithError(err).Fatal("Error creating manager")
	}

	// The watch apimachinery doesn't support restarts, so just exit the
	// binary if a build cluster that failed to connect initially becomes reachable later.
	callBack := func() {
		logrus.Info("Build cluster that failed to connect initially now worked, exiting to trigger a restart.")
		interrupts.Terminate()
	}

	// We require operating on test pods in build clusters with the following
	// verbs. This is used during startup to check that we have the necessary
	// authorizations on build clusters.
	requiredTestPodVerbs := []string{
		"delete",
		"list",
		"watch",
		"get",
		"patch",
	}

	buildManagers, err := o.kubernetes.BuildClusterManagers(o.dryRun,
		requiredTestPodVerbs,
		// The watch apimachinery doesn't support restarts, so just exit the
		// binary if a build cluster that failed to connect initially becomes reachable later.
		callBack,
		func(o *manager.Options) {
			o.Namespace = cfg().PodNamespace
		},
	)
	if err != nil {
		logrus.WithError(err).Error("Failed to construct build cluster managers. Is there a bad entry in the kubeconfig secret?")
	}

	buildClusterClients := map[string]ctrlruntimeclient.Client{}
	for clusterName, buildManager := range buildManagers {
		if err := mgr.Add(buildManager); err != nil {
			logrus.WithError(err).Fatal("Failed to add build cluster manager to main manager")
		}
		buildClusterClients[clusterName] = buildManager.GetClient()
	}

	c := controller{
		ctx:           context.Background(),
		logger:        logrus.NewEntry(logrus.StandardLogger()),
		prowJobClient: mgr.GetClient(),
		podClients:    buildClusterClients,
		config:        cfg,
		runOnce:       o.runOnce,
	}
	if err := mgr.Add(&c); err != nil {
		logrus.WithError(err).Fatal("failed to add controller to manager")
	}
	if err := mgr.Start(interrupts.Context()); err != nil {
		logrus.WithError(err).Fatal("failed to start manager")
	}
	logrus.Info("Manager ended gracefully")
}

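// controller implements manager.Runnable; it periodically cleans up aged
// prowjobs in the infrastructure cluster and finished or orphaned test pods
// in every connected build cluster.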
type controller struct {
	ctx           context.Context
	logger        *logrus.Entry
	prowJobClient ctrlruntimeclient.Client
	podClients    map[string]ctrlruntimeclient.Client
	config        config.Getter
	runOnce       bool
}

func (c *controller) Start(ctx context.Context) error {
	runChan := make(chan struct{})

	// We want to be able to dynamically adjust to changed config values, hence we can't use a time.Ticker
	go func() {
		for {
			runChan <- struct{}{}
			time.Sleep(c.config().Sinker.ResyncPeriod.Duration)
		}
	}()

	for {
		select {
		case <-ctx.Done():
			c.logger.Info("stop signal received, quitting")
			return nil
		case <-runChan:
			start := time.Now()
			c.clean()
			c.logger.Infof("Sync time: %v", time.Since(start))
			if c.runOnce {
				return nil
			}
		}
	}
}

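// sinkerReconciliationMetrics collects the counts gathered during a single
// cleaning pass; they are flushed to the Prometheus gauges once the pass ends.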
type sinkerReconciliationMetrics struct {
	podsCreated            int
	startAt                time.Time
	finishedAt             time.Time
	podsRemoved            map[string]int
	podRemovalErrors       map[string]int
	prowJobsCreated        int
	prowJobsCleaned        map[string]int
	prowJobsCleaningErrors map[string]int
}

// Prometheus Metrics
var (
	sinkerMetrics = struct {
		podsCreated            prometheus.Gauge
		timeUsed               prometheus.Gauge
		podsRemoved            *prometheus.GaugeVec
		podRemovalErrors       *prometheus.GaugeVec
		prowJobsCreated        prometheus.Gauge
		prowJobsCleaned        *prometheus.GaugeVec
		prowJobsCleaningErrors *prometheus.GaugeVec
		jobConfigMapSize       *prometheus.GaugeVec
	}{
		podsCreated: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "sinker_pods_existing",
			Help: "Number of the existing pods in each sinker cleaning.",
		}),
		timeUsed: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "sinker_loop_duration_seconds",
			Help: "Time used in each sinker cleaning.",
		}),
		podsRemoved: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_pods_removed",
			Help: "Number of pods removed in each sinker cleaning.",
		}, []string{
			"reason",
		}),
		podRemovalErrors: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_pod_removal_errors",
			Help: "Number of errors which occurred in each sinker pod cleaning.",
		}, []string{
			"reason",
		}),
		prowJobsCreated: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "sinker_prow_jobs_existing",
			Help: "Number of the existing prow jobs in each sinker cleaning.",
		}),
		prowJobsCleaned: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_prow_jobs_cleaned",
			Help: "Number of prow jobs cleaned in each sinker cleaning.",
		}, []string{
			"reason",
		}),
		prowJobsCleaningErrors: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_prow_jobs_cleaning_errors",
			Help: "Number of errors which occurred in each sinker prow job cleaning.",
		}, []string{
			"reason",
		}),
		jobConfigMapSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "job_configmap_size",
			Help: "Size of ConfigMap storing central job configuration files (gzipped) in bytes.",
		}, []string{
			"name",
		}),
	}
)

func init() {
	prometheus.MustRegister(sinkerMetrics.podsCreated)
	prometheus.MustRegister(sinkerMetrics.timeUsed)
	prometheus.MustRegister(sinkerMetrics.podsRemoved)
	prometheus.MustRegister(sinkerMetrics.podRemovalErrors)
	prometheus.MustRegister(sinkerMetrics.prowJobsCreated)
	prometheus.MustRegister(sinkerMetrics.prowJobsCleaned)
	prometheus.MustRegister(sinkerMetrics.prowJobsCleaningErrors)
	prometheus.MustRegister(sinkerMetrics.jobConfigMapSize)
}

func (m *sinkerReconciliationMetrics) getTimeUsed() time.Duration {
	return m.finishedAt.Sub(m.startAt)
}

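// clean performs one sinker pass: it deletes completed prowjobs older than
// MaxProwJobAge (always keeping the latest run of each periodic that is still
// in the config), then deletes aged, TTLed, or orphaned test pods in every
// non-excluded build cluster, and finally publishes the collected metrics.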
func (c *controller) clean() {

	metrics := sinkerReconciliationMetrics{
		startAt:                time.Now(),
		podsRemoved:            map[string]int{},
		podRemovalErrors:       map[string]int{},
		prowJobsCleaned:        map[string]int{},
		prowJobsCleaningErrors: map[string]int{}}

	// Clean up old prow jobs first.
	prowJobs := &prowapi.ProwJobList{}
	if err := c.prowJobClient.List(c.ctx, prowJobs, ctrlruntimeclient.InNamespace(c.config().ProwJobNamespace)); err != nil {
		c.logger.WithError(err).Error("Error listing prow jobs.")
		return
	}
	metrics.prowJobsCreated = len(prowJobs.Items)

	// Only delete pod if its prowjob is marked as finished
	pjMap := map[string]*prowapi.ProwJob{}
	isFinished := sets.New[string]()

	maxProwJobAge := c.config().Sinker.MaxProwJobAge.Duration
	for i, prowJob := range prowJobs.Items {
		pjMap[prowJob.ObjectMeta.Name] = &prowJobs.Items[i]
		// Handle periodics separately.
		if prowJob.Spec.Type == prowapi.PeriodicJob {
			continue
		}
		if !prowJob.Complete() {
			continue
		}
		isFinished.Insert(prowJob.ObjectMeta.Name)
		if time.Since(prowJob.Status.StartTime.Time) <= maxProwJobAge {
			continue
		}
		if err := c.prowJobClient.Delete(c.ctx, &prowJob); err == nil {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).Info("Deleted prowjob.")
			metrics.prowJobsCleaned[reasonProwJobAged]++
		} else {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).WithError(err).Error("Error deleting prowjob.")
			metrics.prowJobsCleaningErrors[string(k8serrors.ReasonForError(err))]++
		}
	}

	// Keep track of what periodic jobs are in the config so we will
	// not clean up their last prowjob.
	isActivePeriodic := make(map[string]bool)
	for _, p := range c.config().Periodics {
		isActivePeriodic[p.Name] = true
	}

	// Get the jobs that we need to retain so horologium can continue working
	// as intended.
	latestPeriodics := pjutil.GetLatestProwJobs(prowJobs.Items, prowapi.PeriodicJob)
	for _, prowJob := range prowJobs.Items {
		if prowJob.Spec.Type != prowapi.PeriodicJob {
			continue
		}

		if !prowJob.Complete() {
			continue
		}
		isFinished.Insert(prowJob.ObjectMeta.Name)
		latestPJ := latestPeriodics[prowJob.Spec.Job]
		if isActivePeriodic[prowJob.Spec.Job] && prowJob.ObjectMeta.Name == latestPJ.ObjectMeta.Name {
			// Ignore deleting this one.
			continue
		}
		if time.Since(prowJob.Status.StartTime.Time) <= maxProwJobAge {
			continue
		}
		if err := c.prowJobClient.Delete(c.ctx, &prowJob); err == nil {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).Info("Deleted prowjob.")
			metrics.prowJobsCleaned[reasonProwJobAgedPeriodic]++
		} else {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).WithError(err).Error("Error deleting prowjob.")
			metrics.prowJobsCleaningErrors[string(k8serrors.ReasonForError(err))]++
		}
	}

	// Now clean up old pods.
	for cluster, client := range c.podClients {
		log := c.logger.WithField("cluster", cluster)
		var isClusterExcluded bool
		for _, excludeCluster := range c.config().Sinker.ExcludeClusters {
			if excludeCluster == cluster {
				isClusterExcluded = true
				break
			}
		}
		if isClusterExcluded {
			log.Debugf("Cluster %q is excluded, skipping pods deletion.", cluster)
			continue
		}
		var pods corev1api.PodList
		if err := client.List(c.ctx, &pods, ctrlruntimeclient.MatchingLabels{kube.CreatedByProw: "true"}, ctrlruntimeclient.InNamespace(c.config().PodNamespace)); err != nil {
			log.WithError(err).Error("Error listing pods.")
			continue
		}
		log.WithField("pod-count", len(pods.Items)).Debug("Successfully listed pods.")
		metrics.podsCreated += len(pods.Items)
		maxPodAge := c.config().Sinker.MaxPodAge.Duration
		terminatedPodTTL := c.config().Sinker.TerminatedPodTTL.Duration
		for _, pod := range pods.Items {
			reason := ""
			clean := false

			// by default, use the pod name as the key to match the associated prow job
			// this is to support legacy plank in case the kube.ProwJobIDLabel label is not set
			podJobName := pod.ObjectMeta.Name
			// if the pod has the kube.ProwJobIDLabel label, use this instead of the pod name
			if value, ok := pod.ObjectMeta.Labels[kube.ProwJobIDLabel]; ok {
				podJobName = value
			}
			log = log.WithField("pj", podJobName)
			terminationTime := time.Time{}
			if pj, ok := pjMap[podJobName]; ok && pj.Complete() {
				terminationTime = pj.Status.CompletionTime.Time
			}

			if podNeedsKubernetesFinalizerCleanup(log, pjMap[podJobName], &pod) {
				if err := c.cleanupKubernetesFinalizer(&pod, client); err != nil {
					log.WithError(err).Error("Failed to remove kubernetesreporter finalizer")
				}
			}

			switch {
			case !pod.Status.StartTime.IsZero() && time.Since(pod.Status.StartTime.Time) > maxPodAge:
				clean = true
				reason = reasonPodAged
			case !terminationTime.IsZero() && time.Since(terminationTime) > terminatedPodTTL:
				clean = true
				reason = reasonPodTTLed
			}

			if !isFinished.Has(podJobName) {
				// prowjob exists and is not marked as completed yet
				// deleting the pod now will result in plank creating a brand new pod
				clean = false
			}

			if c.isPodOrphaned(log, &pod, podJobName) {
				// prowjob has gone, we want to clean orphan pods regardless of the state
				reason = reasonPodOrphaned
				clean = true
			}

			if !clean {
				continue
			}

			c.deletePod(log, &pod, reason, client, &metrics)
		}
	}

	metrics.finishedAt = time.Now()
	sinkerMetrics.podsCreated.Set(float64(metrics.podsCreated))
	sinkerMetrics.timeUsed.Set(float64(metrics.getTimeUsed().Seconds()))
	for k, v := range metrics.podsRemoved {
		sinkerMetrics.podsRemoved.WithLabelValues(k).Set(float64(v))
	}
	for k, v := range metrics.podRemovalErrors {
		sinkerMetrics.podRemovalErrors.WithLabelValues(k).Set(float64(v))
	}
	sinkerMetrics.prowJobsCreated.Set(float64(metrics.prowJobsCreated))
	for k, v := range metrics.prowJobsCleaned {
		sinkerMetrics.prowJobsCleaned.WithLabelValues(k).Set(float64(v))
	}
	for k, v := range metrics.prowJobsCleaningErrors {
		sinkerMetrics.prowJobsCleaningErrors.WithLabelValues(k).Set(float64(v))
	}
	c.logger.Info("Sinker reconciliation complete.")
}

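// cleanupKubernetesFinalizer strips the k8s GCS reporter finalizer from a pod
// so that deleting the pod is not blocked indefinitely.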
func (c *controller) cleanupKubernetesFinalizer(pod *corev1api.Pod, client ctrlruntimeclient.Client) error {

	oldPod := pod.DeepCopy()
	pod.Finalizers = sets.List(sets.New[string](pod.Finalizers...).Delete(kubernetesreporterapi.FinalizerName))

	if err := client.Patch(c.ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
		return fmt.Errorf("failed to patch pod: %w", err)
	}

	return nil
}

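// deletePod deletes a single finished or orphaned pod and records the outcome
// in the reconciliation metrics; failures are logged but do not abort the pass.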
func (c *controller) deletePod(log *logrus.Entry, pod *corev1api.Pod, reason string, client ctrlruntimeclient.Client, m *sinkerReconciliationMetrics) {
	name := pod.Name
	// Delete old finished or orphan pods. Don't quit if we fail to delete one.
	if err := client.Delete(c.ctx, pod); err == nil {
		log.WithFields(logrus.Fields{"pod": name, "reason": reason}).Info("Deleted old completed pod.")
		m.podsRemoved[reason]++
	} else {
		m.podRemovalErrors[string(k8serrors.ReasonForError(err))]++
		if k8serrors.IsNotFound(err) {
			log.WithField("pod", name).WithError(err).Info("Could not delete missing pod.")
		} else {
			log.WithField("pod", name).WithError(err).Error("Error deleting pod.")
		}
	}
}

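// isPodOrphaned reports whether the pod's prowjob no longer exists. Pods
// younger than 30 seconds are never considered orphaned because the prowjob
// cache may lag behind.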
func (c *controller) isPodOrphaned(log *logrus.Entry, pod *corev1api.Pod, prowJobName string) bool {
	// ProwJobs are cached and the cache may lag a bit behind, so never consider
	// pods that are less than 30 seconds old as orphaned
	if !pod.CreationTimestamp.Before(&metav1.Time{Time: time.Now().Add(-30 * time.Second)}) {
		return false
	}

	// We do a list in the very beginning of our processing. By the time we reach this check, that
	// list might be outdated, so do another GET here before declaring the pod orphaned
	pjName := types.NamespacedName{Namespace: c.config().ProwJobNamespace, Name: prowJobName}
	if err := c.prowJobClient.Get(c.ctx, pjName, &prowapi.ProwJob{}); err != nil {
		if k8serrors.IsNotFound(err) {
			return true
		}
		logrus.WithError(err).Error("Failed to get prowjob")
	}

	return false
}

func podNeedsKubernetesFinalizerCleanup(log *logrus.Entry, pj *prowapi.ProwJob, pod *corev1api.Pod) bool {
	// Can happen if someone deletes the prowjob before it finishes
	if pj == nil {
		return true
	}
	// This is always a bug
	if pj.Complete() && pj.Status.PrevReportStates[kubernetesreporterapi.ReporterName] == pj.Status.State && sets.New[string](pod.Finalizers...).Has(kubernetesreporterapi.FinalizerName) {
		log.WithField("pj", pj.Name).Errorf("BUG: Pod for prowjob still had the %s finalizer after completing and being successfully reported by the %s reporter", kubernetesreporterapi.FinalizerName, kubernetesreporterapi.ReporterName)

		return true
	}

	return false
}

// jobConfigMapMonitor reports metrics for the size of the ConfigMap(s) found
// under the directory specified with --job-config-path (example:
// "--job-config-path=/etc/job-config"). There are two possibilities: either
// the job ConfigMap is mounted directly at that path, or the ConfigMap was
// partitioned (see https://github.com/kubernetes/test-infra/pull/28835) and
// there are multiple subdirs underneath this one.
func jobConfigMapMonitor(interval time.Duration, jobConfigPath string) {
	logger := logrus.WithField("sync-loop", "job-configmap-monitor")
	ticker := time.NewTicker(interval)

	for ; true; <-ticker.C {
		dirs, err := getConfigMapDirs(jobConfigPath)
		if err != nil {
			logger.WithField("dir", jobConfigPath).WithError(err).Error("could not resolve ConfigMap dirs")
			continue
		}
		for _, dir := range dirs {
			bytes, err := getConfigMapSize(dir)
			if err != nil {
				logger.WithField("dir", dir).WithError(err).Error("Failed to get configmap metrics")
				continue
			}
			sinkerMetrics.jobConfigMapSize.WithLabelValues(dir).Set(float64(bytes))
		}
	}
}

// getDataDir gets the "..data" symlink which points to a timestamped directory.
// See the comment for getConfigMapSize() for details.
func getDataDir(toplevel string) string {
	return path.Join(toplevel, "..data")
}

// getConfigMapDirs returns the ConfigMap-mounted directories found at, or
// directly underneath, the given path.
func getConfigMapDirs(toplevel string) ([]string, error) {
	dataDir := getDataDir(toplevel)
	dirs := []string{}

	// If the data dir (symlink) does not exist directly, then assume that this
	// path is a partition holding multiple ConfigMap-mounted dirs. We use
	// os.Stat(), which means that both the "..data" symlink and its target
	// folder must exist. Of course, nothing stops the folder from having
	// "..data" as a folder or regular file, which would count as false
	// positives, but we ignore these cases because exhaustive checking here is
	// not our concern. We just want metrics.
	if _, err := os.Stat(dataDir); errors.Is(err, os.ErrNotExist) {
		files, err := os.ReadDir(toplevel)
		if err != nil {
			return nil, err
		}

		for _, file := range files {
			if !file.IsDir() {
				continue
			}
			dirs = append(dirs, filepath.Join(toplevel, file.Name()))
		}
	} else {
		dirs = append(dirs, toplevel)
	}

	return dirs, nil
}

// getConfigMapSize expects a path to the filesystem where a Kubernetes
// ConfigMap has been mounted. It iterates over every key (file) found in that
// directory, adding up the sizes of the files via each directory entry's
// Info().
//
// When a ConfigMap is mounted to disk, all of its keys will become files
// and the value (data) for each key will be the contents of the respective
// files. Another special symlink, `..data`, will also be at the same level
// as the keys and this symlink will point to yet another folder at the same
// level like `..2024_01_11_22_52_09.1709975282`. This timestamped folder is
// where the actual files will be located. So the layout looks like:
//
// folder-named-after-configmap-name
// folder-named-after-configmap-name/..2024_01_11_22_52_09.1709975282
// folder-named-after-configmap-name/..data (symlinked to ..2024_01_11... above)
// folder-named-after-configmap-name/key1 (symlinked to ..data/key1)
// folder-named-after-configmap-name/key2 (symlinked to ..data/key2)
//
// The above layout with the timestamped folder and the "..data" symlink is a
// Kubernetes construct, and is applicable to every ConfigMap mounted to disk by
// Kubernetes.
//
// For our purposes the exact details of this don't matter too much: we skip
// every symlink (which covers `..data` as well as key1 and key2 above) and only
// count the regular files found under the `..<timestamp>` directory, so nothing
// gets double counted.
func getConfigMapSize(configmapDir string) (int64, error) {
	var total int64

	// Look into the "..data" symlinked folder, which should contain the actual
	// files where each file is a key in the ConfigMap.
	dataDir := getDataDir(configmapDir)
	if _, err := os.Stat(dataDir); errors.Is(err, os.ErrNotExist) {
		return 0, fmt.Errorf("%q is not a ConfigMap-mounted dir", configmapDir)
	}

	logger := logrus.NewEntry(logrus.StandardLogger())

	var walkDirFunc = func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		// Don't process directories (that is, only process files). We don't
		// expect any directories to exist at this level, but it doesn't hurt to
		// skip any we encounter.
		if d.IsDir() {
			return nil
		}
		// Skip any symbolic links.
		if d.Type() == fs.ModeSymlink {
			return nil
		}

		info, err := d.Info()
		if err != nil {
			return err
		}
		logger.Infof("file %q is %v bytes", path, info.Size())
		total += info.Size()
		return nil
	}

	if err := filepath.WalkDir(configmapDir, walkDirFunc); err != nil {
		return 0, err
	}

	return total, nil
}