github.com/zppinho/prow@v0.0.0-20240510014325-1738badeb017/cmd/sinker/main.go (about)

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"io/fs"
	"os"
	"path"
	"path/filepath"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"
	corev1api "k8s.io/api/core/v1"
	k8serrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"sigs.k8s.io/controller-runtime/pkg/cache"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
	ctrlruntimelog "sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
	"sigs.k8s.io/prow/pkg/pjutil/pprof"

	prowapi "sigs.k8s.io/prow/pkg/apis/prowjobs/v1"
	"sigs.k8s.io/prow/pkg/config"
	kubernetesreporterapi "sigs.k8s.io/prow/pkg/crier/reporters/gcs/kubernetes/api"
	"sigs.k8s.io/prow/pkg/flagutil"
	configflagutil "sigs.k8s.io/prow/pkg/flagutil/config"
	"sigs.k8s.io/prow/pkg/interrupts"
	"sigs.k8s.io/prow/pkg/kube"
	"sigs.k8s.io/prow/pkg/logrusutil"
	"sigs.k8s.io/prow/pkg/metrics"
	"sigs.k8s.io/prow/pkg/pjutil"
	_ "sigs.k8s.io/prow/pkg/version"
)

type options struct {
	runOnce                bool
	config                 configflagutil.ConfigOptions
	dryRun                 bool
	kubernetes             flagutil.KubernetesOptions
	instrumentationOptions flagutil.InstrumentationOptions
}

const (
	reasonPodAged     = "aged"
	reasonPodOrphaned = "orphaned"
	reasonPodTTLed    = "ttled"

	reasonProwJobAged         = "aged"
	reasonProwJobAgedPeriodic = "aged-periodic"
)

func gatherOptions(fs *flag.FlagSet, args ...string) options {
	o := options{}
	fs.BoolVar(&o.runOnce, "run-once", false, "If true, run only once then quit.")

	fs.BoolVar(&o.dryRun, "dry-run", true, "Whether or not to make mutating API calls to Kubernetes.")

	o.config.AddFlags(fs)
	o.kubernetes.AddFlags(fs)
	o.instrumentationOptions.AddFlags(fs)
	fs.Parse(args)
	return o
}

func (o *options) Validate() error {
	if err := o.kubernetes.Validate(o.dryRun); err != nil {
		return err
	}

	if err := o.config.Validate(o.dryRun); err != nil {
		return err
	}

	return nil
}

func main() {
	logrusutil.ComponentInit()

	o := gatherOptions(flag.NewFlagSet(os.Args[0], flag.ExitOnError), os.Args[1:]...)
	if err := o.Validate(); err != nil {
		logrus.WithError(err).Fatal("Invalid options")
	}

	defer interrupts.WaitForGracefulShutdown()

	pprof.Instrument(o.instrumentationOptions)

	configAgent, err := o.config.ConfigAgent()
	if err != nil {
		logrus.WithError(err).Fatal("Error starting config agent.")
	}
	cfg := configAgent.Config
	o.kubernetes.SetDisabledClusters(sets.New[string](cfg().DisabledClusters...))

	if o.config.JobConfigPath != "" {
		go jobConfigMapMonitor(5*time.Minute, o.config.JobConfigPath)
	}

	metrics.ExposeMetrics("sinker", cfg().PushGateway, o.instrumentationOptions.MetricsPort)

	ctrlruntimelog.SetLogger(zap.New(zap.JSONEncoder()))

	infrastructureClusterConfig, err := o.kubernetes.InfrastructureClusterConfig(o.dryRun)
	if err != nil {
		logrus.WithError(err).Fatal("Error getting config for infrastructure cluster")
	}

	// The watch apimachinery doesn't support restarts, so just exit the binary if a kubeconfig changes
	// to make the kubelet restart us.
	if err := o.kubernetes.AddKubeconfigChangeCallback(func() {
		logrus.Info("Kubeconfig changed, exiting to trigger a restart")
		interrupts.Terminate()
	}); err != nil {
		logrus.WithError(err).Fatal("Failed to register kubeconfig change callback")
	}

	opts := manager.Options{
		Cache: cache.Options{
			DefaultNamespaces: map[string]cache.Config{
				cfg().ProwJobNamespace: {},
			},
		},
		Metrics: metricsserver.Options{
			BindAddress: "0",
		},
		LeaderElection:                true,
		LeaderElectionNamespace:       configAgent.Config().ProwJobNamespace,
		LeaderElectionID:              "prow-sinker-leaderlock",
		LeaderElectionReleaseOnCancel: true,
	}
	mgr, err := manager.New(infrastructureClusterConfig, opts)
	if err != nil {
		logrus.WithError(err).Fatal("Error creating manager")
	}

	// The watch apimachinery doesn't support restarts, so just exit the
	// binary if a build cluster that initially failed to connect becomes
	// reachable later.
	callBack := func() {
		logrus.Info("Build cluster that failed to connect initially now worked, exiting to trigger a restart.")
		interrupts.Terminate()
	}

	// We require operating on test pods in build clusters with the following
	// verbs. This is used during startup to check that we have the necessary
	// authorizations on build clusters.
	requiredTestPodVerbs := []string{
		"delete",
		"list",
		"watch",
		"get",
		"patch",
	}

	buildManagers, err := o.kubernetes.BuildClusterManagers(o.dryRun,
		requiredTestPodVerbs,
		// The watch apimachinery doesn't support restarts, so just exit the
		// binary if a build cluster that initially failed to connect becomes
		// reachable later.
		callBack,
		cfg().PodNamespace,
	)
	if err != nil {
		logrus.WithError(err).Error("Failed to construct build cluster managers. Is there a bad entry in the kubeconfig secret?")
	}

	buildClusterClients := map[string]ctrlruntimeclient.Client{}
	for clusterName, buildManager := range buildManagers {
		if err := mgr.Add(buildManager); err != nil {
			logrus.WithError(err).Fatal("Failed to add build cluster manager to main manager")
		}
		buildClusterClients[clusterName] = buildManager.GetClient()
	}

	c := controller{
		ctx:           context.Background(),
		logger:        logrus.NewEntry(logrus.StandardLogger()),
		prowJobClient: mgr.GetClient(),
		podClients:    buildClusterClients,
		config:        cfg,
		runOnce:       o.runOnce,
	}
	if err := mgr.Add(&c); err != nil {
		logrus.WithError(err).Fatal("failed to add controller to manager")
	}
	if err := mgr.Start(interrupts.Context()); err != nil {
		logrus.WithError(err).Fatal("failed to start manager")
	}
	logrus.Info("Manager ended gracefully")
}

type controller struct {
	ctx           context.Context
	logger        *logrus.Entry
	prowJobClient ctrlruntimeclient.Client
	podClients    map[string]ctrlruntimeclient.Client
	config        config.Getter
	runOnce       bool
}

func (c *controller) Start(ctx context.Context) error {
	runChan := make(chan struct{})

	// We want to be able to dynamically adjust to changed config values, hence we can't use a time.Ticker.
	go func() {
		for {
			runChan <- struct{}{}
			time.Sleep(c.config().Sinker.ResyncPeriod.Duration)
		}
	}()

	for {
		select {
		case <-ctx.Done():
			c.logger.Info("stop signal received, quitting")
			return nil
		case <-runChan:
			start := time.Now()
			c.clean()
			c.logger.Infof("Sync time: %v", time.Since(start))
			if c.runOnce {
				return nil
			}
		}
	}
}

type sinkerReconciliationMetrics struct {
	podsCreated            int
	startAt                time.Time
	finishedAt             time.Time
	podsRemoved            map[string]int
	podRemovalErrors       map[string]int
	prowJobsCreated        int
	prowJobsCleaned        map[string]int
	prowJobsCleaningErrors map[string]int
}

// Prometheus Metrics
var (
	sinkerMetrics = struct {
		podsCreated            prometheus.Gauge
		timeUsed               prometheus.Gauge
		podsRemoved            *prometheus.GaugeVec
		podRemovalErrors       *prometheus.GaugeVec
		prowJobsCreated        prometheus.Gauge
		prowJobsCleaned        *prometheus.GaugeVec
		prowJobsCleaningErrors *prometheus.GaugeVec
		jobConfigMapSize       *prometheus.GaugeVec
	}{
		podsCreated: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "sinker_pods_existing",
			Help: "Number of the existing pods in each sinker cleaning.",
		}),
		timeUsed: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "sinker_loop_duration_seconds",
			Help: "Time used in each sinker cleaning.",
		}),
		podsRemoved: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_pods_removed",
			Help: "Number of pods removed in each sinker cleaning.",
		}, []string{
			"reason",
		}),
		podRemovalErrors: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_pod_removal_errors",
			Help: "Number of errors which occurred in each sinker pod cleaning.",
		}, []string{
			"reason",
		}),
		prowJobsCreated: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "sinker_prow_jobs_existing",
			Help: "Number of the existing prow jobs in each sinker cleaning.",
		}),
		prowJobsCleaned: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_prow_jobs_cleaned",
			Help: "Number of prow jobs cleaned in each sinker cleaning.",
		}, []string{
			"reason",
		}),
		prowJobsCleaningErrors: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "sinker_prow_jobs_cleaning_errors",
			Help: "Number of errors which occurred in each sinker prow job cleaning.",
		}, []string{
			"reason",
		}),
		jobConfigMapSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "job_configmap_size",
			Help: "Size of ConfigMap storing central job configuration files (gzipped) in bytes.",
		}, []string{
			"name",
		}),
	}
)

func init() {
	prometheus.MustRegister(sinkerMetrics.podsCreated)
	prometheus.MustRegister(sinkerMetrics.timeUsed)
	prometheus.MustRegister(sinkerMetrics.podsRemoved)
	prometheus.MustRegister(sinkerMetrics.podRemovalErrors)
	prometheus.MustRegister(sinkerMetrics.prowJobsCreated)
	prometheus.MustRegister(sinkerMetrics.prowJobsCleaned)
	prometheus.MustRegister(sinkerMetrics.prowJobsCleaningErrors)
	prometheus.MustRegister(sinkerMetrics.jobConfigMapSize)
}

func (m *sinkerReconciliationMetrics) getTimeUsed() time.Duration {
	return m.finishedAt.Sub(m.startAt)
}

func (c *controller) clean() {
	metrics := sinkerReconciliationMetrics{
		startAt:                time.Now(),
		podsRemoved:            map[string]int{},
		podRemovalErrors:       map[string]int{},
		prowJobsCleaned:        map[string]int{},
		prowJobsCleaningErrors: map[string]int{},
	}

	// Clean up old prow jobs first.
	prowJobs := &prowapi.ProwJobList{}
	if err := c.prowJobClient.List(c.ctx, prowJobs, ctrlruntimeclient.InNamespace(c.config().ProwJobNamespace)); err != nil {
		c.logger.WithError(err).Error("Error listing prow jobs.")
		return
	}
	metrics.prowJobsCreated = len(prowJobs.Items)

	// Only delete a pod if its prowjob is marked as finished.
	pjMap := map[string]*prowapi.ProwJob{}
	isFinished := sets.New[string]()

	maxProwJobAge := c.config().Sinker.MaxProwJobAge.Duration
	for i, prowJob := range prowJobs.Items {
		pjMap[prowJob.ObjectMeta.Name] = &prowJobs.Items[i]
		// Handle periodics separately.
		if prowJob.Spec.Type == prowapi.PeriodicJob {
			continue
		}
		if !prowJob.Complete() {
			continue
		}
		isFinished.Insert(prowJob.ObjectMeta.Name)
		if time.Since(prowJob.Status.StartTime.Time) <= maxProwJobAge {
			continue
		}
		if err := c.prowJobClient.Delete(c.ctx, &prowJob); err == nil {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).Info("Deleted prowjob.")
			metrics.prowJobsCleaned[reasonProwJobAged]++
		} else {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).WithError(err).Error("Error deleting prowjob.")
			metrics.prowJobsCleaningErrors[string(k8serrors.ReasonForError(err))]++
		}
	}

	// Keep track of which periodic jobs are in the config so we will
	// not clean up their last prowjob.
	isActivePeriodic := make(map[string]bool)
	for _, p := range c.config().Periodics {
		isActivePeriodic[p.Name] = true
	}

	// Get the jobs that we need to retain so horologium can continue working
	// as intended.
	latestPeriodics := pjutil.GetLatestProwJobs(prowJobs.Items, prowapi.PeriodicJob)
	for _, prowJob := range prowJobs.Items {
		if prowJob.Spec.Type != prowapi.PeriodicJob {
			continue
		}

		if !prowJob.Complete() {
			continue
		}
		isFinished.Insert(prowJob.ObjectMeta.Name)
		latestPJ := latestPeriodics[prowJob.Spec.Job]
		if isActivePeriodic[prowJob.Spec.Job] && prowJob.ObjectMeta.Name == latestPJ.ObjectMeta.Name {
			// Ignore deleting this one.
			continue
		}
		if time.Since(prowJob.Status.StartTime.Time) <= maxProwJobAge {
			continue
		}
		if err := c.prowJobClient.Delete(c.ctx, &prowJob); err == nil {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).Info("Deleted prowjob.")
			metrics.prowJobsCleaned[reasonProwJobAgedPeriodic]++
		} else {
			c.logger.WithFields(pjutil.ProwJobFields(&prowJob)).WithError(err).Error("Error deleting prowjob.")
			metrics.prowJobsCleaningErrors[string(k8serrors.ReasonForError(err))]++
		}
	}

	// Now clean up old pods.
	for cluster, client := range c.podClients {
		log := c.logger.WithField("cluster", cluster)
		var isClusterExcluded bool
		for _, excludeCluster := range c.config().Sinker.ExcludeClusters {
			if excludeCluster == cluster {
				isClusterExcluded = true
				break
			}
		}
		if isClusterExcluded {
			log.Debugf("Cluster %q is excluded, skipping pod deletion.", cluster)
			continue
		}
		var pods corev1api.PodList
		if err := client.List(c.ctx, &pods, ctrlruntimeclient.MatchingLabels{kube.CreatedByProw: "true"}, ctrlruntimeclient.InNamespace(c.config().PodNamespace)); err != nil {
			log.WithError(err).Error("Error listing pods.")
			continue
		}
		log.WithField("pod-count", len(pods.Items)).Debug("Successfully listed pods.")
		metrics.podsCreated += len(pods.Items)
		maxPodAge := c.config().Sinker.MaxPodAge.Duration
		terminatedPodTTL := c.config().Sinker.TerminatedPodTTL.Duration
		for _, pod := range pods.Items {
			reason := ""
			clean := false

			// By default, use the pod name as the key to match the associated prow job.
			// This is to support legacy plank in case the kube.ProwJobIDLabel label is not set.
			podJobName := pod.ObjectMeta.Name
			// If the pod has the kube.ProwJobIDLabel label, use it instead of the pod name.
			if value, ok := pod.ObjectMeta.Labels[kube.ProwJobIDLabel]; ok {
				podJobName = value
			}
			log = log.WithField("pj", podJobName)
			terminationTime := time.Time{}
			if pj, ok := pjMap[podJobName]; ok && pj.Complete() {
				terminationTime = pj.Status.CompletionTime.Time
			}

			if podNeedsKubernetesFinalizerCleanup(log, pjMap[podJobName], &pod) {
				if err := c.cleanupKubernetesFinalizer(&pod, client); err != nil {
					log.WithError(err).Error("Failed to remove kubernetesreporter finalizer")
				}
			}

			switch {
			case !pod.Status.StartTime.IsZero() && time.Since(pod.Status.StartTime.Time) > maxPodAge:
				clean = true
				reason = reasonPodAged
			case !terminationTime.IsZero() && time.Since(terminationTime) > terminatedPodTTL:
				clean = true
				reason = reasonPodTTLed
			}

			if !isFinished.Has(podJobName) {
				// The prowjob exists and is not marked as completed yet;
				// deleting the pod now would result in plank creating a brand new pod.
				clean = false
			}

			if c.isPodOrphaned(log, &pod, podJobName) {
				// The prowjob is gone, so we want to clean up orphaned pods regardless of their state.
				reason = reasonPodOrphaned
				clean = true
			}

			if !clean {
				continue
			}

			c.deletePod(log, &pod, reason, client, &metrics)
		}
	}

	metrics.finishedAt = time.Now()
	sinkerMetrics.podsCreated.Set(float64(metrics.podsCreated))
	sinkerMetrics.timeUsed.Set(float64(metrics.getTimeUsed().Seconds()))
	for k, v := range metrics.podsRemoved {
		sinkerMetrics.podsRemoved.WithLabelValues(k).Set(float64(v))
	}
	for k, v := range metrics.podRemovalErrors {
		sinkerMetrics.podRemovalErrors.WithLabelValues(k).Set(float64(v))
	}
	sinkerMetrics.prowJobsCreated.Set(float64(metrics.prowJobsCreated))
	for k, v := range metrics.prowJobsCleaned {
		sinkerMetrics.prowJobsCleaned.WithLabelValues(k).Set(float64(v))
	}
	for k, v := range metrics.prowJobsCleaningErrors {
		sinkerMetrics.prowJobsCleaningErrors.WithLabelValues(k).Set(float64(v))
	}
	c.logger.Info("Sinker reconciliation complete.")
}

func (c *controller) cleanupKubernetesFinalizer(pod *corev1api.Pod, client ctrlruntimeclient.Client) error {
	oldPod := pod.DeepCopy()
	pod.Finalizers = sets.List(sets.New[string](pod.Finalizers...).Delete(kubernetesreporterapi.FinalizerName))

	if err := client.Patch(c.ctx, pod, ctrlruntimeclient.MergeFrom(oldPod)); err != nil {
		return fmt.Errorf("failed to patch pod: %w", err)
	}

	return nil
}

func (c *controller) deletePod(log *logrus.Entry, pod *corev1api.Pod, reason string, client ctrlruntimeclient.Client, m *sinkerReconciliationMetrics) {
	name := pod.Name
	// Delete old finished or orphaned pods. Don't quit if we fail to delete one.
	if err := client.Delete(c.ctx, pod); err == nil {
		log.WithFields(logrus.Fields{"pod": name, "reason": reason}).Info("Deleted old completed pod.")
		m.podsRemoved[reason]++
	} else {
		m.podRemovalErrors[string(k8serrors.ReasonForError(err))]++
		if k8serrors.IsNotFound(err) {
			log.WithField("pod", name).WithError(err).Info("Could not delete missing pod.")
		} else {
			log.WithField("pod", name).WithError(err).Error("Error deleting pod.")
		}
	}
}

func (c *controller) isPodOrphaned(log *logrus.Entry, pod *corev1api.Pod, prowJobName string) bool {
	// ProwJobs are cached and the cache may lag a bit behind, so never consider
	// pods that are less than 30 seconds old as orphaned.
	if !pod.CreationTimestamp.Before(&metav1.Time{Time: time.Now().Add(-30 * time.Second)}) {
		return false
	}

	// We do a list at the very beginning of our processing. By the time we reach this check, that
	// list might be outdated, so do another GET here before declaring the pod orphaned.
	pjName := types.NamespacedName{Namespace: c.config().ProwJobNamespace, Name: prowJobName}
	if err := c.prowJobClient.Get(c.ctx, pjName, &prowapi.ProwJob{}); err != nil {
		if k8serrors.IsNotFound(err) {
			return true
		}
		log.WithError(err).Error("Failed to get prowjob")
	}

	return false
}

func podNeedsKubernetesFinalizerCleanup(log *logrus.Entry, pj *prowapi.ProwJob, pod *corev1api.Pod) bool {
	// Can happen if someone deletes the prowjob before it finishes.
	if pj == nil {
		return true
	}
	// This is always a bug.
	if pj.Complete() && pj.Status.PrevReportStates[kubernetesreporterapi.ReporterName] == pj.Status.State && sets.New[string](pod.Finalizers...).Has(kubernetesreporterapi.FinalizerName) {
		log.WithField("pj", pj.Name).Errorf("BUG: Pod for prowjob still had the %s finalizer after completing and being successfully reported by the %s reporter", kubernetesreporterapi.FinalizerName, kubernetesreporterapi.ReporterName)

		return true
	}

	return false
}

// jobConfigMapMonitor reports metrics for the size of the ConfigMap(s) found
// under the directory specified with --job-config-path (example:
// "--job-config-path=/etc/job-config"). There are two possibilities: either
// the job ConfigMap is mounted directly at that path, or the ConfigMap was
// partitioned (see https://github.com/kubernetes/test-infra/pull/28835) and
// there are multiple subdirs underneath this one.
func jobConfigMapMonitor(interval time.Duration, jobConfigPath string) {
	logger := logrus.WithField("sync-loop", "job-configmap-monitor")
	ticker := time.NewTicker(interval)

	for ; true; <-ticker.C {
		dirs, err := getConfigMapDirs(jobConfigPath)
		if err != nil {
			logger.WithField("dir", jobConfigPath).WithError(err).Error("could not resolve ConfigMap dirs")
			continue
		}
		for _, dir := range dirs {
			bytes, err := getConfigMapSize(dir)
			if err != nil {
				logger.WithField("dir", dir).WithError(err).Error("Failed to get configmap metrics")
				continue
			}
			sinkerMetrics.jobConfigMapSize.WithLabelValues(dir).Set(float64(bytes))
		}
	}
}

// getDataDir gets the "..data" symlink which points to a timestamped directory.
// See the comment for getConfigMapSize() for details.
func getDataDir(toplevel string) string {
	return path.Join(toplevel, "..data")
}

func getConfigMapDirs(toplevel string) ([]string, error) {
	dataDir := getDataDir(toplevel)
	dirs := []string{}

	// If the data dir (symlink) does not exist directly, then assume that this
	// path is a partition holding multiple ConfigMap-mounted dirs. We use
	// os.Stat(), which means that both the "..data" symlink and its target
	// folder must exist. Of course, nothing stops the folder from having
	// "..data" as a folder or regular file, which would count as a false
	// positive, but we ignore these cases because exhaustive checking here is
	// not our concern. We just want metrics.
	if _, err := os.Stat(dataDir); errors.Is(err, os.ErrNotExist) {
		files, err := os.ReadDir(toplevel)
		if err != nil {
			return nil, err
		}

		for _, file := range files {
			if !file.IsDir() {
				continue
			}
			dirs = append(dirs, filepath.Join(toplevel, file.Name()))
		}
	} else {
		dirs = append(dirs, toplevel)
	}

	return dirs, nil
}

// getConfigMapSize expects a path to the filesystem where a Kubernetes
// ConfigMap has been mounted. It walks every entry found under that
// directory and adds up the sizes of the regular files it encounters.
//
// When a ConfigMap is mounted to disk, all of its keys become files and the
// value (data) for each key becomes the contents of the respective file.
// Another special symlink, `..data`, will also be at the same level as the
// keys, and this symlink will point to yet another folder at the same level
// like `..2024_01_11_22_52_09.1709975282`. This timestamped folder is where
// the actual files are located. So the layout looks like:
//
//	folder-named-after-configmap-name
//	folder-named-after-configmap-name/..2024_01_11_22_52_09.1709975282
//	folder-named-after-configmap-name/..data (symlinked to ..2024_01_11... above)
//	folder-named-after-configmap-name/key1 (symlinked to ..data/key1)
//	folder-named-after-configmap-name/key2 (symlinked to ..data/key2)
//
// The above layout with the timestamped folder and the "..data" symlink is a
// Kubernetes construct, and is applicable to every ConfigMap mounted to disk by
// Kubernetes.
//
// For our purposes the exact details of this layout don't matter too much.
// What we do care about is not double counting: the walk below skips the
// symlinks (the per-key links and "..data") and only sums regular files, so
// each key is counted exactly once via the timestamped folder.
func getConfigMapSize(configmapDir string) (int64, error) {
	var total int64

	// Look into the "..data" symlinked folder, which should contain the actual
	// files where each file is a key in the ConfigMap.
	dataDir := getDataDir(configmapDir)
	if _, err := os.Stat(dataDir); errors.Is(err, os.ErrNotExist) {
		return 0, fmt.Errorf("%q is not a ConfigMap-mounted dir", configmapDir)
	}

	logger := logrus.NewEntry(logrus.StandardLogger())

	var walkDirFunc = func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		// Don't process directories (that is, only process files). We don't
		// expect any directories to exist at this level, but it doesn't hurt to
		// skip any we encounter.
		if d.IsDir() {
			return nil
		}
		// Skip any symbolic links.
		if d.Type() == fs.ModeSymlink {
			return nil
		}

		info, err := d.Info()
		if err != nil {
			return err
		}
		logger.Infof("file %q is %v bytes", path, info.Size())
		total += info.Size()
		return nil
	}

	if err := filepath.WalkDir(configmapDir, walkDirFunc); err != nil {
		return 0, err
	}

	return total, nil
}