github.com/argoproj/argo-cd/v3@v3.2.1/controller/metrics/metrics.go (about) 1 package metrics 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "net/http" 8 "os" 9 "slices" 10 "strconv" 11 "time" 12 13 "github.com/argoproj/gitops-engine/pkg/health" 14 "github.com/prometheus/client_golang/prometheus" 15 "github.com/prometheus/client_golang/prometheus/promhttp" 16 "github.com/robfig/cron/v3" 17 log "github.com/sirupsen/logrus" 18 "k8s.io/apimachinery/pkg/labels" 19 ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" 20 21 "github.com/argoproj/argo-cd/v3/common" 22 argoappv1 "github.com/argoproj/argo-cd/v3/pkg/apis/application/v1alpha1" 23 applister "github.com/argoproj/argo-cd/v3/pkg/client/listers/application/v1alpha1" 24 "github.com/argoproj/argo-cd/v3/util/argo" 25 "github.com/argoproj/argo-cd/v3/util/db" 26 "github.com/argoproj/argo-cd/v3/util/git" 27 "github.com/argoproj/argo-cd/v3/util/healthz" 28 metricsutil "github.com/argoproj/argo-cd/v3/util/metrics" 29 "github.com/argoproj/argo-cd/v3/util/metrics/kubectl" 30 "github.com/argoproj/argo-cd/v3/util/profile" 31 ) 32 33 type MetricsServer struct { 34 *http.Server 35 syncCounter *prometheus.CounterVec 36 syncDuration *prometheus.CounterVec 37 kubectlExecCounter *prometheus.CounterVec 38 kubectlExecPendingGauge *prometheus.GaugeVec 39 orphanedResourcesGauge *prometheus.GaugeVec 40 k8sRequestCounter *prometheus.CounterVec 41 clusterEventsCounter *prometheus.CounterVec 42 redisRequestCounter *prometheus.CounterVec 43 reconcileHistogram *prometheus.HistogramVec 44 redisRequestHistogram *prometheus.HistogramVec 45 resourceEventsProcessingHistogram *prometheus.HistogramVec 46 resourceEventsNumberGauge *prometheus.GaugeVec 47 registry *prometheus.Registry 48 hostname string 49 cron *cron.Cron 50 } 51 52 const ( 53 // MetricsPath is the endpoint to collect application metrics 54 MetricsPath = "/metrics" 55 ) 56 57 // Follow Prometheus naming practices 58 // https://prometheus.io/docs/practices/naming/ 59 var ( 60 descAppDefaultLabels = []string{"namespace", "name", "project"} 61 62 descAppLabels *prometheus.Desc 63 descAppConditions *prometheus.Desc 64 65 descAppInfo = prometheus.NewDesc( 66 "argocd_app_info", 67 "Information about application.", 68 append(descAppDefaultLabels, "autosync_enabled", "repo", "dest_server", "dest_namespace", "sync_status", "health_status", "operation"), 69 nil, 70 ) 71 72 syncCounter = prometheus.NewCounterVec( 73 prometheus.CounterOpts{ 74 Name: "argocd_app_sync_total", 75 Help: "Number of application syncs.", 76 }, 77 append(descAppDefaultLabels, "dest_server", "phase", "dry_run"), 78 ) 79 80 syncDuration = prometheus.NewCounterVec( 81 prometheus.CounterOpts{ 82 Name: "argocd_app_sync_duration_seconds_total", 83 Help: "Application sync performance in seconds total.", 84 }, 85 append(descAppDefaultLabels, "dest_server"), 86 ) 87 88 k8sRequestCounter = prometheus.NewCounterVec( 89 prometheus.CounterOpts{ 90 Name: "argocd_app_k8s_request_total", 91 Help: "Number of kubernetes requests executed during application reconciliation.", 92 }, 93 append(descAppDefaultLabels, "server", "response_code", "verb", "resource_kind", "resource_namespace", "dry_run"), 94 ) 95 96 kubectlExecCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ 97 Name: "argocd_kubectl_exec_total", 98 Help: "Number of kubectl executions", 99 }, []string{"hostname", "command"}) 100 101 kubectlExecPendingGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 102 Name: "argocd_kubectl_exec_pending", 103 Help: "Number of pending kubectl executions", 104 }, []string{"hostname", "command"}) 105 106 reconcileHistogram = prometheus.NewHistogramVec( 107 prometheus.HistogramOpts{ 108 Name: "argocd_app_reconcile", 109 Help: "Application reconciliation performance in seconds.", 110 // Buckets chosen after observing a ~2100ms mean reconcile time 111 Buckets: []float64{0.25, .5, 1, 2, 4, 8, 16}, 112 }, 113 []string{"namespace", "dest_server"}, 114 ) 115 116 clusterEventsCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ 117 Name: "argocd_cluster_events_total", 118 Help: "Number of processes k8s resource events.", 119 }, append(descClusterDefaultLabels, "group", "kind")) 120 121 redisRequestCounter = prometheus.NewCounterVec( 122 prometheus.CounterOpts{ 123 Name: "argocd_redis_request_total", 124 Help: "Number of redis requests executed during application reconciliation.", 125 }, 126 []string{"hostname", "initiator", "failed"}, 127 ) 128 129 redisRequestHistogram = prometheus.NewHistogramVec( 130 prometheus.HistogramOpts{ 131 Name: "argocd_redis_request_duration", 132 Help: "Redis requests duration.", 133 Buckets: []float64{0.01, 0.05, 0.10, 0.25, .5, 1}, 134 }, 135 []string{"hostname", "initiator"}, 136 ) 137 138 orphanedResourcesGauge = prometheus.NewGaugeVec( 139 prometheus.GaugeOpts{ 140 Name: "argocd_app_orphaned_resources_count", 141 Help: "Number of orphaned resources per application", 142 }, 143 descAppDefaultLabels, 144 ) 145 146 resourceEventsProcessingHistogram = prometheus.NewHistogramVec( 147 prometheus.HistogramOpts{ 148 Name: "argocd_resource_events_processing", 149 Help: "Time to process resource events in seconds.", 150 Buckets: []float64{0.25, .5, 1, 2, 4, 8, 16}, 151 }, 152 []string{"server"}, 153 ) 154 155 resourceEventsNumberGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 156 Name: "argocd_resource_events_processed_in_batch", 157 Help: "Number of resource events processed in batch", 158 }, []string{"server"}) 159 ) 160 161 // NewMetricsServer returns a new prometheus server which collects application metrics 162 func NewMetricsServer(addr string, appLister applister.ApplicationLister, appFilter func(obj any) bool, healthCheck func(r *http.Request) error, appLabels []string, appConditions []string, db db.ArgoDB) (*MetricsServer, error) { 163 hostname, err := os.Hostname() 164 if err != nil { 165 return nil, err 166 } 167 168 if len(appLabels) > 0 { 169 normalizedLabels := metricsutil.NormalizeLabels("label", appLabels) 170 descAppLabels = prometheus.NewDesc( 171 "argocd_app_labels", 172 "Argo Application labels converted to Prometheus labels", 173 append(descAppDefaultLabels, normalizedLabels...), 174 nil, 175 ) 176 } 177 178 if len(appConditions) > 0 { 179 descAppConditions = prometheus.NewDesc( 180 "argocd_app_condition", 181 "Report application conditions.", 182 append(descAppDefaultLabels, "condition"), 183 nil, 184 ) 185 } 186 187 mux := http.NewServeMux() 188 registry := NewAppRegistry(appLister, appFilter, appLabels, appConditions, db) 189 190 mux.Handle(MetricsPath, promhttp.HandlerFor(prometheus.Gatherers{ 191 // contains app controller specific metrics 192 registry, 193 // contains workqueue metrics, process and golang metrics 194 ctrlmetrics.Registry, 195 }, promhttp.HandlerOpts{})) 196 profile.RegisterProfiler(mux) 197 healthz.ServeHealthCheck(mux, healthCheck) 198 199 registry.MustRegister(syncCounter) 200 registry.MustRegister(syncDuration) 201 registry.MustRegister(k8sRequestCounter) 202 registry.MustRegister(kubectlExecCounter) 203 registry.MustRegister(kubectlExecPendingGauge) 204 registry.MustRegister(orphanedResourcesGauge) 205 registry.MustRegister(reconcileHistogram) 206 registry.MustRegister(clusterEventsCounter) 207 registry.MustRegister(redisRequestCounter) 208 registry.MustRegister(redisRequestHistogram) 209 registry.MustRegister(resourceEventsProcessingHistogram) 210 registry.MustRegister(resourceEventsNumberGauge) 211 212 kubectl.RegisterWithClientGo() 213 kubectl.RegisterWithPrometheus(registry) 214 215 metricsServer := &MetricsServer{ 216 registry: registry, 217 Server: &http.Server{ 218 Addr: addr, 219 Handler: mux, 220 }, 221 syncCounter: syncCounter, 222 syncDuration: syncDuration, 223 k8sRequestCounter: k8sRequestCounter, 224 kubectlExecCounter: kubectlExecCounter, 225 kubectlExecPendingGauge: kubectlExecPendingGauge, 226 orphanedResourcesGauge: orphanedResourcesGauge, 227 reconcileHistogram: reconcileHistogram, 228 clusterEventsCounter: clusterEventsCounter, 229 redisRequestCounter: redisRequestCounter, 230 redisRequestHistogram: redisRequestHistogram, 231 resourceEventsProcessingHistogram: resourceEventsProcessingHistogram, 232 resourceEventsNumberGauge: resourceEventsNumberGauge, 233 hostname: hostname, 234 // This cron is used to expire the metrics cache. 235 // Currently clearing the metrics cache is logging and deleting from the map 236 // so there is no possibility of panic, but we will add a chain to keep robfig/cron v1 behavior. 237 cron: cron.New(cron.WithChain(cron.Recover(cron.PrintfLogger(log.StandardLogger())))), 238 } 239 240 return metricsServer, nil 241 } 242 243 func (m *MetricsServer) RegisterClustersInfoSource(ctx context.Context, source HasClustersInfo, db db.ArgoDB, clusterLabels []string) { 244 collector := NewClusterCollector(ctx, source, db.ListClusters, clusterLabels) 245 m.registry.MustRegister(collector) 246 } 247 248 // IncSync increments the sync counter for an application 249 func (m *MetricsServer) IncSync(app *argoappv1.Application, destServer string, state *argoappv1.OperationState) { 250 if !state.Phase.Completed() { 251 return 252 } 253 isDryRun := app.Operation != nil && app.Operation.DryRun() 254 m.syncCounter.WithLabelValues(app.Namespace, app.Name, app.Spec.GetProject(), destServer, string(state.Phase), strconv.FormatBool(isDryRun)).Inc() 255 } 256 257 // IncAppSyncDuration observes app sync duration 258 func (m *MetricsServer) IncAppSyncDuration(app *argoappv1.Application, destServer string, state *argoappv1.OperationState) { 259 if state.FinishedAt != nil { 260 m.syncDuration.WithLabelValues(app.Namespace, app.Name, app.Spec.GetProject(), destServer). 261 Add(float64(time.Duration(state.FinishedAt.Unix() - state.StartedAt.Unix()))) 262 } 263 } 264 265 func (m *MetricsServer) IncKubectlExec(command string) { 266 m.kubectlExecCounter.WithLabelValues(m.hostname, command).Inc() 267 } 268 269 func (m *MetricsServer) IncKubectlExecPending(command string) { 270 m.kubectlExecPendingGauge.WithLabelValues(m.hostname, command).Inc() 271 } 272 273 func (m *MetricsServer) DecKubectlExecPending(command string) { 274 m.kubectlExecPendingGauge.WithLabelValues(m.hostname, command).Dec() 275 } 276 277 func (m *MetricsServer) SetOrphanedResourcesMetric(app *argoappv1.Application, numOrphanedResources int) { 278 m.orphanedResourcesGauge.WithLabelValues(app.Namespace, app.Name, app.Spec.GetProject()).Set(float64(numOrphanedResources)) 279 } 280 281 // IncClusterEventsCount increments the number of cluster events 282 func (m *MetricsServer) IncClusterEventsCount(server, group, kind string) { 283 m.clusterEventsCounter.WithLabelValues(server, group, kind).Inc() 284 } 285 286 // IncKubernetesRequest increments the kubernetes requests counter for an application 287 func (m *MetricsServer) IncKubernetesRequest(app *argoappv1.Application, server, statusCode, verb, resourceKind, resourceNamespace string) { 288 var namespace, name, project string 289 isDryRun := false 290 if app != nil { 291 namespace = app.Namespace 292 name = app.Name 293 project = app.Spec.GetProject() 294 isDryRun = app.Operation != nil && app.Operation.DryRun() 295 } 296 m.k8sRequestCounter.WithLabelValues( 297 namespace, name, project, server, statusCode, 298 verb, resourceKind, resourceNamespace, strconv.FormatBool(isDryRun), 299 ).Inc() 300 } 301 302 func (m *MetricsServer) IncRedisRequest(failed bool) { 303 m.redisRequestCounter.WithLabelValues(m.hostname, common.ApplicationController, strconv.FormatBool(failed)).Inc() 304 } 305 306 // ObserveRedisRequestDuration observes redis request duration 307 func (m *MetricsServer) ObserveRedisRequestDuration(duration time.Duration) { 308 m.redisRequestHistogram.WithLabelValues(m.hostname, common.ApplicationController).Observe(duration.Seconds()) 309 } 310 311 // ObserveResourceEventsProcessingDuration observes resource events processing duration 312 func (m *MetricsServer) ObserveResourceEventsProcessingDuration(server string, duration time.Duration, processedEventsNumber int) { 313 m.resourceEventsProcessingHistogram.WithLabelValues(server).Observe(duration.Seconds()) 314 m.resourceEventsNumberGauge.WithLabelValues(server).Set(float64(processedEventsNumber)) 315 } 316 317 // IncReconcile increments the reconcile counter for an application 318 func (m *MetricsServer) IncReconcile(app *argoappv1.Application, destServer string, duration time.Duration) { 319 m.reconcileHistogram.WithLabelValues(app.Namespace, destServer).Observe(duration.Seconds()) 320 } 321 322 // HasExpiration return true if expiration is set 323 func (m *MetricsServer) HasExpiration() bool { 324 return len(m.cron.Entries()) > 0 325 } 326 327 // SetExpiration reset Prometheus metrics based on time duration interval 328 func (m *MetricsServer) SetExpiration(cacheExpiration time.Duration) error { 329 if m.HasExpiration() { 330 return errors.New("expiration is already set") 331 } 332 333 _, err := m.cron.AddFunc(fmt.Sprintf("@every %s", cacheExpiration), func() { 334 log.Infof("Reset Prometheus metrics based on existing expiration '%v'", cacheExpiration) 335 m.syncCounter.Reset() 336 m.syncDuration.Reset() 337 m.kubectlExecCounter.Reset() 338 m.kubectlExecPendingGauge.Reset() 339 m.orphanedResourcesGauge.Reset() 340 m.k8sRequestCounter.Reset() 341 m.clusterEventsCounter.Reset() 342 m.redisRequestCounter.Reset() 343 m.reconcileHistogram.Reset() 344 m.redisRequestHistogram.Reset() 345 m.resourceEventsProcessingHistogram.Reset() 346 m.resourceEventsNumberGauge.Reset() 347 kubectl.ResetAll() 348 }) 349 if err != nil { 350 return err 351 } 352 353 m.cron.Start() 354 return nil 355 } 356 357 type appCollector struct { 358 store applister.ApplicationLister 359 appFilter func(obj any) bool 360 appLabels []string 361 appConditions []string 362 db db.ArgoDB 363 } 364 365 // NewAppCollector returns a prometheus collector for application metrics 366 func NewAppCollector(appLister applister.ApplicationLister, appFilter func(obj any) bool, appLabels []string, appConditions []string, db db.ArgoDB) prometheus.Collector { 367 return &appCollector{ 368 store: appLister, 369 appFilter: appFilter, 370 appLabels: appLabels, 371 appConditions: appConditions, 372 db: db, 373 } 374 } 375 376 // NewAppRegistry creates a new prometheus registry that collects applications 377 func NewAppRegistry(appLister applister.ApplicationLister, appFilter func(obj any) bool, appLabels []string, appConditions []string, db db.ArgoDB) *prometheus.Registry { 378 registry := prometheus.NewRegistry() 379 registry.MustRegister(NewAppCollector(appLister, appFilter, appLabels, appConditions, db)) 380 return registry 381 } 382 383 // Describe implements the prometheus.Collector interface 384 func (c *appCollector) Describe(ch chan<- *prometheus.Desc) { 385 if len(c.appLabels) > 0 { 386 ch <- descAppLabels 387 } 388 if len(c.appConditions) > 0 { 389 ch <- descAppConditions 390 } 391 ch <- descAppInfo 392 } 393 394 // Collect implements the prometheus.Collector interface 395 func (c *appCollector) Collect(ch chan<- prometheus.Metric) { 396 apps, err := c.store.List(labels.NewSelector()) 397 if err != nil { 398 log.Warnf("Failed to collect applications: %v", err) 399 return 400 } 401 for _, app := range apps { 402 if !c.appFilter(app) { 403 continue 404 } 405 destCluster, err := argo.GetDestinationCluster(context.Background(), app.Spec.Destination, c.db) 406 if err != nil { 407 log.Warnf("Failed to get destination cluster for application %s: %v", app.Name, err) 408 } 409 destServer := "" 410 if destCluster != nil { 411 destServer = destCluster.Server 412 } 413 c.collectApps(ch, app, destServer) 414 } 415 } 416 417 func boolFloat64(b bool) float64 { 418 if b { 419 return 1 420 } 421 return 0 422 } 423 424 func (c *appCollector) collectApps(ch chan<- prometheus.Metric, app *argoappv1.Application, destServer string) { 425 addConstMetric := func(desc *prometheus.Desc, t prometheus.ValueType, v float64, lv ...string) { 426 project := app.Spec.GetProject() 427 lv = append([]string{app.Namespace, app.Name, project}, lv...) 428 ch <- prometheus.MustNewConstMetric(desc, t, v, lv...) 429 } 430 addGauge := func(desc *prometheus.Desc, v float64, lv ...string) { 431 addConstMetric(desc, prometheus.GaugeValue, v, lv...) 432 } 433 434 var operation string 435 if app.DeletionTimestamp != nil { 436 operation = "delete" 437 } else if app.Operation != nil && app.Operation.Sync != nil { 438 operation = "sync" 439 } 440 syncStatus := app.Status.Sync.Status 441 if syncStatus == "" { 442 syncStatus = argoappv1.SyncStatusCodeUnknown 443 } 444 healthStatus := app.Status.Health.Status 445 if healthStatus == "" { 446 healthStatus = health.HealthStatusUnknown 447 } 448 449 autoSyncEnabled := app.Spec.SyncPolicy != nil && app.Spec.SyncPolicy.IsAutomatedSyncEnabled() 450 451 addGauge(descAppInfo, 1, strconv.FormatBool(autoSyncEnabled), git.NormalizeGitURL(app.Spec.GetSource().RepoURL), destServer, app.Spec.Destination.Namespace, string(syncStatus), string(healthStatus), operation) 452 453 if len(c.appLabels) > 0 { 454 labelValues := []string{} 455 for _, desiredLabel := range c.appLabels { 456 value := app.GetLabels()[desiredLabel] 457 labelValues = append(labelValues, value) 458 } 459 addGauge(descAppLabels, 1, labelValues...) 460 } 461 462 if len(c.appConditions) > 0 { 463 conditionCount := make(map[string]int) 464 for _, condition := range app.Status.Conditions { 465 if slices.Contains(c.appConditions, condition.Type) { 466 conditionCount[condition.Type]++ 467 } 468 } 469 470 for conditionType, count := range conditionCount { 471 addGauge(descAppConditions, float64(count), conditionType) 472 } 473 } 474 }