github.com/argoproj/argo-cd/v2@v2.10.9/controller/metrics/metrics.go (about) 1 package metrics 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "net/http" 8 "os" 9 "regexp" 10 "strconv" 11 "time" 12 13 "github.com/argoproj/gitops-engine/pkg/health" 14 "github.com/prometheus/client_golang/prometheus" 15 "github.com/prometheus/client_golang/prometheus/promhttp" 16 "github.com/robfig/cron/v3" 17 log "github.com/sirupsen/logrus" 18 "k8s.io/apimachinery/pkg/labels" 19 20 "github.com/argoproj/argo-cd/v2/common" 21 argoappv1 "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1" 22 applister "github.com/argoproj/argo-cd/v2/pkg/client/listers/application/v1alpha1" 23 "github.com/argoproj/argo-cd/v2/util/git" 24 "github.com/argoproj/argo-cd/v2/util/healthz" 25 "github.com/argoproj/argo-cd/v2/util/profile" 26 27 ctrl_metrics "sigs.k8s.io/controller-runtime/pkg/metrics" 28 ) 29 30 type MetricsServer struct { 31 *http.Server 32 syncCounter *prometheus.CounterVec 33 kubectlExecCounter *prometheus.CounterVec 34 kubectlExecPendingGauge *prometheus.GaugeVec 35 k8sRequestCounter *prometheus.CounterVec 36 clusterEventsCounter *prometheus.CounterVec 37 redisRequestCounter *prometheus.CounterVec 38 reconcileHistogram *prometheus.HistogramVec 39 redisRequestHistogram *prometheus.HistogramVec 40 registry *prometheus.Registry 41 hostname string 42 cron *cron.Cron 43 } 44 45 const ( 46 // MetricsPath is the endpoint to collect application metrics 47 MetricsPath = "/metrics" 48 // EnvVarLegacyControllerMetrics is a env var to re-enable deprecated prometheus metrics 49 EnvVarLegacyControllerMetrics = "ARGOCD_LEGACY_CONTROLLER_METRICS" 50 ) 51 52 // Follow Prometheus naming practices 53 // https://prometheus.io/docs/practices/naming/ 54 var ( 55 descAppDefaultLabels = []string{"namespace", "name", "project"} 56 57 descAppLabels *prometheus.Desc 58 59 descAppInfo = prometheus.NewDesc( 60 "argocd_app_info", 61 "Information about application.", 62 append(descAppDefaultLabels, "autosync_enabled", "repo", "dest_server", "dest_namespace", "sync_status", "health_status", "operation"), 63 nil, 64 ) 65 // DEPRECATED 66 descAppCreated = prometheus.NewDesc( 67 "argocd_app_created_time", 68 "Creation time in unix timestamp for an application.", 69 descAppDefaultLabels, 70 nil, 71 ) 72 // DEPRECATED: superseded by sync_status label in argocd_app_info 73 descAppSyncStatusCode = prometheus.NewDesc( 74 "argocd_app_sync_status", 75 "The application current sync status.", 76 append(descAppDefaultLabels, "sync_status"), 77 nil, 78 ) 79 // DEPRECATED: superseded by health_status label in argocd_app_info 80 descAppHealthStatus = prometheus.NewDesc( 81 "argocd_app_health_status", 82 "The application current health status.", 83 append(descAppDefaultLabels, "health_status"), 84 nil, 85 ) 86 87 syncCounter = prometheus.NewCounterVec( 88 prometheus.CounterOpts{ 89 Name: "argocd_app_sync_total", 90 Help: "Number of application syncs.", 91 }, 92 append(descAppDefaultLabels, "dest_server", "phase"), 93 ) 94 95 k8sRequestCounter = prometheus.NewCounterVec( 96 prometheus.CounterOpts{ 97 Name: "argocd_app_k8s_request_total", 98 Help: "Number of kubernetes requests executed during application reconciliation.", 99 }, 100 append(descAppDefaultLabels, "server", "response_code", "verb", "resource_kind", "resource_namespace"), 101 ) 102 103 kubectlExecCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ 104 Name: "argocd_kubectl_exec_total", 105 Help: "Number of kubectl executions", 106 }, []string{"hostname", "command"}) 107 108 kubectlExecPendingGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 109 Name: "argocd_kubectl_exec_pending", 110 Help: "Number of pending kubectl executions", 111 }, []string{"hostname", "command"}) 112 113 reconcileHistogram = prometheus.NewHistogramVec( 114 prometheus.HistogramOpts{ 115 Name: "argocd_app_reconcile", 116 Help: "Application reconciliation performance.", 117 // Buckets chosen after observing a ~2100ms mean reconcile time 118 Buckets: []float64{0.25, .5, 1, 2, 4, 8, 16}, 119 }, 120 []string{"namespace", "dest_server"}, 121 ) 122 123 clusterEventsCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ 124 Name: "argocd_cluster_events_total", 125 Help: "Number of processes k8s resource events.", 126 }, append(descClusterDefaultLabels, "group", "kind")) 127 128 redisRequestCounter = prometheus.NewCounterVec( 129 prometheus.CounterOpts{ 130 Name: "argocd_redis_request_total", 131 Help: "Number of redis requests executed during application reconciliation.", 132 }, 133 []string{"hostname", "initiator", "failed"}, 134 ) 135 136 redisRequestHistogram = prometheus.NewHistogramVec( 137 prometheus.HistogramOpts{ 138 Name: "argocd_redis_request_duration", 139 Help: "Redis requests duration.", 140 Buckets: []float64{0.01, 0.05, 0.10, 0.25, .5, 1}, 141 }, 142 []string{"hostname", "initiator"}, 143 ) 144 ) 145 146 // NewMetricsServer returns a new prometheus server which collects application metrics 147 func NewMetricsServer(addr string, appLister applister.ApplicationLister, appFilter func(obj interface{}) bool, healthCheck func(r *http.Request) error, appLabels []string) (*MetricsServer, error) { 148 hostname, err := os.Hostname() 149 if err != nil { 150 return nil, err 151 } 152 153 if len(appLabels) > 0 { 154 normalizedLabels := normalizeLabels("label", appLabels) 155 descAppLabels = prometheus.NewDesc( 156 "argocd_app_labels", 157 "Argo Application labels converted to Prometheus labels", 158 append(descAppDefaultLabels, normalizedLabels...), 159 nil, 160 ) 161 } 162 163 mux := http.NewServeMux() 164 registry := NewAppRegistry(appLister, appFilter, appLabels) 165 166 mux.Handle(MetricsPath, promhttp.HandlerFor(prometheus.Gatherers{ 167 // contains app controller specific metrics 168 registry, 169 // contains workqueue metrics, process and golang metrics 170 ctrl_metrics.Registry, 171 }, promhttp.HandlerOpts{})) 172 profile.RegisterProfiler(mux) 173 healthz.ServeHealthCheck(mux, healthCheck) 174 175 registry.MustRegister(syncCounter) 176 registry.MustRegister(k8sRequestCounter) 177 registry.MustRegister(kubectlExecCounter) 178 registry.MustRegister(kubectlExecPendingGauge) 179 registry.MustRegister(reconcileHistogram) 180 registry.MustRegister(clusterEventsCounter) 181 registry.MustRegister(redisRequestCounter) 182 registry.MustRegister(redisRequestHistogram) 183 184 return &MetricsServer{ 185 registry: registry, 186 Server: &http.Server{ 187 Addr: addr, 188 Handler: mux, 189 }, 190 syncCounter: syncCounter, 191 k8sRequestCounter: k8sRequestCounter, 192 kubectlExecCounter: kubectlExecCounter, 193 kubectlExecPendingGauge: kubectlExecPendingGauge, 194 reconcileHistogram: reconcileHistogram, 195 clusterEventsCounter: clusterEventsCounter, 196 redisRequestCounter: redisRequestCounter, 197 redisRequestHistogram: redisRequestHistogram, 198 hostname: hostname, 199 // This cron is used to expire the metrics cache. 200 // Currently clearing the metrics cache is logging and deleting from the map 201 // so there is no possibility of panic, but we will add a chain to keep robfig/cron v1 behavior. 202 cron: cron.New(cron.WithChain(cron.Recover(cron.PrintfLogger(log.StandardLogger())))), 203 }, nil 204 } 205 206 // Prometheus invalid labels, more info: https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels. 207 var invalidPromLabelChars = regexp.MustCompile(`[^a-zA-Z0-9_]`) 208 209 func normalizeLabels(prefix string, appLabels []string) []string { 210 results := []string{} 211 for _, label := range appLabels { 212 //prometheus labels don't accept dash in their name 213 curr := invalidPromLabelChars.ReplaceAllString(label, "_") 214 result := fmt.Sprintf("%s_%s", prefix, curr) 215 results = append(results, result) 216 } 217 return results 218 } 219 220 func (m *MetricsServer) RegisterClustersInfoSource(ctx context.Context, source HasClustersInfo) { 221 collector := &clusterCollector{infoSource: source} 222 go collector.Run(ctx) 223 m.registry.MustRegister(collector) 224 } 225 226 // IncSync increments the sync counter for an application 227 func (m *MetricsServer) IncSync(app *argoappv1.Application, state *argoappv1.OperationState) { 228 if !state.Phase.Completed() { 229 return 230 } 231 m.syncCounter.WithLabelValues(app.Namespace, app.Name, app.Spec.GetProject(), app.Spec.Destination.Server, string(state.Phase)).Inc() 232 } 233 234 func (m *MetricsServer) IncKubectlExec(command string) { 235 m.kubectlExecCounter.WithLabelValues(m.hostname, command).Inc() 236 } 237 238 func (m *MetricsServer) IncKubectlExecPending(command string) { 239 m.kubectlExecPendingGauge.WithLabelValues(m.hostname, command).Inc() 240 } 241 242 func (m *MetricsServer) DecKubectlExecPending(command string) { 243 m.kubectlExecPendingGauge.WithLabelValues(m.hostname, command).Dec() 244 } 245 246 // IncClusterEventsCount increments the number of cluster events 247 func (m *MetricsServer) IncClusterEventsCount(server, group, kind string) { 248 m.clusterEventsCounter.WithLabelValues(server, group, kind).Inc() 249 } 250 251 // IncKubernetesRequest increments the kubernetes requests counter for an application 252 func (m *MetricsServer) IncKubernetesRequest(app *argoappv1.Application, server, statusCode, verb, resourceKind, resourceNamespace string) { 253 var namespace, name, project string 254 if app != nil { 255 namespace = app.Namespace 256 name = app.Name 257 project = app.Spec.GetProject() 258 } 259 m.k8sRequestCounter.WithLabelValues( 260 namespace, name, project, server, statusCode, 261 verb, resourceKind, resourceNamespace, 262 ).Inc() 263 } 264 265 func (m *MetricsServer) IncRedisRequest(failed bool) { 266 m.redisRequestCounter.WithLabelValues(m.hostname, common.ApplicationController, strconv.FormatBool(failed)).Inc() 267 } 268 269 // ObserveRedisRequestDuration observes redis request duration 270 func (m *MetricsServer) ObserveRedisRequestDuration(duration time.Duration) { 271 m.redisRequestHistogram.WithLabelValues(m.hostname, common.ApplicationController).Observe(duration.Seconds()) 272 } 273 274 // IncReconcile increments the reconcile counter for an application 275 func (m *MetricsServer) IncReconcile(app *argoappv1.Application, duration time.Duration) { 276 m.reconcileHistogram.WithLabelValues(app.Namespace, app.Spec.Destination.Server).Observe(duration.Seconds()) 277 } 278 279 // HasExpiration return true if expiration is set 280 func (m *MetricsServer) HasExpiration() bool { 281 return len(m.cron.Entries()) > 0 282 } 283 284 // SetExpiration reset Prometheus metrics based on time duration interval 285 func (m *MetricsServer) SetExpiration(cacheExpiration time.Duration) error { 286 if m.HasExpiration() { 287 return errors.New("Expiration is already set") 288 } 289 290 _, err := m.cron.AddFunc(fmt.Sprintf("@every %s", cacheExpiration), func() { 291 log.Infof("Reset Prometheus metrics based on existing expiration '%v'", cacheExpiration) 292 m.syncCounter.Reset() 293 m.kubectlExecCounter.Reset() 294 m.kubectlExecPendingGauge.Reset() 295 m.k8sRequestCounter.Reset() 296 m.clusterEventsCounter.Reset() 297 m.redisRequestCounter.Reset() 298 m.reconcileHistogram.Reset() 299 m.redisRequestHistogram.Reset() 300 }) 301 if err != nil { 302 return err 303 } 304 305 m.cron.Start() 306 return nil 307 } 308 309 type appCollector struct { 310 store applister.ApplicationLister 311 appFilter func(obj interface{}) bool 312 appLabels []string 313 } 314 315 // NewAppCollector returns a prometheus collector for application metrics 316 func NewAppCollector(appLister applister.ApplicationLister, appFilter func(obj interface{}) bool, appLabels []string) prometheus.Collector { 317 return &appCollector{ 318 store: appLister, 319 appFilter: appFilter, 320 appLabels: appLabels, 321 } 322 } 323 324 // NewAppRegistry creates a new prometheus registry that collects applications 325 func NewAppRegistry(appLister applister.ApplicationLister, appFilter func(obj interface{}) bool, appLabels []string) *prometheus.Registry { 326 registry := prometheus.NewRegistry() 327 registry.MustRegister(NewAppCollector(appLister, appFilter, appLabels)) 328 return registry 329 } 330 331 // Describe implements the prometheus.Collector interface 332 func (c *appCollector) Describe(ch chan<- *prometheus.Desc) { 333 if len(c.appLabels) > 0 { 334 ch <- descAppLabels 335 } 336 ch <- descAppInfo 337 ch <- descAppSyncStatusCode 338 ch <- descAppHealthStatus 339 } 340 341 // Collect implements the prometheus.Collector interface 342 func (c *appCollector) Collect(ch chan<- prometheus.Metric) { 343 apps, err := c.store.List(labels.NewSelector()) 344 if err != nil { 345 log.Warnf("Failed to collect applications: %v", err) 346 return 347 } 348 for _, app := range apps { 349 if c.appFilter(app) { 350 c.collectApps(ch, app) 351 } 352 } 353 } 354 355 func boolFloat64(b bool) float64 { 356 if b { 357 return 1 358 } 359 return 0 360 } 361 362 func (c *appCollector) collectApps(ch chan<- prometheus.Metric, app *argoappv1.Application) { 363 addConstMetric := func(desc *prometheus.Desc, t prometheus.ValueType, v float64, lv ...string) { 364 project := app.Spec.GetProject() 365 lv = append([]string{app.Namespace, app.Name, project}, lv...) 366 ch <- prometheus.MustNewConstMetric(desc, t, v, lv...) 367 } 368 addGauge := func(desc *prometheus.Desc, v float64, lv ...string) { 369 addConstMetric(desc, prometheus.GaugeValue, v, lv...) 370 } 371 372 var operation string 373 if app.DeletionTimestamp != nil { 374 operation = "delete" 375 } else if app.Operation != nil && app.Operation.Sync != nil { 376 operation = "sync" 377 } 378 syncStatus := app.Status.Sync.Status 379 if syncStatus == "" { 380 syncStatus = argoappv1.SyncStatusCodeUnknown 381 } 382 healthStatus := app.Status.Health.Status 383 if healthStatus == "" { 384 healthStatus = health.HealthStatusUnknown 385 } 386 387 autoSyncEnabled := app.Spec.SyncPolicy != nil && app.Spec.SyncPolicy.Automated != nil 388 389 addGauge(descAppInfo, 1, strconv.FormatBool(autoSyncEnabled), git.NormalizeGitURL(app.Spec.GetSource().RepoURL), app.Spec.Destination.Server, app.Spec.Destination.Namespace, string(syncStatus), string(healthStatus), operation) 390 391 if len(c.appLabels) > 0 { 392 labelValues := []string{} 393 for _, desiredLabel := range c.appLabels { 394 value := app.GetLabels()[desiredLabel] 395 labelValues = append(labelValues, value) 396 } 397 addGauge(descAppLabels, 1, labelValues...) 398 } 399 400 // Deprecated controller metrics 401 if os.Getenv(EnvVarLegacyControllerMetrics) == "true" { 402 addGauge(descAppCreated, float64(app.CreationTimestamp.Unix())) 403 404 addGauge(descAppSyncStatusCode, boolFloat64(syncStatus == argoappv1.SyncStatusCodeSynced), string(argoappv1.SyncStatusCodeSynced)) 405 addGauge(descAppSyncStatusCode, boolFloat64(syncStatus == argoappv1.SyncStatusCodeOutOfSync), string(argoappv1.SyncStatusCodeOutOfSync)) 406 addGauge(descAppSyncStatusCode, boolFloat64(syncStatus == argoappv1.SyncStatusCodeUnknown || syncStatus == ""), string(argoappv1.SyncStatusCodeUnknown)) 407 408 healthStatus := app.Status.Health.Status 409 addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusUnknown || healthStatus == ""), string(health.HealthStatusUnknown)) 410 addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusProgressing), string(health.HealthStatusProgressing)) 411 addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusSuspended), string(health.HealthStatusSuspended)) 412 addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusHealthy), string(health.HealthStatusHealthy)) 413 addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusDegraded), string(health.HealthStatusDegraded)) 414 addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusMissing), string(health.HealthStatusMissing)) 415 } 416 }