github.com/argoproj/argo-cd/v3@v3.2.1/controller/metrics/metrics.go (about)

     1  package metrics
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"net/http"
     8  	"os"
     9  	"slices"
    10  	"strconv"
    11  	"time"
    12  
    13  	"github.com/argoproj/gitops-engine/pkg/health"
    14  	"github.com/prometheus/client_golang/prometheus"
    15  	"github.com/prometheus/client_golang/prometheus/promhttp"
    16  	"github.com/robfig/cron/v3"
    17  	log "github.com/sirupsen/logrus"
    18  	"k8s.io/apimachinery/pkg/labels"
    19  	ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
    20  
    21  	"github.com/argoproj/argo-cd/v3/common"
    22  	argoappv1 "github.com/argoproj/argo-cd/v3/pkg/apis/application/v1alpha1"
    23  	applister "github.com/argoproj/argo-cd/v3/pkg/client/listers/application/v1alpha1"
    24  	"github.com/argoproj/argo-cd/v3/util/argo"
    25  	"github.com/argoproj/argo-cd/v3/util/db"
    26  	"github.com/argoproj/argo-cd/v3/util/git"
    27  	"github.com/argoproj/argo-cd/v3/util/healthz"
    28  	metricsutil "github.com/argoproj/argo-cd/v3/util/metrics"
    29  	"github.com/argoproj/argo-cd/v3/util/metrics/kubectl"
    30  	"github.com/argoproj/argo-cd/v3/util/profile"
    31  )
    32  
// MetricsServer serves application-controller Prometheus metrics over HTTP.
// It embeds *http.Server (so Addr/Handler/ListenAndServe come from there) and
// holds the metric vectors that the controller updates at runtime, plus a cron
// used by SetExpiration to periodically reset them.
type MetricsServer struct {
	*http.Server
	syncCounter                       *prometheus.CounterVec   // argocd_app_sync_total
	syncDuration                      *prometheus.CounterVec   // argocd_app_sync_duration_seconds_total
	kubectlExecCounter                *prometheus.CounterVec   // argocd_kubectl_exec_total
	kubectlExecPendingGauge           *prometheus.GaugeVec     // argocd_kubectl_exec_pending
	orphanedResourcesGauge            *prometheus.GaugeVec     // argocd_app_orphaned_resources_count
	k8sRequestCounter                 *prometheus.CounterVec   // argocd_app_k8s_request_total
	clusterEventsCounter              *prometheus.CounterVec   // argocd_cluster_events_total
	redisRequestCounter               *prometheus.CounterVec   // argocd_redis_request_total
	reconcileHistogram                *prometheus.HistogramVec // argocd_app_reconcile
	redisRequestHistogram             *prometheus.HistogramVec // argocd_redis_request_duration
	resourceEventsProcessingHistogram *prometheus.HistogramVec // argocd_resource_events_processing
	resourceEventsNumberGauge         *prometheus.GaugeVec     // argocd_resource_events_processed_in_batch
	registry                          *prometheus.Registry     // app-controller-specific registry served at MetricsPath
	hostname                          string                   // this controller's hostname, used as a metric label
	// cron drives periodic metric resets; see SetExpiration/HasExpiration.
	cron *cron.Cron
}
    51  
const (
	// MetricsPath is the HTTP endpoint path at which application metrics are exposed.
	MetricsPath = "/metrics"
)
    56  
    57  // Follow Prometheus naming practices
    58  // https://prometheus.io/docs/practices/naming/
    59  var (
    60  	descAppDefaultLabels = []string{"namespace", "name", "project"}
    61  
    62  	descAppLabels     *prometheus.Desc
    63  	descAppConditions *prometheus.Desc
    64  
    65  	descAppInfo = prometheus.NewDesc(
    66  		"argocd_app_info",
    67  		"Information about application.",
    68  		append(descAppDefaultLabels, "autosync_enabled", "repo", "dest_server", "dest_namespace", "sync_status", "health_status", "operation"),
    69  		nil,
    70  	)
    71  
    72  	syncCounter = prometheus.NewCounterVec(
    73  		prometheus.CounterOpts{
    74  			Name: "argocd_app_sync_total",
    75  			Help: "Number of application syncs.",
    76  		},
    77  		append(descAppDefaultLabels, "dest_server", "phase", "dry_run"),
    78  	)
    79  
    80  	syncDuration = prometheus.NewCounterVec(
    81  		prometheus.CounterOpts{
    82  			Name: "argocd_app_sync_duration_seconds_total",
    83  			Help: "Application sync performance in seconds total.",
    84  		},
    85  		append(descAppDefaultLabels, "dest_server"),
    86  	)
    87  
    88  	k8sRequestCounter = prometheus.NewCounterVec(
    89  		prometheus.CounterOpts{
    90  			Name: "argocd_app_k8s_request_total",
    91  			Help: "Number of kubernetes requests executed during application reconciliation.",
    92  		},
    93  		append(descAppDefaultLabels, "server", "response_code", "verb", "resource_kind", "resource_namespace", "dry_run"),
    94  	)
    95  
    96  	kubectlExecCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
    97  		Name: "argocd_kubectl_exec_total",
    98  		Help: "Number of kubectl executions",
    99  	}, []string{"hostname", "command"})
   100  
   101  	kubectlExecPendingGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
   102  		Name: "argocd_kubectl_exec_pending",
   103  		Help: "Number of pending kubectl executions",
   104  	}, []string{"hostname", "command"})
   105  
   106  	reconcileHistogram = prometheus.NewHistogramVec(
   107  		prometheus.HistogramOpts{
   108  			Name: "argocd_app_reconcile",
   109  			Help: "Application reconciliation performance in seconds.",
   110  			// Buckets chosen after observing a ~2100ms mean reconcile time
   111  			Buckets: []float64{0.25, .5, 1, 2, 4, 8, 16},
   112  		},
   113  		[]string{"namespace", "dest_server"},
   114  	)
   115  
   116  	clusterEventsCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
   117  		Name: "argocd_cluster_events_total",
   118  		Help: "Number of processes k8s resource events.",
   119  	}, append(descClusterDefaultLabels, "group", "kind"))
   120  
   121  	redisRequestCounter = prometheus.NewCounterVec(
   122  		prometheus.CounterOpts{
   123  			Name: "argocd_redis_request_total",
   124  			Help: "Number of redis requests executed during application reconciliation.",
   125  		},
   126  		[]string{"hostname", "initiator", "failed"},
   127  	)
   128  
   129  	redisRequestHistogram = prometheus.NewHistogramVec(
   130  		prometheus.HistogramOpts{
   131  			Name:    "argocd_redis_request_duration",
   132  			Help:    "Redis requests duration.",
   133  			Buckets: []float64{0.01, 0.05, 0.10, 0.25, .5, 1},
   134  		},
   135  		[]string{"hostname", "initiator"},
   136  	)
   137  
   138  	orphanedResourcesGauge = prometheus.NewGaugeVec(
   139  		prometheus.GaugeOpts{
   140  			Name: "argocd_app_orphaned_resources_count",
   141  			Help: "Number of orphaned resources per application",
   142  		},
   143  		descAppDefaultLabels,
   144  	)
   145  
   146  	resourceEventsProcessingHistogram = prometheus.NewHistogramVec(
   147  		prometheus.HistogramOpts{
   148  			Name:    "argocd_resource_events_processing",
   149  			Help:    "Time to process resource events in seconds.",
   150  			Buckets: []float64{0.25, .5, 1, 2, 4, 8, 16},
   151  		},
   152  		[]string{"server"},
   153  	)
   154  
   155  	resourceEventsNumberGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
   156  		Name: "argocd_resource_events_processed_in_batch",
   157  		Help: "Number of resource events processed in batch",
   158  	}, []string{"server"})
   159  )
   160  
   161  // NewMetricsServer returns a new prometheus server which collects application metrics
   162  func NewMetricsServer(addr string, appLister applister.ApplicationLister, appFilter func(obj any) bool, healthCheck func(r *http.Request) error, appLabels []string, appConditions []string, db db.ArgoDB) (*MetricsServer, error) {
   163  	hostname, err := os.Hostname()
   164  	if err != nil {
   165  		return nil, err
   166  	}
   167  
   168  	if len(appLabels) > 0 {
   169  		normalizedLabels := metricsutil.NormalizeLabels("label", appLabels)
   170  		descAppLabels = prometheus.NewDesc(
   171  			"argocd_app_labels",
   172  			"Argo Application labels converted to Prometheus labels",
   173  			append(descAppDefaultLabels, normalizedLabels...),
   174  			nil,
   175  		)
   176  	}
   177  
   178  	if len(appConditions) > 0 {
   179  		descAppConditions = prometheus.NewDesc(
   180  			"argocd_app_condition",
   181  			"Report application conditions.",
   182  			append(descAppDefaultLabels, "condition"),
   183  			nil,
   184  		)
   185  	}
   186  
   187  	mux := http.NewServeMux()
   188  	registry := NewAppRegistry(appLister, appFilter, appLabels, appConditions, db)
   189  
   190  	mux.Handle(MetricsPath, promhttp.HandlerFor(prometheus.Gatherers{
   191  		// contains app controller specific metrics
   192  		registry,
   193  		// contains workqueue metrics, process and golang metrics
   194  		ctrlmetrics.Registry,
   195  	}, promhttp.HandlerOpts{}))
   196  	profile.RegisterProfiler(mux)
   197  	healthz.ServeHealthCheck(mux, healthCheck)
   198  
   199  	registry.MustRegister(syncCounter)
   200  	registry.MustRegister(syncDuration)
   201  	registry.MustRegister(k8sRequestCounter)
   202  	registry.MustRegister(kubectlExecCounter)
   203  	registry.MustRegister(kubectlExecPendingGauge)
   204  	registry.MustRegister(orphanedResourcesGauge)
   205  	registry.MustRegister(reconcileHistogram)
   206  	registry.MustRegister(clusterEventsCounter)
   207  	registry.MustRegister(redisRequestCounter)
   208  	registry.MustRegister(redisRequestHistogram)
   209  	registry.MustRegister(resourceEventsProcessingHistogram)
   210  	registry.MustRegister(resourceEventsNumberGauge)
   211  
   212  	kubectl.RegisterWithClientGo()
   213  	kubectl.RegisterWithPrometheus(registry)
   214  
   215  	metricsServer := &MetricsServer{
   216  		registry: registry,
   217  		Server: &http.Server{
   218  			Addr:    addr,
   219  			Handler: mux,
   220  		},
   221  		syncCounter:                       syncCounter,
   222  		syncDuration:                      syncDuration,
   223  		k8sRequestCounter:                 k8sRequestCounter,
   224  		kubectlExecCounter:                kubectlExecCounter,
   225  		kubectlExecPendingGauge:           kubectlExecPendingGauge,
   226  		orphanedResourcesGauge:            orphanedResourcesGauge,
   227  		reconcileHistogram:                reconcileHistogram,
   228  		clusterEventsCounter:              clusterEventsCounter,
   229  		redisRequestCounter:               redisRequestCounter,
   230  		redisRequestHistogram:             redisRequestHistogram,
   231  		resourceEventsProcessingHistogram: resourceEventsProcessingHistogram,
   232  		resourceEventsNumberGauge:         resourceEventsNumberGauge,
   233  		hostname:                          hostname,
   234  		// This cron is used to expire the metrics cache.
   235  		// Currently clearing the metrics cache is logging and deleting from the map
   236  		// so there is no possibility of panic, but we will add a chain to keep robfig/cron v1 behavior.
   237  		cron: cron.New(cron.WithChain(cron.Recover(cron.PrintfLogger(log.StandardLogger())))),
   238  	}
   239  
   240  	return metricsServer, nil
   241  }
   242  
   243  func (m *MetricsServer) RegisterClustersInfoSource(ctx context.Context, source HasClustersInfo, db db.ArgoDB, clusterLabels []string) {
   244  	collector := NewClusterCollector(ctx, source, db.ListClusters, clusterLabels)
   245  	m.registry.MustRegister(collector)
   246  }
   247  
   248  // IncSync increments the sync counter for an application
   249  func (m *MetricsServer) IncSync(app *argoappv1.Application, destServer string, state *argoappv1.OperationState) {
   250  	if !state.Phase.Completed() {
   251  		return
   252  	}
   253  	isDryRun := app.Operation != nil && app.Operation.DryRun()
   254  	m.syncCounter.WithLabelValues(app.Namespace, app.Name, app.Spec.GetProject(), destServer, string(state.Phase), strconv.FormatBool(isDryRun)).Inc()
   255  }
   256  
   257  // IncAppSyncDuration observes app sync duration
   258  func (m *MetricsServer) IncAppSyncDuration(app *argoappv1.Application, destServer string, state *argoappv1.OperationState) {
   259  	if state.FinishedAt != nil {
   260  		m.syncDuration.WithLabelValues(app.Namespace, app.Name, app.Spec.GetProject(), destServer).
   261  			Add(float64(time.Duration(state.FinishedAt.Unix() - state.StartedAt.Unix())))
   262  	}
   263  }
   264  
   265  func (m *MetricsServer) IncKubectlExec(command string) {
   266  	m.kubectlExecCounter.WithLabelValues(m.hostname, command).Inc()
   267  }
   268  
   269  func (m *MetricsServer) IncKubectlExecPending(command string) {
   270  	m.kubectlExecPendingGauge.WithLabelValues(m.hostname, command).Inc()
   271  }
   272  
   273  func (m *MetricsServer) DecKubectlExecPending(command string) {
   274  	m.kubectlExecPendingGauge.WithLabelValues(m.hostname, command).Dec()
   275  }
   276  
   277  func (m *MetricsServer) SetOrphanedResourcesMetric(app *argoappv1.Application, numOrphanedResources int) {
   278  	m.orphanedResourcesGauge.WithLabelValues(app.Namespace, app.Name, app.Spec.GetProject()).Set(float64(numOrphanedResources))
   279  }
   280  
   281  // IncClusterEventsCount increments the number of cluster events
   282  func (m *MetricsServer) IncClusterEventsCount(server, group, kind string) {
   283  	m.clusterEventsCounter.WithLabelValues(server, group, kind).Inc()
   284  }
   285  
   286  // IncKubernetesRequest increments the kubernetes requests counter for an application
   287  func (m *MetricsServer) IncKubernetesRequest(app *argoappv1.Application, server, statusCode, verb, resourceKind, resourceNamespace string) {
   288  	var namespace, name, project string
   289  	isDryRun := false
   290  	if app != nil {
   291  		namespace = app.Namespace
   292  		name = app.Name
   293  		project = app.Spec.GetProject()
   294  		isDryRun = app.Operation != nil && app.Operation.DryRun()
   295  	}
   296  	m.k8sRequestCounter.WithLabelValues(
   297  		namespace, name, project, server, statusCode,
   298  		verb, resourceKind, resourceNamespace, strconv.FormatBool(isDryRun),
   299  	).Inc()
   300  }
   301  
   302  func (m *MetricsServer) IncRedisRequest(failed bool) {
   303  	m.redisRequestCounter.WithLabelValues(m.hostname, common.ApplicationController, strconv.FormatBool(failed)).Inc()
   304  }
   305  
   306  // ObserveRedisRequestDuration observes redis request duration
   307  func (m *MetricsServer) ObserveRedisRequestDuration(duration time.Duration) {
   308  	m.redisRequestHistogram.WithLabelValues(m.hostname, common.ApplicationController).Observe(duration.Seconds())
   309  }
   310  
   311  // ObserveResourceEventsProcessingDuration observes resource events processing duration
   312  func (m *MetricsServer) ObserveResourceEventsProcessingDuration(server string, duration time.Duration, processedEventsNumber int) {
   313  	m.resourceEventsProcessingHistogram.WithLabelValues(server).Observe(duration.Seconds())
   314  	m.resourceEventsNumberGauge.WithLabelValues(server).Set(float64(processedEventsNumber))
   315  }
   316  
// IncReconcile records the duration of one application reconciliation in the
// reconcile histogram, labeled by the app's namespace and destination server.
// NOTE(review): despite the "Inc" name, this observes a histogram sample
// rather than incrementing a counter.
func (m *MetricsServer) IncReconcile(app *argoappv1.Application, destServer string, duration time.Duration) {
	m.reconcileHistogram.WithLabelValues(app.Namespace, destServer).Observe(duration.Seconds())
}
   321  
   322  // HasExpiration return true if expiration is set
   323  func (m *MetricsServer) HasExpiration() bool {
   324  	return len(m.cron.Entries()) > 0
   325  }
   326  
   327  // SetExpiration reset Prometheus metrics based on time duration interval
   328  func (m *MetricsServer) SetExpiration(cacheExpiration time.Duration) error {
   329  	if m.HasExpiration() {
   330  		return errors.New("expiration is already set")
   331  	}
   332  
   333  	_, err := m.cron.AddFunc(fmt.Sprintf("@every %s", cacheExpiration), func() {
   334  		log.Infof("Reset Prometheus metrics based on existing expiration '%v'", cacheExpiration)
   335  		m.syncCounter.Reset()
   336  		m.syncDuration.Reset()
   337  		m.kubectlExecCounter.Reset()
   338  		m.kubectlExecPendingGauge.Reset()
   339  		m.orphanedResourcesGauge.Reset()
   340  		m.k8sRequestCounter.Reset()
   341  		m.clusterEventsCounter.Reset()
   342  		m.redisRequestCounter.Reset()
   343  		m.reconcileHistogram.Reset()
   344  		m.redisRequestHistogram.Reset()
   345  		m.resourceEventsProcessingHistogram.Reset()
   346  		m.resourceEventsNumberGauge.Reset()
   347  		kubectl.ResetAll()
   348  	})
   349  	if err != nil {
   350  		return err
   351  	}
   352  
   353  	m.cron.Start()
   354  	return nil
   355  }
   356  
// appCollector is a prometheus.Collector that, on every scrape, lists all
// applications from the lister and emits per-application gauges
// (argocd_app_info and, if configured, argocd_app_labels / argocd_app_condition).
type appCollector struct {
	store         applister.ApplicationLister // source of applications at scrape time
	appFilter     func(obj any) bool          // apps for which this returns false are skipped
	appLabels     []string                    // app label keys to surface as metric labels (may be empty)
	appConditions []string                    // condition types to count per app (may be empty)
	db            db.ArgoDB                   // used to resolve each app's destination cluster
}
   364  
   365  // NewAppCollector returns a prometheus collector for application metrics
   366  func NewAppCollector(appLister applister.ApplicationLister, appFilter func(obj any) bool, appLabels []string, appConditions []string, db db.ArgoDB) prometheus.Collector {
   367  	return &appCollector{
   368  		store:         appLister,
   369  		appFilter:     appFilter,
   370  		appLabels:     appLabels,
   371  		appConditions: appConditions,
   372  		db:            db,
   373  	}
   374  }
   375  
   376  // NewAppRegistry creates a new prometheus registry that collects applications
   377  func NewAppRegistry(appLister applister.ApplicationLister, appFilter func(obj any) bool, appLabels []string, appConditions []string, db db.ArgoDB) *prometheus.Registry {
   378  	registry := prometheus.NewRegistry()
   379  	registry.MustRegister(NewAppCollector(appLister, appFilter, appLabels, appConditions, db))
   380  	return registry
   381  }
   382  
   383  // Describe implements the prometheus.Collector interface
   384  func (c *appCollector) Describe(ch chan<- *prometheus.Desc) {
   385  	if len(c.appLabels) > 0 {
   386  		ch <- descAppLabels
   387  	}
   388  	if len(c.appConditions) > 0 {
   389  		ch <- descAppConditions
   390  	}
   391  	ch <- descAppInfo
   392  }
   393  
   394  // Collect implements the prometheus.Collector interface
   395  func (c *appCollector) Collect(ch chan<- prometheus.Metric) {
   396  	apps, err := c.store.List(labels.NewSelector())
   397  	if err != nil {
   398  		log.Warnf("Failed to collect applications: %v", err)
   399  		return
   400  	}
   401  	for _, app := range apps {
   402  		if !c.appFilter(app) {
   403  			continue
   404  		}
   405  		destCluster, err := argo.GetDestinationCluster(context.Background(), app.Spec.Destination, c.db)
   406  		if err != nil {
   407  			log.Warnf("Failed to get destination cluster for application %s: %v", app.Name, err)
   408  		}
   409  		destServer := ""
   410  		if destCluster != nil {
   411  			destServer = destCluster.Server
   412  		}
   413  		c.collectApps(ch, app, destServer)
   414  	}
   415  }
   416  
   417  func boolFloat64(b bool) float64 {
   418  	if b {
   419  		return 1
   420  	}
   421  	return 0
   422  }
   423  
   424  func (c *appCollector) collectApps(ch chan<- prometheus.Metric, app *argoappv1.Application, destServer string) {
   425  	addConstMetric := func(desc *prometheus.Desc, t prometheus.ValueType, v float64, lv ...string) {
   426  		project := app.Spec.GetProject()
   427  		lv = append([]string{app.Namespace, app.Name, project}, lv...)
   428  		ch <- prometheus.MustNewConstMetric(desc, t, v, lv...)
   429  	}
   430  	addGauge := func(desc *prometheus.Desc, v float64, lv ...string) {
   431  		addConstMetric(desc, prometheus.GaugeValue, v, lv...)
   432  	}
   433  
   434  	var operation string
   435  	if app.DeletionTimestamp != nil {
   436  		operation = "delete"
   437  	} else if app.Operation != nil && app.Operation.Sync != nil {
   438  		operation = "sync"
   439  	}
   440  	syncStatus := app.Status.Sync.Status
   441  	if syncStatus == "" {
   442  		syncStatus = argoappv1.SyncStatusCodeUnknown
   443  	}
   444  	healthStatus := app.Status.Health.Status
   445  	if healthStatus == "" {
   446  		healthStatus = health.HealthStatusUnknown
   447  	}
   448  
   449  	autoSyncEnabled := app.Spec.SyncPolicy != nil && app.Spec.SyncPolicy.IsAutomatedSyncEnabled()
   450  
   451  	addGauge(descAppInfo, 1, strconv.FormatBool(autoSyncEnabled), git.NormalizeGitURL(app.Spec.GetSource().RepoURL), destServer, app.Spec.Destination.Namespace, string(syncStatus), string(healthStatus), operation)
   452  
   453  	if len(c.appLabels) > 0 {
   454  		labelValues := []string{}
   455  		for _, desiredLabel := range c.appLabels {
   456  			value := app.GetLabels()[desiredLabel]
   457  			labelValues = append(labelValues, value)
   458  		}
   459  		addGauge(descAppLabels, 1, labelValues...)
   460  	}
   461  
   462  	if len(c.appConditions) > 0 {
   463  		conditionCount := make(map[string]int)
   464  		for _, condition := range app.Status.Conditions {
   465  			if slices.Contains(c.appConditions, condition.Type) {
   466  				conditionCount[condition.Type]++
   467  			}
   468  		}
   469  
   470  		for conditionType, count := range conditionCount {
   471  			addGauge(descAppConditions, float64(count), conditionType)
   472  		}
   473  	}
   474  }