github.com/argoproj/argo-cd/v2@v2.10.9/controller/metrics/metrics.go (about)

     1  package metrics
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"net/http"
     8  	"os"
     9  	"regexp"
    10  	"strconv"
    11  	"time"
    12  
    13  	"github.com/argoproj/gitops-engine/pkg/health"
    14  	"github.com/prometheus/client_golang/prometheus"
    15  	"github.com/prometheus/client_golang/prometheus/promhttp"
    16  	"github.com/robfig/cron/v3"
    17  	log "github.com/sirupsen/logrus"
    18  	"k8s.io/apimachinery/pkg/labels"
    19  
    20  	"github.com/argoproj/argo-cd/v2/common"
    21  	argoappv1 "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1"
    22  	applister "github.com/argoproj/argo-cd/v2/pkg/client/listers/application/v1alpha1"
    23  	"github.com/argoproj/argo-cd/v2/util/git"
    24  	"github.com/argoproj/argo-cd/v2/util/healthz"
    25  	"github.com/argoproj/argo-cd/v2/util/profile"
    26  
    27  	ctrl_metrics "sigs.k8s.io/controller-runtime/pkg/metrics"
    28  )
    29  
    30  type MetricsServer struct {
    31  	*http.Server
    32  	syncCounter             *prometheus.CounterVec
    33  	kubectlExecCounter      *prometheus.CounterVec
    34  	kubectlExecPendingGauge *prometheus.GaugeVec
    35  	k8sRequestCounter       *prometheus.CounterVec
    36  	clusterEventsCounter    *prometheus.CounterVec
    37  	redisRequestCounter     *prometheus.CounterVec
    38  	reconcileHistogram      *prometheus.HistogramVec
    39  	redisRequestHistogram   *prometheus.HistogramVec
    40  	registry                *prometheus.Registry
    41  	hostname                string
    42  	cron                    *cron.Cron
    43  }
    44  
    45  const (
    46  	// MetricsPath is the endpoint to collect application metrics
    47  	MetricsPath = "/metrics"
    48  	// EnvVarLegacyControllerMetrics is a env var to re-enable deprecated prometheus metrics
    49  	EnvVarLegacyControllerMetrics = "ARGOCD_LEGACY_CONTROLLER_METRICS"
    50  )
    51  
    52  // Follow Prometheus naming practices
    53  // https://prometheus.io/docs/practices/naming/
    54  var (
    55  	descAppDefaultLabels = []string{"namespace", "name", "project"}
    56  
    57  	descAppLabels *prometheus.Desc
    58  
    59  	descAppInfo = prometheus.NewDesc(
    60  		"argocd_app_info",
    61  		"Information about application.",
    62  		append(descAppDefaultLabels, "autosync_enabled", "repo", "dest_server", "dest_namespace", "sync_status", "health_status", "operation"),
    63  		nil,
    64  	)
    65  	// DEPRECATED
    66  	descAppCreated = prometheus.NewDesc(
    67  		"argocd_app_created_time",
    68  		"Creation time in unix timestamp for an application.",
    69  		descAppDefaultLabels,
    70  		nil,
    71  	)
    72  	// DEPRECATED: superseded by sync_status label in argocd_app_info
    73  	descAppSyncStatusCode = prometheus.NewDesc(
    74  		"argocd_app_sync_status",
    75  		"The application current sync status.",
    76  		append(descAppDefaultLabels, "sync_status"),
    77  		nil,
    78  	)
    79  	// DEPRECATED: superseded by health_status label in argocd_app_info
    80  	descAppHealthStatus = prometheus.NewDesc(
    81  		"argocd_app_health_status",
    82  		"The application current health status.",
    83  		append(descAppDefaultLabels, "health_status"),
    84  		nil,
    85  	)
    86  
    87  	syncCounter = prometheus.NewCounterVec(
    88  		prometheus.CounterOpts{
    89  			Name: "argocd_app_sync_total",
    90  			Help: "Number of application syncs.",
    91  		},
    92  		append(descAppDefaultLabels, "dest_server", "phase"),
    93  	)
    94  
    95  	k8sRequestCounter = prometheus.NewCounterVec(
    96  		prometheus.CounterOpts{
    97  			Name: "argocd_app_k8s_request_total",
    98  			Help: "Number of kubernetes requests executed during application reconciliation.",
    99  		},
   100  		append(descAppDefaultLabels, "server", "response_code", "verb", "resource_kind", "resource_namespace"),
   101  	)
   102  
   103  	kubectlExecCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
   104  		Name: "argocd_kubectl_exec_total",
   105  		Help: "Number of kubectl executions",
   106  	}, []string{"hostname", "command"})
   107  
   108  	kubectlExecPendingGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
   109  		Name: "argocd_kubectl_exec_pending",
   110  		Help: "Number of pending kubectl executions",
   111  	}, []string{"hostname", "command"})
   112  
   113  	reconcileHistogram = prometheus.NewHistogramVec(
   114  		prometheus.HistogramOpts{
   115  			Name: "argocd_app_reconcile",
   116  			Help: "Application reconciliation performance.",
   117  			// Buckets chosen after observing a ~2100ms mean reconcile time
   118  			Buckets: []float64{0.25, .5, 1, 2, 4, 8, 16},
   119  		},
   120  		[]string{"namespace", "dest_server"},
   121  	)
   122  
   123  	clusterEventsCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
   124  		Name: "argocd_cluster_events_total",
   125  		Help: "Number of processes k8s resource events.",
   126  	}, append(descClusterDefaultLabels, "group", "kind"))
   127  
   128  	redisRequestCounter = prometheus.NewCounterVec(
   129  		prometheus.CounterOpts{
   130  			Name: "argocd_redis_request_total",
   131  			Help: "Number of redis requests executed during application reconciliation.",
   132  		},
   133  		[]string{"hostname", "initiator", "failed"},
   134  	)
   135  
   136  	redisRequestHistogram = prometheus.NewHistogramVec(
   137  		prometheus.HistogramOpts{
   138  			Name:    "argocd_redis_request_duration",
   139  			Help:    "Redis requests duration.",
   140  			Buckets: []float64{0.01, 0.05, 0.10, 0.25, .5, 1},
   141  		},
   142  		[]string{"hostname", "initiator"},
   143  	)
   144  )
   145  
   146  // NewMetricsServer returns a new prometheus server which collects application metrics
   147  func NewMetricsServer(addr string, appLister applister.ApplicationLister, appFilter func(obj interface{}) bool, healthCheck func(r *http.Request) error, appLabels []string) (*MetricsServer, error) {
   148  	hostname, err := os.Hostname()
   149  	if err != nil {
   150  		return nil, err
   151  	}
   152  
   153  	if len(appLabels) > 0 {
   154  		normalizedLabels := normalizeLabels("label", appLabels)
   155  		descAppLabels = prometheus.NewDesc(
   156  			"argocd_app_labels",
   157  			"Argo Application labels converted to Prometheus labels",
   158  			append(descAppDefaultLabels, normalizedLabels...),
   159  			nil,
   160  		)
   161  	}
   162  
   163  	mux := http.NewServeMux()
   164  	registry := NewAppRegistry(appLister, appFilter, appLabels)
   165  
   166  	mux.Handle(MetricsPath, promhttp.HandlerFor(prometheus.Gatherers{
   167  		// contains app controller specific metrics
   168  		registry,
   169  		// contains workqueue metrics, process and golang metrics
   170  		ctrl_metrics.Registry,
   171  	}, promhttp.HandlerOpts{}))
   172  	profile.RegisterProfiler(mux)
   173  	healthz.ServeHealthCheck(mux, healthCheck)
   174  
   175  	registry.MustRegister(syncCounter)
   176  	registry.MustRegister(k8sRequestCounter)
   177  	registry.MustRegister(kubectlExecCounter)
   178  	registry.MustRegister(kubectlExecPendingGauge)
   179  	registry.MustRegister(reconcileHistogram)
   180  	registry.MustRegister(clusterEventsCounter)
   181  	registry.MustRegister(redisRequestCounter)
   182  	registry.MustRegister(redisRequestHistogram)
   183  
   184  	return &MetricsServer{
   185  		registry: registry,
   186  		Server: &http.Server{
   187  			Addr:    addr,
   188  			Handler: mux,
   189  		},
   190  		syncCounter:             syncCounter,
   191  		k8sRequestCounter:       k8sRequestCounter,
   192  		kubectlExecCounter:      kubectlExecCounter,
   193  		kubectlExecPendingGauge: kubectlExecPendingGauge,
   194  		reconcileHistogram:      reconcileHistogram,
   195  		clusterEventsCounter:    clusterEventsCounter,
   196  		redisRequestCounter:     redisRequestCounter,
   197  		redisRequestHistogram:   redisRequestHistogram,
   198  		hostname:                hostname,
   199  		// This cron is used to expire the metrics cache.
   200  		// Currently clearing the metrics cache is logging and deleting from the map
   201  		// so there is no possibility of panic, but we will add a chain to keep robfig/cron v1 behavior.
   202  		cron: cron.New(cron.WithChain(cron.Recover(cron.PrintfLogger(log.StandardLogger())))),
   203  	}, nil
   204  }
   205  
   206  // Prometheus invalid labels, more info: https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels.
   207  var invalidPromLabelChars = regexp.MustCompile(`[^a-zA-Z0-9_]`)
   208  
   209  func normalizeLabels(prefix string, appLabels []string) []string {
   210  	results := []string{}
   211  	for _, label := range appLabels {
   212  		//prometheus labels don't accept dash in their name
   213  		curr := invalidPromLabelChars.ReplaceAllString(label, "_")
   214  		result := fmt.Sprintf("%s_%s", prefix, curr)
   215  		results = append(results, result)
   216  	}
   217  	return results
   218  }
   219  
   220  func (m *MetricsServer) RegisterClustersInfoSource(ctx context.Context, source HasClustersInfo) {
   221  	collector := &clusterCollector{infoSource: source}
   222  	go collector.Run(ctx)
   223  	m.registry.MustRegister(collector)
   224  }
   225  
   226  // IncSync increments the sync counter for an application
   227  func (m *MetricsServer) IncSync(app *argoappv1.Application, state *argoappv1.OperationState) {
   228  	if !state.Phase.Completed() {
   229  		return
   230  	}
   231  	m.syncCounter.WithLabelValues(app.Namespace, app.Name, app.Spec.GetProject(), app.Spec.Destination.Server, string(state.Phase)).Inc()
   232  }
   233  
   234  func (m *MetricsServer) IncKubectlExec(command string) {
   235  	m.kubectlExecCounter.WithLabelValues(m.hostname, command).Inc()
   236  }
   237  
   238  func (m *MetricsServer) IncKubectlExecPending(command string) {
   239  	m.kubectlExecPendingGauge.WithLabelValues(m.hostname, command).Inc()
   240  }
   241  
   242  func (m *MetricsServer) DecKubectlExecPending(command string) {
   243  	m.kubectlExecPendingGauge.WithLabelValues(m.hostname, command).Dec()
   244  }
   245  
   246  // IncClusterEventsCount increments the number of cluster events
   247  func (m *MetricsServer) IncClusterEventsCount(server, group, kind string) {
   248  	m.clusterEventsCounter.WithLabelValues(server, group, kind).Inc()
   249  }
   250  
   251  // IncKubernetesRequest increments the kubernetes requests counter for an application
   252  func (m *MetricsServer) IncKubernetesRequest(app *argoappv1.Application, server, statusCode, verb, resourceKind, resourceNamespace string) {
   253  	var namespace, name, project string
   254  	if app != nil {
   255  		namespace = app.Namespace
   256  		name = app.Name
   257  		project = app.Spec.GetProject()
   258  	}
   259  	m.k8sRequestCounter.WithLabelValues(
   260  		namespace, name, project, server, statusCode,
   261  		verb, resourceKind, resourceNamespace,
   262  	).Inc()
   263  }
   264  
   265  func (m *MetricsServer) IncRedisRequest(failed bool) {
   266  	m.redisRequestCounter.WithLabelValues(m.hostname, common.ApplicationController, strconv.FormatBool(failed)).Inc()
   267  }
   268  
   269  // ObserveRedisRequestDuration observes redis request duration
   270  func (m *MetricsServer) ObserveRedisRequestDuration(duration time.Duration) {
   271  	m.redisRequestHistogram.WithLabelValues(m.hostname, common.ApplicationController).Observe(duration.Seconds())
   272  }
   273  
   274  // IncReconcile increments the reconcile counter for an application
   275  func (m *MetricsServer) IncReconcile(app *argoappv1.Application, duration time.Duration) {
   276  	m.reconcileHistogram.WithLabelValues(app.Namespace, app.Spec.Destination.Server).Observe(duration.Seconds())
   277  }
   278  
   279  // HasExpiration return true if expiration is set
   280  func (m *MetricsServer) HasExpiration() bool {
   281  	return len(m.cron.Entries()) > 0
   282  }
   283  
   284  // SetExpiration reset Prometheus metrics based on time duration interval
   285  func (m *MetricsServer) SetExpiration(cacheExpiration time.Duration) error {
   286  	if m.HasExpiration() {
   287  		return errors.New("Expiration is already set")
   288  	}
   289  
   290  	_, err := m.cron.AddFunc(fmt.Sprintf("@every %s", cacheExpiration), func() {
   291  		log.Infof("Reset Prometheus metrics based on existing expiration '%v'", cacheExpiration)
   292  		m.syncCounter.Reset()
   293  		m.kubectlExecCounter.Reset()
   294  		m.kubectlExecPendingGauge.Reset()
   295  		m.k8sRequestCounter.Reset()
   296  		m.clusterEventsCounter.Reset()
   297  		m.redisRequestCounter.Reset()
   298  		m.reconcileHistogram.Reset()
   299  		m.redisRequestHistogram.Reset()
   300  	})
   301  	if err != nil {
   302  		return err
   303  	}
   304  
   305  	m.cron.Start()
   306  	return nil
   307  }
   308  
   309  type appCollector struct {
   310  	store     applister.ApplicationLister
   311  	appFilter func(obj interface{}) bool
   312  	appLabels []string
   313  }
   314  
   315  // NewAppCollector returns a prometheus collector for application metrics
   316  func NewAppCollector(appLister applister.ApplicationLister, appFilter func(obj interface{}) bool, appLabels []string) prometheus.Collector {
   317  	return &appCollector{
   318  		store:     appLister,
   319  		appFilter: appFilter,
   320  		appLabels: appLabels,
   321  	}
   322  }
   323  
   324  // NewAppRegistry creates a new prometheus registry that collects applications
   325  func NewAppRegistry(appLister applister.ApplicationLister, appFilter func(obj interface{}) bool, appLabels []string) *prometheus.Registry {
   326  	registry := prometheus.NewRegistry()
   327  	registry.MustRegister(NewAppCollector(appLister, appFilter, appLabels))
   328  	return registry
   329  }
   330  
   331  // Describe implements the prometheus.Collector interface
   332  func (c *appCollector) Describe(ch chan<- *prometheus.Desc) {
   333  	if len(c.appLabels) > 0 {
   334  		ch <- descAppLabels
   335  	}
   336  	ch <- descAppInfo
   337  	ch <- descAppSyncStatusCode
   338  	ch <- descAppHealthStatus
   339  }
   340  
   341  // Collect implements the prometheus.Collector interface
   342  func (c *appCollector) Collect(ch chan<- prometheus.Metric) {
   343  	apps, err := c.store.List(labels.NewSelector())
   344  	if err != nil {
   345  		log.Warnf("Failed to collect applications: %v", err)
   346  		return
   347  	}
   348  	for _, app := range apps {
   349  		if c.appFilter(app) {
   350  			c.collectApps(ch, app)
   351  		}
   352  	}
   353  }
   354  
   355  func boolFloat64(b bool) float64 {
   356  	if b {
   357  		return 1
   358  	}
   359  	return 0
   360  }
   361  
   362  func (c *appCollector) collectApps(ch chan<- prometheus.Metric, app *argoappv1.Application) {
   363  	addConstMetric := func(desc *prometheus.Desc, t prometheus.ValueType, v float64, lv ...string) {
   364  		project := app.Spec.GetProject()
   365  		lv = append([]string{app.Namespace, app.Name, project}, lv...)
   366  		ch <- prometheus.MustNewConstMetric(desc, t, v, lv...)
   367  	}
   368  	addGauge := func(desc *prometheus.Desc, v float64, lv ...string) {
   369  		addConstMetric(desc, prometheus.GaugeValue, v, lv...)
   370  	}
   371  
   372  	var operation string
   373  	if app.DeletionTimestamp != nil {
   374  		operation = "delete"
   375  	} else if app.Operation != nil && app.Operation.Sync != nil {
   376  		operation = "sync"
   377  	}
   378  	syncStatus := app.Status.Sync.Status
   379  	if syncStatus == "" {
   380  		syncStatus = argoappv1.SyncStatusCodeUnknown
   381  	}
   382  	healthStatus := app.Status.Health.Status
   383  	if healthStatus == "" {
   384  		healthStatus = health.HealthStatusUnknown
   385  	}
   386  
   387  	autoSyncEnabled := app.Spec.SyncPolicy != nil && app.Spec.SyncPolicy.Automated != nil
   388  
   389  	addGauge(descAppInfo, 1, strconv.FormatBool(autoSyncEnabled), git.NormalizeGitURL(app.Spec.GetSource().RepoURL), app.Spec.Destination.Server, app.Spec.Destination.Namespace, string(syncStatus), string(healthStatus), operation)
   390  
   391  	if len(c.appLabels) > 0 {
   392  		labelValues := []string{}
   393  		for _, desiredLabel := range c.appLabels {
   394  			value := app.GetLabels()[desiredLabel]
   395  			labelValues = append(labelValues, value)
   396  		}
   397  		addGauge(descAppLabels, 1, labelValues...)
   398  	}
   399  
   400  	// Deprecated controller metrics
   401  	if os.Getenv(EnvVarLegacyControllerMetrics) == "true" {
   402  		addGauge(descAppCreated, float64(app.CreationTimestamp.Unix()))
   403  
   404  		addGauge(descAppSyncStatusCode, boolFloat64(syncStatus == argoappv1.SyncStatusCodeSynced), string(argoappv1.SyncStatusCodeSynced))
   405  		addGauge(descAppSyncStatusCode, boolFloat64(syncStatus == argoappv1.SyncStatusCodeOutOfSync), string(argoappv1.SyncStatusCodeOutOfSync))
   406  		addGauge(descAppSyncStatusCode, boolFloat64(syncStatus == argoappv1.SyncStatusCodeUnknown || syncStatus == ""), string(argoappv1.SyncStatusCodeUnknown))
   407  
   408  		healthStatus := app.Status.Health.Status
   409  		addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusUnknown || healthStatus == ""), string(health.HealthStatusUnknown))
   410  		addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusProgressing), string(health.HealthStatusProgressing))
   411  		addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusSuspended), string(health.HealthStatusSuspended))
   412  		addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusHealthy), string(health.HealthStatusHealthy))
   413  		addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusDegraded), string(health.HealthStatusDegraded))
   414  		addGauge(descAppHealthStatus, boolFloat64(healthStatus == health.HealthStatusMissing), string(health.HealthStatusMissing))
   415  	}
   416  }