github.com/argoproj/argo-cd/v3@v3.2.1/controller/metrics/clustercollector.go (about)

     1  package metrics
     2  
     3  import (
     4  	"context"
     5  	"sync"
     6  	"time"
     7  
     8  	"github.com/argoproj/gitops-engine/pkg/cache"
     9  	"github.com/prometheus/client_golang/prometheus"
    10  	log "github.com/sirupsen/logrus"
    11  
    12  	argoappv1 "github.com/argoproj/argo-cd/v3/pkg/apis/application/v1alpha1"
    13  	metricsutil "github.com/argoproj/argo-cd/v3/util/metrics"
    14  )
    15  
const (
	// metricsCollectionInterval is the period of the ticker that drives the
	// background refresh of the cached cluster data.
	metricsCollectionInterval = 30 * time.Second
	// metricsCollectionTimeout is the deadline applied to the context passed
	// to the cluster lister during a single refresh.
	metricsCollectionTimeout = 10 * time.Second
)
    20  
var (
	// descClusterDefaultLabels holds the label names shared by every cluster
	// metric descriptor declared in this file.
	descClusterDefaultLabels = []string{"server"}

	// descClusterLabels describes the argocd_cluster_labels metric. It is only
	// assigned by NewClusterCollector when cluster labels are configured and
	// stays nil otherwise.
	descClusterLabels *prometheus.Desc

	// descClusterInfo describes a metric labelled with the server, the
	// Kubernetes version and the cluster name.
	descClusterInfo = prometheus.NewDesc(
		"argocd_cluster_info",
		"Information about cluster.",
		append(descClusterDefaultLabels, "k8s_version", "name"),
		nil,
	)
	// descClusterCacheResources describes the per-server count of resource
	// objects held in the cluster cache.
	descClusterCacheResources = prometheus.NewDesc(
		"argocd_cluster_api_resource_objects",
		"Number of k8s resource objects in the cache.",
		descClusterDefaultLabels,
		nil,
	)
	// descClusterAPIs describes the per-server count of monitored Kubernetes
	// API resources.
	descClusterAPIs = prometheus.NewDesc(
		"argocd_cluster_api_resources",
		"Number of monitored kubernetes API resources.",
		descClusterDefaultLabels,
		nil,
	)
	// descClusterCacheAgeSeconds describes the per-server age of the cluster
	// cache, in seconds.
	descClusterCacheAgeSeconds = prometheus.NewDesc(
		"argocd_cluster_cache_age_seconds",
		"Cluster cache age in seconds.",
		descClusterDefaultLabels,
		nil,
	)
	// descClusterConnectionStatus describes the connection status metric,
	// labelled with the server and the Kubernetes version.
	descClusterConnectionStatus = prometheus.NewDesc(
		"argocd_cluster_connection_status",
		"The k8s cluster current connection status.",
		append(descClusterDefaultLabels, "k8s_version"),
		nil,
	)
)
    57  
// HasClustersInfo is implemented by anything that can report the current
// cache.ClusterInfo entries. The collector uses it as the authoritative list
// of clusters to emit metrics for.
type HasClustersInfo interface {
	// GetClustersInfo returns the current cluster info entries.
	GetClustersInfo() []cache.ClusterInfo
}
    61  
// ClusterLister returns the list of configured clusters, or an error if the
// list cannot be retrieved. The supplied context carries the deadline the
// collector applies to each call.
type ClusterLister func(ctx context.Context) (*argoappv1.ClusterList, error)
    63  
// clusterCollector is a prometheus.Collector that exposes per-cluster metrics
// from a periodically refreshed snapshot of cluster data.
type clusterCollector struct {
	// infoSource supplies the cache.ClusterInfo entries to report on.
	infoSource HasClustersInfo
	// lock guards latestInfo.
	lock sync.RWMutex
	// clusterLabels are the cluster label keys exported through the
	// argocd_cluster_labels metric; when empty that metric is not emitted.
	clusterLabels []string
	// clusterLister fetches the cluster definitions that are joined with the
	// entries from infoSource.
	clusterLister ClusterLister

	// latestInfo is the snapshot written by setClusterData and read by Collect.
	latestInfo []*clusterData
}
    72  
// clusterData pairs a cache.ClusterInfo entry with the cluster definition
// that has the same Server value.
type clusterData struct {
	// info is the cache-side view of the cluster.
	info *cache.ClusterInfo
	// cluster is the matching cluster definition; its Name and Labels are
	// used as metric label values.
	cluster *argoappv1.Cluster
}
    77  
    78  func NewClusterCollector(ctx context.Context, source HasClustersInfo, clusterLister ClusterLister, clusterLabels []string) prometheus.Collector {
    79  	if len(clusterLabels) > 0 {
    80  		normalizedClusterLabels := metricsutil.NormalizeLabels("label", clusterLabels)
    81  		descClusterLabels = prometheus.NewDesc(
    82  			"argocd_cluster_labels",
    83  			"Argo Cluster labels converted to Prometheus labels",
    84  			append(append(descClusterDefaultLabels, "name"), normalizedClusterLabels...),
    85  			nil,
    86  		)
    87  	}
    88  
    89  	collector := &clusterCollector{
    90  		infoSource:    source,
    91  		clusterLabels: clusterLabels,
    92  		clusterLister: clusterLister,
    93  		lock:          sync.RWMutex{},
    94  	}
    95  
    96  	collector.setClusterData()
    97  	go collector.run(ctx)
    98  
    99  	return collector
   100  }
   101  
   102  func (c *clusterCollector) run(ctx context.Context) {
   103  	//nolint:staticcheck // FIXME: complains about SA1015
   104  	tick := time.Tick(metricsCollectionInterval)
   105  	for {
   106  		select {
   107  		case <-ctx.Done():
   108  		case <-tick:
   109  			c.setClusterData()
   110  		}
   111  	}
   112  }
   113  
   114  func (c *clusterCollector) setClusterData() {
   115  	if clusterData, err := c.getClusterData(); err == nil {
   116  		c.lock.Lock()
   117  		c.latestInfo = clusterData
   118  		c.lock.Unlock()
   119  	} else {
   120  		log.Warnf("error collecting cluster metrics: %v", err)
   121  	}
   122  }
   123  
   124  func (c *clusterCollector) getClusterData() ([]*clusterData, error) {
   125  	clusterDatas := []*clusterData{}
   126  	clusterInfos := c.infoSource.GetClustersInfo()
   127  
   128  	ctx, cancel := context.WithTimeout(context.Background(), metricsCollectionTimeout)
   129  	defer cancel()
   130  	clusters, err := c.clusterLister(ctx)
   131  	if err != nil {
   132  		return nil, err
   133  	}
   134  
   135  	clusterMap := map[string]*argoappv1.Cluster{}
   136  	for i, cluster := range clusters.Items {
   137  		clusterMap[cluster.Server] = &clusters.Items[i]
   138  	}
   139  
   140  	// Base the cluster data on the ClusterInfo because it only contains the
   141  	// clusters managed by this controller instance
   142  	for i, info := range clusterInfos {
   143  		cluster, ok := clusterMap[info.Server]
   144  		if !ok {
   145  			// This should not happen, but we cannot emit incomplete metrics, so we skip this cluster
   146  			log.WithField("server", info.Server).Warnf("could find cluster for metrics collection")
   147  			continue
   148  		}
   149  		clusterDatas = append(clusterDatas, &clusterData{
   150  			info:    &clusterInfos[i],
   151  			cluster: cluster,
   152  		})
   153  	}
   154  	return clusterDatas, nil
   155  }
   156  
   157  // Describe implements the prometheus.Collector interface
   158  func (c *clusterCollector) Describe(ch chan<- *prometheus.Desc) {
   159  	ch <- descClusterInfo
   160  	ch <- descClusterCacheResources
   161  	ch <- descClusterAPIs
   162  	ch <- descClusterCacheAgeSeconds
   163  	ch <- descClusterConnectionStatus
   164  	if len(c.clusterLabels) > 0 {
   165  		ch <- descClusterLabels
   166  	}
   167  }
   168  
   169  func (c *clusterCollector) Collect(ch chan<- prometheus.Metric) {
   170  	c.lock.RLock()
   171  	latestInfo := c.latestInfo
   172  	c.lock.RUnlock()
   173  
   174  	now := time.Now()
   175  	for _, clusterData := range latestInfo {
   176  		info := clusterData.info
   177  		name := clusterData.cluster.Name
   178  		labels := clusterData.cluster.Labels
   179  
   180  		defaultValues := []string{info.Server}
   181  		ch <- prometheus.MustNewConstMetric(descClusterInfo, prometheus.GaugeValue, 1, append(defaultValues, info.K8SVersion, name)...)
   182  		ch <- prometheus.MustNewConstMetric(descClusterCacheResources, prometheus.GaugeValue, float64(info.ResourcesCount), defaultValues...)
   183  		ch <- prometheus.MustNewConstMetric(descClusterAPIs, prometheus.GaugeValue, float64(info.APIsCount), defaultValues...)
   184  		cacheAgeSeconds := -1
   185  		if info.LastCacheSyncTime != nil {
   186  			cacheAgeSeconds = int(now.Sub(*info.LastCacheSyncTime).Seconds())
   187  		}
   188  		ch <- prometheus.MustNewConstMetric(descClusterCacheAgeSeconds, prometheus.GaugeValue, float64(cacheAgeSeconds), defaultValues...)
   189  		ch <- prometheus.MustNewConstMetric(descClusterConnectionStatus, prometheus.GaugeValue, boolFloat64(info.SyncError == nil), append(defaultValues, info.K8SVersion)...)
   190  
   191  		if len(c.clusterLabels) > 0 && labels != nil {
   192  			labelValues := []string{}
   193  			labelValues = append(labelValues, info.Server, name)
   194  			for _, desiredLabel := range c.clusterLabels {
   195  				value := labels[desiredLabel]
   196  				labelValues = append(labelValues, value)
   197  			}
   198  			ch <- prometheus.MustNewConstMetric(descClusterLabels, prometheus.GaugeValue, 1, labelValues...)
   199  		}
   200  	}
   201  }