github.com/cilium/cilium@v1.16.2/pkg/ipam/metrics/metrics.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package metrics
     5  
     6  import (
     7  	"github.com/prometheus/client_golang/prometheus"
     8  
     9  	"github.com/cilium/cilium/operator/metrics"
    10  	"github.com/cilium/cilium/pkg/time"
    11  	"github.com/cilium/cilium/pkg/trigger"
    12  )
    13  
// ipamSubsystem is the Prometheus subsystem name set on every metric created
// in this file.
const ipamSubsystem = "ipam"
    15  
// prometheusMetrics holds the Prometheus collectors used to report IPAM
// metrics, the registry they are registered with, and the per-trigger metric
// sets. Instances are created by NewPrometheusMetrics.
type prometheusMetrics struct {
	registry             metrics.RegisterGatherer
	Allocation           *prometheus.HistogramVec
	Release              *prometheus.HistogramVec
	AllocateInterfaceOps *prometheus.CounterVec
	AllocateIpOps        *prometheus.CounterVec
	ReleaseIpOps         *prometheus.CounterVec
	AvailableIPs         *prometheus.GaugeVec
	UsedIPs              *prometheus.GaugeVec
	NeededIPs            *prometheus.GaugeVec
	// IPsAllocated is deprecated and scheduled for removal in version 1.15.
	// Use AvailableIPs, UsedIPs and NeededIPs instead.
	// NOTE(review): confirm whether the stated removal version is still accurate.
	IPsAllocated *prometheus.GaugeVec
	// AvailableInterfaces (the field directly below) is deprecated and
	// scheduled for removal in version 1.14. Use InterfaceCandidates and
	// EmptyInterfaceSlots instead. The deprecation notice applies to
	// AvailableInterfaces only, not to the other fields in this group.
	// NOTE(review): confirm whether the stated removal version is still accurate.
	AvailableInterfaces   prometheus.Gauge
	InterfaceCandidates   prometheus.Gauge
	EmptyInterfaceSlots   prometheus.Gauge
	AvailableIPsPerSubnet *prometheus.GaugeVec
	Nodes                 *prometheus.GaugeVec
	Resync                prometheus.Counter
	poolMaintainer        *triggerMetrics
	k8sSync               *triggerMetrics
	resync                *triggerMetrics
}
    41  
// LabelTargetNodeName is the label name carried by the per-node gauges
// (available_ips, used_ips and needed_ips); its value is the node the sample
// refers to.
const LabelTargetNodeName = "target_node"
    43  
    44  // NewPrometheusMetrics returns a new interface metrics implementation backed by
    45  // Prometheus metrics.
    46  func NewPrometheusMetrics(namespace string, registry metrics.RegisterGatherer) *prometheusMetrics {
    47  	m := &prometheusMetrics{
    48  		registry: registry,
    49  	}
    50  
    51  	m.AvailableIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
    52  		Namespace: namespace,
    53  		Subsystem: ipamSubsystem,
    54  		Name:      "available_ips",
    55  		Help:      "Total available IPs on Node for IPAM allocation",
    56  	}, []string{LabelTargetNodeName})
    57  
    58  	m.UsedIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
    59  		Namespace: namespace,
    60  		Subsystem: ipamSubsystem,
    61  		Name:      "used_ips",
    62  		Help:      "Total used IPs on Node for IPAM allocation",
    63  	}, []string{LabelTargetNodeName})
    64  
    65  	m.NeededIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{
    66  		Namespace: namespace,
    67  		Subsystem: ipamSubsystem,
    68  		Name:      "needed_ips",
    69  		Help:      "Number of IPs that are needed on the Node to satisfy IPAM allocation requests",
    70  	}, []string{LabelTargetNodeName})
    71  
    72  	m.IPsAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{
    73  		Namespace: namespace,
    74  		Subsystem: ipamSubsystem,
    75  		Name:      "ips",
    76  		Help:      "Number of IPs allocated",
    77  	}, []string{"type"})
    78  
    79  	m.AllocateIpOps = prometheus.NewCounterVec(prometheus.CounterOpts{
    80  		Namespace: namespace,
    81  		Subsystem: ipamSubsystem,
    82  		Name:      "ip_allocation_ops",
    83  		Help:      "Number of IP allocation operations",
    84  	}, []string{"subnet_id"})
    85  
    86  	m.ReleaseIpOps = prometheus.NewCounterVec(prometheus.CounterOpts{
    87  		Namespace: namespace,
    88  		Subsystem: ipamSubsystem,
    89  		Name:      "ip_release_ops",
    90  		Help:      "Number of IP release operations",
    91  	}, []string{"subnet_id"})
    92  
    93  	m.AllocateInterfaceOps = prometheus.NewCounterVec(prometheus.CounterOpts{
    94  		Namespace: namespace,
    95  		Subsystem: ipamSubsystem,
    96  		Name:      "interface_creation_ops",
    97  		Help:      "Number of interfaces allocated",
    98  	}, []string{"subnet_id"})
    99  
   100  	m.AvailableInterfaces = prometheus.NewGauge(prometheus.GaugeOpts{
   101  		Namespace: namespace,
   102  		Subsystem: ipamSubsystem,
   103  		Name:      "available_interfaces",
   104  		Help:      "Number of interfaces with addresses available",
   105  	})
   106  
   107  	m.InterfaceCandidates = prometheus.NewGauge(prometheus.GaugeOpts{
   108  		Namespace: namespace,
   109  		Subsystem: ipamSubsystem,
   110  		Name:      "interface_candidates",
   111  		Help:      "Number of attached interfaces with IPs available for allocation",
   112  	})
   113  
   114  	m.EmptyInterfaceSlots = prometheus.NewGauge(prometheus.GaugeOpts{
   115  		Namespace: namespace,
   116  		Subsystem: ipamSubsystem,
   117  		Name:      "empty_interface_slots",
   118  		Help:      "Number of empty interface slots available for interfaces to be attached",
   119  	})
   120  
   121  	m.AvailableIPsPerSubnet = prometheus.NewGaugeVec(prometheus.GaugeOpts{
   122  		Namespace: namespace,
   123  		Subsystem: ipamSubsystem,
   124  		Name:      "available_ips_per_subnet",
   125  		Help:      "Number of available IPs per subnet ID",
   126  	}, []string{"subnet_id", "availability_zone"})
   127  
   128  	m.Nodes = prometheus.NewGaugeVec(prometheus.GaugeOpts{
   129  		Namespace: namespace,
   130  		Subsystem: ipamSubsystem,
   131  		Name:      "nodes",
   132  		Help:      "Number of nodes by category { total | in-deficit | at-capacity }",
   133  	}, []string{"category"})
   134  
   135  	m.Resync = prometheus.NewCounter(prometheus.CounterOpts{
   136  		Namespace: namespace,
   137  		Subsystem: ipamSubsystem,
   138  		Name:      "resync_total",
   139  		Help:      "Number of resync operations to synchronize and resolve IP deficit of nodes",
   140  	})
   141  
   142  	m.Allocation = prometheus.NewHistogramVec(prometheus.HistogramOpts{
   143  		Namespace: namespace,
   144  		Subsystem: ipamSubsystem,
   145  		Name:      "allocation_duration_seconds",
   146  		Help:      "Allocation ip or interface latency in seconds",
   147  		Buckets: merge(
   148  			prometheus.LinearBuckets(0.25, 0.25, 2), // 0.25s, 0.50s
   149  			prometheus.LinearBuckets(1, 1, 60),      // 1s, 2s, 3s, ... 60s,
   150  		),
   151  	}, []string{"type", "status", "subnet_id"})
   152  
   153  	m.Release = prometheus.NewHistogramVec(prometheus.HistogramOpts{
   154  		Namespace: namespace,
   155  		Subsystem: ipamSubsystem,
   156  		Name:      "release_duration_seconds",
   157  		Help:      "Release ip or interface latency in seconds",
   158  		Buckets: merge(
   159  			prometheus.LinearBuckets(0.25, 0.25, 2), // 0.25s, 0.50s
   160  			prometheus.LinearBuckets(1, 1, 60),      // 1s, 2s, 3s, ... 60s,
   161  		),
   162  	}, []string{"type", "status", "subnet_id"})
   163  
   164  	// pool_maintainer is a more generic name, but for backward compatibility
   165  	// of dashboard, keep the metric name deficit_resolver unchanged
   166  	m.poolMaintainer = NewTriggerMetrics(namespace, "deficit_resolver")
   167  	m.k8sSync = NewTriggerMetrics(namespace, "k8s_sync")
   168  	m.resync = NewTriggerMetrics(namespace, "resync")
   169  
   170  	registry.MustRegister(m.AvailableIPs)
   171  	registry.MustRegister(m.UsedIPs)
   172  	registry.MustRegister(m.NeededIPs)
   173  
   174  	registry.MustRegister(m.IPsAllocated)
   175  	registry.MustRegister(m.AllocateIpOps)
   176  	registry.MustRegister(m.ReleaseIpOps)
   177  	registry.MustRegister(m.AllocateInterfaceOps)
   178  	registry.MustRegister(m.AvailableInterfaces)
   179  	registry.MustRegister(m.InterfaceCandidates)
   180  	registry.MustRegister(m.EmptyInterfaceSlots)
   181  	registry.MustRegister(m.AvailableIPsPerSubnet)
   182  	registry.MustRegister(m.Nodes)
   183  	registry.MustRegister(m.Resync)
   184  	registry.MustRegister(m.Allocation)
   185  	registry.MustRegister(m.Release)
   186  	m.poolMaintainer.Register(registry)
   187  	m.k8sSync.Register(registry)
   188  	m.resync.Register(registry)
   189  
   190  	return m
   191  }
   192  
   193  func (p *prometheusMetrics) PoolMaintainerTrigger() trigger.MetricsObserver {
   194  	return p.poolMaintainer
   195  }
   196  
   197  func (p *prometheusMetrics) K8sSyncTrigger() trigger.MetricsObserver {
   198  	return p.k8sSync
   199  }
   200  
   201  func (p *prometheusMetrics) ResyncTrigger() trigger.MetricsObserver {
   202  	return p.resync
   203  }
   204  
   205  func (p *prometheusMetrics) IncInterfaceAllocation(subnetID string) {
   206  	p.AllocateInterfaceOps.WithLabelValues(subnetID).Inc()
   207  }
   208  
   209  func (p *prometheusMetrics) AddIPAllocation(subnetID string, allocated int64) {
   210  	p.AllocateIpOps.WithLabelValues(subnetID).Add(float64(allocated))
   211  }
   212  
   213  func (p *prometheusMetrics) AddIPRelease(subnetID string, released int64) {
   214  	p.ReleaseIpOps.WithLabelValues(subnetID).Add(float64(released))
   215  }
   216  
   217  func (p *prometheusMetrics) SetAllocatedIPs(typ string, allocated int) {
   218  	p.IPsAllocated.WithLabelValues(typ).Set(float64(allocated))
   219  }
   220  
   221  func (p *prometheusMetrics) SetAvailableInterfaces(available int) {
   222  	p.AvailableInterfaces.Set(float64(available))
   223  }
   224  
   225  func (p *prometheusMetrics) SetInterfaceCandidates(interfaceCandidates int) {
   226  	p.InterfaceCandidates.Set(float64(interfaceCandidates))
   227  }
   228  
   229  func (p *prometheusMetrics) SetEmptyInterfaceSlots(emptyInterfaceSlots int) {
   230  	p.EmptyInterfaceSlots.Set(float64(emptyInterfaceSlots))
   231  }
   232  
   233  func (p *prometheusMetrics) SetAvailableIPsPerSubnet(subnetID string, availabilityZone string, available int) {
   234  	p.AvailableIPsPerSubnet.WithLabelValues(subnetID, availabilityZone).Set(float64(available))
   235  }
   236  
   237  func (p *prometheusMetrics) SetNodes(label string, nodes int) {
   238  	p.Nodes.WithLabelValues(label).Set(float64(nodes))
   239  }
   240  
   241  func (p *prometheusMetrics) IncResyncCount() {
   242  	p.Resync.Inc()
   243  }
   244  
   245  func (p *prometheusMetrics) AllocationAttempt(typ, status, subnetID string, observe float64) {
   246  	p.Allocation.WithLabelValues(typ, status, subnetID).Observe(observe)
   247  }
   248  
   249  func (p *prometheusMetrics) ReleaseAttempt(typ, status, subnetID string, observe float64) {
   250  	p.Release.WithLabelValues(typ, status, subnetID).Observe(observe)
   251  }
   252  
   253  // Per Node metrics.
   254  func (p *prometheusMetrics) SetIPAvailable(node string, cap int) {
   255  	p.AvailableIPs.WithLabelValues(node).Set(float64(cap))
   256  }
   257  
   258  func (p *prometheusMetrics) SetIPUsed(node string, usage int) {
   259  	p.UsedIPs.WithLabelValues(node).Set(float64(usage))
   260  }
   261  
   262  func (p *prometheusMetrics) SetIPNeeded(node string, usage int) {
   263  	p.NeededIPs.WithLabelValues(node).Set(float64(usage))
   264  }
   265  
   266  // DeleteNode removes all per-node metrics for a particular node (i.e. those labeled with "target_node").
   267  // This is to ensure that when a Node/CiliumNode delete event happens that the operator will no longer report
   268  // metrics for that node.
   269  func (p *prometheusMetrics) DeleteNode(node string) {
   270  	p.AvailableIPs.DeleteLabelValues(node)
   271  	p.UsedIPs.DeleteLabelValues(node)
   272  	p.NeededIPs.DeleteLabelValues(node)
   273  }
   274  
// triggerMetrics groups the collectors kept for a single trigger: a counter
// of queued triggers (total), a gauge with the current level of folding
// (folds), a histogram of trigger run durations (callDuration) and a histogram
// of the latency between queueing and the trigger run (latency). Instances are
// created by NewTriggerMetrics.
type triggerMetrics struct {
	total        prometheus.Counter
	folds        prometheus.Gauge
	callDuration prometheus.Histogram
	latency      prometheus.Histogram
}
   281  
   282  func NewTriggerMetrics(namespace, name string) *triggerMetrics {
   283  	return &triggerMetrics{
   284  		total: prometheus.NewCounter(prometheus.CounterOpts{
   285  			Namespace: namespace,
   286  			Subsystem: ipamSubsystem,
   287  			Name:      name + "_queued_total",
   288  			Help:      "Number of queued triggers",
   289  		}),
   290  		folds: prometheus.NewGauge(prometheus.GaugeOpts{
   291  			Namespace: namespace,
   292  			Subsystem: ipamSubsystem,
   293  			Name:      name + "_folds",
   294  			Help:      "Current level of folding",
   295  		}),
   296  		callDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
   297  			Namespace: namespace,
   298  			Subsystem: ipamSubsystem,
   299  			Name:      name + "_duration_seconds",
   300  			Help:      "Duration of trigger runs",
   301  			Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3,
   302  				4, 5, 6, 8, 10, 15, 20, 30, 45, 60},
   303  		}),
   304  		latency: prometheus.NewHistogram(prometheus.HistogramOpts{
   305  			Namespace: namespace,
   306  			Subsystem: ipamSubsystem,
   307  			Name:      name + "_latency_seconds",
   308  			Help:      "Latency between queue and trigger run",
   309  			Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3,
   310  				4, 5, 6, 8, 10, 15, 20, 30, 45, 60},
   311  		}),
   312  	}
   313  }
   314  
   315  func (t *triggerMetrics) Register(registry metrics.RegisterGatherer) {
   316  	registry.MustRegister(t.total)
   317  	registry.MustRegister(t.folds)
   318  	registry.MustRegister(t.callDuration)
   319  	registry.MustRegister(t.latency)
   320  }
   321  
   322  func (t *triggerMetrics) QueueEvent(reason string) {
   323  	t.total.Inc()
   324  }
   325  
   326  func (t *triggerMetrics) PostRun(duration, latency time.Duration, folds int) {
   327  	t.callDuration.Observe(duration.Seconds())
   328  	t.latency.Observe(latency.Seconds())
   329  	t.folds.Set(float64(folds))
   330  }
   331  
   332  // NoOpMetricsObserver is a no-operation implementation of the metrics observer
   333  type NoOpMetricsObserver struct{}
   334  
   335  // MetricsObserver implementation
   336  func (m *NoOpMetricsObserver) PostRun(callDuration, latency time.Duration, folds int) {}
   337  func (m *NoOpMetricsObserver) QueueEvent(reason string)                               {}
   338  
   339  // NoOpMetrics is a no-operation implementation of the metrics
   340  type NoOpMetrics struct{}
   341  
   342  func (m *NoOpMetrics) AllocationAttempt(typ, status, subnetID string, observe float64)           {}
   343  func (m *NoOpMetrics) ReleaseAttempt(typ, status, subnetID string, observe float64)              {}
   344  func (m *NoOpMetrics) IncInterfaceAllocation(subnetID string)                                    {}
   345  func (m *NoOpMetrics) AddIPAllocation(subnetID string, allocated int64)                          {}
   346  func (m *NoOpMetrics) AddIPRelease(subnetID string, released int64)                              {}
   347  func (m *NoOpMetrics) SetAllocatedIPs(typ string, allocated int)                                 {}
   348  func (m *NoOpMetrics) SetAvailableInterfaces(available int)                                      {}
   349  func (m *NoOpMetrics) SetInterfaceCandidates(interfaceCandidates int)                            {}
   350  func (m *NoOpMetrics) SetEmptyInterfaceSlots(emptyInterfaceSlots int)                            {}
   351  func (m *NoOpMetrics) SetAvailableIPsPerSubnet(subnetID, availabilityZone string, available int) {}
   352  func (m *NoOpMetrics) SetNodes(category string, nodes int)                                       {}
   353  func (m *NoOpMetrics) IncResyncCount()                                                           {}
   354  func (m *NoOpMetrics) SetIPAvailable(node string, n int)                                         {}
   355  func (m *NoOpMetrics) SetIPUsed(node string, n int)                                              {}
   356  func (m *NoOpMetrics) SetIPNeeded(node string, n int)                                            {}
   357  func (m *NoOpMetrics) PoolMaintainerTrigger() trigger.MetricsObserver                            { return &NoOpMetricsObserver{} }
   358  func (m *NoOpMetrics) K8sSyncTrigger() trigger.MetricsObserver                                   { return &NoOpMetricsObserver{} }
   359  func (m *NoOpMetrics) ResyncTrigger() trigger.MetricsObserver                                    { return &NoOpMetricsObserver{} }
   360  func (m *NoOpMetrics) DeleteNode(n string)                                                       {}
   361  
   362  func merge(slices ...[]float64) []float64 {
   363  	result := make([]float64, 1)
   364  	for _, s := range slices {
   365  		result = append(result, s...)
   366  	}
   367  	return result
   368  }
   369  
   370  // SinceInSeconds gets the time since the specified start in seconds.
   371  func SinceInSeconds(start time.Time) float64 {
   372  	return time.Since(start).Seconds()
   373  }