github.com/cilium/cilium@v1.16.2/pkg/ipam/metrics/metrics.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package metrics 5 6 import ( 7 "github.com/prometheus/client_golang/prometheus" 8 9 "github.com/cilium/cilium/operator/metrics" 10 "github.com/cilium/cilium/pkg/time" 11 "github.com/cilium/cilium/pkg/trigger" 12 ) 13 14 const ipamSubsystem = "ipam" 15 16 type prometheusMetrics struct { 17 registry metrics.RegisterGatherer 18 Allocation *prometheus.HistogramVec 19 Release *prometheus.HistogramVec 20 AllocateInterfaceOps *prometheus.CounterVec 21 AllocateIpOps *prometheus.CounterVec 22 ReleaseIpOps *prometheus.CounterVec 23 AvailableIPs *prometheus.GaugeVec 24 UsedIPs *prometheus.GaugeVec 25 NeededIPs *prometheus.GaugeVec 26 // Deprecated, will be removed in version 1.15. 27 // Use AvailableIPs, UsedIPs and NeededIPs instead. 28 IPsAllocated *prometheus.GaugeVec 29 // Deprecated, will be removed in version 1.14: 30 // Use InterfaceCandidates and EmptyInterfaceSlots instead 31 AvailableInterfaces prometheus.Gauge 32 InterfaceCandidates prometheus.Gauge 33 EmptyInterfaceSlots prometheus.Gauge 34 AvailableIPsPerSubnet *prometheus.GaugeVec 35 Nodes *prometheus.GaugeVec 36 Resync prometheus.Counter 37 poolMaintainer *triggerMetrics 38 k8sSync *triggerMetrics 39 resync *triggerMetrics 40 } 41 42 const LabelTargetNodeName = "target_node" 43 44 // NewPrometheusMetrics returns a new interface metrics implementation backed by 45 // Prometheus metrics. 46 func NewPrometheusMetrics(namespace string, registry metrics.RegisterGatherer) *prometheusMetrics { 47 m := &prometheusMetrics{ 48 registry: registry, 49 } 50 51 m.AvailableIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 52 Namespace: namespace, 53 Subsystem: ipamSubsystem, 54 Name: "available_ips", 55 Help: "Total available IPs on Node for IPAM allocation", 56 }, []string{LabelTargetNodeName}) 57 58 m.UsedIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 59 Namespace: namespace, 60 Subsystem: ipamSubsystem, 61 Name: "used_ips", 62 Help: "Total used IPs on Node for IPAM allocation", 63 }, []string{LabelTargetNodeName}) 64 65 m.NeededIPs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 66 Namespace: namespace, 67 Subsystem: ipamSubsystem, 68 Name: "needed_ips", 69 Help: "Number of IPs that are needed on the Node to satisfy IPAM allocation requests", 70 }, []string{LabelTargetNodeName}) 71 72 m.IPsAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 73 Namespace: namespace, 74 Subsystem: ipamSubsystem, 75 Name: "ips", 76 Help: "Number of IPs allocated", 77 }, []string{"type"}) 78 79 m.AllocateIpOps = prometheus.NewCounterVec(prometheus.CounterOpts{ 80 Namespace: namespace, 81 Subsystem: ipamSubsystem, 82 Name: "ip_allocation_ops", 83 Help: "Number of IP allocation operations", 84 }, []string{"subnet_id"}) 85 86 m.ReleaseIpOps = prometheus.NewCounterVec(prometheus.CounterOpts{ 87 Namespace: namespace, 88 Subsystem: ipamSubsystem, 89 Name: "ip_release_ops", 90 Help: "Number of IP release operations", 91 }, []string{"subnet_id"}) 92 93 m.AllocateInterfaceOps = prometheus.NewCounterVec(prometheus.CounterOpts{ 94 Namespace: namespace, 95 Subsystem: ipamSubsystem, 96 Name: "interface_creation_ops", 97 Help: "Number of interfaces allocated", 98 }, []string{"subnet_id"}) 99 100 m.AvailableInterfaces = prometheus.NewGauge(prometheus.GaugeOpts{ 101 Namespace: namespace, 102 Subsystem: ipamSubsystem, 103 Name: "available_interfaces", 104 Help: "Number of interfaces with addresses available", 105 }) 106 107 m.InterfaceCandidates = prometheus.NewGauge(prometheus.GaugeOpts{ 108 Namespace: namespace, 109 Subsystem: ipamSubsystem, 110 Name: "interface_candidates", 111 Help: "Number of attached interfaces with IPs available for allocation", 112 }) 113 114 m.EmptyInterfaceSlots = prometheus.NewGauge(prometheus.GaugeOpts{ 115 Namespace: namespace, 116 Subsystem: ipamSubsystem, 117 Name: "empty_interface_slots", 118 Help: "Number of empty interface slots available for interfaces to be attached", 119 }) 120 121 m.AvailableIPsPerSubnet = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 122 Namespace: namespace, 123 Subsystem: ipamSubsystem, 124 Name: "available_ips_per_subnet", 125 Help: "Number of available IPs per subnet ID", 126 }, []string{"subnet_id", "availability_zone"}) 127 128 m.Nodes = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 129 Namespace: namespace, 130 Subsystem: ipamSubsystem, 131 Name: "nodes", 132 Help: "Number of nodes by category { total | in-deficit | at-capacity }", 133 }, []string{"category"}) 134 135 m.Resync = prometheus.NewCounter(prometheus.CounterOpts{ 136 Namespace: namespace, 137 Subsystem: ipamSubsystem, 138 Name: "resync_total", 139 Help: "Number of resync operations to synchronize and resolve IP deficit of nodes", 140 }) 141 142 m.Allocation = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 143 Namespace: namespace, 144 Subsystem: ipamSubsystem, 145 Name: "allocation_duration_seconds", 146 Help: "Allocation ip or interface latency in seconds", 147 Buckets: merge( 148 prometheus.LinearBuckets(0.25, 0.25, 2), // 0.25s, 0.50s 149 prometheus.LinearBuckets(1, 1, 60), // 1s, 2s, 3s, ... 60s, 150 ), 151 }, []string{"type", "status", "subnet_id"}) 152 153 m.Release = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 154 Namespace: namespace, 155 Subsystem: ipamSubsystem, 156 Name: "release_duration_seconds", 157 Help: "Release ip or interface latency in seconds", 158 Buckets: merge( 159 prometheus.LinearBuckets(0.25, 0.25, 2), // 0.25s, 0.50s 160 prometheus.LinearBuckets(1, 1, 60), // 1s, 2s, 3s, ... 60s, 161 ), 162 }, []string{"type", "status", "subnet_id"}) 163 164 // pool_maintainer is a more generic name, but for backward compatibility 165 // of dashboard, keep the metric name deficit_resolver unchanged 166 m.poolMaintainer = NewTriggerMetrics(namespace, "deficit_resolver") 167 m.k8sSync = NewTriggerMetrics(namespace, "k8s_sync") 168 m.resync = NewTriggerMetrics(namespace, "resync") 169 170 registry.MustRegister(m.AvailableIPs) 171 registry.MustRegister(m.UsedIPs) 172 registry.MustRegister(m.NeededIPs) 173 174 registry.MustRegister(m.IPsAllocated) 175 registry.MustRegister(m.AllocateIpOps) 176 registry.MustRegister(m.ReleaseIpOps) 177 registry.MustRegister(m.AllocateInterfaceOps) 178 registry.MustRegister(m.AvailableInterfaces) 179 registry.MustRegister(m.InterfaceCandidates) 180 registry.MustRegister(m.EmptyInterfaceSlots) 181 registry.MustRegister(m.AvailableIPsPerSubnet) 182 registry.MustRegister(m.Nodes) 183 registry.MustRegister(m.Resync) 184 registry.MustRegister(m.Allocation) 185 registry.MustRegister(m.Release) 186 m.poolMaintainer.Register(registry) 187 m.k8sSync.Register(registry) 188 m.resync.Register(registry) 189 190 return m 191 } 192 193 func (p *prometheusMetrics) PoolMaintainerTrigger() trigger.MetricsObserver { 194 return p.poolMaintainer 195 } 196 197 func (p *prometheusMetrics) K8sSyncTrigger() trigger.MetricsObserver { 198 return p.k8sSync 199 } 200 201 func (p *prometheusMetrics) ResyncTrigger() trigger.MetricsObserver { 202 return p.resync 203 } 204 205 func (p *prometheusMetrics) IncInterfaceAllocation(subnetID string) { 206 p.AllocateInterfaceOps.WithLabelValues(subnetID).Inc() 207 } 208 209 func (p *prometheusMetrics) AddIPAllocation(subnetID string, allocated int64) { 210 p.AllocateIpOps.WithLabelValues(subnetID).Add(float64(allocated)) 211 } 212 213 func (p *prometheusMetrics) AddIPRelease(subnetID string, released int64) { 214 p.ReleaseIpOps.WithLabelValues(subnetID).Add(float64(released)) 215 } 216 217 func (p *prometheusMetrics) SetAllocatedIPs(typ string, allocated int) { 218 p.IPsAllocated.WithLabelValues(typ).Set(float64(allocated)) 219 } 220 221 func (p *prometheusMetrics) SetAvailableInterfaces(available int) { 222 p.AvailableInterfaces.Set(float64(available)) 223 } 224 225 func (p *prometheusMetrics) SetInterfaceCandidates(interfaceCandidates int) { 226 p.InterfaceCandidates.Set(float64(interfaceCandidates)) 227 } 228 229 func (p *prometheusMetrics) SetEmptyInterfaceSlots(emptyInterfaceSlots int) { 230 p.EmptyInterfaceSlots.Set(float64(emptyInterfaceSlots)) 231 } 232 233 func (p *prometheusMetrics) SetAvailableIPsPerSubnet(subnetID string, availabilityZone string, available int) { 234 p.AvailableIPsPerSubnet.WithLabelValues(subnetID, availabilityZone).Set(float64(available)) 235 } 236 237 func (p *prometheusMetrics) SetNodes(label string, nodes int) { 238 p.Nodes.WithLabelValues(label).Set(float64(nodes)) 239 } 240 241 func (p *prometheusMetrics) IncResyncCount() { 242 p.Resync.Inc() 243 } 244 245 func (p *prometheusMetrics) AllocationAttempt(typ, status, subnetID string, observe float64) { 246 p.Allocation.WithLabelValues(typ, status, subnetID).Observe(observe) 247 } 248 249 func (p *prometheusMetrics) ReleaseAttempt(typ, status, subnetID string, observe float64) { 250 p.Release.WithLabelValues(typ, status, subnetID).Observe(observe) 251 } 252 253 // Per Node metrics. 254 func (p *prometheusMetrics) SetIPAvailable(node string, cap int) { 255 p.AvailableIPs.WithLabelValues(node).Set(float64(cap)) 256 } 257 258 func (p *prometheusMetrics) SetIPUsed(node string, usage int) { 259 p.UsedIPs.WithLabelValues(node).Set(float64(usage)) 260 } 261 262 func (p *prometheusMetrics) SetIPNeeded(node string, usage int) { 263 p.NeededIPs.WithLabelValues(node).Set(float64(usage)) 264 } 265 266 // DeleteNode removes all per-node metrics for a particular node (i.e. those labeled with "target_node"). 267 // This is to ensure that when a Node/CiliumNode delete event happens that the operator will no longer report 268 // metrics for that node. 269 func (p *prometheusMetrics) DeleteNode(node string) { 270 p.AvailableIPs.DeleteLabelValues(node) 271 p.UsedIPs.DeleteLabelValues(node) 272 p.NeededIPs.DeleteLabelValues(node) 273 } 274 275 type triggerMetrics struct { 276 total prometheus.Counter 277 folds prometheus.Gauge 278 callDuration prometheus.Histogram 279 latency prometheus.Histogram 280 } 281 282 func NewTriggerMetrics(namespace, name string) *triggerMetrics { 283 return &triggerMetrics{ 284 total: prometheus.NewCounter(prometheus.CounterOpts{ 285 Namespace: namespace, 286 Subsystem: ipamSubsystem, 287 Name: name + "_queued_total", 288 Help: "Number of queued triggers", 289 }), 290 folds: prometheus.NewGauge(prometheus.GaugeOpts{ 291 Namespace: namespace, 292 Subsystem: ipamSubsystem, 293 Name: name + "_folds", 294 Help: "Current level of folding", 295 }), 296 callDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ 297 Namespace: namespace, 298 Subsystem: ipamSubsystem, 299 Name: name + "_duration_seconds", 300 Help: "Duration of trigger runs", 301 Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, 302 4, 5, 6, 8, 10, 15, 20, 30, 45, 60}, 303 }), 304 latency: prometheus.NewHistogram(prometheus.HistogramOpts{ 305 Namespace: namespace, 306 Subsystem: ipamSubsystem, 307 Name: name + "_latency_seconds", 308 Help: "Latency between queue and trigger run", 309 Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, 310 4, 5, 6, 8, 10, 15, 20, 30, 45, 60}, 311 }), 312 } 313 } 314 315 func (t *triggerMetrics) Register(registry metrics.RegisterGatherer) { 316 registry.MustRegister(t.total) 317 registry.MustRegister(t.folds) 318 registry.MustRegister(t.callDuration) 319 registry.MustRegister(t.latency) 320 } 321 322 func (t *triggerMetrics) QueueEvent(reason string) { 323 t.total.Inc() 324 } 325 326 func (t *triggerMetrics) PostRun(duration, latency time.Duration, folds int) { 327 t.callDuration.Observe(duration.Seconds()) 328 t.latency.Observe(latency.Seconds()) 329 t.folds.Set(float64(folds)) 330 } 331 332 // NoOpMetricsObserver is a no-operation implementation of the metrics observer 333 type NoOpMetricsObserver struct{} 334 335 // MetricsObserver implementation 336 func (m *NoOpMetricsObserver) PostRun(callDuration, latency time.Duration, folds int) {} 337 func (m *NoOpMetricsObserver) QueueEvent(reason string) {} 338 339 // NoOpMetrics is a no-operation implementation of the metrics 340 type NoOpMetrics struct{} 341 342 func (m *NoOpMetrics) AllocationAttempt(typ, status, subnetID string, observe float64) {} 343 func (m *NoOpMetrics) ReleaseAttempt(typ, status, subnetID string, observe float64) {} 344 func (m *NoOpMetrics) IncInterfaceAllocation(subnetID string) {} 345 func (m *NoOpMetrics) AddIPAllocation(subnetID string, allocated int64) {} 346 func (m *NoOpMetrics) AddIPRelease(subnetID string, released int64) {} 347 func (m *NoOpMetrics) SetAllocatedIPs(typ string, allocated int) {} 348 func (m *NoOpMetrics) SetAvailableInterfaces(available int) {} 349 func (m *NoOpMetrics) SetInterfaceCandidates(interfaceCandidates int) {} 350 func (m *NoOpMetrics) SetEmptyInterfaceSlots(emptyInterfaceSlots int) {} 351 func (m *NoOpMetrics) SetAvailableIPsPerSubnet(subnetID, availabilityZone string, available int) {} 352 func (m *NoOpMetrics) SetNodes(category string, nodes int) {} 353 func (m *NoOpMetrics) IncResyncCount() {} 354 func (m *NoOpMetrics) SetIPAvailable(node string, n int) {} 355 func (m *NoOpMetrics) SetIPUsed(node string, n int) {} 356 func (m *NoOpMetrics) SetIPNeeded(node string, n int) {} 357 func (m *NoOpMetrics) PoolMaintainerTrigger() trigger.MetricsObserver { return &NoOpMetricsObserver{} } 358 func (m *NoOpMetrics) K8sSyncTrigger() trigger.MetricsObserver { return &NoOpMetricsObserver{} } 359 func (m *NoOpMetrics) ResyncTrigger() trigger.MetricsObserver { return &NoOpMetricsObserver{} } 360 func (m *NoOpMetrics) DeleteNode(n string) {} 361 362 func merge(slices ...[]float64) []float64 { 363 result := make([]float64, 1) 364 for _, s := range slices { 365 result = append(result, s...) 366 } 367 return result 368 } 369 370 // SinceInSeconds gets the time since the specified start in seconds. 371 func SinceInSeconds(start time.Time) float64 { 372 return time.Since(start).Seconds() 373 }