github.phpd.cn/cilium/cilium@v1.6.12/pkg/aws/eni/metrics/metrics.go (about)

     1  // Copyright 2019 Authors of Cilium
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package metrics
    16  
    17  import (
    18  	"time"
    19  
    20  	"github.com/cilium/cilium/pkg/trigger"
    21  
    22  	"github.com/prometheus/client_golang/prometheus"
    23  )
    24  
    25  const eniSubsystem = "eni"
    26  
    27  type prometheusMetrics struct {
    28  	registry              *prometheus.Registry
    29  	AllocateEniOps        *prometheus.CounterVec
    30  	AllocateIpOps         *prometheus.CounterVec
    31  	ReleaseIpOps          *prometheus.CounterVec
    32  	IPsAllocated          *prometheus.GaugeVec
    33  	AvailableENIs         prometheus.Gauge
    34  	AvailableIPsPerSubnet *prometheus.GaugeVec
    35  	Nodes                 *prometheus.GaugeVec
    36  	Resync                prometheus.Counter
    37  	EC2ApiDuration        *prometheus.HistogramVec
    38  	EC2RateLimit          *prometheus.HistogramVec
    39  	poolMaintainer        *triggerMetrics
    40  	k8sSync               *triggerMetrics
    41  	resync                *triggerMetrics
    42  }
    43  
    44  // NewPrometheusMetrics returns a new ENI metrics implementation backed by
    45  // Prometheus metrics.
    46  func NewPrometheusMetrics(namespace string, registry *prometheus.Registry) *prometheusMetrics {
    47  	m := &prometheusMetrics{
    48  		registry: registry,
    49  	}
    50  
    51  	m.IPsAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{
    52  		Namespace: namespace,
    53  		Subsystem: eniSubsystem,
    54  		Name:      "ips",
    55  		Help:      "Number of IPs allocated",
    56  	}, []string{"type"})
    57  
    58  	m.AllocateIpOps = prometheus.NewCounterVec(prometheus.CounterOpts{
    59  		Namespace: namespace,
    60  		Subsystem: eniSubsystem,
    61  		Name:      "allocation_ops",
    62  		Help:      "Number of IP allocation operations",
    63  	}, []string{"subnetId"})
    64  
    65  	m.ReleaseIpOps = prometheus.NewCounterVec(prometheus.CounterOpts{
    66  		Namespace: namespace,
    67  		Subsystem: eniSubsystem,
    68  		Name:      "release_ops",
    69  		Help:      "Number of IP release operations",
    70  	}, []string{"subnetId"})
    71  
    72  	m.AllocateEniOps = prometheus.NewCounterVec(prometheus.CounterOpts{
    73  		Namespace: namespace,
    74  		Subsystem: eniSubsystem,
    75  		Name:      "interface_creation_ops",
    76  		Help:      "Number of ENIs allocated",
    77  	}, []string{"subnetId", "status"})
    78  
    79  	m.AvailableENIs = prometheus.NewGauge(prometheus.GaugeOpts{
    80  		Namespace: namespace,
    81  		Subsystem: eniSubsystem,
    82  		Name:      "available",
    83  		Help:      "Number of ENIs with addresses available",
    84  	})
    85  
    86  	m.AvailableIPsPerSubnet = prometheus.NewGaugeVec(prometheus.GaugeOpts{
    87  		Namespace: namespace,
    88  		Subsystem: eniSubsystem,
    89  		Name:      "available_ips_per_subnet",
    90  		Help:      "Number of available IPs per subnet ID",
    91  	}, []string{"subnetId", "availabilityZone"})
    92  
    93  	m.Nodes = prometheus.NewGaugeVec(prometheus.GaugeOpts{
    94  		Namespace: namespace,
    95  		Subsystem: eniSubsystem,
    96  		Name:      "nodes",
    97  		Help:      "Number of nodes by category { total | in-deficit | at-capacity }",
    98  	}, []string{"category"})
    99  
   100  	m.EC2ApiDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
   101  		Namespace: namespace,
   102  		Subsystem: eniSubsystem,
   103  		Name:      "aws_api_duration_seconds",
   104  		Help:      "Duration of interactions with AWS API",
   105  	}, []string{"operation", "responseCode"})
   106  
   107  	m.Resync = prometheus.NewCounter(prometheus.CounterOpts{
   108  		Namespace: namespace,
   109  		Subsystem: eniSubsystem,
   110  		Name:      "resync_total",
   111  		Help:      "Number of resync operations to synchronize AWS EC2 metadata",
   112  	})
   113  
   114  	m.EC2RateLimit = prometheus.NewHistogramVec(prometheus.HistogramOpts{
   115  		Namespace: namespace,
   116  		Subsystem: eniSubsystem,
   117  		Name:      "ec2_rate_limit_duration_seconds",
   118  		Help:      "Duration of EC2 client-side rate limiter blocking",
   119  	}, []string{"operation"})
   120  
   121  	// pool_maintainer is a more generic name, but for backward compatibility
   122  	// of dashboard, keep the metric name deficit_resolver unchanged
   123  	m.poolMaintainer = newTriggerMetrics(namespace, "deficit_resolver")
   124  	m.k8sSync = newTriggerMetrics(namespace, "k8s_sync")
   125  	m.resync = newTriggerMetrics(namespace, "ec2_resync")
   126  
   127  	registry.MustRegister(m.IPsAllocated)
   128  	registry.MustRegister(m.AllocateIpOps)
   129  	registry.MustRegister(m.ReleaseIpOps)
   130  	registry.MustRegister(m.AllocateEniOps)
   131  	registry.MustRegister(m.AvailableENIs)
   132  	registry.MustRegister(m.AvailableIPsPerSubnet)
   133  	registry.MustRegister(m.Nodes)
   134  	registry.MustRegister(m.Resync)
   135  	registry.MustRegister(m.EC2ApiDuration)
   136  	registry.MustRegister(m.EC2RateLimit)
   137  	m.poolMaintainer.register(registry)
   138  	m.k8sSync.register(registry)
   139  	m.resync.register(registry)
   140  
   141  	return m
   142  }
   143  
   144  func (p *prometheusMetrics) PoolMaintainerTrigger() trigger.MetricsObserver {
   145  	return p.poolMaintainer
   146  }
   147  
   148  func (p *prometheusMetrics) K8sSyncTrigger() trigger.MetricsObserver {
   149  	return p.k8sSync
   150  }
   151  
   152  func (p *prometheusMetrics) ResyncTrigger() trigger.MetricsObserver {
   153  	return p.resync
   154  }
   155  
   156  func (p *prometheusMetrics) IncENIAllocationAttempt(status, subnetID string) {
   157  	p.AllocateEniOps.WithLabelValues(subnetID, status).Inc()
   158  }
   159  
   160  func (p *prometheusMetrics) AddIPAllocation(subnetID string, allocated int64) {
   161  	p.AllocateIpOps.WithLabelValues(subnetID).Add(float64(allocated))
   162  }
   163  
   164  func (p *prometheusMetrics) AddIPRelease(subnetID string, released int64) {
   165  	p.ReleaseIpOps.WithLabelValues(subnetID).Add(float64(released))
   166  }
   167  
   168  func (p *prometheusMetrics) SetAllocatedIPs(typ string, allocated int) {
   169  	p.IPsAllocated.WithLabelValues(typ).Set(float64(allocated))
   170  }
   171  
   172  func (p *prometheusMetrics) SetAvailableENIs(available int) {
   173  	p.AvailableENIs.Set(float64(available))
   174  }
   175  
   176  func (p *prometheusMetrics) SetAvailableIPsPerSubnet(subnetID string, availabilityZone string, available int) {
   177  	p.AvailableIPsPerSubnet.WithLabelValues(subnetID, availabilityZone).Set(float64(available))
   178  }
   179  
   180  func (p *prometheusMetrics) SetNodes(label string, nodes int) {
   181  	p.Nodes.WithLabelValues(label).Set(float64(nodes))
   182  }
   183  
   184  func (p *prometheusMetrics) ObserveEC2APICall(operation, status string, duration float64) {
   185  	p.EC2ApiDuration.WithLabelValues(operation, status).Observe(duration)
   186  }
   187  
   188  func (p *prometheusMetrics) ObserveEC2RateLimit(operation string, delay time.Duration) {
   189  	p.EC2RateLimit.WithLabelValues(operation).Observe(delay.Seconds())
   190  }
   191  
   192  func (p *prometheusMetrics) IncResyncCount() {
   193  	p.Resync.Inc()
   194  }
   195  
   196  type triggerMetrics struct {
   197  	total        prometheus.Counter
   198  	folds        prometheus.Gauge
   199  	callDuration prometheus.Histogram
   200  	latency      prometheus.Histogram
   201  }
   202  
   203  func newTriggerMetrics(namespace, name string) *triggerMetrics {
   204  	return &triggerMetrics{
   205  		total: prometheus.NewCounter(prometheus.CounterOpts{
   206  			Namespace: namespace,
   207  			Subsystem: eniSubsystem,
   208  			Name:      name + "_queued_total",
   209  			Help:      "Number of queued triggers",
   210  		}),
   211  		folds: prometheus.NewGauge(prometheus.GaugeOpts{
   212  			Namespace: namespace,
   213  			Subsystem: eniSubsystem,
   214  			Name:      name + "_folds",
   215  			Help:      "Current level of folding",
   216  		}),
   217  		callDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
   218  			Namespace: namespace,
   219  			Subsystem: eniSubsystem,
   220  			Name:      name + "_duration_seconds",
   221  			Help:      "Duration of trigger runs",
   222  		}),
   223  		latency: prometheus.NewHistogram(prometheus.HistogramOpts{
   224  			Namespace: namespace,
   225  			Subsystem: eniSubsystem,
   226  			Name:      name + "_latency_seconds",
   227  			Help:      "Latency between queue and trigger run",
   228  		}),
   229  	}
   230  }
   231  
   232  func (t *triggerMetrics) register(registry *prometheus.Registry) {
   233  	registry.MustRegister(t.total)
   234  	registry.MustRegister(t.folds)
   235  	registry.MustRegister(t.callDuration)
   236  	registry.MustRegister(t.latency)
   237  }
   238  
   239  func (t *triggerMetrics) QueueEvent(reason string) {
   240  	t.total.Inc()
   241  }
   242  
   243  func (t *triggerMetrics) PostRun(duration, latency time.Duration, folds int) {
   244  	t.callDuration.Observe(duration.Seconds())
   245  	t.latency.Observe(latency.Seconds())
   246  	t.folds.Set(float64(folds))
   247  }