k8s.io/apiserver@v0.31.1/pkg/storage/etcd3/metrics/metrics.go (about)

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package metrics
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	compbasemetrics "k8s.io/component-base/metrics"
    26  	"k8s.io/component-base/metrics/legacyregistry"
    27  	"k8s.io/klog/v2"
    28  )
    29  
/*
 * By default, all the following metrics are defined as falling under
 * ALPHA stability level (https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/kubernetes-control-plane-metrics-stability.md#stability-classes).
 *
 * Promoting the stability level of the metric is a responsibility of the component owner, since it
 * involves explicitly acknowledging support for the metric across multiple releases, in accordance with
 * the metric stability policy.
 */
// Package-level collectors for the etcd3 storage layer. They are created here
// and only become visible once Register adds them to the legacy registry.
var (
	// etcdRequestLatency is a histogram of etcd request durations in seconds,
	// labelled by operation and object type. Observed by RecordEtcdRequest and
	// cleared by Reset.
	etcdRequestLatency = compbasemetrics.NewHistogramVec(
		&compbasemetrics.HistogramOpts{
			Name: "etcd_request_duration_seconds",
			Help: "Etcd request latency in seconds for each operation and object type.",
			// Etcd request latency in seconds for each operation and object type.
			// This metric is used for verifying etcd api call latencies SLO
			// keep consistent with apiserver metric 'requestLatencies' in
			// staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go
			Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3,
				4, 5, 6, 8, 10, 15, 20, 30, 45, 60},
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{"operation", "type"},
	)
	// etcdRequestCounts counts every etcd request, labelled by operation and
	// object type. Incremented by RecordEtcdRequest.
	etcdRequestCounts = compbasemetrics.NewCounterVec(
		&compbasemetrics.CounterOpts{
			Name:           "etcd_requests_total",
			Help:           "Etcd request counts for each operation and object type.",
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{"operation", "type"},
	)
	// etcdRequestErrorCounts counts etcd requests that returned a non-nil
	// error. Incremented by RecordEtcdRequest.
	etcdRequestErrorCounts = compbasemetrics.NewCounterVec(
		&compbasemetrics.CounterOpts{
			Name:           "etcd_request_errors_total",
			Help:           "Etcd failed request counts for each operation and object type.",
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{"operation", "type"},
	)
	// objectCounts is the per-resource gauge written by UpdateObjectCount.
	// Unlike most metrics in this block it is declared STABLE.
	objectCounts = compbasemetrics.NewGaugeVec(
		&compbasemetrics.GaugeOpts{
			Name:           "apiserver_storage_objects",
			Help:           "Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.",
			StabilityLevel: compbasemetrics.STABLE,
		},
		[]string{"resource"},
	)
	// dbTotalSize is the per-endpoint database size gauge written by
	// UpdateEtcdDbSize. It is declared with Subsystem "apiserver" and is marked
	// deprecated as of version 1.28.0.
	dbTotalSize = compbasemetrics.NewGaugeVec(
		&compbasemetrics.GaugeOpts{
			Subsystem:         "apiserver",
			Name:              "storage_db_total_size_in_bytes",
			Help:              "Total size of the storage database file physically allocated in bytes.",
			StabilityLevel:    compbasemetrics.ALPHA,
			DeprecatedVersion: "1.28.0",
		},
		[]string{"endpoint"},
	)
	// storageSizeDescription describes the STABLE apiserver_storage_size_bytes
	// metric emitted by storageMonitor, the custom collector registered in
	// Register. Until SetStorageMonitorGetter installs a real getter, the
	// default getter returns no monitors, so no samples are produced.
	// etcdEventsReceivedCounts counts received etcd events per resource and is
	// incremented by RecordEtcdEvent.
	storageSizeDescription   = compbasemetrics.NewDesc("apiserver_storage_size_bytes", "Size of the storage database file physically allocated in bytes.", []string{"storage_cluster_id"}, nil, compbasemetrics.STABLE, "")
	storageMonitor           = &monitorCollector{monitorGetter: func() ([]Monitor, error) { return nil, nil }}
	etcdEventsReceivedCounts = compbasemetrics.NewCounterVec(
		&compbasemetrics.CounterOpts{
			Subsystem:      "apiserver",
			Name:           "storage_events_received_total",
			Help:           "Number of etcd events received split by kind.",
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{"resource"},
	)
	// etcdBookmarkCounts tracks etcd bookmarks (progress notify events) per
	// resource; RecordEtcdBookmark increments it.
	// NOTE(review): declared as a gauge although this file only ever calls
	// Inc on it — confirm whether a counter was intended.
	etcdBookmarkCounts = compbasemetrics.NewGaugeVec(
		&compbasemetrics.GaugeOpts{
			Name:           "etcd_bookmark_counts",
			Help:           "Number of etcd bookmarks (progress notify events) split by kind.",
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{"resource"},
	)
	// etcdLeaseObjectCounts is an unlabelled histogram of how many objects are
	// attached to one lease. Observed by UpdateLeaseObjectCount.
	etcdLeaseObjectCounts = compbasemetrics.NewHistogramVec(
		&compbasemetrics.HistogramOpts{
			Name:           "etcd_lease_object_counts",
			Help:           "Number of objects attached to a single etcd lease.",
			Buckets:        []float64{10, 50, 100, 500, 1000, 2500, 5000},
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{},
	)
	// The four listStorage* counters below are all updated together, per
	// resource, by RecordStorageListMetrics.

	// listStorageCount counts LIST requests served from storage.
	listStorageCount = compbasemetrics.NewCounterVec(
		&compbasemetrics.CounterOpts{
			Name:           "apiserver_storage_list_total",
			Help:           "Number of LIST requests served from storage",
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{"resource"},
	)
	// listStorageNumFetched accumulates the number of objects read from
	// storage while serving LIST requests.
	listStorageNumFetched = compbasemetrics.NewCounterVec(
		&compbasemetrics.CounterOpts{
			Name:           "apiserver_storage_list_fetched_objects_total",
			Help:           "Number of objects read from storage in the course of serving a LIST request",
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{"resource"},
	)
	// listStorageNumSelectorEvals accumulates the number of objects tested
	// while serving LIST requests.
	listStorageNumSelectorEvals = compbasemetrics.NewCounterVec(
		&compbasemetrics.CounterOpts{
			Name:           "apiserver_storage_list_evaluated_objects_total",
			Help:           "Number of objects tested in the course of serving a LIST request from storage",
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{"resource"},
	)
	// listStorageNumReturned accumulates the number of objects returned by
	// LIST requests.
	listStorageNumReturned = compbasemetrics.NewCounterVec(
		&compbasemetrics.CounterOpts{
			Name:           "apiserver_storage_list_returned_objects_total",
			Help:           "Number of objects returned for a LIST request from storage",
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{"resource"},
	)
	// decodeErrorCounts counts stored-object decode errors per resource and is
	// incremented by RecordDecodeError. Note that it sets Namespace (not
	// Subsystem, as the other apiserver-prefixed metrics here do) to "apiserver".
	decodeErrorCounts = compbasemetrics.NewCounterVec(
		&compbasemetrics.CounterOpts{
			Namespace:      "apiserver",
			Name:           "storage_decode_errors_total",
			Help:           "Number of stored object decode errors split by object type",
			StabilityLevel: compbasemetrics.ALPHA,
		},
		[]string{"resource"},
	)
)
   157  
   158  var registerMetrics sync.Once
   159  
   160  // Register all metrics.
   161  func Register() {
   162  	// Register the metrics.
   163  	registerMetrics.Do(func() {
   164  		legacyregistry.MustRegister(etcdRequestLatency)
   165  		legacyregistry.MustRegister(etcdRequestCounts)
   166  		legacyregistry.MustRegister(etcdRequestErrorCounts)
   167  		legacyregistry.MustRegister(objectCounts)
   168  		legacyregistry.MustRegister(dbTotalSize)
   169  		legacyregistry.CustomMustRegister(storageMonitor)
   170  		legacyregistry.MustRegister(etcdEventsReceivedCounts)
   171  		legacyregistry.MustRegister(etcdBookmarkCounts)
   172  		legacyregistry.MustRegister(etcdLeaseObjectCounts)
   173  		legacyregistry.MustRegister(listStorageCount)
   174  		legacyregistry.MustRegister(listStorageNumFetched)
   175  		legacyregistry.MustRegister(listStorageNumSelectorEvals)
   176  		legacyregistry.MustRegister(listStorageNumReturned)
   177  		legacyregistry.MustRegister(decodeErrorCounts)
   178  	})
   179  }
   180  
   181  // UpdateObjectCount sets the apiserver_storage_object_counts metric.
   182  func UpdateObjectCount(resourcePrefix string, count int64) {
   183  	objectCounts.WithLabelValues(resourcePrefix).Set(float64(count))
   184  }
   185  
   186  // RecordEtcdRequest updates and sets the etcd_request_duration_seconds,
   187  // etcd_request_total, etcd_request_errors_total metrics.
   188  func RecordEtcdRequest(verb, resource string, err error, startTime time.Time) {
   189  	v := []string{verb, resource}
   190  	etcdRequestLatency.WithLabelValues(v...).Observe(sinceInSeconds(startTime))
   191  	etcdRequestCounts.WithLabelValues(v...).Inc()
   192  	if err != nil {
   193  		etcdRequestErrorCounts.WithLabelValues(v...).Inc()
   194  	}
   195  }
   196  
   197  // RecordEtcdEvent updated the etcd_events_received_total metric.
   198  func RecordEtcdEvent(resource string) {
   199  	etcdEventsReceivedCounts.WithLabelValues(resource).Inc()
   200  }
   201  
   202  // RecordEtcdBookmark updates the etcd_bookmark_counts metric.
   203  func RecordEtcdBookmark(resource string) {
   204  	etcdBookmarkCounts.WithLabelValues(resource).Inc()
   205  }
   206  
   207  // RecordDecodeError sets the storage_decode_errors metrics.
   208  func RecordDecodeError(resource string) {
   209  	decodeErrorCounts.WithLabelValues(resource).Inc()
   210  }
   211  
   212  // Reset resets the etcd_request_duration_seconds metric.
   213  func Reset() {
   214  	etcdRequestLatency.Reset()
   215  }
   216  
   217  // sinceInSeconds gets the time since the specified start in seconds.
   218  //
   219  // This is a variable to facilitate testing.
   220  var sinceInSeconds = func(start time.Time) float64 {
   221  	return time.Since(start).Seconds()
   222  }
   223  
   224  // UpdateEtcdDbSize sets the etcd_db_total_size_in_bytes metric.
   225  // Deprecated: Metric etcd_db_total_size_in_bytes will be replaced with apiserver_storage_size_bytes
   226  func UpdateEtcdDbSize(ep string, size int64) {
   227  	dbTotalSize.WithLabelValues(ep).Set(float64(size))
   228  }
   229  
   230  // SetStorageMonitorGetter sets monitor getter to allow monitoring etcd stats.
   231  func SetStorageMonitorGetter(getter func() ([]Monitor, error)) {
   232  	storageMonitor.setGetter(getter)
   233  }
   234  
   235  // UpdateLeaseObjectCount sets the etcd_lease_object_counts metric.
   236  func UpdateLeaseObjectCount(count int64) {
   237  	// Currently we only store one previous lease, since all the events have the same ttl.
   238  	// See pkg/storage/etcd3/lease_manager.go
   239  	etcdLeaseObjectCounts.WithLabelValues().Observe(float64(count))
   240  }
   241  
   242  // RecordListEtcd3Metrics notes various metrics of the cost to serve a LIST request
   243  func RecordStorageListMetrics(resource string, numFetched, numEvald, numReturned int) {
   244  	listStorageCount.WithLabelValues(resource).Inc()
   245  	listStorageNumFetched.WithLabelValues(resource).Add(float64(numFetched))
   246  	listStorageNumSelectorEvals.WithLabelValues(resource).Add(float64(numEvald))
   247  	listStorageNumReturned.WithLabelValues(resource).Add(float64(numReturned))
   248  }
   249  
// Monitor is a source of storage metrics. The collector in this file treats
// each Monitor returned by the configured getter as a separate storage
// cluster.
type Monitor interface {
	// Monitor returns the current StorageMetrics, or an error. The collector
	// in this file calls it with a context that times out after one second.
	Monitor(ctx context.Context) (StorageMetrics, error)
	// Close is called by the collector in this file once after each Monitor
	// call, whether or not that call succeeded.
	Close() error
}
   254  
// StorageMetrics is the result of a single Monitor call.
type StorageMetrics struct {
	// Size is exported as the value of the apiserver_storage_size_bytes metric.
	Size int64
}
   258  
// monitorCollector is the custom collector behind apiserver_storage_size_bytes.
// On every collection it calls monitorGetter and queries each returned Monitor.
type monitorCollector struct {
	compbasemetrics.BaseStableCollector

	// mutex guards monitorGetter; it is taken by setGetter and getGetter.
	mutex         sync.Mutex
	// monitorGetter returns the monitors to query during collection.
	monitorGetter func() ([]Monitor, error)
}
   265  
   266  func (m *monitorCollector) setGetter(monitorGetter func() ([]Monitor, error)) {
   267  	m.mutex.Lock()
   268  	defer m.mutex.Unlock()
   269  	m.monitorGetter = monitorGetter
   270  }
   271  
   272  func (m *monitorCollector) getGetter() func() ([]Monitor, error) {
   273  	m.mutex.Lock()
   274  	defer m.mutex.Unlock()
   275  	return m.monitorGetter
   276  }
   277  
   278  // DescribeWithStability implements compbasemetrics.StableColletor
   279  func (c *monitorCollector) DescribeWithStability(ch chan<- *compbasemetrics.Desc) {
   280  	ch <- storageSizeDescription
   281  }
   282  
   283  // CollectWithStability implements compbasemetrics.StableColletor
   284  func (c *monitorCollector) CollectWithStability(ch chan<- compbasemetrics.Metric) {
   285  	monitors, err := c.getGetter()()
   286  	if err != nil {
   287  		return
   288  	}
   289  
   290  	for i, m := range monitors {
   291  		storageClusterID := fmt.Sprintf("etcd-%d", i)
   292  
   293  		klog.V(4).InfoS("Start collecting storage metrics", "storage_cluster_id", storageClusterID)
   294  		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   295  		metrics, err := m.Monitor(ctx)
   296  		cancel()
   297  		m.Close()
   298  		if err != nil {
   299  			klog.InfoS("Failed to get storage metrics", "storage_cluster_id", storageClusterID, "err", err)
   300  			continue
   301  		}
   302  
   303  		metric, err := compbasemetrics.NewConstMetric(storageSizeDescription, compbasemetrics.GaugeValue, float64(metrics.Size), storageClusterID)
   304  		if err != nil {
   305  			klog.ErrorS(err, "Failed to create metric", "storage_cluster_id", storageClusterID)
   306  		}
   307  		ch <- metric
   308  	}
   309  }