k8s.io/apiserver@v0.31.1/pkg/storage/value/encrypt/envelope/metrics/metrics.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package metrics
    18  
    19  import (
    20  	"crypto/sha256"
    21  	"errors"
    22  	"fmt"
    23  	"hash"
    24  	"sync"
    25  	"time"
    26  
    27  	"google.golang.org/grpc/codes"
    28  	"google.golang.org/grpc/status"
    29  
    30  	"k8s.io/component-base/metrics"
    31  	"k8s.io/component-base/metrics/legacyregistry"
    32  	"k8s.io/klog/v2"
    33  	"k8s.io/utils/lru"
    34  )
    35  
const (
	// namespace and subsystem are the Namespace and Subsystem values set on
	// every metric defined in this file.
	namespace        = "apiserver"
	subsystem        = "envelope_encryption"
	// FromStorageLabel and ToStorageLabel are the transformation type values
	// that RecordArrival recognizes; any other value is ignored by it.
	FromStorageLabel = "from_storage"
	ToStorageLabel   = "to_storage"
)
    42  
// metricLabels is one combination of label values recorded for the key ID
// metrics. It is used as the key of the LRU label caches, and the eviction
// callbacks use its fields to delete the matching metric series.
type metricLabels struct {
	// transformationType is left empty for entries added by RecordKeyIDFromStatus.
	transformationType string
	providerName       string
	// keyIDHash and apiServerIDHash hold the output of getHash, not the raw IDs.
	keyIDHash          string
	apiServerIDHash    string
}
    49  
/*
 * By default, all the following metrics are defined as falling under
 * ALPHA stability level (https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/kubernetes-control-plane-metrics-stability.md#stability-classes).
 *
 * Promoting the stability level of a metric is the responsibility of the component owner, since it
 * involves explicitly acknowledging support for the metric across multiple releases, in accordance with
 * the metric stability policy.
 */
var (
	// lockLastFromStorage and lockLastToStorage guard lastFromStorage and
	// lastToStorage respectively (see RecordArrival). lockRecordKeyID is held
	// for the duration of RecordKeyID and lockRecordKeyIDStatus for the
	// duration of RecordKeyIDFromStatus.
	lockLastFromStorage   sync.Mutex
	lockLastToStorage     sync.Mutex
	lockRecordKeyID       sync.Mutex
	lockRecordKeyIDStatus sync.Mutex

	// lastFromStorage and lastToStorage hold the start time passed to the most
	// recent RecordArrival call for each transformation type.
	//
	// keyIDHashTotalMetricLabels and keyIDHashStatusLastTimestampSecondsMetricLabels
	// are LRU sets of metricLabels, each created with cacheSize entries by
	// registerLRUMetrics. When an entry is evicted, the metric series carrying
	// those label values are deleted.
	lastFromStorage                                 time.Time
	lastToStorage                                   time.Time
	keyIDHashTotalMetricLabels                      *lru.Cache
	keyIDHashStatusLastTimestampSecondsMetricLabels *lru.Cache
	cacheSize                                       = 100

	// This metric is only used for KMS v1 API.
	dekCacheFillPercent = metrics.NewGauge(
		&metrics.GaugeOpts{
			Namespace:      namespace,
			Subsystem:      subsystem,
			Name:           "dek_cache_fill_percent",
			Help:           "Percent of the cache slots currently occupied by cached DEKs.",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// This metric is only used for KMS v1 API.
	dekCacheInterArrivals = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Namespace:      namespace,
			Subsystem:      subsystem,
			Name:           "dek_cache_inter_arrival_time_seconds",
			Help:           "Time (in seconds) of inter arrival of transformation requests.",
			StabilityLevel: metrics.ALPHA,
			Buckets:        metrics.ExponentialBuckets(60, 2, 10),
		},
		[]string{"transformation_type"},
	)

	// The metrics below are exported so that unit tests can use them.

	// KMSOperationsLatencyMetric is the latency of KMS operations, labeled by
	// provider, method and gRPC status code.
	KMSOperationsLatencyMetric = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Namespace:      namespace,
			Subsystem:      subsystem,
			Name:           "kms_operations_latency_seconds",
			Help:           "KMS operation duration with gRPC error code status total.",
			StabilityLevel: metrics.ALPHA,
			// Use custom buckets to avoid the default buckets which are too small for KMS operations.
			// Start 0.1ms with the last bucket being [~52s, +Inf)
			Buckets: metrics.ExponentialBuckets(0.0001, 2, 20),
		},
		[]string{"provider_name", "method_name", "grpc_status_code"},
	)

	// KeyIDHashTotal is the number of times a keyID is used, e.g.
	// apiserver_envelope_encryption_key_id_hash_total{apiserver_id_hash="sha256",key_id_hash="sha256",
	// provider_name="providerName",transformation_type="from_storage"} 1
	KeyIDHashTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Namespace:      namespace,
			Subsystem:      subsystem,
			Name:           "key_id_hash_total",
			Help:           "Number of times a keyID is used split by transformation type, provider, and apiserver identity.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"transformation_type", "provider_name", "key_id_hash", "apiserver_id_hash"},
	)

	// KeyIDHashLastTimestampSeconds is the last time in seconds when a keyID was used, e.g.
	// apiserver_envelope_encryption_key_id_hash_last_timestamp_seconds{apiserver_id_hash="sha256",key_id_hash="sha256",
	// provider_name="providerName",transformation_type="from_storage"} 1.674865558833728e+09
	KeyIDHashLastTimestampSeconds = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Namespace:      namespace,
			Subsystem:      subsystem,
			Name:           "key_id_hash_last_timestamp_seconds",
			Help:           "The last time in seconds when a keyID was used.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"transformation_type", "provider_name", "key_id_hash", "apiserver_id_hash"},
	)

	// KeyIDHashStatusLastTimestampSeconds is the last time in seconds when a keyID was returned by the Status RPC call, e.g.
	// apiserver_envelope_encryption_key_id_hash_status_last_timestamp_seconds{apiserver_id_hash="sha256",key_id_hash="sha256",
	// provider_name="providerName"} 1.674865558833728e+09
	KeyIDHashStatusLastTimestampSeconds = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Namespace:      namespace,
			Subsystem:      subsystem,
			Name:           "key_id_hash_status_last_timestamp_seconds",
			Help:           "The last time in seconds when a keyID was returned by the Status RPC call.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"provider_name", "key_id_hash", "apiserver_id_hash"},
	)

	// InvalidKeyIDFromStatusTotal counts invalid keyIDs returned by the Status
	// RPC call, labeled by provider and error; it is incremented by
	// RecordInvalidKeyIDFromStatus.
	InvalidKeyIDFromStatusTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Namespace:      namespace,
			Subsystem:      subsystem,
			Name:           "invalid_key_id_from_status_total",
			Help:           "Number of times an invalid keyID is returned by the Status RPC call split by error.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"provider_name", "error"},
	)

	// DekSourceCacheSize is the number of records in the DEK source cache per
	// provider; it is set by RecordDekSourceCacheSize.
	DekSourceCacheSize = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Namespace:      namespace,
			Subsystem:      subsystem,
			Name:           "dek_source_cache_size",
			Help:           "Number of records in data encryption key (DEK) source cache. On a restart, this value is an approximation of the number of decrypt RPC calls the server will make to the KMS plugin.",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"provider_name"},
	)
)
   172  
// registerMetricsFunc ensures the body of RegisterMetrics runs at most once.
var registerMetricsFunc sync.Once

// hashPool holds reusable SHA-256 hashers for getHash. It is assigned inside
// RegisterMetrics and is nil before that.
var hashPool *sync.Pool
   175  
   176  func registerLRUMetrics() {
   177  	if keyIDHashTotalMetricLabels != nil {
   178  		keyIDHashTotalMetricLabels.Clear()
   179  	}
   180  	if keyIDHashStatusLastTimestampSecondsMetricLabels != nil {
   181  		keyIDHashStatusLastTimestampSecondsMetricLabels.Clear()
   182  	}
   183  
   184  	keyIDHashTotalMetricLabels = lru.NewWithEvictionFunc(cacheSize, func(key lru.Key, _ interface{}) {
   185  		item := key.(metricLabels)
   186  		if deleted := KeyIDHashTotal.DeleteLabelValues(item.transformationType, item.providerName, item.keyIDHash, item.apiServerIDHash); deleted {
   187  			klog.InfoS("Deleted keyIDHashTotalMetricLabels", "transformationType", item.transformationType,
   188  				"providerName", item.providerName, "keyIDHash", item.keyIDHash, "apiServerIDHash", item.apiServerIDHash)
   189  		}
   190  		if deleted := KeyIDHashLastTimestampSeconds.DeleteLabelValues(item.transformationType, item.providerName, item.keyIDHash, item.apiServerIDHash); deleted {
   191  			klog.InfoS("Deleted keyIDHashLastTimestampSecondsMetricLabels", "transformationType", item.transformationType,
   192  				"providerName", item.providerName, "keyIDHash", item.keyIDHash, "apiServerIDHash", item.apiServerIDHash)
   193  		}
   194  	})
   195  	keyIDHashStatusLastTimestampSecondsMetricLabels = lru.NewWithEvictionFunc(cacheSize, func(key lru.Key, _ interface{}) {
   196  		item := key.(metricLabels)
   197  		if deleted := KeyIDHashStatusLastTimestampSeconds.DeleteLabelValues(item.providerName, item.keyIDHash, item.apiServerIDHash); deleted {
   198  			klog.InfoS("Deleted keyIDHashStatusLastTimestampSecondsMetricLabels", "providerName", item.providerName, "keyIDHash", item.keyIDHash, "apiServerIDHash", item.apiServerIDHash)
   199  		}
   200  	})
   201  }
   202  func RegisterMetrics() {
   203  	registerMetricsFunc.Do(func() {
   204  		registerLRUMetrics()
   205  		hashPool = &sync.Pool{
   206  			New: func() interface{} {
   207  				return sha256.New()
   208  			},
   209  		}
   210  		legacyregistry.MustRegister(dekCacheFillPercent)
   211  		legacyregistry.MustRegister(dekCacheInterArrivals)
   212  		legacyregistry.MustRegister(DekSourceCacheSize)
   213  		legacyregistry.MustRegister(KeyIDHashTotal)
   214  		legacyregistry.MustRegister(KeyIDHashLastTimestampSeconds)
   215  		legacyregistry.MustRegister(KeyIDHashStatusLastTimestampSeconds)
   216  		legacyregistry.MustRegister(InvalidKeyIDFromStatusTotal)
   217  		legacyregistry.MustRegister(KMSOperationsLatencyMetric)
   218  	})
   219  }
   220  
   221  // RecordKeyID records total count and last time in seconds when a KeyID was used for TransformFromStorage and TransformToStorage operations
   222  func RecordKeyID(transformationType, providerName, keyID, apiServerID string) {
   223  	lockRecordKeyID.Lock()
   224  	defer lockRecordKeyID.Unlock()
   225  
   226  	keyIDHash, apiServerIDHash := addLabelToCache(keyIDHashTotalMetricLabels, transformationType, providerName, keyID, apiServerID)
   227  	KeyIDHashTotal.WithLabelValues(transformationType, providerName, keyIDHash, apiServerIDHash).Inc()
   228  	KeyIDHashLastTimestampSeconds.WithLabelValues(transformationType, providerName, keyIDHash, apiServerIDHash).SetToCurrentTime()
   229  }
   230  
   231  // RecordKeyIDFromStatus records last time in seconds when a KeyID was returned by the Status RPC call.
   232  func RecordKeyIDFromStatus(providerName, keyID, apiServerID string) {
   233  	lockRecordKeyIDStatus.Lock()
   234  	defer lockRecordKeyIDStatus.Unlock()
   235  
   236  	keyIDHash, apiServerIDHash := addLabelToCache(keyIDHashStatusLastTimestampSecondsMetricLabels, "", providerName, keyID, apiServerID)
   237  	KeyIDHashStatusLastTimestampSeconds.WithLabelValues(providerName, keyIDHash, apiServerIDHash).SetToCurrentTime()
   238  }
   239  
   240  func RecordInvalidKeyIDFromStatus(providerName, errCode string) {
   241  	InvalidKeyIDFromStatusTotal.WithLabelValues(providerName, errCode).Inc()
   242  }
   243  
   244  func RecordArrival(transformationType string, start time.Time) {
   245  	switch transformationType {
   246  	case FromStorageLabel:
   247  		lockLastFromStorage.Lock()
   248  		defer lockLastFromStorage.Unlock()
   249  
   250  		if lastFromStorage.IsZero() {
   251  			lastFromStorage = start
   252  		}
   253  		dekCacheInterArrivals.WithLabelValues(transformationType).Observe(start.Sub(lastFromStorage).Seconds())
   254  		lastFromStorage = start
   255  	case ToStorageLabel:
   256  		lockLastToStorage.Lock()
   257  		defer lockLastToStorage.Unlock()
   258  
   259  		if lastToStorage.IsZero() {
   260  			lastToStorage = start
   261  		}
   262  		dekCacheInterArrivals.WithLabelValues(transformationType).Observe(start.Sub(lastToStorage).Seconds())
   263  		lastToStorage = start
   264  	}
   265  }
   266  
   267  func RecordDekCacheFillPercent(percent float64) {
   268  	dekCacheFillPercent.Set(percent)
   269  }
   270  
   271  func RecordDekSourceCacheSize(providerName string, size int) {
   272  	DekSourceCacheSize.WithLabelValues(providerName).Set(float64(size))
   273  }
   274  
   275  // RecordKMSOperationLatency records the latency of KMS operation.
   276  func RecordKMSOperationLatency(providerName, methodName string, duration time.Duration, err error) {
   277  	KMSOperationsLatencyMetric.WithLabelValues(providerName, methodName, getErrorCode(err)).Observe(duration.Seconds())
   278  }
   279  
// gRPCError is satisfied by errors that can report a gRPC status.
// getErrorCode uses it with errors.As to find such an error anywhere in an
// error chain.
type gRPCError interface {
	GRPCStatus() *status.Status
}
   283  
   284  func getErrorCode(err error) string {
   285  	if err == nil {
   286  		return codes.OK.String()
   287  	}
   288  
   289  	// handle errors wrapped with fmt.Errorf and similar
   290  	var s gRPCError
   291  	if errors.As(err, &s) {
   292  		return s.GRPCStatus().Code().String()
   293  	}
   294  
   295  	// This is not gRPC error. The operation must have failed before gRPC
   296  	// method was called, otherwise we would get gRPC error.
   297  	return "unknown-non-grpc"
   298  }
   299  
   300  func getHash(data string) string {
   301  	if len(data) == 0 {
   302  		return ""
   303  	}
   304  	h := hashPool.Get().(hash.Hash)
   305  	h.Reset()
   306  	h.Write([]byte(data))
   307  	dataHash := fmt.Sprintf("sha256:%x", h.Sum(nil))
   308  	hashPool.Put(h)
   309  	return dataHash
   310  }
   311  
   312  func addLabelToCache(c *lru.Cache, transformationType, providerName, keyID, apiServerID string) (string, string) {
   313  	keyIDHash := getHash(keyID)
   314  	apiServerIDHash := getHash(apiServerID)
   315  	c.Add(metricLabels{
   316  		transformationType: transformationType,
   317  		providerName:       providerName,
   318  		keyIDHash:          keyIDHash,
   319  		apiServerIDHash:    apiServerIDHash,
   320  	}, nil) // value is irrelevant, this is a set and not a map
   321  	return keyIDHash, apiServerIDHash
   322  }