k8s.io/apiserver@v0.31.1/pkg/storage/value/encrypt/envelope/metrics/metrics.go (about) 1 /* 2 Copyright 2020 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package metrics 18 19 import ( 20 "crypto/sha256" 21 "errors" 22 "fmt" 23 "hash" 24 "sync" 25 "time" 26 27 "google.golang.org/grpc/codes" 28 "google.golang.org/grpc/status" 29 30 "k8s.io/component-base/metrics" 31 "k8s.io/component-base/metrics/legacyregistry" 32 "k8s.io/klog/v2" 33 "k8s.io/utils/lru" 34 ) 35 36 const ( 37 namespace = "apiserver" 38 subsystem = "envelope_encryption" 39 FromStorageLabel = "from_storage" 40 ToStorageLabel = "to_storage" 41 ) 42 43 type metricLabels struct { 44 transformationType string 45 providerName string 46 keyIDHash string 47 apiServerIDHash string 48 } 49 50 /* 51 * By default, all the following metrics are defined as falling under 52 * ALPHA stability level https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/kubernetes-control-plane-metrics-stability.md#stability-classes) 53 * 54 * Promoting the stability level of the metric is a responsibility of the component owner, since it 55 * involves explicitly acknowledging support for the metric across multiple releases, in accordance with 56 * the metric stability policy. 57 */ 58 var ( 59 lockLastFromStorage sync.Mutex 60 lockLastToStorage sync.Mutex 61 lockRecordKeyID sync.Mutex 62 lockRecordKeyIDStatus sync.Mutex 63 64 lastFromStorage time.Time 65 lastToStorage time.Time 66 keyIDHashTotalMetricLabels *lru.Cache 67 keyIDHashStatusLastTimestampSecondsMetricLabels *lru.Cache 68 cacheSize = 100 69 70 // This metric is only used for KMS v1 API. 71 dekCacheFillPercent = metrics.NewGauge( 72 &metrics.GaugeOpts{ 73 Namespace: namespace, 74 Subsystem: subsystem, 75 Name: "dek_cache_fill_percent", 76 Help: "Percent of the cache slots currently occupied by cached DEKs.", 77 StabilityLevel: metrics.ALPHA, 78 }, 79 ) 80 81 // This metric is only used for KMS v1 API. 82 dekCacheInterArrivals = metrics.NewHistogramVec( 83 &metrics.HistogramOpts{ 84 Namespace: namespace, 85 Subsystem: subsystem, 86 Name: "dek_cache_inter_arrival_time_seconds", 87 Help: "Time (in seconds) of inter arrival of transformation requests.", 88 StabilityLevel: metrics.ALPHA, 89 Buckets: metrics.ExponentialBuckets(60, 2, 10), 90 }, 91 []string{"transformation_type"}, 92 ) 93 94 // These metrics are made public to be used by unit tests. 95 KMSOperationsLatencyMetric = metrics.NewHistogramVec( 96 &metrics.HistogramOpts{ 97 Namespace: namespace, 98 Subsystem: subsystem, 99 Name: "kms_operations_latency_seconds", 100 Help: "KMS operation duration with gRPC error code status total.", 101 StabilityLevel: metrics.ALPHA, 102 // Use custom buckets to avoid the default buckets which are too small for KMS operations. 103 // Start 0.1ms with the last bucket being [~52s, +Inf) 104 Buckets: metrics.ExponentialBuckets(0.0001, 2, 20), 105 }, 106 []string{"provider_name", "method_name", "grpc_status_code"}, 107 ) 108 109 // keyIDHashTotal is the number of times a keyID is used 110 // e.g. apiserver_envelope_encryption_key_id_hash_total counter 111 // apiserver_envelope_encryption_key_id_hash_total{apiserver_id_hash="sha256",key_id_hash="sha256", 112 // provider_name="providerName",transformation_type="from_storage"} 1 113 KeyIDHashTotal = metrics.NewCounterVec( 114 &metrics.CounterOpts{ 115 Namespace: namespace, 116 Subsystem: subsystem, 117 Name: "key_id_hash_total", 118 Help: "Number of times a keyID is used split by transformation type, provider, and apiserver identity.", 119 StabilityLevel: metrics.ALPHA, 120 }, 121 []string{"transformation_type", "provider_name", "key_id_hash", "apiserver_id_hash"}, 122 ) 123 124 // keyIDHashLastTimestampSeconds is the last time in seconds when a keyID was used 125 // e.g. apiserver_envelope_encryption_key_id_hash_last_timestamp_seconds{apiserver_id_hash="sha256",key_id_hash="sha256", provider_name="providerName",transformation_type="from_storage"} 1.674865558833728e+09 126 KeyIDHashLastTimestampSeconds = metrics.NewGaugeVec( 127 &metrics.GaugeOpts{ 128 Namespace: namespace, 129 Subsystem: subsystem, 130 Name: "key_id_hash_last_timestamp_seconds", 131 Help: "The last time in seconds when a keyID was used.", 132 StabilityLevel: metrics.ALPHA, 133 }, 134 []string{"transformation_type", "provider_name", "key_id_hash", "apiserver_id_hash"}, 135 ) 136 137 // keyIDHashStatusLastTimestampSeconds is the last time in seconds when a keyID was returned by the Status RPC call. 138 // e.g. apiserver_envelope_encryption_key_id_hash_status_last_timestamp_seconds{apiserver_id_hash="sha256",key_id_hash="sha256", provider_name="providerName"} 1.674865558833728e+09 139 KeyIDHashStatusLastTimestampSeconds = metrics.NewGaugeVec( 140 &metrics.GaugeOpts{ 141 Namespace: namespace, 142 Subsystem: subsystem, 143 Name: "key_id_hash_status_last_timestamp_seconds", 144 Help: "The last time in seconds when a keyID was returned by the Status RPC call.", 145 StabilityLevel: metrics.ALPHA, 146 }, 147 []string{"provider_name", "key_id_hash", "apiserver_id_hash"}, 148 ) 149 150 InvalidKeyIDFromStatusTotal = metrics.NewCounterVec( 151 &metrics.CounterOpts{ 152 Namespace: namespace, 153 Subsystem: subsystem, 154 Name: "invalid_key_id_from_status_total", 155 Help: "Number of times an invalid keyID is returned by the Status RPC call split by error.", 156 StabilityLevel: metrics.ALPHA, 157 }, 158 []string{"provider_name", "error"}, 159 ) 160 161 DekSourceCacheSize = metrics.NewGaugeVec( 162 &metrics.GaugeOpts{ 163 Namespace: namespace, 164 Subsystem: subsystem, 165 Name: "dek_source_cache_size", 166 Help: "Number of records in data encryption key (DEK) source cache. On a restart, this value is an approximation of the number of decrypt RPC calls the server will make to the KMS plugin.", 167 StabilityLevel: metrics.ALPHA, 168 }, 169 []string{"provider_name"}, 170 ) 171 ) 172 173 var registerMetricsFunc sync.Once 174 var hashPool *sync.Pool 175 176 func registerLRUMetrics() { 177 if keyIDHashTotalMetricLabels != nil { 178 keyIDHashTotalMetricLabels.Clear() 179 } 180 if keyIDHashStatusLastTimestampSecondsMetricLabels != nil { 181 keyIDHashStatusLastTimestampSecondsMetricLabels.Clear() 182 } 183 184 keyIDHashTotalMetricLabels = lru.NewWithEvictionFunc(cacheSize, func(key lru.Key, _ interface{}) { 185 item := key.(metricLabels) 186 if deleted := KeyIDHashTotal.DeleteLabelValues(item.transformationType, item.providerName, item.keyIDHash, item.apiServerIDHash); deleted { 187 klog.InfoS("Deleted keyIDHashTotalMetricLabels", "transformationType", item.transformationType, 188 "providerName", item.providerName, "keyIDHash", item.keyIDHash, "apiServerIDHash", item.apiServerIDHash) 189 } 190 if deleted := KeyIDHashLastTimestampSeconds.DeleteLabelValues(item.transformationType, item.providerName, item.keyIDHash, item.apiServerIDHash); deleted { 191 klog.InfoS("Deleted keyIDHashLastTimestampSecondsMetricLabels", "transformationType", item.transformationType, 192 "providerName", item.providerName, "keyIDHash", item.keyIDHash, "apiServerIDHash", item.apiServerIDHash) 193 } 194 }) 195 keyIDHashStatusLastTimestampSecondsMetricLabels = lru.NewWithEvictionFunc(cacheSize, func(key lru.Key, _ interface{}) { 196 item := key.(metricLabels) 197 if deleted := KeyIDHashStatusLastTimestampSeconds.DeleteLabelValues(item.providerName, item.keyIDHash, item.apiServerIDHash); deleted { 198 klog.InfoS("Deleted keyIDHashStatusLastTimestampSecondsMetricLabels", "providerName", item.providerName, "keyIDHash", item.keyIDHash, "apiServerIDHash", item.apiServerIDHash) 199 } 200 }) 201 } 202 func RegisterMetrics() { 203 registerMetricsFunc.Do(func() { 204 registerLRUMetrics() 205 hashPool = &sync.Pool{ 206 New: func() interface{} { 207 return sha256.New() 208 }, 209 } 210 legacyregistry.MustRegister(dekCacheFillPercent) 211 legacyregistry.MustRegister(dekCacheInterArrivals) 212 legacyregistry.MustRegister(DekSourceCacheSize) 213 legacyregistry.MustRegister(KeyIDHashTotal) 214 legacyregistry.MustRegister(KeyIDHashLastTimestampSeconds) 215 legacyregistry.MustRegister(KeyIDHashStatusLastTimestampSeconds) 216 legacyregistry.MustRegister(InvalidKeyIDFromStatusTotal) 217 legacyregistry.MustRegister(KMSOperationsLatencyMetric) 218 }) 219 } 220 221 // RecordKeyID records total count and last time in seconds when a KeyID was used for TransformFromStorage and TransformToStorage operations 222 func RecordKeyID(transformationType, providerName, keyID, apiServerID string) { 223 lockRecordKeyID.Lock() 224 defer lockRecordKeyID.Unlock() 225 226 keyIDHash, apiServerIDHash := addLabelToCache(keyIDHashTotalMetricLabels, transformationType, providerName, keyID, apiServerID) 227 KeyIDHashTotal.WithLabelValues(transformationType, providerName, keyIDHash, apiServerIDHash).Inc() 228 KeyIDHashLastTimestampSeconds.WithLabelValues(transformationType, providerName, keyIDHash, apiServerIDHash).SetToCurrentTime() 229 } 230 231 // RecordKeyIDFromStatus records last time in seconds when a KeyID was returned by the Status RPC call. 232 func RecordKeyIDFromStatus(providerName, keyID, apiServerID string) { 233 lockRecordKeyIDStatus.Lock() 234 defer lockRecordKeyIDStatus.Unlock() 235 236 keyIDHash, apiServerIDHash := addLabelToCache(keyIDHashStatusLastTimestampSecondsMetricLabels, "", providerName, keyID, apiServerID) 237 KeyIDHashStatusLastTimestampSeconds.WithLabelValues(providerName, keyIDHash, apiServerIDHash).SetToCurrentTime() 238 } 239 240 func RecordInvalidKeyIDFromStatus(providerName, errCode string) { 241 InvalidKeyIDFromStatusTotal.WithLabelValues(providerName, errCode).Inc() 242 } 243 244 func RecordArrival(transformationType string, start time.Time) { 245 switch transformationType { 246 case FromStorageLabel: 247 lockLastFromStorage.Lock() 248 defer lockLastFromStorage.Unlock() 249 250 if lastFromStorage.IsZero() { 251 lastFromStorage = start 252 } 253 dekCacheInterArrivals.WithLabelValues(transformationType).Observe(start.Sub(lastFromStorage).Seconds()) 254 lastFromStorage = start 255 case ToStorageLabel: 256 lockLastToStorage.Lock() 257 defer lockLastToStorage.Unlock() 258 259 if lastToStorage.IsZero() { 260 lastToStorage = start 261 } 262 dekCacheInterArrivals.WithLabelValues(transformationType).Observe(start.Sub(lastToStorage).Seconds()) 263 lastToStorage = start 264 } 265 } 266 267 func RecordDekCacheFillPercent(percent float64) { 268 dekCacheFillPercent.Set(percent) 269 } 270 271 func RecordDekSourceCacheSize(providerName string, size int) { 272 DekSourceCacheSize.WithLabelValues(providerName).Set(float64(size)) 273 } 274 275 // RecordKMSOperationLatency records the latency of KMS operation. 276 func RecordKMSOperationLatency(providerName, methodName string, duration time.Duration, err error) { 277 KMSOperationsLatencyMetric.WithLabelValues(providerName, methodName, getErrorCode(err)).Observe(duration.Seconds()) 278 } 279 280 type gRPCError interface { 281 GRPCStatus() *status.Status 282 } 283 284 func getErrorCode(err error) string { 285 if err == nil { 286 return codes.OK.String() 287 } 288 289 // handle errors wrapped with fmt.Errorf and similar 290 var s gRPCError 291 if errors.As(err, &s) { 292 return s.GRPCStatus().Code().String() 293 } 294 295 // This is not gRPC error. The operation must have failed before gRPC 296 // method was called, otherwise we would get gRPC error. 297 return "unknown-non-grpc" 298 } 299 300 func getHash(data string) string { 301 if len(data) == 0 { 302 return "" 303 } 304 h := hashPool.Get().(hash.Hash) 305 h.Reset() 306 h.Write([]byte(data)) 307 dataHash := fmt.Sprintf("sha256:%x", h.Sum(nil)) 308 hashPool.Put(h) 309 return dataHash 310 } 311 312 func addLabelToCache(c *lru.Cache, transformationType, providerName, keyID, apiServerID string) (string, string) { 313 keyIDHash := getHash(keyID) 314 apiServerIDHash := getHash(apiServerID) 315 c.Add(metricLabels{ 316 transformationType: transformationType, 317 providerName: providerName, 318 keyIDHash: keyIDHash, 319 apiServerIDHash: apiServerIDHash, 320 }, nil) // value is irrelevant, this is a set and not a map 321 return keyIDHash, apiServerIDHash 322 }