k8s.io/apiserver@v0.31.1/pkg/storage/storagebackend/factory/etcd3.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package factory

import (
	"context"
	"fmt"
	"log"
	"math/rand"
	"net"
	"net/url"
	"os"
	"path"
	"strings"
	"sync"
	"time"

	grpcprom "github.com/grpc-ecosystem/go-grpc-prometheus"
	"go.etcd.io/etcd/client/pkg/v3/logutil"
	"go.etcd.io/etcd/client/pkg/v3/transport"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
	"golang.org/x/time/rate"
	"google.golang.org/grpc"
	"k8s.io/klog/v2"

	"k8s.io/apimachinery/pkg/runtime"
	utilnet "k8s.io/apimachinery/pkg/util/net"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	genericfeatures "k8s.io/apiserver/pkg/features"
	"k8s.io/apiserver/pkg/server/egressselector"
	"k8s.io/apiserver/pkg/storage"
	"k8s.io/apiserver/pkg/storage/etcd3"
	"k8s.io/apiserver/pkg/storage/etcd3/metrics"
	"k8s.io/apiserver/pkg/storage/storagebackend"
	"k8s.io/apiserver/pkg/storage/value/encrypt/identity"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/component-base/metrics/legacyregistry"
	tracing "k8s.io/component-base/tracing"
)

const (
	// The short keepalive timeout and interval have been chosen to aggressively
	// detect a failed etcd server without introducing much overhead.
	keepaliveTime    = 30 * time.Second
	keepaliveTimeout = 10 * time.Second

	// dialTimeout is the timeout for failing to establish a connection.
	// It is set to 20 seconds as times shorter than that will cause TLS connections to fail
	// on heavily loaded arm64 CPUs (issue #64649)
	dialTimeout = 20 * time.Second

	dbMetricsMonitorJitter = 0.5
)

// TODO(negz): Stop using a package scoped logger. At the time of writing we're
// creating an etcd client for each CRD. We need to pass each etcd client a
// logger or each client will create its own, which comes with a significant
// memory cost (around 20% of the API server's memory when hundreds of CRDs are
// present). The correct fix here is to not create a client per CRD. See
// https://github.com/kubernetes/kubernetes/issues/111476 for more.
var etcd3ClientLogger *zap.Logger

func init() {
	// grpcprom auto-registers (via an init function) its client metrics. Since we are opting out of
	// the global prometheus registry and using our own wrapped global registry,
	// we need to explicitly register these metrics with our global registry here.
	// For reference: https://github.com/kubernetes/kubernetes/pull/81387
	legacyregistry.RawMustRegister(grpcprom.DefaultClientMetrics)
	dbMetricsMonitors = make(map[string]struct{})

	l, err := logutil.CreateDefaultZapLogger(etcdClientDebugLevel())
	if err != nil {
		l = zap.NewNop()
	}
	etcd3ClientLogger = l.Named("etcd-client")
}

// etcdClientDebugLevel translates ETCD_CLIENT_DEBUG into a zap log level.
// NOTE(negz): This is a copy of a private etcd client function:
// https://github.com/etcd-io/etcd/blob/v3.5.4/client/v3/logger.go#L47
func etcdClientDebugLevel() zapcore.Level {
	envLevel := os.Getenv("ETCD_CLIENT_DEBUG")
	if envLevel == "" || envLevel == "true" {
		return zapcore.InfoLevel
	}
	var l zapcore.Level
	if err := l.Set(envLevel); err != nil {
		log.Printf("Deprecated env ETCD_CLIENT_DEBUG value. Using default level: 'info'")
		return zapcore.InfoLevel
	}
	return l
}

func newETCD3HealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
	timeout := storagebackend.DefaultHealthcheckTimeout
	if c.HealthcheckTimeout != time.Duration(0) {
		timeout = c.HealthcheckTimeout
	}
	return newETCD3Check(c, timeout, stopCh)
}

func newETCD3ReadyCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
	timeout := storagebackend.DefaultReadinessTimeout
	if c.ReadycheckTimeout != time.Duration(0) {
		timeout = c.ReadycheckTimeout
	}
	return newETCD3Check(c, timeout, stopCh)
}

// atomicLastError acts as a cache that atomically stores an error.
// The error is only updated if its timestamp is more recent than that of the
// currently stored error.
type atomicLastError struct {
	mu        sync.RWMutex
	err       error
	timestamp time.Time
}

func (a *atomicLastError) Store(err error, t time.Time) {
	a.mu.Lock()
	defer a.mu.Unlock()
	if a.timestamp.IsZero() || a.timestamp.Before(t) {
		a.err = err
		a.timestamp = t
	}
}

func (a *atomicLastError) Load() error {
	a.mu.RLock()
	defer a.mu.RUnlock()
	return a.err
}

func newETCD3Check(c storagebackend.Config, timeout time.Duration, stopCh <-chan struct{}) (func() error, error) {
	// Constructing the etcd v3 client blocks and times out if etcd is not available.
	// Retry in a loop in the background until we successfully create the client,
	// storing the client or the error encountered.

	lock := sync.RWMutex{}
	var prober *etcd3ProberMonitor
	clientErr := fmt.Errorf("etcd client connection not yet established")

	go wait.PollImmediateUntil(time.Second, func() (bool, error) {
		lock.Lock()
		defer lock.Unlock()
		newProber, err := newETCD3ProberMonitor(c)
		// Ensure that the server is not already shutting down.
		select {
		case <-stopCh:
			if err == nil {
				newProber.Close()
			}
			return true, nil
		default:
		}
		if err != nil {
			clientErr = err
			return false, nil
		}
		prober = newProber
		clientErr = nil
		return true, nil
	}, stopCh)

	// Close the client on shutdown.
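	// The goroutine below takes the write lock, so it waits for any in-flight
	// healthcheck (which holds the read lock) to finish before closing the client.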
	go func() {
		defer utilruntime.HandleCrash()
		<-stopCh

		lock.Lock()
		defer lock.Unlock()
		if prober != nil {
			prober.Close()
			clientErr = fmt.Errorf("server is shutting down")
		}
	}()

	// Limit to one request per half of the configured timeout, with a maximum burst of one.
	// Rate limited requests will receive the error of the last request sent (note: not the last response received).
	limiter := rate.NewLimiter(rate.Every(timeout/2), 1)
	// initial state is the clientErr
	lastError := &atomicLastError{err: fmt.Errorf("etcd client connection not yet established")}

	return func() error {
		// Given that the client is closed on shutdown, we hold the lock for
		// the entire healthcheck call to ensure that the client will not be
		// closed during the healthcheck.
		// Given that healthchecks have a 2s timeout, blocking shutdown for an
		// additional 2s in the worst case seems acceptable.
		lock.RLock()
		defer lock.RUnlock()

		if clientErr != nil {
			return clientErr
		}
		if limiter.Allow() == false {
			return lastError.Load()
		}
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		defer cancel()
		now := time.Now()
		err := prober.Probe(ctx)
		lastError.Store(err, now)
		return err
	}, nil
}

func newETCD3ProberMonitor(c storagebackend.Config) (*etcd3ProberMonitor, error) {
	client, err := newETCD3Client(c.Transport)
	if err != nil {
		return nil, err
	}
	return &etcd3ProberMonitor{
		client:    client,
		prefix:    c.Prefix,
		endpoints: c.Transport.ServerList,
	}, nil
}

type etcd3ProberMonitor struct {
	prefix    string
	endpoints []string

	mux    sync.RWMutex
	client *clientv3.Client
	closed bool
}

func (t *etcd3ProberMonitor) Close() error {
	t.mux.Lock()
	defer t.mux.Unlock()
	if !t.closed {
		t.closed = true
		return t.client.Close()
	}
	return fmt.Errorf("closed")
}

func (t *etcd3ProberMonitor) Probe(ctx context.Context) error {
	t.mux.RLock()
	defer t.mux.RUnlock()
	if t.closed {
		return fmt.Errorf("closed")
	}
	// See https://github.com/etcd-io/etcd/blob/c57f8b3af865d1b531b979889c602ba14377420e/etcdctl/ctlv3/command/ep_command.go#L118
	_, err := t.client.Get(ctx, path.Join("/", t.prefix, "health"))
	if err != nil {
		return fmt.Errorf("error getting data from etcd: %w", err)
	}
	return nil
}

func (t *etcd3ProberMonitor) Monitor(ctx context.Context) (metrics.StorageMetrics, error) {
	t.mux.RLock()
	defer t.mux.RUnlock()
	if t.closed {
		return metrics.StorageMetrics{}, fmt.Errorf("closed")
	}
	status, err := t.client.Status(ctx, t.endpoints[rand.Int()%len(t.endpoints)])
	if err != nil {
		return metrics.StorageMetrics{}, err
	}
	return metrics.StorageMetrics{
		Size: status.DbSize,
	}, nil
}

var newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, error) {
	tlsInfo := transport.TLSInfo{
		CertFile:      c.CertFile,
		KeyFile:       c.KeyFile,
		TrustedCAFile: c.TrustedCAFile,
	}
	tlsConfig, err := tlsInfo.ClientConfig()
	if err != nil {
		return nil, err
	}
	// NOTE: The client relies on a nil tlsConfig for non-secure connections,
	// so reset the implicitly created config.
	if len(c.CertFile) == 0 && len(c.KeyFile) == 0 && len(c.TrustedCAFile) == 0 {
		tlsConfig = nil
	}
	networkContext := egressselector.Etcd.AsNetworkContext()
	var egressDialer utilnet.DialFunc
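	// When an egress selector is configured for etcd traffic, resolve its dialer
	// here; it is wired into the gRPC dial options below.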
	if c.EgressLookup != nil {
		egressDialer, err = c.EgressLookup(networkContext)
		if err != nil {
			return nil, err
		}
	}
	dialOptions := []grpc.DialOption{
		grpc.WithBlock(), // block until the underlying connection is up
		// use chained interceptors so that the default (retry and backoff) interceptors are added.
		// otherwise they will be overwritten by the metric interceptor.
		//
		// these optional interceptors will be placed after the default ones,
		// which seems to be what we want as the metrics will be collected on each attempt (retry).
		grpc.WithChainUnaryInterceptor(grpcprom.UnaryClientInterceptor),
		grpc.WithChainStreamInterceptor(grpcprom.StreamClientInterceptor),
	}
	if utilfeature.DefaultFeatureGate.Enabled(genericfeatures.APIServerTracing) {
		tracingOpts := []otelgrpc.Option{
			otelgrpc.WithMessageEvents(otelgrpc.ReceivedEvents, otelgrpc.SentEvents),
			otelgrpc.WithPropagators(tracing.Propagators()),
			otelgrpc.WithTracerProvider(c.TracerProvider),
		}
		// Even with a Noop TracerProvider, otelgrpc still handles context propagation.
		// See https://github.com/open-telemetry/opentelemetry-go/tree/main/example/passthrough
		dialOptions = append(dialOptions,
			grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor(tracingOpts...)),
			grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor(tracingOpts...)))
	}
	if egressDialer != nil {
		dialer := func(ctx context.Context, addr string) (net.Conn, error) {
			if strings.Contains(addr, "//") {
				// etcd clients prior to 3.5 passed URLs to the dialer; normalize to a host:port address.
				u, err := url.Parse(addr)
				if err != nil {
					return nil, err
				}
				addr = u.Host
			}
			return egressDialer(ctx, "tcp", addr)
		}
		dialOptions = append(dialOptions, grpc.WithContextDialer(dialer))
	}

	cfg := clientv3.Config{
		DialTimeout:          dialTimeout,
		DialKeepAliveTime:    keepaliveTime,
		DialKeepAliveTimeout: keepaliveTimeout,
		DialOptions:          dialOptions,
		Endpoints:            c.ServerList,
		TLS:                  tlsConfig,
		Logger:               etcd3ClientLogger,
	}

	return clientv3.New(cfg)
}

type runningCompactor struct {
	interval time.Duration
	cancel   context.CancelFunc
	client   *clientv3.Client
	refs     int
}

var (
	// compactorsMu guards access to the compactors map
	compactorsMu sync.Mutex
	compactors   = map[string]*runningCompactor{}
	// dbMetricsMonitorsMu guards access to the dbMetricsMonitors map
	dbMetricsMonitorsMu sync.Mutex
	dbMetricsMonitors   map[string]struct{}
)

// startCompactorOnce starts one compactor per transport. If the interval gets smaller on repeated calls, the
// compactor is replaced. A destroy func is returned. If all destroy funcs with the same transport are called,
// the compactor is stopped.
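// Compactors are keyed by the transport config and ref-counted, so multiple
// storages backed by the same etcd cluster share a single compaction loop.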
func startCompactorOnce(c storagebackend.TransportConfig, interval time.Duration) (func(), error) {
	compactorsMu.Lock()
	defer compactorsMu.Unlock()

	key := fmt.Sprintf("%v", c) // gives: {[server1 server2] keyFile certFile caFile}
	if compactor, foundBefore := compactors[key]; !foundBefore || compactor.interval > interval {
		compactorClient, err := newETCD3Client(c)
		if err != nil {
			return nil, err
		}

		if foundBefore {
			// replace compactor
			compactor.cancel()
			compactor.client.Close()
		} else {
			// start new compactor
			compactor = &runningCompactor{}
			compactors[key] = compactor
		}

		ctx, cancel := context.WithCancel(context.Background())

		compactor.interval = interval
		compactor.cancel = cancel
		compactor.client = compactorClient

		etcd3.StartCompactor(ctx, compactorClient, interval)
	}

	compactors[key].refs++

	return func() {
		compactorsMu.Lock()
		defer compactorsMu.Unlock()

		compactor := compactors[key]
		compactor.refs--
		if compactor.refs == 0 {
			compactor.cancel()
			compactor.client.Close()
			delete(compactors, key)
		}
	}, nil
}

func newETCD3Storage(c storagebackend.ConfigForResource, newFunc, newListFunc func() runtime.Object, resourcePrefix string) (storage.Interface, DestroyFunc, error) {
	stopCompactor, err := startCompactorOnce(c.Transport, c.CompactionInterval)
	if err != nil {
		return nil, nil, err
	}

	client, err := newETCD3Client(c.Transport)
	if err != nil {
		stopCompactor()
		return nil, nil, err
	}

	// decorate the KV instance so we can track etcd latency per request.
	client.KV = etcd3.NewETCDLatencyTracker(client.KV)

	stopDBSizeMonitor, err := startDBSizeMonitorPerEndpoint(client, c.DBMetricPollInterval)
	if err != nil {
		return nil, nil, err
	}

	var once sync.Once
	destroyFunc := func() {
		// we know that storage destroy funcs are called multiple times (due to reuse in subresources).
		// Hence, we only destroy once.
		// TODO: fix duplicated storage destroy calls at a higher level
		once.Do(func() {
			stopCompactor()
			stopDBSizeMonitor()
			client.Close()
		})
	}
	transformer := c.Transformer
	if transformer == nil {
		transformer = identity.NewEncryptCheckTransformer()
	}
	return etcd3.New(client, c.Codec, newFunc, newListFunc, c.Prefix, resourcePrefix, c.GroupResource, transformer, c.LeaseManagerConfig), destroyFunc, nil
}

// startDBSizeMonitorPerEndpoint starts a loop to monitor etcd database size and update the
// corresponding metric etcd_db_total_size_in_bytes for each etcd server endpoint.
// Deprecated: Will be replaced with newETCD3ProberMonitor
func startDBSizeMonitorPerEndpoint(client *clientv3.Client, interval time.Duration) (func(), error) {
	if interval == 0 {
		return func() {}, nil
	}
	dbMetricsMonitorsMu.Lock()
	defer dbMetricsMonitorsMu.Unlock()

	ctx, cancel := context.WithCancel(context.Background())
	for _, ep := range client.Endpoints() {
		if _, found := dbMetricsMonitors[ep]; found {
			continue
		}
		dbMetricsMonitors[ep] = struct{}{}
		endpoint := ep
		klog.V(4).Infof("Start monitoring storage db size metric for endpoint %s with polling interval %v", endpoint, interval)
		go wait.JitterUntilWithContext(ctx, func(context.Context) {
			epStatus, err := client.Maintenance.Status(ctx, endpoint)
			if err != nil {
				klog.V(4).Infof("Failed to get storage db size for ep %s: %v", endpoint, err)
				metrics.UpdateEtcdDbSize(endpoint, -1)
			} else {
				metrics.UpdateEtcdDbSize(endpoint, epStatus.DbSize)
			}
		}, interval, dbMetricsMonitorJitter, true)
	}

	return func() {
		cancel()
	}, nil
}
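
// Illustrative sketch: one plausible way this factory is driven, assuming the
// exported Create helper in this package dispatches to newETCD3Storage for the
// etcd3 backend. The endpoint, codec, newFunc/newListFunc and the "pods"
// resource below are placeholders, and schema refers to
// k8s.io/apimachinery/pkg/runtime/schema.
//
//	cfg := storagebackend.NewDefaultConfig("/registry", codec)
//	cfg.Transport.ServerList = []string{"https://127.0.0.1:2379"}
//	store, destroy, err := Create(*cfg.ForResource(schema.GroupResource{Resource: "pods"}), newFunc, newListFunc, "/pods")
//	if err != nil {
//		// handle the error
//	}
//	defer destroy()
//	_ = store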