k8s.io/apiserver@v0.29.3/pkg/storage/storagebackend/factory/etcd3.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package factory

import (
	"context"
	"fmt"
	"log"
	"math/rand"
	"net"
	"net/url"
	"os"
	"path"
	"strings"
	"sync"
	"time"

	grpcprom "github.com/grpc-ecosystem/go-grpc-prometheus"
	"go.etcd.io/etcd/client/pkg/v3/logutil"
	"go.etcd.io/etcd/client/pkg/v3/transport"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
	"golang.org/x/time/rate"
	"google.golang.org/grpc"
	"k8s.io/klog/v2"

	"k8s.io/apimachinery/pkg/runtime"
	utilnet "k8s.io/apimachinery/pkg/util/net"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	genericfeatures "k8s.io/apiserver/pkg/features"
	"k8s.io/apiserver/pkg/server/egressselector"
	"k8s.io/apiserver/pkg/storage"
	"k8s.io/apiserver/pkg/storage/etcd3"
	"k8s.io/apiserver/pkg/storage/etcd3/metrics"
	"k8s.io/apiserver/pkg/storage/storagebackend"
	"k8s.io/apiserver/pkg/storage/value/encrypt/identity"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/component-base/metrics/legacyregistry"
	tracing "k8s.io/component-base/tracing"
)

const (
	// The short keepalive timeout and interval have been chosen to aggressively
	// detect a failed etcd server without introducing much overhead.
	keepaliveTime    = 30 * time.Second
	keepaliveTimeout = 10 * time.Second

	// dialTimeout is the timeout for failing to establish a connection.
	// It is set to 20 seconds because shorter timeouts cause TLS connections to
	// fail on heavily loaded arm64 CPUs (issue #64649).
	dialTimeout = 20 * time.Second

	dbMetricsMonitorJitter = 0.5
)

// TODO(negz): Stop using a package scoped logger. At the time of writing we're
// creating an etcd client for each CRD. We need to pass each etcd client a
// logger, or each client will create its own, which comes with a significant
// memory cost (around 20% of the API server's memory when hundreds of CRDs are
// present). The correct fix here is to not create a client per CRD. See
// https://github.com/kubernetes/kubernetes/issues/111476 for more.
var etcd3ClientLogger *zap.Logger

func init() {
	// grpcprom auto-registers (via an init function) its client metrics. Since we
	// opt out of the global prometheus registry in favor of our own wrapped global
	// registry, we need to explicitly register these metrics to our registry here.
	// For reference: https://github.com/kubernetes/kubernetes/pull/81387
	legacyregistry.RawMustRegister(grpcprom.DefaultClientMetrics)
	dbMetricsMonitors = make(map[string]struct{})

	l, err := logutil.CreateDefaultZapLogger(etcdClientDebugLevel())
	if err != nil {
		l = zap.NewNop()
	}
	etcd3ClientLogger = l.Named("etcd-client")
}

// etcdClientDebugLevel translates ETCD_CLIENT_DEBUG into a zap log level.
// NOTE(negz): This is a copy of a private etcd client function:
// https://github.com/etcd-io/etcd/blob/v3.5.4/client/v3/logger.go#L47
func etcdClientDebugLevel() zapcore.Level {
	envLevel := os.Getenv("ETCD_CLIENT_DEBUG")
	if envLevel == "" || envLevel == "true" {
		return zapcore.InfoLevel
	}
	var l zapcore.Level
	if err := l.Set(envLevel); err != nil {
		log.Printf("Invalid value for environment variable 'ETCD_CLIENT_DEBUG'. Using default level: 'info'")
		return zapcore.InfoLevel
	}
	return l
}
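
// exampleEtcdClientDebugLevel is an illustrative sketch, not part of the
// upstream file. It shows how a few representative ETCD_CLIENT_DEBUG values
// map to zap levels under the rules above; the values below are hypothetical.
func exampleEtcdClientDebugLevel() {
	os.Setenv("ETCD_CLIENT_DEBUG", "warn")
	_ = etcdClientDebugLevel() // parses cleanly: zapcore.WarnLevel
	os.Setenv("ETCD_CLIENT_DEBUG", "true")
	_ = etcdClientDebugLevel() // legacy boolean form: zapcore.InfoLevel
	os.Setenv("ETCD_CLIENT_DEBUG", "verbose")
	_ = etcdClientDebugLevel() // not a zap level: falls back to zapcore.InfoLevel
}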

func newETCD3HealthCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
	timeout := storagebackend.DefaultHealthcheckTimeout
	if c.HealthcheckTimeout != time.Duration(0) {
		timeout = c.HealthcheckTimeout
	}
	return newETCD3Check(c, timeout, stopCh)
}

func newETCD3ReadyCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() error, error) {
	timeout := storagebackend.DefaultReadinessTimeout
	if c.ReadycheckTimeout != time.Duration(0) {
		timeout = c.ReadycheckTimeout
	}
	return newETCD3Check(c, timeout, stopCh)
}

// atomicLastError acts as a cache that atomically stores an error.
// The error is only updated if its timestamp is more recent than that of the
// currently stored error.
type atomicLastError struct {
	mu        sync.RWMutex
	err       error
	timestamp time.Time
}

func (a *atomicLastError) Store(err error, t time.Time) {
	a.mu.Lock()
	defer a.mu.Unlock()
	if a.timestamp.IsZero() || a.timestamp.Before(t) {
		a.err = err
		a.timestamp = t
	}
}

func (a *atomicLastError) Load() error {
	a.mu.RLock()
	defer a.mu.RUnlock()
	return a.err
}

func newETCD3Check(c storagebackend.Config, timeout time.Duration, stopCh <-chan struct{}) (func() error, error) {
	// Constructing the etcd v3 client blocks and times out if etcd is not available.
	// Retry in a loop in the background until we successfully create the client,
	// storing the client or the error encountered.

	lock := sync.RWMutex{}
	var prober *etcd3ProberMonitor
	clientErr := fmt.Errorf("etcd client connection not yet established")

	go wait.PollImmediateUntil(time.Second, func() (bool, error) {
		lock.Lock()
		defer lock.Unlock()
		newProber, err := newETCD3ProberMonitor(c)
		// Ensure that the server is not already shutting down.
		select {
		case <-stopCh:
			if err == nil {
				newProber.Close()
			}
			return true, nil
		default:
		}
		if err != nil {
			clientErr = err
			return false, nil
		}
		prober = newProber
		clientErr = nil
		return true, nil
	}, stopCh)

	// Close the client on shutdown.
	go func() {
		defer utilruntime.HandleCrash()
		<-stopCh

		lock.Lock()
		defer lock.Unlock()
		if prober != nil {
			prober.Close()
			clientErr = fmt.Errorf("server is shutting down")
		}
	}()

	// Limit to one request every half of the configured timeout with a maximum
	// burst of one. Rate-limited requests receive the error of the last request
	// actually sent (note: not the last received response).
	limiter := rate.NewLimiter(rate.Every(timeout/2), 1)
	// The initial state is the clientErr.
	lastError := &atomicLastError{err: fmt.Errorf("etcd client connection not yet established")}

	return func() error {
		// Given that the client is closed on shutdown, we hold the lock for the
		// entire healthcheck call to ensure that the client cannot be closed
		// mid-check. Given that the healthcheck has a 2s timeout, blocking
		// shutdown for an additional 2s in the worst case seems acceptable.
		lock.RLock()
		defer lock.RUnlock()

		if clientErr != nil {
			return clientErr
		}
		if !limiter.Allow() {
			return lastError.Load()
		}
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		defer cancel()
		now := time.Now()
		err := prober.Probe(ctx)
		lastError.Store(err, now)
		return err
	}, nil
}
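
// exampleNewETCD3HealthCheck is an illustrative sketch, not part of the
// upstream file. It shows how the closure returned by newETCD3HealthCheck
// might be wired up; the endpoint, timeout, and stop channel are hypothetical.
func exampleNewETCD3HealthCheck() {
	stopCh := make(chan struct{})
	defer close(stopCh)

	cfg := storagebackend.Config{
		HealthcheckTimeout: 2 * time.Second, // hypothetical override of the default
		Transport: storagebackend.TransportConfig{
			ServerList: []string{"https://etcd-1:2379"}, // hypothetical endpoint
		},
	}
	healthz, err := newETCD3HealthCheck(cfg, stopCh)
	if err != nil {
		klog.Errorf("building etcd healthcheck: %v", err)
		return
	}
	// Calls arriving faster than once per timeout/2 are rate limited and
	// return the error of the last probe that was actually sent.
	if err := healthz(); err != nil {
		klog.Warningf("etcd healthz failed: %v", err)
	}
}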

func newETCD3ProberMonitor(c storagebackend.Config) (*etcd3ProberMonitor, error) {
	client, err := newETCD3Client(c.Transport)
	if err != nil {
		return nil, err
	}
	return &etcd3ProberMonitor{
		client:    client,
		prefix:    c.Prefix,
		endpoints: c.Transport.ServerList,
	}, nil
}

type etcd3ProberMonitor struct {
	prefix    string
	endpoints []string

	mux    sync.RWMutex
	client *clientv3.Client
	closed bool
}

func (t *etcd3ProberMonitor) Close() error {
	t.mux.Lock()
	defer t.mux.Unlock()
	if !t.closed {
		t.closed = true
		return t.client.Close()
	}
	return fmt.Errorf("closed")
}

func (t *etcd3ProberMonitor) Probe(ctx context.Context) error {
	t.mux.RLock()
	defer t.mux.RUnlock()
	if t.closed {
		return fmt.Errorf("closed")
	}
	// See https://github.com/etcd-io/etcd/blob/c57f8b3af865d1b531b979889c602ba14377420e/etcdctl/ctlv3/command/ep_command.go#L118
	_, err := t.client.Get(ctx, path.Join("/", t.prefix, "health"))
	if err != nil {
		return fmt.Errorf("error getting data from etcd: %w", err)
	}
	return nil
}

func (t *etcd3ProberMonitor) Monitor(ctx context.Context) (metrics.StorageMetrics, error) {
	t.mux.RLock()
	defer t.mux.RUnlock()
	if t.closed {
		return metrics.StorageMetrics{}, fmt.Errorf("closed")
	}
	// Query the status of a randomly chosen endpoint.
	status, err := t.client.Status(ctx, t.endpoints[rand.Int()%len(t.endpoints)])
	if err != nil {
		return metrics.StorageMetrics{}, err
	}
	return metrics.StorageMetrics{
		Size: status.DbSize,
	}, nil
}
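
// exampleProberMonitorUsage is an illustrative sketch, not part of the
// upstream file. It shows the Probe/Monitor lifecycle of etcd3ProberMonitor;
// the prefix and endpoint are hypothetical.
func exampleProberMonitorUsage() {
	cfg := storagebackend.Config{
		Prefix: "/registry", // hypothetical storage prefix
		Transport: storagebackend.TransportConfig{
			ServerList: []string{"https://etcd-1:2379"}, // hypothetical endpoint
		},
	}
	pm, err := newETCD3ProberMonitor(cfg)
	if err != nil {
		klog.Errorf("building prober: %v", err)
		return
	}
	defer pm.Close() // Probe and Monitor return an error once Close has been called

	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	if err := pm.Probe(ctx); err != nil {
		klog.Warningf("etcd probe failed: %v", err)
	}
	if m, err := pm.Monitor(ctx); err == nil {
		klog.Infof("etcd db size: %d bytes", m.Size)
	}
}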

var newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, error) {
	tlsInfo := transport.TLSInfo{
		CertFile:      c.CertFile,
		KeyFile:       c.KeyFile,
		TrustedCAFile: c.TrustedCAFile,
	}
	tlsConfig, err := tlsInfo.ClientConfig()
	if err != nil {
		return nil, err
	}
	// NOTE: The client relies on a nil tlsConfig for non-secure connections,
	// so reset the implicitly constructed config.
	if len(c.CertFile) == 0 && len(c.KeyFile) == 0 && len(c.TrustedCAFile) == 0 {
		tlsConfig = nil
	}
	networkContext := egressselector.Etcd.AsNetworkContext()
	var egressDialer utilnet.DialFunc
	if c.EgressLookup != nil {
		egressDialer, err = c.EgressLookup(networkContext)
		if err != nil {
			return nil, err
		}
	}
	dialOptions := []grpc.DialOption{
		grpc.WithBlock(), // block until the underlying connection is up
		// Use chained interceptors so that the default (retry and backoff)
		// interceptors are added; otherwise they would be overwritten by the
		// metric interceptor.
		//
		// These optional interceptors are placed after the default ones, which
		// seems to be what we want, as the metrics will be collected on each
		// attempt (retry).
		grpc.WithChainUnaryInterceptor(grpcprom.UnaryClientInterceptor),
		grpc.WithChainStreamInterceptor(grpcprom.StreamClientInterceptor),
	}
	if utilfeature.DefaultFeatureGate.Enabled(genericfeatures.APIServerTracing) {
		tracingOpts := []otelgrpc.Option{
			otelgrpc.WithPropagators(tracing.Propagators()),
			otelgrpc.WithTracerProvider(c.TracerProvider),
		}
		// Even with a no-op TracerProvider, otelgrpc still handles context propagation.
		// See https://github.com/open-telemetry/opentelemetry-go/tree/main/example/passthrough
		dialOptions = append(dialOptions,
			grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor(tracingOpts...)),
			grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor(tracingOpts...)))
	}
	if egressDialer != nil {
		dialer := func(ctx context.Context, addr string) (net.Conn, error) {
			if strings.Contains(addr, "//") {
				// etcd clients prior to 3.5 passed URLs to the dialer; normalize to a host:port address.
				u, err := url.Parse(addr)
				if err != nil {
					return nil, err
				}
				addr = u.Host
			}
			return egressDialer(ctx, "tcp", addr)
		}
		dialOptions = append(dialOptions, grpc.WithContextDialer(dialer))
	}

	cfg := clientv3.Config{
		DialTimeout:          dialTimeout,
		DialKeepAliveTime:    keepaliveTime,
		DialKeepAliveTimeout: keepaliveTimeout,
		DialOptions:          dialOptions,
		Endpoints:            c.ServerList,
		TLS:                  tlsConfig,
		Logger:               etcd3ClientLogger,
	}

	return clientv3.New(cfg)
}
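
// exampleNewETCD3Client is an illustrative sketch, not part of the upstream
// file. It shows a mutual-TLS client construction via newETCD3Client; all
// endpoints and file paths are hypothetical.
func exampleNewETCD3Client() {
	tc := storagebackend.TransportConfig{
		ServerList:    []string{"https://etcd-1:2379", "https://etcd-2:2379"}, // hypothetical endpoints
		CertFile:      "/etc/etcd/client.crt",                                 // hypothetical paths; leaving all
		KeyFile:       "/etc/etcd/client.key",                                 // three empty yields a nil TLS config,
		TrustedCAFile: "/etc/etcd/ca.crt",                                     // i.e. a non-secure connection
	}
	client, err := newETCD3Client(tc)
	if err != nil {
		klog.Errorf("connecting to etcd: %v", err)
		return
	}
	defer client.Close()
}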

type runningCompactor struct {
	interval time.Duration
	cancel   context.CancelFunc
	client   *clientv3.Client
	refs     int
}

var (
	// compactorsMu guards access to the compactors map.
	compactorsMu sync.Mutex
	compactors   = map[string]*runningCompactor{}
	// dbMetricsMonitorsMu guards access to the dbMetricsMonitors map.
	dbMetricsMonitorsMu sync.Mutex
	dbMetricsMonitors   map[string]struct{}
)

// startCompactorOnce starts one compactor per transport. If the interval gets
// smaller on repeated calls, the compactor is replaced. A destroy func is
// returned. If all destroy funcs with the same transport are called, the
// compactor is stopped.
func startCompactorOnce(c storagebackend.TransportConfig, interval time.Duration) (func(), error) {
	compactorsMu.Lock()
	defer compactorsMu.Unlock()

	key := fmt.Sprintf("%v", c) // gives: {[server1 server2] keyFile certFile caFile}
	if compactor, foundBefore := compactors[key]; !foundBefore || compactor.interval > interval {
		compactorClient, err := newETCD3Client(c)
		if err != nil {
			return nil, err
		}

		if foundBefore {
			// replace compactor
			compactor.cancel()
			compactor.client.Close()
		} else {
			// start new compactor
			compactor = &runningCompactor{}
			compactors[key] = compactor
		}

		ctx, cancel := context.WithCancel(context.Background())

		compactor.interval = interval
		compactor.cancel = cancel
		compactor.client = compactorClient

		etcd3.StartCompactor(ctx, compactorClient, interval)
	}

	compactors[key].refs++

	return func() {
		compactorsMu.Lock()
		defer compactorsMu.Unlock()

		compactor := compactors[key]
		compactor.refs--
		if compactor.refs == 0 {
			compactor.cancel()
			compactor.client.Close()
			delete(compactors, key)
		}
	}, nil
}
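
// exampleStartCompactorOnce is an illustrative sketch, not part of the
// upstream file. It demonstrates the reference counting described above:
// callers sharing a transport share one compactor, and the shorter interval
// wins. The endpoint and intervals are hypothetical.
func exampleStartCompactorOnce() {
	tc := storagebackend.TransportConfig{
		ServerList: []string{"https://etcd-1:2379"}, // hypothetical endpoint
	}
	stop1, err := startCompactorOnce(tc, 5*time.Minute)
	if err != nil {
		return
	}
	// A smaller interval replaces the running compactor for this transport.
	stop2, err := startCompactorOnce(tc, time.Minute)
	if err != nil {
		stop1()
		return
	}
	stop1() // refs 2 -> 1; the compactor keeps running
	stop2() // refs 1 -> 0; the compactor is cancelled and its client closed
}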

func newETCD3Storage(c storagebackend.ConfigForResource, newFunc, newListFunc func() runtime.Object, resourcePrefix string) (storage.Interface, DestroyFunc, error) {
	stopCompactor, err := startCompactorOnce(c.Transport, c.CompactionInterval)
	if err != nil {
		return nil, nil, err
	}

	client, err := newETCD3Client(c.Transport)
	if err != nil {
		stopCompactor()
		return nil, nil, err
	}

	// Decorate the KV instance so we can track etcd latency per request.
	client.KV = etcd3.NewETCDLatencyTracker(client.KV)

	stopDBSizeMonitor, err := startDBSizeMonitorPerEndpoint(client, c.DBMetricPollInterval)
	if err != nil {
		return nil, nil, err
	}

	var once sync.Once
	destroyFunc := func() {
		// We know that storage destroy funcs are called multiple times (due to
		// reuse in subresources). Hence, we only destroy once.
		// TODO: fix duplicated storage destroy calls at a higher level
		once.Do(func() {
			stopCompactor()
			stopDBSizeMonitor()
			client.Close()
		})
	}
	transformer := c.Transformer
	if transformer == nil {
		transformer = identity.NewEncryptCheckTransformer()
	}
	return etcd3.New(client, c.Codec, newFunc, newListFunc, c.Prefix, resourcePrefix, c.GroupResource, transformer, c.LeaseManagerConfig), destroyFunc, nil
}

// startDBSizeMonitorPerEndpoint starts a loop to monitor the etcd database
// size and update the corresponding metric etcd_db_total_size_in_bytes for
// each etcd server endpoint.
// Deprecated: Will be replaced with newETCD3ProberMonitor
func startDBSizeMonitorPerEndpoint(client *clientv3.Client, interval time.Duration) (func(), error) {
	if interval == 0 {
		return func() {}, nil
	}
	dbMetricsMonitorsMu.Lock()
	defer dbMetricsMonitorsMu.Unlock()

	ctx, cancel := context.WithCancel(context.Background())
	for _, ep := range client.Endpoints() {
		if _, found := dbMetricsMonitors[ep]; found {
			continue
		}
		dbMetricsMonitors[ep] = struct{}{}
		endpoint := ep
		klog.V(4).Infof("Start monitoring storage db size metric for endpoint %s with polling interval %v", endpoint, interval)
		go wait.JitterUntilWithContext(ctx, func(context.Context) {
			epStatus, err := client.Maintenance.Status(ctx, endpoint)
			if err != nil {
				klog.V(4).Infof("Failed to get storage db size for ep %s: %v", endpoint, err)
				metrics.UpdateEtcdDbSize(endpoint, -1)
			} else {
				metrics.UpdateEtcdDbSize(endpoint, epStatus.DbSize)
			}
		}, interval, dbMetricsMonitorJitter, true)
	}

	return func() {
		cancel()
	}, nil
}
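
// exampleNewETCD3Storage is an illustrative sketch, not part of the upstream
// file. It shows the create/destroy lifecycle of newETCD3Storage; the config,
// constructors, and resource prefix come from the (hypothetical) caller.
func exampleNewETCD3Storage(c storagebackend.ConfigForResource, newFunc, newListFunc func() runtime.Object) {
	s, destroy, err := newETCD3Storage(c, newFunc, newListFunc, "/pods") // hypothetical resource prefix
	if err != nil {
		klog.Errorf("creating etcd3 storage: %v", err)
		return
	}
	// destroy releases the shared compactor reference, stops the db size
	// monitor, and closes the client; sync.Once makes repeated calls safe.
	defer destroy()
	_ = s // the storage.Interface is now ready for use
}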