// sigs.k8s.io/cluster-api@v1.6.3/controllers/remote/cluster_cache_tracker.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package remote

import (
	"context"
	"crypto/rsa"
	"fmt"
	"net/http"
	"os"
	"sync"
	"time"

	"github.com/go-logr/logr"
	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/serializer"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/client-go/rest"
	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/cache"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/source"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/certs"
	"sigs.k8s.io/cluster-api/util/conditions"
)

const (
	healthCheckPollInterval       = 10 * time.Second
	healthCheckRequestTimeout     = 5 * time.Second
	healthCheckUnhealthyThreshold = 10
	initialCacheSyncTimeout       = 5 * time.Minute
	clusterCacheControllerName    = "cluster-cache-tracker"
)

// ErrClusterLocked is returned in methods that require cluster-level locking
// if the cluster is already locked by another concurrent call.
var ErrClusterLocked = errors.New("cluster is locked already")
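
// Editor's note: the following helper is an illustrative sketch and is not part of the upstream
// file. It shows the pattern callers of this package typically use when they hit ErrClusterLocked:
// treat it as a transient condition and requeue rather than surfacing an error. The function name,
// the requeue interval, and the Node listing are assumptions made purely for the example.
func exampleRequeueOnClusterLocked(ctx context.Context, tracker *ClusterCacheTracker, cluster client.ObjectKey) (ctrl.Result, error) {
	// Getting a client can fail with ErrClusterLocked while another goroutine is still creating
	// the clusterAccessor for the same cluster.
	remoteClient, err := tracker.GetClient(ctx, cluster)
	if err != nil {
		if errors.Is(err, ErrClusterLocked) {
			// Another caller holds the per-cluster lock; retry shortly instead of returning an error.
			return ctrl.Result{RequeueAfter: 20 * time.Second}, nil
		}
		return ctrl.Result{}, err
	}

	// Use the cached client against the workload cluster, e.g. to list Nodes.
	nodes := &corev1.NodeList{}
	if err := remoteClient.List(ctx, nodes); err != nil {
		return ctrl.Result{}, err
	}
	return ctrl.Result{}, nil
}
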
// ClusterCacheTracker manages client caches for workload clusters.
type ClusterCacheTracker struct {
	log                   logr.Logger
	clientUncachedObjects []client.Object

	client client.Client

	// secretCachingClient is a client which caches secrets.
	// If set it will be used to read the kubeconfig secret.
	// Otherwise the default client from the manager will be used.
	secretCachingClient client.Client

	scheme *runtime.Scheme

	// clusterAccessorsLock is used to lock the access to the clusterAccessors map.
	clusterAccessorsLock sync.RWMutex
	// clusterAccessors is the map of clusterAccessors by cluster.
	clusterAccessors map[client.ObjectKey]*clusterAccessor
	// clusterLock is a per-cluster lock used whenever we're locking for a specific cluster.
	// E.g. for actions like creating a client or adding watches.
	clusterLock *keyedMutex

	indexes []Index

	// controllerName is the name of the controller.
	// This is used to calculate the user agent string.
	controllerName string

	// controllerPodMetadata is the Pod metadata of the controller using this ClusterCacheTracker.
	// This is only set when the POD_NAMESPACE, POD_NAME and POD_UID environment variables are set.
	// This information will be used to detect if the controller is running on a workload cluster, so
	// that we can then access the apiserver directly.
	controllerPodMetadata *metav1.ObjectMeta
}

// ClusterCacheTrackerOptions defines options to configure
// a ClusterCacheTracker.
type ClusterCacheTrackerOptions struct {
	// SecretCachingClient is a client which caches secrets.
	// If set it will be used to read the kubeconfig secret.
	// Otherwise the default client from the manager will be used.
	SecretCachingClient client.Client

	// Log is the logger used throughout the lifecycle of caches.
	// Defaults to a no-op logger if it's not set.
	Log *logr.Logger

	// ClientUncachedObjects instructs the Client to never cache the listed objects;
	// it will query the API server directly for them instead.
	// Defaults to never caching ConfigMap and Secret if not set.
	ClientUncachedObjects []client.Object
	Indexes               []Index

	// ControllerName is the name of the controller.
	// This is used to calculate the user agent string.
	// If not set, it defaults to "cluster-cache-tracker".
	ControllerName string
}

func setDefaultOptions(opts *ClusterCacheTrackerOptions) {
	if opts.Log == nil {
		l := logr.New(log.NullLogSink{})
		opts.Log = &l
	}

	l := opts.Log.WithValues("component", "remote/clustercachetracker")
	opts.Log = &l

	if len(opts.ClientUncachedObjects) == 0 {
		opts.ClientUncachedObjects = []client.Object{
			&corev1.ConfigMap{},
			&corev1.Secret{},
		}
	}
}

// NewClusterCacheTracker creates a new ClusterCacheTracker.
func NewClusterCacheTracker(manager ctrl.Manager, options ClusterCacheTrackerOptions) (*ClusterCacheTracker, error) {
	setDefaultOptions(&options)

	controllerName := options.ControllerName
	if controllerName == "" {
		controllerName = clusterCacheControllerName
	}

	var controllerPodMetadata *metav1.ObjectMeta
	podNamespace := os.Getenv("POD_NAMESPACE")
	podName := os.Getenv("POD_NAME")
	podUID := os.Getenv("POD_UID")
	if podNamespace != "" && podName != "" && podUID != "" {
		options.Log.Info("Found controller pod metadata, the ClusterCacheTracker will try to access the cluster directly when possible")
		controllerPodMetadata = &metav1.ObjectMeta{
			Namespace: podNamespace,
			Name:      podName,
			UID:       types.UID(podUID),
		}
	} else {
		options.Log.Info("Couldn't find controller pod metadata, the ClusterCacheTracker will always access clusters using the regular apiserver endpoint")
	}

	return &ClusterCacheTracker{
		controllerName:        controllerName,
		controllerPodMetadata: controllerPodMetadata,
		log:                   *options.Log,
		clientUncachedObjects: options.ClientUncachedObjects,
		client:                manager.GetClient(),
		secretCachingClient:   options.SecretCachingClient,
		scheme:                manager.GetScheme(),
		clusterAccessors:      make(map[client.ObjectKey]*clusterAccessor),
		clusterLock:           newKeyedMutex(),
		indexes:               options.Indexes,
	}, nil
}

// GetClient returns a cached client for the given cluster.
func (t *ClusterCacheTracker) GetClient(ctx context.Context, cluster client.ObjectKey) (client.Client, error) {
	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}

	return accessor.client, nil
}
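
// Editor's note: the following helper is an illustrative sketch, not part of the upstream file.
// It shows how a manager-based controller would typically construct the tracker; the manager setup
// and the controller name are assumptions made for the example. In cluster-api this is usually done
// in the provider's main and paired with a ClusterCacheReconciler that cleans up accessors for
// deleted Clusters.
func exampleNewTracker() (*ClusterCacheTracker, error) {
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
	if err != nil {
		return nil, err
	}

	// SecretCachingClient is optional; when omitted, the manager's default client is used to read
	// kubeconfig secrets.
	return NewClusterCacheTracker(mgr, ClusterCacheTrackerOptions{
		ControllerName: "example-controller",
	})
}
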
// GetRESTConfig returns a cached REST config for the given cluster.
func (t *ClusterCacheTracker) GetRESTConfig(ctx context.Context, cluster client.ObjectKey) (*rest.Config, error) {
	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}

	return accessor.config, nil
}

// GetEtcdClientCertificateKey returns a cached certificate key to be used for generating certificates for accessing etcd in the given cluster.
func (t *ClusterCacheTracker) GetEtcdClientCertificateKey(ctx context.Context, cluster client.ObjectKey) (*rsa.PrivateKey, error) {
	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}

	return accessor.etcdClientCertificateKey, nil
}

// clusterAccessor represents the combination of a delegating client, cache, and watches for a remote cluster.
type clusterAccessor struct {
	cache                    *stoppableCache
	client                   client.Client
	watches                  sets.Set[string]
	config                   *rest.Config
	etcdClientCertificateKey *rsa.PrivateKey
}

// clusterAccessorExists returns true if a clusterAccessor exists for cluster.
func (t *ClusterCacheTracker) clusterAccessorExists(cluster client.ObjectKey) bool {
	t.clusterAccessorsLock.RLock()
	defer t.clusterAccessorsLock.RUnlock()

	_, exists := t.clusterAccessors[cluster]
	return exists
}

// loadAccessor loads a clusterAccessor.
func (t *ClusterCacheTracker) loadAccessor(cluster client.ObjectKey) (*clusterAccessor, bool) {
	t.clusterAccessorsLock.RLock()
	defer t.clusterAccessorsLock.RUnlock()

	accessor, ok := t.clusterAccessors[cluster]
	return accessor, ok
}

// storeAccessor stores a clusterAccessor.
func (t *ClusterCacheTracker) storeAccessor(cluster client.ObjectKey, accessor *clusterAccessor) {
	t.clusterAccessorsLock.Lock()
	defer t.clusterAccessorsLock.Unlock()

	t.clusterAccessors[cluster] = accessor
}

// getClusterAccessor returns a clusterAccessor for cluster.
// It first tries to return an already-created clusterAccessor.
// It then falls back to create a new clusterAccessor if needed.
// If another goroutine is already trying to create a clusterAccessor
// for the same cluster, an error is returned.
func (t *ClusterCacheTracker) getClusterAccessor(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
	log := ctrl.LoggerFrom(ctx, "cluster", klog.KRef(cluster.Namespace, cluster.Name))

	// If the clusterAccessor already exists, return early.
	if accessor, ok := t.loadAccessor(cluster); ok {
		return accessor, nil
	}

	// clusterAccessor doesn't exist yet, we might have to initialize one.
	// Lock on the cluster to ensure only one clusterAccessor is initialized
	// for the cluster at the same time.
	// Return an error if another goroutine is already trying to create a clusterAccessor.
	if ok := t.clusterLock.TryLock(cluster); !ok {
		return nil, errors.Wrapf(ErrClusterLocked, "failed to create cluster accessor: failed to get lock for cluster")
	}
	defer t.clusterLock.Unlock(cluster)

	// While we were waiting for the cluster lock, a different goroutine might have already
	// initialized the clusterAccessor for this cluster successfully. If this is the case, return it.
	if accessor, ok := t.loadAccessor(cluster); ok {
		return accessor, nil
	}

	// We are the goroutine that has to initialize the clusterAccessor.
	log.V(4).Info("Creating new cluster accessor")
	accessor, err := t.newClusterAccessor(ctx, cluster, indexes...)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create cluster accessor")
	}

	log.V(4).Info("Storing new cluster accessor")
	t.storeAccessor(cluster, accessor)
	return accessor, nil
}

// newClusterAccessor creates a new clusterAccessor.
func (t *ClusterCacheTracker) newClusterAccessor(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
	log := ctrl.LoggerFrom(ctx)

	// Get a rest config for the remote cluster.
	// Use the secretCachingClient if set.
	secretClient := t.client
	if t.secretCachingClient != nil {
		secretClient = t.secretCachingClient
	}
	config, err := RESTConfig(ctx, t.controllerName, secretClient, cluster)
	if err != nil {
		return nil, errors.Wrapf(err, "error fetching REST client config for remote cluster %q", cluster.String())
	}

	// Create a http client and a mapper for the cluster.
	httpClient, mapper, err := t.createHTTPClientAndMapper(config, cluster)
	if err != nil {
		return nil, errors.Wrapf(err, "error creating http client and mapper for remote cluster %q", cluster.String())
	}

	// Create an uncached client for the cluster.
	uncachedClient, err := t.createUncachedClient(config, cluster, httpClient, mapper)
	if err != nil {
		return nil, err
	}

	// Detect if the controller is running on the workload cluster.
	// This function uses an uncached client to ensure pods aren't cached by the long-lived client.
	runningOnCluster, err := t.runningOnWorkloadCluster(ctx, uncachedClient, cluster)
	if err != nil {
		return nil, err
	}

	// If the controller runs on the workload cluster, access the apiserver directly by using the
	// CA and Host from the in-cluster configuration.
	if runningOnCluster {
		inClusterConfig, err := ctrl.GetConfig()
		if err != nil {
			return nil, errors.Wrapf(err, "error creating client for self-hosted cluster %q", cluster.String())
		}

		// Use CA and Host from in-cluster config.
		config.CAData = nil
		config.CAFile = inClusterConfig.CAFile
		config.Host = inClusterConfig.Host

		// Update the http client and the mapper to use in-cluster config.
		httpClient, mapper, err = t.createHTTPClientAndMapper(config, cluster)
		if err != nil {
			return nil, errors.Wrapf(err, "error creating http client and mapper (using in-cluster config) for remote cluster %q", cluster.String())
		}

		log.Info(fmt.Sprintf("Creating cluster accessor for cluster %q with in-cluster service %q", cluster.String(), config.Host))
	} else {
		log.Info(fmt.Sprintf("Creating cluster accessor for cluster %q with the regular apiserver endpoint %q", cluster.String(), config.Host))
	}

	// Create a client and a cache for the cluster.
	cachedClient, err := t.createCachedClient(ctx, config, cluster, httpClient, mapper, indexes)
	if err != nil {
		return nil, err
	}

	// Generating a new private key to be used for generating temporary certificates to connect to
	// etcd on the target cluster.
	// NOTE: Generating a private key is an expensive operation, so we store it in the cluster accessor.
	etcdKey, err := certs.NewPrivateKey()
	if err != nil {
		return nil, errors.Wrapf(err, "error creating etcd client key for remote cluster %q", cluster.String())
	}

	return &clusterAccessor{
		cache:                    cachedClient.Cache,
		config:                   config,
		client:                   cachedClient.Client,
		watches:                  sets.Set[string]{},
		etcdClientCertificateKey: etcdKey,
	}, nil
}

// runningOnWorkloadCluster detects if the current controller runs on the workload cluster.
func (t *ClusterCacheTracker) runningOnWorkloadCluster(ctx context.Context, c client.Client, cluster client.ObjectKey) (bool, error) {
	// Controller Pod metadata was not found, so we can't detect if we run on the workload cluster.
	if t.controllerPodMetadata == nil {
		return false, nil
	}

	// Try to get the controller pod.
	var pod corev1.Pod
	if err := c.Get(ctx, client.ObjectKey{
		Namespace: t.controllerPodMetadata.Namespace,
		Name:      t.controllerPodMetadata.Name,
	}, &pod); err != nil {
		// If the controller pod is not found, we assume we are not running on the workload cluster.
		if apierrors.IsNotFound(err) {
			return false, nil
		}

		// If we got another error, we return the error so that this will be retried later.
		return false, errors.Wrapf(err, "error checking if we're running on workload cluster %q", cluster.String())
	}

	// If the uid is the same we found the controller pod on the workload cluster.
	return t.controllerPodMetadata.UID == pod.UID, nil
}
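
// Editor's note: the following variable is an illustrative sketch, not part of the upstream file.
// runningOnWorkloadCluster can only return true when the controller Deployment exposes its own Pod
// metadata through the downward API, because NewClusterCacheTracker reads POD_NAMESPACE, POD_NAME
// and POD_UID from the environment. Expressed as Go types, the container env wiring would look
// roughly like this:
var _ = []corev1.EnvVar{
	{Name: "POD_NAMESPACE", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{FieldPath: "metadata.namespace"}}},
	{Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{FieldPath: "metadata.name"}}},
	{Name: "POD_UID", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{FieldPath: "metadata.uid"}}},
}
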
// createHTTPClientAndMapper creates a http client and a dynamic rest mapper for the given cluster, based on the rest.Config.
func (t *ClusterCacheTracker) createHTTPClientAndMapper(config *rest.Config, cluster client.ObjectKey) (*http.Client, meta.RESTMapper, error) {
	// Create a http client for the cluster.
	httpClient, err := rest.HTTPClientFor(config)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error creating http client", cluster.String())
	}

	// Create a mapper for it.
	mapper, err := apiutil.NewDynamicRESTMapper(config, httpClient)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error creating dynamic rest mapper", cluster.String())
	}

	// Verify if we can get a rest mapping from the workload cluster apiserver.
	// Note: This also checks if the apiserver is up in general. We do this already here
	// to avoid further effort creating a cache and a client and to produce a clearer error message.
	_, err = mapper.RESTMapping(corev1.SchemeGroupVersion.WithKind("Node").GroupKind(), corev1.SchemeGroupVersion.Version)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error getting rest mapping", cluster.String())
	}

	return httpClient, mapper, nil
}

// createUncachedClient creates an uncached client for the given cluster, based on the rest.Config.
func (t *ClusterCacheTracker) createUncachedClient(config *rest.Config, cluster client.ObjectKey, httpClient *http.Client, mapper meta.RESTMapper) (client.Client, error) {
	// Create the uncached client for the remote cluster.
	uncachedClient, err := client.New(config, client.Options{
		Scheme:     t.scheme,
		Mapper:     mapper,
		HTTPClient: httpClient,
	})
	if err != nil {
		return nil, errors.Wrapf(err, "error creating uncached client for remote cluster %q", cluster.String())
	}

	return uncachedClient, nil
}

type cachedClientOutput struct {
	Client client.Client
	Cache  *stoppableCache
}

// createCachedClient creates a cached client for the given cluster, based on a rest.Config.
func (t *ClusterCacheTracker) createCachedClient(ctx context.Context, config *rest.Config, cluster client.ObjectKey, httpClient *http.Client, mapper meta.RESTMapper, indexes []Index) (*cachedClientOutput, error) {
	// Create the cache for the remote cluster.
	cacheOptions := cache.Options{
		HTTPClient: httpClient,
		Scheme:     t.scheme,
		Mapper:     mapper,
	}
	remoteCache, err := cache.New(config, cacheOptions)
	if err != nil {
		return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q: error creating cache", cluster.String())
	}

	cacheCtx, cacheCtxCancel := context.WithCancel(ctx)

	// We need to be able to stop the cache's shared informers, so wrap this in a stoppableCache.
	cache := &stoppableCache{
		Cache:      remoteCache,
		cancelFunc: cacheCtxCancel,
	}

	for _, index := range indexes {
		if err := cache.IndexField(ctx, index.Object, index.Field, index.ExtractValue); err != nil {
			return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q: error adding index for field %q to cache", cluster.String(), index.Field)
		}
	}

	// Create the client for the remote cluster.
	cachedClient, err := client.New(config, client.Options{
		Scheme:     t.scheme,
		Mapper:     mapper,
		HTTPClient: httpClient,
		Cache: &client.CacheOptions{
			Reader:       cache,
			DisableFor:   t.clientUncachedObjects,
			Unstructured: true,
		},
	})
	if err != nil {
		return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q", cluster.String())
	}

	// Start the cache!
	go cache.Start(cacheCtx) //nolint:errcheck

	// Wait until the cache is initially synced.
	cacheSyncCtx, cacheSyncCtxCancel := context.WithTimeout(ctx, initialCacheSyncTimeout)
	defer cacheSyncCtxCancel()
	if !cache.WaitForCacheSync(cacheSyncCtx) {
		cache.Stop()
		return nil, fmt.Errorf("failed waiting for cache for remote cluster %v to sync: %w", cluster, cacheCtx.Err())
	}

	// Wrap the cached client with a client that sets timeouts on all Get and List calls.
	// If we don't set timeouts here, Get and List calls can get stuck if they lazily create a new informer
	// and the informer then doesn't sync because the workload cluster apiserver is not reachable.
	// An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
	// It should be reasonable to have Get and List calls time out within the duration configured in the restConfig.
	cachedClient = newClientWithTimeout(cachedClient, config.Timeout)

	// Start the cluster health check.
	go t.healthCheckCluster(cacheCtx, &healthCheckInput{
		cluster:    cluster,
		cfg:        config,
		httpClient: httpClient,
	})

	return &cachedClientOutput{
		Client: cachedClient,
		Cache:  cache,
	}, nil
}

// deleteAccessor stops a clusterAccessor's cache and removes the clusterAccessor from the tracker.
func (t *ClusterCacheTracker) deleteAccessor(_ context.Context, cluster client.ObjectKey) {
	t.clusterAccessorsLock.Lock()
	defer t.clusterAccessorsLock.Unlock()

	a, exists := t.clusterAccessors[cluster]
	if !exists {
		return
	}

	log := t.log.WithValues("Cluster", klog.KRef(cluster.Namespace, cluster.Name))
	log.V(2).Info("Deleting clusterAccessor")
	log.V(4).Info("Stopping cache")
	a.cache.Stop()
	log.V(4).Info("Cache stopped")

	delete(t.clusterAccessors, cluster)
}

// Watcher is a scoped-down interface from Controller that only knows how to watch.
type Watcher interface {
	// Watch watches src for changes, sending events to eventHandler if they pass predicates.
	Watch(src source.Source, eventHandler handler.EventHandler, predicates ...predicate.Predicate) error
}

// WatchInput specifies the parameters used to establish a new watch for a remote cluster.
type WatchInput struct {
	// Name represents a unique watch request for the specified Cluster.
	Name string

	// Cluster is the key for the remote cluster.
	Cluster client.ObjectKey

	// Watcher is the watcher (controller) whose Reconcile() function will be called for events.
	Watcher Watcher

	// Kind is the type of resource to watch.
	Kind client.Object

	// EventHandler contains the event handlers to invoke for resource events.
	EventHandler handler.EventHandler

	// Predicates is used to filter resource events.
	Predicates []predicate.Predicate
}

// Watch watches a remote cluster for resource events. If the watch already exists based on input.Name, this is a no-op.
func (t *ClusterCacheTracker) Watch(ctx context.Context, input WatchInput) error {
	if input.Name == "" {
		return errors.New("input.Name is required")
	}

	accessor, err := t.getClusterAccessor(ctx, input.Cluster, t.indexes...)
	if err != nil {
		return errors.Wrapf(err, "failed to add %s watch on cluster %s", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
	}

	// We have to lock the cluster, so that the watch is not created multiple times in parallel.
	ok := t.clusterLock.TryLock(input.Cluster)
	if !ok {
		return errors.Wrapf(ErrClusterLocked, "failed to add %T watch on cluster %s: failed to get lock for cluster", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
	}
	defer t.clusterLock.Unlock(input.Cluster)

	if accessor.watches.Has(input.Name) {
		log := ctrl.LoggerFrom(ctx)
		log.V(6).Info("Watch already exists", "Cluster", klog.KRef(input.Cluster.Namespace, input.Cluster.Name), "name", input.Name)
		return nil
	}

	// Need to create the watch.
	if err := input.Watcher.Watch(source.Kind(accessor.cache, input.Kind), input.EventHandler, input.Predicates...); err != nil {
		return errors.Wrapf(err, "failed to add %s watch on cluster %s: failed to create watch", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
	}

	accessor.watches.Insert(input.Name)

	return nil
}
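
// Editor's note: the following helper is an illustrative sketch, not part of the upstream file.
// It shows how a controller typically registers a watch on a workload cluster object through the
// tracker; the watch name, the watched Kind, and the event handler are assumptions made for the
// example. Repeated calls with the same Name are de-duplicated by Watch.
func exampleWatchNodes(ctx context.Context, tracker *ClusterCacheTracker, controller Watcher, cluster client.ObjectKey, eventHandler handler.EventHandler) error {
	return tracker.Watch(ctx, WatchInput{
		Name:         "example-watchNodes",
		Cluster:      cluster,
		Watcher:      controller,
		Kind:         &corev1.Node{},
		EventHandler: eventHandler,
	})
}
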
// healthCheckInput provides the input for the healthCheckCluster method.
type healthCheckInput struct {
	cluster            client.ObjectKey
	httpClient         *http.Client
	cfg                *rest.Config
	interval           time.Duration
	requestTimeout     time.Duration
	unhealthyThreshold int
	path               string
}

// setDefaults sets default values if optional parameters are not set.
func (h *healthCheckInput) setDefaults() {
	if h.interval == 0 {
		h.interval = healthCheckPollInterval
	}
	if h.requestTimeout == 0 {
		h.requestTimeout = healthCheckRequestTimeout
	}
	if h.unhealthyThreshold == 0 {
		h.unhealthyThreshold = healthCheckUnhealthyThreshold
	}
	if h.path == "" {
		h.path = "/"
	}
}

// healthCheckCluster will poll the cluster's API at the path given and, if there are
// `unhealthyThreshold` consecutive failures, will deem the cluster unhealthy.
// Once the cluster is deemed unhealthy, the cluster's cache is stopped and removed.
func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *healthCheckInput) {
	// populate optional params for healthCheckInput
	in.setDefaults()

	unhealthyCount := 0

	// This gets us a client that can make raw http(s) calls to the remote apiserver. We only need to create it once
	// and we can reuse it inside the polling loop.
	codec := runtime.NoopEncoder{Decoder: scheme.Codecs.UniversalDecoder()}
	cfg := rest.CopyConfig(in.cfg)
	cfg.NegotiatedSerializer = serializer.NegotiatedSerializerWrapper(runtime.SerializerInfo{Serializer: codec})
	restClient, restClientErr := rest.UnversionedRESTClientForConfigAndClient(cfg, in.httpClient)

	runHealthCheckWithThreshold := func(ctx context.Context) (bool, error) {
		if restClientErr != nil {
			return false, restClientErr
		}

		cluster := &clusterv1.Cluster{}
		if err := t.client.Get(ctx, in.cluster, cluster); err != nil {
			if apierrors.IsNotFound(err) {
				// If the cluster can't be found, we should delete the cache.
				return false, err
			}
			// Otherwise, requeue.
			return false, nil
		}

		if !cluster.Status.InfrastructureReady || !conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) {
			// If the infrastructure or control plane aren't marked as ready, we should requeue and wait.
			return false, nil
		}

		if _, ok := t.loadAccessor(in.cluster); !ok {
			// If there is no accessor but the cluster is locked, we're probably in the middle of the cluster accessor
			// creation and we should requeue the health check until it's done.
			if ok := t.clusterLock.TryLock(in.cluster); !ok {
				t.log.V(4).Info("Waiting for cluster to be unlocked. Requeuing health check")
				return false, nil
			}
			t.clusterLock.Unlock(in.cluster)
			// Cache for this cluster has already been cleaned up.
			// Nothing to do, so return true.
			return true, nil
		}

		// An error here means there was either an issue connecting or the API returned an error.
		// If no error occurs, reset the unhealthy counter.
		_, err := restClient.Get().AbsPath(in.path).Timeout(in.requestTimeout).DoRaw(ctx)
		if err != nil {
			if apierrors.IsUnauthorized(err) {
				// Unauthorized means that the underlying kubeconfig is not authorizing properly anymore, which
				// usually is the result of automatic kubeconfig refreshes, meaning that we have to throw away the
				// clusterAccessor and rely on the creation of a new one (with a refreshed kubeconfig).
				return false, err
			}
			unhealthyCount++
		} else {
			unhealthyCount = 0
		}

		if unhealthyCount >= in.unhealthyThreshold {
			// Cluster is now considered unhealthy.
			return false, err
		}

		return false, nil
	}

	err := wait.PollUntilContextCancel(ctx, in.interval, true, runHealthCheckWithThreshold)
	// An error returned implies the health check has failed a sufficient number of times for the cluster
	// to be considered unhealthy or the cache was stopped and thus the cache context canceled (we pass the
	// cache context into wait.PollUntilContextCancel).
	// NB. Log all errors that occurred even if this error might just be from a cancel of the cache context
	// when the cache is stopped. Logging an error in this case is not a problem and makes debugging easier.
	if err != nil {
		t.log.Error(err, "Error health checking cluster", "Cluster", klog.KRef(in.cluster.Namespace, in.cluster.Name))
	}
	// Ensure in any case that the accessor is deleted (even if it is a no-op).
	// NB. It is crucial to ensure the accessor was deleted, so it can be later recreated when the
	// cluster is reachable again.
	t.deleteAccessor(ctx, in.cluster)
}

// newClientWithTimeout returns a new client which sets the specified timeout on all Get and List calls.
// If we don't set timeouts here, Get and List calls can get stuck if they lazily create a new informer
// and the informer then doesn't sync because the workload cluster apiserver is not reachable.
// An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
func newClientWithTimeout(client client.Client, timeout time.Duration) client.Client {
	return clientWithTimeout{
		Client:  client,
		timeout: timeout,
	}
}

type clientWithTimeout struct {
	client.Client
	timeout time.Duration
}

var _ client.Client = &clientWithTimeout{}

func (c clientWithTimeout) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
	ctx, cancel := context.WithTimeout(ctx, c.timeout)
	defer cancel()
	return c.Client.Get(ctx, key, obj, opts...)
}

func (c clientWithTimeout) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
	ctx, cancel := context.WithTimeout(ctx, c.timeout)
	defer cancel()
	return c.Client.List(ctx, list, opts...)
}