sigs.k8s.io/cluster-api@v1.7.1/controllers/remote/cluster_cache_tracker.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package remote

import (
	"context"
	"crypto/rsa"
	"fmt"
	"net/http"
	"os"
	"sync"
	"time"

	"github.com/go-logr/logr"
	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/serializer"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/client-go/rest"
	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/cache"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/source"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/certs"
	"sigs.k8s.io/cluster-api/util/conditions"
)

const (
	healthCheckPollInterval       = 10 * time.Second
	healthCheckRequestTimeout     = 5 * time.Second
	healthCheckUnhealthyThreshold = 10
	initialCacheSyncTimeout       = 5 * time.Minute
	clusterCacheControllerName    = "cluster-cache-tracker"
)

// ErrClusterLocked is returned in methods that require cluster-level locking
// if the cluster is already locked by another concurrent call.
var ErrClusterLocked = errors.New("cluster is locked already")

// ClusterCacheTracker manages client caches for workload clusters.
type ClusterCacheTracker struct {
	log                   logr.Logger
	clientUncachedObjects []client.Object

	client client.Client

	// secretCachingClient is a client which caches secrets.
	// If set it will be used to read the kubeconfig secret.
	// Otherwise the default client from the manager will be used.
	secretCachingClient client.Client

	scheme *runtime.Scheme

	// clusterAccessorsLock is used to lock the access to the clusterAccessors map.
	clusterAccessorsLock sync.RWMutex
	// clusterAccessors is the map of clusterAccessors by cluster.
	clusterAccessors map[client.ObjectKey]*clusterAccessor
	// clusterLock is a per-cluster lock used whenever we're locking for a specific cluster.
	// E.g. for actions like creating a client or adding watches.
	clusterLock *keyedMutex

	indexes []Index

	// controllerName is the name of the controller.
	// This is used to calculate the user agent string.
	controllerName string

	// controllerPodMetadata is the Pod metadata of the controller using this ClusterCacheTracker.
	// This is only set when the POD_NAMESPACE, POD_NAME and POD_UID environment variables are set.
	// This information will be used to detect if the controller is running on a workload cluster, so
	// that we can then access the apiserver directly.
	controllerPodMetadata *metav1.ObjectMeta
}
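// Note (illustrative, not part of the original source): controllerPodMetadata
// is populated from the POD_NAMESPACE, POD_NAME and POD_UID environment
// variables. These are typically injected into the controller's Deployment via
// the Kubernetes downward API, for example:
//
//	env:
//	- name: POD_NAMESPACE
//	  valueFrom:
//	    fieldRef:
//	      fieldPath: metadata.namespace
//	- name: POD_NAME
//	  valueFrom:
//	    fieldRef:
//	      fieldPath: metadata.name
//	- name: POD_UID
//	  valueFrom:
//	    fieldRef:
//	      fieldPath: metadata.uid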
// ClusterCacheTrackerOptions defines options to configure
// a ClusterCacheTracker.
type ClusterCacheTrackerOptions struct {
	// SecretCachingClient is a client which caches secrets.
	// If set it will be used to read the kubeconfig secret.
	// Otherwise the default client from the manager will be used.
	SecretCachingClient client.Client

	// Log is the logger used throughout the lifecycle of caches.
	// Defaults to a no-op logger if it's not set.
	Log *logr.Logger

	// ClientUncachedObjects instructs the Client to never cache the following objects;
	// it will query the API server directly instead.
	// Defaults to never caching ConfigMap and Secret if not set.
	ClientUncachedObjects []client.Object
	Indexes               []Index

	// ControllerName is the name of the controller.
	// This is used to calculate the user agent string.
	// If not set, it defaults to "cluster-cache-tracker".
	ControllerName string
}

func setDefaultOptions(opts *ClusterCacheTrackerOptions) {
	if opts.Log == nil {
		l := logr.New(log.NullLogSink{})
		opts.Log = &l
	}

	l := opts.Log.WithValues("component", "remote/clustercachetracker")
	opts.Log = &l

	if len(opts.ClientUncachedObjects) == 0 {
		opts.ClientUncachedObjects = []client.Object{
			&corev1.ConfigMap{},
			&corev1.Secret{},
		}
	}
}

// NewClusterCacheTracker creates a new ClusterCacheTracker.
func NewClusterCacheTracker(manager ctrl.Manager, options ClusterCacheTrackerOptions) (*ClusterCacheTracker, error) {
	setDefaultOptions(&options)

	controllerName := options.ControllerName
	if controllerName == "" {
		controllerName = clusterCacheControllerName
	}

	var controllerPodMetadata *metav1.ObjectMeta
	podNamespace := os.Getenv("POD_NAMESPACE")
	podName := os.Getenv("POD_NAME")
	podUID := os.Getenv("POD_UID")
	if podNamespace != "" && podName != "" && podUID != "" {
		options.Log.Info("Found controller pod metadata, the ClusterCacheTracker will try to access the cluster directly when possible")
		controllerPodMetadata = &metav1.ObjectMeta{
			Namespace: podNamespace,
			Name:      podName,
			UID:       types.UID(podUID),
		}
	} else {
		options.Log.Info("Couldn't find controller pod metadata, the ClusterCacheTracker will always access clusters using the regular apiserver endpoint")
	}

	return &ClusterCacheTracker{
		controllerName:        controllerName,
		controllerPodMetadata: controllerPodMetadata,
		log:                   *options.Log,
		clientUncachedObjects: options.ClientUncachedObjects,
		client:                manager.GetClient(),
		secretCachingClient:   options.SecretCachingClient,
		scheme:                manager.GetScheme(),
		clusterAccessors:      make(map[client.ObjectKey]*clusterAccessor),
		clusterLock:           newKeyedMutex(),
		indexes:               options.Indexes,
	}, nil
}

// GetClient returns a cached client for the given cluster.
func (t *ClusterCacheTracker) GetClient(ctx context.Context, cluster client.ObjectKey) (client.Client, error) {
	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}

	return accessor.client, nil
}
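// Illustrative usage sketch (not part of the original source). A controller
// would typically construct a single tracker from its manager and then ask it
// for workload cluster clients on demand; the function name "setupAndUse" and
// the cluster key are hypothetical.
func setupAndUse(ctx context.Context, mgr ctrl.Manager) error {
	tracker, err := NewClusterCacheTracker(mgr, ClusterCacheTrackerOptions{
		ControllerName: "example-controller", // hypothetical name
	})
	if err != nil {
		return err
	}

	// Get a cached client for the workload cluster identified by its
	// namespace/name and use it like any controller-runtime client.
	c, err := tracker.GetClient(ctx, client.ObjectKey{Namespace: "default", Name: "my-cluster"})
	if err != nil {
		return err
	}

	nodes := &corev1.NodeList{}
	return c.List(ctx, nodes)
}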
// GetReader returns a cached read-only client for the given cluster.
func (t *ClusterCacheTracker) GetReader(ctx context.Context, cluster client.ObjectKey) (client.Reader, error) {
	return t.GetClient(ctx, cluster)
}

// GetRESTConfig returns a cached REST config for the given cluster.
func (t *ClusterCacheTracker) GetRESTConfig(ctx context.Context, cluster client.ObjectKey) (*rest.Config, error) {
	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}

	return accessor.config, nil
}

// GetEtcdClientCertificateKey returns a cached certificate key to be used for generating certificates for accessing etcd in the given cluster.
func (t *ClusterCacheTracker) GetEtcdClientCertificateKey(ctx context.Context, cluster client.ObjectKey) (*rsa.PrivateKey, error) {
	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}

	return accessor.etcdClientCertificateKey, nil
}

// clusterAccessor represents the combination of a delegating client, cache, and watches for a remote cluster.
type clusterAccessor struct {
	cache                    *stoppableCache
	client                   client.Client
	watches                  sets.Set[string]
	config                   *rest.Config
	etcdClientCertificateKey *rsa.PrivateKey
}

// clusterAccessorExists returns true if a clusterAccessor exists for cluster.
func (t *ClusterCacheTracker) clusterAccessorExists(cluster client.ObjectKey) bool {
	t.clusterAccessorsLock.RLock()
	defer t.clusterAccessorsLock.RUnlock()

	_, exists := t.clusterAccessors[cluster]
	return exists
}

// loadAccessor loads a clusterAccessor.
func (t *ClusterCacheTracker) loadAccessor(cluster client.ObjectKey) (*clusterAccessor, bool) {
	t.clusterAccessorsLock.RLock()
	defer t.clusterAccessorsLock.RUnlock()

	accessor, ok := t.clusterAccessors[cluster]
	return accessor, ok
}

// storeAccessor stores a clusterAccessor.
func (t *ClusterCacheTracker) storeAccessor(cluster client.ObjectKey, accessor *clusterAccessor) {
	t.clusterAccessorsLock.Lock()
	defer t.clusterAccessorsLock.Unlock()

	t.clusterAccessors[cluster] = accessor
}
// getClusterAccessor returns a clusterAccessor for cluster.
// It first tries to return an already-created clusterAccessor.
// It then falls back to creating a new clusterAccessor if needed.
// If there is already another goroutine trying to create a clusterAccessor
// for the same cluster, an error is returned.
func (t *ClusterCacheTracker) getClusterAccessor(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
	log := ctrl.LoggerFrom(ctx, "cluster", klog.KRef(cluster.Namespace, cluster.Name))

	// If the clusterAccessor already exists, return early.
	if accessor, ok := t.loadAccessor(cluster); ok {
		return accessor, nil
	}

	// clusterAccessor doesn't exist yet, we might have to initialize one.
	// Lock on the cluster to ensure only one clusterAccessor is initialized
	// for the cluster at the same time.
	// Return an error if another goroutine is already trying to create a clusterAccessor.
	if ok := t.clusterLock.TryLock(cluster); !ok {
		return nil, errors.Wrapf(ErrClusterLocked, "failed to create cluster accessor: failed to get lock for cluster")
	}
	defer t.clusterLock.Unlock(cluster)

	// While we were waiting for the cluster lock, a different goroutine might have already
	// initialized the clusterAccessor for this cluster successfully. If that is the case, we return it.
	if accessor, ok := t.loadAccessor(cluster); ok {
		return accessor, nil
	}

	// We are the goroutine that has to initialize the clusterAccessor.
	log.V(4).Info("Creating new cluster accessor")
	accessor, err := t.newClusterAccessor(ctx, cluster, indexes...)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create cluster accessor")
	}

	log.V(4).Info("Storing new cluster accessor")
	t.storeAccessor(cluster, accessor)
	return accessor, nil
}
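// Illustrative sketch (not part of the original source): because
// getClusterAccessor fails fast with ErrClusterLocked instead of blocking,
// callers such as reconcilers typically treat the error as a transient
// condition and requeue. The function name and requeue interval are
// hypothetical.
func reconcileWithTracker(ctx context.Context, tracker *ClusterCacheTracker, cluster client.ObjectKey) (ctrl.Result, error) {
	_, err := tracker.GetClient(ctx, cluster)
	if err != nil {
		if errors.Is(err, ErrClusterLocked) {
			// Another goroutine is creating the accessor; retry shortly.
			return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
		}
		return ctrl.Result{}, err
	}

	return ctrl.Result{}, nil
}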
// newClusterAccessor creates a new clusterAccessor.
func (t *ClusterCacheTracker) newClusterAccessor(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
	log := ctrl.LoggerFrom(ctx)

	// Get a rest config for the remote cluster.
	// Use the secretCachingClient if set.
	secretClient := t.client
	if t.secretCachingClient != nil {
		secretClient = t.secretCachingClient
	}
	config, err := RESTConfig(ctx, t.controllerName, secretClient, cluster)
	if err != nil {
		return nil, errors.Wrapf(err, "error fetching REST client config for remote cluster %q", cluster.String())
	}

	// Create a http client and a mapper for the cluster.
	httpClient, mapper, err := t.createHTTPClientAndMapper(config, cluster)
	if err != nil {
		return nil, errors.Wrapf(err, "error creating http client and mapper for remote cluster %q", cluster.String())
	}

	// Create an uncached client for the cluster.
	uncachedClient, err := t.createUncachedClient(config, cluster, httpClient, mapper)
	if err != nil {
		return nil, err
	}

	// Detect if the controller is running on the workload cluster.
	// This function uses an uncached client to ensure pods aren't cached by the long-lived client.
	runningOnCluster, err := t.runningOnWorkloadCluster(ctx, uncachedClient, cluster)
	if err != nil {
		return nil, err
	}

	// If the controller runs on the workload cluster, access the apiserver directly by using the
	// CA and Host from the in-cluster configuration.
	if runningOnCluster {
		inClusterConfig, err := ctrl.GetConfig()
		if err != nil {
			return nil, errors.Wrapf(err, "error creating client for self-hosted cluster %q", cluster.String())
		}

		// Use CA and Host from in-cluster config.
		config.CAData = nil
		config.CAFile = inClusterConfig.CAFile
		config.Host = inClusterConfig.Host

		// Update the http client and the mapper to use in-cluster config.
		httpClient, mapper, err = t.createHTTPClientAndMapper(config, cluster)
		if err != nil {
			return nil, errors.Wrapf(err, "error creating http client and mapper (using in-cluster config) for remote cluster %q", cluster.String())
		}

		log.Info(fmt.Sprintf("Creating cluster accessor for cluster %q with in-cluster service %q", cluster.String(), config.Host))
	} else {
		log.Info(fmt.Sprintf("Creating cluster accessor for cluster %q with the regular apiserver endpoint %q", cluster.String(), config.Host))
	}

	// Create a client and a cache for the cluster.
	cachedClient, err := t.createCachedClient(ctx, config, cluster, httpClient, mapper, indexes)
	if err != nil {
		return nil, err
	}

	// Generate a new private key to be used for generating temporary certificates to connect to
	// etcd on the target cluster.
	// NOTE: Generating a private key is an expensive operation, so we store it in the cluster accessor.
	etcdKey, err := certs.NewPrivateKey()
	if err != nil {
		return nil, errors.Wrapf(err, "error creating etcd client key for remote cluster %q", cluster.String())
	}

	return &clusterAccessor{
		cache:                    cachedClient.Cache,
		config:                   config,
		client:                   cachedClient.Client,
		watches:                  sets.Set[string]{},
		etcdClientCertificateKey: etcdKey,
	}, nil
}

// runningOnWorkloadCluster detects if the current controller runs on the workload cluster.
func (t *ClusterCacheTracker) runningOnWorkloadCluster(ctx context.Context, c client.Client, cluster client.ObjectKey) (bool, error) {
	// Controller Pod metadata was not found, so we can't detect if we run on the workload cluster.
	if t.controllerPodMetadata == nil {
		return false, nil
	}

	// Try to get the controller pod.
	var pod corev1.Pod
	if err := c.Get(ctx, client.ObjectKey{
		Namespace: t.controllerPodMetadata.Namespace,
		Name:      t.controllerPodMetadata.Name,
	}, &pod); err != nil {
		// If the controller pod is not found, we assume we are not running on the workload cluster.
		if apierrors.IsNotFound(err) {
			return false, nil
		}

		// If we got another error, we return the error so that this will be retried later.
		return false, errors.Wrapf(err, "error checking if we're running on workload cluster %q", cluster.String())
	}

	// If the uid is the same we found the controller pod on the workload cluster.
	return t.controllerPodMetadata.UID == pod.UID, nil
}

// createHTTPClientAndMapper creates a http client and a dynamic rest mapper for the given cluster, based on the rest.Config.
func (t *ClusterCacheTracker) createHTTPClientAndMapper(config *rest.Config, cluster client.ObjectKey) (*http.Client, meta.RESTMapper, error) {
	// Create a http client for the cluster.
	httpClient, err := rest.HTTPClientFor(config)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error creating http client", cluster.String())
	}

	// Create a mapper for it.
	mapper, err := apiutil.NewDynamicRESTMapper(config, httpClient)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error creating dynamic rest mapper", cluster.String())
	}

	// Verify that we can get a rest mapping from the workload cluster apiserver.
	// Note: This also checks if the apiserver is up in general. We do this already here
	// to avoid further effort creating a cache and a client and to produce a clearer error message.
	_, err = mapper.RESTMapping(corev1.SchemeGroupVersion.WithKind("Node").GroupKind(), corev1.SchemeGroupVersion.Version)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error getting rest mapping", cluster.String())
	}

	return httpClient, mapper, nil
}
// createUncachedClient creates an uncached client for the given cluster, based on the rest.Config.
func (t *ClusterCacheTracker) createUncachedClient(config *rest.Config, cluster client.ObjectKey, httpClient *http.Client, mapper meta.RESTMapper) (client.Client, error) {
	// Create the uncached client for the remote cluster.
	uncachedClient, err := client.New(config, client.Options{
		Scheme:     t.scheme,
		Mapper:     mapper,
		HTTPClient: httpClient,
	})
	if err != nil {
		return nil, errors.Wrapf(err, "error creating uncached client for remote cluster %q", cluster.String())
	}

	return uncachedClient, nil
}

type cachedClientOutput struct {
	Client client.Client
	Cache  *stoppableCache
}
// createCachedClient creates a cached client for the given cluster, based on a rest.Config.
func (t *ClusterCacheTracker) createCachedClient(ctx context.Context, config *rest.Config, cluster client.ObjectKey, httpClient *http.Client, mapper meta.RESTMapper, indexes []Index) (*cachedClientOutput, error) {
	// Create the cache for the remote cluster.
	cacheOptions := cache.Options{
		HTTPClient: httpClient,
		Scheme:     t.scheme,
		Mapper:     mapper,
	}
	remoteCache, err := cache.New(config, cacheOptions)
	if err != nil {
		return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q: error creating cache", cluster.String())
	}

	cacheCtx, cacheCtxCancel := context.WithCancel(ctx)

	// We need to be able to stop the cache's shared informers, so wrap this in a stoppableCache.
	cache := &stoppableCache{
		Cache:      remoteCache,
		cancelFunc: cacheCtxCancel,
	}

	for _, index := range indexes {
		if err := cache.IndexField(ctx, index.Object, index.Field, index.ExtractValue); err != nil {
			return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q: error adding index for field %q to cache", cluster.String(), index.Field)
		}
	}

	// Create the client for the remote cluster.
	cachedClient, err := client.New(config, client.Options{
		Scheme:     t.scheme,
		Mapper:     mapper,
		HTTPClient: httpClient,
		Cache: &client.CacheOptions{
			Reader:       cache,
			DisableFor:   t.clientUncachedObjects,
			Unstructured: true,
		},
	})
	if err != nil {
		return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q", cluster.String())
	}

	// Start the cache!!!
	go cache.Start(cacheCtx) //nolint:errcheck

	// Wait until the cache is initially synced.
	cacheSyncCtx, cacheSyncCtxCancel := context.WithTimeout(ctx, initialCacheSyncTimeout)
	defer cacheSyncCtxCancel()
	if !cache.WaitForCacheSync(cacheSyncCtx) {
		cache.Stop()
		return nil, fmt.Errorf("failed waiting for cache for remote cluster %v to sync: %w", cluster, cacheCtx.Err())
	}

	// Wrap the cached client with a client that sets timeouts on all Get and List calls.
	// If we don't set timeouts here, Get and List calls can get stuck if they lazily create a new informer
	// and the informer then doesn't sync because the workload cluster apiserver is not reachable.
	// An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
	// It should be reasonable to have Get and List calls time out within the duration configured in the restConfig.
	cachedClient = newClientWithTimeout(cachedClient, config.Timeout)

	// Start cluster healthcheck!!!
	go t.healthCheckCluster(cacheCtx, &healthCheckInput{
		cluster:    cluster,
		cfg:        config,
		httpClient: httpClient,
	})

	return &cachedClientOutput{
		Client: cachedClient,
		Cache:  cache,
	}, nil
}

// deleteAccessor stops a clusterAccessor's cache and removes the clusterAccessor from the tracker.
func (t *ClusterCacheTracker) deleteAccessor(_ context.Context, cluster client.ObjectKey) {
	t.clusterAccessorsLock.Lock()
	defer t.clusterAccessorsLock.Unlock()

	a, exists := t.clusterAccessors[cluster]
	if !exists {
		return
	}

	log := t.log.WithValues("Cluster", klog.KRef(cluster.Namespace, cluster.Name))
	log.V(2).Info("Deleting clusterAccessor")
	log.V(4).Info("Stopping cache")
	a.cache.Stop()
	log.V(4).Info("Cache stopped")

	delete(t.clusterAccessors, cluster)
}

// Watcher is a scoped-down interface from Controller that only knows how to watch.
type Watcher interface {
	// Watch watches src for changes, sending events to eventHandler if they pass predicates.
	Watch(src source.Source, eventHandler handler.EventHandler, predicates ...predicate.Predicate) error
}

// WatchInput specifies the parameters used to establish a new watch for a remote cluster.
type WatchInput struct {
	// Name represents a unique watch request for the specified Cluster.
	Name string

	// Cluster is the key for the remote cluster.
	Cluster client.ObjectKey

	// Watcher is the watcher (controller) whose Reconcile() function will be called for events.
	Watcher Watcher

	// Kind is the type of resource to watch.
	Kind client.Object

	// EventHandler contains the event handlers to invoke for resource events.
	EventHandler handler.EventHandler

	// Predicates is used to filter resource events.
	Predicates []predicate.Predicate
}

// Watch watches a remote cluster for resource events. If the watch already exists based on input.Name, this is a no-op.
func (t *ClusterCacheTracker) Watch(ctx context.Context, input WatchInput) error {
	if input.Name == "" {
		return errors.New("input.Name is required")
	}

	accessor, err := t.getClusterAccessor(ctx, input.Cluster, t.indexes...)
	if err != nil {
		return errors.Wrapf(err, "failed to add %T watch on cluster %s", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
	}

	// We have to lock the cluster, so that the watch is not created multiple times in parallel.
	ok := t.clusterLock.TryLock(input.Cluster)
	if !ok {
		return errors.Wrapf(ErrClusterLocked, "failed to add %T watch on cluster %s: failed to get lock for cluster", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
	}
	defer t.clusterLock.Unlock(input.Cluster)

	if accessor.watches.Has(input.Name) {
		log := ctrl.LoggerFrom(ctx)
		log.V(6).Info("Watch already exists", "Cluster", klog.KRef(input.Cluster.Namespace, input.Cluster.Name), "name", input.Name)
		return nil
	}

	// Need to create the watch.
	if err := input.Watcher.Watch(source.Kind(accessor.cache, input.Kind), input.EventHandler, input.Predicates...); err != nil {
		return errors.Wrapf(err, "failed to add %T watch on cluster %s: failed to create watch", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
	}

	accessor.watches.Insert(input.Name)

	return nil
}
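// Illustrative usage sketch (not part of the original source): a controller
// can ask the tracker to watch objects in a workload cluster and funnel the
// resulting events into its own queue. The function name and the mapping
// below are hypothetical stand-ins for real event-to-request mapping logic.
func watchRemoteNodes(ctx context.Context, tracker *ClusterCacheTracker, controller Watcher, cluster client.ObjectKey) error {
	return tracker.Watch(ctx, WatchInput{
		// Name deduplicates watch requests: repeated calls with the same
		// name for the same cluster are no-ops.
		Name:    "watch-nodes",
		Cluster: cluster,
		Watcher: controller,
		Kind:    &corev1.Node{},
		EventHandler: handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, _ client.Object) []ctrl.Request {
			// Map every Node event back to the owning Cluster object.
			return []ctrl.Request{{NamespacedName: cluster}}
		}),
	})
}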
// healthCheckInput provides the input for the healthCheckCluster method.
type healthCheckInput struct {
	cluster            client.ObjectKey
	httpClient         *http.Client
	cfg                *rest.Config
	interval           time.Duration
	requestTimeout     time.Duration
	unhealthyThreshold int
	path               string
}

// setDefaults sets default values if optional parameters are not set.
func (h *healthCheckInput) setDefaults() {
	if h.interval == 0 {
		h.interval = healthCheckPollInterval
	}
	if h.requestTimeout == 0 {
		h.requestTimeout = healthCheckRequestTimeout
	}
	if h.unhealthyThreshold == 0 {
		h.unhealthyThreshold = healthCheckUnhealthyThreshold
	}
	if h.path == "" {
		h.path = "/"
	}
}
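// Illustrative sketch (not part of the original source): setDefaults only
// fills fields that are still zero-valued, so explicit overrides survive.
// With the package defaults (10s poll interval, threshold of 10 consecutive
// failures), an unreachable apiserver is declared unhealthy after roughly
// 100 seconds. The function name is hypothetical.
func exampleHealthCheckDefaults() {
	in := &healthCheckInput{interval: 30 * time.Second}
	in.setDefaults()
	// in.interval stays 30s; requestTimeout becomes 5s,
	// unhealthyThreshold becomes 10, and path becomes "/".
	_ = in
}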
// healthCheckCluster will poll the cluster's API at the path given and, if there are
// `unhealthyThreshold` consecutive failures, will deem the cluster unhealthy.
// Once the cluster is deemed unhealthy, the cluster's cache is stopped and removed.
func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *healthCheckInput) {
	// Populate optional params for healthCheckInput.
	in.setDefaults()

	unhealthyCount := 0

	// This gets us a client that can make raw http(s) calls to the remote apiserver. We only need to create it once
	// and we can reuse it inside the polling loop.
	codec := runtime.NoopEncoder{Decoder: scheme.Codecs.UniversalDecoder()}
	cfg := rest.CopyConfig(in.cfg)
	cfg.NegotiatedSerializer = serializer.NegotiatedSerializerWrapper(runtime.SerializerInfo{Serializer: codec})
	restClient, restClientErr := rest.UnversionedRESTClientForConfigAndClient(cfg, in.httpClient)

	runHealthCheckWithThreshold := func(ctx context.Context) (bool, error) {
		if restClientErr != nil {
			return false, restClientErr
		}

		cluster := &clusterv1.Cluster{}
		if err := t.client.Get(ctx, in.cluster, cluster); err != nil {
			if apierrors.IsNotFound(err) {
				// If the cluster can't be found, we should delete the cache.
				return false, err
			}
			// Otherwise, requeue.
			return false, nil
		}

		if !cluster.Status.InfrastructureReady || !conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) {
			// If the infrastructure or control plane aren't marked as ready, we should requeue and wait.
			return false, nil
		}

		if _, ok := t.loadAccessor(in.cluster); !ok {
			// If there is no accessor but the cluster is locked, we're probably in the middle of the cluster accessor
			// creation and we should requeue the health check until it's done.
			if ok := t.clusterLock.TryLock(in.cluster); !ok {
				t.log.V(4).Info("Waiting for cluster to be unlocked. Requeuing health check")
				return false, nil
			}
			t.clusterLock.Unlock(in.cluster)
			// Cache for this cluster has already been cleaned up.
			// Nothing to do, so return true.
			return true, nil
		}

		// An error here means there was either an issue connecting or the API returned an error.
		// If no error occurs, reset the unhealthy counter.
		_, err := restClient.Get().AbsPath(in.path).Timeout(in.requestTimeout).DoRaw(ctx)
		if err != nil {
			if apierrors.IsUnauthorized(err) {
				// Unauthorized means that the underlying kubeconfig is not authorizing properly anymore, which
				// usually is the result of automatic kubeconfig refreshes, meaning that we have to throw away the
				// clusterAccessor and rely on the creation of a new one (with a refreshed kubeconfig).
				return false, err
			}
			unhealthyCount++
		} else {
			unhealthyCount = 0
		}

		if unhealthyCount >= in.unhealthyThreshold {
			// Cluster is now considered unhealthy.
			return false, err
		}

		return false, nil
	}

	err := wait.PollUntilContextCancel(ctx, in.interval, true, runHealthCheckWithThreshold)
	// An error returned implies the health check has failed a sufficient number of times for the cluster
	// to be considered unhealthy or the cache was stopped and thus the cache context canceled (we pass the
	// cache context into wait.PollUntilContextCancel).
	// NB. Log all errors that occurred even if this error might just be from a cancel of the cache context
	// when the cache is stopped. Logging an error in this case is not a problem and makes debugging easier.
	if err != nil {
		t.log.Error(err, "Error health checking cluster", "Cluster", klog.KRef(in.cluster.Namespace, in.cluster.Name))
	}
	// Ensure in any case that the accessor is deleted (even if it is a no-op).
	// NB. It is crucial to ensure the accessor was deleted, so it can be later recreated when the
	// cluster is reachable again.
	t.deleteAccessor(ctx, in.cluster)
}

// newClientWithTimeout returns a new client which sets the specified timeout on all Get and List calls.
// If we don't set timeouts here, Get and List calls can get stuck if they lazily create a new informer
// and the informer then doesn't sync because the workload cluster apiserver is not reachable.
// An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
func newClientWithTimeout(client client.Client, timeout time.Duration) client.Client {
	return clientWithTimeout{
		Client:  client,
		timeout: timeout,
	}
}

type clientWithTimeout struct {
	client.Client
	timeout time.Duration
}

var _ client.Client = &clientWithTimeout{}

func (c clientWithTimeout) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
	ctx, cancel := context.WithTimeout(ctx, c.timeout)
	defer cancel()
	return c.Client.Get(ctx, key, obj, opts...)
}

func (c clientWithTimeout) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
	ctx, cancel := context.WithTimeout(ctx, c.timeout)
	defer cancel()
	return c.Client.List(ctx, list, opts...)
}
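// Illustrative sketch (not part of the original source): the timeout wrapper
// composes with any controller-runtime client, so the same pattern can be
// reused outside the tracker. The function name and object key are
// hypothetical.
func exampleClientWithTimeout(ctx context.Context, inner client.Client) error {
	// Every Get/List through this client is bounded to 10 seconds, even if
	// the caller passes a context without a deadline.
	c := newClientWithTimeout(inner, 10*time.Second)

	pod := &corev1.Pod{}
	return c.Get(ctx, client.ObjectKey{Namespace: "default", Name: "example"}, pod)
}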