sigs.k8s.io/cluster-api@v1.7.1/controllers/remote/cluster_cache_tracker.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package remote

import (
	"context"
	"crypto/rsa"
	"fmt"
	"net/http"
	"os"
	"sync"
	"time"

	"github.com/go-logr/logr"
	"github.com/pkg/errors"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/serializer"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/client-go/rest"
	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/cache"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/source"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/certs"
	"sigs.k8s.io/cluster-api/util/conditions"
)

const (
	healthCheckPollInterval       = 10 * time.Second
	healthCheckRequestTimeout     = 5 * time.Second
	healthCheckUnhealthyThreshold = 10
	initialCacheSyncTimeout       = 5 * time.Minute
	clusterCacheControllerName    = "cluster-cache-tracker"
)

// ErrClusterLocked is returned in methods that require cluster-level locking
// if the cluster is already locked by another concurrent call.
var ErrClusterLocked = errors.New("cluster is locked already")

// ClusterCacheTracker manages client caches for workload clusters.
type ClusterCacheTracker struct {
	log                   logr.Logger
	clientUncachedObjects []client.Object

	client client.Client

	// secretCachingClient is a client which caches secrets.
	// If set it will be used to read the kubeconfig secret.
	// Otherwise the default client from the manager will be used.
	secretCachingClient client.Client

	scheme *runtime.Scheme

	// clusterAccessorsLock is used to lock the access to the clusterAccessors map.
	clusterAccessorsLock sync.RWMutex
	// clusterAccessors is the map of clusterAccessors by cluster.
	clusterAccessors map[client.ObjectKey]*clusterAccessor
	// clusterLock is a per-cluster lock used whenever we're locking for a specific cluster.
	// E.g. for actions like creating a client or adding watches.
	clusterLock *keyedMutex

	indexes []Index

	// controllerName is the name of the controller.
	// This is used to calculate the user agent string.
	controllerName string

	// controllerPodMetadata is the Pod metadata of the controller using this ClusterCacheTracker.
	// This is only set when the POD_NAMESPACE, POD_NAME and POD_UID environment variables are set.
	// This information will be used to detect if the controller is running on a workload cluster, so
	// that we can then access the apiserver directly.
	controllerPodMetadata *metav1.ObjectMeta
}
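// Note (illustrative, not part of the original source): controllerPodMetadata
// is populated from the POD_NAMESPACE, POD_NAME and POD_UID environment
// variables. These are typically injected into the controller's Deployment via
// the Kubernetes downward API, for example:
//
//	env:
//	- name: POD_NAMESPACE
//	  valueFrom:
//	    fieldRef:
//	      fieldPath: metadata.namespace
//	- name: POD_NAME
//	  valueFrom:
//	    fieldRef:
//	      fieldPath: metadata.name
//	- name: POD_UID
//	  valueFrom:
//	    fieldRef:
//	      fieldPath: metadata.uid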
// ClusterCacheTrackerOptions defines options to configure
// a ClusterCacheTracker.
type ClusterCacheTrackerOptions struct {
	// SecretCachingClient is a client which caches secrets.
	// If set it will be used to read the kubeconfig secret.
	// Otherwise the default client from the manager will be used.
	SecretCachingClient client.Client

	// Log is the logger used throughout the lifecycle of caches.
	// Defaults to a no-op logger if it's not set.
	Log *logr.Logger

	// ClientUncachedObjects instructs the Client to never cache the following objects;
	// it will query the API server directly instead.
	// Defaults to never caching ConfigMap and Secret if not set.
	ClientUncachedObjects []client.Object
	Indexes               []Index

	// ControllerName is the name of the controller.
	// This is used to calculate the user agent string.
	// If not set, it defaults to "cluster-cache-tracker".
	ControllerName string
}

func setDefaultOptions(opts *ClusterCacheTrackerOptions) {
	if opts.Log == nil {
		l := logr.New(log.NullLogSink{})
		opts.Log = &l
	}

	l := opts.Log.WithValues("component", "remote/clustercachetracker")
	opts.Log = &l

	if len(opts.ClientUncachedObjects) == 0 {
		opts.ClientUncachedObjects = []client.Object{
			&corev1.ConfigMap{},
			&corev1.Secret{},
		}
	}
}

// NewClusterCacheTracker creates a new ClusterCacheTracker.
func NewClusterCacheTracker(manager ctrl.Manager, options ClusterCacheTrackerOptions) (*ClusterCacheTracker, error) {
	setDefaultOptions(&options)

	controllerName := options.ControllerName
	if controllerName == "" {
		controllerName = clusterCacheControllerName
	}

	var controllerPodMetadata *metav1.ObjectMeta
	podNamespace := os.Getenv("POD_NAMESPACE")
	podName := os.Getenv("POD_NAME")
	podUID := os.Getenv("POD_UID")
	if podNamespace != "" && podName != "" && podUID != "" {
		options.Log.Info("Found controller pod metadata, the ClusterCacheTracker will try to access the cluster directly when possible")
		controllerPodMetadata = &metav1.ObjectMeta{
			Namespace: podNamespace,
			Name:      podName,
			UID:       types.UID(podUID),
		}
	} else {
		options.Log.Info("Couldn't find controller pod metadata, the ClusterCacheTracker will always access clusters using the regular apiserver endpoint")
	}

	return &ClusterCacheTracker{
		controllerName:        controllerName,
		controllerPodMetadata: controllerPodMetadata,
		log:                   *options.Log,
		clientUncachedObjects: options.ClientUncachedObjects,
		client:                manager.GetClient(),
		secretCachingClient:   options.SecretCachingClient,
		scheme:                manager.GetScheme(),
		clusterAccessors:      make(map[client.ObjectKey]*clusterAccessor),
		clusterLock:           newKeyedMutex(),
		indexes:               options.Indexes,
	}, nil
}

// GetClient returns a cached client for the given cluster.
func (t *ClusterCacheTracker) GetClient(ctx context.Context, cluster client.ObjectKey) (client.Client, error) {
	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}

	return accessor.client, nil
}
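// Illustrative usage sketch (not part of the original source). A controller
// would typically construct a single tracker from its manager and then ask it
// for workload cluster clients on demand; the function name "setupAndUse" and
// the cluster key are hypothetical.
func setupAndUse(ctx context.Context, mgr ctrl.Manager) error {
	tracker, err := NewClusterCacheTracker(mgr, ClusterCacheTrackerOptions{
		ControllerName: "example-controller", // hypothetical name
	})
	if err != nil {
		return err
	}

	// Get a cached client for the workload cluster identified by its
	// namespace/name and use it like any controller-runtime client.
	c, err := tracker.GetClient(ctx, client.ObjectKey{Namespace: "default", Name: "my-cluster"})
	if err != nil {
		return err
	}

	nodes := &corev1.NodeList{}
	return c.List(ctx, nodes)
}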
// GetReader returns a cached read-only client for the given cluster.
func (t *ClusterCacheTracker) GetReader(ctx context.Context, cluster client.ObjectKey) (client.Reader, error) {
	return t.GetClient(ctx, cluster)
}

// GetRESTConfig returns a cached REST config for the given cluster.
func (t *ClusterCacheTracker) GetRESTConfig(ctx context.Context, cluster client.ObjectKey) (*rest.Config, error) {
	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}

	return accessor.config, nil
}

// GetEtcdClientCertificateKey returns a cached certificate key to be used for generating certificates for accessing etcd in the given cluster.
func (t *ClusterCacheTracker) GetEtcdClientCertificateKey(ctx context.Context, cluster client.ObjectKey) (*rsa.PrivateKey, error) {
	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
	if err != nil {
		return nil, err
	}

	return accessor.etcdClientCertificateKey, nil
}

// clusterAccessor represents the combination of a delegating client, cache, and watches for a remote cluster.
type clusterAccessor struct {
	cache                    *stoppableCache
	client                   client.Client
	watches                  sets.Set[string]
	config                   *rest.Config
	etcdClientCertificateKey *rsa.PrivateKey
}

// clusterAccessorExists returns true if a clusterAccessor exists for cluster.
func (t *ClusterCacheTracker) clusterAccessorExists(cluster client.ObjectKey) bool {
	t.clusterAccessorsLock.RLock()
	defer t.clusterAccessorsLock.RUnlock()

	_, exists := t.clusterAccessors[cluster]
	return exists
}

// loadAccessor loads a clusterAccessor.
func (t *ClusterCacheTracker) loadAccessor(cluster client.ObjectKey) (*clusterAccessor, bool) {
	t.clusterAccessorsLock.RLock()
	defer t.clusterAccessorsLock.RUnlock()

	accessor, ok := t.clusterAccessors[cluster]
	return accessor, ok
}

// storeAccessor stores a clusterAccessor.
func (t *ClusterCacheTracker) storeAccessor(cluster client.ObjectKey, accessor *clusterAccessor) {
	t.clusterAccessorsLock.Lock()
	defer t.clusterAccessorsLock.Unlock()

	t.clusterAccessors[cluster] = accessor
}
// getClusterAccessor returns a clusterAccessor for cluster.
// It first tries to return an already-created clusterAccessor.
// It then falls back to creating a new clusterAccessor if needed.
// If there is already another goroutine trying to create a clusterAccessor
// for the same cluster, an error is returned.
func (t *ClusterCacheTracker) getClusterAccessor(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
	log := ctrl.LoggerFrom(ctx, "cluster", klog.KRef(cluster.Namespace, cluster.Name))

	// If the clusterAccessor already exists, return early.
	if accessor, ok := t.loadAccessor(cluster); ok {
		return accessor, nil
	}

	// clusterAccessor doesn't exist yet, we might have to initialize one.
	// Lock on the cluster to ensure only one clusterAccessor is initialized
	// for the cluster at the same time.
	// Return an error if another goroutine is already trying to create a clusterAccessor.
	if ok := t.clusterLock.TryLock(cluster); !ok {
		return nil, errors.Wrapf(ErrClusterLocked, "failed to create cluster accessor: failed to get lock for cluster")
	}
	defer t.clusterLock.Unlock(cluster)

	// While we were waiting for the cluster lock, a different goroutine might have already
	// initialized the clusterAccessor for this cluster successfully. If that is the case, we return it.
	if accessor, ok := t.loadAccessor(cluster); ok {
		return accessor, nil
	}

	// We are the goroutine that has to initialize the clusterAccessor.
	log.V(4).Info("Creating new cluster accessor")
	accessor, err := t.newClusterAccessor(ctx, cluster, indexes...)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create cluster accessor")
	}

	log.V(4).Info("Storing new cluster accessor")
	t.storeAccessor(cluster, accessor)
	return accessor, nil
}
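// Illustrative sketch (not part of the original source): because
// getClusterAccessor fails fast with ErrClusterLocked instead of blocking,
// callers such as reconcilers typically treat the error as a transient
// condition and requeue. The function name and requeue interval are
// hypothetical.
func reconcileWithTracker(ctx context.Context, tracker *ClusterCacheTracker, cluster client.ObjectKey) (ctrl.Result, error) {
	_, err := tracker.GetClient(ctx, cluster)
	if err != nil {
		if errors.Is(err, ErrClusterLocked) {
			// Another goroutine is creating the accessor; retry shortly.
			return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
		}
		return ctrl.Result{}, err
	}

	return ctrl.Result{}, nil
}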
// newClusterAccessor creates a new clusterAccessor.
func (t *ClusterCacheTracker) newClusterAccessor(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
	log := ctrl.LoggerFrom(ctx)

	// Get a rest config for the remote cluster.
	// Use the secretCachingClient if set.
	secretClient := t.client
	if t.secretCachingClient != nil {
		secretClient = t.secretCachingClient
	}
	config, err := RESTConfig(ctx, t.controllerName, secretClient, cluster)
	if err != nil {
		return nil, errors.Wrapf(err, "error fetching REST client config for remote cluster %q", cluster.String())
	}

	// Create a http client and a mapper for the cluster.
	httpClient, mapper, err := t.createHTTPClientAndMapper(config, cluster)
	if err != nil {
		return nil, errors.Wrapf(err, "error creating http client and mapper for remote cluster %q", cluster.String())
	}

	// Create an uncached client for the cluster.
	uncachedClient, err := t.createUncachedClient(config, cluster, httpClient, mapper)
	if err != nil {
		return nil, err
	}

	// Detect if the controller is running on the workload cluster.
	// This function uses an uncached client to ensure pods aren't cached by the long-lived client.
	runningOnCluster, err := t.runningOnWorkloadCluster(ctx, uncachedClient, cluster)
	if err != nil {
		return nil, err
	}

	// If the controller runs on the workload cluster, access the apiserver directly by using the
	// CA and Host from the in-cluster configuration.
	if runningOnCluster {
		inClusterConfig, err := ctrl.GetConfig()
		if err != nil {
			return nil, errors.Wrapf(err, "error creating client for self-hosted cluster %q", cluster.String())
		}

		// Use CA and Host from in-cluster config.
		config.CAData = nil
		config.CAFile = inClusterConfig.CAFile
		config.Host = inClusterConfig.Host

		// Update the http client and the mapper to use in-cluster config.
		httpClient, mapper, err = t.createHTTPClientAndMapper(config, cluster)
		if err != nil {
			return nil, errors.Wrapf(err, "error creating http client and mapper (using in-cluster config) for remote cluster %q", cluster.String())
		}

		log.Info(fmt.Sprintf("Creating cluster accessor for cluster %q with in-cluster service %q", cluster.String(), config.Host))
	} else {
		log.Info(fmt.Sprintf("Creating cluster accessor for cluster %q with the regular apiserver endpoint %q", cluster.String(), config.Host))
	}

	// Create a client and a cache for the cluster.
	cachedClient, err := t.createCachedClient(ctx, config, cluster, httpClient, mapper, indexes)
	if err != nil {
		return nil, err
	}

	// Generate a new private key to be used for generating temporary certificates to connect to
	// etcd on the target cluster.
	// NOTE: Generating a private key is an expensive operation, so we store it in the cluster accessor.
	etcdKey, err := certs.NewPrivateKey()
	if err != nil {
		return nil, errors.Wrapf(err, "error creating etcd client key for remote cluster %q", cluster.String())
	}

	return &clusterAccessor{
		cache:                    cachedClient.Cache,
		config:                   config,
		client:                   cachedClient.Client,
		watches:                  sets.Set[string]{},
		etcdClientCertificateKey: etcdKey,
	}, nil
}

// runningOnWorkloadCluster detects if the current controller runs on the workload cluster.
func (t *ClusterCacheTracker) runningOnWorkloadCluster(ctx context.Context, c client.Client, cluster client.ObjectKey) (bool, error) {
	// Controller Pod metadata was not found, so we can't detect if we run on the workload cluster.
	if t.controllerPodMetadata == nil {
		return false, nil
	}

	// Try to get the controller pod.
	var pod corev1.Pod
	if err := c.Get(ctx, client.ObjectKey{
		Namespace: t.controllerPodMetadata.Namespace,
		Name:      t.controllerPodMetadata.Name,
	}, &pod); err != nil {
		// If the controller pod is not found, we assume we are not running on the workload cluster.
		if apierrors.IsNotFound(err) {
			return false, nil
		}

		// If we got another error, we return the error so that this will be retried later.
		return false, errors.Wrapf(err, "error checking if we're running on workload cluster %q", cluster.String())
	}

	// If the uid is the same we found the controller pod on the workload cluster.
	return t.controllerPodMetadata.UID == pod.UID, nil
}

// createHTTPClientAndMapper creates a http client and a dynamic rest mapper for the given cluster, based on the rest.Config.
func (t *ClusterCacheTracker) createHTTPClientAndMapper(config *rest.Config, cluster client.ObjectKey) (*http.Client, meta.RESTMapper, error) {
	// Create a http client for the cluster.
	httpClient, err := rest.HTTPClientFor(config)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error creating http client", cluster.String())
	}

	// Create a mapper for it.
	mapper, err := apiutil.NewDynamicRESTMapper(config, httpClient)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error creating dynamic rest mapper", cluster.String())
	}

	// Verify that we can get a rest mapping from the workload cluster apiserver.
	// Note: This also checks if the apiserver is up in general. We do this already here
	// to avoid further effort creating a cache and a client and to produce a clearer error message.
	_, err = mapper.RESTMapping(corev1.SchemeGroupVersion.WithKind("Node").GroupKind(), corev1.SchemeGroupVersion.Version)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error getting rest mapping", cluster.String())
	}

	return httpClient, mapper, nil
}
// createUncachedClient creates an uncached client for the given cluster, based on the rest.Config.
func (t *ClusterCacheTracker) createUncachedClient(config *rest.Config, cluster client.ObjectKey, httpClient *http.Client, mapper meta.RESTMapper) (client.Client, error) {
	// Create the uncached client for the remote cluster.
	uncachedClient, err := client.New(config, client.Options{
		Scheme:     t.scheme,
		Mapper:     mapper,
		HTTPClient: httpClient,
	})
	if err != nil {
		return nil, errors.Wrapf(err, "error creating uncached client for remote cluster %q", cluster.String())
	}

	return uncachedClient, nil
}

type cachedClientOutput struct {
	Client client.Client
	Cache  *stoppableCache
}
// createCachedClient creates a cached client for the given cluster, based on a rest.Config.
func (t *ClusterCacheTracker) createCachedClient(ctx context.Context, config *rest.Config, cluster client.ObjectKey, httpClient *http.Client, mapper meta.RESTMapper, indexes []Index) (*cachedClientOutput, error) {
	// Create the cache for the remote cluster.
	cacheOptions := cache.Options{
		HTTPClient: httpClient,
		Scheme:     t.scheme,
		Mapper:     mapper,
	}
	remoteCache, err := cache.New(config, cacheOptions)
	if err != nil {
		return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q: error creating cache", cluster.String())
	}

	cacheCtx, cacheCtxCancel := context.WithCancel(ctx)

	// We need to be able to stop the cache's shared informers, so wrap this in a stoppableCache.
	cache := &stoppableCache{
		Cache:      remoteCache,
		cancelFunc: cacheCtxCancel,
	}

	for _, index := range indexes {
		if err := cache.IndexField(ctx, index.Object, index.Field, index.ExtractValue); err != nil {
			return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q: error adding index for field %q to cache", cluster.String(), index.Field)
		}
	}

	// Create the client for the remote cluster.
	cachedClient, err := client.New(config, client.Options{
		Scheme:     t.scheme,
		Mapper:     mapper,
		HTTPClient: httpClient,
		Cache: &client.CacheOptions{
			Reader:       cache,
			DisableFor:   t.clientUncachedObjects,
			Unstructured: true,
		},
	})
	if err != nil {
		return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q", cluster.String())
	}

	// Start the cache!!!
	go cache.Start(cacheCtx) //nolint:errcheck

	// Wait until the cache is initially synced.
	cacheSyncCtx, cacheSyncCtxCancel := context.WithTimeout(ctx, initialCacheSyncTimeout)
	defer cacheSyncCtxCancel()
	if !cache.WaitForCacheSync(cacheSyncCtx) {
		cache.Stop()
		return nil, fmt.Errorf("failed waiting for cache for remote cluster %v to sync: %w", cluster, cacheCtx.Err())
	}

	// Wrap the cached client with a client that sets timeouts on all Get and List calls.
	// If we don't set timeouts here, Get and List calls can get stuck if they lazily create a new informer
	// and the informer then doesn't sync because the workload cluster apiserver is not reachable.
	// An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
	// It should be reasonable to have Get and List calls time out within the duration configured in the restConfig.
	cachedClient = newClientWithTimeout(cachedClient, config.Timeout)

	// Start cluster healthcheck!!!
	go t.healthCheckCluster(cacheCtx, &healthCheckInput{
		cluster:    cluster,
		cfg:        config,
		httpClient: httpClient,
	})

	return &cachedClientOutput{
		Client: cachedClient,
		Cache:  cache,
	}, nil
}

// deleteAccessor stops a clusterAccessor's cache and removes the clusterAccessor from the tracker.
func (t *ClusterCacheTracker) deleteAccessor(_ context.Context, cluster client.ObjectKey) {
	t.clusterAccessorsLock.Lock()
	defer t.clusterAccessorsLock.Unlock()

	a, exists := t.clusterAccessors[cluster]
	if !exists {
		return
	}

	log := t.log.WithValues("Cluster", klog.KRef(cluster.Namespace, cluster.Name))
	log.V(2).Info("Deleting clusterAccessor")
	log.V(4).Info("Stopping cache")
	a.cache.Stop()
	log.V(4).Info("Cache stopped")

	delete(t.clusterAccessors, cluster)
}

// Watcher is a scoped-down interface from Controller that only knows how to watch.
type Watcher interface {
	// Watch watches src for changes, sending events to eventHandler if they pass predicates.
	Watch(src source.Source, eventHandler handler.EventHandler, predicates ...predicate.Predicate) error
}

// WatchInput specifies the parameters used to establish a new watch for a remote cluster.
type WatchInput struct {
	// Name represents a unique watch request for the specified Cluster.
	Name string

	// Cluster is the key for the remote cluster.
	Cluster client.ObjectKey

	// Watcher is the watcher (controller) whose Reconcile() function will be called for events.
	Watcher Watcher

	// Kind is the type of resource to watch.
	Kind client.Object

	// EventHandler contains the event handlers to invoke for resource events.
	EventHandler handler.EventHandler

	// Predicates is used to filter resource events.
	Predicates []predicate.Predicate
}

// Watch watches a remote cluster for resource events. If the watch already exists based on input.Name, this is a no-op.
func (t *ClusterCacheTracker) Watch(ctx context.Context, input WatchInput) error {
	if input.Name == "" {
		return errors.New("input.Name is required")
	}

	accessor, err := t.getClusterAccessor(ctx, input.Cluster, t.indexes...)
	if err != nil {
		return errors.Wrapf(err, "failed to add %T watch on cluster %s", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
	}

	// We have to lock the cluster, so that the watch is not created multiple times in parallel.
	ok := t.clusterLock.TryLock(input.Cluster)
	if !ok {
		return errors.Wrapf(ErrClusterLocked, "failed to add %T watch on cluster %s: failed to get lock for cluster", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
	}
	defer t.clusterLock.Unlock(input.Cluster)

	if accessor.watches.Has(input.Name) {
		log := ctrl.LoggerFrom(ctx)
		log.V(6).Info("Watch already exists", "Cluster", klog.KRef(input.Cluster.Namespace, input.Cluster.Name), "name", input.Name)
		return nil
	}

	// Need to create the watch.
	if err := input.Watcher.Watch(source.Kind(accessor.cache, input.Kind), input.EventHandler, input.Predicates...); err != nil {
		return errors.Wrapf(err, "failed to add %T watch on cluster %s: failed to create watch", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
	}

	accessor.watches.Insert(input.Name)

	return nil
}
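// Illustrative usage sketch (not part of the original source): a controller
// can ask the tracker to watch objects in a workload cluster and funnel the
// resulting events into its own queue. The function name and the mapping
// below are hypothetical stand-ins for real event-to-request mapping logic.
func watchRemoteNodes(ctx context.Context, tracker *ClusterCacheTracker, controller Watcher, cluster client.ObjectKey) error {
	return tracker.Watch(ctx, WatchInput{
		// Name deduplicates watch requests: repeated calls with the same
		// name for the same cluster are no-ops.
		Name:    "watch-nodes",
		Cluster: cluster,
		Watcher: controller,
		Kind:    &corev1.Node{},
		EventHandler: handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, _ client.Object) []ctrl.Request {
			// Map every Node event back to the owning Cluster object.
			return []ctrl.Request{{NamespacedName: cluster}}
		}),
	})
}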
// healthCheckInput provides the input for the healthCheckCluster method.
type healthCheckInput struct {
	cluster            client.ObjectKey
	httpClient         *http.Client
	cfg                *rest.Config
	interval           time.Duration
	requestTimeout     time.Duration
	unhealthyThreshold int
	path               string
}

// setDefaults sets default values if optional parameters are not set.
func (h *healthCheckInput) setDefaults() {
	if h.interval == 0 {
		h.interval = healthCheckPollInterval
	}
	if h.requestTimeout == 0 {
		h.requestTimeout = healthCheckRequestTimeout
	}
	if h.unhealthyThreshold == 0 {
		h.unhealthyThreshold = healthCheckUnhealthyThreshold
	}
	if h.path == "" {
		h.path = "/"
	}
}
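// Illustrative sketch (not part of the original source): setDefaults only
// fills fields that are still zero-valued, so explicit overrides survive.
// With the package defaults (10s poll interval, threshold of 10 consecutive
// failures), an unreachable apiserver is declared unhealthy after roughly
// 100 seconds. The function name is hypothetical.
func exampleHealthCheckDefaults() {
	in := &healthCheckInput{interval: 30 * time.Second}
	in.setDefaults()
	// in.interval stays 30s; requestTimeout becomes 5s,
	// unhealthyThreshold becomes 10, and path becomes "/".
	_ = in
}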
// healthCheckCluster will poll the cluster's API at the path given and, if there are
// `unhealthyThreshold` consecutive failures, will deem the cluster unhealthy.
// Once the cluster is deemed unhealthy, the cluster's cache is stopped and removed.
func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *healthCheckInput) {
	// Populate optional params for healthCheckInput.
	in.setDefaults()

	unhealthyCount := 0

	// This gets us a client that can make raw http(s) calls to the remote apiserver. We only need to create it once
	// and we can reuse it inside the polling loop.
	codec := runtime.NoopEncoder{Decoder: scheme.Codecs.UniversalDecoder()}
	cfg := rest.CopyConfig(in.cfg)
	cfg.NegotiatedSerializer = serializer.NegotiatedSerializerWrapper(runtime.SerializerInfo{Serializer: codec})
	restClient, restClientErr := rest.UnversionedRESTClientForConfigAndClient(cfg, in.httpClient)

	runHealthCheckWithThreshold := func(ctx context.Context) (bool, error) {
		if restClientErr != nil {
			return false, restClientErr
		}

		cluster := &clusterv1.Cluster{}
		if err := t.client.Get(ctx, in.cluster, cluster); err != nil {
			if apierrors.IsNotFound(err) {
				// If the cluster can't be found, we should delete the cache.
				return false, err
			}
			// Otherwise, requeue.
			return false, nil
		}

		if !cluster.Status.InfrastructureReady || !conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) {
			// If the infrastructure or control plane aren't marked as ready, we should requeue and wait.
			return false, nil
		}

		if _, ok := t.loadAccessor(in.cluster); !ok {
			// If there is no accessor but the cluster is locked, we're probably in the middle of the cluster accessor
			// creation and we should requeue the health check until it's done.
			if ok := t.clusterLock.TryLock(in.cluster); !ok {
				t.log.V(4).Info("Waiting for cluster to be unlocked. Requeuing health check")
				return false, nil
			}
			t.clusterLock.Unlock(in.cluster)
			// Cache for this cluster has already been cleaned up.
			// Nothing to do, so return true.
			return true, nil
		}

		// An error here means there was either an issue connecting or the API returned an error.
		// If no error occurs, reset the unhealthy counter.
		_, err := restClient.Get().AbsPath(in.path).Timeout(in.requestTimeout).DoRaw(ctx)
		if err != nil {
			if apierrors.IsUnauthorized(err) {
				// Unauthorized means that the underlying kubeconfig is not authorizing properly anymore, which
				// usually is the result of automatic kubeconfig refreshes, meaning that we have to throw away the
				// clusterAccessor and rely on the creation of a new one (with a refreshed kubeconfig).
				return false, err
			}
			unhealthyCount++
		} else {
			unhealthyCount = 0
		}

		if unhealthyCount >= in.unhealthyThreshold {
			// Cluster is now considered unhealthy.
			return false, err
		}

		return false, nil
	}

	err := wait.PollUntilContextCancel(ctx, in.interval, true, runHealthCheckWithThreshold)
	// An error returned implies the health check has failed a sufficient number of times for the cluster
	// to be considered unhealthy or the cache was stopped and thus the cache context canceled (we pass the
	// cache context into wait.PollUntilContextCancel).
	// NB. Log all errors that occurred even if this error might just be from a cancel of the cache context
	// when the cache is stopped. Logging an error in this case is not a problem and makes debugging easier.
	if err != nil {
		t.log.Error(err, "Error health checking cluster", "Cluster", klog.KRef(in.cluster.Namespace, in.cluster.Name))
	}
	// Ensure in any case that the accessor is deleted (even if it is a no-op).
	// NB. It is crucial to ensure the accessor was deleted, so it can be later recreated when the
	// cluster is reachable again.
	t.deleteAccessor(ctx, in.cluster)
}

// newClientWithTimeout returns a new client which sets the specified timeout on all Get and List calls.
// If we don't set timeouts here, Get and List calls can get stuck if they lazily create a new informer
// and the informer then doesn't sync because the workload cluster apiserver is not reachable.
// An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
func newClientWithTimeout(client client.Client, timeout time.Duration) client.Client {
	return clientWithTimeout{
		Client:  client,
		timeout: timeout,
	}
}

type clientWithTimeout struct {
	client.Client
	timeout time.Duration
}

var _ client.Client = &clientWithTimeout{}

func (c clientWithTimeout) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
	ctx, cancel := context.WithTimeout(ctx, c.timeout)
	defer cancel()
	return c.Client.Get(ctx, key, obj, opts...)
}

func (c clientWithTimeout) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
	ctx, cancel := context.WithTimeout(ctx, c.timeout)
	defer cancel()
	return c.Client.List(ctx, list, opts...)
}
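// Illustrative sketch (not part of the original source): the timeout wrapper
// composes with any controller-runtime client, so the same pattern can be
// reused outside the tracker. The function name and object key are
// hypothetical.
func exampleClientWithTimeout(ctx context.Context, inner client.Client) error {
	// Every Get/List through this client is bounded to 10 seconds, even if
	// the caller passes a context without a deadline.
	c := newClientWithTimeout(inner, 10*time.Second)

	pod := &corev1.Pod{}
	return c.Get(ctx, client.ObjectKey{Namespace: "default", Name: "example"}, pod)
}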