sigs.k8s.io/cluster-api@v1.6.3/controllers/remote/cluster_cache_tracker.go

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package remote
    18  
    19  import (
    20  	"context"
    21  	"crypto/rsa"
    22  	"fmt"
    23  	"net/http"
    24  	"os"
    25  	"sync"
    26  	"time"
    27  
    28  	"github.com/go-logr/logr"
    29  	"github.com/pkg/errors"
    30  	corev1 "k8s.io/api/core/v1"
    31  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    32  	"k8s.io/apimachinery/pkg/api/meta"
    33  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    34  	"k8s.io/apimachinery/pkg/runtime"
    35  	"k8s.io/apimachinery/pkg/runtime/serializer"
    36  	"k8s.io/apimachinery/pkg/types"
    37  	"k8s.io/apimachinery/pkg/util/sets"
    38  	"k8s.io/apimachinery/pkg/util/wait"
    39  	"k8s.io/client-go/kubernetes/scheme"
    40  	"k8s.io/client-go/rest"
    41  	"k8s.io/klog/v2"
    42  	ctrl "sigs.k8s.io/controller-runtime"
    43  	"sigs.k8s.io/controller-runtime/pkg/cache"
    44  	"sigs.k8s.io/controller-runtime/pkg/client"
    45  	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
    46  	"sigs.k8s.io/controller-runtime/pkg/handler"
    47  	"sigs.k8s.io/controller-runtime/pkg/log"
    48  	"sigs.k8s.io/controller-runtime/pkg/predicate"
    49  	"sigs.k8s.io/controller-runtime/pkg/source"
    50  
    51  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    52  	"sigs.k8s.io/cluster-api/util/certs"
    53  	"sigs.k8s.io/cluster-api/util/conditions"
    54  )
    55  
    56  const (
    57  	healthCheckPollInterval       = 10 * time.Second
    58  	healthCheckRequestTimeout     = 5 * time.Second
    59  	healthCheckUnhealthyThreshold = 10
    60  	initialCacheSyncTimeout       = 5 * time.Minute
    61  	clusterCacheControllerName    = "cluster-cache-tracker"
    62  )
    63  
    64  // ErrClusterLocked is returned in methods that require cluster-level locking
    65  // if the cluster is already locked by another concurrent call.
    66  var ErrClusterLocked = errors.New("cluster is locked already")
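
        // exampleHandleClusterLocked is an illustrative sketch, not part of the upstream file:
        // it shows how callers are expected to treat ErrClusterLocked as a transient condition
        // and requeue instead of surfacing a failure. The tracker parameter and the requeue
        // interval are assumptions chosen only for illustration.
        func exampleHandleClusterLocked(ctx context.Context, tracker *ClusterCacheTracker, cluster client.ObjectKey) (ctrl.Result, error) {
        	if _, err := tracker.GetClient(ctx, cluster); err != nil {
        		if errors.Is(err, ErrClusterLocked) {
        			// Another goroutine is currently creating the accessor for this cluster; retry shortly.
        			return ctrl.Result{RequeueAfter: 20 * time.Second}, nil
        		}
        		return ctrl.Result{}, err
        	}
        	return ctrl.Result{}, nil
        }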
    67  
    68  // ClusterCacheTracker manages client caches for workload clusters.
    69  type ClusterCacheTracker struct {
    70  	log                   logr.Logger
    71  	clientUncachedObjects []client.Object
    72  
    73  	client client.Client
    74  
    75  	// secretCachingClient is a client which caches secrets.
    76  	// If set it will be used to read the kubeconfig secret.
    77  	// Otherwise the default client from the manager will be used.
    78  	secretCachingClient client.Client
    79  
    80  	scheme *runtime.Scheme
    81  
    82  	// clusterAccessorsLock is used to lock the access to the clusterAccessors map.
    83  	clusterAccessorsLock sync.RWMutex
    84  	// clusterAccessors is the map of clusterAccessors by cluster.
    85  	clusterAccessors map[client.ObjectKey]*clusterAccessor
    86  	// clusterLock is a per-cluster lock used whenever we're locking for a specific cluster,
    87  	// e.g. for actions like creating a client or adding watches.
    88  	clusterLock *keyedMutex
    89  
    90  	indexes []Index
    91  
    92  	// controllerName is the name of the controller.
    93  	// This is used to calculate the user agent string.
    94  	controllerName string
    95  
    96  	// controllerPodMetadata is the Pod metadata of the controller using this ClusterCacheTracker.
    97  	// This is only set when the POD_NAMESPACE, POD_NAME and POD_UID environment variables are set.
    98  	// This information will be used to detect if the controller is running on a workload cluster, so
    99  	// that we can then access the apiserver directly.
   100  	controllerPodMetadata *metav1.ObjectMeta
   101  }
   102  
   103  // ClusterCacheTrackerOptions defines options to configure
   104  // a ClusterCacheTracker.
   105  type ClusterCacheTrackerOptions struct {
   106  	// SecretCachingClient is a client which caches secrets.
   107  	// If set it will be used to read the kubeconfig secret.
   108  	// Otherwise the default client from the manager will be used.
   109  	SecretCachingClient client.Client
   110  
   111  	// Log is the logger used throughout the lifecycle of caches.
   112  	// Defaults to a no-op logger if it's not set.
   113  	Log *logr.Logger
   114  
   115  	// ClientUncachedObjects instructs the Client to never cache the following objects;
   116  	// it will query the API server directly for them instead.
   117  	// Defaults to never caching ConfigMap and Secret if not set.
   118  	ClientUncachedObjects []client.Object
   119  	Indexes               []Index
   120  
   121  	// ControllerName is the name of the controller.
   122  	// This is used to calculate the user agent string.
   123  	// If not set, it defaults to "cluster-cache-tracker".
   124  	ControllerName string
   125  }
   126  
   127  func setDefaultOptions(opts *ClusterCacheTrackerOptions) {
   128  	if opts.Log == nil {
   129  		l := logr.New(log.NullLogSink{})
   130  		opts.Log = &l
   131  	}
   132  
   133  	l := opts.Log.WithValues("component", "remote/clustercachetracker")
   134  	opts.Log = &l
   135  
   136  	if len(opts.ClientUncachedObjects) == 0 {
   137  		opts.ClientUncachedObjects = []client.Object{
   138  			&corev1.ConfigMap{},
   139  			&corev1.Secret{},
   140  		}
   141  	}
   142  }
   143  
   144  // NewClusterCacheTracker creates a new ClusterCacheTracker.
   145  func NewClusterCacheTracker(manager ctrl.Manager, options ClusterCacheTrackerOptions) (*ClusterCacheTracker, error) {
   146  	setDefaultOptions(&options)
   147  
   148  	controllerName := options.ControllerName
   149  	if controllerName == "" {
   150  		controllerName = clusterCacheControllerName
   151  	}
   152  
   153  	var controllerPodMetadata *metav1.ObjectMeta
   154  	podNamespace := os.Getenv("POD_NAMESPACE")
   155  	podName := os.Getenv("POD_NAME")
   156  	podUID := os.Getenv("POD_UID")
   157  	if podNamespace != "" && podName != "" && podUID != "" {
   158  		options.Log.Info("Found controller pod metadata, the ClusterCacheTracker will try to access the cluster directly when possible")
   159  		controllerPodMetadata = &metav1.ObjectMeta{
   160  			Namespace: podNamespace,
   161  			Name:      podName,
   162  			UID:       types.UID(podUID),
   163  		}
   164  	} else {
   165  		options.Log.Info("Couldn't find controller pod metadata, the ClusterCacheTracker will always access clusters using the regular apiserver endpoint")
   166  	}
   167  
   168  	return &ClusterCacheTracker{
   169  		controllerName:        controllerName,
   170  		controllerPodMetadata: controllerPodMetadata,
   171  		log:                   *options.Log,
   172  		clientUncachedObjects: options.ClientUncachedObjects,
   173  		client:                manager.GetClient(),
   174  		secretCachingClient:   options.SecretCachingClient,
   175  		scheme:                manager.GetScheme(),
   176  		clusterAccessors:      make(map[client.ObjectKey]*clusterAccessor),
   177  		clusterLock:           newKeyedMutex(),
   178  		indexes:               options.Indexes,
   179  	}, nil
   180  }
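
        // exampleNewTracker is an illustrative sketch, not part of the upstream file: it shows
        // one plausible way to construct a ClusterCacheTracker from a controller-runtime manager.
        // The controller name is a hypothetical value; if ControllerName is left empty it defaults
        // to "cluster-cache-tracker", and leaving SecretCachingClient unset means the kubeconfig
        // secret is read through the manager's default client.
        func exampleNewTracker(mgr ctrl.Manager) (*ClusterCacheTracker, error) {
        	return NewClusterCacheTracker(mgr, ClusterCacheTrackerOptions{
        		ControllerName: "example-controller", // hypothetical name
        	})
        }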
   181  
   182  // GetClient returns a cached client for the given cluster.
   183  func (t *ClusterCacheTracker) GetClient(ctx context.Context, cluster client.ObjectKey) (client.Client, error) {
   184  	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
   185  	if err != nil {
   186  		return nil, err
   187  	}
   188  
   189  	return accessor.client, nil
   190  }
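
        // exampleListNodes is an illustrative sketch, not part of the upstream file: it shows the
        // usual pattern of resolving a workload-cluster client via GetClient and reading through
        // its cache. The cluster key is assumed to come from the Cluster object being reconciled.
        func exampleListNodes(ctx context.Context, tracker *ClusterCacheTracker, cluster client.ObjectKey) (*corev1.NodeList, error) {
        	remoteClient, err := tracker.GetClient(ctx, cluster)
        	if err != nil {
        		// This may be ErrClusterLocked while another goroutine is creating the accessor;
        		// callers usually requeue in that case.
        		return nil, err
        	}
        	nodeList := &corev1.NodeList{}
        	if err := remoteClient.List(ctx, nodeList); err != nil {
        		return nil, err
        	}
        	return nodeList, nil
        }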
   191  
   192  // GetRESTConfig returns a cached REST config for the given cluster.
   193  func (t *ClusterCacheTracker) GetRESTConfig(ctx context.Context, cluster client.ObjectKey) (*rest.Config, error) {
   194  	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
   195  	if err != nil {
   196  		return nil, err
   197  	}
   198  
   199  	return accessor.config, nil
   200  }
   201  
   202  // GetEtcdClientCertificateKey returns a cached certificate key to be used for generating certificates for accessing etcd in the given cluster.
   203  func (t *ClusterCacheTracker) GetEtcdClientCertificateKey(ctx context.Context, cluster client.ObjectKey) (*rsa.PrivateKey, error) {
   204  	accessor, err := t.getClusterAccessor(ctx, cluster, t.indexes...)
   205  	if err != nil {
   206  		return nil, err
   207  	}
   208  
   209  	return accessor.etcdClientCertificateKey, nil
   210  }
   211  
   212  // clusterAccessor represents the combination of a delegating client, cache, and watches for a remote cluster.
   213  type clusterAccessor struct {
   214  	cache                    *stoppableCache
   215  	client                   client.Client
   216  	watches                  sets.Set[string]
   217  	config                   *rest.Config
   218  	etcdClientCertificateKey *rsa.PrivateKey
   219  }
   220  
   221  // clusterAccessorExists returns true if a clusterAccessor exists for cluster.
   222  func (t *ClusterCacheTracker) clusterAccessorExists(cluster client.ObjectKey) bool {
   223  	t.clusterAccessorsLock.RLock()
   224  	defer t.clusterAccessorsLock.RUnlock()
   225  
   226  	_, exists := t.clusterAccessors[cluster]
   227  	return exists
   228  }
   229  
   230  // loadAccessor loads a clusterAccessor.
   231  func (t *ClusterCacheTracker) loadAccessor(cluster client.ObjectKey) (*clusterAccessor, bool) {
   232  	t.clusterAccessorsLock.RLock()
   233  	defer t.clusterAccessorsLock.RUnlock()
   234  
   235  	accessor, ok := t.clusterAccessors[cluster]
   236  	return accessor, ok
   237  }
   238  
   239  // storeAccessor stores a clusterAccessor.
   240  func (t *ClusterCacheTracker) storeAccessor(cluster client.ObjectKey, accessor *clusterAccessor) {
   241  	t.clusterAccessorsLock.Lock()
   242  	defer t.clusterAccessorsLock.Unlock()
   243  
   244  	t.clusterAccessors[cluster] = accessor
   245  }
   246  
   247  // getClusterAccessor returns a clusterAccessor for cluster.
   248  // It first tries to return an already-created clusterAccessor.
   249  // It then falls back to creating a new clusterAccessor if needed.
   250  // If there is already another goroutine trying to create a clusterAccessor
   251  // for the same cluster, an error is returned.
   252  func (t *ClusterCacheTracker) getClusterAccessor(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
   253  	log := ctrl.LoggerFrom(ctx, "cluster", klog.KRef(cluster.Namespace, cluster.Name))
   254  
   255  	// If the clusterAccessor already exists, return early.
   256  	if accessor, ok := t.loadAccessor(cluster); ok {
   257  		return accessor, nil
   258  	}
   259  
   260  	// The clusterAccessor doesn't exist yet, so we might have to initialize one.
   261  	// Lock on the cluster to ensure only one clusterAccessor is initialized
   262  	// for the cluster at a time.
   263  	// Return an error if another goroutine is already trying to create a clusterAccessor.
   264  	if ok := t.clusterLock.TryLock(cluster); !ok {
   265  		return nil, errors.Wrapf(ErrClusterLocked, "failed to create cluster accessor: failed to get lock for cluster")
   266  	}
   267  	defer t.clusterLock.Unlock(cluster)
   268  
   269  	// While we were waiting for the cluster lock, a different goroutine might have already initialized
   270  	// the clusterAccessor for this cluster successfully. If this is the case, we return it.
   271  	if accessor, ok := t.loadAccessor(cluster); ok {
   272  		return accessor, nil
   273  	}
   274  
   275  	// We are the goroutine that has to initialize the clusterAccessor.
   276  	log.V(4).Info("Creating new cluster accessor")
   277  	accessor, err := t.newClusterAccessor(ctx, cluster, indexes...)
   278  	if err != nil {
   279  		return nil, errors.Wrap(err, "failed to create cluster accessor")
   280  	}
   281  
   282  	log.V(4).Info("Storing new cluster accessor")
   283  	t.storeAccessor(cluster, accessor)
   284  	return accessor, nil
   285  }
   286  
   287  // newClusterAccessor creates a new clusterAccessor.
   288  func (t *ClusterCacheTracker) newClusterAccessor(ctx context.Context, cluster client.ObjectKey, indexes ...Index) (*clusterAccessor, error) {
   289  	log := ctrl.LoggerFrom(ctx)
   290  
   291  	// Get a rest config for the remote cluster.
   292  	// Use the secretCachingClient if set.
   293  	secretClient := t.client
   294  	if t.secretCachingClient != nil {
   295  		secretClient = t.secretCachingClient
   296  	}
   297  	config, err := RESTConfig(ctx, t.controllerName, secretClient, cluster)
   298  	if err != nil {
   299  		return nil, errors.Wrapf(err, "error fetching REST client config for remote cluster %q", cluster.String())
   300  	}
   301  
   302  	// Create an HTTP client and a mapper for the cluster.
   303  	httpClient, mapper, err := t.createHTTPClientAndMapper(config, cluster)
   304  	if err != nil {
   305  		return nil, errors.Wrapf(err, "error creating http client and mapper for remote cluster %q", cluster.String())
   306  	}
   307  
   308  	// Create an uncached client for the cluster.
   309  	uncachedClient, err := t.createUncachedClient(config, cluster, httpClient, mapper)
   310  	if err != nil {
   311  		return nil, err
   312  	}
   313  
   314  	// Detect if the controller is running on the workload cluster.
   315  	// This function uses an uncached client to ensure pods aren't cached by the long-lived client.
   316  	runningOnCluster, err := t.runningOnWorkloadCluster(ctx, uncachedClient, cluster)
   317  	if err != nil {
   318  		return nil, err
   319  	}
   320  
   321  	// If the controller runs on the workload cluster, access the apiserver directly by using the
   322  	// CA and Host from the in-cluster configuration.
   323  	if runningOnCluster {
   324  		inClusterConfig, err := ctrl.GetConfig()
   325  		if err != nil {
   326  			return nil, errors.Wrapf(err, "error creating client for self-hosted cluster %q", cluster.String())
   327  		}
   328  
   329  		// Use CA and Host from in-cluster config.
   330  		config.CAData = nil
   331  		config.CAFile = inClusterConfig.CAFile
   332  		config.Host = inClusterConfig.Host
   333  
   334  		// Update the http client and the mapper to use in-cluster config.
   335  		httpClient, mapper, err = t.createHTTPClientAndMapper(config, cluster)
   336  		if err != nil {
   337  			return nil, errors.Wrapf(err, "error creating http client and mapper (using in-cluster config) for remote cluster %q", cluster.String())
   338  		}
   339  
   340  		log.Info(fmt.Sprintf("Creating cluster accessor for cluster %q with in-cluster service %q", cluster.String(), config.Host))
   341  	} else {
   342  		log.Info(fmt.Sprintf("Creating cluster accessor for cluster %q with the regular apiserver endpoint %q", cluster.String(), config.Host))
   343  	}
   344  
   345  	// Create a client and a cache for the cluster.
   346  	cachedClient, err := t.createCachedClient(ctx, config, cluster, httpClient, mapper, indexes)
   347  	if err != nil {
   348  		return nil, err
   349  	}
   350  
   351  	// Generate a new private key to be used for generating temporary certificates to connect to
   352  	// etcd on the target cluster.
   353  	// NOTE: Generating a private key is an expensive operation, so we store it in the cluster accessor.
   354  	etcdKey, err := certs.NewPrivateKey()
   355  	if err != nil {
   356  		return nil, errors.Wrapf(err, "error creating etcd client key for remote cluster %q", cluster.String())
   357  	}
   358  
   359  	return &clusterAccessor{
   360  		cache:                    cachedClient.Cache,
   361  		config:                   config,
   362  		client:                   cachedClient.Client,
   363  		watches:                  sets.Set[string]{},
   364  		etcdClientCertificateKey: etcdKey,
   365  	}, nil
   366  }
   367  
   368  // runningOnWorkloadCluster detects if the current controller runs on the workload cluster.
   369  func (t *ClusterCacheTracker) runningOnWorkloadCluster(ctx context.Context, c client.Client, cluster client.ObjectKey) (bool, error) {
   370  	// Controller Pod metadata was not found, so we can't detect if we run on the workload cluster.
   371  	if t.controllerPodMetadata == nil {
   372  		return false, nil
   373  	}
   374  
   375  	// Try to get the controller pod.
   376  	var pod corev1.Pod
   377  	if err := c.Get(ctx, client.ObjectKey{
   378  		Namespace: t.controllerPodMetadata.Namespace,
   379  		Name:      t.controllerPodMetadata.Name,
   380  	}, &pod); err != nil {
   381  		// If the controller pod is not found, we assume we are not running on the workload cluster.
   382  		if apierrors.IsNotFound(err) {
   383  			return false, nil
   384  		}
   385  
   386  		// If we got another error, we return the error so that this will be retried later.
   387  		return false, errors.Wrapf(err, "error checking if we're running on workload cluster %q", cluster.String())
   388  	}
   389  
   390  	// If the UID is the same, we found the controller pod on the workload cluster.
   391  	return t.controllerPodMetadata.UID == pod.UID, nil
   392  }
   393  
   394  // createHTTPClientAndMapper creates an HTTP client and a dynamic REST mapper for the given cluster, based on the rest.Config.
   395  func (t *ClusterCacheTracker) createHTTPClientAndMapper(config *rest.Config, cluster client.ObjectKey) (*http.Client, meta.RESTMapper, error) {
   396  	// Create an HTTP client for the cluster.
   397  	httpClient, err := rest.HTTPClientFor(config)
   398  	if err != nil {
   399  		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error creating http client", cluster.String())
   400  	}
   401  
   402  	// Create a mapper for it
   403  	mapper, err := apiutil.NewDynamicRESTMapper(config, httpClient)
   404  	if err != nil {
   405  		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error creating dynamic rest mapper", cluster.String())
   406  	}
   407  
   408  	// Verify that we can get a REST mapping from the workload cluster apiserver.
   409  	// Note: This also checks whether the apiserver is up in general. We do this here already
   410  	// to avoid the further effort of creating a cache and a client, and to produce a clearer error message.
   411  	_, err = mapper.RESTMapping(corev1.SchemeGroupVersion.WithKind("Node").GroupKind(), corev1.SchemeGroupVersion.Version)
   412  	if err != nil {
   413  		return nil, nil, errors.Wrapf(err, "error creating client for remote cluster %q: error getting rest mapping", cluster.String())
   414  	}
   415  
   416  	return httpClient, mapper, nil
   417  }
   418  
   419  // createUncachedClient creates an uncached client for the given cluster, based on the rest.Config.
   420  func (t *ClusterCacheTracker) createUncachedClient(config *rest.Config, cluster client.ObjectKey, httpClient *http.Client, mapper meta.RESTMapper) (client.Client, error) {
   421  	// Create the uncached client for the remote cluster
   422  	uncachedClient, err := client.New(config, client.Options{
   423  		Scheme:     t.scheme,
   424  		Mapper:     mapper,
   425  		HTTPClient: httpClient,
   426  	})
   427  	if err != nil {
   428  		return nil, errors.Wrapf(err, "error creating uncached client for remote cluster %q", cluster.String())
   429  	}
   430  
   431  	return uncachedClient, nil
   432  }
   433  
   434  type cachedClientOutput struct {
   435  	Client client.Client
   436  	Cache  *stoppableCache
   437  }
   438  
   439  // createCachedClient creates a cached client for the given cluster, based on a rest.Config.
   440  func (t *ClusterCacheTracker) createCachedClient(ctx context.Context, config *rest.Config, cluster client.ObjectKey, httpClient *http.Client, mapper meta.RESTMapper, indexes []Index) (*cachedClientOutput, error) {
   441  	// Create the cache for the remote cluster
   442  	cacheOptions := cache.Options{
   443  		HTTPClient: httpClient,
   444  		Scheme:     t.scheme,
   445  		Mapper:     mapper,
   446  	}
   447  	remoteCache, err := cache.New(config, cacheOptions)
   448  	if err != nil {
   449  		return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q: error creating cache", cluster.String())
   450  	}
   451  
   452  	cacheCtx, cacheCtxCancel := context.WithCancel(ctx)
   453  
   454  	// We need to be able to stop the cache's shared informers, so wrap this in a stoppableCache.
   455  	cache := &stoppableCache{
   456  		Cache:      remoteCache,
   457  		cancelFunc: cacheCtxCancel,
   458  	}
   459  
   460  	for _, index := range indexes {
   461  		if err := cache.IndexField(ctx, index.Object, index.Field, index.ExtractValue); err != nil {
   462  			return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q: error adding index for field %q to cache", cluster.String(), index.Field)
   463  		}
   464  	}
   465  
   466  	// Create the client for the remote cluster
   467  	cachedClient, err := client.New(config, client.Options{
   468  		Scheme:     t.scheme,
   469  		Mapper:     mapper,
   470  		HTTPClient: httpClient,
   471  		Cache: &client.CacheOptions{
   472  			Reader:       cache,
   473  			DisableFor:   t.clientUncachedObjects,
   474  			Unstructured: true,
   475  		},
   476  	})
   477  	if err != nil {
   478  		return nil, errors.Wrapf(err, "error creating cached client for remote cluster %q", cluster.String())
   479  	}
   480  
   481  	// Start the cache.
   482  	go cache.Start(cacheCtx) //nolint:errcheck
   483  
   484  	// Wait until the cache is initially synced
   485  	cacheSyncCtx, cacheSyncCtxCancel := context.WithTimeout(ctx, initialCacheSyncTimeout)
   486  	defer cacheSyncCtxCancel()
   487  	if !cache.WaitForCacheSync(cacheSyncCtx) {
   488  		cache.Stop()
   489  		return nil, fmt.Errorf("failed waiting for cache for remote cluster %v to sync: %w", cluster, cacheCtx.Err())
   490  	}
   491  
   492  	// Wrap the cached client with a client that sets timeouts on all Get and List calls.
   493  	// If we don't set timeouts here, Get and List calls can get stuck if they lazily create a new informer
   494  	// and the informer then doesn't sync because the workload cluster apiserver is not reachable.
   495  	// An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
   496  	// It should be reasonable to have Get and List calls time out within the duration configured in the restConfig.
   497  	cachedClient = newClientWithTimeout(cachedClient, config.Timeout)
   498  
   499  	// Start the cluster health check.
   500  	go t.healthCheckCluster(cacheCtx, &healthCheckInput{
   501  		cluster:    cluster,
   502  		cfg:        config,
   503  		httpClient: httpClient,
   504  	})
   505  
   506  	return &cachedClientOutput{
   507  		Client: cachedClient,
   508  		Cache:  cache,
   509  	}, nil
   510  }
   511  
   512  // deleteAccessor stops a clusterAccessor's cache and removes the clusterAccessor from the tracker.
   513  func (t *ClusterCacheTracker) deleteAccessor(_ context.Context, cluster client.ObjectKey) {
   514  	t.clusterAccessorsLock.Lock()
   515  	defer t.clusterAccessorsLock.Unlock()
   516  
   517  	a, exists := t.clusterAccessors[cluster]
   518  	if !exists {
   519  		return
   520  	}
   521  
   522  	log := t.log.WithValues("Cluster", klog.KRef(cluster.Namespace, cluster.Name))
   523  	log.V(2).Info("Deleting clusterAccessor")
   524  	log.V(4).Info("Stopping cache")
   525  	a.cache.Stop()
   526  	log.V(4).Info("Cache stopped")
   527  
   528  	delete(t.clusterAccessors, cluster)
   529  }
   530  
   531  // Watcher is a scoped-down interface from Controller that only knows how to watch.
   532  type Watcher interface {
   533  	// Watch watches src for changes, sending events to eventHandler if they pass predicates.
   534  	Watch(src source.Source, eventHandler handler.EventHandler, predicates ...predicate.Predicate) error
   535  }
   536  
   537  // WatchInput specifies the parameters used to establish a new watch for a remote cluster.
   538  type WatchInput struct {
   539  	// Name represents a unique watch request for the specified Cluster.
   540  	Name string
   541  
   542  	// Cluster is the key for the remote cluster.
   543  	Cluster client.ObjectKey
   544  
   545  	// Watcher is the watcher (controller) whose Reconcile() function will be called for events.
   546  	Watcher Watcher
   547  
   548  	// Kind is the type of resource to watch.
   549  	Kind client.Object
   550  
   551  	// EventHandler contains the event handlers to invoke for resource events.
   552  	EventHandler handler.EventHandler
   553  
   554  	// Predicates is used to filter resource events.
   555  	Predicates []predicate.Predicate
   556  }
   557  
   558  // Watch watches a remote cluster for resource events. If the watch already exists based on input.Name, this is a no-op.
   559  func (t *ClusterCacheTracker) Watch(ctx context.Context, input WatchInput) error {
   560  	if input.Name == "" {
   561  		return errors.New("input.Name is required")
   562  	}
   563  
   564  	accessor, err := t.getClusterAccessor(ctx, input.Cluster, t.indexes...)
   565  	if err != nil {
   566  		return errors.Wrapf(err, "failed to add %s watch on cluster %s", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
   567  	}
   568  
   569  	// We have to lock the cluster, so that the watch is not created multiple times in parallel.
   570  	ok := t.clusterLock.TryLock(input.Cluster)
   571  	if !ok {
   572  		return errors.Wrapf(ErrClusterLocked, "failed to add %T watch on cluster %s: failed to get lock for cluster", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
   573  	}
   574  	defer t.clusterLock.Unlock(input.Cluster)
   575  
   576  	if accessor.watches.Has(input.Name) {
   577  		log := ctrl.LoggerFrom(ctx)
   578  		log.V(6).Info("Watch already exists", "Cluster", klog.KRef(input.Cluster.Namespace, input.Cluster.Name), "name", input.Name)
   579  		return nil
   580  	}
   581  
   582  	// Need to create the watch
   583  	if err := input.Watcher.Watch(source.Kind(accessor.cache, input.Kind), input.EventHandler, input.Predicates...); err != nil {
   584  		return errors.Wrapf(err, "failed to add %s watch on cluster %s: failed to create watch", input.Kind, klog.KRef(input.Cluster.Namespace, input.Cluster.Name))
   585  	}
   586  
   587  	accessor.watches.Insert(input.Name)
   588  
   589  	return nil
   590  }
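
        // exampleWatchNodes is an illustrative sketch, not part of the upstream file: it shows how
        // a controller might register a named watch for Node events on a workload cluster. The watch
        // name, the watcher, and the map function are illustrative assumptions; reusing the same
        // Name on subsequent reconciles makes the call a no-op.
        func exampleWatchNodes(ctx context.Context, tracker *ClusterCacheTracker, cluster client.ObjectKey, watcher Watcher, mapFn handler.MapFunc) error {
        	return tracker.Watch(ctx, WatchInput{
        		Name:         "example-watch-nodes", // hypothetical, unique per watcher and purpose
        		Cluster:      cluster,
        		Watcher:      watcher,
        		Kind:         &corev1.Node{},
        		EventHandler: handler.EnqueueRequestsFromMapFunc(mapFn),
        	})
        }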
   591  
   592  // healthCheckInput provides the input for the healthCheckCluster method.
   593  type healthCheckInput struct {
   594  	cluster            client.ObjectKey
   595  	httpClient         *http.Client
   596  	cfg                *rest.Config
   597  	interval           time.Duration
   598  	requestTimeout     time.Duration
   599  	unhealthyThreshold int
   600  	path               string
   601  }
   602  
   603  // setDefaults sets default values if optional parameters are not set.
   604  func (h *healthCheckInput) setDefaults() {
   605  	if h.interval == 0 {
   606  		h.interval = healthCheckPollInterval
   607  	}
   608  	if h.requestTimeout == 0 {
   609  		h.requestTimeout = healthCheckRequestTimeout
   610  	}
   611  	if h.unhealthyThreshold == 0 {
   612  		h.unhealthyThreshold = healthCheckUnhealthyThreshold
   613  	}
   614  	if h.path == "" {
   615  		h.path = "/"
   616  	}
   617  }
   618  
   619  // healthCheckCluster will poll the cluster's API at the path given and, if there are
   620  // `unhealthyThreshold` consecutive failures, will deem the cluster unhealthy.
   621  // Once the cluster is deemed unhealthy, the cluster's cache is stopped and removed.
   622  func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *healthCheckInput) {
   623  	// populate optional params for healthCheckInput
   624  	in.setDefaults()
   625  
   626  	unhealthyCount := 0
   627  
   628  	// This gets us a client that can make raw http(s) calls to the remote apiserver. We only need to create it once
   629  	// and we can reuse it inside the polling loop.
   630  	codec := runtime.NoopEncoder{Decoder: scheme.Codecs.UniversalDecoder()}
   631  	cfg := rest.CopyConfig(in.cfg)
   632  	cfg.NegotiatedSerializer = serializer.NegotiatedSerializerWrapper(runtime.SerializerInfo{Serializer: codec})
   633  	restClient, restClientErr := rest.UnversionedRESTClientForConfigAndClient(cfg, in.httpClient)
   634  
   635  	runHealthCheckWithThreshold := func(ctx context.Context) (bool, error) {
   636  		if restClientErr != nil {
   637  			return false, restClientErr
   638  		}
   639  
   640  		cluster := &clusterv1.Cluster{}
   641  		if err := t.client.Get(ctx, in.cluster, cluster); err != nil {
   642  			if apierrors.IsNotFound(err) {
   643  				// If the cluster can't be found, we should delete the cache.
   644  				return false, err
   645  			}
   646  			// Otherwise, requeue.
   647  			return false, nil
   648  		}
   649  
   650  		if !cluster.Status.InfrastructureReady || !conditions.IsTrue(cluster, clusterv1.ControlPlaneInitializedCondition) {
   651  			// If the infrastructure or control plane aren't marked as ready, we should requeue and wait.
   652  			return false, nil
   653  		}
   654  
   655  		if _, ok := t.loadAccessor(in.cluster); !ok {
   656  			// If there is no accessor but the cluster is locked, we're probably in the middle of the cluster accessor
   657  			// creation and we should requeue the health check until it's done.
   658  			if ok := t.clusterLock.TryLock(in.cluster); !ok {
   659  				t.log.V(4).Info("Waiting for cluster to be unlocked. Requeuing health check")
   660  				return false, nil
   661  			}
   662  			t.clusterLock.Unlock(in.cluster)
   663  			// Cache for this cluster has already been cleaned up.
   664  			// Nothing to do, so return true.
   665  			return true, nil
   666  		}
   667  
   668  		// An error here means there was either an issue connecting or the API returned an error.
   669  		// If no error occurs, reset the unhealthy counter.
   670  		_, err := restClient.Get().AbsPath(in.path).Timeout(in.requestTimeout).DoRaw(ctx)
   671  		if err != nil {
   672  			if apierrors.IsUnauthorized(err) {
   673  				// Unauthorized means that the underlying kubeconfig is no longer authorizing properly, which
   674  				// usually is the result of automatic kubeconfig refreshes; we have to throw away the
   675  				// clusterAccessor and rely on the creation of a new one (with a refreshed kubeconfig).
   676  				return false, err
   677  			}
   678  			unhealthyCount++
   679  		} else {
   680  			unhealthyCount = 0
   681  		}
   682  
   683  		if unhealthyCount >= in.unhealthyThreshold {
   684  			// Cluster is now considered unhealthy.
   685  			return false, err
   686  		}
   687  
   688  		return false, nil
   689  	}
   690  
   691  	err := wait.PollUntilContextCancel(ctx, in.interval, true, runHealthCheckWithThreshold)
   692  	// A returned error implies either that the health check failed enough times for the cluster
   693  	// to be considered unhealthy, or that the cache was stopped and thus the cache context was canceled
   694  	// (we pass the cache context into wait.PollUntilContextCancel).
   695  	// NB. Log all errors that occurred, even if this error might just come from a cancellation of the
   696  	// cache context when the cache is stopped. Logging an error in this case is not a problem and makes debugging easier.
   697  	if err != nil {
   698  		t.log.Error(err, "Error health checking cluster", "Cluster", klog.KRef(in.cluster.Namespace, in.cluster.Name))
   699  	}
   700  	// Ensure in any case that the accessor is deleted (even if this is a no-op).
   701  	// NB. It is crucial to ensure the accessor was deleted, so it can be recreated later when the
   702  	// cluster is reachable again.
   703  	t.deleteAccessor(ctx, in.cluster)
   704  }
   705  
   706  // newClientWithTimeout returns a new client which sets the specified timeout on all Get and List calls.
   707  // If we don't set timeouts here, Get and List calls can get stuck if they lazily create a new informer
   708  // and the informer then doesn't sync because the workload cluster apiserver is not reachable.
   709  // An alternative would be to set timeouts in the contexts we pass into all Get and List calls.
   710  func newClientWithTimeout(client client.Client, timeout time.Duration) client.Client {
   711  	return clientWithTimeout{
   712  		Client:  client,
   713  		timeout: timeout,
   714  	}
   715  }
   716  
   717  type clientWithTimeout struct {
   718  	client.Client
   719  	timeout time.Duration
   720  }
   721  
   722  var _ client.Client = &clientWithTimeout{}
   723  
   724  func (c clientWithTimeout) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
   725  	ctx, cancel := context.WithTimeout(ctx, c.timeout)
   726  	defer cancel()
   727  	return c.Client.Get(ctx, key, obj, opts...)
   728  }
   729  
   730  func (c clientWithTimeout) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
   731  	ctx, cancel := context.WithTimeout(ctx, c.timeout)
   732  	defer cancel()
   733  	return c.Client.List(ctx, list, opts...)
   734  }