github.com/verrazzano/verrazzano@v1.7.0/cluster-operator/controllers/vmc/vmc_controller.go

     1  // Copyright (c) 2021, 2023, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package vmc
     5  
     6  import (
     7  	"context"
     8  	goerrors "errors"
     9  	"fmt"
    10  	"time"
    11  
    12  	"github.com/verrazzano/verrazzano/pkg/k8sutil"
    13  	"github.com/verrazzano/verrazzano/platform-operator/controllers/verrazzano/component/keycloak"
    14  	"github.com/verrazzano/verrazzano/platform-operator/controllers/verrazzano/component/spi"
    15  
    16  	"github.com/prometheus/client_golang/prometheus"
    17  	"github.com/prometheus/client_golang/prometheus/promauto"
    18  	clustersv1alpha1 "github.com/verrazzano/verrazzano/cluster-operator/apis/clusters/v1alpha1"
    19  	"github.com/verrazzano/verrazzano/cluster-operator/internal/capi"
    20  	vzconstants "github.com/verrazzano/verrazzano/pkg/constants"
    21  	vzctrl "github.com/verrazzano/verrazzano/pkg/controller"
    22  	"github.com/verrazzano/verrazzano/pkg/log/vzlog"
    23  	"github.com/verrazzano/verrazzano/pkg/rancherutil"
    24  	vzstring "github.com/verrazzano/verrazzano/pkg/string"
    25  	"github.com/verrazzano/verrazzano/platform-operator/apis/verrazzano/v1beta1"
    26  	"github.com/verrazzano/verrazzano/platform-operator/constants"
    27  	"go.uber.org/zap"
    28  	corev1 "k8s.io/api/core/v1"
    29  	rbacv1 "k8s.io/api/rbac/v1"
    30  	"k8s.io/apimachinery/pkg/api/errors"
    31  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    32  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    33  	"k8s.io/apimachinery/pkg/runtime"
    34  	"k8s.io/apimachinery/pkg/types"
    35  	ctrl "sigs.k8s.io/controller-runtime"
    36  	"sigs.k8s.io/controller-runtime/pkg/client"
    37  	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    38  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    39  )
    40  
    41  const finalizerName = "managedcluster.verrazzano.io"
    42  
    43  // VerrazzanoManagedClusterReconciler reconciles a VerrazzanoManagedCluster object.
    44  // The reconciler will create a ServiceAccount, RoleBinding, and a Secret which
    45  // contains the kubeconfig to be used by the Multi-Cluster Agent to access the admin cluster.
    46  type VerrazzanoManagedClusterReconciler struct {
    47  	client.Client
    48  	Scheme             *runtime.Scheme
    49  	RancherIngressHost string
    50  	log                vzlog.VerrazzanoLogger
    51  }
    52  
    53  // bindingParams contains the parameters used to mutate the RoleBinding
    54  type bindingParams struct {
    55  	vmc                *clustersv1alpha1.VerrazzanoManagedCluster
    56  	roleName           string
    57  	serviceAccountName string
    58  }
    59  
    60  var (
    61  	reconcileTimeMetric = promauto.NewGauge(prometheus.GaugeOpts{
    62  		Name: "vz_cluster_operator_reconcile_vmc_duration_seconds",
    63  		Help: "The duration of the reconcile process for cluster objects",
    64  	})
    65  	reconcileErrorCount = promauto.NewCounter(prometheus.CounterOpts{
    66  		Name: "vz_cluster_operator_reconcile_vmc_error_total",
    67  		Help: "The number of errors encountered in the reconcile process",
    68  	})
    69  	reconcileSuccessCount = promauto.NewCounter(prometheus.CounterOpts{
    70  		Name: "vz_cluster_operator_reconcile_vmc_success_total",
    71  		Help: "The number of times the reconcile process succeeded",
    72  	})
    73  )
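        // The promauto constructors above register these collectors with the Prometheus
        // default registry when this package is initialized, so they only need to be
        // exposed by an HTTP handler. A minimal sketch is shown below; how and where the
        // cluster operator actually serves its metrics endpoint is outside this file, so
        // the handler path and listen address are illustrative assumptions only.
        //
        //	import (
        //		"net/http"
        //
        //		"github.com/prometheus/client_golang/prometheus/promhttp"
        //	)
        //
        //	func serveMetrics() {
        //		http.Handle("/metrics", promhttp.Handler())
        //		_ = http.ListenAndServe(":9100", nil)
        //	}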
    74  
    75  func (r *VerrazzanoManagedClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
    76  	// Time the reconcile process and set the metric with the elapsed time
    77  	startTime := time.Now()
    78  	defer reconcileTimeMetric.Set(time.Since(startTime).Seconds())
    79  
    80  	if ctx == nil {
    81  		reconcileErrorCount.Inc()
    82  		return ctrl.Result{}, goerrors.New("context cannot be nil")
    83  	}
    84  	cr := &clustersv1alpha1.VerrazzanoManagedCluster{}
    85  	if err := r.Get(context.TODO(), req.NamespacedName, cr); err != nil {
    86  		// If the resource is not found, that means all of the finalizers have been removed,
    87  		// and the VerrazzanoManagedCluster resource has been deleted, so there is nothing left to do.
    88  		if errors.IsNotFound(err) {
    89  			reconcileSuccessCount.Inc()
    90  			return reconcile.Result{}, nil
    91  		}
    92  		reconcileErrorCount.Inc()
    93  		zap.S().Errorf("Failed to fetch VerrazzanoManagedCluster resource: %v", err)
    94  		return newRequeueWithDelay(), nil
    95  	}
    96  
    97  	// Get the resource logger needed to log messages using 'progress' and 'once' methods
    98  	log, err := vzlog.EnsureResourceLogger(&vzlog.ResourceConfig{
    99  		Name:           cr.Name,
   100  		Namespace:      cr.Namespace,
   101  		ID:             string(cr.UID),
   102  		Generation:     cr.Generation,
   103  		ControllerName: "multicluster",
   104  	})
   105  	if err != nil {
   106  		reconcileErrorCount.Inc()
   107  		zap.S().Errorf("Failed to create controller logger for VerrazzanoManagedCluster controller: %v", err)
   108  	}
   109  
   110  	r.log = log
   111  	log.Oncef("Reconciling VerrazzanoManagedCluster resource %v", req.NamespacedName)
   112  	res, err := r.doReconcile(ctx, log, cr)
   113  	if err != nil {
   114  		// Never return an error since it has already been logged and we don't want the
   115  		// controller runtime to log again (with stack trace).  Just re-queue if there is an error.
   116  		reconcileErrorCount.Inc()
   117  		return newRequeueWithDelay(), nil
   118  	}
   119  	if vzctrl.ShouldRequeue(res) {
   120  		reconcileSuccessCount.Inc()
   121  		return res, nil
   122  	}
   123  
   124  	// The resource has been reconciled.
   125  	log.Oncef("Successfully reconciled VerrazzanoManagedCluster resource %v", req.NamespacedName)
   126  
   127  	reconcileSuccessCount.Inc()
   128  	return ctrl.Result{}, nil
   129  }
   130  
   131  // doReconcile reconciles a VerrazzanoManagedCluster object
   132  func (r *VerrazzanoManagedClusterReconciler) doReconcile(ctx context.Context, log vzlog.VerrazzanoLogger, vmc *clustersv1alpha1.VerrazzanoManagedCluster) (ctrl.Result, error) {
   133  
   134  	if !vmc.ObjectMeta.DeletionTimestamp.IsZero() {
   135  		// Finalizer is present, so let's do the cluster deletion
   136  		if vzstring.SliceContainsString(vmc.ObjectMeta.Finalizers, finalizerName) {
   137  			if err := r.reconcileManagedClusterDelete(ctx, vmc); err != nil {
   138  				return reconcile.Result{}, err
   139  			}
   140  
   141  			// Remove the finalizer and update the VerrazzanoManagedCluster resource if the deletion has finished.
   142  			log.Infof("Removing finalizer %s", finalizerName)
   143  			vmc.ObjectMeta.Finalizers = vzstring.RemoveStringFromSlice(vmc.ObjectMeta.Finalizers, finalizerName)
   144  			err := r.Update(ctx, vmc)
   145  			if err != nil && !errors.IsConflict(err) {
   146  				return reconcile.Result{}, err
   147  			}
   148  		}
   149  		return reconcile.Result{}, nil
   150  	}
   151  
   152  	// Add our finalizer if not already added
   153  	if !vzstring.SliceContainsString(vmc.ObjectMeta.Finalizers, finalizerName) {
   154  		log.Infof("Adding finalizer %s", finalizerName)
   155  		vmc.ObjectMeta.Finalizers = append(vmc.ObjectMeta.Finalizers, finalizerName)
   156  		if err := r.Update(ctx, vmc); err != nil {
   157  			return ctrl.Result{}, err
   158  		}
   159  	}
   160  
   161  	// Sync the service account
   162  	log.Debugf("Syncing the ServiceAccount for VMC %s", vmc.Name)
   163  	err := r.syncServiceAccount(vmc)
   164  	if err != nil {
   165  		r.handleError(ctx, vmc, "Failed to sync the ServiceAccount", err, log)
   166  		return newRequeueWithDelay(), err
   167  	}
   168  
   169  	log.Debugf("Syncing the RoleBinding for VMC %s", vmc.Name)
   170  	_, err = r.syncManagedRoleBinding(vmc)
   171  	if err != nil {
   172  		r.handleError(ctx, vmc, "Failed to sync the RoleBinding", err, log)
   173  		return newRequeueWithDelay(), err
   174  	}
   175  
   176  	log.Debugf("Syncing the Agent secret for VMC %s", vmc.Name)
   177  	err = r.syncAgentSecret(vmc)
   178  	if err != nil {
   179  		r.handleError(ctx, vmc, "Failed to sync the agent secret", err, log)
   180  		return newRequeueWithDelay(), err
   181  	}
   182  
   183  	log.Debugf("Syncing the Registration secret for VMC %s", vmc.Name)
   184  	err = r.syncRegistrationSecret(vmc)
   185  	if err != nil {
   186  		r.handleError(ctx, vmc, "Failed to sync the registration secret", err, log)
   187  		return newRequeueWithDelay(), err
   188  	}
   189  
   190  	log.Debugf("Syncing the Manifest secret for VMC %s", vmc.Name)
   191  	vzVMCWaitingForClusterID, err := r.syncManifestSecret(ctx, vmc)
   192  	if err != nil {
   193  		r.handleError(ctx, vmc, "Failed to sync the Manifest secret", err, log)
   194  		return newRequeueWithDelay(), err
   195  	}
   196  	if vzVMCWaitingForClusterID {
   197  		// waiting for the cluster ID to be set in the status, so requeue and try again
   198  		return newRequeueWithDelay(), nil
   199  	}
   200  
   201  	// create/update a secret with the CA cert from the managed cluster (if any errors occur we just log and continue)
   202  	syncedCert, err := r.syncCACertSecret(vmc)
   203  	if err != nil {
   204  		msg := fmt.Sprintf("Unable to get CA cert from managed cluster %s with id %s: %v", vmc.Name, vmc.Status.RancherRegistration.ClusterID, err)
   205  		r.log.Infof(msg)
   206  		r.setStatusConditionManagedCARetrieved(vmc, corev1.ConditionFalse, msg)
   207  	} else {
   208  		if syncedCert {
   209  			r.setStatusConditionManagedCARetrieved(vmc, corev1.ConditionTrue, "Managed cluster CA cert retrieved successfully")
   210  		}
   211  	}
   212  
   213  	log.Debugf("Updating Rancher ClusterRoleBindingTemplate for VMC %s", vmc.Name)
   214  	err = r.updateRancherClusterRoleBindingTemplate(vmc)
   215  	if err != nil {
   216  		r.handleError(ctx, vmc, "Failed to update Rancher ClusterRoleBindingTemplate", err, log)
   217  		return newRequeueWithDelay(), err
   218  	}
   219  
   220  	log.Debugf("Pushing the Manifest objects for VMC %s", vmc.Name)
   221  	pushedManifest, err := r.pushManifestObjects(vmc)
   222  	if err != nil {
   223  		r.handleError(ctx, vmc, "Failed to push the Manifest objects", err, log)
   224  		r.setStatusConditionManifestPushed(vmc, corev1.ConditionFalse, fmt.Sprintf("Failed to push the manifest objects to the managed cluster: %v", err))
   225  		return newRequeueWithDelay(), err
   226  	}
   227  	if pushedManifest {
   228  		r.log.Info("Manifest objects have been successfully pushed to the managed cluster")
   229  		r.setStatusConditionManifestPushed(vmc, corev1.ConditionTrue, "Manifest objects pushed to the managed cluster")
   230  	}
   231  
   232  	log.Debugf("Registering ArgoCD for VMC %s", vmc.Name)
   233  	var argoCDRegistration *clustersv1alpha1.ArgoCDRegistration
   234  	argoCDEnabled, err := r.isArgoCDEnabled()
   235  	if err != nil {
   236  		return newRequeueWithDelay(), err
   237  	}
   238  	rancherEnabled, err := r.isRancherEnabled()
   239  	if err != nil {
   240  		return newRequeueWithDelay(), err
   241  	}
   242  	if argoCDEnabled && rancherEnabled {
   243  		argoCDRegistration, err = r.registerManagedClusterWithArgoCD(vmc)
   244  		if err != nil {
   245  			r.handleError(ctx, vmc, "Failed to register managed cluster with Argo CD", err, log)
   246  			return newRequeueWithDelay(), err
   247  		}
   248  		vmc.Status.ArgoCDRegistration = *argoCDRegistration
   249  	}
   250  	if !rancherEnabled && argoCDEnabled {
   251  		now := metav1.Now()
   252  		vmc.Status.ArgoCDRegistration = clustersv1alpha1.ArgoCDRegistration{
   253  			Status:    clustersv1alpha1.RegistrationPendingRancher,
   254  			Timestamp: &now,
   255  			Message:   "Skipping Argo CD cluster registration because Rancher is not installed"}
   256  	}
   257  
   258  	r.setStatusConditionReady(vmc, "Ready")
   259  	statusErr := r.updateStatus(ctx, vmc)
   260  
   261  	if statusErr != nil {
   262  		log.Errorf("Failed to update status to ready for VMC %s: %v", vmc.Name, statusErr)
   263  	}
   264  
   265  	if err := r.syncManagedMetrics(ctx, log, vmc); err != nil {
   266  		return newRequeueWithDelay(), err
   267  	}
   268  
   269  	log.Debugf("Creating or updating keycloak client for %s", vmc.Name)
   270  	err = r.createManagedClusterKeycloakClient(vmc)
   271  	if err != nil {
   272  		r.handleError(ctx, vmc, "Failed to create or update Keycloak client for managed cluster", err, log)
   273  		return newRequeueWithDelay(), err
   274  	}
   275  
   276  	return ctrl.Result{Requeue: true, RequeueAfter: constants.ReconcileLoopRequeueInterval}, nil
   277  }
   278  
   279  func (r *VerrazzanoManagedClusterReconciler) syncServiceAccount(vmc *clustersv1alpha1.VerrazzanoManagedCluster) error {
   280  	// Create or update the service account
   281  	_, serviceAccount, err := r.createOrUpdateServiceAccount(context.TODO(), vmc)
   282  	if err != nil {
   283  		return err
   284  	}
   285  
   286  	if len(serviceAccount.Secrets) == 0 {
   287  		_, err = r.createServiceAccountTokenSecret(context.TODO(), serviceAccount)
   288  		if err != nil {
   289  			return err
   290  		}
   291  	}
   292  
   293  	// Does the VerrazzanoManagedCluster object contain the service account name?
   294  	saName := generateManagedResourceName(vmc.Name)
   295  	if vmc.Spec.ServiceAccount != saName {
   296  		r.log.Oncef("Updating ServiceAccount from %s to %s", vmc.Spec.ServiceAccount, saName)
   297  		vmc.Spec.ServiceAccount = saName
   298  		err = r.Update(context.TODO(), vmc)
   299  		if err != nil {
   300  			return err
   301  		}
   302  	}
   303  
   304  	return nil
   305  }
   306  
   307  // Create or update the ServiceAccount for a VerrazzanoManagedCluster
   308  func (r *VerrazzanoManagedClusterReconciler) createOrUpdateServiceAccount(ctx context.Context, vmc *clustersv1alpha1.VerrazzanoManagedCluster) (controllerutil.OperationResult, *corev1.ServiceAccount, error) {
   309  	var serviceAccount corev1.ServiceAccount
   310  	serviceAccount.Namespace = vmc.Namespace
   311  	serviceAccount.Name = generateManagedResourceName(vmc.Name)
   312  
   313  	operationResult, err := controllerutil.CreateOrUpdate(ctx, r.Client, &serviceAccount, func() error {
   314  		r.mutateServiceAccount(vmc, &serviceAccount)
   315  		// This SetControllerReference call will trigger garbage collection i.e. the serviceAccount
   316  		// will automatically get deleted when the VerrazzanoManagedCluster is deleted
   317  		return controllerutil.SetControllerReference(vmc, &serviceAccount, r.Scheme)
   318  	})
   319  	return operationResult, &serviceAccount, err
   320  }
   321  
   322  func (r *VerrazzanoManagedClusterReconciler) mutateServiceAccount(vmc *clustersv1alpha1.VerrazzanoManagedCluster, serviceAccount *corev1.ServiceAccount) {
   323  	serviceAccount.Name = generateManagedResourceName(vmc.Name)
   324  }
   325  
   326  func (r *VerrazzanoManagedClusterReconciler) createServiceAccountTokenSecret(ctx context.Context, serviceAccount *corev1.ServiceAccount) (controllerutil.OperationResult, error) {
   327  	var secret corev1.Secret
   328  	secret.Name = serviceAccount.Name + "-token"
   329  	secret.Namespace = serviceAccount.Namespace
   330  	secret.Type = corev1.SecretTypeServiceAccountToken
   331  	secret.Annotations = map[string]string{
   332  		corev1.ServiceAccountNameKey: serviceAccount.Name,
   333  	}
   334  
   335  	return controllerutil.CreateOrUpdate(ctx, r.Client, &secret, func() error {
   336  		// This SetControllerReference call will trigger garbage collection i.e. the token secret
   337  		// will automatically get deleted when the service account is deleted
   338  		return controllerutil.SetControllerReference(serviceAccount, &secret, r.Scheme)
   339  	})
   340  }
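        // For reference, the token Secret created above is equivalent to the manifest
        // below. The names are illustrative, assuming a VMC named "managed1" in the
        // verrazzano-mc namespace; Kubernetes populates data.token once its token
        // controller processes the Secret.
        //
        //	apiVersion: v1
        //	kind: Secret
        //	metadata:
        //	  name: verrazzano-cluster-managed1-token
        //	  namespace: verrazzano-mc
        //	  annotations:
        //	    kubernetes.io/service-account.name: verrazzano-cluster-managed1
        //	type: kubernetes.io/service-account-token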
   341  
   342  // syncManagedRoleBinding syncs the RoleBinding that binds the service account used by the managed cluster
   343  // to the ClusterRole containing the required permissions
   344  func (r *VerrazzanoManagedClusterReconciler) syncManagedRoleBinding(vmc *clustersv1alpha1.VerrazzanoManagedCluster) (controllerutil.OperationResult, error) {
   345  	var roleBinding rbacv1.RoleBinding
   346  	roleBinding.Namespace = vmc.Namespace
   347  	roleBinding.Name = generateManagedResourceName(vmc.Name)
   348  
   349  	return controllerutil.CreateOrUpdate(context.TODO(), r.Client, &roleBinding, func() error {
   350  		mutateBinding(&roleBinding, bindingParams{
   351  			vmc:                vmc,
   352  			roleName:           constants.MCClusterRole,
   353  			serviceAccountName: vmc.Spec.ServiceAccount,
   354  		})
   355  		// This SetControllerReference call will trigger garbage collection i.e. the roleBinding
   356  		// will automatically get deleted when the VerrazzanoManagedCluster is deleted
   357  		return controllerutil.SetControllerReference(vmc, &roleBinding, r.Scheme)
   358  	})
   359  }
   360  
   361  // syncMultiClusterCASecret gets the CA secret in the VMC from the managed cluster and populates the CA secret for metrics scraping
   362  func (r *VerrazzanoManagedClusterReconciler) syncMultiClusterCASecret(ctx context.Context, log vzlog.VerrazzanoLogger, vmc *clustersv1alpha1.VerrazzanoManagedCluster) (corev1.Secret, error) {
   363  	var secret corev1.Secret
   364  
   365  	// read the configuration secret specified if it exists
   366  	if len(vmc.Spec.CASecret) > 0 {
   367  		secretNsn := types.NamespacedName{
   368  			Namespace: vmc.Namespace,
   369  			Name:      vmc.Spec.CASecret,
   370  		}
   371  
   372  		// validate secret if it exists
   373  		if err := r.Get(context.TODO(), secretNsn, &secret); err != nil {
   374  			return secret, log.ErrorfNewErr("failed to fetch the managed cluster CA secret %s/%s, %v", vmc.Namespace, vmc.Spec.CASecret, err)
   375  		}
   376  	}
   377  	if err := r.mutateManagedClusterCACertsSecret(ctx, vmc, &secret); err != nil {
   378  		return secret, log.ErrorfNewErr("Failed to sync the managed cluster CA certs for VMC %s: %v", vmc.Name, err)
   379  	}
   380  	return secret, nil
   381  }
   382  
   383  // mutateManagedClusterCACertsSecret adds and removes managed cluster CA certs to/from the managed cluster CA certs secret
   384  func (r *VerrazzanoManagedClusterReconciler) mutateManagedClusterCACertsSecret(ctx context.Context, vmc *clustersv1alpha1.VerrazzanoManagedCluster, cacrtSecret *corev1.Secret) error {
   385  	ns := &corev1.Namespace{}
   386  	err := r.Client.Get(ctx, types.NamespacedName{Name: constants.VerrazzanoMonitoringNamespace}, ns)
   387  	if errors.IsNotFound(err) {
   388  		r.log.Infof("namespace %s does not exist", constants.VerrazzanoMonitoringNamespace)
   389  		return nil
   390  	}
   391  	secret := &corev1.Secret{
   392  		ObjectMeta: metav1.ObjectMeta{
   393  			Name:      constants.PromManagedClusterCACertsSecretName,
   394  			Namespace: constants.VerrazzanoMonitoringNamespace,
   395  		},
   396  	}
   397  
   398  	if _, err := controllerutil.CreateOrUpdate(ctx, r.Client, secret, func() error {
   399  		if secret.Data == nil {
   400  			secret.Data = make(map[string][]byte)
   401  		}
   402  		if cacrtSecret != nil && cacrtSecret.Data != nil && len(cacrtSecret.Data["cacrt"]) > 0 {
   403  			secret.Data[getCAKey(vmc)] = cacrtSecret.Data["cacrt"]
   404  		} else {
   405  			delete(secret.Data, getCAKey(vmc))
   406  		}
   407  		return nil
   408  	}); err != nil {
   409  		return err
   410  	}
   411  
   412  	return nil
   413  }
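        // The secret mutated above aggregates one CA entry per managed cluster. Its shape
        // is sketched below with symbolic names, since the secret name constant and the
        // per-cluster key helper (getCAKey) are defined elsewhere; the value is taken from
        // the "cacrt" key of the VMC's CA secret.
        //
        //	apiVersion: v1
        //	kind: Secret
        //	metadata:
        //	  name: <constants.PromManagedClusterCACertsSecretName>
        //	  namespace: <constants.VerrazzanoMonitoringNamespace>
        //	data:
        //	  <getCAKey(vmc)>: <base64-encoded CA certificate>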
   414  
   415  // syncManagedMetrics syncs the metrics federation for managed clusters
   416  // There are currently two ways of federating metrics from managed clusters:
   417  // 1. Creating a Scrape config for the managed cluster on the admin cluster Prometheus
   418  // 2. Creating a Store in Thanos so that managed cluster metrics can be accessed by the admin cluster Query
   419  // These scenarios are mutually exclusive and the Thanos Query method takes precedence.
   420  // Both of the following conditions must be met to enable the Thanos Query method:
   421  //  1. Thanos is enabled on the managed cluster
   422  //     a. This manifests as the Thanos Query store field in the VMC status being populated
   423  //  2. Thanos is enabled on the admin cluster
   424  //
   425  // If these two conditions are not met, the Prometheus federation will be enabled
   426  func (r *VerrazzanoManagedClusterReconciler) syncManagedMetrics(ctx context.Context, log vzlog.VerrazzanoLogger, vmc *clustersv1alpha1.VerrazzanoManagedCluster) error {
   427  	// We need to sync the multicluster CA secret for Prometheus and Thanos
   428  	caSecret, err := r.syncMultiClusterCASecret(ctx, log, vmc)
   429  	if err != nil {
   430  		r.handleError(ctx, vmc, "Failed to sync the multicluster CA secret", err, log)
   431  	}
   432  
   433  	thanosEnabled, err := r.isThanosEnabled()
   434  	if err != nil {
   435  		r.handleError(ctx, vmc, "Failed to verify if Thanos is enabled", err, log)
   436  		return err
   437  	}
   438  	// If the Thanos multicluster requirements are met, set up the Thanos Query store
   439  	if vmc.Status.ThanosQueryStore != "" && thanosEnabled {
   440  		err = r.syncThanosQuery(ctx, vmc)
   441  		if err != nil {
   442  			r.handleError(ctx, vmc, "Failed to update the Thanos Query endpoint for the managed cluster", err, log)
   443  			return err
   444  		}
   445  
   446  		// If we successfully sync the managed cluster Thanos Query store, we should remove the federated Prometheus to avoid duplication
   447  		r.log.Oncef("Thanos Query synced for VMC %s. Removing the Prometheus scraper", vmc.Name)
   448  		err = r.deleteClusterPrometheusConfiguration(ctx, vmc)
   449  		if err != nil {
   450  			r.handleError(ctx, vmc, "Failed to remove the Prometheus scrape config", err, log)
   451  			return err
   452  		}
   453  		return nil
   454  	}
   455  
   456  	// If Thanos multicluster is disabled, attempt to delete leftover resources
   457  	err = r.syncThanosQueryEndpointDelete(ctx, vmc)
   458  	if err != nil {
   459  		r.handleError(ctx, vmc, "Failed to delete the Thanos Query endpoint for the managed cluster", err, log)
   460  		return err
   461  	}
   462  
   463  	// If the Prometheus host is not populated, skip federation and do nothing
   464  	if vmc.Status.PrometheusHost == "" {
   465  		// If reached, the managed cluster metrics are not populated, so we should remove the CA cert from the secret
   466  		err := r.mutateManagedClusterCACertsSecret(ctx, vmc, nil)
   467  		if err != nil {
   468  			r.handleError(ctx, vmc, "Failed to delete the managed cluster CA cert from the secret", err, log)
   469  			return err
   470  		}
   471  		log.Oncef("Managed cluster Prometheus Host not found in VMC Status for VMC %s. Waiting for VMC to be registered...", vmc.Name)
   472  		return nil
   473  	}
   474  
   475  	// Sync the Prometheus Scraper if Thanos multicluster is disabled and the host is populated
   476  	log.Debugf("Syncing the prometheus scraper for VMC %s", vmc.Name)
   477  	err = r.syncPrometheusScraper(ctx, vmc, &caSecret)
   478  	if err != nil {
   479  		r.handleError(ctx, vmc, "Failed to setup the prometheus scraper for managed cluster", err, log)
   480  		return err
   481  	}
   482  
   483  	return nil
   484  }
   485  
   486  // mutateBinding mutates the RoleBinding to ensure it has the correct params
   487  func mutateBinding(binding *rbacv1.RoleBinding, p bindingParams) {
   488  	binding.Name = generateManagedResourceName(p.vmc.Name)
   489  	binding.Namespace = p.vmc.Namespace
   490  	binding.Labels = p.vmc.Labels
   491  
   492  	binding.RoleRef = rbacv1.RoleRef{
   493  		APIGroup: "rbac.authorization.k8s.io",
   494  		Kind:     "ClusterRole",
   495  		Name:     p.roleName,
   496  	}
   497  	binding.Subjects = []rbacv1.Subject{
   498  		{
   499  			Kind:      "ServiceAccount",
   500  			Name:      p.serviceAccountName,
   501  			Namespace: constants.VerrazzanoMultiClusterNamespace,
   502  		},
   503  	}
   504  }
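        // For a VMC named "managed1" in the verrazzano-mc namespace (illustrative values),
        // the mutation above yields a RoleBinding shaped roughly as follows; the ClusterRole
        // name is whatever constants.MCClusterRole resolves to.
        //
        //	kind: RoleBinding
        //	metadata:
        //	  name: verrazzano-cluster-managed1
        //	  namespace: verrazzano-mc
        //	roleRef:
        //	  apiGroup: rbac.authorization.k8s.io
        //	  kind: ClusterRole
        //	  name: <constants.MCClusterRole>
        //	subjects:
        //	  - kind: ServiceAccount
        //	    name: verrazzano-cluster-managed1
        //	    namespace: verrazzano-mc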
   505  
   506  // Generate the common name used by all resources specific to a given managed cluster
   507  func generateManagedResourceName(clusterName string) string {
   508  	return fmt.Sprintf("verrazzano-cluster-%s", clusterName)
   509  }
   510  
   511  // SetupWithManager creates a new controller and adds it to the manager
   512  func (r *VerrazzanoManagedClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
   513  	return ctrl.NewControllerManagedBy(mgr).
   514  		For(&clustersv1alpha1.VerrazzanoManagedCluster{}).
   515  		Complete(r)
   516  }
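        // A minimal sketch of wiring this reconciler into a controller-runtime manager.
        // The operator's real main function configures additional fields and controllers;
        // the scheme variable and rancherHost value below are assumptions for illustration.
        //
        //	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{Scheme: scheme})
        //	if err != nil {
        //		// handle error
        //	}
        //	r := &VerrazzanoManagedClusterReconciler{
        //		Client:             mgr.GetClient(),
        //		Scheme:             mgr.GetScheme(),
        //		RancherIngressHost: rancherHost,
        //	}
        //	if err := r.SetupWithManager(mgr); err != nil {
        //		// handle error
        //	}
        //	_ = mgr.Start(ctrl.SetupSignalHandler())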
   517  
   518  // reconcileManagedClusterDelete performs all necessary cleanup during cluster deletion
   519  func (r *VerrazzanoManagedClusterReconciler) reconcileManagedClusterDelete(ctx context.Context, vmc *clustersv1alpha1.VerrazzanoManagedCluster) error {
   520  	if err := r.deleteClusterPrometheusConfiguration(ctx, vmc); err != nil {
   521  		return err
   522  	}
   523  	if err := r.unregisterClusterFromArgoCD(ctx, vmc); err != nil {
   524  		return err
   525  	}
   526  	if err := r.syncThanosQueryEndpointDelete(ctx, vmc); err != nil {
   527  		return err
   528  	}
   529  	if err := r.mutateManagedClusterCACertsSecret(ctx, vmc, nil); err != nil {
   530  		return err
   531  	}
   532  	return r.deleteClusterFromRancher(ctx, vmc)
   533  }
   534  
   535  // deleteClusterFromRancher calls the Rancher API to delete the cluster associated with the VMC if the VMC has a cluster id set in the status.
   536  func (r *VerrazzanoManagedClusterReconciler) deleteClusterFromRancher(ctx context.Context, vmc *clustersv1alpha1.VerrazzanoManagedCluster) error {
   537  	clusterID := vmc.Status.RancherRegistration.ClusterID
   538  	if clusterID == "" {
   539  		r.log.Debugf("VMC %s/%s has no Rancher cluster id, skipping delete", vmc.Namespace, vmc.Name)
   540  		return nil
   541  	}
   542  
   543  	rc, err := rancherutil.NewAdminRancherConfig(r.Client, r.RancherIngressHost, r.log)
   544  	if err != nil {
   545  		msg := "Failed to create Rancher API client"
   546  		r.updateRancherStatus(ctx, vmc, clustersv1alpha1.DeleteFailed, clusterID, msg)
   547  		r.log.Errorf("Unable to connect to Rancher API on admin cluster while attempting delete operation: %v", err)
   548  		return err
   549  	}
   550  	if _, err = DeleteClusterFromRancher(rc, clusterID, r.log); err != nil {
   551  		msg := "Failed deleting cluster"
   552  		r.updateRancherStatus(ctx, vmc, clustersv1alpha1.DeleteFailed, clusterID, msg)
   553  		r.log.Errorf("Unable to delete Rancher cluster %s/%s: %v", vmc.Namespace, vmc.Name, err)
   554  		return err
   555  	}
   556  
   557  	return nil
   558  }
   559  
   560  func (r *VerrazzanoManagedClusterReconciler) setStatusConditionManagedCARetrieved(vmc *clustersv1alpha1.VerrazzanoManagedCluster, value corev1.ConditionStatus, msg string) {
   561  	now := metav1.Now()
   562  	r.setStatusCondition(vmc, clustersv1alpha1.Condition{Status: value, Type: clustersv1alpha1.ConditionManagedCARetrieved, Message: msg, LastTransitionTime: &now}, false)
   563  }
   564  
   565  func (r *VerrazzanoManagedClusterReconciler) setStatusConditionManifestPushed(vmc *clustersv1alpha1.VerrazzanoManagedCluster, value corev1.ConditionStatus, msg string) {
   566  	now := metav1.Now()
   567  	r.setStatusCondition(vmc, clustersv1alpha1.Condition{Status: value, Type: clustersv1alpha1.ConditionManifestPushed, Message: msg, LastTransitionTime: &now}, true)
   568  }
   569  
   570  // setStatusConditionNotReady sets the status condition Ready = false on the VMC in memory - does NOT update the status in the cluster
   571  func (r *VerrazzanoManagedClusterReconciler) setStatusConditionNotReady(ctx context.Context, vmc *clustersv1alpha1.VerrazzanoManagedCluster, msg string) {
   572  	now := metav1.Now()
   573  	r.setStatusCondition(vmc, clustersv1alpha1.Condition{Status: corev1.ConditionFalse, Type: clustersv1alpha1.ConditionReady, Message: msg, LastTransitionTime: &now}, false)
   574  }
   575  
   576  // setStatusConditionReady sets the status condition Ready = true on the VMC in memory - does NOT update the status in the cluster
   577  func (r *VerrazzanoManagedClusterReconciler) setStatusConditionReady(vmc *clustersv1alpha1.VerrazzanoManagedCluster, msg string) {
   578  	now := metav1.Now()
   579  	r.setStatusCondition(vmc, clustersv1alpha1.Condition{Status: corev1.ConditionTrue, Type: clustersv1alpha1.ConditionReady, Message: msg, LastTransitionTime: &now}, false)
   580  }
   581  
   582  func (r *VerrazzanoManagedClusterReconciler) handleError(ctx context.Context, vmc *clustersv1alpha1.VerrazzanoManagedCluster, msg string, err error, log vzlog.VerrazzanoLogger) {
   583  	fullMsg := fmt.Sprintf("%s: %v", msg, err)
   584  	log.ErrorfThrottled(fullMsg)
   585  	r.setStatusConditionNotReady(ctx, vmc, fullMsg)
   586  	statusErr := r.updateStatus(ctx, vmc)
   587  	if statusErr != nil {
   588  		log.ErrorfThrottled("Failed to update status for VMC %s: %v", vmc.Name, statusErr)
   589  	}
   590  }
   591  
   592  // setStatusCondition updates the VMC status conditions in memory, replacing any existing condition of the same type.
   593  // When the onTime flag is set, a condition is also replaced if only its LastTransitionTime has changed.
   594  func (r *VerrazzanoManagedClusterReconciler) setStatusCondition(vmc *clustersv1alpha1.VerrazzanoManagedCluster, condition clustersv1alpha1.Condition, onTime bool) {
   595  	r.log.Debugf("Entered setStatusCondition for VMC %s for condition %s = %s, existing conditions = %v",
   596  		vmc.Name, condition.Type, condition.Status, vmc.Status.Conditions)
   597  	var matchingCondition *clustersv1alpha1.Condition
   598  	var conditionExists bool
   599  	for i, existingCondition := range vmc.Status.Conditions {
   600  		if condition.Type == existingCondition.Type &&
   601  			condition.Status == existingCondition.Status &&
   602  			condition.Message == existingCondition.Message &&
   603  			(!onTime || condition.LastTransitionTime == existingCondition.LastTransitionTime) {
   604  			// the exact same condition already exists, don't update
   605  			conditionExists = true
   606  			break
   607  		}
   608  		if condition.Type == existingCondition.Type {
   609  			// use the index here since "existingCondition" is a copy and won't point to the object in the slice
   610  			matchingCondition = &vmc.Status.Conditions[i]
   611  			break
   612  		}
   613  	}
   614  	if !conditionExists {
   615  
   616  		if matchingCondition == nil {
   617  			vmc.Status.Conditions = append(vmc.Status.Conditions, condition)
   618  		} else {
   619  			matchingCondition.Message = condition.Message
   620  			matchingCondition.Status = condition.Status
   621  			matchingCondition.LastTransitionTime = condition.LastTransitionTime
   622  		}
   623  	}
   624  }
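        // Illustrative in-memory behavior of setStatusCondition: two calls with the same
        // condition Type replace the earlier entry rather than appending a duplicate.
        //
        //	r.setStatusCondition(vmc, clustersv1alpha1.Condition{
        //		Type: clustersv1alpha1.ConditionReady, Status: corev1.ConditionFalse, Message: "waiting"}, false)
        //	// A later call with the same Type updates that entry in place.
        //	r.setStatusCondition(vmc, clustersv1alpha1.Condition{
        //		Type: clustersv1alpha1.ConditionReady, Status: corev1.ConditionTrue, Message: "Ready"}, false)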
   625  
   626  // updateStatus updates the status of the VMC in the cluster, with all provided conditions, after setting the vmc.Status.State field for the cluster
   627  func (r *VerrazzanoManagedClusterReconciler) updateStatus(ctx context.Context, vmc *clustersv1alpha1.VerrazzanoManagedCluster) error {
   628  	if err := r.updateState(vmc); err != nil {
   629  		return err
   630  	}
   631  
   632  	// Fetch the existing VMC to avoid conflicts in the status update
   633  	existingVMC := &clustersv1alpha1.VerrazzanoManagedCluster{}
   634  	err := r.Get(context.TODO(), types.NamespacedName{Namespace: vmc.Namespace, Name: vmc.Name}, existingVMC)
   635  	if err != nil {
   636  		return err
   637  	}
   638  
   639  	// Replace the existing status conditions and state with the conditions generated from this reconcile
   640  	for _, genCondition := range vmc.Status.Conditions {
   641  		r.setStatusCondition(existingVMC, genCondition, genCondition.Type == clustersv1alpha1.ConditionManifestPushed)
   642  	}
   643  	existingVMC.Status.State = vmc.Status.State
   644  	existingVMC.Status.ArgoCDRegistration = vmc.Status.ArgoCDRegistration
   645  
   646  	r.log.Debugf("Updating Status of VMC %s: %v", vmc.Name, vmc.Status.Conditions)
   647  	return r.Status().Update(ctx, existingVMC)
   648  }
   649  
   650  // updateState sets the vmc.Status.State for the given VMC.
   651  // The state field functions differently according to whether this VMC references an underlying ClusterAPI cluster.
   652  func (r *VerrazzanoManagedClusterReconciler) updateState(vmc *clustersv1alpha1.VerrazzanoManagedCluster) error {
   653  	// If there is no underlying CAPI cluster, set the state field based on the lastAgentConnectTime
   654  	if vmc.Status.ClusterRef == nil {
   655  		r.updateStateFromLastAgentConnectTime(vmc)
   656  		return nil
   657  	}
   658  
   659  	// If there is an underlying CAPI cluster, set the state field according to the phase of the CAPI cluster.
   660  	capiClusterPhase, err := r.getCAPIClusterPhase(vmc.Status.ClusterRef)
   661  	if err != nil {
   662  		return err
   663  	}
   664  	if capiClusterPhase != "" {
   665  		vmc.Status.State = capiClusterPhase
   666  	}
   667  	return nil
   668  }
   669  
   670  // updateStateFromLastAgentConnectTime sets the vmc.Status.State according to the lastAgentConnectTime,
   671  // setting possible values of Active, Inactive, or Pending.
   672  func (r *VerrazzanoManagedClusterReconciler) updateStateFromLastAgentConnectTime(vmc *clustersv1alpha1.VerrazzanoManagedCluster) {
   673  	if vmc.Status.LastAgentConnectTime != nil {
   674  		currentTime := metav1.Now()
   675  		// Add the maximum allowed polling window to the current time, then compare the difference from
   676  		// lastAgentConnectTime against the allowed maximum to decide whether to change the state of the VMC resource.
   677  		maxPollingTime := currentTime.Add(vzconstants.VMCAgentPollingTimeInterval * vzconstants.MaxTimesVMCAgentPollingTime)
   678  		timeDiff := maxPollingTime.Sub(vmc.Status.LastAgentConnectTime.Time)
   679  		if int(timeDiff.Minutes()) > vzconstants.MaxTimesVMCAgentPollingTime {
   680  			vmc.Status.State = clustersv1alpha1.StateInactive
   681  		} else if vmc.Status.State == "" {
   682  			vmc.Status.State = clustersv1alpha1.StatePending
   683  		} else {
   684  			vmc.Status.State = clustersv1alpha1.StateActive
   685  		}
   686  	}
   687  }
   688  
   689  // getCAPIClusterPhase returns the phase reported by the CAPI Cluster CR which is referenced by clusterRef.
   690  func (r *VerrazzanoManagedClusterReconciler) getCAPIClusterPhase(clusterRef *clustersv1alpha1.ClusterReference) (clustersv1alpha1.StateType, error) {
   691  	// Get the CAPI Cluster CR
   692  	cluster := &unstructured.Unstructured{}
   693  	cluster.SetGroupVersionKind(capi.GVKCAPICluster)
   694  	clusterNamespacedName := types.NamespacedName{
   695  		Name:      clusterRef.Name,
   696  		Namespace: clusterRef.Namespace,
   697  	}
   698  	if err := r.Get(context.TODO(), clusterNamespacedName, cluster); err != nil {
   699  		if errors.IsNotFound(err) {
   700  			return "", nil
   701  		}
   702  		return "", err
   703  	}
   704  
   705  	// Get the state
   706  	phase, found, err := unstructured.NestedString(cluster.Object, "status", "phase")
   707  	if !found {
   708  		r.log.Progressf("could not find status.phase field inside cluster %s: %v", clusterNamespacedName, err)
   709  		return "", nil
   710  	}
   711  	if err != nil {
   712  		r.log.Progressf("error while looking for status.phase field for cluster %s: %v", clusterNamespacedName, err)
   713  		return "", nil
   714  	}
   715  
   716  	// Validate that the CAPI Phase is a proper StateType for the VMC
   717  	switch state := clustersv1alpha1.StateType(phase); state {
   718  	case clustersv1alpha1.StatePending,
   719  		clustersv1alpha1.StateProvisioning,
   720  		clustersv1alpha1.StateProvisioned,
   721  		clustersv1alpha1.StateDeleting,
   722  		clustersv1alpha1.StateUnknown,
   723  		clustersv1alpha1.StateFailed:
   724  		return state, nil
   725  	default:
   726  		r.log.Progressf("retrieved an invalid ClusterAPI Cluster phase of %s", state)
   727  		return clustersv1alpha1.StateUnknown, nil
   728  	}
   729  }
   730  
   731  // getVerrazzanoResource gets the installed Verrazzano resource in the cluster (of which only one is expected)
   732  func (r *VerrazzanoManagedClusterReconciler) getVerrazzanoResource() (*v1beta1.Verrazzano, error) {
   733  	// Get the Verrazzano resource
   734  	verrazzano := v1beta1.VerrazzanoList{}
   735  	err := r.Client.List(context.TODO(), &verrazzano, &client.ListOptions{})
   736  	if err != nil || len(verrazzano.Items) == 0 {
   737  		return nil, r.log.ErrorfNewErr("Verrazzano must be installed: %v", err)
   738  
   739  	}
   740  	return &verrazzano.Items[0], nil
   741  }
   742  
   743  // createClient is defined as a variable so that it can be replaced during unit testing
   744  var createClient = func(r *VerrazzanoManagedClusterReconciler, vmc *clustersv1alpha1.VerrazzanoManagedCluster) error {
   745  	const prometheusHostPrefix = "prometheus.vmi.system"
   746  	promHost := vmc.Status.PrometheusHost
   747  	// Skip Keycloak client generation if Prometheus isn't present in VMC status
   748  	// MCAgent on the managed cluster will set this if/when it is ready
   749  	if len(promHost) == 0 {
   750  		r.log.Debug("Skipping Prometheus Keycloak client creation: VMC Prometheus not found")
   751  		return nil
   752  	}
   753  
   754  	// login to keycloak
   755  	cfg, cli, err := k8sutil.ClientConfig()
   756  	if err != nil {
   757  		return err
   758  	}
   759  
   760  	// create a context that can be used by the Keycloak methods
   761  	ctx, err := spi.NewMinimalContext(r.Client, r.log)
   762  	if err != nil {
   763  		return err
   764  	}
   765  
   766  	err = keycloak.LoginKeycloak(ctx, cfg, cli)
   767  	if err != nil {
   768  		return err
   769  	}
   770  
   771  	dnsSubdomain := promHost[len(prometheusHostPrefix)+1:]
   772  	clientID := fmt.Sprintf("verrazzano-%s", vmc.Name)
   773  	err = keycloak.CreateOrUpdateClient(ctx, cfg, cli, clientID, keycloak.ManagedClusterClientTmpl, keycloak.ManagedClusterClientUrisTemplate, false, &dnsSubdomain)
   774  	if err != nil {
   775  		return err
   776  	}
   777  
   778  	return nil
   779  }
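        // Worked example of the derivation above (the host value is made up): for a
        // promHost of "prometheus.vmi.system.default.172.18.0.232.nip.io" the computed
        // dnsSubdomain is "default.172.18.0.232.nip.io", and for a VMC named "managed1"
        // the resulting Keycloak clientID is "verrazzano-managed1".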
   780  
   781  // createManagedClusterKeycloakClient creates a Keycloak client for the managed cluster
   782  func (r *VerrazzanoManagedClusterReconciler) createManagedClusterKeycloakClient(vmc *clustersv1alpha1.VerrazzanoManagedCluster) error {
   783  	return createClient(r, vmc)
   784  }
   785  
   786  // Create a new Result that will cause a reconcile requeue after a short delay
   787  func newRequeueWithDelay() ctrl.Result {
   788  	return vzctrl.NewRequeueWithDelay(2, 3, time.Second)
   789  }
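        // Usage note: the helper in pkg/controller takes (min, max, units), so the call
        // above requests a short requeue delay derived from 2 and 3 seconds. A caller
        // wanting a longer back-off could use, for example:
        //
        //	vzctrl.NewRequeueWithDelay(30, 60, time.Second)
        //
        // The exact delay selection is implemented by the helper, not in this file; the
        // alternative values shown are illustrative only.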