github.com/verrazzano/verrazzano@v1.7.0/application-operator/mcagent/mcagent.go (about)

     1  // Copyright (c) 2021, 2023, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package mcagent
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"os"
    10  
    11  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    12  	ctrl "sigs.k8s.io/controller-runtime"
    13  	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    14  	"sigs.k8s.io/controller-runtime/pkg/event"
    15  	"sigs.k8s.io/controller-runtime/pkg/predicate"
    16  
    17  	oamv1alpha2 "github.com/crossplane/oam-kubernetes-runtime/apis/core/v1alpha2"
    18  	clustersv1alpha1 "github.com/verrazzano/verrazzano/application-operator/apis/clusters/v1alpha1"
    19  	"github.com/verrazzano/verrazzano/application-operator/constants"
    20  	"github.com/verrazzano/verrazzano/application-operator/controllers/clusters"
    21  	"github.com/verrazzano/verrazzano/cluster-operator/apis/clusters/v1alpha1"
    22  	"github.com/verrazzano/verrazzano/pkg/mcconstants"
    23  	"go.uber.org/zap"
    24  	corev1 "k8s.io/api/core/v1"
    25  	"k8s.io/apimachinery/pkg/runtime"
    26  	"k8s.io/apimachinery/pkg/types"
    27  	"k8s.io/client-go/tools/clientcmd"
    28  	"sigs.k8s.io/controller-runtime/pkg/client"
    29  )
    30  
// Constants used by the multi-cluster agent reconciler
const (
	// registrationSecretVersion is the name of the ENV VAR for the registration secret version
	registrationSecretVersion = "REGISTRATION_SECRET_VERSION"
	cattleAgentHashData       = "cattle-agent-hash" // the data field name for the cattleAgentHash in the agent state configmap
	// requeueDelayMinSeconds and requeueDelayMaxSeconds are the two arguments passed to
	// clusters.NewRequeueWithRandomDelay when Reconcile asks to be requeued
	// (presumably the lower and upper bound, in seconds, of the randomized delay - confirm against that helper)
	requeueDelayMinSeconds    = 50
	requeueDelayMaxSeconds    = 70
)

// Namespaced name of the config map that stores mc agent state (the managed cluster name
// and the cattle agent hash are recorded there by updateMCAgentStateConfigMap)
var mcAgentStateConfigMapName = types.NamespacedName{Name: "mc-agent-state", Namespace: constants.VerrazzanoMultiClusterNamespace}

// getAdminClientFunc is the function doReconcile calls to obtain a client for the admin cluster.
// It defaults to createAdminClient; being a variable, it can be swapped out (presumably by unit tests).
var getAdminClientFunc = createAdminClient

// mcAppConfCRDName is the "<resource>.<group>" name built from the multi-cluster app config
// resource name and the clusters API group
var mcAppConfCRDName = fmt.Sprintf("%s.%s", clustersv1alpha1.MultiClusterAppConfigResource, clustersv1alpha1.SchemeGroupVersion.Group)
    45  
// Reconciler reconciles one iteration of the Managed cluster agent
type Reconciler struct {
	// Client is the embedded client used for reads and writes in the cluster the agent runs in;
	// doReconcile hands it to the Syncer as its LocalClient
	client.Client
	// Log is the logger used for all agent messages and is shared with the Syncer
	Log          *zap.SugaredLogger
	// Scheme is the runtime scheme supplied when the reconciler is constructed
	// (not referenced by the methods in this file)
	Scheme       *runtime.Scheme
	// AgentChannel carries status update messages; it is passed to the Syncer as its
	// StatusUpdateChannel, and drained by Reconcile whenever there is no usable agent secret
	AgentChannel chan clusters.StatusUpdateMessage
}
    53  
    54  // SetupWithManager registers our controller with the manager
    55  func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
    56  	return ctrl.NewControllerManagedBy(mgr).
    57  		For(&corev1.Secret{}).
    58  		WithEventFilter(r.createAgentPredicate()).
    59  		Complete(r)
    60  }
    61  
    62  func (r *Reconciler) createAgentPredicate() predicate.Predicate {
    63  	return predicate.Funcs{
    64  		CreateFunc: func(e event.CreateEvent) bool {
    65  			return r.isAgentSecret(e.Object)
    66  		},
    67  		DeleteFunc: func(e event.DeleteEvent) bool {
    68  			return r.isAgentSecret(e.Object)
    69  		},
    70  		UpdateFunc: func(e event.UpdateEvent) bool {
    71  			return r.isAgentSecret(e.ObjectNew)
    72  		},
    73  	}
    74  }
    75  
    76  func (r *Reconciler) isAgentSecret(object client.Object) bool {
    77  	return object.GetNamespace() == constants.VerrazzanoSystemNamespace && object.GetName() == constants.MCAgentSecret
    78  }
    79  
    80  func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
    81  	// Get the agent secret
    82  	agentSecret := corev1.Secret{}
    83  	if err := r.Get(ctx, req.NamespacedName, &agentSecret); err != nil {
    84  		// there is no admin cluster we are connected to, so nowhere to send any status updates
    85  		// received - discard them
    86  		discardStatusMessages(r.AgentChannel)
    87  		return clusters.IgnoreNotFoundWithLog(err, r.Log)
    88  	}
    89  	if agentSecret.DeletionTimestamp != nil {
    90  		r.Log.Debugf("the secret %v was deleted", req.NamespacedName)
    91  		// there is no admin cluster we are connected to, so nowhere to send any status updates
    92  		// received - discard them
    93  		discardStatusMessages(r.AgentChannel)
    94  		return clusters.NewRequeueWithRandomDelay(requeueDelayMinSeconds, requeueDelayMaxSeconds), nil
    95  	}
    96  	if err := validateAgentSecret(&agentSecret); err != nil {
    97  		// agent secret is invalid - log and also discard status messages on the channel since there
    98  		// is no valid admin cluster to send status updates to
    99  		discardStatusMessages(r.AgentChannel)
   100  		return clusters.NewRequeueWithRandomDelay(requeueDelayMinSeconds, requeueDelayMaxSeconds), fmt.Errorf("Agent secret validation failed: %v", err)
   101  	}
   102  	r.Log.Debug("Reconciling multi-cluster agent")
   103  
   104  	// Process one iteration of the agent thread
   105  	err := r.doReconcile(ctx, agentSecret)
   106  	if err != nil {
   107  		r.Log.Errorf("failed processing multi-cluster resources: %v", err)
   108  	}
   109  	return clusters.NewRequeueWithRandomDelay(requeueDelayMinSeconds, requeueDelayMaxSeconds), nil
   110  }
   111  
   112  // doReconcile - process one iteration of the agent thread
   113  func (r *Reconciler) doReconcile(ctx context.Context, agentSecret corev1.Secret) error {
   114  	managedClusterName := string(agentSecret.Data[constants.ClusterNameData])
   115  
   116  	// Create the discovery client for the managed cluster
   117  	localDiscoveryClient, err := getDiscoveryClientFunc()
   118  	if err != nil {
   119  		return fmt.Errorf("failed to get discovery client for this workload cluster: %v", err)
   120  	}
   121  
   122  	// Initialize the syncer object
   123  	s := &Syncer{
   124  		LocalClient:          r.Client,
   125  		LocalDiscoveryClient: localDiscoveryClient,
   126  		Log:                  r.Log,
   127  		Context:              ctx,
   128  		ProjectNamespaces:    []string{},
   129  		StatusUpdateChannel:  r.AgentChannel,
   130  		ManagedClusterName:   managedClusterName,
   131  	}
   132  
   133  	// Read current agent state from config map
   134  	mcAgentStateConfigMap := corev1.ConfigMap{Data: map[string]string{}}
   135  	if err := r.Get(ctx, mcAgentStateConfigMapName, &mcAgentStateConfigMap); client.IgnoreNotFound(err) != nil {
   136  		return fmt.Errorf("failed to get the agent state config map %v: %v", mcAgentStateConfigMapName, err)
   137  	}
   138  
   139  	// Create the client for accessing the admin cluster
   140  	adminClient, err := getAdminClientFunc(&agentSecret)
   141  	// If we are unauthorized to create a client on the admin cluster
   142  	// the cluster must have been deregistered
   143  	if apierrors.IsUnauthorized(err) {
   144  		return s.syncDeregistration()
   145  	}
   146  	if err != nil {
   147  		return fmt.Errorf("failed to get the client for cluster %q with error %v", managedClusterName, err)
   148  	}
   149  	s.AdminClient = adminClient
   150  
   151  	// Sync cattle-cluster-agent deployment which will set the new cattleAgentHash on the Syncer
   152  	cattleAgentHashValue, err := s.syncCattleClusterAgent(mcAgentStateConfigMap.Data[cattleAgentHashData], "")
   153  	if err != nil {
   154  		// we couldn't sync the cattle-cluster-agent - but we should keep going with the rest of the work
   155  		r.Log.Errorf("Failed to synchronize cattle-cluster-agent: %v", err)
   156  	}
   157  
   158  	// Update mc-agent-state config map with the managed cluster name or cattle agent hash if needed
   159  	if err := r.updateMCAgentStateConfigMap(ctx, managedClusterName, cattleAgentHashValue); err != nil {
   160  		return err
   161  	}
   162  
   163  	// Update all Prometheus monitors relabel configs in all namespaces with new cluster name if needed
   164  	err = s.updatePrometheusMonitorsClusterName()
   165  	if err != nil {
   166  		return fmt.Errorf("failed to update the cluster name to %s on Prometheus monitor resources with error %v", s.ManagedClusterName, err)
   167  	}
   168  
   169  	// Update the status of our VMC on the admin cluster to record the last time we connected
   170  	// and update other fields of in the VMC status
   171  	err = s.updateVMCStatus()
   172  	if err != nil {
   173  		// we couldn't update status of the VMC - but we should keep going with the rest of the work
   174  		r.Log.Errorf("Failed to update VMC status on admin cluster: %v", err)
   175  	}
   176  
   177  	// Sync multi-cluster objects
   178  	s.SyncMultiClusterResources()
   179  
   180  	// Delete the managed cluster resources if deregistration occurs
   181  	err = s.syncDeregistration()
   182  	if err != nil {
   183  		// we couldn't delete the managed cluster resources - but we should keep going with the rest of the work
   184  		r.Log.Errorf("Failed to sync the deregistration process: %v", err)
   185  	}
   186  
   187  	// Check whether the admin or local clusters' CA certs have rolled, and sync as necessary
   188  	_, err = s.syncClusterCAs()
   189  	if err != nil {
   190  		// we couldn't sync the cluster CAs - but we should keep going with the rest of the work
   191  		r.Log.Errorf("Failed to synchronize cluster CA certificates: %v", err)
   192  	}
   193  
   194  	return nil
   195  }
   196  
   197  // updateMCAgentStateConfigMap updates the managed cluster name and cattle agent hash in the
   198  // agent state config map if those have changed from what was there before
   199  func (r *Reconciler) updateMCAgentStateConfigMap(ctx context.Context, managedClusterName string, cattleAgentHashValue string) error {
   200  	// create the ConfigMap's namespace if it doesn't already exist
   201  	mcAgentStateNamespace := corev1.Namespace{}
   202  	mcAgentStateNamespace.Name = mcAgentStateConfigMapName.Namespace
   203  	_, err := controllerutil.CreateOrUpdate(ctx, r.Client, &mcAgentStateNamespace, func() error { return nil })
   204  	if err != nil {
   205  		return fmt.Errorf("failed to create namespace %s: %v", mcAgentStateConfigMapName.Namespace, err)
   206  	}
   207  
   208  	mcAgentStateConfigMap := corev1.ConfigMap{}
   209  	mcAgentStateConfigMap.Name = mcAgentStateConfigMapName.Name
   210  	mcAgentStateConfigMap.Namespace = mcAgentStateConfigMapName.Namespace
   211  	_, err = controllerutil.CreateOrUpdate(ctx, r.Client, &mcAgentStateConfigMap, func() error {
   212  		if mcAgentStateConfigMap.Data == nil {
   213  			mcAgentStateConfigMap.Data = map[string]string{}
   214  		}
   215  		existingClusterName := mcAgentStateConfigMap.Data[constants.ClusterNameData]
   216  		if existingClusterName != managedClusterName {
   217  			// Log the cluster name only if it changes
   218  			r.Log.Infof("Cluster name changed from '%q' to '%q', updating the agent state ConfigMap", existingClusterName, managedClusterName)
   219  			mcAgentStateConfigMap.Data[constants.ClusterNameData] = managedClusterName
   220  		}
   221  		existingCattleAgentHash := mcAgentStateConfigMap.Data[cattleAgentHashData]
   222  		if existingCattleAgentHash != cattleAgentHashValue {
   223  			// Log that the cattle agent hash has changed
   224  			r.Log.Infof("The %s has changed, updating the agent state ConfigMap", cattleAgentHashData)
   225  			mcAgentStateConfigMap.Data[cattleAgentHashData] = cattleAgentHashValue
   226  		}
   227  		return nil
   228  	})
   229  	if err != nil {
   230  		return fmt.Errorf("failed to update agent state in ConfigMap %v: %v", mcAgentStateConfigMapName, err)
   231  	}
   232  	return nil
   233  }
   234  
   235  // Validate the agent secret
   236  func validateAgentSecret(secret *corev1.Secret) error {
   237  	// The secret must contain a cluster name
   238  	_, ok := secret.Data[constants.ClusterNameData]
   239  	if !ok {
   240  		return fmt.Errorf("the secret named %s in namespace %s is missing the required field %s", secret.Name, secret.Namespace, constants.ClusterNameData)
   241  	}
   242  
   243  	// The secret must contain a kubeconfig
   244  	_, ok = secret.Data[mcconstants.KubeconfigKey]
   245  	if !ok {
   246  		return fmt.Errorf("the secret named %s in namespace %s is missing the required field %s", secret.Name, secret.Namespace, mcconstants.KubeconfigKey)
   247  	}
   248  
   249  	return nil
   250  }
   251  
   252  // Get the clientset for accessing the admin cluster
   253  func createAdminClient(secret *corev1.Secret) (client.Client, error) {
   254  	// Create a temp file that contains the kubeconfig
   255  	tmpFile, err := os.CreateTemp("", "kubeconfig")
   256  	if err != nil {
   257  		return nil, err
   258  	}
   259  
   260  	err = os.WriteFile(tmpFile.Name(), secret.Data[mcconstants.KubeconfigKey], 0600)
   261  	defer os.Remove(tmpFile.Name())
   262  	if err != nil {
   263  		return nil, err
   264  	}
   265  
   266  	config, err := clientcmd.BuildConfigFromFlags("", tmpFile.Name())
   267  	if err != nil {
   268  		return nil, err
   269  	}
   270  	scheme := runtime.NewScheme()
   271  	_ = clustersv1alpha1.AddToScheme(scheme)
   272  	_ = v1alpha1.AddToScheme(scheme)
   273  	_ = oamv1alpha2.SchemeBuilder.AddToScheme(scheme)
   274  	_ = corev1.SchemeBuilder.AddToScheme(scheme)
   275  
   276  	clientset, err := client.New(config, client.Options{Scheme: scheme})
   277  	if err != nil {
   278  		return nil, err
   279  	}
   280  
   281  	return clientset, nil
   282  }
   283  
   284  func getEnvValue(containers *[]corev1.Container, envName string) string {
   285  	for _, container := range *containers {
   286  		for _, env := range container.Env {
   287  			if env.Name == envName {
   288  				return env.Value
   289  			}
   290  		}
   291  	}
   292  	return ""
   293  }
   294  
   295  func updateEnvValue(envs []corev1.EnvVar, envName string, newValue string) []corev1.EnvVar {
   296  	for i, env := range envs {
   297  		if env.Name == envName {
   298  			envs[i].Value = newValue
   299  			return envs
   300  		}
   301  	}
   302  	return append(envs, corev1.EnvVar{Name: envName, Value: newValue})
   303  }
   304  
   305  // discardStatusMessages discards all messages in the statusUpdateChannel - this will
   306  // prevent the channel buffer from filling up in the case of a non-managed cluster
   307  func discardStatusMessages(statusUpdateChannel chan clusters.StatusUpdateMessage) {
   308  	length := len(statusUpdateChannel)
   309  	for i := 0; i < length; i++ {
   310  		<-statusUpdateChannel
   311  	}
   312  }