open-cluster-management.io/governance-policy-propagator@v0.13.0/controllers/propagator/propagation.go (about)

     1  // Copyright (c) 2021 Red Hat, Inc.
     2  // Copyright Contributors to the Open Cluster Management project
     3  
     4  package propagator
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"strconv"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	templates "github.com/stolostron/go-template-utils/v4/pkg/templates"
    16  	k8sdepwatches "github.com/stolostron/kubernetes-dependency-watches/client"
    17  	k8serrors "k8s.io/apimachinery/pkg/api/errors"
    18  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    19  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    20  	"k8s.io/apimachinery/pkg/runtime"
    21  	"k8s.io/apimachinery/pkg/runtime/schema"
    22  	"k8s.io/apimachinery/pkg/util/json"
    23  	"k8s.io/client-go/tools/record"
    24  	"sigs.k8s.io/controller-runtime/pkg/client"
    25  	"sigs.k8s.io/controller-runtime/pkg/event"
    26  
    27  	policiesv1 "open-cluster-management.io/governance-policy-propagator/api/v1"
    28  	"open-cluster-management.io/governance-policy-propagator/controllers/common"
    29  )
    30  
const (
	// TemplateStartDelim is the opening delimiter used to detect hub templates in a policy
	// template's raw object definition.
	TemplateStartDelim      = "{{hub"
	// TemplateStopDelim is the closing delimiter that pairs with TemplateStartDelim. In this file
	// it is only passed to the encryption-usage check.
	TemplateStopDelim       = "hub}}"
	// TriggerUpdateAnnotation is the annotation key that processTemplates removes from replicated
	// policies before resolving templates.
	TriggerUpdateAnnotation = "policy.open-cluster-management.io/trigger-update"
)
    36  
// ErrRetryable is a sentinel that processTemplates wraps (with fmt.Errorf and two %w verbs)
// around failures it considers transient, so callers can detect them with errors.Is. Its message
// is intentionally empty: since it is formatted directly in front of the underlying error, the
// combined error text stays identical to the underlying error's text.
// NOTE(review): how callers react to this sentinel (presumably a requeue) is not visible here.
var ErrRetryable = errors.New("")
    38  
// Propagator groups the shared dependencies used while propagating policies.
// NOTE(review): the reconciler methods in this file access Client, Recorder and
// ReplicatedPolicyUpdates through their receivers, so the reconcilers presumably embed this
// struct — confirm against their definitions.
type Propagator struct {
	// Embedded Kubernetes client.
	client.Client
	// Scheme is the runtime scheme; it is not referenced in this part of the file.
	Scheme                  *runtime.Scheme
	// Recorder emits Kubernetes events (used here for "PolicyPropagation" events on root policies).
	Recorder                record.EventRecorder
	// RootPolicyLocks is a concurrent map; presumably keyed per root policy — TODO confirm, it is
	// not used in this part of the file.
	RootPolicyLocks         *sync.Map
	// ReplicatedPolicyUpdates receives generic events that name a replicated policy (name plus
	// cluster namespace) which should be reconciled.
	ReplicatedPolicyUpdates chan event.GenericEvent
}
    46  
// clusterDecision contains a single decision where the replicated policy
// should be processed and any overrides to the root policy
type clusterDecision struct {
	// Cluster is the name of the cluster the decision applies to.
	Cluster         string
	// PolicyOverrides holds the binding overrides to apply on top of the root policy for this
	// cluster.
	PolicyOverrides policiesv1.BindingOverrides
}
    53  
    54  // cleanUpOrphanedRplPolicies compares the status of the input policy against the input placement
    55  // decisions. If the cluster exists in the status but doesn't exist in the input placement
    56  // decisions, then it's considered stale and an event is sent to the replicated policy reconciler
    57  // so the policy will be removed.
    58  func (r *RootPolicyReconciler) cleanUpOrphanedRplPolicies(
    59  	instance *policiesv1.Policy, originalCPCS []*policiesv1.CompliancePerClusterStatus, allDecisions common.DecisionSet,
    60  ) error {
    61  	log := log.WithValues("policyName", instance.GetName(), "policyNamespace", instance.GetNamespace())
    62  
    63  	for _, cluster := range originalCPCS {
    64  		if allDecisions[cluster.ClusterName] {
    65  			continue
    66  		}
    67  
    68  		// not found in allDecisions, orphan, send an event for it to delete itself
    69  		simpleObj := &common.GuttedObject{
    70  			TypeMeta: metav1.TypeMeta{
    71  				Kind:       policiesv1.Kind,
    72  				APIVersion: policiesv1.GroupVersion.String(),
    73  			},
    74  			ObjectMeta: metav1.ObjectMeta{
    75  				Name:      common.FullNameForPolicy(instance),
    76  				Namespace: cluster.ClusterName,
    77  			},
    78  		}
    79  
    80  		log.V(2).Info("Sending reconcile for replicated policy", "replicatedPolicyName", simpleObj.GetName())
    81  
    82  		r.ReplicatedPolicyUpdates <- event.GenericEvent{Object: simpleObj}
    83  	}
    84  
    85  	return nil
    86  }
    87  
    88  // handleRootPolicy will properly replicate or clean up when a root policy is updated.
    89  func (r *RootPolicyReconciler) handleRootPolicy(ctx context.Context, instance *policiesv1.Policy) error {
    90  	// Generate a metric for elapsed handling time for each policy
    91  	entryTS := time.Now()
    92  	defer func() {
    93  		now := time.Now()
    94  		elapsed := now.Sub(entryTS).Seconds()
    95  		roothandlerMeasure.Observe(elapsed)
    96  	}()
    97  
    98  	log := log.WithValues("policyName", instance.GetName(), "policyNamespace", instance.GetNamespace())
    99  
   100  	// Clean up the replicated policies if the policy is disabled
   101  	if instance.Spec.Disabled {
   102  		log.Info("The policy is disabled, doing clean up")
   103  
   104  		updateCount, err := r.updateExistingReplicas(ctx, instance.Namespace+"."+instance.Name)
   105  		if err != nil {
   106  			return err
   107  		}
   108  
   109  		// Checks if replicated policies exist in the event that
   110  		// a double reconcile to prevent emitting the same event twice
   111  		if updateCount > 0 {
   112  			r.Recorder.Event(instance, "Normal", "PolicyPropagation",
   113  				fmt.Sprintf("Policy %s/%s was disabled", instance.GetNamespace(), instance.GetName()))
   114  		}
   115  	}
   116  
   117  	// make a copy of the original status
   118  	originalCPCS := make([]*policiesv1.CompliancePerClusterStatus, len(instance.Status.Status))
   119  	copy(originalCPCS, instance.Status.Status)
   120  
   121  	decisions, err := common.RootStatusUpdate(ctx, r.Client, instance)
   122  	if err != nil {
   123  		return err
   124  	}
   125  
   126  	log.Info("Sending reconcile events to replicated policies", "decisionsCount", len(decisions))
   127  
   128  	for decision := range decisions {
   129  		simpleObj := &common.GuttedObject{
   130  			TypeMeta: metav1.TypeMeta{
   131  				Kind:       policiesv1.Kind,
   132  				APIVersion: policiesv1.GroupVersion.String(),
   133  			},
   134  			ObjectMeta: metav1.ObjectMeta{
   135  				Name:      common.FullNameForPolicy(instance),
   136  				Namespace: decision,
   137  			},
   138  		}
   139  
   140  		log.V(2).Info("Sending reconcile for replicated policy", "replicatedPolicyName", simpleObj.GetName())
   141  
   142  		r.ReplicatedPolicyUpdates <- event.GenericEvent{Object: simpleObj}
   143  	}
   144  
   145  	err = r.cleanUpOrphanedRplPolicies(instance, originalCPCS, decisions)
   146  	if err != nil {
   147  		log.Error(err, "Failed to delete orphaned replicated policies")
   148  
   149  		return err
   150  	}
   151  
   152  	return nil
   153  }
   154  
   155  // a helper to quickly check if there are any templates in any of the policy templates
   156  func policyHasTemplates(instance *policiesv1.Policy) bool {
   157  	for _, policyT := range instance.Spec.PolicyTemplates {
   158  		if templates.HasTemplate(policyT.ObjectDefinition.Raw, TemplateStartDelim, false) {
   159  			return true
   160  		}
   161  	}
   162  
   163  	return false
   164  }
   165  
// templateCtx is the context value handed to the template resolver when hub templates are
// resolved for one managed cluster.
type templateCtx struct {
	// ManagedClusterName is the name of the cluster the templates are being resolved for.
	ManagedClusterName   string
	// ManagedClusterLabels holds the labels of the ManagedCluster object. It is left empty at
	// construction and filled in by the addManagedClusterLabels context transformer.
	ManagedClusterLabels map[string]string
}
   170  
   171  func addManagedClusterLabels(clusterName string) func(templates.CachingQueryAPI, interface{}) (interface{}, error) {
   172  	return func(api templates.CachingQueryAPI, ctx interface{}) (interface{}, error) {
   173  		typedCtx, ok := ctx.(templateCtx)
   174  		if !ok {
   175  			return ctx, nil
   176  		}
   177  
   178  		managedClusterGVK := schema.GroupVersionKind{
   179  			Group:   "cluster.open-cluster-management.io",
   180  			Version: "v1",
   181  			Kind:    "ManagedCluster",
   182  		}
   183  
   184  		managedCluster, err := api.Get(managedClusterGVK, "", clusterName)
   185  		if err != nil {
   186  			return ctx, err
   187  		}
   188  
   189  		typedCtx.ManagedClusterLabels = managedCluster.GetLabels()
   190  
   191  		return typedCtx, nil
   192  	}
   193  }
   194  
   195  // Iterates through policy definitions and processes hub templates. A special annotation
   196  // policy.open-cluster-management.io/trigger-update is used to trigger reprocessing of the templates
   197  // and ensure that replicated-policies in the cluster are updated only if there is a change. This
   198  // annotation is deleted from the replicated policies and not propagated to the cluster namespaces.
   199  func (r *ReplicatedPolicyReconciler) processTemplates(
   200  	ctx context.Context,
   201  	replicatedPlc *policiesv1.Policy, clusterName string, rootPlc *policiesv1.Policy,
   202  ) error {
   203  	log := log.WithValues(
   204  		"policyName", rootPlc.GetName(),
   205  		"policyNamespace", rootPlc.GetNamespace(),
   206  		"cluster", clusterName,
   207  	)
   208  	log.V(1).Info("Processing templates")
   209  
   210  	annotations := replicatedPlc.GetAnnotations()
   211  
   212  	// handle possible nil map
   213  	if len(annotations) == 0 {
   214  		annotations = make(map[string]string)
   215  	}
   216  
   217  	// if disable-templates annotations exists and is true, then exit without processing templates
   218  	if disable, ok := annotations["policy.open-cluster-management.io/disable-templates"]; ok {
   219  		if boolDisable, err := strconv.ParseBool(disable); err == nil && boolDisable {
   220  			log.Info("Detected the disable-templates annotation. Will not process templates.")
   221  
   222  			return nil
   223  		}
   224  	}
   225  
   226  	// clear the trigger-update annotation, it's only for the root policy shouldn't be in replicated
   227  	// policies as it will cause an unnecessary update to the managed clusters
   228  	if _, ok := annotations[TriggerUpdateAnnotation]; ok {
   229  		delete(annotations, TriggerUpdateAnnotation)
   230  		replicatedPlc.SetAnnotations(annotations)
   231  	}
   232  
   233  	plcGVK := replicatedPlc.GroupVersionKind()
   234  
   235  	templateResolverOptions := templates.ResolveOptions{
   236  		ClusterScopedAllowList: []templates.ClusterScopedObjectIdentifier{
   237  			{
   238  				Group: "cluster.open-cluster-management.io",
   239  				Kind:  "ManagedCluster",
   240  				Name:  clusterName,
   241  			},
   242  		},
   243  		DisableAutoCacheCleanUp: true,
   244  		LookupNamespace:         rootPlc.GetNamespace(),
   245  		Watcher: &k8sdepwatches.ObjectIdentifier{
   246  			Group:     plcGVK.Group,
   247  			Version:   plcGVK.Version,
   248  			Kind:      plcGVK.Kind,
   249  			Namespace: replicatedPlc.GetNamespace(),
   250  			Name:      replicatedPlc.GetName(),
   251  		},
   252  	}
   253  
   254  	var templateResult templates.TemplateResult
   255  	var cacheCleanUp templates.CacheCleanUpFunc
   256  
   257  	defer func() {
   258  		if cacheCleanUp != nil {
   259  			err := cacheCleanUp()
   260  			if err != nil {
   261  				log.Error(err, "Failed to perform the cache clean up after template resolution")
   262  			}
   263  		}
   264  	}()
   265  
   266  	// A policy can have multiple policy templates within it, iterate and process each
   267  	for _, policyT := range replicatedPlc.Spec.PolicyTemplates {
   268  		if !templates.HasTemplate(policyT.ObjectDefinition.Raw, TemplateStartDelim, false) {
   269  			continue
   270  		}
   271  
   272  		if !isConfigurationPolicy(policyT) {
   273  			// has Templates but not a configuration policy
   274  			err := k8serrors.NewBadRequest("Templates are restricted to only Configuration Policies")
   275  			log.Error(err, "Not a Configuration Policy")
   276  
   277  			r.Recorder.Event(rootPlc, "Warning", "PolicyPropagation",
   278  				fmt.Sprintf(
   279  					"Policy %s/%s has templates but it is not a ConfigurationPolicy.",
   280  					rootPlc.GetName(),
   281  					rootPlc.GetNamespace(),
   282  				),
   283  			)
   284  
   285  			return err
   286  		}
   287  
   288  		log.V(1).Info("Found an object definition with templates")
   289  
   290  		templateContext := templateCtx{ManagedClusterName: clusterName}
   291  
   292  		if strings.Contains(string(policyT.ObjectDefinition.Raw), "ManagedClusterLabels") {
   293  			templateResolverOptions.ContextTransformers = append(
   294  				templateResolverOptions.ContextTransformers, addManagedClusterLabels(clusterName),
   295  			)
   296  		}
   297  
   298  		// Handle value encryption initialization
   299  		usesEncryption := templates.UsesEncryption(policyT.ObjectDefinition.Raw, TemplateStartDelim, TemplateStopDelim)
   300  		// Initialize AES Key and initialization vector
   301  		if usesEncryption && !templateResolverOptions.EncryptionEnabled {
   302  			log.V(1).Info("Found an object definition requiring encryption. Handling encryption keys.")
   303  			// Get/generate the encryption key
   304  			encryptionKey, err := r.getEncryptionKey(ctx, clusterName)
   305  			if err != nil {
   306  				log.Error(err, "Failed to get/generate the policy encryption key")
   307  
   308  				return fmt.Errorf("%w%w", ErrRetryable, err)
   309  			}
   310  
   311  			// Get/generate the initialization vector
   312  			initializationVector, err := r.getInitializationVector(
   313  				rootPlc.GetName(), clusterName, annotations,
   314  			)
   315  			if err != nil {
   316  				log.Error(err, "Failed to get initialization vector")
   317  
   318  				return err
   319  			}
   320  
   321  			// Set the initialization vector in the annotations
   322  			replicatedPlc.SetAnnotations(annotations)
   323  
   324  			// Set the EncryptionConfig with the retrieved key
   325  			templateResolverOptions.EncryptionConfig = templates.EncryptionConfig{
   326  				EncryptionEnabled:    true,
   327  				AESKey:               encryptionKey,
   328  				InitializationVector: initializationVector,
   329  			}
   330  		}
   331  
   332  		var tplErr error
   333  
   334  		templateResult, tplErr = r.TemplateResolver.ResolveTemplate(
   335  			policyT.ObjectDefinition.Raw, templateContext, &templateResolverOptions,
   336  		)
   337  
   338  		if templateResult.CacheCleanUp != nil {
   339  			cacheCleanUp = templateResult.CacheCleanUp
   340  		}
   341  
   342  		if tplErr != nil {
   343  			log.Error(tplErr, "Failed to resolve templates")
   344  
   345  			r.Recorder.Event(
   346  				rootPlc,
   347  				"Warning",
   348  				"PolicyPropagation",
   349  				fmt.Sprintf(
   350  					"Failed to resolve templates for cluster %s: %s",
   351  					clusterName,
   352  					tplErr.Error(),
   353  				),
   354  			)
   355  			// Set an annotation on the policyTemplate(e.g. ConfigurationPolicy) to the template processing error msg
   356  			// managed clusters will use this when creating a violation
   357  			policyTObjectUnstructured := &unstructured.Unstructured{}
   358  
   359  			jsonErr := json.Unmarshal(policyT.ObjectDefinition.Raw, policyTObjectUnstructured)
   360  			if jsonErr != nil {
   361  				// it shouldn't get here but if it did just log a msg
   362  				// it's all right, a generic msg will be used on the managedcluster
   363  				log.Error(jsonErr, "Error unmarshalling the object definition to JSON")
   364  			} else {
   365  				policyTAnnotations := policyTObjectUnstructured.GetAnnotations()
   366  				if policyTAnnotations == nil {
   367  					policyTAnnotations = make(map[string]string)
   368  				}
   369  				policyTAnnotations["policy.open-cluster-management.io/hub-templates-error"] = tplErr.Error()
   370  				policyTObjectUnstructured.SetAnnotations(policyTAnnotations)
   371  
   372  				updatedPolicyT, jsonErr := json.Marshal(policyTObjectUnstructured)
   373  				if jsonErr != nil {
   374  					log.Error(jsonErr, "Failed to marshall the policy template to JSON")
   375  				} else {
   376  					policyT.ObjectDefinition.Raw = updatedPolicyT
   377  				}
   378  			}
   379  
   380  			// If the failure was due to a Kubernetes API error that could be recoverable, let's retry it.
   381  			// Missing objects are handled by the templating library sending reconcile requests when they get created.
   382  			if errors.Is(tplErr, templates.ErrMissingAPIResource) ||
   383  				k8serrors.IsInternalError(tplErr) ||
   384  				k8serrors.IsServiceUnavailable(tplErr) ||
   385  				k8serrors.IsTimeout(tplErr) ||
   386  				k8serrors.IsTooManyRequests(tplErr) {
   387  				tplErr = fmt.Errorf("%w%w", ErrRetryable, tplErr)
   388  			}
   389  
   390  			return tplErr
   391  		}
   392  
   393  		policyT.ObjectDefinition.Raw = templateResult.ResolvedJSON
   394  
   395  		// Set initialization vector annotation on the ObjectDefinition for the controller's use
   396  		if usesEncryption {
   397  			policyTObjectUnstructured := &unstructured.Unstructured{}
   398  
   399  			jsonErr := json.Unmarshal(templateResult.ResolvedJSON, policyTObjectUnstructured)
   400  			if jsonErr != nil {
   401  				return fmt.Errorf("failed to unmarshal the object definition to JSON: %w", jsonErr)
   402  			}
   403  
   404  			policyTAnnotations := policyTObjectUnstructured.GetAnnotations()
   405  			if policyTAnnotations == nil {
   406  				policyTAnnotations = make(map[string]string)
   407  			}
   408  
   409  			policyIV := annotations[IVAnnotation]
   410  			foundIV := policyTAnnotations[IVAnnotation]
   411  
   412  			if policyIV != foundIV {
   413  				policyTAnnotations[IVAnnotation] = policyIV
   414  				policyTObjectUnstructured.SetAnnotations(policyTAnnotations)
   415  
   416  				updatedPolicyT, jsonErr := json.Marshal(policyTObjectUnstructured)
   417  				if jsonErr != nil {
   418  					return fmt.Errorf("failed to marshal the policy template to JSON: %w", jsonErr)
   419  				}
   420  
   421  				policyT.ObjectDefinition.Raw = updatedPolicyT
   422  			}
   423  		}
   424  	}
   425  
   426  	log.V(1).Info("Successfully processed templates")
   427  
   428  	return nil
   429  }
   430  
   431  func isConfigurationPolicy(policyT *policiesv1.PolicyTemplate) bool {
   432  	// check if it is a configuration policy first
   433  	var jsonDef map[string]interface{}
   434  	_ = json.Unmarshal(policyT.ObjectDefinition.Raw, &jsonDef)
   435  
   436  	return jsonDef != nil && jsonDef["kind"] == "ConfigurationPolicy"
   437  }