open-cluster-management.io/governance-policy-propagator@v0.13.0/controllers/automation/policyautomation_controller.go

     1  // Copyright (c) 2021 Red Hat, Inc.
     2  // Copyright Contributors to the Open Cluster Management project
     3  
     4  package automation
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"strconv"
    10  	"time"
    11  
    12  	"k8s.io/apimachinery/pkg/api/errors"
    13  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    14  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    15  	"k8s.io/apimachinery/pkg/runtime"
    16  	"k8s.io/apimachinery/pkg/runtime/schema"
    17  	"k8s.io/apimachinery/pkg/types"
    18  	"k8s.io/client-go/dynamic"
    19  	"k8s.io/client-go/tools/record"
    20  	ctrl "sigs.k8s.io/controller-runtime"
    21  	"sigs.k8s.io/controller-runtime/pkg/builder"
    22  	"sigs.k8s.io/controller-runtime/pkg/client"
    23  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    24  
    25  	policyv1 "open-cluster-management.io/governance-policy-propagator/api/v1"
    26  	policyv1beta1 "open-cluster-management.io/governance-policy-propagator/api/v1beta1"
    27  	"open-cluster-management.io/governance-policy-propagator/controllers/common"
    28  )
    29  
    30  const ControllerName string = "policy-automation"
    31  
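         // dnsGVR identifies the cluster-scoped OpenShift DNS configuration resource used to look up the hub's base domain.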
    32  var dnsGVR = schema.GroupVersionResource{Group: "config.openshift.io", Version: "v1", Resource: "dnses"}
    33  
    34  var log = ctrl.Log.WithName(ControllerName)
    35  
    36  //+kubebuilder:rbac:groups=config.openshift.io,resources=dnses,resourceNames=cluster,verbs=get
    37  //+kubebuilder:rbac:groups=policy.open-cluster-management.io,resources=policyautomations,verbs=get;list;watch;create;update;patch;delete
    38  //+kubebuilder:rbac:groups=policy.open-cluster-management.io,resources=policyautomations/status,verbs=get;update;patch
    39  //+kubebuilder:rbac:groups=policy.open-cluster-management.io,resources=policyautomations/finalizers,verbs=update
    40  //+kubebuilder:rbac:groups=tower.ansible.com,resources=ansiblejobs,verbs=get;list;watch;create;update;patch;delete;deletecollection
    41  
    42  // SetupWithManager sets up the controller with the Manager.
    43  func (r *PolicyAutomationReconciler) SetupWithManager(mgr ctrl.Manager) error {
    44  	return ctrl.NewControllerManagedBy(mgr).
    45  		Named(ControllerName).
    46  		Watches(
    47  			&policyv1.Policy{},
    48  			&common.EnqueueRequestsFromMapFunc{ToRequests: policyMapper(mgr.GetClient())},
    49  			builder.WithPredicates(policyPredicateFuncs)).
    50  		For(
    51  			&policyv1beta1.PolicyAutomation{},
    52  			builder.WithPredicates(policyAuomtationPredicateFuncs)).
    53  		Complete(r)
    54  }
    55  
     56  // blank assignment to verify that PolicyAutomationReconciler implements reconcile.Reconciler
    57  var _ reconcile.Reconciler = &PolicyAutomationReconciler{}
    58  
    59  // PolicyAutomationReconciler reconciles a PolicyAutomation object
    60  type PolicyAutomationReconciler struct {
    61  	client.Client
    62  	DynamicClient dynamic.Interface
    63  	Scheme        *runtime.Scheme
    64  	Recorder      record.EventRecorder
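         	// counter tracks how many scan-mode reconciles have run; it is only used for debug logging.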
    65  	counter       int
    66  }
    67  
    68  // setOwnerReferences will set the input policy as the sole owner of the input policyAutomation and make the update
    69  // with the API. In practice, this will cause the input policyAutomation to be deleted when the policy is deleted.
    70  func (r *PolicyAutomationReconciler) setOwnerReferences(
    71  	ctx context.Context,
    72  	policyAutomation *policyv1beta1.PolicyAutomation,
    73  	policy *policyv1.Policy,
    74  ) error {
    75  	var policyOwnerRefFound bool
    76  
    77  	for _, ownerRef := range policyAutomation.GetOwnerReferences() {
    78  		if ownerRef.UID == policy.UID {
    79  			policyOwnerRefFound = true
    80  
    81  			break
    82  		}
    83  	}
    84  
    85  	if !policyOwnerRefFound {
    86  		log.V(3).Info(fmt.Sprintf("Setting the owner reference on the PolicyAutomation %s", policyAutomation.GetName()))
    87  		policyAutomation.SetOwnerReferences([]metav1.OwnerReference{
    88  			*metav1.NewControllerRef(policy, policy.GroupVersionKind()),
    89  		})
    90  
    91  		return r.Update(ctx, policyAutomation)
    92  	}
    93  
    94  	return nil
    95  }
    96  
    97  // getTargetListMap will convert slice targetList to map for search efficiency
    98  func getTargetListMap(targetList []string) map[string]bool {
    99  	targetListMap := map[string]bool{}
   100  	for _, target := range targetList {
   101  		targetListMap[target] = true
   102  	}
   103  
   104  	return targetListMap
   105  }
   106  
   107  // getClusterDNSName will get the Hub cluster DNS name if the Hub is an OpenShift cluster.
   108  func (r *PolicyAutomationReconciler) getClusterDNSName(ctx context.Context) (string, error) {
   109  	dnsCluster, err := r.DynamicClient.Resource(dnsGVR).Get(ctx, "cluster", metav1.GetOptions{})
   110  	if err != nil {
   111  		if errors.IsNotFound(err) {
   112  			// This is a debug log to not spam the logs when the Hub is installed on a Kubernetes distribution other
   113  			// than OpenShift.
   114  			log.V(2).Info("The Hub cluster DNS name couldn't be determined")
   115  
   116  			return "", nil
   117  		}
   118  
   119  		return "", err
   120  	}
   121  
   122  	dnsName, _, _ := unstructured.NestedString(dnsCluster.Object, "spec", "baseDomain")
   123  	if dnsName == "" {
   124  		log.Info("The OpenShift DNS object named cluster did not contain a valid spec.baseDomain value")
   125  	} else {
   126  		log.V(2).Info("The Hub cluster DNS name was found", "name", dnsName)
   127  	}
   128  
   129  	return dnsName, nil
   130  }
   131  
    132  // getViolationContext collects the root policy information into a ViolationContext,
    133  // along with the status of each non-compliant replicated policy.
   134  func (r *PolicyAutomationReconciler) getViolationContext(
   135  	ctx context.Context,
   136  	policy *policyv1.Policy,
   137  	targetList []string,
   138  	policyAutomation *policyv1beta1.PolicyAutomation,
   139  ) (policyv1beta1.ViolationContext, error) {
    140  	log.V(3).Info(
    141  		"Getting the violation context from the root policy",
    142  		"namespace", policy.GetNamespace(),
    143  		"name", policy.GetName(),
    144  	)
   145  
   146  	violationContext := policyv1beta1.ViolationContext{}
   147  	// 1) get the target cluster list
   148  	violationContext.TargetClusters = targetList
   149  	// 2) get the root policy name
   150  	violationContext.PolicyName = policy.GetName()
   151  	// 3) get the root policy namespace
   152  	violationContext.PolicyNamespace = policy.GetNamespace()
   153  	// 4) get the root policy hub cluster name
   154  	var err error
   155  
   156  	violationContext.HubCluster, err = r.getClusterDNSName(ctx)
   157  	if err != nil {
   158  		return policyv1beta1.ViolationContext{}, err
   159  	}
   160  
   161  	// 5) get the policy sets of the root policy
   162  	plcPlacement := policy.Status.Placement
   163  	policySets := []string{}
   164  
   165  	for _, placement := range plcPlacement {
   166  		if placement.PolicySet != "" {
   167  			policySets = append(policySets, placement.PolicySet)
   168  		}
   169  	}
   170  
   171  	violationContext.PolicySets = policySets
   172  
   173  	// skip policy_violation_context if all clusters are compliant
   174  	if len(targetList) == 0 {
   175  		return violationContext, nil
   176  	}
   177  
   178  	replicatedPlcList := &policyv1.PolicyList{}
   179  
   180  	err = r.List(
    181  		ctx,
   182  		replicatedPlcList,
   183  		client.MatchingLabels(common.LabelsForRootPolicy(policy)),
   184  	)
   185  	if err != nil {
   186  		log.Error(err, "Failed to list the replicated policies")
   187  
   188  		return violationContext, err
   189  	}
   190  
   191  	if len(replicatedPlcList.Items) == 0 {
    192  		log.V(2).Info("No replicated policies were found")
   193  
   194  		return violationContext, nil
   195  	}
   196  
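         	// policyViolationsLimit caps how many per-cluster violation entries are included in the context; 0 means no limit.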
   197  	policyViolationsLimit := policyAutomation.Spec.Automation.PolicyViolationsLimit
   198  	if policyViolationsLimit == nil {
   199  		policyViolationsLimit = new(uint)
   200  		*policyViolationsLimit = policyv1beta1.DefaultPolicyViolationsLimit
   201  	}
   202  
   203  	contextLimit := int(*policyViolationsLimit)
   204  
   205  	targetListMap := getTargetListMap(targetList)
   206  	violationContext.PolicyViolations = make(
   207  		map[string]policyv1beta1.ReplicatedPolicyStatus,
   208  		len(replicatedPlcList.Items),
   209  	)
   210  
    211  	// 6) get the status of the non-compliant replicated policies
   212  	for _, rPlc := range replicatedPlcList.Items {
   213  		clusterName := rPlc.GetLabels()[common.ClusterNameLabel]
   214  		if !targetListMap[clusterName] {
    215  			continue // skip the compliant replicated policies
   216  		}
   217  
   218  		rPlcStatus := policyv1beta1.ReplicatedPolicyStatus{}
   219  		// Convert PolicyStatus to ReplicatedPolicyStatus and skip the unnecessary items
   220  		err := common.TypeConverter(rPlc.Status, &rPlcStatus)
   221  		if err != nil { // still assign the empty rPlcStatus to PolicyViolations later
   222  			log.Error(err, "The PolicyStatus cannot be converted to the type ReplicatedPolicyStatus.")
   223  		}
   224  
   225  		// get the latest violation message from the replicated policy
   226  		statusDetails := rPlc.Status.Details
   227  		if len(statusDetails) > 0 && len(statusDetails[0].History) > 0 {
   228  			rPlcStatus.ViolationMessage = statusDetails[0].History[0].Message
   229  		}
   230  
   231  		violationContext.PolicyViolations[clusterName] = rPlcStatus
   232  		if contextLimit > 0 && len(violationContext.PolicyViolations) == contextLimit {
    233  			log.V(2).Info(
    234  				"Reached the policyViolationsLimit; skipping the remaining replicated policy violations",
    235  				"limit", contextLimit,
    236  				"skipped", len(replicatedPlcList.Items)-contextLimit,
    237  			)
   238  
   239  			break
   240  		}
   241  	}
   242  
   243  	return violationContext, nil
   244  }
   245  
    246  // Reconcile reads the state of the cluster for a PolicyAutomation object and makes changes based on the state read
    247  // and what is in the PolicyAutomation.Spec
   248  // Note:
   249  // The Controller will requeue the Request to be processed again if the returned error is non-nil or
   250  // Result.Requeue is true, otherwise upon completion it will remove the work from the queue.
   251  func (r *PolicyAutomationReconciler) Reconcile(
   252  	ctx context.Context, request ctrl.Request,
   253  ) (ctrl.Result, error) {
   254  	log := log.WithValues("Request.Namespace", request.Namespace, "Request.Name", request.Name)
   255  
   256  	// Fetch the PolicyAutomation instance
   257  	policyAutomation := &policyv1beta1.PolicyAutomation{}
   258  
   259  	err := r.Get(ctx, request.NamespacedName, policyAutomation)
   260  	if err != nil {
   261  		if errors.IsNotFound(err) {
   262  			log.V(2).Info("Automation was deleted. Nothing to do.")
   263  
   264  			return reconcile.Result{}, nil
   265  		}
   266  
   267  		// Error reading the object - requeue the request.
   268  		return reconcile.Result{}, err
   269  	}
   270  
   271  	if policyAutomation.Spec.PolicyRef == "" {
   272  		log.Info("No policyRef in PolicyAutomation. Will ignore it.")
   273  
   274  		return reconcile.Result{}, nil
   275  	}
   276  
   277  	log = log.WithValues("policyRef", policyAutomation.Spec.PolicyRef)
   278  
   279  	policy := &policyv1.Policy{}
   280  
   281  	err = r.Get(ctx, types.NamespacedName{
   282  		Name:      policyAutomation.Spec.PolicyRef,
   283  		Namespace: policyAutomation.GetNamespace(),
   284  	}, policy)
   285  	if err != nil {
   286  		if errors.IsNotFound(err) {
   287  			log.Info("Policy specified in policyRef field not found, may have been deleted, doing nothing")
   288  
   289  			return reconcile.Result{}, nil
   290  		}
   291  
   292  		log.Error(err, "Failed to retrieve the policy specified in the policyRef field")
   293  
   294  		return reconcile.Result{}, err
   295  	}
   296  
   297  	err = r.setOwnerReferences(ctx, policyAutomation, policy)
   298  	if err != nil {
   299  		log.Error(err, "Failed to set the owner reference. Will requeue.")
   300  
   301  		return reconcile.Result{}, err
   302  	}
   303  
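         	// A rerun annotation set to "true" requests a one-off manual run: create a manual-mode AnsibleJob for
         	// the currently non-compliant clusters (unless one already exists for this resourceVersion), then remove
         	// the annotation.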
   304  	if policyAutomation.Annotations["policy.open-cluster-management.io/rerun"] == "true" {
   305  		AjExist, err := MatchPAResouceV(policyAutomation,
   306  			r.DynamicClient, policyAutomation.GetResourceVersion())
   307  		if err != nil {
   308  			log.Error(err, "Failed to compare Ansible job's resourceVersion")
   309  
   310  			return reconcile.Result{}, err
   311  		}
   312  
   313  		if AjExist {
    314  			log.Info("An AnsibleJob already exists for this PolicyAutomation resourceVersion")
   315  
   316  			return reconcile.Result{}, nil
   317  		}
   318  
   319  		targetList := common.FindNonCompliantClustersForPolicy(policy)
   320  		log.Info(
   321  			"Creating an Ansible job", "mode", "manual",
   322  			"clusterCount", strconv.Itoa(len(targetList)))
   323  
   324  		violationContext, _ := r.getViolationContext(ctx, policy, targetList, policyAutomation)
   325  
   326  		err = CreateAnsibleJob(
   327  			policyAutomation,
   328  			r.DynamicClient,
   329  			"manual",
   330  			violationContext,
   331  		)
   332  		if err != nil {
   333  			log.Error(err, "Failed to create the Ansible job", "mode", "manual")
   334  
   335  			return reconcile.Result{}, err
   336  		}
   337  		// manual run succeeded, remove annotation
   338  		delete(policyAutomation.Annotations, "policy.open-cluster-management.io/rerun")
   339  
   340  		err = r.Update(ctx, policyAutomation, &client.UpdateOptions{})
   341  		if err != nil {
   342  			log.Error(err, "Failed to remove the annotation `policy.open-cluster-management.io/rerun`")
   343  
   344  			return reconcile.Result{}, err
   345  		}
   346  
   347  		return reconcile.Result{}, nil
   348  	} else if policyAutomation.Spec.Mode == policyv1beta1.Disabled {
   349  		log.Info("Automation is disabled, doing nothing")
   350  
   351  		return reconcile.Result{}, nil
   352  	} else {
   353  		if policy.Spec.Disabled {
   354  			log.Info("The policy is disabled. Doing nothing.")
   355  
   356  			return reconcile.Result{}, nil
   357  		}
   358  
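         		// In scan mode, an AnsibleJob is created whenever non-compliant clusters are found, and the request
         		// is requeued after the configured rescanAfter duration so the policy is evaluated again.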
   359  		if policyAutomation.Spec.Mode == "scan" {
   360  			log := log.WithValues("mode", "scan")
   361  			log.V(2).Info("Triggering scan mode")
   362  
   363  			requeueAfter, err := time.ParseDuration(policyAutomation.Spec.RescanAfter)
   364  			if err != nil {
   365  				if policyAutomation.Spec.RescanAfter != "" {
   366  					log.Error(err, "Invalid spec.rescanAfter value")
   367  				}
   368  
   369  				return reconcile.Result{RequeueAfter: requeueAfter}, err
   370  			}
   371  
   372  			targetList := common.FindNonCompliantClustersForPolicy(policy)
   373  			if len(targetList) > 0 {
    374  				log.Info("Creating an Ansible job", "targetList", targetList)
   375  				violationContext, _ := r.getViolationContext(ctx, policy, targetList, policyAutomation)
   376  				err = CreateAnsibleJob(policyAutomation, r.DynamicClient, "scan",
   377  					violationContext)
   378  				if err != nil {
   379  					return reconcile.Result{RequeueAfter: requeueAfter}, err
   380  				}
   381  			} else {
   382  				log.Info("All clusters are compliant. Doing nothing.")
   383  			}
   384  
    385  			// requeue to scan the policy again after the rescanAfter duration
   386  			r.counter++
   387  			log.V(2).Info(
   388  				"RequeueAfter.", "RequeueAfter", requeueAfter.String(), "Counter", fmt.Sprintf("%d", r.counter),
   389  			)
   390  
   391  			return reconcile.Result{RequeueAfter: requeueAfter}, nil
   392  		} else if policyAutomation.Spec.Mode == policyv1beta1.Once {
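         			// In once mode, a single AnsibleJob is created for the non-compliant clusters (skipped if one
         			// already exists for this generation), and the mode is then set to disabled.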
   393  			log := log.WithValues("mode", string(policyv1beta1.Once))
   394  			targetList := common.FindNonCompliantClustersForPolicy(policy)
   395  			if len(targetList) > 0 {
   396  				log.Info("Creating an Ansible job", "targetList", targetList)
   397  
   398  				AjExist, err := MatchPAGeneration(policyAutomation,
   399  					r.DynamicClient, policyAutomation.GetGeneration())
   400  				if err != nil {
   401  					log.Error(err, "Failed to get Ansible job's generation")
   402  
   403  					return reconcile.Result{}, err
   404  				}
   405  				if AjExist {
   406  					return reconcile.Result{}, nil
   407  				}
   408  				violationContext, _ := r.getViolationContext(ctx, policy, targetList, policyAutomation)
   409  				err = CreateAnsibleJob(
   410  					policyAutomation,
   411  					r.DynamicClient,
   412  					string(policyv1beta1.Once),
   413  					violationContext,
   414  				)
   415  				if err != nil {
   416  					log.Error(err, "Failed to create the Ansible job")
   417  
   418  					return reconcile.Result{}, err
   419  				}
   420  
   421  				policyAutomation.Spec.Mode = policyv1beta1.Disabled
   422  
   423  				err = r.Update(ctx, policyAutomation, &client.UpdateOptions{})
   424  				if err != nil {
   425  					log.Error(err, "Failed to update the mode to disabled")
   426  
   427  					return reconcile.Result{}, err
   428  				}
   429  			} else {
   430  				log.Info("All clusters are compliant. Doing nothing.")
   431  			}
   432  		} else if policyAutomation.Spec.Mode == policyv1beta1.EveryEvent {
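         			// In everyEvent mode, an AnsibleJob is created each time a cluster newly becomes non-compliant.
         			// The ClustersWithEvent status records past events so that repeated violations within the
         			// delayAfterRunSeconds window do not trigger duplicate jobs.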
   433  			log := log.WithValues("mode", string(policyv1beta1.EveryEvent))
   434  			targetList := common.FindNonCompliantClustersForPolicy(policy)
   435  			targetListMap := getTargetListMap(targetList)
    436  			// The set of clusters that the new AnsibleJob will target
    437  			trimmedTargetMap := map[string]bool{}
    438  			// delayAfterRunSeconds and requeueDuration default to zero
   439  			delayAfterRunSeconds := policyAutomation.Spec.DelayAfterRunSeconds
   440  			requeueDuration := 0
   441  			requeueFlag := false
   442  			// Automation event time grouped by the cluster name
   443  			eventMap := map[string]policyv1beta1.ClusterEvent{}
   444  			if len(policyAutomation.Status.ClustersWithEvent) > 0 {
   445  				eventMap = policyAutomation.Status.ClustersWithEvent
   446  			}
   447  
   448  			now := time.Now().UTC()
   449  			nowStr := now.Format(time.RFC3339)
   450  
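         			// Reconcile the recorded events against the current compliance state: drop events whose delay
         			// period has expired, refresh events still within the delay period, and collect the clusters that
         			// need a new AnsibleJob.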
   451  			for clusterName, clusterEvent := range eventMap {
   452  				originalStartTime, err := time.Parse(time.RFC3339, clusterEvent.AutomationStartTime)
   453  				if err != nil {
   454  					log.Error(err, "Failed to retrieve AutomationStartTime in ClustersWithEvent")
   455  					delete(eventMap, clusterName)
   456  				}
   457  
   458  				preEventTime, err := time.Parse(time.RFC3339, clusterEvent.EventTime)
   459  				if err != nil {
   460  					log.Error(err, "Failed to retrieve EventTime in ClustersWithEvent")
   461  					delete(eventMap, clusterName)
   462  				}
   463  
    464  				// The time at which the delayAfterRunSeconds period expires
   465  				delayUntil := originalStartTime.Add(time.Duration(delayAfterRunSeconds) * time.Second)
   466  
   467  				// The policy is non-compliant with the target cluster
   468  				if targetListMap[clusterName] {
   469  					// Policy status changed from non-compliant to compliant
   470  					// then back to non-compliant during the delay period
   471  					if delayAfterRunSeconds > 0 && preEventTime.After(originalStartTime) {
   472  						if now.After(delayUntil) {
   473  							// The delay period passed so remove the previous event
   474  							delete(eventMap, clusterName)
   475  							// Add the cluster name to create a new ansible job
   476  							if !trimmedTargetMap[clusterName] {
   477  								trimmedTargetMap[clusterName] = true
   478  							}
   479  						} else {
   480  							requeueFlag = true
   481  							// Within the delay period and use the earliest requeueDuration to requeue
   482  							if (requeueDuration == 0) || (requeueDuration > int(delayUntil.Sub(now)+1)) {
   483  								requeueDuration = int(delayUntil.Sub(now) + 1)
   484  							}
   485  							// keep the event and update eventTime
   486  							clusterEvent.EventTime = nowStr
   487  							// new event from compliant to non-compliant
   488  							eventMap[clusterName] = clusterEvent
   489  						}
   490  					} // Otherwise, the policy keeps non-compliant since originalStartTime, do nothing
   491  				} else { // The policy is compliant with the target cluster
   492  					if delayAfterRunSeconds > 0 && now.Before(delayUntil) {
   493  						// Within the delay period, keep the event and update eventTime
   494  						clusterEvent.EventTime = nowStr
   495  						// new event from non-compliant to compliant
   496  						eventMap[clusterName] = clusterEvent
   497  					} else { // No delay period or it is expired, remove the event
   498  						delete(eventMap, clusterName)
   499  					}
   500  				}
   501  			}
   502  
   503  			for _, clusterName := range targetList {
   504  				if _, ok := eventMap[clusterName]; !ok {
   505  					// Add the non-compliant clusters without previous automation event
   506  					if !trimmedTargetMap[clusterName] {
   507  						trimmedTargetMap[clusterName] = true
   508  					}
   509  				}
   510  			}
   511  
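         			// Create one AnsibleJob covering every cluster that newly requires automation, and record the
         			// automation start time for each of those clusters.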
   512  			if len(trimmedTargetMap) > 0 {
   513  				trimmedTargetList := []string{}
   514  				for clusterName := range trimmedTargetMap {
   515  					trimmedTargetList = append(trimmedTargetList, clusterName)
   516  				}
    517  				log.Info("Creating an Ansible job", "trimmedTargetList", trimmedTargetList)
   518  				violationContext, _ := r.getViolationContext(ctx, policy, trimmedTargetList, policyAutomation)
   519  				err = CreateAnsibleJob(
   520  					policyAutomation,
   521  					r.DynamicClient,
   522  					string(policyv1beta1.EveryEvent),
   523  					violationContext,
   524  				)
   525  				if err != nil {
   526  					log.Error(err, "Failed to create the Ansible job")
   527  
   528  					return reconcile.Result{}, err
   529  				}
   530  
   531  				automationStartTimeStr := time.Now().UTC().Format(time.RFC3339)
   532  
   533  				for _, clusterName := range trimmedTargetList {
   534  					eventMap[clusterName] = policyv1beta1.ClusterEvent{
   535  						AutomationStartTime: automationStartTimeStr,
   536  						EventTime:           nowStr,
   537  					}
   538  				}
   539  			} else {
   540  				log.Info("All clusters are compliant. No new Ansible job. Just update ClustersWithEvent.")
   541  			}
   542  
   543  			policyAutomation.Status.ClustersWithEvent = eventMap
   544  			// use StatusWriter to update status subresource of a Kubernetes object
   545  			err = r.Status().Update(ctx, policyAutomation)
   546  			if err != nil {
   547  				log.Error(err, "Failed to update ClustersWithEvent in policyAutomation status")
   548  
   549  				return reconcile.Result{}, err
   550  			}
   551  
   552  			if requeueFlag {
   553  				log.Info(
   554  					"Requeue for the new non-compliant event during the delay period",
   555  					"Delay in seconds", delayAfterRunSeconds,
   556  					"Requeue After", requeueDuration,
   557  				)
   558  
   559  				return reconcile.Result{RequeueAfter: time.Duration(requeueDuration)}, nil
   560  			}
   561  		}
   562  	}
   563  
   564  	return ctrl.Result{}, nil
   565  }