open-cluster-management.io/governance-policy-propagator@v0.13.0/controllers/automation/policyautomation_controller.go (about) 1 // Copyright (c) 2021 Red Hat, Inc. 2 // Copyright Contributors to the Open Cluster Management project 3 4 package automation 5 6 import ( 7 "context" 8 "fmt" 9 "strconv" 10 "time" 11 12 "k8s.io/apimachinery/pkg/api/errors" 13 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 14 "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 15 "k8s.io/apimachinery/pkg/runtime" 16 "k8s.io/apimachinery/pkg/runtime/schema" 17 "k8s.io/apimachinery/pkg/types" 18 "k8s.io/client-go/dynamic" 19 "k8s.io/client-go/tools/record" 20 ctrl "sigs.k8s.io/controller-runtime" 21 "sigs.k8s.io/controller-runtime/pkg/builder" 22 "sigs.k8s.io/controller-runtime/pkg/client" 23 "sigs.k8s.io/controller-runtime/pkg/reconcile" 24 25 policyv1 "open-cluster-management.io/governance-policy-propagator/api/v1" 26 policyv1beta1 "open-cluster-management.io/governance-policy-propagator/api/v1beta1" 27 "open-cluster-management.io/governance-policy-propagator/controllers/common" 28 ) 29 30 const ControllerName string = "policy-automation" 31 32 var dnsGVR = schema.GroupVersionResource{Group: "config.openshift.io", Version: "v1", Resource: "dnses"} 33 34 var log = ctrl.Log.WithName(ControllerName) 35 36 //+kubebuilder:rbac:groups=config.openshift.io,resources=dnses,resourceNames=cluster,verbs=get 37 //+kubebuilder:rbac:groups=policy.open-cluster-management.io,resources=policyautomations,verbs=get;list;watch;create;update;patch;delete 38 //+kubebuilder:rbac:groups=policy.open-cluster-management.io,resources=policyautomations/status,verbs=get;update;patch 39 //+kubebuilder:rbac:groups=policy.open-cluster-management.io,resources=policyautomations/finalizers,verbs=update 40 //+kubebuilder:rbac:groups=tower.ansible.com,resources=ansiblejobs,verbs=get;list;watch;create;update;patch;delete;deletecollection 41 42 // SetupWithManager sets up the controller with the Manager. 43 func (r *PolicyAutomationReconciler) SetupWithManager(mgr ctrl.Manager) error { 44 return ctrl.NewControllerManagedBy(mgr). 45 Named(ControllerName). 46 Watches( 47 &policyv1.Policy{}, 48 &common.EnqueueRequestsFromMapFunc{ToRequests: policyMapper(mgr.GetClient())}, 49 builder.WithPredicates(policyPredicateFuncs)). 50 For( 51 &policyv1beta1.PolicyAutomation{}, 52 builder.WithPredicates(policyAuomtationPredicateFuncs)). 53 Complete(r) 54 } 55 56 // blank assignment to verify that ReconcilePolicy implements reconcile.Reconciler 57 var _ reconcile.Reconciler = &PolicyAutomationReconciler{} 58 59 // PolicyAutomationReconciler reconciles a PolicyAutomation object 60 type PolicyAutomationReconciler struct { 61 client.Client 62 DynamicClient dynamic.Interface 63 Scheme *runtime.Scheme 64 Recorder record.EventRecorder 65 counter int 66 } 67 68 // setOwnerReferences will set the input policy as the sole owner of the input policyAutomation and make the update 69 // with the API. In practice, this will cause the input policyAutomation to be deleted when the policy is deleted. 70 func (r *PolicyAutomationReconciler) setOwnerReferences( 71 ctx context.Context, 72 policyAutomation *policyv1beta1.PolicyAutomation, 73 policy *policyv1.Policy, 74 ) error { 75 var policyOwnerRefFound bool 76 77 for _, ownerRef := range policyAutomation.GetOwnerReferences() { 78 if ownerRef.UID == policy.UID { 79 policyOwnerRefFound = true 80 81 break 82 } 83 } 84 85 if !policyOwnerRefFound { 86 log.V(3).Info(fmt.Sprintf("Setting the owner reference on the PolicyAutomation %s", policyAutomation.GetName())) 87 policyAutomation.SetOwnerReferences([]metav1.OwnerReference{ 88 *metav1.NewControllerRef(policy, policy.GroupVersionKind()), 89 }) 90 91 return r.Update(ctx, policyAutomation) 92 } 93 94 return nil 95 } 96 97 // getTargetListMap will convert slice targetList to map for search efficiency 98 func getTargetListMap(targetList []string) map[string]bool { 99 targetListMap := map[string]bool{} 100 for _, target := range targetList { 101 targetListMap[target] = true 102 } 103 104 return targetListMap 105 } 106 107 // getClusterDNSName will get the Hub cluster DNS name if the Hub is an OpenShift cluster. 108 func (r *PolicyAutomationReconciler) getClusterDNSName(ctx context.Context) (string, error) { 109 dnsCluster, err := r.DynamicClient.Resource(dnsGVR).Get(ctx, "cluster", metav1.GetOptions{}) 110 if err != nil { 111 if errors.IsNotFound(err) { 112 // This is a debug log to not spam the logs when the Hub is installed on a Kubernetes distribution other 113 // than OpenShift. 114 log.V(2).Info("The Hub cluster DNS name couldn't be determined") 115 116 return "", nil 117 } 118 119 return "", err 120 } 121 122 dnsName, _, _ := unstructured.NestedString(dnsCluster.Object, "spec", "baseDomain") 123 if dnsName == "" { 124 log.Info("The OpenShift DNS object named cluster did not contain a valid spec.baseDomain value") 125 } else { 126 log.V(2).Info("The Hub cluster DNS name was found", "name", dnsName) 127 } 128 129 return dnsName, nil 130 } 131 132 // getViolationContext will put the root policy information into violationContext 133 // It also puts the status of the non-compliant replicated policies into violationContext 134 func (r *PolicyAutomationReconciler) getViolationContext( 135 ctx context.Context, 136 policy *policyv1.Policy, 137 targetList []string, 138 policyAutomation *policyv1beta1.PolicyAutomation, 139 ) (policyv1beta1.ViolationContext, error) { 140 log.V(3).Info( 141 "Get the violation context from the root policy %s/%s", 142 policy.GetNamespace(), 143 policy.GetName(), 144 ) 145 146 violationContext := policyv1beta1.ViolationContext{} 147 // 1) get the target cluster list 148 violationContext.TargetClusters = targetList 149 // 2) get the root policy name 150 violationContext.PolicyName = policy.GetName() 151 // 3) get the root policy namespace 152 violationContext.PolicyNamespace = policy.GetNamespace() 153 // 4) get the root policy hub cluster name 154 var err error 155 156 violationContext.HubCluster, err = r.getClusterDNSName(ctx) 157 if err != nil { 158 return policyv1beta1.ViolationContext{}, err 159 } 160 161 // 5) get the policy sets of the root policy 162 plcPlacement := policy.Status.Placement 163 policySets := []string{} 164 165 for _, placement := range plcPlacement { 166 if placement.PolicySet != "" { 167 policySets = append(policySets, placement.PolicySet) 168 } 169 } 170 171 violationContext.PolicySets = policySets 172 173 // skip policy_violation_context if all clusters are compliant 174 if len(targetList) == 0 { 175 return violationContext, nil 176 } 177 178 replicatedPlcList := &policyv1.PolicyList{} 179 180 err = r.List( 181 context.TODO(), 182 replicatedPlcList, 183 client.MatchingLabels(common.LabelsForRootPolicy(policy)), 184 ) 185 if err != nil { 186 log.Error(err, "Failed to list the replicated policies") 187 188 return violationContext, err 189 } 190 191 if len(replicatedPlcList.Items) == 0 { 192 log.V(2).Info("The replicated policies cannot be found.") 193 194 return violationContext, nil 195 } 196 197 policyViolationsLimit := policyAutomation.Spec.Automation.PolicyViolationsLimit 198 if policyViolationsLimit == nil { 199 policyViolationsLimit = new(uint) 200 *policyViolationsLimit = policyv1beta1.DefaultPolicyViolationsLimit 201 } 202 203 contextLimit := int(*policyViolationsLimit) 204 205 targetListMap := getTargetListMap(targetList) 206 violationContext.PolicyViolations = make( 207 map[string]policyv1beta1.ReplicatedPolicyStatus, 208 len(replicatedPlcList.Items), 209 ) 210 211 // 6) get the status of the non-compliance replicated policies 212 for _, rPlc := range replicatedPlcList.Items { 213 clusterName := rPlc.GetLabels()[common.ClusterNameLabel] 214 if !targetListMap[clusterName] { 215 continue // skip the compliance replicated policies 216 } 217 218 rPlcStatus := policyv1beta1.ReplicatedPolicyStatus{} 219 // Convert PolicyStatus to ReplicatedPolicyStatus and skip the unnecessary items 220 err := common.TypeConverter(rPlc.Status, &rPlcStatus) 221 if err != nil { // still assign the empty rPlcStatus to PolicyViolations later 222 log.Error(err, "The PolicyStatus cannot be converted to the type ReplicatedPolicyStatus.") 223 } 224 225 // get the latest violation message from the replicated policy 226 statusDetails := rPlc.Status.Details 227 if len(statusDetails) > 0 && len(statusDetails[0].History) > 0 { 228 rPlcStatus.ViolationMessage = statusDetails[0].History[0].Message 229 } 230 231 violationContext.PolicyViolations[clusterName] = rPlcStatus 232 if contextLimit > 0 && len(violationContext.PolicyViolations) == contextLimit { 233 log.V(2).Info( 234 "PolicyViolationsLimit is %s so skipping %s remaining replicated policies violations.", 235 fmt.Sprint(contextLimit), 236 fmt.Sprint(len(replicatedPlcList.Items)-contextLimit), 237 ) 238 239 break 240 } 241 } 242 243 return violationContext, nil 244 } 245 246 // Reconcile reads that state of the cluster for a Policy object and makes changes based on the state read 247 // and what is in the Policy.Spec 248 // Note: 249 // The Controller will requeue the Request to be processed again if the returned error is non-nil or 250 // Result.Requeue is true, otherwise upon completion it will remove the work from the queue. 251 func (r *PolicyAutomationReconciler) Reconcile( 252 ctx context.Context, request ctrl.Request, 253 ) (ctrl.Result, error) { 254 log := log.WithValues("Request.Namespace", request.Namespace, "Request.Name", request.Name) 255 256 // Fetch the PolicyAutomation instance 257 policyAutomation := &policyv1beta1.PolicyAutomation{} 258 259 err := r.Get(ctx, request.NamespacedName, policyAutomation) 260 if err != nil { 261 if errors.IsNotFound(err) { 262 log.V(2).Info("Automation was deleted. Nothing to do.") 263 264 return reconcile.Result{}, nil 265 } 266 267 // Error reading the object - requeue the request. 268 return reconcile.Result{}, err 269 } 270 271 if policyAutomation.Spec.PolicyRef == "" { 272 log.Info("No policyRef in PolicyAutomation. Will ignore it.") 273 274 return reconcile.Result{}, nil 275 } 276 277 log = log.WithValues("policyRef", policyAutomation.Spec.PolicyRef) 278 279 policy := &policyv1.Policy{} 280 281 err = r.Get(ctx, types.NamespacedName{ 282 Name: policyAutomation.Spec.PolicyRef, 283 Namespace: policyAutomation.GetNamespace(), 284 }, policy) 285 if err != nil { 286 if errors.IsNotFound(err) { 287 log.Info("Policy specified in policyRef field not found, may have been deleted, doing nothing") 288 289 return reconcile.Result{}, nil 290 } 291 292 log.Error(err, "Failed to retrieve the policy specified in the policyRef field") 293 294 return reconcile.Result{}, err 295 } 296 297 err = r.setOwnerReferences(ctx, policyAutomation, policy) 298 if err != nil { 299 log.Error(err, "Failed to set the owner reference. Will requeue.") 300 301 return reconcile.Result{}, err 302 } 303 304 if policyAutomation.Annotations["policy.open-cluster-management.io/rerun"] == "true" { 305 AjExist, err := MatchPAResouceV(policyAutomation, 306 r.DynamicClient, policyAutomation.GetResourceVersion()) 307 if err != nil { 308 log.Error(err, "Failed to compare Ansible job's resourceVersion") 309 310 return reconcile.Result{}, err 311 } 312 313 if AjExist { 314 log.Info("Ansiblejob already exist under this policyautomation resourceVersion") 315 316 return reconcile.Result{}, nil 317 } 318 319 targetList := common.FindNonCompliantClustersForPolicy(policy) 320 log.Info( 321 "Creating an Ansible job", "mode", "manual", 322 "clusterCount", strconv.Itoa(len(targetList))) 323 324 violationContext, _ := r.getViolationContext(ctx, policy, targetList, policyAutomation) 325 326 err = CreateAnsibleJob( 327 policyAutomation, 328 r.DynamicClient, 329 "manual", 330 violationContext, 331 ) 332 if err != nil { 333 log.Error(err, "Failed to create the Ansible job", "mode", "manual") 334 335 return reconcile.Result{}, err 336 } 337 // manual run succeeded, remove annotation 338 delete(policyAutomation.Annotations, "policy.open-cluster-management.io/rerun") 339 340 err = r.Update(ctx, policyAutomation, &client.UpdateOptions{}) 341 if err != nil { 342 log.Error(err, "Failed to remove the annotation `policy.open-cluster-management.io/rerun`") 343 344 return reconcile.Result{}, err 345 } 346 347 return reconcile.Result{}, nil 348 } else if policyAutomation.Spec.Mode == policyv1beta1.Disabled { 349 log.Info("Automation is disabled, doing nothing") 350 351 return reconcile.Result{}, nil 352 } else { 353 if policy.Spec.Disabled { 354 log.Info("The policy is disabled. Doing nothing.") 355 356 return reconcile.Result{}, nil 357 } 358 359 if policyAutomation.Spec.Mode == "scan" { 360 log := log.WithValues("mode", "scan") 361 log.V(2).Info("Triggering scan mode") 362 363 requeueAfter, err := time.ParseDuration(policyAutomation.Spec.RescanAfter) 364 if err != nil { 365 if policyAutomation.Spec.RescanAfter != "" { 366 log.Error(err, "Invalid spec.rescanAfter value") 367 } 368 369 return reconcile.Result{RequeueAfter: requeueAfter}, err 370 } 371 372 targetList := common.FindNonCompliantClustersForPolicy(policy) 373 if len(targetList) > 0 { 374 log.Info("Creating An Ansible job", "targetList", targetList) 375 violationContext, _ := r.getViolationContext(ctx, policy, targetList, policyAutomation) 376 err = CreateAnsibleJob(policyAutomation, r.DynamicClient, "scan", 377 violationContext) 378 if err != nil { 379 return reconcile.Result{RequeueAfter: requeueAfter}, err 380 } 381 } else { 382 log.Info("All clusters are compliant. Doing nothing.") 383 } 384 385 // no violations found, doing nothing 386 r.counter++ 387 log.V(2).Info( 388 "RequeueAfter.", "RequeueAfter", requeueAfter.String(), "Counter", fmt.Sprintf("%d", r.counter), 389 ) 390 391 return reconcile.Result{RequeueAfter: requeueAfter}, nil 392 } else if policyAutomation.Spec.Mode == policyv1beta1.Once { 393 log := log.WithValues("mode", string(policyv1beta1.Once)) 394 targetList := common.FindNonCompliantClustersForPolicy(policy) 395 if len(targetList) > 0 { 396 log.Info("Creating an Ansible job", "targetList", targetList) 397 398 AjExist, err := MatchPAGeneration(policyAutomation, 399 r.DynamicClient, policyAutomation.GetGeneration()) 400 if err != nil { 401 log.Error(err, "Failed to get Ansible job's generation") 402 403 return reconcile.Result{}, err 404 } 405 if AjExist { 406 return reconcile.Result{}, nil 407 } 408 violationContext, _ := r.getViolationContext(ctx, policy, targetList, policyAutomation) 409 err = CreateAnsibleJob( 410 policyAutomation, 411 r.DynamicClient, 412 string(policyv1beta1.Once), 413 violationContext, 414 ) 415 if err != nil { 416 log.Error(err, "Failed to create the Ansible job") 417 418 return reconcile.Result{}, err 419 } 420 421 policyAutomation.Spec.Mode = policyv1beta1.Disabled 422 423 err = r.Update(ctx, policyAutomation, &client.UpdateOptions{}) 424 if err != nil { 425 log.Error(err, "Failed to update the mode to disabled") 426 427 return reconcile.Result{}, err 428 } 429 } else { 430 log.Info("All clusters are compliant. Doing nothing.") 431 } 432 } else if policyAutomation.Spec.Mode == policyv1beta1.EveryEvent { 433 log := log.WithValues("mode", string(policyv1beta1.EveryEvent)) 434 targetList := common.FindNonCompliantClustersForPolicy(policy) 435 targetListMap := getTargetListMap(targetList) 436 // The clusters map that the new ansible job will target 437 trimmedTargetMap := map[string]bool{} 438 // delayAfterRunSeconds and requeueDuration default value = zero 439 delayAfterRunSeconds := policyAutomation.Spec.DelayAfterRunSeconds 440 requeueDuration := 0 441 requeueFlag := false 442 // Automation event time grouped by the cluster name 443 eventMap := map[string]policyv1beta1.ClusterEvent{} 444 if len(policyAutomation.Status.ClustersWithEvent) > 0 { 445 eventMap = policyAutomation.Status.ClustersWithEvent 446 } 447 448 now := time.Now().UTC() 449 nowStr := now.Format(time.RFC3339) 450 451 for clusterName, clusterEvent := range eventMap { 452 originalStartTime, err := time.Parse(time.RFC3339, clusterEvent.AutomationStartTime) 453 if err != nil { 454 log.Error(err, "Failed to retrieve AutomationStartTime in ClustersWithEvent") 455 delete(eventMap, clusterName) 456 } 457 458 preEventTime, err := time.Parse(time.RFC3339, clusterEvent.EventTime) 459 if err != nil { 460 log.Error(err, "Failed to retrieve EventTime in ClustersWithEvent") 461 delete(eventMap, clusterName) 462 } 463 464 // The time that delayAfterRunSeconds setting expires 465 delayUntil := originalStartTime.Add(time.Duration(delayAfterRunSeconds) * time.Second) 466 467 // The policy is non-compliant with the target cluster 468 if targetListMap[clusterName] { 469 // Policy status changed from non-compliant to compliant 470 // then back to non-compliant during the delay period 471 if delayAfterRunSeconds > 0 && preEventTime.After(originalStartTime) { 472 if now.After(delayUntil) { 473 // The delay period passed so remove the previous event 474 delete(eventMap, clusterName) 475 // Add the cluster name to create a new ansible job 476 if !trimmedTargetMap[clusterName] { 477 trimmedTargetMap[clusterName] = true 478 } 479 } else { 480 requeueFlag = true 481 // Within the delay period and use the earliest requeueDuration to requeue 482 if (requeueDuration == 0) || (requeueDuration > int(delayUntil.Sub(now)+1)) { 483 requeueDuration = int(delayUntil.Sub(now) + 1) 484 } 485 // keep the event and update eventTime 486 clusterEvent.EventTime = nowStr 487 // new event from compliant to non-compliant 488 eventMap[clusterName] = clusterEvent 489 } 490 } // Otherwise, the policy keeps non-compliant since originalStartTime, do nothing 491 } else { // The policy is compliant with the target cluster 492 if delayAfterRunSeconds > 0 && now.Before(delayUntil) { 493 // Within the delay period, keep the event and update eventTime 494 clusterEvent.EventTime = nowStr 495 // new event from non-compliant to compliant 496 eventMap[clusterName] = clusterEvent 497 } else { // No delay period or it is expired, remove the event 498 delete(eventMap, clusterName) 499 } 500 } 501 } 502 503 for _, clusterName := range targetList { 504 if _, ok := eventMap[clusterName]; !ok { 505 // Add the non-compliant clusters without previous automation event 506 if !trimmedTargetMap[clusterName] { 507 trimmedTargetMap[clusterName] = true 508 } 509 } 510 } 511 512 if len(trimmedTargetMap) > 0 { 513 trimmedTargetList := []string{} 514 for clusterName := range trimmedTargetMap { 515 trimmedTargetList = append(trimmedTargetList, clusterName) 516 } 517 log.Info("Creating An Ansible job", "trimmedTargetList", trimmedTargetList) 518 violationContext, _ := r.getViolationContext(ctx, policy, trimmedTargetList, policyAutomation) 519 err = CreateAnsibleJob( 520 policyAutomation, 521 r.DynamicClient, 522 string(policyv1beta1.EveryEvent), 523 violationContext, 524 ) 525 if err != nil { 526 log.Error(err, "Failed to create the Ansible job") 527 528 return reconcile.Result{}, err 529 } 530 531 automationStartTimeStr := time.Now().UTC().Format(time.RFC3339) 532 533 for _, clusterName := range trimmedTargetList { 534 eventMap[clusterName] = policyv1beta1.ClusterEvent{ 535 AutomationStartTime: automationStartTimeStr, 536 EventTime: nowStr, 537 } 538 } 539 } else { 540 log.Info("All clusters are compliant. No new Ansible job. Just update ClustersWithEvent.") 541 } 542 543 policyAutomation.Status.ClustersWithEvent = eventMap 544 // use StatusWriter to update status subresource of a Kubernetes object 545 err = r.Status().Update(ctx, policyAutomation) 546 if err != nil { 547 log.Error(err, "Failed to update ClustersWithEvent in policyAutomation status") 548 549 return reconcile.Result{}, err 550 } 551 552 if requeueFlag { 553 log.Info( 554 "Requeue for the new non-compliant event during the delay period", 555 "Delay in seconds", delayAfterRunSeconds, 556 "Requeue After", requeueDuration, 557 ) 558 559 return reconcile.Result{RequeueAfter: time.Duration(requeueDuration)}, nil 560 } 561 } 562 } 563 564 return ctrl.Result{}, nil 565 }