sigs.k8s.io/cluster-api-provider-azure@v1.17.0/azure/services/aso/aso.go (about)

     1  /*
     2  Copyright 2023 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package aso
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"time"
    24  
    25  	asoannotations "github.com/Azure/azure-service-operator/v2/pkg/common/annotations"
    26  	"github.com/Azure/azure-service-operator/v2/pkg/genruntime"
    27  	"github.com/Azure/azure-service-operator/v2/pkg/genruntime/conditions"
    28  	jsonpatch "github.com/evanphx/json-patch/v5"
    29  	"github.com/google/go-cmp/cmp"
    30  	"github.com/pkg/errors"
    31  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  	"k8s.io/apimachinery/pkg/runtime"
    34  	"k8s.io/apimachinery/pkg/runtime/serializer"
    35  	"k8s.io/apimachinery/pkg/util/yaml"
    36  	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
    37  	"sigs.k8s.io/cluster-api-provider-azure/azure"
    38  	"sigs.k8s.io/cluster-api-provider-azure/util/aso"
    39  	"sigs.k8s.io/cluster-api-provider-azure/util/tele"
    40  	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    41  	"sigs.k8s.io/controller-runtime/pkg/client"
    42  	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
    43  	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    44  )
    45  
const (
	// prePauseReconcilePolicyAnnotation is the annotation key for the value of
	// asoannotations.ReconcilePolicy that was set before pausing.
	prePauseReconcilePolicyAnnotation = "sigs.k8s.io/cluster-api-provider-azure-pre-pause-reconcile-policy"

	// requeueInterval is the retry delay attached to the transient errors
	// returned while an ASO resource is not yet ready, has just been created
	// or updated, or has just been deleted.
	requeueInterval = 20 * time.Second

	// createOrUpdateFutureType and deleteFutureType are the Type values set on
	// the infrav1.Future carried by the "operation not done" errors returned
	// after a create/update and after a delete, respectively.
	createOrUpdateFutureType = "ASOCreateOrUpdate"
	deleteFutureType         = "ASODelete"
)
    56  
// reconciler is an implementation of the Reconciler interface. It handles creation
// and deletion of resources using ASO.
type reconciler[T genruntime.MetaObject] struct {
	// Client is the controller-runtime client used to get, create, patch and
	// delete the ASO resources.
	client.Client

	// clusterName is written to the cluster-name label of reconciled resources
	// and is used to look up the ASO credential secret name. owner supplies the
	// namespace for reconciled resources and is set as their controller owner
	// reference.
	clusterName string
	owner       client.Object
}
    65  
    66  // New creates a new ASO reconciler.
    67  func New[T genruntime.MetaObject](ctrlClient client.Client, clusterName string, owner client.Object) Reconciler[T] {
    68  	return &reconciler[T]{
    69  		Client:      ctrlClient,
    70  		clusterName: clusterName,
    71  		owner:       owner,
    72  	}
    73  }
    74  
    75  // CreateOrUpdateResource implements the logic for creating a new or updating an
    76  // existing resource with ASO.
    77  func (r *reconciler[T]) CreateOrUpdateResource(ctx context.Context, spec azure.ASOResourceSpecGetter[T], serviceName string) (T, error) {
    78  	ctx, log, done := tele.StartSpanWithLogger(ctx, "services.aso.CreateOrUpdateResource")
    79  	defer done()
    80  
    81  	resource := spec.ResourceRef()
    82  	resource.SetNamespace(r.owner.GetNamespace())
    83  	resourceName := resource.GetName()
    84  	resourceNamespace := resource.GetNamespace()
    85  
    86  	log = log.WithValues("service", serviceName, "resource", resourceName, "namespace", resourceNamespace)
    87  
    88  	var readyErr error
    89  	var adopt bool
    90  	var existing T
    91  	var zero T // holds the zero value, to be returned with non-nil errors.
    92  	resourceExists := false
    93  	if err := r.Client.Get(ctx, client.ObjectKeyFromObject(resource), resource); err != nil {
    94  		if !apierrors.IsNotFound(err) {
    95  			return zero, errors.Wrapf(err, "failed to get existing resource %s/%s (service: %s)", resourceNamespace, resourceName, serviceName)
    96  		}
    97  		log.V(2).Info("existing resource not found, will create a new one")
    98  	} else {
    99  		existing = resource
   100  		resourceExists = true
   101  		log.V(2).Info("successfully got existing resource")
   102  
   103  		if isOwned, err := isOwnedBy(resource, r.owner, r.Scheme()); err != nil {
   104  			return zero, err
   105  		} else if !isOwned && !hasLegacyOwnedByLabel(resource.GetLabels(), r.clusterName) {
   106  			log.V(4).Info("skipping reconcile for unmanaged resource")
   107  			return existing, nil
   108  		}
   109  
   110  		// Check if there is an ongoing long running operation.
   111  		conds := existing.GetConditions()
   112  		i, readyExists := conds.FindIndexByType(conditions.ConditionTypeReady)
   113  		if !readyExists {
   114  			return zero, azure.WithTransientError(errors.New("ready status unknown"), requeueInterval)
   115  		}
   116  		if cond := conds[i]; cond.Status != metav1.ConditionTrue {
   117  			switch {
   118  			case cond.Reason == conditions.ReasonAzureResourceNotFound.Name &&
   119  				existing.GetAnnotations()[asoannotations.ReconcilePolicy] == string(asoannotations.ReconcilePolicySkip):
   120  				// This resource was originally created by CAPZ and a
   121  				// corresponding Azure resource has been found not to exist, so
   122  				// CAPZ will tell ASO to adopt the resource by setting its
   123  				// reconcile policy to "manage". This extra step is necessary to
   124  				// handle user-managed resources that already exist in Azure and
   125  				// should not be reconciled by ASO while ensuring they're still
   126  				// represented in ASO.
   127  				log.V(2).Info("resource not found in Azure and \"skip\" reconcile-policy set, adopting")
   128  				// Don't set readyErr so the resource can be adopted with an
   129  				// update instead of returning early.
   130  				adopt = true
   131  			case cond.Reason == conditions.ReasonReconciling.Name:
   132  				readyErr = azure.NewOperationNotDoneError(&infrav1.Future{
   133  					Type:          createOrUpdateFutureType,
   134  					ResourceGroup: existing.GetNamespace(),
   135  					Name:          existing.GetName(),
   136  				})
   137  			default:
   138  				readyErr = fmt.Errorf("resource is not Ready: %s", conds[i].Message)
   139  			}
   140  
   141  			if readyErr != nil {
   142  				if conds[i].Severity == conditions.ConditionSeverityError {
   143  					readyErr = azure.WithTerminalError(readyErr)
   144  				} else {
   145  					readyErr = azure.WithTransientError(readyErr, requeueInterval)
   146  				}
   147  			}
   148  		}
   149  	}
   150  
   151  	// Construct parameters using the resource spec and information from the existing resource, if there is one.
   152  	var existingCopy T
   153  	if resourceExists {
   154  		existingCopy = existing.DeepCopyObject().(T)
   155  	}
   156  	parameters, err := PatchedParameters(ctx, r.Scheme(), spec, existingCopy)
   157  	if err != nil {
   158  		return zero, errors.Wrapf(err, "failed to get desired parameters for resource %s/%s (service: %s)", resourceNamespace, resourceName, serviceName)
   159  	}
   160  
   161  	parameters.SetName(resourceName)
   162  	parameters.SetNamespace(resourceNamespace)
   163  
   164  	if err := controllerutil.SetControllerReference(r.owner, parameters, r.Client.Scheme()); err != nil {
   165  		return zero, errors.Wrap(err, "failed to set owner ref")
   166  	}
   167  
   168  	if t, ok := spec.(TagsGetterSetter[T]); ok {
   169  		if err := reconcileTags(t, existing, resourceExists, parameters); err != nil {
   170  			return zero, errors.Wrap(err, "failed to reconcile tags")
   171  		}
   172  	}
   173  
   174  	labels := parameters.GetLabels()
   175  	if labels == nil {
   176  		labels = make(map[string]string)
   177  	}
   178  	labels[clusterv1.ClusterNameLabel] = r.clusterName
   179  
   180  	annotations := parameters.GetAnnotations()
   181  	if annotations == nil {
   182  		annotations = make(map[string]string)
   183  	}
   184  
   185  	if prevReconcilePolicy, ok := annotations[prePauseReconcilePolicyAnnotation]; ok {
   186  		annotations[asoannotations.ReconcilePolicy] = prevReconcilePolicy
   187  		delete(annotations, prePauseReconcilePolicyAnnotation)
   188  	}
   189  	if !resourceExists {
   190  		// Create the ASO resource with "skip" in case a matching resource
   191  		// already exists in Azure, in which case CAPZ will assume it is managed
   192  		// by the user and ASO should not actively reconcile changes to the ASO
   193  		// resource. In the canonical "entirely managed by CAPZ" case, the next
   194  		// reconciliation will reveal the resource does not already exist in
   195  		// Azure and the ASO resource will be adopted by changing this
   196  		// annotation to "manage".
   197  		annotations[asoannotations.ReconcilePolicy] = string(asoannotations.ReconcilePolicySkip)
   198  	} else {
   199  		adopt = adopt || spec.WasManaged(existing)
   200  	}
   201  	if adopt {
   202  		annotations[asoannotations.ReconcilePolicy] = string(asoannotations.ReconcilePolicyManage)
   203  	}
   204  
   205  	// Set the secret name annotation in order to leverage the ASO resource credential scope as defined in
   206  	// https://azure.github.io/azure-service-operator/guide/authentication/credential-scope/#resource-scope.
   207  	annotations[asoannotations.PerResourceSecret] = aso.GetASOSecretName(r.clusterName)
   208  
   209  	if len(labels) == 0 {
   210  		labels = nil
   211  	}
   212  	parameters.SetLabels(labels)
   213  	if len(annotations) == 0 {
   214  		annotations = nil
   215  	}
   216  	parameters.SetAnnotations(annotations)
   217  
   218  	diff := cmp.Diff(existing, parameters)
   219  	if diff == "" {
   220  		if readyErr != nil {
   221  			// Only return this error when the resource is up to date in order to permit updates from
   222  			// Parameters which may fix the resource's current state.
   223  			return zero, readyErr
   224  		}
   225  		log.V(2).Info("resource up to date")
   226  		return existing, nil
   227  	}
   228  	log.V(2).Info("creating or updating resource", "diff", diff)
   229  	return r.createOrUpdateResource(ctx, existing, parameters, resourceExists, serviceName)
   230  }
   231  
   232  // PatchedParameters returns the Parameters of spec with patches applied.
   233  func PatchedParameters[T genruntime.MetaObject](ctx context.Context, scheme *runtime.Scheme, spec azure.ASOResourceSpecGetter[T], existing T) (T, error) {
   234  	var zero T // to be returned with non-nil errors
   235  	parameters, err := spec.Parameters(ctx, existing)
   236  	if err != nil {
   237  		return zero, err
   238  	}
   239  	return applyPatches(scheme, spec, parameters)
   240  }
   241  
   242  func applyPatches[T genruntime.MetaObject](scheme *runtime.Scheme, spec azure.ASOResourceSpecGetter[T], parameters T) (T, error) {
   243  	p, ok := spec.(Patcher)
   244  	if !ok {
   245  		return parameters, nil
   246  	}
   247  
   248  	var zero T // to be returned with non-nil errors
   249  
   250  	gvk, err := apiutil.GVKForObject(parameters, scheme)
   251  	if err != nil {
   252  		return zero, errors.Wrap(err, "failed to get GroupVersionKind for object")
   253  	}
   254  
   255  	parameters.GetObjectKind().SetGroupVersionKind(gvk)
   256  	paramData, err := json.Marshal(parameters)
   257  	if err != nil {
   258  		return zero, errors.Wrap(err, "failed to marshal JSON for patch")
   259  	}
   260  
   261  	for i, extraPatch := range p.ExtraPatches() {
   262  		jsonPatch, err := yaml.ToJSON([]byte(extraPatch))
   263  		if err != nil {
   264  			return zero, errors.Wrapf(err, "failed to convert patch at index %d to JSON", i)
   265  		}
   266  		paramData, err = jsonpatch.MergePatch(paramData, jsonPatch)
   267  		if err != nil {
   268  			return zero, errors.Wrapf(err, "failed to apply patch at index %d", i)
   269  		}
   270  	}
   271  
   272  	decoder := serializer.NewCodecFactory(scheme).UniversalDeserializer()
   273  	obj, _, err := decoder.Decode(paramData, nil, nil)
   274  	if err != nil {
   275  		return zero, errors.Wrap(err, "failed to decode object")
   276  	}
   277  
   278  	t, ok := obj.(T)
   279  	if !ok {
   280  		return zero, fmt.Errorf("decoded patched object is %T, not %T", obj, parameters)
   281  	}
   282  
   283  	return t, nil
   284  }
   285  
   286  func (r *reconciler[T]) createOrUpdateResource(ctx context.Context, existing T, parameters client.Object, resourceExists bool, serviceName string) (T, error) {
   287  	var zero T
   288  	var err error
   289  	var logMessageVerbPrefix string
   290  	if resourceExists {
   291  		logMessageVerbPrefix = "updat"
   292  		err = r.Client.Patch(ctx, parameters, client.MergeFrom(existing))
   293  	} else {
   294  		logMessageVerbPrefix = "creat"
   295  		err = r.Client.Create(ctx, parameters)
   296  	}
   297  	if err == nil {
   298  		// Resources need to be requeued to wait for the create or update to finish.
   299  		return zero, azure.WithTransientError(azure.NewOperationNotDoneError(&infrav1.Future{
   300  			Type:          createOrUpdateFutureType,
   301  			ResourceGroup: parameters.GetNamespace(),
   302  			Name:          parameters.GetName(),
   303  		}), requeueInterval)
   304  	}
   305  	return zero, errors.Wrapf(err, fmt.Sprintf("failed to %se resource %s/%s (service: %s)", logMessageVerbPrefix, parameters.GetNamespace(), parameters.GetName(), serviceName))
   306  }
   307  
   308  // DeleteResource implements the logic for deleting a resource Asynchronously.
   309  func (r *reconciler[T]) DeleteResource(ctx context.Context, resource T, serviceName string) (err error) {
   310  	ctx, log, done := tele.StartSpanWithLogger(ctx, "services.aso.DeleteResource")
   311  	defer done()
   312  
   313  	resource.SetNamespace(r.owner.GetNamespace())
   314  	resourceName := resource.GetName()
   315  	resourceNamespace := resource.GetNamespace()
   316  
   317  	log = log.WithValues("service", serviceName, "resource", resourceName, "namespace", resourceNamespace)
   318  
   319  	managed, err := IsManaged(ctx, r.Client, resource, r.owner)
   320  	if apierrors.IsNotFound(err) {
   321  		// already deleted
   322  		log.V(2).Info("successfully deleted resource")
   323  		return nil
   324  	}
   325  	if err != nil {
   326  		return errors.Wrap(err, "failed to determine if resource is managed")
   327  	}
   328  	if !managed {
   329  		log.V(4).Info("skipping delete for unmanaged resource")
   330  		return nil
   331  	}
   332  
   333  	log.V(2).Info("deleting resource")
   334  	err = r.Client.Delete(ctx, resource)
   335  	if err != nil {
   336  		if apierrors.IsNotFound(err) {
   337  			// already deleted
   338  			log.V(2).Info("successfully deleted resource")
   339  			return nil
   340  		}
   341  		return errors.Wrapf(err, "failed to delete resource %s/%s (service: %s)", resourceNamespace, resourceName, serviceName)
   342  	}
   343  
   344  	return azure.WithTransientError(azure.NewOperationNotDoneError(&infrav1.Future{
   345  		Type:          deleteFutureType,
   346  		ResourceGroup: resourceNamespace,
   347  		Name:          resourceName,
   348  	}), requeueInterval)
   349  }
   350  
   351  // IsManaged returns whether the ASO resource referred to by spec was created by
   352  // CAPZ and therefore whether CAPZ should manage its lifecycle.
   353  func IsManaged[T genruntime.MetaObject](ctx context.Context, ctrlClient client.Client, resource T, owner client.Object) (bool, error) {
   354  	ctx, _, done := tele.StartSpanWithLogger(ctx, "services.aso.IsManaged")
   355  	defer done()
   356  
   357  	resource.SetNamespace(owner.GetNamespace())
   358  
   359  	err := ctrlClient.Get(ctx, client.ObjectKeyFromObject(resource), resource)
   360  	if err != nil {
   361  		return false, errors.Wrap(err, "error getting resource")
   362  	}
   363  
   364  	return isOwnedBy(resource, owner, ctrlClient.Scheme())
   365  }
   366  
   367  func isOwnedBy(resource client.Object, owner client.Object, scheme *runtime.Scheme) (bool, error) {
   368  	ownerGVK, err := apiutil.GVKForObject(owner, scheme)
   369  	if err != nil {
   370  		return false, err
   371  	}
   372  	existingOwner := metav1.GetControllerOf(resource)
   373  	return existingOwner != nil &&
   374  		existingOwner.APIVersion == ownerGVK.GroupVersion().String() &&
   375  		existingOwner.Kind == ownerGVK.Kind &&
   376  		existingOwner.Name == owner.GetName(), nil
   377  }
   378  
   379  func hasLegacyOwnedByLabel(labels map[string]string, clusterName string) bool {
   380  	//nolint:staticcheck // Referencing this deprecated value is required for backwards compatibility.
   381  	return labels[infrav1.OwnedByClusterLabelKey] == clusterName
   382  }
   383  
   384  // PauseResource pauses an ASO resource by updating its `reconcile-policy` to `skip`.
   385  func (r *reconciler[T]) PauseResource(ctx context.Context, resource T, serviceName string) error {
   386  	ctx, log, done := tele.StartSpanWithLogger(ctx, "services.aso.PauseResource")
   387  	defer done()
   388  
   389  	resource.SetNamespace(r.owner.GetNamespace())
   390  
   391  	log = log.WithValues("service", serviceName, "resource", resource.GetName(), "namespace", resource.GetNamespace())
   392  
   393  	if err := r.Client.Get(ctx, client.ObjectKeyFromObject(resource), resource); err != nil {
   394  		return err
   395  	}
   396  	if isOwned, err := isOwnedBy(resource, r.owner, r.Scheme()); err != nil {
   397  		return err
   398  	} else if !isOwned {
   399  		log.V(4).Info("Skipping pause of unmanaged resource")
   400  		return nil
   401  	}
   402  
   403  	annotations := resource.GetAnnotations()
   404  	if _, exists := annotations[prePauseReconcilePolicyAnnotation]; exists {
   405  		log.V(4).Info("resource is already paused")
   406  		return nil
   407  	}
   408  
   409  	log.V(4).Info("Pausing resource")
   410  	before := resource.DeepCopyObject().(genruntime.MetaObject)
   411  
   412  	if annotations == nil {
   413  		annotations = make(map[string]string, 2)
   414  	}
   415  	annotations[prePauseReconcilePolicyAnnotation] = annotations[asoannotations.ReconcilePolicy]
   416  	annotations[asoannotations.ReconcilePolicy] = string(asoannotations.ReconcilePolicySkip)
   417  	resource.SetAnnotations(annotations)
   418  
   419  	return r.Client.Patch(ctx, resource, client.MergeFrom(before))
   420  }