github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/control/controller_ref_manager.go (about)

     1  // Copyright 2019 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package control
    16  
    17  import (
    18  	"fmt"
    19  	"sync"
    20  
    21  	commonutil "github.com/kubeflow/training-operator/pkg/util"
    22  	log "github.com/sirupsen/logrus"
    23  
    24  	v1 "k8s.io/api/core/v1"
    25  	"k8s.io/apimachinery/pkg/api/errors"
    26  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    27  	"k8s.io/apimachinery/pkg/labels"
    28  	"k8s.io/apimachinery/pkg/runtime/schema"
    29  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    30  )
    31  
// BaseControllerRefManager holds the state shared by the typed controller-ref
// managers in this file: the owning controller, the label selector used for
// matching, and a lazily evaluated, cached adoption check.
type BaseControllerRefManager struct {
	// Controller is the object on whose behalf other objects are claimed.
	// Its UID and deletion timestamp are consulted by ClaimObject.
	Controller metav1.Object
	// Selector is matched against object labels by the typed managers.
	Selector   labels.Selector

	// canAdoptErr caches the result of the single CanAdoptFunc invocation.
	canAdoptErr  error
	// canAdoptOnce ensures CanAdoptFunc is run at most once per manager.
	canAdoptOnce sync.Once
	// CanAdoptFunc is the optional check executed by CanAdopt. It may be nil,
	// in which case CanAdopt reports no error.
	CanAdoptFunc func() error
}
    40  
    41  func (m *BaseControllerRefManager) CanAdopt() error {
    42  	m.canAdoptOnce.Do(func() {
    43  		if m.CanAdoptFunc != nil {
    44  			m.canAdoptErr = m.CanAdoptFunc()
    45  		}
    46  	})
    47  	return m.canAdoptErr
    48  }
    49  
    50  // ClaimObject tries to take ownership of an object for this controller.
    51  //
    52  // It will reconcile the following:
    53  //   - Adopt orphans if the match function returns true.
    54  //   - Release owned objects if the match function returns false.
    55  //
    56  // A non-nil error is returned if some form of reconciliation was attempted and
    57  // failed. Usually, controllers should try again later in case reconciliation
    58  // is still needed.
    59  //
    60  // If the error is nil, either the reconciliation succeeded, or no
    61  // reconciliation was necessary. The returned boolean indicates whether you now
    62  // own the object.
    63  //
    64  // No reconciliation will be attempted if the controller is being deleted.
    65  func (m *BaseControllerRefManager) ClaimObject(obj metav1.Object, match func(metav1.Object) bool, adopt, release func(metav1.Object) error) (bool, error) {
    66  	controllerRef := metav1.GetControllerOf(obj)
    67  	if controllerRef != nil {
    68  		if controllerRef.UID != m.Controller.GetUID() {
    69  			// Owned by someone else. Ignore.
    70  			return false, nil
    71  		}
    72  		if match(obj) {
    73  			// We already own it and the selector matches.
    74  			// Return true (successfully claimed) before checking deletion timestamp.
    75  			// We're still allowed to claim things we already own while being deleted
    76  			// because doing so requires taking no actions.
    77  			return true, nil
    78  		}
    79  		// Owned by us but selector doesn't match.
    80  		// Try to release, unless we're being deleted.
    81  		if m.Controller.GetDeletionTimestamp() != nil {
    82  			return false, nil
    83  		}
    84  		if err := release(obj); err != nil {
    85  			// If the pod no longer exists, ignore the error.
    86  			if errors.IsNotFound(err) {
    87  				return false, nil
    88  			}
    89  			// Either someone else released it, or there was a transient error.
    90  			// The controller should requeue and try again if it's still stale.
    91  			return false, err
    92  		}
    93  		// Successfully released.
    94  		return false, nil
    95  	}
    96  
    97  	// It's an orphan.
    98  	if m.Controller.GetDeletionTimestamp() != nil || !match(obj) {
    99  		// Ignore if we're being deleted or selector doesn't match.
   100  		return false, nil
   101  	}
   102  	if obj.GetDeletionTimestamp() != nil {
   103  		// Ignore if the object is being deleted
   104  		return false, nil
   105  	}
   106  	// Selector matches. Try to adopt.
   107  	if err := adopt(obj); err != nil {
   108  		// If the pod no longer exists, ignore the error.
   109  		if errors.IsNotFound(err) {
   110  			return false, nil
   111  		}
   112  		// Either someone else claimed it first, or there was a transient error.
   113  		// The controller should requeue and try again if it's still orphaned.
   114  		return false, err
   115  	}
   116  	// Successfully adopted.
   117  	return true, nil
   118  }
   119  
// PodControllerRefManager claims, adopts and releases Pods on behalf of a
// controller, building on the shared logic in BaseControllerRefManager.
type PodControllerRefManager struct {
	BaseControllerRefManager
	// controllerKind provides the apiVersion and kind written into the
	// owner reference when a Pod is adopted.
	controllerKind schema.GroupVersionKind
	// podControl is used to send the adopt/release patches.
	podControl     PodControlInterface
}
   125  
   126  // NewPodControllerRefManager returns a PodControllerRefManager that exposes
   127  // methods to manage the controllerRef of pods.
   128  //
   129  // The CanAdopt() function can be used to perform a potentially expensive check
   130  // (such as a live GET from the API server) prior to the first adoption.
   131  // It will only be called (at most once) if an adoption is actually attempted.
   132  // If CanAdopt() returns a non-nil error, all adoptions will fail.
   133  //
   134  // NOTE: Once CanAdopt() is called, it will not be called again by the same
   135  //
   136  //	PodControllerRefManager instance. Create a new instance if it makes
   137  //	sense to check CanAdopt() again (e.g. in a different sync pass).
   138  func NewPodControllerRefManager(
   139  	podControl PodControlInterface,
   140  	controller metav1.Object,
   141  	selector labels.Selector,
   142  	controllerKind schema.GroupVersionKind,
   143  	canAdopt func() error,
   144  ) *PodControllerRefManager {
   145  	return &PodControllerRefManager{
   146  		BaseControllerRefManager: BaseControllerRefManager{
   147  			Controller:   controller,
   148  			Selector:     selector,
   149  			CanAdoptFunc: canAdopt,
   150  		},
   151  		controllerKind: controllerKind,
   152  		podControl:     podControl,
   153  	}
   154  }
   155  
   156  // ClaimPods tries to take ownership of a list of Pods.
   157  //
   158  // It will reconcile the following:
   159  //   - Adopt orphans if the selector matches.
   160  //   - Release owned objects if the selector no longer matches.
   161  //
   162  // Optional: If one or more filters are specified, a Pod will only be claimed if
   163  // all filters return true.
   164  //
   165  // A non-nil error is returned if some form of reconciliation was attempted and
   166  // failed. Usually, controllers should try again later in case reconciliation
   167  // is still needed.
   168  //
   169  // If the error is nil, either the reconciliation succeeded, or no
   170  // reconciliation was necessary. The list of Pods that you now own is returned.
   171  func (m *PodControllerRefManager) ClaimPods(pods []*v1.Pod, filters ...func(*v1.Pod) bool) ([]*v1.Pod, error) {
   172  	var claimed []*v1.Pod
   173  	var errlist []error
   174  
   175  	match := func(obj metav1.Object) bool {
   176  		pod := obj.(*v1.Pod)
   177  		// Check selector first so filters only run on potentially matching Pods.
   178  		if !m.Selector.Matches(labels.Set(pod.Labels)) {
   179  			return false
   180  		}
   181  		for _, filter := range filters {
   182  			if !filter(pod) {
   183  				return false
   184  			}
   185  		}
   186  		return true
   187  	}
   188  	adopt := func(obj metav1.Object) error {
   189  		return m.AdoptPod(obj.(*v1.Pod))
   190  	}
   191  	release := func(obj metav1.Object) error {
   192  		return m.ReleasePod(obj.(*v1.Pod))
   193  	}
   194  
   195  	for _, pod := range pods {
   196  		ok, err := m.ClaimObject(pod, match, adopt, release)
   197  		if err != nil {
   198  			errlist = append(errlist, err)
   199  			continue
   200  		}
   201  		if ok {
   202  			claimed = append(claimed, pod)
   203  		}
   204  	}
   205  	return claimed, utilerrors.NewAggregate(errlist)
   206  }
   207  
   208  // AdoptPod sends a patch to take control of the pod. It returns the error if
   209  // the patching fails.
   210  func (m *PodControllerRefManager) AdoptPod(pod *v1.Pod) error {
   211  	if err := m.CanAdopt(); err != nil {
   212  		return fmt.Errorf("can't adopt Pod %v/%v (%v): %v", pod.Namespace, pod.Name, pod.UID, err)
   213  	}
   214  	// Note that ValidateOwnerReferences() will reject this patch if another
   215  	// OwnerReference exists with controller=true.
   216  	addControllerPatch := fmt.Sprintf(
   217  		`{"metadata":{"ownerReferences":[{"apiVersion":"%s","kind":"%s","name":"%s","uid":"%s","controller":true,"blockOwnerDeletion":true}],"uid":"%s"}}`,
   218  		m.controllerKind.GroupVersion(), m.controllerKind.Kind,
   219  		m.Controller.GetName(), m.Controller.GetUID(), pod.UID)
   220  	return m.podControl.PatchPod(pod.Namespace, pod.Name, []byte(addControllerPatch))
   221  }
   222  
   223  // ReleasePod sends a patch to free the pod from the control of the controller.
   224  // It returns the error if the patching fails. 404 and 422 errors are ignored.
   225  func (m *PodControllerRefManager) ReleasePod(pod *v1.Pod) error {
   226  	log.Infof("patching pod %s_%s to remove its controllerRef to %s/%s:%s",
   227  		pod.Namespace, pod.Name, m.controllerKind.GroupVersion(), m.controllerKind.Kind, m.Controller.GetName())
   228  	deleteOwnerRefPatch := fmt.Sprintf(`{"metadata":{"ownerReferences":[{"$patch":"delete","uid":"%s"}],"uid":"%s"}}`, m.Controller.GetUID(), pod.UID)
   229  	err := m.podControl.PatchPod(pod.Namespace, pod.Name, []byte(deleteOwnerRefPatch))
   230  	if err != nil {
   231  		if errors.IsNotFound(err) {
   232  			// If the pod no longer exists, ignore it.
   233  			return nil
   234  		}
   235  		if errors.IsInvalid(err) {
   236  			// Invalid error will be returned in two cases: 1. the pod
   237  			// has no owner reference, 2. the uid of the pod doesn't
   238  			// match, which means the pod is deleted and then recreated.
   239  			// In both cases, the error can be ignored.
   240  
   241  			// TODO: If the pod has owner references, but none of them
   242  			// has the owner.UID, server will silently ignore the patch.
   243  			// Investigate why.
   244  			return nil
   245  		}
   246  	}
   247  	return err
   248  }
   249  
// ServiceControllerRefManager claims, adopts and releases Services on behalf
// of a controller, building on the shared logic in BaseControllerRefManager.
type ServiceControllerRefManager struct {
	BaseControllerRefManager

	// controllerKind provides the apiVersion and kind written into the
	// owner reference when a Service is adopted.
	controllerKind schema.GroupVersionKind
	// serviceControl is used to send the adopt/release patches.
	serviceControl ServiceControlInterface
}
   256  
   257  // NewServiceControllerRefManager returns a ServiceControllerRefManager that exposes
   258  // methods to manage the controllerRef of services.
   259  //
   260  // The canAdopt() function can be used to perform a potentially expensive check
   261  // (such as a live GET from the API server) prior to the first adoption.
   262  // It will only be called (at most once) if an adoption is actually attempted.
   263  // If canAdopt() returns a non-nil error, all adoptions will fail.
   264  //
   265  // NOTE: Once canAdopt() is called, it will not be called again by the same
   266  //
   267  //	ServiceControllerRefManager instance. Create a new instance if it makes
   268  //	sense to check canAdopt() again (e.g. in a different sync pass).
   269  func NewServiceControllerRefManager(
   270  	serviceControl ServiceControlInterface,
   271  	ctr metav1.Object,
   272  	selector labels.Selector,
   273  	controllerKind schema.GroupVersionKind,
   274  	canAdopt func() error,
   275  ) *ServiceControllerRefManager {
   276  	return &ServiceControllerRefManager{
   277  		BaseControllerRefManager: BaseControllerRefManager{
   278  			Controller:   ctr,
   279  			Selector:     selector,
   280  			CanAdoptFunc: canAdopt,
   281  		},
   282  		controllerKind: controllerKind,
   283  		serviceControl: serviceControl,
   284  	}
   285  }
   286  
   287  // ClaimServices tries to take ownership of a list of Services.
   288  //
   289  // It will reconcile the following:
   290  //   - Adopt orphans if the selector matches.
   291  //   - Release owned objects if the selector no longer matches.
   292  //
   293  // Optional: If one or more filters are specified, a Service will only be claimed if
   294  // all filters return true.
   295  //
   296  // A non-nil error is returned if some form of reconciliation was attempted and
   297  // failed. Usually, controllers should try again later in case reconciliation
   298  // is still needed.
   299  //
   300  // If the error is nil, either the reconciliation succeeded, or no
   301  // reconciliation was necessary. The list of Services that you now own is returned.
   302  func (m *ServiceControllerRefManager) ClaimServices(services []*v1.Service, filters ...func(*v1.Service) bool) ([]*v1.Service, error) {
   303  	var claimed []*v1.Service
   304  	var errlist []error
   305  
   306  	match := func(obj metav1.Object) bool {
   307  		service := obj.(*v1.Service)
   308  		// Check selector first so filters only run on potentially matching Services.
   309  		if !m.Selector.Matches(labels.Set(service.Labels)) {
   310  			return false
   311  		}
   312  		for _, filter := range filters {
   313  			if !filter(service) {
   314  				return false
   315  			}
   316  		}
   317  		return true
   318  	}
   319  	adopt := func(obj metav1.Object) error {
   320  		return m.AdoptService(obj.(*v1.Service))
   321  	}
   322  	release := func(obj metav1.Object) error {
   323  		return m.ReleaseService(obj.(*v1.Service))
   324  	}
   325  
   326  	for _, service := range services {
   327  		ok, err := m.ClaimObject(service, match, adopt, release)
   328  		if err != nil {
   329  			errlist = append(errlist, err)
   330  			continue
   331  		}
   332  		if ok {
   333  			claimed = append(claimed, service)
   334  		}
   335  	}
   336  	return claimed, utilerrors.NewAggregate(errlist)
   337  }
   338  
   339  // AdoptService sends a patch to take control of the service. It returns the error if
   340  // the patching fails.
   341  func (m *ServiceControllerRefManager) AdoptService(service *v1.Service) error {
   342  	if err := m.CanAdopt(); err != nil {
   343  		return fmt.Errorf("can't adopt Service %v/%v (%v): %v", service.Namespace, service.Name, service.UID, err)
   344  	}
   345  	// Note that ValidateOwnerReferences() will reject this patch if another
   346  	// OwnerReference exists with controller=true.
   347  	addControllerPatch := fmt.Sprintf(
   348  		`{"metadata":{"ownerReferences":[{"apiVersion":"%s","kind":"%s","name":"%s","uid":"%s","controller":true,"blockOwnerDeletion":true}],"uid":"%s"}}`,
   349  		m.controllerKind.GroupVersion(), m.controllerKind.Kind,
   350  		m.Controller.GetName(), m.Controller.GetUID(), service.UID)
   351  	return m.serviceControl.PatchService(service.Namespace, service.Name, []byte(addControllerPatch))
   352  }
   353  
   354  // ReleaseService sends a patch to free the service from the control of the controller.
   355  // It returns the error if the patching fails. 404 and 422 errors are ignored.
   356  func (m *ServiceControllerRefManager) ReleaseService(service *v1.Service) error {
   357  	logger := commonutil.LoggerForService(service, m.controllerKind.Kind)
   358  	logger.Infof("patching service %s_%s to remove its controllerRef to %s/%s:%s",
   359  		service.Namespace, service.Name, m.controllerKind.GroupVersion(), m.controllerKind.Kind, m.Controller.GetName())
   360  	deleteOwnerRefPatch := fmt.Sprintf(`{"metadata":{"ownerReferences":[{"$patch":"delete","uid":"%s"}],"uid":"%s"}}`, m.Controller.GetUID(), service.UID)
   361  	err := m.serviceControl.PatchService(service.Namespace, service.Name, []byte(deleteOwnerRefPatch))
   362  	if err != nil {
   363  		if errors.IsNotFound(err) {
   364  			// If the service no longer exists, ignore it.
   365  			return nil
   366  		}
   367  		if errors.IsInvalid(err) {
   368  			// Invalid error will be returned in two cases: 1. the service
   369  			// has no owner reference, 2. the uid of the service doesn't
   370  			// match, which means the service is deleted and then recreated.
   371  			// In both cases, the error can be ignored.
   372  
   373  			// TODO: If the service has owner references, but none of them
   374  			// has the owner.UID, server will silently ignore the patch.
   375  			// Investigate why.
   376  			return nil
   377  		}
   378  	}
   379  	return err
   380  }