github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/control/controller_ref_manager.go (about) 1 // Copyright 2019 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package control 16 17 import ( 18 "fmt" 19 "sync" 20 21 commonutil "github.com/kubeflow/training-operator/pkg/util" 22 log "github.com/sirupsen/logrus" 23 24 v1 "k8s.io/api/core/v1" 25 "k8s.io/apimachinery/pkg/api/errors" 26 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 "k8s.io/apimachinery/pkg/labels" 28 "k8s.io/apimachinery/pkg/runtime/schema" 29 utilerrors "k8s.io/apimachinery/pkg/util/errors" 30 ) 31 32 type BaseControllerRefManager struct { 33 Controller metav1.Object 34 Selector labels.Selector 35 36 canAdoptErr error 37 canAdoptOnce sync.Once 38 CanAdoptFunc func() error 39 } 40 41 func (m *BaseControllerRefManager) CanAdopt() error { 42 m.canAdoptOnce.Do(func() { 43 if m.CanAdoptFunc != nil { 44 m.canAdoptErr = m.CanAdoptFunc() 45 } 46 }) 47 return m.canAdoptErr 48 } 49 50 // ClaimObject tries to take ownership of an object for this controller. 51 // 52 // It will reconcile the following: 53 // - Adopt orphans if the match function returns true. 54 // - Release owned objects if the match function returns false. 55 // 56 // A non-nil error is returned if some form of reconciliation was attempted and 57 // failed. Usually, controllers should try again later in case reconciliation 58 // is still needed. 59 // 60 // If the error is nil, either the reconciliation succeeded, or no 61 // reconciliation was necessary. The returned boolean indicates whether you now 62 // own the object. 63 // 64 // No reconciliation will be attempted if the controller is being deleted. 65 func (m *BaseControllerRefManager) ClaimObject(obj metav1.Object, match func(metav1.Object) bool, adopt, release func(metav1.Object) error) (bool, error) { 66 controllerRef := metav1.GetControllerOf(obj) 67 if controllerRef != nil { 68 if controllerRef.UID != m.Controller.GetUID() { 69 // Owned by someone else. Ignore. 70 return false, nil 71 } 72 if match(obj) { 73 // We already own it and the selector matches. 74 // Return true (successfully claimed) before checking deletion timestamp. 75 // We're still allowed to claim things we already own while being deleted 76 // because doing so requires taking no actions. 77 return true, nil 78 } 79 // Owned by us but selector doesn't match. 80 // Try to release, unless we're being deleted. 81 if m.Controller.GetDeletionTimestamp() != nil { 82 return false, nil 83 } 84 if err := release(obj); err != nil { 85 // If the pod no longer exists, ignore the error. 86 if errors.IsNotFound(err) { 87 return false, nil 88 } 89 // Either someone else released it, or there was a transient error. 90 // The controller should requeue and try again if it's still stale. 91 return false, err 92 } 93 // Successfully released. 94 return false, nil 95 } 96 97 // It's an orphan. 98 if m.Controller.GetDeletionTimestamp() != nil || !match(obj) { 99 // Ignore if we're being deleted or selector doesn't match. 100 return false, nil 101 } 102 if obj.GetDeletionTimestamp() != nil { 103 // Ignore if the object is being deleted 104 return false, nil 105 } 106 // Selector matches. Try to adopt. 107 if err := adopt(obj); err != nil { 108 // If the pod no longer exists, ignore the error. 109 if errors.IsNotFound(err) { 110 return false, nil 111 } 112 // Either someone else claimed it first, or there was a transient error. 113 // The controller should requeue and try again if it's still orphaned. 114 return false, err 115 } 116 // Successfully adopted. 117 return true, nil 118 } 119 120 type PodControllerRefManager struct { 121 BaseControllerRefManager 122 controllerKind schema.GroupVersionKind 123 podControl PodControlInterface 124 } 125 126 // NewPodControllerRefManager returns a PodControllerRefManager that exposes 127 // methods to manage the controllerRef of pods. 128 // 129 // The CanAdopt() function can be used to perform a potentially expensive check 130 // (such as a live GET from the API server) prior to the first adoption. 131 // It will only be called (at most once) if an adoption is actually attempted. 132 // If CanAdopt() returns a non-nil error, all adoptions will fail. 133 // 134 // NOTE: Once CanAdopt() is called, it will not be called again by the same 135 // 136 // PodControllerRefManager instance. Create a new instance if it makes 137 // sense to check CanAdopt() again (e.g. in a different sync pass). 138 func NewPodControllerRefManager( 139 podControl PodControlInterface, 140 controller metav1.Object, 141 selector labels.Selector, 142 controllerKind schema.GroupVersionKind, 143 canAdopt func() error, 144 ) *PodControllerRefManager { 145 return &PodControllerRefManager{ 146 BaseControllerRefManager: BaseControllerRefManager{ 147 Controller: controller, 148 Selector: selector, 149 CanAdoptFunc: canAdopt, 150 }, 151 controllerKind: controllerKind, 152 podControl: podControl, 153 } 154 } 155 156 // ClaimPods tries to take ownership of a list of Pods. 157 // 158 // It will reconcile the following: 159 // - Adopt orphans if the selector matches. 160 // - Release owned objects if the selector no longer matches. 161 // 162 // Optional: If one or more filters are specified, a Pod will only be claimed if 163 // all filters return true. 164 // 165 // A non-nil error is returned if some form of reconciliation was attempted and 166 // failed. Usually, controllers should try again later in case reconciliation 167 // is still needed. 168 // 169 // If the error is nil, either the reconciliation succeeded, or no 170 // reconciliation was necessary. The list of Pods that you now own is returned. 171 func (m *PodControllerRefManager) ClaimPods(pods []*v1.Pod, filters ...func(*v1.Pod) bool) ([]*v1.Pod, error) { 172 var claimed []*v1.Pod 173 var errlist []error 174 175 match := func(obj metav1.Object) bool { 176 pod := obj.(*v1.Pod) 177 // Check selector first so filters only run on potentially matching Pods. 178 if !m.Selector.Matches(labels.Set(pod.Labels)) { 179 return false 180 } 181 for _, filter := range filters { 182 if !filter(pod) { 183 return false 184 } 185 } 186 return true 187 } 188 adopt := func(obj metav1.Object) error { 189 return m.AdoptPod(obj.(*v1.Pod)) 190 } 191 release := func(obj metav1.Object) error { 192 return m.ReleasePod(obj.(*v1.Pod)) 193 } 194 195 for _, pod := range pods { 196 ok, err := m.ClaimObject(pod, match, adopt, release) 197 if err != nil { 198 errlist = append(errlist, err) 199 continue 200 } 201 if ok { 202 claimed = append(claimed, pod) 203 } 204 } 205 return claimed, utilerrors.NewAggregate(errlist) 206 } 207 208 // AdoptPod sends a patch to take control of the pod. It returns the error if 209 // the patching fails. 210 func (m *PodControllerRefManager) AdoptPod(pod *v1.Pod) error { 211 if err := m.CanAdopt(); err != nil { 212 return fmt.Errorf("can't adopt Pod %v/%v (%v): %v", pod.Namespace, pod.Name, pod.UID, err) 213 } 214 // Note that ValidateOwnerReferences() will reject this patch if another 215 // OwnerReference exists with controller=true. 216 addControllerPatch := fmt.Sprintf( 217 `{"metadata":{"ownerReferences":[{"apiVersion":"%s","kind":"%s","name":"%s","uid":"%s","controller":true,"blockOwnerDeletion":true}],"uid":"%s"}}`, 218 m.controllerKind.GroupVersion(), m.controllerKind.Kind, 219 m.Controller.GetName(), m.Controller.GetUID(), pod.UID) 220 return m.podControl.PatchPod(pod.Namespace, pod.Name, []byte(addControllerPatch)) 221 } 222 223 // ReleasePod sends a patch to free the pod from the control of the controller. 224 // It returns the error if the patching fails. 404 and 422 errors are ignored. 225 func (m *PodControllerRefManager) ReleasePod(pod *v1.Pod) error { 226 log.Infof("patching pod %s_%s to remove its controllerRef to %s/%s:%s", 227 pod.Namespace, pod.Name, m.controllerKind.GroupVersion(), m.controllerKind.Kind, m.Controller.GetName()) 228 deleteOwnerRefPatch := fmt.Sprintf(`{"metadata":{"ownerReferences":[{"$patch":"delete","uid":"%s"}],"uid":"%s"}}`, m.Controller.GetUID(), pod.UID) 229 err := m.podControl.PatchPod(pod.Namespace, pod.Name, []byte(deleteOwnerRefPatch)) 230 if err != nil { 231 if errors.IsNotFound(err) { 232 // If the pod no longer exists, ignore it. 233 return nil 234 } 235 if errors.IsInvalid(err) { 236 // Invalid error will be returned in two cases: 1. the pod 237 // has no owner reference, 2. the uid of the pod doesn't 238 // match, which means the pod is deleted and then recreated. 239 // In both cases, the error can be ignored. 240 241 // TODO: If the pod has owner references, but none of them 242 // has the owner.UID, server will silently ignore the patch. 243 // Investigate why. 244 return nil 245 } 246 } 247 return err 248 } 249 250 type ServiceControllerRefManager struct { 251 BaseControllerRefManager 252 253 controllerKind schema.GroupVersionKind 254 serviceControl ServiceControlInterface 255 } 256 257 // NewServiceControllerRefManager returns a ServiceControllerRefManager that exposes 258 // methods to manage the controllerRef of services. 259 // 260 // The canAdopt() function can be used to perform a potentially expensive check 261 // (such as a live GET from the API server) prior to the first adoption. 262 // It will only be called (at most once) if an adoption is actually attempted. 263 // If canAdopt() returns a non-nil error, all adoptions will fail. 264 // 265 // NOTE: Once canAdopt() is called, it will not be called again by the same 266 // 267 // ServiceControllerRefManager instance. Create a new instance if it makes 268 // sense to check canAdopt() again (e.g. in a different sync pass). 269 func NewServiceControllerRefManager( 270 serviceControl ServiceControlInterface, 271 ctr metav1.Object, 272 selector labels.Selector, 273 controllerKind schema.GroupVersionKind, 274 canAdopt func() error, 275 ) *ServiceControllerRefManager { 276 return &ServiceControllerRefManager{ 277 BaseControllerRefManager: BaseControllerRefManager{ 278 Controller: ctr, 279 Selector: selector, 280 CanAdoptFunc: canAdopt, 281 }, 282 controllerKind: controllerKind, 283 serviceControl: serviceControl, 284 } 285 } 286 287 // ClaimServices tries to take ownership of a list of Services. 288 // 289 // It will reconcile the following: 290 // - Adopt orphans if the selector matches. 291 // - Release owned objects if the selector no longer matches. 292 // 293 // Optional: If one or more filters are specified, a Service will only be claimed if 294 // all filters return true. 295 // 296 // A non-nil error is returned if some form of reconciliation was attempted and 297 // failed. Usually, controllers should try again later in case reconciliation 298 // is still needed. 299 // 300 // If the error is nil, either the reconciliation succeeded, or no 301 // reconciliation was necessary. The list of Services that you now own is returned. 302 func (m *ServiceControllerRefManager) ClaimServices(services []*v1.Service, filters ...func(*v1.Service) bool) ([]*v1.Service, error) { 303 var claimed []*v1.Service 304 var errlist []error 305 306 match := func(obj metav1.Object) bool { 307 service := obj.(*v1.Service) 308 // Check selector first so filters only run on potentially matching Services. 309 if !m.Selector.Matches(labels.Set(service.Labels)) { 310 return false 311 } 312 for _, filter := range filters { 313 if !filter(service) { 314 return false 315 } 316 } 317 return true 318 } 319 adopt := func(obj metav1.Object) error { 320 return m.AdoptService(obj.(*v1.Service)) 321 } 322 release := func(obj metav1.Object) error { 323 return m.ReleaseService(obj.(*v1.Service)) 324 } 325 326 for _, service := range services { 327 ok, err := m.ClaimObject(service, match, adopt, release) 328 if err != nil { 329 errlist = append(errlist, err) 330 continue 331 } 332 if ok { 333 claimed = append(claimed, service) 334 } 335 } 336 return claimed, utilerrors.NewAggregate(errlist) 337 } 338 339 // AdoptService sends a patch to take control of the service. It returns the error if 340 // the patching fails. 341 func (m *ServiceControllerRefManager) AdoptService(service *v1.Service) error { 342 if err := m.CanAdopt(); err != nil { 343 return fmt.Errorf("can't adopt Service %v/%v (%v): %v", service.Namespace, service.Name, service.UID, err) 344 } 345 // Note that ValidateOwnerReferences() will reject this patch if another 346 // OwnerReference exists with controller=true. 347 addControllerPatch := fmt.Sprintf( 348 `{"metadata":{"ownerReferences":[{"apiVersion":"%s","kind":"%s","name":"%s","uid":"%s","controller":true,"blockOwnerDeletion":true}],"uid":"%s"}}`, 349 m.controllerKind.GroupVersion(), m.controllerKind.Kind, 350 m.Controller.GetName(), m.Controller.GetUID(), service.UID) 351 return m.serviceControl.PatchService(service.Namespace, service.Name, []byte(addControllerPatch)) 352 } 353 354 // ReleaseService sends a patch to free the service from the control of the controller. 355 // It returns the error if the patching fails. 404 and 422 errors are ignored. 356 func (m *ServiceControllerRefManager) ReleaseService(service *v1.Service) error { 357 logger := commonutil.LoggerForService(service, m.controllerKind.Kind) 358 logger.Infof("patching service %s_%s to remove its controllerRef to %s/%s:%s", 359 service.Namespace, service.Name, m.controllerKind.GroupVersion(), m.controllerKind.Kind, m.Controller.GetName()) 360 deleteOwnerRefPatch := fmt.Sprintf(`{"metadata":{"ownerReferences":[{"$patch":"delete","uid":"%s"}],"uid":"%s"}}`, m.Controller.GetUID(), service.UID) 361 err := m.serviceControl.PatchService(service.Namespace, service.Name, []byte(deleteOwnerRefPatch)) 362 if err != nil { 363 if errors.IsNotFound(err) { 364 // If the service no longer exists, ignore it. 365 return nil 366 } 367 if errors.IsInvalid(err) { 368 // Invalid error will be returned in two cases: 1. the service 369 // has no owner reference, 2. the uid of the service doesn't 370 // match, which means the service is deleted and then recreated. 371 // In both cases, the error can be ignored. 372 373 // TODO: If the service has owner references, but none of them 374 // has the owner.UID, server will silently ignore the patch. 375 // Investigate why. 376 return nil 377 } 378 } 379 return err 380 }