github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/control/pod_control.go (about)

     1  // Copyright 2019 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package control
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"sync"
    21  
    22  	commonutil "github.com/kubeflow/training-operator/pkg/util"
    23  
    24  	v1 "k8s.io/api/core/v1"
    25  	"k8s.io/apimachinery/pkg/api/errors"
    26  	"k8s.io/apimachinery/pkg/api/meta"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	"k8s.io/apimachinery/pkg/labels"
    29  	"k8s.io/apimachinery/pkg/runtime"
    30  	"k8s.io/apimachinery/pkg/types"
    31  	clientset "k8s.io/client-go/kubernetes"
    32  	"k8s.io/client-go/tools/record"
    33  )
    34  
// Reasons for pod events.
const (
	// FailedCreatePodReason is added in an event and in a job condition
	// when a pod for a replica set fails to be created.
	FailedCreatePodReason = "FailedCreatePod"
	// SuccessfulCreatePodReason is added in an event when a pod for a job
	// is successfully created.
	SuccessfulCreatePodReason = "SuccessfulCreatePod"
	// FailedDeletePodReason is added in an event and in a job condition
	// when a pod for a replica set fails to be deleted.
	FailedDeletePodReason = "FailedDeletePod"
	// SuccessfulDeletePodReason is added in an event when a pod for a job
	// is successfully deleted.
	SuccessfulDeletePodReason = "SuccessfulDeletePod"
)
    50  
// PodControlInterface knows how to create, patch and delete pods. It is
// declared as an interface so that tests can substitute a fake implementation.
type PodControlInterface interface {
	// CreatePods creates a new pod in namespace according to the template.
	// object is the parent object the pod is created on behalf of.
	CreatePods(namespace string, template *v1.PodTemplateSpec, object runtime.Object) error
	// CreatePodsOnNode creates a new pod according to the template on the
	// specified node, and sets the ControllerRef.
	CreatePodsOnNode(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error
	// CreatePodsWithControllerRef creates a new pod according to the template,
	// and sets controllerRef as the pod's owner reference.
	CreatePodsWithControllerRef(namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error
	// DeletePod deletes the pod identified by podID in namespace.
	DeletePod(namespace string, podID string, object runtime.Object) error
	// PatchPod applies the patch in data to the pod namespace/name.
	PatchPod(namespace, name string, data []byte) error
}
    66  
// RealPodControl is the default implementation of PodControlInterface.
type RealPodControl struct {
	// KubeClient is the clientset used to create, get, patch and delete pods.
	KubeClient clientset.Interface
	// Recorder emits the create/delete success and failure events against the
	// parent object passed to each call.
	Recorder record.EventRecorder
}

// Compile-time check that *RealPodControl satisfies PodControlInterface.
var _ PodControlInterface = &RealPodControl{}
    74  
    75  func getPodsLabelSet(template *v1.PodTemplateSpec) labels.Set {
    76  	desiredLabels := make(labels.Set)
    77  	for k, v := range template.Labels {
    78  		desiredLabels[k] = v
    79  	}
    80  	return desiredLabels
    81  }
    82  
    83  func getPodsFinalizers(template *v1.PodTemplateSpec) []string {
    84  	desiredFinalizers := make([]string, len(template.Finalizers))
    85  	copy(desiredFinalizers, template.Finalizers)
    86  	return desiredFinalizers
    87  }
    88  
    89  func getPodsAnnotationSet(template *v1.PodTemplateSpec) labels.Set {
    90  	desiredAnnotations := make(labels.Set)
    91  	for k, v := range template.Annotations {
    92  		desiredAnnotations[k] = v
    93  	}
    94  	return desiredAnnotations
    95  }
    96  
    97  func (r RealPodControl) CreatePods(namespace string, template *v1.PodTemplateSpec, object runtime.Object) error {
    98  	return r.createPods("", namespace, template, object, nil)
    99  }
   100  
   101  func (r RealPodControl) CreatePodsWithControllerRef(namespace string, template *v1.PodTemplateSpec, controllerObject runtime.Object, controllerRef *metav1.OwnerReference) error {
   102  	if err := ValidateControllerRef(controllerRef); err != nil {
   103  		return err
   104  	}
   105  	return r.createPods("", namespace, template, controllerObject, controllerRef)
   106  }
   107  
   108  func (r RealPodControl) CreatePodsOnNode(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
   109  	if err := ValidateControllerRef(controllerRef); err != nil {
   110  		return err
   111  	}
   112  	return r.createPods(nodeName, namespace, template, object, controllerRef)
   113  }
   114  
   115  func (r RealPodControl) PatchPod(namespace, name string, data []byte) error {
   116  	_, err := r.KubeClient.CoreV1().Pods(namespace).Patch(context.TODO(), name, types.StrategicMergePatchType, data, metav1.PatchOptions{})
   117  	return err
   118  }
   119  
   120  func GetPodFromTemplate(template *v1.PodTemplateSpec, parentObject runtime.Object, controllerRef *metav1.OwnerReference) (*v1.Pod, error) {
   121  	desiredLabels := getPodsLabelSet(template)
   122  	desiredFinalizers := getPodsFinalizers(template)
   123  	desiredAnnotations := getPodsAnnotationSet(template)
   124  
   125  	pod := &v1.Pod{
   126  		ObjectMeta: metav1.ObjectMeta{
   127  			Labels:      desiredLabels,
   128  			Annotations: desiredAnnotations,
   129  			Name:        template.Name,
   130  			Finalizers:  desiredFinalizers,
   131  		},
   132  	}
   133  	if controllerRef != nil {
   134  		pod.OwnerReferences = append(pod.OwnerReferences, *controllerRef)
   135  	}
   136  	pod.Spec = *template.Spec.DeepCopy()
   137  	return pod, nil
   138  }
   139  
   140  func (r RealPodControl) createPods(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
   141  	pod, err := GetPodFromTemplate(template, object, controllerRef)
   142  	if err != nil {
   143  		return err
   144  	}
   145  	if len(nodeName) != 0 {
   146  		pod.Spec.NodeName = nodeName
   147  	}
   148  	if labels.Set(pod.Labels).AsSelectorPreValidated().Empty() {
   149  		return fmt.Errorf("unable to create pods, no labels")
   150  	}
   151  	logger := commonutil.LoggerForPod(pod, object.GetObjectKind().GroupVersionKind().Kind)
   152  	if newPod, err := r.KubeClient.CoreV1().Pods(namespace).Create(context.TODO(), pod, metav1.CreateOptions{}); err != nil {
   153  		r.Recorder.Eventf(object, v1.EventTypeWarning, FailedCreatePodReason, "Error creating: %v", err)
   154  		return err
   155  	} else {
   156  		accessor, err := meta.Accessor(object)
   157  		if err != nil {
   158  			logger.Errorf("parentObject does not have ObjectMeta, %v", err)
   159  			return nil
   160  		}
   161  		logger.Infof("Controller %v created pod %v", accessor.GetName(), newPod.Name)
   162  		r.Recorder.Eventf(object, v1.EventTypeNormal, SuccessfulCreatePodReason, "Created pod: %v", newPod.Name)
   163  	}
   164  	return nil
   165  }
   166  
   167  func (r RealPodControl) DeletePod(namespace string, podID string, object runtime.Object) error {
   168  	accessor, err := meta.Accessor(object)
   169  	if err != nil {
   170  		return fmt.Errorf("object does not have ObjectMeta, %v", err)
   171  	}
   172  	logger := commonutil.LoggerForJob(accessor)
   173  	pod, err := r.KubeClient.CoreV1().Pods(namespace).Get(context.TODO(), podID, metav1.GetOptions{})
   174  	if err != nil {
   175  		if errors.IsNotFound(err) {
   176  			return nil
   177  		}
   178  		return err
   179  	}
   180  	if pod.DeletionTimestamp != nil {
   181  		logger.Infof("pod %s/%s is terminating, skip deleting", pod.Namespace, pod.Name)
   182  		return nil
   183  	}
   184  	logger.Infof("Controller %v deleting pod %v/%v", accessor.GetName(), namespace, podID)
   185  	// delete options
   186  	if err := r.KubeClient.CoreV1().Pods(namespace).Delete(context.TODO(), podID, metav1.DeleteOptions{}); err != nil {
   187  		r.Recorder.Eventf(object, v1.EventTypeWarning, FailedDeletePodReason, "Error deleting: %v", err)
   188  		return fmt.Errorf("unable to delete pods: %v", err)
   189  	} else {
   190  		r.Recorder.Eventf(object, v1.EventTypeNormal, SuccessfulDeletePodReason, "Deleted pod: %v", podID)
   191  	}
   192  	return nil
   193  }
   194  
// FakePodControl is an in-memory PodControlInterface implementation for tests.
// It records the arguments of every call instead of talking to an API server.
// The embedded mutex guards all fields; every method locks it.
type FakePodControl struct {
	sync.Mutex
	// Templates holds the template passed to each accepted create call.
	Templates []v1.PodTemplateSpec
	// ControllerRefs holds the controller reference passed to each accepted
	// CreatePodsWithControllerRef / CreatePodsOnNode call.
	ControllerRefs []metav1.OwnerReference
	// DeletePodName holds the podID passed to each DeletePod call.
	DeletePodName []string
	// Patches holds the patch data passed to each PatchPod call.
	Patches [][]byte
	// Err, when non-nil, is returned by every method after the call has been
	// recorded. Clear does not reset it.
	Err error
	// CreateLimit, when non-zero, caps the number of create calls; calls
	// beyond the limit return an error and record nothing.
	CreateLimit int
	// CreateCallCount counts every create call, including rejected ones.
	CreateCallCount int
}

// Compile-time check that *FakePodControl satisfies PodControlInterface.
var _ PodControlInterface = &FakePodControl{}
   207  
   208  func (f *FakePodControl) PatchPod(namespace, name string, data []byte) error {
   209  	f.Lock()
   210  	defer f.Unlock()
   211  	f.Patches = append(f.Patches, data)
   212  	if f.Err != nil {
   213  		return f.Err
   214  	}
   215  	return nil
   216  }
   217  
   218  func (f *FakePodControl) CreatePods(namespace string, spec *v1.PodTemplateSpec, object runtime.Object) error {
   219  	f.Lock()
   220  	defer f.Unlock()
   221  	f.CreateCallCount++
   222  	if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit {
   223  		return fmt.Errorf("not creating pod, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount)
   224  	}
   225  	f.Templates = append(f.Templates, *spec)
   226  	if f.Err != nil {
   227  		return f.Err
   228  	}
   229  	return nil
   230  }
   231  
   232  func (f *FakePodControl) CreatePodsWithControllerRef(namespace string, spec *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
   233  	f.Lock()
   234  	defer f.Unlock()
   235  	f.CreateCallCount++
   236  	if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit {
   237  		return fmt.Errorf("not creating pod, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount)
   238  	}
   239  	f.Templates = append(f.Templates, *spec)
   240  	f.ControllerRefs = append(f.ControllerRefs, *controllerRef)
   241  	if f.Err != nil {
   242  		return f.Err
   243  	}
   244  	return nil
   245  }
   246  
   247  func (f *FakePodControl) CreatePodsOnNode(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error {
   248  	f.Lock()
   249  	defer f.Unlock()
   250  	f.CreateCallCount++
   251  	if f.CreateLimit != 0 && f.CreateCallCount > f.CreateLimit {
   252  		return fmt.Errorf("not creating pod, limit %d already reached (create call %d)", f.CreateLimit, f.CreateCallCount)
   253  	}
   254  	f.Templates = append(f.Templates, *template)
   255  	f.ControllerRefs = append(f.ControllerRefs, *controllerRef)
   256  	if f.Err != nil {
   257  		return f.Err
   258  	}
   259  	return nil
   260  }
   261  
   262  func (f *FakePodControl) DeletePod(namespace string, podID string, object runtime.Object) error {
   263  	f.Lock()
   264  	defer f.Unlock()
   265  	f.DeletePodName = append(f.DeletePodName, podID)
   266  	if f.Err != nil {
   267  		return f.Err
   268  	}
   269  	return nil
   270  }
   271  
   272  func (f *FakePodControl) Clear() {
   273  	f.Lock()
   274  	defer f.Unlock()
   275  	f.DeletePodName = []string{}
   276  	f.Templates = []v1.PodTemplateSpec{}
   277  	f.ControllerRefs = []metav1.OwnerReference{}
   278  	f.Patches = [][]byte{}
   279  	f.CreateLimit = 0
   280  	f.CreateCallCount = 0
   281  }