github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go (about)

     1  // Copyright 2022 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package paddle
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  
    21  	. "github.com/onsi/ginkgo/v2"
    22  	. "github.com/onsi/gomega"
    23  	corev1 "k8s.io/api/core/v1"
    24  	"k8s.io/apimachinery/pkg/api/errors"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/apimachinery/pkg/types"
    27  	"k8s.io/utils/pointer"
    28  	"sigs.k8s.io/controller-runtime/pkg/client"
    29  
    30  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    31  	commonutil "github.com/kubeflow/training-operator/pkg/util"
    32  	"github.com/kubeflow/training-operator/pkg/util/testutil"
    33  )
    34  
    35  var _ = Describe("PaddleJob controller", func() {
    36  	// Define utility constants for object names and testing timeouts/durations and intervals.
    37  	const (
    38  		expectedPort = int32(8080)
    39  	)
    40  	Context("When creating the PaddleJob", func() {
    41  		const name = "test-job"
    42  		var (
    43  			ctx        = context.Background()
    44  			ns         *corev1.Namespace
    45  			job        *kubeflowv1.PaddleJob
    46  			jobKey     types.NamespacedName
    47  			masterKey  types.NamespacedName
    48  			worker0Key types.NamespacedName
    49  		)
    50  		BeforeEach(func() {
    51  			ns = &corev1.Namespace{
    52  				ObjectMeta: metav1.ObjectMeta{
    53  					GenerateName: "paddle-test-",
    54  				},
    55  			}
    56  			Expect(testK8sClient.Create(ctx, ns)).Should(Succeed())
    57  
    58  			job = newPaddleJobForTest(name, ns.Name)
    59  			jobKey = client.ObjectKeyFromObject(job)
    60  			masterKey = types.NamespacedName{
    61  				Name:      fmt.Sprintf("%s-master-0", name),
    62  				Namespace: ns.Name,
    63  			}
    64  			worker0Key = types.NamespacedName{
    65  				Name:      fmt.Sprintf("%s-worker-0", name),
    66  				Namespace: ns.Name,
    67  			}
    68  			job.Spec.PaddleReplicaSpecs = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{
    69  				kubeflowv1.PaddleJobReplicaTypeMaster: {
    70  					Replicas: pointer.Int32(1),
    71  					Template: corev1.PodTemplateSpec{
    72  						Spec: corev1.PodSpec{
    73  							Containers: []corev1.Container{
    74  								{
    75  									Image: "test-image",
    76  									Name:  kubeflowv1.PaddleJobDefaultContainerName,
    77  									Ports: []corev1.ContainerPort{
    78  										{
    79  											Name:          kubeflowv1.PaddleJobDefaultPortName,
    80  											ContainerPort: expectedPort,
    81  											Protocol:      corev1.ProtocolTCP,
    82  										},
    83  									},
    84  								},
    85  							},
    86  						},
    87  					},
    88  				},
    89  				kubeflowv1.PaddleJobReplicaTypeWorker: {
    90  					Replicas: pointer.Int32(2),
    91  					Template: corev1.PodTemplateSpec{
    92  						Spec: corev1.PodSpec{
    93  							Containers: []corev1.Container{
    94  								{
    95  									Image: "test-image",
    96  									Name:  kubeflowv1.PaddleJobDefaultContainerName,
    97  									Ports: []corev1.ContainerPort{
    98  										{
    99  											Name:          kubeflowv1.PaddleJobDefaultPortName,
   100  											ContainerPort: expectedPort,
   101  											Protocol:      corev1.ProtocolTCP,
   102  										},
   103  									},
   104  								},
   105  							},
   106  						},
   107  					},
   108  				},
   109  			}
   110  		})
   111  		AfterEach(func() {
   112  			Expect(testK8sClient.Delete(ctx, job)).Should(Succeed())
   113  			Expect(testK8sClient.Delete(ctx, ns)).Should(Succeed())
   114  		})
   115  		It("Should get the corresponding resources successfully", func() {
   116  			By("By creating a new PaddleJob")
   117  			Expect(testK8sClient.Create(ctx, job)).Should(Succeed())
   118  
   119  			created := &kubeflowv1.PaddleJob{}
   120  
   121  			// We'll need to retry getting this newly created PaddleJob, given that creation may not immediately happen.
   122  			Eventually(func() bool {
   123  				err := testK8sClient.Get(ctx, jobKey, created)
   124  				return err == nil
   125  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   126  
   127  			masterPod := &corev1.Pod{}
   128  			Eventually(func() bool {
   129  				err := testK8sClient.Get(ctx, masterKey, masterPod)
   130  				return err == nil
   131  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   132  
   133  			masterSvc := &corev1.Service{}
   134  			Eventually(func() bool {
   135  				err := testK8sClient.Get(ctx, masterKey, masterSvc)
   136  				return err == nil
   137  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   138  
   139  			// Check the pod port.
   140  			Expect(masterPod.Spec.Containers[0].Ports).To(ContainElement(corev1.ContainerPort{
   141  				Name:          kubeflowv1.PaddleJobDefaultPortName,
   142  				ContainerPort: expectedPort,
   143  				Protocol:      corev1.ProtocolTCP}))
   144  			// Check env variable
   145  			Expect(masterPod.Spec.Containers[0].Env).To(ContainElements(corev1.EnvVar{
   146  				Name:  EnvMasterEndpoint,
   147  				Value: fmt.Sprintf("$(POD_IP_DUMMY):%d", expectedPort),
   148  			}))
   149  			// Check service port.
   150  			Expect(masterSvc.Spec.Ports[0].Port).To(Equal(expectedPort))
   151  			// Check owner reference.
   152  			trueVal := true
   153  			Expect(masterPod.OwnerReferences).To(ContainElement(metav1.OwnerReference{
   154  				APIVersion:         kubeflowv1.SchemeGroupVersion.String(),
   155  				Kind:               kubeflowv1.PaddleJobKind,
   156  				Name:               name,
   157  				UID:                created.UID,
   158  				Controller:         &trueVal,
   159  				BlockOwnerDeletion: &trueVal,
   160  			}))
   161  			Expect(masterSvc.OwnerReferences).To(ContainElement(metav1.OwnerReference{
   162  				APIVersion:         kubeflowv1.SchemeGroupVersion.String(),
   163  				Kind:               kubeflowv1.PaddleJobKind,
   164  				Name:               name,
   165  				UID:                created.UID,
   166  				Controller:         &trueVal,
   167  				BlockOwnerDeletion: &trueVal,
   168  			}))
   169  
   170  			// Test job status.
   171  			masterPod.Status.Phase = corev1.PodSucceeded
   172  			masterPod.ResourceVersion = ""
   173  			Expect(testK8sClient.Status().Update(ctx, masterPod)).Should(Succeed())
   174  			Eventually(func() bool {
   175  				err := testK8sClient.Get(ctx, jobKey, created)
   176  				if err != nil {
   177  					return false
   178  				}
   179  				return created.Status.ReplicaStatuses != nil && created.Status.
   180  					ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeMaster].Succeeded == 1
   181  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   182  			// Check if the job is succeeded.
   183  			cond := getCondition(created.Status, kubeflowv1.JobSucceeded)
   184  			Expect(cond.Status).To(Equal(corev1.ConditionTrue))
   185  		})
   186  		It("Shouldn't create resources if PaddleJob is suspended", func() {
   187  			By("By creating a new PaddleJob with suspend=true")
   188  			job.Spec.RunPolicy.Suspend = pointer.Bool(true)
   189  			job.Spec.PaddleReplicaSpecs[kubeflowv1.PaddleJobReplicaTypeWorker].Replicas = pointer.Int32(1)
   190  			Expect(testK8sClient.Create(ctx, job)).Should(Succeed())
   191  
   192  			created := &kubeflowv1.PaddleJob{}
   193  			masterPod := &corev1.Pod{}
   194  			workerPod := &corev1.Pod{}
   195  			masterSvc := &corev1.Service{}
   196  			workerSvc := &corev1.Service{}
   197  
   198  			By("Checking created PaddleJob")
   199  			Eventually(func() bool {
   200  				err := testK8sClient.Get(ctx, jobKey, created)
   201  				return err == nil
   202  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   203  			By("Checking created PaddleJob has a nil startTime")
   204  			Consistently(func() *metav1.Time {
   205  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   206  				return created.Status.StartTime
   207  			}, testutil.ConsistentDuration, testutil.Interval).Should(BeNil())
   208  
   209  			By("Checking if the pods and services aren't created")
   210  			Consistently(func() bool {
   211  				errMasterPod := testK8sClient.Get(ctx, masterKey, masterPod)
   212  				errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod)
   213  				errMasterSvc := testK8sClient.Get(ctx, masterKey, masterSvc)
   214  				errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc)
   215  				return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) &&
   216  					errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc)
   217  			}, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())
   218  
   219  			By("Checking if the PaddleJob has suspended condition")
   220  			Eventually(func() []kubeflowv1.JobCondition {
   221  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   222  				return created.Status.Conditions
   223  			}, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
   224  				{
   225  					Type:    kubeflowv1.JobCreated,
   226  					Status:  corev1.ConditionTrue,
   227  					Reason:  commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobCreatedReason),
   228  					Message: fmt.Sprintf("PaddleJob %s is created.", name),
   229  				},
   230  				{
   231  					Type:    kubeflowv1.JobSuspended,
   232  					Status:  corev1.ConditionTrue,
   233  					Reason:  commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSuspendedReason),
   234  					Message: fmt.Sprintf("PaddleJob %s is suspended.", name),
   235  				},
   236  			}, testutil.IgnoreJobConditionsTimes))
   237  		})
   238  
   239  		It("Should delete resources after PaddleJob is suspended; Should resume PaddleJob after PaddleJob is unsuspended", func() {
   240  			By("By creating a new PaddleJob")
   241  			job.Spec.PaddleReplicaSpecs[kubeflowv1.PaddleJobReplicaTypeWorker].Replicas = pointer.Int32(1)
   242  			Expect(testK8sClient.Create(ctx, job)).Should(Succeed())
   243  
   244  			created := &kubeflowv1.PaddleJob{}
   245  			masterPod := &corev1.Pod{}
   246  			workerPod := &corev1.Pod{}
   247  			masterSvc := &corev1.Service{}
   248  			workerSvc := &corev1.Service{}
   249  
   250  			// We'll need to retry getting this newly created PaddleJob, given that creation may not immediately happen.
   251  			By("Checking created PaddleJob")
   252  			Eventually(func() bool {
   253  				err := testK8sClient.Get(ctx, jobKey, created)
   254  				return err == nil
   255  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   256  
   257  			var startTimeBeforeSuspended *metav1.Time
   258  			Eventually(func() *metav1.Time {
   259  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   260  				startTimeBeforeSuspended = created.Status.StartTime
   261  				return startTimeBeforeSuspended
   262  			}, testutil.Timeout, testutil.Interval).ShouldNot(BeNil())
   263  
   264  			By("Checking the created pods and services")
   265  			Eventually(func() bool {
   266  				errMaster := testK8sClient.Get(ctx, masterKey, masterPod)
   267  				errWorker := testK8sClient.Get(ctx, worker0Key, workerPod)
   268  				return errMaster == nil && errWorker == nil
   269  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   270  			Eventually(func() bool {
   271  				errMaster := testK8sClient.Get(ctx, masterKey, masterSvc)
   272  				errWorker := testK8sClient.Get(ctx, worker0Key, workerSvc)
   273  				return errMaster == nil && errWorker == nil
   274  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   275  
   276  			By("Updating the pod's phase with Running")
   277  			Eventually(func() error {
   278  				Expect(testK8sClient.Get(ctx, masterKey, masterPod)).Should(Succeed())
   279  				masterPod.Status.Phase = corev1.PodRunning
   280  				return testK8sClient.Status().Update(ctx, masterPod)
   281  			}, testutil.Timeout, testutil.Interval).Should(Succeed())
   282  			Eventually(func() error {
   283  				Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed())
   284  				workerPod.Status.Phase = corev1.PodRunning
   285  				return testK8sClient.Status().Update(ctx, workerPod)
   286  			}, testutil.Timeout, testutil.Interval).Should(Succeed())
   287  
   288  			By("Checking the PaddleJob's condition")
   289  			Eventually(func() []kubeflowv1.JobCondition {
   290  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   291  				return created.Status.Conditions
   292  			}, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
   293  				{
   294  					Type:    kubeflowv1.JobCreated,
   295  					Status:  corev1.ConditionTrue,
   296  					Reason:  commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobCreatedReason),
   297  					Message: fmt.Sprintf("PaddleJob %s is created.", name),
   298  				},
   299  				{
   300  					Type:    kubeflowv1.JobRunning,
   301  					Status:  corev1.ConditionTrue,
   302  					Reason:  commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRunningReason),
   303  					Message: fmt.Sprintf("PaddleJob %s is running.", name),
   304  				},
   305  			}, testutil.IgnoreJobConditionsTimes))
   306  
   307  			By("Updating the PaddleJob with suspend=true")
   308  			Eventually(func() error {
   309  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   310  				created.Spec.RunPolicy.Suspend = pointer.Bool(true)
   311  				return testK8sClient.Update(ctx, created)
   312  			}, testutil.Timeout, testutil.Interval).Should(Succeed())
   313  
   314  			By("Checking if the pods and services are removed")
   315  			Eventually(func() bool {
   316  				errMaster := testK8sClient.Get(ctx, masterKey, masterPod)
   317  				errWorker := testK8sClient.Get(ctx, worker0Key, workerPod)
   318  				return errors.IsNotFound(errMaster) && errors.IsNotFound(errWorker)
   319  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   320  			Eventually(func() bool {
   321  				errMaster := testK8sClient.Get(ctx, masterKey, masterSvc)
   322  				errWorker := testK8sClient.Get(ctx, worker0Key, workerSvc)
   323  				return errors.IsNotFound(errMaster) && errors.IsNotFound(errWorker)
   324  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   325  			Consistently(func() bool {
   326  				errMasterPod := testK8sClient.Get(ctx, masterKey, masterPod)
   327  				errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod)
   328  				errMasterSvc := testK8sClient.Get(ctx, masterKey, masterSvc)
   329  				errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc)
   330  				return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) &&
   331  					errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc)
   332  			}, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())
   333  
   334  			By("Checking if the PaddleJob has a suspended condition")
   335  			Eventually(func() bool {
   336  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   337  				return created.Status.ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeMaster].Active == 0 &&
   338  					created.Status.ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeWorker].Active == 0 &&
   339  					created.Status.StartTime.Equal(startTimeBeforeSuspended)
   340  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   341  			Consistently(func() bool {
   342  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   343  				return created.Status.ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeMaster].Active == 0 &&
   344  					created.Status.ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeWorker].Active == 0 &&
   345  					created.Status.StartTime.Equal(startTimeBeforeSuspended)
   346  			}, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())
   347  			Expect(created.Status.Conditions).Should(BeComparableTo([]kubeflowv1.JobCondition{
   348  				{
   349  					Type:    kubeflowv1.JobCreated,
   350  					Status:  corev1.ConditionTrue,
   351  					Reason:  commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobCreatedReason),
   352  					Message: fmt.Sprintf("PaddleJob %s is created.", name),
   353  				},
   354  				{
   355  					Type:    kubeflowv1.JobRunning,
   356  					Status:  corev1.ConditionFalse,
   357  					Reason:  commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSuspendedReason),
   358  					Message: fmt.Sprintf("PaddleJob %s is suspended.", name),
   359  				},
   360  				{
   361  					Type:    kubeflowv1.JobSuspended,
   362  					Reason:  commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSuspendedReason),
   363  					Message: fmt.Sprintf("PaddleJob %s is suspended.", name),
   364  					Status:  corev1.ConditionTrue,
   365  				},
   366  			}, testutil.IgnoreJobConditionsTimes))
   367  
   368  			By("Unsuspending the PaddleJob")
   369  			Eventually(func() error {
   370  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   371  				created.Spec.RunPolicy.Suspend = pointer.Bool(false)
   372  				return testK8sClient.Update(ctx, created)
   373  			}, testutil.Timeout, testutil.Interval).Should(Succeed())
   374  			Eventually(func() *metav1.Time {
   375  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   376  				return created.Status.StartTime
   377  			}, testutil.Timeout, testutil.Interval).ShouldNot(BeNil())
   378  
   379  			By("Check if the pods and services are created")
   380  			Eventually(func() error {
   381  				return testK8sClient.Get(ctx, masterKey, masterPod)
   382  			}, testutil.Timeout, testutil.Interval).Should(BeNil())
   383  			Eventually(func() error {
   384  				return testK8sClient.Get(ctx, worker0Key, workerPod)
   385  			}, testutil.Timeout, testutil.Interval).Should(BeNil())
   386  			Eventually(func() error {
   387  				return testK8sClient.Get(ctx, masterKey, masterSvc)
   388  			}, testutil.Timeout, testutil.Interval).Should(BeNil())
   389  			Eventually(func() error {
   390  				return testK8sClient.Get(ctx, worker0Key, workerSvc)
   391  			}, testutil.Timeout, testutil.Interval).Should(BeNil())
   392  
   393  			By("Updating Pod's condition with running")
   394  			Eventually(func() error {
   395  				Expect(testK8sClient.Get(ctx, masterKey, masterPod)).Should(Succeed())
   396  				masterPod.Status.Phase = corev1.PodRunning
   397  				return testK8sClient.Status().Update(ctx, masterPod)
   398  			}, testutil.Timeout, testutil.Interval).Should(Succeed())
   399  			Eventually(func() error {
   400  				Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed())
   401  				workerPod.Status.Phase = corev1.PodRunning
   402  				return testK8sClient.Status().Update(ctx, workerPod)
   403  			}, testutil.Timeout, testutil.Interval).Should(Succeed())
   404  
   405  			By("Checking if the PaddleJob has resumed conditions")
   406  			Eventually(func() []kubeflowv1.JobCondition {
   407  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   408  				return created.Status.Conditions
   409  			}, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
   410  				{
   411  					Type:    kubeflowv1.JobCreated,
   412  					Status:  corev1.ConditionTrue,
   413  					Reason:  commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobCreatedReason),
   414  					Message: fmt.Sprintf("PaddleJob %s is created.", name),
   415  				},
   416  				{
   417  					Type:    kubeflowv1.JobSuspended,
   418  					Reason:  commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobResumedReason),
   419  					Message: fmt.Sprintf("PaddleJob %s is resumed.", name),
   420  					Status:  corev1.ConditionFalse,
   421  				},
   422  				{
   423  					Type:    kubeflowv1.JobRunning,
   424  					Status:  corev1.ConditionTrue,
   425  					Reason:  commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRunningReason),
   426  					Message: fmt.Sprintf("PaddleJob %s is running.", name),
   427  				},
   428  			}, testutil.IgnoreJobConditionsTimes))
   429  
   430  			By("Checking if the startTime is updated")
   431  			Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended))
   432  		})
   433  	})
   434  })
   435  
   436  func newPaddleJobForTest(name, namespace string) *kubeflowv1.PaddleJob {
   437  	return &kubeflowv1.PaddleJob{
   438  		ObjectMeta: metav1.ObjectMeta{
   439  			Name:      name,
   440  			Namespace: namespace,
   441  		},
   442  	}
   443  }
   444  
   445  // getCondition returns the condition with the provided type.
   446  func getCondition(status kubeflowv1.JobStatus, condType kubeflowv1.JobConditionType) *kubeflowv1.JobCondition {
   447  	for _, condition := range status.Conditions {
   448  		if condition.Type == condType {
   449  			return &condition
   450  		}
   451  	}
   452  	return nil
   453  }