github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/tensorflow/testutil/tfjob.go (about)

     1  // Copyright 2018 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package testutil
    16  
    17  import (
    18  	v1 "k8s.io/api/core/v1"
    19  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    20  
    21  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    22  )
    23  
// TestTFJobName is the object name given to the TFJobs built by NewTFJob and
// NewTFJobV2 in this package.
const TestTFJobName = "test-tfjob"
    25  
    26  func NewTFJobWithCleanPolicy(chief, worker, ps int, policy kubeflowv1.CleanPodPolicy) *kubeflowv1.TFJob {
    27  	if chief == 1 {
    28  		tfJob := NewTFJobWithChief(worker, ps)
    29  		tfJob.Spec.RunPolicy.CleanPodPolicy = &policy
    30  		return tfJob
    31  	}
    32  	tfJob := NewTFJob(worker, ps)
    33  	tfJob.Spec.RunPolicy.CleanPodPolicy = &policy
    34  	return tfJob
    35  }
    36  
    37  func NewTFJobWithCleanupJobDelay(chief, worker, ps int, ttl *int32) *kubeflowv1.TFJob {
    38  	if chief == 1 {
    39  		tfJob := NewTFJobWithChief(worker, ps)
    40  		tfJob.Spec.RunPolicy.TTLSecondsAfterFinished = ttl
    41  		tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyNone)
    42  		return tfJob
    43  	}
    44  	tfJob := NewTFJob(worker, ps)
    45  	tfJob.Spec.RunPolicy.TTLSecondsAfterFinished = ttl
    46  	tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyNone)
    47  	return tfJob
    48  }
    49  
    50  func NewTFJobWithActiveDeadlineSeconds(chief, worker, ps int, ads *int64) *kubeflowv1.TFJob {
    51  	if chief == 1 {
    52  		tfJob := NewTFJobWithChief(worker, ps)
    53  		tfJob.Spec.RunPolicy.ActiveDeadlineSeconds = ads
    54  		tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyAll)
    55  		return tfJob
    56  	}
    57  	tfJob := NewTFJob(worker, ps)
    58  	tfJob.Spec.RunPolicy.ActiveDeadlineSeconds = ads
    59  	tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyAll)
    60  	return tfJob
    61  }
    62  
    63  func NewTFJobWithBackoffLimit(chief, worker, ps int, backoffLimit *int32) *kubeflowv1.TFJob {
    64  	if chief == 1 {
    65  		tfJob := NewTFJobWithChief(worker, ps)
    66  		tfJob.Spec.RunPolicy.BackoffLimit = backoffLimit
    67  		tfJob.Spec.TFReplicaSpecs["Worker"].RestartPolicy = "OnFailure"
    68  		tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyAll)
    69  		return tfJob
    70  	}
    71  	tfJob := NewTFJob(worker, ps)
    72  	tfJob.Spec.RunPolicy.BackoffLimit = backoffLimit
    73  	tfJob.Spec.TFReplicaSpecs["Worker"].RestartPolicy = "OnFailure"
    74  	tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyAll)
    75  	return tfJob
    76  }
    77  
    78  func NewTFJobWithChief(worker, ps int) *kubeflowv1.TFJob {
    79  	tfJob := NewTFJob(worker, ps)
    80  	chief := int32(1)
    81  	tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeChief] = &kubeflowv1.ReplicaSpec{
    82  		Replicas: &chief,
    83  		Template: NewTFReplicaSpecTemplate(),
    84  	}
    85  	return tfJob
    86  }
    87  
    88  func NewTFJobWithEvaluator(worker, ps, evaluator int) *kubeflowv1.TFJob {
    89  	tfJob := NewTFJob(worker, ps)
    90  	if evaluator > 0 {
    91  		evaluator := int32(evaluator)
    92  		tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeEval] = &kubeflowv1.ReplicaSpec{
    93  			Replicas: &evaluator,
    94  			Template: NewTFReplicaSpecTemplate(),
    95  		}
    96  	}
    97  	return tfJob
    98  }
    99  
   100  func NewTFJobWithSuccessPolicy(worker, ps int, successPolicy kubeflowv1.SuccessPolicy) *kubeflowv1.TFJob {
   101  	tfJob := NewTFJob(worker, ps)
   102  	tfJob.Spec.SuccessPolicy = &successPolicy
   103  	return tfJob
   104  }
   105  
   106  func NewTFJob(worker, ps int) *kubeflowv1.TFJob {
   107  	tfJob := &kubeflowv1.TFJob{
   108  		TypeMeta: metav1.TypeMeta{
   109  			Kind: kubeflowv1.TFJobKind,
   110  		},
   111  		ObjectMeta: metav1.ObjectMeta{
   112  			Name:      TestTFJobName,
   113  			Namespace: metav1.NamespaceDefault,
   114  		},
   115  		Spec: kubeflowv1.TFJobSpec{
   116  			TFReplicaSpecs: make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec),
   117  		},
   118  	}
   119  	kubeflowv1.SetObjectDefaults_TFJob(tfJob)
   120  
   121  	if worker > 0 {
   122  		worker := int32(worker)
   123  		workerReplicaSpec := &kubeflowv1.ReplicaSpec{
   124  			Replicas: &worker,
   125  			Template: NewTFReplicaSpecTemplate(),
   126  		}
   127  		tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker] = workerReplicaSpec
   128  	}
   129  
   130  	if ps > 0 {
   131  		ps := int32(ps)
   132  		psReplicaSpec := &kubeflowv1.ReplicaSpec{
   133  			Replicas: &ps,
   134  			Template: NewTFReplicaSpecTemplate(),
   135  		}
   136  		tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypePS] = psReplicaSpec
   137  	}
   138  	return tfJob
   139  }
   140  
   141  func NewTFJobV2(worker, ps, master, chief, evaluator int) *kubeflowv1.TFJob {
   142  	tfJob := &kubeflowv1.TFJob{
   143  		TypeMeta: metav1.TypeMeta{
   144  			Kind: kubeflowv1.TFJobKind,
   145  		},
   146  		ObjectMeta: metav1.ObjectMeta{
   147  			Name:      TestTFJobName,
   148  			Namespace: metav1.NamespaceDefault,
   149  		},
   150  		Spec: kubeflowv1.TFJobSpec{
   151  			TFReplicaSpecs: make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec),
   152  		},
   153  	}
   154  	kubeflowv1.SetObjectDefaults_TFJob(tfJob)
   155  
   156  	if worker > 0 {
   157  		worker := int32(worker)
   158  		workerReplicaSpec := &kubeflowv1.ReplicaSpec{
   159  			Replicas: &worker,
   160  			Template: NewTFReplicaSpecTemplate(),
   161  		}
   162  		tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker] = workerReplicaSpec
   163  	}
   164  
   165  	if ps > 0 {
   166  		ps := int32(ps)
   167  		psReplicaSpec := &kubeflowv1.ReplicaSpec{
   168  			Replicas: &ps,
   169  			Template: NewTFReplicaSpecTemplate(),
   170  		}
   171  		tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypePS] = psReplicaSpec
   172  	}
   173  
   174  	if master > 0 {
   175  		master := int32(master)
   176  		masterReplicaSpec := &kubeflowv1.ReplicaSpec{
   177  			Replicas: &master,
   178  			Template: NewTFReplicaSpecTemplate(),
   179  		}
   180  		tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeMaster] = masterReplicaSpec
   181  	}
   182  
   183  	if chief > 0 {
   184  		chief := int32(chief)
   185  		chiefReplicaSpec := &kubeflowv1.ReplicaSpec{
   186  			Replicas: &chief,
   187  			Template: NewTFReplicaSpecTemplate(),
   188  		}
   189  		tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeChief] = chiefReplicaSpec
   190  	}
   191  
   192  	if evaluator > 0 {
   193  		evaluator := int32(evaluator)
   194  		evaluatorReplicaSpec := &kubeflowv1.ReplicaSpec{
   195  			Replicas: &evaluator,
   196  			Template: NewTFReplicaSpecTemplate(),
   197  		}
   198  		tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeChief] = evaluatorReplicaSpec
   199  	}
   200  	return tfJob
   201  }
   202  
   203  func NewTFJobWithNamespace(worker, ps int, ns string) *kubeflowv1.TFJob {
   204  	tfJob := NewTFJob(worker, ps)
   205  	tfJob.Namespace = ns
   206  
   207  	return tfJob
   208  }
   209  
   210  func NewTFJobWithEvaluatorAndNamespace(worker, ps, evaluator int, ns string) *kubeflowv1.TFJob {
   211  	tfJob := NewTFJobWithEvaluator(worker, ps, evaluator)
   212  	tfJob.Namespace = ns
   213  
   214  	return tfJob
   215  }
   216  
   217  func NewTFReplicaSpecTemplate() v1.PodTemplateSpec {
   218  	return v1.PodTemplateSpec{
   219  		Spec: v1.PodSpec{
   220  			Containers: []v1.Container{
   221  				v1.Container{
   222  					Name:  kubeflowv1.TFJobDefaultContainerName,
   223  					Image: "test-image-for-kubeflow-training-operator:latest",
   224  					Args:  []string{"Fake", "Fake"},
   225  					Ports: []v1.ContainerPort{
   226  						v1.ContainerPort{
   227  							Name:          kubeflowv1.TFJobDefaultPortName,
   228  							ContainerPort: kubeflowv1.TFJobDefaultPort,
   229  						},
   230  					},
   231  				},
   232  			},
   233  		},
   234  	}
   235  }
   236  
   237  func CheckCondition(tfJob *kubeflowv1.TFJob, condition kubeflowv1.JobConditionType, reason string) bool {
   238  	for _, v := range tfJob.Status.Conditions {
   239  		if v.Type == condition && v.Status == v1.ConditionTrue && v.Reason == reason {
   240  			return true
   241  		}
   242  	}
   243  	return false
   244  }