github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/tensorflow/testutil/tfjob.go (about) 1 // Copyright 2018 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package testutil 16 17 import ( 18 v1 "k8s.io/api/core/v1" 19 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 20 21 kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 22 ) 23 24 const TestTFJobName = "test-tfjob" 25 26 func NewTFJobWithCleanPolicy(chief, worker, ps int, policy kubeflowv1.CleanPodPolicy) *kubeflowv1.TFJob { 27 if chief == 1 { 28 tfJob := NewTFJobWithChief(worker, ps) 29 tfJob.Spec.RunPolicy.CleanPodPolicy = &policy 30 return tfJob 31 } 32 tfJob := NewTFJob(worker, ps) 33 tfJob.Spec.RunPolicy.CleanPodPolicy = &policy 34 return tfJob 35 } 36 37 func NewTFJobWithCleanupJobDelay(chief, worker, ps int, ttl *int32) *kubeflowv1.TFJob { 38 if chief == 1 { 39 tfJob := NewTFJobWithChief(worker, ps) 40 tfJob.Spec.RunPolicy.TTLSecondsAfterFinished = ttl 41 tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyNone) 42 return tfJob 43 } 44 tfJob := NewTFJob(worker, ps) 45 tfJob.Spec.RunPolicy.TTLSecondsAfterFinished = ttl 46 tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyNone) 47 return tfJob 48 } 49 50 func NewTFJobWithActiveDeadlineSeconds(chief, worker, ps int, ads *int64) *kubeflowv1.TFJob { 51 if chief == 1 { 52 tfJob := NewTFJobWithChief(worker, ps) 53 tfJob.Spec.RunPolicy.ActiveDeadlineSeconds = ads 54 tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyAll) 55 return tfJob 56 } 57 tfJob := NewTFJob(worker, ps) 58 tfJob.Spec.RunPolicy.ActiveDeadlineSeconds = ads 59 tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyAll) 60 return tfJob 61 } 62 63 func NewTFJobWithBackoffLimit(chief, worker, ps int, backoffLimit *int32) *kubeflowv1.TFJob { 64 if chief == 1 { 65 tfJob := NewTFJobWithChief(worker, ps) 66 tfJob.Spec.RunPolicy.BackoffLimit = backoffLimit 67 tfJob.Spec.TFReplicaSpecs["Worker"].RestartPolicy = "OnFailure" 68 tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyAll) 69 return tfJob 70 } 71 tfJob := NewTFJob(worker, ps) 72 tfJob.Spec.RunPolicy.BackoffLimit = backoffLimit 73 tfJob.Spec.TFReplicaSpecs["Worker"].RestartPolicy = "OnFailure" 74 tfJob.Spec.RunPolicy.CleanPodPolicy = kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyAll) 75 return tfJob 76 } 77 78 func NewTFJobWithChief(worker, ps int) *kubeflowv1.TFJob { 79 tfJob := NewTFJob(worker, ps) 80 chief := int32(1) 81 tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeChief] = &kubeflowv1.ReplicaSpec{ 82 Replicas: &chief, 83 Template: NewTFReplicaSpecTemplate(), 84 } 85 return tfJob 86 } 87 88 func NewTFJobWithEvaluator(worker, ps, evaluator int) *kubeflowv1.TFJob { 89 tfJob := NewTFJob(worker, ps) 90 if evaluator > 0 { 91 evaluator := int32(evaluator) 92 tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeEval] = &kubeflowv1.ReplicaSpec{ 93 Replicas: &evaluator, 94 Template: NewTFReplicaSpecTemplate(), 95 } 96 } 97 return tfJob 98 } 99 100 func NewTFJobWithSuccessPolicy(worker, ps int, successPolicy kubeflowv1.SuccessPolicy) *kubeflowv1.TFJob { 101 tfJob := NewTFJob(worker, ps) 102 tfJob.Spec.SuccessPolicy = &successPolicy 103 return tfJob 104 } 105 106 func NewTFJob(worker, ps int) *kubeflowv1.TFJob { 107 tfJob := &kubeflowv1.TFJob{ 108 TypeMeta: metav1.TypeMeta{ 109 Kind: kubeflowv1.TFJobKind, 110 }, 111 ObjectMeta: metav1.ObjectMeta{ 112 Name: TestTFJobName, 113 Namespace: metav1.NamespaceDefault, 114 }, 115 Spec: kubeflowv1.TFJobSpec{ 116 TFReplicaSpecs: make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec), 117 }, 118 } 119 kubeflowv1.SetObjectDefaults_TFJob(tfJob) 120 121 if worker > 0 { 122 worker := int32(worker) 123 workerReplicaSpec := &kubeflowv1.ReplicaSpec{ 124 Replicas: &worker, 125 Template: NewTFReplicaSpecTemplate(), 126 } 127 tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker] = workerReplicaSpec 128 } 129 130 if ps > 0 { 131 ps := int32(ps) 132 psReplicaSpec := &kubeflowv1.ReplicaSpec{ 133 Replicas: &ps, 134 Template: NewTFReplicaSpecTemplate(), 135 } 136 tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypePS] = psReplicaSpec 137 } 138 return tfJob 139 } 140 141 func NewTFJobV2(worker, ps, master, chief, evaluator int) *kubeflowv1.TFJob { 142 tfJob := &kubeflowv1.TFJob{ 143 TypeMeta: metav1.TypeMeta{ 144 Kind: kubeflowv1.TFJobKind, 145 }, 146 ObjectMeta: metav1.ObjectMeta{ 147 Name: TestTFJobName, 148 Namespace: metav1.NamespaceDefault, 149 }, 150 Spec: kubeflowv1.TFJobSpec{ 151 TFReplicaSpecs: make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec), 152 }, 153 } 154 kubeflowv1.SetObjectDefaults_TFJob(tfJob) 155 156 if worker > 0 { 157 worker := int32(worker) 158 workerReplicaSpec := &kubeflowv1.ReplicaSpec{ 159 Replicas: &worker, 160 Template: NewTFReplicaSpecTemplate(), 161 } 162 tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker] = workerReplicaSpec 163 } 164 165 if ps > 0 { 166 ps := int32(ps) 167 psReplicaSpec := &kubeflowv1.ReplicaSpec{ 168 Replicas: &ps, 169 Template: NewTFReplicaSpecTemplate(), 170 } 171 tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypePS] = psReplicaSpec 172 } 173 174 if master > 0 { 175 master := int32(master) 176 masterReplicaSpec := &kubeflowv1.ReplicaSpec{ 177 Replicas: &master, 178 Template: NewTFReplicaSpecTemplate(), 179 } 180 tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeMaster] = masterReplicaSpec 181 } 182 183 if chief > 0 { 184 chief := int32(chief) 185 chiefReplicaSpec := &kubeflowv1.ReplicaSpec{ 186 Replicas: &chief, 187 Template: NewTFReplicaSpecTemplate(), 188 } 189 tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeChief] = chiefReplicaSpec 190 } 191 192 if evaluator > 0 { 193 evaluator := int32(evaluator) 194 evaluatorReplicaSpec := &kubeflowv1.ReplicaSpec{ 195 Replicas: &evaluator, 196 Template: NewTFReplicaSpecTemplate(), 197 } 198 tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeChief] = evaluatorReplicaSpec 199 } 200 return tfJob 201 } 202 203 func NewTFJobWithNamespace(worker, ps int, ns string) *kubeflowv1.TFJob { 204 tfJob := NewTFJob(worker, ps) 205 tfJob.Namespace = ns 206 207 return tfJob 208 } 209 210 func NewTFJobWithEvaluatorAndNamespace(worker, ps, evaluator int, ns string) *kubeflowv1.TFJob { 211 tfJob := NewTFJobWithEvaluator(worker, ps, evaluator) 212 tfJob.Namespace = ns 213 214 return tfJob 215 } 216 217 func NewTFReplicaSpecTemplate() v1.PodTemplateSpec { 218 return v1.PodTemplateSpec{ 219 Spec: v1.PodSpec{ 220 Containers: []v1.Container{ 221 v1.Container{ 222 Name: kubeflowv1.TFJobDefaultContainerName, 223 Image: "test-image-for-kubeflow-training-operator:latest", 224 Args: []string{"Fake", "Fake"}, 225 Ports: []v1.ContainerPort{ 226 v1.ContainerPort{ 227 Name: kubeflowv1.TFJobDefaultPortName, 228 ContainerPort: kubeflowv1.TFJobDefaultPort, 229 }, 230 }, 231 }, 232 }, 233 }, 234 } 235 } 236 237 func CheckCondition(tfJob *kubeflowv1.TFJob, condition kubeflowv1.JobConditionType, reason string) bool { 238 for _, v := range tfJob.Status.Conditions { 239 if v.Type == condition && v.Status == v1.ConditionTrue && v.Reason == reason { 240 return true 241 } 242 } 243 return false 244 }