volcano.sh/volcano@v1.9.0/pkg/controllers/jobflow/jobflow_controller_action.go (about) 1 /* 2 Copyright 2022 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package jobflow 18 19 import ( 20 "context" 21 "fmt" 22 "time" 23 24 corev1 "k8s.io/api/core/v1" 25 "k8s.io/apimachinery/pkg/api/errors" 26 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 "k8s.io/apimachinery/pkg/labels" 28 "k8s.io/apimachinery/pkg/selection" 29 "k8s.io/klog" 30 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 31 32 "volcano.sh/apis/pkg/apis/batch/v1alpha1" 33 v1alpha1flow "volcano.sh/apis/pkg/apis/flow/v1alpha1" 34 "volcano.sh/apis/pkg/client/clientset/versioned/scheme" 35 "volcano.sh/volcano/pkg/controllers/jobflow/state" 36 ) 37 38 func (jf *jobflowcontroller) syncJobFlow(jobFlow *v1alpha1flow.JobFlow, updateStateFn state.UpdateJobFlowStatusFn) error { 39 klog.V(4).Infof("Begin to sync JobFlow %s.", jobFlow.Name) 40 defer klog.V(4).Infof("End sync JobFlow %s.", jobFlow.Name) 41 42 // JobRetainPolicy Judging whether jobs are necessary to delete 43 if jobFlow.Spec.JobRetainPolicy == v1alpha1flow.Delete && jobFlow.Status.State.Phase == v1alpha1flow.Succeed { 44 if err := jf.deleteAllJobsCreatedByJobFlow(jobFlow); err != nil { 45 klog.Errorf("Failed to delete jobs of JobFlow %v/%v: %v", 46 jobFlow.Namespace, jobFlow.Name, err) 47 return err 48 } 49 return nil 50 } 51 52 // deploy job by dependence order. 53 if err := jf.deployJob(jobFlow); err != nil { 54 klog.Errorf("Failed to create jobs of JobFlow %v/%v: %v", 55 jobFlow.Namespace, jobFlow.Name, err) 56 return err 57 } 58 59 // update jobFlow status 60 jobFlowStatus, err := jf.getAllJobStatus(jobFlow) 61 if err != nil { 62 return err 63 } 64 jobFlow.Status = *jobFlowStatus 65 updateStateFn(&jobFlow.Status, len(jobFlow.Spec.Flows)) 66 _, err = jf.vcClient.FlowV1alpha1().JobFlows(jobFlow.Namespace).UpdateStatus(context.Background(), jobFlow, metav1.UpdateOptions{}) 67 if err != nil { 68 klog.Errorf("Failed to update status of JobFlow %v/%v: %v", 69 jobFlow.Namespace, jobFlow.Name, err) 70 return err 71 } 72 73 return nil 74 } 75 76 func (jf *jobflowcontroller) deployJob(jobFlow *v1alpha1flow.JobFlow) error { 77 // load jobTemplate by flow and deploy it 78 for _, flow := range jobFlow.Spec.Flows { 79 jobName := getJobName(jobFlow.Name, flow.Name) 80 if _, err := jf.jobLister.Jobs(jobFlow.Namespace).Get(jobName); err != nil { 81 if errors.IsNotFound(err) { 82 // If it is not distributed, judge whether the dependency of the VcJob meets the requirements 83 if flow.DependsOn == nil || flow.DependsOn.Targets == nil { 84 if err := jf.createJob(jobFlow, flow); err != nil { 85 return err 86 } 87 } else { 88 // query whether the dependencies of the job have been met 89 flag, err := jf.judge(jobFlow, flow) 90 if err != nil { 91 return err 92 } 93 if flag { 94 if err := jf.createJob(jobFlow, flow); err != nil { 95 return err 96 } 97 } 98 } 99 continue 100 } 101 return err 102 } 103 } 104 return nil 105 } 106 107 // judge query whether the dependencies of the job have been met. If it is satisfied, create the job, if not, judge the next job. Create the job if satisfied 108 func (jf *jobflowcontroller) judge(jobFlow *v1alpha1flow.JobFlow, flow v1alpha1flow.Flow) (bool, error) { 109 for _, targetName := range flow.DependsOn.Targets { 110 targetJobName := getJobName(jobFlow.Name, targetName) 111 job, err := jf.jobLister.Jobs(jobFlow.Namespace).Get(targetJobName) 112 if err != nil { 113 if errors.IsNotFound(err) { 114 klog.Info(fmt.Sprintf("No %v Job found!", targetJobName)) 115 return false, nil 116 } 117 return false, err 118 } 119 if job.Status.State.Phase != v1alpha1.Completed { 120 return false, nil 121 } 122 } 123 return true, nil 124 } 125 126 // createJob 127 func (jf *jobflowcontroller) createJob(jobFlow *v1alpha1flow.JobFlow, flow v1alpha1flow.Flow) error { 128 job := new(v1alpha1.Job) 129 if err := jf.loadJobTemplateAndSetJob(jobFlow, flow.Name, getJobName(jobFlow.Name, flow.Name), job); err != nil { 130 return err 131 } 132 if _, err := jf.vcClient.BatchV1alpha1().Jobs(jobFlow.Namespace).Create(context.Background(), job, metav1.CreateOptions{}); err != nil { 133 if errors.IsAlreadyExists(err) { 134 return nil 135 } 136 return err 137 } 138 jf.recorder.Eventf(jobFlow, corev1.EventTypeNormal, "Created", fmt.Sprintf("create a job named %v!", job.Name)) 139 return nil 140 } 141 142 // getAllJobStatus Get the information of all created jobs 143 func (jf *jobflowcontroller) getAllJobStatus(jobFlow *v1alpha1flow.JobFlow) (*v1alpha1flow.JobFlowStatus, error) { 144 jobList, err := jf.getAllJobsCreatedByJobFlow(jobFlow) 145 if err != nil { 146 klog.Error(err, "get jobList error") 147 return nil, err 148 } 149 150 statusListJobMap := map[v1alpha1.JobPhase][]string{ 151 v1alpha1.Pending: make([]string, 0), 152 v1alpha1.Running: make([]string, 0), 153 v1alpha1.Completing: make([]string, 0), 154 v1alpha1.Completed: make([]string, 0), 155 v1alpha1.Terminating: make([]string, 0), 156 v1alpha1.Terminated: make([]string, 0), 157 v1alpha1.Failed: make([]string, 0), 158 } 159 160 UnKnowJobs := make([]string, 0) 161 conditions := make(map[string]v1alpha1flow.Condition) 162 for _, job := range jobList { 163 if _, ok := statusListJobMap[job.Status.State.Phase]; ok { 164 statusListJobMap[job.Status.State.Phase] = append(statusListJobMap[job.Status.State.Phase], job.Name) 165 } else { 166 UnKnowJobs = append(UnKnowJobs, job.Name) 167 } 168 conditions[job.Name] = v1alpha1flow.Condition{ 169 Phase: job.Status.State.Phase, 170 CreateTimestamp: job.CreationTimestamp, 171 RunningDuration: job.Status.RunningDuration, 172 TaskStatusCount: job.Status.TaskStatusCount, 173 } 174 } 175 jobStatusList := make([]v1alpha1flow.JobStatus, 0) 176 if jobFlow.Status.JobStatusList != nil { 177 jobStatusList = jobFlow.Status.JobStatusList 178 } 179 for _, job := range jobList { 180 runningHistories := getRunningHistories(jobStatusList, job) 181 endTimeStamp := metav1.Time{} 182 if job.Status.RunningDuration != nil { 183 endTimeStamp = metav1.Time{Time: job.CreationTimestamp.Add(job.Status.RunningDuration.Duration)} 184 } 185 jobStatus := v1alpha1flow.JobStatus{ 186 Name: job.Name, 187 State: job.Status.State.Phase, 188 StartTimestamp: job.CreationTimestamp, 189 EndTimestamp: endTimeStamp, 190 RestartCount: job.Status.RetryCount, 191 RunningHistories: runningHistories, 192 } 193 jobFlag := true 194 for i := range jobStatusList { 195 if jobStatusList[i].Name == jobStatus.Name { 196 jobFlag = false 197 jobStatusList[i] = jobStatus 198 } 199 } 200 if jobFlag { 201 jobStatusList = append(jobStatusList, jobStatus) 202 } 203 } 204 205 jobFlowStatus := v1alpha1flow.JobFlowStatus{ 206 PendingJobs: statusListJobMap[v1alpha1.Pending], 207 RunningJobs: statusListJobMap[v1alpha1.Running], 208 FailedJobs: statusListJobMap[v1alpha1.Failed], 209 CompletedJobs: statusListJobMap[v1alpha1.Completed], 210 TerminatedJobs: statusListJobMap[v1alpha1.Terminated], 211 UnKnowJobs: UnKnowJobs, 212 JobStatusList: jobStatusList, 213 Conditions: conditions, 214 State: jobFlow.Status.State, 215 } 216 return &jobFlowStatus, nil 217 } 218 219 func getRunningHistories(jobStatusList []v1alpha1flow.JobStatus, job *v1alpha1.Job) []v1alpha1flow.JobRunningHistory { 220 runningHistories := make([]v1alpha1flow.JobRunningHistory, 0) 221 flag := true 222 for _, jobStatusGet := range jobStatusList { 223 if jobStatusGet.Name == job.Name && jobStatusGet.RunningHistories != nil { 224 flag = false 225 runningHistories = jobStatusGet.RunningHistories 226 // State change 227 if len(runningHistories) == 0 { 228 continue 229 } 230 if runningHistories[len(runningHistories)-1].State != job.Status.State.Phase { 231 runningHistories[len(runningHistories)-1].EndTimestamp = metav1.Time{ 232 Time: time.Now(), 233 } 234 runningHistories = append(runningHistories, v1alpha1flow.JobRunningHistory{ 235 StartTimestamp: metav1.Time{Time: time.Now()}, 236 EndTimestamp: metav1.Time{}, 237 State: job.Status.State.Phase, 238 }) 239 } 240 } 241 } 242 if flag && job.Status.State.Phase != "" { 243 runningHistories = append(runningHistories, v1alpha1flow.JobRunningHistory{ 244 StartTimestamp: metav1.Time{ 245 Time: time.Now(), 246 }, 247 EndTimestamp: metav1.Time{}, 248 State: job.Status.State.Phase, 249 }) 250 } 251 return runningHistories 252 } 253 254 func (jf *jobflowcontroller) loadJobTemplateAndSetJob(jobFlow *v1alpha1flow.JobFlow, flowName string, jobName string, job *v1alpha1.Job) error { 255 // load jobTemplate 256 jobTemplate, err := jf.jobTemplateLister.JobTemplates(jobFlow.Namespace).Get(flowName) 257 if err != nil { 258 return err 259 } 260 261 *job = v1alpha1.Job{ 262 ObjectMeta: metav1.ObjectMeta{ 263 Name: jobName, 264 Namespace: jobFlow.Namespace, 265 Labels: map[string]string{CreatedByJobTemplate: GetTemplateString(jobFlow.Namespace, flowName)}, 266 Annotations: map[string]string{CreatedByJobTemplate: GetTemplateString(jobFlow.Namespace, flowName)}, 267 }, 268 Spec: jobTemplate.Spec, 269 Status: v1alpha1.JobStatus{}, 270 } 271 272 return controllerutil.SetControllerReference(jobFlow, job, scheme.Scheme) 273 } 274 275 func (jf *jobflowcontroller) deleteAllJobsCreatedByJobFlow(jobFlow *v1alpha1flow.JobFlow) error { 276 jobList, err := jf.getAllJobsCreatedByJobFlow(jobFlow) 277 if err != nil { 278 return err 279 } 280 281 for _, job := range jobList { 282 err := jf.vcClient.BatchV1alpha1().Jobs(jobFlow.Namespace).Delete(context.Background(), job.Name, metav1.DeleteOptions{}) 283 if err != nil { 284 klog.Errorf("Failed to delete job of JobFlow %v/%v: %v", 285 jobFlow.Namespace, jobFlow.Name, err) 286 return err 287 } 288 } 289 return nil 290 } 291 292 func (jf *jobflowcontroller) getAllJobsCreatedByJobFlow(jobFlow *v1alpha1flow.JobFlow) ([]*v1alpha1.Job, error) { 293 var flowNames []string 294 for _, flow := range jobFlow.Spec.Flows { 295 flowNames = append(flowNames, GetTemplateString(jobFlow.Namespace, flow.Name)) 296 } 297 selector := labels.NewSelector() 298 r, err := labels.NewRequirement(CreatedByJobTemplate, selection.In, flowNames) 299 if err != nil { 300 return nil, err 301 } 302 selector = selector.Add(*r) 303 return jf.jobLister.Jobs(jobFlow.Namespace).List(selector) 304 }