volcano.sh/volcano@v1.9.0/pkg/controllers/jobflow/jobflow_controller_action.go (about)

     1  /*
     2  Copyright 2022 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package jobflow
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"time"
    23  
    24  	corev1 "k8s.io/api/core/v1"
    25  	"k8s.io/apimachinery/pkg/api/errors"
    26  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    27  	"k8s.io/apimachinery/pkg/labels"
    28  	"k8s.io/apimachinery/pkg/selection"
    29  	"k8s.io/klog"
    30  	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    31  
    32  	"volcano.sh/apis/pkg/apis/batch/v1alpha1"
    33  	v1alpha1flow "volcano.sh/apis/pkg/apis/flow/v1alpha1"
    34  	"volcano.sh/apis/pkg/client/clientset/versioned/scheme"
    35  	"volcano.sh/volcano/pkg/controllers/jobflow/state"
    36  )
    37  
    38  func (jf *jobflowcontroller) syncJobFlow(jobFlow *v1alpha1flow.JobFlow, updateStateFn state.UpdateJobFlowStatusFn) error {
    39  	klog.V(4).Infof("Begin to sync JobFlow %s.", jobFlow.Name)
    40  	defer klog.V(4).Infof("End sync JobFlow %s.", jobFlow.Name)
    41  
    42  	// JobRetainPolicy Judging whether jobs are necessary to delete
    43  	if jobFlow.Spec.JobRetainPolicy == v1alpha1flow.Delete && jobFlow.Status.State.Phase == v1alpha1flow.Succeed {
    44  		if err := jf.deleteAllJobsCreatedByJobFlow(jobFlow); err != nil {
    45  			klog.Errorf("Failed to delete jobs of JobFlow %v/%v: %v",
    46  				jobFlow.Namespace, jobFlow.Name, err)
    47  			return err
    48  		}
    49  		return nil
    50  	}
    51  
    52  	// deploy job by dependence order.
    53  	if err := jf.deployJob(jobFlow); err != nil {
    54  		klog.Errorf("Failed to create jobs of JobFlow %v/%v: %v",
    55  			jobFlow.Namespace, jobFlow.Name, err)
    56  		return err
    57  	}
    58  
    59  	// update jobFlow status
    60  	jobFlowStatus, err := jf.getAllJobStatus(jobFlow)
    61  	if err != nil {
    62  		return err
    63  	}
    64  	jobFlow.Status = *jobFlowStatus
    65  	updateStateFn(&jobFlow.Status, len(jobFlow.Spec.Flows))
    66  	_, err = jf.vcClient.FlowV1alpha1().JobFlows(jobFlow.Namespace).UpdateStatus(context.Background(), jobFlow, metav1.UpdateOptions{})
    67  	if err != nil {
    68  		klog.Errorf("Failed to update status of JobFlow %v/%v: %v",
    69  			jobFlow.Namespace, jobFlow.Name, err)
    70  		return err
    71  	}
    72  
    73  	return nil
    74  }
    75  
    76  func (jf *jobflowcontroller) deployJob(jobFlow *v1alpha1flow.JobFlow) error {
    77  	// load jobTemplate by flow and deploy it
    78  	for _, flow := range jobFlow.Spec.Flows {
    79  		jobName := getJobName(jobFlow.Name, flow.Name)
    80  		if _, err := jf.jobLister.Jobs(jobFlow.Namespace).Get(jobName); err != nil {
    81  			if errors.IsNotFound(err) {
    82  				// If it is not distributed, judge whether the dependency of the VcJob meets the requirements
    83  				if flow.DependsOn == nil || flow.DependsOn.Targets == nil {
    84  					if err := jf.createJob(jobFlow, flow); err != nil {
    85  						return err
    86  					}
    87  				} else {
    88  					// query whether the dependencies of the job have been met
    89  					flag, err := jf.judge(jobFlow, flow)
    90  					if err != nil {
    91  						return err
    92  					}
    93  					if flag {
    94  						if err := jf.createJob(jobFlow, flow); err != nil {
    95  							return err
    96  						}
    97  					}
    98  				}
    99  				continue
   100  			}
   101  			return err
   102  		}
   103  	}
   104  	return nil
   105  }
   106  
   107  // judge query whether the dependencies of the job have been met. If it is satisfied, create the job, if not, judge the next job. Create the job if satisfied
   108  func (jf *jobflowcontroller) judge(jobFlow *v1alpha1flow.JobFlow, flow v1alpha1flow.Flow) (bool, error) {
   109  	for _, targetName := range flow.DependsOn.Targets {
   110  		targetJobName := getJobName(jobFlow.Name, targetName)
   111  		job, err := jf.jobLister.Jobs(jobFlow.Namespace).Get(targetJobName)
   112  		if err != nil {
   113  			if errors.IsNotFound(err) {
   114  				klog.Info(fmt.Sprintf("No %v Job found!", targetJobName))
   115  				return false, nil
   116  			}
   117  			return false, err
   118  		}
   119  		if job.Status.State.Phase != v1alpha1.Completed {
   120  			return false, nil
   121  		}
   122  	}
   123  	return true, nil
   124  }
   125  
   126  // createJob
   127  func (jf *jobflowcontroller) createJob(jobFlow *v1alpha1flow.JobFlow, flow v1alpha1flow.Flow) error {
   128  	job := new(v1alpha1.Job)
   129  	if err := jf.loadJobTemplateAndSetJob(jobFlow, flow.Name, getJobName(jobFlow.Name, flow.Name), job); err != nil {
   130  		return err
   131  	}
   132  	if _, err := jf.vcClient.BatchV1alpha1().Jobs(jobFlow.Namespace).Create(context.Background(), job, metav1.CreateOptions{}); err != nil {
   133  		if errors.IsAlreadyExists(err) {
   134  			return nil
   135  		}
   136  		return err
   137  	}
   138  	jf.recorder.Eventf(jobFlow, corev1.EventTypeNormal, "Created", fmt.Sprintf("create a job named %v!", job.Name))
   139  	return nil
   140  }
   141  
   142  // getAllJobStatus Get the information of all created jobs
   143  func (jf *jobflowcontroller) getAllJobStatus(jobFlow *v1alpha1flow.JobFlow) (*v1alpha1flow.JobFlowStatus, error) {
   144  	jobList, err := jf.getAllJobsCreatedByJobFlow(jobFlow)
   145  	if err != nil {
   146  		klog.Error(err, "get jobList error")
   147  		return nil, err
   148  	}
   149  
   150  	statusListJobMap := map[v1alpha1.JobPhase][]string{
   151  		v1alpha1.Pending:     make([]string, 0),
   152  		v1alpha1.Running:     make([]string, 0),
   153  		v1alpha1.Completing:  make([]string, 0),
   154  		v1alpha1.Completed:   make([]string, 0),
   155  		v1alpha1.Terminating: make([]string, 0),
   156  		v1alpha1.Terminated:  make([]string, 0),
   157  		v1alpha1.Failed:      make([]string, 0),
   158  	}
   159  
   160  	UnKnowJobs := make([]string, 0)
   161  	conditions := make(map[string]v1alpha1flow.Condition)
   162  	for _, job := range jobList {
   163  		if _, ok := statusListJobMap[job.Status.State.Phase]; ok {
   164  			statusListJobMap[job.Status.State.Phase] = append(statusListJobMap[job.Status.State.Phase], job.Name)
   165  		} else {
   166  			UnKnowJobs = append(UnKnowJobs, job.Name)
   167  		}
   168  		conditions[job.Name] = v1alpha1flow.Condition{
   169  			Phase:           job.Status.State.Phase,
   170  			CreateTimestamp: job.CreationTimestamp,
   171  			RunningDuration: job.Status.RunningDuration,
   172  			TaskStatusCount: job.Status.TaskStatusCount,
   173  		}
   174  	}
   175  	jobStatusList := make([]v1alpha1flow.JobStatus, 0)
   176  	if jobFlow.Status.JobStatusList != nil {
   177  		jobStatusList = jobFlow.Status.JobStatusList
   178  	}
   179  	for _, job := range jobList {
   180  		runningHistories := getRunningHistories(jobStatusList, job)
   181  		endTimeStamp := metav1.Time{}
   182  		if job.Status.RunningDuration != nil {
   183  			endTimeStamp = metav1.Time{Time: job.CreationTimestamp.Add(job.Status.RunningDuration.Duration)}
   184  		}
   185  		jobStatus := v1alpha1flow.JobStatus{
   186  			Name:             job.Name,
   187  			State:            job.Status.State.Phase,
   188  			StartTimestamp:   job.CreationTimestamp,
   189  			EndTimestamp:     endTimeStamp,
   190  			RestartCount:     job.Status.RetryCount,
   191  			RunningHistories: runningHistories,
   192  		}
   193  		jobFlag := true
   194  		for i := range jobStatusList {
   195  			if jobStatusList[i].Name == jobStatus.Name {
   196  				jobFlag = false
   197  				jobStatusList[i] = jobStatus
   198  			}
   199  		}
   200  		if jobFlag {
   201  			jobStatusList = append(jobStatusList, jobStatus)
   202  		}
   203  	}
   204  
   205  	jobFlowStatus := v1alpha1flow.JobFlowStatus{
   206  		PendingJobs:    statusListJobMap[v1alpha1.Pending],
   207  		RunningJobs:    statusListJobMap[v1alpha1.Running],
   208  		FailedJobs:     statusListJobMap[v1alpha1.Failed],
   209  		CompletedJobs:  statusListJobMap[v1alpha1.Completed],
   210  		TerminatedJobs: statusListJobMap[v1alpha1.Terminated],
   211  		UnKnowJobs:     UnKnowJobs,
   212  		JobStatusList:  jobStatusList,
   213  		Conditions:     conditions,
   214  		State:          jobFlow.Status.State,
   215  	}
   216  	return &jobFlowStatus, nil
   217  }
   218  
   219  func getRunningHistories(jobStatusList []v1alpha1flow.JobStatus, job *v1alpha1.Job) []v1alpha1flow.JobRunningHistory {
   220  	runningHistories := make([]v1alpha1flow.JobRunningHistory, 0)
   221  	flag := true
   222  	for _, jobStatusGet := range jobStatusList {
   223  		if jobStatusGet.Name == job.Name && jobStatusGet.RunningHistories != nil {
   224  			flag = false
   225  			runningHistories = jobStatusGet.RunningHistories
   226  			// State change
   227  			if len(runningHistories) == 0 {
   228  				continue
   229  			}
   230  			if runningHistories[len(runningHistories)-1].State != job.Status.State.Phase {
   231  				runningHistories[len(runningHistories)-1].EndTimestamp = metav1.Time{
   232  					Time: time.Now(),
   233  				}
   234  				runningHistories = append(runningHistories, v1alpha1flow.JobRunningHistory{
   235  					StartTimestamp: metav1.Time{Time: time.Now()},
   236  					EndTimestamp:   metav1.Time{},
   237  					State:          job.Status.State.Phase,
   238  				})
   239  			}
   240  		}
   241  	}
   242  	if flag && job.Status.State.Phase != "" {
   243  		runningHistories = append(runningHistories, v1alpha1flow.JobRunningHistory{
   244  			StartTimestamp: metav1.Time{
   245  				Time: time.Now(),
   246  			},
   247  			EndTimestamp: metav1.Time{},
   248  			State:        job.Status.State.Phase,
   249  		})
   250  	}
   251  	return runningHistories
   252  }
   253  
   254  func (jf *jobflowcontroller) loadJobTemplateAndSetJob(jobFlow *v1alpha1flow.JobFlow, flowName string, jobName string, job *v1alpha1.Job) error {
   255  	// load jobTemplate
   256  	jobTemplate, err := jf.jobTemplateLister.JobTemplates(jobFlow.Namespace).Get(flowName)
   257  	if err != nil {
   258  		return err
   259  	}
   260  
   261  	*job = v1alpha1.Job{
   262  		ObjectMeta: metav1.ObjectMeta{
   263  			Name:        jobName,
   264  			Namespace:   jobFlow.Namespace,
   265  			Labels:      map[string]string{CreatedByJobTemplate: GetTemplateString(jobFlow.Namespace, flowName)},
   266  			Annotations: map[string]string{CreatedByJobTemplate: GetTemplateString(jobFlow.Namespace, flowName)},
   267  		},
   268  		Spec:   jobTemplate.Spec,
   269  		Status: v1alpha1.JobStatus{},
   270  	}
   271  
   272  	return controllerutil.SetControllerReference(jobFlow, job, scheme.Scheme)
   273  }
   274  
   275  func (jf *jobflowcontroller) deleteAllJobsCreatedByJobFlow(jobFlow *v1alpha1flow.JobFlow) error {
   276  	jobList, err := jf.getAllJobsCreatedByJobFlow(jobFlow)
   277  	if err != nil {
   278  		return err
   279  	}
   280  
   281  	for _, job := range jobList {
   282  		err := jf.vcClient.BatchV1alpha1().Jobs(jobFlow.Namespace).Delete(context.Background(), job.Name, metav1.DeleteOptions{})
   283  		if err != nil {
   284  			klog.Errorf("Failed to delete job of JobFlow %v/%v: %v",
   285  				jobFlow.Namespace, jobFlow.Name, err)
   286  			return err
   287  		}
   288  	}
   289  	return nil
   290  }
   291  
   292  func (jf *jobflowcontroller) getAllJobsCreatedByJobFlow(jobFlow *v1alpha1flow.JobFlow) ([]*v1alpha1.Job, error) {
   293  	var flowNames []string
   294  	for _, flow := range jobFlow.Spec.Flows {
   295  		flowNames = append(flowNames, GetTemplateString(jobFlow.Namespace, flow.Name))
   296  	}
   297  	selector := labels.NewSelector()
   298  	r, err := labels.NewRequirement(CreatedByJobTemplate, selection.In, flowNames)
   299  	if err != nil {
   300  		return nil, err
   301  	}
   302  	selector = selector.Add(*r)
   303  	return jf.jobLister.Jobs(jobFlow.Namespace).List(selector)
   304  }