volcano.sh/volcano@v1.9.0/pkg/controllers/job/job_controller_handler.go (about)

     1  /*
     2  Copyright 2017 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package job
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"reflect"
    23  	"strconv"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	"k8s.io/client-go/tools/cache"
    29  	"k8s.io/klog/v2"
    30  
    31  	batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
    32  	bus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
    33  	"volcano.sh/apis/pkg/apis/helpers"
    34  	scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    35  	"volcano.sh/volcano/pkg/controllers/apis"
    36  	jobcache "volcano.sh/volcano/pkg/controllers/cache"
    37  	jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
    38  )
    39  
    40  func (cc *jobcontroller) addCommand(obj interface{}) {
    41  	cmd, ok := obj.(*bus.Command)
    42  	if !ok {
    43  		klog.Errorf("obj is not Command")
    44  		return
    45  	}
    46  
    47  	cc.commandQueue.Add(cmd)
    48  }
    49  
    50  func (cc *jobcontroller) addJob(obj interface{}) {
    51  	job, ok := obj.(*batch.Job)
    52  	if !ok {
    53  		klog.Errorf("obj is not Job")
    54  		return
    55  	}
    56  
    57  	req := apis.Request{
    58  		Namespace: job.Namespace,
    59  		JobName:   job.Name,
    60  
    61  		Event: bus.OutOfSyncEvent,
    62  	}
    63  
    64  	// TODO(k82cn): if failed to add job, the cache should be refresh
    65  	if err := cc.cache.Add(job); err != nil {
    66  		klog.Errorf("Failed to add job <%s/%s>: %v in cache",
    67  			job.Namespace, job.Name, err)
    68  	}
    69  	key := jobhelpers.GetJobKeyByReq(&req)
    70  	queue := cc.getWorkerQueue(key)
    71  	queue.Add(req)
    72  }
    73  
    74  func (cc *jobcontroller) updateJob(oldObj, newObj interface{}) {
    75  	newJob, ok := newObj.(*batch.Job)
    76  	if !ok {
    77  		klog.Errorf("newObj is not Job")
    78  		return
    79  	}
    80  
    81  	oldJob, ok := oldObj.(*batch.Job)
    82  	if !ok {
    83  		klog.Errorf("oldJob is not Job")
    84  		return
    85  	}
    86  
    87  	// No need to update if ResourceVersion is not changed
    88  	if newJob.ResourceVersion == oldJob.ResourceVersion {
    89  		klog.V(6).Infof("No need to update because job is not modified.")
    90  		return
    91  	}
    92  
    93  	if err := cc.cache.Update(newJob); err != nil {
    94  		klog.Errorf("UpdateJob - Failed to update job <%s/%s>: %v in cache",
    95  			newJob.Namespace, newJob.Name, err)
    96  	}
    97  
    98  	// NOTE: Since we only reconcile job based on Spec, we will ignore other attributes
    99  	// For Job status, it's used internally and always been updated via our controller.
   100  	if reflect.DeepEqual(newJob.Spec, oldJob.Spec) && newJob.Status.State.Phase == oldJob.Status.State.Phase {
   101  		klog.V(6).Infof("Job update event is ignored since no update in 'Spec'.")
   102  		return
   103  	}
   104  
   105  	req := apis.Request{
   106  		Namespace: newJob.Namespace,
   107  		JobName:   newJob.Name,
   108  		Event:     bus.OutOfSyncEvent,
   109  	}
   110  	key := jobhelpers.GetJobKeyByReq(&req)
   111  	queue := cc.getWorkerQueue(key)
   112  	queue.Add(req)
   113  }
   114  
   115  func (cc *jobcontroller) deleteJob(obj interface{}) {
   116  	job, ok := obj.(*batch.Job)
   117  	if !ok {
   118  		// If we reached here it means the Job was deleted but its final state is unrecorded.
   119  		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
   120  		if !ok {
   121  			klog.Errorf("Couldn't get object from tombstone %#v", obj)
   122  			return
   123  		}
   124  		job, ok = tombstone.Obj.(*batch.Job)
   125  		if !ok {
   126  			klog.Errorf("Tombstone contained object that is not a volcano Job: %#v", obj)
   127  			return
   128  		}
   129  	}
   130  
   131  	if err := cc.cache.Delete(job); err != nil {
   132  		klog.Errorf("Failed to delete job <%s/%s>: %v in cache",
   133  			job.Namespace, job.Name, err)
   134  	}
   135  }
   136  
   137  func (cc *jobcontroller) addPod(obj interface{}) {
   138  	pod, ok := obj.(*v1.Pod)
   139  	if !ok {
   140  		klog.Errorf("Failed to convert %v to v1.Pod", obj)
   141  		return
   142  	}
   143  	// Filter out pods that are not created from volcano job
   144  	if !isControlledBy(pod, helpers.JobKind) {
   145  		return
   146  	}
   147  
   148  	jobName, found := pod.Annotations[batch.JobNameKey]
   149  	if !found {
   150  		klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
   151  			pod.Namespace, pod.Name)
   152  		return
   153  	}
   154  
   155  	version, found := pod.Annotations[batch.JobVersion]
   156  	if !found {
   157  		klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
   158  			pod.Namespace, pod.Name)
   159  		return
   160  	}
   161  
   162  	dVersion, err := strconv.Atoi(version)
   163  	if err != nil {
   164  		klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping",
   165  			pod.Namespace, pod.Name)
   166  		return
   167  	}
   168  
   169  	if pod.DeletionTimestamp != nil {
   170  		cc.deletePod(pod)
   171  		return
   172  	}
   173  
   174  	req := apis.Request{
   175  		Namespace: pod.Namespace,
   176  		JobName:   jobName,
   177  
   178  		Event:      bus.OutOfSyncEvent,
   179  		JobVersion: int32(dVersion),
   180  	}
   181  
   182  	if err := cc.cache.AddPod(pod); err != nil {
   183  		klog.Errorf("Failed to add Pod <%s/%s>: %v to cache",
   184  			pod.Namespace, pod.Name, err)
   185  	}
   186  	key := jobhelpers.GetJobKeyByReq(&req)
   187  	queue := cc.getWorkerQueue(key)
   188  	queue.Add(req)
   189  }
   190  
   191  func (cc *jobcontroller) updatePod(oldObj, newObj interface{}) {
   192  	oldPod, ok := oldObj.(*v1.Pod)
   193  	if !ok {
   194  		klog.Errorf("Failed to convert %v to v1.Pod", oldObj)
   195  		return
   196  	}
   197  
   198  	newPod, ok := newObj.(*v1.Pod)
   199  	if !ok {
   200  		klog.Errorf("Failed to convert %v to v1.Pod", newObj)
   201  		return
   202  	}
   203  
   204  	// Filter out pods that are not created from volcano job
   205  	if !isControlledBy(newPod, helpers.JobKind) {
   206  		return
   207  	}
   208  
   209  	if newPod.ResourceVersion == oldPod.ResourceVersion {
   210  		return
   211  	}
   212  
   213  	if newPod.DeletionTimestamp != nil {
   214  		cc.deletePod(newObj)
   215  		return
   216  	}
   217  
   218  	taskName, found := newPod.Annotations[batch.TaskSpecKey]
   219  	if !found {
   220  		klog.Infof("Failed to find taskName of Pod <%s/%s>, skipping",
   221  			newPod.Namespace, newPod.Name)
   222  		return
   223  	}
   224  
   225  	jobName, found := newPod.Annotations[batch.JobNameKey]
   226  	if !found {
   227  		klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
   228  			newPod.Namespace, newPod.Name)
   229  		return
   230  	}
   231  
   232  	version, found := newPod.Annotations[batch.JobVersion]
   233  	if !found {
   234  		klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
   235  			newPod.Namespace, newPod.Name)
   236  		return
   237  	}
   238  
   239  	dVersion, err := strconv.Atoi(version)
   240  	if err != nil {
   241  		klog.Infof("Failed to convert jobVersion of Pod into number <%s/%s>, skipping",
   242  			newPod.Namespace, newPod.Name)
   243  		return
   244  	}
   245  
   246  	if err := cc.cache.UpdatePod(newPod); err != nil {
   247  		klog.Errorf("Failed to update Pod <%s/%s>: %v in cache",
   248  			newPod.Namespace, newPod.Name, err)
   249  	}
   250  
   251  	event := bus.OutOfSyncEvent
   252  	var exitCode int32
   253  
   254  	switch newPod.Status.Phase {
   255  	case v1.PodFailed:
   256  		if oldPod.Status.Phase != v1.PodFailed {
   257  			event = bus.PodFailedEvent
   258  			// TODO: currently only one container pod is supported by volcano
   259  			// Once multi containers pod is supported, update accordingly.
   260  			if len(newPod.Status.ContainerStatuses) > 0 && newPod.Status.ContainerStatuses[0].State.Terminated != nil {
   261  				exitCode = newPod.Status.ContainerStatuses[0].State.Terminated.ExitCode
   262  			}
   263  		}
   264  	case v1.PodSucceeded:
   265  		if oldPod.Status.Phase != v1.PodSucceeded &&
   266  			cc.cache.TaskCompleted(jobcache.JobKeyByName(newPod.Namespace, jobName), taskName) {
   267  			event = bus.TaskCompletedEvent
   268  		}
   269  	case v1.PodPending, v1.PodRunning:
   270  		if cc.cache.TaskFailed(jobcache.JobKeyByName(newPod.Namespace, jobName), taskName) {
   271  			event = bus.TaskFailedEvent
   272  		}
   273  	}
   274  
   275  	req := apis.Request{
   276  		Namespace: newPod.Namespace,
   277  		JobName:   jobName,
   278  		TaskName:  taskName,
   279  
   280  		Event:      event,
   281  		ExitCode:   exitCode,
   282  		JobVersion: int32(dVersion),
   283  	}
   284  
   285  	key := jobhelpers.GetJobKeyByReq(&req)
   286  	queue := cc.getWorkerQueue(key)
   287  	queue.Add(req)
   288  }
   289  
   290  func (cc *jobcontroller) deletePod(obj interface{}) {
   291  	pod, ok := obj.(*v1.Pod)
   292  	if !ok {
   293  		// If we reached here it means the pod was deleted but its final state is unrecorded.
   294  		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
   295  		if !ok {
   296  			klog.Errorf("Couldn't get object from tombstone %#v", obj)
   297  			return
   298  		}
   299  		pod, ok = tombstone.Obj.(*v1.Pod)
   300  		if !ok {
   301  			klog.Errorf("Tombstone contained object that is not a Pod: %#v", obj)
   302  			return
   303  		}
   304  	}
   305  
   306  	// Filter out pods that are not created from volcano job
   307  	if !isControlledBy(pod, helpers.JobKind) {
   308  		return
   309  	}
   310  
   311  	taskName, found := pod.Annotations[batch.TaskSpecKey]
   312  	if !found {
   313  		klog.Infof("Failed to find taskName of Pod <%s/%s>, skipping",
   314  			pod.Namespace, pod.Name)
   315  		return
   316  	}
   317  
   318  	jobName, found := pod.Annotations[batch.JobNameKey]
   319  	if !found {
   320  		klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
   321  			pod.Namespace, pod.Name)
   322  		return
   323  	}
   324  
   325  	version, found := pod.Annotations[batch.JobVersion]
   326  	if !found {
   327  		klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
   328  			pod.Namespace, pod.Name)
   329  		return
   330  	}
   331  
   332  	dVersion, err := strconv.Atoi(version)
   333  	if err != nil {
   334  		klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping",
   335  			pod.Namespace, pod.Name)
   336  		return
   337  	}
   338  
   339  	req := apis.Request{
   340  		Namespace: pod.Namespace,
   341  		JobName:   jobName,
   342  		TaskName:  taskName,
   343  
   344  		Event:      bus.PodEvictedEvent,
   345  		JobVersion: int32(dVersion),
   346  	}
   347  
   348  	if err := cc.cache.DeletePod(pod); err != nil {
   349  		klog.Errorf("Failed to delete Pod <%s/%s>: %v in cache",
   350  			pod.Namespace, pod.Name, err)
   351  	}
   352  
   353  	key := jobhelpers.GetJobKeyByReq(&req)
   354  	queue := cc.getWorkerQueue(key)
   355  	queue.Add(req)
   356  }
   357  
   358  func (cc *jobcontroller) recordJobEvent(namespace, name string, event batch.JobEvent, message string) {
   359  	job, err := cc.cache.Get(jobcache.JobKeyByName(namespace, name))
   360  	if err != nil {
   361  		klog.Warningf("Failed to find job in cache when reporting job event <%s/%s>: %v",
   362  			namespace, name, err)
   363  		return
   364  	}
   365  	cc.recorder.Event(job.Job, v1.EventTypeNormal, string(event), message)
   366  }
   367  
   368  func (cc *jobcontroller) handleCommands() {
   369  	for cc.processNextCommand() {
   370  	}
   371  }
   372  
   373  func (cc *jobcontroller) processNextCommand() bool {
   374  	obj, shutdown := cc.commandQueue.Get()
   375  	if shutdown {
   376  		return false
   377  	}
   378  	cmd := obj.(*bus.Command)
   379  	defer cc.commandQueue.Done(cmd)
   380  
   381  	if err := cc.vcClient.BusV1alpha1().Commands(cmd.Namespace).Delete(context.TODO(), cmd.Name, metav1.DeleteOptions{}); err != nil {
   382  		if !apierrors.IsNotFound(err) {
   383  			klog.Errorf("Failed to delete Command <%s/%s>.", cmd.Namespace, cmd.Name)
   384  			cc.commandQueue.AddRateLimited(cmd)
   385  		}
   386  		return true
   387  	}
   388  	cc.recordJobEvent(cmd.Namespace, cmd.TargetObject.Name,
   389  		batch.CommandIssued,
   390  		fmt.Sprintf(
   391  			"Start to execute command %s, and clean it up to make sure executed not more than once.", cmd.Action))
   392  	req := apis.Request{
   393  		Namespace: cmd.Namespace,
   394  		JobName:   cmd.TargetObject.Name,
   395  		Event:     bus.CommandIssuedEvent,
   396  		Action:    bus.Action(cmd.Action),
   397  	}
   398  
   399  	key := jobhelpers.GetJobKeyByReq(&req)
   400  	queue := cc.getWorkerQueue(key)
   401  	queue.Add(req)
   402  
   403  	return true
   404  }
   405  
   406  func (cc *jobcontroller) updatePodGroup(oldObj, newObj interface{}) {
   407  	oldPG, ok := oldObj.(*scheduling.PodGroup)
   408  	if !ok {
   409  		klog.Errorf("Failed to convert %v to PodGroup", newObj)
   410  		return
   411  	}
   412  
   413  	newPG, ok := newObj.(*scheduling.PodGroup)
   414  	if !ok {
   415  		klog.Errorf("Failed to convert %v to PodGroup", newObj)
   416  		return
   417  	}
   418  
   419  	jobNameKey := newPG.Name
   420  	ors := newPG.OwnerReferences
   421  	for _, or := range ors {
   422  		if or.Kind == "Job" {
   423  			jobNameKey = or.Name
   424  		}
   425  	}
   426  
   427  	_, err := cc.cache.Get(jobcache.JobKeyByName(newPG.Namespace, jobNameKey))
   428  	if err != nil && newPG.Annotations != nil {
   429  		klog.Warningf(
   430  			"Failed to find job in cache by PodGroup(%s/%s), this may not be a PodGroup for volcano job.", newPG.Namespace, newPG.Name)
   431  	}
   432  
   433  	if newPG.Status.Phase != oldPG.Status.Phase {
   434  		req := apis.Request{
   435  			Namespace: newPG.Namespace,
   436  			JobName:   jobNameKey,
   437  		}
   438  		switch newPG.Status.Phase {
   439  		case scheduling.PodGroupUnknown:
   440  			req.Event = bus.JobUnknownEvent
   441  		}
   442  		key := jobhelpers.GetJobKeyByReq(&req)
   443  		queue := cc.getWorkerQueue(key)
   444  		queue.Add(req)
   445  	}
   446  }
   447  
   448  // TODO(k82cn): add handler for PodGroup unschedulable event.