volcano.sh/volcano@v1.9.0/pkg/controllers/job/job_controller.go (about)

     1  /*
     2  Copyright 2017 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package job
    18  
    19  import (
    20  	"fmt"
    21  	"hash"
    22  	"hash/fnv"
    23  	"time"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/apimachinery/pkg/util/wait"
    27  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    28  	"k8s.io/client-go/informers"
    29  	coreinformers "k8s.io/client-go/informers/core/v1"
    30  	kubeschedulinginformers "k8s.io/client-go/informers/scheduling/v1"
    31  	"k8s.io/client-go/kubernetes"
    32  	corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    33  	corelisters "k8s.io/client-go/listers/core/v1"
    34  	kubeschedulinglisters "k8s.io/client-go/listers/scheduling/v1"
    35  	"k8s.io/client-go/tools/cache"
    36  	"k8s.io/client-go/tools/record"
    37  	"k8s.io/client-go/util/workqueue"
    38  	"k8s.io/klog/v2"
    39  
    40  	batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
    41  	busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
    42  	vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
    43  	vcscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
    44  	informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
    45  	vcinformer "volcano.sh/apis/pkg/client/informers/externalversions"
    46  	batchinformer "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1"
    47  	businformer "volcano.sh/apis/pkg/client/informers/externalversions/bus/v1alpha1"
    48  	schedulinginformers "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
    49  	batchlister "volcano.sh/apis/pkg/client/listers/batch/v1alpha1"
    50  	buslister "volcano.sh/apis/pkg/client/listers/bus/v1alpha1"
    51  	schedulinglisters "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
    52  
    53  	"volcano.sh/volcano/pkg/controllers/apis"
    54  	jobcache "volcano.sh/volcano/pkg/controllers/cache"
    55  	"volcano.sh/volcano/pkg/controllers/framework"
    56  	"volcano.sh/volcano/pkg/controllers/job/state"
    57  	"volcano.sh/volcano/pkg/features"
    58  )
    59  
    60  func init() {
    61  	framework.RegisterController(&jobcontroller{})
    62  }
    63  
    64  // jobcontroller the Job jobcontroller type.
    65  type jobcontroller struct {
    66  	kubeClient kubernetes.Interface
    67  	vcClient   vcclientset.Interface
    68  
    69  	jobInformer   batchinformer.JobInformer
    70  	podInformer   coreinformers.PodInformer
    71  	pvcInformer   coreinformers.PersistentVolumeClaimInformer
    72  	pgInformer    schedulinginformers.PodGroupInformer
    73  	svcInformer   coreinformers.ServiceInformer
    74  	cmdInformer   businformer.CommandInformer
    75  	pcInformer    kubeschedulinginformers.PriorityClassInformer
    76  	queueInformer schedulinginformers.QueueInformer
    77  
    78  	informerFactory   informers.SharedInformerFactory
    79  	vcInformerFactory vcinformer.SharedInformerFactory
    80  
    81  	// A store of jobs
    82  	jobLister batchlister.JobLister
    83  	jobSynced func() bool
    84  
    85  	// A store of pods
    86  	podLister corelisters.PodLister
    87  	podSynced func() bool
    88  
    89  	pvcLister corelisters.PersistentVolumeClaimLister
    90  	pvcSynced func() bool
    91  
    92  	// A store of podgroups
    93  	pgLister schedulinglisters.PodGroupLister
    94  	pgSynced func() bool
    95  
    96  	// A store of service
    97  	svcLister corelisters.ServiceLister
    98  	svcSynced func() bool
    99  
   100  	cmdLister buslister.CommandLister
   101  	cmdSynced func() bool
   102  
   103  	pcLister kubeschedulinglisters.PriorityClassLister
   104  	pcSynced func() bool
   105  
   106  	queueLister schedulinglisters.QueueLister
   107  	queueSynced func() bool
   108  
   109  	// queue that need to sync up
   110  	queueList    []workqueue.RateLimitingInterface
   111  	commandQueue workqueue.RateLimitingInterface
   112  	cache        jobcache.Cache
   113  	// Job Event recorder
   114  	recorder record.EventRecorder
   115  
   116  	errTasks      workqueue.RateLimitingInterface
   117  	workers       uint32
   118  	maxRequeueNum int
   119  }
   120  
   121  func (cc *jobcontroller) Name() string {
   122  	return "job-controller"
   123  }
   124  
   125  // Initialize creates the new Job job controller.
   126  func (cc *jobcontroller) Initialize(opt *framework.ControllerOption) error {
   127  	cc.kubeClient = opt.KubeClient
   128  	cc.vcClient = opt.VolcanoClient
   129  
   130  	sharedInformers := opt.SharedInformerFactory
   131  	workers := opt.WorkerNum
   132  	// Initialize event client
   133  	eventBroadcaster := record.NewBroadcaster()
   134  	eventBroadcaster.StartLogging(klog.Infof)
   135  	eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: cc.kubeClient.CoreV1().Events("")})
   136  	recorder := eventBroadcaster.NewRecorder(vcscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"})
   137  
   138  	cc.informerFactory = sharedInformers
   139  	cc.queueList = make([]workqueue.RateLimitingInterface, workers)
   140  	cc.commandQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
   141  	cc.cache = jobcache.New()
   142  	cc.errTasks = newRateLimitingQueue()
   143  	cc.recorder = recorder
   144  	cc.workers = workers
   145  	cc.maxRequeueNum = opt.MaxRequeueNum
   146  	if cc.maxRequeueNum < 0 {
   147  		cc.maxRequeueNum = -1
   148  	}
   149  
   150  	var i uint32
   151  	for i = 0; i < workers; i++ {
   152  		cc.queueList[i] = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
   153  	}
   154  
   155  	factory := informerfactory.NewSharedInformerFactory(cc.vcClient, 0)
   156  	cc.vcInformerFactory = factory
   157  	if utilfeature.DefaultFeatureGate.Enabled(features.WorkLoadSupport) {
   158  		cc.jobInformer = factory.Batch().V1alpha1().Jobs()
   159  		cc.jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   160  			AddFunc:    cc.addJob,
   161  			UpdateFunc: cc.updateJob,
   162  			DeleteFunc: cc.deleteJob,
   163  		})
   164  		cc.jobLister = cc.jobInformer.Lister()
   165  		cc.jobSynced = cc.jobInformer.Informer().HasSynced
   166  	}
   167  
   168  	if utilfeature.DefaultFeatureGate.Enabled(features.QueueCommandSync) {
   169  		cc.cmdInformer = factory.Bus().V1alpha1().Commands()
   170  		cc.cmdInformer.Informer().AddEventHandler(
   171  			cache.FilteringResourceEventHandler{
   172  				FilterFunc: func(obj interface{}) bool {
   173  					switch v := obj.(type) {
   174  					case *busv1alpha1.Command:
   175  						if v.TargetObject != nil &&
   176  							v.TargetObject.APIVersion == batchv1alpha1.SchemeGroupVersion.String() &&
   177  							v.TargetObject.Kind == "Job" {
   178  							return true
   179  						}
   180  
   181  						return false
   182  					default:
   183  						return false
   184  					}
   185  				},
   186  				Handler: cache.ResourceEventHandlerFuncs{
   187  					AddFunc: cc.addCommand,
   188  				},
   189  			},
   190  		)
   191  		cc.cmdLister = cc.cmdInformer.Lister()
   192  		cc.cmdSynced = cc.cmdInformer.Informer().HasSynced
   193  	}
   194  
   195  	cc.podInformer = sharedInformers.Core().V1().Pods()
   196  	cc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   197  		AddFunc:    cc.addPod,
   198  		UpdateFunc: cc.updatePod,
   199  		DeleteFunc: cc.deletePod,
   200  	})
   201  
   202  	cc.podLister = cc.podInformer.Lister()
   203  	cc.podSynced = cc.podInformer.Informer().HasSynced
   204  
   205  	cc.pvcInformer = sharedInformers.Core().V1().PersistentVolumeClaims()
   206  	cc.pvcLister = cc.pvcInformer.Lister()
   207  	cc.pvcSynced = cc.pvcInformer.Informer().HasSynced
   208  
   209  	cc.svcInformer = sharedInformers.Core().V1().Services()
   210  	cc.svcLister = cc.svcInformer.Lister()
   211  	cc.svcSynced = cc.svcInformer.Informer().HasSynced
   212  
   213  	cc.pgInformer = factory.Scheduling().V1beta1().PodGroups()
   214  	cc.pgInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   215  		UpdateFunc: cc.updatePodGroup,
   216  	})
   217  	cc.pgLister = cc.pgInformer.Lister()
   218  	cc.pgSynced = cc.pgInformer.Informer().HasSynced
   219  
   220  	if utilfeature.DefaultFeatureGate.Enabled(features.PriorityClass) {
   221  		cc.pcInformer = sharedInformers.Scheduling().V1().PriorityClasses()
   222  		cc.pcLister = cc.pcInformer.Lister()
   223  		cc.pcSynced = cc.pcInformer.Informer().HasSynced
   224  	}
   225  
   226  	cc.queueInformer = factory.Scheduling().V1beta1().Queues()
   227  	cc.queueLister = cc.queueInformer.Lister()
   228  	cc.queueSynced = cc.queueInformer.Informer().HasSynced
   229  
   230  	// Register actions
   231  	state.SyncJob = cc.syncJob
   232  	state.KillJob = cc.killJob
   233  
   234  	return nil
   235  }
   236  
   237  // Run start JobController.
   238  func (cc *jobcontroller) Run(stopCh <-chan struct{}) {
   239  	cc.informerFactory.Start(stopCh)
   240  	cc.vcInformerFactory.Start(stopCh)
   241  
   242  	for informerType, ok := range cc.informerFactory.WaitForCacheSync(stopCh) {
   243  		if !ok {
   244  			klog.Errorf("caches failed to sync: %v", informerType)
   245  			return
   246  		}
   247  	}
   248  
   249  	for informerType, ok := range cc.vcInformerFactory.WaitForCacheSync(stopCh) {
   250  		if !ok {
   251  			klog.Errorf("caches failed to sync: %v", informerType)
   252  			return
   253  		}
   254  	}
   255  
   256  	go wait.Until(cc.handleCommands, 0, stopCh)
   257  	var i uint32
   258  	for i = 0; i < cc.workers; i++ {
   259  		go func(num uint32) {
   260  			wait.Until(
   261  				func() {
   262  					cc.worker(num)
   263  				},
   264  				time.Second,
   265  				stopCh)
   266  		}(i)
   267  	}
   268  
   269  	go cc.cache.Run(stopCh)
   270  
   271  	// Re-sync error tasks.
   272  	go wait.Until(cc.processResyncTask, 0, stopCh)
   273  
   274  	klog.Infof("JobController is running ...... ")
   275  }
   276  
   277  func (cc *jobcontroller) worker(i uint32) {
   278  	klog.Infof("worker %d start ...... ", i)
   279  
   280  	for cc.processNextReq(i) {
   281  	}
   282  }
   283  
   284  func (cc *jobcontroller) belongsToThisRoutine(key string, count uint32) bool {
   285  	var hashVal hash.Hash32
   286  	var val uint32
   287  
   288  	hashVal = fnv.New32()
   289  	hashVal.Write([]byte(key))
   290  
   291  	val = hashVal.Sum32()
   292  
   293  	return val%cc.workers == count
   294  }
   295  
   296  func (cc *jobcontroller) getWorkerQueue(key string) workqueue.RateLimitingInterface {
   297  	var hashVal hash.Hash32
   298  	var val uint32
   299  
   300  	hashVal = fnv.New32()
   301  	hashVal.Write([]byte(key))
   302  
   303  	val = hashVal.Sum32()
   304  
   305  	queue := cc.queueList[val%cc.workers]
   306  
   307  	return queue
   308  }
   309  
   310  func (cc *jobcontroller) processNextReq(count uint32) bool {
   311  	queue := cc.queueList[count]
   312  	obj, shutdown := queue.Get()
   313  	if shutdown {
   314  		klog.Errorf("Fail to pop item from queue")
   315  		return false
   316  	}
   317  
   318  	req := obj.(apis.Request)
   319  	defer queue.Done(req)
   320  
   321  	key := jobcache.JobKeyByReq(&req)
   322  	if !cc.belongsToThisRoutine(key, count) {
   323  		klog.Errorf("should not occur The job does not belongs to this routine key:%s, worker:%d...... ", key, count)
   324  		queueLocal := cc.getWorkerQueue(key)
   325  		queueLocal.Add(req)
   326  		return true
   327  	}
   328  
   329  	klog.V(3).Infof("Try to handle request <%v>", req)
   330  
   331  	jobInfo, err := cc.cache.Get(key)
   332  	if err != nil {
   333  		// TODO(k82cn): ignore not-ready error.
   334  		klog.Errorf("Failed to get job by <%v> from cache: %v", req, err)
   335  		return true
   336  	}
   337  
   338  	st := state.NewState(jobInfo)
   339  	if st == nil {
   340  		klog.Errorf("Invalid state <%s> of Job <%v/%v>",
   341  			jobInfo.Job.Status.State, jobInfo.Job.Namespace, jobInfo.Job.Name)
   342  		return true
   343  	}
   344  
   345  	action := applyPolicies(jobInfo.Job, &req)
   346  	klog.V(3).Infof("Execute <%v> on Job <%s/%s> in <%s> by <%T>.",
   347  		action, req.Namespace, req.JobName, jobInfo.Job.Status.State.Phase, st)
   348  
   349  	if action != busv1alpha1.SyncJobAction {
   350  		cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, batchv1alpha1.ExecuteAction, fmt.Sprintf(
   351  			"Start to execute action %s ", action))
   352  	}
   353  
   354  	if err := st.Execute(action); err != nil {
   355  		if cc.maxRequeueNum == -1 || queue.NumRequeues(req) < cc.maxRequeueNum {
   356  			klog.V(2).Infof("Failed to handle Job <%s/%s>: %v",
   357  				jobInfo.Job.Namespace, jobInfo.Job.Name, err)
   358  			// If any error, requeue it.
   359  			queue.AddRateLimited(req)
   360  			return true
   361  		}
   362  		cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, batchv1alpha1.ExecuteAction, fmt.Sprintf(
   363  			"Job failed on action %s for retry limit reached", action))
   364  		klog.Warningf("Terminating Job <%s/%s> and releasing resources", jobInfo.Job.Namespace, jobInfo.Job.Name)
   365  		if err = st.Execute(busv1alpha1.TerminateJobAction); err != nil {
   366  			klog.Errorf("Failed to terminate Job<%s/%s>: %v", jobInfo.Job.Namespace, jobInfo.Job.Name, err)
   367  		}
   368  		klog.Warningf("Dropping job<%s/%s> out of the queue: %v because max retries has reached", jobInfo.Job.Namespace, jobInfo.Job.Name, err)
   369  	}
   370  
   371  	// If no error, forget it.
   372  	queue.Forget(req)
   373  
   374  	return true
   375  }