volcano.sh/volcano@v1.9.0/pkg/controllers/job/job_controller.go (about) 1 /* 2 Copyright 2017 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package job 18 19 import ( 20 "fmt" 21 "hash" 22 "hash/fnv" 23 "time" 24 25 v1 "k8s.io/api/core/v1" 26 "k8s.io/apimachinery/pkg/util/wait" 27 utilfeature "k8s.io/apiserver/pkg/util/feature" 28 "k8s.io/client-go/informers" 29 coreinformers "k8s.io/client-go/informers/core/v1" 30 kubeschedulinginformers "k8s.io/client-go/informers/scheduling/v1" 31 "k8s.io/client-go/kubernetes" 32 corev1 "k8s.io/client-go/kubernetes/typed/core/v1" 33 corelisters "k8s.io/client-go/listers/core/v1" 34 kubeschedulinglisters "k8s.io/client-go/listers/scheduling/v1" 35 "k8s.io/client-go/tools/cache" 36 "k8s.io/client-go/tools/record" 37 "k8s.io/client-go/util/workqueue" 38 "k8s.io/klog/v2" 39 40 batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1" 41 busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1" 42 vcclientset "volcano.sh/apis/pkg/client/clientset/versioned" 43 vcscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme" 44 informerfactory "volcano.sh/apis/pkg/client/informers/externalversions" 45 vcinformer "volcano.sh/apis/pkg/client/informers/externalversions" 46 batchinformer "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1" 47 businformer "volcano.sh/apis/pkg/client/informers/externalversions/bus/v1alpha1" 48 schedulinginformers "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1" 49 batchlister "volcano.sh/apis/pkg/client/listers/batch/v1alpha1" 50 buslister "volcano.sh/apis/pkg/client/listers/bus/v1alpha1" 51 schedulinglisters "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1" 52 53 "volcano.sh/volcano/pkg/controllers/apis" 54 jobcache "volcano.sh/volcano/pkg/controllers/cache" 55 "volcano.sh/volcano/pkg/controllers/framework" 56 "volcano.sh/volcano/pkg/controllers/job/state" 57 "volcano.sh/volcano/pkg/features" 58 ) 59 60 func init() { 61 framework.RegisterController(&jobcontroller{}) 62 } 63 64 // jobcontroller the Job jobcontroller type. 65 type jobcontroller struct { 66 kubeClient kubernetes.Interface 67 vcClient vcclientset.Interface 68 69 jobInformer batchinformer.JobInformer 70 podInformer coreinformers.PodInformer 71 pvcInformer coreinformers.PersistentVolumeClaimInformer 72 pgInformer schedulinginformers.PodGroupInformer 73 svcInformer coreinformers.ServiceInformer 74 cmdInformer businformer.CommandInformer 75 pcInformer kubeschedulinginformers.PriorityClassInformer 76 queueInformer schedulinginformers.QueueInformer 77 78 informerFactory informers.SharedInformerFactory 79 vcInformerFactory vcinformer.SharedInformerFactory 80 81 // A store of jobs 82 jobLister batchlister.JobLister 83 jobSynced func() bool 84 85 // A store of pods 86 podLister corelisters.PodLister 87 podSynced func() bool 88 89 pvcLister corelisters.PersistentVolumeClaimLister 90 pvcSynced func() bool 91 92 // A store of podgroups 93 pgLister schedulinglisters.PodGroupLister 94 pgSynced func() bool 95 96 // A store of service 97 svcLister corelisters.ServiceLister 98 svcSynced func() bool 99 100 cmdLister buslister.CommandLister 101 cmdSynced func() bool 102 103 pcLister kubeschedulinglisters.PriorityClassLister 104 pcSynced func() bool 105 106 queueLister schedulinglisters.QueueLister 107 queueSynced func() bool 108 109 // queue that need to sync up 110 queueList []workqueue.RateLimitingInterface 111 commandQueue workqueue.RateLimitingInterface 112 cache jobcache.Cache 113 // Job Event recorder 114 recorder record.EventRecorder 115 116 errTasks workqueue.RateLimitingInterface 117 workers uint32 118 maxRequeueNum int 119 } 120 121 func (cc *jobcontroller) Name() string { 122 return "job-controller" 123 } 124 125 // Initialize creates the new Job job controller. 126 func (cc *jobcontroller) Initialize(opt *framework.ControllerOption) error { 127 cc.kubeClient = opt.KubeClient 128 cc.vcClient = opt.VolcanoClient 129 130 sharedInformers := opt.SharedInformerFactory 131 workers := opt.WorkerNum 132 // Initialize event client 133 eventBroadcaster := record.NewBroadcaster() 134 eventBroadcaster.StartLogging(klog.Infof) 135 eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: cc.kubeClient.CoreV1().Events("")}) 136 recorder := eventBroadcaster.NewRecorder(vcscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"}) 137 138 cc.informerFactory = sharedInformers 139 cc.queueList = make([]workqueue.RateLimitingInterface, workers) 140 cc.commandQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) 141 cc.cache = jobcache.New() 142 cc.errTasks = newRateLimitingQueue() 143 cc.recorder = recorder 144 cc.workers = workers 145 cc.maxRequeueNum = opt.MaxRequeueNum 146 if cc.maxRequeueNum < 0 { 147 cc.maxRequeueNum = -1 148 } 149 150 var i uint32 151 for i = 0; i < workers; i++ { 152 cc.queueList[i] = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) 153 } 154 155 factory := informerfactory.NewSharedInformerFactory(cc.vcClient, 0) 156 cc.vcInformerFactory = factory 157 if utilfeature.DefaultFeatureGate.Enabled(features.WorkLoadSupport) { 158 cc.jobInformer = factory.Batch().V1alpha1().Jobs() 159 cc.jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 160 AddFunc: cc.addJob, 161 UpdateFunc: cc.updateJob, 162 DeleteFunc: cc.deleteJob, 163 }) 164 cc.jobLister = cc.jobInformer.Lister() 165 cc.jobSynced = cc.jobInformer.Informer().HasSynced 166 } 167 168 if utilfeature.DefaultFeatureGate.Enabled(features.QueueCommandSync) { 169 cc.cmdInformer = factory.Bus().V1alpha1().Commands() 170 cc.cmdInformer.Informer().AddEventHandler( 171 cache.FilteringResourceEventHandler{ 172 FilterFunc: func(obj interface{}) bool { 173 switch v := obj.(type) { 174 case *busv1alpha1.Command: 175 if v.TargetObject != nil && 176 v.TargetObject.APIVersion == batchv1alpha1.SchemeGroupVersion.String() && 177 v.TargetObject.Kind == "Job" { 178 return true 179 } 180 181 return false 182 default: 183 return false 184 } 185 }, 186 Handler: cache.ResourceEventHandlerFuncs{ 187 AddFunc: cc.addCommand, 188 }, 189 }, 190 ) 191 cc.cmdLister = cc.cmdInformer.Lister() 192 cc.cmdSynced = cc.cmdInformer.Informer().HasSynced 193 } 194 195 cc.podInformer = sharedInformers.Core().V1().Pods() 196 cc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 197 AddFunc: cc.addPod, 198 UpdateFunc: cc.updatePod, 199 DeleteFunc: cc.deletePod, 200 }) 201 202 cc.podLister = cc.podInformer.Lister() 203 cc.podSynced = cc.podInformer.Informer().HasSynced 204 205 cc.pvcInformer = sharedInformers.Core().V1().PersistentVolumeClaims() 206 cc.pvcLister = cc.pvcInformer.Lister() 207 cc.pvcSynced = cc.pvcInformer.Informer().HasSynced 208 209 cc.svcInformer = sharedInformers.Core().V1().Services() 210 cc.svcLister = cc.svcInformer.Lister() 211 cc.svcSynced = cc.svcInformer.Informer().HasSynced 212 213 cc.pgInformer = factory.Scheduling().V1beta1().PodGroups() 214 cc.pgInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 215 UpdateFunc: cc.updatePodGroup, 216 }) 217 cc.pgLister = cc.pgInformer.Lister() 218 cc.pgSynced = cc.pgInformer.Informer().HasSynced 219 220 if utilfeature.DefaultFeatureGate.Enabled(features.PriorityClass) { 221 cc.pcInformer = sharedInformers.Scheduling().V1().PriorityClasses() 222 cc.pcLister = cc.pcInformer.Lister() 223 cc.pcSynced = cc.pcInformer.Informer().HasSynced 224 } 225 226 cc.queueInformer = factory.Scheduling().V1beta1().Queues() 227 cc.queueLister = cc.queueInformer.Lister() 228 cc.queueSynced = cc.queueInformer.Informer().HasSynced 229 230 // Register actions 231 state.SyncJob = cc.syncJob 232 state.KillJob = cc.killJob 233 234 return nil 235 } 236 237 // Run start JobController. 238 func (cc *jobcontroller) Run(stopCh <-chan struct{}) { 239 cc.informerFactory.Start(stopCh) 240 cc.vcInformerFactory.Start(stopCh) 241 242 for informerType, ok := range cc.informerFactory.WaitForCacheSync(stopCh) { 243 if !ok { 244 klog.Errorf("caches failed to sync: %v", informerType) 245 return 246 } 247 } 248 249 for informerType, ok := range cc.vcInformerFactory.WaitForCacheSync(stopCh) { 250 if !ok { 251 klog.Errorf("caches failed to sync: %v", informerType) 252 return 253 } 254 } 255 256 go wait.Until(cc.handleCommands, 0, stopCh) 257 var i uint32 258 for i = 0; i < cc.workers; i++ { 259 go func(num uint32) { 260 wait.Until( 261 func() { 262 cc.worker(num) 263 }, 264 time.Second, 265 stopCh) 266 }(i) 267 } 268 269 go cc.cache.Run(stopCh) 270 271 // Re-sync error tasks. 272 go wait.Until(cc.processResyncTask, 0, stopCh) 273 274 klog.Infof("JobController is running ...... ") 275 } 276 277 func (cc *jobcontroller) worker(i uint32) { 278 klog.Infof("worker %d start ...... ", i) 279 280 for cc.processNextReq(i) { 281 } 282 } 283 284 func (cc *jobcontroller) belongsToThisRoutine(key string, count uint32) bool { 285 var hashVal hash.Hash32 286 var val uint32 287 288 hashVal = fnv.New32() 289 hashVal.Write([]byte(key)) 290 291 val = hashVal.Sum32() 292 293 return val%cc.workers == count 294 } 295 296 func (cc *jobcontroller) getWorkerQueue(key string) workqueue.RateLimitingInterface { 297 var hashVal hash.Hash32 298 var val uint32 299 300 hashVal = fnv.New32() 301 hashVal.Write([]byte(key)) 302 303 val = hashVal.Sum32() 304 305 queue := cc.queueList[val%cc.workers] 306 307 return queue 308 } 309 310 func (cc *jobcontroller) processNextReq(count uint32) bool { 311 queue := cc.queueList[count] 312 obj, shutdown := queue.Get() 313 if shutdown { 314 klog.Errorf("Fail to pop item from queue") 315 return false 316 } 317 318 req := obj.(apis.Request) 319 defer queue.Done(req) 320 321 key := jobcache.JobKeyByReq(&req) 322 if !cc.belongsToThisRoutine(key, count) { 323 klog.Errorf("should not occur The job does not belongs to this routine key:%s, worker:%d...... ", key, count) 324 queueLocal := cc.getWorkerQueue(key) 325 queueLocal.Add(req) 326 return true 327 } 328 329 klog.V(3).Infof("Try to handle request <%v>", req) 330 331 jobInfo, err := cc.cache.Get(key) 332 if err != nil { 333 // TODO(k82cn): ignore not-ready error. 334 klog.Errorf("Failed to get job by <%v> from cache: %v", req, err) 335 return true 336 } 337 338 st := state.NewState(jobInfo) 339 if st == nil { 340 klog.Errorf("Invalid state <%s> of Job <%v/%v>", 341 jobInfo.Job.Status.State, jobInfo.Job.Namespace, jobInfo.Job.Name) 342 return true 343 } 344 345 action := applyPolicies(jobInfo.Job, &req) 346 klog.V(3).Infof("Execute <%v> on Job <%s/%s> in <%s> by <%T>.", 347 action, req.Namespace, req.JobName, jobInfo.Job.Status.State.Phase, st) 348 349 if action != busv1alpha1.SyncJobAction { 350 cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, batchv1alpha1.ExecuteAction, fmt.Sprintf( 351 "Start to execute action %s ", action)) 352 } 353 354 if err := st.Execute(action); err != nil { 355 if cc.maxRequeueNum == -1 || queue.NumRequeues(req) < cc.maxRequeueNum { 356 klog.V(2).Infof("Failed to handle Job <%s/%s>: %v", 357 jobInfo.Job.Namespace, jobInfo.Job.Name, err) 358 // If any error, requeue it. 359 queue.AddRateLimited(req) 360 return true 361 } 362 cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, batchv1alpha1.ExecuteAction, fmt.Sprintf( 363 "Job failed on action %s for retry limit reached", action)) 364 klog.Warningf("Terminating Job <%s/%s> and releasing resources", jobInfo.Job.Namespace, jobInfo.Job.Name) 365 if err = st.Execute(busv1alpha1.TerminateJobAction); err != nil { 366 klog.Errorf("Failed to terminate Job<%s/%s>: %v", jobInfo.Job.Namespace, jobInfo.Job.Name, err) 367 } 368 klog.Warningf("Dropping job<%s/%s> out of the queue: %v because max retries has reached", jobInfo.Job.Namespace, jobInfo.Job.Name, err) 369 } 370 371 // If no error, forget it. 372 queue.Forget(req) 373 374 return true 375 }