volcano.sh/volcano@v1.9.0/pkg/controllers/jobflow/jobflow_controller.go (about) 1 /* 2 Copyright 2022 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package jobflow 18 19 import ( 20 "fmt" 21 "time" 22 23 v1 "k8s.io/api/core/v1" 24 apierrors "k8s.io/apimachinery/pkg/api/errors" 25 "k8s.io/apimachinery/pkg/util/wait" 26 "k8s.io/client-go/kubernetes" 27 corev1 "k8s.io/client-go/kubernetes/typed/core/v1" 28 "k8s.io/client-go/tools/cache" 29 "k8s.io/client-go/tools/record" 30 "k8s.io/client-go/util/workqueue" 31 "k8s.io/klog" 32 33 jobflowstate "volcano.sh/volcano/pkg/controllers/jobflow/state" 34 35 vcclientset "volcano.sh/apis/pkg/client/clientset/versioned" 36 versionedscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme" 37 informerfactory "volcano.sh/apis/pkg/client/informers/externalversions" 38 batchinformer "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1" 39 flowinformer "volcano.sh/apis/pkg/client/informers/externalversions/flow/v1alpha1" 40 batchlister "volcano.sh/apis/pkg/client/listers/batch/v1alpha1" 41 flowlister "volcano.sh/apis/pkg/client/listers/flow/v1alpha1" 42 "volcano.sh/volcano/pkg/controllers/apis" 43 "volcano.sh/volcano/pkg/controllers/framework" 44 "volcano.sh/volcano/pkg/controllers/jobflow/state" 45 ) 46 47 func init() { 48 framework.RegisterController(&jobflowcontroller{}) 49 } 50 51 // jobflowcontroller the JobFlow jobflowcontroller type. 52 type jobflowcontroller struct { 53 kubeClient kubernetes.Interface 54 vcClient vcclientset.Interface 55 56 //informer 57 jobFlowInformer flowinformer.JobFlowInformer 58 jobTemplateInformer flowinformer.JobTemplateInformer 59 jobInformer batchinformer.JobInformer 60 61 //jobFlowLister 62 jobFlowLister flowlister.JobFlowLister 63 jobFlowSynced cache.InformerSynced 64 65 //jobTemplateLister 66 jobTemplateLister flowlister.JobTemplateLister 67 jobTemplateSynced cache.InformerSynced 68 69 //jobLister 70 jobLister batchlister.JobLister 71 jobSynced cache.InformerSynced 72 73 // JobFlow Event recorder 74 recorder record.EventRecorder 75 76 queue workqueue.RateLimitingInterface 77 enqueueJobFlow func(req apis.FlowRequest) 78 79 syncHandler func(req *apis.FlowRequest) error 80 81 maxRequeueNum int 82 } 83 84 func (jf *jobflowcontroller) Name() string { 85 return "jobflow-controller" 86 } 87 88 func (jf *jobflowcontroller) Initialize(opt *framework.ControllerOption) error { 89 jf.kubeClient = opt.KubeClient 90 jf.vcClient = opt.VolcanoClient 91 92 jf.jobFlowInformer = informerfactory.NewSharedInformerFactory(jf.vcClient, 0).Flow().V1alpha1().JobFlows() 93 jf.jobFlowSynced = jf.jobFlowInformer.Informer().HasSynced 94 jf.jobFlowLister = jf.jobFlowInformer.Lister() 95 jf.jobFlowInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 96 AddFunc: jf.addJobFlow, 97 UpdateFunc: jf.updateJobFlow, 98 }) 99 100 jf.jobTemplateInformer = informerfactory.NewSharedInformerFactory(jf.vcClient, 0).Flow().V1alpha1().JobTemplates() 101 jf.jobTemplateSynced = jf.jobTemplateInformer.Informer().HasSynced 102 jf.jobTemplateLister = jf.jobTemplateInformer.Lister() 103 104 jf.jobInformer = informerfactory.NewSharedInformerFactory(jf.vcClient, 0).Batch().V1alpha1().Jobs() 105 jf.jobSynced = jf.jobInformer.Informer().HasSynced 106 jf.jobLister = jf.jobInformer.Lister() 107 jf.jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 108 UpdateFunc: jf.updateJob, 109 }) 110 111 jf.maxRequeueNum = opt.MaxRequeueNum 112 if jf.maxRequeueNum < 0 { 113 jf.maxRequeueNum = -1 114 } 115 116 eventBroadcaster := record.NewBroadcaster() 117 eventBroadcaster.StartLogging(klog.Infof) 118 eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: jf.kubeClient.CoreV1().Events("")}) 119 120 jf.recorder = eventBroadcaster.NewRecorder(versionedscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"}) 121 jf.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) 122 123 jf.enqueueJobFlow = jf.enqueue 124 125 jf.syncHandler = jf.handleJobFlow 126 127 state.SyncJobFlow = jf.syncJobFlow 128 return nil 129 } 130 131 func (jf *jobflowcontroller) Run(stopCh <-chan struct{}) { 132 defer jf.queue.ShutDown() 133 134 go jf.jobFlowInformer.Informer().Run(stopCh) 135 go jf.jobTemplateInformer.Informer().Run(stopCh) 136 go jf.jobInformer.Informer().Run(stopCh) 137 138 cache.WaitForCacheSync(stopCh, jf.jobSynced, jf.jobFlowSynced, jf.jobTemplateSynced) 139 140 go wait.Until(jf.worker, time.Second, stopCh) 141 142 klog.Infof("JobFlowController is running ...... ") 143 144 <-stopCh 145 } 146 147 func (jf *jobflowcontroller) worker() { 148 for jf.processNextWorkItem() { 149 } 150 } 151 152 func (jf *jobflowcontroller) processNextWorkItem() bool { 153 obj, shutdown := jf.queue.Get() 154 if shutdown { 155 // Stop working 156 return false 157 } 158 159 // We call Done here so the workqueue knows we have finished 160 // processing this item. We also must remember to call Forget if we 161 // do not want this work item being re-queued. For example, we do 162 // not call Forget if a transient error occurs, instead the item is 163 // put back on the workqueue and attempted again after a back-off 164 // period. 165 defer jf.queue.Done(obj) 166 167 req, ok := obj.(apis.FlowRequest) 168 if !ok { 169 klog.Errorf("%v is not a valid queue request struct.", obj) 170 return true 171 } 172 173 err := jf.syncHandler(&req) 174 jf.handleJobFlowErr(err, obj) 175 176 return true 177 } 178 179 func (jf *jobflowcontroller) handleJobFlow(req *apis.FlowRequest) error { 180 startTime := time.Now() 181 defer func() { 182 klog.V(4).Infof("Finished syncing jobflow %s (%v).", req.JobFlowName, time.Since(startTime)) 183 }() 184 185 jobflow, err := jf.jobFlowLister.JobFlows(req.Namespace).Get(req.JobFlowName) 186 if err != nil { 187 if apierrors.IsNotFound(err) { 188 klog.V(4).Infof("JobFlow %s has been deleted.", req.JobFlowName) 189 return nil 190 } 191 192 return fmt.Errorf("get jobflow %s failed for %v", req.JobFlowName, err) 193 } 194 195 jobFlowState := jobflowstate.NewState(jobflow) 196 if jobFlowState == nil { 197 return fmt.Errorf("jobflow %s state %s is invalid", jobflow.Name, jobflow.Status.State) 198 } 199 200 klog.V(4).Infof("Begin execute %s action for jobflow %s", req.Action, req.JobFlowName) 201 if err := jobFlowState.Execute(req.Action); err != nil { 202 return fmt.Errorf("sync jobflow %s failed for %v, event is %v, action is %s", 203 req.JobFlowName, err, req.Event, req.Action) 204 } 205 206 return nil 207 } 208 209 func (jf *jobflowcontroller) handleJobFlowErr(err error, obj interface{}) { 210 if err == nil { 211 jf.queue.Forget(obj) 212 return 213 } 214 215 if jf.maxRequeueNum == -1 || jf.queue.NumRequeues(obj) < jf.maxRequeueNum { 216 klog.V(4).Infof("Error syncing jobFlow request %v for %v.", obj, err) 217 jf.queue.AddRateLimited(obj) 218 return 219 } 220 221 req, _ := obj.(apis.FlowRequest) 222 jf.recordEventsForJobFlow(req.Namespace, req.JobFlowName, v1.EventTypeWarning, string(req.Action), 223 fmt.Sprintf("%v JobFlow failed for %v", req.Action, err)) 224 klog.V(4).Infof("Dropping JobFlow request %v out of the queue for %v.", obj, err) 225 jf.queue.Forget(obj) 226 } 227 228 func (jf *jobflowcontroller) recordEventsForJobFlow(namespace, name, eventType, reason, message string) { 229 jobFlow, err := jf.jobFlowLister.JobFlows(namespace).Get(name) 230 if err != nil { 231 klog.Errorf("Get JobFlow %s failed for %v.", name, err) 232 return 233 } 234 235 jf.recorder.Event(jobFlow, eventType, reason, message) 236 }