volcano.sh/volcano@v1.9.0/pkg/controllers/jobflow/jobflow_controller.go (about)

     1  /*
     2  Copyright 2022 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package jobflow
    18  
    19  import (
    20  	"fmt"
    21  	"time"
    22  
    23  	v1 "k8s.io/api/core/v1"
    24  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    25  	"k8s.io/apimachinery/pkg/util/wait"
    26  	"k8s.io/client-go/kubernetes"
    27  	corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    28  	"k8s.io/client-go/tools/cache"
    29  	"k8s.io/client-go/tools/record"
    30  	"k8s.io/client-go/util/workqueue"
    31  	"k8s.io/klog"
    32  
    33  	jobflowstate "volcano.sh/volcano/pkg/controllers/jobflow/state"
    34  
    35  	vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
    36  	versionedscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
    37  	informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
    38  	batchinformer "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1"
    39  	flowinformer "volcano.sh/apis/pkg/client/informers/externalversions/flow/v1alpha1"
    40  	batchlister "volcano.sh/apis/pkg/client/listers/batch/v1alpha1"
    41  	flowlister "volcano.sh/apis/pkg/client/listers/flow/v1alpha1"
    42  	"volcano.sh/volcano/pkg/controllers/apis"
    43  	"volcano.sh/volcano/pkg/controllers/framework"
    44  	"volcano.sh/volcano/pkg/controllers/jobflow/state"
    45  )
    46  
    47  func init() {
    48  	framework.RegisterController(&jobflowcontroller{})
    49  }
    50  
    51  // jobflowcontroller the JobFlow jobflowcontroller type.
    52  type jobflowcontroller struct {
    53  	kubeClient kubernetes.Interface
    54  	vcClient   vcclientset.Interface
    55  
    56  	//informer
    57  	jobFlowInformer     flowinformer.JobFlowInformer
    58  	jobTemplateInformer flowinformer.JobTemplateInformer
    59  	jobInformer         batchinformer.JobInformer
    60  
    61  	//jobFlowLister
    62  	jobFlowLister flowlister.JobFlowLister
    63  	jobFlowSynced cache.InformerSynced
    64  
    65  	//jobTemplateLister
    66  	jobTemplateLister flowlister.JobTemplateLister
    67  	jobTemplateSynced cache.InformerSynced
    68  
    69  	//jobLister
    70  	jobLister batchlister.JobLister
    71  	jobSynced cache.InformerSynced
    72  
    73  	// JobFlow Event recorder
    74  	recorder record.EventRecorder
    75  
    76  	queue          workqueue.RateLimitingInterface
    77  	enqueueJobFlow func(req apis.FlowRequest)
    78  
    79  	syncHandler func(req *apis.FlowRequest) error
    80  
    81  	maxRequeueNum int
    82  }
    83  
    84  func (jf *jobflowcontroller) Name() string {
    85  	return "jobflow-controller"
    86  }
    87  
    88  func (jf *jobflowcontroller) Initialize(opt *framework.ControllerOption) error {
    89  	jf.kubeClient = opt.KubeClient
    90  	jf.vcClient = opt.VolcanoClient
    91  
    92  	jf.jobFlowInformer = informerfactory.NewSharedInformerFactory(jf.vcClient, 0).Flow().V1alpha1().JobFlows()
    93  	jf.jobFlowSynced = jf.jobFlowInformer.Informer().HasSynced
    94  	jf.jobFlowLister = jf.jobFlowInformer.Lister()
    95  	jf.jobFlowInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
    96  		AddFunc:    jf.addJobFlow,
    97  		UpdateFunc: jf.updateJobFlow,
    98  	})
    99  
   100  	jf.jobTemplateInformer = informerfactory.NewSharedInformerFactory(jf.vcClient, 0).Flow().V1alpha1().JobTemplates()
   101  	jf.jobTemplateSynced = jf.jobTemplateInformer.Informer().HasSynced
   102  	jf.jobTemplateLister = jf.jobTemplateInformer.Lister()
   103  
   104  	jf.jobInformer = informerfactory.NewSharedInformerFactory(jf.vcClient, 0).Batch().V1alpha1().Jobs()
   105  	jf.jobSynced = jf.jobInformer.Informer().HasSynced
   106  	jf.jobLister = jf.jobInformer.Lister()
   107  	jf.jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   108  		UpdateFunc: jf.updateJob,
   109  	})
   110  
   111  	jf.maxRequeueNum = opt.MaxRequeueNum
   112  	if jf.maxRequeueNum < 0 {
   113  		jf.maxRequeueNum = -1
   114  	}
   115  
   116  	eventBroadcaster := record.NewBroadcaster()
   117  	eventBroadcaster.StartLogging(klog.Infof)
   118  	eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: jf.kubeClient.CoreV1().Events("")})
   119  
   120  	jf.recorder = eventBroadcaster.NewRecorder(versionedscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"})
   121  	jf.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
   122  
   123  	jf.enqueueJobFlow = jf.enqueue
   124  
   125  	jf.syncHandler = jf.handleJobFlow
   126  
   127  	state.SyncJobFlow = jf.syncJobFlow
   128  	return nil
   129  }
   130  
   131  func (jf *jobflowcontroller) Run(stopCh <-chan struct{}) {
   132  	defer jf.queue.ShutDown()
   133  
   134  	go jf.jobFlowInformer.Informer().Run(stopCh)
   135  	go jf.jobTemplateInformer.Informer().Run(stopCh)
   136  	go jf.jobInformer.Informer().Run(stopCh)
   137  
   138  	cache.WaitForCacheSync(stopCh, jf.jobSynced, jf.jobFlowSynced, jf.jobTemplateSynced)
   139  
   140  	go wait.Until(jf.worker, time.Second, stopCh)
   141  
   142  	klog.Infof("JobFlowController is running ...... ")
   143  
   144  	<-stopCh
   145  }
   146  
   147  func (jf *jobflowcontroller) worker() {
   148  	for jf.processNextWorkItem() {
   149  	}
   150  }
   151  
   152  func (jf *jobflowcontroller) processNextWorkItem() bool {
   153  	obj, shutdown := jf.queue.Get()
   154  	if shutdown {
   155  		// Stop working
   156  		return false
   157  	}
   158  
   159  	// We call Done here so the workqueue knows we have finished
   160  	// processing this item. We also must remember to call Forget if we
   161  	// do not want this work item being re-queued. For example, we do
   162  	// not call Forget if a transient error occurs, instead the item is
   163  	// put back on the workqueue and attempted again after a back-off
   164  	// period.
   165  	defer jf.queue.Done(obj)
   166  
   167  	req, ok := obj.(apis.FlowRequest)
   168  	if !ok {
   169  		klog.Errorf("%v is not a valid queue request struct.", obj)
   170  		return true
   171  	}
   172  
   173  	err := jf.syncHandler(&req)
   174  	jf.handleJobFlowErr(err, obj)
   175  
   176  	return true
   177  }
   178  
   179  func (jf *jobflowcontroller) handleJobFlow(req *apis.FlowRequest) error {
   180  	startTime := time.Now()
   181  	defer func() {
   182  		klog.V(4).Infof("Finished syncing jobflow %s (%v).", req.JobFlowName, time.Since(startTime))
   183  	}()
   184  
   185  	jobflow, err := jf.jobFlowLister.JobFlows(req.Namespace).Get(req.JobFlowName)
   186  	if err != nil {
   187  		if apierrors.IsNotFound(err) {
   188  			klog.V(4).Infof("JobFlow %s has been deleted.", req.JobFlowName)
   189  			return nil
   190  		}
   191  
   192  		return fmt.Errorf("get jobflow %s failed for %v", req.JobFlowName, err)
   193  	}
   194  
   195  	jobFlowState := jobflowstate.NewState(jobflow)
   196  	if jobFlowState == nil {
   197  		return fmt.Errorf("jobflow %s state %s is invalid", jobflow.Name, jobflow.Status.State)
   198  	}
   199  
   200  	klog.V(4).Infof("Begin execute %s action for jobflow %s", req.Action, req.JobFlowName)
   201  	if err := jobFlowState.Execute(req.Action); err != nil {
   202  		return fmt.Errorf("sync jobflow %s failed for %v, event is %v, action is %s",
   203  			req.JobFlowName, err, req.Event, req.Action)
   204  	}
   205  
   206  	return nil
   207  }
   208  
   209  func (jf *jobflowcontroller) handleJobFlowErr(err error, obj interface{}) {
   210  	if err == nil {
   211  		jf.queue.Forget(obj)
   212  		return
   213  	}
   214  
   215  	if jf.maxRequeueNum == -1 || jf.queue.NumRequeues(obj) < jf.maxRequeueNum {
   216  		klog.V(4).Infof("Error syncing jobFlow request %v for %v.", obj, err)
   217  		jf.queue.AddRateLimited(obj)
   218  		return
   219  	}
   220  
   221  	req, _ := obj.(apis.FlowRequest)
   222  	jf.recordEventsForJobFlow(req.Namespace, req.JobFlowName, v1.EventTypeWarning, string(req.Action),
   223  		fmt.Sprintf("%v JobFlow failed for %v", req.Action, err))
   224  	klog.V(4).Infof("Dropping JobFlow request %v out of the queue for %v.", obj, err)
   225  	jf.queue.Forget(obj)
   226  }
   227  
   228  func (jf *jobflowcontroller) recordEventsForJobFlow(namespace, name, eventType, reason, message string) {
   229  	jobFlow, err := jf.jobFlowLister.JobFlows(namespace).Get(name)
   230  	if err != nil {
   231  		klog.Errorf("Get JobFlow %s failed for %v.", name, err)
   232  		return
   233  	}
   234  
   235  	jf.recorder.Event(jobFlow, eventType, reason, message)
   236  }