volcano.sh/volcano@v1.9.0/pkg/controllers/queue/queue_controller.go (about)

     1  /*
     2  Copyright 2019 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package queue
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    29  	"k8s.io/apimachinery/pkg/util/wait"
    30  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    31  	"k8s.io/client-go/kubernetes"
    32  	corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    33  	"k8s.io/client-go/tools/cache"
    34  	"k8s.io/client-go/tools/record"
    35  	"k8s.io/client-go/util/workqueue"
    36  	"k8s.io/klog/v2"
    37  
    38  	busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
    39  	vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
    40  	versionedscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
    41  	informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
    42  	vcinformer "volcano.sh/apis/pkg/client/informers/externalversions"
    43  	busv1alpha1informer "volcano.sh/apis/pkg/client/informers/externalversions/bus/v1alpha1"
    44  	schedulinginformer "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
    45  	busv1alpha1lister "volcano.sh/apis/pkg/client/listers/bus/v1alpha1"
    46  	schedulinglister "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
    47  	"volcano.sh/volcano/pkg/controllers/apis"
    48  	"volcano.sh/volcano/pkg/controllers/framework"
    49  	queuestate "volcano.sh/volcano/pkg/controllers/queue/state"
    50  	"volcano.sh/volcano/pkg/features"
    51  )
    52  
    53  func init() {
    54  	framework.RegisterController(&queuecontroller{})
    55  }
    56  
    57  // queuecontroller manages queue status.
    58  type queuecontroller struct {
    59  	kubeClient kubernetes.Interface
    60  	vcClient   vcclientset.Interface
    61  
    62  	// informer
    63  	queueInformer schedulinginformer.QueueInformer
    64  	pgInformer    schedulinginformer.PodGroupInformer
    65  
    66  	// queueLister
    67  	queueLister schedulinglister.QueueLister
    68  	queueSynced cache.InformerSynced
    69  
    70  	// podGroup lister
    71  	pgLister schedulinglister.PodGroupLister
    72  	pgSynced cache.InformerSynced
    73  
    74  	cmdInformer busv1alpha1informer.CommandInformer
    75  	cmdLister   busv1alpha1lister.CommandLister
    76  	cmdSynced   cache.InformerSynced
    77  
    78  	vcInformerFactory vcinformer.SharedInformerFactory
    79  
    80  	// queues that need to be updated.
    81  	queue        workqueue.RateLimitingInterface
    82  	commandQueue workqueue.RateLimitingInterface
    83  
    84  	pgMutex sync.RWMutex
    85  	// queue name -> podgroup namespace/name
    86  	podGroups map[string]map[string]struct{}
    87  
    88  	syncHandler        func(req *apis.Request) error
    89  	syncCommandHandler func(cmd *busv1alpha1.Command) error
    90  
    91  	enqueueQueue func(req *apis.Request)
    92  
    93  	recorder      record.EventRecorder
    94  	maxRequeueNum int
    95  }
    96  
    97  func (c *queuecontroller) Name() string {
    98  	return "queue-controller"
    99  }
   100  
   101  // NewQueueController creates a QueueController.
   102  func (c *queuecontroller) Initialize(opt *framework.ControllerOption) error {
   103  	c.vcClient = opt.VolcanoClient
   104  	c.kubeClient = opt.KubeClient
   105  
   106  	factory := informerfactory.NewSharedInformerFactory(c.vcClient, 0)
   107  	queueInformer := factory.Scheduling().V1beta1().Queues()
   108  	pgInformer := factory.Scheduling().V1beta1().PodGroups()
   109  
   110  	eventBroadcaster := record.NewBroadcaster()
   111  	eventBroadcaster.StartLogging(klog.Infof)
   112  	eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: c.kubeClient.CoreV1().Events("")})
   113  
   114  	c.vcInformerFactory = factory
   115  	c.queueInformer = queueInformer
   116  	c.pgInformer = pgInformer
   117  	c.queueLister = queueInformer.Lister()
   118  	c.queueSynced = queueInformer.Informer().HasSynced
   119  	c.pgLister = pgInformer.Lister()
   120  	c.pgSynced = pgInformer.Informer().HasSynced
   121  	c.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
   122  	c.commandQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
   123  	c.podGroups = make(map[string]map[string]struct{})
   124  	c.recorder = eventBroadcaster.NewRecorder(versionedscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"})
   125  	c.maxRequeueNum = opt.MaxRequeueNum
   126  	if c.maxRequeueNum < 0 {
   127  		c.maxRequeueNum = -1
   128  	}
   129  
   130  	queueInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   131  		AddFunc:    c.addQueue,
   132  		UpdateFunc: c.updateQueue,
   133  		DeleteFunc: c.deleteQueue,
   134  	})
   135  
   136  	pgInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   137  		AddFunc:    c.addPodGroup,
   138  		UpdateFunc: c.updatePodGroup,
   139  		DeleteFunc: c.deletePodGroup,
   140  	})
   141  
   142  	if utilfeature.DefaultFeatureGate.Enabled(features.QueueCommandSync) {
   143  		c.cmdInformer = factory.Bus().V1alpha1().Commands()
   144  		c.cmdInformer.Informer().AddEventHandler(cache.FilteringResourceEventHandler{
   145  			FilterFunc: func(obj interface{}) bool {
   146  				switch v := obj.(type) {
   147  				case *busv1alpha1.Command:
   148  					return IsQueueReference(v.TargetObject)
   149  				default:
   150  					return false
   151  				}
   152  			},
   153  			Handler: cache.ResourceEventHandlerFuncs{
   154  				AddFunc: c.addCommand,
   155  			},
   156  		})
   157  		c.cmdLister = c.cmdInformer.Lister()
   158  		c.cmdSynced = c.cmdInformer.Informer().HasSynced
   159  	}
   160  
   161  	queuestate.SyncQueue = c.syncQueue
   162  	queuestate.OpenQueue = c.openQueue
   163  	queuestate.CloseQueue = c.closeQueue
   164  
   165  	c.syncHandler = c.handleQueue
   166  	c.syncCommandHandler = c.handleCommand
   167  
   168  	c.enqueueQueue = c.enqueue
   169  
   170  	return nil
   171  }
   172  
   173  // Run starts QueueController.
   174  func (c *queuecontroller) Run(stopCh <-chan struct{}) {
   175  	defer utilruntime.HandleCrash()
   176  	defer c.queue.ShutDown()
   177  	defer c.commandQueue.ShutDown()
   178  
   179  	klog.Infof("Starting queue controller.")
   180  	defer klog.Infof("Shutting down queue controller.")
   181  
   182  	c.vcInformerFactory.Start(stopCh)
   183  
   184  	for informerType, ok := range c.vcInformerFactory.WaitForCacheSync(stopCh) {
   185  		if !ok {
   186  			klog.Errorf("caches failed to sync: %v", informerType)
   187  			return
   188  		}
   189  	}
   190  
   191  	go wait.Until(c.worker, 0, stopCh)
   192  	go wait.Until(c.commandWorker, 0, stopCh)
   193  
   194  	<-stopCh
   195  }
   196  
   197  // worker runs a worker thread that just dequeues items, processes them, and
   198  // marks them done. You may run as many of these in parallel as you wish; the
   199  // workqueue guarantees that they will not end up processing the same `queue`
   200  // at the same time.
   201  func (c *queuecontroller) worker() {
   202  	for c.processNextWorkItem() {
   203  	}
   204  }
   205  
   206  func (c *queuecontroller) processNextWorkItem() bool {
   207  	obj, shutdown := c.queue.Get()
   208  	if shutdown {
   209  		return false
   210  	}
   211  	defer c.queue.Done(obj)
   212  
   213  	req, ok := obj.(*apis.Request)
   214  	if !ok {
   215  		klog.Errorf("%v is not a valid queue request struct.", obj)
   216  		return true
   217  	}
   218  
   219  	err := c.syncHandler(req)
   220  	c.handleQueueErr(err, obj)
   221  
   222  	return true
   223  }
   224  
   225  func (c *queuecontroller) handleQueue(req *apis.Request) error {
   226  	startTime := time.Now()
   227  	defer func() {
   228  		klog.V(4).Infof("Finished syncing queue %s (%v).", req.QueueName, time.Since(startTime))
   229  	}()
   230  
   231  	queue, err := c.queueLister.Get(req.QueueName)
   232  	if err != nil {
   233  		if apierrors.IsNotFound(err) {
   234  			klog.V(4).Infof("Queue %s has been deleted.", req.QueueName)
   235  			return nil
   236  		}
   237  
   238  		return fmt.Errorf("get queue %s failed for %v", req.QueueName, err)
   239  	}
   240  
   241  	queueState := queuestate.NewState(queue)
   242  	if queueState == nil {
   243  		return fmt.Errorf("queue %s state %s is invalid", queue.Name, queue.Status.State)
   244  	}
   245  
   246  	klog.V(4).Infof("Begin execute %s action for queue %s, current status %s", req.Action, req.QueueName, queue.Status.State)
   247  	if err := queueState.Execute(req.Action); err != nil {
   248  		return fmt.Errorf("sync queue %s failed for %v, event is %v, action is %s",
   249  			req.QueueName, err, req.Event, req.Action)
   250  	}
   251  
   252  	return nil
   253  }
   254  
   255  func (c *queuecontroller) handleQueueErr(err error, obj interface{}) {
   256  	if err == nil {
   257  		c.queue.Forget(obj)
   258  		return
   259  	}
   260  
   261  	if c.maxRequeueNum == -1 || c.queue.NumRequeues(obj) < c.maxRequeueNum {
   262  		klog.V(4).Infof("Error syncing queue request %v for %v.", obj, err)
   263  		c.queue.AddRateLimited(obj)
   264  		return
   265  	}
   266  
   267  	req, _ := obj.(*apis.Request)
   268  	c.recordEventsForQueue(req.QueueName, v1.EventTypeWarning, string(req.Action),
   269  		fmt.Sprintf("%v queue failed for %v", req.Action, err))
   270  	klog.V(2).Infof("Dropping queue request %v out of the queue for %v.", obj, err)
   271  	c.queue.Forget(obj)
   272  }
   273  
   274  func (c *queuecontroller) commandWorker() {
   275  	for c.processNextCommand() {
   276  	}
   277  }
   278  
   279  func (c *queuecontroller) processNextCommand() bool {
   280  	obj, shutdown := c.commandQueue.Get()
   281  	if shutdown {
   282  		return false
   283  	}
   284  	defer c.commandQueue.Done(obj)
   285  
   286  	cmd, ok := obj.(*busv1alpha1.Command)
   287  	if !ok {
   288  		klog.Errorf("%v is not a valid Command struct.", obj)
   289  		return true
   290  	}
   291  
   292  	err := c.syncCommandHandler(cmd)
   293  	c.handleCommandErr(err, obj)
   294  
   295  	return true
   296  }
   297  
   298  func (c *queuecontroller) handleCommand(cmd *busv1alpha1.Command) error {
   299  	startTime := time.Now()
   300  	defer func() {
   301  		klog.V(4).Infof("Finished syncing command %s/%s (%v).", cmd.Namespace, cmd.Name, time.Since(startTime))
   302  	}()
   303  
   304  	err := c.vcClient.BusV1alpha1().Commands(cmd.Namespace).Delete(context.TODO(), cmd.Name, metav1.DeleteOptions{})
   305  	if err != nil {
   306  		if apierrors.IsNotFound(err) {
   307  			return nil
   308  		}
   309  
   310  		return fmt.Errorf("failed to delete command <%s/%s> for %v", cmd.Namespace, cmd.Name, err)
   311  	}
   312  
   313  	req := &apis.Request{
   314  		QueueName: cmd.TargetObject.Name,
   315  		Event:     busv1alpha1.CommandIssuedEvent,
   316  		Action:    busv1alpha1.Action(cmd.Action),
   317  	}
   318  
   319  	c.enqueueQueue(req)
   320  
   321  	return nil
   322  }
   323  
   324  func (c *queuecontroller) handleCommandErr(err error, obj interface{}) {
   325  	if err == nil {
   326  		c.commandQueue.Forget(obj)
   327  		return
   328  	}
   329  
   330  	if c.maxRequeueNum == -1 || c.commandQueue.NumRequeues(obj) < c.maxRequeueNum {
   331  		klog.V(4).Infof("Error syncing command %v for %v.", obj, err)
   332  		c.commandQueue.AddRateLimited(obj)
   333  		return
   334  	}
   335  
   336  	klog.V(2).Infof("Dropping command %v out of the queue for %v.", obj, err)
   337  	c.commandQueue.Forget(obj)
   338  }