github.com/cilium/cilium@v1.16.2/operator/pkg/ciliumendpointslice/endpointslice.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package ciliumendpointslice
     5  
     6  import (
     7  	"context"
     8  	"time"
     9  
    10  	"github.com/cilium/hive/cell"
    11  	"github.com/cilium/workerpool"
    12  	"github.com/sirupsen/logrus"
    13  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    14  	"k8s.io/client-go/util/workqueue"
    15  
    16  	"github.com/cilium/cilium/pkg/k8s"
    17  	cilium_api_v2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2"
    18  	capi_v2a1 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2alpha1"
    19  	"github.com/cilium/cilium/pkg/k8s/resource"
    20  	"github.com/cilium/cilium/pkg/logging/logfields"
    21  )
    22  
    23  const (
    24  	// cesNamePrefix is the prefix name added for the CiliumEndpointSlice
    25  	// resource.
    26  	cesNamePrefix = "ces"
    27  
    28  	// defaultSyncBackOff is the default backoff period for cesSync calls.
    29  	defaultSyncBackOff = 1 * time.Second
    30  	// maxSyncBackOff is the max backoff period for cesSync calls.
    31  	maxSyncBackOff = 100 * time.Second
    32  	// maxRetries is the number of times a cesSync will be retried before it is
    33  	// dropped out of the queue.
    34  	maxRetries = 15
    35  	// CEPs are batched into a CES, based on its Identity
    36  	cesIdentityBasedSlicing = "cesSliceModeIdentity"
    37  	// Default CES Synctime, multiple consecutive syncs with k8s-apiserver are
    38  	// batched and synced together after a short delay.
    39  	DefaultCESSyncTime = 500 * time.Millisecond
    40  
    41  	CESWriteQPSLimitMax = 50
    42  	CESWriteQPSBurstMax = 100
    43  )
    44  
    45  func (c *Controller) initializeQueue() {
    46  	c.logger.WithFields(logrus.Fields{
    47  		logfields.WorkQueueQPSLimit:    c.rateLimit.current.Limit,
    48  		logfields.WorkQueueBurstLimit:  c.rateLimit.current.Burst,
    49  		logfields.WorkQueueSyncBackOff: defaultSyncBackOff,
    50  	}).Info("CES controller workqueue configuration")
    51  	c.queue = workqueue.NewRateLimitingQueueWithConfig(
    52  		workqueue.NewItemExponentialFailureRateLimiter(defaultSyncBackOff, maxSyncBackOff),
    53  		workqueue.RateLimitingQueueConfig{Name: "cilium_endpoint_slice"})
    54  }
    55  
    56  func (c *Controller) onEndpointUpdate(cep *cilium_api_v2.CiliumEndpoint) {
    57  	if cep.Status.Networking == nil || cep.Status.Identity == nil || cep.GetName() == "" || cep.Namespace == "" {
    58  		return
    59  	}
    60  	touchedCESs := c.manager.UpdateCEPMapping(k8s.ConvertCEPToCoreCEP(cep), cep.Namespace)
    61  	c.enqueueCESReconciliation(touchedCESs)
    62  }
    63  
    64  func (c *Controller) onEndpointDelete(cep *cilium_api_v2.CiliumEndpoint) {
    65  	touchedCES := c.manager.RemoveCEPMapping(k8s.ConvertCEPToCoreCEP(cep), cep.Namespace)
    66  	c.enqueueCESReconciliation([]CESName{touchedCES})
    67  }
    68  
    69  func (c *Controller) onSliceUpdate(ces *capi_v2a1.CiliumEndpointSlice) {
    70  	c.enqueueCESReconciliation([]CESName{NewCESName(ces.Name)})
    71  }
    72  
    73  func (c *Controller) onSliceDelete(ces *capi_v2a1.CiliumEndpointSlice) {
    74  	c.enqueueCESReconciliation([]CESName{NewCESName(ces.Name)})
    75  }
    76  
    77  func (c *Controller) enqueueCESReconciliation(cess []CESName) {
    78  	for _, ces := range cess {
    79  		c.logger.WithFields(logrus.Fields{
    80  			logfields.CESName: ces.string(),
    81  		}).Debug("Enqueueing CES (if not empty name)")
    82  		if ces.Name != "" {
    83  			c.enqueuedAtLock.Lock()
    84  			if c.enqueuedAt[ces].IsZero() {
    85  				c.enqueuedAt[ces] = time.Now()
    86  			}
    87  			c.enqueuedAtLock.Unlock()
    88  			c.queue.AddAfter(ces, DefaultCESSyncTime)
    89  		}
    90  	}
    91  }
    92  
    93  func (c *Controller) getAndResetCESProcessingDelay(ces CESName) float64 {
    94  	c.enqueuedAtLock.Lock()
    95  	defer c.enqueuedAtLock.Unlock()
    96  	enqueued, exists := c.enqueuedAt[ces]
    97  	if !exists {
    98  		return 0
    99  	}
   100  	if !enqueued.IsZero() {
   101  		delay := time.Since(enqueued)
   102  		c.enqueuedAt[ces] = time.Time{}
   103  		return delay.Seconds()
   104  	}
   105  	return 0
   106  }
   107  
   108  // start the worker thread, reconciles the modified CESs with api-server
   109  func (c *Controller) Start(ctx cell.HookContext) error {
   110  	c.logger.Info("Bootstrap ces controller")
   111  	c.context, c.contextCancel = context.WithCancel(context.Background())
   112  	defer utilruntime.HandleCrash()
   113  	if c.slicingMode == cesIdentityBasedSlicing {
   114  		c.manager = newCESManagerIdentity(c.maxCEPsInCES, c.logger)
   115  	} else {
   116  		c.manager = newCESManagerFcfs(c.maxCEPsInCES, c.logger)
   117  	}
   118  	c.reconciler = newReconciler(c.context, c.clientset.CiliumV2alpha1(), c.manager, c.logger, c.ciliumEndpoint, c.ciliumEndpointSlice, c.metrics)
   119  
   120  	c.initializeQueue()
   121  
   122  	if err := c.syncCESsInLocalCache(ctx); err != nil {
   123  		return err
   124  	}
   125  
   126  	// Start the work pools processing CEP events only after syncing CES in local cache.
   127  	c.wp = workerpool.New(3)
   128  	c.wp.Submit("cilium-endpoints-updater", c.runCiliumEndpointsUpdater)
   129  	c.wp.Submit("cilium-endpoint-slices-updater", c.runCiliumEndpointSliceUpdater)
   130  	c.wp.Submit("cilium-nodes-updater", c.runCiliumNodesUpdater)
   131  
   132  	c.logger.Info("Starting CES controller reconciler.")
   133  	go c.worker()
   134  
   135  	return nil
   136  }
   137  
   138  func (c *Controller) Stop(ctx cell.HookContext) error {
   139  	c.wp.Close()
   140  	c.queue.ShutDown()
   141  	c.contextCancel()
   142  	return nil
   143  }
   144  
   145  func (c *Controller) runCiliumEndpointsUpdater(ctx context.Context) error {
   146  	for event := range c.ciliumEndpoint.Events(ctx) {
   147  		switch event.Kind {
   148  		case resource.Upsert:
   149  			c.logger.WithFields(logrus.Fields{
   150  				logfields.CEPName: event.Key.String()}).Debug("Got Upsert Endpoint event")
   151  			c.onEndpointUpdate(event.Object)
   152  		case resource.Delete:
   153  			c.logger.WithFields(logrus.Fields{
   154  				logfields.CEPName: event.Key.String()}).Debug("Got Delete Endpoint event")
   155  			c.onEndpointDelete(event.Object)
   156  		}
   157  		event.Done(nil)
   158  	}
   159  	return nil
   160  }
   161  
   162  func (c *Controller) runCiliumEndpointSliceUpdater(ctx context.Context) error {
   163  	for event := range c.ciliumEndpointSlice.Events(ctx) {
   164  		switch event.Kind {
   165  		case resource.Upsert:
   166  			c.logger.WithFields(logrus.Fields{
   167  				logfields.CESName: event.Key.String()}).Debug("Got Upsert Endpoint Slice event")
   168  			c.onSliceUpdate(event.Object)
   169  		case resource.Delete:
   170  			c.logger.WithFields(logrus.Fields{
   171  				logfields.CESName: event.Key.String()}).Debug("Got Delete Endpoint Slice event")
   172  			c.onSliceDelete(event.Object)
   173  		}
   174  		event.Done(nil)
   175  	}
   176  	return nil
   177  }
   178  
   179  func (c *Controller) runCiliumNodesUpdater(ctx context.Context) error {
   180  	ciliumNodesStore, err := c.ciliumNodes.Store(ctx)
   181  	if err != nil {
   182  		c.logger.WithError(err).Warn("Couldn't get CiliumNodes store")
   183  		return err
   184  	}
   185  	for event := range c.ciliumNodes.Events(ctx) {
   186  		event.Done(nil)
   187  		totalNodes := len(ciliumNodesStore.List())
   188  		if c.rateLimit.updateRateLimiterWithNodes(totalNodes) {
   189  			c.logger.WithFields(logrus.Fields{
   190  				logfields.WorkQueueQPSLimit:   c.rateLimit.current.Limit,
   191  				logfields.WorkQueueBurstLimit: c.rateLimit.current.Burst,
   192  			}).Info("Updated CES controller workqueue configuration")
   193  		}
   194  	}
   195  	return nil
   196  }
   197  
   198  // Sync all CESs from cesStore to manager cache.
   199  // Note: CESs are synced locally before CES controller running and this is required.
   200  func (c *Controller) syncCESsInLocalCache(ctx context.Context) error {
   201  	store, err := c.ciliumEndpointSlice.Store(ctx)
   202  	if err != nil {
   203  		c.logger.WithError(err).Warn("Error getting CES Store")
   204  		return err
   205  	}
   206  	for _, ces := range store.List() {
   207  		cesName := c.manager.initializeMappingForCES(ces)
   208  		for _, cep := range ces.Endpoints {
   209  			c.manager.initializeMappingCEPtoCES(&cep, ces.Namespace, cesName)
   210  		}
   211  	}
   212  	c.logger.Debug("Successfully synced all CESs locally")
   213  	return nil
   214  }
   215  
   216  // worker runs a worker thread that just dequeues items, processes them, and
   217  // marks them done.
   218  func (c *Controller) worker() {
   219  	for c.processNextWorkItem() {
   220  	}
   221  }
   222  
   223  func (c *Controller) rateLimitProcessing() {
   224  	delay := c.rateLimit.getDelay()
   225  	select {
   226  	case <-c.context.Done():
   227  	case <-time.After(delay):
   228  	}
   229  }
   230  
   231  func (c *Controller) processNextWorkItem() bool {
   232  	c.rateLimitProcessing()
   233  	cKey, quit := c.queue.Get()
   234  	if quit {
   235  		return false
   236  	}
   237  	key := cKey.(CESName)
   238  	c.logger.WithFields(logrus.Fields{
   239  		logfields.CESName: key.string(),
   240  	}).Debug("Processing CES")
   241  	defer c.queue.Done(key)
   242  
   243  	queueDelay := c.getAndResetCESProcessingDelay(key)
   244  	err := c.reconciler.reconcileCES(key)
   245  	c.metrics.CiliumEndpointSliceQueueDelay.Observe(queueDelay)
   246  	if err != nil {
   247  		c.metrics.CiliumEndpointSliceSyncTotal.WithLabelValues(LabelValueOutcomeFail).Inc()
   248  	} else {
   249  		c.metrics.CiliumEndpointSliceSyncTotal.WithLabelValues(LabelValueOutcomeSuccess).Inc()
   250  	}
   251  
   252  	c.handleErr(err, key)
   253  
   254  	return true
   255  }
   256  
   257  func (c *Controller) handleErr(err error, key CESName) {
   258  	if err == nil {
   259  		c.queue.Forget(key)
   260  		return
   261  	}
   262  
   263  	if c.queue.NumRequeues(key) < maxRetries {
   264  		c.queue.AddRateLimited(key)
   265  		return
   266  	}
   267  
   268  	// Drop the CES from queue, we maxed out retries.
   269  	c.logger.WithError(err).WithFields(logrus.Fields{
   270  		logfields.CESName: key.string(),
   271  	}).Error("Dropping the CES from queue, exceeded maxRetries")
   272  	c.queue.Forget(key)
   273  }