github.com/cilium/cilium@v1.16.2/operator/pkg/ciliumendpointslice/endpointslice.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package ciliumendpointslice 5 6 import ( 7 "context" 8 "time" 9 10 "github.com/cilium/hive/cell" 11 "github.com/cilium/workerpool" 12 "github.com/sirupsen/logrus" 13 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 14 "k8s.io/client-go/util/workqueue" 15 16 "github.com/cilium/cilium/pkg/k8s" 17 cilium_api_v2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2" 18 capi_v2a1 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2alpha1" 19 "github.com/cilium/cilium/pkg/k8s/resource" 20 "github.com/cilium/cilium/pkg/logging/logfields" 21 ) 22 23 const ( 24 // cesNamePrefix is the prefix name added for the CiliumEndpointSlice 25 // resource. 26 cesNamePrefix = "ces" 27 28 // defaultSyncBackOff is the default backoff period for cesSync calls. 29 defaultSyncBackOff = 1 * time.Second 30 // maxSyncBackOff is the max backoff period for cesSync calls. 31 maxSyncBackOff = 100 * time.Second 32 // maxRetries is the number of times a cesSync will be retried before it is 33 // dropped out of the queue. 34 maxRetries = 15 35 // CEPs are batched into a CES, based on its Identity 36 cesIdentityBasedSlicing = "cesSliceModeIdentity" 37 // Default CES Synctime, multiple consecutive syncs with k8s-apiserver are 38 // batched and synced together after a short delay. 39 DefaultCESSyncTime = 500 * time.Millisecond 40 41 CESWriteQPSLimitMax = 50 42 CESWriteQPSBurstMax = 100 43 ) 44 45 func (c *Controller) initializeQueue() { 46 c.logger.WithFields(logrus.Fields{ 47 logfields.WorkQueueQPSLimit: c.rateLimit.current.Limit, 48 logfields.WorkQueueBurstLimit: c.rateLimit.current.Burst, 49 logfields.WorkQueueSyncBackOff: defaultSyncBackOff, 50 }).Info("CES controller workqueue configuration") 51 c.queue = workqueue.NewRateLimitingQueueWithConfig( 52 workqueue.NewItemExponentialFailureRateLimiter(defaultSyncBackOff, maxSyncBackOff), 53 workqueue.RateLimitingQueueConfig{Name: "cilium_endpoint_slice"}) 54 } 55 56 func (c *Controller) onEndpointUpdate(cep *cilium_api_v2.CiliumEndpoint) { 57 if cep.Status.Networking == nil || cep.Status.Identity == nil || cep.GetName() == "" || cep.Namespace == "" { 58 return 59 } 60 touchedCESs := c.manager.UpdateCEPMapping(k8s.ConvertCEPToCoreCEP(cep), cep.Namespace) 61 c.enqueueCESReconciliation(touchedCESs) 62 } 63 64 func (c *Controller) onEndpointDelete(cep *cilium_api_v2.CiliumEndpoint) { 65 touchedCES := c.manager.RemoveCEPMapping(k8s.ConvertCEPToCoreCEP(cep), cep.Namespace) 66 c.enqueueCESReconciliation([]CESName{touchedCES}) 67 } 68 69 func (c *Controller) onSliceUpdate(ces *capi_v2a1.CiliumEndpointSlice) { 70 c.enqueueCESReconciliation([]CESName{NewCESName(ces.Name)}) 71 } 72 73 func (c *Controller) onSliceDelete(ces *capi_v2a1.CiliumEndpointSlice) { 74 c.enqueueCESReconciliation([]CESName{NewCESName(ces.Name)}) 75 } 76 77 func (c *Controller) enqueueCESReconciliation(cess []CESName) { 78 for _, ces := range cess { 79 c.logger.WithFields(logrus.Fields{ 80 logfields.CESName: ces.string(), 81 }).Debug("Enqueueing CES (if not empty name)") 82 if ces.Name != "" { 83 c.enqueuedAtLock.Lock() 84 if c.enqueuedAt[ces].IsZero() { 85 c.enqueuedAt[ces] = time.Now() 86 } 87 c.enqueuedAtLock.Unlock() 88 c.queue.AddAfter(ces, DefaultCESSyncTime) 89 } 90 } 91 } 92 93 func (c *Controller) getAndResetCESProcessingDelay(ces CESName) float64 { 94 c.enqueuedAtLock.Lock() 95 defer c.enqueuedAtLock.Unlock() 96 enqueued, exists := c.enqueuedAt[ces] 97 if !exists { 98 return 0 99 } 100 if !enqueued.IsZero() { 101 delay := time.Since(enqueued) 102 c.enqueuedAt[ces] = time.Time{} 103 return delay.Seconds() 104 } 105 return 0 106 } 107 108 // start the worker thread, reconciles the modified CESs with api-server 109 func (c *Controller) Start(ctx cell.HookContext) error { 110 c.logger.Info("Bootstrap ces controller") 111 c.context, c.contextCancel = context.WithCancel(context.Background()) 112 defer utilruntime.HandleCrash() 113 if c.slicingMode == cesIdentityBasedSlicing { 114 c.manager = newCESManagerIdentity(c.maxCEPsInCES, c.logger) 115 } else { 116 c.manager = newCESManagerFcfs(c.maxCEPsInCES, c.logger) 117 } 118 c.reconciler = newReconciler(c.context, c.clientset.CiliumV2alpha1(), c.manager, c.logger, c.ciliumEndpoint, c.ciliumEndpointSlice, c.metrics) 119 120 c.initializeQueue() 121 122 if err := c.syncCESsInLocalCache(ctx); err != nil { 123 return err 124 } 125 126 // Start the work pools processing CEP events only after syncing CES in local cache. 127 c.wp = workerpool.New(3) 128 c.wp.Submit("cilium-endpoints-updater", c.runCiliumEndpointsUpdater) 129 c.wp.Submit("cilium-endpoint-slices-updater", c.runCiliumEndpointSliceUpdater) 130 c.wp.Submit("cilium-nodes-updater", c.runCiliumNodesUpdater) 131 132 c.logger.Info("Starting CES controller reconciler.") 133 go c.worker() 134 135 return nil 136 } 137 138 func (c *Controller) Stop(ctx cell.HookContext) error { 139 c.wp.Close() 140 c.queue.ShutDown() 141 c.contextCancel() 142 return nil 143 } 144 145 func (c *Controller) runCiliumEndpointsUpdater(ctx context.Context) error { 146 for event := range c.ciliumEndpoint.Events(ctx) { 147 switch event.Kind { 148 case resource.Upsert: 149 c.logger.WithFields(logrus.Fields{ 150 logfields.CEPName: event.Key.String()}).Debug("Got Upsert Endpoint event") 151 c.onEndpointUpdate(event.Object) 152 case resource.Delete: 153 c.logger.WithFields(logrus.Fields{ 154 logfields.CEPName: event.Key.String()}).Debug("Got Delete Endpoint event") 155 c.onEndpointDelete(event.Object) 156 } 157 event.Done(nil) 158 } 159 return nil 160 } 161 162 func (c *Controller) runCiliumEndpointSliceUpdater(ctx context.Context) error { 163 for event := range c.ciliumEndpointSlice.Events(ctx) { 164 switch event.Kind { 165 case resource.Upsert: 166 c.logger.WithFields(logrus.Fields{ 167 logfields.CESName: event.Key.String()}).Debug("Got Upsert Endpoint Slice event") 168 c.onSliceUpdate(event.Object) 169 case resource.Delete: 170 c.logger.WithFields(logrus.Fields{ 171 logfields.CESName: event.Key.String()}).Debug("Got Delete Endpoint Slice event") 172 c.onSliceDelete(event.Object) 173 } 174 event.Done(nil) 175 } 176 return nil 177 } 178 179 func (c *Controller) runCiliumNodesUpdater(ctx context.Context) error { 180 ciliumNodesStore, err := c.ciliumNodes.Store(ctx) 181 if err != nil { 182 c.logger.WithError(err).Warn("Couldn't get CiliumNodes store") 183 return err 184 } 185 for event := range c.ciliumNodes.Events(ctx) { 186 event.Done(nil) 187 totalNodes := len(ciliumNodesStore.List()) 188 if c.rateLimit.updateRateLimiterWithNodes(totalNodes) { 189 c.logger.WithFields(logrus.Fields{ 190 logfields.WorkQueueQPSLimit: c.rateLimit.current.Limit, 191 logfields.WorkQueueBurstLimit: c.rateLimit.current.Burst, 192 }).Info("Updated CES controller workqueue configuration") 193 } 194 } 195 return nil 196 } 197 198 // Sync all CESs from cesStore to manager cache. 199 // Note: CESs are synced locally before CES controller running and this is required. 200 func (c *Controller) syncCESsInLocalCache(ctx context.Context) error { 201 store, err := c.ciliumEndpointSlice.Store(ctx) 202 if err != nil { 203 c.logger.WithError(err).Warn("Error getting CES Store") 204 return err 205 } 206 for _, ces := range store.List() { 207 cesName := c.manager.initializeMappingForCES(ces) 208 for _, cep := range ces.Endpoints { 209 c.manager.initializeMappingCEPtoCES(&cep, ces.Namespace, cesName) 210 } 211 } 212 c.logger.Debug("Successfully synced all CESs locally") 213 return nil 214 } 215 216 // worker runs a worker thread that just dequeues items, processes them, and 217 // marks them done. 218 func (c *Controller) worker() { 219 for c.processNextWorkItem() { 220 } 221 } 222 223 func (c *Controller) rateLimitProcessing() { 224 delay := c.rateLimit.getDelay() 225 select { 226 case <-c.context.Done(): 227 case <-time.After(delay): 228 } 229 } 230 231 func (c *Controller) processNextWorkItem() bool { 232 c.rateLimitProcessing() 233 cKey, quit := c.queue.Get() 234 if quit { 235 return false 236 } 237 key := cKey.(CESName) 238 c.logger.WithFields(logrus.Fields{ 239 logfields.CESName: key.string(), 240 }).Debug("Processing CES") 241 defer c.queue.Done(key) 242 243 queueDelay := c.getAndResetCESProcessingDelay(key) 244 err := c.reconciler.reconcileCES(key) 245 c.metrics.CiliumEndpointSliceQueueDelay.Observe(queueDelay) 246 if err != nil { 247 c.metrics.CiliumEndpointSliceSyncTotal.WithLabelValues(LabelValueOutcomeFail).Inc() 248 } else { 249 c.metrics.CiliumEndpointSliceSyncTotal.WithLabelValues(LabelValueOutcomeSuccess).Inc() 250 } 251 252 c.handleErr(err, key) 253 254 return true 255 } 256 257 func (c *Controller) handleErr(err error, key CESName) { 258 if err == nil { 259 c.queue.Forget(key) 260 return 261 } 262 263 if c.queue.NumRequeues(key) < maxRetries { 264 c.queue.AddRateLimited(key) 265 return 266 } 267 268 // Drop the CES from queue, we maxed out retries. 269 c.logger.WithError(err).WithFields(logrus.Fields{ 270 logfields.CESName: key.string(), 271 }).Error("Dropping the CES from queue, exceeded maxRetries") 272 c.queue.Forget(key) 273 }