k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/nodeipam/ipam/range_allocator.go (about) 1 /* 2 Copyright 2016 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package ipam 18 19 import ( 20 "context" 21 "fmt" 22 "net" 23 "time" 24 25 v1 "k8s.io/api/core/v1" 26 "k8s.io/apimachinery/pkg/util/wait" 27 "k8s.io/klog/v2" 28 netutils "k8s.io/utils/net" 29 30 apierrors "k8s.io/apimachinery/pkg/api/errors" 31 "k8s.io/apimachinery/pkg/types" 32 utilruntime "k8s.io/apimachinery/pkg/util/runtime" 33 informers "k8s.io/client-go/informers/core/v1" 34 clientset "k8s.io/client-go/kubernetes" 35 "k8s.io/client-go/kubernetes/scheme" 36 v1core "k8s.io/client-go/kubernetes/typed/core/v1" 37 corelisters "k8s.io/client-go/listers/core/v1" 38 "k8s.io/client-go/tools/cache" 39 "k8s.io/client-go/tools/record" 40 "k8s.io/client-go/util/workqueue" 41 nodeutil "k8s.io/component-helpers/node/util" 42 "k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidrset" 43 controllerutil "k8s.io/kubernetes/pkg/controller/util/node" 44 ) 45 46 type rangeAllocator struct { 47 client clientset.Interface 48 // cluster cidrs as passed in during controller creation 49 clusterCIDRs []*net.IPNet 50 // for each entry in clusterCIDRs we maintain a list of what is used and what is not 51 cidrSets []*cidrset.CidrSet 52 // nodeLister is able to list/get nodes and is populated by the shared informer passed to controller 53 nodeLister corelisters.NodeLister 54 // nodesSynced returns true if the node shared informer has been synced at least once. 55 nodesSynced cache.InformerSynced 56 broadcaster record.EventBroadcaster 57 recorder record.EventRecorder 58 59 // queues are where incoming work is placed to de-dup and to allow "easy" 60 // rate limited requeues on errors 61 queue workqueue.RateLimitingInterface 62 } 63 64 var _ CIDRAllocator = &rangeAllocator{} 65 66 // NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDRs for node (one from each of clusterCIDRs) 67 // Caller must ensure subNetMaskSize is not less than cluster CIDR mask size. 68 // Caller must always pass in a list of existing nodes so the new allocator. 69 // Caller must ensure that ClusterCIDRs are semantically correct e.g (1 for non DualStack, 2 for DualStack etc..) 70 // can initialize its CIDR map. NodeList is only nil in testing. 71 func NewCIDRRangeAllocator(ctx context.Context, client clientset.Interface, nodeInformer informers.NodeInformer, allocatorParams CIDRAllocatorParams, nodeList *v1.NodeList) (CIDRAllocator, error) { 72 logger := klog.FromContext(ctx) 73 if client == nil { 74 logger.Error(nil, "kubeClient is nil when starting CIDRRangeAllocator") 75 klog.FlushAndExit(klog.ExitFlushTimeout, 1) 76 } 77 78 eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx)) 79 recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cidrAllocator"}) 80 81 // create a cidrSet for each cidr we operate on 82 // cidrSet are mapped to clusterCIDR by index 83 cidrSets := make([]*cidrset.CidrSet, len(allocatorParams.ClusterCIDRs)) 84 for idx, cidr := range allocatorParams.ClusterCIDRs { 85 cidrSet, err := cidrset.NewCIDRSet(cidr, allocatorParams.NodeCIDRMaskSizes[idx]) 86 if err != nil { 87 return nil, err 88 } 89 cidrSets[idx] = cidrSet 90 } 91 92 ra := &rangeAllocator{ 93 client: client, 94 clusterCIDRs: allocatorParams.ClusterCIDRs, 95 cidrSets: cidrSets, 96 nodeLister: nodeInformer.Lister(), 97 nodesSynced: nodeInformer.Informer().HasSynced, 98 broadcaster: eventBroadcaster, 99 recorder: recorder, 100 queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cidrallocator_node"), 101 } 102 103 if allocatorParams.ServiceCIDR != nil { 104 ra.filterOutServiceRange(logger, allocatorParams.ServiceCIDR) 105 } else { 106 logger.Info("No Service CIDR provided. Skipping filtering out service addresses") 107 } 108 109 if allocatorParams.SecondaryServiceCIDR != nil { 110 ra.filterOutServiceRange(logger, allocatorParams.SecondaryServiceCIDR) 111 } else { 112 logger.Info("No Secondary Service CIDR provided. Skipping filtering out secondary service addresses") 113 } 114 115 if nodeList != nil { 116 for _, node := range nodeList.Items { 117 if len(node.Spec.PodCIDRs) == 0 { 118 logger.V(4).Info("Node has no CIDR, ignoring", "node", klog.KObj(&node)) 119 continue 120 } 121 logger.V(4).Info("Node has CIDR, occupying it in CIDR map", "node", klog.KObj(&node), "podCIDR", node.Spec.PodCIDR) 122 if err := ra.occupyCIDRs(&node); err != nil { 123 // This will happen if: 124 // 1. We find garbage in the podCIDRs field. Retrying is useless. 125 // 2. CIDR out of range: This means a node CIDR has changed. 126 // This error will keep crashing controller-manager. 127 return nil, err 128 } 129 } 130 } 131 132 nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 133 AddFunc: func(obj interface{}) { 134 key, err := cache.MetaNamespaceKeyFunc(obj) 135 if err == nil { 136 ra.queue.Add(key) 137 } 138 }, 139 UpdateFunc: func(old, new interface{}) { 140 key, err := cache.MetaNamespaceKeyFunc(new) 141 if err == nil { 142 ra.queue.Add(key) 143 } 144 }, 145 DeleteFunc: func(obj interface{}) { 146 // The informer cache no longer has the object, and since Node doesn't have a finalizer, 147 // we don't see the Update with DeletionTimestamp != 0. 148 // TODO: instead of executing the operation directly in the handler, build a small cache with key node.Name 149 // and value PodCIDRs use ReleaseCIDR on the reconcile loop so we can retry on `ReleaseCIDR` failures. 150 if err := ra.ReleaseCIDR(logger, obj.(*v1.Node)); err != nil { 151 utilruntime.HandleError(fmt.Errorf("error while processing CIDR Release: %w", err)) 152 } 153 // IndexerInformer uses a delta nodeQueue, therefore for deletes we have to use this 154 // key function. 155 key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) 156 if err == nil { 157 ra.queue.Add(key) 158 } 159 }, 160 }) 161 162 return ra, nil 163 } 164 165 func (r *rangeAllocator) Run(ctx context.Context) { 166 defer utilruntime.HandleCrash() 167 168 // Start event processing pipeline. 169 r.broadcaster.StartStructuredLogging(3) 170 logger := klog.FromContext(ctx) 171 logger.Info("Sending events to api server") 172 r.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: r.client.CoreV1().Events("")}) 173 defer r.broadcaster.Shutdown() 174 175 defer r.queue.ShutDown() 176 177 logger.Info("Starting range CIDR allocator") 178 defer logger.Info("Shutting down range CIDR allocator") 179 180 if !cache.WaitForNamedCacheSync("cidrallocator", ctx.Done(), r.nodesSynced) { 181 return 182 } 183 184 for i := 0; i < cidrUpdateWorkers; i++ { 185 go wait.UntilWithContext(ctx, r.runWorker, time.Second) 186 } 187 188 <-ctx.Done() 189 } 190 191 // runWorker is a long-running function that will continually call the 192 // processNextWorkItem function in order to read and process a message on the 193 // queue. 194 func (r *rangeAllocator) runWorker(ctx context.Context) { 195 for r.processNextNodeWorkItem(ctx) { 196 } 197 } 198 199 // processNextWorkItem will read a single work item off the queue and 200 // attempt to process it, by calling the syncHandler. 201 func (r *rangeAllocator) processNextNodeWorkItem(ctx context.Context) bool { 202 obj, shutdown := r.queue.Get() 203 if shutdown { 204 return false 205 } 206 207 // We wrap this block in a func so we can defer r.queue.Done. 208 err := func(logger klog.Logger, obj interface{}) error { 209 // We call Done here so the workNodeQueue knows we have finished 210 // processing this item. We also must remember to call Forget if we 211 // do not want this work item being re-queued. For example, we do 212 // not call Forget if a transient error occurs, instead the item is 213 // put back on the queue and attempted again after a back-off 214 // period. 215 defer r.queue.Done(obj) 216 var key string 217 var ok bool 218 // We expect strings to come off the workNodeQueue. These are of the 219 // form namespace/name. We do this as the delayed nature of the 220 // workNodeQueue means the items in the informer cache may actually be 221 // more up to date that when the item was initially put onto the 222 // workNodeQueue. 223 if key, ok = obj.(string); !ok { 224 // As the item in the workNodeQueue is actually invalid, we call 225 // Forget here else we'd go into a loop of attempting to 226 // process a work item that is invalid. 227 r.queue.Forget(obj) 228 utilruntime.HandleError(fmt.Errorf("expected string in workNodeQueue but got %#v", obj)) 229 return nil 230 } 231 // Run the syncHandler, passing it the namespace/name string of the 232 // Foo resource to be synced. 233 if err := r.syncNode(ctx, key); err != nil { 234 // Put the item back on the queue to handle any transient errors. 235 r.queue.AddRateLimited(key) 236 return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()) 237 } 238 // Finally, if no error occurs we Forget this item so it does not 239 // get queue again until another change happens. 240 r.queue.Forget(obj) 241 logger.Info("Successfully synced", "key", key) 242 return nil 243 }(klog.FromContext(ctx), obj) 244 245 if err != nil { 246 utilruntime.HandleError(err) 247 return true 248 } 249 250 return true 251 } 252 253 func (r *rangeAllocator) syncNode(ctx context.Context, key string) error { 254 logger := klog.FromContext(ctx) 255 startTime := time.Now() 256 defer func() { 257 logger.V(4).Info("Finished syncing Node request", "node", key, "elapsed", time.Since(startTime)) 258 }() 259 260 node, err := r.nodeLister.Get(key) 261 if apierrors.IsNotFound(err) { 262 logger.V(3).Info("node has been deleted", "node", key) 263 // TODO: obtain the node object information to call ReleaseCIDR from here 264 // and retry if there is an error. 265 return nil 266 } 267 if err != nil { 268 return err 269 } 270 // Check the DeletionTimestamp to determine if object is under deletion. 271 if !node.DeletionTimestamp.IsZero() { 272 logger.V(3).Info("node is being deleted", "node", key) 273 return r.ReleaseCIDR(logger, node) 274 } 275 return r.AllocateOrOccupyCIDR(ctx, node) 276 } 277 278 // marks node.PodCIDRs[...] as used in allocator's tracked cidrSet 279 func (r *rangeAllocator) occupyCIDRs(node *v1.Node) error { 280 if len(node.Spec.PodCIDRs) == 0 { 281 return nil 282 } 283 for idx, cidr := range node.Spec.PodCIDRs { 284 _, podCIDR, err := netutils.ParseCIDRSloppy(cidr) 285 if err != nil { 286 return fmt.Errorf("failed to parse node %s, CIDR %s", node.Name, node.Spec.PodCIDR) 287 } 288 // If node has a pre allocate cidr that does not exist in our cidrs. 289 // This will happen if cluster went from dualstack(multi cidrs) to non-dualstack 290 // then we have now way of locking it 291 if idx >= len(r.cidrSets) { 292 return fmt.Errorf("node:%s has an allocated cidr: %v at index:%v that does not exist in cluster cidrs configuration", node.Name, cidr, idx) 293 } 294 295 if err := r.cidrSets[idx].Occupy(podCIDR); err != nil { 296 return fmt.Errorf("failed to mark cidr[%v] at idx [%v] as occupied for node: %v: %v", podCIDR, idx, node.Name, err) 297 } 298 } 299 return nil 300 } 301 302 // WARNING: If you're adding any return calls or defer any more work from this 303 // function you have to make sure to update nodesInProcessing properly with the 304 // disposition of the node when the work is done. 305 func (r *rangeAllocator) AllocateOrOccupyCIDR(ctx context.Context, node *v1.Node) error { 306 if node == nil { 307 return nil 308 } 309 310 if len(node.Spec.PodCIDRs) > 0 { 311 return r.occupyCIDRs(node) 312 } 313 314 logger := klog.FromContext(ctx) 315 allocatedCIDRs := make([]*net.IPNet, len(r.cidrSets)) 316 317 for idx := range r.cidrSets { 318 podCIDR, err := r.cidrSets[idx].AllocateNext() 319 if err != nil { 320 controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRNotAvailable") 321 return fmt.Errorf("failed to allocate cidr from cluster cidr at idx:%v: %v", idx, err) 322 } 323 allocatedCIDRs[idx] = podCIDR 324 } 325 326 //queue the assignment 327 logger.V(4).Info("Putting node with CIDR into the work queue", "node", klog.KObj(node), "CIDRs", allocatedCIDRs) 328 return r.updateCIDRsAllocation(ctx, node.Name, allocatedCIDRs) 329 } 330 331 // ReleaseCIDR marks node.podCIDRs[...] as unused in our tracked cidrSets 332 func (r *rangeAllocator) ReleaseCIDR(logger klog.Logger, node *v1.Node) error { 333 if node == nil || len(node.Spec.PodCIDRs) == 0 { 334 return nil 335 } 336 337 for idx, cidr := range node.Spec.PodCIDRs { 338 _, podCIDR, err := netutils.ParseCIDRSloppy(cidr) 339 if err != nil { 340 return fmt.Errorf("failed to parse CIDR %s on Node %v: %v", cidr, node.Name, err) 341 } 342 343 // If node has a pre allocate cidr that does not exist in our cidrs. 344 // This will happen if cluster went from dualstack(multi cidrs) to non-dualstack 345 // then we have now way of locking it 346 if idx >= len(r.cidrSets) { 347 return fmt.Errorf("node:%s has an allocated cidr: %v at index:%v that does not exist in cluster cidrs configuration", node.Name, cidr, idx) 348 } 349 350 logger.V(4).Info("Release CIDR for node", "CIDR", cidr, "node", klog.KObj(node)) 351 if err = r.cidrSets[idx].Release(podCIDR); err != nil { 352 return fmt.Errorf("error when releasing CIDR %v: %v", cidr, err) 353 } 354 } 355 return nil 356 } 357 358 // Marks all CIDRs with subNetMaskSize that belongs to serviceCIDR as used across all cidrs 359 // so that they won't be assignable. 360 func (r *rangeAllocator) filterOutServiceRange(logger klog.Logger, serviceCIDR *net.IPNet) { 361 // Checks if service CIDR has a nonempty intersection with cluster 362 // CIDR. It is the case if either clusterCIDR contains serviceCIDR with 363 // clusterCIDR's Mask applied (this means that clusterCIDR contains 364 // serviceCIDR) or vice versa (which means that serviceCIDR contains 365 // clusterCIDR). 366 for idx, cidr := range r.clusterCIDRs { 367 // if they don't overlap then ignore the filtering 368 if !cidr.Contains(serviceCIDR.IP.Mask(cidr.Mask)) && !serviceCIDR.Contains(cidr.IP.Mask(serviceCIDR.Mask)) { 369 continue 370 } 371 372 // at this point, len(cidrSet) == len(clusterCidr) 373 if err := r.cidrSets[idx].Occupy(serviceCIDR); err != nil { 374 logger.Error(err, "Error filtering out service cidr out cluster cidr", "CIDR", cidr, "index", idx, "serviceCIDR", serviceCIDR) 375 } 376 } 377 } 378 379 // updateCIDRsAllocation assigns CIDR to Node and sends an update to the API server. 380 func (r *rangeAllocator) updateCIDRsAllocation(ctx context.Context, nodeName string, allocatedCIDRs []*net.IPNet) error { 381 var err error 382 var node *v1.Node 383 logger := klog.FromContext(ctx) 384 cidrsString := ipnetToStringList(allocatedCIDRs) 385 node, err = r.nodeLister.Get(nodeName) 386 if err != nil { 387 logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", nodeName)) 388 return err 389 } 390 391 // if cidr list matches the proposed. 392 // then we possibly updated this node 393 // and just failed to ack the success. 394 if len(node.Spec.PodCIDRs) == len(allocatedCIDRs) { 395 match := true 396 for idx, cidr := range cidrsString { 397 if node.Spec.PodCIDRs[idx] != cidr { 398 match = false 399 break 400 } 401 } 402 if match { 403 logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", allocatedCIDRs) 404 return nil 405 } 406 } 407 408 // node has cidrs, release the reserved 409 if len(node.Spec.PodCIDRs) != 0 { 410 logger.Error(nil, "Node already has a CIDR allocated. Releasing the new one", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs) 411 for idx, cidr := range allocatedCIDRs { 412 if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil { 413 logger.Error(releaseErr, "Error when releasing CIDR", "index", idx, "CIDR", cidr) 414 } 415 } 416 return nil 417 } 418 419 // If we reached here, it means that the node has no CIDR currently assigned. So we set it. 420 for i := 0; i < cidrUpdateRetries; i++ { 421 if err = nodeutil.PatchNodeCIDRs(ctx, r.client, types.NodeName(node.Name), cidrsString); err == nil { 422 logger.Info("Set node PodCIDR", "node", klog.KObj(node), "podCIDRs", cidrsString) 423 return nil 424 } 425 } 426 // failed release back to the pool 427 logger.Error(err, "Failed to update node PodCIDR after multiple attempts", "node", klog.KObj(node), "podCIDRs", cidrsString) 428 controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRAssignmentFailed") 429 // We accept the fact that we may leak CIDRs here. This is safer than releasing 430 // them in case when we don't know if request went through. 431 // NodeController restart will return all falsely allocated CIDRs to the pool. 432 if !apierrors.IsServerTimeout(err) { 433 logger.Error(err, "CIDR assignment for node failed. Releasing allocated CIDR", "node", klog.KObj(node)) 434 for idx, cidr := range allocatedCIDRs { 435 if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil { 436 logger.Error(releaseErr, "Error releasing allocated CIDR for node", "node", klog.KObj(node)) 437 } 438 } 439 } 440 return err 441 }