k8s.io/kubernetes@v1.29.3/pkg/controller/nodeipam/ipam/range_allocator.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ipam

import (
	"context"
	"fmt"
	"net"
	"sync"

	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"
	netutils "k8s.io/utils/net"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	informers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	nodeutil "k8s.io/component-helpers/node/util"
	"k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidrset"
	controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
)

type rangeAllocator struct {
	client clientset.Interface
	// cluster cidrs as passed in during controller creation
	clusterCIDRs []*net.IPNet
	// for each entry in clusterCIDRs we maintain a list of what is used and what is not
	cidrSets []*cidrset.CidrSet
	// nodeLister is able to list/get nodes and is populated by the shared informer passed to the controller
	nodeLister corelisters.NodeLister
	// nodesSynced returns true if the node shared informer has been synced at least once.
	nodesSynced cache.InformerSynced
	// Channel that is used to pass updating Nodes and their reserved CIDRs to the background workers.
	// This increases the throughput of CIDR assignment by not blocking on long operations.
	nodeCIDRUpdateChannel chan nodeReservedCIDRs
	broadcaster           record.EventBroadcaster
	recorder              record.EventRecorder
	// Keep a set of nodes that are currently being processed to avoid races in CIDR allocation
	lock              sync.Mutex
	nodesInProcessing sets.String
}
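
// Illustrative addition, not part of the upstream file: a compile-time
// assertion that *rangeAllocator satisfies the CIDRAllocator interface that
// NewCIDRRangeAllocator below returns. It documents the relationship between
// the concrete type and the interface without changing any behavior.
var _ CIDRAllocator = &rangeAllocator{}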

// NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDRs for nodes (one from each of clusterCIDRs).
// Caller must ensure subNetMaskSize is not less than the cluster CIDR mask size.
// Caller must always pass in a list of existing nodes so the new allocator
// can initialize its CIDR map; NodeList is only nil in testing.
// Caller must ensure that ClusterCIDRs are semantically correct (e.g. 1 for non-DualStack, 2 for DualStack, etc.).
func NewCIDRRangeAllocator(logger klog.Logger, client clientset.Interface, nodeInformer informers.NodeInformer, allocatorParams CIDRAllocatorParams, nodeList *v1.NodeList) (CIDRAllocator, error) {
	if client == nil {
		logger.Error(nil, "kubeClient is nil when starting CIDRRangeAllocator")
		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
	}

	eventBroadcaster := record.NewBroadcaster()
	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cidrAllocator"})

	// create a cidrSet for each cidr we operate on
	// cidrSets are mapped to clusterCIDRs by index
	cidrSets := make([]*cidrset.CidrSet, len(allocatorParams.ClusterCIDRs))
	for idx, cidr := range allocatorParams.ClusterCIDRs {
		cidrSet, err := cidrset.NewCIDRSet(cidr, allocatorParams.NodeCIDRMaskSizes[idx])
		if err != nil {
			return nil, err
		}
		cidrSets[idx] = cidrSet
	}

	ra := &rangeAllocator{
		client:                client,
		clusterCIDRs:          allocatorParams.ClusterCIDRs,
		cidrSets:              cidrSets,
		nodeLister:            nodeInformer.Lister(),
		nodesSynced:           nodeInformer.Informer().HasSynced,
		nodeCIDRUpdateChannel: make(chan nodeReservedCIDRs, cidrUpdateQueueSize),
		broadcaster:           eventBroadcaster,
		recorder:              recorder,
		nodesInProcessing:     sets.NewString(),
	}

	if allocatorParams.ServiceCIDR != nil {
		ra.filterOutServiceRange(logger, allocatorParams.ServiceCIDR)
	} else {
		logger.Info("No Service CIDR provided. Skipping filtering out service addresses")
	}

	if allocatorParams.SecondaryServiceCIDR != nil {
		ra.filterOutServiceRange(logger, allocatorParams.SecondaryServiceCIDR)
	} else {
		logger.Info("No Secondary Service CIDR provided. Skipping filtering out secondary service addresses")
	}

	if nodeList != nil {
		for _, node := range nodeList.Items {
			if len(node.Spec.PodCIDRs) == 0 {
				logger.V(4).Info("Node has no CIDR, ignoring", "node", klog.KObj(&node))
				continue
			}
			logger.V(4).Info("Node has CIDR, occupying it in CIDR map", "node", klog.KObj(&node), "podCIDR", node.Spec.PodCIDR)
			if err := ra.occupyCIDRs(&node); err != nil {
				// This will happen if:
				// 1. We find garbage in the podCIDRs field. Retrying is useless.
				// 2. CIDR out of range: this means a node CIDR has changed.
				// This error will keep crashing controller-manager.
				return nil, err
			}
		}
	}

	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error {
			return ra.AllocateOrOccupyCIDR(logger, node)
		}),
		UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
			// If the PodCIDRs list is not empty we either:
			// - already processed a Node that already had CIDRs after NC restarted
			//   (cidr is marked as used),
			// - already processed a Node successfully and allocated CIDRs for it
			//   (cidr is marked as used),
			// - already processed a Node but saw a "timeout" response while the
			//   request eventually got through; in this case we haven't released
			//   the allocated CIDRs (cidr is still marked as used).
			// There's a possible error here:
			// - NC sees a new Node and assigns CIDRs X,Y.. to it,
			// - the Update Node call fails with a timeout,
			// - the Node is updated by some other component, NC sees the update and
			//   assigns CIDRs A,B.. to the Node,
			// - both CIDRs X,Y.. and A,B.. are marked as used in the local cache,
			//   even though the Node sees only CIDRs A,B..
			// The problem here is that the in-memory cache sees CIDRs X,Y.. as marked,
			// which prevents them from being assigned to any new node. The cluster
			// state is correct.
			// Restart of NC fixes the issue.
			if len(newNode.Spec.PodCIDRs) == 0 {
				return ra.AllocateOrOccupyCIDR(logger, newNode)
			}
			return nil
		}),
		DeleteFunc: controllerutil.CreateDeleteNodeHandler(logger, func(node *v1.Node) error {
			return ra.ReleaseCIDR(logger, node)
		}),
	})

	return ra, nil
}
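
// Illustrative sketch, not part of the upstream file: how the
// CIDRAllocatorParams consumed above are assumed to be filled in for a
// dual-stack cluster. The concrete CIDR literals and mask sizes are
// assumptions for the example only; note that NodeCIDRMaskSizes is paired
// with ClusterCIDRs by index, matching the cidrSets construction above.
func exampleAllocatorParams() CIDRAllocatorParams {
	// Example values are assumed valid, so parse errors are ignored here.
	_, clusterV4, _ := netutils.ParseCIDRSloppy("10.244.0.0/16")
	_, clusterV6, _ := netutils.ParseCIDRSloppy("fd00:10:244::/56")
	_, serviceV4, _ := netutils.ParseCIDRSloppy("10.96.0.0/12")
	return CIDRAllocatorParams{
		ClusterCIDRs:      []*net.IPNet{clusterV4, clusterV6},
		ServiceCIDR:       serviceV4,
		NodeCIDRMaskSizes: []int{24, 64}, // a /24 per node for IPv4, a /64 per node for IPv6
	}
}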

func (r *rangeAllocator) Run(ctx context.Context) {
	defer utilruntime.HandleCrash()

	// Start event processing pipeline.
	r.broadcaster.StartStructuredLogging(0)
	logger := klog.FromContext(ctx)
	logger.Info("Sending events to api server")
	r.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: r.client.CoreV1().Events("")})
	defer r.broadcaster.Shutdown()

	logger.Info("Starting range CIDR allocator")
	defer logger.Info("Shutting down range CIDR allocator")

	if !cache.WaitForNamedCacheSync("cidrallocator", ctx.Done(), r.nodesSynced) {
		return
	}

	for i := 0; i < cidrUpdateWorkers; i++ {
		go r.worker(ctx)
	}

	<-ctx.Done()
}

func (r *rangeAllocator) worker(ctx context.Context) {
	logger := klog.FromContext(ctx)
	for {
		select {
		case workItem, ok := <-r.nodeCIDRUpdateChannel:
			if !ok {
				logger.Info("Channel nodeCIDRUpdateChannel was unexpectedly closed")
				return
			}
			if err := r.updateCIDRsAllocation(logger, workItem); err != nil {
				// Requeue the failed node for update again.
				r.nodeCIDRUpdateChannel <- workItem
			}
		case <-ctx.Done():
			return
		}
	}
}

func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool {
	r.lock.Lock()
	defer r.lock.Unlock()
	if r.nodesInProcessing.Has(nodeName) {
		return false
	}
	r.nodesInProcessing.Insert(nodeName)
	return true
}

func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.nodesInProcessing.Delete(nodeName)
}

// occupyCIDRs marks node.Spec.PodCIDRs[...] as used in the allocator's tracked cidrSets.
func (r *rangeAllocator) occupyCIDRs(node *v1.Node) error {
	defer r.removeNodeFromProcessing(node.Name)
	if len(node.Spec.PodCIDRs) == 0 {
		return nil
	}
	for idx, cidr := range node.Spec.PodCIDRs {
		_, podCIDR, err := netutils.ParseCIDRSloppy(cidr)
		if err != nil {
			return fmt.Errorf("failed to parse CIDR %s on node %s: %v", cidr, node.Name, err)
		}
		// The node has a pre-allocated CIDR at an index that does not exist in our cidrSets.
		// This will happen if the cluster went from dual-stack (multiple CIDRs) to
		// non-dual-stack; then we have no way of tracking it.
		if idx >= len(r.cidrSets) {
			return fmt.Errorf("node:%s has an allocated cidr: %v at index:%v that does not exist in cluster cidrs configuration", node.Name, cidr, idx)
		}

		if err := r.cidrSets[idx].Occupy(podCIDR); err != nil {
			return fmt.Errorf("failed to mark cidr[%v] at idx [%v] as occupied for node: %v: %v", podCIDR, idx, node.Name, err)
		}
	}
	return nil
}
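
// Illustrative example, not part of the upstream file: for a dual-stack node
// with Spec.PodCIDRs = ["10.244.1.0/24", "fd00:10:244:1::/64"], occupyCIDRs
// above marks the first entry as used in cidrSets[0] and the second in
// cidrSets[1]. If the cluster were later reconfigured with a single cluster
// CIDR, the second entry would trip the idx >= len(r.cidrSets) check and the
// call would return an error.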

// WARNING: If you're adding any return calls or deferring any more work from this
// function, you have to make sure to update nodesInProcessing properly with the
// disposition of the node when the work is done.
func (r *rangeAllocator) AllocateOrOccupyCIDR(logger klog.Logger, node *v1.Node) error {
	if node == nil {
		return nil
	}
	if !r.insertNodeToProcessing(node.Name) {
		logger.V(2).Info("Node is already in a process of CIDR assignment", "node", klog.KObj(node))
		return nil
	}

	if len(node.Spec.PodCIDRs) > 0 {
		return r.occupyCIDRs(node)
	}
	// allocate and queue the assignment
	allocated := nodeReservedCIDRs{
		nodeName:       node.Name,
		allocatedCIDRs: make([]*net.IPNet, len(r.cidrSets)),
	}

	for idx := range r.cidrSets {
		podCIDR, err := r.cidrSets[idx].AllocateNext()
		if err != nil {
			r.removeNodeFromProcessing(node.Name)
			controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRNotAvailable")
			return fmt.Errorf("failed to allocate cidr from cluster cidr at idx:%v: %v", idx, err)
		}
		allocated.allocatedCIDRs[idx] = podCIDR
	}

	// queue the assignment
	logger.V(4).Info("Putting node with CIDR into the work queue", "node", klog.KObj(node), "CIDRs", allocated.allocatedCIDRs)
	r.nodeCIDRUpdateChannel <- allocated
	return nil
}

// ReleaseCIDR marks node.Spec.PodCIDRs[...] as unused in our tracked cidrSets.
func (r *rangeAllocator) ReleaseCIDR(logger klog.Logger, node *v1.Node) error {
	if node == nil || len(node.Spec.PodCIDRs) == 0 {
		return nil
	}

	for idx, cidr := range node.Spec.PodCIDRs {
		_, podCIDR, err := netutils.ParseCIDRSloppy(cidr)
		if err != nil {
			return fmt.Errorf("failed to parse CIDR %s on Node %v: %v", cidr, node.Name, err)
		}

		// The node has a pre-allocated CIDR at an index that does not exist in our cidrSets.
		// This will happen if the cluster went from dual-stack (multiple CIDRs) to
		// non-dual-stack; then we have no way of tracking it.
		if idx >= len(r.cidrSets) {
			return fmt.Errorf("node:%s has an allocated cidr: %v at index:%v that does not exist in cluster cidrs configuration", node.Name, cidr, idx)
		}

		logger.V(4).Info("Release CIDR for node", "CIDR", cidr, "node", klog.KObj(node))
		if err = r.cidrSets[idx].Release(podCIDR); err != nil {
			return fmt.Errorf("error when releasing CIDR %v: %v", cidr, err)
		}
	}
	return nil
}

// filterOutServiceRange marks all CIDRs with subNetMaskSize that belong to serviceCIDR as used
// across all cidrSets so that they won't be assignable.
func (r *rangeAllocator) filterOutServiceRange(logger klog.Logger, serviceCIDR *net.IPNet) {
	// Checks if the service CIDR has a nonempty intersection with a cluster
	// CIDR. That is the case if either the clusterCIDR contains the serviceCIDR
	// with the clusterCIDR's Mask applied (this means that clusterCIDR contains
	// serviceCIDR) or vice versa (which means that serviceCIDR contains
	// clusterCIDR).
	for idx, cidr := range r.clusterCIDRs {
		// if they don't overlap then skip the filtering
		if !cidr.Contains(serviceCIDR.IP.Mask(cidr.Mask)) && !serviceCIDR.Contains(cidr.IP.Mask(serviceCIDR.Mask)) {
			continue
		}

		// at this point, len(cidrSets) == len(clusterCIDRs)
		if err := r.cidrSets[idx].Occupy(serviceCIDR); err != nil {
			logger.Error(err, "Error filtering out service cidr from cluster cidr", "CIDR", cidr, "index", idx, "serviceCIDR", serviceCIDR)
		}
	}
}
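
// Illustrative sketch, not part of the upstream file: the overlap test used by
// filterOutServiceRange above, applied to assumed values. Masking the service
// IP 10.96.0.0 with the /8 cluster mask yields 10.0.0.0, which 10.0.0.0/8
// contains, so the ranges overlap and the whole service CIDR would be occupied
// in the matching cidrSet. A disjoint pair such as 10.0.0.0/8 and
// 192.168.0.0/16 fails both Contains checks and is left untouched.
func exampleServiceRangeOverlaps() bool {
	// Example values are assumed valid, so parse errors are ignored here.
	_, clusterCIDR, _ := netutils.ParseCIDRSloppy("10.0.0.0/8")
	_, serviceCIDR, _ := netutils.ParseCIDRSloppy("10.96.0.0/12")
	return clusterCIDR.Contains(serviceCIDR.IP.Mask(clusterCIDR.Mask)) ||
		serviceCIDR.Contains(clusterCIDR.IP.Mask(serviceCIDR.Mask))
}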

// updateCIDRsAllocation assigns the reserved CIDRs to the Node and sends an update to the API server.
func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeReservedCIDRs) error {
	var err error
	var node *v1.Node
	defer r.removeNodeFromProcessing(data.nodeName)
	cidrsString := ipnetToStringList(data.allocatedCIDRs)
	node, err = r.nodeLister.Get(data.nodeName)
	if err != nil {
		logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", data.nodeName))
		return err
	}

	// If the existing CIDR list matches the proposed one,
	// then we possibly already updated this node
	// and just failed to ack the success.
	if len(node.Spec.PodCIDRs) == len(data.allocatedCIDRs) {
		match := true
		for idx, cidr := range cidrsString {
			if node.Spec.PodCIDRs[idx] != cidr {
				match = false
				break
			}
		}
		if match {
			logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", data.allocatedCIDRs)
			return nil
		}
	}

	// node already has different cidrs, release the newly reserved ones
	if len(node.Spec.PodCIDRs) != 0 {
		logger.Error(nil, "Node already has a CIDR allocated. Releasing the new one", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs)
		for idx, cidr := range data.allocatedCIDRs {
			if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil {
				logger.Error(releaseErr, "Error when releasing CIDR", "index", idx, "CIDR", cidr)
			}
		}
		return nil
	}

	// If we reached here, it means that the node has no CIDR currently assigned. So we set it.
	for i := 0; i < cidrUpdateRetries; i++ {
		if err = nodeutil.PatchNodeCIDRs(r.client, types.NodeName(node.Name), cidrsString); err == nil {
			logger.Info("Set node PodCIDR", "node", klog.KObj(node), "podCIDRs", cidrsString)
			return nil
		}
	}
	// The update failed; record the event and, unless the failure was a server
	// timeout, release the reserved CIDRs back to the pool.
	logger.Error(err, "Failed to update node PodCIDR after multiple attempts", "node", klog.KObj(node), "podCIDRs", cidrsString)
	controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRAssignmentFailed")
	// We accept the fact that we may leak CIDRs here. This is safer than releasing
	// them when we don't know whether the request went through.
	// A NodeController restart will return all falsely allocated CIDRs to the pool.
	if !apierrors.IsServerTimeout(err) {
		logger.Error(err, "CIDR assignment for node failed. Releasing allocated CIDR", "node", klog.KObj(node))
		for idx, cidr := range data.allocatedCIDRs {
			if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil {
				logger.Error(releaseErr, "Error releasing allocated CIDR for node", "node", klog.KObj(node))
			}
		}
	}
	return err
}
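
// Illustrative sketch, not part of the upstream file: how a caller is assumed
// to wire the allocator together. kubeClient, nodeInformer and params stand in
// for values the controller manager would normally provide.
func exampleRunRangeAllocator(ctx context.Context, kubeClient clientset.Interface, nodeInformer informers.NodeInformer, params CIDRAllocatorParams) error {
	logger := klog.FromContext(ctx)
	// Passing a nil NodeList is only done in tests; real callers list existing
	// nodes first so that pre-assigned CIDRs get occupied.
	alloc, err := NewCIDRRangeAllocator(logger, kubeClient, nodeInformer, params, nil)
	if err != nil {
		return err
	}
	// Run blocks until the context is cancelled, so it is started in its own goroutine.
	go alloc.Run(ctx)
	return nil
}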