k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/nodeipam/ipam/range_allocator.go

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package ipam
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"net"
    23  	"time"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/apimachinery/pkg/util/wait"
    27  	"k8s.io/klog/v2"
    28  	netutils "k8s.io/utils/net"
    29  
    30  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    31  	"k8s.io/apimachinery/pkg/types"
    32  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    33  	informers "k8s.io/client-go/informers/core/v1"
    34  	clientset "k8s.io/client-go/kubernetes"
    35  	"k8s.io/client-go/kubernetes/scheme"
    36  	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
    37  	corelisters "k8s.io/client-go/listers/core/v1"
    38  	"k8s.io/client-go/tools/cache"
    39  	"k8s.io/client-go/tools/record"
    40  	"k8s.io/client-go/util/workqueue"
    41  	nodeutil "k8s.io/component-helpers/node/util"
    42  	"k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidrset"
    43  	controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
    44  )
    45  
    46  type rangeAllocator struct {
    47  	client clientset.Interface
    48  	// cluster cidrs as passed in during controller creation
    49  	clusterCIDRs []*net.IPNet
    50  	// for each entry in clusterCIDRs we maintain a list of what is used and what is not
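         	// (the index of each cidrSet matches the index of the corresponding entry in
         	// clusterCIDRs, and node.Spec.PodCIDRs is kept in the same order)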
    51  	cidrSets []*cidrset.CidrSet
    52  	// nodeLister is able to list/get nodes and is populated by the shared informer passed to controller
    53  	nodeLister corelisters.NodeLister
    54  	// nodesSynced returns true if the node shared informer has been synced at least once.
    55  	nodesSynced cache.InformerSynced
    56  	broadcaster record.EventBroadcaster
    57  	recorder    record.EventRecorder
    58  
     59  	// queue is where incoming work is placed to de-dup and to allow "easy"
     60  	// rate-limited requeues on errors
    61  	queue workqueue.RateLimitingInterface
    62  }
    63  
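         // Compile-time assertion that rangeAllocator implements CIDRAllocator.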
    64  var _ CIDRAllocator = &rangeAllocator{}
    65  
     66  // NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDRs for nodes (one from each of clusterCIDRs).
     67  // Caller must ensure subNetMaskSize is not less than cluster CIDR mask size.
     68  // Caller must ensure that ClusterCIDRs are semantically correct (e.g. 1 for non-DualStack, 2 for DualStack).
     69  // Caller must always pass in a list of existing nodes so the new allocator
     70  // can initialize its CIDR map. NodeList is only nil in testing.
    71  func NewCIDRRangeAllocator(ctx context.Context, client clientset.Interface, nodeInformer informers.NodeInformer, allocatorParams CIDRAllocatorParams, nodeList *v1.NodeList) (CIDRAllocator, error) {
    72  	logger := klog.FromContext(ctx)
    73  	if client == nil {
    74  		logger.Error(nil, "kubeClient is nil when starting CIDRRangeAllocator")
    75  		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
    76  	}
    77  
    78  	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
    79  	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cidrAllocator"})
    80  
    81  	// create a cidrSet for each cidr we operate on
     82  	// cidrSets are mapped to clusterCIDRs by index
    83  	cidrSets := make([]*cidrset.CidrSet, len(allocatorParams.ClusterCIDRs))
    84  	for idx, cidr := range allocatorParams.ClusterCIDRs {
    85  		cidrSet, err := cidrset.NewCIDRSet(cidr, allocatorParams.NodeCIDRMaskSizes[idx])
    86  		if err != nil {
    87  			return nil, err
    88  		}
    89  		cidrSets[idx] = cidrSet
    90  	}
    91  
    92  	ra := &rangeAllocator{
    93  		client:       client,
    94  		clusterCIDRs: allocatorParams.ClusterCIDRs,
    95  		cidrSets:     cidrSets,
    96  		nodeLister:   nodeInformer.Lister(),
    97  		nodesSynced:  nodeInformer.Informer().HasSynced,
    98  		broadcaster:  eventBroadcaster,
    99  		recorder:     recorder,
   100  		queue:        workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cidrallocator_node"),
   101  	}
   102  
   103  	if allocatorParams.ServiceCIDR != nil {
   104  		ra.filterOutServiceRange(logger, allocatorParams.ServiceCIDR)
   105  	} else {
   106  		logger.Info("No Service CIDR provided. Skipping filtering out service addresses")
   107  	}
   108  
   109  	if allocatorParams.SecondaryServiceCIDR != nil {
   110  		ra.filterOutServiceRange(logger, allocatorParams.SecondaryServiceCIDR)
   111  	} else {
   112  		logger.Info("No Secondary Service CIDR provided. Skipping filtering out secondary service addresses")
   113  	}
   114  
   115  	if nodeList != nil {
   116  		for _, node := range nodeList.Items {
   117  			if len(node.Spec.PodCIDRs) == 0 {
   118  				logger.V(4).Info("Node has no CIDR, ignoring", "node", klog.KObj(&node))
   119  				continue
   120  			}
    121  			logger.V(4).Info("Node has CIDR, occupying it in CIDR map", "node", klog.KObj(&node), "podCIDRs", node.Spec.PodCIDRs)
   122  			if err := ra.occupyCIDRs(&node); err != nil {
   123  				// This will happen if:
   124  				// 1. We find garbage in the podCIDRs field. Retrying is useless.
   125  				// 2. CIDR out of range: This means a node CIDR has changed.
    126  				// Either way, this error will keep crashing the controller-manager.
   127  				return nil, err
   128  			}
   129  		}
   130  	}
   131  
   132  	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   133  		AddFunc: func(obj interface{}) {
   134  			key, err := cache.MetaNamespaceKeyFunc(obj)
   135  			if err == nil {
   136  				ra.queue.Add(key)
   137  			}
   138  		},
   139  		UpdateFunc: func(old, new interface{}) {
   140  			key, err := cache.MetaNamespaceKeyFunc(new)
   141  			if err == nil {
   142  				ra.queue.Add(key)
   143  			}
   144  		},
   145  		DeleteFunc: func(obj interface{}) {
   146  			// The informer cache no longer has the object, and since Node doesn't have a finalizer,
   147  			// we don't see the Update with DeletionTimestamp != 0.
   148  			// TODO: instead of executing the operation directly in the handler, build a small cache with key node.Name
    149  			// and value PodCIDRs, and use ReleaseCIDR in the reconcile loop so we can retry on `ReleaseCIDR` failures.
   150  			if err := ra.ReleaseCIDR(logger, obj.(*v1.Node)); err != nil {
   151  				utilruntime.HandleError(fmt.Errorf("error while processing CIDR Release: %w", err))
   152  			}
    153  			// IndexerInformer uses a delta queue, therefore for deletes we have to use this
   154  			// key function.
   155  			key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
   156  			if err == nil {
   157  				ra.queue.Add(key)
   158  			}
   159  		},
   160  	})
   161  
   162  	return ra, nil
   163  }
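
         // A caller typically wires the allocator up roughly as follows (illustrative
         // sketch; it assumes a started shared informer factory, a parsed cluster CIDR,
         // and a pre-listed nodeList, which per the comment above is nil only in tests):
         //
         //	_, clusterCIDR, _ := netutils.ParseCIDRSloppy("10.0.0.0/16")
         //	alloc, err := NewCIDRRangeAllocator(ctx, client, informerFactory.Core().V1().Nodes(),
         //		CIDRAllocatorParams{
         //			ClusterCIDRs:      []*net.IPNet{clusterCIDR},
         //			NodeCIDRMaskSizes: []int{24},
         //		}, nodeList)
         //	if err != nil {
         //		return err
         //	}
         //	go alloc.Run(ctx)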
   164  
   165  func (r *rangeAllocator) Run(ctx context.Context) {
   166  	defer utilruntime.HandleCrash()
   167  
   168  	// Start event processing pipeline.
   169  	r.broadcaster.StartStructuredLogging(3)
   170  	logger := klog.FromContext(ctx)
   171  	logger.Info("Sending events to api server")
   172  	r.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: r.client.CoreV1().Events("")})
   173  	defer r.broadcaster.Shutdown()
   174  
   175  	defer r.queue.ShutDown()
   176  
   177  	logger.Info("Starting range CIDR allocator")
   178  	defer logger.Info("Shutting down range CIDR allocator")
   179  
   180  	if !cache.WaitForNamedCacheSync("cidrallocator", ctx.Done(), r.nodesSynced) {
   181  		return
   182  	}
   183  
   184  	for i := 0; i < cidrUpdateWorkers; i++ {
   185  		go wait.UntilWithContext(ctx, r.runWorker, time.Second)
   186  	}
   187  
   188  	<-ctx.Done()
   189  }
   190  
   191  // runWorker is a long-running function that will continually call the
    192  // processNextNodeWorkItem function in order to read and process a message on the
   193  // queue.
   194  func (r *rangeAllocator) runWorker(ctx context.Context) {
   195  	for r.processNextNodeWorkItem(ctx) {
   196  	}
   197  }
   198  
    199  // processNextNodeWorkItem will read a single work item off the queue and
    200  // attempt to process it, by calling syncNode.
   201  func (r *rangeAllocator) processNextNodeWorkItem(ctx context.Context) bool {
   202  	obj, shutdown := r.queue.Get()
   203  	if shutdown {
   204  		return false
   205  	}
   206  
   207  	// We wrap this block in a func so we can defer r.queue.Done.
   208  	err := func(logger klog.Logger, obj interface{}) error {
    209  		// We call Done here so the work queue knows we have finished
   210  		// processing this item. We also must remember to call Forget if we
   211  		// do not want this work item being re-queued. For example, we do
   212  		// not call Forget if a transient error occurs, instead the item is
   213  		// put back on the queue and attempted again after a back-off
   214  		// period.
   215  		defer r.queue.Done(obj)
   216  		var key string
   217  		var ok bool
    218  		// We expect strings to come off the work queue. For Nodes these are
    219  		// simply the node names. We do this as the delayed nature of the
    220  		// work queue means the items in the informer cache may actually be
    221  		// more up to date than when the item was initially put onto the
    222  		// work queue.
   223  		if key, ok = obj.(string); !ok {
    224  			// As the item in the work queue is actually invalid, we call
   225  			// Forget here else we'd go into a loop of attempting to
   226  			// process a work item that is invalid.
   227  			r.queue.Forget(obj)
    228  			utilruntime.HandleError(fmt.Errorf("expected string in workqueue but got %#v", obj))
   229  			return nil
   230  		}
    231  		// Run syncNode, passing it the name of the Node
    232  		// to be synced.
   233  		if err := r.syncNode(ctx, key); err != nil {
   234  			// Put the item back on the queue to handle any transient errors.
   235  			r.queue.AddRateLimited(key)
   236  			return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error())
   237  		}
   238  		// Finally, if no error occurs we Forget this item so it does not
    239  		// get queued again until another change happens.
   240  		r.queue.Forget(obj)
   241  		logger.Info("Successfully synced", "key", key)
   242  		return nil
   243  	}(klog.FromContext(ctx), obj)
   244  
   245  	if err != nil {
   246  		utilruntime.HandleError(err)
   247  		return true
   248  	}
   249  
   250  	return true
   251  }
   252  
   253  func (r *rangeAllocator) syncNode(ctx context.Context, key string) error {
   254  	logger := klog.FromContext(ctx)
   255  	startTime := time.Now()
   256  	defer func() {
   257  		logger.V(4).Info("Finished syncing Node request", "node", key, "elapsed", time.Since(startTime))
   258  	}()
   259  
   260  	node, err := r.nodeLister.Get(key)
   261  	if apierrors.IsNotFound(err) {
   262  		logger.V(3).Info("node has been deleted", "node", key)
   263  		// TODO: obtain the node object information to call ReleaseCIDR from here
   264  		// and retry if there is an error.
   265  		return nil
   266  	}
   267  	if err != nil {
   268  		return err
   269  	}
   270  	// Check the DeletionTimestamp to determine if object is under deletion.
   271  	if !node.DeletionTimestamp.IsZero() {
   272  		logger.V(3).Info("node is being deleted", "node", key)
   273  		return r.ReleaseCIDR(logger, node)
   274  	}
   275  	return r.AllocateOrOccupyCIDR(ctx, node)
   276  }
   277  
    278  // occupyCIDRs marks node.Spec.PodCIDRs[...] as used in the allocator's tracked cidrSets.
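         // For example, with a cluster CIDR of 10.0.0.0/16 and a node CIDR mask size of
         // 24, the corresponding cidrSet tracks the 256 possible /24 subnets; occupying
         // "10.0.5.0/24" marks exactly one of them so AllocateNext will not hand it out again.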
   279  func (r *rangeAllocator) occupyCIDRs(node *v1.Node) error {
   280  	if len(node.Spec.PodCIDRs) == 0 {
   281  		return nil
   282  	}
   283  	for idx, cidr := range node.Spec.PodCIDRs {
   284  		_, podCIDR, err := netutils.ParseCIDRSloppy(cidr)
   285  		if err != nil {
    286  			return fmt.Errorf("failed to parse CIDR %s on node %s: %v", cidr, node.Name, err)
   287  		}
    288  		// If the node has a pre-allocated CIDR at an index that does not exist in our
    289  		// cluster CIDRs configuration (e.g. the cluster went from dual-stack to
    290  		// single-stack), then we have no way of tracking it.
   291  		if idx >= len(r.cidrSets) {
   292  			return fmt.Errorf("node:%s has an allocated cidr: %v at index:%v that does not exist in cluster cidrs configuration", node.Name, cidr, idx)
   293  		}
   294  
   295  		if err := r.cidrSets[idx].Occupy(podCIDR); err != nil {
   296  			return fmt.Errorf("failed to mark cidr[%v] at idx [%v] as occupied for node: %v: %v", podCIDR, idx, node.Name, err)
   297  		}
   298  	}
   299  	return nil
   300  }
   301  
    302  // AllocateOrOccupyCIDR allocates CIDRs for the node if it has none, or marks the
    303  // node's existing PodCIDRs as used. Any error returned here causes the caller
    304  // (syncNode) to requeue the node with rate limiting.
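         // For a dual-stack cluster the node receives one CIDR per configured family, in
         // the same order as the cluster CIDRs (e.g. a /24 from the IPv4 range and a /80
         // from the IPv6 range, assuming those node mask sizes were configured).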
   305  func (r *rangeAllocator) AllocateOrOccupyCIDR(ctx context.Context, node *v1.Node) error {
   306  	if node == nil {
   307  		return nil
   308  	}
   309  
   310  	if len(node.Spec.PodCIDRs) > 0 {
   311  		return r.occupyCIDRs(node)
   312  	}
   313  
   314  	logger := klog.FromContext(ctx)
   315  	allocatedCIDRs := make([]*net.IPNet, len(r.cidrSets))
   316  
   317  	for idx := range r.cidrSets {
   318  		podCIDR, err := r.cidrSets[idx].AllocateNext()
   319  		if err != nil {
   320  			controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRNotAvailable")
   321  			return fmt.Errorf("failed to allocate cidr from cluster cidr at idx:%v: %v", idx, err)
   322  		}
   323  		allocatedCIDRs[idx] = podCIDR
   324  	}
   325  
    326  	// apply the assignment
    327  	logger.V(4).Info("Assigning allocated CIDRs to node", "node", klog.KObj(node), "CIDRs", allocatedCIDRs)
   328  	return r.updateCIDRsAllocation(ctx, node.Name, allocatedCIDRs)
   329  }
   330  
    331  // ReleaseCIDR marks node.Spec.PodCIDRs[...] as unused in our tracked cidrSets
   332  func (r *rangeAllocator) ReleaseCIDR(logger klog.Logger, node *v1.Node) error {
   333  	if node == nil || len(node.Spec.PodCIDRs) == 0 {
   334  		return nil
   335  	}
   336  
   337  	for idx, cidr := range node.Spec.PodCIDRs {
   338  		_, podCIDR, err := netutils.ParseCIDRSloppy(cidr)
   339  		if err != nil {
   340  			return fmt.Errorf("failed to parse CIDR %s on Node %v: %v", cidr, node.Name, err)
   341  		}
   342  
    343  		// If the node has a pre-allocated CIDR at an index that does not exist in our
    344  		// cluster CIDRs configuration (e.g. the cluster went from dual-stack to
    345  		// single-stack), then we have no way of tracking it.
   346  		if idx >= len(r.cidrSets) {
   347  			return fmt.Errorf("node:%s has an allocated cidr: %v at index:%v that does not exist in cluster cidrs configuration", node.Name, cidr, idx)
   348  		}
   349  
   350  		logger.V(4).Info("Release CIDR for node", "CIDR", cidr, "node", klog.KObj(node))
   351  		if err = r.cidrSets[idx].Release(podCIDR); err != nil {
   352  			return fmt.Errorf("error when releasing CIDR %v: %v", cidr, err)
   353  		}
   354  	}
   355  	return nil
   356  }
   357  
    358  // filterOutServiceRange marks all CIDRs with subNetMaskSize that belong to serviceCIDR as used
    359  // across all cidrSets so that they won't be assignable.
   360  func (r *rangeAllocator) filterOutServiceRange(logger klog.Logger, serviceCIDR *net.IPNet) {
   361  	// Checks if service CIDR has a nonempty intersection with cluster
   362  	// CIDR. It is the case if either clusterCIDR contains serviceCIDR with
   363  	// clusterCIDR's Mask applied (this means that clusterCIDR contains
   364  	// serviceCIDR) or vice versa (which means that serviceCIDR contains
   365  	// clusterCIDR).
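         	// For example, a cluster CIDR of 10.0.0.0/16 and a service CIDR of
         	// 10.0.96.0/20 overlap: masking 10.0.96.0 with /16 yields 10.0.0.0, which the
         	// cluster CIDR contains, so Occupy below marks the whole service range and
         	// none of its node subnets will be allocated.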
   366  	for idx, cidr := range r.clusterCIDRs {
    367  		// if they don't overlap then there is nothing to filter out
   368  		if !cidr.Contains(serviceCIDR.IP.Mask(cidr.Mask)) && !serviceCIDR.Contains(cidr.IP.Mask(serviceCIDR.Mask)) {
   369  			continue
   370  		}
   371  
    372  		// at this point, len(r.cidrSets) == len(r.clusterCIDRs)
   373  		if err := r.cidrSets[idx].Occupy(serviceCIDR); err != nil {
    374  			logger.Error(err, "Error filtering out service CIDR from cluster CIDR", "CIDR", cidr, "index", idx, "serviceCIDR", serviceCIDR)
   375  		}
   376  	}
   377  }
   378  
   379  // updateCIDRsAllocation assigns CIDR to Node and sends an update to the API server.
   380  func (r *rangeAllocator) updateCIDRsAllocation(ctx context.Context, nodeName string, allocatedCIDRs []*net.IPNet) error {
   381  	var err error
   382  	var node *v1.Node
   383  	logger := klog.FromContext(ctx)
   384  	cidrsString := ipnetToStringList(allocatedCIDRs)
   385  	node, err = r.nodeLister.Get(nodeName)
   386  	if err != nil {
   387  		logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", nodeName))
   388  		return err
   389  	}
   390  
    391  	// If the CIDR list matches the proposed one, then we possibly already
    392  	// updated this node and just failed to acknowledge the success;
    393  	// there is nothing more to do.
   394  	if len(node.Spec.PodCIDRs) == len(allocatedCIDRs) {
   395  		match := true
   396  		for idx, cidr := range cidrsString {
   397  			if node.Spec.PodCIDRs[idx] != cidr {
   398  				match = false
   399  				break
   400  			}
   401  		}
   402  		if match {
   403  			logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", allocatedCIDRs)
   404  			return nil
   405  		}
   406  	}
   407  
    408  	// node already has different CIDRs allocated, release the newly reserved ones
   409  	if len(node.Spec.PodCIDRs) != 0 {
    410  		logger.Error(nil, "Node already has a CIDR allocated. Releasing the newly allocated CIDRs", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs)
   411  		for idx, cidr := range allocatedCIDRs {
   412  			if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil {
   413  				logger.Error(releaseErr, "Error when releasing CIDR", "index", idx, "CIDR", cidr)
   414  			}
   415  		}
   416  		return nil
   417  	}
   418  
   419  	// If we reached here, it means that the node has no CIDR currently assigned. So we set it.
   420  	for i := 0; i < cidrUpdateRetries; i++ {
   421  		if err = nodeutil.PatchNodeCIDRs(ctx, r.client, types.NodeName(node.Name), cidrsString); err == nil {
   422  			logger.Info("Set node PodCIDR", "node", klog.KObj(node), "podCIDRs", cidrsString)
   423  			return nil
   424  		}
   425  	}
    426  	// the update failed; release the CIDRs back to the pool where it is safe to do so
   427  	logger.Error(err, "Failed to update node PodCIDR after multiple attempts", "node", klog.KObj(node), "podCIDRs", cidrsString)
   428  	controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRAssignmentFailed")
   429  	// We accept the fact that we may leak CIDRs here. This is safer than releasing
    430  	// them when we don't know whether the request went through.
   431  	// NodeController restart will return all falsely allocated CIDRs to the pool.
   432  	if !apierrors.IsServerTimeout(err) {
   433  		logger.Error(err, "CIDR assignment for node failed. Releasing allocated CIDR", "node", klog.KObj(node))
   434  		for idx, cidr := range allocatedCIDRs {
   435  			if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil {
   436  				logger.Error(releaseErr, "Error releasing allocated CIDR for node", "node", klog.KObj(node))
   437  			}
   438  		}
   439  	}
   440  	return err
   441  }