k8s.io/kubernetes@v1.29.3/pkg/controller/nodeipam/ipam/range_allocator.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ipam

import (
	"context"
	"fmt"
	"net"
	"sync"

	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"
	netutils "k8s.io/utils/net"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	informers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	nodeutil "k8s.io/component-helpers/node/util"
	"k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidrset"
	controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
)

type rangeAllocator struct {
	client clientset.Interface
	// cluster CIDRs as passed in during controller creation
	clusterCIDRs []*net.IPNet
	// for each entry in clusterCIDRs we maintain a list of what is used and what is not
	cidrSets []*cidrset.CidrSet
	// nodeLister is able to list/get nodes and is populated by the shared informer passed to the controller
	nodeLister corelisters.NodeLister
	// nodesSynced returns true if the node shared informer has been synced at least once.
	nodesSynced cache.InformerSynced
	// Channel used to pass Nodes and their newly reserved CIDRs to the background workers.
	// This increases the throughput of CIDR assignment by not blocking on long operations.
	nodeCIDRUpdateChannel chan nodeReservedCIDRs
	broadcaster           record.EventBroadcaster
	recorder              record.EventRecorder
	// Keep a set of nodes that are currently being processed to avoid races in CIDR allocation.
	lock              sync.Mutex
	nodesInProcessing sets.String
}

// NewCIDRRangeAllocator returns a CIDRAllocator that allocates a CIDR for each node (one from each of the clusterCIDRs).
// The caller must ensure that subNetMaskSize is not less than the cluster CIDR mask size.
// The caller must ensure that the ClusterCIDRs are semantically correct (e.g. 1 for non-DualStack, 2 for DualStack).
// The caller must always pass in a list of existing nodes so the new allocator can initialize its CIDR map;
// nodeList is only nil in testing.
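//
// A minimal sketch of how a caller might wire this up (logger, kubeClient,
// nodeInformer, clusterCIDRs, serviceCIDR, nodeList and ctx are illustrative
// assumptions, not identifiers defined in this package):
//
//	allocator, err := NewCIDRRangeAllocator(logger, kubeClient, nodeInformer, CIDRAllocatorParams{
//		ClusterCIDRs:      clusterCIDRs,  // e.g. one IPv4 and one IPv6 CIDR for dual-stack
//		NodeCIDRMaskSizes: []int{24, 64}, // per-node mask size, one entry per cluster CIDR
//		ServiceCIDR:       serviceCIDR,   // may be nil; filtered out of the allocatable ranges
//	}, nodeList)
//	if err != nil {
//		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
//	}
//	go allocator.Run(ctx)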
func NewCIDRRangeAllocator(logger klog.Logger, client clientset.Interface, nodeInformer informers.NodeInformer, allocatorParams CIDRAllocatorParams, nodeList *v1.NodeList) (CIDRAllocator, error) {
	if client == nil {
		logger.Error(nil, "kubeClient is nil when starting CIDRRangeAllocator")
		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
	}

	eventBroadcaster := record.NewBroadcaster()
	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cidrAllocator"})

	// create a cidrSet for each CIDR we operate on;
	// cidrSets are mapped to clusterCIDRs by index
	cidrSets := make([]*cidrset.CidrSet, len(allocatorParams.ClusterCIDRs))
	for idx, cidr := range allocatorParams.ClusterCIDRs {
		cidrSet, err := cidrset.NewCIDRSet(cidr, allocatorParams.NodeCIDRMaskSizes[idx])
		if err != nil {
			return nil, err
		}
		cidrSets[idx] = cidrSet
	}

	ra := &rangeAllocator{
		client:                client,
		clusterCIDRs:          allocatorParams.ClusterCIDRs,
		cidrSets:              cidrSets,
		nodeLister:            nodeInformer.Lister(),
		nodesSynced:           nodeInformer.Informer().HasSynced,
		nodeCIDRUpdateChannel: make(chan nodeReservedCIDRs, cidrUpdateQueueSize),
		broadcaster:           eventBroadcaster,
		recorder:              recorder,
		nodesInProcessing:     sets.NewString(),
	}

	if allocatorParams.ServiceCIDR != nil {
		ra.filterOutServiceRange(logger, allocatorParams.ServiceCIDR)
	} else {
		logger.Info("No Service CIDR provided. Skipping filtering out service addresses")
	}

	if allocatorParams.SecondaryServiceCIDR != nil {
		ra.filterOutServiceRange(logger, allocatorParams.SecondaryServiceCIDR)
	} else {
		logger.Info("No Secondary Service CIDR provided. Skipping filtering out secondary service addresses")
	}

	if nodeList != nil {
		for _, node := range nodeList.Items {
			if len(node.Spec.PodCIDRs) == 0 {
				logger.V(4).Info("Node has no CIDR, ignoring", "node", klog.KObj(&node))
				continue
			}
			logger.V(4).Info("Node has CIDR, occupying it in CIDR map", "node", klog.KObj(&node), "podCIDR", node.Spec.PodCIDR)
			if err := ra.occupyCIDRs(&node); err != nil {
				// This will happen if:
				// 1. We find garbage in the podCIDRs field. Retrying is useless.
				// 2. A CIDR is out of range, which means a node CIDR has changed.
				// In both cases the error will keep crashing the controller-manager.
				return nil, err
			}
		}
	}

	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error {
			return ra.AllocateOrOccupyCIDR(logger, node)
		}),
		UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
			// If the PodCIDRs list is not empty we either:
			// - already processed a Node that already had CIDRs after NC restarted
			//   (cidr is marked as used),
			// - already processed a Node successfully and allocated CIDRs for it
			//   (cidr is marked as used),
			// - already processed a Node but saw a "timeout" response while the
			//   request eventually got through; in this case we haven't released
			//   the allocated CIDRs (cidr is still marked as used).
			// There's a possible error here:
			// - NC sees a new Node and assigns CIDRs X,Y.. to it,
			// - the Update Node call fails with a timeout,
			// - the Node is updated by some other component, NC sees an update and
			//   assigns CIDRs A,B.. to the Node,
			// - both CIDR X,Y.. and CIDR A,B.. are marked as used in the local cache,
			//   even though the Node sees only CIDR A,B..
			// The problem here is that the in-memory cache sees CIDR X,Y.. as marked,
			// which prevents it from being assigned to any new node. The cluster
			// state is correct.
			// Restarting NC fixes the issue.
			if len(newNode.Spec.PodCIDRs) == 0 {
				return ra.AllocateOrOccupyCIDR(logger, newNode)
			}
			return nil
		}),
		DeleteFunc: controllerutil.CreateDeleteNodeHandler(logger, func(node *v1.Node) error {
			return ra.ReleaseCIDR(logger, node)
		}),
	})

	return ra, nil
}

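// Run starts the event broadcaster and the background CIDR-assignment workers,
// then blocks until the provided context is cancelled.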
func (r *rangeAllocator) Run(ctx context.Context) {
	defer utilruntime.HandleCrash()

	// Start event processing pipeline.
	r.broadcaster.StartStructuredLogging(0)
	logger := klog.FromContext(ctx)
	logger.Info("Sending events to api server")
	r.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: r.client.CoreV1().Events("")})
	defer r.broadcaster.Shutdown()

	logger.Info("Starting range CIDR allocator")
	defer logger.Info("Shutting down range CIDR allocator")

	if !cache.WaitForNamedCacheSync("cidrallocator", ctx.Done(), r.nodesSynced) {
		return
	}

	for i := 0; i < cidrUpdateWorkers; i++ {
		go r.worker(ctx)
	}

	<-ctx.Done()
}

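// worker processes items from nodeCIDRUpdateChannel, patching each node's
// PodCIDRs via updateCIDRsAllocation, until the channel is closed or the
// context is cancelled. Failed items are requeued.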
func (r *rangeAllocator) worker(ctx context.Context) {
	logger := klog.FromContext(ctx)
	for {
		select {
		case workItem, ok := <-r.nodeCIDRUpdateChannel:
			if !ok {
				logger.Info("Channel nodeCIDRUpdateChannel was unexpectedly closed")
				return
			}
			if err := r.updateCIDRsAllocation(logger, workItem); err != nil {
				// Requeue the failed node for update again.
				r.nodeCIDRUpdateChannel <- workItem
			}
		case <-ctx.Done():
			return
		}
	}
}

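// insertNodeToProcessing marks nodeName as currently being processed; it
// returns false if the node is already being processed by another caller.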
func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool {
	r.lock.Lock()
	defer r.lock.Unlock()
	if r.nodesInProcessing.Has(nodeName) {
		return false
	}
	r.nodesInProcessing.Insert(nodeName)
	return true
}

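// removeNodeFromProcessing removes nodeName from the set of nodes currently
// being processed.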
func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.nodesInProcessing.Delete(nodeName)
}

// occupyCIDRs marks node.Spec.PodCIDRs[...] as used in the allocator's tracked cidrSets.
func (r *rangeAllocator) occupyCIDRs(node *v1.Node) error {
	defer r.removeNodeFromProcessing(node.Name)
	if len(node.Spec.PodCIDRs) == 0 {
		return nil
	}
	for idx, cidr := range node.Spec.PodCIDRs {
		_, podCIDR, err := netutils.ParseCIDRSloppy(cidr)
		if err != nil {
			return fmt.Errorf("failed to parse CIDR %s on node %s: %v", cidr, node.Name, err)
		}
		// If the node has a pre-allocated CIDR at an index that does not exist in our cidrSets,
		// e.g. because the cluster went from dual-stack (multiple CIDRs) to single-stack,
		// we have no way of tracking it.
		if idx >= len(r.cidrSets) {
			return fmt.Errorf("node:%s has an allocated cidr: %v at index:%v that does not exist in cluster cidrs configuration", node.Name, cidr, idx)
		}

		if err := r.cidrSets[idx].Occupy(podCIDR); err != nil {
			return fmt.Errorf("failed to mark cidr[%v] at idx [%v] as occupied for node: %v: %v", podCIDR, idx, node.Name, err)
		}
	}
	return nil
}

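// AllocateOrOccupyCIDR either occupies the node's existing PodCIDRs in the
// tracked cidrSets or, if none are set, allocates one CIDR from each cidrSet
// and queues the node so a worker can patch the allocation onto the Node object.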
// WARNING: If you're adding any return calls or deferring any more work from this
// function you have to make sure to update nodesInProcessing properly with the
// disposition of the node when the work is done.
func (r *rangeAllocator) AllocateOrOccupyCIDR(logger klog.Logger, node *v1.Node) error {
	if node == nil {
		return nil
	}
	if !r.insertNodeToProcessing(node.Name) {
		logger.V(2).Info("Node is already in the process of CIDR assignment", "node", klog.KObj(node))
		return nil
	}

	if len(node.Spec.PodCIDRs) > 0 {
		return r.occupyCIDRs(node)
	}
	// allocate and queue the assignment
	allocated := nodeReservedCIDRs{
		nodeName:       node.Name,
		allocatedCIDRs: make([]*net.IPNet, len(r.cidrSets)),
	}

	for idx := range r.cidrSets {
		podCIDR, err := r.cidrSets[idx].AllocateNext()
		if err != nil {
			r.removeNodeFromProcessing(node.Name)
			controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRNotAvailable")
			return fmt.Errorf("failed to allocate cidr from cluster cidr at idx:%v: %v", idx, err)
		}
		allocated.allocatedCIDRs[idx] = podCIDR
	}

	// queue the assignment
	logger.V(4).Info("Putting node with CIDR into the work queue", "node", klog.KObj(node), "CIDRs", allocated.allocatedCIDRs)
	r.nodeCIDRUpdateChannel <- allocated
	return nil
}

// ReleaseCIDR marks node.Spec.PodCIDRs[...] as unused in our tracked cidrSets.
func (r *rangeAllocator) ReleaseCIDR(logger klog.Logger, node *v1.Node) error {
	if node == nil || len(node.Spec.PodCIDRs) == 0 {
		return nil
	}

	for idx, cidr := range node.Spec.PodCIDRs {
		_, podCIDR, err := netutils.ParseCIDRSloppy(cidr)
		if err != nil {
			return fmt.Errorf("failed to parse CIDR %s on Node %v: %v", cidr, node.Name, err)
		}

		// If the node has a pre-allocated CIDR at an index that does not exist in our cidrSets,
		// e.g. because the cluster went from dual-stack (multiple CIDRs) to single-stack,
		// we have no way of tracking it.
		if idx >= len(r.cidrSets) {
			return fmt.Errorf("node:%s has an allocated cidr: %v at index:%v that does not exist in cluster cidrs configuration", node.Name, cidr, idx)
		}

		logger.V(4).Info("Release CIDR for node", "CIDR", cidr, "node", klog.KObj(node))
		if err = r.cidrSets[idx].Release(podCIDR); err != nil {
			return fmt.Errorf("error when releasing CIDR %v: %v", cidr, err)
		}
	}
	return nil
}

// filterOutServiceRange marks all CIDRs with subNetMaskSize that belong to serviceCIDR as used
// across all cidrSets so that they won't be assignable.
func (r *rangeAllocator) filterOutServiceRange(logger klog.Logger, serviceCIDR *net.IPNet) {
	// Check whether the service CIDR has a nonempty intersection with a cluster
	// CIDR. That is the case if either the clusterCIDR contains the serviceCIDR
	// with the clusterCIDR's mask applied (meaning clusterCIDR contains
	// serviceCIDR) or vice versa (meaning serviceCIDR contains clusterCIDR).
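	//
	// For example (illustrative values): with clusterCIDR 10.0.0.0/8 and
	// serviceCIDR 10.96.0.0/12, masking the serviceCIDR's IP with the /8 mask
	// yields 10.0.0.0, which clusterCIDR contains, so the service range is
	// occupied in that cidrSet and never handed out to nodes.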
	for idx, cidr := range r.clusterCIDRs {
		// if they don't overlap then ignore the filtering
		if !cidr.Contains(serviceCIDR.IP.Mask(cidr.Mask)) && !serviceCIDR.Contains(cidr.IP.Mask(serviceCIDR.Mask)) {
			continue
		}

		// at this point, len(cidrSets) == len(clusterCIDRs)
		if err := r.cidrSets[idx].Occupy(serviceCIDR); err != nil {
			logger.Error(err, "Error filtering out service cidr from cluster cidr", "CIDR", cidr, "index", idx, "serviceCIDR", serviceCIDR)
		}
	}
}

// updateCIDRsAllocation assigns the reserved CIDRs to the Node and sends an update to the API server.
func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeReservedCIDRs) error {
	var err error
	var node *v1.Node
	defer r.removeNodeFromProcessing(data.nodeName)
	cidrsString := ipnetToStringList(data.allocatedCIDRs)
	node, err = r.nodeLister.Get(data.nodeName)
	if err != nil {
		logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", data.nodeName))
		return err
	}

	// If the node's CIDR list already matches the proposed one, then we have
	// probably updated this node before and just failed to ack the success.
	if len(node.Spec.PodCIDRs) == len(data.allocatedCIDRs) {
		match := true
		for idx, cidr := range cidrsString {
			if node.Spec.PodCIDRs[idx] != cidr {
				match = false
				break
			}
		}
		if match {
			logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", data.allocatedCIDRs)
			return nil
		}
	}

	// The node already has different CIDRs assigned; release the ones we reserved.
	if len(node.Spec.PodCIDRs) != 0 {
		logger.Error(nil, "Node already has a CIDR allocated. Releasing the new one", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs)
		for idx, cidr := range data.allocatedCIDRs {
			if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil {
				logger.Error(releaseErr, "Error when releasing CIDR", "index", idx, "CIDR", cidr)
			}
		}
		return nil
	}

	// If we reached here, it means that the node has no CIDR currently assigned. So we set it.
	for i := 0; i < cidrUpdateRetries; i++ {
		if err = nodeutil.PatchNodeCIDRs(r.client, types.NodeName(node.Name), cidrsString); err == nil {
			logger.Info("Set node PodCIDR", "node", klog.KObj(node), "podCIDRs", cidrsString)
			return nil
		}
	}
	// The update failed; decide whether to release the reserved CIDRs back to the pool.
	logger.Error(err, "Failed to update node PodCIDR after multiple attempts", "node", klog.KObj(node), "podCIDRs", cidrsString)
	controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRAssignmentFailed")
	// We accept the fact that we may leak CIDRs here. This is safer than releasing
	// them when we don't know whether the request went through.
	// A NodeController restart will return all falsely allocated CIDRs to the pool.
	if !apierrors.IsServerTimeout(err) {
		logger.Error(err, "CIDR assignment for node failed. Releasing allocated CIDR", "node", klog.KObj(node))
		for idx, cidr := range data.allocatedCIDRs {
			if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil {
				logger.Error(releaseErr, "Error releasing allocated CIDR for node", "node", klog.KObj(node))
			}
		}
	}
	return err
}