github.com/datadog/cilium@v1.6.12/pkg/ipam/crd.go

// Copyright 2019 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ipam

import (
	"fmt"
	"net"
	"reflect"
	"sync"
	"time"

	"github.com/cilium/cilium/pkg/cidr"
	"github.com/cilium/cilium/pkg/k8s"
	ciliumv2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2"
	"github.com/cilium/cilium/pkg/k8s/informer"
	k8sversion "github.com/cilium/cilium/pkg/k8s/version"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/node"
	"github.com/cilium/cilium/pkg/option"
	"github.com/cilium/cilium/pkg/trigger"

	"github.com/sirupsen/logrus"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/cache"
)

var (
	sharedNodeStore *nodeStore
	initNodeStore   sync.Once
)

const (
	// customResourceUpdateRate is the maximum rate at which the custom
	// resource is updated
	customResourceUpdateRate = 15 * time.Second

	fieldName = "name"
)

// nodeStore represents a CiliumNode custom resource and binds the CR to a list
// of allocators
type nodeStore struct {
	// mutex protects access to all members of this struct
	mutex lock.RWMutex

	// ownNode is the last known version of the local node's own CiliumNode
	// resource
	ownNode *ciliumv2.CiliumNode

	// allocators is a list of allocators tied to this custom resource
	allocators []*crdAllocator

	// refreshTrigger is the configured trigger to synchronize updates to
	// the custom resource with rate limiting
	refreshTrigger *trigger.Trigger

	// allocationPoolSize is the size of the IP pool for each address
	// family
	allocationPoolSize map[Family]int
}

// newNodeStore initializes a new store which reflects the CiliumNode custom
// resource of the specified node name
func newNodeStore(nodeName string, owner Owner) *nodeStore {
	log.WithField(fieldName, nodeName).Info("Subscribed to CiliumNode custom resource")

	store := &nodeStore{
		allocators:         []*crdAllocator{},
		allocationPoolSize: map[Family]int{},
	}
	ciliumClient := k8s.CiliumClient()

	t, err := trigger.NewTrigger(trigger.Parameters{
		Name:        "crd-allocator-node-refresher",
		MinInterval: customResourceUpdateRate,
		TriggerFunc: store.refreshNodeTrigger,
	})
	if err != nil {
		log.WithError(err).Fatal("Unable to initialize CiliumNode synchronization trigger")
	}
	store.refreshTrigger = t

	// Create the CiliumNode custom resource. This call will block until
	// the custom resource has been created
	owner.UpdateCiliumNodeResource()

	ciliumNodeSelector := fields.ParseSelectorOrDie("metadata.name=" + nodeName)
	ciliumNodeStore := cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc)
	ciliumNodeInformer := informer.NewInformerWithStore(
		cache.NewListWatchFromClient(ciliumClient.CiliumV2().RESTClient(),
			"ciliumnodes", v1.NamespaceAll, ciliumNodeSelector),
		&ciliumv2.CiliumNode{},
		0,
		cache.ResourceEventHandlerFuncs{
			AddFunc: func(obj interface{}) {
				var valid, equal bool
				defer func() { owner.K8sEventReceived("CiliumNode", "create", valid, equal) }()
				if node, ok := obj.(*ciliumv2.CiliumNode); ok {
					valid = true
					store.updateLocalNodeResource(node.DeepCopy())
					owner.K8sEventProcessed("CiliumNode", "create", true)
				} else {
					log.Warningf("Unknown CiliumNode object type %s received: %+v", reflect.TypeOf(obj), obj)
				}
			},
			UpdateFunc: func(oldObj, newObj interface{}) {
				var valid, equal bool
				defer func() { owner.K8sEventReceived("CiliumNode", "update", valid, equal) }()
				if node, ok := newObj.(*ciliumv2.CiliumNode); ok {
					valid = true
					store.updateLocalNodeResource(node.DeepCopy())
					owner.K8sEventProcessed("CiliumNode", "update", true)
				} else {
					log.Warningf("Unknown CiliumNode object type %s received: %+v", reflect.TypeOf(newObj), newObj)
				}
			},
			DeleteFunc: func(obj interface{}) {
				// Given we are watching a single specific
				// resource using the node name, any delete
				// notification means that the resource
				// matching the local node name has been
				// removed. No attempt to cast is required.
				store.deleteLocalNodeResource()
				owner.K8sEventProcessed("CiliumNode", "delete", true)
				owner.K8sEventReceived("CiliumNode", "delete", true, false)
			},
		},
		func(obj interface{}) interface{} {
			ciliumNode, _ := obj.(*ciliumv2.CiliumNode)
			return ciliumNode
		},
		ciliumNodeStore,
	)

	go ciliumNodeInformer.Run(wait.NeverStop)

	log.WithField(fieldName, nodeName).Info("Waiting for CiliumNode custom resource to become available...")
	if ok := cache.WaitForCacheSync(wait.NeverStop, ciliumNodeInformer.HasSynced); !ok {
		log.WithField(fieldName, nodeName).Fatal("Unable to synchronize CiliumNode custom resource")
	} else {
		log.WithField(fieldName, nodeName).Info("Successfully synchronized CiliumNode custom resource")
	}

	for {
		minimumReached, required, numAvailable := store.hasMinimumIPsInPool()
		logFields := logrus.Fields{
			fieldName:   nodeName,
			"required":  required,
			"available": numAvailable,
		}
		if minimumReached {
			log.WithFields(logFields).Info("All required IPs are available in CRD-backed allocation pool")
			break
		}

		log.WithFields(logFields).Info("Waiting for IPs to become available in CRD-backed allocation pool")
		time.Sleep(5 * time.Second)
	}

	store.refreshTrigger.TriggerWithReason("initial sync")

	return store
}
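
// Editorial note (derived from the code above): newNodeStore blocks until the
// store is usable. It subscribes to the node's own CiliumNode object, asks
// the owner to create the custom resource, waits for the informer cache to
// sync and then polls hasMinimumIPsInPool() until enough IPs are available
// before firing the initial status update. A hypothetical caller therefore
// only ever receives a fully initialized store:
//
//	store := newNodeStore("ip-10-0-0-42.ec2.internal", owner) // illustrative node name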

// deriveVpcCIDR derives the primary CIDR of the VPC the node belongs to from
// the ENIs in the node's status. It returns nil if no ENI is present or the
// CIDR cannot be parsed.
func deriveVpcCIDR(node *ciliumv2.CiliumNode) (result *cidr.CIDR) {
	if len(node.Status.ENI.ENIs) > 0 {
		// A node belongs to a single VPC, so we can pick any ENI and
		// derive the VPC CIDR from it.
		for _, eni := range node.Status.ENI.ENIs {
			c, err := cidr.ParseCIDR(eni.VPC.PrimaryCIDR)
			if err == nil {
				result = c
			}
			return
		}
	}
	return
}
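
// Illustration (hypothetical values): for a node whose ENI status reports
// VPC.PrimaryCIDR "10.0.0.0/16", deriveVpcCIDR returns that prefix parsed as
// a *cidr.CIDR. If the node has no ENIs, or the CIDR string cannot be parsed,
// the result is nil, which hasMinimumIPsInPool() below treats as "VPC CIDR
// not yet known" when running in ENI mode.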

// hasMinimumIPsInPool returns true if the required number of IPs is available
// in the allocation pool. It also returns the number of IPs required and
// available.
func (n *nodeStore) hasMinimumIPsInPool() (minimumReached bool, required, numAvailable int) {
	n.mutex.RLock()
	defer n.mutex.RUnlock()

	if n.ownNode == nil {
		return
	}

	switch {
	case n.ownNode.Spec.ENI.MinAllocate != 0:
		required = n.ownNode.Spec.ENI.MinAllocate
	case n.ownNode.Spec.ENI.PreAllocate != 0:
		required = n.ownNode.Spec.ENI.PreAllocate
	case option.Config.EnableHealthChecking:
		required = 2
	default:
		required = 1
	}

	if n.ownNode.Spec.IPAM.Pool != nil {
		numAvailable = len(n.ownNode.Spec.IPAM.Pool)
		if len(n.ownNode.Spec.IPAM.Pool) >= required {
			minimumReached = true
		}

		if option.Config.IPAM == option.IPAMENI {
			if vpcCIDR := deriveVpcCIDR(n.ownNode); vpcCIDR != nil {
				option.Config.SetIPv4NativeRoutingCIDR(vpcCIDR)
			} else {
				minimumReached = false
			}
		}
	}

	return
}
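
// Example of the precedence implemented above (hypothetical spec values):
// with Spec.ENI.MinAllocate=10 the pool must hold at least 10 IPs; if
// MinAllocate is 0 but PreAllocate=8, then 8 IPs are required; if both are
// 0, 2 IPs are required when health checking is enabled, otherwise 1.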

// deleteLocalNodeResource is called when the CiliumNode resource representing
// the local node has been deleted.
func (n *nodeStore) deleteLocalNodeResource() {
	n.mutex.Lock()
	n.ownNode = nil
	n.mutex.Unlock()
}

// updateLocalNodeResource is called when the CiliumNode resource representing
// the local node has been added or updated. It updates the available IPs based
// on the custom resource passed into the function.
func (n *nodeStore) updateLocalNodeResource(node *ciliumv2.CiliumNode) {
	n.mutex.Lock()
	defer n.mutex.Unlock()

	n.ownNode = node
	n.allocationPoolSize[IPv4] = 0
	n.allocationPoolSize[IPv6] = 0
	if node.Spec.IPAM.Pool != nil {
		for ipString := range node.Spec.IPAM.Pool {
			if ip := net.ParseIP(ipString); ip != nil {
				if ip.To4() != nil {
					n.allocationPoolSize[IPv4]++
				} else {
					n.allocationPoolSize[IPv6]++
				}
			}
		}
	}
}

// refreshNodeTrigger is called to refresh the custom resource after taking the
// configured rate limiting into account
//
// Note: The reasons argument is required to match the trigger.TriggerFunc
// type; it is unused.
func (n *nodeStore) refreshNodeTrigger(reasons []string) {
	if err := n.refreshNode(); err != nil {
		log.WithError(err).Warning("Unable to update CiliumNode custom resource")
		n.refreshTrigger.TriggerWithReason("retry after error")
	}
}

// refreshNode updates the custom resource in the apiserver based on the latest
// information in the local node store
func (n *nodeStore) refreshNode() error {
	n.mutex.RLock()
	if n.ownNode == nil {
		n.mutex.RUnlock()
		return nil
	}

	node := n.ownNode.DeepCopy()
	staleCopyOfAllocators := make([]*crdAllocator, len(n.allocators))
	copy(staleCopyOfAllocators, n.allocators)
	n.mutex.RUnlock()

	node.Status.IPAM.Used = map[string]ciliumv2.AllocationIP{}

	for _, a := range staleCopyOfAllocators {
		a.mutex.RLock()
		for ip, ipInfo := range a.allocated {
			node.Status.IPAM.Used[ip] = ipInfo
		}
		a.mutex.RUnlock()
	}

	var err error
	k8sCapabilities := k8sversion.Capabilities()
	ciliumClient := k8s.CiliumClient()
	switch {
	case k8sCapabilities.UpdateStatus:
		_, err = ciliumClient.CiliumV2().CiliumNodes().UpdateStatus(node)
	default:
		_, err = ciliumClient.CiliumV2().CiliumNodes().Update(node)
	}

	return err
}
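
// For illustration (hypothetical values): after a refresh triggered by an
// allocation, the updated resource carries one entry per IP held by any
// attached allocator, along the lines of
//
//	node.Status.IPAM.Used["10.0.0.12"] = ciliumv2.AllocationIP{
//		Owner:    "default/pod-a",
//		Resource: "eni-0123456789abcdef0",
//	}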

// addAllocator adds a new CRD allocator to the node store
func (n *nodeStore) addAllocator(allocator *crdAllocator) {
	n.mutex.Lock()
	n.allocators = append(n.allocators, allocator)
	n.mutex.Unlock()
}

// allocate checks if a particular IP can be allocated or returns an error
func (n *nodeStore) allocate(ip net.IP) (*ciliumv2.AllocationIP, error) {
	n.mutex.RLock()
	defer n.mutex.RUnlock()

	if n.ownNode == nil {
		return nil, fmt.Errorf("CiliumNode for own node is not available")
	}

	if n.ownNode.Spec.IPAM.Pool == nil {
		return nil, fmt.Errorf("No IPs available")
	}

	ipInfo, ok := n.ownNode.Spec.IPAM.Pool[ip.String()]
	if !ok {
		return nil, fmt.Errorf("IP %s is not available", ip.String())
	}

	return &ipInfo, nil
}

// allocateNext allocates the next available IP or returns an error
func (n *nodeStore) allocateNext(allocated map[string]ciliumv2.AllocationIP, family Family) (net.IP, *ciliumv2.AllocationIP, error) {
	n.mutex.RLock()
	defer n.mutex.RUnlock()

	if n.ownNode == nil {
		return nil, nil, fmt.Errorf("CiliumNode for own node is not available")
	}

	// FIXME: This is currently using a brute-force method that can be
	// optimized
	for ip, ipInfo := range n.ownNode.Spec.IPAM.Pool {
		if _, ok := allocated[ip]; !ok {
			parsedIP := net.ParseIP(ip)
			if parsedIP == nil {
				log.WithFields(logrus.Fields{
					fieldName: n.ownNode.Name,
					"ip":      ip,
				}).Warning("Unable to parse IP in CiliumNode custom resource")
				continue
			}

			if DeriveFamily(parsedIP) != family {
				continue
			}

			return parsedIP, &ipInfo, nil
		}
	}

	return nil, nil, fmt.Errorf("No more IPs available")
}
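
// Worked example (hypothetical pool): with Spec.IPAM.Pool containing
// 10.0.0.4 and 10.0.0.5, and 10.0.0.4 already present in the allocated map,
// allocateNext(allocated, IPv4) returns 10.0.0.5. Because Go randomizes map
// iteration order, the choice among multiple free IPs is arbitrary.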

// crdAllocator implements the CRD-backed IP allocator
type crdAllocator struct {
	// store is the node store backing the custom resource
	store *nodeStore

	// mutex protects access to the allocated map
	mutex lock.RWMutex

	// allocated is a map of all allocated IPs indexed by the IP represented
	// as a string
	allocated map[string]ciliumv2.AllocationIP

	// family is the address family this allocator allocates for
	family Family
}

// newCRDAllocator creates a new CRD-backed IP allocator
func newCRDAllocator(family Family, owner Owner) Allocator {
	initNodeStore.Do(func() {
		sharedNodeStore = newNodeStore(node.GetName(), owner)
	})

	allocator := &crdAllocator{
		allocated: map[string]ciliumv2.AllocationIP{},
		family:    family,
		store:     sharedNodeStore,
	}

	sharedNodeStore.addAllocator(allocator)

	return allocator
}
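
// Illustrative usage (not part of the original file): the IPAM layer could
// construct one allocator per address family, both sharing the node store
// created on first use, along the lines of
//
//	v4Alloc := newCRDAllocator(IPv4, owner)
//	v6Alloc := newCRDAllocator(IPv6, owner)
//	result, err := v4Alloc.AllocateNext("default/pod-a")
//
// where owner is whatever Owner implementation drives this IPAM instance.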

// deriveGatewayIP derives the gateway IP of the given ENI's subnet from the
// subnet CIDR. It returns an empty string if the CIDR cannot be parsed.
func deriveGatewayIP(eni ciliumv2.ENI) string {
	subnetIP, _, err := net.ParseCIDR(eni.Subnet.CIDR)
	if err != nil {
		log.WithError(err).Warningf("Unable to parse AWS subnet CIDR %s", eni.Subnet.CIDR)
		return ""
	}

	addr := subnetIP.To4()

	// The gateway for a subnet and VPC is always x.x.x.1
	// Ref: https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Route_Tables.html
	return net.IPv4(addr[0], addr[1], addr[2], addr[3]+1).String()
}
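
// Worked example (hypothetical subnet): for an ENI whose Subnet.CIDR is
// "10.0.1.0/24", net.ParseCIDR yields 10.0.1.0 and the derived gateway is
// "10.0.1.1". The helper assumes an IPv4 subnet; To4() would return nil for
// an IPv6 prefix.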

// buildAllocationResult wraps an allocated IP in an AllocationResult. In ENI
// mode it also resolves the owning ENI to fill in the master interface, the
// VPC CIDRs and the gateway IP.
func (a *crdAllocator) buildAllocationResult(ip net.IP, ipInfo *ciliumv2.AllocationIP) (result *AllocationResult, err error) {
	result = &AllocationResult{IP: ip}

	// In ENI mode, the Resource points to the ENI so we can derive the
	// master interface and all CIDRs of the VPC
	if option.Config.IPAM == option.IPAMENI {
		a.store.mutex.RLock()
		defer a.store.mutex.RUnlock()

		if a.store.ownNode == nil {
			return
		}

		for _, eni := range a.store.ownNode.Status.ENI.ENIs {
			if eni.ID == ipInfo.Resource {
				result.Master = eni.MAC
				result.CIDRs = []string{eni.VPC.PrimaryCIDR}
				result.CIDRs = append(result.CIDRs, eni.VPC.CIDRs...)
				if eni.Subnet.CIDR != "" {
					result.GatewayIP = deriveGatewayIP(eni)
				}

				return
			}
		}

		result = nil
		err = fmt.Errorf("unable to find ENI %s", ipInfo.Resource)
	}

	return
}
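
// For illustration (hypothetical ENI data): if ipInfo.Resource is
// "eni-0123456789abcdef0" and that ENI reports MAC "0a:1b:2c:3d:4e:5f",
// VPC.PrimaryCIDR "10.0.0.0/16" and Subnet.CIDR "10.0.1.0/24", the result
// carries Master "0a:1b:2c:3d:4e:5f", CIDRs starting with "10.0.0.0/16"
// (plus any additional VPC CIDRs) and GatewayIP "10.0.1.1".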

// Allocate will attempt to find the specified IP in the custom resource and
// allocate it if it is available. If the IP is unavailable or already
// allocated, an error is returned. The custom resource will be updated to
// reflect the newly allocated IP.
func (a *crdAllocator) Allocate(ip net.IP, owner string) (*AllocationResult, error) {
	a.mutex.Lock()
	defer a.mutex.Unlock()

	if _, ok := a.allocated[ip.String()]; ok {
		return nil, fmt.Errorf("IP already in use")
	}

	ipInfo, err := a.store.allocate(ip)
	if err != nil {
		return nil, err
	}

	a.markAllocated(ip, owner, *ipInfo)

	return a.buildAllocationResult(ip, ipInfo)
}

// Release will release the specified IP or return an error if the IP has not
// been allocated before. The custom resource will be updated to reflect the
// released IP.
func (a *crdAllocator) Release(ip net.IP) error {
	a.mutex.Lock()
	defer a.mutex.Unlock()

	if _, ok := a.allocated[ip.String()]; !ok {
		return fmt.Errorf("IP %s is not allocated", ip.String())
	}

	delete(a.allocated, ip.String())
	a.store.refreshTrigger.TriggerWithReason(fmt.Sprintf("release of IP %s", ip.String()))

	return nil
}

// markAllocated marks a particular IP as allocated and triggers the custom
// resource update
func (a *crdAllocator) markAllocated(ip net.IP, owner string, ipInfo ciliumv2.AllocationIP) {
	ipInfo.Owner = owner
	a.allocated[ip.String()] = ipInfo
	a.store.refreshTrigger.TriggerWithReason(fmt.Sprintf("allocation of IP %s", ip.String()))
}

// AllocateNext allocates the next available IP as offered by the custom
// resource or returns an error if no IP is available. The custom resource
// will be updated to reflect the newly allocated IP.
func (a *crdAllocator) AllocateNext(owner string) (*AllocationResult, error) {
	a.mutex.Lock()
	defer a.mutex.Unlock()

	ip, ipInfo, err := a.store.allocateNext(a.allocated, a.family)
	if err != nil {
		return nil, err
	}

	a.markAllocated(ip, owner, *ipInfo)

	return a.buildAllocationResult(ip, ipInfo)
}

// totalPoolSize returns the total size of the allocation pool
// a.mutex must be held
func (a *crdAllocator) totalPoolSize() int {
	if num, ok := a.store.allocationPoolSize[a.family]; ok {
		return num
	}
	return 0
}

// Dump provides a status report and lists all allocated IP addresses
func (a *crdAllocator) Dump() (map[string]string, string) {
	a.mutex.RLock()
	defer a.mutex.RUnlock()

	allocs := map[string]string{}
	for ip := range a.allocated {
		allocs[ip] = ""
	}

	status := fmt.Sprintf("%d/%d allocated", len(allocs), a.totalPoolSize())
	return allocs, status
}
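
// Example output (hypothetical state): with two IPv4 addresses allocated out
// of a pool of 28, Dump returns a map with keys "10.0.0.12" and "10.0.0.13"
// (empty values) and the status string "2/28 allocated".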