github.com/cilium/cilium@v1.16.2/pkg/ipam/node_manager.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

// Copyright 2017 Lyft, Inc.

package ipam

import (
    "context"
    "fmt"
    "sort"

    "github.com/sirupsen/logrus"
    "golang.org/x/sync/semaphore"

    "github.com/cilium/cilium/pkg/backoff"
    "github.com/cilium/cilium/pkg/controller"
    ipamStats "github.com/cilium/cilium/pkg/ipam/stats"
    ipamTypes "github.com/cilium/cilium/pkg/ipam/types"
    v2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2"
    "github.com/cilium/cilium/pkg/lock"
    "github.com/cilium/cilium/pkg/logging"
    "github.com/cilium/cilium/pkg/time"
    "github.com/cilium/cilium/pkg/trigger"
)

var ipamNodeIntervalControllerGroup = controller.NewGroup("ipam-node-interval-refresh")

// CiliumNodeGetterUpdater defines the interface used to interact with the k8s
// apiserver to retrieve and update the CiliumNode custom resource
type CiliumNodeGetterUpdater interface {
    Create(node *v2.CiliumNode) (*v2.CiliumNode, error)
    Update(origResource, newResource *v2.CiliumNode) (*v2.CiliumNode, error)
    UpdateStatus(origResource, newResource *v2.CiliumNode) (*v2.CiliumNode, error)
    Get(name string) (*v2.CiliumNode, error)
}

// NodeOperations is the interface an IPAM implementation must provide in order
// to provide IP allocation for a node. The structure implementing this API
// *must* be aware of the node connected to this implementation. This is
// achieved by considering the node context provided to the
// AllocationImplementation.CreateNode() function and returning a
// NodeOperations implementation which performs operations in the context of
// that node.
type NodeOperations interface {
    // UpdatedNode is called when an update to the CiliumNode has been
    // received.
    UpdatedNode(obj *v2.CiliumNode)

    // PopulateStatusFields is called to give the implementation a chance
    // to populate any implementation-specific fields in CiliumNode.Status.
    PopulateStatusFields(resource *v2.CiliumNode)

    // CreateInterface is called to create a new interface. This is only
    // done if PrepareIPAllocation indicates that no more IPs are available
    // for allocation (AllocationAction.AvailableForAllocation == 0) but
    // interfaces are available for creation
    // (AllocationAction.EmptyInterfaceSlots > 0). This function must
    // create the interface *and* allocate up to
    // AllocationAction.MaxIPsToAllocate.
    CreateInterface(ctx context.Context, allocation *AllocationAction, scopedLog *logrus.Entry) (int, string, error)

    // ResyncInterfacesAndIPs is called to synchronize the latest list of
    // interfaces and IPs associated with the node. This function is called
    // sparingly as this information is kept in sync based on the success
    // of the functions AllocateIPs(), ReleaseIPs() and CreateInterface().
    // It returns all IPs available on the node, statistics about the
    // interfaces that can still be allocated from or that have not yet
    // exhausted their instance-specific address quota, and any error that
    // occurred during execution.
    ResyncInterfacesAndIPs(ctx context.Context, scopedLog *logrus.Entry) (ipamTypes.AllocationMap, ipamStats.InterfaceStats, error)

    // PrepareIPAllocation is called to calculate the number of IPs that
    // can be allocated on the node and whether a new network interface
    // must be attached to the node.
    PrepareIPAllocation(scopedLog *logrus.Entry) (*AllocationAction, error)

    // AllocateIPs is called after invoking PrepareIPAllocation and needs
    // to perform the actual allocation.
    AllocateIPs(ctx context.Context, allocation *AllocationAction) error

    // PrepareIPRelease is called to calculate whether any IP excess needs
    // to be resolved. It behaves identically to PrepareIPAllocation but
    // indicates a need to release IPs.
    PrepareIPRelease(excessIPs int, scopedLog *logrus.Entry) *ReleaseAction

    // ReleaseIPs is called after invoking PrepareIPRelease and needs to
    // perform the release of IPs.
    ReleaseIPs(ctx context.Context, release *ReleaseAction) error

    // GetMaximumAllocatableIPv4 returns the maximum number of IPv4
    // addresses that can be allocated to the instance.
    GetMaximumAllocatableIPv4() int

    // GetMinimumAllocatableIPv4 returns the minimum number of IPv4
    // addresses that must be allocated to the instance.
    GetMinimumAllocatableIPv4() int

    // IsPrefixDelegated helps identify whether a node supports prefix
    // delegation.
    IsPrefixDelegated() bool

    // GetUsedIPWithPrefixes returns the total number of used IPs,
    // including all IPs in a prefix if at least one of the prefix IPs is
    // in use.
    GetUsedIPWithPrefixes() int
}
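// The following no-op implementation is an illustrative sketch only and is
// not part of upstream Cilium. It shows the minimal shape an IPAM backend
// must provide to satisfy NodeOperations; the type name noopNodeOps and all
// return values are hypothetical placeholders.
type noopNodeOps struct{}

var _ NodeOperations = noopNodeOps{}

func (noopNodeOps) UpdatedNode(obj *v2.CiliumNode)               {}
func (noopNodeOps) PopulateStatusFields(resource *v2.CiliumNode) {}

func (noopNodeOps) CreateInterface(ctx context.Context, allocation *AllocationAction, scopedLog *logrus.Entry) (int, string, error) {
    // A real backend would create the interface here and allocate up to the
    // number of IPs indicated by the allocation action.
    return 0, "", nil
}

func (noopNodeOps) ResyncInterfacesAndIPs(ctx context.Context, scopedLog *logrus.Entry) (ipamTypes.AllocationMap, ipamStats.InterfaceStats, error) {
    return ipamTypes.AllocationMap{}, ipamStats.InterfaceStats{}, nil
}

func (noopNodeOps) PrepareIPAllocation(scopedLog *logrus.Entry) (*AllocationAction, error) {
    return &AllocationAction{}, nil
}

func (noopNodeOps) AllocateIPs(ctx context.Context, allocation *AllocationAction) error { return nil }

func (noopNodeOps) PrepareIPRelease(excessIPs int, scopedLog *logrus.Entry) *ReleaseAction {
    return &ReleaseAction{}
}

func (noopNodeOps) ReleaseIPs(ctx context.Context, release *ReleaseAction) error { return nil }

func (noopNodeOps) GetMaximumAllocatableIPv4() int { return 0 }
func (noopNodeOps) GetMinimumAllocatableIPv4() int { return 0 }
func (noopNodeOps) IsPrefixDelegated() bool        { return false }
func (noopNodeOps) GetUsedIPWithPrefixes() int     { return 0 }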
// AllocationImplementation is the interface an IPAM implementation must
// provide. Unlike NodeOperations, this implementation is not tied to a
// specific node.
type AllocationImplementation interface {
    // CreateNode is called when the IPAM layer has learned about a new
    // node which requires IPAM services. This function must return a
    // NodeOperations implementation which will render IPAM services to the
    // node context provided.
    CreateNode(obj *v2.CiliumNode, node *Node) NodeOperations

    // GetPoolQuota is called to retrieve the remaining IP addresses in all
    // IP pools known to the IPAM implementation.
    GetPoolQuota() ipamTypes.PoolQuotaMap

    // Resync is called periodically to give the IPAM implementation a
    // chance to resync its own state with external APIs or systems. It is
    // also called when the IPAM layer detects that state got out of sync.
    Resync(ctx context.Context) time.Time

    // InstanceSync is called to sync the state of the specified instance
    // with external APIs or systems.
    InstanceSync(ctx context.Context, instanceID string) time.Time

    // HasInstance returns whether the instance is known to the IPAM
    // implementation.
    HasInstance(instanceID string) bool

    // DeleteInstance removes the instance from the IPAM implementation's
    // list of known instances.
    DeleteInstance(instanceID string)
}
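// Illustrative sketch only, not part of upstream Cilium: a minimal
// AllocationImplementation that returns the hypothetical noopNodeOps above
// from CreateNode. It shows how the two interfaces relate: the
// implementation is cluster-wide, while the NodeOperations it returns is
// bound to a single node.
type noopAllocator struct{}

var _ AllocationImplementation = noopAllocator{}

func (noopAllocator) CreateNode(obj *v2.CiliumNode, node *Node) NodeOperations {
    // A real implementation would bind the per-node operations to the node
    // context passed in here.
    return noopNodeOps{}
}

func (noopAllocator) GetPoolQuota() ipamTypes.PoolQuotaMap {
    return ipamTypes.PoolQuotaMap{}
}

func (noopAllocator) Resync(ctx context.Context) time.Time {
    // A non-zero time signals a successful resync to the NodeManager.
    return time.Now()
}

func (noopAllocator) InstanceSync(ctx context.Context, instanceID string) time.Time {
    return time.Now()
}

func (noopAllocator) HasInstance(instanceID string) bool { return true }
func (noopAllocator) DeleteInstance(instanceID string)   {}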
// MetricsAPI represents the metrics being maintained by a NodeManager
type MetricsAPI interface {
    MetricsNodeAPI

    AllocationAttempt(typ, status, subnetID string, observe float64)
    ReleaseAttempt(typ, status, subnetID string, observe float64)
    IncInterfaceAllocation(subnetID string)
    AddIPAllocation(subnetID string, allocated int64)
    AddIPRelease(subnetID string, released int64)
    SetAllocatedIPs(typ string, allocated int)
    SetAvailableInterfaces(available int)
    SetInterfaceCandidates(interfaceCandidates int)
    SetEmptyInterfaceSlots(emptyInterfaceSlots int)
    SetAvailableIPsPerSubnet(subnetID string, availabilityZone string, available int)
    SetNodes(category string, nodes int)
    IncResyncCount()
    PoolMaintainerTrigger() trigger.MetricsObserver
    K8sSyncTrigger() trigger.MetricsObserver
    ResyncTrigger() trigger.MetricsObserver
}

type MetricsNodeAPI interface {
    SetIPAvailable(node string, cap int)
    SetIPUsed(node string, used int)
    SetIPNeeded(node string, needed int)
    DeleteNode(node string)
}

// nodeMap is a mapping of node names to ENI nodes
type nodeMap map[string]*Node

// NodeManager manages all nodes with ENIs
type NodeManager struct {
    mutex              lock.RWMutex
    nodes              nodeMap
    instancesAPI       AllocationImplementation
    k8sAPI             CiliumNodeGetterUpdater
    metricsAPI         MetricsAPI
    parallelWorkers    int64
    releaseExcessIPs   bool
    stableInstancesAPI bool
    prefixDelegation   bool
}

func (n *NodeManager) ClusterSizeDependantInterval(baseInterval time.Duration) time.Duration {
    n.mutex.RLock()
    numNodes := len(n.nodes)
    n.mutex.RUnlock()

    return backoff.ClusterSizeDependantInterval(baseInterval, numNodes)
}

// NewNodeManager returns a new NodeManager
func NewNodeManager(instancesAPI AllocationImplementation, k8sAPI CiliumNodeGetterUpdater, metrics MetricsAPI,
    parallelWorkers int64, releaseExcessIPs bool, prefixDelegation bool) (*NodeManager, error) {
    if parallelWorkers < 1 {
        parallelWorkers = 1
    }

    mngr := &NodeManager{
        nodes:            nodeMap{},
        instancesAPI:     instancesAPI,
        k8sAPI:           k8sAPI,
        metricsAPI:       metrics,
        parallelWorkers:  parallelWorkers,
        releaseExcessIPs: releaseExcessIPs,
        prefixDelegation: prefixDelegation,
    }

    // Assume readiness; the initial blocking resync in Start() will update
    // the readiness state.
    mngr.SetInstancesAPIReadiness(true)

    return mngr, nil
}

func (n *NodeManager) instancesAPIResync(ctx context.Context) (time.Time, bool) {
    syncTime := n.instancesAPI.Resync(ctx)
    success := !syncTime.IsZero()
    n.SetInstancesAPIReadiness(success)
    return syncTime, success
}

// Start kicks off the NodeManager by performing the initial state
// synchronization and starting the background sync goroutine
func (n *NodeManager) Start(ctx context.Context) error {
    // Trigger the initial resync in a blocking manner
    if _, ok := n.instancesAPIResync(ctx); !ok {
        return fmt.Errorf("Initial synchronization with instances API failed")
    }

    // Start an interval-based background resync for safety. It will
    // synchronize the state regularly, resolve any remaining deficit if
    // the event-driven trigger fails, and also release excess IP addresses
    // if release-excess-ips is enabled.
    go func() {
        mngr := controller.NewManager()
        mngr.UpdateController("ipam-node-interval-refresh",
            controller.ControllerParams{
                Group:       ipamNodeIntervalControllerGroup,
                RunInterval: time.Minute,
                DoFunc: func(ctx context.Context) error {
                    if syncTime, ok := n.instancesAPIResync(ctx); ok {
                        n.Resync(ctx, syncTime)
                    }
                    return nil
                },
            })
    }()

    return nil
}
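// Hypothetical wiring sketch, not upstream code: shows how an
// AllocationImplementation (here the noopAllocator sketched above) is
// combined with a CiliumNodeGetterUpdater and a MetricsAPI implementation,
// both assumed to be supplied by the caller, to construct and start a
// NodeManager.
func startSketchNodeManager(ctx context.Context, k8sAPI CiliumNodeGetterUpdater, metrics MetricsAPI) (*NodeManager, error) {
    // Four parallel workers, release of excess IPs enabled, prefix
    // delegation disabled.
    mngr, err := NewNodeManager(noopAllocator{}, k8sAPI, metrics, 4, true, false)
    if err != nil {
        return nil, err
    }
    // Start performs a blocking initial resync against the instances API
    // and then launches the interval-based background refresh controller.
    if err := mngr.Start(ctx); err != nil {
        return nil, err
    }
    return mngr, nil
}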
// SetInstancesAPIReadiness sets the readiness state of the instances API
func (n *NodeManager) SetInstancesAPIReadiness(ready bool) {
    n.mutex.Lock()
    n.stableInstancesAPI = ready
    n.mutex.Unlock()
}

// InstancesAPIIsReady returns true if the instances API is stable and ready
func (n *NodeManager) InstancesAPIIsReady() bool {
    n.mutex.Lock()
    defer n.mutex.Unlock()
    return n.stableInstancesAPI
}

// GetNames returns the list of all node names
func (n *NodeManager) GetNames() (allNodeNames []string) {
    n.mutex.RLock()
    defer n.mutex.RUnlock()

    allNodeNames = make([]string, 0, len(n.nodes))

    for name := range n.nodes {
        allNodeNames = append(allNodeNames, name)
    }

    return
}

// Upsert is called whenever a CiliumNode resource has been updated in the
// Kubernetes apiserver. The CiliumNode will be created if it didn't exist before.
func (n *NodeManager) Upsert(resource *v2.CiliumNode) {
    n.mutex.Lock()
    defer n.mutex.Unlock()
    node, ok := n.nodes[resource.Name]
    if !ok {
        node = &Node{
            name:       resource.Name,
            manager:    n,
            logLimiter: logging.NewLimiter(10*time.Second, 3), // 1 log / 10 secs, burst of 3
            ipv4Alloc: ipAllocAttrs{
                ipsMarkedForRelease: make(map[string]time.Time),
                ipReleaseStatus:     make(map[string]string),
            },
        }

        ctx, cancel := context.WithCancel(context.Background())
        // If the instance is not yet known but the instances API is
        // stable, resync this instance from the API.
        if !n.instancesAPI.HasInstance(resource.InstanceID()) && n.stableInstancesAPI {
            if syncTime := n.instancesAPI.InstanceSync(ctx, resource.InstanceID()); syncTime.IsZero() {
                node.logger().Warning("Failed to resync the instance from the API after new node was found")
                n.stableInstancesAPI = false
            } else {
                n.stableInstancesAPI = true
            }
        }

        node.ops = n.instancesAPI.CreateNode(resource, node)

        backoff := &backoff.Exponential{
            Max:         5 * time.Minute,
            Jitter:      true,
            NodeManager: n,
            Name:        fmt.Sprintf("ipam-pool-maintainer-%s", resource.Name),
            ResetAfter:  10 * time.Minute,
        }
        poolMaintainer, err := trigger.NewTrigger(trigger.Parameters{
            Name:            fmt.Sprintf("ipam-pool-maintainer-%s", resource.Name),
            MinInterval:     10 * time.Millisecond,
            MetricsObserver: n.metricsAPI.PoolMaintainerTrigger(),
            TriggerFunc: func(reasons []string) {
                if err := node.MaintainIPPool(ctx); err != nil {
                    node.logger().WithError(err).Warning("Unable to maintain ip pool of node")
                    backoff.Wait(ctx)
                }
            },
            ShutdownFunc: cancel,
        })
        if err != nil {
            node.logger().WithError(err).Error("Unable to create pool-maintainer trigger")
            return
        }

        retry, err := trigger.NewTrigger(trigger.Parameters{
            Name:        fmt.Sprintf("ipam-pool-maintainer-%s-retry", resource.Name),
            MinInterval: time.Minute, // large minimal interval to avoid retrying too often
            TriggerFunc: func(reasons []string) { poolMaintainer.Trigger() },
        })
        if err != nil {
            node.logger().WithError(err).Error("Unable to create pool-maintainer-retry trigger")
            return
        }
        node.retry = retry

        k8sSync, err := trigger.NewTrigger(trigger.Parameters{
            Name:            fmt.Sprintf("ipam-node-k8s-sync-%s", resource.Name),
            MinInterval:     10 * time.Millisecond,
            MetricsObserver: n.metricsAPI.K8sSyncTrigger(),
            TriggerFunc: func(reasons []string) {
                node.syncToAPIServer()
            },
        })
        if err != nil {
            poolMaintainer.Shutdown()
            node.logger().WithError(err).Error("Unable to create k8s-sync trigger")
            return
        }

        instanceSync, err := trigger.NewTrigger(trigger.Parameters{
            Name:            fmt.Sprintf("ipam-node-instance-sync-%s", resource.Name),
            MinInterval:     10 * time.Millisecond,
            MetricsObserver: n.metricsAPI.ResyncTrigger(),
            TriggerFunc: func(reasons []string) {
                if syncTime, ok := node.instanceAPISync(ctx, resource.InstanceID()); ok {
                    node.manager.Resync(ctx, syncTime)
                }
            },
        })
        if err != nil {
            poolMaintainer.Shutdown()
            k8sSync.Shutdown()
            node.logger().WithError(err).Error("Unable to create instance-sync trigger")
            return
        }
        node.instanceSync = instanceSync

        node.poolMaintainer = poolMaintainer
        node.k8sSync = k8sSync
        n.nodes[node.name] = node
        log.WithField(fieldName, resource.Name).Info("Discovered new CiliumNode custom resource")
    }
    // Update the resource in the node while holding the lock, otherwise
    // resyncs can be triggered prior to the update being applied.
    node.UpdatedResource(resource)
}
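// Hypothetical event hook, not upstream code: CiliumNode watch events are
// typically translated into Upsert/Delete calls on the NodeManager along
// these lines.
func handleCiliumNodeEvent(mngr *NodeManager, node *v2.CiliumNode, deleted bool) {
    if deleted {
        mngr.Delete(node)
        return
    }
    // Upsert creates the per-node triggers on first sight of the node and
    // then applies the updated resource.
    mngr.Upsert(node)
}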
// Delete is called after a CiliumNode resource has been deleted via the
// Kubernetes apiserver
func (n *NodeManager) Delete(resource *v2.CiliumNode) {
    n.mutex.Lock()

    if node, ok := n.nodes[resource.Name]; ok {
        // Stop target_node metrics related to this node being emitted.
        n.metricsAPI.DeleteNode(node.name)

        if node.poolMaintainer != nil {
            node.poolMaintainer.Shutdown()
        }
        if node.k8sSync != nil {
            node.k8sSync.Shutdown()
        }
        if node.retry != nil {
            node.retry.Shutdown()
        }
        if node.instanceSync != nil {
            node.instanceSync.Shutdown()
        }
    }

    // Delete the instance from the instances API. This will cause Upsert()
    // to resync the instance from the API if it rejoins the cluster, and
    // ensures that Node.recalculate() does not use stale data for
    // instances which rejoin the cluster after their EC2 configuration has
    // changed.
    if resource.Spec.InstanceID != "" {
        n.instancesAPI.DeleteInstance(resource.Spec.InstanceID)
    }

    delete(n.nodes, resource.Name)
    n.mutex.Unlock()
}

// Get returns the node with the given name
func (n *NodeManager) Get(nodeName string) *Node {
    n.mutex.RLock()
    node := n.nodes[nodeName]
    n.mutex.RUnlock()
    return node
}
// GetNodesByIPWatermarkLocked returns all nodes that require addresses to be
// allocated or released, sorted in descending order by the number of
// addresses that need to be acted on. The number of addresses to be released
// is a negative value, so nodes with an IP deficit are resolved first.
// The caller must hold the NodeManager lock.
func (n *NodeManager) GetNodesByIPWatermarkLocked() []*Node {
    list := make([]*Node, len(n.nodes))
    index := 0
    for _, node := range n.nodes {
        list[index] = node
        index++
    }

    sort.Slice(list, func(i, j int) bool {
        valuei := list[i].GetNeededAddresses()
        valuej := list[j].GetNeededAddresses()
        // The number of addresses to be released is negative, so among
        // releasing nodes the one with more excess addresses comes first.
        if valuei < 0 && valuej < 0 {
            return valuei < valuej
        }
        return valuei > valuej
    })

    return list
}
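// Worked example (hypothetical, not an upstream test): with per-node
// needed-address values of +5, +2, -3 and -7, the comparator above orders
// them as +5, +2, -7, -3: deficits first in descending order, then the node
// with the largest excess (most negative value) to release.
func exampleWatermarkOrder() []int {
    needs := []int{-3, 5, -7, 2}
    sort.Slice(needs, func(i, j int) bool {
        if needs[i] < 0 && needs[j] < 0 {
            return needs[i] < needs[j]
        }
        return needs[i] > needs[j]
    })
    return needs // [5 2 -7 -3]
}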
type resyncStats struct {
    mutex               lock.Mutex
    ipv4                ipResyncStats
    emptyInterfaceSlots int
}

type ipResyncStats struct {
    totalUsed           int
    totalAvailable      int
    totalNeeded         int
    remainingInterfaces int
    interfaceCandidates int
    nodes               int
    nodesAtCapacity     int
    nodesInDeficit      int
    nodeCapacity        int
}

func (n *NodeManager) resyncNode(ctx context.Context, node *Node, stats *resyncStats, syncTime time.Time) {
    node.updateLastResync(syncTime)
    node.recalculate()
    allocationNeeded := node.allocationNeeded()
    releaseNeeded := node.releaseNeeded()
    if allocationNeeded || releaseNeeded {
        node.requirePoolMaintenance()
        node.poolMaintainer.Trigger()
    }

    nodeStats := node.Stats()

    stats.mutex.Lock()
    stats.ipv4.totalUsed += nodeStats.IPv4.UsedIPs
    // availableOnNode is the number of IPs available on the node at this
    // moment. It does not take into account the number of IPs that can be
    // allocated in the future.
    availableOnNode := nodeStats.IPv4.AvailableIPs - nodeStats.IPv4.UsedIPs
    stats.ipv4.totalAvailable += availableOnNode
    stats.ipv4.totalNeeded += nodeStats.IPv4.NeededIPs
    stats.ipv4.remainingInterfaces += nodeStats.IPv4.RemainingInterfaces
    stats.ipv4.interfaceCandidates += nodeStats.IPv4.InterfaceCandidates
    stats.emptyInterfaceSlots += nodeStats.EmptyInterfaceSlots
    stats.ipv4.nodes++

    stats.ipv4.nodeCapacity = nodeStats.IPv4.Capacity

    // Set per-node metrics.
    n.metricsAPI.SetIPAvailable(node.name, nodeStats.IPv4.Capacity)
    n.metricsAPI.SetIPUsed(node.name, nodeStats.IPv4.UsedIPs)
    n.metricsAPI.SetIPNeeded(node.name, nodeStats.IPv4.NeededIPs)

    if allocationNeeded {
        stats.ipv4.nodesInDeficit++
    }

    if nodeStats.IPv4.RemainingInterfaces == 0 && availableOnNode == 0 {
        stats.ipv4.nodesAtCapacity++
    }

    stats.mutex.Unlock()

    node.k8sSync.Trigger()
}

// Resync attends to all nodes and resolves IP deficits. The order of
// attendance is defined by the number of IPs needed to reach the configured
// watermarks. Any updates to the node resource are synchronized to the
// Kubernetes apiserver.
func (n *NodeManager) Resync(ctx context.Context, syncTime time.Time) {
    n.mutex.Lock()
    defer n.mutex.Unlock()
    n.metricsAPI.IncResyncCount()

    stats := resyncStats{}
    sem := semaphore.NewWeighted(n.parallelWorkers)

    for _, node := range n.GetNodesByIPWatermarkLocked() {
        err := sem.Acquire(ctx, 1)
        if err != nil {
            continue
        }
        go func(node *Node, stats *resyncStats) {
            n.resyncNode(ctx, node, stats, syncTime)
            sem.Release(1)
        }(node, &stats)
    }

    // Acquire the full semaphore. This requires all goroutines to complete
    // and thus blocks until all nodes are synced.
    sem.Acquire(ctx, n.parallelWorkers)

    n.metricsAPI.SetAllocatedIPs("used", stats.ipv4.totalUsed)
    n.metricsAPI.SetAllocatedIPs("available", stats.ipv4.totalAvailable)
    n.metricsAPI.SetAllocatedIPs("needed", stats.ipv4.totalNeeded)
    n.metricsAPI.SetAvailableInterfaces(stats.ipv4.remainingInterfaces)
    n.metricsAPI.SetInterfaceCandidates(stats.ipv4.interfaceCandidates)
    n.metricsAPI.SetEmptyInterfaceSlots(stats.emptyInterfaceSlots)
    n.metricsAPI.SetNodes("total", stats.ipv4.nodes)
    n.metricsAPI.SetNodes("in-deficit", stats.ipv4.nodesInDeficit)
    n.metricsAPI.SetNodes("at-capacity", stats.ipv4.nodesAtCapacity)

    for poolID, quota := range n.instancesAPI.GetPoolQuota() {
        n.metricsAPI.SetAvailableIPsPerSubnet(string(poolID), quota.AvailabilityZone, quota.AvailableIPs)
    }
}
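// Hypothetical helper, not upstream code: force a full resync of all nodes
// after an out-of-band change, loosely mirroring what the interval controller
// installed by Start() does on its one-minute schedule.
func forceResync(ctx context.Context, mngr *NodeManager, api AllocationImplementation) {
    // A zero sync time indicates the instances API resync failed; in that
    // case skip the node resync, as the interval controller does.
    if syncTime := api.Resync(ctx); !syncTime.IsZero() {
        mngr.Resync(ctx, syncTime)
    }
}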