github.com/cilium/cilium@v1.16.2/pkg/node/manager/manager.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package manager 5 6 import ( 7 "bufio" 8 "context" 9 "errors" 10 "fmt" 11 "io/fs" 12 "math/rand/v2" 13 "net" 14 "net/netip" 15 "os" 16 "path/filepath" 17 "slices" 18 "sync" 19 20 "github.com/cilium/hive/cell" 21 "github.com/cilium/workerpool" 22 "github.com/google/renameio/v2" 23 jsoniter "github.com/json-iterator/go" 24 "github.com/prometheus/client_golang/prometheus" 25 "github.com/sirupsen/logrus" 26 "golang.org/x/time/rate" 27 28 "github.com/cilium/cilium/pkg/backoff" 29 "github.com/cilium/cilium/pkg/controller" 30 "github.com/cilium/cilium/pkg/datapath/iptables/ipset" 31 datapath "github.com/cilium/cilium/pkg/datapath/types" 32 "github.com/cilium/cilium/pkg/identity" 33 "github.com/cilium/cilium/pkg/inctimer" 34 "github.com/cilium/cilium/pkg/ip" 35 "github.com/cilium/cilium/pkg/ipcache" 36 ipcacheTypes "github.com/cilium/cilium/pkg/ipcache/types" 37 "github.com/cilium/cilium/pkg/labels" 38 "github.com/cilium/cilium/pkg/labelsfilter" 39 "github.com/cilium/cilium/pkg/lock" 40 "github.com/cilium/cilium/pkg/logging/logfields" 41 "github.com/cilium/cilium/pkg/metrics" 42 "github.com/cilium/cilium/pkg/metrics/metric" 43 "github.com/cilium/cilium/pkg/node" 44 "github.com/cilium/cilium/pkg/node/addressing" 45 nodeTypes "github.com/cilium/cilium/pkg/node/types" 46 "github.com/cilium/cilium/pkg/option" 47 "github.com/cilium/cilium/pkg/source" 48 "github.com/cilium/cilium/pkg/time" 49 "github.com/cilium/cilium/pkg/trigger" 50 "github.com/cilium/cilium/pkg/wireguard/types" 51 ) 52 53 const ( 54 // The filename for the nodes checkpoint. This is periodically written, and 55 // restored on restart. The default path is /run/cilium/state/nodes.json 56 nodesFilename = "nodes.json" 57 // Minimum amount of time to wait in between writing nodes file. 
58 nodeCheckpointMinInterval = time.Minute 59 ) 60 61 var ( 62 baseBackgroundSyncInterval = time.Minute 63 defaultNodeUpdateInterval = 10 * time.Second 64 65 neighborTableRefreshControllerGroup = controller.NewGroup("neighbor-table-refresh") 66 neighborTableUpdateControllerGroup = controller.NewGroup("neighbor-table-update") 67 ) 68 69 const ( 70 numBackgroundWorkers = 1 71 ) 72 73 type nodeEntry struct { 74 // mutex serves two purposes: 75 // 1. Serialize any direct access to the node field in this entry. 76 // 2. Serialize all calls do the datapath layer for a particular node. 77 // 78 // See description of Manager.mutex for more details 79 // 80 // If both the nodeEntry.mutex and Manager.mutex must be held, then the 81 // Manager.mutex must *always* be acquired first. 82 mutex lock.Mutex 83 node nodeTypes.Node 84 } 85 86 // IPCache is the set of interactions the node manager performs with the ipcache 87 type IPCache interface { 88 GetMetadataSourceByPrefix(prefix netip.Prefix) source.Source 89 UpsertMetadata(prefix netip.Prefix, src source.Source, resource ipcacheTypes.ResourceID, aux ...ipcache.IPMetadata) 90 OverrideIdentity(prefix netip.Prefix, identityLabels labels.Labels, src source.Source, resource ipcacheTypes.ResourceID) 91 RemoveMetadata(prefix netip.Prefix, resource ipcacheTypes.ResourceID, aux ...ipcache.IPMetadata) 92 RemoveIdentityOverride(prefix netip.Prefix, identityLabels labels.Labels, resource ipcacheTypes.ResourceID) 93 } 94 95 // IPSetFilterFn is a function allowing to optionally filter out the insertion 96 // of IPSet entries based on node characteristics. The insertion is performed 97 // if the function returns false, and skipped otherwise. 98 type IPSetFilterFn func(*nodeTypes.Node) bool 99 100 var _ Notifier = (*manager)(nil) 101 102 // manager is the entity that manages a collection of nodes 103 type manager struct { 104 // mutex is the lock protecting access to the nodes map. 
The mutex must 105 // be held for any access of the nodes map. 106 // 107 // The manager mutex works together with the entry mutex in the 108 // following way to minimize the duration the manager mutex is held: 109 // 110 // 1. Acquire manager mutex to safely access nodes map and to retrieve 111 // node entry. 112 // 2. Acquire mutex of the entry while the manager mutex is still held. 113 // This guarantees that no change to the entry has happened. 114 // 3. Release of the manager mutex to unblock changes or reads to other 115 // node entries. 116 // 4. Change of entry data or performing of datapath interactions 117 // 5. Release of the entry mutex 118 // 119 // If both the nodeEntry.mutex and Manager.mutex must be held, then the 120 // Manager.mutex must *always* be acquired first. 121 mutex lock.RWMutex 122 123 // nodes is the list of nodes. Access must be protected via mutex. 124 nodes map[nodeTypes.Identity]*nodeEntry 125 126 // Upon agent startup, this is filled with nodes as read from disk. Used to 127 // synthesize node deletion events for nodes which disappeared while we were 128 // down. 129 restoredNodes map[nodeTypes.Identity]*nodeTypes.Node 130 131 // nodeHandlersMu protects the nodeHandlers map against concurrent access. 132 nodeHandlersMu lock.RWMutex 133 // nodeHandlers has a slice containing all node handlers subscribed to node 134 // events. 135 nodeHandlers map[datapath.NodeHandler]struct{} 136 137 // workerpool manages background workers 138 workerpool *workerpool.WorkerPool 139 140 // metrics to track information about the node manager 141 metrics *nodeMetrics 142 143 // conf is the configuration of the caller passed in via NewManager. 
144 // This field is immutable after NewManager() 145 conf *option.DaemonConfig 146 147 // ipcache is the set operations performed against the ipcache 148 ipcache IPCache 149 150 // ipsetMgr is the ipset cluster nodes configuration manager 151 ipsetMgr ipset.Manager 152 ipsetInitializer ipset.Initializer 153 ipsetFilter IPSetFilterFn 154 155 // controllerManager manages the controllers that are launched within the 156 // Manager. 157 controllerManager *controller.Manager 158 159 // health reports on the current health status of the node manager module. 160 health cell.Health 161 162 // nodeNeighborQueue tracks node neighbor link updates. 163 nodeNeighborQueue queue[nodeQueueEntry] 164 165 // nodeCheckpointer triggers writing the current set of nodes to disk 166 nodeCheckpointer *trigger.Trigger 167 checkpointerDone chan struct{} // Closed once the checkpointer is shut down. 168 169 // Ensure the pruning is only attempted once. 170 nodePruneOnce sync.Once 171 } 172 173 type nodeQueueEntry struct { 174 node *nodeTypes.Node 175 refresh bool 176 } 177 178 // Enqueue add a node to a controller managed queue which sets up the neighbor link. 179 func (m *manager) Enqueue(n *nodeTypes.Node, refresh bool) { 180 if n == nil { 181 log.WithFields(logrus.Fields{ 182 logfields.LogSubsys: "enqueue", 183 }).Warn("Skipping nodeNeighbor insert: No node given") 184 } 185 m.nodeNeighborQueue.push(&nodeQueueEntry{node: n, refresh: refresh}) 186 } 187 188 // Subscribe subscribes the given node handler to node events. 189 func (m *manager) Subscribe(nh datapath.NodeHandler) { 190 m.nodeHandlersMu.Lock() 191 m.nodeHandlers[nh] = struct{}{} 192 m.nodeHandlersMu.Unlock() 193 // Add all nodes already received by the manager. 
194 m.mutex.RLock() 195 for _, v := range m.nodes { 196 v.mutex.Lock() 197 if err := nh.NodeAdd(v.node); err != nil { 198 log.WithFields(logrus.Fields{ 199 "handler": nh.Name(), 200 "node": v.node.Name, 201 }).WithError(err).Error("Failed applying node handler following initial subscribe. Cilium may have degraded functionality. See error message for more details.") 202 } 203 v.mutex.Unlock() 204 } 205 m.mutex.RUnlock() 206 } 207 208 // Unsubscribe unsubscribes the given node handler with node events. 209 func (m *manager) Unsubscribe(nh datapath.NodeHandler) { 210 m.nodeHandlersMu.Lock() 211 delete(m.nodeHandlers, nh) 212 m.nodeHandlersMu.Unlock() 213 } 214 215 // Iter executes the given function in all subscribed node handlers. 216 func (m *manager) Iter(f func(nh datapath.NodeHandler)) { 217 m.nodeHandlersMu.RLock() 218 defer m.nodeHandlersMu.RUnlock() 219 220 for nh := range m.nodeHandlers { 221 f(nh) 222 } 223 } 224 225 type nodeMetrics struct { 226 // metricEventsReceived is the prometheus metric to track the number of 227 // node events received 228 EventsReceived metric.Vec[metric.Counter] 229 230 // metricNumNodes is the prometheus metric to track the number of nodes 231 // being managed 232 NumNodes metric.Gauge 233 234 // metricDatapathValidations is the prometheus metric to track the 235 // number of datapath node validation calls 236 DatapathValidations metric.Counter 237 } 238 239 // ProcessNodeDeletion upon node deletion ensures metrics associated 240 // with the deleted node are no longer reported. 241 // Notably for metrics node connectivity status and latency metrics 242 func (*nodeMetrics) ProcessNodeDeletion(clusterName, nodeName string) { 243 // Removes all connectivity status associated with the deleted node. 
244 _ = metrics.NodeConnectivityStatus.DeletePartialMatch(prometheus.Labels{ 245 metrics.LabelSourceCluster: clusterName, 246 metrics.LabelSourceNodeName: nodeName, 247 }) 248 _ = metrics.NodeConnectivityStatus.DeletePartialMatch(prometheus.Labels{ 249 metrics.LabelTargetCluster: clusterName, 250 metrics.LabelTargetNodeName: nodeName, 251 }) 252 253 // Removes all connectivity latency associated with the deleted node. 254 _ = metrics.NodeConnectivityLatency.DeletePartialMatch(prometheus.Labels{ 255 metrics.LabelSourceCluster: clusterName, 256 metrics.LabelSourceNodeName: nodeName, 257 }) 258 _ = metrics.NodeConnectivityLatency.DeletePartialMatch(prometheus.Labels{ 259 metrics.LabelTargetCluster: clusterName, 260 metrics.LabelTargetNodeName: nodeName, 261 }) 262 } 263 264 func NewNodeMetrics() *nodeMetrics { 265 return &nodeMetrics{ 266 EventsReceived: metric.NewCounterVec(metric.CounterOpts{ 267 ConfigName: metrics.Namespace + "_" + "nodes_all_events_received_total", 268 Namespace: metrics.Namespace, 269 Subsystem: "nodes", 270 Name: "all_events_received_total", 271 Help: "Number of node events received", 272 }, []string{"event_type", "source"}), 273 274 NumNodes: metric.NewGauge(metric.GaugeOpts{ 275 ConfigName: metrics.Namespace + "_" + "nodes_all_num", 276 Namespace: metrics.Namespace, 277 Subsystem: "nodes", 278 Name: "all_num", 279 Help: "Number of nodes managed", 280 }), 281 282 DatapathValidations: metric.NewCounter(metric.CounterOpts{ 283 ConfigName: metrics.Namespace + "_" + "nodes_all_datapath_validations_total", 284 Namespace: metrics.Namespace, 285 Subsystem: "nodes", 286 Name: "all_datapath_validations_total", 287 Help: "Number of validation calls to implement the datapath implementation of a node", 288 }), 289 } 290 } 291 292 // New returns a new node manager 293 func New(c *option.DaemonConfig, ipCache IPCache, ipsetMgr ipset.Manager, ipsetFilter IPSetFilterFn, nodeMetrics *nodeMetrics, health cell.Health) (*manager, error) { 294 if ipsetFilter == 
nil { 295 ipsetFilter = func(*nodeTypes.Node) bool { return false } 296 } 297 298 m := &manager{ 299 nodes: map[nodeTypes.Identity]*nodeEntry{}, 300 restoredNodes: map[nodeTypes.Identity]*nodeTypes.Node{}, 301 conf: c, 302 controllerManager: controller.NewManager(), 303 nodeHandlers: map[datapath.NodeHandler]struct{}{}, 304 ipcache: ipCache, 305 ipsetMgr: ipsetMgr, 306 ipsetInitializer: ipsetMgr.NewInitializer(), 307 ipsetFilter: ipsetFilter, 308 metrics: nodeMetrics, 309 health: health, 310 } 311 312 return m, nil 313 } 314 315 func (m *manager) Start(cell.HookContext) error { 316 m.workerpool = workerpool.New(numBackgroundWorkers) 317 318 // Ensure that we read a potential nodes file before we overwrite it. 319 m.restoreNodeCheckpoint() 320 if err := m.initNodeCheckpointer(nodeCheckpointMinInterval); err != nil { 321 return fmt.Errorf("failed to initialize node file writer: %w", err) 322 } 323 324 return m.workerpool.Submit("backgroundSync", m.backgroundSync) 325 } 326 327 // Stop shuts down a node manager 328 func (m *manager) Stop(cell.HookContext) error { 329 if m.workerpool != nil { 330 if err := m.workerpool.Close(); err != nil { 331 return err 332 } 333 } 334 335 m.mutex.Lock() 336 defer m.mutex.Unlock() 337 338 if m.nodeCheckpointer != nil { 339 // Using the shutdown func of trigger to checkpoint would block shutdown 340 // for up to its MinInterval, which is too long. 341 m.nodeCheckpointer.Shutdown() 342 close(m.checkpointerDone) 343 err := m.checkpoint() 344 if err != nil { 345 log.WithError(err).Error("Failed to write final node checkpoint.") 346 } 347 m.nodeCheckpointer = nil 348 } 349 350 return nil 351 } 352 353 // ClusterSizeDependantInterval returns a time.Duration that is dependant on 354 // the cluster size, i.e. the number of nodes that have been discovered. This 355 // can be used to control sync intervals of shared or centralized resources to 356 // avoid overloading these resources as the cluster grows. 
357 // 358 // Example sync interval with baseInterval = 1 * time.Minute 359 // 360 // nodes | sync interval 361 // ------+----------------- 362 // 1 | 41.588830833s 363 // 2 | 1m05.916737320s 364 // 4 | 1m36.566274746s 365 // 8 | 2m11.833474640s 366 // 16 | 2m49.992800643s 367 // 32 | 3m29.790453687s 368 // 64 | 4m10.463236193s 369 // 128 | 4m51.588744261s 370 // 256 | 5m32.944565093s 371 // 512 | 6m14.416550710s 372 // 1024 | 6m55.946873494s 373 // 2048 | 7m37.506428894s 374 // 4096 | 8m19.080616652s 375 // 8192 | 9m00.662124608s 376 // 16384 | 9m42.247293667s 377 func (m *manager) ClusterSizeDependantInterval(baseInterval time.Duration) time.Duration { 378 m.mutex.RLock() 379 numNodes := len(m.nodes) 380 m.mutex.RUnlock() 381 382 return backoff.ClusterSizeDependantInterval(baseInterval, numNodes) 383 } 384 385 func (m *manager) backgroundSyncInterval() time.Duration { 386 return m.ClusterSizeDependantInterval(baseBackgroundSyncInterval) 387 } 388 389 // backgroundSync ensures that local node has a valid datapath in-place for 390 // each node in the cluster. See NodeValidateImplementation(). 391 func (m *manager) backgroundSync(ctx context.Context) error { 392 syncTimer, syncTimerDone := inctimer.New() 393 defer syncTimerDone() 394 for { 395 syncInterval := m.backgroundSyncInterval() 396 startWaiting := syncTimer.After(syncInterval) 397 log.WithField("syncInterval", syncInterval.String()).Debug("Starting new iteration of background sync") 398 err := m.singleBackgroundLoop(ctx, syncInterval) 399 log.WithField("syncInterval", syncInterval.String()).Debug("Finished iteration of background sync") 400 401 select { 402 case <-ctx.Done(): 403 return nil 404 // This handles cases when we didn't fetch nodes yet (e.g. on bootstrap) 405 // but also case when we have 1 node, in which case rate.Limiter doesn't 406 // throttle anything. 
407 case <-startWaiting: 408 } 409 410 hr := m.health.NewScope("background-sync") 411 if err != nil { 412 hr.Degraded("Failed to apply node validation", err) 413 } else { 414 hr.OK("Node validation successful") 415 } 416 } 417 } 418 419 func (m *manager) singleBackgroundLoop(ctx context.Context, expectedLoopTime time.Duration) error { 420 var errs error 421 // get a copy of the node identities to avoid locking the entire manager 422 // throughout the process of running the datapath validation. 423 nodes := m.GetNodeIdentities() 424 limiter := rate.NewLimiter( 425 rate.Limit(float64(len(nodes))/float64(expectedLoopTime.Seconds())), 426 1, // One token in bucket to amortize for latency of the operation 427 ) 428 for _, nodeIdentity := range nodes { 429 if err := limiter.Wait(ctx); err != nil { 430 log.WithError(err).Debug("Error while rate limiting backgroundSync updates") 431 } 432 433 select { 434 case <-ctx.Done(): 435 return nil 436 default: 437 } 438 // Retrieve latest node information in case any event 439 // changed the node since the call to GetNodes() 440 m.mutex.RLock() 441 entry, ok := m.nodes[nodeIdentity] 442 if !ok { 443 m.mutex.RUnlock() 444 continue 445 } 446 entry.mutex.Lock() 447 m.mutex.RUnlock() 448 { 449 m.Iter(func(nh datapath.NodeHandler) { 450 if err := nh.NodeValidateImplementation(entry.node); err != nil { 451 log.WithFields(logrus.Fields{ 452 "handler": nh.Name(), 453 "node": entry.node.Name, 454 }).WithError(err). 455 Error("Failed to apply node handler during background sync. Cilium may have degraded functionality. 
See error message for details.") 456 errs = errors.Join(errs, fmt.Errorf("failed while handling %s on node %s: %w", nh.Name(), entry.node.Name, err)) 457 } 458 }) 459 } 460 entry.mutex.Unlock() 461 462 m.metrics.DatapathValidations.Inc() 463 } 464 return errs 465 } 466 467 func (m *manager) restoreNodeCheckpoint() { 468 path := filepath.Join(m.conf.StateDir, nodesFilename) 469 l := log.WithField(logfields.Path, path) 470 f, err := os.Open(path) 471 if err != nil { 472 if errors.Is(err, fs.ErrNotExist) { 473 // If we don't have a file to restore from, there's nothing we can 474 // do. This is expected in the upgrade path. 475 l.Debugf("No %v file found, cannot replay node deletion events for nodes"+ 476 " which disappeared during downtime.", nodesFilename) 477 return 478 } 479 l.WithError(err).Error("failed to read node checkpoint file") 480 return 481 } 482 483 r := jsoniter.ConfigFastest.NewDecoder(bufio.NewReader(f)) 484 var nodeCheckpoint []*nodeTypes.Node 485 if err := r.Decode(&nodeCheckpoint); err != nil { 486 l.WithError(err).Error("failed to decode node checkpoint file") 487 return 488 } 489 490 // We can't call NodeUpdated for restored nodes here, as the machinery 491 // assumes a fully initialized node manager, which we don't currently have. 492 // In addition, we only want to replay NodeDeletions, since k8s provided 493 // up-to-date information on all live nodes. We keep the restored nodes 494 // separate, let whatever init needs to happen occur and once we're synced 495 // to k8s, compare the restored nodes to the live ones. 496 for _, n := range nodeCheckpoint { 497 n.Source = source.Restored 498 m.restoredNodes[n.Identity()] = n 499 } 500 } 501 502 // initNodeCheckpointer sets up the trigger for writing nodes to disk. 
503 func (m *manager) initNodeCheckpointer(minInterval time.Duration) error { 504 var err error 505 health := m.health.NewScope("node-checkpoint-writer") 506 m.checkpointerDone = make(chan struct{}) 507 508 m.nodeCheckpointer, err = trigger.NewTrigger(trigger.Parameters{ 509 Name: "node-checkpoint-trigger", 510 MinInterval: minInterval, // To avoid rapid repetition (e.g. during startup). 511 TriggerFunc: func(reasons []string) { 512 m.mutex.RLock() 513 select { 514 // The trigger package does not check whether the trigger is shut 515 // down already after sleeping to honor the MinInterval. Hence, we 516 // do so ourselves. 517 case <-m.checkpointerDone: 518 return 519 default: 520 } 521 err := m.checkpoint() 522 m.mutex.RUnlock() 523 524 if err != nil { 525 log.WithFields(logrus.Fields{ 526 logfields.Reason: reasons, 527 }).WithError(err).Error("could not write node checkpoint") 528 health.Degraded("failed to write node checkpoint", err) 529 } else { 530 health.OK("node checkpoint written") 531 } 532 }, 533 }) 534 return err 535 } 536 537 // checkpoint writes all nodes to disk. Assumes the manager is read locked. 538 // Don't call this directly, use the nodeCheckpointer trigger. 539 func (m *manager) checkpoint() error { 540 stateDir := m.conf.StateDir 541 nodesPath := filepath.Join(stateDir, nodesFilename) 542 log.WithFields(logrus.Fields{ 543 logfields.Path: nodesPath, 544 }).Debug("writing node checkpoint to disk") 545 546 // Write new contents to a temporary file which will be atomically renamed to the 547 // real file at the end of this function to avoid data corruption if we crash. 
548 f, err := renameio.TempFile(stateDir, nodesPath) 549 if err != nil { 550 return fmt.Errorf("failed to open temporary file: %w", err) 551 } 552 defer f.Cleanup() 553 554 bw := bufio.NewWriter(f) 555 w := jsoniter.ConfigFastest.NewEncoder(bw) 556 ns := make([]nodeTypes.Node, 0, len(m.nodes)) 557 for _, n := range m.nodes { 558 ns = append(ns, n.node) 559 } 560 if err := w.Encode(ns); err != nil { 561 return fmt.Errorf("failed to encode node checkpoint: %w", err) 562 } 563 if err := bw.Flush(); err != nil { 564 return fmt.Errorf("failed to flush node checkpoint writer: %w", err) 565 } 566 567 return f.CloseAtomicallyReplace() 568 } 569 570 func (m *manager) nodeAddressHasTunnelIP(address nodeTypes.Address) bool { 571 // If the host firewall is enabled, all traffic to remote nodes must go 572 // through the tunnel to preserve the source identity as part of the 573 // encapsulation. In encryption case we also want to use vxlan device 574 // to create symmetric traffic when sending nodeIP->pod and pod->nodeIP. 575 return address.Type == addressing.NodeCiliumInternalIP || m.conf.NodeEncryptionEnabled() || 576 m.conf.EnableHostFirewall || m.conf.JoinCluster 577 } 578 579 func (m *manager) nodeAddressHasEncryptKey() bool { 580 // If we are doing encryption, but not node based encryption, then do not 581 // add a key to the nodeIPs so that we avoid a trip through stack and attempting 582 // to encrypt something we know does not have an encryption policy installed 583 // in the datapath. By setting key=0 and tunnelIP this will result in traffic 584 // being sent unencrypted over overlay device. 585 return m.conf.NodeEncryptionEnabled() && 586 // Also ignore any remote node's key if the local node opted to not perform 587 // node-to-node encryption 588 !node.GetOptOutNodeEncryption() 589 } 590 591 // endpointEncryptionKey returns the encryption key index to use for the health 592 // and ingress endpoints of a node. 
This is needed for WireGuard where the 593 // node's EncryptionKey and the endpoint's EncryptionKey are not the same if 594 // a node has opted out of node-to-node encryption by zeroing n.EncryptionKey. 595 // With WireGuard, we always want to encrypt pod-to-pod traffic, thus we return 596 // a static non-zero encrypt key here. 597 // With IPSec (or no encryption), the node's encryption key index and the 598 // encryption key of the endpoint on that node are the same. 599 func (m *manager) endpointEncryptionKey(n *nodeTypes.Node) ipcacheTypes.EncryptKey { 600 if m.conf.EnableWireguard { 601 return ipcacheTypes.EncryptKey(types.StaticEncryptKey) 602 } 603 604 return ipcacheTypes.EncryptKey(n.EncryptionKey) 605 } 606 607 func (m *manager) nodeIdentityLabels(n nodeTypes.Node) (nodeLabels labels.Labels, hasOverride bool) { 608 nodeLabels = labels.NewFrom(labels.LabelRemoteNode) 609 if n.IsLocal() { 610 nodeLabels = labels.NewFrom(labels.LabelHost) 611 if m.conf.PolicyCIDRMatchesNodes() { 612 for _, address := range n.IPAddresses { 613 addr, ok := ip.AddrFromIP(address.IP) 614 if ok { 615 bitLen := addr.BitLen() 616 if m.conf.EnableIPv4 && bitLen == net.IPv4len*8 || 617 m.conf.EnableIPv6 && bitLen == net.IPv6len*8 { 618 prefix, err := addr.Prefix(bitLen) 619 if err == nil { 620 cidrLabels := labels.GetCIDRLabels(prefix) 621 nodeLabels.MergeLabels(cidrLabels) 622 } 623 } 624 } 625 } 626 } 627 } else if !identity.NumericIdentity(n.NodeIdentity).IsReservedIdentity() { 628 // This needs to match clustermesh-apiserver's VMManager.AllocateNodeIdentity 629 nodeLabels = labels.Map2Labels(n.Labels, labels.LabelSourceK8s) 630 hasOverride = true 631 } else if !n.IsLocal() && option.Config.PerNodeLabelsEnabled() { 632 lbls := labels.Map2Labels(n.Labels, labels.LabelSourceNode) 633 filteredLbls, _ := labelsfilter.FilterNodeLabels(lbls) 634 nodeLabels.MergeLabels(filteredLbls) 635 } 636 637 return nodeLabels, hasOverride 638 } 639 640 // NodeUpdated is called after the information of 
a node has been updated. The 641 // node in the manager is added or updated if the source is allowed to update 642 // the node. If an update or addition has occurred, NodeUpdate() of the datapath 643 // interface is invoked. 644 func (m *manager) NodeUpdated(n nodeTypes.Node) { 645 log.WithFields(logrus.Fields{ 646 logfields.ClusterName: n.Cluster, 647 logfields.NodeName: n.Name, 648 logfields.SPI: n.EncryptionKey, 649 }).Info("Node updated") 650 if log.Logger.IsLevelEnabled(logrus.DebugLevel) { 651 log.WithField(logfields.Node, n.LogRepr()).Debugf("Received node update event from %s", n.Source) 652 } 653 654 nodeIdentifier := n.Identity() 655 dpUpdate := true 656 var nodeIP netip.Addr 657 if nIP := n.GetNodeIP(false); nIP != nil { 658 // GH-24829: Support IPv6-only nodes. 659 660 // Skip returning the error here because at this level, we assume that 661 // the IP is valid as long as it's coming from nodeTypes.Node. This 662 // object is created either from the node discovery (K8s) or from an 663 // event from the kvstore. 664 nodeIP, _ = ip.AddrFromIP(nIP) 665 } 666 667 resource := ipcacheTypes.NewResourceID(ipcacheTypes.ResourceKindNode, "", n.Name) 668 nodeLabels, nodeIdentityOverride := m.nodeIdentityLabels(n) 669 670 var ipsetEntries []netip.Prefix 671 var nodeIPsAdded, healthIPsAdded, ingressIPsAdded []netip.Prefix 672 673 for _, address := range n.IPAddresses { 674 prefix := ip.IPToNetPrefix(address.IP) 675 676 if address.Type == addressing.NodeInternalIP && !m.ipsetFilter(&n) { 677 ipsetEntries = append(ipsetEntries, prefix) 678 } 679 680 var tunnelIP netip.Addr 681 if m.nodeAddressHasTunnelIP(address) { 682 tunnelIP = nodeIP 683 } 684 685 var key uint8 686 if m.nodeAddressHasEncryptKey() { 687 key = n.EncryptionKey 688 } 689 690 // We expect the node manager to have a source of either Kubernetes, 691 // CustomResource, or KVStore. Prioritize the KVStore source over the 692 // rest as it is the strongest source, i.e. 
only trigger datapath 693 // updates if the information we receive takes priority. 694 // 695 // There are two exceptions to the rules above: 696 // * kube-apiserver entries - in that case, 697 // we still want to inform subscribers about changes in auxiliary 698 // data such as for example the health endpoint. 699 // * CiliumInternal IP addresses that match configured local router IP. 700 // In that case, we still want to inform subscribers about a new node 701 // even when IP addresses may seem repeated across the nodes. 702 existing := m.ipcache.GetMetadataSourceByPrefix(prefix) 703 overwrite := source.AllowOverwrite(existing, n.Source) 704 if !overwrite && existing != source.KubeAPIServer && 705 !(address.Type == addressing.NodeCiliumInternalIP && m.conf.IsLocalRouterIP(address.ToString())) { 706 dpUpdate = false 707 } 708 709 lbls := nodeLabels 710 // Add the CIDR labels for this node, if we allow selecting nodes by CIDR 711 if m.conf.PolicyCIDRMatchesNodes() { 712 lbls = labels.NewFrom(nodeLabels) 713 lbls.MergeLabels(labels.GetCIDRLabels(prefix)) 714 } 715 716 // Always associate the prefix with metadata, even though this may not 717 // end up in an ipcache entry. 718 m.ipcache.UpsertMetadata(prefix, n.Source, resource, 719 lbls, 720 ipcacheTypes.TunnelPeer{Addr: tunnelIP}, 721 ipcacheTypes.EncryptKey(key)) 722 if nodeIdentityOverride { 723 m.ipcache.OverrideIdentity(prefix, nodeLabels, n.Source, resource) 724 } 725 nodeIPsAdded = append(nodeIPsAdded, prefix) 726 } 727 728 var v4Addrs, v6Addrs []netip.Addr 729 for _, prefix := range ipsetEntries { 730 addr := prefix.Addr() 731 if addr.Is6() { 732 v6Addrs = append(v6Addrs, addr) 733 } else { 734 v4Addrs = append(v4Addrs, addr) 735 } 736 } 737 m.ipsetMgr.AddToIPSet(ipset.CiliumNodeIPSetV4, ipset.INetFamily, v4Addrs...) 738 m.ipsetMgr.AddToIPSet(ipset.CiliumNodeIPSetV6, ipset.INet6Family, v6Addrs...) 
739 740 for _, address := range []net.IP{n.IPv4HealthIP, n.IPv6HealthIP} { 741 healthIP := ip.IPToNetPrefix(address) 742 if !healthIP.IsValid() { 743 continue 744 } 745 if !source.AllowOverwrite(m.ipcache.GetMetadataSourceByPrefix(healthIP), n.Source) { 746 dpUpdate = false 747 } 748 749 m.ipcache.UpsertMetadata(healthIP, n.Source, resource, 750 labels.LabelHealth, 751 ipcacheTypes.TunnelPeer{Addr: nodeIP}, 752 m.endpointEncryptionKey(&n)) 753 healthIPsAdded = append(healthIPsAdded, healthIP) 754 } 755 756 for _, address := range []net.IP{n.IPv4IngressIP, n.IPv6IngressIP} { 757 ingressIP := ip.IPToNetPrefix(address) 758 if !ingressIP.IsValid() { 759 continue 760 } 761 if !source.AllowOverwrite(m.ipcache.GetMetadataSourceByPrefix(ingressIP), n.Source) { 762 dpUpdate = false 763 } 764 765 m.ipcache.UpsertMetadata(ingressIP, n.Source, resource, 766 labels.LabelIngress, 767 ipcacheTypes.TunnelPeer{Addr: nodeIP}, 768 m.endpointEncryptionKey(&n)) 769 ingressIPsAdded = append(ingressIPsAdded, ingressIP) 770 } 771 772 m.mutex.Lock() 773 entry, oldNodeExists := m.nodes[nodeIdentifier] 774 if oldNodeExists { 775 m.metrics.EventsReceived.WithLabelValues("update", string(n.Source)).Inc() 776 777 if !source.AllowOverwrite(entry.node.Source, n.Source) { 778 // Done; skip node-handler updates and label injection 779 // triggers below. Includes case where the local host 780 // was discovered locally and then is subsequently 781 // updated by the k8s watcher. 782 m.mutex.Unlock() 783 return 784 } 785 786 entry.mutex.Lock() 787 m.mutex.Unlock() 788 oldNode := entry.node 789 entry.node = n 790 if dpUpdate { 791 var errs error 792 m.Iter(func(nh datapath.NodeHandler) { 793 if err := nh.NodeUpdate(oldNode, entry.node); err != nil { 794 log.WithFields(logrus.Fields{ 795 "handler": nh.Name(), 796 "node": entry.node.Name, 797 }).WithError(err). 798 Error("Failed to handle node update event while applying handler. Cilium may be have degraded functionality. 
See error message for details.") 799 errs = errors.Join(errs, err) 800 } 801 }) 802 803 hr := m.health.NewScope("nodes-update") 804 if errs != nil { 805 hr.Degraded("Failed to update nodes", errs) 806 } else { 807 hr.OK("Node updates successful") 808 } 809 } 810 811 m.removeNodeFromIPCache(oldNode, resource, ipsetEntries, nodeIPsAdded, healthIPsAdded, ingressIPsAdded) 812 813 entry.mutex.Unlock() 814 } else { 815 m.metrics.EventsReceived.WithLabelValues("add", string(n.Source)).Inc() 816 m.metrics.NumNodes.Inc() 817 818 entry = &nodeEntry{node: n} 819 entry.mutex.Lock() 820 m.nodes[nodeIdentifier] = entry 821 m.mutex.Unlock() 822 var errs error 823 if dpUpdate { 824 m.Iter(func(nh datapath.NodeHandler) { 825 if err := nh.NodeAdd(entry.node); err != nil { 826 log.WithFields(logrus.Fields{ 827 "node": entry.node.Name, 828 "handler": nh.Name(), 829 }).WithError(err). 830 Error("Failed to handle node update event while applying handler. Cilium may be have degraded functionality. See error message for details.") 831 errs = errors.Join(errs, err) 832 } 833 }) 834 } 835 entry.mutex.Unlock() 836 hr := m.health.NewScope("nodes-add") 837 if errs != nil { 838 hr.Degraded("Failed to add nodes", errs) 839 } else { 840 hr.OK("Node adds successful") 841 } 842 843 } 844 845 if m.nodeCheckpointer != nil { 846 m.nodeCheckpointer.TriggerWithReason("NodeUpdate") 847 } 848 } 849 850 // removeNodeFromIPCache removes all addresses associated with oldNode from the IPCache, 851 // unless they are present in the nodeIPsAdded, healthIPsAdded, ingressIPsAdded lists. 852 // Removes ipset entry associated with oldNode if it is not present in ipsetEntries. 853 // 854 // The removal logic in this function should mirror the upsert logic in NodeUpdated. 
func (m *manager) removeNodeFromIPCache(oldNode nodeTypes.Node, resource ipcacheTypes.ResourceID,
	ipsetEntries, nodeIPsAdded, healthIPsAdded, ingressIPsAdded []netip.Prefix) {

	var oldNodeIP netip.Addr
	if nIP := oldNode.GetNodeIP(false); nIP != nil {
		// See comment in NodeUpdated().
		oldNodeIP, _ = ip.AddrFromIP(nIP)
	}
	oldNodeLabels, oldNodeIdentityOverride := m.nodeIdentityLabels(oldNode)

	// Delete the old node IP addresses if they have changed in this node.
	// v4Addrs/v6Addrs collect the addresses to drop from the node ipsets.
	var v4Addrs, v6Addrs []netip.Addr
	for _, address := range oldNode.IPAddresses {
		oldPrefix := ip.IPToNetPrefix(address.IP)
		// Still present on the updated node: keep it.
		if slices.Contains(nodeIPsAdded, oldPrefix) {
			continue
		}

		if address.Type == addressing.NodeInternalIP && !slices.Contains(ipsetEntries, oldPrefix) {
			addr, ok := ip.AddrFromIP(address.IP)
			if !ok {
				log.WithField(logfields.IPAddr, address.IP).Error("unable to convert to netip.Addr")
				// NOTE: this also skips the ipcache metadata removal
				// below for this address.
				continue
			}
			if addr.Is6() {
				v6Addrs = append(v6Addrs, addr)
			} else {
				v4Addrs = append(v4Addrs, addr)
			}
		}

		// Recompute the tunnel peer and key the same way NodeUpdated does,
		// so the values passed to RemoveMetadata match the upserted ones.
		var oldTunnelIP netip.Addr
		if m.nodeAddressHasTunnelIP(address) {
			oldTunnelIP = oldNodeIP
		}

		var oldKey uint8
		if m.nodeAddressHasEncryptKey() {
			oldKey = oldNode.EncryptionKey
		}

		m.ipcache.RemoveMetadata(oldPrefix, resource,
			oldNodeLabels,
			ipcacheTypes.TunnelPeer{Addr: oldTunnelIP},
			ipcacheTypes.EncryptKey(oldKey))
		if oldNodeIdentityOverride {
			m.ipcache.RemoveIdentityOverride(oldPrefix, oldNodeLabels, resource)
		}
	}

	m.ipsetMgr.RemoveFromIPSet(ipset.CiliumNodeIPSetV4, v4Addrs...)
	m.ipsetMgr.RemoveFromIPSet(ipset.CiliumNodeIPSetV6, v6Addrs...)

	// Delete the old health IP addresses if they have changed in this node.
	for _, address := range []net.IP{oldNode.IPv4HealthIP, oldNode.IPv6HealthIP} {
		healthIP := ip.IPToNetPrefix(address)
		if !healthIP.IsValid() || slices.Contains(healthIPsAdded, healthIP) {
			continue
		}

		m.ipcache.RemoveMetadata(healthIP, resource,
			labels.LabelHealth,
			ipcacheTypes.TunnelPeer{Addr: oldNodeIP},
			m.endpointEncryptionKey(&oldNode))
	}

	// Delete the old ingress IP addresses if they have changed in this node.
	for _, address := range []net.IP{oldNode.IPv4IngressIP, oldNode.IPv6IngressIP} {
		ingressIP := ip.IPToNetPrefix(address)
		if !ingressIP.IsValid() || slices.Contains(ingressIPsAdded, ingressIP) {
			continue
		}

		m.ipcache.RemoveMetadata(ingressIP, resource,
			labels.LabelIngress,
			ipcacheTypes.TunnelPeer{Addr: oldNodeIP},
			m.endpointEncryptionKey(&oldNode))
	}
}

// NodeDeleted is called after a node has been deleted. It removes the node
// from the manager if the node is still owned by the source from which the
// event originates. If the node was removed, NodeDelete() of the datapath
// interface is invoked.
func (m *manager) NodeDeleted(n nodeTypes.Node) {
	log.WithFields(logrus.Fields{
		logfields.ClusterName: n.Cluster,
		logfields.NodeName:    n.Name,
	}).Info("Node deleted")
	if log.Logger.IsLevelEnabled(logrus.DebugLevel) {
		log.Debugf("Received node delete event from %s", n.Source)
	}

	m.metrics.EventsReceived.WithLabelValues("delete", string(n.Source)).Inc()

	nodeIdentifier := n.Identity()

	var (
		entry         *nodeEntry
		oldNodeExists bool
	)

	m.mutex.Lock()
	// If the node is restored from disk, it doesn't exist in the bookkeeping,
	// but we need to synthesize a deletion event for downstream.
960 if n.Source == source.Restored { 961 entry = &nodeEntry{ 962 node: n, 963 } 964 } else { 965 entry, oldNodeExists = m.nodes[nodeIdentifier] 966 if !oldNodeExists { 967 m.mutex.Unlock() 968 return 969 } 970 } 971 972 // If the source is Kubernetes and the node is the node we are running on 973 // Kubernetes is giving us a hint it is about to delete our node. Close down 974 // the agent gracefully in this case. 975 if n.Source != entry.node.Source { 976 m.mutex.Unlock() 977 if n.IsLocal() && n.Source == source.Kubernetes { 978 log.Debugf("Kubernetes is deleting local node, close manager") 979 m.Stop(context.Background()) 980 } else { 981 log.Debugf("Ignoring delete event of node %s from source %s. The node is owned by %s", 982 n.Name, n.Source, entry.node.Source) 983 } 984 return 985 } 986 987 // The ipcache is recreated from scratch on startup, no need to prune restored stale nodes. 988 if n.Source != source.Restored { 989 resource := ipcacheTypes.NewResourceID(ipcacheTypes.ResourceKindNode, "", n.Name) 990 m.removeNodeFromIPCache(entry.node, resource, nil, nil, nil, nil) 991 } 992 993 m.metrics.NumNodes.Dec() 994 m.metrics.ProcessNodeDeletion(n.Cluster, n.Name) 995 996 entry.mutex.Lock() 997 delete(m.nodes, nodeIdentifier) 998 if m.nodeCheckpointer != nil { 999 m.nodeCheckpointer.TriggerWithReason("NodeDeleted") 1000 } 1001 m.mutex.Unlock() 1002 var errs error 1003 m.Iter(func(nh datapath.NodeHandler) { 1004 if err := nh.NodeDelete(n); err != nil { 1005 // For now we log the error and continue. Eventually we will want to encorporate 1006 // this into the node managers health status. 1007 // However this is a bit tricky - as leftover node deletes are not retries so this will 1008 // need to be accompanied by some kind of retry mechanism. 1009 log.WithFields(logrus.Fields{ 1010 "handler": nh.Name(), 1011 "node": n.Name, 1012 }).WithError(err).Error("Failed to handle node delete event while applying handler. 
Cilium may be have degraded functionality.") 1013 errs = errors.Join(errs, err) 1014 } 1015 }) 1016 entry.mutex.Unlock() 1017 1018 hr := m.health.NewScope("nodes-delete") 1019 if errs != nil { 1020 hr.Degraded("Failed to delete nodes", errs) 1021 } else { 1022 hr.OK("Node deletions successful") 1023 } 1024 } 1025 1026 // NodeSync signals the manager that the initial nodes listing (either from k8s 1027 // or kvstore) has been completed. This allows the manager to initiate the 1028 // deletion of possible stale nodes. 1029 func (m *manager) NodeSync() { 1030 m.ipsetInitializer.InitDone() 1031 1032 // Due to the complexity around kvstore vs k8s as node sources, it may occur 1033 // that both sources call NodeSync at some point. Ensure we only run this 1034 // pruning operation once. 1035 m.nodePruneOnce.Do(func() { 1036 m.pruneNodes(false) 1037 }) 1038 } 1039 1040 func (m *manager) MeshNodeSync() { 1041 m.pruneNodes(true) 1042 } 1043 1044 func (m *manager) pruneNodes(includeMeshed bool) { 1045 m.mutex.Lock() 1046 if m.restoredNodes == nil || len(m.restoredNodes) == 0 { 1047 m.mutex.Unlock() 1048 return 1049 } 1050 // Live nodes should not be pruned. 1051 for id := range m.nodes { 1052 delete(m.restoredNodes, id) 1053 } 1054 1055 if len(m.restoredNodes) > 0 { 1056 log.WithFields(logrus.Fields{ 1057 "stale-nodes": m.restoredNodes, 1058 }).Info("Deleting stale nodes") 1059 } 1060 m.mutex.Unlock() 1061 1062 // Delete nodes now considered stale. Can't hold the mutex as 1063 // NodeDeleted also acquires it. 1064 for id, n := range m.restoredNodes { 1065 if n.Cluster == m.conf.ClusterName || includeMeshed { 1066 m.NodeDeleted(*n) 1067 delete(m.restoredNodes, id) 1068 } 1069 } 1070 } 1071 1072 // GetNodeIdentities returns a list of all node identities store in node 1073 // manager. 
1074 func (m *manager) GetNodeIdentities() []nodeTypes.Identity { 1075 m.mutex.RLock() 1076 defer m.mutex.RUnlock() 1077 1078 nodes := make([]nodeTypes.Identity, 0, len(m.nodes)) 1079 for nodeIdentity := range m.nodes { 1080 nodes = append(nodes, nodeIdentity) 1081 } 1082 1083 return nodes 1084 } 1085 1086 // GetNodes returns a copy of all of the nodes as a map from Identity to Node. 1087 func (m *manager) GetNodes() map[nodeTypes.Identity]nodeTypes.Node { 1088 m.mutex.RLock() 1089 defer m.mutex.RUnlock() 1090 1091 nodes := make(map[nodeTypes.Identity]nodeTypes.Node, len(m.nodes)) 1092 for nodeIdentity, entry := range m.nodes { 1093 entry.mutex.Lock() 1094 nodes[nodeIdentity] = entry.node 1095 entry.mutex.Unlock() 1096 } 1097 1098 return nodes 1099 } 1100 1101 // StartNodeNeighborLinkUpdater manages node neighbors links sync. 1102 // This provides a central location for all node neighbor link updates. 1103 // Under proper conditions, publisher enqueues the node which requires a link update. 1104 // This controller is agnostic of the condition under which the links must be established, thus 1105 // that responsibility lies on the publishers. 1106 // This controller also provides for module health to be reported in a single central location. 
1107 func (m *manager) StartNodeNeighborLinkUpdater(nh datapath.NodeNeighbors) { 1108 sc := m.health.NewScope("neighbor-link-updater") 1109 controller.NewManager().UpdateController( 1110 "node-neighbor-link-updater", 1111 controller.ControllerParams{ 1112 Group: neighborTableUpdateControllerGroup, 1113 DoFunc: func(ctx context.Context) error { 1114 var errs error 1115 if m.nodeNeighborQueue.isEmpty() { 1116 return nil 1117 } 1118 for { 1119 e, ok := m.nodeNeighborQueue.pop() 1120 if !ok { 1121 break 1122 } else if e == nil || e.node == nil { 1123 errs = errors.Join(errs, fmt.Errorf("invalid node spec found in queue: %#v", e)) 1124 break 1125 } 1126 1127 log.Debugf("Refreshing node neighbor link for %s", e.node.Name) 1128 hr := sc.NewScope(e.node.Name) 1129 if errs = errors.Join(errs, nh.NodeNeighborRefresh(ctx, *e.node, e.refresh)); errs != nil { 1130 hr.Degraded("Failed node neighbor link update", errs) 1131 } else { 1132 hr.OK("Node neighbor link update successful") 1133 } 1134 } 1135 return errs 1136 }, 1137 RunInterval: defaultNodeUpdateInterval, 1138 }, 1139 ) 1140 } 1141 1142 // StartNeighborRefresh spawns a controller which refreshes neighbor table 1143 // by forcing node neighbors refresh periodically based on the arping settings. 
1144 func (m *manager) StartNeighborRefresh(nh datapath.NodeNeighbors) { 1145 ctx, cancel := context.WithCancel(context.Background()) 1146 controller.NewManager().UpdateController( 1147 "neighbor-table-refresh", 1148 controller.ControllerParams{ 1149 Group: neighborTableRefreshControllerGroup, 1150 DoFunc: func(controllerCtx context.Context) error { 1151 // Cancel previous goroutines from previous controller run 1152 cancel() 1153 ctx, cancel = context.WithCancel(controllerCtx) 1154 m.mutex.RLock() 1155 defer m.mutex.RUnlock() 1156 for _, entry := range m.nodes { 1157 entry.mutex.Lock() 1158 entryNode := entry.node 1159 entry.mutex.Unlock() 1160 if entryNode.IsLocal() { 1161 continue 1162 } 1163 go func(ctx context.Context, e *nodeTypes.Node) { 1164 // TODO Should this be moved to dequeue instead? 1165 // To avoid flooding network with arping requests 1166 // at the same time, spread them over the 1167 // [0; ARPPingRefreshPeriod/2) period. 1168 n := rand.Int64N(int64(m.conf.ARPPingRefreshPeriod / 2)) 1169 time.Sleep(time.Duration(n)) 1170 m.Enqueue(e, false) 1171 }(ctx, &entryNode) 1172 } 1173 return nil 1174 }, 1175 RunInterval: m.conf.ARPPingRefreshPeriod, 1176 }, 1177 ) 1178 }