github.com/cilium/cilium@v1.16.2/pkg/k8s/watchers/pod.go

// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package watchers

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"maps"
	"net"
	"net/netip"
	"strings"
	"sync"
	"sync/atomic"

	"github.com/cilium/ebpf"
	"github.com/cilium/ebpf/asm"
	"github.com/cilium/hive/cell"
	"github.com/cilium/statedb"
	"github.com/sirupsen/logrus"
	k8sErrors "k8s.io/apimachinery/pkg/api/errors"
	meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/tools/cache"

	agentK8s "github.com/cilium/cilium/daemon/k8s"
	"github.com/cilium/cilium/pkg/annotation"
	cgroup "github.com/cilium/cilium/pkg/cgroups/manager"
	cmtypes "github.com/cilium/cilium/pkg/clustermesh/types"
	"github.com/cilium/cilium/pkg/controller"
	"github.com/cilium/cilium/pkg/datapath/linux/bandwidth"
	"github.com/cilium/cilium/pkg/datapath/linux/probes"
	datapathTables "github.com/cilium/cilium/pkg/datapath/tables"
	datapath "github.com/cilium/cilium/pkg/datapath/types"
	"github.com/cilium/cilium/pkg/endpoint"
	"github.com/cilium/cilium/pkg/endpoint/regeneration"
	"github.com/cilium/cilium/pkg/endpointmanager"
	"github.com/cilium/cilium/pkg/identity"
	"github.com/cilium/cilium/pkg/ip"
	"github.com/cilium/cilium/pkg/ipcache"
	"github.com/cilium/cilium/pkg/k8s"
	k8sClient "github.com/cilium/cilium/pkg/k8s/client"
	"github.com/cilium/cilium/pkg/k8s/informer"
	"github.com/cilium/cilium/pkg/k8s/resource"
	slim_corev1 "github.com/cilium/cilium/pkg/k8s/slim/k8s/api/core/v1"
	slim_metav1 "github.com/cilium/cilium/pkg/k8s/slim/k8s/apis/meta/v1"
	slimclientset "github.com/cilium/cilium/pkg/k8s/slim/k8s/client/clientset/versioned"
	k8sSynced "github.com/cilium/cilium/pkg/k8s/synced"
	k8sTypes "github.com/cilium/cilium/pkg/k8s/types"
	k8sUtils "github.com/cilium/cilium/pkg/k8s/utils"
	"github.com/cilium/cilium/pkg/k8s/watchers/resources"
	"github.com/cilium/cilium/pkg/k8s/watchers/utils"
	"github.com/cilium/cilium/pkg/kvstore"
	"github.com/cilium/cilium/pkg/labels"
	"github.com/cilium/cilium/pkg/labelsfilter"
	"github.com/cilium/cilium/pkg/loadbalancer"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/logging/logfields"
	"github.com/cilium/cilium/pkg/metrics"
	"github.com/cilium/cilium/pkg/node"
	nodeTypes "github.com/cilium/cilium/pkg/node/types"
	"github.com/cilium/cilium/pkg/option"
	"github.com/cilium/cilium/pkg/policy"
	"github.com/cilium/cilium/pkg/redirectpolicy"
	"github.com/cilium/cilium/pkg/service"
	"github.com/cilium/cilium/pkg/source"
	"github.com/cilium/cilium/pkg/time"
	ciliumTypes "github.com/cilium/cilium/pkg/types"
	"github.com/cilium/cilium/pkg/u8proto"
)

const podApiGroup = resources.K8sAPIGroupPodV1Core

var ciliumEndpointSyncPodLabelsControllerGroup = controller.NewGroup("sync-pod-labels-with-cilium-endpoint")

type k8sPodWatcherParams struct {
	cell.In

	K8sEventReporter *K8sEventReporter

	Clientset         k8sClient.Clientset
	Resources         agentK8s.Resources
	K8sResourceSynced *k8sSynced.Resources
	K8sAPIGroups      *k8sSynced.APIGroups
	EndpointManager   endpointmanager.EndpointManager
	PolicyUpdater     *policy.Updater
	IPCache           *ipcache.IPCache
	ServiceManager    service.ServiceManager
	DB                *statedb.DB
	NodeAddrs         statedb.Table[datapathTables.NodeAddress]
	LRPManager        *redirectpolicy.Manager
	BandwidthManager  datapath.BandwidthManager
	CGroupManager     cgroup.CGroupManager
}

func newK8sPodWatcher(params k8sPodWatcherParams) *K8sPodWatcher {
	return &K8sPodWatcher{
		clientset:             params.Clientset,
		k8sEventReporter:      params.K8sEventReporter,
		k8sResourceSynced:     params.K8sResourceSynced,
		k8sAPIGroups:          params.K8sAPIGroups,
		endpointManager:       params.EndpointManager,
		policyManager:         params.PolicyUpdater,
		svcManager:            params.ServiceManager,
		redirectPolicyManager: params.LRPManager,
		ipcache:               params.IPCache,
		cgroupManager:         params.CGroupManager,
		bandwidthManager:      params.BandwidthManager,
		resources:             params.Resources,
		db:                    params.DB,
		nodeAddrs:             params.NodeAddrs,

		controllersStarted: make(chan struct{}),
		podStoreSet:        make(chan struct{}),
	}
}

type K8sPodWatcher struct {
	clientset k8sClient.Clientset

	k8sEventReporter *K8sEventReporter

	// k8sResourceSynced maps a resource name to a channel. Once the given
	// resource name is synchronized with k8s, the channel to which that
	// resource name maps is closed.
	k8sResourceSynced *k8sSynced.Resources
	// k8sAPIGroups is a set of k8s APIs in use. They are set up in watchers,
	// and may be disabled while the agent runs.
	k8sAPIGroups          *k8sSynced.APIGroups
	endpointManager       endpointManager
	policyManager         policyManager
	svcManager            svcManager
	redirectPolicyManager redirectPolicyManager
	ipcache               ipcacheManager
	cgroupManager         cgroupManager
	bandwidthManager      datapath.BandwidthManager
	resources             agentK8s.Resources
	db                    *statedb.DB
	nodeAddrs             statedb.Table[datapathTables.NodeAddress]

	podStoreMU lock.RWMutex
	podStore   cache.Store
	// podStoreSet is a channel that is closed when the podStore variable is
	// written for the first time.
	podStoreSet  chan struct{}
	podStoreOnce sync.Once

	// controllersStarted is a channel that is closed when all watchers that do not depend on
	// local node configuration have been started
	controllersStarted chan struct{}
}

// createAllPodsController is used in the rare configurations where CiliumEndpointCRD is disabled.
// If kvstore is enabled then we fall back to watching only local pods when kvstore connects.
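// The informer below uses a resync period of 0 and dispatches add/update/delete
// events to addK8sPodV1, updateK8sPodV1 and deleteK8sPodV1 respectively, skipping
// updates where the old and new pod are deep-equal.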
func (k *K8sPodWatcher) createAllPodsController(slimClient slimclientset.Interface) (cache.Store, cache.Controller) {
	return informer.NewInformer(
		k8sUtils.ListerWatcherWithFields(
			k8sUtils.ListerWatcherFromTyped[*slim_corev1.PodList](slimClient.CoreV1().Pods("")),
			fields.Everything()),
		&slim_corev1.Pod{},
		0,
		cache.ResourceEventHandlerFuncs{
			AddFunc: func(obj interface{}) {
				if pod := informer.CastInformerEvent[slim_corev1.Pod](obj); pod != nil {
					err := k.addK8sPodV1(pod)
					k.k8sEventReporter.K8sEventProcessed(metricPod, resources.MetricCreate, err == nil)
					k.k8sEventReporter.K8sEventReceived(podApiGroup, metricPod, resources.MetricCreate, true, false)
				} else {
					k.k8sEventReporter.K8sEventReceived(podApiGroup, metricPod, resources.MetricCreate, false, false)
				}
			},
			UpdateFunc: func(oldObj, newObj interface{}) {
				if oldPod := informer.CastInformerEvent[slim_corev1.Pod](oldObj); oldPod != nil {
					if newPod := informer.CastInformerEvent[slim_corev1.Pod](newObj); newPod != nil {
						if oldPod.DeepEqual(newPod) {
							k.k8sEventReporter.K8sEventReceived(podApiGroup, metricPod, resources.MetricUpdate, false, true)
						} else {
							err := k.updateK8sPodV1(oldPod, newPod)
							k.k8sEventReporter.K8sEventProcessed(metricPod, resources.MetricUpdate, err == nil)
							k.k8sEventReporter.K8sEventReceived(podApiGroup, metricPod, resources.MetricUpdate, true, false)
						}
					}
				} else {
					k.k8sEventReporter.K8sEventReceived(podApiGroup, metricPod, resources.MetricUpdate, false, false)
				}
			},
			DeleteFunc: func(obj interface{}) {
				if pod := informer.CastInformerEvent[slim_corev1.Pod](obj); pod != nil {
					err := k.deleteK8sPodV1(pod)
					k.k8sEventReporter.K8sEventProcessed(metricPod, resources.MetricDelete, err == nil)
					k.k8sEventReporter.K8sEventReceived(podApiGroup, metricPod, resources.MetricDelete, true, false)
				} else {
					k.k8sEventReporter.K8sEventReceived(podApiGroup, metricPod, resources.MetricDelete, false, false)
				}
			},
		},
		nil,
	)
}

func (k *K8sPodWatcher) podsInit(asyncControllers *sync.WaitGroup) {
	var once sync.Once
	watchNodePods := func() context.CancelFunc {
		ctx, cancel := context.WithCancel(context.Background())
		var synced atomic.Bool
		go func() {
			pods := make(map[resource.Key]*slim_corev1.Pod)
			for ev := range k.resources.LocalPods.Events(ctx) {
				switch ev.Kind {
				case resource.Sync:
					// Set the pod store now that resource has synchronized. Only
					// error expected is if we're being stopped (context cancelled).
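					// podStoreOnce guarantees that podStoreSet is closed exactly
					// once, which unblocks callers waiting in GetCachedPod.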
					podStore, err := k.resources.LocalPods.Store(ctx)
					if err == nil {
						k.podStoreMU.Lock()
						k.podStore = podStore.CacheStore()
						k.podStoreMU.Unlock()
						k.podStoreOnce.Do(func() {
							close(k.podStoreSet)
						})
					}
					synced.Store(true)
				case resource.Upsert:
					newPod := ev.Object
					oldPod := pods[ev.Key]
					if oldPod == nil {
						k.addK8sPodV1(newPod)
					} else {
						k.updateK8sPodV1(oldPod, newPod)
					}
					k.k8sResourceSynced.SetEventTimestamp(podApiGroup)
					pods[ev.Key] = newPod
				case resource.Delete:
					k.deleteK8sPodV1(ev.Object)
					k.k8sResourceSynced.SetEventTimestamp(podApiGroup)
					delete(pods, ev.Key)
				}

				ev.Done(nil)
			}
		}()

		k.k8sResourceSynced.BlockWaitGroupToSyncResources(ctx.Done(), nil, synced.Load, resources.K8sAPIGroupPodV1Core)
		once.Do(func() {
			asyncControllers.Done()
			k.k8sAPIGroups.AddAPI(resources.K8sAPIGroupPodV1Core)
		})
		return cancel
	}

	// We will watch for pods on the entire cluster to keep existing
	// functionality untouched. If we are running with CiliumEndpoint CRD
	// enabled then it means that we can simply watch for pods that are created
	// for this node.
	if !option.Config.DisableCiliumEndpointCRD {
		watchNodePods()
		return
	}

	// If CiliumEndpointCRD is disabled, we will fall back to watching all pods.
	for {
		podStore, podController := k.createAllPodsController(k.clientset.Slim())

		isConnected := make(chan struct{})
		// once isConnected is closed, it will stop waiting on caches to be
		// synchronized.
		k.k8sResourceSynced.BlockWaitGroupToSyncResources(isConnected, nil, podController.HasSynced, resources.K8sAPIGroupPodV1Core)
		once.Do(func() {
			asyncControllers.Done()
			k.k8sAPIGroups.AddAPI(resources.K8sAPIGroupPodV1Core)
		})
		go podController.Run(isConnected)

		k.podStoreMU.Lock()
		k.podStore = podStore
		k.podStoreMU.Unlock()
		k.podStoreOnce.Do(func() {
			close(k.podStoreSet)
		})

		// Replace the pod controller by only receiving events from our own
		// node once we are connected to the kvstore.
		<-kvstore.Connected()
		close(isConnected)

		log.WithField(logfields.Node, nodeTypes.GetName()).Info("Connected to KVStore, watching for pod events on node")
		cancelWatchNodePods := watchNodePods()

		// Create a new pod controller when we are disconnected from the
		// kvstore.
		<-kvstore.Client().Disconnected()
		cancelWatchNodePods()
		log.Info("Disconnected from KVStore, watching for pod events on all nodes")
	}
}

func (k *K8sPodWatcher) addK8sPodV1(pod *slim_corev1.Pod) error {
	var err error

	logger := log.WithFields(logrus.Fields{
		logfields.K8sPodName:   pod.ObjectMeta.Name,
		logfields.K8sNamespace: pod.ObjectMeta.Namespace,
		"podIP":                pod.Status.PodIP,
		"podIPs":               pod.Status.PodIPs,
		"hostIP":               pod.Status.HostIP,
	})

	podNSName := k8sUtils.GetObjNamespaceName(&pod.ObjectMeta)

	// If ep is not nil then we have received the CNI event
	// first and the k8s event afterwards, if this happens it's
	// likely the Kube API Server is getting behind the event
	// handling.
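	// The EventLagK8s gauge below records how far the k8s pod event lags
	// behind the creation of the earliest matching local endpoint, rounded
	// to seconds.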
	if eps := k.endpointManager.GetEndpointsByPodName(podNSName); len(eps) != 0 {
		var earliestEP time.Time
		for _, ep := range eps {
			createdAt := ep.GetCreatedAt()
			if earliestEP.IsZero() || createdAt.Before(earliestEP) {
				earliestEP = createdAt
			}
		}
		timeSinceEpCreated := time.Since(earliestEP)
		if timeSinceEpCreated <= 0 {
			metrics.EventLagK8s.Set(0)
		} else {
			metrics.EventLagK8s.Set(timeSinceEpCreated.Round(time.Second).Seconds())
		}
	} else {
		// If the ep is nil then we reset to zero, otherwise
		// the previous value set is kept forever.
		metrics.EventLagK8s.Set(0)
	}
	// In Kubernetes Jobs, Pods can be left in Kubernetes until the Job
	// is deleted. If the Job is never deleted, Cilium will never receive a Pod
	// delete event, causing the IP to be left in the ipcache.
	// For this reason we should delete the ipcache entries whenever the pod
	// status is either PodFailed or PodSucceeded as it means the IP address
	// is no longer in use.
	if !k8sUtils.IsPodRunning(pod.Status) {
		err = k.deleteK8sPodV1(pod)
		return err
	}

	if pod.Spec.HostNetwork && !option.Config.EnableLocalRedirectPolicy {
		logger.Debug("Skip pod event using host networking")
		return err
	}

	podIPs := k8sUtils.ValidIPs(pod.Status)
	if len(podIPs) > 0 {
		err = k.updatePodHostData(nil, pod, nil, podIPs)

		if option.Config.EnableLocalRedirectPolicy {
			k.redirectPolicyManager.OnAddPod(pod)
		}
	}

	k.cgroupManager.OnAddPod(pod)

	if err != nil {
		logger.WithError(err).Warning("Unable to update ipcache map entry on pod add")
	}
	logger.Debug("Updated ipcache map entry on pod add")

	return err
}

func (k *K8sPodWatcher) updateK8sPodV1(oldK8sPod, newK8sPod *slim_corev1.Pod) error {
	var err error

	if oldK8sPod == nil || newK8sPod == nil {
		return err
	}

	logger := log.WithFields(logrus.Fields{
		logfields.K8sPodName:   newK8sPod.ObjectMeta.Name,
		logfields.K8sNamespace: newK8sPod.ObjectMeta.Namespace,
		"new-podIP":            newK8sPod.Status.PodIP,
		"new-podIPs":           newK8sPod.Status.PodIPs,
		"new-hostIP":           newK8sPod.Status.HostIP,
		"old-podIP":            oldK8sPod.Status.PodIP,
		"old-podIPs":           oldK8sPod.Status.PodIPs,
		"old-hostIP":           oldK8sPod.Status.HostIP,
	})

	// In Kubernetes Jobs, Pods can be left in Kubernetes until the Job
	// is deleted. If the Job is never deleted, Cilium will never receive a Pod
	// delete event, causing the IP to be left in the ipcache.
	// For this reason we should delete the ipcache entries whenever the pod
	// status is either PodFailed or PodSucceeded as it means the IP address
	// is no longer in use.
	if !k8sUtils.IsPodRunning(newK8sPod.Status) {
		err = k.deleteK8sPodV1(newK8sPod)
		return err
	}

	if newK8sPod.Spec.HostNetwork && !option.Config.EnableLocalRedirectPolicy &&
		!option.Config.EnableSocketLBTracing {
		logger.Debug("Skip pod event using host networking")
		return err
	}

	k.cgroupManager.OnUpdatePod(oldK8sPod, newK8sPod)

	oldPodIPs := k8sUtils.ValidIPs(oldK8sPod.Status)
	newPodIPs := k8sUtils.ValidIPs(newK8sPod.Status)
	if len(oldPodIPs) != 0 || len(newPodIPs) != 0 {
		err = k.updatePodHostData(oldK8sPod, newK8sPod, oldPodIPs, newPodIPs)
		if err != nil {
			logger.WithError(err).Warning("Unable to update ipcache map entry on pod update")
		}
	}

	// Check annotation updates.
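	// Only the proxy-visibility, egress-bandwidth and no-track annotations are
	// compared here; a change to any other annotation does not trigger an
	// endpoint regeneration.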
	oldAnno := oldK8sPod.ObjectMeta.Annotations
	newAnno := newK8sPod.ObjectMeta.Annotations
	annoChangedProxy := !k8s.AnnotationsEqual([]string{annotation.ProxyVisibility, annotation.ProxyVisibilityAlias}, oldAnno, newAnno)
	annoChangedBandwidth := !k8s.AnnotationsEqual([]string{bandwidth.EgressBandwidth}, oldAnno, newAnno)
	annoChangedNoTrack := !k8s.AnnotationsEqual([]string{annotation.NoTrack, annotation.NoTrackAlias}, oldAnno, newAnno)
	annotationsChanged := annoChangedProxy || annoChangedBandwidth || annoChangedNoTrack

	// Check label updates too.
	oldK8sPodLabels, _ := labelsfilter.Filter(labels.Map2Labels(oldK8sPod.ObjectMeta.Labels, labels.LabelSourceK8s))
	// old labels are stripped to avoid grandfathering in special labels
	oldPodLabels := k8sUtils.StripPodSpecialLabels(oldK8sPodLabels.K8sStringMap())

	strippedNewLabels := k8sUtils.StripPodSpecialLabels(newK8sPod.Labels)

	newK8sPodLabels, _ := labelsfilter.Filter(labels.Map2Labels(strippedNewLabels, labels.LabelSourceK8s))
	newPodLabels := newK8sPodLabels.K8sStringMap()
	labelsChanged := !maps.Equal(oldPodLabels, newPodLabels)
	uidChanged := oldK8sPod.UID != newK8sPod.UID

	// The relevant updates are: podIPs and label updates.
	// Consider a UID change the same as a label change in case the pod's
	// identity needs to be updated, see GH-30409.
	oldPodIPsSlice := k8sTypes.IPSlice(oldPodIPs)
	newPodIPsSlice := k8sTypes.IPSlice(newPodIPs)
	lrpNeedsReassign := !maps.Equal(oldPodLabels, newPodLabels) || !(&oldPodIPsSlice).DeepEqual(&newPodIPsSlice) || uidChanged

	if option.Config.EnableLocalRedirectPolicy {
		oldPodReady := k8sUtils.GetLatestPodReadiness(oldK8sPod.Status)
		newPodReady := k8sUtils.GetLatestPodReadiness(newK8sPod.Status)

		if lrpNeedsReassign || (oldPodReady != newPodReady) {
			k.redirectPolicyManager.OnUpdatePod(newK8sPod, lrpNeedsReassign, newPodReady == slim_corev1.ConditionTrue)
		}
	}

	// Nothing changed.
	if !annotationsChanged && !labelsChanged {
		log.WithFields(logrus.Fields{
			"old-labels":      oldK8sPod.GetObjectMeta().GetLabels(),
			"old-annotations": oldK8sPod.GetObjectMeta().GetAnnotations(),
			"new-labels":      newK8sPod.GetObjectMeta().GetLabels(),
			"new-annotations": newK8sPod.GetObjectMeta().GetAnnotations(),
		}).Debugf("Pod does not have any annotations nor labels changed")
		return err
	}

	podNSName := k8sUtils.GetObjNamespaceName(&newK8sPod.ObjectMeta)

	podEPs := k.endpointManager.GetEndpointsByPodName(podNSName)
	if len(podEPs) == 0 {
		log.WithField("pod", podNSName).Debugf("Endpoint not found running for the given pod")
		return err
	}

	for _, podEP := range podEPs {
		if labelsChanged || uidChanged {
			// Consider a UID change the same as a label change in case the pod's
			// identity needs to be updated, see GH-30409. Annotations are not
			// checked for because annotations don't impact identities.
			err := podEP.UpdateLabelsFrom(oldPodLabels, newPodLabels, labels.LabelSourceK8s)
			if err != nil {
				log.WithFields(logrus.Fields{
					logfields.K8sPodName:   newK8sPod.ObjectMeta.Name,
					logfields.K8sNamespace: newK8sPod.ObjectMeta.Namespace,
					logfields.EndpointID:   podEP.GetID(),
					logfields.Labels:       newPodLabels,
				}).WithError(err).Warning("Unable to update endpoint labels on pod update")
				return err
			}

			// Synchronize Pod labels with CiliumEndpoint labels if there is a change.
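			// updateCiliumEndpointLabels registers a one-shot controller on the
			// endpoint that mirrors the new Pod labels onto the CiliumEndpoint
			// object via a JSON patch (see updateCiliumEndpointLabels below).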
			updateCiliumEndpointLabels(k.clientset, podEP, newK8sPod.Labels)
		}

		if annotationsChanged {
			if annoChangedProxy {
				podEP.UpdateVisibilityPolicy(func(ns, podName string) (proxyVisibility string, err error) {
					p, err := k.GetCachedPod(ns, podName)
					if err != nil {
						return "", nil
					}
					value, _ := annotation.Get(p, annotation.ProxyVisibility, annotation.ProxyVisibilityAlias)
					return value, nil
				})
			}
			if annoChangedBandwidth {
				podEP.UpdateBandwidthPolicy(k.bandwidthManager, func(ns, podName string) (bandwidthEgress string, err error) {
					p, err := k.GetCachedPod(ns, podName)
					if err != nil {
						return "", nil
					}
					return p.ObjectMeta.Annotations[bandwidth.EgressBandwidth], nil
				})
			}
			if annoChangedNoTrack {
				podEP.UpdateNoTrackRules(func(ns, podName string) (noTrackPort string, err error) {
					p, err := k.GetCachedPod(ns, podName)
					if err != nil {
						return "", nil
					}
					value, _ := annotation.Get(p, annotation.NoTrack, annotation.NoTrackAlias)
					return value, nil
				})
			}
			realizePodAnnotationUpdate(podEP)
		}
	}

	return err
}

func realizePodAnnotationUpdate(podEP *endpoint.Endpoint) {
	regenMetadata := &regeneration.ExternalRegenerationMetadata{
		Reason:            "annotations updated",
		RegenerationLevel: regeneration.RegenerateWithoutDatapath,
	}
	// No need to log an error if the state transition didn't succeed,
	// if it didn't succeed that means the endpoint is being deleted, or
	// another regeneration has already been queued up for this endpoint.
	regen, _ := podEP.SetRegenerateStateIfAlive(regenMetadata)
	if regen {
		podEP.Regenerate(regenMetadata)
	}
}

// updateCiliumEndpointLabels runs a controller associated with the endpoint that updates
// the Labels in CiliumEndpoint object by mirroring those of the associated Pod.
func updateCiliumEndpointLabels(clientset k8sClient.Clientset, ep *endpoint.Endpoint, labels map[string]string) {
	var (
		controllerName = fmt.Sprintf("sync-pod-labels-with-cilium-endpoint (%v)", ep.GetID())
		scopedLog      = log.WithField("controller", controllerName)
	)

	// The controller is executed only once and is associated with the underlying endpoint object.
	// This is to make sure that the controller is also deleted once the endpoint is gone.
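	// The DoFunc below issues a JSON patch against the endpoint's CiliumEndpoint
	// (ep.GetK8sCEPName()); the marshalled payload follows the RFC 6902 shape
	// [{"op":"replace","path":"/metadata/labels","value":{...}}].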
	ep.UpdateController(controllerName,
		controller.ControllerParams{
			Group: ciliumEndpointSyncPodLabelsControllerGroup,
			DoFunc: func(ctx context.Context) (err error) {
				cepOwner := ep.GetCEPOwner()
				if cepOwner.IsNil() {
					err := errors.New("Skipping CiliumEndpoint update because it has no k8s pod")
					scopedLog.WithFields(logrus.Fields{
						logfields.EndpointID: ep.GetID(),
						logfields.Labels:     logfields.Repr(labels),
					}).Debug(err)
					return err
				}
				ciliumClient := clientset.CiliumV2()

				replaceLabels := []k8s.JSONPatch{
					{
						OP:    "replace",
						Path:  "/metadata/labels",
						Value: labels,
					},
				}

				labelsPatch, err := json.Marshal(replaceLabels)
				if err != nil {
					scopedLog.WithError(err).Debug("Error marshalling Pod labels")
					return err
				}

				_, err = ciliumClient.CiliumEndpoints(cepOwner.GetNamespace()).Patch(
					ctx, ep.GetK8sCEPName(),
					types.JSONPatchType,
					labelsPatch,
					meta_v1.PatchOptions{})
				if err != nil {
					scopedLog.WithError(err).Debug("Error while updating CiliumEndpoint object with new Pod labels")
					return err
				}

				scopedLog.WithFields(logrus.Fields{
					logfields.EndpointID: ep.GetID(),
					logfields.Labels:     logfields.Repr(labels),
				}).Debug("Updated CiliumEndpoint object with new Pod labels")

				return nil
			},
		})
}

func (k *K8sPodWatcher) deleteK8sPodV1(pod *slim_corev1.Pod) error {
	var err error

	logger := log.WithFields(logrus.Fields{
		logfields.K8sPodName:   pod.ObjectMeta.Name,
		logfields.K8sNamespace: pod.ObjectMeta.Namespace,
		"podIP":                pod.Status.PodIP,
		"podIPs":               pod.Status.PodIPs,
		"hostIP":               pod.Status.HostIP,
	})

	if option.Config.EnableLocalRedirectPolicy {
		k.redirectPolicyManager.OnDeletePod(pod)
	}

	k.cgroupManager.OnDeletePod(pod)

	skipped, err := k.deletePodHostData(pod)
	switch {
	case skipped:
		logger.WithError(err).Debug("Skipped ipcache map delete on pod delete")
	case err != nil:
		logger.WithError(err).Warning("Unable to delete ipcache map entry on pod delete")
	default:
		logger.Debug("Deleted ipcache map entry on pod delete")
	}
	return err
}

var (
	_netnsCookieSupported     bool
	_netnsCookieSupportedOnce sync.Once
)

func netnsCookieSupported() bool {
	_netnsCookieSupportedOnce.Do(func() {
		_netnsCookieSupported = probes.HaveProgramHelper(ebpf.CGroupSock, asm.FnGetNetnsCookie) == nil &&
			probes.HaveProgramHelper(ebpf.CGroupSockAddr, asm.FnGetNetnsCookie) == nil
	})
	return _netnsCookieSupported
}

func (k *K8sPodWatcher) genServiceMappings(pod *slim_corev1.Pod, podIPs []string, logger *logrus.Entry) []loadbalancer.SVC {
	var (
		svcs       []loadbalancer.SVC
		containers []slim_corev1.Container
	)
	containers = append(containers, pod.Spec.InitContainers...)
	containers = append(containers, pod.Spec.Containers...)
	for _, c := range containers {
		for _, p := range c.Ports {
			if p.HostPort <= 0 {
				continue
			}

			if int(p.HostPort) >= option.Config.NodePortMin &&
				int(p.HostPort) <= option.Config.NodePortMax {
				logger.Warningf("The requested hostPort %d is colliding with the configured NodePort range [%d, %d]. Ignoring.",
Ignoring.", 661 p.HostPort, option.Config.NodePortMin, option.Config.NodePortMax) 662 continue 663 } 664 665 feIP := net.ParseIP(p.HostIP) 666 if feIP != nil && feIP.IsLoopback() && !netnsCookieSupported() { 667 logger.Warningf("The requested loopback address for hostIP (%s) is not supported for kernels which don't provide netns cookies. Ignoring.", feIP) 668 continue 669 } 670 671 proto, err := loadbalancer.NewL4Type(string(p.Protocol)) 672 if err != nil { 673 continue 674 } 675 676 var bes4 []*loadbalancer.Backend 677 var bes6 []*loadbalancer.Backend 678 679 for _, podIP := range podIPs { 680 be := loadbalancer.Backend{ 681 L3n4Addr: loadbalancer.L3n4Addr{ 682 AddrCluster: cmtypes.MustParseAddrCluster(podIP), 683 L4Addr: loadbalancer.L4Addr{ 684 Protocol: proto, 685 Port: uint16(p.ContainerPort), 686 }, 687 }, 688 } 689 if be.L3n4Addr.AddrCluster.Is4() { 690 bes4 = append(bes4, &be) 691 } else { 692 bes6 = append(bes6, &be) 693 } 694 } 695 696 var nodeAddrAll []netip.Addr 697 loopbackHostport := false 698 699 // When HostIP is explicitly set, then we need to expose *only* 700 // on this address but not via other addresses. When it's not set, 701 // then expose via all local addresses. Same when the user provides 702 // an unspecified address (0.0.0.0 / [::]). 703 if feIP != nil && !feIP.IsUnspecified() { 704 // Migrate the loopback address into a 0.0.0.0 / [::] 705 // surrogate, thus internal datapath handling can be 706 // streamlined. It's not exposed for traffic from outside. 707 if feIP.IsLoopback() { 708 if feIP.To4() != nil { 709 feIP = net.IPv4zero 710 } else { 711 feIP = net.IPv6zero 712 } 713 loopbackHostport = true 714 } 715 nodeAddrAll = []netip.Addr{ip.MustAddrFromIP(feIP)} 716 } else { 717 iter := k.nodeAddrs.List(k.db.ReadTxn(), datapathTables.NodeAddressNodePortIndex.Query(true)) 718 for addr, _, ok := iter.Next(); ok; addr, _, ok = iter.Next() { 719 nodeAddrAll = append(nodeAddrAll, addr.Addr) 720 } 721 nodeAddrAll = append(nodeAddrAll, netip.IPv4Unspecified()) 722 nodeAddrAll = append(nodeAddrAll, netip.IPv6Unspecified()) 723 } 724 for _, addr := range nodeAddrAll { 725 fe := loadbalancer.L3n4AddrID{ 726 L3n4Addr: loadbalancer.L3n4Addr{ 727 AddrCluster: cmtypes.AddrClusterFrom(addr, 0), 728 L4Addr: loadbalancer.L4Addr{ 729 Protocol: proto, 730 Port: uint16(p.HostPort), 731 }, 732 Scope: loadbalancer.ScopeExternal, 733 }, 734 ID: loadbalancer.ID(0), 735 } 736 737 // We don't have the node name available here, but in any 738 // case in the BPF data path we drop any potential non-local 739 // backends anyway (which should never exist in the first 740 // place), hence we can just leave it at Cluster policy. 
				if addr.Is4() {
					if option.Config.EnableIPv4 && len(bes4) > 0 {
						svcs = append(svcs,
							loadbalancer.SVC{
								Frontend:         fe,
								Backends:         bes4,
								Type:             loadbalancer.SVCTypeHostPort,
								ExtTrafficPolicy: loadbalancer.SVCTrafficPolicyCluster,
								IntTrafficPolicy: loadbalancer.SVCTrafficPolicyCluster,
								LoopbackHostport: loopbackHostport,
							})
					}
				} else {
					if option.Config.EnableIPv6 && len(bes6) > 0 {
						svcs = append(svcs,
							loadbalancer.SVC{
								Frontend:         fe,
								Backends:         bes6,
								Type:             loadbalancer.SVCTypeHostPort,
								ExtTrafficPolicy: loadbalancer.SVCTrafficPolicyCluster,
								IntTrafficPolicy: loadbalancer.SVCTrafficPolicyCluster,
								LoopbackHostport: loopbackHostport,
							})
					}
				}
			}
		}
	}

	return svcs
}

func (k *K8sPodWatcher) upsertHostPortMapping(oldPod, newPod *slim_corev1.Pod, oldPodIPs, newPodIPs []string) error {
	if !option.Config.EnableHostPort {
		return nil
	}

	var svcsAdded []loadbalancer.L3n4Addr

	logger := log.WithFields(logrus.Fields{
		logfields.K8sPodName:   newPod.ObjectMeta.Name,
		logfields.K8sNamespace: newPod.ObjectMeta.Namespace,
		"podIPs":               newPodIPs,
		"hostIP":               newPod.Status.HostIP,
	})

	svcs := k.genServiceMappings(newPod, newPodIPs, logger)

	if oldPod != nil {
		for _, dpSvc := range svcs {
			svcsAdded = append(svcsAdded, dpSvc.Frontend.L3n4Addr)
		}

		defer func() {
			// Delete all IPs that were not added, regardless of whether the
			// insertion of the service in the LB map was successful or not,
			// because we will not receive any other event with these old IP
			// addresses.
			oldSvcs := k.genServiceMappings(oldPod, oldPodIPs, logger)

			for _, dpSvc := range oldSvcs {
				var added bool
				for _, svcsAdded := range svcsAdded {
					if dpSvc.Frontend.L3n4Addr.DeepEqual(&svcsAdded) {
						added = true
						break
					}
				}
				if !added {
					if _, err := k.svcManager.DeleteService(dpSvc.Frontend.L3n4Addr); err != nil {
						logger.WithError(err).Error("Error while deleting service in LB map")
					}
				}
			}
		}()
	}

	if len(svcs) == 0 {
		return nil
	}

	for _, dpSvc := range svcs {
		p := &loadbalancer.SVC{
			Frontend:            dpSvc.Frontend,
			Backends:            dpSvc.Backends,
			Type:                dpSvc.Type,
			ExtTrafficPolicy:    dpSvc.ExtTrafficPolicy,
			IntTrafficPolicy:    dpSvc.IntTrafficPolicy,
			HealthCheckNodePort: dpSvc.HealthCheckNodePort,
			Name: loadbalancer.ServiceName{
				Name:      fmt.Sprintf("%s/host-port/%d", newPod.ObjectMeta.Name, dpSvc.Frontend.L3n4Addr.Port),
				Namespace: newPod.ObjectMeta.Namespace,
			},
			LoopbackHostport: dpSvc.LoopbackHostport,
		}

		if _, _, err := k.svcManager.UpsertService(p); err != nil {
			if errors.Is(err, service.NewErrLocalRedirectServiceExists(p.Frontend, p.Name)) {
				logger.WithError(err).Debug("Error while inserting service in LB map")
			} else {
				logger.WithError(err).Error("Error while inserting service in LB map")
			}
			return err
		}
	}

	return nil
}

func (k *K8sPodWatcher) deleteHostPortMapping(pod *slim_corev1.Pod, podIPs []string) error {
	if !option.Config.EnableHostPort {
		return nil
	}

	logger := log.WithFields(logrus.Fields{
		logfields.K8sPodName:   pod.ObjectMeta.Name,
		logfields.K8sNamespace: pod.ObjectMeta.Namespace,
		"podIPs":               podIPs,
		"hostIP":               pod.Status.HostIP,
	})

	svcs := k.genServiceMappings(pod, podIPs, logger)
	if len(svcs) == 0 {
		return nil
	}

	for _, dpSvc := range svcs {
		svc, _ := k.svcManager.GetDeepCopyServiceByFrontend(dpSvc.Frontend.L3n4Addr)
		// Check whether the service being deleted is in fact "owned" by the pod being deleted.
		// We want to make sure that the pod being deleted is in fact the "current" backend that
		// "owns" the hostPort service. Otherwise we might break hostPort connectivity for another
		// pod which may have since claimed ownership for the same hostPort service, which was previously
		// "owned" by the pod being deleted.
		// See: https://github.com/cilium/cilium/issues/22460.
		if svc != nil && !utils.DeepEqualBackends(svc.Backends, dpSvc.Backends) {
			continue
		}

		if _, err := k.svcManager.DeleteService(dpSvc.Frontend.L3n4Addr); err != nil {
			logger.WithError(err).Error("Error while deleting service in LB map")
			return err
		}
	}

	return nil
}

func (k *K8sPodWatcher) updatePodHostData(oldPod, newPod *slim_corev1.Pod, oldPodIPs, newPodIPs k8sTypes.IPSlice) error {
	logger := log.WithFields(logrus.Fields{
		logfields.K8sPodName:   newPod.ObjectMeta.Name,
		logfields.K8sNamespace: newPod.ObjectMeta.Namespace,
	})

	if newPod.Spec.HostNetwork {
		logger.Debug("Pod is using host networking")
		return nil
	}

	var namedPortsChanged bool

	ipSliceEqual := oldPodIPs != nil && oldPodIPs.DeepEqual(&newPodIPs)

	defer func() {
		if !ipSliceEqual {
			// Delete all IPs that were not added, regardless of whether the
			// insertion of the entry in the ipcache map was successful or not,
			// because we will not receive any other event with these old IP
			// addresses.
			for _, oldPodIP := range oldPodIPs {
				var found bool
				for _, newPodIP := range newPodIPs {
					if newPodIP == oldPodIP {
						found = true
						break
					}
				}
				if !found {
					npc := k.ipcache.Delete(oldPodIP, source.Kubernetes)
					if npc {
						namedPortsChanged = true
					}
				}
			}
		}

		// This happens at most once due to k8sMeta being the same for all podIPs in this loop
		if namedPortsChanged {
			k.policyManager.TriggerPolicyUpdates(true, "Named ports added or updated")
		}
	}()

	specEqual := oldPod != nil && newPod.Spec.DeepEqual(&oldPod.Spec)
	hostIPEqual := oldPod != nil && newPod.Status.HostIP != oldPod.Status.HostIP

	// If the spec and hostIPs are the same there is no need to perform the
	// remaining operations.
	if specEqual && hostIPEqual {
		return nil
	}

	hostIP := net.ParseIP(newPod.Status.HostIP)
	if hostIP == nil {
		return fmt.Errorf("no/invalid HostIP: %s", newPod.Status.HostIP)
	}

	hostKey := node.GetEndpointEncryptKeyIndex()

	k8sMeta := &ipcache.K8sMetadata{
		Namespace: newPod.Namespace,
		PodName:   newPod.Name,
	}

	// Store Named ports, if any.
	for _, container := range newPod.Spec.Containers {
		for _, port := range container.Ports {
			if port.Name == "" {
				continue
			}
			p, err := u8proto.ParseProtocol(string(port.Protocol))
			if err != nil {
				return fmt.Errorf("ContainerPort: invalid protocol: %s", port.Protocol)
			}
			if k8sMeta.NamedPorts == nil {
				k8sMeta.NamedPorts = make(ciliumTypes.NamedPortMap)
			}
			k8sMeta.NamedPorts[port.Name] = ciliumTypes.PortProto{
				Port:  uint16(port.ContainerPort),
				Proto: uint8(p),
			}
		}
	}

	var errs []string
	for _, podIP := range newPodIPs {
		// Initial mapping of podIP <-> hostIP <-> identity. The mapping is
		// later updated once the allocator has determined the real identity.
		// If the endpoint remains unmanaged, the identity remains untouched.
		npc, err := k.ipcache.Upsert(podIP, hostIP, hostKey, k8sMeta, ipcache.Identity{
			ID:     identity.ReservedIdentityUnmanaged,
			Source: source.Kubernetes,
		})
		if npc {
			namedPortsChanged = true
		}
		if err != nil {
			// It is expected to receive an overwrite error when the existing
			// source is:
			// - KVStore, as KVStore event propagation can usually be faster
			//   than k8s event propagation.
			// - local, since cilium-agent receives events for local pods.
			// - custom resource, since Cilium CRs are slimmer and might have
			//   faster propagation than Kubernetes resources.
			if !errors.Is(err, &ipcache.ErrOverwrite{
				ExistingSrc: source.KVStore,
				NewSrc:      source.Kubernetes,
			}) && !errors.Is(err, &ipcache.ErrOverwrite{
				ExistingSrc: source.Local,
				NewSrc:      source.Kubernetes,
			}) && !errors.Is(err, &ipcache.ErrOverwrite{
				ExistingSrc: source.CustomResource,
				NewSrc:      source.Kubernetes,
			}) {
				errs = append(errs, fmt.Sprintf("ipcache entry for podIP %s: %s", podIP, err))
			}
		}
	}
	if len(errs) != 0 {
		return errors.New(strings.Join(errs, ", "))
	}

	nodeNameEqual := newPod.Spec.NodeName == nodeTypes.GetName()

	// Only upsert the HostPort mapping if the pod is on the local node
	// and the spec or the IP slice is different.
	if nodeNameEqual && (!specEqual || !ipSliceEqual) {
		err := k.upsertHostPortMapping(oldPod, newPod, oldPodIPs, newPodIPs)
		if err != nil {
			return fmt.Errorf("cannot upsert hostPort for PodIPs: %s", newPodIPs)
		}
	}

	return nil
}

func (k *K8sPodWatcher) deletePodHostData(pod *slim_corev1.Pod) (bool, error) {
	if pod.Spec.HostNetwork {
		return true, fmt.Errorf("pod is using host networking")
	}

	podIPs := k8sUtils.ValidIPs(pod.Status)
	if len(podIPs) == 0 {
		return true, nil
	}

	k.deleteHostPortMapping(pod, podIPs)

	var (
		errs    []string
		skipped bool
	)

	for _, podIP := range podIPs {
		// A small race condition exists here as deletion could occur in
		// parallel based on another event, but it doesn't matter as the
		// identity is going away.
		id, exists := k.ipcache.LookupByIP(podIP)
		if !exists {
			skipped = true
			errs = append(errs, fmt.Sprintf("identity for IP %s does not exist in cache", podIP))
			continue
		}

		if id.Source != source.Kubernetes {
			skipped = true
			errs = append(errs, fmt.Sprintf("ipcache entry for IP %s not owned by kubernetes source", podIP))
			continue
		}

		k.ipcache.DeleteOnMetadataMatch(podIP, source.Kubernetes, pod.Namespace, pod.Name)
	}

	if len(errs) != 0 {
		return skipped, errors.New(strings.Join(errs, ", "))
	}

	return skipped, nil
}

// GetCachedPod returns a pod from the local store.
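// It blocks until the pod watchers have started and the pod store has been
// populated, then returns a deep copy of the cached pod, or a NotFound error
// if no pod with the given namespace/name is present in the store.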
func (k *K8sPodWatcher) GetCachedPod(namespace, name string) (*slim_corev1.Pod, error) {
	<-k.controllersStarted
	k.k8sResourceSynced.WaitForCacheSync(resources.K8sAPIGroupPodV1Core)
	<-k.podStoreSet
	k.podStoreMU.RLock()
	defer k.podStoreMU.RUnlock()
	pName := &slim_corev1.Pod{
		ObjectMeta: slim_metav1.ObjectMeta{
			Name:      name,
			Namespace: namespace,
		},
	}
	podInterface, exists, err := k.podStore.Get(pName)
	if err != nil {
		return nil, err
	}
	if !exists {
		return nil, k8sErrors.NewNotFound(schema.GroupResource{
			Group:    "core",
			Resource: "pod",
		}, name)
	}
	return podInterface.(*slim_corev1.Pod).DeepCopy(), nil
}
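
// Illustrative usage sketch (not part of the upstream file): callers such as
// the annotation getters in updateK8sPodV1 resolve pod metadata lazily through
// GetCachedPod, roughly as follows:
//
//	getEgressBandwidth := func(ns, podName string) (string, error) {
//		p, err := k.GetCachedPod(ns, podName)
//		if err != nil {
//			// Treat a pod that is not (yet) cached as "no annotation set".
//			return "", nil
//		}
//		return p.ObjectMeta.Annotations[bandwidth.EgressBandwidth], nil
//	}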