istio.io/istio@v0.0.0-20240520182934-d79c90f27776/cni/pkg/nodeagent/informers.go (about)

     1  // Copyright Istio Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package nodeagent
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  
    21  	corev1 "k8s.io/api/core/v1"
    22  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    23  	klabels "k8s.io/apimachinery/pkg/labels"
    24  
    25  	"istio.io/istio/cni/pkg/util"
    26  	"istio.io/istio/pkg/config/constants"
    27  	"istio.io/istio/pkg/kube"
    28  	"istio.io/istio/pkg/kube/controllers"
    29  	"istio.io/istio/pkg/kube/kclient"
    30  	"istio.io/istio/pkg/monitoring"
    31  )
    32  
// Metrics recorded by the node agent's reconcile handlers.
var (
	// eventTypeTag is the "type" label; reconcilePod sets it to the string form
	// of the event kind being handled.
	eventTypeTag = monitoring.CreateLabel("type")
	// EventTotals is the "nodeagent_reconcile_events_total" sum metric. It is
	// incremented once for each pod event that reconcilePod processes.
	EventTotals  = monitoring.NewSum(
		"nodeagent_reconcile_events_total",
		"The total number of node agent reconcile events.",
	)
)
    40  
// K8sHandlers is the Kubernetes-facing surface used by the node agent. In this
// file it is implemented by *InformerHandlers; the method notes below describe
// that implementation.
type K8sHandlers interface {
	// GetPodIfAmbient looks a pod up by name and namespace and returns it only
	// if ambient redirection is enabled for it. A pod that is not eligible
	// yields (nil, nil).
	GetPodIfAmbient(podName, podNamespace string) (*corev1.Pod, error)
	// GetAmbientPods returns every known pod for which ambient redirection is
	// enabled, excluding ztunnel pods.
	GetAmbientPods() []*corev1.Pod
	// Start waits for the informer caches to sync and then begins processing
	// queued events in the background.
	Start()
}
    46  
// InformerHandlers watches pods and namespaces and funnels their events through
// a single work queue whose reconciler adds pods to, or removes them from, the
// mesh dataplane.
type InformerHandlers struct {
	// ctx is passed to dataplane calls; its Done channel stops the queue.
	ctx             context.Context
	// dataplane receives the add/remove/delete pod calls made by reconcilePod.
	dataplane       MeshDataplane
	// systemNamespace is handed to util.IsZtunnelPod to recognise ztunnel pods.
	systemNamespace string

	// queue holds pending pod and namespace events for reconcile.
	queue      controllers.Queue
	// pods is filtered by a spec.nodeName field selector in setupHandlers.
	pods       kclient.Client[*corev1.Pod]
	// namespaces is unfiltered: all namespaces are watched.
	namespaces kclient.Client[*corev1.Namespace]
}
    56  
    57  func setupHandlers(ctx context.Context, kubeClient kube.Client, dataplane MeshDataplane, systemNamespace string) *InformerHandlers {
    58  	s := &InformerHandlers{ctx: ctx, dataplane: dataplane, systemNamespace: systemNamespace}
    59  	s.queue = controllers.NewQueue("ambient",
    60  		controllers.WithGenericReconciler(s.reconcile),
    61  		controllers.WithMaxAttempts(5),
    62  	)
    63  	// We only need to handle pods on our node
    64  	s.pods = kclient.NewFiltered[*corev1.Pod](kubeClient, kclient.Filter{FieldSelector: "spec.nodeName=" + NodeName})
    65  	s.pods.AddEventHandler(controllers.FromEventHandler(func(o controllers.Event) {
    66  		s.queue.Add(o)
    67  	}))
    68  
    69  	// Namespaces could be anything though, so we watch all of those
    70  	//
    71  	// NOTE that we are requeueing namespaces here explicitly to work around
    72  	// test flakes with the fake kube client in `pkg/kube/client.go` -
    73  	// because we are using `List()` in the handler, without this requeue,
    74  	// the fake client will sometimes drop pod events leading to test flakes.
    75  	//
    76  	// WaitForCacheSync *helps*, but does not entirely fix this problem
    77  	s.namespaces = kclient.New[*corev1.Namespace](kubeClient)
    78  	s.namespaces.AddEventHandler(controllers.FromEventHandler(func(o controllers.Event) {
    79  		s.queue.Add(o)
    80  	}))
    81  
    82  	return s
    83  }
    84  
    85  func (s *InformerHandlers) GetPodIfAmbient(podName, podNamespace string) (*corev1.Pod, error) {
    86  	ns := s.namespaces.Get(podNamespace, "")
    87  	if ns == nil {
    88  		return nil, fmt.Errorf("failed to find namespace %v", ns)
    89  	}
    90  	pod := s.pods.Get(podName, podNamespace)
    91  	if util.PodRedirectionEnabled(ns, pod) {
    92  		return pod, nil
    93  	}
    94  	return nil, nil
    95  }
    96  
    97  func (s *InformerHandlers) Start() {
    98  	kube.WaitForCacheSync("informer", s.ctx.Done(), s.pods.HasSynced, s.namespaces.HasSynced)
    99  	go s.queue.Run(s.ctx.Done())
   100  }
   101  
   102  func (s *InformerHandlers) GetAmbientPods() []*corev1.Pod {
   103  	var pods []*corev1.Pod
   104  	for _, pod := range s.pods.List(metav1.NamespaceAll, klabels.Everything()) {
   105  		ns := s.namespaces.Get(pod.Namespace, "")
   106  		if ns == nil {
   107  			log.Warnf("failed to find namespace %s for pod %s", pod.Namespace, pod.Name)
   108  		}
   109  
   110  		if !util.IsZtunnelPod(s.systemNamespace, pod) && util.PodRedirectionEnabled(ns, pod) {
   111  			pods = append(pods, pod)
   112  		}
   113  	}
   114  	return pods
   115  }
   116  
   117  // EnqueueNamespace takes a Namespace and enqueues all Pod objects that make need an update
   118  // TODO it is sort of pointless/confusing/implicit to populate Old and New with the same reference here
   119  func (s *InformerHandlers) enqueueNamespace(o controllers.Object) {
   120  	namespace := o.GetName()
   121  	labels := o.GetLabels()
   122  	matchAmbient := labels[constants.DataplaneModeLabel] == constants.DataplaneModeAmbient
   123  	if matchAmbient {
   124  		log.Infof("Namespace %s is enabled in ambient mesh", namespace)
   125  	} else {
   126  		log.Infof("Namespace %s is disabled from ambient mesh", namespace)
   127  	}
   128  	for _, pod := range s.pods.List(namespace, klabels.Everything()) {
   129  		// ztunnel pods are never "added to/removed from the mesh", so do not fire
   130  		// spurious events for them to avoid triggering extra
   131  		// ztunnel node reconciliation checks.
   132  		if !util.IsZtunnelPod(s.systemNamespace, pod) {
   133  			log.Debugf("Enqueuing pod %s/%s", pod.Namespace, pod.Name)
   134  			s.queue.Add(controllers.Event{
   135  				New:   pod,
   136  				Old:   pod,
   137  				Event: controllers.EventUpdate,
   138  			})
   139  		}
   140  	}
   141  }
   142  
   143  func (s *InformerHandlers) reconcile(input any) error {
   144  	event := input.(controllers.Event)
   145  	switch event.Latest().(type) {
   146  	case *corev1.Namespace:
   147  		return s.reconcileNamespace(input)
   148  	case *corev1.Pod:
   149  		return s.reconcilePod(input)
   150  	default:
   151  		return fmt.Errorf("unexpected event type: %+v", input)
   152  	}
   153  }
   154  
   155  func (s *InformerHandlers) reconcileNamespace(input any) error {
   156  	event := input.(controllers.Event)
   157  	ns := event.Latest().(*corev1.Namespace)
   158  
   159  	switch event.Event {
   160  	case controllers.EventAdd:
   161  		log.Debugf("Namespace %s added", ns.Name)
   162  		s.enqueueNamespace(ns)
   163  
   164  	case controllers.EventUpdate:
   165  		newNs := event.New.(*corev1.Namespace)
   166  		oldNs := event.Old.(*corev1.Namespace)
   167  
   168  		if getModeLabel(oldNs.Labels) != getModeLabel(newNs.Labels) {
   169  			log.Debugf("Namespace %s updated", newNs.Name)
   170  			s.enqueueNamespace(newNs)
   171  		}
   172  	}
   173  	return nil
   174  }
   175  
   176  func getModeLabel(m map[string]string) string {
   177  	if m == nil {
   178  		return ""
   179  	}
   180  	return m[constants.DataplaneModeLabel]
   181  }
   182  
   183  func (s *InformerHandlers) reconcilePod(input any) error {
   184  	event := input.(controllers.Event)
   185  	pod := event.Latest().(*corev1.Pod)
   186  
   187  	defer EventTotals.With(eventTypeTag.Value(event.Event.String())).Increment()
   188  
   189  	switch event.Event {
   190  	case controllers.EventAdd:
   191  		// pod was added to our cache
   192  		// we get here in 2 cases:
   193  		// 1. new pod was created on our node
   194  		// 2. we were restarted and current existing pods are added to our cache
   195  
   196  		// We have no good way to distinguish between these two cases from here. But we don't need to!
   197  		// Existing pods will be handled by the dataplane using `GetAmbientPods`,
   198  		// and the initial enqueueNamespace, and new pods will be handled by the CNI.
   199  
   200  	case controllers.EventUpdate:
   201  		// For update, we just need to handle opt outs
   202  		newPod := event.New.(*corev1.Pod)
   203  		oldPod := event.Old.(*corev1.Pod)
   204  		ns := s.namespaces.Get(newPod.Namespace, "")
   205  		if ns == nil {
   206  			return fmt.Errorf("failed to find namespace %v", ns)
   207  		}
   208  		wasAnnotated := oldPod.Annotations != nil && oldPod.Annotations[constants.AmbientRedirection] == constants.AmbientRedirectionEnabled
   209  		isAnnotated := newPod.Annotations != nil && newPod.Annotations[constants.AmbientRedirection] == constants.AmbientRedirectionEnabled
   210  		shouldBeEnabled := util.PodRedirectionEnabled(ns, newPod)
   211  
   212  		// We should check the latest annotation vs desired status
   213  		changeNeeded := isAnnotated != shouldBeEnabled
   214  
   215  		log.Debugf("Pod %s events: wasAnnotated(%v), isAnnotated(%v), shouldBeEnabled(%v), changeNeeded(%v), oldPod(%+v), newPod(%+v)",
   216  			pod.Name, wasAnnotated, isAnnotated, shouldBeEnabled, changeNeeded, oldPod, newPod)
   217  		if !changeNeeded {
   218  			log.Debugf("Pod %s update event skipped, no change needed", pod.Name)
   219  			return nil
   220  		}
   221  
   222  		if !shouldBeEnabled {
   223  			log.Debugf("Pod %s no longer matches, removing from mesh", newPod.Name)
   224  			err := s.dataplane.RemovePodFromMesh(s.ctx, pod)
   225  			log.Debugf("RemovePodFromMesh(%s) returned %v", newPod.Name, err)
   226  			// we ignore errors here as we don't want this event to be retried by the queue.
   227  		} else {
   228  			// If oldpod != ready && newpod != ready, but the ambient annotation was added,
   229  			// then assume this event was generated by the CNI plugin labeling the pod on startup,
   230  			// and skip the event.
   231  			//
   232  			// This isn't perfect (someone could manually annotate an unready pod,
   233  			// then install Istio, then the pod goes ready, and we'd miss capture) - but that
   234  			// seems vanishingly unlikely
   235  			wasReady := kube.CheckPodReadyOrComplete(oldPod)
   236  			isReady := kube.CheckPodReadyOrComplete(newPod)
   237  			if wasReady != nil && isReady != nil && isAnnotated {
   238  				log.Infof("Pod %s update event skipped, added/labeled by CNI plugin", pod.Name)
   239  				return nil
   240  			}
   241  
   242  			log.Debugf("Pod %s now matches, adding to mesh", newPod.Name)
   243  			// netns == ""; at this point netns should have been added via the initial snapshot,
   244  			// or via the cni plugin. If it happens to get here before the cni plugin somehow,
   245  			// then we will just fail to add the pod to the mesh, and it will be retried later when cni plugin adds it.
   246  
   247  			// We need a pod IP - if the pod was added via the CNI plugin, that plugin told us the IPs
   248  			// for the pod. If this is a pod added via informer, the pod should have already gone thru
   249  			// the CNI plugin chain, and have a PodIP.
   250  			//
   251  			// If PodIPs exists, it is preferred, otherwise fallback to PodIP.
   252  			//
   253  			// If we get to this point and have a pod that really and truly has no IP in either of those,
   254  			// it's not routable at this point and something is wrong/we should discard this event.
   255  			podIPs := util.GetPodIPsIfPresent(pod)
   256  			if len(podIPs) == 0 {
   257  				log.Warnf("pod %s does not appear to have any assigned IPs, not capturing", pod.Name)
   258  				return nil
   259  			}
   260  
   261  			err := s.dataplane.AddPodToMesh(s.ctx, pod, podIPs, "")
   262  			log.Debugf("AddPodToMesh(%s) returned %v", newPod.Name, err)
   263  		}
   264  	case controllers.EventDelete:
   265  		// TODO: as every pod on our node will come through here, check if pod is annotated?
   266  		err := s.dataplane.DelPodFromMesh(s.ctx, pod)
   267  		log.Debugf("DelPodFromMesh(%s) returned %v", pod.Name, err)
   268  	}
   269  	return nil
   270  }