
     1  // Copyright Istio Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
//     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    15  package repair
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strings"
    22  	corev1 ""
    23  	metav1 ""
    24  	""
    26  	""
    27  	""
    28  	""
    29  	""
    30  	""
    31  )
    33  type Controller struct {
    34  	client       kube.Client
    35  	pods         kclient.Client[*corev1.Pod]
    36  	queue        controllers.Queue
    37  	cfg          config.RepairConfig
    38  	events       kclient.EventRecorder
    39  	repairedPods map[types.NamespacedName]types.UID
    40  }
    42  func NewRepairController(client kube.Client, cfg config.RepairConfig) (*Controller, error) {
    43  	c := &Controller{
    44  		cfg:          cfg,
    45  		client:       client,
    46  		events:       kclient.NewEventRecorder(client, "cni-repair"),
    47  		repairedPods: map[types.NamespacedName]types.UID{},
    48  	}
    49  	fieldSelectors := []string{}
    50  	if cfg.FieldSelectors != "" {
    51  		fieldSelectors = append(fieldSelectors, cfg.FieldSelectors)
    52  	}
    53  	// filter out pod events from different nodes
    54  	fieldSelectors = append(fieldSelectors, fmt.Sprintf("spec.nodeName=%v", cfg.NodeName))
    55  	c.pods = kclient.NewFiltered[*corev1.Pod](client, kclient.Filter{
    56  		LabelSelector: cfg.LabelSelectors,
    57  		FieldSelector: strings.Join(fieldSelectors, ","),
    58  	})
    59  	c.queue = controllers.NewQueue("repair pods",
    60  		controllers.WithReconciler(c.Reconcile),
    61  		controllers.WithMaxAttempts(5))
    62  	c.pods.AddEventHandler(controllers.ObjectHandler(c.queue.AddObject))
    64  	return c, nil
    65  }
    67  func (c *Controller) Run(stop <-chan struct{}) {
    68  	kube.WaitForCacheSync("repair controller", stop, c.pods.HasSynced)
    69  	c.queue.Run(stop)
    70  	c.pods.ShutdownHandlers()
    71  }
    73  func (c *Controller) Reconcile(key types.NamespacedName) error {
    74  	pod := c.pods.Get(key.Name, key.Namespace)
    75  	if pod == nil {
    76  		delete(c.repairedPods, key) // Ensure we do not leak
    77  		// Pod deleted, nothing to do
    78  		return nil
    79  	}
    80  	return c.ReconcilePod(pod)
    81  }
    83  func (c *Controller) ReconcilePod(pod *corev1.Pod) (err error) {
    84  	if !c.matchesFilter(pod) {
    85  		return // Skip, pod doesn't need repair
    86  	}
    87  	repairLog.Debugf("Reconciling pod %s", pod.Name)
    89  	if c.cfg.RepairPods {
    90  		return c.repairPod(pod)
    91  	} else if c.cfg.DeletePods {
    92  		return c.deleteBrokenPod(pod)
    93  	} else if c.cfg.LabelPods {
    94  		return c.labelBrokenPod(pod)
    95  	}
    96  	return nil
    97  }
    99  // repairPod actually dynamically repairs a pod. This is done by entering the pods network namespace and setting up rules.
   100  // This differs from the general CNI plugin flow, which triggers before the pod fully starts.
   101  // Additionally, we need to jump through hoops to find the network namespace.
   102  func (c *Controller) repairPod(pod *corev1.Pod) error {
   103  	m := podsRepaired.With(typeLabel.Value(repairType))
   104  	log := repairLog.WithLabels("pod", pod.Namespace+"/"+pod.Name)
   105  	key := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}
   106  	// We will get an event every time the pod changes. The repair is not instantaneous, though -- it will only recover
   107  	// once the pod restarts (in CrashLoopBackoff), which can take some time.
   108  	// We don't want to constantly try to apply the iptables rules, which is unneeded and will fail.
   109  	// Instead, we track which UIDs we repaired and skip them if already repaired.
   110  	//
   111  	// An alternative would be to write something to the Pod (status, annotation, etc).
   112  	// However, this requires elevated privileges we want to avoid
   113  	if uid, f := c.repairedPods[key]; f {
   114  		if uid == pod.UID {
   115  			log.Debugf("Skipping pod, already repaired")
   116  		} else {
   117  			// This is unexpected, bubble up to an error. Might be missing event, or invalid assumption in our code.
   118  			// Either way, we will skip.
   119  			log.Errorf("Skipping pod, already repaired with an unexpected UID %v vs %v", uid, pod.UID)
   120  		}
   121  		return nil
   122  	}
   123  	log.Infof("Repairing pod...")
   125  	// Fetch the pod's network namespace. This must run in the host process due to how the procfs /ns/net works.
   126  	// This will get a network namespace ID. This ID is scoped to the network namespace we running in.
   127  	// As such, we need to be in the host namespace: the CNI pod namespace has no relation to the users pod namespace.
   128  	netns, err := runInHost(func() (string, error) { return getPodNetNs(pod) })
   129  	if err != nil {
   130  		m.With(resultLabel.Value(resultFail)).Increment()
   131  		return fmt.Errorf("get netns: %v", err)
   132  	}
   133  	log = log.WithLabels("netns", netns)
   135  	if err := redirectRunningPod(pod, netns); err != nil {
   136  		log.Errorf("failed to setup redirection: %v", err)
   137  		m.With(resultLabel.Value(resultFail)).Increment()
   138  		return err
   139  	}
   140  	c.repairedPods[key] = pod.UID
   141  	log.Infof("pod repaired")
   142  	m.With(resultLabel.Value(resultSuccess)).Increment()
   143  	return nil
   144  }
   146  // redirectRunningPod dynamically enters the provided pod, that is already running, and programs it's networking configuration.
   147  func redirectRunningPod(pod *corev1.Pod, netns string) error {
   148  	pi := plugin.ExtractPodInfo(pod)
   149  	redirect, err := plugin.NewRedirect(pi)
   150  	if err != nil {
   151  		return fmt.Errorf("setup redirect: %v", err)
   152  	}
   153  	rulesMgr := plugin.IptablesInterceptRuleMgr()
   154  	if err := rulesMgr.Program(pod.Name, netns, redirect); err != nil {
   155  		return fmt.Errorf("program redirection: %v", err)
   156  	}
   157  	return nil
   158  }
   160  const (
   161  	ReasonDeleteBrokenPod = "DeleteBrokenPod"
   162  	ReasonLabelBrokenPod  = "LabelBrokenPod"
   163  )
   165  func (c *Controller) deleteBrokenPod(pod *corev1.Pod) error {
   166  	m := podsRepaired.With(typeLabel.Value(deleteType))
   167  	repairLog.Infof("Pod detected as broken, deleting: %s/%s", pod.Namespace, pod.Name)
   169  	// Make sure we are deleting what we think we are...
   170  	preconditions := &metav1.Preconditions{
   171  		UID:             &pod.UID,
   172  		ResourceVersion: &pod.ResourceVersion,
   173  	}
   174  	err := c.client.Kube().CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{
   175  		Preconditions: preconditions,
   176  	})
   177  	if err != nil {
   178, corev1.EventTypeWarning, ReasonDeleteBrokenPod, "pod detected as broken, but failed to delete: %v", err)
   179  		m.With(resultLabel.Value(resultFail)).Increment()
   180  		return err
   181  	}
   182, corev1.EventTypeWarning, ReasonDeleteBrokenPod, "pod detected as broken, deleted")
   183  	m.With(resultLabel.Value(resultSuccess)).Increment()
   184  	return nil
   185  }
   187  func (c *Controller) labelBrokenPod(pod *corev1.Pod) error {
   188  	// Added for safety, to make sure no healthy pods get labeled.
   189  	m := podsRepaired.With(typeLabel.Value(labelType))
   190  	repairLog.Infof("Pod detected as broken, adding label: %s/%s", pod.Namespace, pod.Name)
   192  	labels := pod.GetLabels()
   193  	if _, ok := labels[c.cfg.LabelKey]; ok {
   194  		m.With(resultLabel.Value(resultSkip)).Increment()
   195  		repairLog.Infof("Pod %s/%s already has label with key %s, skipping", pod.Namespace, pod.Name, c.cfg.LabelKey)
   196  		return nil
   197  	}
   199  	repairLog.Infof("Labeling pod %s/%s with label %s=%s", pod.Namespace, pod.Name, c.cfg.LabelKey, c.cfg.LabelValue)
   201  	patchBytes := fmt.Sprintf(`{"metadata":{"labels":{%q:%q}}}`, c.cfg.LabelKey, c.cfg.LabelValue)
   202  	// Both "pods" and "pods/status" can mutate the metadata. However, pods/status is lower privilege, so we use that instead.
   203  	_, err := c.client.Kube().CoreV1().Pods(pod.Namespace).Patch(context.Background(), pod.Name, types.MergePatchType,
   204  		[]byte(patchBytes), metav1.PatchOptions{}, "status")
   205  	if err != nil {
   206  		repairLog.Errorf("Failed to update pod: %s", err)
   207, corev1.EventTypeWarning, ReasonLabelBrokenPod, "pod detected as broken, but failed to label: %v", err)
   208  		m.With(resultLabel.Value(resultFail)).Increment()
   209  		return err
   210  	}
   211, corev1.EventTypeWarning, ReasonLabelBrokenPod, "pod detected as broken, labeled")
   212  	m.With(resultLabel.Value(resultSuccess)).Increment()
   213  	return nil
   214  }
   216  // MatchesFilter returns true if the pod matches the repair filter criteria
   217  func (c *Controller) matchesFilter(pod *corev1.Pod) bool {
   218  	// Helper function; checks that a container's termination message matches filter
   219  	matchTerminationMessage := func(state *corev1.ContainerStateTerminated) bool {
   220  		// If we are filtering on init container termination message and the termination message of 'state' does not match, exit
   221  		trimmedTerminationMessage := strings.TrimSpace(c.cfg.InitTerminationMsg)
   222  		return trimmedTerminationMessage == "" || trimmedTerminationMessage == strings.TrimSpace(state.Message)
   223  	}
   224  	// Helper function; checks that container exit code matches filter
   225  	matchExitCode := func(state *corev1.ContainerStateTerminated) bool {
   226  		// If we are filtering on init container exit code and the termination message does not match, exit
   227  		if ec := c.cfg.InitExitCode; ec == 0 || ec == int(state.ExitCode) {
   228  			return true
   229  		}
   230  		return false
   231  	}
   233  	// Only check pods that have the sidecar annotation; the rest can be
   234  	// ignored.
   235  	if c.cfg.SidecarAnnotation != "" {
   236  		if _, ok := pod.ObjectMeta.Annotations[c.cfg.SidecarAnnotation]; !ok {
   237  			return false
   238  		}
   239  	}
   241  	// For each candidate pod, iterate across all init containers searching for
   242  	// crashlooping init containers that match our criteria
   243  	for _, container := range pod.Status.InitContainerStatuses {
   244  		// Skip the container if the InitContainerName is not a match and our
   245  		// InitContainerName filter is non-empty.
   246  		if c.cfg.InitContainerName != "" && container.Name != c.cfg.InitContainerName {
   247  			continue
   248  		}
   250  		// For safety, check the containers *current* status. If the container
   251  		// successfully exited, we NEVER want to identify this pod as broken.
   252  		// If the pod is going to fail, the failure state will show up in
   253  		// LastTerminationState eventually.
   254  		if state := container.State.Terminated; state != nil {
   255  			if state.Reason == "Completed" || state.ExitCode == 0 {
   256  				continue
   257  			}
   258  		}
   260  		// Check the LastTerminationState struct for information about why the container
   261  		// last exited. If a pod is using the CNI configuration check init container,
   262  		// it will start crashlooping and populate this struct.
   263  		if state := container.LastTerminationState.Terminated; state != nil {
   264  			// Verify the container state matches our filter criteria
   265  			if matchTerminationMessage(state) && matchExitCode(state) {
   266  				return true
   267  			}
   268  		}
   269  	}
   270  	return false
   271  }