istio.io/istio@v0.0.0-20240520182934-d79c90f27776/cni/pkg/repair/repaircontroller.go (about) 1 // Copyright Istio Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package repair 16 17 import ( 18 "context" 19 "fmt" 20 "strings" 21 22 corev1 "k8s.io/api/core/v1" 23 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 24 "k8s.io/apimachinery/pkg/types" 25 26 "istio.io/istio/cni/pkg/config" 27 "istio.io/istio/cni/pkg/plugin" 28 "istio.io/istio/pkg/kube" 29 "istio.io/istio/pkg/kube/controllers" 30 "istio.io/istio/pkg/kube/kclient" 31 ) 32 33 type Controller struct { 34 client kube.Client 35 pods kclient.Client[*corev1.Pod] 36 queue controllers.Queue 37 cfg config.RepairConfig 38 events kclient.EventRecorder 39 repairedPods map[types.NamespacedName]types.UID 40 } 41 42 func NewRepairController(client kube.Client, cfg config.RepairConfig) (*Controller, error) { 43 c := &Controller{ 44 cfg: cfg, 45 client: client, 46 events: kclient.NewEventRecorder(client, "cni-repair"), 47 repairedPods: map[types.NamespacedName]types.UID{}, 48 } 49 fieldSelectors := []string{} 50 if cfg.FieldSelectors != "" { 51 fieldSelectors = append(fieldSelectors, cfg.FieldSelectors) 52 } 53 // filter out pod events from different nodes 54 fieldSelectors = append(fieldSelectors, fmt.Sprintf("spec.nodeName=%v", cfg.NodeName)) 55 c.pods = kclient.NewFiltered[*corev1.Pod](client, kclient.Filter{ 56 LabelSelector: cfg.LabelSelectors, 57 FieldSelector: strings.Join(fieldSelectors, ","), 58 }) 59 c.queue = controllers.NewQueue("repair pods", 60 controllers.WithReconciler(c.Reconcile), 61 controllers.WithMaxAttempts(5)) 62 c.pods.AddEventHandler(controllers.ObjectHandler(c.queue.AddObject)) 63 64 return c, nil 65 } 66 67 func (c *Controller) Run(stop <-chan struct{}) { 68 kube.WaitForCacheSync("repair controller", stop, c.pods.HasSynced) 69 c.queue.Run(stop) 70 c.pods.ShutdownHandlers() 71 } 72 73 func (c *Controller) Reconcile(key types.NamespacedName) error { 74 pod := c.pods.Get(key.Name, key.Namespace) 75 if pod == nil { 76 delete(c.repairedPods, key) // Ensure we do not leak 77 // Pod deleted, nothing to do 78 return nil 79 } 80 return c.ReconcilePod(pod) 81 } 82 83 func (c *Controller) ReconcilePod(pod *corev1.Pod) (err error) { 84 if !c.matchesFilter(pod) { 85 return // Skip, pod doesn't need repair 86 } 87 repairLog.Debugf("Reconciling pod %s", pod.Name) 88 89 if c.cfg.RepairPods { 90 return c.repairPod(pod) 91 } else if c.cfg.DeletePods { 92 return c.deleteBrokenPod(pod) 93 } else if c.cfg.LabelPods { 94 return c.labelBrokenPod(pod) 95 } 96 return nil 97 } 98 99 // repairPod actually dynamically repairs a pod. This is done by entering the pods network namespace and setting up rules. 100 // This differs from the general CNI plugin flow, which triggers before the pod fully starts. 101 // Additionally, we need to jump through hoops to find the network namespace. 102 func (c *Controller) repairPod(pod *corev1.Pod) error { 103 m := podsRepaired.With(typeLabel.Value(repairType)) 104 log := repairLog.WithLabels("pod", pod.Namespace+"/"+pod.Name) 105 key := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} 106 // We will get an event every time the pod changes. The repair is not instantaneous, though -- it will only recover 107 // once the pod restarts (in CrashLoopBackoff), which can take some time. 108 // We don't want to constantly try to apply the iptables rules, which is unneeded and will fail. 109 // Instead, we track which UIDs we repaired and skip them if already repaired. 110 // 111 // An alternative would be to write something to the Pod (status, annotation, etc). 112 // However, this requires elevated privileges we want to avoid 113 if uid, f := c.repairedPods[key]; f { 114 if uid == pod.UID { 115 log.Debugf("Skipping pod, already repaired") 116 } else { 117 // This is unexpected, bubble up to an error. Might be missing event, or invalid assumption in our code. 118 // Either way, we will skip. 119 log.Errorf("Skipping pod, already repaired with an unexpected UID %v vs %v", uid, pod.UID) 120 } 121 return nil 122 } 123 log.Infof("Repairing pod...") 124 125 // Fetch the pod's network namespace. This must run in the host process due to how the procfs /ns/net works. 126 // This will get a network namespace ID. This ID is scoped to the network namespace we running in. 127 // As such, we need to be in the host namespace: the CNI pod namespace has no relation to the users pod namespace. 128 netns, err := runInHost(func() (string, error) { return getPodNetNs(pod) }) 129 if err != nil { 130 m.With(resultLabel.Value(resultFail)).Increment() 131 return fmt.Errorf("get netns: %v", err) 132 } 133 log = log.WithLabels("netns", netns) 134 135 if err := redirectRunningPod(pod, netns); err != nil { 136 log.Errorf("failed to setup redirection: %v", err) 137 m.With(resultLabel.Value(resultFail)).Increment() 138 return err 139 } 140 c.repairedPods[key] = pod.UID 141 log.Infof("pod repaired") 142 m.With(resultLabel.Value(resultSuccess)).Increment() 143 return nil 144 } 145 146 // redirectRunningPod dynamically enters the provided pod, that is already running, and programs it's networking configuration. 147 func redirectRunningPod(pod *corev1.Pod, netns string) error { 148 pi := plugin.ExtractPodInfo(pod) 149 redirect, err := plugin.NewRedirect(pi) 150 if err != nil { 151 return fmt.Errorf("setup redirect: %v", err) 152 } 153 rulesMgr := plugin.IptablesInterceptRuleMgr() 154 if err := rulesMgr.Program(pod.Name, netns, redirect); err != nil { 155 return fmt.Errorf("program redirection: %v", err) 156 } 157 return nil 158 } 159 160 const ( 161 ReasonDeleteBrokenPod = "DeleteBrokenPod" 162 ReasonLabelBrokenPod = "LabelBrokenPod" 163 ) 164 165 func (c *Controller) deleteBrokenPod(pod *corev1.Pod) error { 166 m := podsRepaired.With(typeLabel.Value(deleteType)) 167 repairLog.Infof("Pod detected as broken, deleting: %s/%s", pod.Namespace, pod.Name) 168 169 // Make sure we are deleting what we think we are... 170 preconditions := &metav1.Preconditions{ 171 UID: &pod.UID, 172 ResourceVersion: &pod.ResourceVersion, 173 } 174 err := c.client.Kube().CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{ 175 Preconditions: preconditions, 176 }) 177 if err != nil { 178 c.events.Write(pod, corev1.EventTypeWarning, ReasonDeleteBrokenPod, "pod detected as broken, but failed to delete: %v", err) 179 m.With(resultLabel.Value(resultFail)).Increment() 180 return err 181 } 182 c.events.Write(pod, corev1.EventTypeWarning, ReasonDeleteBrokenPod, "pod detected as broken, deleted") 183 m.With(resultLabel.Value(resultSuccess)).Increment() 184 return nil 185 } 186 187 func (c *Controller) labelBrokenPod(pod *corev1.Pod) error { 188 // Added for safety, to make sure no healthy pods get labeled. 189 m := podsRepaired.With(typeLabel.Value(labelType)) 190 repairLog.Infof("Pod detected as broken, adding label: %s/%s", pod.Namespace, pod.Name) 191 192 labels := pod.GetLabels() 193 if _, ok := labels[c.cfg.LabelKey]; ok { 194 m.With(resultLabel.Value(resultSkip)).Increment() 195 repairLog.Infof("Pod %s/%s already has label with key %s, skipping", pod.Namespace, pod.Name, c.cfg.LabelKey) 196 return nil 197 } 198 199 repairLog.Infof("Labeling pod %s/%s with label %s=%s", pod.Namespace, pod.Name, c.cfg.LabelKey, c.cfg.LabelValue) 200 201 patchBytes := fmt.Sprintf(`{"metadata":{"labels":{%q:%q}}}`, c.cfg.LabelKey, c.cfg.LabelValue) 202 // Both "pods" and "pods/status" can mutate the metadata. However, pods/status is lower privilege, so we use that instead. 203 _, err := c.client.Kube().CoreV1().Pods(pod.Namespace).Patch(context.Background(), pod.Name, types.MergePatchType, 204 []byte(patchBytes), metav1.PatchOptions{}, "status") 205 if err != nil { 206 repairLog.Errorf("Failed to update pod: %s", err) 207 c.events.Write(pod, corev1.EventTypeWarning, ReasonLabelBrokenPod, "pod detected as broken, but failed to label: %v", err) 208 m.With(resultLabel.Value(resultFail)).Increment() 209 return err 210 } 211 c.events.Write(pod, corev1.EventTypeWarning, ReasonLabelBrokenPod, "pod detected as broken, labeled") 212 m.With(resultLabel.Value(resultSuccess)).Increment() 213 return nil 214 } 215 216 // MatchesFilter returns true if the pod matches the repair filter criteria 217 func (c *Controller) matchesFilter(pod *corev1.Pod) bool { 218 // Helper function; checks that a container's termination message matches filter 219 matchTerminationMessage := func(state *corev1.ContainerStateTerminated) bool { 220 // If we are filtering on init container termination message and the termination message of 'state' does not match, exit 221 trimmedTerminationMessage := strings.TrimSpace(c.cfg.InitTerminationMsg) 222 return trimmedTerminationMessage == "" || trimmedTerminationMessage == strings.TrimSpace(state.Message) 223 } 224 // Helper function; checks that container exit code matches filter 225 matchExitCode := func(state *corev1.ContainerStateTerminated) bool { 226 // If we are filtering on init container exit code and the termination message does not match, exit 227 if ec := c.cfg.InitExitCode; ec == 0 || ec == int(state.ExitCode) { 228 return true 229 } 230 return false 231 } 232 233 // Only check pods that have the sidecar annotation; the rest can be 234 // ignored. 235 if c.cfg.SidecarAnnotation != "" { 236 if _, ok := pod.ObjectMeta.Annotations[c.cfg.SidecarAnnotation]; !ok { 237 return false 238 } 239 } 240 241 // For each candidate pod, iterate across all init containers searching for 242 // crashlooping init containers that match our criteria 243 for _, container := range pod.Status.InitContainerStatuses { 244 // Skip the container if the InitContainerName is not a match and our 245 // InitContainerName filter is non-empty. 246 if c.cfg.InitContainerName != "" && container.Name != c.cfg.InitContainerName { 247 continue 248 } 249 250 // For safety, check the containers *current* status. If the container 251 // successfully exited, we NEVER want to identify this pod as broken. 252 // If the pod is going to fail, the failure state will show up in 253 // LastTerminationState eventually. 254 if state := container.State.Terminated; state != nil { 255 if state.Reason == "Completed" || state.ExitCode == 0 { 256 continue 257 } 258 } 259 260 // Check the LastTerminationState struct for information about why the container 261 // last exited. If a pod is using the CNI configuration check init container, 262 // it will start crashlooping and populate this struct. 263 if state := container.LastTerminationState.Terminated; state != nil { 264 // Verify the container state matches our filter criteria 265 if matchTerminationMessage(state) && matchExitCode(state) { 266 return true 267 } 268 } 269 } 270 return false 271 }