github.com/aporeto-inc/trireme-lib@v10.358.0+incompatible/monitor/internal/pod/controller.go

// +build linux !windows

package podmonitor

import (
	"context"
	errs "errors"
	"time"

	"k8s.io/client-go/tools/record"

	"go.aporeto.io/trireme-lib/common"
	"go.aporeto.io/trireme-lib/monitor/config"
	"go.aporeto.io/trireme-lib/monitor/extractors"
	"go.aporeto.io/trireme-lib/policy"
	"go.uber.org/zap"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"

	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/source"
)

var (
	// ErrHandlePUStartEventFailed is the error sent back if a start event fails
	ErrHandlePUStartEventFailed = errs.New("Aporeto Enforcer start event failed")

	// ErrNetnsExtractionMissing is the error sent back when we are missing a PID or netns path after successful metadata extraction
	ErrNetnsExtractionMissing = errs.New("Aporeto Enforcer missed to extract PID or netns path")

	// ErrHandlePUStopEventFailed is the error sent back if a stop event fails
	ErrHandlePUStopEventFailed = errs.New("Aporeto Enforcer stop event failed")

	// ErrHandlePUDestroyEventFailed is the error sent back if a destroy event fails
	ErrHandlePUDestroyEventFailed = errs.New("Aporeto Enforcer destroy event failed")
)

// newReconciler returns a new reconcile.Reconciler
func newReconciler(mgr manager.Manager, handler *config.ProcessorConfig, metadataExtractor extractors.PodMetadataExtractor, netclsProgrammer extractors.PodNetclsProgrammer, sandboxExtractor extractors.PodSandboxExtractor, nodeName string, enableHostPods bool, deleteCh chan<- DeleteEvent, deleteReconcileCh chan<- struct{}, resyncInfo *ResyncInfoChan) *ReconcilePod {
	return &ReconcilePod{
		client:            mgr.GetClient(),
		recorder:          mgr.GetRecorder("trireme-pod-controller"),
		handler:           handler,
		metadataExtractor: metadataExtractor,
		netclsProgrammer:  netclsProgrammer,
		sandboxExtractor:  sandboxExtractor,
		nodeName:          nodeName,
		enableHostPods:    enableHostPods,
		deleteCh:          deleteCh,
		deleteReconcileCh: deleteReconcileCh,
		resyncInfo:        resyncInfo,

		// TODO: should move into configuration
		handlePUEventTimeout:   60 * time.Second,
		metadataExtractTimeout: 10 * time.Second,
		netclsProgramTimeout:   10 * time.Second,
	}
}

// addController adds a new Controller to mgr with r as the reconcile.Reconciler
func addController(mgr manager.Manager, r *ReconcilePod, workers int, eventsCh <-chan event.GenericEvent) error {
	// Create a new controller
	c, err := controller.New("trireme-pod-controller", mgr, controller.Options{
		Reconciler:              r,
		MaxConcurrentReconciles: workers,
	})
	if err != nil {
		return err
	}

	// we use this mapper in both of our event sources
	mapper := &WatchPodMapper{
		client:         mgr.GetClient(),
		nodeName:       r.nodeName,
		enableHostPods: r.enableHostPods,
	}

	// use our watch pod mapper which filters pods before we reconcile
	if err := c.Watch(
		&source.Kind{Type: &corev1.Pod{}},
		&handler.EnqueueRequestsFromMapFunc{ToRequests: mapper},
	); err != nil {
		return err
	}

	// we pass in a custom channel for events generated by resync
	return c.Watch(
		&source.Channel{Source: eventsCh},
		&handler.EnqueueRequestsFromMapFunc{ToRequests: mapper},
	)
}
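
// setupPodController is a hypothetical sketch of how the reconciler and controller
// above might be wired into a controller-runtime manager; the function name, the
// channel buffer sizes, and the worker count are illustrative assumptions only,
// and the real wiring lives elsewhere in this package.
func setupPodController(
	mgr manager.Manager,
	pc *config.ProcessorConfig,
	metadataExtractor extractors.PodMetadataExtractor,
	netclsProgrammer extractors.PodNetclsProgrammer,
	sandboxExtractor extractors.PodSandboxExtractor,
	nodeName string,
	enableHostPods bool,
	resyncInfo *ResyncInfoChan,
) (chan event.GenericEvent, error) {
	// buffered channels so that Reconcile does not block while the delete event
	// loop is busy (sizes are assumptions for this sketch)
	deleteCh := make(chan DeleteEvent, 1000)
	deleteReconcileCh := make(chan struct{}, 1000)
	eventsCh := make(chan event.GenericEvent, 1000)

	r := newReconciler(mgr, pc, metadataExtractor, netclsProgrammer, sandboxExtractor,
		nodeName, enableHostPods, deleteCh, deleteReconcileCh, resyncInfo)

	// four concurrent reconcile workers is an arbitrary illustrative choice
	if err := addController(mgr, r, 4, eventsCh); err != nil {
		return nil, err
	}

	// NOTE: a consumer for deleteCh and deleteReconcileCh (the delete event loop)
	// must be started as well, otherwise the sends in Reconcile will eventually block.
	return eventsCh, nil
}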

var _ reconcile.Reconciler = &ReconcilePod{}

// DeleteEvent is used to send delete events to our event loop which will watch
// them for real deletion in the Kubernetes API. Once an object is gone, we will
// send down destroy events to trireme.
type DeleteEvent struct {
	PodUID        string
	SandboxID     string
	NamespaceName client.ObjectKey
}
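
// drainDeleteEvents is a hypothetical sketch of the consumer side of deleteCh:
// the delete event loop remembers which PUs it has seen so that it can later check
// the Kubernetes API and send destroy events once the pod object is truly gone.
// The actual logic lives in the delete controller of this package; this function,
// its name, and the map-based bookkeeping only illustrate the flow.
func drainDeleteEvents(ctx context.Context, ch <-chan DeleteEvent, seen map[string]DeleteEvent) {
	for {
		select {
		case <-ctx.Done():
			return
		case ev, ok := <-ch:
			if !ok {
				return
			}
			// remember the latest sandbox/namespace mapping for this PU UID; the real
			// delete controller later resolves ev.NamespaceName against the API server
			// and fires a destroy event if the object no longer exists.
			seen[ev.PodUID] = ev
		}
	}
}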

// ReconcilePod reconciles a Pod object
type ReconcilePod struct {
	// This client, initialized using mgr.GetClient() above, is a split client
	// that reads objects from the cache and writes to the apiserver
	client            client.Client
	recorder          record.EventRecorder
	handler           *config.ProcessorConfig
	metadataExtractor extractors.PodMetadataExtractor
	netclsProgrammer  extractors.PodNetclsProgrammer
	sandboxExtractor  extractors.PodSandboxExtractor
	nodeName          string
	enableHostPods    bool
	deleteCh          chan<- DeleteEvent
	deleteReconcileCh chan<- struct{}
	resyncInfo        *ResyncInfoChan

	metadataExtractTimeout time.Duration
	handlePUEventTimeout   time.Duration
	netclsProgramTimeout   time.Duration
}

func (r *ReconcilePod) resyncHelper(nn string) {
	if r.resyncInfo != nil {
		r.resyncInfo.SendInfo(nn)
	}
}

// Reconcile reads the state of the cluster for a pod object
func (r *ReconcilePod) Reconcile(request reconcile.Request) (reconcile.Result, error) {
	ctx := context.Background()
	nn := request.NamespacedName.String()

	// we do this very early on:
	// whatever happens to the processing of this pod event, we are telling the resync handler
	// that we have seen it. Even if we have not sent an event to the policy engine,
	// it means that most likely we are okay for an existing PU to be deleted first
	defer r.resyncHelper(nn)

	var puID, sandboxID string
	var err error

	// Fetch the corresponding pod object.
	pod := &corev1.Pod{}
	if err := r.client.Get(ctx, request.NamespacedName, pod); err != nil {
		if errors.IsNotFound(err) {
			r.deleteReconcileCh <- struct{}{}
			return reconcile.Result{}, nil
		}
		// Otherwise, we retry.
		return reconcile.Result{}, err
	}

	sandboxID, err = r.sandboxExtractor(ctx, pod)
	if err != nil {
		// Do nothing if we can't find the sandboxID
		zap.L().Debug("Pod reconcile: Cannot extract the SandboxID for ", zap.String("podname: ", nn))
	}
	puID = string(pod.GetUID())

	// abort immediately if this is a HostNetwork pod, but we don't want to activate them
	// NOTE: this is already done in the mapper, however, this additional check does not hurt
	if pod.Spec.HostNetwork && !r.enableHostPods {
		zap.L().Debug("Pod is a HostNetwork pod, but enableHostPods is false", zap.String("puID", puID), zap.String("namespacedName", nn))
		return reconcile.Result{}, nil
	}

	// it looks like we can miss events for all sorts of unknown reasons;
	// if we reconcile and the pod exists, we definitely know
	// that it must go away at some point, so always register it with the delete controller
	r.deleteCh <- DeleteEvent{
		PodUID:        puID,
		SandboxID:     sandboxID,
		NamespaceName: request.NamespacedName,
	}

	// try to find out if any of the containers have been started yet;
	// this is static information on the pod, we don't need to care about the phase for determining that
	// NOTE: this is important because InitContainers are started during the PodPending phase which is
	// what we need to rely on for activation as early as possible
	var started bool
	for _, status := range pod.Status.InitContainerStatuses {
		if status.State.Running != nil {
			started = true
			break
		}
	}
	if !started {
		for _, status := range pod.Status.ContainerStatuses {
			if status.State.Running != nil {
				started = true
				break
			}
		}
	}

	switch pod.Status.Phase {
	case corev1.PodPending:
		fallthrough
	case corev1.PodRunning:
		zap.L().Debug("PodPending / PodRunning", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Bool("anyContainerStarted", started))

		// now try to do the metadata extraction
		extractCtx, extractCancel := context.WithTimeout(ctx, r.metadataExtractTimeout)
		defer extractCancel()
		puRuntime, err := r.metadataExtractor(extractCtx, pod, started)
		if err != nil {
			zap.L().Error("failed to extract metadata", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
			r.recorder.Eventf(pod, "Warning", "PUExtractMetadata", "PU '%s' failed to extract metadata: %s", puID, err.Error())
			return reconcile.Result{}, err
		}

		// now create/update the PU
		// every HandlePUEvent call gets done in this context
		handlePUCtx, handlePUCancel := context.WithTimeout(ctx, r.handlePUEventTimeout)
		defer handlePUCancel()
		if err := r.handler.Policy.HandlePUEvent(
			handlePUCtx,
			puID,
			common.EventUpdate,
			puRuntime,
		); err != nil {
			zap.L().Error("failed to handle update event", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
			r.recorder.Eventf(pod, "Warning", "PUUpdate", "failed to handle update event for PU '%s': %s", puID, err.Error())
			// return reconcile.Result{}, err
		} else {
			r.recorder.Eventf(pod, "Normal", "PUUpdate", "PU '%s' updated successfully", puID)
		}

		// NOTE: a terminating pod is going to reconcile in the PodRunning phase as well;
		// however, it will have its deletion timestamp set, which is an indicator for us that it is
		// shutting down. That means we don't have to start anything anymore. We can safely stop
		// the PU once the phase is PodSucceeded/PodFailed. However, we already sent an update event above
		// which included new tags from the metadata extractor.
		if pod.DeletionTimestamp != nil {
			return reconcile.Result{}, nil
		}

		// If the pod hasn't started or if there is no sandbox present, requeue.
		if sandboxID == "" || !started {
			return reconcile.Result{Requeue: true}, nil
		}

		if started {
			// if the metadata extractor is missing the PID or nspath, we need to try again;
			// we need it for starting the PU. However, only require this if we are not in host network mode.
			// NOTE: this can happen for example if the containers are not in a running state on their own
			if !pod.Spec.HostNetwork && len(puRuntime.NSPath()) == 0 && puRuntime.Pid() == 0 {
				zap.L().Error("Kubernetes thinks a container is running, however, we failed to extract a PID or NSPath with the metadata extractor. Requeueing...", zap.String("puID", puID), zap.String("namespacedName", nn))
				r.recorder.Eventf(pod, "Warning", "PUStart", "PU '%s' failed to extract netns", puID)
				return reconcile.Result{}, ErrNetnsExtractionMissing
			}

			// now start the PU
			// every HandlePUEvent call gets done in this context
			handlePUStartCtx, handlePUStartCancel := context.WithTimeout(ctx, r.handlePUEventTimeout)
			defer handlePUStartCancel()
			if err := r.handler.Policy.HandlePUEvent(
				handlePUStartCtx,
				puID,
				common.EventStart,
				puRuntime,
			); err != nil {
				if policy.IsErrPUAlreadyActivated(err) {
					// abort early if this PU has already been activated before
					zap.L().Debug("PU has already been activated", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
				} else {
					zap.L().Error("failed to handle start event", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
					r.recorder.Eventf(pod, "Warning", "PUStart", "PU '%s' failed to start: %s", puID, err.Error())
				}
			} else {
				r.recorder.Eventf(pod, "Normal", "PUStart", "PU '%s' started successfully", puID)
			}

			// if this is a host network pod, we need to program the net_cls cgroup
			if pod.Spec.HostNetwork {
				netclsProgramCtx, netclsProgramCancel := context.WithTimeout(ctx, r.netclsProgramTimeout)
				defer netclsProgramCancel()
				if err := r.netclsProgrammer(netclsProgramCtx, pod, puRuntime); err != nil {
					if extractors.IsErrNetclsAlreadyProgrammed(err) {
						zap.L().Debug("net_cls cgroup has already been programmed previously", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
					} else if extractors.IsErrNoHostNetworkPod(err) {
						zap.L().Error("net_cls cgroup programmer told us that this is no host network pod.", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
					} else {
						zap.L().Error("failed to program net_cls cgroup of pod", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
						r.recorder.Eventf(pod, "Warning", "PUStart", "Host Network PU '%s' failed to program its net_cls cgroups: %s", puID, err.Error())
						return reconcile.Result{}, err
					}
				} else {
					zap.L().Debug("net_cls cgroup has been successfully programmed for trireme", zap.String("puID", puID), zap.String("namespacedName", nn))
					r.recorder.Eventf(pod, "Normal", "PUStart", "Host Network PU '%s' has successfully programmed its net_cls cgroups", puID)
				}
			}
		}
		return reconcile.Result{}, nil

	case corev1.PodSucceeded:
		fallthrough
	case corev1.PodFailed:
		zap.L().Debug("PodSucceeded / PodFailed", zap.String("puID", puID), zap.String("namespacedName", nn))

		// do metadata extraction regardless of the pod being stopped
		//
		// there is the edge case that the enforcer is starting up and we encounter the pod for the first time
		// in a stopped state, so we have to do metadata extraction here as well
		extractCtx, extractCancel := context.WithTimeout(ctx, r.metadataExtractTimeout)
		defer extractCancel()
		puRuntime, err := r.metadataExtractor(extractCtx, pod, started)
		if err != nil {
			zap.L().Error("failed to extract metadata", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
			r.recorder.Eventf(pod, "Warning", "PUExtractMetadata", "PU '%s' failed to extract metadata: %s", puID, err.Error())
			return reconcile.Result{}, err
		}

		// every HandlePUEvent call gets done in this context
		handlePUCtx, handlePUCancel := context.WithTimeout(ctx, r.handlePUEventTimeout)
		defer handlePUCancel()

		if err := r.handler.Policy.HandlePUEvent(
			handlePUCtx,
			puID,
			common.EventUpdate,
			puRuntime,
		); err != nil {
			zap.L().Error("failed to handle update event", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
			r.recorder.Eventf(pod, "Warning", "PUUpdate", "failed to handle update event for PU '%s': %s", puID, err.Error())
			// return reconcile.Result{}, err
		} else {
			r.recorder.Eventf(pod, "Normal", "PUUpdate", "PU '%s' updated successfully", puID)
		}

		if err := r.handler.Policy.HandlePUEvent(
			handlePUCtx,
			puID,
			common.EventStop,
			puRuntime,
		); err != nil {
			zap.L().Error("failed to handle stop event", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
			r.recorder.Eventf(pod, "Warning", "PUStop", "PU '%s' failed to stop: %s", puID, err.Error())
		} else {
			r.recorder.Eventf(pod, "Normal", "PUStop", "PU '%s' has been successfully stopped", puID)
		}

		// we don't need to reconcile again,
		// sending the stop event is enough
		return reconcile.Result{}, nil

	case corev1.PodUnknown:
		zap.L().Error("pod is in unknown state", zap.String("puID", puID), zap.String("namespacedName", nn))

		// we don't need to retry, there is nothing *we* can do about it to fix this
		return reconcile.Result{}, nil

	default:
		zap.L().Error("unknown pod phase", zap.String("puID", puID), zap.String("namespacedName", nn), zap.String("podPhase", string(pod.Status.Phase)))

		// we don't need to retry, there is nothing *we* can do about it to fix this
		return reconcile.Result{}, nil
	}
}
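
// reconcileOnce is a hypothetical illustration of how the controller-runtime
// machinery drives Reconcile (the function name is an assumption for this sketch,
// e.g. for use in a unit test): it builds a request from a namespace/name pair,
// invokes Reconcile, and interprets the result the same way the controller does.
func reconcileOnce(r *ReconcilePod, namespace, name string) (requeue bool, err error) {
	req := reconcile.Request{
		NamespacedName: client.ObjectKey{Namespace: namespace, Name: name},
	}
	res, err := r.Reconcile(req)
	if err != nil {
		// the controller requeues with backoff on error
		return true, err
	}
	// Requeue is set above when the pod exists but has no sandbox ID or no running container yet
	return res.Requeue, nil
}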