github.com/aporeto-inc/trireme-lib@v10.358.0+incompatible/monitor/internal/pod/controller.go

     1  // +build linux !windows
     2  
     3  package podmonitor
     4  
     5  import (
     6  	"context"
     7  	errs "errors"
     8  	"time"
     9  
    10  	"k8s.io/client-go/tools/record"
    11  
    12  	"go.aporeto.io/trireme-lib/common"
    13  	"go.aporeto.io/trireme-lib/monitor/config"
    14  	"go.aporeto.io/trireme-lib/monitor/extractors"
    15  	"go.aporeto.io/trireme-lib/policy"
    16  	"go.uber.org/zap"
    17  
    18  	corev1 "k8s.io/api/core/v1"
    19  	"k8s.io/apimachinery/pkg/api/errors"
    20  
    21  	"sigs.k8s.io/controller-runtime/pkg/client"
    22  	"sigs.k8s.io/controller-runtime/pkg/controller"
    23  	"sigs.k8s.io/controller-runtime/pkg/event"
    24  	"sigs.k8s.io/controller-runtime/pkg/handler"
    25  	"sigs.k8s.io/controller-runtime/pkg/manager"
    26  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    27  	"sigs.k8s.io/controller-runtime/pkg/source"
    28  )
    29  
    30  var (
    31  	// ErrHandlePUStartEventFailed is the error sent back if a start event fails
    32  	ErrHandlePUStartEventFailed = errs.New("Aporeto Enforcer start event failed")
    33  
    34  	// ErrNetnsExtractionMissing is the error when we are missing a PID or netns path after successful metadata extraction
    35  	ErrNetnsExtractionMissing = errs.New("Aporeto Enforcer failed to extract PID or netns path")
    36  
    37  	// ErrHandlePUStopEventFailed is the error sent back if a stop event fails
    38  	ErrHandlePUStopEventFailed = errs.New("Aporeto Enforcer stop event failed")
    39  
    40  	// ErrHandlePUDestroyEventFailed is the error sent back if a destroy event fails
    41  	ErrHandlePUDestroyEventFailed = errs.New("Aporeto Enforcer destroy event failed")
    42  )
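
        // NOTE: returning a non-nil error from Reconcile causes controller-runtime to requeue the
        // request with backoff; the sentinel errors above give those failure modes a stable,
        // comparable identity.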
    43  
    44  // newReconciler returns a new reconcile.Reconciler
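        //
        // A minimal wiring sketch (illustrative only; mgr, the processor config, the extractors
        // and the channels are assumed to be provided by the surrounding pod monitor setup):
        //
        //	r := newReconciler(mgr, pc, metadataExtractor, netclsProgrammer, sandboxExtractor,
        //		nodeName, false, deleteCh, deleteReconcileCh, resyncInfo)
        //	if err := addController(mgr, r, 4, eventsCh); err != nil {
        //		// handle controller setup error
        //	}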
    45  func newReconciler(mgr manager.Manager, handler *config.ProcessorConfig, metadataExtractor extractors.PodMetadataExtractor, netclsProgrammer extractors.PodNetclsProgrammer, sandboxExtractor extractors.PodSandboxExtractor, nodeName string, enableHostPods bool, deleteCh chan<- DeleteEvent, deleteReconcileCh chan<- struct{}, resyncInfo *ResyncInfoChan) *ReconcilePod {
    46  	return &ReconcilePod{
    47  		client:            mgr.GetClient(),
    48  		recorder:          mgr.GetRecorder("trireme-pod-controller"),
    49  		handler:           handler,
    50  		metadataExtractor: metadataExtractor,
    51  		netclsProgrammer:  netclsProgrammer,
    52  		sandboxExtractor:  sandboxExtractor,
    53  		nodeName:          nodeName,
    54  		enableHostPods:    enableHostPods,
    55  		deleteCh:          deleteCh,
    56  		deleteReconcileCh: deleteReconcileCh,
    57  		resyncInfo:        resyncInfo,
    58  
    59  		// TODO: should move into configuration
    60  		handlePUEventTimeout:   60 * time.Second,
    61  		metadataExtractTimeout: 10 * time.Second,
    62  		netclsProgramTimeout:   10 * time.Second,
    63  	}
    64  }
    65  
    66  // addController adds a new Controller to mgr with r as the reconcile.Reconciler
    67  func addController(mgr manager.Manager, r *ReconcilePod, workers int, eventsCh <-chan event.GenericEvent) error {
    68  	// Create a new controller
    69  	c, err := controller.New("trireme-pod-controller", mgr, controller.Options{
    70  		Reconciler:              r,
    71  		MaxConcurrentReconciles: workers,
    72  	})
    73  	if err != nil {
    74  		return err
    75  	}
    76  
    77  	// we use this mapper in both of our event sources
    78  	mapper := &WatchPodMapper{
    79  		client:         mgr.GetClient(),
    80  		nodeName:       r.nodeName,
    81  		enableHostPods: r.enableHostPods,
    82  	}
    83  
    84  	// use our watch pod mapper, which filters pods before we reconcile
    85  	if err := c.Watch(
    86  		&source.Kind{Type: &corev1.Pod{}},
    87  		&handler.EnqueueRequestsFromMapFunc{ToRequests: mapper},
    88  	); err != nil {
    89  		return err
    90  	}
    91  
    92  	// we pass in a custom channel for events generated by resync
    93  	return c.Watch(
    94  		&source.Channel{Source: eventsCh},
    95  		&handler.EnqueueRequestsFromMapFunc{ToRequests: mapper},
    96  	)
    97  }
    98  
    99  var _ reconcile.Reconciler = &ReconcilePod{}
   100  
   101  // DeleteEvent is used to send delete events to our event loop, which watches
   102  // for their real deletion in the Kubernetes API. Once an object is gone, we will
   103  // send down destroy events to trireme.
   104  type DeleteEvent struct {
   105  	PodUID        string
   106  	SandboxID     string
   107  	NamespaceName client.ObjectKey
   108  }
   109  
   110  // ReconcilePod reconciles a Pod object
   111  type ReconcilePod struct {
   112  	// This client, initialized using mgr.GetClient() above, is a split client
   113  	// that reads objects from the cache and writes to the apiserver
   114  	client            client.Client
   115  	recorder          record.EventRecorder
   116  	handler           *config.ProcessorConfig
   117  	metadataExtractor extractors.PodMetadataExtractor
   118  	netclsProgrammer  extractors.PodNetclsProgrammer
   119  	sandboxExtractor  extractors.PodSandboxExtractor
   120  	nodeName          string
   121  	enableHostPods    bool
   122  	deleteCh          chan<- DeleteEvent
   123  	deleteReconcileCh chan<- struct{}
   124  	resyncInfo        *ResyncInfoChan
   125  
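        	// per-operation timeouts that bound metadata extraction, HandlePUEvent calls and
        	// net_cls programming (see the TODO in newReconciler about moving these into configuration)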
   126  	metadataExtractTimeout time.Duration
   127  	handlePUEventTimeout   time.Duration
   128  	netclsProgramTimeout   time.Duration
   129  }
   130  
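        // resyncHelper notifies the resync handler (if one is configured) that this
        // NamespacedName has been processed by the reconciler.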
   131  func (r *ReconcilePod) resyncHelper(nn string) {
   132  	if r.resyncInfo != nil {
   133  		r.resyncInfo.SendInfo(nn)
   134  	}
   135  }
   136  
   137  // Reconcile reads the state of the cluster for a pod object
   138  func (r *ReconcilePod) Reconcile(request reconcile.Request) (reconcile.Result, error) {
   139  	ctx := context.Background()
   140  	nn := request.NamespacedName.String()
   141  
   142  	// we do this very early on:
   143  	// whatever happens while processing this pod event, we tell the Resync handler
   144  	// that we have seen it. Even if we have not sent an event to the policy engine,
   145  	// this most likely means it is safe for an existing PU to be deleted first
   146  	defer r.resyncHelper(nn)
   147  
   148  	var puID, sandboxID string
   149  	var err error
   150  	// Fetch the corresponding pod object.
   151  	pod := &corev1.Pod{}
   152  	if err := r.client.Get(ctx, request.NamespacedName, pod); err != nil {
   153  		if errors.IsNotFound(err) {
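        			// the pod is gone from the API server: signal the delete event loop to reconcile,
        			// so that destroy events can be sent for pods that no longer exist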
   154  			r.deleteReconcileCh <- struct{}{}
   155  			return reconcile.Result{}, nil
   156  		}
   157  		// Otherwise, we retry.
   158  		return reconcile.Result{}, err
   159  	}
   160  
   161  	sandboxID, err = r.sandboxExtractor(ctx, pod)
   162  	if err != nil {
   163  		// Do nothing if we can't find the sandboxID
   164  		zap.L().Debug("Pod reconcile: cannot extract the SandboxID", zap.String("namespacedName", nn))
   165  	}
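        	// the Kubernetes pod UID doubles as the PU (Processing Unit) ID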
   166  	puID = string(pod.GetUID())
   167  	// abort immediately if this is a HostNetwork pod and we don't want to activate those
   168  	// NOTE: this is already done in the mapper; however, this additional check does not hurt
   169  	if pod.Spec.HostNetwork && !r.enableHostPods {
   170  		zap.L().Debug("Pod is a HostNetwork pod, but enableHostPods is false", zap.String("puID", puID), zap.String("namespacedName", nn))
   171  		return reconcile.Result{}, nil
   172  	}
   173  
   174  	// it looks like we can miss events for all sorts of unknown reasons;
   175  	// however, if we reconcile and the pod exists, we know for certain
   176  	// that it must go away at some point, so always register it with the delete controller
   177  	r.deleteCh <- DeleteEvent{
   178  		PodUID:        puID,
   179  		SandboxID:     sandboxID,
   180  		NamespaceName: request.NamespacedName,
   181  	}
   182  
   183  	// try to find out if any of the containers have been started yet
   184  	// this is static information on the pod; we don't need to care about the phase to determine that
   185  	// NOTE: This is important because InitContainers are started during the PodPending phase, which is
   186  	//       what we need to rely on to activate as early as possible
   187  	var started bool
   188  	for _, status := range pod.Status.InitContainerStatuses {
   189  		if status.State.Running != nil {
   190  			started = true
   191  			break
   192  		}
   193  	}
   194  	if !started {
   195  		for _, status := range pod.Status.ContainerStatuses {
   196  			if status.State.Running != nil {
   197  				started = true
   198  				break
   199  			}
   200  		}
   201  	}
   202  
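        	// drive the PU lifecycle off the pod phase: Pending/Running extract metadata, update and
        	// (once a container runs) start the PU; Succeeded/Failed update and stop it; anything else is only logged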
   203  	switch pod.Status.Phase {
   204  	case corev1.PodPending:
   205  		fallthrough
   206  	case corev1.PodRunning:
   207  		zap.L().Debug("PodPending / PodRunning", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Bool("anyContainerStarted", started))
   208  
   209  		// now try to do the metadata extraction
   210  		extractCtx, extractCancel := context.WithTimeout(ctx, r.metadataExtractTimeout)
   211  		defer extractCancel()
   212  		puRuntime, err := r.metadataExtractor(extractCtx, pod, started)
   213  		if err != nil {
   214  			zap.L().Error("failed to extract metadata", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
   215  			r.recorder.Eventf(pod, "Warning", "PUExtractMetadata", "PU '%s' failed to extract metadata: %s", puID, err.Error())
   216  			return reconcile.Result{}, err
   217  		}
   218  
   219  		// now create/update the PU
   220  		// every HandlePUEvent call gets done in this context
   221  		handlePUCtx, handlePUCancel := context.WithTimeout(ctx, r.handlePUEventTimeout)
   222  		defer handlePUCancel()
   223  		if err := r.handler.Policy.HandlePUEvent(
   224  			handlePUCtx,
   225  			puID,
   226  			common.EventUpdate,
   227  			puRuntime,
   228  		); err != nil {
   229  			zap.L().Error("failed to handle update event", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
   230  			r.recorder.Eventf(pod, "Warning", "PUUpdate", "failed to handle update event for PU '%s': %s", puID, err.Error())
   231  			// return reconcile.Result{}, err
   232  		} else {
   233  			r.recorder.Eventf(pod, "Normal", "PUUpdate", "PU '%s' updated successfully", puID)
   234  		}
   235  
   236  		// NOTE: a pod that is terminating is going to reconcile as well in the PodRunning phase;
   237  		// however, it will have the deletion timestamp set, which is an indicator for us that it is
   238  		// shutting down. For us it means that we don't have to start anything anymore. We can safely stop
   239  		// the PU once the phase is PodSucceeded/PodFailed. Meanwhile, we have already sent an update event
   240  		// above that included new tags from the metadata extractor.
   241  		if pod.DeletionTimestamp != nil {
   242  			return reconcile.Result{}, nil
   243  		}
   244  		// If the pod hasn't started or if there is no sandbox present, requeue.
   245  		if sandboxID == "" || !started {
   246  			return reconcile.Result{Requeue: true}, nil
   247  		}
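        		// from here on we know that a sandbox ID exists and at least one container is running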
   248  		if started {
   249  			// if the metadata extractor is missing the PID or nspath, we need to try again
   250  			// we need it for starting the PU. However, only require this if we are not in host network mode.
   251  			// NOTE: this can happen for example if the containers are not in a running state on their own
   252  			if !pod.Spec.HostNetwork && len(puRuntime.NSPath()) == 0 && puRuntime.Pid() == 0 {
   253  				zap.L().Error("Kubernetes thinks a container is running, however, we failed to extract a PID or NSPath with the metadata extractor. Requeueing...", zap.String("puID", puID), zap.String("namespacedName", nn))
   254  				r.recorder.Eventf(pod, "Warning", "PUStart", "PU '%s' failed to extract netns", puID)
   255  				return reconcile.Result{}, ErrNetnsExtractionMissing
   256  			}
   257  
   258  			// now start the PU
   259  			// every HandlePUEvent call gets done in this context
   260  			handlePUStartCtx, handlePUStartCancel := context.WithTimeout(ctx, r.handlePUEventTimeout)
   261  			defer handlePUStartCancel()
   262  			if err := r.handler.Policy.HandlePUEvent(
   263  				handlePUStartCtx,
   264  				puID,
   265  				common.EventStart,
   266  				puRuntime,
   267  			); err != nil {
   268  				if policy.IsErrPUAlreadyActivated(err) {
   269  					// abort early if this PU has already been activated before
   270  					zap.L().Debug("PU has already been activated", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
   271  				} else {
   272  					zap.L().Error("failed to handle start event", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
   273  					r.recorder.Eventf(pod, "Warning", "PUStart", "PU '%s' failed to start: %s", puID, err.Error())
   274  				}
   275  			} else {
   276  				r.recorder.Eventf(pod, "Normal", "PUStart", "PU '%s' started successfully", puID)
   277  			}
   278  
   279  			// if this is a host network pod, we need to program the net_cls cgroup
   280  			if pod.Spec.HostNetwork {
   281  				netclsProgramCtx, netclsProgramCancel := context.WithTimeout(ctx, r.netclsProgramTimeout)
   282  				defer netclsProgramCancel()
   283  				if err := r.netclsProgrammer(netclsProgramCtx, pod, puRuntime); err != nil {
   284  					if extractors.IsErrNetclsAlreadyProgrammed(err) {
   285  						zap.L().Debug("net_cls cgroup has already been programmed previously", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
   286  					} else if extractors.IsErrNoHostNetworkPod(err) {
   287  						zap.L().Error("net_cls cgroup programmer told us that this is not a host network pod", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
   288  					} else {
   289  						zap.L().Error("failed to program net_cls cgroup of pod", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
   290  						r.recorder.Eventf(pod, "Warning", "PUStart", "Host Network PU '%s' failed to program its net_cls cgroups: %s", puID, err.Error())
   291  						return reconcile.Result{}, err
   292  					}
   293  				} else {
   294  					zap.L().Debug("net_cls cgroup has been successfully programmed for trireme", zap.String("puID", puID), zap.String("namespacedName", nn))
   295  					r.recorder.Eventf(pod, "Normal", "PUStart", "Host Network PU '%s' has successfully programmed its net_cls cgroups", puID)
   296  				}
   297  			}
   298  		}
   299  		return reconcile.Result{}, nil
   300  
   301  	case corev1.PodSucceeded:
   302  		fallthrough
   303  	case corev1.PodFailed:
   304  		zap.L().Debug("PodSucceeded / PodFailed", zap.String("puID", puID), zap.String("namespacedName", nn))
   305  		// do metadata extraction regardless of the pod being stopped
   306  		//
   307  		// there is the edge case that the enforcer is starting up and we encounter the pod for the first time
   308  		// in stopped state, so we have to do metadata extraction here as well
   309  		extractCtx, extractCancel := context.WithTimeout(ctx, r.metadataExtractTimeout)
   310  		defer extractCancel()
   311  		puRuntime, err := r.metadataExtractor(extractCtx, pod, started)
   312  		if err != nil {
   313  			zap.L().Error("failed to extract metadata", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
   314  			r.recorder.Eventf(pod, "Warning", "PUExtractMetadata", "PU '%s' failed to extract metadata: %s", puID, err.Error())
   315  			return reconcile.Result{}, err
   316  		}
   317  
   318  		// every HandlePUEvent call gets done in this context
   319  		handlePUCtx, handlePUCancel := context.WithTimeout(ctx, r.handlePUEventTimeout)
   320  		defer handlePUCancel()
   321  
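        		// NOTE: the update and stop events below share this single timeout context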
   322  		if err := r.handler.Policy.HandlePUEvent(
   323  			handlePUCtx,
   324  			puID,
   325  			common.EventUpdate,
   326  			puRuntime,
   327  		); err != nil {
   328  			zap.L().Error("failed to handle update event", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
   329  			r.recorder.Eventf(pod, "Warning", "PUUpdate", "failed to handle update event for PU '%s': %s", puID, err.Error())
   330  			// return reconcile.Result{}, err
   331  		} else {
   332  			r.recorder.Eventf(pod, "Normal", "PUUpdate", "PU '%s' updated successfully", puID)
   333  		}
   334  
   335  		if err := r.handler.Policy.HandlePUEvent(
   336  			handlePUCtx,
   337  			puID,
   338  			common.EventStop,
   339  			puRuntime,
   340  		); err != nil {
   341  			zap.L().Error("failed to handle stop event", zap.String("puID", puID), zap.String("namespacedName", nn), zap.Error(err))
   342  			r.recorder.Eventf(pod, "Warning", "PUStop", "PU '%s' failed to stop: %s", puID, err.Error())
   343  		} else {
   344  			r.recorder.Eventf(pod, "Normal", "PUStop", "PU '%s' has been successfully stopped", puID)
   345  		}
   346  
   347  		// we don't need to reconcile
   348  		// sending the stop event is enough
   349  		return reconcile.Result{}, nil
   350  
   351  	case corev1.PodUnknown:
   352  		zap.L().Error("pod is in unknown state", zap.String("puID", puID), zap.String("namespacedName", nn))
   353  
   354  	// we don't need to retry; there is nothing *we* can do to fix this
   355  		return reconcile.Result{}, nil
   356  	default:
   357  		zap.L().Error("unknown pod phase", zap.String("puID", puID), zap.String("namespacedName", nn), zap.String("podPhase", string(pod.Status.Phase)))
   358  
   359  	// we don't need to retry; there is nothing *we* can do to fix this
   360  		return reconcile.Result{}, nil
   361  	}
   362  }