github.com/castai/kvisor@v1.7.1-0.20240516114728-b3572a2607b5/pkg/ebpftracer/tracer_decode.go (about)

     1  package ebpftracer
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"runtime/debug"
     7  	"strconv"
     8  
     9  	"github.com/castai/kvisor/pkg/containers"
    10  	"github.com/castai/kvisor/pkg/ebpftracer/decoder"
    11  	"github.com/castai/kvisor/pkg/ebpftracer/events"
    12  	"github.com/castai/kvisor/pkg/ebpftracer/types"
    13  	"github.com/castai/kvisor/pkg/kernel"
    14  	"github.com/castai/kvisor/pkg/metrics"
    15  	"github.com/castai/kvisor/pkg/proc"
    16  	"github.com/cilium/ebpf"
    17  	"github.com/prometheus/client_golang/prometheus"
    18  	"golang.org/x/net/context"
    19  )
    20  
    21  // ErrPanic is the sentinel wrapped into the error returned when decoding recovered from a panic; match it with errors.Is.
    22  var ErrPanic = errors.New("encountered panic")
    23  
    24  func (t *Tracer) decodeAndHandleSignal(_ context.Context, data []byte) (rerr error) {
    25  	defer func() {
    26  		if perr := recover(); perr != nil {
    27  			stack := string(debug.Stack())
    28  			rerr = fmt.Errorf("decode %w: %v, stack=%s", ErrPanic, perr, stack)
    29  		}
    30  	}()
    31  
    32  	ebpfMsgDecoder := decoder.NewEventDecoder(t.log, data)
    33  	var signalCtx types.SignalContext
    34  	if err := ebpfMsgDecoder.DecodeSignalContext(&signalCtx); err != nil {
    35  		return err
    36  	}
    37  	parsedArgs, err := decoder.ParseArgs(ebpfMsgDecoder, signalCtx.EventID)
    38  	if err != nil {
    39  		return fmt.Errorf("cannot parse event type %d: %w", signalCtx.EventID, err)
    40  	}
    41  
    42  	switch args := parsedArgs.(type) {
    43  	case types.SignalCgroupMkdirArgs:
    44  		// We we only care about events from the default cgroup, as cgroup v1 does not have unified cgroups.
    45  		if !t.cfg.CgroupClient.IsDefaultHierarchy(args.HierarchyId) {
    46  			return nil
    47  		}
    48  
    49  		t.cfg.CgroupClient.LoadCgroup(args.CgroupId, args.CgroupPath)
    50  
    51  	case types.SignalCgroupRmdirArgs:
    52  		// We we only care about events from the default cgroup, as cgroup v1 does not have unified cgroups.
    53  		if !t.cfg.CgroupClient.IsDefaultHierarchy(args.HierarchyId) {
    54  			return nil
    55  		}
    56  
    57  		t.queueCgroupForRemoval(args.CgroupId)
    58  		err := t.UnmuteEventsFromCgroup(args.CgroupId)
    59  		if err != nil {
    60  			return fmt.Errorf("cannot remove cgroup %d from mute map: %w", args.CgroupId, err)
    61  		}
    62  	default:
    63  		t.log.Warnf("unhandled signal: %d", signalCtx.EventID)
    64  	}
    65  
    66  	return nil
    67  }
    68  
    69  func (t *Tracer) decodeAndExportEvent(ctx context.Context, data []byte) (rerr error) {
    70  	metrics.AgentPulledEventsBytesTotal.Add(float64(len(data)))
    71  
    72  	defer func() {
    73  		if perr := recover(); perr != nil {
    74  			stack := string(debug.Stack())
    75  			rerr = fmt.Errorf("decode %w: %v, stack=%s", ErrPanic, perr, stack)
    76  		}
    77  	}()
    78  
    79  	ebpfMsgDecoder := decoder.NewEventDecoder(t.log, data)
    80  	var eventCtx types.EventContext
    81  	if err := ebpfMsgDecoder.DecodeContext(&eventCtx); err != nil {
    82  		return err
    83  	}
    84  
    85  	eventId := eventCtx.EventID
    86  	parsedArgs, err := decoder.ParseArgs(ebpfMsgDecoder, eventId)
    87  	if err != nil {
    88  		return fmt.Errorf("cannot parse event type %d: %w", eventId, err)
    89  	}
    90  
    91  	container, err := t.cfg.ContainerClient.GetContainerForCgroup(ctx, eventCtx.CgroupID)
    92  	if err != nil {
    93  		// We ignore any event not belonging to a container for now.
    94  		if errors.Is(err, containers.ErrContainerNotFound) {
    95  			err := t.MuteEventsFromCgroup(eventCtx.CgroupID)
    96  			if err != nil {
    97  				return fmt.Errorf("cannot mute events for cgroup %d: %w", eventCtx.CgroupID, err)
    98  			}
    99  			return nil
   100  		}
   101  		return fmt.Errorf("cannot get container for cgroup %d: %w", eventCtx.CgroupID, err)
   102  	}
   103  
   104  	eventCtx.Ts = t.bootTime + eventCtx.Ts
   105  	event := &types.Event{
   106  		Context:   &eventCtx,
   107  		Container: container,
   108  		Args:      parsedArgs,
   109  	}
   110  
   111  	if _, found := t.signatureEventMap[eventId]; found {
   112  		t.cfg.SignatureEngine.QueueEvent(event)
   113  	}
   114  
   115  	// Do not parse event, if it is not registered. If there is no policy set, we treat is as to parse all the events
   116  	if _, found := t.eventPoliciesMap[eventId]; !found && t.policy != nil {
   117  		metrics.AgentSkippedEventsTotal.With(prometheus.Labels{metrics.EventIDLabel: strconv.Itoa(int(eventId))}).Inc()
   118  		return nil
   119  	}
   120  
   121  	// TODO: Move rate limit based policy to kernel side.
   122  	if err := t.allowedByPolicyPre(&eventCtx); err != nil {
   123  		metrics.AgentSkippedEventsTotal.With(prometheus.Labels{metrics.EventIDLabel: strconv.Itoa(int(eventId))}).Inc()
   124  		return nil
   125  	}
   126  
   127  	switch eventId {
   128  	case events.SchedProcessExec:
   129  		if eventCtx.Pid == 1 {
   130  			t.cfg.MountNamespacePIDStore.ForceAddToBucket(proc.NamespaceID(eventCtx.MntID), eventCtx.NodeHostPid)
   131  		} else {
   132  			t.cfg.MountNamespacePIDStore.AddToBucket(proc.NamespaceID(eventCtx.MntID), eventCtx.NodeHostPid)
   133  		}
   134  	}
   135  
   136  	if err := t.allowedByPolicy(eventId, eventCtx.CgroupID, event); err != nil {
   137  		metrics.AgentSkippedEventsTotal.With(prometheus.Labels{metrics.EventIDLabel: strconv.Itoa(int(eventCtx.EventID))}).Inc()
   138  		return nil
   139  	}
   140  
   141  	switch eventId {
   142  	case events.NetFlowBase:
   143  		select {
   144  		case t.netflowEventsChan <- event:
   145  		default:
   146  			def := t.eventsSet[eventCtx.EventID]
   147  			metrics.AgentDroppedEventsTotal.With(prometheus.Labels{metrics.EventTypeLabel: def.name}).Inc()
   148  		}
   149  	default:
   150  		select {
   151  		case t.eventsChan <- event:
   152  		default:
   153  			def := t.eventsSet[eventCtx.EventID]
   154  			metrics.AgentDroppedEventsTotal.With(prometheus.Labels{metrics.EventTypeLabel: def.name}).Inc()
   155  		}
   156  	}
   157  
   158  	return nil
   159  }
   160  
   161  func (t *Tracer) MuteEventsFromCgroup(cgroup uint64) error {
   162  	t.log.Infof("muting cgroup %d", cgroup)
   163  	return t.module.objects.IgnoredCgroupsMap.Put(cgroup, cgroup)
   164  }
   165  
   166  func (t *Tracer) MuteEventsFromCgroups(cgroups []uint64) error {
   167  	t.log.Infof("muting cgroups %v", cgroups)
   168  
   169  	kernelVersion, err := kernel.CurrentKernelVersion()
   170  	if err != nil {
   171  		return err
   172  	}
   173  
   174  	// The ebpf batch helpers are available since kernel version 5.6.
   175  	if kernelVersion.Major > 5 || (kernelVersion.Major == 5 && kernelVersion.Minor >= 6) {
   176  		_, err = t.module.objects.IgnoredCgroupsMap.BatchUpdate(cgroups, cgroups, &ebpf.BatchOptions{
   177  			Flags: uint64(ebpf.UpdateAny),
   178  		})
   179  
   180  		if err != nil {
   181  			t.log.Warnf("got error while trying to mute cgroups %v: %s", cgroups, err)
   182  		}
   183  	} else {
   184  		for _, cgroup := range cgroups {
   185  			err = t.module.objects.IgnoredCgroupsMap.Update(cgroup, cgroup, ebpf.UpdateAny)
   186  
   187  			if err != nil {
   188  				t.log.Warnf("got error while trying to delete cgroup %d from ignore map: %s", cgroup, err)
   189  			}
   190  		}
   191  	}
   192  
   193  	return nil
   194  }
   195  
   196  func (t *Tracer) UnmuteEventsFromCgroup(cgroup uint64) error {
   197  	t.log.Infof("unmuting cgroup %d", cgroup)
   198  
   199  	err := t.module.objects.IgnoredCgroupsMap.Delete(cgroup)
   200  
   201  	// We do not care if we try to remove a non existing cgroup.
   202  	if errors.Is(err, ebpf.ErrKeyNotExist) {
   203  		return nil
   204  	}
   205  
   206  	return err
   207  }
   208  
   209  func (t *Tracer) UnmuteEventsFromCgroups(cgroups []uint64) error {
   210  	t.log.Infof("unmuting cgroup %v", cgroups)
   211  
   212  	kernelVersion, err := kernel.CurrentKernelVersion()
   213  	if err != nil {
   214  		return err
   215  	}
   216  
   217  	// The ebpf batch helpers are available since kernel version 5.6.
   218  	if kernelVersion.Major > 5 || (kernelVersion.Major == 5 && kernelVersion.Minor >= 6) {
   219  		_, err = t.module.objects.IgnoredCgroupsMap.BatchDelete(cgroups, nil)
   220  		if !errors.Is(err, ebpf.ErrKeyNotExist) {
   221  			t.log.Warnf("got error while trying to delete cgroups %v from ignore map: %s", cgroups, err)
   222  		}
   223  	} else {
   224  		for _, cgroup := range cgroups {
   225  			err = t.module.objects.IgnoredCgroupsMap.Delete(cgroup)
   226  			if !errors.Is(err, ebpf.ErrKeyNotExist) {
   227  				t.log.Warnf("got error while trying to delete cgroup %d from ignore map: %s", cgroup, err)
   228  			}
   229  		}
   230  	}
   231  
   232  	return nil
   233  }
   234  
   235  func (t *Tracer) IsCgroupMuted(cgroup uint64) bool {
   236  	var value uint64
   237  
   238  	err := t.module.objects.IgnoredCgroupsMap.Lookup(cgroup, &value)
   239  
   240  	return !errors.Is(err, ebpf.ErrKeyNotExist) && value > 0
   241  }