github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/container-hook/tracer.go (about)

     1  // Copyright 2023 The Inspektor Gadget authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package containerhook detects when a container is created or terminated.
    16  //
    17  // It uses two mechanisms to detect new containers:
    18  //  1. fanotify with FAN_OPEN_EXEC_PERM.
    19  //  2. ebpf on the sys_enter_execve tracepoint to get the execve arguments.
    20  //
    21  // Using fanotify with FAN_OPEN_EXEC_PERM allows to call a callback function
    22  // while the container is being created. The container is paused until the
    23  // callback function returns.
    24  //
    25  // Using ebpf on the sys_enter_execve tracepoint allows to get the execve
    26  // arguments without the need to read /proc/$pid/cmdline or /proc/$pid/comm.
    27  // Reading /proc/$pid/cmdline is not possible using only fanotify when the
    28  // tracer is not in the same pidns as the process being traced. This is the
    29  // case when Inspektor Gadget is started with hostPID=false.
    30  //
    31  // https://github.com/inspektor-gadget/inspektor-gadget/blob/main/docs/devel/fanotify-ebpf.png
    32  package containerhook
    33  
    34  import (
    35  	"encoding/json"
    36  	"errors"
    37  	"fmt"
    38  	"io"
    39  	"math"
    40  	"os"
    41  	"path/filepath"
    42  	"strconv"
    43  	"strings"
    44  	"sync"
    45  	"sync/atomic"
    46  	"time"
    47  
    48  	"github.com/cilium/ebpf"
    49  	"github.com/cilium/ebpf/link"
    50  	securejoin "github.com/cyphar/filepath-securejoin"
    51  	ocispec "github.com/opencontainers/runtime-spec/specs-go"
    52  	"github.com/s3rj1k/go-fanotify/fanotify"
    53  	log "github.com/sirupsen/logrus"
    54  	"golang.org/x/sys/unix"
    55  
    56  	"github.com/inspektor-gadget/inspektor-gadget/pkg/btfgen"
    57  	"github.com/inspektor-gadget/inspektor-gadget/pkg/gadgets"
    58  	"github.com/inspektor-gadget/inspektor-gadget/pkg/kfilefields"
    59  	"github.com/inspektor-gadget/inspektor-gadget/pkg/utils/host"
    60  )
    61  
    62  //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -target $TARGET -cc clang -cflags ${CFLAGS} -no-global-types -type record execruntime ./bpf/execruntime.bpf.c -- -I./bpf/
    63  
// EventType identifies the kind of container lifecycle change carried by a
// ContainerEvent.
type EventType int

const (
	// EventTypeAddContainer marks the event emitted when a new container
	// has been detected.
	EventTypeAddContainer EventType = iota
	// EventTypeRemoveContainer marks the event emitted when the process of
	// a watched container is no longer found in procfs.
	EventTypeRemoveContainer
)
    70  
// ContainerEvent is the notification for container creation or termination.
//
// Events of type EventTypeRemoveContainer only carry Type, ContainerID and
// ContainerPID; the other fields are left at their zero value.
type ContainerEvent struct {
	// Type is whether the container was added or removed
	Type EventType

	// ContainerID is the container id, typically a 64 hexadecimal string
	ContainerID string

	// ContainerName is the container name, typically two words with an
	// underscore. It is empty when the name was not learned beforehand
	// from a conmon command line.
	ContainerName string

	// ContainerPID is the process id of the container, as read from the
	// pid file written by the container runtime
	ContainerPID uint32

	// Container's configuration is the config.json from the OCI runtime
	// spec
	ContainerConfig *ocispec.Spec

	// Bundle is the directory containing the config.json from the OCI
	// runtime spec
	// See https://github.com/opencontainers/runtime-spec/blob/main/bundle.md
	Bundle string
}
    94  
// ContainerNotifyFunc is the callback invoked by ContainerNotifier for every
// container event. Add events are delivered synchronously from the goroutine
// handling the pid file access; remove events are each delivered from their
// own goroutine.
type ContainerNotifyFunc func(notif ContainerEvent)
    96  
// watchedContainer is a container that is polled for termination.
type watchedContainer struct {
	id  string // container ID, also the key in ContainerNotifier.containers
	pid int    // process ID that is looked up among the procfs entries
}
   101  
// futureContainer holds what was learned from a conmon command line about a
// container that the OCI runtime has not created yet.
type futureContainer struct {
	id        string // value of conmon's -c/--cid argument
	name      string // value of conmon's -n/--name argument
	bundleDir string // value of conmon's -b/--bundle argument
	pidFile   string // value of conmon's -p/--container-pidfile argument
}
   108  
// ContainerNotifier detects container creation and termination and reports
// them through a callback. It is created with NewContainerNotifier and
// released with Close.
type ContainerNotifier struct {
	// runtimeBinaryNotify is the fanotify descriptor marked on the
	// container runtime binaries; callback receives the container events.
	runtimeBinaryNotify *fanotify.NotifyFD
	callback            ContainerNotifyFunc

	// containers is the set of containers that are being watched for
	// termination. This prevents duplicate calls to
	// AddWatchContainerTermination.
	//
	// Keys: Container ID
	containers   map[string]*watchedContainer
	containersMu sync.Mutex

	// futureContainers is the set of containers that are detected before
	// oci-runtime (runc/crun) creates the container e.g. detected via conmon
	//
	// Keys: Container ID
	futureContainers map[string]*futureContainer
	futureMu         sync.Mutex

	// objs holds the loaded ebpf programs and maps; links holds their
	// attachments, which are released in Close().
	objs  execruntimeObjects
	links []link.Link

	// closed is set to true when the notifier is closed
	closed atomic.Bool
	// done is closed in Close() so that watchContainersTermination() does
	// not have to wait for the ticker to trigger before returning
	done chan bool

	// wg tracks the background goroutines so that Close() can wait for them
	wg sync.WaitGroup
}
   139  
// runtimePaths is the list of paths where the container runtime (runc or
// crun) or conmon could be installed. Depending on the Linux distribution,
// it could be in different locations.
//
// The paths are relative to the host: install() resolves each of them below
// host.HostRoot (following symlinks inside that root) before marking it with
// fanotify. The list is only used when the RUNTIME_PATH env variable is not
// set.
var runtimePaths = []string{
	"/bin/runc",
	"/usr/bin/runc",
	"/usr/sbin/runc",
	"/usr/local/bin/runc",
	"/usr/local/sbin/runc",
	"/usr/lib/cri-o-runc/sbin/runc",
	"/run/torcx/unpack/docker/bin/runc",
	"/usr/bin/crun",
	"/usr/bin/conmon",
	"/var/lib/rancher/k3s/data/current/bin/runc",
}
   158  
   159  // initFanotify initializes the fanotify API with the flags we need
   160  func initFanotify() (*fanotify.NotifyFD, error) {
   161  	fanotifyFlags := uint(unix.FAN_CLOEXEC | unix.FAN_CLASS_CONTENT | unix.FAN_UNLIMITED_QUEUE | unix.FAN_UNLIMITED_MARKS | unix.FAN_NONBLOCK)
   162  	openFlags := os.O_RDONLY | unix.O_LARGEFILE | unix.O_CLOEXEC
   163  	return fanotify.Initialize(fanotifyFlags, openFlags)
   164  }
   165  
   166  // Supported detects if RuncNotifier is supported in the current environment
   167  func Supported() bool {
   168  	notifier, err := NewContainerNotifier(func(notif ContainerEvent) {})
   169  	if notifier != nil {
   170  		notifier.Close()
   171  	}
   172  	if err != nil {
   173  		log.Warnf("ContainerNotifier: not supported: %s", err)
   174  	}
   175  	return err == nil
   176  }
   177  
   178  // NewContainerNotifier uses fanotify and ebpf to detect when a container is
   179  // created or terminated, and call the callback on such event.
   180  //
   181  // Limitations:
   182  // - the container runtime must be installed in one of the paths listed by runtimePaths
   183  func NewContainerNotifier(callback ContainerNotifyFunc) (*ContainerNotifier, error) {
   184  	n := &ContainerNotifier{
   185  		callback:         callback,
   186  		containers:       make(map[string]*watchedContainer),
   187  		futureContainers: make(map[string]*futureContainer),
   188  		done:             make(chan bool),
   189  	}
   190  
   191  	if err := n.install(); err != nil {
   192  		n.Close()
   193  		return nil, err
   194  	}
   195  
   196  	return n, nil
   197  }
   198  
   199  func (n *ContainerNotifier) installEbpf(fanotifyFd int) error {
   200  	spec, err := loadExecruntime()
   201  	if err != nil {
   202  		return fmt.Errorf("load ebpf program for container-hook: %w", err)
   203  	}
   204  
   205  	fanotifyPrivateData, err := kfilefields.ReadPrivateDataFromFd(fanotifyFd)
   206  	if err != nil {
   207  		return fmt.Errorf("reading private data from fanotify fd: %w", err)
   208  	}
   209  
   210  	consts := map[string]interface{}{
   211  		"tracer_group": fanotifyPrivateData,
   212  	}
   213  	if err := spec.RewriteConstants(consts); err != nil {
   214  		return fmt.Errorf("RewriteConstants: %w", err)
   215  	}
   216  
   217  	opts := ebpf.CollectionOptions{
   218  		Programs: ebpf.ProgramOptions{
   219  			KernelTypes: btfgen.GetBTFSpec(),
   220  		},
   221  	}
   222  
   223  	if err := spec.LoadAndAssign(&n.objs, &opts); err != nil {
   224  		return fmt.Errorf("loading maps and programs: %w", err)
   225  	}
   226  
   227  	// Attach ebpf programs
   228  	l, err := link.Kprobe("fsnotify_remove_first_event", n.objs.IgFaPickE, nil)
   229  	if err != nil {
   230  		return fmt.Errorf("attaching kprobe fsnotify_remove_first_event: %w", err)
   231  	}
   232  	n.links = append(n.links, l)
   233  
   234  	l, err = link.Kretprobe("fsnotify_remove_first_event", n.objs.IgFaPickX, nil)
   235  	if err != nil {
   236  		return fmt.Errorf("attaching kretprobe fsnotify_remove_first_event: %w", err)
   237  	}
   238  	n.links = append(n.links, l)
   239  
   240  	l, err = link.Tracepoint("syscalls", "sys_enter_execve", n.objs.IgExecveE, nil)
   241  	if err != nil {
   242  		return fmt.Errorf("attaching tracepoint: %w", err)
   243  	}
   244  	n.links = append(n.links, l)
   245  
   246  	l, err = link.Tracepoint("syscalls", "sys_exit_execve", n.objs.IgExecveX, nil)
   247  	if err != nil {
   248  		return fmt.Errorf("attaching tracepoint: %w", err)
   249  	}
   250  	n.links = append(n.links, l)
   251  
   252  	return nil
   253  }
   254  
   255  func (n *ContainerNotifier) install() error {
   256  	// Start fanotify
   257  	runtimeBinaryNotify, err := initFanotify()
   258  	if err != nil {
   259  		return err
   260  	}
   261  	n.runtimeBinaryNotify = runtimeBinaryNotify
   262  
   263  	// Load, initialize and attach ebpf program
   264  	err = n.installEbpf(runtimeBinaryNotify.Fd)
   265  	if err != nil {
   266  		return err
   267  	}
   268  
   269  	// Attach fanotify to various runtime binaries
   270  	runtimeFound := false
   271  
   272  	runtimePath := os.Getenv("RUNTIME_PATH")
   273  	if runtimePath != "" {
   274  		log.Debugf("container-hook: trying runtime from RUNTIME_PATH env variable at %s", runtimePath)
   275  
   276  		// Check if we have to prepend the host root to the runtime path
   277  		if !strings.HasPrefix(runtimePath, host.HostRoot) {
   278  			// SecureJoin will resolve symlinks according to the host root
   279  			runtimePath, err = securejoin.SecureJoin(host.HostRoot, runtimePath)
   280  			if err != nil {
   281  				return fmt.Errorf("container-hook: securejoin failed: %w", err)
   282  			}
   283  		}
   284  
   285  		if _, err := os.Stat(runtimePath); errors.Is(err, os.ErrNotExist) {
   286  			return err
   287  		}
   288  
   289  		if err := runtimeBinaryNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_OPEN_EXEC_PERM, unix.AT_FDCWD, runtimePath); err != nil {
   290  			return fmt.Errorf("fanotify marking of %s: %w", runtimePath, err)
   291  		}
   292  		log.Debugf("container-hook: monitoring runtime at %s", runtimePath)
   293  		runtimeFound = true
   294  	} else {
   295  		for _, r := range runtimePaths {
   296  			// SecureJoin will resolve symlinks according to the host root
   297  			runtimePath, err := securejoin.SecureJoin(host.HostRoot, r)
   298  			if err != nil {
   299  				log.Debugf("container-hook: securejoin failed: %s", err)
   300  				continue
   301  			}
   302  
   303  			log.Debugf("container-hook: trying runtime at %s", runtimePath)
   304  
   305  			if _, err := os.Stat(runtimePath); errors.Is(err, os.ErrNotExist) {
   306  				log.Debugf("container-hook: runc at %s not found", runtimePath)
   307  				continue
   308  			}
   309  
   310  			if err := runtimeBinaryNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_OPEN_EXEC_PERM, unix.AT_FDCWD, runtimePath); err != nil {
   311  				log.Warnf("container-hook: failed to fanotify mark: %s", err)
   312  				continue
   313  			}
   314  			log.Debugf("container-hook: monitoring runtime at %s", runtimePath)
   315  			runtimeFound = true
   316  		}
   317  	}
   318  
   319  	if !runtimeFound {
   320  		runtimeBinaryNotify.File.Close()
   321  		return fmt.Errorf("no container runtime can be monitored with fanotify. The following paths were tested: %s. You can use the RUNTIME_PATH env variable to specify a custom path. If you are successful doing so, please open a PR to add your custom path to runtimePaths", strings.Join(runtimePaths, ","))
   322  	}
   323  
   324  	n.wg.Add(2)
   325  	go n.watchContainersTermination()
   326  	go n.watchRuntimeBinary()
   327  
   328  	return nil
   329  }
   330  
   331  // AddWatchContainerTermination watches a container for termination and
   332  // generates an event on the notifier. This is automatically called for new
   333  // containers detected by ContainerNotifier, but it can also be called for
   334  // containers detected externally such as initial containers.
   335  func (n *ContainerNotifier) AddWatchContainerTermination(containerID string, containerPID int) error {
   336  	n.containersMu.Lock()
   337  	defer n.containersMu.Unlock()
   338  
   339  	if _, ok := n.containers[containerID]; ok {
   340  		// This container is already being watched for termination
   341  		return nil
   342  	}
   343  
   344  	n.containers[containerID] = &watchedContainer{
   345  		id:  containerID,
   346  		pid: containerPID,
   347  	}
   348  
   349  	return nil
   350  }
   351  
   352  // watchContainerTermination waits until the container terminates
   353  func (n *ContainerNotifier) watchContainersTermination() {
   354  	defer n.wg.Done()
   355  
   356  	ticker := time.NewTicker(time.Second)
   357  	defer ticker.Stop()
   358  
   359  	for {
   360  		select {
   361  		case <-n.done:
   362  			return
   363  		case <-ticker.C:
   364  			if n.closed.Load() {
   365  				return
   366  			}
   367  
   368  			dirEntries, err := os.ReadDir(host.HostProcFs)
   369  			if err != nil {
   370  				log.Errorf("reading /proc: %s", err)
   371  				return
   372  			}
   373  			pids := make(map[int]bool)
   374  			for _, entry := range dirEntries {
   375  				pid, err := strconv.Atoi(entry.Name())
   376  				if err != nil {
   377  					// entry is not a process directory. Ignore.
   378  					continue
   379  				}
   380  				pids[pid] = true
   381  			}
   382  
   383  			n.containersMu.Lock()
   384  			for _, c := range n.containers {
   385  				if pids[c.pid] {
   386  					// container still running
   387  					continue
   388  				}
   389  
   390  				go n.callback(ContainerEvent{
   391  					Type:         EventTypeRemoveContainer,
   392  					ContainerID:  c.id,
   393  					ContainerPID: uint32(c.pid),
   394  				})
   395  
   396  				delete(n.containers, c.id)
   397  			}
   398  			n.containersMu.Unlock()
   399  		}
   400  	}
   401  }
   402  
   403  func (n *ContainerNotifier) watchPidFileIterate(
   404  	pidFileDirNotify *fanotify.NotifyFD,
   405  	bundleDir string,
   406  	configJSONPath string,
   407  	pidFile string,
   408  	pidFileDir string,
   409  ) (bool, error) {
   410  	// Get the next event from fanotify.
   411  	// Even though the API allows to pass skipPIDs, we cannot use
   412  	// it here because ResponseAllow would not be called.
   413  	data, err := pidFileDirNotify.GetEvent()
   414  	if err != nil {
   415  		return false, fmt.Errorf("%w", err)
   416  	}
   417  
   418  	// data can be nil if the event received is from a process in skipPIDs.
   419  	// In that case, skip and get the next event.
   420  	if data == nil {
   421  		return false, nil
   422  	}
   423  
   424  	// Don't leak the fd received by GetEvent
   425  	defer data.Close()
   426  	dataFile := data.File()
   427  	defer dataFile.Close()
   428  
   429  	if !data.MatchMask(unix.FAN_ACCESS_PERM) {
   430  		// This should not happen: FAN_ACCESS_PERM is the only mask Marked
   431  		return false, fmt.Errorf("fanotify: unknown event on runc: mask=%d pid=%d", data.Mask, data.Pid)
   432  	}
   433  
   434  	// This unblocks whoever is accessing the pidfile
   435  	defer pidFileDirNotify.ResponseAllow(data)
   436  
   437  	path, err := data.GetPath()
   438  	if err != nil {
   439  		return false, err
   440  	}
   441  	path = filepath.Join(host.HostRoot, path)
   442  
   443  	// Consider files identical if they have the same device/inode,
   444  	// even if the paths differ due to symlinks (for example,
   445  	// the event's path is /run/... but the runc --pid-file argument
   446  	// uses /var/run/..., where /var/run is a symlink to /run).
   447  	filesAreIdentical, err := checkFilesAreIdentical(path, pidFile)
   448  	if err != nil {
   449  		return false, err
   450  	} else if !filesAreIdentical {
   451  		return false, nil
   452  	}
   453  
   454  	pidFileContent, err := io.ReadAll(dataFile)
   455  	if err != nil {
   456  		return false, err
   457  	}
   458  	if len(pidFileContent) == 0 {
   459  		return false, fmt.Errorf("empty pid file")
   460  	}
   461  	containerPID, err := strconv.Atoi(string(pidFileContent))
   462  	if err != nil {
   463  		return false, err
   464  	}
   465  
   466  	// Unfortunately, Linux 5.4 doesn't respect ignore masks
   467  	// See fix in Linux 5.9:
   468  	// https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e
   469  	// Workaround: remove parent mask. We don't need it anymore :)
   470  	err = pidFileDirNotify.Mark(unix.FAN_MARK_REMOVE, unix.FAN_ACCESS_PERM|unix.FAN_EVENT_ON_CHILD, unix.AT_FDCWD, pidFileDir)
   471  	if err != nil {
   472  		return false, nil
   473  	}
   474  
   475  	bundleConfigJSON, err := os.ReadFile(configJSONPath)
   476  	if err != nil {
   477  		return false, err
   478  	}
   479  	containerConfig := &ocispec.Spec{}
   480  	err = json.Unmarshal(bundleConfigJSON, containerConfig)
   481  	if err != nil {
   482  		return false, err
   483  	}
   484  
   485  	// cri-o appends userdata to bundleDir,
   486  	// so we trim it here to get the correct containerID
   487  	containerID := filepath.Base(filepath.Clean(strings.TrimSuffix(bundleDir, "userdata")))
   488  
   489  	err = n.AddWatchContainerTermination(containerID, containerPID)
   490  	if err != nil {
   491  		log.Errorf("container %s with pid %d terminated before we could watch it: %s", containerID, containerPID, err)
   492  		return true, nil
   493  	}
   494  
   495  	if containerPID > math.MaxUint32 {
   496  		log.Errorf("Container PID (%d) exceeds math.MaxUint32 (%d)", containerPID, math.MaxUint32)
   497  		return true, nil
   498  	}
   499  
   500  	var containerName string
   501  	n.futureMu.Lock()
   502  	fc, ok := n.futureContainers[containerID]
   503  	if ok {
   504  		containerName = fc.name
   505  	}
   506  	delete(n.futureContainers, containerID)
   507  	n.futureMu.Unlock()
   508  
   509  	n.callback(ContainerEvent{
   510  		Type:            EventTypeAddContainer,
   511  		ContainerID:     containerID,
   512  		ContainerPID:    uint32(containerPID),
   513  		ContainerConfig: containerConfig,
   514  		Bundle:          bundleDir,
   515  		ContainerName:   containerName,
   516  	})
   517  
   518  	return true, nil
   519  }
   520  
   521  func checkFilesAreIdentical(path1, path2 string) (bool, error) {
   522  	// Since fanotify masks don't work on Linux 5.4, we could get a
   523  	// notification for an unrelated file before the pid file is created
   524  	// See fix in Linux 5.9:
   525  	// https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e
   526  	// In this case we should not return an error.
   527  	if filepath.Base(path1) != filepath.Base(path2) {
   528  		return false, nil
   529  	}
   530  
   531  	f1, err := os.Stat(path1)
   532  	if err != nil {
   533  		return false, err
   534  	}
   535  
   536  	f2, err := os.Stat(path2)
   537  	if err != nil {
   538  		return false, err
   539  	}
   540  
   541  	return os.SameFile(f1, f2), nil
   542  }
   543  
   544  func (n *ContainerNotifier) monitorRuntimeInstance(bundleDir string, pidFile string) error {
   545  	fanotifyFlags := uint(unix.FAN_CLOEXEC | unix.FAN_CLASS_CONTENT | unix.FAN_UNLIMITED_QUEUE | unix.FAN_UNLIMITED_MARKS)
   546  	openFlags := os.O_RDONLY | unix.O_LARGEFILE | unix.O_CLOEXEC
   547  
   548  	pidFileDirNotify, err := fanotify.Initialize(fanotifyFlags, openFlags)
   549  	if err != nil {
   550  		return err
   551  	}
   552  
   553  	// The pidfile does not exist yet, so we cannot monitor it directly.
   554  	// Instead we monitor its parent directory with FAN_EVENT_ON_CHILD to
   555  	// get events on the directory's children.
   556  	pidFileDir := filepath.Dir(pidFile)
   557  	err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_ACCESS_PERM|unix.FAN_EVENT_ON_CHILD, unix.AT_FDCWD, pidFileDir)
   558  	if err != nil {
   559  		pidFileDirNotify.File.Close()
   560  		return fmt.Errorf("marking %s: %w", pidFileDir, err)
   561  	}
   562  
   563  	// watchPidFileIterate() will read config.json and it might be in the
   564  	// same directory as the pid file. To avoid getting events unrelated to
   565  	// the pidfile, add an ignore mask.
   566  	//
   567  	// This is best effort because the ignore mask is unfortunately not
   568  	// respected until a fix in Linux 5.9:
   569  	// https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e
   570  	configJSONPath := filepath.Join(bundleDir, "config.json")
   571  	if _, err := os.Stat(configJSONPath); errors.Is(err, os.ErrNotExist) {
   572  		// podman might install config.json in the userdata directory
   573  		configJSONPath = filepath.Join(bundleDir, "userdata", "config.json")
   574  		if _, err := os.Stat(configJSONPath); errors.Is(err, os.ErrNotExist) {
   575  			pidFileDirNotify.File.Close()
   576  			return fmt.Errorf("config not found at %s", configJSONPath)
   577  		}
   578  	}
   579  	err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD|unix.FAN_MARK_IGNORED_MASK, unix.FAN_ACCESS_PERM, unix.AT_FDCWD, configJSONPath)
   580  	if err != nil {
   581  		pidFileDirNotify.File.Close()
   582  		return fmt.Errorf("marking %s: %w", configJSONPath, err)
   583  	}
   584  
   585  	// similar to config.json, we ignore passwd file if it exists
   586  	passwdPath := filepath.Join(bundleDir, "passwd")
   587  	if _, err := os.Stat(passwdPath); !errors.Is(err, os.ErrNotExist) {
   588  		err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD|unix.FAN_MARK_IGNORED_MASK, unix.FAN_ACCESS_PERM, unix.AT_FDCWD, passwdPath)
   589  		if err != nil {
   590  			pidFileDirNotify.File.Close()
   591  			return fmt.Errorf("marking passwd path: %w", err)
   592  		}
   593  	}
   594  
   595  	n.wg.Add(1)
   596  	go func() {
   597  		defer n.wg.Done()
   598  		defer pidFileDirNotify.File.Close()
   599  		for {
   600  			stop, err := n.watchPidFileIterate(pidFileDirNotify, bundleDir, configJSONPath, pidFile, pidFileDir)
   601  			if n.closed.Load() {
   602  				return
   603  			}
   604  			if err != nil {
   605  				log.Warnf("error watching pid: %v\n", err)
   606  				return
   607  			}
   608  			if stop {
   609  				return
   610  			}
   611  		}
   612  	}()
   613  
   614  	return nil
   615  }
   616  
   617  func (n *ContainerNotifier) watchRuntimeBinary() {
   618  	defer n.wg.Done()
   619  
   620  	for {
   621  		stop, err := n.watchRuntimeIterate()
   622  		if n.closed.Load() {
   623  			n.runtimeBinaryNotify.File.Close()
   624  			return
   625  		}
   626  		if err != nil {
   627  			log.Errorf("error watching runtime binary: %v\n", err)
   628  		}
   629  		if stop {
   630  			n.runtimeBinaryNotify.File.Close()
   631  			return
   632  		}
   633  	}
   634  }
   635  
   636  func (n *ContainerNotifier) parseConmonCmdline(cmdlineArr []string) {
   637  	containerName := ""
   638  	containerID := ""
   639  	bundleDir := ""
   640  	pidFile := ""
   641  
   642  	for i := 0; i < len(cmdlineArr); i++ {
   643  		verb := cmdlineArr[i]
   644  		arg := ""
   645  		if i+1 < len(cmdlineArr) {
   646  			arg = cmdlineArr[i+1]
   647  		}
   648  		switch verb {
   649  		case "-n", "--name":
   650  			containerName = arg
   651  			i++
   652  		case "-c", "--cid":
   653  			containerID = arg
   654  			i++
   655  		case "-b", "--bundle":
   656  			bundleDir = arg
   657  			i++
   658  		case "-p", "--container-pidfile":
   659  			pidFile = arg
   660  			i++
   661  		}
   662  	}
   663  
   664  	if containerName == "" || containerID == "" || bundleDir == "" || pidFile == "" {
   665  		return
   666  	}
   667  
   668  	n.futureMu.Lock()
   669  	n.futureContainers[containerID] = &futureContainer{
   670  		id:        containerID,
   671  		pidFile:   pidFile,
   672  		bundleDir: bundleDir,
   673  		name:      containerName,
   674  	}
   675  	n.futureMu.Unlock()
   676  }
   677  
   678  func (n *ContainerNotifier) parseOCIRuntime(comm string, cmdlineArr []string) {
   679  	// Parse oci-runtime (runc/crun) command line
   680  	createFound := false
   681  	bundleDir := ""
   682  	pidFile := ""
   683  
   684  	for i := 0; i < len(cmdlineArr); i++ {
   685  		if cmdlineArr[i] == "create" {
   686  			createFound = true
   687  			continue
   688  		}
   689  		if cmdlineArr[i] == "--bundle" && i+1 < len(cmdlineArr) {
   690  			i++
   691  			bundleDir = filepath.Join(host.HostRoot, cmdlineArr[i])
   692  			continue
   693  		}
   694  		if cmdlineArr[i] == "--pid-file" && i+1 < len(cmdlineArr) {
   695  			i++
   696  			pidFile = filepath.Join(host.HostRoot, cmdlineArr[i])
   697  			continue
   698  		}
   699  	}
   700  
   701  	if createFound && bundleDir != "" && pidFile != "" {
   702  		err := n.monitorRuntimeInstance(bundleDir, pidFile)
   703  		if err != nil {
   704  			log.Errorf("error monitoring runtime instance: %v\n", err)
   705  		}
   706  	}
   707  }
   708  
   709  func (n *ContainerNotifier) watchRuntimeIterate() (bool, error) {
   710  	// Get the next event from fanotify.
   711  	// Even though the API allows to pass skipPIDs, we cannot use it here
   712  	// because ResponseAllow would not be called.
   713  	data, err := n.runtimeBinaryNotify.GetEvent()
   714  	if err != nil {
   715  		return true, err
   716  	}
   717  
   718  	// data can be nil if the event received is from a process in skipPIDs.
   719  	// In that case, skip and get the next event.
   720  	if data == nil {
   721  		return false, nil
   722  	}
   723  
   724  	// Don't leak the fd received by GetEvent
   725  	defer data.Close()
   726  
   727  	if !data.MatchMask(unix.FAN_OPEN_EXEC_PERM) {
   728  		// This should not happen: FAN_OPEN_EXEC_PERM is the only mask Marked
   729  		return false, fmt.Errorf("fanotify: unknown event on runc: mask=%d pid=%d", data.Mask, data.Pid)
   730  	}
   731  
   732  	// This unblocks the execution
   733  	defer n.runtimeBinaryNotify.ResponseAllow(data)
   734  
   735  	// Lookup entry in ebpf map ig_fa_records
   736  	var record execruntimeRecord
   737  	err = n.objs.IgFaRecords.LookupAndDelete(nil, &record)
   738  	if err != nil {
   739  		return false, fmt.Errorf("lookup record: %w", err)
   740  	}
   741  
   742  	// Skip empty record
   743  	// This can happen when the ebpf code didn't find the exec args
   744  	if record.Pid == 0 {
   745  		log.Debugf("skip event with pid=0")
   746  		return false, nil
   747  	}
   748  	if record.ArgsSize == 0 {
   749  		log.Debugf("skip event without args")
   750  		return false, nil
   751  	}
   752  
   753  	callerComm := strings.TrimRight(string(record.CallerComm[:]), "\x00")
   754  
   755  	cmdlineArr := []string{}
   756  	calleeComm := ""
   757  	for _, arg := range strings.Split(string(record.Args[0:record.ArgsSize]), "\x00") {
   758  		if arg != "" {
   759  			cmdlineArr = append(cmdlineArr, arg)
   760  		}
   761  	}
   762  	if len(cmdlineArr) == 0 {
   763  		log.Debugf("cannot get cmdline for pid %d", record.Pid)
   764  		return false, nil
   765  	}
   766  	if len(cmdlineArr) > 0 {
   767  		calleeComm = filepath.Base(cmdlineArr[0])
   768  	}
   769  
   770  	log.Debugf("got event with pid=%d caller=%q callee=%q args=%v",
   771  		record.Pid,
   772  		callerComm, calleeComm,
   773  		cmdlineArr)
   774  
   775  	// runc is executing itself with unix.Exec(), so fanotify receives two
   776  	// FAN_OPEN_EXEC_PERM events:
   777  	//   1. from containerd-shim (or similar)
   778  	//   2. from runc, by this re-execution.
   779  	// This filter takes the first one.
   780  
   781  	switch calleeComm {
   782  	case "conmon":
   783  		// Calling sequence: crio/podman -> conmon -> runc/crun
   784  		n.parseConmonCmdline(cmdlineArr)
   785  	case "runc", "crun":
   786  		n.parseOCIRuntime(calleeComm, cmdlineArr)
   787  	default:
   788  		return false, nil
   789  	}
   790  
   791  	return false, nil
   792  }
   793  
   794  func (n *ContainerNotifier) Close() {
   795  	n.closed.Store(true)
   796  	close(n.done)
   797  	if n.runtimeBinaryNotify != nil {
   798  		n.runtimeBinaryNotify.File.Close()
   799  	}
   800  	n.wg.Wait()
   801  
   802  	for _, l := range n.links {
   803  		gadgets.CloseLink(l)
   804  	}
   805  	n.links = nil
   806  	n.objs.Close()
   807  }