github.com/inspektor-gadget/inspektor-gadget@v0.28.1/pkg/runcfanotify/runcfanotify.go (about)

     1  // Copyright 2021 The Inspektor Gadget authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package runcfanotify
    16  
    17  import (
    18  	"encoding/json"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  	"math"
    23  	"os"
    24  	"path"
    25  	"path/filepath"
    26  	"strconv"
    27  	"strings"
    28  	"sync"
    29  	"sync/atomic"
    30  	"time"
    31  
    32  	ocispec "github.com/opencontainers/runtime-spec/specs-go"
    33  	"github.com/s3rj1k/go-fanotify/fanotify"
    34  	log "github.com/sirupsen/logrus"
    35  	"golang.org/x/sys/unix"
    36  
    37  	"github.com/inspektor-gadget/inspektor-gadget/pkg/utils/host"
    38  )
    39  
// EventType identifies the kind of container lifecycle change carried by a
// ContainerEvent.
type EventType int

const (
	// EventTypeAddContainer marks an event reporting a newly detected
	// container.
	EventTypeAddContainer EventType = iota
	// EventTypeRemoveContainer marks an event reporting a container whose
	// process is no longer present.
	EventTypeRemoveContainer
)
    46  
// ContainerEvent is the notification for container creation or termination.
//
// Removal events emitted by this package only carry Type, ContainerID and
// ContainerPID; the remaining fields are left at their zero value.
type ContainerEvent struct {
	// Type is whether the container was added or removed
	Type EventType

	// ContainerID is the container id, typically a 64 hexadecimal string
	ContainerID string

	// ContainerName is the container name given by the container runtime,
	// typically two words with an underscore. Notice it might be different from
	// the one given by Kubernetes.
	ContainerName string

	// ContainerPID is the process id of the container, as read from the
	// runtime's pid file.
	ContainerPID uint32

	// ContainerConfig is the parsed config.json from the OCI runtime spec
	// bundle.
	ContainerConfig *ocispec.Spec

	// Bundle is the directory containing the config.json from the OCI
	// runtime spec
	// See https://github.com/opencontainers/runtime-spec/blob/main/bundle.md
	Bundle string
}
    72  
// RuncNotifyFunc is the callback invoked by RuncNotifier for every container
// event. Add events are delivered from the notifier's watcher goroutines and
// removal events from a dedicated goroutine per event, so implementations
// must tolerate being called concurrently.
type RuncNotifyFunc func(notif ContainerEvent)
    74  
// runcContainer is the bookkeeping entry for a container that is being
// watched for termination.
type runcContainer struct {
	// id is the container ID; it is also the key in RuncNotifier.containers.
	id string
	// pid is the container's process ID; the container is considered
	// terminated once this PID is no longer listed in the host procfs.
	pid int
	// NOTE(review): pidfd is never read or written in the code visible in
	// this file — confirm whether it is still needed.
	pidfd int
}
    80  
// futureContainer holds what was learned about a container from a conmon
// command line, before the OCI runtime has started it.
type futureContainer struct {
	// id is the container ID (conmon -c/--cid).
	id string
	// name is the container name (conmon -n/--name).
	name string
	// bundleDir is the OCI bundle directory (conmon -b/--bundle).
	bundleDir string
	// pidFile is the path of the container pid file
	// (conmon -p/--container-pidfile).
	pidFile string
}
    87  
// RuncNotifier detects container creation and termination by watching the
// OCI runtime binaries with fanotify and reports them through a callback.
// Create it with NewRuncNotifier and release it with Close.
type RuncNotifier struct {
	// runcBinaryNotify is the fanotify group marked on the runtime binaries.
	runcBinaryNotify *fanotify.NotifyFD
	// callback receives every container event.
	callback RuncNotifyFunc

	// containers is the set of containers that are being watched for
	// termination. This prevents duplicate calls to
	// AddWatchContainerTermination.
	//
	// Keys: Container ID
	containers   map[string]*runcContainer
	containersMu sync.Mutex

	// futureContainers is the set of containers that are detected before
	// oci-runtime (runc/crun) creates the container e.g. detected via conmon
	//
	// Keys: Container ID
	futureContainers map[string]*futureContainer
	futureMu         sync.Mutex

	// set to true when RuncNotifier is closed
	closed atomic.Bool
	// this channel is used in watchContainersTermination() to avoid having to wait for the
	// ticker to trigger before returning
	done chan bool

	// wg tracks the background goroutines started by the notifier so that
	// Close can wait for them.
	wg sync.WaitGroup
}
   115  
// runcPaths is the list of paths where an OCI runtime (runc or crun) could be
// installed. Depending on the Linux distribution, it could be in different
// locations.
//
// Each entry is joined onto host.HostRoot before being probed, so the list
// also works when this package runs inside a container with the host
// filesystem mounted elsewhere. The list is only used when the RUNC_PATH
// environment variable is empty.
var runcPaths = []string{
	"/bin/runc",
	"/usr/bin/runc",
	"/usr/sbin/runc",
	"/usr/local/bin/runc",
	"/usr/local/sbin/runc",
	"/usr/lib/cri-o-runc/sbin/runc",
	"/run/torcx/unpack/docker/bin/runc",
	"/usr/bin/crun",
}
   131  
   132  // initFanotify initializes the fanotify API with the flags we need
   133  func initFanotify() (*fanotify.NotifyFD, error) {
   134  	fanotifyFlags := uint(unix.FAN_CLOEXEC | unix.FAN_CLASS_CONTENT | unix.FAN_UNLIMITED_QUEUE | unix.FAN_UNLIMITED_MARKS | unix.FAN_NONBLOCK)
   135  	openFlags := os.O_RDONLY | unix.O_LARGEFILE | unix.O_CLOEXEC
   136  	return fanotify.Initialize(fanotifyFlags, openFlags)
   137  }
   138  
   139  // Supported detects if RuncNotifier is supported in the current environment
   140  func Supported() bool {
   141  	hostPidNs, err := host.IsHostPidNs()
   142  	if err != nil {
   143  		log.Debugf("Runcfanotify: not supported: %s", err)
   144  		return false
   145  	}
   146  	if !hostPidNs {
   147  		log.Debugf("Runcfanotify: not supported: not in host pid namespace")
   148  		return false
   149  	}
   150  	notifier, err := NewRuncNotifier(func(notif ContainerEvent) {})
   151  	if notifier != nil {
   152  		notifier.Close()
   153  	}
   154  	if err != nil {
   155  		log.Warnf("checking if current pid namespace is host pid namespace %s", err)
   156  	}
   157  	return err == nil
   158  }
   159  
   160  // NewRuncNotifier uses fanotify to detect when runc containers are created
   161  // or terminated, and call the callback on such event.
   162  //
   163  // Limitations:
   164  // - runc must be installed in one of the paths listed by runcPaths
   165  func NewRuncNotifier(callback RuncNotifyFunc) (*RuncNotifier, error) {
   166  	n := &RuncNotifier{
   167  		callback:         callback,
   168  		containers:       make(map[string]*runcContainer),
   169  		futureContainers: make(map[string]*futureContainer),
   170  		done:             make(chan bool),
   171  	}
   172  
   173  	runcBinaryNotify, err := initFanotify()
   174  	if err != nil {
   175  		return nil, err
   176  	}
   177  	n.runcBinaryNotify = runcBinaryNotify
   178  
   179  	runcMonitored := false
   180  
   181  	runcPath := os.Getenv("RUNC_PATH")
   182  	if runcPath != "" {
   183  		log.Debugf("Runcfanotify: trying runc from RUNC_PATH env variable at %s", runcPath)
   184  
   185  		if _, err := os.Stat(runcPath); errors.Is(err, os.ErrNotExist) {
   186  			return nil, err
   187  		}
   188  
   189  		if err := runcBinaryNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_OPEN_EXEC_PERM, unix.AT_FDCWD, runcPath); err != nil {
   190  			return nil, fmt.Errorf("fanotify marking of %s: %w", runcPath, err)
   191  		}
   192  		runcMonitored = true
   193  	} else {
   194  		for _, r := range runcPaths {
   195  			runcPath := filepath.Join(host.HostRoot, r)
   196  
   197  			log.Debugf("Runcfanotify: trying runc at %s", runcPath)
   198  
   199  			if _, err := os.Stat(runcPath); errors.Is(err, os.ErrNotExist) {
   200  				log.Debugf("Runcfanotify: runc at %s not found", runcPath)
   201  				continue
   202  			}
   203  
   204  			if err := runcBinaryNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_OPEN_EXEC_PERM, unix.AT_FDCWD, runcPath); err != nil {
   205  				log.Warnf("Runcfanotify: failed to fanotify mark: %s", err)
   206  				continue
   207  			}
   208  			runcMonitored = true
   209  		}
   210  	}
   211  
   212  	if !runcMonitored {
   213  		runcBinaryNotify.File.Close()
   214  		return nil, fmt.Errorf("no runc instance can be monitored with fanotify. The following paths were tested: %s. You can use the RUNC_PATH env variable to specify a custom path. If you are successful doing so, please open a PR to add your custom path to runcPaths", strings.Join(runcPaths, ","))
   215  	}
   216  
   217  	n.wg.Add(2)
   218  	go n.watchContainersTermination()
   219  	go n.watchRunc()
   220  
   221  	return n, nil
   222  }
   223  
   224  // AddWatchContainerTermination watches a container for termination and
   225  // generates an event on the notifier. This is automatically called for new
   226  // containers detected by RuncNotifier, but it can also be called for
   227  // containers detected externally such as initial containers.
   228  func (n *RuncNotifier) AddWatchContainerTermination(containerID string, containerPID int) error {
   229  	n.containersMu.Lock()
   230  	defer n.containersMu.Unlock()
   231  
   232  	if _, ok := n.containers[containerID]; ok {
   233  		// This container is already being watched for termination
   234  		return nil
   235  	}
   236  
   237  	n.containers[containerID] = &runcContainer{
   238  		id:  containerID,
   239  		pid: containerPID,
   240  	}
   241  
   242  	return nil
   243  }
   244  
   245  // watchContainerTermination waits until the container terminates
   246  func (n *RuncNotifier) watchContainersTermination() {
   247  	defer n.wg.Done()
   248  
   249  	ticker := time.NewTicker(time.Second)
   250  	defer ticker.Stop()
   251  
   252  	for {
   253  		select {
   254  		case <-n.done:
   255  			return
   256  		case <-ticker.C:
   257  			if n.closed.Load() {
   258  				return
   259  			}
   260  
   261  			dirEntries, err := os.ReadDir(host.HostProcFs)
   262  			if err != nil {
   263  				log.Errorf("reading /proc: %s", err)
   264  				return
   265  			}
   266  			pids := make(map[int]bool)
   267  			for _, entry := range dirEntries {
   268  				pid, err := strconv.Atoi(entry.Name())
   269  				if err != nil {
   270  					// entry is not a process directory. Ignore.
   271  					continue
   272  				}
   273  				pids[pid] = true
   274  			}
   275  
   276  			n.containersMu.Lock()
   277  			for _, c := range n.containers {
   278  				if pids[c.pid] {
   279  					// container still running
   280  					continue
   281  				}
   282  
   283  				go n.callback(ContainerEvent{
   284  					Type:         EventTypeRemoveContainer,
   285  					ContainerID:  c.id,
   286  					ContainerPID: uint32(c.pid),
   287  				})
   288  
   289  				delete(n.containers, c.id)
   290  			}
   291  			n.containersMu.Unlock()
   292  		}
   293  	}
   294  }
   295  
   296  func (n *RuncNotifier) watchPidFileIterate(pidFileDirNotify *fanotify.NotifyFD, bundleDir string, pidFile string, pidFileDir string) (bool, error) {
   297  	// Get the next event from fanotify.
   298  	// Even though the API allows to pass skipPIDs, we cannot use
   299  	// it here because ResponseAllow would not be called.
   300  	data, err := pidFileDirNotify.GetEvent()
   301  	if err != nil {
   302  		return false, fmt.Errorf("%w", err)
   303  	}
   304  
   305  	// data can be nil if the event received is from a process in skipPIDs.
   306  	// In that case, skip and get the next event.
   307  	if data == nil {
   308  		return false, nil
   309  	}
   310  
   311  	// Don't leak the fd received by GetEvent
   312  	defer data.Close()
   313  	dataFile := data.File()
   314  	defer dataFile.Close()
   315  
   316  	if !data.MatchMask(unix.FAN_ACCESS_PERM) {
   317  		// This should not happen: FAN_ACCESS_PERM is the only mask Marked
   318  		return false, fmt.Errorf("fanotify: unknown event on runc: mask=%d pid=%d", data.Mask, data.Pid)
   319  	}
   320  
   321  	// This unblocks whoever is accessing the pidfile
   322  	defer pidFileDirNotify.ResponseAllow(data)
   323  
   324  	pid := data.GetPID()
   325  
   326  	// Skip events triggered by ourselves
   327  	if pid == os.Getpid() {
   328  		return false, nil
   329  	}
   330  
   331  	path, err := data.GetPath()
   332  	if err != nil {
   333  		return false, err
   334  	}
   335  	path = filepath.Join(host.HostRoot, path)
   336  
   337  	// Consider files identical if they have the same device/inode,
   338  	// even if the paths differ due to symlinks (for example,
   339  	// the event's path is /run/... but the runc --pid-file argument
   340  	// uses /var/run/..., where /var/run is a symlink to /run).
   341  	filesAreIdentical, err := checkFilesAreIdentical(path, pidFile)
   342  	if err != nil {
   343  		return false, err
   344  	} else if !filesAreIdentical {
   345  		return false, nil
   346  	}
   347  
   348  	pidFileContent, err := io.ReadAll(dataFile)
   349  	if err != nil {
   350  		return false, err
   351  	}
   352  	if len(pidFileContent) == 0 {
   353  		return false, fmt.Errorf("empty pid file")
   354  	}
   355  	containerPID, err := strconv.Atoi(string(pidFileContent))
   356  	if err != nil {
   357  		return false, err
   358  	}
   359  
   360  	// Unfortunately, Linux 5.4 doesn't respect ignore masks
   361  	// See fix in Linux 5.9:
   362  	// https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e
   363  	// Workaround: remove parent mask. We don't need it anymore :)
   364  	err = pidFileDirNotify.Mark(unix.FAN_MARK_REMOVE, unix.FAN_ACCESS_PERM|unix.FAN_EVENT_ON_CHILD, unix.AT_FDCWD, pidFileDir)
   365  	if err != nil {
   366  		return false, nil
   367  	}
   368  
   369  	bundleConfigJSON, err := os.ReadFile(filepath.Join(bundleDir, "config.json"))
   370  	if err != nil {
   371  		return false, err
   372  	}
   373  	containerConfig := &ocispec.Spec{}
   374  	err = json.Unmarshal(bundleConfigJSON, containerConfig)
   375  	if err != nil {
   376  		return false, err
   377  	}
   378  
   379  	// cri-o appends userdata to bundleDir,
   380  	// so we trim it here to get the correct containerID
   381  	containerID := filepath.Base(filepath.Clean(strings.TrimSuffix(bundleDir, "userdata")))
   382  
   383  	err = n.AddWatchContainerTermination(containerID, containerPID)
   384  	if err != nil {
   385  		log.Errorf("runc fanotify: container %s with pid %d terminated before we could watch it: %s", containerID, containerPID, err)
   386  		return true, nil
   387  	}
   388  
   389  	if containerPID > math.MaxUint32 {
   390  		log.Errorf("Container PID (%d) exceeds math.MaxUint32 (%d)", containerPID, math.MaxUint32)
   391  		return true, nil
   392  	}
   393  
   394  	var containerName string
   395  	if fc := n.lookupFutureContainer(containerID); fc != nil {
   396  		containerName = fc.name
   397  	}
   398  
   399  	n.callback(ContainerEvent{
   400  		Type:            EventTypeAddContainer,
   401  		ContainerID:     containerID,
   402  		ContainerPID:    uint32(containerPID),
   403  		ContainerConfig: containerConfig,
   404  		Bundle:          bundleDir,
   405  		ContainerName:   containerName,
   406  	})
   407  
   408  	return true, nil
   409  }
   410  
   411  func checkFilesAreIdentical(path1, path2 string) (bool, error) {
   412  	// Since fanotify masks don't work on Linux 5.4, we could get a
   413  	// notification for an unrelated file before the pid file is created
   414  	// See fix in Linux 5.9:
   415  	// https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e
   416  	// In this case we should not return an error.
   417  	if filepath.Base(path1) != filepath.Base(path2) {
   418  		return false, nil
   419  	}
   420  
   421  	f1, err := os.Stat(path1)
   422  	if err != nil {
   423  		return false, err
   424  	}
   425  
   426  	f2, err := os.Stat(path2)
   427  	if err != nil {
   428  		return false, err
   429  	}
   430  
   431  	return os.SameFile(f1, f2), nil
   432  }
   433  
   434  func (n *RuncNotifier) monitorRuncInstance(bundleDir string, pidFile string) error {
   435  	fanotifyFlags := uint(unix.FAN_CLOEXEC | unix.FAN_CLASS_CONTENT | unix.FAN_UNLIMITED_QUEUE | unix.FAN_UNLIMITED_MARKS)
   436  	openFlags := os.O_RDONLY | unix.O_LARGEFILE | unix.O_CLOEXEC
   437  
   438  	pidFileDirNotify, err := fanotify.Initialize(fanotifyFlags, openFlags)
   439  	if err != nil {
   440  		return err
   441  	}
   442  
   443  	// The pidfile does not exist yet, so we cannot monitor it directly.
   444  	// Instead we monitor its parent directory with FAN_EVENT_ON_CHILD to
   445  	// get events on the directory's children.
   446  	pidFileDir := filepath.Dir(pidFile)
   447  	err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD, unix.FAN_ACCESS_PERM|unix.FAN_EVENT_ON_CHILD, unix.AT_FDCWD, pidFileDir)
   448  	if err != nil {
   449  		pidFileDirNotify.File.Close()
   450  		return fmt.Errorf("marking %s: %w", pidFileDir, err)
   451  	}
   452  
   453  	// watchPidFileIterate() will read config.json and it might be in the
   454  	// same directory as the pid file. To avoid getting events unrelated to
   455  	// the pidfile, add an ignore mask.
   456  	//
   457  	// This is best effort because the ignore mask is unfortunately not
   458  	// respected until a fix in Linux 5.9:
   459  	// https://github.com/torvalds/linux/commit/497b0c5a7c0688c1b100a9c2e267337f677c198e
   460  	configJSONPath := filepath.Join(bundleDir, "config.json")
   461  	err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD|unix.FAN_MARK_IGNORED_MASK, unix.FAN_ACCESS_PERM, unix.AT_FDCWD, configJSONPath)
   462  	if err != nil {
   463  		pidFileDirNotify.File.Close()
   464  		return fmt.Errorf("ignoring %s: %w", configJSONPath, err)
   465  	}
   466  
   467  	// similar to config.json, we ignore passwd file if it exists
   468  	passwdPath := filepath.Join(bundleDir, "passwd")
   469  	if _, err := os.Stat(passwdPath); !errors.Is(err, os.ErrNotExist) {
   470  		err = pidFileDirNotify.Mark(unix.FAN_MARK_ADD|unix.FAN_MARK_IGNORED_MASK, unix.FAN_ACCESS_PERM, unix.AT_FDCWD, passwdPath)
   471  		if err != nil {
   472  			pidFileDirNotify.File.Close()
   473  			return fmt.Errorf("marking passwd path: %w", err)
   474  		}
   475  	}
   476  
   477  	n.wg.Add(1)
   478  	go func() {
   479  		defer n.wg.Done()
   480  		defer pidFileDirNotify.File.Close()
   481  		for {
   482  			stop, err := n.watchPidFileIterate(pidFileDirNotify, bundleDir, pidFile, pidFileDir)
   483  			if n.closed.Load() {
   484  				return
   485  			}
   486  			if err != nil {
   487  				log.Warnf("error watching pid: %v\n", err)
   488  				return
   489  			}
   490  			if stop {
   491  				return
   492  			}
   493  		}
   494  	}()
   495  
   496  	return nil
   497  }
   498  
   499  func (n *RuncNotifier) watchRunc() {
   500  	defer n.wg.Done()
   501  
   502  	for {
   503  		stop, err := n.watchRuncIterate()
   504  		if n.closed.Load() {
   505  			n.runcBinaryNotify.File.Close()
   506  			return
   507  		}
   508  		if err != nil {
   509  			log.Errorf("error watching runc: %v\n", err)
   510  		}
   511  		if stop {
   512  			n.runcBinaryNotify.File.Close()
   513  			return
   514  		}
   515  	}
   516  }
   517  
   518  func (n *RuncNotifier) parseConmonCmdline(cmdlineArr []string) {
   519  	if path.Base(cmdlineArr[0]) != "conmon" {
   520  		return
   521  	}
   522  
   523  	// Parse conmon command line
   524  	containerName := ""
   525  	containerID := ""
   526  	bundleDir := ""
   527  	pidFile := ""
   528  	conmonFound := false
   529  
   530  	conmonFound = true
   531  	for i := 0; i < len(cmdlineArr); i++ {
   532  		verb := cmdlineArr[i]
   533  		arg := ""
   534  		if i+1 < len(cmdlineArr) {
   535  			arg = cmdlineArr[i+1]
   536  		}
   537  		switch verb {
   538  		case "-n", "--name":
   539  			containerName = arg
   540  			i++
   541  		case "-c", "--cid":
   542  			containerID = arg
   543  			i++
   544  		case "-b", "--bundle":
   545  			bundleDir = arg
   546  			i++
   547  		case "-p", "--container-pidfile":
   548  			pidFile = arg
   549  			i++
   550  		}
   551  	}
   552  
   553  	if !conmonFound || containerName == "" || containerID == "" || bundleDir == "" || pidFile == "" {
   554  		return
   555  	}
   556  
   557  	n.futureMu.Lock()
   558  	n.futureContainers[containerID] = &futureContainer{
   559  		id:        containerID,
   560  		pidFile:   pidFile,
   561  		bundleDir: bundleDir,
   562  		name:      containerName,
   563  	}
   564  	n.futureMu.Unlock()
   565  }
   566  
   567  func (n *RuncNotifier) parseOCIRuntime(comm string, cmdlineArr []string) {
   568  	// Parse oci-runtime (runc/crun) command line
   569  	createFound := false
   570  	startFound := false
   571  	containerID := ""
   572  	bundleDir := ""
   573  	pidFile := ""
   574  
   575  	for i := 0; i < len(cmdlineArr); i++ {
   576  		if cmdlineArr[i] == "create" {
   577  			createFound = true
   578  			continue
   579  		}
   580  		if cmdlineArr[i] == "start" {
   581  			startFound = true
   582  			continue
   583  		}
   584  		if cmdlineArr[i] == "--bundle" && i+1 < len(cmdlineArr) {
   585  			i++
   586  			bundleDir = filepath.Join(host.HostRoot, cmdlineArr[i])
   587  			continue
   588  		}
   589  		if cmdlineArr[i] == "--pid-file" && i+1 < len(cmdlineArr) {
   590  			i++
   591  			pidFile = filepath.Join(host.HostRoot, cmdlineArr[i])
   592  			continue
   593  		}
   594  		if cmdlineArr[i] != "" {
   595  			containerID = cmdlineArr[i]
   596  		}
   597  	}
   598  
   599  	if comm == "runc" && createFound && bundleDir != "" && pidFile != "" {
   600  		err := n.monitorRuncInstance(bundleDir, pidFile)
   601  		if err != nil {
   602  			log.Errorf("error monitoring runc instance: %v\n", err)
   603  		}
   604  	}
   605  
   606  	if comm == "crun" && startFound && containerID != "" {
   607  		fc := n.lookupFutureContainer(containerID)
   608  		if fc == nil {
   609  			log.Warnf("cannot lookup container for %s\n", containerID)
   610  			return
   611  		}
   612  		bundleConfigJSON, err := os.ReadFile(filepath.Join(fc.bundleDir, "config.json"))
   613  		if err != nil {
   614  			log.Errorf("error reading bundle config: %v\n", err)
   615  			return
   616  		}
   617  		containerConfig := &ocispec.Spec{}
   618  		err = json.Unmarshal(bundleConfigJSON, containerConfig)
   619  		if err != nil {
   620  			log.Errorf("error unmarshaling bundle config: %v\n", err)
   621  			return
   622  		}
   623  
   624  		pidFileContent, err := os.ReadFile(fc.pidFile)
   625  		if err != nil {
   626  			log.Errorf("error reading pid file: %v\n", err)
   627  			return
   628  		}
   629  		if len(pidFileContent) == 0 {
   630  			log.Errorf("empty pid file")
   631  			return
   632  		}
   633  		containerPID, err := strconv.ParseUint(string(pidFileContent), 10, 32)
   634  		if err != nil {
   635  			log.Errorf("error parsing pid file: %v\n", err)
   636  			return
   637  		}
   638  
   639  		n.callback(ContainerEvent{
   640  			Type:            EventTypeAddContainer,
   641  			ContainerID:     containerID,
   642  			ContainerPID:    uint32(containerPID),
   643  			ContainerConfig: containerConfig,
   644  			Bundle:          bundleDir,
   645  			ContainerName:   fc.name,
   646  		})
   647  	}
   648  }
   649  
   650  func (n *RuncNotifier) watchRuncIterate() (bool, error) {
   651  	// Get the next event from fanotify.
   652  	// Even though the API allows to pass skipPIDs, we cannot use it here
   653  	// because ResponseAllow would not be called.
   654  	data, err := n.runcBinaryNotify.GetEvent()
   655  	if err != nil {
   656  		return true, fmt.Errorf("%w", err)
   657  	}
   658  
   659  	// data can be nil if the event received is from a process in skipPIDs.
   660  	// In that case, skip and get the next event.
   661  	if data == nil {
   662  		return false, nil
   663  	}
   664  
   665  	// Don't leak the fd received by GetEvent
   666  	defer data.Close()
   667  
   668  	if !data.MatchMask(unix.FAN_OPEN_EXEC_PERM) {
   669  		// This should not happen: FAN_OPEN_EXEC_PERM is the only mask Marked
   670  		return false, fmt.Errorf("fanotify: unknown event on runc: mask=%d pid=%d", data.Mask, data.Pid)
   671  	}
   672  
   673  	// This unblocks the execution
   674  	defer n.runcBinaryNotify.ResponseAllow(data)
   675  
   676  	pid := data.GetPID()
   677  
   678  	// Skip events triggered by ourselves
   679  	if pid == os.Getpid() {
   680  		return false, nil
   681  	}
   682  
   683  	// runc is executing itself with unix.Exec(), so fanotify receives two
   684  	// FAN_OPEN_EXEC_PERM events:
   685  	//   1. from containerd-shim (or similar)
   686  	//   2. from runc, by this re-execution.
   687  	// This filter skips the first one and handles the second one.
   688  	comm := host.GetProcComm(pid)
   689  	cmdlineArr := host.GetProcCmdline(pid)
   690  
   691  	if len(cmdlineArr) == 0 {
   692  		return false, nil
   693  	}
   694  
   695  	switch comm {
   696  	case "conmon":
   697  		// conmon is a special case because it is not a child of the container
   698  		// Also, the calling sequence is podman -> conmon -> runc
   699  		n.parseConmonCmdline(cmdlineArr)
   700  	case "runc", "crun":
   701  		n.parseOCIRuntime(comm, cmdlineArr)
   702  	default:
   703  		return false, nil
   704  	}
   705  
   706  	return false, nil
   707  }
   708  
   709  func (n *RuncNotifier) Close() {
   710  	n.closed.Store(true)
   711  	close(n.done)
   712  	n.runcBinaryNotify.File.Close()
   713  	n.wg.Wait()
   714  }
   715  
   716  func (n *RuncNotifier) lookupFutureContainer(id string) *futureContainer {
   717  	n.futureMu.Lock()
   718  	defer n.futureMu.Unlock()
   719  	fc, ok := n.futureContainers[id]
   720  	if !ok {
   721  		return nil
   722  	}
   723  	delete(n.futureContainers, id)
   724  	return fc
   725  }