github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/setns_init_linux.go (about)

     1  package libcontainer
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"os"
     7  	"os/exec"
     8  
     9  	"github.com/opencontainers/selinux/go-selinux"
    10  	"github.com/sirupsen/logrus"
    11  	"golang.org/x/sys/unix"
    12  
    13  	"github.com/opencontainers/runc/libcontainer/apparmor"
    14  	"github.com/opencontainers/runc/libcontainer/keys"
    15  	"github.com/opencontainers/runc/libcontainer/seccomp"
    16  	"github.com/opencontainers/runc/libcontainer/system"
    17  	"github.com/opencontainers/runc/libcontainer/utils"
    18  )
    19  
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
//
// All fields are consumed by Init; the per-field notes below describe only what
// Init does with them.
type linuxSetnsInit struct {
	pipe          *syncSocket // sync channel to the parent; Init closes it just before exec
	consoleSocket *os.File    // handed to setupConsole when config.CreateConsole is set
	pidfdSocket   *os.File    // optional; when non-nil it is handed to setupPidfd
	config        *initConfig // settings that drive each step of Init
	logPipe       *os.File    // closed by Init right before exec
	dmzExe        *os.File    // optional; when non-nil Init execs through its fd via system.Fexecve
}
    30  
    31  func (l *linuxSetnsInit) getSessionRingName() string {
    32  	return "_ses." + l.config.ContainerID
    33  }
    34  
// Init prepares the current process to run inside an already existing
// container and finally executes the command given in l.config.Args.
//
// The steps run in a fixed order: session keyring, console, pidfd, the
// no_new_privs bit, umask, scheduler, the "ready" sync with the parent, the
// SELinux exec label, seccomp (early when NoNewPrivileges is unset, otherwise
// right before the exec), finalizeNamespace, AppArmor, personality, lookup of
// the executable, closing of the sync and log pipes, and the exec itself.
//
// The first failing step aborts Init and its error is returned. The last
// statement returns whatever system.Fexecve or system.Exec returns; presumably
// that is only reached on failure, since a successful execve replaces the
// process image (those helpers are not visible in this file — confirm there).
//
// Init indexes l.config.Args[0] without a length check, so Args must be
// non-empty.
func (l *linuxSetnsInit) Init() error {
	if !l.config.Config.NoNewKeyring {
		// Set the SELinux key label to the process label before joining the
		// keyring; the deferred call resets it to the empty string when Init
		// returns.
		if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
			return err
		}
		defer selinux.SetKeyLabel("") //nolint: errcheck
		// Do not inherit the parent's session keyring.
		if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
			// Same justification as in standard_init_linux.go as to why we
			// don't bail on ENOSYS.
			//
			// TODO(cyphar): And we should have logging here too.
			if !errors.Is(err, unix.ENOSYS) {
				return fmt.Errorf("unable to join session keyring: %w", err)
			}
		}
	}

	// Console setup is only requested when CreateConsole is set: the console
	// socket is handed to setupConsole, followed by system.Setctty.
	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return err
		}
	}
	// The pidfd socket is optional; it is only used when the caller set it.
	if l.pidfdSocket != nil {
		if err := setupPidfd(l.pidfdSocket, "setns"); err != nil {
			return fmt.Errorf("failed to setup pidfd: %w", err)
		}
	}
	// Set the no_new_privs bit via prctl(2) when requested. This flag also
	// decides below at which point seccomp is initialized.
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return err
		}
	}
	// Apply the configured umask, if any; the value returned by unix.Umask
	// is not used.
	if l.config.Config.Umask != nil {
		unix.Umask(int(*l.config.Config.Umask))
	}

	if l.config.Config.Scheduler != nil {
		if err := setupScheduler(l.config.Config); err != nil {
			return err
		}
	}

	// Tell our parent that we're ready to exec. This must be done before the
	// Seccomp rules have been applied, because we need to be able to read and
	// write to a socket.
	if err := syncParentReady(l.pipe); err != nil {
		return fmt.Errorf("sync ready: %w", err)
	}

	// Set the SELinux exec label to the process label; the deferred call
	// resets it to the empty string if Init returns (i.e. the exec did not
	// happen).
	if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
		return err
	}
	defer selinux.SetExecLabel("") //nolint: errcheck
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
		if err != nil {
			return err
		}
		// Hand the fd returned by InitSeccomp to the parent over the sync pipe.
		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return err
	}
	if l.config.Config.Personality != nil {
		if err := setupPersonality(l.config.Config); err != nil {
			return err
		}
	}
	// Check for the arg early to make sure it exists.
	name, err := exec.LookPath(l.config.Args[0])
	if err != nil {
		return err
	}
	// exec.LookPath in Go < 1.20 might return no error for an executable
	// residing on a file system mounted with noexec flag, so perform this
	// extra check now while we can still return a proper error.
	// TODO: remove this once go < 1.20 is not supported.
	if err := eaccess(name); err != nil {
		return &os.PathError{Op: "eaccess", Path: name, Err: err}
	}
	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles).
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
		if err != nil {
			return fmt.Errorf("unable to init seccomp: %w", err)
		}
		// Hand the fd returned by InitSeccomp to the parent over the sync pipe.
		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
			return err
		}
	}

	// Close the pipe to signal that we have completed our init.
	// Please keep this because we don't want to get a pipe write error if
	// there is an error from `execve` after all fds closed.
	_ = l.pipe.Close()

	// Close the log pipe fd so the parent's ForwardLogs can exit.
	logrus.Debugf("setns_init: about to exec")
	if err := l.logPipe.Close(); err != nil {
		return fmt.Errorf("close log pipe: %w", err)
	}

	// With a dmz executable, exec through its file descriptor and pass the
	// resolved path as Args[0]. Presumably the dmz binary then execs that
	// path itself (see the runc-dmz note below) — confirm in system.Fexecve
	// and runc-dmz.
	if l.dmzExe != nil {
		l.config.Args[0] = name
		return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
	}
	// Close all file descriptors we are not passing to the container. This is
	// necessary because the execve target could use internal runc fds as the
	// execve path, potentially giving access to binary files from the host
	// (which can then be opened by container processes, leading to container
	// escapes). Note that because this operation will close any open file
	// descriptors that are referenced by (*os.File) handles from underneath
	// the Go runtime, we must not do any file operations after this point
	// (otherwise the (*os.File) finaliser could close the wrong file). See
	// CVE-2024-21626 for more information as to why this protection is
	// necessary.
	//
	// This is not needed for runc-dmz, because the extra execve(2) step means
	// that all O_CLOEXEC file descriptors have already been closed and thus
	// the second execve(2) from runc-dmz cannot access internal file
	// descriptors from runc.
	//
	// NOTE(review): the "+ 3" presumably skips stdin, stdout and stderr
	// (fds 0-2) ahead of the passed files — confirm against
	// utils.UnsafeCloseFrom.
	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
		return err
	}
	return system.Exec(name, l.config.Args, os.Environ())
}