github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/standard_init_linux.go (about)

     1  package libcontainer
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"os"
     7  	"os/exec"
     8  
     9  	"github.com/opencontainers/runtime-spec/specs-go"
    10  	"github.com/opencontainers/selinux/go-selinux"
    11  	"github.com/sirupsen/logrus"
    12  	"golang.org/x/sys/unix"
    13  
    14  	"github.com/opencontainers/runc/libcontainer/apparmor"
    15  	"github.com/opencontainers/runc/libcontainer/configs"
    16  	"github.com/opencontainers/runc/libcontainer/keys"
    17  	"github.com/opencontainers/runc/libcontainer/seccomp"
    18  	"github.com/opencontainers/runc/libcontainer/system"
    19  	"github.com/opencontainers/runc/libcontainer/utils"
    20  )
    21  
// linuxStandardInit carries the file handles and configuration that Init
// uses to set up a container's init process and exec the user command.
type linuxStandardInit struct {
	pipe          *syncSocket // sync channel to the parent; Init closes it to signal completion
	consoleSocket *os.File    // handed to setupConsole when config.CreateConsole is set
	pidfdSocket   *os.File    // optional; when non-nil it is handed to setupPidfd
	parentPid     int         // parent PID recorded at start; Init SIGKILLs itself if Getppid differs
	fifoFile      *os.File    // O_PATH fd to the exec fifo; re-opened for writing through /proc
	logPipe       *os.File    // closed by Init so the parent's ForwardLogs can exit
	dmzExe        *os.File    // optional; when non-nil Init fexecve's this fd instead of calling Exec
	config        *initConfig // init configuration consulted throughout Init
}
    32  
    33  func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
    34  	var newperms uint32
    35  
    36  	if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
    37  		// With user ns we need 'other' search permissions.
    38  		newperms = 0x8
    39  	} else {
    40  		// Without user ns we need 'UID' search permissions.
    41  		newperms = 0x80000
    42  	}
    43  
    44  	// Create a unique per session container name that we can join in setns;
    45  	// However, other containers can also join it.
    46  	return "_ses." + l.config.ContainerID, 0xffffffff, newperms
    47  }
    48  
// Init runs the container-side setup for a "standard" init process and then
// executes the user command given in l.config.Args.
//
// In order, it: optionally joins a new session keyring; sets up network and
// routes; prepares the rootfs; sets up the console and pidfd socket when
// requested; finalizes the rootfs (when a mount namespace is in use); applies
// hostname, domainname, AppArmor profile, sysctls, read-only and masked paths;
// applies no_new_privs, scheduler and I/O priority settings; tells the parent
// it is ready; applies seccomp (before or after finalizeNamespace depending on
// NoNewPrivileges); waits on the exec fifo; runs the StartContainer hooks; and
// finally execs the target (through dmzExe when it is set).
//
// The order of these steps is significant; see the inline comments. Any error
// encountered before the final exec is returned to the caller. The final
// return value is whatever system.Fexecve / system.Exec returns (presumably
// these only return on failure — confirm against the system package).
func (l *linuxStandardInit) Init() error {
	// Session keyring setup, skipped entirely when NoNewKeyring is set.
	if !l.config.Config.NoNewKeyring {
		if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
			return err
		}
		defer selinux.SetKeyLabel("") //nolint: errcheck
		ringname, keepperms, newperms := l.getSessionRingParams()

		// Do not inherit the parent's session keyring.
		if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
			// If keyrings aren't supported then it is likely we are on an
			// older kernel (or inside an LXC container). While we could bail,
			// the security feature we are using here is best-effort (it only
			// really provides marginal protection since VFS credentials are
			// the only significant protection of keyrings).
			//
			// TODO(cyphar): Log this so people know what's going on, once we
			//               have proper logging in 'runc init'.
			if !errors.Is(err, unix.ENOSYS) {
				return fmt.Errorf("unable to join session keyring: %w", err)
			}
		} else {
			// Make session keyring searchable. If we've gotten this far we
			// bail on any error -- we don't want to have a keyring with bad
			// permissions.
			if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
				return fmt.Errorf("unable to mod keyring permissions: %w", err)
			}
		}
	}

	if err := setupNetwork(l.config); err != nil {
		return err
	}
	if err := setupRoute(l.config.Config); err != nil {
		return err
	}

	// Initialises the labeling system.
	selinux.GetEnabled()

	err := prepareRootfs(l.pipe, l.config)
	if err != nil {
		return err
	}

	// Set up the console. This has to be done *before* we finalize the rootfs,
	// but *after* we've given the user the chance to set up all of the mounts
	// they wanted.
	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return &os.SyscallError{Syscall: "ioctl(setctty)", Err: err}
		}
	}

	// The pidfd socket is optional; only set it up when one was provided.
	if l.pidfdSocket != nil {
		if err := setupPidfd(l.pidfdSocket, "standard"); err != nil {
			return fmt.Errorf("failed to setup pidfd: %w", err)
		}
	}

	// Finish the rootfs setup.
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := finalizeRootfs(l.config.Config); err != nil {
			return err
		}
	}

	// Hostname and domainname are only changed when explicitly configured.
	if hostname := l.config.Config.Hostname; hostname != "" {
		if err := unix.Sethostname([]byte(hostname)); err != nil {
			return &os.SyscallError{Syscall: "sethostname", Err: err}
		}
	}
	if domainname := l.config.Config.Domainname; domainname != "" {
		if err := unix.Setdomainname([]byte(domainname)); err != nil {
			return &os.SyscallError{Syscall: "setdomainname", Err: err}
		}
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return fmt.Errorf("unable to apply apparmor profile: %w", err)
	}

	for key, value := range l.config.Config.Sysctl {
		if err := writeSystemProperty(key, value); err != nil {
			return err
		}
	}
	for _, path := range l.config.Config.ReadonlyPaths {
		if err := readonlyPath(path); err != nil {
			return fmt.Errorf("can't make %q read-only: %w", path, err)
		}
	}
	for _, path := range l.config.Config.MaskPaths {
		if err := maskPath(path, l.config.Config.MountLabel); err != nil {
			return fmt.Errorf("can't mask path %s: %w", path, err)
		}
	}
	// Record the parent death signal now so that it can be restored after
	// finalizeNamespace below.
	pdeath, err := system.GetParentDeathSignal()
	if err != nil {
		return fmt.Errorf("can't get pdeath signal: %w", err)
	}
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err}
		}
	}

	if l.config.Config.Scheduler != nil {
		if err := setupScheduler(l.config.Config); err != nil {
			return err
		}
	}
	if l.config.Config.IOPriority != nil {
		if err := setIOPriority(l.config.Config.IOPriority); err != nil {
			return err
		}
	}

	// Tell our parent that we're ready to exec. This must be done before the
	// Seccomp rules have been applied, because we need to be able to read and
	// write to a socket.
	if err := syncParentReady(l.pipe); err != nil {
		return fmt.Errorf("sync ready: %w", err)
	}
	if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
		return fmt.Errorf("can't set process label: %w", err)
	}
	defer selinux.SetExecLabel("") //nolint: errcheck
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
		if err != nil {
			return err
		}

		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	// finalizeNamespace can change user/group which clears the parent death
	// signal, so we restore it here.
	if err := pdeath.Restore(); err != nil {
		return fmt.Errorf("can't restore pdeath signal: %w", err)
	}
	// Compare the parent from the initial start of the init process and make
	// sure that it did not change. If the parent changed, that means it died
	// and we were reparented to something else, so we should just kill
	// ourselves and not cause problems for someone else.
	if unix.Getppid() != l.parentPid {
		return unix.Kill(unix.Getpid(), unix.SIGKILL)
	}
	// Check for the arg before waiting to make sure it exists and it is
	// returned as a create time error.
	name, err := exec.LookPath(l.config.Args[0])
	if err != nil {
		return err
	}
	// exec.LookPath in Go < 1.20 might return no error for an executable
	// residing on a file system mounted with noexec flag, so perform this
	// extra check now while we can still return a proper error.
	// TODO: remove this once go < 1.20 is not supported.
	if err := eaccess(name); err != nil {
		return &os.PathError{Op: "eaccess", Path: name, Err: err}
	}

	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles). However, this needs to be done
	// before closing the pipe since we need it to pass the seccompFd to
	// the parent.
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
		if err != nil {
			return fmt.Errorf("unable to init seccomp: %w", err)
		}

		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
			return err
		}
	}

	// Set personality if specified.
	if l.config.Config.Personality != nil {
		if err := setupPersonality(l.config.Config); err != nil {
			return err
		}
	}

	// Close the pipe to signal that we have completed our init. The error
	// from Close is deliberately ignored.
	logrus.Debugf("init: closing the pipe to signal completion")
	_ = l.pipe.Close()

	// Close the log pipe fd so the parent's ForwardLogs can exit.
	logrus.Debugf("init: about to wait on exec fifo")
	if err := l.logPipe.Close(); err != nil {
		return fmt.Errorf("close log pipe: %w", err)
	}

	// Build a /proc path that refers to the fifo fd; closer is deferred so
	// it runs on any return from this point on.
	fifoPath, closer := utils.ProcThreadSelfFd(l.fifoFile.Fd())
	defer closer()

	// Wait for the FIFO to be opened on the other side before exec-ing the
	// user process. We open it through /proc/self/fd/$fd, because the fd that
	// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
	// re-open an O_PATH fd through /proc.
	fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0)
	if err != nil {
		return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err}
	}
	if _, err := unix.Write(fd, []byte("0")); err != nil {
		return &os.PathError{Op: "write exec fifo", Path: fifoPath, Err: err}
	}

	// Close the O_PATH fifofd fd before exec because the kernel resets
	// dumpable in the wrong order. This has been fixed in newer kernels, but
	// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
	// N.B. the core issue itself (passing dirfds to the host filesystem) has
	// since been resolved.
	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
	_ = l.fifoFile.Close()

	// Run the StartContainer hooks with the state marked as "created" and
	// the pid set to this process.
	s := l.config.SpecState
	s.Pid = unix.Getpid()
	s.Status = specs.StateCreated
	if err := l.config.Config.Hooks.Run(configs.StartContainer, s); err != nil {
		return err
	}

	// When a dmz executable was provided, exec it by fd, passing the resolved
	// target path as Args[0].
	if l.dmzExe != nil {
		l.config.Args[0] = name
		return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
	}
	// Close all file descriptors we are not passing to the container. This is
	// necessary because the execve target could use internal runc fds as the
	// execve path, potentially giving access to binary files from the host
	// (which can then be opened by container processes, leading to container
	// escapes). Note that because this operation will close any open file
	// descriptors that are referenced by (*os.File) handles from underneath
	// the Go runtime, we must not do any file operations after this point
	// (otherwise the (*os.File) finaliser could close the wrong file). See
	// CVE-2024-21626 for more information as to why this protection is
	// necessary.
	//
	// This is not needed for runc-dmz, because the extra execve(2) step means
	// that all O_CLOEXEC file descriptors have already been closed and thus
	// the second execve(2) from runc-dmz cannot access internal file
	// descriptors from runc.
	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
		return err
	}
	return system.Exec(name, l.config.Args, os.Environ())
}