github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/setns_init_linux.go (about) 1 package libcontainer 2 3 import ( 4 "errors" 5 "fmt" 6 "os" 7 "os/exec" 8 9 "github.com/opencontainers/selinux/go-selinux" 10 "github.com/sirupsen/logrus" 11 "golang.org/x/sys/unix" 12 13 "github.com/opencontainers/runc/libcontainer/apparmor" 14 "github.com/opencontainers/runc/libcontainer/keys" 15 "github.com/opencontainers/runc/libcontainer/seccomp" 16 "github.com/opencontainers/runc/libcontainer/system" 17 "github.com/opencontainers/runc/libcontainer/utils" 18 ) 19 20 // linuxSetnsInit performs the container's initialization for running a new process 21 // inside an existing container. 22 type linuxSetnsInit struct { 23 pipe *syncSocket 24 consoleSocket *os.File 25 pidfdSocket *os.File 26 config *initConfig 27 logPipe *os.File 28 dmzExe *os.File 29 } 30 31 func (l *linuxSetnsInit) getSessionRingName() string { 32 return "_ses." + l.config.ContainerID 33 } 34 35 func (l *linuxSetnsInit) Init() error { 36 if !l.config.Config.NoNewKeyring { 37 if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil { 38 return err 39 } 40 defer selinux.SetKeyLabel("") //nolint: errcheck 41 // Do not inherit the parent's session keyring. 42 if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil { 43 // Same justification as in standart_init_linux.go as to why we 44 // don't bail on ENOSYS. 45 // 46 // TODO(cyphar): And we should have logging here too. 47 if !errors.Is(err, unix.ENOSYS) { 48 return fmt.Errorf("unable to join session keyring: %w", err) 49 } 50 } 51 } 52 53 if l.config.CreateConsole { 54 if err := setupConsole(l.consoleSocket, l.config, false); err != nil { 55 return err 56 } 57 if err := system.Setctty(); err != nil { 58 return err 59 } 60 } 61 if l.pidfdSocket != nil { 62 if err := setupPidfd(l.pidfdSocket, "setns"); err != nil { 63 return fmt.Errorf("failed to setup pidfd: %w", err) 64 } 65 } 66 if l.config.NoNewPrivileges { 67 if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { 68 return err 69 } 70 } 71 if l.config.Config.Umask != nil { 72 unix.Umask(int(*l.config.Config.Umask)) 73 } 74 75 if l.config.Config.Scheduler != nil { 76 if err := setupScheduler(l.config.Config); err != nil { 77 return err 78 } 79 } 80 81 // Tell our parent that we're ready to exec. This must be done before the 82 // Seccomp rules have been applied, because we need to be able to read and 83 // write to a socket. 84 if err := syncParentReady(l.pipe); err != nil { 85 return fmt.Errorf("sync ready: %w", err) 86 } 87 88 if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil { 89 return err 90 } 91 defer selinux.SetExecLabel("") //nolint: errcheck 92 // Without NoNewPrivileges seccomp is a privileged operation, so we need to 93 // do this before dropping capabilities; otherwise do it as late as possible 94 // just before execve so as few syscalls take place after it as possible. 95 if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { 96 seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp) 97 if err != nil { 98 return err 99 } 100 if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { 101 return err 102 } 103 } 104 if err := finalizeNamespace(l.config); err != nil { 105 return err 106 } 107 if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { 108 return err 109 } 110 if l.config.Config.Personality != nil { 111 if err := setupPersonality(l.config.Config); err != nil { 112 return err 113 } 114 } 115 // Check for the arg early to make sure it exists. 116 name, err := exec.LookPath(l.config.Args[0]) 117 if err != nil { 118 return err 119 } 120 // exec.LookPath in Go < 1.20 might return no error for an executable 121 // residing on a file system mounted with noexec flag, so perform this 122 // extra check now while we can still return a proper error. 123 // TODO: remove this once go < 1.20 is not supported. 124 if err := eaccess(name); err != nil { 125 return &os.PathError{Op: "eaccess", Path: name, Err: err} 126 } 127 // Set seccomp as close to execve as possible, so as few syscalls take 128 // place afterward (reducing the amount of syscalls that users need to 129 // enable in their seccomp profiles). 130 if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { 131 seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp) 132 if err != nil { 133 return fmt.Errorf("unable to init seccomp: %w", err) 134 } 135 if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { 136 return err 137 } 138 } 139 140 // Close the pipe to signal that we have completed our init. 141 // Please keep this because we don't want to get a pipe write error if 142 // there is an error from `execve` after all fds closed. 143 _ = l.pipe.Close() 144 145 // Close the log pipe fd so the parent's ForwardLogs can exit. 146 logrus.Debugf("setns_init: about to exec") 147 if err := l.logPipe.Close(); err != nil { 148 return fmt.Errorf("close log pipe: %w", err) 149 } 150 151 if l.dmzExe != nil { 152 l.config.Args[0] = name 153 return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ()) 154 } 155 // Close all file descriptors we are not passing to the container. This is 156 // necessary because the execve target could use internal runc fds as the 157 // execve path, potentially giving access to binary files from the host 158 // (which can then be opened by container processes, leading to container 159 // escapes). Note that because this operation will close any open file 160 // descriptors that are referenced by (*os.File) handles from underneath 161 // the Go runtime, we must not do any file operations after this point 162 // (otherwise the (*os.File) finaliser could close the wrong file). See 163 // CVE-2024-21626 for more information as to why this protection is 164 // necessary. 165 // 166 // This is not needed for runc-dmz, because the extra execve(2) step means 167 // that all O_CLOEXEC file descriptors have already been closed and thus 168 // the second execve(2) from runc-dmz cannot access internal file 169 // descriptors from runc. 170 if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { 171 return err 172 } 173 return system.Exec(name, l.config.Args, os.Environ()) 174 }