// Source: github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/standard_init_linux.go

package libcontainer

import (
	"errors"
	"fmt"
	"os"
	"os/exec"

	"github.com/opencontainers/runtime-spec/specs-go"
	"github.com/opencontainers/selinux/go-selinux"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/apparmor"
	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/keys"
	"github.com/opencontainers/runc/libcontainer/seccomp"
	"github.com/opencontainers/runc/libcontainer/system"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// linuxStandardInit carries the state used by Init to set up and exec the
// container's first process.
//
// Field usage, as seen in Init:
//   - pipe: sync socket to the parent; passed to prepareRootfs,
//     syncParentReady and syncParentSeccomp, and closed to signal that init
//     has completed.
//   - consoleSocket: passed to setupConsole when config.CreateConsole is set.
//   - pidfdSocket: optional; when non-nil it is passed to setupPidfd.
//   - parentPid: compared against unix.Getppid() to detect that the original
//     parent died and we were reparented.
//   - fifoFile: O_PATH fd to the exec fifo; re-opened for writing through
//     /proc and closed before exec.
//   - logPipe: closed before waiting on the exec fifo so the parent's
//     ForwardLogs can exit.
//   - dmzExe: optional; when non-nil the final exec goes through
//     system.Fexecve on this file instead of system.Exec.
//   - config: the init configuration driving every step below.
type linuxStandardInit struct {
	pipe          *syncSocket
	consoleSocket *os.File
	pidfdSocket   *os.File
	parentPid     int
	fifoFile      *os.File
	logPipe       *os.File
	dmzExe        *os.File
	config        *initConfig
}

// getSessionRingParams returns the parameters used to join and re-permission
// the container's session keyring, in this order:
//
//  1. the keyring name, "_ses." followed by the container ID;
//  2. the mask of existing permission bits to keep (all bits, 0xffffffff);
//  3. the permission bits to add, which depend on whether the container has
//     its own user namespace.
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
	var newperms uint32

	if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
		// With user ns we need 'other' search permissions.
		newperms = 0x8
	} else {
		// Without user ns we need 'UID' search permissions.
		newperms = 0x80000
	}

	// Create a unique per session container name that we can join in setns;
	// However, other containers can also join it.
	return "_ses." + l.config.ContainerID, 0xffffffff, newperms
}

// Init performs the in-container setup for the standard init process and then
// execs the user-specified program (l.config.Args).
//
// The steps run strictly in the order written; several of them depend on that
// order (see the inline comments). In outline:
//
//  1. Unless NoNewKeyring is set, join a fresh session keyring and adjust its
//     permissions.
//  2. Set up network and routes, prepare the rootfs, the console and the
//     optional pidfd socket, then finalize the rootfs (only with a mount
//     namespace).
//  3. Apply hostname, domainname, the AppArmor profile, sysctls, read-only and
//     masked paths, no_new_privs, scheduler and I/O priority.
//  4. Tell the parent we are ready, set the SELinux exec label, and initialise
//     seccomp either before finalizeNamespace (without NoNewPrivileges) or
//     as late as possible (with NoNewPrivileges).
//  5. Restore the parent-death signal, verify the parent has not changed, and
//     resolve the executable.
//  6. Close the sync and log pipes, write to the exec fifo, run the
//     StartContainer hooks and exec.
//
// Any failing step returns its error to the caller. The final statement on
// both paths is the exec call itself, whose result is returned as-is.
func (l *linuxStandardInit) Init() error {
	if !l.config.Config.NoNewKeyring {
		// The key label is set for the duration of the keyring setup and reset
		// to "" when Init returns.
		if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
			return err
		}
		defer selinux.SetKeyLabel("") //nolint: errcheck
		ringname, keepperms, newperms := l.getSessionRingParams()

		// Do not inherit the parent's session keyring.
		if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
			// If keyrings aren't supported then it is likely we are on an
			// older kernel (or inside an LXC container). While we could bail,
			// the security feature we are using here is best-effort (it only
			// really provides marginal protection since VFS credentials are
			// the only significant protection of keyrings).
			//
			// TODO(cyphar): Log this so people know what's going on, once we
			//               have proper logging in 'runc init'.
			if !errors.Is(err, unix.ENOSYS) {
				return fmt.Errorf("unable to join session keyring: %w", err)
			}
		} else {
			// Make session keyring searchable. If we've gotten this far we
			// bail on any error -- we don't want to have a keyring with bad
			// permissions.
			if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
				return fmt.Errorf("unable to mod keyring permissions: %w", err)
			}
		}
	}

	if err := setupNetwork(l.config); err != nil {
		return err
	}
	if err := setupRoute(l.config.Config); err != nil {
		return err
	}

	// Initialises the labeling system. The returned value is intentionally
	// unused; the call is made only for its side effect.
	selinux.GetEnabled()

	err := prepareRootfs(l.pipe, l.config)
	if err != nil {
		return err
	}

	// Set up the console. This has to be done *before* we finalize the rootfs,
	// but *after* we've given the user the chance to set up all of the mounts
	// they wanted.
	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return &os.SyscallError{Syscall: "ioctl(setctty)", Err: err}
		}
	}

	// The pidfd socket is optional; it is only set up when one was provided.
	if l.pidfdSocket != nil {
		if err := setupPidfd(l.pidfdSocket, "standard"); err != nil {
			return fmt.Errorf("failed to setup pidfd: %w", err)
		}
	}

	// Finish the rootfs setup.
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := finalizeRootfs(l.config.Config); err != nil {
			return err
		}
	}

	// Hostname and domainname are only set when configured (non-empty).
	if hostname := l.config.Config.Hostname; hostname != "" {
		if err := unix.Sethostname([]byte(hostname)); err != nil {
			return &os.SyscallError{Syscall: "sethostname", Err: err}
		}
	}
	if domainname := l.config.Config.Domainname; domainname != "" {
		if err := unix.Setdomainname([]byte(domainname)); err != nil {
			return &os.SyscallError{Syscall: "setdomainname", Err: err}
		}
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return fmt.Errorf("unable to apply apparmor profile: %w", err)
	}

	for key, value := range l.config.Config.Sysctl {
		if err := writeSystemProperty(key, value); err != nil {
			return err
		}
	}
	for _, path := range l.config.Config.ReadonlyPaths {
		if err := readonlyPath(path); err != nil {
			return fmt.Errorf("can't make %q read-only: %w", path, err)
		}
	}
	for _, path := range l.config.Config.MaskPaths {
		if err := maskPath(path, l.config.Config.MountLabel); err != nil {
			return fmt.Errorf("can't mask path %s: %w", path, err)
		}
	}
	// Capture the parent death signal now so that it can be restored after
	// finalizeNamespace (see pdeath.Restore below).
	pdeath, err := system.GetParentDeathSignal()
	if err != nil {
		return fmt.Errorf("can't get pdeath signal: %w", err)
	}
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err}
		}
	}

	if l.config.Config.Scheduler != nil {
		if err := setupScheduler(l.config.Config); err != nil {
			return err
		}
	}
	if l.config.Config.IOPriority != nil {
		if err := setIOPriority(l.config.Config.IOPriority); err != nil {
			return err
		}
	}

	// Tell our parent that we're ready to exec. This must be done before the
	// Seccomp rules have been applied, because we need to be able to read and
	// write to a socket.
	if err := syncParentReady(l.pipe); err != nil {
		return fmt.Errorf("sync ready: %w", err)
	}
	// The exec label is reset to "" when Init returns.
	if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
		return fmt.Errorf("can't set process label: %w", err)
	}
	defer selinux.SetExecLabel("") //nolint: errcheck
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
		if err != nil {
			return err
		}

		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	// finalizeNamespace can change user/group which clears the parent death
	// signal, so we restore it here.
	if err := pdeath.Restore(); err != nil {
		return fmt.Errorf("can't restore pdeath signal: %w", err)
	}
	// Compare the parent from the initial start of the init process and make
	// sure that it did not change. If the parent changes that means it died
	// and we were reparented to something else so we should just kill ourselves
	// and not cause problems for someone else.
	if unix.Getppid() != l.parentPid {
		return unix.Kill(unix.Getpid(), unix.SIGKILL)
	}
	// Check for the arg before waiting to make sure it exists and it is
	// returned as a create time error.
	name, err := exec.LookPath(l.config.Args[0])
	if err != nil {
		return err
	}
	// exec.LookPath in Go < 1.20 might return no error for an executable
	// residing on a file system mounted with noexec flag, so perform this
	// extra check now while we can still return a proper error.
	// TODO: remove this once go < 1.20 is not supported.
	if err := eaccess(name); err != nil {
		return &os.PathError{Op: "eaccess", Path: name, Err: err}
	}

	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles). However, this needs to be done
	// before closing the pipe since we need it to pass the seccompFd to
	// the parent.
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
		if err != nil {
			return fmt.Errorf("unable to init seccomp: %w", err)
		}

		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
			return err
		}
	}

	// Set personality if specified.
	if l.config.Config.Personality != nil {
		if err := setupPersonality(l.config.Config); err != nil {
			return err
		}
	}

	// Close the pipe to signal that we have completed our init.
	// The close error is deliberately ignored.
	logrus.Debugf("init: closing the pipe to signal completion")
	_ = l.pipe.Close()

	// Close the log pipe fd so the parent's ForwardLogs can exit.
	logrus.Debugf("init: about to wait on exec fifo")
	if err := l.logPipe.Close(); err != nil {
		return fmt.Errorf("close log pipe: %w", err)
	}

	// closer is deferred, so it runs when Init returns.
	fifoPath, closer := utils.ProcThreadSelfFd(l.fifoFile.Fd())
	defer closer()

	// Wait for the FIFO to be opened on the other side before exec-ing the
	// user process. We open it through /proc/self/fd/$fd, because the fd that
	// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
	// re-open an O_PATH fd through /proc.
	fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0)
	if err != nil {
		return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err}
	}
	// fd is not closed explicitly here; it was opened with O_CLOEXEC.
	if _, err := unix.Write(fd, []byte("0")); err != nil {
		return &os.PathError{Op: "write exec fifo", Path: fifoPath, Err: err}
	}

	// Close the O_PATH fifofd fd before exec because the kernel resets
	// dumpable in the wrong order. This has been fixed in newer kernels, but
	// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
	// N.B. the core issue itself (passing dirfds to the host filesystem) has
	// since been resolved.
	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
	_ = l.fifoFile.Close()

	// Run the StartContainer hooks with the state updated to this process's
	// pid and the "created" status.
	s := l.config.SpecState
	s.Pid = unix.Getpid()
	s.Status = specs.StateCreated
	if err := l.config.Config.Hooks.Run(configs.StartContainer, s); err != nil {
		return err
	}

	if l.dmzExe != nil {
		// NOTE(review): Args[0] is replaced with the resolved path before
		// handing off to the dmz binary -- presumably so that it can exec the
		// target without another PATH lookup; confirm against runc-dmz.
		l.config.Args[0] = name
		return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
	}
	// Close all file descriptors we are not passing to the container. This is
	// necessary because the execve target could use internal runc fds as the
	// execve path, potentially giving access to binary files from the host
	// (which can then be opened by container processes, leading to container
	// escapes). Note that because this operation will close any open file
	// descriptors that are referenced by (*os.File) handles from underneath
	// the Go runtime, we must not do any file operations after this point
	// (otherwise the (*os.File) finaliser could close the wrong file). See
	// CVE-2024-21626 for more information as to why this protection is
	// necessary.
	//
	// This is not needed for runc-dmz, because the extra execve(2) step means
	// that all O_CLOEXEC file descriptors have already been closed and thus
	// the second execve(2) from runc-dmz cannot access internal file
	// descriptors from runc.
	//
	// NOTE(review): the "+ 3" presumably skips stdin, stdout and stderr --
	// confirm against utils.UnsafeCloseFrom.
	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
		return err
	}
	return system.Exec(name, l.config.Args, os.Environ())
}