github.com/rootless-containers/rootlesskit/v2@v2.3.4/pkg/child/child.go (about) 1 package child 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "os" 8 "os/exec" 9 "os/signal" 10 "path/filepath" 11 "runtime" 12 "strconv" 13 "syscall" 14 "time" 15 16 "github.com/containernetworking/plugins/pkg/ns" 17 "github.com/rootless-containers/rootlesskit/v2/pkg/common" 18 "github.com/rootless-containers/rootlesskit/v2/pkg/copyup" 19 "github.com/rootless-containers/rootlesskit/v2/pkg/messages" 20 "github.com/rootless-containers/rootlesskit/v2/pkg/network" 21 "github.com/rootless-containers/rootlesskit/v2/pkg/port" 22 "github.com/rootless-containers/rootlesskit/v2/pkg/sigproxy" 23 sigproxysignal "github.com/rootless-containers/rootlesskit/v2/pkg/sigproxy/signal" 24 "github.com/sirupsen/logrus" 25 "golang.org/x/sys/unix" 26 ) 27 28 var propagationStates = map[string]uintptr{ 29 "private": uintptr(unix.MS_PRIVATE), 30 "rprivate": uintptr(unix.MS_REC | unix.MS_PRIVATE), 31 "shared": uintptr(unix.MS_SHARED), 32 "rshared": uintptr(unix.MS_REC | unix.MS_SHARED), 33 "slave": uintptr(unix.MS_SLAVE), 34 "rslave": uintptr(unix.MS_REC | unix.MS_SLAVE), 35 } 36 37 func setupFiles(cmd *exec.Cmd) { 38 // 0 1 and 2 are used for stdin. stdout, and stderr 39 const firstExtraFD = 3 40 systemdActivationFDs := 0 41 // check for systemd socket activation sockets 42 if v := os.Getenv("LISTEN_FDS"); v != "" { 43 if num, err := strconv.Atoi(v); err == nil { 44 systemdActivationFDs = num 45 cmd.ExtraFiles = make([]*os.File, systemdActivationFDs) 46 } 47 } 48 for fd := 0; fd < systemdActivationFDs; fd++ { 49 cmd.ExtraFiles[fd] = os.NewFile(uintptr(firstExtraFD+fd), "") 50 } 51 } 52 53 func createCmd(opt Opt) (*exec.Cmd, error) { 54 fixListenPidEnv, err := strconv.ParseBool(os.Getenv(opt.ChildUseActivationEnvKey)) 55 if err != nil { 56 fixListenPidEnv = false 57 } 58 os.Unsetenv(opt.ChildUseActivationEnvKey) 59 targetCmd := opt.TargetCmd 60 var cmd *exec.Cmd 61 cmdEnv := os.Environ() 62 if fixListenPidEnv { 63 cmd = exec.Command("/proc/self/exe", os.Args[1:]...) 64 cmdEnv = append(cmdEnv, opt.RunActivationHelperEnvKey+"=true") 65 } else { 66 var args []string 67 if len(targetCmd) > 1 { 68 args = targetCmd[1:] 69 } 70 cmd = exec.Command(targetCmd[0], args...) 71 } 72 cmd.Stdin = os.Stdin 73 cmd.Stdout = os.Stdout 74 cmd.Stderr = os.Stderr 75 cmd.Env = cmdEnv 76 cmd.SysProcAttr = &syscall.SysProcAttr{ 77 Pdeathsig: syscall.SIGKILL, 78 } 79 setupFiles(cmd) 80 return cmd, nil 81 } 82 83 // mountSysfs is needed for mounting /sys/class/net 84 // when netns is unshared. 85 func mountSysfs(hostNetwork, evacuateCgroup2 bool) error { 86 const cgroupDir = "/sys/fs/cgroup" 87 if hostNetwork { 88 if evacuateCgroup2 { 89 // We need to mount tmpfs before cgroup2 to avoid EBUSY 90 if err := unix.Mount("none", cgroupDir, "tmpfs", 0, ""); err != nil { 91 return fmt.Errorf("failed to mount tmpfs on %s: %w", cgroupDir, err) 92 } 93 if err := unix.Mount("none", cgroupDir, "cgroup2", 0, ""); err != nil { 94 return fmt.Errorf("failed to mount cgroup2 on %s: %w", cgroupDir, err) 95 } 96 } 97 // NOP 98 return nil 99 } 100 101 tmp, err := os.MkdirTemp("/tmp", "rksys") 102 if err != nil { 103 return fmt.Errorf("creating a directory under /tmp: %w", err) 104 } 105 defer os.RemoveAll(tmp) 106 if !evacuateCgroup2 { 107 if err := unix.Mount(cgroupDir, tmp, "", uintptr(unix.MS_BIND|unix.MS_REC), ""); err != nil { 108 return fmt.Errorf("failed to create bind mount on %s: %w", cgroupDir, err) 109 } 110 } 111 112 if err := unix.Mount("none", "/sys", "sysfs", 0, ""); err != nil { 113 // when the sysfs in the parent namespace is RO, 114 // we can't mount RW sysfs even in the child namespace. 115 // https://github.com/rootless-containers/rootlesskit/pull/23#issuecomment-429292632 116 // https://github.com/torvalds/linux/blob/9f203e2f2f065cd74553e6474f0ae3675f39fb0f/fs/namespace.c#L3326-L3328 117 logrus.Warnf("failed to mount sysfs, falling back to read-only mount: %v", err) 118 if err := unix.Mount("none", "/sys", "sysfs", uintptr(unix.MS_RDONLY), ""); err != nil { 119 // when /sys/firmware is masked, even RO sysfs can't be mounted 120 logrus.Warnf("failed to mount sysfs: %v", err) 121 } 122 } 123 if evacuateCgroup2 { 124 if err := unix.Mount("none", cgroupDir, "cgroup2", 0, ""); err != nil { 125 return fmt.Errorf("failed to mount cgroup2 on %s: %w", cgroupDir, err) 126 } 127 } else { 128 if err := unix.Mount(tmp, cgroupDir, "", uintptr(unix.MS_MOVE), ""); err != nil { 129 return fmt.Errorf("failed to move mount point from %s to %s: %w", tmp, cgroupDir, err) 130 } 131 } 132 return nil 133 } 134 135 func mountProcfs() error { 136 if err := unix.Mount("none", "/proc", "proc", 0, ""); err != nil { 137 logrus.Warnf("failed to mount procfs, falling back to read-only mount: %v", err) 138 if err := unix.Mount("none", "/proc", "proc", uintptr(unix.MS_RDONLY), ""); err != nil { 139 logrus.Warnf("failed to mount procfs: %v", err) 140 } 141 } 142 return nil 143 } 144 145 func activateLoopback() error { 146 cmds := [][]string{ 147 {"ip", "link", "set", "lo", "up"}, 148 } 149 if err := common.Execs(os.Stderr, os.Environ(), cmds); err != nil { 150 return fmt.Errorf("executing %v: %w", cmds, err) 151 } 152 return nil 153 } 154 155 func activateDev(dev, ip string, netmask int, gateway string, mtu int) error { 156 cmds := [][]string{ 157 {"ip", "link", "set", dev, "up"}, 158 {"ip", "link", "set", "dev", dev, "mtu", strconv.Itoa(mtu)}, 159 {"ip", "addr", "add", ip + "/" + strconv.Itoa(netmask), "dev", dev}, 160 {"ip", "route", "add", "default", "via", gateway, "dev", dev}, 161 } 162 if err := common.Execs(os.Stderr, os.Environ(), cmds); err != nil { 163 return fmt.Errorf("executing %v: %w", cmds, err) 164 } 165 return nil 166 } 167 168 func setupCopyDir(driver copyup.ChildDriver, dirs []string) (bool, error) { 169 if driver != nil { 170 etcWasCopied := false 171 copied, err := driver.CopyUp(dirs) 172 for _, d := range copied { 173 if d == "/etc" { 174 etcWasCopied = true 175 break 176 } 177 } 178 return etcWasCopied, err 179 } 180 if len(dirs) != 0 { 181 return false, errors.New("copy-up driver is not specified") 182 } 183 return false, nil 184 } 185 186 // setupNet sets up the network driver. 187 // 188 // NOTE: msg is altered during calling driver.ConfigureNetworkChild 189 func setupNet(stateDir string, msg *messages.ParentInitNetworkDriverCompleted, etcWasCopied bool, driver network.ChildDriver, detachedNetNSPath string) error { 190 // HostNetwork 191 if driver == nil { 192 return nil 193 } 194 195 stateDirResolvConf := filepath.Join(stateDir, "resolv.conf") 196 hostsContent, err := generateEtcHosts() 197 if err != nil { 198 return err 199 } 200 stateDirHosts := filepath.Join(stateDir, "hosts") 201 if err := os.WriteFile(stateDirHosts, hostsContent, 0644); err != nil { 202 return fmt.Errorf("writing %s: %w", stateDirHosts, err) 203 } 204 205 if detachedNetNSPath == "" { 206 // non-detached mode 207 if err := activateLoopback(); err != nil { 208 return err 209 } 210 dev, err := driver.ConfigureNetworkChild(msg, detachedNetNSPath) // alters msg 211 if err != nil { 212 return err 213 } 214 if err := os.WriteFile(stateDirResolvConf, generateResolvConf(msg.DNS), 0644); err != nil { 215 return fmt.Errorf("writing %s: %w", stateDirResolvConf, err) 216 } 217 Info, _ := driver.ChildDriverInfo() 218 if !Info.ConfiguresInterface { 219 if err := activateDev(dev, msg.IP, msg.Netmask, msg.Gateway, msg.MTU); err != nil { 220 return err 221 } 222 } 223 if etcWasCopied { 224 // remove copied-up link 225 for _, f := range []string{"/etc/resolv.conf", "/etc/hosts"} { 226 if err := os.RemoveAll(f); err != nil { 227 return fmt.Errorf("failed to remove copied-up link %q: %w", f, err) 228 } 229 if err := os.WriteFile(f, []byte{}, 0644); err != nil { 230 return fmt.Errorf("writing %s: %w", f, err) 231 } 232 } 233 } else { 234 logrus.Warn("Mounting /etc/resolv.conf without copying-up /etc. " + 235 "Note that /etc/resolv.conf in the namespace will be unmounted when it is recreated on the host. " + 236 "Unless /etc/resolv.conf is statically configured, copying-up /etc is highly recommended. " + 237 "Please refer to RootlessKit documentation for further information.") 238 } 239 if err := unix.Mount(stateDirResolvConf, "/etc/resolv.conf", "", uintptr(unix.MS_BIND), ""); err != nil { 240 return fmt.Errorf("failed to create bind mount /etc/resolv.conf for %s: %w", stateDirResolvConf, err) 241 } 242 if err := unix.Mount(stateDirHosts, "/etc/hosts", "", uintptr(unix.MS_BIND), ""); err != nil { 243 return fmt.Errorf("failed to create bind mount /etc/hosts for %s: %w", stateDirHosts, err) 244 } 245 } else { 246 // detached mode 247 if err := ns.WithNetNSPath(detachedNetNSPath, func(_ ns.NetNS) error { 248 return activateLoopback() 249 }); err != nil { 250 return err 251 } 252 dev, err := driver.ConfigureNetworkChild(msg, detachedNetNSPath) // alters msg 253 if err != nil { 254 return err 255 } 256 if err := os.WriteFile(stateDirResolvConf, generateResolvConf(msg.DNS), 0644); err != nil { 257 return fmt.Errorf("writing %s: %w", stateDirResolvConf, err) 258 } 259 if err := ns.WithNetNSPath(detachedNetNSPath, func(_ ns.NetNS) error { 260 Info, _ := driver.ChildDriverInfo() 261 if !Info.ConfiguresInterface { 262 return activateDev(dev, msg.IP, msg.Netmask, msg.Gateway, msg.MTU) 263 } 264 return nil 265 }); err != nil { 266 return err 267 } 268 } 269 return nil 270 } 271 272 type Opt struct { 273 PipeFDEnvKey string // needs to be set 274 RunActivationHelperEnvKey string // needs to be set 275 ChildUseActivationEnvKey string // needs to be set 276 StateDirEnvKey string // needs to be set 277 TargetCmd []string // needs to be set 278 NetworkDriver network.ChildDriver // nil for HostNetwork 279 CopyUpDriver copyup.ChildDriver // cannot be nil if len(CopyUpDirs) != 0 280 CopyUpDirs []string 281 DetachNetNS bool 282 PortDriver port.ChildDriver 283 MountProcfs bool // needs to be set if (and only if) parent.Opt.CreatePIDNS is set 284 Propagation string // mount propagation type 285 Reaper bool 286 EvacuateCgroup2 bool // needs to correspond to parent.Opt.EvacuateCgroup2 is set 287 } 288 289 // statPIDNS is from https://github.com/containerd/containerd/blob/v1.7.2/services/introspection/pidns_linux.go#L25-L36 290 func statPIDNS(pid int) (uint64, error) { 291 f := fmt.Sprintf("/proc/%d/ns/pid", pid) 292 st, err := os.Stat(f) 293 if err != nil { 294 return 0, err 295 } 296 stSys, ok := st.Sys().(*syscall.Stat_t) 297 if !ok { 298 return 0, fmt.Errorf("%T is not *syscall.Stat_t", st.Sys()) 299 } 300 return stSys.Ino, nil 301 } 302 303 func hasCaps() (bool, error) { 304 pid := os.Getpid() 305 hdr := unix.CapUserHeader{ 306 Version: unix.LINUX_CAPABILITY_VERSION_3, 307 Pid: int32(pid), 308 } 309 var data unix.CapUserData 310 if err := unix.Capget(&hdr, &data); err != nil { 311 return false, fmt.Errorf("failed to get the current caps: %w", err) 312 } 313 logrus.Debugf("Capabilities: %+v", data) 314 return data.Effective != 0, nil 315 } 316 317 // gainCaps gains the caps inside the user namespace. 318 // The caps are gained on re-execution after the child's uid_map and gid_map are fully written. 319 func gainCaps() error { 320 pid := os.Getpid() 321 pidns, err := statPIDNS(pid) 322 if err != nil { 323 logrus.WithError(err).Debug("Failed to stat pidns (negligible when unsharing pidns)") 324 pidns = 0 325 } 326 envName := fmt.Sprintf("_ROOTLESSKIT_REEXEC_COUNT_%d_%d", pidns, pid) 327 logrus.Debugf("Re-executing the RootlessKit child process (PID=%d) to gain the caps", pid) 328 329 var envValueInt int 330 if envValueStr := os.Getenv(envName); envValueStr != "" { 331 var err error 332 envValueInt, err = strconv.Atoi(envValueStr) 333 if err != nil { 334 return fmt.Errorf("failed to parse %s value %q: %w", envName, envValueStr, err) 335 } 336 } 337 if envValueInt > 5 { 338 time.Sleep(10 * time.Millisecond * time.Duration(envValueInt)) 339 } 340 if envValueInt > 10 { 341 return fmt.Errorf("no capabilities was gained after reexecuting the child (%s=%d)", envName, envValueInt) 342 } 343 logrus.Debugf("%s: %d->%d", envName, envValueInt, envValueInt+1) 344 os.Setenv(envName, strconv.Itoa(envValueInt+1)) 345 346 // PID should be kept after re-execution. 347 if err := syscall.Exec("/proc/self/exe", os.Args, os.Environ()); err != nil { 348 return err 349 } 350 panic("should not reach here") 351 } 352 353 func Child(opt Opt) error { 354 if opt.PipeFDEnvKey == "" { 355 return errors.New("pipe FD env key is not set") 356 } 357 pipeFDStr := os.Getenv(opt.PipeFDEnvKey) 358 if pipeFDStr == "" { 359 return fmt.Errorf("%s is not set", opt.PipeFDEnvKey) 360 } 361 var pipeFD, pipe2FD int 362 if _, err := fmt.Sscanf(pipeFDStr, "%d,%d", &pipeFD, &pipe2FD); err != nil { 363 return fmt.Errorf("unexpected fd value: %s: %w", pipeFDStr, err) 364 } 365 logrus.Debugf("pipeFD=%d, pipe2FD=%d", pipeFD, pipe2FD) 366 pipeR := os.NewFile(uintptr(pipeFD), "") 367 pipe2W := os.NewFile(uintptr(pipe2FD), "") 368 369 if opt.StateDirEnvKey == "" { 370 opt.StateDirEnvKey = "ROOTLESSKIT_STATE_DIR" // for backward compatibility of Go API 371 } 372 stateDir := os.Getenv(opt.StateDirEnvKey) 373 if stateDir == "" { 374 return errors.New("got empty StateDir") 375 } 376 377 var ( 378 msg *messages.Message 379 err error 380 ) 381 if ok, err := hasCaps(); err != nil { 382 return err 383 } else if !ok { 384 msg, err = messages.WaitFor(pipeR, messages.Name(messages.ParentHello{})) 385 if err != nil { 386 return err 387 } 388 389 msgChildHello := &messages.Message{ 390 U: messages.U{ 391 ChildHello: &messages.ChildHello{}, 392 }, 393 } 394 if err := messages.Send(pipe2W, msgChildHello); err != nil { 395 return err 396 } 397 398 msg, err = messages.WaitFor(pipeR, messages.Name(messages.ParentInitIdmapCompleted{})) 399 if err != nil { 400 return err 401 } 402 403 if err := gainCaps(); err != nil { 404 return fmt.Errorf("failed to gain the caps inside the user namespace: %w", err) 405 } 406 } 407 408 if opt.MountProcfs { 409 if err := mountProcfs(); err != nil { 410 return err 411 } 412 } 413 414 var detachedNetNSPath string 415 if opt.DetachNetNS { 416 detachedNetNSPath = filepath.Join(stateDir, "netns") 417 if err = NewNetNsWithPathWithoutEnter(detachedNetNSPath); err != nil { 418 return fmt.Errorf("failed to create a detached netns on %q: %w", detachedNetNSPath, err) 419 } 420 } 421 422 msgChildInitUserNSCompleted := &messages.Message{ 423 U: messages.U{ 424 ChildInitUserNSCompleted: &messages.ChildInitUserNSCompleted{}, 425 }, 426 } 427 if err := messages.Send(pipe2W, msgChildInitUserNSCompleted); err != nil { 428 return err 429 } 430 431 msg, err = messages.WaitFor(pipeR, messages.Name(messages.ParentInitNetworkDriverCompleted{})) 432 if err != nil { 433 return err 434 } 435 netMsg := msg.U.ParentInitNetworkDriverCompleted 436 437 msg, err = messages.WaitFor(pipeR, messages.Name(messages.ParentInitPortDriverCompleted{})) 438 if err != nil { 439 return err 440 } 441 portMsg := msg.U.ParentInitPortDriverCompleted 442 443 // The parent calls child with Pdeathsig, but it is cleared when newuidmap SUID binary is called 444 // https://github.com/rootless-containers/rootlesskit/issues/65#issuecomment-492343646 445 runtime.LockOSThread() 446 err = unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0) 447 runtime.UnlockOSThread() 448 if err != nil { 449 return err 450 } 451 os.Unsetenv(opt.PipeFDEnvKey) 452 if err := pipeR.Close(); err != nil { 453 return fmt.Errorf("failed to close fd %d: %w", pipeFD, err) 454 } 455 if err := setMountPropagation(opt.Propagation); err != nil { 456 return err 457 } 458 etcWasCopied, err := setupCopyDir(opt.CopyUpDriver, opt.CopyUpDirs) 459 if err != nil { 460 return err 461 } 462 if detachedNetNSPath == "" { 463 if err := mountSysfs(opt.NetworkDriver == nil, opt.EvacuateCgroup2); err != nil { 464 return err 465 } 466 } 467 if err := setupNet(stateDir, netMsg, etcWasCopied, opt.NetworkDriver, detachedNetNSPath); err != nil { 468 return err 469 } 470 portQuitCh := make(chan struct{}) 471 portErrCh := make(chan error) 472 if opt.PortDriver != nil { 473 var portDriverOpaque map[string]string 474 if portMsg != nil { 475 portDriverOpaque = portMsg.PortDriverOpaque 476 } 477 go func() { 478 portErrCh <- opt.PortDriver.RunChildDriver(portDriverOpaque, portQuitCh, detachedNetNSPath) 479 }() 480 } 481 482 cmd, err := createCmd(opt) 483 if err != nil { 484 return err 485 } 486 if opt.Reaper { 487 if err := runAndReap(cmd); err != nil { 488 return fmt.Errorf("command %v exited: %w", opt.TargetCmd, err) 489 } 490 } else { 491 if err := cmd.Start(); err != nil { 492 return fmt.Errorf("command %v exited: %w", opt.TargetCmd, err) 493 } 494 sigc := sigproxy.ForwardAllSignals(context.TODO(), cmd.Process.Pid) 495 defer sigproxysignal.StopCatch(sigc) 496 if err := cmd.Wait(); err != nil { 497 return fmt.Errorf("command %v exited: %w", opt.TargetCmd, err) 498 } 499 } 500 if opt.PortDriver != nil { 501 portQuitCh <- struct{}{} 502 return <-portErrCh 503 } 504 return nil 505 } 506 507 func setMountPropagation(propagation string) error { 508 flags, ok := propagationStates[propagation] 509 if ok { 510 if err := unix.Mount("none", "/", "", flags, ""); err != nil { 511 return fmt.Errorf("failed to share mount point: /: %w", err) 512 } 513 } 514 return nil 515 } 516 517 func runAndReap(cmd *exec.Cmd) error { 518 c := make(chan os.Signal, 32) 519 signal.Notify(c, syscall.SIGCHLD) 520 cmd.SysProcAttr.Setsid = true 521 if err := cmd.Start(); err != nil { 522 return err 523 } 524 sigc := sigproxy.ForwardAllSignals(context.TODO(), cmd.Process.Pid) 525 defer sigproxysignal.StopCatch(sigc) 526 527 result := make(chan error) 528 go func() { 529 defer close(result) 530 for cEntry := range c { 531 logrus.Debugf("reaper: got signal %q", cEntry) 532 if wsPtr := reap(cmd.Process.Pid); wsPtr != nil { 533 ws := *wsPtr 534 if ws.Exited() && ws.ExitStatus() == 0 { 535 result <- nil 536 continue 537 } 538 var resultErr common.ErrorWithSys = &reaperErr{ 539 ws: ws, 540 } 541 result <- resultErr 542 } 543 } 544 }() 545 return <-result 546 } 547 548 func reap(myPid int) *syscall.WaitStatus { 549 var res *syscall.WaitStatus 550 for { 551 var ws syscall.WaitStatus 552 pid, err := syscall.Wait4(-1, &ws, syscall.WNOHANG, nil) 553 logrus.Debugf("reaper: got ws=%+v, pid=%d, err=%+v", ws, pid, err) 554 if err != nil || pid <= 0 { 555 break 556 } 557 if pid == myPid { 558 res = &ws 559 } 560 } 561 return res 562 } 563 564 type reaperErr struct { 565 ws syscall.WaitStatus 566 } 567 568 func (e *reaperErr) Sys() interface{} { 569 return e.ws 570 } 571 572 func (e *reaperErr) Error() string { 573 if e.ws.Exited() { 574 return fmt.Sprintf("exit status %d", e.ws.ExitStatus()) 575 } 576 if e.ws.Signaled() { 577 return fmt.Sprintf("signal: %s", e.ws.Signal()) 578 } 579 return fmt.Sprintf("exited with WAITSTATUS=0x%08x", e.ws) 580 } 581 582 func NewNetNsWithPathWithoutEnter(p string) error { 583 if err := os.WriteFile(p, nil, 0400); err != nil { 584 return err 585 } 586 selfExe, err := os.Executable() 587 if err != nil { 588 return err 589 } 590 // this is hard (not impossible though) to reimplement in Go: https://github.com/cloudflare/slirpnetstack/commit/d7766a8a77f0093d3cb7a94bd0ccbe3f67d411ba 591 cmd := exec.Command("unshare", "-n", "mount", "--bind", "/proc/self/ns/net", p) 592 // Use our own implementation of unshare that is embedded in RootlessKit, so as to 593 // avoid /etc/apparmor.d/unshare-userns-restrict on Ubuntu 25.04. 594 // https://github.com/rootless-containers/rootlesskit/issues/494 595 cmd.Path = selfExe 596 out, err := cmd.CombinedOutput() 597 if err != nil { 598 return fmt.Errorf("failed to execute %v: %w (out=%q)", cmd.Args, err, string(out)) 599 } 600 return nil 601 }