github.com/containers/podman/v4@v4.9.4/pkg/rootless/rootless_linux.go (about) 1 //go:build linux && cgo 2 // +build linux,cgo 3 4 package rootless 5 6 import ( 7 "bufio" 8 "bytes" 9 "errors" 10 "fmt" 11 "io" 12 "os" 13 "os/exec" 14 gosignal "os/signal" 15 "os/user" 16 "runtime" 17 "strconv" 18 "strings" 19 "sync" 20 "unsafe" 21 22 "github.com/containers/podman/v4/pkg/errorhandling" 23 "github.com/containers/storage/pkg/idtools" 24 pmount "github.com/containers/storage/pkg/mount" 25 "github.com/containers/storage/pkg/unshare" 26 "github.com/sirupsen/logrus" 27 "github.com/syndtr/gocapability/capability" 28 "golang.org/x/sys/unix" 29 ) 30 31 /* 32 #cgo remote CFLAGS: -Wall -Werror -DDISABLE_JOIN_SHORTCUT 33 #include <stdlib.h> 34 #include <sys/types.h> 35 extern uid_t rootless_uid(); 36 extern uid_t rootless_gid(); 37 extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path, char *file_to_read, int fd); 38 extern int reexec_in_user_namespace_wait(int pid, int options); 39 extern int reexec_userns_join(int pid, char *pause_pid_file_path); 40 extern int is_fd_inherited(int fd); 41 */ 42 import "C" 43 44 const ( 45 numSig = 65 // max number of signals 46 ) 47 48 func init() { 49 rootlessUIDInit := int(C.rootless_uid()) 50 rootlessGIDInit := int(C.rootless_gid()) 51 if rootlessUIDInit != 0 { 52 // we need this if we joined the user+mount namespace from the C code. 53 if err := os.Setenv("_CONTAINERS_USERNS_CONFIGURED", "done"); err != nil { 54 logrus.Errorf("Failed to set environment variable %s as %s", "_CONTAINERS_USERNS_CONFIGURED", "done") 55 } 56 if err := os.Setenv("_CONTAINERS_ROOTLESS_UID", strconv.Itoa(rootlessUIDInit)); err != nil { 57 logrus.Errorf("Failed to set environment variable %s as %d", "_CONTAINERS_ROOTLESS_UID", rootlessUIDInit) 58 } 59 if err := os.Setenv("_CONTAINERS_ROOTLESS_GID", strconv.Itoa(rootlessGIDInit)); err != nil { 60 logrus.Errorf("Failed to set environment variable %s as %d", "_CONTAINERS_ROOTLESS_GID", rootlessGIDInit) 61 } 62 } 63 } 64 65 func runInUser() error { 66 return os.Setenv("_CONTAINERS_USERNS_CONFIGURED", "done") 67 } 68 69 var ( 70 isRootlessOnce sync.Once 71 isRootless bool 72 ) 73 74 // IsRootless tells us if we are running in rootless mode 75 func IsRootless() bool { 76 // unshare.IsRootless() is used to check if a user namespace is required. 77 // Here we need to make sure that nested podman instances act 78 // as if they have root privileges and pick paths on the host 79 // that would normally be used for root. 80 return unshare.IsRootless() && unshare.GetRootlessUID() > 0 81 } 82 83 // GetRootlessUID returns the UID of the user in the parent userNS 84 func GetRootlessUID() int { 85 return unshare.GetRootlessUID() 86 } 87 88 // GetRootlessGID returns the GID of the user in the parent userNS 89 func GetRootlessGID() int { 90 return unshare.GetRootlessGID() 91 } 92 93 func tryMappingTool(uid bool, pid int, hostID int, mappings []idtools.IDMap) error { 94 var tool = "newuidmap" 95 mode := os.ModeSetuid 96 cap := capability.CAP_SETUID 97 idtype := "setuid" 98 if !uid { 99 tool = "newgidmap" 100 mode = os.ModeSetgid 101 cap = capability.CAP_SETGID 102 idtype = "setgid" 103 } 104 path, err := exec.LookPath(tool) 105 if err != nil { 106 return fmt.Errorf("command required for rootless mode with multiple IDs: %w", err) 107 } 108 109 appendTriplet := func(l []string, a, b, c int) []string { 110 return append(l, strconv.Itoa(a), strconv.Itoa(b), strconv.Itoa(c)) 111 } 112 113 args := []string{path, strconv.Itoa(pid)} 114 args = appendTriplet(args, 0, hostID, 1) 115 for _, i := range mappings { 116 if hostID >= i.HostID && hostID < i.HostID+i.Size { 117 what := "UID" 118 where := "/etc/subuid" 119 if !uid { 120 what = "GID" 121 where = "/etc/subgid" 122 } 123 return fmt.Errorf("invalid configuration: the specified mapping %d:%d in %q includes the user %s", i.HostID, i.Size, where, what) 124 } 125 args = appendTriplet(args, i.ContainerID+1, i.HostID, i.Size) 126 } 127 cmd := exec.Cmd{ 128 Path: path, 129 Args: args, 130 } 131 132 if output, err := cmd.CombinedOutput(); err != nil { 133 logrus.Errorf("running `%s`: %s", strings.Join(args, " "), output) 134 errorStr := fmt.Sprintf("cannot set up namespace using %q", path) 135 if isSet, err := unshare.IsSetID(cmd.Path, mode, cap); err != nil { 136 logrus.Errorf("Failed to check for %s on %s: %v", idtype, path, err) 137 } else if !isSet { 138 errorStr = fmt.Sprintf("%s: should have %s or have filecaps %s", errorStr, idtype, idtype) 139 } 140 return fmt.Errorf("%v: %w", errorStr, err) 141 } 142 return nil 143 } 144 145 // joinUserAndMountNS re-exec podman in a new userNS and join the user and mount 146 // namespace of the specified PID without looking up its parent. Useful to join directly 147 // the conmon process. 148 func joinUserAndMountNS(pid uint, pausePid string) (bool, int, error) { 149 hasCapSysAdmin, err := unshare.HasCapSysAdmin() 150 if err != nil { 151 return false, 0, err 152 } 153 if (os.Geteuid() == 0 && hasCapSysAdmin) || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" { 154 return false, 0, nil 155 } 156 157 cPausePid := C.CString(pausePid) 158 defer C.free(unsafe.Pointer(cPausePid)) 159 160 pidC := C.reexec_userns_join(C.int(pid), cPausePid) 161 if int(pidC) < 0 { 162 return false, -1, fmt.Errorf("cannot re-exec process to join the existing user namespace") 163 } 164 165 return waitAndProxySignalsToChild(pidC) 166 } 167 168 // GetConfiguredMappings returns the additional IDs configured for the current user. 169 func GetConfiguredMappings(quiet bool) ([]idtools.IDMap, []idtools.IDMap, error) { 170 var uids, gids []idtools.IDMap 171 username := os.Getenv("USER") 172 if username == "" { 173 var id string 174 if os.Geteuid() == 0 { 175 id = strconv.Itoa(GetRootlessUID()) 176 } else { 177 id = strconv.Itoa(os.Geteuid()) 178 } 179 userID, err := user.LookupId(id) 180 if err == nil { 181 username = userID.Username 182 } 183 } 184 mappings, err := idtools.NewIDMappings(username, username) 185 if err != nil { 186 logLevel := logrus.ErrorLevel 187 if quiet || (os.Geteuid() == 0 && GetRootlessUID() == 0) { 188 logLevel = logrus.DebugLevel 189 } 190 logrus.StandardLogger().Logf(logLevel, "cannot find UID/GID for user %s: %v - check rootless mode in man pages.", username, err) 191 } else { 192 uids = mappings.UIDs() 193 gids = mappings.GIDs() 194 } 195 return uids, gids, nil 196 } 197 198 func copyMappings(from, to string) error { 199 // when running as non-root always go through the newuidmap/newgidmap 200 // configuration since this is the expectation when running on Kubernetes 201 if os.Geteuid() != 0 { 202 return errors.New("copying mappings is allowed only for root") 203 } 204 content, err := os.ReadFile(from) 205 if err != nil { 206 return err 207 } 208 // Both runc and crun check whether the current process is in a user namespace 209 // by looking up 4294967295 in /proc/self/uid_map. If the mappings would be 210 // copied as they are, the check in the OCI runtimes would fail. So just split 211 // it in two different ranges. 212 if bytes.Contains(content, []byte("4294967295")) { 213 content = []byte("0 0 1\n1 1 4294967294\n") 214 } 215 return os.WriteFile(to, content, 0600) 216 } 217 218 func becomeRootInUserNS(pausePid, fileToRead string, fileOutput *os.File) (_ bool, _ int, retErr error) { 219 hasCapSysAdmin, err := unshare.HasCapSysAdmin() 220 if err != nil { 221 return false, 0, err 222 } 223 224 if (os.Geteuid() == 0 && hasCapSysAdmin) || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" { 225 if os.Getenv("_CONTAINERS_USERNS_CONFIGURED") == "init" { 226 return false, 0, runInUser() 227 } 228 return false, 0, nil 229 } 230 231 if _, inContainer := os.LookupEnv("container"); !inContainer { 232 if mounts, err := pmount.GetMounts(); err == nil { 233 for _, m := range mounts { 234 if m.Mountpoint == "/" { 235 isShared := false 236 for _, o := range strings.Split(m.Optional, ",") { 237 if strings.HasPrefix(o, "shared:") { 238 isShared = true 239 break 240 } 241 } 242 if !isShared { 243 logrus.Warningf("%q is not a shared mount, this could cause issues or missing mounts with rootless containers", m.Mountpoint) 244 } 245 break 246 } 247 } 248 } 249 } 250 251 cPausePid := C.CString(pausePid) 252 defer C.free(unsafe.Pointer(cPausePid)) 253 254 cFileToRead := C.CString(fileToRead) 255 defer C.free(unsafe.Pointer(cFileToRead)) 256 var fileOutputFD C.int 257 if fileOutput != nil { 258 fileOutputFD = C.int(fileOutput.Fd()) 259 } 260 261 runtime.LockOSThread() 262 defer runtime.UnlockOSThread() 263 264 fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_DGRAM, 0) 265 if err != nil { 266 return false, -1, err 267 } 268 r, w := os.NewFile(uintptr(fds[0]), "sync host"), os.NewFile(uintptr(fds[1]), "sync child") 269 270 var pid int 271 272 defer errorhandling.CloseQuiet(r) 273 defer errorhandling.CloseQuiet(w) 274 defer func() { 275 toWrite := []byte("0") 276 if retErr != nil { 277 toWrite = []byte("1") 278 } 279 if _, err := w.Write(toWrite); err != nil { 280 logrus.Errorf("Failed to write byte 0: %q", err) 281 } 282 if retErr != nil && pid > 0 { 283 if err := unix.Kill(pid, unix.SIGKILL); err != nil { 284 if err != unix.ESRCH { 285 logrus.Errorf("Failed to clean up process %d: %v", pid, err) 286 } 287 } 288 C.reexec_in_user_namespace_wait(C.int(pid), 0) 289 } 290 }() 291 292 pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid, cFileToRead, fileOutputFD) 293 pid = int(pidC) 294 if pid < 0 { 295 return false, -1, fmt.Errorf("cannot re-exec process") 296 } 297 298 uids, gids, err := GetConfiguredMappings(false) 299 if err != nil { 300 return false, -1, err 301 } 302 303 uidMap := fmt.Sprintf("/proc/%d/uid_map", pid) 304 gidMap := fmt.Sprintf("/proc/%d/gid_map", pid) 305 306 uidsMapped := false 307 308 if err := copyMappings("/proc/self/uid_map", uidMap); err == nil { 309 uidsMapped = true 310 } 311 312 if uids != nil && !uidsMapped { 313 err := tryMappingTool(true, pid, os.Geteuid(), uids) 314 // If some mappings were specified, do not ignore the error 315 if err != nil && len(uids) > 0 { 316 return false, -1, err 317 } 318 uidsMapped = err == nil 319 } 320 if !uidsMapped { 321 logrus.Warnf("Using rootless single mapping into the namespace. This might break some images. Check /etc/subuid and /etc/subgid for adding sub*ids if not using a network user") 322 setgroups := fmt.Sprintf("/proc/%d/setgroups", pid) 323 err = os.WriteFile(setgroups, []byte("deny\n"), 0666) 324 if err != nil { 325 return false, -1, fmt.Errorf("cannot write setgroups file: %w", err) 326 } 327 logrus.Debugf("write setgroups file exited with 0") 328 329 err = os.WriteFile(uidMap, []byte(fmt.Sprintf("%d %d 1\n", 0, os.Geteuid())), 0666) 330 if err != nil { 331 return false, -1, fmt.Errorf("cannot write uid_map: %w", err) 332 } 333 logrus.Debugf("write uid_map exited with 0") 334 } 335 336 gidsMapped := false 337 if err := copyMappings("/proc/self/gid_map", gidMap); err == nil { 338 gidsMapped = true 339 } 340 if gids != nil && !gidsMapped { 341 err := tryMappingTool(false, pid, os.Getegid(), gids) 342 // If some mappings were specified, do not ignore the error 343 if err != nil && len(gids) > 0 { 344 return false, -1, err 345 } 346 gidsMapped = err == nil 347 } 348 if !gidsMapped { 349 err = os.WriteFile(gidMap, []byte(fmt.Sprintf("%d %d 1\n", 0, os.Getegid())), 0666) 350 if err != nil { 351 return false, -1, fmt.Errorf("cannot write gid_map: %w", err) 352 } 353 } 354 355 _, err = w.WriteString("0") 356 if err != nil { 357 return false, -1, fmt.Errorf("write to sync pipe: %w", err) 358 } 359 360 b := make([]byte, 1) 361 _, err = w.Read(b) 362 if err != nil { 363 return false, -1, fmt.Errorf("read from sync pipe: %w", err) 364 } 365 366 if fileOutput != nil { 367 ret := C.reexec_in_user_namespace_wait(pidC, 0) 368 if ret < 0 { 369 return false, -1, errors.New("waiting for the re-exec process") 370 } 371 return true, 0, nil 372 } 373 374 if b[0] == '2' { 375 // We have lost the race for writing the PID file, as probably another 376 // process created a namespace and wrote the PID. 377 // Try to join it. 378 data, err := os.ReadFile(pausePid) 379 if err == nil { 380 var pid uint64 381 pid, err = strconv.ParseUint(string(data), 10, 0) 382 if err == nil { 383 return joinUserAndMountNS(uint(pid), "") 384 } 385 } 386 return false, -1, fmt.Errorf("setting up the process: %w", err) 387 } 388 389 if b[0] != '0' { 390 return false, -1, errors.New("setting up the process") 391 } 392 393 return waitAndProxySignalsToChild(pidC) 394 } 395 396 func waitAndProxySignalsToChild(pid C.int) (bool, int, error) { 397 signals := []os.Signal{} 398 for sig := 0; sig < numSig; sig++ { 399 if sig == int(unix.SIGTSTP) { 400 continue 401 } 402 signals = append(signals, unix.Signal(sig)) 403 } 404 405 // Disable all existing signal handlers, from now forward everything to the child and let 406 // it deal with it. All we do is to wait and propagate the exit code from the child to our parent. 407 gosignal.Reset() 408 c := make(chan os.Signal, len(signals)) 409 gosignal.Notify(c, signals...) 410 go func() { 411 for s := range c { 412 if s == unix.SIGCHLD || s == unix.SIGPIPE { 413 continue 414 } 415 416 if err := unix.Kill(int(pid), s.(unix.Signal)); err != nil { 417 if err != unix.ESRCH { 418 logrus.Errorf("Failed to propagate signal to child process %d: %v", int(pid), err) 419 } 420 } 421 } 422 }() 423 424 ret := C.reexec_in_user_namespace_wait(pid, 0) 425 // child exited reset our signal proxy handler 426 gosignal.Reset() 427 if ret < 0 { 428 return false, -1, errors.New("waiting for the re-exec process") 429 } 430 431 return true, int(ret), nil 432 } 433 434 // BecomeRootInUserNS re-exec podman in a new userNS. It returns whether podman was re-executed 435 // into a new user namespace and the return code from the re-executed podman process. 436 // If podman was re-executed the caller needs to propagate the error code returned by the child 437 // process. 438 func BecomeRootInUserNS(pausePid string) (bool, int, error) { 439 return becomeRootInUserNS(pausePid, "", nil) 440 } 441 442 // TryJoinFromFilePaths attempts to join the namespaces of the pid files in paths. 443 // This is useful when there are already running containers and we 444 // don't have a pause process yet. We can use the paths to the conmon 445 // processes to attempt joining their namespaces. 446 // If needNewNamespace is set, the file is read from a temporary user 447 // namespace, this is useful for containers that are running with a 448 // different uidmap and the unprivileged user has no way to read the 449 // file owned by the root in the container. 450 func TryJoinFromFilePaths(pausePidPath string, needNewNamespace bool, paths []string) (bool, int, error) { 451 var lastErr error 452 var pausePid int 453 454 for _, path := range paths { 455 if !needNewNamespace { 456 data, err := os.ReadFile(path) 457 if err != nil { 458 lastErr = err 459 continue 460 } 461 462 pausePid, err = strconv.Atoi(string(data)) 463 if err != nil { 464 lastErr = fmt.Errorf("cannot parse file %q: %w", path, err) 465 continue 466 } 467 } else { 468 r, w, err := os.Pipe() 469 if err != nil { 470 lastErr = err 471 continue 472 } 473 474 defer errorhandling.CloseQuiet(r) 475 476 if _, _, err := becomeRootInUserNS("", path, w); err != nil { 477 w.Close() 478 lastErr = err 479 continue 480 } 481 482 if err := w.Close(); err != nil { 483 return false, 0, err 484 } 485 defer func() { 486 C.reexec_in_user_namespace_wait(-1, 0) 487 }() 488 489 b := make([]byte, 32) 490 491 n, err := r.Read(b) 492 if err != nil { 493 lastErr = fmt.Errorf("cannot read %q: %w", path, err) 494 continue 495 } 496 497 pausePid, err = strconv.Atoi(string(b[:n])) 498 if err != nil { 499 lastErr = err 500 continue 501 } 502 } 503 504 if pausePid > 0 && unix.Kill(pausePid, 0) == nil { 505 joined, pid, err := joinUserAndMountNS(uint(pausePid), pausePidPath) 506 if err == nil { 507 return joined, pid, nil 508 } 509 lastErr = err 510 } 511 } 512 if lastErr != nil { 513 return false, 0, lastErr 514 } 515 return false, 0, fmt.Errorf("could not find any running process: %w", unix.ESRCH) 516 } 517 518 // ReadMappingsProc parses and returns the ID mappings at the specified path. 519 func ReadMappingsProc(path string) ([]idtools.IDMap, error) { 520 file, err := os.Open(path) 521 if err != nil { 522 return nil, err 523 } 524 defer file.Close() 525 526 mappings := []idtools.IDMap{} 527 528 buf := bufio.NewReader(file) 529 for { 530 line, _, err := buf.ReadLine() 531 if err != nil { 532 if err == io.EOF { 533 return mappings, nil 534 } 535 return nil, fmt.Errorf("cannot read line from %s: %w", path, err) 536 } 537 if line == nil { 538 return mappings, nil 539 } 540 541 containerID, hostID, size := 0, 0, 0 542 if _, err := fmt.Sscanf(string(line), "%d %d %d", &containerID, &hostID, &size); err != nil { 543 return nil, fmt.Errorf("cannot parse %s: %w", string(line), err) 544 } 545 mappings = append(mappings, idtools.IDMap{ContainerID: containerID, HostID: hostID, Size: size}) 546 } 547 } 548 549 func matches(id int, configuredIDs []idtools.IDMap, currentIDs []idtools.IDMap) bool { 550 // The first mapping is the host user, handle it separately. 551 if currentIDs[0].HostID != id || currentIDs[0].Size != 1 { 552 return false 553 } 554 555 currentIDs = currentIDs[1:] 556 if len(currentIDs) != len(configuredIDs) { 557 return false 558 } 559 560 // It is fine to iterate sequentially as both slices are sorted. 561 for i := range currentIDs { 562 if currentIDs[i].HostID != configuredIDs[i].HostID { 563 return false 564 } 565 if currentIDs[i].Size != configuredIDs[i].Size { 566 return false 567 } 568 } 569 570 return true 571 } 572 573 // ConfigurationMatches checks whether the additional uids/gids configured for the user 574 // match the current user namespace. 575 func ConfigurationMatches() (bool, error) { 576 if !IsRootless() || os.Geteuid() != 0 { 577 return true, nil 578 } 579 580 uids, gids, err := GetConfiguredMappings(false) 581 if err != nil { 582 return false, err 583 } 584 585 currentUIDs, err := ReadMappingsProc("/proc/self/uid_map") 586 if err != nil { 587 return false, err 588 } 589 590 if !matches(GetRootlessUID(), uids, currentUIDs) { 591 return false, err 592 } 593 594 currentGIDs, err := ReadMappingsProc("/proc/self/gid_map") 595 if err != nil { 596 return false, err 597 } 598 599 return matches(GetRootlessGID(), gids, currentGIDs), nil 600 } 601 602 // IsFdInherited checks whether the fd is opened and valid to use 603 func IsFdInherited(fd int) bool { 604 return int(C.is_fd_inherited(C.int(fd))) > 0 605 }