github.com/containers/podman/v4@v4.9.4/pkg/rootless/rootless_linux.go (about)

     1  //go:build linux && cgo
     2  // +build linux,cgo
     3  
     4  package rootless
     5  
     6  import (
     7  	"bufio"
     8  	"bytes"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"os"
    13  	"os/exec"
    14  	gosignal "os/signal"
    15  	"os/user"
    16  	"runtime"
    17  	"strconv"
    18  	"strings"
    19  	"sync"
    20  	"unsafe"
    21  
    22  	"github.com/containers/podman/v4/pkg/errorhandling"
    23  	"github.com/containers/storage/pkg/idtools"
    24  	pmount "github.com/containers/storage/pkg/mount"
    25  	"github.com/containers/storage/pkg/unshare"
    26  	"github.com/sirupsen/logrus"
    27  	"github.com/syndtr/gocapability/capability"
    28  	"golang.org/x/sys/unix"
    29  )
    30  
    31  /*
    32  #cgo remote CFLAGS: -Wall -Werror -DDISABLE_JOIN_SHORTCUT
    33  #include <stdlib.h>
    34  #include <sys/types.h>
    35  extern uid_t rootless_uid();
    36  extern uid_t rootless_gid();
    37  extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path, char *file_to_read, int fd);
    38  extern int reexec_in_user_namespace_wait(int pid, int options);
    39  extern int reexec_userns_join(int pid, char *pause_pid_file_path);
    40  extern int is_fd_inherited(int fd);
    41  */
    42  import "C"
    43  
const (
	// numSig is the exclusive upper bound of the signal numbers that
	// waitAndProxySignalsToChild subscribes to and forwards to the child.
	numSig = 65 // max number of signals
)
    47  
    48  func init() {
    49  	rootlessUIDInit := int(C.rootless_uid())
    50  	rootlessGIDInit := int(C.rootless_gid())
    51  	if rootlessUIDInit != 0 {
    52  		// we need this if we joined the user+mount namespace from the C code.
    53  		if err := os.Setenv("_CONTAINERS_USERNS_CONFIGURED", "done"); err != nil {
    54  			logrus.Errorf("Failed to set environment variable %s as %s", "_CONTAINERS_USERNS_CONFIGURED", "done")
    55  		}
    56  		if err := os.Setenv("_CONTAINERS_ROOTLESS_UID", strconv.Itoa(rootlessUIDInit)); err != nil {
    57  			logrus.Errorf("Failed to set environment variable %s as %d", "_CONTAINERS_ROOTLESS_UID", rootlessUIDInit)
    58  		}
    59  		if err := os.Setenv("_CONTAINERS_ROOTLESS_GID", strconv.Itoa(rootlessGIDInit)); err != nil {
    60  			logrus.Errorf("Failed to set environment variable %s as %d", "_CONTAINERS_ROOTLESS_GID", rootlessGIDInit)
    61  		}
    62  	}
    63  }
    64  
    65  func runInUser() error {
    66  	return os.Setenv("_CONTAINERS_USERNS_CONFIGURED", "done")
    67  }
    68  
// NOTE(review): neither variable is referenced by the code visible in this
// file; they look like leftovers from a cached IsRootless implementation.
// Verify that no other file in the package uses them before removing.
var (
	isRootlessOnce sync.Once
	isRootless     bool
)
    73  
    74  // IsRootless tells us if we are running in rootless mode
    75  func IsRootless() bool {
    76  	// unshare.IsRootless() is used to check if a user namespace is required.
    77  	// Here we need to make sure that nested podman instances act
    78  	// as if they have root privileges and pick paths on the host
    79  	// that would normally be used for root.
    80  	return unshare.IsRootless() && unshare.GetRootlessUID() > 0
    81  }
    82  
    83  // GetRootlessUID returns the UID of the user in the parent userNS
    84  func GetRootlessUID() int {
    85  	return unshare.GetRootlessUID()
    86  }
    87  
    88  // GetRootlessGID returns the GID of the user in the parent userNS
    89  func GetRootlessGID() int {
    90  	return unshare.GetRootlessGID()
    91  }
    92  
    93  func tryMappingTool(uid bool, pid int, hostID int, mappings []idtools.IDMap) error {
    94  	var tool = "newuidmap"
    95  	mode := os.ModeSetuid
    96  	cap := capability.CAP_SETUID
    97  	idtype := "setuid"
    98  	if !uid {
    99  		tool = "newgidmap"
   100  		mode = os.ModeSetgid
   101  		cap = capability.CAP_SETGID
   102  		idtype = "setgid"
   103  	}
   104  	path, err := exec.LookPath(tool)
   105  	if err != nil {
   106  		return fmt.Errorf("command required for rootless mode with multiple IDs: %w", err)
   107  	}
   108  
   109  	appendTriplet := func(l []string, a, b, c int) []string {
   110  		return append(l, strconv.Itoa(a), strconv.Itoa(b), strconv.Itoa(c))
   111  	}
   112  
   113  	args := []string{path, strconv.Itoa(pid)}
   114  	args = appendTriplet(args, 0, hostID, 1)
   115  	for _, i := range mappings {
   116  		if hostID >= i.HostID && hostID < i.HostID+i.Size {
   117  			what := "UID"
   118  			where := "/etc/subuid"
   119  			if !uid {
   120  				what = "GID"
   121  				where = "/etc/subgid"
   122  			}
   123  			return fmt.Errorf("invalid configuration: the specified mapping %d:%d in %q includes the user %s", i.HostID, i.Size, where, what)
   124  		}
   125  		args = appendTriplet(args, i.ContainerID+1, i.HostID, i.Size)
   126  	}
   127  	cmd := exec.Cmd{
   128  		Path: path,
   129  		Args: args,
   130  	}
   131  
   132  	if output, err := cmd.CombinedOutput(); err != nil {
   133  		logrus.Errorf("running `%s`: %s", strings.Join(args, " "), output)
   134  		errorStr := fmt.Sprintf("cannot set up namespace using %q", path)
   135  		if isSet, err := unshare.IsSetID(cmd.Path, mode, cap); err != nil {
   136  			logrus.Errorf("Failed to check for %s on %s: %v", idtype, path, err)
   137  		} else if !isSet {
   138  			errorStr = fmt.Sprintf("%s: should have %s or have filecaps %s", errorStr, idtype, idtype)
   139  		}
   140  		return fmt.Errorf("%v: %w", errorStr, err)
   141  	}
   142  	return nil
   143  }
   144  
   145  // joinUserAndMountNS re-exec podman in a new userNS and join the user and mount
   146  // namespace of the specified PID without looking up its parent.  Useful to join directly
   147  // the conmon process.
   148  func joinUserAndMountNS(pid uint, pausePid string) (bool, int, error) {
   149  	hasCapSysAdmin, err := unshare.HasCapSysAdmin()
   150  	if err != nil {
   151  		return false, 0, err
   152  	}
   153  	if (os.Geteuid() == 0 && hasCapSysAdmin) || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" {
   154  		return false, 0, nil
   155  	}
   156  
   157  	cPausePid := C.CString(pausePid)
   158  	defer C.free(unsafe.Pointer(cPausePid))
   159  
   160  	pidC := C.reexec_userns_join(C.int(pid), cPausePid)
   161  	if int(pidC) < 0 {
   162  		return false, -1, fmt.Errorf("cannot re-exec process to join the existing user namespace")
   163  	}
   164  
   165  	return waitAndProxySignalsToChild(pidC)
   166  }
   167  
   168  // GetConfiguredMappings returns the additional IDs configured for the current user.
   169  func GetConfiguredMappings(quiet bool) ([]idtools.IDMap, []idtools.IDMap, error) {
   170  	var uids, gids []idtools.IDMap
   171  	username := os.Getenv("USER")
   172  	if username == "" {
   173  		var id string
   174  		if os.Geteuid() == 0 {
   175  			id = strconv.Itoa(GetRootlessUID())
   176  		} else {
   177  			id = strconv.Itoa(os.Geteuid())
   178  		}
   179  		userID, err := user.LookupId(id)
   180  		if err == nil {
   181  			username = userID.Username
   182  		}
   183  	}
   184  	mappings, err := idtools.NewIDMappings(username, username)
   185  	if err != nil {
   186  		logLevel := logrus.ErrorLevel
   187  		if quiet || (os.Geteuid() == 0 && GetRootlessUID() == 0) {
   188  			logLevel = logrus.DebugLevel
   189  		}
   190  		logrus.StandardLogger().Logf(logLevel, "cannot find UID/GID for user %s: %v - check rootless mode in man pages.", username, err)
   191  	} else {
   192  		uids = mappings.UIDs()
   193  		gids = mappings.GIDs()
   194  	}
   195  	return uids, gids, nil
   196  }
   197  
   198  func copyMappings(from, to string) error {
   199  	// when running as non-root always go through the newuidmap/newgidmap
   200  	// configuration since this is the expectation when running on Kubernetes
   201  	if os.Geteuid() != 0 {
   202  		return errors.New("copying mappings is allowed only for root")
   203  	}
   204  	content, err := os.ReadFile(from)
   205  	if err != nil {
   206  		return err
   207  	}
   208  	// Both runc and crun check whether the current process is in a user namespace
   209  	// by looking up 4294967295 in /proc/self/uid_map.  If the mappings would be
   210  	// copied as they are, the check in the OCI runtimes would fail.  So just split
   211  	// it in two different ranges.
   212  	if bytes.Contains(content, []byte("4294967295")) {
   213  		content = []byte("0 0 1\n1 1 4294967294\n")
   214  	}
   215  	return os.WriteFile(to, content, 0600)
   216  }
   217  
   218  func becomeRootInUserNS(pausePid, fileToRead string, fileOutput *os.File) (_ bool, _ int, retErr error) {
   219  	hasCapSysAdmin, err := unshare.HasCapSysAdmin()
   220  	if err != nil {
   221  		return false, 0, err
   222  	}
   223  
   224  	if (os.Geteuid() == 0 && hasCapSysAdmin) || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" {
   225  		if os.Getenv("_CONTAINERS_USERNS_CONFIGURED") == "init" {
   226  			return false, 0, runInUser()
   227  		}
   228  		return false, 0, nil
   229  	}
   230  
   231  	if _, inContainer := os.LookupEnv("container"); !inContainer {
   232  		if mounts, err := pmount.GetMounts(); err == nil {
   233  			for _, m := range mounts {
   234  				if m.Mountpoint == "/" {
   235  					isShared := false
   236  					for _, o := range strings.Split(m.Optional, ",") {
   237  						if strings.HasPrefix(o, "shared:") {
   238  							isShared = true
   239  							break
   240  						}
   241  					}
   242  					if !isShared {
   243  						logrus.Warningf("%q is not a shared mount, this could cause issues or missing mounts with rootless containers", m.Mountpoint)
   244  					}
   245  					break
   246  				}
   247  			}
   248  		}
   249  	}
   250  
   251  	cPausePid := C.CString(pausePid)
   252  	defer C.free(unsafe.Pointer(cPausePid))
   253  
   254  	cFileToRead := C.CString(fileToRead)
   255  	defer C.free(unsafe.Pointer(cFileToRead))
   256  	var fileOutputFD C.int
   257  	if fileOutput != nil {
   258  		fileOutputFD = C.int(fileOutput.Fd())
   259  	}
   260  
   261  	runtime.LockOSThread()
   262  	defer runtime.UnlockOSThread()
   263  
   264  	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_DGRAM, 0)
   265  	if err != nil {
   266  		return false, -1, err
   267  	}
   268  	r, w := os.NewFile(uintptr(fds[0]), "sync host"), os.NewFile(uintptr(fds[1]), "sync child")
   269  
   270  	var pid int
   271  
   272  	defer errorhandling.CloseQuiet(r)
   273  	defer errorhandling.CloseQuiet(w)
   274  	defer func() {
   275  		toWrite := []byte("0")
   276  		if retErr != nil {
   277  			toWrite = []byte("1")
   278  		}
   279  		if _, err := w.Write(toWrite); err != nil {
   280  			logrus.Errorf("Failed to write byte 0: %q", err)
   281  		}
   282  		if retErr != nil && pid > 0 {
   283  			if err := unix.Kill(pid, unix.SIGKILL); err != nil {
   284  				if err != unix.ESRCH {
   285  					logrus.Errorf("Failed to clean up process %d: %v", pid, err)
   286  				}
   287  			}
   288  			C.reexec_in_user_namespace_wait(C.int(pid), 0)
   289  		}
   290  	}()
   291  
   292  	pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid, cFileToRead, fileOutputFD)
   293  	pid = int(pidC)
   294  	if pid < 0 {
   295  		return false, -1, fmt.Errorf("cannot re-exec process")
   296  	}
   297  
   298  	uids, gids, err := GetConfiguredMappings(false)
   299  	if err != nil {
   300  		return false, -1, err
   301  	}
   302  
   303  	uidMap := fmt.Sprintf("/proc/%d/uid_map", pid)
   304  	gidMap := fmt.Sprintf("/proc/%d/gid_map", pid)
   305  
   306  	uidsMapped := false
   307  
   308  	if err := copyMappings("/proc/self/uid_map", uidMap); err == nil {
   309  		uidsMapped = true
   310  	}
   311  
   312  	if uids != nil && !uidsMapped {
   313  		err := tryMappingTool(true, pid, os.Geteuid(), uids)
   314  		// If some mappings were specified, do not ignore the error
   315  		if err != nil && len(uids) > 0 {
   316  			return false, -1, err
   317  		}
   318  		uidsMapped = err == nil
   319  	}
   320  	if !uidsMapped {
   321  		logrus.Warnf("Using rootless single mapping into the namespace. This might break some images. Check /etc/subuid and /etc/subgid for adding sub*ids if not using a network user")
   322  		setgroups := fmt.Sprintf("/proc/%d/setgroups", pid)
   323  		err = os.WriteFile(setgroups, []byte("deny\n"), 0666)
   324  		if err != nil {
   325  			return false, -1, fmt.Errorf("cannot write setgroups file: %w", err)
   326  		}
   327  		logrus.Debugf("write setgroups file exited with 0")
   328  
   329  		err = os.WriteFile(uidMap, []byte(fmt.Sprintf("%d %d 1\n", 0, os.Geteuid())), 0666)
   330  		if err != nil {
   331  			return false, -1, fmt.Errorf("cannot write uid_map: %w", err)
   332  		}
   333  		logrus.Debugf("write uid_map exited with 0")
   334  	}
   335  
   336  	gidsMapped := false
   337  	if err := copyMappings("/proc/self/gid_map", gidMap); err == nil {
   338  		gidsMapped = true
   339  	}
   340  	if gids != nil && !gidsMapped {
   341  		err := tryMappingTool(false, pid, os.Getegid(), gids)
   342  		// If some mappings were specified, do not ignore the error
   343  		if err != nil && len(gids) > 0 {
   344  			return false, -1, err
   345  		}
   346  		gidsMapped = err == nil
   347  	}
   348  	if !gidsMapped {
   349  		err = os.WriteFile(gidMap, []byte(fmt.Sprintf("%d %d 1\n", 0, os.Getegid())), 0666)
   350  		if err != nil {
   351  			return false, -1, fmt.Errorf("cannot write gid_map: %w", err)
   352  		}
   353  	}
   354  
   355  	_, err = w.WriteString("0")
   356  	if err != nil {
   357  		return false, -1, fmt.Errorf("write to sync pipe: %w", err)
   358  	}
   359  
   360  	b := make([]byte, 1)
   361  	_, err = w.Read(b)
   362  	if err != nil {
   363  		return false, -1, fmt.Errorf("read from sync pipe: %w", err)
   364  	}
   365  
   366  	if fileOutput != nil {
   367  		ret := C.reexec_in_user_namespace_wait(pidC, 0)
   368  		if ret < 0 {
   369  			return false, -1, errors.New("waiting for the re-exec process")
   370  		}
   371  		return true, 0, nil
   372  	}
   373  
   374  	if b[0] == '2' {
   375  		// We have lost the race for writing the PID file, as probably another
   376  		// process created a namespace and wrote the PID.
   377  		// Try to join it.
   378  		data, err := os.ReadFile(pausePid)
   379  		if err == nil {
   380  			var pid uint64
   381  			pid, err = strconv.ParseUint(string(data), 10, 0)
   382  			if err == nil {
   383  				return joinUserAndMountNS(uint(pid), "")
   384  			}
   385  		}
   386  		return false, -1, fmt.Errorf("setting up the process: %w", err)
   387  	}
   388  
   389  	if b[0] != '0' {
   390  		return false, -1, errors.New("setting up the process")
   391  	}
   392  
   393  	return waitAndProxySignalsToChild(pidC)
   394  }
   395  
   396  func waitAndProxySignalsToChild(pid C.int) (bool, int, error) {
   397  	signals := []os.Signal{}
   398  	for sig := 0; sig < numSig; sig++ {
   399  		if sig == int(unix.SIGTSTP) {
   400  			continue
   401  		}
   402  		signals = append(signals, unix.Signal(sig))
   403  	}
   404  
   405  	// Disable all existing signal handlers, from now forward everything to the child and let
   406  	// it deal with it. All we do is to wait and propagate the exit code from the child to our parent.
   407  	gosignal.Reset()
   408  	c := make(chan os.Signal, len(signals))
   409  	gosignal.Notify(c, signals...)
   410  	go func() {
   411  		for s := range c {
   412  			if s == unix.SIGCHLD || s == unix.SIGPIPE {
   413  				continue
   414  			}
   415  
   416  			if err := unix.Kill(int(pid), s.(unix.Signal)); err != nil {
   417  				if err != unix.ESRCH {
   418  					logrus.Errorf("Failed to propagate signal to child process %d: %v", int(pid), err)
   419  				}
   420  			}
   421  		}
   422  	}()
   423  
   424  	ret := C.reexec_in_user_namespace_wait(pid, 0)
   425  	// child exited reset our signal proxy handler
   426  	gosignal.Reset()
   427  	if ret < 0 {
   428  		return false, -1, errors.New("waiting for the re-exec process")
   429  	}
   430  
   431  	return true, int(ret), nil
   432  }
   433  
   434  // BecomeRootInUserNS re-exec podman in a new userNS.  It returns whether podman was re-executed
   435  // into a new user namespace and the return code from the re-executed podman process.
   436  // If podman was re-executed the caller needs to propagate the error code returned by the child
   437  // process.
   438  func BecomeRootInUserNS(pausePid string) (bool, int, error) {
   439  	return becomeRootInUserNS(pausePid, "", nil)
   440  }
   441  
   442  // TryJoinFromFilePaths attempts to join the namespaces of the pid files in paths.
   443  // This is useful when there are already running containers and we
   444  // don't have a pause process yet.  We can use the paths to the conmon
   445  // processes to attempt joining their namespaces.
   446  // If needNewNamespace is set, the file is read from a temporary user
   447  // namespace, this is useful for containers that are running with a
   448  // different uidmap and the unprivileged user has no way to read the
   449  // file owned by the root in the container.
   450  func TryJoinFromFilePaths(pausePidPath string, needNewNamespace bool, paths []string) (bool, int, error) {
   451  	var lastErr error
   452  	var pausePid int
   453  
   454  	for _, path := range paths {
   455  		if !needNewNamespace {
   456  			data, err := os.ReadFile(path)
   457  			if err != nil {
   458  				lastErr = err
   459  				continue
   460  			}
   461  
   462  			pausePid, err = strconv.Atoi(string(data))
   463  			if err != nil {
   464  				lastErr = fmt.Errorf("cannot parse file %q: %w", path, err)
   465  				continue
   466  			}
   467  		} else {
   468  			r, w, err := os.Pipe()
   469  			if err != nil {
   470  				lastErr = err
   471  				continue
   472  			}
   473  
   474  			defer errorhandling.CloseQuiet(r)
   475  
   476  			if _, _, err := becomeRootInUserNS("", path, w); err != nil {
   477  				w.Close()
   478  				lastErr = err
   479  				continue
   480  			}
   481  
   482  			if err := w.Close(); err != nil {
   483  				return false, 0, err
   484  			}
   485  			defer func() {
   486  				C.reexec_in_user_namespace_wait(-1, 0)
   487  			}()
   488  
   489  			b := make([]byte, 32)
   490  
   491  			n, err := r.Read(b)
   492  			if err != nil {
   493  				lastErr = fmt.Errorf("cannot read %q: %w", path, err)
   494  				continue
   495  			}
   496  
   497  			pausePid, err = strconv.Atoi(string(b[:n]))
   498  			if err != nil {
   499  				lastErr = err
   500  				continue
   501  			}
   502  		}
   503  
   504  		if pausePid > 0 && unix.Kill(pausePid, 0) == nil {
   505  			joined, pid, err := joinUserAndMountNS(uint(pausePid), pausePidPath)
   506  			if err == nil {
   507  				return joined, pid, nil
   508  			}
   509  			lastErr = err
   510  		}
   511  	}
   512  	if lastErr != nil {
   513  		return false, 0, lastErr
   514  	}
   515  	return false, 0, fmt.Errorf("could not find any running process: %w", unix.ESRCH)
   516  }
   517  
   518  // ReadMappingsProc parses and returns the ID mappings at the specified path.
   519  func ReadMappingsProc(path string) ([]idtools.IDMap, error) {
   520  	file, err := os.Open(path)
   521  	if err != nil {
   522  		return nil, err
   523  	}
   524  	defer file.Close()
   525  
   526  	mappings := []idtools.IDMap{}
   527  
   528  	buf := bufio.NewReader(file)
   529  	for {
   530  		line, _, err := buf.ReadLine()
   531  		if err != nil {
   532  			if err == io.EOF {
   533  				return mappings, nil
   534  			}
   535  			return nil, fmt.Errorf("cannot read line from %s: %w", path, err)
   536  		}
   537  		if line == nil {
   538  			return mappings, nil
   539  		}
   540  
   541  		containerID, hostID, size := 0, 0, 0
   542  		if _, err := fmt.Sscanf(string(line), "%d %d %d", &containerID, &hostID, &size); err != nil {
   543  			return nil, fmt.Errorf("cannot parse %s: %w", string(line), err)
   544  		}
   545  		mappings = append(mappings, idtools.IDMap{ContainerID: containerID, HostID: hostID, Size: size})
   546  	}
   547  }
   548  
   549  func matches(id int, configuredIDs []idtools.IDMap, currentIDs []idtools.IDMap) bool {
   550  	// The first mapping is the host user, handle it separately.
   551  	if currentIDs[0].HostID != id || currentIDs[0].Size != 1 {
   552  		return false
   553  	}
   554  
   555  	currentIDs = currentIDs[1:]
   556  	if len(currentIDs) != len(configuredIDs) {
   557  		return false
   558  	}
   559  
   560  	// It is fine to iterate sequentially as both slices are sorted.
   561  	for i := range currentIDs {
   562  		if currentIDs[i].HostID != configuredIDs[i].HostID {
   563  			return false
   564  		}
   565  		if currentIDs[i].Size != configuredIDs[i].Size {
   566  			return false
   567  		}
   568  	}
   569  
   570  	return true
   571  }
   572  
   573  // ConfigurationMatches checks whether the additional uids/gids configured for the user
   574  // match the current user namespace.
   575  func ConfigurationMatches() (bool, error) {
   576  	if !IsRootless() || os.Geteuid() != 0 {
   577  		return true, nil
   578  	}
   579  
   580  	uids, gids, err := GetConfiguredMappings(false)
   581  	if err != nil {
   582  		return false, err
   583  	}
   584  
   585  	currentUIDs, err := ReadMappingsProc("/proc/self/uid_map")
   586  	if err != nil {
   587  		return false, err
   588  	}
   589  
   590  	if !matches(GetRootlessUID(), uids, currentUIDs) {
   591  		return false, err
   592  	}
   593  
   594  	currentGIDs, err := ReadMappingsProc("/proc/self/gid_map")
   595  	if err != nil {
   596  		return false, err
   597  	}
   598  
   599  	return matches(GetRootlessGID(), gids, currentGIDs), nil
   600  }
   601  
   602  // IsFdInherited checks whether the fd is opened and valid to use
   603  func IsFdInherited(fd int) bool {
   604  	return int(C.is_fd_inherited(C.int(fd))) > 0
   605  }