github.com/rootless-containers/rootlesskit/v2@v2.3.4/pkg/parent/parent.go (about)

     1  package parent
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"net"
     8  	"net/http"
     9  	"os"
    10  	"os/exec"
    11  	"os/user"
    12  	"path/filepath"
    13  	"strconv"
    14  	"syscall"
    15  
    16  	"github.com/gofrs/flock"
    17  	"github.com/gorilla/mux"
    18  	"github.com/rootless-containers/rootlesskit/v2/pkg/api/router"
    19  	"github.com/rootless-containers/rootlesskit/v2/pkg/messages"
    20  	"github.com/rootless-containers/rootlesskit/v2/pkg/network"
    21  	"github.com/rootless-containers/rootlesskit/v2/pkg/parent/cgrouputil"
    22  	"github.com/rootless-containers/rootlesskit/v2/pkg/parent/dynidtools"
    23  	"github.com/rootless-containers/rootlesskit/v2/pkg/parent/idtools"
    24  	"github.com/rootless-containers/rootlesskit/v2/pkg/port"
    25  	"github.com/rootless-containers/rootlesskit/v2/pkg/sigproxy"
    26  	"github.com/rootless-containers/rootlesskit/v2/pkg/sigproxy/signal"
    27  	"github.com/sirupsen/logrus"
    28  	"golang.org/x/sys/unix"
    29  )
    30  
    31  type Opt struct {
    32  	PipeFDEnvKey             string               // needs to be set
    33  	ChildUseActivationEnvKey string               // needs to be set
    34  	StateDir                 string               // directory needs to be precreated
    35  	StateDirEnvKey           string               // optional env key to propagate StateDir value
    36  	NetworkDriver            network.ParentDriver // nil for HostNetwork
    37  	PortDriver               port.ParentDriver    // nil for --port-driver=none
    38  	PublishPorts             []port.Spec
    39  	CreatePIDNS              bool
    40  	CreateCgroupNS           bool
    41  	CreateUTSNS              bool
    42  	CreateIPCNS              bool
    43  	DetachNetNS              bool
    44  	ParentEUIDEnvKey         string // optional env key to propagate geteuid() value
    45  	ParentEGIDEnvKey         string // optional env key to propagate getegid() value
    46  	Propagation              string
    47  	EvacuateCgroup2          string // e.g. "rootlesskit_evacuation"
    48  	SubidSource              SubidSource
    49  }
    50  
    51  type SubidSource string
    52  
    53  const (
    54  	SubidSourceAuto    = SubidSource("auto")    // Try dynamic then fallback to static
    55  	SubidSourceDynamic = SubidSource("dynamic") // /usr/bin/getsubids
    56  	SubidSourceStatic  = SubidSource("static")  // /etc/{subuid,subgid}
    57  )
    58  
    59  // Documented state files. Undocumented ones are subject to change.
    60  const (
    61  	StateFileLock     = "lock"
    62  	StateFileChildPID = "child_pid" // decimal pid number text
    63  	StateFileAPISock  = "api.sock"  // REST API Socket
    64  	StateFileNetNs    = "netns"     // rootlesskit network namespace
    65  )
    66  
    67  func checkPreflight(opt Opt) error {
    68  	if opt.PipeFDEnvKey == "" {
    69  		return errors.New("pipe FD env key is not set")
    70  	}
    71  	if opt.StateDir == "" {
    72  		return errors.New("state dir is not set")
    73  	}
    74  	if !filepath.IsAbs(opt.StateDir) {
    75  		return errors.New("state dir must be absolute")
    76  	}
    77  	if stat, err := os.Stat(opt.StateDir); err != nil || !stat.IsDir() {
    78  		return fmt.Errorf("state dir is inaccessible: %w", err)
    79  	}
    80  
    81  	if os.Geteuid() == 0 {
    82  		logrus.Warn("Running RootlessKit as the root user is unsupported.")
    83  	}
    84  
    85  	warnSysctl()
    86  
    87  	// invalid propagation doesn't result in an error
    88  	warnPropagation(opt.Propagation)
    89  	return nil
    90  }
    91  
    92  // createCleanupLock uses LOCK_SH for preventing automatic cleanup of
    93  // "/tmp/<Our State Dir>" caused by by systemd.
    94  //
    95  // This LOCK_SH lock is different from our lock file in the state dir.
    96  // We could unify the lock file into LOCK_SH, but we are still keeping
    97  // the lock file for a historical reason.
    98  //
    99  // See:
   100  // - https://github.com/rootless-containers/rootlesskit/issues/185
   101  // - https://github.com/rootless-containers/rootlesskit/pull/188
   102  func createCleanupLock(sDir string) error {
   103  	//lock state dir when using /tmp/ path
   104  	stateDir, err := os.Open(sDir)
   105  	if err != nil {
   106  		return err
   107  	}
   108  	err = unix.Flock(int(stateDir.Fd()), unix.LOCK_SH)
   109  	if err != nil {
   110  		logrus.Warnf("Failed to lock the state dir %s", sDir)
   111  	}
   112  	return nil
   113  }
   114  
   115  // LockStateDir creates and locks "lock" file in the state dir.
   116  func LockStateDir(stateDir string) (*flock.Flock, error) {
   117  	lockPath := filepath.Join(stateDir, StateFileLock)
   118  	lock := flock.New(lockPath)
   119  	locked, err := lock.TryLock()
   120  	if err != nil {
   121  		return nil, fmt.Errorf("failed to lock %s: %w", lockPath, err)
   122  	}
   123  	if !locked {
   124  		return nil, fmt.Errorf("failed to lock %s, another RootlessKit is running with the same state directory?", lockPath)
   125  	}
   126  	return lock, nil
   127  }
   128  
   129  func setupFilesAndEnv(readPipe *os.File, writePipe *os.File, opt Opt) ([]*os.File, []string) {
   130  	// 0 1 and 2  are used for stdin. stdout, and stderr
   131  	const listenFdsStart = 3
   132  	listenPid, listenPidErr := strconv.Atoi(os.Getenv("LISTEN_PID"))
   133  	listenFds, listenFdsErr := strconv.Atoi(os.Getenv("LISTEN_FDS"))
   134  	useSystemdSocketFDs := listenPidErr == nil && listenFdsErr == nil && listenFds > 0
   135  	if !useSystemdSocketFDs {
   136  		listenFds = 0
   137  	}
   138  	extraFiles := make([]*os.File, listenFds+2)
   139  	for i, fd := 0, listenFdsStart; i < listenFds; i, fd = i+1, fd+1 {
   140  		name := "LISTEN_FD_" + strconv.Itoa(fd)
   141  		extraFiles[i] = os.NewFile(uintptr(fd), name)
   142  	}
   143  	extraFiles[listenFds] = readPipe
   144  	extraFiles[listenFds+1] = writePipe
   145  	cmdEnv := os.Environ()
   146  	cmdEnv = append(cmdEnv, opt.PipeFDEnvKey+"="+strconv.Itoa(listenFdsStart+listenFds)+","+strconv.Itoa(listenFdsStart+listenFds+1))
   147  	cmdEnv = append(cmdEnv, opt.ChildUseActivationEnvKey+"="+strconv.FormatBool(listenPid == os.Getpid()))
   148  	return extraFiles, cmdEnv
   149  }
   150  
   151  func Parent(opt Opt) error {
   152  	if err := checkPreflight(opt); err != nil {
   153  		return err
   154  	}
   155  
   156  	err := createCleanupLock(opt.StateDir)
   157  	if err != nil {
   158  		return err
   159  	}
   160  
   161  	lock, err := LockStateDir(opt.StateDir)
   162  	if err != nil {
   163  		return err
   164  	}
   165  	defer os.RemoveAll(opt.StateDir)
   166  	defer lock.Unlock()
   167  
   168  	pipeR, pipeW, err := os.Pipe() // parent-to-child
   169  	if err != nil {
   170  		return err
   171  	}
   172  	pipe2R, pipe2W, err := os.Pipe() // child-to-parent
   173  	if err != nil {
   174  		return err
   175  	}
   176  	cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
   177  	cmd.SysProcAttr = &syscall.SysProcAttr{
   178  		Pdeathsig:  syscall.SIGKILL,
   179  		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
   180  	}
   181  
   182  	if opt.NetworkDriver != nil {
   183  		if !opt.DetachNetNS {
   184  			cmd.SysProcAttr.Unshareflags |= syscall.CLONE_NEWNET
   185  		}
   186  	}
   187  
   188  	if opt.CreatePIDNS {
   189  		// cannot be Unshareflags (panics)
   190  		cmd.SysProcAttr.Cloneflags |= syscall.CLONE_NEWPID
   191  	}
   192  	if opt.CreateCgroupNS {
   193  		cmd.SysProcAttr.Unshareflags |= unix.CLONE_NEWCGROUP
   194  	}
   195  	if opt.CreateUTSNS {
   196  		cmd.SysProcAttr.Unshareflags |= unix.CLONE_NEWUTS
   197  	}
   198  	if opt.CreateIPCNS {
   199  		cmd.SysProcAttr.Unshareflags |= unix.CLONE_NEWIPC
   200  	}
   201  	cmd.Stdin = os.Stdin
   202  	cmd.Stdout = os.Stdout
   203  	cmd.Stderr = os.Stderr
   204  	cmd.ExtraFiles, cmd.Env = setupFilesAndEnv(pipeR, pipe2W, opt)
   205  	if opt.StateDirEnvKey != "" {
   206  		cmd.Env = append(cmd.Env, opt.StateDirEnvKey+"="+opt.StateDir)
   207  	}
   208  	if opt.ParentEUIDEnvKey != "" {
   209  		cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%d", opt.ParentEUIDEnvKey, os.Geteuid()))
   210  	}
   211  	if opt.ParentEGIDEnvKey != "" {
   212  		cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%d", opt.ParentEGIDEnvKey, os.Getegid()))
   213  	}
   214  	if err := cmd.Start(); err != nil {
   215  		warnOnChildStartFailure(err)
   216  		return fmt.Errorf("failed to start the child: %w", err)
   217  	}
   218  
   219  	msgParentHello := &messages.Message{
   220  		U: messages.U{
   221  			ParentHello: &messages.ParentHello{},
   222  		},
   223  	}
   224  	if err := messages.Send(pipeW, msgParentHello); err != nil {
   225  		return err
   226  	}
   227  	if _, err := messages.WaitFor(pipe2R, messages.Name(messages.ChildHello{})); err != nil {
   228  		return err
   229  	}
   230  
   231  	if err := setupUIDGIDMap(cmd.Process.Pid, opt.SubidSource); err != nil {
   232  		return fmt.Errorf("failed to setup UID/GID map: %w", err)
   233  	}
   234  	msgParentInitIdmapCompleted := &messages.Message{
   235  		U: messages.U{
   236  			ParentInitIdmapCompleted: &messages.ParentInitIdmapCompleted{},
   237  		},
   238  	}
   239  	if err := messages.Send(pipeW, msgParentInitIdmapCompleted); err != nil {
   240  		return err
   241  	}
   242  	if _, err := messages.WaitFor(pipe2R, messages.Name(messages.ChildInitUserNSCompleted{})); err != nil {
   243  		return err
   244  	}
   245  
   246  	sigc := sigproxy.ForwardAllSignals(context.TODO(), cmd.Process.Pid)
   247  	defer signal.StopCatch(sigc)
   248  
   249  	if opt.EvacuateCgroup2 != "" {
   250  		if err := cgrouputil.EvacuateCgroup2(opt.EvacuateCgroup2); err != nil {
   251  			return err
   252  		}
   253  	}
   254  
   255  	// configure Network driver
   256  	msgParentInitNetworkDriverCompleted := &messages.Message{
   257  		U: messages.U{
   258  			ParentInitNetworkDriverCompleted: &messages.ParentInitNetworkDriverCompleted{},
   259  		},
   260  	}
   261  
   262  	if opt.NetworkDriver != nil {
   263  		var netns string
   264  		if opt.DetachNetNS {
   265  			netns = filepath.Join("/proc", strconv.Itoa(cmd.Process.Pid), "root", filepath.Clean(opt.StateDir), "netns")
   266  		}
   267  		netMsg, cleanupNetwork, err := opt.NetworkDriver.ConfigureNetwork(cmd.Process.Pid, opt.StateDir, netns)
   268  		if cleanupNetwork != nil {
   269  			defer cleanupNetwork()
   270  		}
   271  		if err != nil {
   272  			return fmt.Errorf("failed to setup network %+v: %w", opt.NetworkDriver, err)
   273  		}
   274  		msgParentInitNetworkDriverCompleted.U.ParentInitNetworkDriverCompleted = netMsg
   275  	}
   276  	if err := messages.Send(pipeW, msgParentInitNetworkDriverCompleted); err != nil {
   277  		return err
   278  	}
   279  
   280  	// configure Port driver
   281  	msgParentInitPortDriverCompleted := &messages.Message{
   282  		U: messages.U{
   283  			ParentInitPortDriverCompleted: &messages.ParentInitPortDriverCompleted{},
   284  		},
   285  	}
   286  	portDriverInitComplete := make(chan struct{})
   287  	portDriverQuit := make(chan struct{})
   288  	portDriverErr := make(chan error)
   289  	if opt.PortDriver != nil {
   290  		msgParentInitPortDriverCompleted.U.ParentInitPortDriverCompleted.PortDriverOpaque = opt.PortDriver.OpaqueForChild()
   291  		cctx := &port.ChildContext{
   292  			IP: net.ParseIP(msgParentInitNetworkDriverCompleted.U.ParentInitNetworkDriverCompleted.IP).To4(),
   293  		}
   294  		go func() {
   295  			portDriverErr <- opt.PortDriver.RunParentDriver(portDriverInitComplete,
   296  				portDriverQuit, cctx)
   297  		}()
   298  	}
   299  	if err := messages.Send(pipeW, msgParentInitPortDriverCompleted); err != nil {
   300  		return err
   301  	}
   302  
   303  	// Close the parent-to-child pipe
   304  	if err := pipeW.Close(); err != nil {
   305  		return err
   306  	}
   307  	if opt.PortDriver != nil {
   308  		// wait for port driver to be ready
   309  		select {
   310  		case <-portDriverInitComplete:
   311  		case err = <-portDriverErr:
   312  			return err
   313  		}
   314  		// publish ports
   315  		for _, p := range opt.PublishPorts {
   316  			st, err := opt.PortDriver.AddPort(context.TODO(), p)
   317  			if err != nil {
   318  				return fmt.Errorf("failed to expose port %v: %w", p, err)
   319  			}
   320  			logrus.Debugf("published port %v", st)
   321  		}
   322  	}
   323  
   324  	// after child is fully configured, write PID to child_pid file
   325  	childPIDPath := filepath.Join(opt.StateDir, StateFileChildPID)
   326  	if err := os.WriteFile(childPIDPath, []byte(strconv.Itoa(cmd.Process.Pid)), 0444); err != nil {
   327  		return fmt.Errorf("failed to write the child PID %d to %s: %w", cmd.Process.Pid, childPIDPath, err)
   328  	}
   329  	// listens the API
   330  	apiSockPath := filepath.Join(opt.StateDir, StateFileAPISock)
   331  	apiCloser, err := listenServeAPI(apiSockPath, &router.Backend{
   332  		StateDir:      opt.StateDir,
   333  		ChildPID:      cmd.Process.Pid,
   334  		NetworkDriver: opt.NetworkDriver,
   335  		PortDriver:    opt.PortDriver,
   336  	})
   337  	if err != nil {
   338  		return err
   339  	}
   340  	// block until the child exits
   341  	if err := cmd.Wait(); err != nil {
   342  		return fmt.Errorf("child exited: %w", err)
   343  	}
   344  	// close the API socket
   345  	if err := apiCloser.Close(); err != nil {
   346  		return fmt.Errorf("failed to close %s: %w", apiSockPath, err)
   347  	}
   348  	// shut down port driver
   349  	if opt.PortDriver != nil {
   350  		portDriverQuit <- struct{}{}
   351  		err = <-portDriverErr
   352  	}
   353  	return err
   354  }
   355  
   356  func getSubIDRanges(u *user.User, subidSource SubidSource) ([]idtools.SubIDRange, []idtools.SubIDRange, error) {
   357  	uid, err := strconv.Atoi(u.Uid)
   358  	if err != nil {
   359  		return nil, nil, err
   360  	}
   361  	switch subidSource {
   362  	case SubidSourceStatic:
   363  		logrus.Debugf("subid-source: using the static source")
   364  		return idtools.GetSubIDRanges(uid, u.Username)
   365  	case SubidSourceDynamic:
   366  		logrus.Debugf("subid-source: using the dynamic source")
   367  		return dynidtools.GetSubIDRanges(uid, u.Username)
   368  	case "", SubidSourceAuto:
   369  		subuidRanges, subgidRanges, err := getSubIDRanges(u, SubidSourceDynamic)
   370  		if err == nil && len(subuidRanges) > 0 && len(subgidRanges) > 0 {
   371  			return subuidRanges, subgidRanges, nil
   372  		}
   373  		logrus.WithError(err).Debugf("failed to use subid source %q, falling back to %q", SubidSourceDynamic, SubidSourceStatic)
   374  		return getSubIDRanges(u, SubidSourceStatic)
   375  	default:
   376  		return nil, nil, fmt.Errorf("unknown subid source %q", subidSource)
   377  	}
   378  }
   379  
   380  func newugidmapArgs(subidSource SubidSource) ([]string, []string, error) {
   381  	u, err := user.Current()
   382  	if err != nil {
   383  		return nil, nil, err
   384  	}
   385  	subuidRanges, subgidRanges, err := getSubIDRanges(u, subidSource)
   386  	if err != nil {
   387  		return nil, nil, err
   388  	}
   389  	logrus.Debugf("subuid ranges=%v", subuidRanges)
   390  	logrus.Debugf("subgid ranges=%v", subgidRanges)
   391  	return newugidmapArgsFromSubIDRanges(u, subuidRanges, subgidRanges)
   392  }
   393  
   394  func newugidmapArgsFromSubIDRanges(u *user.User, subuidRanges, subgidRanges []idtools.SubIDRange) ([]string, []string, error) {
   395  	uidMap := []string{
   396  		"0",
   397  		u.Uid,
   398  		"1",
   399  	}
   400  	gidMap := []string{
   401  		"0",
   402  		u.Gid,
   403  		"1",
   404  	}
   405  
   406  	uidMapLast := 1
   407  	for _, f := range subuidRanges {
   408  		uidMap = append(uidMap, []string{
   409  			strconv.Itoa(uidMapLast),
   410  			strconv.Itoa(f.Start),
   411  			strconv.Itoa(f.Length),
   412  		}...)
   413  		uidMapLast += f.Length
   414  	}
   415  	gidMapLast := 1
   416  	for _, f := range subgidRanges {
   417  		gidMap = append(gidMap, []string{
   418  			strconv.Itoa(gidMapLast),
   419  			strconv.Itoa(f.Start),
   420  			strconv.Itoa(f.Length),
   421  		}...)
   422  		gidMapLast += f.Length
   423  	}
   424  
   425  	return uidMap, gidMap, nil
   426  }
   427  
   428  func setupUIDGIDMap(pid int, subidSource SubidSource) error {
   429  	uArgs, gArgs, err := newugidmapArgs(subidSource)
   430  	if err != nil {
   431  		return fmt.Errorf("failed to compute uid/gid map: %w", err)
   432  	}
   433  	pidS := strconv.Itoa(pid)
   434  	cmd := exec.Command("newuidmap", append([]string{pidS}, uArgs...)...)
   435  	out, err := cmd.CombinedOutput()
   436  	if err != nil {
   437  		return fmt.Errorf("newuidmap %s %v failed: %s: %w", pidS, uArgs, string(out), err)
   438  	}
   439  	cmd = exec.Command("newgidmap", append([]string{pidS}, gArgs...)...)
   440  	out, err = cmd.CombinedOutput()
   441  	if err != nil {
   442  		return fmt.Errorf("newgidmap %s %v failed: %s: %w", pidS, gArgs, string(out), err)
   443  	}
   444  	return nil
   445  }
   446  
   447  // apiCloser is implemented by *http.Server
   448  type apiCloser interface {
   449  	Close() error
   450  	Shutdown(context.Context) error
   451  }
   452  
   453  func listenServeAPI(socketPath string, backend *router.Backend) (apiCloser, error) {
   454  	r := mux.NewRouter()
   455  	router.AddRoutes(r, backend)
   456  	srv := &http.Server{Handler: r}
   457  	err := os.RemoveAll(socketPath)
   458  	if err != nil {
   459  		return nil, err
   460  	}
   461  	l, err := net.Listen("unix", socketPath)
   462  	if err != nil {
   463  		return nil, err
   464  	}
   465  	go srv.Serve(l)
   466  	return srv, nil
   467  }
   468  
   469  // InitStateDir removes everything in the state dir except the lock file.
   470  // This is needed because when the previous execution crashed, the state dir may not be removed successfully.
   471  //
   472  // InitStateDir must be called before calling parent functions.
   473  func InitStateDir(stateDir string) error {
   474  	if err := os.MkdirAll(stateDir, 0755); err != nil {
   475  		return err
   476  	}
   477  	lk, err := LockStateDir(stateDir)
   478  	if err != nil {
   479  		return err
   480  	}
   481  	defer lk.Unlock()
   482  	stateDirStuffs, err := os.ReadDir(stateDir)
   483  	if err != nil {
   484  		return err
   485  	}
   486  	for _, f := range stateDirStuffs {
   487  		if f.Name() == StateFileLock {
   488  			continue
   489  		}
   490  		p := filepath.Join(stateDir, f.Name())
   491  		if err := os.RemoveAll(p); err != nil {
   492  			return fmt.Errorf("failed to remove %s: %w", p, err)
   493  		}
   494  	}
   495  	return nil
   496  }