github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/init_linux.go (about)

     1  package libcontainer
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/json"
     6  	"errors"
     7  	"fmt"
     8  	"net"
     9  	"os"
    10  	"path/filepath"
    11  	"runtime"
    12  	"runtime/debug"
    13  	"strconv"
    14  	"strings"
    15  
    16  	"github.com/containerd/console"
    17  	"github.com/moby/sys/user"
    18  	"github.com/opencontainers/runtime-spec/specs-go"
    19  	"github.com/sirupsen/logrus"
    20  	"github.com/vishvananda/netlink"
    21  	"golang.org/x/sys/unix"
    22  
    23  	"github.com/opencontainers/runc/libcontainer/capabilities"
    24  	"github.com/opencontainers/runc/libcontainer/cgroups"
    25  	"github.com/opencontainers/runc/libcontainer/configs"
    26  	"github.com/opencontainers/runc/libcontainer/system"
    27  	"github.com/opencontainers/runc/libcontainer/utils"
    28  )
    29  
// initType names the kind of init that "runc init" has to perform. In this
// file the value is read from the _LIBCONTAINER_INITTYPE environment variable
// and used by containerInit to pick the init implementation.
type initType string

const (
	// initSetns makes containerInit run linuxSetnsInit.
	initSetns    initType = "setns"
	// initStandard makes containerInit run linuxStandardInit; it is also the
	// only type for which _LIBCONTAINER_FIFOFD is read.
	initStandard initType = "standard"
)
    36  
// pid is a JSON-serializable pair of process IDs, encoded under the keys
// "stage2_pid" and "stage1_pid".
// NOTE(review): presumably these are the PIDs reported by the bootstrap
// stages that run before this code — confirm against the code that uses it.
type pid struct {
	Pid           int `json:"stage2_pid"`
	PidFirstChild int `json:"stage1_pid"`
}
    41  
// network is an internal struct used to setup container networks. It embeds
// configs.Network and is carried inside initConfig.Networks, one entry per
// network that setupNetwork initializes.
type network struct {
	configs.Network

	// TempVethPeerName is a unique temporary veth peer name that was placed into
	// the container's namespace.
	TempVethPeerName string `json:"temp_veth_peer_name"`
}
    50  
// initConfig is used for transferring parameters from Exec() to Init().
// startInitialization decodes it as JSON from the init pipe.
//
// Fields whose use is visible in this file:
//   - Env is loaded into the process environment by populateProcessEnvironment.
//   - Cwd, when non-empty, is the directory finalizeNamespace changes into.
//   - Capabilities, when non-nil, takes precedence over Config.Capabilities
//     in finalizeNamespace.
//   - User and AdditionalGroups are resolved by setupUser.
//   - Networks is iterated by setupNetwork.
//   - PassedFilesCount offsets the first fd that finalizeNamespace marks
//     close-on-exec (PassedFilesCount + 3).
//   - Rlimits is checked for RLIMIT_NOFILE by containerInit.
//   - ConsoleWidth and ConsoleHeight resize the new pty in setupConsole when
//     both are non-zero.
//   - RootlessEUID disables supplementary groups in setupUser.
type initConfig struct {
	Args             []string              `json:"args"`
	Env              []string              `json:"env"`
	Cwd              string                `json:"cwd"`
	Capabilities     *configs.Capabilities `json:"capabilities"`
	ProcessLabel     string                `json:"process_label"`
	AppArmorProfile  string                `json:"apparmor_profile"`
	NoNewPrivileges  bool                  `json:"no_new_privileges"`
	User             string                `json:"user"`
	AdditionalGroups []string              `json:"additional_groups"`
	Config           *configs.Config       `json:"config"`
	Networks         []*network            `json:"network"`
	PassedFilesCount int                   `json:"passed_files_count"`
	ContainerID      string                `json:"containerid"`
	Rlimits          []configs.Rlimit      `json:"rlimits"`
	CreateConsole    bool                  `json:"create_console"`
	ConsoleWidth     uint16                `json:"console_width"`
	ConsoleHeight    uint16                `json:"console_height"`
	RootlessEUID     bool                  `json:"rootless_euid,omitempty"`
	RootlessCgroups  bool                  `json:"rootless_cgroups,omitempty"`
	SpecState        *specs.State          `json:"spec_state,omitempty"`
	Cgroup2Path      string                `json:"cgroup2_path,omitempty"`
}
    75  
    76  // Init is part of "runc init" implementation.
    77  func Init() {
    78  	runtime.GOMAXPROCS(1)
    79  	runtime.LockOSThread()
    80  
    81  	if err := startInitialization(); err != nil {
    82  		// If the error is returned, it was not communicated
    83  		// back to the parent (which is not a common case),
    84  		// so print it to stderr here as a last resort.
    85  		//
    86  		// Do not use logrus as we are not sure if it has been
    87  		// set up yet, but most important, if the parent is
    88  		// alive (and its log forwarding is working).
    89  		fmt.Fprintln(os.Stderr, err)
    90  	}
    91  	// Normally, StartInitialization() never returns, meaning
    92  	// if we are here, it had failed.
    93  	os.Exit(255)
    94  }
    95  
    96  // Normally, this function does not return. If it returns, with or without an
    97  // error, it means the initialization has failed. If the error is returned,
    98  // it means the error can not be communicated back to the parent.
    99  func startInitialization() (retErr error) {
   100  	// Get the synchronisation pipe.
   101  	envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE")
   102  	syncPipeFd, err := strconv.Atoi(envSyncPipe)
   103  	if err != nil {
   104  		return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err)
   105  	}
   106  	syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync"))
   107  	defer syncPipe.Close()
   108  
   109  	defer func() {
   110  		// If this defer is ever called, this means initialization has failed.
   111  		// Send the error back to the parent process in the form of an initError
   112  		// if the sync socket has not been closed.
   113  		if syncPipe.isClosed() {
   114  			return
   115  		}
   116  		ierr := initError{Message: retErr.Error()}
   117  		if err := writeSyncArg(syncPipe, procError, ierr); err != nil {
   118  			fmt.Fprintln(os.Stderr, err)
   119  			return
   120  		}
   121  		// The error is sent, no need to also return it (or it will be reported twice).
   122  		retErr = nil
   123  	}()
   124  
   125  	// Get the INITPIPE.
   126  	envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
   127  	initPipeFd, err := strconv.Atoi(envInitPipe)
   128  	if err != nil {
   129  		return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
   130  	}
   131  	initPipe := os.NewFile(uintptr(initPipeFd), "init")
   132  	defer initPipe.Close()
   133  
   134  	// Set up logging. This is used rarely, and mostly for init debugging.
   135  
   136  	// Passing log level is optional; currently libcontainer/integration does not do it.
   137  	if levelStr := os.Getenv("_LIBCONTAINER_LOGLEVEL"); levelStr != "" {
   138  		logLevel, err := strconv.Atoi(levelStr)
   139  		if err != nil {
   140  			return fmt.Errorf("unable to convert _LIBCONTAINER_LOGLEVEL: %w", err)
   141  		}
   142  		logrus.SetLevel(logrus.Level(logLevel))
   143  	}
   144  
   145  	logFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE"))
   146  	if err != nil {
   147  		return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
   148  	}
   149  	logPipe := os.NewFile(uintptr(logFd), "logpipe")
   150  
   151  	logrus.SetOutput(logPipe)
   152  	logrus.SetFormatter(new(logrus.JSONFormatter))
   153  	logrus.Debug("child process in init()")
   154  
   155  	// Only init processes have FIFOFD.
   156  	var fifoFile *os.File
   157  	envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
   158  	it := initType(envInitType)
   159  	if it == initStandard {
   160  		fifoFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_FIFOFD"))
   161  		if err != nil {
   162  			return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err)
   163  		}
   164  		fifoFile = os.NewFile(uintptr(fifoFd), "initfifo")
   165  	}
   166  
   167  	var consoleSocket *os.File
   168  	if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
   169  		console, err := strconv.Atoi(envConsole)
   170  		if err != nil {
   171  			return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err)
   172  		}
   173  		consoleSocket = os.NewFile(uintptr(console), "console-socket")
   174  		defer consoleSocket.Close()
   175  	}
   176  
   177  	var pidfdSocket *os.File
   178  	if envSockFd := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envSockFd != "" {
   179  		sockFd, err := strconv.Atoi(envSockFd)
   180  		if err != nil {
   181  			return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err)
   182  		}
   183  		pidfdSocket = os.NewFile(uintptr(sockFd), "pidfd-socket")
   184  		defer pidfdSocket.Close()
   185  	}
   186  
   187  	// Get runc-dmz fds.
   188  	var dmzExe *os.File
   189  	if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" {
   190  		dmzFd, err := strconv.Atoi(dmzFdStr)
   191  		if err != nil {
   192  			return fmt.Errorf("unable to convert _LIBCONTAINER_DMZEXEFD: %w", err)
   193  		}
   194  		unix.CloseOnExec(dmzFd)
   195  		dmzExe = os.NewFile(uintptr(dmzFd), "runc-dmz")
   196  	}
   197  
   198  	// clear the current process's environment to clean any libcontainer
   199  	// specific env vars.
   200  	os.Clearenv()
   201  
   202  	defer func() {
   203  		if err := recover(); err != nil {
   204  			if err2, ok := err.(error); ok {
   205  				retErr = fmt.Errorf("panic from initialization: %w, %s", err2, debug.Stack())
   206  			} else {
   207  				retErr = fmt.Errorf("panic from initialization: %v, %s", err, debug.Stack())
   208  			}
   209  		}
   210  	}()
   211  
   212  	var config initConfig
   213  	if err := json.NewDecoder(initPipe).Decode(&config); err != nil {
   214  		return err
   215  	}
   216  
   217  	// If init succeeds, it will not return, hence none of the defers will be called.
   218  	return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe, dmzExe)
   219  }
   220  
   221  func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket, fifoFile, logPipe, dmzExe *os.File) error {
   222  	if err := populateProcessEnvironment(config.Env); err != nil {
   223  		return err
   224  	}
   225  
   226  	// Clean the RLIMIT_NOFILE cache in go runtime.
   227  	// Issue: https://github.com/opencontainers/runc/issues/4195
   228  	if containsRlimit(config.Rlimits, unix.RLIMIT_NOFILE) {
   229  		system.ClearRlimitNofileCache()
   230  	}
   231  
   232  	switch t {
   233  	case initSetns:
   234  		i := &linuxSetnsInit{
   235  			pipe:          pipe,
   236  			consoleSocket: consoleSocket,
   237  			pidfdSocket:   pidfdSocket,
   238  			config:        config,
   239  			logPipe:       logPipe,
   240  			dmzExe:        dmzExe,
   241  		}
   242  		return i.Init()
   243  	case initStandard:
   244  		i := &linuxStandardInit{
   245  			pipe:          pipe,
   246  			consoleSocket: consoleSocket,
   247  			pidfdSocket:   pidfdSocket,
   248  			parentPid:     unix.Getppid(),
   249  			config:        config,
   250  			fifoFile:      fifoFile,
   251  			logPipe:       logPipe,
   252  			dmzExe:        dmzExe,
   253  		}
   254  		return i.Init()
   255  	}
   256  	return fmt.Errorf("unknown init type %q", t)
   257  }
   258  
   259  // populateProcessEnvironment loads the provided environment variables into the
   260  // current processes's environment.
   261  func populateProcessEnvironment(env []string) error {
   262  	for _, pair := range env {
   263  		p := strings.SplitN(pair, "=", 2)
   264  		if len(p) < 2 {
   265  			return errors.New("invalid environment variable: missing '='")
   266  		}
   267  		name, val := p[0], p[1]
   268  		if name == "" {
   269  			return errors.New("invalid environment variable: name cannot be empty")
   270  		}
   271  		if strings.IndexByte(name, 0) >= 0 {
   272  			return fmt.Errorf("invalid environment variable %q: name contains nul byte (\\x00)", name)
   273  		}
   274  		if strings.IndexByte(val, 0) >= 0 {
   275  			return fmt.Errorf("invalid environment variable %q: value contains nul byte (\\x00)", name)
   276  		}
   277  		if err := os.Setenv(name, val); err != nil {
   278  			return err
   279  		}
   280  	}
   281  	return nil
   282  }
   283  
   284  // verifyCwd ensures that the current directory is actually inside the mount
   285  // namespace root of the current process.
   286  func verifyCwd() error {
   287  	// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
   288  	// current mount namespace root, and in that case prefixes "(unreachable)"
   289  	// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
   290  	// when this happens and return ENOENT rather than returning a non-absolute
   291  	// path. In both cases we can therefore easily detect if we have an invalid
   292  	// cwd by checking the return value of getcwd(3). See getcwd(3) for more
   293  	// details, and CVE-2024-21626 for the security issue that motivated this
   294  	// check.
   295  	//
   296  	// We have to use unix.Getwd() here because os.Getwd() has a workaround for
   297  	// $PWD which involves doing stat(.), which can fail if the current
   298  	// directory is inaccessible to the container process.
   299  	if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
   300  		return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
   301  	} else if err != nil {
   302  		return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
   303  	} else if !filepath.IsAbs(wd) {
   304  		// We shouldn't ever hit this, but check just in case.
   305  		return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
   306  	}
   307  	return nil
   308  }
   309  
   310  // finalizeNamespace drops the caps, sets the correct user
   311  // and working dir, and closes any leaked file descriptors
   312  // before executing the command inside the namespace
   313  func finalizeNamespace(config *initConfig) error {
   314  	// Ensure that all unwanted fds we may have accidentally
   315  	// inherited are marked close-on-exec so they stay out of the
   316  	// container
   317  	if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
   318  		return fmt.Errorf("error closing exec fds: %w", err)
   319  	}
   320  
   321  	// we only do chdir if it's specified
   322  	doChdir := config.Cwd != ""
   323  	if doChdir {
   324  		// First, attempt the chdir before setting up the user.
   325  		// This could allow us to access a directory that the user running runc can access
   326  		// but the container user cannot.
   327  		err := unix.Chdir(config.Cwd)
   328  		switch {
   329  		case err == nil:
   330  			doChdir = false
   331  		case os.IsPermission(err):
   332  			// If we hit an EPERM, we should attempt again after setting up user.
   333  			// This will allow us to successfully chdir if the container user has access
   334  			// to the directory, but the user running runc does not.
   335  			// This is useful in cases where the cwd is also a volume that's been chowned to the container user.
   336  		default:
   337  			return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
   338  		}
   339  	}
   340  
   341  	caps := &configs.Capabilities{}
   342  	if config.Capabilities != nil {
   343  		caps = config.Capabilities
   344  	} else if config.Config.Capabilities != nil {
   345  		caps = config.Config.Capabilities
   346  	}
   347  	w, err := capabilities.New(caps)
   348  	if err != nil {
   349  		return err
   350  	}
   351  	// drop capabilities in bounding set before changing user
   352  	if err := w.ApplyBoundingSet(); err != nil {
   353  		return fmt.Errorf("unable to apply bounding set: %w", err)
   354  	}
   355  	// preserve existing capabilities while we change users
   356  	if err := system.SetKeepCaps(); err != nil {
   357  		return fmt.Errorf("unable to set keep caps: %w", err)
   358  	}
   359  	if err := setupUser(config); err != nil {
   360  		return fmt.Errorf("unable to setup user: %w", err)
   361  	}
   362  	// Change working directory AFTER the user has been set up, if we haven't done it yet.
   363  	if doChdir {
   364  		if err := unix.Chdir(config.Cwd); err != nil {
   365  			return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
   366  		}
   367  	}
   368  	// Make sure our final working directory is inside the container.
   369  	if err := verifyCwd(); err != nil {
   370  		return err
   371  	}
   372  	if err := system.ClearKeepCaps(); err != nil {
   373  		return fmt.Errorf("unable to clear keep caps: %w", err)
   374  	}
   375  	if err := w.ApplyCaps(); err != nil {
   376  		return fmt.Errorf("unable to apply caps: %w", err)
   377  	}
   378  	return nil
   379  }
   380  
   381  // setupConsole sets up the console from inside the container, and sends the
   382  // master pty fd to the config.Pipe (using cmsg). This is done to ensure that
   383  // consoles are scoped to a container properly (see runc#814 and the many
   384  // issues related to that). This has to be run *after* we've pivoted to the new
   385  // rootfs (and the users' configuration is entirely set up).
   386  func setupConsole(socket *os.File, config *initConfig, mount bool) error {
   387  	defer socket.Close()
   388  	// At this point, /dev/ptmx points to something that we would expect. We
   389  	// used to change the owner of the slave path, but since the /dev/pts mount
   390  	// can have gid=X set (at the users' option). So touching the owner of the
   391  	// slave PTY is not necessary, as the kernel will handle that for us. Note
   392  	// however, that setupUser (specifically fixStdioPermissions) *will* change
   393  	// the UID owner of the console to be the user the process will run as (so
   394  	// they can actually control their console).
   395  
   396  	pty, slavePath, err := console.NewPty()
   397  	if err != nil {
   398  		return err
   399  	}
   400  	// After we return from here, we don't need the console anymore.
   401  	defer pty.Close()
   402  
   403  	if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
   404  		err = pty.Resize(console.WinSize{
   405  			Height: config.ConsoleHeight,
   406  			Width:  config.ConsoleWidth,
   407  		})
   408  		if err != nil {
   409  			return err
   410  		}
   411  	}
   412  
   413  	// Mount the console inside our rootfs.
   414  	if mount {
   415  		if err := mountConsole(slavePath); err != nil {
   416  			return err
   417  		}
   418  	}
   419  	// While we can access console.master, using the API is a good idea.
   420  	if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil {
   421  		return err
   422  	}
   423  	runtime.KeepAlive(pty)
   424  
   425  	// Now, dup over all the things.
   426  	return dupStdio(slavePath)
   427  }
   428  
   429  // syncParentReady sends to the given pipe a JSON payload which indicates that
   430  // the init is ready to Exec the child process. It then waits for the parent to
   431  // indicate that it is cleared to Exec.
   432  func syncParentReady(pipe *syncSocket) error {
   433  	// Tell parent.
   434  	if err := writeSync(pipe, procReady); err != nil {
   435  		return err
   436  	}
   437  	// Wait for parent to give the all-clear.
   438  	return readSync(pipe, procRun)
   439  }
   440  
   441  // syncParentHooks sends to the given pipe a JSON payload which indicates that
   442  // the parent should execute pre-start hooks. It then waits for the parent to
   443  // indicate that it is cleared to resume.
   444  func syncParentHooks(pipe *syncSocket) error {
   445  	// Tell parent.
   446  	if err := writeSync(pipe, procHooks); err != nil {
   447  		return err
   448  	}
   449  	// Wait for parent to give the all-clear.
   450  	return readSync(pipe, procHooksDone)
   451  }
   452  
   453  // syncParentSeccomp sends the fd associated with the seccomp file descriptor
   454  // to the parent, and wait for the parent to do pidfd_getfd() to grab a copy.
   455  func syncParentSeccomp(pipe *syncSocket, seccompFd *os.File) error {
   456  	if seccompFd == nil {
   457  		return nil
   458  	}
   459  	defer seccompFd.Close()
   460  
   461  	// Tell parent to grab our fd.
   462  	//
   463  	// Notably, we do not use writeSyncFile here because a container might have
   464  	// an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest
   465  	// possible number of system calls here because all of those syscalls
   466  	// cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here
   467  	// before the parent gets the file descriptor would deadlock "runc init" if
   468  	// we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more
   469  	// details.
   470  	if err := writeSyncArg(pipe, procSeccomp, seccompFd.Fd()); err != nil {
   471  		return err
   472  	}
   473  	// Wait for parent to tell us they've grabbed the seccompfd.
   474  	return readSync(pipe, procSeccompDone)
   475  }
   476  
   477  // setupUser changes the groups, gid, and uid for the user inside the container
   478  func setupUser(config *initConfig) error {
   479  	// Set up defaults.
   480  	defaultExecUser := user.ExecUser{
   481  		Uid:  0,
   482  		Gid:  0,
   483  		Home: "/",
   484  	}
   485  
   486  	passwdPath, err := user.GetPasswdPath()
   487  	if err != nil {
   488  		return err
   489  	}
   490  
   491  	groupPath, err := user.GetGroupPath()
   492  	if err != nil {
   493  		return err
   494  	}
   495  
   496  	execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
   497  	if err != nil {
   498  		return err
   499  	}
   500  
   501  	var addGroups []int
   502  	if len(config.AdditionalGroups) > 0 {
   503  		addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
   504  		if err != nil {
   505  			return err
   506  		}
   507  	}
   508  
   509  	if config.RootlessEUID {
   510  		// We cannot set any additional groups in a rootless container and thus
   511  		// we bail if the user asked us to do so. TODO: We currently can't do
   512  		// this check earlier, but if libcontainer.Process.User was typesafe
   513  		// this might work.
   514  		if len(addGroups) > 0 {
   515  			return errors.New("cannot set any additional groups in a rootless container")
   516  		}
   517  	}
   518  
   519  	// Before we change to the container's user make sure that the processes
   520  	// STDIO is correctly owned by the user that we are switching to.
   521  	if err := fixStdioPermissions(execUser); err != nil {
   522  		return err
   523  	}
   524  
   525  	// We don't need to use /proc/thread-self here because setgroups is a
   526  	// per-userns file and thus is global to all threads in a thread-group.
   527  	// This lets us avoid having to do runtime.LockOSThread.
   528  	setgroups, err := os.ReadFile("/proc/self/setgroups")
   529  	if err != nil && !os.IsNotExist(err) {
   530  		return err
   531  	}
   532  
   533  	// This isn't allowed in an unprivileged user namespace since Linux 3.19.
   534  	// There's nothing we can do about /etc/group entries, so we silently
   535  	// ignore setting groups here (since the user didn't explicitly ask us to
   536  	// set the group).
   537  	allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny"
   538  
   539  	if allowSupGroups {
   540  		suppGroups := append(execUser.Sgids, addGroups...)
   541  		if err := unix.Setgroups(suppGroups); err != nil {
   542  			return &os.SyscallError{Syscall: "setgroups", Err: err}
   543  		}
   544  	}
   545  
   546  	if err := unix.Setgid(execUser.Gid); err != nil {
   547  		if err == unix.EINVAL {
   548  			return fmt.Errorf("cannot setgid to unmapped gid %d in user namespace", execUser.Gid)
   549  		}
   550  		return err
   551  	}
   552  	if err := unix.Setuid(execUser.Uid); err != nil {
   553  		if err == unix.EINVAL {
   554  			return fmt.Errorf("cannot setuid to unmapped uid %d in user namespace", execUser.Uid)
   555  		}
   556  		return err
   557  	}
   558  
   559  	// if we didn't get HOME already, set it based on the user's HOME
   560  	if envHome := os.Getenv("HOME"); envHome == "" {
   561  		if err := os.Setenv("HOME", execUser.Home); err != nil {
   562  			return err
   563  		}
   564  	}
   565  	return nil
   566  }
   567  
   568  // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
   569  // The ownership needs to match because it is created outside of the container and needs to be
   570  // localized.
   571  func fixStdioPermissions(u *user.ExecUser) error {
   572  	var null unix.Stat_t
   573  	if err := unix.Stat("/dev/null", &null); err != nil {
   574  		return &os.PathError{Op: "stat", Path: "/dev/null", Err: err}
   575  	}
   576  	for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
   577  		var s unix.Stat_t
   578  		if err := unix.Fstat(int(file.Fd()), &s); err != nil {
   579  			return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
   580  		}
   581  
   582  		// Skip chown if uid is already the one we want or any of the STDIO descriptors
   583  		// were redirected to /dev/null.
   584  		if int(s.Uid) == u.Uid || s.Rdev == null.Rdev {
   585  			continue
   586  		}
   587  
   588  		// We only change the uid (as it is possible for the mount to
   589  		// prefer a different gid, and there's no reason for us to change it).
   590  		// The reason why we don't just leave the default uid=X mount setup is
   591  		// that users expect to be able to actually use their console. Without
   592  		// this code, you couldn't effectively run as a non-root user inside a
   593  		// container and also have a console set up.
   594  		if err := file.Chown(u.Uid, int(s.Gid)); err != nil {
   595  			// If we've hit an EINVAL then s.Gid isn't mapped in the user
   596  			// namespace. If we've hit an EPERM then the inode's current owner
   597  			// is not mapped in our user namespace (in particular,
   598  			// privileged_wrt_inode_uidgid() has failed). Read-only
   599  			// /dev can result in EROFS error. In any case, it's
   600  			// better for us to just not touch the stdio rather
   601  			// than bail at this point.
   602  
   603  			if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
   604  				continue
   605  			}
   606  			return err
   607  		}
   608  	}
   609  	return nil
   610  }
   611  
   612  // setupNetwork sets up and initializes any network interface inside the container.
   613  func setupNetwork(config *initConfig) error {
   614  	for _, config := range config.Networks {
   615  		strategy, err := getStrategy(config.Type)
   616  		if err != nil {
   617  			return err
   618  		}
   619  		if err := strategy.initialize(config); err != nil {
   620  			return err
   621  		}
   622  	}
   623  	return nil
   624  }
   625  
   626  func setupRoute(config *configs.Config) error {
   627  	for _, config := range config.Routes {
   628  		_, dst, err := net.ParseCIDR(config.Destination)
   629  		if err != nil {
   630  			return err
   631  		}
   632  		src := net.ParseIP(config.Source)
   633  		if src == nil {
   634  			return fmt.Errorf("Invalid source for route: %s", config.Source)
   635  		}
   636  		gw := net.ParseIP(config.Gateway)
   637  		if gw == nil {
   638  			return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
   639  		}
   640  		l, err := netlink.LinkByName(config.InterfaceName)
   641  		if err != nil {
   642  			return err
   643  		}
   644  		route := &netlink.Route{
   645  			Scope:     netlink.SCOPE_UNIVERSE,
   646  			Dst:       dst,
   647  			Src:       src,
   648  			Gw:        gw,
   649  			LinkIndex: l.Attrs().Index,
   650  		}
   651  		if err := netlink.RouteAdd(route); err != nil {
   652  			return err
   653  		}
   654  	}
   655  	return nil
   656  }
   657  
   658  func containsRlimit(limits []configs.Rlimit, resource int) bool {
   659  	for _, rlimit := range limits {
   660  		if rlimit.Type == resource {
   661  			return true
   662  		}
   663  	}
   664  	return false
   665  }
   666  
   667  func setupRlimits(limits []configs.Rlimit, pid int) error {
   668  	for _, rlimit := range limits {
   669  		if err := unix.Prlimit(pid, rlimit.Type, &unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}, nil); err != nil {
   670  			return fmt.Errorf("error setting rlimit type %v: %w", rlimit.Type, err)
   671  		}
   672  	}
   673  	return nil
   674  }
   675  
   676  func setupScheduler(config *configs.Config) error {
   677  	attr, err := configs.ToSchedAttr(config.Scheduler)
   678  	if err != nil {
   679  		return err
   680  	}
   681  	if err := unix.SchedSetAttr(0, attr, 0); err != nil {
   682  		if errors.Is(err, unix.EPERM) && config.Cgroups.CpusetCpus != "" {
   683  			return errors.New("process scheduler can't be used together with AllowedCPUs")
   684  		}
   685  		return fmt.Errorf("error setting scheduler: %w", err)
   686  	}
   687  	return nil
   688  }
   689  
   690  func setupPersonality(config *configs.Config) error {
   691  	return system.SetLinuxPersonality(config.Personality.Domain)
   692  }
   693  
   694  // signalAllProcesses freezes then iterates over all the processes inside the
   695  // manager's cgroups sending the signal s to them.
   696  func signalAllProcesses(m cgroups.Manager, s unix.Signal) error {
   697  	if !m.Exists() {
   698  		return ErrNotRunning
   699  	}
   700  	// Use cgroup.kill, if available.
   701  	if s == unix.SIGKILL {
   702  		if p := m.Path(""); p != "" { // Either cgroup v2 or hybrid.
   703  			err := cgroups.WriteFile(p, "cgroup.kill", "1")
   704  			if err == nil || !errors.Is(err, os.ErrNotExist) {
   705  				return err
   706  			}
   707  			// Fallback to old implementation.
   708  		}
   709  	}
   710  
   711  	if err := m.Freeze(configs.Frozen); err != nil {
   712  		logrus.Warn(err)
   713  	}
   714  	pids, err := m.GetAllPids()
   715  	if err != nil {
   716  		if err := m.Freeze(configs.Thawed); err != nil {
   717  			logrus.Warn(err)
   718  		}
   719  		return err
   720  	}
   721  	for _, pid := range pids {
   722  		err := unix.Kill(pid, s)
   723  		if err != nil && err != unix.ESRCH {
   724  			logrus.Warnf("kill %d: %v", pid, err)
   725  		}
   726  	}
   727  	if err := m.Freeze(configs.Thawed); err != nil {
   728  		logrus.Warn(err)
   729  	}
   730  
   731  	return nil
   732  }
   733  
   734  // setupPidfd opens a process file descriptor of init process, and sends the
   735  // file descriptor back to the socket.
   736  func setupPidfd(socket *os.File, initType string) error {
   737  	defer socket.Close()
   738  
   739  	pidFd, err := unix.PidfdOpen(os.Getpid(), 0)
   740  	if err != nil {
   741  		return fmt.Errorf("failed to pidfd_open: %w", err)
   742  	}
   743  
   744  	if err := utils.SendRawFd(socket, initType, uintptr(pidFd)); err != nil {
   745  		unix.Close(pidFd)
   746  		return fmt.Errorf("failed to send pidfd on socket: %w", err)
   747  	}
   748  	return unix.Close(pidFd)
   749  }