github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/sandbox/sandbox.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package sandbox creates and manipulates sandboxes.
    16  package sandbox
    17  
    18  import (
    19  	"context"
    20  	"encoding/json"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"math"
    25  	"os"
    26  	"os/exec"
    27  	"path/filepath"
    28  	"strconv"
    29  	"strings"
    30  	"syscall"
    31  	"time"
    32  
    33  	"github.com/cenkalti/backoff"
    34  	specs "github.com/opencontainers/runtime-spec/specs-go"
    35  	"github.com/syndtr/gocapability/capability"
    36  	"golang.org/x/sys/unix"
    37  	"github.com/metacubex/gvisor/pkg/abi/linux"
    38  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    39  	"github.com/metacubex/gvisor/pkg/cleanup"
    40  	"github.com/metacubex/gvisor/pkg/control/client"
    41  	"github.com/metacubex/gvisor/pkg/control/server"
    42  	"github.com/metacubex/gvisor/pkg/coverage"
    43  	"github.com/metacubex/gvisor/pkg/log"
    44  	metricpb "github.com/metacubex/gvisor/pkg/metric/metric_go_proto"
    45  	"github.com/metacubex/gvisor/pkg/prometheus"
    46  	"github.com/metacubex/gvisor/pkg/sentry/control"
    47  	"github.com/metacubex/gvisor/pkg/sentry/devices/nvproxy"
    48  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/erofs"
    49  	"github.com/metacubex/gvisor/pkg/sentry/platform"
    50  	"github.com/metacubex/gvisor/pkg/sentry/seccheck"
    51  	"github.com/metacubex/gvisor/pkg/state/statefile"
    52  	"github.com/metacubex/gvisor/pkg/sync"
    53  	"github.com/metacubex/gvisor/pkg/urpc"
    54  	"github.com/metacubex/gvisor/runsc/boot"
    55  	"github.com/metacubex/gvisor/runsc/boot/procfs"
    56  	"github.com/metacubex/gvisor/runsc/cgroup"
    57  	"github.com/metacubex/gvisor/runsc/config"
    58  	"github.com/metacubex/gvisor/runsc/console"
    59  	"github.com/metacubex/gvisor/runsc/donation"
    60  	"github.com/metacubex/gvisor/runsc/specutils"
    61  )
    62  
    63  const (
     64  	// podNameAnnotation is a pod annotation populated by containerd.
    65  	// It contains the name of the pod that a sandbox is in when running in Kubernetes.
    66  	podNameAnnotation = "io.kubernetes.cri.sandbox-name"
    67  
    68  	// namespaceAnnotation is a pod annotation populated by containerd.
    69  	// It contains the namespace of the pod that a sandbox is in when running in Kubernetes.
    70  	namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace"
    71  )
    72  
    73  // createControlSocket finds a location and creates the socket used to
    74  // communicate with the sandbox. The socket is a UDS on the host filesystem.
    75  //
    76  // Note that abstract sockets are *not* used, because any user can connect to
    77  // them. There is no file mode protecting abstract sockets.
    78  func createControlSocket(rootDir, id string) (string, int, error) {
    79  	name := fmt.Sprintf("runsc-%s.sock", id)
    80  
    81  	// Only use absolute paths to guarantee resolution from anywhere.
    82  	for _, dir := range []string{rootDir, "/var/run", "/run", "/tmp"} {
    83  		path := filepath.Join(dir, name)
    84  		log.Debugf("Attempting to create socket file %q", path)
    85  		fd, err := server.CreateSocket(path)
    86  		if err == nil {
    87  			log.Debugf("Using socket file %q", path)
    88  			return path, fd, nil
    89  		}
    90  		log.Debugf("Failed to create socket file %q: %v", path, err)
    91  	}
    92  	return "", -1, fmt.Errorf("unable to find location to write socket file")
    93  }
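
// Illustrative example (not part of the package): with rootDir
// "/var/run/runsc" and id "abc123", the candidate socket paths tried above
// would be, in order:
//
//	/var/run/runsc/runsc-abc123.sock
//	/var/run/runsc-abc123.sock
//	/run/runsc-abc123.sock
//	/tmp/runsc-abc123.sock
//
// Both values are made up here purely for illustration.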
    94  
    95  // pid is an atomic type that implements JSON marshal/unmarshal interfaces.
    96  type pid struct {
    97  	val atomicbitops.Int64
    98  }
    99  
   100  func (p *pid) store(pid int) {
   101  	p.val.Store(int64(pid))
   102  }
   103  
   104  func (p *pid) load() int {
   105  	return int(p.val.Load())
   106  }
   107  
   108  // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON.
   109  func (p *pid) UnmarshalJSON(b []byte) error {
   110  	var pid int
   111  
   112  	if err := json.Unmarshal(b, &pid); err != nil {
   113  		return err
   114  	}
   115  	p.store(pid)
   116  	return nil
   117  }
   118  
    119  	// MarshalJSON implements json.Marshaler.MarshalJSON.
   120  func (p *pid) MarshalJSON() ([]byte, error) {
   121  	return json.Marshal(p.load())
   122  }
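
// Minimal round-trip sketch for the pid type (illustrative only, not part of
// the package's code):
//
//	var p pid
//	p.store(1234)
//	data, _ := json.Marshal(&p) // data == []byte("1234")
//	var q pid
//	_ = json.Unmarshal(data, &q)
//	_ = q.load() // 1234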
   123  
   124  // Sandbox wraps a sandbox process.
   125  //
    126  // It is used to start/stop the sandbox process (and associated processes like
   127  // gofers), as well as for running and manipulating containers inside a running
   128  // sandbox.
   129  //
   130  // Note: Sandbox must be immutable because a copy of it is saved for each
   131  // container and changes would not be synchronized to all of them.
   132  type Sandbox struct {
   133  	// ID is the id of the sandbox (immutable). By convention, this is the same
   134  	// ID as the first container run in the sandbox.
   135  	ID string `json:"id"`
   136  
   137  	// PodName is the name of the Kubernetes Pod (if any) that this sandbox
   138  	// represents. Unset if not running under containerd or Kubernetes.
   139  	PodName string `json:"podName"`
   140  
   141  	// Namespace is the Kubernetes namespace (if any) of the pod that this
   142  	// sandbox represents. Unset if not running under containerd or Kubernetes.
   143  	Namespace string `json:"namespace"`
   144  
   145  	// Pid is the pid of the running sandbox. May be 0 if the sandbox
   146  	// is not running.
   147  	Pid pid `json:"pid"`
   148  
   149  	// UID is the user ID in the parent namespace that the sandbox is running as.
   150  	UID int `json:"uid"`
   151  	// GID is the group ID in the parent namespace that the sandbox is running as.
   152  	GID int `json:"gid"`
   153  
   154  	// CgroupJSON contains the cgroup configuration that the sandbox is part of
    155  	// and allows serialization of the configuration into JSON.
   156  	CgroupJSON cgroup.CgroupJSON `json:"cgroup"`
   157  
   158  	// OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox
   159  	// started, before it may be modified.
   160  	OriginalOOMScoreAdj int `json:"originalOomScoreAdj"`
   161  
   162  	// RegisteredMetrics is the set of metrics registered in the sandbox.
   163  	// Used for verifying metric data integrity after containers are started.
   164  	// Only populated if exporting metrics was requested when the sandbox was
   165  	// created.
   166  	RegisteredMetrics *metricpb.MetricRegistration `json:"registeredMetrics"`
   167  
   168  	// MetricMetadata are key-value pairs that are useful to export about this
   169  	// sandbox, but not part of the set of labels that uniquely identify it.
   170  	// They are static once initialized, and typically contain high-level
   171  	// configuration information about the sandbox.
   172  	MetricMetadata map[string]string `json:"metricMetadata"`
   173  
   174  	// MetricServerAddress is the address of the metric server that this sandbox
   175  	// intends to export metrics for.
   176  	// Only populated if exporting metrics was requested when the sandbox was
   177  	// created.
   178  	MetricServerAddress string `json:"metricServerAddress"`
   179  
   180  	// ControlSocketPath is the path to the sandbox's uRPC server socket.
   181  	// Connections to the sandbox are made through this.
   182  	ControlSocketPath string `json:"controlSocketPath"`
   183  
   184  	// MountHints provides extra information about container mounts that apply
   185  	// to the entire pod.
   186  	MountHints *boot.PodMountHints `json:"mountHints"`
   187  
   188  	// child is set if a sandbox process is a child of the current process.
   189  	//
    190  	// This field isn't saved to JSON, because only the creator of the
    191  	// sandbox will have it as a child process.
   192  	child bool `nojson:"true"`
   193  
   194  	// statusMu protects status.
   195  	statusMu sync.Mutex `nojson:"true"`
   196  
    197  	// status is the exit status of the sandbox process. It's only set if
    198  	// child==true and the sandbox was waited on. This field allows multiple
    199  	// threads to wait on the sandbox and get the exit code, since Linux
    200  	// returns the WaitStatus to only one of the waiters.
   201  	status unix.WaitStatus `nojson:"true"`
   202  }
   203  
   204  // Getpid returns the process ID of the sandbox process.
   205  func (s *Sandbox) Getpid() int {
   206  	return s.Pid.load()
   207  }
   208  
   209  // Args is used to configure a new sandbox.
   210  type Args struct {
   211  	// ID is the sandbox unique identifier.
   212  	ID string
   213  
   214  	// Spec is the OCI spec that describes the container.
   215  	Spec *specs.Spec
   216  
   217  	// BundleDir is the directory containing the container bundle.
   218  	BundleDir string
   219  
   220  	// ConsoleSocket is the path to a unix domain socket that will receive
   221  	// the console FD. It may be empty.
   222  	ConsoleSocket string
   223  
   224  	// UserLog is the filename to send user-visible logs to. It may be empty.
   225  	UserLog string
   226  
   227  	// IOFiles is the list of image files and/or socket files that connect to
   228  	// a gofer endpoint for the mount points using Gofers. They must be in the
   229  	// same order as mounts appear in the spec.
   230  	IOFiles []*os.File
   231  
    232  	// DevIOFile is the file that connects to a gofer endpoint for the device mount point at /dev.
   233  	DevIOFile *os.File
   234  
   235  	// GoferFilestoreFiles are the regular files that will back the overlayfs or
   236  	// tmpfs mount if a gofer mount is to be overlaid.
   237  	GoferFilestoreFiles []*os.File
   238  
   239  	// GoferMountConfs contains information about how the gofer mounts have been
   240  	// configured. The first entry is for rootfs and the following entries are
   241  	// for bind mounts in Spec.Mounts (in the same order).
   242  	GoferMountConfs boot.GoferMountConfFlags
   243  
    244  	// MountHints provides extra information about container mounts that apply
   245  	// to the entire pod.
   246  	MountHints *boot.PodMountHints
   247  
    248  	// MountsFile is a file containing mount information from the spec. It's
   249  	// equivalent to the mounts from the spec, except that all paths have been
   250  	// resolved to their final absolute location.
   251  	MountsFile *os.File
   252  
    253  	// Cgroup is the cgroup that the sandbox is part of.
   254  	Cgroup cgroup.Cgroup
   255  
    256  	// Attached indicates that the sandbox lifecycle is attached to the caller.
   257  	// If the caller exits, the sandbox should exit too.
   258  	Attached bool
   259  
    260  	// SinkFiles is an ordered array of files to be used by seccheck sinks
   261  	// configured from the --pod-init-config file.
   262  	SinkFiles []*os.File
   263  
   264  	// PassFiles are user-supplied files from the host to be exposed to the
   265  	// sandboxed app.
   266  	PassFiles map[int]*os.File
   267  
   268  	// ExecFile is the file from the host used for program execution.
   269  	ExecFile *os.File
   270  }
   271  
   272  // New creates the sandbox process. The caller must call Destroy() on the
   273  // sandbox.
   274  func New(conf *config.Config, args *Args) (*Sandbox, error) {
   275  	s := &Sandbox{
   276  		ID: args.ID,
   277  		CgroupJSON: cgroup.CgroupJSON{
   278  			Cgroup: args.Cgroup,
   279  		},
   280  		UID:                 -1, // prevent usage before it's set.
   281  		GID:                 -1, // prevent usage before it's set.
   282  		MetricMetadata:      conf.MetricMetadata(),
   283  		MetricServerAddress: conf.MetricServer,
   284  		MountHints:          args.MountHints,
   285  	}
   286  	if args.Spec != nil && args.Spec.Annotations != nil {
   287  		s.PodName = args.Spec.Annotations[podNameAnnotation]
   288  		s.Namespace = args.Spec.Annotations[namespaceAnnotation]
   289  	}
   290  
   291  	// The Cleanup object cleans up partially created sandboxes when an error
   292  	// occurs. Any errors occurring during cleanup itself are ignored.
   293  	c := cleanup.Make(func() {
   294  		if err := s.destroy(); err != nil {
   295  			log.Warningf("error destroying sandbox: %v", err)
   296  		}
   297  	})
   298  	defer c.Clean()
   299  
   300  	if len(conf.PodInitConfig) > 0 {
   301  		initConf, err := boot.LoadInitConfig(conf.PodInitConfig)
   302  		if err != nil {
   303  			return nil, fmt.Errorf("loading init config file: %w", err)
   304  		}
   305  		args.SinkFiles, err = initConf.Setup()
   306  		if err != nil {
   307  			return nil, fmt.Errorf("cannot init config: %w", err)
   308  		}
   309  	}
   310  
   311  	// Create pipe to synchronize when sandbox process has been booted.
   312  	clientSyncFile, sandboxSyncFile, err := os.Pipe()
   313  	if err != nil {
   314  		return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err)
   315  	}
   316  	defer clientSyncFile.Close()
   317  
   318  	// Create the sandbox process.
   319  	err = s.createSandboxProcess(conf, args, sandboxSyncFile)
   320  	// sandboxSyncFile has to be closed to be able to detect when the sandbox
   321  	// process exits unexpectedly.
   322  	sandboxSyncFile.Close()
   323  	if err != nil {
   324  		return nil, fmt.Errorf("cannot create sandbox process: %w", err)
   325  	}
   326  
   327  	// Wait until the sandbox has booted.
   328  	b := make([]byte, 1)
   329  	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
   330  		err := fmt.Errorf("waiting for sandbox to start: %v", err)
   331  		// If the sandbox failed to start, it may be because the binary
   332  		// permissions were incorrect. Check the bits and return a more helpful
   333  		// error message.
   334  		//
   335  		// NOTE: The error message is checked because error types are lost over
   336  		// rpc calls.
   337  		if strings.Contains(err.Error(), io.EOF.Error()) {
   338  			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
   339  				return nil, fmt.Errorf("%v: %v", err, permsErr)
   340  			}
   341  		}
   342  		return nil, fmt.Errorf("cannot read client sync file: %w", err)
   343  	}
   344  
   345  	if conf.MetricServer != "" {
   346  		// The control server is up and the sandbox was configured to export metrics.
   347  		// We must gather data about registered metrics prior to any process starting in the sandbox.
   348  		log.Debugf("Getting metric registration information from sandbox %q", s.ID)
   349  		var registeredMetrics control.MetricsRegistrationResponse
   350  		if err := s.call(boot.MetricsGetRegistered, nil, &registeredMetrics); err != nil {
   351  			return nil, fmt.Errorf("cannot get registered metrics: %v", err)
   352  		}
   353  		s.RegisteredMetrics = registeredMetrics.RegisteredMetrics
   354  	}
   355  
   356  	c.Release()
   357  	return s, nil
   358  }
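
// Typical call sequence from a caller's perspective (sketch; error handling
// elided and the surrounding variable names are assumed):
//
//	sb, err := New(conf, &Args{ID: cid, Spec: spec, BundleDir: bundleDir})
//	if err != nil { ... }
//	if err := sb.StartRoot(conf); err != nil { ... }
//	ws, err := sb.Wait(cid)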
   359  
   360  // CreateSubcontainer creates a container inside the sandbox.
   361  func (s *Sandbox) CreateSubcontainer(conf *config.Config, cid string, tty *os.File) error {
   362  	log.Debugf("Create sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
   363  
   364  	var files []*os.File
   365  	if tty != nil {
   366  		files = []*os.File{tty}
   367  	}
   368  	if err := s.configureStdios(conf, files); err != nil {
   369  		return err
   370  	}
   371  
   372  	args := boot.CreateArgs{
   373  		CID:         cid,
   374  		FilePayload: urpc.FilePayload{Files: files},
   375  	}
   376  	if err := s.call(boot.ContMgrCreateSubcontainer, &args, nil); err != nil {
   377  		return fmt.Errorf("creating sub-container %q: %w", cid, err)
   378  	}
   379  	return nil
   380  }
   381  
   382  // StartRoot starts running the root container process inside the sandbox.
   383  func (s *Sandbox) StartRoot(conf *config.Config) error {
   384  	pid := s.Pid.load()
   385  	log.Debugf("Start root sandbox %q, PID: %d", s.ID, pid)
   386  	conn, err := s.sandboxConnect()
   387  	if err != nil {
   388  		return err
   389  	}
   390  	defer conn.Close()
   391  
   392  	// Configure the network.
   393  	if err := setupNetwork(conn, pid, conf); err != nil {
   394  		return fmt.Errorf("setting up network: %w", err)
   395  	}
   396  
   397  	// Send a message to the sandbox control server to start the root container.
   398  	if err := conn.Call(boot.ContMgrRootContainerStart, &s.ID, nil); err != nil {
   399  		return fmt.Errorf("starting root container: %w", err)
   400  	}
   401  
   402  	return nil
   403  }
   404  
   405  // StartSubcontainer starts running a sub-container inside the sandbox.
   406  func (s *Sandbox) StartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestores []*os.File, devIOFile *os.File, goferConfs []boot.GoferMountConf) error {
   407  	log.Debugf("Start sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
   408  
   409  	if err := s.configureStdios(conf, stdios); err != nil {
   410  		return err
   411  	}
   412  	s.fixPidns(spec)
   413  
   414  	// The payload contains (in this specific order):
   415  	// * stdin/stdout/stderr (optional: only present when not using TTY)
   416  	// * The subcontainer's gofer filestore files (optional)
   417  	// * The subcontainer's dev gofer file (optional)
   418  	// * Gofer files.
   419  	payload := urpc.FilePayload{}
   420  	payload.Files = append(payload.Files, stdios...)
   421  	payload.Files = append(payload.Files, goferFilestores...)
   422  	if devIOFile != nil {
   423  		payload.Files = append(payload.Files, devIOFile)
   424  	}
   425  	payload.Files = append(payload.Files, goferFiles...)
   426  
   427  	// Start running the container.
   428  	args := boot.StartArgs{
   429  		Spec:                 spec,
   430  		Conf:                 conf,
   431  		CID:                  cid,
   432  		NumGoferFilestoreFDs: len(goferFilestores),
   433  		IsDevIoFilePresent:   devIOFile != nil,
   434  		GoferMountConfs:      goferConfs,
   435  		FilePayload:          payload,
   436  	}
   437  	if err := s.call(boot.ContMgrStartSubcontainer, &args, nil); err != nil {
   438  		return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err)
   439  	}
   440  	return nil
   441  }
   442  
   443  // Restore sends the restore call for a container in the sandbox.
   444  func (s *Sandbox) Restore(conf *config.Config, cid string, filename string) error {
   445  	log.Debugf("Restore sandbox %q", s.ID)
   446  
   447  	rf, err := os.Open(filename)
   448  	if err != nil {
   449  		return fmt.Errorf("opening restore file %q failed: %v", filename, err)
   450  	}
   451  	defer rf.Close()
   452  
   453  	opt := boot.RestoreOpts{
   454  		FilePayload: urpc.FilePayload{
   455  			Files: []*os.File{rf},
   456  		},
   457  		SandboxID: s.ID,
   458  	}
   459  
   460  	// If the platform needs a device FD we must pass it in.
   461  	if deviceFile, err := deviceFileForPlatform(conf.Platform, conf.PlatformDevicePath); err != nil {
   462  		return err
   463  	} else if deviceFile != nil {
   464  		defer deviceFile.Close()
   465  		opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile)
   466  	}
   467  
   468  	conn, err := s.sandboxConnect()
   469  	if err != nil {
   470  		return err
   471  	}
   472  	defer conn.Close()
   473  
   474  	// Configure the network.
   475  	if err := setupNetwork(conn, s.Pid.load(), conf); err != nil {
   476  		return fmt.Errorf("setting up network: %v", err)
   477  	}
   478  
   479  	// Restore the container and start the root container.
   480  	if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil {
   481  		return fmt.Errorf("restoring container %q: %v", cid, err)
   482  	}
   483  
   484  	return nil
   485  }
   486  
   487  // Processes retrieves the list of processes and associated metadata for a
   488  // given container in this sandbox.
   489  func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
   490  	log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID)
   491  	var pl []*control.Process
   492  	if err := s.call(boot.ContMgrProcesses, &cid, &pl); err != nil {
   493  		return nil, fmt.Errorf("retrieving process data from sandbox: %v", err)
   494  	}
   495  	return pl, nil
   496  }
   497  
   498  // CreateTraceSession creates a new trace session.
   499  func (s *Sandbox) CreateTraceSession(config *seccheck.SessionConfig, force bool) error {
   500  	log.Debugf("Creating trace session in sandbox %q", s.ID)
   501  
   502  	sinkFiles, err := seccheck.SetupSinks(config.Sinks)
   503  	if err != nil {
   504  		return err
   505  	}
   506  	defer func() {
   507  		for _, f := range sinkFiles {
   508  			_ = f.Close()
   509  		}
   510  	}()
   511  
   512  	arg := boot.CreateTraceSessionArgs{
   513  		Config: *config,
   514  		Force:  force,
   515  		FilePayload: urpc.FilePayload{
   516  			Files: sinkFiles,
   517  		},
   518  	}
   519  	if err := s.call(boot.ContMgrCreateTraceSession, &arg, nil); err != nil {
   520  		return fmt.Errorf("creating trace session: %w", err)
   521  	}
   522  	return nil
   523  }
   524  
   525  // DeleteTraceSession deletes an existing trace session.
   526  func (s *Sandbox) DeleteTraceSession(name string) error {
   527  	log.Debugf("Deleting trace session %q in sandbox %q", name, s.ID)
   528  	if err := s.call(boot.ContMgrDeleteTraceSession, name, nil); err != nil {
   529  		return fmt.Errorf("deleting trace session: %w", err)
   530  	}
   531  	return nil
   532  }
   533  
   534  // ListTraceSessions lists all trace sessions.
   535  func (s *Sandbox) ListTraceSessions() ([]seccheck.SessionConfig, error) {
   536  	log.Debugf("Listing trace sessions in sandbox %q", s.ID)
   537  	var sessions []seccheck.SessionConfig
   538  	if err := s.call(boot.ContMgrListTraceSessions, nil, &sessions); err != nil {
   539  		return nil, fmt.Errorf("listing trace session: %w", err)
   540  	}
   541  	return sessions, nil
   542  }
   543  
   544  // ProcfsDump collects and returns a procfs dump for the sandbox.
   545  func (s *Sandbox) ProcfsDump() ([]procfs.ProcessProcfsDump, error) {
   546  	log.Debugf("Procfs dump %q", s.ID)
   547  	var procfsDump []procfs.ProcessProcfsDump
   548  	if err := s.call(boot.ContMgrProcfsDump, nil, &procfsDump); err != nil {
   549  		return nil, fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err)
   550  	}
   551  	return procfsDump, nil
   552  }
   553  
   554  // NewCGroup returns the sandbox's Cgroup, or an error if it does not have one.
   555  func (s *Sandbox) NewCGroup() (cgroup.Cgroup, error) {
   556  	return cgroup.NewFromPid(s.Pid.load(), false /* useSystemd */)
   557  }
   558  
   559  // Execute runs the specified command in the container. It returns the PID of
   560  // the newly created process.
   561  func (s *Sandbox) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) {
   562  	log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID)
   563  
   564  	// Stdios are those files which have an FD <= 2 in the process. We do not
   565  	// want the ownership of other files to be changed by configureStdios.
   566  	var stdios []*os.File
   567  	for i, fd := range args.GuestFDs {
   568  		if fd > 2 || i >= len(args.Files) {
   569  			continue
   570  		}
   571  		stdios = append(stdios, args.Files[i])
   572  	}
   573  
   574  	if err := s.configureStdios(conf, stdios); err != nil {
   575  		return 0, err
   576  	}
   577  
   578  	// Send a message to the sandbox control server to start the container.
   579  	var pid int32
   580  	if err := s.call(boot.ContMgrExecuteAsync, args, &pid); err != nil {
   581  		return 0, fmt.Errorf("executing command %q in sandbox: %w", args, err)
   582  	}
   583  	return pid, nil
   584  }
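
// Example invocation (sketch; only the ExecArgs fields shown are assumed and
// error handling is elided):
//
//	pid, err := sb.Execute(conf, &control.ExecArgs{
//		Argv:        []string{"/bin/true"},
//		ContainerID: cid,
//	})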
   585  
   586  // Event retrieves stats about the sandbox such as memory and CPU utilization.
   587  func (s *Sandbox) Event(cid string) (*boot.EventOut, error) {
   588  	log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID)
   589  	var e boot.EventOut
   590  	if err := s.call(boot.ContMgrEvent, &cid, &e); err != nil {
   591  		return nil, fmt.Errorf("retrieving event data from sandbox: %w", err)
   592  	}
   593  	return &e, nil
   594  }
   595  
   596  // PortForward starts port forwarding to the sandbox.
   597  func (s *Sandbox) PortForward(opts *boot.PortForwardOpts) error {
   598  	log.Debugf("Requesting port forward for container %q in sandbox %q: %+v", opts.ContainerID, s.ID, opts)
   599  	conn, err := s.sandboxConnect()
   600  	if err != nil {
   601  		return err
   602  	}
   603  	defer conn.Close()
   604  
   605  	if err := conn.Call(boot.ContMgrPortForward, opts, nil); err != nil {
   606  		return fmt.Errorf("port forwarding to sandbox: %v", err)
   607  	}
   608  
   609  	return nil
   610  }
   611  
   612  func (s *Sandbox) sandboxConnect() (*urpc.Client, error) {
   613  	log.Debugf("Connecting to sandbox %q", s.ID)
   614  	path := s.ControlSocketPath
   615  	if len(path) >= linux.UnixPathMax {
   616  		// This is not an abstract socket path. It is a filesystem path.
   617  		// UDS connect fails when the len(socket path) >= UNIX_PATH_MAX. Instead
   618  		// open the socket using open(2) and use /proc to refer to the open FD.
   619  		sockFD, err := unix.Open(path, unix.O_PATH, 0)
   620  		if err != nil {
   621  			return nil, fmt.Errorf("failed to open socket at %q", path)
   622  		}
   623  		defer unix.Close(sockFD)
   624  		path = filepath.Join("/proc/self/fd", fmt.Sprintf("%d", sockFD))
   625  	}
   626  	conn, err := client.ConnectTo(path)
   627  	if err != nil {
   628  		return nil, s.connError(err)
   629  	}
   630  	return conn, nil
   631  }
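
// Note: Linux limits a UDS path to linux.UnixPathMax (108) bytes, so a deeply
// nested rootDir can push the control socket path over the limit; in that
// case the connection above is made through a "/proc/self/fd/<N>" path
// instead.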
   632  
   633  func (s *Sandbox) call(method string, arg, result any) error {
   634  	conn, err := s.sandboxConnect()
   635  	if err != nil {
   636  		return err
   637  	}
   638  	defer conn.Close()
   639  
   640  	return conn.Call(method, arg, result)
   641  }
   642  
   643  func (s *Sandbox) connError(err error) error {
   644  	return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid.load(), err)
   645  }
   646  
   647  // createSandboxProcess starts the sandbox as a subprocess by running the "boot"
   648  // command, passing in the bundle dir.
   649  func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
   650  	donations := donation.Agency{}
   651  	defer donations.Close()
   652  
   653  	// pgalloc.MemoryFile (which provides application memory) sometimes briefly
   654  	// mlock(2)s ranges of memory in order to fault in a large number of pages at
   655  	// a time. Try to make RLIMIT_MEMLOCK unlimited so that it can do so. runsc
   656  	// expects to run in a memory cgroup that limits its memory usage as
   657  	// required.
   658  	// This needs to be done before exec'ing `runsc boot`, as that subcommand
   659  	// runs as an unprivileged user that will not be able to call `setrlimit`
   660  	// by itself. Calling `setrlimit` here will have the side-effect of setting
   661  	// the limit on the currently-running `runsc` process as well, but that
   662  	// should be OK too.
   663  	var rlim unix.Rlimit
   664  	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
   665  		log.Warningf("Failed to get RLIMIT_MEMLOCK: %v", err)
   666  	} else if rlim.Cur != unix.RLIM_INFINITY || rlim.Max != unix.RLIM_INFINITY {
   667  		rlim.Cur = unix.RLIM_INFINITY
   668  		rlim.Max = unix.RLIM_INFINITY
   669  		if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
   670  			// We may not have CAP_SYS_RESOURCE, so this failure may be expected.
   671  			log.Infof("Failed to set RLIMIT_MEMLOCK: %v", err)
   672  		}
   673  	}
   674  
   675  	//
   676  	// These flags must come BEFORE the "boot" command in cmd.Args.
   677  	//
   678  
   679  	// Open the log files to pass to the sandbox as FDs.
   680  	if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
   681  		return err
   682  	}
   683  
   684  	test := ""
   685  	if len(conf.TestOnlyTestNameEnv) != 0 {
   686  		// Fetch test name if one is provided and the test only flag was set.
   687  		if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
   688  			test = t
   689  		}
   690  	}
   691  	if specutils.IsDebugCommand(conf, "boot") {
   692  		if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "boot", test); err != nil {
   693  			return err
   694  		}
   695  	}
   696  	if err := donations.DonateDebugLogFile("panic-log-fd", conf.PanicLog, "panic", test); err != nil {
   697  		return err
   698  	}
   699  	covFilename := conf.CoverageReport
   700  	if covFilename == "" {
   701  		covFilename = os.Getenv("GO_COVERAGE_FILE")
   702  	}
   703  	if covFilename != "" && coverage.Available() {
   704  		if err := donations.DonateDebugLogFile("coverage-fd", covFilename, "cov", test); err != nil {
   705  			return err
   706  		}
   707  	}
   708  	if err := donations.DonateDebugLogFile("profiling-metrics-fd", conf.ProfilingMetricsLog, "metrics", test); err != nil {
   709  		return err
   710  	}
   711  
   712  	// Relay all the config flags to the sandbox process.
   713  	cmd := exec.Command(specutils.ExePath, conf.ToFlags()...)
   714  	cmd.SysProcAttr = &unix.SysProcAttr{
   715  		// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
   716  		// when re-parented.
   717  		Setsid: true,
   718  	}
   719  
    720  	// Set Args[0] to make it easier to spot the sandbox process. Otherwise it's
   721  	// shown as `exe`.
   722  	cmd.Args[0] = "runsc-sandbox"
   723  
    724  	// Transfer FDs that need to be present before the "boot" command.
   725  	// Start at 3 because 0, 1, and 2 are taken by stdin/out/err.
   726  	nextFD := donations.Transfer(cmd, 3)
   727  
   728  	// Add the "boot" command to the args.
   729  	//
   730  	// All flags after this must be for the boot command
   731  	cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir)
   732  
   733  	// Clear environment variables, unless --TESTONLY-unsafe-nonroot is set.
   734  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   735  		// Setting cmd.Env = nil causes cmd to inherit the current process's env.
   736  		cmd.Env = []string{}
   737  	}
   738  
    739  	// If there is a gofer, send all socket ends to the sandbox.
   740  	donations.DonateAndClose("io-fds", args.IOFiles...)
   741  	donations.DonateAndClose("dev-io-fd", args.DevIOFile)
   742  	donations.DonateAndClose("gofer-filestore-fds", args.GoferFilestoreFiles...)
   743  	donations.DonateAndClose("mounts-fd", args.MountsFile)
   744  	donations.Donate("start-sync-fd", startSyncFile)
   745  	if err := donations.OpenAndDonate("user-log-fd", args.UserLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
   746  		return err
   747  	}
   748  	const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC
   749  	if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock, profFlags); err != nil {
   750  		return err
   751  	}
   752  	if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU, profFlags); err != nil {
   753  		return err
   754  	}
   755  	if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap, profFlags); err != nil {
   756  		return err
   757  	}
   758  	if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex, profFlags); err != nil {
   759  		return err
   760  	}
   761  	if err := donations.OpenAndDonate("trace-fd", conf.TraceFile, profFlags); err != nil {
   762  		return err
   763  	}
   764  
   765  	// Pass gofer mount configs.
   766  	cmd.Args = append(cmd.Args, "--gofer-mount-confs="+args.GoferMountConfs.String())
   767  
   768  	// Create a socket for the control server and donate it to the sandbox.
   769  	controlSocketPath, sockFD, err := createControlSocket(conf.RootDir, s.ID)
   770  	if err != nil {
   771  		return fmt.Errorf("failed to create control socket: %v", err)
   772  	}
   773  	s.ControlSocketPath = controlSocketPath
   774  	log.Infof("Control socket path: %q", s.ControlSocketPath)
   775  	donations.DonateAndClose("controller-fd", os.NewFile(uintptr(sockFD), "control_server_socket"))
   776  
   777  	specFile, err := specutils.OpenSpec(args.BundleDir)
   778  	if err != nil {
   779  		return fmt.Errorf("cannot open spec file in bundle dir %v: %w", args.BundleDir, err)
   780  	}
   781  	donations.DonateAndClose("spec-fd", specFile)
   782  
   783  	if err := donations.OpenAndDonate("pod-init-config-fd", conf.PodInitConfig, os.O_RDONLY); err != nil {
   784  		return err
   785  	}
   786  	donations.DonateAndClose("sink-fds", args.SinkFiles...)
   787  
   788  	gPlatform, err := platform.Lookup(conf.Platform)
   789  	if err != nil {
   790  		return fmt.Errorf("cannot look up platform: %w", err)
   791  	}
   792  	if deviceFile, err := gPlatform.OpenDevice(conf.PlatformDevicePath); err != nil {
   793  		return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err)
   794  	} else if deviceFile != nil {
   795  		donations.DonateAndClose("device-fd", deviceFile)
   796  	}
   797  
   798  	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
   799  	// isn't set.
   800  	if conf.Platform == "kvm" {
   801  		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
   802  	}
   803  
   804  	// nss is the set of namespaces to join or create before starting the sandbox
   805  	// process. Mount, IPC and UTS namespaces from the host are not used as they
   806  	// are virtualized inside the sandbox. Be paranoid and run inside an empty
   807  	// namespace for these. Don't unshare cgroup because sandbox is added to a
   808  	// cgroup in the caller's namespace.
   809  	log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces")
   810  	nss := []specs.LinuxNamespace{
   811  		{Type: specs.IPCNamespace},
   812  		{Type: specs.MountNamespace},
   813  		{Type: specs.UTSNamespace},
   814  	}
   815  
   816  	if gPlatform.Requirements().RequiresCurrentPIDNS {
   817  		// TODO(b/75837838): Also set a new PID namespace so that we limit
   818  		// access to other host processes.
   819  		log.Infof("Sandbox will be started in the current PID namespace")
   820  	} else {
   821  		log.Infof("Sandbox will be started in a new PID namespace")
   822  		nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
   823  		cmd.Args = append(cmd.Args, "--pidns=true")
   824  	}
   825  
   826  	if specutils.NVProxyEnabled(args.Spec, conf) {
   827  		nvidiaDriverVersion, err := nvproxy.HostDriverVersion()
   828  		if err != nil {
   829  			return fmt.Errorf("failed to get Nvidia driver version: %w", err)
   830  		}
   831  		cmd.Args = append(cmd.Args, "--nvidia-driver-version="+nvidiaDriverVersion)
   832  	}
   833  
    834  	// Join the network namespace if networking is enabled; the sandbox talks
    835  	// directly to the host network, which may have been configured in that
    836  	// namespace.
   837  	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
   838  		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
   839  		nss = append(nss, ns)
   840  	} else if conf.Network == config.NetworkHost {
   841  		log.Infof("Sandbox will be started in the host network namespace")
   842  	} else {
   843  		log.Infof("Sandbox will be started in new network namespace")
   844  		nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
   845  	}
   846  
   847  	// These are set to the uid/gid that the sandbox process will use. May be
    848  	// overridden below.
   849  	s.UID = os.Getuid()
   850  	s.GID = os.Getgid()
   851  
   852  	// User namespace depends on the network type or whether access to the host
    853  	// filesystem is required. These features require running inside the user
    854  	// namespace specified in the spec, or the current namespace if none is
    855  	// configured.
   856  	rootlessEUID := unix.Geteuid() != 0
   857  	setUserMappings := false
   858  	if conf.Network == config.NetworkHost || conf.DirectFS {
   859  		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
   860  			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
   861  			nss = append(nss, userns)
   862  			if rootlessEUID {
   863  				syncFile, err := ConfigureCmdForRootless(cmd, &donations)
   864  				if err != nil {
   865  					return err
   866  				}
   867  				defer syncFile.Close()
   868  				setUserMappings = true
   869  			} else {
   870  				specutils.SetUIDGIDMappings(cmd, args.Spec)
   871  				// We need to set UID and GID to have capabilities in a new user namespace.
   872  				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
   873  			}
   874  		} else {
   875  			if rootlessEUID {
   876  				return fmt.Errorf("unable to run a rootless container without userns")
   877  			}
   878  			log.Infof("Sandbox will be started in the current user namespace")
   879  		}
   880  		// When running in the caller's defined user namespace, apply the same
    881  		// capabilities to the sandbox process to ensure it abides by the same
    882  		// rules.
   883  		cmd.Args = append(cmd.Args, "--apply-caps=true")
   884  
   885  		// If we have CAP_SYS_ADMIN, we can create an empty chroot and
   886  		// bind-mount the executable inside it.
   887  		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   888  			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
   889  		} else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) || rootlessEUID {
   890  			log.Infof("Sandbox will be started in minimal chroot")
   891  			cmd.Args = append(cmd.Args, "--setup-root")
   892  		} else {
   893  			return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN")
   894  		}
   895  	} else {
   896  		// If we have CAP_SETUID and CAP_SETGID, then we can also run
   897  		// as user nobody.
   898  		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   899  			log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
   900  			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
   901  		} else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
   902  			log.Infof("Sandbox will be started in new user namespace")
   903  			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
   904  			cmd.Args = append(cmd.Args, "--setup-root")
   905  
   906  			const nobody = 65534
   907  			if rootlessEUID || conf.Rootless {
   908  				log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
   909  			} else {
   910  				// Map nobody in the new namespace to nobody in the parent namespace.
   911  				s.UID = nobody
   912  				s.GID = nobody
   913  			}
   914  
   915  			// Set credentials to run as user and group nobody.
   916  			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody}
   917  			cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
   918  				{
   919  					ContainerID: nobody,
   920  					HostID:      s.UID,
   921  					Size:        1,
   922  				},
   923  			}
   924  			cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
   925  				{
   926  					ContainerID: nobody,
   927  					HostID:      s.GID,
   928  					Size:        1,
   929  				},
   930  			}
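
			// With these one-entry mappings, uid/gid 65534 (nobody) inside the
			// new user namespace corresponds to s.UID/s.GID in the parent
			// namespace: the current user when running rootless, or host
			// nobody otherwise (per the assignments above).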
   931  
   932  			// A sandbox process will construct an empty root for itself, so it has
   933  			// to have CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities.
   934  			cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps,
   935  				uintptr(capability.CAP_SYS_ADMIN),
   936  				uintptr(capability.CAP_SYS_CHROOT),
   937  				// CAP_SETPCAP is required to clear the bounding set.
   938  				uintptr(capability.CAP_SETPCAP),
   939  			)
   940  
   941  		} else {
   942  			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
   943  		}
   944  	}
   945  
   946  	// The current process' stdio must be passed to the application via the
   947  	// --stdio-fds flag. The stdio of the sandbox process itself must not
   948  	// be connected to the same FDs, otherwise we risk leaking sandbox
   949  	// errors to the application, so we set the sandbox stdio to nil,
   950  	// causing them to read/write from the null device.
   951  	cmd.Stdin = nil
   952  	cmd.Stdout = nil
   953  	cmd.Stderr = nil
   954  	var stdios [3]*os.File
   955  
   956  	// If the console control socket file is provided, then create a new
   957  	// pty master/replica pair and set the TTY on the sandbox process.
   958  	if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
   959  		// console.NewWithSocket will send the master on the given
   960  		// socket, and return the replica.
   961  		tty, err := console.NewWithSocket(args.ConsoleSocket)
   962  		if err != nil {
   963  			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
   964  		}
   965  		defer tty.Close()
   966  
   967  		// Set the TTY as a controlling TTY on the sandbox process.
   968  		cmd.SysProcAttr.Setctty = true
   969  
   970  		// Inconveniently, the Ctty must be the FD in the *child* process's FD
   971  		// table. So transfer all files we have so far and make sure the next file
   972  		// added to donations is stdin.
   973  		//
   974  		// See https://github.com/golang/go/issues/29458.
   975  		nextFD = donations.Transfer(cmd, nextFD)
   976  		cmd.SysProcAttr.Ctty = nextFD
   977  
   978  		// Pass the tty as all stdio fds to sandbox.
   979  		stdios[0] = tty
   980  		stdios[1] = tty
   981  		stdios[2] = tty
   982  
   983  		if conf.Debug {
   984  			// If debugging, send the boot process stdio to the
   985  			// TTY, so that it is easier to find.
   986  			cmd.Stdin = tty
   987  			cmd.Stdout = tty
   988  			cmd.Stderr = tty
   989  		}
   990  	} else {
   991  		// If not using a console, pass our current stdio as the
   992  		// container stdio via flags.
   993  		stdios[0] = os.Stdin
   994  		stdios[1] = os.Stdout
   995  		stdios[2] = os.Stderr
   996  
   997  		if conf.Debug {
    998  			// If debugging, send the boot process stdio to this
    999  			// process' stdio, so that it is easier to find.
  1000  			cmd.Stdin = os.Stdin
  1001  			cmd.Stdout = os.Stdout
  1002  			cmd.Stderr = os.Stderr
  1003  		}
  1004  	}
  1005  	if err := s.configureStdios(conf, stdios[:]); err != nil {
  1006  		return fmt.Errorf("configuring stdios: %w", err)
  1007  	}
  1008  	// Note: this must be done right after "cmd.SysProcAttr.Ctty" is set above
  1009  	// because it relies on stdin being the next FD donated.
  1010  	donations.Donate("stdio-fds", stdios[:]...)
  1011  
  1012  	totalSysMem, err := totalSystemMemory()
  1013  	if err != nil {
  1014  		return err
  1015  	}
  1016  	cmd.Args = append(cmd.Args, "--total-host-memory", strconv.FormatUint(totalSysMem, 10))
  1017  
  1018  	mem := totalSysMem
  1019  	if s.CgroupJSON.Cgroup != nil {
  1020  		cpuNum, err := s.CgroupJSON.Cgroup.NumCPU()
  1021  		if err != nil {
  1022  			return fmt.Errorf("getting cpu count from cgroups: %v", err)
  1023  		}
  1024  		if conf.CPUNumFromQuota {
   1025  			// Dropping below 2 CPUs can trigger applications to disable
   1026  			// locks, which can lead to hard-to-debug errors, so just
   1027  			// leave two cores as a reasonable default.
  1028  			const minCPUs = 2
  1029  
  1030  			quota, err := s.CgroupJSON.Cgroup.CPUQuota()
  1031  			if err != nil {
  1032  				return fmt.Errorf("getting cpu quota from cgroups: %v", err)
  1033  			}
  1034  			if n := int(math.Ceil(quota)); n > 0 {
  1035  				if n < minCPUs {
  1036  					n = minCPUs
  1037  				}
  1038  				if n < cpuNum {
  1039  					// Only lower the cpu number.
  1040  					cpuNum = n
  1041  				}
  1042  			}
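			// Worked example (numbers are illustrative): with a CPU quota of
			// 2.5 and 8 cgroup CPUs, n = ceil(2.5) = 3, so cpuNum is lowered
			// from 8 to 3; a quota of 0.5 would be raised to the minCPUs
			// floor of 2.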
  1043  		}
  1044  		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
  1045  
  1046  		memLimit, err := s.CgroupJSON.Cgroup.MemoryLimit()
  1047  		if err != nil {
  1048  			return fmt.Errorf("getting memory limit from cgroups: %v", err)
  1049  		}
  1050  		if memLimit < mem {
  1051  			mem = memLimit
  1052  		}
  1053  	}
  1054  	cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10))
  1055  
  1056  	if args.Attached {
  1057  		// Kill sandbox if parent process exits in attached mode.
  1058  		cmd.SysProcAttr.Pdeathsig = unix.SIGKILL
  1059  		// Tells boot that any process it creates must have pdeathsig set.
  1060  		cmd.Args = append(cmd.Args, "--attached")
  1061  	}
  1062  
  1063  	if args.ExecFile != nil {
  1064  		donations.Donate("exec-fd", args.ExecFile)
  1065  	}
  1066  
  1067  	nextFD = donations.Transfer(cmd, nextFD)
  1068  
  1069  	_ = donation.DonateAndTransferCustomFiles(cmd, nextFD, args.PassFiles)
  1070  
  1071  	// Add container ID as the last argument.
  1072  	cmd.Args = append(cmd.Args, s.ID)
  1073  
  1074  	donation.LogDonations(cmd)
  1075  	log.Debugf("Starting sandbox: %s %v", cmd.Path, cmd.Args)
  1076  	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
  1077  	if err := specutils.StartInNS(cmd, nss); err != nil {
  1078  		err := fmt.Errorf("starting sandbox: %v", err)
  1079  		// If the sandbox failed to start, it may be because the binary
  1080  		// permissions were incorrect. Check the bits and return a more helpful
  1081  		// error message.
  1082  		//
  1083  		// NOTE: The error message is checked because error types are lost over
  1084  		// rpc calls.
  1085  		if strings.Contains(err.Error(), unix.EACCES.Error()) {
  1086  			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
  1087  				return fmt.Errorf("%v: %v", err, permsErr)
  1088  			}
  1089  		}
  1090  		return err
  1091  	}
  1092  	s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid)
  1093  	if err != nil {
  1094  		return err
  1095  	}
  1096  	if setUserMappings {
  1097  		if err := SetUserMappings(args.Spec, cmd.Process.Pid); err != nil {
  1098  			return err
  1099  		}
  1100  	}
  1101  
  1102  	s.child = true
  1103  	s.Pid.store(cmd.Process.Pid)
  1104  	log.Infof("Sandbox started, PID: %d", cmd.Process.Pid)
  1105  
  1106  	return nil
  1107  }
  1108  
  1109  // Wait waits for the containerized process to exit, and returns its WaitStatus.
  1110  func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) {
  1111  	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
  1112  
  1113  	if conn, err := s.sandboxConnect(); err != nil {
   1114  		// The sandbox may have exited before we had a chance to wait on it.
  1115  		// There is nothing we can do for subcontainers. For the init container, we
  1116  		// can try to get the sandbox exit code.
  1117  		if !s.IsRootContainer(cid) {
  1118  			return unix.WaitStatus(0), err
  1119  		}
  1120  		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
  1121  	} else {
  1122  		defer conn.Close()
  1123  
  1124  		// Try the Wait RPC to the sandbox.
  1125  		var ws unix.WaitStatus
  1126  		err = conn.Call(boot.ContMgrWait, &cid, &ws)
  1127  		conn.Close()
  1128  		if err == nil {
  1129  			if s.IsRootContainer(cid) {
  1130  				if err := s.waitForStopped(); err != nil {
  1131  					return unix.WaitStatus(0), err
  1132  				}
  1133  			}
  1134  			// It worked!
  1135  			return ws, nil
  1136  		}
  1137  		// See comment above.
  1138  		if !s.IsRootContainer(cid) {
  1139  			return unix.WaitStatus(0), err
  1140  		}
  1141  
  1142  		// The sandbox may have exited after we connected, but before
  1143  		// or during the Wait RPC.
  1144  		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
  1145  	}
  1146  
  1147  	// The sandbox may have already exited, or exited while handling the Wait RPC.
  1148  	// The best we can do is ask Linux what the sandbox exit status was, since in
  1149  	// most cases that will be the same as the container exit status.
  1150  	if err := s.waitForStopped(); err != nil {
  1151  		return unix.WaitStatus(0), err
  1152  	}
  1153  	if !s.child {
  1154  		return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
  1155  	}
  1156  
  1157  	s.statusMu.Lock()
  1158  	defer s.statusMu.Unlock()
  1159  	return s.status, nil
  1160  }
  1161  
  1162  // WaitPID waits for process 'pid' in the container's sandbox and returns its
  1163  // WaitStatus.
  1164  func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) {
  1165  	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
  1166  	var ws unix.WaitStatus
  1167  	args := &boot.WaitPIDArgs{
  1168  		PID: pid,
  1169  		CID: cid,
  1170  	}
  1171  	if err := s.call(boot.ContMgrWaitPID, args, &ws); err != nil {
  1172  		return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %w", pid, s.ID, err)
  1173  	}
  1174  	return ws, nil
  1175  }
  1176  
  1177  // IsRootContainer returns true if the specified container ID belongs to the
  1178  // root container.
  1179  func (s *Sandbox) IsRootContainer(cid string) bool {
  1180  	return s.ID == cid
  1181  }
  1182  
   1183  // destroy frees all resources associated with the sandbox. It fails fast and
  1184  // is idempotent.
  1185  func (s *Sandbox) destroy() error {
  1186  	log.Debugf("Destroying sandbox %q", s.ID)
   1187  	// Only delete the control socket file if one was created.
  1188  	if len(s.ControlSocketPath) > 0 {
  1189  		if err := os.Remove(s.ControlSocketPath); err != nil {
  1190  			log.Warningf("failed to delete control socket file %q: %v", s.ControlSocketPath, err)
  1191  		}
  1192  	}
  1193  	pid := s.Pid.load()
  1194  	if pid != 0 {
  1195  		log.Debugf("Killing sandbox %q", s.ID)
  1196  		if err := unix.Kill(pid, unix.SIGKILL); err != nil && err != unix.ESRCH {
   1197  			return fmt.Errorf("killing sandbox %q PID %d: %w", s.ID, pid, err)
  1198  		}
  1199  		if err := s.waitForStopped(); err != nil {
  1200  			return fmt.Errorf("waiting sandbox %q stop: %w", s.ID, err)
  1201  		}
  1202  	}
  1203  
  1204  	return nil
  1205  }
  1206  
  1207  // SignalContainer sends the signal to a container in the sandbox. If all is
  1208  // true and signal is SIGKILL, then waits for all processes to exit before
  1209  // returning.
  1210  func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error {
  1211  	log.Debugf("Signal sandbox %q", s.ID)
  1212  	mode := boot.DeliverToProcess
  1213  	if all {
  1214  		mode = boot.DeliverToAllProcesses
  1215  	}
  1216  
  1217  	args := boot.SignalArgs{
  1218  		CID:   cid,
  1219  		Signo: int32(sig),
  1220  		Mode:  mode,
  1221  	}
  1222  	if err := s.call(boot.ContMgrSignal, &args, nil); err != nil {
  1223  		return fmt.Errorf("signaling container %q: %w", cid, err)
  1224  	}
  1225  	return nil
  1226  }
  1227  
  1228  // SignalProcess sends the signal to a particular process in the container. If
  1229  // fgProcess is true, then the signal is sent to the foreground process group
  1230  // in the same session that PID belongs to. This is only valid if the process
  1231  // is attached to a host TTY.
  1232  func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error {
  1233  	log.Debugf("Signal sandbox %q", s.ID)
  1234  
  1235  	mode := boot.DeliverToProcess
  1236  	if fgProcess {
  1237  		mode = boot.DeliverToForegroundProcessGroup
  1238  	}
  1239  
  1240  	args := boot.SignalArgs{
  1241  		CID:   cid,
  1242  		Signo: int32(sig),
  1243  		PID:   pid,
  1244  		Mode:  mode,
  1245  	}
  1246  	if err := s.call(boot.ContMgrSignal, &args, nil); err != nil {
  1247  		return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err)
  1248  	}
  1249  	return nil
  1250  }
  1251  
  1252  // Checkpoint sends the checkpoint call for a container in the sandbox.
  1253  // The statefile will be written to f.
  1254  func (s *Sandbox) Checkpoint(cid string, f *os.File, options statefile.Options) error {
  1255  	log.Debugf("Checkpoint sandbox %q, options %+v", s.ID, options)
  1256  	opt := control.SaveOpts{
  1257  		Metadata: options.WriteToMetadata(map[string]string{}),
  1258  		FilePayload: urpc.FilePayload{
  1259  			Files: []*os.File{f},
  1260  		},
  1261  	}
  1262  
  1263  	if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil {
  1264  		return fmt.Errorf("checkpointing container %q: %w", cid, err)
  1265  	}
  1266  	return nil
  1267  }
  1268  
  1269  // Pause sends the pause call for a container in the sandbox.
  1270  func (s *Sandbox) Pause(cid string) error {
  1271  	log.Debugf("Pause sandbox %q", s.ID)
  1272  	if err := s.call(boot.LifecyclePause, nil, nil); err != nil {
  1273  		return fmt.Errorf("pausing container %q: %w", cid, err)
  1274  	}
  1275  	return nil
  1276  }
  1277  
  1278  // Resume sends the resume call for a container in the sandbox.
  1279  func (s *Sandbox) Resume(cid string) error {
  1280  	log.Debugf("Resume sandbox %q", s.ID)
  1281  	if err := s.call(boot.LifecycleResume, nil, nil); err != nil {
  1282  		return fmt.Errorf("resuming container %q: %w", cid, err)
  1283  	}
  1284  	return nil
  1285  }
  1286  
  1287  // Usage sends the collect call for a container in the sandbox.
  1288  func (s *Sandbox) Usage(Full bool) (control.MemoryUsage, error) {
  1289  	log.Debugf("Usage sandbox %q", s.ID)
  1290  	opts := control.MemoryUsageOpts{Full: Full}
  1291  	var m control.MemoryUsage
  1292  	if err := s.call(boot.UsageCollect, &opts, &m); err != nil {
  1293  		return control.MemoryUsage{}, fmt.Errorf("collecting usage: %w", err)
  1294  	}
  1295  	return m, nil
  1296  }
  1297  
  1298  // UsageFD sends the usagefd call for a container in the sandbox.
  1299  func (s *Sandbox) UsageFD() (*control.MemoryUsageRecord, error) {
  1300  	log.Debugf("Usage sandbox %q", s.ID)
  1301  	opts := control.MemoryUsageFileOpts{Version: 1}
  1302  	var m control.MemoryUsageFile
  1303  	if err := s.call(boot.UsageUsageFD, &opts, &m); err != nil {
  1304  		return nil, fmt.Errorf("collecting usage FD: %w", err)
  1305  	}
  1306  
  1307  	if len(m.FilePayload.Files) != 2 {
  1308  		return nil, fmt.Errorf("wants exactly two fds")
  1309  	}
  1310  	return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1])
  1311  }
  1312  
  1313  // GetRegisteredMetrics returns metric registration data from the sandbox.
  1314  // This data is meant to be used as a way to sanity-check any exported metrics data during the
   1315  // lifetime of the sandbox, in order to prevent a compromised sandbox from producing
   1316  // bogus metrics.
  1317  // This returns an error if the sandbox has not requested instrumentation during creation time.
  1318  func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) {
  1319  	if s.RegisteredMetrics == nil {
  1320  		return nil, errors.New("sandbox did not request instrumentation when it was created")
  1321  	}
  1322  	return s.RegisteredMetrics, nil
  1323  }
  1324  
  1325  // ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format.
  1326  func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) {
  1327  	log.Debugf("Metrics export sandbox %q", s.ID)
  1328  	var data control.MetricsExportData
  1329  	if err := s.call(boot.MetricsExport, &opts, &data); err != nil {
  1330  		return nil, err
  1331  	}
  1332  	// Since we do not trust the output of the sandbox as-is, double-check that the options were
  1333  	// respected.
  1334  	if err := opts.Verify(&data); err != nil {
  1335  		return nil, err
  1336  	}
  1337  	return data.Snapshot, nil
  1338  }
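
// Illustrative sketch (not part of the original file): a monitoring caller
// might pair the two calls above, fetching the registration once and then
// exporting snapshots; the zero-value MetricsExportOpts is an assumption for
// illustration.
//
//	reg, err := s.GetRegisteredMetrics()
//	if err != nil {
//		return err
//	}
//	_ = reg // e.g. compare against a registration saved at sandbox creation
//	snapshot, err := s.ExportMetrics(control.MetricsExportOpts{})
//	if err != nil {
//		return err
//	}
//	_ = snapshot // e.g. write out in Prometheus text format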
  1339  
  1340  // IsRunning returns true if the sandbox or gofer process is running.
  1341  func (s *Sandbox) IsRunning() bool {
  1342  	pid := s.Pid.load()
  1343  	if pid != 0 {
  1344  		// Send a signal 0 to the sandbox process.
  1345  		if err := unix.Kill(pid, 0); err == nil {
  1346  			// Succeeded, process is running.
  1347  			return true
  1348  		}
  1349  	}
  1350  	return false
  1351  }
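
// Note (editorial, not part of the original file): kill(2) with signal 0
// delivers no signal; it only performs the existence and permission checks, so
// IsRunning reports true as long as the sandbox process exists, including when
// it has become a zombie. A hedged usage sketch:
//
//	if !s.IsRunning() {
//		log.Warningf("sandbox %q is no longer running", s.ID)
//	}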
  1352  
  1353  // Stacks collects and returns all stacks for the sandbox.
  1354  func (s *Sandbox) Stacks() (string, error) {
  1355  	log.Debugf("Stacks sandbox %q", s.ID)
  1356  	var stacks string
  1357  	if err := s.call(boot.DebugStacks, nil, &stacks); err != nil {
  1358  		return "", fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err)
  1359  	}
  1360  	return stacks, nil
  1361  }
  1362  
  1363  // HeapProfile writes a heap profile to the given file.
  1364  func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error {
  1365  	log.Debugf("Heap profile %q", s.ID)
  1366  	opts := control.HeapProfileOpts{
  1367  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1368  		Delay:       delay,
  1369  	}
  1370  	return s.call(boot.ProfileHeap, &opts, nil)
  1371  }
  1372  
  1373  // CPUProfile collects a CPU profile.
  1374  func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error {
  1375  	log.Debugf("CPU profile %q", s.ID)
  1376  	opts := control.CPUProfileOpts{
  1377  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1378  		Duration:    duration,
  1379  	}
  1380  	return s.call(boot.ProfileCPU, &opts, nil)
  1381  }
  1382  
  1383  // BlockProfile writes a block profile to the given file.
  1384  func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error {
  1385  	log.Debugf("Block profile %q", s.ID)
  1386  	opts := control.BlockProfileOpts{
  1387  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1388  		Duration:    duration,
  1389  	}
  1390  	return s.call(boot.ProfileBlock, &opts, nil)
  1391  }
  1392  
  1393  // MutexProfile writes a mutex profile to the given file.
  1394  func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error {
  1395  	log.Debugf("Mutex profile %q", s.ID)
  1396  	opts := control.MutexProfileOpts{
  1397  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1398  		Duration:    duration,
  1399  	}
  1400  	return s.call(boot.ProfileMutex, &opts, nil)
  1401  }
  1402  
  1403  // Trace collects an execution trace.
  1404  func (s *Sandbox) Trace(f *os.File, duration time.Duration) error {
  1405  	log.Debugf("Trace %q", s.ID)
  1406  	opts := control.TraceProfileOpts{
  1407  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1408  		Duration:    duration,
  1409  	}
  1410  	return s.call(boot.ProfileTrace, &opts, nil)
  1411  }
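
// Illustrative sketch (not part of the original file): the profiling helpers
// above all follow the same shape; for example, collecting a 10-second CPU
// profile into a hypothetical output file:
//
//	f, err := os.Create("/tmp/cpu.pprof")
//	if err != nil {
//		return err
//	}
//	defer f.Close()
//	if err := s.CPUProfile(f, 10*time.Second); err != nil {
//		return err
//	}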
  1412  
  1413  // ChangeLogging changes logging options.
  1414  func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
  1415  	log.Debugf("Change logging start %q", s.ID)
  1416  	if err := s.call(boot.LoggingChange, &args, nil); err != nil {
  1417  		return fmt.Errorf("changing sandbox %q logging: %w", s.ID, err)
  1418  	}
  1419  	return nil
  1420  }
  1421  
  1422  // DestroyContainer destroys the given container. If it is the root container,
  1423  // then the entire sandbox is destroyed.
  1424  func (s *Sandbox) DestroyContainer(cid string) error {
  1425  	if err := s.destroyContainer(cid); err != nil {
  1426  		// If the sandbox isn't running, the container has already been destroyed;
  1427  		// ignore the error in this case.
  1428  		if s.IsRunning() {
  1429  			return err
  1430  		}
  1431  	}
  1432  	return nil
  1433  }
  1434  
  1435  func (s *Sandbox) destroyContainer(cid string) error {
  1436  	if s.IsRootContainer(cid) {
  1437  		log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid)
  1438  		return s.destroy()
  1439  	}
  1440  
  1441  	log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID)
  1442  	if err := s.call(boot.ContMgrDestroySubcontainer, &cid, nil); err != nil {
  1443  		return fmt.Errorf("destroying container %q: %w", cid, err)
  1444  	}
  1445  	return nil
  1446  }
  1447  
  1448  func (s *Sandbox) waitForStopped() error {
  1449  	if s.child {
  1450  		s.statusMu.Lock()
  1451  		defer s.statusMu.Unlock()
  1452  		pid := s.Pid.load()
  1453  		if pid == 0 {
  1454  			return nil
  1455  		}
  1456  		// The sandbox process is a child of the current process,
  1457  		// so we can wait on it to terminate and collect its zombie.
  1458  		if _, err := unix.Wait4(int(pid), &s.status, 0, nil); err != nil {
  1459  			return fmt.Errorf("error waiting on the sandbox process: %v", err)
  1460  		}
  1461  		s.Pid.store(0)
  1462  		return nil
  1463  	}
  1464  
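	// The sandbox process is not a child of the current process, so it cannot
	// be reaped with wait(2). Instead, poll it (via IsRunning) every 100ms for
	// up to 5 seconds.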
  1465  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  1466  	defer cancel()
  1467  	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
  1468  	op := func() error {
  1469  		if s.IsRunning() {
  1470  			return fmt.Errorf("sandbox is still running")
  1471  		}
  1472  		return nil
  1473  	}
  1474  	return backoff.Retry(op, b)
  1475  }
  1476  
  1477  // configureStdios changes the ownership of the stdio files to give the sandbox
  1478  // process access to them. This may be skipped depending on the configuration.
  1479  func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error {
  1480  	if conf.Rootless || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
  1481  		// Cannot change ownership without CAP_CHOWN.
  1482  		return nil
  1483  	}
  1484  
  1485  	if s.UID < 0 || s.GID < 0 {
  1486  		panic(fmt.Sprintf("sandbox UID/GID is not set: %d/%d", s.UID, s.GID))
  1487  	}
  1488  	for _, file := range stdios {
  1489  		log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID)
  1490  		if err := file.Chown(s.UID, s.GID); err != nil {
  1491  			if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
  1492  				log.Warningf("can't change ownership of %s: %s", file.Name(), err)
  1493  				continue
  1494  			}
  1495  			return err
  1496  		}
  1497  	}
  1498  	return nil
  1499  }
  1500  
  1501  // deviceFileForPlatform opens the device file for the given platform. If the
  1502  // platform does not need a device file, then nil is returned.
  1503  // devicePath may be empty to use a sane platform-specific default.
  1504  func deviceFileForPlatform(name, devicePath string) (*os.File, error) {
  1505  	p, err := platform.Lookup(name)
  1506  	if err != nil {
  1507  		return nil, err
  1508  	}
  1509  
  1510  	f, err := p.OpenDevice(devicePath)
  1511  	if err != nil {
  1512  		return nil, fmt.Errorf("opening device file for platform %q: %w", name, err)
  1513  	}
  1514  	return f, nil
  1515  }
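
// Illustrative sketch (not part of the original file): opening the device for
// the "kvm" platform while letting the platform choose its default device
// path (per the comment above, an empty devicePath selects the default):
//
//	devFile, err := deviceFileForPlatform("kvm", "")
//	if err != nil {
//		return err
//	}
//	defer devFile.Close()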
  1516  
  1517  // checkBinaryPermissions verifies that the required binary bits are set on
  1518  // the runsc executable.
  1519  func checkBinaryPermissions(conf *config.Config) error {
  1520  	// All platforms need the other exe bit
  1521  	neededBits := os.FileMode(0001)
  1522  	if conf.Platform == "ptrace" {
  1523  		// Ptrace needs the other read bit
  1524  		neededBits |= os.FileMode(0004)
  1525  	}
  1526  
  1527  	exePath, err := os.Executable()
  1528  	if err != nil {
  1529  		return fmt.Errorf("getting exe path: %v", err)
  1530  	}
  1531  
  1532  	// Check the permissions of the runsc binary and return an error if they
  1533  	// don't match expectations.
  1534  	info, err := os.Stat(exePath)
  1535  	if err != nil {
  1536  		return fmt.Errorf("stat file: %v", err)
  1537  	}
  1538  
  1539  	if info.Mode().Perm()&neededBits != neededBits {
  1540  		return errors.New(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
  1541  	}
  1542  	return nil
  1543  }
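
// Editorial note (not part of the original file): with the ptrace platform the
// required bits are 0005 (other read+execute), so a runsc binary installed
// with mode 0755 passes this check while mode 0750 fails it. A minimal sketch
// of the same bitmask test, with a hypothetical path:
//
//	info, err := os.Stat("/usr/local/bin/runsc")
//	if err != nil {
//		return err
//	}
//	need := os.FileMode(0005)
//	ok := info.Mode().Perm()&need == need
//	_ = ok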
  1544  
  1545  // CgroupsReadControlFile reads a single cgroupfs control file in the sandbox.
  1546  func (s *Sandbox) CgroupsReadControlFile(file control.CgroupControlFile) (string, error) {
  1547  	log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID)
  1548  	args := control.CgroupsReadArgs{
  1549  		Args: []control.CgroupsReadArg{
  1550  			{
  1551  				File: file,
  1552  			},
  1553  		},
  1554  	}
  1555  	var out control.CgroupsResults
  1556  	if err := s.call(boot.CgroupsReadControlFiles, &args, &out); err != nil {
  1557  		return "", err
  1558  	}
  1559  	if len(out.Results) != 1 {
  1560  		return "", fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
  1561  	}
  1562  	return out.Results[0].Unpack()
  1563  }
  1564  
  1565  // CgroupsWriteControlFile writes a single cgroupfs control file in the sandbox.
  1566  func (s *Sandbox) CgroupsWriteControlFile(file control.CgroupControlFile, value string) error {
  1567  	log.Debugf("CgroupsWriteControlFiles sandbox %q", s.ID)
  1568  	args := control.CgroupsWriteArgs{
  1569  		Args: []control.CgroupsWriteArg{
  1570  			{
  1571  				File:  file,
  1572  				Value: value,
  1573  			},
  1574  		},
  1575  	}
  1576  	var out control.CgroupsResults
  1577  	if err := s.call(boot.CgroupsWriteControlFiles, &args, &out); err != nil {
  1578  		return err
  1579  	}
  1580  	if len(out.Results) != 1 {
  1581  		return fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
  1582  	}
  1583  	return out.Results[0].AsError()
  1584  }
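
// Illustrative sketch (not part of the original file): reading and then
// writing a single control file through the helpers above. The fields of
// control.CgroupControlFile and the value "max" are left unspecified and
// hypothetical here.
//
//	var file control.CgroupControlFile // populate with the target control file
//	val, err := s.CgroupsReadControlFile(file)
//	if err != nil {
//		return err
//	}
//	log.Infof("current value: %s", val)
//	if err := s.CgroupsWriteControlFile(file, "max"); err != nil {
//		return err
//	}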
  1585  
  1586  // fixPidns looks at the PID namespace path. If that path corresponds to the
  1587  // sandbox process PID namespace, it changes the spec so that the container
  1588  // joins the sandbox root PID namespace.
  1589  func (s *Sandbox) fixPidns(spec *specs.Spec) {
  1590  	pidns, ok := specutils.GetNS(specs.PIDNamespace, spec)
  1591  	if !ok {
  1592  		// pidns was not set, nothing to fix.
  1593  		return
  1594  	}
  1595  	if pidns.Path != fmt.Sprintf("/proc/%d/ns/pid", s.Pid.load()) {
  1596  		// Fix only if the PID namespace corresponds to the sandbox's.
  1597  		return
  1598  	}
  1599  
  1600  	for i := range spec.Linux.Namespaces {
  1601  		if spec.Linux.Namespaces[i].Type == specs.PIDNamespace {
  1602  			// Removing the namespace makes the container join the sandbox root
  1603  			// namespace.
  1604  			log.Infof("Fixing PID namespace in spec from %q to make the container join the sandbox root namespace", pidns.Path)
  1605  			spec.Linux.Namespaces = append(spec.Linux.Namespaces[:i], spec.Linux.Namespaces[i+1:]...)
  1606  			return
  1607  		}
  1608  	}
  1609  	panic("unreachable")
  1610  }
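
// Editorial note (not part of the original file): the case handled above is a
// spec whose PID namespace entry points back at the sandbox process itself,
// for example (PID 1234 being hypothetical):
//
//	specs.LinuxNamespace{
//		Type: specs.PIDNamespace,
//		Path: "/proc/1234/ns/pid",
//	}
//
// Such an entry is removed, which (per the comment above) makes the container
// join the sandbox root PID namespace.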
  1611  
  1612  // ConfigureCmdForRootless configures cmd to donate a socket FD that can be
  1613  // used to synchronize userns configuration.
  1614  func ConfigureCmdForRootless(cmd *exec.Cmd, donations *donation.Agency) (*os.File, error) {
  1615  	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
  1616  	if err != nil {
  1617  		return nil, err
  1618  	}
  1619  	f := os.NewFile(uintptr(fds[1]), "userns sync other FD")
  1620  	donations.DonateAndClose("sync-userns-fd", f)
  1621  	if cmd.SysProcAttr == nil {
  1622  		cmd.SysProcAttr = &unix.SysProcAttr{}
  1623  	}
  1624  	cmd.SysProcAttr.AmbientCaps = []uintptr{
  1625  		// Same as `cap` in cmd/gofer.go.
  1626  		unix.CAP_CHOWN,
  1627  		unix.CAP_DAC_OVERRIDE,
  1628  		unix.CAP_DAC_READ_SEARCH,
  1629  		unix.CAP_FOWNER,
  1630  		unix.CAP_FSETID,
  1631  		unix.CAP_SYS_CHROOT,
  1632  		// Needed for setuid(2)/setgid(2).
  1633  		unix.CAP_SETUID,
  1634  		unix.CAP_SETGID,
  1635  		// Needed for chroot.
  1636  		unix.CAP_SYS_ADMIN,
  1637  		// Needed to be able to clear bounding set (PR_CAPBSET_DROP).
  1638  		unix.CAP_SETPCAP,
  1639  	}
  1640  	return os.NewFile(uintptr(fds[0]), "userns sync FD"), nil
  1641  }
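
// Illustrative sketch (not part of the original file): a rootless launch path
// might use this roughly as follows, assuming cmd and donations are the
// sandbox command and its donation.Agency; the ordering around SetUserMappings
// is an assumption for illustration.
//
//	syncFD, err := ConfigureCmdForRootless(cmd, donations)
//	if err != nil {
//		return err
//	}
//	defer syncFD.Close()
//	// After cmd has been started, SetUserMappings(spec, cmd.Process.Pid) can
//	// set up the ID mappings, and syncFD can be used to tell the child that
//	// the user namespace is ready.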
  1642  
  1643  // SetUserMappings uses newuidmap/newgidmap programs to set up user ID mappings
  1644  // for process pid.
  1645  func SetUserMappings(spec *specs.Spec, pid int) error {
  1646  	log.Debugf("Setting user mappings")
  1647  	args := []string{strconv.Itoa(pid)}
  1648  	for _, idMap := range spec.Linux.UIDMappings {
  1649  		log.Infof("Mapping host uid %d to container uid %d (size=%d)",
  1650  			idMap.HostID, idMap.ContainerID, idMap.Size)
  1651  		args = append(args,
  1652  			strconv.Itoa(int(idMap.ContainerID)),
  1653  			strconv.Itoa(int(idMap.HostID)),
  1654  			strconv.Itoa(int(idMap.Size)),
  1655  		)
  1656  	}
  1657  
  1658  	out, err := exec.Command("newuidmap", args...).CombinedOutput()
  1659  	log.Debugf("newuidmap: %#v\n%s", args, out)
  1660  	if err != nil {
  1661  		return fmt.Errorf("newuidmap failed: %w", err)
  1662  	}
  1663  
  1664  	args = []string{strconv.Itoa(pid)}
  1665  	for _, idMap := range spec.Linux.GIDMappings {
  1666  		log.Infof("Mapping host gid %d to container gid %d (size=%d)",
  1667  			idMap.HostID, idMap.ContainerID, idMap.Size)
  1668  		args = append(args,
  1669  			strconv.Itoa(int(idMap.ContainerID)),
  1670  			strconv.Itoa(int(idMap.HostID)),
  1671  			strconv.Itoa(int(idMap.Size)),
  1672  		)
  1673  	}
  1674  	out, err = exec.Command("newgidmap", args...).CombinedOutput()
  1675  	log.Debugf("newgidmap: %#v\n%s", args, out)
  1676  	if err != nil {
  1677  		return fmt.Errorf("newgidmap failed: %w", err)
  1678  	}
  1679  	return nil
  1680  }
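
// Editorial note (not part of the original file): for a single mapping of
// container ID 0 to host ID 100000 with size 65536 and a child PID of 1234
// (all hypothetical values), the invocation built above is equivalent to:
//
//	newuidmap 1234 0 100000 65536
//
// plus the analogous newgidmap call, matching the
// "pid containerID hostID size ..." argument order expected by newuidmap(1).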
  1681  
  1682  // Mount mounts a filesystem in a container.
  1683  func (s *Sandbox) Mount(cid, fstype, src, dest string) error {
  1684  	var files []*os.File
  1685  	switch fstype {
  1686  	case erofs.Name:
  1687  		imageFile, err := os.Open(src)
  1688  		if err != nil {
  1689  			return fmt.Errorf("opening %s: %v", src, err)
  1690  		}
  1691  		files = append(files, imageFile)
  1692  
  1693  	default:
  1694  		return fmt.Errorf("unsupported filesystem type: %v", fstype)
  1695  	}
  1696  
  1697  	args := boot.MountArgs{
  1698  		ContainerID: cid,
  1699  		Source:      src,
  1700  		Destination: dest,
  1701  		FsType:      fstype,
  1702  		FilePayload: urpc.FilePayload{Files: files},
  1703  	}
  1704  	return s.call(boot.ContMgrMount, &args, nil)
  1705  }
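
// Illustrative sketch (not part of the original file): mounting an EROFS image
// into a container; the image path and destination are hypothetical.
//
//	if err := s.Mount(cid, erofs.Name, "/images/layer.erofs", "/mnt/layer"); err != nil {
//		return err
//	}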