github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/sandbox/sandbox.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package sandbox creates and manipulates sandboxes.
    16  package sandbox
    17  
    18  import (
    19  	"context"
    20  	"encoding/json"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"math"
    25  	"os"
    26  	"os/exec"
    27  	"path/filepath"
    28  	"strconv"
    29  	"strings"
    30  	"syscall"
    31  	"time"
    32  
    33  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    34  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    35  	"github.com/MerlinKodo/gvisor/pkg/cleanup"
    36  	"github.com/MerlinKodo/gvisor/pkg/control/client"
    37  	"github.com/MerlinKodo/gvisor/pkg/control/server"
    38  	"github.com/MerlinKodo/gvisor/pkg/coverage"
    39  	"github.com/MerlinKodo/gvisor/pkg/log"
    40  	metricpb "github.com/MerlinKodo/gvisor/pkg/metric/metric_go_proto"
    41  	"github.com/MerlinKodo/gvisor/pkg/prometheus"
    42  	"github.com/MerlinKodo/gvisor/pkg/sentry/control"
    43  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
    44  	"github.com/MerlinKodo/gvisor/pkg/sentry/seccheck"
    45  	"github.com/MerlinKodo/gvisor/pkg/state/statefile"
    46  	"github.com/MerlinKodo/gvisor/pkg/sync"
    47  	"github.com/MerlinKodo/gvisor/pkg/urpc"
    48  	"github.com/MerlinKodo/gvisor/runsc/boot"
    49  	"github.com/MerlinKodo/gvisor/runsc/boot/procfs"
    50  	"github.com/MerlinKodo/gvisor/runsc/cgroup"
    51  	"github.com/MerlinKodo/gvisor/runsc/config"
    52  	"github.com/MerlinKodo/gvisor/runsc/console"
    53  	"github.com/MerlinKodo/gvisor/runsc/donation"
    54  	"github.com/MerlinKodo/gvisor/runsc/specutils"
    55  	"github.com/cenkalti/backoff"
    56  	specs "github.com/opencontainers/runtime-spec/specs-go"
    57  	"github.com/syndtr/gocapability/capability"
    58  	"golang.org/x/sys/unix"
    59  )
    60  
    61  const (
     62  	// podNameAnnotation is a pod annotation populated by containerd.
     63  	// It contains the name of the pod that a sandbox is in when running in Kubernetes.
    64  	podNameAnnotation = "io.kubernetes.cri.sandbox-name"
    65  
    66  	// namespaceAnnotation is a pod annotation populated by containerd.
    67  	// It contains the namespace of the pod that a sandbox is in when running in Kubernetes.
    68  	namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace"
    69  )
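
        // For example, when running under Kubernetes via containerd, these
        // annotations typically carry values like the following (hypothetical
        // values):
        //
        //	"io.kubernetes.cri.sandbox-name":      "nginx-abcde"
        //	"io.kubernetes.cri.sandbox-namespace": "default"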
    70  
    71  // createControlSocket finds a location and creates the socket used to
    72  // communicate with the sandbox. The socket is a UDS on the host filesystem.
    73  //
    74  // Note that abstract sockets are *not* used, because any user can connect to
    75  // them. There is no file mode protecting abstract sockets.
    76  func createControlSocket(rootDir, id string) (string, int, error) {
    77  	name := fmt.Sprintf("runsc-%s.sock", id)
    78  
    79  	// Only use absolute paths to guarantee resolution from anywhere.
    80  	for _, dir := range []string{rootDir, "/var/run", "/run", "/tmp"} {
    81  		path := filepath.Join(dir, name)
    82  		log.Debugf("Attempting to create socket file %q", path)
    83  		fd, err := server.CreateSocket(path)
    84  		if err == nil {
    85  			log.Debugf("Using socket file %q", path)
    86  			return path, fd, nil
    87  		}
    88  		log.Debugf("Failed to create socket file %q: %v", path, err)
    89  	}
     90  	return "", -1, fmt.Errorf("unable to find a location to write the socket file")
    91  }
    92  
    93  // pid is an atomic type that implements JSON marshal/unmarshal interfaces.
    94  type pid struct {
    95  	val atomicbitops.Int64
    96  }
    97  
    98  func (p *pid) store(pid int) {
    99  	p.val.Store(int64(pid))
   100  }
   101  
   102  func (p *pid) load() int {
   103  	return int(p.val.Load())
   104  }
   105  
   106  // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON.
   107  func (p *pid) UnmarshalJSON(b []byte) error {
   108  	var pid int
   109  
   110  	if err := json.Unmarshal(b, &pid); err != nil {
   111  		return err
   112  	}
   113  	p.store(pid)
   114  	return nil
   115  }
   116  
    117  // MarshalJSON implements json.Marshaler.MarshalJSON.
   118  func (p *pid) MarshalJSON() ([]byte, error) {
   119  	return json.Marshal(p.load())
   120  }
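
        // Illustrative round-trip (hypothetical values): pid marshals to and from
        // a plain JSON integer while remaining safe for concurrent access:
        //
        //	var p pid
        //	_ = json.Unmarshal([]byte("1234"), &p) // p.load() == 1234
        //	b, _ := json.Marshal(&p)               // string(b) == "1234"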
   121  
   122  // Sandbox wraps a sandbox process.
   123  //
    124  // It is used to start/stop the sandbox process (and associated processes like
   125  // gofers), as well as for running and manipulating containers inside a running
   126  // sandbox.
   127  //
   128  // Note: Sandbox must be immutable because a copy of it is saved for each
   129  // container and changes would not be synchronized to all of them.
   130  type Sandbox struct {
   131  	// ID is the id of the sandbox (immutable). By convention, this is the same
   132  	// ID as the first container run in the sandbox.
   133  	ID string `json:"id"`
   134  
   135  	// PodName is the name of the Kubernetes Pod (if any) that this sandbox
   136  	// represents. Unset if not running under containerd or Kubernetes.
   137  	PodName string `json:"podName"`
   138  
   139  	// Namespace is the Kubernetes namespace (if any) of the pod that this
   140  	// sandbox represents. Unset if not running under containerd or Kubernetes.
   141  	Namespace string `json:"namespace"`
   142  
   143  	// Pid is the pid of the running sandbox. May be 0 if the sandbox
   144  	// is not running.
   145  	Pid pid `json:"pid"`
   146  
   147  	// UID is the user ID in the parent namespace that the sandbox is running as.
   148  	UID int `json:"uid"`
   149  	// GID is the group ID in the parent namespace that the sandbox is running as.
   150  	GID int `json:"gid"`
   151  
    152  	// CgroupJSON contains the cgroup configuration that the sandbox is part of,
    153  	// and allows serialization of the configuration into JSON.
   154  	CgroupJSON cgroup.CgroupJSON `json:"cgroup"`
   155  
   156  	// OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox
   157  	// started, before it may be modified.
   158  	OriginalOOMScoreAdj int `json:"originalOomScoreAdj"`
   159  
   160  	// RegisteredMetrics is the set of metrics registered in the sandbox.
   161  	// Used for verifying metric data integrity after containers are started.
   162  	// Only populated if exporting metrics was requested when the sandbox was
   163  	// created.
   164  	RegisteredMetrics *metricpb.MetricRegistration `json:"registeredMetrics"`
   165  
   166  	// MetricMetadata are key-value pairs that are useful to export about this
   167  	// sandbox, but not part of the set of labels that uniquely identify it.
   168  	// They are static once initialized, and typically contain high-level
   169  	// configuration information about the sandbox.
   170  	MetricMetadata map[string]string `json:"metricMetadata"`
   171  
   172  	// MetricServerAddress is the address of the metric server that this sandbox
   173  	// intends to export metrics for.
   174  	// Only populated if exporting metrics was requested when the sandbox was
   175  	// created.
   176  	MetricServerAddress string `json:"metricServerAddress"`
   177  
   178  	// ControlSocketPath is the path to the sandbox's uRPC server socket.
   179  	// Connections to the sandbox are made through this.
   180  	ControlSocketPath string `json:"controlSocketPath"`
   181  
   182  	// MountHints provides extra information about container mounts that apply
   183  	// to the entire pod.
   184  	MountHints *boot.PodMountHints `json:"mountHints"`
   185  
   186  	// child is set if a sandbox process is a child of the current process.
   187  	//
    188  	// This field isn't saved to JSON, because only the creator of the
    189  	// sandbox will have it as a child process.
   190  	child bool `nojson:"true"`
   191  
   192  	// statusMu protects status.
   193  	statusMu sync.Mutex `nojson:"true"`
   194  
    195  	// status is the exit status of the sandbox process. It's only set if
    196  	// child==true and the sandbox was waited on. This field allows multiple
    197  	// threads to wait on the sandbox and get its exit code, since Linux
    198  	// returns the WaitStatus to only one of the waiters.
   199  	status unix.WaitStatus `nojson:"true"`
   200  }
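
        // A serialized Sandbox (abbreviated, hypothetical values) looks like:
        //
        //	{"id": "abc", "podName": "nginx-abcde", "namespace": "default",
        //	 "pid": 1234, "uid": 65534, "gid": 65534, ...}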
   201  
   202  // Getpid returns the process ID of the sandbox process.
   203  func (s *Sandbox) Getpid() int {
   204  	return s.Pid.load()
   205  }
   206  
   207  // Args is used to configure a new sandbox.
   208  type Args struct {
   209  	// ID is the sandbox unique identifier.
   210  	ID string
   211  
   212  	// Spec is the OCI spec that describes the container.
   213  	Spec *specs.Spec
   214  
   215  	// BundleDir is the directory containing the container bundle.
   216  	BundleDir string
   217  
   218  	// ConsoleSocket is the path to a unix domain socket that will receive
   219  	// the console FD. It may be empty.
   220  	ConsoleSocket string
   221  
   222  	// UserLog is the filename to send user-visible logs to. It may be empty.
   223  	UserLog string
   224  
    225  	// IOFiles is the list of files that connect to a gofer endpoint for the
    226  	// mount points served by gofers. They must be in the same order as the
    227  	// mounts appear in the spec.
   228  	IOFiles []*os.File
   229  
   230  	// OverlayFilestoreFiles are the regular files that will back the tmpfs upper
   231  	// mount in the overlay mounts.
   232  	OverlayFilestoreFiles []*os.File
   233  
   234  	// OverlayMediums contains information about how the gofer mounts have been
   235  	// overlaid. The first entry is for rootfs and the following entries are for
   236  	// bind mounts in Spec.Mounts (in the same order).
   237  	OverlayMediums boot.OverlayMediumFlags
   238  
    239  	// MountHints provides extra information about container mounts that apply
   240  	// to the entire pod.
   241  	MountHints *boot.PodMountHints
   242  
    243  	// MountsFile is a file containing mount information from the spec. It's
   244  	// equivalent to the mounts from the spec, except that all paths have been
   245  	// resolved to their final absolute location.
   246  	MountsFile *os.File
   247  
    248  	// Cgroup is the cgroup that the sandbox is part of.
   249  	Cgroup cgroup.Cgroup
   250  
    251  	// Attached indicates that the sandbox lifecycle is attached to the caller.
   252  	// If the caller exits, the sandbox should exit too.
   253  	Attached bool
   254  
    255  	// SinkFiles is an ordered array of files to be used by seccheck sinks
   256  	// configured from the --pod-init-config file.
   257  	SinkFiles []*os.File
   258  
   259  	// PassFiles are user-supplied files from the host to be exposed to the
   260  	// sandboxed app.
   261  	PassFiles map[int]*os.File
   262  
   263  	// ExecFile is the file from the host used for program execution.
   264  	ExecFile *os.File
   265  
   266  	// NvidiaDevMinors is the list of device minors for Nvidia GPU devices
   267  	// exposed to the sandbox.
   268  	NvidiaDevMinors boot.NvidiaDevMinors
   269  }
   270  
   271  // New creates the sandbox process. The caller must call Destroy() on the
   272  // sandbox.
   273  func New(conf *config.Config, args *Args) (*Sandbox, error) {
   274  	s := &Sandbox{
   275  		ID: args.ID,
   276  		CgroupJSON: cgroup.CgroupJSON{
   277  			Cgroup: args.Cgroup,
   278  		},
   279  		UID:                 -1, // prevent usage before it's set.
   280  		GID:                 -1, // prevent usage before it's set.
   281  		MetricMetadata:      conf.MetricMetadata(),
   282  		MetricServerAddress: conf.MetricServer,
   283  		MountHints:          args.MountHints,
   284  	}
   285  	if args.Spec != nil && args.Spec.Annotations != nil {
   286  		s.PodName = args.Spec.Annotations[podNameAnnotation]
   287  		s.Namespace = args.Spec.Annotations[namespaceAnnotation]
   288  	}
   289  
   290  	// The Cleanup object cleans up partially created sandboxes when an error
   291  	// occurs. Any errors occurring during cleanup itself are ignored.
   292  	c := cleanup.Make(func() {
   293  		if err := s.destroy(); err != nil {
   294  			log.Warningf("error destroying sandbox: %v", err)
   295  		}
   296  	})
   297  	defer c.Clean()
   298  
   299  	if len(conf.PodInitConfig) > 0 {
   300  		initConf, err := boot.LoadInitConfig(conf.PodInitConfig)
   301  		if err != nil {
   302  			return nil, fmt.Errorf("loading init config file: %w", err)
   303  		}
   304  		args.SinkFiles, err = initConf.Setup()
   305  		if err != nil {
   306  			return nil, fmt.Errorf("cannot init config: %w", err)
   307  		}
   308  	}
   309  
    310  	// Create a pipe to synchronize when the sandbox process has booted.
   311  	clientSyncFile, sandboxSyncFile, err := os.Pipe()
   312  	if err != nil {
   313  		return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err)
   314  	}
   315  	defer clientSyncFile.Close()
   316  
   317  	// Create the sandbox process.
   318  	err = s.createSandboxProcess(conf, args, sandboxSyncFile)
   319  	// sandboxSyncFile has to be closed to be able to detect when the sandbox
   320  	// process exits unexpectedly.
   321  	sandboxSyncFile.Close()
   322  	if err != nil {
   323  		return nil, fmt.Errorf("cannot create sandbox process: %w", err)
   324  	}
   325  
   326  	// Wait until the sandbox has booted.
   327  	b := make([]byte, 1)
   328  	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
   329  		err := fmt.Errorf("waiting for sandbox to start: %v", err)
   330  		// If the sandbox failed to start, it may be because the binary
   331  		// permissions were incorrect. Check the bits and return a more helpful
   332  		// error message.
   333  		//
   334  		// NOTE: The error message is checked because error types are lost over
   335  		// rpc calls.
   336  		if strings.Contains(err.Error(), io.EOF.Error()) {
   337  			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
   338  				return nil, fmt.Errorf("%v: %v", err, permsErr)
   339  			}
   340  		}
   341  		return nil, fmt.Errorf("cannot read client sync file: %w", err)
   342  	}
   343  
   344  	if conf.MetricServer != "" {
   345  		// The control server is up and the sandbox was configured to export metrics.
   346  		// We must gather data about registered metrics prior to any process starting in the sandbox.
   347  		log.Debugf("Getting metric registration information from sandbox %q", s.ID)
   348  		var registeredMetrics control.MetricsRegistrationResponse
   349  		if err := s.call(boot.MetricsGetRegistered, nil, &registeredMetrics); err != nil {
   350  			return nil, fmt.Errorf("cannot get registered metrics: %v", err)
   351  		}
   352  		s.RegisteredMetrics = registeredMetrics.RegisteredMetrics
   353  	}
   354  
   355  	c.Release()
   356  	return s, nil
   357  }
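
        // Illustrative lifecycle (a simplified sketch; error handling elided, and
        // conf, spec, and dir are assumed to exist):
        //
        //	sb, err := New(conf, &Args{ID: "abc", Spec: spec, BundleDir: dir})
        //	if err != nil { /* ... */ }
        //	if err := sb.StartRoot(conf); err != nil { /* ... */ }
        //	ws, err := sb.Wait(sb.ID) // blocks until the root container exits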
   358  
   359  // CreateSubcontainer creates a container inside the sandbox.
   360  func (s *Sandbox) CreateSubcontainer(conf *config.Config, cid string, tty *os.File) error {
   361  	log.Debugf("Create sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
   362  
   363  	var files []*os.File
   364  	if tty != nil {
   365  		files = []*os.File{tty}
   366  	}
   367  	if err := s.configureStdios(conf, files); err != nil {
   368  		return err
   369  	}
   370  
   371  	args := boot.CreateArgs{
   372  		CID:         cid,
   373  		FilePayload: urpc.FilePayload{Files: files},
   374  	}
   375  	if err := s.call(boot.ContMgrCreateSubcontainer, &args, nil); err != nil {
   376  		return fmt.Errorf("creating sub-container %q: %w", cid, err)
   377  	}
   378  	return nil
   379  }
   380  
   381  // StartRoot starts running the root container process inside the sandbox.
   382  func (s *Sandbox) StartRoot(conf *config.Config) error {
   383  	pid := s.Pid.load()
   384  	log.Debugf("Start root sandbox %q, PID: %d", s.ID, pid)
   385  	conn, err := s.sandboxConnect()
   386  	if err != nil {
   387  		return err
   388  	}
   389  	defer conn.Close()
   390  
   391  	// Configure the network.
   392  	if err := setupNetwork(conn, pid, conf); err != nil {
   393  		return fmt.Errorf("setting up network: %w", err)
   394  	}
   395  
   396  	// Send a message to the sandbox control server to start the root container.
   397  	if err := conn.Call(boot.ContMgrRootContainerStart, &s.ID, nil); err != nil {
   398  		return fmt.Errorf("starting root container: %w", err)
   399  	}
   400  
   401  	return nil
   402  }
   403  
   404  // StartSubcontainer starts running a sub-container inside the sandbox.
   405  func (s *Sandbox) StartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, overlayFilestoreFiles []*os.File, overlayMediums []boot.OverlayMedium) error {
   406  	log.Debugf("Start sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
   407  
   408  	if err := s.configureStdios(conf, stdios); err != nil {
   409  		return err
   410  	}
   411  	s.fixPidns(spec)
   412  
   413  	// The payload contains (in this specific order):
   414  	// * stdin/stdout/stderr (optional: only present when not using TTY)
   415  	// * The subcontainer's overlay filestore files (optional: only present when
   416  	//   host file backed overlay is configured)
   417  	// * Gofer files.
   418  	payload := urpc.FilePayload{}
   419  	payload.Files = append(payload.Files, stdios...)
   420  	payload.Files = append(payload.Files, overlayFilestoreFiles...)
   421  	payload.Files = append(payload.Files, goferFiles...)
   422  
   423  	// Start running the container.
   424  	args := boot.StartArgs{
   425  		Spec:                   spec,
   426  		Conf:                   conf,
   427  		CID:                    cid,
   428  		NumOverlayFilestoreFDs: len(overlayFilestoreFiles),
   429  		OverlayMediums:         overlayMediums,
   430  		FilePayload:            payload,
   431  	}
   432  	if err := s.call(boot.ContMgrStartSubcontainer, &args, nil); err != nil {
   433  		return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err)
   434  	}
   435  	return nil
   436  }
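
        // For example, with stdios [in, out, err], two overlay filestore files, and
        // two gofer files, the donated payload order is:
        //
        //	[in, out, err, filestore0, filestore1, gofer0, gofer1]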
   437  
   438  // Restore sends the restore call for a container in the sandbox.
   439  func (s *Sandbox) Restore(conf *config.Config, cid string, filename string) error {
   440  	log.Debugf("Restore sandbox %q", s.ID)
   441  
   442  	rf, err := os.Open(filename)
   443  	if err != nil {
   444  		return fmt.Errorf("opening restore file %q failed: %v", filename, err)
   445  	}
   446  	defer rf.Close()
   447  
   448  	opt := boot.RestoreOpts{
   449  		FilePayload: urpc.FilePayload{
   450  			Files: []*os.File{rf},
   451  		},
   452  		SandboxID: s.ID,
   453  	}
   454  
   455  	// If the platform needs a device FD we must pass it in.
   456  	if deviceFile, err := deviceFileForPlatform(conf.Platform, conf.PlatformDevicePath); err != nil {
   457  		return err
   458  	} else if deviceFile != nil {
   459  		defer deviceFile.Close()
   460  		opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile)
   461  	}
   462  
   463  	conn, err := s.sandboxConnect()
   464  	if err != nil {
   465  		return err
   466  	}
   467  	defer conn.Close()
   468  
   469  	// Configure the network.
   470  	if err := setupNetwork(conn, s.Pid.load(), conf); err != nil {
   471  		return fmt.Errorf("setting up network: %v", err)
   472  	}
   473  
   474  	// Restore the container and start the root container.
   475  	if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil {
   476  		return fmt.Errorf("restoring container %q: %v", cid, err)
   477  	}
   478  
   479  	return nil
   480  }
   481  
   482  // Processes retrieves the list of processes and associated metadata for a
   483  // given container in this sandbox.
   484  func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
   485  	log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID)
   486  	var pl []*control.Process
   487  	if err := s.call(boot.ContMgrProcesses, &cid, &pl); err != nil {
   488  		return nil, fmt.Errorf("retrieving process data from sandbox: %v", err)
   489  	}
   490  	return pl, nil
   491  }
   492  
   493  // CreateTraceSession creates a new trace session.
   494  func (s *Sandbox) CreateTraceSession(config *seccheck.SessionConfig, force bool) error {
   495  	log.Debugf("Creating trace session in sandbox %q", s.ID)
   496  
   497  	sinkFiles, err := seccheck.SetupSinks(config.Sinks)
   498  	if err != nil {
   499  		return err
   500  	}
   501  	defer func() {
   502  		for _, f := range sinkFiles {
   503  			_ = f.Close()
   504  		}
   505  	}()
   506  
   507  	arg := boot.CreateTraceSessionArgs{
   508  		Config: *config,
   509  		Force:  force,
   510  		FilePayload: urpc.FilePayload{
   511  			Files: sinkFiles,
   512  		},
   513  	}
   514  	if err := s.call(boot.ContMgrCreateTraceSession, &arg, nil); err != nil {
   515  		return fmt.Errorf("creating trace session: %w", err)
   516  	}
   517  	return nil
   518  }
   519  
   520  // DeleteTraceSession deletes an existing trace session.
   521  func (s *Sandbox) DeleteTraceSession(name string) error {
   522  	log.Debugf("Deleting trace session %q in sandbox %q", name, s.ID)
   523  	if err := s.call(boot.ContMgrDeleteTraceSession, name, nil); err != nil {
   524  		return fmt.Errorf("deleting trace session: %w", err)
   525  	}
   526  	return nil
   527  }
   528  
   529  // ListTraceSessions lists all trace sessions.
   530  func (s *Sandbox) ListTraceSessions() ([]seccheck.SessionConfig, error) {
   531  	log.Debugf("Listing trace sessions in sandbox %q", s.ID)
   532  	var sessions []seccheck.SessionConfig
   533  	if err := s.call(boot.ContMgrListTraceSessions, nil, &sessions); err != nil {
   534  		return nil, fmt.Errorf("listing trace session: %w", err)
   535  	}
   536  	return sessions, nil
   537  }
   538  
   539  // ProcfsDump collects and returns a procfs dump for the sandbox.
   540  func (s *Sandbox) ProcfsDump() ([]procfs.ProcessProcfsDump, error) {
   541  	log.Debugf("Procfs dump %q", s.ID)
   542  	var procfsDump []procfs.ProcessProcfsDump
   543  	if err := s.call(boot.ContMgrProcfsDump, nil, &procfsDump); err != nil {
   544  		return nil, fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err)
   545  	}
   546  	return procfsDump, nil
   547  }
   548  
   549  // NewCGroup returns the sandbox's Cgroup, or an error if it does not have one.
   550  func (s *Sandbox) NewCGroup() (cgroup.Cgroup, error) {
   551  	return cgroup.NewFromPid(s.Pid.load(), false /* useSystemd */)
   552  }
   553  
   554  // Execute runs the specified command in the container. It returns the PID of
   555  // the newly created process.
   556  func (s *Sandbox) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) {
   557  	log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID)
   558  
    559  	// Stdios are those files mapped to a guest FD <= 2 in the new process. We
    560  	// do not want the ownership of other files to be changed by configureStdios.
   561  	var stdios []*os.File
   562  	for i, fd := range args.GuestFDs {
   563  		if fd > 2 || i >= len(args.Files) {
   564  			continue
   565  		}
   566  		stdios = append(stdios, args.Files[i])
   567  	}
   568  
   569  	if err := s.configureStdios(conf, stdios); err != nil {
   570  		return 0, err
   571  	}
   572  
   573  	// Send a message to the sandbox control server to start the container.
   574  	var pid int32
   575  	if err := s.call(boot.ContMgrExecuteAsync, args, &pid); err != nil {
   576  		return 0, fmt.Errorf("executing command %q in sandbox: %w", args, err)
   577  	}
   578  	return pid, nil
   579  }
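
        // For example (hypothetical values), with args.GuestFDs = [0, 1, 2, 5] and
        // four entries in args.Files, only the first three files (guest FDs 0-2)
        // are treated as stdios and have their ownership adjusted by
        // configureStdios; the file mapped to guest FD 5 is passed along unchanged.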
   580  
   581  // Event retrieves stats about the sandbox such as memory and CPU utilization.
   582  func (s *Sandbox) Event(cid string) (*boot.EventOut, error) {
   583  	log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID)
   584  	var e boot.EventOut
   585  	if err := s.call(boot.ContMgrEvent, &cid, &e); err != nil {
   586  		return nil, fmt.Errorf("retrieving event data from sandbox: %w", err)
   587  	}
   588  	return &e, nil
   589  }
   590  
   591  // PortForward starts port forwarding to the sandbox.
   592  func (s *Sandbox) PortForward(opts *boot.PortForwardOpts) error {
   593  	log.Debugf("Requesting port forward for container %q in sandbox %q: %+v", opts.ContainerID, s.ID, opts)
   594  	conn, err := s.sandboxConnect()
   595  	if err != nil {
   596  		return err
   597  	}
   598  	defer conn.Close()
   599  
   600  	if err := conn.Call(boot.ContMgrPortForward, opts, nil); err != nil {
   601  		return fmt.Errorf("port forwarding to sandbox: %v", err)
   602  	}
   603  
   604  	return nil
   605  }
   606  
   607  func (s *Sandbox) sandboxConnect() (*urpc.Client, error) {
   608  	log.Debugf("Connecting to sandbox %q", s.ID)
   609  	path := s.ControlSocketPath
   610  	if len(path) >= linux.UnixPathMax {
   611  		// This is not an abstract socket path. It is a filesystem path.
    612  		// UDS connect fails when len(socket path) >= UNIX_PATH_MAX. Instead,
   613  		// open the socket using open(2) and use /proc to refer to the open FD.
   614  		sockFD, err := unix.Open(path, unix.O_PATH, 0)
   615  		if err != nil {
   616  			return nil, fmt.Errorf("failed to open socket at %q", path)
   617  		}
   618  		defer unix.Close(sockFD)
   619  		path = filepath.Join("/proc/self/fd", fmt.Sprintf("%d", sockFD))
   620  	}
   621  	conn, err := client.ConnectTo(path)
   622  	if err != nil {
   623  		return nil, s.connError(err)
   624  	}
   625  	return conn, nil
   626  }
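
        // E.g. a control socket path longer than UNIX_PATH_MAX (108 bytes on Linux)
        // cannot be passed to connect(2) directly, so the connection is made through
        // the short, fixed-length alias /proc/self/fd/<N> instead.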
   627  
   628  func (s *Sandbox) call(method string, arg, result any) error {
   629  	conn, err := s.sandboxConnect()
   630  	if err != nil {
   631  		return err
   632  	}
   633  	defer conn.Close()
   634  
   635  	return conn.Call(method, arg, result)
   636  }
   637  
   638  func (s *Sandbox) connError(err error) error {
   639  	return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid.load(), err)
   640  }
   641  
   642  // createSandboxProcess starts the sandbox as a subprocess by running the "boot"
   643  // command, passing in the bundle dir.
   644  func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
   645  	donations := donation.Agency{}
   646  	defer donations.Close()
   647  
   648  	// pgalloc.MemoryFile (which provides application memory) sometimes briefly
   649  	// mlock(2)s ranges of memory in order to fault in a large number of pages at
   650  	// a time. Try to make RLIMIT_MEMLOCK unlimited so that it can do so. runsc
   651  	// expects to run in a memory cgroup that limits its memory usage as
   652  	// required.
   653  	// This needs to be done before exec'ing `runsc boot`, as that subcommand
   654  	// runs as an unprivileged user that will not be able to call `setrlimit`
   655  	// by itself. Calling `setrlimit` here will have the side-effect of setting
   656  	// the limit on the currently-running `runsc` process as well, but that
   657  	// should be OK too.
   658  	var rlim unix.Rlimit
   659  	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
   660  		log.Warningf("Failed to get RLIMIT_MEMLOCK: %v", err)
   661  	} else if rlim.Cur != unix.RLIM_INFINITY || rlim.Max != unix.RLIM_INFINITY {
   662  		rlim.Cur = unix.RLIM_INFINITY
   663  		rlim.Max = unix.RLIM_INFINITY
   664  		if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
   665  			// We may not have CAP_SYS_RESOURCE, so this failure may be expected.
   666  			log.Infof("Failed to set RLIMIT_MEMLOCK: %v", err)
   667  		}
   668  	}
   669  
   670  	//
   671  	// These flags must come BEFORE the "boot" command in cmd.Args.
   672  	//
   673  
   674  	// Open the log files to pass to the sandbox as FDs.
   675  	if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
   676  		return err
   677  	}
   678  
   679  	test := ""
   680  	if len(conf.TestOnlyTestNameEnv) != 0 {
    681  		// Fetch the test name if one is provided and the test-only flag was set.
   682  		if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
   683  			test = t
   684  		}
   685  	}
   686  	if specutils.IsDebugCommand(conf, "boot") {
   687  		if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "boot", test); err != nil {
   688  			return err
   689  		}
   690  	}
   691  	if err := donations.DonateDebugLogFile("panic-log-fd", conf.PanicLog, "panic", test); err != nil {
   692  		return err
   693  	}
   694  	covFilename := conf.CoverageReport
   695  	if covFilename == "" {
   696  		covFilename = os.Getenv("GO_COVERAGE_FILE")
   697  	}
   698  	if covFilename != "" && coverage.Available() {
   699  		if err := donations.DonateDebugLogFile("coverage-fd", covFilename, "cov", test); err != nil {
   700  			return err
   701  		}
   702  	}
   703  	if err := donations.DonateDebugLogFile("profiling-metrics-fd", conf.ProfilingMetricsLog, "metrics", test); err != nil {
   704  		return err
   705  	}
   706  
   707  	// Relay all the config flags to the sandbox process.
   708  	cmd := exec.Command(specutils.ExePath, conf.ToFlags()...)
   709  	cmd.SysProcAttr = &unix.SysProcAttr{
   710  		// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
   711  		// when re-parented.
   712  		Setsid: true,
   713  	}
   714  
    715  	// Set Args[0] to make it easier to spot the sandbox process. Otherwise
    716  	// it's shown as `exe`.
   717  	cmd.Args[0] = "runsc-sandbox"
   718  
    719  	// Transfer FDs that need to be present before the "boot" command.
   720  	// Start at 3 because 0, 1, and 2 are taken by stdin/out/err.
   721  	nextFD := donations.Transfer(cmd, 3)
   722  
   723  	// Add the "boot" command to the args.
   724  	//
    725  	// All flags after this must be for the boot command.
   726  	cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir)
   727  
   728  	// Clear environment variables, unless --TESTONLY-unsafe-nonroot is set.
   729  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   730  		// Setting cmd.Env = nil causes cmd to inherit the current process's env.
   731  		cmd.Env = []string{}
   732  	}
   733  
    734  	// If there is a gofer, send all socket ends to the sandbox.
   735  	donations.DonateAndClose("io-fds", args.IOFiles...)
   736  	donations.DonateAndClose("overlay-filestore-fds", args.OverlayFilestoreFiles...)
   737  	donations.DonateAndClose("mounts-fd", args.MountsFile)
   738  	donations.Donate("start-sync-fd", startSyncFile)
   739  	if err := donations.OpenAndDonate("user-log-fd", args.UserLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
   740  		return err
   741  	}
   742  	const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC
   743  	if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock, profFlags); err != nil {
   744  		return err
   745  	}
   746  	if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU, profFlags); err != nil {
   747  		return err
   748  	}
   749  	if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap, profFlags); err != nil {
   750  		return err
   751  	}
   752  	if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex, profFlags); err != nil {
   753  		return err
   754  	}
   755  	if err := donations.OpenAndDonate("trace-fd", conf.TraceFile, profFlags); err != nil {
   756  		return err
   757  	}
   758  
   759  	// Pass nvidia device minors.
   760  	if len(args.NvidiaDevMinors) > 0 {
   761  		cmd.Args = append(cmd.Args, "--nvidia-dev-minors="+args.NvidiaDevMinors.String())
   762  	}
   763  
   764  	// Pass overlay mediums.
   765  	cmd.Args = append(cmd.Args, "--overlay-mediums="+args.OverlayMediums.String())
   766  
   767  	// Create a socket for the control server and donate it to the sandbox.
   768  	controlSocketPath, sockFD, err := createControlSocket(conf.RootDir, s.ID)
   769  	if err != nil {
   770  		return fmt.Errorf("failed to create control socket: %v", err)
   771  	}
   772  	s.ControlSocketPath = controlSocketPath
   773  	log.Infof("Control socket path: %q", s.ControlSocketPath)
   774  	donations.DonateAndClose("controller-fd", os.NewFile(uintptr(sockFD), "control_server_socket"))
   775  
   776  	specFile, err := specutils.OpenSpec(args.BundleDir)
   777  	if err != nil {
   778  		return fmt.Errorf("cannot open spec file in bundle dir %v: %w", args.BundleDir, err)
   779  	}
   780  	donations.DonateAndClose("spec-fd", specFile)
   781  
   782  	if err := donations.OpenAndDonate("pod-init-config-fd", conf.PodInitConfig, os.O_RDONLY); err != nil {
   783  		return err
   784  	}
   785  	donations.DonateAndClose("sink-fds", args.SinkFiles...)
   786  
   787  	gPlatform, err := platform.Lookup(conf.Platform)
   788  	if err != nil {
   789  		return fmt.Errorf("cannot look up platform: %w", err)
   790  	}
   791  	if deviceFile, err := gPlatform.OpenDevice(conf.PlatformDevicePath); err != nil {
   792  		return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err)
   793  	} else if deviceFile != nil {
   794  		donations.DonateAndClose("device-fd", deviceFile)
   795  	}
   796  
   797  	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
   798  	// isn't set.
   799  	if conf.Platform == "kvm" {
   800  		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
   801  	}
   802  
   803  	// nss is the set of namespaces to join or create before starting the sandbox
   804  	// process. Mount, IPC and UTS namespaces from the host are not used as they
   805  	// are virtualized inside the sandbox. Be paranoid and run inside an empty
   806  	// namespace for these. Don't unshare cgroup because sandbox is added to a
   807  	// cgroup in the caller's namespace.
   808  	log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces")
   809  	nss := []specs.LinuxNamespace{
   810  		{Type: specs.IPCNamespace},
   811  		{Type: specs.MountNamespace},
   812  		{Type: specs.UTSNamespace},
   813  	}
   814  
   815  	if gPlatform.Requirements().RequiresCurrentPIDNS {
   816  		// TODO(b/75837838): Also set a new PID namespace so that we limit
   817  		// access to other host processes.
   818  		log.Infof("Sandbox will be started in the current PID namespace")
   819  	} else {
   820  		log.Infof("Sandbox will be started in a new PID namespace")
   821  		nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
   822  		cmd.Args = append(cmd.Args, "--pidns=true")
   823  	}
   824  
    825  	// Join the network namespace if network is enabled. The sandbox talks
   826  	// directly to the host network, which may have been configured in the
   827  	// namespace.
   828  	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
   829  		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
   830  		nss = append(nss, ns)
   831  	} else if conf.Network == config.NetworkHost {
   832  		log.Infof("Sandbox will be started in the host network namespace")
   833  	} else {
   834  		log.Infof("Sandbox will be started in new network namespace")
   835  		nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
   836  	}
   837  
   838  	// These are set to the uid/gid that the sandbox process will use. May be
    839  	// overridden below.
   840  	s.UID = os.Getuid()
   841  	s.GID = os.Getgid()
   842  
    843  	// Whether to use a user namespace depends on the network type and whether
    844  	// access to the host filesystem is required. These features require running
    845  	// inside the user namespace specified in the spec, or the current namespace
    846  	// if none is configured.
   847  	rootlessEUID := unix.Geteuid() != 0
   848  	setUserMappings := false
   849  	if conf.Network == config.NetworkHost || conf.DirectFS {
   850  		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
   851  			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
   852  			nss = append(nss, userns)
   853  			if rootlessEUID {
   854  				syncFile, err := ConfigureCmdForRootless(cmd, &donations)
   855  				if err != nil {
   856  					return err
   857  				}
   858  				defer syncFile.Close()
   859  				setUserMappings = true
   860  			} else {
   861  				specutils.SetUIDGIDMappings(cmd, args.Spec)
   862  				// We need to set UID and GID to have capabilities in a new user namespace.
   863  				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
   864  			}
   865  		} else {
   866  			if rootlessEUID {
   867  				return fmt.Errorf("unable to run a rootless container without userns")
   868  			}
   869  			log.Infof("Sandbox will be started in the current user namespace")
   870  		}
   871  		// When running in the caller's defined user namespace, apply the same
    872  		// capabilities to the sandbox process to ensure it abides by the same
   873  		// rules.
   874  		cmd.Args = append(cmd.Args, "--apply-caps=true")
   875  
   876  		// If we have CAP_SYS_ADMIN, we can create an empty chroot and
   877  		// bind-mount the executable inside it.
   878  		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   879  			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
   880  		} else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) || rootlessEUID {
   881  			log.Infof("Sandbox will be started in minimal chroot")
   882  			cmd.Args = append(cmd.Args, "--setup-root")
   883  		} else {
   884  			return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN")
   885  		}
   886  	} else {
   887  		// If we have CAP_SETUID and CAP_SETGID, then we can also run
   888  		// as user nobody.
   889  		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   890  			log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
   891  			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
   892  		} else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
   893  			log.Infof("Sandbox will be started in new user namespace")
   894  			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
   895  			cmd.Args = append(cmd.Args, "--setup-root")
   896  
   897  			const nobody = 65534
   898  			if rootlessEUID || conf.Rootless {
   899  				log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
   900  			} else {
   901  				// Map nobody in the new namespace to nobody in the parent namespace.
   902  				s.UID = nobody
   903  				s.GID = nobody
   904  			}
   905  
   906  			// Set credentials to run as user and group nobody.
   907  			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody}
   908  			cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
   909  				{
   910  					ContainerID: nobody,
   911  					HostID:      s.UID,
   912  					Size:        1,
   913  				},
   914  			}
   915  			cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
   916  				{
   917  					ContainerID: nobody,
   918  					HostID:      s.GID,
   919  					Size:        1,
   920  				},
   921  			}
   922  
   923  			// A sandbox process will construct an empty root for itself, so it has
   924  			// to have CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities.
   925  			cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps,
   926  				uintptr(capability.CAP_SYS_ADMIN),
   927  				uintptr(capability.CAP_SYS_CHROOT),
   928  				// CAP_SETPCAP is required to clear the bounding set.
   929  				uintptr(capability.CAP_SETPCAP),
   930  			)
   931  
   932  		} else {
   933  			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
   934  		}
   935  	}
   936  
   937  	// The current process' stdio must be passed to the application via the
   938  	// --stdio-fds flag. The stdio of the sandbox process itself must not
   939  	// be connected to the same FDs, otherwise we risk leaking sandbox
   940  	// errors to the application, so we set the sandbox stdio to nil,
   941  	// causing them to read/write from the null device.
   942  	cmd.Stdin = nil
   943  	cmd.Stdout = nil
   944  	cmd.Stderr = nil
   945  	var stdios [3]*os.File
   946  
   947  	// If the console control socket file is provided, then create a new
   948  	// pty master/replica pair and set the TTY on the sandbox process.
   949  	if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
   950  		// console.NewWithSocket will send the master on the given
   951  		// socket, and return the replica.
   952  		tty, err := console.NewWithSocket(args.ConsoleSocket)
   953  		if err != nil {
   954  			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
   955  		}
   956  		defer tty.Close()
   957  
   958  		// Set the TTY as a controlling TTY on the sandbox process.
   959  		cmd.SysProcAttr.Setctty = true
   960  
   961  		// Inconveniently, the Ctty must be the FD in the *child* process's FD
   962  		// table. So transfer all files we have so far and make sure the next file
   963  		// added to donations is stdin.
   964  		//
   965  		// See https://github.com/golang/go/issues/29458.
   966  		nextFD = donations.Transfer(cmd, nextFD)
   967  		cmd.SysProcAttr.Ctty = nextFD
   968  
   969  		// Pass the tty as all stdio fds to sandbox.
   970  		stdios[0] = tty
   971  		stdios[1] = tty
   972  		stdios[2] = tty
   973  
   974  		if conf.Debug {
   975  			// If debugging, send the boot process stdio to the
   976  			// TTY, so that it is easier to find.
   977  			cmd.Stdin = tty
   978  			cmd.Stdout = tty
   979  			cmd.Stderr = tty
   980  		}
   981  	} else {
   982  		// If not using a console, pass our current stdio as the
   983  		// container stdio via flags.
   984  		stdios[0] = os.Stdin
   985  		stdios[1] = os.Stdout
   986  		stdios[2] = os.Stderr
   987  
   988  		if conf.Debug {
    989  			// If debugging, send the boot process stdio to this
    990  			// process's stdio, so that it is easier to find.
   991  			cmd.Stdin = os.Stdin
   992  			cmd.Stdout = os.Stdout
   993  			cmd.Stderr = os.Stderr
   994  		}
   995  	}
   996  	if err := s.configureStdios(conf, stdios[:]); err != nil {
   997  		return fmt.Errorf("configuring stdios: %w", err)
   998  	}
   999  	// Note: this must be done right after "cmd.SysProcAttr.Ctty" is set above
  1000  	// because it relies on stdin being the next FD donated.
  1001  	donations.Donate("stdio-fds", stdios[:]...)
  1002  
  1003  	totalSysMem, err := totalSystemMemory()
  1004  	if err != nil {
  1005  		return err
  1006  	}
  1007  	cmd.Args = append(cmd.Args, "--total-host-memory", strconv.FormatUint(totalSysMem, 10))
  1008  
  1009  	mem := totalSysMem
  1010  	if s.CgroupJSON.Cgroup != nil {
  1011  		cpuNum, err := s.CgroupJSON.Cgroup.NumCPU()
  1012  		if err != nil {
  1013  			return fmt.Errorf("getting cpu count from cgroups: %v", err)
  1014  		}
  1015  		if conf.CPUNumFromQuota {
   1016  			// Dropping below 2 CPUs can cause applications to disable
   1017  			// locks, which can lead to hard-to-debug errors, so just
   1018  			// leave two cores as a reasonable default.
  1019  			const minCPUs = 2
  1020  
  1021  			quota, err := s.CgroupJSON.Cgroup.CPUQuota()
  1022  			if err != nil {
  1023  				return fmt.Errorf("getting cpu quota from cgroups: %v", err)
  1024  			}
  1025  			if n := int(math.Ceil(quota)); n > 0 {
  1026  				if n < minCPUs {
  1027  					n = minCPUs
  1028  				}
  1029  				if n < cpuNum {
  1030  					// Only lower the cpu number.
  1031  					cpuNum = n
  1032  				}
  1033  			}
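        			// E.g. (hypothetical numbers): a quota of 0.5 rounds up to 1
        			// and is then raised to minCPUs (2); a quota of 7.3 rounds up
        			// to 8, which applies only if it is below the cgroup's CPU
        			// count, since the quota can only lower cpuNum, never raise it.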
  1034  		}
  1035  		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
  1036  
  1037  		memLimit, err := s.CgroupJSON.Cgroup.MemoryLimit()
  1038  		if err != nil {
  1039  			return fmt.Errorf("getting memory limit from cgroups: %v", err)
  1040  		}
  1041  		if memLimit < mem {
  1042  			mem = memLimit
  1043  		}
  1044  	}
  1045  	cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10))
  1046  
  1047  	if args.Attached {
  1048  		// Kill sandbox if parent process exits in attached mode.
  1049  		cmd.SysProcAttr.Pdeathsig = unix.SIGKILL
  1050  		// Tells boot that any process it creates must have pdeathsig set.
  1051  		cmd.Args = append(cmd.Args, "--attached")
  1052  	}
  1053  
  1054  	if args.ExecFile != nil {
  1055  		donations.Donate("exec-fd", args.ExecFile)
  1056  	}
  1057  
  1058  	nextFD = donations.Transfer(cmd, nextFD)
  1059  
  1060  	_ = donation.DonateAndTransferCustomFiles(cmd, nextFD, args.PassFiles)
  1061  
  1062  	// Add container ID as the last argument.
  1063  	cmd.Args = append(cmd.Args, s.ID)
  1064  
  1065  	donation.LogDonations(cmd)
  1066  	log.Debugf("Starting sandbox: %s %v", cmd.Path, cmd.Args)
  1067  	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
  1068  	if err := specutils.StartInNS(cmd, nss); err != nil {
  1069  		err := fmt.Errorf("starting sandbox: %v", err)
  1070  		// If the sandbox failed to start, it may be because the binary
  1071  		// permissions were incorrect. Check the bits and return a more helpful
  1072  		// error message.
  1073  		//
  1074  		// NOTE: The error message is checked because error types are lost over
  1075  		// rpc calls.
  1076  		if strings.Contains(err.Error(), unix.EACCES.Error()) {
  1077  			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
  1078  				return fmt.Errorf("%v: %v", err, permsErr)
  1079  			}
  1080  		}
  1081  		return err
  1082  	}
  1083  	s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid)
  1084  	if err != nil {
  1085  		return err
  1086  	}
  1087  	if setUserMappings {
  1088  		if err := SetUserMappings(args.Spec, cmd.Process.Pid); err != nil {
  1089  			return err
  1090  		}
  1091  	}
  1092  
  1093  	s.child = true
  1094  	s.Pid.store(cmd.Process.Pid)
  1095  	log.Infof("Sandbox started, PID: %d", cmd.Process.Pid)
  1096  
  1097  	return nil
  1098  }
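
        // The resulting invocation looks roughly like the following (flag order
        // varies and the donated FD numbers shown are illustrative):
        //
        //	runsc-sandbox <global config flags> boot --bundle=<dir> \
        //	    --controller-fd=4 --start-sync-fd=5 ... <sandbox ID>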
  1099  
  1100  // Wait waits for the containerized process to exit, and returns its WaitStatus.
  1101  func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) {
  1102  	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
  1103  
  1104  	if conn, err := s.sandboxConnect(); err != nil {
   1105  		// The sandbox may have exited before we had a chance to wait on it.
  1106  		// There is nothing we can do for subcontainers. For the init container, we
  1107  		// can try to get the sandbox exit code.
  1108  		if !s.IsRootContainer(cid) {
  1109  			return unix.WaitStatus(0), err
  1110  		}
  1111  		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
  1112  	} else {
  1113  		defer conn.Close()
  1114  
  1115  		// Try the Wait RPC to the sandbox.
  1116  		var ws unix.WaitStatus
  1117  		err = conn.Call(boot.ContMgrWait, &cid, &ws)
  1118  		conn.Close()
  1119  		if err == nil {
  1120  			if s.IsRootContainer(cid) {
  1121  				if err := s.waitForStopped(); err != nil {
  1122  					return unix.WaitStatus(0), err
  1123  				}
  1124  			}
  1125  			// It worked!
  1126  			return ws, nil
  1127  		}
  1128  		// See comment above.
  1129  		if !s.IsRootContainer(cid) {
  1130  			return unix.WaitStatus(0), err
  1131  		}
  1132  
  1133  		// The sandbox may have exited after we connected, but before
  1134  		// or during the Wait RPC.
  1135  		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
  1136  	}
  1137  
  1138  	// The sandbox may have already exited, or exited while handling the Wait RPC.
  1139  	// The best we can do is ask Linux what the sandbox exit status was, since in
  1140  	// most cases that will be the same as the container exit status.
  1141  	if err := s.waitForStopped(); err != nil {
  1142  		return unix.WaitStatus(0), err
  1143  	}
  1144  	if !s.child {
  1145  		return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
  1146  	}
  1147  
  1148  	s.statusMu.Lock()
  1149  	defer s.statusMu.Unlock()
  1150  	return s.status, nil
  1151  }
  1152  
  1153  // WaitPID waits for process 'pid' in the container's sandbox and returns its
  1154  // WaitStatus.
  1155  func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) {
  1156  	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
  1157  	var ws unix.WaitStatus
  1158  	args := &boot.WaitPIDArgs{
  1159  		PID: pid,
  1160  		CID: cid,
  1161  	}
  1162  	if err := s.call(boot.ContMgrWaitPID, args, &ws); err != nil {
  1163  		return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %w", pid, s.ID, err)
  1164  	}
  1165  	return ws, nil
  1166  }
  1167  
  1168  // IsRootContainer returns true if the specified container ID belongs to the
  1169  // root container.
  1170  func (s *Sandbox) IsRootContainer(cid string) bool {
  1171  	return s.ID == cid
  1172  }
  1173  
   1174  // destroy frees all resources associated with the sandbox. It fails fast and
  1175  // is idempotent.
  1176  func (s *Sandbox) destroy() error {
  1177  	log.Debugf("Destroying sandbox %q", s.ID)
  1178  	// Only delete the control file if it exists.
  1179  	if len(s.ControlSocketPath) > 0 {
  1180  		if err := os.Remove(s.ControlSocketPath); err != nil {
  1181  			log.Warningf("failed to delete control socket file %q: %v", s.ControlSocketPath, err)
  1182  		}
  1183  	}
  1184  	pid := s.Pid.load()
  1185  	if pid != 0 {
  1186  		log.Debugf("Killing sandbox %q", s.ID)
  1187  		if err := unix.Kill(pid, unix.SIGKILL); err != nil && err != unix.ESRCH {
   1188  			return fmt.Errorf("killing sandbox %q PID %d: %w", s.ID, pid, err)
  1189  		}
  1190  		if err := s.waitForStopped(); err != nil {
   1191  			return fmt.Errorf("waiting for sandbox %q to stop: %w", s.ID, err)
  1192  		}
  1193  	}
  1194  
  1195  	return nil
  1196  }
  1197  
  1198  // SignalContainer sends the signal to a container in the sandbox. If all is
  1199  // true and signal is SIGKILL, then waits for all processes to exit before
  1200  // returning.
  1201  func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error {
  1202  	log.Debugf("Signal sandbox %q", s.ID)
  1203  	mode := boot.DeliverToProcess
  1204  	if all {
  1205  		mode = boot.DeliverToAllProcesses
  1206  	}
  1207  
  1208  	args := boot.SignalArgs{
  1209  		CID:   cid,
  1210  		Signo: int32(sig),
  1211  		Mode:  mode,
  1212  	}
  1213  	if err := s.call(boot.ContMgrSignal, &args, nil); err != nil {
  1214  		return fmt.Errorf("signaling container %q: %w", cid, err)
  1215  	}
  1216  	return nil
  1217  }
  1218  
  1219  // SignalProcess sends the signal to a particular process in the container. If
  1220  // fgProcess is true, then the signal is sent to the foreground process group
  1221  // in the same session that PID belongs to. This is only valid if the process
  1222  // is attached to a host TTY.
  1223  func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error {
  1224  	log.Debugf("Signal sandbox %q", s.ID)
  1225  
  1226  	mode := boot.DeliverToProcess
  1227  	if fgProcess {
  1228  		mode = boot.DeliverToForegroundProcessGroup
  1229  	}
  1230  
  1231  	args := boot.SignalArgs{
  1232  		CID:   cid,
  1233  		Signo: int32(sig),
  1234  		PID:   pid,
  1235  		Mode:  mode,
  1236  	}
  1237  	if err := s.call(boot.ContMgrSignal, &args, nil); err != nil {
  1238  		return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err)
  1239  	}
  1240  	return nil
  1241  }
  1242  
  1243  // Checkpoint sends the checkpoint call for a container in the sandbox.
  1244  // The statefile will be written to f.
  1245  func (s *Sandbox) Checkpoint(cid string, f *os.File, options statefile.Options) error {
  1246  	log.Debugf("Checkpoint sandbox %q, options %+v", s.ID, options)
  1247  	opt := control.SaveOpts{
  1248  		Metadata: options.WriteToMetadata(map[string]string{}),
  1249  		FilePayload: urpc.FilePayload{
  1250  			Files: []*os.File{f},
  1251  		},
  1252  	}
  1253  
  1254  	if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil {
  1255  		return fmt.Errorf("checkpointing container %q: %w", cid, err)
  1256  	}
  1257  	return nil
  1258  }
  1259  
  1260  // Pause sends the pause call for a container in the sandbox.
  1261  func (s *Sandbox) Pause(cid string) error {
  1262  	log.Debugf("Pause sandbox %q", s.ID)
  1263  	if err := s.call(boot.LifecyclePause, nil, nil); err != nil {
  1264  		return fmt.Errorf("pausing container %q: %w", cid, err)
  1265  	}
  1266  	return nil
  1267  }
  1268  
  1269  // Resume sends the resume call for a container in the sandbox.
  1270  func (s *Sandbox) Resume(cid string) error {
  1271  	log.Debugf("Resume sandbox %q", s.ID)
  1272  	if err := s.call(boot.LifecycleResume, nil, nil); err != nil {
  1273  		return fmt.Errorf("resuming container %q: %w", cid, err)
  1274  	}
  1275  	return nil
  1276  }
  1277  
  1278  // Usage sends the collect call for a container in the sandbox.
   1279  func (s *Sandbox) Usage(full bool) (control.MemoryUsage, error) {
   1280  	log.Debugf("Usage sandbox %q", s.ID)
   1281  	opts := control.MemoryUsageOpts{Full: full}
  1282  	var m control.MemoryUsage
  1283  	if err := s.call(boot.UsageCollect, &opts, &m); err != nil {
  1284  		return control.MemoryUsage{}, fmt.Errorf("collecting usage: %w", err)
  1285  	}
  1286  	return m, nil
  1287  }
  1288  
  1289  // UsageFD sends the usagefd call for a container in the sandbox.
  1290  func (s *Sandbox) UsageFD() (*control.MemoryUsageRecord, error) {
  1291  	log.Debugf("Usage sandbox %q", s.ID)
  1292  	opts := control.MemoryUsageFileOpts{Version: 1}
  1293  	var m control.MemoryUsageFile
  1294  	if err := s.call(boot.UsageUsageFD, &opts, &m); err != nil {
  1295  		return nil, fmt.Errorf("collecting usage FD: %w", err)
  1296  	}
  1297  
  1298  	if len(m.FilePayload.Files) != 2 {
   1299  		return nil, fmt.Errorf("expected exactly two FDs, got %d", len(m.FilePayload.Files))
  1300  	}
  1301  	return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1])
  1302  }
  1303  
  1304  // GetRegisteredMetrics returns metric registration data from the sandbox.
  1305  // This data is meant to be used as a way to sanity-check any exported metrics data during the
   1306  // lifetime of the sandbox, in order to prevent a compromised sandbox from producing
   1307  // bogus metrics.
   1308  // This returns an error if the sandbox did not request instrumentation when it was created.
  1309  func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) {
  1310  	if s.RegisteredMetrics == nil {
  1311  		return nil, errors.New("sandbox did not request instrumentation when it was created")
  1312  	}
  1313  	return s.RegisteredMetrics, nil
  1314  }
  1315  
  1316  // ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format.
  1317  func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) {
  1318  	log.Debugf("Metrics export sandbox %q", s.ID)
  1319  	var data control.MetricsExportData
  1320  	if err := s.call(boot.MetricsExport, &opts, &data); err != nil {
  1321  		return nil, err
  1322  	}
  1323  	// Since we do not trust the output of the sandbox as-is, double-check that the options were
  1324  	// respected.
  1325  	if err := opts.Verify(&data); err != nil {
  1326  		return nil, err
  1327  	}
  1328  	return data.Snapshot, nil
  1329  }
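
        // For illustration only (not part of the original file), a sketch of how a
        // monitor might pair the two calls above: fetch the registration data once
        // with GetRegisteredMetrics, then cross-check every exported snapshot
        // against it before trusting the numbers.
        //
        //	reg, err := s.GetRegisteredMetrics() // fails if instrumentation was not requested
        //	if err != nil {
        //		return err
        //	}
        //	snap, err := s.ExportMetrics(control.MetricsExportOpts{})
        //	if err != nil {
        //		return err
        //	}
        //	// Compare snap's metric names and types against reg before accepting it.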
  1330  
  1331  // IsRunning returns true if the sandbox process is running.
  1332  func (s *Sandbox) IsRunning() bool {
  1333  	pid := s.Pid.load()
  1334  	if pid != 0 {
  1335  		// Send signal 0 to the sandbox process.
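        		// Signal 0 delivers no signal, but still performs the existence and
        		// permission checks (see kill(2)).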
  1336  		if err := unix.Kill(pid, 0); err == nil {
  1337  			// Succeeded, process is running.
  1338  			return true
  1339  		}
  1340  	}
  1341  	return false
  1342  }
  1343  
  1344  // Stacks collects and returns all stacks for the sandbox.
  1345  func (s *Sandbox) Stacks() (string, error) {
  1346  	log.Debugf("Stacks sandbox %q", s.ID)
  1347  	var stacks string
  1348  	if err := s.call(boot.DebugStacks, nil, &stacks); err != nil {
  1349  		return "", fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err)
  1350  	}
  1351  	return stacks, nil
  1352  }
  1353  
  1354  // HeapProfile writes a heap profile to the given file.
  1355  func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error {
  1356  	log.Debugf("Heap profile %q", s.ID)
  1357  	opts := control.HeapProfileOpts{
  1358  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1359  		Delay:       delay,
  1360  	}
  1361  	return s.call(boot.ProfileHeap, &opts, nil)
  1362  }
  1363  
  1364  // CPUProfile collects a CPU profile.
  1365  func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error {
  1366  	log.Debugf("CPU profile %q", s.ID)
  1367  	opts := control.CPUProfileOpts{
  1368  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1369  		Duration:    duration,
  1370  	}
  1371  	return s.call(boot.ProfileCPU, &opts, nil)
  1372  }
  1373  
  1374  // BlockProfile writes a block profile to the given file.
  1375  func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error {
  1376  	log.Debugf("Block profile %q", s.ID)
  1377  	opts := control.BlockProfileOpts{
  1378  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1379  		Duration:    duration,
  1380  	}
  1381  	return s.call(boot.ProfileBlock, &opts, nil)
  1382  }
  1383  
  1384  // MutexProfile writes a mutex profile to the given file.
  1385  func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error {
  1386  	log.Debugf("Mutex profile %q", s.ID)
  1387  	opts := control.MutexProfileOpts{
  1388  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1389  		Duration:    duration,
  1390  	}
  1391  	return s.call(boot.ProfileMutex, &opts, nil)
  1392  }
  1393  
  1394  // Trace collects an execution trace.
  1395  func (s *Sandbox) Trace(f *os.File, duration time.Duration) error {
  1396  	log.Debugf("Trace %q", s.ID)
  1397  	opts := control.TraceProfileOpts{
  1398  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1399  		Duration:    duration,
  1400  	}
  1401  	return s.call(boot.ProfileTrace, &opts, nil)
  1402  }
  1403  
  1404  // ChangeLogging changes logging options.
  1405  func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
  1406  	log.Debugf("Change logging start %q", s.ID)
  1407  	if err := s.call(boot.LoggingChange, &args, nil); err != nil {
  1408  		return fmt.Errorf("changing sandbox %q logging: %w", s.ID, err)
  1409  	}
  1410  	return nil
  1411  }
  1412  
  1413  // DestroyContainer destroys the given container. If it is the root container,
  1414  // then the entire sandbox is destroyed.
  1415  func (s *Sandbox) DestroyContainer(cid string) error {
  1416  	if err := s.destroyContainer(cid); err != nil {
  1417  		// If the sandbox isn't running, the container has already been destroyed,
  1418  		// ignore the error in this case.
  1419  		if s.IsRunning() {
  1420  			return err
  1421  		}
  1422  	}
  1423  	return nil
  1424  }
  1425  
  1426  func (s *Sandbox) destroyContainer(cid string) error {
  1427  	if s.IsRootContainer(cid) {
  1428  		log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid)
  1429  		return s.destroy()
  1430  	}
  1431  
  1432  	log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID)
  1433  	if err := s.call(boot.ContMgrDestroySubcontainer, &cid, nil); err != nil {
  1434  		return fmt.Errorf("destroying container %q: %w", cid, err)
  1435  	}
  1436  	return nil
  1437  }
  1438  
  1439  func (s *Sandbox) waitForStopped() error {
  1440  	if s.child {
  1441  		s.statusMu.Lock()
  1442  		defer s.statusMu.Unlock()
  1443  		pid := s.Pid.load()
  1444  		if pid == 0 {
  1445  			return nil
  1446  		}
  1447  		// The sandbox process is a child of the current process,
  1448  		// so we can wait on it to terminate and collect its zombie.
  1449  		if _, err := unix.Wait4(int(pid), &s.status, 0, nil); err != nil {
  1450  			return fmt.Errorf("waiting for the sandbox process: %w", err)
  1451  		}
  1452  		s.Pid.store(0)
  1453  		return nil
  1454  	}
  1455  
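        	// The sandbox process is not our child, so it cannot be reaped with
        	// wait4. Instead, poll it (via IsRunning's signal-0 probe) for up to
        	// five seconds.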
  1456  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  1457  	defer cancel()
  1458  	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
  1459  	op := func() error {
  1460  		if s.IsRunning() {
  1461  			return fmt.Errorf("sandbox is still running")
  1462  		}
  1463  		return nil
  1464  	}
  1465  	return backoff.Retry(op, b)
  1466  }
  1467  
  1468  // configureStdios changes the ownership of the stdio files to give the sandbox
  1469  // process access to them. This may be skipped depending on the configuration.
  1470  func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error {
  1471  	if conf.Rootless || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
  1472  		// Cannot change ownership without CAP_CHOWN.
  1473  		return nil
  1474  	}
  1475  
  1476  	if s.UID < 0 || s.GID < 0 {
  1477  		panic(fmt.Sprintf("sandbox UID/GID is not set: %d/%d", s.UID, s.GID))
  1478  	}
  1479  	for _, file := range stdios {
  1480  		log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID)
  1481  		if err := file.Chown(s.UID, s.GID); err != nil {
  1482  			if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
  1483  				log.Warningf("cannot change owner of %s: %s", file.Name(), err)
  1484  				continue
  1485  			}
  1486  			return err
  1487  		}
  1488  	}
  1489  	return nil
  1490  }
  1491  
  1492  // deviceFileForPlatform opens the device file for the given platform. If the
  1493  // platform does not need a device file, then nil is returned.
  1494  // devicePath may be empty to use a sane platform-specific default.
  1495  func deviceFileForPlatform(name, devicePath string) (*os.File, error) {
  1496  	p, err := platform.Lookup(name)
  1497  	if err != nil {
  1498  		return nil, err
  1499  	}
  1500  
  1501  	f, err := p.OpenDevice(devicePath)
  1502  	if err != nil {
  1503  		return nil, fmt.Errorf("opening device file for platform %q: %w", name, err)
  1504  	}
  1505  	return f, nil
  1506  }
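
        // For example, the KVM platform opens /dev/kvm when devicePath is empty,
        // while a platform such as ptrace needs no device file, in which case the
        // returned *os.File is nil.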
  1507  
  1508  // checkBinaryPermissions verifies that the required binary bits are set on
  1509  // the runsc executable.
  1510  func checkBinaryPermissions(conf *config.Config) error {
  1511  	// All platforms need the other exe bit
  1512  	neededBits := os.FileMode(0001)
  1513  	if conf.Platform == "ptrace" {
  1514  		// Ptrace needs the other read bit
  1515  		neededBits |= os.FileMode(0004)
  1516  	}
  1517  
  1518  	exePath, err := os.Executable()
  1519  	if err != nil {
  1520  		return fmt.Errorf("getting exe path: %v", err)
  1521  	}
  1522  
  1523  	// Check the permissions of the runsc binary and print an error if it
  1524  	// doesn't match expectations.
  1525  	info, err := os.Stat(exePath)
  1526  	if err != nil {
  1527  		return fmt.Errorf("stat file: %v", err)
  1528  	}
  1529  
  1530  	if info.Mode().Perm()&neededBits != neededBits {
  1531  		return errors.New(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
  1532  	}
  1533  	return nil
  1534  }
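
        // For illustration: with the ptrace platform, neededBits is 0005 (other
        // read+execute), so a runsc binary installed with mode 0700 fails this
        // check, while `chmod a+rx runsc` (e.g. mode 0755) satisfies both bits.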
  1535  
  1536  // CgroupsReadControlFile reads a single cgroupfs control file in the sandbox.
  1537  func (s *Sandbox) CgroupsReadControlFile(file control.CgroupControlFile) (string, error) {
  1538  	log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID)
  1539  	args := control.CgroupsReadArgs{
  1540  		Args: []control.CgroupsReadArg{
  1541  			{
  1542  				File: file,
  1543  			},
  1544  		},
  1545  	}
  1546  	var out control.CgroupsResults
  1547  	if err := s.call(boot.CgroupsReadControlFiles, &args, &out); err != nil {
  1548  		return "", err
  1549  	}
  1550  	if len(out.Results) != 1 {
  1551  		return "", fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
  1552  	}
  1553  	return out.Results[0].Unpack()
  1554  }
  1555  
  1556  // CgroupsWriteControlFile writes a single cgroupfs control file in the sandbox.
  1557  func (s *Sandbox) CgroupsWriteControlFile(file control.CgroupControlFile, value string) error {
  1558  	log.Debugf("CgroupsWriteControlFiles sandbox %q", s.ID)
  1559  	args := control.CgroupsWriteArgs{
  1560  		Args: []control.CgroupsWriteArg{
  1561  			{
  1562  				File:  file,
  1563  				Value: value,
  1564  			},
  1565  		},
  1566  	}
  1567  	var out control.CgroupsResults
  1568  	if err := s.call(boot.CgroupsWriteControlFiles, &args, &out); err != nil {
  1569  		return err
  1570  	}
  1571  	if len(out.Results) != 1 {
  1572  		return fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
  1573  	}
  1574  	return out.Results[0].AsError()
  1575  }
  1576  
  1577  // fixPidns looks at the PID namespace path. If that path corresponds to the
  1578  // sandbox process's PID namespace, it changes the spec so that the container
  1579  // joins the sandbox root namespace.
  1580  func (s *Sandbox) fixPidns(spec *specs.Spec) {
  1581  	pidns, ok := specutils.GetNS(specs.PIDNamespace, spec)
  1582  	if !ok {
  1583  		// pidns was not set, nothing to fix.
  1584  		return
  1585  	}
  1586  	if pidns.Path != fmt.Sprintf("/proc/%d/ns/pid", s.Pid.load()) {
  1587  		// Fix only if the PID namespace corresponds to the sandbox's.
  1588  		return
  1589  	}
  1590  
  1591  	for i := range spec.Linux.Namespaces {
  1592  		if spec.Linux.Namespaces[i].Type == specs.PIDNamespace {
  1593  			// Removing the namespace makes the container join the sandbox root
  1594  			// namespace.
  1595  			log.Infof("Fixing PID namespace in spec from %q to make the container join the sandbox root namespace", pidns.Path)
  1596  			spec.Linux.Namespaces = append(spec.Linux.Namespaces[:i], spec.Linux.Namespaces[i+1:]...)
  1597  			return
  1598  		}
  1599  	}
  1600  	panic("unreachable")
  1601  }
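
        // For example, if the sandbox process PID is 1234 and the spec carries the
        // namespace entry {Type: "pid", Path: "/proc/1234/ns/pid"}, that entry is
        // removed so that the container joins the sandbox's root PID namespace
        // rather than a fresh one.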
  1602  
  1603  // ConfigureCmdForRootless configures cmd to donate a socket FD that can be
  1604  // used to synchronize userns configuration.
  1605  func ConfigureCmdForRootless(cmd *exec.Cmd, donations *donation.Agency) (*os.File, error) {
  1606  	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
  1607  	if err != nil {
  1608  		return nil, err
  1609  	}
  1610  	f := os.NewFile(uintptr(fds[1]), "userns sync other FD")
  1611  	donations.DonateAndClose("sync-userns-fd", f)
  1612  	if cmd.SysProcAttr == nil {
  1613  		cmd.SysProcAttr = &unix.SysProcAttr{}
  1614  	}
  1615  	cmd.SysProcAttr.AmbientCaps = []uintptr{
  1616  		// Same as `cap` in cmd/gofer.go.
  1617  		unix.CAP_CHOWN,
  1618  		unix.CAP_DAC_OVERRIDE,
  1619  		unix.CAP_DAC_READ_SEARCH,
  1620  		unix.CAP_FOWNER,
  1621  		unix.CAP_FSETID,
  1622  		unix.CAP_SYS_CHROOT,
  1623  		// Needed for setuid(2)/setgid(2).
  1624  		unix.CAP_SETUID,
  1625  		unix.CAP_SETGID,
  1626  		// Needed for chroot.
  1627  		unix.CAP_SYS_ADMIN,
  1628  		// Needed to be able to clear bounding set (PR_CAPBSET_DROP).
  1629  		unix.CAP_SETPCAP,
  1630  	}
  1631  	return os.NewFile(uintptr(fds[0]), "userns sync FD"), nil
  1632  }
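
        // The returned file is the parent's end of the socket pair; the sandbox
        // process holds the donated "sync-userns-fd" end. A caller would typically
        // start cmd, install the UID/GID mappings for cmd.Process.Pid (for example
        // with SetUserMappings below), and then use this socket to let the child
        // proceed.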
  1633  
  1634  // SetUserMappings uses newuidmap/newgidmap programs to set up user ID mappings
  1635  // for process pid.
  1636  func SetUserMappings(spec *specs.Spec, pid int) error {
  1637  	log.Debugf("Setting user mappings")
  1638  	args := []string{strconv.Itoa(pid)}
  1639  	for _, idMap := range spec.Linux.UIDMappings {
  1640  		log.Infof("Mapping host uid %d to container uid %d (size=%d)",
  1641  			idMap.HostID, idMap.ContainerID, idMap.Size)
  1642  		args = append(args,
  1643  			strconv.Itoa(int(idMap.ContainerID)),
  1644  			strconv.Itoa(int(idMap.HostID)),
  1645  			strconv.Itoa(int(idMap.Size)),
  1646  		)
  1647  	}
  1648  
  1649  	out, err := exec.Command("newuidmap", args...).CombinedOutput()
  1650  	log.Debugf("newuidmap: %#v\n%s", args, out)
  1651  	if err != nil {
  1652  		return fmt.Errorf("newuidmap failed: %w", err)
  1653  	}
  1654  
  1655  	args = []string{strconv.Itoa(pid)}
  1656  	for _, idMap := range spec.Linux.GIDMappings {
  1657  		log.Infof("Mapping host gid %d to container gid %d (size=%d)",
  1658  			idMap.HostID, idMap.ContainerID, idMap.Size)
  1659  		args = append(args,
  1660  			strconv.Itoa(int(idMap.ContainerID)),
  1661  			strconv.Itoa(int(idMap.HostID)),
  1662  			strconv.Itoa(int(idMap.Size)),
  1663  		)
  1664  	}
  1665  	out, err = exec.Command("newgidmap", args...).CombinedOutput()
  1666  	log.Debugf("newgidmap: %#v\n%s", args, out)
  1667  	if err != nil {
  1668  		return fmt.Errorf("newgidmap failed: %w", err)
  1669  	}
  1670  	return nil
  1671  }
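
        // For example, a spec mapping container UID 0 to host UID 100000 with size
        // 65536, applied to sandbox PID 1234, produces the invocation:
        //
        //	newuidmap 1234 0 100000 65536
        //
        // matching newuidmap(1)'s "pid uid loweruid count" argument triples.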