gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/sandbox/sandbox.go

gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/sandbox/sandbox.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package sandbox creates and manipulates sandboxes.
    16  package sandbox
    17  
    18  import (
    19  	"context"
    20  	"encoding/json"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"math"
    25  	"os"
    26  	"os/exec"
    27  	"path"
    28  	"path/filepath"
    29  	"strconv"
    30  	"strings"
    31  	"syscall"
    32  	"time"
    33  
    34  	"github.com/cenkalti/backoff"
    35  	specs "github.com/opencontainers/runtime-spec/specs-go"
    36  	"github.com/syndtr/gocapability/capability"
    37  	"golang.org/x/sys/unix"
    38  	"gvisor.dev/gvisor/pkg/abi/linux"
    39  	"gvisor.dev/gvisor/pkg/atomicbitops"
    40  	"gvisor.dev/gvisor/pkg/cleanup"
    41  	"gvisor.dev/gvisor/pkg/control/client"
    42  	"gvisor.dev/gvisor/pkg/control/server"
    43  	"gvisor.dev/gvisor/pkg/coverage"
    44  	"gvisor.dev/gvisor/pkg/fd"
    45  	"gvisor.dev/gvisor/pkg/log"
    46  	metricpb "gvisor.dev/gvisor/pkg/metric/metric_go_proto"
    47  	"gvisor.dev/gvisor/pkg/prometheus"
    48  	"gvisor.dev/gvisor/pkg/sentry/control"
    49  	"gvisor.dev/gvisor/pkg/sentry/devices/nvproxy"
    50  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs"
    51  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    52  	"gvisor.dev/gvisor/pkg/sentry/platform"
    53  	"gvisor.dev/gvisor/pkg/sentry/seccheck"
    54  	"gvisor.dev/gvisor/pkg/state/statefile"
    55  	"gvisor.dev/gvisor/pkg/sync"
    56  	"gvisor.dev/gvisor/pkg/urpc"
    57  	"gvisor.dev/gvisor/runsc/boot"
    58  	"gvisor.dev/gvisor/runsc/boot/procfs"
    59  	"gvisor.dev/gvisor/runsc/cgroup"
    60  	"gvisor.dev/gvisor/runsc/config"
    61  	"gvisor.dev/gvisor/runsc/console"
    62  	"gvisor.dev/gvisor/runsc/donation"
    63  	"gvisor.dev/gvisor/runsc/specutils"
    64  )
    65  
    66  const (
    67  	// namespaceAnnotation is a pod annotation populated by containerd.
    68  	// It contains the name of the pod that a sandbox is in when running in Kubernetes.
    69  	podNameAnnotation = "io.kubernetes.cri.sandbox-name"
    70  
    71  	// namespaceAnnotation is a pod annotation populated by containerd.
    72  	// It contains the namespace of the pod that a sandbox is in when running in Kubernetes.
    73  	namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace"
    74  )
    75  
    76  // createControlSocket finds a location and creates the socket used to
    77  // communicate with the sandbox. The socket is a UDS on the host filesystem.
    78  //
    79  // Note that abstract sockets are *not* used, because any user can connect to
    80  // them. There is no file mode protecting abstract sockets.
    81  func createControlSocket(rootDir, id string) (string, int, error) {
    82  	name := fmt.Sprintf("runsc-%s.sock", id)
    83  
    84  	// Only use absolute paths to guarantee resolution from anywhere.
    85  	for _, dir := range []string{rootDir, "/var/run", "/run", "/tmp"} {
    86  		path := filepath.Join(dir, name)
    87  		log.Debugf("Attempting to create socket file %q", path)
    88  		fd, err := server.CreateSocket(path)
    89  		if err == nil {
    90  			log.Debugf("Using socket file %q", path)
    91  			return path, fd, nil
    92  		}
    93  		log.Debugf("Failed to create socket file %q: %v", path, err)
    94  	}
    95  	return "", -1, fmt.Errorf("unable to find location to write socket file")
    96  }
    97  
    98  // pid is an atomic type that implements JSON marshal/unmarshal interfaces.
    99  type pid struct {
   100  	val atomicbitops.Int64
   101  }
   102  
   103  func (p *pid) store(pid int) {
   104  	p.val.Store(int64(pid))
   105  }
   106  
   107  func (p *pid) load() int {
   108  	return int(p.val.Load())
   109  }
   110  
   111  // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON.
   112  func (p *pid) UnmarshalJSON(b []byte) error {
   113  	var pid int
   114  
   115  	if err := json.Unmarshal(b, &pid); err != nil {
   116  		return err
   117  	}
   118  	p.store(pid)
   119  	return nil
   120  }
   121  
   122  // MarshalJSON implements json.Marshaler.MarshalJSON
   123  func (p *pid) MarshalJSON() ([]byte, error) {
   124  	return json.Marshal(p.load())
   125  }
   126  
   127  // Sandbox wraps a sandbox process.
   128  //
   129  // It is used to start/stop sandbox process (and associated processes like
   130  // gofers), as well as for running and manipulating containers inside a running
   131  // sandbox.
   132  //
   133  // Note: Sandbox must be immutable because a copy of it is saved for each
   134  // container and changes would not be synchronized to all of them.
   135  type Sandbox struct {
   136  	// ID is the id of the sandbox (immutable). By convention, this is the same
   137  	// ID as the first container run in the sandbox.
   138  	ID string `json:"id"`
   139  
   140  	// PodName is the name of the Kubernetes Pod (if any) that this sandbox
   141  	// represents. Unset if not running under containerd or Kubernetes.
   142  	PodName string `json:"podName"`
   143  
   144  	// Namespace is the Kubernetes namespace (if any) of the pod that this
   145  	// sandbox represents. Unset if not running under containerd or Kubernetes.
   146  	Namespace string `json:"namespace"`
   147  
   148  	// Pid is the pid of the running sandbox. May be 0 if the sandbox
   149  	// is not running.
   150  	Pid pid `json:"pid"`
   151  
   152  	// UID is the user ID in the parent namespace that the sandbox is running as.
   153  	UID int `json:"uid"`
   154  	// GID is the group ID in the parent namespace that the sandbox is running as.
   155  	GID int `json:"gid"`
   156  
   157  	// CgroupJSON contains the cgroup configuration that the sandbox is part of
   158  	// and allow serialization of the configuration into json
   159  	CgroupJSON cgroup.CgroupJSON `json:"cgroup"`
   160  
   161  	// OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox
   162  	// started, before it may be modified.
   163  	OriginalOOMScoreAdj int `json:"originalOomScoreAdj"`
   164  
   165  	// RegisteredMetrics is the set of metrics registered in the sandbox.
   166  	// Used for verifying metric data integrity after containers are started.
   167  	// Only populated if exporting metrics was requested when the sandbox was
   168  	// created.
   169  	RegisteredMetrics *metricpb.MetricRegistration `json:"registeredMetrics"`
   170  
   171  	// MetricMetadata are key-value pairs that are useful to export about this
   172  	// sandbox, but not part of the set of labels that uniquely identify it.
   173  	// They are static once initialized, and typically contain high-level
   174  	// configuration information about the sandbox.
   175  	MetricMetadata map[string]string `json:"metricMetadata"`
   176  
   177  	// MetricServerAddress is the address of the metric server that this sandbox
   178  	// intends to export metrics for.
   179  	// Only populated if exporting metrics was requested when the sandbox was
   180  	// created.
   181  	MetricServerAddress string `json:"metricServerAddress"`
   182  
   183  	// ControlSocketPath is the path to the sandbox's uRPC server socket.
   184  	// Connections to the sandbox are made through this.
   185  	ControlSocketPath string `json:"controlSocketPath"`
   186  
   187  	// MountHints provides extra information about container mounts that apply
   188  	// to the entire pod.
   189  	MountHints *boot.PodMountHints `json:"mountHints"`
   190  
   191  	// child is set if a sandbox process is a child of the current process.
   192  	//
   193  	// This field isn't saved to json, because only a creator of sandbox
   194  	// will have it as a child process.
   195  	child bool `nojson:"true"`
   196  
   197  	// statusMu protects status.
   198  	statusMu sync.Mutex `nojson:"true"`
   199  
   200  	// status is the exit status of a sandbox process. It's only set if the
   201  	// child==true and the sandbox was waited on. This field allows for multiple
   202  	// threads to wait on sandbox and get the exit code, since Linux will return
   203  	// WaitStatus to one of the waiters only.
   204  	status unix.WaitStatus `nojson:"true"`
   205  }
   206  
   207  // Getpid returns the process ID of the sandbox process.
   208  func (s *Sandbox) Getpid() int {
   209  	return s.Pid.load()
   210  }
   211  
   212  // Args is used to configure a new sandbox.
   213  type Args struct {
   214  	// ID is the sandbox unique identifier.
   215  	ID string
   216  
   217  	// Spec is the OCI spec that describes the container.
   218  	Spec *specs.Spec
   219  
   220  	// BundleDir is the directory containing the container bundle.
   221  	BundleDir string
   222  
   223  	// ConsoleSocket is the path to a unix domain socket that will receive
   224  	// the console FD. It may be empty.
   225  	ConsoleSocket string
   226  
   227  	// UserLog is the filename to send user-visible logs to. It may be empty.
   228  	UserLog string
   229  
   230  	// IOFiles is the list of image files and/or socket files that connect to
   231  	// a gofer endpoint for the mount points using Gofers. They must be in the
   232  	// same order as mounts appear in the spec.
   233  	IOFiles []*os.File
   234  
   235  	// File that connects to a gofer endpoint for a device mount point at /dev.
   236  	DevIOFile *os.File
   237  
   238  	// GoferFilestoreFiles are the regular files that will back the overlayfs or
   239  	// tmpfs mount if a gofer mount is to be overlaid.
   240  	GoferFilestoreFiles []*os.File
   241  
   242  	// GoferMountConfs contains information about how the gofer mounts have been
   243  	// configured. The first entry is for rootfs and the following entries are
   244  	// for bind mounts in Spec.Mounts (in the same order).
   245  	GoferMountConfs boot.GoferMountConfFlags
   246  
   247  	// MountHints provides extra information about containers mounts that apply
   248  	// to the entire pod.
   249  	MountHints *boot.PodMountHints
   250  
   251  	// MountsFile is a file container mount information from the spec. It's
   252  	// equivalent to the mounts from the spec, except that all paths have been
   253  	// resolved to their final absolute location.
   254  	MountsFile *os.File
   255  
   256  	// Gcgroup is the cgroup that the sandbox is part of.
   257  	Cgroup cgroup.Cgroup
   258  
   259  	// Attached indicates that the sandbox lifecycle is attached with the caller.
   260  	// If the caller exits, the sandbox should exit too.
   261  	Attached bool
   262  
   263  	// SinkFiles is the an ordered array of files to be used by seccheck sinks
   264  	// configured from the --pod-init-config file.
   265  	SinkFiles []*os.File
   266  
   267  	// PassFiles are user-supplied files from the host to be exposed to the
   268  	// sandboxed app.
   269  	PassFiles map[int]*os.File
   270  
   271  	// ExecFile is the file from the host used for program execution.
   272  	ExecFile *os.File
   273  }
   274  
   275  // New creates the sandbox process. The caller must call Destroy() on the
   276  // sandbox.
   277  func New(conf *config.Config, args *Args) (*Sandbox, error) {
   278  	s := &Sandbox{
   279  		ID: args.ID,
   280  		CgroupJSON: cgroup.CgroupJSON{
   281  			Cgroup: args.Cgroup,
   282  		},
   283  		UID:                 -1, // prevent usage before it's set.
   284  		GID:                 -1, // prevent usage before it's set.
   285  		MetricMetadata:      conf.MetricMetadata(),
   286  		MetricServerAddress: conf.MetricServer,
   287  		MountHints:          args.MountHints,
   288  	}
   289  	if args.Spec != nil && args.Spec.Annotations != nil {
   290  		s.PodName = args.Spec.Annotations[podNameAnnotation]
   291  		s.Namespace = args.Spec.Annotations[namespaceAnnotation]
   292  	}
   293  
   294  	// The Cleanup object cleans up partially created sandboxes when an error
   295  	// occurs. Any errors occurring during cleanup itself are ignored.
   296  	c := cleanup.Make(func() {
   297  		if err := s.destroy(); err != nil {
   298  			log.Warningf("error destroying sandbox: %v", err)
   299  		}
   300  	})
   301  	defer c.Clean()
   302  
   303  	if len(conf.PodInitConfig) > 0 {
   304  		initConf, err := boot.LoadInitConfig(conf.PodInitConfig)
   305  		if err != nil {
   306  			return nil, fmt.Errorf("loading init config file: %w", err)
   307  		}
   308  		args.SinkFiles, err = initConf.Setup()
   309  		if err != nil {
   310  			return nil, fmt.Errorf("cannot init config: %w", err)
   311  		}
   312  	}
   313  
   314  	// Create pipe to synchronize when sandbox process has been booted.
   315  	clientSyncFile, sandboxSyncFile, err := os.Pipe()
   316  	if err != nil {
   317  		return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err)
   318  	}
   319  	defer clientSyncFile.Close()
   320  
   321  	// Create the sandbox process.
   322  	err = s.createSandboxProcess(conf, args, sandboxSyncFile)
   323  	// sandboxSyncFile has to be closed to be able to detect when the sandbox
   324  	// process exits unexpectedly.
   325  	sandboxSyncFile.Close()
   326  	if err != nil {
   327  		return nil, fmt.Errorf("cannot create sandbox process: %w", err)
   328  	}
   329  
   330  	// Wait until the sandbox has booted.
   331  	b := make([]byte, 1)
   332  	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
   333  		err := fmt.Errorf("waiting for sandbox to start: %v", err)
   334  		// If the sandbox failed to start, it may be because the binary
   335  		// permissions were incorrect. Check the bits and return a more helpful
   336  		// error message.
   337  		//
   338  		// NOTE: The error message is checked because error types are lost over
   339  		// rpc calls.
   340  		if strings.Contains(err.Error(), io.EOF.Error()) {
   341  			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
   342  				return nil, fmt.Errorf("%v: %v", err, permsErr)
   343  			}
   344  		}
   345  		return nil, fmt.Errorf("cannot read client sync file: %w", err)
   346  	}
   347  
   348  	if conf.MetricServer != "" {
   349  		// The control server is up and the sandbox was configured to export metrics.
   350  		// We must gather data about registered metrics prior to any process starting in the sandbox.
   351  		log.Debugf("Getting metric registration information from sandbox %q", s.ID)
   352  		var registeredMetrics control.MetricsRegistrationResponse
   353  		if err := s.call(boot.MetricsGetRegistered, nil, &registeredMetrics); err != nil {
   354  			return nil, fmt.Errorf("cannot get registered metrics: %v", err)
   355  		}
   356  		s.RegisteredMetrics = registeredMetrics.RegisteredMetrics
   357  	}
   358  
   359  	c.Release()
   360  	return s, nil
   361  }
   362  
   363  // CreateSubcontainer creates a container inside the sandbox.
   364  func (s *Sandbox) CreateSubcontainer(conf *config.Config, cid string, tty *os.File) error {
   365  	log.Debugf("Create sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
   366  
   367  	var files []*os.File
   368  	if tty != nil {
   369  		files = []*os.File{tty}
   370  	}
   371  	if err := s.configureStdios(conf, files); err != nil {
   372  		return err
   373  	}
   374  
   375  	args := boot.CreateArgs{
   376  		CID:         cid,
   377  		FilePayload: urpc.FilePayload{Files: files},
   378  	}
   379  	if err := s.call(boot.ContMgrCreateSubcontainer, &args, nil); err != nil {
   380  		return fmt.Errorf("creating sub-container %q: %w", cid, err)
   381  	}
   382  	return nil
   383  }
   384  
   385  // StartRoot starts running the root container process inside the sandbox.
   386  func (s *Sandbox) StartRoot(conf *config.Config) error {
   387  	pid := s.Pid.load()
   388  	log.Debugf("Start root sandbox %q, PID: %d", s.ID, pid)
   389  	conn, err := s.sandboxConnect()
   390  	if err != nil {
   391  		return err
   392  	}
   393  	defer conn.Close()
   394  
   395  	// Configure the network.
   396  	if err := setupNetwork(conn, pid, conf); err != nil {
   397  		return fmt.Errorf("setting up network: %w", err)
   398  	}
   399  
   400  	// Send a message to the sandbox control server to start the root container.
   401  	if err := conn.Call(boot.ContMgrRootContainerStart, &s.ID, nil); err != nil {
   402  		return fmt.Errorf("starting root container: %w", err)
   403  	}
   404  
   405  	return nil
   406  }
   407  
   408  // StartSubcontainer starts running a sub-container inside the sandbox.
   409  func (s *Sandbox) StartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestores []*os.File, devIOFile *os.File, goferConfs []boot.GoferMountConf) error {
   410  	log.Debugf("Start sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
   411  
   412  	if err := s.configureStdios(conf, stdios); err != nil {
   413  		return err
   414  	}
   415  	s.fixPidns(spec)
   416  
   417  	// The payload contains (in this specific order):
   418  	// * stdin/stdout/stderr (optional: only present when not using TTY)
   419  	// * The subcontainer's gofer filestore files (optional)
   420  	// * The subcontainer's dev gofer file (optional)
   421  	// * Gofer files.
   422  	payload := urpc.FilePayload{}
   423  	payload.Files = append(payload.Files, stdios...)
   424  	payload.Files = append(payload.Files, goferFilestores...)
   425  	if devIOFile != nil {
   426  		payload.Files = append(payload.Files, devIOFile)
   427  	}
   428  	payload.Files = append(payload.Files, goferFiles...)
   429  
   430  	// Start running the container.
   431  	args := boot.StartArgs{
   432  		Spec:                 spec,
   433  		Conf:                 conf,
   434  		CID:                  cid,
   435  		NumGoferFilestoreFDs: len(goferFilestores),
   436  		IsDevIoFilePresent:   devIOFile != nil,
   437  		GoferMountConfs:      goferConfs,
   438  		FilePayload:          payload,
   439  	}
   440  	if err := s.call(boot.ContMgrStartSubcontainer, &args, nil); err != nil {
   441  		return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err)
   442  	}
   443  	return nil
   444  }
   445  
   446  // Restore sends the restore call for a container in the sandbox.
   447  func (s *Sandbox) Restore(conf *config.Config, cid string, imagePath string, direct bool) error {
   448  	log.Debugf("Restore sandbox %q from path %q", s.ID, imagePath)
   449  
   450  	stateFileName := path.Join(imagePath, boot.CheckpointStateFileName)
   451  	sf, err := os.Open(stateFileName)
   452  	if err != nil {
   453  		return fmt.Errorf("opening state file %q failed: %v", stateFileName, err)
   454  	}
   455  	defer sf.Close()
   456  
   457  	opt := boot.RestoreOpts{
   458  		FilePayload: urpc.FilePayload{
   459  			Files: []*os.File{sf},
   460  		},
   461  	}
   462  
   463  	// If the pages file exists, we must pass it in.
   464  	pagesFileName := path.Join(imagePath, boot.CheckpointPagesFileName)
   465  	pagesReadFlags := os.O_RDONLY
   466  	if direct {
   467  		// The contents are page-aligned, so it can be opened with O_DIRECT.
   468  		pagesReadFlags |= syscall.O_DIRECT
   469  	}
   470  	if pf, err := os.OpenFile(pagesFileName, pagesReadFlags, 0); err == nil {
   471  		defer pf.Close()
   472  		pagesMetadataFileName := path.Join(imagePath, boot.CheckpointPagesMetadataFileName)
   473  		pmf, err := os.Open(pagesMetadataFileName)
   474  		if err != nil {
   475  			return fmt.Errorf("opening restore image file %q failed: %v", pagesMetadataFileName, err)
   476  		}
   477  		defer pmf.Close()
   478  		opt.HavePagesFile = true
   479  		opt.FilePayload.Files = append(opt.FilePayload.Files, pmf, pf)
   480  	} else if !os.IsNotExist(err) {
   481  		return fmt.Errorf("opening restore image file %q failed: %v", pagesFileName, err)
   482  	}
   483  
   484  	// If the platform needs a device FD we must pass it in.
   485  	if deviceFile, err := deviceFileForPlatform(conf.Platform, conf.PlatformDevicePath); err != nil {
   486  		return err
   487  	} else if deviceFile != nil {
   488  		defer deviceFile.Close()
   489  		opt.HaveDeviceFile = true
   490  		opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile.ReleaseToFile("device file"))
   491  	}
   492  
   493  	conn, err := s.sandboxConnect()
   494  	if err != nil {
   495  		return err
   496  	}
   497  	defer conn.Close()
   498  
   499  	// Configure the network.
   500  	if err := setupNetwork(conn, s.Pid.load(), conf); err != nil {
   501  		return fmt.Errorf("setting up network: %v", err)
   502  	}
   503  
   504  	// Restore the container and start the root container.
   505  	if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil {
   506  		return fmt.Errorf("restoring container %q: %v", cid, err)
   507  	}
   508  
   509  	return nil
   510  }
   511  
   512  // RestoreSubcontainer sends the restore call for a sub-container in the sandbox.
   513  func (s *Sandbox) RestoreSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestoreFiles []*os.File, devIOFile *os.File, goferMountConf []boot.GoferMountConf) error {
   514  	log.Debugf("Restore sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
   515  
   516  	if err := s.configureStdios(conf, stdios); err != nil {
   517  		return err
   518  	}
   519  	s.fixPidns(spec)
   520  
   521  	// The payload contains (in this specific order):
   522  	// * stdin/stdout/stderr (optional: only present when not using TTY)
   523  	// * The subcontainer's overlay filestore files (optional: only present when
   524  	//   host file backed overlay is configured)
   525  	// * Gofer files.
   526  	payload := urpc.FilePayload{}
   527  	payload.Files = append(payload.Files, stdios...)
   528  	payload.Files = append(payload.Files, goferFilestoreFiles...)
   529  	if devIOFile != nil {
   530  		payload.Files = append(payload.Files, devIOFile)
   531  	}
   532  	payload.Files = append(payload.Files, goferFiles...)
   533  
   534  	// Start running the container.
   535  	args := boot.StartArgs{
   536  		Spec:                 spec,
   537  		Conf:                 conf,
   538  		CID:                  cid,
   539  		NumGoferFilestoreFDs: len(goferFilestoreFiles),
   540  		IsDevIoFilePresent:   devIOFile != nil,
   541  		GoferMountConfs:      goferMountConf,
   542  		FilePayload:          payload,
   543  	}
   544  	if err := s.call(boot.ContMgrRestoreSubcontainer, &args, nil); err != nil {
   545  		return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err)
   546  	}
   547  	return nil
   548  }
   549  
   550  // Processes retrieves the list of processes and associated metadata for a
   551  // given container in this sandbox.
   552  func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
   553  	log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID)
   554  	var pl []*control.Process
   555  	if err := s.call(boot.ContMgrProcesses, &cid, &pl); err != nil {
   556  		return nil, fmt.Errorf("retrieving process data from sandbox: %v", err)
   557  	}
   558  	return pl, nil
   559  }
   560  
   561  // CreateTraceSession creates a new trace session.
   562  func (s *Sandbox) CreateTraceSession(config *seccheck.SessionConfig, force bool) error {
   563  	log.Debugf("Creating trace session in sandbox %q", s.ID)
   564  
   565  	sinkFiles, err := seccheck.SetupSinks(config.Sinks)
   566  	if err != nil {
   567  		return err
   568  	}
   569  	defer func() {
   570  		for _, f := range sinkFiles {
   571  			_ = f.Close()
   572  		}
   573  	}()
   574  
   575  	arg := boot.CreateTraceSessionArgs{
   576  		Config: *config,
   577  		Force:  force,
   578  		FilePayload: urpc.FilePayload{
   579  			Files: sinkFiles,
   580  		},
   581  	}
   582  	if err := s.call(boot.ContMgrCreateTraceSession, &arg, nil); err != nil {
   583  		return fmt.Errorf("creating trace session: %w", err)
   584  	}
   585  	return nil
   586  }
   587  
   588  // DeleteTraceSession deletes an existing trace session.
   589  func (s *Sandbox) DeleteTraceSession(name string) error {
   590  	log.Debugf("Deleting trace session %q in sandbox %q", name, s.ID)
   591  	if err := s.call(boot.ContMgrDeleteTraceSession, name, nil); err != nil {
   592  		return fmt.Errorf("deleting trace session: %w", err)
   593  	}
   594  	return nil
   595  }
   596  
   597  // ListTraceSessions lists all trace sessions.
   598  func (s *Sandbox) ListTraceSessions() ([]seccheck.SessionConfig, error) {
   599  	log.Debugf("Listing trace sessions in sandbox %q", s.ID)
   600  	var sessions []seccheck.SessionConfig
   601  	if err := s.call(boot.ContMgrListTraceSessions, nil, &sessions); err != nil {
   602  		return nil, fmt.Errorf("listing trace session: %w", err)
   603  	}
   604  	return sessions, nil
   605  }
   606  
   607  // ProcfsDump collects and returns a procfs dump for the sandbox.
   608  func (s *Sandbox) ProcfsDump() ([]procfs.ProcessProcfsDump, error) {
   609  	log.Debugf("Procfs dump %q", s.ID)
   610  	var procfsDump []procfs.ProcessProcfsDump
   611  	if err := s.call(boot.ContMgrProcfsDump, nil, &procfsDump); err != nil {
   612  		return nil, fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err)
   613  	}
   614  	return procfsDump, nil
   615  }
   616  
   617  // NewCGroup returns the sandbox's Cgroup, or an error if it does not have one.
   618  func (s *Sandbox) NewCGroup() (cgroup.Cgroup, error) {
   619  	return cgroup.NewFromPid(s.Pid.load(), false /* useSystemd */)
   620  }
   621  
   622  // Execute runs the specified command in the container. It returns the PID of
   623  // the newly created process.
   624  func (s *Sandbox) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) {
   625  	log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID)
   626  
   627  	// Stdios are those files which have an FD <= 2 in the process. We do not
   628  	// want the ownership of other files to be changed by configureStdios.
   629  	var stdios []*os.File
   630  	for i, fd := range args.GuestFDs {
   631  		if fd > 2 || i >= len(args.Files) {
   632  			continue
   633  		}
   634  		stdios = append(stdios, args.Files[i])
   635  	}
   636  
   637  	if err := s.configureStdios(conf, stdios); err != nil {
   638  		return 0, err
   639  	}
   640  
   641  	// Send a message to the sandbox control server to start the container.
   642  	var pid int32
   643  	if err := s.call(boot.ContMgrExecuteAsync, args, &pid); err != nil {
   644  		return 0, fmt.Errorf("executing command %q in sandbox: %w", args, err)
   645  	}
   646  	return pid, nil
   647  }
   648  
   649  // Event retrieves stats about the sandbox such as memory and CPU utilization.
   650  func (s *Sandbox) Event(cid string) (*boot.EventOut, error) {
   651  	log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID)
   652  	var e boot.EventOut
   653  	if err := s.call(boot.ContMgrEvent, &cid, &e); err != nil {
   654  		return nil, fmt.Errorf("retrieving event data from sandbox: %w", err)
   655  	}
   656  	return &e, nil
   657  }
   658  
   659  // PortForward starts port forwarding to the sandbox.
   660  func (s *Sandbox) PortForward(opts *boot.PortForwardOpts) error {
   661  	log.Debugf("Requesting port forward for container %q in sandbox %q: %+v", opts.ContainerID, s.ID, opts)
   662  	conn, err := s.sandboxConnect()
   663  	if err != nil {
   664  		return err
   665  	}
   666  	defer conn.Close()
   667  
   668  	if err := conn.Call(boot.ContMgrPortForward, opts, nil); err != nil {
   669  		return fmt.Errorf("port forwarding to sandbox: %v", err)
   670  	}
   671  
   672  	return nil
   673  }
   674  
   675  func (s *Sandbox) sandboxConnect() (*urpc.Client, error) {
   676  	log.Debugf("Connecting to sandbox %q", s.ID)
   677  	path := s.ControlSocketPath
   678  	if len(path) >= linux.UnixPathMax {
   679  		// This is not an abstract socket path. It is a filesystem path.
   680  		// UDS connect fails when the len(socket path) >= UNIX_PATH_MAX. Instead
   681  		// open the socket using open(2) and use /proc to refer to the open FD.
   682  		sockFD, err := unix.Open(path, unix.O_PATH, 0)
   683  		if err != nil {
   684  			return nil, fmt.Errorf("failed to open socket at %q", path)
   685  		}
   686  		defer unix.Close(sockFD)
   687  		path = filepath.Join("/proc/self/fd", fmt.Sprintf("%d", sockFD))
   688  	}
   689  	conn, err := client.ConnectTo(path)
   690  	if err != nil {
   691  		return nil, s.connError(err)
   692  	}
   693  	return conn, nil
   694  }
   695  
   696  func (s *Sandbox) call(method string, arg, result any) error {
   697  	conn, err := s.sandboxConnect()
   698  	if err != nil {
   699  		return err
   700  	}
   701  	defer conn.Close()
   702  
   703  	return conn.Call(method, arg, result)
   704  }
   705  
   706  func (s *Sandbox) connError(err error) error {
   707  	return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid.load(), err)
   708  }
   709  
   710  // createSandboxProcess starts the sandbox as a subprocess by running the "boot"
   711  // command, passing in the bundle dir.
   712  func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
   713  	// Ensure we don't leak FDs to the sandbox process.
   714  	if err := SetCloExeOnAllFDs(); err != nil {
   715  		return fmt.Errorf("setting CLOEXEC on all FDs: %w", err)
   716  	}
   717  
   718  	donations := donation.Agency{}
   719  	defer donations.Close()
   720  
   721  	// pgalloc.MemoryFile (which provides application memory) sometimes briefly
   722  	// mlock(2)s ranges of memory in order to fault in a large number of pages at
   723  	// a time. Try to make RLIMIT_MEMLOCK unlimited so that it can do so. runsc
   724  	// expects to run in a memory cgroup that limits its memory usage as
   725  	// required.
   726  	// This needs to be done before exec'ing `runsc boot`, as that subcommand
   727  	// runs as an unprivileged user that will not be able to call `setrlimit`
   728  	// by itself. Calling `setrlimit` here will have the side-effect of setting
   729  	// the limit on the currently-running `runsc` process as well, but that
   730  	// should be OK too.
   731  	var rlim unix.Rlimit
   732  	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
   733  		log.Warningf("Failed to get RLIMIT_MEMLOCK: %v", err)
   734  	} else if rlim.Cur != unix.RLIM_INFINITY || rlim.Max != unix.RLIM_INFINITY {
   735  		rlim.Cur = unix.RLIM_INFINITY
   736  		rlim.Max = unix.RLIM_INFINITY
   737  		if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
   738  			// We may not have CAP_SYS_RESOURCE, so this failure may be expected.
   739  			log.Infof("Failed to set RLIMIT_MEMLOCK: %v", err)
   740  		}
   741  	}
   742  
   743  	//
   744  	// These flags must come BEFORE the "boot" command in cmd.Args.
   745  	//
   746  
   747  	// Open the log files to pass to the sandbox as FDs.
   748  	if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
   749  		return err
   750  	}
   751  
   752  	test := ""
   753  	if len(conf.TestOnlyTestNameEnv) != 0 {
   754  		// Fetch test name if one is provided and the test only flag was set.
   755  		if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
   756  			test = t
   757  		}
   758  	}
   759  	if specutils.IsDebugCommand(conf, "boot") {
   760  		if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "boot", test); err != nil {
   761  			return err
   762  		}
   763  	}
   764  	if err := donations.DonateDebugLogFile("panic-log-fd", conf.PanicLog, "panic", test); err != nil {
   765  		return err
   766  	}
   767  	covFilename := conf.CoverageReport
   768  	if covFilename == "" {
   769  		covFilename = os.Getenv("GO_COVERAGE_FILE")
   770  	}
   771  	if covFilename != "" && coverage.Available() {
   772  		if err := donations.DonateDebugLogFile("coverage-fd", covFilename, "cov", test); err != nil {
   773  			return err
   774  		}
   775  	}
   776  
   777  	// Relay all the config flags to the sandbox process.
   778  	cmd := exec.Command(specutils.ExePath, conf.ToFlags()...)
   779  	cmd.SysProcAttr = &unix.SysProcAttr{
   780  		// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
   781  		// when re-parented.
   782  		Setsid: true,
   783  	}
   784  
   785  	// Set Args[0] to make easier to spot the sandbox process. Otherwise it's
   786  	// shown as `exe`.
   787  	cmd.Args[0] = "runsc-sandbox"
   788  
   789  	// Tranfer FDs that need to be present before the "boot" command.
   790  	// Start at 3 because 0, 1, and 2 are taken by stdin/out/err.
   791  	nextFD := donations.Transfer(cmd, 3)
   792  
   793  	// Add the "boot" command to the args.
   794  	//
   795  	// All flags after this must be for the boot command
   796  	cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir)
   797  
   798  	// Clear environment variables, unless --TESTONLY-unsafe-nonroot is set.
   799  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   800  		// Setting cmd.Env = nil causes cmd to inherit the current process's env.
   801  		cmd.Env = []string{}
   802  	}
   803  
   804  	// If there is a gofer, sends all socket ends to the sandbox.
   805  	donations.DonateAndClose("io-fds", args.IOFiles...)
   806  	donations.DonateAndClose("dev-io-fd", args.DevIOFile)
   807  	donations.DonateAndClose("gofer-filestore-fds", args.GoferFilestoreFiles...)
   808  	donations.DonateAndClose("mounts-fd", args.MountsFile)
   809  	donations.Donate("start-sync-fd", startSyncFile)
   810  	if err := donations.OpenAndDonate("user-log-fd", args.UserLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
   811  		return err
   812  	}
   813  	const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC
   814  	if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock, profFlags); err != nil {
   815  		return err
   816  	}
   817  	if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU, profFlags); err != nil {
   818  		return err
   819  	}
   820  	if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap, profFlags); err != nil {
   821  		return err
   822  	}
   823  	if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex, profFlags); err != nil {
   824  		return err
   825  	}
   826  	if err := donations.OpenAndDonate("trace-fd", conf.TraceFile, profFlags); err != nil {
   827  		return err
   828  	}
   829  
   830  	// Pass gofer mount configs.
   831  	cmd.Args = append(cmd.Args, "--gofer-mount-confs="+args.GoferMountConfs.String())
   832  
   833  	// Create a socket for the control server and donate it to the sandbox.
   834  	controlSocketPath, sockFD, err := createControlSocket(conf.RootDir, s.ID)
   835  	if err != nil {
   836  		return fmt.Errorf("failed to create control socket: %v", err)
   837  	}
   838  	s.ControlSocketPath = controlSocketPath
   839  	log.Infof("Control socket path: %q", s.ControlSocketPath)
   840  	donations.DonateAndClose("controller-fd", os.NewFile(uintptr(sockFD), "control_server_socket"))
   841  
   842  	specFile, err := specutils.OpenSpec(args.BundleDir)
   843  	if err != nil {
   844  		return fmt.Errorf("cannot open spec file in bundle dir %v: %w", args.BundleDir, err)
   845  	}
   846  	donations.DonateAndClose("spec-fd", specFile)
   847  
   848  	if err := donations.OpenAndDonate("pod-init-config-fd", conf.PodInitConfig, os.O_RDONLY); err != nil {
   849  		return err
   850  	}
   851  	donations.DonateAndClose("sink-fds", args.SinkFiles...)
   852  
   853  	gPlatform, err := platform.Lookup(conf.Platform)
   854  	if err != nil {
   855  		return fmt.Errorf("cannot look up platform: %w", err)
   856  	}
   857  	if deviceFile, err := gPlatform.OpenDevice(conf.PlatformDevicePath); err != nil {
   858  		return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err)
   859  	} else if deviceFile != nil {
   860  		donations.DonateAndClose("device-fd", deviceFile.ReleaseToFile("device file"))
   861  	}
   862  
   863  	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
   864  	// isn't set.
   865  	if conf.Platform == "kvm" {
   866  		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
   867  	}
   868  
   869  	// nss is the set of namespaces to join or create before starting the sandbox
   870  	// process. Mount, IPC and UTS namespaces from the host are not used as they
   871  	// are virtualized inside the sandbox. Be paranoid and run inside an empty
   872  	// namespace for these. Don't unshare cgroup because sandbox is added to a
   873  	// cgroup in the caller's namespace.
   874  	log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces")
   875  	nss := []specs.LinuxNamespace{
   876  		{Type: specs.IPCNamespace},
   877  		{Type: specs.MountNamespace},
   878  		{Type: specs.UTSNamespace},
   879  	}
   880  
   881  	if gPlatform.Requirements().RequiresCurrentPIDNS {
   882  		// TODO(b/75837838): Also set a new PID namespace so that we limit
   883  		// access to other host processes.
   884  		log.Infof("Sandbox will be started in the current PID namespace")
   885  	} else {
   886  		log.Infof("Sandbox will be started in a new PID namespace")
   887  		nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
   888  		cmd.Args = append(cmd.Args, "--pidns=true")
   889  	}
   890  
   891  	if specutils.NVProxyEnabled(args.Spec, conf) {
   892  		version, err := getNvproxyDriverVersion(conf)
   893  		if err != nil {
   894  			return fmt.Errorf("failed to get Nvidia driver version: %w", err)
   895  		}
   896  		cmd.Args = append(cmd.Args, "--nvidia-driver-version="+version)
   897  	}
   898  
   899  	// Joins the network namespace if network is enabled. the sandbox talks
   900  	// directly to the host network, which may have been configured in the
   901  	// namespace.
   902  	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
   903  		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
   904  		nss = append(nss, ns)
   905  	} else if conf.Network == config.NetworkHost {
   906  		log.Infof("Sandbox will be started in the host network namespace")
   907  	} else {
   908  		log.Infof("Sandbox will be started in new network namespace")
   909  		nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
   910  	}
   911  
   912  	// These are set to the uid/gid that the sandbox process will use. May be
   913  	// overriden below.
   914  	s.UID = os.Getuid()
   915  	s.GID = os.Getgid()
   916  
   917  	// User namespace depends on the network type or whether access to the host
   918  	// filesystem is required. These features require to run inside the user
   919  	// namespace specified in the spec or the current namespace if none is
   920  	// configured.
   921  	rootlessEUID := unix.Geteuid() != 0
   922  	setUserMappings := false
   923  	if conf.Network == config.NetworkHost || conf.DirectFS {
   924  		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
   925  			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
   926  			nss = append(nss, userns)
   927  			if rootlessEUID {
   928  				syncFile, err := ConfigureCmdForRootless(cmd, &donations)
   929  				if err != nil {
   930  					return err
   931  				}
   932  				defer syncFile.Close()
   933  				setUserMappings = true
   934  			} else {
   935  				specutils.SetUIDGIDMappings(cmd, args.Spec)
   936  				// We need to set UID and GID to have capabilities in a new user namespace.
   937  				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
   938  			}
   939  		} else {
   940  			if rootlessEUID {
   941  				return fmt.Errorf("unable to run a rootless container without userns")
   942  			}
   943  			log.Infof("Sandbox will be started in the current user namespace")
   944  		}
   945  		// When running in the caller's defined user namespace, apply the same
   946  		// capabilities to the sandbox process to ensure it abides to the same
   947  		// rules.
   948  		cmd.Args = append(cmd.Args, "--apply-caps=true")
   949  
   950  		// If we have CAP_SYS_ADMIN, we can create an empty chroot and
   951  		// bind-mount the executable inside it.
   952  		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   953  			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
   954  		} else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) || rootlessEUID {
   955  			log.Infof("Sandbox will be started in minimal chroot")
   956  			cmd.Args = append(cmd.Args, "--setup-root")
   957  		} else {
   958  			return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN")
   959  		}
   960  	} else {
   961  		// If we have CAP_SETUID and CAP_SETGID, then we can also run
   962  		// as user nobody.
   963  		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   964  			log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
   965  			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
   966  		} else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
   967  			log.Infof("Sandbox will be started in new user namespace")
   968  			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
   969  			cmd.Args = append(cmd.Args, "--setup-root")
   970  
   971  			const nobody = 65534
   972  			if rootlessEUID || conf.Rootless {
   973  				log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
   974  			} else {
   975  				// Map nobody in the new namespace to nobody in the parent namespace.
   976  				s.UID = nobody
   977  				s.GID = nobody
   978  			}
   979  
   980  			// Set credentials to run as user and group nobody.
   981  			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody}
   982  			cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
   983  				{
   984  					ContainerID: nobody,
   985  					HostID:      s.UID,
   986  					Size:        1,
   987  				},
   988  			}
   989  			cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
   990  				{
   991  					ContainerID: nobody,
   992  					HostID:      s.GID,
   993  					Size:        1,
   994  				},
   995  			}
   996  
   997  			// A sandbox process will construct an empty root for itself, so it has
   998  			// to have CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities.
   999  			cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps,
  1000  				uintptr(capability.CAP_SYS_ADMIN),
  1001  				uintptr(capability.CAP_SYS_CHROOT),
  1002  				// CAP_SETPCAP is required to clear the bounding set.
  1003  				uintptr(capability.CAP_SETPCAP),
  1004  			)
  1005  
  1006  		} else {
  1007  			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
  1008  		}
  1009  	}
  1010  
  1011  	// The current process' stdio must be passed to the application via the
  1012  	// --stdio-fds flag. The stdio of the sandbox process itself must not
  1013  	// be connected to the same FDs, otherwise we risk leaking sandbox
  1014  	// errors to the application, so we set the sandbox stdio to nil,
  1015  	// causing them to read/write from the null device.
  1016  	cmd.Stdin = nil
  1017  	cmd.Stdout = nil
  1018  	cmd.Stderr = nil
  1019  	var stdios [3]*os.File
  1020  
  1021  	// If the console control socket file is provided, then create a new
  1022  	// pty master/replica pair and set the TTY on the sandbox process.
  1023  	if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
  1024  		// console.NewWithSocket will send the master on the given
  1025  		// socket, and return the replica.
  1026  		tty, err := console.NewWithSocket(args.ConsoleSocket)
  1027  		if err != nil {
  1028  			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
  1029  		}
  1030  		defer tty.Close()
  1031  
  1032  		// Set the TTY as a controlling TTY on the sandbox process.
  1033  		cmd.SysProcAttr.Setctty = true
  1034  
  1035  		// Inconveniently, the Ctty must be the FD in the *child* process's FD
  1036  		// table. So transfer all files we have so far and make sure the next file
  1037  		// added to donations is stdin.
  1038  		//
  1039  		// See https://github.com/golang/go/issues/29458.
  1040  		nextFD = donations.Transfer(cmd, nextFD)
  1041  		cmd.SysProcAttr.Ctty = nextFD
  1042  
  1043  		// Pass the tty as all stdio fds to sandbox.
  1044  		stdios[0] = tty
  1045  		stdios[1] = tty
  1046  		stdios[2] = tty
  1047  
  1048  		if conf.Debug {
  1049  			// If debugging, send the boot process stdio to the
  1050  			// TTY, so that it is easier to find.
  1051  			cmd.Stdin = tty
  1052  			cmd.Stdout = tty
  1053  			cmd.Stderr = tty
  1054  		}
  1055  	} else {
  1056  		// If not using a console, pass our current stdio as the
  1057  		// container stdio via flags.
  1058  		stdios[0] = os.Stdin
  1059  		stdios[1] = os.Stdout
  1060  		stdios[2] = os.Stderr
  1061  
  1062  		if conf.Debug {
  1063  			// If debugging, send the boot process stdio to the
  1064  			// this process' stdio, so that is is easier to find.
  1065  			cmd.Stdin = os.Stdin
  1066  			cmd.Stdout = os.Stdout
  1067  			cmd.Stderr = os.Stderr
  1068  		}
  1069  	}
  1070  	if err := s.configureStdios(conf, stdios[:]); err != nil {
  1071  		return fmt.Errorf("configuring stdios: %w", err)
  1072  	}
  1073  	// Note: this must be done right after "cmd.SysProcAttr.Ctty" is set above
  1074  	// because it relies on stdin being the next FD donated.
  1075  	donations.Donate("stdio-fds", stdios[:]...)
  1076  	if conf.ProfilingMetricsLog == "-" {
  1077  		donations.Donate("profiling-metrics-fd", stdios[1])
  1078  		cmd.Args = append(cmd.Args, "--profiling-metrics-fd-lossy=true")
  1079  	} else if conf.ProfilingMetricsLog != "" {
  1080  		if err := donations.DonateDebugLogFile("profiling-metrics-fd", conf.ProfilingMetricsLog, "metrics", test); err != nil {
  1081  			return err
  1082  		}
  1083  		cmd.Args = append(cmd.Args, "--profiling-metrics-fd-lossy=false")
  1084  	}
  1085  
  1086  	totalSysMem, err := totalSystemMemory()
  1087  	if err != nil {
  1088  		return err
  1089  	}
  1090  	cmd.Args = append(cmd.Args, "--total-host-memory", strconv.FormatUint(totalSysMem, 10))
  1091  
  1092  	mem := totalSysMem
  1093  	if s.CgroupJSON.Cgroup != nil {
  1094  		cpuNum, err := s.CgroupJSON.Cgroup.NumCPU()
  1095  		if err != nil {
  1096  			return fmt.Errorf("getting cpu count from cgroups: %v", err)
  1097  		}
  1098  		if conf.CPUNumFromQuota {
  1099  			// Dropping below 2 CPUs can trigger application to disable
  1100  			// locks that can lead do hard to debug errors, so just
  1101  			// leaving two cores as reasonable default.
  1102  			const minCPUs = 2
  1103  
  1104  			quota, err := s.CgroupJSON.Cgroup.CPUQuota()
  1105  			if err != nil {
  1106  				return fmt.Errorf("getting cpu quota from cgroups: %v", err)
  1107  			}
  1108  			if n := int(math.Ceil(quota)); n > 0 {
  1109  				if n < minCPUs {
  1110  					n = minCPUs
  1111  				}
  1112  				if n < cpuNum {
  1113  					// Only lower the cpu number.
  1114  					cpuNum = n
  1115  				}
  1116  			}
  1117  		}
  1118  		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
  1119  
  1120  		memLimit, err := s.CgroupJSON.Cgroup.MemoryLimit()
  1121  		if err != nil {
  1122  			return fmt.Errorf("getting memory limit from cgroups: %v", err)
  1123  		}
  1124  		if memLimit < mem {
  1125  			mem = memLimit
  1126  		}
  1127  	}
  1128  	cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10))
  1129  
  1130  	if args.Attached {
  1131  		// Kill sandbox if parent process exits in attached mode.
  1132  		cmd.SysProcAttr.Pdeathsig = unix.SIGKILL
  1133  		// Tells boot that any process it creates must have pdeathsig set.
  1134  		cmd.Args = append(cmd.Args, "--attached")
  1135  	}
  1136  
  1137  	if args.ExecFile != nil {
  1138  		donations.Donate("exec-fd", args.ExecFile)
  1139  	}
  1140  
  1141  	nextFD = donations.Transfer(cmd, nextFD)
  1142  
  1143  	_ = donation.DonateAndTransferCustomFiles(cmd, nextFD, args.PassFiles)
  1144  
  1145  	// Add container ID as the last argument.
  1146  	cmd.Args = append(cmd.Args, s.ID)
  1147  
  1148  	donation.LogDonations(cmd)
  1149  	log.Debugf("Starting sandbox: %s %v", cmd.Path, cmd.Args)
  1150  	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
  1151  	if err := specutils.StartInNS(cmd, nss); err != nil {
  1152  		err := fmt.Errorf("starting sandbox: %v", err)
  1153  		// If the sandbox failed to start, it may be because the binary
  1154  		// permissions were incorrect. Check the bits and return a more helpful
  1155  		// error message.
  1156  		//
  1157  		// NOTE: The error message is checked because error types are lost over
  1158  		// rpc calls.
  1159  		if strings.Contains(err.Error(), unix.EACCES.Error()) {
  1160  			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
  1161  				return fmt.Errorf("%v: %v", err, permsErr)
  1162  			}
  1163  		}
  1164  		return err
  1165  	}
  1166  	s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid)
  1167  	if err != nil {
  1168  		return err
  1169  	}
  1170  	if setUserMappings {
  1171  		if err := SetUserMappings(args.Spec, cmd.Process.Pid); err != nil {
  1172  			return err
  1173  		}
  1174  	}
  1175  
  1176  	s.child = true
  1177  	s.Pid.store(cmd.Process.Pid)
  1178  	log.Infof("Sandbox started, PID: %d", cmd.Process.Pid)
  1179  
  1180  	return nil
  1181  }
  1182  
  1183  // Wait waits for the containerized process to exit, and returns its WaitStatus.
  1184  func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) {
  1185  	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
  1186  
  1187  	if conn, err := s.sandboxConnect(); err != nil {
  1188  		// The sandbox may have exited while before we had a chance to wait on it.
  1189  		// There is nothing we can do for subcontainers. For the init container, we
  1190  		// can try to get the sandbox exit code.
  1191  		if !s.IsRootContainer(cid) {
  1192  			return unix.WaitStatus(0), err
  1193  		}
  1194  		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
  1195  	} else {
  1196  		defer conn.Close()
  1197  
  1198  		// Try the Wait RPC to the sandbox.
  1199  		var ws unix.WaitStatus
  1200  		err = conn.Call(boot.ContMgrWait, &cid, &ws)
  1201  		conn.Close()
  1202  		if err == nil {
  1203  			if s.IsRootContainer(cid) {
  1204  				if err := s.waitForStopped(); err != nil {
  1205  					return unix.WaitStatus(0), err
  1206  				}
  1207  			}
  1208  			// It worked!
  1209  			return ws, nil
  1210  		}
  1211  		// See comment above.
  1212  		if !s.IsRootContainer(cid) {
  1213  			return unix.WaitStatus(0), err
  1214  		}
  1215  
  1216  		// The sandbox may have exited after we connected, but before
  1217  		// or during the Wait RPC.
  1218  		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
  1219  	}
  1220  
  1221  	// The sandbox may have already exited, or exited while handling the Wait RPC.
  1222  	// The best we can do is ask Linux what the sandbox exit status was, since in
  1223  	// most cases that will be the same as the container exit status.
  1224  	if err := s.waitForStopped(); err != nil {
  1225  		return unix.WaitStatus(0), err
  1226  	}
  1227  	if !s.child {
  1228  		return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
  1229  	}
  1230  
  1231  	s.statusMu.Lock()
  1232  	defer s.statusMu.Unlock()
  1233  	return s.status, nil
  1234  }
  1235  
  1236  // WaitPID waits for process 'pid' in the container's sandbox and returns its
  1237  // WaitStatus.
  1238  func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) {
  1239  	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
  1240  	var ws unix.WaitStatus
  1241  	args := &boot.WaitPIDArgs{
  1242  		PID: pid,
  1243  		CID: cid,
  1244  	}
  1245  	if err := s.call(boot.ContMgrWaitPID, args, &ws); err != nil {
  1246  		return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %w", pid, s.ID, err)
  1247  	}
  1248  	return ws, nil
  1249  }
  1250  
  1251  // IsRootContainer returns true if the specified container ID belongs to the
  1252  // root container.
  1253  func (s *Sandbox) IsRootContainer(cid string) bool {
  1254  	return s.ID == cid
  1255  }
  1256  
  1257  // Destroy frees all resources associated with the sandbox. It fails fast and
  1258  // is idempotent.
  1259  func (s *Sandbox) destroy() error {
  1260  	log.Debugf("Destroying sandbox %q", s.ID)
  1261  	// Only delete the control file if it exists.
  1262  	if len(s.ControlSocketPath) > 0 {
  1263  		if err := os.Remove(s.ControlSocketPath); err != nil {
  1264  			log.Warningf("failed to delete control socket file %q: %v", s.ControlSocketPath, err)
  1265  		}
  1266  	}
  1267  	pid := s.Pid.load()
  1268  	if pid != 0 {
  1269  		log.Debugf("Killing sandbox %q", s.ID)
  1270  		if err := unix.Kill(pid, unix.SIGKILL); err != nil && err != unix.ESRCH {
  1271  			return fmt.Errorf("killing sandbox %q PID %q: %w", s.ID, pid, err)
  1272  		}
  1273  		if err := s.waitForStopped(); err != nil {
  1274  			return fmt.Errorf("waiting sandbox %q stop: %w", s.ID, err)
  1275  		}
  1276  	}
  1277  
  1278  	return nil
  1279  }
  1280  
  1281  // SignalContainer sends the signal to a container in the sandbox. If all is
  1282  // true and signal is SIGKILL, then waits for all processes to exit before
  1283  // returning.
  1284  func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error {
  1285  	log.Debugf("Signal sandbox %q", s.ID)
  1286  	mode := boot.DeliverToProcess
  1287  	if all {
  1288  		mode = boot.DeliverToAllProcesses
  1289  	}
  1290  
  1291  	args := boot.SignalArgs{
  1292  		CID:   cid,
  1293  		Signo: int32(sig),
  1294  		Mode:  mode,
  1295  	}
  1296  	if err := s.call(boot.ContMgrSignal, &args, nil); err != nil {
  1297  		return fmt.Errorf("signaling container %q: %w", cid, err)
  1298  	}
  1299  	return nil
  1300  }
  1301  
  1302  // SignalProcess sends the signal to a particular process in the container. If
  1303  // fgProcess is true, then the signal is sent to the foreground process group
  1304  // in the same session that PID belongs to. This is only valid if the process
  1305  // is attached to a host TTY.
  1306  func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error {
  1307  	log.Debugf("Signal sandbox %q", s.ID)
  1308  
  1309  	mode := boot.DeliverToProcess
  1310  	if fgProcess {
  1311  		mode = boot.DeliverToForegroundProcessGroup
  1312  	}
  1313  
  1314  	args := boot.SignalArgs{
  1315  		CID:   cid,
  1316  		Signo: int32(sig),
  1317  		PID:   pid,
  1318  		Mode:  mode,
  1319  	}
  1320  	if err := s.call(boot.ContMgrSignal, &args, nil); err != nil {
  1321  		return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err)
  1322  	}
  1323  	return nil
  1324  }
  1325  
  1326  // Checkpoint sends the checkpoint call for a container in the sandbox.
  1327  // The statefile will be written to f.
  1328  func (s *Sandbox) Checkpoint(cid string, imagePath string, direct bool, sfOpts statefile.Options, mfOpts pgalloc.SaveOpts) error {
  1329  	log.Debugf("Checkpoint sandbox %q, statefile options %+v, MemoryFile options %+v", s.ID, sfOpts, mfOpts)
  1330  
  1331  	stateFilePath := filepath.Join(imagePath, boot.CheckpointStateFileName)
  1332  	sf, err := os.OpenFile(stateFilePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
  1333  	if err != nil {
  1334  		return fmt.Errorf("creating checkpoint state file %q: %w", stateFilePath, err)
  1335  	}
  1336  	defer sf.Close()
  1337  
  1338  	opt := control.SaveOpts{
  1339  		Metadata:           sfOpts.WriteToMetadata(map[string]string{}),
  1340  		MemoryFileSaveOpts: mfOpts,
  1341  		FilePayload: urpc.FilePayload{
  1342  			Files: []*os.File{sf},
  1343  		},
  1344  		Resume: sfOpts.Resume,
  1345  	}
  1346  
  1347  	// When there is no compression, MemoryFile contents are page-aligned.
  1348  	// It is beneficial to store them separately so certain optimizations can be
  1349  	// applied during restore. See Restore().
  1350  	if sfOpts.Compression == statefile.CompressionLevelNone {
  1351  		pagesFilePath := filepath.Join(imagePath, boot.CheckpointPagesFileName)
  1352  		pagesWriteFlags := os.O_CREATE | os.O_EXCL | os.O_RDWR
  1353  		if direct {
  1354  			// The writes will be page-aligned, so it can be opened with O_DIRECT.
  1355  			pagesWriteFlags |= syscall.O_DIRECT
  1356  		}
  1357  		pf, err := os.OpenFile(pagesFilePath, pagesWriteFlags, 0644)
  1358  		if err != nil {
  1359  			return fmt.Errorf("creating checkpoint pages file %q: %w", pagesFilePath, err)
  1360  		}
  1361  		defer pf.Close()
  1362  		pagesMetadataFilePath := filepath.Join(imagePath, boot.CheckpointPagesMetadataFileName)
  1363  		pmf, err := os.OpenFile(pagesMetadataFilePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644)
  1364  		if err != nil {
  1365  			return fmt.Errorf("creating checkpoint pages metadata file %q: %w", pagesMetadataFilePath, err)
  1366  		}
  1367  		defer pmf.Close()
  1368  		opt.FilePayload.Files = append(opt.FilePayload.Files, pmf, pf)
  1369  		opt.HavePagesFile = true
  1370  	}
  1371  
  1372  	if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil {
  1373  		return fmt.Errorf("checkpointing container %q: %w", cid, err)
  1374  	}
  1375  	return nil
  1376  }
  1377  
  1378  // Pause sends the pause call for a container in the sandbox.
  1379  func (s *Sandbox) Pause(cid string) error {
  1380  	log.Debugf("Pause sandbox %q", s.ID)
  1381  	if err := s.call(boot.LifecyclePause, nil, nil); err != nil {
  1382  		return fmt.Errorf("pausing container %q: %w", cid, err)
  1383  	}
  1384  	return nil
  1385  }
  1386  
  1387  // Resume sends the resume call for a container in the sandbox.
  1388  func (s *Sandbox) Resume(cid string) error {
  1389  	log.Debugf("Resume sandbox %q", s.ID)
  1390  	if err := s.call(boot.LifecycleResume, nil, nil); err != nil {
  1391  		return fmt.Errorf("resuming container %q: %w", cid, err)
  1392  	}
  1393  	return nil
  1394  }
  1395  
  1396  // Usage sends the collect call for a container in the sandbox.
  1397  func (s *Sandbox) Usage(Full bool) (control.MemoryUsage, error) {
  1398  	log.Debugf("Usage sandbox %q", s.ID)
  1399  	opts := control.MemoryUsageOpts{Full: Full}
  1400  	var m control.MemoryUsage
  1401  	if err := s.call(boot.UsageCollect, &opts, &m); err != nil {
  1402  		return control.MemoryUsage{}, fmt.Errorf("collecting usage: %w", err)
  1403  	}
  1404  	return m, nil
  1405  }
  1406  
  1407  // UsageFD sends the usagefd call for a container in the sandbox.
  1408  func (s *Sandbox) UsageFD() (*control.MemoryUsageRecord, error) {
  1409  	log.Debugf("Usage sandbox %q", s.ID)
  1410  	opts := control.MemoryUsageFileOpts{Version: 1}
  1411  	var m control.MemoryUsageFile
  1412  	if err := s.call(boot.UsageUsageFD, &opts, &m); err != nil {
  1413  		return nil, fmt.Errorf("collecting usage FD: %w", err)
  1414  	}
  1415  
  1416  	if len(m.FilePayload.Files) != 2 {
  1417  		return nil, fmt.Errorf("wants exactly two fds")
  1418  	}
  1419  	return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1])
  1420  }
  1421  
  1422  // GetRegisteredMetrics returns metric registration data from the sandbox.
  1423  // This data is meant to be used as a way to sanity-check any exported metrics data during the
  1424  // lifetime of the sandbox in order to avoid a compromised sandbox from being able to produce
  1425  // bogus metrics.
  1426  // This returns an error if the sandbox has not requested instrumentation during creation time.
  1427  func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) {
  1428  	if s.RegisteredMetrics == nil {
  1429  		return nil, errors.New("sandbox did not request instrumentation when it was created")
  1430  	}
  1431  	return s.RegisteredMetrics, nil
  1432  }
  1433  
  1434  // ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format.
  1435  func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) {
  1436  	log.Debugf("Metrics export sandbox %q", s.ID)
  1437  	var data control.MetricsExportData
  1438  	if err := s.call(boot.MetricsExport, &opts, &data); err != nil {
  1439  		return nil, err
  1440  	}
  1441  	// Since we do not trust the output of the sandbox as-is, double-check that the options were
  1442  	// respected.
  1443  	if err := opts.Verify(&data); err != nil {
  1444  		return nil, err
  1445  	}
  1446  	return data.Snapshot, nil
  1447  }
  1448  
  1449  // IsRunning returns true if the sandbox or gofer process is running.
  1450  func (s *Sandbox) IsRunning() bool {
  1451  	pid := s.Pid.load()
  1452  	if pid == 0 {
  1453  		return false
  1454  	}
  1455  	// Send a signal 0 to the sandbox process. If it succeeds, the sandbox
  1456  	// process is running.
  1457  	return unix.Kill(pid, 0) == nil
  1458  }
  1459  
  1460  // Stacks collects and returns all stacks for the sandbox.
  1461  func (s *Sandbox) Stacks() (string, error) {
  1462  	log.Debugf("Stacks sandbox %q", s.ID)
  1463  	var stacks string
  1464  	if err := s.call(boot.DebugStacks, nil, &stacks); err != nil {
  1465  		return "", fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err)
  1466  	}
  1467  	return stacks, nil
  1468  }
  1469  
  1470  // HeapProfile writes a heap profile to the given file.
  1471  func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error {
  1472  	log.Debugf("Heap profile %q", s.ID)
  1473  	opts := control.HeapProfileOpts{
  1474  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1475  		Delay:       delay,
  1476  	}
  1477  	return s.call(boot.ProfileHeap, &opts, nil)
  1478  }
  1479  
  1480  // CPUProfile collects a CPU profile.
  1481  func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error {
  1482  	log.Debugf("CPU profile %q", s.ID)
  1483  	opts := control.CPUProfileOpts{
  1484  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1485  		Duration:    duration,
  1486  	}
  1487  	return s.call(boot.ProfileCPU, &opts, nil)
  1488  }
  1489  
  1490  // BlockProfile writes a block profile to the given file.
  1491  func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error {
  1492  	log.Debugf("Block profile %q", s.ID)
  1493  	opts := control.BlockProfileOpts{
  1494  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1495  		Duration:    duration,
  1496  	}
  1497  	return s.call(boot.ProfileBlock, &opts, nil)
  1498  }
  1499  
  1500  // MutexProfile writes a mutex profile to the given file.
  1501  func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error {
  1502  	log.Debugf("Mutex profile %q", s.ID)
  1503  	opts := control.MutexProfileOpts{
  1504  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1505  		Duration:    duration,
  1506  	}
  1507  	return s.call(boot.ProfileMutex, &opts, nil)
  1508  }
  1509  
  1510  // Trace collects an execution trace.
  1511  func (s *Sandbox) Trace(f *os.File, duration time.Duration) error {
  1512  	log.Debugf("Trace %q", s.ID)
  1513  	opts := control.TraceProfileOpts{
  1514  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1515  		Duration:    duration,
  1516  	}
  1517  	return s.call(boot.ProfileTrace, &opts, nil)
  1518  }
  1519  
  1520  // ChangeLogging changes logging options.
  1521  func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
  1522  	log.Debugf("Change logging start %q", s.ID)
  1523  	if err := s.call(boot.LoggingChange, &args, nil); err != nil {
  1524  		return fmt.Errorf("changing sandbox %q logging: %w", s.ID, err)
  1525  	}
  1526  	return nil
  1527  }
  1528  
  1529  // DestroyContainer destroys the given container. If it is the root container,
  1530  // then the entire sandbox is destroyed.
  1531  func (s *Sandbox) DestroyContainer(cid string) error {
  1532  	if err := s.destroyContainer(cid); err != nil {
  1533  		// If the sandbox isn't running, the container has already been destroyed,
  1534  		// ignore the error in this case.
  1535  		if s.IsRunning() {
  1536  			return err
  1537  		}
  1538  	}
  1539  	return nil
  1540  }
  1541  
  1542  func (s *Sandbox) destroyContainer(cid string) error {
  1543  	if s.IsRootContainer(cid) {
  1544  		log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid)
  1545  		return s.destroy()
  1546  	}
  1547  
  1548  	log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID)
  1549  	if err := s.call(boot.ContMgrDestroySubcontainer, &cid, nil); err != nil {
  1550  		return fmt.Errorf("destroying container %q: %w", cid, err)
  1551  	}
  1552  	return nil
  1553  }
  1554  
  1555  // waitForStopped waits for the sandbox to actually stop.
  1556  // This should only be called when the sandbox is known to be shutting down.
  1557  func (s *Sandbox) waitForStopped() error {
  1558  	const waitTimeout = 2 * time.Minute
  1559  	if s.child {
  1560  		s.statusMu.Lock()
  1561  		defer s.statusMu.Unlock()
  1562  		pid := s.Pid.load()
  1563  		if pid == 0 {
  1564  			return nil
  1565  		}
  1566  		// The sandbox process is a child of the current process,
  1567  		// so we can wait on it to terminate and collect its zombie.
  1568  		if _, err := unix.Wait4(int(pid), &s.status, 0, nil); err != nil {
  1569  			return fmt.Errorf("error waiting the sandbox process: %v", err)
  1570  		}
  1571  		s.Pid.store(0)
  1572  		return nil
  1573  	}
  1574  	ctx, cancel := context.WithTimeout(context.Background(), waitTimeout)
  1575  	defer cancel()
  1576  	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
  1577  	op := func() error {
  1578  		if s.IsRunning() {
  1579  			return fmt.Errorf("sandbox is still running")
  1580  		}
  1581  		return nil
  1582  	}
  1583  	return backoff.Retry(op, b)
  1584  }
  1585  
  1586  // configureStdios change stdios ownership to give access to the sandbox
  1587  // process. This may be skipped depending on the configuration.
  1588  func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error {
  1589  	if conf.Rootless || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
  1590  		// Cannot change ownership without CAP_CHOWN.
  1591  		return nil
  1592  	}
  1593  
  1594  	if s.UID < 0 || s.GID < 0 {
  1595  		panic(fmt.Sprintf("sandbox UID/GID is not set: %d/%d", s.UID, s.GID))
  1596  	}
  1597  	for _, file := range stdios {
  1598  		log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID)
  1599  		if err := file.Chown(s.UID, s.GID); err != nil {
  1600  			if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
  1601  				log.Warningf("can't change an owner of %s: %s", file.Name(), err)
  1602  				continue
  1603  			}
  1604  			return err
  1605  		}
  1606  	}
  1607  	return nil
  1608  }
  1609  
  1610  // deviceFileForPlatform opens the device file for the given platform. If the
  1611  // platform does not need a device file, then nil is returned.
  1612  // devicePath may be empty to use a sane platform-specific default.
  1613  func deviceFileForPlatform(name, devicePath string) (*fd.FD, error) {
  1614  	p, err := platform.Lookup(name)
  1615  	if err != nil {
  1616  		return nil, err
  1617  	}
  1618  
  1619  	f, err := p.OpenDevice(devicePath)
  1620  	if err != nil {
  1621  		return nil, fmt.Errorf("opening device file for platform %q: %w", name, err)
  1622  	}
  1623  	return f, nil
  1624  }
  1625  
  1626  // getNvproxyDriverVersion returns the NVIDIA driver ABI version to use by
  1627  // nvproxy.
  1628  func getNvproxyDriverVersion(conf *config.Config) (string, error) {
  1629  	switch conf.NVProxyDriverVersion {
  1630  	case "":
  1631  		return nvproxy.HostDriverVersion()
  1632  	case "latest":
  1633  		nvproxy.Init()
  1634  		return nvproxy.LatestDriver().String(), nil
  1635  	default:
  1636  		version, err := nvproxy.DriverVersionFrom(conf.NVProxyDriverVersion)
  1637  		return version.String(), err
  1638  	}
  1639  }
  1640  
  1641  // checkBinaryPermissions verifies that the required binary bits are set on
  1642  // the runsc executable.
  1643  func checkBinaryPermissions(conf *config.Config) error {
  1644  	// All platforms need the other exe bit
  1645  	neededBits := os.FileMode(0001)
  1646  	if conf.Platform == "ptrace" {
  1647  		// Ptrace needs the other read bit
  1648  		neededBits |= os.FileMode(0004)
  1649  	}
  1650  
  1651  	exePath, err := os.Executable()
  1652  	if err != nil {
  1653  		return fmt.Errorf("getting exe path: %v", err)
  1654  	}
  1655  
  1656  	// Check the permissions of the runsc binary and print an error if it
  1657  	// doesn't match expectations.
  1658  	info, err := os.Stat(exePath)
  1659  	if err != nil {
  1660  		return fmt.Errorf("stat file: %v", err)
  1661  	}
  1662  
  1663  	if info.Mode().Perm()&neededBits != neededBits {
  1664  		return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
  1665  	}
  1666  	return nil
  1667  }
  1668  
  1669  // CgroupsReadControlFile reads a single cgroupfs control file in the sandbox.
  1670  func (s *Sandbox) CgroupsReadControlFile(file control.CgroupControlFile) (string, error) {
  1671  	log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID)
  1672  	args := control.CgroupsReadArgs{
  1673  		Args: []control.CgroupsReadArg{
  1674  			{
  1675  				File: file,
  1676  			},
  1677  		},
  1678  	}
  1679  	var out control.CgroupsResults
  1680  	if err := s.call(boot.CgroupsReadControlFiles, &args, &out); err != nil {
  1681  		return "", err
  1682  	}
  1683  	if len(out.Results) != 1 {
  1684  		return "", fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
  1685  	}
  1686  	return out.Results[0].Unpack()
  1687  }
  1688  
  1689  // CgroupsWriteControlFile writes a single cgroupfs control file in the sandbox.
  1690  func (s *Sandbox) CgroupsWriteControlFile(file control.CgroupControlFile, value string) error {
  1691  	log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID)
  1692  	args := control.CgroupsWriteArgs{
  1693  		Args: []control.CgroupsWriteArg{
  1694  			{
  1695  				File:  file,
  1696  				Value: value,
  1697  			},
  1698  		},
  1699  	}
  1700  	var out control.CgroupsResults
  1701  	if err := s.call(boot.CgroupsWriteControlFiles, &args, &out); err != nil {
  1702  		return err
  1703  	}
  1704  	if len(out.Results) != 1 {
  1705  		return fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
  1706  	}
  1707  	return out.Results[0].AsError()
  1708  }
  1709  
  1710  // fixPidns looks at the PID namespace path. If that path corresponds to the
  1711  // sandbox process PID namespace, then change the spec so that the container
  1712  // joins the sandbox root namespace.
  1713  func (s *Sandbox) fixPidns(spec *specs.Spec) {
  1714  	pidns, ok := specutils.GetNS(specs.PIDNamespace, spec)
  1715  	if !ok {
  1716  		// pidns was not set, nothing to fix.
  1717  		return
  1718  	}
  1719  	if pidns.Path != fmt.Sprintf("/proc/%d/ns/pid", s.Pid.load()) {
  1720  		// Fix only if the PID namespace corresponds to the sandbox's.
  1721  		return
  1722  	}
  1723  
  1724  	for i := range spec.Linux.Namespaces {
  1725  		if spec.Linux.Namespaces[i].Type == specs.PIDNamespace {
  1726  			// Removing the namespace makes the container join the sandbox root
  1727  			// namespace.
  1728  			log.Infof("Fixing PID namespace in spec from %q to make the container join the sandbox root namespace", pidns.Path)
  1729  			spec.Linux.Namespaces = append(spec.Linux.Namespaces[:i], spec.Linux.Namespaces[i+1:]...)
  1730  			return
  1731  		}
  1732  	}
  1733  	panic("unreachable")
  1734  }
  1735  
  1736  // ConfigureCmdForRootless configures cmd to donate a socket FD that can be
  1737  // used to synchronize userns configuration.
  1738  func ConfigureCmdForRootless(cmd *exec.Cmd, donations *donation.Agency) (*os.File, error) {
  1739  	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
  1740  	if err != nil {
  1741  		return nil, err
  1742  	}
  1743  	f := os.NewFile(uintptr(fds[1]), "userns sync other FD")
  1744  	donations.DonateAndClose("sync-userns-fd", f)
  1745  	if cmd.SysProcAttr == nil {
  1746  		cmd.SysProcAttr = &unix.SysProcAttr{}
  1747  	}
  1748  	cmd.SysProcAttr.AmbientCaps = []uintptr{
  1749  		// Same as `cap` in cmd/gofer.go.
  1750  		unix.CAP_CHOWN,
  1751  		unix.CAP_DAC_OVERRIDE,
  1752  		unix.CAP_DAC_READ_SEARCH,
  1753  		unix.CAP_FOWNER,
  1754  		unix.CAP_FSETID,
  1755  		unix.CAP_SYS_CHROOT,
  1756  		// Needed for setuid(2)/setgid(2).
  1757  		unix.CAP_SETUID,
  1758  		unix.CAP_SETGID,
  1759  		// Needed for chroot.
  1760  		unix.CAP_SYS_ADMIN,
  1761  		// Needed to be able to clear bounding set (PR_CAPBSET_DROP).
  1762  		unix.CAP_SETPCAP,
  1763  	}
  1764  	return os.NewFile(uintptr(fds[0]), "userns sync FD"), nil
  1765  }
  1766  
  1767  // SetUserMappings uses newuidmap/newgidmap programs to set up user ID mappings
  1768  // for process pid.
  1769  func SetUserMappings(spec *specs.Spec, pid int) error {
  1770  	log.Debugf("Setting user mappings")
  1771  	args := []string{strconv.Itoa(pid)}
  1772  	for _, idMap := range spec.Linux.UIDMappings {
  1773  		log.Infof("Mapping host uid %d to container uid %d (size=%d)",
  1774  			idMap.HostID, idMap.ContainerID, idMap.Size)
  1775  		args = append(args,
  1776  			strconv.Itoa(int(idMap.ContainerID)),
  1777  			strconv.Itoa(int(idMap.HostID)),
  1778  			strconv.Itoa(int(idMap.Size)),
  1779  		)
  1780  	}
  1781  
  1782  	out, err := exec.Command("newuidmap", args...).CombinedOutput()
  1783  	log.Debugf("newuidmap: %#v\n%s", args, out)
  1784  	if err != nil {
  1785  		return fmt.Errorf("newuidmap failed: %w", err)
  1786  	}
  1787  
  1788  	args = []string{strconv.Itoa(pid)}
  1789  	for _, idMap := range spec.Linux.GIDMappings {
  1790  		log.Infof("Mapping host uid %d to container uid %d (size=%d)",
  1791  			idMap.HostID, idMap.ContainerID, idMap.Size)
  1792  		args = append(args,
  1793  			strconv.Itoa(int(idMap.ContainerID)),
  1794  			strconv.Itoa(int(idMap.HostID)),
  1795  			strconv.Itoa(int(idMap.Size)),
  1796  		)
  1797  	}
  1798  	out, err = exec.Command("newgidmap", args...).CombinedOutput()
  1799  	log.Debugf("newgidmap: %#v\n%s", args, out)
  1800  	if err != nil {
  1801  		return fmt.Errorf("newgidmap failed: %w", err)
  1802  	}
  1803  	return nil
  1804  }
  1805  
  1806  // Mount mounts a filesystem in a container.
  1807  func (s *Sandbox) Mount(cid, fstype, src, dest string) error {
  1808  	var files []*os.File
  1809  	switch fstype {
  1810  	case erofs.Name:
  1811  		if imageFile, err := os.Open(src); err != nil {
  1812  			return fmt.Errorf("opening %s: %v", src, err)
  1813  		} else {
  1814  			files = append(files, imageFile)
  1815  		}
  1816  
  1817  	default:
  1818  		return fmt.Errorf("unsupported filesystem type: %v", fstype)
  1819  	}
  1820  
  1821  	args := boot.MountArgs{
  1822  		ContainerID: cid,
  1823  		Source:      src,
  1824  		Destination: dest,
  1825  		FsType:      fstype,
  1826  		FilePayload: urpc.FilePayload{Files: files},
  1827  	}
  1828  	return s.call(boot.ContMgrMount, &args, nil)
  1829  }
  1830  
  1831  // ContainerRuntimeState returns the runtime state of a container.
  1832  func (s *Sandbox) ContainerRuntimeState(cid string) (boot.ContainerRuntimeState, error) {
  1833  	log.Debugf("ContainerRuntimeState, sandbox: %q, cid: %q", s.ID, cid)
  1834  	var state boot.ContainerRuntimeState
  1835  	if err := s.call(boot.ContMgrContainerRuntimeState, &cid, &state); err != nil {
  1836  		return boot.RuntimeStateInvalid, fmt.Errorf("getting container state (CID: %q): %w", cid, err)
  1837  	}
  1838  	log.Debugf("ContainerRuntimeState, sandbox: %q, cid: %q, state: %v", s.ID, cid, state)
  1839  	return state, nil
  1840  }
  1841  
  1842  func setCloExeOnAllFDs() error {
  1843  	f, err := os.Open("/proc/self/fd")
  1844  	if err != nil {
  1845  		return fmt.Errorf("failed to open /proc/self/fd: %w", err)
  1846  
  1847  	}
  1848  	defer f.Close()
  1849  	for {
  1850  		dents, err := f.Readdirnames(256)
  1851  		if err == io.EOF {
  1852  			break
  1853  		} else if err != nil {
  1854  			return fmt.Errorf("failed to read /proc/self/fd: %w", err)
  1855  		}
  1856  		for _, dent := range dents {
  1857  			fd, err := strconv.Atoi(dent)
  1858  			if err != nil {
  1859  				return fmt.Errorf("failed to convert /proc/self/fd entry %q to int: %w", dent, err)
  1860  			}
  1861  			if fd == int(f.Fd()) {
  1862  				continue
  1863  			}
  1864  			flags, _, errno := unix.RawSyscall(unix.SYS_FCNTL, uintptr(fd), unix.F_GETFD, 0)
  1865  			if errno != 0 {
  1866  				return fmt.Errorf("error getting descriptor flags: %w", errno)
  1867  			}
  1868  			if flags&unix.FD_CLOEXEC != 0 {
  1869  				continue
  1870  			}
  1871  			flags |= unix.FD_CLOEXEC
  1872  			if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, uintptr(fd), unix.F_SETFD, flags); errno != 0 {
  1873  				return fmt.Errorf("error setting CLOEXEC: %w", errno)
  1874  			}
  1875  		}
  1876  	}
  1877  	return nil
  1878  }
  1879  
  1880  var setCloseExecOnce sync.Once
  1881  
  1882  // SetCloExeOnAllFDs sets CLOEXEC on all FDs in /proc/self/fd. This avoids
  1883  // leaking inherited FDs from the parent (caller) to subprocesses created.
  1884  func SetCloExeOnAllFDs() (retErr error) {
  1885  	// Sufficient to do this only once per runsc invocation. Avoid double work.
  1886  	setCloseExecOnce.Do(func() { retErr = setCloExeOnAllFDs() })
  1887  	return
  1888  }