github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/sandbox/sandbox.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package sandbox creates and manipulates sandboxes.
    16  package sandbox
    17  
    18  import (
    19  	"context"
    20  	"fmt"
    21  	"io"
    22  	"math"
    23  	"os"
    24  	"os/exec"
    25  	"strconv"
    26  	"strings"
    27  	"syscall"
    28  	"time"
    29  
    30  	"github.com/SagerNet/gvisor/pkg/cleanup"
    31  	"github.com/SagerNet/gvisor/pkg/control/client"
    32  	"github.com/SagerNet/gvisor/pkg/control/server"
    33  	"github.com/SagerNet/gvisor/pkg/coverage"
    34  	"github.com/SagerNet/gvisor/pkg/log"
    35  	"github.com/SagerNet/gvisor/pkg/sentry/control"
    36  	"github.com/SagerNet/gvisor/pkg/sentry/platform"
    37  	"github.com/SagerNet/gvisor/pkg/sync"
    38  	"github.com/SagerNet/gvisor/pkg/urpc"
    39  	"github.com/SagerNet/gvisor/runsc/boot"
    40  	"github.com/SagerNet/gvisor/runsc/boot/platforms"
    41  	"github.com/SagerNet/gvisor/runsc/cgroup"
    42  	"github.com/SagerNet/gvisor/runsc/config"
    43  	"github.com/SagerNet/gvisor/runsc/console"
    44  	"github.com/SagerNet/gvisor/runsc/specutils"
    45  	"github.com/cenkalti/backoff"
    46  	specs "github.com/opencontainers/runtime-spec/specs-go"
    47  	"github.com/syndtr/gocapability/capability"
    48  	"golang.org/x/sys/unix"
    49  )
    50  
    51  // Sandbox wraps a sandbox process.
    52  //
    53  // It is used to start/stop the sandbox process (and associated processes
    54  // like gofers), as well as to run and manipulate containers inside a
    55  // running sandbox. A minimal usage sketch follows the struct definition.
    56  //
    57  // Note: Sandbox must be immutable because a copy of it is saved for each
    58  // container and changes would not be synchronized to all of them.
    59  type Sandbox struct {
    60  	// ID is the id of the sandbox (immutable). By convention, this is the same
    61  	// ID as the first container run in the sandbox.
    62  	ID string `json:"id"`
    63  
    64  	// Pid is the pid of the running sandbox (immutable). May be 0 if the sandbox
    65  	// is not running.
    66  	Pid int `json:"pid"`
    67  
    68  	// Cgroup has the cgroup configuration for the sandbox.
    69  	Cgroup *cgroup.Cgroup `json:"cgroup"`
    70  
    71  	// OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox
    72  	// started, before it may be modified.
    73  	OriginalOOMScoreAdj int `json:"originalOomScoreAdj"`
    74  
    75  	// child is set if the sandbox process is a child of the current process.
    76  	//
    77  	// This field isn't saved to JSON, because only the creator of the
    78  	// sandbox will have it as a child process.
    79  	child bool
    80  
    81  	// statusMu protects status.
    82  	statusMu sync.Mutex
    83  
    84  	// status is the exit status of the sandbox process. It's only set if
    85  	// child==true and the sandbox was waited on. This field allows multiple
    86  	// threads to wait on the sandbox and get its exit code, since Linux
    87  	// returns the WaitStatus to only one of the waiters.
    88  	status unix.WaitStatus
    89  }
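
        // A minimal usage sketch, assuming a populated config.Config ("conf"), an
        // OCI spec ("spec"), and its bundle directory ("bundleDir"). The gofer
        // plumbing (IOFiles, MountsFile) is omitted for brevity, but a real caller
        // must supply it:
        //
        //	sb, err := sandbox.New(conf, &sandbox.Args{
        //		ID:        "sandbox-1",
        //		Spec:      spec,
        //		BundleDir: bundleDir,
        //	})
        //	if err != nil {
        //		return err
        //	}
        //	if err := sb.StartRoot(spec, conf); err != nil {
        //		return err
        //	}
        //	// The root container shares the sandbox ID.
        //	ws, err := sb.Wait("sandbox-1")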
    90  
    91  // Args is used to configure a new sandbox.
    92  type Args struct {
    93  	// ID is the sandbox unique identifier.
    94  	ID string
    95  
    96  	// Spec is the OCI spec that describes the container.
    97  	Spec *specs.Spec
    98  
    99  	// BundleDir is the directory containing the container bundle.
   100  	BundleDir string
   101  
   102  	// ConsoleSocket is the path to a unix domain socket that will receive
   103  	// the console FD. It may be empty.
   104  	ConsoleSocket string
   105  
   106  	// UserLog is the filename to send user-visible logs to. It may be empty.
   107  	UserLog string
   108  
   109  	// IOFiles is the list of files that connect to a 9P endpoint for the
   110  	// mount points served by gofers. They must be in the same order as the
   111  	// mounts appear in the spec.
   112  	IOFiles []*os.File
   113  
   114  	// MountsFile is a file containing mount information from the spec. It's
   115  	// equivalent to the mounts from the spec, except that all paths have been
   116  	// resolved to their final absolute location.
   117  	MountsFile *os.File
   118  
   119  	// Cgroup is the cgroup that the sandbox is part of.
   120  	Cgroup *cgroup.Cgroup
   121  
   122  	// Attached indicates that the sandbox lifecycle is tied to the caller's:
   123  	// if the caller exits, the sandbox should exit too.
   124  	Attached bool
   125  }
   126  
   127  // New creates the sandbox process. The caller must eventually destroy the
   128  // sandbox, e.g. by destroying its root container (see DestroyContainer).
   129  func New(conf *config.Config, args *Args) (*Sandbox, error) {
   130  	s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
   131  	// The Cleanup object cleans up partially created sandboxes when an error
   132  	// occurs. Any errors occurring during cleanup itself are ignored.
   133  	c := cleanup.Make(func() {
   134  		if err := s.destroy(); err != nil {
   135  			log.Warningf("error destroying sandbox: %v", err)
   136  		}
   137  	})
   138  	defer c.Clean()
   139  
   140  	// Create pipe to synchronize when sandbox process has been booted.
   141  	clientSyncFile, sandboxSyncFile, err := os.Pipe()
   142  	if err != nil {
   143  		return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err)
   144  	}
   145  	defer clientSyncFile.Close()
   146  
   147  	// Create the sandbox process.
   148  	err = s.createSandboxProcess(conf, args, sandboxSyncFile)
   149  	// sandboxSyncFile has to be closed to be able to detect when the sandbox
   150  	// process exits unexpectedly.
   151  	sandboxSyncFile.Close()
   152  	if err != nil {
   153  		return nil, err
   154  	}
   155  
   156  	// Wait until the sandbox has booted.
   157  	b := make([]byte, 1)
   158  	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
   159  		err := fmt.Errorf("waiting for sandbox to start: %v", err)
   160  		// If the sandbox failed to start, it may be because the binary
   161  		// permissions were incorrect. Check the bits and return a more helpful
   162  		// error message.
   163  		//
   164  		// NOTE: The error message is checked because error types are lost over
   165  		// rpc calls.
   166  		if strings.Contains(err.Error(), io.EOF.Error()) {
   167  			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
   168  				return nil, fmt.Errorf("%v: %v", err, permsErr)
   169  			}
   170  		}
   171  		return nil, err
   172  	}
   173  
   174  	c.Release()
   175  	return s, nil
   176  }
   177  
   178  // CreateContainer creates a non-root container inside the sandbox.
   179  func (s *Sandbox) CreateContainer(cid string, tty *os.File) error {
   180  	log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
   181  	sandboxConn, err := s.sandboxConnect()
   182  	if err != nil {
   183  		return fmt.Errorf("couldn't connect to sandbox: %v", err)
   184  	}
   185  	defer sandboxConn.Close()
   186  
   187  	var files []*os.File
   188  	if tty != nil {
   189  		files = []*os.File{tty}
   190  	}
   191  
   192  	args := boot.CreateArgs{
   193  		CID:         cid,
   194  		FilePayload: urpc.FilePayload{Files: files},
   195  	}
   196  	if err := sandboxConn.Call(boot.ContainerCreate, &args, nil); err != nil {
   197  		return fmt.Errorf("creating non-root container %q: %v", cid, err)
   198  	}
   199  	return nil
   200  }
   201  
   202  // StartRoot starts running the root container process inside the sandbox.
   203  func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error {
   204  	log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid)
   205  	conn, err := s.sandboxConnect()
   206  	if err != nil {
   207  		return err
   208  	}
   209  	defer conn.Close()
   210  
   211  	// Configure the network.
   212  	if err := setupNetwork(conn, s.Pid, spec, conf); err != nil {
   213  		return fmt.Errorf("setting up network: %v", err)
   214  	}
   215  
   216  	// Send a message to the sandbox control server to start the root
   217  	// container.
   218  	if err := conn.Call(boot.RootContainerStart, &s.ID, nil); err != nil {
   219  		return fmt.Errorf("starting root container: %v", err)
   220  	}
   221  
   222  	return nil
   223  }
   224  
   225  // StartContainer starts running a non-root container inside the sandbox.
   226  func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles []*os.File) error {
   227  	log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
   228  	sandboxConn, err := s.sandboxConnect()
   229  	if err != nil {
   230  		return fmt.Errorf("couldn't connect to sandbox: %v", err)
   231  	}
   232  	defer sandboxConn.Close()
   233  
   234  	// The payload must contain stdin/stdout/stderr (which may be empty when
   235  	// using a TTY) followed by the gofer files, in that exact order.
   236  	payload := urpc.FilePayload{}
   237  	payload.Files = append(payload.Files, stdios...)
   238  	payload.Files = append(payload.Files, goferFiles...)
   239  
   240  	// Start running the container.
   241  	args := boot.StartArgs{
   242  		Spec:        spec,
   243  		Conf:        conf,
   244  		CID:         cid,
   245  		FilePayload: payload,
   246  	}
   247  	if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil {
   248  		return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err)
   249  	}
   250  	return nil
   251  }
   252  
   253  // Restore sends the restore call for a container in the sandbox.
   254  func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *config.Config, filename string) error {
   255  	log.Debugf("Restore sandbox %q", s.ID)
   256  
   257  	rf, err := os.Open(filename)
   258  	if err != nil {
   259  		return fmt.Errorf("opening restore file %q failed: %v", filename, err)
   260  	}
   261  	defer rf.Close()
   262  
   263  	opt := boot.RestoreOpts{
   264  		FilePayload: urpc.FilePayload{
   265  			Files: []*os.File{rf},
   266  		},
   267  		SandboxID: s.ID,
   268  	}
   269  
   270  	// If the platform needs a device FD we must pass it in.
   271  	if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil {
   272  		return err
   273  	} else if deviceFile != nil {
   274  		defer deviceFile.Close()
   275  		opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile)
   276  	}
   277  
   278  	conn, err := s.sandboxConnect()
   279  	if err != nil {
   280  		return err
   281  	}
   282  	defer conn.Close()
   283  
   284  	// Configure the network.
   285  	if err := setupNetwork(conn, s.Pid, spec, conf); err != nil {
   286  		return fmt.Errorf("setting up network: %v", err)
   287  	}
   288  
   289  	// Restore the container and start the root container.
   290  	if err := conn.Call(boot.ContainerRestore, &opt, nil); err != nil {
   291  		return fmt.Errorf("restoring container %q: %v", cid, err)
   292  	}
   293  
   294  	return nil
   295  }
   296  
   297  // Processes retrieves the list of processes and associated metadata for a
   298  // given container in this sandbox.
   299  func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
   300  	log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID)
   301  	conn, err := s.sandboxConnect()
   302  	if err != nil {
   303  		return nil, err
   304  	}
   305  	defer conn.Close()
   306  
   307  	var pl []*control.Process
   308  	if err := conn.Call(boot.ContainerProcesses, &cid, &pl); err != nil {
   309  		return nil, fmt.Errorf("retrieving process data from sandbox: %v", err)
   310  	}
   311  	return pl, nil
   312  }
   313  
   314  // NewCGroup returns the sandbox's Cgroup, as discovered from its PID, or an error if it does not have one.
   315  func (s *Sandbox) NewCGroup() (*cgroup.Cgroup, error) {
   316  	return cgroup.NewFromPid(s.Pid)
   317  }
   318  
   319  // Execute runs the specified command in the container. It returns the PID of
   320  // the newly created process.
   321  func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) {
   322  	log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID)
   323  	conn, err := s.sandboxConnect()
   324  	if err != nil {
   325  		return 0, s.connError(err)
   326  	}
   327  	defer conn.Close()
   328  
   329  	// Send a message to the sandbox control server to start the container.
   330  	var pid int32
   331  	if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil {
   332  		return 0, fmt.Errorf("executing command %q in sandbox: %v", args, err)
   333  	}
   334  	return pid, nil
   335  }
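
        // A hedged sketch of driving Execute. control.ExecArgs (defined in
        // pkg/sentry/control) has more fields than shown; the values here are
        // illustrative only:
        //
        //	pid, err := sb.Execute(&control.ExecArgs{
        //		Argv:             []string{"/bin/sleep", "5"},
        //		WorkingDirectory: "/",
        //		ContainerID:      cid,
        //	})
        //	if err != nil {
        //		return err
        //	}
        //	// Reap the process with WaitPID (defined below).
        //	ws, err := sb.WaitPID(cid, pid)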
   336  
   337  // Event retrieves stats about the sandbox such as memory and CPU utilization.
   338  func (s *Sandbox) Event(cid string) (*boot.EventOut, error) {
   339  	log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID)
   340  	conn, err := s.sandboxConnect()
   341  	if err != nil {
   342  		return nil, err
   343  	}
   344  	defer conn.Close()
   345  
   346  	var e boot.EventOut
   347  	// TODO(b/129292330): Pass in the container id (cid) here. The sandbox
   348  	// should return events only for that container.
   349  	if err := conn.Call(boot.ContainerEvent, nil, &e); err != nil {
   350  		return nil, fmt.Errorf("retrieving event data from sandbox: %v", err)
   351  	}
   352  	e.Event.ID = cid
   353  	return &e, nil
   354  }
   355  
   356  func (s *Sandbox) sandboxConnect() (*urpc.Client, error) {
   357  	log.Debugf("Connecting to sandbox %q", s.ID)
   358  	conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID))
   359  	if err != nil {
   360  		return nil, s.connError(err)
   361  	}
   362  	return conn, nil
   363  }
   364  
   365  func (s *Sandbox) connError(err error) error {
   366  	return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid, err)
   367  }
   368  
   369  // createSandboxProcess starts the sandbox as a subprocess by running the "boot"
   370  // command, passing in the bundle dir.
   371  func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
   372  	// nextFD is used to get unused FDs that we can pass to the sandbox.  It
   373  	// starts at 3 because 0, 1, and 2 are taken by stdin/out/err.
   374  	nextFD := 3
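        	// Note: os/exec guarantees that entry i of cmd.ExtraFiles becomes FD
        	// 3+i in the child, so every append to ExtraFiles below is paired with
        	// a nextFD++ to keep the flag values in sync with the donated files.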
   375  
   376  	binPath := specutils.ExePath
   377  	cmd := exec.Command(binPath, conf.ToFlags()...)
   378  	cmd.SysProcAttr = &unix.SysProcAttr{}
   379  
   380  	// Open the log files to pass to the sandbox as FDs.
   381  	//
   382  	// These flags must come BEFORE the "boot" command in cmd.Args.
   383  	if conf.LogFilename != "" {
   384  		logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
   385  		if err != nil {
   386  			return fmt.Errorf("opening log file %q: %v", conf.LogFilename, err)
   387  		}
   388  		defer logFile.Close()
   389  		cmd.ExtraFiles = append(cmd.ExtraFiles, logFile)
   390  		cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD))
   391  		nextFD++
   392  	}
   393  
   394  	test := ""
   395  	if len(conf.TestOnlyTestNameEnv) != 0 {
   396  		// Fetch test name if one is provided and the test only flag was set.
   397  		if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
   398  			test = t
   399  		}
   400  	}
   401  	if conf.DebugLog != "" {
   402  		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot", test)
   403  		if err != nil {
   404  			return fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
   405  		}
   406  		defer debugLogFile.Close()
   407  		cmd.ExtraFiles = append(cmd.ExtraFiles, debugLogFile)
   408  		cmd.Args = append(cmd.Args, "--debug-log-fd="+strconv.Itoa(nextFD))
   409  		nextFD++
   410  	}
   411  	if conf.PanicLog != "" {
   412  		panicLogFile, err := specutils.DebugLogFile(conf.PanicLog, "panic", test)
   413  		if err != nil {
   414  			return fmt.Errorf("opening panic log file in %q: %v", conf.PanicLog, err)
   415  		}
   416  		defer panicLogFile.Close()
   417  		cmd.ExtraFiles = append(cmd.ExtraFiles, panicLogFile)
   418  		cmd.Args = append(cmd.Args, "--panic-log-fd="+strconv.Itoa(nextFD))
   419  		nextFD++
   420  	}
   421  	covFilename := conf.CoverageReport
   422  	if covFilename == "" {
   423  		covFilename = os.Getenv("GO_COVERAGE_FILE")
   424  	}
   425  	if covFilename != "" && coverage.Available() {
   426  		covFile, err := specutils.DebugLogFile(covFilename, "cov", test)
   427  		if err != nil {
   428  			return fmt.Errorf("opening debug log file in %q: %v", covFilename, err)
   429  		}
   430  		defer covFile.Close()
   431  		cmd.ExtraFiles = append(cmd.ExtraFiles, covFile)
   432  		cmd.Args = append(cmd.Args, "--coverage-fd="+strconv.Itoa(nextFD))
   433  		nextFD++
   434  	}
   435  
   436  	// Add the "boot" command to the args.
   437  	//
   438  	// All flags after this must be for the boot command.
   439  	cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir)
   440  
   441  	// Create a socket for the control server and donate it to the sandbox.
   442  	addr := boot.ControlSocketAddr(s.ID)
   443  	sockFD, err := server.CreateSocket(addr)
   444  	log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip the leading "\x00".
   445  	if err != nil {
   446  		return fmt.Errorf("creating control server socket for sandbox %q: %v", s.ID, err)
   447  	}
   448  	controllerFile := os.NewFile(uintptr(sockFD), "control_server_socket")
   449  	defer controllerFile.Close()
   450  	cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile)
   451  	cmd.Args = append(cmd.Args, "--controller-fd="+strconv.Itoa(nextFD))
   452  	nextFD++
   453  
   454  	defer args.MountsFile.Close()
   455  	cmd.ExtraFiles = append(cmd.ExtraFiles, args.MountsFile)
   456  	cmd.Args = append(cmd.Args, "--mounts-fd="+strconv.Itoa(nextFD))
   457  	nextFD++
   458  
   459  	specFile, err := specutils.OpenSpec(args.BundleDir)
   460  	if err != nil {
   461  		return err
   462  	}
   463  	defer specFile.Close()
   464  	cmd.ExtraFiles = append(cmd.ExtraFiles, specFile)
   465  	cmd.Args = append(cmd.Args, "--spec-fd="+strconv.Itoa(nextFD))
   466  	nextFD++
   467  
   468  	cmd.ExtraFiles = append(cmd.ExtraFiles, startSyncFile)
   469  	cmd.Args = append(cmd.Args, "--start-sync-fd="+strconv.Itoa(nextFD))
   470  	nextFD++
   471  
   472  	// If there are gofers, send all socket ends to the sandbox.
   473  	for _, f := range args.IOFiles {
   474  		defer f.Close()
   475  		cmd.ExtraFiles = append(cmd.ExtraFiles, f)
   476  		cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD))
   477  		nextFD++
   478  	}
   479  
   480  	gPlatform, err := platform.Lookup(conf.Platform)
   481  	if err != nil {
   482  		return err
   483  	}
   484  
   485  	if deviceFile, err := gPlatform.OpenDevice(); err != nil {
   486  		return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err)
   487  	} else if deviceFile != nil {
   488  		defer deviceFile.Close()
   489  		cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile)
   490  		cmd.Args = append(cmd.Args, "--device-fd="+strconv.Itoa(nextFD))
   491  		nextFD++
   492  	}
   493  
   494  	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
   495  	// isn't set.
   496  	if conf.Platform == "kvm" {
   497  		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
   498  	}
   499  
   500  	// The current process' stdio must be passed to the application via the
   501  	// --stdio-fds flag. The stdio of the sandbox process itself must not
   502  	// be connected to the same FDs, otherwise we risk leaking sandbox
   503  	// errors to the application, so we set the sandbox stdio to nil,
   504  	// causing it to read from and write to the null device.
   505  	cmd.Stdin = nil
   506  	cmd.Stdout = nil
   507  	cmd.Stderr = nil
   508  
   509  	// If the console control socket file is provided, then create a new
   510  	// pty master/replica pair and set the TTY on the sandbox process.
   511  	if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
   512  		// console.NewWithSocket will send the master on the given
   513  		// socket, and return the replica.
   514  		tty, err := console.NewWithSocket(args.ConsoleSocket)
   515  		if err != nil {
   516  			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
   517  		}
   518  		defer tty.Close()
   519  
   520  		// Set the TTY as a controlling TTY on the sandbox process.
   521  		cmd.SysProcAttr.Setctty = true
   522  		// The Ctty FD must be the FD in the child process's FD table,
   523  		// which will be nextFD in this case.
   524  		// See https://github.com/golang/go/issues/29458.
   525  		cmd.SysProcAttr.Ctty = nextFD
   526  
   527  		// Pass the tty as all stdio fds to sandbox.
   528  		for i := 0; i < 3; i++ {
   529  			cmd.ExtraFiles = append(cmd.ExtraFiles, tty)
   530  			cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD))
   531  			nextFD++
   532  		}
   533  
   534  		if conf.Debug {
   535  			// If debugging, send the boot process stdio to the
   536  			// TTY, so that it is easier to find.
   537  			cmd.Stdin = tty
   538  			cmd.Stdout = tty
   539  			cmd.Stderr = tty
   540  		}
   541  	} else {
   542  		// If not using a console, pass our current stdio as the
   543  		// container stdio via flags.
   544  		for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
   545  			cmd.ExtraFiles = append(cmd.ExtraFiles, f)
   546  			cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD))
   547  			nextFD++
   548  		}
   549  
   550  		if conf.Debug {
   551  			// If debugging, send the boot process stdio to this
   552  			// process' stdio, so that it is easier to find.
   553  			cmd.Stdin = os.Stdin
   554  			cmd.Stdout = os.Stdout
   555  			cmd.Stderr = os.Stderr
   556  		}
   557  	}
   558  
   559  	// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
   560  	// when re-parented.
   561  	cmd.SysProcAttr.Setsid = true
   562  
   563  	// nss is the set of namespaces to join or create before starting the sandbox
   564  	// process. Mount, IPC and UTS namespaces from the host are not used as they
   565  	// are virtualized inside the sandbox. Be paranoid and run inside empty
   566  	// namespaces for these. Don't unshare the cgroup namespace because the
   567  	// sandbox is added to a cgroup in the caller's namespace.
   568  	log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces")
   569  	nss := []specs.LinuxNamespace{
   570  		{Type: specs.IPCNamespace},
   571  		{Type: specs.MountNamespace},
   572  		{Type: specs.UTSNamespace},
   573  	}
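        	// This is roughly the isolation `unshare --ipc --mount --uts` would
        	// provide; the network, PID and user namespaces are decided below based
        	// on the configuration.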
   574  
   575  	if gPlatform.Requirements().RequiresCurrentPIDNS {
   576  		// TODO(b/75837838): Also set a new PID namespace so that we limit
   577  		// access to other host processes.
   578  		log.Infof("Sandbox will be started in the current PID namespace")
   579  	} else {
   580  		log.Infof("Sandbox will be started in a new PID namespace")
   581  		nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
   582  		cmd.Args = append(cmd.Args, "--pidns=true")
   583  	}
   584  
   585  	// Join the container's network namespace if network is enabled. The
   586  	// sandbox talks directly to the host network, which may have been
   587  	// configured in that namespace.
   588  	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
   589  		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
   590  		nss = append(nss, ns)
   591  	} else if conf.Network == config.NetworkHost {
   592  		log.Infof("Sandbox will be started in the host network namespace")
   593  	} else {
   594  		log.Infof("Sandbox will be started in new network namespace")
   595  		nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
   596  	}
   597  
   598  	// The user namespace depends on the network type. Host network requires
   599  	// the sandbox to run inside the user namespace specified in the spec, or
   600  	// in the current namespace if none is configured.
   601  	if conf.Network == config.NetworkHost {
   602  		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
   603  			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
   604  			nss = append(nss, userns)
   605  			specutils.SetUIDGIDMappings(cmd, args.Spec)
   606  		} else {
   607  			log.Infof("Sandbox will be started in the current user namespace")
   608  		}
   609  		// When running in the caller's defined user namespace, apply the same
   610  		// capabilities to the sandbox process to ensure it abides by the same
   611  		// rules.
   612  		cmd.Args = append(cmd.Args, "--apply-caps=true")
   613  
   614  		// If we have CAP_SYS_ADMIN, we can create an empty chroot and
   615  		// bind-mount the executable inside it.
   616  		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   617  			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
   618  
   619  		} else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) {
   620  			log.Infof("Sandbox will be started in minimal chroot")
   621  			cmd.Args = append(cmd.Args, "--setup-root")
   622  		} else {
   623  			return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN")
   624  		}
   625  	} else {
   626  		// If we have CAP_SETUID and CAP_SETGID, then we can also run
   627  		// as user nobody.
   628  		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   629  			log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
   630  			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
   631  		} else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
   632  			log.Infof("Sandbox will be started in new user namespace")
   633  			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
   634  			cmd.Args = append(cmd.Args, "--setup-root")
   635  
   636  			const nobody = 65534
   637  			if conf.Rootless {
   638  				log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
   639  				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
   640  					{
   641  						ContainerID: nobody,
   642  						HostID:      os.Getuid(),
   643  						Size:        1,
   644  					},
   645  				}
   646  				cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
   647  					{
   648  						ContainerID: nobody,
   649  						HostID:      os.Getgid(),
   650  						Size:        1,
   651  					},
   652  				}
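        				// With Size 1, the resulting /proc/<pid>/uid_map (and gid_map)
        				// holds a single "65534 <current id> 1" entry: only nobody is
        				// defined inside the namespace, backed by the invoking user.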
   653  
   654  			} else {
   655  				// Map nobody in the new namespace to nobody in the parent namespace.
   656  				//
   657  				// The sandbox process will construct an empty
   658  				// root for itself, so it needs the
   659  				// CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities.
   660  				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
   661  					{
   662  						ContainerID: nobody,
   663  						HostID:      nobody,
   664  						Size:        1,
   665  					},
   666  				}
   667  				cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
   668  					{
   669  						ContainerID: nobody,
   670  						HostID:      nobody,
   671  						Size:        1,
   672  					},
   673  				}
   674  			}
   675  
   676  			// Set credentials to run as user and group nobody.
   677  			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody}
   678  			cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, uintptr(capability.CAP_SYS_ADMIN), uintptr(capability.CAP_SYS_CHROOT))
   679  		} else {
   680  			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
   681  		}
   682  	}
   683  
   684  	cmd.Args[0] = "runsc-sandbox"
   685  
   686  	if s.Cgroup != nil {
   687  		cpuNum, err := s.Cgroup.NumCPU()
   688  		if err != nil {
   689  			return fmt.Errorf("getting cpu count from cgroups: %v", err)
   690  		}
   691  		if conf.CPUNumFromQuota {
   692  			// Dropping below 2 CPUs can trigger the application to disable
   693  			// locks, which can lead to hard-to-debug errors, so just leave
   694  			// two cores as a reasonable default.
   695  			const minCPUs = 2
   696  
   697  			quota, err := s.Cgroup.CPUQuota()
   698  			if err != nil {
   699  				return fmt.Errorf("getting cpu quota from cgroups: %v", err)
   700  			}
   701  			if n := int(math.Ceil(quota)); n > 0 {
   702  				if n < minCPUs {
   703  					n = minCPUs
   704  				}
   705  				if n < cpuNum {
   706  					// Only lower the cpu number.
   707  					cpuNum = n
   708  				}
   709  			}
   710  		}
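        		// For example, a quota of 0.5 rounds up to 1 and is then raised to
        		// minCPUs; a quota of 8 on a 4-CPU cgroup leaves cpuNum at 4, since
        		// the quota may only lower the count.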
   711  		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
   712  
   713  		mem, err := s.Cgroup.MemoryLimit()
   714  		if err != nil {
   715  			return fmt.Errorf("getting memory limit from cgroups: %v", err)
   716  		}
   717  		// When the memory limit is unset, cgroups reports a huge sentinel value
   718  		// (LONG_MAX rounded down to the page size); in that case, keep the default.
   719  		if mem < 0x7ffffffffffff000 {
   720  			cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10))
   721  		}
   722  	}
   723  
   724  	if args.UserLog != "" {
   725  		f, err := os.OpenFile(args.UserLog, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
   726  		if err != nil {
   727  			return fmt.Errorf("opening compat log file: %v", err)
   728  		}
   729  		defer f.Close()
   730  
   731  		cmd.ExtraFiles = append(cmd.ExtraFiles, f)
   732  		cmd.Args = append(cmd.Args, "--user-log-fd", strconv.Itoa(nextFD))
   733  		nextFD++
   734  	}
   735  
   736  	_ = nextFD // All FD assignment is finished.
   737  
   738  	if args.Attached {
   739  		// Kill sandbox if parent process exits in attached mode.
   740  		cmd.SysProcAttr.Pdeathsig = unix.SIGKILL
   741  		// Tells boot that any process it creates must have pdeathsig set.
   742  		cmd.Args = append(cmd.Args, "--attached")
   743  	}
   744  
   745  	// Add the sandbox ID as the last argument.
   746  	cmd.Args = append(cmd.Args, s.ID)
   747  
   748  	// Log the FDs we are donating to the sandbox process.
   749  	for i, f := range cmd.ExtraFiles {
   750  		log.Debugf("Donating FD %d: %q", i+3, f.Name())
   751  	}
   752  
   753  	log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args)
   754  	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
   755  	if err := specutils.StartInNS(cmd, nss); err != nil {
   756  		err := fmt.Errorf("starting sandbox: %v", err)
   757  		// If the sandbox failed to start, it may be because the binary
   758  		// permissions were incorrect. Check the bits and return a more helpful
   759  		// error message.
   760  		//
   761  		// NOTE: The error message is checked because error types are lost over
   762  		// rpc calls.
   763  		if strings.Contains(err.Error(), unix.EACCES.Error()) {
   764  			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
   765  				return fmt.Errorf("%v: %v", err, permsErr)
   766  			}
   767  		}
   768  		return err
   769  	}
   770  	s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid)
   771  	if err != nil {
   772  		return err
   773  	}
   774  
   775  	s.child = true
   776  	s.Pid = cmd.Process.Pid
   777  	log.Infof("Sandbox started, PID: %d", s.Pid)
   778  
   779  	return nil
   780  }
   781  
   782  // Wait waits for the containerized process to exit, and returns its WaitStatus.
   783  func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) {
   784  	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
   785  
   786  	if conn, err := s.sandboxConnect(); err != nil {
   787  		// The sandbox may have exited before we had a chance to wait on it.
   788  		// There is nothing we can do for subcontainers. For the init container, we
   789  		// can try to get the sandbox exit code.
   790  		if !s.IsRootContainer(cid) {
   791  			return unix.WaitStatus(0), err
   792  		}
   793  		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
   794  	} else {
   795  		defer conn.Close()
   796  
   797  		// Try the Wait RPC to the sandbox.
   798  		var ws unix.WaitStatus
   799  		err = conn.Call(boot.ContainerWait, &cid, &ws)
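        		// The RPC is done; close the connection now rather than relying on
        		// the deferred Close above, since waitForStopped below may block
        		// until the sandbox process has fully exited.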
   800  		conn.Close()
   801  		if err == nil {
   802  			if s.IsRootContainer(cid) {
   803  				if err := s.waitForStopped(); err != nil {
   804  					return unix.WaitStatus(0), err
   805  				}
   806  			}
   807  			// It worked!
   808  			return ws, nil
   809  		}
   810  		// See comment above.
   811  		if !s.IsRootContainer(cid) {
   812  			return unix.WaitStatus(0), err
   813  		}
   814  
   815  		// The sandbox may have exited after we connected, but before
   816  		// or during the Wait RPC.
   817  		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
   818  	}
   819  
   820  	// The sandbox may have already exited, or exited while handling the Wait RPC.
   821  	// The best we can do is ask Linux what the sandbox exit status was, since in
   822  	// most cases that will be the same as the container exit status.
   823  	if err := s.waitForStopped(); err != nil {
   824  		return unix.WaitStatus(0), err
   825  	}
   826  	if !s.child {
   827  		return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
   828  	}
   829  
   830  	s.statusMu.Lock()
   831  	defer s.statusMu.Unlock()
   832  	return s.status, nil
   833  }
   834  
   835  // WaitPID waits for process 'pid' in the container's sandbox and returns its
   836  // WaitStatus.
   837  func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) {
   838  	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
   839  	var ws unix.WaitStatus
   840  	conn, err := s.sandboxConnect()
   841  	if err != nil {
   842  		return ws, err
   843  	}
   844  	defer conn.Close()
   845  
   846  	args := &boot.WaitPIDArgs{
   847  		PID: pid,
   848  		CID: cid,
   849  	}
   850  	if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil {
   851  		return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err)
   852  	}
   853  	return ws, nil
   854  }
   855  
   856  // IsRootContainer returns true if the specified container ID belongs to the
   857  // root container.
   858  func (s *Sandbox) IsRootContainer(cid string) bool {
   859  	return s.ID == cid
   860  }
   861  
   862  // destroy frees all resources associated with the sandbox. It fails fast and
   863  // is idempotent.
   864  func (s *Sandbox) destroy() error {
   865  	log.Debugf("Destroy sandbox %q", s.ID)
   866  	if s.Pid != 0 {
   867  		log.Debugf("Killing sandbox %q", s.ID)
   868  		if err := unix.Kill(s.Pid, unix.SIGKILL); err != nil && err != unix.ESRCH {
   869  			return fmt.Errorf("killing sandbox %q PID %d: %v", s.ID, s.Pid, err)
   870  		}
   871  		if err := s.waitForStopped(); err != nil {
   872  			return fmt.Errorf("waiting for sandbox %q to stop: %v", s.ID, err)
   873  		}
   874  	}
   875  
   876  	return nil
   877  }
   878  
   879  // SignalContainer sends the signal to a container in the sandbox. If all is
   880  // true and signal is SIGKILL, then waits for all processes to exit before
   881  // returning.
   882  func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error {
   883  	log.Debugf("Signal sandbox %q", s.ID)
   884  	conn, err := s.sandboxConnect()
   885  	if err != nil {
   886  		return err
   887  	}
   888  	defer conn.Close()
   889  
   890  	mode := boot.DeliverToProcess
   891  	if all {
   892  		mode = boot.DeliverToAllProcesses
   893  	}
   894  
   895  	args := boot.SignalArgs{
   896  		CID:   cid,
   897  		Signo: int32(sig),
   898  		Mode:  mode,
   899  	}
   900  	if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil {
   901  		return fmt.Errorf("signaling container %q: %v", cid, err)
   902  	}
   903  	return nil
   904  }
   905  
   906  // SignalProcess sends the signal to a particular process in the container. If
   907  // fgProcess is true, then the signal is sent to the foreground process group
   908  // in the same session that PID belongs to. This is only valid if the process
   909  // is attached to a host TTY.
   910  func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error {
   911  	log.Debugf("Signal sandbox %q", s.ID)
   912  	conn, err := s.sandboxConnect()
   913  	if err != nil {
   914  		return err
   915  	}
   916  	defer conn.Close()
   917  
   918  	mode := boot.DeliverToProcess
   919  	if fgProcess {
   920  		mode = boot.DeliverToForegroundProcessGroup
   921  	}
   922  
   923  	args := boot.SignalArgs{
   924  		CID:   cid,
   925  		Signo: int32(sig),
   926  		PID:   pid,
   927  		Mode:  mode,
   928  	}
   929  	if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil {
   930  		return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err)
   931  	}
   932  	return nil
   933  }
   934  
   935  // Checkpoint sends the checkpoint call for a container in the sandbox.
   936  // The statefile will be written to f.
   937  func (s *Sandbox) Checkpoint(cid string, f *os.File) error {
   938  	log.Debugf("Checkpoint sandbox %q", s.ID)
   939  	conn, err := s.sandboxConnect()
   940  	if err != nil {
   941  		return err
   942  	}
   943  	defer conn.Close()
   944  
   945  	opt := control.SaveOpts{
   946  		FilePayload: urpc.FilePayload{
   947  			Files: []*os.File{f},
   948  		},
   949  	}
   950  
   951  	if err := conn.Call(boot.ContainerCheckpoint, &opt, nil); err != nil {
   952  		return fmt.Errorf("checkpointing container %q: %v", cid, err)
   953  	}
   954  	return nil
   955  }
   956  
   957  // Pause sends the pause call for a container in the sandbox.
   958  func (s *Sandbox) Pause(cid string) error {
   959  	log.Debugf("Pause sandbox %q", s.ID)
   960  	conn, err := s.sandboxConnect()
   961  	if err != nil {
   962  		return err
   963  	}
   964  	defer conn.Close()
   965  
   966  	if err := conn.Call(boot.ContainerPause, nil, nil); err != nil {
   967  		return fmt.Errorf("pausing container %q: %v", cid, err)
   968  	}
   969  	return nil
   970  }
   971  
   972  // Resume sends the resume call for a container in the sandbox.
   973  func (s *Sandbox) Resume(cid string) error {
   974  	log.Debugf("Resume sandbox %q", s.ID)
   975  	conn, err := s.sandboxConnect()
   976  	if err != nil {
   977  		return err
   978  	}
   979  	defer conn.Close()
   980  
   981  	if err := conn.Call(boot.ContainerResume, nil, nil); err != nil {
   982  		return fmt.Errorf("resuming container %q: %v", cid, err)
   983  	}
   984  	return nil
   985  }
   986  
   987  // IsRunning returns true if the sandbox or gofer process is running.
   988  func (s *Sandbox) IsRunning() bool {
   989  	if s.Pid != 0 {
   990  		// Send signal 0 to the sandbox process to check whether it is still alive.
   991  		if err := unix.Kill(s.Pid, 0); err == nil {
   992  			// Succeeded, process is running.
   993  			return true
   994  		}
   995  	}
   996  	return false
   997  }
   998  
   999  // Stacks collects and returns all stacks for the sandbox.
  1000  func (s *Sandbox) Stacks() (string, error) {
  1001  	log.Debugf("Stacks sandbox %q", s.ID)
  1002  	conn, err := s.sandboxConnect()
  1003  	if err != nil {
  1004  		return "", err
  1005  	}
  1006  	defer conn.Close()
  1007  
  1008  	var stacks string
  1009  	if err := conn.Call(boot.SandboxStacks, nil, &stacks); err != nil {
  1010  		return "", fmt.Errorf("getting sandbox %q stacks: %v", s.ID, err)
  1011  	}
  1012  	return stacks, nil
  1013  }
  1014  
  1015  // HeapProfile writes a heap profile to the given file.
  1016  func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error {
  1017  	log.Debugf("Heap profile %q", s.ID)
  1018  	conn, err := s.sandboxConnect()
  1019  	if err != nil {
  1020  		return err
  1021  	}
  1022  	defer conn.Close()
  1023  
  1024  	opts := control.HeapProfileOpts{
  1025  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1026  		Delay:       delay,
  1027  	}
  1028  	return conn.Call(boot.HeapProfile, &opts, nil)
  1029  }
  1030  
  1031  // CPUProfile collects a CPU profile.
  1032  func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error {
  1033  	log.Debugf("CPU profile %q", s.ID)
  1034  	conn, err := s.sandboxConnect()
  1035  	if err != nil {
  1036  		return err
  1037  	}
  1038  	defer conn.Close()
  1039  
  1040  	opts := control.CPUProfileOpts{
  1041  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1042  		Duration:    duration,
  1043  	}
  1044  	return conn.Call(boot.CPUProfile, &opts, nil)
  1045  }
  1046  
  1047  // BlockProfile writes a block profile to the given file.
  1048  func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error {
  1049  	log.Debugf("Block profile %q", s.ID)
  1050  	conn, err := s.sandboxConnect()
  1051  	if err != nil {
  1052  		return err
  1053  	}
  1054  	defer conn.Close()
  1055  
  1056  	opts := control.BlockProfileOpts{
  1057  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1058  		Duration:    duration,
  1059  	}
  1060  	return conn.Call(boot.BlockProfile, &opts, nil)
  1061  }
  1062  
  1063  // MutexProfile writes a mutex profile to the given file.
  1064  func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error {
  1065  	log.Debugf("Mutex profile %q", s.ID)
  1066  	conn, err := s.sandboxConnect()
  1067  	if err != nil {
  1068  		return err
  1069  	}
  1070  	defer conn.Close()
  1071  
  1072  	opts := control.MutexProfileOpts{
  1073  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1074  		Duration:    duration,
  1075  	}
  1076  	return conn.Call(boot.MutexProfile, &opts, nil)
  1077  }
  1078  
  1079  // Trace collects an execution trace.
  1080  func (s *Sandbox) Trace(f *os.File, duration time.Duration) error {
  1081  	log.Debugf("Trace %q", s.ID)
  1082  	conn, err := s.sandboxConnect()
  1083  	if err != nil {
  1084  		return err
  1085  	}
  1086  	defer conn.Close()
  1087  
  1088  	opts := control.TraceProfileOpts{
  1089  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1090  		Duration:    duration,
  1091  	}
  1092  	return conn.Call(boot.Trace, &opts, nil)
  1093  }
  1094  
  1095  // ChangeLogging changes logging options.
  1096  func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
  1097  	log.Debugf("Change logging start %q", s.ID)
  1098  	conn, err := s.sandboxConnect()
  1099  	if err != nil {
  1100  		return err
  1101  	}
  1102  	defer conn.Close()
  1103  
  1104  	if err := conn.Call(boot.ChangeLogging, &args, nil); err != nil {
  1105  		return fmt.Errorf("changing sandbox %q logging: %v", s.ID, err)
  1106  	}
  1107  	return nil
  1108  }
  1109  
  1110  // DestroyContainer destroys the given container. If it is the root container,
  1111  // then the entire sandbox is destroyed.
  1112  func (s *Sandbox) DestroyContainer(cid string) error {
  1113  	if err := s.destroyContainer(cid); err != nil {
  1114  		// If the sandbox isn't running, the container has already been
  1115  		// destroyed; ignore the error in this case.
  1116  		if s.IsRunning() {
  1117  			return err
  1118  		}
  1119  	}
  1120  	return nil
  1121  }
  1122  
  1123  func (s *Sandbox) destroyContainer(cid string) error {
  1124  	if s.IsRootContainer(cid) {
  1125  		log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid)
  1126  		return s.destroy()
  1127  	}
  1128  
  1129  	log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID)
  1130  	conn, err := s.sandboxConnect()
  1131  	if err != nil {
  1132  		return err
  1133  	}
  1134  	defer conn.Close()
  1135  	if err := conn.Call(boot.ContainerDestroy, &cid, nil); err != nil {
  1136  		return fmt.Errorf("destroying container %q: %v", cid, err)
  1137  	}
  1138  	return nil
  1139  }
  1140  
  1141  func (s *Sandbox) waitForStopped() error {
  1142  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  1143  	defer cancel()
  1144  	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
  1145  	op := func() error {
  1146  		if s.child {
  1147  			s.statusMu.Lock()
  1148  			defer s.statusMu.Unlock()
  1149  			if s.Pid == 0 {
  1150  				return nil
  1151  			}
  1152  			// The sandbox process is a child of the current process,
  1153  			// so we can wait on it and collect its zombie.
  1154  			wpid, err := unix.Wait4(int(s.Pid), &s.status, unix.WNOHANG, nil)
  1155  			if err != nil {
  1156  				return fmt.Errorf("error waiting on the sandbox process: %v", err)
  1157  			}
  1158  			if wpid == 0 {
  1159  				return fmt.Errorf("sandbox is still running")
  1160  			}
  1161  			s.Pid = 0
  1162  		} else if s.IsRunning() {
  1163  			return fmt.Errorf("sandbox is still running")
  1164  		}
  1165  		return nil
  1166  	}
  1167  	return backoff.Retry(op, b)
  1168  }
  1169  
  1170  // deviceFileForPlatform opens the device file for the given platform. If the
  1171  // platform does not need a device file, then nil is returned.
  1172  func deviceFileForPlatform(name string) (*os.File, error) {
  1173  	p, err := platform.Lookup(name)
  1174  	if err != nil {
  1175  		return nil, err
  1176  	}
  1177  
  1178  	f, err := p.OpenDevice()
  1179  	if err != nil {
  1180  		return nil, fmt.Errorf("opening device file for platform %q: %w", name, err)
  1181  	}
  1182  	return f, nil
  1183  }
  1184  
  1185  // checkBinaryPermissions verifies that the required binary bits are set on
  1186  // the runsc executable.
  1187  func checkBinaryPermissions(conf *config.Config) error {
  1188  	// All platforms need the "other" execute bit.
  1189  	neededBits := os.FileMode(0001)
  1190  	if conf.Platform == platforms.Ptrace {
  1191  		// Ptrace needs the "other" read bit.
  1192  		neededBits |= os.FileMode(0004)
  1193  	}
  1194  
  1195  	exePath, err := os.Executable()
  1196  	if err != nil {
  1197  		return fmt.Errorf("getting exe path: %v", err)
  1198  	}
  1199  
  1200  	// Check the permissions of the runsc binary and print an error if it
  1201  	// doesn't match expectations.
  1202  	info, err := os.Stat(exePath)
  1203  	if err != nil {
  1204  		return fmt.Errorf("stat file: %v", err)
  1205  	}
  1206  
  1207  	if info.Mode().Perm()&neededBits != neededBits {
  1208  		return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
  1209  	}
  1210  	return nil
  1211  }
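
        // For instance, a runsc binary installed with mode 0755 satisfies both
        // cases: its "other" bits are r-x, so the execute bit (0001) is set for
        // every platform and the read bit (0004) needed by ptrace is set as well.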