github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/runsc/sandbox/sandbox.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package sandbox creates and manipulates sandboxes.
    16  package sandbox
    17  
    18  import (
    19  	"context"
    20  	"encoding/json"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"math"
    25  	"os"
    26  	"os/exec"
    27  	"path/filepath"
    28  	"strconv"
    29  	"strings"
    30  	"syscall"
    31  	"time"
    32  
    33  	"github.com/cenkalti/backoff"
    34  	specs "github.com/opencontainers/runtime-spec/specs-go"
    35  	"github.com/syndtr/gocapability/capability"
    36  	"github.com/ttpreport/gvisor-ligolo/pkg/atomicbitops"
    37  	"github.com/ttpreport/gvisor-ligolo/pkg/cleanup"
    38  	"github.com/ttpreport/gvisor-ligolo/pkg/control/client"
    39  	"github.com/ttpreport/gvisor-ligolo/pkg/control/server"
    40  	"github.com/ttpreport/gvisor-ligolo/pkg/coverage"
    41  	"github.com/ttpreport/gvisor-ligolo/pkg/log"
    42  	metricpb "github.com/ttpreport/gvisor-ligolo/pkg/metric/metric_go_proto"
    43  	"github.com/ttpreport/gvisor-ligolo/pkg/prometheus"
    44  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/control"
    45  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/platform"
    46  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/seccheck"
    47  	"github.com/ttpreport/gvisor-ligolo/pkg/sync"
    48  	"github.com/ttpreport/gvisor-ligolo/pkg/urpc"
    49  	"github.com/ttpreport/gvisor-ligolo/runsc/boot"
    50  	"github.com/ttpreport/gvisor-ligolo/runsc/boot/procfs"
    51  	"github.com/ttpreport/gvisor-ligolo/runsc/cgroup"
    52  	"github.com/ttpreport/gvisor-ligolo/runsc/config"
    53  	"github.com/ttpreport/gvisor-ligolo/runsc/console"
    54  	"github.com/ttpreport/gvisor-ligolo/runsc/donation"
    55  	"github.com/ttpreport/gvisor-ligolo/runsc/specutils"
    56  	"golang.org/x/sys/unix"
    57  )
    58  
    59  const (
     60  	// podNameAnnotation is a pod annotation populated by containerd.
    61  	// It contains the name of the pod that a sandbox is in when running in Kubernetes.
    62  	podNameAnnotation = "io.kubernetes.cri.sandbox-name"
    63  
    64  	// namespaceAnnotation is a pod annotation populated by containerd.
    65  	// It contains the namespace of the pod that a sandbox is in when running in Kubernetes.
    66  	namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace"
    67  )
    68  
    69  // createControlSocket finds a location and creates the socket used to
    70  // communicate with the sandbox.
    71  func createControlSocket(rootDir, id string) (string, int, error) {
    72  	name := fmt.Sprintf("runsc-%s.sock", id)
    73  
    74  	// Only use absolute paths to guarantee resolution from anywhere.
    75  	var paths []string
    76  	for _, dir := range []string{rootDir, "/var/run", "/run", "/tmp"} {
    77  		paths = append(paths, filepath.Join(dir, name))
    78  	}
    79  	// If nothing else worked, use the abstract namespace.
    80  	paths = append(paths, fmt.Sprintf("\x00runsc-sandbox.%s", id))
    81  
    82  	for _, path := range paths {
    83  		log.Debugf("Attempting to create socket file %q", path)
    84  		fd, err := server.CreateSocket(path)
    85  		if err == nil {
    86  			log.Debugf("Using socket file %q", path)
    87  			return path, fd, nil
    88  		}
    89  	}
    90  	return "", -1, fmt.Errorf("unable to find location to write socket file")
    91  }
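
// A minimal illustrative sketch, not part of the original source: the last
// fallback above lands in the abstract socket namespace, which is marked by a
// leading NUL byte rather than a filesystem path. destroy() below inlines the
// same check before removing the control file.
func isAbstractSocket(addr string) bool {
	// Abstract unix sockets have no filesystem presence; their address
	// starts with a NUL byte and needs no os.Remove on cleanup.
	return len(addr) > 0 && addr[0] == 0
}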
    92  
    93  // pid is an atomic type that implements JSON marshal/unmarshal interfaces.
    94  type pid struct {
    95  	val atomicbitops.Int64
    96  }
    97  
    98  func (p *pid) store(pid int) {
    99  	p.val.Store(int64(pid))
   100  }
   101  
   102  func (p *pid) load() int {
   103  	return int(p.val.Load())
   104  }
   105  
   106  // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON.
   107  func (p *pid) UnmarshalJSON(b []byte) error {
   108  	var pid int
   109  
   110  	if err := json.Unmarshal(b, &pid); err != nil {
   111  		return err
   112  	}
   113  	p.store(pid)
   114  	return nil
   115  }
   116  
    117  // MarshalJSON implements json.Marshaler.MarshalJSON.
   118  func (p *pid) MarshalJSON() ([]byte, error) {
   119  	return json.Marshal(p.load())
   120  }
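
// Illustrative round trip (a sketch, not in the original source): a Sandbox
// is persisted as JSON between runsc invocations, so pid must serialize as a
// plain integer despite the atomic wrapper:
//
//	var p pid
//	p.store(1234)
//	b, _ := json.Marshal(&p)  // b == []byte("1234")
//	var q pid
//	_ = json.Unmarshal(b, &q) // q.load() == 1234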
   121  
   122  // Sandbox wraps a sandbox process.
   123  //
   124  // It is used to start/stop sandbox process (and associated processes like
   125  // gofers), as well as for running and manipulating containers inside a running
   126  // sandbox.
   127  //
   128  // Note: Sandbox must be immutable because a copy of it is saved for each
   129  // container and changes would not be synchronized to all of them.
   130  type Sandbox struct {
   131  	// ID is the id of the sandbox (immutable). By convention, this is the same
   132  	// ID as the first container run in the sandbox.
   133  	ID string `json:"id"`
   134  
   135  	// PodName is the name of the Kubernetes Pod (if any) that this sandbox
   136  	// represents. Unset if not running under containerd or Kubernetes.
   137  	PodName string `json:"podName"`
   138  
   139  	// Namespace is the Kubernetes namespace (if any) of the pod that this
   140  	// sandbox represents. Unset if not running under containerd or Kubernetes.
   141  	Namespace string `json:"namespace"`
   142  
   143  	// Pid is the pid of the running sandbox. May be 0 if the sandbox
   144  	// is not running.
   145  	Pid pid `json:"pid"`
   146  
   147  	// UID is the user ID in the parent namespace that the sandbox is running as.
   148  	UID int `json:"uid"`
   149  	// GID is the group ID in the parent namespace that the sandbox is running as.
   150  	GID int `json:"gid"`
   151  
   152  	// CgroupJSON contains the cgroup configuration that the sandbox is part of
    153  	// and allows serialization of the configuration into JSON.
   154  	CgroupJSON cgroup.CgroupJSON `json:"cgroup"`
   155  
   156  	// OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox
   157  	// started, before it may be modified.
   158  	OriginalOOMScoreAdj int `json:"originalOomScoreAdj"`
   159  
   160  	// RegisteredMetrics is the set of metrics registered in the sandbox.
   161  	// Used for verifying metric data integrity after containers are started.
   162  	// Only populated if exporting metrics was requested when the sandbox was
   163  	// created.
   164  	RegisteredMetrics *metricpb.MetricRegistration `json:"registeredMetrics"`
   165  
   166  	// MetricMetadata are key-value pairs that are useful to export about this
   167  	// sandbox, but not part of the set of labels that uniquely identify it.
   168  	// They are static once initialized, and typically contain high-level
   169  	// configuration information about the sandbox.
   170  	MetricMetadata map[string]string `json:"metricMetadata"`
   171  
   172  	// MetricServerAddress is the address of the metric server that this sandbox
   173  	// intends to export metrics for.
   174  	// Only populated if exporting metrics was requested when the sandbox was
   175  	// created.
   176  	MetricServerAddress string `json:"metricServerAddress"`
   177  
   178  	// ControlAddress is the uRPC address used to connect to the sandbox.
   179  	ControlAddress string `json:"control_address"`
   180  
   181  	// MountHints provides extra information about container mounts that apply
   182  	// to the entire pod.
   183  	MountHints *boot.PodMountHints `json:"mountHints"`
   184  
   185  	// child is set if a sandbox process is a child of the current process.
   186  	//
    187  	// This field isn't saved to JSON, because only the creator of the
    188  	// sandbox will have it as a child process.
   189  	child bool
   190  
   191  	// statusMu protects status.
   192  	statusMu sync.Mutex
   193  
    194  	// status is the exit status of a sandbox process. It's only set if
    195  	// child==true and the sandbox was waited on. This field allows for multiple
   196  	// threads to wait on sandbox and get the exit code, since Linux will return
   197  	// WaitStatus to one of the waiters only.
   198  	status unix.WaitStatus
   199  }
   200  
   201  // Getpid returns the process ID of the sandbox process.
   202  func (s *Sandbox) Getpid() int {
   203  	return s.Pid.load()
   204  }
   205  
   206  // Args is used to configure a new sandbox.
   207  type Args struct {
   208  	// ID is the sandbox unique identifier.
   209  	ID string
   210  
   211  	// Spec is the OCI spec that describes the container.
   212  	Spec *specs.Spec
   213  
   214  	// BundleDir is the directory containing the container bundle.
   215  	BundleDir string
   216  
   217  	// ConsoleSocket is the path to a unix domain socket that will receive
   218  	// the console FD. It may be empty.
   219  	ConsoleSocket string
   220  
   221  	// UserLog is the filename to send user-visible logs to. It may be empty.
   222  	UserLog string
   223  
   224  	// IOFiles is the list of files that connect to a gofer endpoint for the
    225  	// mount points using gofers. They must be in the same order as mounts
   226  	// appear in the spec.
   227  	IOFiles []*os.File
   228  
   229  	// OverlayFilestoreFiles are the regular files that will back the tmpfs upper
   230  	// mount in the overlay mounts.
   231  	OverlayFilestoreFiles []*os.File
   232  
   233  	// OverlayMediums contains information about how the gofer mounts have been
   234  	// overlaid. The first entry is for rootfs and the following entries are for
   235  	// bind mounts in Spec.Mounts (in the same order).
   236  	OverlayMediums []boot.OverlayMedium
   237  
    238  	// MountHints provides extra information about container mounts that apply
   239  	// to the entire pod.
   240  	MountHints *boot.PodMountHints
   241  
    242  	// MountsFile is a file containing mount information from the spec. It's
   243  	// equivalent to the mounts from the spec, except that all paths have been
   244  	// resolved to their final absolute location.
   245  	MountsFile *os.File
   246  
    247  	// Cgroup is the cgroup that the sandbox is part of.
   248  	Cgroup cgroup.Cgroup
   249  
    250  	// Attached indicates that the sandbox lifecycle is attached to the caller.
   251  	// If the caller exits, the sandbox should exit too.
   252  	Attached bool
   253  
    254  	// SinkFiles is an ordered array of files to be used by seccheck sinks
   255  	// configured from the --pod-init-config file.
   256  	SinkFiles []*os.File
   257  
   258  	// PassFiles are user-supplied files from the host to be exposed to the
   259  	// sandboxed app.
   260  	PassFiles map[int]*os.File
   261  
   262  	// ExecFile is the file from the host used for program execution.
   263  	ExecFile *os.File
   264  }
   265  
   266  // New creates the sandbox process. The caller must call Destroy() on the
   267  // sandbox.
   268  func New(conf *config.Config, args *Args) (*Sandbox, error) {
   269  	s := &Sandbox{
   270  		ID: args.ID,
   271  		CgroupJSON: cgroup.CgroupJSON{
   272  			Cgroup: args.Cgroup,
   273  		},
   274  		UID:                 -1, // prevent usage before it's set.
   275  		GID:                 -1, // prevent usage before it's set.
   276  		MetricMetadata:      conf.MetricMetadata(),
   277  		MetricServerAddress: conf.MetricServer,
   278  		MountHints:          args.MountHints,
   279  	}
   280  	if args.Spec != nil && args.Spec.Annotations != nil {
   281  		s.PodName = args.Spec.Annotations[podNameAnnotation]
   282  		s.Namespace = args.Spec.Annotations[namespaceAnnotation]
   283  	}
   284  
   285  	// The Cleanup object cleans up partially created sandboxes when an error
   286  	// occurs. Any errors occurring during cleanup itself are ignored.
   287  	c := cleanup.Make(func() {
   288  		if err := s.destroy(); err != nil {
   289  			log.Warningf("error destroying sandbox: %v", err)
   290  		}
   291  	})
   292  	defer c.Clean()
   293  
   294  	if len(conf.PodInitConfig) > 0 {
   295  		initConf, err := boot.LoadInitConfig(conf.PodInitConfig)
   296  		if err != nil {
   297  			return nil, fmt.Errorf("loading init config file: %w", err)
   298  		}
   299  		args.SinkFiles, err = initConf.Setup()
   300  		if err != nil {
   301  			return nil, fmt.Errorf("cannot init config: %w", err)
   302  		}
   303  	}
   304  
    305  	// Create a pipe to synchronize when the sandbox process has booted.
   306  	clientSyncFile, sandboxSyncFile, err := os.Pipe()
   307  	if err != nil {
   308  		return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err)
   309  	}
   310  	defer clientSyncFile.Close()
   311  
   312  	// Create the sandbox process.
   313  	err = s.createSandboxProcess(conf, args, sandboxSyncFile)
   314  	// sandboxSyncFile has to be closed to be able to detect when the sandbox
   315  	// process exits unexpectedly.
   316  	sandboxSyncFile.Close()
   317  	if err != nil {
   318  		return nil, fmt.Errorf("cannot create sandbox process: %w", err)
   319  	}
   320  
   321  	// Wait until the sandbox has booted.
   322  	b := make([]byte, 1)
   323  	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
   324  		err := fmt.Errorf("waiting for sandbox to start: %v", err)
   325  		// If the sandbox failed to start, it may be because the binary
   326  		// permissions were incorrect. Check the bits and return a more helpful
   327  		// error message.
   328  		//
   329  		// NOTE: The error message is checked because error types are lost over
   330  		// rpc calls.
   331  		if strings.Contains(err.Error(), io.EOF.Error()) {
   332  			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
   333  				return nil, fmt.Errorf("%v: %v", err, permsErr)
   334  			}
   335  		}
   336  		return nil, fmt.Errorf("cannot read client sync file: %w", err)
   337  	}
   338  
   339  	if conf.MetricServer != "" {
   340  		// The control server is up and the sandbox was configured to export metrics.
   341  		// We must gather data about registered metrics prior to any process starting in the sandbox.
   342  		log.Debugf("Getting metric registration information from sandbox %q", s.ID)
   343  		var registeredMetrics control.MetricsRegistrationResponse
   344  		if err := s.call(boot.MetricsGetRegistered, nil, &registeredMetrics); err != nil {
   345  			return nil, fmt.Errorf("cannot get registered metrics: %v", err)
   346  		}
   347  		s.RegisteredMetrics = registeredMetrics.RegisteredMetrics
   348  	}
   349  
   350  	c.Release()
   351  	return s, nil
   352  }
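
// New's boot synchronization above relies on plain pipe semantics: the parent
// blocks on a 1-byte read, the sandbox writes a byte once booted, and an
// unexpected sandbox exit closes the write end and surfaces as io.EOF. A
// condensed sketch of the reader side (hypothetical helper, assuming the same
// pipe setup as in New):
func waitForBoot(clientSyncFile *os.File) error {
	b := make([]byte, 1)
	if n, err := clientSyncFile.Read(b); err != nil || n != 1 {
		// err is io.EOF when the sandbox died before signaling readiness.
		return fmt.Errorf("waiting for sandbox to start: %v", err)
	}
	return nil
}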
   353  
   354  // CreateSubcontainer creates a container inside the sandbox.
   355  func (s *Sandbox) CreateSubcontainer(conf *config.Config, cid string, tty *os.File) error {
   356  	log.Debugf("Create sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
   357  
   358  	var files []*os.File
   359  	if tty != nil {
   360  		files = []*os.File{tty}
   361  	}
   362  	if err := s.configureStdios(conf, files); err != nil {
   363  		return err
   364  	}
   365  
   366  	args := boot.CreateArgs{
   367  		CID:         cid,
   368  		FilePayload: urpc.FilePayload{Files: files},
   369  	}
   370  	if err := s.call(boot.ContMgrCreateSubcontainer, &args, nil); err != nil {
   371  		return fmt.Errorf("creating sub-container %q: %w", cid, err)
   372  	}
   373  	return nil
   374  }
   375  
   376  // StartRoot starts running the root container process inside the sandbox.
   377  func (s *Sandbox) StartRoot(conf *config.Config) error {
   378  	pid := s.Pid.load()
   379  	log.Debugf("Start root sandbox %q, PID: %d", s.ID, pid)
   380  	conn, err := s.sandboxConnect()
   381  	if err != nil {
   382  		return err
   383  	}
   384  	defer conn.Close()
   385  
   386  	// Configure the network.
   387  	if err := setupNetwork(conn, pid, conf); err != nil {
   388  		return fmt.Errorf("setting up network: %w", err)
   389  	}
   390  
   391  	// Send a message to the sandbox control server to start the root container.
   392  	if err := conn.Call(boot.ContMgrRootContainerStart, &s.ID, nil); err != nil {
   393  		return fmt.Errorf("starting root container: %w", err)
   394  	}
   395  
   396  	return nil
   397  }
   398  
   399  // StartSubcontainer starts running a sub-container inside the sandbox.
   400  func (s *Sandbox) StartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, overlayFilestoreFiles []*os.File, overlayMediums []boot.OverlayMedium) error {
   401  	log.Debugf("Start sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
   402  
   403  	if err := s.configureStdios(conf, stdios); err != nil {
   404  		return err
   405  	}
   406  	s.fixPidns(spec)
   407  
   408  	// The payload contains (in this specific order):
   409  	// * stdin/stdout/stderr (optional: only present when not using TTY)
   410  	// * The subcontainer's overlay filestore files (optional: only present when
   411  	//   host file backed overlay is configured)
   412  	// * Gofer files.
   413  	payload := urpc.FilePayload{}
   414  	payload.Files = append(payload.Files, stdios...)
   415  	payload.Files = append(payload.Files, overlayFilestoreFiles...)
   416  	payload.Files = append(payload.Files, goferFiles...)
   417  
   418  	// Start running the container.
   419  	args := boot.StartArgs{
   420  		Spec:                   spec,
   421  		Conf:                   conf,
   422  		CID:                    cid,
   423  		NumOverlayFilestoreFDs: len(overlayFilestoreFiles),
   424  		OverlayMediums:         overlayMediums,
   425  		FilePayload:            payload,
   426  	}
   427  	if err := s.call(boot.ContMgrStartSubcontainer, &args, nil); err != nil {
   428  		return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err)
   429  	}
   430  	return nil
   431  }
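
// The payload packing above only works because both ends agree on the order.
// A hypothetical sketch of how a receiver could slice the flattened FD list
// back apart, using the stdio count and NumOverlayFilestoreFDs as boundaries
// (the real demultiplexing lives in the boot process and is not shown here):
func splitPayload(files []*os.File, numStdios, numFilestores int) (stdios, filestores, gofers []*os.File) {
	stdios = files[:numStdios]
	filestores = files[numStdios : numStdios+numFilestores]
	gofers = files[numStdios+numFilestores:]
	return stdios, filestores, gofers
}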
   432  
   433  // Restore sends the restore call for a container in the sandbox.
   434  func (s *Sandbox) Restore(conf *config.Config, cid string, filename string) error {
   435  	log.Debugf("Restore sandbox %q", s.ID)
   436  
   437  	rf, err := os.Open(filename)
   438  	if err != nil {
   439  		return fmt.Errorf("opening restore file %q failed: %v", filename, err)
   440  	}
   441  	defer rf.Close()
   442  
   443  	opt := boot.RestoreOpts{
   444  		FilePayload: urpc.FilePayload{
   445  			Files: []*os.File{rf},
   446  		},
   447  		SandboxID: s.ID,
   448  	}
   449  
   450  	// If the platform needs a device FD we must pass it in.
   451  	if deviceFile, err := deviceFileForPlatform(conf.Platform, conf.PlatformDevicePath); err != nil {
   452  		return err
   453  	} else if deviceFile != nil {
   454  		defer deviceFile.Close()
   455  		opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile)
   456  	}
   457  
   458  	conn, err := s.sandboxConnect()
   459  	if err != nil {
   460  		return err
   461  	}
   462  	defer conn.Close()
   463  
   464  	// Configure the network.
   465  	if err := setupNetwork(conn, s.Pid.load(), conf); err != nil {
   466  		return fmt.Errorf("setting up network: %v", err)
   467  	}
   468  
   469  	// Restore the container and start the root container.
   470  	if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil {
   471  		return fmt.Errorf("restoring container %q: %v", cid, err)
   472  	}
   473  
   474  	return nil
   475  }
   476  
   477  // Processes retrieves the list of processes and associated metadata for a
   478  // given container in this sandbox.
   479  func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
   480  	log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID)
   481  	var pl []*control.Process
   482  	if err := s.call(boot.ContMgrProcesses, &cid, &pl); err != nil {
   483  		return nil, fmt.Errorf("retrieving process data from sandbox: %v", err)
   484  	}
   485  	return pl, nil
   486  }
   487  
   488  // CreateTraceSession creates a new trace session.
   489  func (s *Sandbox) CreateTraceSession(config *seccheck.SessionConfig, force bool) error {
   490  	log.Debugf("Creating trace session in sandbox %q", s.ID)
   491  
   492  	sinkFiles, err := seccheck.SetupSinks(config.Sinks)
   493  	if err != nil {
   494  		return err
   495  	}
   496  	defer func() {
   497  		for _, f := range sinkFiles {
   498  			_ = f.Close()
   499  		}
   500  	}()
   501  
   502  	arg := boot.CreateTraceSessionArgs{
   503  		Config: *config,
   504  		Force:  force,
   505  		FilePayload: urpc.FilePayload{
   506  			Files: sinkFiles,
   507  		},
   508  	}
   509  	if err := s.call(boot.ContMgrCreateTraceSession, &arg, nil); err != nil {
   510  		return fmt.Errorf("creating trace session: %w", err)
   511  	}
   512  	return nil
   513  }
   514  
   515  // DeleteTraceSession deletes an existing trace session.
   516  func (s *Sandbox) DeleteTraceSession(name string) error {
   517  	log.Debugf("Deleting trace session %q in sandbox %q", name, s.ID)
   518  	if err := s.call(boot.ContMgrDeleteTraceSession, name, nil); err != nil {
   519  		return fmt.Errorf("deleting trace session: %w", err)
   520  	}
   521  	return nil
   522  }
   523  
   524  // ListTraceSessions lists all trace sessions.
   525  func (s *Sandbox) ListTraceSessions() ([]seccheck.SessionConfig, error) {
   526  	log.Debugf("Listing trace sessions in sandbox %q", s.ID)
   527  	var sessions []seccheck.SessionConfig
   528  	if err := s.call(boot.ContMgrListTraceSessions, nil, &sessions); err != nil {
   529  		return nil, fmt.Errorf("listing trace session: %w", err)
   530  	}
   531  	return sessions, nil
   532  }
   533  
   534  // ProcfsDump collects and returns a procfs dump for the sandbox.
   535  func (s *Sandbox) ProcfsDump() ([]procfs.ProcessProcfsDump, error) {
   536  	log.Debugf("Procfs dump %q", s.ID)
   537  	var procfsDump []procfs.ProcessProcfsDump
   538  	if err := s.call(boot.ContMgrProcfsDump, nil, &procfsDump); err != nil {
   539  		return nil, fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err)
   540  	}
   541  	return procfsDump, nil
   542  }
   543  
   544  // NewCGroup returns the sandbox's Cgroup, or an error if it does not have one.
   545  func (s *Sandbox) NewCGroup() (cgroup.Cgroup, error) {
   546  	return cgroup.NewFromPid(s.Pid.load(), false /* useSystemd */)
   547  }
   548  
   549  // Execute runs the specified command in the container. It returns the PID of
   550  // the newly created process.
   551  func (s *Sandbox) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) {
   552  	log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID)
   553  
   554  	// Stdios are those files which have an FD <= 2 in the process. We do not
   555  	// want the ownership of other files to be changed by configureStdios.
   556  	var stdios []*os.File
   557  	for i, fd := range args.GuestFDs {
   558  		if fd > 2 || i >= len(args.Files) {
   559  			continue
   560  		}
   561  		stdios = append(stdios, args.Files[i])
   562  	}
   563  
   564  	if err := s.configureStdios(conf, stdios); err != nil {
   565  		return 0, err
   566  	}
   567  
   568  	// Send a message to the sandbox control server to start the container.
   569  	var pid int32
   570  	if err := s.call(boot.ContMgrExecuteAsync, args, &pid); err != nil {
   571  		return 0, fmt.Errorf("executing command %q in sandbox: %w", args, err)
   572  	}
   573  	return pid, nil
   574  }
   575  
   576  // Event retrieves stats about the sandbox such as memory and CPU utilization.
   577  func (s *Sandbox) Event(cid string) (*boot.EventOut, error) {
   578  	log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID)
   579  	var e boot.EventOut
   580  	if err := s.call(boot.ContMgrEvent, &cid, &e); err != nil {
   581  		return nil, fmt.Errorf("retrieving event data from sandbox: %w", err)
   582  	}
   583  	return &e, nil
   584  }
   585  
   586  // PortForward starts port forwarding to the sandbox.
   587  func (s *Sandbox) PortForward(opts *boot.PortForwardOpts) error {
   588  	log.Debugf("Requesting port forward for container %q in sandbox %q: %+v", opts.ContainerID, s.ID, opts)
   589  	conn, err := s.sandboxConnect()
   590  	if err != nil {
   591  		return err
   592  	}
   593  	defer conn.Close()
   594  
   595  	if err := conn.Call(boot.ContMgrPortForward, opts, nil); err != nil {
   596  		return fmt.Errorf("port forwarding to sandbox: %v", err)
   597  	}
   598  
   599  	return nil
   600  }
   601  
   602  func (s *Sandbox) sandboxConnect() (*urpc.Client, error) {
   603  	log.Debugf("Connecting to sandbox %q", s.ID)
   604  	conn, err := client.ConnectTo(s.ControlAddress)
   605  	if err != nil {
   606  		return nil, s.connError(err)
   607  	}
   608  	return conn, nil
   609  }
   610  
   611  func (s *Sandbox) call(method string, arg, result any) error {
   612  	conn, err := s.sandboxConnect()
   613  	if err != nil {
   614  		return err
   615  	}
   616  	defer conn.Close()
   617  
   618  	return conn.Call(method, arg, result)
   619  }
   620  
   621  func (s *Sandbox) connError(err error) error {
   622  	return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid.load(), err)
   623  }
   624  
   625  // createSandboxProcess starts the sandbox as a subprocess by running the "boot"
   626  // command, passing in the bundle dir.
   627  func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
   628  	donations := donation.Agency{}
   629  	defer donations.Close()
   630  
   631  	// pgalloc.MemoryFile (which provides application memory) sometimes briefly
   632  	// mlock(2)s ranges of memory in order to fault in a large number of pages at
   633  	// a time. Try to make RLIMIT_MEMLOCK unlimited so that it can do so. runsc
   634  	// expects to run in a memory cgroup that limits its memory usage as
   635  	// required.
   636  	// This needs to be done before exec'ing `runsc boot`, as that subcommand
   637  	// runs as an unprivileged user that will not be able to call `setrlimit`
   638  	// by itself. Calling `setrlimit` here will have the side-effect of setting
   639  	// the limit on the currently-running `runsc` process as well, but that
   640  	// should be OK too.
   641  	var rlim unix.Rlimit
   642  	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
   643  		log.Warningf("Failed to get RLIMIT_MEMLOCK: %v", err)
   644  	} else if rlim.Cur != unix.RLIM_INFINITY || rlim.Max != unix.RLIM_INFINITY {
   645  		rlim.Cur = unix.RLIM_INFINITY
   646  		rlim.Max = unix.RLIM_INFINITY
   647  		if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
   648  			// We may not have CAP_SYS_RESOURCE, so this failure may be expected.
   649  			log.Infof("Failed to set RLIMIT_MEMLOCK: %v", err)
   650  		}
   651  	}
   652  
   653  	//
   654  	// These flags must come BEFORE the "boot" command in cmd.Args.
   655  	//
   656  
   657  	// Open the log files to pass to the sandbox as FDs.
   658  	if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
   659  		return err
   660  	}
   661  
   662  	test := ""
   663  	if len(conf.TestOnlyTestNameEnv) != 0 {
   664  		// Fetch test name if one is provided and the test only flag was set.
   665  		if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
   666  			test = t
   667  		}
   668  	}
   669  	if specutils.IsDebugCommand(conf, "boot") {
   670  		if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "boot", test); err != nil {
   671  			return err
   672  		}
   673  	}
   674  	if err := donations.DonateDebugLogFile("panic-log-fd", conf.PanicLog, "panic", test); err != nil {
   675  		return err
   676  	}
   677  	covFilename := conf.CoverageReport
   678  	if covFilename == "" {
   679  		covFilename = os.Getenv("GO_COVERAGE_FILE")
   680  	}
   681  	if covFilename != "" && coverage.Available() {
   682  		if err := donations.DonateDebugLogFile("coverage-fd", covFilename, "cov", test); err != nil {
   683  			return err
   684  		}
   685  	}
   686  
   687  	// Relay all the config flags to the sandbox process.
   688  	cmd := exec.Command(specutils.ExePath, conf.ToFlags()...)
   689  	cmd.SysProcAttr = &unix.SysProcAttr{
   690  		// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
   691  		// when re-parented.
   692  		Setsid: true,
   693  	}
   694  
    695  	// Set Args[0] to make it easier to spot the sandbox process. Otherwise it's
   696  	// shown as `exe`.
   697  	cmd.Args[0] = "runsc-sandbox"
   698  
    699  	// Transfer FDs that need to be present before the "boot" command.
   700  	// Start at 3 because 0, 1, and 2 are taken by stdin/out/err.
   701  	nextFD := donations.Transfer(cmd, 3)
   702  
   703  	// Add the "boot" command to the args.
   704  	//
    705  	// All flags after this must be for the boot command.
   706  	cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir)
   707  
   708  	// Clear environment variables, unless --TESTONLY-unsafe-nonroot is set.
   709  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   710  		// Setting cmd.Env = nil causes cmd to inherit the current process's env.
   711  		cmd.Env = []string{}
   712  	}
   713  
    714  	// If there is a gofer, send all socket ends to the sandbox.
   715  	donations.DonateAndClose("io-fds", args.IOFiles...)
   716  	donations.DonateAndClose("overlay-filestore-fds", args.OverlayFilestoreFiles...)
   717  	donations.DonateAndClose("mounts-fd", args.MountsFile)
   718  	donations.Donate("start-sync-fd", startSyncFile)
   719  	if err := donations.OpenAndDonate("user-log-fd", args.UserLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
   720  		return err
   721  	}
   722  	const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC
   723  	if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock, profFlags); err != nil {
   724  		return err
   725  	}
   726  	if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU, profFlags); err != nil {
   727  		return err
   728  	}
   729  	if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap, profFlags); err != nil {
   730  		return err
   731  	}
   732  	if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex, profFlags); err != nil {
   733  		return err
   734  	}
   735  	if err := donations.OpenAndDonate("trace-fd", conf.TraceFile, profFlags); err != nil {
   736  		return err
   737  	}
   738  
   739  	// Pass overlay mediums.
   740  	cmd.Args = append(cmd.Args, "--overlay-mediums="+boot.ToOverlayMediumFlags(args.OverlayMediums))
   741  
   742  	// Create a socket for the control server and donate it to the sandbox.
   743  	controlAddress, sockFD, err := createControlSocket(conf.RootDir, s.ID)
   744  	if err != nil {
    745  		return fmt.Errorf("creating control socket: %w", err)
    746  	}
    747  	s.ControlAddress = controlAddress
    748  	log.Infof("Control socket: %q", s.ControlAddress)
   749  	donations.DonateAndClose("controller-fd", os.NewFile(uintptr(sockFD), "control_server_socket"))
   750  
   751  	specFile, err := specutils.OpenSpec(args.BundleDir)
   752  	if err != nil {
   753  		return fmt.Errorf("cannot open spec file in bundle dir %v: %w", args.BundleDir, err)
   754  	}
   755  	donations.DonateAndClose("spec-fd", specFile)
   756  
   757  	if err := donations.OpenAndDonate("pod-init-config-fd", conf.PodInitConfig, os.O_RDONLY); err != nil {
   758  		return err
   759  	}
   760  	donations.DonateAndClose("sink-fds", args.SinkFiles...)
   761  
   762  	gPlatform, err := platform.Lookup(conf.Platform)
   763  	if err != nil {
   764  		return fmt.Errorf("cannot look up platform: %w", err)
   765  	}
   766  	if deviceFile, err := gPlatform.OpenDevice(conf.PlatformDevicePath); err != nil {
   767  		return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err)
   768  	} else if deviceFile != nil {
   769  		donations.DonateAndClose("device-fd", deviceFile)
   770  	}
   771  
   772  	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
   773  	// isn't set.
   774  	if conf.Platform == "kvm" {
   775  		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
   776  	}
   777  
   778  	// nss is the set of namespaces to join or create before starting the sandbox
   779  	// process. Mount, IPC and UTS namespaces from the host are not used as they
   780  	// are virtualized inside the sandbox. Be paranoid and run inside an empty
   781  	// namespace for these. Don't unshare cgroup because sandbox is added to a
   782  	// cgroup in the caller's namespace.
   783  	log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces")
   784  	nss := []specs.LinuxNamespace{
   785  		{Type: specs.IPCNamespace},
   786  		{Type: specs.MountNamespace},
   787  		{Type: specs.UTSNamespace},
   788  	}
   789  
   790  	if gPlatform.Requirements().RequiresCurrentPIDNS {
   791  		// TODO(b/75837838): Also set a new PID namespace so that we limit
   792  		// access to other host processes.
   793  		log.Infof("Sandbox will be started in the current PID namespace")
   794  	} else {
   795  		log.Infof("Sandbox will be started in a new PID namespace")
   796  		nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
   797  		cmd.Args = append(cmd.Args, "--pidns=true")
   798  	}
   799  
    800  	// Join the network namespace if network is enabled. The sandbox talks
   801  	// directly to the host network, which may have been configured in the
   802  	// namespace.
   803  	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
   804  		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
   805  		nss = append(nss, ns)
   806  	} else if conf.Network == config.NetworkHost {
   807  		log.Infof("Sandbox will be started in the host network namespace")
   808  	} else {
   809  		log.Infof("Sandbox will be started in new network namespace")
   810  		nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
   811  	}
   812  
   813  	// These are set to the uid/gid that the sandbox process will use. May be
    814  	// overridden below.
   815  	s.UID = os.Getuid()
   816  	s.GID = os.Getgid()
   817  
   818  	// User namespace depends on the network type or whether access to the host
    819  	// filesystem is required. These features require running inside the user
    820  	// namespace specified in the spec, or the current namespace if none is
   821  	// configured.
   822  	rootlessEUID := unix.Geteuid() != 0
   823  	setUserMappings := false
   824  	if conf.Network == config.NetworkHost || conf.DirectFS {
   825  		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
   826  			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
   827  			nss = append(nss, userns)
   828  			if rootlessEUID {
   829  				syncFile, err := ConfigureCmdForRootless(cmd, &donations)
   830  				if err != nil {
   831  					return err
   832  				}
   833  				defer syncFile.Close()
   834  				setUserMappings = true
   835  			} else {
   836  				specutils.SetUIDGIDMappings(cmd, args.Spec)
   837  				// We need to set UID and GID to have capabilities in a new user namespace.
   838  				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
   839  			}
   840  		} else {
   841  			if rootlessEUID {
   842  				return fmt.Errorf("unable to run a rootless container without userns")
   843  			}
   844  			log.Infof("Sandbox will be started in the current user namespace")
   845  		}
   846  		// When running in the caller's defined user namespace, apply the same
    847  	// capabilities to the sandbox process to ensure it abides by the same
   848  		// rules.
   849  		cmd.Args = append(cmd.Args, "--apply-caps=true")
   850  
   851  		// If we have CAP_SYS_ADMIN, we can create an empty chroot and
   852  		// bind-mount the executable inside it.
   853  		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   854  			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
   855  		} else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) || rootlessEUID {
   856  			log.Infof("Sandbox will be started in minimal chroot")
   857  			cmd.Args = append(cmd.Args, "--setup-root")
   858  		} else {
   859  			return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN")
   860  		}
   861  	} else {
   862  		// If we have CAP_SETUID and CAP_SETGID, then we can also run
   863  		// as user nobody.
   864  		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   865  			log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
   866  			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
   867  		} else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
   868  			log.Infof("Sandbox will be started in new user namespace")
   869  			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
   870  			cmd.Args = append(cmd.Args, "--setup-root")
   871  
   872  			const nobody = 65534
   873  			if rootlessEUID || conf.Rootless {
   874  				log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
   875  			} else {
   876  				// Map nobody in the new namespace to nobody in the parent namespace.
   877  				s.UID = nobody
   878  				s.GID = nobody
   879  			}
   880  
   881  			// Set credentials to run as user and group nobody.
   882  			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody}
   883  			cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
   884  				{
   885  					ContainerID: nobody,
   886  					HostID:      s.UID,
   887  					Size:        1,
   888  				},
   889  			}
   890  			cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
   891  				{
   892  					ContainerID: nobody,
   893  					HostID:      s.GID,
   894  					Size:        1,
   895  				},
   896  			}
   897  
   898  			// A sandbox process will construct an empty root for itself, so it has
   899  			// to have CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities.
   900  			cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps,
   901  				uintptr(capability.CAP_SYS_ADMIN),
   902  				uintptr(capability.CAP_SYS_CHROOT),
   903  				// CAP_SETPCAP is required to clear the bounding set.
   904  				uintptr(capability.CAP_SETPCAP),
   905  			)
   906  
   907  		} else {
   908  			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
   909  		}
   910  	}
   911  
   912  	// The current process' stdio must be passed to the application via the
   913  	// --stdio-fds flag. The stdio of the sandbox process itself must not
   914  	// be connected to the same FDs, otherwise we risk leaking sandbox
   915  	// errors to the application, so we set the sandbox stdio to nil,
   916  	// causing them to read/write from the null device.
   917  	cmd.Stdin = nil
   918  	cmd.Stdout = nil
   919  	cmd.Stderr = nil
   920  	var stdios [3]*os.File
   921  
   922  	// If the console control socket file is provided, then create a new
   923  	// pty master/replica pair and set the TTY on the sandbox process.
   924  	if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
   925  		// console.NewWithSocket will send the master on the given
   926  		// socket, and return the replica.
   927  		tty, err := console.NewWithSocket(args.ConsoleSocket)
   928  		if err != nil {
   929  			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
   930  		}
   931  		defer tty.Close()
   932  
   933  		// Set the TTY as a controlling TTY on the sandbox process.
   934  		cmd.SysProcAttr.Setctty = true
   935  
   936  		// Inconveniently, the Ctty must be the FD in the *child* process's FD
   937  		// table. So transfer all files we have so far and make sure the next file
   938  		// added to donations is stdin.
   939  		//
   940  		// See https://github.com/golang/go/issues/29458.
   941  		nextFD = donations.Transfer(cmd, nextFD)
   942  		cmd.SysProcAttr.Ctty = nextFD
   943  
   944  		// Pass the tty as all stdio fds to sandbox.
   945  		stdios[0] = tty
   946  		stdios[1] = tty
   947  		stdios[2] = tty
   948  
   949  		if conf.Debug {
   950  			// If debugging, send the boot process stdio to the
   951  			// TTY, so that it is easier to find.
   952  			cmd.Stdin = tty
   953  			cmd.Stdout = tty
   954  			cmd.Stderr = tty
   955  		}
   956  	} else {
   957  		// If not using a console, pass our current stdio as the
   958  		// container stdio via flags.
   959  		stdios[0] = os.Stdin
   960  		stdios[1] = os.Stdout
   961  		stdios[2] = os.Stderr
   962  
   963  		if conf.Debug {
    964  			// If debugging, send the boot process stdio to
    965  			// this process's stdio, so that it is easier to find.
   966  			cmd.Stdin = os.Stdin
   967  			cmd.Stdout = os.Stdout
   968  			cmd.Stderr = os.Stderr
   969  		}
   970  	}
   971  	if err := s.configureStdios(conf, stdios[:]); err != nil {
   972  		return fmt.Errorf("configuring stdios: %w", err)
   973  	}
   974  	// Note: this must be done right after "cmd.SysProcAttr.Ctty" is set above
   975  	// because it relies on stdin being the next FD donated.
   976  	donations.Donate("stdio-fds", stdios[:]...)
   977  
   978  	totalSysMem, err := totalSystemMemory()
   979  	if err != nil {
   980  		return err
   981  	}
   982  	cmd.Args = append(cmd.Args, "--total-host-memory", strconv.FormatUint(totalSysMem, 10))
   983  
   984  	mem := totalSysMem
   985  	if s.CgroupJSON.Cgroup != nil {
   986  		cpuNum, err := s.CgroupJSON.Cgroup.NumCPU()
   987  		if err != nil {
   988  			return fmt.Errorf("getting cpu count from cgroups: %v", err)
   989  		}
   990  		if conf.CPUNumFromQuota {
    991  			// Dropping below 2 CPUs can trigger applications to disable
    992  			// locks, which can lead to hard-to-debug errors, so just
    993  			// leave two cores as a reasonable default.
   994  			const minCPUs = 2
   995  
   996  			quota, err := s.CgroupJSON.Cgroup.CPUQuota()
   997  			if err != nil {
   998  				return fmt.Errorf("getting cpu quota from cgroups: %v", err)
   999  			}
  1000  			if n := int(math.Ceil(quota)); n > 0 {
  1001  				if n < minCPUs {
  1002  					n = minCPUs
  1003  				}
  1004  				if n < cpuNum {
  1005  					// Only lower the cpu number.
  1006  					cpuNum = n
  1007  				}
  1008  			}
  1009  		}
  1010  		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
  1011  
  1012  		memLimit, err := s.CgroupJSON.Cgroup.MemoryLimit()
  1013  		if err != nil {
  1014  			return fmt.Errorf("getting memory limit from cgroups: %v", err)
  1015  		}
  1016  		if memLimit < mem {
  1017  			mem = memLimit
  1018  		}
  1019  	}
  1020  	cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10))
  1021  
  1022  	if args.Attached {
  1023  		// Kill sandbox if parent process exits in attached mode.
  1024  		cmd.SysProcAttr.Pdeathsig = unix.SIGKILL
  1025  		// Tells boot that any process it creates must have pdeathsig set.
  1026  		cmd.Args = append(cmd.Args, "--attached")
  1027  	}
  1028  
  1029  	if args.ExecFile != nil {
  1030  		donations.Donate("exec-fd", args.ExecFile)
  1031  	}
  1032  
  1033  	nextFD = donations.Transfer(cmd, nextFD)
  1034  
  1035  	_ = donation.DonateAndTransferCustomFiles(cmd, nextFD, args.PassFiles)
  1036  
  1037  	// Add container ID as the last argument.
  1038  	cmd.Args = append(cmd.Args, s.ID)
  1039  
  1040  	donation.LogDonations(cmd)
  1041  	log.Debugf("Starting sandbox: %s %v", cmd.Path, cmd.Args)
  1042  	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
  1043  	if err := specutils.StartInNS(cmd, nss); err != nil {
  1044  		err := fmt.Errorf("starting sandbox: %v", err)
  1045  		// If the sandbox failed to start, it may be because the binary
  1046  		// permissions were incorrect. Check the bits and return a more helpful
  1047  		// error message.
  1048  		//
  1049  		// NOTE: The error message is checked because error types are lost over
  1050  		// rpc calls.
  1051  		if strings.Contains(err.Error(), unix.EACCES.Error()) {
  1052  			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
  1053  				return fmt.Errorf("%v: %v", err, permsErr)
  1054  			}
  1055  		}
  1056  		return err
  1057  	}
  1058  	s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid)
  1059  	if err != nil {
  1060  		return err
  1061  	}
  1062  	if setUserMappings {
  1063  		if err := SetUserMappings(args.Spec, cmd.Process.Pid); err != nil {
  1064  			return err
  1065  		}
  1066  	}
  1067  
  1068  	s.child = true
  1069  	s.Pid.store(cmd.Process.Pid)
  1070  	log.Infof("Sandbox started, PID: %d", cmd.Process.Pid)
  1071  
  1072  	return nil
  1073  }
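
// The FD numbering contract used throughout createSandboxProcess comes from
// os/exec: cmd.ExtraFiles[i] becomes file descriptor 3+i in the child. A
// minimal sketch of the donation pattern (hypothetical; the real logic lives
// in the donation package and also tracks names and deferred closes):
func donateFile(cmd *exec.Cmd, flag string, f *os.File) {
	// The next donated file lands at 3 + len(ExtraFiles) in the child, so
	// the flag can reference a stable FD number before the process starts.
	childFD := 3 + len(cmd.ExtraFiles)
	cmd.ExtraFiles = append(cmd.ExtraFiles, f)
	cmd.Args = append(cmd.Args, fmt.Sprintf("--%s=%d", flag, childFD))
}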
  1074  
  1075  // Wait waits for the containerized process to exit, and returns its WaitStatus.
  1076  func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) {
  1077  	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
  1078  
  1079  	if conn, err := s.sandboxConnect(); err != nil {
   1080  		// The sandbox may have exited before we had a chance to wait on it.
  1081  		// There is nothing we can do for subcontainers. For the init container, we
  1082  		// can try to get the sandbox exit code.
  1083  		if !s.IsRootContainer(cid) {
  1084  			return unix.WaitStatus(0), err
  1085  		}
  1086  		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
  1087  	} else {
  1088  		defer conn.Close()
  1089  
  1090  		// Try the Wait RPC to the sandbox.
  1091  		var ws unix.WaitStatus
  1092  		err = conn.Call(boot.ContMgrWait, &cid, &ws)
  1093  		conn.Close()
  1094  		if err == nil {
  1095  			if s.IsRootContainer(cid) {
  1096  				if err := s.waitForStopped(); err != nil {
  1097  					return unix.WaitStatus(0), err
  1098  				}
  1099  			}
  1100  			// It worked!
  1101  			return ws, nil
  1102  		}
  1103  		// See comment above.
  1104  		if !s.IsRootContainer(cid) {
  1105  			return unix.WaitStatus(0), err
  1106  		}
  1107  
  1108  		// The sandbox may have exited after we connected, but before
  1109  		// or during the Wait RPC.
  1110  		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
  1111  	}
  1112  
  1113  	// The sandbox may have already exited, or exited while handling the Wait RPC.
  1114  	// The best we can do is ask Linux what the sandbox exit status was, since in
  1115  	// most cases that will be the same as the container exit status.
  1116  	if err := s.waitForStopped(); err != nil {
  1117  		return unix.WaitStatus(0), err
  1118  	}
  1119  	if !s.child {
  1120  		return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
  1121  	}
  1122  
  1123  	s.statusMu.Lock()
  1124  	defer s.statusMu.Unlock()
  1125  	return s.status, nil
  1126  }
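
// Wait's fallback depends on Linux reporting an exit status to only one
// waiter, which is why Sandbox caches the status under statusMu. The same
// pattern in isolation (an illustrative sketch, not part of the original):
type exitCache struct {
	mu     sync.Mutex
	waited bool
	status unix.WaitStatus
}

// wait returns the cached status after the first successful Wait4, so every
// caller observes the same exit code even though only one reap can succeed.
func (c *exitCache) wait(pid int) (unix.WaitStatus, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if !c.waited {
		if _, err := unix.Wait4(pid, &c.status, 0, nil); err != nil {
			return 0, err
		}
		c.waited = true
	}
	return c.status, nil
}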
  1127  
  1128  // WaitPID waits for process 'pid' in the container's sandbox and returns its
  1129  // WaitStatus.
  1130  func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) {
  1131  	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
  1132  	var ws unix.WaitStatus
  1133  	args := &boot.WaitPIDArgs{
  1134  		PID: pid,
  1135  		CID: cid,
  1136  	}
  1137  	if err := s.call(boot.ContMgrWaitPID, args, &ws); err != nil {
  1138  		return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %w", pid, s.ID, err)
  1139  	}
  1140  	return ws, nil
  1141  }
  1142  
  1143  // IsRootContainer returns true if the specified container ID belongs to the
  1144  // root container.
  1145  func (s *Sandbox) IsRootContainer(cid string) bool {
  1146  	return s.ID == cid
  1147  }
  1148  
   1149  // destroy frees all resources associated with the sandbox. It fails fast and
  1150  // is idempotent.
  1151  func (s *Sandbox) destroy() error {
  1152  	log.Debugf("Destroying sandbox %q", s.ID)
  1153  	// Only delete the control file if it exists and is not an abstract UDS.
  1154  	if len(s.ControlAddress) > 0 && s.ControlAddress[0] != 0 {
  1155  		if err := os.Remove(s.ControlAddress); err != nil {
  1156  			log.Warningf("failed to delete control socket file %q: %v", s.ControlAddress, err)
  1157  		}
  1158  	}
  1159  	pid := s.Pid.load()
  1160  	if pid != 0 {
  1161  		log.Debugf("Killing sandbox %q", s.ID)
  1162  		if err := unix.Kill(pid, unix.SIGKILL); err != nil && err != unix.ESRCH {
   1163  			return fmt.Errorf("killing sandbox %q PID %d: %w", s.ID, pid, err)
  1164  		}
  1165  		if err := s.waitForStopped(); err != nil {
   1166  			return fmt.Errorf("waiting for sandbox %q to stop: %w", s.ID, err)
  1167  		}
  1168  	}
  1169  
  1170  	return nil
  1171  }
  1172  
  1173  // SignalContainer sends the signal to a container in the sandbox. If all is
  1174  // true and signal is SIGKILL, then waits for all processes to exit before
  1175  // returning.
  1176  func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error {
  1177  	log.Debugf("Signal sandbox %q", s.ID)
  1178  	mode := boot.DeliverToProcess
  1179  	if all {
  1180  		mode = boot.DeliverToAllProcesses
  1181  	}
  1182  
  1183  	args := boot.SignalArgs{
  1184  		CID:   cid,
  1185  		Signo: int32(sig),
  1186  		Mode:  mode,
  1187  	}
  1188  	if err := s.call(boot.ContMgrSignal, &args, nil); err != nil {
  1189  		return fmt.Errorf("signaling container %q: %w", cid, err)
  1190  	}
  1191  	return nil
  1192  }
  1193  
  1194  // SignalProcess sends the signal to a particular process in the container. If
  1195  // fgProcess is true, then the signal is sent to the foreground process group
  1196  // in the same session that PID belongs to. This is only valid if the process
  1197  // is attached to a host TTY.
  1198  func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error {
  1199  	log.Debugf("Signal sandbox %q", s.ID)
  1200  
  1201  	mode := boot.DeliverToProcess
  1202  	if fgProcess {
  1203  		mode = boot.DeliverToForegroundProcessGroup
  1204  	}
  1205  
  1206  	args := boot.SignalArgs{
  1207  		CID:   cid,
  1208  		Signo: int32(sig),
  1209  		PID:   pid,
  1210  		Mode:  mode,
  1211  	}
  1212  	if err := s.call(boot.ContMgrSignal, &args, nil); err != nil {
  1213  		return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err)
  1214  	}
  1215  	return nil
  1216  }
  1217  
  1218  // Checkpoint sends the checkpoint call for a container in the sandbox.
  1219  // The statefile will be written to f.
  1220  func (s *Sandbox) Checkpoint(cid string, f *os.File) error {
  1221  	log.Debugf("Checkpoint sandbox %q", s.ID)
  1222  	opt := control.SaveOpts{
  1223  		FilePayload: urpc.FilePayload{
  1224  			Files: []*os.File{f},
  1225  		},
  1226  	}
  1227  
  1228  	if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil {
  1229  		return fmt.Errorf("checkpointing container %q: %w", cid, err)
  1230  	}
  1231  	return nil
  1232  }
  1233  
  1234  // Pause sends the pause call for a container in the sandbox.
  1235  func (s *Sandbox) Pause(cid string) error {
  1236  	log.Debugf("Pause sandbox %q", s.ID)
  1237  	if err := s.call(boot.LifecyclePause, nil, nil); err != nil {
  1238  		return fmt.Errorf("pausing container %q: %w", cid, err)
  1239  	}
  1240  	return nil
  1241  }
  1242  
  1243  // Resume sends the resume call for a container in the sandbox.
  1244  func (s *Sandbox) Resume(cid string) error {
  1245  	log.Debugf("Resume sandbox %q", s.ID)
  1246  	if err := s.call(boot.LifecycleResume, nil, nil); err != nil {
  1247  		return fmt.Errorf("resuming container %q: %w", cid, err)
  1248  	}
  1249  	return nil
  1250  }
  1251  
  1252  // Usage sends the collect call for a container in the sandbox.
  1253  func (s *Sandbox) Usage(Full bool) (control.MemoryUsage, error) {
  1254  	log.Debugf("Usage sandbox %q", s.ID)
  1255  	opts := control.MemoryUsageOpts{Full: Full}
  1256  	var m control.MemoryUsage
  1257  	if err := s.call(boot.UsageCollect, &opts, &m); err != nil {
  1258  		return control.MemoryUsage{}, fmt.Errorf("collecting usage: %w", err)
  1259  	}
  1260  	return m, nil
  1261  }
  1262  
  1263  // UsageFD sends the usagefd call for a container in the sandbox.
  1264  func (s *Sandbox) UsageFD() (*control.MemoryUsageRecord, error) {
  1265  	log.Debugf("Usage sandbox %q", s.ID)
  1266  	opts := control.MemoryUsageFileOpts{Version: 1}
  1267  	var m control.MemoryUsageFile
  1268  	if err := s.call(boot.UsageUsageFD, &opts, &m); err != nil {
  1269  		return nil, fmt.Errorf("collecting usage FD: %w", err)
  1270  	}
  1271  
  1272  	if len(m.FilePayload.Files) != 2 {
   1273  		return nil, fmt.Errorf("expected exactly two FDs, got %d", len(m.FilePayload.Files))
  1274  	}
  1275  	return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1])
  1276  }
  1277  
  1278  // GetRegisteredMetrics returns metric registration data from the sandbox.
  1279  // This data is meant to be used as a way to sanity-check any exported metrics data during the
  1280  // lifetime of the sandbox in order to avoid a compromised sandbox from being able to produce
  1281  // bogus metrics.
  1282  // This returns an error if the sandbox has not requested instrumentation during creation time.
  1283  func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) {
  1284  	if s.RegisteredMetrics == nil {
  1285  		return nil, errors.New("sandbox did not request instrumentation when it was created")
  1286  	}
  1287  	return s.RegisteredMetrics, nil
  1288  }
  1289  
  1290  // ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format.
  1291  func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) {
  1292  	log.Debugf("Metrics export sandbox %q", s.ID)
  1293  	var data control.MetricsExportData
  1294  	if err := s.call(boot.MetricsExport, &opts, &data); err != nil {
  1295  		return nil, err
  1296  	}
  1297  	// Since we do not trust the output of the sandbox as-is, double-check that the options were
  1298  	// respected.
  1299  	if err := opts.Verify(&data); err != nil {
  1300  		return nil, err
  1301  	}
  1302  	return data.Snapshot, nil
  1303  }
  1304  
  1305  // IsRunning returns true if the sandbox or gofer process is running.
  1306  func (s *Sandbox) IsRunning() bool {
  1307  	pid := s.Pid.load()
  1308  	if pid != 0 {
  1309  		// Send a signal 0 to the sandbox process.
  1310  		if err := unix.Kill(pid, 0); err == nil {
  1311  			// Succeeded, process is running.
  1312  			return true
  1313  		}
  1314  	}
  1315  	return false
  1316  }
  1317  
  1318  // Stacks collects and returns all stacks for the sandbox.
  1319  func (s *Sandbox) Stacks() (string, error) {
  1320  	log.Debugf("Stacks sandbox %q", s.ID)
  1321  	var stacks string
  1322  	if err := s.call(boot.DebugStacks, nil, &stacks); err != nil {
  1323  		return "", fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err)
  1324  	}
  1325  	return stacks, nil
  1326  }
  1327  
  1328  // HeapProfile writes a heap profile to the given file.
  1329  func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error {
  1330  	log.Debugf("Heap profile %q", s.ID)
  1331  	opts := control.HeapProfileOpts{
  1332  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1333  		Delay:       delay,
  1334  	}
  1335  	return s.call(boot.ProfileHeap, &opts, nil)
  1336  }
  1337  
  1338  // CPUProfile collects a CPU profile.
  1339  func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error {
  1340  	log.Debugf("CPU profile %q", s.ID)
  1341  	opts := control.CPUProfileOpts{
  1342  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1343  		Duration:    duration,
  1344  	}
  1345  	return s.call(boot.ProfileCPU, &opts, nil)
  1346  }
  1347  
  1348  // BlockProfile writes a block profile to the given file.
  1349  func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error {
  1350  	log.Debugf("Block profile %q", s.ID)
  1351  	opts := control.BlockProfileOpts{
  1352  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1353  		Duration:    duration,
  1354  	}
  1355  	return s.call(boot.ProfileBlock, &opts, nil)
  1356  }
  1357  
  1358  // MutexProfile writes a mutex profile to the given file.
  1359  func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error {
  1360  	log.Debugf("Mutex profile %q", s.ID)
  1361  	opts := control.MutexProfileOpts{
  1362  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1363  		Duration:    duration,
  1364  	}
  1365  	return s.call(boot.ProfileMutex, &opts, nil)
  1366  }
  1367  
  1368  // Trace collects an execution trace.
  1369  func (s *Sandbox) Trace(f *os.File, duration time.Duration) error {
  1370  	log.Debugf("Trace %q", s.ID)
  1371  	opts := control.TraceProfileOpts{
  1372  		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
  1373  		Duration:    duration,
  1374  	}
  1375  	return s.call(boot.ProfileTrace, &opts, nil)
  1376  }
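
        // exampleCPUProfile is an illustrative sketch, not part of the original
        // file. HeapProfile, CPUProfile, BlockProfile, MutexProfile and Trace all
        // share the same shape: the output file is donated to the sandbox via
        // urpc.FilePayload. The output path below is hypothetical.
        func exampleCPUProfile(s *Sandbox) error {
        	f, err := os.Create("/tmp/sandbox-cpu.pprof") // hypothetical path
        	if err != nil {
        		return err
        	}
        	defer f.Close()
        	return s.CPUProfile(f, 10*time.Second)
        }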
  1377  
  1378  // ChangeLogging changes logging options.
  1379  func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
  1380  	log.Debugf("ChangeLogging sandbox %q", s.ID)
  1381  	if err := s.call(boot.LoggingChange, &args, nil); err != nil {
  1382  		return fmt.Errorf("changing sandbox %q logging: %w", s.ID, err)
  1383  	}
  1384  	return nil
  1385  }
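
        // exampleEnableDebugLogging is an illustrative sketch, not part of the
        // original file. The LoggingArgs field names (SetLevel, Level) are
        // assumptions about the sentry control API and should be verified against
        // pkg/sentry/control.
        func exampleEnableDebugLogging(s *Sandbox) error {
        	return s.ChangeLogging(control.LoggingArgs{
        		SetLevel: true,      // assumed field
        		Level:    log.Debug, // assumed field
        	})
        }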
  1386  
  1387  // DestroyContainer destroys the given container. If it is the root container,
  1388  // then the entire sandbox is destroyed.
  1389  func (s *Sandbox) DestroyContainer(cid string) error {
  1390  	if err := s.destroyContainer(cid); err != nil {
  1391  		// If the sandbox isn't running, the container has already been destroyed,
  1392  		// ignore the error in this case.
  1393  		if s.IsRunning() {
  1394  			return err
  1395  		}
  1396  	}
  1397  	return nil
  1398  }
  1399  
  1400  func (s *Sandbox) destroyContainer(cid string) error {
  1401  	if s.IsRootContainer(cid) {
  1402  		log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid)
  1403  		return s.destroy()
  1404  	}
  1405  
  1406  	log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID)
  1407  	if err := s.call(boot.ContMgrDestroySubcontainer, &cid, nil); err != nil {
  1408  		return fmt.Errorf("destroying container %q: %w", cid, err)
  1409  	}
  1410  	return nil
  1411  }
  1412  
  1413  func (s *Sandbox) waitForStopped() error {
  1414  	if s.child {
  1415  		s.statusMu.Lock()
  1416  		defer s.statusMu.Unlock()
  1417  		pid := s.Pid.load()
  1418  		if pid == 0 {
  1419  			return nil
  1420  		}
  1421  		// The sandbox process is a child of the current process,
  1422  		// so we can wait on it to terminate and collect its zombie.
  1423  		if _, err := unix.Wait4(int(pid), &s.status, 0, nil); err != nil {
  1424  			return fmt.Errorf("waiting on sandbox process: %w", err)
  1425  		}
  1426  		s.Pid.store(0)
  1427  		return nil
  1428  	}
  1429  
  1430  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  1431  	defer cancel()
  1432  	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
  1433  	op := func() error {
  1434  		if s.IsRunning() {
  1435  			return fmt.Errorf("sandbox is still running")
  1436  		}
  1437  		return nil
  1438  	}
  1439  	return backoff.Retry(op, b)
  1440  }
  1441  
  1442  // configureStdios changes the ownership of the stdio files to give the sandbox
  1443  // process access to them. This may be skipped depending on the configuration.
  1444  func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error {
  1445  	if conf.Rootless || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
  1446  		// Cannot change ownership without CAP_CHOWN.
  1447  		return nil
  1448  	}
  1449  
  1450  	if s.UID < 0 || s.GID < 0 {
  1451  		panic(fmt.Sprintf("sandbox UID/GID is not set: %d/%d", s.UID, s.GID))
  1452  	}
  1453  	for _, file := range stdios {
  1454  		log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID)
  1455  		if err := file.Chown(s.UID, s.GID); err != nil {
  1456  			if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
  1457  				log.Warningf("can't change owner of %s: %v", file.Name(), err)
  1458  				continue
  1459  			}
  1460  			return err
  1461  		}
  1462  	}
  1463  	return nil
  1464  }
  1465  
  1466  // deviceFileForPlatform opens the device file for the given platform. If the
  1467  // platform does not need a device file, then nil is returned.
  1468  // devicePath may be empty to use a sane platform-specific default.
  1469  func deviceFileForPlatform(name, devicePath string) (*os.File, error) {
  1470  	p, err := platform.Lookup(name)
  1471  	if err != nil {
  1472  		return nil, err
  1473  	}
  1474  
  1475  	f, err := p.OpenDevice(devicePath)
  1476  	if err != nil {
  1477  		return nil, fmt.Errorf("opening device file for platform %q: %w", name, err)
  1478  	}
  1479  	return f, nil
  1480  }
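
        // exampleOpenKVM is an illustrative sketch, not part of the original file:
        // passing an empty devicePath lets the platform choose its default device
        // (e.g. /dev/kvm for the kvm platform).
        func exampleOpenKVM() (*os.File, error) {
        	return deviceFileForPlatform("kvm", "")
        }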
  1481  
  1482  // checkBinaryPermissions verifies that the required permission bits are set
  1483  // on the runsc executable.
  1484  func checkBinaryPermissions(conf *config.Config) error {
  1485  	// All platforms need the other-execute bit.
  1486  	neededBits := os.FileMode(0001)
  1487  	if conf.Platform == "ptrace" {
  1488  		// The ptrace platform also needs the other-read bit.
  1489  		neededBits |= os.FileMode(0004)
  1490  	}
  1491  
  1492  	exePath, err := os.Executable()
  1493  	if err != nil {
  1494  		return fmt.Errorf("getting exe path: %v", err)
  1495  	}
  1496  
  1497  	// Check the permissions of the runsc binary and print an error if it
  1498  	// doesn't match expectations.
  1499  	info, err := os.Stat(exePath)
  1500  	if err != nil {
  1501  		return fmt.Errorf("stat %q: %v", exePath, err)
  1502  	}
  1503  
  1504  	if info.Mode().Perm()&neededBits != neededBits {
  1505  		return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
  1506  	}
  1507  	return nil
  1508  }
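
        // exampleHasNeededBits is an illustrative sketch, not part of the original
        // file, of the bitmask test above: mode 0755 contains both the
        // other-execute (0001) and other-read (0004) bits, while 0700 contains
        // neither, so a runsc binary installed as 0700 fails the check on the
        // ptrace platform.
        func exampleHasNeededBits(mode os.FileMode) bool {
        	needed := os.FileMode(0005) // other read+execute, as ptrace requires
        	return mode.Perm()&needed == needed
        }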
  1509  
  1510  // CgroupsReadControlFile reads a single cgroupfs control file in the sandbox.
  1511  func (s *Sandbox) CgroupsReadControlFile(file control.CgroupControlFile) (string, error) {
  1512  	log.Debugf("CgroupsReadControlFile sandbox %q", s.ID)
  1513  	args := control.CgroupsReadArgs{
  1514  		Args: []control.CgroupsReadArg{
  1515  			{
  1516  				File: file,
  1517  			},
  1518  		},
  1519  	}
  1520  	var out control.CgroupsResults
  1521  	if err := s.call(boot.CgroupsReadControlFiles, &args, &out); err != nil {
  1522  		return "", err
  1523  	}
  1524  	if len(out.Results) != 1 {
  1525  		return "", fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
  1526  	}
  1527  	return out.Results[0].Unpack()
  1528  }
  1529  
  1530  // CgroupsWriteControlFile writes a single cgroupfs control file in the sandbox.
  1531  func (s *Sandbox) CgroupsWriteControlFile(file control.CgroupControlFile, value string) error {
  1532  	log.Debugf("CgroupsWriteControlFile sandbox %q", s.ID)
  1533  	args := control.CgroupsWriteArgs{
  1534  		Args: []control.CgroupsWriteArg{
  1535  			{
  1536  				File:  file,
  1537  				Value: value,
  1538  			},
  1539  		},
  1540  	}
  1541  	var out control.CgroupsResults
  1542  	if err := s.call(boot.CgroupsWriteControlFiles, &args, &out); err != nil {
  1543  		return err
  1544  	}
  1545  	if len(out.Results) != 1 {
  1546  		return fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
  1547  	}
  1548  	return out.Results[0].AsError()
  1549  }
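
        // exampleCgroupRoundTrip is an illustrative sketch, not part of the
        // original file: it writes a cgroupfs control file inside the sandbox and
        // reads the value back. Constructing the control.CgroupControlFile value is
        // left to the caller, since its field layout is defined by the sentry
        // control API.
        func exampleCgroupRoundTrip(s *Sandbox, file control.CgroupControlFile, value string) (string, error) {
        	if err := s.CgroupsWriteControlFile(file, value); err != nil {
        		return "", err
        	}
        	return s.CgroupsReadControlFile(file)
        }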
  1550  
  1551  // fixPidns looks at the PID namespace path. If that path corresponds to the
  1552  // sandbox process's PID namespace, the spec is changed so that the container
  1553  // joins the sandbox root namespace.
  1554  func (s *Sandbox) fixPidns(spec *specs.Spec) {
  1555  	pidns, ok := specutils.GetNS(specs.PIDNamespace, spec)
  1556  	if !ok {
  1557  		// pidns was not set, nothing to fix.
  1558  		return
  1559  	}
  1560  	if pidns.Path != fmt.Sprintf("/proc/%d/ns/pid", s.Pid.load()) {
  1561  		// The PID namespace does not correspond to the sandbox's, nothing to fix.
  1562  		return
  1563  	}
  1564  
  1565  	for i := range spec.Linux.Namespaces {
  1566  		if spec.Linux.Namespaces[i].Type == specs.PIDNamespace {
  1567  			// Removing the namespace makes the container join the sandbox root
  1568  			// namespace.
  1569  			log.Infof("Fixing PID namespace in spec from %q to make the container join the sandbox root namespace", pidns.Path)
  1570  			spec.Linux.Namespaces = append(spec.Linux.Namespaces[:i], spec.Linux.Namespaces[i+1:]...)
  1571  			return
  1572  		}
  1573  	}
  1574  	panic("unreachable")
  1575  }
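
        // exampleFixPidns is an illustrative sketch, not part of the original file:
        // a spec whose PID namespace path points at the sandbox process has that
        // namespace entry removed, so the container joins the sandbox's root PID
        // namespace.
        func exampleFixPidns(s *Sandbox) *specs.Spec {
        	spec := &specs.Spec{
        		Linux: &specs.Linux{
        			Namespaces: []specs.LinuxNamespace{
        				{Type: specs.PIDNamespace, Path: fmt.Sprintf("/proc/%d/ns/pid", s.Pid.load())},
        			},
        		},
        	}
        	s.fixPidns(spec) // Namespaces is now empty.
        	return spec
        }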
  1576  
  1577  // ConfigureCmdForRootless configures cmd to donate a socket FD that can be
  1578  // used to synchronize userns configuration.
  1579  func ConfigureCmdForRootless(cmd *exec.Cmd, donations *donation.Agency) (*os.File, error) {
  1580  	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
  1581  	if err != nil {
  1582  		return nil, err
  1583  	}
  1584  	f := os.NewFile(uintptr(fds[1]), "userns sync other FD")
  1585  	donations.DonateAndClose("sync-userns-fd", f)
  1586  	if cmd.SysProcAttr == nil {
  1587  		cmd.SysProcAttr = &unix.SysProcAttr{}
  1588  	}
  1589  	cmd.SysProcAttr.AmbientCaps = []uintptr{
  1590  		// Same as `cap` in cmd/gofer.go.
  1591  		unix.CAP_CHOWN,
  1592  		unix.CAP_DAC_OVERRIDE,
  1593  		unix.CAP_DAC_READ_SEARCH,
  1594  		unix.CAP_FOWNER,
  1595  		unix.CAP_FSETID,
  1596  		unix.CAP_SYS_CHROOT,
  1597  		// Needed for setuid(2)/setgid(2).
  1598  		unix.CAP_SETUID,
  1599  		unix.CAP_SETGID,
  1600  		// Needed for chroot.
  1601  		unix.CAP_SYS_ADMIN,
  1602  		// Needed to be able to clear bounding set (PR_CAPBSET_DROP).
  1603  		unix.CAP_SETPCAP,
  1604  	}
  1605  	return os.NewFile(uintptr(fds[0]), "userns sync FD"), nil
  1606  }
  1607  
  1608  // SetUserMappings uses newuidmap/newgidmap programs to set up user ID mappings
  1609  // for process pid.
  1610  func SetUserMappings(spec *specs.Spec, pid int) error {
  1611  	log.Debugf("Setting user mappings")
  1612  	args := []string{strconv.Itoa(pid)}
  1613  	for _, idMap := range spec.Linux.UIDMappings {
  1614  		log.Infof("Mapping host uid %d to container uid %d (size=%d)",
  1615  			idMap.HostID, idMap.ContainerID, idMap.Size)
  1616  		args = append(args,
  1617  			strconv.Itoa(int(idMap.ContainerID)),
  1618  			strconv.Itoa(int(idMap.HostID)),
  1619  			strconv.Itoa(int(idMap.Size)),
  1620  		)
  1621  	}
  1622  
  1623  	out, err := exec.Command("newuidmap", args...).CombinedOutput()
  1624  	log.Debugf("newuidmap: %#v\n%s", args, out)
  1625  	if err != nil {
  1626  		return fmt.Errorf("newuidmap failed: %w", err)
  1627  	}
  1628  
  1629  	args = []string{strconv.Itoa(pid)}
  1630  	for _, idMap := range spec.Linux.GIDMappings {
  1631  		log.Infof("Mapping host gid %d to container gid %d (size=%d)",
  1632  			idMap.HostID, idMap.ContainerID, idMap.Size)
  1633  		args = append(args,
  1634  			strconv.Itoa(int(idMap.ContainerID)),
  1635  			strconv.Itoa(int(idMap.HostID)),
  1636  			strconv.Itoa(int(idMap.Size)),
  1637  		)
  1638  	}
  1639  	out, err = exec.Command("newgidmap", args...).CombinedOutput()
  1640  	log.Debugf("newgidmap: %#v\n%s", args, out)
  1641  	if err != nil {
  1642  		return fmt.Errorf("newgidmap failed: %w", err)
  1643  	}
  1644  	return nil
  1645  }
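
        // exampleRootlessStart is an illustrative sketch, not part of the original
        // file, showing how ConfigureCmdForRootless and SetUserMappings compose.
        // The single mapping below makes container uid/gid 0 correspond to host
        // uid/gid 1000, i.e. it results in running `newuidmap <pid> 0 1000 1` and
        // `newgidmap <pid> 0 1000 1`. The userns sync-socket handshake between
        // parent and child is elided.
        func exampleRootlessStart(cmd *exec.Cmd, donations *donation.Agency) error {
        	spec := &specs.Spec{
        		Linux: &specs.Linux{
        			UIDMappings: []specs.LinuxIDMapping{{ContainerID: 0, HostID: 1000, Size: 1}},
        			GIDMappings: []specs.LinuxIDMapping{{ContainerID: 0, HostID: 1000, Size: 1}},
        		},
        	}
        	syncFD, err := ConfigureCmdForRootless(cmd, donations)
        	if err != nil {
        		return err
        	}
        	defer syncFD.Close()
        	if err := cmd.Start(); err != nil {
        		return err
        	}
        	return SetUserMappings(spec, cmd.Process.Pid)
        }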