github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/container/container.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package container creates and manipulates containers.
    16  package container
    17  
    18  import (
    19  	"bufio"
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"io/ioutil"
    24  	"os"
    25  	"os/exec"
    26  	"path"
    27  	"regexp"
    28  	"strconv"
    29  	"strings"
    30  	"syscall"
    31  	"time"
    32  
    33  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    34  	"github.com/MerlinKodo/gvisor/pkg/cleanup"
    35  	"github.com/MerlinKodo/gvisor/pkg/log"
    36  	"github.com/MerlinKodo/gvisor/pkg/sentry/control"
    37  	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
    38  	"github.com/MerlinKodo/gvisor/pkg/sighandling"
    39  	"github.com/MerlinKodo/gvisor/pkg/state/statefile"
    40  	"github.com/MerlinKodo/gvisor/runsc/boot"
    41  	"github.com/MerlinKodo/gvisor/runsc/cgroup"
    42  	"github.com/MerlinKodo/gvisor/runsc/config"
    43  	"github.com/MerlinKodo/gvisor/runsc/console"
    44  	"github.com/MerlinKodo/gvisor/runsc/donation"
    45  	"github.com/MerlinKodo/gvisor/runsc/sandbox"
    46  	"github.com/MerlinKodo/gvisor/runsc/specutils"
    47  	"github.com/cenkalti/backoff"
    48  	specs "github.com/opencontainers/runtime-spec/specs-go"
    49  	"golang.org/x/sys/unix"
    50  )
    51  
    52  const cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent"
    53  
    54  // validateID validates the container id.
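        // For example, validateID("my-container_01") returns nil, while an ID
        // containing a space or '/' (such as "a/b") is rejected.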
    55  func validateID(id string) error {
    56  	// See libcontainer/factory_linux.go.
    57  	idRegex := regexp.MustCompile(`^[\w+\.-]+$`)
    58  	if !idRegex.MatchString(id) {
    59  		return fmt.Errorf("invalid container id: %v", id)
    60  	}
    61  	return nil
    62  }
    63  
    64  // Container represents a containerized application. When running, the
    65  // container is associated with a single Sandbox.
    66  //
    67  // Container metadata can be saved and loaded to disk. Within a root directory,
    68  // we maintain subdirectories for each container named with the container id.
    69  // The container metadata is stored as JSON within the container directory
    70  // in a file named "meta.json". This metadata format is defined by us and is
    71  // not part of the OCI spec.
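        //
        // On disk this looks roughly like the following (the exact directory naming
        // is handled by StateFile below):
        //
        //	<rootDir>/
        //	    <container id>/
        //	        meta.json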
    72  //
    73  // Containers must write their metadata files after any change to their internal
    74  // states. The entire container directory is deleted when the container is
    75  // destroyed.
    76  //
    77  // When the container is stopped, all processes that belong to the container
    78  // must be stopped before Destroy() returns. containerd makes roughly the
    79  // following calls to stop a container:
    80  //   - First it attempts to kill the container process with
    81  //     'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a
    82  //     separate thread, it's waiting on the container. As soon as the wait
    83  //     returns, it moves on to the next step:
    84  //   - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to
    85  //     the container. 'kill --all SIGKILL' waits for all processes before
    86  //     returning.
    87  //   - Containerd waits for stdin, stdout and stderr to drain and be closed.
    88  //   - It calls 'runsc delete'. The runc implementation sends 'kill --all SIGKILL'
    89  //     once again just to be sure, waits, and then proceeds with the remaining teardown.
    90  //
    91  // Container is thread-unsafe.
    92  type Container struct {
    93  	// ID is the container ID.
    94  	ID string `json:"id"`
    95  
    96  	// Spec is the OCI runtime spec that configures this container.
    97  	Spec *specs.Spec `json:"spec"`
    98  
    99  	// BundleDir is the directory containing the container bundle.
   100  	BundleDir string `json:"bundleDir"`
   101  
   102  	// CreatedAt is the time the container was created.
   103  	CreatedAt time.Time `json:"createdAt"`
   104  
   105  	// Owner is the container owner.
   106  	Owner string `json:"owner"`
   107  
   108  	// ConsoleSocket is the path to a unix domain socket that will receive
   109  	// the console FD.
   110  	ConsoleSocket string `json:"consoleSocket"`
   111  
   112  	// Status is the current container Status.
   113  	Status Status `json:"status"`
   114  
   115  	// GoferPid is the PID of the gofer running alongside the sandbox. May
   116  	// be 0 if the gofer has been killed.
   117  	GoferPid int `json:"goferPid"`
   118  
   119  	// Sandbox is the sandbox this container is running in. It's set when the
   120  	// container is created and reset when the sandbox is destroyed.
   121  	Sandbox *sandbox.Sandbox `json:"sandbox"`
   122  
   123  	// CompatCgroup has the cgroup configuration for the container. For the single
   124  	// container case, the container cgroup is set in `c.Sandbox` only. CompatCgroup
   125  	// is only set for multi-container, where the `c.Sandbox` cgroup represents
   126  	// the entire pod.
   127  	//
   128  	// Note that CompatCgroup is created only for compatibility with tools
   129  	// that expect container cgroups to exist. Setting limits here makes no change
   130  	// to the container in question.
   131  	CompatCgroup cgroup.CgroupJSON `json:"compatCgroup"`
   132  
   133  	// Saver handles load from/save to the state file safely from multiple
   134  	// processes.
   135  	Saver StateFile `json:"saver"`
   136  
   137  	// OverlayMediums contains information about how the gofer mounts have been
   138  	// overlaid. The first entry is for rootfs and the following entries are for
   139  	// bind mounts in Spec.Mounts (in the same order).
   140  	OverlayMediums boot.OverlayMediumFlags `json:"overlayMediums"`
   141  
   142  	//
   143  	// Fields below this line are not saved in the state file and will not
   144  	// be preserved across commands.
   145  	//
   146  
   147  	// goferIsChild is set if a gofer process is a child of the current process.
   148  	//
   149  	// This field isn't saved to json, because only a creator of a gofer
   150  	// process will have it as a child process.
   151  	goferIsChild bool `nojson:"true"`
   152  }
   153  
   154  // Args is used to configure a new container.
   155  type Args struct {
   156  	// ID is the container's unique identifier.
   157  	ID string
   158  
   159  	// Spec is the OCI spec that describes the container.
   160  	Spec *specs.Spec
   161  
   162  	// BundleDir is the directory containing the container bundle.
   163  	BundleDir string
   164  
   165  	// ConsoleSocket is the path to a unix domain socket that will receive
   166  	// the console FD. It may be empty.
   167  	ConsoleSocket string
   168  
   169  	// PIDFile is the filename where the container's root process PID will be
   170  	// written to. It may be empty.
   171  	PIDFile string
   172  
   173  	// UserLog is the filename to send user-visible logs to. It may be empty.
   174  	//
   175  	// It only applies for the init container.
   176  	UserLog string
   177  
   178  	// Attached indicates that the sandbox lifecycle is tied to the caller.
   179  	// If the caller exits, the sandbox should exit too.
   180  	//
   181  	// It only applies for the init container.
   182  	Attached bool
   183  
   184  	// PassFiles are user-supplied files from the host to be exposed to the
   185  	// sandboxed app.
   186  	PassFiles map[int]*os.File
   187  
   188  	// ExecFile is the host file used for program execution.
   189  	ExecFile *os.File
   190  }
   191  
   192  // New creates the container in a new Sandbox process, unless the metadata
   193  // indicates that an existing Sandbox should be used. The caller must call
   194  // Destroy() on the container.
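        //
        // A typical call sequence looks roughly like the following (an illustrative
        // sketch; Run below is the real helper and handles errors and cleanup):
        //
        //	c, err := New(conf, args)
        //	if err != nil { ... }
        //	defer c.Destroy()
        //	err = c.Start(conf)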
   195  func New(conf *config.Config, args Args) (*Container, error) {
   196  	log.Debugf("Create container, cid: %s, rootDir: %q", args.ID, conf.RootDir)
   197  	if err := validateID(args.ID); err != nil {
   198  		return nil, err
   199  	}
   200  
   201  	if err := os.MkdirAll(conf.RootDir, 0711); err != nil {
   202  		return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err)
   203  	}
   204  
   205  	if err := modifySpecForDirectfs(conf, args.Spec); err != nil {
   206  		return nil, fmt.Errorf("failed to modify spec for directfs: %v", err)
   207  	}
   208  
   209  	sandboxID := args.ID
   210  	if !isRoot(args.Spec) {
   211  		var ok bool
   212  		sandboxID, ok = specutils.SandboxID(args.Spec)
   213  		if !ok {
   214  			return nil, fmt.Errorf("no sandbox ID found when creating container")
   215  		}
   216  	}
   217  
   218  	c := &Container{
   219  		ID:            args.ID,
   220  		Spec:          args.Spec,
   221  		ConsoleSocket: args.ConsoleSocket,
   222  		BundleDir:     args.BundleDir,
   223  		Status:        Creating,
   224  		CreatedAt:     time.Now(),
   225  		Owner:         os.Getenv("USER"),
   226  		Saver: StateFile{
   227  			RootDir: conf.RootDir,
   228  			ID: FullID{
   229  				SandboxID:   sandboxID,
   230  				ContainerID: args.ID,
   231  			},
   232  		},
   233  	}
   234  	// The Cleanup object cleans up partially created containers when an error
   235  	// occurs. Any errors occurring during cleanup itself are ignored.
   236  	cu := cleanup.Make(func() { _ = c.Destroy() })
   237  	defer cu.Clean()
   238  
   239  	// Lock the container metadata file to prevent concurrent creations of
   240  	// containers with the same id.
   241  	if err := c.Saver.LockForNew(); err != nil {
   242  		return nil, fmt.Errorf("cannot lock container metadata file: %w", err)
   243  	}
   244  	defer c.Saver.UnlockOrDie()
   245  
   246  	// If the metadata annotations indicate that this container should be started
   247  	// in an existing sandbox, we must do so. These are the possible metadata
   248  	// annotation states:
   249  	//   1. No annotations: it means that there is a single container and this
   250  	//      container is obviously the root. Both container and sandbox share the
   251  	//      ID.
   252  	//   2. Container type == sandbox: it means this is the root container
   253  	//      starting the sandbox. Both container and sandbox share the same ID.
   254  	//   3. Container type == container: it means this is a subcontainer of an
   255  	//      already started sandbox. In this case, container ID is different than
   256  	//      the sandbox ID.
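        	//
        	// For illustration, containerd's CRI integration conveys the container
        	// type and sandbox ID via annotations along these lines (key names are
        	// an example here; see specutils for what is actually recognized):
        	//
        	//	"io.kubernetes.cri.container-type": "container"
        	//	"io.kubernetes.cri.sandbox-id":     "<sandbox id>"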
   257  	if isRoot(args.Spec) {
   258  		log.Debugf("Creating new sandbox for container, cid: %s", args.ID)
   259  
   260  		if args.Spec.Linux == nil {
   261  			args.Spec.Linux = &specs.Linux{}
   262  		}
   263  		// Don't force the use of cgroups in tests because they lack permission to do so.
   264  		if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   265  			args.Spec.Linux.CgroupsPath = "/" + args.ID
   266  		}
   267  		var subCgroup, parentCgroup, containerCgroup cgroup.Cgroup
   268  		if !conf.IgnoreCgroups {
   269  			var err error
   270  
   271  			// Create and join cgroup before processes are created to ensure they are
   272  			// part of the cgroup from the start (and all their children processes).
   273  			parentCgroup, subCgroup, err = c.setupCgroupForRoot(conf, args.Spec)
   274  			if err != nil {
   275  				return nil, fmt.Errorf("cannot set up cgroup for root: %w", err)
   276  			}
   277  			// Join the child cgroup when using cgroupfs. Joining non leaf-node
   278  			// cgroups is illegal in cgroupsv2 and will return EBUSY.
   279  			if subCgroup != nil && !conf.SystemdCgroup && cgroup.IsOnlyV2() {
   280  				containerCgroup = subCgroup
   281  			} else {
   282  				containerCgroup = parentCgroup
   283  			}
   284  		}
   285  		c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup}
   286  		mountHints, err := boot.NewPodMountHints(args.Spec)
   287  		if err != nil {
   288  			return nil, fmt.Errorf("error creating pod mount hints: %w", err)
   289  		}
   290  		overlayFilestoreFiles, overlayMediums, err := c.createOverlayFilestores(conf.GetOverlay2(), mountHints)
   291  		if err != nil {
   292  			return nil, err
   293  		}
   294  		c.OverlayMediums = overlayMediums
   295  		if err := nvProxyPreGoferHostSetup(args.Spec, conf); err != nil {
   296  			return nil, err
   297  		}
   298  		if err := runInCgroup(containerCgroup, func() error {
   299  			ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached)
   300  			if err != nil {
   301  				return fmt.Errorf("cannot create gofer process: %w", err)
   302  			}
   303  
   304  			// Start a new sandbox for this container. Any errors after this point
   305  			// must destroy the container.
   306  			sandArgs := &sandbox.Args{
   307  				ID:                    sandboxID,
   308  				Spec:                  args.Spec,
   309  				BundleDir:             args.BundleDir,
   310  				ConsoleSocket:         args.ConsoleSocket,
   311  				UserLog:               args.UserLog,
   312  				IOFiles:               ioFiles,
   313  				MountsFile:            specFile,
   314  				Cgroup:                containerCgroup,
   315  				Attached:              args.Attached,
   316  				OverlayFilestoreFiles: overlayFilestoreFiles,
   317  				OverlayMediums:        overlayMediums,
   318  				MountHints:            mountHints,
   319  				PassFiles:             args.PassFiles,
   320  				ExecFile:              args.ExecFile,
   321  			}
   322  			if specutils.GPUFunctionalityRequested(args.Spec, conf) {
   323  				// Expose all Nvidia devices in /dev/, because we don't know what
   324  				// devices future subcontainers will want.
   325  				searchDir := "/"
   326  				if conf.NVProxyDocker {
   327  					// For single-container use cases like Docker, the container rootfs
   328  					// is populated with the devices that need to be exposed. Scan that.
   329  					// This scan needs to happen outside the sandbox process because
   330  					// /rootfs/dev/nvidia* mounts made in gofer may not be propagated to
   331  					// sandbox's mount namespace.
   332  					searchDir = args.Spec.Root.Path
   333  				}
   334  				sandArgs.NvidiaDevMinors, err = specutils.FindAllGPUDevices(searchDir)
   335  				if err != nil {
   336  					return fmt.Errorf("FindAllGPUDevices: %w", err)
   337  				}
   338  			}
   339  			sand, err := sandbox.New(conf, sandArgs)
   340  			if err != nil {
   341  				return fmt.Errorf("cannot create sandbox: %w", err)
   342  			}
   343  			c.Sandbox = sand
   344  			return nil
   345  
   346  		}); err != nil {
   347  			return nil, err
   348  		}
   349  	} else {
   350  		log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID)
   351  
   352  		// Find the sandbox associated with this ID.
   353  		fullID := FullID{
   354  			SandboxID:   sandboxID,
   355  			ContainerID: sandboxID,
   356  		}
   357  		sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true})
   358  		if err != nil {
   359  			return nil, fmt.Errorf("cannot load sandbox: %w", err)
   360  		}
   361  		c.Sandbox = sb.Sandbox
   362  
   363  		subCgroup, err := c.setupCgroupForSubcontainer(conf, args.Spec)
   364  		if err != nil {
   365  			return nil, err
   366  		}
   367  		c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup}
   368  
   369  		// If the console control socket file is provided, then create a new
   370  		// pty master/slave pair and send the TTY to the sandbox process.
   371  		var tty *os.File
   372  		if c.ConsoleSocket != "" {
   373  			// Create a new TTY pair and send the master on the provided socket.
   374  			var err error
   375  			tty, err = console.NewWithSocket(c.ConsoleSocket)
   376  			if err != nil {
   377  				return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err)
   378  			}
   379  			// The tty file is transferred to the sandbox, so it can be closed here.
   380  			defer tty.Close()
   381  		}
   382  
   383  		if err := c.Sandbox.CreateSubcontainer(conf, c.ID, tty); err != nil {
   384  			return nil, fmt.Errorf("cannot create subcontainer: %w", err)
   385  		}
   386  	}
   387  	c.changeStatus(Created)
   388  
   389  	// Save the metadata file.
   390  	if err := c.saveLocked(); err != nil {
   391  		return nil, err
   392  	}
   393  
   394  	// "If any prestart hook fails, the runtime MUST generate an error,
   395  	// stop and destroy the container" -OCI spec.
   396  	if c.Spec.Hooks != nil {
   397  		// Even though the hook name is Prestart, runc used to call it from create.
   398  		// For this reason, it's now deprecated, but the spec requires it to be
   399  		// called *before* CreateRuntime and CreateRuntime must be called in create.
   400  		//
   401  		// "For runtimes that implement the deprecated prestart hooks as
   402  		// createRuntime hooks, createRuntime hooks MUST be called after the
   403  		// prestart hooks."
   404  		if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil {
   405  			return nil, err
   406  		}
   407  		if err := executeHooks(c.Spec.Hooks.CreateRuntime, c.State()); err != nil {
   408  			return nil, err
   409  		}
   410  		if len(c.Spec.Hooks.CreateContainer) > 0 {
   411  			log.Warningf("CreateContainer hook skipped because running inside container namespace is not supported")
   412  		}
   413  	}
   414  
   415  	// Write the PID file. Containerd considers the call to create complete after
   416  	// this file is created, so it must be the last thing we do.
   417  	if args.PIDFile != "" {
   418  		if err := ioutil.WriteFile(args.PIDFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil {
   419  			return nil, fmt.Errorf("error writing PID file: %v", err)
   420  		}
   421  	}
   422  
   423  	cu.Release()
   424  	return c, nil
   425  }
   426  
   427  // Start starts running the containerized process inside the sandbox.
   428  func (c *Container) Start(conf *config.Config) error {
   429  	log.Debugf("Start container, cid: %s", c.ID)
   430  
   431  	if err := c.Saver.lock(BlockAcquire); err != nil {
   432  		return err
   433  	}
   434  	unlock := cleanup.Make(c.Saver.UnlockOrDie)
   435  	defer unlock.Clean()
   436  
   437  	if err := c.requireStatus("start", Created); err != nil {
   438  		return err
   439  	}
   440  
   441  	// "If any prestart hook fails, the runtime MUST generate an error,
   442  	// stop and destroy the container" -OCI spec.
   443  	if c.Spec.Hooks != nil && len(c.Spec.Hooks.StartContainer) > 0 {
   444  		log.Warningf("StartContainer hook skipped because running inside container namespace is not supported")
   445  	}
   446  
   447  	if isRoot(c.Spec) {
   448  		if err := c.Sandbox.StartRoot(conf); err != nil {
   449  			return err
   450  		}
   451  	} else {
   452  		overlayFilestoreFiles, overlayMediums, err := c.createOverlayFilestores(conf.GetOverlay2(), c.Sandbox.MountHints)
   453  		if err != nil {
   454  			return err
   455  		}
   456  		c.OverlayMediums = overlayMediums
   457  		// Join the cgroup to start the gofer process, ensuring it's part of the
   458  		// cgroup from the start (along with all of its child processes).
   459  		if err := runInCgroup(c.Sandbox.CgroupJSON.Cgroup, func() error {
   460  			// Create the gofer process.
   461  			goferFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false)
   462  			if err != nil {
   463  				return err
   464  			}
   465  			defer func() {
   466  				_ = mountsFile.Close()
   467  				for _, f := range goferFiles {
   468  					_ = f.Close()
   469  				}
   470  			}()
   471  
   472  			cleanMounts, err := specutils.ReadMounts(mountsFile)
   473  			if err != nil {
   474  				return fmt.Errorf("reading mounts file: %v", err)
   475  			}
   476  			c.Spec.Mounts = cleanMounts
   477  
   478  			// Set up stdios if the container is not using a terminal. Otherwise the
   479  			// TTY was already set up in create.
   480  			var stdios []*os.File
   481  			if !c.Spec.Process.Terminal {
   482  				stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr}
   483  			}
   484  
   485  			return c.Sandbox.StartSubcontainer(c.Spec, conf, c.ID, stdios, goferFiles, overlayFilestoreFiles, overlayMediums)
   486  		}); err != nil {
   487  			return err
   488  		}
   489  	}
   490  
   491  	// "If any poststart hook fails, the runtime MUST log a warning, but
   492  	// the remaining hooks and lifecycle continue as if the hook had
   493  	// succeeded" -OCI spec.
   494  	if c.Spec.Hooks != nil {
   495  		executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State())
   496  	}
   497  
   498  	c.changeStatus(Running)
   499  	if err := c.saveLocked(); err != nil {
   500  		return err
   501  	}
   502  
   503  	// Release lock before adjusting OOM score because the lock is acquired there.
   504  	unlock.Clean()
   505  
   506  	// Adjust the oom_score_adj for sandbox. This must be done after saveLocked().
   507  	if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Spec, c.Saver.RootDir, false); err != nil {
   508  		return err
   509  	}
   510  
   511  	// Apply the container's oom_score_adj to the gofer, since the gofer is
   512  	// dedicated to the container, in case it uses up too much memory.
   513  	return c.adjustGoferOOMScoreAdj()
   514  }
   515  
   516  // Restore takes a container and replaces its kernel and file system
   517  // to restore it from the given state file.
   518  func (c *Container) Restore(conf *config.Config, restoreFile string) error {
   519  	log.Debugf("Restore container, cid: %s", c.ID)
   520  	if err := c.Saver.lock(BlockAcquire); err != nil {
   521  		return err
   522  	}
   523  	defer c.Saver.UnlockOrDie()
   524  
   525  	if err := c.requireStatus("restore", Created); err != nil {
   526  		return err
   527  	}
   528  
   529  	// "If any prestart hook fails, the runtime MUST generate an error,
   530  	// stop and destroy the container" -OCI spec.
   531  	if c.Spec.Hooks != nil && len(c.Spec.Hooks.StartContainer) > 0 {
   532  		log.Warningf("StartContainer hook skipped because running inside container namespace is not supported")
   533  	}
   534  
   535  	if err := c.Sandbox.Restore(conf, c.ID, restoreFile); err != nil {
   536  		return err
   537  	}
   538  	c.changeStatus(Running)
   539  	return c.saveLocked()
   540  }
   541  
   542  // Run is a helper that calls Create + Start + Wait.
   543  func Run(conf *config.Config, args Args) (unix.WaitStatus, error) {
   544  	log.Debugf("Run container, cid: %s, rootDir: %q", args.ID, conf.RootDir)
   545  	c, err := New(conf, args)
   546  	if err != nil {
   547  		return 0, fmt.Errorf("creating container: %v", err)
   548  	}
   549  	// Clean up partially created container if an error occurs.
   550  	// Any errors returned by Destroy() itself are ignored.
   551  	cu := cleanup.Make(func() {
   552  		c.Destroy()
   553  	})
   554  	defer cu.Clean()
   555  
   556  	if conf.RestoreFile != "" {
   557  		log.Debugf("Restore: %v", conf.RestoreFile)
   558  		if err := c.Restore(conf, conf.RestoreFile); err != nil {
   559  			return 0, fmt.Errorf("restoring container: %v", err)
   560  		}
   561  	} else {
   562  		if err := c.Start(conf); err != nil {
   563  			return 0, fmt.Errorf("starting container: %v", err)
   564  		}
   565  	}
   566  
   567  	// If we allocate a terminal, forward signals to the sandbox process.
   568  	// Otherwise, Ctrl+C will terminate this process and its children,
   569  	// including the terminal.
   570  	if c.Spec.Process.Terminal {
   571  		stopForwarding := c.ForwardSignals(0, true /* fgProcess */)
   572  		defer stopForwarding()
   573  	}
   574  
   575  	if args.Attached {
   576  		return c.Wait()
   577  	}
   578  	cu.Release()
   579  	return 0, nil
   580  }
   581  
   582  // Execute runs the specified command in the container. It returns the PID of
   583  // the newly created process.
   584  func (c *Container) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) {
   585  	log.Debugf("Execute in container, cid: %s, args: %+v", c.ID, args)
   586  	if err := c.requireStatus("execute in", Created, Running); err != nil {
   587  		return 0, err
   588  	}
   589  	args.ContainerID = c.ID
   590  	return c.Sandbox.Execute(conf, args)
   591  }
   592  
   593  // Event returns events for the container.
   594  func (c *Container) Event() (*boot.EventOut, error) {
   595  	log.Debugf("Getting events for container, cid: %s", c.ID)
   596  	if err := c.requireStatus("get events for", Created, Running, Paused); err != nil {
   597  		return nil, err
   598  	}
   599  	event, err := c.Sandbox.Event(c.ID)
   600  	if err != nil {
   601  		return nil, err
   602  	}
   603  
   604  	// Some stats can utilize host cgroups for accuracy.
   605  	c.populateStats(event)
   606  
   607  	return event, nil
   608  }
   609  
   610  // PortForward starts port forwarding to the container.
   611  func (c *Container) PortForward(opts *boot.PortForwardOpts) error {
   612  	if err := c.requireStatus("port forward", Running); err != nil {
   613  		return err
   614  	}
   615  	opts.ContainerID = c.ID
   616  	return c.Sandbox.PortForward(opts)
   617  }
   618  
   619  // SandboxPid returns the PID of the sandbox the container is running in, or -1
   620  // if the container is not running.
   621  func (c *Container) SandboxPid() int {
   622  	if err := c.requireStatus("get PID", Created, Running, Paused); err != nil {
   623  		return -1
   624  	}
   625  	return c.Sandbox.Getpid()
   626  }
   627  
   628  // Wait waits for the container to exit, and returns its WaitStatus.
   629  // Calling Wait on a stopped container is needed to retrieve its exit status;
   630  // in that case Wait returns immediately.
   631  func (c *Container) Wait() (unix.WaitStatus, error) {
   632  	log.Debugf("Wait on container, cid: %s", c.ID)
   633  	ws, err := c.Sandbox.Wait(c.ID)
   634  	if err == nil {
   635  		// Wait succeeded, container is not running anymore.
   636  		c.changeStatus(Stopped)
   637  	}
   638  	return ws, err
   639  }
   640  
   641  // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and
   642  // returns its WaitStatus.
   643  func (c *Container) WaitRootPID(pid int32) (unix.WaitStatus, error) {
   644  	log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID)
   645  	if !c.IsSandboxRunning() {
   646  		return 0, fmt.Errorf("sandbox is not running")
   647  	}
   648  	return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
   649  }
   650  
   651  // WaitPID waits for process 'pid' in the container's PID namespace and returns
   652  // its WaitStatus.
   653  func (c *Container) WaitPID(pid int32) (unix.WaitStatus, error) {
   654  	log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID)
   655  	if !c.IsSandboxRunning() {
   656  		return 0, fmt.Errorf("sandbox is not running")
   657  	}
   658  	return c.Sandbox.WaitPID(c.ID, pid)
   659  }
   660  
   661  // SignalContainer sends the signal to the container. If all is true and signal
   662  // is SIGKILL, then waits for all processes to exit before returning.
   663  // SignalContainer returns an error if the container is already stopped.
   664  // TODO(b/113680494): Distinguish different error types.
   665  func (c *Container) SignalContainer(sig unix.Signal, all bool) error {
   666  	log.Debugf("Signal container, cid: %s, signal: %v (%d)", c.ID, sig, sig)
   667  	// Signaling container in Stopped state is allowed. When all=false,
   668  	// an error will be returned anyway; when all=true, this allows
   669  	// sending signal to other processes inside the container even
   670  	// after the init process exits. This is especially useful for
   671  	// container cleanup.
   672  	if err := c.requireStatus("signal", Running, Stopped); err != nil {
   673  		return err
   674  	}
   675  	if !c.IsSandboxRunning() {
   676  		return fmt.Errorf("sandbox is not running")
   677  	}
   678  	return c.Sandbox.SignalContainer(c.ID, sig, all)
   679  }
   680  
   681  // SignalProcess sends sig to a specific process in the container.
   682  func (c *Container) SignalProcess(sig unix.Signal, pid int32) error {
   683  	log.Debugf("Signal process %d in container, cid: %s, signal: %v (%d)", pid, c.ID, sig, sig)
   684  	if err := c.requireStatus("signal a process inside", Running); err != nil {
   685  		return err
   686  	}
   687  	if !c.IsSandboxRunning() {
   688  		return fmt.Errorf("sandbox is not running")
   689  	}
   690  	return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false)
   691  }
   692  
   693  // ForwardSignals forwards all signals received by the current process to the
   694  // container process inside the sandbox. It returns a function that will stop
   695  // forwarding signals.
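        //
        // A minimal usage sketch (mirroring Run above):
        //
        //	stopForwarding := c.ForwardSignals(0, true /* fgProcess */)
        //	defer stopForwarding()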
   696  func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() {
   697  	log.Debugf("Forwarding all signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess)
   698  	stop := sighandling.StartSignalForwarding(func(sig linux.Signal) {
   699  		log.Debugf("Forwarding signal %d to container, cid: %s, PID: %d, fgProcess: %t", sig, c.ID, pid, fgProcess)
   700  		if err := c.Sandbox.SignalProcess(c.ID, pid, unix.Signal(sig), fgProcess); err != nil {
   701  			log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err)
   702  		}
   703  	})
   704  	return func() {
   705  		log.Debugf("Done forwarding signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess)
   706  		stop()
   707  	}
   708  }
   709  
   710  // Checkpoint sends the checkpoint call to the container.
   711  // The statefile will be written to f, the file at the specified image-path.
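        //
        // A sketch of the expected call pattern (error handling elided; the image
        // path is illustrative):
        //
        //	f, _ := os.Create(imagePath)
        //	err := c.Checkpoint(f, statefile.Options{})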
   712  func (c *Container) Checkpoint(f *os.File, options statefile.Options) error {
   713  	log.Debugf("Checkpoint container, cid: %s", c.ID)
   714  	if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil {
   715  		return err
   716  	}
   717  	return c.Sandbox.Checkpoint(c.ID, f, options)
   718  }
   719  
   720  // Pause suspends the container and its kernel.
   721  // The call only succeeds if the container's status is created or running.
   722  func (c *Container) Pause() error {
   723  	log.Debugf("Pausing container, cid: %s", c.ID)
   724  	if err := c.Saver.lock(BlockAcquire); err != nil {
   725  		return err
   726  	}
   727  	defer c.Saver.UnlockOrDie()
   728  
   729  	if c.Status != Created && c.Status != Running {
   730  		return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
   731  	}
   732  
   733  	if err := c.Sandbox.Pause(c.ID); err != nil {
   734  		return fmt.Errorf("pausing container %q: %v", c.ID, err)
   735  	}
   736  	c.changeStatus(Paused)
   737  	return c.saveLocked()
   738  }
   739  
   740  // Resume unpauses the container and its kernel.
   741  // The call only succeeds if the container's status is paused.
   742  func (c *Container) Resume() error {
   743  	log.Debugf("Resuming container, cid: %s", c.ID)
   744  	if err := c.Saver.lock(BlockAcquire); err != nil {
   745  		return err
   746  	}
   747  	defer c.Saver.UnlockOrDie()
   748  
   749  	if c.Status != Paused {
   750  		return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status)
   751  	}
   752  	if err := c.Sandbox.Resume(c.ID); err != nil {
   753  		return fmt.Errorf("resuming container: %v", err)
   754  	}
   755  	c.changeStatus(Running)
   756  	return c.saveLocked()
   757  }
   758  
   759  // State returns the metadata of the container.
   760  func (c *Container) State() specs.State {
   761  	return specs.State{
   762  		Version:     specs.Version,
   763  		ID:          c.ID,
   764  		Status:      c.Status,
   765  		Pid:         c.SandboxPid(),
   766  		Bundle:      c.BundleDir,
   767  		Annotations: c.Spec.Annotations,
   768  	}
   769  }
   770  
   771  // Processes retrieves the list of processes and associated metadata inside a
   772  // container.
   773  func (c *Container) Processes() ([]*control.Process, error) {
   774  	if err := c.requireStatus("get processes of", Running, Paused); err != nil {
   775  		return nil, err
   776  	}
   777  	return c.Sandbox.Processes(c.ID)
   778  }
   779  
   780  // Destroy stops all processes and frees all resources associated with the
   781  // container.
   782  func (c *Container) Destroy() error {
   783  	log.Debugf("Destroy container, cid: %s", c.ID)
   784  
   785  	if err := c.Saver.lock(BlockAcquire); err != nil {
   786  		return err
   787  	}
   788  	defer func() {
   789  		c.Saver.UnlockOrDie()
   790  		_ = c.Saver.close()
   791  	}()
   792  
   793  	// Stored for later use as stop() sets c.Sandbox to nil.
   794  	sb := c.Sandbox
   795  
   796  	// We must perform the following cleanup steps:
   797  	//	* stop the container and gofer processes,
   798  	//	* remove the container filesystem on the host, and
   799  	//	* delete the container metadata directory.
   800  	//
   801  	// It's possible for one or more of these steps to fail, but we should
   802  	// do our best to perform all of the cleanups. Hence, we keep a slice
   803  	// of errors and return their concatenation.
   804  	var errs []string
   805  	if err := c.stop(); err != nil {
   806  		err = fmt.Errorf("stopping container: %v", err)
   807  		log.Warningf("%v", err)
   808  		errs = append(errs, err.Error())
   809  	}
   810  
   811  	if err := c.Saver.Destroy(); err != nil {
   812  		err = fmt.Errorf("deleting container state files: %v", err)
   813  		log.Warningf("%v", err)
   814  		errs = append(errs, err.Error())
   815  	}
   816  
   817  	// Clean up overlay filestore files created in their respective mounts.
   818  	c.forEachSelfOverlay(func(mountSrc string) {
   819  		filestorePath := boot.SelfOverlayFilestorePath(mountSrc, c.sandboxID())
   820  		if err := os.Remove(filestorePath); err != nil {
   821  			err = fmt.Errorf("failed to delete filestore file %q: %v", filestorePath, err)
   822  			log.Warningf("%v", err)
   823  			errs = append(errs, err.Error())
   824  		}
   825  	})
   826  
   827  	c.changeStatus(Stopped)
   828  
   829  	// Adjust oom_score_adj for the sandbox. This must be done after the container
   830  	// is stopped and the directory at c.Root is removed.
   831  	//
   832  	// Use 'sb' to tell whether it has been executed before because Destroy must
   833  	// be idempotent.
   834  	if sb != nil {
   835  		if err := adjustSandboxOOMScoreAdj(sb, c.Spec, c.Saver.RootDir, true); err != nil {
   836  			errs = append(errs, err.Error())
   837  		}
   838  	}
   839  
   840  	// "If any poststop hook fails, the runtime MUST log a warning, but the
   841  	// remaining hooks and lifecycle continue as if the hook had
   842  	// succeeded" - OCI spec.
   843  	//
   844  	// Based on the OCI, "The post-stop hooks MUST be called after the container
   845  	// is deleted but before the delete operation returns".
   846  	// Run it here to:
   847  	// 1) Conform to the OCI.
   848  	// 2) Make sure it only runs once: because the root has been deleted, the
   849  	//    container can't be loaded again.
   850  	if c.Spec.Hooks != nil {
   851  		executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
   852  	}
   853  
   854  	if len(errs) == 0 {
   855  		return nil
   856  	}
   857  	return errors.New(strings.Join(errs, "\n"))
   858  }
   859  
   860  func (c *Container) sandboxID() string {
   861  	return c.Saver.ID.SandboxID
   862  }
   863  
   864  func (c *Container) forEachSelfOverlay(fn func(mountSrc string)) {
   865  	if c.OverlayMediums == nil {
   866  		// Subcontainer not started? Skip.
   867  		return
   868  	}
   869  	if c.OverlayMediums[0] == boot.SelfMedium {
   870  		fn(c.Spec.Root.Path)
   871  	}
   872  	goferMntIdx := 1 // First index is for rootfs.
   873  	for i := range c.Spec.Mounts {
   874  		if !specutils.IsGoferMount(c.Spec.Mounts[i]) {
   875  			continue
   876  		}
   877  		if c.OverlayMediums[goferMntIdx] == boot.SelfMedium {
   878  			fn(c.Spec.Mounts[i].Source)
   879  		}
   880  		goferMntIdx++
   881  	}
   882  }
   883  
   884  // createOverlayFilestores creates the regular files that will back the tmpfs
   885  // upper mount for overlay mounts. It also returns information about the
   886  // overlay medium used for each bind mount.
   887  func (c *Container) createOverlayFilestores(conf config.Overlay2, mountHints *boot.PodMountHints) ([]*os.File, []boot.OverlayMedium, error) {
   888  	var filestoreFiles []*os.File
   889  	var overlayMediums []boot.OverlayMedium
   890  
   891  	// Handle root mount first.
   892  	shouldOverlay := conf.RootEnabled() && !c.Spec.Root.Readonly
   893  	filestore, medium, err := c.createOverlayFilestore(conf, c.Spec.Root.Path, shouldOverlay, nil /* hint */)
   894  	if err != nil {
   895  		return nil, nil, err
   896  	}
   897  	if filestore != nil {
   898  		filestoreFiles = append(filestoreFiles, filestore)
   899  	}
   900  	overlayMediums = append(overlayMediums, medium)
   901  
   902  	// Handle bind mounts.
   903  	for i := range c.Spec.Mounts {
   904  		if !specutils.IsGoferMount(c.Spec.Mounts[i]) {
   905  			continue
   906  		}
   907  		hint := mountHints.FindMount(&c.Spec.Mounts[i])
   908  		shouldOverlay := conf.SubMountEnabled() && !specutils.IsReadonlyMount(c.Spec.Mounts[i].Options)
   909  		filestore, medium, err := c.createOverlayFilestore(conf, c.Spec.Mounts[i].Source, shouldOverlay, hint)
   910  		if err != nil {
   911  			return nil, nil, err
   912  		}
   913  		if filestore != nil {
   914  			filestoreFiles = append(filestoreFiles, filestore)
   915  		}
   916  		overlayMediums = append(overlayMediums, medium)
   917  	}
   918  	for _, filestore := range filestoreFiles {
   919  		// Perform this workaround outside the sandbox. The sandbox may already be
   920  		// running with seccomp filters that do not allow this.
   921  		pgalloc.IMAWorkAroundForMemFile(filestore.Fd())
   922  	}
   923  	return filestoreFiles, overlayMediums, nil
   924  }
   925  
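        // createOverlayFilestore decides which overlay medium backs a single mount
        // and, for the self and anonymous-directory media, creates the backing
        // filestore file. A mount hint that requests an overlay takes precedence
        // over shouldOverlay.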
   926  func (c *Container) createOverlayFilestore(conf config.Overlay2, mountSrc string, shouldOverlay bool, hint *boot.MountHint) (*os.File, boot.OverlayMedium, error) {
   927  	if hint != nil && hint.ShouldOverlay() {
   928  		// MountHint information takes precedence over shouldOverlay.
   929  		return c.createOverlayFilestoreInSelf(mountSrc)
   930  	}
   931  	switch {
   932  	case !shouldOverlay:
   933  		return nil, boot.NoOverlay, nil
   934  	case conf.IsBackedByMemory():
   935  		return nil, boot.MemoryMedium, nil
   936  	case conf.IsBackedBySelf():
   937  		return c.createOverlayFilestoreInSelf(mountSrc)
   938  	default:
   939  		return c.createOverlayFilestoreInDir(conf)
   940  	}
   941  }
   942  
   943  func (c *Container) createOverlayFilestoreInSelf(mountSrc string) (*os.File, boot.OverlayMedium, error) {
   944  	mountSrcInfo, err := os.Stat(mountSrc)
   945  	if err != nil {
   946  		return nil, boot.NoOverlay, fmt.Errorf("failed to stat mount %q to see whether it is a directory: %v", mountSrc, err)
   947  	}
   948  	if !mountSrcInfo.IsDir() {
   949  		log.Warningf("overlay2 self medium is only supported for directory mounts, but mount %q is not a directory, falling back to memory", mountSrc)
   950  		return nil, boot.MemoryMedium, nil
   951  	}
   952  	// Create the self overlay filestore file.
   953  	filestorePath := boot.SelfOverlayFilestorePath(mountSrc, c.sandboxID())
   954  	filestoreFD, err := unix.Open(filestorePath, unix.O_RDWR|unix.O_CREAT|unix.O_EXCL|unix.O_CLOEXEC, 0666)
   955  	if err != nil {
   956  		if err == unix.EEXIST {
   957  			// Note that if the same submount is mounted multiple times within the
   958  			// same sandbox, then the overlay option doesn't work correctly, because
   959  			// each overlay mount is independent and changes to one are not
   960  			// visible to the other. Given "overlay on repeated submounts" is
   961  			// already broken, we don't support such a scenario with the self
   962  			// medium. The filestore file will already exist for such a case.
   963  			return nil, boot.NoOverlay, fmt.Errorf("%q mount source already has a filestore file at %q; repeated submounts are not supported with self medium", mountSrc, filestorePath)
   964  		}
   965  		return nil, boot.NoOverlay, fmt.Errorf("failed to create filestore file inside %q: %v", mountSrc, err)
   966  	}
   967  	log.Debugf("Created overlay filestore file at %q for mount source %q", filestorePath, mountSrc)
   968  	// Filestore in self should be a named path because it needs to be
   969  	// discoverable via path traversal so that k8s can scan the filesystem
   970  	// and apply any limits appropriately (like local ephemeral storage
   971  	// limits). So don't delete it. These files will be unlinked when the
   972  	// container is destroyed. This makes self medium appropriate for k8s.
   973  	return os.NewFile(uintptr(filestoreFD), filestorePath), boot.SelfMedium, nil
   974  }
   975  
   976  func (c *Container) createOverlayFilestoreInDir(conf config.Overlay2) (*os.File, boot.OverlayMedium, error) {
   977  	filestoreDir := conf.HostFileDir()
   978  	fileInfo, err := os.Stat(filestoreDir)
   979  	if err != nil {
   980  		return nil, boot.NoOverlay, fmt.Errorf("failed to stat overlay filestore directory %q: %v", filestoreDir, err)
   981  	}
   982  	if !fileInfo.IsDir() {
   983  		return nil, boot.NoOverlay, fmt.Errorf("overlay2 flag should specify an existing directory")
   984  	}
   985  	// Create an unnamed temporary file in filestore directory which will be
   986  	// deleted when the last FD on it is closed. We don't use O_TMPFILE because
   987  	// it is not supported on all filesystems. So we simulate it by creating a
   988  	// named file and then immediately unlinking it while keeping an FD on it.
   989  	// This file will be deleted when the container exits.
   990  	filestoreFile, err := os.CreateTemp(filestoreDir, "runsc-overlay-filestore-")
   991  	if err != nil {
   992  		return nil, boot.NoOverlay, fmt.Errorf("failed to create a temporary file inside %q: %v", filestoreDir, err)
   993  	}
   994  	if err := unix.Unlink(filestoreFile.Name()); err != nil {
   995  		return nil, boot.NoOverlay, fmt.Errorf("failed to unlink temporary file %q: %v", filestoreFile.Name(), err)
   996  	}
   997  	log.Debugf("Created an unnamed overlay filestore file at %q", filestoreDir)
   998  	return filestoreFile, boot.AnonDirMedium, nil
   999  }
  1000  
  1001  // saveLocked saves the container metadata to a file.
  1002  //
  1003  // Precondition: container must be locked with container.lock().
  1004  func (c *Container) saveLocked() error {
  1005  	log.Debugf("Save container, cid: %s", c.ID)
  1006  	if err := c.Saver.SaveLocked(c); err != nil {
  1007  		return fmt.Errorf("saving container metadata: %v", err)
  1008  	}
  1009  	return nil
  1010  }
  1011  
  1012  // stop stops the container (for regular containers) or the sandbox (for
  1013  // root containers), and waits for the container or sandbox and the gofer
  1014  // to stop. If any of them doesn't stop before timeout, an error is returned.
  1015  func (c *Container) stop() error {
  1016  	var parentCgroup cgroup.Cgroup
  1017  
  1018  	if c.Sandbox != nil {
  1019  		log.Debugf("Destroying container, cid: %s", c.ID)
  1020  		if err := c.Sandbox.DestroyContainer(c.ID); err != nil {
  1021  			return fmt.Errorf("destroying container %q: %v", c.ID, err)
  1022  		}
  1023  		// Only uninstall parentCgroup for sandbox stop.
  1024  		if c.Sandbox.IsRootContainer(c.ID) {
  1025  			parentCgroup = c.Sandbox.CgroupJSON.Cgroup
  1026  		}
  1027  		// Only set sandbox to nil after it has been told to destroy the container.
  1028  		c.Sandbox = nil
  1029  	}
  1030  
  1031  	// Try killing gofer if it does not exit with container.
  1032  	if c.GoferPid != 0 {
  1033  		log.Debugf("Killing gofer for container, cid: %s, PID: %d", c.ID, c.GoferPid)
  1034  		if err := unix.Kill(c.GoferPid, unix.SIGKILL); err != nil {
  1035  			// The gofer may already be stopped, log the error.
  1036  			log.Warningf("Error sending signal %d to gofer %d: %v", unix.SIGKILL, c.GoferPid, err)
  1037  		}
  1038  	}
  1039  
  1040  	if err := c.waitForStopped(); err != nil {
  1041  		return err
  1042  	}
  1043  
  1044  	// Delete container cgroup if any.
  1045  	if c.CompatCgroup.Cgroup != nil {
  1046  		if err := c.CompatCgroup.Cgroup.Uninstall(); err != nil {
  1047  			return err
  1048  		}
  1049  	}
  1050  	// Gofer is running inside parentCgroup, so Cgroup.Uninstall has to be called
  1051  	// after the gofer has stopped.
  1052  	if parentCgroup != nil {
  1053  		if err := parentCgroup.Uninstall(); err != nil {
  1054  			return err
  1055  		}
  1056  	}
  1057  	return nil
  1058  }
  1059  
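        // waitForStopped checks that the container is no longer running (when the
        // sandbox is still up) and then waits for the gofer to exit, either by
        // reaping it directly or by polling it for up to 5 seconds.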
  1060  func (c *Container) waitForStopped() error {
  1061  	if c.GoferPid == 0 {
  1062  		return nil
  1063  	}
  1064  
  1065  	if c.IsSandboxRunning() {
  1066  		if err := c.SignalContainer(unix.Signal(0), false); err == nil {
  1067  			return fmt.Errorf("container is still running")
  1068  		}
  1069  	}
  1070  
  1071  	if c.goferIsChild {
  1072  		// The gofer process is a child of the current process,
  1073  		// so we can wait on it and collect its zombie.
  1074  		if _, err := unix.Wait4(int(c.GoferPid), nil, 0, nil); err != nil {
  1075  			return fmt.Errorf("error waiting on the gofer process: %v", err)
  1076  		}
  1077  		c.GoferPid = 0
  1078  		return nil
  1079  	}
  1080  
  1081  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  1082  	defer cancel()
  1083  	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
  1084  	op := func() error {
  1085  		if err := unix.Kill(c.GoferPid, 0); err == nil {
  1086  			return fmt.Errorf("gofer is still running")
  1087  		}
  1088  		c.GoferPid = 0
  1089  		return nil
  1090  	}
  1091  	return backoff.Retry(op, b)
  1092  }
  1093  
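        // createGoferProcess spawns a child "runsc gofer" process that serves the
        // container's filesystem to the sandbox. It returns the sandbox ends of the
        // I/O socket pairs (one for the rootfs plus one per gofer bind mount, in
        // spec order) and the read end of the pipe on which the gofer reports the
        // resolved mount list.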
  1094  func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) {
  1095  	donations := donation.Agency{}
  1096  	defer donations.Close()
  1097  
  1098  	if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
  1099  		return nil, nil, err
  1100  	}
  1101  	if conf.DebugLog != "" {
  1102  		test := ""
  1103  		if len(conf.TestOnlyTestNameEnv) != 0 {
  1104  			// Fetch test name if one is provided and the test only flag was set.
  1105  			if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
  1106  				test = t
  1107  			}
  1108  		}
  1109  		if specutils.IsDebugCommand(conf, "gofer") {
  1110  			if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "gofer", test); err != nil {
  1111  				return nil, nil, err
  1112  			}
  1113  		}
  1114  	}
  1115  
  1116  	// Start with the general config flags.
  1117  	cmd := exec.Command(specutils.ExePath, conf.ToFlags()...)
  1118  	cmd.SysProcAttr = &unix.SysProcAttr{
  1119  		// Detach from session. Otherwise, signals sent to the foreground process
  1120  		// will also be forwarded by this process, resulting in duplicate signals.
  1121  		Setsid: true,
  1122  	}
  1123  
  1124  	// Set Args[0] to make it easier to spot the gofer process. Otherwise it's
  1125  	// shown as `exe`.
  1126  	cmd.Args[0] = "runsc-gofer"
  1127  
  1128  	// Transfer FDs that need to be present before the "gofer" command.
  1129  	// Start at 3 because 0, 1, and 2 are taken by stdin/out/err.
  1130  	nextFD := donations.Transfer(cmd, 3)
  1131  
  1132  	cmd.Args = append(cmd.Args, "gofer", "--bundle", bundleDir)
  1133  	cmd.Args = append(cmd.Args, "--overlay-mediums="+c.OverlayMediums.String())
  1134  
  1135  	// Open the spec file to donate to the sandbox.
  1136  	specFile, err := specutils.OpenSpec(bundleDir)
  1137  	if err != nil {
  1138  		return nil, nil, fmt.Errorf("opening spec file: %v", err)
  1139  	}
  1140  	donations.DonateAndClose("spec-fd", specFile)
  1141  
  1142  	// Donate any profile FDs to the gofer.
  1143  	if err := c.donateGoferProfileFDs(conf, &donations); err != nil {
  1144  		return nil, nil, fmt.Errorf("donating gofer profile fds: %w", err)
  1145  	}
  1146  
  1147  	// Create pipe that allows gofer to send mount list to sandbox after all paths
  1148  	// have been resolved.
  1149  	mountsSand, mountsGofer, err := os.Pipe()
  1150  	if err != nil {
  1151  		return nil, nil, err
  1152  	}
  1153  	donations.DonateAndClose("mounts-fd", mountsGofer)
  1154  
  1155  	// Add root mount and then add any other additional mounts.
  1156  	mountCount := 1
  1157  	for _, m := range spec.Mounts {
  1158  		if specutils.IsGoferMount(m) {
  1159  			mountCount++
  1160  		}
  1161  	}
  1162  
  1163  	sandEnds := make([]*os.File, 0, mountCount)
  1164  	for i := 0; i < mountCount; i++ {
  1165  		fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
  1166  		if err != nil {
  1167  			return nil, nil, err
  1168  		}
  1169  		sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD"))
  1170  
  1171  		goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD")
  1172  		donations.DonateAndClose("io-fds", goferEnd)
  1173  	}
  1174  
  1175  	if attached {
  1176  		// The gofer is attached to the lifetime of this process, so it
  1177  		// should synchronously die when this process dies.
  1178  		cmd.SysProcAttr.Pdeathsig = unix.SIGKILL
  1179  	}
  1180  
  1181  	// Enter new namespaces to isolate from the rest of the system. Don't unshare
  1182  	// cgroup because the gofer is added to a cgroup in the caller's namespace.
  1183  	nss := []specs.LinuxNamespace{
  1184  		{Type: specs.IPCNamespace},
  1185  		{Type: specs.MountNamespace},
  1186  		{Type: specs.NetworkNamespace},
  1187  		{Type: specs.PIDNamespace},
  1188  		{Type: specs.UTSNamespace},
  1189  	}
  1190  
  1191  	rootlessEUID := unix.Geteuid() != 0
  1192  	// Set up any uid/gid mappings, and create or join the configured user
  1193  	// namespace so the gofer's view of the filesystem aligns with the
  1194  	// users in the sandbox.
  1195  	if !rootlessEUID {
  1196  		if userNS, ok := specutils.GetNS(specs.UserNamespace, spec); ok {
  1197  			nss = append(nss, userNS)
  1198  			specutils.SetUIDGIDMappings(cmd, spec)
  1199  			// We need to set UID and GID to have capabilities in a new user namespace.
  1200  			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
  1201  		}
  1202  	} else {
  1203  		userNS, ok := specutils.GetNS(specs.UserNamespace, spec)
  1204  		if !ok {
  1205  			return nil, nil, fmt.Errorf("unable to run a rootless container without userns")
  1206  		}
  1207  		nss = append(nss, userNS)
  1208  		syncFile, err := sandbox.ConfigureCmdForRootless(cmd, &donations)
  1209  		if err != nil {
  1210  			return nil, nil, err
  1211  		}
  1212  		defer syncFile.Close()
  1213  	}
  1214  
  1215  	nvProxySetup, err := nvproxySetupAfterGoferUserns(spec, conf, cmd, &donations)
  1216  	if err != nil {
  1217  		return nil, nil, fmt.Errorf("setting up nvproxy for gofer: %w", err)
  1218  	}
  1219  
  1220  	donations.Transfer(cmd, nextFD)
  1221  
  1222  	// Start the gofer in the given namespace.
  1223  	donation.LogDonations(cmd)
  1224  	log.Debugf("Starting gofer: %s %v", cmd.Path, cmd.Args)
  1225  	if err := specutils.StartInNS(cmd, nss); err != nil {
  1226  		return nil, nil, fmt.Errorf("gofer: %v", err)
  1227  	}
  1228  	log.Infof("Gofer started, PID: %d", cmd.Process.Pid)
  1229  	c.GoferPid = cmd.Process.Pid
  1230  	c.goferIsChild = true
  1231  
  1232  	// Set up and synchronize rootless mode userns mappings.
  1233  	if rootlessEUID {
  1234  		if err := sandbox.SetUserMappings(spec, cmd.Process.Pid); err != nil {
  1235  			return nil, nil, err
  1236  		}
  1237  	}
  1238  
  1239  	// Set up nvproxy within the Gofer namespace.
  1240  	if err := nvProxySetup(); err != nil {
  1241  		return nil, nil, fmt.Errorf("nvproxy setup: %w", err)
  1242  	}
  1243  
  1244  	return sandEnds, mountsSand, nil
  1245  }
  1246  
  1247  // changeStatus transitions from one status to another ensuring that the
  1248  // transition is valid.
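        //
        // Valid transitions, as enforced below:
        //
        //	Creating -> Created, Stopped
        //	Created  -> Running, Stopped
        //	Running  -> Paused, Stopped
        //	Paused   -> Running
        //	Stopped  -> Stopped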
  1249  func (c *Container) changeStatus(s Status) {
  1250  	switch s {
  1251  	case Creating:
  1252  		// Initial state, never transitions to it.
  1253  		panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
  1254  
  1255  	case Created:
  1256  		if c.Status != Creating {
  1257  			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
  1258  		}
  1259  		if c.Sandbox == nil {
  1260  			panic("sandbox cannot be nil")
  1261  		}
  1262  
  1263  	case Paused:
  1264  		if c.Status != Running {
  1265  			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
  1266  		}
  1267  		if c.Sandbox == nil {
  1268  			panic("sandbox cannot be nil")
  1269  		}
  1270  
  1271  	case Running:
  1272  		if c.Status != Created && c.Status != Paused {
  1273  			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
  1274  		}
  1275  		if c.Sandbox == nil {
  1276  			panic("sandbox cannot be nil")
  1277  		}
  1278  
  1279  	case Stopped:
  1280  		if c.Status != Creating && c.Status != Created && c.Status != Running && c.Status != Stopped {
  1281  			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
  1282  		}
  1283  
  1284  	default:
  1285  		panic(fmt.Sprintf("invalid new state: %v", s))
  1286  	}
  1287  	c.Status = s
  1288  }
  1289  
  1290  // IsSandboxRunning returns true if the sandbox exists and is running.
  1291  func (c *Container) IsSandboxRunning() bool {
  1292  	return c.Sandbox != nil && c.Sandbox.IsRunning()
  1293  }
  1294  
  1295  // HasCapabilityInAnySet returns true if the given capability is in any of the
  1296  // capability sets of the container process.
  1297  func (c *Container) HasCapabilityInAnySet(capability linux.Capability) bool {
  1298  	capString := capability.String()
  1299  	for _, set := range [5][]string{
  1300  		c.Spec.Process.Capabilities.Bounding,
  1301  		c.Spec.Process.Capabilities.Effective,
  1302  		c.Spec.Process.Capabilities.Inheritable,
  1303  		c.Spec.Process.Capabilities.Permitted,
  1304  		c.Spec.Process.Capabilities.Ambient,
  1305  	} {
  1306  		for _, c := range set {
  1307  			if c == capString {
  1308  				return true
  1309  			}
  1310  		}
  1311  	}
  1312  	return false
  1313  }
  1314  
  1315  // RunsAsUID0 returns true if the container process runs with UID 0 (root).
  1316  func (c *Container) RunsAsUID0() bool {
  1317  	return c.Spec.Process.User.UID == 0
  1318  }
  1319  
  1320  func (c *Container) requireStatus(action string, statuses ...Status) error {
  1321  	for _, s := range statuses {
  1322  		if c.Status == s {
  1323  			return nil
  1324  		}
  1325  	}
  1326  	return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
  1327  }
  1328  
  1329  // IsSandboxRoot returns true if this container is its sandbox's root container.
  1330  func (c *Container) IsSandboxRoot() bool {
  1331  	return isRoot(c.Spec)
  1332  }
  1333  
  1334  func isRoot(spec *specs.Spec) bool {
  1335  	return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer
  1336  }
  1337  
  1338  // runInCgroup executes fn inside the specified cgroup. If cg is nil, execute
  1339  // it in the current context.
  1340  func runInCgroup(cg cgroup.Cgroup, fn func() error) error {
  1341  	if cg == nil {
  1342  		return fn()
  1343  	}
  1344  	restore, err := cg.Join()
  1345  	if err != nil {
  1346  		return err
  1347  	}
  1348  	defer restore()
  1349  	return fn()
  1350  }
  1351  
  1352  // adjustGoferOOMScoreAdj sets the oom_score_adj for the container's gofer.
  1353  func (c *Container) adjustGoferOOMScoreAdj() error {
  1354  	if c.GoferPid == 0 || c.Spec.Process.OOMScoreAdj == nil {
  1355  		return nil
  1356  	}
  1357  	return setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj)
  1358  }
  1359  
  1360  // adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox.
  1361  // oom_score_adj is set to the lowest oom_score_adj among the containers
  1362  // running in the sandbox.
  1363  //
  1364  // TODO(gvisor.dev/issue/238): This call could race with other containers being
  1365  // created at the same time and end up setting the wrong oom_score_adj to the
  1366  // sandbox. Use rpc client to synchronize.
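        //
        // For example, a sandbox whose (non-pause) containers have oom_score_adj
        // values of 100 and 500 ends up with the sandbox process's oom_score_adj
        // set to 100.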
  1367  func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, spec *specs.Spec, rootDir string, destroy bool) error {
  1368  	// Adjustment can be skipped if the root container is exiting, because it
  1369  	// brings down the entire sandbox.
  1370  	if isRoot(spec) && destroy {
  1371  		return nil
  1372  	}
  1373  
  1374  	containers, err := LoadSandbox(rootDir, s.ID, LoadOpts{})
  1375  	if err != nil {
  1376  		return fmt.Errorf("loading sandbox containers: %v", err)
  1377  	}
  1378  
  1379  	// Do nothing if the sandbox has been terminated.
  1380  	if len(containers) == 0 {
  1381  		return nil
  1382  	}
  1383  
  1384  	// Get the lowest score for all containers.
  1385  	var lowScore int
  1386  	scoreFound := false
  1387  	for _, container := range containers {
  1388  		// Special multi-container support for CRI. Ignore the root container when
  1389  		// calculating oom_score_adj for the sandbox because it is the
  1390  		// infrastructure (pause) container and always has a very low oom_score_adj.
  1391  		//
  1392  		// We will use OOMScoreAdj in the single-container case where the
  1393  		// containerd container-type annotation is not present.
  1394  		if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox {
  1395  			continue
  1396  		}
  1397  
  1398  		if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
  1399  			scoreFound = true
  1400  			lowScore = *container.Spec.Process.OOMScoreAdj
  1401  		}
  1402  	}
  1403  
  1404  	// If the container is being destroyed and the remaining containers have no
  1405  	// oom_score_adj specified, then we must revert to the original oom_score_adj
  1406  	// saved with the root container.
  1407  	if !scoreFound && destroy {
  1408  		lowScore = containers[0].Sandbox.OriginalOOMScoreAdj
  1409  		scoreFound = true
  1410  	}
  1411  
  1412  	// Only set oom_score_adj if one of the containers has oom_score_adj set. If
  1413  	// not, oom_score_adj is inherited from the parent process.
  1414  	//
  1415  	// See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process
  1416  	if !scoreFound {
  1417  		return nil
  1418  	}
  1419  
  1420  	// Set the sandbox's oom_score_adj to the lowest value among all containers.
  1421  	return setOOMScoreAdj(s.Getpid(), lowScore)
  1422  }
  1423  
  1424  // setOOMScoreAdj sets oom_score_adj to the given value for the given PID.
  1425  // /proc must be available and mounted read-write. scoreAdj should be between
  1426  // -1000 and 1000. It's a noop if the process has already exited.
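        //
        // This is roughly equivalent to running
        // `echo <scoreAdj> > /proc/<pid>/oom_score_adj` on the host.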
  1427  func setOOMScoreAdj(pid int, scoreAdj int) error {
  1428  	f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644)
  1429  	if err != nil {
  1430  		// Ignore NotExist errors because they can race with process exit.
  1431  		if os.IsNotExist(err) {
  1432  			log.Warningf("Process (%d) not found while setting oom_score_adj", pid)
  1433  			return nil
  1434  		}
  1435  		return err
  1436  	}
  1437  	defer f.Close()
  1438  	if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil {
  1439  		if errors.Is(err, unix.ESRCH) {
  1440  			log.Warningf("Process (%d) exited while setting oom_score_adj", pid)
  1441  			return nil
  1442  		}
  1443  		return fmt.Errorf("setting oom_score_adj to %d: %v", scoreAdj, err)
  1444  	}
  1445  	return nil
  1446  }
  1447  
  1448  // populateStats populates event with stats estimates based on cgroups and the
  1449  // sentry's accounting.
  1450  // TODO(gvisor.dev/issue/172): This is an estimation; we should do more
  1451  // detailed accounting.
  1452  func (c *Container) populateStats(event *boot.EventOut) {
  1453  	// The events command, when run for all running containers, should
  1454  	// account for the full cgroup CPU usage. We split cgroup usage
  1455  	// proportionally according to the sentry-internal usage measurements,
  1456  	// only counting Running containers.
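        	//
        	// As an illustrative example: if the sentry reports 2s of usage for
        	// this container out of 4s across all containers, and the host cgroup
        	// reports 6s in total, this container is charged 2 * (6/4) = 3s.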
  1457  	log.Debugf("event.ContainerUsage: %v", event.ContainerUsage)
  1458  	var containerUsage uint64
  1459  	var allContainersUsage uint64
  1460  	for ID, usage := range event.ContainerUsage {
  1461  		allContainersUsage += usage
  1462  		if ID == c.ID {
  1463  			containerUsage = usage
  1464  		}
  1465  	}
  1466  
  1467  	cgroup, err := c.Sandbox.NewCGroup()
  1468  	if err != nil {
  1469  		// No cgroup, so rely purely on the sentry's accounting.
  1470  		log.Warningf("events: no cgroups")
  1471  		event.Event.Data.CPU.Usage.Total = containerUsage
  1472  		return
  1473  	}
  1474  
  1475  	// Get the host cgroup CPU usage.
  1476  	cgroupsUsage, err := cgroup.CPUUsage()
  1477  	if err != nil {
  1478  		// No cgroup usage, so rely purely on the sentry's accounting.
  1479  		log.Warningf("events: failed when getting cgroup CPU usage for container: %v", err)
  1480  		event.Event.Data.CPU.Usage.Total = containerUsage
  1481  		return
  1482  	}
  1483  
  1484  	// If the sentry reports no CPU usage, fall back on cgroups and split usage
  1485  	// equally across containers.
  1486  	if allContainersUsage == 0 {
  1487  		log.Warningf("events: no sentry CPU usage reported")
  1488  		allContainersUsage = cgroupsUsage
  1489  		containerUsage = cgroupsUsage / uint64(len(event.ContainerUsage))
  1490  	}
  1491  
  1492  	// Scaling can easily overflow a uint64 (e.g. a containerUsage and
  1493  	// cgroupsUsage of 16 seconds each will overflow), so use floats.
  1494  	total := float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage))
  1495  	log.Debugf("Usage, container: %d, cgroups: %d, all: %d, total: %.0f", containerUsage, cgroupsUsage, allContainersUsage, total)
  1496  	event.Event.Data.CPU.Usage.Total = uint64(total)
  1498  }
  1499  
  1500  // setupCgroupForRoot configures and returns the cgroups for the sandbox and
  1501  // the root container. If `cgroupParentAnnotation` is set, that path is used as
  1502  // the sandbox cgroup and Spec.Linux.CgroupsPath as the root container cgroup.
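        //
        // For example (hypothetical paths): with the annotation set to
        // "/kubepods/pod-foo" and Spec.Linux.CgroupsPath set to
        // "/kubepods/pod-foo/containers/bar", the former is used as the sandbox's
        // cgroup while the latter is created empty for the root container.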
  1503  func (c *Container) setupCgroupForRoot(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, cgroup.Cgroup, error) {
  1504  	var parentCgroup cgroup.Cgroup
  1505  	if parentPath, ok := spec.Annotations[cgroupParentAnnotation]; ok {
  1506  		var err error
  1507  		parentCgroup, err = cgroup.NewFromPath(parentPath, conf.SystemdCgroup)
  1508  		if err != nil {
  1509  			return nil, nil, err
  1510  		}
  1511  	} else {
  1512  		var err error
  1513  		parentCgroup, err = cgroup.NewFromSpec(spec, conf.SystemdCgroup)
  1514  		if parentCgroup == nil || err != nil {
  1515  			return nil, nil, err
  1516  		}
  1517  	}
  1518  
  1519  	var err error
  1520  	parentCgroup, err = cgroupInstall(conf, parentCgroup, spec.Linux.Resources)
  1521  	if parentCgroup == nil || err != nil {
  1522  		return nil, nil, err
  1523  	}
  1524  
  1525  	subCgroup, err := c.setupCgroupForSubcontainer(conf, spec)
  1526  	if err != nil {
  1527  		_ = parentCgroup.Uninstall()
  1528  		return nil, nil, err
  1529  	}
  1530  	return parentCgroup, subCgroup, nil
  1531  }
  1532  
  1533  // setupCgroupForSubcontainer sets up empty cgroups for subcontainers. Since
  1534  // subcontainers run exclusively inside the sandbox, subcontainer cgroups on the
  1535  // host have no effect on them. However, some tools (e.g. cAdvisor) use cgroup
  1536  // paths to discover new containers and report stats for them.
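        //
        // For example (hypothetical path): a subcontainer whose
        // Spec.Linux.CgroupsPath is "/kubepods/pod-foo/containers/baz" gets that
        // directory created on the host with no resources attached, purely so
        // such tools can discover it.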
  1537  func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, error) {
  1538  	if isRoot(spec) {
  1539  		if _, ok := spec.Annotations[cgroupParentAnnotation]; !ok {
  1540  			return nil, nil
  1541  		}
  1542  	}
  1543  
  1544  	cg, err := cgroup.NewFromSpec(spec, conf.SystemdCgroup)
  1545  	if cg == nil || err != nil {
  1546  		return nil, err
  1547  	}
  1548  	// Use empty resources, just want the directory structure created.
  1549  	return cgroupInstall(conf, cg, &specs.LinuxResources{})
  1550  }
  1551  
  1552  // donateGoferProfileFDs will open profile files and donate their FDs to the
  1553  // gofer.
  1554  func (c *Container) donateGoferProfileFDs(conf *config.Config, donations *donation.Agency) error {
  1555  	// The gofer profile files are named based on the provided flag, but
  1556  	// suffixed with "gofer" and the container ID to avoid collisions with
  1557  	// sentry profile files or profile files from other gofers.
  1558  	//
  1559  	// TODO(b/243183772): Merge gofer profile data with sentry profile data
  1560  	// into a single file.
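        	//
        	// For example (hypothetical flag value): with the CPU profile flag set
        	// to "/tmp/cpu.prof" and container ID "abc", the gofer's CPU profile is
        	// written to "/tmp/cpu.prof.gofer.abc".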
  1561  	profSuffix := ".gofer." + c.ID
  1562  	const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC
  1563  	if conf.ProfileBlock != "" {
  1564  		if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock+profSuffix, profFlags); err != nil {
  1565  			return err
  1566  		}
  1567  	}
  1568  	if conf.ProfileCPU != "" {
  1569  		if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU+profSuffix, profFlags); err != nil {
  1570  			return err
  1571  		}
  1572  	}
  1573  	if conf.ProfileHeap != "" {
  1574  		if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap+profSuffix, profFlags); err != nil {
  1575  			return err
  1576  		}
  1577  	}
  1578  	if conf.ProfileMutex != "" {
  1579  		if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex+profSuffix, profFlags); err != nil {
  1580  			return err
  1581  		}
  1582  	}
  1583  	if conf.TraceFile != "" {
  1584  		if err := donations.OpenAndDonate("trace-fd", conf.TraceFile+profSuffix, profFlags); err != nil {
  1585  			return err
  1586  		}
  1587  	}
  1588  	return nil
  1589  }
  1590  
  1591  // cgroupInstall creates the cgroup directory structure and sets its
  1592  // resources. On success, it returns the cgroup instance and a nil error.
  1593  // When running rootless, cgroup operations may fail; in that case the error is
  1594  // suppressed and a nil cgroup instance is returned to indicate that no cgroup
  1595  // was configured.
  1596  func cgroupInstall(conf *config.Config, cg cgroup.Cgroup, res *specs.LinuxResources) (cgroup.Cgroup, error) {
  1597  	if err := cg.Install(res); err != nil {
  1598  		switch {
  1599  		case (errors.Is(err, unix.EACCES) || errors.Is(err, unix.EROFS)) && conf.Rootless:
  1600  			log.Warningf("Skipping cgroup configuration in rootless mode: %v", err)
  1601  			return nil, nil
  1602  		default:
  1603  			return nil, fmt.Errorf("configuring cgroup: %v", err)
  1604  		}
  1605  	}
  1606  	return cg, nil
  1607  }
  1608  
  1609  func modifySpecForDirectfs(conf *config.Config, spec *specs.Spec) error {
  1610  	if !conf.DirectFS || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
  1611  		return nil
  1612  	}
  1613  	if conf.Network == config.NetworkHost {
  1614  		// Hostnet feature requires the sandbox to run in the current user
  1615  		// namespace, in which the network namespace is configured.
  1616  		return nil
  1617  	}
  1618  	if _, ok := specutils.GetNS(specs.UserNamespace, spec); ok {
  1619  		// If the spec already defines a userns, use that.
  1620  		return nil
  1621  	}
  1622  	if spec.Linux == nil {
  1623  		spec.Linux = &specs.Linux{}
  1624  	}
  1625  	if len(spec.Linux.UIDMappings) > 0 || len(spec.Linux.GIDMappings) > 0 {
  1626  		// The spec can only define UID/GID mappings with a userns (checked above).
  1627  		return fmt.Errorf("spec defines UID/GID mappings without defining userns")
  1628  	}
  1629  	// Run the sandbox in a new user namespace with identity UID/GID mappings.
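        	// For example, if /proc/self/uid_map contains the single line
        	// "0 0 4294967295", the spec gains
        	// UIDMappings = [{ContainerID: 0, HostID: 0, Size: 4294967295}]
        	// (and similarly for GIDMappings from gid_map).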
  1630  	log.Debugf("Configuring container with a new userns with identity user mappings into current userns")
  1631  	spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{Type: specs.UserNamespace})
  1632  	uidMappings, err := getIdentityMapping("uid_map")
  1633  	if err != nil {
  1634  		return err
  1635  	}
  1636  	spec.Linux.UIDMappings = uidMappings
  1637  	logIDMappings(uidMappings, "UID")
  1638  	gidMappings, err := getIdentityMapping("gid_map")
  1639  	if err != nil {
  1640  		return err
  1641  	}
  1642  	spec.Linux.GIDMappings = gidMappings
  1643  	logIDMappings(gidMappings, "GID")
  1644  	return nil
  1645  }
  1646  
  1647  func getIdentityMapping(mapFileName string) ([]specs.LinuxIDMapping, error) {
  1648  	// See user_namespaces(7) to understand how the /proc/self/{uid,gid}_map
  1649  	// files are organized.
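        	//
        	// For example, for a process in the initial user namespace the file
        	// typically contains the single line "0 0 4294967295", which this
        	// function turns into one identity mapping covering the full ID range.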
  1650  	mapFile := path.Join("/proc/self", mapFileName)
  1651  	file, err := os.Open(mapFile)
  1652  	if err != nil {
  1653  		return nil, fmt.Errorf("failed to open %s: %v", mapFile, err)
  1654  	}
  1655  	defer file.Close()
  1656  
  1657  	var mappings []specs.LinuxIDMapping
  1658  	scanner := bufio.NewScanner(file)
  1659  	for scanner.Scan() {
  1660  		line := scanner.Text()
  1661  		var myStart, parentStart, rangeLen uint32
  1662  		numParsed, err := fmt.Sscanf(line, "%d %d %d", &myStart, &parentStart, &rangeLen)
  1663  		if err != nil {
  1664  			return nil, fmt.Errorf("failed to parse line %q in file %s: %v", line, mapFile, err)
  1665  		}
  1666  		if numParsed != 3 {
  1667  			return nil, fmt.Errorf("failed to parse 3 integers from line %q in file %s", line, mapFile)
  1668  		}
  1669  		// Create an identity mapping with the current userns.
  1670  		mappings = append(mappings, specs.LinuxIDMapping{
  1671  			ContainerID: myStart,
  1672  			HostID:      myStart,
  1673  			Size:        rangeLen,
  1674  		})
  1675  	}
  1676  	if err := scanner.Err(); err != nil {
  1677  		return nil, fmt.Errorf("failed to scan file %s: %v", mapFile, err)
  1678  	}
  1679  	return mappings, nil
  1680  }
  1681  
  1682  func logIDMappings(mappings []specs.LinuxIDMapping, idType string) {
  1683  	if !log.IsLogging(log.Debug) {
  1684  		return
  1685  	}
  1686  	log.Debugf("%s Mappings:", idType)
  1687  	for _, m := range mappings {
  1688  		log.Debugf("\tContainer ID: %d, Host ID: %d, Range Length: %d", m.ContainerID, m.HostID, m.Size)
  1689  	}
  1690  }
  1691  
  1692  // nvProxyPreGoferHostSetup sets up nvproxy on the host. It runs before any
  1693  // Gofers start.
  1694  // It verifies that all the required dependencies are in place, loads kernel
  1695  // modules, and ensures the correct device files exist and are accessible.
  1696  // This should only be necessary once on the host. It should be run during the
  1697  // root container setup sequence to make sure it has run at least once.
  1698  func nvProxyPreGoferHostSetup(spec *specs.Spec, conf *config.Config) error {
  1699  	if !specutils.GPUFunctionalityRequested(spec, conf) || !conf.NVProxyDocker {
  1700  		return nil
  1701  	}
  1702  
  1703  	// Locate binaries. For security reasons, unlike
  1704  	// nvidia-container-runtime-hook, we don't add the container's filesystem
  1705  	// to the search path. We also don't support
  1706  	// /etc/nvidia-container-runtime/config.toml to avoid importing a TOML
  1707  	// parser.
  1708  	cliPath, err := exec.LookPath("nvidia-container-cli")
  1709  	if err != nil {
  1710  		return fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err)
  1711  	}
  1712  
  1713  	// nvidia-container-cli --load-kmods seems to be a noop; load kernel modules ourselves.
  1714  	nvproxyLoadKernelModules()
  1715  
  1716  	if _, err := os.Stat("/dev/nvidiactl"); err != nil {
  1717  		if !os.IsNotExist(err) {
  1718  			return fmt.Errorf("stat(2) for /dev/nvidiactl failed: %w", err)
  1719  		}
  1720  
  1721  		// Run `nvidia-container-cli info`.
  1722  		// This has the side-effect of automatically creating GPU device files.
  1723  		argv := []string{cliPath, "--load-kmods", "info"}
  1724  		log.Debugf("Executing %q", argv)
  1725  		var infoOut, infoErr strings.Builder
  1726  		cmd := exec.Cmd{
  1727  			Path:   argv[0],
  1728  			Args:   argv,
  1729  			Env:    os.Environ(),
  1730  			Stdout: &infoOut,
  1731  			Stderr: &infoErr,
  1732  		}
  1733  		if err := cmd.Run(); err != nil {
  1734  			return fmt.Errorf("nvidia-container-cli info failed, err: %v\nstdout: %s\nstderr: %s", err, infoOut.String(), infoErr.String())
  1735  		}
  1736  		log.Debugf("nvidia-container-cli info: %v", infoOut.String())
  1737  	}
  1738  
  1739  	return nil
  1740  }
  1741  
  1742  // nvproxyLoadKernelModules loads NVIDIA-related kernel modules with modprobe.
  1743  func nvproxyLoadKernelModules() {
  1744  	for _, mod := range [...]string{
  1745  		"nvidia",
  1746  		"nvidia-uvm",
  1747  	} {
  1748  		argv := []string{
  1749  			"/sbin/modprobe",
  1750  			mod,
  1751  		}
  1752  		log.Debugf("Executing %q", argv)
  1753  		var stdout, stderr strings.Builder
  1754  		cmd := exec.Cmd{
  1755  			Path:   argv[0],
  1756  			Args:   argv,
  1757  			Env:    os.Environ(),
  1758  			Stdout: &stdout,
  1759  			Stderr: &stderr,
  1760  		}
  1761  		if err := cmd.Run(); err != nil {
  1762  			// This might not be fatal since modules may already be loaded. Log
  1763  			// the failure but continue.
  1764  			log.Warningf("modprobe %s failed, err: %v\nstdout: %s\nstderr: %s", mod, err, stdout.String(), stderr.String())
  1765  		}
  1766  	}
  1767  }
  1768  
  1769  // nvproxySetupAfterGoferUserns runs `nvidia-container-cli configure`.
  1770  // This sets up the container filesystem with bind mounts that allow it to
  1771  // use NVIDIA devices.
  1772  //
  1773  // This should be called during the Gofer setup process, as the bind mounts
  1774  // are created in the Gofer's mount namespace.
  1775  // If successful, it returns a callback function that must be called once the
  1776  // Gofer process has started.
  1777  // This function has no effect if nvproxy functionality is not requested.
  1778  //
  1779  // This function essentially replicates
  1780  // nvidia-container-toolkit:cmd/nvidia-container-runtime-hook, i.e. the
  1781  // binary that executeHook() is hard-coded to skip, with differences noted
  1782  // inline. We do this rather than move the prestart hook because the
  1783  // "runtime environment" in which prestart hooks execute is vaguely
  1784  // defined, such that nvidia-container-runtime-hook and existing runsc
  1785  // hooks differ in their expected environment.
  1786  //
  1787  // Note that nvidia-container-cli will set up files in /dev and /proc which
  1788  // are useless, since they will be hidden by sentry devtmpfs and procfs
  1789  // respectively (and some device files will have the wrong device numbers
  1790  // from the application's perspective since nvproxy may register device
  1791  // numbers in sentry VFS that differ from those on the host, e.g. for
  1792  // nvidia-uvm). These files are separately created during sandbox VFS
  1793  // construction. For this reason, we don't need to parse
  1794  // NVIDIA_VISIBLE_DEVICES or pass --device to nvidia-container-cli.
  1795  func nvproxySetupAfterGoferUserns(spec *specs.Spec, conf *config.Config, goferCmd *exec.Cmd, goferDonations *donation.Agency) (func() error, error) {
  1796  	if !specutils.GPUFunctionalityRequested(spec, conf) || !conf.NVProxyDocker {
  1797  		return func() error { return nil }, nil
  1798  	}
  1799  
  1800  	if spec.Root == nil {
  1801  		return nil, fmt.Errorf("spec missing root filesystem")
  1802  	}
  1803  
  1804  	// nvidia-container-cli does not create this directory.
  1805  	if err := os.MkdirAll(path.Join(spec.Root.Path, "proc", "driver", "nvidia"), 0555); err != nil {
  1806  		return nil, fmt.Errorf("failed to create /proc/driver/nvidia in app filesystem: %w", err)
  1807  	}
  1808  
  1809  	cliPath, err := exec.LookPath("nvidia-container-cli")
  1810  	if err != nil {
  1811  		return nil, fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err)
  1812  	}
  1813  
  1814  	// On Ubuntu, ldconfig is a wrapper around ldconfig.real, and we need the latter.
  1815  	var ldconfigPath string
  1816  	if _, err := os.Stat("/sbin/ldconfig.real"); err == nil {
  1817  		ldconfigPath = "/sbin/ldconfig.real"
  1818  	} else {
  1819  		ldconfigPath = "/sbin/ldconfig"
  1820  	}
  1821  
  1822  	devices, err := specutils.NvidiaDeviceList(spec, conf)
  1823  	if err != nil {
  1824  		return nil, fmt.Errorf("failed to get nvidia device numbers: %w", err)
  1825  	}
  1826  
  1827  	// Create synchronization FD for nvproxy.
  1828  	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
  1829  	if err != nil {
  1830  		return nil, err
  1831  	}
  1832  	ourEnd := os.NewFile(uintptr(fds[0]), "nvproxy sync runsc FD")
  1833  	goferEnd := os.NewFile(uintptr(fds[1]), "nvproxy sync gofer FD")
  1834  	goferDonations.DonateAndClose("sync-nvproxy-fd", goferEnd)
  1835  
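        	// The returned callback runs `nvidia-container-cli configure` against
        	// the gofer's PID. Illustratively (paths and device list depend on the
        	// host), the resulting command looks like:
        	//
        	//	nvidia-container-cli --load-kmods configure --ldconfig=@/sbin/ldconfig \
        	//	    --no-cgroups --utility --compute --pid=<gofer pid> --device=<list> <rootfs>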
  1836  	return func() error {
  1837  		defer ourEnd.Close()
  1838  		argv := []string{
  1839  			cliPath,
  1840  			"--load-kmods",
  1841  			"configure",
  1842  			fmt.Sprintf("--ldconfig=@%s", ldconfigPath),
  1843  			"--no-cgroups", // runsc doesn't configure device cgroups yet
  1844  			"--utility",
  1845  			"--compute",
  1846  			fmt.Sprintf("--pid=%d", goferCmd.Process.Pid),
  1847  			fmt.Sprintf("--device=%s", devices),
  1848  			spec.Root.Path,
  1849  		}
  1850  		log.Debugf("Executing %q", argv)
  1851  		var stdout, stderr strings.Builder
  1852  		cmd := exec.Cmd{
  1853  			Path:   argv[0],
  1854  			Args:   argv,
  1855  			Env:    os.Environ(),
  1856  			Stdout: &stdout,
  1857  			Stderr: &stderr,
  1858  		}
  1859  		if err := cmd.Run(); err != nil {
  1860  			return fmt.Errorf("nvidia-container-cli configure failed, err: %v\nstdout: %s\nstderr: %s", err, stdout.String(), stderr.String())
  1861  		}
  1862  		return nil
  1863  	}, nil
  1864  }