gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/container/container.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package container creates and manipulates containers.
    16  package container
    17  
    18  import (
    19  	"bufio"
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"io/ioutil"
    24  	"os"
    25  	"os/exec"
    26  	"path"
    27  	"regexp"
    28  	"strconv"
    29  	"strings"
    30  	"syscall"
    31  	"time"
    32  
    33  	"github.com/cenkalti/backoff"
    34  	specs "github.com/opencontainers/runtime-spec/specs-go"
    35  	"golang.org/x/sys/unix"
    36  	"gvisor.dev/gvisor/pkg/abi/linux"
    37  	"gvisor.dev/gvisor/pkg/cleanup"
    38  	"gvisor.dev/gvisor/pkg/log"
    39  	"gvisor.dev/gvisor/pkg/sentry/control"
    40  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs"
    41  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
    42  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    43  	"gvisor.dev/gvisor/pkg/sighandling"
    44  	"gvisor.dev/gvisor/pkg/state/statefile"
    45  	"gvisor.dev/gvisor/runsc/boot"
    46  	"gvisor.dev/gvisor/runsc/cgroup"
    47  	"gvisor.dev/gvisor/runsc/config"
    48  	"gvisor.dev/gvisor/runsc/console"
    49  	"gvisor.dev/gvisor/runsc/donation"
    50  	"gvisor.dev/gvisor/runsc/sandbox"
    51  	"gvisor.dev/gvisor/runsc/specutils"
    52  )
    53  
    54  const cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent"
    55  
    56  // validateID validates the container id.
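         // For illustration, an ID like "my-container_1.0" matches the pattern used
         // below, while IDs containing slashes or spaces (or an empty ID) do not.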
    57  func validateID(id string) error {
    58  	// See libcontainer/factory_linux.go.
    59  	idRegex := regexp.MustCompile(`^[\w+\.-]+$`)
    60  	if !idRegex.MatchString(id) {
    61  		return fmt.Errorf("invalid container id: %v", id)
    62  	}
    63  	return nil
    64  }
    65  
    66  // Container represents a containerized application. When running, the
    67  // container is associated with a single Sandbox.
    68  //
    69  // Container metadata can be saved and loaded to disk. Within a root directory,
    70  // we maintain subdirectories for each container named with the container id.
    71  // The container metadata is stored as a json within the container directory
    72  // in a file named "meta.json". This metadata format is defined by us and is
    73  // not part of the OCI spec.
    74  //
    75  // Containers must write their metadata files after any change to their internal
    76  // states. The entire container directory is deleted when the container is
    77  // destroyed.
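         //
         // Loosely, the metadata file contains the JSON encoding of this struct using
         // the field tags below, e.g. {"id": ..., "spec": ..., "bundleDir": ..., ...}.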
    78  //
    79  // When the container is stopped, all processes that belong to the container
    80  // must be stopped before Destroy() returns. containerd makes roughly the
    81  // following calls to stop a container:
    82  //   - First it attempts to kill the container process with
    83  //     'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a
    84  //     separate thread, it's waiting on the container. As soon as the wait
    85  //     returns, it moves on to the next step:
    86  //   - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to
    87  //     the container. 'kill --all SIGKILL' waits for all processes before
    88  //     returning.
    89  //   - Containerd waits for stdin, stdout and stderr to drain and be closed.
     90  //   - It calls 'runsc delete'. The runc implementation sends 'kill --all SIGKILL'
     91  //     once again just to be sure, waits, and then proceeds with the remaining teardown.
    92  //
    93  // Container is thread-unsafe.
    94  type Container struct {
    95  	// ID is the container ID.
    96  	ID string `json:"id"`
    97  
    98  	// Spec is the OCI runtime spec that configures this container.
    99  	Spec *specs.Spec `json:"spec"`
   100  
   101  	// BundleDir is the directory containing the container bundle.
   102  	BundleDir string `json:"bundleDir"`
   103  
   104  	// CreatedAt is the time the container was created.
   105  	CreatedAt time.Time `json:"createdAt"`
   106  
   107  	// Owner is the container owner.
   108  	Owner string `json:"owner"`
   109  
   110  	// ConsoleSocket is the path to a unix domain socket that will receive
   111  	// the console FD.
   112  	ConsoleSocket string `json:"consoleSocket"`
   113  
   114  	// Status is the current container Status.
   115  	Status Status `json:"status"`
   116  
    117  	// GoferPid is the PID of the gofer running alongside the sandbox. May
   118  	// be 0 if the gofer has been killed.
   119  	GoferPid int `json:"goferPid"`
   120  
   121  	// Sandbox is the sandbox this container is running in. It's set when the
   122  	// container is created and reset when the sandbox is destroyed.
   123  	Sandbox *sandbox.Sandbox `json:"sandbox"`
   124  
   125  	// CompatCgroup has the cgroup configuration for the container. For the single
    126  	// container case, the container cgroup is set in `c.Sandbox` only. CompatCgroup
   127  	// is only set for multi-container, where the `c.Sandbox` cgroup represents
   128  	// the entire pod.
   129  	//
   130  	// Note that CompatCgroup is created only for compatibility with tools
   131  	// that expect container cgroups to exist. Setting limits here makes no change
   132  	// to the container in question.
   133  	CompatCgroup cgroup.CgroupJSON `json:"compatCgroup"`
   134  
   135  	// Saver handles load from/save to the state file safely from multiple
   136  	// processes.
   137  	Saver StateFile `json:"saver"`
   138  
   139  	// GoferMountConfs contains information about how the gofer mounts have been
   140  	// overlaid (with tmpfs or overlayfs). The first entry is for rootfs and the
   141  	// following entries are for bind mounts in Spec.Mounts (in the same order).
   142  	GoferMountConfs boot.GoferMountConfFlags `json:"goferMountConfs"`
   143  
   144  	//
   145  	// Fields below this line are not saved in the state file and will not
   146  	// be preserved across commands.
   147  	//
   148  
   149  	// goferIsChild is set if a gofer process is a child of the current process.
   150  	//
   151  	// This field isn't saved to json, because only a creator of a gofer
   152  	// process will have it as a child process.
   153  	goferIsChild bool `nojson:"true"`
   154  }
   155  
   156  // Args is used to configure a new container.
   157  type Args struct {
   158  	// ID is the container unique identifier.
   159  	ID string
   160  
   161  	// Spec is the OCI spec that describes the container.
   162  	Spec *specs.Spec
   163  
   164  	// BundleDir is the directory containing the container bundle.
   165  	BundleDir string
   166  
   167  	// ConsoleSocket is the path to a unix domain socket that will receive
   168  	// the console FD. It may be empty.
   169  	ConsoleSocket string
   170  
   171  	// PIDFile is the filename where the container's root process PID will be
   172  	// written to. It may be empty.
   173  	PIDFile string
   174  
   175  	// UserLog is the filename to send user-visible logs to. It may be empty.
   176  	//
   177  	// It only applies for the init container.
   178  	UserLog string
   179  
    180  	// Attached indicates that the sandbox lifecycle is attached to the caller.
   181  	// If the caller exits, the sandbox should exit too.
   182  	//
   183  	// It only applies for the init container.
   184  	Attached bool
   185  
   186  	// PassFiles are user-supplied files from the host to be exposed to the
   187  	// sandboxed app.
   188  	PassFiles map[int]*os.File
   189  
   190  	// ExecFile is the host file used for program execution.
   191  	ExecFile *os.File
   192  }
   193  
   194  // New creates the container in a new Sandbox process, unless the metadata
   195  // indicates that an existing Sandbox should be used. The caller must call
   196  // Destroy() on the container.
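         //
         // A minimal usage sketch (error handling elided; this mirrors the attached
         // flow in Run below):
         //
         //	c, err := New(conf, args)   // create the container and its sandbox
         //	defer c.Destroy()           // always release container resources
         //	err = c.Start(conf)         // start the workload
         //	ws, err := c.Wait()         // wait for the container to exit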
   197  func New(conf *config.Config, args Args) (*Container, error) {
   198  	log.Debugf("Create container, cid: %s, rootDir: %q", args.ID, conf.RootDir)
   199  	if err := validateID(args.ID); err != nil {
   200  		return nil, err
   201  	}
   202  
   203  	if err := os.MkdirAll(conf.RootDir, 0711); err != nil {
   204  		return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err)
   205  	}
   206  
   207  	if err := modifySpecForDirectfs(conf, args.Spec); err != nil {
   208  		return nil, fmt.Errorf("failed to modify spec for directfs: %v", err)
   209  	}
   210  
   211  	sandboxID := args.ID
   212  	if !isRoot(args.Spec) {
   213  		var ok bool
   214  		sandboxID, ok = specutils.SandboxID(args.Spec)
   215  		if !ok {
   216  			return nil, fmt.Errorf("no sandbox ID found when creating container")
   217  		}
   218  	}
   219  
   220  	c := &Container{
   221  		ID:            args.ID,
   222  		Spec:          args.Spec,
   223  		ConsoleSocket: args.ConsoleSocket,
   224  		BundleDir:     args.BundleDir,
   225  		Status:        Creating,
   226  		CreatedAt:     time.Now(),
   227  		Owner:         os.Getenv("USER"),
   228  		Saver: StateFile{
   229  			RootDir: conf.RootDir,
   230  			ID: FullID{
   231  				SandboxID:   sandboxID,
   232  				ContainerID: args.ID,
   233  			},
   234  		},
   235  	}
   236  	// The Cleanup object cleans up partially created containers when an error
   237  	// occurs. Any errors occurring during cleanup itself are ignored.
   238  	cu := cleanup.Make(func() { _ = c.Destroy() })
   239  	defer cu.Clean()
   240  
   241  	// Lock the container metadata file to prevent concurrent creations of
   242  	// containers with the same id.
   243  	if err := c.Saver.LockForNew(); err != nil {
   244  		return nil, fmt.Errorf("cannot lock container metadata file: %w", err)
   245  	}
   246  	defer c.Saver.UnlockOrDie()
   247  
   248  	// If the metadata annotations indicate that this container should be started
   249  	// in an existing sandbox, we must do so. These are the possible metadata
   250  	// annotation states:
   251  	//   1. No annotations: it means that there is a single container and this
   252  	//      container is obviously the root. Both container and sandbox share the
   253  	//      ID.
   254  	//   2. Container type == sandbox: it means this is the root container
    255  	//      starting the sandbox. Both container and sandbox share the same ID.
   256  	//   3. Container type == container: it means this is a subcontainer of an
   257  	//      already started sandbox. In this case, container ID is different than
   258  	//      the sandbox ID.
   259  	if isRoot(args.Spec) {
   260  		log.Debugf("Creating new sandbox for container, cid: %s", args.ID)
   261  
   262  		if args.Spec.Linux == nil {
   263  			args.Spec.Linux = &specs.Linux{}
   264  		}
   265  		// Don't force the use of cgroups in tests because they lack permission to do so.
   266  		if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   267  			args.Spec.Linux.CgroupsPath = "/" + args.ID
   268  		}
   269  		var subCgroup, parentCgroup, containerCgroup cgroup.Cgroup
   270  		if !conf.IgnoreCgroups {
   271  			var err error
   272  
   273  			// Create and join cgroup before processes are created to ensure they are
    274  			// part of the cgroup from the start (and so are all of their child processes).
   275  			parentCgroup, subCgroup, err = c.setupCgroupForRoot(conf, args.Spec)
   276  			if err != nil {
   277  				return nil, fmt.Errorf("cannot set up cgroup for root: %w", err)
   278  			}
    279  			// Join the child cgroup when using cgroupfs. Joining non-leaf
    280  			// cgroups is illegal in cgroups v2 and will return EBUSY.
   281  			if subCgroup != nil && !conf.SystemdCgroup && cgroup.IsOnlyV2() {
   282  				containerCgroup = subCgroup
   283  			} else {
   284  				containerCgroup = parentCgroup
   285  			}
   286  		}
   287  		c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup}
   288  		mountHints, err := boot.NewPodMountHints(args.Spec)
   289  		if err != nil {
   290  			return nil, fmt.Errorf("error creating pod mount hints: %w", err)
   291  		}
   292  		rootfsHint, err := boot.NewRootfsHint(args.Spec)
   293  		if err != nil {
   294  			return nil, fmt.Errorf("error creating rootfs hint: %w", err)
   295  		}
   296  		goferFilestores, goferConfs, err := c.createGoferFilestores(conf.GetOverlay2(), mountHints, rootfsHint)
   297  		if err != nil {
   298  			return nil, err
   299  		}
   300  		if !goferConfs[0].ShouldUseLisafs() && specutils.GPUFunctionalityRequestedViaHook(args.Spec, conf) {
   301  			// nvidia-container-runtime-hook attempts to populate the container
   302  			// rootfs with NVIDIA libraries and devices. With EROFS, spec.Root.Path
   303  			// points to an empty directory and populating that has no effect.
   304  			return nil, fmt.Errorf("nvidia-container-runtime-hook cannot be used together with non-lisafs backed root mount")
   305  		}
   306  		c.GoferMountConfs = goferConfs
   307  		if err := nvProxyPreGoferHostSetup(args.Spec, conf); err != nil {
   308  			return nil, err
   309  		}
   310  		if err := runInCgroup(containerCgroup, func() error {
   311  			ioFiles, devIOFile, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached, rootfsHint)
   312  			if err != nil {
   313  				return fmt.Errorf("cannot create gofer process: %w", err)
   314  			}
   315  
   316  			// Start a new sandbox for this container. Any errors after this point
   317  			// must destroy the container.
   318  			sandArgs := &sandbox.Args{
   319  				ID:                  sandboxID,
   320  				Spec:                args.Spec,
   321  				BundleDir:           args.BundleDir,
   322  				ConsoleSocket:       args.ConsoleSocket,
   323  				UserLog:             args.UserLog,
   324  				IOFiles:             ioFiles,
   325  				DevIOFile:           devIOFile,
   326  				MountsFile:          specFile,
   327  				Cgroup:              containerCgroup,
   328  				Attached:            args.Attached,
   329  				GoferFilestoreFiles: goferFilestores,
   330  				GoferMountConfs:     goferConfs,
   331  				MountHints:          mountHints,
   332  				PassFiles:           args.PassFiles,
   333  				ExecFile:            args.ExecFile,
   334  			}
   335  			sand, err := sandbox.New(conf, sandArgs)
   336  			if err != nil {
   337  				return fmt.Errorf("cannot create sandbox: %w", err)
   338  			}
   339  			c.Sandbox = sand
   340  			return nil
   341  
   342  		}); err != nil {
   343  			return nil, err
   344  		}
   345  	} else {
   346  		log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID)
   347  
   348  		// Find the sandbox associated with this ID.
   349  		fullID := FullID{
   350  			SandboxID:   sandboxID,
   351  			ContainerID: sandboxID,
   352  		}
   353  		sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true})
   354  		if err != nil {
   355  			return nil, fmt.Errorf("cannot load sandbox: %w", err)
   356  		}
   357  		c.Sandbox = sb.Sandbox
   358  
   359  		subCgroup, err := c.setupCgroupForSubcontainer(conf, args.Spec)
   360  		if err != nil {
   361  			return nil, err
   362  		}
   363  		c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup}
   364  
   365  		// If the console control socket file is provided, then create a new
   366  		// pty master/slave pair and send the TTY to the sandbox process.
   367  		var tty *os.File
   368  		if c.ConsoleSocket != "" {
   369  			// Create a new TTY pair and send the master on the provided socket.
   370  			var err error
   371  			tty, err = console.NewWithSocket(c.ConsoleSocket)
   372  			if err != nil {
   373  				return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err)
   374  			}
    375  			// The tty file is transferred to the sandbox, so it can be closed here.
   376  			defer tty.Close()
   377  		}
   378  
   379  		if err := c.Sandbox.CreateSubcontainer(conf, c.ID, tty); err != nil {
   380  			return nil, fmt.Errorf("cannot create subcontainer: %w", err)
   381  		}
   382  	}
   383  	c.changeStatus(Created)
   384  
   385  	// Save the metadata file.
   386  	if err := c.saveLocked(); err != nil {
   387  		return nil, err
   388  	}
   389  
   390  	// "If any prestart hook fails, the runtime MUST generate an error,
   391  	// stop and destroy the container" -OCI spec.
   392  	if c.Spec.Hooks != nil {
   393  		// Even though the hook name is Prestart, runc used to call it from create.
   394  		// For this reason, it's now deprecated, but the spec requires it to be
   395  		// called *before* CreateRuntime and CreateRuntime must be called in create.
   396  		//
   397  		// "For runtimes that implement the deprecated prestart hooks as
   398  		// createRuntime hooks, createRuntime hooks MUST be called after the
   399  		// prestart hooks."
   400  		if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil {
   401  			return nil, err
   402  		}
   403  		if err := executeHooks(c.Spec.Hooks.CreateRuntime, c.State()); err != nil {
   404  			return nil, err
   405  		}
   406  		if len(c.Spec.Hooks.CreateContainer) > 0 {
   407  			log.Warningf("CreateContainer hook skipped because running inside container namespace is not supported")
   408  		}
   409  	}
   410  
   411  	// Write the PID file. Containerd considers the call to create complete after
   412  	// this file is created, so it must be the last thing we do.
   413  	if args.PIDFile != "" {
   414  		if err := ioutil.WriteFile(args.PIDFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil {
   415  			return nil, fmt.Errorf("error writing PID file: %v", err)
   416  		}
   417  	}
   418  
   419  	cu.Release()
   420  	return c, nil
   421  }
   422  
   423  // Start starts running the containerized process inside the sandbox.
   424  func (c *Container) Start(conf *config.Config) error {
   425  	log.Debugf("Start container, cid: %s", c.ID)
   426  	return c.startImpl(conf, "start", c.Sandbox.StartRoot, c.Sandbox.StartSubcontainer)
   427  }
   428  
   429  // Restore takes a container and replaces its kernel and file system
    430  // to restore it from its state file.
   431  func (c *Container) Restore(conf *config.Config, imagePath string, direct bool) error {
   432  	log.Debugf("Restore container, cid: %s", c.ID)
   433  
   434  	restore := func(conf *config.Config) error {
   435  		return c.Sandbox.Restore(conf, c.ID, imagePath, direct)
   436  	}
   437  	return c.startImpl(conf, "restore", restore, c.Sandbox.RestoreSubcontainer)
   438  }
   439  
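         // startImpl is the shared implementation of Start and Restore. action names
         // the operation for error messages ("start" or "restore"), startRoot starts
         // the root container in the sandbox, and startSub starts a subcontainer;
         // which one is invoked depends on whether this container is the sandbox root.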
   440  func (c *Container) startImpl(conf *config.Config, action string, startRoot func(conf *config.Config) error, startSub func(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestores []*os.File, devIOFile *os.File, goferConfs []boot.GoferMountConf) error) error {
   441  	if err := c.Saver.lock(BlockAcquire); err != nil {
   442  		return err
   443  	}
   444  	unlock := cleanup.Make(c.Saver.UnlockOrDie)
   445  	defer unlock.Clean()
   446  
   447  	if err := c.requireStatus(action, Created); err != nil {
   448  		return err
   449  	}
   450  
    451  	// The OCI spec requires StartContainer hooks to run inside the container
    452  	// namespace, which is not supported, so they are skipped with a warning.
   453  	if c.Spec.Hooks != nil && len(c.Spec.Hooks.StartContainer) > 0 {
   454  		log.Warningf("StartContainer hook skipped because running inside container namespace is not supported")
   455  	}
   456  
   457  	if isRoot(c.Spec) {
   458  		if err := startRoot(conf); err != nil {
   459  			return err
   460  		}
   461  	} else {
   462  		rootfsHint, err := boot.NewRootfsHint(c.Spec)
   463  		if err != nil {
   464  			return fmt.Errorf("error creating rootfs hint: %w", err)
   465  		}
   466  		goferFilestores, goferConfs, err := c.createGoferFilestores(conf.GetOverlay2(), c.Sandbox.MountHints, rootfsHint)
   467  		if err != nil {
   468  			return err
   469  		}
   470  		c.GoferMountConfs = goferConfs
    471  		// Join the cgroup when starting the gofer process to ensure it's part of
    472  		// the cgroup from the start (and so are all of its child processes).
   473  		if err := runInCgroup(c.Sandbox.CgroupJSON.Cgroup, func() error {
   474  			// Create the gofer process.
   475  			goferFiles, devIOFile, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false, rootfsHint)
   476  			if err != nil {
   477  				return err
   478  			}
   479  			defer func() {
   480  				if mountsFile != nil {
   481  					_ = mountsFile.Close()
   482  				}
   483  				if devIOFile != nil {
   484  					_ = devIOFile.Close()
   485  				}
   486  				for _, f := range goferFiles {
   487  					_ = f.Close()
   488  				}
   489  				for _, f := range goferFilestores {
   490  					_ = f.Close()
   491  				}
   492  			}()
   493  
   494  			if mountsFile != nil {
   495  				cleanMounts, err := specutils.ReadMounts(mountsFile)
   496  				if err != nil {
   497  					return fmt.Errorf("reading mounts file: %v", err)
   498  				}
   499  				c.Spec.Mounts = cleanMounts
   500  			}
   501  
    502  			// Set up stdios if the container is not using a terminal. Otherwise, the
    503  			// TTY was already set up in create.
   504  			var stdios []*os.File
   505  			if !c.Spec.Process.Terminal {
   506  				stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr}
   507  			}
   508  
   509  			return startSub(c.Spec, conf, c.ID, stdios, goferFiles, goferFilestores, devIOFile, goferConfs)
   510  		}); err != nil {
   511  			return err
   512  		}
   513  	}
   514  
   515  	// "If any poststart hook fails, the runtime MUST log a warning, but
   516  	// the remaining hooks and lifecycle continue as if the hook had
   517  	// succeeded" -OCI spec.
   518  	if c.Spec.Hooks != nil {
   519  		executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State())
   520  	}
   521  
   522  	c.changeStatus(Running)
   523  	if err := c.saveLocked(); err != nil {
   524  		return err
   525  	}
   526  
   527  	// Release lock before adjusting OOM score because the lock is acquired there.
   528  	unlock.Clean()
   529  
   530  	// Adjust the oom_score_adj for sandbox. This must be done after saveLocked().
   531  	if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Spec, c.Saver.RootDir, false); err != nil {
   532  		return err
   533  	}
   534  
    535  	// Apply the container's oom_score_adj to the gofer since it is dedicated to
   536  	// the container, in case the gofer uses up too much memory.
   537  	return c.adjustGoferOOMScoreAdj()
   538  }
   539  
   540  // Run is a helper that calls Create + Start + Wait.
   541  func Run(conf *config.Config, args Args) (unix.WaitStatus, error) {
   542  	log.Debugf("Run container, cid: %s, rootDir: %q", args.ID, conf.RootDir)
   543  	c, err := New(conf, args)
   544  	if err != nil {
   545  		return 0, fmt.Errorf("creating container: %v", err)
   546  	}
   547  	// Clean up partially created container if an error occurs.
   548  	// Any errors returned by Destroy() itself are ignored.
   549  	cu := cleanup.Make(func() {
   550  		c.Destroy()
   551  	})
   552  	defer cu.Clean()
   553  
   554  	if err := c.Start(conf); err != nil {
   555  		return 0, fmt.Errorf("starting container: %v", err)
   556  	}
   557  
   558  	// If we allocate a terminal, forward signals to the sandbox process.
   559  	// Otherwise, Ctrl+C will terminate this process and its children,
   560  	// including the terminal.
   561  	if c.Spec.Process.Terminal {
   562  		stopForwarding := c.ForwardSignals(0, true /* fgProcess */)
   563  		defer stopForwarding()
   564  	}
   565  
   566  	if args.Attached {
   567  		return c.Wait()
   568  	}
   569  	cu.Release()
   570  	return 0, nil
   571  }
   572  
   573  // Execute runs the specified command in the container. It returns the PID of
   574  // the newly created process.
   575  func (c *Container) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) {
   576  	log.Debugf("Execute in container, cid: %s, args: %+v", c.ID, args)
   577  	if err := c.requireStatus("execute in", Created, Running); err != nil {
   578  		return 0, err
   579  	}
   580  	args.ContainerID = c.ID
   581  	return c.Sandbox.Execute(conf, args)
   582  }
   583  
   584  // Event returns events for the container.
   585  func (c *Container) Event() (*boot.EventOut, error) {
   586  	log.Debugf("Getting events for container, cid: %s", c.ID)
   587  	if err := c.requireStatus("get events for", Created, Running, Paused); err != nil {
   588  		return nil, err
   589  	}
   590  	event, err := c.Sandbox.Event(c.ID)
   591  	if err != nil {
   592  		return nil, err
   593  	}
   594  
   595  	if len(event.ContainerUsage) > 0 {
   596  		// Some stats can utilize host cgroups for accuracy.
   597  		c.populateStats(event)
   598  	}
   599  
   600  	return event, nil
   601  }
   602  
   603  // PortForward starts port forwarding to the container.
   604  func (c *Container) PortForward(opts *boot.PortForwardOpts) error {
   605  	if err := c.requireStatus("port forward", Running); err != nil {
   606  		return err
   607  	}
   608  	opts.ContainerID = c.ID
   609  	return c.Sandbox.PortForward(opts)
   610  }
   611  
    612  // SandboxPid returns the PID of the sandbox process the container is running
    613  // in, or -1 if the container is not running.
   614  func (c *Container) SandboxPid() int {
   615  	if err := c.requireStatus("get PID", Created, Running, Paused); err != nil {
   616  		return -1
   617  	}
   618  	return c.Sandbox.Getpid()
   619  }
   620  
   621  // Wait waits for the container to exit, and returns its WaitStatus.
    622  // Calling Wait on a stopped container is allowed in order to retrieve the
    623  // exit status; in that case Wait returns immediately.
   624  func (c *Container) Wait() (unix.WaitStatus, error) {
   625  	log.Debugf("Wait on container, cid: %s", c.ID)
   626  	ws, err := c.Sandbox.Wait(c.ID)
   627  	if err == nil {
   628  		// Wait succeeded, container is not running anymore.
   629  		c.changeStatus(Stopped)
   630  	}
   631  	return ws, err
   632  }
   633  
   634  // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and
   635  // returns its WaitStatus.
   636  func (c *Container) WaitRootPID(pid int32) (unix.WaitStatus, error) {
   637  	log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID)
   638  	if !c.IsSandboxRunning() {
   639  		return 0, fmt.Errorf("sandbox is not running")
   640  	}
   641  	return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
   642  }
   643  
   644  // WaitPID waits for process 'pid' in the container's PID namespace and returns
   645  // its WaitStatus.
   646  func (c *Container) WaitPID(pid int32) (unix.WaitStatus, error) {
   647  	log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID)
   648  	if !c.IsSandboxRunning() {
   649  		return 0, fmt.Errorf("sandbox is not running")
   650  	}
   651  	return c.Sandbox.WaitPID(c.ID, pid)
   652  }
   653  
   654  // SignalContainer sends the signal to the container. If all is true and signal
   655  // is SIGKILL, then waits for all processes to exit before returning.
   656  // SignalContainer returns an error if the container is already stopped.
   657  // TODO(b/113680494): Distinguish different error types.
   658  func (c *Container) SignalContainer(sig unix.Signal, all bool) error {
   659  	log.Debugf("Signal container, cid: %s, signal: %v (%d)", c.ID, sig, sig)
    660  	// Signaling a container in the Stopped state is allowed. When all=false,
    661  	// an error will be returned anyway; when all=true, this allows
    662  	// sending a signal to other processes inside the container even
   663  	// after the init process exits. This is especially useful for
   664  	// container cleanup.
   665  	if err := c.requireStatus("signal", Running, Stopped); err != nil {
   666  		return err
   667  	}
   668  	if !c.IsSandboxRunning() {
   669  		return fmt.Errorf("sandbox is not running")
   670  	}
   671  	return c.Sandbox.SignalContainer(c.ID, sig, all)
   672  }
   673  
   674  // SignalProcess sends sig to a specific process in the container.
   675  func (c *Container) SignalProcess(sig unix.Signal, pid int32) error {
   676  	log.Debugf("Signal process %d in container, cid: %s, signal: %v (%d)", pid, c.ID, sig, sig)
   677  	if err := c.requireStatus("signal a process inside", Running); err != nil {
   678  		return err
   679  	}
   680  	if !c.IsSandboxRunning() {
   681  		return fmt.Errorf("sandbox is not running")
   682  	}
   683  	return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false)
   684  }
   685  
   686  // ForwardSignals forwards all signals received by the current process to the
   687  // container process inside the sandbox. It returns a function that will stop
   688  // forwarding signals.
   689  func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() {
    690  	log.Debugf("Forwarding all signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess)
   691  	stop := sighandling.StartSignalForwarding(func(sig linux.Signal) {
   692  		log.Debugf("Forwarding signal %d to container, cid: %s, PID: %d, fgProcess: %t", sig, c.ID, pid, fgProcess)
   693  		if err := c.Sandbox.SignalProcess(c.ID, pid, unix.Signal(sig), fgProcess); err != nil {
   694  			log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err)
   695  		}
   696  	})
   697  	return func() {
   698  		log.Debugf("Done forwarding signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess)
   699  		stop()
   700  	}
   701  }
   702  
   703  // Checkpoint sends the checkpoint call to the container.
    704  // The statefile will be written to the file at the specified image-path.
   705  func (c *Container) Checkpoint(imagePath string, direct bool, sfOpts statefile.Options, mfOpts pgalloc.SaveOpts) error {
   706  	log.Debugf("Checkpoint container, cid: %s", c.ID)
   707  	if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil {
   708  		return err
   709  	}
   710  	return c.Sandbox.Checkpoint(c.ID, imagePath, direct, sfOpts, mfOpts)
   711  }
   712  
   713  // Pause suspends the container and its kernel.
   714  // The call only succeeds if the container's status is created or running.
   715  func (c *Container) Pause() error {
   716  	log.Debugf("Pausing container, cid: %s", c.ID)
   717  	if err := c.Saver.lock(BlockAcquire); err != nil {
   718  		return err
   719  	}
   720  	defer c.Saver.UnlockOrDie()
   721  
   722  	if c.Status != Created && c.Status != Running {
   723  		return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
   724  	}
   725  
   726  	if err := c.Sandbox.Pause(c.ID); err != nil {
   727  		return fmt.Errorf("pausing container %q: %v", c.ID, err)
   728  	}
   729  	c.changeStatus(Paused)
   730  	return c.saveLocked()
   731  }
   732  
   733  // Resume unpauses the container and its kernel.
   734  // The call only succeeds if the container's status is paused.
   735  func (c *Container) Resume() error {
   736  	log.Debugf("Resuming container, cid: %s", c.ID)
   737  	if err := c.Saver.lock(BlockAcquire); err != nil {
   738  		return err
   739  	}
   740  	defer c.Saver.UnlockOrDie()
   741  
   742  	if c.Status != Paused {
   743  		return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status)
   744  	}
   745  	if err := c.Sandbox.Resume(c.ID); err != nil {
   746  		return fmt.Errorf("resuming container: %v", err)
   747  	}
   748  	c.changeStatus(Running)
   749  	return c.saveLocked()
   750  }
   751  
   752  // State returns the metadata of the container.
   753  func (c *Container) State() specs.State {
   754  	return specs.State{
   755  		Version:     specs.Version,
   756  		ID:          c.ID,
   757  		Status:      c.Status,
   758  		Pid:         c.SandboxPid(),
   759  		Bundle:      c.BundleDir,
   760  		Annotations: c.Spec.Annotations,
   761  	}
   762  }
   763  
   764  // Processes retrieves the list of processes and associated metadata inside a
   765  // container.
   766  func (c *Container) Processes() ([]*control.Process, error) {
   767  	if err := c.requireStatus("get processes of", Running, Paused); err != nil {
   768  		return nil, err
   769  	}
   770  	return c.Sandbox.Processes(c.ID)
   771  }
   772  
   773  // Destroy stops all processes and frees all resources associated with the
   774  // container.
   775  func (c *Container) Destroy() error {
   776  	log.Debugf("Destroy container, cid: %s", c.ID)
   777  
   778  	if err := c.Saver.lock(BlockAcquire); err != nil {
   779  		return err
   780  	}
   781  	defer func() {
   782  		c.Saver.UnlockOrDie()
   783  		_ = c.Saver.close()
   784  	}()
   785  
    786  	// Store the sandbox for later use, as stop() sets c.Sandbox to nil.
   787  	sb := c.Sandbox
   788  
   789  	// We must perform the following cleanup steps:
   790  	//	* stop the container and gofer processes,
   791  	//	* remove the container filesystem on the host, and
   792  	//	* delete the container metadata directory.
   793  	//
   794  	// It's possible for one or more of these steps to fail, but we should
   795  	// do our best to perform all of the cleanups. Hence, we keep a slice
    796  	// of errors and return their concatenation.
   797  	var errs []string
   798  	if err := c.stop(); err != nil {
   799  		err = fmt.Errorf("stopping container: %v", err)
   800  		log.Warningf("%v", err)
   801  		errs = append(errs, err.Error())
   802  	}
   803  
   804  	if err := c.Saver.Destroy(); err != nil {
   805  		err = fmt.Errorf("deleting container state files: %v", err)
   806  		log.Warningf("%v", err)
   807  		errs = append(errs, err.Error())
   808  	}
   809  
   810  	// Clean up self-backed filestore files created in their respective mounts.
   811  	c.forEachSelfMount(func(mountSrc string) {
   812  		if sb != nil {
   813  			if hint := sb.MountHints.FindMount(mountSrc); hint != nil && hint.ShouldShareMount() {
   814  				// Don't delete filestore file for shared mounts. The sandbox owns a
   815  				// shared master mount which uses this filestore and is shared with
   816  				// multiple mount points.
   817  				return
   818  			}
   819  		}
   820  		filestorePath := boot.SelfFilestorePath(mountSrc, c.sandboxID())
   821  		if err := os.Remove(filestorePath); err != nil {
   822  			err = fmt.Errorf("failed to delete filestore file %q: %v", filestorePath, err)
   823  			log.Warningf("%v", err)
   824  			errs = append(errs, err.Error())
   825  		}
   826  	})
   827  	if sb != nil && sb.IsRootContainer(c.ID) {
   828  		// When the root container is being destroyed, we can clean up filestores
   829  		// used by shared mounts.
   830  		for _, hint := range sb.MountHints.Mounts {
   831  			if !hint.ShouldShareMount() {
   832  				continue
   833  			}
   834  			// Assume this is a self-backed shared mount and try to delete the
    835  			// filestore, ignoring ENOENT in case the assumption is wrong.
   836  			filestorePath := boot.SelfFilestorePath(hint.Mount.Source, c.sandboxID())
   837  			if err := os.Remove(filestorePath); err != nil && !os.IsNotExist(err) {
   838  				err = fmt.Errorf("failed to delete shared filestore file %q: %v", filestorePath, err)
   839  				log.Warningf("%v", err)
   840  				errs = append(errs, err.Error())
   841  			}
   842  		}
   843  	}
   844  
   845  	c.changeStatus(Stopped)
   846  
   847  	// Adjust oom_score_adj for the sandbox. This must be done after the container
    848  	// is stopped and its metadata directory is removed.
   849  	//
   850  	// Use 'sb' to tell whether it has been executed before because Destroy must
   851  	// be idempotent.
   852  	if sb != nil {
   853  		if err := adjustSandboxOOMScoreAdj(sb, c.Spec, c.Saver.RootDir, true); err != nil {
   854  			errs = append(errs, err.Error())
   855  		}
   856  	}
   857  
   858  	// "If any poststop hook fails, the runtime MUST log a warning, but the
   859  	// remaining hooks and lifecycle continue as if the hook had
   860  	// succeeded" - OCI spec.
   861  	//
   862  	// Based on the OCI, "The post-stop hooks MUST be called after the container
   863  	// is deleted but before the delete operation returns"
   864  	// Run it here to:
   865  	// 1) Conform to the OCI.
    866  	// 2) Make sure it only runs once: because the root has been deleted, the
   867  	// container can't be loaded again.
   868  	if c.Spec.Hooks != nil {
   869  		executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
   870  	}
   871  
   872  	if len(errs) == 0 {
   873  		return nil
   874  	}
    875  	return errors.New(strings.Join(errs, "\n"))
   876  }
   877  
   878  func (c *Container) sandboxID() string {
   879  	return c.Saver.ID.SandboxID
   880  }
   881  
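         // forEachSelfMount calls fn with the mount source of every self-backed gofer
         // mount, rootfs first and then bind mounts, matching the order of entries in
         // GoferMountConfs.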
   882  func (c *Container) forEachSelfMount(fn func(mountSrc string)) {
   883  	if c.GoferMountConfs == nil {
   884  		// Container not started? Skip.
   885  		return
   886  	}
   887  	if c.GoferMountConfs[0].IsSelfBacked() {
   888  		fn(c.Spec.Root.Path)
   889  	}
   890  	goferMntIdx := 1 // First index is for rootfs.
   891  	for i := range c.Spec.Mounts {
   892  		if !specutils.IsGoferMount(c.Spec.Mounts[i]) {
   893  			continue
   894  		}
   895  		if c.GoferMountConfs[goferMntIdx].IsSelfBacked() {
   896  			fn(c.Spec.Mounts[i].Source)
   897  		}
   898  		goferMntIdx++
   899  	}
   900  }
   901  
   902  // createGoferFilestores creates the regular files that will back the
   903  // tmpfs/overlayfs mounts that will overlay some gofer mounts. It also returns
   904  // information about how each gofer mount is configured.
   905  func (c *Container) createGoferFilestores(ovlConf config.Overlay2, mountHints *boot.PodMountHints, rootfsHint *boot.RootfsHint) ([]*os.File, []boot.GoferMountConf, error) {
   906  	var goferFilestores []*os.File
   907  	var goferConfs []boot.GoferMountConf
   908  
   909  	// Handle root mount first.
   910  	overlayMedium := ovlConf.RootOverlayMedium()
   911  	mountType := boot.Bind
   912  	if rootfsHint != nil {
   913  		overlayMedium = rootfsHint.Overlay
   914  		if !specutils.IsGoferMount(rootfsHint.Mount) {
   915  			mountType = rootfsHint.Mount.Type
   916  		}
   917  	}
   918  	if c.Spec.Root.Readonly {
   919  		overlayMedium = config.NoOverlay
   920  	}
   921  	filestore, goferConf, err := c.createGoferFilestore(overlayMedium, c.Spec.Root.Path, mountType, false /* isShared */)
   922  	if err != nil {
   923  		return nil, nil, err
   924  	}
   925  	if filestore != nil {
   926  		goferFilestores = append(goferFilestores, filestore)
   927  	}
   928  	goferConfs = append(goferConfs, goferConf)
   929  
   930  	// Handle bind mounts.
   931  	for i := range c.Spec.Mounts {
   932  		if !specutils.IsGoferMount(c.Spec.Mounts[i]) {
   933  			continue
   934  		}
   935  		overlayMedium = ovlConf.SubMountOverlayMedium()
   936  		mountType = boot.Bind
   937  		isShared := false
   938  		if specutils.IsReadonlyMount(c.Spec.Mounts[i].Options) {
   939  			overlayMedium = config.NoOverlay
   940  		}
   941  		if hint := mountHints.FindMount(c.Spec.Mounts[i].Source); hint != nil {
   942  			// Note that we want overlayMedium=self even if this is a read-only mount so that
   943  			// the shared mount is created correctly. Future containers may mount this writably.
   944  			overlayMedium = config.SelfOverlay
   945  			if !specutils.IsGoferMount(hint.Mount) {
   946  				mountType = hint.Mount.Type
   947  			}
   948  			isShared = hint.ShouldShareMount()
   949  		}
   950  		filestore, goferConf, err := c.createGoferFilestore(overlayMedium, c.Spec.Mounts[i].Source, mountType, isShared)
   951  		if err != nil {
   952  			return nil, nil, err
   953  		}
   954  		if filestore != nil {
   955  			goferFilestores = append(goferFilestores, filestore)
   956  		}
   957  		goferConfs = append(goferConfs, goferConf)
   958  	}
   959  	for _, filestore := range goferFilestores {
    960  		// Perform this workaround outside the sandbox. The sandbox may already be
   961  		// running with seccomp filters that do not allow this.
   962  		pgalloc.IMAWorkAroundForMemFile(filestore.Fd())
   963  	}
   964  	return goferFilestores, goferConfs, nil
   965  }
   966  
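         // createGoferFilestore returns the backing filestore file (if any) and the
         // gofer mount configuration for a single mount. Roughly, the mount type
         // selects the lower layer (bind => lisafs, tmpfs => none, erofs => erofs) and
         // the overlay medium selects the upper layer; only the self and anon overlay
         // media need a backing filestore file.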
   967  func (c *Container) createGoferFilestore(overlayMedium config.OverlayMedium, mountSrc string, mountType string, isShared bool) (*os.File, boot.GoferMountConf, error) {
   968  	var lower boot.GoferMountConfLowerType
   969  	switch mountType {
   970  	case boot.Bind:
   971  		lower = boot.Lisafs
   972  	case tmpfs.Name:
   973  		lower = boot.NoneLower
   974  	case erofs.Name:
   975  		lower = boot.Erofs
   976  	default:
   977  		return nil, boot.GoferMountConf{}, fmt.Errorf("unsupported mount type %q in mount hint", mountType)
   978  	}
   979  	switch overlayMedium {
   980  	case config.NoOverlay:
   981  		return nil, boot.GoferMountConf{Lower: lower, Upper: boot.NoOverlay}, nil
   982  	case config.MemoryOverlay:
   983  		return nil, boot.GoferMountConf{Lower: lower, Upper: boot.MemoryOverlay}, nil
   984  	case config.SelfOverlay:
   985  		return c.createGoferFilestoreInSelf(mountSrc, isShared, boot.GoferMountConf{Lower: lower, Upper: boot.SelfOverlay})
   986  	default:
   987  		if overlayMedium.IsBackedByAnon() {
   988  			return c.createGoferFilestoreInDir(overlayMedium.HostFileDir(), boot.GoferMountConf{Lower: lower, Upper: boot.AnonOverlay})
   989  		}
   990  		return nil, boot.GoferMountConf{}, fmt.Errorf("unexpected overlay medium %q", overlayMedium)
   991  	}
   992  }
   993  
   994  func (c *Container) createGoferFilestoreInSelf(mountSrc string, isShared bool, successConf boot.GoferMountConf) (*os.File, boot.GoferMountConf, error) {
   995  	mountSrcInfo, err := os.Stat(mountSrc)
   996  	if err != nil {
    997  		return nil, boot.GoferMountConf{}, fmt.Errorf("failed to stat mount %q to see if it is a directory: %v", mountSrc, err)
   998  	}
   999  	if !mountSrcInfo.IsDir() {
  1000  		log.Warningf("self filestore is only supported for directory mounts, but mount %q is not a directory, falling back to memory", mountSrc)
  1001  		return nil, boot.GoferMountConf{Lower: successConf.Lower, Upper: boot.MemoryOverlay}, nil
  1002  	}
  1003  	// Create the self filestore file.
  1004  	createFlags := unix.O_RDWR | unix.O_CREAT | unix.O_CLOEXEC
  1005  	if !isShared {
  1006  		// Allow shared mounts to reuse existing filestore. A previous shared user
  1007  		// may have already set up the filestore.
  1008  		createFlags |= unix.O_EXCL
  1009  	}
  1010  	filestorePath := boot.SelfFilestorePath(mountSrc, c.sandboxID())
  1011  	filestoreFD, err := unix.Open(filestorePath, createFlags, 0666)
  1012  	if err != nil {
  1013  		if err == unix.EEXIST {
  1014  			// Note that if the same submount is mounted multiple times within the
  1015  			// same sandbox, and is not shared, then the overlay option doesn't work
   1016  			// correctly, because each overlay mount is independent and changes to
   1017  			// one are not visible to the others.
  1018  			return nil, boot.GoferMountConf{}, fmt.Errorf("%q mount source already has a filestore file at %q; repeated submounts are not supported with overlay optimizations", mountSrc, filestorePath)
  1019  		}
  1020  		return nil, boot.GoferMountConf{}, fmt.Errorf("failed to create filestore file inside %q: %v", mountSrc, err)
  1021  	}
  1022  	log.Debugf("Created filestore file at %q for mount source %q", filestorePath, mountSrc)
   1023  	// The self filestore must be a named file because it needs to be
   1024  	// discoverable via path traversal so that k8s can scan the filesystem
   1025  	// and apply any limits appropriately (like local ephemeral storage
   1026  	// limits). So don't delete it here; these files are unlinked when the
   1027  	// container is destroyed. This makes the self medium appropriate for k8s.
  1028  	return os.NewFile(uintptr(filestoreFD), filestorePath), successConf, nil
  1029  }
  1030  
  1031  func (c *Container) createGoferFilestoreInDir(filestoreDir string, successConf boot.GoferMountConf) (*os.File, boot.GoferMountConf, error) {
  1032  	fileInfo, err := os.Stat(filestoreDir)
  1033  	if err != nil {
  1034  		return nil, boot.GoferMountConf{}, fmt.Errorf("failed to stat filestore directory %q: %v", filestoreDir, err)
  1035  	}
  1036  	if !fileInfo.IsDir() {
  1037  		return nil, boot.GoferMountConf{}, fmt.Errorf("overlay2 flag should specify an existing directory")
  1038  	}
  1039  	// Create an unnamed temporary file in filestore directory which will be
  1040  	// deleted when the last FD on it is closed. We don't use O_TMPFILE because
  1041  	// it is not supported on all filesystems. So we simulate it by creating a
  1042  	// named file and then immediately unlinking it while keeping an FD on it.
  1043  	// This file will be deleted when the container exits.
  1044  	filestoreFile, err := os.CreateTemp(filestoreDir, "runsc-filestore-")
  1045  	if err != nil {
  1046  		return nil, boot.GoferMountConf{}, fmt.Errorf("failed to create a temporary file inside %q: %v", filestoreDir, err)
  1047  	}
  1048  	if err := unix.Unlink(filestoreFile.Name()); err != nil {
  1049  		return nil, boot.GoferMountConf{}, fmt.Errorf("failed to unlink temporary file %q: %v", filestoreFile.Name(), err)
  1050  	}
  1051  	log.Debugf("Created an unnamed filestore file at %q", filestoreDir)
  1052  	return filestoreFile, successConf, nil
  1053  }
  1054  
  1055  // saveLocked saves the container metadata to a file.
  1056  //
   1057  // Precondition: the container must be locked via c.Saver.lock().
  1058  func (c *Container) saveLocked() error {
  1059  	log.Debugf("Save container, cid: %s", c.ID)
  1060  	if err := c.Saver.SaveLocked(c); err != nil {
  1061  		return fmt.Errorf("saving container metadata: %v", err)
  1062  	}
  1063  	return nil
  1064  }
  1065  
  1066  // stop stops the container (for regular containers) or the sandbox (for
  1067  // root containers), and waits for the container or sandbox and the gofer
  1068  // to stop. If any of them doesn't stop before timeout, an error is returned.
  1069  func (c *Container) stop() error {
  1070  	var parentCgroup cgroup.Cgroup
  1071  
  1072  	if c.Sandbox != nil {
  1073  		log.Debugf("Destroying container, cid: %s", c.ID)
  1074  		if err := c.Sandbox.DestroyContainer(c.ID); err != nil {
  1075  			return fmt.Errorf("destroying container %q: %v", c.ID, err)
  1076  		}
  1077  		// Only uninstall parentCgroup for sandbox stop.
  1078  		if c.Sandbox.IsRootContainer(c.ID) {
  1079  			parentCgroup = c.Sandbox.CgroupJSON.Cgroup
  1080  		}
  1081  		// Only set sandbox to nil after it has been told to destroy the container.
  1082  		c.Sandbox = nil
  1083  	}
  1084  
  1085  	// Try killing gofer if it does not exit with container.
  1086  	if c.GoferPid != 0 {
  1087  		log.Debugf("Killing gofer for container, cid: %s, PID: %d", c.ID, c.GoferPid)
  1088  		if err := unix.Kill(c.GoferPid, unix.SIGKILL); err != nil {
  1089  			// The gofer may already be stopped, log the error.
  1090  			log.Warningf("Error sending signal %d to gofer %d: %v", unix.SIGKILL, c.GoferPid, err)
  1091  		}
  1092  	}
  1093  
  1094  	if err := c.waitForStopped(); err != nil {
  1095  		return err
  1096  	}
  1097  
  1098  	// Delete container cgroup if any.
  1099  	if c.CompatCgroup.Cgroup != nil {
  1100  		if err := c.CompatCgroup.Cgroup.Uninstall(); err != nil {
  1101  			return err
  1102  		}
  1103  	}
  1104  	// Gofer is running inside parentCgroup, so Cgroup.Uninstall has to be called
  1105  	// after the gofer has stopped.
  1106  	if parentCgroup != nil {
  1107  		if err := parentCgroup.Uninstall(); err != nil {
  1108  			return err
  1109  		}
  1110  	}
  1111  	return nil
  1112  }
  1113  
  1114  func (c *Container) waitForStopped() error {
  1115  	if c.GoferPid == 0 {
  1116  		return nil
  1117  	}
  1118  
  1119  	if c.IsSandboxRunning() {
  1120  		if err := c.SignalContainer(unix.Signal(0), false); err == nil {
  1121  			return fmt.Errorf("container is still running")
  1122  		}
  1123  	}
  1124  
  1125  	if c.goferIsChild {
   1126  		// The gofer process is a child of the current process,
   1127  		// so we can wait on it and collect its zombie.
   1128  		if _, err := unix.Wait4(int(c.GoferPid), nil, 0, nil); err != nil {
   1129  			return fmt.Errorf("error waiting on the gofer process: %v", err)
  1130  		}
  1131  		c.GoferPid = 0
  1132  		return nil
  1133  	}
  1134  
  1135  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
  1136  	defer cancel()
  1137  	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
  1138  	op := func() error {
  1139  		if err := unix.Kill(c.GoferPid, 0); err == nil {
  1140  			return fmt.Errorf("gofer is still running")
  1141  		}
  1142  		c.GoferPid = 0
  1143  		return nil
  1144  	}
  1145  	return backoff.Retry(op, b)
  1146  }
  1147  
  1148  // shouldCreateDeviceGofer indicates whether a device gofer connection should
  1149  // be created.
  1150  func shouldCreateDeviceGofer(spec *specs.Spec, conf *config.Config) bool {
  1151  	return specutils.GPUFunctionalityRequested(spec, conf) || specutils.TPUFunctionalityRequested(spec, conf)
  1152  }
  1153  
  1154  // shouldSpawnGofer indicates whether the gofer process should be spawned.
  1155  func shouldSpawnGofer(spec *specs.Spec, conf *config.Config, goferConfs []boot.GoferMountConf) bool {
  1156  	// Lisafs mounts need the gofer.
  1157  	for _, cfg := range goferConfs {
  1158  		if cfg.ShouldUseLisafs() {
  1159  			return true
  1160  		}
  1161  	}
  1162  	// Device gofer needs a gofer process.
  1163  	return shouldCreateDeviceGofer(spec, conf)
  1164  }
  1165  
   1166  // createGoferProcess returns an IO file list, a device IO file (nil when no
   1167  // device gofer is needed), and a mounts file on success. The IO file list
   1168  // consists of image files and/or socket files to connect to a gofer endpoint
   1169  // for the mount points that use gofers. The mounts file is used to read the
   1170  // resolved list of mounts (direct paths, no symlinks); it is nil if no mount cleaning is required.
  1171  func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool, rootfsHint *boot.RootfsHint) ([]*os.File, *os.File, *os.File, error) {
  1172  	if !shouldSpawnGofer(spec, conf, c.GoferMountConfs) {
  1173  		if !c.GoferMountConfs[0].ShouldUseErofs() {
  1174  			panic("goferless mode is only possible with EROFS rootfs")
  1175  		}
  1176  		ioFile, err := os.Open(rootfsHint.Mount.Source)
  1177  		if err != nil {
  1178  			return nil, nil, nil, fmt.Errorf("opening rootfs image %q: %v", rootfsHint.Mount.Source, err)
  1179  		}
  1180  		return []*os.File{ioFile}, nil, nil, nil
  1181  	}
  1182  
  1183  	// Ensure we don't leak FDs to the gofer process.
  1184  	if err := sandbox.SetCloExeOnAllFDs(); err != nil {
  1185  		return nil, nil, nil, fmt.Errorf("setting CLOEXEC on all FDs: %w", err)
  1186  	}
  1187  
  1188  	donations := donation.Agency{}
  1189  	defer donations.Close()
  1190  
  1191  	if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
  1192  		return nil, nil, nil, err
  1193  	}
  1194  	if conf.DebugLog != "" {
  1195  		test := ""
  1196  		if len(conf.TestOnlyTestNameEnv) != 0 {
  1197  			// Fetch test name if one is provided and the test only flag was set.
  1198  			if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
  1199  				test = t
  1200  			}
  1201  		}
  1202  		if specutils.IsDebugCommand(conf, "gofer") {
  1203  			if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "gofer", test); err != nil {
  1204  				return nil, nil, nil, err
  1205  			}
  1206  		}
  1207  	}
  1208  
  1209  	// Start with the general config flags.
  1210  	cmd := exec.Command(specutils.ExePath, conf.ToFlags()...)
  1211  	cmd.SysProcAttr = &unix.SysProcAttr{
  1212  		// Detach from session. Otherwise, signals sent to the foreground process
  1213  		// will also be forwarded by this process, resulting in duplicate signals.
  1214  		Setsid: true,
  1215  	}
  1216  
   1217  	// Set Args[0] to make it easier to spot the gofer process. Otherwise it's
  1218  	// shown as `exe`.
  1219  	cmd.Args[0] = "runsc-gofer"
  1220  
   1221  	// Transfer FDs that need to be present before the "gofer" command.
  1222  	// Start at 3 because 0, 1, and 2 are taken by stdin/out/err.
  1223  	nextFD := donations.Transfer(cmd, 3)
  1224  
  1225  	cmd.Args = append(cmd.Args, "gofer", "--bundle", bundleDir)
  1226  	cmd.Args = append(cmd.Args, "--gofer-mount-confs="+c.GoferMountConfs.String())
  1227  
   1228  	// Open the spec file to donate to the gofer.
  1229  	specFile, err := specutils.OpenSpec(bundleDir)
  1230  	if err != nil {
  1231  		return nil, nil, nil, fmt.Errorf("opening spec file: %v", err)
  1232  	}
  1233  	donations.DonateAndClose("spec-fd", specFile)
  1234  
  1235  	// Donate any profile FDs to the gofer.
  1236  	if err := c.donateGoferProfileFDs(conf, &donations); err != nil {
  1237  		return nil, nil, nil, fmt.Errorf("donating gofer profile fds: %w", err)
  1238  	}
  1239  
  1240  	// Create pipe that allows gofer to send mount list to sandbox after all paths
  1241  	// have been resolved.
  1242  	mountsSand, mountsGofer, err := os.Pipe()
  1243  	if err != nil {
  1244  		return nil, nil, nil, err
  1245  	}
  1246  	donations.DonateAndClose("mounts-fd", mountsGofer)
  1247  
   1248  	// Count the number of mounts that need an IO file.
  1249  	ioFileCount := 0
  1250  	for _, cfg := range c.GoferMountConfs {
  1251  		if cfg.ShouldUseLisafs() || cfg.ShouldUseErofs() {
  1252  			ioFileCount++
  1253  		}
  1254  	}
  1255  
  1256  	sandEnds := make([]*os.File, 0, ioFileCount)
  1257  	for i, cfg := range c.GoferMountConfs {
  1258  		switch {
  1259  		case cfg.ShouldUseLisafs():
  1260  			fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
  1261  			if err != nil {
  1262  				return nil, nil, nil, err
  1263  			}
  1264  			sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD"))
  1265  
  1266  			goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD")
  1267  			donations.DonateAndClose("io-fds", goferEnd)
  1268  
  1269  		case cfg.ShouldUseErofs():
  1270  			if i > 0 {
  1271  				return nil, nil, nil, fmt.Errorf("EROFS lower layer is only supported for root mount")
  1272  			}
  1273  			f, err := os.Open(rootfsHint.Mount.Source)
  1274  			if err != nil {
  1275  				return nil, nil, nil, fmt.Errorf("opening rootfs image %q: %v", rootfsHint.Mount.Source, err)
  1276  			}
  1277  			sandEnds = append(sandEnds, f)
  1278  		}
  1279  	}
  1280  	var devSandEnd *os.File
  1281  	if shouldCreateDeviceGofer(spec, conf) {
  1282  		fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
  1283  		if err != nil {
  1284  			return nil, nil, nil, err
  1285  		}
  1286  		devSandEnd = os.NewFile(uintptr(fds[0]), "sandbox dev IO FD")
  1287  		donations.DonateAndClose("dev-io-fd", os.NewFile(uintptr(fds[1]), "gofer dev IO FD"))
  1288  	}
  1289  
  1290  	if attached {
  1291  		// The gofer is attached to the lifetime of this process, so it
  1292  		// should synchronously die when this process dies.
  1293  		cmd.SysProcAttr.Pdeathsig = unix.SIGKILL
  1294  	}
  1295  
  1296  	// Enter new namespaces to isolate from the rest of the system. Don't unshare
  1297  	// cgroup because gofer is added to a cgroup in the caller's namespace.
  1298  	nss := []specs.LinuxNamespace{
  1299  		{Type: specs.IPCNamespace},
  1300  		{Type: specs.MountNamespace},
  1301  		{Type: specs.NetworkNamespace},
  1302  		{Type: specs.PIDNamespace},
  1303  		{Type: specs.UTSNamespace},
  1304  	}
  1305  
  1306  	rootlessEUID := unix.Geteuid() != 0
  1307  	// Set up any uid/gid mappings, and create or join the configured user
  1308  	// namespace so the gofer's view of the filesystem aligns with the
  1309  	// users in the sandbox.
  1310  	if !rootlessEUID {
  1311  		if userNS, ok := specutils.GetNS(specs.UserNamespace, spec); ok {
  1312  			nss = append(nss, userNS)
  1313  			specutils.SetUIDGIDMappings(cmd, spec)
  1314  			// We need to set UID and GID to have capabilities in a new user namespace.
  1315  			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
  1316  		}
  1317  	} else {
  1318  		userNS, ok := specutils.GetNS(specs.UserNamespace, spec)
  1319  		if !ok {
  1320  			return nil, nil, nil, fmt.Errorf("unable to run a rootless container without userns")
  1321  		}
  1322  		nss = append(nss, userNS)
  1323  		syncFile, err := sandbox.ConfigureCmdForRootless(cmd, &donations)
  1324  		if err != nil {
  1325  			return nil, nil, nil, err
  1326  		}
  1327  		defer syncFile.Close()
  1328  	}
  1329  
  1330  	nvProxySetup, err := nvproxySetupAfterGoferUserns(spec, conf, cmd, &donations)
  1331  	if err != nil {
  1332  		return nil, nil, nil, fmt.Errorf("setting up nvproxy for gofer: %w", err)
  1333  	}
  1334  
  1335  	donations.Transfer(cmd, nextFD)
  1336  
  1337  	// Start the gofer in the given namespace.
  1338  	donation.LogDonations(cmd)
  1339  	log.Debugf("Starting gofer: %s %v", cmd.Path, cmd.Args)
  1340  	if err := specutils.StartInNS(cmd, nss); err != nil {
  1341  		return nil, nil, nil, fmt.Errorf("gofer: %v", err)
  1342  	}
  1343  	log.Infof("Gofer started, PID: %d", cmd.Process.Pid)
  1344  	c.GoferPid = cmd.Process.Pid
  1345  	c.goferIsChild = true
  1346  
  1347  	// Set up and synchronize rootless mode userns mappings.
  1348  	if rootlessEUID {
  1349  		if err := sandbox.SetUserMappings(spec, cmd.Process.Pid); err != nil {
  1350  			return nil, nil, nil, err
  1351  		}
  1352  	}
  1353  
  1354  	// Set up nvproxy within the Gofer namespace.
  1355  	if err := nvProxySetup(); err != nil {
  1356  		return nil, nil, nil, fmt.Errorf("nvproxy setup: %w", err)
  1357  	}
  1358  
  1359  	return sandEnds, devSandEnd, mountsSand, nil
  1360  }
  1361  
  1362  // changeStatus transitions from one status to another ensuring that the
  1363  // transition is valid.
  1364  func (c *Container) changeStatus(s Status) {
  1365  	switch s {
  1366  	case Creating:
  1367  		// Initial state, never transitions to it.
  1368  		panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
  1369  
  1370  	case Created:
  1371  		if c.Status != Creating {
  1372  			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
  1373  		}
  1374  		if c.Sandbox == nil {
  1375  			panic("sandbox cannot be nil")
  1376  		}
  1377  
  1378  	case Paused:
  1379  		if c.Status != Running {
  1380  			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
  1381  		}
  1382  		if c.Sandbox == nil {
  1383  			panic("sandbox cannot be nil")
  1384  		}
  1385  
  1386  	case Running:
  1387  		if c.Status != Created && c.Status != Paused {
  1388  			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
  1389  		}
  1390  		if c.Sandbox == nil {
  1391  			panic("sandbox cannot be nil")
  1392  		}
  1393  
  1394  	case Stopped:
  1395  		// All states can transition to Stopped.
  1396  
  1397  	default:
  1398  		panic(fmt.Sprintf("invalid new state: %v", s))
  1399  	}
  1400  	c.Status = s
  1401  }
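
        // isValidTransitionSketch is an illustrative, side-effect-free restatement of
        // the transition rules enforced by changeStatus above; it is not called by
        // runsc and only makes the valid edges of the status state machine explicit.
        func isValidTransitionSketch(from, to Status) bool {
        	switch to {
        	case Created:
        		return from == Creating
        	case Paused:
        		return from == Running
        	case Running:
        		return from == Created || from == Paused
        	case Stopped:
        		// Every status may transition to Stopped.
        		return true
        	default:
        		// Creating is the initial status and is never transitioned to.
        		return false
        	}
        }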
  1402  
  1403  // IsSandboxRunning returns true if the sandbox exists and is running.
  1404  func (c *Container) IsSandboxRunning() bool {
  1405  	return c.Sandbox != nil && c.Sandbox.IsRunning()
  1406  }
  1407  
  1408  // HasCapabilityInAnySet returns true if the given capability is in any of the
  1409  // capability sets of the container process.
  1410  func (c *Container) HasCapabilityInAnySet(capability linux.Capability) bool {
  1411  	capString := capability.String()
  1412  	for _, set := range [5][]string{
  1413  		c.Spec.Process.Capabilities.Bounding,
  1414  		c.Spec.Process.Capabilities.Effective,
  1415  		c.Spec.Process.Capabilities.Inheritable,
  1416  		c.Spec.Process.Capabilities.Permitted,
  1417  		c.Spec.Process.Capabilities.Ambient,
  1418  	} {
  1419  		for _, c := range set {
  1420  			if c == capString {
  1421  				return true
  1422  			}
  1423  		}
  1424  	}
  1425  	return false
  1426  }
  1427  
  1428  // RunsAsUID0 returns true if the container process runs with UID 0 (root).
  1429  func (c *Container) RunsAsUID0() bool {
  1430  	return c.Spec.Process.User.UID == 0
  1431  }
  1432  
  1433  func (c *Container) requireStatus(action string, statuses ...Status) error {
  1434  	for _, s := range statuses {
  1435  		if c.Status == s {
  1436  			return nil
  1437  		}
  1438  	}
  1439  	return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
  1440  }
  1441  
  1442  // IsSandboxRoot returns true if this container is its sandbox's root container.
  1443  func (c *Container) IsSandboxRoot() bool {
  1444  	return isRoot(c.Spec)
  1445  }
  1446  
  1447  func isRoot(spec *specs.Spec) bool {
  1448  	return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer
  1449  }
  1450  
  1451  // runInCgroup executes fn inside the specified cgroup. If cg is nil, fn is
  1452  // executed in the current context.
  1453  func runInCgroup(cg cgroup.Cgroup, fn func() error) error {
  1454  	if cg == nil {
  1455  		return fn()
  1456  	}
  1457  	restore, err := cg.Join()
  1458  	if err != nil {
  1459  		return err
  1460  	}
  1461  	defer restore()
  1462  	return fn()
  1463  }
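
        // startInCgroupSketch is an illustrative sketch (not called by runsc) of the
        // typical runInCgroup usage pattern: start a command while joined to cg so
        // that the new process is charged to that cgroup. cmd is any not-yet-started
        // *exec.Cmd supplied by the caller.
        func startInCgroupSketch(cg cgroup.Cgroup, cmd *exec.Cmd) error {
        	// cmd.Start has type func() error, so it can be passed directly as fn.
        	return runInCgroup(cg, cmd.Start)
        }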
  1464  
  1465  // adjustGoferOOMScoreAdj sets the oom_score_adj for the container's gofer.
  1466  func (c *Container) adjustGoferOOMScoreAdj() error {
  1467  	if c.GoferPid == 0 || c.Spec.Process.OOMScoreAdj == nil {
  1468  		return nil
  1469  	}
  1470  	return setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj)
  1471  }
  1472  
  1473  // adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox.
  1474  // oom_score_adj is set to the lowest oom_score_adj among the containers
  1475  // running in the sandbox.
  1476  //
  1477  // TODO(gvisor.dev/issue/238): This call could race with other containers being
  1478  // created at the same time and end up setting the wrong oom_score_adj to the
  1479  // sandbox. Use rpc client to synchronize.
  1480  func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, spec *specs.Spec, rootDir string, destroy bool) error {
  1481  	// Adjustment can be skipped if the root container is exiting, because it
  1482  	// brings down the entire sandbox.
  1483  	if isRoot(spec) && destroy {
  1484  		return nil
  1485  	}
  1486  
  1487  	containers, err := LoadSandbox(rootDir, s.ID, LoadOpts{})
  1488  	if err != nil {
  1489  		return fmt.Errorf("loading sandbox containers: %v", err)
  1490  	}
  1491  
  1492  	// Do nothing if the sandbox has been terminated.
  1493  	if len(containers) == 0 {
  1494  		return nil
  1495  	}
  1496  
  1497  	// Get the lowest score for all containers.
  1498  	var lowScore int
  1499  	scoreFound := false
  1500  	for _, container := range containers {
  1501  		// Special multi-container support for CRI. Ignore the root container when
  1502  		// calculating oom_score_adj for the sandbox because it is the
  1503  		// infrastructure (pause) container and always has a very low oom_score_adj.
  1504  		//
  1505  		// We will use OOMScoreAdj in the single-container case where the
  1506  		// containerd container-type annotation is not present.
  1507  		if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox {
  1508  			continue
  1509  		}
  1510  
  1511  		if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
  1512  			scoreFound = true
  1513  			lowScore = *container.Spec.Process.OOMScoreAdj
  1514  		}
  1515  	}
  1516  
  1517  	// If the container is being destroyed and the remaining containers have no
  1518  	// oom_score_adj specified, then revert to the original oom_score_adj saved
  1519  	// with the root container.
  1520  	if !scoreFound && destroy {
  1521  		lowScore = containers[0].Sandbox.OriginalOOMScoreAdj
  1522  		scoreFound = true
  1523  	}
  1524  
  1525  	// Only set oom_score_adj if one of the containers has oom_score_adj set. If
  1526  	// not, oom_score_adj is inherited from the parent process.
  1527  	//
  1528  	// See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process
  1529  	if !scoreFound {
  1530  		return nil
  1531  	}
  1532  
  1533  	// Set the lowest of all containers oom_score_adj to the sandbox.
  1534  	return setOOMScoreAdj(s.Getpid(), lowScore)
  1535  }
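
        // lowestOOMScoreAdjSketch is an illustrative restatement (not called by runsc)
        // of the selection loop in adjustSandboxOOMScoreAdj: it returns the lowest
        // oom_score_adj among non-pause containers and whether any container set one.
        func lowestOOMScoreAdjSketch(containers []*Container) (int, bool) {
        	lowScore, found := 0, false
        	for _, container := range containers {
        		// Skip the CRI infrastructure (pause) container.
        		if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox {
        			continue
        		}
        		if adj := container.Spec.Process.OOMScoreAdj; adj != nil && (!found || *adj < lowScore) {
        			lowScore, found = *adj, true
        		}
        	}
        	return lowScore, found
        }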
  1536  
  1537  // setOOMScoreAdj sets oom_score_adj to the given value for the given PID.
  1538  // /proc must be available and mounted read-write. scoreAdj should be between
  1539  // -1000 and 1000. It's a noop if the process has already exited.
  1540  func setOOMScoreAdj(pid int, scoreAdj int) error {
  1541  	f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644)
  1542  	if err != nil {
  1543  		// Ignore NotExist errors because they can race with process exit.
  1544  		if os.IsNotExist(err) {
  1545  			log.Warningf("Process (%d) not found setting oom_score_adj", pid)
  1546  			return nil
  1547  		}
  1548  		return err
  1549  	}
  1550  	defer f.Close()
  1551  	if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil {
  1552  		if errors.Is(err, unix.ESRCH) {
  1553  			log.Warningf("Process (%d) exited while setting oom_score_adj", pid)
  1554  			return nil
  1555  		}
  1556  		return fmt.Errorf("setting oom_score_adj to %d: %v", scoreAdj, err)
  1557  	}
  1558  	return nil
  1559  }
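
        // readOOMScoreAdjSketch is an illustrative counterpart to setOOMScoreAdj (not
        // called by runsc): it reads the current oom_score_adj for a PID back from
        // procfs, e.g. to verify in a test that a written value took effect.
        func readOOMScoreAdjSketch(pid int) (int, error) {
        	data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid))
        	if err != nil {
        		return 0, err
        	}
        	return strconv.Atoi(strings.TrimSpace(string(data)))
        }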
  1560  
  1561  // populateStats populates event with stats estimates based on cgroups and the
  1562  // sentry's accounting.
  1563  func (c *Container) populateStats(event *boot.EventOut) {
  1564  	// The events command, when run for all running containers, should
  1565  	// account for the full cgroup CPU usage. We split cgroup usage
  1566  	// proportionally according to the sentry-internal usage measurements,
  1567  	// only counting Running containers.
  1568  	log.Debugf("event.ContainerUsage: %v", event.ContainerUsage)
  1569  	numContainers := uint64(len(event.ContainerUsage))
  1570  	if numContainers == 0 {
  1571  		log.Warningf("events: no containers listed in usage, returning zero CPU usage")
  1572  		event.Event.Data.CPU.Usage.Total = 0
  1573  		return
  1574  	}
  1575  
  1576  	var containerUsage uint64
  1577  	var allContainersUsage uint64
  1578  	for ID, usage := range event.ContainerUsage {
  1579  		allContainersUsage += usage
  1580  		if ID == c.ID {
  1581  			containerUsage = usage
  1582  		}
  1583  	}
  1584  
  1585  	cgroup, err := c.Sandbox.NewCGroup()
  1586  	if err != nil {
  1587  		// No cgroup, so rely purely on the sentry's accounting.
  1588  		log.Warningf("events: no cgroups")
  1589  		event.Event.Data.CPU.Usage.Total = containerUsage
  1590  		return
  1591  	}
  1592  
  1593  	// Get the host cgroup CPU usage.
  1594  	cgroupsUsage, err := cgroup.CPUUsage()
  1595  	if err != nil || cgroupsUsage == 0 {
  1596  		// No cgroup usage, so rely purely on the sentry's accounting.
  1597  		log.Warningf("events: failed when getting cgroup CPU usage for container: usage=%d, err: %v", cgroupsUsage, err)
  1598  		event.Event.Data.CPU.Usage.Total = containerUsage
  1599  		return
  1600  	}
  1601  
  1602  	// If the sentry reports no CPU usage, fall back on cgroups and split usage
  1603  	// equally across containers.
  1604  	if allContainersUsage == 0 {
  1605  		log.Warningf("events: no sentry CPU usage reported")
  1606  		allContainersUsage = cgroupsUsage
  1607  		containerUsage = cgroupsUsage / numContainers
  1608  	}
  1609  
  1610  	// Scaling can easily overflow a uint64 (e.g. a containerUsage and
  1611  	// cgroupsUsage of 16 seconds each will overflow), so use floats.
  1612  	total := float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage))
  1613  	log.Debugf("Usage, container: %d, cgroups: %d, all: %d, total: %.0f", containerUsage, cgroupsUsage, allContainersUsage, total)
  1614  	event.Event.Data.CPU.Usage.Total = uint64(total)
  1615  	return
  1616  }
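
        // proportionalCPUSketch is an illustrative restatement (not called by runsc)
        // of the scaling at the end of populateStats: a container's share of the host
        // cgroup usage is its fraction of the sentry-measured usage, computed in
        // floating point because multiplying two large uint64 counters can overflow.
        func proportionalCPUSketch(containerUsage, allContainersUsage, cgroupsUsage uint64) uint64 {
        	if allContainersUsage == 0 {
        		return 0
        	}
        	return uint64(float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage)))
        }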
  1617  
  1618  // setupCgroupForRoot configures and returns the cgroups for the sandbox and
  1619  // the root container. If `cgroupParentAnnotation` is set, that path is used as
  1620  // the sandbox cgroup and Spec.Linux.CgroupsPath as the root container cgroup.
  1621  func (c *Container) setupCgroupForRoot(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, cgroup.Cgroup, error) {
  1622  	var parentCgroup cgroup.Cgroup
  1623  	if parentPath, ok := spec.Annotations[cgroupParentAnnotation]; ok {
  1624  		var err error
  1625  		parentCgroup, err = cgroup.NewFromPath(parentPath, conf.SystemdCgroup)
  1626  		if err != nil {
  1627  			return nil, nil, err
  1628  		}
  1629  	} else {
  1630  		var err error
  1631  		parentCgroup, err = cgroup.NewFromSpec(spec, conf.SystemdCgroup)
  1632  		if parentCgroup == nil || err != nil {
  1633  			return nil, nil, err
  1634  		}
  1635  	}
  1636  
  1637  	var err error
  1638  	parentCgroup, err = cgroupInstall(conf, parentCgroup, spec.Linux.Resources)
  1639  	if parentCgroup == nil || err != nil {
  1640  		return nil, nil, err
  1641  	}
  1642  
  1643  	subCgroup, err := c.setupCgroupForSubcontainer(conf, spec)
  1644  	if err != nil {
  1645  		_ = parentCgroup.Uninstall()
  1646  		return nil, nil, err
  1647  	}
  1648  	return parentCgroup, subCgroup, nil
  1649  }
  1650  
  1651  // setupCgroupForSubcontainer sets up empty cgroups for subcontainers. Since
  1652  // subcontainers run exclusively inside the sandbox, subcontainer cgroups on the
  1653  // host have no effect on them. However, some tools (e.g. cAdvisor) use cgroup
  1654  // paths to discover new containers and report stats for them.
  1655  func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, error) {
  1656  	if isRoot(spec) {
  1657  		if _, ok := spec.Annotations[cgroupParentAnnotation]; !ok {
  1658  			return nil, nil
  1659  		}
  1660  	}
  1661  
  1662  	cg, err := cgroup.NewFromSpec(spec, conf.SystemdCgroup)
  1663  	if cg == nil || err != nil {
  1664  		return nil, err
  1665  	}
  1666  	// Use empty resources, just want the directory structure created.
  1667  	return cgroupInstall(conf, cg, &specs.LinuxResources{})
  1668  }
  1669  
  1670  // donateGoferProfileFDs will open profile files and donate their FDs to the
  1671  // gofer.
  1672  func (c *Container) donateGoferProfileFDs(conf *config.Config, donations *donation.Agency) error {
  1673  	// The gofer profile files are named based on the provided flag, but
  1674  	// suffixed with "gofer" and the container ID to avoid collisions with
  1675  	// sentry profile files or profile files from other gofers.
  1676  	//
  1677  	// TODO(b/243183772): Merge gofer profile data with sentry profile data
  1678  	// into a single file.
  1679  	profSuffix := ".gofer." + c.ID
  1680  	const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC
  1681  	if conf.ProfileBlock != "" {
  1682  		if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock+profSuffix, profFlags); err != nil {
  1683  			return err
  1684  		}
  1685  	}
  1686  	if conf.ProfileCPU != "" {
  1687  		if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU+profSuffix, profFlags); err != nil {
  1688  			return err
  1689  		}
  1690  	}
  1691  	if conf.ProfileHeap != "" {
  1692  		if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap+profSuffix, profFlags); err != nil {
  1693  			return err
  1694  		}
  1695  	}
  1696  	if conf.ProfileMutex != "" {
  1697  		if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex+profSuffix, profFlags); err != nil {
  1698  			return err
  1699  		}
  1700  	}
  1701  	if conf.TraceFile != "" {
  1702  		if err := donations.OpenAndDonate("trace-fd", conf.TraceFile+profSuffix, profFlags); err != nil {
  1703  			return err
  1704  		}
  1705  	}
  1706  	return nil
  1707  }
  1708  
  1709  // cgroupInstall creates the cgroup directory structure and sets its
  1710  // resources. On success, it returns the cgroup instance and a nil error.
  1711  // For rootless containers, cgroup operations may fail; in that case the
  1712  // error is suppressed and a nil cgroup instance is returned to indicate that
  1713  // no cgroup was configured.
  1714  func cgroupInstall(conf *config.Config, cg cgroup.Cgroup, res *specs.LinuxResources) (cgroup.Cgroup, error) {
  1715  	if err := cg.Install(res); err != nil {
  1716  		switch {
  1717  		case (errors.Is(err, unix.EACCES) || errors.Is(err, unix.EROFS)) && conf.Rootless:
  1718  			log.Warningf("Skipping cgroup configuration in rootless mode: %v", err)
  1719  			return nil, nil
  1720  		default:
  1721  			return nil, fmt.Errorf("configuring cgroup: %v", err)
  1722  		}
  1723  	}
  1724  	return cg, nil
  1725  }
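
        // A caller of cgroupInstall must handle all three outcomes, as in this sketch
        // (mirroring setupCgroupForRoot and setupCgroupForSubcontainer above):
        //
        //	cg, err := cgroupInstall(conf, cg, res)
        //	switch {
        //	case err != nil:
        //		return err // Cgroup setup genuinely failed.
        //	case cg == nil:
        //		// Rootless mode: cgroups were skipped; continue without them.
        //	default:
        //		// Cgroup was configured; Uninstall it during cleanup.
        //	}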
  1726  
  1727  func modifySpecForDirectfs(conf *config.Config, spec *specs.Spec) error {
  1728  	if !conf.DirectFS || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
  1729  		return nil
  1730  	}
  1731  	if conf.Network == config.NetworkHost {
  1732  		// The hostnet feature requires the sandbox to run in the current user
  1733  		// namespace, in which the network namespace is configured.
  1734  		return nil
  1735  	}
  1736  	if _, ok := specutils.GetNS(specs.UserNamespace, spec); ok {
  1737  		// If the spec already defines a userns, use that.
  1738  		return nil
  1739  	}
  1740  	if spec.Linux == nil {
  1741  		spec.Linux = &specs.Linux{}
  1742  	}
  1743  	if len(spec.Linux.UIDMappings) > 0 || len(spec.Linux.GIDMappings) > 0 {
  1744  		// The spec can only define UID/GID mappings with a userns (checked above).
  1745  		return fmt.Errorf("spec defines UID/GID mappings without defining userns")
  1746  	}
  1747  	// Run the sandbox in a new user namespace with identity UID/GID mappings.
  1748  	log.Debugf("Configuring container with a new userns with identity user mappings into current userns")
  1749  	spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{Type: specs.UserNamespace})
  1750  	uidMappings, err := getIdentityMapping("uid_map")
  1751  	if err != nil {
  1752  		return err
  1753  	}
  1754  	spec.Linux.UIDMappings = uidMappings
  1755  	logIDMappings(uidMappings, "UID")
  1756  	gidMappings, err := getIdentityMapping("gid_map")
  1757  	if err != nil {
  1758  		return err
  1759  	}
  1760  	spec.Linux.GIDMappings = gidMappings
  1761  	logIDMappings(gidMappings, "GID")
  1762  	return nil
  1763  }
  1764  
  1765  func getIdentityMapping(mapFileName string) ([]specs.LinuxIDMapping, error) {
  1766  	// See user_namespaces(7) to understand how /proc/self/{uid/gid}_map files
  1767  	// are organized.
  1768  	mapFile := path.Join("/proc/self", mapFileName)
  1769  	file, err := os.Open(mapFile)
  1770  	if err != nil {
  1771  		return nil, fmt.Errorf("failed to open %s: %v", mapFile, err)
  1772  	}
  1773  	defer file.Close()
  1774  
  1775  	var mappings []specs.LinuxIDMapping
  1776  	scanner := bufio.NewScanner(file)
  1777  	for scanner.Scan() {
  1778  		line := scanner.Text()
  1779  		var myStart, parentStart, rangeLen uint32
  1780  		numParsed, err := fmt.Sscanf(line, "%d %d %d", &myStart, &parentStart, &rangeLen)
  1781  		if err != nil {
  1782  			return nil, fmt.Errorf("failed to parse line %q in file %s: %v", line, mapFile, err)
  1783  		}
  1784  		if numParsed != 3 {
  1785  			return nil, fmt.Errorf("failed to parse 3 integers from line %q in file %s", line, mapFile)
  1786  		}
  1787  		// Create an identity mapping with the current userns.
  1788  		mappings = append(mappings, specs.LinuxIDMapping{
  1789  			ContainerID: myStart,
  1790  			HostID:      myStart,
  1791  			Size:        rangeLen,
  1792  		})
  1793  	}
  1794  	if err := scanner.Err(); err != nil {
  1795  		return nil, fmt.Errorf("failed to scan file %s: %v", mapFile, err)
  1796  	}
  1797  	return mappings, nil
  1798  }
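
        // formatIDMapSketch is an illustrative helper (not called by runsc): it renders
        // mappings back into the three-column "ID-inside-ns ID-outside-ns length"
        // layout used by /proc/<pid>/{uid,gid}_map, i.e. the inverse of the parsing
        // done by getIdentityMapping above.
        func formatIDMapSketch(mappings []specs.LinuxIDMapping) string {
        	var sb strings.Builder
        	for _, m := range mappings {
        		fmt.Fprintf(&sb, "%d %d %d\n", m.ContainerID, m.HostID, m.Size)
        	}
        	return sb.String()
        }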
  1799  
  1800  func logIDMappings(mappings []specs.LinuxIDMapping, idType string) {
  1801  	if !log.IsLogging(log.Debug) {
  1802  		return
  1803  	}
  1804  	log.Debugf("%s Mappings:", idType)
  1805  	for _, m := range mappings {
  1806  		log.Debugf("\tContainer ID: %d, Host ID: %d, Range Length: %d", m.ContainerID, m.HostID, m.Size)
  1807  	}
  1808  }
  1809  
  1810  // nvProxyPreGoferHostSetup does host setup work so that `nvidia-container-cli
  1811  // configure` can be run in the future. It runs before any Gofers start.
  1812  // It verifies that all the required dependencies are in place, loads kernel
  1813  // modules, and ensures the correct device files exist and are accessible.
  1814  // This should only be necessary once on the host. It should be run during the
  1815  // root container setup sequence to make sure it has run at least once.
  1816  func nvProxyPreGoferHostSetup(spec *specs.Spec, conf *config.Config) error {
  1817  	if !specutils.GPUFunctionalityRequestedViaHook(spec, conf) {
  1818  		return nil
  1819  	}
  1820  
  1821  	// Locate binaries. For security reasons, unlike
  1822  	// nvidia-container-runtime-hook, we don't add the container's filesystem
  1823  	// to the search path. We also don't support
  1824  	// /etc/nvidia-container-runtime/config.toml to avoid importing a TOML
  1825  	// parser.
  1826  	cliPath, err := exec.LookPath("nvidia-container-cli")
  1827  	if err != nil {
  1828  		return fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err)
  1829  	}
  1830  
  1831  	// nvidia-container-cli --load-kmods seems to be a noop; load kernel modules ourselves.
  1832  	nvproxyLoadKernelModules()
  1833  
  1834  	if _, err := os.Stat("/dev/nvidiactl"); err != nil {
  1835  		if !os.IsNotExist(err) {
  1836  			return fmt.Errorf("stat(2) for /dev/nvidiactl failed: %w", err)
  1837  		}
  1838  
  1839  		// Run `nvidia-container-cli info`.
  1840  		// This has the side-effect of automatically creating GPU device files.
  1841  		argv := []string{cliPath, "--load-kmods", "info"}
  1842  		log.Debugf("Executing %q", argv)
  1843  		var infoOut, infoErr strings.Builder
  1844  		cmd := exec.Cmd{
  1845  			Path:   argv[0],
  1846  			Args:   argv,
  1847  			Env:    os.Environ(),
  1848  			Stdout: &infoOut,
  1849  			Stderr: &infoErr,
  1850  		}
  1851  		if err := cmd.Run(); err != nil {
  1852  			return fmt.Errorf("nvidia-container-cli info failed, err: %v\nstdout: %s\nstderr: %s", err, infoOut.String(), infoErr.String())
  1853  		}
  1854  		log.Debugf("nvidia-container-cli info: %v", infoOut.String())
  1855  	}
  1856  
  1857  	return nil
  1858  }
  1859  
  1860  // nvproxyLoadKernelModules loads NVIDIA-related kernel modules with modprobe.
  1861  func nvproxyLoadKernelModules() {
  1862  	for _, mod := range [...]string{
  1863  		"nvidia",
  1864  		"nvidia-uvm",
  1865  	} {
  1866  		argv := []string{
  1867  			"/sbin/modprobe",
  1868  			mod,
  1869  		}
  1870  		log.Debugf("Executing %q", argv)
  1871  		var stdout, stderr strings.Builder
  1872  		cmd := exec.Cmd{
  1873  			Path:   argv[0],
  1874  			Args:   argv,
  1875  			Env:    os.Environ(),
  1876  			Stdout: &stdout,
  1877  			Stderr: &stderr,
  1878  		}
  1879  		if err := cmd.Run(); err != nil {
  1880  			// This might not be fatal since modules may already be loaded. Log
  1881  			// the failure but continue.
  1882  			log.Warningf("modprobe %s failed, err: %v\nstdout: %s\nstderr: %s", mod, err, stdout.String(), stderr.String())
  1883  		}
  1884  	}
  1885  }
  1886  
  1887  // nvproxySetupAfterGoferUserns runs `nvidia-container-cli configure`.
  1888  // This sets up the container filesystem with bind mounts that allow it to
  1889  // use NVIDIA devices.
  1890  //
  1891  // This should be called during the Gofer setup process, as the bind mounts
  1892  // are created in the Gofer's mount namespace.
  1893  // If successful, it returns a callback function that must be called once the
  1894  // Gofer process has started.
  1895  // This function has no effect if nvproxy functionality is not requested.
  1896  //
  1897  // This function essentially replicates
  1898  // nvidia-container-toolkit:cmd/nvidia-container-runtime-hook, i.e. the
  1899  // binary that executeHook() is hard-coded to skip, with differences noted
  1900  // inline. We do this rather than move the prestart hook because the
  1901  // "runtime environment" in which prestart hooks execute is vaguely
  1902  // defined, such that nvidia-container-runtime-hook and existing runsc
  1903  // hooks differ in their expected environment.
  1904  //
  1905  // Note that nvidia-container-cli will set up files in /dev and /proc which
  1906  // are useless, since they will be hidden by sentry devtmpfs and procfs
  1907  // respectively (and some device files will have the wrong device numbers
  1908  // from the application's perspective since nvproxy may register device
  1909  // numbers in sentry VFS that differ from those on the host, e.g. for
  1910  // nvidia-uvm). These files are separately created during sandbox VFS
  1911  // construction. For this reason, we don't need to parse
  1912  // NVIDIA_VISIBLE_DEVICES or pass --device to nvidia-container-cli.
  1913  func nvproxySetupAfterGoferUserns(spec *specs.Spec, conf *config.Config, goferCmd *exec.Cmd, goferDonations *donation.Agency) (func() error, error) {
  1914  	if !specutils.GPUFunctionalityRequestedViaHook(spec, conf) {
  1915  		return func() error { return nil }, nil
  1916  	}
  1917  
  1918  	if spec.Root == nil {
  1919  		return nil, fmt.Errorf("spec missing root filesystem")
  1920  	}
  1921  
  1922  	// nvidia-container-cli does not create this directory.
  1923  	if err := os.MkdirAll(path.Join(spec.Root.Path, "proc", "driver", "nvidia"), 0555); err != nil {
  1924  		return nil, fmt.Errorf("failed to create /proc/driver/nvidia in app filesystem: %w", err)
  1925  	}
  1926  
  1927  	cliPath, err := exec.LookPath("nvidia-container-cli")
  1928  	if err != nil {
  1929  		return nil, fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err)
  1930  	}
  1931  
  1932  	// On Ubuntu, ldconfig is a wrapper around ldconfig.real, and we need the latter.
  1933  	var ldconfigPath string
  1934  	if _, err := os.Stat("/sbin/ldconfig.real"); err == nil {
  1935  		ldconfigPath = "/sbin/ldconfig.real"
  1936  	} else {
  1937  		ldconfigPath = "/sbin/ldconfig"
  1938  	}
  1939  
  1940  	devices, err := specutils.ParseNvidiaVisibleDevices(spec)
  1941  	if err != nil {
  1942  		return nil, fmt.Errorf("failed to get nvidia device numbers: %w", err)
  1943  	}
  1944  
  1945  	// Create synchronization FD for nvproxy.
  1946  	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
  1947  	if err != nil {
  1948  		return nil, err
  1949  	}
  1950  	ourEnd := os.NewFile(uintptr(fds[0]), "nvproxy sync runsc FD")
  1951  	goferEnd := os.NewFile(uintptr(fds[1]), "nvproxy sync gofer FD")
  1952  	goferDonations.DonateAndClose("sync-nvproxy-fd", goferEnd)
  1953  
  1954  	return func() error {
  1955  		defer ourEnd.Close()
  1956  		argv := []string{
  1957  			cliPath,
  1958  			"--load-kmods",
  1959  			"configure",
  1960  			fmt.Sprintf("--ldconfig=@%s", ldconfigPath),
  1961  			"--no-cgroups", // runsc doesn't configure device cgroups yet
  1962  			"--utility",
  1963  			"--compute",
  1964  			fmt.Sprintf("--pid=%d", goferCmd.Process.Pid),
  1965  			fmt.Sprintf("--device=%s", devices),
  1966  			spec.Root.Path,
  1967  		}
  1968  		log.Debugf("Executing %q", argv)
  1969  		var stdout, stderr strings.Builder
  1970  		cmd := exec.Cmd{
  1971  			Path:   argv[0],
  1972  			Args:   argv,
  1973  			Env:    os.Environ(),
  1974  			Stdout: &stdout,
  1975  			Stderr: &stderr,
  1976  		}
  1977  		if err := cmd.Run(); err != nil {
  1978  			return fmt.Errorf("nvidia-container-cli configure failed, err: %v\nstdout: %s\nstderr: %s", err, stdout.String(), stderr.String())
  1979  		}
  1980  		return nil
  1981  	}, nil
  1982  }
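
        // The two-phase contract above is visible in the gofer setup path earlier in
        // this file (sketch): the returned callback is only invoked once the gofer
        // process has started, so that its PID can be passed to nvidia-container-cli.
        //
        //	nvProxySetup, err := nvproxySetupAfterGoferUserns(spec, conf, cmd, &donations)
        //	// ... handle err, then start cmd in its namespaces ...
        //	if err := nvProxySetup(); err != nil {
        //		// handle error
        //	}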
  1983  
  1984  // CheckStopped checks if the container is stopped and updates its status.
  1985  func (c *Container) CheckStopped() {
  1986  	if state, err := c.Sandbox.ContainerRuntimeState(c.ID); err != nil {
  1987  		log.Warningf("Cannot determine if container %v exists, checking if sandbox %v is running, err: %v", c.ID, c.Sandbox.ID, err)
  1988  		if !c.IsSandboxRunning() {
  1989  			log.Warningf("Sandbox isn't running anymore, marking container %v as stopped", c.ID)
  1990  			c.changeStatus(Stopped)
  1991  		}
  1992  	} else {
  1993  		if state == boot.RuntimeStateStopped {
  1994  			log.Warningf("Container %v is stopped", c.ID)
  1995  			c.changeStatus(Stopped)
  1996  		}
  1997  	}
  1998  }