gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/boot/controller.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"errors"
    19  	"fmt"
    20  	"path"
    21  	"strconv"
    22  	"sync"
    23  	gtime "time"
    24  
    25  	specs "github.com/opencontainers/runtime-spec/specs-go"
    26  	"golang.org/x/sys/unix"
    27  	"gvisor.dev/gvisor/pkg/cleanup"
    28  	"gvisor.dev/gvisor/pkg/context"
    29  	"gvisor.dev/gvisor/pkg/control/server"
    30  	"gvisor.dev/gvisor/pkg/fd"
    31  	"gvisor.dev/gvisor/pkg/fspath"
    32  	"gvisor.dev/gvisor/pkg/log"
    33  	"gvisor.dev/gvisor/pkg/sentry/control"
    34  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs"
    35  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    36  	"gvisor.dev/gvisor/pkg/sentry/seccheck"
    37  	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
    38  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    39  	"gvisor.dev/gvisor/pkg/state/statefile"
    40  	"gvisor.dev/gvisor/pkg/urpc"
    41  	"gvisor.dev/gvisor/runsc/boot/procfs"
    42  	"gvisor.dev/gvisor/runsc/config"
    43  	"gvisor.dev/gvisor/runsc/specutils"
    44  )
    45  
const (
	// ContMgrCheckpoint checkpoints a container.
	ContMgrCheckpoint = "containerManager.Checkpoint"

	// ContMgrCreateSubcontainer creates a sub-container.
	ContMgrCreateSubcontainer = "containerManager.CreateSubcontainer"

	// ContMgrDestroySubcontainer is used to stop a sub-container and free all
	// associated resources in the sandbox.
	ContMgrDestroySubcontainer = "containerManager.DestroySubcontainer"

	// ContMgrEvent gets stats about the container used by "runsc events".
	ContMgrEvent = "containerManager.Event"

	// ContMgrExecuteAsync executes a command in a container.
	ContMgrExecuteAsync = "containerManager.ExecuteAsync"

	// ContMgrPortForward starts port forwarding with the sandbox.
	ContMgrPortForward = "containerManager.PortForward"

	// ContMgrProcesses lists processes running in a container.
	ContMgrProcesses = "containerManager.Processes"

	// ContMgrRestore restores a container from a statefile.
	ContMgrRestore = "containerManager.Restore"

	// ContMgrRestoreSubcontainer restores a sub-container; it is only accepted
	// while the sandbox is being restored.
	ContMgrRestoreSubcontainer = "containerManager.RestoreSubcontainer"

	// ContMgrSignal sends a signal to a container.
	ContMgrSignal = "containerManager.Signal"

	// ContMgrStartSubcontainer starts a sub-container inside a running sandbox.
	ContMgrStartSubcontainer = "containerManager.StartSubcontainer"

	// ContMgrWait waits on the init process of the container and returns its
	// ExitStatus.
	ContMgrWait = "containerManager.Wait"

	// ContMgrWaitPID waits on a process with a certain PID in the sandbox and
	// returns its ExitStatus.
	ContMgrWaitPID = "containerManager.WaitPID"

	// ContMgrRootContainerStart starts a new sandbox with a root container.
	ContMgrRootContainerStart = "containerManager.StartRoot"

	// ContMgrCreateTraceSession starts a trace session.
	ContMgrCreateTraceSession = "containerManager.CreateTraceSession"

	// ContMgrDeleteTraceSession deletes a trace session.
	ContMgrDeleteTraceSession = "containerManager.DeleteTraceSession"

	// ContMgrListTraceSessions lists the existing trace sessions.
	ContMgrListTraceSessions = "containerManager.ListTraceSessions"

	// ContMgrProcfsDump dumps sandbox procfs state.
	ContMgrProcfsDump = "containerManager.ProcfsDump"

	// ContMgrMount mounts a filesystem in a container.
	ContMgrMount = "containerManager.Mount"

	// ContMgrContainerRuntimeState returns the runtime state of a container.
	ContMgrContainerRuntimeState = "containerManager.ContainerRuntimeState"
)
   110  
// Network and debug related commands. Like the containerManager commands,
// each value has the form "<receiver>.<method>"; the receivers are registered
// on the control server in newController.
const (
	// NetworkCreateLinksAndRoutes creates links and routes in a network stack.
	NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes"

	// DebugStacks collects sandbox stacks for debugging.
	DebugStacks = "debug.Stacks"
)

// Profiling related commands (see pprof.go for more details). The Profile
// receiver is only registered by newController when profiling is enabled in
// the root container's config.
const (
	ProfileCPU   = "Profile.CPU"
	ProfileHeap  = "Profile.Heap"
	ProfileBlock = "Profile.Block"
	ProfileMutex = "Profile.Mutex"
	ProfileTrace = "Profile.Trace"
)

// Logging related commands (see logging.go for more details).
const (
	LoggingChange = "Logging.Change"
)

// Lifecycle related commands (see lifecycle.go for more details).
const (
	LifecyclePause  = "Lifecycle.Pause"
	LifecycleResume = "Lifecycle.Resume"
)

// Usage related commands (see usage.go for more details).
const (
	UsageCollect = "Usage.Collect"
	UsageUsageFD = "Usage.UsageFD"
)

// Metrics related commands (see metrics.go).
const (
	MetricsGetRegistered = "Metrics.GetRegisteredMetrics"
	MetricsExport        = "Metrics.Export"
)

// Commands for interacting with cgroupfs within the sandbox.
const (
	CgroupsReadControlFiles  = "Cgroups.ReadControlFiles"
	CgroupsWriteControlFiles = "Cgroups.WriteControlFiles"
)
   156  
// controller holds the control server, and is used for communication into the
// sandbox.
type controller struct {
	// srv is the control server. newController creates it and registers the
	// RPC receivers on it, but does not start serving.
	srv *server.Server

	// manager holds the containerManager methods. It is the first receiver
	// registered on srv.
	manager *containerManager
}
   166  
   167  // newController creates a new controller. The caller must call
   168  // controller.srv.StartServing() to start the controller.
   169  func newController(fd int, l *Loader) (*controller, error) {
   170  	srv, err := server.CreateFromFD(fd)
   171  	if err != nil {
   172  		return nil, err
   173  	}
   174  
   175  	ctrl := &controller{
   176  		manager: &containerManager{
   177  			startChan:       make(chan struct{}),
   178  			startResultChan: make(chan error),
   179  			l:               l,
   180  		},
   181  		srv: srv,
   182  	}
   183  	ctrl.srv.Register(ctrl.manager)
   184  	ctrl.srv.Register(&control.Cgroups{Kernel: l.k})
   185  	ctrl.srv.Register(&control.Lifecycle{Kernel: l.k})
   186  	ctrl.srv.Register(&control.Logging{})
   187  	ctrl.srv.Register(&control.Proc{Kernel: l.k})
   188  	ctrl.srv.Register(&control.State{Kernel: l.k})
   189  	ctrl.srv.Register(&control.Usage{Kernel: l.k})
   190  	ctrl.srv.Register(&control.Metrics{})
   191  	ctrl.srv.Register(&debug{})
   192  
   193  	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
   194  		ctrl.srv.Register(&Network{
   195  			Stack:  eps.Stack,
   196  			Kernel: l.k,
   197  		})
   198  	}
   199  	if l.root.conf.ProfileEnable {
   200  		ctrl.srv.Register(control.NewProfile(l.k))
   201  	}
   202  	return ctrl, nil
   203  }
   204  
// stopRPCTimeout is the time for clients to complete ongoing RPCs.
const stopRPCTimeout = 15 * gtime.Second

// stop stops the control server, passing stopRPCTimeout as the timeout given
// to srv.Stop.
func (c *controller) stop() {
	c.srv.Stop(stopRPCTimeout)
}
   211  
// containerManager manages sandbox containers.
type containerManager struct {
	// startChan is used to signal when the root container process should
	// be started.
	startChan chan struct{}

	// startResultChan is used to signal when the root container has
	// started. Any errors encountered during startup will be sent to the
	// channel. A nil value indicates success.
	startResultChan chan error

	// l is the loader that creates containers and sandboxes.
	l *Loader

	// restorer is set when the sandbox is being restored. It stores the state
	// of all containers and performs all actions required by restore. It is
	// set by Restore and reset to nil by onRestoreDone.
	restorer *restorer
}
   230  
// StartRoot will start the root container process. cid is only used for
// logging; the call returns whatever onStart returns.
func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error {
	log.Debugf("containerManager.StartRoot, cid: %s", *cid)
	// Tell the root container to start and wait for the result.
	return cm.onStart()
}
   237  
   238  // onStart notifies that sandbox is ready to start and wait for the result.
   239  func (cm *containerManager) onStart() error {
   240  	cm.startChan <- struct{}{}
   241  	if err := <-cm.startResultChan; err != nil {
   242  		return fmt.Errorf("starting sandbox: %v", err)
   243  	}
   244  	return nil
   245  }
   246  
// Processes retrieves information about processes running in the sandbox.
// It delegates to control.Processes with the loader's kernel and the given
// container ID; results are written to out.
func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error {
	log.Debugf("containerManager.Processes, cid: %s", *cid)
	return control.Processes(cm.l.k, *cid, out)
}
   252  
// CreateArgs contains arguments to the CreateSubcontainer method.
type CreateArgs struct {
	// CID is the ID of the container to create.
	CID string

	// FilePayload may contain a TTY file for the terminal, if enabled.
	// CreateSubcontainer rejects payloads with more than one file.
	urpc.FilePayload
}
   261  
   262  // CreateSubcontainer creates a container within a sandbox.
   263  func (cm *containerManager) CreateSubcontainer(args *CreateArgs, _ *struct{}) error {
   264  	log.Debugf("containerManager.CreateSubcontainer: %s", args.CID)
   265  
   266  	if len(args.Files) > 1 {
   267  		return fmt.Errorf("start arguments must have at most 1 files for TTY")
   268  	}
   269  	var tty *fd.FD
   270  	if len(args.Files) == 1 {
   271  		var err error
   272  		tty, err = fd.NewFromFile(args.Files[0])
   273  		if err != nil {
   274  			return fmt.Errorf("error dup'ing TTY file: %w", err)
   275  		}
   276  	}
   277  	return cm.l.createSubcontainer(args.CID, tty)
   278  }
   279  
// StartArgs contains arguments to the StartSubcontainer method. The same
// type is also accepted by RestoreSubcontainer.
type StartArgs struct {
	// Spec is the spec of the container to start.
	Spec *specs.Spec

	// Conf is the runsc-specific configuration for the sandbox.
	Conf *config.Config

	// CID is the ID of the container to start.
	CID string

	// NumGoferFilestoreFDs is the number of gofer filestore FDs donated.
	NumGoferFilestoreFDs int

	// IsDevIoFilePresent indicates whether the dev gofer FD is present.
	IsDevIoFilePresent bool

	// GoferMountConfs contains information about how the gofer mounts have been
	// configured. The first entry is for rootfs and the following entries are
	// for bind mounts in Spec.Mounts (in the same order).
	GoferMountConfs []GoferMountConf

	// FilePayload contains, in order:
	//   * stdin, stdout, and stderr (present only when the terminal is
	//     disabled in Spec.Process).
	//   * NumGoferFilestoreFDs file descriptors to gofer-backing host files
	//     (optional).
	//   * file descriptor for /dev gofer connection (present only when
	//     IsDevIoFilePresent is set).
	//   * file descriptors to connect to gofer to serve the root filesystem.
	urpc.FilePayload
}
   309  
   310  // StartSubcontainer runs a created container within a sandbox.
   311  func (cm *containerManager) StartSubcontainer(args *StartArgs, _ *struct{}) error {
   312  	// Validate arguments.
   313  	if args == nil {
   314  		return errors.New("start missing arguments")
   315  	}
   316  	log.Debugf("containerManager.StartSubcontainer, cid: %s, args: %+v", args.CID, args)
   317  	if args.Spec == nil {
   318  		return errors.New("start arguments missing spec")
   319  	}
   320  	if args.Conf == nil {
   321  		return errors.New("start arguments missing config")
   322  	}
   323  	if args.CID == "" {
   324  		return errors.New("start argument missing container ID")
   325  	}
   326  	expectedFDs := 1 // At least one FD for the root filesystem.
   327  	expectedFDs += args.NumGoferFilestoreFDs
   328  	if args.IsDevIoFilePresent {
   329  		expectedFDs++
   330  	}
   331  	if !args.Spec.Process.Terminal {
   332  		expectedFDs += 3
   333  	}
   334  	if len(args.Files) < expectedFDs {
   335  		return fmt.Errorf("start arguments must contain at least %d FDs, but only got %d", expectedFDs, len(args.Files))
   336  	}
   337  
   338  	// All validation passed, logs the spec for debugging.
   339  	specutils.LogSpecDebug(args.Spec, args.Conf.OCISeccomp)
   340  
   341  	goferFiles := args.Files
   342  	var stdios []*fd.FD
   343  	if !args.Spec.Process.Terminal {
   344  		// When not using a terminal, stdios come as the first 3 files in the
   345  		// payload.
   346  		var err error
   347  		stdios, err = fd.NewFromFiles(goferFiles[:3])
   348  		if err != nil {
   349  			return fmt.Errorf("error dup'ing stdio files: %w", err)
   350  		}
   351  		goferFiles = goferFiles[3:]
   352  	}
   353  	defer func() {
   354  		for _, fd := range stdios {
   355  			_ = fd.Close()
   356  		}
   357  	}()
   358  
   359  	var goferFilestoreFDs []*fd.FD
   360  	for i := 0; i < args.NumGoferFilestoreFDs; i++ {
   361  		goferFilestoreFD, err := fd.NewFromFile(goferFiles[i])
   362  		if err != nil {
   363  			return fmt.Errorf("error dup'ing gofer filestore file: %w", err)
   364  		}
   365  		goferFilestoreFDs = append(goferFilestoreFDs, goferFilestoreFD)
   366  	}
   367  	goferFiles = goferFiles[args.NumGoferFilestoreFDs:]
   368  	defer func() {
   369  		for _, fd := range goferFilestoreFDs {
   370  			_ = fd.Close()
   371  		}
   372  	}()
   373  
   374  	var devGoferFD *fd.FD
   375  	if args.IsDevIoFilePresent {
   376  		var err error
   377  		devGoferFD, err = fd.NewFromFile(goferFiles[0])
   378  		if err != nil {
   379  			return fmt.Errorf("error dup'ing dev gofer file: %w", err)
   380  		}
   381  		goferFiles = goferFiles[1:]
   382  		defer devGoferFD.Close()
   383  	}
   384  
   385  	goferFDs, err := fd.NewFromFiles(goferFiles)
   386  	if err != nil {
   387  		return fmt.Errorf("error dup'ing gofer files: %w", err)
   388  	}
   389  	defer func() {
   390  		for _, fd := range goferFDs {
   391  			_ = fd.Close()
   392  		}
   393  	}()
   394  
   395  	if err := cm.l.startSubcontainer(args.Spec, args.Conf, args.CID, stdios, goferFDs, goferFilestoreFDs, devGoferFD, args.GoferMountConfs); err != nil {
   396  		log.Debugf("containerManager.StartSubcontainer failed, cid: %s, args: %+v, err: %v", args.CID, args, err)
   397  		return err
   398  	}
   399  	log.Debugf("Container started, cid: %s", args.CID)
   400  	return nil
   401  }
   402  
// DestroySubcontainer stops a container if it is still running and cleans up
// its filesystem. The work is delegated to the loader's destroySubcontainer
// for the given container ID.
func (cm *containerManager) DestroySubcontainer(cid *string, _ *struct{}) error {
	log.Debugf("containerManager.DestroySubcontainer, cid: %s", *cid)
	return cm.l.destroySubcontainer(*cid)
}
   409  
   410  // ExecuteAsync starts running a command on a created or running sandbox. It
   411  // returns the PID of the new process.
   412  func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error {
   413  	log.Debugf("containerManager.ExecuteAsync, cid: %s, args: %+v", args.ContainerID, args)
   414  	tgid, err := cm.l.executeAsync(args)
   415  	if err != nil {
   416  		log.Debugf("containerManager.ExecuteAsync failed, cid: %s, args: %+v, err: %v", args.ContainerID, args, err)
   417  		return err
   418  	}
   419  	*pid = int32(tgid)
   420  	return nil
   421  }
   422  
// Checkpoint pauses a sandbox and saves its state. The save itself is
// performed by the loader's save method using the options in o.
func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error {
	log.Debugf("containerManager.Checkpoint")
	return cm.l.save(o)
}
   428  
// PortForwardOpts contains options for port forwarding to a port in a
// container.
type PortForwardOpts struct {
	// FilePayload contains one fd for a UDS (or local port) used for port
	// forwarding.
	urpc.FilePayload

	// ContainerID is the container for the process being executed.
	ContainerID string
	// Port is the port to forward.
	Port uint16
}
   441  
   442  // PortForward initiates a port forward to the container.
   443  func (cm *containerManager) PortForward(opts *PortForwardOpts, _ *struct{}) error {
   444  	log.Debugf("containerManager.PortForward, cid: %s, port: %d", opts.ContainerID, opts.Port)
   445  	if err := cm.l.portForward(opts); err != nil {
   446  		log.Debugf("containerManager.PortForward failed, opts: %+v, err: %v", opts, err)
   447  		return err
   448  	}
   449  	return nil
   450  }
   451  
// RestoreOpts contains options related to restoring a container's file system.
type RestoreOpts struct {
	// FilePayload contains the files needed for restore, in this order:
	// 1. checkpoint state file (always required).
	// 2. checkpoint pages metadata file (only if HavePagesFile is set).
	// 3. checkpoint pages file (only if HavePagesFile is set).
	// 4. platform device file (only if HaveDeviceFile is set).
	// Restore rejects payloads that contain more files than these.
	urpc.FilePayload
	// HavePagesFile indicates that the pages metadata file and the pages
	// file are both present in the payload.
	HavePagesFile bool
	// HaveDeviceFile indicates that the platform device file is present in
	// the payload.
	HaveDeviceFile bool
}
   463  
   464  // Restore loads a container from a statefile.
   465  // The container's current kernel is destroyed, a restore environment is
   466  // created, and the kernel is recreated with the restore state file. The
   467  // container then sends the signal to start.
   468  func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
   469  	log.Debugf("containerManager.Restore")
   470  
   471  	if cm.l.state == restoring {
   472  		return fmt.Errorf("restore is already in progress")
   473  	}
   474  	if cm.l.state == started {
   475  		return fmt.Errorf("cannot restore a started container")
   476  	}
   477  	if len(o.Files) == 0 {
   478  		return fmt.Errorf("at least one file must be passed to Restore")
   479  	}
   480  
   481  	stateFile, err := o.ReleaseFD(0)
   482  	if err != nil {
   483  		return err
   484  	}
   485  
   486  	var stat unix.Stat_t
   487  	if err := unix.Fstat(stateFile.FD(), &stat); err != nil {
   488  		return err
   489  	}
   490  	if stat.Size == 0 {
   491  		return fmt.Errorf("statefile cannot be empty")
   492  	}
   493  
   494  	cm.restorer = &restorer{restoreDone: cm.onRestoreDone, stateFile: stateFile}
   495  	cm.l.restoreWaiters = sync.NewCond(&cm.l.mu)
   496  	cm.l.state = restoring
   497  
   498  	fileIdx := 1
   499  	if o.HavePagesFile {
   500  		cm.restorer.pagesMetadata, err = o.ReleaseFD(fileIdx)
   501  		if err != nil {
   502  			return err
   503  		}
   504  		defer cm.restorer.pagesMetadata.Close()
   505  		fileIdx++
   506  
   507  		cm.restorer.pagesFile, err = o.ReleaseFD(fileIdx)
   508  		if err != nil {
   509  			return err
   510  		}
   511  		defer cm.restorer.pagesFile.Close()
   512  		fileIdx++
   513  	}
   514  
   515  	if o.HaveDeviceFile {
   516  		cm.restorer.deviceFile, err = o.ReleaseFD(fileIdx)
   517  		if err != nil {
   518  			return err
   519  		}
   520  		fileIdx++
   521  	}
   522  
   523  	if fileIdx < len(o.Files) {
   524  		return fmt.Errorf("more files passed to Restore than expected")
   525  	}
   526  
   527  	// Pause the kernel while we build a new one.
   528  	cm.l.k.Pause()
   529  
   530  	metadata, err := statefile.MetadataUnsafe(cm.restorer.stateFile)
   531  	if err != nil {
   532  		return fmt.Errorf("reading metadata from statefile: %w", err)
   533  	}
   534  	var count int
   535  	countStr, ok := metadata["container_count"]
   536  	if !ok {
   537  		// TODO(gvisor.dev/issue/1956): Add container count with syscall save
   538  		// trigger. For now, assume that only a single container exists if metadata
   539  		// isn't present.
   540  		//
   541  		// -return errors.New("container count not present in state file")
   542  		count = 1
   543  	} else {
   544  		count, err = strconv.Atoi(countStr)
   545  		if err != nil {
   546  			return fmt.Errorf("invalid container count: %w", err)
   547  		}
   548  		if count < 1 {
   549  			return fmt.Errorf("invalid container count value: %v", count)
   550  		}
   551  	}
   552  	cm.restorer.totalContainers = count
   553  	log.Infof("Restoring a total of %d containers", cm.restorer.totalContainers)
   554  
   555  	if _, err := unix.Seek(stateFile.FD(), 0, 0); err != nil {
   556  		return fmt.Errorf("rewinding state file: %w", err)
   557  	}
   558  
   559  	return cm.restorer.restoreContainerInfo(cm.l, &cm.l.root)
   560  }
   561  
// onRestoreDone is installed by Restore as the restorer's restoreDone
// callback. It signals the sandbox to start via onStart and, if that
// succeeds, wakes everything waiting on the loader's restoreWaiters and
// drops the restorer. If onStart fails, the error is returned and neither of
// those steps is performed.
func (cm *containerManager) onRestoreDone() error {
	if err := cm.onStart(); err != nil {
		return err
	}

	cm.l.restoreWaiters.Broadcast()
	cm.restorer = nil
	return nil
}
   571  
   572  func (cm *containerManager) RestoreSubcontainer(args *StartArgs, _ *struct{}) error {
   573  	log.Debugf("containerManager.RestoreSubcontainer, cid: %s, args: %+v", args.CID, args)
   574  
   575  	if cm.l.state != restoring {
   576  		return fmt.Errorf("sandbox is not being restored, cannot restore subcontainer")
   577  	}
   578  
   579  	// Validate arguments.
   580  	if args.Spec == nil {
   581  		return errors.New("start arguments missing spec")
   582  	}
   583  	if args.Conf == nil {
   584  		return errors.New("start arguments missing config")
   585  	}
   586  	if args.CID == "" {
   587  		return errors.New("start argument missing container ID")
   588  	}
   589  	expectedFDs := 1 // At least one FD for the root filesystem.
   590  	expectedFDs += args.NumGoferFilestoreFDs
   591  	if !args.Spec.Process.Terminal {
   592  		expectedFDs += 3
   593  	}
   594  	if len(args.Files) < expectedFDs {
   595  		return fmt.Errorf("restore arguments must contain at least %d FDs, but only got %d", expectedFDs, len(args.Files))
   596  	}
   597  
   598  	// All validation passed, logs the spec for debugging.
   599  	specutils.LogSpecDebug(args.Spec, args.Conf.OCISeccomp)
   600  
   601  	goferFiles := args.Files
   602  	var stdios []*fd.FD
   603  	if !args.Spec.Process.Terminal {
   604  		// When not using a terminal, stdios come as the first 3 files in the
   605  		// payload.
   606  		var err error
   607  		stdios, err = fd.NewFromFiles(goferFiles[:3])
   608  		if err != nil {
   609  			return fmt.Errorf("error dup'ing stdio files: %w", err)
   610  		}
   611  		goferFiles = goferFiles[3:]
   612  	}
   613  
   614  	var goferFilestoreFDs []*fd.FD
   615  	for i := 0; i < args.NumGoferFilestoreFDs; i++ {
   616  		overlayFilestoreFD, err := fd.NewFromFile(goferFiles[i])
   617  		if err != nil {
   618  			return fmt.Errorf("error dup'ing overlay filestore file: %w", err)
   619  		}
   620  		goferFilestoreFDs = append(goferFilestoreFDs, overlayFilestoreFD)
   621  	}
   622  	goferFiles = goferFiles[args.NumGoferFilestoreFDs:]
   623  
   624  	var devGoferFD *fd.FD
   625  	if args.IsDevIoFilePresent {
   626  		var err error
   627  		devGoferFD, err = fd.NewFromFile(goferFiles[0])
   628  		if err != nil {
   629  			return fmt.Errorf("error dup'ing dev gofer file: %w", err)
   630  		}
   631  		goferFiles = goferFiles[1:]
   632  	}
   633  
   634  	goferFDs, err := fd.NewFromFiles(goferFiles)
   635  	if err != nil {
   636  		return fmt.Errorf("error dup'ing gofer files: %w", err)
   637  	}
   638  
   639  	if err := cm.restorer.restoreSubcontainer(args.Spec, args.Conf, cm.l, args.CID, stdios, goferFDs, goferFilestoreFDs, devGoferFD, args.GoferMountConfs); err != nil {
   640  		log.Debugf("containerManager.RestoreSubcontainer failed, cid: %s, args: %+v, err: %v", args.CID, args, err)
   641  		return err
   642  	}
   643  	log.Debugf("Container restored, cid: %s", args.CID)
   644  	return nil
   645  }
   646  
// Wait waits for the init process in the given container. The wait is
// performed by the loader's waitContainer, which is given waitStatus to fill
// in. Both the request and its outcome are logged at debug level.
func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error {
	log.Debugf("containerManager.Wait, cid: %s", *cid)
	err := cm.l.waitContainer(*cid, waitStatus)
	log.Debugf("containerManager.Wait returned, cid: %s, waitStatus: %#x, err: %v", *cid, *waitStatus, err)
	return err
}
   654  
// WaitPIDArgs are arguments to the WaitPID method.
type WaitPIDArgs struct {
	// PID is the PID in the container's PID namespace. WaitPID converts it to
	// a kernel.ThreadID before passing it to the loader.
	PID int32

	// CID is the container ID.
	CID string
}
   663  
   664  // WaitPID waits for the process with PID 'pid' in the sandbox.
   665  func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
   666  	log.Debugf("containerManager.Wait, cid: %s, pid: %d", args.CID, args.PID)
   667  	err := cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus)
   668  	log.Debugf("containerManager.Wait, cid: %s, pid: %d, waitStatus: %#x, err: %v", args.CID, args.PID, *waitStatus, err)
   669  	return err
   670  }
   671  
   672  // SignalDeliveryMode enumerates different signal delivery modes.
   673  type SignalDeliveryMode int
   674  
   675  const (
   676  	// DeliverToProcess delivers the signal to the container process with
   677  	// the specified PID. If PID is 0, then the container init process is
   678  	// signaled.
   679  	DeliverToProcess SignalDeliveryMode = iota
   680  
   681  	// DeliverToAllProcesses delivers the signal to all processes in the
   682  	// container. PID must be 0.
   683  	DeliverToAllProcesses
   684  
   685  	// DeliverToForegroundProcessGroup delivers the signal to the
   686  	// foreground process group in the same TTY session as the specified
   687  	// process. If PID is 0, then the signal is delivered to the foreground
   688  	// process group for the TTY for the init process.
   689  	DeliverToForegroundProcessGroup
   690  )
   691  
   692  func (s SignalDeliveryMode) String() string {
   693  	switch s {
   694  	case DeliverToProcess:
   695  		return "Process"
   696  	case DeliverToAllProcesses:
   697  		return "All"
   698  	case DeliverToForegroundProcessGroup:
   699  		return "Foreground Process Group"
   700  	}
   701  	return fmt.Sprintf("unknown signal delivery mode: %d", s)
   702  }
   703  
// SignalArgs are arguments to the Signal method.
type SignalArgs struct {
	// CID is the container ID.
	CID string

	// Signo is the signal to send to the process.
	Signo int32

	// PID is the process ID in the given container that will be signaled,
	// relative to the root PID namespace, not the container's.
	// If 0, the container init process is used, as stated on Signal and
	// DeliverToProcess.
	// NOTE(review): this comment previously said "the root container will be
	// signalled" — confirm which wording matches the loader's behavior.
	PID int32

	// Mode is the signal delivery mode.
	Mode SignalDeliveryMode
}
   720  
// Signal sends a signal to one or more processes in a container. If args.PID
// is 0, then the container init process is used. Depending on the
// args.Mode option, the signal may be sent directly to the
// indicated process, to all processes in the container, or to the foreground
// process group. Delivery itself is performed by the loader's signal method.
func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error {
	log.Debugf("containerManager.Signal: cid: %s, PID: %d, signal: %d, mode: %v", args.CID, args.PID, args.Signo, args.Mode)
	return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode)
}
   730  
// CreateTraceSessionArgs are arguments to the CreateTraceSession method.
type CreateTraceSessionArgs struct {
	// Config is the session configuration passed to seccheck.Create.
	Config seccheck.SessionConfig
	// Force is forwarded unchanged to seccheck.Create.
	Force bool
	// FilePayload carries the sink files. The file at index i, when non-nil,
	// is duplicated and stored as the FD of Config.Sinks[i].
	urpc.FilePayload
}
   737  
   738  // CreateTraceSession creates a new trace session.
   739  func (cm *containerManager) CreateTraceSession(args *CreateTraceSessionArgs, _ *struct{}) error {
   740  	log.Debugf("containerManager.CreateTraceSession: config: %+v", args.Config)
   741  	for i, sinkFile := range args.Files {
   742  		if sinkFile != nil {
   743  			fd, err := fd.NewFromFile(sinkFile)
   744  			if err != nil {
   745  				return err
   746  			}
   747  			args.Config.Sinks[i].FD = fd
   748  		}
   749  	}
   750  	return seccheck.Create(&args.Config, args.Force)
   751  }
   752  
// DeleteTraceSession deletes an existing trace session. The session is
// identified by name and removed through seccheck.Delete.
func (cm *containerManager) DeleteTraceSession(name *string, _ *struct{}) error {
	log.Debugf("containerManager.DeleteTraceSession: name: %q", *name)
	return seccheck.Delete(*name)
}
   758  
// ListTraceSessions lists trace sessions. The result is written to out by
// seccheck.List; this method always returns nil.
func (cm *containerManager) ListTraceSessions(_ *struct{}, out *[]seccheck.SessionConfig) error {
	log.Debugf("containerManager.ListTraceSessions")
	seccheck.List(out)
	return nil
}
   765  
   766  // ProcfsDump dumps procfs state of the sandbox.
   767  func (cm *containerManager) ProcfsDump(_ *struct{}, out *[]procfs.ProcessProcfsDump) error {
   768  	log.Debugf("containerManager.ProcfsDump")
   769  	ts := cm.l.k.TaskSet()
   770  	pidns := ts.Root
   771  	*out = make([]procfs.ProcessProcfsDump, 0, len(cm.l.processes))
   772  	for _, tg := range pidns.ThreadGroups() {
   773  		pid := pidns.IDOfThreadGroup(tg)
   774  		procDump, err := procfs.Dump(tg.Leader(), pid, pidns)
   775  		if err != nil {
   776  			log.Warningf("skipping procfs dump for PID %s: %v", pid, err)
   777  			continue
   778  		}
   779  		*out = append(*out, procDump)
   780  	}
   781  	return nil
   782  }
   783  
// MountArgs contains arguments to the Mount method.
type MountArgs struct {
	// ContainerID is the container in which we will mount the filesystem.
	ContainerID string

	// Source is the mount source.
	Source string

	// Destination is the mount target. Mount cleans the path and requires
	// the result to be absolute.
	Destination string

	// FsType is the filesystem type. Mount currently accepts only
	// erofs.Name and rejects every other value.
	FsType string

	// FilePayload contains the source image FD, if required by the filesystem.
	// For erofs exactly one file must be provided.
	urpc.FilePayload
}

// initTID is the thread ID that Mount looks up in the container's PID
// namespace to find the init process.
const initTID kernel.ThreadID = 1
   803  
   804  // Mount mounts a filesystem in a container.
   805  func (cm *containerManager) Mount(args *MountArgs, _ *struct{}) error {
   806  	log.Debugf("containerManager.Mount, cid: %s, args: %+v", args.ContainerID, args)
   807  
   808  	var cu cleanup.Cleanup
   809  	defer cu.Clean()
   810  
   811  	eid := execID{cid: args.ContainerID}
   812  	ep, ok := cm.l.processes[eid]
   813  	if !ok {
   814  		return fmt.Errorf("container %v is deleted", args.ContainerID)
   815  	}
   816  	if ep.tg == nil {
   817  		return fmt.Errorf("container %v isn't started", args.ContainerID)
   818  	}
   819  
   820  	t := ep.tg.PIDNamespace().TaskWithID(initTID)
   821  	if t == nil {
   822  		return fmt.Errorf("failed to find init process")
   823  	}
   824  
   825  	source := args.Source
   826  	dest := path.Clean(args.Destination)
   827  	fstype := args.FsType
   828  
   829  	if dest[0] != '/' {
   830  		return fmt.Errorf("absolute path must be provided for destination")
   831  	}
   832  
   833  	var opts vfs.MountOptions
   834  	switch fstype {
   835  	case erofs.Name:
   836  		if len(args.FilePayload.Files) != 1 {
   837  			return fmt.Errorf("exactly one image file must be provided")
   838  		}
   839  
   840  		imageFD, err := unix.Dup(int(args.FilePayload.Files[0].Fd()))
   841  		if err != nil {
   842  			return fmt.Errorf("failed to dup image FD: %v", err)
   843  		}
   844  		cu.Add(func() { unix.Close(imageFD) })
   845  
   846  		opts = vfs.MountOptions{
   847  			ReadOnly: true,
   848  			GetFilesystemOptions: vfs.GetFilesystemOptions{
   849  				InternalMount: true,
   850  				Data:          fmt.Sprintf("ifd=%d", imageFD),
   851  			},
   852  		}
   853  
   854  	default:
   855  		return fmt.Errorf("unsupported filesystem type: %v", fstype)
   856  	}
   857  
   858  	ctx := context.Background()
   859  	root := t.FSContext().RootDirectory()
   860  	defer root.DecRef(ctx)
   861  
   862  	pop := vfs.PathOperation{
   863  		Root:  root,
   864  		Start: root,
   865  		Path:  fspath.Parse(dest),
   866  	}
   867  
   868  	if _, err := t.Kernel().VFS().MountAt(ctx, t.Credentials(), source, &pop, fstype, &opts); err != nil {
   869  		return err
   870  	}
   871  	log.Infof("Mounted %q to %q type: %s, internal-options: %q, in container %q", source, dest, fstype, opts.GetFilesystemOptions.Data, args.ContainerID)
   872  	cu.Release()
   873  	return nil
   874  }
   875  
// ContainerRuntimeState returns the runtime state of a container. The state
// is obtained from the loader for the given container ID and stored in state;
// this method always returns nil.
func (cm *containerManager) ContainerRuntimeState(cid *string, state *ContainerRuntimeState) error {
	log.Debugf("containerManager.ContainerRuntimeState: cid: %s", *cid)
	*state = cm.l.containerRuntimeState(*cid)
	return nil
}