gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/boot/loader.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package boot loads the kernel and runs a container.
    16  package boot
    17  
    18  import (
    19  	"errors"
    20  	"fmt"
    21  	mrand "math/rand"
    22  	"os"
    23  	"runtime"
    24  	"strconv"
    25  	gtime "time"
    26  
    27  	specs "github.com/opencontainers/runtime-spec/specs-go"
    28  	"github.com/syndtr/gocapability/capability"
    29  	"golang.org/x/sys/unix"
    30  	"gvisor.dev/gvisor/pkg/abi/linux"
    31  	"gvisor.dev/gvisor/pkg/bpf"
    32  	"gvisor.dev/gvisor/pkg/cleanup"
    33  	"gvisor.dev/gvisor/pkg/context"
    34  	"gvisor.dev/gvisor/pkg/coverage"
    35  	"gvisor.dev/gvisor/pkg/cpuid"
    36  	"gvisor.dev/gvisor/pkg/fd"
    37  	"gvisor.dev/gvisor/pkg/log"
    38  	"gvisor.dev/gvisor/pkg/memutil"
    39  	"gvisor.dev/gvisor/pkg/metric"
    40  	"gvisor.dev/gvisor/pkg/rand"
    41  	"gvisor.dev/gvisor/pkg/refs"
    42  	"gvisor.dev/gvisor/pkg/sentry/control"
    43  	"gvisor.dev/gvisor/pkg/sentry/devices/nvproxy"
    44  	"gvisor.dev/gvisor/pkg/sentry/fdimport"
    45  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
    46  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
    47  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/user"
    48  	"gvisor.dev/gvisor/pkg/sentry/inet"
    49  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    50  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    51  	"gvisor.dev/gvisor/pkg/sentry/loader"
    52  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    53  	"gvisor.dev/gvisor/pkg/sentry/platform"
    54  	"gvisor.dev/gvisor/pkg/sentry/seccheck"
    55  	pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
    56  	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
    57  	"gvisor.dev/gvisor/pkg/sentry/time"
    58  	"gvisor.dev/gvisor/pkg/sentry/usage"
    59  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    60  	"gvisor.dev/gvisor/pkg/sentry/watchdog"
    61  	"gvisor.dev/gvisor/pkg/sighandling"
    62  	"gvisor.dev/gvisor/pkg/sync"
    63  	"gvisor.dev/gvisor/pkg/tcpip"
    64  	"gvisor.dev/gvisor/pkg/tcpip/link/ethernet"
    65  	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
    66  	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
    67  	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
    68  	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
    69  	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
    70  	"gvisor.dev/gvisor/pkg/tcpip/stack"
    71  	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
    72  	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
    73  	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
    74  	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
    75  	"gvisor.dev/gvisor/runsc/boot/filter"
    76  	_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
    77  	pf "gvisor.dev/gvisor/runsc/boot/portforward"
    78  	"gvisor.dev/gvisor/runsc/boot/pprof"
    79  	"gvisor.dev/gvisor/runsc/config"
    80  	"gvisor.dev/gvisor/runsc/profile"
    81  	"gvisor.dev/gvisor/runsc/specutils"
    82  	"gvisor.dev/gvisor/runsc/specutils/seccomp"
    83  
    84  	// Top-level inet providers.
    85  	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
    86  	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
    87  
    88  	// Include other supported socket providers.
    89  	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
    90  	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
    91  	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent"
    92  	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
    93  )
    94  
    95  // ContainerRuntimeState is the runtime state of a container.
    96  type ContainerRuntimeState int
    97  
    98  const (
    99  	// RuntimeStateInvalid is used only when an error occurs.
   100  	RuntimeStateInvalid ContainerRuntimeState = iota
   101  	// RuntimeStateCreating indicates that the container is being
   102  	// created, but has not started running yet.
   103  	RuntimeStateCreating
   104  	// RuntimeStateRunning indicates that the container is running.
   105  	RuntimeStateRunning
   106  	// RuntimeStateStopped indicates that the container has stopped.
   107  	RuntimeStateStopped
   108  )
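
// Illustrative sketch, not part of the original file: a String method along
// these lines could make ContainerRuntimeState readable in logs, assuming the
// four states above are the only ones.
//
//	func (s ContainerRuntimeState) String() string {
//		switch s {
//		case RuntimeStateCreating:
//			return "creating"
//		case RuntimeStateRunning:
//			return "running"
//		case RuntimeStateStopped:
//			return "stopped"
//		default:
//			return "invalid"
//		}
//	}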
   109  
   110  type containerInfo struct {
   111  	cid string
   112  
   113  	containerName string
   114  
   115  	conf *config.Config
   116  
   117  	// spec is the base configuration for the root container.
   118  	spec *specs.Spec
   119  
   120  	// procArgs refers to the container's init task.
   121  	procArgs kernel.CreateProcessArgs
   122  
   123  	// stdioFDs contains stdin, stdout, and stderr.
   124  	stdioFDs []*fd.FD
   125  
   126  	// passFDs are mappings of user-supplied host to guest file descriptors.
   127  	passFDs []fdMapping
   128  
   129  	// execFD is the host file descriptor used for program execution.
   130  	execFD *fd.FD
   131  
   132  	// goferFDs are the FDs that attach the sandbox to the gofers.
   133  	goferFDs []*fd.FD
   134  
   135  	// devGoferFD is the FD to attach the sandbox to the dev gofer.
   136  	devGoferFD *fd.FD
   137  
   138  	// goferFilestoreFDs are FDs to the regular files that will back the tmpfs or
   139  	// overlayfs mount for certain gofer mounts.
   140  	goferFilestoreFDs []*fd.FD
   141  
   142  	// goferMountConfs contains information about how the gofer mounts have been
   143  	// configured. The first entry is for rootfs and the following entries are
   144  	// for bind mounts in Spec.Mounts (in the same order).
   145  	goferMountConfs []GoferMountConf
   146  
   147  	// nvidiaUVMDevMajor is the device major number used for nvidia-uvm.
   148  	nvidiaUVMDevMajor uint32
   149  
   150  	// nvidiaDriverVersion is the NVIDIA driver ABI version to use for
   151  	// communicating with NVIDIA devices on the host.
   152  	nvidiaDriverVersion string
   153  }
   154  
   155  type loaderState int
   156  
   157  const (
   158  	// created indicates that the Loader has been created, but not started yet.
   159  	created loaderState = iota
   160  	// started indicates that the Loader has been started.
   161  	started
   162  	// restoring indicates that the Loader has been created and is restoring
   163  	// containers. It will change to started after restore is completed.
   164  	restoring
   165  )
   166  
   167  // Loader keeps state needed to start the kernel and run the container.
   168  type Loader struct {
   169  	// k is the kernel.
   170  	k *kernel.Kernel
   171  
   172  	// ctrl is the control server.
   173  	ctrl *controller
   174  
   175  	// root contains information about the root container in the sandbox.
   176  	root containerInfo
   177  
   178  	watchdog *watchdog.Watchdog
   179  
   180  	// stopSignalForwarding disables forwarding of signals to the sandboxed
   181  	// container. It should be called when a sandbox is destroyed.
   182  	stopSignalForwarding func()
   183  
   184  	// stopProfiling stops profiling started at container creation. It
   185  	// should be called when a sandbox is destroyed.
   186  	stopProfiling func()
   187  
   188  	// PreSeccompCallback is called right before installing seccomp filters.
   189  	PreSeccompCallback func()
   190  
   191  	// restore is set to true if we are restoring a container.
   192  	restore bool
   193  
   194  	restoreWaiters *sync.Cond
   195  
   196  	// sandboxID is the ID for the whole sandbox.
   197  	sandboxID string
   198  
   199  	// mountHints provides extra information about mounts for containers that
   200  	// apply to the entire pod.
   201  	mountHints *PodMountHints
   202  
   203  	// productName is the value to show in
   204  	// /sys/devices/virtual/dmi/id/product_name.
   205  	productName string
   206  
   207  	// mu guards the fields below.
   208  	mu sync.Mutex
   209  
   210  	// state is guarded by mu.
   211  	state loaderState
   212  
   213  	// sharedMounts holds VFS mounts that may be shared between containers within
   214  	// the same pod. It is mapped by mount source.
   215  	//
   216  	// sharedMounts is guarded by mu.
   217  	sharedMounts map[string]*vfs.Mount
   218  
   219  	// processes maps containers' init processes and exec invocations. Root
   220  	// processes are keyed with container ID and pid=0, while exec invocations
   221  	// have the corresponding pid set.
   222  	//
   223  	// processes is guarded by mu.
   224  	processes map[execID]*execProcess
   225  
   226  	// containerIDs stores container names and IDs to assist with restore and
   227  	// container naming when the user didn't provide one.
   228  	//
   229  	// Mapping: name -> cid.
   230  	// containerIDs is guarded by mu.
   231  	containerIDs map[string]string
   232  
   233  	// portForwardProxies is a list of active port forwarding connections.
   234  	//
   235  	// portForwardProxies is guarded by mu.
   236  	portForwardProxies []*pf.Proxy
   237  }
   238  
   239  // execID uniquely identifies a sentry process that is executed in a container.
   240  type execID struct {
   241  	cid string
   242  	pid kernel.ThreadID
   243  }
   244  
   245  // execProcess contains the thread group and host TTY of a sentry process.
   246  type execProcess struct {
   247  	// tg will be nil for containers that haven't started yet.
   248  	tg *kernel.ThreadGroup
   249  
   250  	// tty will be nil if the process is not attached to a terminal.
   251  	tty *host.TTYFileDescription
   252  
   253  	// pidnsPath is the PID namespace path in the spec.
   254  	pidnsPath string
   255  
   256  	// hostTTY is present when creating a sub-container with terminal enabled.
   257  	// TTY file is passed during container create and must be saved until
   258  	// container start.
   259  	hostTTY *fd.FD
   260  }
   261  
   262  // fdMapping maps guest to host file descriptors. Guest file descriptors are
   263  // exposed to the application inside the sandbox through the FD table.
   264  type fdMapping struct {
   265  	guest int
   266  	host  *fd.FD
   267  }
   268  
   269  // FDMapping is a helper type to represent a mapping from guest to host file
   270  // descriptors. In contrast to the unexported fdMapping type, it does not imply
   271  // file ownership.
   272  type FDMapping struct {
   273  	Guest int
   274  	Host  int
   275  }
   276  
   277  func init() {
   278  	// Initialize the random number generator.
   279  	mrand.Seed(gtime.Now().UnixNano())
   280  }
   281  
   282  // Args are the arguments for New().
   283  type Args struct {
   284  	// ID is the sandbox ID.
   285  	ID string
   286  	// Spec is the sandbox specification.
   287  	Spec *specs.Spec
   288  	// Conf is the system configuration.
   289  	Conf *config.Config
   290  	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
   291  	// of this FD and may close it at any time.
   292  	ControllerFD int
   293  	// Device is an optional argument that is passed to the platform. The Loader
   294  	// takes ownership of this file and may close it at any time.
   295  	Device *fd.FD
   296  	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
   297  	// takes ownership of these FDs and may close them at any time.
   298  	GoferFDs []int
   299  	// DevGoferFD is the FD for the dev gofer connection. The Loader takes
   300  	// ownership of this FD and may close it at any time.
   301  	DevGoferFD int
   302  	// StdioFDs is the stdio for the application. The Loader takes ownership of
   303  	// these FDs and may close them at any time.
   304  	StdioFDs []int
   305  	// PassFDs are user-supplied FD mappings from host to guest descriptors.
   306  	// The Loader takes ownership of these FDs and may close them at any time.
   307  	PassFDs []FDMapping
   308  	// ExecFD is the host file descriptor used for program execution.
   309  	ExecFD int
   310  	// GoferFilestoreFDs are FDs to the regular files that will back the tmpfs or
   311  	// overlayfs mount for certain gofer mounts.
   312  	GoferFilestoreFDs []int
   313  	// GoferMountConfs contains information about how the gofer mounts have been
   314  	// configured. The first entry is for rootfs and the following entries are
   315  	// for bind mounts in Spec.Mounts (in the same order).
   316  	GoferMountConfs []GoferMountConf
   317  	// NumCPU is the number of CPUs to create inside the sandbox.
   318  	NumCPU int
   319  	// TotalMem is the initial amount of total memory to report back to the
   320  	// container.
   321  	TotalMem uint64
   322  	// TotalHostMem is the total memory reported by host /proc/meminfo.
   323  	TotalHostMem uint64
   324  	// UserLogFD is the file descriptor to write user logs to.
   325  	UserLogFD int
   326  	// ProductName is the value to show in
   327  	// /sys/devices/virtual/dmi/id/product_name.
   328  	ProductName string
   329  	// PodInitConfigFD is the file descriptor to the file passed in the
   330  	// --pod-init-config flag.
   331  	PodInitConfigFD int
   332  	// SinkFDs is an ordered array of file descriptors to be used by seccheck
   333  	// sinks configured from the --pod-init-config file.
   334  	SinkFDs []int
   335  	// ProfileOpts contains the set of profiles to enable and the
   336  	// corresponding FDs where profile data will be written.
   337  	ProfileOpts profile.Opts
   338  	// NvidiaDriverVersion is the NVIDIA driver ABI version to use for
   339  	// communicating with NVIDIA devices on the host.
   340  	NvidiaDriverVersion string
   341  }
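
// For illustration only (not part of the original file; the concrete values
// are hypothetical): runsc's boot command assembles Args roughly along these
// lines, with the FD numbers inherited from the parent runsc process. Negative
// values mean "not provided" for the optional FDs.
//
//	args := Args{
//		ID:              "sandbox-1",
//		Spec:            spec, // *specs.Spec parsed from config.json
//		Conf:            conf, // *config.Config built from runsc flags
//		ControllerFD:    3,
//		StdioFDs:        []int{4, 5, 6},
//		GoferFDs:        []int{7},
//		DevGoferFD:      -1,
//		ExecFD:          -1,
//		PodInitConfigFD: -1,
//		PassFDs:         []FDMapping{{Guest: 3, Host: 8}},
//		NumCPU:          4,
//		// GoferMountConfs, TotalMem, etc. omitted for brevity.
//	}
//	l, err := New(args)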
   342  
   343  // startingStdioFD makes sure stdioFDs are always the same on initial start and on restore.
   344  const startingStdioFD = 256
   345  
   346  func getRootCredentials(spec *specs.Spec, conf *config.Config, userNs *auth.UserNamespace) *auth.Credentials {
   347  	// Create capabilities.
   348  	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
   349  	if err != nil {
   350  		return nil
   351  	}
   352  
   353  	// Convert the spec's additional GIDs to KGIDs.
   354  	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
   355  	for _, GID := range spec.Process.User.AdditionalGids {
   356  		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
   357  	}
   358  
   359  	if userNs == nil {
   360  		userNs = auth.NewRootUserNamespace()
   361  	}
   362  	// Create credentials.
   363  	creds := auth.NewUserCredentials(
   364  		auth.KUID(spec.Process.User.UID),
   365  		auth.KGID(spec.Process.User.GID),
   366  		extraKGIDs,
   367  		caps,
   368  		userNs)
   369  
   370  	return creds
   371  }
   372  
   373  // New initializes a new kernel loader configured by spec.
   374  // New also handles setting up a kernel for restoring a container.
   375  func New(args Args) (*Loader, error) {
   376  	stopProfilingRuntime := profile.Start(args.ProfileOpts)
   377  	stopProfiling := func() {
   378  		stopProfilingRuntime()
   379  		metric.StopProfilingMetrics()
   380  	}
   381  
   382  	// Initialize seccheck points.
   383  	seccheck.Initialize()
   384  
   385  	// We initialize the rand package now to make sure /dev/urandom is pre-opened
   386  	// on kernels that do not support getrandom(2).
   387  	if err := rand.Init(); err != nil {
   388  		return nil, fmt.Errorf("setting up rand: %w", err)
   389  	}
   390  
   391  	if err := usage.Init(); err != nil {
   392  		return nil, fmt.Errorf("setting up memory usage: %w", err)
   393  	}
   394  
   395  	if specutils.NVProxyEnabled(args.Spec, args.Conf) {
   396  		nvproxy.Init()
   397  	}
   398  
   399  	kernel.IOUringEnabled = args.Conf.IOUring
   400  
   401  	eid := execID{cid: args.ID}
   402  	l := &Loader{
   403  		sandboxID:     args.ID,
   404  		processes:     map[execID]*execProcess{eid: {}},
   405  		sharedMounts:  make(map[string]*vfs.Mount),
   406  		stopProfiling: stopProfiling,
   407  		productName:   args.ProductName,
   408  		containerIDs:  map[string]string{},
   409  	}
   410  
   411  	containerName := l.registerContainerLocked(args.Spec, args.ID)
   412  	l.root = containerInfo{
   413  		cid:                 args.ID,
   414  		containerName:       containerName,
   415  		conf:                args.Conf,
   416  		spec:                args.Spec,
   417  		goferMountConfs:     args.GoferMountConfs,
   418  		nvidiaDriverVersion: args.NvidiaDriverVersion,
   419  	}
   420  
   421  	// Make host FDs stable between invocations. Host FDs must map to the exact
   422  	// same number when the sandbox is restored. Otherwise the wrong FD will be
   423  	// used.
   424  	newfd := startingStdioFD
   425  
   426  	for _, stdioFD := range args.StdioFDs {
   427  		// Check that newfd is unused to avoid clobbering it.
   428  		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
   429  			if err != nil {
   430  				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
   431  			}
   432  			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
   433  		}
   434  
   435  		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
   436  		if err != nil {
   437  			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
   438  		}
   439  		l.root.stdioFDs = append(l.root.stdioFDs, fd.New(newfd))
   440  		_ = unix.Close(stdioFD)
   441  		newfd++
   442  	}
   443  	for _, goferFD := range args.GoferFDs {
   444  		l.root.goferFDs = append(l.root.goferFDs, fd.New(goferFD))
   445  	}
   446  	for _, filestoreFD := range args.GoferFilestoreFDs {
   447  		l.root.goferFilestoreFDs = append(l.root.goferFilestoreFDs, fd.New(filestoreFD))
   448  	}
   449  	if args.DevGoferFD >= 0 {
   450  		l.root.devGoferFD = fd.New(args.DevGoferFD)
   451  	}
   452  	if args.ExecFD >= 0 {
   453  		l.root.execFD = fd.New(args.ExecFD)
   454  	}
   455  
   456  	for _, customFD := range args.PassFDs {
   457  		l.root.passFDs = append(l.root.passFDs, fdMapping{
   458  			host:  fd.New(customFD.Host),
   459  			guest: customFD.Guest,
   460  		})
   461  	}
   462  
   463  	// Create kernel and platform.
   464  	p, err := createPlatform(args.Conf, args.Device)
   465  	if err != nil {
   466  		return nil, fmt.Errorf("creating platform: %w", err)
   467  	}
   468  	if specutils.NVProxyEnabled(args.Spec, args.Conf) && p.OwnsPageTables() {
   469  		return nil, fmt.Errorf("--nvproxy is incompatible with platform %s: owns page tables", args.Conf.Platform)
   470  	}
   471  	l.k = &kernel.Kernel{Platform: p}
   472  
   473  	// Create memory file.
   474  	mf, err := createMemoryFile()
   475  	if err != nil {
   476  		return nil, fmt.Errorf("creating memory file: %w", err)
   477  	}
   478  	l.k.SetMemoryFile(mf)
   479  
   480  	// Create VDSO.
   481  	//
   482  	// Pass k as the platform since it is savable, unlike the actual platform.
   483  	vdso, err := loader.PrepareVDSO(l.k.MemoryFile())
   484  	if err != nil {
   485  		return nil, fmt.Errorf("creating vdso: %w", err)
   486  	}
   487  
   488  	// Create timekeeper.
   489  	tk := kernel.NewTimekeeper(l.k.MemoryFile(), vdso.ParamPage.FileRange())
   490  	tk.SetClocks(time.NewCalibratedClocks())
   491  
   492  	if err := enableStrace(args.Conf); err != nil {
   493  		return nil, fmt.Errorf("enabling strace: %w", err)
   494  	}
   495  
   496  	creds := getRootCredentials(args.Spec, args.Conf, nil /* UserNamespace */)
   497  	if creds == nil {
   498  		return nil, fmt.Errorf("getting root credentials")
   499  	}
   500  	// Create root network namespace/stack.
   501  	netns, err := newRootNetworkNamespace(args.Conf, tk, l.k, creds.UserNamespace)
   502  	if err != nil {
   503  		return nil, fmt.Errorf("creating network: %w", err)
   504  	}
   505  
   506  	if args.NumCPU == 0 {
   507  		args.NumCPU = runtime.NumCPU()
   508  	}
   509  	log.Infof("CPUs: %d", args.NumCPU)
   510  	runtime.GOMAXPROCS(args.NumCPU)
   511  
   512  	if args.TotalHostMem > 0 {
   513  		// As per tmpfs(5), the default size limit is 50% of total physical RAM.
   514  		// See mm/shmem.c:shmem_default_max_blocks().
   515  		tmpfs.SetDefaultSizeLimit(args.TotalHostMem / 2)
   516  		// Set a generous but sane upper bound on the maximum allowable size for
   517  		// memory file allocations.
   518  		usage.MaximumAllocatableBytes = args.TotalHostMem
   519  	}
   520  
   521  	if args.TotalMem > 0 {
   522  		// Adjust the total memory returned by the Sentry so that applications that
   523  		// use /proc/meminfo can make allocations based on this limit.
   524  		usage.MinimumTotalMemoryBytes = args.TotalMem
   525  		usage.MaximumTotalMemoryBytes = args.TotalMem
   526  		// Reset max allocatable to TotalMem because it's smaller than TotalHostMem.
   527  		usage.MaximumAllocatableBytes = args.TotalMem
   528  		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
   529  	}
   530  
   531  	maxFDLimit := kernel.MaxFdLimit
   532  	if args.Spec.Linux != nil && args.Spec.Linux.Sysctl != nil {
   533  		if val, ok := args.Spec.Linux.Sysctl["fs.nr_open"]; ok {
   534  			nrOpen, err := strconv.Atoi(val)
   535  			if err != nil {
   536  				return nil, fmt.Errorf("setting fs.nr_open=%s: %w", val, err)
   537  			}
   538  			if nrOpen <= 0 || nrOpen > int(kernel.MaxFdLimit) {
   539  				return nil, fmt.Errorf("setting fs.nr_open=%s", val)
   540  			}
   541  			maxFDLimit = int32(nrOpen)
   542  		}
   543  	}
   544  	// Initialize the Kernel object, which is required by the Context passed
   545  	// to createVFS in order to mount (among other things) procfs.
   546  	if err = l.k.Init(kernel.InitKernelArgs{
   547  		FeatureSet:           cpuid.HostFeatureSet().Fixed(),
   548  		Timekeeper:           tk,
   549  		RootUserNamespace:    creds.UserNamespace,
   550  		RootNetworkNamespace: netns,
   551  		ApplicationCores:     uint(args.NumCPU),
   552  		Vdso:                 vdso,
   553  		RootUTSNamespace:     kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
   554  		RootIPCNamespace:     kernel.NewIPCNamespace(creds.UserNamespace),
   555  		PIDNamespace:         kernel.NewRootPIDNamespace(creds.UserNamespace),
   556  		MaxFDLimit:           maxFDLimit,
   557  	}); err != nil {
   558  		return nil, fmt.Errorf("initializing kernel: %w", err)
   559  	}
   560  
   561  	if err := registerFilesystems(l.k, &l.root); err != nil {
   562  		return nil, fmt.Errorf("registering filesystems: %w", err)
   563  	}
   564  
   565  	// Turn on packet logging if enabled.
   566  	if args.Conf.LogPackets {
   567  		log.Infof("Packet logging enabled")
   568  		sniffer.LogPackets.Store(1)
   569  	} else {
   570  		log.Infof("Packet logging disabled")
   571  		sniffer.LogPackets.Store(0)
   572  	}
   573  
   574  	// Create a watchdog.
   575  	dogOpts := watchdog.DefaultOpts
   576  	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
   577  	l.watchdog = watchdog.New(l.k, dogOpts)
   578  
   579  	procArgs, err := createProcessArgs(args.ID, args.Spec, args.Conf, creds, l.k, l.k.RootPIDNamespace())
   580  	if err != nil {
   581  		return nil, fmt.Errorf("creating init process for root container: %w", err)
   582  	}
   583  	l.root.procArgs = procArgs
   584  
   585  	if err := initCompatLogs(args.UserLogFD); err != nil {
   586  		return nil, fmt.Errorf("initializing compat logs: %w", err)
   587  	}
   588  
   589  	l.mountHints, err = NewPodMountHints(args.Spec)
   590  	if err != nil {
   591  		return nil, fmt.Errorf("creating pod mount hints: %w", err)
   592  	}
   593  
   594  	// Set up host mount that will be used for imported fds.
   595  	hostFilesystem, err := host.NewFilesystem(l.k.VFS())
   596  	if err != nil {
   597  		return nil, fmt.Errorf("failed to create hostfs filesystem: %w", err)
   598  	}
   599  	defer hostFilesystem.DecRef(l.k.SupervisorContext())
   600  	l.k.SetHostMount(l.k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}))
   601  
   602  	if args.PodInitConfigFD >= 0 {
   603  		if err := setupSeccheck(args.PodInitConfigFD, args.SinkFDs); err != nil {
   604  			log.Warningf("unable to configure event session: %v", err)
   605  		}
   606  	}
   607  
   608  	l.k.RegisterContainerName(args.ID, l.root.containerName)
   609  
   610  	// We don't care about child signals; some platforms can generate a
   611  	// tremendous number of useless ones (I'm looking at you, ptrace).
   612  	if err := sighandling.IgnoreChildStop(); err != nil {
   613  		return nil, fmt.Errorf("ignore child stop signals failed: %w", err)
   614  	}
   615  
   616  	// Create the control server using the provided FD.
   617  	//
   618  	// This must be done *after* we have initialized the kernel since the
   619  	// controller is used to configure the kernel's network stack.
   620  	ctrl, err := newController(args.ControllerFD, l)
   621  	if err != nil {
   622  		return nil, fmt.Errorf("creating control server: %w", err)
   623  	}
   624  	l.ctrl = ctrl
   625  
   626  	// Only start serving after the controller is set on the Loader and the
   627  	// Loader is set on the controller, because both are used in the urpc methods.
   628  	if err := ctrl.srv.StartServing(); err != nil {
   629  		return nil, fmt.Errorf("starting control server: %w", err)
   630  	}
   631  
   632  	return l, nil
   633  }
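
// A rough sketch of how a caller drives the Loader (not part of the original
// file; runsc's boot command does this with its own setup and error handling):
//
//	l, err := New(args)
//	if err != nil {
//		// handle error
//	}
//	l.WaitForStartSignal() // block until the controller receives the start request
//	if err := l.Run(); err != nil {
//		// handle error
//	}
//	ws := l.WaitExit() // wait status of the root container's init process
//	log.Infof("sandbox exited with status: %v", ws)
//	l.Destroy()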
   634  
   635  // createProcessArgs creates args that can be used with kernel.CreateProcess.
   636  func createProcessArgs(id string, spec *specs.Spec, conf *config.Config, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
   637  	// Create initial limits.
   638  	ls, err := createLimitSet(spec, specutils.TPUProxyIsEnabled(spec, conf))
   639  	if err != nil {
   640  		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %w", err)
   641  	}
   642  	env, err := specutils.ResolveEnvs(spec.Process.Env)
   643  	if err != nil {
   644  		return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err)
   645  	}
   646  
   647  	wd := spec.Process.Cwd
   648  	if wd == "" {
   649  		wd = "/"
   650  	}
   651  
   652  	// Create the process arguments.
   653  	procArgs := kernel.CreateProcessArgs{
   654  		Argv:                 spec.Process.Args,
   655  		Envv:                 env,
   656  		WorkingDirectory:     wd,
   657  		Credentials:          creds,
   658  		Umask:                0022,
   659  		Limits:               ls,
   660  		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
   661  		UTSNamespace:         k.RootUTSNamespace(),
   662  		IPCNamespace:         k.RootIPCNamespace(),
   663  		ContainerID:          id,
   664  		PIDNamespace:         pidns,
   665  	}
   666  
   667  	return procArgs, nil
   668  }
   669  
   670  // Destroy cleans up all resources used by the loader.
   671  //
   672  // Note that this will block until all open control server connections have
   673  // been closed. For that reason, this should NOT be called in a defer, because
   674  // a panic in a control server rpc would then hang forever.
   675  func (l *Loader) Destroy() {
   676  	if l.stopSignalForwarding != nil {
   677  		l.stopSignalForwarding()
   678  	}
   679  	l.watchdog.Stop()
   680  
   681  	ctx := l.k.SupervisorContext()
   682  	for _, m := range l.sharedMounts {
   683  		m.DecRef(ctx)
   684  	}
   685  
   686  	// Stop the control server. This will indirectly stop any
   687  	// long-running control operations that are in flight, e.g.
   688  	// profiling operations.
   689  	l.ctrl.stop()
   690  
   691  	// Release all kernel resources. This is only safe after we can no longer
   692  	// save/restore.
   693  	l.k.Release()
   694  
   695  	// Release any dangling tcp connections.
   696  	tcpip.ReleaseDanglingEndpoints()
   697  
   698  	// In the success case, the FDs in l.root have already been released or
   699  	// closed, with their ownership passed to host file objects and gofer sessions.
   700  	// Close them here in case of failure.
   701  	for _, f := range l.root.stdioFDs {
   702  		_ = f.Close()
   703  	}
   704  	for _, f := range l.root.passFDs {
   705  		_ = f.host.Close()
   706  	}
   707  	for _, f := range l.root.goferFDs {
   708  		_ = f.Close()
   709  	}
   710  	for _, f := range l.root.goferFilestoreFDs {
   711  		_ = f.Close()
   712  	}
   713  	if l.root.devGoferFD != nil {
   714  		_ = l.root.devGoferFD.Close()
   715  	}
   716  
   717  	l.stopProfiling()
   718  	// Check all references.
   719  	refs.OnExit()
   720  }
   721  
   722  func createPlatform(conf *config.Config, deviceFile *fd.FD) (platform.Platform, error) {
   723  	p, err := platform.Lookup(conf.Platform)
   724  	if err != nil {
   725  		panic(fmt.Sprintf("invalid platform %s: %s", conf.Platform, err))
   726  	}
   727  	log.Infof("Platform: %s", conf.Platform)
   728  	return p.New(deviceFile)
   729  }
   730  
   731  func createMemoryFile() (*pgalloc.MemoryFile, error) {
   732  	const memfileName = "runsc-memory"
   733  	memfd, err := memutil.CreateMemFD(memfileName, 0)
   734  	if err != nil {
   735  		return nil, fmt.Errorf("error creating memfd: %w", err)
   736  	}
   737  	memfile := os.NewFile(uintptr(memfd), memfileName)
   738  	// We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
   739  	// there are memory cgroups specified, because at this point we're already
   740  	// in a mount namespace in which the relevant cgroupfs is not visible.
   741  	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{
   742  		EnforceMaximumAllocatable: true,
   743  	})
   744  	if err != nil {
   745  		_ = memfile.Close()
   746  		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err)
   747  	}
   748  	return mf, nil
   749  }
   750  
   751  // installSeccompFilters installs sandbox seccomp filters with the host.
   752  func (l *Loader) installSeccompFilters() error {
   753  	if l.PreSeccompCallback != nil {
   754  		l.PreSeccompCallback()
   755  	}
   756  	if l.root.conf.DisableSeccomp {
   757  		log.Warningf("*** SECCOMP WARNING: syscall filter is DISABLED. Running in less secure mode.")
   758  	} else {
   759  		hostnet := l.root.conf.Network == config.NetworkHost
   760  		opts := filter.Options{
   761  			Platform:              l.k.Platform.SeccompInfo(),
   762  			HostNetwork:           hostnet,
   763  			HostNetworkRawSockets: hostnet && l.root.conf.EnableRaw,
   764  			HostFilesystem:        l.root.conf.DirectFS,
   765  			ProfileEnable:         l.root.conf.ProfileEnable,
   766  			NVProxy:               specutils.NVProxyEnabled(l.root.spec, l.root.conf),
   767  			TPUProxy:              specutils.TPUProxyIsEnabled(l.root.spec, l.root.conf),
   768  			ControllerFD:          uint32(l.ctrl.srv.FD()),
   769  		}
   770  		if err := filter.Install(opts); err != nil {
   771  			return fmt.Errorf("installing seccomp filters: %w", err)
   772  		}
   773  	}
   774  	return nil
   775  }
   776  
   777  // Run runs the root container.
   778  func (l *Loader) Run() error {
   779  	err := l.run()
   780  	l.ctrl.manager.startResultChan <- err
   781  	if err != nil {
   782  		// Give the controller some time to send the error to the
   783  		// runtime. If we return too quickly here the process will exit
   784  		// and the control connection will be closed before the error
   785  		// is returned.
   786  		gtime.Sleep(2 * gtime.Second)
   787  		return err
   788  	}
   789  	return nil
   790  }
   791  
   792  func (l *Loader) run() error {
   793  	if l.root.conf.Network == config.NetworkHost {
   794  		// Delay host network configuration to this point because network namespace
   795  		// is configured after the loader is created and before Run() is called.
   796  		log.Debugf("Configuring host network")
   797  		s := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
   798  		if err := s.Configure(l.root.conf.EnableRaw); err != nil {
   799  			return err
   800  		}
   801  	}
   802  
   803  	l.mu.Lock()
   804  	defer l.mu.Unlock()
   805  
   806  	eid := execID{cid: l.sandboxID}
   807  	ep, ok := l.processes[eid]
   808  	if !ok {
   809  		return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
   810  	}
   811  
   812  	// If we are restoring, we do not want to create a process.
   813  	// l.restore is set by the container manager when a restore call is made.
   814  	if !l.restore {
   815  		if l.root.conf.ProfileEnable {
   816  			pprof.Initialize()
   817  		}
   818  
   819  		// Finally done with all configuration. Setup filters before user code
   820  		// is loaded.
   821  		if err := l.installSeccompFilters(); err != nil {
   822  			return err
   823  		}
   824  
   825  		// Create the root container init task. It will begin running
   826  		// when the kernel is started.
   827  		var (
   828  			tg  *kernel.ThreadGroup
   829  			err error
   830  		)
   831  		tg, ep.tty, err = l.createContainerProcess(&l.root)
   832  		if err != nil {
   833  			return err
   834  		}
   835  
   836  		if seccheck.Global.Enabled(seccheck.PointContainerStart) {
   837  			evt := pb.Start{
   838  				Id:       l.sandboxID,
   839  				Cwd:      l.root.spec.Process.Cwd,
   840  				Args:     l.root.spec.Process.Args,
   841  				Terminal: l.root.spec.Process.Terminal,
   842  			}
   843  			fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart)
   844  			if fields.Local.Contains(seccheck.FieldContainerStartEnv) {
   845  				evt.Env = l.root.spec.Process.Env
   846  			}
   847  			if !fields.Context.Empty() {
   848  				evt.ContextData = &pb.ContextData{}
   849  				kernel.LoadSeccheckData(tg.Leader(), fields.Context, evt.ContextData)
   850  			}
   851  			_ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   852  				return c.ContainerStart(context.Background(), fields, &evt)
   853  			})
   854  		}
   855  	}
   856  
   857  	ep.tg = l.k.GlobalInit()
   858  	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
   859  		ep.pidnsPath = ns.Path
   860  	}
   861  
   862  	// Handle signals by forwarding them to the root container process
   863  	// (except for panic signal, which should cause a panic).
   864  	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
   865  		// Panic signal should cause a panic.
   866  		if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
   867  			panic("Signal-induced panic")
   868  		}
   869  
   870  		// Otherwise forward to root container.
   871  		deliveryMode := DeliverToProcess
   872  		if l.root.spec.Process.Terminal {
   873  			// Since we are running with a console, we should forward the signal to
   874  			// the foreground process group so that job control signals like ^C can
   875  			// be handled properly.
   876  			deliveryMode = DeliverToForegroundProcessGroup
   877  		}
   878  		log.Infof("Received external signal %d, mode: %s", sig, deliveryMode)
   879  		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
   880  			log.Warningf("error sending signal %s to container %q: %s", sig, l.sandboxID, err)
   881  		}
   882  	})
   883  
   884  	log.Infof("Process should have started...")
   885  	l.watchdog.Start()
   886  	if err := l.k.Start(); err != nil {
   887  		return err
   888  	}
   889  	l.state = started
   890  	return nil
   891  }
   892  
   893  // createSubcontainer creates a new container inside the sandbox.
   894  func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error {
   895  	l.mu.Lock()
   896  	defer l.mu.Unlock()
   897  
   898  	eid := execID{cid: cid}
   899  	if _, ok := l.processes[eid]; ok {
   900  		return fmt.Errorf("container %q already exists", cid)
   901  	}
   902  	l.processes[eid] = &execProcess{hostTTY: tty}
   903  	return nil
   904  }
   905  
   906  // startSubcontainer starts a child container. Used FDs are either closed or
   907  // released. It's safe for the caller to close any remaining files upon
   908  // return.
   909  func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs, goferFilestoreFDs []*fd.FD, devGoferFD *fd.FD, goferMountConfs []GoferMountConf) error {
   910  	l.mu.Lock()
   911  	defer l.mu.Unlock()
   912  
   913  	ep := l.processes[execID{cid: cid}]
   914  	if ep == nil {
   915  		return fmt.Errorf("trying to start a deleted container %q", cid)
   916  	}
   917  
   918  	// Create credentials. We reuse the root user namespace because the
   919  	// sentry currently supports only 1 mount namespace, which is tied to a
   920  	// single user namespace. Thus we must run in the same user namespace
   921  	// to access mounts.
   922  	creds := getRootCredentials(spec, conf, l.k.RootUserNamespace())
   923  	if creds == nil {
   924  		return fmt.Errorf("getting root credentials")
   925  	}
   926  	var pidns *kernel.PIDNamespace
   927  	if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok {
   928  		if ns.Path != "" {
   929  			for _, p := range l.processes {
   930  				if ns.Path == p.pidnsPath {
   931  					log.Debugf("Joining PID namespace named %q", ns.Path)
   932  					pidns = p.tg.PIDNamespace()
   933  					break
   934  				}
   935  			}
   936  		}
   937  		if pidns == nil {
   938  			log.Warningf("PID namespace %q not found, running in new PID namespace", ns.Path)
   939  			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
   940  		}
   941  		ep.pidnsPath = ns.Path
   942  	} else {
   943  		pidns = l.k.RootPIDNamespace()
   944  	}
   945  
   946  	containerName := l.registerContainerLocked(spec, cid)
   947  	info := &containerInfo{
   948  		cid:                 cid,
   949  		containerName:       containerName,
   950  		conf:                conf,
   951  		spec:                spec,
   952  		goferFDs:            goferFDs,
   953  		devGoferFD:          devGoferFD,
   954  		goferFilestoreFDs:   goferFilestoreFDs,
   955  		goferMountConfs:     goferMountConfs,
   956  		nvidiaUVMDevMajor:   l.root.nvidiaUVMDevMajor,
   957  		nvidiaDriverVersion: l.root.nvidiaDriverVersion,
   958  	}
   959  	var err error
   960  	info.procArgs, err = createProcessArgs(cid, spec, conf, creds, l.k, pidns)
   961  	if err != nil {
   962  		return fmt.Errorf("creating new process: %w", err)
   963  	}
   964  
   965  	// Use stdios or TTY depending on the spec configuration.
   966  	if spec.Process.Terminal {
   967  		if l := len(stdioFDs); l != 0 {
   968  			return fmt.Errorf("using TTY, stdios not expected: %d", l)
   969  		}
   970  		if ep.hostTTY == nil {
   971  			return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?")
   972  		}
   973  		info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
   974  		ep.hostTTY = nil
   975  	} else {
   976  		info.stdioFDs = stdioFDs
   977  	}
   978  
   979  	var cu cleanup.Cleanup
   980  	defer cu.Clean()
   981  	if devGoferFD != nil {
   982  		cu.Add(func() {
   983  			// createContainerProcess() will consume devGoferFD and initialize a gofer
   984  			// connection. This connection is owned by l.k. In case of failure, we want
   985  			// to clean up this gofer connection so that the gofer process can exit.
   986  			l.k.RemoveDevGofer(containerName)
   987  		})
   988  	}
   989  
   990  	ep.tg, ep.tty, err = l.createContainerProcess(info)
   991  	if err != nil {
   992  		return err
   993  	}
   994  
   995  	if seccheck.Global.Enabled(seccheck.PointContainerStart) {
   996  		evt := pb.Start{
   997  			Id:       cid,
   998  			Cwd:      spec.Process.Cwd,
   999  			Args:     spec.Process.Args,
  1000  			Terminal: spec.Process.Terminal,
  1001  		}
  1002  		fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart)
  1003  		if fields.Local.Contains(seccheck.FieldContainerStartEnv) {
  1004  			evt.Env = spec.Process.Env
  1005  		}
  1006  		if !fields.Context.Empty() {
  1007  			evt.ContextData = &pb.ContextData{}
  1008  			kernel.LoadSeccheckData(ep.tg.Leader(), fields.Context, evt.ContextData)
  1009  		}
  1010  		_ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
  1011  			return c.ContainerStart(context.Background(), fields, &evt)
  1012  		})
  1013  	}
  1014  
  1015  	l.k.RegisterContainerName(cid, info.containerName)
  1016  	l.k.StartProcess(ep.tg)
  1017  	// No more failures from this point on.
  1018  	cu.Release()
  1019  	return nil
  1020  }
  1021  
  1022  // +checklocks:l.mu
  1023  func (l *Loader) createContainerProcess(info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileDescription, error) {
  1024  	// Create the FD map, which will set stdin, stdout, and stderr.
  1025  	ctx := info.procArgs.NewContext(l.k)
  1026  	fdTable, ttyFile, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.passFDs, info.spec.Process.User, info.containerName)
  1027  	if err != nil {
  1028  		return nil, nil, fmt.Errorf("importing fds: %w", err)
  1029  	}
  1030  	// CreateProcess takes a reference on fdTable if successful. We won't need
  1031  	// ours either way.
  1032  	info.procArgs.FDTable = fdTable
  1033  
  1034  	if info.execFD != nil {
  1035  		if info.procArgs.Filename != "" {
  1036  			return nil, nil, fmt.Errorf("process must either be started from a file or a filename, not both")
  1037  		}
  1038  		file, err := host.NewFD(ctx, l.k.HostMount(), info.execFD.FD(), &host.NewFDOptions{
  1039  			Readonly:     true,
  1040  			Savable:      true,
  1041  			VirtualOwner: true,
  1042  			UID:          auth.KUID(info.spec.Process.User.UID),
  1043  			GID:          auth.KGID(info.spec.Process.User.GID),
  1044  		})
  1045  		if err != nil {
  1046  			return nil, nil, err
  1047  		}
  1048  		defer file.DecRef(ctx)
  1049  		info.execFD.Release()
  1050  
  1051  		info.procArgs.File = file
  1052  	}
  1053  
  1054  	// Gofer FDs must be ordered and the first FD is always the rootfs.
  1055  	if len(info.goferFDs) < 1 {
  1056  		return nil, nil, fmt.Errorf("rootfs gofer FD not found")
  1057  	}
  1058  	l.startGoferMonitor(info)
  1059  
  1060  	if l.root.cid == l.sandboxID {
  1061  		// Mounts cgroups for all the controllers.
  1062  		if err := l.mountCgroupMounts(info.conf, info.procArgs.Credentials); err != nil {
  1063  			return nil, nil, err
  1064  		}
  1065  	}
  1066  	// We can share l.sharedMounts with containerMounter since l.mu is locked.
  1067  	// Hence, mntr must only be used within this function (while l.mu is locked).
  1068  	mntr := newContainerMounter(info, l.k, l.mountHints, l.sharedMounts, l.productName, l.sandboxID)
  1069  	if err := setupContainerVFS(ctx, info, mntr, &info.procArgs); err != nil {
  1070  		return nil, nil, err
  1071  	}
  1072  	defer func() {
  1073  		for cg := range info.procArgs.InitialCgroups {
  1074  			cg.Dentry.DecRef(ctx)
  1075  		}
  1076  	}()
  1077  
  1078  	// Add the HOME environment variable if it is not already set.
  1079  	info.procArgs.Envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
  1080  		info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
  1081  	if err != nil {
  1082  		return nil, nil, err
  1083  	}
  1084  
  1085  	// Create and start the new process.
  1086  	tg, _, err := l.k.CreateProcess(info.procArgs)
  1087  	if err != nil {
  1088  		return nil, nil, fmt.Errorf("creating process: %w", err)
  1089  	}
  1090  	// CreateProcess takes a reference on FDTable if successful.
  1091  	info.procArgs.FDTable.DecRef(ctx)
  1092  
  1093  	// Set the foreground process group on the TTY to the global init process
  1094  	// group, since that is what we are about to start running.
  1095  	if ttyFile != nil {
  1096  		ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
  1097  	}
  1098  
  1099  	// Install seccomp filters with the new task if there are any.
  1100  	if info.conf.OCISeccomp {
  1101  		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
  1102  			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
  1103  			if err != nil {
  1104  				return nil, nil, fmt.Errorf("building seccomp program: %w", err)
  1105  			}
  1106  
  1107  			if log.IsLogging(log.Debug) {
  1108  				out, _ := bpf.DecodeProgram(program)
  1109  				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
  1110  			}
  1111  
  1112  			task := tg.Leader()
  1113  			// NOTE: It seems Flags are ignored by runc so we ignore them too.
  1114  			if err := task.AppendSyscallFilter(program, true); err != nil {
  1115  				return nil, nil, fmt.Errorf("appending seccomp filters: %w", err)
  1116  			}
  1117  		}
  1118  	} else {
  1119  		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
  1120  			log.Warningf("Seccomp spec is being ignored")
  1121  		}
  1122  	}
  1123  
  1124  	return tg, ttyFile, nil
  1125  }
  1126  
  1127  // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
  1128  // the gofer FD looking for disconnects, and kills the container processes if
  1129  // the gofer connection disconnects.
  1130  func (l *Loader) startGoferMonitor(info *containerInfo) {
  1131  	// We need to pick a suitable gofer connection that is expected to be alive
  1132  	// for the entire container lifecycle. Only the following can be used:
  1133  	// 1. Rootfs gofer connection
  1134  	// 2. Device gofer connection
  1135  	//
  1136  	// Note that other gofer mounts are allowed to be unmounted and disconnected.
  1137  	goferFD := -1
  1138  	if info.goferMountConfs[0].ShouldUseLisafs() {
  1139  		goferFD = info.goferFDs[0].FD()
  1140  	} else if info.devGoferFD != nil {
  1141  		goferFD = info.devGoferFD.FD()
  1142  	}
  1143  	if goferFD < 0 {
  1144  		log.Warningf("could not find a suitable gofer FD to monitor")
  1145  		return
  1146  	}
  1147  	go func() {
  1148  		log.Debugf("Monitoring gofer health for container %q", info.cid)
  1149  		events := []unix.PollFd{
  1150  			{
  1151  				Fd:     int32(goferFD),
  1152  				Events: unix.POLLHUP | unix.POLLRDHUP,
  1153  			},
  1154  		}
  1155  		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
  1156  			// Use ppoll instead of poll because it's already allowed in seccomp.
  1157  			n, err := unix.Ppoll(events, nil, nil)
  1158  			return uintptr(n), 0, err
  1159  		})
  1160  		if err != nil {
  1161  			panic(fmt.Sprintf("Error monitoring gofer FDs: %s", err))
  1162  		}
  1163  
  1164  		l.mu.Lock()
  1165  		defer l.mu.Unlock()
  1166  
  1167  		// The gofer could have been stopped due to a normal container shutdown.
  1168  		// Check if the container has not stopped yet.
  1169  		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: info.cid}); tg != nil {
  1170  			log.Infof("Gofer socket disconnected, killing container %q", info.cid)
  1171  			if err := l.signalAllProcesses(info.cid, int32(linux.SIGKILL)); err != nil {
  1172  				log.Warningf("Error killing container %q after gofer stopped: %s", info.cid, err)
  1173  			}
  1174  		}
  1175  	}()
  1176  }
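
// The monitor above relies on poll(2) reporting POLLHUP/POLLRDHUP once the
// gofer's end of the socket is closed. A minimal standalone sketch of that
// pattern (sockFD is hypothetical; not part of the original file):
//
//	fds := []unix.PollFd{{Fd: int32(sockFD), Events: unix.POLLHUP | unix.POLLRDHUP}}
//	if _, err := unix.Ppoll(fds, nil, nil); err != nil {
//		// EINTR must be retried, hence specutils.RetryEintr above.
//	}
//	// A successful return means a requested or error event (e.g. POLLHUP) is
//	// set in fds[0].Revents, i.e. the peer has gone away.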
  1177  
  1178  // destroySubcontainer stops a container if it is still running and cleans up
  1179  // its filesystem.
  1180  func (l *Loader) destroySubcontainer(cid string) error {
  1181  	l.mu.Lock()
  1182  	defer l.mu.Unlock()
  1183  
  1184  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
  1185  	if err != nil {
  1186  		// Container doesn't exist.
  1187  		return err
  1188  	}
  1189  
  1190  	// The container exists, but has it been started?
  1191  	if tg != nil {
  1192  		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
  1193  			return fmt.Errorf("sending SIGKILL to all container processes: %w", err)
  1194  		}
  1195  		// Wait for all processes that belong to the container to exit (including
  1196  		// exec'd processes).
  1197  		for _, t := range l.k.TaskSet().Root.Tasks() {
  1198  			if t.ContainerID() == cid {
  1199  				t.ThreadGroup().WaitExited()
  1200  			}
  1201  		}
  1202  	}
  1203  
  1204  	// No more failures from this point on.
  1205  
  1206  	// Remove all container thread groups from the map.
  1207  	for key := range l.processes {
  1208  		if key.cid == cid {
  1209  			delete(l.processes, key)
  1210  		}
  1211  	}
  1212  	// Cleanup the device gofer.
  1213  	l.k.RemoveDevGofer(l.k.ContainerName(cid))
  1214  
  1215  	log.Debugf("Container destroyed, cid: %s", cid)
  1216  	return nil
  1217  }
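
// For context (not part of the original file): the container manager drives
// the subcontainer methods above roughly in this order via its control RPCs:
//
//	l.createSubcontainer(cid, tty)                 // create: register the exec ID (and TTY, if any)
//	l.startSubcontainer(spec, conf, cid, stdioFDs, // start: create and start the init process
//		goferFDs, filestoreFDs, devGoferFD, mountConfs)
//	l.destroySubcontainer(cid)                     // destroy: kill and reap remaining processes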
  1218  
  1219  func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
  1220  	// Hold the lock for the entire operation to ensure that the exec'd process
  1221  	// is added to 'processes' in case it races with destroyContainer().
  1222  	l.mu.Lock()
  1223  	defer l.mu.Unlock()
  1224  
  1225  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
  1226  	if err != nil {
  1227  		return 0, err
  1228  	}
  1229  	if tg == nil {
  1230  		return 0, fmt.Errorf("container %q not started", args.ContainerID)
  1231  	}
  1232  
  1233  	// Get the container MountNamespace from the Task. Trying to acquire a ref
  1234  	// may fail if it races with task exit.
  1235  	// task.MountNamespace() does not take a ref, so we must do so ourselves.
  1236  	args.MountNamespace = tg.Leader().MountNamespace()
  1237  	if args.MountNamespace == nil || !args.MountNamespace.TryIncRef() {
  1238  		return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
  1239  	}
  1240  
  1241  	args.Envv, err = specutils.ResolveEnvs(args.Envv)
  1242  	if err != nil {
  1243  		return 0, fmt.Errorf("resolving env: %w", err)
  1244  	}
  1245  
  1246  	// Add the HOME environment variable if it is not already set.
  1247  	sctx := l.k.SupervisorContext()
  1248  	root := args.MountNamespace.Root(sctx)
  1249  	defer root.DecRef(sctx)
  1250  	ctx := vfs.WithRoot(sctx, root)
  1251  	defer args.MountNamespace.DecRef(ctx)
  1252  	args.Envv, err = user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
  1253  	if err != nil {
  1254  		return 0, err
  1255  	}
  1256  	args.PIDNamespace = tg.PIDNamespace()
  1257  
  1258  	args.Limits, err = createLimitSet(l.root.spec, specutils.TPUProxyIsEnabled(l.root.spec, l.root.conf))
  1259  	if err != nil {
  1260  		return 0, fmt.Errorf("creating limits: %w", err)
  1261  	}
  1262  
  1263  	// Start the process.
  1264  	proc := control.Proc{Kernel: l.k}
  1265  	newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args)
  1266  	if err != nil {
  1267  		return 0, err
  1268  	}
  1269  
  1270  	eid := execID{cid: args.ContainerID, pid: tgid}
  1271  	l.processes[eid] = &execProcess{
  1272  		tg:  newTG,
  1273  		tty: ttyFile,
  1274  	}
  1275  	log.Debugf("updated processes: %v", l.processes)
  1276  
  1277  	return tgid, nil
  1278  }
  1279  
  1280  // waitContainer waits for the init process of a container to exit.
  1281  func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
  1282  	// Don't defer unlock, as doing so would make it impossible for
  1283  	// multiple clients to wait on the same container.
  1284  	key := execID{cid: cid}
  1285  	tg, err := l.threadGroupFromID(key)
  1286  	if err != nil {
  1287  		l.mu.Lock()
  1288  		// Extra handling is needed if the container is restoring.
  1289  		if l.state != restoring {
  1290  			l.mu.Unlock()
  1291  			return err
  1292  		}
  1293  		// The container could be restoring; first check whether it exists.
  1294  		if _, err := l.findProcessLocked(key); err != nil {
  1295  			l.mu.Unlock()
  1296  			return err
  1297  		}
  1298  		log.Infof("Waiting for container being restored, CID: %q", cid)
  1299  		l.restoreWaiters.Wait()
  1300  		l.mu.Unlock()
  1301  
  1302  		log.Infof("Restore is completed, trying to wait for container %q again.", cid)
  1303  		return l.waitContainer(cid, waitStatus)
  1304  	}
  1305  
  1306  	// If the thread either has already exited or exits during waiting,
  1307  	// consider the container exited.
  1308  	ws := l.wait(tg)
  1309  	*waitStatus = ws
  1310  
  1311  	// Check for leaks and write coverage report after the root container has
  1312  	// exited. This guarantees that the report is written in cases where the
  1313  	// sandbox is killed by a signal after the ContMgrWait request is completed.
  1314  	if l.root.procArgs.ContainerID == cid {
  1315  		// All sentry-created resources should have been released at this point.
  1316  		_ = coverage.Report()
  1317  	}
  1318  	return nil
  1319  }
  1320  
  1321  func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
  1322  	if tgid <= 0 {
  1323  		return fmt.Errorf("PID (%d) must be positive", tgid)
  1324  	}
  1325  
  1326  	// Try to find a process that was exec'd
  1327  	eid := execID{cid: cid, pid: tgid}
  1328  	execTG, err := l.threadGroupFromID(eid)
  1329  	if err == nil {
  1330  		ws := l.wait(execTG)
  1331  		*waitStatus = ws
  1332  
  1333  		l.mu.Lock()
  1334  		delete(l.processes, eid)
  1335  		log.Debugf("updated processes (removal): %v", l.processes)
  1336  		l.mu.Unlock()
  1337  		return nil
  1338  	}
  1339  
  1340  	// The caller may be waiting on a process not started directly via exec.
  1341  	// In this case, find the process in the container's PID namespace.
  1342  	initTG, err := l.threadGroupFromID(execID{cid: cid})
  1343  	if err != nil {
  1344  		return fmt.Errorf("waiting for PID %d: %w", tgid, err)
  1345  	}
  1346  	tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
  1347  	if tg == nil {
  1348  		return fmt.Errorf("waiting for PID %d: no such process", tgid)
  1349  	}
  1350  	if tg.Leader().ContainerID() != cid {
  1351  		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
  1352  	}
  1353  	ws := l.wait(tg)
  1354  	*waitStatus = ws
  1355  	return nil
  1356  }
  1357  
  1358  // wait waits for the given thread group to exit and returns its raw exit
  1359  // status.
  1360  func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
  1361  	tg.WaitExited()
  1362  	return uint32(tg.ExitStatus())
  1363  }
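
// The uint32 returned by wait is the raw Linux wait status. Outside the sentry
// it can be decoded with the standard encoding; a hedged sketch (raw is
// hypothetical, and this assumes the usual wait-status layout):
//
//	ws := syscall.WaitStatus(raw)
//	switch {
//	case ws.Exited():
//		log.Infof("exit code: %d", ws.ExitStatus())
//	case ws.Signaled():
//		log.Infof("killed by signal: %v", ws.Signal())
//	}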
  1364  
  1365  // WaitForStartSignal waits for a start signal from the control server.
  1366  func (l *Loader) WaitForStartSignal() {
  1367  	<-l.ctrl.manager.startChan
  1368  }
  1369  
  1370  // WaitExit waits for the root container to exit, and returns its exit status.
  1371  func (l *Loader) WaitExit() linux.WaitStatus {
  1372  	// Wait for container.
  1373  	l.k.WaitExited()
  1374  
  1375  	return l.k.GlobalInit().ExitStatus()
  1376  }
  1377  
  1378  func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID, userns *auth.UserNamespace) (*inet.Namespace, error) {
  1379  	// Create an empty network stack because the network namespace may be empty at
  1380  	// this point. Netns is configured before Run() is called. Netstack is
  1381  	// configured using a control uRPC message. Host network is configured inside
  1382  	// Run().
  1383  	switch conf.Network {
  1384  	case config.NetworkHost:
  1385  		// If configured for raw socket support with the host network
  1386  		// stack, make sure that we have CAP_NET_RAW on the host;
  1387  		// otherwise we can't make raw sockets.
  1388  		if conf.EnableRaw && !specutils.HasCapabilities(capability.CAP_NET_RAW) {
  1389  			return nil, fmt.Errorf("configuring network=host with raw sockets requires CAP_NET_RAW capability")
  1390  		}
  1391  		// No network namespacing support for hostinet yet, hence creator is nil.
  1392  		return inet.NewRootNamespace(hostinet.NewStack(), nil, userns), nil
  1393  
  1394  	case config.NetworkNone, config.NetworkSandbox:
  1395  		s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite)
  1396  		if err != nil {
  1397  			return nil, err
  1398  		}
  1399  		creator := &sandboxNetstackCreator{
  1400  			clock:                    clock,
  1401  			uniqueID:                 uniqueID,
  1402  			allowPacketEndpointWrite: conf.AllowPacketEndpointWrite,
  1403  		}
  1404  		return inet.NewRootNamespace(s, creator, userns), nil
  1405  
  1406  	default:
  1407  		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
  1408  	}
  1409  
  1410  }
  1411  
  1412  func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) {
  1413  	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
  1414  	transProtos := []stack.TransportProtocolFactory{
  1415  		tcp.NewProtocol,
  1416  		udp.NewProtocol,
  1417  		icmp.NewProtocol4,
  1418  		icmp.NewProtocol6,
  1419  	}
  1420  	s := netstack.Stack{Stack: stack.New(stack.Options{
  1421  		NetworkProtocols:   netProtos,
  1422  		TransportProtocols: transProtos,
  1423  		Clock:              clock,
  1424  		Stats:              netstack.Metrics,
  1425  		HandleLocal:        true,
  1426  		// Enable raw sockets for users with sufficient
  1427  		// privileges.
  1428  		RawFactory:               raw.EndpointFactory{},
  1429  		AllowPacketEndpointWrite: allowPacketEndpointWrite,
  1430  		UniqueID:                 uniqueID,
  1431  		DefaultIPTables:          netfilter.DefaultLinuxTables,
  1432  	})}
  1433  
  1434  	// Enable SACK Recovery.
  1435  	{
  1436  		opt := tcpip.TCPSACKEnabled(true)
  1437  		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
  1438  			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
  1439  		}
  1440  	}
  1441  
  1442  	// Set default TTLs as required by socket/netstack.
  1443  	{
  1444  		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
  1445  		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
  1446  			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
  1447  		}
  1448  		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
  1449  			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
  1450  		}
  1451  	}
  1452  
  1453  	// Enable Receive Buffer Auto-Tuning.
  1454  	{
  1455  		opt := tcpip.TCPModerateReceiveBufferOption(true)
  1456  		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
  1457  			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
  1458  		}
  1459  	}
  1460  
  1461  	return &s, nil
  1462  }
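
        // Further stack-wide defaults can be applied with the same
        // Set{Transport,Network}ProtocolOption pattern used in
        // newEmptySandboxNetworkStack above. A hedged sketch, assuming
        // tcpip.CongestionControlOption (which selects the TCP congestion
        // control algorithm by name):
        //
        //	opt := tcpip.CongestionControlOption("cubic")
        //	if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
        //		return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%s)): %s", tcp.ProtocolNumber, opt, opt, err)
        //	}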
  1463  
  1464  // sandboxNetstackCreator implements kernel.NetworkStackCreator.
  1465  //
  1466  // +stateify savable
  1467  type sandboxNetstackCreator struct {
  1468  	clock                    tcpip.Clock
  1469  	uniqueID                 stack.UniqueID
  1470  	allowPacketEndpointWrite bool
  1471  }
  1472  
  1473  // CreateStack implements kernel.NetworkStackCreator.CreateStack.
  1474  func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
  1475  	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite)
  1476  	if err != nil {
  1477  		return nil, err
  1478  	}
  1479  
  1480  	// Set up loopback.
  1481  	n := &Network{Stack: s.(*netstack.Stack).Stack}
  1482  	nicID := tcpip.NICID(f.uniqueID.UniqueID())
  1483  	link := DefaultLoopbackLink
  1484  	linkEP := ethernet.New(loopback.New())
  1485  	opts := stack.NICOptions{
  1486  		Name:               link.Name,
  1487  		DeliverLinkPackets: true,
  1488  	}
  1489  
  1490  	if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
  1491  		return nil, err
  1492  	}
  1493  
  1494  	return s, nil
  1495  }
  1496  
  1497  // signal sends a signal to one or more processes in a container. If PID is 0,
  1498  // then the container init process is used. Depending on the SignalDeliveryMode
  1499  // option, the signal may be sent directly to the indicated process, to all
  1500  // processes in the container, or to the foreground process group. pid is
  1501  // relative to the root PID namespace, not the container's.
  1502  func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
  1503  	if pid < 0 {
  1504  		return fmt.Errorf("PID (%d) must be positive", pid)
  1505  	}
  1506  
  1507  	switch mode {
  1508  	case DeliverToProcess:
  1509  		if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
  1510  			return fmt.Errorf("signaling process in container %q PID %d: %w", cid, pid, err)
  1511  		}
  1512  		return nil
  1513  
  1514  	case DeliverToForegroundProcessGroup:
  1515  		if err := l.signalForegroundProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
  1516  			return fmt.Errorf("signaling foreground process group in container %q PID %d: %w", cid, pid, err)
  1517  		}
  1518  		return nil
  1519  
  1520  	case DeliverToAllProcesses:
  1521  		if pid != 0 {
  1522  			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
  1523  		}
  1524  		// Check that the container has actually started before signaling it.
  1525  		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
  1526  			return err
  1527  		}
  1528  		if err := l.signalAllProcesses(cid, signo); err != nil {
  1529  			return fmt.Errorf("signaling all processes in container %q: %w", cid, err)
  1530  		}
  1531  		return nil
  1532  
  1533  	default:
  1534  		panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
  1535  	}
  1536  }
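
        // For orientation, a hedged sketch of how the three delivery modes map to
        // typical callers (signal numbers and call sites illustrative only):
        //
        //	// Kill the whole container (pid must be 0).
        //	_ = l.signal(cid, 0, int32(unix.SIGKILL), DeliverToAllProcesses)
        //	// Signal one specific process.
        //	_ = l.signal(cid, pid, int32(unix.SIGTERM), DeliverToProcess)
        //	// Forward an interactive Ctrl-C to whatever owns the TTY.
        //	_ = l.signal(cid, pid, int32(unix.SIGINT), DeliverToForegroundProcessGroup)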
  1537  
  1538  // signalProcess sends a signal to a process in the given container. tgid is
  1539  // relative to the root PID namespace, not the container's.
  1540  func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
  1541  	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
  1542  	if err == nil {
  1543  		// Send signal directly to the identified process.
  1544  		return l.k.SendExternalSignalThreadGroup(execTG, &linux.SignalInfo{Signo: signo})
  1545  	}
  1546  
  1547  	// The caller may be signaling a process not started directly via exec.
  1548  	// In this case, find the process and check that the process belongs to the
  1549  	// container in question.
  1550  	tg := l.k.RootPIDNamespace().ThreadGroupWithID(tgid)
  1551  	if tg == nil {
  1552  		return fmt.Errorf("no such process with PID %d", tgid)
  1553  	}
  1554  	if tg.Leader().ContainerID() != cid {
  1555  		return fmt.Errorf("process %d belongs to a different container: %q", tgid, tg.Leader().ContainerID())
  1556  	}
  1557  	return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
  1558  }
  1559  
  1560  // signalForegroundProcessGroup looks up the foreground process group from the
  1561  // TTY for the given "tgid" inside container "cid", and sends the signal to it.
  1562  func (l *Loader) signalForegroundProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
  1563  	l.mu.Lock()
  1564  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
  1565  	if err != nil {
  1566  		l.mu.Unlock()
  1567  		return fmt.Errorf("no thread group found: %w", err)
  1568  	}
  1569  	if tg == nil {
  1570  		l.mu.Unlock()
  1571  		return fmt.Errorf("container %q not started", cid)
  1572  	}
  1573  
  1574  	tty, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
  1575  	l.mu.Unlock()
  1576  	if err != nil {
  1577  		return fmt.Errorf("no thread group found: %w", err)
  1578  	}
  1579  	if tty == nil {
  1580  		return fmt.Errorf("no TTY attached")
  1581  	}
  1582  	pg := tty.ForegroundProcessGroup()
  1583  	si := &linux.SignalInfo{Signo: signo}
  1584  	if pg == nil {
  1585  		// No foreground process group has been set. Signal the
  1586  		// original thread group.
  1587  		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
  1588  		return l.k.SendExternalSignalThreadGroup(tg, si)
  1589  	}
  1590  	// Send the signal to all processes in the process group.
  1591  	return l.k.SendExternalSignalProcessGroup(pg, si)
  1592  }
  1593  
  1594  // signalAllProcesses sends a signal to all processes belonging to the specified
  1595  // container. It's a no-op if the container hasn't started or has exited.
  1596  func (l *Loader) signalAllProcesses(cid string, signo int32) error {
  1597  	// Pause the kernel to prevent new processes from being created while
  1598  	// the signal is delivered. This prevents process leaks when SIGKILL is
  1599  	// sent to the entire container.
  1600  	l.k.Pause()
  1601  	defer l.k.Unpause()
  1602  	return l.k.SendContainerSignal(cid, &linux.SignalInfo{Signo: signo})
  1603  }
  1604  
  1605  // threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
  1606  // acquires the mutex before calling it and fails if the container hasn't
  1607  // started yet.
  1608  func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
  1609  	l.mu.Lock()
  1610  	defer l.mu.Unlock()
  1611  	tg, err := l.tryThreadGroupFromIDLocked(key)
  1612  	if err != nil {
  1613  		return nil, err
  1614  	}
  1615  	if tg == nil {
  1616  		return nil, fmt.Errorf("container %q not started", key.cid)
  1617  	}
  1618  	return tg, nil
  1619  }
  1620  
  1621  // tryThreadGroupFromIDLocked returns the thread group for the given execution
  1622  // ID. It may return nil if the container has not started yet. Returns an error
  1623  // if the execution ID is invalid or if the container cannot be found (maybe it
  1624  // has been deleted). Caller must hold 'mu'.
  1625  func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
  1626  	ep, err := l.findProcessLocked(key)
  1627  	if err != nil {
  1628  		return nil, err
  1629  	}
  1630  	return ep.tg, nil
  1631  }
  1632  
  1633  // ttyFromIDLocked returns the TTY files for the given execution ID. It may
  1634  // return nil if the container has not started yet. Returns an error if the
  1635  // execution ID is invalid or if the container cannot be found (maybe it has
  1636  // been deleted). Caller must hold 'mu'.
  1637  func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileDescription, error) {
  1638  	ep, err := l.findProcessLocked(key)
  1639  	if err != nil {
  1640  		return nil, err
  1641  	}
  1642  	return ep.tty, nil
  1643  }
  1644  
  1645  func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, passFDs []fdMapping, user specs.User, containerName string) (*kernel.FDTable, *host.TTYFileDescription, error) {
  1646  	if len(stdioFDs) != 3 {
  1647  		return nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
  1648  	}
  1649  	fdMap := map[int]*fd.FD{
  1650  		0: stdioFDs[0],
  1651  		1: stdioFDs[1],
  1652  		2: stdioFDs[2],
  1653  	}
  1654  
  1655  	// Create the entries for the host files that were passed to our app.
  1656  	for _, customFD := range passFDs {
  1657  		if customFD.guest < 0 {
  1658  			return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater")
  1659  		}
  1660  		fdMap[customFD.guest] = customFD.host
  1661  	}
  1662  
  1663  	k := kernel.KernelFromContext(ctx)
  1664  	fdTable := k.NewFDTable()
  1665  	ttyFile, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), fdMap, containerName)
  1666  	if err != nil {
  1667  		fdTable.DecRef(ctx)
  1668  		return nil, nil, err
  1669  	}
  1670  	return fdTable, ttyFile, nil
  1671  }
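
        // A hedged sketch of how a caller typically wires this up when creating a
        // container process (names and arguments illustrative only):
        //
        //	fdTable, ttyFile, err := createFDTable(ctx, spec.Process.Terminal, stdioFDs, passFDs, spec.Process.User, containerName)
        //	if err != nil {
        //		return fmt.Errorf("importing fds: %w", err)
        //	}
        //	// The caller drops its reference when it no longer needs the table.
        //	defer fdTable.DecRef(ctx)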
  1672  
  1673  // portForward initiates a port forwarding connection in the sandbox. Each
  1674  // proxy in portForwardProxies wraps two connections, each copying to the other
  1675  // (read ends to write ends) in goroutines. The proxies are stored and can be
  1676  // cleaned up, or clean up after themselves if the connection is broken.
  1677  func (l *Loader) portForward(opts *PortForwardOpts) error {
  1678  	// Validate that we have a stream FD to write to. If this check fails,
  1679  	// it means there is a misbehaved urpc client or a bug has occurred.
  1680  	if len(opts.Files) != 1 {
  1681  		return fmt.Errorf("stream FD is required for port forward")
  1682  	}
  1683  
  1684  	l.mu.Lock()
  1685  	defer l.mu.Unlock()
  1686  
  1687  	cid := opts.ContainerID
  1688  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
  1689  	if err != nil {
  1690  		return fmt.Errorf("failed to get threadgroup from %q: %w", cid, err)
  1691  	}
  1692  	if tg == nil {
  1693  		return fmt.Errorf("container %q not started", cid)
  1694  	}
  1695  
  1696  	// Import the fd for the UDS.
  1697  	ctx := l.k.SupervisorContext()
  1698  	fd, err := l.importFD(ctx, opts.Files[0])
  1699  	if err != nil {
  1700  		return fmt.Errorf("importing stream fd: %w", err)
  1701  	}
  1702  	cu := cleanup.Make(func() { fd.DecRef(ctx) })
  1703  	defer cu.Clean()
  1704  
  1705  	fdConn := pf.NewFileDescriptionConn(fd)
  1706  
  1707  	// Create a proxy to forward data between the fdConn and the sandboxed application.
  1708  	pair := pf.ProxyPair{To: fdConn}
  1709  
  1710  	switch l.root.conf.Network {
  1711  	case config.NetworkSandbox:
  1712  		stack := l.k.RootNetworkNamespace().Stack().(*netstack.Stack).Stack
  1713  		nsConn, err := pf.NewNetstackConn(stack, opts.Port)
  1714  		if err != nil {
  1715  			return fmt.Errorf("creating netstack port forward connection: %w", err)
  1716  		}
  1717  		pair.From = nsConn
  1718  	case config.NetworkHost:
  1719  		hConn, err := pf.NewHostInetConn(opts.Port)
  1720  		if err != nil {
  1721  			return fmt.Errorf("creating hostinet port forward connection: %w", err)
  1722  		}
  1723  		pair.From = hConn
  1724  	default:
  1725  		return fmt.Errorf("unsupported network type %q for container %q", l.root.conf.Network, cid)
  1726  	}
  1727  	cu.Release()
  1728  	proxy := pf.NewProxy(pair, opts.ContainerID)
  1729  
  1730  	// Add to the list of port forward connections and remove when the
  1731  	// connection closes.
  1732  	l.portForwardProxies = append(l.portForwardProxies, proxy)
  1733  	proxy.AddCleanup(func() {
  1734  		l.mu.Lock()
  1735  		defer l.mu.Unlock()
  1736  		for i := range l.portForwardProxies {
  1737  			if l.portForwardProxies[i] == proxy {
  1738  				l.portForwardProxies = append(l.portForwardProxies[:i], l.portForwardProxies[i+1:]...)
  1739  				break
  1740  			}
  1741  		}
  1742  	})
  1743  
  1744  	// Start forwarding on the connection.
  1745  	proxy.Start(ctx)
  1746  	return nil
  1747  }
  1748  
  1749  // importFD generically imports a host file descriptor without adding it to any
  1750  // fd table.
  1751  func (l *Loader) importFD(ctx context.Context, f *os.File) (*vfs.FileDescription, error) {
  1752  	hostFD, err := fd.NewFromFile(f)
  1753  	if err != nil {
  1754  		return nil, err
  1755  	}
  1756  	defer hostFD.Close()
  1757  	fd, err := host.NewFD(ctx, l.k.HostMount(), hostFD.FD(), &host.NewFDOptions{
  1758  		Savable:      false, // We disconnect and close on save.
  1759  		IsTTY:        false,
  1760  		VirtualOwner: false, // FD not visible to the sandboxed app so user can't be changed.
  1761  	})
  1762  
  1763  	if err != nil {
  1764  		return nil, err
  1765  	}
  1766  	hostFD.Release()
  1767  	return fd, nil
  1768  }
  1769  
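        // containerCount returns the number of containers currently tracked by the
        // loader. Each container is represented by its init process, registered in
        // l.processes with pid == 0.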
  1770  func (l *Loader) containerCount() int {
  1771  	l.mu.Lock()
  1772  	defer l.mu.Unlock()
  1773  
  1774  	containers := 0
  1775  	for id := range l.processes {
  1776  		if id.pid == 0 {
  1777  			// pid==0 represents the init process of a container. There is
  1778  			// only one such process per container.
  1779  			containers++
  1780  		}
  1781  	}
  1782  	return containers
  1783  }
  1784  
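        // pidsCount returns the number of tasks running in the given container. It
        // returns an error if the container cannot be found.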
  1785  func (l *Loader) pidsCount(cid string) (int, error) {
  1786  	l.mu.Lock()
  1787  	defer l.mu.Unlock()
  1788  
  1789  	if _, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}); err != nil {
  1790  		// Container doesn't exist.
  1791  		return 0, err
  1792  	}
  1793  	return l.k.TaskSet().Root.NumTasksPerContainer(cid), nil
  1794  }
  1795  
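        // networkStats collects per-interface device counters from the root network
        // namespace's stack.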
  1796  func (l *Loader) networkStats() ([]*NetworkInterface, error) {
  1797  	var stats []*NetworkInterface
  1798  	stack := l.k.RootNetworkNamespace().Stack()
  1799  	for _, i := range stack.Interfaces() {
  1800  		var stat inet.StatDev
  1801  		if err := stack.Statistics(&stat, i.Name); err != nil {
  1802  			return nil, err
  1803  		}
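        		// inet.StatDev follows the /proc/net/dev column order: indices 0-7 are
        		// the receive counters (bytes, packets, errs, drop, fifo, frame,
        		// compressed, multicast) and 8-15 are the transmit counters (bytes,
        		// packets, errs, drop, fifo, colls, carrier, compressed).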
  1804  		stats = append(stats, &NetworkInterface{
  1805  			Name:      i.Name,
  1806  			RxBytes:   stat[0],
  1807  			RxPackets: stat[1],
  1808  			RxErrors:  stat[2],
  1809  			RxDropped: stat[3],
  1810  			TxBytes:   stat[8],
  1811  			TxPackets: stat[9],
  1812  			TxErrors:  stat[10],
  1813  			TxDropped: stat[11],
  1814  		})
  1815  	}
  1816  	return stats, nil
  1817  }
  1818  
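        // findProcessLocked returns the execProcess registered for the given
        // execution ID. Caller must hold 'mu'.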
  1819  func (l *Loader) findProcessLocked(key execID) (*execProcess, error) {
  1820  	ep := l.processes[key]
  1821  	if ep == nil {
  1822  		return nil, fmt.Errorf("container %q not found", key.cid)
  1823  	}
  1824  	return ep, nil
  1825  }
  1826  
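        // registerContainer records the container's name (taken from the spec, or a
        // generated placeholder when the spec has none) and maps it to the container
        // ID. It returns the name that was registered.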
  1827  func (l *Loader) registerContainer(spec *specs.Spec, cid string) string {
  1828  	l.mu.Lock()
  1829  	defer l.mu.Unlock()
  1830  
  1831  	return l.registerContainerLocked(spec, cid)
  1832  }
  1833  
  1834  func (l *Loader) registerContainerLocked(spec *specs.Spec, cid string) string {
  1835  	containerName := specutils.ContainerName(spec)
  1836  	if len(containerName) == 0 {
  1837  		// If no name was provided, require containers to be restored in the same order
  1838  		// they were created.
  1839  		containerName = "__no_name_" + strconv.Itoa(len(l.containerIDs))
  1840  	}
  1841  
  1842  	l.containerIDs[containerName] = cid
  1843  	return containerName
  1844  }
  1845  
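        // containerRuntimeState reports whether the container identified by cid is
        // still being created, is running, or has stopped. An unknown cid is
        // reported as stopped, since it cannot be distinguished from a container
        // that already exited.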
  1846  func (l *Loader) containerRuntimeState(cid string) ContainerRuntimeState {
  1847  	l.mu.Lock()
  1848  	defer l.mu.Unlock()
  1849  	exec, ok := l.processes[execID{cid: cid}]
  1850  	if !ok {
  1851  		// Can't distinguish between invalid CID and stopped container, assume that
  1852  		// CID is valid.
  1853  		return RuntimeStateStopped
  1854  	}
  1855  	if exec.tg == nil {
  1856  		// Container has no thread group assigned, so it has not started yet.
  1857  		return RuntimeStateCreating
  1858  	}
  1859  	if exec.tg.Leader().ExitState() == kernel.TaskExitNone {
  1860  		// Init process is still running.
  1861  		return RuntimeStateRunning
  1862  	}
  1863  	// Init process has stopped, but no one has called wait on it yet.
  1864  	return RuntimeStateStopped
  1865  }