github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/runsc/boot/loader.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package boot loads the kernel and runs a container.
    16  package boot
    17  
    18  import (
    19  	"errors"
    20  	"fmt"
    21  	mrand "math/rand"
    22  	"os"
    23  	"runtime"
    24  	gtime "time"
    25  
    26  	specs "github.com/opencontainers/runtime-spec/specs-go"
    27  	"github.com/syndtr/gocapability/capability"
    28  	"golang.org/x/sys/unix"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    30  	"github.com/nicocha30/gvisor-ligolo/pkg/bpf"
    31  	"github.com/nicocha30/gvisor-ligolo/pkg/cleanup"
    32  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    33  	"github.com/nicocha30/gvisor-ligolo/pkg/coverage"
    34  	"github.com/nicocha30/gvisor-ligolo/pkg/cpuid"
    35  	"github.com/nicocha30/gvisor-ligolo/pkg/fd"
    36  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    37  	"github.com/nicocha30/gvisor-ligolo/pkg/memutil"
    38  	"github.com/nicocha30/gvisor-ligolo/pkg/rand"
    39  	"github.com/nicocha30/gvisor-ligolo/pkg/refs"
    40  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/control"
    41  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fdimport"
    42  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/host"
    43  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/tmpfs"
    44  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/user"
    45  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/inet"
    46  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    47  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    48  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/loader"
    49  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
    50  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
    51  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck"
    52  	pb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck/points/points_go_proto"
    53  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netfilter"
    54  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/time"
    55  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
    56  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    57  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/watchdog"
    58  	"github.com/nicocha30/gvisor-ligolo/pkg/sighandling"
    59  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    60  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip"
    61  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/link/ethernet"
    62  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/link/loopback"
    63  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/link/packetsocket"
    64  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/link/sniffer"
    65  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/network/arp"
    66  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/network/ipv4"
    67  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/network/ipv6"
    68  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack"
    69  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport/icmp"
    70  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport/raw"
    71  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport/tcp"
    72  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport/udp"
    73  	"github.com/nicocha30/gvisor-ligolo/runsc/boot/filter"
    74  	_ "github.com/nicocha30/gvisor-ligolo/runsc/boot/platforms" // register all platforms.
    75  	pf "github.com/nicocha30/gvisor-ligolo/runsc/boot/portforward"
    76  	"github.com/nicocha30/gvisor-ligolo/runsc/boot/pprof"
    77  	"github.com/nicocha30/gvisor-ligolo/runsc/config"
    78  	"github.com/nicocha30/gvisor-ligolo/runsc/profile"
    79  	"github.com/nicocha30/gvisor-ligolo/runsc/specutils"
    80  	"github.com/nicocha30/gvisor-ligolo/runsc/specutils/seccomp"
    81  
    82  	// Top-level inet providers.
    83  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/hostinet"
    84  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netstack"
    85  
    86  	// Include other supported socket providers.
    87  	_ "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netlink"
    88  	_ "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netlink/route"
    89  	_ "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netlink/uevent"
    90  	_ "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix"
    91  )
    92  
    93  type containerInfo struct {
    94  	conf *config.Config
    95  
    96  	// spec is the base configuration for the root container.
    97  	spec *specs.Spec
    98  
    99  	// procArgs are the arguments used to create the container's init process.
   100  	procArgs kernel.CreateProcessArgs
   101  
   102  	// stdioFDs contains stdin, stdout, and stderr.
   103  	stdioFDs []*fd.FD
   104  
   105  	// passFDs are mappings of user-supplied host to guest file descriptors.
   106  	passFDs []fdMapping
   107  
   108  	// execFD is the host file descriptor used for program execution.
   109  	execFD *fd.FD
   110  
   111  	// goferFDs are the FDs that attach the sandbox to the gofers.
   112  	goferFDs []*fd.FD
   113  
   114  	// overlayFilestoreFDs are the FDs to the regular files that will back the
   115  	// tmpfs upper mount in the overlay mounts.
   116  	overlayFilestoreFDs []*fd.FD
   117  
   118  	// overlayMediums contains information about how the gofer mounts have been
   119  	// overlaid. The first entry is for rootfs and the following entries are for
   120  	// bind mounts in spec.Mounts (in the same order).
   121  	overlayMediums []OverlayMedium
   122  
   123  	// nvidiaUVMDevMajor is the device major number used for nvidia-uvm.
   124  	nvidiaUVMDevMajor uint32
   125  }
   126  
   127  // Loader keeps state needed to start the kernel and run the container.
   128  type Loader struct {
   129  	// k is the kernel.
   130  	k *kernel.Kernel
   131  
   132  	// ctrl is the control server.
   133  	ctrl *controller
   134  
   135  	// root contains information about the root container in the sandbox.
   136  	root containerInfo
   137  
   138  	watchdog *watchdog.Watchdog
   139  
   140  	// stopSignalForwarding disables forwarding of signals to the sandboxed
   141  	// container. It should be called when a sandbox is destroyed.
   142  	stopSignalForwarding func()
   143  
   144  	// stopProfiling stops profiling started at container creation. It
   145  	// should be called when a sandbox is destroyed.
   146  	stopProfiling func()
   147  
   148  	// PreSeccompCallback is called right before installing seccomp filters.
   149  	PreSeccompCallback func()
   150  
   151  	// restore is set to true if we are restoring a container.
   152  	restore bool
   153  
   154  	// sandboxID is the ID for the whole sandbox.
   155  	sandboxID string
   156  
   157  	// mountHints provides extra information about mounts for containers that
   158  	// apply to the entire pod.
   159  	mountHints *PodMountHints
   160  
   161  	// productName is the value to show in
   162  	// /sys/devices/virtual/dmi/id/product_name.
   163  	productName string
   164  
   165  	// nvidiaUVMDevMajor is the device major number used for nvidia-uvm.
   166  	nvidiaUVMDevMajor uint32
   167  
   168  	// mu guards processes and portForwardProxies.
   169  	mu sync.Mutex
   170  
   171  	// processes maps container init processes and exec invocations to their
   172  	// state. Root processes are keyed by container ID with pid=0, while exec
   173  	// invocations have the corresponding pid set.
   174  	//
   175  	// processes is guarded by mu.
   176  	processes map[execID]*execProcess
   177  
   178  	// portForwardProxies is a list of active port forwarding connections.
   179  	//
   180  	// portForwardProxies is guarded by mu.
   181  	portForwardProxies []*pf.Proxy
   182  }
   183  
   184  // execID uniquely identifies a sentry process that is executed in a container.
   185  type execID struct {
   186  	cid string
   187  	pid kernel.ThreadID
   188  }
   189  
   190  // execProcess contains the thread group and host TTY of a sentry process.
   191  type execProcess struct {
   192  	// tg will be nil for containers that haven't started yet.
   193  	tg *kernel.ThreadGroup
   194  
   195  	// tty will be nil if the process is not attached to a terminal.
   196  	tty *host.TTYFileDescription
   197  
   198  	// pidnsPath is the pid namespace path in the spec.
   199  	pidnsPath string
   200  
   201  	// hostTTY is present when creating a sub-container with terminal enabled.
   202  	// The TTY file is passed during container creation and must be saved
   203  	// until container start.
   204  	hostTTY *fd.FD
   205  }
   206  
   207  // fdMapping maps guest to host file descriptors. Guest file descriptors are
   208  // exposed to the application inside the sandbox through the FD table.
   209  type fdMapping struct {
   210  	guest int
   211  	host  *fd.FD
   212  }
   213  
   214  // FDMapping is a helper type to represent a mapping from guest to host file
   215  // descriptors. In contrast to the unexported fdMapping type, it does not imply
   216  // file ownership.
   217  type FDMapping struct {
   218  	Guest int
   219  	Host  int
   220  }
   221  
   222  func init() {
   223  	// Initialize the random number generator.
   224  	mrand.Seed(gtime.Now().UnixNano())
   225  }
   226  
   227  // Args are the arguments for New().
   228  type Args struct {
   229  	// ID is the sandbox ID.
   230  	ID string
   231  	// Spec is the sandbox specification.
   232  	Spec *specs.Spec
   233  	// Conf is the system configuration.
   234  	Conf *config.Config
   235  	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
   236  	// of this FD and may close it at any time.
   237  	ControllerFD int
   238  	// Device is an optional argument that is passed to the platform. The Loader
   239  	// takes ownership of this file and may close it at any time.
   240  	Device *os.File
   241  	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
   242  	// takes ownership of these FDs and may close them at any time.
   243  	GoferFDs []int
   244  	// StdioFDs is the stdio for the application. The Loader takes ownership of
   245  	// these FDs and may close them at any time.
   246  	StdioFDs []int
   247  	// PassFDs are user-supplied FD mappings from host to guest descriptors.
   248  	// The Loader takes ownership of these FDs and may close them at any time.
   249  	PassFDs []FDMapping
   250  	// ExecFD is the host file descriptor used for program execution.
   251  	ExecFD int
   252  	// OverlayFilestoreFDs are the FDs to the regular files that will back the
   253  	// tmpfs upper mount in the overlay mounts.
   254  	OverlayFilestoreFDs []int
   255  	// OverlayMediums contains information about how the gofer mounts have been
   256  	// overlaid. The first entry is for rootfs and the following entries are for
   257  	// bind mounts in Spec.Mounts (in the same order).
   258  	OverlayMediums []OverlayMedium
   259  	// NumCPU is the number of CPUs to create inside the sandbox.
   260  	NumCPU int
   261  	// TotalMem is the initial amount of total memory to report back to the
   262  	// container.
   263  	TotalMem uint64
   264  	// TotalHostMem is the total memory reported by host /proc/meminfo.
   265  	TotalHostMem uint64
   266  	// UserLogFD is the file descriptor to write user logs to.
   267  	UserLogFD int
   268  	// ProductName is the value to show in
   269  	// /sys/devices/virtual/dmi/id/product_name.
   270  	ProductName string
   271  	// PodInitConfigFD is the file descriptor to a file passed in the
   272  	// --pod-init-config flag.
   273  	PodInitConfigFD int
   274  	// SinkFDs is an ordered array of file descriptors to be used by seccheck
   275  	// sinks configured from the --pod-init-config file.
   276  	SinkFDs []int
   277  	// ProfileOpts contains the set of profiles to enable and the
   278  	// corresponding FDs where profile data will be written.
   279  	ProfileOpts profile.Opts
   280  }
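
        // A minimal, hypothetical sketch (not part of this file's API) of how a
        // caller might populate Args before calling New. The FD numbers and the
        // sandbox ID are placeholders; real values are handed over by the
        // container runtime.
        func exampleArgs(spec *specs.Spec, conf *config.Config) Args {
        	return Args{
        		ID:           "sandbox-123",  // placeholder sandbox ID
        		Spec:         spec,
        		Conf:         conf,
        		ControllerFD: 3,              // URPC controller socket; the Loader takes ownership
        		GoferFDs:     []int{4},       // the first gofer FD is always the rootfs connection
        		StdioFDs:     []int{0, 1, 2}, // stdin, stdout, stderr
        		ExecFD:       -1,             // negative means "start from a filename"
        		NumCPU:       0,              // 0 lets New default to runtime.NumCPU()
        	}
        }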
   281  
   282  // Make sure stdioFDs are always the same on initial start and on restore.
   283  const startingStdioFD = 256
   284  
   285  // New initializes a new kernel loader configured by spec.
   286  // New also handles setting up a kernel for restoring a container.
   287  func New(args Args) (*Loader, error) {
   288  	stopProfiling := profile.Start(args.ProfileOpts)
   289  
   290  	// Initialize seccheck points.
   291  	seccheck.Initialize()
   292  
   293  	// We initialize the rand package now to make sure /dev/urandom is pre-opened
   294  	// on kernels that do not support getrandom(2).
   295  	if err := rand.Init(); err != nil {
   296  		return nil, fmt.Errorf("setting up rand: %w", err)
   297  	}
   298  
   299  	if err := usage.Init(); err != nil {
   300  		return nil, fmt.Errorf("setting up memory usage: %w", err)
   301  	}
   302  
   303  	kernel.IOUringEnabled = args.Conf.IOUring
   304  
   305  	info := containerInfo{
   306  		conf:           args.Conf,
   307  		spec:           args.Spec,
   308  		overlayMediums: args.OverlayMediums,
   309  	}
   310  
   311  	// Make host FDs stable between invocations. Host FDs must map to the exact
   312  	// same number when the sandbox is restored. Otherwise the wrong FD will be
   313  	// used.
   314  	newfd := startingStdioFD
   315  
   316  	for _, stdioFD := range args.StdioFDs {
   317  		// Check that newfd is unused to avoid clobbering it.
   318  		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
   319  			if err != nil {
   320  				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
   321  			}
   322  			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
   323  		}
   324  
   325  		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
   326  		if err != nil {
   327  			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
   328  		}
   329  		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
   330  		_ = unix.Close(stdioFD)
   331  		newfd++
   332  	}
   333  	for _, goferFD := range args.GoferFDs {
   334  		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
   335  	}
   336  	for _, overlayFD := range args.OverlayFilestoreFDs {
   337  		info.overlayFilestoreFDs = append(info.overlayFilestoreFDs, fd.New(overlayFD))
   338  	}
   339  
   340  	if args.ExecFD >= 0 {
   341  		info.execFD = fd.New(args.ExecFD)
   342  	}
   343  
   344  	for _, customFD := range args.PassFDs {
   345  		info.passFDs = append(info.passFDs, fdMapping{
   346  			host:  fd.New(customFD.Host),
   347  			guest: customFD.Guest,
   348  		})
   349  	}
   350  
   351  	// Create kernel and platform.
   352  	p, err := createPlatform(args.Conf, args.Device)
   353  	if err != nil {
   354  		return nil, fmt.Errorf("creating platform: %w", err)
   355  	}
   356  	if args.Conf.NVProxy && p.OwnsPageTables() {
   357  		return nil, fmt.Errorf("--nvproxy is incompatible with platform %s: owns page tables", args.Conf.Platform)
   358  	}
   359  	k := &kernel.Kernel{
   360  		Platform: p,
   361  	}
   362  
   363  	// Create memory file.
   364  	mf, err := createMemoryFile()
   365  	if err != nil {
   366  		return nil, fmt.Errorf("creating memory file: %w", err)
   367  	}
   368  	k.SetMemoryFile(mf)
   369  
   370  	// Create VDSO.
   371  	//
   372  	// Pass k as the platform since it is savable, unlike the actual platform.
   373  	vdso, err := loader.PrepareVDSO(k)
   374  	if err != nil {
   375  		return nil, fmt.Errorf("creating vdso: %w", err)
   376  	}
   377  
   378  	// Create timekeeper.
   379  	tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
   380  	tk.SetClocks(time.NewCalibratedClocks())
   381  
   382  	if err := enableStrace(args.Conf); err != nil {
   383  		return nil, fmt.Errorf("enabling strace: %w", err)
   384  	}
   385  
   386  	// Create capabilities.
   387  	caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
   388  	if err != nil {
   389  		return nil, fmt.Errorf("converting capabilities: %w", err)
   390  	}
   391  
   392  	// Convert the spec's additional GIDs to KGIDs.
   393  	extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
   394  	for _, GID := range args.Spec.Process.User.AdditionalGids {
   395  		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
   396  	}
   397  
   398  	// Create credentials.
   399  	creds := auth.NewUserCredentials(
   400  		auth.KUID(args.Spec.Process.User.UID),
   401  		auth.KGID(args.Spec.Process.User.GID),
   402  		extraKGIDs,
   403  		caps,
   404  		auth.NewRootUserNamespace())
   405  
   406  	// Create root network namespace/stack.
   407  	netns, err := newRootNetworkNamespace(args.Conf, tk, k, creds.UserNamespace)
   408  	if err != nil {
   409  		return nil, fmt.Errorf("creating network: %w", err)
   410  	}
   411  
   412  	if args.NumCPU == 0 {
   413  		args.NumCPU = runtime.NumCPU()
   414  	}
   415  	log.Infof("CPUs: %d", args.NumCPU)
   416  	runtime.GOMAXPROCS(args.NumCPU)
   417  
   418  	if args.TotalHostMem > 0 {
   419  		// As per tmpfs(5), the default size limit is 50% of total physical RAM.
   420  		// See mm/shmem.c:shmem_default_max_blocks().
   421  		tmpfs.SetDefaultSizeLimit(args.TotalHostMem / 2)
   422  	}
   423  
   424  	if args.TotalMem > 0 {
   425  		// Adjust the total memory returned by the Sentry so that applications that
   426  		// use /proc/meminfo can make allocations based on this limit.
   427  		usage.MinimumTotalMemoryBytes = args.TotalMem
   428  		usage.MaximumTotalMemoryBytes = args.TotalMem
   429  		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
   430  	}
   431  
   432  	// Initialize the Kernel object, which is required by the Context passed
   433  	// to createVFS in order to mount (among other things) procfs.
   434  	if err = k.Init(kernel.InitKernelArgs{
   435  		FeatureSet:                  cpuid.HostFeatureSet().Fixed(),
   436  		Timekeeper:                  tk,
   437  		RootUserNamespace:           creds.UserNamespace,
   438  		RootNetworkNamespace:        netns,
   439  		ApplicationCores:            uint(args.NumCPU),
   440  		Vdso:                        vdso,
   441  		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
   442  		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
   443  		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
   444  		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
   445  	}); err != nil {
   446  		return nil, fmt.Errorf("initializing kernel: %w", err)
   447  	}
   448  
   449  	if err := registerFilesystems(k, &info); err != nil {
   450  		return nil, fmt.Errorf("registering filesystems: %w", err)
   451  	}
   452  
   453  	// Turn on packet logging if enabled.
   454  	if args.Conf.LogPackets {
   455  		log.Infof("Packet logging enabled")
   456  		sniffer.LogPackets.Store(1)
   457  	} else {
   458  		log.Infof("Packet logging disabled")
   459  		sniffer.LogPackets.Store(0)
   460  	}
   461  
   462  	// Create a watchdog.
   463  	dogOpts := watchdog.DefaultOpts
   464  	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
   465  	dog := watchdog.New(k, dogOpts)
   466  
   467  	procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
   468  	if err != nil {
   469  		return nil, fmt.Errorf("creating init process for root container: %w", err)
   470  	}
   471  	info.procArgs = procArgs
   472  
   473  	if err := initCompatLogs(args.UserLogFD); err != nil {
   474  		return nil, fmt.Errorf("initializing compat logs: %w", err)
   475  	}
   476  
   477  	mountHints, err := NewPodMountHints(args.Spec)
   478  	if err != nil {
   479  		return nil, fmt.Errorf("creating pod mount hints: %w", err)
   480  	}
   481  
   482  	// Set up host mount that will be used for imported fds.
   483  	hostFilesystem, err := host.NewFilesystem(k.VFS())
   484  	if err != nil {
   485  		return nil, fmt.Errorf("failed to create hostfs filesystem: %w", err)
   486  	}
   487  	defer hostFilesystem.DecRef(k.SupervisorContext())
   488  	k.SetHostMount(k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}))
   489  
   490  	if args.PodInitConfigFD >= 0 {
   491  		if err := setupSeccheck(args.PodInitConfigFD, args.SinkFDs); err != nil {
   492  			log.Warningf("unable to configure event session: %v", err)
   493  		}
   494  	}
   495  
   496  	eid := execID{cid: args.ID}
   497  	l := &Loader{
   498  		k:                 k,
   499  		watchdog:          dog,
   500  		sandboxID:         args.ID,
   501  		processes:         map[execID]*execProcess{eid: {}},
   502  		mountHints:        mountHints,
   503  		root:              info,
   504  		stopProfiling:     stopProfiling,
   505  		productName:       args.ProductName,
   506  		nvidiaUVMDevMajor: info.nvidiaUVMDevMajor,
   507  	}
   508  
   509  	// We don't care about child signals; some platforms can generate a
   510  	// tremendous number of useless ones (I'm looking at you, ptrace).
   511  	if err := sighandling.IgnoreChildStop(); err != nil {
   512  		return nil, fmt.Errorf("ignore child stop signals failed: %w", err)
   513  	}
   514  
   515  	// Create the control server using the provided FD.
   516  	//
   517  	// This must be done *after* we have initialized the kernel since the
   518  	// controller is used to configure the kernel's network stack.
   519  	ctrl, err := newController(args.ControllerFD, l)
   520  	if err != nil {
   521  		return nil, fmt.Errorf("creating control server: %w", err)
   522  	}
   523  	l.ctrl = ctrl
   524  
   525  	// Only start serving after the Loader is set on the controller and the
   526  	// controller is set on the Loader, because both are used by the urpc methods.
   527  	if err := ctrl.srv.StartServing(); err != nil {
   528  		return nil, fmt.Errorf("starting control server: %w", err)
   529  	}
   530  
   531  	return l, nil
   532  }
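
        // remapToStableFD is a minimal, hypothetical sketch (this helper does not
        // exist in the file) of the FD-stabilization step inside New above: move a
        // host FD to a fixed, known-free number so stdios map to the exact same
        // FDs when the sandbox is restored.
        func remapToStableFD(hostFD, target int) (*fd.FD, error) {
        	// F_GETFD on a free descriptor fails with EBADF; anything else means
        	// target is taken, or the probe itself failed.
        	if _, err := unix.FcntlInt(uintptr(target), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
        		if err != nil {
        			return nil, fmt.Errorf("checking FD %d: %w", target, err)
        		}
        		return nil, fmt.Errorf("FD %d is already in use", target)
        	}
        	// Duplicate with CLOEXEC so the new FD does not leak across exec.
        	if err := unix.Dup3(hostFD, target, unix.O_CLOEXEC); err != nil {
        		return nil, fmt.Errorf("dup3: %w", err)
        	}
        	_ = unix.Close(hostFD) // the original number is no longer needed
        	return fd.New(target), nil
        }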
   533  
   534  // createProcessArgs creates args that can be used with kernel.CreateProcess.
   535  func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
   536  	// Create initial limits.
   537  	ls, err := createLimitSet(spec)
   538  	if err != nil {
   539  		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %w", err)
   540  	}
   541  	env, err := specutils.ResolveEnvs(spec.Process.Env)
   542  	if err != nil {
   543  		return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err)
   544  	}
   545  
   546  	wd := spec.Process.Cwd
   547  	if wd == "" {
   548  		wd = "/"
   549  	}
   550  
   551  	// Create the process arguments.
   552  	procArgs := kernel.CreateProcessArgs{
   553  		Argv:                    spec.Process.Args,
   554  		Envv:                    env,
   555  		WorkingDirectory:        wd,
   556  		Credentials:             creds,
   557  		Umask:                   0022,
   558  		Limits:                  ls,
   559  		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
   560  		UTSNamespace:            k.RootUTSNamespace(),
   561  		IPCNamespace:            k.RootIPCNamespace(),
   562  		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
   563  		ContainerID:             id,
   564  		PIDNamespace:            pidns,
   565  	}
   566  
   567  	return procArgs, nil
   568  }
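
        // A hypothetical sketch of the spec fields createProcessArgs consumes:
        // Process.Args becomes Argv, Process.Env is resolved into Envv, and an
        // empty Process.Cwd defaults to "/".
        func exampleSpecProcess() *specs.Process {
        	return &specs.Process{
        		Args: []string{"/bin/sh", "-c", "echo hello"},
        		Env:  []string{"PATH=/usr/bin:/bin"},
        		Cwd:  "", // defaults to "/" in createProcessArgs
        	}
        }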
   569  
   570  // Destroy cleans up all resources used by the loader.
   571  //
   572  // Note that this will block until all open control server connections have
   573  // been closed. For that reason, this should NOT be called in a defer, because
   574  // a panic in a control server rpc would then hang forever.
   575  func (l *Loader) Destroy() {
   576  	if l.stopSignalForwarding != nil {
   577  		l.stopSignalForwarding()
   578  	}
   579  	l.watchdog.Stop()
   580  
   581  	// Stop the control server. This will indirectly stop any
   582  	// long-running control operations that are in flight, e.g.
   583  	// profiling operations.
   584  	l.ctrl.stop()
   585  
   586  	// Release all kernel resources. This is only safe after we can no longer
   587  	// save/restore.
   588  	l.k.Release()
   589  
   590  	// Release any dangling tcp connections.
   591  	tcpip.ReleaseDanglingEndpoints()
   592  
   593  	// In the success case, stdioFDs and goferFDs will only contain
   594  	// released/closed FDs whose ownership has been passed over to host FDs and
   595  	// gofer sessions. Close them here in case of failure.
   596  	for _, f := range l.root.stdioFDs {
   597  		_ = f.Close()
   598  	}
   599  	for _, f := range l.root.passFDs {
   600  		_ = f.host.Close()
   601  	}
   602  	for _, f := range l.root.goferFDs {
   603  		_ = f.Close()
   604  	}
   605  
   606  	l.stopProfiling()
   607  }
   608  
   609  func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
   610  	p, err := platform.Lookup(conf.Platform)
   611  	if err != nil {
   612  		panic(fmt.Sprintf("invalid platform %s: %s", conf.Platform, err))
   613  	}
   614  	log.Infof("Platform: %s", conf.Platform)
   615  	return p.New(deviceFile)
   616  }
   617  
   618  func createMemoryFile() (*pgalloc.MemoryFile, error) {
   619  	const memfileName = "runsc-memory"
   620  	memfd, err := memutil.CreateMemFD(memfileName, 0)
   621  	if err != nil {
   622  		return nil, fmt.Errorf("error creating memfd: %w", err)
   623  	}
   624  	memfile := os.NewFile(uintptr(memfd), memfileName)
   625  	// We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
   626  	// there are memory cgroups specified, because at this point we're already
   627  	// in a mount namespace in which the relevant cgroupfs is not visible.
   628  	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
   629  	if err != nil {
   630  		_ = memfile.Close()
   631  		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err)
   632  	}
   633  	return mf, nil
   634  }
   635  
   636  // installSeccompFilters installs sandbox seccomp filters with the host.
   637  func (l *Loader) installSeccompFilters() error {
   638  	if l.PreSeccompCallback != nil {
   639  		l.PreSeccompCallback()
   640  	}
   641  	if l.root.conf.DisableSeccomp {
   642  		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
   643  	} else {
   644  		hostnet := l.root.conf.Network == config.NetworkHost
   645  		opts := filter.Options{
   646  			Platform:              l.k.Platform,
   647  			HostNetwork:           hostnet,
   648  			HostNetworkRawSockets: hostnet && l.root.conf.EnableRaw,
   649  			HostFilesystem:        l.root.conf.DirectFS,
   650  			ProfileEnable:         l.root.conf.ProfileEnable,
   651  			NVProxy:               l.root.conf.NVProxy,
   652  			TPUProxy:              l.root.conf.TPUProxy,
   653  			ControllerFD:          l.ctrl.srv.FD(),
   654  		}
   655  		if err := filter.Install(opts); err != nil {
   656  			return fmt.Errorf("installing seccomp filters: %w", err)
   657  		}
   658  	}
   659  	return nil
   660  }
   661  
   662  // Run runs the root container.
   663  func (l *Loader) Run() error {
   664  	err := l.run()
   665  	l.ctrl.manager.startResultChan <- err
   666  	if err != nil {
   667  		// Give the controller some time to send the error to the
   668  		// runtime. If we return too quickly here the process will exit
   669  		// and the control connection will be closed before the error
   670  		// is returned.
   671  		gtime.Sleep(2 * gtime.Second)
   672  		return err
   673  	}
   674  	return nil
   675  }
   676  
   677  func (l *Loader) run() error {
   678  	if l.root.conf.Network == config.NetworkHost {
   679  		// Delay host network configuration to this point because the network namespace
   680  		// is configured after the loader is created and before Run() is called.
   681  		log.Debugf("Configuring host network")
   682  		s := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
   683  		if err := s.Configure(l.root.conf.EnableRaw); err != nil {
   684  			return err
   685  		}
   686  	}
   687  
   688  	l.mu.Lock()
   689  	defer l.mu.Unlock()
   690  
   691  	eid := execID{cid: l.sandboxID}
   692  	ep, ok := l.processes[eid]
   693  	if !ok {
   694  		return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
   695  	}
   696  
   697  	// If we are restoring, we do not want to create a process.
   698  	// l.restore is set by the container manager when a restore call is made.
   699  	if !l.restore {
   700  		if l.root.conf.ProfileEnable {
   701  			pprof.Initialize()
   702  		}
   703  
   704  		// Finally done with all configuration. Set up filters before user code
   705  		// is loaded.
   706  		if err := l.installSeccompFilters(); err != nil {
   707  			return err
   708  		}
   709  
   710  		// Create the root container init task. It will begin running
   711  		// when the kernel is started.
   712  		var (
   713  			tg  *kernel.ThreadGroup
   714  			err error
   715  		)
   716  		tg, ep.tty, err = l.createContainerProcess(true, l.sandboxID, &l.root)
   717  		if err != nil {
   718  			return err
   719  		}
   720  
   721  		if seccheck.Global.Enabled(seccheck.PointContainerStart) {
   722  			evt := pb.Start{
   723  				Id:       l.sandboxID,
   724  				Cwd:      l.root.spec.Process.Cwd,
   725  				Args:     l.root.spec.Process.Args,
   726  				Terminal: l.root.spec.Process.Terminal,
   727  			}
   728  			fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart)
   729  			if fields.Local.Contains(seccheck.FieldContainerStartEnv) {
   730  				evt.Env = l.root.spec.Process.Env
   731  			}
   732  			if !fields.Context.Empty() {
   733  				evt.ContextData = &pb.ContextData{}
   734  				kernel.LoadSeccheckData(tg.Leader(), fields.Context, evt.ContextData)
   735  			}
   736  			_ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   737  				return c.ContainerStart(context.Background(), fields, &evt)
   738  			})
   739  		}
   740  	}
   741  
   742  	ep.tg = l.k.GlobalInit()
   743  	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
   744  		ep.pidnsPath = ns.Path
   745  	}
   746  
   747  	// Handle signals by forwarding them to the root container process
   748  	// (except for the panic signal, which should cause a panic).
   749  	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
   750  		// Panic signal should cause a panic.
   751  		if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
   752  			panic("Signal-induced panic")
   753  		}
   754  
   755  		// Otherwise forward to root container.
   756  		deliveryMode := DeliverToProcess
   757  		if l.root.spec.Process.Terminal {
   758  			// Since we are running with a console, we should forward the signal to
   759  			// the foreground process group so that job control signals like ^C can
   760  			// be handled properly.
   761  			deliveryMode = DeliverToForegroundProcessGroup
   762  		}
   763  		log.Infof("Received external signal %d, mode: %s", sig, deliveryMode)
   764  		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
   765  			log.Warningf("error sending signal %s to container %q: %s", sig, l.sandboxID, err)
   766  		}
   767  	})
   768  
   769  	log.Infof("Process should have started...")
   770  	l.watchdog.Start()
   771  	return l.k.Start()
   772  }
   773  
   774  // createSubcontainer creates a new container inside the sandbox.
   775  func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error {
   776  	l.mu.Lock()
   777  	defer l.mu.Unlock()
   778  
   779  	eid := execID{cid: cid}
   780  	if _, ok := l.processes[eid]; ok {
   781  		return fmt.Errorf("container %q already exists", cid)
   782  	}
   783  	l.processes[eid] = &execProcess{hostTTY: tty}
   784  	return nil
   785  }
   786  
   787  // startSubcontainer starts a child container. Used FDs are either closed or
   788  // released. It's safe for the caller to close any remaining files upon
   789  // return.
   790  func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs, overlayFilestoreFDs []*fd.FD, overlayMediums []OverlayMedium) error {
   791  	// Create capabilities.
   792  	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
   793  	if err != nil {
   794  		return fmt.Errorf("creating capabilities: %w", err)
   795  	}
   796  
   797  	l.mu.Lock()
   798  	defer l.mu.Unlock()
   799  
   800  	ep := l.processes[execID{cid: cid}]
   801  	if ep == nil {
   802  		return fmt.Errorf("trying to start a deleted container %q", cid)
   803  	}
   804  
   805  	// Convert the spec's additional GIDs to KGIDs.
   806  	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
   807  	for _, GID := range spec.Process.User.AdditionalGids {
   808  		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
   809  	}
   810  
   811  	// Create credentials. We reuse the root user namespace because the
   812  	// sentry currently supports only 1 mount namespace, which is tied to a
   813  	// single user namespace. Thus we must run in the same user namespace
   814  	// to access mounts.
   815  	creds := auth.NewUserCredentials(
   816  		auth.KUID(spec.Process.User.UID),
   817  		auth.KGID(spec.Process.User.GID),
   818  		extraKGIDs,
   819  		caps,
   820  		l.k.RootUserNamespace())
   821  
   822  	var pidns *kernel.PIDNamespace
   823  	if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok {
   824  		if ns.Path != "" {
   825  			for _, p := range l.processes {
   826  				if ns.Path == p.pidnsPath {
   827  					log.Debugf("Joining PID namespace named %q", ns.Path)
   828  					pidns = p.tg.PIDNamespace()
   829  					break
   830  				}
   831  			}
   832  		}
   833  		if pidns == nil {
   834  			log.Warningf("PID namespace %q not found, running in new PID namespace", ns.Path)
   835  			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
   836  		}
   837  		ep.pidnsPath = ns.Path
   838  	} else {
   839  		pidns = l.k.RootPIDNamespace()
   840  	}
   841  
   842  	info := &containerInfo{
   843  		conf:                conf,
   844  		spec:                spec,
   845  		goferFDs:            goferFDs,
   846  		overlayFilestoreFDs: overlayFilestoreFDs,
   847  		overlayMediums:      overlayMediums,
   848  		nvidiaUVMDevMajor:   l.nvidiaUVMDevMajor,
   849  	}
   850  	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
   851  	if err != nil {
   852  		return fmt.Errorf("creating new process: %w", err)
   853  	}
   854  
   855  	// Use stdios or TTY depending on the spec configuration.
   856  	if spec.Process.Terminal {
   857  		if n := len(stdioFDs); n != 0 {
   858  			return fmt.Errorf("using TTY, stdios not expected: %d", n)
   859  		}
   860  		if ep.hostTTY == nil {
   861  			return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?")
   862  		}
   863  		info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
   864  		ep.hostTTY = nil
   865  	} else {
   866  		info.stdioFDs = stdioFDs
   867  	}
   868  
   869  	ep.tg, ep.tty, err = l.createContainerProcess(false, cid, info)
   870  	if err != nil {
   871  		return err
   872  	}
   873  
   874  	if seccheck.Global.Enabled(seccheck.PointContainerStart) {
   875  		evt := pb.Start{
   876  			Id:       cid,
   877  			Cwd:      spec.Process.Cwd,
   878  			Args:     spec.Process.Args,
   879  			Terminal: spec.Process.Terminal,
   880  		}
   881  		fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart)
   882  		if fields.Local.Contains(seccheck.FieldContainerStartEnv) {
   883  			evt.Env = spec.Process.Env
   884  		}
   885  		if !fields.Context.Empty() {
   886  			evt.ContextData = &pb.ContextData{}
   887  			kernel.LoadSeccheckData(ep.tg.Leader(), fields.Context, evt.ContextData)
   888  		}
   889  		_ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   890  			return c.ContainerStart(context.Background(), fields, &evt)
   891  		})
   892  	}
   893  
   894  	l.k.StartProcess(ep.tg)
   895  	return nil
   896  }
   897  
   898  func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileDescription, error) {
   899  	// Create the FD map, which will set stdin, stdout, and stderr.
   900  	ctx := info.procArgs.NewContext(l.k)
   901  	fdTable, ttyFile, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.passFDs, info.spec.Process.User)
   902  	if err != nil {
   903  		return nil, nil, fmt.Errorf("importing fds: %w", err)
   904  	}
   905  	// CreateProcess takes a reference on fdTable if successful. We won't need
   906  	// ours either way.
   907  	info.procArgs.FDTable = fdTable
   908  
   909  	if info.execFD != nil {
   910  		if info.procArgs.Filename != "" {
   911  			return nil, nil, fmt.Errorf("process must be started from either a file or a filename, not both")
   912  		}
   913  		file, err := host.NewFD(ctx, l.k.HostMount(), info.execFD.FD(), &host.NewFDOptions{
   914  			Readonly:     true,
   915  			Savable:      true,
   916  			VirtualOwner: true,
   917  			UID:          auth.KUID(info.spec.Process.User.UID),
   918  			GID:          auth.KGID(info.spec.Process.User.GID),
   919  		})
   920  		if err != nil {
   921  			return nil, nil, err
   922  		}
   923  		defer file.DecRef(ctx)
   924  		info.execFD.Release()
   925  
   926  		info.procArgs.File = file
   927  	}
   928  
   929  	// Gofer FDs must be ordered and the first FD is always the rootfs.
   930  	if len(info.goferFDs) < 1 {
   931  		return nil, nil, fmt.Errorf("rootfs gofer FD not found")
   932  	}
   933  	l.startGoferMonitor(cid, int32(info.goferFDs[0].FD()))
   934  
   935  	mntr := newContainerMounter(info, l.k, l.mountHints, l.productName, l.sandboxID)
   936  	if root {
   937  		if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
   938  			return nil, nil, err
   939  		}
   940  	}
   941  	if err := setupContainerVFS(ctx, info, mntr, &info.procArgs); err != nil {
   942  		return nil, nil, err
   943  	}
   944  
   945  	// Add the HOME environment variable if it is not already set.
   946  	info.procArgs.Envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
   947  		info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
   948  	if err != nil {
   949  		return nil, nil, err
   950  	}
   951  
   952  	// Create and start the new process.
   953  	tg, _, err := l.k.CreateProcess(info.procArgs)
   954  	if err != nil {
   955  		return nil, nil, fmt.Errorf("creating process: %w", err)
   956  	}
   957  	// CreateProcess takes a reference on FDTable if successful.
   958  	info.procArgs.FDTable.DecRef(ctx)
   959  
   960  	// Set the foreground process group on the TTY to the group of the process
   961  	// we just created, since that is what we are about to start running.
   962  	if ttyFile != nil {
   963  		ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
   964  	}
   965  
   966  	// Install seccomp filters with the new task if there are any.
   967  	if info.conf.OCISeccomp {
   968  		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
   969  			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
   970  			if err != nil {
   971  				return nil, nil, fmt.Errorf("building seccomp program: %w", err)
   972  			}
   973  
   974  			if log.IsLogging(log.Debug) {
   975  				out, _ := bpf.DecodeProgram(program)
   976  				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
   977  			}
   978  
   979  			task := tg.Leader()
   980  			// NOTE: It seems Flags are ignored by runc so we ignore them too.
   981  			if err := task.AppendSyscallFilter(program, true); err != nil {
   982  				return nil, nil, fmt.Errorf("appending seccomp filters: %w", err)
   983  			}
   984  		}
   985  	} else {
   986  		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
   987  			log.Warningf("Seccomp spec is being ignored")
   988  		}
   989  	}
   990  
   991  	return tg, ttyFile, nil
   992  }
   993  
   994  // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
   995  // the gofer FD looking for disconnects, and kills the container processes if
   996  // the rootfs FD disconnects.
   997  //
   998  // Note that other gofer mounts are allowed to be unmounted and disconnected.
   999  func (l *Loader) startGoferMonitor(cid string, rootfsGoferFD int32) {
  1000  	if rootfsGoferFD < 0 {
  1001  		panic(fmt.Sprintf("invalid FD: %d", rootfsGoferFD))
  1002  	}
  1003  	go func() {
  1004  		log.Debugf("Monitoring gofer health for container %q", cid)
  1005  		events := []unix.PollFd{
  1006  			{
  1007  				Fd:     rootfsGoferFD,
  1008  				Events: unix.POLLHUP | unix.POLLRDHUP,
  1009  			},
  1010  		}
  1011  		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
  1012  			// Use ppoll instead of poll because it's already allowed in seccomp.
  1013  			n, err := unix.Ppoll(events, nil, nil)
  1014  			return uintptr(n), 0, err
  1015  		})
  1016  		if err != nil {
  1017  			panic(fmt.Sprintf("Error monitoring gofer FDs: %s", err))
  1018  		}
  1019  
  1020  		l.mu.Lock()
  1021  		defer l.mu.Unlock()
  1022  
  1023  		// The gofer could have been stopped due to a normal container shutdown.
  1024  		// Only kill the container if it has not stopped yet.
  1025  		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
  1026  			log.Infof("Gofer socket disconnected, killing container %q", cid)
  1027  			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
  1028  				log.Warningf("Error killing container %q after gofer stopped: %s", cid, err)
  1029  			}
  1030  		}
  1031  	}()
  1032  }
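
        // waitForHangup is a minimal, hypothetical distillation of the disconnect
        // detection used by startGoferMonitor above: block in ppoll (already
        // allowed by the seccomp filters, unlike poll) until the peer closes its
        // end of the FD.
        func waitForHangup(goferFD int32) error {
        	events := []unix.PollFd{{Fd: goferFD, Events: unix.POLLHUP | unix.POLLRDHUP}}
        	_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
        		n, err := unix.Ppoll(events, nil, nil)
        		return uintptr(n), 0, err
        	})
        	return err
        }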
  1033  
  1034  // destroySubcontainer stops a container if it is still running and cleans up
  1035  // its filesystem.
  1036  func (l *Loader) destroySubcontainer(cid string) error {
  1037  	l.mu.Lock()
  1038  	defer l.mu.Unlock()
  1039  
  1040  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
  1041  	if err != nil {
  1042  		// Container doesn't exist.
  1043  		return err
  1044  	}
  1045  
  1046  	// The container exists, but has it been started?
  1047  	if tg != nil {
  1048  		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
  1049  			return fmt.Errorf("sending SIGKILL to all container processes: %w", err)
  1050  		}
  1051  		// Wait for all processes that belong to the container to exit (including
  1052  		// exec'd processes).
  1053  		for _, t := range l.k.TaskSet().Root.Tasks() {
  1054  			if t.ContainerID() == cid {
  1055  				t.ThreadGroup().WaitExited()
  1056  			}
  1057  		}
  1058  	}
  1059  
  1060  	// No more failure from this point on. Remove all container thread groups
  1061  	// from the map.
  1062  	for key := range l.processes {
  1063  		if key.cid == cid {
  1064  			delete(l.processes, key)
  1065  		}
  1066  	}
  1067  
  1068  	log.Debugf("Container destroyed, cid: %s", cid)
  1069  	return nil
  1070  }
  1071  
  1072  func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
  1073  	// Hold the lock for the entire operation to ensure that the exec'd process
  1074  	// is added to 'processes' in case it races with destroyContainer().
  1075  	l.mu.Lock()
  1076  	defer l.mu.Unlock()
  1077  
  1078  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
  1079  	if err != nil {
  1080  		return 0, err
  1081  	}
  1082  	if tg == nil {
  1083  		return 0, fmt.Errorf("container %q not started", args.ContainerID)
  1084  	}
  1085  
  1086  	// Get the container MountNamespace from the Task. Trying to acquire the ref
  1087  	// may fail if it races with task exit.
  1088  	// task.MountNamespace() does not take a ref, so we must do so ourselves.
  1089  	args.MountNamespace = tg.Leader().MountNamespace()
  1090  	if args.MountNamespace == nil || !args.MountNamespace.TryIncRef() {
  1091  		return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
  1092  	}
  1093  
  1094  	args.Envv, err = specutils.ResolveEnvs(args.Envv)
  1095  	if err != nil {
  1096  		return 0, fmt.Errorf("resolving env: %w", err)
  1097  	}
  1098  
  1099  	// Add the HOME environment variable if it is not already set.
  1100  	ctx := vfs.WithRoot(l.k.SupervisorContext(), args.MountNamespace.Root())
  1101  	defer args.MountNamespace.DecRef(ctx)
  1102  	args.Envv, err = user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
  1103  	if err != nil {
  1104  		return 0, err
  1105  	}
  1106  	args.PIDNamespace = tg.PIDNamespace()
  1107  
  1108  	args.Limits, err = createLimitSet(l.root.spec)
  1109  	if err != nil {
  1110  		return 0, fmt.Errorf("creating limits: %w", err)
  1111  	}
  1112  
  1113  	// Start the process.
  1114  	proc := control.Proc{Kernel: l.k}
  1115  	newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args)
  1116  	if err != nil {
  1117  		return 0, err
  1118  	}
  1119  
  1120  	eid := execID{cid: args.ContainerID, pid: tgid}
  1121  	l.processes[eid] = &execProcess{
  1122  		tg:  newTG,
  1123  		tty: ttyFile,
  1124  	}
  1125  	log.Debugf("updated processes: %v", l.processes)
  1126  
  1127  	return tgid, nil
  1128  }
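
        // exampleExec is a minimal, hypothetical sketch of driving executeAsync
        // directly, assuming control.ExecArgs's Argv field; the fields used above
        // (ContainerID, Envv, KUID, ...) are set the same way. Real requests
        // arrive through the controller's urpc surface rather than direct calls.
        func exampleExec(l *Loader, cid string) (kernel.ThreadID, error) {
        	return l.executeAsync(&control.ExecArgs{
        		Argv:        []string{"/bin/true"}, // command to run inside the container
        		ContainerID: cid,
        	})
        }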
  1129  
  1130  // waitContainer waits for the init process of a container to exit.
  1131  func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
  1132  	// Don't defer unlock, as doing so would make it impossible for
  1133  	// multiple clients to wait on the same container.
  1134  	tg, err := l.threadGroupFromID(execID{cid: cid})
  1135  	if err != nil {
  1136  		return fmt.Errorf("can't wait for container %q: %w", cid, err)
  1137  	}
  1138  
  1139  	// If the thread group has already exited or exits during the wait,
  1140  	// consider the container exited.
  1141  	ws := l.wait(tg)
  1142  	*waitStatus = ws
  1143  
  1144  	// Check for leaks and write coverage report after the root container has
  1145  	// exited. This guarantees that the report is written in cases where the
  1146  	// sandbox is killed by a signal after the ContMgrWait request is completed.
  1147  	if l.root.procArgs.ContainerID == cid {
  1148  		// All sentry-created resources should have been released at this point.
  1149  		refs.DoLeakCheck()
  1150  		_ = coverage.Report()
  1151  	}
  1152  	return nil
  1153  }
  1154  
  1155  func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
  1156  	if tgid <= 0 {
  1157  		return fmt.Errorf("PID (%d) must be positive", tgid)
  1158  	}
  1159  
  1160  	// Try to find a process that was exec'd
  1161  	eid := execID{cid: cid, pid: tgid}
  1162  	execTG, err := l.threadGroupFromID(eid)
  1163  	if err == nil {
  1164  		ws := l.wait(execTG)
  1165  		*waitStatus = ws
  1166  
  1167  		l.mu.Lock()
  1168  		delete(l.processes, eid)
  1169  		log.Debugf("updated processes (removal): %v", l.processes)
  1170  		l.mu.Unlock()
  1171  		return nil
  1172  	}
  1173  
  1174  	// The caller may be waiting on a process not started directly via exec.
  1175  	// In this case, find the process in the container's PID namespace.
  1176  	initTG, err := l.threadGroupFromID(execID{cid: cid})
  1177  	if err != nil {
  1178  		return fmt.Errorf("waiting for PID %d: %w", tgid, err)
  1179  	}
  1180  	tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
  1181  	if tg == nil {
  1182  		return fmt.Errorf("waiting for PID %d: no such process", tgid)
  1183  	}
  1184  	if tg.Leader().ContainerID() != cid {
  1185  		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
  1186  	}
  1187  	ws := l.wait(tg)
  1188  	*waitStatus = ws
  1189  	return nil
  1190  }
  1191  
  1192  // wait waits for the given thread group to exit and returns its exit status.
  1194  func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
  1195  	tg.WaitExited()
  1196  	return uint32(tg.ExitStatus())
  1197  }
  1198  
  1199  // WaitForStartSignal waits for a start signal from the control server.
  1200  func (l *Loader) WaitForStartSignal() {
  1201  	<-l.ctrl.manager.startChan
  1202  }
  1203  
  1204  // WaitExit waits for the root container to exit, and returns its exit status.
  1205  func (l *Loader) WaitExit() linux.WaitStatus {
  1206  	// Wait for container.
  1207  	l.k.WaitExited()
  1208  
  1209  	// Check all references.
  1210  	refs.OnExit()
  1211  
  1212  	return l.k.GlobalInit().ExitStatus()
  1213  }
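
        // exampleExitCode is a minimal, hypothetical sketch mapping the root
        // container's WaitStatus onto a shell-style exit code, assuming
        // linux.WaitStatus's Exited/ExitStatus and Signaled/TerminationSignal
        // accessors.
        func exampleExitCode(l *Loader) int {
        	ws := l.WaitExit()
        	switch {
        	case ws.Exited():
        		return int(ws.ExitStatus())
        	case ws.Signaled():
        		// Shell convention: 128 + the terminating signal number.
        		return 128 + int(ws.TerminationSignal())
        	default:
        		return 1
        	}
        }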
  1214  
  1215  func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID, userns *auth.UserNamespace) (*inet.Namespace, error) {
  1216  	// Create an empty network stack because the network namespace may be empty at
  1217  	// this point. Netns is configured before Run() is called. Netstack is
  1218  	// configured using a control uRPC message. Host network is configured inside
  1219  	// Run().
  1220  	switch conf.Network {
  1221  	case config.NetworkHost:
  1222  		// If configured for raw socket support with the host network
  1223  		// stack, make sure that we have CAP_NET_RAW on the host;
  1224  		// otherwise we can't make raw sockets.
  1225  		if conf.EnableRaw && !specutils.HasCapabilities(capability.CAP_NET_RAW) {
  1226  			return nil, fmt.Errorf("configuring network=host with raw sockets requires CAP_NET_RAW capability")
  1227  		}
  1228  		// No network namespacing support for hostinet yet, hence creator is nil.
  1229  		return inet.NewRootNamespace(hostinet.NewStack(), nil, userns), nil
  1230  
  1231  	case config.NetworkNone, config.NetworkSandbox:
  1232  		s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite)
  1233  		if err != nil {
  1234  			return nil, err
  1235  		}
  1236  		creator := &sandboxNetstackCreator{
  1237  			clock:                    clock,
  1238  			uniqueID:                 uniqueID,
  1239  			allowPacketEndpointWrite: conf.AllowPacketEndpointWrite,
  1240  		}
  1241  		return inet.NewRootNamespace(s, creator, userns), nil
  1242  
  1243  	default:
  1244  		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
  1245  	}
  1246  
  1247  }
  1248  
  1249  func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) {
  1250  	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
  1251  	transProtos := []stack.TransportProtocolFactory{
  1252  		tcp.NewProtocol,
  1253  		udp.NewProtocol,
  1254  		icmp.NewProtocol4,
  1255  		icmp.NewProtocol6,
  1256  	}
  1257  	s := netstack.Stack{Stack: stack.New(stack.Options{
  1258  		NetworkProtocols:   netProtos,
  1259  		TransportProtocols: transProtos,
  1260  		Clock:              clock,
  1261  		Stats:              netstack.Metrics,
  1262  		HandleLocal:        true,
  1263  		// Enable raw sockets for users with sufficient
  1264  		// privileges.
  1265  		RawFactory:               raw.EndpointFactory{},
  1266  		AllowPacketEndpointWrite: allowPacketEndpointWrite,
  1267  		UniqueID:                 uniqueID,
  1268  		DefaultIPTables:          netfilter.DefaultLinuxTables,
  1269  	})}
  1270  
  1271  	// Enable SACK Recovery.
  1272  	{
  1273  		opt := tcpip.TCPSACKEnabled(true)
  1274  		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
  1275  			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
  1276  		}
  1277  	}
  1278  
  1279  	// Set default TTLs as required by socket/netstack.
  1280  	{
  1281  		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
  1282  		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
  1283  			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
  1284  		}
  1285  		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
  1286  			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
  1287  		}
  1288  	}
  1289  
  1290  	// Enable Receive Buffer Auto-Tuning.
  1291  	{
  1292  		opt := tcpip.TCPModerateReceiveBufferOption(true)
  1293  		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
  1294  			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
  1295  		}
  1296  	}
  1297  
  1298  	return &s, nil
  1299  }
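
        // setSACK is a minimal, hypothetical helper showing the option-setting
        // pattern used above: each tunable is a typed tcpip option applied to a
        // specific protocol via Set{Transport,Network}ProtocolOption.
        func setSACK(s *stack.Stack, enable bool) error {
        	opt := tcpip.TCPSACKEnabled(enable)
        	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
        		return fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
        	}
        	return nil
        }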
  1300  
  1301  // sandboxNetstackCreator implements kernel.NetworkStackCreator.
  1302  //
  1303  // +stateify savable
  1304  type sandboxNetstackCreator struct {
  1305  	clock                    tcpip.Clock
  1306  	uniqueID                 stack.UniqueID
  1307  	allowPacketEndpointWrite bool
  1308  }
  1309  
  1310  // CreateStack implements kernel.NetworkStackCreator.CreateStack.
  1311  func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
  1312  	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite)
  1313  	if err != nil {
  1314  		return nil, err
  1315  	}
  1316  
  1317  	// Setup loopback.
  1318  	n := &Network{Stack: s.(*netstack.Stack).Stack}
  1319  	nicID := tcpip.NICID(f.uniqueID.UniqueID())
  1320  	link := DefaultLoopbackLink
  1321  	linkEP := packetsocket.New(ethernet.New(loopback.New()))
  1322  	opts := stack.NICOptions{Name: link.Name}
  1323  
  1324  	if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
  1325  		return nil, err
  1326  	}
  1327  
  1328  	return s, nil
  1329  }
  1330  
  1331  // signal sends a signal to one or more processes in a container. If PID is 0,
  1332  // then the container init process is used. Depending on the SignalDeliveryMode
  1333  // option, the signal may be sent directly to the indicated process, to all
  1334  // processes in the container, or to the foreground process group. pid is
  1335  // relative to the root PID namespace, not the container's.
  1336  func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
  1337  	if pid < 0 {
  1338  		return fmt.Errorf("PID (%d) cannot be negative", pid)
  1339  	}
  1340  
  1341  	switch mode {
  1342  	case DeliverToProcess:
  1343  		if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
  1344  			return fmt.Errorf("signaling process in container %q PID %d: %w", cid, pid, err)
  1345  		}
  1346  		return nil
  1347  
  1348  	case DeliverToForegroundProcessGroup:
  1349  		if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
  1350  			return fmt.Errorf("signaling foreground process group in container %q PID %d: %w", cid, pid, err)
  1351  		}
  1352  		return nil
  1353  
  1354  	case DeliverToAllProcesses:
  1355  		if pid != 0 {
  1356  			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
  1357  		}
  1358  		// Check that the container has actually started before signaling it.
  1359  		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
  1360  			return err
  1361  		}
  1362  		if err := l.signalAllProcesses(cid, signo); err != nil {
  1363  			return fmt.Errorf("signaling all processes in container %q: %w", cid, err)
  1364  		}
  1365  		return nil
  1366  
  1367  	default:
  1368  		panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
  1369  	}
  1370  }
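
// Illustrative only (hypothetical call sites, not from the original file): the
// three delivery modes correspond to calls such as:
//
//	l.signal(cid, 123, int32(unix.SIGTERM), DeliverToProcess)               // one process
//	l.signal(cid, 123, int32(unix.SIGINT), DeliverToForegroundProcessGroup) // TTY foreground group
//	l.signal(cid, 0, int32(unix.SIGKILL), DeliverToAllProcesses)            // whole container; pid must be 0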
  1371  
  1372  // signalProcess sends the signal to a process in the given container. tgid is
  1373  // relative to the root PID namespace, not the container's.
  1374  func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
  1375  	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
  1376  	if err == nil {
  1377  		// Send signal directly to the identified process.
  1378  		return l.k.SendExternalSignalThreadGroup(execTG, &linux.SignalInfo{Signo: signo})
  1379  	}
  1380  
  1381  	// The caller may be signaling a process not started directly via exec.
  1382  	// In this case, find the process and check that the process belongs to the
  1383  	// container in question.
  1384  	tg := l.k.RootPIDNamespace().ThreadGroupWithID(tgid)
  1385  	if tg == nil {
  1386  		return fmt.Errorf("no such process with PID %d", tgid)
  1387  	}
  1388  	if tg.Leader().ContainerID() != cid {
  1389  		return fmt.Errorf("process %d belongs to a different container: %q", tgid, tg.Leader().ContainerID())
  1390  	}
  1391  	return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
  1392  }
  1393  
  1394  // signalForegroundProcessGroup looks up the foreground process group from the
  1395  // TTY for the given "tgid" inside container "cid", and sends the signal to it.
  1396  func (l *Loader) signalForegroundProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
  1397  	l.mu.Lock()
  1398  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
  1399  	if err != nil {
  1400  		l.mu.Unlock()
  1401  		return fmt.Errorf("no thread group found: %w", err)
  1402  	}
  1403  	if tg == nil {
  1404  		l.mu.Unlock()
  1405  		return fmt.Errorf("container %q not started", cid)
  1406  	}
  1407  
  1408  	tty, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
  1409  	l.mu.Unlock()
  1410  	if err != nil {
  1411  		return fmt.Errorf("no thread group found: %w", err)
  1412  	}
  1413  	if tty == nil {
  1414  		return fmt.Errorf("no TTY attached")
  1415  	}
  1416  	pg := tty.ForegroundProcessGroup()
  1417  	if pg == nil {
  1418  		// No foreground process group has been set. Signal the
  1419  		// original thread group.
  1420  		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
  1421  		return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
  1422  	}
  1423  	// Send the signal to all processes in the process group.
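	// Delivery continues even if sending to one thread group fails; only the
	// last error is returned.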
  1424  	var lastErr error
  1425  	for _, tg := range l.k.TaskSet().Root.ThreadGroups() {
  1426  		if tg.ProcessGroup() != pg {
  1427  			continue
  1428  		}
  1429  		if err := l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo}); err != nil {
  1430  			lastErr = err
  1431  		}
  1432  	}
  1433  	return lastErr
  1434  }
  1435  
  1436  // signalAllProcesses sends the signal to all processes that belong to the
  1437  // specified container. It's a no-op if the container hasn't started or has exited.
  1438  func (l *Loader) signalAllProcesses(cid string, signo int32) error {
  1439  	// Pause the kernel to prevent new processes from being created while
  1440  	// the signal is delivered. This prevents process leaks when SIGKILL is
  1441  	// sent to the entire container.
  1442  	l.k.Pause()
  1443  	defer l.k.Unpause()
  1444  	return l.k.SendContainerSignal(cid, &linux.SignalInfo{Signo: signo})
  1445  }
  1446  
  1447  // threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
  1448  // acquires the mutex before calling it and fails if the container hasn't
  1449  // started yet.
  1450  func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
  1451  	l.mu.Lock()
  1452  	defer l.mu.Unlock()
  1453  	tg, err := l.tryThreadGroupFromIDLocked(key)
  1454  	if err != nil {
  1455  		return nil, err
  1456  	}
  1457  	if tg == nil {
  1458  		return nil, fmt.Errorf("container %q not started", key.cid)
  1459  	}
  1460  	return tg, nil
  1461  }
  1462  
  1463  // tryThreadGroupFromIDLocked returns the thread group for the given execution
  1464  // ID. It may return nil if the container has not started yet. It returns an
  1465  // error if the execution ID is invalid or the container cannot be found
  1466  // (perhaps it has been deleted). The caller must hold 'mu'.
  1467  func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
  1468  	ep := l.processes[key]
  1469  	if ep == nil {
  1470  		return nil, fmt.Errorf("container %q not found", key.cid)
  1471  	}
  1472  	return ep.tg, nil
  1473  }
  1474  
  1475  // ttyFromIDLocked returns the TTY file description for the given execution ID.
  1476  // It may return nil if the container has not started yet. It returns an error
  1477  // if the execution ID is invalid or the container cannot be found (perhaps it
  1478  // has been deleted). The caller must hold 'mu'.
  1479  func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileDescription, error) {
  1480  	ep := l.processes[key]
  1481  	if ep == nil {
  1482  		return nil, fmt.Errorf("container %q not found", key.cid)
  1483  	}
  1484  	return ep.tty, nil
  1485  }
  1486  
  1487  func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, passFDs []fdMapping, user specs.User) (*kernel.FDTable, *host.TTYFileDescription, error) {
  1488  	if len(stdioFDs) != 3 {
  1489  		return nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
  1490  	}
  1491  	fdMap := map[int]*fd.FD{
  1492  		0: stdioFDs[0],
  1493  		1: stdioFDs[1],
  1494  		2: stdioFDs[2],
  1495  	}
  1496  
  1497  	// Create the entries for the host files that were passed to our app.
  1498  	for _, customFD := range passFDs {
  1499  		if customFD.guest < 0 {
  1500  			return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater")
  1501  		}
  1502  		fdMap[customFD.guest] = customFD.host
  1503  	}
  1504  
  1505  	k := kernel.KernelFromContext(ctx)
  1506  	fdTable := k.NewFDTable()
  1507  	ttyFile, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), fdMap)
  1508  	if err != nil {
  1509  		fdTable.DecRef(ctx)
  1510  		return nil, nil, err
  1511  	}
  1512  	return fdTable, ttyFile, nil
  1513  }
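
// Illustrative only (hypothetical values, not from the original file): with
// passFDs = []fdMapping{{guest: 5, host: h}}, the map handed to
// fdimport.Import is:
//
//	map[int]*fd.FD{0: stdioFDs[0], 1: stdioFDs[1], 2: stdioFDs[2], 5: h}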
  1514  
  1515  // portForward initiates a port forward connection in the sandbox. Each
  1516  // portForwardProxy represents a pair of connections, each copying to the other
  1517  // (read ends to write ends) in goroutines. The proxies are stored and can be
  1518  // cleaned up, or they clean up after themselves if the connection breaks.
  1519  func (l *Loader) portForward(opts *PortForwardOpts) error {
  1520  	// Validate that we have a stream FD to write to. If it is missing, the
  1521  	// urpc client misbehaved or a bug occurred.
  1522  	if len(opts.Files) != 1 {
  1523  		return fmt.Errorf("stream FD is required for port forward")
  1524  	}
  1525  
  1526  	l.mu.Lock()
  1527  	defer l.mu.Unlock()
  1528  
  1529  	cid := opts.ContainerID
  1530  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
  1531  	if err != nil {
  1532  		return fmt.Errorf("failed to get threadgroup from %q: %w", cid, err)
  1533  	}
  1534  	if tg == nil {
  1535  		return fmt.Errorf("container %q not started", cid)
  1536  	}
  1537  
  1538  	// Import the fd for the UDS.
  1539  	ctx := l.k.SupervisorContext()
  1540  	fd, err := l.importFD(ctx, opts.Files[0])
  1541  	if err != nil {
  1542  		return fmt.Errorf("importing stream fd: %w", err)
  1543  	}
  1544  	cu := cleanup.Make(func() { fd.DecRef(ctx) })
  1545  	defer cu.Clean()
  1546  
  1547  	fdConn := pf.NewFileDescriptionConn(fd)
  1548  
  1549  	// Create a proxy to forward data between the fdConn and the sandboxed application.
  1550  	pair := pf.ProxyPair{To: fdConn}
  1551  
  1552  	switch l.root.conf.Network {
  1553  	case config.NetworkSandbox:
  1554  		stack := l.k.RootNetworkNamespace().Stack().(*netstack.Stack).Stack
  1555  		nsConn, err := pf.NewNetstackConn(stack, opts.Port)
  1556  		if err != nil {
  1557  			return fmt.Errorf("creating netstack port forward connection: %w", err)
  1558  		}
  1559  		pair.From = nsConn
  1560  	case config.NetworkHost:
  1561  		hConn, err := pf.NewHostInetConn(opts.Port)
  1562  		if err != nil {
  1563  			return fmt.Errorf("creating hostinet port forward connection: %w", err)
  1564  		}
  1565  		pair.From = hConn
  1566  	default:
  1567  		return fmt.Errorf("unsupported network type %q for container %q", l.root.conf.Network, cid)
  1568  	}
  1569  	cu.Release()
  1570  	proxy := pf.NewProxy(pair, opts.ContainerID)
  1571  
  1572  	// Add to the list of port forward connections and remove when the
  1573  	// connection closes.
  1574  	l.portForwardProxies = append(l.portForwardProxies, proxy)
  1575  	proxy.AddCleanup(func() {
  1576  		l.mu.Lock()
  1577  		defer l.mu.Unlock()
  1578  		for i := range l.portForwardProxies {
  1579  			if l.portForwardProxies[i] == proxy {
  1580  				l.portForwardProxies = append(l.portForwardProxies[:i], l.portForwardProxies[i+1:]...)
  1581  				break
  1582  			}
  1583  		}
  1584  	})
  1585  
  1586  	// Start forwarding on the connection.
  1587  	proxy.Start(ctx)
  1588  	return nil
  1589  }
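
// Illustrative only (not part of the original file): the data flow of an
// established port forward, with each direction copied by its own goroutine:
//
//	urpc client FD (fdConn) <--ProxyPair--> in-sandbox connection (netstack or hostinet, opts.Port)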
  1590  
  1591  // importFD generically imports a host file descriptor without adding it to any
  1592  // fd table.
  1593  func (l *Loader) importFD(ctx context.Context, f *os.File) (*vfs.FileDescription, error) {
  1594  	hostFD, err := fd.NewFromFile(f)
  1595  	if err != nil {
  1596  		return nil, err
  1597  	}
  1598  	defer hostFD.Close()
  1599  	fd, err := host.NewFD(ctx, l.k.HostMount(), hostFD.FD(), &host.NewFDOptions{
  1600  		Savable:      false, // We disconnect and close on save.
  1601  		IsTTY:        false,
  1602  		VirtualOwner: false, // The FD is not visible to the sandboxed app, so the owner can't be changed.
  1603  	})
  1604  
  1605  	if err != nil {
  1606  		return nil, err
  1607  	}
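	// Release ownership of the host FD: it now belongs to the returned
	// FileDescription, so the deferred Close above no longer closes it.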
  1608  	hostFD.Release()
  1609  	return fd, nil
  1610  }
  1611  
  1612  func (l *Loader) containerCount() int {
  1613  	l.mu.Lock()
  1614  	defer l.mu.Unlock()
  1615  
  1616  	containers := 0
  1617  	for id := range l.processes {
  1618  		if id.pid == 0 {
  1619  			// pid==0 represents the init process of a container. There is
  1620  			// only one such process per container.
  1621  			containers++
  1622  		}
  1623  	}
  1624  	return containers
  1625  }