github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/boot/loader.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package boot loads the kernel and runs a container.
    16  package boot
    17  
    18  import (
    19  	"errors"
    20  	"fmt"
    21  	mrand "math/rand"
    22  	"os"
    23  	"runtime"
    24  	gtime "time"
    25  
    26  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    27  	"github.com/MerlinKodo/gvisor/pkg/bpf"
    28  	"github.com/MerlinKodo/gvisor/pkg/cleanup"
    29  	"github.com/MerlinKodo/gvisor/pkg/context"
    30  	"github.com/MerlinKodo/gvisor/pkg/coverage"
    31  	"github.com/MerlinKodo/gvisor/pkg/cpuid"
    32  	"github.com/MerlinKodo/gvisor/pkg/fd"
    33  	"github.com/MerlinKodo/gvisor/pkg/log"
    34  	"github.com/MerlinKodo/gvisor/pkg/memutil"
    35  	"github.com/MerlinKodo/gvisor/pkg/rand"
    36  	"github.com/MerlinKodo/gvisor/pkg/refs"
    37  	"github.com/MerlinKodo/gvisor/pkg/sentry/control"
    38  	"github.com/MerlinKodo/gvisor/pkg/sentry/fdimport"
    39  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/host"
    40  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/tmpfs"
    41  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/user"
    42  	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
    43  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
    44  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    45  	"github.com/MerlinKodo/gvisor/pkg/sentry/loader"
    46  	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
    47  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
    48  	"github.com/MerlinKodo/gvisor/pkg/sentry/seccheck"
    49  	pb "github.com/MerlinKodo/gvisor/pkg/sentry/seccheck/points/points_go_proto"
    50  	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/netfilter"
    51  	"github.com/MerlinKodo/gvisor/pkg/sentry/time"
    52  	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
    53  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    54  	"github.com/MerlinKodo/gvisor/pkg/sentry/watchdog"
    55  	"github.com/MerlinKodo/gvisor/pkg/sighandling"
    56  	"github.com/MerlinKodo/gvisor/pkg/sync"
    57  	"github.com/MerlinKodo/gvisor/pkg/tcpip"
    58  	"github.com/MerlinKodo/gvisor/pkg/tcpip/link/ethernet"
    59  	"github.com/MerlinKodo/gvisor/pkg/tcpip/link/loopback"
    60  	"github.com/MerlinKodo/gvisor/pkg/tcpip/link/packetsocket"
    61  	"github.com/MerlinKodo/gvisor/pkg/tcpip/link/sniffer"
    62  	"github.com/MerlinKodo/gvisor/pkg/tcpip/network/arp"
    63  	"github.com/MerlinKodo/gvisor/pkg/tcpip/network/ipv4"
    64  	"github.com/MerlinKodo/gvisor/pkg/tcpip/network/ipv6"
    65  	"github.com/MerlinKodo/gvisor/pkg/tcpip/stack"
    66  	"github.com/MerlinKodo/gvisor/pkg/tcpip/transport/icmp"
    67  	"github.com/MerlinKodo/gvisor/pkg/tcpip/transport/raw"
    68  	"github.com/MerlinKodo/gvisor/pkg/tcpip/transport/tcp"
    69  	"github.com/MerlinKodo/gvisor/pkg/tcpip/transport/udp"
    70  	"github.com/MerlinKodo/gvisor/runsc/boot/filter"
    71  	_ "github.com/MerlinKodo/gvisor/runsc/boot/platforms" // register all platforms.
    72  	pf "github.com/MerlinKodo/gvisor/runsc/boot/portforward"
    73  	"github.com/MerlinKodo/gvisor/runsc/boot/pprof"
    74  	"github.com/MerlinKodo/gvisor/runsc/config"
    75  	"github.com/MerlinKodo/gvisor/runsc/profile"
    76  	"github.com/MerlinKodo/gvisor/runsc/specutils"
    77  	"github.com/MerlinKodo/gvisor/runsc/specutils/seccomp"
    78  	specs "github.com/opencontainers/runtime-spec/specs-go"
    79  	"github.com/syndtr/gocapability/capability"
    80  	"golang.org/x/sys/unix"
    81  
    82  	// Top-level inet providers.
    83  	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/hostinet"
    84  	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/netstack"
    85  
    86  	// Include other supported socket providers.
    87  	_ "github.com/MerlinKodo/gvisor/pkg/sentry/socket/netlink"
    88  	_ "github.com/MerlinKodo/gvisor/pkg/sentry/socket/netlink/route"
    89  	_ "github.com/MerlinKodo/gvisor/pkg/sentry/socket/netlink/uevent"
    90  	_ "github.com/MerlinKodo/gvisor/pkg/sentry/socket/unix"
    91  )
    92  
    93  type containerInfo struct {
    94  	conf *config.Config
    95  
    96  	// spec is the base configuration for the root container.
    97  	spec *specs.Spec
    98  
    99  	// procArgs refers to the container's init task.
   100  	procArgs kernel.CreateProcessArgs
   101  
   102  	// stdioFDs contains stdin, stdout, and stderr.
   103  	stdioFDs []*fd.FD
   104  
   105  	// passFDs are mappings of user-supplied host to guest file descriptors.
   106  	passFDs []fdMapping
   107  
   108  	// execFD is the host file descriptor used for program execution.
   109  	execFD *fd.FD
   110  
   111  	// goferFDs are the FDs that attach the sandbox to the gofers.
   112  	goferFDs []*fd.FD
   113  
   114  	// overlayFilestoreFDs are the FDs to the regular files that will back the
   115  	// tmpfs upper mount in the overlay mounts.
   116  	overlayFilestoreFDs []*fd.FD
   117  
   118  	// overlayMediums contains information about how the gofer mounts have been
   119  	// overlaid. The first entry is for rootfs and the following entries are for
   120  	// bind mounts in spec.Mounts (in the same order).
   121  	overlayMediums []OverlayMedium
   122  
   123  	// nvidiaUVMDevMajor is the device major number used for nvidia-uvm.
   124  	nvidiaUVMDevMajor uint32
   125  }
   126  
   127  // Loader keeps state needed to start the kernel and run the container.
   128  type Loader struct {
   129  	// k is the kernel.
   130  	k *kernel.Kernel
   131  
   132  	// ctrl is the control server.
   133  	ctrl *controller
   134  
   135  	// root contains information about the root container in the sandbox.
   136  	root containerInfo
   137  
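        	// watchdog detects stuck tasks and, on timeout, takes the action
        	// configured via --watchdog-action (see dogOpts in New).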
   138  	watchdog *watchdog.Watchdog
   139  
   140  	// stopSignalForwarding disables forwarding of signals to the sandboxed
   141  	// container. It should be called when a sandbox is destroyed.
   142  	stopSignalForwarding func()
   143  
   144  	// stopProfiling stops profiling started at container creation. It
   145  	// should be called when a sandbox is destroyed.
   146  	stopProfiling func()
   147  
   148  	// PreSeccompCallback is called right before installing seccomp filters.
   149  	PreSeccompCallback func()
   150  
   151  	// restore is set to true if we are restoring a container.
   152  	restore bool
   153  
   154  	// sandboxID is the ID for the whole sandbox.
   155  	sandboxID string
   156  
   157  	// mountHints provides extra information about container mounts that
   158  	// apply to the entire pod.
   159  	mountHints *PodMountHints
   160  
   161  	// sharedMounts holds VFS mounts that may be shared between containers
   162  	// within the same pod. It is mapped by mount source.
   163  	sharedMounts map[string]*vfs.Mount
   164  
   165  	// productName is the value to show in
   166  	// /sys/devices/virtual/dmi/id/product_name.
   167  	productName string
   168  
   169  	// nvidiaUVMDevMajor is the device major number used for nvidia-uvm.
   170  	nvidiaUVMDevMajor uint32
   171  
   172  	// mu guards processes and portForwardProxies.
   173  	mu sync.Mutex
   174  
   175  	// processes maps containers' init processes and exec invocations. Root
   176  	// processes are keyed with container ID and pid=0, while exec invocations
   177  	// have the corresponding pid set.
   178  	//
   179  	// processes is guarded by mu.
   180  	processes map[execID]*execProcess
   181  
   182  	// portForwardProxies is a list of active port forwarding connections.
   183  	//
   184  	// portForwardProxies is guarded by mu.
   185  	portForwardProxies []*pf.Proxy
   186  }
   187  
   188  // execID uniquely identifies a sentry process that is executed in a container.
   189  type execID struct {
   190  	cid string
   191  	pid kernel.ThreadID
   192  }
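        // For example, execID{cid: "abc", pid: 0} identifies container "abc"'s
        // init process, while execID{cid: "abc", pid: 42} identifies a process
        // exec'd into it with TGID 42 (hypothetical values).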
   193  
   194  // execProcess contains the thread group and host TTY of a sentry process.
   195  type execProcess struct {
   196  	// tg will be nil for containers that haven't started yet.
   197  	tg *kernel.ThreadGroup
   198  
   199  	// tty will be nil if the process is not attached to a terminal.
   200  	tty *host.TTYFileDescription
   201  
   202  	// pidnsPath is the PID namespace path in the spec.
   203  	pidnsPath string
   204  
   205  	// hostTTY is present when creating a sub-container with terminal enabled.
   206  	// TTY file is passed during container create and must be saved until
   207  	// container start.
   208  	hostTTY *fd.FD
   209  }
   210  
   211  // fdMapping maps guest to host file descriptors. Guest file descriptors are
   212  // exposed to the application inside the sandbox through the FD table.
   213  type fdMapping struct {
   214  	guest int    // guest is the FD number seen by the sandboxed application.
   215  	host  *fd.FD // host is the owned host FD backing guest.
   216  }
   217  
   218  // FDMapping is a helper type to represent a mapping from guest to host file
   219  // descriptors. In contrast to the unexported fdMapping type, it does not imply
   220  // file ownership.
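        // For example, FDMapping{Guest: 7, Host: 42} exposes the host's FD 42 as
        // FD 7 inside the sandbox (hypothetical numbers, for illustration only).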
   221  type FDMapping struct {
   222  	Guest int
   223  	Host  int
   224  }
   225  
   226  func init() {
   227  	// Initialize the random number generator.
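        	// This seeds only math/rand's non-cryptographic generator; secure
        	// randomness comes from pkg/rand, which New initializes via rand.Init.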
   228  	mrand.Seed(gtime.Now().UnixNano())
   229  }
   230  
   231  // Args are the arguments for New().
   232  type Args struct {
   233  	// ID is the sandbox ID.
   234  	ID string
   235  	// Spec is the sandbox specification.
   236  	Spec *specs.Spec
   237  	// Conf is the system configuration.
   238  	Conf *config.Config
   239  	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
   240  	// of this FD and may close it at any time.
   241  	ControllerFD int
   242  	// Device is an optional argument that is passed to the platform. The Loader
   243  	// takes ownership of this file and may close it at any time.
   244  	Device *os.File
   245  	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
   246  	// takes ownership of these FDs and may close them at any time.
   247  	GoferFDs []int
   248  	// StdioFDs is the stdio for the application. The Loader takes ownership of
   249  	// these FDs and may close them at any time.
   250  	StdioFDs []int
   251  	// PassFDs are user-supplied FD mappings from host to guest descriptors.
   252  	// The Loader takes ownership of these FDs and may close them at any time.
   253  	PassFDs []FDMapping
   254  	// ExecFD is the host file descriptor used for program execution.
   255  	ExecFD int
   256  	// OverlayFilestoreFDs are the FDs to the regular files that will back the
   257  	// tmpfs upper mount in the overlay mounts.
   258  	OverlayFilestoreFDs []int
   259  	// OverlayMediums contains information about how the gofer mounts have been
   260  	// overlaid. The first entry is for rootfs and the following entries are for
   261  	// bind mounts in Spec.Mounts (in the same order).
   262  	OverlayMediums []OverlayMedium
   263  	// NumCPU is the number of CPUs to create inside the sandbox.
   264  	NumCPU int
   265  	// TotalMem is the initial amount of total memory to report back to the
   266  	// container.
   267  	TotalMem uint64
   268  	// TotalHostMem is the total memory reported by host /proc/meminfo.
   269  	TotalHostMem uint64
   270  	// UserLogFD is the file descriptor to write user logs to.
   271  	UserLogFD int
   272  	// ProductName is the value to show in
   273  	// /sys/devices/virtual/dmi/id/product_name.
   274  	ProductName string
   275  	// PodInitConfigFD is the file descriptor to a file passed in the
   276  	// --pod-init-config flag.
   277  	PodInitConfigFD int
   278  	// SinkFDs is an ordered array of file descriptors to be used by seccheck
   279  	// sinks configured from the --pod-init-config file.
   280  	SinkFDs []int
   281  	// ProfileOpts contains the set of profiles to enable and the
   282  	// corresponding FDs where profile data will be written.
   283  	ProfileOpts profile.Opts
   284  }
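        // A minimal sketch of how a caller might fill in Args (hypothetical
        // values; in practice runsc's boot command builds this from flags and
        // inherited host FDs):
        //
        //	l, err := New(Args{
        //		ID:              "sandbox-id",
        //		Spec:            spec,
        //		Conf:            conf,
        //		ControllerFD:    ctrlFD,
        //		StdioFDs:        []int{0, 1, 2},
        //		GoferFDs:        []int{goferFD},
        //		ExecFD:          -1,
        //		PodInitConfigFD: -1,
        //		NumCPU:          4,
        //	})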
   285  
   286  // startingStdioFD makes sure stdioFDs are always the same on initial start and on restore.
   287  const startingStdioFD = 256
   288  
   289  // New initializes a new kernel loader configured by spec.
   290  // New also handles setting up a kernel for restoring a container.
   291  func New(args Args) (*Loader, error) {
   292  	stopProfiling := profile.Start(args.ProfileOpts)
   293  
   294  	// Initialize seccheck points.
   295  	seccheck.Initialize()
   296  
   297  	// We initialize the rand package now to make sure /dev/urandom is pre-opened
   298  	// on kernels that do not support getrandom(2).
   299  	if err := rand.Init(); err != nil {
   300  		return nil, fmt.Errorf("setting up rand: %w", err)
   301  	}
   302  
   303  	if err := usage.Init(); err != nil {
   304  		return nil, fmt.Errorf("setting up memory usage: %w", err)
   305  	}
   306  
   307  	kernel.IOUringEnabled = args.Conf.IOUring
   308  
   309  	info := containerInfo{
   310  		conf:           args.Conf,
   311  		spec:           args.Spec,
   312  		overlayMediums: args.OverlayMediums,
   313  	}
   314  
   315  	// Make host FDs stable between invocations. Host FDs must map to the exact
   316  	// same number when the sandbox is restored. Otherwise the wrong FD will be
   317  	// used.
   318  	newfd := startingStdioFD
   319  
   320  	for _, stdioFD := range args.StdioFDs {
   321  		// Check that newfd is unused to avoid clobbering it.
   322  		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
   323  			if err != nil {
   324  				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
   325  			}
   326  			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
   327  		}
   328  
   329  		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
   330  		if err != nil {
   331  			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
   332  		}
   333  		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
   334  		_ = unix.Close(stdioFD)
   335  		newfd++
   336  	}
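        	// The stdio FDs now occupy a fixed, contiguous range. For example
        	// (hypothetical numbers), inherited FDs {3, 4, 5} now live at
        	// {256, 257, 258} and will land there again after a restore.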
   337  	for _, goferFD := range args.GoferFDs {
   338  		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
   339  	}
   340  	for _, overlayFD := range args.OverlayFilestoreFDs {
   341  		info.overlayFilestoreFDs = append(info.overlayFilestoreFDs, fd.New(overlayFD))
   342  	}
   343  
   344  	if args.ExecFD >= 0 {
   345  		info.execFD = fd.New(args.ExecFD)
   346  	}
   347  
   348  	for _, customFD := range args.PassFDs {
   349  		info.passFDs = append(info.passFDs, fdMapping{
   350  			host:  fd.New(customFD.Host),
   351  			guest: customFD.Guest,
   352  		})
   353  	}
   354  
   355  	// Create kernel and platform.
   356  	p, err := createPlatform(args.Conf, args.Device)
   357  	if err != nil {
   358  		return nil, fmt.Errorf("creating platform: %w", err)
   359  	}
   360  	if args.Conf.NVProxy && p.OwnsPageTables() {
   361  		return nil, fmt.Errorf("--nvproxy is incompatible with platform %s: owns page tables", args.Conf.Platform)
   362  	}
   363  	k := &kernel.Kernel{
   364  		Platform: p,
   365  	}
   366  
   367  	// Create memory file.
   368  	mf, err := createMemoryFile()
   369  	if err != nil {
   370  		return nil, fmt.Errorf("creating memory file: %w", err)
   371  	}
   372  	k.SetMemoryFile(mf)
   373  
   374  	// Create VDSO.
   375  	//
   376  	// Pass k as the platform since it is savable, unlike the actual platform.
   377  	vdso, err := loader.PrepareVDSO(k)
   378  	if err != nil {
   379  		return nil, fmt.Errorf("creating vdso: %w", err)
   380  	}
   381  
   382  	// Create timekeeper.
   383  	tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
   384  	tk.SetClocks(time.NewCalibratedClocks())
   385  
   386  	if err := enableStrace(args.Conf); err != nil {
   387  		return nil, fmt.Errorf("enabling strace: %w", err)
   388  	}
   389  
   390  	// Create capabilities.
   391  	caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
   392  	if err != nil {
   393  		return nil, fmt.Errorf("converting capabilities: %w", err)
   394  	}
   395  
   396  	// Convert the spec's additional GIDs to KGIDs.
   397  	extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
   398  	for _, GID := range args.Spec.Process.User.AdditionalGids {
   399  		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
   400  	}
   401  
   402  	// Create credentials.
   403  	creds := auth.NewUserCredentials(
   404  		auth.KUID(args.Spec.Process.User.UID),
   405  		auth.KGID(args.Spec.Process.User.GID),
   406  		extraKGIDs,
   407  		caps,
   408  		auth.NewRootUserNamespace())
   409  
   410  	// Create root network namespace/stack.
   411  	netns, err := newRootNetworkNamespace(args.Conf, tk, k, creds.UserNamespace)
   412  	if err != nil {
   413  		return nil, fmt.Errorf("creating network: %w", err)
   414  	}
   415  
   416  	if args.NumCPU == 0 {
   417  		args.NumCPU = runtime.NumCPU()
   418  	}
   419  	log.Infof("CPUs: %d", args.NumCPU)
   420  	runtime.GOMAXPROCS(args.NumCPU)
   421  
   422  	if args.TotalHostMem > 0 {
   423  		// As per tmpfs(5), the default size limit is 50% of total physical RAM.
   424  		// See mm/shmem.c:shmem_default_max_blocks().
   425  		tmpfs.SetDefaultSizeLimit(args.TotalHostMem / 2)
   426  	}
   427  
   428  	if args.TotalMem > 0 {
   429  		// Adjust the total memory returned by the Sentry so that applications that
   430  		// use /proc/meminfo can make allocations based on this limit.
   431  		usage.MinimumTotalMemoryBytes = args.TotalMem
   432  		usage.MaximumTotalMemoryBytes = args.TotalMem
   433  		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
   434  	}
   435  
   436  	// Initialize the Kernel object, which is required by the Context passed
   437  	// to createVFS in order to mount (among other things) procfs.
   438  	if err = k.Init(kernel.InitKernelArgs{
   439  		FeatureSet:                  cpuid.HostFeatureSet().Fixed(),
   440  		Timekeeper:                  tk,
   441  		RootUserNamespace:           creds.UserNamespace,
   442  		RootNetworkNamespace:        netns,
   443  		ApplicationCores:            uint(args.NumCPU),
   444  		Vdso:                        vdso,
   445  		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
   446  		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
   447  		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
   448  		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
   449  	}); err != nil {
   450  		return nil, fmt.Errorf("initializing kernel: %w", err)
   451  	}
   452  
   453  	if err := registerFilesystems(k, &info); err != nil {
   454  		return nil, fmt.Errorf("registering filesystems: %w", err)
   455  	}
   456  
   457  	// Turn on packet logging if enabled.
   458  	if args.Conf.LogPackets {
   459  		log.Infof("Packet logging enabled")
   460  		sniffer.LogPackets.Store(1)
   461  	} else {
   462  		log.Infof("Packet logging disabled")
   463  		sniffer.LogPackets.Store(0)
   464  	}
   465  
   466  	// Create a watchdog.
   467  	dogOpts := watchdog.DefaultOpts
   468  	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
   469  	dog := watchdog.New(k, dogOpts)
   470  
   471  	procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
   472  	if err != nil {
   473  		return nil, fmt.Errorf("creating init process for root container: %w", err)
   474  	}
   475  	info.procArgs = procArgs
   476  
   477  	if err := initCompatLogs(args.UserLogFD); err != nil {
   478  		return nil, fmt.Errorf("initializing compat logs: %w", err)
   479  	}
   480  
   481  	mountHints, err := NewPodMountHints(args.Spec)
   482  	if err != nil {
   483  		return nil, fmt.Errorf("creating pod mount hints: %w", err)
   484  	}
   485  
   486  	// Set up host mount that will be used for imported fds.
   487  	hostFilesystem, err := host.NewFilesystem(k.VFS())
   488  	if err != nil {
   489  		return nil, fmt.Errorf("failed to create hostfs filesystem: %w", err)
   490  	}
   491  	defer hostFilesystem.DecRef(k.SupervisorContext())
   492  	k.SetHostMount(k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}))
   493  
   494  	if args.PodInitConfigFD >= 0 {
   495  		if err := setupSeccheck(args.PodInitConfigFD, args.SinkFDs); err != nil {
   496  			log.Warningf("unable to configure event session: %v", err)
   497  		}
   498  	}
   499  
   500  	eid := execID{cid: args.ID}
   501  	l := &Loader{
   502  		k:                 k,
   503  		watchdog:          dog,
   504  		sandboxID:         args.ID,
   505  		processes:         map[execID]*execProcess{eid: {}},
   506  		mountHints:        mountHints,
   507  		root:              info,
   508  		stopProfiling:     stopProfiling,
   509  		productName:       args.ProductName,
   510  		nvidiaUVMDevMajor: info.nvidiaUVMDevMajor,
   511  	}
   512  
   513  	// We don't care about child signals; some platforms can generate a
   514  	// tremendous number of useless ones (I'm looking at you, ptrace).
   515  	if err := sighandling.IgnoreChildStop(); err != nil {
   516  		return nil, fmt.Errorf("ignore child stop signals failed: %w", err)
   517  	}
   518  
   519  	// Create the control server using the provided FD.
   520  	//
   521  	// This must be done *after* we have initialized the kernel since the
   522  	// controller is used to configure the kernel's network stack.
   523  	ctrl, err := newController(args.ControllerFD, l)
   524  	if err != nil {
   525  		return nil, fmt.Errorf("creating control server: %w", err)
   526  	}
   527  	l.ctrl = ctrl
   528  
   529  	// Only start serving after the Loader is set on the controller and the
   530  	// controller is set on the Loader, because both are used in the urpc methods.
   531  	if err := ctrl.srv.StartServing(); err != nil {
   532  		return nil, fmt.Errorf("starting control server: %w", err)
   533  	}
   534  
   535  	return l, nil
   536  }
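        // A sketch of how runsc's boot command drives a Loader (simplified; error
        // handling elided):
        //
        //	l, err := New(args)    // set up kernel, platform, VFS and controller
        //	l.WaitForStartSignal() // block until the manager receives a start RPC
        //	err = l.Run()          // create and start the root container process
        //	ws := l.WaitExit()     // wait for the root container to exit
        //	l.Destroy()            // release all Loader resources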
   537  
   538  // createProcessArgs creates args that can be used with kernel.CreateProcess.
   539  func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
   540  	// Create initial limits.
   541  	ls, err := createLimitSet(spec)
   542  	if err != nil {
   543  		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %w", err)
   544  	}
   545  	env, err := specutils.ResolveEnvs(spec.Process.Env)
   546  	if err != nil {
   547  		return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err)
   548  	}
   549  
   550  	wd := spec.Process.Cwd
   551  	if wd == "" {
   552  		wd = "/"
   553  	}
   554  
   555  	// Create the process arguments.
   556  	procArgs := kernel.CreateProcessArgs{
   557  		Argv:                    spec.Process.Args,
   558  		Envv:                    env,
   559  		WorkingDirectory:        wd,
   560  		Credentials:             creds,
   561  		Umask:                   0022,
   562  		Limits:                  ls,
   563  		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
   564  		UTSNamespace:            k.RootUTSNamespace(),
   565  		IPCNamespace:            k.RootIPCNamespace(),
   566  		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
   567  		ContainerID:             id,
   568  		PIDNamespace:            pidns,
   569  	}
   570  
   571  	return procArgs, nil
   572  }
   573  
   574  // Destroy cleans up all resources used by the loader.
   575  //
   576  // Note that this will block until all open control server connections have
   577  // been closed. For that reason, this should NOT be called in a defer, because
   578  // a panic in a control server rpc would then hang forever.
   579  func (l *Loader) Destroy() {
   580  	if l.stopSignalForwarding != nil {
   581  		l.stopSignalForwarding()
   582  	}
   583  	l.watchdog.Stop()
   584  
   585  	// Stop the control server. This will indirectly stop any
   586  	// long-running control operations that are in flight, e.g.
   587  	// profiling operations.
   588  	l.ctrl.stop()
   589  
   590  	// Release all kernel resources. This is only safe after we can no longer
   591  	// save/restore.
   592  	l.k.Release()
   593  
   594  	// Release any dangling tcp connections.
   595  	tcpip.ReleaseDanglingEndpoints()
   596  
   597  	// In the success case, stdioFDs and goferFDs will only contain
   598  	// released/closed FDs whose ownership has been passed on to host FDs and
   599  	// gofer sessions. Close them here in case of failure.
   600  	for _, f := range l.root.stdioFDs {
   601  		_ = f.Close()
   602  	}
   603  	for _, f := range l.root.passFDs {
   604  		_ = f.host.Close()
   605  	}
   606  	for _, f := range l.root.goferFDs {
   607  		_ = f.Close()
   608  	}
   609  
   610  	l.stopProfiling()
   611  }
   612  
   613  func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
   614  	p, err := platform.Lookup(conf.Platform)
   615  	if err != nil {
   616  		panic(fmt.Sprintf("invalid platform %s: %s", conf.Platform, err))
   617  	}
   618  	log.Infof("Platform: %s", conf.Platform)
   619  	return p.New(deviceFile)
   620  }
   621  
   622  func createMemoryFile() (*pgalloc.MemoryFile, error) {
   623  	const memfileName = "runsc-memory"
   624  	memfd, err := memutil.CreateMemFD(memfileName, 0)
   625  	if err != nil {
   626  		return nil, fmt.Errorf("error creating memfd: %w", err)
   627  	}
   628  	memfile := os.NewFile(uintptr(memfd), memfileName)
   629  	// We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
   630  	// there are memory cgroups specified, because at this point we're already
   631  	// in a mount namespace in which the relevant cgroupfs is not visible.
   632  	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
   633  	if err != nil {
   634  		_ = memfile.Close()
   635  		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err)
   636  	}
   637  	return mf, nil
   638  }
   639  
   640  // installSeccompFilters installs sandbox seccomp filters with the host.
   641  func (l *Loader) installSeccompFilters() error {
   642  	if l.PreSeccompCallback != nil {
   643  		l.PreSeccompCallback()
   644  	}
   645  	if l.root.conf.DisableSeccomp {
   646  		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
   647  	} else {
   648  		hostnet := l.root.conf.Network == config.NetworkHost
   649  		opts := filter.Options{
   650  			Platform:              l.k.Platform,
   651  			HostNetwork:           hostnet,
   652  			HostNetworkRawSockets: hostnet && l.root.conf.EnableRaw,
   653  			HostFilesystem:        l.root.conf.DirectFS,
   654  			ProfileEnable:         l.root.conf.ProfileEnable,
   655  			NVProxy:               l.root.conf.NVProxy,
   656  			TPUProxy:              l.root.conf.TPUProxy,
   657  			ControllerFD:          l.ctrl.srv.FD(),
   658  		}
   659  		if err := filter.Install(opts); err != nil {
   660  			return fmt.Errorf("installing seccomp filters: %w", err)
   661  		}
   662  	}
   663  	return nil
   664  }
   665  
   666  // Run runs the root container.
   667  func (l *Loader) Run() error {
   668  	err := l.run()
   669  	l.ctrl.manager.startResultChan <- err
   670  	if err != nil {
   671  		// Give the controller some time to send the error to the
   672  		// runtime. If we return too quickly here the process will exit
   673  		// and the control connection will be closed before the error
   674  		// is returned.
   675  		gtime.Sleep(2 * gtime.Second)
   676  		return err
   677  	}
   678  	return nil
   679  }
   680  
   681  func (l *Loader) run() error {
   682  	if l.root.conf.Network == config.NetworkHost {
   683  		// Delay host network configuration to this point because network namespace
   684  		// is configured after the loader is created and before Run() is called.
   685  		log.Debugf("Configuring host network")
   686  		s := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
   687  		if err := s.Configure(l.root.conf.EnableRaw); err != nil {
   688  			return err
   689  		}
   690  	}
   691  
   692  	l.mu.Lock()
   693  	defer l.mu.Unlock()
   694  
   695  	eid := execID{cid: l.sandboxID}
   696  	ep, ok := l.processes[eid]
   697  	if !ok {
   698  		return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
   699  	}
   700  
   701  	// If we are restoring, we do not want to create a process.
   702  	// l.restore is set by the container manager when a restore call is made.
   703  	if !l.restore {
   704  		if l.root.conf.ProfileEnable {
   705  			pprof.Initialize()
   706  		}
   707  
   708  		// Finally done with all configuration. Set up filters before user code
   709  		// is loaded.
   710  		if err := l.installSeccompFilters(); err != nil {
   711  			return err
   712  		}
   713  
   714  		// Create the root container init task. It will begin running
   715  		// when the kernel is started.
   716  		var (
   717  			tg  *kernel.ThreadGroup
   718  			err error
   719  		)
   720  		tg, ep.tty, err = l.createContainerProcess(true, l.sandboxID, &l.root)
   721  		if err != nil {
   722  			return err
   723  		}
   724  
   725  		if seccheck.Global.Enabled(seccheck.PointContainerStart) {
   726  			evt := pb.Start{
   727  				Id:       l.sandboxID,
   728  				Cwd:      l.root.spec.Process.Cwd,
   729  				Args:     l.root.spec.Process.Args,
   730  				Terminal: l.root.spec.Process.Terminal,
   731  			}
   732  			fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart)
   733  			if fields.Local.Contains(seccheck.FieldContainerStartEnv) {
   734  				evt.Env = l.root.spec.Process.Env
   735  			}
   736  			if !fields.Context.Empty() {
   737  				evt.ContextData = &pb.ContextData{}
   738  				kernel.LoadSeccheckData(tg.Leader(), fields.Context, evt.ContextData)
   739  			}
   740  			_ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   741  				return c.ContainerStart(context.Background(), fields, &evt)
   742  			})
   743  		}
   744  	}
   745  
   746  	ep.tg = l.k.GlobalInit()
   747  	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
   748  		ep.pidnsPath = ns.Path
   749  	}
   750  
   751  	// Handle signals by forwarding them to the root container process
   752  	// (except for panic signal, which should cause a panic).
   753  	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
   754  		// Panic signal should cause a panic.
   755  		if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
   756  			panic("Signal-induced panic")
   757  		}
   758  
   759  		// Otherwise forward to root container.
   760  		deliveryMode := DeliverToProcess
   761  		if l.root.spec.Process.Terminal {
   762  			// Since we are running with a console, we should forward the signal to
   763  			// the foreground process group so that job control signals like ^C can
   764  			// be handled properly.
   765  			deliveryMode = DeliverToForegroundProcessGroup
   766  		}
   767  		log.Infof("Received external signal %d, mode: %s", sig, deliveryMode)
   768  		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
   769  			log.Warningf("error sending signal %s to container %q: %s", sig, l.sandboxID, err)
   770  		}
   771  	})
   772  
   773  	log.Infof("Process should have started...")
   774  	l.watchdog.Start()
   775  	return l.k.Start()
   776  }
   777  
   778  // createSubcontainer creates a new container inside the sandbox.
   779  func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error {
   780  	l.mu.Lock()
   781  	defer l.mu.Unlock()
   782  
   783  	eid := execID{cid: cid}
   784  	if _, ok := l.processes[eid]; ok {
   785  		return fmt.Errorf("container %q already exists", cid)
   786  	}
   787  	l.processes[eid] = &execProcess{hostTTY: tty}
   788  	return nil
   789  }
   790  
   791  // startSubcontainer starts a child container. Used FDs are either closed or
   792  // released. It's safe for the caller to close any remaining files upon
   793  // return.
   794  func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs, overlayFilestoreFDs []*fd.FD, overlayMediums []OverlayMedium) error {
   795  	// Create capabilities.
   796  	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
   797  	if err != nil {
   798  		return fmt.Errorf("creating capabilities: %w", err)
   799  	}
   800  
   801  	l.mu.Lock()
   802  	defer l.mu.Unlock()
   803  
   804  	ep := l.processes[execID{cid: cid}]
   805  	if ep == nil {
   806  		return fmt.Errorf("trying to start a deleted container %q", cid)
   807  	}
   808  
   809  	// Convert the spec's additional GIDs to KGIDs.
   810  	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
   811  	for _, GID := range spec.Process.User.AdditionalGids {
   812  		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
   813  	}
   814  
   815  	// Create credentials. We reuse the root user namespace because the
   816  	// sentry currently supports only 1 mount namespace, which is tied to a
   817  	// single user namespace. Thus we must run in the same user namespace
   818  	// to access mounts.
   819  	creds := auth.NewUserCredentials(
   820  		auth.KUID(spec.Process.User.UID),
   821  		auth.KGID(spec.Process.User.GID),
   822  		extraKGIDs,
   823  		caps,
   824  		l.k.RootUserNamespace())
   825  
   826  	var pidns *kernel.PIDNamespace
   827  	if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok {
   828  		if ns.Path != "" {
   829  			for _, p := range l.processes {
   830  				if ns.Path == p.pidnsPath {
   831  					log.Debugf("Joining PID namespace named %q", ns.Path)
   832  					pidns = p.tg.PIDNamespace()
   833  					break
   834  				}
   835  			}
   836  		}
   837  		if pidns == nil {
   838  			log.Warningf("PID namespace %q not found, running in new PID namespace", ns.Path)
   839  			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
   840  		}
   841  		ep.pidnsPath = ns.Path
   842  	} else {
   843  		pidns = l.k.RootPIDNamespace()
   844  	}
   845  
   846  	info := &containerInfo{
   847  		conf:                conf,
   848  		spec:                spec,
   849  		goferFDs:            goferFDs,
   850  		overlayFilestoreFDs: overlayFilestoreFDs,
   851  		overlayMediums:      overlayMediums,
   852  		nvidiaUVMDevMajor:   l.nvidiaUVMDevMajor,
   853  	}
   854  	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
   855  	if err != nil {
   856  		return fmt.Errorf("creating new process: %w", err)
   857  	}
   858  
   859  	// Use stdios or TTY depending on the spec configuration.
   860  	if spec.Process.Terminal {
   861  		if n := len(stdioFDs); n != 0 {
   862  			return fmt.Errorf("using TTY, stdios not expected: %d", n)
   863  		}
   864  		if ep.hostTTY == nil {
   865  			return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?")
   866  		}
   867  		info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
   868  		ep.hostTTY = nil
   869  	} else {
   870  		info.stdioFDs = stdioFDs
   871  	}
   872  
   873  	ep.tg, ep.tty, err = l.createContainerProcess(false, cid, info)
   874  	if err != nil {
   875  		return err
   876  	}
   877  
   878  	if seccheck.Global.Enabled(seccheck.PointContainerStart) {
   879  		evt := pb.Start{
   880  			Id:       cid,
   881  			Cwd:      spec.Process.Cwd,
   882  			Args:     spec.Process.Args,
   883  			Terminal: spec.Process.Terminal,
   884  		}
   885  		fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart)
   886  		if fields.Local.Contains(seccheck.FieldContainerStartEnv) {
   887  			evt.Env = spec.Process.Env
   888  		}
   889  		if !fields.Context.Empty() {
   890  			evt.ContextData = &pb.ContextData{}
   891  			kernel.LoadSeccheckData(ep.tg.Leader(), fields.Context, evt.ContextData)
   892  		}
   893  		_ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   894  			return c.ContainerStart(context.Background(), fields, &evt)
   895  		})
   896  	}
   897  
   898  	l.k.StartProcess(ep.tg)
   899  	return nil
   900  }
   901  
   902  func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileDescription, error) {
   903  	// Create the FD map, which will set stdin, stdout, and stderr.
   904  	ctx := info.procArgs.NewContext(l.k)
   905  	fdTable, ttyFile, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.passFDs, info.spec.Process.User)
   906  	if err != nil {
   907  		return nil, nil, fmt.Errorf("importing fds: %w", err)
   908  	}
   909  	// CreateProcess takes a reference on fdTable if successful. We won't need
   910  	// ours either way.
   911  	info.procArgs.FDTable = fdTable
   912  
   913  	if info.execFD != nil {
   914  		if info.procArgs.Filename != "" {
   915  			return nil, nil, fmt.Errorf("process must be started from either a file or a filename, not both")
   916  		}
   917  		file, err := host.NewFD(ctx, l.k.HostMount(), info.execFD.FD(), &host.NewFDOptions{
   918  			Readonly:     true,
   919  			Savable:      true,
   920  			VirtualOwner: true,
   921  			UID:          auth.KUID(info.spec.Process.User.UID),
   922  			GID:          auth.KGID(info.spec.Process.User.GID),
   923  		})
   924  		if err != nil {
   925  			return nil, nil, err
   926  		}
   927  		defer file.DecRef(ctx)
   928  		info.execFD.Release()
   929  
   930  		info.procArgs.File = file
   931  	}
   932  
   933  	// Gofer FDs must be ordered and the first FD is always the rootfs.
   934  	if len(info.goferFDs) < 1 {
   935  		return nil, nil, fmt.Errorf("rootfs gofer FD not found")
   936  	}
   937  	l.startGoferMonitor(cid, int32(info.goferFDs[0].FD()))
   938  
   939  	if root {
   940  		if err := l.processHints(info.conf, info.procArgs.Credentials); err != nil {
   941  			return nil, nil, err
   942  		}
   943  	}
   944  	mntr := newContainerMounter(info, l.k, l.mountHints, l.sharedMounts, l.productName, l.sandboxID)
   945  	if err := setupContainerVFS(ctx, info, mntr, &info.procArgs); err != nil {
   946  		return nil, nil, err
   947  	}
   948  
   949  	// Add the HOME environment variable if it is not already set.
   950  	info.procArgs.Envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
   951  		info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
   952  	if err != nil {
   953  		return nil, nil, err
   954  	}
   955  
   956  	// Create and start the new process.
   957  	tg, _, err := l.k.CreateProcess(info.procArgs)
   958  	if err != nil {
   959  		return nil, nil, fmt.Errorf("creating process: %w", err)
   960  	}
   961  	// CreateProcess takes a reference on FDTable if successful.
   962  	info.procArgs.FDTable.DecRef(ctx)
   963  
   964  	// Set the foreground process group on the TTY to the new process group,
   965  	// since that is what we are about to start running.
   966  	if ttyFile != nil {
   967  		ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
   968  	}
   969  
   970  	// Install seccomp filters with the new task if there are any.
   971  	if info.conf.OCISeccomp {
   972  		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
   973  			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
   974  			if err != nil {
   975  				return nil, nil, fmt.Errorf("building seccomp program: %w", err)
   976  			}
   977  
   978  			if log.IsLogging(log.Debug) {
   979  				out, _ := bpf.DecodeProgram(program)
   980  				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
   981  			}
   982  
   983  			task := tg.Leader()
   984  			// NOTE: It seems Flags are ignored by runc so we ignore them too.
   985  			if err := task.AppendSyscallFilter(program, true); err != nil {
   986  				return nil, nil, fmt.Errorf("appending seccomp filters: %w", err)
   987  			}
   988  		}
   989  	} else {
   990  		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
   991  			log.Warningf("Seccomp spec is being ignored")
   992  		}
   993  	}
   994  
   995  	return tg, ttyFile, nil
   996  }
   997  
   998  // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
   999  // the gofer FD looking for disconnects, and kills the container processes if
  1000  // the rootfs FD disconnects.
  1001  //
  1002  // Note that other gofer mounts are allowed to be unmounted and disconnected.
  1003  func (l *Loader) startGoferMonitor(cid string, rootfsGoferFD int32) {
  1004  	if rootfsGoferFD < 0 {
  1005  		panic(fmt.Sprintf("invalid FD: %d", rootfsGoferFD))
  1006  	}
  1007  	go func() {
  1008  		log.Debugf("Monitoring gofer health for container %q", cid)
  1009  		events := []unix.PollFd{
  1010  			{
  1011  				Fd:     rootfsGoferFD,
  1012  				Events: unix.POLLHUP | unix.POLLRDHUP,
  1013  			},
  1014  		}
  1015  		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
  1016  			// Use ppoll instead of poll because it's already allowed in seccomp.
  1017  			n, err := unix.Ppoll(events, nil, nil)
  1018  			return uintptr(n), 0, err
  1019  		})
  1020  		if err != nil {
  1021  			panic(fmt.Sprintf("Error monitoring gofer FDs: %s", err))
  1022  		}
  1023  
  1024  		l.mu.Lock()
  1025  		defer l.mu.Unlock()
  1026  
  1027  		// The gofer could have been stopped due to a normal container shutdown.
  1028  		// Check if the container has not stopped yet.
  1029  		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
  1030  			log.Infof("Gofer socket disconnected, killing container %q", cid)
  1031  			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
  1032  				log.Warningf("Error killing container %q after gofer stopped: %s", cid, err)
  1033  			}
  1034  		}
  1035  	}()
  1036  }
  1037  
  1038  // destroySubcontainer stops a container if it is still running and cleans up
  1039  // its filesystem.
  1040  func (l *Loader) destroySubcontainer(cid string) error {
  1041  	l.mu.Lock()
  1042  	defer l.mu.Unlock()
  1043  
  1044  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
  1045  	if err != nil {
  1046  		// Container doesn't exist.
  1047  		return err
  1048  	}
  1049  
  1050  	// The container exists, but has it been started?
  1051  	if tg != nil {
  1052  		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
  1053  			return fmt.Errorf("sending SIGKILL to all container processes: %w", err)
  1054  		}
  1055  		// Wait for all processes that belong to the container to exit (including
  1056  		// exec'd processes).
  1057  		for _, t := range l.k.TaskSet().Root.Tasks() {
  1058  			if t.ContainerID() == cid {
  1059  				t.ThreadGroup().WaitExited()
  1060  			}
  1061  		}
  1062  	}
  1063  
  1064  	// No more failure from this point on. Remove all container thread groups
  1065  	// from the map.
  1066  	for key := range l.processes {
  1067  		if key.cid == cid {
  1068  			delete(l.processes, key)
  1069  		}
  1070  	}
  1071  
  1072  	log.Debugf("Container destroyed, cid: %s", cid)
  1073  	return nil
  1074  }
  1075  
  1076  func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
  1077  	// Hold the lock for the entire operation to ensure that exec'd process is
  1078  	// added to 'processes' in case it races with destroySubcontainer().
  1079  	l.mu.Lock()
  1080  	defer l.mu.Unlock()
  1081  
  1082  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
  1083  	if err != nil {
  1084  		return 0, err
  1085  	}
  1086  	if tg == nil {
  1087  		return 0, fmt.Errorf("container %q not started", args.ContainerID)
  1088  	}
  1089  
  1090  	// Get the container MountNamespace from the Task. Acquiring the ref may
  1091  	// fail if it races with task exit.
  1092  	// task.MountNamespace() does not take a ref, so we must do so ourselves.
  1093  	args.MountNamespace = tg.Leader().MountNamespace()
  1094  	if args.MountNamespace == nil || !args.MountNamespace.TryIncRef() {
  1095  		return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
  1096  	}
  1097  
  1098  	args.Envv, err = specutils.ResolveEnvs(args.Envv)
  1099  	if err != nil {
  1100  		return 0, fmt.Errorf("resolving env: %w", err)
  1101  	}
  1102  
  1103  	// Add the HOME environment variable if it is not already set.
  1104  	sctx := l.k.SupervisorContext()
  1105  	root := args.MountNamespace.Root(sctx)
  1106  	defer root.DecRef(sctx)
  1107  	ctx := vfs.WithRoot(sctx, root)
  1108  	defer args.MountNamespace.DecRef(ctx)
  1109  	args.Envv, err = user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
  1110  	if err != nil {
  1111  		return 0, err
  1112  	}
  1113  	args.PIDNamespace = tg.PIDNamespace()
  1114  
  1115  	args.Limits, err = createLimitSet(l.root.spec)
  1116  	if err != nil {
  1117  		return 0, fmt.Errorf("creating limits: %w", err)
  1118  	}
  1119  
  1120  	// Start the process.
  1121  	proc := control.Proc{Kernel: l.k}
  1122  	newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args)
  1123  	if err != nil {
  1124  		return 0, err
  1125  	}
  1126  
  1127  	eid := execID{cid: args.ContainerID, pid: tgid}
  1128  	l.processes[eid] = &execProcess{
  1129  		tg:  newTG,
  1130  		tty: ttyFile,
  1131  	}
  1132  	log.Debugf("updated processes: %v", l.processes)
  1133  
  1134  	return tgid, nil
  1135  }
  1136  
  1137  // waitContainer waits for the init process of a container to exit.
  1138  func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
  1139  	// Don't defer unlock, as doing so would make it impossible for
  1140  	// multiple clients to wait on the same container.
  1141  	tg, err := l.threadGroupFromID(execID{cid: cid})
  1142  	if err != nil {
  1143  		return fmt.Errorf("can't wait for container %q: %w", cid, err)
  1144  	}
  1145  
  1146  	// If the thread either has already exited or exits during waiting,
  1147  	// consider the container exited.
  1148  	ws := l.wait(tg)
  1149  	*waitStatus = ws
  1150  
  1151  	// Check for leaks and write coverage report after the root container has
  1152  	// exited. This guarantees that the report is written in cases where the
  1153  	// sandbox is killed by a signal after the ContMgrWait request is completed.
  1154  	if l.root.procArgs.ContainerID == cid {
  1155  		// All sentry-created resources should have been released at this point.
  1156  		refs.DoLeakCheck()
  1157  		_ = coverage.Report()
  1158  	}
  1159  	return nil
  1160  }
  1161  
  1162  func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
  1163  	if tgid <= 0 {
  1164  		return fmt.Errorf("PID (%d) must be positive", tgid)
  1165  	}
  1166  
  1167  	// Try to find a process that was exec'd
  1168  	eid := execID{cid: cid, pid: tgid}
  1169  	execTG, err := l.threadGroupFromID(eid)
  1170  	if err == nil {
  1171  		ws := l.wait(execTG)
  1172  		*waitStatus = ws
  1173  
  1174  		l.mu.Lock()
  1175  		delete(l.processes, eid)
  1176  		log.Debugf("updated processes (removal): %v", l.processes)
  1177  		l.mu.Unlock()
  1178  		return nil
  1179  	}
  1180  
  1181  	// The caller may be waiting on a process not started directly via exec.
  1182  	// In this case, find the process in the container's PID namespace.
  1183  	initTG, err := l.threadGroupFromID(execID{cid: cid})
  1184  	if err != nil {
  1185  		return fmt.Errorf("waiting for PID %d: %w", tgid, err)
  1186  	}
  1187  	tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
  1188  	if tg == nil {
  1189  		return fmt.Errorf("waiting for PID %d: no such process", tgid)
  1190  	}
  1191  	if tg.Leader().ContainerID() != cid {
  1192  		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
  1193  	}
  1194  	ws := l.wait(tg)
  1195  	*waitStatus = ws
  1196  	return nil
  1197  }
  1198  
  1199  // wait waits for the given thread group to exit and returns its exit
  1200  // status.
  1201  func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
  1202  	tg.WaitExited()
  1203  	return uint32(tg.ExitStatus())
  1204  }
  1205  
  1206  // WaitForStartSignal waits for a start signal from the control server.
  1207  func (l *Loader) WaitForStartSignal() {
  1208  	<-l.ctrl.manager.startChan
  1209  }
  1210  
  1211  // WaitExit waits for the root container to exit, and returns its exit status.
  1212  func (l *Loader) WaitExit() linux.WaitStatus {
  1213  	// Wait for container.
  1214  	l.k.WaitExited()
  1215  
  1216  	// Check all references.
  1217  	refs.OnExit()
  1218  
  1219  	return l.k.GlobalInit().ExitStatus()
  1220  }
  1221  
  1222  func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID, userns *auth.UserNamespace) (*inet.Namespace, error) {
  1223  	// Create an empty network stack because the network namespace may be empty at
  1224  	// this point. Netns is configured before Run() is called. Netstack is
  1225  	// configured using a control uRPC message. Host network is configured inside
  1226  	// Run().
  1227  	switch conf.Network {
  1228  	case config.NetworkHost:
  1229  		// If configured for raw socket support with host network
  1230  		// stack, make sure that we have CAP_NET_RAW on the host,
  1231  		// otherwise we can't make raw sockets.
  1232  		if conf.EnableRaw && !specutils.HasCapabilities(capability.CAP_NET_RAW) {
  1233  			return nil, fmt.Errorf("configuring network=host with raw sockets requires CAP_NET_RAW capability")
  1234  		}
  1235  		// No network namespacing support for hostinet yet, hence creator is nil.
  1236  		return inet.NewRootNamespace(hostinet.NewStack(), nil, userns), nil
  1237  
  1238  	case config.NetworkNone, config.NetworkSandbox:
  1239  		s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite)
  1240  		if err != nil {
  1241  			return nil, err
  1242  		}
  1243  		creator := &sandboxNetstackCreator{
  1244  			clock:                    clock,
  1245  			uniqueID:                 uniqueID,
  1246  			allowPacketEndpointWrite: conf.AllowPacketEndpointWrite,
  1247  		}
  1248  		return inet.NewRootNamespace(s, creator, userns), nil
  1249  
  1250  	default:
  1251  		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
  1252  	}
  1253  
  1254  }
  1255  
  1256  func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) {
  1257  	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
  1258  	transProtos := []stack.TransportProtocolFactory{
  1259  		tcp.NewProtocol,
  1260  		udp.NewProtocol,
  1261  		icmp.NewProtocol4,
  1262  		icmp.NewProtocol6,
  1263  	}
  1264  	s := netstack.Stack{Stack: stack.New(stack.Options{
  1265  		NetworkProtocols:   netProtos,
  1266  		TransportProtocols: transProtos,
  1267  		Clock:              clock,
  1268  		Stats:              netstack.Metrics,
  1269  		HandleLocal:        true,
  1270  		// Enable raw sockets for users with sufficient
  1271  		// privileges.
  1272  		RawFactory:               raw.EndpointFactory{},
  1273  		AllowPacketEndpointWrite: allowPacketEndpointWrite,
  1274  		UniqueID:                 uniqueID,
  1275  		DefaultIPTables:          netfilter.DefaultLinuxTables,
  1276  	})}
  1277  
  1278  	// Enable SACK Recovery.
  1279  	{
  1280  		opt := tcpip.TCPSACKEnabled(true)
  1281  		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
  1282  			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
  1283  		}
  1284  	}
  1285  
  1286  	// Set default TTLs as required by socket/netstack.
  1287  	{
  1288  		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
  1289  		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
  1290  			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
  1291  		}
  1292  		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
  1293  			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
  1294  		}
  1295  	}
  1296  
  1297  	// Enable Receive Buffer Auto-Tuning.
  1298  	{
  1299  		opt := tcpip.TCPModerateReceiveBufferOption(true)
  1300  		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
  1301  			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
  1302  		}
  1303  	}
  1304  
  1305  	return &s, nil
  1306  }
  1307  
  1308  // sandboxNetstackCreator implements kernel.NetworkStackCreator.
  1309  //
  1310  // +stateify savable
  1311  type sandboxNetstackCreator struct {
  1312  	clock                    tcpip.Clock
  1313  	uniqueID                 stack.UniqueID
  1314  	allowPacketEndpointWrite bool
  1315  }
  1316  
  1317  // CreateStack implements kernel.NetworkStackCreator.CreateStack.
  1318  func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
  1319  	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite)
  1320  	if err != nil {
  1321  		return nil, err
  1322  	}
  1323  
  1324  	// Set up loopback.
  1325  	n := &Network{Stack: s.(*netstack.Stack).Stack}
  1326  	nicID := tcpip.NICID(f.uniqueID.UniqueID())
  1327  	link := DefaultLoopbackLink
  1328  	linkEP := packetsocket.New(ethernet.New(loopback.New()))
  1329  	opts := stack.NICOptions{Name: link.Name}
  1330  
  1331  	if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
  1332  		return nil, err
  1333  	}
  1334  
  1335  	return s, nil
  1336  }
  1337  
  1338  // signal sends a signal to one or more processes in a container. If PID is 0,
  1339  // then the container init process is used. Depending on the SignalDeliveryMode
  1340  // option, the signal may be sent directly to the indicated process, to all
  1341  // processes in the container, or to the foreground process group. pid is
  1342  // relative to the root PID namespace, not the container's.
  1343  func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
  1344  	if pid < 0 {
  1345  		return fmt.Errorf("PID (%d) cannot be negative", pid)
  1346  	}
  1347  
  1348  	switch mode {
  1349  	case DeliverToProcess:
  1350  		if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
  1351  			return fmt.Errorf("signaling process in container %q PID %d: %w", cid, pid, err)
  1352  		}
  1353  		return nil
  1354  
  1355  	case DeliverToForegroundProcessGroup:
  1356  		if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
  1357  			return fmt.Errorf("signaling foreground process group in container %q PID %d: %w", cid, pid, err)
  1358  		}
  1359  		return nil
  1360  
  1361  	case DeliverToAllProcesses:
  1362  		if pid != 0 {
  1363  			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
  1364  		}
  1365  		// Check that the container has actually started before signaling it.
  1366  		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
  1367  			return err
  1368  		}
  1369  		if err := l.signalAllProcesses(cid, signo); err != nil {
  1370  			return fmt.Errorf("signaling all processes in container %q: %w", cid, err)
  1371  		}
  1372  		return nil
  1373  
  1374  	default:
  1375  		panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
  1376  	}
  1377  }
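        // Hypothetical usage sketch (not part of this file), mapping common
        // lifecycle events onto the delivery modes above:
        //
        //	// "stop": SIGTERM to the container's init process (PID 0 means init).
        //	err := l.signal(cid, 0, int32(linux.SIGTERM), DeliverToProcess)
        //	// Ctrl-C on an attached terminal: SIGINT to the foreground process group.
        //	err = l.signal(cid, pid, int32(linux.SIGINT), DeliverToForegroundProcessGroup)
        //	// Forced teardown: SIGKILL to every process (pid must be 0 here).
        //	err = l.signal(cid, 0, int32(linux.SIGKILL), DeliverToAllProcesses)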
  1378  
  1379  // signalProcess sends signal to process in the given container. tgid is
  1380  // relative to the root PID namespace, not the container's.
  1381  func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
  1382  	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
  1383  	if err == nil {
  1384  		// Send signal directly to the identified process.
  1385  		return l.k.SendExternalSignalThreadGroup(execTG, &linux.SignalInfo{Signo: signo})
  1386  	}
  1387  
  1388  	// The caller may be signaling a process not started directly via exec.
  1389  	// In this case, find the process and check that the process belongs to the
  1390  	// container in question.
  1391  	tg := l.k.RootPIDNamespace().ThreadGroupWithID(tgid)
  1392  	if tg == nil {
  1393  		return fmt.Errorf("no such process with PID %d", tgid)
  1394  	}
  1395  	if tg.Leader().ContainerID() != cid {
  1396  		return fmt.Errorf("process %d belongs to a different container: %q", tgid, tg.Leader().ContainerID())
  1397  	}
  1398  	return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
  1399  }
  1400  
  1401  // signalForegroundProcessGroup looks up the foreground process group of the
  1402  // TTY for the given "tgid" inside container "cid", and sends the signal to it.
  1403  func (l *Loader) signalForegroundProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
  1404  	l.mu.Lock()
  1405  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
  1406  	if err != nil {
  1407  		l.mu.Unlock()
  1408  		return fmt.Errorf("no thread group found: %w", err)
  1409  	}
  1410  	if tg == nil {
  1411  		l.mu.Unlock()
  1412  		return fmt.Errorf("container %q not started", cid)
  1413  	}
  1414  
  1415  	tty, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
  1416  	l.mu.Unlock()
  1417  	if err != nil {
  1418  		return fmt.Errorf("no thread group found: %w", err)
  1419  	}
  1420  	if tty == nil {
  1421  		return fmt.Errorf("no TTY attached")
  1422  	}
  1423  	pg := tty.ForegroundProcessGroup()
  1424  	si := &linux.SignalInfo{Signo: signo}
  1425  	if pg == nil {
  1426  		// No foreground process group has been set. Signal the
  1427  		// original thread group.
  1428  		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
  1429  		return l.k.SendExternalSignalThreadGroup(tg, si)
  1430  	}
  1431  	// Send the signal to all processes in the process group.
  1432  	return l.k.SendExternalSignalProcessGroup(pg, si)
  1433  }
  1434  
  1435  // signalAllProcesses sends a signal to all processes belonging to the given
  1436  // container. It's a no-op if the container hasn't started or has exited.
  1437  func (l *Loader) signalAllProcesses(cid string, signo int32) error {
  1438  	// Pause the kernel to prevent new processes from being created while
  1439  	// the signal is delivered. This prevents process leaks when SIGKILL is
  1440  	// sent to the entire container.
  1441  	l.k.Pause()
  1442  	defer l.k.Unpause()
  1443  	return l.k.SendContainerSignal(cid, &linux.SignalInfo{Signo: signo})
  1444  }
  1445  
  1446  // threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
  1447  // acquires the mutex before calling it and fails if the container hasn't
  1448  // started yet.
  1449  func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
  1450  	l.mu.Lock()
  1451  	defer l.mu.Unlock()
  1452  	tg, err := l.tryThreadGroupFromIDLocked(key)
  1453  	if err != nil {
  1454  		return nil, err
  1455  	}
  1456  	if tg == nil {
  1457  		return nil, fmt.Errorf("container %q not started", key.cid)
  1458  	}
  1459  	return tg, nil
  1460  }
  1461  
  1462  // tryThreadGroupFromIDLocked returns the thread group for the given execution
  1463  // ID. It may return nil if the container has not started yet. It returns an
  1464  // error if the execution ID is invalid or the container cannot be found
  1465  // (maybe it has been deleted). Caller must hold 'mu'.
  1466  func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
  1467  	ep := l.processes[key]
  1468  	if ep == nil {
  1469  		return nil, fmt.Errorf("container %q not found", key.cid)
  1470  	}
  1471  	return ep.tg, nil
  1472  }
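        // A minimal sketch (not part of this file) of the three outcomes callers
        // must distinguish:
        //
        //	l.mu.Lock()
        //	defer l.mu.Unlock()
        //	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
        //	switch {
        //	case err != nil: // No such container (or invalid execution ID).
        //	case tg == nil: // Container exists but has not started yet.
        //	default: // Container is running; tg is its thread group.
        //	}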
  1473  
  1474  // ttyFromIDLocked returns the TTY file for the given execution ID. It may
  1475  // return nil if the container has not started yet. It returns an error if the
  1476  // execution ID is invalid or the container cannot be found (maybe it has been
  1477  // deleted). Caller must hold 'mu'.
  1478  func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileDescription, error) {
  1479  	ep := l.processes[key]
  1480  	if ep == nil {
  1481  		return nil, fmt.Errorf("container %q not found", key.cid)
  1482  	}
  1483  	return ep.tty, nil
  1484  }
  1485  
  1486  func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, passFDs []fdMapping, user specs.User) (*kernel.FDTable, *host.TTYFileDescription, error) {
  1487  	if len(stdioFDs) != 3 {
  1488  		return nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
  1489  	}
  1490  	fdMap := map[int]*fd.FD{
  1491  		0: stdioFDs[0],
  1492  		1: stdioFDs[1],
  1493  		2: stdioFDs[2],
  1494  	}
  1495  
  1496  	// Create the entries for the host files that were passed to our app.
  1497  	for _, customFD := range passFDs {
  1498  		if customFD.guest < 0 {
  1499  			return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater")
  1500  		}
  1501  		fdMap[customFD.guest] = customFD.host
  1502  	}
  1503  
  1504  	k := kernel.KernelFromContext(ctx)
  1505  	fdTable := k.NewFDTable()
  1506  	ttyFile, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), fdMap)
  1507  	if err != nil {
  1508  		fdTable.DecRef(ctx)
  1509  		return nil, nil, err
  1510  	}
  1511  	return fdTable, ttyFile, nil
  1512  }
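        // Illustrative sketch (not part of this file): wiring stdio plus one
        // extra host FD into a table. The *fd.FD values and the spec are
        // placeholders; the fdMapping fields follow their use above.
        //
        //	stdio := []*fd.FD{stdin, stdout, stderr}      // Becomes FDs 0, 1, 2.
        //	extra := []fdMapping{{host: logFD, guest: 3}} // Appears as FD 3 in the app.
        //	fdTable, tty, err := createFDTable(ctx, false /* console */, stdio, extra, spec.Process.User)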
  1513  
  1514  // portForward initiates a port forwarding connection in the sandbox. Each
  1515  // proxy in portForwardProxies represents two connections copying to each
  1516  // other (read ends to write ends) in goroutines. The proxies are stored and
  1517  // can be cleaned up, or clean up after themselves if the connection breaks.
  1518  func (l *Loader) portForward(opts *PortForwardOpts) error {
  1519  	// Validate that we have a stream FD to write to. If this check fails,
  1520  	// it means there is a misbehaved urpc client or a bug has occurred.
  1521  	if len(opts.Files) != 1 {
  1522  		return fmt.Errorf("stream FD is required for port forward")
  1523  	}
  1524  
  1525  	l.mu.Lock()
  1526  	defer l.mu.Unlock()
  1527  
  1528  	cid := opts.ContainerID
  1529  	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
  1530  	if err != nil {
  1531  		return fmt.Errorf("failed to get threadgroup from %q: %w", cid, err)
  1532  	}
  1533  	if tg == nil {
  1534  		return fmt.Errorf("container %q not started", cid)
  1535  	}
  1536  
  1537  	// Import the fd for the UDS.
  1538  	ctx := l.k.SupervisorContext()
  1539  	fd, err := l.importFD(ctx, opts.Files[0])
  1540  	if err != nil {
  1541  		return fmt.Errorf("importing stream fd: %w", err)
  1542  	}
  1543  	cu := cleanup.Make(func() { fd.DecRef(ctx) })
  1544  	defer cu.Clean()
  1545  
  1546  	fdConn := pf.NewFileDescriptionConn(fd)
  1547  
  1548  	// Create a proxy to forward data between the fdConn and the sandboxed application.
  1549  	pair := pf.ProxyPair{To: fdConn}
  1550  
  1551  	switch l.root.conf.Network {
  1552  	case config.NetworkSandbox:
  1553  		stack := l.k.RootNetworkNamespace().Stack().(*netstack.Stack).Stack
  1554  		nsConn, err := pf.NewNetstackConn(stack, opts.Port)
  1555  		if err != nil {
  1556  			return fmt.Errorf("creating netstack port forward connection: %w", err)
  1557  		}
  1558  		pair.From = nsConn
  1559  	case config.NetworkHost:
  1560  		hConn, err := pf.NewHostInetConn(opts.Port)
  1561  		if err != nil {
  1562  			return fmt.Errorf("creating hostinet port forward connection: %w", err)
  1563  		}
  1564  		pair.From = hConn
  1565  	default:
  1566  		return fmt.Errorf("unsupported network type %q for container %q", l.root.conf.Network, cid)
  1567  	}
  1568  	cu.Release()
  1569  	proxy := pf.NewProxy(pair, opts.ContainerID)
  1570  
  1571  	// Add to the list of port forward connections and remove when the
  1572  	// connection closes.
  1573  	l.portForwardProxies = append(l.portForwardProxies, proxy)
  1574  	proxy.AddCleanup(func() {
  1575  		l.mu.Lock()
  1576  		defer l.mu.Unlock()
  1577  		for i := range l.portForwardProxies {
  1578  			if l.portForwardProxies[i] == proxy {
  1579  				l.portForwardProxies = append(l.portForwardProxies[:i], l.portForwardProxies[i+1:]...)
  1580  				break
  1581  			}
  1582  		}
  1583  	})
  1584  
  1585  	// Start forwarding on the connection.
  1586  	proxy.Start(ctx)
  1587  	return nil
  1588  }
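        // Sketch (hypothetical helper, not part of this file): the cleanup above
        // uses the standard slice-filtering idiom; written out standalone, and
        // assuming the proxy type is *pf.Proxy, it is:
        //
        //	func removeProxy(proxies []*pf.Proxy, target *pf.Proxy) []*pf.Proxy {
        //		for i, p := range proxies {
        //			if p == target {
        //				return append(proxies[:i], proxies[i+1:]...)
        //			}
        //		}
        //		return proxies
        //	}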
  1589  
  1590  // importFD generically imports a host file descriptor without adding it to any
  1591  // fd table.
  1592  func (l *Loader) importFD(ctx context.Context, f *os.File) (*vfs.FileDescription, error) {
  1593  	hostFD, err := fd.NewFromFile(f)
  1594  	if err != nil {
  1595  		return nil, err
  1596  	}
  1597  	defer hostFD.Close()
  1598  	fd, err := host.NewFD(ctx, l.k.HostMount(), hostFD.FD(), &host.NewFDOptions{
  1599  		Savable:      false, // We disconnect and close on save.
  1600  		IsTTY:        false,
  1601  		VirtualOwner: false, // The FD is not visible to the sandboxed app, so the owner can't be changed.
  1602  	})
  1604  	if err != nil {
  1605  		return nil, err
  1606  	}
  1607  	hostFD.Release()
  1608  	return fd, nil
  1609  }
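        // Hypothetical usage sketch (not part of this file): importFD transfers
        // a reference on the returned FileDescription to the caller, who must
        // drop it when done.
        //
        //	fd, err := l.importFD(ctx, file)
        //	if err != nil {
        //		return err
        //	}
        //	defer fd.DecRef(ctx)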
  1610  
  1611  func (l *Loader) containerCount() int {
  1612  	l.mu.Lock()
  1613  	defer l.mu.Unlock()
  1614  
  1615  	containers := 0
  1616  	for id := range l.processes {
  1617  		if id.pid == 0 {
  1618  			// pid==0 represents the init process of a container. There is
  1619  			// only one such process per container.
  1620  			containers++
  1621  		}
  1622  	}
  1623  	return containers
  1624  }
  1625  
  1626  func (l *Loader) pidsCount(cid string) (int, error) {
  1627  	l.mu.Lock()
  1628  	defer l.mu.Unlock()
  1629  
  1630  	if _, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}); err != nil {
  1631  		// Container doesn't exist.
  1632  		return 0, err
  1633  	}
  1634  	return l.k.TaskSet().Root.NumTasksPerContainer(cid), nil
  1635  }
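        // Illustrative sketch (hypothetical, not part of this file): these
        // counters feed sandbox lifecycle decisions, e.g. shutting the sandbox
        // down once the last container is destroyed.
        //
        //	if l.containerCount() == 0 {
        //		// No containers remain; the sandbox can exit.
        //	}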