github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/boot/loader.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package boot loads the kernel and runs a container.
package boot

import (
	"errors"
	"fmt"
	mrand "math/rand"
	"os"
	"runtime"
	"sync/atomic"
	gtime "time"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/bpf"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/coverage"
	"github.com/SagerNet/gvisor/pkg/cpuid"
	"github.com/SagerNet/gvisor/pkg/fd"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/memutil"
	"github.com/SagerNet/gvisor/pkg/rand"
	"github.com/SagerNet/gvisor/pkg/refs"
	"github.com/SagerNet/gvisor/pkg/refsvfs2"
	"github.com/SagerNet/gvisor/pkg/sentry/control"
	"github.com/SagerNet/gvisor/pkg/sentry/fdimport"
	"github.com/SagerNet/gvisor/pkg/sentry/fs"
	"github.com/SagerNet/gvisor/pkg/sentry/fs/host"
	"github.com/SagerNet/gvisor/pkg/sentry/fs/user"
	hostvfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/host"
	"github.com/SagerNet/gvisor/pkg/sentry/inet"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
	"github.com/SagerNet/gvisor/pkg/sentry/loader"
	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	"github.com/SagerNet/gvisor/pkg/sentry/sighandling"
	"github.com/SagerNet/gvisor/pkg/sentry/socket/netfilter"
	"github.com/SagerNet/gvisor/pkg/sentry/syscalls/linux/vfs2"
	"github.com/SagerNet/gvisor/pkg/sentry/time"
	"github.com/SagerNet/gvisor/pkg/sentry/usage"
	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
	"github.com/SagerNet/gvisor/pkg/sentry/watchdog"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/tcpip"
	"github.com/SagerNet/gvisor/pkg/tcpip/link/loopback"
	"github.com/SagerNet/gvisor/pkg/tcpip/link/sniffer"
	"github.com/SagerNet/gvisor/pkg/tcpip/network/arp"
	"github.com/SagerNet/gvisor/pkg/tcpip/network/ipv4"
	"github.com/SagerNet/gvisor/pkg/tcpip/network/ipv6"
	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
	"github.com/SagerNet/gvisor/pkg/tcpip/transport/icmp"
	"github.com/SagerNet/gvisor/pkg/tcpip/transport/raw"
	"github.com/SagerNet/gvisor/pkg/tcpip/transport/tcp"
	"github.com/SagerNet/gvisor/pkg/tcpip/transport/udp"
	"github.com/SagerNet/gvisor/runsc/boot/filter"
	_ "github.com/SagerNet/gvisor/runsc/boot/platforms" // register all platforms.
	"github.com/SagerNet/gvisor/runsc/boot/pprof"
	"github.com/SagerNet/gvisor/runsc/config"
	"github.com/SagerNet/gvisor/runsc/specutils"
	"github.com/SagerNet/gvisor/runsc/specutils/seccomp"

	// Top-level inet providers.
	"github.com/SagerNet/gvisor/pkg/sentry/socket/hostinet"
	"github.com/SagerNet/gvisor/pkg/sentry/socket/netstack"

	// Include other supported socket providers.
	_ "github.com/SagerNet/gvisor/pkg/sentry/socket/netlink"
	_ "github.com/SagerNet/gvisor/pkg/sentry/socket/netlink/route"
	_ "github.com/SagerNet/gvisor/pkg/sentry/socket/netlink/uevent"
	_ "github.com/SagerNet/gvisor/pkg/sentry/socket/unix"
)

type containerInfo struct {
	conf *config.Config

	// spec is the base configuration for the root container.
	spec *specs.Spec

	// procArgs refers to the container's init task.
	procArgs kernel.CreateProcessArgs

	// stdioFDs contains stdin, stdout, and stderr.
	stdioFDs []*fd.FD

	// goferFDs are the FDs that attach the sandbox to the gofers.
	goferFDs []*fd.FD
}

// Loader keeps state needed to start the kernel and run the container.
type Loader struct {
	// k is the kernel.
	k *kernel.Kernel

	// ctrl is the control server.
	ctrl *controller

	// root contains information about the root container in the sandbox.
	root containerInfo

	watchdog *watchdog.Watchdog

	// stopSignalForwarding disables forwarding of signals to the sandboxed
	// container. It should be called when a sandbox is destroyed.
	stopSignalForwarding func()

	// restore is set to true if we are restoring a container.
	restore bool

	// sandboxID is the ID for the whole sandbox.
	sandboxID string

	// mu guards processes.
	mu sync.Mutex

	// processes maps containers' init processes and invocations of exec. Root
	// processes are keyed with the container ID and pid=0, while exec
	// invocations have the corresponding pid set.
	//
	// processes is guarded by mu.
	processes map[execID]*execProcess

	// mountHints provides extra information about mounts for containers that
	// apply to the entire pod.
	mountHints *podMountHints
}

// execID uniquely identifies a sentry process that is executed in a container.
type execID struct {
	cid string
	pid kernel.ThreadID
}

// execProcess contains the thread group and host TTY of a sentry process.
type execProcess struct {
	// tg will be nil for containers that haven't started yet.
	tg *kernel.ThreadGroup

	// tty will be nil if the process is not attached to a terminal.
	tty *host.TTYFileOperations

	// ttyVFS2 will be nil if the process is not attached to a terminal.
	ttyVFS2 *hostvfs2.TTYFileDescription

	// pidnsPath is the pid namespace path in the spec.
	pidnsPath string

	// hostTTY is present when creating a sub-container with terminal enabled.
	// TTY file is passed during container create and must be saved until
	// container start.
	hostTTY *fd.FD
}

func init() {
	// Initialize the random number generator.
	mrand.Seed(gtime.Now().UnixNano())
}

// Args are the arguments for New().
type Args struct {
	// ID is the sandbox ID.
	ID string
	// Spec is the sandbox specification.
	Spec *specs.Spec
	// Conf is the system configuration.
	Conf *config.Config
	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
	// of this FD and may close it at any time.
	ControllerFD int
	// Device is an optional argument that is passed to the platform. The Loader
	// takes ownership of this file and may close it at any time.
	Device *os.File
	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
	// takes ownership of these FDs and may close them at any time.
	GoferFDs []int
	// StdioFDs is the stdio for the application. The Loader takes ownership of
	// these FDs and may close them at any time.
	StdioFDs []int
	// NumCPU is the number of CPUs to create inside the sandbox.
	NumCPU int
	// TotalMem is the initial amount of total memory to report back to the
	// container.
	TotalMem uint64
	// UserLogFD is the file descriptor to write user logs to.
	UserLogFD int
}
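
// exampleNewArgs is a minimal, hypothetical sketch (it is not used anywhere in
// runsc) of how a caller assembles Args for New. The concrete values below are
// placeholders; the real ones come from the OCI runtime spec, the runsc
// configuration, and FDs inherited from the parent process. Note that the
// Loader takes ownership of every FD passed here.
func exampleNewArgs(spec *specs.Spec, conf *config.Config, ctrlFD int, stdio, gofers []int) Args {
	return Args{
		ID:           "sandbox-0", // hypothetical sandbox ID
		Spec:         spec,
		Conf:         conf,
		ControllerFD: ctrlFD,
		GoferFDs:     gofers,
		StdioFDs:     stdio,   // must be exactly stdin, stdout, stderr
		NumCPU:       0,       // 0 lets New default to runtime.NumCPU()
		TotalMem:     2 << 30, // hypothetical: report 2 GiB via /proc/meminfo
		UserLogFD:    3,       // hypothetical FD for user-visible compat logs
	}
}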

// startingStdioFD is the first host FD used for the remapped stdio FDs. It
// makes sure stdio FDs are always the same on initial start and on restore.
const startingStdioFD = 256

// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
func New(args Args) (*Loader, error) {
	// We initialize the rand package now to make sure /dev/urandom is pre-opened
	// on kernels that do not support getrandom(2).
	if err := rand.Init(); err != nil {
		return nil, fmt.Errorf("setting up rand: %w", err)
	}

	if err := usage.Init(); err != nil {
		return nil, fmt.Errorf("setting up memory usage: %w", err)
	}

	// Is this a VFSv2 kernel?
	if args.Conf.VFS2 {
		kernel.VFS2Enabled = true
		if args.Conf.FUSE {
			kernel.FUSEEnabled = true
		}

		vfs2.Override()
	}

	// Make host FDs stable between invocations. Host FDs must map to the exact
	// same number when the sandbox is restored. Otherwise the wrong FD will be
	// used.
	info := containerInfo{}
	newfd := startingStdioFD

	for _, stdioFD := range args.StdioFDs {
		// Check that newfd is unused to avoid clobbering it.
		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
			if err != nil {
				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
			}
			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
		}

		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
		if err != nil {
			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
		}
		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
		_ = unix.Close(stdioFD)
		newfd++
	}
	for _, goferFD := range args.GoferFDs {
		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
	}

	// Create kernel and platform.
	p, err := createPlatform(args.Conf, args.Device)
	if err != nil {
		return nil, fmt.Errorf("creating platform: %w", err)
	}
	k := &kernel.Kernel{
		Platform: p,
	}

	// Create memory file.
	mf, err := createMemoryFile()
	if err != nil {
		return nil, fmt.Errorf("creating memory file: %w", err)
	}
	k.SetMemoryFile(mf)

	// Create VDSO.
	//
	// Pass k as the platform since it is savable, unlike the actual platform.
	vdso, err := loader.PrepareVDSO(k)
	if err != nil {
		return nil, fmt.Errorf("creating vdso: %w", err)
	}

	// Create timekeeper.
	tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
	tk.SetClocks(time.NewCalibratedClocks())

	if err := enableStrace(args.Conf); err != nil {
		return nil, fmt.Errorf("enabling strace: %w", err)
	}

	// Create root network namespace/stack.
	netns, err := newRootNetworkNamespace(args.Conf, tk, k)
	if err != nil {
		return nil, fmt.Errorf("creating network: %w", err)
	}

	// Create capabilities.
	caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
	if err != nil {
		return nil, fmt.Errorf("converting capabilities: %w", err)
	}

	// Convert the spec's additional GIDs to KGIDs.
	extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
	for _, GID := range args.Spec.Process.User.AdditionalGids {
		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
	}

	// Create credentials.
	creds := auth.NewUserCredentials(
		auth.KUID(args.Spec.Process.User.UID),
		auth.KGID(args.Spec.Process.User.GID),
		extraKGIDs,
		caps,
		auth.NewRootUserNamespace())

	if args.NumCPU == 0 {
		args.NumCPU = runtime.NumCPU()
	}
	log.Infof("CPUs: %d", args.NumCPU)
	runtime.GOMAXPROCS(args.NumCPU)

	if args.TotalMem > 0 {
		// Adjust the total memory returned by the Sentry so that applications that
		// use /proc/meminfo can make allocations based on this limit.
		usage.MaximumTotalMemoryBytes = args.TotalMem
		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
	}

	// Initiate the Kernel object, which is required by the Context passed
	// to createVFS in order to mount (among other things) procfs.
	if err = k.Init(kernel.InitKernelArgs{
		FeatureSet:                  cpuid.HostFeatureSet(),
		Timekeeper:                  tk,
		RootUserNamespace:           creds.UserNamespace,
		RootNetworkNamespace:        netns,
		ApplicationCores:            uint(args.NumCPU),
		Vdso:                        vdso,
		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
	}); err != nil {
		return nil, fmt.Errorf("initializing kernel: %w", err)
	}

	if kernel.VFS2Enabled {
		if err := registerFilesystems(k); err != nil {
			return nil, fmt.Errorf("registering filesystems: %w", err)
		}
	}

	if err := adjustDirentCache(k); err != nil {
		return nil, err
	}

	// Turn on packet logging if enabled.
	if args.Conf.LogPackets {
		log.Infof("Packet logging enabled")
		atomic.StoreUint32(&sniffer.LogPackets, 1)
	} else {
		log.Infof("Packet logging disabled")
		atomic.StoreUint32(&sniffer.LogPackets, 0)
	}

	// Create a watchdog.
	dogOpts := watchdog.DefaultOpts
	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
	dog := watchdog.New(k, dogOpts)

	procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
	if err != nil {
		return nil, fmt.Errorf("creating init process for root container: %w", err)
	}
	info.procArgs = procArgs

	if err := initCompatLogs(args.UserLogFD); err != nil {
		return nil, fmt.Errorf("initializing compat logs: %w", err)
	}

	mountHints, err := newPodMountHints(args.Spec)
	if err != nil {
		return nil, fmt.Errorf("creating pod mount hints: %w", err)
	}

	info.conf = args.Conf
	info.spec = args.Spec

	if kernel.VFS2Enabled {
		// Set up host mount that will be used for imported fds.
		hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS())
		if err != nil {
			return nil, fmt.Errorf("failed to create hostfs filesystem: %w", err)
		}
		defer hostFilesystem.DecRef(k.SupervisorContext())
		hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
		if err != nil {
			return nil, fmt.Errorf("failed to create hostfs mount: %w", err)
		}
		k.SetHostMount(hostMount)
	}

	eid := execID{cid: args.ID}
	l := &Loader{
		k:          k,
		watchdog:   dog,
		sandboxID:  args.ID,
		processes:  map[execID]*execProcess{eid: {}},
		mountHints: mountHints,
		root:       info,
	}

	// We don't care about child signals; some platforms can generate a
	// tremendous number of useless ones (I'm looking at you, ptrace).
	if err := sighandling.IgnoreChildStop(); err != nil {
		return nil, fmt.Errorf("ignore child stop signals failed: %w", err)
	}

	// Create the control server using the provided FD.
	//
	// This must be done *after* we have initialized the kernel since the
	// controller is used to configure the kernel's network stack.
	ctrl, err := newController(args.ControllerFD, l)
	if err != nil {
		return nil, fmt.Errorf("creating control server: %w", err)
	}
	l.ctrl = ctrl

	// Only start serving after the Loader and the controller have been wired up
	// to each other, because both are used in the urpc methods.
	if err := ctrl.srv.StartServing(); err != nil {
		return nil, fmt.Errorf("starting control server: %w", err)
	}

	return l, nil
}

// createProcessArgs creates args that can be used with kernel.CreateProcess.
func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
	// Create initial limits.
	ls, err := createLimitSet(spec)
	if err != nil {
		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %w", err)
	}
	env, err := specutils.ResolveEnvs(spec.Process.Env)
	if err != nil {
		return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err)
	}

	wd := spec.Process.Cwd
	if wd == "" {
		wd = "/"
	}

	// Create the process arguments.
	procArgs := kernel.CreateProcessArgs{
		Argv:                    spec.Process.Args,
		Envv:                    env,
		WorkingDirectory:        wd,
		Credentials:             creds,
		Umask:                   0022,
		Limits:                  ls,
		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
		UTSNamespace:            k.RootUTSNamespace(),
		IPCNamespace:            k.RootIPCNamespace(),
		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
		ContainerID:             id,
		PIDNamespace:            pidns,
	}

	return procArgs, nil
}

// Destroy cleans up all resources used by the loader.
//
// Note that this will block until all open control server connections have
// been closed. For that reason, this should NOT be called in a defer, because
// a panic in a control server rpc would then hang forever.
func (l *Loader) Destroy() {
	if l.stopSignalForwarding != nil {
		l.stopSignalForwarding()
	}
	l.watchdog.Stop()

	// Stop the control server. This will indirectly stop any
	// long-running control operations that are in flight, e.g.
	// profiling operations.
	l.ctrl.stop()

	// Release all kernel resources. This is only safe after we can no longer
	// save/restore.
	l.k.Release()

	// In the success case, stdioFDs and goferFDs will only contain
	// released/closed FDs whose ownership has been passed on to host FDs and
	// gofer sessions. Close them here in case of failure.
	for _, f := range l.root.stdioFDs {
		_ = f.Close()
	}
	for _, f := range l.root.goferFDs {
		_ = f.Close()
	}
}

func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
	p, err := platform.Lookup(conf.Platform)
	if err != nil {
		panic(fmt.Sprintf("invalid platform %s: %s", conf.Platform, err))
	}
	log.Infof("Platform: %s", conf.Platform)
	return p.New(deviceFile)
}

func createMemoryFile() (*pgalloc.MemoryFile, error) {
	const memfileName = "runsc-memory"
	memfd, err := memutil.CreateMemFD(memfileName, 0)
	if err != nil {
		return nil, fmt.Errorf("error creating memfd: %w", err)
	}
	memfile := os.NewFile(uintptr(memfd), memfileName)
	// We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
	// there are memory cgroups specified, because at this point we're already
	// in a mount namespace in which the relevant cgroupfs is not visible.
	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
	if err != nil {
		_ = memfile.Close()
		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err)
	}
	return mf, nil
}

// installSeccompFilters installs sandbox seccomp filters with the host.
func (l *Loader) installSeccompFilters() error {
	if l.root.conf.DisableSeccomp {
		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
	} else {
		opts := filter.Options{
			Platform:      l.k.Platform,
			HostNetwork:   l.root.conf.Network == config.NetworkHost,
			ProfileEnable: l.root.conf.ProfileEnable,
			ControllerFD:  l.ctrl.srv.FD(),
		}
		if err := filter.Install(opts); err != nil {
			return fmt.Errorf("installing seccomp filters: %w", err)
		}
	}
	return nil
}

// Run runs the root container.
func (l *Loader) Run() error {
	err := l.run()
	l.ctrl.manager.startResultChan <- err
	if err != nil {
		// Give the controller some time to send the error to the
		// runtime. If we return too quickly here the process will exit
		// and the control connection will be closed before the error
		// is returned.
		gtime.Sleep(2 * gtime.Second)
		return err
	}
	return nil
}

func (l *Loader) run() error {
	if l.root.conf.Network == config.NetworkHost {
		// Delay host network configuration to this point because network namespace
		// is configured after the loader is created and before Run() is called.
		log.Debugf("Configuring host network")
		s := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
		if err := s.Configure(); err != nil {
			return err
		}
	}

	l.mu.Lock()
	defer l.mu.Unlock()

	eid := execID{cid: l.sandboxID}
	ep, ok := l.processes[eid]
	if !ok {
		return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
	}

	// If we are restoring, we do not want to create a process.
	// l.restore is set by the container manager when a restore call is made.
	if !l.restore {
		if l.root.conf.ProfileEnable {
			pprof.Initialize()
		}

		// Finally done with all configuration. Set up filters before user code
		// is loaded.
		if err := l.installSeccompFilters(); err != nil {
			return err
		}

		// Create the root container init task. It will begin running
		// when the kernel is started.
		var err error
		_, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(true, l.sandboxID, &l.root)
		if err != nil {
			return err
		}
	}

	ep.tg = l.k.GlobalInit()
	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
		ep.pidnsPath = ns.Path
	}

	// Handle signals by forwarding them to the root container process
	// (except for panic signal, which should cause a panic).
	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
		// Panic signal should cause a panic.
		if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
			panic("Signal-induced panic")
		}

		// Otherwise forward to root container.
		deliveryMode := DeliverToProcess
		if l.root.spec.Process.Terminal {
			// Since we are running with a console, we should forward the signal to
			// the foreground process group so that job control signals like ^C can
			// be handled properly.
			deliveryMode = DeliverToForegroundProcessGroup
		}
		log.Infof("Received external signal %d, mode: %s", sig, deliveryMode)
		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
			log.Warningf("error sending signal %s to container %q: %s", sig, l.sandboxID, err)
		}
	})

	log.Infof("Process should have started...")
	l.watchdog.Start()
	return l.k.Start()
}

// createContainer creates a new container inside the sandbox.
func (l *Loader) createContainer(cid string, tty *fd.FD) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	eid := execID{cid: cid}
	if _, ok := l.processes[eid]; ok {
		return fmt.Errorf("container %q already exists", cid)
	}
	l.processes[eid] = &execProcess{hostTTY: tty}
	return nil
}
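
// exampleStartSubcontainer is a hypothetical sketch (not part of the loader; in
// runsc the controller's urpc handlers drive this) of the two-step
// sub-container lifecycle: createContainer registers the container and stashes
// its TTY, if any, at "create" time, and startContainer later builds and starts
// its init task.
func (l *Loader) exampleStartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, tty *fd.FD, stdioFDs, goferFDs []*fd.FD) error {
	if err := l.createContainer(cid, tty); err != nil {
		return fmt.Errorf("creating container %q: %w", cid, err)
	}
	// startContainer consumes or closes the stdio and gofer FDs; the caller
	// should not reuse them after this call.
	return l.startContainer(spec, conf, cid, stdioFDs, goferFDs)
}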

// startContainer starts a child container. Used FDs are either closed or
// released. It's safe for the caller to close any remaining files upon return.
func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error {
	// Create capabilities.
	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
	if err != nil {
		return fmt.Errorf("creating capabilities: %w", err)
	}

	l.mu.Lock()
	defer l.mu.Unlock()

	ep := l.processes[execID{cid: cid}]
	if ep == nil {
		return fmt.Errorf("trying to start a deleted container %q", cid)
	}

	// Convert the spec's additional GIDs to KGIDs.
	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
	for _, GID := range spec.Process.User.AdditionalGids {
		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
	}

	// Create credentials. We reuse the root user namespace because the
	// sentry currently supports only 1 mount namespace, which is tied to a
	// single user namespace. Thus we must run in the same user namespace
	// to access mounts.
	creds := auth.NewUserCredentials(
		auth.KUID(spec.Process.User.UID),
		auth.KGID(spec.Process.User.GID),
		extraKGIDs,
		caps,
		l.k.RootUserNamespace())

	var pidns *kernel.PIDNamespace
	if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok {
		if ns.Path != "" {
			for _, p := range l.processes {
				if ns.Path == p.pidnsPath {
					pidns = p.tg.PIDNamespace()
					break
				}
			}
		}
		if pidns == nil {
			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
		}
		ep.pidnsPath = ns.Path
	} else {
		pidns = l.k.RootPIDNamespace()
	}

	info := &containerInfo{
		conf:     conf,
		spec:     spec,
		goferFDs: goferFDs,
	}
	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
	if err != nil {
		return fmt.Errorf("creating new process: %w", err)
	}

	// Use stdios or TTY depending on the spec configuration.
	if spec.Process.Terminal {
		if l := len(stdioFDs); l != 0 {
			return fmt.Errorf("using TTY, stdios not expected: %d", l)
		}
		if ep.hostTTY == nil {
			return fmt.Errorf("terminal enabled but no TTY provided (--console-socket possibly not passed)")
		}
		info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
		ep.hostTTY = nil
	} else {
		info.stdioFDs = stdioFDs
	}

	ep.tg, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(false, cid, info)
	if err != nil {
		return err
	}
	l.k.StartProcess(ep.tg)
	return nil
}

func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
	// Create the FD map, which will set stdin, stdout, and stderr.
	ctx := info.procArgs.NewContext(l.k)
	fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs)
	if err != nil {
		return nil, nil, nil, fmt.Errorf("importing fds: %w", err)
	}
	// CreateProcess takes a reference on fdTable if successful. We won't need
	// ours either way.
	info.procArgs.FDTable = fdTable

	// Setup the child container file system.
	l.startGoferMonitor(cid, info.goferFDs)

	mntr := newContainerMounter(info, l.k, l.mountHints, kernel.VFS2Enabled)
	if root {
		if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
			return nil, nil, nil, err
		}
	}
	if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
		return nil, nil, nil, err
	}

	// Add the HOME environment variable if it is not already set.
	var envv []string
	if kernel.VFS2Enabled {
		envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
			info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
	} else {
		envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
			info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
	}
	if err != nil {
		return nil, nil, nil, err
	}
	info.procArgs.Envv = envv

	// Create and start the new process.
	tg, _, err := l.k.CreateProcess(info.procArgs)
	if err != nil {
		return nil, nil, nil, fmt.Errorf("creating process: %w", err)
	}
	// CreateProcess takes a reference on FDTable if successful.
	info.procArgs.FDTable.DecRef(ctx)

	// Set the foreground process group on the TTY to the global init process
	// group, since that is what we are about to start running.
	switch {
	case ttyFileVFS2 != nil:
		ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
	case ttyFile != nil:
		ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
	}

	// Install seccomp filters with the new task if there are any.
	if info.conf.OCISeccomp {
		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
			if err != nil {
				return nil, nil, nil, fmt.Errorf("building seccomp program: %w", err)
			}

			if log.IsLogging(log.Debug) {
				out, _ := bpf.DecodeProgram(program)
				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
			}

			task := tg.Leader()
			// NOTE: It seems Flags are ignored by runc so we ignore them too.
			if err := task.AppendSyscallFilter(program, true); err != nil {
				return nil, nil, nil, fmt.Errorf("appending seccomp filters: %w", err)
			}
		}
	} else {
		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
			log.Warningf("Seccomp spec is being ignored")
		}
	}

	return tg, ttyFile, ttyFileVFS2, nil
}

// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
// the gofer FDs looking for disconnects, and kills the container processes if
// a disconnect occurs in any of the gofer FDs.
func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
	go func() {
		log.Debugf("Monitoring gofer health for container %q", cid)
		var events []unix.PollFd
		for _, goferFD := range goferFDs {
			events = append(events, unix.PollFd{
				Fd:     int32(goferFD.FD()),
				Events: unix.POLLHUP | unix.POLLRDHUP,
			})
		}
		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
			// Use ppoll instead of poll because it's already whitelisted in seccomp.
			n, err := unix.Ppoll(events, nil, nil)
			return uintptr(n), 0, err
		})
		if err != nil {
			panic(fmt.Sprintf("Error monitoring gofer FDs: %s", err))
		}

		l.mu.Lock()
		defer l.mu.Unlock()

		// The gofer could have been stopped due to a normal container shutdown.
		// Check if the container has not stopped yet.
		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
			log.Infof("Gofer socket disconnected, killing container %q", cid)
			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
				log.Warningf("Error killing container %q after gofer stopped: %s", cid, err)
			}
		}
	}()
}

// destroyContainer stops a container if it is still running and cleans up its
// filesystem.
func (l *Loader) destroyContainer(cid string) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
	if err != nil {
		// Container doesn't exist.
		return err
	}

	// The container exists, but has it been started?
	if tg != nil {
		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
			return fmt.Errorf("sending SIGKILL to all container processes: %w", err)
		}
		// Wait for all processes that belong to the container to exit (including
		// exec'd processes).
		for _, t := range l.k.TaskSet().Root.Tasks() {
			if t.ContainerID() == cid {
				t.ThreadGroup().WaitExited()
			}
		}

		// At this point, all processes inside of the container have exited,
		// releasing all references to the container's MountNamespace and
		// causing all submounts and overlays to be unmounted.
		//
		// Since the container's MountNamespace has been released,
		// MountNamespace.destroy() will have executed, but that function may
		// trigger async close operations. We must wait for those to complete
		// before returning, otherwise the caller may kill the gofer before
		// they complete, causing a cascade of failing RPCs.
		fs.AsyncBarrier()
	}

	// No more failures from this point on. Remove all container thread groups
	// from the map.
	for key := range l.processes {
		if key.cid == cid {
			delete(l.processes, key)
		}
	}

	log.Debugf("Container destroyed, cid: %s", cid)
	return nil
}

func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
	// Hold the lock for the entire operation to ensure that the exec'd process
	// is added to 'processes' in case it races with destroyContainer().
	l.mu.Lock()
	defer l.mu.Unlock()

	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
	if err != nil {
		return 0, err
	}
	if tg == nil {
		return 0, fmt.Errorf("container %q not started", args.ContainerID)
	}

	// Get the container MountNamespace from the Task. Trying to acquire the ref
	// may fail if it races with task exit.
	if kernel.VFS2Enabled {
		// task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves.
		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
		if !args.MountNamespaceVFS2.TryIncRef() {
			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
		}
	} else {
		var reffed bool
		tg.Leader().WithMuLocked(func(t *kernel.Task) {
			// task.MountNamespace() does not take a ref, so we must do so ourselves.
			args.MountNamespace = t.MountNamespace()
			reffed = args.MountNamespace.TryIncRef()
		})
		if !reffed {
			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
		}
	}

	args.Envv, err = specutils.ResolveEnvs(args.Envv)
	if err != nil {
		return 0, fmt.Errorf("resolving env: %w", err)
	}

	// Add the HOME environment variable if it is not already set.
	if kernel.VFS2Enabled {
		root := args.MountNamespaceVFS2.Root()
		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
		defer args.MountNamespaceVFS2.DecRef(ctx)
		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
		if err != nil {
			return 0, err
		}
		args.Envv = envv
	} else {
		root := args.MountNamespace.Root()
		ctx := fs.WithRoot(l.k.SupervisorContext(), root)
		defer args.MountNamespace.DecRef(ctx)
		defer root.DecRef(ctx)
		envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
		if err != nil {
			return 0, err
		}
		args.Envv = envv
	}
	args.PIDNamespace = tg.PIDNamespace()

	args.Limits, err = createLimitSet(l.root.spec)
	if err != nil {
		return 0, fmt.Errorf("creating limits: %w", err)
	}

	// Start the process.
	proc := control.Proc{Kernel: l.k}
	newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args)
	if err != nil {
		return 0, err
	}

	eid := execID{cid: args.ContainerID, pid: tgid}
	l.processes[eid] = &execProcess{
		tg:      newTG,
		tty:     ttyFile,
		ttyVFS2: ttyFileVFS2,
	}
	log.Debugf("updated processes: %s", l.processes)

	return tgid, nil
}

// waitContainer waits for the init process of a container to exit.
func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
	// Don't defer unlock, as doing so would make it impossible for
	// multiple clients to wait on the same container.
	tg, err := l.threadGroupFromID(execID{cid: cid})
	if err != nil {
		return fmt.Errorf("can't wait for container %q: %w", cid, err)
	}

	// If the thread either has already exited or exits during waiting,
	// consider the container exited.
	ws := l.wait(tg)
	*waitStatus = ws

	// Check for leaks and write coverage report after the root container has
	// exited. This guarantees that the report is written in cases where the
	// sandbox is killed by a signal after the ContainerWait request is completed.
	if l.root.procArgs.ContainerID == cid {
		// All sentry-created resources should have been released at this point.
		refsvfs2.DoLeakCheck()
		_ = coverage.Report()
	}
	return nil
}

func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
	if tgid <= 0 {
		return fmt.Errorf("PID (%d) must be positive", tgid)
	}

	// Try to find a process that was exec'd.
	eid := execID{cid: cid, pid: tgid}
	execTG, err := l.threadGroupFromID(eid)
	if err == nil {
		ws := l.wait(execTG)
		*waitStatus = ws

		l.mu.Lock()
		delete(l.processes, eid)
		log.Debugf("updated processes (removal): %s", l.processes)
		l.mu.Unlock()
		return nil
	}

	// The caller may be waiting on a process not started directly via exec.
	// In this case, find the process in the container's PID namespace.
	initTG, err := l.threadGroupFromID(execID{cid: cid})
	if err != nil {
		return fmt.Errorf("waiting for PID %d: %w", tgid, err)
	}
	tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
	if tg == nil {
		return fmt.Errorf("waiting for PID %d: no such process", tgid)
	}
	if tg.Leader().ContainerID() != cid {
		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
	}
	ws := l.wait(tg)
	*waitStatus = ws
	return nil
}

// wait waits for the given thread group to exit and returns its exit status.
func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
	tg.WaitExited()
	return tg.ExitStatus().Status()
}
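
// decodeWaitStatus is a hypothetical helper (not called anywhere in this file)
// sketching how a caller can interpret the waitStatus value filled in by
// waitContainer and waitPID. It assumes the value uses the standard Linux
// wait(2) status encoding.
func decodeWaitStatus(waitStatus uint32) (exitCode int, signaled bool, sig unix.Signal) {
	ws := unix.WaitStatus(waitStatus)
	if ws.Signaled() {
		// The process was killed by a signal, e.g. SIGKILL from destroyContainer.
		return 0, true, ws.Signal()
	}
	// Normal exit: report the application's exit code.
	return ws.ExitStatus(), false, 0
}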

// WaitForStartSignal waits for a start signal from the control server.
func (l *Loader) WaitForStartSignal() {
	<-l.ctrl.manager.startChan
}

// WaitExit waits for the root container to exit, and returns its exit status.
func (l *Loader) WaitExit() kernel.ExitStatus {
	// Wait for container.
	l.k.WaitExited()

	// Check all references.
	refs.OnExit()

	return l.k.GlobalInit().ExitStatus()
}

func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
	// Create an empty network stack because the network namespace may be empty at
	// this point. Netns is configured before Run() is called. Netstack is
	// configured using a control uRPC message. Host network is configured inside
	// Run().
	switch conf.Network {
	case config.NetworkHost:
		// No network namespacing support for hostinet yet, hence creator is nil.
		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil

	case config.NetworkNone, config.NetworkSandbox:
		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
		if err != nil {
			return nil, err
		}
		creator := &sandboxNetstackCreator{
			clock:    clock,
			uniqueID: uniqueID,
		}
		return inet.NewRootNamespace(s, creator), nil

	default:
		panic(fmt.Sprintf("invalid network configuration: %d", conf.Network))
	}
}

func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
	transProtos := []stack.TransportProtocolFactory{
		tcp.NewProtocol,
		udp.NewProtocol,
		icmp.NewProtocol4,
		icmp.NewProtocol6,
	}
	s := netstack.Stack{Stack: stack.New(stack.Options{
		NetworkProtocols:   netProtos,
		TransportProtocols: transProtos,
		Clock:              clock,
		Stats:              netstack.Metrics,
		HandleLocal:        true,
		// Enable raw sockets for users with sufficient
		// privileges.
		RawFactory:      raw.EndpointFactory{},
		UniqueID:        uniqueID,
		DefaultIPTables: netfilter.DefaultLinuxTables,
	})}

	// Enable SACK Recovery.
	{
		opt := tcpip.TCPSACKEnabled(true)
		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
		}
	}

	// Set default TTLs as required by socket/netstack.
	{
		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
		}
		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
		}
	}

	// Enable Receive Buffer Auto-Tuning.
	{
		opt := tcpip.TCPModerateReceiveBufferOption(true)
		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
		}
	}

	return &s, nil
}

// sandboxNetstackCreator implements kernel.NetworkStackCreator.
//
// +stateify savable
type sandboxNetstackCreator struct {
	clock    tcpip.Clock
	uniqueID stack.UniqueID
}

// CreateStack implements kernel.NetworkStackCreator.CreateStack.
func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
	if err != nil {
		return nil, err
	}

	// Setup loopback.
	n := &Network{Stack: s.(*netstack.Stack).Stack}
	nicID := tcpip.NICID(f.uniqueID.UniqueID())
	link := DefaultLoopbackLink
	linkEP := loopback.New()
	if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
		return nil, err
	}

	return s, nil
}

// signal sends a signal to one or more processes in a container. If PID is 0,
// then the container init process is used. Depending on the SignalDeliveryMode
// option, the signal may be sent directly to the indicated process, to all
// processes in the container, or to the foreground process group. pid is
// relative to the root PID namespace, not the container's.
func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
	if pid < 0 {
		return fmt.Errorf("PID (%d) must be positive", pid)
	}

	switch mode {
	case DeliverToProcess:
		if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
			return fmt.Errorf("signaling process in container %q PID %d: %w", cid, pid, err)
		}
		return nil

	case DeliverToForegroundProcessGroup:
		if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
			return fmt.Errorf("signaling foreground process group in container %q PID %d: %w", cid, pid, err)
		}
		return nil

	case DeliverToAllProcesses:
		if pid != 0 {
			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
		}
		// Check that the container has actually started before signaling it.
		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
			return err
		}
		if err := l.signalAllProcesses(cid, signo); err != nil {
			return fmt.Errorf("signaling all processes in container %q: %w", cid, err)
		}
		return nil

	default:
		panic(fmt.Sprintf("unknown signal delivery mode %s", mode))
	}
}

// signalProcess sends a signal to a process in the given container. tgid is
// relative to the root PID namespace, not the container's.
func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
	if err == nil {
		// Send signal directly to the identified process.
		return l.k.SendExternalSignalThreadGroup(execTG, &linux.SignalInfo{Signo: signo})
	}

	// The caller may be signaling a process not started directly via exec.
	// In this case, find the process and check that the process belongs to the
	// container in question.
	tg := l.k.RootPIDNamespace().ThreadGroupWithID(tgid)
	if tg == nil {
		return fmt.Errorf("no such process with PID %d", tgid)
	}
	if tg.Leader().ContainerID() != cid {
		return fmt.Errorf("process %d belongs to a different container: %q", tgid, tg.Leader().ContainerID())
	}
	return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
}
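
// exampleSignalContainer is a hypothetical sketch (not used by the controller)
// of how the SignalDeliveryMode values handled by signal() are typically
// chosen: DeliverToAllProcesses with pid 0 for container-wide signals such as
// a kill, DeliverToForegroundProcessGroup for job-control signals like ^C when
// a terminal is attached, and DeliverToProcess for a specific PID otherwise.
func (l *Loader) exampleSignalContainer(cid string, pid int32, signo int32, all, foreground bool) error {
	mode := DeliverToProcess
	switch {
	case all:
		mode = DeliverToAllProcesses
		pid = 0 // required by signal() when signaling all processes
	case foreground:
		mode = DeliverToForegroundProcessGroup
	}
	return l.signal(cid, pid, signo, mode)
}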

// signalForegrondProcessGroup looks up the foreground process group from the
// TTY for the given tgid inside container cid, and sends the signal to it.
func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
	l.mu.Lock()
	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
	if err != nil {
		l.mu.Unlock()
		return fmt.Errorf("no thread group found: %w", err)
	}
	if tg == nil {
		l.mu.Unlock()
		return fmt.Errorf("container %q not started", cid)
	}

	tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
	l.mu.Unlock()
	if err != nil {
		return fmt.Errorf("no thread group found: %w", err)
	}

	var pg *kernel.ProcessGroup
	switch {
	case ttyVFS2 != nil:
		pg = ttyVFS2.ForegroundProcessGroup()
	case tty != nil:
		pg = tty.ForegroundProcessGroup()
	default:
		return fmt.Errorf("no TTY attached")
	}
	if pg == nil {
		// No foreground process group has been set. Signal the
		// original thread group.
		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
		return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
	}
	// Send the signal to all processes in the process group.
	var lastErr error
	for _, tg := range l.k.TaskSet().Root.ThreadGroups() {
		if tg.ProcessGroup() != pg {
			continue
		}
		if err := l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo}); err != nil {
			lastErr = err
		}
	}
	return lastErr
}

// signalAllProcesses signals all processes that belong to the specified
// container. It's a noop if the container hasn't started or has exited.
func (l *Loader) signalAllProcesses(cid string, signo int32) error {
	// Pause the kernel to prevent new processes from being created while
	// the signal is delivered. This prevents process leaks when SIGKILL is
	// sent to the entire container.
	l.k.Pause()
	defer l.k.Unpause()
	return l.k.SendContainerSignal(cid, &linux.SignalInfo{Signo: signo})
}

// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
// acquires the mutex before calling it and fails if the container hasn't
// started yet.
func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	tg, err := l.tryThreadGroupFromIDLocked(key)
	if err != nil {
		return nil, err
	}
	if tg == nil {
		return nil, fmt.Errorf("container %q not started", key.cid)
	}
	return tg, nil
}

// tryThreadGroupFromIDLocked returns the thread group for the given execution
// ID. It may return nil in case the container has not started yet. Returns
// error if the execution ID is invalid or if the container cannot be found
// (maybe it has been deleted). Caller must hold 'mu'.
func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
	ep := l.processes[key]
	if ep == nil {
		return nil, fmt.Errorf("container %q not found", key.cid)
	}
	return ep.tg, nil
}

// ttyFromIDLocked returns the TTY files for the given execution ID. It may
// return nil in case the container has not started yet. Returns error if the
// execution ID is invalid or if the container cannot be found (maybe it has
// been deleted). Caller must hold 'mu'.
func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
	ep := l.processes[key]
	if ep == nil {
		return nil, nil, fmt.Errorf("container %q not found", key.cid)
	}
	return ep.tty, ep.ttyVFS2, nil
}

func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
	if len(stdioFDs) != 3 {
		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
	}

	k := kernel.KernelFromContext(ctx)
	fdTable := k.NewFDTable()
	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
	if err != nil {
		fdTable.DecRef(ctx)
		return nil, nil, nil, err
	}
	return fdTable, ttyFile, ttyFileVFS2, nil
}
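
// runSandboxSketch is a hedged, illustrative outline (hypothetical; the real
// sequencing lives in runsc's cmd and sandbox packages) of how the pieces in
// this file fit together for the root container: construct the Loader, wait
// for the controller's start signal, run the container, wait for it to exit,
// and tear everything down.
func runSandboxSketch(args Args) (kernel.ExitStatus, error) {
	l, err := New(args)
	if err != nil {
		return kernel.ExitStatus{}, fmt.Errorf("creating loader: %w", err)
	}

	// The control server created in New drives startup; block until it tells
	// us to start.
	l.WaitForStartSignal()
	if err := l.Run(); err != nil {
		l.Destroy()
		return kernel.ExitStatus{}, fmt.Errorf("running container: %w", err)
	}

	ws := l.WaitExit()
	// Destroy must not be deferred (see its doc comment); call it explicitly
	// once the container has exited.
	l.Destroy()
	return ws, nil
}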