gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/kernel.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package kernel provides an emulation of the Linux kernel.
    16  //
    17  // See README.md for a detailed overview.
    18  //
    19  // Lock order (outermost locks must be taken first):
    20  //
    21  //	Kernel.extMu
    22  //		ThreadGroup.timerMu
    23  //		  ktime.Timer.mu (for IntervalTimer) and Kernel.cpuClockMu
    24  //		    TaskSet.mu
    25  //		      SignalHandlers.mu
    26  //		        Task.mu
    27  //		    runningTasksMu
    28  //
    29  // Locking SignalHandlers.mu in multiple SignalHandlers requires locking
    30  // TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
    31  // time requires locking all of their signal mutexes first.
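         //
         // As an illustrative sketch (names here are hypothetical, not code from
         // this package), signaling two thread groups at once must follow the
         // order above:
         //
         //	ts.mu.Lock() // TaskSet.mu, taken exclusively
         //	tg1.signalHandlers.mu.Lock()
         //	tg2.signalHandlers.mu.Lock()
         //	// ... deliver the signals ...
         //	tg2.signalHandlers.mu.Unlock()
         //	tg1.signalHandlers.mu.Unlock()
         //	ts.mu.Unlock()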
    32  package kernel
    33  
    34  import (
    35  	"errors"
    36  	"fmt"
    37  	"io"
    38  	"path/filepath"
    39  	"time"
    40  
    41  	"gvisor.dev/gvisor/pkg/abi/linux"
    42  	"gvisor.dev/gvisor/pkg/atomicbitops"
    43  	"gvisor.dev/gvisor/pkg/cleanup"
    44  	"gvisor.dev/gvisor/pkg/context"
    45  	"gvisor.dev/gvisor/pkg/cpuid"
    46  	"gvisor.dev/gvisor/pkg/devutil"
    47  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    48  	"gvisor.dev/gvisor/pkg/eventchannel"
    49  	"gvisor.dev/gvisor/pkg/fd"
    50  	"gvisor.dev/gvisor/pkg/fspath"
    51  	"gvisor.dev/gvisor/pkg/log"
    52  	"gvisor.dev/gvisor/pkg/refs"
    53  	"gvisor.dev/gvisor/pkg/sentry/arch"
    54  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
    55  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
    56  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
    57  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd"
    58  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
    59  	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
    60  	"gvisor.dev/gvisor/pkg/sentry/inet"
    61  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    62  	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
    63  	"gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
    64  	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
    65  	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
    66  	"gvisor.dev/gvisor/pkg/sentry/limits"
    67  	"gvisor.dev/gvisor/pkg/sentry/loader"
    68  	"gvisor.dev/gvisor/pkg/sentry/mm"
    69  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    70  	"gvisor.dev/gvisor/pkg/sentry/platform"
    71  	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
    72  	sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
    73  	"gvisor.dev/gvisor/pkg/sentry/unimpl"
    74  	uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
    75  	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
    76  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    77  	"gvisor.dev/gvisor/pkg/state"
    78  	"gvisor.dev/gvisor/pkg/state/statefile"
    79  	"gvisor.dev/gvisor/pkg/sync"
    80  	"gvisor.dev/gvisor/pkg/tcpip"
    81  )
    82  
    83  // IOUringEnabled is set to true when IO_URING is enabled. Added as a global to
    84  // allow easy access everywhere.
    85  var IOUringEnabled = false
    86  
    87  // UserCounters is a set of user counters.
    88  //
    89  // +stateify savable
    90  type UserCounters struct {
    91  	uid auth.KUID
    92  
    93  	rlimitNProc atomicbitops.Uint64
    94  }
    95  
    96  // incRLimitNProc increments the rlimitNProc counter.
    97  func (uc *UserCounters) incRLimitNProc(ctx context.Context) error {
    98  	lim := limits.FromContext(ctx).Get(limits.ProcessCount)
    99  	creds := auth.CredentialsFromContext(ctx)
   100  	nproc := uc.rlimitNProc.Add(1)
   101  	if nproc > lim.Cur &&
   102  		!creds.HasCapability(linux.CAP_SYS_ADMIN) &&
   103  		!creds.HasCapability(linux.CAP_SYS_RESOURCE) {
    104  		uc.rlimitNProc.Add(^uint64(0)) // Adding ^uint64(0), i.e. -1, undoes the increment.
   105  		return linuxerr.EAGAIN
   106  	}
   107  	return nil
   108  }
   109  
   110  // decRLimitNProc decrements the rlimitNProc counter.
   111  func (uc *UserCounters) decRLimitNProc() {
   112  	uc.rlimitNProc.Add(^uint64(0))
   113  }
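
         // A sketch of how the pair is used around task creation (hypothetical call
         // site; the real callers live in the task creation and exit paths):
         //
         //	if err := uc.incRLimitNProc(ctx); err != nil {
         //		return err // RLIMIT_NPROC would be exceeded.
         //	}
         //	// ... attempt to create the task ...
         //	if creationFailed {
         //		uc.decRLimitNProc()
         //	}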
   114  
   115  // CgroupMount contains the cgroup mount. These mounts are created for the root
   116  // container by default and are stored in the kernel.
   117  //
   118  // +stateify savable
   119  type CgroupMount struct {
   120  	Fs    *vfs.Filesystem
   121  	Root  *vfs.Dentry
   122  	Mount *vfs.Mount
   123  }
   124  
   125  // Kernel represents an emulated Linux kernel. It must be initialized by calling
   126  // Init() or LoadFrom().
   127  //
   128  // +stateify savable
   129  type Kernel struct {
   130  	// extMu serializes external changes to the Kernel with calls to
   131  	// Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
   132  	// remains frozen for the duration of the call; it requires that the Kernel
   133  	// is paused as a precondition, which ensures that none of the tasks
   134  	// running within the Kernel can affect its state, but extMu is required to
   135  	// ensure that concurrent users of the Kernel *outside* the Kernel's
   136  	// control cannot affect its state by calling e.g.
   137  	// Kernel.SendExternalSignal.)
   138  	extMu sync.Mutex `state:"nosave"`
   139  
   140  	// started is true if Start has been called. Unless otherwise specified,
   141  	// all Kernel fields become immutable once started becomes true.
   142  	started bool `state:"nosave"`
   143  
   144  	// All of the following fields are immutable unless otherwise specified.
   145  
   146  	// Platform is the platform that is used to execute tasks in the created
   147  	// Kernel.
   148  	platform.Platform `state:"nosave"`
   149  
   150  	// mf provides application memory.
   151  	mf *pgalloc.MemoryFile `state:"nosave"`
   152  
   153  	// See InitKernelArgs for the meaning of these fields.
   154  	featureSet           cpuid.FeatureSet
   155  	timekeeper           *Timekeeper
   156  	tasks                *TaskSet
   157  	rootUserNamespace    *auth.UserNamespace
   158  	rootNetworkNamespace *inet.Namespace
   159  	applicationCores     uint
   160  	useHostCores         bool
   161  	extraAuxv            []arch.AuxEntry
   162  	vdso                 *loader.VDSO
   163  	rootUTSNamespace     *UTSNamespace
   164  	rootIPCNamespace     *IPCNamespace
   165  
   166  	// futexes is the "root" futex.Manager, from which all others are forked.
   167  	// This is necessary to ensure that shared futexes are coherent across all
   168  	// tasks, including those created by CreateProcess.
   169  	futexes *futex.Manager
   170  
   171  	// globalInit is the thread group whose leader has ID 1 in the root PID
   172  	// namespace. globalInit is stored separately so that it is accessible even
   173  	// after all tasks in the thread group have exited, such that ID 1 is no
   174  	// longer mapped.
   175  	//
   176  	// globalInit is mutable until it is assigned by the first successful call
   177  	// to CreateProcess, and is protected by extMu.
   178  	globalInit *ThreadGroup
   179  
   180  	// syslog is the kernel log.
   181  	syslog syslog
   182  
   183  	runningTasksMu runningTasksMutex `state:"nosave"`
   184  
    185  	// runningTasks is the total count of tasks currently in
    186  	// TaskGoroutineRunningSys or TaskGoroutineRunningApp, i.e. tasks that
    187  	// are not blocked or stopped.
   188  	//
   189  	// runningTasks must be accessed atomically. Increments from 0 to 1 are
   190  	// further protected by runningTasksMu (see incRunningTasks).
   191  	runningTasks atomicbitops.Int64
   192  
   193  	// runningTasksCond is signaled when runningTasks is incremented from 0 to 1.
   194  	//
   195  	// Invariant: runningTasksCond.L == &runningTasksMu.
   196  	runningTasksCond sync.Cond `state:"nosave"`
   197  
   198  	// cpuClock is incremented every linux.ClockTick by a goroutine running
   199  	// kernel.runCPUClockTicker() while runningTasks != 0.
   200  	//
   201  	// cpuClock is used to measure task CPU usage, since sampling monotonicClock
   202  	// twice on every syscall turns out to be unreasonably expensive. This is
   203  	// similar to how Linux does task CPU accounting on x86
   204  	// (CONFIG_IRQ_TIME_ACCOUNTING), although Linux also uses scheduler timing
   205  	// information to improve resolution
   206  	// (kernel/sched/cputime.c:cputime_adjust()), which we can't do since
    207  	// "preemptive" scheduling is managed by the Go runtime, which doesn't
   208  	// provide this information.
   209  	//
   210  	// cpuClock is mutable, and is accessed using atomic memory operations.
   211  	cpuClock atomicbitops.Uint64
   212  
   213  	// cpuClockTickTimer drives increments of cpuClock.
   214  	cpuClockTickTimer *time.Timer `state:"nosave"`
   215  
   216  	// cpuClockMu is used to make increments of cpuClock, and updates of timers
   217  	// based on cpuClock, atomic.
   218  	cpuClockMu cpuClockMutex `state:"nosave"`
   219  
   220  	// cpuClockTickerRunning is true if the goroutine that increments cpuClock is
   221  	// running and false if it is blocked in runningTasksCond.Wait() or if it
   222  	// never started.
   223  	//
   224  	// cpuClockTickerRunning is protected by runningTasksMu.
   225  	cpuClockTickerRunning bool
   226  
    227  	// cpuClockTickerWakeCh is sent to in order to wake the goroutine that
    228  	// increments cpuClock if it's sleeping between ticks.
   229  	cpuClockTickerWakeCh chan struct{} `state:"nosave"`
   230  
   231  	// cpuClockTickerStopCond is broadcast when cpuClockTickerRunning transitions
   232  	// from true to false.
   233  	//
   234  	// Invariant: cpuClockTickerStopCond.L == &runningTasksMu.
   235  	cpuClockTickerStopCond sync.Cond `state:"nosave"`
   236  
   237  	// uniqueID is used to generate unique identifiers.
   238  	//
   239  	// uniqueID is mutable, and is accessed using atomic memory operations.
   240  	uniqueID atomicbitops.Uint64
   241  
   242  	// nextInotifyCookie is a monotonically increasing counter used for
   243  	// generating unique inotify event cookies.
   244  	//
   245  	// nextInotifyCookie is mutable.
   246  	nextInotifyCookie atomicbitops.Uint32
   247  
   248  	// netlinkPorts manages allocation of netlink socket port IDs.
   249  	netlinkPorts *port.Manager
   250  
   251  	// saveStatus is nil if the sandbox has not been saved, errSaved or
   252  	// errAutoSaved if it has been saved successfully, or the error causing the
   253  	// sandbox to exit during save.
   254  	// It is protected by extMu.
   255  	saveStatus error `state:"nosave"`
   256  
   257  	// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
   258  	danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
   259  
   260  	// sockets records all network sockets in the system. Protected by extMu.
   261  	sockets map[*vfs.FileDescription]*SocketRecord
   262  
   263  	// nextSocketRecord is the next entry number to use in sockets. Protected
   264  	// by extMu.
   265  	nextSocketRecord uint64
   266  
   267  	// unimplementedSyscallEmitterOnce is used in the initialization of
   268  	// unimplementedSyscallEmitter.
   269  	unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`
   270  
   271  	// unimplementedSyscallEmitter is used to emit unimplemented syscall
   272  	// events. This is initialized lazily on the first unimplemented
   273  	// syscall.
   274  	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
   275  
   276  	// SpecialOpts contains special kernel options.
   277  	SpecialOpts
   278  
   279  	// vfs keeps the filesystem state used across the kernel.
   280  	vfs vfs.VirtualFilesystem
   281  
   282  	// hostMount is the Mount used for file descriptors that were imported
   283  	// from the host.
   284  	hostMount *vfs.Mount
   285  
   286  	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
   287  	// syscalls (as opposed to named pipes created by mknod()).
   288  	pipeMount *vfs.Mount
   289  
   290  	// nsfsMount is the Mount used for namespaces.
   291  	nsfsMount *vfs.Mount
   292  
   293  	// shmMount is the Mount used for anonymous files created by the
   294  	// memfd_create() syscalls. It is analogous to Linux's shm_mnt.
   295  	shmMount *vfs.Mount
   296  
   297  	// socketMount is the Mount used for sockets created by the socket() and
   298  	// socketpair() syscalls. There are several cases where a socket dentry will
   299  	// not be contained in socketMount:
   300  	// 1. Socket files created by mknod()
   301  	// 2. Socket fds imported from the host (Kernel.hostMount is used for these)
   302  	// 3. Socket files created by binding Unix sockets to a file path
   303  	socketMount *vfs.Mount
   304  
   305  	// sysVShmDevID is the device number used by SysV shm segments. In Linux,
   306  	// SysV shm uses shmem_file_setup() and thus uses shm_mnt's device number.
    307  	// In gVisor, the shm implementation does not use shmMount; extracting
    308  	// shmMount's device number is inconvenient; applications accept a
    309  	// different device number in practice; and using a distinct device number
   310  	// avoids the possibility of inode number collisions due to the hack
   311  	// described in shm.Shm.InodeID().
   312  	sysVShmDevID uint32
   313  
    314  	// If SleepForAddressSpaceActivation is true, address space activation waits
    315  	// are reported as external waits so that the watchdog doesn't report the task as stuck.
   316  	SleepForAddressSpaceActivation bool
   317  
   318  	// Exceptions to YAMA ptrace restrictions. Each key-value pair represents a
   319  	// tracee-tracer relationship. The key is a process (technically, the thread
   320  	// group leader) that can be traced by any thread that is a descendant of the
   321  	// value. If the value is nil, then anyone can trace the process represented by
   322  	// the key.
   323  	//
   324  	// ptraceExceptions is protected by the TaskSet mutex.
   325  	ptraceExceptions map[*Task]*Task
   326  
   327  	// YAMAPtraceScope is the current level of YAMA ptrace restrictions.
   328  	YAMAPtraceScope atomicbitops.Int32
   329  
   330  	// cgroupRegistry contains the set of active cgroup controllers on the
    331  	// system. It is controlled by cgroupfs. Nil if cgroupfs is unavailable on
   332  	// the system.
   333  	cgroupRegistry *CgroupRegistry
   334  
   335  	// cgroupMountsMap maps the cgroup controller names to the cgroup mounts
   336  	// created for the root container. These mounts are then bind mounted
   337  	// for other application containers by creating their own container
   338  	// directories.
   339  	cgroupMountsMap   map[string]*CgroupMount
   340  	cgroupMountsMapMu cgroupMountsMutex `state:"nosave"`
   341  
   342  	// userCountersMap maps auth.KUID into a set of user counters.
   343  	userCountersMap   map[auth.KUID]*UserCounters
   344  	userCountersMapMu userCountersMutex `state:"nosave"`
   345  
   346  	// MaxFDLimit specifies the maximum file descriptor number that can be
   347  	// used by processes.
   348  	MaxFDLimit atomicbitops.Int32
   349  
    350  	// devGofers maps each container (by name) to its device gofer client.
   351  	devGofers   map[string]*devutil.GoferClient `state:"nosave"`
   352  	devGofersMu sync.Mutex                      `state:"nosave"`
   353  
    354  	// containerNames stores the container name for each container ID.
    355  	// Names are preserved across save/restore sessions, while IDs can change.
   356  	//
   357  	// Mapping: cid -> name.
   358  	// It's protected by extMu.
   359  	containerNames map[string]string
   360  }
   361  
   362  // InitKernelArgs holds arguments to Init.
   363  type InitKernelArgs struct {
   364  	// FeatureSet is the emulated CPU feature set.
   365  	FeatureSet cpuid.FeatureSet
   366  
   367  	// Timekeeper manages time for all tasks in the system.
   368  	Timekeeper *Timekeeper
   369  
   370  	// RootUserNamespace is the root user namespace.
   371  	RootUserNamespace *auth.UserNamespace
   372  
   373  	// RootNetworkNamespace is the root network namespace. If nil, no networking
   374  	// will be available.
   375  	RootNetworkNamespace *inet.Namespace
   376  
   377  	// ApplicationCores is the number of logical CPUs visible to sandboxed
   378  	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
   379  	// ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
   380  	// most significant bit in cpu_possible_mask + 1.
   381  	ApplicationCores uint
   382  
   383  	// If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
   384  	// instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
   385  	// no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
   386  	// will be overridden.
   387  	UseHostCores bool
   388  
   389  	// ExtraAuxv contains additional auxiliary vector entries that are added to
   390  	// each process by the ELF loader.
   391  	ExtraAuxv []arch.AuxEntry
   392  
   393  	// Vdso holds the VDSO and its parameter page.
   394  	Vdso *loader.VDSO
   395  
   396  	// RootUTSNamespace is the root UTS namespace.
   397  	RootUTSNamespace *UTSNamespace
   398  
   399  	// RootIPCNamespace is the root IPC namespace.
   400  	RootIPCNamespace *IPCNamespace
   401  
   402  	// PIDNamespace is the root PID namespace.
   403  	PIDNamespace *PIDNamespace
   404  
   405  	// MaxFDLimit specifies the maximum file descriptor number that can be
   406  	// used by processes.  If it is zero, the limit will be set to
   407  	// unlimited.
   408  	MaxFDLimit int32
   409  }
   410  
    411  // Init initializes the Kernel with no tasks.
   412  //
   413  // Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
   414  // before calling Init.
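         //
         // A minimal setup sketch (illustrative; assumes a platform p, a MemoryFile
         // mf, and a Timekeeper tk whose SetClocks has already been called):
         //
         //	k := &Kernel{Platform: p}
         //	k.SetMemoryFile(mf)
         //	err := k.Init(InitKernelArgs{
         //		FeatureSet:        cpuid.HostFeatureSet(),
         //		Timekeeper:        tk,
         //		RootUserNamespace: auth.NewRootUserNamespace(),
         //		ApplicationCores:  4,
         //		// RootUTSNamespace, RootIPCNamespace, etc. elided.
         //	})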
   415  func (k *Kernel) Init(args InitKernelArgs) error {
   416  	if args.Timekeeper == nil {
   417  		return fmt.Errorf("args.Timekeeper is nil")
   418  	}
   419  	if args.Timekeeper.clocks == nil {
   420  		return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
   421  	}
   422  	if args.RootUserNamespace == nil {
   423  		return fmt.Errorf("args.RootUserNamespace is nil")
   424  	}
   425  	if args.ApplicationCores == 0 {
   426  		return fmt.Errorf("args.ApplicationCores is 0")
   427  	}
   428  
   429  	k.featureSet = args.FeatureSet
   430  	k.timekeeper = args.Timekeeper
   431  	k.tasks = newTaskSet(args.PIDNamespace)
   432  	k.rootUserNamespace = args.RootUserNamespace
   433  	k.rootUTSNamespace = args.RootUTSNamespace
   434  	k.rootIPCNamespace = args.RootIPCNamespace
   435  	k.rootNetworkNamespace = args.RootNetworkNamespace
   436  	if k.rootNetworkNamespace == nil {
   437  		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil, args.RootUserNamespace)
   438  	}
   439  	k.runningTasksCond.L = &k.runningTasksMu
   440  	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
   441  	k.cpuClockTickerStopCond.L = &k.runningTasksMu
   442  	k.applicationCores = args.ApplicationCores
   443  	if args.UseHostCores {
   444  		k.useHostCores = true
   445  		maxCPU, err := hostcpu.MaxPossibleCPU()
   446  		if err != nil {
   447  			return fmt.Errorf("failed to get maximum CPU number: %v", err)
   448  		}
   449  		minAppCores := uint(maxCPU) + 1
   450  		if k.applicationCores < minAppCores {
   451  			log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
   452  			k.applicationCores = minAppCores
   453  		}
   454  	}
   455  	k.extraAuxv = args.ExtraAuxv
   456  	k.vdso = args.Vdso
   457  	k.futexes = futex.NewManager()
   458  	k.netlinkPorts = port.New()
   459  	k.ptraceExceptions = make(map[*Task]*Task)
   460  	k.YAMAPtraceScope = atomicbitops.FromInt32(linux.YAMA_SCOPE_RELATIONAL)
   461  	k.userCountersMap = make(map[auth.KUID]*UserCounters)
   462  	if args.MaxFDLimit == 0 {
   463  		args.MaxFDLimit = MaxFdLimit
   464  	}
   465  	k.MaxFDLimit.Store(args.MaxFDLimit)
   466  	k.containerNames = make(map[string]string)
   467  
   468  	ctx := k.SupervisorContext()
   469  	if err := k.vfs.Init(ctx); err != nil {
   470  		return fmt.Errorf("failed to initialize VFS: %v", err)
   471  	}
   472  
   473  	err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx))
   474  	if err != nil {
   475  		return fmt.Errorf("failed to create mqfs filesystem: %v", err)
   476  	}
   477  
   478  	pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
   479  	if err != nil {
   480  		return fmt.Errorf("failed to create pipefs filesystem: %v", err)
   481  	}
   482  	defer pipeFilesystem.DecRef(ctx)
   483  	pipeMount := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
   484  	k.pipeMount = pipeMount
   485  
   486  	nsfsFilesystem, err := nsfs.NewFilesystem(&k.vfs)
   487  	if err != nil {
   488  		return fmt.Errorf("failed to create nsfs filesystem: %v", err)
   489  	}
   490  	defer nsfsFilesystem.DecRef(ctx)
   491  	k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
   492  	k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace))
   493  	k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
   494  	k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))
   495  
   496  	tmpfsOpts := vfs.GetFilesystemOptions{
   497  		InternalData: tmpfs.FilesystemOpts{
   498  			// See mm/shmem.c:shmem_init() => vfs_kern_mount(flags=SB_KERNMOUNT).
   499  			// Note how mm/shmem.c:shmem_fill_super() does not provide a default
   500  			// value for sbinfo->max_blocks when SB_KERNMOUNT is set.
   501  			DisableDefaultSizeLimit: true,
   502  		},
   503  		InternalMount: true,
   504  	}
   505  	tmpfsFilesystem, tmpfsRoot, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace), "", tmpfsOpts)
   506  	if err != nil {
   507  		return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
   508  	}
   509  	defer tmpfsFilesystem.DecRef(ctx)
   510  	defer tmpfsRoot.DecRef(ctx)
   511  	k.shmMount = k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})
   512  
   513  	socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
   514  	if err != nil {
   515  		return fmt.Errorf("failed to create sockfs filesystem: %v", err)
   516  	}
   517  	defer socketFilesystem.DecRef(ctx)
   518  	k.socketMount = k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
   519  
   520  	sysVShmDevMinor, err := k.vfs.GetAnonBlockDevMinor()
   521  	if err != nil {
   522  		return fmt.Errorf("failed to get device number for SysV shm: %v", err)
   523  	}
   524  	k.sysVShmDevID = linux.MakeDeviceID(linux.UNNAMED_MAJOR, sysVShmDevMinor)
   525  
   526  	k.sockets = make(map[*vfs.FileDescription]*SocketRecord)
   527  
   528  	k.cgroupRegistry = newCgroupRegistry()
   529  	return nil
   530  }
   531  
   532  // +stateify savable
   533  type privateMemoryFileMetadata struct {
   534  	owners []string
   535  }
   536  
   537  func savePrivateMFs(ctx context.Context, w io.Writer, pw io.Writer, mfsToSave map[string]*pgalloc.MemoryFile, mfOpts pgalloc.SaveOpts) error {
   538  	// mfOpts.ExcludeCommittedZeroPages is expected to reflect application
   539  	// memory usage behavior, but not necessarily usage of private MemoryFiles.
   540  	mfOpts.ExcludeCommittedZeroPages = false
   541  
   542  	var meta privateMemoryFileMetadata
   543  	// Generate the order in which private memory files are saved.
   544  	for fsID := range mfsToSave {
   545  		meta.owners = append(meta.owners, fsID)
   546  	}
   547  	// Save the metadata.
   548  	if _, err := state.Save(ctx, w, &meta); err != nil {
   549  		return err
   550  	}
   551  	// Followed by the private memory files in order.
   552  	for _, fsID := range meta.owners {
   553  		if err := mfsToSave[fsID].SaveTo(ctx, w, pw, mfOpts); err != nil {
   554  			return err
   555  		}
   556  	}
   557  	return nil
   558  }
   559  
   560  func loadPrivateMFs(ctx context.Context, r io.Reader, pr *statefile.AsyncReader) error {
   561  	// Load the metadata.
   562  	var meta privateMemoryFileMetadata
   563  	if _, err := state.Load(ctx, r, &meta); err != nil {
   564  		return err
   565  	}
   566  	mfmap := pgalloc.MemoryFileMapFromContext(ctx)
   567  	// Ensure that it is consistent with CtxFilesystemMemoryFileMap.
   568  	if len(mfmap) != len(meta.owners) {
   569  		return fmt.Errorf("inconsistent private memory files on restore: savedMFOwners = %v, CtxFilesystemMemoryFileMap = %v", meta.owners, mfmap)
   570  	}
   571  	// Load all private memory files.
   572  	for _, fsID := range meta.owners {
   573  		mf, ok := mfmap[fsID]
   574  		if !ok {
   575  			return fmt.Errorf("saved memory file for %q was not configured on restore", fsID)
   576  		}
   577  		if err := mf.LoadFrom(ctx, r, pr); err != nil {
   578  			return err
   579  		}
   580  	}
   581  	return nil
   582  }
   583  
   584  // SaveTo saves the state of k to w.
   585  //
   586  // Preconditions: The kernel must be paused throughout the call to SaveTo.
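         //
         // A checkpoint sketch satisfying the precondition (error handling elided;
         // passing nil for pagesMetadata and pagesFile writes everything to w):
         //
         //	k.Pause()
         //	defer k.Unpause()
         //	k.ReceiveTaskStates()
         //	_ = k.SaveTo(ctx, w, nil /* pagesMetadata */, nil /* pagesFile */, pgalloc.SaveOpts{})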
   587  func (k *Kernel) SaveTo(ctx context.Context, w io.Writer, pagesMetadata, pagesFile *fd.FD, mfOpts pgalloc.SaveOpts) error {
   588  	saveStart := time.Now()
   589  
   590  	// Do not allow other Kernel methods to affect it while it's being saved.
   591  	k.extMu.Lock()
   592  	defer k.extMu.Unlock()
   593  
   594  	// Stop time.
   595  	k.pauseTimeLocked(ctx)
   596  	defer k.resumeTimeLocked(ctx)
   597  
   598  	// Evict all evictable MemoryFile allocations.
   599  	k.mf.StartEvictions()
   600  	k.mf.WaitForEvictions()
   601  
   602  	// Discard unsavable mappings, such as those for host file descriptors.
   603  	if err := k.invalidateUnsavableMappings(ctx); err != nil {
   604  		return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
   605  	}
   606  
   607  	// Capture all private memory files.
   608  	mfsToSave := make(map[string]*pgalloc.MemoryFile)
   609  	vfsCtx := context.WithValue(ctx, pgalloc.CtxMemoryFileMap, mfsToSave)
   610  	// Prepare filesystems for saving. This must be done after
   611  	// invalidateUnsavableMappings(), since dropping memory mappings may
   612  	// affect filesystem state (e.g. page cache reference counts).
   613  	if err := k.vfs.PrepareSave(vfsCtx); err != nil {
   614  		return err
   615  	}
   616  	// Mark all to-be-saved MemoryFiles as savable to inform kernel save below.
   617  	k.mf.MarkSavable()
   618  	for _, mf := range mfsToSave {
   619  		mf.MarkSavable()
   620  	}
   621  
   622  	// Save the CPUID FeatureSet before the rest of the kernel so we can
   623  	// verify its compatibility on restore before attempting to restore the
   624  	// entire kernel, which may fail on an incompatible machine.
   625  	//
   626  	// N.B. This will also be saved along with the full kernel save below.
   627  	cpuidStart := time.Now()
   628  	if _, err := state.Save(ctx, w, &k.featureSet); err != nil {
   629  		return err
   630  	}
   631  	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
   632  
    633  	// The timekeeper's state is saved below as part of the kernel state.
   634  
   635  	if rootNS := k.rootNetworkNamespace; rootNS != nil && rootNS.Stack() != nil {
   636  		// Pause the network stack.
   637  		netstackPauseStart := time.Now()
   638  		log.Infof("Pausing root network namespace")
   639  		k.rootNetworkNamespace.Stack().Pause()
   640  		defer k.rootNetworkNamespace.Stack().Resume()
   641  		log.Infof("Pausing root network namespace took [%s].", time.Since(netstackPauseStart))
   642  	}
   643  
   644  	// Save the kernel state.
   645  	kernelStart := time.Now()
   646  	stats, err := state.Save(ctx, w, k)
   647  	if err != nil {
   648  		return err
   649  	}
   650  	log.Infof("Kernel save stats: %s", stats.String())
   651  	log.Infof("Kernel save took [%s].", time.Since(kernelStart))
   652  
   653  	// Save the memory files' state.
   654  	memoryStart := time.Now()
   655  	pmw := w
   656  	if pagesMetadata != nil {
   657  		pmw = pagesMetadata
   658  	}
   659  	pw := w
   660  	if pagesFile != nil {
   661  		pw = pagesFile
   662  	}
   663  	if err := k.mf.SaveTo(ctx, pmw, pw, mfOpts); err != nil {
   664  		return err
   665  	}
   666  	if err := savePrivateMFs(ctx, pmw, pw, mfsToSave, mfOpts); err != nil {
   667  		return err
   668  	}
   669  	log.Infof("Memory files save took [%s].", time.Since(memoryStart))
   670  
   671  	log.Infof("Overall save took [%s].", time.Since(saveStart))
   672  
   673  	return nil
   674  }
   675  
   676  // Preconditions: The kernel must be paused.
   677  func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
   678  	invalidated := make(map[*mm.MemoryManager]struct{})
   679  	k.tasks.mu.RLock()
   680  	defer k.tasks.mu.RUnlock()
   681  	for t := range k.tasks.Root.tids {
   682  		// We can skip locking Task.mu here since the kernel is paused.
   683  		if memMgr := t.image.MemoryManager; memMgr != nil {
   684  			if _, ok := invalidated[memMgr]; !ok {
   685  				if err := memMgr.InvalidateUnsavable(ctx); err != nil {
   686  					return err
   687  				}
   688  				invalidated[memMgr] = struct{}{}
   689  			}
   690  		}
   691  		// I really wish we just had a sync.Map of all MMs...
   692  		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
   693  			if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
   694  				return err
   695  			}
   696  		}
   697  	}
   698  	return nil
   699  }
   700  
    701  // LoadFrom restores the state of k from r.
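         //
         // A restore sketch mirroring SaveTo (passing nil for pagesMetadata and
         // pagesFile reads the memory file state from r as well):
         //
         //	err := k.LoadFrom(ctx, r, nil, nil, timeReady, netStack, clocks, &vfs.CompleteRestoreOptions{})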
   702  func (k *Kernel) LoadFrom(ctx context.Context, r io.Reader, pagesMetadata, pagesFile *fd.FD, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
   703  	loadStart := time.Now()
   704  
   705  	var (
   706  		mfLoadWg  sync.WaitGroup
   707  		mfLoadErr error
   708  	)
   709  	parallelMfLoad := pagesMetadata != nil && pagesFile != nil
   710  	if parallelMfLoad {
   711  		// Parallelize MemoryFile load and kernel load. Both are independent.
   712  		mfLoadWg.Add(1)
   713  		go func() {
   714  			defer mfLoadWg.Done()
   715  			mfLoadErr = k.loadMemoryFiles(ctx, r, pagesMetadata, pagesFile)
   716  		}()
   717  		// Defer a Wait() so we wait for k.loadMemoryFiles() to complete even if we
   718  		// error out without reaching the other Wait() below.
   719  		defer mfLoadWg.Wait()
   720  	}
   721  
   722  	k.runningTasksCond.L = &k.runningTasksMu
   723  	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
   724  	k.cpuClockTickerStopCond.L = &k.runningTasksMu
   725  
   726  	initAppCores := k.applicationCores
   727  
   728  	// Load the pre-saved CPUID FeatureSet.
   729  	//
   730  	// N.B. This was also saved along with the full kernel below, so we
   731  	// don't need to explicitly install it in the Kernel.
   732  	cpuidStart := time.Now()
   733  	if _, err := state.Load(ctx, r, &k.featureSet); err != nil {
   734  		return err
   735  	}
   736  	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
   737  
   738  	// Verify that the FeatureSet is usable on this host. We do this before
   739  	// Kernel load so that the explicit CPUID mismatch error has priority
   740  	// over floating point state restore errors that may occur on load on
   741  	// an incompatible machine.
   742  	if err := k.featureSet.CheckHostCompatible(); err != nil {
   743  		return err
   744  	}
   745  
   746  	// Load the kernel state.
   747  	kernelStart := time.Now()
   748  	stats, err := state.Load(ctx, r, k)
   749  	if err != nil {
   750  		return err
   751  	}
   752  	log.Infof("Kernel load stats: %s", stats.String())
   753  	log.Infof("Kernel load took [%s].", time.Since(kernelStart))
   754  
   755  	if parallelMfLoad {
   756  		mfLoadWg.Wait()
   757  	} else {
   758  		mfLoadErr = k.loadMemoryFiles(ctx, r, pagesMetadata, pagesFile)
   759  	}
   760  	if mfLoadErr != nil {
   761  		return mfLoadErr
   762  	}
   763  
   764  	// rootNetworkNamespace should be populated after loading the state file.
   765  	// Restore the root network stack.
   766  	k.rootNetworkNamespace.RestoreRootStack(net)
   767  
   768  	k.Timekeeper().SetClocks(clocks)
   769  
   770  	if timeReady != nil {
   771  		close(timeReady)
   772  	}
   773  
   774  	if net != nil {
   775  		net.Restore()
   776  	}
   777  
   778  	if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
   779  		return err
   780  	}
   781  
   782  	tcpip.AsyncLoading.Wait()
   783  
   784  	log.Infof("Overall load took [%s] after async work", time.Since(loadStart))
   785  
   786  	// Applications may size per-cpu structures based on k.applicationCores, so
   787  	// it can't change across save/restore. When we are virtualizing CPU
   788  	// numbers, this isn't a problem. However, when we are exposing host CPU
   789  	// assignments, we can't tolerate an increase in the number of host CPUs,
   790  	// which could result in getcpu(2) returning CPUs that applications expect
   791  	// not to exist.
   792  	if k.useHostCores && initAppCores > k.applicationCores {
   793  		return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
   794  	}
   795  
   796  	return nil
   797  }
   798  
   799  func (k *Kernel) loadMemoryFiles(ctx context.Context, r io.Reader, pagesMetadata, pagesFile *fd.FD) error {
   800  	// Load the memory files' state.
   801  	memoryStart := time.Now()
   802  	pmr := r
   803  	if pagesMetadata != nil {
   804  		pmr = pagesMetadata
   805  	}
   806  	var pr *statefile.AsyncReader
   807  	if pagesFile != nil {
   808  		pr = statefile.NewAsyncReader(pagesFile, 0 /* off */)
   809  	}
   810  	if err := k.mf.LoadFrom(ctx, pmr, pr); err != nil {
   811  		return err
   812  	}
   813  	if err := loadPrivateMFs(ctx, pmr, pr); err != nil {
   814  		return err
   815  	}
   816  	if pr != nil {
   817  		if err := pr.Close(); err != nil {
   818  			return err
   819  		}
   820  	}
   821  	log.Infof("Memory files load took [%s].", time.Since(memoryStart))
   822  	return nil
   823  }
   824  
   825  // UniqueID returns a unique identifier.
   826  func (k *Kernel) UniqueID() uint64 {
   827  	id := k.uniqueID.Add(1)
   828  	if id == 0 {
   829  		panic("unique identifier generator wrapped around")
   830  	}
   831  	return id
   832  }
   833  
   834  // CreateProcessArgs holds arguments to kernel.CreateProcess.
   835  type CreateProcessArgs struct {
   836  	// Filename is the filename to load as the init binary.
   837  	//
   838  	// If this is provided as "", File will be checked, then the file will be
   839  	// guessed via Argv[0].
   840  	Filename string
   841  
   842  	// File is a passed host FD pointing to a file to load as the init binary.
   843  	//
   844  	// This is checked if and only if Filename is "".
   845  	File *vfs.FileDescription
   846  
   847  	// Argv is a list of arguments.
   848  	Argv []string
   849  
   850  	// Envv is a list of environment variables.
   851  	Envv []string
   852  
   853  	// WorkingDirectory is the initial working directory.
   854  	//
   855  	// This defaults to the root if empty.
   856  	WorkingDirectory string
   857  
   858  	// Credentials is the initial credentials.
   859  	Credentials *auth.Credentials
   860  
   861  	// FDTable is the initial set of file descriptors. If CreateProcess succeeds,
   862  	// it takes a reference on FDTable.
   863  	FDTable *FDTable
   864  
   865  	// Umask is the initial umask.
   866  	Umask uint
   867  
   868  	// Limits are the initial resource limits.
   869  	Limits *limits.LimitSet
   870  
   871  	// MaxSymlinkTraversals is the maximum number of symlinks to follow
   872  	// during resolution.
   873  	MaxSymlinkTraversals uint
   874  
   875  	// UTSNamespace is the initial UTS namespace.
   876  	UTSNamespace *UTSNamespace
   877  
   878  	// IPCNamespace is the initial IPC namespace.
   879  	IPCNamespace *IPCNamespace
   880  
   881  	// PIDNamespace is the initial PID Namespace.
   882  	PIDNamespace *PIDNamespace
   883  
   884  	// MountNamespace optionally contains the mount namespace for this
   885  	// process. If nil, the init process's mount namespace is used.
   886  	//
   887  	// Anyone setting MountNamespace must donate a reference (i.e.
   888  	// increment it).
   889  	MountNamespace *vfs.MountNamespace
   890  
   891  	// ContainerID is the container that the process belongs to.
   892  	ContainerID string
   893  
   894  	// InitialCgroups are the cgroups the container is initialized to.
   895  	InitialCgroups map[Cgroup]struct{}
   896  
   897  	// Origin indicates how the task was first created.
   898  	Origin TaskOrigin
   899  }
   900  
   901  // NewContext returns a context.Context that represents the task that will be
    902  // created by k.CreateProcess(*args).
   903  func (args *CreateProcessArgs) NewContext(k *Kernel) context.Context {
   904  	return &createProcessContext{
   905  		Context: context.Background(),
   906  		kernel:  k,
   907  		args:    args,
   908  	}
   909  }
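
         // For example (sketch), code holding such a context can recover the Kernel
         // via the CtxKernel key handled in Value below:
         //
         //	if k, ok := ctx.Value(CtxKernel).(*Kernel); ok {
         //		_ = k.UniqueID()
         //	}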
   910  
   911  // createProcessContext is a context.Context that represents the context
   912  // associated with a task that is being created.
   913  type createProcessContext struct {
   914  	context.Context
   915  	kernel *Kernel
   916  	args   *CreateProcessArgs
   917  }
   918  
   919  // Value implements context.Context.Value.
   920  func (ctx *createProcessContext) Value(key any) any {
   921  	switch key {
   922  	case CtxKernel:
   923  		return ctx.kernel
   924  	case CtxPIDNamespace:
   925  		return ctx.args.PIDNamespace
   926  	case CtxUTSNamespace:
   927  		utsns := ctx.args.UTSNamespace
   928  		utsns.IncRef()
   929  		return utsns
   930  	case ipc.CtxIPCNamespace:
   931  		ipcns := ctx.args.IPCNamespace
   932  		ipcns.IncRef()
   933  		return ipcns
   934  	case auth.CtxCredentials:
   935  		return ctx.args.Credentials
   936  	case vfs.CtxRoot:
   937  		if ctx.args.MountNamespace == nil {
   938  			return nil
   939  		}
   940  		root := ctx.args.MountNamespace.Root(ctx)
   941  		return root
   942  	case vfs.CtxMountNamespace:
   943  		if ctx.kernel.globalInit == nil {
   944  			return nil
   945  		}
   946  		mntns := ctx.kernel.GlobalInit().Leader().MountNamespace()
   947  		mntns.IncRef()
   948  		return mntns
   949  	case devutil.CtxDevGoferClient:
   950  		return ctx.kernel.GetDevGoferClient(ctx.kernel.ContainerName(ctx.args.ContainerID))
   951  	case inet.CtxStack:
   952  		return ctx.kernel.RootNetworkNamespace().Stack()
   953  	case ktime.CtxRealtimeClock:
   954  		return ctx.kernel.RealtimeClock()
   955  	case limits.CtxLimits:
   956  		return ctx.args.Limits
   957  	case pgalloc.CtxMemoryCgroupID:
   958  		return ctx.getMemoryCgroupID()
   959  	case pgalloc.CtxMemoryFile:
   960  		return ctx.kernel.mf
   961  	case platform.CtxPlatform:
   962  		return ctx.kernel
   963  	case uniqueid.CtxGlobalUniqueID:
   964  		return ctx.kernel.UniqueID()
   965  	case uniqueid.CtxGlobalUniqueIDProvider:
   966  		return ctx.kernel
   967  	case uniqueid.CtxInotifyCookie:
   968  		return ctx.kernel.GenerateInotifyCookie()
   969  	case unimpl.CtxEvents:
   970  		return ctx.kernel
   971  	default:
   972  		return nil
   973  	}
   974  }
   975  
   976  func (ctx *createProcessContext) getMemoryCgroupID() uint32 {
   977  	for cg := range ctx.args.InitialCgroups {
   978  		for _, ctl := range cg.Controllers() {
   979  			if ctl.Type() == CgroupControllerMemory {
   980  				return cg.ID()
   981  			}
   982  		}
   983  	}
   984  	return InvalidCgroupID
   985  }
   986  
   987  // CreateProcess creates a new task in a new thread group with the given
   988  // options. The new task has no parent and is in the root PID namespace.
   989  //
   990  // If k.Start() has already been called, then the created process must be
   991  // started by calling kernel.StartProcess(tg).
   992  //
   993  // If k.Start() has not yet been called, then the created task will begin
   994  // running when k.Start() is called.
   995  //
   996  // CreateProcess has no analogue in Linux; it is used to create the initial
   997  // application task, as well as processes started by the control server.
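         //
         // A sketch of typical control-server usage (illustrative values; credential,
         // FDTable, and namespace setup elided):
         //
         //	tg, tid, err := k.CreateProcess(CreateProcessArgs{
         //		Filename:     "/bin/sh",
         //		Argv:         []string{"/bin/sh", "-c", "true"},
         //		Credentials:  creds,
         //		PIDNamespace: k.RootPIDNamespace(),
         //	})
         //	if err == nil {
         //		k.StartProcess(tg) // needed if k.Start() has already been called
         //		_ = tid
         //	}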
   998  func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
   999  	k.extMu.Lock()
  1000  	defer k.extMu.Unlock()
  1001  	log.Infof("EXEC: %v", args.Argv)
  1002  
  1003  	ctx := args.NewContext(k)
  1004  	mntns := args.MountNamespace
  1005  	if mntns == nil {
  1006  		if k.globalInit == nil {
  1007  			return nil, 0, fmt.Errorf("mount namespace is nil")
  1008  		}
  1009  		// Add a reference to the namespace, which is transferred to the new process.
  1010  		mntns = k.globalInit.Leader().MountNamespace()
  1011  		mntns.IncRef()
  1012  	}
  1013  	// Get the root directory from the MountNamespace.
  1014  	root := mntns.Root(ctx)
  1015  	defer root.DecRef(ctx)
  1016  
  1017  	// Grab the working directory.
  1018  	wd := root // Default.
  1019  	if args.WorkingDirectory != "" {
  1020  		pop := vfs.PathOperation{
  1021  			Root:               root,
  1022  			Start:              wd,
  1023  			Path:               fspath.Parse(args.WorkingDirectory),
  1024  			FollowFinalSymlink: true,
  1025  		}
  1026  		// NOTE(b/236028361): Do not set CheckSearchable flag to true.
   1027  		// The application is allowed to start with a working directory that it
   1028  		// cannot access/search. This is consistent with Docker and VFS1. Runc
  1029  		// explicitly allows for this in 6ce2d63a5db6 ("libct/init_linux: retry
  1030  		// chdir to fix EPERM"). As described in the commit, runc unintentionally
  1031  		// allowed this behavior in a couple of releases and applications started
  1032  		// relying on it. So they decided to allow it for backward compatibility.
  1033  		var err error
  1034  		wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{})
  1035  		if err != nil {
  1036  			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
  1037  		}
  1038  		defer wd.DecRef(ctx)
  1039  	}
  1040  	fsContext := NewFSContext(root, wd, args.Umask)
  1041  
  1042  	tg := k.NewThreadGroup(args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
  1043  	cu := cleanup.Make(func() {
  1044  		tg.Release(ctx)
  1045  	})
  1046  	defer cu.Clean()
  1047  
  1048  	// Check which file to start from.
  1049  	switch {
  1050  	case args.Filename != "":
  1051  		// If a filename is given, take that.
  1052  		// Set File to nil so we resolve the path in LoadTaskImage.
  1053  		args.File = nil
  1054  	case args.File != nil:
  1055  		// If File is set, take the File provided directly.
  1056  		args.Filename = args.File.MappedName(ctx)
  1057  	default:
  1058  		// Otherwise look at Argv and see if the first argument is a valid path.
  1059  		if len(args.Argv) == 0 {
  1060  			return nil, 0, fmt.Errorf("no filename or command provided")
  1061  		}
  1062  		if !filepath.IsAbs(args.Argv[0]) {
  1063  			return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
  1064  		}
  1065  		args.Filename = args.Argv[0]
  1066  	}
  1067  
  1068  	// Create a fresh task context.
  1069  	remainingTraversals := args.MaxSymlinkTraversals
  1070  	loadArgs := loader.LoadArgs{
  1071  		Root:                root,
  1072  		WorkingDir:          wd,
  1073  		RemainingTraversals: &remainingTraversals,
  1074  		ResolveFinal:        true,
  1075  		Filename:            args.Filename,
  1076  		File:                args.File,
  1077  		CloseOnExec:         false,
  1078  		Argv:                args.Argv,
  1079  		Envv:                args.Envv,
  1080  		Features:            k.featureSet,
  1081  	}
  1082  
  1083  	image, se := k.LoadTaskImage(ctx, loadArgs)
  1084  	if se != nil {
  1085  		return nil, 0, errors.New(se.String())
  1086  	}
  1087  	var capData auth.VfsCapData
  1088  	if len(image.FileCaps()) != 0 {
  1089  		var err error
  1090  		capData, err = auth.VfsCapDataOf([]byte(image.FileCaps()))
  1091  		if err != nil {
  1092  			return nil, 0, err
  1093  		}
  1094  	}
  1095  	creds, err := auth.CapsFromVfsCaps(capData, args.Credentials)
  1096  	if err != nil {
  1097  		return nil, 0, err
  1098  	}
  1099  	args.FDTable.IncRef()
  1100  
  1101  	// Create the task.
  1102  	config := &TaskConfig{
  1103  		Kernel:           k,
  1104  		ThreadGroup:      tg,
  1105  		TaskImage:        image,
  1106  		FSContext:        fsContext,
  1107  		FDTable:          args.FDTable,
  1108  		Credentials:      creds,
  1109  		NetworkNamespace: k.RootNetworkNamespace(),
  1110  		AllowedCPUMask:   sched.NewFullCPUSet(k.applicationCores),
  1111  		UTSNamespace:     args.UTSNamespace,
  1112  		IPCNamespace:     args.IPCNamespace,
  1113  		MountNamespace:   mntns,
  1114  		ContainerID:      args.ContainerID,
  1115  		InitialCgroups:   args.InitialCgroups,
  1116  		UserCounters:     k.GetUserCounters(args.Credentials.RealKUID),
  1117  		Origin:           args.Origin,
  1118  		// A task with no parent starts out with no session keyring.
  1119  		SessionKeyring: nil,
  1120  	}
  1121  	config.UTSNamespace.IncRef()
  1122  	config.IPCNamespace.IncRef()
  1123  	config.NetworkNamespace.IncRef()
  1124  	t, err := k.tasks.NewTask(ctx, config)
  1125  	if err != nil {
  1126  		return nil, 0, err
  1127  	}
  1128  	t.traceExecEvent(image) // Simulate exec for tracing.
  1129  
  1130  	// Success.
  1131  	cu.Release()
  1132  	tgid := k.tasks.Root.IDOfThreadGroup(tg)
  1133  	if k.globalInit == nil {
  1134  		k.globalInit = tg
  1135  	}
  1136  	return tg, tgid, nil
  1137  }
  1138  
  1139  // StartProcess starts running a process that was created with CreateProcess.
  1140  func (k *Kernel) StartProcess(tg *ThreadGroup) {
  1141  	t := tg.Leader()
  1142  	tid := k.tasks.Root.IDOfTask(t)
  1143  	t.Start(tid)
  1144  }
  1145  
  1146  // Start starts execution of all tasks in k.
  1147  //
  1148  // Preconditions: Start may be called exactly once.
  1149  func (k *Kernel) Start() error {
  1150  	k.extMu.Lock()
  1151  	defer k.extMu.Unlock()
  1152  
  1153  	if k.started {
  1154  		return fmt.Errorf("kernel already started")
  1155  	}
  1156  
  1157  	k.started = true
  1158  	k.cpuClockTickTimer = time.NewTimer(linux.ClockTick)
  1159  	k.runningTasksMu.Lock()
  1160  	k.cpuClockTickerRunning = true
  1161  	k.runningTasksMu.Unlock()
  1162  	go k.runCPUClockTicker()
   1163  	// If k was restored by LoadFrom, timers were stopped during
   1164  	// Kernel.SaveTo and need to be resumed. If k was freshly initialized by
   1165  	// Init, this is a no-op.
  1166  	k.resumeTimeLocked(k.SupervisorContext())
  1167  	k.tasks.mu.RLock()
  1168  	ts := make([]*Task, 0, len(k.tasks.Root.tids))
  1169  	for t := range k.tasks.Root.tids {
  1170  		ts = append(ts, t)
  1171  	}
  1172  	k.tasks.mu.RUnlock()
  1173  	// Start task goroutines.
  1174  	// NOTE(b/235349091): We don't actually need the TaskSet mutex, we just
  1175  	// need to make sure we only call t.Start() once for each task. Holding the
  1176  	// mutex for each task start may cause a nested locking error.
  1177  	for _, t := range ts {
  1178  		t.Start(t.ThreadID())
  1179  	}
  1180  	return nil
  1181  }
  1182  
  1183  // pauseTimeLocked pauses all Timers and Timekeeper updates.
  1184  //
  1185  // Preconditions:
  1186  //   - Any task goroutines running in k must be stopped.
  1187  //   - k.extMu must be locked.
  1188  func (k *Kernel) pauseTimeLocked(ctx context.Context) {
  1189  	// Since all task goroutines have been stopped by precondition, the CPU clock
  1190  	// ticker should stop on its own; wait for it to do so, waking it up from
  1191  	// sleeping between ticks if necessary.
  1192  	k.runningTasksMu.Lock()
  1193  	for k.cpuClockTickerRunning {
  1194  		select {
  1195  		case k.cpuClockTickerWakeCh <- struct{}{}:
  1196  		default:
  1197  		}
  1198  		k.cpuClockTickerStopCond.Wait()
  1199  	}
  1200  	k.runningTasksMu.Unlock()
  1201  
  1202  	// By precondition, nothing else can be interacting with PIDNamespace.tids
  1203  	// or FDTable.files, so we can iterate them without synchronization. (We
  1204  	// can't hold the TaskSet mutex when pausing thread group timers because
  1205  	// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
  1206  	// mutex, while holding the Timer mutex.)
  1207  	for t := range k.tasks.Root.tids {
  1208  		if t == t.tg.leader {
  1209  			t.tg.itimerRealTimer.Pause()
  1210  			for _, it := range t.tg.timers {
  1211  				it.PauseTimer()
  1212  			}
  1213  		}
  1214  		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
  1215  		// but ktime.Timer.Pause is idempotent so this is harmless.
  1216  		if t.fdTable != nil {
  1217  			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
  1218  				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
  1219  					tfd.PauseTimer()
  1220  				}
  1221  			})
  1222  		}
  1223  	}
  1224  	k.timekeeper.PauseUpdates()
  1225  }
  1226  
  1227  // resumeTimeLocked resumes all Timers and Timekeeper updates. If
  1228  // pauseTimeLocked has not been previously called, resumeTimeLocked has no
  1229  // effect.
  1230  //
  1231  // Preconditions:
  1232  //   - Any task goroutines running in k must be stopped.
  1233  //   - k.extMu must be locked.
  1234  func (k *Kernel) resumeTimeLocked(ctx context.Context) {
  1235  	// The CPU clock ticker will automatically resume as task goroutines resume
  1236  	// execution.
  1237  
  1238  	k.timekeeper.ResumeUpdates()
  1239  	for t := range k.tasks.Root.tids {
  1240  		if t == t.tg.leader {
  1241  			t.tg.itimerRealTimer.Resume()
  1242  			for _, it := range t.tg.timers {
  1243  				it.ResumeTimer()
  1244  			}
  1245  		}
  1246  		if t.fdTable != nil {
  1247  			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
  1248  				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
  1249  					tfd.ResumeTimer()
  1250  				}
  1251  			})
  1252  		}
  1253  	}
  1254  }
  1255  
  1256  func (k *Kernel) incRunningTasks() {
  1257  	for {
  1258  		tasks := k.runningTasks.Load()
  1259  		if tasks != 0 {
  1260  			// Standard case. Simply increment.
  1261  			if !k.runningTasks.CompareAndSwap(tasks, tasks+1) {
  1262  				continue
  1263  			}
  1264  			return
  1265  		}
  1266  
  1267  		// Transition from 0 -> 1.
  1268  		k.runningTasksMu.Lock()
  1269  		if k.runningTasks.Load() != 0 {
  1270  			// Raced with another transition and lost.
  1271  			k.runningTasks.Add(1)
  1272  			k.runningTasksMu.Unlock()
  1273  			return
  1274  		}
  1275  		if !k.cpuClockTickerRunning {
  1276  			select {
  1277  			case tickTime := <-k.cpuClockTickTimer.C:
  1278  				// Rearm the timer since we consumed the wakeup. Estimate how much time
  1279  				// remains on the current tick so that periodic workloads interact with
  1280  				// the (periodic) CPU clock ticker in the same way that they would
  1281  				// without the optimization of putting the ticker to sleep.
  1282  				missedNS := time.Since(tickTime).Nanoseconds()
  1283  				missedTicks := missedNS / linux.ClockTick.Nanoseconds()
  1284  				thisTickNS := missedNS - missedTicks*linux.ClockTick.Nanoseconds()
  1285  				k.cpuClockTickTimer.Reset(time.Duration(linux.ClockTick.Nanoseconds() - thisTickNS))
  1286  				// Increment k.cpuClock on the CPU clock ticker goroutine's behalf.
  1287  				// (Whole missed ticks don't matter, and adding them to k.cpuClock will
  1288  				// just confuse the watchdog.) At the time the tick occurred, all task
  1289  				// goroutines were asleep, so there's nothing else to do. This ensures
  1290  				// that our caller (Task.accountTaskGoroutineLeave()) records an
  1291  				// updated k.cpuClock in Task.gosched.Timestamp, so that it's correctly
  1292  				// accounted as having resumed execution in the sentry during this tick
  1293  				// instead of at the end of the previous one.
  1294  				k.cpuClock.Add(1)
  1295  			default:
  1296  			}
  1297  			// We are transitioning from idle to active. Set k.cpuClockTickerRunning
  1298  			// = true here so that if we transition to idle and then active again
  1299  			// before the CPU clock ticker goroutine has a chance to run, the first
  1300  			// call to k.incRunningTasks() at the end of that cycle does not try to
  1301  			// steal k.cpuClockTickTimer.C again, as this would allow workloads that
  1302  			// rapidly cycle between idle and active to starve the CPU clock ticker
  1303  			// of chances to observe task goroutines in a running state and account
  1304  			// their CPU usage.
  1305  			k.cpuClockTickerRunning = true
  1306  			k.runningTasksCond.Signal()
  1307  		}
  1308  		// This store must happen after the increment of k.cpuClock above to ensure
  1309  		// that concurrent calls to Task.accountTaskGoroutineLeave() also observe
  1310  		// the updated k.cpuClock.
  1311  		k.runningTasks.Store(1)
  1312  		k.runningTasksMu.Unlock()
  1313  		return
  1314  	}
  1315  }
  1316  
  1317  func (k *Kernel) decRunningTasks() {
  1318  	tasks := k.runningTasks.Add(-1)
  1319  	if tasks < 0 {
  1320  		panic(fmt.Sprintf("Invalid running count %d", tasks))
  1321  	}
  1322  
  1323  	// Nothing to do. The next CPU clock tick will disable the timer if
  1324  	// there is still nothing running. This provides approximately one tick
  1325  	// of slack in which we can switch back and forth between idle and
  1326  	// active without an expensive transition.
  1327  }
  1328  
  1329  // WaitExited blocks until all tasks in k have exited.
  1330  func (k *Kernel) WaitExited() {
  1331  	k.tasks.liveGoroutines.Wait()
  1332  }
  1333  
  1334  // Kill requests that all tasks in k immediately exit as if group exiting with
  1335  // status ws. Kill does not wait for tasks to exit.
  1336  func (k *Kernel) Kill(ws linux.WaitStatus) {
  1337  	k.extMu.Lock()
  1338  	defer k.extMu.Unlock()
  1339  	k.tasks.Kill(ws)
  1340  }
  1341  
  1342  // Pause requests that all tasks in k temporarily stop executing, and blocks
  1343  // until all tasks and asynchronous I/O operations in k have stopped. Multiple
  1344  // calls to Pause nest and require an equal number of calls to Unpause to
  1345  // resume execution.
  1346  func (k *Kernel) Pause() {
  1347  	k.extMu.Lock()
  1348  	k.tasks.BeginExternalStop()
  1349  	k.extMu.Unlock()
  1350  	k.tasks.runningGoroutines.Wait()
  1351  	k.tasks.aioGoroutines.Wait()
  1352  }
  1353  
  1354  // ReceiveTaskStates receives full states for all tasks.
  1355  func (k *Kernel) ReceiveTaskStates() {
  1356  	k.extMu.Lock()
  1357  	k.tasks.PullFullState()
  1358  	k.extMu.Unlock()
  1359  }
  1360  
  1361  // Unpause ends the effect of a previous call to Pause. If Unpause is called
  1362  // without a matching preceding call to Pause, Unpause may panic.
  1363  func (k *Kernel) Unpause() {
  1364  	k.extMu.Lock()
  1365  	defer k.extMu.Unlock()
  1366  	k.tasks.EndExternalStop()
  1367  }
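
         // Pause and Unpause nest; in a sketch like the following, tasks resume only
         // at the final Unpause:
         //
         //	k.Pause()
         //	k.Pause() // nested; tasks remain stopped
         //	k.Unpause()
         //	k.Unpause() // tasks resume here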
  1368  
  1369  // SendExternalSignal injects a signal into the kernel.
  1370  //
  1371  // context is used only for debugging to describe how the signal was received.
  1372  //
  1373  // Preconditions: Kernel must have an init process.
  1374  func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) {
  1375  	k.extMu.Lock()
  1376  	defer k.extMu.Unlock()
  1377  	k.sendExternalSignal(info, context)
  1378  }
  1379  
  1380  // SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup.
  1381  //
  1382  // This function doesn't skip signals like SendExternalSignal does.
  1383  func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error {
  1384  	k.extMu.Lock()
  1385  	defer k.extMu.Unlock()
  1386  	return tg.SendSignal(info)
  1387  }
  1388  
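        // Illustrative sketch (not part of the original source): a signal is
        // described by a linux.SignalInfo; a minimal SIGTERM aimed at a
        // specific thread group might be built as follows (tg is an assumed
        // *ThreadGroup obtained elsewhere):
        //
        //	info := &linux.SignalInfo{Signo: int32(linux.SIGTERM)}
        //	if err := k.SendExternalSignalThreadGroup(tg, info); err != nil {
        //		log.Warningf("failed to signal thread group: %v", err)
        //	}
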
  1389  // SendExternalSignalProcessGroup sends a signal to all ThreadGroups in the
  1390  // given process group.
  1391  //
  1392  // This function doesn't skip signals like SendExternalSignal does.
  1393  func (k *Kernel) SendExternalSignalProcessGroup(pg *ProcessGroup, info *linux.SignalInfo) error {
  1394  	k.extMu.Lock()
  1395  	defer k.extMu.Unlock()
  1396  	// If anything goes wrong, we'll return the first error, but still try
  1397  	// our best to deliver to the other processes in the group.
  1398  	var firstErr error
  1399  	for _, tg := range k.TaskSet().Root.ThreadGroups() {
  1400  		if tg.ProcessGroup() != pg {
  1401  			continue
  1402  		}
  1403  		if err := tg.SendSignal(info); err != nil && firstErr == nil {
  1404  			firstErr = err
  1405  		}
  1406  	}
  1407  	return firstErr
  1408  }
  1409  
  1410  // SendContainerSignal sends the given signal to all processes in the root PID
  1411  // namespace that match the given container ID.
  1412  func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error {
  1413  	k.extMu.Lock()
  1414  	defer k.extMu.Unlock()
  1415  	k.tasks.mu.RLock()
  1416  	defer k.tasks.mu.RUnlock()
  1417  
  1418  	var lastErr error
  1419  	for tg := range k.tasks.Root.tgids {
  1420  		if tg.leader.ContainerID() == cid {
  1421  			tg.signalHandlers.mu.Lock()
  1422  			infoCopy := *info
  1423  			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
  1424  				lastErr = err
  1425  			}
  1426  			tg.signalHandlers.mu.Unlock()
  1427  		}
  1428  	}
  1429  	return lastErr
  1430  }
  1431  
  1432  // RebuildTraceContexts rebuilds the trace context for all tasks.
  1433  //
  1434  // Unfortunately, if these are built while tracing is not enabled, then we will
  1435  // not have meaningful trace data. Rebuilding here ensures that meaningful
  1436  // trace data is available once tracing has been enabled.
  1437  func (k *Kernel) RebuildTraceContexts() {
  1438  	// We need to pause all task goroutines because Task.rebuildTraceContext()
  1439  	// replaces Task.traceContext and Task.traceTask, which are
  1440  	// task-goroutine-exclusive (i.e. the task goroutine assumes that it can
  1441  	// access them without synchronization) for performance.
  1442  	k.Pause()
  1443  	defer k.Unpause()
  1444  
  1445  	k.extMu.Lock()
  1446  	defer k.extMu.Unlock()
  1447  	k.tasks.mu.RLock()
  1448  	defer k.tasks.mu.RUnlock()
  1449  
  1450  	for t, tid := range k.tasks.Root.tids {
  1451  		t.rebuildTraceContext(tid)
  1452  	}
  1453  }
  1454  
  1455  // FeatureSet returns the FeatureSet.
  1456  func (k *Kernel) FeatureSet() cpuid.FeatureSet {
  1457  	return k.featureSet
  1458  }
  1459  
  1460  // Timekeeper returns the Timekeeper.
  1461  func (k *Kernel) Timekeeper() *Timekeeper {
  1462  	return k.timekeeper
  1463  }
  1464  
  1465  // TaskSet returns the TaskSet.
  1466  func (k *Kernel) TaskSet() *TaskSet {
  1467  	return k.tasks
  1468  }
  1469  
  1470  // RootUserNamespace returns the root UserNamespace.
  1471  func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
  1472  	return k.rootUserNamespace
  1473  }
  1474  
  1475  // RootUTSNamespace returns the root UTSNamespace.
  1476  func (k *Kernel) RootUTSNamespace() *UTSNamespace {
  1477  	return k.rootUTSNamespace
  1478  }
  1479  
  1480  // RootIPCNamespace returns the root IPCNamespace.
  1481  func (k *Kernel) RootIPCNamespace() *IPCNamespace {
  1482  	return k.rootIPCNamespace
  1483  }
  1484  
  1485  // RootPIDNamespace returns the root PIDNamespace.
  1486  func (k *Kernel) RootPIDNamespace() *PIDNamespace {
  1487  	return k.tasks.Root
  1488  }
  1489  
  1490  // RootNetworkNamespace returns the root network namespace, always non-nil.
  1491  func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
  1492  	return k.rootNetworkNamespace
  1493  }
  1494  
  1495  // GlobalInit returns the thread group with ID 1 in the root PID namespace, or
  1496  // nil if no such thread group exists. GlobalInit may return a thread group
  1497  // containing no tasks if the thread group has already exited.
  1498  func (k *Kernel) GlobalInit() *ThreadGroup {
  1499  	k.extMu.Lock()
  1500  	defer k.extMu.Unlock()
  1501  	return k.globalInit
  1502  }
  1503  
  1504  // TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace.
  1505  func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) {
  1506  	k.globalInit = tg
  1507  }
  1508  
  1509  // ApplicationCores returns the number of CPUs visible to sandboxed
  1510  // applications.
  1511  func (k *Kernel) ApplicationCores() uint {
  1512  	return k.applicationCores
  1513  }
  1514  
  1515  // RealtimeClock returns the application CLOCK_REALTIME clock.
  1516  func (k *Kernel) RealtimeClock() ktime.Clock {
  1517  	return k.timekeeper.realtimeClock
  1518  }
  1519  
  1520  // MonotonicClock returns the application CLOCK_MONOTONIC clock.
  1521  func (k *Kernel) MonotonicClock() ktime.Clock {
  1522  	return k.timekeeper.monotonicClock
  1523  }
  1524  
  1525  // CPUClockNow returns the current value of k.cpuClock.
  1526  func (k *Kernel) CPUClockNow() uint64 {
  1527  	return k.cpuClock.Load()
  1528  }
  1529  
  1530  // Syslog returns the syslog.
  1531  func (k *Kernel) Syslog() *syslog {
  1532  	return &k.syslog
  1533  }
  1534  
  1535  // GenerateInotifyCookie generates a unique inotify event cookie.
  1536  //
  1537  // Returned values may overlap with previously returned values if the value
  1538  // space is exhausted. 0 is not a valid cookie value; all other values
  1539  // representable in a uint32 are allowed.
  1540  func (k *Kernel) GenerateInotifyCookie() uint32 {
  1541  	id := k.nextInotifyCookie.Add(1)
  1542  	// Wrap-around is explicitly allowed for inotify event cookies.
  1543  	if id == 0 {
  1544  		id = k.nextInotifyCookie.Add(1)
  1545  	}
  1546  	return id
  1547  }
  1548  
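        // Illustrative note (not part of the original source): on wrap-around
        // the counter passes through 0, which the check above skips, so the
        // returned sequence near the wrap point is
        // ..., 0xfffffffe, 0xffffffff, 1, 2, ...
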
  1549  // NetlinkPorts returns the netlink port manager.
  1550  func (k *Kernel) NetlinkPorts() *port.Manager {
  1551  	return k.netlinkPorts
  1552  }
  1553  
  1554  var (
  1555  	errSaved     = errors.New("sandbox has been successfully saved")
  1556  	errAutoSaved = errors.New("sandbox has been successfully auto-saved")
  1557  )
  1558  
  1559  // SaveStatus returns the sandbox save status. If it was saved successfully,
  1560  // autosaved indicates whether save was triggered by autosave. If it was not
  1561  // saved successfully, err indicates the sandbox error that caused the kernel to
  1562  // exit during save.
  1563  func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
  1564  	k.extMu.Lock()
  1565  	defer k.extMu.Unlock()
  1566  	switch k.saveStatus {
  1567  	case nil:
  1568  		return false, false, nil
  1569  	case errSaved:
  1570  		return true, false, nil
  1571  	case errAutoSaved:
  1572  		return true, true, nil
  1573  	default:
  1574  		return false, false, k.saveStatus
  1575  	}
  1576  }
  1577  
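        // Illustrative sketch (not part of the original source): a hypothetical
        // caller decoding the three possible outcomes:
        //
        //	if saved, autosaved, err := k.SaveStatus(); err != nil {
        //		log.Warningf("sandbox exited during save: %v", err)
        //	} else if saved {
        //		log.Infof("sandbox saved (autosave=%t)", autosaved)
        //	}
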
  1578  // SetSaveSuccess sets the flag indicating that save completed successfully, if
  1579  // no status was already set.
  1580  func (k *Kernel) SetSaveSuccess(autosave bool) {
  1581  	k.extMu.Lock()
  1582  	defer k.extMu.Unlock()
  1583  	if k.saveStatus == nil {
  1584  		if autosave {
  1585  			k.saveStatus = errAutoSaved
  1586  		} else {
  1587  			k.saveStatus = errSaved
  1588  		}
  1589  	}
  1590  }
  1591  
  1592  // SetSaveError sets the sandbox error that caused the kernel to exit during
  1593  // save, if one is not already set.
  1594  func (k *Kernel) SetSaveError(err error) {
  1595  	k.extMu.Lock()
  1596  	defer k.extMu.Unlock()
  1597  	if k.saveStatus == nil {
  1598  		k.saveStatus = err
  1599  	}
  1600  }
  1601  
  1602  // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
  1603  // LoadFrom.
  1604  func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
  1605  	k.mf = mf
  1606  }
  1607  
  1608  // MemoryFile returns the MemoryFile that provides application memory.
  1609  func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
  1610  	return k.mf
  1611  }
  1612  
  1613  // SupervisorContext returns a Context with maximum privileges in k. It should
  1614  // only be used by goroutines outside the control of the emulated kernel
  1615  // defined by k.
  1616  //
  1617  // Callers are responsible for ensuring that the returned Context is not used
  1618  // concurrently with changes to the Kernel.
  1619  func (k *Kernel) SupervisorContext() context.Context {
  1620  	return &supervisorContext{
  1621  		Kernel: k,
  1622  		Logger: log.Log(),
  1623  	}
  1624  }
  1625  
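        // Illustrative sketch (not part of the original source): control-plane
        // goroutines pass the supervisor context into APIs that require a
        // context; per the Value cases below, it resolves to global root
        // credentials:
        //
        //	ctx := k.SupervisorContext()
        //	creds := auth.CredentialsFromContext(ctx) // global root
        //	_ = creds // hypothetical use elsewhere
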
  1626  // SocketRecord represents a socket recorded in Kernel.sockets.
  1627  //
  1628  // +stateify savable
  1629  type SocketRecord struct {
  1630  	k    *Kernel
  1631  	Sock *vfs.FileDescription
  1632  	ID   uint64 // Socket table entry number.
  1633  }
  1634  
  1635  // RecordSocket adds a socket to the system-wide socket table for
  1636  // tracking.
  1637  //
  1638  // Precondition: Caller must hold a reference to sock.
  1639  //
  1640  // Note that the socket table will not hold a reference on the
  1641  // vfs.FileDescription.
  1642  func (k *Kernel) RecordSocket(sock *vfs.FileDescription) {
  1643  	k.extMu.Lock()
  1644  	if _, ok := k.sockets[sock]; ok {
  1645  		panic(fmt.Sprintf("Socket %p added twice", sock))
  1646  	}
  1647  	id := k.nextSocketRecord
  1648  	k.nextSocketRecord++
  1649  	s := &SocketRecord{
  1650  		k:    k,
  1651  		ID:   id,
  1652  		Sock: sock,
  1653  	}
  1654  	k.sockets[sock] = s
  1655  	k.extMu.Unlock()
  1656  }
  1657  
  1658  // DeleteSocket removes a socket from the system-wide socket table.
  1659  func (k *Kernel) DeleteSocket(sock *vfs.FileDescription) {
  1660  	k.extMu.Lock()
  1661  	delete(k.sockets, sock)
  1662  	k.extMu.Unlock()
  1663  }
  1664  
  1665  // ListSockets returns a snapshot of all sockets.
  1666  //
  1667  // Callers of ListSockets() should use SocketRecord.Sock.TryIncRef()
  1668  // to get a reference on a socket in the table.
  1669  func (k *Kernel) ListSockets() []*SocketRecord {
  1670  	k.extMu.Lock()
  1671  	var socks []*SocketRecord
  1672  	for _, s := range k.sockets {
  1673  		socks = append(socks, s)
  1674  	}
  1675  	k.extMu.Unlock()
  1676  	return socks
  1677  }
  1678  
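        // Illustrative sketch (not part of the original source): since the
        // table holds no references, a hypothetical consumer must TryIncRef
        // each socket and skip entries that are concurrently being destroyed
        // (ctx is an assumed context.Context):
        //
        //	for _, sr := range k.ListSockets() {
        //		if !sr.Sock.TryIncRef() {
        //			continue // socket is being released; skip it
        //		}
        //		// ... inspect sr.Sock ...
        //		sr.Sock.DecRef(ctx)
        //	}
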
  1679  // supervisorContext is a privileged context.
  1680  type supervisorContext struct {
  1681  	context.NoTask
  1682  	log.Logger
  1683  	*Kernel
  1684  }
  1685  
  1686  // Deadline implements context.Context.Deadline.
  1687  func (*Kernel) Deadline() (time.Time, bool) {
  1688  	return time.Time{}, false
  1689  }
  1690  
  1691  // Done implements context.Context.Done.
  1692  func (*Kernel) Done() <-chan struct{} {
  1693  	return nil
  1694  }
  1695  
  1696  // Err implements context.Context.Err.
  1697  func (*Kernel) Err() error {
  1698  	return nil
  1699  }
  1700  
  1701  // Value implements context.Context.Value.
  1702  func (ctx *supervisorContext) Value(key any) any {
  1703  	switch key {
  1704  	case CtxCanTrace:
  1705  		// The supervisor context can trace anything. (None of
  1706  		// supervisorContext's users are expected to invoke ptrace, but ptrace
  1707  		// permissions are required for certain file accesses.)
  1708  		return func(*Task, bool) bool { return true }
  1709  	case CtxKernel:
  1710  		return ctx.Kernel
  1711  	case CtxPIDNamespace:
  1712  		return ctx.Kernel.tasks.Root
  1713  	case CtxUTSNamespace:
  1714  		utsns := ctx.Kernel.rootUTSNamespace
  1715  		utsns.IncRef()
  1716  		return utsns
  1717  	case ipc.CtxIPCNamespace:
  1718  		ipcns := ctx.Kernel.rootIPCNamespace
  1719  		ipcns.IncRef()
  1720  		return ipcns
  1721  	case auth.CtxCredentials:
  1722  		// The supervisor context is global root.
  1723  		return auth.NewRootCredentials(ctx.Kernel.rootUserNamespace)
  1724  	case vfs.CtxRoot:
  1725  		if ctx.Kernel.globalInit == nil || ctx.Kernel.globalInit.Leader() == nil {
  1726  			return vfs.VirtualDentry{}
  1727  		}
  1728  		root := ctx.Kernel.GlobalInit().Leader().MountNamespace().Root(ctx)
  1729  		return root
  1730  	case vfs.CtxMountNamespace:
  1731  		if ctx.Kernel.globalInit == nil || ctx.Kernel.globalInit.Leader() == nil {
  1732  			return nil
  1733  		}
  1734  		mntns := ctx.Kernel.GlobalInit().Leader().MountNamespace()
  1735  		mntns.IncRef()
  1736  		return mntns
  1737  	case inet.CtxStack:
  1738  		return ctx.Kernel.RootNetworkNamespace().Stack()
  1739  	case ktime.CtxRealtimeClock:
  1740  		return ctx.Kernel.RealtimeClock()
  1741  	case limits.CtxLimits:
  1742  		// No limits apply.
  1743  		return limits.NewLimitSet()
  1744  	case pgalloc.CtxMemoryFile:
  1745  		return ctx.Kernel.mf
  1746  	case platform.CtxPlatform:
  1747  		return ctx.Kernel
  1748  	case uniqueid.CtxGlobalUniqueID:
  1749  		return ctx.Kernel.UniqueID()
  1750  	case uniqueid.CtxGlobalUniqueIDProvider:
  1751  		return ctx.Kernel
  1752  	case uniqueid.CtxInotifyCookie:
  1753  		return ctx.Kernel.GenerateInotifyCookie()
  1754  	case unimpl.CtxEvents:
  1755  		return ctx.Kernel
  1756  	case cpuid.CtxFeatureSet:
  1757  		return ctx.Kernel.featureSet
  1758  	default:
  1759  		return nil
  1760  	}
  1761  }
  1762  
  1763  // Rate limits for the number of unimplemented syscall events.
  1764  const (
  1765  	unimplementedSyscallsMaxRate = 100  // events per second
  1766  	unimplementedSyscallBurst    = 1000 // events
  1767  )
  1768  
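        // Illustrative note (not part of the original source): with these
        // values, a sudden flood of events can emit up to 1000 immediately
        // (the burst); sustained emission is then throttled to 100 per second,
        // so refilling a fully drained burst allowance takes 1000/100 = 10
        // seconds.
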
  1769  // EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
  1770  // channel.
  1771  func (k *Kernel) EmitUnimplementedEvent(ctx context.Context, sysno uintptr) {
  1772  	k.unimplementedSyscallEmitterOnce.Do(func() {
  1773  		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
  1774  	})
  1775  
  1776  	t := TaskFromContext(ctx)
  1777  	IncrementUnimplementedSyscallCounter(sysno)
  1778  	_, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
  1779  		Tid:       int32(t.ThreadID()),
  1780  		Registers: t.Arch().StateData().Proto(),
  1781  	})
  1782  }
  1783  
  1784  // VFS returns the virtual filesystem for the kernel.
  1785  func (k *Kernel) VFS() *vfs.VirtualFilesystem {
  1786  	return &k.vfs
  1787  }
  1788  
  1789  // SetHostMount sets the hostfs mount.
  1790  func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
  1791  	if k.hostMount != nil {
  1792  		panic("Kernel.hostMount cannot be set more than once")
  1793  	}
  1794  	k.hostMount = mnt
  1795  }
  1796  
  1797  // HostMount returns the hostfs mount.
  1798  func (k *Kernel) HostMount() *vfs.Mount {
  1799  	return k.hostMount
  1800  }
  1801  
  1802  // PipeMount returns the pipefs mount.
  1803  func (k *Kernel) PipeMount() *vfs.Mount {
  1804  	return k.pipeMount
  1805  }
  1806  
  1807  // GetNamespaceInode returns a new nsfs inode which serves as a reference counter for the namespace.
  1808  func (k *Kernel) GetNamespaceInode(ctx context.Context, ns vfs.Namespace) refs.TryRefCounter {
  1809  	return nsfs.NewInode(ctx, k.nsfsMount, ns)
  1810  }
  1811  
  1812  // ShmMount returns the tmpfs mount.
  1813  func (k *Kernel) ShmMount() *vfs.Mount {
  1814  	return k.shmMount
  1815  }
  1816  
  1817  // SocketMount returns the sockfs mount.
  1818  func (k *Kernel) SocketMount() *vfs.Mount {
  1819  	return k.socketMount
  1820  }
  1821  
  1822  // CgroupRegistry returns the cgroup registry.
  1823  func (k *Kernel) CgroupRegistry() *CgroupRegistry {
  1824  	return k.cgroupRegistry
  1825  }
  1826  
  1827  // AddCgroupMount adds the cgroup mounts to the cgroupMountsMap. These cgroup
  1828  // mounts are created during the creation of the root container process, and
  1829  // reference ownership is transferred to the kernel.
  1830  func (k *Kernel) AddCgroupMount(ctl string, mnt *CgroupMount) {
  1831  	k.cgroupMountsMapMu.Lock()
  1832  	defer k.cgroupMountsMapMu.Unlock()
  1833  
  1834  	if k.cgroupMountsMap == nil {
  1835  		k.cgroupMountsMap = make(map[string]*CgroupMount)
  1836  	}
  1837  	k.cgroupMountsMap[ctl] = mnt
  1838  }
  1839  
  1840  // GetCgroupMount returns the cgroup mount for the given cgroup controller.
  1841  func (k *Kernel) GetCgroupMount(ctl string) *CgroupMount {
  1842  	k.cgroupMountsMapMu.Lock()
  1843  	defer k.cgroupMountsMapMu.Unlock()
  1844  
  1845  	return k.cgroupMountsMap[ctl]
  1846  }
  1847  
  1848  // releaseCgroupMounts releases the cgroup mounts.
  1849  func (k *Kernel) releaseCgroupMounts(ctx context.Context) {
  1850  	k.cgroupMountsMapMu.Lock()
  1851  	defer k.cgroupMountsMapMu.Unlock()
  1852  
  1853  	for _, m := range k.cgroupMountsMap {
  1854  		m.Mount.DecRef(ctx)
  1855  		m.Root.DecRef(ctx)
  1856  		m.Fs.DecRef(ctx)
  1857  	}
  1858  }
  1859  
  1860  // Release releases resources owned by k.
  1861  //
  1862  // Precondition: This should only be called after the kernel is fully
  1863  // initialized, e.g. after k.Start() has been called.
  1864  func (k *Kernel) Release() {
  1865  	ctx := k.SupervisorContext()
  1866  	k.releaseCgroupMounts(ctx)
  1867  	k.hostMount.DecRef(ctx)
  1868  	k.pipeMount.DecRef(ctx)
  1869  	k.nsfsMount.DecRef(ctx)
  1870  	k.shmMount.DecRef(ctx)
  1871  	k.socketMount.DecRef(ctx)
  1872  	k.vfs.Release(ctx)
  1873  	k.timekeeper.Destroy()
  1874  	k.vdso.Release(ctx)
  1875  	k.RootNetworkNamespace().DecRef(ctx)
  1876  	k.rootIPCNamespace.DecRef(ctx)
  1877  	k.rootUTSNamespace.DecRef(ctx)
  1878  	k.cleanupDevGofers()
  1879  }
  1880  
  1881  // PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
  1882  // hierarchy.
  1883  //
  1884  // Precondition: root must be a new cgroup with no tasks. This implies the
  1885  // controllers for root are also new and currently manage no tasks, which in turn
  1886  // implies the new cgroup can be populated without migrating tasks between
  1887  // cgroups.
  1888  func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) {
  1889  	k.tasks.mu.RLock()
  1890  	k.tasks.forEachTaskLocked(func(t *Task) {
  1891  		if t.exitState != TaskExitNone {
  1892  			return
  1893  		}
  1894  		t.mu.Lock()
  1895  		// A task may already be in the cgroup if it was created after the
  1896  		// cgroup hierarchy was registered.
  1897  		t.enterCgroupIfNotYetLocked(root)
  1898  		t.mu.Unlock()
  1899  	})
  1900  	k.tasks.mu.RUnlock()
  1901  }
  1902  
  1903  // ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the
  1904  // hierarchy with the provided id.  This is intended for use during hierarchy
  1905  // teardown, as otherwise the tasks would be orphaned w.r.t. some controllers.
  1906  func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) {
  1907  	var releasedCGs []Cgroup
  1908  
  1909  	k.tasks.mu.RLock()
  1910  	// We'll have one cgroup per hierarchy per task.
  1911  	releasedCGs = make([]Cgroup, 0, len(k.tasks.Root.tids))
  1912  	k.tasks.forEachTaskLocked(func(t *Task) {
  1913  		if t.exitState != TaskExitNone {
  1914  			return
  1915  		}
  1916  		t.mu.Lock()
  1917  		for cg := range t.cgroups {
  1918  			if cg.HierarchyID() == hid {
  1919  				cg.Leave(t)
  1920  				t.ResetMemCgIDFromCgroup(cg)
  1921  				delete(t.cgroups, cg)
  1922  				releasedCGs = append(releasedCGs, cg)
  1923  				// A task can't be part of multiple cgroups from the same
  1924  				// hierarchy, so we can skip checking the rest once we find a
  1925  				// match.
  1926  				break
  1927  			}
  1928  		}
  1929  		t.mu.Unlock()
  1930  	})
  1931  	k.tasks.mu.RUnlock()
  1932  
  1933  	for _, c := range releasedCGs {
  1934  		c.decRef()
  1935  	}
  1936  }
  1937  
  1938  // ReplaceFSContextRoots updates root and cwd to `newRoot` in the FSContext
  1939  // across all tasks whose old root or cwd were `oldRoot`.
  1940  func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualDentry, newRoot vfs.VirtualDentry) {
  1941  	k.tasks.mu.RLock()
  1942  	oldRootDecRefs := 0
  1943  	k.tasks.forEachTaskLocked(func(t *Task) {
  1944  		t.mu.Lock()
  1945  		defer t.mu.Unlock()
  1946  		if fsc := t.fsContext; fsc != nil {
  1947  			fsc.mu.Lock()
  1948  			defer fsc.mu.Unlock()
  1949  			if fsc.root == oldRoot {
  1950  				newRoot.IncRef()
  1951  				oldRootDecRefs++
  1952  				fsc.root = newRoot
  1953  			}
  1954  			if fsc.cwd == oldRoot {
  1955  				newRoot.IncRef()
  1956  				oldRootDecRefs++
  1957  				fsc.cwd = newRoot
  1958  			}
  1959  		}
  1960  	})
  1961  	k.tasks.mu.RUnlock()
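        	// Each replacement above took a new reference on newRoot and
        	// counted a matching release of oldRoot. The DecRefs happen here,
        	// after tasks.mu has been released, keeping any release work out
        	// of the lock's critical section.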
  1962  	for i := 0; i < oldRootDecRefs; i++ {
  1963  		oldRoot.DecRef(ctx)
  1964  	}
  1965  }
  1966  
  1967  // GetUserCounters returns the user counters for the given KUID.
  1968  func (k *Kernel) GetUserCounters(uid auth.KUID) *UserCounters {
  1969  	k.userCountersMapMu.Lock()
  1970  	defer k.userCountersMapMu.Unlock()
  1971  
  1972  	if uc, ok := k.userCountersMap[uid]; ok {
  1973  		return uc
  1974  	}
  1975  
  1976  	uc := &UserCounters{}
  1977  	k.userCountersMap[uid] = uc
  1978  	return uc
  1979  }
  1980  
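        // Illustrative note (not part of the original source): the map is
        // populated lazily under userCountersMapMu, so concurrent first
        // lookups for the same KUID serialize on the mutex and all observe
        // the same *UserCounters value.
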
  1981  // AddDevGofer initializes the dev gofer connection and starts tracking it.
  1982  // It takes ownership of goferFD.
  1983  func (k *Kernel) AddDevGofer(contName string, goferFD int) error {
  1984  	client, err := devutil.NewGoferClient(k.SupervisorContext(), contName, goferFD)
  1985  	if err != nil {
  1986  		return err
  1987  	}
  1988  
  1989  	k.devGofersMu.Lock()
  1990  	defer k.devGofersMu.Unlock()
  1991  	if k.devGofers == nil {
  1992  		k.devGofers = make(map[string]*devutil.GoferClient)
  1993  	}
  1994  	k.devGofers[contName] = client
  1995  	return nil
  1996  }
  1997  
  1998  // RemoveDevGofer closes the dev gofer connection, if one exists, and stops
  1999  // tracking it.
  2000  func (k *Kernel) RemoveDevGofer(contName string) {
  2001  	k.devGofersMu.Lock()
  2002  	defer k.devGofersMu.Unlock()
  2003  	client, ok := k.devGofers[contName]
  2004  	if !ok {
  2005  		return
  2006  	}
  2007  	client.Close()
  2008  	delete(k.devGofers, contName)
  2009  }
  2010  
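        // Illustrative sketch (not part of the original source): a
        // hypothetical container lifecycle pairs these calls ("ctr1" and
        // goferFD are placeholder values):
        //
        //	if err := k.AddDevGofer("ctr1", goferFD); err != nil {
        //		return err
        //	}
        //	// ... container runs; device files are served via the gofer ...
        //	k.RemoveDevGofer("ctr1") // closes the connection, stops tracking
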
  2011  // GetDevGoferClient implements
  2012  // devutil.GoferClientProviderFromContext.GetDevGoferClient.
  2013  func (k *Kernel) GetDevGoferClient(contName string) *devutil.GoferClient {
  2014  	k.devGofersMu.Lock()
  2015  	defer k.devGofersMu.Unlock()
  2016  	return k.devGofers[contName]
  2017  }
  2018  
  2019  func (k *Kernel) cleanupDevGofers() {
  2020  	k.devGofersMu.Lock()
  2021  	defer k.devGofersMu.Unlock()
  2022  	for _, client := range k.devGofers {
  2023  		client.Close()
  2024  	}
  2025  	k.devGofers = nil
  2026  }
  2027  
  2028  // RegisterContainerName registers a container name for a given container ID.
  2029  func (k *Kernel) RegisterContainerName(cid, containerName string) {
  2030  	k.extMu.Lock()
  2031  	defer k.extMu.Unlock()
  2032  	k.containerNames[cid] = containerName
  2033  }
  2034  
  2035  // RestoreContainerMapping remaps old container IDs to new ones after a restore.
  2036  // containerIDs maps "name -> new container ID". Note that container names remain
  2037  // constant between restore sessions.
  2038  func (k *Kernel) RestoreContainerMapping(containerIDs map[string]string) {
  2039  	k.extMu.Lock()
  2040  	defer k.extMu.Unlock()
  2041  
  2042  	// Delete mappings from the old session and replace them with new values.
  2043  	k.containerNames = make(map[string]string)
  2044  	for name, cid := range containerIDs {
  2045  		k.containerNames[cid] = name
  2046  	}
  2047  }
  2048  
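        // Illustrative note (not part of the original source): the parameter
        // is keyed by name while k.containerNames is keyed by container ID, so
        // the loop above inverts the mapping; e.g. an input of
        // {"db": "cid-new-1"} (hypothetical values) yields
        // k.containerNames["cid-new-1"] == "db".
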
  2049  // ContainerName returns the container name for a given container ID.
  2050  func (k *Kernel) ContainerName(cid string) string {
  2051  	k.extMu.Lock()
  2052  	defer k.extMu.Unlock()
  2053  	return k.containerNames[cid]
  2054  }