github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/kernel.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernel provides an emulation of the Linux kernel.
//
// See README.md for a detailed overview.
//
// Lock order (outermost locks must be taken first):
//
//	Kernel.extMu
//		ThreadGroup.timerMu
//		  ktime.Timer.mu (for IntervalTimer) and Kernel.cpuClockMu
//		    TaskSet.mu
//		      SignalHandlers.mu
//		        Task.mu
//		    runningTasksMu
//
// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
// time requires locking all of their signal mutexes first.
package kernel
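
// Editorial illustration (not part of the original file): a sketch of the
// lock order above being honored when signalling two thread groups at once.
// The helper name and the direct field accesses are hypothetical; real call
// sites go through TaskSet and ThreadGroup methods.
//
//	func signalTwoThreadGroups(ts *TaskSet, a, b *ThreadGroup) {
//		ts.mu.Lock() // TaskSet.mu first, exclusively, per the lock order.
//		a.signalHandlers.mu.Lock()
//		b.signalHandlers.mu.Lock()
//		// ... deliver the signals ...
//		b.signalHandlers.mu.Unlock()
//		a.signalHandlers.mu.Unlock()
//		ts.mu.Unlock()
//	}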

import (
	"errors"
	"fmt"
	"path/filepath"
	"time"

	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/cleanup"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/cpuid"
	"github.com/metacubex/gvisor/pkg/devutil"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/eventchannel"
	"github.com/metacubex/gvisor/pkg/fspath"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/refs"
	"github.com/metacubex/gvisor/pkg/sentry/arch"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/nsfs"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/pipefs"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/sockfs"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/timerfd"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/tmpfs"
	"github.com/metacubex/gvisor/pkg/sentry/hostcpu"
	"github.com/metacubex/gvisor/pkg/sentry/inet"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/futex"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/ipc"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/sched"
	ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time"
	"github.com/metacubex/gvisor/pkg/sentry/limits"
	"github.com/metacubex/gvisor/pkg/sentry/loader"
	"github.com/metacubex/gvisor/pkg/sentry/mm"
	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
	"github.com/metacubex/gvisor/pkg/sentry/platform"
	"github.com/metacubex/gvisor/pkg/sentry/socket/netlink/port"
	sentrytime "github.com/metacubex/gvisor/pkg/sentry/time"
	"github.com/metacubex/gvisor/pkg/sentry/unimpl"
	uspb "github.com/metacubex/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
	"github.com/metacubex/gvisor/pkg/sentry/uniqueid"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/state"
	"github.com/metacubex/gvisor/pkg/state/wire"
	"github.com/metacubex/gvisor/pkg/sync"
	"github.com/metacubex/gvisor/pkg/tcpip"
)

// IOUringEnabled is set to true when IO_URING is enabled. Added as a global to
// allow easy access everywhere.
var IOUringEnabled = false

// UserCounters is a set of user counters.
//
// +stateify savable
type UserCounters struct {
	uid auth.KUID

	rlimitNProc atomicbitops.Uint64
}

// incRLimitNProc increments the rlimitNProc counter.
func (uc *UserCounters) incRLimitNProc(ctx context.Context) error {
	lim := limits.FromContext(ctx).Get(limits.ProcessCount)
	creds := auth.CredentialsFromContext(ctx)
	nproc := uc.rlimitNProc.Add(1)
	if nproc > lim.Cur &&
		!creds.HasCapability(linux.CAP_SYS_ADMIN) &&
		!creds.HasCapability(linux.CAP_SYS_RESOURCE) {
		uc.rlimitNProc.Add(^uint64(0))
		return linuxerr.EAGAIN
	}
	return nil
}

// decRLimitNProc decrements the rlimitNProc counter.
func (uc *UserCounters) decRLimitNProc() {
	uc.rlimitNProc.Add(^uint64(0))
}
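
// Editorial sketch (not part of the original file): incRLimitNProc and
// decRLimitNProc are paired across a task's lifetime, and a failed increment
// already rolls itself back, so it must not be matched with a decrement. The
// createTask helper below is hypothetical.
//
//	uc := k.GetUserCounters(creds.RealKUID)
//	if err := uc.incRLimitNProc(ctx); err != nil {
//		return err // RLIMIT_NPROC exceeded; the counter was already rolled back.
//	}
//	t, err := createTask(ctx) // hypothetical
//	if err != nil {
//		uc.decRLimitNProc() // undo the increment for the task that never ran
//		return err
//	}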

// CgroupMount contains the cgroup mount. These mounts are created for the root
// container by default and are stored in the kernel.
//
// +stateify savable
type CgroupMount struct {
	Fs    *vfs.Filesystem
	Root  *vfs.Dentry
	Mount *vfs.Mount
}

// Kernel represents an emulated Linux kernel. It must be initialized by calling
// Init() or LoadFrom().
//
// +stateify savable
type Kernel struct {
	// extMu serializes external changes to the Kernel with calls to
	// Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
	// remains frozen for the duration of the call; it requires that the Kernel
	// is paused as a precondition, which ensures that none of the tasks
	// running within the Kernel can affect its state, but extMu is required to
	// ensure that concurrent users of the Kernel *outside* the Kernel's
	// control cannot affect its state by calling e.g.
	// Kernel.SendExternalSignal.)
	extMu sync.Mutex `state:"nosave"`

	// started is true if Start has been called. Unless otherwise specified,
	// all Kernel fields become immutable once started becomes true.
	started bool `state:"nosave"`

	// All of the following fields are immutable unless otherwise specified.

	// Platform is the platform that is used to execute tasks in the created
	// Kernel.
	platform.Platform `state:"nosave"`

	// mf provides application memory.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// See InitKernelArgs for the meaning of these fields.
	featureSet           cpuid.FeatureSet
	timekeeper           *Timekeeper
	tasks                *TaskSet
	rootUserNamespace    *auth.UserNamespace
	rootNetworkNamespace *inet.Namespace
	applicationCores     uint
	useHostCores         bool
	extraAuxv            []arch.AuxEntry
	vdso                 *loader.VDSO
	rootUTSNamespace     *UTSNamespace
	rootIPCNamespace     *IPCNamespace

	// futexes is the "root" futex.Manager, from which all others are forked.
	// This is necessary to ensure that shared futexes are coherent across all
	// tasks, including those created by CreateProcess.
	futexes *futex.Manager

	// globalInit is the thread group whose leader has ID 1 in the root PID
	// namespace. globalInit is stored separately so that it is accessible even
	// after all tasks in the thread group have exited, such that ID 1 is no
	// longer mapped.
	//
	// globalInit is mutable until it is assigned by the first successful call
	// to CreateProcess, and is protected by extMu.
	globalInit *ThreadGroup

	// syslog is the kernel log.
	syslog syslog

	runningTasksMu runningTasksMutex `state:"nosave"`

	// runningTasks is the total count of tasks currently in
	// TaskGoroutineRunningSys or TaskGoroutineRunningApp, i.e. tasks that are
	// not blocked or stopped.
	//
	// runningTasks must be accessed atomically. Increments from 0 to 1 are
	// further protected by runningTasksMu (see incRunningTasks).
	runningTasks atomicbitops.Int64

	// runningTasksCond is signaled when runningTasks is incremented from 0 to 1.
	//
	// Invariant: runningTasksCond.L == &runningTasksMu.
	runningTasksCond sync.Cond `state:"nosave"`

	// cpuClock is incremented every linux.ClockTick by a goroutine running
	// kernel.runCPUClockTicker() while runningTasks != 0.
	//
	// cpuClock is used to measure task CPU usage, since sampling monotonicClock
	// twice on every syscall turns out to be unreasonably expensive. This is
	// similar to how Linux does task CPU accounting on x86
	// (CONFIG_IRQ_TIME_ACCOUNTING), although Linux also uses scheduler timing
	// information to improve resolution
	// (kernel/sched/cputime.c:cputime_adjust()), which we can't do since
	// "preemptive" scheduling is managed by the Go runtime, which doesn't
	// provide this information.
	//
	// cpuClock is mutable, and is accessed using atomic memory operations.
	cpuClock atomicbitops.Uint64

	// cpuClockTickTimer drives increments of cpuClock.
	cpuClockTickTimer *time.Timer `state:"nosave"`

	// cpuClockMu is used to make increments of cpuClock, and updates of timers
	// based on cpuClock, atomic.
	cpuClockMu cpuClockMutex `state:"nosave"`

	// cpuClockTickerRunning is true if the goroutine that increments cpuClock is
	// running and false if it is blocked in runningTasksCond.Wait() or if it
	// never started.
	//
	// cpuClockTickerRunning is protected by runningTasksMu.
	cpuClockTickerRunning bool

	// cpuClockTickerWakeCh is sent to wake the goroutine that increments
	// cpuClock if it's sleeping between ticks.
	cpuClockTickerWakeCh chan struct{} `state:"nosave"`

	// cpuClockTickerStopCond is broadcast when cpuClockTickerRunning transitions
	// from true to false.
	//
	// Invariant: cpuClockTickerStopCond.L == &runningTasksMu.
	cpuClockTickerStopCond sync.Cond `state:"nosave"`

	// uniqueID is used to generate unique identifiers.
	//
	// uniqueID is mutable, and is accessed using atomic memory operations.
	uniqueID atomicbitops.Uint64

	// nextInotifyCookie is a monotonically increasing counter used for
	// generating unique inotify event cookies.
	//
	// nextInotifyCookie is mutable.
	nextInotifyCookie atomicbitops.Uint32

	// netlinkPorts manages allocation of netlink socket port IDs.
	netlinkPorts *port.Manager

	// saveStatus is nil if the sandbox has not been saved, errSaved or
	// errAutoSaved if it has been saved successfully, or the error causing the
	// sandbox to exit during save.
	// It is protected by extMu.
	saveStatus error `state:"nosave"`

	// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
	danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`

	// sockets records all network sockets in the system. Protected by extMu.
	sockets map[*vfs.FileDescription]*SocketRecord

	// nextSocketRecord is the next entry number to use in sockets. Protected
	// by extMu.
	nextSocketRecord uint64

	// unimplementedSyscallEmitterOnce is used in the initialization of
	// unimplementedSyscallEmitter.
	unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`

	// unimplementedSyscallEmitter is used to emit unimplemented syscall
	// events. This is initialized lazily on the first unimplemented
	// syscall.
	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`

	// SpecialOpts contains special kernel options.
	SpecialOpts

	// vfs keeps the filesystem state used across the kernel.
	vfs vfs.VirtualFilesystem

	// hostMount is the Mount used for file descriptors that were imported
	// from the host.
	hostMount *vfs.Mount

	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
	// syscalls (as opposed to named pipes created by mknod()).
	pipeMount *vfs.Mount

	// nsfsMount is the Mount used for namespaces.
	nsfsMount *vfs.Mount

	// shmMount is the Mount used for anonymous files created by the
	// memfd_create() syscalls. It is analogous to Linux's shm_mnt.
	shmMount *vfs.Mount

	// socketMount is the Mount used for sockets created by the socket() and
	// socketpair() syscalls. There are several cases where a socket dentry will
	// not be contained in socketMount:
	// 1. Socket files created by mknod()
	// 2. Socket fds imported from the host (Kernel.hostMount is used for these)
	// 3. Socket files created by binding Unix sockets to a file path
	socketMount *vfs.Mount

	// sysVShmDevID is the device number used by SysV shm segments. In Linux,
	// SysV shm uses shmem_file_setup() and thus uses shm_mnt's device number.
	// In gVisor, the shm implementation does not use shmMount; extracting
	// shmMount's device number is inconvenient; applications accept a
	// different device number in practice; and using a distinct device number
	// avoids the possibility of inode number collisions due to the hack
	// described in shm.Shm.InodeID().
	sysVShmDevID uint32

	// If set to true, report address space activation waits as if the task is
	// in external wait so that the watchdog doesn't report the task as stuck.
	SleepForAddressSpaceActivation bool

	// Exceptions to YAMA ptrace restrictions. Each key-value pair represents a
	// tracee-tracer relationship. The key is a process (technically, the thread
	// group leader) that can be traced by any thread that is a descendant of the
	// value. If the value is nil, then anyone can trace the process represented by
	// the key.
	//
	// ptraceExceptions is protected by the TaskSet mutex.
	ptraceExceptions map[*Task]*Task

	// YAMAPtraceScope is the current level of YAMA ptrace restrictions.
	YAMAPtraceScope atomicbitops.Int32

	// cgroupRegistry contains the set of active cgroup controllers on the
	// system. It is controlled by cgroupfs. Nil if cgroupfs is unavailable on
	// the system.
	cgroupRegistry *CgroupRegistry

	// cgroupMountsMap maps the cgroup controller names to the cgroup mounts
	// created for the root container. These mounts are then bind mounted
	// for other application containers by creating their own container
	// directories.
	cgroupMountsMap   map[string]*CgroupMount
	cgroupMountsMapMu cgroupMountsMutex `state:"nosave"`

	// userCountersMap maps auth.KUID into a set of user counters.
	userCountersMap   map[auth.KUID]*UserCounters
	userCountersMapMu userCountersMutex `state:"nosave"`

	// MaxFDLimit specifies the maximum file descriptor number that can be
	// used by processes.
	MaxFDLimit atomicbitops.Int32

	// devGofers maps container ID to its device gofer client.
	devGofers   map[string]*devutil.GoferClient `state:"nosave"`
	devGofersMu sync.Mutex                      `state:"nosave"`
}

// InitKernelArgs holds arguments to Init.
type InitKernelArgs struct {
	// FeatureSet is the emulated CPU feature set.
	FeatureSet cpuid.FeatureSet

	// Timekeeper manages time for all tasks in the system.
	Timekeeper *Timekeeper

	// RootUserNamespace is the root user namespace.
	RootUserNamespace *auth.UserNamespace

	// RootNetworkNamespace is the root network namespace. If nil, no networking
	// will be available.
	RootNetworkNamespace *inet.Namespace

	// ApplicationCores is the number of logical CPUs visible to sandboxed
	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
	// ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
	// most significant bit in cpu_possible_mask + 1.
	ApplicationCores uint

	// If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
	// instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
	// no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
	// will be overridden.
	UseHostCores bool

	// ExtraAuxv contains additional auxiliary vector entries that are added to
	// each process by the ELF loader.
	ExtraAuxv []arch.AuxEntry

	// Vdso holds the VDSO and its parameter page.
	Vdso *loader.VDSO

	// RootUTSNamespace is the root UTS namespace.
	RootUTSNamespace *UTSNamespace

	// RootIPCNamespace is the root IPC namespace.
	RootIPCNamespace *IPCNamespace

	// PIDNamespace is the root PID namespace.
	PIDNamespace *PIDNamespace

	// MaxFDLimit specifies the maximum file descriptor number that can be
	// used by processes. If it is zero, the limit will be set to unlimited.
	MaxFDLimit int32
}

// Init initializes the Kernel with no tasks.
//
// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
// before calling Init.
func (k *Kernel) Init(args InitKernelArgs) error {
	if args.Timekeeper == nil {
		return fmt.Errorf("args.Timekeeper is nil")
	}
	if args.Timekeeper.clocks == nil {
		return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
	}
	if args.RootUserNamespace == nil {
		return fmt.Errorf("args.RootUserNamespace is nil")
	}
	if args.ApplicationCores == 0 {
		return fmt.Errorf("args.ApplicationCores is 0")
	}

	k.featureSet = args.FeatureSet
	k.timekeeper = args.Timekeeper
	k.tasks = newTaskSet(args.PIDNamespace)
	k.rootUserNamespace = args.RootUserNamespace
	k.rootUTSNamespace = args.RootUTSNamespace
	k.rootIPCNamespace = args.RootIPCNamespace
	k.rootNetworkNamespace = args.RootNetworkNamespace
	if k.rootNetworkNamespace == nil {
		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil, args.RootUserNamespace)
	}
	k.runningTasksCond.L = &k.runningTasksMu
	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
	k.cpuClockTickerStopCond.L = &k.runningTasksMu
	k.applicationCores = args.ApplicationCores
	if args.UseHostCores {
		k.useHostCores = true
		maxCPU, err := hostcpu.MaxPossibleCPU()
		if err != nil {
			return fmt.Errorf("failed to get maximum CPU number: %v", err)
		}
		minAppCores := uint(maxCPU) + 1
		if k.applicationCores < minAppCores {
			log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
			k.applicationCores = minAppCores
		}
	}
	k.extraAuxv = args.ExtraAuxv
	k.vdso = args.Vdso
	k.futexes = futex.NewManager()
	k.netlinkPorts = port.New()
	k.ptraceExceptions = make(map[*Task]*Task)
	k.YAMAPtraceScope = atomicbitops.FromInt32(linux.YAMA_SCOPE_RELATIONAL)
	k.userCountersMap = make(map[auth.KUID]*UserCounters)
	if args.MaxFDLimit == 0 {
		args.MaxFDLimit = MaxFdLimit
	}
	k.MaxFDLimit.Store(args.MaxFDLimit)

	ctx := k.SupervisorContext()
	if err := k.vfs.Init(ctx); err != nil {
		return fmt.Errorf("failed to initialize VFS: %v", err)
	}

	err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx))
	if err != nil {
		return fmt.Errorf("failed to create mqfs filesystem: %v", err)
	}

	pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create pipefs filesystem: %v", err)
	}
	defer pipeFilesystem.DecRef(ctx)
	pipeMount := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
	k.pipeMount = pipeMount

	nsfsFilesystem, err := nsfs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create nsfs filesystem: %v", err)
	}
	defer nsfsFilesystem.DecRef(ctx)
	k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
	k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace))
	k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
	k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))

	tmpfsOpts := vfs.GetFilesystemOptions{
		InternalData: tmpfs.FilesystemOpts{
			// See mm/shmem.c:shmem_init() => vfs_kern_mount(flags=SB_KERNMOUNT).
			// Note how mm/shmem.c:shmem_fill_super() does not provide a default
			// value for sbinfo->max_blocks when SB_KERNMOUNT is set.
			DisableDefaultSizeLimit: true,
		},
		InternalMount: true,
	}
	tmpfsFilesystem, tmpfsRoot, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace), "", tmpfsOpts)
	if err != nil {
		return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
	}
	defer tmpfsFilesystem.DecRef(ctx)
	defer tmpfsRoot.DecRef(ctx)
	k.shmMount = k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})

	socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create sockfs filesystem: %v", err)
	}
	defer socketFilesystem.DecRef(ctx)
	k.socketMount = k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})

	sysVShmDevMinor, err := k.vfs.GetAnonBlockDevMinor()
	if err != nil {
		return fmt.Errorf("failed to get device number for SysV shm: %v", err)
	}
	k.sysVShmDevID = linux.MakeDeviceID(linux.UNNAMED_MAJOR, sysVShmDevMinor)

	k.sockets = make(map[*vfs.FileDescription]*SocketRecord)

	k.cgroupRegistry = newCgroupRegistry()
	return nil
}
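
// Editorial sketch (not part of the original file): the construction order
// implied by the doc comment above. The platform p, MemoryFile mf, and the
// namespace/timekeeper values are assumed to come from the embedder; error
// handling is elided.
//
//	k := &Kernel{Platform: p}
//	k.SetMemoryFile(mf) // must precede Init
//	err := k.Init(InitKernelArgs{
//		FeatureSet:        fs, // emulated cpuid.FeatureSet
//		Timekeeper:        tk, // SetClocks() must already have been called
//		RootUserNamespace: userns,
//		ApplicationCores:  8,
//		RootUTSNamespace:  utsns,
//		RootIPCNamespace:  ipcns,
//		PIDNamespace:      pidns,
//	})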

// +stateify savable
type privateMemoryFileMetadata struct {
	owners []string
}

func savePrivateMFs(ctx context.Context, w wire.Writer, mfsToSave map[string]*pgalloc.MemoryFile) error {
	var meta privateMemoryFileMetadata
	// Generate the order in which private memory files are saved.
	for fsID := range mfsToSave {
		meta.owners = append(meta.owners, fsID)
	}
	// Save the metadata.
	if _, err := state.Save(ctx, w, &meta); err != nil {
		return err
	}
	// Followed by the private memory files in order.
	for _, fsID := range meta.owners {
		if err := mfsToSave[fsID].SaveTo(ctx, w); err != nil {
			return err
		}
	}
	return nil
}

func loadPrivateMFs(ctx context.Context, r wire.Reader) error {
	// Load the metadata.
	var meta privateMemoryFileMetadata
	if _, err := state.Load(ctx, r, &meta); err != nil {
		return err
	}
	mfmap := pgalloc.MemoryFileMapFromContext(ctx)
	// Ensure that it is consistent with CtxFilesystemMemoryFileMap.
	if len(mfmap) != len(meta.owners) {
		return fmt.Errorf("inconsistent private memory files on restore: savedMFOwners = %v, CtxFilesystemMemoryFileMap = %v", meta.owners, mfmap)
	}
	// Load all private memory files.
	for _, fsID := range meta.owners {
		mf, ok := mfmap[fsID]
		if !ok {
			return fmt.Errorf("saved memory file for %q was not configured on restore", fsID)
		}
		if err := mf.LoadFrom(ctx, r); err != nil {
			return err
		}
	}
	return nil
}

// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
	saveStart := time.Now()

	// Do not allow other Kernel methods to affect it while it's being saved.
	k.extMu.Lock()
	defer k.extMu.Unlock()

	// Stop time.
	k.pauseTimeLocked(ctx)
	defer k.resumeTimeLocked(ctx)

	// Evict all evictable MemoryFile allocations.
	k.mf.StartEvictions()
	k.mf.WaitForEvictions()

	// Discard unsavable mappings, such as those for host file descriptors.
	if err := k.invalidateUnsavableMappings(ctx); err != nil {
		return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
	}

	// Capture all private memory files.
	mfsToSave := make(map[string]*pgalloc.MemoryFile)
	vfsCtx := context.WithValue(ctx, pgalloc.CtxMemoryFileMap, mfsToSave)
	// Prepare filesystems for saving. This must be done after
	// invalidateUnsavableMappings(), since dropping memory mappings may
	// affect filesystem state (e.g. page cache reference counts).
	if err := k.vfs.PrepareSave(vfsCtx); err != nil {
		return err
	}
	// Mark all to-be-saved MemoryFiles as savable to inform kernel save below.
	k.mf.MarkSavable()
	for _, mf := range mfsToSave {
		mf.MarkSavable()
	}

	// Save the CPUID FeatureSet before the rest of the kernel so we can
	// verify its compatibility on restore before attempting to restore the
	// entire kernel, which may fail on an incompatible machine.
	//
	// N.B. This will also be saved along with the full kernel save below.
	cpuidStart := time.Now()
	if _, err := state.Save(ctx, w, &k.featureSet); err != nil {
		return err
	}
	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))

	// The timekeeper's state is saved as part of the kernel state below.

	if rootNS := k.rootNetworkNamespace; rootNS != nil && rootNS.Stack() != nil {
		// Pause the network stack.
		netstackPauseStart := time.Now()
		log.Infof("Pausing root network namespace")
		k.rootNetworkNamespace.Stack().Pause()
		defer k.rootNetworkNamespace.Stack().Resume()
		log.Infof("Pausing root network namespace took [%s].", time.Since(netstackPauseStart))
	}

	// Save the kernel state.
	kernelStart := time.Now()
	stats, err := state.Save(ctx, w, k)
	if err != nil {
		return err
	}
	log.Infof("Kernel save stats: %s", stats.String())
	log.Infof("Kernel save took [%s].", time.Since(kernelStart))

	// Save the memory files' state.
	memoryStart := time.Now()
	if err := k.mf.SaveTo(ctx, w); err != nil {
		return err
	}
	if err := savePrivateMFs(ctx, w, mfsToSave); err != nil {
		return err
	}
	log.Infof("Memory files save took [%s].", time.Since(memoryStart))

	log.Infof("Overall save took [%s].", time.Since(saveStart))

	return nil
}
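
// Editorial sketch (not part of the original file): SaveTo requires the
// kernel to be paused for the whole call, which an embedder can arrange with
// Pause/Unpause (defined later in this file), typically pulling full task
// states first. Error handling is elided.
//
//	k.Pause()
//	defer k.Unpause()
//	k.ReceiveTaskStates()
//	err := k.SaveTo(ctx, w) // w is the wire.Writer for the state file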

// Preconditions: The kernel must be paused.
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
	invalidated := make(map[*mm.MemoryManager]struct{})
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()
	for t := range k.tasks.Root.tids {
		// We can skip locking Task.mu here since the kernel is paused.
		if memMgr := t.image.MemoryManager; memMgr != nil {
			if _, ok := invalidated[memMgr]; !ok {
				if err := memMgr.InvalidateUnsavable(ctx); err != nil {
					return err
				}
				invalidated[memMgr] = struct{}{}
			}
		}
		// I really wish we just had a sync.Map of all MMs...
		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
			if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
				return err
			}
		}
	}
	return nil
}

// LoadFrom loads the state of k, previously written by SaveTo, from r.
func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
	loadStart := time.Now()

	k.runningTasksCond.L = &k.runningTasksMu
	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
	k.cpuClockTickerStopCond.L = &k.runningTasksMu

	initAppCores := k.applicationCores

	// Load the pre-saved CPUID FeatureSet.
	//
	// N.B. This was also saved along with the full kernel below, so we
	// don't need to explicitly install it in the Kernel.
	cpuidStart := time.Now()
	if _, err := state.Load(ctx, r, &k.featureSet); err != nil {
		return err
	}
	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))

	// Verify that the FeatureSet is usable on this host. We do this before
	// Kernel load so that the explicit CPUID mismatch error has priority
	// over floating point state restore errors that may occur on load on
	// an incompatible machine.
	if err := k.featureSet.CheckHostCompatible(); err != nil {
		return err
	}

	// Load the kernel state.
	kernelStart := time.Now()
	stats, err := state.Load(ctx, r, k)
	if err != nil {
		return err
	}
	log.Infof("Kernel load stats: %s", stats.String())
	log.Infof("Kernel load took [%s].", time.Since(kernelStart))

	// rootNetworkNamespace should be populated after loading the state file.
	// Restore the root network stack.
	k.rootNetworkNamespace.RestoreRootStack(net)

	// Load the memory files' state.
	memoryStart := time.Now()
	if err := k.mf.LoadFrom(ctx, r); err != nil {
		return err
	}
	if err := loadPrivateMFs(ctx, r); err != nil {
		return err
	}
	log.Infof("Memory files load took [%s].", time.Since(memoryStart))

	log.Infof("Overall load took [%s]", time.Since(loadStart))

	k.Timekeeper().SetClocks(clocks)

	if timeReady != nil {
		close(timeReady)
	}

	if net != nil {
		net.Restore()
	}

	if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
		return err
	}

	tcpip.AsyncLoading.Wait()

	log.Infof("Overall load took [%s] after async work", time.Since(loadStart))

	// Applications may size per-cpu structures based on k.applicationCores, so
	// it can't change across save/restore. When we are virtualizing CPU
	// numbers, this isn't a problem. However, when we are exposing host CPU
	// assignments, we can't tolerate an increase in the number of host CPUs,
	// which could result in getcpu(2) returning CPUs that applications expect
	// not to exist.
	if k.useHostCores && initAppCores > k.applicationCores {
		return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
	}

	return nil
}

// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
	id := k.uniqueID.Add(1)
	if id == 0 {
		panic("unique identifier generator wrapped around")
	}
	return id
}

// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
	// Filename is the filename to load as the init binary.
	//
	// If this is provided as "", File will be checked, then the file will be
	// guessed via Argv[0].
	Filename string

	// File is a passed host FD pointing to a file to load as the init binary.
	//
	// This is checked if and only if Filename is "".
	File *vfs.FileDescription

	// Argv is a list of arguments.
	Argv []string

	// Envv is a list of environment variables.
	Envv []string

	// WorkingDirectory is the initial working directory.
	//
	// This defaults to the root if empty.
	WorkingDirectory string

	// Credentials is the initial credentials.
	Credentials *auth.Credentials

	// FDTable is the initial set of file descriptors. If CreateProcess succeeds,
	// it takes a reference on FDTable.
	FDTable *FDTable

	// Umask is the initial umask.
	Umask uint

	// Limits are the initial resource limits.
	Limits *limits.LimitSet

	// MaxSymlinkTraversals is the maximum number of symlinks to follow
	// during resolution.
	MaxSymlinkTraversals uint

	// UTSNamespace is the initial UTS namespace.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the initial IPC namespace.
	IPCNamespace *IPCNamespace

	// PIDNamespace is the initial PID Namespace.
	PIDNamespace *PIDNamespace

	// MountNamespace optionally contains the mount namespace for this
	// process. If nil, the init process's mount namespace is used.
	//
	// Anyone setting MountNamespace must donate a reference (i.e.
	// increment it).
	MountNamespace *vfs.MountNamespace

	// ContainerID is the container that the process belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the container is initialized to.
	InitialCgroups map[Cgroup]struct{}
}

// NewContext returns a context.Context that represents the task that will be
// created by CreateProcess using args.
func (args *CreateProcessArgs) NewContext(k *Kernel) context.Context {
	return &createProcessContext{
		Context: context.Background(),
		kernel:  k,
		args:    args,
	}
}

// createProcessContext is a context.Context that represents the context
// associated with a task that is being created.
type createProcessContext struct {
	context.Context
	kernel *Kernel
	args   *CreateProcessArgs
}

// Value implements context.Context.Value.
func (ctx *createProcessContext) Value(key any) any {
	switch key {
	case CtxKernel:
		return ctx.kernel
	case CtxPIDNamespace:
		return ctx.args.PIDNamespace
	case CtxUTSNamespace:
		utsns := ctx.args.UTSNamespace
		utsns.IncRef()
		return utsns
	case ipc.CtxIPCNamespace:
		ipcns := ctx.args.IPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		return ctx.args.Credentials
	case vfs.CtxRoot:
		if ctx.args.MountNamespace == nil {
			return nil
		}
		root := ctx.args.MountNamespace.Root(ctx)
		return root
	case vfs.CtxMountNamespace:
		if ctx.kernel.globalInit == nil {
			return nil
		}
		mntns := ctx.kernel.GlobalInit().Leader().MountNamespace()
		mntns.IncRef()
		return mntns
	case devutil.CtxDevGoferClient:
		return ctx.kernel.getDevGoferClient(ctx.args.ContainerID)
	case inet.CtxStack:
		return ctx.kernel.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.kernel.RealtimeClock()
	case limits.CtxLimits:
		return ctx.args.Limits
	case pgalloc.CtxMemoryCgroupID:
		return ctx.getMemoryCgroupID()
	case pgalloc.CtxMemoryFile:
		return ctx.kernel.mf
	case platform.CtxPlatform:
		return ctx.kernel
	case uniqueid.CtxGlobalUniqueID:
		return ctx.kernel.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.kernel
	case uniqueid.CtxInotifyCookie:
		return ctx.kernel.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.kernel
	default:
		return nil
	}
}

func (ctx *createProcessContext) getMemoryCgroupID() uint32 {
	for cg := range ctx.args.InitialCgroups {
		for _, ctl := range cg.Controllers() {
			if ctl.Type() == CgroupControllerMemory {
				return cg.ID()
			}
		}
	}
	return InvalidCgroupID
}

// CreateProcess creates a new task in a new thread group with the given
// options. The new task has no parent and is in the root PID namespace.
//
// If k.Start() has already been called, then the created process must be
// started by calling kernel.StartProcess(tg).
//
// If k.Start() has not yet been called, then the created task will begin
// running when k.Start() is called.
//
// CreateProcess has no analogue in Linux; it is used to create the initial
// application task, as well as processes started by the control server.
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	log.Infof("EXEC: %v", args.Argv)

	ctx := args.NewContext(k)
	mntns := args.MountNamespace
	if mntns == nil {
		if k.globalInit == nil {
			return nil, 0, fmt.Errorf("mount namespace is nil")
		}
		// Add a reference to the namespace, which is transferred to the new process.
		mntns = k.globalInit.Leader().MountNamespace()
		mntns.IncRef()
	}
	// Get the root directory from the MountNamespace.
	root := mntns.Root(ctx)
	defer root.DecRef(ctx)

	// Grab the working directory.
	wd := root // Default.
	if args.WorkingDirectory != "" {
		pop := vfs.PathOperation{
			Root:               root,
			Start:              wd,
			Path:               fspath.Parse(args.WorkingDirectory),
			FollowFinalSymlink: true,
		}
		// NOTE(b/236028361): Do not set CheckSearchable flag to true.
		// Application is allowed to start with a working directory that it can
		// not access/search. This is consistent with Docker and VFS1. Runc
		// explicitly allows for this in 6ce2d63a5db6 ("libct/init_linux: retry
		// chdir to fix EPERM"). As described in the commit, runc unintentionally
		// allowed this behavior in a couple of releases and applications started
		// relying on it. So they decided to allow it for backward compatibility.
		var err error
		wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{})
		if err != nil {
			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
		}
		defer wd.DecRef(ctx)
	}
	fsContext := NewFSContext(root, wd, args.Umask)

	tg := k.NewThreadGroup(args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
	cu := cleanup.Make(func() {
		tg.Release(ctx)
	})
	defer cu.Clean()

	// Check which file to start from.
	switch {
	case args.Filename != "":
		// If a filename is given, take that.
		// Set File to nil so we resolve the path in LoadTaskImage.
		args.File = nil
	case args.File != nil:
		// If File is set, take the File provided directly.
		args.Filename = args.File.MappedName(ctx)
	default:
		// Otherwise look at Argv and see if the first argument is a valid path.
		if len(args.Argv) == 0 {
			return nil, 0, fmt.Errorf("no filename or command provided")
		}
		if !filepath.IsAbs(args.Argv[0]) {
			return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
		}
		args.Filename = args.Argv[0]
	}

	// Create a fresh task context.
	remainingTraversals := args.MaxSymlinkTraversals
	loadArgs := loader.LoadArgs{
		Root:                root,
		WorkingDir:          wd,
		RemainingTraversals: &remainingTraversals,
		ResolveFinal:        true,
		Filename:            args.Filename,
		File:                args.File,
		CloseOnExec:         false,
		Argv:                args.Argv,
		Envv:                args.Envv,
		Features:            k.featureSet,
	}

	image, se := k.LoadTaskImage(ctx, loadArgs)
	if se != nil {
		return nil, 0, errors.New(se.String())
	}
	var capData auth.VfsCapData
	if len(image.FileCaps()) != 0 {
		var err error
		capData, err = auth.VfsCapDataOf([]byte(image.FileCaps()))
		if err != nil {
			return nil, 0, err
		}
	}
	creds, err := auth.CapsFromVfsCaps(capData, args.Credentials)
	if err != nil {
		return nil, 0, err
	}
	args.FDTable.IncRef()

	// Create the task.
	config := &TaskConfig{
		Kernel:           k,
		ThreadGroup:      tg,
		TaskImage:        image,
		FSContext:        fsContext,
		FDTable:          args.FDTable,
		Credentials:      creds,
		NetworkNamespace: k.RootNetworkNamespace(),
		AllowedCPUMask:   sched.NewFullCPUSet(k.applicationCores),
		UTSNamespace:     args.UTSNamespace,
		IPCNamespace:     args.IPCNamespace,
		MountNamespace:   mntns,
		ContainerID:      args.ContainerID,
		InitialCgroups:   args.InitialCgroups,
		UserCounters:     k.GetUserCounters(args.Credentials.RealKUID),
		// A task with no parent starts out with no session keyring.
		SessionKeyring: nil,
	}
	config.UTSNamespace.IncRef()
	config.IPCNamespace.IncRef()
	config.NetworkNamespace.IncRef()
	t, err := k.tasks.NewTask(ctx, config)
	if err != nil {
		return nil, 0, err
	}
	t.traceExecEvent(image) // Simulate exec for tracing.

	// Success.
	cu.Release()
	tgid := k.tasks.Root.IDOfThreadGroup(tg)
	if k.globalInit == nil {
		k.globalInit = tg
	}
	return tg, tgid, nil
}

// StartProcess starts running a process that was created with CreateProcess.
func (k *Kernel) StartProcess(tg *ThreadGroup) {
	t := tg.Leader()
	tid := k.tasks.Root.IDOfTask(t)
	t.Start(tid)
}
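
// Editorial sketch (not part of the original file): creating and starting a
// process after k.Start() has already been called. The credentials creds, FD
// table fdTable, and container ID cid are assumed to exist; error handling is
// elided.
//
//	tg, tgid, err := k.CreateProcess(CreateProcessArgs{
//		Filename:     "/sbin/init",
//		Argv:         []string{"/sbin/init"},
//		Credentials:  creds,
//		FDTable:      fdTable,
//		Limits:       limits.NewLimitSet(),
//		UTSNamespace: k.RootUTSNamespace(),
//		IPCNamespace: k.RootIPCNamespace(),
//		PIDNamespace: k.RootPIDNamespace(),
//		ContainerID:  cid,
//	})
//	k.StartProcess(tg) // required since k.Start() already ran
//	log.Infof("started thread group %d", tgid)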

// Start starts execution of all tasks in k.
//
// Preconditions: Start may be called exactly once.
func (k *Kernel) Start() error {
	k.extMu.Lock()
	defer k.extMu.Unlock()

	if k.started {
		return fmt.Errorf("kernel already started")
	}

	k.started = true
	k.cpuClockTickTimer = time.NewTimer(linux.ClockTick)
	k.runningTasksMu.Lock()
	k.cpuClockTickerRunning = true
	k.runningTasksMu.Unlock()
	go k.runCPUClockTicker()
	// If k was restored by Kernel.LoadFrom, timers were stopped during
	// Kernel.SaveTo and need to be resumed. If k was freshly initialized by
	// Kernel.Init, this is a no-op.
	k.resumeTimeLocked(k.SupervisorContext())
	k.tasks.mu.RLock()
	ts := make([]*Task, 0, len(k.tasks.Root.tids))
	for t := range k.tasks.Root.tids {
		ts = append(ts, t)
	}
	k.tasks.mu.RUnlock()
	// Start task goroutines.
	// NOTE(b/235349091): We don't actually need the TaskSet mutex, we just
	// need to make sure we only call t.Start() once for each task. Holding the
	// mutex for each task start may cause a nested locking error.
	for _, t := range ts {
		t.Start(t.ThreadID())
	}
	return nil
}

// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
// Preconditions:
//   - Any task goroutines running in k must be stopped.
//   - k.extMu must be locked.
func (k *Kernel) pauseTimeLocked(ctx context.Context) {
	// Since all task goroutines have been stopped by precondition, the CPU clock
	// ticker should stop on its own; wait for it to do so, waking it up from
	// sleeping between ticks if necessary.
	k.runningTasksMu.Lock()
	for k.cpuClockTickerRunning {
		select {
		case k.cpuClockTickerWakeCh <- struct{}{}:
		default:
		}
		k.cpuClockTickerStopCond.Wait()
	}
	k.runningTasksMu.Unlock()

	// By precondition, nothing else can be interacting with PIDNamespace.tids
	// or FDTable.files, so we can iterate them without synchronization. (We
	// can't hold the TaskSet mutex when pausing thread group timers because
	// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
	// mutex, while holding the Timer mutex.)
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Pause()
			for _, it := range t.tg.timers {
				it.PauseTimer()
			}
		}
		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
		// but ktime.Timer.Pause is idempotent so this is harmless.
		if t.fdTable != nil {
			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
					tfd.PauseTimer()
				}
			})
		}
	}
	k.timekeeper.PauseUpdates()
}

// resumeTimeLocked resumes all Timers and Timekeeper updates. If
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
// Preconditions:
//   - Any task goroutines running in k must be stopped.
//   - k.extMu must be locked.
func (k *Kernel) resumeTimeLocked(ctx context.Context) {
	// The CPU clock ticker will automatically resume as task goroutines resume
	// execution.

	k.timekeeper.ResumeUpdates()
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Resume()
			for _, it := range t.tg.timers {
				it.ResumeTimer()
			}
		}
		if t.fdTable != nil {
			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
					tfd.ResumeTimer()
				}
			})
		}
	}
}

func (k *Kernel) incRunningTasks() {
	for {
		tasks := k.runningTasks.Load()
		if tasks != 0 {
			// Standard case. Simply increment.
			if !k.runningTasks.CompareAndSwap(tasks, tasks+1) {
				continue
			}
			return
		}

		// Transition from 0 -> 1.
		k.runningTasksMu.Lock()
		if k.runningTasks.Load() != 0 {
			// Raced with another transition and lost.
			k.runningTasks.Add(1)
			k.runningTasksMu.Unlock()
			return
		}
		if !k.cpuClockTickerRunning {
			select {
			case tickTime := <-k.cpuClockTickTimer.C:
				// Rearm the timer since we consumed the wakeup. Estimate how much time
				// remains on the current tick so that periodic workloads interact with
				// the (periodic) CPU clock ticker in the same way that they would
				// without the optimization of putting the ticker to sleep.
				missedNS := time.Since(tickTime).Nanoseconds()
				missedTicks := missedNS / linux.ClockTick.Nanoseconds()
				thisTickNS := missedNS - missedTicks*linux.ClockTick.Nanoseconds()
				k.cpuClockTickTimer.Reset(time.Duration(linux.ClockTick.Nanoseconds() - thisTickNS))
				// Increment k.cpuClock on the CPU clock ticker goroutine's behalf.
				// (Whole missed ticks don't matter, and adding them to k.cpuClock will
				// just confuse the watchdog.) At the time the tick occurred, all task
				// goroutines were asleep, so there's nothing else to do. This ensures
				// that our caller (Task.accountTaskGoroutineLeave()) records an
				// updated k.cpuClock in Task.gosched.Timestamp, so that it's correctly
				// accounted as having resumed execution in the sentry during this tick
				// instead of at the end of the previous one.
				k.cpuClock.Add(1)
			default:
			}
			// We are transitioning from idle to active. Set k.cpuClockTickerRunning
			// = true here so that if we transition to idle and then active again
			// before the CPU clock ticker goroutine has a chance to run, the first
			// call to k.incRunningTasks() at the end of that cycle does not try to
			// steal k.cpuClockTickTimer.C again, as this would allow workloads that
			// rapidly cycle between idle and active to starve the CPU clock ticker
			// of chances to observe task goroutines in a running state and account
			// their CPU usage.
			k.cpuClockTickerRunning = true
			k.runningTasksCond.Signal()
		}
		// This store must happen after the increment of k.cpuClock above to ensure
		// that concurrent calls to Task.accountTaskGoroutineLeave() also observe
		// the updated k.cpuClock.
		k.runningTasks.Store(1)
		k.runningTasksMu.Unlock()
		return
	}
}

func (k *Kernel) decRunningTasks() {
	tasks := k.runningTasks.Add(-1)
	if tasks < 0 {
		panic(fmt.Sprintf("Invalid running count %d", tasks))
	}

	// Nothing to do. The next CPU clock tick will disable the timer if
	// there is still nothing running. This provides approximately one tick
	// of slack in which we can switch back and forth between idle and
	// active without an expensive transition.
}

// WaitExited blocks until all tasks in k have exited.
func (k *Kernel) WaitExited() {
	k.tasks.liveGoroutines.Wait()
}

// Kill requests that all tasks in k immediately exit as if group exiting with
// status ws. Kill does not wait for tasks to exit.
func (k *Kernel) Kill(ws linux.WaitStatus) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.Kill(ws)
}

// Pause requests that all tasks in k temporarily stop executing, and blocks
// until all tasks and asynchronous I/O operations in k have stopped. Multiple
// calls to Pause nest and require an equal number of calls to Unpause to
// resume execution.
func (k *Kernel) Pause() {
	k.extMu.Lock()
	k.tasks.BeginExternalStop()
	k.extMu.Unlock()
	k.tasks.runningGoroutines.Wait()
	k.tasks.aioGoroutines.Wait()
}

// ReceiveTaskStates receives full states for all tasks.
func (k *Kernel) ReceiveTaskStates() {
	k.extMu.Lock()
	k.tasks.PullFullState()
	k.extMu.Unlock()
}

// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.EndExternalStop()
}

// SendExternalSignal injects a signal into the kernel.
//
// context is used only for debugging to describe how the signal was received.
//
// Preconditions: Kernel must have an init process.
func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.sendExternalSignal(info, context)
}

// SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup.
//
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return tg.SendSignal(info)
}
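
// Editorial sketch (not part of the original file): delivering SIGTERM to a
// specific thread group from outside the sandbox. Only the Signo field is
// shown; tg is assumed to have been looked up elsewhere.
//
//	info := &linux.SignalInfo{Signo: int32(linux.SIGTERM)}
//	if err := k.SendExternalSignalThreadGroup(tg, info); err != nil {
//		log.Warningf("signal delivery failed: %v", err)
//	}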
  1322  
  1323  // SendExternalSignalProcessGroup sends a signal to all ThreadGroups in the
  1324  // given process group.
  1325  //
  1326  // This function doesn't skip signals like SendExternalSignal does.
  1327  func (k *Kernel) SendExternalSignalProcessGroup(pg *ProcessGroup, info *linux.SignalInfo) error {
  1328  	k.extMu.Lock()
  1329  	defer k.extMu.Unlock()
  1330  	// If anything goes wrong, we'll return the error, but still try our
  1331  	// best to deliver to other processes in the group.
  1332  	var firstErr error
  1333  	for _, tg := range k.TaskSet().Root.ThreadGroups() {
  1334  		if tg.ProcessGroup() != pg {
  1335  			continue
  1336  		}
  1337  		if err := tg.SendSignal(info); err != nil && firstErr == nil {
  1338  			firstErr = err
  1339  		}
  1340  	}
  1341  	return firstErr
  1342  }
  1343  
  1344  // SendContainerSignal sends the given signal to all processes inside the
  1345  // namespace that match the given container ID.
  1346  func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error {
  1347  	k.extMu.Lock()
  1348  	defer k.extMu.Unlock()
  1349  	k.tasks.mu.RLock()
  1350  	defer k.tasks.mu.RUnlock()
  1351  
  1352  	var lastErr error
  1353  	for tg := range k.tasks.Root.tgids {
  1354  		if tg.leader.ContainerID() == cid {
  1355  			tg.signalHandlers.mu.Lock()
  1356  			infoCopy := *info
  1357  			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
  1358  				lastErr = err
  1359  			}
  1360  			tg.signalHandlers.mu.Unlock()
  1361  		}
  1362  	}
  1363  	return lastErr
  1364  }
  1365  
  1366  // RebuildTraceContexts rebuilds the trace context for all tasks.
  1367  //
  1368  // Unfortunately, if these are built while tracing is not enabled, then we will
  1369  // not have meaningful trace data. Rebuilding here ensures that we can do so
  1370  // after tracing has been enabled.
  1371  func (k *Kernel) RebuildTraceContexts() {
  1372  	// We need to pause all task goroutines because Task.rebuildTraceContext()
  1373  	// replaces Task.traceContext and Task.traceTask, which are
  1374  	// task-goroutine-exclusive (i.e. the task goroutine assumes that it can
  1375  	// access them without synchronization) for performance.
  1376  	k.Pause()
  1377  	defer k.Unpause()
  1378  
  1379  	k.extMu.Lock()
  1380  	defer k.extMu.Unlock()
  1381  	k.tasks.mu.RLock()
  1382  	defer k.tasks.mu.RUnlock()
  1383  
  1384  	for t, tid := range k.tasks.Root.tids {
  1385  		t.rebuildTraceContext(tid)
  1386  	}
  1387  }
  1388  
  1389  // FeatureSet returns the FeatureSet.
  1390  func (k *Kernel) FeatureSet() cpuid.FeatureSet {
  1391  	return k.featureSet
  1392  }
  1393  
  1394  // Timekeeper returns the Timekeeper.
  1395  func (k *Kernel) Timekeeper() *Timekeeper {
  1396  	return k.timekeeper
  1397  }
  1398  
  1399  // TaskSet returns the TaskSet.
  1400  func (k *Kernel) TaskSet() *TaskSet {
  1401  	return k.tasks
  1402  }
  1403  
  1404  // RootUserNamespace returns the root UserNamespace.
  1405  func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
  1406  	return k.rootUserNamespace
  1407  }
  1408  
  1409  // RootUTSNamespace returns the root UTSNamespace.
  1410  func (k *Kernel) RootUTSNamespace() *UTSNamespace {
  1411  	return k.rootUTSNamespace
  1412  }
  1413  
  1414  // RootIPCNamespace returns the root IPCNamespace.
  1415  func (k *Kernel) RootIPCNamespace() *IPCNamespace {
  1416  	return k.rootIPCNamespace
  1417  }
  1418  
  1419  // RootPIDNamespace returns the root PIDNamespace.
  1420  func (k *Kernel) RootPIDNamespace() *PIDNamespace {
  1421  	return k.tasks.Root
  1422  }
  1423  
  1424  // RootNetworkNamespace returns the root network namespace, always non-nil.
  1425  func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
  1426  	return k.rootNetworkNamespace
  1427  }
  1428  
  1429  // GlobalInit returns the thread group with ID 1 in the root PID namespace, or
  1430  // nil if no such thread group exists. GlobalInit may return a thread group
  1431  // containing no tasks if the thread group has already exited.
  1432  func (k *Kernel) GlobalInit() *ThreadGroup {
  1433  	k.extMu.Lock()
  1434  	defer k.extMu.Unlock()
  1435  	return k.globalInit
  1436  }
  1437  
  1438  // TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace.
  1439  func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) {
  1440  	k.globalInit = tg
  1441  }
  1442  
  1443  // ApplicationCores returns the number of CPUs visible to sandboxed
  1444  // applications.
  1445  func (k *Kernel) ApplicationCores() uint {
  1446  	return k.applicationCores
  1447  }
  1448  
  1449  // RealtimeClock returns the application CLOCK_REALTIME clock.
  1450  func (k *Kernel) RealtimeClock() ktime.Clock {
  1451  	return k.timekeeper.realtimeClock
  1452  }
  1453  
  1454  // MonotonicClock returns the application CLOCK_MONOTONIC clock.
  1455  func (k *Kernel) MonotonicClock() ktime.Clock {
  1456  	return k.timekeeper.monotonicClock
  1457  }
  1458  
  1459  // CPUClockNow returns the current value of k.cpuClock.
  1460  func (k *Kernel) CPUClockNow() uint64 {
  1461  	return k.cpuClock.Load()
  1462  }
  1463  
  1464  // Syslog returns the syslog.
  1465  func (k *Kernel) Syslog() *syslog {
  1466  	return &k.syslog
  1467  }
  1468  
  1469  // GenerateInotifyCookie generates a unique inotify event cookie.
  1470  //
  1471  // Returned values may overlap with previously returned values if the value
  1472  // space is exhausted. 0 is not a valid cookie value; all other values
  1473  // representable in a uint32 are allowed.
  1474  func (k *Kernel) GenerateInotifyCookie() uint32 {
  1475  	id := k.nextInotifyCookie.Add(1)
  1476  	// Wrap-around is explicitly allowed for inotify event cookies.
  1477  	if id == 0 {
  1478  		id = k.nextInotifyCookie.Add(1)
  1479  	}
  1480  	return id
  1481  }
  1482  
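        // Worked example of the wrap-around above: if nextInotifyCookie
        // currently holds math.MaxUint32, the first Add(1) wraps to 0, which
        // is reserved, so the second Add(1) produces the returned value:
        //
        //	// nextInotifyCookie == math.MaxUint32
        //	c := k.GenerateInotifyCookie() // c == 1; 0 is never returned
        //
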
  1483  // NetlinkPorts returns the netlink port manager.
  1484  func (k *Kernel) NetlinkPorts() *port.Manager {
  1485  	return k.netlinkPorts
  1486  }
  1487  
  1488  var (
  1489  	errSaved     = errors.New("sandbox has been successfully saved")
  1490  	errAutoSaved = errors.New("sandbox has been successfully auto-saved")
  1491  )
  1492  
  1493  // SaveStatus returns the sandbox save status. If it was saved successfully,
  1494  // autosaved indicates whether save was triggered by autosave. If it was not
  1495  // saved successfully, err indicates the sandbox error that caused the kernel to
  1496  // exit during save.
  1497  func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
  1498  	k.extMu.Lock()
  1499  	defer k.extMu.Unlock()
  1500  	switch k.saveStatus {
  1501  	case nil:
  1502  		return false, false, nil
  1503  	case errSaved:
  1504  		return true, false, nil
  1505  	case errAutoSaved:
  1506  		return true, true, nil
  1507  	default:
  1508  		return false, false, k.saveStatus
  1509  	}
  1510  }
  1511  
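        // Decoding sketch for a hypothetical caller:
        //
        //	saved, autosaved, err := k.SaveStatus()
        //	switch {
        //	case err != nil:         // save failed; err is the sandbox error
        //	case saved && autosaved: // autosave completed successfully
        //	case saved:              // explicit save completed successfully
        //	default:                 // no save has completed yet
        //	}
        //
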
  1512  // SetSaveSuccess sets the flag indicating that save completed successfully, if
  1513  // no status was already set.
  1514  func (k *Kernel) SetSaveSuccess(autosave bool) {
  1515  	k.extMu.Lock()
  1516  	defer k.extMu.Unlock()
  1517  	if k.saveStatus == nil {
  1518  		if autosave {
  1519  			k.saveStatus = errAutoSaved
  1520  		} else {
  1521  			k.saveStatus = errSaved
  1522  		}
  1523  	}
  1524  }
  1525  
  1526  // SetSaveError sets the sandbox error that caused the kernel to exit during
  1527  // save, if one is not already set.
  1528  func (k *Kernel) SetSaveError(err error) {
  1529  	k.extMu.Lock()
  1530  	defer k.extMu.Unlock()
  1531  	if k.saveStatus == nil {
  1532  		k.saveStatus = err
  1533  	}
  1534  }
  1535  
  1536  // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
  1537  // LoadFrom.
  1538  func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
  1539  	k.mf = mf
  1540  }
  1541  
  1542  // MemoryFile returns the MemoryFile that provides application memory.
  1543  func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
  1544  	return k.mf
  1545  }
  1546  
  1547  // SupervisorContext returns a Context with maximum privileges in k. It should
  1548  // only be used by goroutines outside the control of the emulated kernel
  1549  // defined by k.
  1550  //
  1551  // Callers are responsible for ensuring that the returned Context is not used
  1552  // concurrently with changes to the Kernel.
  1553  func (k *Kernel) SupervisorContext() context.Context {
  1554  	return &supervisorContext{
  1555  		Kernel: k,
  1556  		Logger: log.Log(),
  1557  	}
  1558  }
  1559  
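        // Usage sketch, assuming a control-plane goroutine outside any task:
        // the returned context resolves the keys handled by Value below, so
        // helpers such as auth.CredentialsFromContext yield global root:
        //
        //	ctx := k.SupervisorContext()
        //	creds := auth.CredentialsFromContext(ctx) // root credentials
        //
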
  1560  // SocketRecord represents a socket recorded in Kernel.sockets.
  1561  //
  1562  // +stateify savable
  1563  type SocketRecord struct {
  1564  	k    *Kernel
  1565  	Sock *vfs.FileDescription
  1566  	ID   uint64 // Socket table entry number.
  1567  }
  1568  
  1569  // RecordSocket adds a socket to the system-wide socket table for
  1570  // tracking.
  1571  //
  1572  // Precondition: Caller must hold a reference to sock.
  1573  //
  1574  // Note that the socket table will not hold a reference on the
  1575  // vfs.FileDescription.
  1576  func (k *Kernel) RecordSocket(sock *vfs.FileDescription) {
  1577  	k.extMu.Lock()
  1578  	if _, ok := k.sockets[sock]; ok {
  1579  		panic(fmt.Sprintf("Socket %p added twice", sock))
  1580  	}
  1581  	id := k.nextSocketRecord
  1582  	k.nextSocketRecord++
  1583  	s := &SocketRecord{
  1584  		k:    k,
  1585  		ID:   id,
  1586  		Sock: sock,
  1587  	}
  1588  	k.sockets[sock] = s
  1589  	k.extMu.Unlock()
  1590  }
  1591  
  1592  // DeleteSocket removes a socket from the system-wide socket table.
  1593  func (k *Kernel) DeleteSocket(sock *vfs.FileDescription) {
  1594  	k.extMu.Lock()
  1595  	delete(k.sockets, sock)
  1596  	k.extMu.Unlock()
  1597  }
  1598  
  1599  // ListSockets returns a snapshot of all sockets.
  1600  //
  1601  // Callers of ListSockets() should use SocketRecord.Sock.TryIncRef()
  1602  // to get a reference on a socket in the table.
  1603  func (k *Kernel) ListSockets() []*SocketRecord {
  1604  	k.extMu.Lock()
  1605  	var socks []*SocketRecord
  1606  	for _, s := range k.sockets {
  1607  		socks = append(socks, s)
  1608  	}
  1609  	k.extMu.Unlock()
  1610  	return socks
  1611  }
  1612  
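        // Iteration sketch (ctx assumed): because the table holds no
        // references (see RecordSocket above), readers must TryIncRef each
        // entry and skip sockets that are concurrently being destroyed:
        //
        //	for _, sr := range k.ListSockets() {
        //		if !sr.Sock.TryIncRef() {
        //			continue // socket is being released
        //		}
        //		// ... inspect sr.Sock ...
        //		sr.Sock.DecRef(ctx)
        //	}
        //
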
  1613  // supervisorContext is a privileged context.
  1614  type supervisorContext struct {
  1615  	context.NoTask
  1616  	log.Logger
  1617  	*Kernel
  1618  }
  1619  
  1620  // Deadline implements context.Context.Deadline.
  1621  func (*Kernel) Deadline() (time.Time, bool) {
  1622  	return time.Time{}, false
  1623  }
  1624  
  1625  // Done implements context.Context.Done.
  1626  func (*Kernel) Done() <-chan struct{} {
  1627  	return nil
  1628  }
  1629  
  1630  // Err implements context.Context.Err.
  1631  func (*Kernel) Err() error {
  1632  	return nil
  1633  }
  1634  
  1635  // Value implements context.Context.Value.
  1636  func (ctx *supervisorContext) Value(key any) any {
  1637  	switch key {
  1638  	case CtxCanTrace:
  1639  		// The supervisor context can trace anything. (None of
  1640  		// supervisorContext's users are expected to invoke ptrace, but ptrace
  1641  		// permissions are required for certain file accesses.)
  1642  		return func(*Task, bool) bool { return true }
  1643  	case CtxKernel:
  1644  		return ctx.Kernel
  1645  	case CtxPIDNamespace:
  1646  		return ctx.Kernel.tasks.Root
  1647  	case CtxUTSNamespace:
  1648  		utsns := ctx.Kernel.rootUTSNamespace
  1649  		utsns.IncRef()
  1650  		return utsns
  1651  	case ipc.CtxIPCNamespace:
  1652  		ipcns := ctx.Kernel.rootIPCNamespace
  1653  		ipcns.IncRef()
  1654  		return ipcns
  1655  	case auth.CtxCredentials:
  1656  		// The supervisor context is global root.
  1657  		return auth.NewRootCredentials(ctx.Kernel.rootUserNamespace)
  1658  	case vfs.CtxRoot:
  1659  		if ctx.Kernel.globalInit == nil || ctx.Kernel.globalInit.Leader() == nil {
  1660  			return vfs.VirtualDentry{}
  1661  		}
  1662  		root := ctx.Kernel.GlobalInit().Leader().MountNamespace().Root(ctx)
  1663  		return root
  1664  	case vfs.CtxMountNamespace:
  1665  		if ctx.Kernel.globalInit == nil || ctx.Kernel.globalInit.Leader() == nil {
  1666  			return nil
  1667  		}
  1668  		mntns := ctx.Kernel.GlobalInit().Leader().MountNamespace()
  1669  		mntns.IncRef()
  1670  		return mntns
  1671  	case inet.CtxStack:
  1672  		return ctx.Kernel.RootNetworkNamespace().Stack()
  1673  	case ktime.CtxRealtimeClock:
  1674  		return ctx.Kernel.RealtimeClock()
  1675  	case limits.CtxLimits:
  1676  		// No limits apply.
  1677  		return limits.NewLimitSet()
  1678  	case pgalloc.CtxMemoryFile:
  1679  		return ctx.Kernel.mf
  1680  	case platform.CtxPlatform:
  1681  		return ctx.Kernel
  1682  	case uniqueid.CtxGlobalUniqueID:
  1683  		return ctx.Kernel.UniqueID()
  1684  	case uniqueid.CtxGlobalUniqueIDProvider:
  1685  		return ctx.Kernel
  1686  	case uniqueid.CtxInotifyCookie:
  1687  		return ctx.Kernel.GenerateInotifyCookie()
  1688  	case unimpl.CtxEvents:
  1689  		return ctx.Kernel
  1690  	case cpuid.CtxFeatureSet:
  1691  		return ctx.Kernel.featureSet
  1692  	default:
  1693  		return nil
  1694  	}
  1695  }
  1696  
  1697  // Rate limits for the number of unimplemented syscall events.
  1698  const (
  1699  	unimplementedSyscallsMaxRate = 100  // events per second
  1700  	unimplementedSyscallBurst    = 1000 // events
  1701  )
  1702  
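        // With these values, an initial burst of up to 1000 events can be
        // emitted at once; a sustained flood is then reported at roughly 100
        // events per second, with the excess dropped (assuming token-bucket
        // semantics in the rate-limited emitter).
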
  1703  // EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
  1704  // channel.
  1705  func (k *Kernel) EmitUnimplementedEvent(ctx context.Context, sysno uintptr) {
  1706  	k.unimplementedSyscallEmitterOnce.Do(func() {
  1707  		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
  1708  	})
  1709  
  1710  	t := TaskFromContext(ctx)
  1711  	IncrementUnimplementedSyscallCounter(sysno)
  1712  	_, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
  1713  		Tid:       int32(t.ThreadID()),
  1714  		Registers: t.Arch().StateData().Proto(),
  1715  	})
  1716  }
  1717  
  1718  // VFS returns the virtual filesystem for the kernel.
  1719  func (k *Kernel) VFS() *vfs.VirtualFilesystem {
  1720  	return &k.vfs
  1721  }
  1722  
  1723  // SetHostMount sets the hostfs mount.
  1724  func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
  1725  	if k.hostMount != nil {
  1726  		panic("Kernel.hostMount cannot be set more than once")
  1727  	}
  1728  	k.hostMount = mnt
  1729  }
  1730  
  1731  // HostMount returns the hostfs mount.
  1732  func (k *Kernel) HostMount() *vfs.Mount {
  1733  	return k.hostMount
  1734  }
  1735  
  1736  // PipeMount returns the pipefs mount.
  1737  func (k *Kernel) PipeMount() *vfs.Mount {
  1738  	return k.pipeMount
  1739  }
  1740  
  1741  // GetNamespaceInode returns a new nsfs inode which serves as a reference counter for the namespace.
  1742  func (k *Kernel) GetNamespaceInode(ctx context.Context, ns vfs.Namespace) refs.TryRefCounter {
  1743  	return nsfs.NewInode(ctx, k.nsfsMount, ns)
  1744  }
  1745  
  1746  // ShmMount returns the tmpfs mount.
  1747  func (k *Kernel) ShmMount() *vfs.Mount {
  1748  	return k.shmMount
  1749  }
  1750  
  1751  // SocketMount returns the sockfs mount.
  1752  func (k *Kernel) SocketMount() *vfs.Mount {
  1753  	return k.socketMount
  1754  }
  1755  
  1756  // CgroupRegistry returns the cgroup registry.
  1757  func (k *Kernel) CgroupRegistry() *CgroupRegistry {
  1758  	return k.cgroupRegistry
  1759  }
  1760  
  1761  // AddCgroupMount adds a cgroup mount to the cgroupMountsMap. These cgroup
  1762  // mounts are created during creation of the root container process, and
  1763  // their reference ownership is transferred to the kernel.
  1764  func (k *Kernel) AddCgroupMount(ctl string, mnt *CgroupMount) {
  1765  	k.cgroupMountsMapMu.Lock()
  1766  	defer k.cgroupMountsMapMu.Unlock()
  1767  
  1768  	if k.cgroupMountsMap == nil {
  1769  		k.cgroupMountsMap = make(map[string]*CgroupMount)
  1770  	}
  1771  	k.cgroupMountsMap[ctl] = mnt
  1772  }
  1773  
  1774  // GetCgroupMount returns the cgroup mount for the given cgroup controller.
  1775  func (k *Kernel) GetCgroupMount(ctl string) *CgroupMount {
  1776  	k.cgroupMountsMapMu.Lock()
  1777  	defer k.cgroupMountsMapMu.Unlock()
  1778  
  1779  	return k.cgroupMountsMap[ctl]
  1780  }
  1781  
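        // Pairing sketch (hypothetical "memory" controller): mounts are
        // registered once while the root container is created and looked up
        // later by controller name:
        //
        //	k.AddCgroupMount("memory", mnt) // ownership passes to k
        //	...
        //	if m := k.GetCgroupMount("memory"); m != nil {
        //		// use m.Mount / m.Root / m.Fs
        //	}
        //
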
  1782  // releaseCgroupMounts releases the cgroup mounts.
  1783  func (k *Kernel) releaseCgroupMounts(ctx context.Context) {
  1784  	k.cgroupMountsMapMu.Lock()
  1785  	defer k.cgroupMountsMapMu.Unlock()
  1786  
  1787  	for _, m := range k.cgroupMountsMap {
  1788  		m.Mount.DecRef(ctx)
  1789  		m.Root.DecRef(ctx)
  1790  		m.Fs.DecRef(ctx)
  1791  	}
  1792  }
  1793  
  1794  // Release releases resources owned by k.
  1795  //
  1796  // Precondition: This should only be called after the kernel is fully
  1797  // initialized, e.g. after k.Start() has been called.
  1798  func (k *Kernel) Release() {
  1799  	ctx := k.SupervisorContext()
  1800  	k.releaseCgroupMounts(ctx)
  1801  	k.hostMount.DecRef(ctx)
  1802  	k.pipeMount.DecRef(ctx)
  1803  	k.nsfsMount.DecRef(ctx)
  1804  	k.shmMount.DecRef(ctx)
  1805  	k.socketMount.DecRef(ctx)
  1806  	k.vfs.Release(ctx)
  1807  	k.timekeeper.Destroy()
  1808  	k.vdso.Release(ctx)
  1809  	k.RootNetworkNamespace().DecRef(ctx)
  1810  	k.rootIPCNamespace.DecRef(ctx)
  1811  	k.rootUTSNamespace.DecRef(ctx)
  1812  	k.cleanupDevGofers()
  1813  }
  1814  
  1815  // PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
  1816  // hierarchy.
  1817  //
  1818  // Precondition: root must be a new cgroup with no tasks. This implies the
  1819  // controllers for root are also new and currently manage no task, which in turn
  1820  // implies the new cgroup can be populated without migrating tasks between
  1821  // cgroups.
  1822  func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) {
  1823  	k.tasks.mu.RLock()
  1824  	k.tasks.forEachTaskLocked(func(t *Task) {
  1825  		if t.exitState != TaskExitNone {
  1826  			return
  1827  		}
  1828  		t.mu.Lock()
  1829  		// A task may already be in the cgroup if it was created after the
  1830  		// cgroup hierarchy was registered.
  1831  		t.enterCgroupIfNotYetLocked(root)
  1832  		t.mu.Unlock()
  1833  	})
  1834  	k.tasks.mu.RUnlock()
  1835  }
  1836  
  1837  // ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the
  1838  // hierarchy with the provided id.  This is intended for use during hierarchy
  1839  // teardown, as otherwise the tasks would be orphaned w.r.t. some controllers.
  1840  func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) {
  1841  	var releasedCGs []Cgroup
  1842  
  1843  	k.tasks.mu.RLock()
  1844  	// We'll have one cgroup per hierarchy per task.
  1845  	releasedCGs = make([]Cgroup, 0, len(k.tasks.Root.tids))
  1846  	k.tasks.forEachTaskLocked(func(t *Task) {
  1847  		if t.exitState != TaskExitNone {
  1848  			return
  1849  		}
  1850  		t.mu.Lock()
  1851  		for cg := range t.cgroups {
  1852  			if cg.HierarchyID() == hid {
  1853  				cg.Leave(t)
  1854  				t.ResetMemCgIDFromCgroup(cg)
  1855  				delete(t.cgroups, cg)
  1856  				releasedCGs = append(releasedCGs, cg)
  1857  				// A task can't be part of multiple cgroups from the same
  1858  				// hierarchy, so we can skip checking the rest once we find a
  1859  				// match.
  1860  				break
  1861  			}
  1862  		}
  1863  		t.mu.Unlock()
  1864  	})
  1865  	k.tasks.mu.RUnlock()
  1866  
  1867  	for _, c := range releasedCGs {
  1868  		c.decRef()
  1869  	}
  1870  }
  1871  
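        // Note on the two-phase structure above: releasedCGs is drained only
        // after tasks.mu is dropped, presumably because Cgroup.decRef can
        // trigger release work that should not run under TaskSet.mu.
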
  1872  // ReplaceFSContextRoots updates root and cwd to `newRoot` in the FSContext
  1873  // across all tasks whose old root or cwd were `oldRoot`.
  1874  func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualDentry, newRoot vfs.VirtualDentry) {
  1875  	k.tasks.mu.RLock()
  1876  	oldRootDecRefs := 0
  1877  	k.tasks.forEachTaskLocked(func(t *Task) {
  1878  		t.mu.Lock()
  1879  		defer t.mu.Unlock()
  1880  		if fsc := t.fsContext; fsc != nil {
  1881  			fsc.mu.Lock()
  1882  			defer fsc.mu.Unlock()
  1883  			if fsc.root == oldRoot {
  1884  				newRoot.IncRef()
  1885  				oldRootDecRefs++
  1886  				fsc.root = newRoot
  1887  			}
  1888  			if fsc.cwd == oldRoot {
  1889  				newRoot.IncRef()
  1890  				oldRootDecRefs++
  1891  				fsc.cwd = newRoot
  1892  			}
  1893  		}
  1894  	})
  1895  	k.tasks.mu.RUnlock()
  1896  	for i := 0; i < oldRootDecRefs; i++ {
  1897  		oldRoot.DecRef(ctx)
  1898  	}
  1899  }
  1900  
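        // Reference-count sketch for the loop above: if three tasks had
        // root == oldRoot and one of them also had cwd == oldRoot, then
        // newRoot.IncRef() runs four times and oldRootDecRefs == 4, so the
        // trailing loop releases exactly the references that oldRoot lost.
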
  1901  // GetUserCounters returns the user counters for the given KUID.
  1902  func (k *Kernel) GetUserCounters(uid auth.KUID) *UserCounters {
  1903  	k.userCountersMapMu.Lock()
  1904  	defer k.userCountersMapMu.Unlock()
  1905  
  1906  	if uc, ok := k.userCountersMap[uid]; ok {
  1907  		return uc
  1908  	}
  1909  
  1910  	uc := &UserCounters{}
  1911  	k.userCountersMap[uid] = uc
  1912  	return uc
  1913  }
  1914  
  1915  // AddDevGofer initializes the dev gofer connection and starts tracking it.
  1916  // It takes ownership of goferFD.
  1917  func (k *Kernel) AddDevGofer(cid string, goferFD int) error {
  1918  	client, err := devutil.NewGoferClient(k.SupervisorContext(), goferFD)
  1919  	if err != nil {
  1920  		return err
  1921  	}
  1922  
  1923  	k.devGofersMu.Lock()
  1924  	defer k.devGofersMu.Unlock()
  1925  	if k.devGofers == nil {
  1926  		k.devGofers = make(map[string]*devutil.GoferClient)
  1927  	}
  1928  	k.devGofers[cid] = client
  1929  	return nil
  1930  }
  1931  
  1932  // RemoveDevGofer closes the dev gofer connection, if one exists, and stops
  1933  // tracking it.
  1934  func (k *Kernel) RemoveDevGofer(cid string) {
  1935  	k.devGofersMu.Lock()
  1936  	defer k.devGofersMu.Unlock()
  1937  	client, ok := k.devGofers[cid]
  1938  	if !ok {
  1939  		return
  1940  	}
  1941  	client.Close()
  1942  	delete(k.devGofers, cid)
  1943  }
  1944  
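        // Lifecycle sketch (hypothetical cid and goferFD): AddDevGofer owns
        // goferFD from the moment it is called, and RemoveDevGofer closes
        // the tracked connection:
        //
        //	if err := k.AddDevGofer(cid, goferFD); err != nil {
        //		return err // goferFD is owned by AddDevGofer either way
        //	}
        //	defer k.RemoveDevGofer(cid) // closes and untracks the client
        //
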
        // getDevGoferClient returns the dev gofer client registered for the
        // given container ID, or nil if there is none.
  1945  func (k *Kernel) getDevGoferClient(cid string) *devutil.GoferClient {
  1946  	k.devGofersMu.Lock()
  1947  	defer k.devGofersMu.Unlock()
  1948  	return k.devGofers[cid]
  1949  }
  1950  
        // cleanupDevGofers closes and drops all registered dev gofer clients.
  1951  func (k *Kernel) cleanupDevGofers() {
  1952  	k.devGofersMu.Lock()
  1953  	defer k.devGofersMu.Unlock()
  1954  	for _, client := range k.devGofers {
  1955  		client.Close()
  1956  	}
  1957  	k.devGofers = nil
  1958  }