github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/kernel.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernel provides an emulation of the Linux kernel.
//
// See README.md for a detailed overview.
//
// Lock order (outermost locks must be taken first):
//
//	Kernel.extMu
//	ThreadGroup.timerMu
//	ktime.Timer.mu (for IntervalTimer) and Kernel.cpuClockMu
//	TaskSet.mu
//	SignalHandlers.mu
//	Task.mu
//	runningTasksMu
//
// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
// time requires locking all of their signal mutexes first.
package kernel

import (
	"errors"
	"fmt"
	"path/filepath"
	"time"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
	"github.com/MerlinKodo/gvisor/pkg/cleanup"
	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/cpuid"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/eventchannel"
	"github.com/MerlinKodo/gvisor/pkg/fspath"
	"github.com/MerlinKodo/gvisor/pkg/log"
	"github.com/MerlinKodo/gvisor/pkg/refs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/nsfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/pipefs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/sockfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/timerfd"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/tmpfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/hostcpu"
	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/futex"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/ipc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/sched"
	ktime "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time"
	"github.com/MerlinKodo/gvisor/pkg/sentry/limits"
	"github.com/MerlinKodo/gvisor/pkg/sentry/loader"
	"github.com/MerlinKodo/gvisor/pkg/sentry/mm"
	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/netlink/port"
	sentrytime "github.com/MerlinKodo/gvisor/pkg/sentry/time"
	"github.com/MerlinKodo/gvisor/pkg/sentry/unimpl"
	uspb "github.com/MerlinKodo/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
	"github.com/MerlinKodo/gvisor/pkg/sentry/uniqueid"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
	"github.com/MerlinKodo/gvisor/pkg/state"
	"github.com/MerlinKodo/gvisor/pkg/state/wire"
	"github.com/MerlinKodo/gvisor/pkg/sync"
	"github.com/MerlinKodo/gvisor/pkg/tcpip"
)
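
// As an illustration of the ordering rules above, a hypothetical helper that
// signals two thread groups must take TaskSet.mu exclusively before taking
// either SignalHandlers.mu (a sketch; ts, tg1, and tg2 are illustrative):
//
//	ts.mu.Lock()                 // TaskSet.mu, exclusive
//	tg1.signalHandlers.mu.Lock() // first SignalHandlers.mu
//	tg2.signalHandlers.mu.Lock() // second SignalHandlers.mu
//	// ... deliver signals ...
//	tg2.signalHandlers.mu.Unlock()
//	tg1.signalHandlers.mu.Unlock()
//	ts.mu.Unlock()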

// IOUringEnabled is set to true when IO_URING is enabled. Added as a global to
// allow easy access everywhere.
var IOUringEnabled = false

// userCounters is a set of user counters.
//
// +stateify savable
type userCounters struct {
	uid auth.KUID

	rlimitNProc atomicbitops.Uint64
}

// incRLimitNProc increments the rlimitNProc counter.
func (uc *userCounters) incRLimitNProc(ctx context.Context) error {
	lim := limits.FromContext(ctx).Get(limits.ProcessCount)
	creds := auth.CredentialsFromContext(ctx)
	nproc := uc.rlimitNProc.Add(1)
	if nproc > lim.Cur &&
		!creds.HasCapability(linux.CAP_SYS_ADMIN) &&
		!creds.HasCapability(linux.CAP_SYS_RESOURCE) {
		uc.rlimitNProc.Add(^uint64(0))
		return linuxerr.EAGAIN
	}
	return nil
}

// decRLimitNProc decrements the rlimitNProc counter.
func (uc *userCounters) decRLimitNProc() {
	uc.rlimitNProc.Add(^uint64(0))
}
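
// decRLimitNProc (and the failure path of incRLimitNProc) decrements by
// adding ^uint64(0): all bits set is 2^64-1, and unsigned addition is modular,
// so this subtracts 1. A minimal sketch of the idiom:
//
//	var n atomicbitops.Uint64
//	n.Add(1)          // n == 1
//	n.Add(^uint64(0)) // n == 0 again
//
// Note that incRLimitNProc increments first and undoes the increment on
// failure, so concurrent readers may briefly observe a value above the limit.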

// Kernel represents an emulated Linux kernel. It must be initialized by
// calling Init() or LoadFrom().
//
// +stateify savable
type Kernel struct {
	// extMu serializes external changes to the Kernel with calls to
	// Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
	// remains frozen for the duration of the call; it requires that the Kernel
	// is paused as a precondition, which ensures that none of the tasks
	// running within the Kernel can affect its state, but extMu is required to
	// ensure that concurrent users of the Kernel *outside* the Kernel's
	// control cannot affect its state by calling e.g.
	// Kernel.SendExternalSignal.)
	extMu sync.Mutex `state:"nosave"`

	// started is true if Start has been called. Unless otherwise specified,
	// all Kernel fields become immutable once started becomes true.
	started bool `state:"nosave"`

	// All of the following fields are immutable unless otherwise specified.

	// Platform is the platform that is used to execute tasks in the created
	// Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is
	// embedded anonymously (the same issue applies).
	platform.Platform `state:"nosave"`

	// mf provides application memory.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// See InitKernelArgs for the meaning of these fields.
	featureSet                  cpuid.FeatureSet
	timekeeper                  *Timekeeper
	tasks                       *TaskSet
	rootUserNamespace           *auth.UserNamespace
	rootNetworkNamespace        *inet.Namespace
	applicationCores            uint
	useHostCores                bool
	extraAuxv                   []arch.AuxEntry
	vdso                        *loader.VDSO
	rootUTSNamespace            *UTSNamespace
	rootIPCNamespace            *IPCNamespace
	rootAbstractSocketNamespace *AbstractSocketNamespace

	// futexes is the "root" futex.Manager, from which all others are forked.
	// This is necessary to ensure that shared futexes are coherent across all
	// tasks, including those created by CreateProcess.
	futexes *futex.Manager

	// globalInit is the thread group whose leader has ID 1 in the root PID
	// namespace. globalInit is stored separately so that it is accessible even
	// after all tasks in the thread group have exited, such that ID 1 is no
	// longer mapped.
	//
	// globalInit is mutable until it is assigned by the first successful call
	// to CreateProcess, and is protected by extMu.
	globalInit *ThreadGroup

	// syslog is the kernel log.
	syslog syslog

	runningTasksMu runningTasksMutex `state:"nosave"`

	// runningTasks is the total count of tasks currently in
	// TaskGoroutineRunningSys or TaskGoroutineRunningApp, i.e. tasks that are
	// not blocked or stopped.
	//
	// runningTasks must be accessed atomically. Increments from 0 to 1 are
	// further protected by runningTasksMu (see incRunningTasks).
	runningTasks atomicbitops.Int64

	// runningTasksCond is signaled when runningTasks is incremented from 0 to 1.
	//
	// Invariant: runningTasksCond.L == &runningTasksMu.
	runningTasksCond sync.Cond `state:"nosave"`

	// cpuClock is incremented every linux.ClockTick by a goroutine running
	// kernel.runCPUClockTicker() while runningTasks != 0.
	//
	// cpuClock is used to measure task CPU usage, since sampling monotonicClock
	// twice on every syscall turns out to be unreasonably expensive. This is
	// similar to how Linux does task CPU accounting on x86
	// (CONFIG_IRQ_TIME_ACCOUNTING), although Linux also uses scheduler timing
	// information to improve resolution
	// (kernel/sched/cputime.c:cputime_adjust()), which we can't do since
	// "preemptive" scheduling is managed by the Go runtime, which doesn't
	// provide this information.
	//
	// cpuClock is mutable, and is accessed using atomic memory operations.
	cpuClock atomicbitops.Uint64

	// cpuClockTickTimer drives increments of cpuClock.
	cpuClockTickTimer *time.Timer `state:"nosave"`

	// cpuClockMu is used to make increments of cpuClock, and updates of timers
	// based on cpuClock, atomic.
	cpuClockMu cpuClockMutex `state:"nosave"`

	// cpuClockTickerRunning is true if the goroutine that increments cpuClock is
	// running and false if it is blocked in runningTasksCond.Wait() or if it
	// never started.
	//
	// cpuClockTickerRunning is protected by runningTasksMu.
	cpuClockTickerRunning bool

	// cpuClockTickerWakeCh is sent to wake the goroutine that increments
	// cpuClock if it's sleeping between ticks.
	cpuClockTickerWakeCh chan struct{} `state:"nosave"`

	// cpuClockTickerStopCond is broadcast when cpuClockTickerRunning transitions
	// from true to false.
	//
	// Invariant: cpuClockTickerStopCond.L == &runningTasksMu.
	cpuClockTickerStopCond sync.Cond `state:"nosave"`

	// uniqueID is used to generate unique identifiers.
	//
	// uniqueID is mutable, and is accessed using atomic memory operations.
	uniqueID atomicbitops.Uint64

	// nextInotifyCookie is a monotonically increasing counter used for
	// generating unique inotify event cookies.
	//
	// nextInotifyCookie is mutable.
	nextInotifyCookie atomicbitops.Uint32

	// netlinkPorts manages allocation of netlink socket port IDs.
	netlinkPorts *port.Manager

	// saveStatus is nil if the sandbox has not been saved, errSaved or
	// errAutoSaved if it has been saved successfully, or the error causing the
	// sandbox to exit during save.
	// It is protected by extMu.
	saveStatus error `state:"nosave"`

	// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
	danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`

	// sockets records all network sockets in the system. Protected by extMu.
	sockets map[*vfs.FileDescription]*SocketRecord

	// nextSocketRecord is the next entry number to use in sockets. Protected
	// by extMu.
	nextSocketRecord uint64

	// unimplementedSyscallEmitterOnce is used in the initialization of
	// unimplementedSyscallEmitter.
	unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`

	// unimplementedSyscallEmitter is used to emit unimplemented syscall
	// events. This is initialized lazily on the first unimplemented
	// syscall.
	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`

	// SpecialOpts contains special kernel options.
	SpecialOpts

	// vfs keeps the filesystem state used across the kernel.
	vfs vfs.VirtualFilesystem

	// hostMount is the Mount used for file descriptors that were imported
	// from the host.
	hostMount *vfs.Mount

	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
	// syscalls (as opposed to named pipes created by mknod()).
	pipeMount *vfs.Mount

	// nsfsMount is the Mount used for namespaces.
	nsfsMount *vfs.Mount

	// shmMount is the Mount used for anonymous files created by the
	// memfd_create() syscall. It is analogous to Linux's shm_mnt.
	shmMount *vfs.Mount

	// socketMount is the Mount used for sockets created by the socket() and
	// socketpair() syscalls. There are several cases where a socket dentry will
	// not be contained in socketMount:
	// 1. Socket files created by mknod()
	// 2. Socket fds imported from the host (Kernel.hostMount is used for these)
	// 3. Socket files created by binding Unix sockets to a file path
	socketMount *vfs.Mount

	// sysVShmDevID is the device number used by SysV shm segments. In Linux,
	// SysV shm uses shmem_file_setup() and thus uses shm_mnt's device number.
	// In gVisor, the shm implementation does not use shmMount, extracting
	// shmMount's device number is inconvenient, applications accept a
	// different device number in practice, and using a distinct device number
	// avoids the possibility of inode number collisions due to the hack
	// described in shm.Shm.InodeID().
	sysVShmDevID uint32

	// If set to true, report address space activation waits as if the task is in
	// external wait so that the watchdog doesn't report the task stuck.
	SleepForAddressSpaceActivation bool

	// Exceptions to YAMA ptrace restrictions. Each key-value pair represents a
	// tracee-tracer relationship. The key is a process (technically, the thread
	// group leader) that can be traced by any thread that is a descendant of the
	// value. If the value is nil, then anyone can trace the process represented by
	// the key.
	//
	// ptraceExceptions is protected by the TaskSet mutex.
	ptraceExceptions map[*Task]*Task

	// YAMAPtraceScope is the current level of YAMA ptrace restrictions.
	YAMAPtraceScope atomicbitops.Int32

	// cgroupRegistry contains the set of active cgroup controllers on the
	// system. It is controlled by cgroupfs. Nil if cgroupfs is unavailable on
	// the system.
	cgroupRegistry *CgroupRegistry

	// userCountersMap maps auth.KUID into a set of user counters.
	userCountersMap   map[auth.KUID]*userCounters
	userCountersMapMu userCountersMutex `state:"nosave"`
}
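
// Fields tagged `state:"nosave"` above are omitted from the saved state and
// must be reconstructed when a Kernel is restored. Compare Init and LoadFrom
// below, which both rebuild, for example, the condition variables and the
// ticker wake channel:
//
//	k.runningTasksCond.L = &k.runningTasksMu
//	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
//	k.cpuClockTickerStopCond.L = &k.runningTasksMu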

// InitKernelArgs holds arguments to Init.
type InitKernelArgs struct {
	// FeatureSet is the emulated CPU feature set.
	FeatureSet cpuid.FeatureSet

	// Timekeeper manages time for all tasks in the system.
	Timekeeper *Timekeeper

	// RootUserNamespace is the root user namespace.
	RootUserNamespace *auth.UserNamespace

	// RootNetworkNamespace is the root network namespace. If nil, no networking
	// will be available.
	RootNetworkNamespace *inet.Namespace

	// ApplicationCores is the number of logical CPUs visible to sandboxed
	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
	// ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
	// most significant bit in cpu_possible_mask + 1.
	ApplicationCores uint

	// If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
	// instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
	// no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
	// will be overridden.
	UseHostCores bool

	// ExtraAuxv contains additional auxiliary vector entries that are added to
	// each process by the ELF loader.
	ExtraAuxv []arch.AuxEntry

	// Vdso holds the VDSO and its parameter page.
	Vdso *loader.VDSO

	// RootUTSNamespace is the root UTS namespace.
	RootUTSNamespace *UTSNamespace

	// RootIPCNamespace is the root IPC namespace.
	RootIPCNamespace *IPCNamespace

	// RootAbstractSocketNamespace is the root Abstract Socket namespace.
	RootAbstractSocketNamespace *AbstractSocketNamespace

	// PIDNamespace is the root PID namespace.
	PIDNamespace *PIDNamespace
}
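
// A typical bring-up sequence, as a sketch (p, mf, tk, and the namespace
// values are assumptions standing in for caller-provided state; see runsc's
// loader for a real example):
//
//	k := &Kernel{}
//	k.Platform = p      // must be set manually
//	k.SetMemoryFile(mf) // must be called before Init
//	if err := k.Init(InitKernelArgs{
//		FeatureSet:        cpuid.HostFeatureSet(),
//		Timekeeper:        tk, // tk.SetClocks must already have been called
//		RootUserNamespace: rootUserNS,
//		ApplicationCores:  4,
//		Vdso:              vdso,
//		RootUTSNamespace:  utsNS,
//		RootIPCNamespace:  ipcNS,
//		PIDNamespace:      pidNS,
//	}); err != nil {
//		// handle initialization failure
//	}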

// Init initializes the Kernel with no tasks.
//
// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
// before calling Init.
func (k *Kernel) Init(args InitKernelArgs) error {
	if args.Timekeeper == nil {
		return fmt.Errorf("args.Timekeeper is nil")
	}
	if args.Timekeeper.clocks == nil {
		return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
	}
	if args.RootUserNamespace == nil {
		return fmt.Errorf("args.RootUserNamespace is nil")
	}
	if args.ApplicationCores == 0 {
		return fmt.Errorf("args.ApplicationCores is 0")
	}

	k.featureSet = args.FeatureSet
	k.timekeeper = args.Timekeeper
	k.tasks = newTaskSet(args.PIDNamespace)
	k.rootUserNamespace = args.RootUserNamespace
	k.rootUTSNamespace = args.RootUTSNamespace
	k.rootIPCNamespace = args.RootIPCNamespace
	k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
	k.rootNetworkNamespace = args.RootNetworkNamespace
	if k.rootNetworkNamespace == nil {
		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil, args.RootUserNamespace)
	}
	k.runningTasksCond.L = &k.runningTasksMu
	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
	k.cpuClockTickerStopCond.L = &k.runningTasksMu
	k.applicationCores = args.ApplicationCores
	if args.UseHostCores {
		k.useHostCores = true
		maxCPU, err := hostcpu.MaxPossibleCPU()
		if err != nil {
			return fmt.Errorf("failed to get maximum CPU number: %v", err)
		}
		minAppCores := uint(maxCPU) + 1
		if k.applicationCores < minAppCores {
			log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
			k.applicationCores = minAppCores
		}
	}
	k.extraAuxv = args.ExtraAuxv
	k.vdso = args.Vdso
	k.futexes = futex.NewManager()
	k.netlinkPorts = port.New()
	k.ptraceExceptions = make(map[*Task]*Task)
	k.YAMAPtraceScope = atomicbitops.FromInt32(linux.YAMA_SCOPE_RELATIONAL)
	k.userCountersMap = make(map[auth.KUID]*userCounters)

	ctx := k.SupervisorContext()
	if err := k.vfs.Init(ctx); err != nil {
		return fmt.Errorf("failed to initialize VFS: %v", err)
	}

	err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx))
	if err != nil {
		return fmt.Errorf("failed to create mqfs filesystem: %v", err)
	}

	pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create pipefs filesystem: %v", err)
	}
	defer pipeFilesystem.DecRef(ctx)
	pipeMount := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
	k.pipeMount = pipeMount

	nsfsFilesystem, err := nsfs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create nsfs filesystem: %v", err)
	}
	defer nsfsFilesystem.DecRef(ctx)
	k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
	k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace))
	k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
	k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))

	tmpfsOpts := vfs.GetFilesystemOptions{
		InternalData: tmpfs.FilesystemOpts{
			// See mm/shmem.c:shmem_init() => vfs_kern_mount(flags=SB_KERNMOUNT).
			// Note how mm/shmem.c:shmem_fill_super() does not provide a default
			// value for sbinfo->max_blocks when SB_KERNMOUNT is set.
			DisableDefaultSizeLimit: true,
		},
	}
	tmpfsFilesystem, tmpfsRoot, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace), "", tmpfsOpts)
	if err != nil {
		return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
	}
	defer tmpfsFilesystem.DecRef(ctx)
	defer tmpfsRoot.DecRef(ctx)
	k.shmMount = k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})

	socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create sockfs filesystem: %v", err)
	}
	defer socketFilesystem.DecRef(ctx)
	k.socketMount = k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})

	sysVShmDevMinor, err := k.vfs.GetAnonBlockDevMinor()
	if err != nil {
		return fmt.Errorf("failed to get device number for SysV shm: %v", err)
	}
	k.sysVShmDevID = linux.MakeDeviceID(linux.UNNAMED_MAJOR, sysVShmDevMinor)

	k.sockets = make(map[*vfs.FileDescription]*SocketRecord)

	k.cgroupRegistry = newCgroupRegistry()
	return nil
}
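
// SaveTo below requires the kernel to be paused for the duration of the call;
// a minimal checkpoint sequence, as a sketch (w is a caller-provided
// wire.Writer):
//
//	k.Pause()
//	defer k.Unpause()
//	if err := k.SaveTo(ctx, w); err != nil {
//		// handle save failure
//	}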

// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
	saveStart := time.Now()

	// Do not allow other Kernel methods to affect it while it's being saved.
	k.extMu.Lock()
	defer k.extMu.Unlock()

	// Stop time.
	k.pauseTimeLocked(ctx)
	defer k.resumeTimeLocked(ctx)

	// Evict all evictable MemoryFile allocations.
	k.mf.StartEvictions()
	k.mf.WaitForEvictions()

	// Discard unsavable mappings, such as those for host file descriptors.
	if err := k.invalidateUnsavableMappings(ctx); err != nil {
		return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
	}

	// Prepare filesystems for saving. This must be done after
	// invalidateUnsavableMappings(), since dropping memory mappings may
	// affect filesystem state (e.g. page cache reference counts).
	if err := k.vfs.PrepareSave(ctx); err != nil {
		return err
	}

	// Save the CPUID FeatureSet before the rest of the kernel so we can
	// verify its compatibility on restore before attempting to restore the
	// entire kernel, which may fail on an incompatible machine.
	//
	// N.B. This will also be saved along with the full kernel save below.
	cpuidStart := time.Now()
	if _, err := state.Save(ctx, w, &k.featureSet); err != nil {
		return err
	}
	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))

	if rootNS := k.rootNetworkNamespace; rootNS != nil && rootNS.Stack() != nil {
		// Pause the network stack.
		netstackPauseStart := time.Now()
		log.Infof("Pausing root network namespace")
		k.rootNetworkNamespace.Stack().Pause()
		defer k.rootNetworkNamespace.Stack().Resume()
		log.Infof("Pausing root network namespace took [%s].", time.Since(netstackPauseStart))
	}

	// Save the kernel state.
	kernelStart := time.Now()
	stats, err := state.Save(ctx, w, k)
	if err != nil {
		return err
	}
	log.Infof("Kernel save stats: %s", stats.String())
	log.Infof("Kernel save took [%s].", time.Since(kernelStart))

	// Save the memory file's state.
	memoryStart := time.Now()
	if err := k.mf.SaveTo(ctx, w); err != nil {
		return err
	}
	log.Infof("Memory save took [%s].", time.Since(memoryStart))

	log.Infof("Overall save took [%s].", time.Since(saveStart))

	return nil
}

// Preconditions: The kernel must be paused.
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
	invalidated := make(map[*mm.MemoryManager]struct{})
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()
	for t := range k.tasks.Root.tids {
		// We can skip locking Task.mu here since the kernel is paused.
		if memMgr := t.image.MemoryManager; memMgr != nil {
			if _, ok := invalidated[memMgr]; !ok {
				if err := memMgr.InvalidateUnsavable(ctx); err != nil {
					return err
				}
				invalidated[memMgr] = struct{}{}
			}
		}
		// I really wish we just had a sync.Map of all MMs...
		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
			if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
				return err
			}
		}
	}
	return nil
}
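
// LoadFrom below is the restore-side counterpart of SaveTo. A sketch of the
// calling sequence, under the same assumptions as the Init sketch above (a
// fresh Kernel with Platform and MemoryFile already set; r, timeReady,
// netStack, clocks, and vfsOpts are caller-provided):
//
//	k := &Kernel{}
//	k.Platform = p
//	k.SetMemoryFile(mf)
//	if err := k.LoadFrom(ctx, r, timeReady, netStack, clocks, vfsOpts); err != nil {
//		// handle restore failure
//	}
//	// ... later, k.Start() resumes the restored tasks.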

// LoadFrom restores the state of k from r.
func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
	loadStart := time.Now()

	k.runningTasksCond.L = &k.runningTasksMu
	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
	k.cpuClockTickerStopCond.L = &k.runningTasksMu

	initAppCores := k.applicationCores

	// Load the pre-saved CPUID FeatureSet.
	//
	// N.B. This was also saved along with the full kernel below, so we
	// don't need to explicitly install it in the Kernel.
	cpuidStart := time.Now()
	if _, err := state.Load(ctx, r, &k.featureSet); err != nil {
		return err
	}
	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))

	// Verify that the FeatureSet is usable on this host. We do this before
	// Kernel load so that the explicit CPUID mismatch error has priority
	// over floating point state restore errors that may occur on load on
	// an incompatible machine.
	if err := k.featureSet.CheckHostCompatible(); err != nil {
		return err
	}

	// Load the kernel state.
	kernelStart := time.Now()
	stats, err := state.Load(ctx, r, k)
	if err != nil {
		return err
	}
	log.Infof("Kernel load stats: %s", stats.String())
	log.Infof("Kernel load took [%s].", time.Since(kernelStart))

	// rootNetworkNamespace should be populated after loading the state file.
	// Restore the root network stack.
	k.rootNetworkNamespace.RestoreRootStack(net)

	// Load the memory file's state.
	memoryStart := time.Now()
	if err := k.mf.LoadFrom(ctx, r); err != nil {
		return err
	}
	log.Infof("Memory load took [%s].", time.Since(memoryStart))

	log.Infof("Overall load took [%s]", time.Since(loadStart))

	k.Timekeeper().SetClocks(clocks)

	if timeReady != nil {
		close(timeReady)
	}

	if net != nil {
		net.Resume()
	}

	if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
		return err
	}

	tcpip.AsyncLoading.Wait()

	log.Infof("Overall load took [%s] after async work", time.Since(loadStart))

	// Applications may size per-cpu structures based on k.applicationCores, so
	// it can't change across save/restore. When we are virtualizing CPU
	// numbers, this isn't a problem. However, when we are exposing host CPU
	// assignments, we can't tolerate an increase in the number of host CPUs,
	// which could result in getcpu(2) returning CPUs that applications expect
	// not to exist.
	if k.useHostCores && initAppCores > k.applicationCores {
		return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
	}

	return nil
}

// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
	id := k.uniqueID.Add(1)
	if id == 0 {
		panic("unique identifier generator wrapped around")
	}
	return id
}

// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
	// Filename is the filename to load as the init binary.
	//
	// If this is provided as "", File will be checked, then the file will be
	// guessed via Argv[0].
	Filename string

	// File is a passed host FD pointing to a file to load as the init binary.
	//
	// This is checked if and only if Filename is "".
	File *vfs.FileDescription

	// Argv is a list of arguments.
	Argv []string

	// Envv is a list of environment variables.
	Envv []string

	// WorkingDirectory is the initial working directory.
	//
	// This defaults to the root if empty.
	WorkingDirectory string

	// Credentials is the initial credentials.
	Credentials *auth.Credentials

	// FDTable is the initial set of file descriptors. If CreateProcess succeeds,
	// it takes a reference on FDTable.
	FDTable *FDTable

	// Umask is the initial umask.
	Umask uint

	// Limits are the initial resource limits.
	Limits *limits.LimitSet

	// MaxSymlinkTraversals is the maximum number of symlinks to follow
	// during resolution.
	MaxSymlinkTraversals uint

	// UTSNamespace is the initial UTS namespace.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the initial IPC namespace.
	IPCNamespace *IPCNamespace

	// PIDNamespace is the initial PID namespace.
	PIDNamespace *PIDNamespace

	// AbstractSocketNamespace is the initial Abstract Socket namespace.
	AbstractSocketNamespace *AbstractSocketNamespace

	// MountNamespace optionally contains the mount namespace for this
	// process. If nil, the init process's mount namespace is used.
	//
	// Anyone setting MountNamespace must donate a reference (i.e.
	// increment it).
	MountNamespace *vfs.MountNamespace

	// ContainerID is the container that the process belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the container is initialized to.
	InitialCgroups map[Cgroup]struct{}
}

// NewContext returns a context.Context that represents the task that will be
// created by k.CreateProcess(args).
func (args *CreateProcessArgs) NewContext(k *Kernel) context.Context {
	return &createProcessContext{
		Context: context.Background(),
		kernel:  k,
		args:    args,
	}
}

// createProcessContext is a context.Context that represents the context
// associated with a task that is being created.
type createProcessContext struct {
	context.Context
	kernel *Kernel
	args   *CreateProcessArgs
}

// Value implements context.Context.Value.
func (ctx *createProcessContext) Value(key any) any {
	switch key {
	case CtxKernel:
		return ctx.kernel
	case CtxPIDNamespace:
		return ctx.args.PIDNamespace
	case CtxUTSNamespace:
		utsns := ctx.args.UTSNamespace
		utsns.IncRef()
		return utsns
	case ipc.CtxIPCNamespace:
		ipcns := ctx.args.IPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		return ctx.args.Credentials
	case vfs.CtxRoot:
		if ctx.args.MountNamespace == nil {
			return nil
		}
		root := ctx.args.MountNamespace.Root(ctx)
		return root
	case vfs.CtxMountNamespace:
		if ctx.kernel.globalInit == nil {
			return nil
		}
		mntns := ctx.kernel.GlobalInit().Leader().MountNamespace()
		mntns.IncRef()
		return mntns
	case inet.CtxStack:
		return ctx.kernel.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.kernel.RealtimeClock()
	case limits.CtxLimits:
		return ctx.args.Limits
	case pgalloc.CtxMemoryCgroupID:
		return ctx.getMemoryCgroupID()
	case pgalloc.CtxMemoryFile:
		return ctx.kernel.mf
	case pgalloc.CtxMemoryFileProvider:
		return ctx.kernel
	case platform.CtxPlatform:
		return ctx.kernel
	case uniqueid.CtxGlobalUniqueID:
		return ctx.kernel.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.kernel
	case uniqueid.CtxInotifyCookie:
		return ctx.kernel.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.kernel
	default:
		return nil
	}
}

func (ctx *createProcessContext) getMemoryCgroupID() uint32 {
	for cg := range ctx.args.InitialCgroups {
		for _, ctl := range cg.Controllers() {
			if ctl.Type() == CgroupControllerMemory {
				return cg.ID()
			}
		}
	}
	return InvalidCgroupID
}
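
// A sketch of creating and starting an init process with the types above (the
// argument values are illustrative assumptions; creds and fdTable stand in
// for caller-constructed objects):
//
//	args := CreateProcessArgs{
//		Filename:                "/bin/sh",
//		Argv:                    []string{"sh", "-c", "echo hello"},
//		Envv:                    []string{"PATH=/bin"},
//		Credentials:             creds,
//		FDTable:                 fdTable,
//		Limits:                  limits.NewLimitSet(),
//		MaxSymlinkTraversals:    40,
//		UTSNamespace:            k.RootUTSNamespace(),
//		IPCNamespace:            k.RootIPCNamespace(),
//		PIDNamespace:            k.RootPIDNamespace(),
//		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
//		ContainerID:             "c1",
//	}
//	tg, _, err := k.CreateProcess(args)
//	if err != nil {
//		// handle error
//	}
//	k.StartProcess(tg) // only needed if k.Start() was already called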

// CreateProcess creates a new task in a new thread group with the given
// options. The new task has no parent and is in the root PID namespace.
//
// If k.Start() has already been called, then the created process must be
// started by calling kernel.StartProcess(tg).
//
// If k.Start() has not yet been called, then the created task will begin
// running when k.Start() is called.
//
// CreateProcess has no analogue in Linux; it is used to create the initial
// application task, as well as processes started by the control server.
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	log.Infof("EXEC: %v", args.Argv)

	ctx := args.NewContext(k)
	mntns := args.MountNamespace
	if mntns == nil {
		if k.globalInit == nil {
			return nil, 0, fmt.Errorf("mount namespace is nil")
		}
		// Add a reference to the namespace, which is transferred to the new process.
		mntns = k.globalInit.Leader().MountNamespace()
		mntns.IncRef()
	}
	// Get the root directory from the MountNamespace.
	root := mntns.Root(ctx)
	defer root.DecRef(ctx)

	// Grab the working directory.
	wd := root // Default.
	if args.WorkingDirectory != "" {
		pop := vfs.PathOperation{
			Root:               root,
			Start:              wd,
			Path:               fspath.Parse(args.WorkingDirectory),
			FollowFinalSymlink: true,
		}
		// NOTE(b/236028361): Do not set CheckSearchable flag to true.
		// Application is allowed to start with a working directory that it can
		// not access/search. This is consistent with Docker and VFS1. Runc
		// explicitly allows for this in 6ce2d63a5db6 ("libct/init_linux: retry
		// chdir to fix EPERM"). As described in the commit, runc unintentionally
		// allowed this behavior in a couple of releases and applications started
		// relying on it. So they decided to allow it for backward compatibility.
		var err error
		wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{})
		if err != nil {
			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
		}
		defer wd.DecRef(ctx)
	}
	fsContext := NewFSContext(root, wd, args.Umask)

	tg := k.NewThreadGroup(args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
	cu := cleanup.Make(func() {
		tg.Release(ctx)
	})
	defer cu.Clean()

	// Check which file to start from.
	switch {
	case args.Filename != "":
		// If a filename is given, take that.
		// Set File to nil so we resolve the path in LoadTaskImage.
		args.File = nil
	case args.File != nil:
		// If File is set, take the File provided directly.
		args.Filename = args.File.MappedName(ctx)
	default:
		// Otherwise look at Argv and see if the first argument is a valid path.
		if len(args.Argv) == 0 {
			return nil, 0, fmt.Errorf("no filename or command provided")
		}
		if !filepath.IsAbs(args.Argv[0]) {
			return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
		}
		args.Filename = args.Argv[0]
	}

	// Create a fresh task context.
	remainingTraversals := args.MaxSymlinkTraversals
	loadArgs := loader.LoadArgs{
		Root:                root,
		WorkingDir:          wd,
		RemainingTraversals: &remainingTraversals,
		ResolveFinal:        true,
		Filename:            args.Filename,
		File:                args.File,
		CloseOnExec:         false,
		Argv:                args.Argv,
		Envv:                args.Envv,
		Features:            k.featureSet,
	}

	image, se := k.LoadTaskImage(ctx, loadArgs)
	if se != nil {
		return nil, 0, errors.New(se.String())
	}

	// Take a reference on the FDTable, which will be transferred to
	// TaskSet.NewTask().
	args.FDTable.IncRef()

	// Create the task.
	config := &TaskConfig{
		Kernel:                  k,
		ThreadGroup:             tg,
		TaskImage:               image,
		FSContext:               fsContext,
		FDTable:                 args.FDTable,
		Credentials:             args.Credentials,
		NetworkNamespace:        k.RootNetworkNamespace(),
		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
		UTSNamespace:            args.UTSNamespace,
		IPCNamespace:            args.IPCNamespace,
		AbstractSocketNamespace: args.AbstractSocketNamespace,
		MountNamespace:          mntns,
		ContainerID:             args.ContainerID,
		InitialCgroups:          args.InitialCgroups,
		UserCounters:            k.GetUserCounters(args.Credentials.RealKUID),
		// A task with no parent starts out with no session keyring.
		SessionKeyring: nil,
	}
	config.NetworkNamespace.IncRef()
	t, err := k.tasks.NewTask(ctx, config)
	if err != nil {
		return nil, 0, err
	}
	t.traceExecEvent(image) // Simulate exec for tracing.

	// Success.
	cu.Release()
	tgid := k.tasks.Root.IDOfThreadGroup(tg)
	if k.globalInit == nil {
		k.globalInit = tg
	}
	return tg, tgid, nil
}

// StartProcess starts running a process that was created with CreateProcess.
func (k *Kernel) StartProcess(tg *ThreadGroup) {
	t := tg.Leader()
	tid := k.tasks.Root.IDOfTask(t)
	t.Start(tid)
}

// Start starts execution of all tasks in k.
//
// Preconditions: Start may be called exactly once.
func (k *Kernel) Start() error {
	k.extMu.Lock()
	defer k.extMu.Unlock()

	if k.started {
		return fmt.Errorf("kernel already started")
	}

	k.started = true
	k.cpuClockTickTimer = time.NewTimer(linux.ClockTick)
	k.runningTasksMu.Lock()
	k.cpuClockTickerRunning = true
	k.runningTasksMu.Unlock()
	go k.runCPUClockTicker()
	// If k was created by LoadFrom, timers were stopped during Kernel.SaveTo
	// and need to be resumed. If k was created by Init, this is a no-op.
	k.resumeTimeLocked(k.SupervisorContext())
	k.tasks.mu.RLock()
	ts := make([]*Task, 0, len(k.tasks.Root.tids))
	for t := range k.tasks.Root.tids {
		ts = append(ts, t)
	}
	k.tasks.mu.RUnlock()
	// Start task goroutines.
	// NOTE(b/235349091): We don't actually need the TaskSet mutex, we just
	// need to make sure we only call t.Start() once for each task. Holding the
	// mutex for each task start may cause a nested locking error.
	for _, t := range ts {
		t.Start(t.ThreadID())
	}
	return nil
}

// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
// Preconditions:
//   - Any task goroutines running in k must be stopped.
//   - k.extMu must be locked.
func (k *Kernel) pauseTimeLocked(ctx context.Context) {
	// Since all task goroutines have been stopped by precondition, the CPU clock
	// ticker should stop on its own; wait for it to do so, waking it up from
	// sleeping between ticks if necessary.
	k.runningTasksMu.Lock()
	for k.cpuClockTickerRunning {
		select {
		case k.cpuClockTickerWakeCh <- struct{}{}:
		default:
		}
		k.cpuClockTickerStopCond.Wait()
	}
	k.runningTasksMu.Unlock()

	// By precondition, nothing else can be interacting with PIDNamespace.tids
	// or FDTable.files, so we can iterate them without synchronization. (We
	// can't hold the TaskSet mutex when pausing thread group timers because
	// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
	// mutex, while holding the Timer mutex.)
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Pause()
			for _, it := range t.tg.timers {
				it.PauseTimer()
			}
		}
		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
		// but ktime.Timer.Pause is idempotent so this is harmless.
		if t.fdTable != nil {
			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
					tfd.PauseTimer()
				}
			})
		}
	}
	k.timekeeper.PauseUpdates()
}

// resumeTimeLocked resumes all Timers and Timekeeper updates. If
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
// Preconditions:
//   - Any task goroutines running in k must be stopped.
//   - k.extMu must be locked.
func (k *Kernel) resumeTimeLocked(ctx context.Context) {
	// The CPU clock ticker will automatically resume as task goroutines resume
	// execution.

	k.timekeeper.ResumeUpdates()
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Resume()
			for _, it := range t.tg.timers {
				it.ResumeTimer()
			}
		}
		if t.fdTable != nil {
			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
					tfd.ResumeTimer()
				}
			})
		}
	}
}
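
// incRunningTasks and decRunningTasks below manage the idle/active transition
// that the runningTasks and cpuClockTicker* fields in Kernel describe. As a
// simplified model of the ticker goroutine's contract (a sketch only; the
// real runCPUClockTicker lives elsewhere in this package):
//
//	for {
//		runningTasksMu.Lock()
//		for runningTasks.Load() == 0 {
//			// Going idle: announce that the ticker has stopped ...
//			cpuClockTickerRunning = false
//			cpuClockTickerStopCond.Broadcast()
//			// ... and sleep until incRunningTasks signals 0 -> 1.
//			runningTasksCond.Wait()
//		}
//		runningTasksMu.Unlock()
//		// Active: advance cpuClock once per linux.ClockTick.
//		<-cpuClockTickTimer.C
//		cpuClock.Add(1)
//	}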

func (k *Kernel) incRunningTasks() {
	for {
		tasks := k.runningTasks.Load()
		if tasks != 0 {
			// Standard case. Simply increment.
			if !k.runningTasks.CompareAndSwap(tasks, tasks+1) {
				continue
			}
			return
		}

		// Transition from 0 -> 1.
		k.runningTasksMu.Lock()
		if k.runningTasks.Load() != 0 {
			// Raced with another transition and lost.
			k.runningTasks.Add(1)
			k.runningTasksMu.Unlock()
			return
		}
		if !k.cpuClockTickerRunning {
			select {
			case tickTime := <-k.cpuClockTickTimer.C:
				// Rearm the timer since we consumed the wakeup. Estimate how much time
				// remains on the current tick so that periodic workloads interact with
				// the (periodic) CPU clock ticker in the same way that they would
				// without the optimization of putting the ticker to sleep.
				missedNS := time.Since(tickTime).Nanoseconds()
				missedTicks := missedNS / linux.ClockTick.Nanoseconds()
				thisTickNS := missedNS - missedTicks*linux.ClockTick.Nanoseconds()
				k.cpuClockTickTimer.Reset(time.Duration(linux.ClockTick.Nanoseconds() - thisTickNS))
				// Increment k.cpuClock on the CPU clock ticker goroutine's behalf.
				// (Whole missed ticks don't matter, and adding them to k.cpuClock will
				// just confuse the watchdog.) At the time the tick occurred, all task
				// goroutines were asleep, so there's nothing else to do. This ensures
				// that our caller (Task.accountTaskGoroutineLeave()) records an
				// updated k.cpuClock in Task.gosched.Timestamp, so that it's correctly
				// accounted as having resumed execution in the sentry during this tick
				// instead of at the end of the previous one.
				k.cpuClock.Add(1)
			default:
			}
			// We are transitioning from idle to active. Set
			// k.cpuClockTickerRunning = true here so that if we transition to
			// idle and then active again before the CPU clock ticker goroutine
			// has a chance to run, the first call to k.incRunningTasks() at the
			// end of that cycle does not try to steal k.cpuClockTickTimer.C
			// again, as this would allow workloads that rapidly cycle between
			// idle and active to starve the CPU clock ticker of chances to
			// observe task goroutines in a running state and account their CPU
			// usage.
			k.cpuClockTickerRunning = true
			k.runningTasksCond.Signal()
		}
		// This store must happen after the increment of k.cpuClock above to ensure
		// that concurrent calls to Task.accountTaskGoroutineLeave() also observe
		// the updated k.cpuClock.
		k.runningTasks.Store(1)
		k.runningTasksMu.Unlock()
		return
	}
}

func (k *Kernel) decRunningTasks() {
	tasks := k.runningTasks.Add(-1)
	if tasks < 0 {
		panic(fmt.Sprintf("Invalid running count %d", tasks))
	}

	// Nothing to do. The next CPU clock tick will disable the timer if
	// there is still nothing running. This provides approximately one tick
	// of slack in which we can switch back and forth between idle and
	// active without an expensive transition.
}

// WaitExited blocks until all tasks in k have exited.
func (k *Kernel) WaitExited() {
	k.tasks.liveGoroutines.Wait()
}

// Kill requests that all tasks in k immediately exit as if group exiting with
// status ws. Kill does not wait for tasks to exit.
func (k *Kernel) Kill(ws linux.WaitStatus) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.Kill(ws)
}

// Pause requests that all tasks in k temporarily stop executing, and blocks
// until all tasks and asynchronous I/O operations in k have stopped. Multiple
// calls to Pause nest and require an equal number of calls to Unpause to
// resume execution.
func (k *Kernel) Pause() {
	k.extMu.Lock()
	k.tasks.BeginExternalStop()
	k.extMu.Unlock()
	k.tasks.runningGoroutines.Wait()
	k.tasks.aioGoroutines.Wait()
}

// ReceiveTaskStates receives full states for all tasks.
func (k *Kernel) ReceiveTaskStates() {
	k.extMu.Lock()
	k.tasks.PullFullState()
	k.extMu.Unlock()
}

// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.EndExternalStop()
}

// SendExternalSignal injects a signal into the kernel.
//
// context is used only for debugging to describe how the signal was received.
//
// Preconditions: Kernel must have an init process.
func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.sendExternalSignal(info, context)
}

// SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup.
//
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return tg.SendSignal(info)
}

// SendExternalSignalProcessGroup sends a signal to all ThreadGroups in the
// given process group.
//
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalProcessGroup(pg *ProcessGroup, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	// If anything goes wrong, we'll return the error, but still try our
	// best to deliver to other processes in the group.
	var firstErr error
	for _, tg := range k.TaskSet().Root.ThreadGroups() {
		if tg.ProcessGroup() != pg {
			continue
		}
		if err := tg.SendSignal(info); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}

// SendContainerSignal sends the given signal to all processes inside the
// namespace that match the given container ID.
func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()

	var lastErr error
	for tg := range k.tasks.Root.tgids {
		if tg.leader.ContainerID() == cid {
			tg.signalHandlers.mu.Lock()
			infoCopy := *info
			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
				lastErr = err
			}
			tg.signalHandlers.mu.Unlock()
		}
	}
	return lastErr
}

// RebuildTraceContexts rebuilds the trace context for all tasks.
//
// Unfortunately, if these are built while tracing is not enabled, then we will
// not have meaningful trace data. Rebuilding here ensures that we can do so
// after tracing has been enabled.
func (k *Kernel) RebuildTraceContexts() {
	// We need to pause all task goroutines because Task.rebuildTraceContext()
	// replaces Task.traceContext and Task.traceTask, which are
	// task-goroutine-exclusive (i.e. the task goroutine assumes that it can
	// access them without synchronization) for performance.
	k.Pause()
	defer k.Unpause()

	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()

	for t, tid := range k.tasks.Root.tids {
		t.rebuildTraceContext(tid)
	}
}

// FeatureSet returns the FeatureSet.
func (k *Kernel) FeatureSet() cpuid.FeatureSet {
	return k.featureSet
}

// Timekeeper returns the Timekeeper.
func (k *Kernel) Timekeeper() *Timekeeper {
	return k.timekeeper
}

// TaskSet returns the TaskSet.
func (k *Kernel) TaskSet() *TaskSet {
	return k.tasks
}

// RootUserNamespace returns the root UserNamespace.
func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
	return k.rootUserNamespace
}

// RootUTSNamespace returns the root UTSNamespace.
func (k *Kernel) RootUTSNamespace() *UTSNamespace {
	k.rootUTSNamespace.IncRef()
	return k.rootUTSNamespace
}

// RootIPCNamespace takes a reference and returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
	k.rootIPCNamespace.IncRef()
	return k.rootIPCNamespace
}

// RootPIDNamespace returns the root PIDNamespace.
func (k *Kernel) RootPIDNamespace() *PIDNamespace {
	return k.tasks.Root
}
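
// Note the asymmetry among the accessors above and below: RootUTSNamespace
// and RootIPCNamespace take a reference on the returned namespace, which the
// caller must release, while RootUserNamespace and RootPIDNamespace do not.
// A sketch of correct use of a reference-taking accessor (assuming the
// namespace's DecRef takes the usual context argument):
//
//	utsns := k.RootUTSNamespace()
//	defer utsns.DecRef(ctx) // drop the reference taken by the accessor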

// RootAbstractSocketNamespace returns the root AbstractSocketNamespace.
func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
	return k.rootAbstractSocketNamespace
}

// RootNetworkNamespace returns the root network namespace, always non-nil.
func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
	return k.rootNetworkNamespace
}

// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
// nil if no such thread group exists. GlobalInit may return a thread group
// containing no tasks if the thread group has already exited.
func (k *Kernel) GlobalInit() *ThreadGroup {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return k.globalInit
}

// TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace.
func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) {
	k.globalInit = tg
}

// ApplicationCores returns the number of CPUs visible to sandboxed
// applications.
func (k *Kernel) ApplicationCores() uint {
	return k.applicationCores
}

// RealtimeClock returns the application CLOCK_REALTIME clock.
func (k *Kernel) RealtimeClock() ktime.Clock {
	return k.timekeeper.realtimeClock
}

// MonotonicClock returns the application CLOCK_MONOTONIC clock.
func (k *Kernel) MonotonicClock() ktime.Clock {
	return k.timekeeper.monotonicClock
}

// CPUClockNow returns the current value of k.cpuClock.
func (k *Kernel) CPUClockNow() uint64 {
	return k.cpuClock.Load()
}

// Syslog returns the syslog.
func (k *Kernel) Syslog() *syslog {
	return &k.syslog
}

// GenerateInotifyCookie generates a unique inotify event cookie.
//
// Returned values may overlap with previously returned values if the value
// space is exhausted. 0 is not a valid cookie value; all other values
// representable in a uint32 are allowed.
func (k *Kernel) GenerateInotifyCookie() uint32 {
	id := k.nextInotifyCookie.Add(1)
	// Wrap-around is explicitly allowed for inotify event cookies.
	if id == 0 {
		id = k.nextInotifyCookie.Add(1)
	}
	return id
}

// NetlinkPorts returns the netlink port manager.
func (k *Kernel) NetlinkPorts() *port.Manager {
	return k.netlinkPorts
}

var (
	errSaved     = errors.New("sandbox has been successfully saved")
	errAutoSaved = errors.New("sandbox has been successfully auto-saved")
)

// SaveStatus returns the sandbox save status. If it was saved successfully,
// autosaved indicates whether save was triggered by autosave. If it was not
// saved successfully, err indicates the sandbox error that caused the kernel to
// exit during save.
func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	switch k.saveStatus {
	case nil:
		return false, false, nil
	case errSaved:
		return true, false, nil
	case errAutoSaved:
		return true, true, nil
	default:
		return false, false, k.saveStatus
	}
}

// SetSaveSuccess sets the flag indicating that save completed successfully, if
// no status was already set.
func (k *Kernel) SetSaveSuccess(autosave bool) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	if k.saveStatus == nil {
		if autosave {
			k.saveStatus = errAutoSaved
		} else {
			k.saveStatus = errSaved
		}
	}
}

// SetSaveError sets the sandbox error that caused the kernel to exit during
// save, if one is not already set.
func (k *Kernel) SetSaveError(err error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	if k.saveStatus == nil {
		k.saveStatus = err
	}
}

// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
// LoadFrom.
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
	k.mf = mf
}

// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile.
func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
	return k.mf
}

// SupervisorContext returns a Context with maximum privileges in k. It should
// only be used by goroutines outside the control of the emulated kernel
// defined by k.
//
// Callers are responsible for ensuring that the returned Context is not used
// concurrently with changes to the Kernel.
func (k *Kernel) SupervisorContext() context.Context {
	return &supervisorContext{
		Kernel: k,
		Logger: log.Log(),
	}
}

// SocketRecord represents a socket recorded in Kernel.sockets.
//
// +stateify savable
type SocketRecord struct {
	k    *Kernel
	Sock *vfs.FileDescription
	ID   uint64 // Socket table entry number.
}

// RecordSocket adds a socket to the system-wide socket table for
// tracking.
//
// Precondition: Caller must hold a reference to sock.
//
// Note that the socket table will not hold a reference on the
// vfs.FileDescription.
func (k *Kernel) RecordSocket(sock *vfs.FileDescription) {
	k.extMu.Lock()
	if _, ok := k.sockets[sock]; ok {
		panic(fmt.Sprintf("Socket %p added twice", sock))
	}
	id := k.nextSocketRecord
	k.nextSocketRecord++
	s := &SocketRecord{
		k:    k,
		ID:   id,
		Sock: sock,
	}
	k.sockets[sock] = s
	k.extMu.Unlock()
}

// DeleteSocket removes a socket from the system-wide socket table.
func (k *Kernel) DeleteSocket(sock *vfs.FileDescription) {
	k.extMu.Lock()
	delete(k.sockets, sock)
	k.extMu.Unlock()
}

// ListSockets returns a snapshot of all sockets.
//
// Callers of ListSockets() should use SocketRecord.Sock.TryIncRef()
// to get a reference on a socket in the table.
func (k *Kernel) ListSockets() []*SocketRecord {
	k.extMu.Lock()
	var socks []*SocketRecord
	for _, s := range k.sockets {
		socks = append(socks, s)
	}
	k.extMu.Unlock()
	return socks
}
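
// A sketch of the intended ListSockets usage, per its doc comment: the table
// holds no references, so each entry must be pinned with TryIncRef before use
// and released afterwards.
//
//	for _, sr := range k.ListSockets() {
//		if !sr.Sock.TryIncRef() {
//			continue // socket is concurrently being destroyed; skip it
//		}
//		// ... inspect sr.Sock ...
//		sr.Sock.DecRef(ctx)
//	}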

// supervisorContext is a privileged context.
type supervisorContext struct {
	context.NoTask
	log.Logger
	*Kernel
}

// Deadline implements context.Context.Deadline.
func (*Kernel) Deadline() (time.Time, bool) {
	return time.Time{}, false
}

// Done implements context.Context.Done.
func (*Kernel) Done() <-chan struct{} {
	return nil
}

// Err implements context.Context.Err.
func (*Kernel) Err() error {
	return nil
}

// Value implements context.Context.Value.
func (ctx *supervisorContext) Value(key any) any {
	switch key {
	case CtxCanTrace:
		// The supervisor context can trace anything. (None of
		// supervisorContext's users are expected to invoke ptrace, but ptrace
		// permissions are required for certain file accesses.)
		return func(*Task, bool) bool { return true }
	case CtxKernel:
		return ctx.Kernel
	case CtxPIDNamespace:
		return ctx.Kernel.tasks.Root
	case CtxUTSNamespace:
		utsns := ctx.Kernel.rootUTSNamespace
		utsns.IncRef()
		return utsns
	case ipc.CtxIPCNamespace:
		ipcns := ctx.Kernel.rootIPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		// The supervisor context is global root.
		return auth.NewRootCredentials(ctx.Kernel.rootUserNamespace)
	case vfs.CtxRoot:
		if ctx.Kernel.globalInit == nil {
			return vfs.VirtualDentry{}
		}
		root := ctx.Kernel.GlobalInit().Leader().MountNamespace().Root(ctx)
		return root
	case vfs.CtxMountNamespace:
		if ctx.Kernel.globalInit == nil {
			return nil
		}
		mntns := ctx.Kernel.GlobalInit().Leader().MountNamespace()
		mntns.IncRef()
		return mntns
	case inet.CtxStack:
		return ctx.Kernel.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.Kernel.RealtimeClock()
	case limits.CtxLimits:
		// No limits apply.
		return limits.NewLimitSet()
	case pgalloc.CtxMemoryFile:
		return ctx.Kernel.mf
	case pgalloc.CtxMemoryFileProvider:
		return ctx.Kernel
	case platform.CtxPlatform:
		return ctx.Kernel
	case uniqueid.CtxGlobalUniqueID:
		return ctx.Kernel.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.Kernel
	case uniqueid.CtxInotifyCookie:
		return ctx.Kernel.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.Kernel
	case cpuid.CtxFeatureSet:
		return ctx.Kernel.featureSet
	default:
		return nil
	}
}

// Rate limits for the number of unimplemented syscall events.
const (
	unimplementedSyscallsMaxRate = 100  // events per second
	unimplementedSyscallBurst    = 1000 // events
)

// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
// channel.
func (k *Kernel) EmitUnimplementedEvent(ctx context.Context, sysno uintptr) {
	k.unimplementedSyscallEmitterOnce.Do(func() {
		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
	})

	t := TaskFromContext(ctx)
	IncrementUnimplementedSyscallCounter(sysno)
	_, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
		Tid:       int32(t.ThreadID()),
		Registers: t.Arch().StateData().Proto(),
	})
}

// VFS returns the virtual filesystem for the kernel.
func (k *Kernel) VFS() *vfs.VirtualFilesystem {
	return &k.vfs
}

// SetHostMount sets the hostfs mount.
func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
	if k.hostMount != nil {
		panic("Kernel.hostMount cannot be set more than once")
	}
	k.hostMount = mnt
}

// HostMount returns the hostfs mount.
func (k *Kernel) HostMount() *vfs.Mount {
	return k.hostMount
}

// PipeMount returns the pipefs mount.
func (k *Kernel) PipeMount() *vfs.Mount {
	return k.pipeMount
}

// GetNamespaceInode returns a new nsfs inode which serves as a reference
// counter for the namespace.
func (k *Kernel) GetNamespaceInode(ctx context.Context, ns vfs.Namespace) refs.TryRefCounter {
	return nsfs.NewInode(ctx, k.nsfsMount, ns)
}

// ShmMount returns the tmpfs mount.
func (k *Kernel) ShmMount() *vfs.Mount {
	return k.shmMount
}

// SocketMount returns the sockfs mount.
func (k *Kernel) SocketMount() *vfs.Mount {
	return k.socketMount
}

// CgroupRegistry returns the cgroup registry.
func (k *Kernel) CgroupRegistry() *CgroupRegistry {
	return k.cgroupRegistry
}

// Release releases resources owned by k.
//
// Precondition: This should only be called after the kernel is fully
// initialized, e.g. after k.Start() has been called.
func (k *Kernel) Release() {
	ctx := k.SupervisorContext()
	k.hostMount.DecRef(ctx)
	k.pipeMount.DecRef(ctx)
	k.nsfsMount.DecRef(ctx)
	k.shmMount.DecRef(ctx)
	k.socketMount.DecRef(ctx)
	k.vfs.Release(ctx)
	k.timekeeper.Destroy()
	k.vdso.Release(ctx)
	k.RootNetworkNamespace().DecRef(ctx)
}

// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
// hierarchy.
//
// Precondition: root must be a new cgroup with no tasks. This implies the
// controllers for root are also new and currently manage no task, which in turn
// implies the new cgroup can be populated without migrating tasks between
// cgroups.
func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) {
	k.tasks.mu.RLock()
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		// A task can be in the cgroup if it has been created after the
		// cgroup hierarchy was registered.
		t.enterCgroupIfNotYetLocked(root)
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()
}

// ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the
// hierarchy with the provided id. This is intended for use during hierarchy
// teardown, as otherwise the tasks would be orphaned w.r.t. some controllers.
func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) {
	var releasedCGs []Cgroup

	k.tasks.mu.RLock()
	// We'll have one cgroup per hierarchy per task.
	releasedCGs = make([]Cgroup, 0, len(k.tasks.Root.tids))
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		for cg := range t.cgroups {
			if cg.HierarchyID() == hid {
				cg.Leave(t)
				t.ResetMemCgIDFromCgroup(cg)
				delete(t.cgroups, cg)
				releasedCGs = append(releasedCGs, cg)
				// A task can't be part of multiple cgroups from the same
				// hierarchy, so we can skip checking the rest once we find a
				// match.
				break
			}
		}
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()

	for _, c := range releasedCGs {
		c.decRef()
	}
}

// ReplaceFSContextRoots updates root and cwd to newRoot in the FSContext of
// all tasks whose current root or cwd is oldRoot.
func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualDentry, newRoot vfs.VirtualDentry) {
	k.tasks.mu.RLock()
	oldRootDecRefs := 0
	k.tasks.forEachTaskLocked(func(t *Task) {
		t.mu.Lock()
		defer t.mu.Unlock()
		if fsc := t.fsContext; fsc != nil {
			fsc.mu.Lock()
			defer fsc.mu.Unlock()
			if fsc.root == oldRoot {
				newRoot.IncRef()
				oldRootDecRefs++
				fsc.root = newRoot
			}
			if fsc.cwd == oldRoot {
				newRoot.IncRef()
				oldRootDecRefs++
				fsc.cwd = newRoot
			}
		}
	})
	k.tasks.mu.RUnlock()
	for i := 0; i < oldRootDecRefs; i++ {
		oldRoot.DecRef(ctx)
	}
}

// GetUserCounters returns the userCounters object for the given KUID,
// creating it if it does not already exist.
func (k *Kernel) GetUserCounters(uid auth.KUID) *userCounters {
	k.userCountersMapMu.Lock()
	defer k.userCountersMapMu.Unlock()

	if uc, ok := k.userCountersMap[uid]; ok {
		return uc
	}

	uc := &userCounters{uid: uid}
	k.userCountersMap[uid] = uc
	return uc
}