github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/kernel.go

github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/kernel.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package kernel provides an emulation of the Linux kernel.
    16  //
    17  // See README.md for a detailed overview.
    18  //
    19  // Lock order (outermost locks must be taken first):
    20  //
    21  //	Kernel.extMu
    22  //		ThreadGroup.timerMu
    23  //		  ktime.Timer.mu (for IntervalTimer) and Kernel.cpuClockMu
    24  //		    TaskSet.mu
    25  //		      SignalHandlers.mu
    26  //		        Task.mu
    27  //		    runningTasksMu
    28  //
    29  // Locking SignalHandlers.mu in multiple SignalHandlers requires locking
    30  // TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
    31  // time requires locking all of their signal mutexes first.
    32  package kernel
    33  
    34  import (
    35  	"errors"
    36  	"fmt"
    37  	"path/filepath"
    38  	"time"
    39  
    40  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    41  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    42  	"github.com/nicocha30/gvisor-ligolo/pkg/cleanup"
    43  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    44  	"github.com/nicocha30/gvisor-ligolo/pkg/cpuid"
    45  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    46  	"github.com/nicocha30/gvisor-ligolo/pkg/eventchannel"
    47  	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
    48  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    49  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    50  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/nsfs"
    51  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/pipefs"
    52  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/sockfs"
    53  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/timerfd"
    54  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/tmpfs"
    55  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/hostcpu"
    56  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/inet"
    57  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    58  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/futex"
    59  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/ipc"
    60  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/sched"
    61  	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time"
    62  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/limits"
    63  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/loader"
    64  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/mm"
    65  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
    66  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
    67  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netlink/port"
    68  	sentrytime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/time"
    69  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/unimpl"
    70  	uspb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
    71  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/uniqueid"
    72  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    73  	"github.com/nicocha30/gvisor-ligolo/pkg/state"
    74  	"github.com/nicocha30/gvisor-ligolo/pkg/state/wire"
    75  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    76  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip"
    77  )
    78  
    79  // IOUringEnabled is set to true when IO_URING is enabled. Added as a global to
    80  // allow easy access everywhere.
    81  var IOUringEnabled = false
    82  
    83  // userCounters is a set of user counters.
    84  //
    85  // +stateify savable
    86  type userCounters struct {
    87  	uid auth.KUID
    88  
    89  	rlimitNProc atomicbitops.Uint64
    90  }
    91  
    92  // incRLimitNProc increments the rlimitNProc counter.
    93  func (uc *userCounters) incRLimitNProc(ctx context.Context) error {
    94  	lim := limits.FromContext(ctx).Get(limits.ProcessCount)
    95  	creds := auth.CredentialsFromContext(ctx)
    96  	nproc := uc.rlimitNProc.Add(1)
    97  	if nproc > lim.Cur &&
    98  		!creds.HasCapability(linux.CAP_SYS_ADMIN) &&
    99  		!creds.HasCapability(linux.CAP_SYS_RESOURCE) {
   100  		uc.rlimitNProc.Add(^uint64(0))
   101  		return linuxerr.EAGAIN
   102  	}
   103  	return nil
   104  }
   105  
   106  // decRLimitNProc decrements the rlimitNProc counter.
   107  func (uc *userCounters) decRLimitNProc() {
   108  	uc.rlimitNProc.Add(^uint64(0))
   109  }
   110  
   111  // Kernel represents an emulated Linux kernel. It must be initialized by calling
   112  // Init() or LoadFrom().
   113  //
   114  // +stateify savable
   115  type Kernel struct {
   116  	// extMu serializes external changes to the Kernel with calls to
   117  	// Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
   118  	// remains frozen for the duration of the call; it requires that the Kernel
   119  	// is paused as a precondition, which ensures that none of the tasks
   120  	// running within the Kernel can affect its state, but extMu is required to
   121  	// ensure that concurrent users of the Kernel *outside* the Kernel's
   122  	// control cannot affect its state by calling e.g.
   123  	// Kernel.SendExternalSignal.)
   124  	extMu sync.Mutex `state:"nosave"`
   125  
   126  	// started is true if Start has been called. Unless otherwise specified,
   127  	// all Kernel fields become immutable once started becomes true.
   128  	started bool `state:"nosave"`
   129  
   130  	// All of the following fields are immutable unless otherwise specified.
   131  
   132  	// Platform is the platform that is used to execute tasks in the created
   133  	// Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is
   134  	// embedded anonymously (the same issue applies).
   135  	platform.Platform `state:"nosave"`
   136  
   137  	// mf provides application memory.
   138  	mf *pgalloc.MemoryFile `state:"nosave"`
   139  
   140  	// See InitKernelArgs for the meaning of these fields.
   141  	featureSet                  cpuid.FeatureSet
   142  	timekeeper                  *Timekeeper
   143  	tasks                       *TaskSet
   144  	rootUserNamespace           *auth.UserNamespace
   145  	rootNetworkNamespace        *inet.Namespace
   146  	applicationCores            uint
   147  	useHostCores                bool
   148  	extraAuxv                   []arch.AuxEntry
   149  	vdso                        *loader.VDSO
   150  	rootUTSNamespace            *UTSNamespace
   151  	rootIPCNamespace            *IPCNamespace
   152  	rootAbstractSocketNamespace *AbstractSocketNamespace
   153  
   154  	// futexes is the "root" futex.Manager, from which all others are forked.
   155  	// This is necessary to ensure that shared futexes are coherent across all
   156  	// tasks, including those created by CreateProcess.
   157  	futexes *futex.Manager
   158  
   159  	// globalInit is the thread group whose leader has ID 1 in the root PID
   160  	// namespace. globalInit is stored separately so that it is accessible even
   161  	// after all tasks in the thread group have exited, such that ID 1 is no
   162  	// longer mapped.
   163  	//
   164  	// globalInit is mutable until it is assigned by the first successful call
   165  	// to CreateProcess, and is protected by extMu.
   166  	globalInit *ThreadGroup
   167  
   168  	// syslog is the kernel log.
   169  	syslog syslog
   170  
   171  	runningTasksMu runningTasksMutex `state:"nosave"`
   172  
   173  	// runningTasks is the total count of tasks currently in
   174  	// TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are
   175  	// not blocked or stopped.
   176  	//
   177  	// runningTasks must be accessed atomically. Increments from 0 to 1 are
   178  	// further protected by runningTasksMu (see incRunningTasks).
   179  	runningTasks atomicbitops.Int64
   180  
   181  	// runningTasksCond is signaled when runningTasks is incremented from 0 to 1.
   182  	//
   183  	// Invariant: runningTasksCond.L == &runningTasksMu.
   184  	runningTasksCond sync.Cond `state:"nosave"`
   185  
   186  	// cpuClock is incremented every linux.ClockTick by a goroutine running
   187  	// kernel.runCPUClockTicker() while runningTasks != 0.
   188  	//
   189  	// cpuClock is used to measure task CPU usage, since sampling monotonicClock
   190  	// twice on every syscall turns out to be unreasonably expensive. This is
   191  	// similar to how Linux does task CPU accounting on x86
   192  	// (CONFIG_IRQ_TIME_ACCOUNTING), although Linux also uses scheduler timing
   193  	// information to improve resolution
   194  	// (kernel/sched/cputime.c:cputime_adjust()), which we can't do since
   195  	// "preeemptive" scheduling is managed by the Go runtime, which doesn't
   196  	// provide this information.
   197  	//
   198  	// cpuClock is mutable, and is accessed using atomic memory operations.
   199  	cpuClock atomicbitops.Uint64
   200  
   201  	// cpuClockTickTimer drives increments of cpuClock.
   202  	cpuClockTickTimer *time.Timer `state:"nosave"`
   203  
   204  	// cpuClockMu is used to make increments of cpuClock, and updates of timers
   205  	// based on cpuClock, atomic.
   206  	cpuClockMu cpuClockMutex `state:"nosave"`
   207  
   208  	// cpuClockTickerRunning is true if the goroutine that increments cpuClock is
   209  	// running and false if it is blocked in runningTasksCond.Wait() or if it
   210  	// never started.
   211  	//
   212  	// cpuClockTickerRunning is protected by runningTasksMu.
   213  	cpuClockTickerRunning bool
   214  
   215  	// cpuClockTickerWakeCh is sent to to wake the goroutine that increments
   216  	// cpuClock if it's sleeping between ticks.
   217  	cpuClockTickerWakeCh chan struct{} `state:"nosave"`
   218  
   219  	// cpuClockTickerStopCond is broadcast when cpuClockTickerRunning transitions
   220  	// from true to false.
   221  	//
   222  	// Invariant: cpuClockTickerStopCond.L == &runningTasksMu.
   223  	cpuClockTickerStopCond sync.Cond `state:"nosave"`
   224  
   225  	// uniqueID is used to generate unique identifiers.
   226  	//
   227  	// uniqueID is mutable, and is accessed using atomic memory operations.
   228  	uniqueID atomicbitops.Uint64
   229  
   230  	// nextInotifyCookie is a monotonically increasing counter used for
   231  	// generating unique inotify event cookies.
   232  	//
   233  	// nextInotifyCookie is mutable.
   234  	nextInotifyCookie atomicbitops.Uint32
   235  
   236  	// netlinkPorts manages allocation of netlink socket port IDs.
   237  	netlinkPorts *port.Manager
   238  
   239  	// saveStatus is nil if the sandbox has not been saved, errSaved or
   240  	// errAutoSaved if it has been saved successfully, or the error causing the
   241  	// sandbox to exit during save.
   242  	// It is protected by extMu.
   243  	saveStatus error `state:"nosave"`
   244  
   245  	// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
   246  	danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
   247  
   248  	// sockets records all network sockets in the system. Protected by extMu.
   249  	sockets map[*vfs.FileDescription]*SocketRecord
   250  
   251  	// nextSocketRecord is the next entry number to use in sockets. Protected
   252  	// by extMu.
   253  	nextSocketRecord uint64
   254  
   255  	// unimplementedSyscallEmitterOnce is used in the initialization of
   256  	// unimplementedSyscallEmitter.
   257  	unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`
   258  
   259  	// unimplementedSyscallEmitter is used to emit unimplemented syscall
   260  	// events. This is initialized lazily on the first unimplemented
   261  	// syscall.
   262  	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
   263  
   264  	// SpecialOpts contains special kernel options.
   265  	SpecialOpts
   266  
   267  	// vfs keeps the filesystem state used across the kernel.
   268  	vfs vfs.VirtualFilesystem
   269  
   270  	// hostMount is the Mount used for file descriptors that were imported
   271  	// from the host.
   272  	hostMount *vfs.Mount
   273  
   274  	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
   275  	// syscalls (as opposed to named pipes created by mknod()).
   276  	pipeMount *vfs.Mount
   277  
   278  	// nsfsMount is the Mount used for namespaces.
   279  	nsfsMount *vfs.Mount
   280  
   281  	// shmMount is the Mount used for anonymous files created by the
   282  	// memfd_create() syscalls. It is analogous to Linux's shm_mnt.
   283  	shmMount *vfs.Mount
   284  
   285  	// socketMount is the Mount used for sockets created by the socket() and
   286  	// socketpair() syscalls. There are several cases where a socket dentry will
   287  	// not be contained in socketMount:
   288  	// 1. Socket files created by mknod()
   289  	// 2. Socket fds imported from the host (Kernel.hostMount is used for these)
   290  	// 3. Socket files created by binding Unix sockets to a file path
   291  	socketMount *vfs.Mount
   292  
   293  	// sysVShmDevID is the device number used by SysV shm segments. In Linux,
   294  	// SysV shm uses shmem_file_setup() and thus uses shm_mnt's device number.
   295  	// In gVisor, the shm implementation does not use shmMount, extracting
   296  	// shmMount's device number is inconvenient, applications accept a
   297  	// different device number in practice, and using a distinct device number
   298  	// avoids the possibility of inode number collisions due to the hack
   299  	// described in shm.Shm.InodeID().
   300  	sysVShmDevID uint32
   301  
   302  	// If set to true, report address space activation waits as if the task is in
   303  	// external wait so that the watchdog doesn't report the task stuck.
   304  	SleepForAddressSpaceActivation bool
   305  
   306  	// Exceptions to YAMA ptrace restrictions. Each key-value pair represents a
   307  	// tracee-tracer relationship. The key is a process (technically, the thread
   308  	// group leader) that can be traced by any thread that is a descendant of the
   309  	// value. If the value is nil, then anyone can trace the process represented by
   310  	// the key.
   311  	//
   312  	// ptraceExceptions is protected by the TaskSet mutex.
   313  	ptraceExceptions map[*Task]*Task
   314  
   315  	// YAMAPtraceScope is the current level of YAMA ptrace restrictions.
   316  	YAMAPtraceScope atomicbitops.Int32
   317  
   318  	// cgroupRegistry contains the set of active cgroup controllers on the
   319  	// system. It is controller by cgroupfs. Nil if cgroupfs is unavailable on
   320  	// the system.
   321  	cgroupRegistry *CgroupRegistry
   322  
   323  	// userCountersMap maps auth.KUID into a set of user counters.
   324  	userCountersMap   map[auth.KUID]*userCounters
   325  	userCountersMapMu userCountersMutex `state:"nosave"`
   326  }
   327  
   328  // InitKernelArgs holds arguments to Init.
   329  type InitKernelArgs struct {
   330  	// FeatureSet is the emulated CPU feature set.
   331  	FeatureSet cpuid.FeatureSet
   332  
   333  	// Timekeeper manages time for all tasks in the system.
   334  	Timekeeper *Timekeeper
   335  
   336  	// RootUserNamespace is the root user namespace.
   337  	RootUserNamespace *auth.UserNamespace
   338  
   339  	// RootNetworkNamespace is the root network namespace. If nil, no networking
   340  	// will be available.
   341  	RootNetworkNamespace *inet.Namespace
   342  
   343  	// ApplicationCores is the number of logical CPUs visible to sandboxed
   344  	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
   345  	// ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
   346  	// most significant bit in cpu_possible_mask + 1.
   347  	ApplicationCores uint
   348  
   349  	// If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
   350  	// instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
   351  	// no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
   352  	// will be overridden.
   353  	UseHostCores bool
   354  
   355  	// ExtraAuxv contains additional auxiliary vector entries that are added to
   356  	// each process by the ELF loader.
   357  	ExtraAuxv []arch.AuxEntry
   358  
   359  	// Vdso holds the VDSO and its parameter page.
   360  	Vdso *loader.VDSO
   361  
   362  	// RootUTSNamespace is the root UTS namespace.
   363  	RootUTSNamespace *UTSNamespace
   364  
   365  	// RootIPCNamespace is the root IPC namespace.
   366  	RootIPCNamespace *IPCNamespace
   367  
   368  	// RootAbstractSocketNamespace is the root Abstract Socket namespace.
   369  	RootAbstractSocketNamespace *AbstractSocketNamespace
   370  
   371  	// PIDNamespace is the root PID namespace.
   372  	PIDNamespace *PIDNamespace
   373  }
   374  
   375  // Init initialize the Kernel with no tasks.
   376  //
   377  // Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
   378  // before calling Init.
   379  func (k *Kernel) Init(args InitKernelArgs) error {
   380  	if args.Timekeeper == nil {
   381  		return fmt.Errorf("args.Timekeeper is nil")
   382  	}
   383  	if args.Timekeeper.clocks == nil {
   384  		return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
   385  	}
   386  	if args.RootUserNamespace == nil {
   387  		return fmt.Errorf("args.RootUserNamespace is nil")
   388  	}
   389  	if args.ApplicationCores == 0 {
   390  		return fmt.Errorf("args.ApplicationCores is 0")
   391  	}
   392  
   393  	k.featureSet = args.FeatureSet
   394  	k.timekeeper = args.Timekeeper
   395  	k.tasks = newTaskSet(args.PIDNamespace)
   396  	k.rootUserNamespace = args.RootUserNamespace
   397  	k.rootUTSNamespace = args.RootUTSNamespace
   398  	k.rootIPCNamespace = args.RootIPCNamespace
   399  	k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
   400  	k.rootNetworkNamespace = args.RootNetworkNamespace
   401  	if k.rootNetworkNamespace == nil {
   402  		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil, args.RootUserNamespace)
   403  	}
   404  	k.runningTasksCond.L = &k.runningTasksMu
   405  	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
   406  	k.cpuClockTickerStopCond.L = &k.runningTasksMu
   407  	k.applicationCores = args.ApplicationCores
   408  	if args.UseHostCores {
   409  		k.useHostCores = true
   410  		maxCPU, err := hostcpu.MaxPossibleCPU()
   411  		if err != nil {
   412  			return fmt.Errorf("failed to get maximum CPU number: %v", err)
   413  		}
   414  		minAppCores := uint(maxCPU) + 1
   415  		if k.applicationCores < minAppCores {
   416  			log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
   417  			k.applicationCores = minAppCores
   418  		}
   419  	}
   420  	k.extraAuxv = args.ExtraAuxv
   421  	k.vdso = args.Vdso
   422  	k.futexes = futex.NewManager()
   423  	k.netlinkPorts = port.New()
   424  	k.ptraceExceptions = make(map[*Task]*Task)
   425  	k.YAMAPtraceScope = atomicbitops.FromInt32(linux.YAMA_SCOPE_RELATIONAL)
   426  	k.userCountersMap = make(map[auth.KUID]*userCounters)
   427  
   428  	ctx := k.SupervisorContext()
   429  	if err := k.vfs.Init(ctx); err != nil {
   430  		return fmt.Errorf("failed to initialize VFS: %v", err)
   431  	}
   432  
   433  	err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx))
   434  	if err != nil {
   435  		return fmt.Errorf("failed to create mqfs filesystem: %v", err)
   436  	}
   437  
   438  	pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
   439  	if err != nil {
   440  		return fmt.Errorf("failed to create pipefs filesystem: %v", err)
   441  	}
   442  	defer pipeFilesystem.DecRef(ctx)
   443  	pipeMount := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
   444  	k.pipeMount = pipeMount
   445  
   446  	nsfsFilesystem, err := nsfs.NewFilesystem(&k.vfs)
   447  	if err != nil {
   448  		return fmt.Errorf("failed to create nsfs filesystem: %v", err)
   449  	}
   450  	defer nsfsFilesystem.DecRef(ctx)
   451  	nsfsMount := k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
   452  	k.nsfsMount = nsfsMount
   453  	k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, nsfsMount, k.rootNetworkNamespace))
   454  
   455  	tmpfsOpts := vfs.GetFilesystemOptions{
   456  		InternalData: tmpfs.FilesystemOpts{
   457  			// See mm/shmem.c:shmem_init() => vfs_kern_mount(flags=SB_KERNMOUNT).
   458  			// Note how mm/shmem.c:shmem_fill_super() does not provide a default
   459  			// value for sbinfo->max_blocks when SB_KERNMOUNT is set.
   460  			DisableDefaultSizeLimit: true,
   461  		},
   462  	}
   463  	tmpfsFilesystem, tmpfsRoot, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace), "", tmpfsOpts)
   464  	if err != nil {
   465  		return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
   466  	}
   467  	defer tmpfsFilesystem.DecRef(ctx)
   468  	defer tmpfsRoot.DecRef(ctx)
   469  	k.shmMount = k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})
   470  
   471  	socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
   472  	if err != nil {
   473  		return fmt.Errorf("failed to create sockfs filesystem: %v", err)
   474  	}
   475  	defer socketFilesystem.DecRef(ctx)
   476  	k.socketMount = k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
   477  
   478  	sysVShmDevMinor, err := k.vfs.GetAnonBlockDevMinor()
   479  	if err != nil {
   480  		return fmt.Errorf("failed to get device number for SysV shm: %v", err)
   481  	}
   482  	k.sysVShmDevID = linux.MakeDeviceID(linux.UNNAMED_MAJOR, sysVShmDevMinor)
   483  
   484  	k.sockets = make(map[*vfs.FileDescription]*SocketRecord)
   485  
   486  	k.cgroupRegistry = newCgroupRegistry()
   487  	return nil
   488  }
   489  
   490  // SaveTo saves the state of k to w.
   491  //
   492  // Preconditions: The kernel must be paused throughout the call to SaveTo.
   493  func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
   494  	saveStart := time.Now()
   495  
   496  	// Do not allow other Kernel methods to affect it while it's being saved.
   497  	k.extMu.Lock()
   498  	defer k.extMu.Unlock()
   499  
   500  	// Stop time.
   501  	k.pauseTimeLocked(ctx)
   502  	defer k.resumeTimeLocked(ctx)
   503  
   504  	// Evict all evictable MemoryFile allocations.
   505  	k.mf.StartEvictions()
   506  	k.mf.WaitForEvictions()
   507  
   508  	// Discard unsavable mappings, such as those for host file descriptors.
   509  	if err := k.invalidateUnsavableMappings(ctx); err != nil {
   510  		return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
   511  	}
   512  
   513  	// Prepare filesystems for saving. This must be done after
   514  	// invalidateUnsavableMappings(), since dropping memory mappings may
   515  	// affect filesystem state (e.g. page cache reference counts).
   516  	if err := k.vfs.PrepareSave(ctx); err != nil {
   517  		return err
   518  	}
   519  
   520  	// Save the CPUID FeatureSet before the rest of the kernel so we can
   521  	// verify its compatibility on restore before attempting to restore the
   522  	// entire kernel, which may fail on an incompatible machine.
   523  	//
   524  	// N.B. This will also be saved along with the full kernel save below.
   525  	cpuidStart := time.Now()
   526  	if _, err := state.Save(ctx, w, &k.featureSet); err != nil {
   527  		return err
   528  	}
   529  	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
   530  
   531  	// Save the timekeeper's state.
   532  
   533  	if rootNS := k.rootNetworkNamespace; rootNS != nil && rootNS.Stack() != nil {
   534  		// Pause the network stack.
   535  		netstackPauseStart := time.Now()
   536  		log.Infof("Pausing root network namespace")
   537  		k.rootNetworkNamespace.Stack().Pause()
   538  		defer k.rootNetworkNamespace.Stack().Resume()
   539  		log.Infof("Pausing root network namespace took [%s].", time.Since(netstackPauseStart))
   540  	}
   541  
   542  	// Save the kernel state.
   543  	kernelStart := time.Now()
   544  	stats, err := state.Save(ctx, w, k)
   545  	if err != nil {
   546  		return err
   547  	}
   548  	log.Infof("Kernel save stats: %s", stats.String())
   549  	log.Infof("Kernel save took [%s].", time.Since(kernelStart))
   550  
   551  	// Save the memory file's state.
   552  	memoryStart := time.Now()
   553  	if err := k.mf.SaveTo(ctx, w); err != nil {
   554  		return err
   555  	}
   556  	log.Infof("Memory save took [%s].", time.Since(memoryStart))
   557  
   558  	log.Infof("Overall save took [%s].", time.Since(saveStart))
   559  
   560  	return nil
   561  }
   562  
   563  // Preconditions: The kernel must be paused.
   564  func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
   565  	invalidated := make(map[*mm.MemoryManager]struct{})
   566  	k.tasks.mu.RLock()
   567  	defer k.tasks.mu.RUnlock()
   568  	for t := range k.tasks.Root.tids {
   569  		// We can skip locking Task.mu here since the kernel is paused.
   570  		if memMgr := t.image.MemoryManager; memMgr != nil {
   571  			if _, ok := invalidated[memMgr]; !ok {
   572  				if err := memMgr.InvalidateUnsavable(ctx); err != nil {
   573  					return err
   574  				}
   575  				invalidated[memMgr] = struct{}{}
   576  			}
   577  		}
   578  		// I really wish we just had a sync.Map of all MMs...
   579  		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
   580  			if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
   581  				return err
   582  			}
   583  		}
   584  	}
   585  	return nil
   586  }
   587  
   588  // LoadFrom returns a new Kernel loaded from args.
   589  func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
   590  	loadStart := time.Now()
   591  
   592  	k.runningTasksCond.L = &k.runningTasksMu
   593  	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
   594  	k.cpuClockTickerStopCond.L = &k.runningTasksMu
   595  
   596  	initAppCores := k.applicationCores
   597  
   598  	// Load the pre-saved CPUID FeatureSet.
   599  	//
   600  	// N.B. This was also saved along with the full kernel below, so we
   601  	// don't need to explicitly install it in the Kernel.
   602  	cpuidStart := time.Now()
   603  	if _, err := state.Load(ctx, r, &k.featureSet); err != nil {
   604  		return err
   605  	}
   606  	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
   607  
   608  	// Verify that the FeatureSet is usable on this host. We do this before
   609  	// Kernel load so that the explicit CPUID mismatch error has priority
   610  	// over floating point state restore errors that may occur on load on
   611  	// an incompatible machine.
   612  	if err := k.featureSet.CheckHostCompatible(); err != nil {
   613  		return err
   614  	}
   615  
   616  	// Load the kernel state.
   617  	kernelStart := time.Now()
   618  	stats, err := state.Load(ctx, r, k)
   619  	if err != nil {
   620  		return err
   621  	}
   622  	log.Infof("Kernel load stats: %s", stats.String())
   623  	log.Infof("Kernel load took [%s].", time.Since(kernelStart))
   624  
   625  	// rootNetworkNamespace should be populated after loading the state file.
   626  	// Restore the root network stack.
   627  	k.rootNetworkNamespace.RestoreRootStack(net)
   628  
   629  	// Load the memory file's state.
   630  	memoryStart := time.Now()
   631  	if err := k.mf.LoadFrom(ctx, r); err != nil {
   632  		return err
   633  	}
   634  	log.Infof("Memory load took [%s].", time.Since(memoryStart))
   635  
   636  	log.Infof("Overall load took [%s]", time.Since(loadStart))
   637  
   638  	k.Timekeeper().SetClocks(clocks)
   639  
   640  	if timeReady != nil {
   641  		close(timeReady)
   642  	}
   643  
   644  	if net != nil {
   645  		net.Resume()
   646  	}
   647  
   648  	if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
   649  		return err
   650  	}
   651  
   652  	tcpip.AsyncLoading.Wait()
   653  
   654  	log.Infof("Overall load took [%s] after async work", time.Since(loadStart))
   655  
   656  	// Applications may size per-cpu structures based on k.applicationCores, so
   657  	// it can't change across save/restore. When we are virtualizing CPU
   658  	// numbers, this isn't a problem. However, when we are exposing host CPU
   659  	// assignments, we can't tolerate an increase in the number of host CPUs,
   660  	// which could result in getcpu(2) returning CPUs that applications expect
   661  	// not to exist.
   662  	if k.useHostCores && initAppCores > k.applicationCores {
   663  		return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
   664  	}
   665  
   666  	return nil
   667  }
   668  
   669  // UniqueID returns a unique identifier.
   670  func (k *Kernel) UniqueID() uint64 {
   671  	id := k.uniqueID.Add(1)
   672  	if id == 0 {
   673  		panic("unique identifier generator wrapped around")
   674  	}
   675  	return id
   676  }
   677  
   678  // CreateProcessArgs holds arguments to kernel.CreateProcess.
   679  type CreateProcessArgs struct {
   680  	// Filename is the filename to load as the init binary.
   681  	//
   682  	// If this is provided as "", File will be checked, then the file will be
   683  	// guessed via Argv[0].
   684  	Filename string
   685  
   686  	// File is a passed host FD pointing to a file to load as the init binary.
   687  	//
   688  	// This is checked if and only if Filename is "".
   689  	File *vfs.FileDescription
   690  
   691  	// Argv is a list of arguments.
   692  	Argv []string
   693  
   694  	// Envv is a list of environment variables.
   695  	Envv []string
   696  
   697  	// WorkingDirectory is the initial working directory.
   698  	//
   699  	// This defaults to the root if empty.
   700  	WorkingDirectory string
   701  
   702  	// Credentials is the initial credentials.
   703  	Credentials *auth.Credentials
   704  
   705  	// FDTable is the initial set of file descriptors. If CreateProcess succeeds,
   706  	// it takes a reference on FDTable.
   707  	FDTable *FDTable
   708  
   709  	// Umask is the initial umask.
   710  	Umask uint
   711  
   712  	// Limits are the initial resource limits.
   713  	Limits *limits.LimitSet
   714  
   715  	// MaxSymlinkTraversals is the maximum number of symlinks to follow
   716  	// during resolution.
   717  	MaxSymlinkTraversals uint
   718  
   719  	// UTSNamespace is the initial UTS namespace.
   720  	UTSNamespace *UTSNamespace
   721  
   722  	// IPCNamespace is the initial IPC namespace.
   723  	IPCNamespace *IPCNamespace
   724  
   725  	// PIDNamespace is the initial PID Namespace.
   726  	PIDNamespace *PIDNamespace
   727  
   728  	// AbstractSocketNamespace is the initial Abstract Socket namespace.
   729  	AbstractSocketNamespace *AbstractSocketNamespace
   730  
   731  	// MountNamespace optionally contains the mount namespace for this
   732  	// process. If nil, the init process's mount namespace is used.
   733  	//
   734  	// Anyone setting MountNamespace must donate a reference (i.e.
   735  	// increment it).
   736  	MountNamespace *vfs.MountNamespace
   737  
   738  	// ContainerID is the container that the process belongs to.
   739  	ContainerID string
   740  
   741  	// InitialCgroups are the cgroups the container is initialized to.
   742  	InitialCgroups map[Cgroup]struct{}
   743  }
   744  
   745  // NewContext returns a context.Context that represents the task that will be
   746  // created by args.NewContext(k).
   747  func (args *CreateProcessArgs) NewContext(k *Kernel) context.Context {
   748  	return &createProcessContext{
   749  		Context: context.Background(),
   750  		kernel:  k,
   751  		args:    args,
   752  	}
   753  }
   754  
   755  // createProcessContext is a context.Context that represents the context
   756  // associated with a task that is being created.
   757  type createProcessContext struct {
   758  	context.Context
   759  	kernel *Kernel
   760  	args   *CreateProcessArgs
   761  }
   762  
   763  // Value implements context.Context.Value.
   764  func (ctx *createProcessContext) Value(key any) any {
   765  	switch key {
   766  	case CtxKernel:
   767  		return ctx.kernel
   768  	case CtxPIDNamespace:
   769  		return ctx.args.PIDNamespace
   770  	case CtxUTSNamespace:
   771  		return ctx.args.UTSNamespace
   772  	case ipc.CtxIPCNamespace:
   773  		ipcns := ctx.args.IPCNamespace
   774  		ipcns.IncRef()
   775  		return ipcns
   776  	case auth.CtxCredentials:
   777  		return ctx.args.Credentials
   778  	case vfs.CtxRoot:
   779  		if ctx.args.MountNamespace == nil {
   780  			return nil
   781  		}
   782  		root := ctx.args.MountNamespace.Root()
   783  		root.IncRef()
   784  		return root
   785  	case vfs.CtxMountNamespace:
   786  		if ctx.kernel.globalInit == nil {
   787  			return nil
   788  		}
   789  		mntns := ctx.kernel.GlobalInit().Leader().MountNamespace()
   790  		mntns.IncRef()
   791  		return mntns
   792  	case inet.CtxStack:
   793  		return ctx.kernel.RootNetworkNamespace().Stack()
   794  	case ktime.CtxRealtimeClock:
   795  		return ctx.kernel.RealtimeClock()
   796  	case limits.CtxLimits:
   797  		return ctx.args.Limits
   798  	case pgalloc.CtxMemoryCgroupID:
   799  		return ctx.getMemoryCgroupID()
   800  	case pgalloc.CtxMemoryFile:
   801  		return ctx.kernel.mf
   802  	case pgalloc.CtxMemoryFileProvider:
   803  		return ctx.kernel
   804  	case platform.CtxPlatform:
   805  		return ctx.kernel
   806  	case uniqueid.CtxGlobalUniqueID:
   807  		return ctx.kernel.UniqueID()
   808  	case uniqueid.CtxGlobalUniqueIDProvider:
   809  		return ctx.kernel
   810  	case uniqueid.CtxInotifyCookie:
   811  		return ctx.kernel.GenerateInotifyCookie()
   812  	case unimpl.CtxEvents:
   813  		return ctx.kernel
   814  	default:
   815  		return nil
   816  	}
   817  }
   818  
   819  func (ctx *createProcessContext) getMemoryCgroupID() uint32 {
   820  	for cg := range ctx.args.InitialCgroups {
   821  		for _, ctl := range cg.Controllers() {
   822  			if ctl.Type() == CgroupControllerMemory {
   823  				return cg.ID()
   824  			}
   825  		}
   826  	}
   827  	return InvalidCgroupID
   828  }
   829  
   830  // CreateProcess creates a new task in a new thread group with the given
   831  // options. The new task has no parent and is in the root PID namespace.
   832  //
   833  // If k.Start() has already been called, then the created process must be
   834  // started by calling kernel.StartProcess(tg).
   835  //
   836  // If k.Start() has not yet been called, then the created task will begin
   837  // running when k.Start() is called.
   838  //
   839  // CreateProcess has no analogue in Linux; it is used to create the initial
   840  // application task, as well as processes started by the control server.
   841  func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
   842  	k.extMu.Lock()
   843  	defer k.extMu.Unlock()
   844  	log.Infof("EXEC: %v", args.Argv)
   845  
   846  	ctx := args.NewContext(k)
   847  	mntns := args.MountNamespace
   848  	if mntns == nil {
   849  		if k.globalInit == nil {
   850  			return nil, 0, fmt.Errorf("mount namespace is nil")
   851  		}
   852  		// Add a reference to the namespace, which is transferred to the new process.
   853  		mntns = k.globalInit.Leader().MountNamespace()
   854  		mntns.IncRef()
   855  	}
   856  	// Get the root directory from the MountNamespace.
   857  	root := mntns.Root()
   858  	root.IncRef()
   859  	defer root.DecRef(ctx)
   860  
   861  	// Grab the working directory.
   862  	wd := root // Default.
   863  	if args.WorkingDirectory != "" {
   864  		pop := vfs.PathOperation{
   865  			Root:               root,
   866  			Start:              wd,
   867  			Path:               fspath.Parse(args.WorkingDirectory),
   868  			FollowFinalSymlink: true,
   869  		}
   870  		// NOTE(b/236028361): Do not set CheckSearchable flag to true.
   871  		// Application is allowed to start with a working directory that it can
   872  		// not access/search. This is consistent with Docker and VFS1. Runc
   873  		// explicitly allows for this in 6ce2d63a5db6 ("libct/init_linux: retry
   874  		// chdir to fix EPERM"). As described in the commit, runc unintentionally
   875  		// allowed this behavior in a couple of releases and applications started
   876  		// relying on it. So they decided to allow it for backward compatibility.
   877  		var err error
   878  		wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{})
   879  		if err != nil {
   880  			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
   881  		}
   882  		defer wd.DecRef(ctx)
   883  	}
   884  	fsContext := NewFSContext(root, wd, args.Umask)
   885  
   886  	tg := k.NewThreadGroup(args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
   887  	cu := cleanup.Make(func() {
   888  		tg.Release(ctx)
   889  	})
   890  	defer cu.Clean()
   891  
   892  	// Check which file to start from.
   893  	switch {
   894  	case args.Filename != "":
   895  		// If a filename is given, take that.
   896  		// Set File to nil so we resolve the path in LoadTaskImage.
   897  		args.File = nil
   898  	case args.File != nil:
   899  		// If File is set, take the File provided directly.
   900  		args.Filename = args.File.MappedName(ctx)
   901  	default:
   902  		// Otherwise look at Argv and see if the first argument is a valid path.
   903  		if len(args.Argv) == 0 {
   904  			return nil, 0, fmt.Errorf("no filename or command provided")
   905  		}
   906  		if !filepath.IsAbs(args.Argv[0]) {
   907  			return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
   908  		}
   909  		args.Filename = args.Argv[0]
   910  	}
   911  
   912  	// Create a fresh task context.
   913  	remainingTraversals := args.MaxSymlinkTraversals
   914  	loadArgs := loader.LoadArgs{
   915  		Root:                root,
   916  		WorkingDir:          wd,
   917  		RemainingTraversals: &remainingTraversals,
   918  		ResolveFinal:        true,
   919  		Filename:            args.Filename,
   920  		File:                args.File,
   921  		CloseOnExec:         false,
   922  		Argv:                args.Argv,
   923  		Envv:                args.Envv,
   924  		Features:            k.featureSet,
   925  	}
   926  
   927  	image, se := k.LoadTaskImage(ctx, loadArgs)
   928  	if se != nil {
   929  		return nil, 0, errors.New(se.String())
   930  	}
   931  
   932  	// Take a reference on the FDTable, which will be transferred to
   933  	// TaskSet.NewTask().
   934  	args.FDTable.IncRef()
   935  
   936  	// Create the task.
   937  	config := &TaskConfig{
   938  		Kernel:                  k,
   939  		ThreadGroup:             tg,
   940  		TaskImage:               image,
   941  		FSContext:               fsContext,
   942  		FDTable:                 args.FDTable,
   943  		Credentials:             args.Credentials,
   944  		NetworkNamespace:        k.RootNetworkNamespace(),
   945  		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
   946  		UTSNamespace:            args.UTSNamespace,
   947  		IPCNamespace:            args.IPCNamespace,
   948  		AbstractSocketNamespace: args.AbstractSocketNamespace,
   949  		MountNamespace:          mntns,
   950  		ContainerID:             args.ContainerID,
   951  		InitialCgroups:          args.InitialCgroups,
   952  		UserCounters:            k.GetUserCounters(args.Credentials.RealKUID),
   953  	}
   954  	config.NetworkNamespace.IncRef()
   955  	t, err := k.tasks.NewTask(ctx, config)
   956  	if err != nil {
   957  		return nil, 0, err
   958  	}
   959  	t.traceExecEvent(image) // Simulate exec for tracing.
   960  
   961  	// Success.
   962  	cu.Release()
   963  	tgid := k.tasks.Root.IDOfThreadGroup(tg)
   964  	if k.globalInit == nil {
   965  		k.globalInit = tg
   966  	}
   967  	return tg, tgid, nil
   968  }
   969  
   970  // StartProcess starts running a process that was created with CreateProcess.
   971  func (k *Kernel) StartProcess(tg *ThreadGroup) {
   972  	t := tg.Leader()
   973  	tid := k.tasks.Root.IDOfTask(t)
   974  	t.Start(tid)
   975  }
   976  
   977  // Start starts execution of all tasks in k.
   978  //
   979  // Preconditions: Start may be called exactly once.
   980  func (k *Kernel) Start() error {
   981  	k.extMu.Lock()
   982  	defer k.extMu.Unlock()
   983  
   984  	if k.started {
   985  		return fmt.Errorf("kernel already started")
   986  	}
   987  
   988  	k.started = true
   989  	k.cpuClockTickTimer = time.NewTimer(linux.ClockTick)
   990  	k.runningTasksMu.Lock()
   991  	k.cpuClockTickerRunning = true
   992  	k.runningTasksMu.Unlock()
   993  	go k.runCPUClockTicker()
   994  	// If k was created by LoadKernelFrom, timers were stopped during
   995  	// Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
   996  	// this is a no-op.
   997  	k.resumeTimeLocked(k.SupervisorContext())
   998  	k.tasks.mu.RLock()
   999  	ts := make([]*Task, 0, len(k.tasks.Root.tids))
  1000  	for t := range k.tasks.Root.tids {
  1001  		ts = append(ts, t)
  1002  	}
  1003  	k.tasks.mu.RUnlock()
  1004  	// Start task goroutines.
  1005  	// NOTE(b/235349091): We don't actually need the TaskSet mutex, we just
  1006  	// need to make sure we only call t.Start() once for each task. Holding the
  1007  	// mutex for each task start may cause a nested locking error.
  1008  	for _, t := range ts {
  1009  		t.Start(t.ThreadID())
  1010  	}
  1011  	return nil
  1012  }
  1013  
  1014  // pauseTimeLocked pauses all Timers and Timekeeper updates.
  1015  //
  1016  // Preconditions:
  1017  //   - Any task goroutines running in k must be stopped.
  1018  //   - k.extMu must be locked.
  1019  func (k *Kernel) pauseTimeLocked(ctx context.Context) {
  1020  	// Since all task goroutines have been stopped by precondition, the CPU clock
  1021  	// ticker should stop on its own; wait for it to do so, waking it up from
  1022  	// sleeping betwen ticks if necessary.
  1023  	k.runningTasksMu.Lock()
  1024  	for k.cpuClockTickerRunning {
  1025  		select {
  1026  		case k.cpuClockTickerWakeCh <- struct{}{}:
  1027  		default:
  1028  		}
  1029  		k.cpuClockTickerStopCond.Wait()
  1030  	}
  1031  	k.runningTasksMu.Unlock()
  1032  
  1033  	// By precondition, nothing else can be interacting with PIDNamespace.tids
  1034  	// or FDTable.files, so we can iterate them without synchronization. (We
  1035  	// can't hold the TaskSet mutex when pausing thread group timers because
  1036  	// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
  1037  	// mutex, while holding the Timer mutex.)
  1038  	for t := range k.tasks.Root.tids {
  1039  		if t == t.tg.leader {
  1040  			t.tg.itimerRealTimer.Pause()
  1041  			for _, it := range t.tg.timers {
  1042  				it.PauseTimer()
  1043  			}
  1044  		}
  1045  		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
  1046  		// but ktime.Timer.Pause is idempotent so this is harmless.
  1047  		if t.fdTable != nil {
  1048  			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
  1049  				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
  1050  					tfd.PauseTimer()
  1051  				}
  1052  			})
  1053  		}
  1054  	}
  1055  	k.timekeeper.PauseUpdates()
  1056  }
  1057  
  1058  // resumeTimeLocked resumes all Timers and Timekeeper updates. If
  1059  // pauseTimeLocked has not been previously called, resumeTimeLocked has no
  1060  // effect.
  1061  //
  1062  // Preconditions:
  1063  //   - Any task goroutines running in k must be stopped.
  1064  //   - k.extMu must be locked.
  1065  func (k *Kernel) resumeTimeLocked(ctx context.Context) {
  1066  	// The CPU clock ticker will automatically resume as task goroutines resume
  1067  	// execution.
  1068  
  1069  	k.timekeeper.ResumeUpdates()
  1070  	for t := range k.tasks.Root.tids {
  1071  		if t == t.tg.leader {
  1072  			t.tg.itimerRealTimer.Resume()
  1073  			for _, it := range t.tg.timers {
  1074  				it.ResumeTimer()
  1075  			}
  1076  		}
  1077  		if t.fdTable != nil {
  1078  			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
  1079  				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
  1080  					tfd.ResumeTimer()
  1081  				}
  1082  			})
  1083  		}
  1084  	}
  1085  }
  1086  
  1087  func (k *Kernel) incRunningTasks() {
  1088  	for {
  1089  		tasks := k.runningTasks.Load()
  1090  		if tasks != 0 {
  1091  			// Standard case. Simply increment.
  1092  			if !k.runningTasks.CompareAndSwap(tasks, tasks+1) {
  1093  				continue
  1094  			}
  1095  			return
  1096  		}
  1097  
  1098  		// Transition from 0 -> 1.
  1099  		k.runningTasksMu.Lock()
  1100  		if k.runningTasks.Load() != 0 {
  1101  			// Raced with another transition and lost.
  1102  			k.runningTasks.Add(1)
  1103  			k.runningTasksMu.Unlock()
  1104  			return
  1105  		}
  1106  		if !k.cpuClockTickerRunning {
  1107  			select {
  1108  			case tickTime := <-k.cpuClockTickTimer.C:
  1109  				// Rearm the timer since we consumed the wakeup. Estimate how much time
  1110  				// remains on the current tick so that periodic workloads interact with
  1111  				// the (periodic) CPU clock ticker in the same way that they would
  1112  				// without the optimization of putting the ticker to sleep.
  1113  				missedNS := time.Since(tickTime).Nanoseconds()
  1114  				missedTicks := missedNS / linux.ClockTick.Nanoseconds()
  1115  				thisTickNS := missedNS - missedTicks*linux.ClockTick.Nanoseconds()
  1116  				k.cpuClockTickTimer.Reset(time.Duration(linux.ClockTick.Nanoseconds() - thisTickNS))
  1117  				// Increment k.cpuClock on the CPU clock ticker goroutine's behalf.
  1118  				// (Whole missed ticks don't matter, and adding them to k.cpuClock will
  1119  				// just confuse the watchdog.) At the time the tick occurred, all task
  1120  				// goroutines were asleep, so there's nothing else to do. This ensures
  1121  				// that our caller (Task.accountTaskGoroutineLeave()) records an
  1122  				// updated k.cpuClock in Task.gosched.Timestamp, so that it's correctly
  1123  				// accounted as having resumed execution in the sentry during this tick
  1124  				// instead of at the end of the previous one.
  1125  				k.cpuClock.Add(1)
  1126  			default:
  1127  			}
  1128  			// We are transitioning from idle to active. Set k.cpuClockTickerRunning
  1129  			// = true here so that if we transition to idle and then active again
  1130  			// before the CPU clock ticker goroutine has a chance to run, the first
  1131  			// call to k.incRunningTasks() at the end of that cycle does not try to
  1132  			// steal k.cpuClockTickTimer.C again, as this would allow workloads that
  1133  			// rapidly cycle between idle and active to starve the CPU clock ticker
  1134  			// of chances to observe task goroutines in a running state and account
  1135  			// their CPU usage.
  1136  			k.cpuClockTickerRunning = true
  1137  			k.runningTasksCond.Signal()
  1138  		}
  1139  		// This store must happen after the increment of k.cpuClock above to ensure
  1140  		// that concurrent calls to Task.accountTaskGoroutineLeave() also observe
  1141  		// the updated k.cpuClock.
  1142  		k.runningTasks.Store(1)
  1143  		k.runningTasksMu.Unlock()
  1144  		return
  1145  	}
  1146  }
  1147  
  1148  func (k *Kernel) decRunningTasks() {
  1149  	tasks := k.runningTasks.Add(-1)
  1150  	if tasks < 0 {
  1151  		panic(fmt.Sprintf("Invalid running count %d", tasks))
  1152  	}
  1153  
  1154  	// Nothing to do. The next CPU clock tick will disable the timer if
  1155  	// there is still nothing running. This provides approximately one tick
  1156  	// of slack in which we can switch back and forth between idle and
  1157  	// active without an expensive transition.
  1158  }
  1159  
  1160  // WaitExited blocks until all tasks in k have exited.
  1161  func (k *Kernel) WaitExited() {
  1162  	k.tasks.liveGoroutines.Wait()
  1163  }
  1164  
  1165  // Kill requests that all tasks in k immediately exit as if group exiting with
  1166  // status ws. Kill does not wait for tasks to exit.
  1167  func (k *Kernel) Kill(ws linux.WaitStatus) {
  1168  	k.extMu.Lock()
  1169  	defer k.extMu.Unlock()
  1170  	k.tasks.Kill(ws)
  1171  }
  1172  
  1173  // Pause requests that all tasks in k temporarily stop executing, and blocks
  1174  // until all tasks and asynchronous I/O operations in k have stopped. Multiple
  1175  // calls to Pause nest and require an equal number of calls to Unpause to
  1176  // resume execution.
  1177  func (k *Kernel) Pause() {
  1178  	k.extMu.Lock()
  1179  	k.tasks.BeginExternalStop()
  1180  	k.extMu.Unlock()
  1181  	k.tasks.runningGoroutines.Wait()
  1182  	k.tasks.aioGoroutines.Wait()
  1183  }
  1184  
  1185  // ReceiveTaskStates receives full states for all tasks.
  1186  func (k *Kernel) ReceiveTaskStates() {
  1187  	k.extMu.Lock()
  1188  	k.tasks.PullFullState()
  1189  	k.extMu.Unlock()
  1190  }
  1191  
  1192  // Unpause ends the effect of a previous call to Pause. If Unpause is called
  1193  // without a matching preceding call to Pause, Unpause may panic.
  1194  func (k *Kernel) Unpause() {
  1195  	k.extMu.Lock()
  1196  	defer k.extMu.Unlock()
  1197  	k.tasks.EndExternalStop()
  1198  }
  1199  
  1200  // SendExternalSignal injects a signal into the kernel.
  1201  //
  1202  // context is used only for debugging to describe how the signal was received.
  1203  //
  1204  // Preconditions: Kernel must have an init process.
  1205  func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) {
  1206  	k.extMu.Lock()
  1207  	defer k.extMu.Unlock()
  1208  	k.sendExternalSignal(info, context)
  1209  }
  1210  
  1211  // SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup.
  1212  // This function doesn't skip signals like SendExternalSignal does.
  1213  func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error {
  1214  	k.extMu.Lock()
  1215  	defer k.extMu.Unlock()
  1216  	return tg.SendSignal(info)
  1217  }
  1218  
  1219  // SendContainerSignal sends the given signal to all processes inside the
  1220  // namespace that match the given container ID.
  1221  func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error {
  1222  	k.extMu.Lock()
  1223  	defer k.extMu.Unlock()
  1224  	k.tasks.mu.RLock()
  1225  	defer k.tasks.mu.RUnlock()
  1226  
  1227  	var lastErr error
  1228  	for tg := range k.tasks.Root.tgids {
  1229  		if tg.leader.ContainerID() == cid {
  1230  			tg.signalHandlers.mu.Lock()
  1231  			infoCopy := *info
  1232  			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
  1233  				lastErr = err
  1234  			}
  1235  			tg.signalHandlers.mu.Unlock()
  1236  		}
  1237  	}
  1238  	return lastErr
  1239  }
  1240  
  1241  // RebuildTraceContexts rebuilds the trace context for all tasks.
  1242  //
  1243  // Unfortunately, if these are built while tracing is not enabled, then we will
  1244  // not have meaningful trace data. Rebuilding here ensures that we can do so
  1245  // after tracing has been enabled.
  1246  func (k *Kernel) RebuildTraceContexts() {
  1247  	// We need to pause all task goroutines because Task.rebuildTraceContext()
  1248  	// replaces Task.traceContext and Task.traceTask, which are
  1249  	// task-goroutine-exclusive (i.e. the task goroutine assumes that it can
  1250  	// access them without synchronization) for performance.
  1251  	k.Pause()
  1252  	defer k.Unpause()
  1253  
  1254  	k.extMu.Lock()
  1255  	defer k.extMu.Unlock()
  1256  	k.tasks.mu.RLock()
  1257  	defer k.tasks.mu.RUnlock()
  1258  
  1259  	for t, tid := range k.tasks.Root.tids {
  1260  		t.rebuildTraceContext(tid)
  1261  	}
  1262  }
  1263  
  1264  // FeatureSet returns the FeatureSet.
  1265  func (k *Kernel) FeatureSet() cpuid.FeatureSet {
  1266  	return k.featureSet
  1267  }
  1268  
  1269  // Timekeeper returns the Timekeeper.
  1270  func (k *Kernel) Timekeeper() *Timekeeper {
  1271  	return k.timekeeper
  1272  }
  1273  
  1274  // TaskSet returns the TaskSet.
  1275  func (k *Kernel) TaskSet() *TaskSet {
  1276  	return k.tasks
  1277  }
  1278  
  1279  // RootUserNamespace returns the root UserNamespace.
  1280  func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
  1281  	return k.rootUserNamespace
  1282  }
  1283  
  1284  // RootUTSNamespace returns the root UTSNamespace.
  1285  func (k *Kernel) RootUTSNamespace() *UTSNamespace {
  1286  	return k.rootUTSNamespace
  1287  }
  1288  
  1289  // RootIPCNamespace takes a reference and returns the root IPCNamespace.
  1290  func (k *Kernel) RootIPCNamespace() *IPCNamespace {
  1291  	k.rootIPCNamespace.IncRef()
  1292  	return k.rootIPCNamespace
  1293  }
  1294  
  1295  // RootPIDNamespace returns the root PIDNamespace.
  1296  func (k *Kernel) RootPIDNamespace() *PIDNamespace {
  1297  	return k.tasks.Root
  1298  }
  1299  
  1300  // RootAbstractSocketNamespace returns the root AbstractSocketNamespace.
  1301  func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
  1302  	return k.rootAbstractSocketNamespace
  1303  }
  1304  
  1305  // RootNetworkNamespace returns the root network namespace, always non-nil.
  1306  func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
  1307  	return k.rootNetworkNamespace
  1308  }
  1309  
  1310  // GlobalInit returns the thread group with ID 1 in the root PID namespace, or
  1311  // nil if no such thread group exists. GlobalInit may return a thread group
  1312  // containing no tasks if the thread group has already exited.
  1313  func (k *Kernel) GlobalInit() *ThreadGroup {
  1314  	k.extMu.Lock()
  1315  	defer k.extMu.Unlock()
  1316  	return k.globalInit
  1317  }
  1318  
  1319  // TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace.
  1320  func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) {
  1321  	k.globalInit = tg
  1322  }
  1323  
  1324  // ApplicationCores returns the number of CPUs visible to sandboxed
  1325  // applications.
  1326  func (k *Kernel) ApplicationCores() uint {
  1327  	return k.applicationCores
  1328  }
  1329  
  1330  // RealtimeClock returns the application CLOCK_REALTIME clock.
  1331  func (k *Kernel) RealtimeClock() ktime.Clock {
  1332  	return k.timekeeper.realtimeClock
  1333  }
  1334  
  1335  // MonotonicClock returns the application CLOCK_MONOTONIC clock.
  1336  func (k *Kernel) MonotonicClock() ktime.Clock {
  1337  	return k.timekeeper.monotonicClock
  1338  }
  1339  
  1340  // CPUClockNow returns the current value of k.cpuClock.
  1341  func (k *Kernel) CPUClockNow() uint64 {
  1342  	return k.cpuClock.Load()
  1343  }
  1344  
  1345  // Syslog returns the syslog.
  1346  func (k *Kernel) Syslog() *syslog {
  1347  	return &k.syslog
  1348  }
  1349  
  1350  // GenerateInotifyCookie generates a unique inotify event cookie.
  1351  //
  1352  // Returned values may overlap with previously returned values if the value
  1353  // space is exhausted. 0 is not a valid cookie value, all other values
  1354  // representable in a uint32 are allowed.
  1355  func (k *Kernel) GenerateInotifyCookie() uint32 {
  1356  	id := k.nextInotifyCookie.Add(1)
  1357  	// Wrap-around is explicitly allowed for inotify event cookies.
  1358  	if id == 0 {
  1359  		id = k.nextInotifyCookie.Add(1)
  1360  	}
  1361  	return id
  1362  }
  1363  
  1364  // NetlinkPorts returns the netlink port manager.
  1365  func (k *Kernel) NetlinkPorts() *port.Manager {
  1366  	return k.netlinkPorts
  1367  }
  1368  
  1369  var (
  1370  	errSaved     = errors.New("sandbox has been successfully saved")
  1371  	errAutoSaved = errors.New("sandbox has been successfully auto-saved")
  1372  )
  1373  
  1374  // SaveStatus returns the sandbox save status. If it was saved successfully,
  1375  // autosaved indicates whether save was triggered by autosave. If it was not
  1376  // saved successfully, err indicates the sandbox error that caused the kernel to
  1377  // exit during save.
  1378  func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
  1379  	k.extMu.Lock()
  1380  	defer k.extMu.Unlock()
  1381  	switch k.saveStatus {
  1382  	case nil:
  1383  		return false, false, nil
  1384  	case errSaved:
  1385  		return true, false, nil
  1386  	case errAutoSaved:
  1387  		return true, true, nil
  1388  	default:
  1389  		return false, false, k.saveStatus
  1390  	}
  1391  }
  1392  
  1393  // SetSaveSuccess sets the flag indicating that save completed successfully, if
  1394  // no status was already set.
  1395  func (k *Kernel) SetSaveSuccess(autosave bool) {
  1396  	k.extMu.Lock()
  1397  	defer k.extMu.Unlock()
  1398  	if k.saveStatus == nil {
  1399  		if autosave {
  1400  			k.saveStatus = errAutoSaved
  1401  		} else {
  1402  			k.saveStatus = errSaved
  1403  		}
  1404  	}
  1405  }
  1406  
  1407  // SetSaveError sets the sandbox error that caused the kernel to exit during
  1408  // save, if one is not already set.
  1409  func (k *Kernel) SetSaveError(err error) {
  1410  	k.extMu.Lock()
  1411  	defer k.extMu.Unlock()
  1412  	if k.saveStatus == nil {
  1413  		k.saveStatus = err
  1414  	}
  1415  }
  1416  
  1417  // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
  1418  // LoadFrom.
  1419  func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
  1420  	k.mf = mf
  1421  }
  1422  
  1423  // MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile.
  1424  func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
  1425  	return k.mf
  1426  }
  1427  
  1428  // SupervisorContext returns a Context with maximum privileges in k. It should
  1429  // only be used by goroutines outside the control of the emulated kernel
  1430  // defined by e.
  1431  //
  1432  // Callers are responsible for ensuring that the returned Context is not used
  1433  // concurrently with changes to the Kernel.
  1434  func (k *Kernel) SupervisorContext() context.Context {
  1435  	return &supervisorContext{
  1436  		Kernel: k,
  1437  		Logger: log.Log(),
  1438  	}
  1439  }
  1440  
  1441  // SocketRecord represents a socket recorded in Kernel.sockets.
  1442  //
  1443  // +stateify savable
  1444  type SocketRecord struct {
  1445  	k    *Kernel
  1446  	Sock *vfs.FileDescription
  1447  	ID   uint64 // Socket table entry number.
  1448  }
  1449  
  1450  // RecordSocket adds a socket to the system-wide socket table for
  1451  // tracking.
  1452  //
  1453  // Precondition: Caller must hold a reference to sock.
  1454  //
  1455  // Note that the socket table will not hold a reference on the
  1456  // vfs.FileDescription.
  1457  func (k *Kernel) RecordSocket(sock *vfs.FileDescription) {
  1458  	k.extMu.Lock()
  1459  	if _, ok := k.sockets[sock]; ok {
  1460  		panic(fmt.Sprintf("Socket %p added twice", sock))
  1461  	}
  1462  	id := k.nextSocketRecord
  1463  	k.nextSocketRecord++
  1464  	s := &SocketRecord{
  1465  		k:    k,
  1466  		ID:   id,
  1467  		Sock: sock,
  1468  	}
  1469  	k.sockets[sock] = s
  1470  	k.extMu.Unlock()
  1471  }
  1472  
  1473  // DeleteSocket removes a socket from the system-wide socket table.
  1474  func (k *Kernel) DeleteSocket(sock *vfs.FileDescription) {
  1475  	k.extMu.Lock()
  1476  	delete(k.sockets, sock)
  1477  	k.extMu.Unlock()
  1478  }
  1479  
  1480  // ListSockets returns a snapshot of all sockets.
  1481  //
  1482  // Callers of ListSockets() should use SocketRecord.Sock.TryIncRef()
  1483  // to get a reference on a socket in the table.
  1484  func (k *Kernel) ListSockets() []*SocketRecord {
  1485  	k.extMu.Lock()
  1486  	var socks []*SocketRecord
  1487  	for _, s := range k.sockets {
  1488  		socks = append(socks, s)
  1489  	}
  1490  	k.extMu.Unlock()
  1491  	return socks
  1492  }
  1493  
  1494  // supervisorContext is a privileged context.
  1495  type supervisorContext struct {
  1496  	context.NoTask
  1497  	log.Logger
  1498  	*Kernel
  1499  }
  1500  
  1501  // Deadline implements context.Context.Deadline.
  1502  func (*Kernel) Deadline() (time.Time, bool) {
  1503  	return time.Time{}, false
  1504  }
  1505  
  1506  // Done implements context.Context.Done.
  1507  func (*Kernel) Done() <-chan struct{} {
  1508  	return nil
  1509  }
  1510  
  1511  // Err implements context.Context.Err.
  1512  func (*Kernel) Err() error {
  1513  	return nil
  1514  }
  1515  
  1516  // Value implements context.Context.
  1517  func (ctx *supervisorContext) Value(key any) any {
  1518  	switch key {
  1519  	case CtxCanTrace:
  1520  		// The supervisor context can trace anything. (None of
  1521  		// supervisorContext's users are expected to invoke ptrace, but ptrace
  1522  		// permissions are required for certain file accesses.)
  1523  		return func(*Task, bool) bool { return true }
  1524  	case CtxKernel:
  1525  		return ctx.Kernel
  1526  	case CtxPIDNamespace:
  1527  		return ctx.Kernel.tasks.Root
  1528  	case CtxUTSNamespace:
  1529  		return ctx.Kernel.rootUTSNamespace
  1530  	case ipc.CtxIPCNamespace:
  1531  		ipcns := ctx.Kernel.rootIPCNamespace
  1532  		ipcns.IncRef()
  1533  		return ipcns
  1534  	case auth.CtxCredentials:
  1535  		// The supervisor context is global root.
  1536  		return auth.NewRootCredentials(ctx.Kernel.rootUserNamespace)
  1537  	case vfs.CtxRoot:
  1538  		if ctx.Kernel.globalInit == nil {
  1539  			return vfs.VirtualDentry{}
  1540  		}
  1541  		root := ctx.Kernel.GlobalInit().Leader().MountNamespace().Root()
  1542  		root.IncRef()
  1543  		return root
  1544  	case vfs.CtxMountNamespace:
  1545  		if ctx.Kernel.globalInit == nil {
  1546  			return nil
  1547  		}
  1548  		mntns := ctx.Kernel.GlobalInit().Leader().MountNamespace()
  1549  		mntns.IncRef()
  1550  		return mntns
  1551  	case inet.CtxStack:
  1552  		return ctx.Kernel.RootNetworkNamespace().Stack()
  1553  	case ktime.CtxRealtimeClock:
  1554  		return ctx.Kernel.RealtimeClock()
  1555  	case limits.CtxLimits:
  1556  		// No limits apply.
  1557  		return limits.NewLimitSet()
  1558  	case pgalloc.CtxMemoryFile:
  1559  		return ctx.Kernel.mf
  1560  	case pgalloc.CtxMemoryFileProvider:
  1561  		return ctx.Kernel
  1562  	case platform.CtxPlatform:
  1563  		return ctx.Kernel
  1564  	case uniqueid.CtxGlobalUniqueID:
  1565  		return ctx.Kernel.UniqueID()
  1566  	case uniqueid.CtxGlobalUniqueIDProvider:
  1567  		return ctx.Kernel
  1568  	case uniqueid.CtxInotifyCookie:
  1569  		return ctx.Kernel.GenerateInotifyCookie()
  1570  	case unimpl.CtxEvents:
  1571  		return ctx.Kernel
  1572  	case cpuid.CtxFeatureSet:
  1573  		return ctx.Kernel.featureSet
  1574  	default:
  1575  		return nil
  1576  	}
  1577  }
  1578  
  1579  // Rate limits for the number of unimplemented syscall events.
  1580  const (
  1581  	unimplementedSyscallsMaxRate = 100  // events per second
  1582  	unimplementedSyscallBurst    = 1000 // events
  1583  )
  1584  
  1585  // EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
  1586  // channel.
  1587  func (k *Kernel) EmitUnimplementedEvent(ctx context.Context, sysno uintptr) {
  1588  	k.unimplementedSyscallEmitterOnce.Do(func() {
  1589  		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
  1590  	})
  1591  
  1592  	t := TaskFromContext(ctx)
  1593  	IncrementUnimplementedSyscallCounter(sysno)
  1594  	_, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
  1595  		Tid:       int32(t.ThreadID()),
  1596  		Registers: t.Arch().StateData().Proto(),
  1597  	})
  1598  }
  1599  
  1600  // VFS returns the virtual filesystem for the kernel.
  1601  func (k *Kernel) VFS() *vfs.VirtualFilesystem {
  1602  	return &k.vfs
  1603  }
  1604  
  1605  // SetHostMount sets the hostfs mount.
  1606  func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
  1607  	if k.hostMount != nil {
  1608  		panic("Kernel.hostMount cannot be set more than once")
  1609  	}
  1610  	k.hostMount = mnt
  1611  }
  1612  
  1613  // HostMount returns the hostfs mount.
  1614  func (k *Kernel) HostMount() *vfs.Mount {
  1615  	return k.hostMount
  1616  }
  1617  
  1618  // PipeMount returns the pipefs mount.
  1619  func (k *Kernel) PipeMount() *vfs.Mount {
  1620  	return k.pipeMount
  1621  }
  1622  
  1623  // NsfsMount returns the nsfs mount.
  1624  func (k *Kernel) NsfsMount() *vfs.Mount {
  1625  	return k.nsfsMount
  1626  }
  1627  
  1628  // ShmMount returns the tmpfs mount.
  1629  func (k *Kernel) ShmMount() *vfs.Mount {
  1630  	return k.shmMount
  1631  }
  1632  
  1633  // SocketMount returns the sockfs mount.
  1634  func (k *Kernel) SocketMount() *vfs.Mount {
  1635  	return k.socketMount
  1636  }
  1637  
  1638  // CgroupRegistry returns the cgroup registry.
  1639  func (k *Kernel) CgroupRegistry() *CgroupRegistry {
  1640  	return k.cgroupRegistry
  1641  }
  1642  
  1643  // Release releases resources owned by k.
  1644  //
  1645  // Precondition: This should only be called after the kernel is fully
  1646  // initialized, e.g. after k.Start() has been called.
  1647  func (k *Kernel) Release() {
  1648  	ctx := k.SupervisorContext()
  1649  	k.hostMount.DecRef(ctx)
  1650  	k.pipeMount.DecRef(ctx)
  1651  	k.nsfsMount.DecRef(ctx)
  1652  	k.shmMount.DecRef(ctx)
  1653  	k.socketMount.DecRef(ctx)
  1654  	k.vfs.Release(ctx)
  1655  	k.timekeeper.Destroy()
  1656  	k.vdso.Release(ctx)
  1657  	k.RootNetworkNamespace().DecRef(ctx)
  1658  }
  1659  
  1660  // PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
  1661  // hierarchy.
  1662  //
  1663  // Precondition: root must be a new cgroup with no tasks. This implies the
  1664  // controllers for root are also new and currently manage no task, which in turn
  1665  // implies the new cgroup can be populated without migrating tasks between
  1666  // cgroups.
  1667  func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) {
  1668  	k.tasks.mu.RLock()
  1669  	k.tasks.forEachTaskLocked(func(t *Task) {
  1670  		if t.exitState != TaskExitNone {
  1671  			return
  1672  		}
  1673  		t.mu.Lock()
  1674  		// A task can be in the cgroup if it has been created after the
  1675  		// cgroup hierarchy was registered.
  1676  		t.enterCgroupIfNotYetLocked(root)
  1677  		t.mu.Unlock()
  1678  	})
  1679  	k.tasks.mu.RUnlock()
  1680  }
  1681  
  1682  // ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the
  1683  // hierarchy with the provided id.  This is intended for use during hierarchy
  1684  // teardown, as otherwise the tasks would be orphaned w.r.t to some controllers.
  1685  func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) {
  1686  	var releasedCGs []Cgroup
  1687  
  1688  	k.tasks.mu.RLock()
  1689  	// We'll have one cgroup per hierarchy per task.
  1690  	releasedCGs = make([]Cgroup, 0, len(k.tasks.Root.tids))
  1691  	k.tasks.forEachTaskLocked(func(t *Task) {
  1692  		if t.exitState != TaskExitNone {
  1693  			return
  1694  		}
  1695  		t.mu.Lock()
  1696  		for cg := range t.cgroups {
  1697  			if cg.HierarchyID() == hid {
  1698  				cg.Leave(t)
  1699  				t.resetMemCgID(cg)
  1700  				delete(t.cgroups, cg)
  1701  				releasedCGs = append(releasedCGs, cg)
  1702  				// A task can't be part of multiple cgroups from the same
  1703  				// hierarchy, so we can skip checking the rest once we find a
  1704  				// match.
  1705  				break
  1706  			}
  1707  		}
  1708  		t.mu.Unlock()
  1709  	})
  1710  	k.tasks.mu.RUnlock()
  1711  
  1712  	for _, c := range releasedCGs {
  1713  		c.decRef()
  1714  	}
  1715  }
  1716  
  1717  func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualDentry, newRoot vfs.VirtualDentry) {
  1718  	k.tasks.mu.RLock()
  1719  	oldRootDecRefs := 0
  1720  	k.tasks.forEachTaskLocked(func(t *Task) {
  1721  		t.mu.Lock()
  1722  		defer t.mu.Unlock()
  1723  		if fsc := t.fsContext; fsc != nil {
  1724  			fsc.mu.Lock()
  1725  			defer fsc.mu.Unlock()
  1726  			if fsc.root == oldRoot {
  1727  				newRoot.IncRef()
  1728  				oldRootDecRefs++
  1729  				fsc.root = newRoot
  1730  			}
  1731  			if fsc.cwd == oldRoot {
  1732  				newRoot.IncRef()
  1733  				oldRootDecRefs++
  1734  				fsc.cwd = newRoot
  1735  			}
  1736  		}
  1737  	})
  1738  	k.tasks.mu.RUnlock()
  1739  	for i := 0; i < oldRootDecRefs; i++ {
  1740  		oldRoot.DecRef(ctx)
  1741  	}
  1742  }
  1743  
  1744  func (k *Kernel) GetUserCounters(uid auth.KUID) *userCounters {
  1745  	k.userCountersMapMu.Lock()
  1746  	defer k.userCountersMapMu.Unlock()
  1747  
  1748  	if uc, ok := k.userCountersMap[uid]; ok {
  1749  		return uc
  1750  	}
  1751  
  1752  	uc := &userCounters{}
  1753  	k.userCountersMap[uid] = uc
  1754  	return uc
  1755  }