github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/kernel.go (about)

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernel provides an emulation of the Linux kernel.
//
// See README.md for a detailed overview.
//
// Lock order (outermost locks must be taken first):
//
// Kernel.extMu
//   ThreadGroup.timerMu
//     ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer)
//       TaskSet.mu
//         SignalHandlers.mu
//           Task.mu
// runningTasksMu
//
// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
// time requires locking all of their signal mutexes first.
package kernel

import (
    "errors"
    "fmt"
    "path/filepath"
    "sync/atomic"
    "time"

    "github.com/SagerNet/gvisor/pkg/abi/linux"
    "github.com/SagerNet/gvisor/pkg/cleanup"
    "github.com/SagerNet/gvisor/pkg/context"
    "github.com/SagerNet/gvisor/pkg/cpuid"
    "github.com/SagerNet/gvisor/pkg/eventchannel"
    "github.com/SagerNet/gvisor/pkg/fspath"
    "github.com/SagerNet/gvisor/pkg/log"
    "github.com/SagerNet/gvisor/pkg/refs"
    "github.com/SagerNet/gvisor/pkg/sentry/arch"
    "github.com/SagerNet/gvisor/pkg/sentry/fs"
    oldtimerfd "github.com/SagerNet/gvisor/pkg/sentry/fs/timerfd"
    "github.com/SagerNet/gvisor/pkg/sentry/fsbridge"
    "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/pipefs"
    "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/sockfs"
    "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/timerfd"
    "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/tmpfs"
    "github.com/SagerNet/gvisor/pkg/sentry/hostcpu"
    "github.com/SagerNet/gvisor/pkg/sentry/inet"
    "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    "github.com/SagerNet/gvisor/pkg/sentry/kernel/epoll"
    "github.com/SagerNet/gvisor/pkg/sentry/kernel/futex"
    "github.com/SagerNet/gvisor/pkg/sentry/kernel/sched"
    ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    "github.com/SagerNet/gvisor/pkg/sentry/limits"
    "github.com/SagerNet/gvisor/pkg/sentry/loader"
    "github.com/SagerNet/gvisor/pkg/sentry/mm"
    "github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
    "github.com/SagerNet/gvisor/pkg/sentry/platform"
    "github.com/SagerNet/gvisor/pkg/sentry/socket/netlink/port"
    sentrytime "github.com/SagerNet/gvisor/pkg/sentry/time"
    "github.com/SagerNet/gvisor/pkg/sentry/unimpl"
    uspb "github.com/SagerNet/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
    "github.com/SagerNet/gvisor/pkg/sentry/uniqueid"
    "github.com/SagerNet/gvisor/pkg/sentry/vfs"
    "github.com/SagerNet/gvisor/pkg/state"
    "github.com/SagerNet/gvisor/pkg/state/wire"
    "github.com/SagerNet/gvisor/pkg/sync"
    "github.com/SagerNet/gvisor/pkg/tcpip"
)
// VFS2Enabled is set to true when VFS2 is enabled. Added as a global to allow
// easy access everywhere. To be removed once VFS2 becomes the default.
var VFS2Enabled = false

// FUSEEnabled is set to true when FUSE is enabled. Added as a global to allow
// easy access everywhere. To be removed once FUSE is completed.
var FUSEEnabled = false

// Kernel represents an emulated Linux kernel. It must be initialized by
// calling Init() or LoadFrom().
//
// +stateify savable
type Kernel struct {
    // extMu serializes external changes to the Kernel with calls to
    // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
    // remains frozen for the duration of the call; it requires that the Kernel
    // is paused as a precondition, which ensures that none of the tasks
    // running within the Kernel can affect its state, but extMu is required to
    // ensure that concurrent users of the Kernel *outside* the Kernel's
    // control cannot affect its state by calling e.g.
    // Kernel.SendExternalSignal.)
    extMu sync.Mutex `state:"nosave"`

    // started is true if Start has been called. Unless otherwise specified,
    // all Kernel fields become immutable once started becomes true.
    started bool `state:"nosave"`

    // All of the following fields are immutable unless otherwise specified.

    // Platform is the platform that is used to execute tasks in the created
    // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is
    // embedded anonymously (the same issue applies).
    platform.Platform `state:"nosave"`

    // mf provides application memory.
    mf *pgalloc.MemoryFile `state:"nosave"`

    // See InitKernelArgs for the meaning of these fields.
    featureSet                  *cpuid.FeatureSet
    timekeeper                  *Timekeeper
    tasks                       *TaskSet
    rootUserNamespace           *auth.UserNamespace
    rootNetworkNamespace        *inet.Namespace
    applicationCores            uint
    useHostCores                bool
    extraAuxv                   []arch.AuxEntry
    vdso                        *loader.VDSO
    rootUTSNamespace            *UTSNamespace
    rootIPCNamespace            *IPCNamespace
    rootAbstractSocketNamespace *AbstractSocketNamespace

    // futexes is the "root" futex.Manager, from which all others are forked.
    // This is necessary to ensure that shared futexes are coherent across all
    // tasks, including those created by CreateProcess.
    futexes *futex.Manager

    // globalInit is the thread group whose leader has ID 1 in the root PID
    // namespace. globalInit is stored separately so that it is accessible even
    // after all tasks in the thread group have exited, such that ID 1 is no
    // longer mapped.
    //
    // globalInit is mutable until it is assigned by the first successful call
    // to CreateProcess, and is protected by extMu.
    globalInit *ThreadGroup

    // syslog is the kernel log.
    syslog syslog

    // runningTasksMu synchronizes disable/enable of cpuClockTicker when
    // the kernel is idle (runningTasks == 0).
    //
    // runningTasksMu is used to exclude critical sections when the timer
    // disables itself and when the first active task enables the timer,
    // ensuring that tasks always see a valid cpuClock value.
    runningTasksMu sync.Mutex `state:"nosave"`

    // runningTasks is the total count of tasks currently in
    // TaskGoroutineRunningSys or TaskGoroutineRunningApp, i.e. they are
    // not blocked or stopped.
    //
    // runningTasks must be accessed atomically. Increments from 0 to 1 are
    // further protected by runningTasksMu (see incRunningTasks).
    runningTasks int64
    // cpuClock is incremented every linux.ClockTick. cpuClock is used to
    // measure task CPU usage, since sampling monotonicClock twice on every
    // syscall turns out to be unreasonably expensive. This is similar to how
    // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING),
    // although Linux also uses scheduler timing information to improve
    // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do
    // since "preemptive" scheduling is managed by the Go runtime, which
    // doesn't provide this information.
    //
    // cpuClock is mutable, and is accessed using atomic memory operations.
    cpuClock uint64

    // cpuClockTicker increments cpuClock.
    cpuClockTicker *ktime.Timer `state:"nosave"`

    // cpuClockTickerDisabled indicates that cpuClockTicker has been
    // disabled because no tasks are running.
    //
    // cpuClockTickerDisabled is protected by runningTasksMu.
    cpuClockTickerDisabled bool

    // cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the
    // point it was disabled. It is cached here to avoid a lock ordering
    // violation with cpuClockTicker.mu when runningTasksMu is held.
    //
    // cpuClockTickerSetting is only valid when cpuClockTickerDisabled is
    // true.
    //
    // cpuClockTickerSetting is protected by runningTasksMu.
    cpuClockTickerSetting ktime.Setting

    // uniqueID is used to generate unique identifiers.
    //
    // uniqueID is mutable, and is accessed using atomic memory operations.
    uniqueID uint64

    // nextInotifyCookie is a monotonically increasing counter used for
    // generating unique inotify event cookies.
    //
    // nextInotifyCookie is mutable, and is accessed using atomic memory
    // operations.
    nextInotifyCookie uint32

    // netlinkPorts manages allocation of netlink socket port IDs.
    netlinkPorts *port.Manager

    // saveStatus is nil if the sandbox has not been saved, errSaved or
    // errAutoSaved if it has been saved successfully, or the error causing the
    // sandbox to exit during save.
    // It is protected by extMu.
    saveStatus error `state:"nosave"`

    // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
    danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`

    // sockets is the list of all network sockets in the system.
    // Protected by extMu.
    // TODO(github.com/SagerNet/issue/1624): Only used by VFS1.
    sockets socketList

    // socketsVFS2 records all network sockets in the system. Protected by
    // extMu.
    socketsVFS2 map[*vfs.FileDescription]*SocketRecord

    // nextSocketRecord is the next entry number to use in sockets. Protected
    // by extMu.
    nextSocketRecord uint64

    // deviceRegistry is used to save/restore device.SimpleDevices.
    deviceRegistry struct{} `state:".(*device.Registry)"`

    // DirentCacheLimiter controls the total number of dirent entries that can
    // be in caches. Not all caches use it; only the caches that use host
    // resources use the limiter. It may be nil if disabled.
    DirentCacheLimiter *fs.DirentCacheLimiter

    // unimplementedSyscallEmitterOnce is used in the initialization of
    // unimplementedSyscallEmitter.
    unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`

    // unimplementedSyscallEmitter is used to emit unimplemented syscall
    // events. This is initialized lazily on the first unimplemented
    // syscall.
    unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`

    // SpecialOpts contains special kernel options.
    SpecialOpts

    // vfs keeps the filesystem state used across the kernel.
    vfs vfs.VirtualFilesystem

    // hostMount is the Mount used for file descriptors that were imported
    // from the host.
    hostMount *vfs.Mount

    // pipeMount is the Mount used for pipes created by the pipe() and pipe2()
    // syscalls (as opposed to named pipes created by mknod()).
    pipeMount *vfs.Mount

    // shmMount is the Mount used for anonymous files created by the
    // memfd_create() syscalls. It is analogous to Linux's shm_mnt.
    shmMount *vfs.Mount

    // socketMount is the Mount used for sockets created by the socket() and
    // socketpair() syscalls. There are several cases where a socket dentry
    // will not be contained in socketMount:
    // 1. Socket files created by mknod()
    // 2. Socket fds imported from the host (Kernel.hostMount is used for these)
    // 3. Socket files created by binding Unix sockets to a file path
    socketMount *vfs.Mount

    // If set to true, report address space activation waits as if the task is
    // in external wait so that the watchdog doesn't report the task stuck.
    SleepForAddressSpaceActivation bool

    // Exceptions to YAMA ptrace restrictions. Each key-value pair represents a
    // tracee-tracer relationship. The key is a process (technically, the
    // thread group leader) that can be traced by any thread that is a
    // descendant of the value. If the value is nil, then anyone can trace the
    // process represented by the key.
    //
    // ptraceExceptions is protected by the TaskSet mutex.
    ptraceExceptions map[*Task]*Task

    // YAMAPtraceScope is the current level of YAMA ptrace restrictions.
    YAMAPtraceScope int32

    // cgroupRegistry contains the set of active cgroup controllers on the
    // system. It is controlled by cgroupfs. Nil if cgroupfs is unavailable on
    // the system.
    cgroupRegistry *CgroupRegistry
}

// InitKernelArgs holds arguments to Init.
type InitKernelArgs struct {
    // FeatureSet is the emulated CPU feature set.
    FeatureSet *cpuid.FeatureSet

    // Timekeeper manages time for all tasks in the system.
    Timekeeper *Timekeeper

    // RootUserNamespace is the root user namespace.
    RootUserNamespace *auth.UserNamespace

    // RootNetworkNamespace is the root network namespace. If nil, no
    // networking will be available.
    RootNetworkNamespace *inet.Namespace

    // ApplicationCores is the number of logical CPUs visible to sandboxed
    // applications. The set of logical CPU IDs is [0, ApplicationCores); thus
    // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
    // most significant bit in cpu_possible_mask + 1.
    ApplicationCores uint

    // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
    // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
    // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
    // will be overridden.
    UseHostCores bool

    // ExtraAuxv contains additional auxiliary vector entries that are added to
    // each process by the ELF loader.
    ExtraAuxv []arch.AuxEntry

    // Vdso holds the VDSO and its parameter page.
    Vdso *loader.VDSO

    // RootUTSNamespace is the root UTS namespace.
    RootUTSNamespace *UTSNamespace

    // RootIPCNamespace is the root IPC namespace.
    RootIPCNamespace *IPCNamespace

    // RootAbstractSocketNamespace is the root Abstract Socket namespace.
    RootAbstractSocketNamespace *AbstractSocketNamespace

    // PIDNamespace is the root PID namespace.
    PIDNamespace *PIDNamespace
}

// Init initializes the Kernel with no tasks.
//
// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
// before calling Init.
func (k *Kernel) Init(args InitKernelArgs) error {
    if args.FeatureSet == nil {
        return fmt.Errorf("args.FeatureSet is nil")
    }
    if args.Timekeeper == nil {
        return fmt.Errorf("args.Timekeeper is nil")
    }
    if args.Timekeeper.clocks == nil {
        return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
    }
    if args.RootUserNamespace == nil {
        return fmt.Errorf("args.RootUserNamespace is nil")
    }
    if args.ApplicationCores == 0 {
        return fmt.Errorf("args.ApplicationCores is 0")
    }

    k.featureSet = args.FeatureSet
    k.timekeeper = args.Timekeeper
    k.tasks = newTaskSet(args.PIDNamespace)
    k.rootUserNamespace = args.RootUserNamespace
    k.rootUTSNamespace = args.RootUTSNamespace
    k.rootIPCNamespace = args.RootIPCNamespace
    k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
    k.rootNetworkNamespace = args.RootNetworkNamespace
    if k.rootNetworkNamespace == nil {
        k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil)
    }
    k.applicationCores = args.ApplicationCores
    if args.UseHostCores {
        k.useHostCores = true
        maxCPU, err := hostcpu.MaxPossibleCPU()
        if err != nil {
            return fmt.Errorf("failed to get maximum CPU number: %v", err)
        }
        minAppCores := uint(maxCPU) + 1
        if k.applicationCores < minAppCores {
            log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
            k.applicationCores = minAppCores
        }
    }
    k.extraAuxv = args.ExtraAuxv
    k.vdso = args.Vdso
    k.futexes = futex.NewManager()
    k.netlinkPorts = port.New()
    k.ptraceExceptions = make(map[*Task]*Task)
    k.YAMAPtraceScope = linux.YAMA_SCOPE_RELATIONAL

    if VFS2Enabled {
        ctx := k.SupervisorContext()
        if err := k.vfs.Init(ctx); err != nil {
            return fmt.Errorf("failed to initialize VFS: %v", err)
        }

        pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
        if err != nil {
            return fmt.Errorf("failed to create pipefs filesystem: %v", err)
        }
        defer pipeFilesystem.DecRef(ctx)
        pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
        if err != nil {
            return fmt.Errorf("failed to create pipefs mount: %v", err)
        }
        k.pipeMount = pipeMount

        tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
        if err != nil {
            return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
        }
        defer tmpfsFilesystem.DecRef(ctx)
        defer tmpfsRoot.DecRef(ctx)
        shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})
        if err != nil {
            return fmt.Errorf("failed to create tmpfs mount: %v", err)
        }
        k.shmMount = shmMount

        socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
        if err != nil {
            return fmt.Errorf("failed to create sockfs filesystem: %v", err)
        }
        defer socketFilesystem.DecRef(ctx)
        socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
        if err != nil {
            return fmt.Errorf("failed to create sockfs mount: %v", err)
        }
        k.socketMount = socketMount

        k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord)

        k.cgroupRegistry = newCgroupRegistry()
    }
    return nil
}
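// A minimal bring-up sketch (editor's example, not part of the original
// source; "plat", "mf", and "tk" are assumed to be a platform.Platform, a
// *pgalloc.MemoryFile, and a *Timekeeper with clocks already set):
//
//	k := &Kernel{Platform: plat}
//	k.SetMemoryFile(mf)
//	err := k.Init(InitKernelArgs{
//		FeatureSet:        cpuid.HostFeatureSet(),
//		Timekeeper:        tk,
//		RootUserNamespace: auth.NewRootUserNamespace(),
//		ApplicationCores:  8,
//	})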
// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
    saveStart := time.Now()

    // Do not allow other Kernel methods to affect it while it's being saved.
    k.extMu.Lock()
    defer k.extMu.Unlock()

    // Stop time.
    k.pauseTimeLocked(ctx)
    defer k.resumeTimeLocked(ctx)

    // Evict all evictable MemoryFile allocations.
    k.mf.StartEvictions()
    k.mf.WaitForEvictions()

    if VFS2Enabled {
        // Discard unsavable mappings, such as those for host file descriptors.
        if err := k.invalidateUnsavableMappings(ctx); err != nil {
            return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
        }

        // Prepare filesystems for saving. This must be done after
        // invalidateUnsavableMappings(), since dropping memory mappings may
        // affect filesystem state (e.g. page cache reference counts).
        if err := k.vfs.PrepareSave(ctx); err != nil {
            return err
        }
    } else {
        // Flush cached file writes to backing storage. This must come after
        // MemoryFile eviction since eviction may cause file writes.
        if err := k.flushWritesToFiles(ctx); err != nil {
            return err
        }

        // Remove all epoll waiter objects from underlying wait queues.
        // NOTE: for programs to resume execution in future snapshot scenarios,
        // we will need to re-establish these waiter objects after saving.
        k.tasks.unregisterEpollWaiters(ctx)

        // Clear the dirent cache before saving because Dirents must be Loaded
        // in a particular order (parents before children), and Loading dirents
        // from a cache breaks that order.
        if err := k.flushMountSourceRefs(ctx); err != nil {
            return err
        }

        // Ensure that all inode and mount release operations have completed.
        fs.AsyncBarrier()

        // Once all fs work has completed (flushed references have all been
        // released), reset mount mappings. This allows individual mounts to
        // save how inodes map to filesystem resources. Without this,
        // fs.Inodes cannot be restored.
        fs.SaveInodeMappings()

        // Discard unsavable mappings, such as those for host file descriptors.
        // This must be done after waiting for "asynchronous fs work", which
        // includes async I/O that may touch application memory.
        //
        // TODO(github.com/SagerNet/issue/1624): This rationale is believed to
        // be obsolete since AIO callbacks are now waited-for by
        // Kernel.Pause(), but this order is conservatively retained for VFS1.
        if err := k.invalidateUnsavableMappings(ctx); err != nil {
            return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
        }
    }

    // Save the CPUID FeatureSet before the rest of the kernel so we can
    // verify its compatibility on restore before attempting to restore the
    // entire kernel, which may fail on an incompatible machine.
    //
    // N.B. This will also be saved along with the full kernel save below.
    cpuidStart := time.Now()
    if _, err := state.Save(ctx, w, k.FeatureSet()); err != nil {
        return err
    }
    log.Infof("CPUID save took [%s].", time.Since(cpuidStart))

    // Save the kernel state.
    kernelStart := time.Now()
    stats, err := state.Save(ctx, w, k)
    if err != nil {
        return err
    }
    log.Infof("Kernel save stats: %s", stats.String())
    log.Infof("Kernel save took [%s].", time.Since(kernelStart))

    // Save the memory file's state.
    memoryStart := time.Now()
    if err := k.mf.SaveTo(ctx, w); err != nil {
        return err
    }
    log.Infof("Memory save took [%s].", time.Since(memoryStart))

    log.Infof("Overall save took [%s].", time.Since(saveStart))

    return nil
}
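// A hedged usage sketch (editor's example, not part of the original source):
// SaveTo requires the kernel to be paused for the entire call, so a
// checkpoint sequence brackets it with Pause/Unpause:
//
//	k.Pause()
//	defer k.Unpause()
//	if err := k.SaveTo(ctx, w); err != nil {
//		// handle checkpoint failure
//	}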
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
//
// Preconditions: !VFS2Enabled.
func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
    // Flush all mount sources for currently mounted filesystems in each task.
    flushed := make(map[*fs.MountNamespace]struct{})
    k.tasks.mu.RLock()
    k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
        if _, ok := flushed[tg.mounts]; ok {
            // Already flushed.
            return
        }
        tg.mounts.FlushMountSourceRefs()
        flushed[tg.mounts] = struct{}{}
    })
    k.tasks.mu.RUnlock()

    // There may be some open FDs whose filesystems have been unmounted. We
    // must flush those as well.
    return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
        file.Dirent.Inode.MountSource.FlushDirentRefs()
        return nil
    })
}

// forEachFDPaused applies the given function to each open file descriptor in
// each task.
//
// Precondition: Must be called with the kernel paused.
func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) {
    ts.mu.RLock()
    defer ts.mu.RUnlock()
    for t := range ts.Root.tids {
        // We can skip locking Task.mu here since the kernel is paused.
        if t.fdTable == nil {
            continue
        }
        t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
            if lastErr := f(file, fileVFS2); lastErr != nil && err == nil {
                err = lastErr
            }
        })
    }
    return err
}

// Preconditions: !VFS2Enabled.
func (k *Kernel) flushWritesToFiles(ctx context.Context) error {
    return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
        if flags := file.Flags(); !flags.Write {
            return nil
        }
        if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
            return nil
        }
        // Here we need all metadata synced.
        syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
        if err := fs.SaveFileFsyncError(syncErr); err != nil {
            name, _ := file.Dirent.FullName(nil /* root */)
            // Wrap this error in ErrSaveRejection so that it will trigger a
            // save error, rather than a panic. This also allows us to
            // distinguish Fsync errors from state file errors in state.Save.
            return &fs.ErrSaveRejection{
                Err: fmt.Errorf("%q was not sufficiently synced: %w", name, err),
            }
        }
        return nil
    })
}

// Preconditions: !VFS2Enabled.
func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
    ts.mu.RLock()
    defer ts.mu.RUnlock()

    // Tasks that belong to the same process could potentially point to the
    // same FDTable. So we retain a map of processed ones to avoid processing
    // the same FDTable multiple times.
    processed := make(map[*FDTable]struct{})
    for t := range ts.Root.tids {
        // We can skip locking Task.mu here since the kernel is paused.
        if t.fdTable == nil {
            continue
        }
        if _, ok := processed[t.fdTable]; ok {
            continue
        }
        t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
            if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
                e.UnregisterEpollWaiters()
            }
        })
        processed[t.fdTable] = struct{}{}
    }
}

// Preconditions: The kernel must be paused.
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
    invalidated := make(map[*mm.MemoryManager]struct{})
    k.tasks.mu.RLock()
    defer k.tasks.mu.RUnlock()
    for t := range k.tasks.Root.tids {
        // We can skip locking Task.mu here since the kernel is paused.
        if memMgr := t.image.MemoryManager; memMgr != nil {
            if _, ok := invalidated[memMgr]; !ok {
                if err := memMgr.InvalidateUnsavable(ctx); err != nil {
                    return err
                }
                invalidated[memMgr] = struct{}{}
            }
        }
        // I really wish we just had a sync.Map of all MMs...
        if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
            if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
                return err
            }
        }
    }
    return nil
}

// LoadFrom restores the state of k from r.
func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
    loadStart := time.Now()

    initAppCores := k.applicationCores

    // Load the pre-saved CPUID FeatureSet.
    //
    // N.B. This was also saved along with the full kernel below, so we
    // don't need to explicitly install it in the Kernel.
    cpuidStart := time.Now()
    var features cpuid.FeatureSet
    if _, err := state.Load(ctx, r, &features); err != nil {
        return err
    }
    log.Infof("CPUID load took [%s].", time.Since(cpuidStart))

    // Verify that the FeatureSet is usable on this host. We do this before
    // Kernel load so that the explicit CPUID mismatch error has priority
    // over floating point state restore errors that may occur on load on
    // an incompatible machine.
    if err := features.CheckHostCompatible(); err != nil {
        return err
    }

    // Load the kernel state.
    kernelStart := time.Now()
    stats, err := state.Load(ctx, r, k)
    if err != nil {
        return err
    }
    log.Infof("Kernel load stats: %s", stats.String())
    log.Infof("Kernel load took [%s].", time.Since(kernelStart))

    // rootNetworkNamespace should be populated after loading the state file.
    // Restore the root network stack.
    k.rootNetworkNamespace.RestoreRootStack(net)

    // Load the memory file's state.
    memoryStart := time.Now()
    if err := k.mf.LoadFrom(ctx, r); err != nil {
        return err
    }
    log.Infof("Memory load took [%s].", time.Since(memoryStart))

    log.Infof("Overall load took [%s]", time.Since(loadStart))

    k.Timekeeper().SetClocks(clocks)

    if timeReady != nil {
        close(timeReady)
    }

    if net != nil {
        net.Resume()
    }

    if VFS2Enabled {
        if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
            return err
        }
    } else {
        // Ensure that all pending asynchronous work is complete:
        //   - namedpipe opening
        //   - inode file opening
        if err := fs.AsyncErrorBarrier(); err != nil {
            return err
        }
    }

    tcpip.AsyncLoading.Wait()

    log.Infof("Overall load took [%s] after async work", time.Since(loadStart))

    // Applications may size per-cpu structures based on k.applicationCores,
    // so it can't change across save/restore. When we are virtualizing CPU
    // numbers, this isn't a problem. However, when we are exposing host CPU
    // assignments, we can't tolerate an increase in the number of host CPUs,
    // which could result in getcpu(2) returning CPUs that applications expect
    // not to exist.
    if k.useHostCores && initAppCores > k.applicationCores {
        return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
    }

    return nil
}
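// A hedged restore sketch (editor's example, not part of the original source;
// "plat", "mf", "netStack", and "clocks" are assumed to be supplied by the
// caller). Restore mirrors save: install the MemoryFile first, then replay
// the state stream that SaveTo produced:
//
//	k := &Kernel{Platform: plat}
//	k.SetMemoryFile(mf)
//	if err := k.LoadFrom(ctx, r, nil /* timeReady */, netStack, clocks, nil /* vfsOpts */); err != nil {
//		// restore failed; the state file and host may be incompatible
//	}
//	// Timers stopped by SaveTo are resumed when k.Start() is called.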
// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
    id := atomic.AddUint64(&k.uniqueID, 1)
    if id == 0 {
        panic("unique identifier generator wrapped around")
    }
    return id
}

// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
    // Filename is the filename to load as the init binary.
    //
    // If this is provided as "", File will be checked, then the file will be
    // guessed via Argv[0].
    Filename string

    // File is a passed host FD pointing to a file to load as the init binary.
    //
    // This is checked if and only if Filename is "".
    File fsbridge.File

    // Argv is a list of arguments.
    Argv []string

    // Envv is a list of environment variables.
    Envv []string

    // WorkingDirectory is the initial working directory.
    //
    // This defaults to the root if empty.
    WorkingDirectory string

    // Credentials is the initial credentials.
    Credentials *auth.Credentials

    // FDTable is the initial set of file descriptors. If CreateProcess
    // succeeds, it takes a reference on FDTable.
    FDTable *FDTable

    // Umask is the initial umask.
    Umask uint

    // Limits is the initial resource limits.
    Limits *limits.LimitSet

    // MaxSymlinkTraversals is the maximum number of symlinks to follow
    // during resolution.
    MaxSymlinkTraversals uint

    // UTSNamespace is the initial UTS namespace.
    UTSNamespace *UTSNamespace

    // IPCNamespace is the initial IPC namespace.
    IPCNamespace *IPCNamespace

    // PIDNamespace is the initial PID namespace.
    PIDNamespace *PIDNamespace

    // AbstractSocketNamespace is the initial Abstract Socket namespace.
    AbstractSocketNamespace *AbstractSocketNamespace

    // MountNamespace optionally contains the mount namespace for this
    // process. If nil, the init process's mount namespace is used.
    //
    // Anyone setting MountNamespace must donate a reference (i.e.
    // increment it).
    MountNamespace *fs.MountNamespace

    // MountNamespaceVFS2 optionally contains the mount namespace for this
    // process. If nil, the init process's mount namespace is used.
    //
    // Anyone setting MountNamespaceVFS2 must donate a reference (i.e.
    // increment it).
    MountNamespaceVFS2 *vfs.MountNamespace

    // ContainerID is the container that the process belongs to.
    ContainerID string
}

// NewContext returns a context.Context that represents the task that will be
// created by k.CreateProcess(args).
func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext {
    return &createProcessContext{
        Logger: log.Log(),
        k:      k,
        args:   args,
    }
}

// createProcessContext is a context.Context that represents the context
// associated with a task that is being created.
type createProcessContext struct {
    context.NoopSleeper
    log.Logger
    k    *Kernel
    args *CreateProcessArgs
}

// Value implements context.Context.Value.
func (ctx *createProcessContext) Value(key interface{}) interface{} {
    switch key {
    case CtxKernel:
        return ctx.k
    case CtxPIDNamespace:
        return ctx.args.PIDNamespace
    case CtxUTSNamespace:
        return ctx.args.UTSNamespace
    case CtxIPCNamespace:
        ipcns := ctx.args.IPCNamespace
        ipcns.IncRef()
        return ipcns
    case auth.CtxCredentials:
        return ctx.args.Credentials
    case fs.CtxRoot:
        if ctx.args.MountNamespace != nil {
            // MountNamespace.Root() will take a reference on the root dirent
            // for us.
            return ctx.args.MountNamespace.Root()
        }
        return nil
    case vfs.CtxRoot:
        if ctx.args.MountNamespaceVFS2 == nil {
            return nil
        }
        root := ctx.args.MountNamespaceVFS2.Root()
        root.IncRef()
        return root
    case vfs.CtxMountNamespace:
        if ctx.k.globalInit == nil {
            return nil
        }
        mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
        mntns.IncRef()
        return mntns
    case fs.CtxDirentCacheLimiter:
        return ctx.k.DirentCacheLimiter
    case inet.CtxStack:
        return ctx.k.RootNetworkNamespace().Stack()
    case ktime.CtxRealtimeClock:
        return ctx.k.RealtimeClock()
    case limits.CtxLimits:
        return ctx.args.Limits
    case pgalloc.CtxMemoryFile:
        return ctx.k.mf
    case pgalloc.CtxMemoryFileProvider:
        return ctx.k
    case platform.CtxPlatform:
        return ctx.k
    case uniqueid.CtxGlobalUniqueID:
        return ctx.k.UniqueID()
    case uniqueid.CtxGlobalUniqueIDProvider:
        return ctx.k
    case uniqueid.CtxInotifyCookie:
        return ctx.k.GenerateInotifyCookie()
    case unimpl.CtxEvents:
        return ctx.k
    default:
        return nil
    }
}
// CreateProcess creates a new task in a new thread group with the given
// options. The new task has no parent and is in the root PID namespace.
//
// If k.Start() has already been called, then the created process must be
// started by calling k.StartProcess(tg).
//
// If k.Start() has not yet been called, then the created task will begin
// running when k.Start() is called.
//
// CreateProcess has no analogue in Linux; it is used to create the initial
// application task, as well as processes started by the control server.
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    log.Infof("EXEC: %v", args.Argv)

    ctx := args.NewContext(k)

    var (
        opener    fsbridge.Lookup
        fsContext *FSContext
        mntns     *fs.MountNamespace
        mntnsVFS2 *vfs.MountNamespace
    )

    if VFS2Enabled {
        mntnsVFS2 = args.MountNamespaceVFS2
        if mntnsVFS2 == nil {
            // Add a reference to the namespace, which is transferred to the
            // new process.
            mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
            mntnsVFS2.IncRef()
        }
        // Get the root directory from the MountNamespace.
        root := mntnsVFS2.Root()
        root.IncRef()
        defer root.DecRef(ctx)

        // Grab the working directory.
        wd := root // Default.
        if args.WorkingDirectory != "" {
            pop := vfs.PathOperation{
                Root:               root,
                Start:              wd,
                Path:               fspath.Parse(args.WorkingDirectory),
                FollowFinalSymlink: true,
            }
            var err error
            wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
                CheckSearchable: true,
            })
            if err != nil {
                return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
            }
            defer wd.DecRef(ctx)
        }
        opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
        fsContext = NewFSContextVFS2(root, wd, args.Umask)
    } else {
        mntns = args.MountNamespace
        if mntns == nil {
            mntns = k.GlobalInit().Leader().MountNamespace()
            mntns.IncRef()
        }
        // Get the root directory from the MountNamespace.
        root := mntns.Root()
        // The call to newFSContext below will take a reference on root, so we
        // don't need to hold this one.
        defer root.DecRef(ctx)

        // Grab the working directory.
        remainingTraversals := args.MaxSymlinkTraversals
        wd := root // Default.
        if args.WorkingDirectory != "" {
            var err error
            wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
            if err != nil {
                return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
            }
            defer wd.DecRef(ctx)
        }
        opener = fsbridge.NewFSLookup(mntns, root, wd)
        fsContext = newFSContext(root, wd, args.Umask)
    }

    tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
    cu := cleanup.Make(func() {
        tg.Release(ctx)
    })
    defer cu.Clean()

    // Check which file to start from.
    switch {
    case args.Filename != "":
        // If a filename is given, take that.
        // Set File to nil so we resolve the path in LoadTaskImage.
        args.File = nil
    case args.File != nil:
        // If File is set, take the File provided directly.
    default:
        // Otherwise look at Argv and see if the first argument is a valid
        // path.
        if len(args.Argv) == 0 {
            return nil, 0, fmt.Errorf("no filename or command provided")
        }
        if !filepath.IsAbs(args.Argv[0]) {
            return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
        }
        args.Filename = args.Argv[0]
    }

    // Create a fresh task context.
    remainingTraversals := args.MaxSymlinkTraversals
    loadArgs := loader.LoadArgs{
        Opener:              opener,
        RemainingTraversals: &remainingTraversals,
        ResolveFinal:        true,
        Filename:            args.Filename,
        File:                args.File,
        CloseOnExec:         false,
        Argv:                args.Argv,
        Envv:                args.Envv,
        Features:            k.featureSet,
    }

    image, se := k.LoadTaskImage(ctx, loadArgs)
    if se != nil {
        return nil, 0, errors.New(se.String())
    }

    // Take a reference on the FDTable, which will be transferred to
    // TaskSet.NewTask().
    args.FDTable.IncRef()

    // Create the task.
    config := &TaskConfig{
        Kernel:                  k,
        ThreadGroup:             tg,
        TaskImage:               image,
        FSContext:               fsContext,
        FDTable:                 args.FDTable,
        Credentials:             args.Credentials,
        NetworkNamespace:        k.RootNetworkNamespace(),
        AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
        UTSNamespace:            args.UTSNamespace,
        IPCNamespace:            args.IPCNamespace,
        AbstractSocketNamespace: args.AbstractSocketNamespace,
        MountNamespaceVFS2:      mntnsVFS2,
        ContainerID:             args.ContainerID,
    }
    t, err := k.tasks.NewTask(ctx, config)
    if err != nil {
        return nil, 0, err
    }
    t.traceExecEvent(image) // Simulate exec for tracing.

    // Success.
    cu.Release()
    tgid := k.tasks.Root.IDOfThreadGroup(tg)
    if k.globalInit == nil {
        k.globalInit = tg
    }
    return tg, tgid, nil
}
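// A hedged usage sketch (editor's example, not part of the original source):
// creating and starting a process. The FD table "fdTable" is assumed to have
// been built by the caller; Filename is left empty, so Argv[0] must be an
// absolute path:
//
//	tg, _, err := k.CreateProcess(CreateProcessArgs{
//		Argv:                    []string{"/bin/sh"},
//		Envv:                    []string{"PATH=/bin:/usr/bin"},
//		Credentials:             auth.NewRootCredentials(k.RootUserNamespace()),
//		FDTable:                 fdTable,
//		UTSNamespace:            k.RootUTSNamespace(),
//		IPCNamespace:            k.RootIPCNamespace(),
//		PIDNamespace:            k.RootPIDNamespace(),
//		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
//		Limits:                  limits.NewLimitSet(),
//	})
//	if err == nil {
//		k.StartProcess(tg) // required only if k.Start() was already called
//	}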
// StartProcess starts running a process that was created with CreateProcess.
func (k *Kernel) StartProcess(tg *ThreadGroup) {
    t := tg.Leader()
    tid := k.tasks.Root.IDOfTask(t)
    t.Start(tid)
}

// Start starts execution of all tasks in k.
//
// Preconditions: Start may be called exactly once.
func (k *Kernel) Start() error {
    k.extMu.Lock()
    defer k.extMu.Unlock()

    if k.globalInit == nil {
        return fmt.Errorf("kernel contains no tasks")
    }
    if k.started {
        return fmt.Errorf("kernel already started")
    }

    k.started = true
    k.cpuClockTicker = ktime.NewTimer(k.timekeeper.monotonicClock, newKernelCPUClockTicker(k))
    k.cpuClockTicker.Swap(ktime.Setting{
        Enabled: true,
        Period:  linux.ClockTick,
    })
    // If k was created by LoadKernelFrom, timers were stopped during
    // Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
    // this is a no-op.
    k.resumeTimeLocked(k.SupervisorContext())
    // Start task goroutines.
    k.tasks.mu.RLock()
    defer k.tasks.mu.RUnlock()
    for t, tid := range k.tasks.Root.tids {
        t.Start(tid)
    }
    return nil
}
// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
// Preconditions:
// * Any task goroutines running in k must be stopped.
// * k.extMu must be locked.
func (k *Kernel) pauseTimeLocked(ctx context.Context) {
    // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
    // Kernel.Start().
    if k.cpuClockTicker != nil {
        k.cpuClockTicker.Pause()
    }

    // By precondition, nothing else can be interacting with PIDNamespace.tids
    // or FDTable.files, so we can iterate them without synchronization. (We
    // can't hold the TaskSet mutex when pausing thread group timers because
    // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
    // mutex, while holding the Timer mutex.)
    for t := range k.tasks.Root.tids {
        if t == t.tg.leader {
            t.tg.itimerRealTimer.Pause()
            for _, it := range t.tg.timers {
                it.PauseTimer()
            }
        }
        // This means we'll iterate FDTables shared by multiple tasks
        // repeatedly, but ktime.Timer.Pause is idempotent so this is harmless.
        if t.fdTable != nil {
            t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
                if VFS2Enabled {
                    if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
                        tfd.PauseTimer()
                    }
                } else {
                    if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok {
                        tfd.PauseTimer()
                    }
                }
            })
        }
    }
    k.timekeeper.PauseUpdates()
}

// resumeTimeLocked resumes all Timers and Timekeeper updates. If
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
// Preconditions:
// * Any task goroutines running in k must be stopped.
// * k.extMu must be locked.
func (k *Kernel) resumeTimeLocked(ctx context.Context) {
    if k.cpuClockTicker != nil {
        k.cpuClockTicker.Resume()
    }

    k.timekeeper.ResumeUpdates()
    for t := range k.tasks.Root.tids {
        if t == t.tg.leader {
            t.tg.itimerRealTimer.Resume()
            for _, it := range t.tg.timers {
                it.ResumeTimer()
            }
        }
        if t.fdTable != nil {
            t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
                if VFS2Enabled {
                    if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
                        tfd.ResumeTimer()
                    }
                } else {
                    if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok {
                        tfd.ResumeTimer()
                    }
                }
            })
        }
    }
}
func (k *Kernel) incRunningTasks() {
    for {
        tasks := atomic.LoadInt64(&k.runningTasks)
        if tasks != 0 {
            // Standard case. Simply increment.
            if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) {
                continue
            }
            return
        }

        // Transition from 0 -> 1. Synchronize with other transitions and
        // timer.
        k.runningTasksMu.Lock()
        tasks = atomic.LoadInt64(&k.runningTasks)
        if tasks != 0 {
            // We're no longer the first task, no need to re-enable.
            atomic.AddInt64(&k.runningTasks, 1)
            k.runningTasksMu.Unlock()
            return
        }

        if !k.cpuClockTickerDisabled {
            // Timer was never disabled.
            atomic.StoreInt64(&k.runningTasks, 1)
            k.runningTasksMu.Unlock()
            return
        }

        // We need to update cpuClock for all of the ticks missed while we
        // slept, and then re-enable the timer.
        //
        // The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify
        // always increments cpuClock by 1 regardless of the number of
        // expirations as a heuristic to avoid over-accounting in cases of CPU
        // throttling.
        //
        // We want to cover the normal case, when all time should be accounted,
        // so we increment for all expirations. Throttling is less concerning
        // here because the ticker is only disabled from Notify. This means
        // that Notify must schedule and compensate for the throttled period
        // before the timer is disabled. Throttling while the timer is disabled
        // doesn't matter, as nothing is running or reading cpuClock anyways.
        //
        // S/R also adds complication, as there are two cases. Recall that
        // monotonicClock will jump forward on restore.
        //
        // 1. If the ticker is enabled during save, then on Restore Notify is
        // called with many expirations, covering the time jump, but cpuClock
        // is only incremented by 1.
        //
        // 2. If the ticker is disabled during save, then after Restore the
        // first wakeup will call this function and cpuClock will be
        // incremented by the number of expirations across the S/R.
        //
        // These cases cause very different values of cpuClock. But again,
        // since nothing was running while the ticker was disabled, those
        // differences don't matter.
        setting, exp := k.cpuClockTickerSetting.At(k.timekeeper.monotonicClock.Now())
        if exp > 0 {
            atomic.AddUint64(&k.cpuClock, exp)
        }

        // Now that cpuClock is updated it is safe to allow other tasks to
        // transition to running.
        atomic.StoreInt64(&k.runningTasks, 1)

        // N.B. we must unlock before calling Swap to maintain lock ordering.
        //
        // cpuClockTickerDisabled need not wait until after Swap to become
        // true. It is sufficient that the timer *will* be enabled.
        k.cpuClockTickerDisabled = false
        k.runningTasksMu.Unlock()

        // This won't call Notify (unless it's been ClockTick since setting.At
        // above). This means we skip the thread group work in Notify. However,
        // since nothing was running while we were disabled, none of the timers
        // could have expired.
        k.cpuClockTicker.Swap(setting)

        return
    }
}

func (k *Kernel) decRunningTasks() {
    tasks := atomic.AddInt64(&k.runningTasks, -1)
    if tasks < 0 {
        panic(fmt.Sprintf("Invalid running count %d", tasks))
    }

    // Nothing to do. The next CPU clock tick will disable the timer if
    // there is still nothing running. This provides approximately one tick
    // of slack in which we can switch back and forth between idle and
    // active without an expensive transition.
}

// WaitExited blocks until all tasks in k have exited.
func (k *Kernel) WaitExited() {
    k.tasks.liveGoroutines.Wait()
}

// Kill requests that all tasks in k immediately exit as if group exiting with
// status es. Kill does not wait for tasks to exit.
func (k *Kernel) Kill(es ExitStatus) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.tasks.Kill(es)
}

// Pause requests that all tasks in k temporarily stop executing, and blocks
// until all tasks and asynchronous I/O operations in k have stopped. Multiple
// calls to Pause nest and require an equal number of calls to Unpause to
// resume execution.
func (k *Kernel) Pause() {
    k.extMu.Lock()
    k.tasks.BeginExternalStop()
    k.extMu.Unlock()
    k.tasks.runningGoroutines.Wait()
    k.tasks.aioGoroutines.Wait()
}

// ReceiveTaskStates receives full states for all tasks.
func (k *Kernel) ReceiveTaskStates() {
    k.extMu.Lock()
    k.tasks.PullFullState()
    k.extMu.Unlock()
}

// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.tasks.EndExternalStop()
}
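// A hedged usage sketch (editor's example, not part of the original source):
// Pause/Unpause nest, so each caller brackets its own critical section and
// concurrent pausers compose correctly:
//
//	k.Pause()
//	defer k.Unpause()
//	// ... inspect or save kernel state while no tasks are running ...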
// SendExternalSignal injects a signal into the kernel.
//
// context is used only for debugging to describe how the signal was received.
//
// Preconditions: Kernel must have an init process.
func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.sendExternalSignal(info, context)
}

// SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup.
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    return tg.SendSignal(info)
}

// SendContainerSignal sends the given signal to all processes inside the
// namespace that match the given container ID.
func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.tasks.mu.RLock()
    defer k.tasks.mu.RUnlock()

    var lastErr error
    for tg := range k.tasks.Root.tgids {
        if tg.leader.ContainerID() == cid {
            tg.signalHandlers.mu.Lock()
            infoCopy := *info
            if err := tg.leader.sendSignalLocked(&infoCopy, true /* group */); err != nil {
                lastErr = err
            }
            tg.signalHandlers.mu.Unlock()
        }
    }
    return lastErr
}
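// A hedged usage sketch (editor's example, not part of the original source):
// delivering SIGTERM to every process in a container; linux.SignalInfo's
// Signo field carries the signal number:
//
//	info := &linux.SignalInfo{Signo: int32(linux.SIGTERM)}
//	if err := k.SendContainerSignal(cid, info); err != nil {
//		// at least one thread group failed to accept the signal
//	}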
// RebuildTraceContexts rebuilds the trace context for all tasks.
//
// Unfortunately, if these are built while tracing is not enabled, then we
// will not have meaningful trace data. Rebuilding here ensures that we can do
// so after tracing has been enabled.
func (k *Kernel) RebuildTraceContexts() {
    // We need to pause all task goroutines because Task.rebuildTraceContext()
    // replaces Task.traceContext and Task.traceTask, which are
    // task-goroutine-exclusive (i.e. the task goroutine assumes that it can
    // access them without synchronization) for performance.
    k.Pause()
    defer k.Unpause()

    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.tasks.mu.RLock()
    defer k.tasks.mu.RUnlock()

    for t, tid := range k.tasks.Root.tids {
        t.rebuildTraceContext(tid)
    }
}

// FeatureSet returns the FeatureSet.
func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
    return k.featureSet
}

// Timekeeper returns the Timekeeper.
func (k *Kernel) Timekeeper() *Timekeeper {
    return k.timekeeper
}

// TaskSet returns the TaskSet.
func (k *Kernel) TaskSet() *TaskSet {
    return k.tasks
}

// RootUserNamespace returns the root UserNamespace.
func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
    return k.rootUserNamespace
}

// RootUTSNamespace returns the root UTSNamespace.
func (k *Kernel) RootUTSNamespace() *UTSNamespace {
    return k.rootUTSNamespace
}

// RootIPCNamespace takes a reference and returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
    k.rootIPCNamespace.IncRef()
    return k.rootIPCNamespace
}

// RootPIDNamespace returns the root PIDNamespace.
func (k *Kernel) RootPIDNamespace() *PIDNamespace {
    return k.tasks.Root
}

// RootAbstractSocketNamespace returns the root AbstractSocketNamespace.
func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
    return k.rootAbstractSocketNamespace
}

// RootNetworkNamespace returns the root network namespace, always non-nil.
func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
    return k.rootNetworkNamespace
}

// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
// nil if no such thread group exists. GlobalInit may return a thread group
// containing no tasks if the thread group has already exited.
func (k *Kernel) GlobalInit() *ThreadGroup {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    return k.globalInit
}

// TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID
// namespace.
func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) {
    k.globalInit = tg
}

// ApplicationCores returns the number of CPUs visible to sandboxed
// applications.
func (k *Kernel) ApplicationCores() uint {
    return k.applicationCores
}

// RealtimeClock returns the application CLOCK_REALTIME clock.
func (k *Kernel) RealtimeClock() ktime.Clock {
    return k.timekeeper.realtimeClock
}

// MonotonicClock returns the application CLOCK_MONOTONIC clock.
func (k *Kernel) MonotonicClock() ktime.Clock {
    return k.timekeeper.monotonicClock
}

// CPUClockNow returns the current value of k.cpuClock.
func (k *Kernel) CPUClockNow() uint64 {
    return atomic.LoadUint64(&k.cpuClock)
}

// Syslog returns the syslog.
func (k *Kernel) Syslog() *syslog {
    return &k.syslog
}

// GenerateInotifyCookie generates a unique inotify event cookie.
//
// Returned values may overlap with previously returned values if the value
// space is exhausted. 0 is not a valid cookie value; all other values
// representable in a uint32 are allowed.
func (k *Kernel) GenerateInotifyCookie() uint32 {
    id := atomic.AddUint32(&k.nextInotifyCookie, 1)
    // Wrap-around is explicitly allowed for inotify event cookies.
    if id == 0 {
        id = atomic.AddUint32(&k.nextInotifyCookie, 1)
    }
    return id
}

// NetlinkPorts returns the netlink port manager.
func (k *Kernel) NetlinkPorts() *port.Manager {
    return k.netlinkPorts
}

var (
    errSaved     = errors.New("sandbox has been successfully saved")
    errAutoSaved = errors.New("sandbox has been successfully auto-saved")
)

// SaveStatus returns the sandbox save status. If it was saved successfully,
// autosaved indicates whether save was triggered by autosave. If it was not
// saved successfully, err indicates the sandbox error that caused the kernel
// to exit during save.
func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    switch k.saveStatus {
    case nil:
        return false, false, nil
    case errSaved:
        return true, false, nil
    case errAutoSaved:
        return true, true, nil
    default:
        return false, false, k.saveStatus
    }
}

// SetSaveSuccess sets the flag indicating that save completed successfully,
// if no status was already set.
func (k *Kernel) SetSaveSuccess(autosave bool) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    if k.saveStatus == nil {
        if autosave {
            k.saveStatus = errAutoSaved
        } else {
            k.saveStatus = errSaved
        }
    }
}
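// A hedged usage sketch (editor's example, not part of the original source):
// interpreting the three-way result of SaveStatus:
//
//	saved, autosaved, err := k.SaveStatus()
//	switch {
//	case err != nil:
//		// the sandbox exited during save with this error
//	case saved && autosaved:
//		// saved successfully, triggered by autosave
//	case saved:
//		// saved successfully, triggered explicitly
//	}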
// SetSaveError sets the sandbox error that caused the kernel to exit during
// save, if one is not already set.
func (k *Kernel) SetSaveError(err error) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    if k.saveStatus == nil {
        k.saveStatus = err
    }
}

// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
// LoadFrom.
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
    k.mf = mf
}

// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile.
func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
    return k.mf
}

// SupervisorContext returns a Context with maximum privileges in k. It should
// only be used by goroutines outside the control of the emulated kernel
// defined by k.
//
// Callers are responsible for ensuring that the returned Context is not used
// concurrently with changes to the Kernel.
func (k *Kernel) SupervisorContext() context.Context {
    return supervisorContext{
        Logger: log.Log(),
        k:      k,
    }
}

// SocketRecord represents a socket recorded in Kernel.socketsVFS2.
//
// +stateify savable
type SocketRecord struct {
    k        *Kernel
    Sock     *refs.WeakRef        // TODO(github.com/SagerNet/issue/1624): Only used by VFS1.
    SockVFS2 *vfs.FileDescription // Only used by VFS2.
    ID       uint64               // Socket table entry number.
}

// SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It
// implements refs.WeakRefUser for sockets stored in the socket table.
//
// +stateify savable
type SocketRecordVFS1 struct {
    socketEntry
    SocketRecord
}

// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
func (s *SocketRecordVFS1) WeakRefGone(context.Context) {
    s.k.extMu.Lock()
    s.k.sockets.Remove(s)
    s.k.extMu.Unlock()
}

// RecordSocket adds a socket to the system-wide socket table for tracking.
//
// Precondition: Caller must hold a reference to sock.
func (k *Kernel) RecordSocket(sock *fs.File) {
    k.extMu.Lock()
    id := k.nextSocketRecord
    k.nextSocketRecord++
    s := &SocketRecordVFS1{
        SocketRecord: SocketRecord{
            k:  k,
            ID: id,
        },
    }
    s.Sock = refs.NewWeakRef(sock, s)
    k.sockets.PushBack(s)
    k.extMu.Unlock()
}

// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for
// tracking.
//
// Precondition: Caller must hold a reference to sock.
//
// Note that the socket table will not hold a reference on the
// vfs.FileDescription.
func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
    k.extMu.Lock()
    if _, ok := k.socketsVFS2[sock]; ok {
        panic(fmt.Sprintf("Socket %p added twice", sock))
    }
    id := k.nextSocketRecord
    k.nextSocketRecord++
    s := &SocketRecord{
        k:        k,
        ID:       id,
        SockVFS2: sock,
    }
    k.socketsVFS2[sock] = s
    k.extMu.Unlock()
}

// DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table.
func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) {
    k.extMu.Lock()
    delete(k.socketsVFS2, sock)
    k.extMu.Unlock()
}
// ListSockets returns a snapshot of all sockets.
//
// Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef()
// to get a reference on a socket in the table.
func (k *Kernel) ListSockets() []*SocketRecord {
	k.extMu.Lock()
	var socks []*SocketRecord
	if VFS2Enabled {
		for _, s := range k.socketsVFS2 {
			socks = append(socks, s)
		}
	} else {
		for s := k.sockets.Front(); s != nil; s = s.Next() {
			socks = append(socks, &s.SocketRecord)
		}
	}
	k.extMu.Unlock()
	return socks
}

// supervisorContext is a privileged context.
type supervisorContext struct {
	context.NoopSleeper
	log.Logger
	k *Kernel
}

// Value implements context.Context.
func (ctx supervisorContext) Value(key interface{}) interface{} {
	switch key {
	case CtxCanTrace:
		// The supervisor context can trace anything. (None of
		// supervisorContext's users are expected to invoke ptrace, but ptrace
		// permissions are required for certain file accesses.)
		return func(*Task, bool) bool { return true }
	case CtxKernel:
		return ctx.k
	case CtxPIDNamespace:
		return ctx.k.tasks.Root
	case CtxUTSNamespace:
		return ctx.k.rootUTSNamespace
	case CtxIPCNamespace:
		ipcns := ctx.k.rootIPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		// The supervisor context is global root.
		return auth.NewRootCredentials(ctx.k.rootUserNamespace)
	case fs.CtxRoot:
		if ctx.k.globalInit != nil {
			return ctx.k.globalInit.mounts.Root()
		}
		return nil
	case vfs.CtxRoot:
		if ctx.k.globalInit == nil {
			return vfs.VirtualDentry{}
		}
		root := ctx.k.GlobalInit().Leader().MountNamespaceVFS2().Root()
		root.IncRef()
		return root
	case vfs.CtxMountNamespace:
		if ctx.k.globalInit == nil {
			return nil
		}
		mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
		mntns.IncRef()
		return mntns
	case fs.CtxDirentCacheLimiter:
		return ctx.k.DirentCacheLimiter
	case inet.CtxStack:
		return ctx.k.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.k.RealtimeClock()
	case limits.CtxLimits:
		// No limits apply.
		return limits.NewLimitSet()
	case pgalloc.CtxMemoryFile:
		return ctx.k.mf
	case pgalloc.CtxMemoryFileProvider:
		return ctx.k
	case platform.CtxPlatform:
		return ctx.k
	case uniqueid.CtxGlobalUniqueID:
		return ctx.k.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.k
	case uniqueid.CtxInotifyCookie:
		return ctx.k.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.k
	default:
		return nil
	}
}

// Rate limits for the number of unimplemented syscall events.
const (
	unimplementedSyscallsMaxRate = 100  // events per second
	unimplementedSyscallBurst    = 1000 // events
)

// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
// channel.
func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
	k.unimplementedSyscallEmitterOnce.Do(func() {
		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
	})

	t := TaskFromContext(ctx)
	_, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
		Tid:       int32(t.ThreadID()),
		Registers: t.Arch().StateData().Proto(),
	})
}
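// To illustrate the TryIncRef pattern that the ListSockets comment describes
// (a hypothetical sketch, not in the original source): because the socket
// table holds no reference of its own, VFS2 callers must re-acquire a
// reference on each snapshotted socket before touching it, and skip sockets
// that have already been destroyed.
//
// forEachLiveSocketVFS2 invokes fn on every VFS2 socket in the table that is
// still live at iteration time.
func forEachLiveSocketVFS2(ctx context.Context, k *Kernel, fn func(*vfs.FileDescription)) {
	for _, s := range k.ListSockets() {
		if s.SockVFS2 == nil || !s.SockVFS2.TryIncRef() {
			// Socket is VFS1, or has already been destroyed; skip it.
			continue
		}
		fn(s.SockVFS2)
		s.SockVFS2.DecRef(ctx)
	}
}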
// VFS returns the virtual filesystem for the kernel.
func (k *Kernel) VFS() *vfs.VirtualFilesystem {
	return &k.vfs
}

// SetHostMount sets the hostfs mount.
func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
	if k.hostMount != nil {
		panic("Kernel.hostMount cannot be set more than once")
	}
	k.hostMount = mnt
}

// HostMount returns the hostfs mount.
func (k *Kernel) HostMount() *vfs.Mount {
	return k.hostMount
}

// PipeMount returns the pipefs mount.
func (k *Kernel) PipeMount() *vfs.Mount {
	return k.pipeMount
}

// ShmMount returns the tmpfs mount.
func (k *Kernel) ShmMount() *vfs.Mount {
	return k.shmMount
}

// SocketMount returns the sockfs mount.
func (k *Kernel) SocketMount() *vfs.Mount {
	return k.socketMount
}

// CgroupRegistry returns the cgroup registry.
func (k *Kernel) CgroupRegistry() *CgroupRegistry {
	return k.cgroupRegistry
}

// Release releases resources owned by k.
//
// Precondition: This should only be called after the kernel is fully
// initialized, e.g. after k.Start() has been called.
func (k *Kernel) Release() {
	ctx := k.SupervisorContext()
	if VFS2Enabled {
		k.hostMount.DecRef(ctx)
		k.pipeMount.DecRef(ctx)
		k.shmMount.DecRef(ctx)
		k.socketMount.DecRef(ctx)
		k.vfs.Release(ctx)
	}
	k.timekeeper.Destroy()
	k.vdso.Release(ctx)
}

// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
// hierarchy.
//
// Precondition: root must be a new cgroup with no tasks. This implies the
// controllers for root are also new and currently manage no task, which in
// turn implies the new cgroup can be populated without migrating tasks
// between cgroups.
func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) {
	k.tasks.mu.RLock()
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		// A task may already be in the cgroup if it was created after the
		// cgroup hierarchy was registered.
		t.enterCgroupIfNotYetLocked(root)
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()
}

// ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the
// hierarchy with the provided id. This is intended for use during hierarchy
// teardown, as otherwise the tasks would be orphaned w.r.t. some controllers.
func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) {
	k.tasks.mu.RLock()
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		for cg := range t.cgroups {
			if cg.HierarchyID() == hid {
				t.leaveCgroupLocked(cg)
			}
		}
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()
}
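// An illustrative sketch (hypothetical; not in the original source) of how
// the two cgroup helpers above pair up: a hierarchy is populated once,
// immediately after it is registered, and drained once at teardown, so tasks
// are never left orphaned with respect to a controller. HierarchyID() is used
// here as it is in ReleaseCgroupHierarchy above.
//
// withCgroupHierarchy populates root, runs fn, then releases the hierarchy.
func withCgroupHierarchy(k *Kernel, root Cgroup, fn func()) {
	k.PopulateNewCgroupHierarchy(root) // root must be fresh, with no tasks.
	defer k.ReleaseCgroupHierarchy(root.HierarchyID())
	fn()
}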