github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/kernel.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernel provides an emulation of the Linux kernel.
//
// See README.md for a detailed overview.
//
// Lock order (outermost locks must be taken first):
//
//	Kernel.extMu
//	ThreadGroup.timerMu
//	ktime.Timer.mu (for IntervalTimer) and Kernel.cpuClockMu
//	TaskSet.mu
//	SignalHandlers.mu
//	Task.mu
//	runningTasksMu
//
// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
// time requires locking all of their signal mutexes first.
package kernel

import (
	"errors"
	"fmt"
	"path/filepath"
	"time"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
	"github.com/MerlinKodo/gvisor/pkg/cleanup"
	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/cpuid"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/eventchannel"
	"github.com/MerlinKodo/gvisor/pkg/fspath"
	"github.com/MerlinKodo/gvisor/pkg/log"
	"github.com/MerlinKodo/gvisor/pkg/refs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/nsfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/pipefs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/sockfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/timerfd"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/tmpfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/hostcpu"
	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/futex"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/ipc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/sched"
	ktime "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time"
	"github.com/MerlinKodo/gvisor/pkg/sentry/limits"
	"github.com/MerlinKodo/gvisor/pkg/sentry/loader"
	"github.com/MerlinKodo/gvisor/pkg/sentry/mm"
	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/netlink/port"
	sentrytime "github.com/MerlinKodo/gvisor/pkg/sentry/time"
	"github.com/MerlinKodo/gvisor/pkg/sentry/unimpl"
	uspb "github.com/MerlinKodo/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
	"github.com/MerlinKodo/gvisor/pkg/sentry/uniqueid"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
	"github.com/MerlinKodo/gvisor/pkg/state"
	"github.com/MerlinKodo/gvisor/pkg/state/wire"
	"github.com/MerlinKodo/gvisor/pkg/sync"
	"github.com/MerlinKodo/gvisor/pkg/tcpip"
)
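
// As an illustration of the ordering rules above, a hypothetical helper that
// signals two thread groups must take TaskSet.mu exclusively before taking
// either SignalHandlers.mu (a sketch; ts, tg1, and tg2 are illustrative):
//
//	ts.mu.Lock()                 // TaskSet.mu, exclusive
//	tg1.signalHandlers.mu.Lock() // first SignalHandlers.mu
//	tg2.signalHandlers.mu.Lock() // second SignalHandlers.mu
//	// ... deliver signals ...
//	tg2.signalHandlers.mu.Unlock()
//	tg1.signalHandlers.mu.Unlock()
//	ts.mu.Unlock()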

// IOUringEnabled is set to true when IO_URING is enabled. Added as a global to
// allow easy access everywhere.
var IOUringEnabled = false

// userCounters is a set of user counters.
//
// +stateify savable
type userCounters struct {
	uid auth.KUID

	rlimitNProc atomicbitops.Uint64
}

// incRLimitNProc increments the rlimitNProc counter.
func (uc *userCounters) incRLimitNProc(ctx context.Context) error {
	lim := limits.FromContext(ctx).Get(limits.ProcessCount)
	creds := auth.CredentialsFromContext(ctx)
	nproc := uc.rlimitNProc.Add(1)
	if nproc > lim.Cur &&
		!creds.HasCapability(linux.CAP_SYS_ADMIN) &&
		!creds.HasCapability(linux.CAP_SYS_RESOURCE) {
		uc.rlimitNProc.Add(^uint64(0))
		return linuxerr.EAGAIN
	}
	return nil
}

// decRLimitNProc decrements the rlimitNProc counter.
func (uc *userCounters) decRLimitNProc() {
	uc.rlimitNProc.Add(^uint64(0))
}
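
// decRLimitNProc (and the failure path of incRLimitNProc) decrements by
// adding ^uint64(0): all bits set is 2^64-1, and unsigned addition is modular,
// so this subtracts 1. A minimal sketch of the idiom:
//
//	var n atomicbitops.Uint64
//	n.Add(1)          // n == 1
//	n.Add(^uint64(0)) // n == 0 again
//
// Note that incRLimitNProc increments first and undoes the increment on
// failure, so concurrent readers may briefly observe a value above the limit.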

// Kernel represents an emulated Linux kernel. It must be initialized by
// calling Init() or LoadFrom().
//
// +stateify savable
type Kernel struct {
	// extMu serializes external changes to the Kernel with calls to
	// Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
	// remains frozen for the duration of the call; it requires that the Kernel
	// is paused as a precondition, which ensures that none of the tasks
	// running within the Kernel can affect its state, but extMu is required to
	// ensure that concurrent users of the Kernel *outside* the Kernel's
	// control cannot affect its state by calling e.g.
	// Kernel.SendExternalSignal.)
	extMu sync.Mutex `state:"nosave"`

	// started is true if Start has been called. Unless otherwise specified,
	// all Kernel fields become immutable once started becomes true.
	started bool `state:"nosave"`

	// All of the following fields are immutable unless otherwise specified.

	// Platform is the platform that is used to execute tasks in the created
	// Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is
	// embedded anonymously (the same issue applies).
	platform.Platform `state:"nosave"`

	// mf provides application memory.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// See InitKernelArgs for the meaning of these fields.
	featureSet                  cpuid.FeatureSet
	timekeeper                  *Timekeeper
	tasks                       *TaskSet
	rootUserNamespace           *auth.UserNamespace
	rootNetworkNamespace        *inet.Namespace
	applicationCores            uint
	useHostCores                bool
	extraAuxv                   []arch.AuxEntry
	vdso                        *loader.VDSO
	rootUTSNamespace            *UTSNamespace
	rootIPCNamespace            *IPCNamespace
	rootAbstractSocketNamespace *AbstractSocketNamespace

	// futexes is the "root" futex.Manager, from which all others are forked.
	// This is necessary to ensure that shared futexes are coherent across all
	// tasks, including those created by CreateProcess.
	futexes *futex.Manager

	// globalInit is the thread group whose leader has ID 1 in the root PID
	// namespace. globalInit is stored separately so that it is accessible even
	// after all tasks in the thread group have exited, such that ID 1 is no
	// longer mapped.
	//
	// globalInit is mutable until it is assigned by the first successful call
	// to CreateProcess, and is protected by extMu.
	globalInit *ThreadGroup

	// syslog is the kernel log.
	syslog syslog

	runningTasksMu runningTasksMutex `state:"nosave"`

	// runningTasks is the total count of tasks currently in
	// TaskGoroutineRunningSys or TaskGoroutineRunningApp, i.e. tasks that are
	// not blocked or stopped.
	//
	// runningTasks must be accessed atomically. Increments from 0 to 1 are
	// further protected by runningTasksMu (see incRunningTasks).
	runningTasks atomicbitops.Int64

	// runningTasksCond is signaled when runningTasks is incremented from 0 to 1.
	//
	// Invariant: runningTasksCond.L == &runningTasksMu.
	runningTasksCond sync.Cond `state:"nosave"`

	// cpuClock is incremented every linux.ClockTick by a goroutine running
	// kernel.runCPUClockTicker() while runningTasks != 0.
	//
	// cpuClock is used to measure task CPU usage, since sampling monotonicClock
	// twice on every syscall turns out to be unreasonably expensive. This is
	// similar to how Linux does task CPU accounting on x86
	// (CONFIG_IRQ_TIME_ACCOUNTING), although Linux also uses scheduler timing
	// information to improve resolution
	// (kernel/sched/cputime.c:cputime_adjust()), which we can't do since
	// "preemptive" scheduling is managed by the Go runtime, which doesn't
	// provide this information.
	//
	// cpuClock is mutable, and is accessed using atomic memory operations.
	cpuClock atomicbitops.Uint64

	// cpuClockTickTimer drives increments of cpuClock.
	cpuClockTickTimer *time.Timer `state:"nosave"`

	// cpuClockMu is used to make increments of cpuClock, and updates of timers
	// based on cpuClock, atomic.
	cpuClockMu cpuClockMutex `state:"nosave"`

	// cpuClockTickerRunning is true if the goroutine that increments cpuClock is
	// running and false if it is blocked in runningTasksCond.Wait() or if it
	// never started.
	//
	// cpuClockTickerRunning is protected by runningTasksMu.
	cpuClockTickerRunning bool

	// cpuClockTickerWakeCh is sent to wake the goroutine that increments
	// cpuClock if it's sleeping between ticks.
	cpuClockTickerWakeCh chan struct{} `state:"nosave"`

	// cpuClockTickerStopCond is broadcast when cpuClockTickerRunning transitions
	// from true to false.
	//
	// Invariant: cpuClockTickerStopCond.L == &runningTasksMu.
	cpuClockTickerStopCond sync.Cond `state:"nosave"`

	// uniqueID is used to generate unique identifiers.
	//
	// uniqueID is mutable, and is accessed using atomic memory operations.
	uniqueID atomicbitops.Uint64

	// nextInotifyCookie is a monotonically increasing counter used for
	// generating unique inotify event cookies.
	//
	// nextInotifyCookie is mutable.
	nextInotifyCookie atomicbitops.Uint32

	// netlinkPorts manages allocation of netlink socket port IDs.
	netlinkPorts *port.Manager

	// saveStatus is nil if the sandbox has not been saved, errSaved or
	// errAutoSaved if it has been saved successfully, or the error causing the
	// sandbox to exit during save.
	// It is protected by extMu.
	saveStatus error `state:"nosave"`

	// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
	danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`

	// sockets records all network sockets in the system. Protected by extMu.
	sockets map[*vfs.FileDescription]*SocketRecord

	// nextSocketRecord is the next entry number to use in sockets. Protected
	// by extMu.
	nextSocketRecord uint64

	// unimplementedSyscallEmitterOnce is used in the initialization of
	// unimplementedSyscallEmitter.
	unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`

	// unimplementedSyscallEmitter is used to emit unimplemented syscall
	// events. This is initialized lazily on the first unimplemented
	// syscall.
	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`

	// SpecialOpts contains special kernel options.
	SpecialOpts

	// vfs keeps the filesystem state used across the kernel.
	vfs vfs.VirtualFilesystem

	// hostMount is the Mount used for file descriptors that were imported
	// from the host.
	hostMount *vfs.Mount

	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
	// syscalls (as opposed to named pipes created by mknod()).
	pipeMount *vfs.Mount

	// nsfsMount is the Mount used for namespaces.
	nsfsMount *vfs.Mount

	// shmMount is the Mount used for anonymous files created by the
	// memfd_create() syscall. It is analogous to Linux's shm_mnt.
	shmMount *vfs.Mount

	// socketMount is the Mount used for sockets created by the socket() and
	// socketpair() syscalls. There are several cases where a socket dentry will
	// not be contained in socketMount:
	// 1. Socket files created by mknod()
	// 2. Socket fds imported from the host (Kernel.hostMount is used for these)
	// 3. Socket files created by binding Unix sockets to a file path
	socketMount *vfs.Mount

	// sysVShmDevID is the device number used by SysV shm segments. In Linux,
	// SysV shm uses shmem_file_setup() and thus uses shm_mnt's device number.
	// In gVisor, the shm implementation does not use shmMount, extracting
	// shmMount's device number is inconvenient, applications accept a
	// different device number in practice, and using a distinct device number
	// avoids the possibility of inode number collisions due to the hack
	// described in shm.Shm.InodeID().
	sysVShmDevID uint32

	// If set to true, report address space activation waits as if the task is in
	// external wait so that the watchdog doesn't report the task stuck.
	SleepForAddressSpaceActivation bool

	// Exceptions to YAMA ptrace restrictions. Each key-value pair represents a
	// tracee-tracer relationship. The key is a process (technically, the thread
	// group leader) that can be traced by any thread that is a descendant of the
	// value. If the value is nil, then anyone can trace the process represented by
	// the key.
	//
	// ptraceExceptions is protected by the TaskSet mutex.
	ptraceExceptions map[*Task]*Task

	// YAMAPtraceScope is the current level of YAMA ptrace restrictions.
	YAMAPtraceScope atomicbitops.Int32

	// cgroupRegistry contains the set of active cgroup controllers on the
	// system. It is controlled by cgroupfs. Nil if cgroupfs is unavailable on
	// the system.
	cgroupRegistry *CgroupRegistry

	// userCountersMap maps auth.KUID into a set of user counters.
	userCountersMap   map[auth.KUID]*userCounters
	userCountersMapMu userCountersMutex `state:"nosave"`
}
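
// Fields tagged `state:"nosave"` above are omitted from the saved state and
// must be reconstructed when a Kernel is restored. Compare Init and LoadFrom
// below, which both rebuild, for example, the condition variables and the
// ticker wake channel:
//
//	k.runningTasksCond.L = &k.runningTasksMu
//	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
//	k.cpuClockTickerStopCond.L = &k.runningTasksMu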

// InitKernelArgs holds arguments to Init.
type InitKernelArgs struct {
	// FeatureSet is the emulated CPU feature set.
	FeatureSet cpuid.FeatureSet

	// Timekeeper manages time for all tasks in the system.
	Timekeeper *Timekeeper

	// RootUserNamespace is the root user namespace.
	RootUserNamespace *auth.UserNamespace

	// RootNetworkNamespace is the root network namespace. If nil, no networking
	// will be available.
	RootNetworkNamespace *inet.Namespace

	// ApplicationCores is the number of logical CPUs visible to sandboxed
	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
	// ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
	// most significant bit in cpu_possible_mask + 1.
	ApplicationCores uint

	// If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
	// instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
	// no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
	// will be overridden.
	UseHostCores bool

	// ExtraAuxv contains additional auxiliary vector entries that are added to
	// each process by the ELF loader.
	ExtraAuxv []arch.AuxEntry

	// Vdso holds the VDSO and its parameter page.
	Vdso *loader.VDSO

	// RootUTSNamespace is the root UTS namespace.
	RootUTSNamespace *UTSNamespace

	// RootIPCNamespace is the root IPC namespace.
	RootIPCNamespace *IPCNamespace

	// RootAbstractSocketNamespace is the root Abstract Socket namespace.
	RootAbstractSocketNamespace *AbstractSocketNamespace

	// PIDNamespace is the root PID namespace.
	PIDNamespace *PIDNamespace
}
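
// A typical bring-up sequence, as a sketch (p, mf, tk, and the namespace
// values are assumptions standing in for caller-provided state; see runsc's
// loader for a real example):
//
//	k := &Kernel{}
//	k.Platform = p      // must be set manually
//	k.SetMemoryFile(mf) // must be called before Init
//	if err := k.Init(InitKernelArgs{
//		FeatureSet:        cpuid.HostFeatureSet(),
//		Timekeeper:        tk, // tk.SetClocks must already have been called
//		RootUserNamespace: rootUserNS,
//		ApplicationCores:  4,
//		Vdso:              vdso,
//		RootUTSNamespace:  utsNS,
//		RootIPCNamespace:  ipcNS,
//		PIDNamespace:      pidNS,
//	}); err != nil {
//		// handle initialization failure
//	}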

// Init initializes the Kernel with no tasks.
//
// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
// before calling Init.
func (k *Kernel) Init(args InitKernelArgs) error {
	if args.Timekeeper == nil {
		return fmt.Errorf("args.Timekeeper is nil")
	}
	if args.Timekeeper.clocks == nil {
		return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
	}
	if args.RootUserNamespace == nil {
		return fmt.Errorf("args.RootUserNamespace is nil")
	}
	if args.ApplicationCores == 0 {
		return fmt.Errorf("args.ApplicationCores is 0")
	}

	k.featureSet = args.FeatureSet
	k.timekeeper = args.Timekeeper
	k.tasks = newTaskSet(args.PIDNamespace)
	k.rootUserNamespace = args.RootUserNamespace
	k.rootUTSNamespace = args.RootUTSNamespace
	k.rootIPCNamespace = args.RootIPCNamespace
	k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
	k.rootNetworkNamespace = args.RootNetworkNamespace
	if k.rootNetworkNamespace == nil {
		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil, args.RootUserNamespace)
	}
	k.runningTasksCond.L = &k.runningTasksMu
	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
	k.cpuClockTickerStopCond.L = &k.runningTasksMu
	k.applicationCores = args.ApplicationCores
	if args.UseHostCores {
		k.useHostCores = true
		maxCPU, err := hostcpu.MaxPossibleCPU()
		if err != nil {
			return fmt.Errorf("failed to get maximum CPU number: %v", err)
		}
		minAppCores := uint(maxCPU) + 1
		if k.applicationCores < minAppCores {
			log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
			k.applicationCores = minAppCores
		}
	}
	k.extraAuxv = args.ExtraAuxv
	k.vdso = args.Vdso
	k.futexes = futex.NewManager()
	k.netlinkPorts = port.New()
	k.ptraceExceptions = make(map[*Task]*Task)
	k.YAMAPtraceScope = atomicbitops.FromInt32(linux.YAMA_SCOPE_RELATIONAL)
	k.userCountersMap = make(map[auth.KUID]*userCounters)

	ctx := k.SupervisorContext()
	if err := k.vfs.Init(ctx); err != nil {
		return fmt.Errorf("failed to initialize VFS: %v", err)
	}

	err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx))
	if err != nil {
		return fmt.Errorf("failed to create mqfs filesystem: %v", err)
	}

	pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create pipefs filesystem: %v", err)
	}
	defer pipeFilesystem.DecRef(ctx)
	pipeMount := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
	k.pipeMount = pipeMount

	nsfsFilesystem, err := nsfs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create nsfs filesystem: %v", err)
	}
	defer nsfsFilesystem.DecRef(ctx)
	k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
	k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace))
	k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
	k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))

	tmpfsOpts := vfs.GetFilesystemOptions{
		InternalData: tmpfs.FilesystemOpts{
			// See mm/shmem.c:shmem_init() => vfs_kern_mount(flags=SB_KERNMOUNT).
			// Note how mm/shmem.c:shmem_fill_super() does not provide a default
			// value for sbinfo->max_blocks when SB_KERNMOUNT is set.
			DisableDefaultSizeLimit: true,
		},
	}
	tmpfsFilesystem, tmpfsRoot, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace), "", tmpfsOpts)
	if err != nil {
		return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
	}
	defer tmpfsFilesystem.DecRef(ctx)
	defer tmpfsRoot.DecRef(ctx)
	k.shmMount = k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})

	socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create sockfs filesystem: %v", err)
	}
	defer socketFilesystem.DecRef(ctx)
	k.socketMount = k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})

	sysVShmDevMinor, err := k.vfs.GetAnonBlockDevMinor()
	if err != nil {
		return fmt.Errorf("failed to get device number for SysV shm: %v", err)
	}
	k.sysVShmDevID = linux.MakeDeviceID(linux.UNNAMED_MAJOR, sysVShmDevMinor)

	k.sockets = make(map[*vfs.FileDescription]*SocketRecord)

	k.cgroupRegistry = newCgroupRegistry()
	return nil
}
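
// SaveTo below requires the kernel to be paused for the duration of the call;
// a minimal checkpoint sequence, as a sketch (w is a caller-provided
// wire.Writer):
//
//	k.Pause()
//	defer k.Unpause()
//	if err := k.SaveTo(ctx, w); err != nil {
//		// handle save failure
//	}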

// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
	saveStart := time.Now()

	// Do not allow other Kernel methods to affect it while it's being saved.
	k.extMu.Lock()
	defer k.extMu.Unlock()

	// Stop time.
	k.pauseTimeLocked(ctx)
	defer k.resumeTimeLocked(ctx)

	// Evict all evictable MemoryFile allocations.
	k.mf.StartEvictions()
	k.mf.WaitForEvictions()

	// Discard unsavable mappings, such as those for host file descriptors.
	if err := k.invalidateUnsavableMappings(ctx); err != nil {
		return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
	}

	// Prepare filesystems for saving. This must be done after
	// invalidateUnsavableMappings(), since dropping memory mappings may
	// affect filesystem state (e.g. page cache reference counts).
	if err := k.vfs.PrepareSave(ctx); err != nil {
		return err
	}

	// Save the CPUID FeatureSet before the rest of the kernel so we can
	// verify its compatibility on restore before attempting to restore the
	// entire kernel, which may fail on an incompatible machine.
	//
	// N.B. This will also be saved along with the full kernel save below.
	cpuidStart := time.Now()
	if _, err := state.Save(ctx, w, &k.featureSet); err != nil {
		return err
	}
	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))

	if rootNS := k.rootNetworkNamespace; rootNS != nil && rootNS.Stack() != nil {
		// Pause the network stack.
		netstackPauseStart := time.Now()
		log.Infof("Pausing root network namespace")
		k.rootNetworkNamespace.Stack().Pause()
		defer k.rootNetworkNamespace.Stack().Resume()
		log.Infof("Pausing root network namespace took [%s].", time.Since(netstackPauseStart))
	}

	// Save the kernel state.
	kernelStart := time.Now()
	stats, err := state.Save(ctx, w, k)
	if err != nil {
		return err
	}
	log.Infof("Kernel save stats: %s", stats.String())
	log.Infof("Kernel save took [%s].", time.Since(kernelStart))

	// Save the memory file's state.
	memoryStart := time.Now()
	if err := k.mf.SaveTo(ctx, w); err != nil {
		return err
	}
	log.Infof("Memory save took [%s].", time.Since(memoryStart))

	log.Infof("Overall save took [%s].", time.Since(saveStart))

	return nil
}

// Preconditions: The kernel must be paused.
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
	invalidated := make(map[*mm.MemoryManager]struct{})
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()
	for t := range k.tasks.Root.tids {
		// We can skip locking Task.mu here since the kernel is paused.
		if memMgr := t.image.MemoryManager; memMgr != nil {
			if _, ok := invalidated[memMgr]; !ok {
				if err := memMgr.InvalidateUnsavable(ctx); err != nil {
					return err
				}
				invalidated[memMgr] = struct{}{}
			}
		}
		// I really wish we just had a sync.Map of all MMs...
		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
			if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
				return err
			}
		}
	}
	return nil
}
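
// LoadFrom below is the restore-side counterpart of SaveTo. A sketch of the
// calling sequence, under the same assumptions as the Init sketch above (a
// fresh Kernel with Platform and MemoryFile already set; r, timeReady,
// netStack, clocks, and vfsOpts are caller-provided):
//
//	k := &Kernel{}
//	k.Platform = p
//	k.SetMemoryFile(mf)
//	if err := k.LoadFrom(ctx, r, timeReady, netStack, clocks, vfsOpts); err != nil {
//		// handle restore failure
//	}
//	// ... later, k.Start() resumes the restored tasks.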

// LoadFrom restores the state of k from r.
func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
	loadStart := time.Now()

	k.runningTasksCond.L = &k.runningTasksMu
	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
	k.cpuClockTickerStopCond.L = &k.runningTasksMu

	initAppCores := k.applicationCores

	// Load the pre-saved CPUID FeatureSet.
	//
	// N.B. This was also saved along with the full kernel below, so we
	// don't need to explicitly install it in the Kernel.
	cpuidStart := time.Now()
	if _, err := state.Load(ctx, r, &k.featureSet); err != nil {
		return err
	}
	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))

	// Verify that the FeatureSet is usable on this host. We do this before
	// Kernel load so that the explicit CPUID mismatch error has priority
	// over floating point state restore errors that may occur on load on
	// an incompatible machine.
	if err := k.featureSet.CheckHostCompatible(); err != nil {
		return err
	}

	// Load the kernel state.
	kernelStart := time.Now()
	stats, err := state.Load(ctx, r, k)
	if err != nil {
		return err
	}
	log.Infof("Kernel load stats: %s", stats.String())
	log.Infof("Kernel load took [%s].", time.Since(kernelStart))

	// rootNetworkNamespace should be populated after loading the state file.
	// Restore the root network stack.
	k.rootNetworkNamespace.RestoreRootStack(net)

	// Load the memory file's state.
	memoryStart := time.Now()
	if err := k.mf.LoadFrom(ctx, r); err != nil {
		return err
	}
	log.Infof("Memory load took [%s].", time.Since(memoryStart))

	log.Infof("Overall load took [%s]", time.Since(loadStart))

	k.Timekeeper().SetClocks(clocks)

	if timeReady != nil {
		close(timeReady)
	}

	if net != nil {
		net.Resume()
	}

	if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
		return err
	}

	tcpip.AsyncLoading.Wait()

	log.Infof("Overall load took [%s] after async work", time.Since(loadStart))

	// Applications may size per-cpu structures based on k.applicationCores, so
	// it can't change across save/restore. When we are virtualizing CPU
	// numbers, this isn't a problem. However, when we are exposing host CPU
	// assignments, we can't tolerate an increase in the number of host CPUs,
	// which could result in getcpu(2) returning CPUs that applications expect
	// not to exist.
	if k.useHostCores && initAppCores > k.applicationCores {
		return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
	}

	return nil
}

// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
	id := k.uniqueID.Add(1)
	if id == 0 {
		panic("unique identifier generator wrapped around")
	}
	return id
}

// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
	// Filename is the filename to load as the init binary.
	//
	// If this is provided as "", File will be checked, then the file will be
	// guessed via Argv[0].
	Filename string

	// File is a passed host FD pointing to a file to load as the init binary.
	//
	// This is checked if and only if Filename is "".
	File *vfs.FileDescription

	// Argv is a list of arguments.
	Argv []string

	// Envv is a list of environment variables.
	Envv []string

	// WorkingDirectory is the initial working directory.
	//
	// This defaults to the root if empty.
	WorkingDirectory string

	// Credentials is the initial credentials.
	Credentials *auth.Credentials

	// FDTable is the initial set of file descriptors. If CreateProcess succeeds,
	// it takes a reference on FDTable.
	FDTable *FDTable

	// Umask is the initial umask.
	Umask uint

	// Limits are the initial resource limits.
	Limits *limits.LimitSet

	// MaxSymlinkTraversals is the maximum number of symlinks to follow
	// during resolution.
	MaxSymlinkTraversals uint

	// UTSNamespace is the initial UTS namespace.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the initial IPC namespace.
	IPCNamespace *IPCNamespace

	// PIDNamespace is the initial PID namespace.
	PIDNamespace *PIDNamespace

	// AbstractSocketNamespace is the initial Abstract Socket namespace.
	AbstractSocketNamespace *AbstractSocketNamespace

	// MountNamespace optionally contains the mount namespace for this
	// process. If nil, the init process's mount namespace is used.
	//
	// Anyone setting MountNamespace must donate a reference (i.e.
	// increment it).
	MountNamespace *vfs.MountNamespace

	// ContainerID is the container that the process belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the container is initialized to.
	InitialCgroups map[Cgroup]struct{}
}

// NewContext returns a context.Context that represents the task that will be
// created by k.CreateProcess(args).
func (args *CreateProcessArgs) NewContext(k *Kernel) context.Context {
	return &createProcessContext{
		Context: context.Background(),
		kernel:  k,
		args:    args,
	}
}

// createProcessContext is a context.Context that represents the context
// associated with a task that is being created.
type createProcessContext struct {
	context.Context
	kernel *Kernel
	args   *CreateProcessArgs
}

// Value implements context.Context.Value.
func (ctx *createProcessContext) Value(key any) any {
	switch key {
	case CtxKernel:
		return ctx.kernel
	case CtxPIDNamespace:
		return ctx.args.PIDNamespace
	case CtxUTSNamespace:
		utsns := ctx.args.UTSNamespace
		utsns.IncRef()
		return utsns
	case ipc.CtxIPCNamespace:
		ipcns := ctx.args.IPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		return ctx.args.Credentials
	case vfs.CtxRoot:
		if ctx.args.MountNamespace == nil {
			return nil
		}
		root := ctx.args.MountNamespace.Root(ctx)
		return root
	case vfs.CtxMountNamespace:
		if ctx.kernel.globalInit == nil {
			return nil
		}
		mntns := ctx.kernel.GlobalInit().Leader().MountNamespace()
		mntns.IncRef()
		return mntns
	case inet.CtxStack:
		return ctx.kernel.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.kernel.RealtimeClock()
	case limits.CtxLimits:
		return ctx.args.Limits
	case pgalloc.CtxMemoryCgroupID:
		return ctx.getMemoryCgroupID()
	case pgalloc.CtxMemoryFile:
		return ctx.kernel.mf
	case pgalloc.CtxMemoryFileProvider:
		return ctx.kernel
	case platform.CtxPlatform:
		return ctx.kernel
	case uniqueid.CtxGlobalUniqueID:
		return ctx.kernel.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.kernel
	case uniqueid.CtxInotifyCookie:
		return ctx.kernel.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.kernel
	default:
		return nil
	}
}

func (ctx *createProcessContext) getMemoryCgroupID() uint32 {
	for cg := range ctx.args.InitialCgroups {
		for _, ctl := range cg.Controllers() {
			if ctl.Type() == CgroupControllerMemory {
				return cg.ID()
			}
		}
	}
	return InvalidCgroupID
}
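
// A sketch of creating and starting an init process with the types above (the
// argument values are illustrative assumptions; creds and fdTable stand in
// for caller-constructed objects):
//
//	args := CreateProcessArgs{
//		Filename:                "/bin/sh",
//		Argv:                    []string{"sh", "-c", "echo hello"},
//		Envv:                    []string{"PATH=/bin"},
//		Credentials:             creds,
//		FDTable:                 fdTable,
//		Limits:                  limits.NewLimitSet(),
//		MaxSymlinkTraversals:    40,
//		UTSNamespace:            k.RootUTSNamespace(),
//		IPCNamespace:            k.RootIPCNamespace(),
//		PIDNamespace:            k.RootPIDNamespace(),
//		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
//		ContainerID:             "c1",
//	}
//	tg, _, err := k.CreateProcess(args)
//	if err != nil {
//		// handle error
//	}
//	k.StartProcess(tg) // only needed if k.Start() was already called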

// CreateProcess creates a new task in a new thread group with the given
// options. The new task has no parent and is in the root PID namespace.
//
// If k.Start() has already been called, then the created process must be
// started by calling kernel.StartProcess(tg).
//
// If k.Start() has not yet been called, then the created task will begin
// running when k.Start() is called.
//
// CreateProcess has no analogue in Linux; it is used to create the initial
// application task, as well as processes started by the control server.
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	log.Infof("EXEC: %v", args.Argv)

	ctx := args.NewContext(k)
	mntns := args.MountNamespace
	if mntns == nil {
		if k.globalInit == nil {
			return nil, 0, fmt.Errorf("mount namespace is nil")
		}
		// Add a reference to the namespace, which is transferred to the new process.
		mntns = k.globalInit.Leader().MountNamespace()
		mntns.IncRef()
	}
	// Get the root directory from the MountNamespace.
	root := mntns.Root(ctx)
	defer root.DecRef(ctx)

	// Grab the working directory.
	wd := root // Default.
	if args.WorkingDirectory != "" {
		pop := vfs.PathOperation{
			Root:               root,
			Start:              wd,
			Path:               fspath.Parse(args.WorkingDirectory),
			FollowFinalSymlink: true,
		}
		// NOTE(b/236028361): Do not set CheckSearchable flag to true.
		// Application is allowed to start with a working directory that it can
		// not access/search. This is consistent with Docker and VFS1. Runc
		// explicitly allows for this in 6ce2d63a5db6 ("libct/init_linux: retry
		// chdir to fix EPERM"). As described in the commit, runc unintentionally
		// allowed this behavior in a couple of releases and applications started
		// relying on it. So they decided to allow it for backward compatibility.
		var err error
		wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{})
		if err != nil {
			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
		}
		defer wd.DecRef(ctx)
	}
	fsContext := NewFSContext(root, wd, args.Umask)

	tg := k.NewThreadGroup(args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
	cu := cleanup.Make(func() {
		tg.Release(ctx)
	})
	defer cu.Clean()

	// Check which file to start from.
	switch {
	case args.Filename != "":
		// If a filename is given, take that.
		// Set File to nil so we resolve the path in LoadTaskImage.
		args.File = nil
	case args.File != nil:
		// If File is set, take the File provided directly.
		args.Filename = args.File.MappedName(ctx)
	default:
		// Otherwise look at Argv and see if the first argument is a valid path.
		if len(args.Argv) == 0 {
			return nil, 0, fmt.Errorf("no filename or command provided")
		}
		if !filepath.IsAbs(args.Argv[0]) {
			return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
		}
		args.Filename = args.Argv[0]
	}

	// Create a fresh task context.
	remainingTraversals := args.MaxSymlinkTraversals
	loadArgs := loader.LoadArgs{
		Root:                root,
		WorkingDir:          wd,
		RemainingTraversals: &remainingTraversals,
		ResolveFinal:        true,
		Filename:            args.Filename,
		File:                args.File,
		CloseOnExec:         false,
		Argv:                args.Argv,
		Envv:                args.Envv,
		Features:            k.featureSet,
	}

	image, se := k.LoadTaskImage(ctx, loadArgs)
	if se != nil {
		return nil, 0, errors.New(se.String())
	}

	// Take a reference on the FDTable, which will be transferred to
	// TaskSet.NewTask().
	args.FDTable.IncRef()

	// Create the task.
	config := &TaskConfig{
		Kernel:                  k,
		ThreadGroup:             tg,
		TaskImage:               image,
		FSContext:               fsContext,
		FDTable:                 args.FDTable,
		Credentials:             args.Credentials,
		NetworkNamespace:        k.RootNetworkNamespace(),
		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
		UTSNamespace:            args.UTSNamespace,
		IPCNamespace:            args.IPCNamespace,
		AbstractSocketNamespace: args.AbstractSocketNamespace,
		MountNamespace:          mntns,
		ContainerID:             args.ContainerID,
		InitialCgroups:          args.InitialCgroups,
		UserCounters:            k.GetUserCounters(args.Credentials.RealKUID),
		// A task with no parent starts out with no session keyring.
		SessionKeyring: nil,
	}
	config.NetworkNamespace.IncRef()
	t, err := k.tasks.NewTask(ctx, config)
	if err != nil {
		return nil, 0, err
	}
	t.traceExecEvent(image) // Simulate exec for tracing.

	// Success.
	cu.Release()
	tgid := k.tasks.Root.IDOfThreadGroup(tg)
	if k.globalInit == nil {
		k.globalInit = tg
	}
	return tg, tgid, nil
}

// StartProcess starts running a process that was created with CreateProcess.
func (k *Kernel) StartProcess(tg *ThreadGroup) {
	t := tg.Leader()
	tid := k.tasks.Root.IDOfTask(t)
	t.Start(tid)
}

// Start starts execution of all tasks in k.
//
// Preconditions: Start may be called exactly once.
func (k *Kernel) Start() error {
	k.extMu.Lock()
	defer k.extMu.Unlock()

	if k.started {
		return fmt.Errorf("kernel already started")
	}

	k.started = true
	k.cpuClockTickTimer = time.NewTimer(linux.ClockTick)
	k.runningTasksMu.Lock()
	k.cpuClockTickerRunning = true
	k.runningTasksMu.Unlock()
	go k.runCPUClockTicker()
	// If k was created by LoadFrom, timers were stopped during Kernel.SaveTo
	// and need to be resumed. If k was created by Init, this is a no-op.
	k.resumeTimeLocked(k.SupervisorContext())
	k.tasks.mu.RLock()
	ts := make([]*Task, 0, len(k.tasks.Root.tids))
	for t := range k.tasks.Root.tids {
		ts = append(ts, t)
	}
	k.tasks.mu.RUnlock()
	// Start task goroutines.
	// NOTE(b/235349091): We don't actually need the TaskSet mutex, we just
	// need to make sure we only call t.Start() once for each task. Holding the
	// mutex for each task start may cause a nested locking error.
	for _, t := range ts {
		t.Start(t.ThreadID())
	}
	return nil
}

// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
// Preconditions:
//   - Any task goroutines running in k must be stopped.
//   - k.extMu must be locked.
func (k *Kernel) pauseTimeLocked(ctx context.Context) {
	// Since all task goroutines have been stopped by precondition, the CPU clock
	// ticker should stop on its own; wait for it to do so, waking it up from
	// sleeping between ticks if necessary.
	k.runningTasksMu.Lock()
	for k.cpuClockTickerRunning {
		select {
		case k.cpuClockTickerWakeCh <- struct{}{}:
		default:
		}
		k.cpuClockTickerStopCond.Wait()
	}
	k.runningTasksMu.Unlock()

	// By precondition, nothing else can be interacting with PIDNamespace.tids
	// or FDTable.files, so we can iterate them without synchronization. (We
	// can't hold the TaskSet mutex when pausing thread group timers because
	// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
	// mutex, while holding the Timer mutex.)
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Pause()
			for _, it := range t.tg.timers {
				it.PauseTimer()
			}
		}
		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
		// but ktime.Timer.Pause is idempotent so this is harmless.
		if t.fdTable != nil {
			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
					tfd.PauseTimer()
				}
			})
		}
	}
	k.timekeeper.PauseUpdates()
}

// resumeTimeLocked resumes all Timers and Timekeeper updates. If
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
// Preconditions:
//   - Any task goroutines running in k must be stopped.
//   - k.extMu must be locked.
func (k *Kernel) resumeTimeLocked(ctx context.Context) {
	// The CPU clock ticker will automatically resume as task goroutines resume
	// execution.

	k.timekeeper.ResumeUpdates()
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Resume()
			for _, it := range t.tg.timers {
				it.ResumeTimer()
			}
		}
		if t.fdTable != nil {
			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
					tfd.ResumeTimer()
				}
			})
		}
	}
}
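
// incRunningTasks and decRunningTasks below manage the idle/active transition
// that the runningTasks and cpuClockTicker* fields in Kernel describe. As a
// simplified model of the ticker goroutine's contract (a sketch only; the
// real runCPUClockTicker lives elsewhere in this package):
//
//	for {
//		runningTasksMu.Lock()
//		for runningTasks.Load() == 0 {
//			// Going idle: announce that the ticker has stopped ...
//			cpuClockTickerRunning = false
//			cpuClockTickerStopCond.Broadcast()
//			// ... and sleep until incRunningTasks signals 0 -> 1.
//			runningTasksCond.Wait()
//		}
//		runningTasksMu.Unlock()
//		// Active: advance cpuClock once per linux.ClockTick.
//		<-cpuClockTickTimer.C
//		cpuClock.Add(1)
//	}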

func (k *Kernel) incRunningTasks() {
	for {
		tasks := k.runningTasks.Load()
		if tasks != 0 {
			// Standard case. Simply increment.
			if !k.runningTasks.CompareAndSwap(tasks, tasks+1) {
				continue
			}
			return
		}

		// Transition from 0 -> 1.
		k.runningTasksMu.Lock()
		if k.runningTasks.Load() != 0 {
			// Raced with another transition and lost.
			k.runningTasks.Add(1)
			k.runningTasksMu.Unlock()
			return
		}
		if !k.cpuClockTickerRunning {
			select {
			case tickTime := <-k.cpuClockTickTimer.C:
				// Rearm the timer since we consumed the wakeup. Estimate how much time
				// remains on the current tick so that periodic workloads interact with
				// the (periodic) CPU clock ticker in the same way that they would
				// without the optimization of putting the ticker to sleep.
				missedNS := time.Since(tickTime).Nanoseconds()
				missedTicks := missedNS / linux.ClockTick.Nanoseconds()
				thisTickNS := missedNS - missedTicks*linux.ClockTick.Nanoseconds()
				k.cpuClockTickTimer.Reset(time.Duration(linux.ClockTick.Nanoseconds() - thisTickNS))
				// Increment k.cpuClock on the CPU clock ticker goroutine's behalf.
				// (Whole missed ticks don't matter, and adding them to k.cpuClock will
				// just confuse the watchdog.) At the time the tick occurred, all task
				// goroutines were asleep, so there's nothing else to do. This ensures
				// that our caller (Task.accountTaskGoroutineLeave()) records an
				// updated k.cpuClock in Task.gosched.Timestamp, so that it's correctly
				// accounted as having resumed execution in the sentry during this tick
				// instead of at the end of the previous one.
				k.cpuClock.Add(1)
			default:
			}
			// We are transitioning from idle to active. Set
			// k.cpuClockTickerRunning = true here so that if we transition to
			// idle and then active again before the CPU clock ticker goroutine
			// has a chance to run, the first call to k.incRunningTasks() at the
			// end of that cycle does not try to steal k.cpuClockTickTimer.C
			// again, as this would allow workloads that rapidly cycle between
			// idle and active to starve the CPU clock ticker of chances to
			// observe task goroutines in a running state and account their CPU
			// usage.
			k.cpuClockTickerRunning = true
			k.runningTasksCond.Signal()
		}
		// This store must happen after the increment of k.cpuClock above to ensure
		// that concurrent calls to Task.accountTaskGoroutineLeave() also observe
		// the updated k.cpuClock.
		k.runningTasks.Store(1)
		k.runningTasksMu.Unlock()
		return
	}
}

func (k *Kernel) decRunningTasks() {
	tasks := k.runningTasks.Add(-1)
	if tasks < 0 {
		panic(fmt.Sprintf("Invalid running count %d", tasks))
	}

	// Nothing to do. The next CPU clock tick will disable the timer if
	// there is still nothing running. This provides approximately one tick
	// of slack in which we can switch back and forth between idle and
	// active without an expensive transition.
}

// WaitExited blocks until all tasks in k have exited.
func (k *Kernel) WaitExited() {
	k.tasks.liveGoroutines.Wait()
}

// Kill requests that all tasks in k immediately exit as if group exiting with
// status ws. Kill does not wait for tasks to exit.
func (k *Kernel) Kill(ws linux.WaitStatus) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.Kill(ws)
}

// Pause requests that all tasks in k temporarily stop executing, and blocks
// until all tasks and asynchronous I/O operations in k have stopped. Multiple
// calls to Pause nest and require an equal number of calls to Unpause to
// resume execution.
func (k *Kernel) Pause() {
	k.extMu.Lock()
	k.tasks.BeginExternalStop()
	k.extMu.Unlock()
	k.tasks.runningGoroutines.Wait()
	k.tasks.aioGoroutines.Wait()
}

// ReceiveTaskStates receives full states for all tasks.
func (k *Kernel) ReceiveTaskStates() {
	k.extMu.Lock()
	k.tasks.PullFullState()
	k.extMu.Unlock()
}

// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.EndExternalStop()
}

// SendExternalSignal injects a signal into the kernel.
//
// context is used only for debugging to describe how the signal was received.
//
// Preconditions: Kernel must have an init process.
func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.sendExternalSignal(info, context)
}

// SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup.
//
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return tg.SendSignal(info)
}

// SendExternalSignalProcessGroup sends a signal to all ThreadGroups in the
// given process group.
//
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalProcessGroup(pg *ProcessGroup, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	// If anything goes wrong, we'll return the error, but still try our
	// best to deliver to other processes in the group.
	var firstErr error
	for _, tg := range k.TaskSet().Root.ThreadGroups() {
		if tg.ProcessGroup() != pg {
			continue
		}
		if err := tg.SendSignal(info); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}

// SendContainerSignal sends the given signal to all processes inside the
// namespace that match the given container ID.
func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()

	var lastErr error
	for tg := range k.tasks.Root.tgids {
		if tg.leader.ContainerID() == cid {
			tg.signalHandlers.mu.Lock()
			infoCopy := *info
			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
				lastErr = err
			}
			tg.signalHandlers.mu.Unlock()
		}
	}
	return lastErr
}

// RebuildTraceContexts rebuilds the trace context for all tasks.
//
// Unfortunately, if these are built while tracing is not enabled, then we will
// not have meaningful trace data. Rebuilding here ensures that we can do so
// after tracing has been enabled.
func (k *Kernel) RebuildTraceContexts() {
	// We need to pause all task goroutines because Task.rebuildTraceContext()
	// replaces Task.traceContext and Task.traceTask, which are
	// task-goroutine-exclusive (i.e. the task goroutine assumes that it can
	// access them without synchronization) for performance.
	k.Pause()
	defer k.Unpause()

	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()

	for t, tid := range k.tasks.Root.tids {
		t.rebuildTraceContext(tid)
	}
}

// FeatureSet returns the FeatureSet.
func (k *Kernel) FeatureSet() cpuid.FeatureSet {
	return k.featureSet
}

// Timekeeper returns the Timekeeper.
func (k *Kernel) Timekeeper() *Timekeeper {
	return k.timekeeper
}

// TaskSet returns the TaskSet.
func (k *Kernel) TaskSet() *TaskSet {
	return k.tasks
}

// RootUserNamespace returns the root UserNamespace.
func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
	return k.rootUserNamespace
}

// RootUTSNamespace returns the root UTSNamespace.
func (k *Kernel) RootUTSNamespace() *UTSNamespace {
	k.rootUTSNamespace.IncRef()
	return k.rootUTSNamespace
}

// RootIPCNamespace takes a reference and returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
	k.rootIPCNamespace.IncRef()
	return k.rootIPCNamespace
}

// RootPIDNamespace returns the root PIDNamespace.
func (k *Kernel) RootPIDNamespace() *PIDNamespace {
	return k.tasks.Root
}
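
// Note the asymmetry among the accessors above and below: RootUTSNamespace
// and RootIPCNamespace take a reference on the returned namespace, which the
// caller must release, while RootUserNamespace and RootPIDNamespace do not.
// A sketch of correct use of a reference-taking accessor (assuming the
// namespace's DecRef takes the usual context argument):
//
//	utsns := k.RootUTSNamespace()
//	defer utsns.DecRef(ctx) // drop the reference taken by the accessor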

// RootAbstractSocketNamespace returns the root AbstractSocketNamespace.
func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
	return k.rootAbstractSocketNamespace
}

// RootNetworkNamespace returns the root network namespace, always non-nil.
func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
	return k.rootNetworkNamespace
}

// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
// nil if no such thread group exists. GlobalInit may return a thread group
// containing no tasks if the thread group has already exited.
func (k *Kernel) GlobalInit() *ThreadGroup {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return k.globalInit
}

// TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace.
func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) {
	k.globalInit = tg
}

// ApplicationCores returns the number of CPUs visible to sandboxed
// applications.
func (k *Kernel) ApplicationCores() uint {
	return k.applicationCores
}

// RealtimeClock returns the application CLOCK_REALTIME clock.
func (k *Kernel) RealtimeClock() ktime.Clock {
	return k.timekeeper.realtimeClock
}

// MonotonicClock returns the application CLOCK_MONOTONIC clock.
func (k *Kernel) MonotonicClock() ktime.Clock {
	return k.timekeeper.monotonicClock
}

// CPUClockNow returns the current value of k.cpuClock.
func (k *Kernel) CPUClockNow() uint64 {
	return k.cpuClock.Load()
}

// Syslog returns the syslog.
func (k *Kernel) Syslog() *syslog {
	return &k.syslog
}

// GenerateInotifyCookie generates a unique inotify event cookie.
//
// Returned values may overlap with previously returned values if the value
// space is exhausted. 0 is not a valid cookie value; all other values
// representable in a uint32 are allowed.
func (k *Kernel) GenerateInotifyCookie() uint32 {
	id := k.nextInotifyCookie.Add(1)
	// Wrap-around is explicitly allowed for inotify event cookies.
	if id == 0 {
		id = k.nextInotifyCookie.Add(1)
	}
	return id
}

// NetlinkPorts returns the netlink port manager.
func (k *Kernel) NetlinkPorts() *port.Manager {
	return k.netlinkPorts
}

var (
	errSaved     = errors.New("sandbox has been successfully saved")
	errAutoSaved = errors.New("sandbox has been successfully auto-saved")
)

// SaveStatus returns the sandbox save status. If it was saved successfully,
// autosaved indicates whether save was triggered by autosave. If it was not
// saved successfully, err indicates the sandbox error that caused the kernel to
// exit during save.
func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	switch k.saveStatus {
	case nil:
		return false, false, nil
	case errSaved:
		return true, false, nil
	case errAutoSaved:
		return true, true, nil
	default:
		return false, false, k.saveStatus
	}
}

// SetSaveSuccess sets the flag indicating that save completed successfully, if
// no status was already set.
func (k *Kernel) SetSaveSuccess(autosave bool) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	if k.saveStatus == nil {
		if autosave {
			k.saveStatus = errAutoSaved
		} else {
			k.saveStatus = errSaved
		}
	}
}

// SetSaveError sets the sandbox error that caused the kernel to exit during
// save, if one is not already set.
func (k *Kernel) SetSaveError(err error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	if k.saveStatus == nil {
		k.saveStatus = err
	}
}

// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
// LoadFrom.
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
	k.mf = mf
}

// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile.
func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
	return k.mf
}

// SupervisorContext returns a Context with maximum privileges in k. It should
// only be used by goroutines outside the control of the emulated kernel
// defined by k.
//
// Callers are responsible for ensuring that the returned Context is not used
// concurrently with changes to the Kernel.
func (k *Kernel) SupervisorContext() context.Context {
	return &supervisorContext{
		Kernel: k,
		Logger: log.Log(),
	}
}

// SocketRecord represents a socket recorded in Kernel.sockets.
//
// +stateify savable
type SocketRecord struct {
	k    *Kernel
	Sock *vfs.FileDescription
	ID   uint64 // Socket table entry number.
}

// RecordSocket adds a socket to the system-wide socket table for
// tracking.
//
// Precondition: Caller must hold a reference to sock.
//
// Note that the socket table will not hold a reference on the
// vfs.FileDescription.
func (k *Kernel) RecordSocket(sock *vfs.FileDescription) {
	k.extMu.Lock()
	if _, ok := k.sockets[sock]; ok {
		panic(fmt.Sprintf("Socket %p added twice", sock))
	}
	id := k.nextSocketRecord
	k.nextSocketRecord++
	s := &SocketRecord{
		k:    k,
		ID:   id,
		Sock: sock,
	}
	k.sockets[sock] = s
	k.extMu.Unlock()
}

// DeleteSocket removes a socket from the system-wide socket table.
func (k *Kernel) DeleteSocket(sock *vfs.FileDescription) {
	k.extMu.Lock()
	delete(k.sockets, sock)
	k.extMu.Unlock()
}

// ListSockets returns a snapshot of all sockets.
//
// Callers of ListSockets() should use SocketRecord.Sock.TryIncRef()
// to get a reference on a socket in the table.
func (k *Kernel) ListSockets() []*SocketRecord {
	k.extMu.Lock()
	var socks []*SocketRecord
	for _, s := range k.sockets {
		socks = append(socks, s)
	}
	k.extMu.Unlock()
	return socks
}
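
// A sketch of the intended ListSockets usage, per its doc comment: the table
// holds no references, so each entry must be pinned with TryIncRef before use
// and released afterwards.
//
//	for _, sr := range k.ListSockets() {
//		if !sr.Sock.TryIncRef() {
//			continue // socket is concurrently being destroyed; skip it
//		}
//		// ... inspect sr.Sock ...
//		sr.Sock.DecRef(ctx)
//	}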

// supervisorContext is a privileged context.
type supervisorContext struct {
	context.NoTask
	log.Logger
	*Kernel
}

// Deadline implements context.Context.Deadline.
func (*Kernel) Deadline() (time.Time, bool) {
	return time.Time{}, false
}

// Done implements context.Context.Done.
func (*Kernel) Done() <-chan struct{} {
	return nil
}

// Err implements context.Context.Err.
func (*Kernel) Err() error {
	return nil
}

// Value implements context.Context.Value.
func (ctx *supervisorContext) Value(key any) any {
	switch key {
	case CtxCanTrace:
		// The supervisor context can trace anything. (None of
		// supervisorContext's users are expected to invoke ptrace, but ptrace
		// permissions are required for certain file accesses.)
		return func(*Task, bool) bool { return true }
	case CtxKernel:
		return ctx.Kernel
	case CtxPIDNamespace:
		return ctx.Kernel.tasks.Root
	case CtxUTSNamespace:
		utsns := ctx.Kernel.rootUTSNamespace
		utsns.IncRef()
		return utsns
	case ipc.CtxIPCNamespace:
		ipcns := ctx.Kernel.rootIPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		// The supervisor context is global root.
		return auth.NewRootCredentials(ctx.Kernel.rootUserNamespace)
	case vfs.CtxRoot:
		if ctx.Kernel.globalInit == nil {
			return vfs.VirtualDentry{}
		}
		root := ctx.Kernel.GlobalInit().Leader().MountNamespace().Root(ctx)
		return root
	case vfs.CtxMountNamespace:
		if ctx.Kernel.globalInit == nil {
			return nil
		}
		mntns := ctx.Kernel.GlobalInit().Leader().MountNamespace()
		mntns.IncRef()
		return mntns
	case inet.CtxStack:
		return ctx.Kernel.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.Kernel.RealtimeClock()
	case limits.CtxLimits:
		// No limits apply.
		return limits.NewLimitSet()
	case pgalloc.CtxMemoryFile:
		return ctx.Kernel.mf
	case pgalloc.CtxMemoryFileProvider:
		return ctx.Kernel
	case platform.CtxPlatform:
		return ctx.Kernel
	case uniqueid.CtxGlobalUniqueID:
		return ctx.Kernel.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.Kernel
	case uniqueid.CtxInotifyCookie:
		return ctx.Kernel.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.Kernel
	case cpuid.CtxFeatureSet:
		return ctx.Kernel.featureSet
	default:
		return nil
	}
}

// Rate limits for the number of unimplemented syscall events.
const (
	unimplementedSyscallsMaxRate = 100  // events per second
	unimplementedSyscallBurst    = 1000 // events
)

// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
// channel.
func (k *Kernel) EmitUnimplementedEvent(ctx context.Context, sysno uintptr) {
	k.unimplementedSyscallEmitterOnce.Do(func() {
		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
	})

	t := TaskFromContext(ctx)
	IncrementUnimplementedSyscallCounter(sysno)
	_, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
		Tid:       int32(t.ThreadID()),
		Registers: t.Arch().StateData().Proto(),
	})
}

// VFS returns the virtual filesystem for the kernel.
func (k *Kernel) VFS() *vfs.VirtualFilesystem {
	return &k.vfs
}

// SetHostMount sets the hostfs mount.
func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
	if k.hostMount != nil {
		panic("Kernel.hostMount cannot be set more than once")
	}
	k.hostMount = mnt
}

// HostMount returns the hostfs mount.
func (k *Kernel) HostMount() *vfs.Mount {
	return k.hostMount
}

// PipeMount returns the pipefs mount.
func (k *Kernel) PipeMount() *vfs.Mount {
	return k.pipeMount
}

// GetNamespaceInode returns a new nsfs inode which serves as a reference
// counter for the namespace.
func (k *Kernel) GetNamespaceInode(ctx context.Context, ns vfs.Namespace) refs.TryRefCounter {
	return nsfs.NewInode(ctx, k.nsfsMount, ns)
}

// ShmMount returns the tmpfs mount.
func (k *Kernel) ShmMount() *vfs.Mount {
	return k.shmMount
}

// SocketMount returns the sockfs mount.
func (k *Kernel) SocketMount() *vfs.Mount {
	return k.socketMount
}

// CgroupRegistry returns the cgroup registry.
func (k *Kernel) CgroupRegistry() *CgroupRegistry {
	return k.cgroupRegistry
}

// Release releases resources owned by k.
//
// Precondition: This should only be called after the kernel is fully
// initialized, e.g. after k.Start() has been called.
func (k *Kernel) Release() {
	ctx := k.SupervisorContext()
	k.hostMount.DecRef(ctx)
	k.pipeMount.DecRef(ctx)
	k.nsfsMount.DecRef(ctx)
	k.shmMount.DecRef(ctx)
	k.socketMount.DecRef(ctx)
	k.vfs.Release(ctx)
	k.timekeeper.Destroy()
	k.vdso.Release(ctx)
	k.RootNetworkNamespace().DecRef(ctx)
}

// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
// hierarchy.
//
// Precondition: root must be a new cgroup with no tasks. This implies the
// controllers for root are also new and currently manage no task, which in turn
// implies the new cgroup can be populated without migrating tasks between
// cgroups.
func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) {
	k.tasks.mu.RLock()
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		// A task can be in the cgroup if it has been created after the
		// cgroup hierarchy was registered.
		t.enterCgroupIfNotYetLocked(root)
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()
}

// ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the
// hierarchy with the provided id. This is intended for use during hierarchy
// teardown, as otherwise the tasks would be orphaned w.r.t. some controllers.
func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) {
	var releasedCGs []Cgroup

	k.tasks.mu.RLock()
	// We'll have one cgroup per hierarchy per task.
	releasedCGs = make([]Cgroup, 0, len(k.tasks.Root.tids))
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		for cg := range t.cgroups {
			if cg.HierarchyID() == hid {
				cg.Leave(t)
				t.ResetMemCgIDFromCgroup(cg)
				delete(t.cgroups, cg)
				releasedCGs = append(releasedCGs, cg)
				// A task can't be part of multiple cgroups from the same
				// hierarchy, so we can skip checking the rest once we find a
				// match.
				break
			}
		}
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()

	for _, c := range releasedCGs {
		c.decRef()
	}
}

// ReplaceFSContextRoots updates root and cwd to newRoot in the FSContext of
// all tasks whose current root or cwd is oldRoot.
func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualDentry, newRoot vfs.VirtualDentry) {
	k.tasks.mu.RLock()
	oldRootDecRefs := 0
	k.tasks.forEachTaskLocked(func(t *Task) {
		t.mu.Lock()
		defer t.mu.Unlock()
		if fsc := t.fsContext; fsc != nil {
			fsc.mu.Lock()
			defer fsc.mu.Unlock()
			if fsc.root == oldRoot {
				newRoot.IncRef()
				oldRootDecRefs++
				fsc.root = newRoot
			}
			if fsc.cwd == oldRoot {
				newRoot.IncRef()
				oldRootDecRefs++
				fsc.cwd = newRoot
			}
		}
	})
	k.tasks.mu.RUnlock()
	for i := 0; i < oldRootDecRefs; i++ {
		oldRoot.DecRef(ctx)
	}
}

// GetUserCounters returns the userCounters object for the given KUID,
// creating it if it does not already exist.
func (k *Kernel) GetUserCounters(uid auth.KUID) *userCounters {
	k.userCountersMapMu.Lock()
	defer k.userCountersMapMu.Unlock()

	if uc, ok := k.userCountersMap[uid]; ok {
		return uc
	}

	uc := &userCounters{uid: uid}
	k.userCountersMap[uid] = uc
	return uc
}