github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/kernel.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package kernel provides an emulation of the Linux kernel. 16 // 17 // See README.md for a detailed overview. 18 // 19 // Lock order (outermost locks must be taken first): 20 // 21 // Kernel.extMu 22 // ThreadGroup.timerMu 23 // ktime.Timer.mu (for IntervalTimer) and Kernel.cpuClockMu 24 // TaskSet.mu 25 // SignalHandlers.mu 26 // Task.mu 27 // runningTasksMu 28 // 29 // Locking SignalHandlers.mu in multiple SignalHandlers requires locking 30 // TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same 31 // time requires locking all of their signal mutexes first. 32 package kernel 33 34 import ( 35 "errors" 36 "fmt" 37 "path/filepath" 38 "time" 39 40 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 41 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 42 "github.com/nicocha30/gvisor-ligolo/pkg/cleanup" 43 "github.com/nicocha30/gvisor-ligolo/pkg/context" 44 "github.com/nicocha30/gvisor-ligolo/pkg/cpuid" 45 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 46 "github.com/nicocha30/gvisor-ligolo/pkg/eventchannel" 47 "github.com/nicocha30/gvisor-ligolo/pkg/fspath" 48 "github.com/nicocha30/gvisor-ligolo/pkg/log" 49 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch" 50 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/nsfs" 51 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/pipefs" 52 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/sockfs" 53 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/timerfd" 54 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/tmpfs" 55 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/hostcpu" 56 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/inet" 57 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth" 58 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/futex" 59 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/ipc" 60 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/sched" 61 ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time" 62 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/limits" 63 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/loader" 64 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/mm" 65 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc" 66 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform" 67 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netlink/port" 68 sentrytime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/time" 69 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/unimpl" 70 uspb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/unimpl/unimplemented_syscall_go_proto" 71 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/uniqueid" 72 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs" 73 "github.com/nicocha30/gvisor-ligolo/pkg/state" 74 "github.com/nicocha30/gvisor-ligolo/pkg/state/wire" 75 "github.com/nicocha30/gvisor-ligolo/pkg/sync" 76 "github.com/nicocha30/gvisor-ligolo/pkg/tcpip" 77 ) 78 79 // IOUringEnabled is set to true when IO_URING is enabled. Added as a global to 80 // allow easy access everywhere. 81 var IOUringEnabled = false 82 83 // userCounters is a set of user counters. 84 // 85 // +stateify savable 86 type userCounters struct { 87 uid auth.KUID 88 89 rlimitNProc atomicbitops.Uint64 90 } 91 92 // incRLimitNProc increments the rlimitNProc counter. 93 func (uc *userCounters) incRLimitNProc(ctx context.Context) error { 94 lim := limits.FromContext(ctx).Get(limits.ProcessCount) 95 creds := auth.CredentialsFromContext(ctx) 96 nproc := uc.rlimitNProc.Add(1) 97 if nproc > lim.Cur && 98 !creds.HasCapability(linux.CAP_SYS_ADMIN) && 99 !creds.HasCapability(linux.CAP_SYS_RESOURCE) { 100 uc.rlimitNProc.Add(^uint64(0)) 101 return linuxerr.EAGAIN 102 } 103 return nil 104 } 105 106 // decRLimitNProc decrements the rlimitNProc counter. 107 func (uc *userCounters) decRLimitNProc() { 108 uc.rlimitNProc.Add(^uint64(0)) 109 } 110 111 // Kernel represents an emulated Linux kernel. It must be initialized by calling 112 // Init() or LoadFrom(). 113 // 114 // +stateify savable 115 type Kernel struct { 116 // extMu serializes external changes to the Kernel with calls to 117 // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel 118 // remains frozen for the duration of the call; it requires that the Kernel 119 // is paused as a precondition, which ensures that none of the tasks 120 // running within the Kernel can affect its state, but extMu is required to 121 // ensure that concurrent users of the Kernel *outside* the Kernel's 122 // control cannot affect its state by calling e.g. 123 // Kernel.SendExternalSignal.) 124 extMu sync.Mutex `state:"nosave"` 125 126 // started is true if Start has been called. Unless otherwise specified, 127 // all Kernel fields become immutable once started becomes true. 128 started bool `state:"nosave"` 129 130 // All of the following fields are immutable unless otherwise specified. 131 132 // Platform is the platform that is used to execute tasks in the created 133 // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is 134 // embedded anonymously (the same issue applies). 135 platform.Platform `state:"nosave"` 136 137 // mf provides application memory. 138 mf *pgalloc.MemoryFile `state:"nosave"` 139 140 // See InitKernelArgs for the meaning of these fields. 141 featureSet cpuid.FeatureSet 142 timekeeper *Timekeeper 143 tasks *TaskSet 144 rootUserNamespace *auth.UserNamespace 145 rootNetworkNamespace *inet.Namespace 146 applicationCores uint 147 useHostCores bool 148 extraAuxv []arch.AuxEntry 149 vdso *loader.VDSO 150 rootUTSNamespace *UTSNamespace 151 rootIPCNamespace *IPCNamespace 152 rootAbstractSocketNamespace *AbstractSocketNamespace 153 154 // futexes is the "root" futex.Manager, from which all others are forked. 155 // This is necessary to ensure that shared futexes are coherent across all 156 // tasks, including those created by CreateProcess. 157 futexes *futex.Manager 158 159 // globalInit is the thread group whose leader has ID 1 in the root PID 160 // namespace. globalInit is stored separately so that it is accessible even 161 // after all tasks in the thread group have exited, such that ID 1 is no 162 // longer mapped. 163 // 164 // globalInit is mutable until it is assigned by the first successful call 165 // to CreateProcess, and is protected by extMu. 166 globalInit *ThreadGroup 167 168 // syslog is the kernel log. 169 syslog syslog 170 171 runningTasksMu runningTasksMutex `state:"nosave"` 172 173 // runningTasks is the total count of tasks currently in 174 // TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are 175 // not blocked or stopped. 176 // 177 // runningTasks must be accessed atomically. Increments from 0 to 1 are 178 // further protected by runningTasksMu (see incRunningTasks). 179 runningTasks atomicbitops.Int64 180 181 // runningTasksCond is signaled when runningTasks is incremented from 0 to 1. 182 // 183 // Invariant: runningTasksCond.L == &runningTasksMu. 184 runningTasksCond sync.Cond `state:"nosave"` 185 186 // cpuClock is incremented every linux.ClockTick by a goroutine running 187 // kernel.runCPUClockTicker() while runningTasks != 0. 188 // 189 // cpuClock is used to measure task CPU usage, since sampling monotonicClock 190 // twice on every syscall turns out to be unreasonably expensive. This is 191 // similar to how Linux does task CPU accounting on x86 192 // (CONFIG_IRQ_TIME_ACCOUNTING), although Linux also uses scheduler timing 193 // information to improve resolution 194 // (kernel/sched/cputime.c:cputime_adjust()), which we can't do since 195 // "preeemptive" scheduling is managed by the Go runtime, which doesn't 196 // provide this information. 197 // 198 // cpuClock is mutable, and is accessed using atomic memory operations. 199 cpuClock atomicbitops.Uint64 200 201 // cpuClockTickTimer drives increments of cpuClock. 202 cpuClockTickTimer *time.Timer `state:"nosave"` 203 204 // cpuClockMu is used to make increments of cpuClock, and updates of timers 205 // based on cpuClock, atomic. 206 cpuClockMu cpuClockMutex `state:"nosave"` 207 208 // cpuClockTickerRunning is true if the goroutine that increments cpuClock is 209 // running and false if it is blocked in runningTasksCond.Wait() or if it 210 // never started. 211 // 212 // cpuClockTickerRunning is protected by runningTasksMu. 213 cpuClockTickerRunning bool 214 215 // cpuClockTickerWakeCh is sent to to wake the goroutine that increments 216 // cpuClock if it's sleeping between ticks. 217 cpuClockTickerWakeCh chan struct{} `state:"nosave"` 218 219 // cpuClockTickerStopCond is broadcast when cpuClockTickerRunning transitions 220 // from true to false. 221 // 222 // Invariant: cpuClockTickerStopCond.L == &runningTasksMu. 223 cpuClockTickerStopCond sync.Cond `state:"nosave"` 224 225 // uniqueID is used to generate unique identifiers. 226 // 227 // uniqueID is mutable, and is accessed using atomic memory operations. 228 uniqueID atomicbitops.Uint64 229 230 // nextInotifyCookie is a monotonically increasing counter used for 231 // generating unique inotify event cookies. 232 // 233 // nextInotifyCookie is mutable. 234 nextInotifyCookie atomicbitops.Uint32 235 236 // netlinkPorts manages allocation of netlink socket port IDs. 237 netlinkPorts *port.Manager 238 239 // saveStatus is nil if the sandbox has not been saved, errSaved or 240 // errAutoSaved if it has been saved successfully, or the error causing the 241 // sandbox to exit during save. 242 // It is protected by extMu. 243 saveStatus error `state:"nosave"` 244 245 // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. 246 danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` 247 248 // sockets records all network sockets in the system. Protected by extMu. 249 sockets map[*vfs.FileDescription]*SocketRecord 250 251 // nextSocketRecord is the next entry number to use in sockets. Protected 252 // by extMu. 253 nextSocketRecord uint64 254 255 // unimplementedSyscallEmitterOnce is used in the initialization of 256 // unimplementedSyscallEmitter. 257 unimplementedSyscallEmitterOnce sync.Once `state:"nosave"` 258 259 // unimplementedSyscallEmitter is used to emit unimplemented syscall 260 // events. This is initialized lazily on the first unimplemented 261 // syscall. 262 unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` 263 264 // SpecialOpts contains special kernel options. 265 SpecialOpts 266 267 // vfs keeps the filesystem state used across the kernel. 268 vfs vfs.VirtualFilesystem 269 270 // hostMount is the Mount used for file descriptors that were imported 271 // from the host. 272 hostMount *vfs.Mount 273 274 // pipeMount is the Mount used for pipes created by the pipe() and pipe2() 275 // syscalls (as opposed to named pipes created by mknod()). 276 pipeMount *vfs.Mount 277 278 // nsfsMount is the Mount used for namespaces. 279 nsfsMount *vfs.Mount 280 281 // shmMount is the Mount used for anonymous files created by the 282 // memfd_create() syscalls. It is analogous to Linux's shm_mnt. 283 shmMount *vfs.Mount 284 285 // socketMount is the Mount used for sockets created by the socket() and 286 // socketpair() syscalls. There are several cases where a socket dentry will 287 // not be contained in socketMount: 288 // 1. Socket files created by mknod() 289 // 2. Socket fds imported from the host (Kernel.hostMount is used for these) 290 // 3. Socket files created by binding Unix sockets to a file path 291 socketMount *vfs.Mount 292 293 // sysVShmDevID is the device number used by SysV shm segments. In Linux, 294 // SysV shm uses shmem_file_setup() and thus uses shm_mnt's device number. 295 // In gVisor, the shm implementation does not use shmMount, extracting 296 // shmMount's device number is inconvenient, applications accept a 297 // different device number in practice, and using a distinct device number 298 // avoids the possibility of inode number collisions due to the hack 299 // described in shm.Shm.InodeID(). 300 sysVShmDevID uint32 301 302 // If set to true, report address space activation waits as if the task is in 303 // external wait so that the watchdog doesn't report the task stuck. 304 SleepForAddressSpaceActivation bool 305 306 // Exceptions to YAMA ptrace restrictions. Each key-value pair represents a 307 // tracee-tracer relationship. The key is a process (technically, the thread 308 // group leader) that can be traced by any thread that is a descendant of the 309 // value. If the value is nil, then anyone can trace the process represented by 310 // the key. 311 // 312 // ptraceExceptions is protected by the TaskSet mutex. 313 ptraceExceptions map[*Task]*Task 314 315 // YAMAPtraceScope is the current level of YAMA ptrace restrictions. 316 YAMAPtraceScope atomicbitops.Int32 317 318 // cgroupRegistry contains the set of active cgroup controllers on the 319 // system. It is controller by cgroupfs. Nil if cgroupfs is unavailable on 320 // the system. 321 cgroupRegistry *CgroupRegistry 322 323 // userCountersMap maps auth.KUID into a set of user counters. 324 userCountersMap map[auth.KUID]*userCounters 325 userCountersMapMu userCountersMutex `state:"nosave"` 326 } 327 328 // InitKernelArgs holds arguments to Init. 329 type InitKernelArgs struct { 330 // FeatureSet is the emulated CPU feature set. 331 FeatureSet cpuid.FeatureSet 332 333 // Timekeeper manages time for all tasks in the system. 334 Timekeeper *Timekeeper 335 336 // RootUserNamespace is the root user namespace. 337 RootUserNamespace *auth.UserNamespace 338 339 // RootNetworkNamespace is the root network namespace. If nil, no networking 340 // will be available. 341 RootNetworkNamespace *inet.Namespace 342 343 // ApplicationCores is the number of logical CPUs visible to sandboxed 344 // applications. The set of logical CPU IDs is [0, ApplicationCores); thus 345 // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the 346 // most significant bit in cpu_possible_mask + 1. 347 ApplicationCores uint 348 349 // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU 350 // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a 351 // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it 352 // will be overridden. 353 UseHostCores bool 354 355 // ExtraAuxv contains additional auxiliary vector entries that are added to 356 // each process by the ELF loader. 357 ExtraAuxv []arch.AuxEntry 358 359 // Vdso holds the VDSO and its parameter page. 360 Vdso *loader.VDSO 361 362 // RootUTSNamespace is the root UTS namespace. 363 RootUTSNamespace *UTSNamespace 364 365 // RootIPCNamespace is the root IPC namespace. 366 RootIPCNamespace *IPCNamespace 367 368 // RootAbstractSocketNamespace is the root Abstract Socket namespace. 369 RootAbstractSocketNamespace *AbstractSocketNamespace 370 371 // PIDNamespace is the root PID namespace. 372 PIDNamespace *PIDNamespace 373 } 374 375 // Init initialize the Kernel with no tasks. 376 // 377 // Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile 378 // before calling Init. 379 func (k *Kernel) Init(args InitKernelArgs) error { 380 if args.Timekeeper == nil { 381 return fmt.Errorf("args.Timekeeper is nil") 382 } 383 if args.Timekeeper.clocks == nil { 384 return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()") 385 } 386 if args.RootUserNamespace == nil { 387 return fmt.Errorf("args.RootUserNamespace is nil") 388 } 389 if args.ApplicationCores == 0 { 390 return fmt.Errorf("args.ApplicationCores is 0") 391 } 392 393 k.featureSet = args.FeatureSet 394 k.timekeeper = args.Timekeeper 395 k.tasks = newTaskSet(args.PIDNamespace) 396 k.rootUserNamespace = args.RootUserNamespace 397 k.rootUTSNamespace = args.RootUTSNamespace 398 k.rootIPCNamespace = args.RootIPCNamespace 399 k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace 400 k.rootNetworkNamespace = args.RootNetworkNamespace 401 if k.rootNetworkNamespace == nil { 402 k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil, args.RootUserNamespace) 403 } 404 k.runningTasksCond.L = &k.runningTasksMu 405 k.cpuClockTickerWakeCh = make(chan struct{}, 1) 406 k.cpuClockTickerStopCond.L = &k.runningTasksMu 407 k.applicationCores = args.ApplicationCores 408 if args.UseHostCores { 409 k.useHostCores = true 410 maxCPU, err := hostcpu.MaxPossibleCPU() 411 if err != nil { 412 return fmt.Errorf("failed to get maximum CPU number: %v", err) 413 } 414 minAppCores := uint(maxCPU) + 1 415 if k.applicationCores < minAppCores { 416 log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores) 417 k.applicationCores = minAppCores 418 } 419 } 420 k.extraAuxv = args.ExtraAuxv 421 k.vdso = args.Vdso 422 k.futexes = futex.NewManager() 423 k.netlinkPorts = port.New() 424 k.ptraceExceptions = make(map[*Task]*Task) 425 k.YAMAPtraceScope = atomicbitops.FromInt32(linux.YAMA_SCOPE_RELATIONAL) 426 k.userCountersMap = make(map[auth.KUID]*userCounters) 427 428 ctx := k.SupervisorContext() 429 if err := k.vfs.Init(ctx); err != nil { 430 return fmt.Errorf("failed to initialize VFS: %v", err) 431 } 432 433 err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx)) 434 if err != nil { 435 return fmt.Errorf("failed to create mqfs filesystem: %v", err) 436 } 437 438 pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) 439 if err != nil { 440 return fmt.Errorf("failed to create pipefs filesystem: %v", err) 441 } 442 defer pipeFilesystem.DecRef(ctx) 443 pipeMount := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) 444 k.pipeMount = pipeMount 445 446 nsfsFilesystem, err := nsfs.NewFilesystem(&k.vfs) 447 if err != nil { 448 return fmt.Errorf("failed to create nsfs filesystem: %v", err) 449 } 450 defer nsfsFilesystem.DecRef(ctx) 451 nsfsMount := k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{}) 452 k.nsfsMount = nsfsMount 453 k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, nsfsMount, k.rootNetworkNamespace)) 454 455 tmpfsOpts := vfs.GetFilesystemOptions{ 456 InternalData: tmpfs.FilesystemOpts{ 457 // See mm/shmem.c:shmem_init() => vfs_kern_mount(flags=SB_KERNMOUNT). 458 // Note how mm/shmem.c:shmem_fill_super() does not provide a default 459 // value for sbinfo->max_blocks when SB_KERNMOUNT is set. 460 DisableDefaultSizeLimit: true, 461 }, 462 } 463 tmpfsFilesystem, tmpfsRoot, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace), "", tmpfsOpts) 464 if err != nil { 465 return fmt.Errorf("failed to create tmpfs filesystem: %v", err) 466 } 467 defer tmpfsFilesystem.DecRef(ctx) 468 defer tmpfsRoot.DecRef(ctx) 469 k.shmMount = k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) 470 471 socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) 472 if err != nil { 473 return fmt.Errorf("failed to create sockfs filesystem: %v", err) 474 } 475 defer socketFilesystem.DecRef(ctx) 476 k.socketMount = k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) 477 478 sysVShmDevMinor, err := k.vfs.GetAnonBlockDevMinor() 479 if err != nil { 480 return fmt.Errorf("failed to get device number for SysV shm: %v", err) 481 } 482 k.sysVShmDevID = linux.MakeDeviceID(linux.UNNAMED_MAJOR, sysVShmDevMinor) 483 484 k.sockets = make(map[*vfs.FileDescription]*SocketRecord) 485 486 k.cgroupRegistry = newCgroupRegistry() 487 return nil 488 } 489 490 // SaveTo saves the state of k to w. 491 // 492 // Preconditions: The kernel must be paused throughout the call to SaveTo. 493 func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error { 494 saveStart := time.Now() 495 496 // Do not allow other Kernel methods to affect it while it's being saved. 497 k.extMu.Lock() 498 defer k.extMu.Unlock() 499 500 // Stop time. 501 k.pauseTimeLocked(ctx) 502 defer k.resumeTimeLocked(ctx) 503 504 // Evict all evictable MemoryFile allocations. 505 k.mf.StartEvictions() 506 k.mf.WaitForEvictions() 507 508 // Discard unsavable mappings, such as those for host file descriptors. 509 if err := k.invalidateUnsavableMappings(ctx); err != nil { 510 return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) 511 } 512 513 // Prepare filesystems for saving. This must be done after 514 // invalidateUnsavableMappings(), since dropping memory mappings may 515 // affect filesystem state (e.g. page cache reference counts). 516 if err := k.vfs.PrepareSave(ctx); err != nil { 517 return err 518 } 519 520 // Save the CPUID FeatureSet before the rest of the kernel so we can 521 // verify its compatibility on restore before attempting to restore the 522 // entire kernel, which may fail on an incompatible machine. 523 // 524 // N.B. This will also be saved along with the full kernel save below. 525 cpuidStart := time.Now() 526 if _, err := state.Save(ctx, w, &k.featureSet); err != nil { 527 return err 528 } 529 log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) 530 531 // Save the timekeeper's state. 532 533 if rootNS := k.rootNetworkNamespace; rootNS != nil && rootNS.Stack() != nil { 534 // Pause the network stack. 535 netstackPauseStart := time.Now() 536 log.Infof("Pausing root network namespace") 537 k.rootNetworkNamespace.Stack().Pause() 538 defer k.rootNetworkNamespace.Stack().Resume() 539 log.Infof("Pausing root network namespace took [%s].", time.Since(netstackPauseStart)) 540 } 541 542 // Save the kernel state. 543 kernelStart := time.Now() 544 stats, err := state.Save(ctx, w, k) 545 if err != nil { 546 return err 547 } 548 log.Infof("Kernel save stats: %s", stats.String()) 549 log.Infof("Kernel save took [%s].", time.Since(kernelStart)) 550 551 // Save the memory file's state. 552 memoryStart := time.Now() 553 if err := k.mf.SaveTo(ctx, w); err != nil { 554 return err 555 } 556 log.Infof("Memory save took [%s].", time.Since(memoryStart)) 557 558 log.Infof("Overall save took [%s].", time.Since(saveStart)) 559 560 return nil 561 } 562 563 // Preconditions: The kernel must be paused. 564 func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { 565 invalidated := make(map[*mm.MemoryManager]struct{}) 566 k.tasks.mu.RLock() 567 defer k.tasks.mu.RUnlock() 568 for t := range k.tasks.Root.tids { 569 // We can skip locking Task.mu here since the kernel is paused. 570 if memMgr := t.image.MemoryManager; memMgr != nil { 571 if _, ok := invalidated[memMgr]; !ok { 572 if err := memMgr.InvalidateUnsavable(ctx); err != nil { 573 return err 574 } 575 invalidated[memMgr] = struct{}{} 576 } 577 } 578 // I really wish we just had a sync.Map of all MMs... 579 if r, ok := t.runState.(*runSyscallAfterExecStop); ok { 580 if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil { 581 return err 582 } 583 } 584 } 585 return nil 586 } 587 588 // LoadFrom returns a new Kernel loaded from args. 589 func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { 590 loadStart := time.Now() 591 592 k.runningTasksCond.L = &k.runningTasksMu 593 k.cpuClockTickerWakeCh = make(chan struct{}, 1) 594 k.cpuClockTickerStopCond.L = &k.runningTasksMu 595 596 initAppCores := k.applicationCores 597 598 // Load the pre-saved CPUID FeatureSet. 599 // 600 // N.B. This was also saved along with the full kernel below, so we 601 // don't need to explicitly install it in the Kernel. 602 cpuidStart := time.Now() 603 if _, err := state.Load(ctx, r, &k.featureSet); err != nil { 604 return err 605 } 606 log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) 607 608 // Verify that the FeatureSet is usable on this host. We do this before 609 // Kernel load so that the explicit CPUID mismatch error has priority 610 // over floating point state restore errors that may occur on load on 611 // an incompatible machine. 612 if err := k.featureSet.CheckHostCompatible(); err != nil { 613 return err 614 } 615 616 // Load the kernel state. 617 kernelStart := time.Now() 618 stats, err := state.Load(ctx, r, k) 619 if err != nil { 620 return err 621 } 622 log.Infof("Kernel load stats: %s", stats.String()) 623 log.Infof("Kernel load took [%s].", time.Since(kernelStart)) 624 625 // rootNetworkNamespace should be populated after loading the state file. 626 // Restore the root network stack. 627 k.rootNetworkNamespace.RestoreRootStack(net) 628 629 // Load the memory file's state. 630 memoryStart := time.Now() 631 if err := k.mf.LoadFrom(ctx, r); err != nil { 632 return err 633 } 634 log.Infof("Memory load took [%s].", time.Since(memoryStart)) 635 636 log.Infof("Overall load took [%s]", time.Since(loadStart)) 637 638 k.Timekeeper().SetClocks(clocks) 639 640 if timeReady != nil { 641 close(timeReady) 642 } 643 644 if net != nil { 645 net.Resume() 646 } 647 648 if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil { 649 return err 650 } 651 652 tcpip.AsyncLoading.Wait() 653 654 log.Infof("Overall load took [%s] after async work", time.Since(loadStart)) 655 656 // Applications may size per-cpu structures based on k.applicationCores, so 657 // it can't change across save/restore. When we are virtualizing CPU 658 // numbers, this isn't a problem. However, when we are exposing host CPU 659 // assignments, we can't tolerate an increase in the number of host CPUs, 660 // which could result in getcpu(2) returning CPUs that applications expect 661 // not to exist. 662 if k.useHostCores && initAppCores > k.applicationCores { 663 return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) 664 } 665 666 return nil 667 } 668 669 // UniqueID returns a unique identifier. 670 func (k *Kernel) UniqueID() uint64 { 671 id := k.uniqueID.Add(1) 672 if id == 0 { 673 panic("unique identifier generator wrapped around") 674 } 675 return id 676 } 677 678 // CreateProcessArgs holds arguments to kernel.CreateProcess. 679 type CreateProcessArgs struct { 680 // Filename is the filename to load as the init binary. 681 // 682 // If this is provided as "", File will be checked, then the file will be 683 // guessed via Argv[0]. 684 Filename string 685 686 // File is a passed host FD pointing to a file to load as the init binary. 687 // 688 // This is checked if and only if Filename is "". 689 File *vfs.FileDescription 690 691 // Argv is a list of arguments. 692 Argv []string 693 694 // Envv is a list of environment variables. 695 Envv []string 696 697 // WorkingDirectory is the initial working directory. 698 // 699 // This defaults to the root if empty. 700 WorkingDirectory string 701 702 // Credentials is the initial credentials. 703 Credentials *auth.Credentials 704 705 // FDTable is the initial set of file descriptors. If CreateProcess succeeds, 706 // it takes a reference on FDTable. 707 FDTable *FDTable 708 709 // Umask is the initial umask. 710 Umask uint 711 712 // Limits are the initial resource limits. 713 Limits *limits.LimitSet 714 715 // MaxSymlinkTraversals is the maximum number of symlinks to follow 716 // during resolution. 717 MaxSymlinkTraversals uint 718 719 // UTSNamespace is the initial UTS namespace. 720 UTSNamespace *UTSNamespace 721 722 // IPCNamespace is the initial IPC namespace. 723 IPCNamespace *IPCNamespace 724 725 // PIDNamespace is the initial PID Namespace. 726 PIDNamespace *PIDNamespace 727 728 // AbstractSocketNamespace is the initial Abstract Socket namespace. 729 AbstractSocketNamespace *AbstractSocketNamespace 730 731 // MountNamespace optionally contains the mount namespace for this 732 // process. If nil, the init process's mount namespace is used. 733 // 734 // Anyone setting MountNamespace must donate a reference (i.e. 735 // increment it). 736 MountNamespace *vfs.MountNamespace 737 738 // ContainerID is the container that the process belongs to. 739 ContainerID string 740 741 // InitialCgroups are the cgroups the container is initialized to. 742 InitialCgroups map[Cgroup]struct{} 743 } 744 745 // NewContext returns a context.Context that represents the task that will be 746 // created by args.NewContext(k). 747 func (args *CreateProcessArgs) NewContext(k *Kernel) context.Context { 748 return &createProcessContext{ 749 Context: context.Background(), 750 kernel: k, 751 args: args, 752 } 753 } 754 755 // createProcessContext is a context.Context that represents the context 756 // associated with a task that is being created. 757 type createProcessContext struct { 758 context.Context 759 kernel *Kernel 760 args *CreateProcessArgs 761 } 762 763 // Value implements context.Context.Value. 764 func (ctx *createProcessContext) Value(key any) any { 765 switch key { 766 case CtxKernel: 767 return ctx.kernel 768 case CtxPIDNamespace: 769 return ctx.args.PIDNamespace 770 case CtxUTSNamespace: 771 return ctx.args.UTSNamespace 772 case ipc.CtxIPCNamespace: 773 ipcns := ctx.args.IPCNamespace 774 ipcns.IncRef() 775 return ipcns 776 case auth.CtxCredentials: 777 return ctx.args.Credentials 778 case vfs.CtxRoot: 779 if ctx.args.MountNamespace == nil { 780 return nil 781 } 782 root := ctx.args.MountNamespace.Root() 783 root.IncRef() 784 return root 785 case vfs.CtxMountNamespace: 786 if ctx.kernel.globalInit == nil { 787 return nil 788 } 789 mntns := ctx.kernel.GlobalInit().Leader().MountNamespace() 790 mntns.IncRef() 791 return mntns 792 case inet.CtxStack: 793 return ctx.kernel.RootNetworkNamespace().Stack() 794 case ktime.CtxRealtimeClock: 795 return ctx.kernel.RealtimeClock() 796 case limits.CtxLimits: 797 return ctx.args.Limits 798 case pgalloc.CtxMemoryCgroupID: 799 return ctx.getMemoryCgroupID() 800 case pgalloc.CtxMemoryFile: 801 return ctx.kernel.mf 802 case pgalloc.CtxMemoryFileProvider: 803 return ctx.kernel 804 case platform.CtxPlatform: 805 return ctx.kernel 806 case uniqueid.CtxGlobalUniqueID: 807 return ctx.kernel.UniqueID() 808 case uniqueid.CtxGlobalUniqueIDProvider: 809 return ctx.kernel 810 case uniqueid.CtxInotifyCookie: 811 return ctx.kernel.GenerateInotifyCookie() 812 case unimpl.CtxEvents: 813 return ctx.kernel 814 default: 815 return nil 816 } 817 } 818 819 func (ctx *createProcessContext) getMemoryCgroupID() uint32 { 820 for cg := range ctx.args.InitialCgroups { 821 for _, ctl := range cg.Controllers() { 822 if ctl.Type() == CgroupControllerMemory { 823 return cg.ID() 824 } 825 } 826 } 827 return InvalidCgroupID 828 } 829 830 // CreateProcess creates a new task in a new thread group with the given 831 // options. The new task has no parent and is in the root PID namespace. 832 // 833 // If k.Start() has already been called, then the created process must be 834 // started by calling kernel.StartProcess(tg). 835 // 836 // If k.Start() has not yet been called, then the created task will begin 837 // running when k.Start() is called. 838 // 839 // CreateProcess has no analogue in Linux; it is used to create the initial 840 // application task, as well as processes started by the control server. 841 func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) { 842 k.extMu.Lock() 843 defer k.extMu.Unlock() 844 log.Infof("EXEC: %v", args.Argv) 845 846 ctx := args.NewContext(k) 847 mntns := args.MountNamespace 848 if mntns == nil { 849 if k.globalInit == nil { 850 return nil, 0, fmt.Errorf("mount namespace is nil") 851 } 852 // Add a reference to the namespace, which is transferred to the new process. 853 mntns = k.globalInit.Leader().MountNamespace() 854 mntns.IncRef() 855 } 856 // Get the root directory from the MountNamespace. 857 root := mntns.Root() 858 root.IncRef() 859 defer root.DecRef(ctx) 860 861 // Grab the working directory. 862 wd := root // Default. 863 if args.WorkingDirectory != "" { 864 pop := vfs.PathOperation{ 865 Root: root, 866 Start: wd, 867 Path: fspath.Parse(args.WorkingDirectory), 868 FollowFinalSymlink: true, 869 } 870 // NOTE(b/236028361): Do not set CheckSearchable flag to true. 871 // Application is allowed to start with a working directory that it can 872 // not access/search. This is consistent with Docker and VFS1. Runc 873 // explicitly allows for this in 6ce2d63a5db6 ("libct/init_linux: retry 874 // chdir to fix EPERM"). As described in the commit, runc unintentionally 875 // allowed this behavior in a couple of releases and applications started 876 // relying on it. So they decided to allow it for backward compatibility. 877 var err error 878 wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{}) 879 if err != nil { 880 return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) 881 } 882 defer wd.DecRef(ctx) 883 } 884 fsContext := NewFSContext(root, wd, args.Umask) 885 886 tg := k.NewThreadGroup(args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) 887 cu := cleanup.Make(func() { 888 tg.Release(ctx) 889 }) 890 defer cu.Clean() 891 892 // Check which file to start from. 893 switch { 894 case args.Filename != "": 895 // If a filename is given, take that. 896 // Set File to nil so we resolve the path in LoadTaskImage. 897 args.File = nil 898 case args.File != nil: 899 // If File is set, take the File provided directly. 900 args.Filename = args.File.MappedName(ctx) 901 default: 902 // Otherwise look at Argv and see if the first argument is a valid path. 903 if len(args.Argv) == 0 { 904 return nil, 0, fmt.Errorf("no filename or command provided") 905 } 906 if !filepath.IsAbs(args.Argv[0]) { 907 return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) 908 } 909 args.Filename = args.Argv[0] 910 } 911 912 // Create a fresh task context. 913 remainingTraversals := args.MaxSymlinkTraversals 914 loadArgs := loader.LoadArgs{ 915 Root: root, 916 WorkingDir: wd, 917 RemainingTraversals: &remainingTraversals, 918 ResolveFinal: true, 919 Filename: args.Filename, 920 File: args.File, 921 CloseOnExec: false, 922 Argv: args.Argv, 923 Envv: args.Envv, 924 Features: k.featureSet, 925 } 926 927 image, se := k.LoadTaskImage(ctx, loadArgs) 928 if se != nil { 929 return nil, 0, errors.New(se.String()) 930 } 931 932 // Take a reference on the FDTable, which will be transferred to 933 // TaskSet.NewTask(). 934 args.FDTable.IncRef() 935 936 // Create the task. 937 config := &TaskConfig{ 938 Kernel: k, 939 ThreadGroup: tg, 940 TaskImage: image, 941 FSContext: fsContext, 942 FDTable: args.FDTable, 943 Credentials: args.Credentials, 944 NetworkNamespace: k.RootNetworkNamespace(), 945 AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), 946 UTSNamespace: args.UTSNamespace, 947 IPCNamespace: args.IPCNamespace, 948 AbstractSocketNamespace: args.AbstractSocketNamespace, 949 MountNamespace: mntns, 950 ContainerID: args.ContainerID, 951 InitialCgroups: args.InitialCgroups, 952 UserCounters: k.GetUserCounters(args.Credentials.RealKUID), 953 } 954 config.NetworkNamespace.IncRef() 955 t, err := k.tasks.NewTask(ctx, config) 956 if err != nil { 957 return nil, 0, err 958 } 959 t.traceExecEvent(image) // Simulate exec for tracing. 960 961 // Success. 962 cu.Release() 963 tgid := k.tasks.Root.IDOfThreadGroup(tg) 964 if k.globalInit == nil { 965 k.globalInit = tg 966 } 967 return tg, tgid, nil 968 } 969 970 // StartProcess starts running a process that was created with CreateProcess. 971 func (k *Kernel) StartProcess(tg *ThreadGroup) { 972 t := tg.Leader() 973 tid := k.tasks.Root.IDOfTask(t) 974 t.Start(tid) 975 } 976 977 // Start starts execution of all tasks in k. 978 // 979 // Preconditions: Start may be called exactly once. 980 func (k *Kernel) Start() error { 981 k.extMu.Lock() 982 defer k.extMu.Unlock() 983 984 if k.started { 985 return fmt.Errorf("kernel already started") 986 } 987 988 k.started = true 989 k.cpuClockTickTimer = time.NewTimer(linux.ClockTick) 990 k.runningTasksMu.Lock() 991 k.cpuClockTickerRunning = true 992 k.runningTasksMu.Unlock() 993 go k.runCPUClockTicker() 994 // If k was created by LoadKernelFrom, timers were stopped during 995 // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, 996 // this is a no-op. 997 k.resumeTimeLocked(k.SupervisorContext()) 998 k.tasks.mu.RLock() 999 ts := make([]*Task, 0, len(k.tasks.Root.tids)) 1000 for t := range k.tasks.Root.tids { 1001 ts = append(ts, t) 1002 } 1003 k.tasks.mu.RUnlock() 1004 // Start task goroutines. 1005 // NOTE(b/235349091): We don't actually need the TaskSet mutex, we just 1006 // need to make sure we only call t.Start() once for each task. Holding the 1007 // mutex for each task start may cause a nested locking error. 1008 for _, t := range ts { 1009 t.Start(t.ThreadID()) 1010 } 1011 return nil 1012 } 1013 1014 // pauseTimeLocked pauses all Timers and Timekeeper updates. 1015 // 1016 // Preconditions: 1017 // - Any task goroutines running in k must be stopped. 1018 // - k.extMu must be locked. 1019 func (k *Kernel) pauseTimeLocked(ctx context.Context) { 1020 // Since all task goroutines have been stopped by precondition, the CPU clock 1021 // ticker should stop on its own; wait for it to do so, waking it up from 1022 // sleeping betwen ticks if necessary. 1023 k.runningTasksMu.Lock() 1024 for k.cpuClockTickerRunning { 1025 select { 1026 case k.cpuClockTickerWakeCh <- struct{}{}: 1027 default: 1028 } 1029 k.cpuClockTickerStopCond.Wait() 1030 } 1031 k.runningTasksMu.Unlock() 1032 1033 // By precondition, nothing else can be interacting with PIDNamespace.tids 1034 // or FDTable.files, so we can iterate them without synchronization. (We 1035 // can't hold the TaskSet mutex when pausing thread group timers because 1036 // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet 1037 // mutex, while holding the Timer mutex.) 1038 for t := range k.tasks.Root.tids { 1039 if t == t.tg.leader { 1040 t.tg.itimerRealTimer.Pause() 1041 for _, it := range t.tg.timers { 1042 it.PauseTimer() 1043 } 1044 } 1045 // This means we'll iterate FDTables shared by multiple tasks repeatedly, 1046 // but ktime.Timer.Pause is idempotent so this is harmless. 1047 if t.fdTable != nil { 1048 t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) { 1049 if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { 1050 tfd.PauseTimer() 1051 } 1052 }) 1053 } 1054 } 1055 k.timekeeper.PauseUpdates() 1056 } 1057 1058 // resumeTimeLocked resumes all Timers and Timekeeper updates. If 1059 // pauseTimeLocked has not been previously called, resumeTimeLocked has no 1060 // effect. 1061 // 1062 // Preconditions: 1063 // - Any task goroutines running in k must be stopped. 1064 // - k.extMu must be locked. 1065 func (k *Kernel) resumeTimeLocked(ctx context.Context) { 1066 // The CPU clock ticker will automatically resume as task goroutines resume 1067 // execution. 1068 1069 k.timekeeper.ResumeUpdates() 1070 for t := range k.tasks.Root.tids { 1071 if t == t.tg.leader { 1072 t.tg.itimerRealTimer.Resume() 1073 for _, it := range t.tg.timers { 1074 it.ResumeTimer() 1075 } 1076 } 1077 if t.fdTable != nil { 1078 t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) { 1079 if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { 1080 tfd.ResumeTimer() 1081 } 1082 }) 1083 } 1084 } 1085 } 1086 1087 func (k *Kernel) incRunningTasks() { 1088 for { 1089 tasks := k.runningTasks.Load() 1090 if tasks != 0 { 1091 // Standard case. Simply increment. 1092 if !k.runningTasks.CompareAndSwap(tasks, tasks+1) { 1093 continue 1094 } 1095 return 1096 } 1097 1098 // Transition from 0 -> 1. 1099 k.runningTasksMu.Lock() 1100 if k.runningTasks.Load() != 0 { 1101 // Raced with another transition and lost. 1102 k.runningTasks.Add(1) 1103 k.runningTasksMu.Unlock() 1104 return 1105 } 1106 if !k.cpuClockTickerRunning { 1107 select { 1108 case tickTime := <-k.cpuClockTickTimer.C: 1109 // Rearm the timer since we consumed the wakeup. Estimate how much time 1110 // remains on the current tick so that periodic workloads interact with 1111 // the (periodic) CPU clock ticker in the same way that they would 1112 // without the optimization of putting the ticker to sleep. 1113 missedNS := time.Since(tickTime).Nanoseconds() 1114 missedTicks := missedNS / linux.ClockTick.Nanoseconds() 1115 thisTickNS := missedNS - missedTicks*linux.ClockTick.Nanoseconds() 1116 k.cpuClockTickTimer.Reset(time.Duration(linux.ClockTick.Nanoseconds() - thisTickNS)) 1117 // Increment k.cpuClock on the CPU clock ticker goroutine's behalf. 1118 // (Whole missed ticks don't matter, and adding them to k.cpuClock will 1119 // just confuse the watchdog.) At the time the tick occurred, all task 1120 // goroutines were asleep, so there's nothing else to do. This ensures 1121 // that our caller (Task.accountTaskGoroutineLeave()) records an 1122 // updated k.cpuClock in Task.gosched.Timestamp, so that it's correctly 1123 // accounted as having resumed execution in the sentry during this tick 1124 // instead of at the end of the previous one. 1125 k.cpuClock.Add(1) 1126 default: 1127 } 1128 // We are transitioning from idle to active. Set k.cpuClockTickerRunning 1129 // = true here so that if we transition to idle and then active again 1130 // before the CPU clock ticker goroutine has a chance to run, the first 1131 // call to k.incRunningTasks() at the end of that cycle does not try to 1132 // steal k.cpuClockTickTimer.C again, as this would allow workloads that 1133 // rapidly cycle between idle and active to starve the CPU clock ticker 1134 // of chances to observe task goroutines in a running state and account 1135 // their CPU usage. 1136 k.cpuClockTickerRunning = true 1137 k.runningTasksCond.Signal() 1138 } 1139 // This store must happen after the increment of k.cpuClock above to ensure 1140 // that concurrent calls to Task.accountTaskGoroutineLeave() also observe 1141 // the updated k.cpuClock. 1142 k.runningTasks.Store(1) 1143 k.runningTasksMu.Unlock() 1144 return 1145 } 1146 } 1147 1148 func (k *Kernel) decRunningTasks() { 1149 tasks := k.runningTasks.Add(-1) 1150 if tasks < 0 { 1151 panic(fmt.Sprintf("Invalid running count %d", tasks)) 1152 } 1153 1154 // Nothing to do. The next CPU clock tick will disable the timer if 1155 // there is still nothing running. This provides approximately one tick 1156 // of slack in which we can switch back and forth between idle and 1157 // active without an expensive transition. 1158 } 1159 1160 // WaitExited blocks until all tasks in k have exited. 1161 func (k *Kernel) WaitExited() { 1162 k.tasks.liveGoroutines.Wait() 1163 } 1164 1165 // Kill requests that all tasks in k immediately exit as if group exiting with 1166 // status ws. Kill does not wait for tasks to exit. 1167 func (k *Kernel) Kill(ws linux.WaitStatus) { 1168 k.extMu.Lock() 1169 defer k.extMu.Unlock() 1170 k.tasks.Kill(ws) 1171 } 1172 1173 // Pause requests that all tasks in k temporarily stop executing, and blocks 1174 // until all tasks and asynchronous I/O operations in k have stopped. Multiple 1175 // calls to Pause nest and require an equal number of calls to Unpause to 1176 // resume execution. 1177 func (k *Kernel) Pause() { 1178 k.extMu.Lock() 1179 k.tasks.BeginExternalStop() 1180 k.extMu.Unlock() 1181 k.tasks.runningGoroutines.Wait() 1182 k.tasks.aioGoroutines.Wait() 1183 } 1184 1185 // ReceiveTaskStates receives full states for all tasks. 1186 func (k *Kernel) ReceiveTaskStates() { 1187 k.extMu.Lock() 1188 k.tasks.PullFullState() 1189 k.extMu.Unlock() 1190 } 1191 1192 // Unpause ends the effect of a previous call to Pause. If Unpause is called 1193 // without a matching preceding call to Pause, Unpause may panic. 1194 func (k *Kernel) Unpause() { 1195 k.extMu.Lock() 1196 defer k.extMu.Unlock() 1197 k.tasks.EndExternalStop() 1198 } 1199 1200 // SendExternalSignal injects a signal into the kernel. 1201 // 1202 // context is used only for debugging to describe how the signal was received. 1203 // 1204 // Preconditions: Kernel must have an init process. 1205 func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) { 1206 k.extMu.Lock() 1207 defer k.extMu.Unlock() 1208 k.sendExternalSignal(info, context) 1209 } 1210 1211 // SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup. 1212 // This function doesn't skip signals like SendExternalSignal does. 1213 func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error { 1214 k.extMu.Lock() 1215 defer k.extMu.Unlock() 1216 return tg.SendSignal(info) 1217 } 1218 1219 // SendContainerSignal sends the given signal to all processes inside the 1220 // namespace that match the given container ID. 1221 func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error { 1222 k.extMu.Lock() 1223 defer k.extMu.Unlock() 1224 k.tasks.mu.RLock() 1225 defer k.tasks.mu.RUnlock() 1226 1227 var lastErr error 1228 for tg := range k.tasks.Root.tgids { 1229 if tg.leader.ContainerID() == cid { 1230 tg.signalHandlers.mu.Lock() 1231 infoCopy := *info 1232 if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { 1233 lastErr = err 1234 } 1235 tg.signalHandlers.mu.Unlock() 1236 } 1237 } 1238 return lastErr 1239 } 1240 1241 // RebuildTraceContexts rebuilds the trace context for all tasks. 1242 // 1243 // Unfortunately, if these are built while tracing is not enabled, then we will 1244 // not have meaningful trace data. Rebuilding here ensures that we can do so 1245 // after tracing has been enabled. 1246 func (k *Kernel) RebuildTraceContexts() { 1247 // We need to pause all task goroutines because Task.rebuildTraceContext() 1248 // replaces Task.traceContext and Task.traceTask, which are 1249 // task-goroutine-exclusive (i.e. the task goroutine assumes that it can 1250 // access them without synchronization) for performance. 1251 k.Pause() 1252 defer k.Unpause() 1253 1254 k.extMu.Lock() 1255 defer k.extMu.Unlock() 1256 k.tasks.mu.RLock() 1257 defer k.tasks.mu.RUnlock() 1258 1259 for t, tid := range k.tasks.Root.tids { 1260 t.rebuildTraceContext(tid) 1261 } 1262 } 1263 1264 // FeatureSet returns the FeatureSet. 1265 func (k *Kernel) FeatureSet() cpuid.FeatureSet { 1266 return k.featureSet 1267 } 1268 1269 // Timekeeper returns the Timekeeper. 1270 func (k *Kernel) Timekeeper() *Timekeeper { 1271 return k.timekeeper 1272 } 1273 1274 // TaskSet returns the TaskSet. 1275 func (k *Kernel) TaskSet() *TaskSet { 1276 return k.tasks 1277 } 1278 1279 // RootUserNamespace returns the root UserNamespace. 1280 func (k *Kernel) RootUserNamespace() *auth.UserNamespace { 1281 return k.rootUserNamespace 1282 } 1283 1284 // RootUTSNamespace returns the root UTSNamespace. 1285 func (k *Kernel) RootUTSNamespace() *UTSNamespace { 1286 return k.rootUTSNamespace 1287 } 1288 1289 // RootIPCNamespace takes a reference and returns the root IPCNamespace. 1290 func (k *Kernel) RootIPCNamespace() *IPCNamespace { 1291 k.rootIPCNamespace.IncRef() 1292 return k.rootIPCNamespace 1293 } 1294 1295 // RootPIDNamespace returns the root PIDNamespace. 1296 func (k *Kernel) RootPIDNamespace() *PIDNamespace { 1297 return k.tasks.Root 1298 } 1299 1300 // RootAbstractSocketNamespace returns the root AbstractSocketNamespace. 1301 func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { 1302 return k.rootAbstractSocketNamespace 1303 } 1304 1305 // RootNetworkNamespace returns the root network namespace, always non-nil. 1306 func (k *Kernel) RootNetworkNamespace() *inet.Namespace { 1307 return k.rootNetworkNamespace 1308 } 1309 1310 // GlobalInit returns the thread group with ID 1 in the root PID namespace, or 1311 // nil if no such thread group exists. GlobalInit may return a thread group 1312 // containing no tasks if the thread group has already exited. 1313 func (k *Kernel) GlobalInit() *ThreadGroup { 1314 k.extMu.Lock() 1315 defer k.extMu.Unlock() 1316 return k.globalInit 1317 } 1318 1319 // TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace. 1320 func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) { 1321 k.globalInit = tg 1322 } 1323 1324 // ApplicationCores returns the number of CPUs visible to sandboxed 1325 // applications. 1326 func (k *Kernel) ApplicationCores() uint { 1327 return k.applicationCores 1328 } 1329 1330 // RealtimeClock returns the application CLOCK_REALTIME clock. 1331 func (k *Kernel) RealtimeClock() ktime.Clock { 1332 return k.timekeeper.realtimeClock 1333 } 1334 1335 // MonotonicClock returns the application CLOCK_MONOTONIC clock. 1336 func (k *Kernel) MonotonicClock() ktime.Clock { 1337 return k.timekeeper.monotonicClock 1338 } 1339 1340 // CPUClockNow returns the current value of k.cpuClock. 1341 func (k *Kernel) CPUClockNow() uint64 { 1342 return k.cpuClock.Load() 1343 } 1344 1345 // Syslog returns the syslog. 1346 func (k *Kernel) Syslog() *syslog { 1347 return &k.syslog 1348 } 1349 1350 // GenerateInotifyCookie generates a unique inotify event cookie. 1351 // 1352 // Returned values may overlap with previously returned values if the value 1353 // space is exhausted. 0 is not a valid cookie value, all other values 1354 // representable in a uint32 are allowed. 1355 func (k *Kernel) GenerateInotifyCookie() uint32 { 1356 id := k.nextInotifyCookie.Add(1) 1357 // Wrap-around is explicitly allowed for inotify event cookies. 1358 if id == 0 { 1359 id = k.nextInotifyCookie.Add(1) 1360 } 1361 return id 1362 } 1363 1364 // NetlinkPorts returns the netlink port manager. 1365 func (k *Kernel) NetlinkPorts() *port.Manager { 1366 return k.netlinkPorts 1367 } 1368 1369 var ( 1370 errSaved = errors.New("sandbox has been successfully saved") 1371 errAutoSaved = errors.New("sandbox has been successfully auto-saved") 1372 ) 1373 1374 // SaveStatus returns the sandbox save status. If it was saved successfully, 1375 // autosaved indicates whether save was triggered by autosave. If it was not 1376 // saved successfully, err indicates the sandbox error that caused the kernel to 1377 // exit during save. 1378 func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) { 1379 k.extMu.Lock() 1380 defer k.extMu.Unlock() 1381 switch k.saveStatus { 1382 case nil: 1383 return false, false, nil 1384 case errSaved: 1385 return true, false, nil 1386 case errAutoSaved: 1387 return true, true, nil 1388 default: 1389 return false, false, k.saveStatus 1390 } 1391 } 1392 1393 // SetSaveSuccess sets the flag indicating that save completed successfully, if 1394 // no status was already set. 1395 func (k *Kernel) SetSaveSuccess(autosave bool) { 1396 k.extMu.Lock() 1397 defer k.extMu.Unlock() 1398 if k.saveStatus == nil { 1399 if autosave { 1400 k.saveStatus = errAutoSaved 1401 } else { 1402 k.saveStatus = errSaved 1403 } 1404 } 1405 } 1406 1407 // SetSaveError sets the sandbox error that caused the kernel to exit during 1408 // save, if one is not already set. 1409 func (k *Kernel) SetSaveError(err error) { 1410 k.extMu.Lock() 1411 defer k.extMu.Unlock() 1412 if k.saveStatus == nil { 1413 k.saveStatus = err 1414 } 1415 } 1416 1417 // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or 1418 // LoadFrom. 1419 func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { 1420 k.mf = mf 1421 } 1422 1423 // MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. 1424 func (k *Kernel) MemoryFile() *pgalloc.MemoryFile { 1425 return k.mf 1426 } 1427 1428 // SupervisorContext returns a Context with maximum privileges in k. It should 1429 // only be used by goroutines outside the control of the emulated kernel 1430 // defined by e. 1431 // 1432 // Callers are responsible for ensuring that the returned Context is not used 1433 // concurrently with changes to the Kernel. 1434 func (k *Kernel) SupervisorContext() context.Context { 1435 return &supervisorContext{ 1436 Kernel: k, 1437 Logger: log.Log(), 1438 } 1439 } 1440 1441 // SocketRecord represents a socket recorded in Kernel.sockets. 1442 // 1443 // +stateify savable 1444 type SocketRecord struct { 1445 k *Kernel 1446 Sock *vfs.FileDescription 1447 ID uint64 // Socket table entry number. 1448 } 1449 1450 // RecordSocket adds a socket to the system-wide socket table for 1451 // tracking. 1452 // 1453 // Precondition: Caller must hold a reference to sock. 1454 // 1455 // Note that the socket table will not hold a reference on the 1456 // vfs.FileDescription. 1457 func (k *Kernel) RecordSocket(sock *vfs.FileDescription) { 1458 k.extMu.Lock() 1459 if _, ok := k.sockets[sock]; ok { 1460 panic(fmt.Sprintf("Socket %p added twice", sock)) 1461 } 1462 id := k.nextSocketRecord 1463 k.nextSocketRecord++ 1464 s := &SocketRecord{ 1465 k: k, 1466 ID: id, 1467 Sock: sock, 1468 } 1469 k.sockets[sock] = s 1470 k.extMu.Unlock() 1471 } 1472 1473 // DeleteSocket removes a socket from the system-wide socket table. 1474 func (k *Kernel) DeleteSocket(sock *vfs.FileDescription) { 1475 k.extMu.Lock() 1476 delete(k.sockets, sock) 1477 k.extMu.Unlock() 1478 } 1479 1480 // ListSockets returns a snapshot of all sockets. 1481 // 1482 // Callers of ListSockets() should use SocketRecord.Sock.TryIncRef() 1483 // to get a reference on a socket in the table. 1484 func (k *Kernel) ListSockets() []*SocketRecord { 1485 k.extMu.Lock() 1486 var socks []*SocketRecord 1487 for _, s := range k.sockets { 1488 socks = append(socks, s) 1489 } 1490 k.extMu.Unlock() 1491 return socks 1492 } 1493 1494 // supervisorContext is a privileged context. 1495 type supervisorContext struct { 1496 context.NoTask 1497 log.Logger 1498 *Kernel 1499 } 1500 1501 // Deadline implements context.Context.Deadline. 1502 func (*Kernel) Deadline() (time.Time, bool) { 1503 return time.Time{}, false 1504 } 1505 1506 // Done implements context.Context.Done. 1507 func (*Kernel) Done() <-chan struct{} { 1508 return nil 1509 } 1510 1511 // Err implements context.Context.Err. 1512 func (*Kernel) Err() error { 1513 return nil 1514 } 1515 1516 // Value implements context.Context. 1517 func (ctx *supervisorContext) Value(key any) any { 1518 switch key { 1519 case CtxCanTrace: 1520 // The supervisor context can trace anything. (None of 1521 // supervisorContext's users are expected to invoke ptrace, but ptrace 1522 // permissions are required for certain file accesses.) 1523 return func(*Task, bool) bool { return true } 1524 case CtxKernel: 1525 return ctx.Kernel 1526 case CtxPIDNamespace: 1527 return ctx.Kernel.tasks.Root 1528 case CtxUTSNamespace: 1529 return ctx.Kernel.rootUTSNamespace 1530 case ipc.CtxIPCNamespace: 1531 ipcns := ctx.Kernel.rootIPCNamespace 1532 ipcns.IncRef() 1533 return ipcns 1534 case auth.CtxCredentials: 1535 // The supervisor context is global root. 1536 return auth.NewRootCredentials(ctx.Kernel.rootUserNamespace) 1537 case vfs.CtxRoot: 1538 if ctx.Kernel.globalInit == nil { 1539 return vfs.VirtualDentry{} 1540 } 1541 root := ctx.Kernel.GlobalInit().Leader().MountNamespace().Root() 1542 root.IncRef() 1543 return root 1544 case vfs.CtxMountNamespace: 1545 if ctx.Kernel.globalInit == nil { 1546 return nil 1547 } 1548 mntns := ctx.Kernel.GlobalInit().Leader().MountNamespace() 1549 mntns.IncRef() 1550 return mntns 1551 case inet.CtxStack: 1552 return ctx.Kernel.RootNetworkNamespace().Stack() 1553 case ktime.CtxRealtimeClock: 1554 return ctx.Kernel.RealtimeClock() 1555 case limits.CtxLimits: 1556 // No limits apply. 1557 return limits.NewLimitSet() 1558 case pgalloc.CtxMemoryFile: 1559 return ctx.Kernel.mf 1560 case pgalloc.CtxMemoryFileProvider: 1561 return ctx.Kernel 1562 case platform.CtxPlatform: 1563 return ctx.Kernel 1564 case uniqueid.CtxGlobalUniqueID: 1565 return ctx.Kernel.UniqueID() 1566 case uniqueid.CtxGlobalUniqueIDProvider: 1567 return ctx.Kernel 1568 case uniqueid.CtxInotifyCookie: 1569 return ctx.Kernel.GenerateInotifyCookie() 1570 case unimpl.CtxEvents: 1571 return ctx.Kernel 1572 case cpuid.CtxFeatureSet: 1573 return ctx.Kernel.featureSet 1574 default: 1575 return nil 1576 } 1577 } 1578 1579 // Rate limits for the number of unimplemented syscall events. 1580 const ( 1581 unimplementedSyscallsMaxRate = 100 // events per second 1582 unimplementedSyscallBurst = 1000 // events 1583 ) 1584 1585 // EmitUnimplementedEvent emits an UnimplementedSyscall event via the event 1586 // channel. 1587 func (k *Kernel) EmitUnimplementedEvent(ctx context.Context, sysno uintptr) { 1588 k.unimplementedSyscallEmitterOnce.Do(func() { 1589 k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst) 1590 }) 1591 1592 t := TaskFromContext(ctx) 1593 IncrementUnimplementedSyscallCounter(sysno) 1594 _, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ 1595 Tid: int32(t.ThreadID()), 1596 Registers: t.Arch().StateData().Proto(), 1597 }) 1598 } 1599 1600 // VFS returns the virtual filesystem for the kernel. 1601 func (k *Kernel) VFS() *vfs.VirtualFilesystem { 1602 return &k.vfs 1603 } 1604 1605 // SetHostMount sets the hostfs mount. 1606 func (k *Kernel) SetHostMount(mnt *vfs.Mount) { 1607 if k.hostMount != nil { 1608 panic("Kernel.hostMount cannot be set more than once") 1609 } 1610 k.hostMount = mnt 1611 } 1612 1613 // HostMount returns the hostfs mount. 1614 func (k *Kernel) HostMount() *vfs.Mount { 1615 return k.hostMount 1616 } 1617 1618 // PipeMount returns the pipefs mount. 1619 func (k *Kernel) PipeMount() *vfs.Mount { 1620 return k.pipeMount 1621 } 1622 1623 // NsfsMount returns the nsfs mount. 1624 func (k *Kernel) NsfsMount() *vfs.Mount { 1625 return k.nsfsMount 1626 } 1627 1628 // ShmMount returns the tmpfs mount. 1629 func (k *Kernel) ShmMount() *vfs.Mount { 1630 return k.shmMount 1631 } 1632 1633 // SocketMount returns the sockfs mount. 1634 func (k *Kernel) SocketMount() *vfs.Mount { 1635 return k.socketMount 1636 } 1637 1638 // CgroupRegistry returns the cgroup registry. 1639 func (k *Kernel) CgroupRegistry() *CgroupRegistry { 1640 return k.cgroupRegistry 1641 } 1642 1643 // Release releases resources owned by k. 1644 // 1645 // Precondition: This should only be called after the kernel is fully 1646 // initialized, e.g. after k.Start() has been called. 1647 func (k *Kernel) Release() { 1648 ctx := k.SupervisorContext() 1649 k.hostMount.DecRef(ctx) 1650 k.pipeMount.DecRef(ctx) 1651 k.nsfsMount.DecRef(ctx) 1652 k.shmMount.DecRef(ctx) 1653 k.socketMount.DecRef(ctx) 1654 k.vfs.Release(ctx) 1655 k.timekeeper.Destroy() 1656 k.vdso.Release(ctx) 1657 k.RootNetworkNamespace().DecRef(ctx) 1658 } 1659 1660 // PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup 1661 // hierarchy. 1662 // 1663 // Precondition: root must be a new cgroup with no tasks. This implies the 1664 // controllers for root are also new and currently manage no task, which in turn 1665 // implies the new cgroup can be populated without migrating tasks between 1666 // cgroups. 1667 func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) { 1668 k.tasks.mu.RLock() 1669 k.tasks.forEachTaskLocked(func(t *Task) { 1670 if t.exitState != TaskExitNone { 1671 return 1672 } 1673 t.mu.Lock() 1674 // A task can be in the cgroup if it has been created after the 1675 // cgroup hierarchy was registered. 1676 t.enterCgroupIfNotYetLocked(root) 1677 t.mu.Unlock() 1678 }) 1679 k.tasks.mu.RUnlock() 1680 } 1681 1682 // ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the 1683 // hierarchy with the provided id. This is intended for use during hierarchy 1684 // teardown, as otherwise the tasks would be orphaned w.r.t to some controllers. 1685 func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) { 1686 var releasedCGs []Cgroup 1687 1688 k.tasks.mu.RLock() 1689 // We'll have one cgroup per hierarchy per task. 1690 releasedCGs = make([]Cgroup, 0, len(k.tasks.Root.tids)) 1691 k.tasks.forEachTaskLocked(func(t *Task) { 1692 if t.exitState != TaskExitNone { 1693 return 1694 } 1695 t.mu.Lock() 1696 for cg := range t.cgroups { 1697 if cg.HierarchyID() == hid { 1698 cg.Leave(t) 1699 t.resetMemCgID(cg) 1700 delete(t.cgroups, cg) 1701 releasedCGs = append(releasedCGs, cg) 1702 // A task can't be part of multiple cgroups from the same 1703 // hierarchy, so we can skip checking the rest once we find a 1704 // match. 1705 break 1706 } 1707 } 1708 t.mu.Unlock() 1709 }) 1710 k.tasks.mu.RUnlock() 1711 1712 for _, c := range releasedCGs { 1713 c.decRef() 1714 } 1715 } 1716 1717 func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualDentry, newRoot vfs.VirtualDentry) { 1718 k.tasks.mu.RLock() 1719 oldRootDecRefs := 0 1720 k.tasks.forEachTaskLocked(func(t *Task) { 1721 t.mu.Lock() 1722 defer t.mu.Unlock() 1723 if fsc := t.fsContext; fsc != nil { 1724 fsc.mu.Lock() 1725 defer fsc.mu.Unlock() 1726 if fsc.root == oldRoot { 1727 newRoot.IncRef() 1728 oldRootDecRefs++ 1729 fsc.root = newRoot 1730 } 1731 if fsc.cwd == oldRoot { 1732 newRoot.IncRef() 1733 oldRootDecRefs++ 1734 fsc.cwd = newRoot 1735 } 1736 } 1737 }) 1738 k.tasks.mu.RUnlock() 1739 for i := 0; i < oldRootDecRefs; i++ { 1740 oldRoot.DecRef(ctx) 1741 } 1742 } 1743 1744 func (k *Kernel) GetUserCounters(uid auth.KUID) *userCounters { 1745 k.userCountersMapMu.Lock() 1746 defer k.userCountersMapMu.Unlock() 1747 1748 if uc, ok := k.userCountersMap[uid]; ok { 1749 return uc 1750 } 1751 1752 uc := &userCounters{} 1753 k.userCountersMap[uid] = uc 1754 return uc 1755 }