gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/kernel.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernel provides an emulation of the Linux kernel.
//
// See README.md for a detailed overview.
//
// Lock order (outermost locks must be taken first):
//
//	Kernel.extMu
//		ThreadGroup.timerMu
//		  ktime.Timer.mu (for IntervalTimer) and Kernel.cpuClockMu
//		    TaskSet.mu
//		      SignalHandlers.mu
//		        Task.mu
//	runningTasksMu
//
// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
// time requires locking all of their signal mutexes first.
package kernel

import (
	"errors"
	"fmt"
	"io"
	"path/filepath"
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/cleanup"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/cpuid"
	"gvisor.dev/gvisor/pkg/devutil"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/eventchannel"
	"gvisor.dev/gvisor/pkg/fd"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/refs"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/loader"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
	sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
	"gvisor.dev/gvisor/pkg/sentry/unimpl"
	uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/state"
	"gvisor.dev/gvisor/pkg/state/statefile"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
)

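// As an illustration of the lock order documented above, code that must lock
// Task.mu in multiple tasks first takes TaskSet.mu exclusively, then each
// SignalHandlers.mu, then each Task.mu (a sketch with hypothetical variables,
// not an existing helper):
//
//	ts.mu.Lock()  // TaskSet.mu, exclusively
//	sh.mu.Lock()  // SignalHandlers.mu
//	t1.mu.Lock()  // Task.mu
//	t2.mu.Lock()
//	// ... inspect or mutate both tasks, then unlock in reverse order ...
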
// IOUringEnabled is set to true when IO_URING is enabled. Added as a global to
// allow easy access everywhere.
var IOUringEnabled = false

// UserCounters is a set of user counters.
//
// +stateify savable
type UserCounters struct {
	uid auth.KUID

	rlimitNProc atomicbitops.Uint64
}

// incRLimitNProc increments the rlimitNProc counter.
func (uc *UserCounters) incRLimitNProc(ctx context.Context) error {
	lim := limits.FromContext(ctx).Get(limits.ProcessCount)
	creds := auth.CredentialsFromContext(ctx)
	nproc := uc.rlimitNProc.Add(1)
	if nproc > lim.Cur &&
		!creds.HasCapability(linux.CAP_SYS_ADMIN) &&
		!creds.HasCapability(linux.CAP_SYS_RESOURCE) {
		// Adding ^uint64(0) wraps around, i.e. atomically decrements by 1.
		uc.rlimitNProc.Add(^uint64(0))
		return linuxerr.EAGAIN
	}
	return nil
}

// decRLimitNProc decrements the rlimitNProc counter.
func (uc *UserCounters) decRLimitNProc() {
	uc.rlimitNProc.Add(^uint64(0))
}

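// incRLimitNProc and decRLimitNProc are used in matched pairs on the task
// creation and exit paths. A minimal sketch of a caller:
//
//	if err := uc.incRLimitNProc(ctx); err != nil {
//		return err // EAGAIN once RLIMIT_NPROC is exceeded
//	}
//	// ... and once the task exits (or its creation fails):
//	uc.decRLimitNProc()
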
// CgroupMount contains the cgroup mount. These mounts are created for the root
// container by default and are stored in the kernel.
//
// +stateify savable
type CgroupMount struct {
	Fs    *vfs.Filesystem
	Root  *vfs.Dentry
	Mount *vfs.Mount
}

// Kernel represents an emulated Linux kernel. It must be initialized by
// calling Init() or LoadFrom().
//
// +stateify savable
type Kernel struct {
	// extMu serializes external changes to the Kernel with calls to
	// Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
	// remains frozen for the duration of the call; it requires that the Kernel
	// is paused as a precondition, which ensures that none of the tasks
	// running within the Kernel can affect its state, but extMu is required to
	// ensure that concurrent users of the Kernel *outside* the Kernel's
	// control cannot affect its state by calling e.g.
	// Kernel.SendExternalSignal.)
	extMu sync.Mutex `state:"nosave"`

	// started is true if Start has been called. Unless otherwise specified,
	// all Kernel fields become immutable once started becomes true.
	started bool `state:"nosave"`

	// All of the following fields are immutable unless otherwise specified.

	// Platform is the platform that is used to execute tasks in the created
	// Kernel.
	platform.Platform `state:"nosave"`

	// mf provides application memory.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// See InitKernelArgs for the meaning of these fields.
	featureSet           cpuid.FeatureSet
	timekeeper           *Timekeeper
	tasks                *TaskSet
	rootUserNamespace    *auth.UserNamespace
	rootNetworkNamespace *inet.Namespace
	applicationCores     uint
	useHostCores         bool
	extraAuxv            []arch.AuxEntry
	vdso                 *loader.VDSO
	rootUTSNamespace     *UTSNamespace
	rootIPCNamespace     *IPCNamespace

	// futexes is the "root" futex.Manager, from which all others are forked.
	// This is necessary to ensure that shared futexes are coherent across all
	// tasks, including those created by CreateProcess.
	futexes *futex.Manager

	// globalInit is the thread group whose leader has ID 1 in the root PID
	// namespace. globalInit is stored separately so that it is accessible even
	// after all tasks in the thread group have exited, such that ID 1 is no
	// longer mapped.
	//
	// globalInit is mutable until it is assigned by the first successful call
	// to CreateProcess, and is protected by extMu.
	globalInit *ThreadGroup

	// syslog is the kernel log.
	syslog syslog

	runningTasksMu runningTasksMutex `state:"nosave"`

	// runningTasks is the total count of tasks currently in
	// TaskGoroutineRunningSys or TaskGoroutineRunningApp, i.e. tasks that are
	// not blocked or stopped.
	//
	// runningTasks must be accessed atomically. Increments from 0 to 1 are
	// further protected by runningTasksMu (see incRunningTasks).
	runningTasks atomicbitops.Int64

	// runningTasksCond is signaled when runningTasks is incremented from 0 to 1.
	//
	// Invariant: runningTasksCond.L == &runningTasksMu.
	runningTasksCond sync.Cond `state:"nosave"`

	// cpuClock is incremented every linux.ClockTick by a goroutine running
	// kernel.runCPUClockTicker() while runningTasks != 0.
	//
	// cpuClock is used to measure task CPU usage, since sampling monotonicClock
	// twice on every syscall turns out to be unreasonably expensive. This is
	// similar to how Linux does task CPU accounting on x86
	// (CONFIG_IRQ_TIME_ACCOUNTING), although Linux also uses scheduler timing
	// information to improve resolution
	// (kernel/sched/cputime.c:cputime_adjust()), which we can't do since
	// "preemptive" scheduling is managed by the Go runtime, which doesn't
	// provide this information.
	//
	// cpuClock is mutable, and is accessed using atomic memory operations.
	cpuClock atomicbitops.Uint64

	// cpuClockTickTimer drives increments of cpuClock.
	cpuClockTickTimer *time.Timer `state:"nosave"`

	// cpuClockMu is used to make increments of cpuClock, and updates of timers
	// based on cpuClock, atomic.
	cpuClockMu cpuClockMutex `state:"nosave"`

	// cpuClockTickerRunning is true if the goroutine that increments cpuClock is
	// running and false if it is blocked in runningTasksCond.Wait() or if it
	// never started.
	//
	// cpuClockTickerRunning is protected by runningTasksMu.
	cpuClockTickerRunning bool

	// cpuClockTickerWakeCh is written to in order to wake the goroutine that
	// increments cpuClock if it's sleeping between ticks.
	cpuClockTickerWakeCh chan struct{} `state:"nosave"`

	// cpuClockTickerStopCond is broadcast when cpuClockTickerRunning transitions
	// from true to false.
	//
	// Invariant: cpuClockTickerStopCond.L == &runningTasksMu.
	cpuClockTickerStopCond sync.Cond `state:"nosave"`

	// uniqueID is used to generate unique identifiers.
	//
	// uniqueID is mutable, and is accessed using atomic memory operations.
	uniqueID atomicbitops.Uint64

	// nextInotifyCookie is a monotonically increasing counter used for
	// generating unique inotify event cookies.
	//
	// nextInotifyCookie is mutable.
	nextInotifyCookie atomicbitops.Uint32

	// netlinkPorts manages allocation of netlink socket port IDs.
	netlinkPorts *port.Manager

	// saveStatus is nil if the sandbox has not been saved, errSaved or
	// errAutoSaved if it has been saved successfully, or the error causing the
	// sandbox to exit during save.
	// It is protected by extMu.
	saveStatus error `state:"nosave"`

	// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
	danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`

	// sockets records all network sockets in the system. Protected by extMu.
	sockets map[*vfs.FileDescription]*SocketRecord

	// nextSocketRecord is the next entry number to use in sockets. Protected
	// by extMu.
	nextSocketRecord uint64

	// unimplementedSyscallEmitterOnce is used in the initialization of
	// unimplementedSyscallEmitter.
	unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`

	// unimplementedSyscallEmitter is used to emit unimplemented syscall
	// events. This is initialized lazily on the first unimplemented
	// syscall.
	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`

	// SpecialOpts contains special kernel options.
	SpecialOpts

	// vfs keeps the filesystem state used across the kernel.
	vfs vfs.VirtualFilesystem

	// hostMount is the Mount used for file descriptors that were imported
	// from the host.
	hostMount *vfs.Mount

	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
	// syscalls (as opposed to named pipes created by mknod()).
	pipeMount *vfs.Mount

	// nsfsMount is the Mount used for namespaces.
	nsfsMount *vfs.Mount

	// shmMount is the Mount used for anonymous files created by the
	// memfd_create() syscall. It is analogous to Linux's shm_mnt.
	shmMount *vfs.Mount

	// socketMount is the Mount used for sockets created by the socket() and
	// socketpair() syscalls. There are several cases where a socket dentry will
	// not be contained in socketMount:
	//   1. Socket files created by mknod()
	//   2. Socket fds imported from the host (Kernel.hostMount is used for these)
	//   3. Socket files created by binding Unix sockets to a file path
	socketMount *vfs.Mount

	// sysVShmDevID is the device number used by SysV shm segments. In Linux,
	// SysV shm uses shmem_file_setup() and thus uses shm_mnt's device number.
	// In gVisor, the shm implementation does not use shmMount, extracting
	// shmMount's device number is inconvenient, applications accept a
	// different device number in practice, and using a distinct device number
	// avoids the possibility of inode number collisions due to the hack
	// described in shm.Shm.InodeID().
	sysVShmDevID uint32

	// If SleepForAddressSpaceActivation is true, address space activation
	// waits are reported as if the task were in an external wait, so that the
	// watchdog doesn't report the task as stuck.
	SleepForAddressSpaceActivation bool

	// ptraceExceptions contains exceptions to YAMA ptrace restrictions. Each
	// key-value pair represents a tracee-tracer relationship. The key is a
	// process (technically, the thread group leader) that can be traced by
	// any thread that is a descendant of the value. If the value is nil, then
	// anyone can trace the process represented by the key.
	//
	// ptraceExceptions is protected by the TaskSet mutex.
	ptraceExceptions map[*Task]*Task

	// YAMAPtraceScope is the current level of YAMA ptrace restrictions.
	YAMAPtraceScope atomicbitops.Int32

	// cgroupRegistry contains the set of active cgroup controllers on the
	// system. It is controlled by cgroupfs. Nil if cgroupfs is unavailable on
	// the system.
	cgroupRegistry *CgroupRegistry

	// cgroupMountsMap maps cgroup controller names to the cgroup mounts
	// created for the root container. These mounts are then bind mounted
	// for other application containers by creating their own container
	// directories.
	cgroupMountsMap   map[string]*CgroupMount
	cgroupMountsMapMu cgroupMountsMutex `state:"nosave"`

	// userCountersMap maps auth.KUID to a set of user counters.
	userCountersMap   map[auth.KUID]*UserCounters
	userCountersMapMu userCountersMutex `state:"nosave"`

	// MaxFDLimit specifies the maximum file descriptor number that can be
	// used by processes.
	MaxFDLimit atomicbitops.Int32

	// devGofers maps container names to their device gofer clients.
	devGofers   map[string]*devutil.GoferClient `state:"nosave"`
	devGofersMu sync.Mutex                      `state:"nosave"`

	// containerNames stores the name of each container, keyed by container
	// ID. Names are preserved across save/restore sessions, while IDs can
	// change.
	//
	// Mapping: cid -> name.
	// It's protected by extMu.
	containerNames map[string]string
}

// InitKernelArgs holds arguments to Init.
type InitKernelArgs struct {
	// FeatureSet is the emulated CPU feature set.
	FeatureSet cpuid.FeatureSet

	// Timekeeper manages time for all tasks in the system.
	Timekeeper *Timekeeper

	// RootUserNamespace is the root user namespace.
	RootUserNamespace *auth.UserNamespace

	// RootNetworkNamespace is the root network namespace. If nil, no networking
	// will be available.
	RootNetworkNamespace *inet.Namespace

	// ApplicationCores is the number of logical CPUs visible to sandboxed
	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
	// ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
	// most significant bit in cpu_possible_mask + 1.
	ApplicationCores uint

	// If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
	// instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
	// no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
	// will be overridden.
	UseHostCores bool

	// ExtraAuxv contains additional auxiliary vector entries that are added to
	// each process by the ELF loader.
	ExtraAuxv []arch.AuxEntry

	// Vdso holds the VDSO and its parameter page.
	Vdso *loader.VDSO

	// RootUTSNamespace is the root UTS namespace.
	RootUTSNamespace *UTSNamespace

	// RootIPCNamespace is the root IPC namespace.
	RootIPCNamespace *IPCNamespace

	// PIDNamespace is the root PID namespace.
	PIDNamespace *PIDNamespace

	// MaxFDLimit specifies the maximum file descriptor number that can be
	// used by processes. If it is zero, the limit will be set to
	// unlimited.
	MaxFDLimit int32
}

// Init initializes the Kernel with no tasks.
//
// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
// before calling Init.
func (k *Kernel) Init(args InitKernelArgs) error {
	if args.Timekeeper == nil {
		return fmt.Errorf("args.Timekeeper is nil")
	}
	if args.Timekeeper.clocks == nil {
		return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
	}
	if args.RootUserNamespace == nil {
		return fmt.Errorf("args.RootUserNamespace is nil")
	}
	if args.ApplicationCores == 0 {
		return fmt.Errorf("args.ApplicationCores is 0")
	}

	k.featureSet = args.FeatureSet
	k.timekeeper = args.Timekeeper
	k.tasks = newTaskSet(args.PIDNamespace)
	k.rootUserNamespace = args.RootUserNamespace
	k.rootUTSNamespace = args.RootUTSNamespace
	k.rootIPCNamespace = args.RootIPCNamespace
	k.rootNetworkNamespace = args.RootNetworkNamespace
	if k.rootNetworkNamespace == nil {
		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil, args.RootUserNamespace)
	}
	k.runningTasksCond.L = &k.runningTasksMu
	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
	k.cpuClockTickerStopCond.L = &k.runningTasksMu
	k.applicationCores = args.ApplicationCores
	if args.UseHostCores {
		k.useHostCores = true
		maxCPU, err := hostcpu.MaxPossibleCPU()
		if err != nil {
			return fmt.Errorf("failed to get maximum CPU number: %v", err)
		}
		minAppCores := uint(maxCPU) + 1
		if k.applicationCores < minAppCores {
			log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
			k.applicationCores = minAppCores
		}
	}
	k.extraAuxv = args.ExtraAuxv
	k.vdso = args.Vdso
	k.futexes = futex.NewManager()
	k.netlinkPorts = port.New()
	k.ptraceExceptions = make(map[*Task]*Task)
	k.YAMAPtraceScope = atomicbitops.FromInt32(linux.YAMA_SCOPE_RELATIONAL)
	k.userCountersMap = make(map[auth.KUID]*UserCounters)
	if args.MaxFDLimit == 0 {
		args.MaxFDLimit = MaxFdLimit
	}
	k.MaxFDLimit.Store(args.MaxFDLimit)
	k.containerNames = make(map[string]string)

	ctx := k.SupervisorContext()
	if err := k.vfs.Init(ctx); err != nil {
		return fmt.Errorf("failed to initialize VFS: %v", err)
	}

	err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx))
	if err != nil {
		return fmt.Errorf("failed to create mqfs filesystem: %v", err)
	}

	pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create pipefs filesystem: %v", err)
	}
	defer pipeFilesystem.DecRef(ctx)
	pipeMount := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
	k.pipeMount = pipeMount

	nsfsFilesystem, err := nsfs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create nsfs filesystem: %v", err)
	}
	defer nsfsFilesystem.DecRef(ctx)
	k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
	k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace))
	k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
	k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))

	tmpfsOpts := vfs.GetFilesystemOptions{
		InternalData: tmpfs.FilesystemOpts{
			// See mm/shmem.c:shmem_init() => vfs_kern_mount(flags=SB_KERNMOUNT).
			// Note how mm/shmem.c:shmem_fill_super() does not provide a default
			// value for sbinfo->max_blocks when SB_KERNMOUNT is set.
			DisableDefaultSizeLimit: true,
		},
		InternalMount: true,
	}
	tmpfsFilesystem, tmpfsRoot, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace), "", tmpfsOpts)
	if err != nil {
		return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
	}
	defer tmpfsFilesystem.DecRef(ctx)
	defer tmpfsRoot.DecRef(ctx)
	k.shmMount = k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})

	socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create sockfs filesystem: %v", err)
	}
	defer socketFilesystem.DecRef(ctx)
	k.socketMount = k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})

	sysVShmDevMinor, err := k.vfs.GetAnonBlockDevMinor()
	if err != nil {
		return fmt.Errorf("failed to get device number for SysV shm: %v", err)
	}
	k.sysVShmDevID = linux.MakeDeviceID(linux.UNNAMED_MAJOR, sysVShmDevMinor)

	k.sockets = make(map[*vfs.FileDescription]*SocketRecord)

	k.cgroupRegistry = newCgroupRegistry()
	return nil
}

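// A minimal bring-up sequence, following the documented preconditions (a
// sketch; the real setup lives in runsc, and p, mf, tk, vdso, and the
// namespace values below are assumed to have been created by the caller):
//
//	k := &Kernel{}
//	k.Platform = p      // must be set manually before Init
//	k.SetMemoryFile(mf) // must be called before Init
//	if err := k.Init(InitKernelArgs{
//		FeatureSet:        cpuid.HostFeatureSet(),
//		Timekeeper:        tk, // Timekeeper.SetClocks must already have been called
//		RootUserNamespace: rootUserNS,
//		ApplicationCores:  8,
//		Vdso:              vdso,
//		RootUTSNamespace:  rootUTSNS,
//		RootIPCNamespace:  rootIPCNS,
//		PIDNamespace:      rootPIDNS,
//	}); err != nil {
//		return err
//	}
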
// +stateify savable
type privateMemoryFileMetadata struct {
	owners []string
}

func savePrivateMFs(ctx context.Context, w io.Writer, pw io.Writer, mfsToSave map[string]*pgalloc.MemoryFile, mfOpts pgalloc.SaveOpts) error {
	// mfOpts.ExcludeCommittedZeroPages is expected to reflect application
	// memory usage behavior, but not necessarily usage of private MemoryFiles.
	mfOpts.ExcludeCommittedZeroPages = false

	var meta privateMemoryFileMetadata
	// Generate the order in which private memory files are saved.
	for fsID := range mfsToSave {
		meta.owners = append(meta.owners, fsID)
	}
	// Save the metadata.
	if _, err := state.Save(ctx, w, &meta); err != nil {
		return err
	}
	// Then save the private memory files in that order.
	for _, fsID := range meta.owners {
		if err := mfsToSave[fsID].SaveTo(ctx, w, pw, mfOpts); err != nil {
			return err
		}
	}
	return nil
}

func loadPrivateMFs(ctx context.Context, r io.Reader, pr *statefile.AsyncReader) error {
	// Load the metadata.
	var meta privateMemoryFileMetadata
	if _, err := state.Load(ctx, r, &meta); err != nil {
		return err
	}
	mfmap := pgalloc.MemoryFileMapFromContext(ctx)
	// Ensure that it is consistent with CtxFilesystemMemoryFileMap.
	if len(mfmap) != len(meta.owners) {
		return fmt.Errorf("inconsistent private memory files on restore: savedMFOwners = %v, CtxFilesystemMemoryFileMap = %v", meta.owners, mfmap)
	}
	// Load all private memory files.
	for _, fsID := range meta.owners {
		mf, ok := mfmap[fsID]
		if !ok {
			return fmt.Errorf("saved memory file for %q was not configured on restore", fsID)
		}
		if err := mf.LoadFrom(ctx, r, pr); err != nil {
			return err
		}
	}
	return nil
}

// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
func (k *Kernel) SaveTo(ctx context.Context, w io.Writer, pagesMetadata, pagesFile *fd.FD, mfOpts pgalloc.SaveOpts) error {
	saveStart := time.Now()

	// Do not allow other Kernel methods to affect it while it's being saved.
	k.extMu.Lock()
	defer k.extMu.Unlock()

	// Stop time.
	k.pauseTimeLocked(ctx)
	defer k.resumeTimeLocked(ctx)

	// Evict all evictable MemoryFile allocations.
	k.mf.StartEvictions()
	k.mf.WaitForEvictions()

	// Discard unsavable mappings, such as those for host file descriptors.
	if err := k.invalidateUnsavableMappings(ctx); err != nil {
		return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
	}

	// Capture all private memory files.
	mfsToSave := make(map[string]*pgalloc.MemoryFile)
	vfsCtx := context.WithValue(ctx, pgalloc.CtxMemoryFileMap, mfsToSave)
	// Prepare filesystems for saving. This must be done after
	// invalidateUnsavableMappings(), since dropping memory mappings may
	// affect filesystem state (e.g. page cache reference counts).
	if err := k.vfs.PrepareSave(vfsCtx); err != nil {
		return err
	}
	// Mark all to-be-saved MemoryFiles as savable to inform the kernel save
	// below.
	k.mf.MarkSavable()
	for _, mf := range mfsToSave {
		mf.MarkSavable()
	}

	// Save the CPUID FeatureSet before the rest of the kernel so we can
	// verify its compatibility on restore before attempting to restore the
	// entire kernel, which may fail on an incompatible machine.
	//
	// N.B. This will also be saved along with the full kernel save below.
	cpuidStart := time.Now()
	if _, err := state.Save(ctx, w, &k.featureSet); err != nil {
		return err
	}
	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))

	if rootNS := k.rootNetworkNamespace; rootNS != nil && rootNS.Stack() != nil {
		// Pause the network stack.
		netstackPauseStart := time.Now()
		log.Infof("Pausing root network namespace")
		k.rootNetworkNamespace.Stack().Pause()
		defer k.rootNetworkNamespace.Stack().Resume()
		log.Infof("Pausing root network namespace took [%s].", time.Since(netstackPauseStart))
	}

	// Save the kernel state.
	kernelStart := time.Now()
	stats, err := state.Save(ctx, w, k)
	if err != nil {
		return err
	}
	log.Infof("Kernel save stats: %s", stats.String())
	log.Infof("Kernel save took [%s].", time.Since(kernelStart))

	// Save the memory files' state.
	memoryStart := time.Now()
	pmw := w
	if pagesMetadata != nil {
		pmw = pagesMetadata
	}
	pw := w
	if pagesFile != nil {
		pw = pagesFile
	}
	if err := k.mf.SaveTo(ctx, pmw, pw, mfOpts); err != nil {
		return err
	}
	if err := savePrivateMFs(ctx, pmw, pw, mfsToSave, mfOpts); err != nil {
		return err
	}
	log.Infof("Memory files save took [%s].", time.Since(memoryStart))

	log.Infof("Overall save took [%s].", time.Since(saveStart))

	return nil
}

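// A checkpoint sequence satisfying SaveTo's precondition (a sketch; real
// callers such as runsc's control server also coordinate the watchdog and
// the state file format):
//
//	k.Pause() // SaveTo requires a paused kernel
//	defer k.Unpause()
//	if err := k.SaveTo(ctx, w, nil /* pagesMetadata */, nil /* pagesFile */, pgalloc.SaveOpts{}); err != nil {
//		return err
//	}
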
// Preconditions: The kernel must be paused.
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
	invalidated := make(map[*mm.MemoryManager]struct{})
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()
	for t := range k.tasks.Root.tids {
		// We can skip locking Task.mu here since the kernel is paused.
		if memMgr := t.image.MemoryManager; memMgr != nil {
			if _, ok := invalidated[memMgr]; !ok {
				if err := memMgr.InvalidateUnsavable(ctx); err != nil {
					return err
				}
				invalidated[memMgr] = struct{}{}
			}
		}
		// I really wish we just had a sync.Map of all MMs...
		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
			if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
				return err
			}
		}
	}
	return nil
}

// LoadFrom restores the state of k from r.
func (k *Kernel) LoadFrom(ctx context.Context, r io.Reader, pagesMetadata, pagesFile *fd.FD, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
	loadStart := time.Now()

	var (
		mfLoadWg  sync.WaitGroup
		mfLoadErr error
	)
	parallelMfLoad := pagesMetadata != nil && pagesFile != nil
	if parallelMfLoad {
		// Parallelize MemoryFile load and kernel load. Both are independent.
		mfLoadWg.Add(1)
		go func() {
			defer mfLoadWg.Done()
			mfLoadErr = k.loadMemoryFiles(ctx, r, pagesMetadata, pagesFile)
		}()
		// Defer a Wait() so we wait for k.loadMemoryFiles() to complete even if we
		// error out without reaching the other Wait() below.
		defer mfLoadWg.Wait()
	}

	k.runningTasksCond.L = &k.runningTasksMu
	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
	k.cpuClockTickerStopCond.L = &k.runningTasksMu

	initAppCores := k.applicationCores

	// Load the pre-saved CPUID FeatureSet.
	//
	// N.B. This was also saved along with the full kernel below, so we
	// don't need to explicitly install it in the Kernel.
	cpuidStart := time.Now()
	if _, err := state.Load(ctx, r, &k.featureSet); err != nil {
		return err
	}
	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))

	// Verify that the FeatureSet is usable on this host. We do this before
	// Kernel load so that the explicit CPUID mismatch error has priority
	// over floating point state restore errors that may occur on load on
	// an incompatible machine.
	if err := k.featureSet.CheckHostCompatible(); err != nil {
		return err
	}

	// Load the kernel state.
	kernelStart := time.Now()
	stats, err := state.Load(ctx, r, k)
	if err != nil {
		return err
	}
	log.Infof("Kernel load stats: %s", stats.String())
	log.Infof("Kernel load took [%s].", time.Since(kernelStart))

	if parallelMfLoad {
		mfLoadWg.Wait()
	} else {
		mfLoadErr = k.loadMemoryFiles(ctx, r, pagesMetadata, pagesFile)
	}
	if mfLoadErr != nil {
		return mfLoadErr
	}

	// rootNetworkNamespace should be populated after loading the state file.
	// Restore the root network stack.
	k.rootNetworkNamespace.RestoreRootStack(net)

	k.Timekeeper().SetClocks(clocks)

	if timeReady != nil {
		close(timeReady)
	}

	if net != nil {
		net.Restore()
	}

	if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
		return err
	}

	tcpip.AsyncLoading.Wait()

	log.Infof("Overall load took [%s] after async work", time.Since(loadStart))

	// Applications may size per-cpu structures based on k.applicationCores, so
	// it can't change across save/restore. When we are virtualizing CPU
	// numbers, this isn't a problem. However, when we are exposing host CPU
	// assignments, we can't tolerate an increase in the number of host CPUs,
	// which could result in getcpu(2) returning CPUs that applications expect
	// not to exist.
	if k.useHostCores && initAppCores > k.applicationCores {
		return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
	}

	return nil
}

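// A matching restore sequence (a sketch; nil optional arguments select the
// single-reader, non-parallel load path, and mf, netStack, and clocks are
// assumed to come from the caller):
//
//	k := &Kernel{}
//	k.SetMemoryFile(mf) // as with Init, required before LoadFrom
//	err := k.LoadFrom(ctx, r, nil /* pagesMetadata */, nil /* pagesFile */,
//		nil /* timeReady */, netStack, clocks, nil /* vfsOpts */)
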
func (k *Kernel) loadMemoryFiles(ctx context.Context, r io.Reader, pagesMetadata, pagesFile *fd.FD) error {
	// Load the memory files' state.
	memoryStart := time.Now()
	pmr := r
	if pagesMetadata != nil {
		pmr = pagesMetadata
	}
	var pr *statefile.AsyncReader
	if pagesFile != nil {
		pr = statefile.NewAsyncReader(pagesFile, 0 /* off */)
	}
	if err := k.mf.LoadFrom(ctx, pmr, pr); err != nil {
		return err
	}
	if err := loadPrivateMFs(ctx, pmr, pr); err != nil {
		return err
	}
	if pr != nil {
		if err := pr.Close(); err != nil {
			return err
		}
	}
	log.Infof("Memory files load took [%s].", time.Since(memoryStart))
	return nil
}

// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
	id := k.uniqueID.Add(1)
	if id == 0 {
		panic("unique identifier generator wrapped around")
	}
	return id
}

// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
	// Filename is the filename to load as the init binary.
	//
	// If this is provided as "", File will be checked, then the file will be
	// guessed via Argv[0].
	Filename string

	// File is a passed host FD pointing to a file to load as the init binary.
	//
	// This is checked if and only if Filename is "".
	File *vfs.FileDescription

	// Argv is a list of arguments.
	Argv []string

	// Envv is a list of environment variables.
	Envv []string

	// WorkingDirectory is the initial working directory.
	//
	// This defaults to the root if empty.
	WorkingDirectory string

	// Credentials is the initial credentials.
	Credentials *auth.Credentials

	// FDTable is the initial set of file descriptors. If CreateProcess succeeds,
	// it takes a reference on FDTable.
	FDTable *FDTable

	// Umask is the initial umask.
	Umask uint

	// Limits are the initial resource limits.
	Limits *limits.LimitSet

	// MaxSymlinkTraversals is the maximum number of symlinks to follow
	// during resolution.
	MaxSymlinkTraversals uint

	// UTSNamespace is the initial UTS namespace.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the initial IPC namespace.
	IPCNamespace *IPCNamespace

	// PIDNamespace is the initial PID Namespace.
	PIDNamespace *PIDNamespace

	// MountNamespace optionally contains the mount namespace for this
	// process. If nil, the init process's mount namespace is used.
	//
	// Anyone setting MountNamespace must donate a reference (i.e.
	// increment it).
	MountNamespace *vfs.MountNamespace

	// ContainerID is the container that the process belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the container is initialized to.
	InitialCgroups map[Cgroup]struct{}

	// Origin indicates how the task was first created.
	Origin TaskOrigin
}

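// A sketch of creating and starting the init process (the credential, FD
// table, and limit values are assumed to come from the caller; the traversal
// limit is illustrative):
//
//	tg, _, err := k.CreateProcess(CreateProcessArgs{
//		Argv:                 []string{"/bin/sh"},
//		Credentials:          creds,
//		FDTable:              fdTable,
//		Limits:               limits.NewLimitSet(),
//		MaxSymlinkTraversals: 10,
//		UTSNamespace:         k.RootUTSNamespace(),
//		IPCNamespace:         k.RootIPCNamespace(),
//		PIDNamespace:         k.RootPIDNamespace(),
//	})
//	if err != nil {
//		return err
//	}
//	k.StartProcess(tg) // only needed if k.Start() has already been called
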
// NewContext returns a context.Context that represents the process that will
// be created by k.CreateProcess(args).
func (args *CreateProcessArgs) NewContext(k *Kernel) context.Context {
	return &createProcessContext{
		Context: context.Background(),
		kernel:  k,
		args:    args,
	}
}

// createProcessContext is a context.Context that represents the context
// associated with a task that is being created.
type createProcessContext struct {
	context.Context
	kernel *Kernel
	args   *CreateProcessArgs
}

// Value implements context.Context.Value.
func (ctx *createProcessContext) Value(key any) any {
	switch key {
	case CtxKernel:
		return ctx.kernel
	case CtxPIDNamespace:
		return ctx.args.PIDNamespace
	case CtxUTSNamespace:
		utsns := ctx.args.UTSNamespace
		utsns.IncRef()
		return utsns
	case ipc.CtxIPCNamespace:
		ipcns := ctx.args.IPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		return ctx.args.Credentials
	case vfs.CtxRoot:
		if ctx.args.MountNamespace == nil {
			return nil
		}
		root := ctx.args.MountNamespace.Root(ctx)
		return root
	case vfs.CtxMountNamespace:
		if ctx.kernel.globalInit == nil {
			return nil
		}
		mntns := ctx.kernel.GlobalInit().Leader().MountNamespace()
		mntns.IncRef()
		return mntns
	case devutil.CtxDevGoferClient:
		return ctx.kernel.GetDevGoferClient(ctx.kernel.ContainerName(ctx.args.ContainerID))
	case inet.CtxStack:
		return ctx.kernel.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.kernel.RealtimeClock()
	case limits.CtxLimits:
		return ctx.args.Limits
	case pgalloc.CtxMemoryCgroupID:
		return ctx.getMemoryCgroupID()
	case pgalloc.CtxMemoryFile:
		return ctx.kernel.mf
	case platform.CtxPlatform:
		return ctx.kernel
	case uniqueid.CtxGlobalUniqueID:
		return ctx.kernel.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.kernel
	case uniqueid.CtxInotifyCookie:
		return ctx.kernel.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.kernel
	default:
		return nil
	}
}

func (ctx *createProcessContext) getMemoryCgroupID() uint32 {
	for cg := range ctx.args.InitialCgroups {
		for _, ctl := range cg.Controllers() {
			if ctl.Type() == CgroupControllerMemory {
				return cg.ID()
			}
		}
	}
	return InvalidCgroupID
}

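// Values stored in this context are normally retrieved through the same
// typed helpers used elsewhere in this file, rather than via Value directly
// (a sketch):
//
//	ctx := args.NewContext(k)
//	creds := auth.CredentialsFromContext(ctx) // == args.Credentials
//	lims := limits.FromContext(ctx)           // == args.Limits
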
// CreateProcess creates a new task in a new thread group with the given
// options. The new task has no parent and is in the root PID namespace.
//
// If k.Start() has already been called, then the created process must be
// started by calling kernel.StartProcess(tg).
//
// If k.Start() has not yet been called, then the created task will begin
// running when k.Start() is called.
//
// CreateProcess has no analogue in Linux; it is used to create the initial
// application task, as well as processes started by the control server.
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	log.Infof("EXEC: %v", args.Argv)

	ctx := args.NewContext(k)
	mntns := args.MountNamespace
	if mntns == nil {
		if k.globalInit == nil {
			return nil, 0, fmt.Errorf("mount namespace is nil")
		}
		// Add a reference to the namespace, which is transferred to the new process.
		mntns = k.globalInit.Leader().MountNamespace()
		mntns.IncRef()
	}
	// Get the root directory from the MountNamespace.
	root := mntns.Root(ctx)
	defer root.DecRef(ctx)

	// Grab the working directory.
	wd := root // Default.
	if args.WorkingDirectory != "" {
		pop := vfs.PathOperation{
			Root:               root,
			Start:              wd,
			Path:               fspath.Parse(args.WorkingDirectory),
			FollowFinalSymlink: true,
		}
		// NOTE(b/236028361): Do not set CheckSearchable flag to true.
		// Application is allowed to start with a working directory that it
		// cannot access/search. This is consistent with Docker and VFS1. Runc
		// explicitly allows for this in 6ce2d63a5db6 ("libct/init_linux: retry
		// chdir to fix EPERM"). As described in the commit, runc unintentionally
		// allowed this behavior in a couple of releases and applications started
		// relying on it. So they decided to allow it for backward compatibility.
		var err error
		wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{})
		if err != nil {
			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
		}
		defer wd.DecRef(ctx)
	}
	fsContext := NewFSContext(root, wd, args.Umask)

	tg := k.NewThreadGroup(args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
	cu := cleanup.Make(func() {
		tg.Release(ctx)
	})
	defer cu.Clean()

	// Check which file to start from.
	switch {
	case args.Filename != "":
		// If a filename is given, take that.
		// Set File to nil so we resolve the path in LoadTaskImage.
		args.File = nil
	case args.File != nil:
		// If File is set, take the File provided directly.
		args.Filename = args.File.MappedName(ctx)
	default:
		// Otherwise look at Argv and see if the first argument is a valid path.
		if len(args.Argv) == 0 {
			return nil, 0, fmt.Errorf("no filename or command provided")
		}
		if !filepath.IsAbs(args.Argv[0]) {
			return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
		}
		args.Filename = args.Argv[0]
	}

	// Create a fresh task context.
	remainingTraversals := args.MaxSymlinkTraversals
	loadArgs := loader.LoadArgs{
		Root:                root,
		WorkingDir:          wd,
		RemainingTraversals: &remainingTraversals,
		ResolveFinal:        true,
		Filename:            args.Filename,
		File:                args.File,
		CloseOnExec:         false,
		Argv:                args.Argv,
		Envv:                args.Envv,
		Features:            k.featureSet,
	}

	image, se := k.LoadTaskImage(ctx, loadArgs)
	if se != nil {
		return nil, 0, errors.New(se.String())
	}
	var capData auth.VfsCapData
	if len(image.FileCaps()) != 0 {
		var err error
		capData, err = auth.VfsCapDataOf([]byte(image.FileCaps()))
		if err != nil {
			return nil, 0, err
		}
	}
	creds, err := auth.CapsFromVfsCaps(capData, args.Credentials)
	if err != nil {
		return nil, 0, err
	}
	args.FDTable.IncRef()

	// Create the task.
	config := &TaskConfig{
		Kernel:           k,
		ThreadGroup:      tg,
		TaskImage:        image,
		FSContext:        fsContext,
		FDTable:          args.FDTable,
		Credentials:      creds,
		NetworkNamespace: k.RootNetworkNamespace(),
		AllowedCPUMask:   sched.NewFullCPUSet(k.applicationCores),
		UTSNamespace:     args.UTSNamespace,
		IPCNamespace:     args.IPCNamespace,
		MountNamespace:   mntns,
		ContainerID:      args.ContainerID,
		InitialCgroups:   args.InitialCgroups,
		UserCounters:     k.GetUserCounters(args.Credentials.RealKUID),
		Origin:           args.Origin,
		// A task with no parent starts out with no session keyring.
		SessionKeyring: nil,
	}
	config.UTSNamespace.IncRef()
	config.IPCNamespace.IncRef()
	config.NetworkNamespace.IncRef()
	t, err := k.tasks.NewTask(ctx, config)
	if err != nil {
		return nil, 0, err
	}
	t.traceExecEvent(image) // Simulate exec for tracing.

	// Success.
	cu.Release()
	tgid := k.tasks.Root.IDOfThreadGroup(tg)
	if k.globalInit == nil {
		k.globalInit = tg
	}
	return tg, tgid, nil
}

// StartProcess starts running a process that was created with CreateProcess.
func (k *Kernel) StartProcess(tg *ThreadGroup) {
	t := tg.Leader()
	tid := k.tasks.Root.IDOfTask(t)
	t.Start(tid)
}

// Start starts execution of all tasks in k.
//
// Preconditions: Start may be called exactly once.
func (k *Kernel) Start() error {
	k.extMu.Lock()
	defer k.extMu.Unlock()

	if k.started {
		return fmt.Errorf("kernel already started")
	}

	k.started = true
	k.cpuClockTickTimer = time.NewTimer(linux.ClockTick)
	k.runningTasksMu.Lock()
	k.cpuClockTickerRunning = true
	k.runningTasksMu.Unlock()
	go k.runCPUClockTicker()
	// If k was created by LoadFrom, timers were stopped during Kernel.SaveTo
	// and need to be resumed. If k was created by Init, this is a no-op.
	k.resumeTimeLocked(k.SupervisorContext())
	k.tasks.mu.RLock()
	ts := make([]*Task, 0, len(k.tasks.Root.tids))
	for t := range k.tasks.Root.tids {
		ts = append(ts, t)
	}
	k.tasks.mu.RUnlock()
	// Start task goroutines.
	// NOTE(b/235349091): We don't actually need the TaskSet mutex, we just
	// need to make sure we only call t.Start() once for each task. Holding the
	// mutex for each task start may cause a nested locking error.
	for _, t := range ts {
		t.Start(t.ThreadID())
	}
	return nil
}

// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
// Preconditions:
//   - Any task goroutines running in k must be stopped.
//   - k.extMu must be locked.
func (k *Kernel) pauseTimeLocked(ctx context.Context) {
	// Since all task goroutines have been stopped by precondition, the CPU clock
	// ticker should stop on its own; wait for it to do so, waking it up from
	// sleeping between ticks if necessary.
	k.runningTasksMu.Lock()
	for k.cpuClockTickerRunning {
		select {
		case k.cpuClockTickerWakeCh <- struct{}{}:
		default:
		}
		k.cpuClockTickerStopCond.Wait()
	}
	k.runningTasksMu.Unlock()

	// By precondition, nothing else can be interacting with PIDNamespace.tids
	// or FDTable.files, so we can iterate them without synchronization. (We
	// can't hold the TaskSet mutex when pausing thread group timers because
	// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
	// mutex, while holding the Timer mutex.)
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Pause()
			for _, it := range t.tg.timers {
				it.PauseTimer()
			}
		}
		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
		// but ktime.Timer.Pause is idempotent so this is harmless.
		if t.fdTable != nil {
			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
					tfd.PauseTimer()
				}
			})
		}
	}
	k.timekeeper.PauseUpdates()
}

// resumeTimeLocked resumes all Timers and Timekeeper updates. If
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
// Preconditions:
//   - Any task goroutines running in k must be stopped.
//   - k.extMu must be locked.
func (k *Kernel) resumeTimeLocked(ctx context.Context) {
	// The CPU clock ticker will automatically resume as task goroutines resume
	// execution.

	k.timekeeper.ResumeUpdates()
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Resume()
			for _, it := range t.tg.timers {
				it.ResumeTimer()
			}
		}
		if t.fdTable != nil {
			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
					tfd.ResumeTimer()
				}
			})
		}
	}
}

func (k *Kernel) incRunningTasks() {
	for {
		tasks := k.runningTasks.Load()
		if tasks != 0 {
			// Standard case. Simply increment.
			if !k.runningTasks.CompareAndSwap(tasks, tasks+1) {
				continue
			}
			return
		}

		// Transition from 0 -> 1.
		k.runningTasksMu.Lock()
		if k.runningTasks.Load() != 0 {
			// Raced with another transition and lost.
			k.runningTasks.Add(1)
			k.runningTasksMu.Unlock()
			return
		}
		if !k.cpuClockTickerRunning {
			select {
			case tickTime := <-k.cpuClockTickTimer.C:
				// Rearm the timer since we consumed the wakeup. Estimate how much time
				// remains on the current tick so that periodic workloads interact with
				// the (periodic) CPU clock ticker in the same way that they would
				// without the optimization of putting the ticker to sleep.
				missedNS := time.Since(tickTime).Nanoseconds()
				missedTicks := missedNS / linux.ClockTick.Nanoseconds()
				thisTickNS := missedNS - missedTicks*linux.ClockTick.Nanoseconds()
				k.cpuClockTickTimer.Reset(time.Duration(linux.ClockTick.Nanoseconds() - thisTickNS))
				// Increment k.cpuClock on the CPU clock ticker goroutine's behalf.
				// (Whole missed ticks don't matter, and adding them to k.cpuClock will
				// just confuse the watchdog.) At the time the tick occurred, all task
				// goroutines were asleep, so there's nothing else to do. This ensures
				// that our caller (Task.accountTaskGoroutineLeave()) records an
				// updated k.cpuClock in Task.gosched.Timestamp, so that it's correctly
				// accounted as having resumed execution in the sentry during this tick
				// instead of at the end of the previous one.
				k.cpuClock.Add(1)
			default:
			}
			// We are transitioning from idle to active. Set k.cpuClockTickerRunning
			// = true here so that if we transition to idle and then active again
			// before the CPU clock ticker goroutine has a chance to run, the first
			// call to k.incRunningTasks() at the end of that cycle does not try to
			// steal k.cpuClockTickTimer.C again, as this would allow workloads that
			// rapidly cycle between idle and active to starve the CPU clock ticker
			// of chances to observe task goroutines in a running state and account
			// their CPU usage.
			k.cpuClockTickerRunning = true
			k.runningTasksCond.Signal()
		}
		// This store must happen after the increment of k.cpuClock above to ensure
		// that concurrent calls to Task.accountTaskGoroutineLeave() also observe
		// the updated k.cpuClock.
		k.runningTasks.Store(1)
		k.runningTasksMu.Unlock()
		return
	}
}

func (k *Kernel) decRunningTasks() {
	tasks := k.runningTasks.Add(-1)
	if tasks < 0 {
		panic(fmt.Sprintf("Invalid running count %d", tasks))
	}

	// Nothing to do. The next CPU clock tick will disable the timer if
	// there is still nothing running. This provides approximately one tick
	// of slack in which we can switch back and forth between idle and
	// active without an expensive transition.
}

// WaitExited blocks until all tasks in k have exited.
func (k *Kernel) WaitExited() {
	k.tasks.liveGoroutines.Wait()
}

// Kill requests that all tasks in k immediately exit as if group exiting with
// status ws. Kill does not wait for tasks to exit.
func (k *Kernel) Kill(ws linux.WaitStatus) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.Kill(ws)
}

// Pause requests that all tasks in k temporarily stop executing, and blocks
// until all tasks and asynchronous I/O operations in k have stopped. Multiple
// calls to Pause nest and require an equal number of calls to Unpause to
// resume execution.
func (k *Kernel) Pause() {
	k.extMu.Lock()
	k.tasks.BeginExternalStop()
	k.extMu.Unlock()
	k.tasks.runningGoroutines.Wait()
	k.tasks.aioGoroutines.Wait()
}

// ReceiveTaskStates receives full states for all tasks.
func (k *Kernel) ReceiveTaskStates() {
	k.extMu.Lock()
	k.tasks.PullFullState()
	k.extMu.Unlock()
}

// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.EndExternalStop()
}

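// Because Pause and Unpause nest, independent callers may bracket their own
// operations without coordinating with each other (a sketch):
//
//	k.Pause()   // outer caller stops all tasks
//	k.Pause()   // inner caller nests
//	k.Unpause() // tasks remain stopped
//	k.Unpause() // matching count reached; tasks resume
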
// SendExternalSignal injects a signal into the kernel.
//
// context is used only for debugging to describe how the signal was received.
//
// Preconditions: Kernel must have an init process.
func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.sendExternalSignal(info, context)
}

// SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup.
//
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return tg.SendSignal(info)
}

// SendExternalSignalProcessGroup sends a signal to all ThreadGroups in the
// given process group.
//
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalProcessGroup(pg *ProcessGroup, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	// If anything goes wrong, we'll return the error, but still try our
	// best to deliver to other processes in the group.
	var firstErr error
	for _, tg := range k.TaskSet().Root.ThreadGroups() {
		if tg.ProcessGroup() != pg {
			continue
		}
		if err := tg.SendSignal(info); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}

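// A minimal sketch of delivering a signal to one thread group (SignalInfo
// construction simplified; callers typically populate more fields):
//
//	info := &linux.SignalInfo{Signo: int32(linux.SIGTERM)}
//	if err := k.SendExternalSignalThreadGroup(tg, info); err != nil {
//		return err
//	}
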
// SendContainerSignal sends the given signal to all processes inside the
// namespace that match the given container ID.
func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()

	var lastErr error
	for tg := range k.tasks.Root.tgids {
		if tg.leader.ContainerID() == cid {
			tg.signalHandlers.mu.Lock()
			infoCopy := *info
			if err := tg.leader.sendSignalLocked(&infoCopy, true /* group */); err != nil {
				lastErr = err
			}
			tg.signalHandlers.mu.Unlock()
		}
	}
	return lastErr
}

// RebuildTraceContexts rebuilds the trace context for all tasks.
//
// Unfortunately, if these are built while tracing is not enabled, then we will
// not have meaningful trace data. Rebuilding here ensures that we can do so
// after tracing has been enabled.
func (k *Kernel) RebuildTraceContexts() {
	// We need to pause all task goroutines because Task.rebuildTraceContext()
	// replaces Task.traceContext and Task.traceTask, which are
	// task-goroutine-exclusive (i.e. the task goroutine assumes that it can
	// access them without synchronization) for performance.
	k.Pause()
	defer k.Unpause()

	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()

	for t, tid := range k.tasks.Root.tids {
		t.rebuildTraceContext(tid)
	}
}

// FeatureSet returns the FeatureSet.
func (k *Kernel) FeatureSet() cpuid.FeatureSet {
	return k.featureSet
}

// Timekeeper returns the Timekeeper.
func (k *Kernel) Timekeeper() *Timekeeper {
	return k.timekeeper
}

// TaskSet returns the TaskSet.
func (k *Kernel) TaskSet() *TaskSet {
	return k.tasks
}

// RootUserNamespace returns the root UserNamespace.
func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
	return k.rootUserNamespace
}

// RootUTSNamespace returns the root UTSNamespace.
func (k *Kernel) RootUTSNamespace() *UTSNamespace {
	return k.rootUTSNamespace
}

// RootIPCNamespace takes a reference and returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
	return k.rootIPCNamespace
}

// RootPIDNamespace returns the root PIDNamespace.
func (k *Kernel) RootPIDNamespace() *PIDNamespace {
	return k.tasks.Root
}

// RootNetworkNamespace returns the root network namespace, always non-nil.
func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
	return k.rootNetworkNamespace
}

// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
// nil if no such thread group exists. GlobalInit may return a thread group
// containing no tasks if the thread group has already exited.
func (k *Kernel) GlobalInit() *ThreadGroup {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return k.globalInit
}

// TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace.
func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) {
	k.globalInit = tg
}

// ApplicationCores returns the number of CPUs visible to sandboxed
// applications.
func (k *Kernel) ApplicationCores() uint {
	return k.applicationCores
}

// RealtimeClock returns the application CLOCK_REALTIME clock.
func (k *Kernel) RealtimeClock() ktime.Clock {
	return k.timekeeper.realtimeClock
}

// MonotonicClock returns the application CLOCK_MONOTONIC clock.
func (k *Kernel) MonotonicClock() ktime.Clock {
	return k.timekeeper.monotonicClock
}

// CPUClockNow returns the current value of k.cpuClock.
func (k *Kernel) CPUClockNow() uint64 {
	return k.cpuClock.Load()
}

// Syslog returns the syslog.
func (k *Kernel) Syslog() *syslog {
	return &k.syslog
}

// GenerateInotifyCookie generates a unique inotify event cookie.
//
// Returned values may overlap with previously returned values if the value
// space is exhausted. 0 is not a valid cookie value; all other values
// representable in a uint32 are allowed.
func (k *Kernel) GenerateInotifyCookie() uint32 {
	id := k.nextInotifyCookie.Add(1)
	// Wrap-around is explicitly allowed for inotify event cookies.
	if id == 0 {
		id = k.nextInotifyCookie.Add(1)
	}
	return id
}

// NetlinkPorts returns the netlink port manager.
func (k *Kernel) NetlinkPorts() *port.Manager {
	return k.netlinkPorts
}

var (
	errSaved     = errors.New("sandbox has been successfully saved")
	errAutoSaved = errors.New("sandbox has been successfully auto-saved")
)

// SaveStatus returns the sandbox save status. If it was saved successfully,
// autosaved indicates whether save was triggered by autosave. If it was not
// saved successfully, err indicates the sandbox error that caused the kernel to
// exit during save.
func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	switch k.saveStatus {
	case nil:
		return false, false, nil
	case errSaved:
		return true, false, nil
	case errAutoSaved:
		return true, true, nil
	default:
		return false, false, k.saveStatus
	}
}

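// Interpreting the three results (a sketch):
//
//	saved, autosaved, err := k.SaveStatus()
//	switch {
//	case err != nil:         // the sandbox exited during save; err is the cause
//	case saved && autosaved: // an autosave completed successfully
//	case saved:              // an explicit save completed successfully
//	default:                 // no save has happened yet
//	}
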
// SetSaveSuccess sets the flag indicating that save completed successfully, if
// no status was already set.
func (k *Kernel) SetSaveSuccess(autosave bool) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	if k.saveStatus == nil {
		if autosave {
			k.saveStatus = errAutoSaved
		} else {
			k.saveStatus = errSaved
		}
	}
}

// SetSaveError sets the sandbox error that caused the kernel to exit during
// save, if one is not already set.
func (k *Kernel) SetSaveError(err error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	if k.saveStatus == nil {
		k.saveStatus = err
	}
}

// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
// LoadFrom.
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
	k.mf = mf
}

// MemoryFile returns the MemoryFile that provides application memory.
func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
	return k.mf
}

// SupervisorContext returns a Context with maximum privileges in k. It should
// only be used by goroutines outside the control of the emulated kernel
// defined by k.
//
// Callers are responsible for ensuring that the returned Context is not used
// concurrently with changes to the Kernel.

// SupervisorContext returns a Context with maximum privileges in k. It should
// only be used by goroutines outside the control of the emulated kernel
// defined by k.
//
// Callers are responsible for ensuring that the returned Context is not used
// concurrently with changes to the Kernel.
func (k *Kernel) SupervisorContext() context.Context {
	return &supervisorContext{
		Kernel: k,
		Logger: log.Log(),
	}
}

// SocketRecord represents a socket recorded in Kernel.sockets.
//
// +stateify savable
type SocketRecord struct {
	k    *Kernel
	Sock *vfs.FileDescription
	ID   uint64 // Socket table entry number.
}

// RecordSocket adds a socket to the system-wide socket table for
// tracking.
//
// Precondition: Caller must hold a reference to sock.
//
// Note that the socket table will not hold a reference on the
// vfs.FileDescription.
func (k *Kernel) RecordSocket(sock *vfs.FileDescription) {
	k.extMu.Lock()
	if _, ok := k.sockets[sock]; ok {
		panic(fmt.Sprintf("Socket %p added twice", sock))
	}
	id := k.nextSocketRecord
	k.nextSocketRecord++
	s := &SocketRecord{
		k:    k,
		ID:   id,
		Sock: sock,
	}
	k.sockets[sock] = s
	k.extMu.Unlock()
}

// DeleteSocket removes a socket from the system-wide socket table.
func (k *Kernel) DeleteSocket(sock *vfs.FileDescription) {
	k.extMu.Lock()
	delete(k.sockets, sock)
	k.extMu.Unlock()
}

// ListSockets returns a snapshot of all sockets.
//
// Callers of ListSockets() should use SocketRecord.Sock.TryIncRef()
// to get a reference on a socket in the table.
func (k *Kernel) ListSockets() []*SocketRecord {
	k.extMu.Lock()
	var socks []*SocketRecord
	for _, s := range k.sockets {
		socks = append(socks, s)
	}
	k.extMu.Unlock()
	return socks
}

// supervisorContext is a privileged context.
type supervisorContext struct {
	context.NoTask
	log.Logger
	*Kernel
}

// Deadline implements context.Context.Deadline.
func (*Kernel) Deadline() (time.Time, bool) {
	return time.Time{}, false
}

// Done implements context.Context.Done.
func (*Kernel) Done() <-chan struct{} {
	return nil
}

// Err implements context.Context.Err.
func (*Kernel) Err() error {
	return nil
}
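
// Illustrative example (editor's sketch; countLiveSockets is a hypothetical
// helper, not part of this file): because the socket table holds no
// references, entries returned by ListSockets may already be dying, so a
// caller must use TryIncRef before touching a socket.
func countLiveSockets(ctx context.Context, k *Kernel) int {
	live := 0
	for _, sr := range k.ListSockets() {
		if sr.Sock.TryIncRef() {
			live++
			sr.Sock.DecRef(ctx)
		}
	}
	return live
}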

// Value implements context.Context.Value.
func (ctx *supervisorContext) Value(key any) any {
	switch key {
	case CtxCanTrace:
		// The supervisor context can trace anything. (None of
		// supervisorContext's users are expected to invoke ptrace, but ptrace
		// permissions are required for certain file accesses.)
		return func(*Task, bool) bool { return true }
	case CtxKernel:
		return ctx.Kernel
	case CtxPIDNamespace:
		return ctx.Kernel.tasks.Root
	case CtxUTSNamespace:
		utsns := ctx.Kernel.rootUTSNamespace
		utsns.IncRef()
		return utsns
	case ipc.CtxIPCNamespace:
		ipcns := ctx.Kernel.rootIPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		// The supervisor context is global root.
		return auth.NewRootCredentials(ctx.Kernel.rootUserNamespace)
	case vfs.CtxRoot:
		if ctx.Kernel.globalInit == nil || ctx.Kernel.globalInit.Leader() == nil {
			return vfs.VirtualDentry{}
		}
		root := ctx.Kernel.GlobalInit().Leader().MountNamespace().Root(ctx)
		return root
	case vfs.CtxMountNamespace:
		if ctx.Kernel.globalInit == nil || ctx.Kernel.globalInit.Leader() == nil {
			return nil
		}
		mntns := ctx.Kernel.GlobalInit().Leader().MountNamespace()
		mntns.IncRef()
		return mntns
	case inet.CtxStack:
		return ctx.Kernel.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.Kernel.RealtimeClock()
	case limits.CtxLimits:
		// No limits apply.
		return limits.NewLimitSet()
	case pgalloc.CtxMemoryFile:
		return ctx.Kernel.mf
	case platform.CtxPlatform:
		return ctx.Kernel
	case uniqueid.CtxGlobalUniqueID:
		return ctx.Kernel.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.Kernel
	case uniqueid.CtxInotifyCookie:
		return ctx.Kernel.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.Kernel
	case cpuid.CtxFeatureSet:
		return ctx.Kernel.featureSet
	default:
		return nil
	}
}

// Rate limits for the number of unimplemented syscall events.
const (
	unimplementedSyscallsMaxRate = 100  // events per second
	unimplementedSyscallBurst    = 1000 // events
)

// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
// channel.
func (k *Kernel) EmitUnimplementedEvent(ctx context.Context, sysno uintptr) {
	k.unimplementedSyscallEmitterOnce.Do(func() {
		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
	})

	t := TaskFromContext(ctx)
	IncrementUnimplementedSyscallCounter(sysno)
	_, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
		Tid:       int32(t.ThreadID()),
		Registers: t.Arch().StateData().Proto(),
	})
}

// VFS returns the virtual filesystem for the kernel.
func (k *Kernel) VFS() *vfs.VirtualFilesystem {
	return &k.vfs
}

// SetHostMount sets the hostfs mount.
func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
	if k.hostMount != nil {
		panic("Kernel.hostMount cannot be set more than once")
	}
	k.hostMount = mnt
}

// HostMount returns the hostfs mount.
func (k *Kernel) HostMount() *vfs.Mount {
	return k.hostMount
}

// PipeMount returns the pipefs mount.
func (k *Kernel) PipeMount() *vfs.Mount {
	return k.pipeMount
}

// GetNamespaceInode returns a new nsfs inode which serves as a reference
// counter for the namespace.
func (k *Kernel) GetNamespaceInode(ctx context.Context, ns vfs.Namespace) refs.TryRefCounter {
	return nsfs.NewInode(ctx, k.nsfsMount, ns)
}

// ShmMount returns the tmpfs mount.
func (k *Kernel) ShmMount() *vfs.Mount {
	return k.shmMount
}

// SocketMount returns the sockfs mount.
func (k *Kernel) SocketMount() *vfs.Mount {
	return k.socketMount
}

// CgroupRegistry returns the cgroup registry.
func (k *Kernel) CgroupRegistry() *CgroupRegistry {
	return k.cgroupRegistry
}
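
// Illustrative example (editor's sketch; supervisorCredsExample is a
// hypothetical helper, not part of this file): the supervisor context wires
// the standard context accessors to the kernel's root objects, so e.g.
// credentials fetched from it are always global root.
func supervisorCredsExample(k *Kernel) *auth.Credentials {
	ctx := k.SupervisorContext()
	// Resolved via supervisorContext.Value(auth.CtxCredentials) above.
	return auth.CredentialsFromContext(ctx)
}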

// AddCgroupMount adds the cgroup mounts to the cgroupMountsMap. These cgroup
// mounts are created during the creation of the root container process, and
// reference ownership is transferred to the kernel.
func (k *Kernel) AddCgroupMount(ctl string, mnt *CgroupMount) {
	k.cgroupMountsMapMu.Lock()
	defer k.cgroupMountsMapMu.Unlock()

	if k.cgroupMountsMap == nil {
		k.cgroupMountsMap = make(map[string]*CgroupMount)
	}
	k.cgroupMountsMap[ctl] = mnt
}

// GetCgroupMount returns the cgroup mount for the given cgroup controller.
func (k *Kernel) GetCgroupMount(ctl string) *CgroupMount {
	k.cgroupMountsMapMu.Lock()
	defer k.cgroupMountsMapMu.Unlock()

	return k.cgroupMountsMap[ctl]
}

// releaseCgroupMounts releases the cgroup mounts.
func (k *Kernel) releaseCgroupMounts(ctx context.Context) {
	k.cgroupMountsMapMu.Lock()
	defer k.cgroupMountsMapMu.Unlock()

	for _, m := range k.cgroupMountsMap {
		m.Mount.DecRef(ctx)
		m.Root.DecRef(ctx)
		m.Fs.DecRef(ctx)
	}
}

// Release releases resources owned by k.
//
// Precondition: This should only be called after the kernel is fully
// initialized, e.g. after k.Start() has been called.
func (k *Kernel) Release() {
	ctx := k.SupervisorContext()
	k.releaseCgroupMounts(ctx)
	k.hostMount.DecRef(ctx)
	k.pipeMount.DecRef(ctx)
	k.nsfsMount.DecRef(ctx)
	k.shmMount.DecRef(ctx)
	k.socketMount.DecRef(ctx)
	k.vfs.Release(ctx)
	k.timekeeper.Destroy()
	k.vdso.Release(ctx)
	k.RootNetworkNamespace().DecRef(ctx)
	k.rootIPCNamespace.DecRef(ctx)
	k.rootUTSNamespace.DecRef(ctx)
	k.cleanupDevGofers()
}

// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
// hierarchy.
//
// Precondition: root must be a new cgroup with no tasks. This implies the
// controllers for root are also new and currently manage no task, which in
// turn implies the new cgroup can be populated without migrating tasks
// between cgroups.
func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) {
	k.tasks.mu.RLock()
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		// A task may already be in the cgroup if it was created after the
		// cgroup hierarchy was registered.
		t.enterCgroupIfNotYetLocked(root)
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()
}
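
// Illustrative example (editor's sketch; the controller name and arguments
// are hypothetical): the root container's setup code registers each
// controller's mount once, after which GetCgroupMount serves later lookups.
func registerMemoryCgroupExample(k *Kernel, fs *vfs.Filesystem, root *vfs.Dentry, mnt *vfs.Mount) *CgroupMount {
	k.AddCgroupMount("memory", &CgroupMount{Fs: fs, Root: root, Mount: mnt})
	// The kernel now owns the references; they are dropped in Release().
	return k.GetCgroupMount("memory")
}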

// ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the
// hierarchy with the provided id. This is intended for use during hierarchy
// teardown, as otherwise the tasks would be orphaned w.r.t. some controllers.
func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) {
	var releasedCGs []Cgroup

	k.tasks.mu.RLock()
	// We'll have one cgroup per hierarchy per task.
	releasedCGs = make([]Cgroup, 0, len(k.tasks.Root.tids))
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		for cg := range t.cgroups {
			if cg.HierarchyID() == hid {
				cg.Leave(t)
				t.ResetMemCgIDFromCgroup(cg)
				delete(t.cgroups, cg)
				releasedCGs = append(releasedCGs, cg)
				// A task can't be part of multiple cgroups from the same
				// hierarchy, so we can skip checking the rest once we find a
				// match.
				break
			}
		}
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()

	for _, c := range releasedCGs {
		c.decRef()
	}
}

// ReplaceFSContextRoots updates root and cwd to `newRoot` in the FSContext
// across all tasks whose old root or cwd were `oldRoot`.
func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualDentry, newRoot vfs.VirtualDentry) {
	k.tasks.mu.RLock()
	oldRootDecRefs := 0
	k.tasks.forEachTaskLocked(func(t *Task) {
		t.mu.Lock()
		defer t.mu.Unlock()
		if fsc := t.fsContext; fsc != nil {
			fsc.mu.Lock()
			defer fsc.mu.Unlock()
			if fsc.root == oldRoot {
				newRoot.IncRef()
				oldRootDecRefs++
				fsc.root = newRoot
			}
			if fsc.cwd == oldRoot {
				newRoot.IncRef()
				oldRootDecRefs++
				fsc.cwd = newRoot
			}
		}
	})
	k.tasks.mu.RUnlock()
	for i := 0; i < oldRootDecRefs; i++ {
		oldRoot.DecRef(ctx)
	}
}

// GetUserCounters returns the user counters for the given KUID.
func (k *Kernel) GetUserCounters(uid auth.KUID) *UserCounters {
	k.userCountersMapMu.Lock()
	defer k.userCountersMapMu.Unlock()

	if uc, ok := k.userCountersMap[uid]; ok {
		return uc
	}

	uc := &UserCounters{uid: uid}
	k.userCountersMap[uid] = uc
	return uc
}

// AddDevGofer initializes the dev gofer connection and starts tracking it.
// It takes ownership of goferFD.
func (k *Kernel) AddDevGofer(contName string, goferFD int) error {
	client, err := devutil.NewGoferClient(k.SupervisorContext(), contName, goferFD)
	if err != nil {
		return err
	}

	k.devGofersMu.Lock()
	defer k.devGofersMu.Unlock()
	if k.devGofers == nil {
		k.devGofers = make(map[string]*devutil.GoferClient)
	}
	k.devGofers[contName] = client
	return nil
}

// RemoveDevGofer closes the dev gofer connection, if one exists, and stops
// tracking it.
func (k *Kernel) RemoveDevGofer(contName string) {
	k.devGofersMu.Lock()
	defer k.devGofersMu.Unlock()
	client, ok := k.devGofers[contName]
	if !ok {
		return
	}
	client.Close()
	delete(k.devGofers, contName)
}

// GetDevGoferClient implements
// devutil.GoferClientProviderFromContext.GetDevGoferClient.
func (k *Kernel) GetDevGoferClient(contName string) *devutil.GoferClient {
	k.devGofersMu.Lock()
	defer k.devGofersMu.Unlock()
	return k.devGofers[contName]
}

func (k *Kernel) cleanupDevGofers() {
	k.devGofersMu.Lock()
	defer k.devGofersMu.Unlock()
	for _, client := range k.devGofers {
		client.Close()
	}
	k.devGofers = nil
}

// RegisterContainerName registers a container name for a given container ID.
func (k *Kernel) RegisterContainerName(cid, containerName string) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.containerNames[cid] = containerName
}
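
// Illustrative example (editor's sketch; trackDevGoferExample is a
// hypothetical helper, not part of this file): a dev gofer is registered once
// per container and torn down either explicitly when the container exits or
// in bulk by Kernel.Release.
func trackDevGoferExample(k *Kernel, contName string, goferFD int) error {
	// Per AddDevGofer's contract, it takes ownership of goferFD, so the
	// caller must not close the FD itself.
	if err := k.AddDevGofer(contName, goferFD); err != nil {
		return err
	}
	// ... later, when the container exits:
	k.RemoveDevGofer(contName)
	return nil
}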

// RestoreContainerMapping remaps old container IDs to new ones after a
// restore. containerIDs maps "name -> new container ID". Note that container
// names remain constant between restore sessions.
func (k *Kernel) RestoreContainerMapping(containerIDs map[string]string) {
	k.extMu.Lock()
	defer k.extMu.Unlock()

	// Delete mapping from old session and replace with new values.
	k.containerNames = make(map[string]string)
	for name, cid := range containerIDs {
		k.containerNames[cid] = name
	}
}

// ContainerName returns the container name for a given container ID.
func (k *Kernel) ContainerName(cid string) string {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return k.containerNames[cid]
}
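
// Illustrative example (editor's sketch; all IDs and names are hypothetical):
// container names are stable across save/restore, so a restored session
// rebuilds the ID->name table from a fresh name->ID map.
func remapContainersExample(k *Kernel) string {
	k.RegisterContainerName("cid-before-save", "app")
	// After restore, the runtime assigns a new container ID to "app".
	k.RestoreContainerMapping(map[string]string{"app": "cid-after-restore"})
	return k.ContainerName("cid-after-restore") // Returns "app".
}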