github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/kernel.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernel provides an emulation of the Linux kernel.
//
// See README.md for a detailed overview.
//
// Lock order (outermost locks must be taken first):
//
//	Kernel.extMu
//	  ThreadGroup.timerMu
//	    ktime.Timer.mu (for IntervalTimer) and Kernel.cpuClockMu
//	      TaskSet.mu
//	        SignalHandlers.mu
//	          Task.mu
//	            runningTasksMu
//
// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
// time requires locking all of their signal mutexes first.
package kernel

import (
	"errors"
	"fmt"
	"path/filepath"
	"time"

	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/cleanup"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/cpuid"
	"github.com/metacubex/gvisor/pkg/devutil"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/eventchannel"
	"github.com/metacubex/gvisor/pkg/fspath"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/refs"
	"github.com/metacubex/gvisor/pkg/sentry/arch"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/nsfs"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/pipefs"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/sockfs"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/timerfd"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/tmpfs"
	"github.com/metacubex/gvisor/pkg/sentry/hostcpu"
	"github.com/metacubex/gvisor/pkg/sentry/inet"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/futex"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/ipc"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/sched"
	ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time"
	"github.com/metacubex/gvisor/pkg/sentry/limits"
	"github.com/metacubex/gvisor/pkg/sentry/loader"
	"github.com/metacubex/gvisor/pkg/sentry/mm"
	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
	"github.com/metacubex/gvisor/pkg/sentry/platform"
	"github.com/metacubex/gvisor/pkg/sentry/socket/netlink/port"
	sentrytime "github.com/metacubex/gvisor/pkg/sentry/time"
	"github.com/metacubex/gvisor/pkg/sentry/unimpl"
	uspb "github.com/metacubex/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
	"github.com/metacubex/gvisor/pkg/sentry/uniqueid"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/state"
	"github.com/metacubex/gvisor/pkg/state/wire"
	"github.com/metacubex/gvisor/pkg/sync"
	"github.com/metacubex/gvisor/pkg/tcpip"
)
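
// The lock-order comment above can be read as: when code needs both an outer
// and an inner lock, the outer one must be taken first. An illustrative
// sketch of the SignalHandlers rule (not part of the original file; ts, sh1,
// and sh2 are hypothetical values):
//
//	ts.mu.Lock()   // TaskSet.mu, taken exclusively first...
//	sh1.mu.Lock()  // ...permits locking SignalHandlers.mu in
//	sh2.mu.Lock()  // multiple SignalHandlers at once.
//	sh2.mu.Unlock()
//	sh1.mu.Unlock()
//	ts.mu.Unlock()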

// IOUringEnabled is set to true when IO_URING is enabled. Added as a global to
// allow easy access everywhere.
var IOUringEnabled = false

// UserCounters is a set of user counters.
//
// +stateify savable
type UserCounters struct {
	uid auth.KUID

	rlimitNProc atomicbitops.Uint64
}

// incRLimitNProc increments the rlimitNProc counter.
func (uc *UserCounters) incRLimitNProc(ctx context.Context) error {
	lim := limits.FromContext(ctx).Get(limits.ProcessCount)
	creds := auth.CredentialsFromContext(ctx)
	nproc := uc.rlimitNProc.Add(1)
	if nproc > lim.Cur &&
		!creds.HasCapability(linux.CAP_SYS_ADMIN) &&
		!creds.HasCapability(linux.CAP_SYS_RESOURCE) {
		uc.rlimitNProc.Add(^uint64(0))
		return linuxerr.EAGAIN
	}
	return nil
}

// decRLimitNProc decrements the rlimitNProc counter.
func (uc *UserCounters) decRLimitNProc() {
	uc.rlimitNProc.Add(^uint64(0))
}
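
// Note on the Add(^uint64(0)) pattern above: there is no atomic-subtract
// method, so the code adds the two's complement of 1. ^uint64(0) is
// math.MaxUint64, and unsigned addition wraps modulo 2^64, so adding it is
// equivalent to subtracting 1. An illustrative sketch (not part of the
// original file):
//
//	var c atomicbitops.Uint64
//	c.Add(1)          // c == 1
//	c.Add(^uint64(0)) // wraps modulo 2^64: c == 0 again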

// CgroupMount contains the cgroup mount. These mounts are created for the root
// container by default and are stored in the kernel.
//
// +stateify savable
type CgroupMount struct {
	Fs    *vfs.Filesystem
	Root  *vfs.Dentry
	Mount *vfs.Mount
}

// Kernel represents an emulated Linux kernel. It must be initialized by
// calling Init() or LoadFrom().
//
// +stateify savable
type Kernel struct {
	// extMu serializes external changes to the Kernel with calls to
	// Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
	// remains frozen for the duration of the call; it requires that the Kernel
	// is paused as a precondition, which ensures that none of the tasks
	// running within the Kernel can affect its state, but extMu is required to
	// ensure that concurrent users of the Kernel *outside* the Kernel's
	// control cannot affect its state by calling e.g.
	// Kernel.SendExternalSignal.)
	extMu sync.Mutex `state:"nosave"`

	// started is true if Start has been called. Unless otherwise specified,
	// all Kernel fields become immutable once started becomes true.
	started bool `state:"nosave"`

	// All of the following fields are immutable unless otherwise specified.

	// Platform is the platform that is used to execute tasks in the created
	// Kernel.
	platform.Platform `state:"nosave"`

	// mf provides application memory.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// See InitKernelArgs for the meaning of these fields.
	featureSet           cpuid.FeatureSet
	timekeeper           *Timekeeper
	tasks                *TaskSet
	rootUserNamespace    *auth.UserNamespace
	rootNetworkNamespace *inet.Namespace
	applicationCores     uint
	useHostCores         bool
	extraAuxv            []arch.AuxEntry
	vdso                 *loader.VDSO
	rootUTSNamespace     *UTSNamespace
	rootIPCNamespace     *IPCNamespace

	// futexes is the "root" futex.Manager, from which all others are forked.
	// This is necessary to ensure that shared futexes are coherent across all
	// tasks, including those created by CreateProcess.
	futexes *futex.Manager

	// globalInit is the thread group whose leader has ID 1 in the root PID
	// namespace. globalInit is stored separately so that it is accessible even
	// after all tasks in the thread group have exited, such that ID 1 is no
	// longer mapped.
	//
	// globalInit is mutable until it is assigned by the first successful call
	// to CreateProcess, and is protected by extMu.
	globalInit *ThreadGroup

	// syslog is the kernel log.
	syslog syslog

	runningTasksMu runningTasksMutex `state:"nosave"`

	// runningTasks is the total count of tasks currently in
	// TaskGoroutineRunningSys or TaskGoroutineRunningApp, i.e. tasks that are
	// not blocked or stopped.
	//
	// runningTasks must be accessed atomically. Increments from 0 to 1 are
	// further protected by runningTasksMu (see incRunningTasks).
	runningTasks atomicbitops.Int64

	// runningTasksCond is signaled when runningTasks is incremented from 0 to 1.
	//
	// Invariant: runningTasksCond.L == &runningTasksMu.
	runningTasksCond sync.Cond `state:"nosave"`

	// cpuClock is incremented every linux.ClockTick by a goroutine running
	// kernel.runCPUClockTicker() while runningTasks != 0.
	//
	// cpuClock is used to measure task CPU usage, since sampling monotonicClock
	// twice on every syscall turns out to be unreasonably expensive. This is
	// similar to how Linux does task CPU accounting on x86
	// (CONFIG_IRQ_TIME_ACCOUNTING), although Linux also uses scheduler timing
	// information to improve resolution
	// (kernel/sched/cputime.c:cputime_adjust()), which we can't do since
	// "preemptive" scheduling is managed by the Go runtime, which doesn't
	// provide this information.
	//
	// cpuClock is mutable, and is accessed using atomic memory operations.
	cpuClock atomicbitops.Uint64

	// cpuClockTickTimer drives increments of cpuClock.
	cpuClockTickTimer *time.Timer `state:"nosave"`

	// cpuClockMu is used to make increments of cpuClock, and updates of timers
	// based on cpuClock, atomic.
	cpuClockMu cpuClockMutex `state:"nosave"`

	// cpuClockTickerRunning is true if the goroutine that increments cpuClock is
	// running and false if it is blocked in runningTasksCond.Wait() or if it
	// never started.
	//
	// cpuClockTickerRunning is protected by runningTasksMu.
	cpuClockTickerRunning bool

	// cpuClockTickerWakeCh is sent to in order to wake the goroutine that
	// increments cpuClock if it's sleeping between ticks.
	cpuClockTickerWakeCh chan struct{} `state:"nosave"`

	// cpuClockTickerStopCond is broadcast when cpuClockTickerRunning transitions
	// from true to false.
	//
	// Invariant: cpuClockTickerStopCond.L == &runningTasksMu.
	cpuClockTickerStopCond sync.Cond `state:"nosave"`

	// uniqueID is used to generate unique identifiers.
	//
	// uniqueID is mutable, and is accessed using atomic memory operations.
	uniqueID atomicbitops.Uint64

	// nextInotifyCookie is a monotonically increasing counter used for
	// generating unique inotify event cookies.
	//
	// nextInotifyCookie is mutable.
	nextInotifyCookie atomicbitops.Uint32

	// netlinkPorts manages allocation of netlink socket port IDs.
	netlinkPorts *port.Manager

	// saveStatus is nil if the sandbox has not been saved, errSaved or
	// errAutoSaved if it has been saved successfully, or the error causing the
	// sandbox to exit during save.
	// It is protected by extMu.
	saveStatus error `state:"nosave"`

	// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
	danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`

	// sockets records all network sockets in the system. Protected by extMu.
	sockets map[*vfs.FileDescription]*SocketRecord

	// nextSocketRecord is the next entry number to use in sockets. Protected
	// by extMu.
	nextSocketRecord uint64

	// unimplementedSyscallEmitterOnce is used in the initialization of
	// unimplementedSyscallEmitter.
	unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`

	// unimplementedSyscallEmitter is used to emit unimplemented syscall
	// events. This is initialized lazily on the first unimplemented
	// syscall.
	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`

	// SpecialOpts contains special kernel options.
	SpecialOpts

	// vfs keeps the filesystem state used across the kernel.
	vfs vfs.VirtualFilesystem

	// hostMount is the Mount used for file descriptors that were imported
	// from the host.
	hostMount *vfs.Mount

	// pipeMount is the Mount used for pipes created by the pipe() and pipe2()
	// syscalls (as opposed to named pipes created by mknod()).
	pipeMount *vfs.Mount

	// nsfsMount is the Mount used for namespaces.
	nsfsMount *vfs.Mount

	// shmMount is the Mount used for anonymous files created by the
	// memfd_create() syscall. It is analogous to Linux's shm_mnt.
	shmMount *vfs.Mount

	// socketMount is the Mount used for sockets created by the socket() and
	// socketpair() syscalls. There are several cases where a socket dentry will
	// not be contained in socketMount:
	//  1. Socket files created by mknod()
	//  2. Socket fds imported from the host (Kernel.hostMount is used for these)
	//  3. Socket files created by binding Unix sockets to a file path
	socketMount *vfs.Mount

	// sysVShmDevID is the device number used by SysV shm segments. In Linux,
	// SysV shm uses shmem_file_setup() and thus uses shm_mnt's device number.
	// In gVisor, the shm implementation does not use shmMount; extracting
	// shmMount's device number is inconvenient, applications accept a
	// different device number in practice, and using a distinct device number
	// avoids the possibility of inode number collisions due to the hack
	// described in shm.Shm.InodeID().
	sysVShmDevID uint32

	// If set to true, report address space activation waits as if the task is in
	// external wait so that the watchdog doesn't report the task stuck.
	SleepForAddressSpaceActivation bool

	// Exceptions to YAMA ptrace restrictions. Each key-value pair represents a
	// tracee-tracer relationship. The key is a process (technically, the thread
	// group leader) that can be traced by any thread that is a descendant of the
	// value. If the value is nil, then anyone can trace the process represented by
	// the key.
	//
	// ptraceExceptions is protected by the TaskSet mutex.
	ptraceExceptions map[*Task]*Task

	// YAMAPtraceScope is the current level of YAMA ptrace restrictions.
	YAMAPtraceScope atomicbitops.Int32

	// cgroupRegistry contains the set of active cgroup controllers on the
	// system. It is controlled by cgroupfs. Nil if cgroupfs is unavailable on
	// the system.
	cgroupRegistry *CgroupRegistry

	// cgroupMountsMap maps the cgroup controller names to the cgroup mounts
	// created for the root container. These mounts are then bind mounted
	// for other application containers by creating their own container
	// directories.
	cgroupMountsMap   map[string]*CgroupMount
	cgroupMountsMapMu cgroupMountsMutex `state:"nosave"`

	// userCountersMap maps auth.KUID into a set of user counters.
	userCountersMap   map[auth.KUID]*UserCounters
	userCountersMapMu userCountersMutex `state:"nosave"`

	// MaxFDLimit specifies the maximum file descriptor number that can be
	// used by processes.
	MaxFDLimit atomicbitops.Int32

	// devGofers maps container ID to its device gofer client.
	devGofers   map[string]*devutil.GoferClient `state:"nosave"`
	devGofersMu sync.Mutex                      `state:"nosave"`
}

// InitKernelArgs holds arguments to Init.
type InitKernelArgs struct {
	// FeatureSet is the emulated CPU feature set.
	FeatureSet cpuid.FeatureSet

	// Timekeeper manages time for all tasks in the system.
	Timekeeper *Timekeeper

	// RootUserNamespace is the root user namespace.
	RootUserNamespace *auth.UserNamespace

	// RootNetworkNamespace is the root network namespace. If nil, no networking
	// will be available.
	RootNetworkNamespace *inet.Namespace

	// ApplicationCores is the number of logical CPUs visible to sandboxed
	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
	// ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
	// most significant bit in cpu_possible_mask + 1.
	ApplicationCores uint

	// If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
	// instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
	// no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
	// will be overridden.
	UseHostCores bool

	// ExtraAuxv contains additional auxiliary vector entries that are added to
	// each process by the ELF loader.
	ExtraAuxv []arch.AuxEntry

	// Vdso holds the VDSO and its parameter page.
	Vdso *loader.VDSO

	// RootUTSNamespace is the root UTS namespace.
	RootUTSNamespace *UTSNamespace

	// RootIPCNamespace is the root IPC namespace.
	RootIPCNamespace *IPCNamespace

	// PIDNamespace is the root PID namespace.
	PIDNamespace *PIDNamespace

	// MaxFDLimit specifies the maximum file descriptor number that can be
	// used by processes. If it is zero, the limit will be set to
	// unlimited.
	MaxFDLimit int32
}
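
// An illustrative bootstrap sketch (not part of the original file; p, mf, tk,
// userns, and fs are hypothetical values obtained from the platform, pgalloc,
// Timekeeper, auth, and cpuid packages, and the remaining namespace fields
// are elided):
//
//	k := &Kernel{Platform: p} // Kernel.Platform must be set manually.
//	k.SetMemoryFile(mf)       // SetMemoryFile must precede Init.
//	if err := k.Init(InitKernelArgs{
//		FeatureSet:        fs,
//		Timekeeper:        tk, // tk.SetClocks() must already have been called.
//		RootUserNamespace: userns,
//		ApplicationCores:  4,
//		// ...
//	}); err != nil {
//		// handle err
//	}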

// Init initializes the Kernel with no tasks.
//
// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
// before calling Init.
func (k *Kernel) Init(args InitKernelArgs) error {
	if args.Timekeeper == nil {
		return fmt.Errorf("args.Timekeeper is nil")
	}
	if args.Timekeeper.clocks == nil {
		return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
	}
	if args.RootUserNamespace == nil {
		return fmt.Errorf("args.RootUserNamespace is nil")
	}
	if args.ApplicationCores == 0 {
		return fmt.Errorf("args.ApplicationCores is 0")
	}

	k.featureSet = args.FeatureSet
	k.timekeeper = args.Timekeeper
	k.tasks = newTaskSet(args.PIDNamespace)
	k.rootUserNamespace = args.RootUserNamespace
	k.rootUTSNamespace = args.RootUTSNamespace
	k.rootIPCNamespace = args.RootIPCNamespace
	k.rootNetworkNamespace = args.RootNetworkNamespace
	if k.rootNetworkNamespace == nil {
		k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil, args.RootUserNamespace)
	}
	k.runningTasksCond.L = &k.runningTasksMu
	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
	k.cpuClockTickerStopCond.L = &k.runningTasksMu
	k.applicationCores = args.ApplicationCores
	if args.UseHostCores {
		k.useHostCores = true
		maxCPU, err := hostcpu.MaxPossibleCPU()
		if err != nil {
			return fmt.Errorf("failed to get maximum CPU number: %v", err)
		}
		minAppCores := uint(maxCPU) + 1
		if k.applicationCores < minAppCores {
			log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
			k.applicationCores = minAppCores
		}
	}
	k.extraAuxv = args.ExtraAuxv
	k.vdso = args.Vdso
	k.futexes = futex.NewManager()
	k.netlinkPorts = port.New()
	k.ptraceExceptions = make(map[*Task]*Task)
	k.YAMAPtraceScope = atomicbitops.FromInt32(linux.YAMA_SCOPE_RELATIONAL)
	k.userCountersMap = make(map[auth.KUID]*UserCounters)
	if args.MaxFDLimit == 0 {
		args.MaxFDLimit = MaxFdLimit
	}
	k.MaxFDLimit.Store(args.MaxFDLimit)

	ctx := k.SupervisorContext()
	if err := k.vfs.Init(ctx); err != nil {
		return fmt.Errorf("failed to initialize VFS: %v", err)
	}

	err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx))
	if err != nil {
		return fmt.Errorf("failed to create mqfs filesystem: %v", err)
	}

	pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create pipefs filesystem: %v", err)
	}
	defer pipeFilesystem.DecRef(ctx)
	pipeMount := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
	k.pipeMount = pipeMount

	nsfsFilesystem, err := nsfs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create nsfs filesystem: %v", err)
	}
	defer nsfsFilesystem.DecRef(ctx)
	k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
	k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace))
	k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
	k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))

	tmpfsOpts := vfs.GetFilesystemOptions{
		InternalData: tmpfs.FilesystemOpts{
			// See mm/shmem.c:shmem_init() => vfs_kern_mount(flags=SB_KERNMOUNT).
			// Note how mm/shmem.c:shmem_fill_super() does not provide a default
			// value for sbinfo->max_blocks when SB_KERNMOUNT is set.
			DisableDefaultSizeLimit: true,
		},
		InternalMount: true,
	}
	tmpfsFilesystem, tmpfsRoot, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace), "", tmpfsOpts)
	if err != nil {
		return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
	}
	defer tmpfsFilesystem.DecRef(ctx)
	defer tmpfsRoot.DecRef(ctx)
	k.shmMount = k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})

	socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
	if err != nil {
		return fmt.Errorf("failed to create sockfs filesystem: %v", err)
	}
	defer socketFilesystem.DecRef(ctx)
	k.socketMount = k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})

	sysVShmDevMinor, err := k.vfs.GetAnonBlockDevMinor()
	if err != nil {
		return fmt.Errorf("failed to get device number for SysV shm: %v", err)
	}
	k.sysVShmDevID = linux.MakeDeviceID(linux.UNNAMED_MAJOR, sysVShmDevMinor)

	k.sockets = make(map[*vfs.FileDescription]*SocketRecord)

	k.cgroupRegistry = newCgroupRegistry()
	return nil
}

// +stateify savable
type privateMemoryFileMetadata struct {
	owners []string
}

func savePrivateMFs(ctx context.Context, w wire.Writer, mfsToSave map[string]*pgalloc.MemoryFile) error {
	var meta privateMemoryFileMetadata
	// Generate the order in which private memory files are saved.
	for fsID := range mfsToSave {
		meta.owners = append(meta.owners, fsID)
	}
	// Save the metadata.
	if _, err := state.Save(ctx, w, &meta); err != nil {
		return err
	}
	// Followed by the private memory files in order.
	for _, fsID := range meta.owners {
		if err := mfsToSave[fsID].SaveTo(ctx, w); err != nil {
			return err
		}
	}
	return nil
}

func loadPrivateMFs(ctx context.Context, r wire.Reader) error {
	// Load the metadata.
	var meta privateMemoryFileMetadata
	if _, err := state.Load(ctx, r, &meta); err != nil {
		return err
	}
	mfmap := pgalloc.MemoryFileMapFromContext(ctx)
	// Ensure that it is consistent with CtxFilesystemMemoryFileMap.
	if len(mfmap) != len(meta.owners) {
		return fmt.Errorf("inconsistent private memory files on restore: savedMFOwners = %v, CtxFilesystemMemoryFileMap = %v", meta.owners, mfmap)
	}
	// Load all private memory files.
	for _, fsID := range meta.owners {
		mf, ok := mfmap[fsID]
		if !ok {
			return fmt.Errorf("saved memory file for %q was not configured on restore", fsID)
		}
		if err := mf.LoadFrom(ctx, r); err != nil {
			return err
		}
	}
	return nil
}
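
// The stream layout produced by savePrivateMFs, and consumed in the same
// order by loadPrivateMFs (an illustrative summary, not part of the original
// file):
//
//	[privateMemoryFileMetadata{owners: [fsID1, fsID2, ...]}]
//	[MemoryFile state for fsID1]
//	[MemoryFile state for fsID2]
//	...
//
// Because meta.owners records the iteration order used at save time, restore
// can match each saved MemoryFile to its filesystem even though Go map
// iteration order is randomized.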

// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
	saveStart := time.Now()

	// Do not allow other Kernel methods to affect it while it's being saved.
	k.extMu.Lock()
	defer k.extMu.Unlock()

	// Stop time.
	k.pauseTimeLocked(ctx)
	defer k.resumeTimeLocked(ctx)

	// Evict all evictable MemoryFile allocations.
	k.mf.StartEvictions()
	k.mf.WaitForEvictions()

	// Discard unsavable mappings, such as those for host file descriptors.
	if err := k.invalidateUnsavableMappings(ctx); err != nil {
		return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
	}

	// Capture all private memory files.
	mfsToSave := make(map[string]*pgalloc.MemoryFile)
	vfsCtx := context.WithValue(ctx, pgalloc.CtxMemoryFileMap, mfsToSave)
	// Prepare filesystems for saving. This must be done after
	// invalidateUnsavableMappings(), since dropping memory mappings may
	// affect filesystem state (e.g. page cache reference counts).
	if err := k.vfs.PrepareSave(vfsCtx); err != nil {
		return err
	}
	// Mark all to-be-saved MemoryFiles as savable to inform kernel save below.
	k.mf.MarkSavable()
	for _, mf := range mfsToSave {
		mf.MarkSavable()
	}

	// Save the CPUID FeatureSet before the rest of the kernel so we can
	// verify its compatibility on restore before attempting to restore the
	// entire kernel, which may fail on an incompatible machine.
	//
	// N.B. This will also be saved along with the full kernel save below.
	cpuidStart := time.Now()
	if _, err := state.Save(ctx, w, &k.featureSet); err != nil {
		return err
	}
	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))

	if rootNS := k.rootNetworkNamespace; rootNS != nil && rootNS.Stack() != nil {
		// Pause the network stack.
		netstackPauseStart := time.Now()
		log.Infof("Pausing root network namespace")
		k.rootNetworkNamespace.Stack().Pause()
		defer k.rootNetworkNamespace.Stack().Resume()
		log.Infof("Pausing root network namespace took [%s].", time.Since(netstackPauseStart))
	}

	// Save the kernel state.
	kernelStart := time.Now()
	stats, err := state.Save(ctx, w, k)
	if err != nil {
		return err
	}
	log.Infof("Kernel save stats: %s", stats.String())
	log.Infof("Kernel save took [%s].", time.Since(kernelStart))

	// Save the memory files' state.
	memoryStart := time.Now()
	if err := k.mf.SaveTo(ctx, w); err != nil {
		return err
	}
	if err := savePrivateMFs(ctx, w, mfsToSave); err != nil {
		return err
	}
	log.Infof("Memory files save took [%s].", time.Since(memoryStart))

	log.Infof("Overall save took [%s].", time.Since(saveStart))

	return nil
}
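
// A plausible checkpoint sequence satisfying SaveTo's precondition (an
// illustrative sketch, not part of the original file; ctx and w are
// hypothetical values):
//
//	k.Pause() // Stop all tasks and async I/O before saving.
//	defer k.Unpause()
//	k.ReceiveTaskStates() // Pull full task states before serializing.
//	if err := k.SaveTo(ctx, w); err != nil {
//		// handle err
//	}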

// Preconditions: The kernel must be paused.
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
	invalidated := make(map[*mm.MemoryManager]struct{})
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()
	for t := range k.tasks.Root.tids {
		// We can skip locking Task.mu here since the kernel is paused.
		if memMgr := t.image.MemoryManager; memMgr != nil {
			if _, ok := invalidated[memMgr]; !ok {
				if err := memMgr.InvalidateUnsavable(ctx); err != nil {
					return err
				}
				invalidated[memMgr] = struct{}{}
			}
		}
		// I really wish we just had a sync.Map of all MMs...
		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
			if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
				return err
			}
		}
	}
	return nil
}

// LoadFrom loads the state of k from r.
func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
	loadStart := time.Now()

	k.runningTasksCond.L = &k.runningTasksMu
	k.cpuClockTickerWakeCh = make(chan struct{}, 1)
	k.cpuClockTickerStopCond.L = &k.runningTasksMu

	initAppCores := k.applicationCores

	// Load the pre-saved CPUID FeatureSet.
	//
	// N.B. This was also saved along with the full kernel below, so we
	// don't need to explicitly install it in the Kernel.
	cpuidStart := time.Now()
	if _, err := state.Load(ctx, r, &k.featureSet); err != nil {
		return err
	}
	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))

	// Verify that the FeatureSet is usable on this host. We do this before
	// Kernel load so that the explicit CPUID mismatch error has priority
	// over floating point state restore errors that may occur on load on
	// an incompatible machine.
	if err := k.featureSet.CheckHostCompatible(); err != nil {
		return err
	}

	// Load the kernel state.
	kernelStart := time.Now()
	stats, err := state.Load(ctx, r, k)
	if err != nil {
		return err
	}
	log.Infof("Kernel load stats: %s", stats.String())
	log.Infof("Kernel load took [%s].", time.Since(kernelStart))

	// rootNetworkNamespace should be populated after loading the state file.
	// Restore the root network stack.
	k.rootNetworkNamespace.RestoreRootStack(net)

	// Load the memory files' state.
	memoryStart := time.Now()
	if err := k.mf.LoadFrom(ctx, r); err != nil {
		return err
	}
	if err := loadPrivateMFs(ctx, r); err != nil {
		return err
	}
	log.Infof("Memory files load took [%s].", time.Since(memoryStart))

	log.Infof("Overall load took [%s]", time.Since(loadStart))

	k.Timekeeper().SetClocks(clocks)

	if timeReady != nil {
		close(timeReady)
	}

	if net != nil {
		net.Restore()
	}

	if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
		return err
	}

	tcpip.AsyncLoading.Wait()

	log.Infof("Overall load took [%s] after async work", time.Since(loadStart))

	// Applications may size per-cpu structures based on k.applicationCores, so
	// it can't change across save/restore. When we are virtualizing CPU
	// numbers, this isn't a problem. However, when we are exposing host CPU
	// assignments, we can't tolerate an increase in the number of host CPUs,
	// which could result in getcpu(2) returning CPUs that applications expect
	// not to exist.
	if k.useHostCores && initAppCores > k.applicationCores {
		return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
	}

	return nil
}
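
// An illustrative restore sequence mirroring the checkpoint above (not part
// of the original file; p, mf, r, stack, clocks, and vfsOpts are hypothetical
// values):
//
//	k := &Kernel{Platform: p}
//	k.SetMemoryFile(mf) // SetMemoryFile must precede LoadFrom.
//	timeReady := make(chan struct{})
//	if err := k.LoadFrom(ctx, r, timeReady, stack, clocks, vfsOpts); err != nil {
//		// handle err
//	}
//	// LoadFrom closes timeReady once clocks are set; tasks can then be
//	// resumed with k.Start().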

// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
	id := k.uniqueID.Add(1)
	if id == 0 {
		panic("unique identifier generator wrapped around")
	}
	return id
}

// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
	// Filename is the filename to load as the init binary.
	//
	// If this is provided as "", File will be checked, then the file will be
	// guessed via Argv[0].
	Filename string

	// File is a passed host FD pointing to a file to load as the init binary.
	//
	// This is checked if and only if Filename is "".
	File *vfs.FileDescription

	// Argv is a list of arguments.
	Argv []string

	// Envv is a list of environment variables.
	Envv []string

	// WorkingDirectory is the initial working directory.
	//
	// This defaults to the root if empty.
	WorkingDirectory string

	// Credentials is the initial credentials.
	Credentials *auth.Credentials

	// FDTable is the initial set of file descriptors. If CreateProcess
	// succeeds, it takes a reference on FDTable.
	FDTable *FDTable

	// Umask is the initial umask.
	Umask uint

	// Limits are the initial resource limits.
	Limits *limits.LimitSet

	// MaxSymlinkTraversals is the maximum number of symlinks to follow
	// during resolution.
	MaxSymlinkTraversals uint

	// UTSNamespace is the initial UTS namespace.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the initial IPC namespace.
	IPCNamespace *IPCNamespace

	// PIDNamespace is the initial PID namespace.
	PIDNamespace *PIDNamespace

	// MountNamespace optionally contains the mount namespace for this
	// process. If nil, the init process's mount namespace is used.
	//
	// Anyone setting MountNamespace must donate a reference (i.e.
	// increment it).
	MountNamespace *vfs.MountNamespace

	// ContainerID is the container that the process belongs to.
	ContainerID string

	// InitialCgroups are the cgroups the container is initialized to.
	InitialCgroups map[Cgroup]struct{}
}

// NewContext returns a context.Context that represents the process that will
// be created by CreateProcess with these args.
func (args *CreateProcessArgs) NewContext(k *Kernel) context.Context {
	return &createProcessContext{
		Context: context.Background(),
		kernel:  k,
		args:    args,
	}
}

// createProcessContext is a context.Context that represents the context
// associated with a task that is being created.
type createProcessContext struct {
	context.Context
	kernel *Kernel
	args   *CreateProcessArgs
}

// Value implements context.Context.Value.
func (ctx *createProcessContext) Value(key any) any {
	switch key {
	case CtxKernel:
		return ctx.kernel
	case CtxPIDNamespace:
		return ctx.args.PIDNamespace
	case CtxUTSNamespace:
		utsns := ctx.args.UTSNamespace
		utsns.IncRef()
		return utsns
	case ipc.CtxIPCNamespace:
		ipcns := ctx.args.IPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		return ctx.args.Credentials
	case vfs.CtxRoot:
		if ctx.args.MountNamespace == nil {
			return nil
		}
		root := ctx.args.MountNamespace.Root(ctx)
		return root
	case vfs.CtxMountNamespace:
		if ctx.kernel.globalInit == nil {
			return nil
		}
		mntns := ctx.kernel.GlobalInit().Leader().MountNamespace()
		mntns.IncRef()
		return mntns
	case devutil.CtxDevGoferClient:
		return ctx.kernel.getDevGoferClient(ctx.args.ContainerID)
	case inet.CtxStack:
		return ctx.kernel.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.kernel.RealtimeClock()
	case limits.CtxLimits:
		return ctx.args.Limits
	case pgalloc.CtxMemoryCgroupID:
		return ctx.getMemoryCgroupID()
	case pgalloc.CtxMemoryFile:
		return ctx.kernel.mf
	case platform.CtxPlatform:
		return ctx.kernel
	case uniqueid.CtxGlobalUniqueID:
		return ctx.kernel.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.kernel
	case uniqueid.CtxInotifyCookie:
		return ctx.kernel.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.kernel
	default:
		return nil
	}
}

func (ctx *createProcessContext) getMemoryCgroupID() uint32 {
	for cg := range ctx.args.InitialCgroups {
		for _, ctl := range cg.Controllers() {
			if ctl.Type() == CgroupControllerMemory {
				return cg.ID()
			}
		}
	}
	return InvalidCgroupID
}

// CreateProcess creates a new task in a new thread group with the given
// options. The new task has no parent and is in the root PID namespace.
//
// If k.Start() has already been called, then the created process must be
// started by calling kernel.StartProcess(tg).
//
// If k.Start() has not yet been called, then the created task will begin
// running when k.Start() is called.
//
// CreateProcess has no analogue in Linux; it is used to create the initial
// application task, as well as processes started by the control server.
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	log.Infof("EXEC: %v", args.Argv)

	ctx := args.NewContext(k)
	mntns := args.MountNamespace
	if mntns == nil {
		if k.globalInit == nil {
			return nil, 0, fmt.Errorf("mount namespace is nil")
		}
		// Add a reference to the namespace, which is transferred to the new process.
		mntns = k.globalInit.Leader().MountNamespace()
		mntns.IncRef()
	}
	// Get the root directory from the MountNamespace.
	root := mntns.Root(ctx)
	defer root.DecRef(ctx)

	// Grab the working directory.
	wd := root // Default.
	if args.WorkingDirectory != "" {
		pop := vfs.PathOperation{
			Root:               root,
			Start:              wd,
			Path:               fspath.Parse(args.WorkingDirectory),
			FollowFinalSymlink: true,
		}
		// NOTE(b/236028361): Do not set CheckSearchable flag to true.
		// Application is allowed to start with a working directory that it can
		// not access/search. This is consistent with Docker and VFS1. Runc
		// explicitly allows for this in 6ce2d63a5db6 ("libct/init_linux: retry
		// chdir to fix EPERM"). As described in the commit, runc unintentionally
		// allowed this behavior in a couple of releases and applications started
		// relying on it. So they decided to allow it for backward compatibility.
		var err error
		wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{})
		if err != nil {
			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
		}
		defer wd.DecRef(ctx)
	}
	fsContext := NewFSContext(root, wd, args.Umask)

	tg := k.NewThreadGroup(args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
	cu := cleanup.Make(func() {
		tg.Release(ctx)
	})
	defer cu.Clean()

	// Check which file to start from.
	switch {
	case args.Filename != "":
		// If a filename is given, take that.
		// Set File to nil so we resolve the path in LoadTaskImage.
		args.File = nil
	case args.File != nil:
		// If File is set, take the File provided directly.
		args.Filename = args.File.MappedName(ctx)
	default:
		// Otherwise look at Argv and see if the first argument is a valid path.
		if len(args.Argv) == 0 {
			return nil, 0, fmt.Errorf("no filename or command provided")
		}
		if !filepath.IsAbs(args.Argv[0]) {
			return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
		}
		args.Filename = args.Argv[0]
	}

	// Create a fresh task context.
	remainingTraversals := args.MaxSymlinkTraversals
	loadArgs := loader.LoadArgs{
		Root:                root,
		WorkingDir:          wd,
		RemainingTraversals: &remainingTraversals,
		ResolveFinal:        true,
		Filename:            args.Filename,
		File:                args.File,
		CloseOnExec:         false,
		Argv:                args.Argv,
		Envv:                args.Envv,
		Features:            k.featureSet,
	}

	image, se := k.LoadTaskImage(ctx, loadArgs)
	if se != nil {
		return nil, 0, errors.New(se.String())
	}
	var capData auth.VfsCapData
	if len(image.FileCaps()) != 0 {
		var err error
		capData, err = auth.VfsCapDataOf([]byte(image.FileCaps()))
		if err != nil {
			return nil, 0, err
		}
	}
	creds, err := auth.CapsFromVfsCaps(capData, args.Credentials)
	if err != nil {
		return nil, 0, err
	}
	args.FDTable.IncRef()

	// Create the task.
	config := &TaskConfig{
		Kernel:           k,
		ThreadGroup:      tg,
		TaskImage:        image,
		FSContext:        fsContext,
		FDTable:          args.FDTable,
		Credentials:      creds,
		NetworkNamespace: k.RootNetworkNamespace(),
		AllowedCPUMask:   sched.NewFullCPUSet(k.applicationCores),
		UTSNamespace:     args.UTSNamespace,
		IPCNamespace:     args.IPCNamespace,
		MountNamespace:   mntns,
		ContainerID:      args.ContainerID,
		InitialCgroups:   args.InitialCgroups,
		UserCounters:     k.GetUserCounters(args.Credentials.RealKUID),
		// A task with no parent starts out with no session keyring.
		SessionKeyring: nil,
	}
	config.UTSNamespace.IncRef()
	config.IPCNamespace.IncRef()
	config.NetworkNamespace.IncRef()
	t, err := k.tasks.NewTask(ctx, config)
	if err != nil {
		return nil, 0, err
	}
	t.traceExecEvent(image) // Simulate exec for tracing.

	// Success.
	cu.Release()
	tgid := k.tasks.Root.IDOfThreadGroup(tg)
	if k.globalInit == nil {
		k.globalInit = tg
	}
	return tg, tgid, nil
}

// StartProcess starts running a process that was created with CreateProcess.
func (k *Kernel) StartProcess(tg *ThreadGroup) {
	t := tg.Leader()
	tid := k.tasks.Root.IDOfTask(t)
	t.Start(tid)
}
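
// An illustrative sketch of creating and starting a process on an
// already-started kernel (not part of the original file; creds, fdTable, and
// lims are hypothetical values):
//
//	tg, _, err := k.CreateProcess(CreateProcessArgs{
//		Argv:         []string{"/bin/true"}, // Filename is guessed from Argv[0].
//		Credentials:  creds,
//		FDTable:      fdTable,
//		Limits:       lims,
//		UTSNamespace: k.RootUTSNamespace(),
//		IPCNamespace: k.RootIPCNamespace(),
//		PIDNamespace: k.RootPIDNamespace(),
//	})
//	if err != nil {
//		// handle err
//	}
//	k.StartProcess(tg) // Required because k.Start() was already called.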

// Start starts execution of all tasks in k.
//
// Preconditions: Start may be called exactly once.
func (k *Kernel) Start() error {
	k.extMu.Lock()
	defer k.extMu.Unlock()

	if k.started {
		return fmt.Errorf("kernel already started")
	}

	k.started = true
	k.cpuClockTickTimer = time.NewTimer(linux.ClockTick)
	k.runningTasksMu.Lock()
	k.cpuClockTickerRunning = true
	k.runningTasksMu.Unlock()
	go k.runCPUClockTicker()
	// If k was created by LoadKernelFrom, timers were stopped during
	// Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
	// this is a no-op.
	k.resumeTimeLocked(k.SupervisorContext())
	k.tasks.mu.RLock()
	ts := make([]*Task, 0, len(k.tasks.Root.tids))
	for t := range k.tasks.Root.tids {
		ts = append(ts, t)
	}
	k.tasks.mu.RUnlock()
	// Start task goroutines.
	// NOTE(b/235349091): We don't actually need the TaskSet mutex, we just
	// need to make sure we only call t.Start() once for each task. Holding the
	// mutex for each task start may cause a nested locking error.
	for _, t := range ts {
		t.Start(t.ThreadID())
	}
	return nil
}

// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
// Preconditions:
//   - Any task goroutines running in k must be stopped.
//   - k.extMu must be locked.
func (k *Kernel) pauseTimeLocked(ctx context.Context) {
	// Since all task goroutines have been stopped by precondition, the CPU clock
	// ticker should stop on its own; wait for it to do so, waking it up from
	// sleeping between ticks if necessary.
	k.runningTasksMu.Lock()
	for k.cpuClockTickerRunning {
		select {
		case k.cpuClockTickerWakeCh <- struct{}{}:
		default:
		}
		k.cpuClockTickerStopCond.Wait()
	}
	k.runningTasksMu.Unlock()

	// By precondition, nothing else can be interacting with PIDNamespace.tids
	// or FDTable.files, so we can iterate them without synchronization. (We
	// can't hold the TaskSet mutex when pausing thread group timers because
	// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
	// mutex, while holding the Timer mutex.)
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Pause()
			for _, it := range t.tg.timers {
				it.PauseTimer()
			}
		}
		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
		// but ktime.Timer.Pause is idempotent so this is harmless.
		if t.fdTable != nil {
			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
					tfd.PauseTimer()
				}
			})
		}
	}
	k.timekeeper.PauseUpdates()
}

// resumeTimeLocked resumes all Timers and Timekeeper updates. If
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
// Preconditions:
//   - Any task goroutines running in k must be stopped.
//   - k.extMu must be locked.
func (k *Kernel) resumeTimeLocked(ctx context.Context) {
	// The CPU clock ticker will automatically resume as task goroutines resume
	// execution.

	k.timekeeper.ResumeUpdates()
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Resume()
			for _, it := range t.tg.timers {
				it.ResumeTimer()
			}
		}
		if t.fdTable != nil {
			t.fdTable.forEach(ctx, func(_ int32, fd *vfs.FileDescription, _ FDFlags) {
				if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
					tfd.ResumeTimer()
				}
			})
		}
	}
}

func (k *Kernel) incRunningTasks() {
	for {
		tasks := k.runningTasks.Load()
		if tasks != 0 {
			// Standard case. Simply increment.
			if !k.runningTasks.CompareAndSwap(tasks, tasks+1) {
				continue
			}
			return
		}

		// Transition from 0 -> 1.
		k.runningTasksMu.Lock()
		if k.runningTasks.Load() != 0 {
			// Raced with another transition and lost.
			k.runningTasks.Add(1)
			k.runningTasksMu.Unlock()
			return
		}
		if !k.cpuClockTickerRunning {
			select {
			case tickTime := <-k.cpuClockTickTimer.C:
				// Rearm the timer since we consumed the wakeup. Estimate how much time
				// remains on the current tick so that periodic workloads interact with
				// the (periodic) CPU clock ticker in the same way that they would
				// without the optimization of putting the ticker to sleep.
				missedNS := time.Since(tickTime).Nanoseconds()
				missedTicks := missedNS / linux.ClockTick.Nanoseconds()
				thisTickNS := missedNS - missedTicks*linux.ClockTick.Nanoseconds()
				k.cpuClockTickTimer.Reset(time.Duration(linux.ClockTick.Nanoseconds() - thisTickNS))
				// Increment k.cpuClock on the CPU clock ticker goroutine's behalf.
				// (Whole missed ticks don't matter, and adding them to k.cpuClock will
				// just confuse the watchdog.) At the time the tick occurred, all task
				// goroutines were asleep, so there's nothing else to do. This ensures
				// that our caller (Task.accountTaskGoroutineLeave()) records an
				// updated k.cpuClock in Task.gosched.Timestamp, so that it's correctly
				// accounted as having resumed execution in the sentry during this tick
				// instead of at the end of the previous one.
				k.cpuClock.Add(1)
			default:
			}
			// We are transitioning from idle to active. Set k.cpuClockTickerRunning
			// = true here so that if we transition to idle and then active again
			// before the CPU clock ticker goroutine has a chance to run, the first
			// call to k.incRunningTasks() at the end of that cycle does not try to
			// steal k.cpuClockTickTimer.C again, as this would allow workloads that
			// rapidly cycle between idle and active to starve the CPU clock ticker
			// of chances to observe task goroutines in a running state and account
			// their CPU usage.
			k.cpuClockTickerRunning = true
			k.runningTasksCond.Signal()
		}
		// This store must happen after the increment of k.cpuClock above to ensure
		// that concurrent calls to Task.accountTaskGoroutineLeave() also observe
		// the updated k.cpuClock.
		k.runningTasks.Store(1)
		k.runningTasksMu.Unlock()
		return
	}
}
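
// A worked example of the rearm arithmetic above (not part of the original
// file), assuming linux.ClockTick is 10ms and the consumed tick fired 25ms
// ago:
//
//	missedNS    = 25_000_000
//	missedTicks = 25_000_000 / 10_000_000 = 2    // whole ticks, discarded
//	thisTickNS  = 25_000_000 - 2*10_000_000 = 5_000_000
//	Reset(10ms - 5ms) = Reset(5ms)               // next tick stays on the
//	                                             // original 10ms grid
//
// Only one increment of k.cpuClock is recorded for the consumed wakeup;
// crediting the whole missed ticks would only confuse the watchdog.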

func (k *Kernel) decRunningTasks() {
	tasks := k.runningTasks.Add(-1)
	if tasks < 0 {
		panic(fmt.Sprintf("Invalid running count %d", tasks))
	}

	// Nothing to do. The next CPU clock tick will disable the timer if
	// there is still nothing running. This provides approximately one tick
	// of slack in which we can switch back and forth between idle and
	// active without an expensive transition.
}

// WaitExited blocks until all tasks in k have exited.
func (k *Kernel) WaitExited() {
	k.tasks.liveGoroutines.Wait()
}

// Kill requests that all tasks in k immediately exit as if group exiting with
// status ws. Kill does not wait for tasks to exit.
func (k *Kernel) Kill(ws linux.WaitStatus) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.Kill(ws)
}

// Pause requests that all tasks in k temporarily stop executing, and blocks
// until all tasks and asynchronous I/O operations in k have stopped. Multiple
// calls to Pause nest and require an equal number of calls to Unpause to
// resume execution.
func (k *Kernel) Pause() {
	k.extMu.Lock()
	k.tasks.BeginExternalStop()
	k.extMu.Unlock()
	k.tasks.runningGoroutines.Wait()
	k.tasks.aioGoroutines.Wait()
}

// ReceiveTaskStates receives full states for all tasks.
func (k *Kernel) ReceiveTaskStates() {
	k.extMu.Lock()
	k.tasks.PullFullState()
	k.extMu.Unlock()
}

// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.EndExternalStop()
}

// SendExternalSignal injects a signal into the kernel.
//
// context is used only for debugging to describe how the signal was received.
//
// Preconditions: Kernel must have an init process.
func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.sendExternalSignal(info, context)
}

// SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup.
//
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return tg.SendSignal(info)
}

// SendExternalSignalProcessGroup sends a signal to all ThreadGroups in the
// given process group.
//
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalProcessGroup(pg *ProcessGroup, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	// If anything goes wrong, we'll return the error, but still try our
	// best to deliver to other processes in the group.
	var firstErr error
	for _, tg := range k.TaskSet().Root.ThreadGroups() {
		if tg.ProcessGroup() != pg {
			continue
		}
		if err := tg.SendSignal(info); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}

// SendContainerSignal sends the given signal to all processes inside the
// namespace that match the given container ID.
func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()

	var lastErr error
	for tg := range k.tasks.Root.tgids {
		if tg.leader.ContainerID() == cid {
			tg.signalHandlers.mu.Lock()
			infoCopy := *info
			if err := tg.leader.sendSignalLocked(&infoCopy, true /* group */); err != nil {
				lastErr = err
			}
			tg.signalHandlers.mu.Unlock()
		}
	}
	return lastErr
}

// RebuildTraceContexts rebuilds the trace context for all tasks.
//
// Unfortunately, if these are built while tracing is not enabled, then we will
// not have meaningful trace data. Rebuilding here ensures that we can do so
// after tracing has been enabled.
func (k *Kernel) RebuildTraceContexts() {
	// We need to pause all task goroutines because Task.rebuildTraceContext()
	// replaces Task.traceContext and Task.traceTask, which are
	// task-goroutine-exclusive (i.e. the task goroutine assumes that it can
	// access them without synchronization) for performance.
	k.Pause()
	defer k.Unpause()

	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()

	for t, tid := range k.tasks.Root.tids {
		t.rebuildTraceContext(tid)
	}
}

// FeatureSet returns the FeatureSet.
func (k *Kernel) FeatureSet() cpuid.FeatureSet {
	return k.featureSet
}

// Timekeeper returns the Timekeeper.
func (k *Kernel) Timekeeper() *Timekeeper {
	return k.timekeeper
}

// TaskSet returns the TaskSet.
func (k *Kernel) TaskSet() *TaskSet {
	return k.tasks
}

// RootUserNamespace returns the root UserNamespace.
func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
	return k.rootUserNamespace
}

// RootUTSNamespace returns the root UTSNamespace.
func (k *Kernel) RootUTSNamespace() *UTSNamespace {
	return k.rootUTSNamespace
}

// RootIPCNamespace takes a reference and returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
	return k.rootIPCNamespace
}

// RootPIDNamespace returns the root PIDNamespace.
func (k *Kernel) RootPIDNamespace() *PIDNamespace {
	return k.tasks.Root
}

// RootNetworkNamespace returns the root network namespace, always non-nil.
func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
	return k.rootNetworkNamespace
}

// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
// nil if no such thread group exists. GlobalInit may return a thread group
// containing no tasks if the thread group has already exited.
func (k *Kernel) GlobalInit() *ThreadGroup {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return k.globalInit
}

// TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace.
func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) {
	k.globalInit = tg
}

// ApplicationCores returns the number of CPUs visible to sandboxed
// applications.
func (k *Kernel) ApplicationCores() uint {
	return k.applicationCores
}

// RealtimeClock returns the application CLOCK_REALTIME clock.
func (k *Kernel) RealtimeClock() ktime.Clock {
	return k.timekeeper.realtimeClock
}

// MonotonicClock returns the application CLOCK_MONOTONIC clock.
func (k *Kernel) MonotonicClock() ktime.Clock {
	return k.timekeeper.monotonicClock
}

// CPUClockNow returns the current value of k.cpuClock.
func (k *Kernel) CPUClockNow() uint64 {
	return k.cpuClock.Load()
}

// Syslog returns the syslog.
func (k *Kernel) Syslog() *syslog {
	return &k.syslog
}

// GenerateInotifyCookie generates a unique inotify event cookie.
//
// Returned values may overlap with previously returned values if the value
// space is exhausted. 0 is not a valid cookie value; all other values
// representable in a uint32 are allowed.
func (k *Kernel) GenerateInotifyCookie() uint32 {
	id := k.nextInotifyCookie.Add(1)
	// Wrap-around is explicitly allowed for inotify event cookies.
	if id == 0 {
		id = k.nextInotifyCookie.Add(1)
	}
	return id
}
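
// An illustrative trace of the wrap-around handling above (not part of the
// original file):
//
//	// Suppose k.nextInotifyCookie currently holds math.MaxUint32.
//	id := k.nextInotifyCookie.Add(1) // wraps to 0, which is reserved
//	// id == 0, so the function calls Add(1) once more and returns 1.
//
// Callers therefore never observe a zero cookie, at the cost of one extra
// atomic add once per 2^32 cookies.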

// NetlinkPorts returns the netlink port manager.
func (k *Kernel) NetlinkPorts() *port.Manager {
	return k.netlinkPorts
}

var (
	errSaved     = errors.New("sandbox has been successfully saved")
	errAutoSaved = errors.New("sandbox has been successfully auto-saved")
)

// SaveStatus returns the sandbox save status. If it was saved successfully,
// autosaved indicates whether save was triggered by autosave. If it was not
// saved successfully, err indicates the sandbox error that caused the kernel to
// exit during save.
func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	switch k.saveStatus {
	case nil:
		return false, false, nil
	case errSaved:
		return true, false, nil
	case errAutoSaved:
		return true, true, nil
	default:
		return false, false, k.saveStatus
	}
}

// SetSaveSuccess sets the flag indicating that save completed successfully, if
// no status was already set.
func (k *Kernel) SetSaveSuccess(autosave bool) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	if k.saveStatus == nil {
		if autosave {
			k.saveStatus = errAutoSaved
		} else {
			k.saveStatus = errSaved
		}
	}
}

// SetSaveError sets the sandbox error that caused the kernel to exit during
// save, if one is not already set.
func (k *Kernel) SetSaveError(err error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	if k.saveStatus == nil {
		k.saveStatus = err
	}
}

// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
// LoadFrom.
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
	k.mf = mf
}

// MemoryFile returns the MemoryFile that provides application memory.
func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
	return k.mf
}

// SupervisorContext returns a Context with maximum privileges in k. It should
// only be used by goroutines outside the control of the emulated kernel
// defined by k.
//
// Callers are responsible for ensuring that the returned Context is not used
// concurrently with changes to the Kernel.
func (k *Kernel) SupervisorContext() context.Context {
	return &supervisorContext{
		Kernel: k,
		Logger: log.Log(),
	}
}

// SocketRecord represents a socket recorded in Kernel.sockets.
//
// +stateify savable
type SocketRecord struct {
	k    *Kernel
	Sock *vfs.FileDescription
	ID   uint64 // Socket table entry number.
}

// RecordSocket adds a socket to the system-wide socket table for
// tracking.
//
// Precondition: Caller must hold a reference to sock.
//
// Note that the socket table will not hold a reference on the
// vfs.FileDescription.
func (k *Kernel) RecordSocket(sock *vfs.FileDescription) {
	k.extMu.Lock()
	if _, ok := k.sockets[sock]; ok {
		panic(fmt.Sprintf("Socket %p added twice", sock))
	}
	id := k.nextSocketRecord
	k.nextSocketRecord++
	s := &SocketRecord{
		k:    k,
		ID:   id,
		Sock: sock,
	}
	k.sockets[sock] = s
	k.extMu.Unlock()
}

// DeleteSocket removes a socket from the system-wide socket table.
func (k *Kernel) DeleteSocket(sock *vfs.FileDescription) {
	k.extMu.Lock()
	delete(k.sockets, sock)
	k.extMu.Unlock()
}

// ListSockets returns a snapshot of all sockets.
//
// Callers of ListSockets() should use SocketRecord.Sock.TryIncRef()
// to get a reference on a socket in the table.
func (k *Kernel) ListSockets() []*SocketRecord {
	k.extMu.Lock()
	var socks []*SocketRecord
	for _, s := range k.sockets {
		socks = append(socks, s)
	}
	k.extMu.Unlock()
	return socks
}
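
// An illustrative consumer of the socket table (not part of the original
// file). Because the table holds no references, each entry must be
// revalidated with TryIncRef before use:
//
//	for _, r := range k.ListSockets() {
//		if !r.Sock.TryIncRef() {
//			continue // Socket was concurrently destroyed; skip it.
//		}
//		process(r.ID, r.Sock) // process is a hypothetical helper.
//		r.Sock.DecRef(ctx)
//	}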
// supervisorContext is a privileged context.
type supervisorContext struct {
	context.NoTask
	log.Logger
	*Kernel
}

// Deadline implements context.Context.Deadline.
func (*Kernel) Deadline() (time.Time, bool) {
	return time.Time{}, false
}

// Done implements context.Context.Done.
func (*Kernel) Done() <-chan struct{} {
	return nil
}

// Err implements context.Context.Err.
func (*Kernel) Err() error {
	return nil
}

// Value implements context.Context.Value.
func (ctx *supervisorContext) Value(key any) any {
	switch key {
	case CtxCanTrace:
		// The supervisor context can trace anything. (None of
		// supervisorContext's users are expected to invoke ptrace, but
		// ptrace permissions are required for certain file accesses.)
		return func(*Task, bool) bool { return true }
	case CtxKernel:
		return ctx.Kernel
	case CtxPIDNamespace:
		return ctx.Kernel.tasks.Root
	case CtxUTSNamespace:
		utsns := ctx.Kernel.rootUTSNamespace
		utsns.IncRef()
		return utsns
	case ipc.CtxIPCNamespace:
		ipcns := ctx.Kernel.rootIPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		// The supervisor context is global root.
		return auth.NewRootCredentials(ctx.Kernel.rootUserNamespace)
	case vfs.CtxRoot:
		if ctx.Kernel.globalInit == nil || ctx.Kernel.globalInit.Leader() == nil {
			return vfs.VirtualDentry{}
		}
		root := ctx.Kernel.GlobalInit().Leader().MountNamespace().Root(ctx)
		return root
	case vfs.CtxMountNamespace:
		if ctx.Kernel.globalInit == nil || ctx.Kernel.globalInit.Leader() == nil {
			return nil
		}
		mntns := ctx.Kernel.GlobalInit().Leader().MountNamespace()
		mntns.IncRef()
		return mntns
	case inet.CtxStack:
		return ctx.Kernel.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.Kernel.RealtimeClock()
	case limits.CtxLimits:
		// No limits apply.
		return limits.NewLimitSet()
	case pgalloc.CtxMemoryFile:
		return ctx.Kernel.mf
	case platform.CtxPlatform:
		return ctx.Kernel
	case uniqueid.CtxGlobalUniqueID:
		return ctx.Kernel.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.Kernel
	case uniqueid.CtxInotifyCookie:
		return ctx.Kernel.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.Kernel
	case cpuid.CtxFeatureSet:
		return ctx.Kernel.featureSet
	default:
		return nil
	}
}
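// Illustrative sketch, not part of the original source: because Value above
// resolves auth.CtxCredentials to root credentials, any code holding a
// supervisor context can recover them through the standard accessor.
// supervisorCreds is a hypothetical helper; it assumes
// auth.CredentialsFromContext behaves as defined in the auth package.
func supervisorCreds(k *Kernel) *auth.Credentials {
	ctx := k.SupervisorContext()
	// Equivalent to ctx.Value(auth.CtxCredentials).(*auth.Credentials).
	return auth.CredentialsFromContext(ctx)
}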
// Rate limits for the number of unimplemented syscall events.
const (
	unimplementedSyscallsMaxRate = 100  // events per second
	unimplementedSyscallBurst    = 1000 // events
)

// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
// channel.
func (k *Kernel) EmitUnimplementedEvent(ctx context.Context, sysno uintptr) {
	k.unimplementedSyscallEmitterOnce.Do(func() {
		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
	})

	t := TaskFromContext(ctx)
	IncrementUnimplementedSyscallCounter(sysno)
	_, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
		Tid:       int32(t.ThreadID()),
		Registers: t.Arch().StateData().Proto(),
	})
}

// VFS returns the virtual filesystem for the kernel.
func (k *Kernel) VFS() *vfs.VirtualFilesystem {
	return &k.vfs
}

// SetHostMount sets the hostfs mount.
func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
	if k.hostMount != nil {
		panic("Kernel.hostMount cannot be set more than once")
	}
	k.hostMount = mnt
}

// HostMount returns the hostfs mount.
func (k *Kernel) HostMount() *vfs.Mount {
	return k.hostMount
}

// PipeMount returns the pipefs mount.
func (k *Kernel) PipeMount() *vfs.Mount {
	return k.pipeMount
}

// GetNamespaceInode returns a new nsfs inode which serves as a reference
// counter for the namespace.
func (k *Kernel) GetNamespaceInode(ctx context.Context, ns vfs.Namespace) refs.TryRefCounter {
	return nsfs.NewInode(ctx, k.nsfsMount, ns)
}

// ShmMount returns the tmpfs mount.
func (k *Kernel) ShmMount() *vfs.Mount {
	return k.shmMount
}

// SocketMount returns the sockfs mount.
func (k *Kernel) SocketMount() *vfs.Mount {
	return k.socketMount
}

// CgroupRegistry returns the cgroup registry.
func (k *Kernel) CgroupRegistry() *CgroupRegistry {
	return k.cgroupRegistry
}

// AddCgroupMount adds a cgroup mount to the cgroupMountsMap. These cgroup
// mounts are created during the creation of the root container process, and
// reference ownership is transferred to the kernel.
func (k *Kernel) AddCgroupMount(ctl string, mnt *CgroupMount) {
	k.cgroupMountsMapMu.Lock()
	defer k.cgroupMountsMapMu.Unlock()

	if k.cgroupMountsMap == nil {
		k.cgroupMountsMap = make(map[string]*CgroupMount)
	}
	k.cgroupMountsMap[ctl] = mnt
}

// GetCgroupMount returns the cgroup mount for the given cgroup controller.
func (k *Kernel) GetCgroupMount(ctl string) *CgroupMount {
	k.cgroupMountsMapMu.Lock()
	defer k.cgroupMountsMapMu.Unlock()

	return k.cgroupMountsMap[ctl]
}
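// Illustrative sketch, not part of the original source: how a runtime
// component might register a per-controller mount once and look it up later.
// The controller name "memory" and registerMemoryCgroupMount are
// hypothetical.
func registerMemoryCgroupMount(k *Kernel, mnt *CgroupMount) *CgroupMount {
	// The kernel takes over the references held by mnt; they are dropped
	// in releaseCgroupMounts below, during Kernel.Release.
	k.AddCgroupMount("memory", mnt)
	return k.GetCgroupMount("memory")
}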
// releaseCgroupMounts releases the cgroup mounts.
func (k *Kernel) releaseCgroupMounts(ctx context.Context) {
	k.cgroupMountsMapMu.Lock()
	defer k.cgroupMountsMapMu.Unlock()

	for _, m := range k.cgroupMountsMap {
		m.Mount.DecRef(ctx)
		m.Root.DecRef(ctx)
		m.Fs.DecRef(ctx)
	}
}

// Release releases resources owned by k.
//
// Precondition: This should only be called after the kernel is fully
// initialized, e.g. after k.Start() has been called.
func (k *Kernel) Release() {
	ctx := k.SupervisorContext()
	k.releaseCgroupMounts(ctx)
	k.hostMount.DecRef(ctx)
	k.pipeMount.DecRef(ctx)
	k.nsfsMount.DecRef(ctx)
	k.shmMount.DecRef(ctx)
	k.socketMount.DecRef(ctx)
	k.vfs.Release(ctx)
	k.timekeeper.Destroy()
	k.vdso.Release(ctx)
	k.RootNetworkNamespace().DecRef(ctx)
	k.rootIPCNamespace.DecRef(ctx)
	k.rootUTSNamespace.DecRef(ctx)
	k.cleanupDevGofers()
}

// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
// hierarchy.
//
// Precondition: root must be a new cgroup with no tasks. This implies the
// controllers for root are also new and currently manage no tasks, which in
// turn implies the new cgroup can be populated without migrating tasks
// between cgroups.
func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) {
	k.tasks.mu.RLock()
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		// A task may already be in the cgroup if it was created after the
		// cgroup hierarchy was registered.
		t.enterCgroupIfNotYetLocked(root)
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()
}

// ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the
// hierarchy with the provided id. This is intended for use during hierarchy
// teardown, as otherwise the tasks would be orphaned w.r.t. some controllers.
func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) {
	var releasedCGs []Cgroup

	k.tasks.mu.RLock()
	// We'll have one cgroup per hierarchy per task.
	releasedCGs = make([]Cgroup, 0, len(k.tasks.Root.tids))
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		for cg := range t.cgroups {
			if cg.HierarchyID() == hid {
				cg.Leave(t)
				t.ResetMemCgIDFromCgroup(cg)
				delete(t.cgroups, cg)
				releasedCGs = append(releasedCGs, cg)
				// A task can't be part of multiple cgroups from the same
				// hierarchy, so we can skip checking the rest once we
				// find a match.
				break
			}
		}
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()

	for _, c := range releasedCGs {
		c.decRef()
	}
}

// ReplaceFSContextRoots updates root and cwd to `newRoot` in the FSContext
// across all tasks whose old root or cwd were `oldRoot`.
func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualDentry, newRoot vfs.VirtualDentry) {
	k.tasks.mu.RLock()
	oldRootDecRefs := 0
	k.tasks.forEachTaskLocked(func(t *Task) {
		t.mu.Lock()
		defer t.mu.Unlock()
		if fsc := t.fsContext; fsc != nil {
			fsc.mu.Lock()
			defer fsc.mu.Unlock()
			if fsc.root == oldRoot {
				newRoot.IncRef()
				oldRootDecRefs++
				fsc.root = newRoot
			}
			if fsc.cwd == oldRoot {
				newRoot.IncRef()
				oldRootDecRefs++
				fsc.cwd = newRoot
			}
		}
	})
	k.tasks.mu.RUnlock()
	for i := 0; i < oldRootDecRefs; i++ {
		oldRoot.DecRef(ctx)
	}
}

// GetUserCounters returns the user counters for the given KUID.
func (k *Kernel) GetUserCounters(uid auth.KUID) *UserCounters {
	k.userCountersMapMu.Lock()
	defer k.userCountersMapMu.Unlock()

	if uc, ok := k.userCountersMap[uid]; ok {
		return uc
	}

	uc := &UserCounters{}
	k.userCountersMap[uid] = uc
	return uc
}
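// Illustrative sketch, not part of the original source: GetUserCounters
// lazily allocates one UserCounters record per KUID under userCountersMapMu,
// so repeated lookups for the same UID always observe the same record.
// countersAreStable is a hypothetical helper demonstrating that property.
func countersAreStable(k *Kernel, uid auth.KUID) bool {
	first := k.GetUserCounters(uid)
	second := k.GetUserCounters(uid)
	// Both lookups return the pointer allocated on first use.
	return first == second
}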
// AddDevGofer initializes the dev gofer connection and starts tracking it.
// It takes ownership of goferFD.
func (k *Kernel) AddDevGofer(cid string, goferFD int) error {
	client, err := devutil.NewGoferClient(k.SupervisorContext(), goferFD)
	if err != nil {
		return err
	}

	k.devGofersMu.Lock()
	defer k.devGofersMu.Unlock()
	if k.devGofers == nil {
		k.devGofers = make(map[string]*devutil.GoferClient)
	}
	k.devGofers[cid] = client
	return nil
}

// RemoveDevGofer closes the dev gofer connection, if one exists, and stops
// tracking it.
func (k *Kernel) RemoveDevGofer(cid string) {
	k.devGofersMu.Lock()
	defer k.devGofersMu.Unlock()
	client, ok := k.devGofers[cid]
	if !ok {
		return
	}
	client.Close()
	delete(k.devGofers, cid)
}

// getDevGoferClient returns the dev gofer client tracked for the given
// container ID, or nil if none exists.
func (k *Kernel) getDevGoferClient(cid string) *devutil.GoferClient {
	k.devGofersMu.Lock()
	defer k.devGofersMu.Unlock()
	return k.devGofers[cid]
}

// cleanupDevGofers closes and drops all tracked dev gofer connections.
func (k *Kernel) cleanupDevGofers() {
	k.devGofersMu.Lock()
	defer k.devGofersMu.Unlock()
	for _, client := range k.devGofers {
		client.Close()
	}
	k.devGofers = nil
}
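// Illustrative sketch, not part of the original source: pairing AddDevGofer
// with RemoveDevGofer around a container's lifetime. withDevGofer and its
// parameters are hypothetical; goferFD is assumed to be a connected host FD.
func withDevGofer(k *Kernel, cid string, goferFD int, body func() error) error {
	// AddDevGofer takes ownership of goferFD, so the caller must not
	// close it after this point.
	if err := k.AddDevGofer(cid, goferFD); err != nil {
		return err
	}
	// RemoveDevGofer closes the connection and stops tracking it.
	defer k.RemoveDevGofer(cid)
	return body()
}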