github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/kernel.go (about)

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernel provides an emulation of the Linux kernel.
//
// See README.md for a detailed overview.
//
// Lock order (outermost locks must be taken first):
//
// Kernel.extMu
//   ThreadGroup.timerMu
//     ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer)
//       TaskSet.mu
//         SignalHandlers.mu
//           Task.mu
// runningTasksMu
//
// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
// time requires locking all of their signal mutexes first.
package kernel

import (
    "errors"
    "fmt"
    "path/filepath"
    "sync/atomic"
    "time"

    "github.com/SagerNet/gvisor/pkg/abi/linux"
    "github.com/SagerNet/gvisor/pkg/cleanup"
    "github.com/SagerNet/gvisor/pkg/context"
    "github.com/SagerNet/gvisor/pkg/cpuid"
    "github.com/SagerNet/gvisor/pkg/eventchannel"
    "github.com/SagerNet/gvisor/pkg/fspath"
    "github.com/SagerNet/gvisor/pkg/log"
    "github.com/SagerNet/gvisor/pkg/refs"
    "github.com/SagerNet/gvisor/pkg/sentry/arch"
    "github.com/SagerNet/gvisor/pkg/sentry/fs"
    oldtimerfd "github.com/SagerNet/gvisor/pkg/sentry/fs/timerfd"
    "github.com/SagerNet/gvisor/pkg/sentry/fsbridge"
    "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/pipefs"
    "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/sockfs"
    "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/timerfd"
    "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/tmpfs"
    "github.com/SagerNet/gvisor/pkg/sentry/hostcpu"
    "github.com/SagerNet/gvisor/pkg/sentry/inet"
    "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    "github.com/SagerNet/gvisor/pkg/sentry/kernel/epoll"
    "github.com/SagerNet/gvisor/pkg/sentry/kernel/futex"
    "github.com/SagerNet/gvisor/pkg/sentry/kernel/sched"
    ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    "github.com/SagerNet/gvisor/pkg/sentry/limits"
    "github.com/SagerNet/gvisor/pkg/sentry/loader"
    "github.com/SagerNet/gvisor/pkg/sentry/mm"
    "github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
    "github.com/SagerNet/gvisor/pkg/sentry/platform"
    "github.com/SagerNet/gvisor/pkg/sentry/socket/netlink/port"
    sentrytime "github.com/SagerNet/gvisor/pkg/sentry/time"
    "github.com/SagerNet/gvisor/pkg/sentry/unimpl"
    uspb "github.com/SagerNet/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
    "github.com/SagerNet/gvisor/pkg/sentry/uniqueid"
    "github.com/SagerNet/gvisor/pkg/sentry/vfs"
    "github.com/SagerNet/gvisor/pkg/state"
    "github.com/SagerNet/gvisor/pkg/state/wire"
    "github.com/SagerNet/gvisor/pkg/sync"
    "github.com/SagerNet/gvisor/pkg/tcpip"
)
// VFS2Enabled is set to true when VFS2 is enabled. Added as a global to allow
// easy access everywhere. To be removed once VFS2 becomes the default.
var VFS2Enabled = false

// FUSEEnabled is set to true when FUSE is enabled. Added as a global to allow
// easy access everywhere. To be removed once FUSE is completed.
var FUSEEnabled = false

// Kernel represents an emulated Linux kernel. It must be initialized by
// calling Init() or LoadFrom().
//
// +stateify savable
type Kernel struct {
    // extMu serializes external changes to the Kernel with calls to
    // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
    // remains frozen for the duration of the call; it requires that the Kernel
    // is paused as a precondition, which ensures that none of the tasks
    // running within the Kernel can affect its state, but extMu is required to
    // ensure that concurrent users of the Kernel *outside* the Kernel's
    // control cannot affect its state by calling e.g.
    // Kernel.SendExternalSignal.)
    extMu sync.Mutex `state:"nosave"`

    // started is true if Start has been called. Unless otherwise specified,
    // all Kernel fields become immutable once started becomes true.
    started bool `state:"nosave"`

    // All of the following fields are immutable unless otherwise specified.

    // Platform is the platform that is used to execute tasks in the created
    // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is
    // embedded anonymously (the same issue applies).
    platform.Platform `state:"nosave"`

    // mf provides application memory.
    mf *pgalloc.MemoryFile `state:"nosave"`

    // See InitKernelArgs for the meaning of these fields.
    featureSet                  *cpuid.FeatureSet
    timekeeper                  *Timekeeper
    tasks                       *TaskSet
    rootUserNamespace           *auth.UserNamespace
    rootNetworkNamespace        *inet.Namespace
    applicationCores            uint
    useHostCores                bool
    extraAuxv                   []arch.AuxEntry
    vdso                        *loader.VDSO
    rootUTSNamespace            *UTSNamespace
    rootIPCNamespace            *IPCNamespace
    rootAbstractSocketNamespace *AbstractSocketNamespace

    // futexes is the "root" futex.Manager, from which all others are forked.
    // This is necessary to ensure that shared futexes are coherent across all
    // tasks, including those created by CreateProcess.
    futexes *futex.Manager

    // globalInit is the thread group whose leader has ID 1 in the root PID
    // namespace. globalInit is stored separately so that it is accessible even
    // after all tasks in the thread group have exited, such that ID 1 is no
    // longer mapped.
    //
    // globalInit is mutable until it is assigned by the first successful call
    // to CreateProcess, and is protected by extMu.
    globalInit *ThreadGroup

    // syslog is the kernel log.
    syslog syslog

    // runningTasksMu synchronizes disable/enable of cpuClockTicker when
    // the kernel is idle (runningTasks == 0).
    //
    // runningTasksMu is used to exclude critical sections when the timer
    // disables itself and when the first active task enables the timer,
    // ensuring that tasks always see a valid cpuClock value.
    runningTasksMu sync.Mutex `state:"nosave"`

    // runningTasks is the total count of tasks currently in
    // TaskGoroutineRunningSys or TaskGoroutineRunningApp, i.e. they are
    // not blocked or stopped.
    //
    // runningTasks must be accessed atomically. Increments from 0 to 1 are
    // further protected by runningTasksMu (see incRunningTasks).
    runningTasks int64
    // cpuClock is incremented every linux.ClockTick. cpuClock is used to
    // measure task CPU usage, since sampling monotonicClock twice on every
    // syscall turns out to be unreasonably expensive. This is similar to how
    // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING),
    // although Linux also uses scheduler timing information to improve
    // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do
    // since "preemptive" scheduling is managed by the Go runtime, which
    // doesn't provide this information.
    //
    // cpuClock is mutable, and is accessed using atomic memory operations.
    cpuClock uint64

    // cpuClockTicker increments cpuClock.
    cpuClockTicker *ktime.Timer `state:"nosave"`

    // cpuClockTickerDisabled indicates that cpuClockTicker has been
    // disabled because no tasks are running.
    //
    // cpuClockTickerDisabled is protected by runningTasksMu.
    cpuClockTickerDisabled bool

    // cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the
    // point it was disabled. It is cached here to avoid a lock ordering
    // violation with cpuClockTicker.mu when runningTasksMu is held.
    //
    // cpuClockTickerSetting is only valid when cpuClockTickerDisabled is
    // true.
    //
    // cpuClockTickerSetting is protected by runningTasksMu.
    cpuClockTickerSetting ktime.Setting

    // uniqueID is used to generate unique identifiers.
    //
    // uniqueID is mutable, and is accessed using atomic memory operations.
    uniqueID uint64

    // nextInotifyCookie is a monotonically increasing counter used for
    // generating unique inotify event cookies.
    //
    // nextInotifyCookie is mutable, and is accessed using atomic memory
    // operations.
    nextInotifyCookie uint32

    // netlinkPorts manages allocation of netlink socket port IDs.
    netlinkPorts *port.Manager

    // saveStatus is nil if the sandbox has not been saved, errSaved or
    // errAutoSaved if it has been saved successfully, or the error causing the
    // sandbox to exit during save.
    // It is protected by extMu.
    saveStatus error `state:"nosave"`

    // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
    danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`

    // sockets is the list of all network sockets in the system.
    // Protected by extMu.
    // TODO(github.com/SagerNet/issue/1624): Only used by VFS1.
    sockets socketList

    // socketsVFS2 records all network sockets in the system. Protected by
    // extMu.
    socketsVFS2 map[*vfs.FileDescription]*SocketRecord

    // nextSocketRecord is the next entry number to use in sockets. Protected
    // by extMu.
    nextSocketRecord uint64

    // deviceRegistry is used to save/restore device.SimpleDevices.
    deviceRegistry struct{} `state:".(*device.Registry)"`

    // DirentCacheLimiter controls the total number of dirent entries that can
    // be in caches. Not all caches use it; only the caches that use host
    // resources use the limiter. It may be nil if disabled.
    DirentCacheLimiter *fs.DirentCacheLimiter

    // unimplementedSyscallEmitterOnce is used in the initialization of
    // unimplementedSyscallEmitter.
    unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`

    // unimplementedSyscallEmitter is used to emit unimplemented syscall
    // events. This is initialized lazily on the first unimplemented
    // syscall.
    unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`

    // SpecialOpts contains special kernel options.
    SpecialOpts

    // vfs keeps the filesystem state used across the kernel.
    vfs vfs.VirtualFilesystem

    // hostMount is the Mount used for file descriptors that were imported
    // from the host.
    hostMount *vfs.Mount

    // pipeMount is the Mount used for pipes created by the pipe() and pipe2()
    // syscalls (as opposed to named pipes created by mknod()).
    pipeMount *vfs.Mount

    // shmMount is the Mount used for anonymous files created by the
    // memfd_create() syscalls. It is analogous to Linux's shm_mnt.
    shmMount *vfs.Mount

    // socketMount is the Mount used for sockets created by the socket() and
    // socketpair() syscalls. There are several cases where a socket dentry
    // will not be contained in socketMount:
    // 1. Socket files created by mknod()
    // 2. Socket fds imported from the host (Kernel.hostMount is used for these)
    // 3. Socket files created by binding Unix sockets to a file path
    socketMount *vfs.Mount

    // If set to true, report address space activation waits as if the task is
    // in external wait so that the watchdog doesn't report the task stuck.
    SleepForAddressSpaceActivation bool

    // Exceptions to YAMA ptrace restrictions. Each key-value pair represents a
    // tracee-tracer relationship. The key is a process (technically, the
    // thread group leader) that can be traced by any thread that is a
    // descendant of the value. If the value is nil, then anyone can trace the
    // process represented by the key.
    //
    // ptraceExceptions is protected by the TaskSet mutex.
    ptraceExceptions map[*Task]*Task

    // YAMAPtraceScope is the current level of YAMA ptrace restrictions.
    YAMAPtraceScope int32

    // cgroupRegistry contains the set of active cgroup controllers on the
    // system. It is controlled by cgroupfs. Nil if cgroupfs is unavailable on
    // the system.
    cgroupRegistry *CgroupRegistry
}

// InitKernelArgs holds arguments to Init.
type InitKernelArgs struct {
    // FeatureSet is the emulated CPU feature set.
    FeatureSet *cpuid.FeatureSet

    // Timekeeper manages time for all tasks in the system.
    Timekeeper *Timekeeper

    // RootUserNamespace is the root user namespace.
    RootUserNamespace *auth.UserNamespace

    // RootNetworkNamespace is the root network namespace. If nil, no
    // networking will be available.
    RootNetworkNamespace *inet.Namespace

    // ApplicationCores is the number of logical CPUs visible to sandboxed
    // applications. The set of logical CPU IDs is [0, ApplicationCores); thus
    // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
    // most significant bit in cpu_possible_mask + 1.
    ApplicationCores uint

    // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
    // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
    // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
    // will be overridden.
    UseHostCores bool

    // ExtraAuxv contains additional auxiliary vector entries that are added to
    // each process by the ELF loader.
    ExtraAuxv []arch.AuxEntry

    // Vdso holds the VDSO and its parameter page.
    Vdso *loader.VDSO

    // RootUTSNamespace is the root UTS namespace.
    RootUTSNamespace *UTSNamespace

    // RootIPCNamespace is the root IPC namespace.
    RootIPCNamespace *IPCNamespace

    // RootAbstractSocketNamespace is the root Abstract Socket namespace.
    RootAbstractSocketNamespace *AbstractSocketNamespace

    // PIDNamespace is the root PID namespace.
    PIDNamespace *PIDNamespace
}

// Init initializes the Kernel with no tasks.
//
// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
// before calling Init.
func (k *Kernel) Init(args InitKernelArgs) error {
    if args.FeatureSet == nil {
        return fmt.Errorf("args.FeatureSet is nil")
    }
    if args.Timekeeper == nil {
        return fmt.Errorf("args.Timekeeper is nil")
    }
    if args.Timekeeper.clocks == nil {
        return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()")
    }
    if args.RootUserNamespace == nil {
        return fmt.Errorf("args.RootUserNamespace is nil")
    }
    if args.ApplicationCores == 0 {
        return fmt.Errorf("args.ApplicationCores is 0")
    }

    k.featureSet = args.FeatureSet
    k.timekeeper = args.Timekeeper
    k.tasks = newTaskSet(args.PIDNamespace)
    k.rootUserNamespace = args.RootUserNamespace
    k.rootUTSNamespace = args.RootUTSNamespace
    k.rootIPCNamespace = args.RootIPCNamespace
    k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
    k.rootNetworkNamespace = args.RootNetworkNamespace
    if k.rootNetworkNamespace == nil {
        k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil)
    }
    k.applicationCores = args.ApplicationCores
    if args.UseHostCores {
        k.useHostCores = true
        maxCPU, err := hostcpu.MaxPossibleCPU()
        if err != nil {
            return fmt.Errorf("failed to get maximum CPU number: %v", err)
        }
        minAppCores := uint(maxCPU) + 1
        if k.applicationCores < minAppCores {
            log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
            k.applicationCores = minAppCores
        }
    }
    k.extraAuxv = args.ExtraAuxv
    k.vdso = args.Vdso
    k.futexes = futex.NewManager()
    k.netlinkPorts = port.New()
    k.ptraceExceptions = make(map[*Task]*Task)
    k.YAMAPtraceScope = linux.YAMA_SCOPE_RELATIONAL

    if VFS2Enabled {
        ctx := k.SupervisorContext()
        if err := k.vfs.Init(ctx); err != nil {
            return fmt.Errorf("failed to initialize VFS: %v", err)
        }

        pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
        if err != nil {
            return fmt.Errorf("failed to create pipefs filesystem: %v", err)
        }
        defer pipeFilesystem.DecRef(ctx)
        pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{})
        if err != nil {
            return fmt.Errorf("failed to create pipefs mount: %v", err)
        }
        k.pipeMount = pipeMount

        tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
        if err != nil {
            return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
        }
        defer tmpfsFilesystem.DecRef(ctx)
        defer tmpfsRoot.DecRef(ctx)
        shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})
        if err != nil {
            return fmt.Errorf("failed to create tmpfs mount: %v", err)
        }
        k.shmMount = shmMount

        socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
        if err != nil {
            return fmt.Errorf("failed to create sockfs filesystem: %v", err)
        }
        defer socketFilesystem.DecRef(ctx)
        socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{})
        if err != nil {
            return fmt.Errorf("failed to create sockfs mount: %v", err)
        }
        k.socketMount = socketMount

        k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord)

        k.cgroupRegistry = newCgroupRegistry()
    }
    return nil
}
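// A minimal bring-up sketch (editor's example, not part of the original
// source; "plat", "mf", and "tk" are assumed to be a platform.Platform, a
// *pgalloc.MemoryFile, and a *Timekeeper with clocks already set):
//
//	k := &Kernel{Platform: plat}
//	k.SetMemoryFile(mf)
//	err := k.Init(InitKernelArgs{
//		FeatureSet:        cpuid.HostFeatureSet(),
//		Timekeeper:        tk,
//		RootUserNamespace: auth.NewRootUserNamespace(),
//		ApplicationCores:  8,
//	})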
// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
    saveStart := time.Now()

    // Do not allow other Kernel methods to affect it while it's being saved.
    k.extMu.Lock()
    defer k.extMu.Unlock()

    // Stop time.
    k.pauseTimeLocked(ctx)
    defer k.resumeTimeLocked(ctx)

    // Evict all evictable MemoryFile allocations.
    k.mf.StartEvictions()
    k.mf.WaitForEvictions()

    if VFS2Enabled {
        // Discard unsavable mappings, such as those for host file descriptors.
        if err := k.invalidateUnsavableMappings(ctx); err != nil {
            return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
        }

        // Prepare filesystems for saving. This must be done after
        // invalidateUnsavableMappings(), since dropping memory mappings may
        // affect filesystem state (e.g. page cache reference counts).
        if err := k.vfs.PrepareSave(ctx); err != nil {
            return err
        }
    } else {
        // Flush cached file writes to backing storage. This must come after
        // MemoryFile eviction since eviction may cause file writes.
        if err := k.flushWritesToFiles(ctx); err != nil {
            return err
        }

        // Remove all epoll waiter objects from underlying wait queues.
        // NOTE: for programs to resume execution in future snapshot scenarios,
        // we will need to re-establish these waiter objects after saving.
        k.tasks.unregisterEpollWaiters(ctx)

        // Clear the dirent cache before saving because Dirents must be Loaded
        // in a particular order (parents before children), and Loading dirents
        // from a cache breaks that order.
        if err := k.flushMountSourceRefs(ctx); err != nil {
            return err
        }

        // Ensure that all inode and mount release operations have completed.
        fs.AsyncBarrier()

        // Once all fs work has completed (flushed references have all been
        // released), reset mount mappings. This allows individual mounts to
        // save how inodes map to filesystem resources. Without this,
        // fs.Inodes cannot be restored.
        fs.SaveInodeMappings()

        // Discard unsavable mappings, such as those for host file descriptors.
        // This must be done after waiting for "asynchronous fs work", which
        // includes async I/O that may touch application memory.
        //
        // TODO(github.com/SagerNet/issue/1624): This rationale is believed to
        // be obsolete since AIO callbacks are now waited-for by
        // Kernel.Pause(), but this order is conservatively retained for VFS1.
        if err := k.invalidateUnsavableMappings(ctx); err != nil {
            return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
        }
    }

    // Save the CPUID FeatureSet before the rest of the kernel so we can
    // verify its compatibility on restore before attempting to restore the
    // entire kernel, which may fail on an incompatible machine.
    //
    // N.B. This will also be saved along with the full kernel save below.
    cpuidStart := time.Now()
    if _, err := state.Save(ctx, w, k.FeatureSet()); err != nil {
        return err
    }
    log.Infof("CPUID save took [%s].", time.Since(cpuidStart))

    // Save the kernel state.
    kernelStart := time.Now()
    stats, err := state.Save(ctx, w, k)
    if err != nil {
        return err
    }
    log.Infof("Kernel save stats: %s", stats.String())
    log.Infof("Kernel save took [%s].", time.Since(kernelStart))

    // Save the memory file's state.
    memoryStart := time.Now()
    if err := k.mf.SaveTo(ctx, w); err != nil {
        return err
    }
    log.Infof("Memory save took [%s].", time.Since(memoryStart))

    log.Infof("Overall save took [%s].", time.Since(saveStart))

    return nil
}
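// A hedged usage sketch (editor's example, not part of the original source):
// SaveTo requires the kernel to be paused for the entire call, so a
// checkpoint sequence brackets it with Pause/Unpause:
//
//	k.Pause()
//	defer k.Unpause()
//	if err := k.SaveTo(ctx, w); err != nil {
//		// handle checkpoint failure
//	}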
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
//
// Preconditions: !VFS2Enabled.
func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
    // Flush all mount sources for currently mounted filesystems in each task.
    flushed := make(map[*fs.MountNamespace]struct{})
    k.tasks.mu.RLock()
    k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
        if _, ok := flushed[tg.mounts]; ok {
            // Already flushed.
            return
        }
        tg.mounts.FlushMountSourceRefs()
        flushed[tg.mounts] = struct{}{}
    })
    k.tasks.mu.RUnlock()

    // There may be some open FDs whose filesystems have been unmounted. We
    // must flush those as well.
    return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
        file.Dirent.Inode.MountSource.FlushDirentRefs()
        return nil
    })
}

// forEachFDPaused applies the given function to each open file descriptor in
// each task.
//
// Precondition: Must be called with the kernel paused.
func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) {
    ts.mu.RLock()
    defer ts.mu.RUnlock()
    for t := range ts.Root.tids {
        // We can skip locking Task.mu here since the kernel is paused.
        if t.fdTable == nil {
            continue
        }
        t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) {
            if lastErr := f(file, fileVFS2); lastErr != nil && err == nil {
                err = lastErr
            }
        })
    }
    return err
}

// Preconditions: !VFS2Enabled.
func (k *Kernel) flushWritesToFiles(ctx context.Context) error {
    return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
        if flags := file.Flags(); !flags.Write {
            return nil
        }
        if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
            return nil
        }
        // Here we need all metadata synced.
        syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
        if err := fs.SaveFileFsyncError(syncErr); err != nil {
            name, _ := file.Dirent.FullName(nil /* root */)
            // Wrap this error in ErrSaveRejection so that it will trigger a
            // save error, rather than a panic. This also allows us to
            // distinguish Fsync errors from state file errors in state.Save.
            return &fs.ErrSaveRejection{
                Err: fmt.Errorf("%q was not sufficiently synced: %w", name, err),
            }
        }
        return nil
    })
}

// Preconditions: !VFS2Enabled.
func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
    ts.mu.RLock()
    defer ts.mu.RUnlock()

    // Tasks that belong to the same process could potentially point to the
    // same FDTable. So we retain a map of processed ones to avoid processing
    // the same FDTable multiple times.
    processed := make(map[*FDTable]struct{})
    for t := range ts.Root.tids {
        // We can skip locking Task.mu here since the kernel is paused.
        if t.fdTable == nil {
            continue
        }
        if _, ok := processed[t.fdTable]; ok {
            continue
        }
        t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
            if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
                e.UnregisterEpollWaiters()
            }
        })
        processed[t.fdTable] = struct{}{}
    }
}

// Preconditions: The kernel must be paused.
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
    invalidated := make(map[*mm.MemoryManager]struct{})
    k.tasks.mu.RLock()
    defer k.tasks.mu.RUnlock()
    for t := range k.tasks.Root.tids {
        // We can skip locking Task.mu here since the kernel is paused.
        if memMgr := t.image.MemoryManager; memMgr != nil {
            if _, ok := invalidated[memMgr]; !ok {
                if err := memMgr.InvalidateUnsavable(ctx); err != nil {
                    return err
                }
                invalidated[memMgr] = struct{}{}
            }
        }
        // I really wish we just had a sync.Map of all MMs...
        if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
            if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
                return err
            }
        }
    }
    return nil
}

// LoadFrom restores the state of k from r.
func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
    loadStart := time.Now()

    initAppCores := k.applicationCores

    // Load the pre-saved CPUID FeatureSet.
    //
    // N.B. This was also saved along with the full kernel below, so we
    // don't need to explicitly install it in the Kernel.
    cpuidStart := time.Now()
    var features cpuid.FeatureSet
    if _, err := state.Load(ctx, r, &features); err != nil {
        return err
    }
    log.Infof("CPUID load took [%s].", time.Since(cpuidStart))

    // Verify that the FeatureSet is usable on this host. We do this before
    // Kernel load so that the explicit CPUID mismatch error has priority
    // over floating point state restore errors that may occur on load on
    // an incompatible machine.
    if err := features.CheckHostCompatible(); err != nil {
        return err
    }

    // Load the kernel state.
    kernelStart := time.Now()
    stats, err := state.Load(ctx, r, k)
    if err != nil {
        return err
    }
    log.Infof("Kernel load stats: %s", stats.String())
    log.Infof("Kernel load took [%s].", time.Since(kernelStart))

    // rootNetworkNamespace should be populated after loading the state file.
    // Restore the root network stack.
    k.rootNetworkNamespace.RestoreRootStack(net)

    // Load the memory file's state.
    memoryStart := time.Now()
    if err := k.mf.LoadFrom(ctx, r); err != nil {
        return err
    }
    log.Infof("Memory load took [%s].", time.Since(memoryStart))

    log.Infof("Overall load took [%s]", time.Since(loadStart))

    k.Timekeeper().SetClocks(clocks)

    if timeReady != nil {
        close(timeReady)
    }

    if net != nil {
        net.Resume()
    }

    if VFS2Enabled {
        if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
            return err
        }
    } else {
        // Ensure that all pending asynchronous work is complete:
        //   - namedpipe opening
        //   - inode file opening
        if err := fs.AsyncErrorBarrier(); err != nil {
            return err
        }
    }

    tcpip.AsyncLoading.Wait()

    log.Infof("Overall load took [%s] after async work", time.Since(loadStart))

    // Applications may size per-cpu structures based on k.applicationCores,
    // so it can't change across save/restore. When we are virtualizing CPU
    // numbers, this isn't a problem. However, when we are exposing host CPU
    // assignments, we can't tolerate an increase in the number of host CPUs,
    // which could result in getcpu(2) returning CPUs that applications expect
    // not to exist.
    if k.useHostCores && initAppCores > k.applicationCores {
        return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
    }

    return nil
}
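// A hedged restore sketch (editor's example, not part of the original source;
// "plat", "mf", "netStack", and "clocks" are assumed to be supplied by the
// caller). Restore mirrors save: install the MemoryFile first, then replay
// the state stream that SaveTo produced:
//
//	k := &Kernel{Platform: plat}
//	k.SetMemoryFile(mf)
//	if err := k.LoadFrom(ctx, r, nil /* timeReady */, netStack, clocks, nil /* vfsOpts */); err != nil {
//		// restore failed; the state file and host may be incompatible
//	}
//	// Timers stopped by SaveTo are resumed when k.Start() is called.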
// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
    id := atomic.AddUint64(&k.uniqueID, 1)
    if id == 0 {
        panic("unique identifier generator wrapped around")
    }
    return id
}

// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
    // Filename is the filename to load as the init binary.
    //
    // If this is provided as "", File will be checked, then the file will be
    // guessed via Argv[0].
    Filename string

    // File is a passed host FD pointing to a file to load as the init binary.
    //
    // This is checked if and only if Filename is "".
    File fsbridge.File

    // Argv is a list of arguments.
    Argv []string

    // Envv is a list of environment variables.
    Envv []string

    // WorkingDirectory is the initial working directory.
    //
    // This defaults to the root if empty.
    WorkingDirectory string

    // Credentials is the initial credentials.
    Credentials *auth.Credentials

    // FDTable is the initial set of file descriptors. If CreateProcess
    // succeeds, it takes a reference on FDTable.
    FDTable *FDTable

    // Umask is the initial umask.
    Umask uint

    // Limits is the initial resource limits.
    Limits *limits.LimitSet

    // MaxSymlinkTraversals is the maximum number of symlinks to follow
    // during resolution.
    MaxSymlinkTraversals uint

    // UTSNamespace is the initial UTS namespace.
    UTSNamespace *UTSNamespace

    // IPCNamespace is the initial IPC namespace.
    IPCNamespace *IPCNamespace

    // PIDNamespace is the initial PID namespace.
    PIDNamespace *PIDNamespace

    // AbstractSocketNamespace is the initial Abstract Socket namespace.
    AbstractSocketNamespace *AbstractSocketNamespace

    // MountNamespace optionally contains the mount namespace for this
    // process. If nil, the init process's mount namespace is used.
    //
    // Anyone setting MountNamespace must donate a reference (i.e.
    // increment it).
    MountNamespace *fs.MountNamespace

    // MountNamespaceVFS2 optionally contains the mount namespace for this
    // process. If nil, the init process's mount namespace is used.
    //
    // Anyone setting MountNamespaceVFS2 must donate a reference (i.e.
    // increment it).
    MountNamespaceVFS2 *vfs.MountNamespace

    // ContainerID is the container that the process belongs to.
    ContainerID string
}

// NewContext returns a context.Context that represents the task that will be
// created by k.CreateProcess(args).
func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext {
    return &createProcessContext{
        Logger: log.Log(),
        k:      k,
        args:   args,
    }
}

// createProcessContext is a context.Context that represents the context
// associated with a task that is being created.
type createProcessContext struct {
    context.NoopSleeper
    log.Logger
    k    *Kernel
    args *CreateProcessArgs
}

// Value implements context.Context.Value.
func (ctx *createProcessContext) Value(key interface{}) interface{} {
    switch key {
    case CtxKernel:
        return ctx.k
    case CtxPIDNamespace:
        return ctx.args.PIDNamespace
    case CtxUTSNamespace:
        return ctx.args.UTSNamespace
    case CtxIPCNamespace:
        ipcns := ctx.args.IPCNamespace
        ipcns.IncRef()
        return ipcns
    case auth.CtxCredentials:
        return ctx.args.Credentials
    case fs.CtxRoot:
        if ctx.args.MountNamespace != nil {
            // MountNamespace.Root() will take a reference on the root dirent
            // for us.
            return ctx.args.MountNamespace.Root()
        }
        return nil
    case vfs.CtxRoot:
        if ctx.args.MountNamespaceVFS2 == nil {
            return nil
        }
        root := ctx.args.MountNamespaceVFS2.Root()
        root.IncRef()
        return root
    case vfs.CtxMountNamespace:
        if ctx.k.globalInit == nil {
            return nil
        }
        mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
        mntns.IncRef()
        return mntns
    case fs.CtxDirentCacheLimiter:
        return ctx.k.DirentCacheLimiter
    case inet.CtxStack:
        return ctx.k.RootNetworkNamespace().Stack()
    case ktime.CtxRealtimeClock:
        return ctx.k.RealtimeClock()
    case limits.CtxLimits:
        return ctx.args.Limits
    case pgalloc.CtxMemoryFile:
        return ctx.k.mf
    case pgalloc.CtxMemoryFileProvider:
        return ctx.k
    case platform.CtxPlatform:
        return ctx.k
    case uniqueid.CtxGlobalUniqueID:
        return ctx.k.UniqueID()
    case uniqueid.CtxGlobalUniqueIDProvider:
        return ctx.k
    case uniqueid.CtxInotifyCookie:
        return ctx.k.GenerateInotifyCookie()
    case unimpl.CtxEvents:
        return ctx.k
    default:
        return nil
    }
}
// CreateProcess creates a new task in a new thread group with the given
// options. The new task has no parent and is in the root PID namespace.
//
// If k.Start() has already been called, then the created process must be
// started by calling k.StartProcess(tg).
//
// If k.Start() has not yet been called, then the created task will begin
// running when k.Start() is called.
//
// CreateProcess has no analogue in Linux; it is used to create the initial
// application task, as well as processes started by the control server.
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    log.Infof("EXEC: %v", args.Argv)

    ctx := args.NewContext(k)

    var (
        opener    fsbridge.Lookup
        fsContext *FSContext
        mntns     *fs.MountNamespace
        mntnsVFS2 *vfs.MountNamespace
    )

    if VFS2Enabled {
        mntnsVFS2 = args.MountNamespaceVFS2
        if mntnsVFS2 == nil {
            // Add a reference to the namespace, which is transferred to the
            // new process.
            mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2()
            mntnsVFS2.IncRef()
        }
        // Get the root directory from the MountNamespace.
        root := mntnsVFS2.Root()
        root.IncRef()
        defer root.DecRef(ctx)

        // Grab the working directory.
        wd := root // Default.
        if args.WorkingDirectory != "" {
            pop := vfs.PathOperation{
                Root:               root,
                Start:              wd,
                Path:               fspath.Parse(args.WorkingDirectory),
                FollowFinalSymlink: true,
            }
            var err error
            wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
                CheckSearchable: true,
            })
            if err != nil {
                return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
            }
            defer wd.DecRef(ctx)
        }
        opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
        fsContext = NewFSContextVFS2(root, wd, args.Umask)
    } else {
        mntns = args.MountNamespace
        if mntns == nil {
            mntns = k.GlobalInit().Leader().MountNamespace()
            mntns.IncRef()
        }
        // Get the root directory from the MountNamespace.
        root := mntns.Root()
        // The call to newFSContext below will take a reference on root, so we
        // don't need to hold this one.
        defer root.DecRef(ctx)

        // Grab the working directory.
        remainingTraversals := args.MaxSymlinkTraversals
        wd := root // Default.
        if args.WorkingDirectory != "" {
            var err error
            wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
            if err != nil {
                return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
            }
            defer wd.DecRef(ctx)
        }
        opener = fsbridge.NewFSLookup(mntns, root, wd)
        fsContext = newFSContext(root, wd, args.Umask)
    }

    tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
    cu := cleanup.Make(func() {
        tg.Release(ctx)
    })
    defer cu.Clean()

    // Check which file to start from.
    switch {
    case args.Filename != "":
        // If a filename is given, take that.
        // Set File to nil so we resolve the path in LoadTaskImage.
        args.File = nil
    case args.File != nil:
        // If File is set, take the File provided directly.
    default:
        // Otherwise look at Argv and see if the first argument is a valid
        // path.
        if len(args.Argv) == 0 {
            return nil, 0, fmt.Errorf("no filename or command provided")
        }
        if !filepath.IsAbs(args.Argv[0]) {
            return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
        }
        args.Filename = args.Argv[0]
    }

    // Create a fresh task context.
    remainingTraversals := args.MaxSymlinkTraversals
    loadArgs := loader.LoadArgs{
        Opener:              opener,
        RemainingTraversals: &remainingTraversals,
        ResolveFinal:        true,
        Filename:            args.Filename,
        File:                args.File,
        CloseOnExec:         false,
        Argv:                args.Argv,
        Envv:                args.Envv,
        Features:            k.featureSet,
    }

    image, se := k.LoadTaskImage(ctx, loadArgs)
    if se != nil {
        return nil, 0, errors.New(se.String())
    }

    // Take a reference on the FDTable, which will be transferred to
    // TaskSet.NewTask().
    args.FDTable.IncRef()

    // Create the task.
    config := &TaskConfig{
        Kernel:                  k,
        ThreadGroup:             tg,
        TaskImage:               image,
        FSContext:               fsContext,
        FDTable:                 args.FDTable,
        Credentials:             args.Credentials,
        NetworkNamespace:        k.RootNetworkNamespace(),
        AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
        UTSNamespace:            args.UTSNamespace,
        IPCNamespace:            args.IPCNamespace,
        AbstractSocketNamespace: args.AbstractSocketNamespace,
        MountNamespaceVFS2:      mntnsVFS2,
        ContainerID:             args.ContainerID,
    }
    t, err := k.tasks.NewTask(ctx, config)
    if err != nil {
        return nil, 0, err
    }
    t.traceExecEvent(image) // Simulate exec for tracing.

    // Success.
    cu.Release()
    tgid := k.tasks.Root.IDOfThreadGroup(tg)
    if k.globalInit == nil {
        k.globalInit = tg
    }
    return tg, tgid, nil
}
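// A hedged usage sketch (editor's example, not part of the original source):
// creating and starting a process. The FD table "fdTable" is assumed to have
// been built by the caller; Filename is left empty, so Argv[0] must be an
// absolute path:
//
//	tg, _, err := k.CreateProcess(CreateProcessArgs{
//		Argv:                    []string{"/bin/sh"},
//		Envv:                    []string{"PATH=/bin:/usr/bin"},
//		Credentials:             auth.NewRootCredentials(k.RootUserNamespace()),
//		FDTable:                 fdTable,
//		UTSNamespace:            k.RootUTSNamespace(),
//		IPCNamespace:            k.RootIPCNamespace(),
//		PIDNamespace:            k.RootPIDNamespace(),
//		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
//		Limits:                  limits.NewLimitSet(),
//	})
//	if err == nil {
//		k.StartProcess(tg) // required only if k.Start() was already called
//	}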
// StartProcess starts running a process that was created with CreateProcess.
func (k *Kernel) StartProcess(tg *ThreadGroup) {
    t := tg.Leader()
    tid := k.tasks.Root.IDOfTask(t)
    t.Start(tid)
}

// Start starts execution of all tasks in k.
//
// Preconditions: Start may be called exactly once.
func (k *Kernel) Start() error {
    k.extMu.Lock()
    defer k.extMu.Unlock()

    if k.globalInit == nil {
        return fmt.Errorf("kernel contains no tasks")
    }
    if k.started {
        return fmt.Errorf("kernel already started")
    }

    k.started = true
    k.cpuClockTicker = ktime.NewTimer(k.timekeeper.monotonicClock, newKernelCPUClockTicker(k))
    k.cpuClockTicker.Swap(ktime.Setting{
        Enabled: true,
        Period:  linux.ClockTick,
    })
    // If k was created by LoadKernelFrom, timers were stopped during
    // Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
    // this is a no-op.
    k.resumeTimeLocked(k.SupervisorContext())
    // Start task goroutines.
    k.tasks.mu.RLock()
    defer k.tasks.mu.RUnlock()
    for t, tid := range k.tasks.Root.tids {
        t.Start(tid)
    }
    return nil
}
// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
// Preconditions:
// * Any task goroutines running in k must be stopped.
// * k.extMu must be locked.
func (k *Kernel) pauseTimeLocked(ctx context.Context) {
    // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
    // Kernel.Start().
    if k.cpuClockTicker != nil {
        k.cpuClockTicker.Pause()
    }

    // By precondition, nothing else can be interacting with PIDNamespace.tids
    // or FDTable.files, so we can iterate them without synchronization. (We
    // can't hold the TaskSet mutex when pausing thread group timers because
    // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
    // mutex, while holding the Timer mutex.)
    for t := range k.tasks.Root.tids {
        if t == t.tg.leader {
            t.tg.itimerRealTimer.Pause()
            for _, it := range t.tg.timers {
                it.PauseTimer()
            }
        }
        // This means we'll iterate FDTables shared by multiple tasks
        // repeatedly, but ktime.Timer.Pause is idempotent so this is harmless.
        if t.fdTable != nil {
            t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
                if VFS2Enabled {
                    if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
                        tfd.PauseTimer()
                    }
                } else {
                    if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok {
                        tfd.PauseTimer()
                    }
                }
            })
        }
    }
    k.timekeeper.PauseUpdates()
}

// resumeTimeLocked resumes all Timers and Timekeeper updates. If
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
// Preconditions:
// * Any task goroutines running in k must be stopped.
// * k.extMu must be locked.
func (k *Kernel) resumeTimeLocked(ctx context.Context) {
    if k.cpuClockTicker != nil {
        k.cpuClockTicker.Resume()
    }

    k.timekeeper.ResumeUpdates()
    for t := range k.tasks.Root.tids {
        if t == t.tg.leader {
            t.tg.itimerRealTimer.Resume()
            for _, it := range t.tg.timers {
                it.ResumeTimer()
            }
        }
        if t.fdTable != nil {
            t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) {
                if VFS2Enabled {
                    if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok {
                        tfd.ResumeTimer()
                    }
                } else {
                    if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok {
                        tfd.ResumeTimer()
                    }
                }
            })
        }
    }
}
func (k *Kernel) incRunningTasks() {
    for {
        tasks := atomic.LoadInt64(&k.runningTasks)
        if tasks != 0 {
            // Standard case. Simply increment.
            if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) {
                continue
            }
            return
        }

        // Transition from 0 -> 1. Synchronize with other transitions and
        // timer.
        k.runningTasksMu.Lock()
        tasks = atomic.LoadInt64(&k.runningTasks)
        if tasks != 0 {
            // We're no longer the first task, no need to re-enable.
            atomic.AddInt64(&k.runningTasks, 1)
            k.runningTasksMu.Unlock()
            return
        }

        if !k.cpuClockTickerDisabled {
            // Timer was never disabled.
            atomic.StoreInt64(&k.runningTasks, 1)
            k.runningTasksMu.Unlock()
            return
        }

        // We need to update cpuClock for all of the ticks missed while we
        // slept, and then re-enable the timer.
        //
        // The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify
        // always increments cpuClock by 1 regardless of the number of
        // expirations as a heuristic to avoid over-accounting in cases of CPU
        // throttling.
        //
        // We want to cover the normal case, when all time should be accounted,
        // so we increment for all expirations. Throttling is less concerning
        // here because the ticker is only disabled from Notify. This means
        // that Notify must schedule and compensate for the throttled period
        // before the timer is disabled. Throttling while the timer is disabled
        // doesn't matter, as nothing is running or reading cpuClock anyways.
        //
        // S/R also adds complication, as there are two cases. Recall that
        // monotonicClock will jump forward on restore.
        //
        // 1. If the ticker is enabled during save, then on Restore Notify is
        // called with many expirations, covering the time jump, but cpuClock
        // is only incremented by 1.
        //
        // 2. If the ticker is disabled during save, then after Restore the
        // first wakeup will call this function and cpuClock will be
        // incremented by the number of expirations across the S/R.
        //
        // These cases cause very different values of cpuClock. But again,
        // since nothing was running while the ticker was disabled, those
        // differences don't matter.
        setting, exp := k.cpuClockTickerSetting.At(k.timekeeper.monotonicClock.Now())
        if exp > 0 {
            atomic.AddUint64(&k.cpuClock, exp)
        }

        // Now that cpuClock is updated it is safe to allow other tasks to
        // transition to running.
        atomic.StoreInt64(&k.runningTasks, 1)

        // N.B. we must unlock before calling Swap to maintain lock ordering.
        //
        // cpuClockTickerDisabled need not wait until after Swap to become
        // true. It is sufficient that the timer *will* be enabled.
        k.cpuClockTickerDisabled = false
        k.runningTasksMu.Unlock()

        // This won't call Notify (unless it's been ClockTick since setting.At
        // above). This means we skip the thread group work in Notify. However,
        // since nothing was running while we were disabled, none of the timers
        // could have expired.
        k.cpuClockTicker.Swap(setting)

        return
    }
}

func (k *Kernel) decRunningTasks() {
    tasks := atomic.AddInt64(&k.runningTasks, -1)
    if tasks < 0 {
        panic(fmt.Sprintf("Invalid running count %d", tasks))
    }

    // Nothing to do. The next CPU clock tick will disable the timer if
    // there is still nothing running. This provides approximately one tick
    // of slack in which we can switch back and forth between idle and
    // active without an expensive transition.
}

// WaitExited blocks until all tasks in k have exited.
func (k *Kernel) WaitExited() {
    k.tasks.liveGoroutines.Wait()
}

// Kill requests that all tasks in k immediately exit as if group exiting with
// status es. Kill does not wait for tasks to exit.
func (k *Kernel) Kill(es ExitStatus) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.tasks.Kill(es)
}

// Pause requests that all tasks in k temporarily stop executing, and blocks
// until all tasks and asynchronous I/O operations in k have stopped. Multiple
// calls to Pause nest and require an equal number of calls to Unpause to
// resume execution.
func (k *Kernel) Pause() {
    k.extMu.Lock()
    k.tasks.BeginExternalStop()
    k.extMu.Unlock()
    k.tasks.runningGoroutines.Wait()
    k.tasks.aioGoroutines.Wait()
}

// ReceiveTaskStates receives full states for all tasks.
func (k *Kernel) ReceiveTaskStates() {
    k.extMu.Lock()
    k.tasks.PullFullState()
    k.extMu.Unlock()
}

// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.tasks.EndExternalStop()
}
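// A hedged usage sketch (editor's example, not part of the original source):
// Pause/Unpause nest, so each caller brackets its own critical section and
// concurrent pausers compose correctly:
//
//	k.Pause()
//	defer k.Unpause()
//	// ... inspect or save kernel state while no tasks are running ...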
// SendExternalSignal injects a signal into the kernel.
//
// context is used only for debugging to describe how the signal was received.
//
// Preconditions: Kernel must have an init process.
func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.sendExternalSignal(info, context)
}

// SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup.
// This function doesn't skip signals like SendExternalSignal does.
func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    return tg.SendSignal(info)
}

// SendContainerSignal sends the given signal to all processes inside the
// namespace that match the given container ID.
func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.tasks.mu.RLock()
    defer k.tasks.mu.RUnlock()

    var lastErr error
    for tg := range k.tasks.Root.tgids {
        if tg.leader.ContainerID() == cid {
            tg.signalHandlers.mu.Lock()
            infoCopy := *info
            if err := tg.leader.sendSignalLocked(&infoCopy, true /* group */); err != nil {
                lastErr = err
            }
            tg.signalHandlers.mu.Unlock()
        }
    }
    return lastErr
}
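// A hedged usage sketch (editor's example, not part of the original source):
// delivering SIGTERM to every process in a container; linux.SignalInfo's
// Signo field carries the signal number:
//
//	info := &linux.SignalInfo{Signo: int32(linux.SIGTERM)}
//	if err := k.SendContainerSignal(cid, info); err != nil {
//		// at least one thread group failed to accept the signal
//	}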
// RebuildTraceContexts rebuilds the trace context for all tasks.
//
// Unfortunately, if these are built while tracing is not enabled, then we
// will not have meaningful trace data. Rebuilding here ensures that we can do
// so after tracing has been enabled.
func (k *Kernel) RebuildTraceContexts() {
    // We need to pause all task goroutines because Task.rebuildTraceContext()
    // replaces Task.traceContext and Task.traceTask, which are
    // task-goroutine-exclusive (i.e. the task goroutine assumes that it can
    // access them without synchronization) for performance.
    k.Pause()
    defer k.Unpause()

    k.extMu.Lock()
    defer k.extMu.Unlock()
    k.tasks.mu.RLock()
    defer k.tasks.mu.RUnlock()

    for t, tid := range k.tasks.Root.tids {
        t.rebuildTraceContext(tid)
    }
}

// FeatureSet returns the FeatureSet.
func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
    return k.featureSet
}

// Timekeeper returns the Timekeeper.
func (k *Kernel) Timekeeper() *Timekeeper {
    return k.timekeeper
}

// TaskSet returns the TaskSet.
func (k *Kernel) TaskSet() *TaskSet {
    return k.tasks
}

// RootUserNamespace returns the root UserNamespace.
func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
    return k.rootUserNamespace
}

// RootUTSNamespace returns the root UTSNamespace.
func (k *Kernel) RootUTSNamespace() *UTSNamespace {
    return k.rootUTSNamespace
}

// RootIPCNamespace takes a reference and returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
    k.rootIPCNamespace.IncRef()
    return k.rootIPCNamespace
}

// RootPIDNamespace returns the root PIDNamespace.
func (k *Kernel) RootPIDNamespace() *PIDNamespace {
    return k.tasks.Root
}

// RootAbstractSocketNamespace returns the root AbstractSocketNamespace.
func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
    return k.rootAbstractSocketNamespace
}

// RootNetworkNamespace returns the root network namespace, always non-nil.
func (k *Kernel) RootNetworkNamespace() *inet.Namespace {
    return k.rootNetworkNamespace
}

// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
// nil if no such thread group exists. GlobalInit may return a thread group
// containing no tasks if the thread group has already exited.
func (k *Kernel) GlobalInit() *ThreadGroup {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    return k.globalInit
}

// TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID
// namespace.
func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) {
    k.globalInit = tg
}

// ApplicationCores returns the number of CPUs visible to sandboxed
// applications.
func (k *Kernel) ApplicationCores() uint {
    return k.applicationCores
}

// RealtimeClock returns the application CLOCK_REALTIME clock.
func (k *Kernel) RealtimeClock() ktime.Clock {
    return k.timekeeper.realtimeClock
}

// MonotonicClock returns the application CLOCK_MONOTONIC clock.
func (k *Kernel) MonotonicClock() ktime.Clock {
    return k.timekeeper.monotonicClock
}

// CPUClockNow returns the current value of k.cpuClock.
func (k *Kernel) CPUClockNow() uint64 {
    return atomic.LoadUint64(&k.cpuClock)
}

// Syslog returns the syslog.
func (k *Kernel) Syslog() *syslog {
    return &k.syslog
}

// GenerateInotifyCookie generates a unique inotify event cookie.
//
// Returned values may overlap with previously returned values if the value
// space is exhausted. 0 is not a valid cookie value; all other values
// representable in a uint32 are allowed.
func (k *Kernel) GenerateInotifyCookie() uint32 {
    id := atomic.AddUint32(&k.nextInotifyCookie, 1)
    // Wrap-around is explicitly allowed for inotify event cookies.
    if id == 0 {
        id = atomic.AddUint32(&k.nextInotifyCookie, 1)
    }
    return id
}

// NetlinkPorts returns the netlink port manager.
func (k *Kernel) NetlinkPorts() *port.Manager {
    return k.netlinkPorts
}

var (
    errSaved     = errors.New("sandbox has been successfully saved")
    errAutoSaved = errors.New("sandbox has been successfully auto-saved")
)

// SaveStatus returns the sandbox save status. If it was saved successfully,
// autosaved indicates whether save was triggered by autosave. If it was not
// saved successfully, err indicates the sandbox error that caused the kernel
// to exit during save.
func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    switch k.saveStatus {
    case nil:
        return false, false, nil
    case errSaved:
        return true, false, nil
    case errAutoSaved:
        return true, true, nil
    default:
        return false, false, k.saveStatus
    }
}

// SetSaveSuccess sets the flag indicating that save completed successfully,
// if no status was already set.
func (k *Kernel) SetSaveSuccess(autosave bool) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    if k.saveStatus == nil {
        if autosave {
            k.saveStatus = errAutoSaved
        } else {
            k.saveStatus = errSaved
        }
    }
}
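// A hedged usage sketch (editor's example, not part of the original source):
// interpreting the three-way result of SaveStatus:
//
//	saved, autosaved, err := k.SaveStatus()
//	switch {
//	case err != nil:
//		// the sandbox exited during save with this error
//	case saved && autosaved:
//		// saved successfully, triggered by autosave
//	case saved:
//		// saved successfully, triggered explicitly
//	}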
// SetSaveError sets the sandbox error that caused the kernel to exit during
// save, if one is not already set.
func (k *Kernel) SetSaveError(err error) {
    k.extMu.Lock()
    defer k.extMu.Unlock()
    if k.saveStatus == nil {
        k.saveStatus = err
    }
}

// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
// LoadFrom.
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
    k.mf = mf
}

// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile.
func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
    return k.mf
}

// SupervisorContext returns a Context with maximum privileges in k. It should
// only be used by goroutines outside the control of the emulated kernel
// defined by k.
//
// Callers are responsible for ensuring that the returned Context is not used
// concurrently with changes to the Kernel.
func (k *Kernel) SupervisorContext() context.Context {
    return supervisorContext{
        Logger: log.Log(),
        k:      k,
    }
}

// SocketRecord represents a socket recorded in Kernel.socketsVFS2.
//
// +stateify savable
type SocketRecord struct {
    k        *Kernel
    Sock     *refs.WeakRef        // TODO(github.com/SagerNet/issue/1624): Only used by VFS1.
    SockVFS2 *vfs.FileDescription // Only used by VFS2.
    ID       uint64               // Socket table entry number.
}

// SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It
// implements refs.WeakRefUser for sockets stored in the socket table.
//
// +stateify savable
type SocketRecordVFS1 struct {
    socketEntry
    SocketRecord
}

// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
func (s *SocketRecordVFS1) WeakRefGone(context.Context) {
    s.k.extMu.Lock()
    s.k.sockets.Remove(s)
    s.k.extMu.Unlock()
}

// RecordSocket adds a socket to the system-wide socket table for tracking.
//
// Precondition: Caller must hold a reference to sock.
func (k *Kernel) RecordSocket(sock *fs.File) {
    k.extMu.Lock()
    id := k.nextSocketRecord
    k.nextSocketRecord++
    s := &SocketRecordVFS1{
        SocketRecord: SocketRecord{
            k:  k,
            ID: id,
        },
    }
    s.Sock = refs.NewWeakRef(sock, s)
    k.sockets.PushBack(s)
    k.extMu.Unlock()
}

// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for
// tracking.
//
// Precondition: Caller must hold a reference to sock.
//
// Note that the socket table will not hold a reference on the
// vfs.FileDescription.
func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) {
    k.extMu.Lock()
    if _, ok := k.socketsVFS2[sock]; ok {
        panic(fmt.Sprintf("Socket %p added twice", sock))
    }
    id := k.nextSocketRecord
    k.nextSocketRecord++
    s := &SocketRecord{
        k:        k,
        ID:       id,
        SockVFS2: sock,
    }
    k.socketsVFS2[sock] = s
    k.extMu.Unlock()
}

// DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table.
func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) {
    k.extMu.Lock()
    delete(k.socketsVFS2, sock)
    k.extMu.Unlock()
}
// ListSockets returns a snapshot of all sockets.
//
// Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef()
// to get a reference on a socket in the table.
func (k *Kernel) ListSockets() []*SocketRecord {
	k.extMu.Lock()
	var socks []*SocketRecord
	if VFS2Enabled {
		for _, s := range k.socketsVFS2 {
			socks = append(socks, s)
		}
	} else {
		for s := k.sockets.Front(); s != nil; s = s.Next() {
			socks = append(socks, &s.SocketRecord)
		}
	}
	k.extMu.Unlock()
	return socks
}

// supervisorContext is a privileged context.
type supervisorContext struct {
	context.NoopSleeper
	log.Logger
	k *Kernel
}

// Value implements context.Context.
func (ctx supervisorContext) Value(key interface{}) interface{} {
	switch key {
	case CtxCanTrace:
		// The supervisor context can trace anything. (None of
		// supervisorContext's users are expected to invoke ptrace, but ptrace
		// permissions are required for certain file accesses.)
		return func(*Task, bool) bool { return true }
	case CtxKernel:
		return ctx.k
	case CtxPIDNamespace:
		return ctx.k.tasks.Root
	case CtxUTSNamespace:
		return ctx.k.rootUTSNamespace
	case CtxIPCNamespace:
		ipcns := ctx.k.rootIPCNamespace
		ipcns.IncRef()
		return ipcns
	case auth.CtxCredentials:
		// The supervisor context is global root.
		return auth.NewRootCredentials(ctx.k.rootUserNamespace)
	case fs.CtxRoot:
		if ctx.k.globalInit != nil {
			return ctx.k.globalInit.mounts.Root()
		}
		return nil
	case vfs.CtxRoot:
		if ctx.k.globalInit == nil {
			return vfs.VirtualDentry{}
		}
		root := ctx.k.GlobalInit().Leader().MountNamespaceVFS2().Root()
		root.IncRef()
		return root
	case vfs.CtxMountNamespace:
		if ctx.k.globalInit == nil {
			return nil
		}
		mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
		mntns.IncRef()
		return mntns
	case fs.CtxDirentCacheLimiter:
		return ctx.k.DirentCacheLimiter
	case inet.CtxStack:
		return ctx.k.RootNetworkNamespace().Stack()
	case ktime.CtxRealtimeClock:
		return ctx.k.RealtimeClock()
	case limits.CtxLimits:
		// No limits apply.
		return limits.NewLimitSet()
	case pgalloc.CtxMemoryFile:
		return ctx.k.mf
	case pgalloc.CtxMemoryFileProvider:
		return ctx.k
	case platform.CtxPlatform:
		return ctx.k
	case uniqueid.CtxGlobalUniqueID:
		return ctx.k.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.k
	case uniqueid.CtxInotifyCookie:
		return ctx.k.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.k
	default:
		return nil
	}
}

// Rate limits for the number of unimplemented syscall events.
const (
	unimplementedSyscallsMaxRate = 100  // events per second
	unimplementedSyscallBurst    = 1000 // events
)

// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
// channel.
func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
	k.unimplementedSyscallEmitterOnce.Do(func() {
		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
	})

	t := TaskFromContext(ctx)
	_, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
		Tid:       int32(t.ThreadID()),
		Registers: t.Arch().StateData().Proto(),
	})
}
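// To illustrate the TryIncRef pattern that the ListSockets comment describes
// (a hypothetical sketch, not in the original source): because the socket
// table holds no reference of its own, VFS2 callers must re-acquire a
// reference on each snapshotted socket before touching it, and skip sockets
// that have already been destroyed.
//
// forEachLiveSocketVFS2 invokes fn on every VFS2 socket in the table that is
// still live at iteration time.
func forEachLiveSocketVFS2(ctx context.Context, k *Kernel, fn func(*vfs.FileDescription)) {
	for _, s := range k.ListSockets() {
		if s.SockVFS2 == nil || !s.SockVFS2.TryIncRef() {
			// Socket is VFS1, or has already been destroyed; skip it.
			continue
		}
		fn(s.SockVFS2)
		s.SockVFS2.DecRef(ctx)
	}
}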
// VFS returns the virtual filesystem for the kernel.
func (k *Kernel) VFS() *vfs.VirtualFilesystem {
	return &k.vfs
}

// SetHostMount sets the hostfs mount.
func (k *Kernel) SetHostMount(mnt *vfs.Mount) {
	if k.hostMount != nil {
		panic("Kernel.hostMount cannot be set more than once")
	}
	k.hostMount = mnt
}

// HostMount returns the hostfs mount.
func (k *Kernel) HostMount() *vfs.Mount {
	return k.hostMount
}

// PipeMount returns the pipefs mount.
func (k *Kernel) PipeMount() *vfs.Mount {
	return k.pipeMount
}

// ShmMount returns the tmpfs mount.
func (k *Kernel) ShmMount() *vfs.Mount {
	return k.shmMount
}

// SocketMount returns the sockfs mount.
func (k *Kernel) SocketMount() *vfs.Mount {
	return k.socketMount
}

// CgroupRegistry returns the cgroup registry.
func (k *Kernel) CgroupRegistry() *CgroupRegistry {
	return k.cgroupRegistry
}

// Release releases resources owned by k.
//
// Precondition: This should only be called after the kernel is fully
// initialized, e.g. after k.Start() has been called.
func (k *Kernel) Release() {
	ctx := k.SupervisorContext()
	if VFS2Enabled {
		k.hostMount.DecRef(ctx)
		k.pipeMount.DecRef(ctx)
		k.shmMount.DecRef(ctx)
		k.socketMount.DecRef(ctx)
		k.vfs.Release(ctx)
	}
	k.timekeeper.Destroy()
	k.vdso.Release(ctx)
}

// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup
// hierarchy.
//
// Precondition: root must be a new cgroup with no tasks. This implies the
// controllers for root are also new and currently manage no task, which in
// turn implies the new cgroup can be populated without migrating tasks
// between cgroups.
func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) {
	k.tasks.mu.RLock()
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		// A task may already be in the cgroup if it was created after the
		// cgroup hierarchy was registered.
		t.enterCgroupIfNotYetLocked(root)
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()
}

// ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the
// hierarchy with the provided id. This is intended for use during hierarchy
// teardown, as otherwise the tasks would be orphaned w.r.t. some controllers.
func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) {
	k.tasks.mu.RLock()
	k.tasks.forEachTaskLocked(func(t *Task) {
		if t.exitState != TaskExitNone {
			return
		}
		t.mu.Lock()
		for cg := range t.cgroups {
			if cg.HierarchyID() == hid {
				t.leaveCgroupLocked(cg)
			}
		}
		t.mu.Unlock()
	})
	k.tasks.mu.RUnlock()
}
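// An illustrative sketch (hypothetical; not in the original source) of how
// the two cgroup helpers above pair up: a hierarchy is populated once,
// immediately after it is registered, and drained once at teardown, so tasks
// are never left orphaned with respect to a controller. HierarchyID() is used
// here as it is in ReleaseCgroupHierarchy above.
//
// withCgroupHierarchy populates root, runs fn, then releases the hierarchy.
func withCgroupHierarchy(k *Kernel, root Cgroup, fn func()) {
	k.PopulateNewCgroupHierarchy(root) // root must be fresh, with no tasks.
	defer k.ReleaseCgroupHierarchy(root.HierarchyID())
	fn()
}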