github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/boot/loader.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package boot loads the kernel and runs a container.
package boot

import (
	"errors"
	"fmt"
	mrand "math/rand"
	"os"
	"runtime"
	"sync/atomic"
	gtime "time"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/bpf"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/coverage"
	"github.com/SagerNet/gvisor/pkg/cpuid"
	"github.com/SagerNet/gvisor/pkg/fd"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/memutil"
	"github.com/SagerNet/gvisor/pkg/rand"
	"github.com/SagerNet/gvisor/pkg/refs"
	"github.com/SagerNet/gvisor/pkg/refsvfs2"
	"github.com/SagerNet/gvisor/pkg/sentry/control"
	"github.com/SagerNet/gvisor/pkg/sentry/fdimport"
	"github.com/SagerNet/gvisor/pkg/sentry/fs"
	"github.com/SagerNet/gvisor/pkg/sentry/fs/host"
	"github.com/SagerNet/gvisor/pkg/sentry/fs/user"
	hostvfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/host"
	"github.com/SagerNet/gvisor/pkg/sentry/inet"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
	"github.com/SagerNet/gvisor/pkg/sentry/loader"
	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	"github.com/SagerNet/gvisor/pkg/sentry/sighandling"
	"github.com/SagerNet/gvisor/pkg/sentry/socket/netfilter"
	"github.com/SagerNet/gvisor/pkg/sentry/syscalls/linux/vfs2"
	"github.com/SagerNet/gvisor/pkg/sentry/time"
	"github.com/SagerNet/gvisor/pkg/sentry/usage"
	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
	"github.com/SagerNet/gvisor/pkg/sentry/watchdog"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/tcpip"
	"github.com/SagerNet/gvisor/pkg/tcpip/link/loopback"
	"github.com/SagerNet/gvisor/pkg/tcpip/link/sniffer"
	"github.com/SagerNet/gvisor/pkg/tcpip/network/arp"
	"github.com/SagerNet/gvisor/pkg/tcpip/network/ipv4"
	"github.com/SagerNet/gvisor/pkg/tcpip/network/ipv6"
	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
	"github.com/SagerNet/gvisor/pkg/tcpip/transport/icmp"
	"github.com/SagerNet/gvisor/pkg/tcpip/transport/raw"
	"github.com/SagerNet/gvisor/pkg/tcpip/transport/tcp"
	"github.com/SagerNet/gvisor/pkg/tcpip/transport/udp"
	"github.com/SagerNet/gvisor/runsc/boot/filter"
	_ "github.com/SagerNet/gvisor/runsc/boot/platforms" // register all platforms.
	"github.com/SagerNet/gvisor/runsc/boot/pprof"
	"github.com/SagerNet/gvisor/runsc/config"
	"github.com/SagerNet/gvisor/runsc/specutils"
	"github.com/SagerNet/gvisor/runsc/specutils/seccomp"

	// Top-level inet providers.
	"github.com/SagerNet/gvisor/pkg/sentry/socket/hostinet"
	"github.com/SagerNet/gvisor/pkg/sentry/socket/netstack"

	// Include other supported socket providers.
	_ "github.com/SagerNet/gvisor/pkg/sentry/socket/netlink"
	_ "github.com/SagerNet/gvisor/pkg/sentry/socket/netlink/route"
	_ "github.com/SagerNet/gvisor/pkg/sentry/socket/netlink/uevent"
	_ "github.com/SagerNet/gvisor/pkg/sentry/socket/unix"
)

type containerInfo struct {
	conf *config.Config

	// spec is the base configuration for the root container.
	spec *specs.Spec

	// procArgs refers to the container's init task.
	procArgs kernel.CreateProcessArgs

	// stdioFDs contains stdin, stdout, and stderr.
	stdioFDs []*fd.FD

	// goferFDs are the FDs that attach the sandbox to the gofers.
	goferFDs []*fd.FD
}

// Loader keeps state needed to start the kernel and run the container.
type Loader struct {
	// k is the kernel.
	k *kernel.Kernel

	// ctrl is the control server.
	ctrl *controller

	// root contains information about the root container in the sandbox.
	root containerInfo

	watchdog *watchdog.Watchdog

	// stopSignalForwarding disables forwarding of signals to the sandboxed
	// container. It should be called when a sandbox is destroyed.
	stopSignalForwarding func()

	// restore is set to true if we are restoring a container.
	restore bool

	// sandboxID is the ID for the whole sandbox.
	sandboxID string

	// mu guards processes.
	mu sync.Mutex

	// processes maps containers' init processes and invocations of exec. Root
	// processes are keyed with the container ID and pid=0, while exec
	// invocations have the corresponding pid set.
	//
	// processes is guarded by mu.
	processes map[execID]*execProcess

	// mountHints provides extra information about mounts for containers that
	// apply to the entire pod.
	mountHints *podMountHints
}

// execID uniquely identifies a sentry process that is executed in a container.
type execID struct {
	cid string
	pid kernel.ThreadID
}

// execProcess contains the thread group and host TTY of a sentry process.
type execProcess struct {
	// tg will be nil for containers that haven't started yet.
	tg *kernel.ThreadGroup

	// tty will be nil if the process is not attached to a terminal.
	tty *host.TTYFileOperations

	// ttyVFS2 will be nil if the process is not attached to a terminal.
	ttyVFS2 *hostvfs2.TTYFileDescription

	// pidnsPath is the pid namespace path in the spec.
	pidnsPath string

	// hostTTY is present when creating a sub-container with terminal enabled.
	// TTY file is passed during container create and must be saved until
	// container start.
	hostTTY *fd.FD
}

func init() {
	// Initialize the random number generator.
	mrand.Seed(gtime.Now().UnixNano())
}

// Args are the arguments for New().
type Args struct {
	// ID is the sandbox ID.
	ID string
	// Spec is the sandbox specification.
	Spec *specs.Spec
	// Conf is the system configuration.
	Conf *config.Config
	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
	// of this FD and may close it at any time.
	ControllerFD int
	// Device is an optional argument that is passed to the platform. The Loader
	// takes ownership of this file and may close it at any time.
	Device *os.File
	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
	// takes ownership of these FDs and may close them at any time.
	GoferFDs []int
	// StdioFDs is the stdio for the application. The Loader takes ownership of
	// these FDs and may close them at any time.
	StdioFDs []int
	// NumCPU is the number of CPUs to create inside the sandbox.
	NumCPU int
	// TotalMem is the initial amount of total memory to report back to the
	// container.
	TotalMem uint64
	// UserLogFD is the file descriptor to write user logs to.
	UserLogFD int
}
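
// exampleNewArgs is a minimal, hypothetical sketch (it is not used anywhere in
// runsc) of how a caller assembles Args for New. The concrete values below are
// placeholders; the real ones come from the OCI runtime spec, the runsc
// configuration, and FDs inherited from the parent process. Note that the
// Loader takes ownership of every FD passed here.
func exampleNewArgs(spec *specs.Spec, conf *config.Config, ctrlFD int, stdio, gofers []int) Args {
	return Args{
		ID:           "sandbox-0", // hypothetical sandbox ID
		Spec:         spec,
		Conf:         conf,
		ControllerFD: ctrlFD,
		GoferFDs:     gofers,
		StdioFDs:     stdio,   // must be exactly stdin, stdout, stderr
		NumCPU:       0,       // 0 lets New default to runtime.NumCPU()
		TotalMem:     2 << 30, // hypothetical: report 2 GiB via /proc/meminfo
		UserLogFD:    3,       // hypothetical FD for user-visible compat logs
	}
}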

// startingStdioFD is the first host FD used for the remapped stdio FDs. It
// makes sure stdio FDs are always the same on initial start and on restore.
const startingStdioFD = 256

// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
func New(args Args) (*Loader, error) {
	// We initialize the rand package now to make sure /dev/urandom is pre-opened
	// on kernels that do not support getrandom(2).
	if err := rand.Init(); err != nil {
		return nil, fmt.Errorf("setting up rand: %w", err)
	}

	if err := usage.Init(); err != nil {
		return nil, fmt.Errorf("setting up memory usage: %w", err)
	}

	// Is this a VFSv2 kernel?
	if args.Conf.VFS2 {
		kernel.VFS2Enabled = true
		if args.Conf.FUSE {
			kernel.FUSEEnabled = true
		}

		vfs2.Override()
	}

	// Make host FDs stable between invocations. Host FDs must map to the exact
	// same number when the sandbox is restored. Otherwise the wrong FD will be
	// used.
	info := containerInfo{}
	newfd := startingStdioFD

	for _, stdioFD := range args.StdioFDs {
		// Check that newfd is unused to avoid clobbering it.
		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
			if err != nil {
				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
			}
			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
		}

		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
		if err != nil {
			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
		}
		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
		_ = unix.Close(stdioFD)
		newfd++
	}
	for _, goferFD := range args.GoferFDs {
		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
	}

	// Create kernel and platform.
	p, err := createPlatform(args.Conf, args.Device)
	if err != nil {
		return nil, fmt.Errorf("creating platform: %w", err)
	}
	k := &kernel.Kernel{
		Platform: p,
	}

	// Create memory file.
	mf, err := createMemoryFile()
	if err != nil {
		return nil, fmt.Errorf("creating memory file: %w", err)
	}
	k.SetMemoryFile(mf)

	// Create VDSO.
	//
	// Pass k as the platform since it is savable, unlike the actual platform.
	vdso, err := loader.PrepareVDSO(k)
	if err != nil {
		return nil, fmt.Errorf("creating vdso: %w", err)
	}

	// Create timekeeper.
	tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
	tk.SetClocks(time.NewCalibratedClocks())

	if err := enableStrace(args.Conf); err != nil {
		return nil, fmt.Errorf("enabling strace: %w", err)
	}

	// Create root network namespace/stack.
	netns, err := newRootNetworkNamespace(args.Conf, tk, k)
	if err != nil {
		return nil, fmt.Errorf("creating network: %w", err)
	}

	// Create capabilities.
	caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
	if err != nil {
		return nil, fmt.Errorf("converting capabilities: %w", err)
	}

	// Convert the spec's additional GIDs to KGIDs.
	extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
	for _, GID := range args.Spec.Process.User.AdditionalGids {
		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
	}

	// Create credentials.
	creds := auth.NewUserCredentials(
		auth.KUID(args.Spec.Process.User.UID),
		auth.KGID(args.Spec.Process.User.GID),
		extraKGIDs,
		caps,
		auth.NewRootUserNamespace())

	if args.NumCPU == 0 {
		args.NumCPU = runtime.NumCPU()
	}
	log.Infof("CPUs: %d", args.NumCPU)
	runtime.GOMAXPROCS(args.NumCPU)

	if args.TotalMem > 0 {
		// Adjust the total memory returned by the Sentry so that applications that
		// use /proc/meminfo can make allocations based on this limit.
		usage.MaximumTotalMemoryBytes = args.TotalMem
		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
	}

	// Initiate the Kernel object, which is required by the Context passed
	// to createVFS in order to mount (among other things) procfs.
	if err = k.Init(kernel.InitKernelArgs{
		FeatureSet:                  cpuid.HostFeatureSet(),
		Timekeeper:                  tk,
		RootUserNamespace:           creds.UserNamespace,
		RootNetworkNamespace:        netns,
		ApplicationCores:            uint(args.NumCPU),
		Vdso:                        vdso,
		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
	}); err != nil {
		return nil, fmt.Errorf("initializing kernel: %w", err)
	}

	if kernel.VFS2Enabled {
		if err := registerFilesystems(k); err != nil {
			return nil, fmt.Errorf("registering filesystems: %w", err)
		}
	}

	if err := adjustDirentCache(k); err != nil {
		return nil, err
	}

	// Turn on packet logging if enabled.
	if args.Conf.LogPackets {
		log.Infof("Packet logging enabled")
		atomic.StoreUint32(&sniffer.LogPackets, 1)
	} else {
		log.Infof("Packet logging disabled")
		atomic.StoreUint32(&sniffer.LogPackets, 0)
	}

	// Create a watchdog.
	dogOpts := watchdog.DefaultOpts
	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
	dog := watchdog.New(k, dogOpts)

	procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
	if err != nil {
		return nil, fmt.Errorf("creating init process for root container: %w", err)
	}
	info.procArgs = procArgs

	if err := initCompatLogs(args.UserLogFD); err != nil {
		return nil, fmt.Errorf("initializing compat logs: %w", err)
	}

	mountHints, err := newPodMountHints(args.Spec)
	if err != nil {
		return nil, fmt.Errorf("creating pod mount hints: %w", err)
	}

	info.conf = args.Conf
	info.spec = args.Spec

	if kernel.VFS2Enabled {
		// Set up host mount that will be used for imported fds.
		hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS())
		if err != nil {
			return nil, fmt.Errorf("failed to create hostfs filesystem: %w", err)
		}
		defer hostFilesystem.DecRef(k.SupervisorContext())
		hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
		if err != nil {
			return nil, fmt.Errorf("failed to create hostfs mount: %w", err)
		}
		k.SetHostMount(hostMount)
	}

	eid := execID{cid: args.ID}
	l := &Loader{
		k:          k,
		watchdog:   dog,
		sandboxID:  args.ID,
		processes:  map[execID]*execProcess{eid: {}},
		mountHints: mountHints,
		root:       info,
	}

	// We don't care about child signals; some platforms can generate a
	// tremendous number of useless ones (I'm looking at you, ptrace).
	if err := sighandling.IgnoreChildStop(); err != nil {
		return nil, fmt.Errorf("ignore child stop signals failed: %w", err)
	}

	// Create the control server using the provided FD.
	//
	// This must be done *after* we have initialized the kernel since the
	// controller is used to configure the kernel's network stack.
	ctrl, err := newController(args.ControllerFD, l)
	if err != nil {
		return nil, fmt.Errorf("creating control server: %w", err)
	}
	l.ctrl = ctrl

	// Only start serving after the Loader and the controller have been wired up
	// to each other, because both are used in the urpc methods.
	if err := ctrl.srv.StartServing(); err != nil {
		return nil, fmt.Errorf("starting control server: %w", err)
	}

	return l, nil
}

// createProcessArgs creates args that can be used with kernel.CreateProcess.
func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
	// Create initial limits.
	ls, err := createLimitSet(spec)
	if err != nil {
		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %w", err)
	}
	env, err := specutils.ResolveEnvs(spec.Process.Env)
	if err != nil {
		return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err)
	}

	wd := spec.Process.Cwd
	if wd == "" {
		wd = "/"
	}

	// Create the process arguments.
	procArgs := kernel.CreateProcessArgs{
		Argv:                    spec.Process.Args,
		Envv:                    env,
		WorkingDirectory:        wd,
		Credentials:             creds,
		Umask:                   0022,
		Limits:                  ls,
		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
		UTSNamespace:            k.RootUTSNamespace(),
		IPCNamespace:            k.RootIPCNamespace(),
		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
		ContainerID:             id,
		PIDNamespace:            pidns,
	}

	return procArgs, nil
}

// Destroy cleans up all resources used by the loader.
//
// Note that this will block until all open control server connections have
// been closed. For that reason, this should NOT be called in a defer, because
// a panic in a control server rpc would then hang forever.
func (l *Loader) Destroy() {
	if l.stopSignalForwarding != nil {
		l.stopSignalForwarding()
	}
	l.watchdog.Stop()

	// Stop the control server. This will indirectly stop any
	// long-running control operations that are in flight, e.g.
	// profiling operations.
	l.ctrl.stop()

	// Release all kernel resources. This is only safe after we can no longer
	// save/restore.
	l.k.Release()

	// In the success case, stdioFDs and goferFDs will only contain
	// released/closed FDs whose ownership has been passed on to host FDs and
	// gofer sessions. Close them here in case of failure.
	for _, f := range l.root.stdioFDs {
		_ = f.Close()
	}
	for _, f := range l.root.goferFDs {
		_ = f.Close()
	}
}

func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
	p, err := platform.Lookup(conf.Platform)
	if err != nil {
		panic(fmt.Sprintf("invalid platform %s: %s", conf.Platform, err))
	}
	log.Infof("Platform: %s", conf.Platform)
	return p.New(deviceFile)
}

func createMemoryFile() (*pgalloc.MemoryFile, error) {
	const memfileName = "runsc-memory"
	memfd, err := memutil.CreateMemFD(memfileName, 0)
	if err != nil {
		return nil, fmt.Errorf("error creating memfd: %w", err)
	}
	memfile := os.NewFile(uintptr(memfd), memfileName)
	// We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
	// there are memory cgroups specified, because at this point we're already
	// in a mount namespace in which the relevant cgroupfs is not visible.
	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
	if err != nil {
		_ = memfile.Close()
		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err)
	}
	return mf, nil
}

// installSeccompFilters installs sandbox seccomp filters with the host.
func (l *Loader) installSeccompFilters() error {
	if l.root.conf.DisableSeccomp {
		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
	} else {
		opts := filter.Options{
			Platform:      l.k.Platform,
			HostNetwork:   l.root.conf.Network == config.NetworkHost,
			ProfileEnable: l.root.conf.ProfileEnable,
			ControllerFD:  l.ctrl.srv.FD(),
		}
		if err := filter.Install(opts); err != nil {
			return fmt.Errorf("installing seccomp filters: %w", err)
		}
	}
	return nil
}

// Run runs the root container.
func (l *Loader) Run() error {
	err := l.run()
	l.ctrl.manager.startResultChan <- err
	if err != nil {
		// Give the controller some time to send the error to the
		// runtime. If we return too quickly here the process will exit
		// and the control connection will be closed before the error
		// is returned.
		gtime.Sleep(2 * gtime.Second)
		return err
	}
	return nil
}

func (l *Loader) run() error {
	if l.root.conf.Network == config.NetworkHost {
		// Delay host network configuration to this point because network namespace
		// is configured after the loader is created and before Run() is called.
		log.Debugf("Configuring host network")
		s := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
		if err := s.Configure(); err != nil {
			return err
		}
	}

	l.mu.Lock()
	defer l.mu.Unlock()

	eid := execID{cid: l.sandboxID}
	ep, ok := l.processes[eid]
	if !ok {
		return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
	}

	// If we are restoring, we do not want to create a process.
	// l.restore is set by the container manager when a restore call is made.
	if !l.restore {
		if l.root.conf.ProfileEnable {
			pprof.Initialize()
		}

		// Finally done with all configuration. Set up filters before user code
		// is loaded.
		if err := l.installSeccompFilters(); err != nil {
			return err
		}

		// Create the root container init task. It will begin running
		// when the kernel is started.
		var err error
		_, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(true, l.sandboxID, &l.root)
		if err != nil {
			return err
		}
	}

	ep.tg = l.k.GlobalInit()
	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
		ep.pidnsPath = ns.Path
	}

	// Handle signals by forwarding them to the root container process
	// (except for panic signal, which should cause a panic).
	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
		// Panic signal should cause a panic.
		if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
			panic("Signal-induced panic")
		}

		// Otherwise forward to root container.
		deliveryMode := DeliverToProcess
		if l.root.spec.Process.Terminal {
			// Since we are running with a console, we should forward the signal to
			// the foreground process group so that job control signals like ^C can
			// be handled properly.
			deliveryMode = DeliverToForegroundProcessGroup
		}
		log.Infof("Received external signal %d, mode: %s", sig, deliveryMode)
		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
			log.Warningf("error sending signal %s to container %q: %s", sig, l.sandboxID, err)
		}
	})

	log.Infof("Process should have started...")
	l.watchdog.Start()
	return l.k.Start()
}

// createContainer creates a new container inside the sandbox.
func (l *Loader) createContainer(cid string, tty *fd.FD) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	eid := execID{cid: cid}
	if _, ok := l.processes[eid]; ok {
		return fmt.Errorf("container %q already exists", cid)
	}
	l.processes[eid] = &execProcess{hostTTY: tty}
	return nil
}
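
// exampleStartSubcontainer is a hypothetical sketch (not part of the loader; in
// runsc the controller's urpc handlers drive this) of the two-step
// sub-container lifecycle: createContainer registers the container and stashes
// its TTY, if any, at "create" time, and startContainer later builds and starts
// its init task.
func (l *Loader) exampleStartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, tty *fd.FD, stdioFDs, goferFDs []*fd.FD) error {
	if err := l.createContainer(cid, tty); err != nil {
		return fmt.Errorf("creating container %q: %w", cid, err)
	}
	// startContainer consumes or closes the stdio and gofer FDs; the caller
	// should not reuse them after this call.
	return l.startContainer(spec, conf, cid, stdioFDs, goferFDs)
}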

// startContainer starts a child container. Used FDs are either closed or
// released. It's safe for the caller to close any remaining files upon return.
func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error {
	// Create capabilities.
	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
	if err != nil {
		return fmt.Errorf("creating capabilities: %w", err)
	}

	l.mu.Lock()
	defer l.mu.Unlock()

	ep := l.processes[execID{cid: cid}]
	if ep == nil {
		return fmt.Errorf("trying to start a deleted container %q", cid)
	}

	// Convert the spec's additional GIDs to KGIDs.
	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
	for _, GID := range spec.Process.User.AdditionalGids {
		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
	}

	// Create credentials. We reuse the root user namespace because the
	// sentry currently supports only 1 mount namespace, which is tied to a
	// single user namespace. Thus we must run in the same user namespace
	// to access mounts.
	creds := auth.NewUserCredentials(
		auth.KUID(spec.Process.User.UID),
		auth.KGID(spec.Process.User.GID),
		extraKGIDs,
		caps,
		l.k.RootUserNamespace())

	var pidns *kernel.PIDNamespace
	if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok {
		if ns.Path != "" {
			for _, p := range l.processes {
				if ns.Path == p.pidnsPath {
					pidns = p.tg.PIDNamespace()
					break
				}
			}
		}
		if pidns == nil {
			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
		}
		ep.pidnsPath = ns.Path
	} else {
		pidns = l.k.RootPIDNamespace()
	}

	info := &containerInfo{
		conf:     conf,
		spec:     spec,
		goferFDs: goferFDs,
	}
	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
	if err != nil {
		return fmt.Errorf("creating new process: %w", err)
	}

	// Use stdios or TTY depending on the spec configuration.
	if spec.Process.Terminal {
		if l := len(stdioFDs); l != 0 {
			return fmt.Errorf("using TTY, stdios not expected: %d", l)
		}
		if ep.hostTTY == nil {
			return fmt.Errorf("terminal enabled but no TTY provided (--console-socket possibly not passed)")
		}
		info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
		ep.hostTTY = nil
	} else {
		info.stdioFDs = stdioFDs
	}

	ep.tg, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(false, cid, info)
	if err != nil {
		return err
	}
	l.k.StartProcess(ep.tg)
	return nil
}

func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
	// Create the FD map, which will set stdin, stdout, and stderr.
	ctx := info.procArgs.NewContext(l.k)
	fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs)
	if err != nil {
		return nil, nil, nil, fmt.Errorf("importing fds: %w", err)
	}
	// CreateProcess takes a reference on fdTable if successful. We won't need
	// ours either way.
	info.procArgs.FDTable = fdTable

	// Setup the child container file system.
	l.startGoferMonitor(cid, info.goferFDs)

	mntr := newContainerMounter(info, l.k, l.mountHints, kernel.VFS2Enabled)
	if root {
		if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
			return nil, nil, nil, err
		}
	}
	if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
		return nil, nil, nil, err
	}

	// Add the HOME environment variable if it is not already set.
	var envv []string
	if kernel.VFS2Enabled {
		envv, err = user.MaybeAddExecUserHomeVFS2(ctx, info.procArgs.MountNamespaceVFS2,
			info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
	} else {
		envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
			info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
	}
	if err != nil {
		return nil, nil, nil, err
	}
	info.procArgs.Envv = envv

	// Create and start the new process.
	tg, _, err := l.k.CreateProcess(info.procArgs)
	if err != nil {
		return nil, nil, nil, fmt.Errorf("creating process: %w", err)
	}
	// CreateProcess takes a reference on FDTable if successful.
	info.procArgs.FDTable.DecRef(ctx)

	// Set the foreground process group on the TTY to the global init process
	// group, since that is what we are about to start running.
	switch {
	case ttyFileVFS2 != nil:
		ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
	case ttyFile != nil:
		ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
	}

	// Install seccomp filters with the new task if there are any.
	if info.conf.OCISeccomp {
		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
			if err != nil {
				return nil, nil, nil, fmt.Errorf("building seccomp program: %w", err)
			}

			if log.IsLogging(log.Debug) {
				out, _ := bpf.DecodeProgram(program)
				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
			}

			task := tg.Leader()
			// NOTE: It seems Flags are ignored by runc so we ignore them too.
			if err := task.AppendSyscallFilter(program, true); err != nil {
				return nil, nil, nil, fmt.Errorf("appending seccomp filters: %w", err)
			}
		}
	} else {
		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
			log.Warningf("Seccomp spec is being ignored")
		}
	}

	return tg, ttyFile, ttyFileVFS2, nil
}

// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
// the gofer FDs looking for disconnects, and kills the container processes if
// a disconnect occurs in any of the gofer FDs.
func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
	go func() {
		log.Debugf("Monitoring gofer health for container %q", cid)
		var events []unix.PollFd
		for _, goferFD := range goferFDs {
			events = append(events, unix.PollFd{
				Fd:     int32(goferFD.FD()),
				Events: unix.POLLHUP | unix.POLLRDHUP,
			})
		}
		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
			// Use ppoll instead of poll because it's already whitelisted in seccomp.
			n, err := unix.Ppoll(events, nil, nil)
			return uintptr(n), 0, err
		})
		if err != nil {
			panic(fmt.Sprintf("Error monitoring gofer FDs: %s", err))
		}

		l.mu.Lock()
		defer l.mu.Unlock()

		// The gofer could have been stopped due to a normal container shutdown.
		// Check if the container has not stopped yet.
		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
			log.Infof("Gofer socket disconnected, killing container %q", cid)
			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
				log.Warningf("Error killing container %q after gofer stopped: %s", cid, err)
			}
		}
	}()
}

// destroyContainer stops a container if it is still running and cleans up its
// filesystem.
func (l *Loader) destroyContainer(cid string) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
	if err != nil {
		// Container doesn't exist.
		return err
	}

	// The container exists, but has it been started?
	if tg != nil {
		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
			return fmt.Errorf("sending SIGKILL to all container processes: %w", err)
		}
		// Wait for all processes that belong to the container to exit (including
		// exec'd processes).
		for _, t := range l.k.TaskSet().Root.Tasks() {
			if t.ContainerID() == cid {
				t.ThreadGroup().WaitExited()
			}
		}

		// At this point, all processes inside of the container have exited,
		// releasing all references to the container's MountNamespace and
		// causing all submounts and overlays to be unmounted.
		//
		// Since the container's MountNamespace has been released,
		// MountNamespace.destroy() will have executed, but that function may
		// trigger async close operations. We must wait for those to complete
		// before returning, otherwise the caller may kill the gofer before
		// they complete, causing a cascade of failing RPCs.
		fs.AsyncBarrier()
	}

	// No more failures from this point on. Remove all container thread groups
	// from the map.
	for key := range l.processes {
		if key.cid == cid {
			delete(l.processes, key)
		}
	}

	log.Debugf("Container destroyed, cid: %s", cid)
	return nil
}

func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
	// Hold the lock for the entire operation to ensure that the exec'd process
	// is added to 'processes' in case it races with destroyContainer().
	l.mu.Lock()
	defer l.mu.Unlock()

	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
	if err != nil {
		return 0, err
	}
	if tg == nil {
		return 0, fmt.Errorf("container %q not started", args.ContainerID)
	}

	// Get the container MountNamespace from the Task. Trying to acquire the ref
	// may fail if it races with task exit.
	if kernel.VFS2Enabled {
		// task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves.
		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
		if !args.MountNamespaceVFS2.TryIncRef() {
			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
		}
	} else {
		var reffed bool
		tg.Leader().WithMuLocked(func(t *kernel.Task) {
			// task.MountNamespace() does not take a ref, so we must do so ourselves.
			args.MountNamespace = t.MountNamespace()
			reffed = args.MountNamespace.TryIncRef()
		})
		if !reffed {
			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
		}
	}

	args.Envv, err = specutils.ResolveEnvs(args.Envv)
	if err != nil {
		return 0, fmt.Errorf("resolving env: %w", err)
	}

	// Add the HOME environment variable if it is not already set.
	if kernel.VFS2Enabled {
		root := args.MountNamespaceVFS2.Root()
		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
		defer args.MountNamespaceVFS2.DecRef(ctx)
		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
		if err != nil {
			return 0, err
		}
		args.Envv = envv
	} else {
		root := args.MountNamespace.Root()
		ctx := fs.WithRoot(l.k.SupervisorContext(), root)
		defer args.MountNamespace.DecRef(ctx)
		defer root.DecRef(ctx)
		envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
		if err != nil {
			return 0, err
		}
		args.Envv = envv
	}
	args.PIDNamespace = tg.PIDNamespace()

	args.Limits, err = createLimitSet(l.root.spec)
	if err != nil {
		return 0, fmt.Errorf("creating limits: %w", err)
	}

	// Start the process.
	proc := control.Proc{Kernel: l.k}
	newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args)
	if err != nil {
		return 0, err
	}

	eid := execID{cid: args.ContainerID, pid: tgid}
	l.processes[eid] = &execProcess{
		tg:      newTG,
		tty:     ttyFile,
		ttyVFS2: ttyFileVFS2,
	}
	log.Debugf("updated processes: %s", l.processes)

	return tgid, nil
}

// waitContainer waits for the init process of a container to exit.
func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
	// Don't defer unlock, as doing so would make it impossible for
	// multiple clients to wait on the same container.
	tg, err := l.threadGroupFromID(execID{cid: cid})
	if err != nil {
		return fmt.Errorf("can't wait for container %q: %w", cid, err)
	}

	// If the thread either has already exited or exits during waiting,
	// consider the container exited.
	ws := l.wait(tg)
	*waitStatus = ws

	// Check for leaks and write coverage report after the root container has
	// exited. This guarantees that the report is written in cases where the
	// sandbox is killed by a signal after the ContainerWait request is completed.
	if l.root.procArgs.ContainerID == cid {
		// All sentry-created resources should have been released at this point.
		refsvfs2.DoLeakCheck()
		_ = coverage.Report()
	}
	return nil
}

func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
	if tgid <= 0 {
		return fmt.Errorf("PID (%d) must be positive", tgid)
	}

	// Try to find a process that was exec'd.
	eid := execID{cid: cid, pid: tgid}
	execTG, err := l.threadGroupFromID(eid)
	if err == nil {
		ws := l.wait(execTG)
		*waitStatus = ws

		l.mu.Lock()
		delete(l.processes, eid)
		log.Debugf("updated processes (removal): %s", l.processes)
		l.mu.Unlock()
		return nil
	}

	// The caller may be waiting on a process not started directly via exec.
	// In this case, find the process in the container's PID namespace.
	initTG, err := l.threadGroupFromID(execID{cid: cid})
	if err != nil {
		return fmt.Errorf("waiting for PID %d: %w", tgid, err)
	}
	tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
	if tg == nil {
		return fmt.Errorf("waiting for PID %d: no such process", tgid)
	}
	if tg.Leader().ContainerID() != cid {
		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
	}
	ws := l.wait(tg)
	*waitStatus = ws
	return nil
}

// wait waits for the given thread group to exit and returns its exit status.
func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
	tg.WaitExited()
	return tg.ExitStatus().Status()
}
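
// decodeWaitStatus is a hypothetical helper (not called anywhere in this file)
// sketching how a caller can interpret the waitStatus value filled in by
// waitContainer and waitPID. It assumes the value uses the standard Linux
// wait(2) status encoding.
func decodeWaitStatus(waitStatus uint32) (exitCode int, signaled bool, sig unix.Signal) {
	ws := unix.WaitStatus(waitStatus)
	if ws.Signaled() {
		// The process was killed by a signal, e.g. SIGKILL from destroyContainer.
		return 0, true, ws.Signal()
	}
	// Normal exit: report the application's exit code.
	return ws.ExitStatus(), false, 0
}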

// WaitForStartSignal waits for a start signal from the control server.
func (l *Loader) WaitForStartSignal() {
	<-l.ctrl.manager.startChan
}

// WaitExit waits for the root container to exit, and returns its exit status.
func (l *Loader) WaitExit() kernel.ExitStatus {
	// Wait for container.
	l.k.WaitExited()

	// Check all references.
	refs.OnExit()

	return l.k.GlobalInit().ExitStatus()
}

func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
	// Create an empty network stack because the network namespace may be empty at
	// this point. Netns is configured before Run() is called. Netstack is
	// configured using a control uRPC message. Host network is configured inside
	// Run().
	switch conf.Network {
	case config.NetworkHost:
		// No network namespacing support for hostinet yet, hence creator is nil.
		return inet.NewRootNamespace(hostinet.NewStack(), nil), nil

	case config.NetworkNone, config.NetworkSandbox:
		s, err := newEmptySandboxNetworkStack(clock, uniqueID)
		if err != nil {
			return nil, err
		}
		creator := &sandboxNetstackCreator{
			clock:    clock,
			uniqueID: uniqueID,
		}
		return inet.NewRootNamespace(s, creator), nil

	default:
		panic(fmt.Sprintf("invalid network configuration: %d", conf.Network))
	}
}

func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
	transProtos := []stack.TransportProtocolFactory{
		tcp.NewProtocol,
		udp.NewProtocol,
		icmp.NewProtocol4,
		icmp.NewProtocol6,
	}
	s := netstack.Stack{Stack: stack.New(stack.Options{
		NetworkProtocols:   netProtos,
		TransportProtocols: transProtos,
		Clock:              clock,
		Stats:              netstack.Metrics,
		HandleLocal:        true,
		// Enable raw sockets for users with sufficient
		// privileges.
		RawFactory:      raw.EndpointFactory{},
		UniqueID:        uniqueID,
		DefaultIPTables: netfilter.DefaultLinuxTables,
	})}

	// Enable SACK Recovery.
	{
		opt := tcpip.TCPSACKEnabled(true)
		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
		}
	}

	// Set default TTLs as required by socket/netstack.
	{
		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
		}
		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
		}
	}

	// Enable Receive Buffer Auto-Tuning.
	{
		opt := tcpip.TCPModerateReceiveBufferOption(true)
		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
		}
	}

	return &s, nil
}

// sandboxNetstackCreator implements kernel.NetworkStackCreator.
//
// +stateify savable
type sandboxNetstackCreator struct {
	clock    tcpip.Clock
	uniqueID stack.UniqueID
}

// CreateStack implements kernel.NetworkStackCreator.CreateStack.
func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
	if err != nil {
		return nil, err
	}

	// Setup loopback.
	n := &Network{Stack: s.(*netstack.Stack).Stack}
	nicID := tcpip.NICID(f.uniqueID.UniqueID())
	link := DefaultLoopbackLink
	linkEP := loopback.New()
	if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
		return nil, err
	}

	return s, nil
}

// signal sends a signal to one or more processes in a container. If PID is 0,
// then the container init process is used. Depending on the SignalDeliveryMode
// option, the signal may be sent directly to the indicated process, to all
// processes in the container, or to the foreground process group. pid is
// relative to the root PID namespace, not the container's.
func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
	if pid < 0 {
		return fmt.Errorf("PID (%d) must be positive", pid)
	}

	switch mode {
	case DeliverToProcess:
		if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
			return fmt.Errorf("signaling process in container %q PID %d: %w", cid, pid, err)
		}
		return nil

	case DeliverToForegroundProcessGroup:
		if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
			return fmt.Errorf("signaling foreground process group in container %q PID %d: %w", cid, pid, err)
		}
		return nil

	case DeliverToAllProcesses:
		if pid != 0 {
			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
		}
		// Check that the container has actually started before signaling it.
		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
			return err
		}
		if err := l.signalAllProcesses(cid, signo); err != nil {
			return fmt.Errorf("signaling all processes in container %q: %w", cid, err)
		}
		return nil

	default:
		panic(fmt.Sprintf("unknown signal delivery mode %s", mode))
	}
}

// signalProcess sends a signal to a process in the given container. tgid is
// relative to the root PID namespace, not the container's.
func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
	if err == nil {
		// Send signal directly to the identified process.
		return l.k.SendExternalSignalThreadGroup(execTG, &linux.SignalInfo{Signo: signo})
	}

	// The caller may be signaling a process not started directly via exec.
	// In this case, find the process and check that the process belongs to the
	// container in question.
	tg := l.k.RootPIDNamespace().ThreadGroupWithID(tgid)
	if tg == nil {
		return fmt.Errorf("no such process with PID %d", tgid)
	}
	if tg.Leader().ContainerID() != cid {
		return fmt.Errorf("process %d belongs to a different container: %q", tgid, tg.Leader().ContainerID())
	}
	return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
}
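
// exampleSignalContainer is a hypothetical sketch (not used by the controller)
// of how the SignalDeliveryMode values handled by signal() are typically
// chosen: DeliverToAllProcesses with pid 0 for container-wide signals such as
// a kill, DeliverToForegroundProcessGroup for job-control signals like ^C when
// a terminal is attached, and DeliverToProcess for a specific PID otherwise.
func (l *Loader) exampleSignalContainer(cid string, pid int32, signo int32, all, foreground bool) error {
	mode := DeliverToProcess
	switch {
	case all:
		mode = DeliverToAllProcesses
		pid = 0 // required by signal() when signaling all processes
	case foreground:
		mode = DeliverToForegroundProcessGroup
	}
	return l.signal(cid, pid, signo, mode)
}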

// signalForegrondProcessGroup looks up the foreground process group from the
// TTY for the given tgid inside container cid, and sends the signal to it.
func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
	l.mu.Lock()
	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
	if err != nil {
		l.mu.Unlock()
		return fmt.Errorf("no thread group found: %w", err)
	}
	if tg == nil {
		l.mu.Unlock()
		return fmt.Errorf("container %q not started", cid)
	}

	tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
	l.mu.Unlock()
	if err != nil {
		return fmt.Errorf("no thread group found: %w", err)
	}

	var pg *kernel.ProcessGroup
	switch {
	case ttyVFS2 != nil:
		pg = ttyVFS2.ForegroundProcessGroup()
	case tty != nil:
		pg = tty.ForegroundProcessGroup()
	default:
		return fmt.Errorf("no TTY attached")
	}
	if pg == nil {
		// No foreground process group has been set. Signal the
		// original thread group.
		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
		return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
	}
	// Send the signal to all processes in the process group.
	var lastErr error
	for _, tg := range l.k.TaskSet().Root.ThreadGroups() {
		if tg.ProcessGroup() != pg {
			continue
		}
		if err := l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo}); err != nil {
			lastErr = err
		}
	}
	return lastErr
}

// signalAllProcesses signals all processes that belong to the specified
// container. It's a noop if the container hasn't started or has exited.
func (l *Loader) signalAllProcesses(cid string, signo int32) error {
	// Pause the kernel to prevent new processes from being created while
	// the signal is delivered. This prevents process leaks when SIGKILL is
	// sent to the entire container.
	l.k.Pause()
	defer l.k.Unpause()
	return l.k.SendContainerSignal(cid, &linux.SignalInfo{Signo: signo})
}

// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
// acquires the mutex before calling it and fails if the container hasn't
// started yet.
func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	tg, err := l.tryThreadGroupFromIDLocked(key)
	if err != nil {
		return nil, err
	}
	if tg == nil {
		return nil, fmt.Errorf("container %q not started", key.cid)
	}
	return tg, nil
}

// tryThreadGroupFromIDLocked returns the thread group for the given execution
// ID. It may return nil in case the container has not started yet. Returns
// error if the execution ID is invalid or if the container cannot be found
// (maybe it has been deleted). Caller must hold 'mu'.
func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
	ep := l.processes[key]
	if ep == nil {
		return nil, fmt.Errorf("container %q not found", key.cid)
	}
	return ep.tg, nil
}

// ttyFromIDLocked returns the TTY files for the given execution ID. It may
// return nil in case the container has not started yet. Returns error if the
// execution ID is invalid or if the container cannot be found (maybe it has
// been deleted). Caller must hold 'mu'.
func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
	ep := l.processes[key]
	if ep == nil {
		return nil, nil, fmt.Errorf("container %q not found", key.cid)
	}
	return ep.tty, ep.ttyVFS2, nil
}

func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
	if len(stdioFDs) != 3 {
		return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
	}

	k := kernel.KernelFromContext(ctx)
	fdTable := k.NewFDTable()
	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
	if err != nil {
		fdTable.DecRef(ctx)
		return nil, nil, nil, err
	}
	return fdTable, ttyFile, ttyFileVFS2, nil
}
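
// runSandboxSketch is a hedged, illustrative outline (hypothetical; the real
// sequencing lives in runsc's cmd and sandbox packages) of how the pieces in
// this file fit together for the root container: construct the Loader, wait
// for the controller's start signal, run the container, wait for it to exit,
// and tear everything down.
func runSandboxSketch(args Args) (kernel.ExitStatus, error) {
	l, err := New(args)
	if err != nil {
		return kernel.ExitStatus{}, fmt.Errorf("creating loader: %w", err)
	}

	// The control server created in New drives startup; block until it tells
	// us to start.
	l.WaitForStartSignal()
	if err := l.Run(); err != nil {
		l.Destroy()
		return kernel.ExitStatus{}, fmt.Errorf("running container: %w", err)
	}

	ws := l.WaitExit()
	// Destroy must not be deferred (see its doc comment); call it explicitly
	// once the container has exited.
	l.Destroy()
	return ws, nil
}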