github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/boot/loader.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package boot loads the kernel and runs a container.
package boot

import (
	"errors"
	"fmt"
	mrand "math/rand"
	"os"
	"runtime"
	"strconv"
	gtime "time"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/syndtr/gocapability/capability"
	"golang.org/x/sys/unix"
	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/bpf"
	"github.com/metacubex/gvisor/pkg/cleanup"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/coverage"
	"github.com/metacubex/gvisor/pkg/cpuid"
	"github.com/metacubex/gvisor/pkg/fd"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/memutil"
	"github.com/metacubex/gvisor/pkg/rand"
	"github.com/metacubex/gvisor/pkg/refs"
	"github.com/metacubex/gvisor/pkg/sentry/control"
	"github.com/metacubex/gvisor/pkg/sentry/devices/nvproxy"
	"github.com/metacubex/gvisor/pkg/sentry/fdimport"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/host"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/tmpfs"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/user"
	"github.com/metacubex/gvisor/pkg/sentry/inet"
	"github.com/metacubex/gvisor/pkg/sentry/kernel"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
	"github.com/metacubex/gvisor/pkg/sentry/loader"
	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
	"github.com/metacubex/gvisor/pkg/sentry/platform"
	"github.com/metacubex/gvisor/pkg/sentry/seccheck"
	pb "github.com/metacubex/gvisor/pkg/sentry/seccheck/points/points_go_proto"
	"github.com/metacubex/gvisor/pkg/sentry/socket/netfilter"
	"github.com/metacubex/gvisor/pkg/sentry/time"
	"github.com/metacubex/gvisor/pkg/sentry/usage"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/sentry/watchdog"
	"github.com/metacubex/gvisor/pkg/sighandling"
	"github.com/metacubex/gvisor/pkg/sync"
	"github.com/metacubex/gvisor/pkg/tcpip"
	"github.com/metacubex/gvisor/pkg/tcpip/link/ethernet"
	"github.com/metacubex/gvisor/pkg/tcpip/link/loopback"
	"github.com/metacubex/gvisor/pkg/tcpip/link/sniffer"
	"github.com/metacubex/gvisor/pkg/tcpip/network/arp"
	"github.com/metacubex/gvisor/pkg/tcpip/network/ipv4"
	"github.com/metacubex/gvisor/pkg/tcpip/network/ipv6"
	"github.com/metacubex/gvisor/pkg/tcpip/stack"
	"github.com/metacubex/gvisor/pkg/tcpip/transport/icmp"
	"github.com/metacubex/gvisor/pkg/tcpip/transport/raw"
	"github.com/metacubex/gvisor/pkg/tcpip/transport/tcp"
	"github.com/metacubex/gvisor/pkg/tcpip/transport/udp"
	"github.com/metacubex/gvisor/runsc/boot/filter"
	_ "github.com/metacubex/gvisor/runsc/boot/platforms" // register all platforms.
76 pf "github.com/metacubex/gvisor/runsc/boot/portforward" 77 "github.com/metacubex/gvisor/runsc/boot/pprof" 78 "github.com/metacubex/gvisor/runsc/config" 79 "github.com/metacubex/gvisor/runsc/profile" 80 "github.com/metacubex/gvisor/runsc/specutils" 81 "github.com/metacubex/gvisor/runsc/specutils/seccomp" 82 83 // Top-level inet providers. 84 "github.com/metacubex/gvisor/pkg/sentry/socket/hostinet" 85 "github.com/metacubex/gvisor/pkg/sentry/socket/netstack" 86 87 // Include other supported socket providers. 88 _ "github.com/metacubex/gvisor/pkg/sentry/socket/netlink" 89 _ "github.com/metacubex/gvisor/pkg/sentry/socket/netlink/route" 90 _ "github.com/metacubex/gvisor/pkg/sentry/socket/netlink/uevent" 91 _ "github.com/metacubex/gvisor/pkg/sentry/socket/unix" 92 ) 93 94 type containerInfo struct { 95 cid string 96 97 containerName string 98 99 conf *config.Config 100 101 // spec is the base configuration for the root container. 102 spec *specs.Spec 103 104 // procArgs refers to the container's init task. 105 procArgs kernel.CreateProcessArgs 106 107 // stdioFDs contains stdin, stdout, and stderr. 108 stdioFDs []*fd.FD 109 110 // passFDs are mappings of user-supplied host to guest file descriptors. 111 passFDs []fdMapping 112 113 // execFD is the host file descriptor used for program execution. 114 execFD *fd.FD 115 116 // goferFDs are the FDs that attach the sandbox to the gofers. 117 goferFDs []*fd.FD 118 119 // devGoferFD is the FD to attach the sandbox to the dev gofer. 120 devGoferFD *fd.FD 121 122 // goferFilestoreFDs are FDs to the regular files that will back the tmpfs or 123 // overlayfs mount for certain gofer mounts. 124 goferFilestoreFDs []*fd.FD 125 126 // goferMountConfs contains information about how the gofer mounts have been 127 // configured. The first entry is for rootfs and the following entries are 128 // for bind mounts in Spec.Mounts (in the same order). 129 goferMountConfs []GoferMountConf 130 131 // nvidiaUVMDevMajor is the device major number used for nvidia-uvm. 132 nvidiaUVMDevMajor uint32 133 134 // nvidiaDriverVersion is the Nvidia driver version on the host. 135 nvidiaDriverVersion string 136 } 137 138 // Loader keeps state needed to start the kernel and run the container. 139 type Loader struct { 140 // k is the kernel. 141 k *kernel.Kernel 142 143 // ctrl is the control server. 144 ctrl *controller 145 146 // root contains information about the root container in the sandbox. 147 root containerInfo 148 149 watchdog *watchdog.Watchdog 150 151 // stopSignalForwarding disables forwarding of signals to the sandboxed 152 // container. It should be called when a sandbox is destroyed. 153 stopSignalForwarding func() 154 155 // stopProfiling stops profiling started at container creation. It 156 // should be called when a sandbox is destroyed. 157 stopProfiling func() 158 159 // PreSeccompCallback is called right before installing seccomp filters. 160 PreSeccompCallback func() 161 162 // restore is set to true if we are restoring a container. 163 restore bool 164 165 // sandboxID is the ID for the whole sandbox. 166 sandboxID string 167 168 // mountHints provides extra information about mounts for containers that 169 // apply to the entire pod. 170 mountHints *PodMountHints 171 172 // productName is the value to show in 173 // /sys/devices/virtual/dmi/id/product_name. 174 productName string 175 176 // mu guards the fields below. 177 mu sync.Mutex 178 179 // sharedMounts holds VFS mounts that may be shared between containers within 180 // the same pod. 
	sharedMounts map[string]*vfs.Mount

	// processes maps container init processes and exec invocations. Root
	// processes are keyed with container ID and pid=0, while exec invocations
	// have the corresponding pid set.
	//
	// processes is guarded by mu.
	processes map[execID]*execProcess

	// portForwardProxies is a list of active port forwarding connections.
	//
	// portForwardProxies is guarded by mu.
	portForwardProxies []*pf.Proxy
}

// execID uniquely identifies a sentry process that is executed in a container.
type execID struct {
	cid string
	pid kernel.ThreadID
}

// execProcess contains the thread group and host TTY of a sentry process.
type execProcess struct {
	// tg will be nil for containers that haven't started yet.
	tg *kernel.ThreadGroup

	// tty will be nil if the process is not attached to a terminal.
	tty *host.TTYFileDescription

	// pidnsPath is the pid namespace path in the spec.
	pidnsPath string

	// hostTTY is present when creating a sub-container with terminal enabled.
	// TTY file is passed during container create and must be saved until
	// container start.
	hostTTY *fd.FD
}

// fdMapping maps guest to host file descriptors. Guest file descriptors are
// exposed to the application inside the sandbox through the FD table.
type fdMapping struct {
	guest int
	host  *fd.FD
}

// FDMapping is a helper type to represent a mapping from guest to host file
// descriptors. In contrast to the unexported fdMapping type, it does not imply
// file ownership.
type FDMapping struct {
	Guest int
	Host  int
}

func init() {
	// Initialize the random number generator.
	mrand.Seed(gtime.Now().UnixNano())
}

// Args are the arguments for New().
type Args struct {
	// ID is the sandbox ID.
	ID string
	// Spec is the sandbox specification.
	Spec *specs.Spec
	// Conf is the system configuration.
	Conf *config.Config
	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
	// of this FD and may close it at any time.
	ControllerFD int
	// Device is an optional argument that is passed to the platform. The Loader
	// takes ownership of this file and may close it at any time.
	Device *os.File
	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
	// takes ownership of these FDs and may close them at any time.
	GoferFDs []int
	// DevGoferFD is the FD for the dev gofer connection. The Loader takes
	// ownership of this FD and may close it at any time.
	DevGoferFD int
	// StdioFDs is the stdio for the application. The Loader takes ownership of
	// these FDs and may close them at any time.
	StdioFDs []int
	// PassFDs are user-supplied FD mappings from host to guest descriptors.
	// The Loader takes ownership of these FDs and may close them at any time.
	PassFDs []FDMapping
	// ExecFD is the host file descriptor used for program execution.
	ExecFD int
	// GoferFilestoreFDs are FDs to the regular files that will back the tmpfs or
	// overlayfs mount for certain gofer mounts.
	GoferFilestoreFDs []int
	// GoferMountConfs contains information about how the gofer mounts have been
	// configured. The first entry is for rootfs and the following entries are
	// for bind mounts in Spec.Mounts (in the same order).
	GoferMountConfs []GoferMountConf
	// NumCPU is the number of CPUs to create inside the sandbox.
	NumCPU int
	// TotalMem is the initial amount of total memory to report back to the
	// container.
	TotalMem uint64
	// TotalHostMem is the total memory reported by host /proc/meminfo.
	TotalHostMem uint64
	// UserLogFD is the file descriptor to write user logs to.
	UserLogFD int
	// ProductName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	ProductName string
	// PodInitConfigFD is the file descriptor to a file passed in the
	// --pod-init-config flag
	PodInitConfigFD int
	// SinkFDs is an ordered array of file descriptors to be used by seccheck
	// sinks configured from the --pod-init-config file.
	SinkFDs []int
	// ProfileOpts contains the set of profiles to enable and the
	// corresponding FDs where profile data will be written.
	ProfileOpts profile.Opts
	// NvidiaDriverVersion is the Nvidia driver version on the host.
	NvidiaDriverVersion string
}

// make sure stdioFDs are always the same on initial start and on restore
const startingStdioFD = 256

func getRootCredentials(spec *specs.Spec, conf *config.Config, userNs *auth.UserNamespace) *auth.Credentials {
	// Create capabilities.
	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
	if err != nil {
		return nil
	}

	// Convert the spec's additional GIDs to KGIDs.
	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
	for _, GID := range spec.Process.User.AdditionalGids {
		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
	}

	if userNs == nil {
		userNs = auth.NewRootUserNamespace()
	}
	// Create credentials.
	creds := auth.NewUserCredentials(
		auth.KUID(spec.Process.User.UID),
		auth.KGID(spec.Process.User.GID),
		extraKGIDs,
		caps,
		userNs)

	return creds
}

// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
func New(args Args) (*Loader, error) {
	stopProfiling := profile.Start(args.ProfileOpts)

	// Initialize seccheck points.
	seccheck.Initialize()

	// We initialize the rand package now to make sure /dev/urandom is pre-opened
	// on kernels that do not support getrandom(2).
	if err := rand.Init(); err != nil {
		return nil, fmt.Errorf("setting up rand: %w", err)
	}

	if err := usage.Init(); err != nil {
		return nil, fmt.Errorf("setting up memory usage: %w", err)
	}

	if specutils.NVProxyEnabled(args.Spec, args.Conf) {
		nvproxy.Init()
	}

	kernel.IOUringEnabled = args.Conf.IOUring

	info := containerInfo{
		cid:                 args.ID,
		containerName:       specutils.ContainerName(args.Spec),
		conf:                args.Conf,
		spec:                args.Spec,
		goferMountConfs:     args.GoferMountConfs,
		nvidiaDriverVersion: args.NvidiaDriverVersion,
	}

	// Make host FDs stable between invocations. Host FDs must map to the exact
	// same number when the sandbox is restored. Otherwise the wrong FD will be
	// used.
	newfd := startingStdioFD

	for _, stdioFD := range args.StdioFDs {
		// Check that newfd is unused to avoid clobbering over it.
		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
			if err != nil {
				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
			}
			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
		}

		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
		if err != nil {
			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
		}
		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
		_ = unix.Close(stdioFD)
		newfd++
	}
	for _, goferFD := range args.GoferFDs {
		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
	}
	for _, filestoreFD := range args.GoferFilestoreFDs {
		info.goferFilestoreFDs = append(info.goferFilestoreFDs, fd.New(filestoreFD))
	}
	if args.DevGoferFD >= 0 {
		info.devGoferFD = fd.New(args.DevGoferFD)
	}
	if args.ExecFD >= 0 {
		info.execFD = fd.New(args.ExecFD)
	}

	for _, customFD := range args.PassFDs {
		info.passFDs = append(info.passFDs, fdMapping{
			host:  fd.New(customFD.Host),
			guest: customFD.Guest,
		})
	}

	// Create kernel and platform.
	p, err := createPlatform(args.Conf, args.Device)
	if err != nil {
		return nil, fmt.Errorf("creating platform: %w", err)
	}
	if specutils.NVProxyEnabled(args.Spec, args.Conf) && p.OwnsPageTables() {
		return nil, fmt.Errorf("--nvproxy is incompatible with platform %s: owns page tables", args.Conf.Platform)
	}
	k := &kernel.Kernel{
		Platform: p,
	}

	// Create memory file.
	mf, err := createMemoryFile()
	if err != nil {
		return nil, fmt.Errorf("creating memory file: %w", err)
	}
	k.SetMemoryFile(mf)

	// Create VDSO.
	//
	// Pass k as the platform since it is savable, unlike the actual platform.
	vdso, err := loader.PrepareVDSO(k.MemoryFile())
	if err != nil {
		return nil, fmt.Errorf("creating vdso: %w", err)
	}

	// Create timekeeper.
	tk := kernel.NewTimekeeper(k.MemoryFile(), vdso.ParamPage.FileRange())
	tk.SetClocks(time.NewCalibratedClocks())

	if err := enableStrace(args.Conf); err != nil {
		return nil, fmt.Errorf("enabling strace: %w", err)
	}

	creds := getRootCredentials(args.Spec, args.Conf, nil /* UserNamespace */)
	if creds == nil {
		return nil, fmt.Errorf("getting root credentials")
	}
	// Create root network namespace/stack.
	netns, err := newRootNetworkNamespace(args.Conf, tk, k, creds.UserNamespace)
	if err != nil {
		return nil, fmt.Errorf("creating network: %w", err)
	}

	if args.NumCPU == 0 {
		args.NumCPU = runtime.NumCPU()
	}
	log.Infof("CPUs: %d", args.NumCPU)
	runtime.GOMAXPROCS(args.NumCPU)

	if args.TotalHostMem > 0 {
		// As per tmpfs(5), the default size limit is 50% of total physical RAM.
		// See mm/shmem.c:shmem_default_max_blocks().
		tmpfs.SetDefaultSizeLimit(args.TotalHostMem / 2)
	}

	if args.TotalMem > 0 {
		// Adjust the total memory returned by the Sentry so that applications that
		// use /proc/meminfo can make allocations based on this limit.
		usage.MinimumTotalMemoryBytes = args.TotalMem
		usage.MaximumTotalMemoryBytes = args.TotalMem
		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
	}

	maxFDLimit := kernel.MaxFdLimit
	if args.Spec.Linux != nil && args.Spec.Linux.Sysctl != nil {
		if val, ok := args.Spec.Linux.Sysctl["fs.nr_open"]; ok {
			nrOpen, err := strconv.Atoi(val)
			if err != nil {
				return nil, fmt.Errorf("setting fs.nr_open=%s: %w", val, err)
			}
			if nrOpen <= 0 || nrOpen > int(kernel.MaxFdLimit) {
				return nil, fmt.Errorf("setting fs.nr_open=%s", val)
			}
			maxFDLimit = int32(nrOpen)
		}
	}
	// Initialize the Kernel object, which is required by the Context passed
	// to createVFS in order to mount (among other things) procfs.
	if err = k.Init(kernel.InitKernelArgs{
		FeatureSet:           cpuid.HostFeatureSet().Fixed(),
		Timekeeper:           tk,
		RootUserNamespace:    creds.UserNamespace,
		RootNetworkNamespace: netns,
		ApplicationCores:     uint(args.NumCPU),
		Vdso:                 vdso,
		RootUTSNamespace:     kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
		RootIPCNamespace:     kernel.NewIPCNamespace(creds.UserNamespace),
		PIDNamespace:         kernel.NewRootPIDNamespace(creds.UserNamespace),
		MaxFDLimit:           maxFDLimit,
	}); err != nil {
		return nil, fmt.Errorf("initializing kernel: %w", err)
	}

	if err := registerFilesystems(k, &info); err != nil {
		return nil, fmt.Errorf("registering filesystems: %w", err)
	}

	// Turn on packet logging if enabled.
	if args.Conf.LogPackets {
		log.Infof("Packet logging enabled")
		sniffer.LogPackets.Store(1)
	} else {
		log.Infof("Packet logging disabled")
		sniffer.LogPackets.Store(0)
	}

	// Create a watchdog.
	dogOpts := watchdog.DefaultOpts
	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
	dog := watchdog.New(k, dogOpts)

	procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
	if err != nil {
		return nil, fmt.Errorf("creating init process for root container: %w", err)
	}
	info.procArgs = procArgs

	if err := initCompatLogs(args.UserLogFD); err != nil {
		return nil, fmt.Errorf("initializing compat logs: %w", err)
	}

	mountHints, err := NewPodMountHints(args.Spec)
	if err != nil {
		return nil, fmt.Errorf("creating pod mount hints: %w", err)
	}

	// Set up host mount that will be used for imported fds.
	hostFilesystem, err := host.NewFilesystem(k.VFS())
	if err != nil {
		return nil, fmt.Errorf("failed to create hostfs filesystem: %w", err)
	}
	defer hostFilesystem.DecRef(k.SupervisorContext())
	k.SetHostMount(k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}))

	if args.PodInitConfigFD >= 0 {
		if err := setupSeccheck(args.PodInitConfigFD, args.SinkFDs); err != nil {
			log.Warningf("unable to configure event session: %v", err)
		}
	}

	eid := execID{cid: args.ID}
	l := &Loader{
		k:             k,
		watchdog:      dog,
		sandboxID:     args.ID,
		processes:     map[execID]*execProcess{eid: {}},
		mountHints:    mountHints,
		sharedMounts:  make(map[string]*vfs.Mount),
		root:          info,
		stopProfiling: stopProfiling,
		productName:   args.ProductName,
	}

	// We don't care about child signals; some platforms can generate a
	// tremendous number of useless ones (I'm looking at you, ptrace).
	if err := sighandling.IgnoreChildStop(); err != nil {
		return nil, fmt.Errorf("ignore child stop signals failed: %w", err)
	}

	// Create the control server using the provided FD.
	//
	// This must be done *after* we have initialized the kernel since the
	// controller is used to configure the kernel's network stack.
	ctrl, err := newController(args.ControllerFD, l)
	if err != nil {
		return nil, fmt.Errorf("creating control server: %w", err)
	}
	l.ctrl = ctrl

	// Only start serving after Loader is set to controller and controller is set
	// to Loader, because they are both used in the urpc methods.
	if err := ctrl.srv.StartServing(); err != nil {
		return nil, fmt.Errorf("starting control server: %w", err)
	}

	return l, nil
}

// createProcessArgs creates args that can be used with kernel.CreateProcess.
func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
	// Create initial limits.
	ls, err := createLimitSet(spec)
	if err != nil {
		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %w", err)
	}
	env, err := specutils.ResolveEnvs(spec.Process.Env)
	if err != nil {
		return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err)
	}

	wd := spec.Process.Cwd
	if wd == "" {
		wd = "/"
	}

	// Create the process arguments.
	procArgs := kernel.CreateProcessArgs{
		Argv:                 spec.Process.Args,
		Envv:                 env,
		WorkingDirectory:     wd,
		Credentials:          creds,
		Umask:                0022,
		Limits:               ls,
		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
		UTSNamespace:         k.RootUTSNamespace(),
		IPCNamespace:         k.RootIPCNamespace(),
		ContainerID:          id,
		PIDNamespace:         pidns,
	}

	return procArgs, nil
}

// Destroy cleans up all resources used by the loader.
//
// Note that this will block until all open control server connections have
// been closed. For that reason, this should NOT be called in a defer, because
// a panic in a control server rpc would then hang forever.
func (l *Loader) Destroy() {
	if l.stopSignalForwarding != nil {
		l.stopSignalForwarding()
	}
	l.watchdog.Stop()

	ctx := l.k.SupervisorContext()
	for _, m := range l.sharedMounts {
		m.DecRef(ctx)
	}

	// Stop the control server. This will indirectly stop any
	// long-running control operations that are in flight, e.g.
	// profiling operations.
	l.ctrl.stop()

	// Release all kernel resources. This is only safe after we can no longer
	// save/restore.
	l.k.Release()

	// Release any dangling tcp connections.
	tcpip.ReleaseDanglingEndpoints()

	// In the success case, all FDs in l.root will only contain released/closed
	// FDs whose ownership has been passed over to host FDs and gofer sessions.
	// Close them here in case of failure.
	for _, f := range l.root.stdioFDs {
		_ = f.Close()
	}
	for _, f := range l.root.passFDs {
		_ = f.host.Close()
	}
	for _, f := range l.root.goferFDs {
		_ = f.Close()
	}
	for _, f := range l.root.goferFilestoreFDs {
		_ = f.Close()
	}
	if l.root.devGoferFD != nil {
		_ = l.root.devGoferFD.Close()
	}

	l.stopProfiling()
	// Check all references.
	refs.OnExit()
}

func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
	p, err := platform.Lookup(conf.Platform)
	if err != nil {
		panic(fmt.Sprintf("invalid platform %s: %s", conf.Platform, err))
	}
	log.Infof("Platform: %s", conf.Platform)
	return p.New(deviceFile)
}

func createMemoryFile() (*pgalloc.MemoryFile, error) {
	const memfileName = "runsc-memory"
	memfd, err := memutil.CreateMemFD(memfileName, 0)
	if err != nil {
		return nil, fmt.Errorf("error creating memfd: %w", err)
	}
	memfile := os.NewFile(uintptr(memfd), memfileName)
	// We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
	// there are memory cgroups specified, because at this point we're already
	// in a mount namespace in which the relevant cgroupfs is not visible.
	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
	if err != nil {
		_ = memfile.Close()
		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err)
	}
	return mf, nil
}

// installSeccompFilters installs sandbox seccomp filters with the host.
func (l *Loader) installSeccompFilters() error {
	if l.PreSeccompCallback != nil {
		l.PreSeccompCallback()
	}
	if l.root.conf.DisableSeccomp {
		log.Warningf("*** SECCOMP WARNING: syscall filter is DISABLED. Running in less secure mode.")
	} else {
		hostnet := l.root.conf.Network == config.NetworkHost
		opts := filter.Options{
			Platform:              l.k.Platform.SeccompInfo(),
			HostNetwork:           hostnet,
			HostNetworkRawSockets: hostnet && l.root.conf.EnableRaw,
			HostFilesystem:        l.root.conf.DirectFS,
			ProfileEnable:         l.root.conf.ProfileEnable,
			NVProxy:               specutils.NVProxyEnabled(l.root.spec, l.root.conf),
			TPUProxy:              specutils.TPUProxyIsEnabled(l.root.spec, l.root.conf),
			ControllerFD:          uint32(l.ctrl.srv.FD()),
		}
		if err := filter.Install(opts); err != nil {
			return fmt.Errorf("installing seccomp filters: %w", err)
		}
	}
	return nil
}

// Run runs the root container.
func (l *Loader) Run() error {
	err := l.run()
	l.ctrl.manager.startResultChan <- err
	if err != nil {
		// Give the controller some time to send the error to the
		// runtime. If we return too quickly here the process will exit
		// and the control connection will be closed before the error
		// is returned.
		gtime.Sleep(2 * gtime.Second)
		return err
	}
	return nil
}

func (l *Loader) run() error {
	if l.root.conf.Network == config.NetworkHost {
		// Delay host network configuration to this point because network namespace
		// is configured after the loader is created and before Run() is called.
		log.Debugf("Configuring host network")
		s := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
		if err := s.Configure(l.root.conf.EnableRaw); err != nil {
			return err
		}
	}

	l.mu.Lock()
	defer l.mu.Unlock()

	eid := execID{cid: l.sandboxID}
	ep, ok := l.processes[eid]
	if !ok {
		return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
	}

	// If we are restoring, we do not want to create a process.
	// l.restore is set by the container manager when a restore call is made.
	if !l.restore {
		if l.root.conf.ProfileEnable {
			pprof.Initialize()
		}

		// Finally done with all configuration. Setup filters before user code
		// is loaded.
		if err := l.installSeccompFilters(); err != nil {
			return err
		}

		// Create the root container init task. It will begin running
		// when the kernel is started.
		var (
			tg  *kernel.ThreadGroup
			err error
		)
		tg, ep.tty, err = l.createContainerProcess(&l.root)
		if err != nil {
			return err
		}

		if seccheck.Global.Enabled(seccheck.PointContainerStart) {
			evt := pb.Start{
				Id:       l.sandboxID,
				Cwd:      l.root.spec.Process.Cwd,
				Args:     l.root.spec.Process.Args,
				Terminal: l.root.spec.Process.Terminal,
			}
			fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart)
			if fields.Local.Contains(seccheck.FieldContainerStartEnv) {
				evt.Env = l.root.spec.Process.Env
			}
			if !fields.Context.Empty() {
				evt.ContextData = &pb.ContextData{}
				kernel.LoadSeccheckData(tg.Leader(), fields.Context, evt.ContextData)
			}
			_ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
				return c.ContainerStart(context.Background(), fields, &evt)
			})
		}
	}

	ep.tg = l.k.GlobalInit()
	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
		ep.pidnsPath = ns.Path
	}

	// Handle signals by forwarding them to the root container process
	// (except for panic signal, which should cause a panic).
	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
		// Panic signal should cause a panic.
		if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
			panic("Signal-induced panic")
		}

		// Otherwise forward to root container.
		deliveryMode := DeliverToProcess
		if l.root.spec.Process.Terminal {
			// Since we are running with a console, we should forward the signal to
			// the foreground process group so that job control signals like ^C can
			// be handled properly.
			deliveryMode = DeliverToForegroundProcessGroup
		}
		log.Infof("Received external signal %d, mode: %s", sig, deliveryMode)
		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
			log.Warningf("error sending signal %s to container %q: %s", sig, l.sandboxID, err)
		}
	})

	log.Infof("Process should have started...")
	l.watchdog.Start()
	return l.k.Start()
}

// createSubcontainer creates a new container inside the sandbox.
func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	eid := execID{cid: cid}
	if _, ok := l.processes[eid]; ok {
		return fmt.Errorf("container %q already exists", cid)
	}
	l.processes[eid] = &execProcess{hostTTY: tty}
	return nil
}

// startSubcontainer starts a child container. It returns the thread group ID of
// the newly created process. Used FDs are either closed or released. It's safe
// for the caller to close any remaining files upon return.
func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs, goferFilestoreFDs []*fd.FD, devGoferFD *fd.FD, goferMountConfs []GoferMountConf) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	ep := l.processes[execID{cid: cid}]
	if ep == nil {
		return fmt.Errorf("trying to start a deleted container %q", cid)
	}

	// Create credentials. We reuse the root user namespace because the
	// sentry currently supports only 1 mount namespace, which is tied to a
	// single user namespace. Thus we must run in the same user namespace
	// to access mounts.
	creds := getRootCredentials(spec, conf, l.k.RootUserNamespace())
	if creds == nil {
		return fmt.Errorf("getting root credentials")
	}
	var pidns *kernel.PIDNamespace
	if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok {
		if ns.Path != "" {
			for _, p := range l.processes {
				if ns.Path == p.pidnsPath {
					log.Debugf("Joining PID namespace named %q", ns.Path)
					pidns = p.tg.PIDNamespace()
					break
				}
			}
		}
		if pidns == nil {
			log.Warningf("PID namespace %q not found, running in new PID namespace", ns.Path)
			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
		}
		ep.pidnsPath = ns.Path
	} else {
		pidns = l.k.RootPIDNamespace()
	}

	info := &containerInfo{
		cid:                 cid,
		containerName:       specutils.ContainerName(spec),
		conf:                conf,
		spec:                spec,
		goferFDs:            goferFDs,
		devGoferFD:          devGoferFD,
		goferFilestoreFDs:   goferFilestoreFDs,
		goferMountConfs:     goferMountConfs,
		nvidiaUVMDevMajor:   l.root.nvidiaUVMDevMajor,
		nvidiaDriverVersion: l.root.nvidiaDriverVersion,
	}
	var err error
	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
	if err != nil {
		return fmt.Errorf("creating new process: %w", err)
	}

	// Use stdios or TTY depending on the spec configuration.
	if spec.Process.Terminal {
		if l := len(stdioFDs); l != 0 {
			return fmt.Errorf("using TTY, stdios not expected: %d", l)
		}
		if ep.hostTTY == nil {
			return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?")
		}
		info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
		ep.hostTTY = nil
	} else {
		info.stdioFDs = stdioFDs
	}

	var cu cleanup.Cleanup
	defer cu.Clean()
	if devGoferFD != nil {
		cu.Add(func() {
			// createContainerProcess() will consume devGoferFD and initialize a gofer
			// connection. This connection is owned by l.k. In case of failure, we want
			// to clean up this gofer connection so that the gofer process can exit.
			l.k.RemoveDevGofer(cid)
		})
	}

	ep.tg, ep.tty, err = l.createContainerProcess(info)
	if err != nil {
		return err
	}

	if seccheck.Global.Enabled(seccheck.PointContainerStart) {
		evt := pb.Start{
			Id:       cid,
			Cwd:      spec.Process.Cwd,
			Args:     spec.Process.Args,
			Terminal: spec.Process.Terminal,
		}
		fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart)
		if fields.Local.Contains(seccheck.FieldContainerStartEnv) {
			evt.Env = spec.Process.Env
		}
		if !fields.Context.Empty() {
			evt.ContextData = &pb.ContextData{}
			kernel.LoadSeccheckData(ep.tg.Leader(), fields.Context, evt.ContextData)
		}
		_ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
			return c.ContainerStart(context.Background(), fields, &evt)
		})
	}

	l.k.StartProcess(ep.tg)
	// No more failures from this point on.
	cu.Release()
	return nil
}

// +checklocks:l.mu
func (l *Loader) createContainerProcess(info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileDescription, error) {
	// Create the FD map, which will set stdin, stdout, and stderr.
	ctx := info.procArgs.NewContext(l.k)
	fdTable, ttyFile, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.passFDs, info.spec.Process.User, info.containerName)
	if err != nil {
		return nil, nil, fmt.Errorf("importing fds: %w", err)
	}
	// CreateProcess takes a reference on fdTable if successful. We won't need
	// ours either way.
	info.procArgs.FDTable = fdTable

	if info.execFD != nil {
		if info.procArgs.Filename != "" {
			return nil, nil, fmt.Errorf("process must either be started from a file or a filename, not both")
		}
		file, err := host.NewFD(ctx, l.k.HostMount(), info.execFD.FD(), &host.NewFDOptions{
			Readonly:     true,
			Savable:      true,
			VirtualOwner: true,
			UID:          auth.KUID(info.spec.Process.User.UID),
			GID:          auth.KGID(info.spec.Process.User.GID),
		})
		if err != nil {
			return nil, nil, err
		}
		defer file.DecRef(ctx)
		info.execFD.Release()

		info.procArgs.File = file
	}

	// Gofer FDs must be ordered and the first FD is always the rootfs.
	if len(info.goferFDs) < 1 {
		return nil, nil, fmt.Errorf("rootfs gofer FD not found")
	}
	l.startGoferMonitor(info)

	if l.root.cid == l.sandboxID {
		// Mounts cgroups for all the controllers.
		if err := l.mountCgroupMounts(info.conf, info.procArgs.Credentials); err != nil {
			return nil, nil, err
		}
	}
	// We can share l.sharedMounts with containerMounter since l.mu is locked.
	// Hence, mntr must only be used within this function (while l.mu is locked).
	mntr := newContainerMounter(info, l.k, l.mountHints, l.sharedMounts, l.productName, l.sandboxID)
	if err := setupContainerVFS(ctx, info, mntr, &info.procArgs); err != nil {
		return nil, nil, err
	}
	defer func() {
		for cg := range info.procArgs.InitialCgroups {
			cg.Dentry.DecRef(ctx)
		}
	}()

	// Add the HOME environment variable if it is not already set.
	info.procArgs.Envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
		info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
	if err != nil {
		return nil, nil, err
	}

	// Create and start the new process.
	tg, _, err := l.k.CreateProcess(info.procArgs)
	if err != nil {
		return nil, nil, fmt.Errorf("creating process: %w", err)
	}
	// CreateProcess takes a reference on FDTable if successful.
	info.procArgs.FDTable.DecRef(ctx)

	// Set the foreground process group on the TTY to the global init process
	// group, since that is what we are about to start running.
	if ttyFile != nil {
		ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
	}

	// Install seccomp filters with the new task if there are any.
	if info.conf.OCISeccomp {
		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
			if err != nil {
				return nil, nil, fmt.Errorf("building seccomp program: %w", err)
			}

			if log.IsLogging(log.Debug) {
				out, _ := bpf.DecodeProgram(program)
				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
			}

			task := tg.Leader()
			// NOTE: It seems Flags are ignored by runc so we ignore them too.
			if err := task.AppendSyscallFilter(program, true); err != nil {
				return nil, nil, fmt.Errorf("appending seccomp filters: %w", err)
			}
		}
	} else {
		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
			log.Warningf("Seccomp spec is being ignored")
		}
	}

	return tg, ttyFile, nil
}

// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
// the gofer FD looking for disconnects, and kills the container processes if
// the gofer connection disconnects.
func (l *Loader) startGoferMonitor(info *containerInfo) {
	// We need to pick a suitable gofer connection that is expected to be alive
	// for the entire container lifecycle. Only the following can be used:
	// 1. Rootfs gofer connection
	// 2. Device gofer connection
	//
	// Note that other gofer mounts are allowed to be unmounted and disconnected.
	goferFD := -1
	if info.goferMountConfs[0].ShouldUseLisafs() {
		goferFD = info.goferFDs[0].FD()
	} else if info.devGoferFD != nil {
		goferFD = info.devGoferFD.FD()
	}
	if goferFD < 0 {
		log.Warningf("could not find a suitable gofer FD to monitor")
		return
	}
	go func() {
		log.Debugf("Monitoring gofer health for container %q", info.cid)
		events := []unix.PollFd{
			{
				Fd:     int32(goferFD),
				Events: unix.POLLHUP | unix.POLLRDHUP,
			},
		}
		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
			// Use ppoll instead of poll because it's already allowed in seccomp.
			n, err := unix.Ppoll(events, nil, nil)
			return uintptr(n), 0, err
		})
		if err != nil {
			panic(fmt.Sprintf("Error monitoring gofer FDs: %s", err))
		}

		l.mu.Lock()
		defer l.mu.Unlock()

		// The gofer could have been stopped due to a normal container shutdown.
		// Check if the container has not stopped yet.
		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: info.cid}); tg != nil {
			log.Infof("Gofer socket disconnected, killing container %q", info.cid)
			if err := l.signalAllProcesses(info.cid, int32(linux.SIGKILL)); err != nil {
				log.Warningf("Error killing container %q after gofer stopped: %s", info.cid, err)
			}
		}
	}()
}

// destroySubcontainer stops a container if it is still running and cleans up
// its filesystem.
func (l *Loader) destroySubcontainer(cid string) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
	if err != nil {
		// Container doesn't exist.
		return err
	}

	// The container exists, but has it been started?
	if tg != nil {
		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
			return fmt.Errorf("sending SIGKILL to all container processes: %w", err)
		}
		// Wait for all processes that belong to the container to exit (including
		// exec'd processes).
		for _, t := range l.k.TaskSet().Root.Tasks() {
			if t.ContainerID() == cid {
				t.ThreadGroup().WaitExited()
			}
		}
	}

	// No more failures from this point on.

	// Remove all container thread groups from the map.
	for key := range l.processes {
		if key.cid == cid {
			delete(l.processes, key)
		}
	}
	// Cleanup the device gofer.
	l.k.RemoveDevGofer(cid)

	log.Debugf("Container destroyed, cid: %s", cid)
	return nil
}

func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
	// Hold the lock for the entire operation to ensure that exec'd process is
	// added to 'processes' in case it races with destroyContainer().
	l.mu.Lock()
	defer l.mu.Unlock()

	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
	if err != nil {
		return 0, err
	}
	if tg == nil {
		return 0, fmt.Errorf("container %q not started", args.ContainerID)
	}

	// Get the container MountNamespace from the Task. Trying to acquire a ref may fail
	// if it races with task exit.
	// task.MountNamespace() does not take a ref, so we must do so ourselves.
	args.MountNamespace = tg.Leader().MountNamespace()
	if args.MountNamespace == nil || !args.MountNamespace.TryIncRef() {
		return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
	}

	args.Envv, err = specutils.ResolveEnvs(args.Envv)
	if err != nil {
		return 0, fmt.Errorf("resolving env: %w", err)
	}

	// Add the HOME environment variable if it is not already set.
	sctx := l.k.SupervisorContext()
	root := args.MountNamespace.Root(sctx)
	defer root.DecRef(sctx)
	ctx := vfs.WithRoot(sctx, root)
	defer args.MountNamespace.DecRef(ctx)
	args.Envv, err = user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
	if err != nil {
		return 0, err
	}
	args.PIDNamespace = tg.PIDNamespace()

	args.Limits, err = createLimitSet(l.root.spec)
	if err != nil {
		return 0, fmt.Errorf("creating limits: %w", err)
	}

	// Start the process.
	proc := control.Proc{Kernel: l.k}
	newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args)
	if err != nil {
		return 0, err
	}

	eid := execID{cid: args.ContainerID, pid: tgid}
	l.processes[eid] = &execProcess{
		tg:  newTG,
		tty: ttyFile,
	}
	log.Debugf("updated processes: %v", l.processes)

	return tgid, nil
}

// waitContainer waits for the init process of a container to exit.
func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
	// Don't defer unlock, as doing so would make it impossible for
	// multiple clients to wait on the same container.
	tg, err := l.threadGroupFromID(execID{cid: cid})
	if err != nil {
		return fmt.Errorf("can't wait for container %q: %w", cid, err)
	}

	// If the thread either has already exited or exits during waiting,
	// consider the container exited.
	ws := l.wait(tg)
	*waitStatus = ws

	// Check for leaks and write coverage report after the root container has
	// exited. This guarantees that the report is written in cases where the
	// sandbox is killed by a signal after the ContMgrWait request is completed.
	if l.root.procArgs.ContainerID == cid {
		// All sentry-created resources should have been released at this point.
		_ = coverage.Report()
	}
	return nil
}

func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
	if tgid <= 0 {
		return fmt.Errorf("PID (%d) must be positive", tgid)
	}

	// Try to find a process that was exec'd
	eid := execID{cid: cid, pid: tgid}
	execTG, err := l.threadGroupFromID(eid)
	if err == nil {
		ws := l.wait(execTG)
		*waitStatus = ws

		l.mu.Lock()
		delete(l.processes, eid)
		log.Debugf("updated processes (removal): %v", l.processes)
		l.mu.Unlock()
		return nil
	}

	// The caller may be waiting on a process not started directly via exec.
	// In this case, find the process in the container's PID namespace.
	initTG, err := l.threadGroupFromID(execID{cid: cid})
	if err != nil {
		return fmt.Errorf("waiting for PID %d: %w", tgid, err)
	}
	tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
	if tg == nil {
		return fmt.Errorf("waiting for PID %d: no such process", tgid)
	}
	if tg.Leader().ContainerID() != cid {
		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
	}
	ws := l.wait(tg)
	*waitStatus = ws
	return nil
}

// wait waits for the process with TGID 'tgid' in a container's PID namespace
// to exit.
func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
	tg.WaitExited()
	return uint32(tg.ExitStatus())
}

// WaitForStartSignal waits for a start signal from the control server.
func (l *Loader) WaitForStartSignal() {
	<-l.ctrl.manager.startChan
}

// WaitExit waits for the root container to exit, and returns its exit status.
func (l *Loader) WaitExit() linux.WaitStatus {
	// Wait for container.
	l.k.WaitExited()

	return l.k.GlobalInit().ExitStatus()
}

func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID, userns *auth.UserNamespace) (*inet.Namespace, error) {
	// Create an empty network stack because the network namespace may be empty at
	// this point. Netns is configured before Run() is called. Netstack is
	// configured using a control uRPC message. Host network is configured inside
	// Run().
	switch conf.Network {
	case config.NetworkHost:
		// If configured for raw socket support with host network
		// stack, make sure that we have CAP_NET_RAW on the host,
		// otherwise we can't make raw sockets.
		if conf.EnableRaw && !specutils.HasCapabilities(capability.CAP_NET_RAW) {
			return nil, fmt.Errorf("configuring network=host with raw sockets requires CAP_NET_RAW capability")
		}
		// No network namespacing support for hostinet yet, hence creator is nil.
		return inet.NewRootNamespace(hostinet.NewStack(), nil, userns), nil

	case config.NetworkNone, config.NetworkSandbox:
		s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite)
		if err != nil {
			return nil, err
		}
		creator := &sandboxNetstackCreator{
			clock:                    clock,
			uniqueID:                 uniqueID,
			allowPacketEndpointWrite: conf.AllowPacketEndpointWrite,
		}
		return inet.NewRootNamespace(s, creator, userns), nil

	default:
		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
	}

}

func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) {
	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
	transProtos := []stack.TransportProtocolFactory{
		tcp.NewProtocol,
		udp.NewProtocol,
		icmp.NewProtocol4,
		icmp.NewProtocol6,
	}
	s := netstack.Stack{Stack: stack.New(stack.Options{
		NetworkProtocols:   netProtos,
		TransportProtocols: transProtos,
		Clock:              clock,
		Stats:              netstack.Metrics,
		HandleLocal:        true,
		// Enable raw sockets for users with sufficient
		// privileges.
		RawFactory:               raw.EndpointFactory{},
		AllowPacketEndpointWrite: allowPacketEndpointWrite,
		UniqueID:                 uniqueID,
		DefaultIPTables:          netfilter.DefaultLinuxTables,
	})}

	// Enable SACK Recovery.
	{
		opt := tcpip.TCPSACKEnabled(true)
		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
		}
	}

	// Set default TTLs as required by socket/netstack.
	{
		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
		}
		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
		}
	}

	// Enable Receive Buffer Auto-Tuning.
	{
		opt := tcpip.TCPModerateReceiveBufferOption(true)
		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
		}
	}

	return &s, nil
}

// sandboxNetstackCreator implements kernel.NetworkStackCreator.
//
// +stateify savable
type sandboxNetstackCreator struct {
	clock                    tcpip.Clock
	uniqueID                 stack.UniqueID
	allowPacketEndpointWrite bool
}

// CreateStack implements kernel.NetworkStackCreator.CreateStack.
func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite)
	if err != nil {
		return nil, err
	}

	// Setup loopback.
	n := &Network{Stack: s.(*netstack.Stack).Stack}
	nicID := tcpip.NICID(f.uniqueID.UniqueID())
	link := DefaultLoopbackLink
	linkEP := ethernet.New(loopback.New())
	opts := stack.NICOptions{
		Name:               link.Name,
		DeliverLinkPackets: true,
	}

	if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
		return nil, err
	}

	return s, nil
}

// signal sends a signal to one or more processes in a container. If PID is 0,
// then the container init process is used. Depending on the SignalDeliveryMode
// option, the signal may be sent directly to the indicated process, to all
// processes in the container, or to the foreground process group. pid is
// relative to the root PID namespace, not the container's.
func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
	if pid < 0 {
		return fmt.Errorf("PID (%d) must be positive", pid)
	}

	switch mode {
	case DeliverToProcess:
		if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
			return fmt.Errorf("signaling process in container %q PID %d: %w", cid, pid, err)
		}
		return nil

	case DeliverToForegroundProcessGroup:
		if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
			return fmt.Errorf("signaling foreground process group in container %q PID %d: %w", cid, pid, err)
		}
		return nil

	case DeliverToAllProcesses:
		if pid != 0 {
			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
		}
		// Check that the container has actually started before signaling it.
		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
			return err
		}
		if err := l.signalAllProcesses(cid, signo); err != nil {
			return fmt.Errorf("signaling all processes in container %q: %w", cid, err)
		}
		return nil

	default:
		panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
	}
}

// signalProcess sends signal to process in the given container. tgid is
// relative to the root PID namespace, not the container's.
func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
	if err == nil {
		// Send signal directly to the identified process.
		return l.k.SendExternalSignalThreadGroup(execTG, &linux.SignalInfo{Signo: signo})
	}

	// The caller may be signaling a process not started directly via exec.
	// In this case, find the process and check that the process belongs to the
	// container in question.
	tg := l.k.RootPIDNamespace().ThreadGroupWithID(tgid)
	if tg == nil {
		return fmt.Errorf("no such process with PID %d", tgid)
	}
	if tg.Leader().ContainerID() != cid {
		return fmt.Errorf("process %d belongs to a different container: %q", tgid, tg.Leader().ContainerID())
	}
	return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
}

// signalForegrondProcessGroup looks up the foreground process group from the TTY
// for the given "tgid" inside container "cid", and sends the signal to it.
func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
	l.mu.Lock()
	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
	if err != nil {
		l.mu.Unlock()
		return fmt.Errorf("no thread group found: %w", err)
	}
	if tg == nil {
		l.mu.Unlock()
		return fmt.Errorf("container %q not started", cid)
	}

	tty, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
	l.mu.Unlock()
	if err != nil {
		return fmt.Errorf("no thread group found: %w", err)
	}
	if tty == nil {
		return fmt.Errorf("no TTY attached")
	}
	pg := tty.ForegroundProcessGroup()
	si := &linux.SignalInfo{Signo: signo}
	if pg == nil {
		// No foreground process group has been set. Signal the
		// original thread group.
		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
		return l.k.SendExternalSignalThreadGroup(tg, si)
	}
	// Send the signal to all processes in the process group.
	return l.k.SendExternalSignalProcessGroup(pg, si)
}

// signalAllProcesses signals all processes that belong to the specified container.
// It's a noop if the container hasn't started or has exited.
func (l *Loader) signalAllProcesses(cid string, signo int32) error {
	// Pause the kernel to prevent new processes from being created while
	// the signal is delivered. This prevents process leaks when SIGKILL is
	// sent to the entire container.
	l.k.Pause()
	defer l.k.Unpause()
	return l.k.SendContainerSignal(cid, &linux.SignalInfo{Signo: signo})
}

// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
// acquires mutex before calling it and fails in case container hasn't started
// yet.
func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	tg, err := l.tryThreadGroupFromIDLocked(key)
	if err != nil {
		return nil, err
	}
	if tg == nil {
		return nil, fmt.Errorf("container %q not started", key.cid)
	}
	return tg, nil
}

// tryThreadGroupFromIDLocked returns the thread group for the given execution
// ID. It may return nil in case the container has not started yet. Returns
// error if execution ID is invalid or if the container cannot be found (maybe
// it has been deleted). Caller must hold 'mu'.
func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
	ep := l.processes[key]
	if ep == nil {
		return nil, fmt.Errorf("container %q not found", key.cid)
	}
	return ep.tg, nil
}

// ttyFromIDLocked returns the TTY files for the given execution ID. It may
// return nil in case the container has not started yet. Returns error if
// execution ID is invalid or if the container cannot be found (maybe it has
// been deleted). Caller must hold 'mu'.
func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileDescription, error) {
	ep := l.processes[key]
	if ep == nil {
		return nil, fmt.Errorf("container %q not found", key.cid)
	}
	return ep.tty, nil
}

func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, passFDs []fdMapping, user specs.User, containerName string) (*kernel.FDTable, *host.TTYFileDescription, error) {
	if len(stdioFDs) != 3 {
		return nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
	}
	fdMap := map[int]*fd.FD{
		0: stdioFDs[0],
		1: stdioFDs[1],
		2: stdioFDs[2],
	}

	// Create the entries for the host files that were passed to our app.
	for _, customFD := range passFDs {
		if customFD.guest < 0 {
			return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater")
		}
		fdMap[customFD.guest] = customFD.host
	}

	k := kernel.KernelFromContext(ctx)
	fdTable := k.NewFDTable()
	ttyFile, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), fdMap, containerName)
	if err != nil {
		fdTable.DecRef(ctx)
		return nil, nil, err
	}
	return fdTable, ttyFile, nil
}

// portForward implements initiating a portForward connection in the sandbox. portForwardProxies
// represent two connections, each copying to the other (read ends to write ends) in goroutines.
// The proxies are stored and can be cleaned up, or clean up after themselves if the connection
// is broken.
func (l *Loader) portForward(opts *PortForwardOpts) error {
	// Validate that we have a stream FD to write to. If this happens then
	// it means there is a misbehaved urpc client or a bug has occurred.
	if len(opts.Files) != 1 {
		return fmt.Errorf("stream FD is required for port forward")
	}

	l.mu.Lock()
	defer l.mu.Unlock()

	cid := opts.ContainerID
	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
	if err != nil {
		return fmt.Errorf("failed to get threadgroup from %q: %w", cid, err)
	}
	if tg == nil {
		return fmt.Errorf("container %q not started", cid)
	}

	// Import the fd for the UDS.
	ctx := l.k.SupervisorContext()
	fd, err := l.importFD(ctx, opts.Files[0])
	if err != nil {
		return fmt.Errorf("importing stream fd: %w", err)
	}
	cu := cleanup.Make(func() { fd.DecRef(ctx) })
	defer cu.Clean()

	fdConn := pf.NewFileDescriptionConn(fd)

	// Create a proxy to forward data between the fdConn and the sandboxed application.
	pair := pf.ProxyPair{To: fdConn}

	switch l.root.conf.Network {
	case config.NetworkSandbox:
		stack := l.k.RootNetworkNamespace().Stack().(*netstack.Stack).Stack
		nsConn, err := pf.NewNetstackConn(stack, opts.Port)
		if err != nil {
			return fmt.Errorf("creating netstack port forward connection: %w", err)
		}
		pair.From = nsConn
	case config.NetworkHost:
		hConn, err := pf.NewHostInetConn(opts.Port)
		if err != nil {
			return fmt.Errorf("creating hostinet port forward connection: %w", err)
		}
		pair.From = hConn
	default:
		return fmt.Errorf("unsupported network type %q for container %q", l.root.conf.Network, cid)
	}
	cu.Release()
	proxy := pf.NewProxy(pair, opts.ContainerID)

	// Add to the list of port forward connections and remove when the
	// connection closes.
	l.portForwardProxies = append(l.portForwardProxies, proxy)
	proxy.AddCleanup(func() {
		l.mu.Lock()
		defer l.mu.Unlock()
		for i := range l.portForwardProxies {
			if l.portForwardProxies[i] == proxy {
				l.portForwardProxies = append(l.portForwardProxies[:i], l.portForwardProxies[i+1:]...)
				break
			}
		}
	})

	// Start forwarding on the connection.
	proxy.Start(ctx)
	return nil
}

// importFD generically imports a host file descriptor without adding it to any
// fd table.
func (l *Loader) importFD(ctx context.Context, f *os.File) (*vfs.FileDescription, error) {
	hostFD, err := fd.NewFromFile(f)
	if err != nil {
		return nil, err
	}
	defer hostFD.Close()
	fd, err := host.NewFD(ctx, l.k.HostMount(), hostFD.FD(), &host.NewFDOptions{
		Savable:      false, // We disconnect and close on save.
		IsTTY:        false,
		VirtualOwner: false, // FD not visible to the sandboxed app so user can't be changed.
	})

	if err != nil {
		return nil, err
	}
	hostFD.Release()
	return fd, nil
}

func (l *Loader) containerCount() int {
	l.mu.Lock()
	defer l.mu.Unlock()

	containers := 0
	for id := range l.processes {
		if id.pid == 0 {
			// pid==0 represents the init process of a container. There is
			// only one such process per container.
			containers++
		}
	}
	return containers
}

func (l *Loader) pidsCount(cid string) (int, error) {
	l.mu.Lock()
	defer l.mu.Unlock()

	if _, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}); err != nil {
		// Container doesn't exist.
		return 0, err
	}
	return l.k.TaskSet().Root.NumTasksPerContainer(cid), nil
}