// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package boot loads the kernel and runs a container.
package boot
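
// Note on the import block below: several imports are blank ("_") because they
// are imported only for their side effects. Per the in-line comments, the
// runsc/boot/platforms import registers all supported platforms, and the
// netlink/unix socket imports register additional socket providers at init()
// time.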

import (
	"errors"
	"fmt"
	mrand "math/rand"
	"os"
	"runtime"
	gtime "time"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/bpf"
	"github.com/MerlinKodo/gvisor/pkg/cleanup"
	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/coverage"
	"github.com/MerlinKodo/gvisor/pkg/cpuid"
	"github.com/MerlinKodo/gvisor/pkg/fd"
	"github.com/MerlinKodo/gvisor/pkg/log"
	"github.com/MerlinKodo/gvisor/pkg/memutil"
	"github.com/MerlinKodo/gvisor/pkg/rand"
	"github.com/MerlinKodo/gvisor/pkg/refs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/control"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fdimport"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/host"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/tmpfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/user"
	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
	"github.com/MerlinKodo/gvisor/pkg/sentry/loader"
	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
	"github.com/MerlinKodo/gvisor/pkg/sentry/seccheck"
	pb "github.com/MerlinKodo/gvisor/pkg/sentry/seccheck/points/points_go_proto"
	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/netfilter"
	"github.com/MerlinKodo/gvisor/pkg/sentry/time"
	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/watchdog"
	"github.com/MerlinKodo/gvisor/pkg/sighandling"
	"github.com/MerlinKodo/gvisor/pkg/sync"
	"github.com/MerlinKodo/gvisor/pkg/tcpip"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/link/ethernet"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/link/loopback"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/link/packetsocket"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/link/sniffer"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/network/arp"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/network/ipv4"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/network/ipv6"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/stack"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/transport/icmp"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/transport/raw"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/transport/tcp"
	"github.com/MerlinKodo/gvisor/pkg/tcpip/transport/udp"
	"github.com/MerlinKodo/gvisor/runsc/boot/filter"
	_ "github.com/MerlinKodo/gvisor/runsc/boot/platforms" // register all platforms.
	pf "github.com/MerlinKodo/gvisor/runsc/boot/portforward"
	"github.com/MerlinKodo/gvisor/runsc/boot/pprof"
	"github.com/MerlinKodo/gvisor/runsc/config"
	"github.com/MerlinKodo/gvisor/runsc/profile"
	"github.com/MerlinKodo/gvisor/runsc/specutils"
	"github.com/MerlinKodo/gvisor/runsc/specutils/seccomp"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/syndtr/gocapability/capability"
	"golang.org/x/sys/unix"

	// Top-level inet providers.
	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/hostinet"
	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/netstack"

	// Include other supported socket providers.
	_ "github.com/MerlinKodo/gvisor/pkg/sentry/socket/netlink"
	_ "github.com/MerlinKodo/gvisor/pkg/sentry/socket/netlink/route"
	_ "github.com/MerlinKodo/gvisor/pkg/sentry/socket/netlink/uevent"
	_ "github.com/MerlinKodo/gvisor/pkg/sentry/socket/unix"
)

type containerInfo struct {
	conf *config.Config

	// spec is the base configuration for the root container.
	spec *specs.Spec

	// procArgs refers to the container's init task.
	procArgs kernel.CreateProcessArgs

	// stdioFDs contains stdin, stdout, and stderr.
	stdioFDs []*fd.FD

	// passFDs are mappings of user-supplied host to guest file descriptors.
	passFDs []fdMapping

	// execFD is the host file descriptor used for program execution.
	execFD *fd.FD

	// goferFDs are the FDs that attach the sandbox to the gofers.
	goferFDs []*fd.FD

	// overlayFilestoreFDs are the FDs to the regular files that will back the
	// tmpfs upper mount in the overlay mounts.
	overlayFilestoreFDs []*fd.FD

	// overlayMediums contains information about how the gofer mounts have been
	// overlaid. The first entry is for rootfs and the following entries are for
	// bind mounts in spec.Mounts (in the same order).
	overlayMediums []OverlayMedium

	// nvidiaUVMDevMajor is the device major number used for nvidia-uvm.
	nvidiaUVMDevMajor uint32
}

// Loader keeps state needed to start the kernel and run the container.
type Loader struct {
	// k is the kernel.
	k *kernel.Kernel

	// ctrl is the control server.
	ctrl *controller

	// root contains information about the root container in the sandbox.
	root containerInfo

	watchdog *watchdog.Watchdog

	// stopSignalForwarding disables forwarding of signals to the sandboxed
	// container. It should be called when a sandbox is destroyed.
	stopSignalForwarding func()

	// stopProfiling stops profiling started at container creation. It
	// should be called when a sandbox is destroyed.
	stopProfiling func()

	// PreSeccompCallback is called right before installing seccomp filters.
	PreSeccompCallback func()

	// restore is set to true if we are restoring a container.
	restore bool

	// sandboxID is the ID for the whole sandbox.
	sandboxID string

	// mountHints provides extra information about mounts for containers that
	// apply to the entire pod.
	mountHints *PodMountHints

	// sharedMounts holds VFS mounts that may be shared between containers
	// within the same pod. It is keyed by mount source.
	sharedMounts map[string]*vfs.Mount

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// nvidiaUVMDevMajor is the device major number used for nvidia-uvm.
	nvidiaUVMDevMajor uint32

	// mu guards processes and portForwardProxies.
	mu sync.Mutex

	// processes maps containers' init processes and exec invocations. Root
	// processes are keyed with container ID and pid=0, while exec invocations
	// have the corresponding pid set.
	//
	// processes is guarded by mu.
	processes map[execID]*execProcess

	// portForwardProxies is a list of active port forwarding connections.
	//
	// portForwardProxies is guarded by mu.
	portForwardProxies []*pf.Proxy
}
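
// Illustrative note (values are hypothetical): in the processes map above, the
// init process of container "c1" is keyed as execID{cid: "c1"} with a zero
// pid, while a process created via exec with thread group ID 42 in the same
// container is keyed as execID{cid: "c1", pid: 42}. See executeAsync below,
// which builds exactly this kind of key.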

// execID uniquely identifies a sentry process that is executed in a container.
type execID struct {
	cid string
	pid kernel.ThreadID
}

// execProcess contains the thread group and host TTY of a sentry process.
type execProcess struct {
	// tg will be nil for containers that haven't started yet.
	tg *kernel.ThreadGroup

	// tty will be nil if the process is not attached to a terminal.
	tty *host.TTYFileDescription

	// pidnsPath is the PID namespace path in the spec.
	pidnsPath string

	// hostTTY is present when creating a sub-container with terminal enabled.
	// TTY file is passed during container create and must be saved until
	// container start.
	hostTTY *fd.FD
}

// fdMapping maps guest to host file descriptors. Guest file descriptors are
// exposed to the application inside the sandbox through the FD table.
type fdMapping struct {
	guest int
	host  *fd.FD
}

// FDMapping is a helper type to represent a mapping from guest to host file
// descriptors. In contrast to the unexported fdMapping type, it does not imply
// file ownership.
type FDMapping struct {
	Guest int
	Host  int
}
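
// Illustrative sketch (FD numbers are hypothetical): to expose host FD 10 to
// the sandboxed application as its FD 5, a caller would include
//
//	FDMapping{Guest: 5, Host: 10}
//
// in Args.PassFDs. The host FD is wrapped in an owning fd.FD when New copies
// it into the internal fdMapping form.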

func init() {
	// Initialize the random number generator.
	mrand.Seed(gtime.Now().UnixNano())
}

// Args are the arguments for New().
type Args struct {
	// ID is the sandbox ID.
	ID string
	// Spec is the sandbox specification.
	Spec *specs.Spec
	// Conf is the system configuration.
	Conf *config.Config
	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
	// of this FD and may close it at any time.
	ControllerFD int
	// Device is an optional argument that is passed to the platform. The Loader
	// takes ownership of this file and may close it at any time.
	Device *os.File
	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
	// takes ownership of these FDs and may close them at any time.
	GoferFDs []int
	// StdioFDs is the stdio for the application. The Loader takes ownership of
	// these FDs and may close them at any time.
	StdioFDs []int
	// PassFDs are user-supplied FD mappings from host to guest descriptors.
	// The Loader takes ownership of these FDs and may close them at any time.
	PassFDs []FDMapping
	// ExecFD is the host file descriptor used for program execution.
	ExecFD int
	// OverlayFilestoreFDs are the FDs to the regular files that will back the
	// tmpfs upper mount in the overlay mounts.
	OverlayFilestoreFDs []int
	// OverlayMediums contains information about how the gofer mounts have been
	// overlaid. The first entry is for rootfs and the following entries are for
	// bind mounts in Spec.Mounts (in the same order).
	OverlayMediums []OverlayMedium
	// NumCPU is the number of CPUs to create inside the sandbox.
	NumCPU int
	// TotalMem is the initial amount of total memory to report back to the
	// container.
	TotalMem uint64
	// TotalHostMem is the total memory reported by host /proc/meminfo.
	TotalHostMem uint64
	// UserLogFD is the file descriptor to write user logs to.
	UserLogFD int
	// ProductName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	ProductName string
	// PodInitConfigFD is the file descriptor to a file passed in the
	// --pod-init-config flag.
	PodInitConfigFD int
	// SinkFDs is an ordered array of file descriptors to be used by seccheck
	// sinks configured from the --pod-init-config file.
	SinkFDs []int
	// ProfileOpts contains the set of profiles to enable and the
	// corresponding FDs where profile data will be written.
	ProfileOpts profile.Opts
}

// startingStdioFD is the first FD that stdio FDs are remapped to, so that they
// are always the same on initial start and on restore.
const startingStdioFD = 256
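
// For example (hypothetical FD numbers): with Args.StdioFDs = []int{3, 4, 5},
// New dup3()s those host FDs to 256, 257, and 258 and closes the originals, so
// that a later restore sees the same FD numbers no matter which host FDs the
// caller happened to pass.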

// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
func New(args Args) (*Loader, error) {
	stopProfiling := profile.Start(args.ProfileOpts)

	// Initialize seccheck points.
	seccheck.Initialize()

	// We initialize the rand package now to make sure /dev/urandom is pre-opened
	// on kernels that do not support getrandom(2).
	if err := rand.Init(); err != nil {
		return nil, fmt.Errorf("setting up rand: %w", err)
	}

	if err := usage.Init(); err != nil {
		return nil, fmt.Errorf("setting up memory usage: %w", err)
	}

	kernel.IOUringEnabled = args.Conf.IOUring

	info := containerInfo{
		conf:           args.Conf,
		spec:           args.Spec,
		overlayMediums: args.OverlayMediums,
	}

	// Make host FDs stable between invocations. Host FDs must map to the exact
	// same number when the sandbox is restored. Otherwise the wrong FD will be
	// used.
	newfd := startingStdioFD

	for _, stdioFD := range args.StdioFDs {
		// Check that newfd is unused to avoid clobbering over it.
		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
			if err != nil {
				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
			}
			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
		}

		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
		if err != nil {
			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
		}
		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
		_ = unix.Close(stdioFD)
		newfd++
	}
	for _, goferFD := range args.GoferFDs {
		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
	}
	for _, overlayFD := range args.OverlayFilestoreFDs {
		info.overlayFilestoreFDs = append(info.overlayFilestoreFDs, fd.New(overlayFD))
	}

	if args.ExecFD >= 0 {
		info.execFD = fd.New(args.ExecFD)
	}

	for _, customFD := range args.PassFDs {
		info.passFDs = append(info.passFDs, fdMapping{
			host:  fd.New(customFD.Host),
			guest: customFD.Guest,
		})
	}

	// Create kernel and platform.
	p, err := createPlatform(args.Conf, args.Device)
	if err != nil {
		return nil, fmt.Errorf("creating platform: %w", err)
	}
	if args.Conf.NVProxy && p.OwnsPageTables() {
		return nil, fmt.Errorf("--nvproxy is incompatible with platform %s: owns page tables", args.Conf.Platform)
	}
	k := &kernel.Kernel{
		Platform: p,
	}

	// Create memory file.
	mf, err := createMemoryFile()
	if err != nil {
		return nil, fmt.Errorf("creating memory file: %w", err)
	}
	k.SetMemoryFile(mf)

	// Create VDSO.
	//
	// Pass k as the platform since it is savable, unlike the actual platform.
	vdso, err := loader.PrepareVDSO(k)
	if err != nil {
		return nil, fmt.Errorf("creating vdso: %w", err)
	}

	// Create timekeeper.
	tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
	tk.SetClocks(time.NewCalibratedClocks())

	if err := enableStrace(args.Conf); err != nil {
		return nil, fmt.Errorf("enabling strace: %w", err)
	}

	// Create capabilities.
	caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
	if err != nil {
		return nil, fmt.Errorf("converting capabilities: %w", err)
	}

	// Convert the spec's additional GIDs to KGIDs.
	extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
	for _, GID := range args.Spec.Process.User.AdditionalGids {
		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
	}

	// Create credentials.
	creds := auth.NewUserCredentials(
		auth.KUID(args.Spec.Process.User.UID),
		auth.KGID(args.Spec.Process.User.GID),
		extraKGIDs,
		caps,
		auth.NewRootUserNamespace())

	// Create root network namespace/stack.
	netns, err := newRootNetworkNamespace(args.Conf, tk, k, creds.UserNamespace)
	if err != nil {
		return nil, fmt.Errorf("creating network: %w", err)
	}

	if args.NumCPU == 0 {
		args.NumCPU = runtime.NumCPU()
	}
	log.Infof("CPUs: %d", args.NumCPU)
	runtime.GOMAXPROCS(args.NumCPU)

	if args.TotalHostMem > 0 {
		// As per tmpfs(5), the default size limit is 50% of total physical RAM.
		// See mm/shmem.c:shmem_default_max_blocks().
		tmpfs.SetDefaultSizeLimit(args.TotalHostMem / 2)
	}

	if args.TotalMem > 0 {
		// Adjust the total memory returned by the Sentry so that applications that
		// use /proc/meminfo can make allocations based on this limit.
		usage.MinimumTotalMemoryBytes = args.TotalMem
		usage.MaximumTotalMemoryBytes = args.TotalMem
		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
	}

	// Initialize the Kernel object, which is required by the Context passed
	// to createVFS in order to mount (among other things) procfs.
	if err = k.Init(kernel.InitKernelArgs{
		FeatureSet:                  cpuid.HostFeatureSet().Fixed(),
		Timekeeper:                  tk,
		RootUserNamespace:           creds.UserNamespace,
		RootNetworkNamespace:        netns,
		ApplicationCores:            uint(args.NumCPU),
		Vdso:                        vdso,
		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
	}); err != nil {
		return nil, fmt.Errorf("initializing kernel: %w", err)
	}

	if err := registerFilesystems(k, &info); err != nil {
		return nil, fmt.Errorf("registering filesystems: %w", err)
	}

	// Turn on packet logging if enabled.
	if args.Conf.LogPackets {
		log.Infof("Packet logging enabled")
		sniffer.LogPackets.Store(1)
	} else {
		log.Infof("Packet logging disabled")
		sniffer.LogPackets.Store(0)
	}

	// Create a watchdog.
	dogOpts := watchdog.DefaultOpts
	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
	dog := watchdog.New(k, dogOpts)

	procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
	if err != nil {
		return nil, fmt.Errorf("creating init process for root container: %w", err)
	}
	info.procArgs = procArgs

	if err := initCompatLogs(args.UserLogFD); err != nil {
		return nil, fmt.Errorf("initializing compat logs: %w", err)
	}

	mountHints, err := NewPodMountHints(args.Spec)
	if err != nil {
		return nil, fmt.Errorf("creating pod mount hints: %w", err)
	}

	// Set up host mount that will be used for imported fds.
	hostFilesystem, err := host.NewFilesystem(k.VFS())
	if err != nil {
		return nil, fmt.Errorf("failed to create hostfs filesystem: %w", err)
	}
	defer hostFilesystem.DecRef(k.SupervisorContext())
	k.SetHostMount(k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}))

	if args.PodInitConfigFD >= 0 {
		if err := setupSeccheck(args.PodInitConfigFD, args.SinkFDs); err != nil {
			log.Warningf("unable to configure event session: %v", err)
		}
	}

	eid := execID{cid: args.ID}
	l := &Loader{
		k:                 k,
		watchdog:          dog,
		sandboxID:         args.ID,
		processes:         map[execID]*execProcess{eid: {}},
		mountHints:        mountHints,
		root:              info,
		stopProfiling:     stopProfiling,
		productName:       args.ProductName,
		nvidiaUVMDevMajor: info.nvidiaUVMDevMajor,
	}

	// We don't care about child signals; some platforms can generate a
	// tremendous number of useless ones (I'm looking at you, ptrace).
	if err := sighandling.IgnoreChildStop(); err != nil {
		return nil, fmt.Errorf("ignore child stop signals failed: %w", err)
	}

	// Create the control server using the provided FD.
	//
	// This must be done *after* we have initialized the kernel since the
	// controller is used to configure the kernel's network stack.
	ctrl, err := newController(args.ControllerFD, l)
	if err != nil {
		return nil, fmt.Errorf("creating control server: %w", err)
	}
	l.ctrl = ctrl

	// Only start serving after the Loader is set on the controller and the
	// controller is set on the Loader, because both are used in the urpc
	// methods.
	if err := ctrl.srv.StartServing(); err != nil {
		return nil, fmt.Errorf("starting control server: %w", err)
	}

	return l, nil
}
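
// A typical caller (an illustrative sketch; runsc's boot command is the real
// driver) uses the Loader roughly as follows:
//
//	l, err := New(args)
//	if err != nil {
//		// handle error
//	}
//	l.WaitForStartSignal() // wait for the start RPC from the manager.
//	if err := l.Run(); err != nil {
//		// handle error
//	}
//	ws := l.WaitExit()
//	l.Destroy()
//
// All of the methods above are defined in this file.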

// createProcessArgs creates args that can be used with kernel.CreateProcess.
func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
	// Create initial limits.
	ls, err := createLimitSet(spec)
	if err != nil {
		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %w", err)
	}
	env, err := specutils.ResolveEnvs(spec.Process.Env)
	if err != nil {
		return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err)
	}

	wd := spec.Process.Cwd
	if wd == "" {
		wd = "/"
	}

	// Create the process arguments.
	procArgs := kernel.CreateProcessArgs{
		Argv:                    spec.Process.Args,
		Envv:                    env,
		WorkingDirectory:        wd,
		Credentials:             creds,
		Umask:                   0022,
		Limits:                  ls,
		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
		UTSNamespace:            k.RootUTSNamespace(),
		IPCNamespace:            k.RootIPCNamespace(),
		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
		ContainerID:             id,
		PIDNamespace:            pidns,
	}

	return procArgs, nil
}

// Destroy cleans up all resources used by the loader.
//
// Note that this will block until all open control server connections have
// been closed. For that reason, this should NOT be called in a defer, because
// a panic in a control server rpc would then hang forever.
func (l *Loader) Destroy() {
	if l.stopSignalForwarding != nil {
		l.stopSignalForwarding()
	}
	l.watchdog.Stop()

	// Stop the control server. This will indirectly stop any
	// long-running control operations that are in flight, e.g.
	// profiling operations.
	l.ctrl.stop()

	// Release all kernel resources. This is only safe after we can no longer
	// save/restore.
	l.k.Release()

	// Release any dangling tcp connections.
	tcpip.ReleaseDanglingEndpoints()

	// In the success case, stdioFDs and goferFDs will only contain
	// released/closed FDs whose ownership has been passed over to host FDs
	// and gofer sessions. Close them here in case of failure.
	for _, f := range l.root.stdioFDs {
		_ = f.Close()
	}
	for _, f := range l.root.passFDs {
		_ = f.host.Close()
	}
	for _, f := range l.root.goferFDs {
		_ = f.Close()
	}

	l.stopProfiling()
}

func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) {
	p, err := platform.Lookup(conf.Platform)
	if err != nil {
		panic(fmt.Sprintf("invalid platform %s: %s", conf.Platform, err))
	}
	log.Infof("Platform: %s", conf.Platform)
	return p.New(deviceFile)
}

func createMemoryFile() (*pgalloc.MemoryFile, error) {
	const memfileName = "runsc-memory"
	memfd, err := memutil.CreateMemFD(memfileName, 0)
	if err != nil {
		return nil, fmt.Errorf("error creating memfd: %w", err)
	}
	memfile := os.NewFile(uintptr(memfd), memfileName)
	// We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
	// there are memory cgroups specified, because at this point we're already
	// in a mount namespace in which the relevant cgroupfs is not visible.
	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
	if err != nil {
		_ = memfile.Close()
		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err)
	}
	return mf, nil
}

// installSeccompFilters installs sandbox seccomp filters with the host.
func (l *Loader) installSeccompFilters() error {
	if l.PreSeccompCallback != nil {
		l.PreSeccompCallback()
	}
	if l.root.conf.DisableSeccomp {
		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
	} else {
		hostnet := l.root.conf.Network == config.NetworkHost
		opts := filter.Options{
			Platform:              l.k.Platform,
			HostNetwork:           hostnet,
			HostNetworkRawSockets: hostnet && l.root.conf.EnableRaw,
			HostFilesystem:        l.root.conf.DirectFS,
			ProfileEnable:         l.root.conf.ProfileEnable,
			NVProxy:               l.root.conf.NVProxy,
			TPUProxy:              l.root.conf.TPUProxy,
			ControllerFD:          l.ctrl.srv.FD(),
		}
		if err := filter.Install(opts); err != nil {
			return fmt.Errorf("installing seccomp filters: %w", err)
		}
	}
	return nil
}

// Run runs the root container.
func (l *Loader) Run() error {
	err := l.run()
	l.ctrl.manager.startResultChan <- err
	if err != nil {
		// Give the controller some time to send the error to the
		// runtime. If we return too quickly here the process will exit
		// and the control connection will be closed before the error
		// is returned.
		gtime.Sleep(2 * gtime.Second)
		return err
	}
	return nil
}

func (l *Loader) run() error {
	if l.root.conf.Network == config.NetworkHost {
		// Delay host network configuration to this point because network namespace
		// is configured after the loader is created and before Run() is called.
		log.Debugf("Configuring host network")
		s := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
		if err := s.Configure(l.root.conf.EnableRaw); err != nil {
			return err
		}
	}

	l.mu.Lock()
	defer l.mu.Unlock()

	eid := execID{cid: l.sandboxID}
	ep, ok := l.processes[eid]
	if !ok {
		return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
	}

	// If we are restoring, we do not want to create a process.
	// l.restore is set by the container manager when a restore call is made.
	if !l.restore {
		if l.root.conf.ProfileEnable {
			pprof.Initialize()
		}

		// Finally done with all configuration. Set up filters before user code
		// is loaded.
		if err := l.installSeccompFilters(); err != nil {
			return err
		}

		// Create the root container init task. It will begin running
		// when the kernel is started.
		var (
			tg  *kernel.ThreadGroup
			err error
		)
		tg, ep.tty, err = l.createContainerProcess(true, l.sandboxID, &l.root)
		if err != nil {
			return err
		}

		if seccheck.Global.Enabled(seccheck.PointContainerStart) {
			evt := pb.Start{
				Id:       l.sandboxID,
				Cwd:      l.root.spec.Process.Cwd,
				Args:     l.root.spec.Process.Args,
				Terminal: l.root.spec.Process.Terminal,
			}
			fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart)
			if fields.Local.Contains(seccheck.FieldContainerStartEnv) {
				evt.Env = l.root.spec.Process.Env
			}
			if !fields.Context.Empty() {
				evt.ContextData = &pb.ContextData{}
				kernel.LoadSeccheckData(tg.Leader(), fields.Context, evt.ContextData)
			}
			_ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
				return c.ContainerStart(context.Background(), fields, &evt)
			})
		}
	}

	ep.tg = l.k.GlobalInit()
	if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok {
		ep.pidnsPath = ns.Path
	}

	// Handle signals by forwarding them to the root container process
	// (except for panic signal, which should cause a panic).
	l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
		// Panic signal should cause a panic.
		if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) {
			panic("Signal-induced panic")
		}

		// Otherwise forward to root container.
		deliveryMode := DeliverToProcess
		if l.root.spec.Process.Terminal {
			// Since we are running with a console, we should forward the signal to
			// the foreground process group so that job control signals like ^C can
			// be handled properly.
			deliveryMode = DeliverToForegroundProcessGroup
		}
		log.Infof("Received external signal %d, mode: %s", sig, deliveryMode)
		if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
			log.Warningf("error sending signal %s to container %q: %s", sig, l.sandboxID, err)
		}
	})

	log.Infof("Process should have started...")
	l.watchdog.Start()
	return l.k.Start()
}
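
// Subcontainers are managed in two phases: createSubcontainer registers a
// placeholder execProcess (carrying the host TTY, if any) under the
// container's execID, and startSubcontainer later creates and starts the
// actual init process. For example, a terminal-enabled sub-container passes
// its TTY FD at create time; start then uses that saved TTY as stdin, stdout,
// and stderr.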

// createSubcontainer creates a new container inside the sandbox.
func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	eid := execID{cid: cid}
	if _, ok := l.processes[eid]; ok {
		return fmt.Errorf("container %q already exists", cid)
	}
	l.processes[eid] = &execProcess{hostTTY: tty}
	return nil
}

// startSubcontainer starts a child container. Used FDs are either closed or
// released. It's safe for the caller to close any remaining files upon return.
func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs, overlayFilestoreFDs []*fd.FD, overlayMediums []OverlayMedium) error {
	// Create capabilities.
	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
	if err != nil {
		return fmt.Errorf("creating capabilities: %w", err)
	}

	l.mu.Lock()
	defer l.mu.Unlock()

	ep := l.processes[execID{cid: cid}]
	if ep == nil {
		return fmt.Errorf("trying to start a deleted container %q", cid)
	}

	// Convert the spec's additional GIDs to KGIDs.
	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
	for _, GID := range spec.Process.User.AdditionalGids {
		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
	}

	// Create credentials. We reuse the root user namespace because the
	// sentry currently supports only 1 mount namespace, which is tied to a
	// single user namespace. Thus we must run in the same user namespace
	// to access mounts.
	creds := auth.NewUserCredentials(
		auth.KUID(spec.Process.User.UID),
		auth.KGID(spec.Process.User.GID),
		extraKGIDs,
		caps,
		l.k.RootUserNamespace())

	var pidns *kernel.PIDNamespace
	if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok {
		if ns.Path != "" {
			for _, p := range l.processes {
				if ns.Path == p.pidnsPath {
					log.Debugf("Joining PID namespace named %q", ns.Path)
					pidns = p.tg.PIDNamespace()
					break
				}
			}
		}
		if pidns == nil {
			log.Warningf("PID namespace %q not found, running in new PID namespace", ns.Path)
			pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
		}
		ep.pidnsPath = ns.Path
	} else {
		pidns = l.k.RootPIDNamespace()
	}

	info := &containerInfo{
		conf:                conf,
		spec:                spec,
		goferFDs:            goferFDs,
		overlayFilestoreFDs: overlayFilestoreFDs,
		overlayMediums:      overlayMediums,
		nvidiaUVMDevMajor:   l.nvidiaUVMDevMajor,
	}
	info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
	if err != nil {
		return fmt.Errorf("creating new process: %w", err)
	}

	// Use stdios or TTY depending on the spec configuration.
	if spec.Process.Terminal {
		if n := len(stdioFDs); n != 0 {
			return fmt.Errorf("using TTY, stdios not expected: %d", n)
		}
		if ep.hostTTY == nil {
			return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?")
		}
		info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
		ep.hostTTY = nil
	} else {
		info.stdioFDs = stdioFDs
	}

	ep.tg, ep.tty, err = l.createContainerProcess(false, cid, info)
	if err != nil {
		return err
	}

	if seccheck.Global.Enabled(seccheck.PointContainerStart) {
		evt := pb.Start{
			Id:       cid,
			Cwd:      spec.Process.Cwd,
			Args:     spec.Process.Args,
			Terminal: spec.Process.Terminal,
		}
		fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart)
		if fields.Local.Contains(seccheck.FieldContainerStartEnv) {
			evt.Env = spec.Process.Env
		}
		if !fields.Context.Empty() {
			evt.ContextData = &pb.ContextData{}
			kernel.LoadSeccheckData(ep.tg.Leader(), fields.Context, evt.ContextData)
		}
		_ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
			return c.ContainerStart(context.Background(), fields, &evt)
		})
	}

	l.k.StartProcess(ep.tg)
	return nil
}

func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileDescription, error) {
	// Create the FD map, which will set stdin, stdout, and stderr.
	ctx := info.procArgs.NewContext(l.k)
	fdTable, ttyFile, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.passFDs, info.spec.Process.User)
	if err != nil {
		return nil, nil, fmt.Errorf("importing fds: %w", err)
	}
	// CreateProcess takes a reference on fdTable if successful. We won't need
	// ours either way.
	info.procArgs.FDTable = fdTable

	if info.execFD != nil {
		if info.procArgs.Filename != "" {
			return nil, nil, fmt.Errorf("process must be started from either a file or a filename, not both")
		}
		file, err := host.NewFD(ctx, l.k.HostMount(), info.execFD.FD(), &host.NewFDOptions{
			Readonly:     true,
			Savable:      true,
			VirtualOwner: true,
			UID:          auth.KUID(info.spec.Process.User.UID),
			GID:          auth.KGID(info.spec.Process.User.GID),
		})
		if err != nil {
			return nil, nil, err
		}
		defer file.DecRef(ctx)
		info.execFD.Release()

		info.procArgs.File = file
	}

	// Gofer FDs must be ordered and the first FD is always the rootfs.
	if len(info.goferFDs) < 1 {
		return nil, nil, fmt.Errorf("rootfs gofer FD not found")
	}
	l.startGoferMonitor(cid, int32(info.goferFDs[0].FD()))

	if root {
		if err := l.processHints(info.conf, info.procArgs.Credentials); err != nil {
			return nil, nil, err
		}
	}
	mntr := newContainerMounter(info, l.k, l.mountHints, l.sharedMounts, l.productName, l.sandboxID)
	if err := setupContainerVFS(ctx, info, mntr, &info.procArgs); err != nil {
		return nil, nil, err
	}

	// Add the HOME environment variable if it is not already set.
	info.procArgs.Envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace,
		info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
	if err != nil {
		return nil, nil, err
	}

	// Create and start the new process.
	tg, _, err := l.k.CreateProcess(info.procArgs)
	if err != nil {
		return nil, nil, fmt.Errorf("creating process: %w", err)
	}
	// CreateProcess takes a reference on FDTable if successful.
	info.procArgs.FDTable.DecRef(ctx)

	// Set the foreground process group on the TTY to the global init process
	// group, since that is what we are about to start running.
	if ttyFile != nil {
		ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
	}

	// Install seccomp filters with the new task if there are any.
	if info.conf.OCISeccomp {
		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
			program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
			if err != nil {
				return nil, nil, fmt.Errorf("building seccomp program: %w", err)
			}

			if log.IsLogging(log.Debug) {
				out, _ := bpf.DecodeProgram(program)
				log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
			}

			task := tg.Leader()
			// NOTE: It seems Flags are ignored by runc so we ignore them too.
			if err := task.AppendSyscallFilter(program, true); err != nil {
				return nil, nil, fmt.Errorf("appending seccomp filters: %w", err)
			}
		}
	} else {
		if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
			log.Warningf("Seccomp spec is being ignored")
		}
	}

	return tg, ttyFile, nil
}

// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
// the gofer FD looking for disconnects, and kills the container processes if
// the rootfs FD disconnects.
//
// Note that other gofer mounts are allowed to be unmounted and disconnected.
func (l *Loader) startGoferMonitor(cid string, rootfsGoferFD int32) {
	if rootfsGoferFD < 0 {
		panic(fmt.Sprintf("invalid FD: %d", rootfsGoferFD))
	}
	go func() {
		log.Debugf("Monitoring gofer health for container %q", cid)
		events := []unix.PollFd{
			{
				Fd:     rootfsGoferFD,
				Events: unix.POLLHUP | unix.POLLRDHUP,
			},
		}
		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
			// Use ppoll instead of poll because it's already allowed in seccomp.
			n, err := unix.Ppoll(events, nil, nil)
			return uintptr(n), 0, err
		})
		if err != nil {
			panic(fmt.Sprintf("Error monitoring gofer FDs: %s", err))
		}

		l.mu.Lock()
		defer l.mu.Unlock()

		// The gofer could have been stopped due to a normal container shutdown.
		// Check if the container has not stopped yet.
		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
			log.Infof("Gofer socket disconnected, killing container %q", cid)
			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
				log.Warningf("Error killing container %q after gofer stopped: %s", cid, err)
			}
		}
	}()
}

// destroySubcontainer stops a container if it is still running and cleans up
// its filesystem.
func (l *Loader) destroySubcontainer(cid string) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
	if err != nil {
		// Container doesn't exist.
		return err
	}

	// The container exists, but has it been started?
	if tg != nil {
		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
			return fmt.Errorf("sending SIGKILL to all container processes: %w", err)
		}
		// Wait for all processes that belong to the container to exit (including
		// exec'd processes).
		for _, t := range l.k.TaskSet().Root.Tasks() {
			if t.ContainerID() == cid {
				t.ThreadGroup().WaitExited()
			}
		}
	}

	// No more failure from this point on. Remove all container thread groups
	// from the map.
	for key := range l.processes {
		if key.cid == cid {
			delete(l.processes, key)
		}
	}

	log.Debugf("Container destroyed, cid: %s", cid)
	return nil
}

func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
	// Hold the lock for the entire operation to ensure that exec'd process is
	// added to 'processes' in case it races with destroyContainer().
	l.mu.Lock()
	defer l.mu.Unlock()

	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
	if err != nil {
		return 0, err
	}
	if tg == nil {
		return 0, fmt.Errorf("container %q not started", args.ContainerID)
	}

	// Get the container MountNamespace from the Task. Trying to acquire a ref
	// may fail if it races with task exit.
	// task.MountNamespace() does not take a ref, so we must do so ourselves.
	args.MountNamespace = tg.Leader().MountNamespace()
	if args.MountNamespace == nil || !args.MountNamespace.TryIncRef() {
		return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
	}

	args.Envv, err = specutils.ResolveEnvs(args.Envv)
	if err != nil {
		return 0, fmt.Errorf("resolving env: %w", err)
	}

	// Add the HOME environment variable if it is not already set.
	sctx := l.k.SupervisorContext()
	root := args.MountNamespace.Root(sctx)
	defer root.DecRef(sctx)
	ctx := vfs.WithRoot(sctx, root)
	defer args.MountNamespace.DecRef(ctx)
	args.Envv, err = user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
	if err != nil {
		return 0, err
	}
	args.PIDNamespace = tg.PIDNamespace()

	args.Limits, err = createLimitSet(l.root.spec)
	if err != nil {
		return 0, fmt.Errorf("creating limits: %w", err)
	}

	// Start the process.
	proc := control.Proc{Kernel: l.k}
	newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args)
	if err != nil {
		return 0, err
	}

	eid := execID{cid: args.ContainerID, pid: tgid}
	l.processes[eid] = &execProcess{
		tg:  newTG,
		tty: ttyFile,
	}
	log.Debugf("updated processes: %v", l.processes)

	return tgid, nil
}

// waitContainer waits for the init process of a container to exit.
func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
	// Don't defer unlock, as doing so would make it impossible for
	// multiple clients to wait on the same container.
	tg, err := l.threadGroupFromID(execID{cid: cid})
	if err != nil {
		return fmt.Errorf("can't wait for container %q: %w", cid, err)
	}

	// If the thread either has already exited or exits during waiting,
	// consider the container exited.
	ws := l.wait(tg)
	*waitStatus = ws

	// Check for leaks and write coverage report after the root container has
	// exited. This guarantees that the report is written in cases where the
	// sandbox is killed by a signal after the ContMgrWait request is completed.
	if l.root.procArgs.ContainerID == cid {
		// All sentry-created resources should have been released at this point.
		refs.DoLeakCheck()
		_ = coverage.Report()
	}
	return nil
}

func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
	if tgid <= 0 {
		return fmt.Errorf("PID (%d) must be positive", tgid)
	}

	// Try to find a process that was exec'd.
	eid := execID{cid: cid, pid: tgid}
	execTG, err := l.threadGroupFromID(eid)
	if err == nil {
		ws := l.wait(execTG)
		*waitStatus = ws

		l.mu.Lock()
		delete(l.processes, eid)
		log.Debugf("updated processes (removal): %v", l.processes)
		l.mu.Unlock()
		return nil
	}

	// The caller may be waiting on a process not started directly via exec.
	// In this case, find the process in the container's PID namespace.
	initTG, err := l.threadGroupFromID(execID{cid: cid})
	if err != nil {
		return fmt.Errorf("waiting for PID %d: %w", tgid, err)
	}
	tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
	if tg == nil {
		return fmt.Errorf("waiting for PID %d: no such process", tgid)
	}
	if tg.Leader().ContainerID() != cid {
		return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
	}
	ws := l.wait(tg)
	*waitStatus = ws
	return nil
}
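
// Note that waits follow the same two-level lookup used for signals: waitPID
// first looks for a process that was exec'd (keyed as execID{cid, pid}) and
// removes it from l.processes once reaped; a process spawned by the container
// itself is instead found through the container's PID namespace and is only
// waited on, not removed, since it was never in the map.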

// wait waits for the given thread group to exit and returns its exit status.
func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
	tg.WaitExited()
	return uint32(tg.ExitStatus())
}

// WaitForStartSignal waits for a start signal from the control server.
func (l *Loader) WaitForStartSignal() {
	<-l.ctrl.manager.startChan
}

// WaitExit waits for the root container to exit, and returns its exit status.
func (l *Loader) WaitExit() linux.WaitStatus {
	// Wait for container.
	l.k.WaitExited()

	// Check all references.
	refs.OnExit()

	return l.k.GlobalInit().ExitStatus()
}

func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID, userns *auth.UserNamespace) (*inet.Namespace, error) {
	// Create an empty network stack because the network namespace may be empty at
	// this point. Netns is configured before Run() is called. Netstack is
	// configured using a control uRPC message. Host network is configured inside
	// Run().
	switch conf.Network {
	case config.NetworkHost:
		// If configured for raw socket support with host network
		// stack, make sure that we have CAP_NET_RAW on the host,
		// otherwise we can't make raw sockets.
		if conf.EnableRaw && !specutils.HasCapabilities(capability.CAP_NET_RAW) {
			return nil, fmt.Errorf("configuring network=host with raw sockets requires CAP_NET_RAW capability")
		}
		// No network namespacing support for hostinet yet, hence creator is nil.
		return inet.NewRootNamespace(hostinet.NewStack(), nil, userns), nil

	case config.NetworkNone, config.NetworkSandbox:
		s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite)
		if err != nil {
			return nil, err
		}
		creator := &sandboxNetstackCreator{
			clock:                    clock,
			uniqueID:                 uniqueID,
			allowPacketEndpointWrite: conf.AllowPacketEndpointWrite,
		}
		return inet.NewRootNamespace(s, creator, userns), nil

	default:
		panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
	}
}
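
// To summarize the switch above: "host" networking uses hostinet and cannot
// be namespaced yet, while both "sandbox" and "none" get an empty netstack
// instance whose configuration (if any) arrives later over the controller.
// New network namespaces get their stacks from sandboxNetstackCreator below,
// which wires up a loopback NIC.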

func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) {
	netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
	transProtos := []stack.TransportProtocolFactory{
		tcp.NewProtocol,
		udp.NewProtocol,
		icmp.NewProtocol4,
		icmp.NewProtocol6,
	}
	s := netstack.Stack{Stack: stack.New(stack.Options{
		NetworkProtocols:   netProtos,
		TransportProtocols: transProtos,
		Clock:              clock,
		Stats:              netstack.Metrics,
		HandleLocal:        true,
		// Enable raw sockets for users with sufficient
		// privileges.
		RawFactory:               raw.EndpointFactory{},
		AllowPacketEndpointWrite: allowPacketEndpointWrite,
		UniqueID:                 uniqueID,
		DefaultIPTables:          netfilter.DefaultLinuxTables,
	})}

	// Enable SACK Recovery.
	{
		opt := tcpip.TCPSACKEnabled(true)
		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
		}
	}

	// Set default TTLs as required by socket/netstack.
	{
		opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
		if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
		}
		if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
		}
	}

	// Enable Receive Buffer Auto-Tuning.
	{
		opt := tcpip.TCPModerateReceiveBufferOption(true)
		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
			return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
		}
	}

	return &s, nil
}

// sandboxNetstackCreator implements kernel.NetworkStackCreator.
//
// +stateify savable
type sandboxNetstackCreator struct {
	clock                    tcpip.Clock
	uniqueID                 stack.UniqueID
	allowPacketEndpointWrite bool
}

// CreateStack implements kernel.NetworkStackCreator.CreateStack.
func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
	s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite)
	if err != nil {
		return nil, err
	}

	// Set up loopback.
	n := &Network{Stack: s.(*netstack.Stack).Stack}
	nicID := tcpip.NICID(f.uniqueID.UniqueID())
	link := DefaultLoopbackLink
	linkEP := packetsocket.New(ethernet.New(loopback.New()))
	opts := stack.NICOptions{Name: link.Name}

	if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil {
		return nil, err
	}

	return s, nil
}

// signal sends a signal to one or more processes in a container. If PID is 0,
// then the container init process is used. Depending on the SignalDeliveryMode
// option, the signal may be sent directly to the indicated process, to all
// processes in the container, or to the foreground process group. pid is
// relative to the root PID namespace, not the container's.
func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
	if pid < 0 {
		return fmt.Errorf("PID (%d) must be positive", pid)
	}

	switch mode {
	case DeliverToProcess:
		if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
			return fmt.Errorf("signaling process in container %q PID %d: %w", cid, pid, err)
		}
		return nil

	case DeliverToForegroundProcessGroup:
		if err := l.signalForegroundProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
			return fmt.Errorf("signaling foreground process group in container %q PID %d: %w", cid, pid, err)
		}
		return nil

	case DeliverToAllProcesses:
		if pid != 0 {
			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
		}
		// Check that the container has actually started before signaling it.
		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
			return err
		}
		if err := l.signalAllProcesses(cid, signo); err != nil {
			return fmt.Errorf("signaling all processes in container %q: %w", cid, err)
		}
		return nil

	default:
		panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
	}
}
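
// For example (hypothetical values): signal("c1", 0, int32(linux.SIGTERM),
// DeliverToProcess) sends SIGTERM to the init process of container "c1",
// while signal("c1", 0, signo, DeliverToAllProcesses) fans the signal out to
// every process in the container via signalAllProcesses.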

// signalProcess sends a signal to a process in the given container. tgid is
// relative to the root PID namespace, not the container's.
func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
	if err == nil {
		// Send signal directly to the identified process.
		return l.k.SendExternalSignalThreadGroup(execTG, &linux.SignalInfo{Signo: signo})
	}

	// The caller may be signaling a process not started directly via exec.
	// In this case, find the process and check that the process belongs to the
	// container in question.
	tg := l.k.RootPIDNamespace().ThreadGroupWithID(tgid)
	if tg == nil {
		return fmt.Errorf("no such process with PID %d", tgid)
	}
	if tg.Leader().ContainerID() != cid {
		return fmt.Errorf("process %d belongs to a different container: %q", tgid, tg.Leader().ContainerID())
	}
	return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
}

// signalForegroundProcessGroup looks up the foreground process group from the
// TTY for the given "tgid" inside container "cid", and sends the signal to it.
func (l *Loader) signalForegroundProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
	l.mu.Lock()
	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
	if err != nil {
		l.mu.Unlock()
		return fmt.Errorf("no thread group found: %w", err)
	}
	if tg == nil {
		l.mu.Unlock()
		return fmt.Errorf("container %q not started", cid)
	}

	tty, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
	l.mu.Unlock()
	if err != nil {
		return fmt.Errorf("no thread group found: %w", err)
	}
	if tty == nil {
		return fmt.Errorf("no TTY attached")
	}
	pg := tty.ForegroundProcessGroup()
	si := &linux.SignalInfo{Signo: signo}
	if pg == nil {
		// No foreground process group has been set. Signal the
		// original thread group.
		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
		return l.k.SendExternalSignalThreadGroup(tg, si)
	}
	// Send the signal to all processes in the process group.
	return l.k.SendExternalSignalProcessGroup(pg, si)
}

// signalAllProcesses signals all processes that belong to the specified
// container. It's a noop if the container hasn't started or has exited.
func (l *Loader) signalAllProcesses(cid string, signo int32) error {
	// Pause the kernel to prevent new processes from being created while
	// the signal is delivered. This prevents process leaks when SIGKILL is
	// sent to the entire container.
	l.k.Pause()
	defer l.k.Unpause()
	return l.k.SendContainerSignal(cid, &linux.SignalInfo{Signo: signo})
}

// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
// acquires the mutex before calling it and fails in case the container hasn't
// started yet.
func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
	l.mu.Lock()
	defer l.mu.Unlock()
	tg, err := l.tryThreadGroupFromIDLocked(key)
	if err != nil {
		return nil, err
	}
	if tg == nil {
		return nil, fmt.Errorf("container %q not started", key.cid)
	}
	return tg, nil
}

// tryThreadGroupFromIDLocked returns the thread group for the given execution
// ID. It may return nil in case the container has not started yet. Returns an
// error if the execution ID is invalid or if the container cannot be found
// (maybe it has been deleted). Caller must hold 'mu'.
func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
	ep := l.processes[key]
	if ep == nil {
		return nil, fmt.Errorf("container %q not found", key.cid)
	}
	return ep.tg, nil
}

// ttyFromIDLocked returns the TTY files for the given execution ID. It may
// return nil in case the container has not started yet. Returns an error if
// the execution ID is invalid or if the container cannot be found (maybe it
// has been deleted). Caller must hold 'mu'.
func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileDescription, error) {
	ep := l.processes[key]
	if ep == nil {
		return nil, fmt.Errorf("container %q not found", key.cid)
	}
	return ep.tty, nil
}

func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, passFDs []fdMapping, user specs.User) (*kernel.FDTable, *host.TTYFileDescription, error) {
	if len(stdioFDs) != 3 {
		return nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
	}
	fdMap := map[int]*fd.FD{
		0: stdioFDs[0],
		1: stdioFDs[1],
		2: stdioFDs[2],
	}

	// Create the entries for the host files that were passed to our app.
	for _, customFD := range passFDs {
		if customFD.guest < 0 {
			return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater")
		}
		fdMap[customFD.guest] = customFD.host
	}

	k := kernel.KernelFromContext(ctx)
	fdTable := k.NewFDTable()
	ttyFile, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), fdMap)
	if err != nil {
		fdTable.DecRef(ctx)
		return nil, nil, err
	}
	return fdTable, ttyFile, nil
}
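
// For example (hypothetical FD numbers): three stdio FDs plus a passFDs entry
// {guest: 5, host: h} yield an fdMap with guest FDs 0, 1, 2, and 5, which
// fdimport.Import then installs into the new FD table (returning the TTY file
// when console is true).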

// portForward implements initiating a port forwarding connection in the
// sandbox. portForwardProxies represent two connections, each copying to the
// other (read ends to write ends) in goroutines. The proxies are stored and
// can be cleaned up, or they clean up after themselves if the connection is
// broken.
func (l *Loader) portForward(opts *PortForwardOpts) error {
	// Validate that we have a stream FD to write to. If we do not, it means
	// there is a misbehaved urpc client or a bug has occurred.
	if len(opts.Files) != 1 {
		return fmt.Errorf("stream FD is required for port forward")
	}

	l.mu.Lock()
	defer l.mu.Unlock()

	cid := opts.ContainerID
	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
	if err != nil {
		return fmt.Errorf("failed to get threadgroup from %q: %w", cid, err)
	}
	if tg == nil {
		return fmt.Errorf("container %q not started", cid)
	}

	// Import the fd for the UDS.
	ctx := l.k.SupervisorContext()
	fd, err := l.importFD(ctx, opts.Files[0])
	if err != nil {
		return fmt.Errorf("importing stream fd: %w", err)
	}
	cu := cleanup.Make(func() { fd.DecRef(ctx) })
	defer cu.Clean()

	fdConn := pf.NewFileDescriptionConn(fd)

	// Create a proxy to forward data between the fdConn and the sandboxed
	// application.
	pair := pf.ProxyPair{To: fdConn}

	switch l.root.conf.Network {
	case config.NetworkSandbox:
		stack := l.k.RootNetworkNamespace().Stack().(*netstack.Stack).Stack
		nsConn, err := pf.NewNetstackConn(stack, opts.Port)
		if err != nil {
			return fmt.Errorf("creating netstack port forward connection: %w", err)
		}
		pair.From = nsConn
	case config.NetworkHost:
		hConn, err := pf.NewHostInetConn(opts.Port)
		if err != nil {
			return fmt.Errorf("creating hostinet port forward connection: %w", err)
		}
		pair.From = hConn
	default:
		return fmt.Errorf("unsupported network type %q for container %q", l.root.conf.Network, cid)
	}
	cu.Release()
	proxy := pf.NewProxy(pair, opts.ContainerID)

	// Add to the list of port forward connections and remove when the
	// connection closes.
	l.portForwardProxies = append(l.portForwardProxies, proxy)
	proxy.AddCleanup(func() {
		l.mu.Lock()
		defer l.mu.Unlock()
		for i := range l.portForwardProxies {
			if l.portForwardProxies[i] == proxy {
				l.portForwardProxies = append(l.portForwardProxies[:i], l.portForwardProxies[i+1:]...)
				break
			}
		}
	})

	// Start forwarding on the connection.
	proxy.Start(ctx)
	return nil
}

// importFD generically imports a host file descriptor without adding it to any
// fd table.
func (l *Loader) importFD(ctx context.Context, f *os.File) (*vfs.FileDescription, error) {
	hostFD, err := fd.NewFromFile(f)
	if err != nil {
		return nil, err
	}
	defer hostFD.Close()
	fd, err := host.NewFD(ctx, l.k.HostMount(), hostFD.FD(), &host.NewFDOptions{
		Savable:      false, // We disconnect and close on save.
		IsTTY:        false,
		VirtualOwner: false, // FD not visible to the sandboxed app so user can't be changed.
	})
	if err != nil {
		return nil, err
	}
	hostFD.Release()
	return fd, nil
}

func (l *Loader) containerCount() int {
	l.mu.Lock()
	defer l.mu.Unlock()

	containers := 0
	for id := range l.processes {
		if id.pid == 0 {
			// pid==0 represents the init process of a container. There is
			// only one such process per container.
			containers++
		}
	}
	return containers
}

func (l *Loader) pidsCount(cid string) (int, error) {
	l.mu.Lock()
	defer l.mu.Unlock()

	if _, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}); err != nil {
		// Container doesn't exist.
		return 0, err
	}
	return l.k.TaskSet().Root.NumTasksPerContainer(cid), nil
}