// Page header of the code viewer this listing was copied from:
// github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/runsc/boot/loader.go (about)

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package boot loads the kernel and runs a container.
package boot

import (
	"errors"
	"fmt"
	mrand "math/rand"
	"os"
	"runtime"
	gtime "time"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/syndtr/gocapability/capability"
	"golang.org/x/sys/unix"
	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/bpf"
	"github.com/nicocha30/gvisor-ligolo/pkg/cleanup"
	"github.com/nicocha30/gvisor-ligolo/pkg/context"
	"github.com/nicocha30/gvisor-ligolo/pkg/coverage"
	"github.com/nicocha30/gvisor-ligolo/pkg/cpuid"
	"github.com/nicocha30/gvisor-ligolo/pkg/fd"
	"github.com/nicocha30/gvisor-ligolo/pkg/log"
	"github.com/nicocha30/gvisor-ligolo/pkg/memutil"
	"github.com/nicocha30/gvisor-ligolo/pkg/rand"
	"github.com/nicocha30/gvisor-ligolo/pkg/refs"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/control"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fdimport"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/host"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/tmpfs"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/user"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/inet"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/loader"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck"
	pb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck/points/points_go_proto"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netfilter"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/time"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/watchdog"
	"github.com/nicocha30/gvisor-ligolo/pkg/sighandling"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/link/ethernet"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/link/loopback"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/link/packetsocket"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/link/sniffer"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/network/arp"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/network/ipv4"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/network/ipv6"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport/icmp"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport/raw"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport/tcp"
	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport/udp"
	"github.com/nicocha30/gvisor-ligolo/runsc/boot/filter"
	_ "github.com/nicocha30/gvisor-ligolo/runsc/boot/platforms" // register all platforms.
	pf "github.com/nicocha30/gvisor-ligolo/runsc/boot/portforward"
	"github.com/nicocha30/gvisor-ligolo/runsc/boot/pprof"
	"github.com/nicocha30/gvisor-ligolo/runsc/config"
	"github.com/nicocha30/gvisor-ligolo/runsc/profile"
	"github.com/nicocha30/gvisor-ligolo/runsc/specutils"
	"github.com/nicocha30/gvisor-ligolo/runsc/specutils/seccomp"

	// Top-level inet providers.
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/hostinet"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netstack"

	// Include other supported socket providers.
	_ "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netlink"
	_ "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netlink/route"
	_ "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netlink/uevent"
	_ "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix"
)

// containerInfo bundles the configuration, spec, process arguments and host
// file descriptors needed to create one container's init process. The Loader
// keeps one for the root container; one is also built for each sub-container
// when it is started.
type containerInfo struct {
	// conf is the runsc configuration that applies to this container.
	conf *config.Config

	// spec is the base configuration for the root container. For a
	// sub-container it holds that container's own spec.
	spec *specs.Spec

	// procArgs refers to the container's init task.
	procArgs kernel.CreateProcessArgs

	// stdioFDs contains stdin, stdout, and stderr.
	stdioFDs []*fd.FD

	// passFDs are mappings of user-supplied host to guest file descriptors.
	passFDs []fdMapping

	// execFD is the host file descriptor used for program execution.
	execFD *fd.FD

	// goferFDs are the FDs that attach the sandbox to the gofers.
	goferFDs []*fd.FD

	// overlayFilestoreFDs are the FDs to the regular files that will back the
	// tmpfs upper mount in the overlay mounts.
	overlayFilestoreFDs []*fd.FD

	// overlayMediums contains information about how the gofer mounts have been
	// overlaid. The first entry is for rootfs and the following entries are for
	// bind mounts in spec.Mounts (in the same order).
	overlayMediums []OverlayMedium

	// nvidiaUVMDevMajor is the device major number used for nvidia-uvm.
	nvidiaUVMDevMajor uint32
}

// Loader keeps state needed to start the kernel and run the container.
type Loader struct {
	// k is the kernel.
	k *kernel.Kernel

	// ctrl is the control server.
	ctrl *controller

	// root contains information about the root container in the sandbox.
	root containerInfo

	// watchdog is created in New, started when the root container is run and
	// stopped in Destroy.
	watchdog *watchdog.Watchdog

	// stopSignalForwarding disables forwarding of signals to the sandboxed
	// container. It should be called when a sandbox is destroyed.
	stopSignalForwarding func()

	// stopProfiling stops profiling started at container creation. It
	// should be called when a sandbox is destroyed.
	stopProfiling func()

	// PreSeccompCallback is called right before installing seccomp filters.
	PreSeccompCallback func()

	// restore is set to true if we are restoring a container.
	restore bool

	// sandboxID is the ID for the whole sandbox.
	sandboxID string

	// mountHints provides extra information about mounts for containers that
	// apply to the entire pod.
	mountHints *PodMountHints

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// nvidiaUVMDevMajor is the device major number used for nvidia-uvm.
	nvidiaUVMDevMajor uint32

	// mu guards processes and portForwardProxies.
	mu sync.Mutex

	// processes maps containers init process and invocation of exec. Root
	// processes are keyed with container ID and pid=0, while exec invocations
	// have the corresponding pid set.
	//
	// processes is guarded by mu.
	processes map[execID]*execProcess

	// portForwardProxies is a list of active port forwarding connections.
	//
	// portForwardProxies is guarded by mu.
	portForwardProxies []*pf.Proxy
}

// execID uniquely identifies a sentry process that is executed in a container.
type execID struct {
	// cid is the ID of the container the process belongs to.
	cid string
	// pid is left at zero for a container's init process (New and
	// createSubcontainer build keys with only cid set).
	pid kernel.ThreadID
}

// execProcess contains the thread group and host TTY of a sentry process.
type execProcess struct {
	// tg will be nil for containers that haven't started yet.
	tg *kernel.ThreadGroup

	// tty will be nil if the process is not attached to a terminal.
	tty *host.TTYFileDescription

	// pidnsPath is the pid namespace path in spec. It is recorded when the
	// container is started, so that later containers naming the same path
	// can join the same PID namespace.
	pidnsPath string

	// hostTTY is present when creating a sub-container with terminal enabled.
	// TTY file is passed during container create and must be saved until
	// container start.
	hostTTY *fd.FD
}

// fdMapping maps guest to host file descriptors. Guest file descriptors are
// exposed to the application inside the sandbox through the FD table.
type fdMapping struct {
	guest int
	host  *fd.FD
}

// FDMapping is a helper type to represent a mapping from guest to host file
// descriptors. In contrast to the unexported fdMapping type, it does not imply
// file ownership.
type FDMapping struct {
	Guest int
	Host  int
}

func init() {
	// Initialize the random number generator.
	mrand.Seed(gtime.Now().UnixNano())
}

// Args are the arguments for New().
type Args struct {
	// ID is the sandbox ID.
	ID string
	// Spec is the sandbox specification.
	Spec *specs.Spec
	// Conf is the system configuration.
	Conf *config.Config
	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
	// of this FD and may close it at any time.
	ControllerFD int
	// Device is an optional argument that is passed to the platform. The Loader
	// takes ownership of this file and may close it at any time.
	Device *os.File
	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
	// takes ownership of these FDs and may close them at any time.
	GoferFDs []int
	// StdioFDs is the stdio for the application. The Loader takes ownership of
	// these FDs and may close them at any time.
	StdioFDs []int
	// PassFDs are user-supplied FD mappings from host to guest descriptors.
	// The Loader takes ownership of these FDs and may close them at any time.
	PassFDs []FDMapping
	// ExecFD is the host file descriptor used for program execution. A
	// negative value means that no exec FD was provided.
	ExecFD int
	// OverlayFilestoreFDs are the FDs to the regular files that will back the
	// tmpfs upper mount in the overlay mounts.
	OverlayFilestoreFDs []int
	// OverlayMediums contains information about how the gofer mounts have been
	// overlaid. The first entry is for rootfs and the following entries are for
	// bind mounts in Spec.Mounts (in the same order).
	OverlayMediums []OverlayMedium
	// NumCPU is the number of CPUs to create inside the sandbox. Zero means
	// use the number of CPUs reported by the Go runtime.
	NumCPU int
	// TotalMem is the initial amount of total memory to report back to the
	// container. Zero leaves the reported total unchanged.
	TotalMem uint64
	// TotalHostMem is the total memory reported by host /proc/meminfo.
	TotalHostMem uint64
	// UserLogFD is the file descriptor to write user logs to.
	UserLogFD int
	// ProductName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	ProductName string
	// PodInitConfigFD is the file descriptor to a file passed in the
	// --pod-init-config flag. A negative value means that the flag was not
	// set.
	PodInitConfigFD int
	// SinkFDs is an ordered array of file descriptors to be used by seccheck
	// sinks configured from the --pod-init-config file.
	SinkFDs []int
	// ProfileOpts contains the set of profiles to enable and the
	// corresponding FDs where profile data will be written.
	ProfileOpts profile.Opts
}

// startingStdioFD is the first host FD number that stdio FDs are remapped
// to; it makes sure stdioFDs are always the same on initial start and on
// restore.
const startingStdioFD = 256

// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
//
// The steps below are order dependent: the platform, memory file, VDSO and
// timekeeper are created before the kernel is initialized, filesystems are
// registered only after that, and the control server is created last and only
// starts serving once the Loader and the controller reference each other.
//
// On failure New returns a nil Loader and an error. No cleanup is performed on
// the error paths in this function: profiling that was started and file
// descriptors that were already wrapped are not released here.
func New(args Args) (*Loader, error) {
	stopProfiling := profile.Start(args.ProfileOpts)

	// Initialize seccheck points.
	seccheck.Initialize()

	// We initialize the rand package now to make sure /dev/urandom is pre-opened
	// on kernels that do not support getrandom(2).
	if err := rand.Init(); err != nil {
		return nil, fmt.Errorf("setting up rand: %w", err)
	}

	if err := usage.Init(); err != nil {
		return nil, fmt.Errorf("setting up memory usage: %w", err)
	}

	kernel.IOUringEnabled = args.Conf.IOUring

	info := containerInfo{
		conf:           args.Conf,
		spec:           args.Spec,
		overlayMediums: args.OverlayMediums,
	}

	// Make host FDs stable between invocations. Host FDs must map to the exact
	// same number when the sandbox is restored. Otherwise the wrong FD will be
	// used.
	newfd := startingStdioFD

	for _, stdioFD := range args.StdioFDs {
		// Check that newfd is unused to avoid clobbering over it. Only EBADF
		// proves that the number is free: a nil error means the FD is open,
		// and any other error means the check itself failed.
		if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
			if err != nil {
				return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err)
			}
			return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
		}

		err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
		if err != nil {
			return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
		}
		info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
		// The original number is no longer needed once duplicated; a close
		// failure is deliberately ignored.
		_ = unix.Close(stdioFD)
		newfd++
	}
	for _, goferFD := range args.GoferFDs {
		info.goferFDs = append(info.goferFDs, fd.New(goferFD))
	}
	for _, overlayFD := range args.OverlayFilestoreFDs {
		info.overlayFilestoreFDs = append(info.overlayFilestoreFDs, fd.New(overlayFD))
	}

	// A negative ExecFD means that none was passed.
	if args.ExecFD >= 0 {
		info.execFD = fd.New(args.ExecFD)
	}

	for _, customFD := range args.PassFDs {
		info.passFDs = append(info.passFDs, fdMapping{
			host:  fd.New(customFD.Host),
			guest: customFD.Guest,
		})
	}

	// Create kernel and platform.
	p, err := createPlatform(args.Conf, args.Device)
	if err != nil {
		return nil, fmt.Errorf("creating platform: %w", err)
	}
	if args.Conf.NVProxy && p.OwnsPageTables() {
		return nil, fmt.Errorf("--nvproxy is incompatible with platform %s: owns page tables", args.Conf.Platform)
	}
	k := &kernel.Kernel{
		Platform: p,
	}

	// Create memory file.
	mf, err := createMemoryFile()
	if err != nil {
		return nil, fmt.Errorf("creating memory file: %w", err)
	}
	k.SetMemoryFile(mf)

	// Create VDSO.
	//
	// Pass k as the platform since it is savable, unlike the actual platform.
	vdso, err := loader.PrepareVDSO(k)
	if err != nil {
		return nil, fmt.Errorf("creating vdso: %w", err)
	}

	// Create timekeeper.
	tk := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
	tk.SetClocks(time.NewCalibratedClocks())

	if err := enableStrace(args.Conf); err != nil {
		return nil, fmt.Errorf("enabling strace: %w", err)
	}

	// Create capabilities.
	caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
	if err != nil {
		return nil, fmt.Errorf("converting capabilities: %w", err)
	}

	// Convert the spec's additional GIDs to KGIDs.
	extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
	for _, GID := range args.Spec.Process.User.AdditionalGids {
		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
	}

	// Create credentials.
	creds := auth.NewUserCredentials(
		auth.KUID(args.Spec.Process.User.UID),
		auth.KGID(args.Spec.Process.User.GID),
		extraKGIDs,
		caps,
		auth.NewRootUserNamespace())

	// Create root network namespace/stack.
	netns, err := newRootNetworkNamespace(args.Conf, tk, k, creds.UserNamespace)
	if err != nil {
		return nil, fmt.Errorf("creating network: %w", err)
	}

	// NumCPU == 0 means "not specified": fall back to the CPU count reported
	// by the Go runtime.
	if args.NumCPU == 0 {
		args.NumCPU = runtime.NumCPU()
	}
	log.Infof("CPUs: %d", args.NumCPU)
	runtime.GOMAXPROCS(args.NumCPU)

	if args.TotalHostMem > 0 {
		// As per tmpfs(5), the default size limit is 50% of total physical RAM.
		// See mm/shmem.c:shmem_default_max_blocks().
		tmpfs.SetDefaultSizeLimit(args.TotalHostMem / 2)
	}

	if args.TotalMem > 0 {
		// Adjust the total memory returned by the Sentry so that applications that
		// use /proc/meminfo can make allocations based on this limit.
		usage.MinimumTotalMemoryBytes = args.TotalMem
		usage.MaximumTotalMemoryBytes = args.TotalMem
		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
	}

	// Initiate the Kernel object, which is required by the Context passed
	// to createVFS in order to mount (among other things) procfs.
	if err = k.Init(kernel.InitKernelArgs{
		FeatureSet:                  cpuid.HostFeatureSet().Fixed(),
		Timekeeper:                  tk,
		RootUserNamespace:           creds.UserNamespace,
		RootNetworkNamespace:        netns,
		ApplicationCores:            uint(args.NumCPU),
		Vdso:                        vdso,
		RootUTSNamespace:            kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
	}); err != nil {
		return nil, fmt.Errorf("initializing kernel: %w", err)
	}

	// info is passed by pointer; NOTE(review): presumably so that
	// registerFilesystems can fill in fields such as nvidiaUVMDevMajor, which
	// is read from info further down - confirm against its definition.
	if err := registerFilesystems(k, &info); err != nil {
		return nil, fmt.Errorf("registering filesystems: %w", err)
	}

	// Turn on packet logging if enabled.
	if args.Conf.LogPackets {
		log.Infof("Packet logging enabled")
		sniffer.LogPackets.Store(1)
	} else {
		log.Infof("Packet logging disabled")
		sniffer.LogPackets.Store(0)
	}

	// Create a watchdog.
	dogOpts := watchdog.DefaultOpts
	dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
	dog := watchdog.New(k, dogOpts)

	procArgs, err := createProcessArgs(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
	if err != nil {
		return nil, fmt.Errorf("creating init process for root container: %w", err)
	}
	info.procArgs = procArgs

	if err := initCompatLogs(args.UserLogFD); err != nil {
		return nil, fmt.Errorf("initializing compat logs: %w", err)
	}

	mountHints, err := NewPodMountHints(args.Spec)
	if err != nil {
		return nil, fmt.Errorf("creating pod mount hints: %w", err)
	}

	// Set up host mount that will be used for imported fds.
	hostFilesystem, err := host.NewFilesystem(k.VFS())
	if err != nil {
		return nil, fmt.Errorf("failed to create hostfs filesystem: %w", err)
	}
	// Our reference is dropped when New returns. NOTE(review): presumably the
	// disconnected mount created below keeps the filesystem alive - confirm.
	defer hostFilesystem.DecRef(k.SupervisorContext())
	k.SetHostMount(k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}))

	// A seccheck configuration failure is only logged; it does not abort the
	// sandbox start.
	if args.PodInitConfigFD >= 0 {
		if err := setupSeccheck(args.PodInitConfigFD, args.SinkFDs); err != nil {
			log.Warningf("unable to configure event session: %v", err)
		}
	}

	// The root container is registered up front with an empty execProcess; its
	// thread group is filled in when the container is run.
	eid := execID{cid: args.ID}
	l := &Loader{
		k:                 k,
		watchdog:          dog,
		sandboxID:         args.ID,
		processes:         map[execID]*execProcess{eid: {}},
		mountHints:        mountHints,
		root:              info,
		stopProfiling:     stopProfiling,
		productName:       args.ProductName,
		nvidiaUVMDevMajor: info.nvidiaUVMDevMajor,
	}

	// We don't care about child signals; some platforms can generate a
	// tremendous number of useless ones (I'm looking at you, ptrace).
	if err := sighandling.IgnoreChildStop(); err != nil {
		return nil, fmt.Errorf("ignore child stop signals failed: %w", err)
	}

	// Create the control server using the provided FD.
	//
	// This must be done *after* we have initialized the kernel since the
	// controller is used to configure the kernel's network stack.
	ctrl, err := newController(args.ControllerFD, l)
	if err != nil {
		return nil, fmt.Errorf("creating control server: %w", err)
	}
	l.ctrl = ctrl

	// Only start serving after Loader is set to controller and controller is set
	// to Loader, because they are both used in the urpc methods.
	if err := ctrl.srv.StartServing(); err != nil {
		return nil, fmt.Errorf("starting control server: %w", err)
	}

	return l, nil
}

// createProcessArgs creates args that can be used with kernel.CreateProcess.
//
// The limits and the environment are derived from spec, the working directory
// defaults to "/" when the spec leaves it empty, and the UTS, IPC and abstract
// socket namespaces are always the kernel's root namespaces. The PID namespace
// and credentials are supplied by the caller. FDTable, File and the mount
// namespace are not set here.
func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
	// Create initial limits.
	ls, err := createLimitSet(spec)
	if err != nil {
		return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %w", err)
	}
	env, err := specutils.ResolveEnvs(spec.Process.Env)
	if err != nil {
		return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err)
	}

	wd := spec.Process.Cwd
	if wd == "" {
		wd = "/"
	}

	// Create the process arguments.
	procArgs := kernel.CreateProcessArgs{
		Argv:                    spec.Process.Args,
		Envv:                    env,
		WorkingDirectory:        wd,
		Credentials:             creds,
		Umask:                   0022,
		Limits:                  ls,
		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
		UTSNamespace:            k.RootUTSNamespace(),
		IPCNamespace:            k.RootIPCNamespace(),
		AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
		ContainerID:             id,
		PIDNamespace:            pidns,
	}

	return procArgs, nil
}

// Destroy cleans up all resources used by the loader.
//
// Note that this will block until all open control server connections have
// been closed.
For that reason, this should NOT be called in a defer, because 574 // a panic in a control server rpc would then hang forever. 575 func (l *Loader) Destroy() { 576 if l.stopSignalForwarding != nil { 577 l.stopSignalForwarding() 578 } 579 l.watchdog.Stop() 580 581 // Stop the control server. This will indirectly stop any 582 // long-running control operations that are in flight, e.g. 583 // profiling operations. 584 l.ctrl.stop() 585 586 // Release all kernel resources. This is only safe after we can no longer 587 // save/restore. 588 l.k.Release() 589 590 // Release any dangling tcp connections. 591 tcpip.ReleaseDanglingEndpoints() 592 593 // In the success case, stdioFDs and goferFDs will only contain 594 // released/closed FDs that ownership has been passed over to host FDs and 595 // gofer sessions. Close them here in case of failure. 596 for _, f := range l.root.stdioFDs { 597 _ = f.Close() 598 } 599 for _, f := range l.root.passFDs { 600 _ = f.host.Close() 601 } 602 for _, f := range l.root.goferFDs { 603 _ = f.Close() 604 } 605 606 l.stopProfiling() 607 } 608 609 func createPlatform(conf *config.Config, deviceFile *os.File) (platform.Platform, error) { 610 p, err := platform.Lookup(conf.Platform) 611 if err != nil { 612 panic(fmt.Sprintf("invalid platform %s: %s", conf.Platform, err)) 613 } 614 log.Infof("Platform: %s", conf.Platform) 615 return p.New(deviceFile) 616 } 617 618 func createMemoryFile() (*pgalloc.MemoryFile, error) { 619 const memfileName = "runsc-memory" 620 memfd, err := memutil.CreateMemFD(memfileName, 0) 621 if err != nil { 622 return nil, fmt.Errorf("error creating memfd: %w", err) 623 } 624 memfile := os.NewFile(uintptr(memfd), memfileName) 625 // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if 626 // there are memory cgroups specified, because at this point we're already 627 // in a mount namespace in which the relevant cgroupfs is not visible. 
628 mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) 629 if err != nil { 630 _ = memfile.Close() 631 return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err) 632 } 633 return mf, nil 634 } 635 636 // installSeccompFilters installs sandbox seccomp filters with the host. 637 func (l *Loader) installSeccompFilters() error { 638 if l.PreSeccompCallback != nil { 639 l.PreSeccompCallback() 640 } 641 if l.root.conf.DisableSeccomp { 642 filter.Report("syscall filter is DISABLED. Running in less secure mode.") 643 } else { 644 hostnet := l.root.conf.Network == config.NetworkHost 645 opts := filter.Options{ 646 Platform: l.k.Platform, 647 HostNetwork: hostnet, 648 HostNetworkRawSockets: hostnet && l.root.conf.EnableRaw, 649 HostFilesystem: l.root.conf.DirectFS, 650 ProfileEnable: l.root.conf.ProfileEnable, 651 NVProxy: l.root.conf.NVProxy, 652 TPUProxy: l.root.conf.TPUProxy, 653 ControllerFD: l.ctrl.srv.FD(), 654 } 655 if err := filter.Install(opts); err != nil { 656 return fmt.Errorf("installing seccomp filters: %w", err) 657 } 658 } 659 return nil 660 } 661 662 // Run runs the root container. 663 func (l *Loader) Run() error { 664 err := l.run() 665 l.ctrl.manager.startResultChan <- err 666 if err != nil { 667 // Give the controller some time to send the error to the 668 // runtime. If we return too quickly here the process will exit 669 // and the control connection will be closed before the error 670 // is returned. 671 gtime.Sleep(2 * gtime.Second) 672 return err 673 } 674 return nil 675 } 676 677 func (l *Loader) run() error { 678 if l.root.conf.Network == config.NetworkHost { 679 // Delay host network configuration to this point because network namespace 680 // is configured after the loader is created and before Run() is called. 
681 log.Debugf("Configuring host network") 682 s := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack) 683 if err := s.Configure(l.root.conf.EnableRaw); err != nil { 684 return err 685 } 686 } 687 688 l.mu.Lock() 689 defer l.mu.Unlock() 690 691 eid := execID{cid: l.sandboxID} 692 ep, ok := l.processes[eid] 693 if !ok { 694 return fmt.Errorf("trying to start deleted container %q", l.sandboxID) 695 } 696 697 // If we are restoring, we do not want to create a process. 698 // l.restore is set by the container manager when a restore call is made. 699 if !l.restore { 700 if l.root.conf.ProfileEnable { 701 pprof.Initialize() 702 } 703 704 // Finally done with all configuration. Setup filters before user code 705 // is loaded. 706 if err := l.installSeccompFilters(); err != nil { 707 return err 708 } 709 710 // Create the root container init task. It will begin running 711 // when the kernel is started. 712 var ( 713 tg *kernel.ThreadGroup 714 err error 715 ) 716 tg, ep.tty, err = l.createContainerProcess(true, l.sandboxID, &l.root) 717 if err != nil { 718 return err 719 } 720 721 if seccheck.Global.Enabled(seccheck.PointContainerStart) { 722 evt := pb.Start{ 723 Id: l.sandboxID, 724 Cwd: l.root.spec.Process.Cwd, 725 Args: l.root.spec.Process.Args, 726 Terminal: l.root.spec.Process.Terminal, 727 } 728 fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart) 729 if fields.Local.Contains(seccheck.FieldContainerStartEnv) { 730 evt.Env = l.root.spec.Process.Env 731 } 732 if !fields.Context.Empty() { 733 evt.ContextData = &pb.ContextData{} 734 kernel.LoadSeccheckData(tg.Leader(), fields.Context, evt.ContextData) 735 } 736 _ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error { 737 return c.ContainerStart(context.Background(), fields, &evt) 738 }) 739 } 740 } 741 742 ep.tg = l.k.GlobalInit() 743 if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok { 744 ep.pidnsPath = ns.Path 745 } 746 747 // Handle signals by forwarding them to the root container 
process 748 // (except for panic signal, which should cause a panic). 749 l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { 750 // Panic signal should cause a panic. 751 if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) { 752 panic("Signal-induced panic") 753 } 754 755 // Otherwise forward to root container. 756 deliveryMode := DeliverToProcess 757 if l.root.spec.Process.Terminal { 758 // Since we are running with a console, we should forward the signal to 759 // the foreground process group so that job control signals like ^C can 760 // be handled properly. 761 deliveryMode = DeliverToForegroundProcessGroup 762 } 763 log.Infof("Received external signal %d, mode: %s", sig, deliveryMode) 764 if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil { 765 log.Warningf("error sending signal %s to container %q: %s", sig, l.sandboxID, err) 766 } 767 }) 768 769 log.Infof("Process should have started...") 770 l.watchdog.Start() 771 return l.k.Start() 772 } 773 774 // createSubcontainer creates a new container inside the sandbox. 775 func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error { 776 l.mu.Lock() 777 defer l.mu.Unlock() 778 779 eid := execID{cid: cid} 780 if _, ok := l.processes[eid]; ok { 781 return fmt.Errorf("container %q already exists", cid) 782 } 783 l.processes[eid] = &execProcess{hostTTY: tty} 784 return nil 785 } 786 787 // startSubcontainer starts a child container. It returns the thread group ID of 788 // the newly created process. Used FDs are either closed or released. It's safe 789 // for the caller to close any remaining files upon return. 790 func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs, overlayFilestoreFDs []*fd.FD, overlayMediums []OverlayMedium) error { 791 // Create capabilities. 
792 caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) 793 if err != nil { 794 return fmt.Errorf("creating capabilities: %w", err) 795 } 796 797 l.mu.Lock() 798 defer l.mu.Unlock() 799 800 ep := l.processes[execID{cid: cid}] 801 if ep == nil { 802 return fmt.Errorf("trying to start a deleted container %q", cid) 803 } 804 805 // Convert the spec's additional GIDs to KGIDs. 806 extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) 807 for _, GID := range spec.Process.User.AdditionalGids { 808 extraKGIDs = append(extraKGIDs, auth.KGID(GID)) 809 } 810 811 // Create credentials. We reuse the root user namespace because the 812 // sentry currently supports only 1 mount namespace, which is tied to a 813 // single user namespace. Thus we must run in the same user namespace 814 // to access mounts. 815 creds := auth.NewUserCredentials( 816 auth.KUID(spec.Process.User.UID), 817 auth.KGID(spec.Process.User.GID), 818 extraKGIDs, 819 caps, 820 l.k.RootUserNamespace()) 821 822 var pidns *kernel.PIDNamespace 823 if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok { 824 if ns.Path != "" { 825 for _, p := range l.processes { 826 if ns.Path == p.pidnsPath { 827 log.Debugf("Joining PID namespace named %q", ns.Path) 828 pidns = p.tg.PIDNamespace() 829 break 830 } 831 } 832 } 833 if pidns == nil { 834 log.Warningf("PID namespace %q not found, running in new PID namespace", ns.Path) 835 pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace()) 836 } 837 ep.pidnsPath = ns.Path 838 } else { 839 pidns = l.k.RootPIDNamespace() 840 } 841 842 info := &containerInfo{ 843 conf: conf, 844 spec: spec, 845 goferFDs: goferFDs, 846 overlayFilestoreFDs: overlayFilestoreFDs, 847 overlayMediums: overlayMediums, 848 nvidiaUVMDevMajor: l.nvidiaUVMDevMajor, 849 } 850 info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) 851 if err != nil { 852 return fmt.Errorf("creating new process: %w", err) 853 } 854 855 // Use stdios or TTY 
depending on the spec configuration. 856 if spec.Process.Terminal { 857 if l := len(stdioFDs); l != 0 { 858 return fmt.Errorf("using TTY, stdios not expected: %d", l) 859 } 860 if ep.hostTTY == nil { 861 return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?") 862 } 863 info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY} 864 ep.hostTTY = nil 865 } else { 866 info.stdioFDs = stdioFDs 867 } 868 869 ep.tg, ep.tty, err = l.createContainerProcess(false, cid, info) 870 if err != nil { 871 return err 872 } 873 874 if seccheck.Global.Enabled(seccheck.PointContainerStart) { 875 evt := pb.Start{ 876 Id: cid, 877 Cwd: spec.Process.Cwd, 878 Args: spec.Process.Args, 879 Terminal: spec.Process.Terminal, 880 } 881 fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart) 882 if fields.Local.Contains(seccheck.FieldContainerStartEnv) { 883 evt.Env = spec.Process.Env 884 } 885 if !fields.Context.Empty() { 886 evt.ContextData = &pb.ContextData{} 887 kernel.LoadSeccheckData(ep.tg.Leader(), fields.Context, evt.ContextData) 888 } 889 _ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error { 890 return c.ContainerStart(context.Background(), fields, &evt) 891 }) 892 } 893 894 l.k.StartProcess(ep.tg) 895 return nil 896 } 897 898 func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileDescription, error) { 899 // Create the FD map, which will set stdin, stdout, and stderr. 900 ctx := info.procArgs.NewContext(l.k) 901 fdTable, ttyFile, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.passFDs, info.spec.Process.User) 902 if err != nil { 903 return nil, nil, fmt.Errorf("importing fds: %w", err) 904 } 905 // CreateProcess takes a reference on fdTable if successful. We won't need 906 // ours either way. 
907 info.procArgs.FDTable = fdTable 908 909 if info.execFD != nil { 910 if info.procArgs.Filename != "" { 911 return nil, nil, fmt.Errorf("process must either be started from a file or a filename, not both") 912 } 913 file, err := host.NewFD(ctx, l.k.HostMount(), info.execFD.FD(), &host.NewFDOptions{ 914 Readonly: true, 915 Savable: true, 916 VirtualOwner: true, 917 UID: auth.KUID(info.spec.Process.User.UID), 918 GID: auth.KGID(info.spec.Process.User.GID), 919 }) 920 if err != nil { 921 return nil, nil, err 922 } 923 defer file.DecRef(ctx) 924 info.execFD.Release() 925 926 info.procArgs.File = file 927 } 928 929 // Gofer FDs must be ordered and the first FD is always the rootfs. 930 if len(info.goferFDs) < 1 { 931 return nil, nil, fmt.Errorf("rootfs gofer FD not found") 932 } 933 l.startGoferMonitor(cid, int32(info.goferFDs[0].FD())) 934 935 mntr := newContainerMounter(info, l.k, l.mountHints, l.productName, l.sandboxID) 936 if root { 937 if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { 938 return nil, nil, err 939 } 940 } 941 if err := setupContainerVFS(ctx, info, mntr, &info.procArgs); err != nil { 942 return nil, nil, err 943 } 944 945 // Add the HOME environment variable if it is not already set. 946 info.procArgs.Envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace, 947 info.procArgs.Credentials.RealKUID, info.procArgs.Envv) 948 if err != nil { 949 return nil, nil, err 950 } 951 952 // Create and start the new process. 953 tg, _, err := l.k.CreateProcess(info.procArgs) 954 if err != nil { 955 return nil, nil, fmt.Errorf("creating process: %w", err) 956 } 957 // CreateProcess takes a reference on FDTable if successful. 958 info.procArgs.FDTable.DecRef(ctx) 959 960 // Set the foreground process group on the TTY to the global init process 961 // group, since that is what we are about to start running. 
962 if ttyFile != nil { 963 ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) 964 } 965 966 // Install seccomp filters with the new task if there are any. 967 if info.conf.OCISeccomp { 968 if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { 969 program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) 970 if err != nil { 971 return nil, nil, fmt.Errorf("building seccomp program: %w", err) 972 } 973 974 if log.IsLogging(log.Debug) { 975 out, _ := bpf.DecodeProgram(program) 976 log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out) 977 } 978 979 task := tg.Leader() 980 // NOTE: It seems Flags are ignored by runc so we ignore them too. 981 if err := task.AppendSyscallFilter(program, true); err != nil { 982 return nil, nil, fmt.Errorf("appending seccomp filters: %w", err) 983 } 984 } 985 } else { 986 if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { 987 log.Warningf("Seccomp spec is being ignored") 988 } 989 } 990 991 return tg, ttyFile, nil 992 } 993 994 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on 995 // the gofer FD looking for disconnects, and kills the container processes if 996 // the rootfs FD disconnects. 997 // 998 // Note that other gofer mounts are allowed to be unmounted and disconnected. 999 func (l *Loader) startGoferMonitor(cid string, rootfsGoferFD int32) { 1000 if rootfsGoferFD < 0 { 1001 panic(fmt.Sprintf("invalid FD: %d", rootfsGoferFD)) 1002 } 1003 go func() { 1004 log.Debugf("Monitoring gofer health for container %q", cid) 1005 events := []unix.PollFd{ 1006 { 1007 Fd: rootfsGoferFD, 1008 Events: unix.POLLHUP | unix.POLLRDHUP, 1009 }, 1010 } 1011 _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) { 1012 // Use ppoll instead of poll because it's already allowed in seccomp. 
1013 n, err := unix.Ppoll(events, nil, nil) 1014 return uintptr(n), 0, err 1015 }) 1016 if err != nil { 1017 panic(fmt.Sprintf("Error monitoring gofer FDs: %s", err)) 1018 } 1019 1020 l.mu.Lock() 1021 defer l.mu.Unlock() 1022 1023 // The gofer could have been stopped due to a normal container shutdown. 1024 // Check if the container has not stopped yet. 1025 if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil { 1026 log.Infof("Gofer socket disconnected, killing container %q", cid) 1027 if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { 1028 log.Warningf("Error killing container %q after gofer stopped: %s", cid, err) 1029 } 1030 } 1031 }() 1032 } 1033 1034 // destroySubcontainer stops a container if it is still running and cleans up 1035 // its filesystem. 1036 func (l *Loader) destroySubcontainer(cid string) error { 1037 l.mu.Lock() 1038 defer l.mu.Unlock() 1039 1040 tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}) 1041 if err != nil { 1042 // Container doesn't exist. 1043 return err 1044 } 1045 1046 // The container exists, but has it been started? 1047 if tg != nil { 1048 if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { 1049 return fmt.Errorf("sending SIGKILL to all container processes: %w", err) 1050 } 1051 // Wait for all processes that belong to the container to exit (including 1052 // exec'd processes). 1053 for _, t := range l.k.TaskSet().Root.Tasks() { 1054 if t.ContainerID() == cid { 1055 t.ThreadGroup().WaitExited() 1056 } 1057 } 1058 } 1059 1060 // No more failure from this point on. Remove all container thread groups 1061 // from the map. 
1062 for key := range l.processes { 1063 if key.cid == cid { 1064 delete(l.processes, key) 1065 } 1066 } 1067 1068 log.Debugf("Container destroyed, cid: %s", cid) 1069 return nil 1070 } 1071 1072 func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { 1073 // Hold the lock for the entire operation to ensure that exec'd process is 1074 // added to 'processes' in case it races with destroyContainer(). 1075 l.mu.Lock() 1076 defer l.mu.Unlock() 1077 1078 tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID}) 1079 if err != nil { 1080 return 0, err 1081 } 1082 if tg == nil { 1083 return 0, fmt.Errorf("container %q not started", args.ContainerID) 1084 } 1085 1086 // Get the container MountNamespace from the Task. Try to acquire ref may fail 1087 // in case it raced with task exit. 1088 // task.MountNamespace() does not take a ref, so we must do so ourselves. 1089 args.MountNamespace = tg.Leader().MountNamespace() 1090 if args.MountNamespace == nil || !args.MountNamespace.TryIncRef() { 1091 return 0, fmt.Errorf("container %q has stopped", args.ContainerID) 1092 } 1093 1094 args.Envv, err = specutils.ResolveEnvs(args.Envv) 1095 if err != nil { 1096 return 0, fmt.Errorf("resolving env: %w", err) 1097 } 1098 1099 // Add the HOME environment variable if it is not already set. 1100 ctx := vfs.WithRoot(l.k.SupervisorContext(), args.MountNamespace.Root()) 1101 defer args.MountNamespace.DecRef(ctx) 1102 args.Envv, err = user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) 1103 if err != nil { 1104 return 0, err 1105 } 1106 args.PIDNamespace = tg.PIDNamespace() 1107 1108 args.Limits, err = createLimitSet(l.root.spec) 1109 if err != nil { 1110 return 0, fmt.Errorf("creating limits: %w", err) 1111 } 1112 1113 // Start the process. 
1114 proc := control.Proc{Kernel: l.k} 1115 newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args) 1116 if err != nil { 1117 return 0, err 1118 } 1119 1120 eid := execID{cid: args.ContainerID, pid: tgid} 1121 l.processes[eid] = &execProcess{ 1122 tg: newTG, 1123 tty: ttyFile, 1124 } 1125 log.Debugf("updated processes: %v", l.processes) 1126 1127 return tgid, nil 1128 } 1129 1130 // waitContainer waits for the init process of a container to exit. 1131 func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { 1132 // Don't defer unlock, as doing so would make it impossible for 1133 // multiple clients to wait on the same container. 1134 tg, err := l.threadGroupFromID(execID{cid: cid}) 1135 if err != nil { 1136 return fmt.Errorf("can't wait for container %q: %w", cid, err) 1137 } 1138 1139 // If the thread either has already exited or exits during waiting, 1140 // consider the container exited. 1141 ws := l.wait(tg) 1142 *waitStatus = ws 1143 1144 // Check for leaks and write coverage report after the root container has 1145 // exited. This guarantees that the report is written in cases where the 1146 // sandbox is killed by a signal after the ContMgrWait request is completed. 1147 if l.root.procArgs.ContainerID == cid { 1148 // All sentry-created resources should have been released at this point. 
1149 refs.DoLeakCheck() 1150 _ = coverage.Report() 1151 } 1152 return nil 1153 } 1154 1155 func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { 1156 if tgid <= 0 { 1157 return fmt.Errorf("PID (%d) must be positive", tgid) 1158 } 1159 1160 // Try to find a process that was exec'd 1161 eid := execID{cid: cid, pid: tgid} 1162 execTG, err := l.threadGroupFromID(eid) 1163 if err == nil { 1164 ws := l.wait(execTG) 1165 *waitStatus = ws 1166 1167 l.mu.Lock() 1168 delete(l.processes, eid) 1169 log.Debugf("updated processes (removal): %v", l.processes) 1170 l.mu.Unlock() 1171 return nil 1172 } 1173 1174 // The caller may be waiting on a process not started directly via exec. 1175 // In this case, find the process in the container's PID namespace. 1176 initTG, err := l.threadGroupFromID(execID{cid: cid}) 1177 if err != nil { 1178 return fmt.Errorf("waiting for PID %d: %w", tgid, err) 1179 } 1180 tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) 1181 if tg == nil { 1182 return fmt.Errorf("waiting for PID %d: no such process", tgid) 1183 } 1184 if tg.Leader().ContainerID() != cid { 1185 return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) 1186 } 1187 ws := l.wait(tg) 1188 *waitStatus = ws 1189 return nil 1190 } 1191 1192 // wait waits for the process with TGID 'tgid' in a container's PID namespace 1193 // to exit. 1194 func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 { 1195 tg.WaitExited() 1196 return uint32(tg.ExitStatus()) 1197 } 1198 1199 // WaitForStartSignal waits for a start signal from the control server. 1200 func (l *Loader) WaitForStartSignal() { 1201 <-l.ctrl.manager.startChan 1202 } 1203 1204 // WaitExit waits for the root container to exit, and returns its exit status. 1205 func (l *Loader) WaitExit() linux.WaitStatus { 1206 // Wait for container. 1207 l.k.WaitExited() 1208 1209 // Check all references. 
1210 refs.OnExit() 1211 1212 return l.k.GlobalInit().ExitStatus() 1213 } 1214 1215 func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID, userns *auth.UserNamespace) (*inet.Namespace, error) { 1216 // Create an empty network stack because the network namespace may be empty at 1217 // this point. Netns is configured before Run() is called. Netstack is 1218 // configured using a control uRPC message. Host network is configured inside 1219 // Run(). 1220 switch conf.Network { 1221 case config.NetworkHost: 1222 // If configured for raw socket support with host network 1223 // stack, make sure that we have CAP_NET_RAW the host, 1224 // otherwise we can't make raw sockets. 1225 if conf.EnableRaw && !specutils.HasCapabilities(capability.CAP_NET_RAW) { 1226 return nil, fmt.Errorf("configuring network=host with raw sockets requires CAP_NET_RAW capability") 1227 } 1228 // No network namespacing support for hostinet yet, hence creator is nil. 1229 return inet.NewRootNamespace(hostinet.NewStack(), nil, userns), nil 1230 1231 case config.NetworkNone, config.NetworkSandbox: 1232 s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite) 1233 if err != nil { 1234 return nil, err 1235 } 1236 creator := &sandboxNetstackCreator{ 1237 clock: clock, 1238 uniqueID: uniqueID, 1239 allowPacketEndpointWrite: conf.AllowPacketEndpointWrite, 1240 } 1241 return inet.NewRootNamespace(s, creator, userns), nil 1242 1243 default: 1244 panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) 1245 } 1246 1247 } 1248 1249 func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) { 1250 netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} 1251 transProtos := []stack.TransportProtocolFactory{ 1252 tcp.NewProtocol, 1253 udp.NewProtocol, 1254 icmp.NewProtocol4, 1255 icmp.NewProtocol6, 1256 } 1257 s := 
netstack.Stack{Stack: stack.New(stack.Options{ 1258 NetworkProtocols: netProtos, 1259 TransportProtocols: transProtos, 1260 Clock: clock, 1261 Stats: netstack.Metrics, 1262 HandleLocal: true, 1263 // Enable raw sockets for users with sufficient 1264 // privileges. 1265 RawFactory: raw.EndpointFactory{}, 1266 AllowPacketEndpointWrite: allowPacketEndpointWrite, 1267 UniqueID: uniqueID, 1268 DefaultIPTables: netfilter.DefaultLinuxTables, 1269 })} 1270 1271 // Enable SACK Recovery. 1272 { 1273 opt := tcpip.TCPSACKEnabled(true) 1274 if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { 1275 return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) 1276 } 1277 } 1278 1279 // Set default TTLs as required by socket/netstack. 1280 { 1281 opt := tcpip.DefaultTTLOption(netstack.DefaultTTL) 1282 if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil { 1283 return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err) 1284 } 1285 if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil { 1286 return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err) 1287 } 1288 } 1289 1290 // Enable Receive Buffer Auto-Tuning. 1291 { 1292 opt := tcpip.TCPModerateReceiveBufferOption(true) 1293 if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { 1294 return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) 1295 } 1296 } 1297 1298 return &s, nil 1299 } 1300 1301 // sandboxNetstackCreator implements kernel.NetworkStackCreator. 1302 // 1303 // +stateify savable 1304 type sandboxNetstackCreator struct { 1305 clock tcpip.Clock 1306 uniqueID stack.UniqueID 1307 allowPacketEndpointWrite bool 1308 } 1309 1310 // CreateStack implements kernel.NetworkStackCreator.CreateStack. 
1311 func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { 1312 s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite) 1313 if err != nil { 1314 return nil, err 1315 } 1316 1317 // Setup loopback. 1318 n := &Network{Stack: s.(*netstack.Stack).Stack} 1319 nicID := tcpip.NICID(f.uniqueID.UniqueID()) 1320 link := DefaultLoopbackLink 1321 linkEP := packetsocket.New(ethernet.New(loopback.New())) 1322 opts := stack.NICOptions{Name: link.Name} 1323 1324 if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { 1325 return nil, err 1326 } 1327 1328 return s, nil 1329 } 1330 1331 // signal sends a signal to one or more processes in a container. If PID is 0, 1332 // then the container init process is used. Depending on the SignalDeliveryMode 1333 // option, the signal may be sent directly to the indicated process, to all 1334 // processes in the container, or to the foreground process group. pid is 1335 // relative to the root PID namespace, not the container's. 1336 func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error { 1337 if pid < 0 { 1338 return fmt.Errorf("PID (%d) must be positive", pid) 1339 } 1340 1341 switch mode { 1342 case DeliverToProcess: 1343 if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil { 1344 return fmt.Errorf("signaling process in container %q PID %d: %w", cid, pid, err) 1345 } 1346 return nil 1347 1348 case DeliverToForegroundProcessGroup: 1349 if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil { 1350 return fmt.Errorf("signaling foreground process group in container %q PID %d: %w", cid, pid, err) 1351 } 1352 return nil 1353 1354 case DeliverToAllProcesses: 1355 if pid != 0 { 1356 return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid) 1357 } 1358 // Check that the container has actually started before signaling it. 
1359 if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil { 1360 return err 1361 } 1362 if err := l.signalAllProcesses(cid, signo); err != nil { 1363 return fmt.Errorf("signaling all processes in container %q: %w", cid, err) 1364 } 1365 return nil 1366 1367 default: 1368 panic(fmt.Sprintf("unknown signal delivery mode %v", mode)) 1369 } 1370 } 1371 1372 // signalProcess sends signal to process in the given container. tgid is 1373 // relative to the root PID namespace, not the container's. 1374 func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error { 1375 execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) 1376 if err == nil { 1377 // Send signal directly to the identified process. 1378 return l.k.SendExternalSignalThreadGroup(execTG, &linux.SignalInfo{Signo: signo}) 1379 } 1380 1381 // The caller may be signaling a process not started directly via exec. 1382 // In this case, find the process and check that the process belongs to the 1383 // container in question. 1384 tg := l.k.RootPIDNamespace().ThreadGroupWithID(tgid) 1385 if tg == nil { 1386 return fmt.Errorf("no such process with PID %d", tgid) 1387 } 1388 if tg.Leader().ContainerID() != cid { 1389 return fmt.Errorf("process %d belongs to a different container: %q", tgid, tg.Leader().ContainerID()) 1390 } 1391 return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo}) 1392 } 1393 1394 // signalForegrondProcessGroup looks up foreground process group from the TTY 1395 // for the given "tgid" inside container "cid", and send the signal to it. 
1396 func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error { 1397 l.mu.Lock() 1398 tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid}) 1399 if err != nil { 1400 l.mu.Unlock() 1401 return fmt.Errorf("no thread group found: %w", err) 1402 } 1403 if tg == nil { 1404 l.mu.Unlock() 1405 return fmt.Errorf("container %q not started", cid) 1406 } 1407 1408 tty, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid}) 1409 l.mu.Unlock() 1410 if err != nil { 1411 return fmt.Errorf("no thread group found: %w", err) 1412 } 1413 if tty == nil { 1414 return fmt.Errorf("no TTY attached") 1415 } 1416 pg := tty.ForegroundProcessGroup() 1417 if pg == nil { 1418 // No foreground process group has been set. Signal the 1419 // original thread group. 1420 log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid) 1421 return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo}) 1422 } 1423 // Send the signal to all processes in the process group. 1424 var lastErr error 1425 for _, tg := range l.k.TaskSet().Root.ThreadGroups() { 1426 if tg.ProcessGroup() != pg { 1427 continue 1428 } 1429 if err := l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo}); err != nil { 1430 lastErr = err 1431 } 1432 } 1433 return lastErr 1434 } 1435 1436 // signalAllProcesses that belong to specified container. It's a noop if the 1437 // container hasn't started or has exited. 1438 func (l *Loader) signalAllProcesses(cid string, signo int32) error { 1439 // Pause the kernel to prevent new processes from being created while 1440 // the signal is delivered. This prevents process leaks when SIGKILL is 1441 // sent to the entire container. 
1442 l.k.Pause() 1443 defer l.k.Unpause() 1444 return l.k.SendContainerSignal(cid, &linux.SignalInfo{Signo: signo}) 1445 } 1446 1447 // threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it 1448 // acquires mutex before calling it and fails in case container hasn't started 1449 // yet. 1450 func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) { 1451 l.mu.Lock() 1452 defer l.mu.Unlock() 1453 tg, err := l.tryThreadGroupFromIDLocked(key) 1454 if err != nil { 1455 return nil, err 1456 } 1457 if tg == nil { 1458 return nil, fmt.Errorf("container %q not started", key.cid) 1459 } 1460 return tg, nil 1461 } 1462 1463 // tryThreadGroupFromIDLocked returns the thread group for the given execution 1464 // ID. It may return nil in case the container has not started yet. Returns 1465 // error if execution ID is invalid or if the container cannot be found (maybe 1466 // it has been deleted). Caller must hold 'mu'. 1467 func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) { 1468 ep := l.processes[key] 1469 if ep == nil { 1470 return nil, fmt.Errorf("container %q not found", key.cid) 1471 } 1472 return ep.tg, nil 1473 } 1474 1475 // ttyFromIDLocked returns the TTY files for the given execution ID. It may 1476 // return nil in case the container has not started yet. Returns error if 1477 // execution ID is invalid or if the container cannot be found (maybe it has 1478 // been deleted). Caller must hold 'mu'. 
1479 func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileDescription, error) { 1480 ep := l.processes[key] 1481 if ep == nil { 1482 return nil, fmt.Errorf("container %q not found", key.cid) 1483 } 1484 return ep.tty, nil 1485 } 1486 1487 func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, passFDs []fdMapping, user specs.User) (*kernel.FDTable, *host.TTYFileDescription, error) { 1488 if len(stdioFDs) != 3 { 1489 return nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) 1490 } 1491 fdMap := map[int]*fd.FD{ 1492 0: stdioFDs[0], 1493 1: stdioFDs[1], 1494 2: stdioFDs[2], 1495 } 1496 1497 // Create the entries for the host files that were passed to our app. 1498 for _, customFD := range passFDs { 1499 if customFD.guest < 0 { 1500 return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater") 1501 } 1502 fdMap[customFD.guest] = customFD.host 1503 } 1504 1505 k := kernel.KernelFromContext(ctx) 1506 fdTable := k.NewFDTable() 1507 ttyFile, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), fdMap) 1508 if err != nil { 1509 fdTable.DecRef(ctx) 1510 return nil, nil, err 1511 } 1512 return fdTable, ttyFile, nil 1513 } 1514 1515 // portForward implements initiating a portForward connection in the sandbox. portForwardProxies 1516 // represent a two connections each copying to each other (read ends to write ends) in goroutines. 1517 // The proxies are stored and can be cleaned up, or clean up after themselves if the connection 1518 // is broken. 1519 func (l *Loader) portForward(opts *PortForwardOpts) error { 1520 // Validate that we have a stream FD to write to. If this happens then 1521 // it means there is a misbehaved urpc client or a bug has occurred. 
1522 if len(opts.Files) != 1 { 1523 return fmt.Errorf("stream FD is required for port forward") 1524 } 1525 1526 l.mu.Lock() 1527 defer l.mu.Unlock() 1528 1529 cid := opts.ContainerID 1530 tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}) 1531 if err != nil { 1532 return fmt.Errorf("failed to get threadgroup from %q: %w", cid, err) 1533 } 1534 if tg == nil { 1535 return fmt.Errorf("container %q not started", cid) 1536 } 1537 1538 // Import the fd for the UDS. 1539 ctx := l.k.SupervisorContext() 1540 fd, err := l.importFD(ctx, opts.Files[0]) 1541 if err != nil { 1542 return fmt.Errorf("importing stream fd: %w", err) 1543 } 1544 cu := cleanup.Make(func() { fd.DecRef(ctx) }) 1545 defer cu.Clean() 1546 1547 fdConn := pf.NewFileDescriptionConn(fd) 1548 1549 // Create a proxy to forward data between the fdConn and the sandboxed application. 1550 pair := pf.ProxyPair{To: fdConn} 1551 1552 switch l.root.conf.Network { 1553 case config.NetworkSandbox: 1554 stack := l.k.RootNetworkNamespace().Stack().(*netstack.Stack).Stack 1555 nsConn, err := pf.NewNetstackConn(stack, opts.Port) 1556 if err != nil { 1557 return fmt.Errorf("creating netstack port forward connection: %w", err) 1558 } 1559 pair.From = nsConn 1560 case config.NetworkHost: 1561 hConn, err := pf.NewHostInetConn(opts.Port) 1562 if err != nil { 1563 return fmt.Errorf("creating hostinet port forward connection: %w", err) 1564 } 1565 pair.From = hConn 1566 default: 1567 return fmt.Errorf("unsupported network type %q for container %q", l.root.conf.Network, cid) 1568 } 1569 cu.Release() 1570 proxy := pf.NewProxy(pair, opts.ContainerID) 1571 1572 // Add to the list of port forward connections and remove when the 1573 // connection closes. 
1574 l.portForwardProxies = append(l.portForwardProxies, proxy) 1575 proxy.AddCleanup(func() { 1576 l.mu.Lock() 1577 defer l.mu.Unlock() 1578 for i := range l.portForwardProxies { 1579 if l.portForwardProxies[i] == proxy { 1580 l.portForwardProxies = append(l.portForwardProxies[:i], l.portForwardProxies[i+1:]...) 1581 break 1582 } 1583 } 1584 }) 1585 1586 // Start forwarding on the connection. 1587 proxy.Start(ctx) 1588 return nil 1589 } 1590 1591 // importFD generically imports a host file descriptor without adding it to any 1592 // fd table. 1593 func (l *Loader) importFD(ctx context.Context, f *os.File) (*vfs.FileDescription, error) { 1594 hostFD, err := fd.NewFromFile(f) 1595 if err != nil { 1596 return nil, err 1597 } 1598 defer hostFD.Close() 1599 fd, err := host.NewFD(ctx, l.k.HostMount(), hostFD.FD(), &host.NewFDOptions{ 1600 Savable: false, // We disconnect and close on save. 1601 IsTTY: false, 1602 VirtualOwner: false, // FD not visible to the sandboxed app so user can't be changed. 1603 }) 1604 1605 if err != nil { 1606 return nil, err 1607 } 1608 hostFD.Release() 1609 return fd, nil 1610 } 1611 1612 func (l *Loader) containerCount() int { 1613 l.mu.Lock() 1614 defer l.mu.Unlock() 1615 1616 containers := 0 1617 for id := range l.processes { 1618 if id.pid == 0 { 1619 // pid==0 represents the init process of a container. There is 1620 // only one of such process per container. 1621 containers++ 1622 } 1623 } 1624 return containers 1625 }