gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/boot/loader.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package boot loads the kernel and runs a container. 16 package boot 17 18 import ( 19 "errors" 20 "fmt" 21 mrand "math/rand" 22 "os" 23 "runtime" 24 "strconv" 25 gtime "time" 26 27 specs "github.com/opencontainers/runtime-spec/specs-go" 28 "github.com/syndtr/gocapability/capability" 29 "golang.org/x/sys/unix" 30 "gvisor.dev/gvisor/pkg/abi/linux" 31 "gvisor.dev/gvisor/pkg/bpf" 32 "gvisor.dev/gvisor/pkg/cleanup" 33 "gvisor.dev/gvisor/pkg/context" 34 "gvisor.dev/gvisor/pkg/coverage" 35 "gvisor.dev/gvisor/pkg/cpuid" 36 "gvisor.dev/gvisor/pkg/fd" 37 "gvisor.dev/gvisor/pkg/log" 38 "gvisor.dev/gvisor/pkg/memutil" 39 "gvisor.dev/gvisor/pkg/metric" 40 "gvisor.dev/gvisor/pkg/rand" 41 "gvisor.dev/gvisor/pkg/refs" 42 "gvisor.dev/gvisor/pkg/sentry/control" 43 "gvisor.dev/gvisor/pkg/sentry/devices/nvproxy" 44 "gvisor.dev/gvisor/pkg/sentry/fdimport" 45 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" 46 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" 47 "gvisor.dev/gvisor/pkg/sentry/fsimpl/user" 48 "gvisor.dev/gvisor/pkg/sentry/inet" 49 "gvisor.dev/gvisor/pkg/sentry/kernel" 50 "gvisor.dev/gvisor/pkg/sentry/kernel/auth" 51 "gvisor.dev/gvisor/pkg/sentry/loader" 52 "gvisor.dev/gvisor/pkg/sentry/pgalloc" 53 "gvisor.dev/gvisor/pkg/sentry/platform" 54 "gvisor.dev/gvisor/pkg/sentry/seccheck" 55 pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" 56 "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" 57 "gvisor.dev/gvisor/pkg/sentry/time" 58 "gvisor.dev/gvisor/pkg/sentry/usage" 59 "gvisor.dev/gvisor/pkg/sentry/vfs" 60 "gvisor.dev/gvisor/pkg/sentry/watchdog" 61 "gvisor.dev/gvisor/pkg/sighandling" 62 "gvisor.dev/gvisor/pkg/sync" 63 "gvisor.dev/gvisor/pkg/tcpip" 64 "gvisor.dev/gvisor/pkg/tcpip/link/ethernet" 65 "gvisor.dev/gvisor/pkg/tcpip/link/loopback" 66 "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" 67 "gvisor.dev/gvisor/pkg/tcpip/network/arp" 68 "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" 69 "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" 70 "gvisor.dev/gvisor/pkg/tcpip/stack" 71 "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" 72 "gvisor.dev/gvisor/pkg/tcpip/transport/raw" 73 "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" 74 "gvisor.dev/gvisor/pkg/tcpip/transport/udp" 75 "gvisor.dev/gvisor/runsc/boot/filter" 76 _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. 77 pf "gvisor.dev/gvisor/runsc/boot/portforward" 78 "gvisor.dev/gvisor/runsc/boot/pprof" 79 "gvisor.dev/gvisor/runsc/config" 80 "gvisor.dev/gvisor/runsc/profile" 81 "gvisor.dev/gvisor/runsc/specutils" 82 "gvisor.dev/gvisor/runsc/specutils/seccomp" 83 84 // Top-level inet providers. 85 "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" 86 "gvisor.dev/gvisor/pkg/sentry/socket/netstack" 87 88 // Include other supported socket providers. 
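	// These are imported for their side effects only: each provider registers
	// itself in its init function.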
89 _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink" 90 _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route" 91 _ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent" 92 _ "gvisor.dev/gvisor/pkg/sentry/socket/unix" 93 ) 94 95 // ContainerRuntimeState is the runtime state of a container. 96 type ContainerRuntimeState int 97 98 const ( 99 // RuntimeStateInvalid used just in case of error. 100 RuntimeStateInvalid ContainerRuntimeState = iota 101 // RuntimeStateCreating indicates that the container is being 102 // created, but has not started running yet. 103 RuntimeStateCreating 104 // RuntimeStateRunning indicates that the container is running. 105 RuntimeStateRunning 106 // RuntimeStateStopped indicates that the container has stopped. 107 RuntimeStateStopped 108 ) 109 110 type containerInfo struct { 111 cid string 112 113 containerName string 114 115 conf *config.Config 116 117 // spec is the base configuration for the root container. 118 spec *specs.Spec 119 120 // procArgs refers to the container's init task. 121 procArgs kernel.CreateProcessArgs 122 123 // stdioFDs contains stdin, stdout, and stderr. 124 stdioFDs []*fd.FD 125 126 // passFDs are mappings of user-supplied host to guest file descriptors. 127 passFDs []fdMapping 128 129 // execFD is the host file descriptor used for program execution. 130 execFD *fd.FD 131 132 // goferFDs are the FDs that attach the sandbox to the gofers. 133 goferFDs []*fd.FD 134 135 // devGoferFD is the FD to attach the sandbox to the dev gofer. 136 devGoferFD *fd.FD 137 138 // goferFilestoreFDs are FDs to the regular files that will back the tmpfs or 139 // overlayfs mount for certain gofer mounts. 140 goferFilestoreFDs []*fd.FD 141 142 // goferMountConfs contains information about how the gofer mounts have been 143 // configured. The first entry is for rootfs and the following entries are 144 // for bind mounts in Spec.Mounts (in the same order). 145 goferMountConfs []GoferMountConf 146 147 // nvidiaUVMDevMajor is the device major number used for nvidia-uvm. 148 nvidiaUVMDevMajor uint32 149 150 // nvidiaDriverVersion is the NVIDIA driver ABI version to use for 151 // communicating with NVIDIA devices on the host. 152 nvidiaDriverVersion string 153 } 154 155 type loaderState int 156 157 const ( 158 // created indicates that the Loader has been created, but not started yet. 159 created loaderState = iota 160 // started indicates that the Loader has been started. 161 started 162 // restoring indicates that the Loader has been created and is restoring 163 // containers. It will change to started after restore is completed. 164 restoring 165 ) 166 167 // Loader keeps state needed to start the kernel and run the container. 168 type Loader struct { 169 // k is the kernel. 170 k *kernel.Kernel 171 172 // ctrl is the control server. 173 ctrl *controller 174 175 // root contains information about the root container in the sandbox. 176 root containerInfo 177 178 watchdog *watchdog.Watchdog 179 180 // stopSignalForwarding disables forwarding of signals to the sandboxed 181 // container. It should be called when a sandbox is destroyed. 182 stopSignalForwarding func() 183 184 // stopProfiling stops profiling started at container creation. It 185 // should be called when a sandbox is destroyed. 186 stopProfiling func() 187 188 // PreSeccompCallback is called right before installing seccomp filters. 189 PreSeccompCallback func() 190 191 // restore is set to true if we are restoring a container. 
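	// It is set by the container manager when a restore call is made.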
	restore bool

	restoreWaiters *sync.Cond

	// sandboxID is the ID for the whole sandbox.
	sandboxID string

	// mountHints provides extra information about mounts for containers that
	// apply to the entire pod.
	mountHints *PodMountHints

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// mu guards the fields below.
	mu sync.Mutex

	// state is guarded by mu.
	state loaderState

	// sharedMounts holds VFS mounts that may be shared between containers within
	// the same pod. It is mapped by mount source.
	//
	// sharedMounts is guarded by mu.
	sharedMounts map[string]*vfs.Mount

	// processes maps container init processes and exec invocations. Root
	// processes are keyed with container ID and pid=0, while exec invocations
	// have the corresponding pid set.
	//
	// processes is guarded by mu.
	processes map[execID]*execProcess

	// containerIDs stores container names and IDs to assist with restore and
	// with container naming when the user didn't provide one.
	//
	// Mapping: name -> cid.
	// containerIDs is guarded by mu.
	containerIDs map[string]string

	// portForwardProxies is a list of active port forwarding connections.
	//
	// portForwardProxies is guarded by mu.
	portForwardProxies []*pf.Proxy
}

// execID uniquely identifies a sentry process that is executed in a container.
type execID struct {
	cid string
	pid kernel.ThreadID
}

// execProcess contains the thread group and host TTY of a sentry process.
type execProcess struct {
	// tg will be nil for containers that haven't started yet.
	tg *kernel.ThreadGroup

	// tty will be nil if the process is not attached to a terminal.
	tty *host.TTYFileDescription

	// pidnsPath is the pid namespace path in the spec.
	pidnsPath string

	// hostTTY is present when creating a sub-container with terminal enabled.
	// TTY file is passed during container create and must be saved until
	// container start.
	hostTTY *fd.FD
}

// fdMapping maps guest to host file descriptors. Guest file descriptors are
// exposed to the application inside the sandbox through the FD table.
type fdMapping struct {
	guest int
	host  *fd.FD
}

// FDMapping is a helper type to represent a mapping from guest to host file
// descriptors. In contrast to the unexported fdMapping type, it does not imply
// file ownership.
type FDMapping struct {
	Guest int
	Host  int
}

func init() {
	// Initialize the random number generator.
	mrand.Seed(gtime.Now().UnixNano())
}

// Args are the arguments for New().
type Args struct {
	// ID is the sandbox ID.
	ID string
	// Spec is the sandbox specification.
	Spec *specs.Spec
	// Conf is the system configuration.
	Conf *config.Config
	// ControllerFD is the FD to the URPC controller. The Loader takes ownership
	// of this FD and may close it at any time.
	ControllerFD int
	// Device is an optional argument that is passed to the platform. The Loader
	// takes ownership of this file and may close it at any time.
	Device *fd.FD
	// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
	// takes ownership of these FDs and may close them at any time.
	GoferFDs []int
	// DevGoferFD is the FD for the dev gofer connection.
The Loader takes 300 // ownership of this FD and may close it at any time. 301 DevGoferFD int 302 // StdioFDs is the stdio for the application. The Loader takes ownership of 303 // these FDs and may close them at any time. 304 StdioFDs []int 305 // PassFDs are user-supplied FD mappings from host to guest descriptors. 306 // The Loader takes ownership of these FDs and may close them at any time. 307 PassFDs []FDMapping 308 // ExecFD is the host file descriptor used for program execution. 309 ExecFD int 310 // GoferFilestoreFDs are FDs to the regular files that will back the tmpfs or 311 // overlayfs mount for certain gofer mounts. 312 GoferFilestoreFDs []int 313 // GoferMountConfs contains information about how the gofer mounts have been 314 // configured. The first entry is for rootfs and the following entries are 315 // for bind mounts in Spec.Mounts (in the same order). 316 GoferMountConfs []GoferMountConf 317 // NumCPU is the number of CPUs to create inside the sandbox. 318 NumCPU int 319 // TotalMem is the initial amount of total memory to report back to the 320 // container. 321 TotalMem uint64 322 // TotalHostMem is the total memory reported by host /proc/meminfo. 323 TotalHostMem uint64 324 // UserLogFD is the file descriptor to write user logs to. 325 UserLogFD int 326 // ProductName is the value to show in 327 // /sys/devices/virtual/dmi/id/product_name. 328 ProductName string 329 // PodInitConfigFD is the file descriptor to a file passed in the 330 // --pod-init-config flag 331 PodInitConfigFD int 332 // SinkFDs is an ordered array of file descriptors to be used by seccheck 333 // sinks configured from the --pod-init-config file. 334 SinkFDs []int 335 // ProfileOpts contains the set of profiles to enable and the 336 // corresponding FDs where profile data will be written. 337 ProfileOpts profile.Opts 338 // NvidiaDriverVersion is the NVIDIA driver ABI version to use for 339 // communicating with NVIDIA devices on the host. 340 NvidiaDriverVersion string 341 } 342 343 // make sure stdioFDs are always the same on initial start and on restore 344 const startingStdioFD = 256 345 346 func getRootCredentials(spec *specs.Spec, conf *config.Config, userNs *auth.UserNamespace) *auth.Credentials { 347 // Create capabilities. 348 caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) 349 if err != nil { 350 return nil 351 } 352 353 // Convert the spec's additional GIDs to KGIDs. 354 extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) 355 for _, GID := range spec.Process.User.AdditionalGids { 356 extraKGIDs = append(extraKGIDs, auth.KGID(GID)) 357 } 358 359 if userNs == nil { 360 userNs = auth.NewRootUserNamespace() 361 } 362 // Create credentials. 363 creds := auth.NewUserCredentials( 364 auth.KUID(spec.Process.User.UID), 365 auth.KGID(spec.Process.User.GID), 366 extraKGIDs, 367 caps, 368 userNs) 369 370 return creds 371 } 372 373 // New initializes a new kernel loader configured by spec. 374 // New also handles setting up a kernel for restoring a container. 375 func New(args Args) (*Loader, error) { 376 stopProfilingRuntime := profile.Start(args.ProfileOpts) 377 stopProfiling := func() { 378 stopProfilingRuntime() 379 metric.StopProfilingMetrics() 380 } 381 382 // Initialize seccheck points. 383 seccheck.Initialize() 384 385 // We initialize the rand package now to make sure /dev/urandom is pre-opened 386 // on kernels that do not support getrandom(2). 
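	// (getrandom(2) is only available on Linux 3.17+, so older kernels need the
	// /dev/urandom fallback.)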
387 if err := rand.Init(); err != nil { 388 return nil, fmt.Errorf("setting up rand: %w", err) 389 } 390 391 if err := usage.Init(); err != nil { 392 return nil, fmt.Errorf("setting up memory usage: %w", err) 393 } 394 395 if specutils.NVProxyEnabled(args.Spec, args.Conf) { 396 nvproxy.Init() 397 } 398 399 kernel.IOUringEnabled = args.Conf.IOUring 400 401 eid := execID{cid: args.ID} 402 l := &Loader{ 403 sandboxID: args.ID, 404 processes: map[execID]*execProcess{eid: {}}, 405 sharedMounts: make(map[string]*vfs.Mount), 406 stopProfiling: stopProfiling, 407 productName: args.ProductName, 408 containerIDs: map[string]string{}, 409 } 410 411 containerName := l.registerContainerLocked(args.Spec, args.ID) 412 l.root = containerInfo{ 413 cid: args.ID, 414 containerName: containerName, 415 conf: args.Conf, 416 spec: args.Spec, 417 goferMountConfs: args.GoferMountConfs, 418 nvidiaDriverVersion: args.NvidiaDriverVersion, 419 } 420 421 // Make host FDs stable between invocations. Host FDs must map to the exact 422 // same number when the sandbox is restored. Otherwise the wrong FD will be 423 // used. 424 newfd := startingStdioFD 425 426 for _, stdioFD := range args.StdioFDs { 427 // Check that newfd is unused to avoid clobbering over it. 428 if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { 429 if err != nil { 430 return nil, fmt.Errorf("error checking for FD (%d) conflict: %w", newfd, err) 431 } 432 return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) 433 } 434 435 err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) 436 if err != nil { 437 return nil, fmt.Errorf("dup3 of stdios failed: %w", err) 438 } 439 l.root.stdioFDs = append(l.root.stdioFDs, fd.New(newfd)) 440 _ = unix.Close(stdioFD) 441 newfd++ 442 } 443 for _, goferFD := range args.GoferFDs { 444 l.root.goferFDs = append(l.root.goferFDs, fd.New(goferFD)) 445 } 446 for _, filestoreFD := range args.GoferFilestoreFDs { 447 l.root.goferFilestoreFDs = append(l.root.goferFilestoreFDs, fd.New(filestoreFD)) 448 } 449 if args.DevGoferFD >= 0 { 450 l.root.devGoferFD = fd.New(args.DevGoferFD) 451 } 452 if args.ExecFD >= 0 { 453 l.root.execFD = fd.New(args.ExecFD) 454 } 455 456 for _, customFD := range args.PassFDs { 457 l.root.passFDs = append(l.root.passFDs, fdMapping{ 458 host: fd.New(customFD.Host), 459 guest: customFD.Guest, 460 }) 461 } 462 463 // Create kernel and platform. 464 p, err := createPlatform(args.Conf, args.Device) 465 if err != nil { 466 return nil, fmt.Errorf("creating platform: %w", err) 467 } 468 if specutils.NVProxyEnabled(args.Spec, args.Conf) && p.OwnsPageTables() { 469 return nil, fmt.Errorf("--nvproxy is incompatible with platform %s: owns page tables", args.Conf.Platform) 470 } 471 l.k = &kernel.Kernel{Platform: p} 472 473 // Create memory file. 474 mf, err := createMemoryFile() 475 if err != nil { 476 return nil, fmt.Errorf("creating memory file: %w", err) 477 } 478 l.k.SetMemoryFile(mf) 479 480 // Create VDSO. 481 // 482 // Pass k as the platform since it is savable, unlike the actual platform. 483 vdso, err := loader.PrepareVDSO(l.k.MemoryFile()) 484 if err != nil { 485 return nil, fmt.Errorf("creating vdso: %w", err) 486 } 487 488 // Create timekeeper. 
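	// The timekeeper keeps its clock parameters in the VDSO parameter page
	// prepared above.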
	tk := kernel.NewTimekeeper(l.k.MemoryFile(), vdso.ParamPage.FileRange())
	tk.SetClocks(time.NewCalibratedClocks())

	if err := enableStrace(args.Conf); err != nil {
		return nil, fmt.Errorf("enabling strace: %w", err)
	}

	creds := getRootCredentials(args.Spec, args.Conf, nil /* UserNamespace */)
	if creds == nil {
		return nil, fmt.Errorf("getting root credentials")
	}
	// Create root network namespace/stack.
	netns, err := newRootNetworkNamespace(args.Conf, tk, l.k, creds.UserNamespace)
	if err != nil {
		return nil, fmt.Errorf("creating network: %w", err)
	}

	if args.NumCPU == 0 {
		args.NumCPU = runtime.NumCPU()
	}
	log.Infof("CPUs: %d", args.NumCPU)
	runtime.GOMAXPROCS(args.NumCPU)

	if args.TotalHostMem > 0 {
		// As per tmpfs(5), the default size limit is 50% of total physical RAM.
		// See mm/shmem.c:shmem_default_max_blocks().
		tmpfs.SetDefaultSizeLimit(args.TotalHostMem / 2)
		// Set a generous but sane maximum on the size of memory file allocations.
		usage.MaximumAllocatableBytes = args.TotalHostMem
	}

	if args.TotalMem > 0 {
		// Adjust the total memory returned by the Sentry so that applications that
		// use /proc/meminfo can make allocations based on this limit.
		usage.MinimumTotalMemoryBytes = args.TotalMem
		usage.MaximumTotalMemoryBytes = args.TotalMem
		// Reset max allocatable to TotalMem because it's smaller than TotalHostMem.
		usage.MaximumAllocatableBytes = args.TotalMem
		log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
	}

	maxFDLimit := kernel.MaxFdLimit
	if args.Spec.Linux != nil && args.Spec.Linux.Sysctl != nil {
		if val, ok := args.Spec.Linux.Sysctl["fs.nr_open"]; ok {
			nrOpen, err := strconv.Atoi(val)
			if err != nil {
				return nil, fmt.Errorf("setting fs.nr_open=%s: %w", val, err)
			}
			if nrOpen <= 0 || nrOpen > int(kernel.MaxFdLimit) {
				return nil, fmt.Errorf("setting fs.nr_open=%s", val)
			}
			maxFDLimit = int32(nrOpen)
		}
	}
	// Initialize the Kernel object, which is required by the Context passed
	// to createVFS in order to mount (among other things) procfs.
	if err = l.k.Init(kernel.InitKernelArgs{
		FeatureSet:           cpuid.HostFeatureSet().Fixed(),
		Timekeeper:           tk,
		RootUserNamespace:    creds.UserNamespace,
		RootNetworkNamespace: netns,
		ApplicationCores:     uint(args.NumCPU),
		Vdso:                 vdso,
		RootUTSNamespace:     kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
		RootIPCNamespace:     kernel.NewIPCNamespace(creds.UserNamespace),
		PIDNamespace:         kernel.NewRootPIDNamespace(creds.UserNamespace),
		MaxFDLimit:           maxFDLimit,
	}); err != nil {
		return nil, fmt.Errorf("initializing kernel: %w", err)
	}

	if err := registerFilesystems(l.k, &l.root); err != nil {
		return nil, fmt.Errorf("registering filesystems: %w", err)
	}

	// Turn on packet logging if enabled.
	if args.Conf.LogPackets {
		log.Infof("Packet logging enabled")
		sniffer.LogPackets.Store(1)
	} else {
		log.Infof("Packet logging disabled")
		sniffer.LogPackets.Store(0)
	}

	// Create a watchdog.
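	// The watchdog periodically checks for stuck tasks and applies the action
	// configured in args.Conf.WatchdogAction when it finds one.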
575 dogOpts := watchdog.DefaultOpts 576 dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction 577 l.watchdog = watchdog.New(l.k, dogOpts) 578 579 procArgs, err := createProcessArgs(args.ID, args.Spec, args.Conf, creds, l.k, l.k.RootPIDNamespace()) 580 if err != nil { 581 return nil, fmt.Errorf("creating init process for root container: %w", err) 582 } 583 l.root.procArgs = procArgs 584 585 if err := initCompatLogs(args.UserLogFD); err != nil { 586 return nil, fmt.Errorf("initializing compat logs: %w", err) 587 } 588 589 l.mountHints, err = NewPodMountHints(args.Spec) 590 if err != nil { 591 return nil, fmt.Errorf("creating pod mount hints: %w", err) 592 } 593 594 // Set up host mount that will be used for imported fds. 595 hostFilesystem, err := host.NewFilesystem(l.k.VFS()) 596 if err != nil { 597 return nil, fmt.Errorf("failed to create hostfs filesystem: %w", err) 598 } 599 defer hostFilesystem.DecRef(l.k.SupervisorContext()) 600 l.k.SetHostMount(l.k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})) 601 602 if args.PodInitConfigFD >= 0 { 603 if err := setupSeccheck(args.PodInitConfigFD, args.SinkFDs); err != nil { 604 log.Warningf("unable to configure event session: %v", err) 605 } 606 } 607 608 l.k.RegisterContainerName(args.ID, l.root.containerName) 609 610 // We don't care about child signals; some platforms can generate a 611 // tremendous number of useless ones (I'm looking at you, ptrace). 612 if err := sighandling.IgnoreChildStop(); err != nil { 613 return nil, fmt.Errorf("ignore child stop signals failed: %w", err) 614 } 615 616 // Create the control server using the provided FD. 617 // 618 // This must be done *after* we have initialized the kernel since the 619 // controller is used to configure the kernel's network stack. 620 ctrl, err := newController(args.ControllerFD, l) 621 if err != nil { 622 return nil, fmt.Errorf("creating control server: %w", err) 623 } 624 l.ctrl = ctrl 625 626 // Only start serving after Loader is set to controller and controller is set 627 // to Loader, because they are both used in the urpc methods. 628 if err := ctrl.srv.StartServing(); err != nil { 629 return nil, fmt.Errorf("starting control server: %w", err) 630 } 631 632 return l, nil 633 } 634 635 // createProcessArgs creates args that can be used with kernel.CreateProcess. 636 func createProcessArgs(id string, spec *specs.Spec, conf *config.Config, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) { 637 // Create initial limits. 638 ls, err := createLimitSet(spec, specutils.TPUProxyIsEnabled(spec, conf)) 639 if err != nil { 640 return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %w", err) 641 } 642 env, err := specutils.ResolveEnvs(spec.Process.Env) 643 if err != nil { 644 return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err) 645 } 646 647 wd := spec.Process.Cwd 648 if wd == "" { 649 wd = "/" 650 } 651 652 // Create the process arguments. 653 procArgs := kernel.CreateProcessArgs{ 654 Argv: spec.Process.Args, 655 Envv: env, 656 WorkingDirectory: wd, 657 Credentials: creds, 658 Umask: 0022, 659 Limits: ls, 660 MaxSymlinkTraversals: linux.MaxSymlinkTraversals, 661 UTSNamespace: k.RootUTSNamespace(), 662 IPCNamespace: k.RootIPCNamespace(), 663 ContainerID: id, 664 PIDNamespace: pidns, 665 } 666 667 return procArgs, nil 668 } 669 670 // Destroy cleans up all resources used by the loader. 671 // 672 // Note that this will block until all open control server connections have 673 // been closed. 
For that reason, this should NOT be called in a defer, because 674 // a panic in a control server rpc would then hang forever. 675 func (l *Loader) Destroy() { 676 if l.stopSignalForwarding != nil { 677 l.stopSignalForwarding() 678 } 679 l.watchdog.Stop() 680 681 ctx := l.k.SupervisorContext() 682 for _, m := range l.sharedMounts { 683 m.DecRef(ctx) 684 } 685 686 // Stop the control server. This will indirectly stop any 687 // long-running control operations that are in flight, e.g. 688 // profiling operations. 689 l.ctrl.stop() 690 691 // Release all kernel resources. This is only safe after we can no longer 692 // save/restore. 693 l.k.Release() 694 695 // Release any dangling tcp connections. 696 tcpip.ReleaseDanglingEndpoints() 697 698 // In the success case, all FDs in l.root will only contain released/closed 699 // FDs whose ownership has been passed over to host FDs and gofer sessions. 700 // Close them here in case of failure. 701 for _, f := range l.root.stdioFDs { 702 _ = f.Close() 703 } 704 for _, f := range l.root.passFDs { 705 _ = f.host.Close() 706 } 707 for _, f := range l.root.goferFDs { 708 _ = f.Close() 709 } 710 for _, f := range l.root.goferFilestoreFDs { 711 _ = f.Close() 712 } 713 if l.root.devGoferFD != nil { 714 _ = l.root.devGoferFD.Close() 715 } 716 717 l.stopProfiling() 718 // Check all references. 719 refs.OnExit() 720 } 721 722 func createPlatform(conf *config.Config, deviceFile *fd.FD) (platform.Platform, error) { 723 p, err := platform.Lookup(conf.Platform) 724 if err != nil { 725 panic(fmt.Sprintf("invalid platform %s: %s", conf.Platform, err)) 726 } 727 log.Infof("Platform: %s", conf.Platform) 728 return p.New(deviceFile) 729 } 730 731 func createMemoryFile() (*pgalloc.MemoryFile, error) { 732 const memfileName = "runsc-memory" 733 memfd, err := memutil.CreateMemFD(memfileName, 0) 734 if err != nil { 735 return nil, fmt.Errorf("error creating memfd: %w", err) 736 } 737 memfile := os.NewFile(uintptr(memfd), memfileName) 738 // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if 739 // there are memory cgroups specified, because at this point we're already 740 // in a mount namespace in which the relevant cgroupfs is not visible. 741 mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{ 742 EnforceMaximumAllocatable: true, 743 }) 744 if err != nil { 745 _ = memfile.Close() 746 return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err) 747 } 748 return mf, nil 749 } 750 751 // installSeccompFilters installs sandbox seccomp filters with the host. 752 func (l *Loader) installSeccompFilters() error { 753 if l.PreSeccompCallback != nil { 754 l.PreSeccompCallback() 755 } 756 if l.root.conf.DisableSeccomp { 757 log.Warningf("*** SECCOMP WARNING: syscall filter is DISABLED. Running in less secure mode.") 758 } else { 759 hostnet := l.root.conf.Network == config.NetworkHost 760 opts := filter.Options{ 761 Platform: l.k.Platform.SeccompInfo(), 762 HostNetwork: hostnet, 763 HostNetworkRawSockets: hostnet && l.root.conf.EnableRaw, 764 HostFilesystem: l.root.conf.DirectFS, 765 ProfileEnable: l.root.conf.ProfileEnable, 766 NVProxy: specutils.NVProxyEnabled(l.root.spec, l.root.conf), 767 TPUProxy: specutils.TPUProxyIsEnabled(l.root.spec, l.root.conf), 768 ControllerFD: uint32(l.ctrl.srv.FD()), 769 } 770 if err := filter.Install(opts); err != nil { 771 return fmt.Errorf("installing seccomp filters: %w", err) 772 } 773 } 774 return nil 775 } 776 777 // Run runs the root container. 
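// The result of the run is also delivered to the controller so it can be
// reported back to the runtime.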
778 func (l *Loader) Run() error { 779 err := l.run() 780 l.ctrl.manager.startResultChan <- err 781 if err != nil { 782 // Give the controller some time to send the error to the 783 // runtime. If we return too quickly here the process will exit 784 // and the control connection will be closed before the error 785 // is returned. 786 gtime.Sleep(2 * gtime.Second) 787 return err 788 } 789 return nil 790 } 791 792 func (l *Loader) run() error { 793 if l.root.conf.Network == config.NetworkHost { 794 // Delay host network configuration to this point because network namespace 795 // is configured after the loader is created and before Run() is called. 796 log.Debugf("Configuring host network") 797 s := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack) 798 if err := s.Configure(l.root.conf.EnableRaw); err != nil { 799 return err 800 } 801 } 802 803 l.mu.Lock() 804 defer l.mu.Unlock() 805 806 eid := execID{cid: l.sandboxID} 807 ep, ok := l.processes[eid] 808 if !ok { 809 return fmt.Errorf("trying to start deleted container %q", l.sandboxID) 810 } 811 812 // If we are restoring, we do not want to create a process. 813 // l.restore is set by the container manager when a restore call is made. 814 if !l.restore { 815 if l.root.conf.ProfileEnable { 816 pprof.Initialize() 817 } 818 819 // Finally done with all configuration. Setup filters before user code 820 // is loaded. 821 if err := l.installSeccompFilters(); err != nil { 822 return err 823 } 824 825 // Create the root container init task. It will begin running 826 // when the kernel is started. 827 var ( 828 tg *kernel.ThreadGroup 829 err error 830 ) 831 tg, ep.tty, err = l.createContainerProcess(&l.root) 832 if err != nil { 833 return err 834 } 835 836 if seccheck.Global.Enabled(seccheck.PointContainerStart) { 837 evt := pb.Start{ 838 Id: l.sandboxID, 839 Cwd: l.root.spec.Process.Cwd, 840 Args: l.root.spec.Process.Args, 841 Terminal: l.root.spec.Process.Terminal, 842 } 843 fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart) 844 if fields.Local.Contains(seccheck.FieldContainerStartEnv) { 845 evt.Env = l.root.spec.Process.Env 846 } 847 if !fields.Context.Empty() { 848 evt.ContextData = &pb.ContextData{} 849 kernel.LoadSeccheckData(tg.Leader(), fields.Context, evt.ContextData) 850 } 851 _ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error { 852 return c.ContainerStart(context.Background(), fields, &evt) 853 }) 854 } 855 } 856 857 ep.tg = l.k.GlobalInit() 858 if ns, ok := specutils.GetNS(specs.PIDNamespace, l.root.spec); ok { 859 ep.pidnsPath = ns.Path 860 } 861 862 // Handle signals by forwarding them to the root container process 863 // (except for panic signal, which should cause a panic). 864 l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) { 865 // Panic signal should cause a panic. 866 if l.root.conf.PanicSignal != -1 && sig == linux.Signal(l.root.conf.PanicSignal) { 867 panic("Signal-induced panic") 868 } 869 870 // Otherwise forward to root container. 871 deliveryMode := DeliverToProcess 872 if l.root.spec.Process.Terminal { 873 // Since we are running with a console, we should forward the signal to 874 // the foreground process group so that job control signals like ^C can 875 // be handled properly. 
876 deliveryMode = DeliverToForegroundProcessGroup 877 } 878 log.Infof("Received external signal %d, mode: %s", sig, deliveryMode) 879 if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil { 880 log.Warningf("error sending signal %s to container %q: %s", sig, l.sandboxID, err) 881 } 882 }) 883 884 log.Infof("Process should have started...") 885 l.watchdog.Start() 886 if err := l.k.Start(); err != nil { 887 return err 888 } 889 l.state = started 890 return nil 891 } 892 893 // createSubcontainer creates a new container inside the sandbox. 894 func (l *Loader) createSubcontainer(cid string, tty *fd.FD) error { 895 l.mu.Lock() 896 defer l.mu.Unlock() 897 898 eid := execID{cid: cid} 899 if _, ok := l.processes[eid]; ok { 900 return fmt.Errorf("container %q already exists", cid) 901 } 902 l.processes[eid] = &execProcess{hostTTY: tty} 903 return nil 904 } 905 906 // startSubcontainer starts a child container. It returns the thread group ID of 907 // the newly created process. Used FDs are either closed or released. It's safe 908 // for the caller to close any remaining files upon return. 909 func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs, goferFilestoreFDs []*fd.FD, devGoferFD *fd.FD, goferMountConfs []GoferMountConf) error { 910 l.mu.Lock() 911 defer l.mu.Unlock() 912 913 ep := l.processes[execID{cid: cid}] 914 if ep == nil { 915 return fmt.Errorf("trying to start a deleted container %q", cid) 916 } 917 918 // Create credentials. We reuse the root user namespace because the 919 // sentry currently supports only 1 mount namespace, which is tied to a 920 // single user namespace. Thus we must run in the same user namespace 921 // to access mounts. 922 creds := getRootCredentials(spec, conf, l.k.RootUserNamespace()) 923 if creds == nil { 924 return fmt.Errorf("getting root credentials") 925 } 926 var pidns *kernel.PIDNamespace 927 if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok { 928 if ns.Path != "" { 929 for _, p := range l.processes { 930 if ns.Path == p.pidnsPath { 931 log.Debugf("Joining PID namespace named %q", ns.Path) 932 pidns = p.tg.PIDNamespace() 933 break 934 } 935 } 936 } 937 if pidns == nil { 938 log.Warningf("PID namespace %q not found, running in new PID namespace", ns.Path) 939 pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace()) 940 } 941 ep.pidnsPath = ns.Path 942 } else { 943 pidns = l.k.RootPIDNamespace() 944 } 945 946 containerName := l.registerContainerLocked(spec, cid) 947 info := &containerInfo{ 948 cid: cid, 949 containerName: containerName, 950 conf: conf, 951 spec: spec, 952 goferFDs: goferFDs, 953 devGoferFD: devGoferFD, 954 goferFilestoreFDs: goferFilestoreFDs, 955 goferMountConfs: goferMountConfs, 956 nvidiaUVMDevMajor: l.root.nvidiaUVMDevMajor, 957 nvidiaDriverVersion: l.root.nvidiaDriverVersion, 958 } 959 var err error 960 info.procArgs, err = createProcessArgs(cid, spec, conf, creds, l.k, pidns) 961 if err != nil { 962 return fmt.Errorf("creating new process: %w", err) 963 } 964 965 // Use stdios or TTY depending on the spec configuration. 966 if spec.Process.Terminal { 967 if l := len(stdioFDs); l != 0 { 968 return fmt.Errorf("using TTY, stdios not expected: %d", l) 969 } 970 if ep.hostTTY == nil { 971 return fmt.Errorf("terminal enabled but no TTY provided. 
Did you set --console-socket on create?") 972 } 973 info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY} 974 ep.hostTTY = nil 975 } else { 976 info.stdioFDs = stdioFDs 977 } 978 979 var cu cleanup.Cleanup 980 defer cu.Clean() 981 if devGoferFD != nil { 982 cu.Add(func() { 983 // createContainerProcess() will consume devGoferFD and initialize a gofer 984 // connection. This connection is owned by l.k. In case of failure, we want 985 // to clean up this gofer connection so that the gofer process can exit. 986 l.k.RemoveDevGofer(containerName) 987 }) 988 } 989 990 ep.tg, ep.tty, err = l.createContainerProcess(info) 991 if err != nil { 992 return err 993 } 994 995 if seccheck.Global.Enabled(seccheck.PointContainerStart) { 996 evt := pb.Start{ 997 Id: cid, 998 Cwd: spec.Process.Cwd, 999 Args: spec.Process.Args, 1000 Terminal: spec.Process.Terminal, 1001 } 1002 fields := seccheck.Global.GetFieldSet(seccheck.PointContainerStart) 1003 if fields.Local.Contains(seccheck.FieldContainerStartEnv) { 1004 evt.Env = spec.Process.Env 1005 } 1006 if !fields.Context.Empty() { 1007 evt.ContextData = &pb.ContextData{} 1008 kernel.LoadSeccheckData(ep.tg.Leader(), fields.Context, evt.ContextData) 1009 } 1010 _ = seccheck.Global.SentToSinks(func(c seccheck.Sink) error { 1011 return c.ContainerStart(context.Background(), fields, &evt) 1012 }) 1013 } 1014 1015 l.k.RegisterContainerName(cid, info.containerName) 1016 l.k.StartProcess(ep.tg) 1017 // No more failures from this point on. 1018 cu.Release() 1019 return nil 1020 } 1021 1022 // +checklocks:l.mu 1023 func (l *Loader) createContainerProcess(info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileDescription, error) { 1024 // Create the FD map, which will set stdin, stdout, and stderr. 1025 ctx := info.procArgs.NewContext(l.k) 1026 fdTable, ttyFile, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs, info.passFDs, info.spec.Process.User, info.containerName) 1027 if err != nil { 1028 return nil, nil, fmt.Errorf("importing fds: %w", err) 1029 } 1030 // CreateProcess takes a reference on fdTable if successful. We won't need 1031 // ours either way. 1032 info.procArgs.FDTable = fdTable 1033 1034 if info.execFD != nil { 1035 if info.procArgs.Filename != "" { 1036 return nil, nil, fmt.Errorf("process must either be started from a file or a filename, not both") 1037 } 1038 file, err := host.NewFD(ctx, l.k.HostMount(), info.execFD.FD(), &host.NewFDOptions{ 1039 Readonly: true, 1040 Savable: true, 1041 VirtualOwner: true, 1042 UID: auth.KUID(info.spec.Process.User.UID), 1043 GID: auth.KGID(info.spec.Process.User.GID), 1044 }) 1045 if err != nil { 1046 return nil, nil, err 1047 } 1048 defer file.DecRef(ctx) 1049 info.execFD.Release() 1050 1051 info.procArgs.File = file 1052 } 1053 1054 // Gofer FDs must be ordered and the first FD is always the rootfs. 1055 if len(info.goferFDs) < 1 { 1056 return nil, nil, fmt.Errorf("rootfs gofer FD not found") 1057 } 1058 l.startGoferMonitor(info) 1059 1060 if l.root.cid == l.sandboxID { 1061 // Mounts cgroups for all the controllers. 1062 if err := l.mountCgroupMounts(info.conf, info.procArgs.Credentials); err != nil { 1063 return nil, nil, err 1064 } 1065 } 1066 // We can share l.sharedMounts with containerMounter since l.mu is locked. 1067 // Hence, mntr must only be used within this function (while l.mu is locked). 
1068 mntr := newContainerMounter(info, l.k, l.mountHints, l.sharedMounts, l.productName, l.sandboxID) 1069 if err := setupContainerVFS(ctx, info, mntr, &info.procArgs); err != nil { 1070 return nil, nil, err 1071 } 1072 defer func() { 1073 for cg := range info.procArgs.InitialCgroups { 1074 cg.Dentry.DecRef(ctx) 1075 } 1076 }() 1077 1078 // Add the HOME environment variable if it is not already set. 1079 info.procArgs.Envv, err = user.MaybeAddExecUserHome(ctx, info.procArgs.MountNamespace, 1080 info.procArgs.Credentials.RealKUID, info.procArgs.Envv) 1081 if err != nil { 1082 return nil, nil, err 1083 } 1084 1085 // Create and start the new process. 1086 tg, _, err := l.k.CreateProcess(info.procArgs) 1087 if err != nil { 1088 return nil, nil, fmt.Errorf("creating process: %w", err) 1089 } 1090 // CreateProcess takes a reference on FDTable if successful. 1091 info.procArgs.FDTable.DecRef(ctx) 1092 1093 // Set the foreground process group on the TTY to the global init process 1094 // group, since that is what we are about to start running. 1095 if ttyFile != nil { 1096 ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) 1097 } 1098 1099 // Install seccomp filters with the new task if there are any. 1100 if info.conf.OCISeccomp { 1101 if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { 1102 program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) 1103 if err != nil { 1104 return nil, nil, fmt.Errorf("building seccomp program: %w", err) 1105 } 1106 1107 if log.IsLogging(log.Debug) { 1108 out, _ := bpf.DecodeProgram(program) 1109 log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out) 1110 } 1111 1112 task := tg.Leader() 1113 // NOTE: It seems Flags are ignored by runc so we ignore them too. 1114 if err := task.AppendSyscallFilter(program, true); err != nil { 1115 return nil, nil, fmt.Errorf("appending seccomp filters: %w", err) 1116 } 1117 } 1118 } else { 1119 if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { 1120 log.Warningf("Seccomp spec is being ignored") 1121 } 1122 } 1123 1124 return tg, ttyFile, nil 1125 } 1126 1127 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on 1128 // the gofer FD looking for disconnects, and kills the container processes if 1129 // the gofer connection disconnects. 1130 func (l *Loader) startGoferMonitor(info *containerInfo) { 1131 // We need to pick a suitable gofer connection that is expected to be alive 1132 // for the entire container lifecycle. Only the following can be used: 1133 // 1. Rootfs gofer connection 1134 // 2. Device gofer connection 1135 // 1136 // Note that other gofer mounts are allowed to be unmounted and disconnected. 1137 goferFD := -1 1138 if info.goferMountConfs[0].ShouldUseLisafs() { 1139 goferFD = info.goferFDs[0].FD() 1140 } else if info.devGoferFD != nil { 1141 goferFD = info.devGoferFD.FD() 1142 } 1143 if goferFD < 0 { 1144 log.Warningf("could not find a suitable gofer FD to monitor") 1145 return 1146 } 1147 go func() { 1148 log.Debugf("Monitoring gofer health for container %q", info.cid) 1149 events := []unix.PollFd{ 1150 { 1151 Fd: int32(goferFD), 1152 Events: unix.POLLHUP | unix.POLLRDHUP, 1153 }, 1154 } 1155 _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) { 1156 // Use ppoll instead of poll because it's already allowed in seccomp. 
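			// A nil timeout makes Ppoll block until one of the requested events fires.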
1157 n, err := unix.Ppoll(events, nil, nil) 1158 return uintptr(n), 0, err 1159 }) 1160 if err != nil { 1161 panic(fmt.Sprintf("Error monitoring gofer FDs: %s", err)) 1162 } 1163 1164 l.mu.Lock() 1165 defer l.mu.Unlock() 1166 1167 // The gofer could have been stopped due to a normal container shutdown. 1168 // Check if the container has not stopped yet. 1169 if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: info.cid}); tg != nil { 1170 log.Infof("Gofer socket disconnected, killing container %q", info.cid) 1171 if err := l.signalAllProcesses(info.cid, int32(linux.SIGKILL)); err != nil { 1172 log.Warningf("Error killing container %q after gofer stopped: %s", info.cid, err) 1173 } 1174 } 1175 }() 1176 } 1177 1178 // destroySubcontainer stops a container if it is still running and cleans up 1179 // its filesystem. 1180 func (l *Loader) destroySubcontainer(cid string) error { 1181 l.mu.Lock() 1182 defer l.mu.Unlock() 1183 1184 tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}) 1185 if err != nil { 1186 // Container doesn't exist. 1187 return err 1188 } 1189 1190 // The container exists, but has it been started? 1191 if tg != nil { 1192 if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { 1193 return fmt.Errorf("sending SIGKILL to all container processes: %w", err) 1194 } 1195 // Wait for all processes that belong to the container to exit (including 1196 // exec'd processes). 1197 for _, t := range l.k.TaskSet().Root.Tasks() { 1198 if t.ContainerID() == cid { 1199 t.ThreadGroup().WaitExited() 1200 } 1201 } 1202 } 1203 1204 // No more failure from this point on. 1205 1206 // Remove all container thread groups from the map. 1207 for key := range l.processes { 1208 if key.cid == cid { 1209 delete(l.processes, key) 1210 } 1211 } 1212 // Cleanup the device gofer. 1213 l.k.RemoveDevGofer(l.k.ContainerName(cid)) 1214 1215 log.Debugf("Container destroyed, cid: %s", cid) 1216 return nil 1217 } 1218 1219 func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { 1220 // Hold the lock for the entire operation to ensure that exec'd process is 1221 // added to 'processes' in case it races with destroyContainer(). 1222 l.mu.Lock() 1223 defer l.mu.Unlock() 1224 1225 tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID}) 1226 if err != nil { 1227 return 0, err 1228 } 1229 if tg == nil { 1230 return 0, fmt.Errorf("container %q not started", args.ContainerID) 1231 } 1232 1233 // Get the container MountNamespace from the Task. Try to acquire ref may fail 1234 // in case it raced with task exit. 1235 // task.MountNamespace() does not take a ref, so we must do so ourselves. 1236 args.MountNamespace = tg.Leader().MountNamespace() 1237 if args.MountNamespace == nil || !args.MountNamespace.TryIncRef() { 1238 return 0, fmt.Errorf("container %q has stopped", args.ContainerID) 1239 } 1240 1241 args.Envv, err = specutils.ResolveEnvs(args.Envv) 1242 if err != nil { 1243 return 0, fmt.Errorf("resolving env: %w", err) 1244 } 1245 1246 // Add the HOME environment variable if it is not already set. 
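	// Resolving the user's home directory must happen relative to the container's
	// mount namespace root, so build a context rooted there.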
1247 sctx := l.k.SupervisorContext() 1248 root := args.MountNamespace.Root(sctx) 1249 defer root.DecRef(sctx) 1250 ctx := vfs.WithRoot(sctx, root) 1251 defer args.MountNamespace.DecRef(ctx) 1252 args.Envv, err = user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) 1253 if err != nil { 1254 return 0, err 1255 } 1256 args.PIDNamespace = tg.PIDNamespace() 1257 1258 args.Limits, err = createLimitSet(l.root.spec, specutils.TPUProxyIsEnabled(l.root.spec, l.root.conf)) 1259 if err != nil { 1260 return 0, fmt.Errorf("creating limits: %w", err) 1261 } 1262 1263 // Start the process. 1264 proc := control.Proc{Kernel: l.k} 1265 newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args) 1266 if err != nil { 1267 return 0, err 1268 } 1269 1270 eid := execID{cid: args.ContainerID, pid: tgid} 1271 l.processes[eid] = &execProcess{ 1272 tg: newTG, 1273 tty: ttyFile, 1274 } 1275 log.Debugf("updated processes: %v", l.processes) 1276 1277 return tgid, nil 1278 } 1279 1280 // waitContainer waits for the init process of a container to exit. 1281 func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { 1282 // Don't defer unlock, as doing so would make it impossible for 1283 // multiple clients to wait on the same container. 1284 key := execID{cid: cid} 1285 tg, err := l.threadGroupFromID(key) 1286 if err != nil { 1287 l.mu.Lock() 1288 // Extra handling is needed if the container is restoring. 1289 if l.state != restoring { 1290 l.mu.Unlock() 1291 return err 1292 } 1293 // Container could be restoring, first check if container exists. 1294 if _, err := l.findProcessLocked(key); err != nil { 1295 l.mu.Unlock() 1296 return err 1297 } 1298 log.Infof("Waiting for container being restored, CID: %q", cid) 1299 l.restoreWaiters.Wait() 1300 l.mu.Unlock() 1301 1302 log.Infof("Restore is completed, trying to wait for container %q again.", cid) 1303 return l.waitContainer(cid, waitStatus) 1304 } 1305 1306 // If the thread either has already exited or exits during waiting, 1307 // consider the container exited. 1308 ws := l.wait(tg) 1309 *waitStatus = ws 1310 1311 // Check for leaks and write coverage report after the root container has 1312 // exited. This guarantees that the report is written in cases where the 1313 // sandbox is killed by a signal after the ContMgrWait request is completed. 1314 if l.root.procArgs.ContainerID == cid { 1315 // All sentry-created resources should have been released at this point. 1316 _ = coverage.Report() 1317 } 1318 return nil 1319 } 1320 1321 func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { 1322 if tgid <= 0 { 1323 return fmt.Errorf("PID (%d) must be positive", tgid) 1324 } 1325 1326 // Try to find a process that was exec'd 1327 eid := execID{cid: cid, pid: tgid} 1328 execTG, err := l.threadGroupFromID(eid) 1329 if err == nil { 1330 ws := l.wait(execTG) 1331 *waitStatus = ws 1332 1333 l.mu.Lock() 1334 delete(l.processes, eid) 1335 log.Debugf("updated processes (removal): %v", l.processes) 1336 l.mu.Unlock() 1337 return nil 1338 } 1339 1340 // The caller may be waiting on a process not started directly via exec. 1341 // In this case, find the process in the container's PID namespace. 
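	// The container's init process identifies the PID namespace in which tgid is
	// resolved.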
1342 initTG, err := l.threadGroupFromID(execID{cid: cid}) 1343 if err != nil { 1344 return fmt.Errorf("waiting for PID %d: %w", tgid, err) 1345 } 1346 tg := initTG.PIDNamespace().ThreadGroupWithID(tgid) 1347 if tg == nil { 1348 return fmt.Errorf("waiting for PID %d: no such process", tgid) 1349 } 1350 if tg.Leader().ContainerID() != cid { 1351 return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID()) 1352 } 1353 ws := l.wait(tg) 1354 *waitStatus = ws 1355 return nil 1356 } 1357 1358 // wait waits for the process with TGID 'tgid' in a container's PID namespace 1359 // to exit. 1360 func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 { 1361 tg.WaitExited() 1362 return uint32(tg.ExitStatus()) 1363 } 1364 1365 // WaitForStartSignal waits for a start signal from the control server. 1366 func (l *Loader) WaitForStartSignal() { 1367 <-l.ctrl.manager.startChan 1368 } 1369 1370 // WaitExit waits for the root container to exit, and returns its exit status. 1371 func (l *Loader) WaitExit() linux.WaitStatus { 1372 // Wait for container. 1373 l.k.WaitExited() 1374 1375 return l.k.GlobalInit().ExitStatus() 1376 } 1377 1378 func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID stack.UniqueID, userns *auth.UserNamespace) (*inet.Namespace, error) { 1379 // Create an empty network stack because the network namespace may be empty at 1380 // this point. Netns is configured before Run() is called. Netstack is 1381 // configured using a control uRPC message. Host network is configured inside 1382 // Run(). 1383 switch conf.Network { 1384 case config.NetworkHost: 1385 // If configured for raw socket support with host network 1386 // stack, make sure that we have CAP_NET_RAW the host, 1387 // otherwise we can't make raw sockets. 1388 if conf.EnableRaw && !specutils.HasCapabilities(capability.CAP_NET_RAW) { 1389 return nil, fmt.Errorf("configuring network=host with raw sockets requires CAP_NET_RAW capability") 1390 } 1391 // No network namespacing support for hostinet yet, hence creator is nil. 1392 return inet.NewRootNamespace(hostinet.NewStack(), nil, userns), nil 1393 1394 case config.NetworkNone, config.NetworkSandbox: 1395 s, err := newEmptySandboxNetworkStack(clock, uniqueID, conf.AllowPacketEndpointWrite) 1396 if err != nil { 1397 return nil, err 1398 } 1399 creator := &sandboxNetstackCreator{ 1400 clock: clock, 1401 uniqueID: uniqueID, 1402 allowPacketEndpointWrite: conf.AllowPacketEndpointWrite, 1403 } 1404 return inet.NewRootNamespace(s, creator, userns), nil 1405 1406 default: 1407 panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) 1408 } 1409 1410 } 1411 1412 func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID, allowPacketEndpointWrite bool) (inet.Stack, error) { 1413 netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} 1414 transProtos := []stack.TransportProtocolFactory{ 1415 tcp.NewProtocol, 1416 udp.NewProtocol, 1417 icmp.NewProtocol4, 1418 icmp.NewProtocol6, 1419 } 1420 s := netstack.Stack{Stack: stack.New(stack.Options{ 1421 NetworkProtocols: netProtos, 1422 TransportProtocols: transProtos, 1423 Clock: clock, 1424 Stats: netstack.Metrics, 1425 HandleLocal: true, 1426 // Enable raw sockets for users with sufficient 1427 // privileges. 
1428 RawFactory: raw.EndpointFactory{}, 1429 AllowPacketEndpointWrite: allowPacketEndpointWrite, 1430 UniqueID: uniqueID, 1431 DefaultIPTables: netfilter.DefaultLinuxTables, 1432 })} 1433 1434 // Enable SACK Recovery. 1435 { 1436 opt := tcpip.TCPSACKEnabled(true) 1437 if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { 1438 return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) 1439 } 1440 } 1441 1442 // Set default TTLs as required by socket/netstack. 1443 { 1444 opt := tcpip.DefaultTTLOption(netstack.DefaultTTL) 1445 if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil { 1446 return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err) 1447 } 1448 if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil { 1449 return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err) 1450 } 1451 } 1452 1453 // Enable Receive Buffer Auto-Tuning. 1454 { 1455 opt := tcpip.TCPModerateReceiveBufferOption(true) 1456 if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { 1457 return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) 1458 } 1459 } 1460 1461 return &s, nil 1462 } 1463 1464 // sandboxNetstackCreator implements kernel.NetworkStackCreator. 1465 // 1466 // +stateify savable 1467 type sandboxNetstackCreator struct { 1468 clock tcpip.Clock 1469 uniqueID stack.UniqueID 1470 allowPacketEndpointWrite bool 1471 } 1472 1473 // CreateStack implements kernel.NetworkStackCreator.CreateStack. 1474 func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { 1475 s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID, f.allowPacketEndpointWrite) 1476 if err != nil { 1477 return nil, err 1478 } 1479 1480 // Setup loopback. 1481 n := &Network{Stack: s.(*netstack.Stack).Stack} 1482 nicID := tcpip.NICID(f.uniqueID.UniqueID()) 1483 link := DefaultLoopbackLink 1484 linkEP := ethernet.New(loopback.New()) 1485 opts := stack.NICOptions{ 1486 Name: link.Name, 1487 DeliverLinkPackets: true, 1488 } 1489 1490 if err := n.createNICWithAddrs(nicID, linkEP, opts, link.Addresses); err != nil { 1491 return nil, err 1492 } 1493 1494 return s, nil 1495 } 1496 1497 // signal sends a signal to one or more processes in a container. If PID is 0, 1498 // then the container init process is used. Depending on the SignalDeliveryMode 1499 // option, the signal may be sent directly to the indicated process, to all 1500 // processes in the container, or to the foreground process group. pid is 1501 // relative to the root PID namespace, not the container's. 
1502 func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
1503 	if pid < 0 {
1504 		return fmt.Errorf("PID (%d) must be positive", pid)
1505 	}
1506
1507 	switch mode {
1508 	case DeliverToProcess:
1509 		if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
1510 			return fmt.Errorf("signaling process in container %q PID %d: %w", cid, pid, err)
1511 		}
1512 		return nil
1513
1514 	case DeliverToForegroundProcessGroup:
1515 		if err := l.signalForegrondProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
1516 			return fmt.Errorf("signaling foreground process group in container %q PID %d: %w", cid, pid, err)
1517 		}
1518 		return nil
1519
1520 	case DeliverToAllProcesses:
1521 		if pid != 0 {
1522 			return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
1523 		}
1524 		// Check that the container has actually started before signaling it.
1525 		if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
1526 			return err
1527 		}
1528 		if err := l.signalAllProcesses(cid, signo); err != nil {
1529 			return fmt.Errorf("signaling all processes in container %q: %w", cid, err)
1530 		}
1531 		return nil
1532
1533 	default:
1534 		panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
1535 	}
1536 }
1537
1538 // signalProcess sends a signal to a process in the given container. tgid is
1539 // relative to the root PID namespace, not the container's.
1540 func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
1541 	execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
1542 	if err == nil {
1543 		// Send signal directly to the identified process.
1544 		return l.k.SendExternalSignalThreadGroup(execTG, &linux.SignalInfo{Signo: signo})
1545 	}
1546
1547 	// The caller may be signaling a process not started directly via exec.
1548 	// In this case, find the process and check that the process belongs to the
1549 	// container in question.
1550 	tg := l.k.RootPIDNamespace().ThreadGroupWithID(tgid)
1551 	if tg == nil {
1552 		return fmt.Errorf("no such process with PID %d", tgid)
1553 	}
1554 	if tg.Leader().ContainerID() != cid {
1555 		return fmt.Errorf("process %d belongs to a different container: %q", tgid, tg.Leader().ContainerID())
1556 	}
1557 	return l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: signo})
1558 }
1559
1560 // signalForegrondProcessGroup looks up the foreground process group from the
1561 // TTY for the given "tgid" inside container "cid", and sends the signal to it.
1562 func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
1563 	l.mu.Lock()
1564 	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
1565 	if err != nil {
1566 		l.mu.Unlock()
1567 		return fmt.Errorf("no thread group found: %w", err)
1568 	}
1569 	if tg == nil {
1570 		l.mu.Unlock()
1571 		return fmt.Errorf("container %q not started", cid)
1572 	}
1573
1574 	tty, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
1575 	l.mu.Unlock()
1576 	if err != nil {
1577 		return fmt.Errorf("no thread group found: %w", err)
1578 	}
1579 	if tty == nil {
1580 		return fmt.Errorf("no TTY attached")
1581 	}
1582 	pg := tty.ForegroundProcessGroup()
1583 	si := &linux.SignalInfo{Signo: signo}
1584 	if pg == nil {
1585 		// No foreground process group has been set. Signal the
1586 		// original thread group.
1587 		log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
1588 		return l.k.SendExternalSignalThreadGroup(tg, si)
1589 	}
1590 	// Send the signal to all processes in the process group.
1591 	return l.k.SendExternalSignalProcessGroup(pg, si)
1592 }
1593
1594 // signalAllProcesses sends a signal to all processes that belong to the
1595 // specified container. It's a no-op if the container hasn't started or has exited.
1596 func (l *Loader) signalAllProcesses(cid string, signo int32) error {
1597 	// Pause the kernel to prevent new processes from being created while
1598 	// the signal is delivered. This prevents process leaks when SIGKILL is
1599 	// sent to the entire container.
1600 	l.k.Pause()
1601 	defer l.k.Unpause()
1602 	return l.k.SendContainerSignal(cid, &linux.SignalInfo{Signo: signo})
1603 }
1604
1605 // threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it
1606 // acquires the mutex before calling it and fails if the container hasn't
1607 // started yet.
1608 func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
1609 	l.mu.Lock()
1610 	defer l.mu.Unlock()
1611 	tg, err := l.tryThreadGroupFromIDLocked(key)
1612 	if err != nil {
1613 		return nil, err
1614 	}
1615 	if tg == nil {
1616 		return nil, fmt.Errorf("container %q not started", key.cid)
1617 	}
1618 	return tg, nil
1619 }
1620
1621 // tryThreadGroupFromIDLocked returns the thread group for the given execution
1622 // ID. It may return nil if the container has not started yet. Returns an
1623 // error if the execution ID is invalid or the container cannot be found
1624 // (it may have been deleted). Caller must hold 'mu'.
1625 func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
1626 	ep, err := l.findProcessLocked(key)
1627 	if err != nil {
1628 		return nil, err
1629 	}
1630 	return ep.tg, nil
1631 }
1632
1633 // ttyFromIDLocked returns the TTY file for the given execution ID. It may
1634 // return nil if the container has not started yet. Returns an error if the
1635 // execution ID is invalid or the container cannot be found (it may have been
1636 // deleted). Caller must hold 'mu'.
1637 func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileDescription, error) {
1638 	ep, err := l.findProcessLocked(key)
1639 	if err != nil {
1640 		return nil, err
1641 	}
1642 	return ep.tty, nil
1643 }
1644
1645 func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD, passFDs []fdMapping, user specs.User, containerName string) (*kernel.FDTable, *host.TTYFileDescription, error) {
1646 	if len(stdioFDs) != 3 {
1647 		return nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
1648 	}
1649 	fdMap := map[int]*fd.FD{
1650 		0: stdioFDs[0],
1651 		1: stdioFDs[1],
1652 		2: stdioFDs[2],
1653 	}
1654
1655 	// Create the entries for the host files that were passed to our app.
1656 	for _, customFD := range passFDs {
1657 		if customFD.guest < 0 {
1658 			return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater")
1659 		}
1660 		fdMap[customFD.guest] = customFD.host
1661 	}
1662
1663 	k := kernel.KernelFromContext(ctx)
1664 	fdTable := k.NewFDTable()
1665 	ttyFile, err := fdimport.Import(ctx, fdTable, console, auth.KUID(user.UID), auth.KGID(user.GID), fdMap, containerName)
1666 	if err != nil {
1667 		fdTable.DecRef(ctx)
1668 		return nil, nil, err
1669 	}
1670 	return fdTable, ttyFile, nil
1671 }
1672
1673 // portForward initiates a port forwarding connection in the sandbox. Each entry
1674 // in portForwardProxies represents two connections copying to each other (read
1675 // ends to write ends) in goroutines. The proxies are stored and can be cleaned
1676 // up, or clean up after themselves if the connection is broken.
1677 func (l *Loader) portForward(opts *PortForwardOpts) error {
1678 	// Validate that we have a stream FD to write to. If not, there is a
1679 	// misbehaving urpc client or a bug has occurred.
1680 	if len(opts.Files) != 1 {
1681 		return fmt.Errorf("stream FD is required for port forward")
1682 	}
1683
1684 	l.mu.Lock()
1685 	defer l.mu.Unlock()
1686
1687 	cid := opts.ContainerID
1688 	tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
1689 	if err != nil {
1690 		return fmt.Errorf("failed to get threadgroup from %q: %w", cid, err)
1691 	}
1692 	if tg == nil {
1693 		return fmt.Errorf("container %q not started", cid)
1694 	}
1695
1696 	// Import the fd for the UDS.
1697 	ctx := l.k.SupervisorContext()
1698 	fd, err := l.importFD(ctx, opts.Files[0])
1699 	if err != nil {
1700 		return fmt.Errorf("importing stream fd: %w", err)
1701 	}
1702 	cu := cleanup.Make(func() { fd.DecRef(ctx) })
1703 	defer cu.Clean()
1704
1705 	fdConn := pf.NewFileDescriptionConn(fd)
1706
1707 	// Create a proxy to forward data between the fdConn and the sandboxed application.
1708 	pair := pf.ProxyPair{To: fdConn}
1709
1710 	switch l.root.conf.Network {
1711 	case config.NetworkSandbox:
1712 		stack := l.k.RootNetworkNamespace().Stack().(*netstack.Stack).Stack
1713 		nsConn, err := pf.NewNetstackConn(stack, opts.Port)
1714 		if err != nil {
1715 			return fmt.Errorf("creating netstack port forward connection: %w", err)
1716 		}
1717 		pair.From = nsConn
1718 	case config.NetworkHost:
1719 		hConn, err := pf.NewHostInetConn(opts.Port)
1720 		if err != nil {
1721 			return fmt.Errorf("creating hostinet port forward connection: %w", err)
1722 		}
1723 		pair.From = hConn
1724 	default:
1725 		return fmt.Errorf("unsupported network type %q for container %q", l.root.conf.Network, cid)
1726 	}
1727 	cu.Release()
1728 	proxy := pf.NewProxy(pair, opts.ContainerID)
1729
1730 	// Add to the list of port forward connections and remove when the
1731 	// connection closes.
1732 	l.portForwardProxies = append(l.portForwardProxies, proxy)
1733 	proxy.AddCleanup(func() {
1734 		l.mu.Lock()
1735 		defer l.mu.Unlock()
1736 		for i := range l.portForwardProxies {
1737 			if l.portForwardProxies[i] == proxy {
1738 				l.portForwardProxies = append(l.portForwardProxies[:i], l.portForwardProxies[i+1:]...)
1739 				break
1740 			}
1741 		}
1742 	})
1743
1744 	// Start forwarding on the connection.
1745 	proxy.Start(ctx)
1746 	return nil
1747 }
1748
1749 // importFD generically imports a host file descriptor without adding it to any
1750 // fd table.
1751 func (l *Loader) importFD(ctx context.Context, f *os.File) (*vfs.FileDescription, error) {
1752 	hostFD, err := fd.NewFromFile(f)
1753 	if err != nil {
1754 		return nil, err
1755 	}
1756 	defer hostFD.Close()
1757 	fd, err := host.NewFD(ctx, l.k.HostMount(), hostFD.FD(), &host.NewFDOptions{
1758 		Savable:      false, // We disconnect and close on save.
1759 		IsTTY:        false,
1760 		VirtualOwner: false, // FD not visible to the sandboxed app so user can't be changed.
1761 	})
1762
1763 	if err != nil {
1764 		return nil, err
1765 	}
1766 	hostFD.Release()
1767 	return fd, nil
1768 }
1769
1770 func (l *Loader) containerCount() int {
1771 	l.mu.Lock()
1772 	defer l.mu.Unlock()
1773
1774 	containers := 0
1775 	for id := range l.processes {
1776 		if id.pid == 0 {
1777 			// pid==0 represents the init process of a container. There is
1778 			// only one such process per container.
1779 			containers++
1780 		}
1781 	}
1782 	return containers
1783 }
1784
1785 func (l *Loader) pidsCount(cid string) (int, error) {
1786 	l.mu.Lock()
1787 	defer l.mu.Unlock()
1788
1789 	if _, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}); err != nil {
1790 		// Container doesn't exist.
1791 		return 0, err
1792 	}
1793 	return l.k.TaskSet().Root.NumTasksPerContainer(cid), nil
1794 }
1795
1796 func (l *Loader) networkStats() ([]*NetworkInterface, error) {
1797 	var stats []*NetworkInterface
1798 	stack := l.k.RootNetworkNamespace().Stack()
1799 	for _, i := range stack.Interfaces() {
1800 		var stat inet.StatDev // Fields follow the /proc/net/dev column order.
1801 		if err := stack.Statistics(&stat, i.Name); err != nil {
1802 			return nil, err
1803 		}
1804 		stats = append(stats, &NetworkInterface{
1805 			Name:      i.Name,
1806 			RxBytes:   stat[0],
1807 			RxPackets: stat[1],
1808 			RxErrors:  stat[2],
1809 			RxDropped: stat[3],
1810 			TxBytes:   stat[8],
1811 			TxPackets: stat[9],
1812 			TxErrors:  stat[10],
1813 			TxDropped: stat[11],
1814 		})
1815 	}
1816 	return stats, nil
1817 }
1818
1819 func (l *Loader) findProcessLocked(key execID) (*execProcess, error) {
1820 	ep := l.processes[key]
1821 	if ep == nil {
1822 		return nil, fmt.Errorf("container %q not found", key.cid)
1823 	}
1824 	return ep, nil
1825 }
1826
1827 func (l *Loader) registerContainer(spec *specs.Spec, cid string) string {
1828 	l.mu.Lock()
1829 	defer l.mu.Unlock()
1830
1831 	return l.registerContainerLocked(spec, cid)
1832 }
1833
1834 func (l *Loader) registerContainerLocked(spec *specs.Spec, cid string) string {
1835 	containerName := specutils.ContainerName(spec)
1836 	if len(containerName) == 0 {
1837 		// If no name was provided, require containers to be restored in the same order
1838 		// they were created.
1839 		containerName = "__no_name_" + strconv.Itoa(len(l.containerIDs))
1840 	}
1841
1842 	l.containerIDs[containerName] = cid
1843 	return containerName
1844 }
1845
1846 func (l *Loader) containerRuntimeState(cid string) ContainerRuntimeState {
1847 	l.mu.Lock()
1848 	defer l.mu.Unlock()
1849 	exec, ok := l.processes[execID{cid: cid}]
1850 	if !ok {
1851 		// Can't distinguish between an invalid CID and a stopped container;
1852 		// assume the CID is valid.
1853 		return RuntimeStateStopped
1854 	}
1855 	if exec.tg == nil {
1856 		// Container has no thread group assigned, so it has not started yet.
1857 		return RuntimeStateCreating
1858 	}
1859 	if exec.tg.Leader().ExitState() == kernel.TaskExitNone {
1860 		// Init process is still running.
1861 		return RuntimeStateRunning
1862 	}
1863 	// Init process has stopped, but no one has called wait on it yet.
1864 	return RuntimeStateStopped
1865 }
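
// The sketch below is illustrative and not part of loader.go: it shows how
// package-internal code that already holds a started *Loader might drive the
// signal delivery modes dispatched by Loader.signal above. The helper name
// sendSigterm and its error handling are assumptions; l.signal, the
// delivery-mode constants, and the golang.org/x/sys/unix import all come from
// this file.
func sendSigterm(l *Loader, cid string, pid int32) error {
	// Deliver SIGTERM to a single process. The PID is interpreted relative
	// to the root PID namespace, as documented on signalProcess.
	if err := l.signal(cid, pid, int32(unix.SIGTERM), DeliverToProcess); err != nil {
		return err
	}
	// Deliver SIGTERM to every process in the container. In this mode the
	// PID argument must be 0, otherwise Loader.signal returns an error.
	return l.signal(cid, 0, int32(unix.SIGTERM), DeliverToAllProcesses)
}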