gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/container/container.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package container creates and manipulates containers. 16 package container 17 18 import ( 19 "bufio" 20 "context" 21 "errors" 22 "fmt" 23 "io/ioutil" 24 "os" 25 "os/exec" 26 "path" 27 "regexp" 28 "strconv" 29 "strings" 30 "syscall" 31 "time" 32 33 "github.com/cenkalti/backoff" 34 specs "github.com/opencontainers/runtime-spec/specs-go" 35 "golang.org/x/sys/unix" 36 "gvisor.dev/gvisor/pkg/abi/linux" 37 "gvisor.dev/gvisor/pkg/cleanup" 38 "gvisor.dev/gvisor/pkg/log" 39 "gvisor.dev/gvisor/pkg/sentry/control" 40 "gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs" 41 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" 42 "gvisor.dev/gvisor/pkg/sentry/pgalloc" 43 "gvisor.dev/gvisor/pkg/sighandling" 44 "gvisor.dev/gvisor/pkg/state/statefile" 45 "gvisor.dev/gvisor/runsc/boot" 46 "gvisor.dev/gvisor/runsc/cgroup" 47 "gvisor.dev/gvisor/runsc/config" 48 "gvisor.dev/gvisor/runsc/console" 49 "gvisor.dev/gvisor/runsc/donation" 50 "gvisor.dev/gvisor/runsc/sandbox" 51 "gvisor.dev/gvisor/runsc/specutils" 52 ) 53 54 const cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent" 55 56 // validateID validates the container id. 57 func validateID(id string) error { 58 // See libcontainer/factory_linux.go. 59 idRegex := regexp.MustCompile(`^[\w+\.-]+$`) 60 if !idRegex.MatchString(id) { 61 return fmt.Errorf("invalid container id: %v", id) 62 } 63 return nil 64 } 65 66 // Container represents a containerized application. When running, the 67 // container is associated with a single Sandbox. 68 // 69 // Container metadata can be saved and loaded to disk. Within a root directory, 70 // we maintain subdirectories for each container named with the container id. 71 // The container metadata is stored as a json within the container directory 72 // in a file named "meta.json". This metadata format is defined by us and is 73 // not part of the OCI spec. 74 // 75 // Containers must write their metadata files after any change to their internal 76 // states. The entire container directory is deleted when the container is 77 // destroyed. 78 // 79 // When the container is stopped, all processes that belong to the container 80 // must be stopped before Destroy() returns. containerd makes roughly the 81 // following calls to stop a container: 82 // - First it attempts to kill the container process with 83 // 'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a 84 // separate thread, it's waiting on the container. As soon as the wait 85 // returns, it moves on to the next step: 86 // - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to 87 // the container. 'kill --all SIGKILL' waits for all processes before 88 // returning. 89 // - Containerd waits for stdin, stdout and stderr to drain and be closed. 90 // - It calls 'runsc delete'. 
The runc implementation sends 'kill --all SIGKILL' once 91 // again just to be sure, waits, and then proceeds with the remaining teardown. 92 // 93 // Container is thread-unsafe. 94 type Container struct { 95 // ID is the container ID. 96 ID string `json:"id"` 97 98 // Spec is the OCI runtime spec that configures this container. 99 Spec *specs.Spec `json:"spec"` 100 101 // BundleDir is the directory containing the container bundle. 102 BundleDir string `json:"bundleDir"` 103 104 // CreatedAt is the time the container was created. 105 CreatedAt time.Time `json:"createdAt"` 106 107 // Owner is the container owner. 108 Owner string `json:"owner"` 109 110 // ConsoleSocket is the path to a unix domain socket that will receive 111 // the console FD. 112 ConsoleSocket string `json:"consoleSocket"` 113 114 // Status is the current container Status. 115 Status Status `json:"status"` 116 117 // GoferPid is the PID of the gofer running alongside the sandbox. May 118 // be 0 if the gofer has been killed. 119 GoferPid int `json:"goferPid"` 120 121 // Sandbox is the sandbox this container is running in. It's set when the 122 // container is created and reset when the sandbox is destroyed. 123 Sandbox *sandbox.Sandbox `json:"sandbox"` 124 125 // CompatCgroup has the cgroup configuration for the container. For the single 126 // container case, the container cgroup is set in `c.Sandbox` only. CompatCgroup 127 // is only set for multi-container, where the `c.Sandbox` cgroup represents 128 // the entire pod. 129 // 130 // Note that CompatCgroup is created only for compatibility with tools 131 // that expect container cgroups to exist. Setting limits here makes no change 132 // to the container in question. 133 CompatCgroup cgroup.CgroupJSON `json:"compatCgroup"` 134 135 // Saver handles load from/save to the state file safely from multiple 136 // processes. 137 Saver StateFile `json:"saver"` 138 139 // GoferMountConfs contains information about how the gofer mounts have been 140 // overlaid (with tmpfs or overlayfs). The first entry is for rootfs and the 141 // following entries are for bind mounts in Spec.Mounts (in the same order). 142 GoferMountConfs boot.GoferMountConfFlags `json:"goferMountConfs"` 143 144 // 145 // Fields below this line are not saved in the state file and will not 146 // be preserved across commands. 147 // 148 149 // goferIsChild is set if a gofer process is a child of the current process. 150 // 151 // This field isn't saved to json, because only the creator of a gofer 152 // process will have it as a child process. 153 goferIsChild bool `nojson:"true"` 154 } 155 156 // Args is used to configure a new container. 157 type Args struct { 158 // ID is the container unique identifier. 159 ID string 160 161 // Spec is the OCI spec that describes the container. 162 Spec *specs.Spec 163 164 // BundleDir is the directory containing the container bundle. 165 BundleDir string 166 167 // ConsoleSocket is the path to a unix domain socket that will receive 168 // the console FD. It may be empty. 169 ConsoleSocket string 170 171 // PIDFile is the filename where the container's root process PID will be 172 // written to. It may be empty. 173 PIDFile string 174 175 // UserLog is the filename to send user-visible logs to. It may be empty. 176 // 177 // It only applies for the init container. 178 UserLog string 179 180 // Attached indicates that the sandbox lifecycle is attached to the caller. 181 // If the caller exits, the sandbox should exit too. 182 // 183 // It only applies for the init container.
184 Attached bool 185 186 // PassFiles are user-supplied files from the host to be exposed to the 187 // sandboxed app. 188 PassFiles map[int]*os.File 189 190 // ExecFile is the host file used for program execution. 191 ExecFile *os.File 192 } 193 194 // New creates the container in a new Sandbox process, unless the metadata 195 // indicates that an existing Sandbox should be used. The caller must call 196 // Destroy() on the container. 197 func New(conf *config.Config, args Args) (*Container, error) { 198 log.Debugf("Create container, cid: %s, rootDir: %q", args.ID, conf.RootDir) 199 if err := validateID(args.ID); err != nil { 200 return nil, err 201 } 202 203 if err := os.MkdirAll(conf.RootDir, 0711); err != nil { 204 return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err) 205 } 206 207 if err := modifySpecForDirectfs(conf, args.Spec); err != nil { 208 return nil, fmt.Errorf("failed to modify spec for directfs: %v", err) 209 } 210 211 sandboxID := args.ID 212 if !isRoot(args.Spec) { 213 var ok bool 214 sandboxID, ok = specutils.SandboxID(args.Spec) 215 if !ok { 216 return nil, fmt.Errorf("no sandbox ID found when creating container") 217 } 218 } 219 220 c := &Container{ 221 ID: args.ID, 222 Spec: args.Spec, 223 ConsoleSocket: args.ConsoleSocket, 224 BundleDir: args.BundleDir, 225 Status: Creating, 226 CreatedAt: time.Now(), 227 Owner: os.Getenv("USER"), 228 Saver: StateFile{ 229 RootDir: conf.RootDir, 230 ID: FullID{ 231 SandboxID: sandboxID, 232 ContainerID: args.ID, 233 }, 234 }, 235 } 236 // The Cleanup object cleans up partially created containers when an error 237 // occurs. Any errors occurring during cleanup itself are ignored. 238 cu := cleanup.Make(func() { _ = c.Destroy() }) 239 defer cu.Clean() 240 241 // Lock the container metadata file to prevent concurrent creations of 242 // containers with the same id. 243 if err := c.Saver.LockForNew(); err != nil { 244 return nil, fmt.Errorf("cannot lock container metadata file: %w", err) 245 } 246 defer c.Saver.UnlockOrDie() 247 248 // If the metadata annotations indicate that this container should be started 249 // in an existing sandbox, we must do so. These are the possible metadata 250 // annotation states: 251 // 1. No annotations: it means that there is a single container and this 252 // container is obviously the root. Both container and sandbox share the 253 // ID. 254 // 2. Container type == sandbox: it means this is the root container 255 // starting the sandbox. Both container and sandbox share the same ID. 256 // 3. Container type == container: it means this is a subcontainer of an 257 // already started sandbox. In this case, container ID is different than 258 // the sandbox ID. 259 if isRoot(args.Spec) { 260 log.Debugf("Creating new sandbox for container, cid: %s", args.ID) 261 262 if args.Spec.Linux == nil { 263 args.Spec.Linux = &specs.Linux{} 264 } 265 // Don't force the use of cgroups in tests because they lack permission to do so. 266 if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 267 args.Spec.Linux.CgroupsPath = "/" + args.ID 268 } 269 var subCgroup, parentCgroup, containerCgroup cgroup.Cgroup 270 if !conf.IgnoreCgroups { 271 var err error 272 273 // Create and join cgroup before processes are created to ensure they are 274 // part of the cgroup from the start (and all their children processes). 
275 parentCgroup, subCgroup, err = c.setupCgroupForRoot(conf, args.Spec) 276 if err != nil { 277 return nil, fmt.Errorf("cannot set up cgroup for root: %w", err) 278 } 279 // Join the child cgroup when using cgroupfs. Joining non leaf-node 280 // cgroups is illegal in cgroupsv2 and will return EBUSY. 281 if subCgroup != nil && !conf.SystemdCgroup && cgroup.IsOnlyV2() { 282 containerCgroup = subCgroup 283 } else { 284 containerCgroup = parentCgroup 285 } 286 } 287 c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup} 288 mountHints, err := boot.NewPodMountHints(args.Spec) 289 if err != nil { 290 return nil, fmt.Errorf("error creating pod mount hints: %w", err) 291 } 292 rootfsHint, err := boot.NewRootfsHint(args.Spec) 293 if err != nil { 294 return nil, fmt.Errorf("error creating rootfs hint: %w", err) 295 } 296 goferFilestores, goferConfs, err := c.createGoferFilestores(conf.GetOverlay2(), mountHints, rootfsHint) 297 if err != nil { 298 return nil, err 299 } 300 if !goferConfs[0].ShouldUseLisafs() && specutils.GPUFunctionalityRequestedViaHook(args.Spec, conf) { 301 // nvidia-container-runtime-hook attempts to populate the container 302 // rootfs with NVIDIA libraries and devices. With EROFS, spec.Root.Path 303 // points to an empty directory and populating that has no effect. 304 return nil, fmt.Errorf("nvidia-container-runtime-hook cannot be used together with non-lisafs backed root mount") 305 } 306 c.GoferMountConfs = goferConfs 307 if err := nvProxyPreGoferHostSetup(args.Spec, conf); err != nil { 308 return nil, err 309 } 310 if err := runInCgroup(containerCgroup, func() error { 311 ioFiles, devIOFile, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached, rootfsHint) 312 if err != nil { 313 return fmt.Errorf("cannot create gofer process: %w", err) 314 } 315 316 // Start a new sandbox for this container. Any errors after this point 317 // must destroy the container. 318 sandArgs := &sandbox.Args{ 319 ID: sandboxID, 320 Spec: args.Spec, 321 BundleDir: args.BundleDir, 322 ConsoleSocket: args.ConsoleSocket, 323 UserLog: args.UserLog, 324 IOFiles: ioFiles, 325 DevIOFile: devIOFile, 326 MountsFile: specFile, 327 Cgroup: containerCgroup, 328 Attached: args.Attached, 329 GoferFilestoreFiles: goferFilestores, 330 GoferMountConfs: goferConfs, 331 MountHints: mountHints, 332 PassFiles: args.PassFiles, 333 ExecFile: args.ExecFile, 334 } 335 sand, err := sandbox.New(conf, sandArgs) 336 if err != nil { 337 return fmt.Errorf("cannot create sandbox: %w", err) 338 } 339 c.Sandbox = sand 340 return nil 341 342 }); err != nil { 343 return nil, err 344 } 345 } else { 346 log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID) 347 348 // Find the sandbox associated with this ID. 349 fullID := FullID{ 350 SandboxID: sandboxID, 351 ContainerID: sandboxID, 352 } 353 sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true}) 354 if err != nil { 355 return nil, fmt.Errorf("cannot load sandbox: %w", err) 356 } 357 c.Sandbox = sb.Sandbox 358 359 subCgroup, err := c.setupCgroupForSubcontainer(conf, args.Spec) 360 if err != nil { 361 return nil, err 362 } 363 c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup} 364 365 // If the console control socket file is provided, then create a new 366 // pty master/slave pair and send the TTY to the sandbox process. 367 var tty *os.File 368 if c.ConsoleSocket != "" { 369 // Create a new TTY pair and send the master on the provided socket. 
370 var err error 371 tty, err = console.NewWithSocket(c.ConsoleSocket) 372 if err != nil { 373 return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err) 374 } 375 // tty file is transferred to the sandbox, then it can be closed here. 376 defer tty.Close() 377 } 378 379 if err := c.Sandbox.CreateSubcontainer(conf, c.ID, tty); err != nil { 380 return nil, fmt.Errorf("cannot create subcontainer: %w", err) 381 } 382 } 383 c.changeStatus(Created) 384 385 // Save the metadata file. 386 if err := c.saveLocked(); err != nil { 387 return nil, err 388 } 389 390 // "If any prestart hook fails, the runtime MUST generate an error, 391 // stop and destroy the container" -OCI spec. 392 if c.Spec.Hooks != nil { 393 // Even though the hook name is Prestart, runc used to call it from create. 394 // For this reason, it's now deprecated, but the spec requires it to be 395 // called *before* CreateRuntime and CreateRuntime must be called in create. 396 // 397 // "For runtimes that implement the deprecated prestart hooks as 398 // createRuntime hooks, createRuntime hooks MUST be called after the 399 // prestart hooks." 400 if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { 401 return nil, err 402 } 403 if err := executeHooks(c.Spec.Hooks.CreateRuntime, c.State()); err != nil { 404 return nil, err 405 } 406 if len(c.Spec.Hooks.CreateContainer) > 0 { 407 log.Warningf("CreateContainer hook skipped because running inside container namespace is not supported") 408 } 409 } 410 411 // Write the PID file. Containerd considers the call to create complete after 412 // this file is created, so it must be the last thing we do. 413 if args.PIDFile != "" { 414 if err := ioutil.WriteFile(args.PIDFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil { 415 return nil, fmt.Errorf("error writing PID file: %v", err) 416 } 417 } 418 419 cu.Release() 420 return c, nil 421 } 422 423 // Start starts running the containerized process inside the sandbox. 424 func (c *Container) Start(conf *config.Config) error { 425 log.Debugf("Start container, cid: %s", c.ID) 426 return c.startImpl(conf, "start", c.Sandbox.StartRoot, c.Sandbox.StartSubcontainer) 427 } 428 429 // Restore takes a container and replaces its kernel and file system 430 // to restore a container from its state file. 431 func (c *Container) Restore(conf *config.Config, imagePath string, direct bool) error { 432 log.Debugf("Restore container, cid: %s", c.ID) 433 434 restore := func(conf *config.Config) error { 435 return c.Sandbox.Restore(conf, c.ID, imagePath, direct) 436 } 437 return c.startImpl(conf, "restore", restore, c.Sandbox.RestoreSubcontainer) 438 } 439 440 func (c *Container) startImpl(conf *config.Config, action string, startRoot func(conf *config.Config) error, startSub func(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestores []*os.File, devIOFile *os.File, goferConfs []boot.GoferMountConf) error) error { 441 if err := c.Saver.lock(BlockAcquire); err != nil { 442 return err 443 } 444 unlock := cleanup.Make(c.Saver.UnlockOrDie) 445 defer unlock.Clean() 446 447 if err := c.requireStatus(action, Created); err != nil { 448 return err 449 } 450 451 // "If any prestart hook fails, the runtime MUST generate an error, 452 // stop and destroy the container" -OCI spec. 
453 if c.Spec.Hooks != nil && len(c.Spec.Hooks.StartContainer) > 0 { 454 log.Warningf("StartContainer hook skipped because running inside container namespace is not supported") 455 } 456 457 if isRoot(c.Spec) { 458 if err := startRoot(conf); err != nil { 459 return err 460 } 461 } else { 462 rootfsHint, err := boot.NewRootfsHint(c.Spec) 463 if err != nil { 464 return fmt.Errorf("error creating rootfs hint: %w", err) 465 } 466 goferFilestores, goferConfs, err := c.createGoferFilestores(conf.GetOverlay2(), c.Sandbox.MountHints, rootfsHint) 467 if err != nil { 468 return err 469 } 470 c.GoferMountConfs = goferConfs 471 // Join cgroup to start gofer process to ensure it's part of the cgroup from 472 // the start (and all their children processes). 473 if err := runInCgroup(c.Sandbox.CgroupJSON.Cgroup, func() error { 474 // Create the gofer process. 475 goferFiles, devIOFile, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false, rootfsHint) 476 if err != nil { 477 return err 478 } 479 defer func() { 480 if mountsFile != nil { 481 _ = mountsFile.Close() 482 } 483 if devIOFile != nil { 484 _ = devIOFile.Close() 485 } 486 for _, f := range goferFiles { 487 _ = f.Close() 488 } 489 for _, f := range goferFilestores { 490 _ = f.Close() 491 } 492 }() 493 494 if mountsFile != nil { 495 cleanMounts, err := specutils.ReadMounts(mountsFile) 496 if err != nil { 497 return fmt.Errorf("reading mounts file: %v", err) 498 } 499 c.Spec.Mounts = cleanMounts 500 } 501 502 // Setup stdios if the container is not using terminal. Otherwise TTY was 503 // already setup in create. 504 var stdios []*os.File 505 if !c.Spec.Process.Terminal { 506 stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr} 507 } 508 509 return startSub(c.Spec, conf, c.ID, stdios, goferFiles, goferFilestores, devIOFile, goferConfs) 510 }); err != nil { 511 return err 512 } 513 } 514 515 // "If any poststart hook fails, the runtime MUST log a warning, but 516 // the remaining hooks and lifecycle continue as if the hook had 517 // succeeded" -OCI spec. 518 if c.Spec.Hooks != nil { 519 executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State()) 520 } 521 522 c.changeStatus(Running) 523 if err := c.saveLocked(); err != nil { 524 return err 525 } 526 527 // Release lock before adjusting OOM score because the lock is acquired there. 528 unlock.Clean() 529 530 // Adjust the oom_score_adj for sandbox. This must be done after saveLocked(). 531 if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Spec, c.Saver.RootDir, false); err != nil { 532 return err 533 } 534 535 // Set container's oom_score_adj to the gofer since it is dedicated to 536 // the container, in case the gofer uses up too much memory. 537 return c.adjustGoferOOMScoreAdj() 538 } 539 540 // Run is a helper that calls Create + Start + Wait. 541 func Run(conf *config.Config, args Args) (unix.WaitStatus, error) { 542 log.Debugf("Run container, cid: %s, rootDir: %q", args.ID, conf.RootDir) 543 c, err := New(conf, args) 544 if err != nil { 545 return 0, fmt.Errorf("creating container: %v", err) 546 } 547 // Clean up partially created container if an error occurs. 548 // Any errors returned by Destroy() itself are ignored. 549 cu := cleanup.Make(func() { 550 c.Destroy() 551 }) 552 defer cu.Clean() 553 554 if err := c.Start(conf); err != nil { 555 return 0, fmt.Errorf("starting container: %v", err) 556 } 557 558 // If we allocate a terminal, forward signals to the sandbox process. 559 // Otherwise, Ctrl+C will terminate this process and its children, 560 // including the terminal. 
561 if c.Spec.Process.Terminal { 562 stopForwarding := c.ForwardSignals(0, true /* fgProcess */) 563 defer stopForwarding() 564 } 565 566 if args.Attached { 567 return c.Wait() 568 } 569 cu.Release() 570 return 0, nil 571 } 572 573 // Execute runs the specified command in the container. It returns the PID of 574 // the newly created process. 575 func (c *Container) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) { 576 log.Debugf("Execute in container, cid: %s, args: %+v", c.ID, args) 577 if err := c.requireStatus("execute in", Created, Running); err != nil { 578 return 0, err 579 } 580 args.ContainerID = c.ID 581 return c.Sandbox.Execute(conf, args) 582 } 583 584 // Event returns events for the container. 585 func (c *Container) Event() (*boot.EventOut, error) { 586 log.Debugf("Getting events for container, cid: %s", c.ID) 587 if err := c.requireStatus("get events for", Created, Running, Paused); err != nil { 588 return nil, err 589 } 590 event, err := c.Sandbox.Event(c.ID) 591 if err != nil { 592 return nil, err 593 } 594 595 if len(event.ContainerUsage) > 0 { 596 // Some stats can utilize host cgroups for accuracy. 597 c.populateStats(event) 598 } 599 600 return event, nil 601 } 602 603 // PortForward starts port forwarding to the container. 604 func (c *Container) PortForward(opts *boot.PortForwardOpts) error { 605 if err := c.requireStatus("port forward", Running); err != nil { 606 return err 607 } 608 opts.ContainerID = c.ID 609 return c.Sandbox.PortForward(opts) 610 } 611 612 // SandboxPid returns the Getpid of the sandbox the container is running in, or -1 if the 613 // container is not running. 614 func (c *Container) SandboxPid() int { 615 if err := c.requireStatus("get PID", Created, Running, Paused); err != nil { 616 return -1 617 } 618 return c.Sandbox.Getpid() 619 } 620 621 // Wait waits for the container to exit, and returns its WaitStatus. 622 // Call to wait on a stopped container is needed to retrieve the exit status 623 // and wait returns immediately. 624 func (c *Container) Wait() (unix.WaitStatus, error) { 625 log.Debugf("Wait on container, cid: %s", c.ID) 626 ws, err := c.Sandbox.Wait(c.ID) 627 if err == nil { 628 // Wait succeeded, container is not running anymore. 629 c.changeStatus(Stopped) 630 } 631 return ws, err 632 } 633 634 // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and 635 // returns its WaitStatus. 636 func (c *Container) WaitRootPID(pid int32) (unix.WaitStatus, error) { 637 log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID) 638 if !c.IsSandboxRunning() { 639 return 0, fmt.Errorf("sandbox is not running") 640 } 641 return c.Sandbox.WaitPID(c.Sandbox.ID, pid) 642 } 643 644 // WaitPID waits for process 'pid' in the container's PID namespace and returns 645 // its WaitStatus. 646 func (c *Container) WaitPID(pid int32) (unix.WaitStatus, error) { 647 log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID) 648 if !c.IsSandboxRunning() { 649 return 0, fmt.Errorf("sandbox is not running") 650 } 651 return c.Sandbox.WaitPID(c.ID, pid) 652 } 653 654 // SignalContainer sends the signal to the container. If all is true and signal 655 // is SIGKILL, then waits for all processes to exit before returning. 656 // SignalContainer returns an error if the container is already stopped. 657 // TODO(b/113680494): Distinguish different error types. 
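// A hedged usage sketch, from a caller's point of view (for example a
// `runsc kill --all`-style code path). The Load call and the bare FullID below
// are simplified assumptions for illustration, not an exact copy of runsc's
// kill command:
//
//	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
//	if err != nil {
//		return err
//	}
//	// Deliver SIGKILL to every process in the container and wait for all of
//	// them to exit before returning.
//	return c.SignalContainer(unix.SIGKILL, true /* all */)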
658 func (c *Container) SignalContainer(sig unix.Signal, all bool) error { 659 log.Debugf("Signal container, cid: %s, signal: %v (%d)", c.ID, sig, sig) 660 // Signaling container in Stopped state is allowed. When all=false, 661 // an error will be returned anyway; when all=true, this allows 662 // sending signals to other processes inside the container even 663 // after the init process exits. This is especially useful for 664 // container cleanup. 665 if err := c.requireStatus("signal", Running, Stopped); err != nil { 666 return err 667 } 668 if !c.IsSandboxRunning() { 669 return fmt.Errorf("sandbox is not running") 670 } 671 return c.Sandbox.SignalContainer(c.ID, sig, all) 672 } 673 674 // SignalProcess sends sig to a specific process in the container. 675 func (c *Container) SignalProcess(sig unix.Signal, pid int32) error { 676 log.Debugf("Signal process %d in container, cid: %s, signal: %v (%d)", pid, c.ID, sig, sig) 677 if err := c.requireStatus("signal a process inside", Running); err != nil { 678 return err 679 } 680 if !c.IsSandboxRunning() { 681 return fmt.Errorf("sandbox is not running") 682 } 683 return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false) 684 } 685 686 // ForwardSignals forwards all signals received by the current process to the 687 // container process inside the sandbox. It returns a function that will stop 688 // forwarding signals. 689 func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() { 690 log.Debugf("Forwarding all signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess) 691 stop := sighandling.StartSignalForwarding(func(sig linux.Signal) { 692 log.Debugf("Forwarding signal %d to container, cid: %s, PID: %d, fgProcess: %t", sig, c.ID, pid, fgProcess) 693 if err := c.Sandbox.SignalProcess(c.ID, pid, unix.Signal(sig), fgProcess); err != nil { 694 log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err) 695 } 696 }) 697 return func() { 698 log.Debugf("Done forwarding signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess) 699 stop() 700 } 701 } 702 703 // Checkpoint sends the checkpoint call to the container. 704 // The statefile will be written to the file at the specified image-path. 705 func (c *Container) Checkpoint(imagePath string, direct bool, sfOpts statefile.Options, mfOpts pgalloc.SaveOpts) error { 706 log.Debugf("Checkpoint container, cid: %s", c.ID) 707 if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil { 708 return err 709 } 710 return c.Sandbox.Checkpoint(c.ID, imagePath, direct, sfOpts, mfOpts) 711 } 712 713 // Pause suspends the container and its kernel. 714 // The call only succeeds if the container's status is created or running. 715 func (c *Container) Pause() error { 716 log.Debugf("Pausing container, cid: %s", c.ID) 717 if err := c.Saver.lock(BlockAcquire); err != nil { 718 return err 719 } 720 defer c.Saver.UnlockOrDie() 721 722 if c.Status != Created && c.Status != Running { 723 return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status) 724 } 725 726 if err := c.Sandbox.Pause(c.ID); err != nil { 727 return fmt.Errorf("pausing container %q: %v", c.ID, err) 728 } 729 c.changeStatus(Paused) 730 return c.saveLocked() 731 } 732 733 // Resume unpauses the container and its kernel. 734 // The call only succeeds if the container's status is paused.
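// A hedged usage sketch for the Pause/Resume pair (error handling trimmed;
// `c` is assumed to be an already-loaded *Container in the Running state):
//
//	if err := c.Pause(); err != nil {
//		return err
//	}
//	// The container and its kernel are now suspended; it can be inspected or
//	// checkpointed without the workload making progress.
//	if err := c.Resume(); err != nil {
//		return err
//	}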
735 func (c *Container) Resume() error { 736 log.Debugf("Resuming container, cid: %s", c.ID) 737 if err := c.Saver.lock(BlockAcquire); err != nil { 738 return err 739 } 740 defer c.Saver.UnlockOrDie() 741 742 if c.Status != Paused { 743 return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status) 744 } 745 if err := c.Sandbox.Resume(c.ID); err != nil { 746 return fmt.Errorf("resuming container: %v", err) 747 } 748 c.changeStatus(Running) 749 return c.saveLocked() 750 } 751 752 // State returns the metadata of the container. 753 func (c *Container) State() specs.State { 754 return specs.State{ 755 Version: specs.Version, 756 ID: c.ID, 757 Status: c.Status, 758 Pid: c.SandboxPid(), 759 Bundle: c.BundleDir, 760 Annotations: c.Spec.Annotations, 761 } 762 } 763 764 // Processes retrieves the list of processes and associated metadata inside a 765 // container. 766 func (c *Container) Processes() ([]*control.Process, error) { 767 if err := c.requireStatus("get processes of", Running, Paused); err != nil { 768 return nil, err 769 } 770 return c.Sandbox.Processes(c.ID) 771 } 772 773 // Destroy stops all processes and frees all resources associated with the 774 // container. 775 func (c *Container) Destroy() error { 776 log.Debugf("Destroy container, cid: %s", c.ID) 777 778 if err := c.Saver.lock(BlockAcquire); err != nil { 779 return err 780 } 781 defer func() { 782 c.Saver.UnlockOrDie() 783 _ = c.Saver.close() 784 }() 785 786 // Stored for later use as stop() sets c.Sandbox to nil. 787 sb := c.Sandbox 788 789 // We must perform the following cleanup steps: 790 // * stop the container and gofer processes, 791 // * remove the container filesystem on the host, and 792 // * delete the container metadata directory. 793 // 794 // It's possible for one or more of these steps to fail, but we should 795 // do our best to perform all of the cleanups. Hence, we keep a slice 796 // of errors return their concatenation. 797 var errs []string 798 if err := c.stop(); err != nil { 799 err = fmt.Errorf("stopping container: %v", err) 800 log.Warningf("%v", err) 801 errs = append(errs, err.Error()) 802 } 803 804 if err := c.Saver.Destroy(); err != nil { 805 err = fmt.Errorf("deleting container state files: %v", err) 806 log.Warningf("%v", err) 807 errs = append(errs, err.Error()) 808 } 809 810 // Clean up self-backed filestore files created in their respective mounts. 811 c.forEachSelfMount(func(mountSrc string) { 812 if sb != nil { 813 if hint := sb.MountHints.FindMount(mountSrc); hint != nil && hint.ShouldShareMount() { 814 // Don't delete filestore file for shared mounts. The sandbox owns a 815 // shared master mount which uses this filestore and is shared with 816 // multiple mount points. 817 return 818 } 819 } 820 filestorePath := boot.SelfFilestorePath(mountSrc, c.sandboxID()) 821 if err := os.Remove(filestorePath); err != nil { 822 err = fmt.Errorf("failed to delete filestore file %q: %v", filestorePath, err) 823 log.Warningf("%v", err) 824 errs = append(errs, err.Error()) 825 } 826 }) 827 if sb != nil && sb.IsRootContainer(c.ID) { 828 // When the root container is being destroyed, we can clean up filestores 829 // used by shared mounts. 830 for _, hint := range sb.MountHints.Mounts { 831 if !hint.ShouldShareMount() { 832 continue 833 } 834 // Assume this is a self-backed shared mount and try to delete the 835 // filestore. Subsequently ignore the ENOENT if the assumption is wrong. 
836 filestorePath := boot.SelfFilestorePath(hint.Mount.Source, c.sandboxID()) 837 if err := os.Remove(filestorePath); err != nil && !os.IsNotExist(err) { 838 err = fmt.Errorf("failed to delete shared filestore file %q: %v", filestorePath, err) 839 log.Warningf("%v", err) 840 errs = append(errs, err.Error()) 841 } 842 } 843 } 844 845 c.changeStatus(Stopped) 846 847 // Adjust oom_score_adj for the sandbox. This must be done after the container 848 // is stopped and the directory at c.Root is removed. 849 // 850 // Use 'sb' to tell whether it has been executed before because Destroy must 851 // be idempotent. 852 if sb != nil { 853 if err := adjustSandboxOOMScoreAdj(sb, c.Spec, c.Saver.RootDir, true); err != nil { 854 errs = append(errs, err.Error()) 855 } 856 } 857 858 // "If any poststop hook fails, the runtime MUST log a warning, but the 859 // remaining hooks and lifecycle continue as if the hook had 860 // succeeded" - OCI spec. 861 // 862 // Based on the OCI, "The post-stop hooks MUST be called after the container 863 // is deleted but before the delete operation returns" 864 // Run it here to: 865 // 1) Conform to the OCI. 866 // 2) Make sure it only runs once, because the root has been deleted, the 867 // container can't be loaded again. 868 if c.Spec.Hooks != nil { 869 executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) 870 } 871 872 if len(errs) == 0 { 873 return nil 874 } 875 return fmt.Errorf(strings.Join(errs, "\n")) 876 } 877 878 func (c *Container) sandboxID() string { 879 return c.Saver.ID.SandboxID 880 } 881 882 func (c *Container) forEachSelfMount(fn func(mountSrc string)) { 883 if c.GoferMountConfs == nil { 884 // Container not started? Skip. 885 return 886 } 887 if c.GoferMountConfs[0].IsSelfBacked() { 888 fn(c.Spec.Root.Path) 889 } 890 goferMntIdx := 1 // First index is for rootfs. 891 for i := range c.Spec.Mounts { 892 if !specutils.IsGoferMount(c.Spec.Mounts[i]) { 893 continue 894 } 895 if c.GoferMountConfs[goferMntIdx].IsSelfBacked() { 896 fn(c.Spec.Mounts[i].Source) 897 } 898 goferMntIdx++ 899 } 900 } 901 902 // createGoferFilestores creates the regular files that will back the 903 // tmpfs/overlayfs mounts that will overlay some gofer mounts. It also returns 904 // information about how each gofer mount is configured. 905 func (c *Container) createGoferFilestores(ovlConf config.Overlay2, mountHints *boot.PodMountHints, rootfsHint *boot.RootfsHint) ([]*os.File, []boot.GoferMountConf, error) { 906 var goferFilestores []*os.File 907 var goferConfs []boot.GoferMountConf 908 909 // Handle root mount first. 910 overlayMedium := ovlConf.RootOverlayMedium() 911 mountType := boot.Bind 912 if rootfsHint != nil { 913 overlayMedium = rootfsHint.Overlay 914 if !specutils.IsGoferMount(rootfsHint.Mount) { 915 mountType = rootfsHint.Mount.Type 916 } 917 } 918 if c.Spec.Root.Readonly { 919 overlayMedium = config.NoOverlay 920 } 921 filestore, goferConf, err := c.createGoferFilestore(overlayMedium, c.Spec.Root.Path, mountType, false /* isShared */) 922 if err != nil { 923 return nil, nil, err 924 } 925 if filestore != nil { 926 goferFilestores = append(goferFilestores, filestore) 927 } 928 goferConfs = append(goferConfs, goferConf) 929 930 // Handle bind mounts. 
931 for i := range c.Spec.Mounts { 932 if !specutils.IsGoferMount(c.Spec.Mounts[i]) { 933 continue 934 } 935 overlayMedium = ovlConf.SubMountOverlayMedium() 936 mountType = boot.Bind 937 isShared := false 938 if specutils.IsReadonlyMount(c.Spec.Mounts[i].Options) { 939 overlayMedium = config.NoOverlay 940 } 941 if hint := mountHints.FindMount(c.Spec.Mounts[i].Source); hint != nil { 942 // Note that we want overlayMedium=self even if this is a read-only mount so that 943 // the shared mount is created correctly. Future containers may mount this writably. 944 overlayMedium = config.SelfOverlay 945 if !specutils.IsGoferMount(hint.Mount) { 946 mountType = hint.Mount.Type 947 } 948 isShared = hint.ShouldShareMount() 949 } 950 filestore, goferConf, err := c.createGoferFilestore(overlayMedium, c.Spec.Mounts[i].Source, mountType, isShared) 951 if err != nil { 952 return nil, nil, err 953 } 954 if filestore != nil { 955 goferFilestores = append(goferFilestores, filestore) 956 } 957 goferConfs = append(goferConfs, goferConf) 958 } 959 for _, filestore := range goferFilestores { 960 // Perform this work around outside the sandbox. The sandbox may already be 961 // running with seccomp filters that do not allow this. 962 pgalloc.IMAWorkAroundForMemFile(filestore.Fd()) 963 } 964 return goferFilestores, goferConfs, nil 965 } 966 967 func (c *Container) createGoferFilestore(overlayMedium config.OverlayMedium, mountSrc string, mountType string, isShared bool) (*os.File, boot.GoferMountConf, error) { 968 var lower boot.GoferMountConfLowerType 969 switch mountType { 970 case boot.Bind: 971 lower = boot.Lisafs 972 case tmpfs.Name: 973 lower = boot.NoneLower 974 case erofs.Name: 975 lower = boot.Erofs 976 default: 977 return nil, boot.GoferMountConf{}, fmt.Errorf("unsupported mount type %q in mount hint", mountType) 978 } 979 switch overlayMedium { 980 case config.NoOverlay: 981 return nil, boot.GoferMountConf{Lower: lower, Upper: boot.NoOverlay}, nil 982 case config.MemoryOverlay: 983 return nil, boot.GoferMountConf{Lower: lower, Upper: boot.MemoryOverlay}, nil 984 case config.SelfOverlay: 985 return c.createGoferFilestoreInSelf(mountSrc, isShared, boot.GoferMountConf{Lower: lower, Upper: boot.SelfOverlay}) 986 default: 987 if overlayMedium.IsBackedByAnon() { 988 return c.createGoferFilestoreInDir(overlayMedium.HostFileDir(), boot.GoferMountConf{Lower: lower, Upper: boot.AnonOverlay}) 989 } 990 return nil, boot.GoferMountConf{}, fmt.Errorf("unexpected overlay medium %q", overlayMedium) 991 } 992 } 993 994 func (c *Container) createGoferFilestoreInSelf(mountSrc string, isShared bool, successConf boot.GoferMountConf) (*os.File, boot.GoferMountConf, error) { 995 mountSrcInfo, err := os.Stat(mountSrc) 996 if err != nil { 997 return nil, boot.GoferMountConf{}, fmt.Errorf("failed to stat mount %q to see if it were a directory: %v", mountSrc, err) 998 } 999 if !mountSrcInfo.IsDir() { 1000 log.Warningf("self filestore is only supported for directory mounts, but mount %q is not a directory, falling back to memory", mountSrc) 1001 return nil, boot.GoferMountConf{Lower: successConf.Lower, Upper: boot.MemoryOverlay}, nil 1002 } 1003 // Create the self filestore file. 1004 createFlags := unix.O_RDWR | unix.O_CREAT | unix.O_CLOEXEC 1005 if !isShared { 1006 // Allow shared mounts to reuse existing filestore. A previous shared user 1007 // may have already set up the filestore. 
1008 createFlags |= unix.O_EXCL 1009 } 1010 filestorePath := boot.SelfFilestorePath(mountSrc, c.sandboxID()) 1011 filestoreFD, err := unix.Open(filestorePath, createFlags, 0666) 1012 if err != nil { 1013 if err == unix.EEXIST { 1014 // Note that if the same submount is mounted multiple times within the 1015 // same sandbox, and is not shared, then the overlay option doesn't work 1016 // correctly. Because each overlay mount is independent and changes to 1017 // one are not visible to the other. 1018 return nil, boot.GoferMountConf{}, fmt.Errorf("%q mount source already has a filestore file at %q; repeated submounts are not supported with overlay optimizations", mountSrc, filestorePath) 1019 } 1020 return nil, boot.GoferMountConf{}, fmt.Errorf("failed to create filestore file inside %q: %v", mountSrc, err) 1021 } 1022 log.Debugf("Created filestore file at %q for mount source %q", filestorePath, mountSrc) 1023 // Filestore in self should be a named path because it needs to be 1024 // discoverable via path traversal so that k8s can scan the filesystem 1025 // and apply any limits appropriately (like local ephemeral storage 1026 // limits). So don't delete it. These files will be unlinked when the 1027 // container is destroyed. This makes self medium appropriate for k8s. 1028 return os.NewFile(uintptr(filestoreFD), filestorePath), successConf, nil 1029 } 1030 1031 func (c *Container) createGoferFilestoreInDir(filestoreDir string, successConf boot.GoferMountConf) (*os.File, boot.GoferMountConf, error) { 1032 fileInfo, err := os.Stat(filestoreDir) 1033 if err != nil { 1034 return nil, boot.GoferMountConf{}, fmt.Errorf("failed to stat filestore directory %q: %v", filestoreDir, err) 1035 } 1036 if !fileInfo.IsDir() { 1037 return nil, boot.GoferMountConf{}, fmt.Errorf("overlay2 flag should specify an existing directory") 1038 } 1039 // Create an unnamed temporary file in filestore directory which will be 1040 // deleted when the last FD on it is closed. We don't use O_TMPFILE because 1041 // it is not supported on all filesystems. So we simulate it by creating a 1042 // named file and then immediately unlinking it while keeping an FD on it. 1043 // This file will be deleted when the container exits. 1044 filestoreFile, err := os.CreateTemp(filestoreDir, "runsc-filestore-") 1045 if err != nil { 1046 return nil, boot.GoferMountConf{}, fmt.Errorf("failed to create a temporary file inside %q: %v", filestoreDir, err) 1047 } 1048 if err := unix.Unlink(filestoreFile.Name()); err != nil { 1049 return nil, boot.GoferMountConf{}, fmt.Errorf("failed to unlink temporary file %q: %v", filestoreFile.Name(), err) 1050 } 1051 log.Debugf("Created an unnamed filestore file at %q", filestoreDir) 1052 return filestoreFile, successConf, nil 1053 } 1054 1055 // saveLocked saves the container metadata to a file. 1056 // 1057 // Precondition: container must be locked with container.lock(). 1058 func (c *Container) saveLocked() error { 1059 log.Debugf("Save container, cid: %s", c.ID) 1060 if err := c.Saver.SaveLocked(c); err != nil { 1061 return fmt.Errorf("saving container metadata: %v", err) 1062 } 1063 return nil 1064 } 1065 1066 // stop stops the container (for regular containers) or the sandbox (for 1067 // root containers), and waits for the container or sandbox and the gofer 1068 // to stop. If any of them doesn't stop before timeout, an error is returned. 
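// The create-then-unlink pattern used by createGoferFilestoreInDir above can
// be shown on its own. This is an illustrative sketch, not code from this
// file; `dir` is assumed to be an existing, writable directory:
//
//	f, err := os.CreateTemp(dir, "scratch-")
//	if err != nil {
//		return err
//	}
//	// Drop the name immediately. The open FD keeps the inode alive, so f
//	// stays usable as anonymous scratch space, and the kernel reclaims the
//	// blocks once the last FD is closed.
//	if err := unix.Unlink(f.Name()); err != nil {
//		return err
//	}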
1069 func (c *Container) stop() error { 1070 var parentCgroup cgroup.Cgroup 1071 1072 if c.Sandbox != nil { 1073 log.Debugf("Destroying container, cid: %s", c.ID) 1074 if err := c.Sandbox.DestroyContainer(c.ID); err != nil { 1075 return fmt.Errorf("destroying container %q: %v", c.ID, err) 1076 } 1077 // Only uninstall parentCgroup for sandbox stop. 1078 if c.Sandbox.IsRootContainer(c.ID) { 1079 parentCgroup = c.Sandbox.CgroupJSON.Cgroup 1080 } 1081 // Only set sandbox to nil after it has been told to destroy the container. 1082 c.Sandbox = nil 1083 } 1084 1085 // Try killing gofer if it does not exit with container. 1086 if c.GoferPid != 0 { 1087 log.Debugf("Killing gofer for container, cid: %s, PID: %d", c.ID, c.GoferPid) 1088 if err := unix.Kill(c.GoferPid, unix.SIGKILL); err != nil { 1089 // The gofer may already be stopped, log the error. 1090 log.Warningf("Error sending signal %d to gofer %d: %v", unix.SIGKILL, c.GoferPid, err) 1091 } 1092 } 1093 1094 if err := c.waitForStopped(); err != nil { 1095 return err 1096 } 1097 1098 // Delete container cgroup if any. 1099 if c.CompatCgroup.Cgroup != nil { 1100 if err := c.CompatCgroup.Cgroup.Uninstall(); err != nil { 1101 return err 1102 } 1103 } 1104 // Gofer is running inside parentCgroup, so Cgroup.Uninstall has to be called 1105 // after the gofer has stopped. 1106 if parentCgroup != nil { 1107 if err := parentCgroup.Uninstall(); err != nil { 1108 return err 1109 } 1110 } 1111 return nil 1112 } 1113 1114 func (c *Container) waitForStopped() error { 1115 if c.GoferPid == 0 { 1116 return nil 1117 } 1118 1119 if c.IsSandboxRunning() { 1120 if err := c.SignalContainer(unix.Signal(0), false); err == nil { 1121 return fmt.Errorf("container is still running") 1122 } 1123 } 1124 1125 if c.goferIsChild { 1126 // The gofer process is a child of the current process, 1127 // so we can wait it and collect its zombie. 1128 if _, err := unix.Wait4(int(c.GoferPid), nil, 0, nil); err != nil { 1129 return fmt.Errorf("error waiting the gofer process: %v", err) 1130 } 1131 c.GoferPid = 0 1132 return nil 1133 } 1134 1135 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 1136 defer cancel() 1137 b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) 1138 op := func() error { 1139 if err := unix.Kill(c.GoferPid, 0); err == nil { 1140 return fmt.Errorf("gofer is still running") 1141 } 1142 c.GoferPid = 0 1143 return nil 1144 } 1145 return backoff.Retry(op, b) 1146 } 1147 1148 // shouldCreateDeviceGofer indicates whether a device gofer connection should 1149 // be created. 1150 func shouldCreateDeviceGofer(spec *specs.Spec, conf *config.Config) bool { 1151 return specutils.GPUFunctionalityRequested(spec, conf) || specutils.TPUFunctionalityRequested(spec, conf) 1152 } 1153 1154 // shouldSpawnGofer indicates whether the gofer process should be spawned. 1155 func shouldSpawnGofer(spec *specs.Spec, conf *config.Config, goferConfs []boot.GoferMountConf) bool { 1156 // Lisafs mounts need the gofer. 1157 for _, cfg := range goferConfs { 1158 if cfg.ShouldUseLisafs() { 1159 return true 1160 } 1161 } 1162 // Device gofer needs a gofer process. 1163 return shouldCreateDeviceGofer(spec, conf) 1164 } 1165 1166 // createGoferProcess returns an IO file list and a mounts file on success. 1167 // The IO file list consists of image files and/or socket files to connect to 1168 // a gofer endpoint for the mount points using Gofers. 
The mounts file is the 1169 // file from which to read the list of mounts after they have been resolved (direct paths, 1170 // no symlinks), and will be nil if there is no cleaning required for mounts. 1171 func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool, rootfsHint *boot.RootfsHint) ([]*os.File, *os.File, *os.File, error) { 1172 if !shouldSpawnGofer(spec, conf, c.GoferMountConfs) { 1173 if !c.GoferMountConfs[0].ShouldUseErofs() { 1174 panic("goferless mode is only possible with EROFS rootfs") 1175 } 1176 ioFile, err := os.Open(rootfsHint.Mount.Source) 1177 if err != nil { 1178 return nil, nil, nil, fmt.Errorf("opening rootfs image %q: %v", rootfsHint.Mount.Source, err) 1179 } 1180 return []*os.File{ioFile}, nil, nil, nil 1181 } 1182 1183 // Ensure we don't leak FDs to the gofer process. 1184 if err := sandbox.SetCloExeOnAllFDs(); err != nil { 1185 return nil, nil, nil, fmt.Errorf("setting CLOEXEC on all FDs: %w", err) 1186 } 1187 1188 donations := donation.Agency{} 1189 defer donations.Close() 1190 1191 if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { 1192 return nil, nil, nil, err 1193 } 1194 if conf.DebugLog != "" { 1195 test := "" 1196 if len(conf.TestOnlyTestNameEnv) != 0 { 1197 // Fetch test name if one is provided and the test only flag was set. 1198 if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok { 1199 test = t 1200 } 1201 } 1202 if specutils.IsDebugCommand(conf, "gofer") { 1203 if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "gofer", test); err != nil { 1204 return nil, nil, nil, err 1205 } 1206 } 1207 } 1208 1209 // Start with the general config flags. 1210 cmd := exec.Command(specutils.ExePath, conf.ToFlags()...) 1211 cmd.SysProcAttr = &unix.SysProcAttr{ 1212 // Detach from session. Otherwise, signals sent to the foreground process 1213 // will also be forwarded by this process, resulting in duplicate signals. 1214 Setsid: true, 1215 } 1216 1217 // Set Args[0] to make it easier to spot the gofer process. Otherwise it's 1218 // shown as `exe`. 1219 cmd.Args[0] = "runsc-gofer" 1220 1221 // Transfer FDs that need to be present before the "gofer" command. 1222 // Start at 3 because 0, 1, and 2 are taken by stdin/out/err. 1223 nextFD := donations.Transfer(cmd, 3) 1224 1225 cmd.Args = append(cmd.Args, "gofer", "--bundle", bundleDir) 1226 cmd.Args = append(cmd.Args, "--gofer-mount-confs="+c.GoferMountConfs.String()) 1227 1228 // Open the spec file to donate to the sandbox. 1229 specFile, err := specutils.OpenSpec(bundleDir) 1230 if err != nil { 1231 return nil, nil, nil, fmt.Errorf("opening spec file: %v", err) 1232 } 1233 donations.DonateAndClose("spec-fd", specFile) 1234 1235 // Donate any profile FDs to the gofer. 1236 if err := c.donateGoferProfileFDs(conf, &donations); err != nil { 1237 return nil, nil, nil, fmt.Errorf("donating gofer profile fds: %w", err) 1238 } 1239 1240 // Create a pipe that allows the gofer to send the mount list to the sandbox after all paths 1241 // have been resolved. 1242 mountsSand, mountsGofer, err := os.Pipe() 1243 if err != nil { 1244 return nil, nil, nil, err 1245 } 1246 donations.DonateAndClose("mounts-fd", mountsGofer) 1247 1248 // Count the number of mounts that need an IO file.
1249 ioFileCount := 0 1250 for _, cfg := range c.GoferMountConfs { 1251 if cfg.ShouldUseLisafs() || cfg.ShouldUseErofs() { 1252 ioFileCount++ 1253 } 1254 } 1255 1256 sandEnds := make([]*os.File, 0, ioFileCount) 1257 for i, cfg := range c.GoferMountConfs { 1258 switch { 1259 case cfg.ShouldUseLisafs(): 1260 fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) 1261 if err != nil { 1262 return nil, nil, nil, err 1263 } 1264 sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD")) 1265 1266 goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD") 1267 donations.DonateAndClose("io-fds", goferEnd) 1268 1269 case cfg.ShouldUseErofs(): 1270 if i > 0 { 1271 return nil, nil, nil, fmt.Errorf("EROFS lower layer is only supported for root mount") 1272 } 1273 f, err := os.Open(rootfsHint.Mount.Source) 1274 if err != nil { 1275 return nil, nil, nil, fmt.Errorf("opening rootfs image %q: %v", rootfsHint.Mount.Source, err) 1276 } 1277 sandEnds = append(sandEnds, f) 1278 } 1279 } 1280 var devSandEnd *os.File 1281 if shouldCreateDeviceGofer(spec, conf) { 1282 fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) 1283 if err != nil { 1284 return nil, nil, nil, err 1285 } 1286 devSandEnd = os.NewFile(uintptr(fds[0]), "sandbox dev IO FD") 1287 donations.DonateAndClose("dev-io-fd", os.NewFile(uintptr(fds[1]), "gofer dev IO FD")) 1288 } 1289 1290 if attached { 1291 // The gofer is attached to the lifetime of this process, so it 1292 // should synchronously die when this process dies. 1293 cmd.SysProcAttr.Pdeathsig = unix.SIGKILL 1294 } 1295 1296 // Enter new namespaces to isolate from the rest of the system. Don't unshare 1297 // cgroup because gofer is added to a cgroup in the caller's namespace. 1298 nss := []specs.LinuxNamespace{ 1299 {Type: specs.IPCNamespace}, 1300 {Type: specs.MountNamespace}, 1301 {Type: specs.NetworkNamespace}, 1302 {Type: specs.PIDNamespace}, 1303 {Type: specs.UTSNamespace}, 1304 } 1305 1306 rootlessEUID := unix.Geteuid() != 0 1307 // Setup any uid/gid mappings, and create or join the configured user 1308 // namespace so the gofer's view of the filesystem aligns with the 1309 // users in the sandbox. 1310 if !rootlessEUID { 1311 if userNS, ok := specutils.GetNS(specs.UserNamespace, spec); ok { 1312 nss = append(nss, userNS) 1313 specutils.SetUIDGIDMappings(cmd, spec) 1314 // We need to set UID and GID to have capabilities in a new user namespace. 1315 cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} 1316 } 1317 } else { 1318 userNS, ok := specutils.GetNS(specs.UserNamespace, spec) 1319 if !ok { 1320 return nil, nil, nil, fmt.Errorf("unable to run a rootless container without userns") 1321 } 1322 nss = append(nss, userNS) 1323 syncFile, err := sandbox.ConfigureCmdForRootless(cmd, &donations) 1324 if err != nil { 1325 return nil, nil, nil, err 1326 } 1327 defer syncFile.Close() 1328 } 1329 1330 nvProxySetup, err := nvproxySetupAfterGoferUserns(spec, conf, cmd, &donations) 1331 if err != nil { 1332 return nil, nil, nil, fmt.Errorf("setting up nvproxy for gofer: %w", err) 1333 } 1334 1335 donations.Transfer(cmd, nextFD) 1336 1337 // Start the gofer in the given namespace. 
1338 donation.LogDonations(cmd) 1339 log.Debugf("Starting gofer: %s %v", cmd.Path, cmd.Args) 1340 if err := specutils.StartInNS(cmd, nss); err != nil { 1341 return nil, nil, nil, fmt.Errorf("gofer: %v", err) 1342 } 1343 log.Infof("Gofer started, PID: %d", cmd.Process.Pid) 1344 c.GoferPid = cmd.Process.Pid 1345 c.goferIsChild = true 1346 1347 // Set up and synchronize rootless mode userns mappings. 1348 if rootlessEUID { 1349 if err := sandbox.SetUserMappings(spec, cmd.Process.Pid); err != nil { 1350 return nil, nil, nil, err 1351 } 1352 } 1353 1354 // Set up nvproxy within the Gofer namespace. 1355 if err := nvProxySetup(); err != nil { 1356 return nil, nil, nil, fmt.Errorf("nvproxy setup: %w", err) 1357 } 1358 1359 return sandEnds, devSandEnd, mountsSand, nil 1360 } 1361 1362 // changeStatus transitions from one status to another ensuring that the 1363 // transition is valid. 1364 func (c *Container) changeStatus(s Status) { 1365 switch s { 1366 case Creating: 1367 // Initial state, never transitions to it. 1368 panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) 1369 1370 case Created: 1371 if c.Status != Creating { 1372 panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) 1373 } 1374 if c.Sandbox == nil { 1375 panic("sandbox cannot be nil") 1376 } 1377 1378 case Paused: 1379 if c.Status != Running { 1380 panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) 1381 } 1382 if c.Sandbox == nil { 1383 panic("sandbox cannot be nil") 1384 } 1385 1386 case Running: 1387 if c.Status != Created && c.Status != Paused { 1388 panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) 1389 } 1390 if c.Sandbox == nil { 1391 panic("sandbox cannot be nil") 1392 } 1393 1394 case Stopped: 1395 // All states can transition to Stopped. 1396 1397 default: 1398 panic(fmt.Sprintf("invalid new state: %v", s)) 1399 } 1400 c.Status = s 1401 } 1402 1403 // IsSandboxRunning returns true if the sandbox exists and is running. 1404 func (c *Container) IsSandboxRunning() bool { 1405 return c.Sandbox != nil && c.Sandbox.IsRunning() 1406 } 1407 1408 // HasCapabilityInAnySet returns true if the given capability is in any of the 1409 // capability sets of the container process. 1410 func (c *Container) HasCapabilityInAnySet(capability linux.Capability) bool { 1411 capString := capability.String() 1412 for _, set := range [5][]string{ 1413 c.Spec.Process.Capabilities.Bounding, 1414 c.Spec.Process.Capabilities.Effective, 1415 c.Spec.Process.Capabilities.Inheritable, 1416 c.Spec.Process.Capabilities.Permitted, 1417 c.Spec.Process.Capabilities.Ambient, 1418 } { 1419 for _, c := range set { 1420 if c == capString { 1421 return true 1422 } 1423 } 1424 } 1425 return false 1426 } 1427 1428 // RunsAsUID0 returns true if the container process runs with UID 0 (root). 1429 func (c *Container) RunsAsUID0() bool { 1430 return c.Spec.Process.User.UID == 0 1431 } 1432 1433 func (c *Container) requireStatus(action string, statuses ...Status) error { 1434 for _, s := range statuses { 1435 if c.Status == s { 1436 return nil 1437 } 1438 } 1439 return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status) 1440 } 1441 1442 // IsSandboxRoot returns true if this container is its sandbox's root container. 
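// The panics in changeStatus above enforce, in effect, the following
// transition table. This is an illustrative restatement rather than a
// declaration that exists in this file (Stopped is reachable from every state
// and is terminal):
//
//	var validTransitions = map[Status][]Status{
//		Creating: {Created, Stopped},
//		Created:  {Running, Stopped},
//		Running:  {Paused, Stopped},
//		Paused:   {Running, Stopped},
//	}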
1443 func (c *Container) IsSandboxRoot() bool { 1444 return isRoot(c.Spec) 1445 } 1446 1447 func isRoot(spec *specs.Spec) bool { 1448 return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer 1449 } 1450 1451 // runInCgroup executes fn inside the specified cgroup. If cg is nil, executes 1452 // fn in the current context. 1453 func runInCgroup(cg cgroup.Cgroup, fn func() error) error { 1454 if cg == nil { 1455 return fn() 1456 } 1457 restore, err := cg.Join() 1458 if err != nil { 1459 return err 1460 } 1461 defer restore() 1462 return fn() 1463 } 1464 1465 // adjustGoferOOMScoreAdj sets the oom_score_adj for the container's gofer. 1466 func (c *Container) adjustGoferOOMScoreAdj() error { 1467 if c.GoferPid == 0 || c.Spec.Process.OOMScoreAdj == nil { 1468 return nil 1469 } 1470 return setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj) 1471 } 1472 1473 // adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox. 1474 // oom_score_adj is set to the lowest oom_score_adj among the containers 1475 // running in the sandbox. 1476 // 1477 // TODO(gvisor.dev/issue/238): This call could race with other containers being 1478 // created at the same time and end up setting the wrong oom_score_adj to the 1479 // sandbox. Use rpc client to synchronize. 1480 func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, spec *specs.Spec, rootDir string, destroy bool) error { 1481 // Adjustment can be skipped if the root container is exiting, because it 1482 // brings down the entire sandbox. 1483 if isRoot(spec) && destroy { 1484 return nil 1485 } 1486 1487 containers, err := LoadSandbox(rootDir, s.ID, LoadOpts{}) 1488 if err != nil { 1489 return fmt.Errorf("loading sandbox containers: %v", err) 1490 } 1491 1492 // Do nothing if the sandbox has been terminated. 1493 if len(containers) == 0 { 1494 return nil 1495 } 1496 1497 // Get the lowest score for all containers. 1498 var lowScore int 1499 scoreFound := false 1500 for _, container := range containers { 1501 // Special multi-container support for CRI. Ignore the root container when 1502 // calculating oom_score_adj for the sandbox because it is the 1503 // infrastructure (pause) container and always has a very low oom_score_adj. 1504 // 1505 // We will use OOMScoreAdj in the single-container case where the 1506 // containerd container-type annotation is not present. 1507 if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox { 1508 continue 1509 } 1510 1511 if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) { 1512 scoreFound = true 1513 lowScore = *container.Spec.Process.OOMScoreAdj 1514 } 1515 } 1516 1517 // If the container is destroyed and the remaining containers have no 1518 // oomScoreAdj specified, then we must revert to the original oom_score_adj 1519 // saved with the root container. 1520 if !scoreFound && destroy { 1521 lowScore = containers[0].Sandbox.OriginalOOMScoreAdj 1522 scoreFound = true 1523 } 1524 1525 // Only set oom_score_adj if one of the containers has oom_score_adj set. If 1526 // not, oom_score_adj is inherited from the parent process. 1527 // 1528 // See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process 1529 if !scoreFound { 1530 return nil 1531 } 1532 1533 // Set the lowest of all containers' oom_score_adj on the sandbox. 1534 return setOOMScoreAdj(s.Getpid(), lowScore) 1535 } 1536 1537 // setOOMScoreAdj sets oom_score_adj to the given value for the given PID.

// setOOMScoreAdj sets oom_score_adj to the given value for the given PID.
// /proc must be available and mounted read-write. scoreAdj should be between
// -1000 and 1000. It's a noop if the process has already exited.
func setOOMScoreAdj(pid int, scoreAdj int) error {
	f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644)
	if err != nil {
		// Ignore NotExist errors because it can race with process exit.
		if os.IsNotExist(err) {
			log.Warningf("Process (%d) not found setting oom_score_adj", pid)
			return nil
		}
		return err
	}
	defer f.Close()
	if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil {
		if errors.Is(err, unix.ESRCH) {
			log.Warningf("Process (%d) exited while setting oom_score_adj", pid)
			return nil
		}
		return fmt.Errorf("setting oom_score_adj to %d: %v", scoreAdj, err)
	}
	return nil
}

// populateStats populates event with stats estimates based on cgroups and the
// sentry's accounting.
func (c *Container) populateStats(event *boot.EventOut) {
	// The events command, when run for all running containers, should
	// account for the full cgroup CPU usage. We split cgroup usage
	// proportionally according to the sentry-internal usage measurements,
	// only counting Running containers.
	log.Debugf("event.ContainerUsage: %v", event.ContainerUsage)
	numContainers := uint64(len(event.ContainerUsage))
	if numContainers == 0 {
		log.Warningf("events: no containers listed in usage, returning zero CPU usage")
		event.Event.Data.CPU.Usage.Total = 0
		return
	}

	var containerUsage uint64
	var allContainersUsage uint64
	for ID, usage := range event.ContainerUsage {
		allContainersUsage += usage
		if ID == c.ID {
			containerUsage = usage
		}
	}

	cgroup, err := c.Sandbox.NewCGroup()
	if err != nil {
		// No cgroup, so rely purely on the sentry's accounting.
		log.Warningf("events: no cgroups")
		event.Event.Data.CPU.Usage.Total = containerUsage
		return
	}

	// Get the host cgroup CPU usage.
	cgroupsUsage, err := cgroup.CPUUsage()
	if err != nil || cgroupsUsage == 0 {
		// No cgroup usage, so rely purely on the sentry's accounting.
		log.Warningf("events: failed when getting cgroup CPU usage for container: usage=%d, err: %v", cgroupsUsage, err)
		event.Event.Data.CPU.Usage.Total = containerUsage
		return
	}

	// If the sentry reports no CPU usage, fall back on cgroups and split usage
	// equally across containers.
	if allContainersUsage == 0 {
		log.Warningf("events: no sentry CPU usage reported")
		allContainersUsage = cgroupsUsage
		containerUsage = cgroupsUsage / numContainers
	}

	// Scaling can easily overflow a uint64 (e.g. a containerUsage and
	// cgroupsUsage of 16 seconds each will overflow), so use floats.
	total := float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage))
	log.Debugf("Usage, container: %d, cgroups: %d, all: %d, total: %.0f", containerUsage, cgroupsUsage, allContainersUsage, total)
	event.Event.Data.CPU.Usage.Total = uint64(total)
}
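
// Illustrative sketch (not part of the original source): the proportional
// split above scales the host cgroup usage by this container's share of the
// sentry-reported usage. With hypothetical nanosecond values it works out as:
//
//	containerUsage := uint64(4e9)      // 4s reported by the sentry for this container
//	allContainersUsage := uint64(16e9) // 16s reported across all containers
//	cgroupsUsage := uint64(20e9)       // 20s measured by the host cgroup
//	total := float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage))
//	// total == 5e9: this container is charged 1/4 of the cgroup's 20s.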

// setupCgroupForRoot configures and returns the cgroups for the sandbox and
// the root container. If `cgroupParentAnnotation` is set, use that path as the
// sandbox cgroup and use Spec.Linux.CgroupsPath as the root container cgroup.
func (c *Container) setupCgroupForRoot(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, cgroup.Cgroup, error) {
	var parentCgroup cgroup.Cgroup
	if parentPath, ok := spec.Annotations[cgroupParentAnnotation]; ok {
		var err error
		parentCgroup, err = cgroup.NewFromPath(parentPath, conf.SystemdCgroup)
		if err != nil {
			return nil, nil, err
		}
	} else {
		var err error
		parentCgroup, err = cgroup.NewFromSpec(spec, conf.SystemdCgroup)
		if parentCgroup == nil || err != nil {
			return nil, nil, err
		}
	}

	var err error
	parentCgroup, err = cgroupInstall(conf, parentCgroup, spec.Linux.Resources)
	if parentCgroup == nil || err != nil {
		return nil, nil, err
	}

	subCgroup, err := c.setupCgroupForSubcontainer(conf, spec)
	if err != nil {
		_ = parentCgroup.Uninstall()
		return nil, nil, err
	}
	return parentCgroup, subCgroup, nil
}

// setupCgroupForSubcontainer sets up empty cgroups for subcontainers. Since
// subcontainers run exclusively inside the sandbox, subcontainer cgroups on the
// host have no effect on them. However, some tools (e.g. cAdvisor) use cgroup
// paths to discover new containers and report stats for them.
func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, error) {
	if isRoot(spec) {
		if _, ok := spec.Annotations[cgroupParentAnnotation]; !ok {
			return nil, nil
		}
	}

	cg, err := cgroup.NewFromSpec(spec, conf.SystemdCgroup)
	if cg == nil || err != nil {
		return nil, err
	}
	// Use empty resources, just want the directory structure created.
	return cgroupInstall(conf, cg, &specs.LinuxResources{})
}

// donateGoferProfileFDs opens profile files and donates their FDs to the
// gofer.
func (c *Container) donateGoferProfileFDs(conf *config.Config, donations *donation.Agency) error {
	// The gofer profile files are named based on the provided flag, but
	// suffixed with "gofer" and the container ID to avoid collisions with
	// sentry profile files or profile files from other gofers.
	//
	// TODO(b/243183772): Merge gofer profile data with sentry profile data
	// into a single file.
	profSuffix := ".gofer." + c.ID
	const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC
	if conf.ProfileBlock != "" {
		if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock+profSuffix, profFlags); err != nil {
			return err
		}
	}
	if conf.ProfileCPU != "" {
		if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU+profSuffix, profFlags); err != nil {
			return err
		}
	}
	if conf.ProfileHeap != "" {
		if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap+profSuffix, profFlags); err != nil {
			return err
		}
	}
	if conf.ProfileMutex != "" {
		if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex+profSuffix, profFlags); err != nil {
			return err
		}
	}
	if conf.TraceFile != "" {
		if err := donations.OpenAndDonate("trace-fd", conf.TraceFile+profSuffix, profFlags); err != nil {
			return err
		}
	}
	return nil
}
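
// Illustrative note (not part of the original source): with hypothetical flag
// and ID values, the per-gofer profile path produced above is the flag value
// plus the ".gofer.<ID>" suffix:
//
//	conf.ProfileCPU = "/tmp/profile-cpu"
//	c.ID = "abc123"
//	// donated under the name "profile-cpu-fd" as /tmp/profile-cpu.gofer.abc123
//
// Each file is opened with O_CREATE|O_WRONLY|O_TRUNC and only its FD is passed
// to the gofer, so the gofer can write the profile without host path access.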

// cgroupInstall creates the cgroup directory structure and sets its
// resources. On success, it returns the cgroup instance and a nil error.
// For rootless, cgroup operations may fail; in that case the error is
// suppressed and a nil cgroup instance is returned to indicate that no
// cgroup was configured.
func cgroupInstall(conf *config.Config, cg cgroup.Cgroup, res *specs.LinuxResources) (cgroup.Cgroup, error) {
	if err := cg.Install(res); err != nil {
		switch {
		case (errors.Is(err, unix.EACCES) || errors.Is(err, unix.EROFS)) && conf.Rootless:
			log.Warningf("Skipping cgroup configuration in rootless mode: %v", err)
			return nil, nil
		default:
			return nil, fmt.Errorf("configuring cgroup: %v", err)
		}
	}
	return cg, nil
}

func modifySpecForDirectfs(conf *config.Config, spec *specs.Spec) error {
	if !conf.DirectFS || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		return nil
	}
	if conf.Network == config.NetworkHost {
		// Hostnet feature requires the sandbox to run in the current user
		// namespace, in which the network namespace is configured.
		return nil
	}
	if _, ok := specutils.GetNS(specs.UserNamespace, spec); ok {
		// If the spec already defines a userns, use that.
		return nil
	}
	if spec.Linux == nil {
		spec.Linux = &specs.Linux{}
	}
	if len(spec.Linux.UIDMappings) > 0 || len(spec.Linux.GIDMappings) > 0 {
		// The spec can only define UID/GID mappings with a userns (checked above).
		return fmt.Errorf("spec defines UID/GID mappings without defining userns")
	}
	// Run the sandbox in a new user namespace with identity UID/GID mappings.
	log.Debugf("Configuring container with a new userns with identity user mappings into current userns")
	spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{Type: specs.UserNamespace})
	uidMappings, err := getIdentityMapping("uid_map")
	if err != nil {
		return err
	}
	spec.Linux.UIDMappings = uidMappings
	logIDMappings(uidMappings, "UID")
	gidMappings, err := getIdentityMapping("gid_map")
	if err != nil {
		return err
	}
	spec.Linux.GIDMappings = gidMappings
	logIDMappings(gidMappings, "GID")
	return nil
}
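
// Illustrative sketch (not part of the original source): for a host whose
// /proc/self/uid_map reads "0 0 4294967295", the identity mapping built by
// getIdentityMapping below and assigned above is equivalent to (values are
// host-dependent):
//
//	spec.Linux.UIDMappings = []specs.LinuxIDMapping{
//		{ContainerID: 0, HostID: 0, Size: 4294967295},
//	}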

func getIdentityMapping(mapFileName string) ([]specs.LinuxIDMapping, error) {
	// See user_namespaces(7) to understand how /proc/self/{uid/gid}_map files
	// are organized.
	mapFile := path.Join("/proc/self", mapFileName)
	file, err := os.Open(mapFile)
	if err != nil {
		return nil, fmt.Errorf("failed to open %s: %v", mapFile, err)
	}
	defer file.Close()

	var mappings []specs.LinuxIDMapping
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := scanner.Text()
		var myStart, parentStart, rangeLen uint32
		numParsed, err := fmt.Sscanf(line, "%d %d %d", &myStart, &parentStart, &rangeLen)
		if err != nil {
			return nil, fmt.Errorf("failed to parse line %q in file %s: %v", line, mapFile, err)
		}
		if numParsed != 3 {
			return nil, fmt.Errorf("failed to parse 3 integers from line %q in file %s", line, mapFile)
		}
		// Create an identity mapping with the current userns.
		mappings = append(mappings, specs.LinuxIDMapping{
			ContainerID: myStart,
			HostID:      myStart,
			Size:        rangeLen,
		})
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("failed to scan file %s: %v", mapFile, err)
	}
	return mappings, nil
}

func logIDMappings(mappings []specs.LinuxIDMapping, idType string) {
	if !log.IsLogging(log.Debug) {
		return
	}
	log.Debugf("%s Mappings:", idType)
	for _, m := range mappings {
		log.Debugf("\tContainer ID: %d, Host ID: %d, Range Length: %d", m.ContainerID, m.HostID, m.Size)
	}
}

// nvProxyPreGoferHostSetup does host setup work so that `nvidia-container-cli
// configure` can be run in the future. It runs before any Gofers start.
// It verifies that all the required dependencies are in place, loads kernel
// modules, and ensures the correct device files exist and are accessible.
// This should only be necessary once on the host. It should be run during the
// root container setup sequence to make sure it has run at least once.
func nvProxyPreGoferHostSetup(spec *specs.Spec, conf *config.Config) error {
	if !specutils.GPUFunctionalityRequestedViaHook(spec, conf) {
		return nil
	}

	// Locate binaries. For security reasons, unlike
	// nvidia-container-runtime-hook, we don't add the container's filesystem
	// to the search path. We also don't support
	// /etc/nvidia-container-runtime/config.toml to avoid importing a TOML
	// parser.
	cliPath, err := exec.LookPath("nvidia-container-cli")
	if err != nil {
		return fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err)
	}

	// nvidia-container-cli --load-kmods seems to be a noop; load kernel modules ourselves.
	nvproxyLoadKernelModules()

	if _, err := os.Stat("/dev/nvidiactl"); err != nil {
		if !os.IsNotExist(err) {
			return fmt.Errorf("stat(2) for /dev/nvidiactl failed: %w", err)
		}

		// Run `nvidia-container-cli info`.
		// This has the side-effect of automatically creating GPU device files.
		argv := []string{cliPath, "--load-kmods", "info"}
		log.Debugf("Executing %q", argv)
		var infoOut, infoErr strings.Builder
		cmd := exec.Cmd{
			Path:   argv[0],
			Args:   argv,
			Env:    os.Environ(),
			Stdout: &infoOut,
			Stderr: &infoErr,
		}
		if err := cmd.Run(); err != nil {
			return fmt.Errorf("nvidia-container-cli info failed, err: %v\nstdout: %s\nstderr: %s", err, infoOut.String(), infoErr.String())
		}
		log.Debugf("nvidia-container-cli info: %v", infoOut.String())
	}

	return nil
}

// nvproxyLoadKernelModules loads NVIDIA-related kernel modules with modprobe.
func nvproxyLoadKernelModules() {
	for _, mod := range [...]string{
		"nvidia",
		"nvidia-uvm",
	} {
		argv := []string{
			"/sbin/modprobe",
			mod,
		}
		log.Debugf("Executing %q", argv)
		var stdout, stderr strings.Builder
		cmd := exec.Cmd{
			Path:   argv[0],
			Args:   argv,
			Env:    os.Environ(),
			Stdout: &stdout,
			Stderr: &stderr,
		}
		if err := cmd.Run(); err != nil {
			// This might not be fatal since modules may already be loaded. Log
			// the failure but continue.
			log.Warningf("modprobe %s failed, err: %v\nstdout: %s\nstderr: %s", mod, err, stdout.String(), stderr.String())
		}
	}
}
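
// Illustrative note (not part of the original source): the exec.Cmd values
// constructed above are equivalent to running the following on the host, once
// per module:
//
//	/sbin/modprobe nvidia
//	/sbin/modprobe nvidia-uvm
//
// Errors are logged and ignored because the modules may already be loaded or
// built into the kernel, in which case GPU setup can still proceed.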

// nvproxySetupAfterGoferUserns runs `nvidia-container-cli configure`.
// This sets up the container filesystem with bind mounts that allow it to
// use NVIDIA devices.
//
// This should be called during the Gofer setup process, as the bind mounts
// are created in the Gofer's mount namespace.
// If successful, it returns a callback function that must be called once the
// Gofer process has started.
// This function has no effect if nvproxy functionality is not requested.
//
// This function essentially replicates
// nvidia-container-toolkit:cmd/nvidia-container-runtime-hook, i.e. the
// binary that executeHook() is hard-coded to skip, with differences noted
// inline. We do this rather than move the prestart hook because the
// "runtime environment" in which prestart hooks execute is vaguely
// defined, such that nvidia-container-runtime-hook and existing runsc
// hooks differ in their expected environment.
//
// Note that nvidia-container-cli will set up files in /dev and /proc which
// are useless, since they will be hidden by sentry devtmpfs and procfs
// respectively (and some device files will have the wrong device numbers
// from the application's perspective since nvproxy may register device
// numbers in sentry VFS that differ from those on the host, e.g. for
// nvidia-uvm). These files are separately created during sandbox VFS
// construction. The requested device list is still parsed from the spec
// (specutils.ParseNvidiaVisibleDevices) and passed via --device below so that
// nvidia-container-cli configures only those devices.
func nvproxySetupAfterGoferUserns(spec *specs.Spec, conf *config.Config, goferCmd *exec.Cmd, goferDonations *donation.Agency) (func() error, error) {
	if !specutils.GPUFunctionalityRequestedViaHook(spec, conf) {
		return func() error { return nil }, nil
	}

	if spec.Root == nil {
		return nil, fmt.Errorf("spec missing root filesystem")
	}

	// nvidia-container-cli does not create this directory.
	if err := os.MkdirAll(path.Join(spec.Root.Path, "proc", "driver", "nvidia"), 0555); err != nil {
		return nil, fmt.Errorf("failed to create /proc/driver/nvidia in app filesystem: %w", err)
	}

	cliPath, err := exec.LookPath("nvidia-container-cli")
	if err != nil {
		return nil, fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err)
	}

	// On Ubuntu, ldconfig is a wrapper around ldconfig.real, and we need the latter.
	var ldconfigPath string
	if _, err := os.Stat("/sbin/ldconfig.real"); err == nil {
		ldconfigPath = "/sbin/ldconfig.real"
	} else {
		ldconfigPath = "/sbin/ldconfig"
	}

	devices, err := specutils.ParseNvidiaVisibleDevices(spec)
	if err != nil {
		return nil, fmt.Errorf("failed to get nvidia device numbers: %w", err)
	}

	// Create synchronization FD for nvproxy.
	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, err
	}
	ourEnd := os.NewFile(uintptr(fds[0]), "nvproxy sync runsc FD")
	goferEnd := os.NewFile(uintptr(fds[1]), "nvproxy sync gofer FD")
	goferDonations.DonateAndClose("sync-nvproxy-fd", goferEnd)

	return func() error {
		defer ourEnd.Close()
		argv := []string{
			cliPath,
			"--load-kmods",
			"configure",
			fmt.Sprintf("--ldconfig=@%s", ldconfigPath),
			"--no-cgroups", // runsc doesn't configure device cgroups yet
			"--utility",
			"--compute",
			fmt.Sprintf("--pid=%d", goferCmd.Process.Pid),
			fmt.Sprintf("--device=%s", devices),
			spec.Root.Path,
		}
		log.Debugf("Executing %q", argv)
		var stdout, stderr strings.Builder
		cmd := exec.Cmd{
			Path:   argv[0],
			Args:   argv,
			Env:    os.Environ(),
			Stdout: &stdout,
			Stderr: &stderr,
		}
		if err := cmd.Run(); err != nil {
			return fmt.Errorf("nvidia-container-cli configure failed, err: %v\nstdout: %s\nstderr: %s", err, stdout.String(), stderr.String())
		}
		return nil
	}, nil
}
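
// Illustrative sketch (not part of the original source): a caller obtains the
// callback before starting the gofer command and invokes it only once the
// gofer PID is known, roughly as below (cmd and donations are hypothetical
// local variables standing in for the gofer exec.Cmd and donation.Agency):
//
//	nvProxySetup, err := nvproxySetupAfterGoferUserns(spec, conf, cmd, &donations)
//	if err != nil {
//		return err
//	}
//	// ... start cmd (the gofer) and set up its userns mappings ...
//	if err := nvProxySetup(); err != nil {
//		return err
//	}
//
// This corresponds to the nvProxySetup() call in the gofer start path earlier
// in this file.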

// CheckStopped checks if the container is stopped and updates its status.
func (c *Container) CheckStopped() {
	if state, err := c.Sandbox.ContainerRuntimeState(c.ID); err != nil {
		log.Warningf("Cannot find if container %v exists, checking if sandbox %v is running, err: %v", c.ID, c.Sandbox.ID, err)
		if !c.IsSandboxRunning() {
			log.Warningf("Sandbox isn't running anymore, marking container %v as stopped", c.ID)
			c.changeStatus(Stopped)
		}
	} else if state == boot.RuntimeStateStopped {
		log.Warningf("Container %v is stopped", c.ID)
		c.changeStatus(Stopped)
	}
}
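
// Illustrative sketch (not part of the original source): callers that need an
// up-to-date view of the container (for example before reporting state) would
// refresh the status first, roughly:
//
//	c.CheckStopped()
//	if c.Status == Stopped {
//		// The container (or its whole sandbox) is gone; report or clean up.
//	}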