github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/container/container.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package container creates and manipulates containers. 16 package container 17 18 import ( 19 "bufio" 20 "context" 21 "errors" 22 "fmt" 23 "io/ioutil" 24 "os" 25 "os/exec" 26 "path" 27 "regexp" 28 "strconv" 29 "strings" 30 "syscall" 31 "time" 32 33 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 34 "github.com/MerlinKodo/gvisor/pkg/cleanup" 35 "github.com/MerlinKodo/gvisor/pkg/log" 36 "github.com/MerlinKodo/gvisor/pkg/sentry/control" 37 "github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc" 38 "github.com/MerlinKodo/gvisor/pkg/sighandling" 39 "github.com/MerlinKodo/gvisor/pkg/state/statefile" 40 "github.com/MerlinKodo/gvisor/runsc/boot" 41 "github.com/MerlinKodo/gvisor/runsc/cgroup" 42 "github.com/MerlinKodo/gvisor/runsc/config" 43 "github.com/MerlinKodo/gvisor/runsc/console" 44 "github.com/MerlinKodo/gvisor/runsc/donation" 45 "github.com/MerlinKodo/gvisor/runsc/sandbox" 46 "github.com/MerlinKodo/gvisor/runsc/specutils" 47 "github.com/cenkalti/backoff" 48 specs "github.com/opencontainers/runtime-spec/specs-go" 49 "golang.org/x/sys/unix" 50 ) 51 52 const cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent" 53 54 // validateID validates the container id. 55 func validateID(id string) error { 56 // See libcontainer/factory_linux.go. 57 idRegex := regexp.MustCompile(`^[\w+\.-]+$`) 58 if !idRegex.MatchString(id) { 59 return fmt.Errorf("invalid container id: %v", id) 60 } 61 return nil 62 } 63 64 // Container represents a containerized application. When running, the 65 // container is associated with a single Sandbox. 66 // 67 // Container metadata can be saved and loaded to disk. Within a root directory, 68 // we maintain subdirectories for each container named with the container id. 69 // The container metadata is stored as a json within the container directory 70 // in a file named "meta.json". This metadata format is defined by us and is 71 // not part of the OCI spec. 72 // 73 // Containers must write their metadata files after any change to their internal 74 // states. The entire container directory is deleted when the container is 75 // destroyed. 76 // 77 // When the container is stopped, all processes that belong to the container 78 // must be stopped before Destroy() returns. containerd makes roughly the 79 // following calls to stop a container: 80 // - First it attempts to kill the container process with 81 // 'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a 82 // separate thread, it's waiting on the container. As soon as the wait 83 // returns, it moves on to the next step: 84 // - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to 85 // the container. 'kill --all SIGKILL' waits for all processes before 86 // returning. 87 // - Containerd waits for stdin, stdout and stderr to drain and be closed. 
88 //   - It calls 'runsc delete'. The runc implementation sends SIGKILL to all
89 //     processes once more just to be sure, waits, and then proceeds with the remaining teardown.
90 //
91 // Container is thread-unsafe.
92 type Container struct {
93 	// ID is the container ID.
94 	ID string `json:"id"`
95
96 	// Spec is the OCI runtime spec that configures this container.
97 	Spec *specs.Spec `json:"spec"`
98
99 	// BundleDir is the directory containing the container bundle.
100 	BundleDir string `json:"bundleDir"`
101
102 	// CreatedAt is the time the container was created.
103 	CreatedAt time.Time `json:"createdAt"`
104
105 	// Owner is the container owner.
106 	Owner string `json:"owner"`
107
108 	// ConsoleSocket is the path to a unix domain socket that will receive
109 	// the console FD.
110 	ConsoleSocket string `json:"consoleSocket"`
111
112 	// Status is the current container Status.
113 	Status Status `json:"status"`
114
115 	// GoferPid is the PID of the gofer running alongside the sandbox. May
116 	// be 0 if the gofer has been killed.
117 	GoferPid int `json:"goferPid"`
118
119 	// Sandbox is the sandbox this container is running in. It's set when the
120 	// container is created and reset when the sandbox is destroyed.
121 	Sandbox *sandbox.Sandbox `json:"sandbox"`
122
123 	// CompatCgroup has the cgroup configuration for the container. For the single
124 	// container case, the container cgroup is set in `c.Sandbox` only. CompatCgroup
125 	// is only set for multi-container, where the `c.Sandbox` cgroup represents
126 	// the entire pod.
127 	//
128 	// Note that CompatCgroup is created only for compatibility with tools
129 	// that expect container cgroups to exist. Setting limits here makes no change
130 	// to the container in question.
131 	CompatCgroup cgroup.CgroupJSON `json:"compatCgroup"`
132
133 	// Saver handles load from/save to the state file safely from multiple
134 	// processes.
135 	Saver StateFile `json:"saver"`
136
137 	// OverlayMediums contains information about how the gofer mounts have been
138 	// overlaid. The first entry is for rootfs and the following entries are for
139 	// bind mounts in Spec.Mounts (in the same order).
140 	OverlayMediums boot.OverlayMediumFlags `json:"overlayMediums"`
141
142 	//
143 	// Fields below this line are not saved in the state file and will not
144 	// be preserved across commands.
145 	//
146
147 	// goferIsChild is set if a gofer process is a child of the current process.
148 	//
149 	// This field isn't saved to json, because only the creator of the gofer
150 	// process will have it as a child process.
151 	goferIsChild bool `nojson:"true"`
152 }
153
154 // Args is used to configure a new container.
155 type Args struct {
156 	// ID is the container's unique identifier.
157 	ID string
158
159 	// Spec is the OCI spec that describes the container.
160 	Spec *specs.Spec
161
162 	// BundleDir is the directory containing the container bundle.
163 	BundleDir string
164
165 	// ConsoleSocket is the path to a unix domain socket that will receive
166 	// the console FD. It may be empty.
167 	ConsoleSocket string
168
169 	// PIDFile is the filename where the container's root process PID will be
170 	// written to. It may be empty.
171 	PIDFile string
172
173 	// UserLog is the filename to send user-visible logs to. It may be empty.
174 	//
175 	// It only applies for the init container.
176 	UserLog string
177
178 	// Attached indicates that the sandbox lifecycle is attached to the caller.
179 	// If the caller exits, the sandbox should exit too.
180 	//
181 	// It only applies for the init container.
182 Attached bool 183 184 // PassFiles are user-supplied files from the host to be exposed to the 185 // sandboxed app. 186 PassFiles map[int]*os.File 187 188 // ExecFile is the host file used for program execution. 189 ExecFile *os.File 190 } 191 192 // New creates the container in a new Sandbox process, unless the metadata 193 // indicates that an existing Sandbox should be used. The caller must call 194 // Destroy() on the container. 195 func New(conf *config.Config, args Args) (*Container, error) { 196 log.Debugf("Create container, cid: %s, rootDir: %q", args.ID, conf.RootDir) 197 if err := validateID(args.ID); err != nil { 198 return nil, err 199 } 200 201 if err := os.MkdirAll(conf.RootDir, 0711); err != nil { 202 return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err) 203 } 204 205 if err := modifySpecForDirectfs(conf, args.Spec); err != nil { 206 return nil, fmt.Errorf("failed to modify spec for directfs: %v", err) 207 } 208 209 sandboxID := args.ID 210 if !isRoot(args.Spec) { 211 var ok bool 212 sandboxID, ok = specutils.SandboxID(args.Spec) 213 if !ok { 214 return nil, fmt.Errorf("no sandbox ID found when creating container") 215 } 216 } 217 218 c := &Container{ 219 ID: args.ID, 220 Spec: args.Spec, 221 ConsoleSocket: args.ConsoleSocket, 222 BundleDir: args.BundleDir, 223 Status: Creating, 224 CreatedAt: time.Now(), 225 Owner: os.Getenv("USER"), 226 Saver: StateFile{ 227 RootDir: conf.RootDir, 228 ID: FullID{ 229 SandboxID: sandboxID, 230 ContainerID: args.ID, 231 }, 232 }, 233 } 234 // The Cleanup object cleans up partially created containers when an error 235 // occurs. Any errors occurring during cleanup itself are ignored. 236 cu := cleanup.Make(func() { _ = c.Destroy() }) 237 defer cu.Clean() 238 239 // Lock the container metadata file to prevent concurrent creations of 240 // containers with the same id. 241 if err := c.Saver.LockForNew(); err != nil { 242 return nil, fmt.Errorf("cannot lock container metadata file: %w", err) 243 } 244 defer c.Saver.UnlockOrDie() 245 246 // If the metadata annotations indicate that this container should be started 247 // in an existing sandbox, we must do so. These are the possible metadata 248 // annotation states: 249 // 1. No annotations: it means that there is a single container and this 250 // container is obviously the root. Both container and sandbox share the 251 // ID. 252 // 2. Container type == sandbox: it means this is the root container 253 // starting the sandbox. Both container and sandbox share the same ID. 254 // 3. Container type == container: it means this is a subcontainer of an 255 // already started sandbox. In this case, container ID is different than 256 // the sandbox ID. 257 if isRoot(args.Spec) { 258 log.Debugf("Creating new sandbox for container, cid: %s", args.ID) 259 260 if args.Spec.Linux == nil { 261 args.Spec.Linux = &specs.Linux{} 262 } 263 // Don't force the use of cgroups in tests because they lack permission to do so. 264 if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 265 args.Spec.Linux.CgroupsPath = "/" + args.ID 266 } 267 var subCgroup, parentCgroup, containerCgroup cgroup.Cgroup 268 if !conf.IgnoreCgroups { 269 var err error 270 271 // Create and join cgroup before processes are created to ensure they are 272 // part of the cgroup from the start (and all their children processes). 
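			// The join-before-spawn pattern used below relies on runInCgroup
			// (defined near the end of this file), which joins the cgroup, runs
			// the callback, and then restores the previous cgroup. A rough
			// sketch of that pattern (startProcs stands in for the real
			// gofer/sandbox startup callback and is not part of this file):
			//
			//	err := runInCgroup(containerCgroup, func() error {
			//		// Anything forked/exec'd here starts inside
			//		// containerCgroup, and so do its children.
			//		return startProcs()
			//	})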
273 parentCgroup, subCgroup, err = c.setupCgroupForRoot(conf, args.Spec) 274 if err != nil { 275 return nil, fmt.Errorf("cannot set up cgroup for root: %w", err) 276 } 277 // Join the child cgroup when using cgroupfs. Joining non leaf-node 278 // cgroups is illegal in cgroupsv2 and will return EBUSY. 279 if subCgroup != nil && !conf.SystemdCgroup && cgroup.IsOnlyV2() { 280 containerCgroup = subCgroup 281 } else { 282 containerCgroup = parentCgroup 283 } 284 } 285 c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup} 286 mountHints, err := boot.NewPodMountHints(args.Spec) 287 if err != nil { 288 return nil, fmt.Errorf("error creating pod mount hints: %w", err) 289 } 290 overlayFilestoreFiles, overlayMediums, err := c.createOverlayFilestores(conf.GetOverlay2(), mountHints) 291 if err != nil { 292 return nil, err 293 } 294 c.OverlayMediums = overlayMediums 295 if err := nvProxyPreGoferHostSetup(args.Spec, conf); err != nil { 296 return nil, err 297 } 298 if err := runInCgroup(containerCgroup, func() error { 299 ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached) 300 if err != nil { 301 return fmt.Errorf("cannot create gofer process: %w", err) 302 } 303 304 // Start a new sandbox for this container. Any errors after this point 305 // must destroy the container. 306 sandArgs := &sandbox.Args{ 307 ID: sandboxID, 308 Spec: args.Spec, 309 BundleDir: args.BundleDir, 310 ConsoleSocket: args.ConsoleSocket, 311 UserLog: args.UserLog, 312 IOFiles: ioFiles, 313 MountsFile: specFile, 314 Cgroup: containerCgroup, 315 Attached: args.Attached, 316 OverlayFilestoreFiles: overlayFilestoreFiles, 317 OverlayMediums: overlayMediums, 318 MountHints: mountHints, 319 PassFiles: args.PassFiles, 320 ExecFile: args.ExecFile, 321 } 322 if specutils.GPUFunctionalityRequested(args.Spec, conf) { 323 // Expose all Nvidia devices in /dev/, because we don't know what 324 // devices future subcontainers will want. 325 searchDir := "/" 326 if conf.NVProxyDocker { 327 // For single-container use cases like Docker, the container rootfs 328 // is populated with the devices that need to be exposed. Scan that. 329 // This scan needs to happen outside the sandbox process because 330 // /rootfs/dev/nvidia* mounts made in gofer may not be propagated to 331 // sandbox's mount namespace. 332 searchDir = args.Spec.Root.Path 333 } 334 sandArgs.NvidiaDevMinors, err = specutils.FindAllGPUDevices(searchDir) 335 if err != nil { 336 return fmt.Errorf("FindAllGPUDevices: %w", err) 337 } 338 } 339 sand, err := sandbox.New(conf, sandArgs) 340 if err != nil { 341 return fmt.Errorf("cannot create sandbox: %w", err) 342 } 343 c.Sandbox = sand 344 return nil 345 346 }); err != nil { 347 return nil, err 348 } 349 } else { 350 log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID) 351 352 // Find the sandbox associated with this ID. 353 fullID := FullID{ 354 SandboxID: sandboxID, 355 ContainerID: sandboxID, 356 } 357 sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true}) 358 if err != nil { 359 return nil, fmt.Errorf("cannot load sandbox: %w", err) 360 } 361 c.Sandbox = sb.Sandbox 362 363 subCgroup, err := c.setupCgroupForSubcontainer(conf, args.Spec) 364 if err != nil { 365 return nil, err 366 } 367 c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup} 368 369 // If the console control socket file is provided, then create a new 370 // pty master/slave pair and send the TTY to the sandbox process. 
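		// Per the comment above, console.NewWithSocket creates the pty pair and
		// sends the master end over the provided unix socket; file descriptors
		// travel over unix sockets as SCM_RIGHTS control messages. A hedged
		// sketch of what the receiving side of ConsoleSocket might look like
		// (illustrative only; conn is assumed to be a *net.UnixConn accepted
		// from that socket and is not part of this file):
		//
		//	buf := make([]byte, 1)
		//	oob := make([]byte, unix.CmsgSpace(4))
		//	_, oobn, _, _, err := conn.ReadMsgUnix(buf, oob)
		//	if err != nil { /* handle */ }
		//	msgs, _ := unix.ParseSocketControlMessage(oob[:oobn])
		//	fds, _ := unix.ParseUnixRights(&msgs[0])
		//	master := os.NewFile(uintptr(fds[0]), "console-master")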
371 var tty *os.File 372 if c.ConsoleSocket != "" { 373 // Create a new TTY pair and send the master on the provided socket. 374 var err error 375 tty, err = console.NewWithSocket(c.ConsoleSocket) 376 if err != nil { 377 return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err) 378 } 379 // tty file is transferred to the sandbox, then it can be closed here. 380 defer tty.Close() 381 } 382 383 if err := c.Sandbox.CreateSubcontainer(conf, c.ID, tty); err != nil { 384 return nil, fmt.Errorf("cannot create subcontainer: %w", err) 385 } 386 } 387 c.changeStatus(Created) 388 389 // Save the metadata file. 390 if err := c.saveLocked(); err != nil { 391 return nil, err 392 } 393 394 // "If any prestart hook fails, the runtime MUST generate an error, 395 // stop and destroy the container" -OCI spec. 396 if c.Spec.Hooks != nil { 397 // Even though the hook name is Prestart, runc used to call it from create. 398 // For this reason, it's now deprecated, but the spec requires it to be 399 // called *before* CreateRuntime and CreateRuntime must be called in create. 400 // 401 // "For runtimes that implement the deprecated prestart hooks as 402 // createRuntime hooks, createRuntime hooks MUST be called after the 403 // prestart hooks." 404 if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { 405 return nil, err 406 } 407 if err := executeHooks(c.Spec.Hooks.CreateRuntime, c.State()); err != nil { 408 return nil, err 409 } 410 if len(c.Spec.Hooks.CreateContainer) > 0 { 411 log.Warningf("CreateContainer hook skipped because running inside container namespace is not supported") 412 } 413 } 414 415 // Write the PID file. Containerd considers the call to create complete after 416 // this file is created, so it must be the last thing we do. 417 if args.PIDFile != "" { 418 if err := ioutil.WriteFile(args.PIDFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil { 419 return nil, fmt.Errorf("error writing PID file: %v", err) 420 } 421 } 422 423 cu.Release() 424 return c, nil 425 } 426 427 // Start starts running the containerized process inside the sandbox. 428 func (c *Container) Start(conf *config.Config) error { 429 log.Debugf("Start container, cid: %s", c.ID) 430 431 if err := c.Saver.lock(BlockAcquire); err != nil { 432 return err 433 } 434 unlock := cleanup.Make(c.Saver.UnlockOrDie) 435 defer unlock.Clean() 436 437 if err := c.requireStatus("start", Created); err != nil { 438 return err 439 } 440 441 // "If any prestart hook fails, the runtime MUST generate an error, 442 // stop and destroy the container" -OCI spec. 443 if c.Spec.Hooks != nil && len(c.Spec.Hooks.StartContainer) > 0 { 444 log.Warningf("StartContainer hook skipped because running inside container namespace is not supported") 445 } 446 447 if isRoot(c.Spec) { 448 if err := c.Sandbox.StartRoot(conf); err != nil { 449 return err 450 } 451 } else { 452 overlayFilestoreFiles, overlayMediums, err := c.createOverlayFilestores(conf.GetOverlay2(), c.Sandbox.MountHints) 453 if err != nil { 454 return err 455 } 456 c.OverlayMediums = overlayMediums 457 // Join cgroup to start gofer process to ensure it's part of the cgroup from 458 // the start (and all their children processes). 459 if err := runInCgroup(c.Sandbox.CgroupJSON.Cgroup, func() error { 460 // Create the gofer process. 
461 goferFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) 462 if err != nil { 463 return err 464 } 465 defer func() { 466 _ = mountsFile.Close() 467 for _, f := range goferFiles { 468 _ = f.Close() 469 } 470 }() 471 472 cleanMounts, err := specutils.ReadMounts(mountsFile) 473 if err != nil { 474 return fmt.Errorf("reading mounts file: %v", err) 475 } 476 c.Spec.Mounts = cleanMounts 477 478 // Setup stdios if the container is not using terminal. Otherwise TTY was 479 // already setup in create. 480 var stdios []*os.File 481 if !c.Spec.Process.Terminal { 482 stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr} 483 } 484 485 return c.Sandbox.StartSubcontainer(c.Spec, conf, c.ID, stdios, goferFiles, overlayFilestoreFiles, overlayMediums) 486 }); err != nil { 487 return err 488 } 489 } 490 491 // "If any poststart hook fails, the runtime MUST log a warning, but 492 // the remaining hooks and lifecycle continue as if the hook had 493 // succeeded" -OCI spec. 494 if c.Spec.Hooks != nil { 495 executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State()) 496 } 497 498 c.changeStatus(Running) 499 if err := c.saveLocked(); err != nil { 500 return err 501 } 502 503 // Release lock before adjusting OOM score because the lock is acquired there. 504 unlock.Clean() 505 506 // Adjust the oom_score_adj for sandbox. This must be done after saveLocked(). 507 if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Spec, c.Saver.RootDir, false); err != nil { 508 return err 509 } 510 511 // Set container's oom_score_adj to the gofer since it is dedicated to 512 // the container, in case the gofer uses up too much memory. 513 return c.adjustGoferOOMScoreAdj() 514 } 515 516 // Restore takes a container and replaces its kernel and file system 517 // to restore a container from its state file. 518 func (c *Container) Restore(conf *config.Config, restoreFile string) error { 519 log.Debugf("Restore container, cid: %s", c.ID) 520 if err := c.Saver.lock(BlockAcquire); err != nil { 521 return err 522 } 523 defer c.Saver.UnlockOrDie() 524 525 if err := c.requireStatus("restore", Created); err != nil { 526 return err 527 } 528 529 // "If any prestart hook fails, the runtime MUST generate an error, 530 // stop and destroy the container" -OCI spec. 531 if c.Spec.Hooks != nil && len(c.Spec.Hooks.StartContainer) > 0 { 532 log.Warningf("StartContainer hook skipped because running inside container namespace is not supported") 533 } 534 535 if err := c.Sandbox.Restore(conf, c.ID, restoreFile); err != nil { 536 return err 537 } 538 c.changeStatus(Running) 539 return c.saveLocked() 540 } 541 542 // Run is a helper that calls Create + Start + Wait. 543 func Run(conf *config.Config, args Args) (unix.WaitStatus, error) { 544 log.Debugf("Run container, cid: %s, rootDir: %q", args.ID, conf.RootDir) 545 c, err := New(conf, args) 546 if err != nil { 547 return 0, fmt.Errorf("creating container: %v", err) 548 } 549 // Clean up partially created container if an error occurs. 550 // Any errors returned by Destroy() itself are ignored. 551 cu := cleanup.Make(func() { 552 c.Destroy() 553 }) 554 defer cu.Clean() 555 556 if conf.RestoreFile != "" { 557 log.Debugf("Restore: %v", conf.RestoreFile) 558 if err := c.Restore(conf, conf.RestoreFile); err != nil { 559 return 0, fmt.Errorf("starting container: %v", err) 560 } 561 } else { 562 if err := c.Start(conf); err != nil { 563 return 0, fmt.Errorf("starting container: %v", err) 564 } 565 } 566 567 // If we allocate a terminal, forward signals to the sandbox process. 
568 // Otherwise, Ctrl+C will terminate this process and its children, 569 // including the terminal. 570 if c.Spec.Process.Terminal { 571 stopForwarding := c.ForwardSignals(0, true /* fgProcess */) 572 defer stopForwarding() 573 } 574 575 if args.Attached { 576 return c.Wait() 577 } 578 cu.Release() 579 return 0, nil 580 } 581 582 // Execute runs the specified command in the container. It returns the PID of 583 // the newly created process. 584 func (c *Container) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) { 585 log.Debugf("Execute in container, cid: %s, args: %+v", c.ID, args) 586 if err := c.requireStatus("execute in", Created, Running); err != nil { 587 return 0, err 588 } 589 args.ContainerID = c.ID 590 return c.Sandbox.Execute(conf, args) 591 } 592 593 // Event returns events for the container. 594 func (c *Container) Event() (*boot.EventOut, error) { 595 log.Debugf("Getting events for container, cid: %s", c.ID) 596 if err := c.requireStatus("get events for", Created, Running, Paused); err != nil { 597 return nil, err 598 } 599 event, err := c.Sandbox.Event(c.ID) 600 if err != nil { 601 return nil, err 602 } 603 604 // Some stats can utilize host cgroups for accuracy. 605 c.populateStats(event) 606 607 return event, nil 608 } 609 610 // PortForward starts port forwarding to the container. 611 func (c *Container) PortForward(opts *boot.PortForwardOpts) error { 612 if err := c.requireStatus("port forward", Running); err != nil { 613 return err 614 } 615 opts.ContainerID = c.ID 616 return c.Sandbox.PortForward(opts) 617 } 618 619 // SandboxPid returns the Getpid of the sandbox the container is running in, or -1 if the 620 // container is not running. 621 func (c *Container) SandboxPid() int { 622 if err := c.requireStatus("get PID", Created, Running, Paused); err != nil { 623 return -1 624 } 625 return c.Sandbox.Getpid() 626 } 627 628 // Wait waits for the container to exit, and returns its WaitStatus. 629 // Call to wait on a stopped container is needed to retrieve the exit status 630 // and wait returns immediately. 631 func (c *Container) Wait() (unix.WaitStatus, error) { 632 log.Debugf("Wait on container, cid: %s", c.ID) 633 ws, err := c.Sandbox.Wait(c.ID) 634 if err == nil { 635 // Wait succeeded, container is not running anymore. 636 c.changeStatus(Stopped) 637 } 638 return ws, err 639 } 640 641 // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and 642 // returns its WaitStatus. 643 func (c *Container) WaitRootPID(pid int32) (unix.WaitStatus, error) { 644 log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID) 645 if !c.IsSandboxRunning() { 646 return 0, fmt.Errorf("sandbox is not running") 647 } 648 return c.Sandbox.WaitPID(c.Sandbox.ID, pid) 649 } 650 651 // WaitPID waits for process 'pid' in the container's PID namespace and returns 652 // its WaitStatus. 653 func (c *Container) WaitPID(pid int32) (unix.WaitStatus, error) { 654 log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID) 655 if !c.IsSandboxRunning() { 656 return 0, fmt.Errorf("sandbox is not running") 657 } 658 return c.Sandbox.WaitPID(c.ID, pid) 659 } 660 661 // SignalContainer sends the signal to the container. If all is true and signal 662 // is SIGKILL, then waits for all processes to exit before returning. 663 // SignalContainer returns an error if the container is already stopped. 664 // TODO(b/113680494): Distinguish different error types. 
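// An illustrative sketch (not from the original file) of the containerd stop
// sequence described at the top of this file, expressed against this API:
//
//	_ = c.SignalContainer(unix.SIGTERM, false) // ask init to exit
//	// ... after a grace period, escalate and wait for every process:
//	_ = c.SignalContainer(unix.SIGKILL, true /* all */)
//	_, _ = c.Wait()
//	_ = c.Destroy()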
665 func (c *Container) SignalContainer(sig unix.Signal, all bool) error {
666 	log.Debugf("Signal container, cid: %s, signal: %v (%d)", c.ID, sig, sig)
667 	// Signaling a container in the Stopped state is allowed. When all=false,
668 	// an error will be returned anyway; when all=true, this allows
669 	// sending signals to other processes inside the container even
670 	// after the init process exits. This is especially useful for
671 	// container cleanup.
672 	if err := c.requireStatus("signal", Running, Stopped); err != nil {
673 		return err
674 	}
675 	if !c.IsSandboxRunning() {
676 		return fmt.Errorf("sandbox is not running")
677 	}
678 	return c.Sandbox.SignalContainer(c.ID, sig, all)
679 }
680
681 // SignalProcess sends sig to a specific process in the container.
682 func (c *Container) SignalProcess(sig unix.Signal, pid int32) error {
683 	log.Debugf("Signal process %d in container, cid: %s, signal: %v (%d)", pid, c.ID, sig, sig)
684 	if err := c.requireStatus("signal a process inside", Running); err != nil {
685 		return err
686 	}
687 	if !c.IsSandboxRunning() {
688 		return fmt.Errorf("sandbox is not running")
689 	}
690 	return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false)
691 }
692
693 // ForwardSignals forwards all signals received by the current process to the
694 // container process inside the sandbox. It returns a function that will stop
695 // forwarding signals.
696 func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() {
697 	log.Debugf("Forwarding all signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess)
698 	stop := sighandling.StartSignalForwarding(func(sig linux.Signal) {
699 		log.Debugf("Forwarding signal %d to container, cid: %s, PID: %d, fgProcess: %t", sig, c.ID, pid, fgProcess)
700 		if err := c.Sandbox.SignalProcess(c.ID, pid, unix.Signal(sig), fgProcess); err != nil {
701 			log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err)
702 		}
703 	})
704 	return func() {
705 		log.Debugf("Done forwarding signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess)
706 		stop()
707 	}
708 }
709
710 // Checkpoint sends the checkpoint call to the container.
711 // The statefile will be written to f, the file at the specified image-path.
712 func (c *Container) Checkpoint(f *os.File, options statefile.Options) error {
713 	log.Debugf("Checkpoint container, cid: %s", c.ID)
714 	if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil {
715 		return err
716 	}
717 	return c.Sandbox.Checkpoint(c.ID, f, options)
718 }
719
720 // Pause suspends the container and its kernel.
721 // The call only succeeds if the container's status is created or running.
722 func (c *Container) Pause() error {
723 	log.Debugf("Pausing container, cid: %s", c.ID)
724 	if err := c.Saver.lock(BlockAcquire); err != nil {
725 		return err
726 	}
727 	defer c.Saver.UnlockOrDie()
728
729 	if c.Status != Created && c.Status != Running {
730 		return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
731 	}
732
733 	if err := c.Sandbox.Pause(c.ID); err != nil {
734 		return fmt.Errorf("pausing container %q: %v", c.ID, err)
735 	}
736 	c.changeStatus(Paused)
737 	return c.saveLocked()
738 }
739
740 // Resume unpauses the container and its kernel.
741 // The call only succeeds if the container's status is paused.
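// A minimal usage sketch (not part of the original file); both calls persist
// the new status to the state file before returning:
//
//	if err := c.Pause(); err != nil {
//		return err
//	}
//	// ... the container is frozen; later:
//	if err := c.Resume(); err != nil {
//		return err
//	}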
742 func (c *Container) Resume() error { 743 log.Debugf("Resuming container, cid: %s", c.ID) 744 if err := c.Saver.lock(BlockAcquire); err != nil { 745 return err 746 } 747 defer c.Saver.UnlockOrDie() 748 749 if c.Status != Paused { 750 return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status) 751 } 752 if err := c.Sandbox.Resume(c.ID); err != nil { 753 return fmt.Errorf("resuming container: %v", err) 754 } 755 c.changeStatus(Running) 756 return c.saveLocked() 757 } 758 759 // State returns the metadata of the container. 760 func (c *Container) State() specs.State { 761 return specs.State{ 762 Version: specs.Version, 763 ID: c.ID, 764 Status: c.Status, 765 Pid: c.SandboxPid(), 766 Bundle: c.BundleDir, 767 Annotations: c.Spec.Annotations, 768 } 769 } 770 771 // Processes retrieves the list of processes and associated metadata inside a 772 // container. 773 func (c *Container) Processes() ([]*control.Process, error) { 774 if err := c.requireStatus("get processes of", Running, Paused); err != nil { 775 return nil, err 776 } 777 return c.Sandbox.Processes(c.ID) 778 } 779 780 // Destroy stops all processes and frees all resources associated with the 781 // container. 782 func (c *Container) Destroy() error { 783 log.Debugf("Destroy container, cid: %s", c.ID) 784 785 if err := c.Saver.lock(BlockAcquire); err != nil { 786 return err 787 } 788 defer func() { 789 c.Saver.UnlockOrDie() 790 _ = c.Saver.close() 791 }() 792 793 // Stored for later use as stop() sets c.Sandbox to nil. 794 sb := c.Sandbox 795 796 // We must perform the following cleanup steps: 797 // * stop the container and gofer processes, 798 // * remove the container filesystem on the host, and 799 // * delete the container metadata directory. 800 // 801 // It's possible for one or more of these steps to fail, but we should 802 // do our best to perform all of the cleanups. Hence, we keep a slice 803 // of errors return their concatenation. 804 var errs []string 805 if err := c.stop(); err != nil { 806 err = fmt.Errorf("stopping container: %v", err) 807 log.Warningf("%v", err) 808 errs = append(errs, err.Error()) 809 } 810 811 if err := c.Saver.Destroy(); err != nil { 812 err = fmt.Errorf("deleting container state files: %v", err) 813 log.Warningf("%v", err) 814 errs = append(errs, err.Error()) 815 } 816 817 // Clean up overlay filestore files created in their respective mounts. 818 c.forEachSelfOverlay(func(mountSrc string) { 819 filestorePath := boot.SelfOverlayFilestorePath(mountSrc, c.sandboxID()) 820 if err := os.Remove(filestorePath); err != nil { 821 err = fmt.Errorf("failed to delete filestore file %q: %v", filestorePath, err) 822 log.Warningf("%v", err) 823 errs = append(errs, err.Error()) 824 } 825 }) 826 827 c.changeStatus(Stopped) 828 829 // Adjust oom_score_adj for the sandbox. This must be done after the container 830 // is stopped and the directory at c.Root is removed. 831 // 832 // Use 'sb' to tell whether it has been executed before because Destroy must 833 // be idempotent. 834 if sb != nil { 835 if err := adjustSandboxOOMScoreAdj(sb, c.Spec, c.Saver.RootDir, true); err != nil { 836 errs = append(errs, err.Error()) 837 } 838 } 839 840 // "If any poststop hook fails, the runtime MUST log a warning, but the 841 // remaining hooks and lifecycle continue as if the hook had 842 // succeeded" - OCI spec. 843 // 844 // Based on the OCI, "The post-stop hooks MUST be called after the container 845 // is deleted but before the delete operation returns" 846 // Run it here to: 847 // 1) Conform to the OCI. 
848 // 2) Make sure it only runs once, because the root has been deleted, the 849 // container can't be loaded again. 850 if c.Spec.Hooks != nil { 851 executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) 852 } 853 854 if len(errs) == 0 { 855 return nil 856 } 857 return fmt.Errorf(strings.Join(errs, "\n")) 858 } 859 860 func (c *Container) sandboxID() string { 861 return c.Saver.ID.SandboxID 862 } 863 864 func (c *Container) forEachSelfOverlay(fn func(mountSrc string)) { 865 if c.OverlayMediums == nil { 866 // Sub container not started? Skip. 867 return 868 } 869 if c.OverlayMediums[0] == boot.SelfMedium { 870 fn(c.Spec.Root.Path) 871 } 872 goferMntIdx := 1 // First index is for rootfs. 873 for i := range c.Spec.Mounts { 874 if !specutils.IsGoferMount(c.Spec.Mounts[i]) { 875 continue 876 } 877 if c.OverlayMediums[goferMntIdx] == boot.SelfMedium { 878 fn(c.Spec.Mounts[i].Source) 879 } 880 goferMntIdx++ 881 } 882 } 883 884 // createOverlayFilestores creates the regular files that will back the tmpfs 885 // upper mount for overlay mounts. It also returns information about the 886 // overlay medium used for each bind mount. 887 func (c *Container) createOverlayFilestores(conf config.Overlay2, mountHints *boot.PodMountHints) ([]*os.File, []boot.OverlayMedium, error) { 888 var filestoreFiles []*os.File 889 var overlayMediums []boot.OverlayMedium 890 891 // Handle root mount first. 892 shouldOverlay := conf.RootEnabled() && !c.Spec.Root.Readonly 893 filestore, medium, err := c.createOverlayFilestore(conf, c.Spec.Root.Path, shouldOverlay, nil /* hint */) 894 if err != nil { 895 return nil, nil, err 896 } 897 if filestore != nil { 898 filestoreFiles = append(filestoreFiles, filestore) 899 } 900 overlayMediums = append(overlayMediums, medium) 901 902 // Handle bind mounts. 903 for i := range c.Spec.Mounts { 904 if !specutils.IsGoferMount(c.Spec.Mounts[i]) { 905 continue 906 } 907 hint := mountHints.FindMount(&c.Spec.Mounts[i]) 908 shouldOverlay := conf.SubMountEnabled() && !specutils.IsReadonlyMount(c.Spec.Mounts[i].Options) 909 filestore, medium, err := c.createOverlayFilestore(conf, c.Spec.Mounts[i].Source, shouldOverlay, hint) 910 if err != nil { 911 return nil, nil, err 912 } 913 if filestore != nil { 914 filestoreFiles = append(filestoreFiles, filestore) 915 } 916 overlayMediums = append(overlayMediums, medium) 917 } 918 for _, filestore := range filestoreFiles { 919 // Perform this work around outside the sandbox. The sandbox may already be 920 // running with seccomp filters that do not allow this. 921 pgalloc.IMAWorkAroundForMemFile(filestore.Fd()) 922 } 923 return filestoreFiles, overlayMediums, nil 924 } 925 926 func (c *Container) createOverlayFilestore(conf config.Overlay2, mountSrc string, shouldOverlay bool, hint *boot.MountHint) (*os.File, boot.OverlayMedium, error) { 927 if hint != nil && hint.ShouldOverlay() { 928 // MountHint information takes precedence over shouldOverlay. 
929 		return c.createOverlayFilestoreInSelf(mountSrc)
930 	}
931 	switch {
932 	case !shouldOverlay:
933 		return nil, boot.NoOverlay, nil
934 	case conf.IsBackedByMemory():
935 		return nil, boot.MemoryMedium, nil
936 	case conf.IsBackedBySelf():
937 		return c.createOverlayFilestoreInSelf(mountSrc)
938 	default:
939 		return c.createOverlayFilestoreInDir(conf)
940 	}
941 }
942
943 func (c *Container) createOverlayFilestoreInSelf(mountSrc string) (*os.File, boot.OverlayMedium, error) {
944 	mountSrcInfo, err := os.Stat(mountSrc)
945 	if err != nil {
946 		return nil, boot.NoOverlay, fmt.Errorf("failed to stat mount %q to see if it is a directory: %v", mountSrc, err)
947 	}
948 	if !mountSrcInfo.IsDir() {
949 		log.Warningf("overlay2 self medium is only supported for directory mounts, but mount %q is not a directory, falling back to memory", mountSrc)
950 		return nil, boot.MemoryMedium, nil
951 	}
952 	// Create the self overlay filestore file.
953 	filestorePath := boot.SelfOverlayFilestorePath(mountSrc, c.sandboxID())
954 	filestoreFD, err := unix.Open(filestorePath, unix.O_RDWR|unix.O_CREAT|unix.O_EXCL|unix.O_CLOEXEC, 0666)
955 	if err != nil {
956 		if err == unix.EEXIST {
957 			// Note that if the same submount is mounted multiple times within the
958 			// same sandbox, then the overlay option doesn't work correctly,
959 			// because each overlay mount is independent and changes to one are
960 			// not visible to the other. Given "overlay on repeated submounts" is
961 			// already broken, we don't support such a scenario with the self
962 			// medium. The filestore file will already exist for such a case.
963 			return nil, boot.NoOverlay, fmt.Errorf("%q mount source already has a filestore file at %q; repeated submounts are not supported with the self medium", mountSrc, filestorePath)
964 		}
965 		return nil, boot.NoOverlay, fmt.Errorf("failed to create filestore file inside %q: %v", mountSrc, err)
966 	}
967 	log.Debugf("Created overlay filestore file at %q for mount source %q", filestorePath, mountSrc)
968 	// Filestore in self should be a named path because it needs to be
969 	// discoverable via path traversal so that k8s can scan the filesystem
970 	// and apply any limits appropriately (like local ephemeral storage
971 	// limits). So don't delete it. These files will be unlinked when the
972 	// container is destroyed. This makes the self medium appropriate for k8s.
973 	return os.NewFile(uintptr(filestoreFD), filestorePath), boot.SelfMedium, nil
974 }
975
976 func (c *Container) createOverlayFilestoreInDir(conf config.Overlay2) (*os.File, boot.OverlayMedium, error) {
977 	filestoreDir := conf.HostFileDir()
978 	fileInfo, err := os.Stat(filestoreDir)
979 	if err != nil {
980 		return nil, boot.NoOverlay, fmt.Errorf("failed to stat overlay filestore directory %q: %v", filestoreDir, err)
981 	}
982 	if !fileInfo.IsDir() {
983 		return nil, boot.NoOverlay, fmt.Errorf("overlay2 flag should specify an existing directory")
984 	}
985 	// Create an unnamed temporary file in the filestore directory which will be
986 	// deleted when the last FD on it is closed. We don't use O_TMPFILE because
987 	// it is not supported on all filesystems. So we simulate it by creating a
988 	// named file and then immediately unlinking it while keeping an FD on it.
989 	// This file will be deleted when the container exits.
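	// For reference, the O_TMPFILE variant that the comment above says is
	// deliberately avoided would look roughly like this (illustrative only;
	// O_TMPFILE is Linux-specific and not supported by all filesystems):
	//
	//	fd, err := unix.Open(filestoreDir, unix.O_TMPFILE|unix.O_RDWR|unix.O_CLOEXEC, 0600)
	//	if err == nil {
	//		filestore := os.NewFile(uintptr(fd), "overlay-filestore")
	//		// use filestore; no unlink needed, the name was never created
	//	}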
990 filestoreFile, err := os.CreateTemp(filestoreDir, "runsc-overlay-filestore-") 991 if err != nil { 992 return nil, boot.NoOverlay, fmt.Errorf("failed to create a temporary file inside %q: %v", filestoreDir, err) 993 } 994 if err := unix.Unlink(filestoreFile.Name()); err != nil { 995 return nil, boot.NoOverlay, fmt.Errorf("failed to unlink temporary file %q: %v", filestoreFile.Name(), err) 996 } 997 log.Debugf("Created an unnamed overlay filestore file at %q", filestoreDir) 998 return filestoreFile, boot.AnonDirMedium, nil 999 } 1000 1001 // saveLocked saves the container metadata to a file. 1002 // 1003 // Precondition: container must be locked with container.lock(). 1004 func (c *Container) saveLocked() error { 1005 log.Debugf("Save container, cid: %s", c.ID) 1006 if err := c.Saver.SaveLocked(c); err != nil { 1007 return fmt.Errorf("saving container metadata: %v", err) 1008 } 1009 return nil 1010 } 1011 1012 // stop stops the container (for regular containers) or the sandbox (for 1013 // root containers), and waits for the container or sandbox and the gofer 1014 // to stop. If any of them doesn't stop before timeout, an error is returned. 1015 func (c *Container) stop() error { 1016 var parentCgroup cgroup.Cgroup 1017 1018 if c.Sandbox != nil { 1019 log.Debugf("Destroying container, cid: %s", c.ID) 1020 if err := c.Sandbox.DestroyContainer(c.ID); err != nil { 1021 return fmt.Errorf("destroying container %q: %v", c.ID, err) 1022 } 1023 // Only uninstall parentCgroup for sandbox stop. 1024 if c.Sandbox.IsRootContainer(c.ID) { 1025 parentCgroup = c.Sandbox.CgroupJSON.Cgroup 1026 } 1027 // Only set sandbox to nil after it has been told to destroy the container. 1028 c.Sandbox = nil 1029 } 1030 1031 // Try killing gofer if it does not exit with container. 1032 if c.GoferPid != 0 { 1033 log.Debugf("Killing gofer for container, cid: %s, PID: %d", c.ID, c.GoferPid) 1034 if err := unix.Kill(c.GoferPid, unix.SIGKILL); err != nil { 1035 // The gofer may already be stopped, log the error. 1036 log.Warningf("Error sending signal %d to gofer %d: %v", unix.SIGKILL, c.GoferPid, err) 1037 } 1038 } 1039 1040 if err := c.waitForStopped(); err != nil { 1041 return err 1042 } 1043 1044 // Delete container cgroup if any. 1045 if c.CompatCgroup.Cgroup != nil { 1046 if err := c.CompatCgroup.Cgroup.Uninstall(); err != nil { 1047 return err 1048 } 1049 } 1050 // Gofer is running inside parentCgroup, so Cgroup.Uninstall has to be called 1051 // after the gofer has stopped. 1052 if parentCgroup != nil { 1053 if err := parentCgroup.Uninstall(); err != nil { 1054 return err 1055 } 1056 } 1057 return nil 1058 } 1059 1060 func (c *Container) waitForStopped() error { 1061 if c.GoferPid == 0 { 1062 return nil 1063 } 1064 1065 if c.IsSandboxRunning() { 1066 if err := c.SignalContainer(unix.Signal(0), false); err == nil { 1067 return fmt.Errorf("container is still running") 1068 } 1069 } 1070 1071 if c.goferIsChild { 1072 // The gofer process is a child of the current process, 1073 // so we can wait it and collect its zombie. 
1074 		if _, err := unix.Wait4(int(c.GoferPid), nil, 0, nil); err != nil {
1075 			return fmt.Errorf("error waiting for the gofer process: %v", err)
1076 		}
1077 		c.GoferPid = 0
1078 		return nil
1079 	}
1080
1081 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
1082 	defer cancel()
1083 	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
1084 	op := func() error {
1085 		if err := unix.Kill(c.GoferPid, 0); err == nil {
1086 			return fmt.Errorf("gofer is still running")
1087 		}
1088 		c.GoferPid = 0
1089 		return nil
1090 	}
1091 	return backoff.Retry(op, b)
1092 }
1093
1094 func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) {
1095 	donations := donation.Agency{}
1096 	defer donations.Close()
1097
1098 	if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
1099 		return nil, nil, err
1100 	}
1101 	if conf.DebugLog != "" {
1102 		test := ""
1103 		if len(conf.TestOnlyTestNameEnv) != 0 {
1104 			// Fetch the test name if one is provided and the test only flag was set.
1105 			if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
1106 				test = t
1107 			}
1108 		}
1109 		if specutils.IsDebugCommand(conf, "gofer") {
1110 			if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "gofer", test); err != nil {
1111 				return nil, nil, err
1112 			}
1113 		}
1114 	}
1115
1116 	// Start with the general config flags.
1117 	cmd := exec.Command(specutils.ExePath, conf.ToFlags()...)
1118 	cmd.SysProcAttr = &unix.SysProcAttr{
1119 		// Detach from session. Otherwise, signals sent to the foreground process
1120 		// will also be forwarded by this process, resulting in duplicate signals.
1121 		Setsid: true,
1122 	}
1123
1124 	// Set Args[0] to make it easier to spot the gofer process. Otherwise it's
1125 	// shown as `exe`.
1126 	cmd.Args[0] = "runsc-gofer"
1127
1128 	// Transfer FDs that need to be present before the "gofer" command.
1129 	// Start at 3 because 0, 1, and 2 are taken by stdin/out/err.
1130 	nextFD := donations.Transfer(cmd, 3)
1131
1132 	cmd.Args = append(cmd.Args, "gofer", "--bundle", bundleDir)
1133 	cmd.Args = append(cmd.Args, "--overlay-mediums="+c.OverlayMediums.String())
1134
1135 	// Open the spec file to donate to the sandbox.
1136 	specFile, err := specutils.OpenSpec(bundleDir)
1137 	if err != nil {
1138 		return nil, nil, fmt.Errorf("opening spec file: %v", err)
1139 	}
1140 	donations.DonateAndClose("spec-fd", specFile)
1141
1142 	// Donate any profile FDs to the gofer.
1143 	if err := c.donateGoferProfileFDs(conf, &donations); err != nil {
1144 		return nil, nil, fmt.Errorf("donating gofer profile fds: %w", err)
1145 	}
1146
1147 	// Create a pipe that allows the gofer to send the mount list to the sandbox
1148 	// after all paths have been resolved.
1149 	mountsSand, mountsGofer, err := os.Pipe()
1150 	if err != nil {
1151 		return nil, nil, err
1152 	}
1153 	donations.DonateAndClose("mounts-fd", mountsGofer)
1154
1155 	// Add the root mount and then add any other additional mounts.
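	// For example, a spec with a rootfs plus two gofer-backed bind mounts
	// yields mountCount == 3 below, and therefore three AF_UNIX socket pairs:
	// the "gofer IO FD" ends are donated to the gofer command as io-fds, while
	// the "sandbox IO FD" ends are returned to the caller to be passed to the
	// sandbox.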
1156 mountCount := 1 1157 for _, m := range spec.Mounts { 1158 if specutils.IsGoferMount(m) { 1159 mountCount++ 1160 } 1161 } 1162 1163 sandEnds := make([]*os.File, 0, mountCount) 1164 for i := 0; i < mountCount; i++ { 1165 fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) 1166 if err != nil { 1167 return nil, nil, err 1168 } 1169 sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD")) 1170 1171 goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD") 1172 donations.DonateAndClose("io-fds", goferEnd) 1173 } 1174 1175 if attached { 1176 // The gofer is attached to the lifetime of this process, so it 1177 // should synchronously die when this process dies. 1178 cmd.SysProcAttr.Pdeathsig = unix.SIGKILL 1179 } 1180 1181 // Enter new namespaces to isolate from the rest of the system. Don't unshare 1182 // cgroup because gofer is added to a cgroup in the caller's namespace. 1183 nss := []specs.LinuxNamespace{ 1184 {Type: specs.IPCNamespace}, 1185 {Type: specs.MountNamespace}, 1186 {Type: specs.NetworkNamespace}, 1187 {Type: specs.PIDNamespace}, 1188 {Type: specs.UTSNamespace}, 1189 } 1190 1191 rootlessEUID := unix.Geteuid() != 0 1192 // Setup any uid/gid mappings, and create or join the configured user 1193 // namespace so the gofer's view of the filesystem aligns with the 1194 // users in the sandbox. 1195 if !rootlessEUID { 1196 if userNS, ok := specutils.GetNS(specs.UserNamespace, spec); ok { 1197 nss = append(nss, userNS) 1198 specutils.SetUIDGIDMappings(cmd, spec) 1199 // We need to set UID and GID to have capabilities in a new user namespace. 1200 cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} 1201 } 1202 } else { 1203 userNS, ok := specutils.GetNS(specs.UserNamespace, spec) 1204 if !ok { 1205 return nil, nil, fmt.Errorf("unable to run a rootless container without userns") 1206 } 1207 nss = append(nss, userNS) 1208 syncFile, err := sandbox.ConfigureCmdForRootless(cmd, &donations) 1209 if err != nil { 1210 return nil, nil, err 1211 } 1212 defer syncFile.Close() 1213 } 1214 1215 nvProxySetup, err := nvproxySetupAfterGoferUserns(spec, conf, cmd, &donations) 1216 if err != nil { 1217 return nil, nil, fmt.Errorf("setting up nvproxy for gofer: %w", err) 1218 } 1219 1220 donations.Transfer(cmd, nextFD) 1221 1222 // Start the gofer in the given namespace. 1223 donation.LogDonations(cmd) 1224 log.Debugf("Starting gofer: %s %v", cmd.Path, cmd.Args) 1225 if err := specutils.StartInNS(cmd, nss); err != nil { 1226 return nil, nil, fmt.Errorf("gofer: %v", err) 1227 } 1228 log.Infof("Gofer started, PID: %d", cmd.Process.Pid) 1229 c.GoferPid = cmd.Process.Pid 1230 c.goferIsChild = true 1231 1232 // Set up and synchronize rootless mode userns mappings. 1233 if rootlessEUID { 1234 if err := sandbox.SetUserMappings(spec, cmd.Process.Pid); err != nil { 1235 return nil, nil, err 1236 } 1237 } 1238 1239 // Set up nvproxy within the Gofer namespace. 1240 if err := nvProxySetup(); err != nil { 1241 return nil, nil, fmt.Errorf("nvproxy setup: %w", err) 1242 } 1243 1244 return sandEnds, mountsSand, nil 1245 } 1246 1247 // changeStatus transitions from one status to another ensuring that the 1248 // transition is valid. 1249 func (c *Container) changeStatus(s Status) { 1250 switch s { 1251 case Creating: 1252 // Initial state, never transitions to it. 
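		// The checks in this switch enforce the following transition graph
		// (a summary derived from the code below, not part of the original
		// comments):
		//
		//	Creating -> Created | Stopped
		//	Created  -> Running | Stopped
		//	Running  -> Paused  | Stopped
		//	Paused   -> Running
		//	Stopped  -> Stopped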
1253 		panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
1254
1255 	case Created:
1256 		if c.Status != Creating {
1257 			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
1258 		}
1259 		if c.Sandbox == nil {
1260 			panic("sandbox cannot be nil")
1261 		}
1262
1263 	case Paused:
1264 		if c.Status != Running {
1265 			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
1266 		}
1267 		if c.Sandbox == nil {
1268 			panic("sandbox cannot be nil")
1269 		}
1270
1271 	case Running:
1272 		if c.Status != Created && c.Status != Paused {
1273 			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
1274 		}
1275 		if c.Sandbox == nil {
1276 			panic("sandbox cannot be nil")
1277 		}
1278
1279 	case Stopped:
1280 		if c.Status != Creating && c.Status != Created && c.Status != Running && c.Status != Stopped {
1281 			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
1282 		}
1283
1284 	default:
1285 		panic(fmt.Sprintf("invalid new state: %v", s))
1286 	}
1287 	c.Status = s
1288 }
1289
1290 // IsSandboxRunning returns true if the sandbox exists and is running.
1291 func (c *Container) IsSandboxRunning() bool {
1292 	return c.Sandbox != nil && c.Sandbox.IsRunning()
1293 }
1294
1295 // HasCapabilityInAnySet returns true if the given capability is in any of the
1296 // capability sets of the container process.
1297 func (c *Container) HasCapabilityInAnySet(capability linux.Capability) bool {
1298 	capString := capability.String()
1299 	for _, set := range [5][]string{
1300 		c.Spec.Process.Capabilities.Bounding,
1301 		c.Spec.Process.Capabilities.Effective,
1302 		c.Spec.Process.Capabilities.Inheritable,
1303 		c.Spec.Process.Capabilities.Permitted,
1304 		c.Spec.Process.Capabilities.Ambient,
1305 	} {
1306 		for _, c := range set {
1307 			if c == capString {
1308 				return true
1309 			}
1310 		}
1311 	}
1312 	return false
1313 }
1314
1315 // RunsAsUID0 returns true if the container process runs with UID 0 (root).
1316 func (c *Container) RunsAsUID0() bool {
1317 	return c.Spec.Process.User.UID == 0
1318 }
1319
1320 func (c *Container) requireStatus(action string, statuses ...Status) error {
1321 	for _, s := range statuses {
1322 		if c.Status == s {
1323 			return nil
1324 		}
1325 	}
1326 	return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
1327 }
1328
1329 // IsSandboxRoot returns true if this container is its sandbox's root container.
1330 func (c *Container) IsSandboxRoot() bool {
1331 	return isRoot(c.Spec)
1332 }
1333
1334 func isRoot(spec *specs.Spec) bool {
1335 	return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer
1336 }
1337
1338 // runInCgroup executes fn inside the specified cgroup. If cg is nil, fn is
1339 // executed in the current context.
1340 func runInCgroup(cg cgroup.Cgroup, fn func() error) error {
1341 	if cg == nil {
1342 		return fn()
1343 	}
1344 	restore, err := cg.Join()
1345 	if err != nil {
1346 		return err
1347 	}
1348 	defer restore()
1349 	return fn()
1350 }
1351
1352 // adjustGoferOOMScoreAdj sets the oom_score_adj for the container's gofer.
1353 func (c *Container) adjustGoferOOMScoreAdj() error {
1354 	if c.GoferPid == 0 || c.Spec.Process.OOMScoreAdj == nil {
1355 		return nil
1356 	}
1357 	return setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj)
1358 }
1359
1360 // adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox.
1361 // oom_score_adj is set to the lowest oom_score_adj among the containers
1362 // running in the sandbox.
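// For example (illustrative values), with two subcontainers whose processes
// request oom_score_adj 500 and -200, and a CRI pause container (which is
// ignored), the sandbox process ends up with oom_score_adj -200.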
1363 // 1364 // TODO(gvisor.dev/issue/238): This call could race with other containers being 1365 // created at the same time and end up setting the wrong oom_score_adj to the 1366 // sandbox. Use rpc client to synchronize. 1367 func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, spec *specs.Spec, rootDir string, destroy bool) error { 1368 // Adjustment can be skipped if the root container is exiting, because it 1369 // brings down the entire sandbox. 1370 if isRoot(spec) && destroy { 1371 return nil 1372 } 1373 1374 containers, err := LoadSandbox(rootDir, s.ID, LoadOpts{}) 1375 if err != nil { 1376 return fmt.Errorf("loading sandbox containers: %v", err) 1377 } 1378 1379 // Do nothing if the sandbox has been terminated. 1380 if len(containers) == 0 { 1381 return nil 1382 } 1383 1384 // Get the lowest score for all containers. 1385 var lowScore int 1386 scoreFound := false 1387 for _, container := range containers { 1388 // Special multi-container support for CRI. Ignore the root container when 1389 // calculating oom_score_adj for the sandbox because it is the 1390 // infrastructure (pause) container and always has a very low oom_score_adj. 1391 // 1392 // We will use OOMScoreAdj in the single-container case where the 1393 // containerd container-type annotation is not present. 1394 if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox { 1395 continue 1396 } 1397 1398 if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) { 1399 scoreFound = true 1400 lowScore = *container.Spec.Process.OOMScoreAdj 1401 } 1402 } 1403 1404 // If the container is destroyed and remaining containers have no 1405 // oomScoreAdj specified then we must revert to the original oom_score_adj 1406 // saved with the root container. 1407 if !scoreFound && destroy { 1408 lowScore = containers[0].Sandbox.OriginalOOMScoreAdj 1409 scoreFound = true 1410 } 1411 1412 // Only set oom_score_adj if one of the containers has oom_score_adj set. If 1413 // not, oom_score_adj is inherited from the parent process. 1414 // 1415 // See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process 1416 if !scoreFound { 1417 return nil 1418 } 1419 1420 // Set the lowest of all containers oom_score_adj to the sandbox. 1421 return setOOMScoreAdj(s.Getpid(), lowScore) 1422 } 1423 1424 // setOOMScoreAdj sets oom_score_adj to the given value for the given PID. 1425 // /proc must be available and mounted read-write. scoreAdj should be between 1426 // -1000 and 1000. It's a noop if the process has already exited. 1427 func setOOMScoreAdj(pid int, scoreAdj int) error { 1428 f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644) 1429 if err != nil { 1430 // Ignore NotExist errors because it can race with process exit. 1431 if os.IsNotExist(err) { 1432 log.Warningf("Process (%d) not found setting oom_score_adj", pid) 1433 return nil 1434 } 1435 return err 1436 } 1437 defer f.Close() 1438 if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil { 1439 if errors.Is(err, unix.ESRCH) { 1440 log.Warningf("Process (%d) exited while setting oom_score_adj", pid) 1441 return nil 1442 } 1443 return fmt.Errorf("setting oom_score_adj to %q: %v", scoreAdj, err) 1444 } 1445 return nil 1446 } 1447 1448 // populateStats populates event with stats estimates based on cgroups and the 1449 // sentry's accounting. 1450 // TODO(gvisor.dev/issue/172): This is an estimation; we should do more 1451 // detailed accounting. 
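// Worked example with illustrative numbers: if the sentry reports 2s of CPU
// for this container and 8s across all containers while the host cgroup
// reports 12s, the estimate below is 2 * (12 / 8) = 3s charged to this
// container.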
1452 func (c *Container) populateStats(event *boot.EventOut) { 1453 // The events command, when run for all running containers, should 1454 // account for the full cgroup CPU usage. We split cgroup usage 1455 // proportionally according to the sentry-internal usage measurements, 1456 // only counting Running containers. 1457 log.Debugf("event.ContainerUsage: %v", event.ContainerUsage) 1458 var containerUsage uint64 1459 var allContainersUsage uint64 1460 for ID, usage := range event.ContainerUsage { 1461 allContainersUsage += usage 1462 if ID == c.ID { 1463 containerUsage = usage 1464 } 1465 } 1466 1467 cgroup, err := c.Sandbox.NewCGroup() 1468 if err != nil { 1469 // No cgroup, so rely purely on the sentry's accounting. 1470 log.Warningf("events: no cgroups") 1471 event.Event.Data.CPU.Usage.Total = containerUsage 1472 return 1473 } 1474 1475 // Get the host cgroup CPU usage. 1476 cgroupsUsage, err := cgroup.CPUUsage() 1477 if err != nil { 1478 // No cgroup usage, so rely purely on the sentry's accounting. 1479 log.Warningf("events: failed when getting cgroup CPU usage for container: %v", err) 1480 event.Event.Data.CPU.Usage.Total = containerUsage 1481 return 1482 } 1483 1484 // If the sentry reports no CPU usage, fall back on cgroups and split usage 1485 // equally across containers. 1486 if allContainersUsage == 0 { 1487 log.Warningf("events: no sentry CPU usage reported") 1488 allContainersUsage = cgroupsUsage 1489 containerUsage = cgroupsUsage / uint64(len(event.ContainerUsage)) 1490 } 1491 1492 // Scaling can easily overflow a uint64 (e.g. a containerUsage and 1493 // cgroupsUsage of 16 seconds each will overflow), so use floats. 1494 total := float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage)) 1495 log.Debugf("Usage, container: %d, cgroups: %d, all: %d, total: %.0f", containerUsage, cgroupsUsage, allContainersUsage, total) 1496 event.Event.Data.CPU.Usage.Total = uint64(total) 1497 return 1498 } 1499 1500 // setupCgroupForRoot configures and returns cgroup for the sandbox and the 1501 // root container. If `cgroupParentAnnotation` is set, use that path as the 1502 // sandbox cgroup and use Spec.Linux.CgroupsPath as the root container cgroup. 1503 func (c *Container) setupCgroupForRoot(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, cgroup.Cgroup, error) { 1504 var parentCgroup cgroup.Cgroup 1505 if parentPath, ok := spec.Annotations[cgroupParentAnnotation]; ok { 1506 var err error 1507 parentCgroup, err = cgroup.NewFromPath(parentPath, conf.SystemdCgroup) 1508 if err != nil { 1509 return nil, nil, err 1510 } 1511 } else { 1512 var err error 1513 parentCgroup, err = cgroup.NewFromSpec(spec, conf.SystemdCgroup) 1514 if parentCgroup == nil || err != nil { 1515 return nil, nil, err 1516 } 1517 } 1518 1519 var err error 1520 parentCgroup, err = cgroupInstall(conf, parentCgroup, spec.Linux.Resources) 1521 if parentCgroup == nil || err != nil { 1522 return nil, nil, err 1523 } 1524 1525 subCgroup, err := c.setupCgroupForSubcontainer(conf, spec) 1526 if err != nil { 1527 _ = parentCgroup.Uninstall() 1528 return nil, nil, err 1529 } 1530 return parentCgroup, subCgroup, nil 1531 } 1532 1533 // setupCgroupForSubcontainer sets up empty cgroups for subcontainers. Since 1534 // subcontainers run exclusively inside the sandbox, subcontainer cgroups on the 1535 // host have no effect on them. However, some tools (e.g. cAdvisor) uses cgroups 1536 // paths to discover new containers and report stats for them. 
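// For instance (hypothetical values), a pod spec carrying the annotation
//
//	"dev.gvisor.spec.cgroup-parent": "/kubepods/besteffort"
//
// has its pod-level cgroup created from that path (see setupCgroupForRoot
// above), while each container still gets an empty compat cgroup at its own
// Spec.Linux.CgroupsPath purely so that such tools can find it; limits set
// there have no effect inside the sandbox.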
1537 func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, error) {
1538 	if isRoot(spec) {
1539 		if _, ok := spec.Annotations[cgroupParentAnnotation]; !ok {
1540 			return nil, nil
1541 		}
1542 	}
1543 
1544 	cg, err := cgroup.NewFromSpec(spec, conf.SystemdCgroup)
1545 	if cg == nil || err != nil {
1546 		return nil, err
1547 	}
1548 	// Use empty resources; we just want the directory structure created.
1549 	return cgroupInstall(conf, cg, &specs.LinuxResources{})
1550 }
1551 
1552 // donateGoferProfileFDs opens profile files and donates their FDs to the
1553 // gofer.
1554 func (c *Container) donateGoferProfileFDs(conf *config.Config, donations *donation.Agency) error {
1555 	// The gofer profile files are named based on the provided flag, but
1556 	// suffixed with "gofer" and the container ID to avoid collisions with
1557 	// sentry profile files or profile files from other gofers.
1558 	//
1559 	// TODO(b/243183772): Merge gofer profile data with sentry profile data
1560 	// into a single file.
1561 	profSuffix := ".gofer." + c.ID
1562 	const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC
1563 	if conf.ProfileBlock != "" {
1564 		if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock+profSuffix, profFlags); err != nil {
1565 			return err
1566 		}
1567 	}
1568 	if conf.ProfileCPU != "" {
1569 		if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU+profSuffix, profFlags); err != nil {
1570 			return err
1571 		}
1572 	}
1573 	if conf.ProfileHeap != "" {
1574 		if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap+profSuffix, profFlags); err != nil {
1575 			return err
1576 		}
1577 	}
1578 	if conf.ProfileMutex != "" {
1579 		if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex+profSuffix, profFlags); err != nil {
1580 			return err
1581 		}
1582 	}
1583 	if conf.TraceFile != "" {
1584 		if err := donations.OpenAndDonate("trace-fd", conf.TraceFile+profSuffix, profFlags); err != nil {
1585 			return err
1586 		}
1587 	}
1588 	return nil
1589 }
1590 
1591 // cgroupInstall creates the cgroup directory structure and sets its
1592 // resources. On success, it returns the cgroup instance and a nil error.
1593 // For rootless, cgroup operations may fail; in that case the error is
1594 // suppressed and a nil cgroup instance is returned to indicate that no
1595 // cgroup was configured.
1596 func cgroupInstall(conf *config.Config, cg cgroup.Cgroup, res *specs.LinuxResources) (cgroup.Cgroup, error) {
1597 	if err := cg.Install(res); err != nil {
1598 		switch {
1599 		case (errors.Is(err, unix.EACCES) || errors.Is(err, unix.EROFS)) && conf.Rootless:
1600 			log.Warningf("Skipping cgroup configuration in rootless mode: %v", err)
1601 			return nil, nil
1602 		default:
1603 			return nil, fmt.Errorf("configuring cgroup: %v", err)
1604 		}
1605 	}
1606 	return cg, nil
1607 }
1608 
1609 func modifySpecForDirectfs(conf *config.Config, spec *specs.Spec) error {
1610 	if !conf.DirectFS || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
1611 		return nil
1612 	}
1613 	if conf.Network == config.NetworkHost {
1614 		// The hostnet feature requires the sandbox to run in the current user
1615 		// namespace, in which the network namespace is configured.
1616 		return nil
1617 	}
1618 	if _, ok := specutils.GetNS(specs.UserNamespace, spec); ok {
1619 		// If the spec already defines a userns, use that.
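		// Otherwise, the remainder of the function gives the sandbox its own
		// userns. As an illustration (the values below are the typical contents
		// of /proc/self/uid_map for a root-owned runsc in the initial user
		// namespace, not something this file hard-codes), the spec ends up with:
		//
		//	spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{Type: specs.UserNamespace})
		//	spec.Linux.UIDMappings = []specs.LinuxIDMapping{{ContainerID: 0, HostID: 0, Size: 4294967295}}
		//	spec.Linux.GIDMappings = []specs.LinuxIDMapping{{ContainerID: 0, HostID: 0, Size: 4294967295}}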
1620 return nil 1621 } 1622 if spec.Linux == nil { 1623 spec.Linux = &specs.Linux{} 1624 } 1625 if len(spec.Linux.UIDMappings) > 0 || len(spec.Linux.GIDMappings) > 0 { 1626 // The spec can only define UID/GID mappings with a userns (checked above). 1627 return fmt.Errorf("spec defines UID/GID mappings without defining userns") 1628 } 1629 // Run the sandbox in a new user namespace with identity UID/GID mappings. 1630 log.Debugf("Configuring container with a new userns with identity user mappings into current userns") 1631 spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{Type: specs.UserNamespace}) 1632 uidMappings, err := getIdentityMapping("uid_map") 1633 if err != nil { 1634 return err 1635 } 1636 spec.Linux.UIDMappings = uidMappings 1637 logIDMappings(uidMappings, "UID") 1638 gidMappings, err := getIdentityMapping("gid_map") 1639 if err != nil { 1640 return err 1641 } 1642 spec.Linux.GIDMappings = gidMappings 1643 logIDMappings(gidMappings, "GID") 1644 return nil 1645 } 1646 1647 func getIdentityMapping(mapFileName string) ([]specs.LinuxIDMapping, error) { 1648 // See user_namespaces(7) to understand how /proc/self/{uid/gid}_map files 1649 // are organized. 1650 mapFile := path.Join("/proc/self", mapFileName) 1651 file, err := os.Open(mapFile) 1652 if err != nil { 1653 return nil, fmt.Errorf("failed to open %s: %v", mapFile, err) 1654 } 1655 defer file.Close() 1656 1657 var mappings []specs.LinuxIDMapping 1658 scanner := bufio.NewScanner(file) 1659 for scanner.Scan() { 1660 line := scanner.Text() 1661 var myStart, parentStart, rangeLen uint32 1662 numParsed, err := fmt.Sscanf(line, "%d %d %d", &myStart, &parentStart, &rangeLen) 1663 if err != nil { 1664 return nil, fmt.Errorf("failed to parse line %q in file %s: %v", line, mapFile, err) 1665 } 1666 if numParsed != 3 { 1667 return nil, fmt.Errorf("failed to parse 3 integers from line %q in file %s", line, mapFile) 1668 } 1669 // Create an identity mapping with the current userns. 1670 mappings = append(mappings, specs.LinuxIDMapping{ 1671 ContainerID: myStart, 1672 HostID: myStart, 1673 Size: rangeLen, 1674 }) 1675 } 1676 if err := scanner.Err(); err != nil { 1677 return nil, fmt.Errorf("failed to scan file %s: %v", mapFile, err) 1678 } 1679 return mappings, nil 1680 } 1681 1682 func logIDMappings(mappings []specs.LinuxIDMapping, idType string) { 1683 if !log.IsLogging(log.Debug) { 1684 return 1685 } 1686 log.Debugf("%s Mappings:", idType) 1687 for _, m := range mappings { 1688 log.Debugf("\tContainer ID: %d, Host ID: %d, Range Length: %d", m.ContainerID, m.HostID, m.Size) 1689 } 1690 } 1691 1692 // nvProxyPreGoferHostSetup sets up nvproxy on the host. It runs before any 1693 // Gofers start. 1694 // It verifies that all the required dependencies are in place, loads kernel 1695 // modules, and ensures the correct device files exist and are accessible. 1696 // This should only be necessary once on the host. It should be run during the 1697 // root container setup sequence to make sure it has run at least once. 1698 func nvProxyPreGoferHostSetup(spec *specs.Spec, conf *config.Config) error { 1699 if !specutils.GPUFunctionalityRequested(spec, conf) || !conf.NVProxyDocker { 1700 return nil 1701 } 1702 1703 // Locate binaries. For security reasons, unlike 1704 // nvidia-container-runtime-hook, we don't add the container's filesystem 1705 // to the search path. We also don't support 1706 // /etc/nvidia-container-runtime/config.toml to avoid importing a TOML 1707 // parser. 
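	// As an aside on the module loading described above (a hypothetical helper,
	// not part of this file): a caller could confirm that the modules landed by
	// scanning /proc/modules, where module names use '_' instead of '-'.
	//
	//	func nvidiaModuleLoaded(name string) bool {
	//		data, err := os.ReadFile("/proc/modules")
	//		if err != nil {
	//			return false
	//		}
	//		want := strings.ReplaceAll(name, "-", "_")
	//		for _, line := range strings.Split(string(data), "\n") {
	//			if strings.HasPrefix(line, want+" ") {
	//				return true
	//			}
	//		}
	//		return false
	//	}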
1708 cliPath, err := exec.LookPath("nvidia-container-cli") 1709 if err != nil { 1710 return fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err) 1711 } 1712 1713 // nvidia-container-cli --load-kmods seems to be a noop; load kernel modules ourselves. 1714 nvproxyLoadKernelModules() 1715 1716 if _, err := os.Stat("/dev/nvidiactl"); err != nil { 1717 if !os.IsNotExist(err) { 1718 return fmt.Errorf("stat(2) for /dev/nvidiactl failed: %w", err) 1719 } 1720 1721 // Run `nvidia-container-cli info`. 1722 // This has the side-effect of automatically creating GPU device files. 1723 argv := []string{cliPath, "--load-kmods", "info"} 1724 log.Debugf("Executing %q", argv) 1725 var infoOut, infoErr strings.Builder 1726 cmd := exec.Cmd{ 1727 Path: argv[0], 1728 Args: argv, 1729 Env: os.Environ(), 1730 Stdout: &infoOut, 1731 Stderr: &infoErr, 1732 } 1733 if err := cmd.Run(); err != nil { 1734 return fmt.Errorf("nvidia-container-cli info failed, err: %v\nstdout: %s\nstderr: %s", err, infoOut.String(), infoErr.String()) 1735 } 1736 log.Debugf("nvidia-container-cli info: %v", infoOut.String()) 1737 } 1738 1739 return nil 1740 } 1741 1742 // nvproxyLoadKernelModules loads NVIDIA-related kernel modules with modprobe. 1743 func nvproxyLoadKernelModules() { 1744 for _, mod := range [...]string{ 1745 "nvidia", 1746 "nvidia-uvm", 1747 } { 1748 argv := []string{ 1749 "/sbin/modprobe", 1750 mod, 1751 } 1752 log.Debugf("Executing %q", argv) 1753 var stdout, stderr strings.Builder 1754 cmd := exec.Cmd{ 1755 Path: argv[0], 1756 Args: argv, 1757 Env: os.Environ(), 1758 Stdout: &stdout, 1759 Stderr: &stderr, 1760 } 1761 if err := cmd.Run(); err != nil { 1762 // This might not be fatal since modules may already be loaded. Log 1763 // the failure but continue. 1764 log.Warningf("modprobe %s failed, err: %v\nstdout: %s\nstderr: %s", mod, err, stdout.String(), stderr.String()) 1765 } 1766 } 1767 } 1768 1769 // nvproxySetupAfterGoferUserns runs `nvidia-container-cli configure`. 1770 // This sets up the container filesystem with bind mounts that allow it to 1771 // use NVIDIA devices. 1772 // 1773 // This should be called during the Gofer setup process, as the bind mounts 1774 // are created in the Gofer's mount namespace. 1775 // If successful, it returns a callback function that must be called once the 1776 // Gofer process has started. 1777 // This function has no effect if nvproxy functionality is not requested. 1778 // 1779 // This function essentially replicates 1780 // nvidia-container-toolkit:cmd/nvidia-container-runtime-hook, i.e. the 1781 // binary that executeHook() is hard-coded to skip, with differences noted 1782 // inline. We do this rather than move the prestart hook because the 1783 // "runtime environment" in which prestart hooks execute is vaguely 1784 // defined, such that nvidia-container-runtime-hook and existing runsc 1785 // hooks differ in their expected environment. 1786 // 1787 // Note that nvidia-container-cli will set up files in /dev and /proc which 1788 // are useless, since they will be hidden by sentry devtmpfs and procfs 1789 // respectively (and some device files will have the wrong device numbers 1790 // from the application's perspective since nvproxy may register device 1791 // numbers in sentry VFS that differ from those on the host, e.g. for 1792 // nvidia-uvm). These files are separately created during sandbox VFS 1793 // construction. For this reason, we don't need to parse 1794 // NVIDIA_VISIBLE_DEVICES or pass --device to nvidia-container-cli. 
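// A minimal usage sketch of the function below (the caller code and variable
// names here are hypothetical, with error handling reduced to early returns):
//
//	postStart, err := nvproxySetupAfterGoferUserns(spec, conf, goferCmd, donations)
//	if err != nil {
//		return err
//	}
//	if err := goferCmd.Start(); err != nil {
//		return err
//	}
//	// The callback reads goferCmd.Process.Pid, which is only valid once
//	// Start() has returned, hence it must run after the gofer is started.
//	if err := postStart(); err != nil {
//		return err
//	}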
1795 func nvproxySetupAfterGoferUserns(spec *specs.Spec, conf *config.Config, goferCmd *exec.Cmd, goferDonations *donation.Agency) (func() error, error) { 1796 if !specutils.GPUFunctionalityRequested(spec, conf) || !conf.NVProxyDocker { 1797 return func() error { return nil }, nil 1798 } 1799 1800 if spec.Root == nil { 1801 return nil, fmt.Errorf("spec missing root filesystem") 1802 } 1803 1804 // nvidia-container-cli does not create this directory. 1805 if err := os.MkdirAll(path.Join(spec.Root.Path, "proc", "driver", "nvidia"), 0555); err != nil { 1806 return nil, fmt.Errorf("failed to create /proc/driver/nvidia in app filesystem: %w", err) 1807 } 1808 1809 cliPath, err := exec.LookPath("nvidia-container-cli") 1810 if err != nil { 1811 return nil, fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err) 1812 } 1813 1814 // On Ubuntu, ldconfig is a wrapper around ldconfig.real, and we need the latter. 1815 var ldconfigPath string 1816 if _, err := os.Stat("/sbin/ldconfig.real"); err == nil { 1817 ldconfigPath = "/sbin/ldconfig.real" 1818 } else { 1819 ldconfigPath = "/sbin/ldconfig" 1820 } 1821 1822 devices, err := specutils.NvidiaDeviceList(spec, conf) 1823 if err != nil { 1824 return nil, fmt.Errorf("failed to get nvidia device numbers: %w", err) 1825 } 1826 1827 // Create synchronization FD for nvproxy. 1828 fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) 1829 if err != nil { 1830 return nil, err 1831 } 1832 ourEnd := os.NewFile(uintptr(fds[0]), "nvproxy sync runsc FD") 1833 goferEnd := os.NewFile(uintptr(fds[1]), "nvproxy sync gofer FD") 1834 goferDonations.DonateAndClose("sync-nvproxy-fd", goferEnd) 1835 1836 return func() error { 1837 defer ourEnd.Close() 1838 argv := []string{ 1839 cliPath, 1840 "--load-kmods", 1841 "configure", 1842 fmt.Sprintf("--ldconfig=@%s", ldconfigPath), 1843 "--no-cgroups", // runsc doesn't configure device cgroups yet 1844 "--utility", 1845 "--compute", 1846 fmt.Sprintf("--pid=%d", goferCmd.Process.Pid), 1847 fmt.Sprintf("--device=%s", devices), 1848 spec.Root.Path, 1849 } 1850 log.Debugf("Executing %q", argv) 1851 var stdout, stderr strings.Builder 1852 cmd := exec.Cmd{ 1853 Path: argv[0], 1854 Args: argv, 1855 Env: os.Environ(), 1856 Stdout: &stdout, 1857 Stderr: &stderr, 1858 } 1859 if err := cmd.Run(); err != nil { 1860 return fmt.Errorf("nvidia-container-cli configure failed, err: %v\nstdout: %s\nstderr: %s", err, stdout.String(), stderr.String()) 1861 } 1862 return nil 1863 }, nil 1864 }
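// For reference, with hypothetical values (cliPath=/usr/bin/nvidia-container-cli,
// ldconfigPath=/sbin/ldconfig.real, gofer PID 1234, device list "all", rootfs
// /path/to/rootfs), the callback above executes roughly:
//
//	/usr/bin/nvidia-container-cli --load-kmods configure \
//	    --ldconfig=@/sbin/ldconfig.real --no-cgroups --utility --compute \
//	    --pid=1234 --device=all /path/to/rootfs
//
// Per nvidia-container-cli's convention, the '@' prefix on --ldconfig marks a
// path that is resolved on the host rather than inside the container rootfs.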