github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/runsc/container/container.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package container creates and manipulates containers. 16 package container 17 18 import ( 19 "bufio" 20 "context" 21 "errors" 22 "fmt" 23 "io/ioutil" 24 "os" 25 "os/exec" 26 "path" 27 "regexp" 28 "strconv" 29 "strings" 30 "syscall" 31 "time" 32 33 "github.com/cenkalti/backoff" 34 specs "github.com/opencontainers/runtime-spec/specs-go" 35 "github.com/ttpreport/gvisor-ligolo/pkg/abi/linux" 36 "github.com/ttpreport/gvisor-ligolo/pkg/cleanup" 37 "github.com/ttpreport/gvisor-ligolo/pkg/log" 38 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/control" 39 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/pgalloc" 40 "github.com/ttpreport/gvisor-ligolo/pkg/sighandling" 41 "github.com/ttpreport/gvisor-ligolo/runsc/boot" 42 "github.com/ttpreport/gvisor-ligolo/runsc/cgroup" 43 "github.com/ttpreport/gvisor-ligolo/runsc/config" 44 "github.com/ttpreport/gvisor-ligolo/runsc/console" 45 "github.com/ttpreport/gvisor-ligolo/runsc/donation" 46 "github.com/ttpreport/gvisor-ligolo/runsc/sandbox" 47 "github.com/ttpreport/gvisor-ligolo/runsc/specutils" 48 "golang.org/x/sys/unix" 49 ) 50 51 const cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent" 52 53 // validateID validates the container id. 54 func validateID(id string) error { 55 // See libcontainer/factory_linux.go. 56 idRegex := regexp.MustCompile(`^[\w+\.-]+$`) 57 if !idRegex.MatchString(id) { 58 return fmt.Errorf("invalid container id: %v", id) 59 } 60 return nil 61 } 62 63 // Container represents a containerized application. When running, the 64 // container is associated with a single Sandbox. 65 // 66 // Container metadata can be saved and loaded to disk. Within a root directory, 67 // we maintain subdirectories for each container named with the container id. 68 // The container metadata is stored as a json within the container directory 69 // in a file named "meta.json". This metadata format is defined by us and is 70 // not part of the OCI spec. 71 // 72 // Containers must write their metadata files after any change to their internal 73 // states. The entire container directory is deleted when the container is 74 // destroyed. 75 // 76 // When the container is stopped, all processes that belong to the container 77 // must be stopped before Destroy() returns. containerd makes roughly the 78 // following calls to stop a container: 79 // - First it attempts to kill the container process with 80 // 'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a 81 // separate thread, it's waiting on the container. As soon as the wait 82 // returns, it moves on to the next step: 83 // - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to 84 // the container. 'kill --all SIGKILL' waits for all processes before 85 // returning. 86 // - Containerd waits for stdin, stdout and stderr to drain and be closed. 87 // - It calls 'runsc delete'. runc implementation kills --all SIGKILL once 88 // again just to be sure, waits, and then proceeds with remaining teardown. 89 // 90 // Container is thread-unsafe. 91 type Container struct { 92 // ID is the container ID. 93 ID string `json:"id"` 94 95 // Spec is the OCI runtime spec that configures this container. 96 Spec *specs.Spec `json:"spec"` 97 98 // BundleDir is the directory containing the container bundle. 99 BundleDir string `json:"bundleDir"` 100 101 // CreatedAt is the time the container was created. 102 CreatedAt time.Time `json:"createdAt"` 103 104 // Owner is the container owner. 105 Owner string `json:"owner"` 106 107 // ConsoleSocket is the path to a unix domain socket that will receive 108 // the console FD. 109 ConsoleSocket string `json:"consoleSocket"` 110 111 // Status is the current container Status. 112 Status Status `json:"status"` 113 114 // GoferPid is the PID of the gofer running along side the sandbox. May 115 // be 0 if the gofer has been killed. 116 GoferPid int `json:"goferPid"` 117 118 // Sandbox is the sandbox this container is running in. It's set when the 119 // container is created and reset when the sandbox is destroyed. 120 Sandbox *sandbox.Sandbox `json:"sandbox"` 121 122 // CompatCgroup has the cgroup configuration for the container. For the single 123 // container case, container cgroup is set in `c.Sandbox` only. CompactCgroup 124 // is only set for multi-container, where the `c.Sandbox` cgroup represents 125 // the entire pod. 126 // 127 // Note that CompatCgroup is created only for compatibility with tools 128 // that expect container cgroups to exist. Setting limits here makes no change 129 // to the container in question. 130 CompatCgroup cgroup.CgroupJSON `json:"compatCgroup"` 131 132 // Saver handles load from/save to the state file safely from multiple 133 // processes. 134 Saver StateFile `json:"saver"` 135 136 // OverlayConf is the overlay configuration with which this container was 137 // started. 138 OverlayConf config.Overlay2 `json:"overlayConf"` 139 140 // OverlayMediums contains information about how the gofer mounts have been 141 // overlaid. The first entry is for rootfs and the following entries are for 142 // bind mounts in Spec.Mounts (in the same order). 143 OverlayMediums []boot.OverlayMedium `json:"overlayMediums"` 144 145 // 146 // Fields below this line are not saved in the state file and will not 147 // be preserved across commands. 148 // 149 150 // goferIsChild is set if a gofer process is a child of the current process. 151 // 152 // This field isn't saved to json, because only a creator of a gofer 153 // process will have it as a child process. 154 goferIsChild bool 155 } 156 157 // Args is used to configure a new container. 158 type Args struct { 159 // ID is the container unique identifier. 160 ID string 161 162 // Spec is the OCI spec that describes the container. 163 Spec *specs.Spec 164 165 // BundleDir is the directory containing the container bundle. 166 BundleDir string 167 168 // ConsoleSocket is the path to a unix domain socket that will receive 169 // the console FD. It may be empty. 170 ConsoleSocket string 171 172 // PIDFile is the filename where the container's root process PID will be 173 // written to. It may be empty. 174 PIDFile string 175 176 // UserLog is the filename to send user-visible logs to. It may be empty. 177 // 178 // It only applies for the init container. 179 UserLog string 180 181 // Attached indicates that the sandbox lifecycle is attached with the caller. 182 // If the caller exits, the sandbox should exit too. 183 // 184 // It only applies for the init container. 185 Attached bool 186 187 // PassFiles are user-supplied files from the host to be exposed to the 188 // sandboxed app. 189 PassFiles map[int]*os.File 190 191 // ExecFile is the host file used for program execution. 192 ExecFile *os.File 193 } 194 195 // New creates the container in a new Sandbox process, unless the metadata 196 // indicates that an existing Sandbox should be used. The caller must call 197 // Destroy() on the container. 198 func New(conf *config.Config, args Args) (*Container, error) { 199 log.Debugf("Create container, cid: %s, rootDir: %q", args.ID, conf.RootDir) 200 if err := validateID(args.ID); err != nil { 201 return nil, err 202 } 203 204 if err := os.MkdirAll(conf.RootDir, 0711); err != nil { 205 return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err) 206 } 207 208 if err := modifySpecForDirectfs(conf, args.Spec); err != nil { 209 return nil, fmt.Errorf("failed to modify spec for directfs: %v", err) 210 } 211 212 sandboxID := args.ID 213 if !isRoot(args.Spec) { 214 var ok bool 215 sandboxID, ok = specutils.SandboxID(args.Spec) 216 if !ok { 217 return nil, fmt.Errorf("no sandbox ID found when creating container") 218 } 219 } 220 221 c := &Container{ 222 ID: args.ID, 223 Spec: args.Spec, 224 ConsoleSocket: args.ConsoleSocket, 225 BundleDir: args.BundleDir, 226 Status: Creating, 227 CreatedAt: time.Now(), 228 Owner: os.Getenv("USER"), 229 Saver: StateFile{ 230 RootDir: conf.RootDir, 231 ID: FullID{ 232 SandboxID: sandboxID, 233 ContainerID: args.ID, 234 }, 235 }, 236 OverlayConf: conf.GetOverlay2(), 237 } 238 // The Cleanup object cleans up partially created containers when an error 239 // occurs. Any errors occurring during cleanup itself are ignored. 240 cu := cleanup.Make(func() { _ = c.Destroy() }) 241 defer cu.Clean() 242 243 // Lock the container metadata file to prevent concurrent creations of 244 // containers with the same id. 245 if err := c.Saver.LockForNew(); err != nil { 246 return nil, fmt.Errorf("cannot lock container metadata file: %w", err) 247 } 248 defer c.Saver.UnlockOrDie() 249 250 // If the metadata annotations indicate that this container should be started 251 // in an existing sandbox, we must do so. These are the possible metadata 252 // annotation states: 253 // 1. No annotations: it means that there is a single container and this 254 // container is obviously the root. Both container and sandbox share the 255 // ID. 256 // 2. Container type == sandbox: it means this is the root container 257 // starting the sandbox. Both container and sandbox share the same ID. 258 // 3. Container type == container: it means this is a subcontainer of an 259 // already started sandbox. In this case, container ID is different than 260 // the sandbox ID. 261 if isRoot(args.Spec) { 262 log.Debugf("Creating new sandbox for container, cid: %s", args.ID) 263 264 if args.Spec.Linux == nil { 265 args.Spec.Linux = &specs.Linux{} 266 } 267 // Don't force the use of cgroups in tests because they lack permission to do so. 268 if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 269 args.Spec.Linux.CgroupsPath = "/" + args.ID 270 } 271 var subCgroup, parentCgroup, containerCgroup cgroup.Cgroup 272 if !conf.IgnoreCgroups { 273 var err error 274 275 // Create and join cgroup before processes are created to ensure they are 276 // part of the cgroup from the start (and all their children processes). 277 parentCgroup, subCgroup, err = c.setupCgroupForRoot(conf, args.Spec) 278 if err != nil { 279 return nil, fmt.Errorf("cannot set up cgroup for root: %w", err) 280 } 281 // Join the child cgroup when using cgroupfs. Joining non leaf-node 282 // cgroups is illegal in cgroupsv2 and will return EBUSY. 283 if subCgroup != nil && !conf.SystemdCgroup && cgroup.IsOnlyV2() { 284 containerCgroup = subCgroup 285 } else { 286 containerCgroup = parentCgroup 287 } 288 } 289 c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup} 290 mountHints, err := boot.NewPodMountHints(args.Spec) 291 if err != nil { 292 return nil, fmt.Errorf("error creating pod mount hints: %w", err) 293 } 294 overlayFilestoreFiles, overlayMediums, err := c.createOverlayFilestores(mountHints) 295 if err != nil { 296 return nil, err 297 } 298 c.OverlayMediums = overlayMediums 299 if err := nvProxyPreGoferHostSetup(args.Spec, conf); err != nil { 300 return nil, err 301 } 302 if err := runInCgroup(containerCgroup, func() error { 303 ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached) 304 if err != nil { 305 return fmt.Errorf("cannot create gofer process: %w", err) 306 } 307 308 // Start a new sandbox for this container. Any errors after this point 309 // must destroy the container. 310 sandArgs := &sandbox.Args{ 311 ID: sandboxID, 312 Spec: args.Spec, 313 BundleDir: args.BundleDir, 314 ConsoleSocket: args.ConsoleSocket, 315 UserLog: args.UserLog, 316 IOFiles: ioFiles, 317 MountsFile: specFile, 318 Cgroup: containerCgroup, 319 Attached: args.Attached, 320 OverlayFilestoreFiles: overlayFilestoreFiles, 321 OverlayMediums: overlayMediums, 322 MountHints: mountHints, 323 PassFiles: args.PassFiles, 324 ExecFile: args.ExecFile, 325 } 326 sand, err := sandbox.New(conf, sandArgs) 327 if err != nil { 328 return fmt.Errorf("cannot create sandbox: %w", err) 329 } 330 c.Sandbox = sand 331 return nil 332 333 }); err != nil { 334 return nil, err 335 } 336 } else { 337 log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID) 338 339 // Find the sandbox associated with this ID. 340 fullID := FullID{ 341 SandboxID: sandboxID, 342 ContainerID: sandboxID, 343 } 344 sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true}) 345 if err != nil { 346 return nil, fmt.Errorf("cannot load sandbox: %w", err) 347 } 348 c.Sandbox = sb.Sandbox 349 350 subCgroup, err := c.setupCgroupForSubcontainer(conf, args.Spec) 351 if err != nil { 352 return nil, err 353 } 354 c.CompatCgroup = cgroup.CgroupJSON{Cgroup: subCgroup} 355 356 // If the console control socket file is provided, then create a new 357 // pty master/slave pair and send the TTY to the sandbox process. 358 var tty *os.File 359 if c.ConsoleSocket != "" { 360 // Create a new TTY pair and send the master on the provided socket. 361 var err error 362 tty, err = console.NewWithSocket(c.ConsoleSocket) 363 if err != nil { 364 return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err) 365 } 366 // tty file is transferred to the sandbox, then it can be closed here. 367 defer tty.Close() 368 } 369 370 if err := c.Sandbox.CreateSubcontainer(conf, c.ID, tty); err != nil { 371 return nil, fmt.Errorf("cannot create subcontainer: %w", err) 372 } 373 } 374 c.changeStatus(Created) 375 376 // Save the metadata file. 377 if err := c.saveLocked(); err != nil { 378 return nil, err 379 } 380 381 // "If any prestart hook fails, the runtime MUST generate an error, 382 // stop and destroy the container" -OCI spec. 383 if c.Spec.Hooks != nil { 384 // Even though the hook name is Prestart, runc used to call it from create. 385 // For this reason, it's now deprecated, but the spec requires it to be 386 // called *before* CreateRuntime and CreateRuntime must be called in create. 387 // 388 // "For runtimes that implement the deprecated prestart hooks as 389 // createRuntime hooks, createRuntime hooks MUST be called after the 390 // prestart hooks." 391 if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { 392 return nil, err 393 } 394 if err := executeHooks(c.Spec.Hooks.CreateRuntime, c.State()); err != nil { 395 return nil, err 396 } 397 if len(c.Spec.Hooks.CreateContainer) > 0 { 398 log.Warningf("CreateContainer hook skipped because running inside container namespace is not supported") 399 } 400 } 401 402 // Write the PID file. Containerd considers the call to create complete after 403 // this file is created, so it must be the last thing we do. 404 if args.PIDFile != "" { 405 if err := ioutil.WriteFile(args.PIDFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil { 406 return nil, fmt.Errorf("error writing PID file: %v", err) 407 } 408 } 409 410 cu.Release() 411 return c, nil 412 } 413 414 // Start starts running the containerized process inside the sandbox. 415 func (c *Container) Start(conf *config.Config) error { 416 log.Debugf("Start container, cid: %s", c.ID) 417 418 if err := c.Saver.lock(BlockAcquire); err != nil { 419 return err 420 } 421 unlock := cleanup.Make(c.Saver.UnlockOrDie) 422 defer unlock.Clean() 423 424 if err := c.requireStatus("start", Created); err != nil { 425 return err 426 } 427 428 // "If any prestart hook fails, the runtime MUST generate an error, 429 // stop and destroy the container" -OCI spec. 430 if c.Spec.Hooks != nil && len(c.Spec.Hooks.StartContainer) > 0 { 431 log.Warningf("StartContainer hook skipped because running inside container namespace is not supported") 432 } 433 434 if isRoot(c.Spec) { 435 if err := c.Sandbox.StartRoot(conf); err != nil { 436 return err 437 } 438 } else { 439 overlayFilestoreFiles, overlayMediums, err := c.createOverlayFilestores(c.Sandbox.MountHints) 440 if err != nil { 441 return err 442 } 443 c.OverlayMediums = overlayMediums 444 // Join cgroup to start gofer process to ensure it's part of the cgroup from 445 // the start (and all their children processes). 446 if err := runInCgroup(c.Sandbox.CgroupJSON.Cgroup, func() error { 447 // Create the gofer process. 448 goferFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false) 449 if err != nil { 450 return err 451 } 452 defer func() { 453 _ = mountsFile.Close() 454 for _, f := range goferFiles { 455 _ = f.Close() 456 } 457 }() 458 459 cleanMounts, err := specutils.ReadMounts(mountsFile) 460 if err != nil { 461 return fmt.Errorf("reading mounts file: %v", err) 462 } 463 c.Spec.Mounts = cleanMounts 464 465 // Setup stdios if the container is not using terminal. Otherwise TTY was 466 // already setup in create. 467 var stdios []*os.File 468 if !c.Spec.Process.Terminal { 469 stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr} 470 } 471 472 return c.Sandbox.StartSubcontainer(c.Spec, conf, c.ID, stdios, goferFiles, overlayFilestoreFiles, overlayMediums) 473 }); err != nil { 474 return err 475 } 476 } 477 478 // "If any poststart hook fails, the runtime MUST log a warning, but 479 // the remaining hooks and lifecycle continue as if the hook had 480 // succeeded" -OCI spec. 481 if c.Spec.Hooks != nil { 482 executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State()) 483 } 484 485 c.changeStatus(Running) 486 if err := c.saveLocked(); err != nil { 487 return err 488 } 489 490 // Release lock before adjusting OOM score because the lock is acquired there. 491 unlock.Clean() 492 493 // Adjust the oom_score_adj for sandbox. This must be done after saveLocked(). 494 if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Spec, c.Saver.RootDir, false); err != nil { 495 return err 496 } 497 498 // Set container's oom_score_adj to the gofer since it is dedicated to 499 // the container, in case the gofer uses up too much memory. 500 return c.adjustGoferOOMScoreAdj() 501 } 502 503 // Restore takes a container and replaces its kernel and file system 504 // to restore a container from its state file. 505 func (c *Container) Restore(conf *config.Config, restoreFile string) error { 506 log.Debugf("Restore container, cid: %s", c.ID) 507 if err := c.Saver.lock(BlockAcquire); err != nil { 508 return err 509 } 510 defer c.Saver.UnlockOrDie() 511 512 if err := c.requireStatus("restore", Created); err != nil { 513 return err 514 } 515 516 // "If any prestart hook fails, the runtime MUST generate an error, 517 // stop and destroy the container" -OCI spec. 518 if c.Spec.Hooks != nil && len(c.Spec.Hooks.StartContainer) > 0 { 519 log.Warningf("StartContainer hook skipped because running inside container namespace is not supported") 520 } 521 522 if err := c.Sandbox.Restore(conf, c.ID, restoreFile); err != nil { 523 return err 524 } 525 c.changeStatus(Running) 526 return c.saveLocked() 527 } 528 529 // Run is a helper that calls Create + Start + Wait. 530 func Run(conf *config.Config, args Args) (unix.WaitStatus, error) { 531 log.Debugf("Run container, cid: %s, rootDir: %q", args.ID, conf.RootDir) 532 c, err := New(conf, args) 533 if err != nil { 534 return 0, fmt.Errorf("creating container: %v", err) 535 } 536 // Clean up partially created container if an error occurs. 537 // Any errors returned by Destroy() itself are ignored. 538 cu := cleanup.Make(func() { 539 c.Destroy() 540 }) 541 defer cu.Clean() 542 543 if conf.RestoreFile != "" { 544 log.Debugf("Restore: %v", conf.RestoreFile) 545 if err := c.Restore(conf, conf.RestoreFile); err != nil { 546 return 0, fmt.Errorf("starting container: %v", err) 547 } 548 } else { 549 if err := c.Start(conf); err != nil { 550 return 0, fmt.Errorf("starting container: %v", err) 551 } 552 } 553 554 // If we allocate a terminal, forward signals to the sandbox process. 555 // Otherwise, Ctrl+C will terminate this process and its children, 556 // including the terminal. 557 if c.Spec.Process.Terminal { 558 stopForwarding := c.ForwardSignals(0, true /* fgProcess */) 559 defer stopForwarding() 560 } 561 562 if args.Attached { 563 return c.Wait() 564 } 565 cu.Release() 566 return 0, nil 567 } 568 569 // Execute runs the specified command in the container. It returns the PID of 570 // the newly created process. 571 func (c *Container) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) { 572 log.Debugf("Execute in container, cid: %s, args: %+v", c.ID, args) 573 if err := c.requireStatus("execute in", Created, Running); err != nil { 574 return 0, err 575 } 576 args.ContainerID = c.ID 577 return c.Sandbox.Execute(conf, args) 578 } 579 580 // Event returns events for the container. 581 func (c *Container) Event() (*boot.EventOut, error) { 582 log.Debugf("Getting events for container, cid: %s", c.ID) 583 if err := c.requireStatus("get events for", Created, Running, Paused); err != nil { 584 return nil, err 585 } 586 event, err := c.Sandbox.Event(c.ID) 587 if err != nil { 588 return nil, err 589 } 590 591 // Some stats can utilize host cgroups for accuracy. 592 c.populateStats(event) 593 594 return event, nil 595 } 596 597 // PortForward starts port forwarding to the container. 598 func (c *Container) PortForward(opts *boot.PortForwardOpts) error { 599 if err := c.requireStatus("port forward", Running); err != nil { 600 return err 601 } 602 opts.ContainerID = c.ID 603 return c.Sandbox.PortForward(opts) 604 } 605 606 // SandboxPid returns the Getpid of the sandbox the container is running in, or -1 if the 607 // container is not running. 608 func (c *Container) SandboxPid() int { 609 if err := c.requireStatus("get PID", Created, Running, Paused); err != nil { 610 return -1 611 } 612 return c.Sandbox.Getpid() 613 } 614 615 // Wait waits for the container to exit, and returns its WaitStatus. 616 // Call to wait on a stopped container is needed to retrieve the exit status 617 // and wait returns immediately. 618 func (c *Container) Wait() (unix.WaitStatus, error) { 619 log.Debugf("Wait on container, cid: %s", c.ID) 620 ws, err := c.Sandbox.Wait(c.ID) 621 if err == nil { 622 // Wait succeeded, container is not running anymore. 623 c.changeStatus(Stopped) 624 } 625 return ws, err 626 } 627 628 // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and 629 // returns its WaitStatus. 630 func (c *Container) WaitRootPID(pid int32) (unix.WaitStatus, error) { 631 log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID) 632 if !c.IsSandboxRunning() { 633 return 0, fmt.Errorf("sandbox is not running") 634 } 635 return c.Sandbox.WaitPID(c.Sandbox.ID, pid) 636 } 637 638 // WaitPID waits for process 'pid' in the container's PID namespace and returns 639 // its WaitStatus. 640 func (c *Container) WaitPID(pid int32) (unix.WaitStatus, error) { 641 log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID) 642 if !c.IsSandboxRunning() { 643 return 0, fmt.Errorf("sandbox is not running") 644 } 645 return c.Sandbox.WaitPID(c.ID, pid) 646 } 647 648 // SignalContainer sends the signal to the container. If all is true and signal 649 // is SIGKILL, then waits for all processes to exit before returning. 650 // SignalContainer returns an error if the container is already stopped. 651 // TODO(b/113680494): Distinguish different error types. 652 func (c *Container) SignalContainer(sig unix.Signal, all bool) error { 653 log.Debugf("Signal container, cid: %s, signal: %v (%d)", c.ID, sig, sig) 654 // Signaling container in Stopped state is allowed. When all=false, 655 // an error will be returned anyway; when all=true, this allows 656 // sending signal to other processes inside the container even 657 // after the init process exits. This is especially useful for 658 // container cleanup. 659 if err := c.requireStatus("signal", Running, Stopped); err != nil { 660 return err 661 } 662 if !c.IsSandboxRunning() { 663 return fmt.Errorf("sandbox is not running") 664 } 665 return c.Sandbox.SignalContainer(c.ID, sig, all) 666 } 667 668 // SignalProcess sends sig to a specific process in the container. 669 func (c *Container) SignalProcess(sig unix.Signal, pid int32) error { 670 log.Debugf("Signal process %d in container, cid: %s, signal: %v (%d)", pid, c.ID, sig, sig) 671 if err := c.requireStatus("signal a process inside", Running); err != nil { 672 return err 673 } 674 if !c.IsSandboxRunning() { 675 return fmt.Errorf("sandbox is not running") 676 } 677 return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false) 678 } 679 680 // ForwardSignals forwards all signals received by the current process to the 681 // container process inside the sandbox. It returns a function that will stop 682 // forwarding signals. 683 func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() { 684 log.Debugf("Forwarding all signals to container, cid: %s, PIDPID: %d, fgProcess: %t", c.ID, pid, fgProcess) 685 stop := sighandling.StartSignalForwarding(func(sig linux.Signal) { 686 log.Debugf("Forwarding signal %d to container, cid: %s, PID: %d, fgProcess: %t", sig, c.ID, pid, fgProcess) 687 if err := c.Sandbox.SignalProcess(c.ID, pid, unix.Signal(sig), fgProcess); err != nil { 688 log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err) 689 } 690 }) 691 return func() { 692 log.Debugf("Done forwarding signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess) 693 stop() 694 } 695 } 696 697 // Checkpoint sends the checkpoint call to the container. 698 // The statefile will be written to f, the file at the specified image-path. 699 func (c *Container) Checkpoint(f *os.File) error { 700 log.Debugf("Checkpoint container, cid: %s", c.ID) 701 if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil { 702 return err 703 } 704 return c.Sandbox.Checkpoint(c.ID, f) 705 } 706 707 // Pause suspends the container and its kernel. 708 // The call only succeeds if the container's status is created or running. 709 func (c *Container) Pause() error { 710 log.Debugf("Pausing container, cid: %s", c.ID) 711 if err := c.Saver.lock(BlockAcquire); err != nil { 712 return err 713 } 714 defer c.Saver.UnlockOrDie() 715 716 if c.Status != Created && c.Status != Running { 717 return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status) 718 } 719 720 if err := c.Sandbox.Pause(c.ID); err != nil { 721 return fmt.Errorf("pausing container %q: %v", c.ID, err) 722 } 723 c.changeStatus(Paused) 724 return c.saveLocked() 725 } 726 727 // Resume unpauses the container and its kernel. 728 // The call only succeeds if the container's status is paused. 729 func (c *Container) Resume() error { 730 log.Debugf("Resuming container, cid: %s", c.ID) 731 if err := c.Saver.lock(BlockAcquire); err != nil { 732 return err 733 } 734 defer c.Saver.UnlockOrDie() 735 736 if c.Status != Paused { 737 return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status) 738 } 739 if err := c.Sandbox.Resume(c.ID); err != nil { 740 return fmt.Errorf("resuming container: %v", err) 741 } 742 c.changeStatus(Running) 743 return c.saveLocked() 744 } 745 746 // State returns the metadata of the container. 747 func (c *Container) State() specs.State { 748 return specs.State{ 749 Version: specs.Version, 750 ID: c.ID, 751 Status: c.Status, 752 Pid: c.SandboxPid(), 753 Bundle: c.BundleDir, 754 Annotations: c.Spec.Annotations, 755 } 756 } 757 758 // Processes retrieves the list of processes and associated metadata inside a 759 // container. 760 func (c *Container) Processes() ([]*control.Process, error) { 761 if err := c.requireStatus("get processes of", Running, Paused); err != nil { 762 return nil, err 763 } 764 return c.Sandbox.Processes(c.ID) 765 } 766 767 // Destroy stops all processes and frees all resources associated with the 768 // container. 769 func (c *Container) Destroy() error { 770 log.Debugf("Destroy container, cid: %s", c.ID) 771 772 if err := c.Saver.lock(BlockAcquire); err != nil { 773 return err 774 } 775 defer func() { 776 c.Saver.UnlockOrDie() 777 _ = c.Saver.close() 778 }() 779 780 // Stored for later use as stop() sets c.Sandbox to nil. 781 sb := c.Sandbox 782 783 // We must perform the following cleanup steps: 784 // * stop the container and gofer processes, 785 // * remove the container filesystem on the host, and 786 // * delete the container metadata directory. 787 // 788 // It's possible for one or more of these steps to fail, but we should 789 // do our best to perform all of the cleanups. Hence, we keep a slice 790 // of errors return their concatenation. 791 var errs []string 792 if err := c.stop(); err != nil { 793 err = fmt.Errorf("stopping container: %v", err) 794 log.Warningf("%v", err) 795 errs = append(errs, err.Error()) 796 } 797 798 if err := c.Saver.Destroy(); err != nil { 799 err = fmt.Errorf("deleting container state files: %v", err) 800 log.Warningf("%v", err) 801 errs = append(errs, err.Error()) 802 } 803 804 // Clean up overlay filestore files created in their respective mounts. 805 c.forEachSelfOverlay(func(mountSrc string) { 806 filestorePath := boot.SelfOverlayFilestorePath(mountSrc, c.sandboxID()) 807 if err := os.Remove(filestorePath); err != nil { 808 err = fmt.Errorf("failed to delete filestore file %q: %v", filestorePath, err) 809 log.Warningf("%v", err) 810 errs = append(errs, err.Error()) 811 } 812 }) 813 814 c.changeStatus(Stopped) 815 816 // Adjust oom_score_adj for the sandbox. This must be done after the container 817 // is stopped and the directory at c.Root is removed. 818 // 819 // Use 'sb' to tell whether it has been executed before because Destroy must 820 // be idempotent. 821 if sb != nil { 822 if err := adjustSandboxOOMScoreAdj(sb, c.Spec, c.Saver.RootDir, true); err != nil { 823 errs = append(errs, err.Error()) 824 } 825 } 826 827 // "If any poststop hook fails, the runtime MUST log a warning, but the 828 // remaining hooks and lifecycle continue as if the hook had 829 // succeeded" - OCI spec. 830 // 831 // Based on the OCI, "The post-stop hooks MUST be called after the container 832 // is deleted but before the delete operation returns" 833 // Run it here to: 834 // 1) Conform to the OCI. 835 // 2) Make sure it only runs once, because the root has been deleted, the 836 // container can't be loaded again. 837 if c.Spec.Hooks != nil { 838 executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) 839 } 840 841 if len(errs) == 0 { 842 return nil 843 } 844 return fmt.Errorf(strings.Join(errs, "\n")) 845 } 846 847 func (c *Container) sandboxID() string { 848 return c.Saver.ID.SandboxID 849 } 850 851 func (c *Container) forEachSelfOverlay(fn func(mountSrc string)) { 852 if c.OverlayMediums == nil { 853 // Sub container not started? Skip. 854 return 855 } 856 if c.OverlayMediums[0] == boot.SelfMedium { 857 fn(c.Spec.Root.Path) 858 } 859 goferMntIdx := 1 // First index is for rootfs. 860 for i := range c.Spec.Mounts { 861 if !specutils.IsGoferMount(c.Spec.Mounts[i]) { 862 continue 863 } 864 if c.OverlayMediums[goferMntIdx] == boot.SelfMedium { 865 fn(c.Spec.Mounts[i].Source) 866 } 867 goferMntIdx++ 868 } 869 } 870 871 // createOverlayFilestores creates the regular files that will back the tmpfs 872 // upper mount for overlay mounts. It also returns information about the 873 // overlay medium used for each bind mount. 874 func (c *Container) createOverlayFilestores(mountHints *boot.PodMountHints) ([]*os.File, []boot.OverlayMedium, error) { 875 var filestoreFiles []*os.File 876 var overlayMediums []boot.OverlayMedium 877 878 // Handle root mount first. 879 shouldOverlay := c.OverlayConf.RootEnabled() && !c.Spec.Root.Readonly 880 filestore, medium, err := c.createOverlayFilestore(c.Spec.Root.Path, shouldOverlay, nil /* hint */) 881 if err != nil { 882 return nil, nil, err 883 } 884 if filestore != nil { 885 filestoreFiles = append(filestoreFiles, filestore) 886 } 887 overlayMediums = append(overlayMediums, medium) 888 889 // Handle bind mounts. 890 for i := range c.Spec.Mounts { 891 if !specutils.IsGoferMount(c.Spec.Mounts[i]) { 892 continue 893 } 894 hint := mountHints.FindMount(&c.Spec.Mounts[i]) 895 shouldOverlay := c.OverlayConf.SubMountEnabled() && !specutils.IsReadonlyMount(c.Spec.Mounts[i].Options) 896 filestore, medium, err := c.createOverlayFilestore(c.Spec.Mounts[i].Source, shouldOverlay, hint) 897 if err != nil { 898 return nil, nil, err 899 } 900 if filestore != nil { 901 filestoreFiles = append(filestoreFiles, filestore) 902 } 903 overlayMediums = append(overlayMediums, medium) 904 } 905 for _, filestore := range filestoreFiles { 906 // Perform this work around outside the sandbox. The sandbox may already be 907 // running with seccomp filters that do not allow this. 908 pgalloc.IMAWorkAroundForMemFile(filestore.Fd()) 909 } 910 return filestoreFiles, overlayMediums, nil 911 } 912 913 func (c *Container) createOverlayFilestore(mountSrc string, shouldOverlay bool, hint *boot.MountHint) (*os.File, boot.OverlayMedium, error) { 914 if hint != nil && hint.ShouldOverlay() { 915 // MountHint information takes precedence over shouldOverlay. 916 return c.createOverlayFilestoreInSelf(mountSrc) 917 } 918 switch { 919 case !shouldOverlay: 920 return nil, boot.NoOverlay, nil 921 case c.OverlayConf.IsBackedByMemory(): 922 return nil, boot.MemoryMedium, nil 923 case c.OverlayConf.IsBackedBySelf(): 924 return c.createOverlayFilestoreInSelf(mountSrc) 925 default: 926 return c.createOverlayFilestoreInDir() 927 } 928 } 929 930 func (c *Container) createOverlayFilestoreInSelf(mountSrc string) (*os.File, boot.OverlayMedium, error) { 931 mountSrcInfo, err := os.Stat(mountSrc) 932 if err != nil { 933 return nil, boot.NoOverlay, fmt.Errorf("failed to stat mount %q to see if it were a dirctory: %v", mountSrc, err) 934 } 935 if !mountSrcInfo.IsDir() { 936 log.Warningf("overlay2 self medium is only supported for directory mounts, but mount %q is not a directory, falling back to memory", mountSrc) 937 return nil, boot.MemoryMedium, nil 938 } 939 // Create the self overlay filestore file. 940 filestorePath := boot.SelfOverlayFilestorePath(mountSrc, c.sandboxID()) 941 filestoreFD, err := unix.Open(filestorePath, unix.O_RDWR|unix.O_CREAT|unix.O_EXCL|unix.O_CLOEXEC, 0666) 942 if err != nil { 943 if err == unix.EEXIST { 944 // Note that if the same submount is mounted multiple times within the 945 // same sandbox, then the overlay option doesn't work correctly. 946 // Because each overlay mount is independent and changes to one are not 947 // visible to the other. Given "overlay on repeated submounts" is 948 // already broken, we don't support such a scenario with the self 949 // medium. The filestore file will already exist for such a case. 950 return nil, boot.NoOverlay, fmt.Errorf("%q mount source already has a filestore file at %q; repeated submounts are not suppported with self medium", mountSrc, filestorePath) 951 } 952 return nil, boot.NoOverlay, fmt.Errorf("failed to create filestore file inside %q: %v", mountSrc, err) 953 } 954 log.Debugf("Created overlay filestore file at %q for mount source %q", filestorePath, mountSrc) 955 // Filestore in self should be a named path because it needs to be 956 // discoverable via path traversal so that k8s can scan the filesystem 957 // and apply any limits appropriately (like local ephemeral storage 958 // limits). So don't delete it. These files will be unlinked when the 959 // container is destroyed. This makes self medium appropriate for k8s. 960 return os.NewFile(uintptr(filestoreFD), filestorePath), boot.SelfMedium, nil 961 } 962 963 func (c *Container) createOverlayFilestoreInDir() (*os.File, boot.OverlayMedium, error) { 964 filestoreDir := c.OverlayConf.HostFileDir() 965 fileInfo, err := os.Stat(filestoreDir) 966 if err != nil { 967 return nil, boot.NoOverlay, fmt.Errorf("failed to stat overlay filestore directory %q: %v", filestoreDir, err) 968 } 969 if !fileInfo.IsDir() { 970 return nil, boot.NoOverlay, fmt.Errorf("overlay2 flag should specify an existing directory") 971 } 972 // Create an unnamed temporary file in filestore directory which will be 973 // deleted when the last FD on it is closed. We don't use O_TMPFILE because 974 // it is not supported on all filesystems. So we simulate it by creating a 975 // named file and then immediately unlinking it while keeping an FD on it. 976 // This file will be deleted when the container exits. 977 filestoreFile, err := os.CreateTemp(filestoreDir, "runsc-overlay-filestore-") 978 if err != nil { 979 return nil, boot.NoOverlay, fmt.Errorf("failed to create a temporary file inside %q: %v", filestoreDir, err) 980 } 981 if err := unix.Unlink(filestoreFile.Name()); err != nil { 982 return nil, boot.NoOverlay, fmt.Errorf("failed to unlink temporary file %q: %v", filestoreFile.Name(), err) 983 } 984 log.Debugf("Created an unnamed overlay filestore file at %q", filestoreDir) 985 return filestoreFile, boot.AnonDirMedium, nil 986 } 987 988 // saveLocked saves the container metadata to a file. 989 // 990 // Precondition: container must be locked with container.lock(). 991 func (c *Container) saveLocked() error { 992 log.Debugf("Save container, cid: %s", c.ID) 993 if err := c.Saver.SaveLocked(c); err != nil { 994 return fmt.Errorf("saving container metadata: %v", err) 995 } 996 return nil 997 } 998 999 // stop stops the container (for regular containers) or the sandbox (for 1000 // root containers), and waits for the container or sandbox and the gofer 1001 // to stop. If any of them doesn't stop before timeout, an error is returned. 1002 func (c *Container) stop() error { 1003 var parentCgroup cgroup.Cgroup 1004 1005 if c.Sandbox != nil { 1006 log.Debugf("Destroying container, cid: %s", c.ID) 1007 if err := c.Sandbox.DestroyContainer(c.ID); err != nil { 1008 return fmt.Errorf("destroying container %q: %v", c.ID, err) 1009 } 1010 // Only uninstall parentCgroup for sandbox stop. 1011 if c.Sandbox.IsRootContainer(c.ID) { 1012 parentCgroup = c.Sandbox.CgroupJSON.Cgroup 1013 } 1014 // Only set sandbox to nil after it has been told to destroy the container. 1015 c.Sandbox = nil 1016 } 1017 1018 // Try killing gofer if it does not exit with container. 1019 if c.GoferPid != 0 { 1020 log.Debugf("Killing gofer for container, cid: %s, PID: %d", c.ID, c.GoferPid) 1021 if err := unix.Kill(c.GoferPid, unix.SIGKILL); err != nil { 1022 // The gofer may already be stopped, log the error. 1023 log.Warningf("Error sending signal %d to gofer %d: %v", unix.SIGKILL, c.GoferPid, err) 1024 } 1025 } 1026 1027 if err := c.waitForStopped(); err != nil { 1028 return err 1029 } 1030 1031 // Delete container cgroup if any. 1032 if c.CompatCgroup.Cgroup != nil { 1033 if err := c.CompatCgroup.Cgroup.Uninstall(); err != nil { 1034 return err 1035 } 1036 } 1037 // Gofer is running inside parentCgroup, so Cgroup.Uninstall has to be called 1038 // after the gofer has stopped. 1039 if parentCgroup != nil { 1040 if err := parentCgroup.Uninstall(); err != nil { 1041 return err 1042 } 1043 } 1044 return nil 1045 } 1046 1047 func (c *Container) waitForStopped() error { 1048 if c.GoferPid == 0 { 1049 return nil 1050 } 1051 1052 if c.IsSandboxRunning() { 1053 if err := c.SignalContainer(unix.Signal(0), false); err == nil { 1054 return fmt.Errorf("container is still running") 1055 } 1056 } 1057 1058 if c.goferIsChild { 1059 // The gofer process is a child of the current process, 1060 // so we can wait it and collect its zombie. 1061 if _, err := unix.Wait4(int(c.GoferPid), nil, 0, nil); err != nil { 1062 return fmt.Errorf("error waiting the gofer process: %v", err) 1063 } 1064 c.GoferPid = 0 1065 return nil 1066 } 1067 1068 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 1069 defer cancel() 1070 b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) 1071 op := func() error { 1072 if err := unix.Kill(c.GoferPid, 0); err == nil { 1073 return fmt.Errorf("gofer is still running") 1074 } 1075 c.GoferPid = 0 1076 return nil 1077 } 1078 return backoff.Retry(op, b) 1079 } 1080 1081 func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) { 1082 donations := donation.Agency{} 1083 defer donations.Close() 1084 1085 if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { 1086 return nil, nil, err 1087 } 1088 if conf.DebugLog != "" { 1089 test := "" 1090 if len(conf.TestOnlyTestNameEnv) != 0 { 1091 // Fetch test name if one is provided and the test only flag was set. 1092 if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok { 1093 test = t 1094 } 1095 } 1096 if specutils.IsDebugCommand(conf, "gofer") { 1097 if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "gofer", test); err != nil { 1098 return nil, nil, err 1099 } 1100 } 1101 } 1102 1103 // Start with the general config flags. 1104 cmd := exec.Command(specutils.ExePath, conf.ToFlags()...) 1105 cmd.SysProcAttr = &unix.SysProcAttr{ 1106 // Detach from session. Otherwise, signals sent to the foreground process 1107 // will also be forwarded by this process, resulting in duplicate signals. 1108 Setsid: true, 1109 } 1110 1111 // Set Args[0] to make easier to spot the gofer process. Otherwise it's 1112 // shown as `exe`. 1113 cmd.Args[0] = "runsc-gofer" 1114 1115 // Tranfer FDs that need to be present before the "gofer" command. 1116 // Start at 3 because 0, 1, and 2 are taken by stdin/out/err. 1117 nextFD := donations.Transfer(cmd, 3) 1118 1119 cmd.Args = append(cmd.Args, "gofer", "--bundle", bundleDir) 1120 cmd.Args = append(cmd.Args, "--overlay-mediums="+boot.ToOverlayMediumFlags(c.OverlayMediums)) 1121 1122 // Open the spec file to donate to the sandbox. 1123 specFile, err := specutils.OpenSpec(bundleDir) 1124 if err != nil { 1125 return nil, nil, fmt.Errorf("opening spec file: %v", err) 1126 } 1127 donations.DonateAndClose("spec-fd", specFile) 1128 1129 // Donate any profile FDs to the gofer. 1130 if err := c.donateGoferProfileFDs(conf, &donations); err != nil { 1131 return nil, nil, fmt.Errorf("donating gofer profile fds: %w", err) 1132 } 1133 1134 // Create pipe that allows gofer to send mount list to sandbox after all paths 1135 // have been resolved. 1136 mountsSand, mountsGofer, err := os.Pipe() 1137 if err != nil { 1138 return nil, nil, err 1139 } 1140 donations.DonateAndClose("mounts-fd", mountsGofer) 1141 1142 // Add root mount and then add any other additional mounts. 1143 mountCount := 1 1144 for _, m := range spec.Mounts { 1145 if specutils.IsGoferMount(m) { 1146 mountCount++ 1147 } 1148 } 1149 1150 sandEnds := make([]*os.File, 0, mountCount) 1151 for i := 0; i < mountCount; i++ { 1152 fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) 1153 if err != nil { 1154 return nil, nil, err 1155 } 1156 sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD")) 1157 1158 goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD") 1159 donations.DonateAndClose("io-fds", goferEnd) 1160 } 1161 1162 if attached { 1163 // The gofer is attached to the lifetime of this process, so it 1164 // should synchronously die when this process dies. 1165 cmd.SysProcAttr.Pdeathsig = unix.SIGKILL 1166 } 1167 1168 // Enter new namespaces to isolate from the rest of the system. Don't unshare 1169 // cgroup because gofer is added to a cgroup in the caller's namespace. 1170 nss := []specs.LinuxNamespace{ 1171 {Type: specs.IPCNamespace}, 1172 {Type: specs.MountNamespace}, 1173 {Type: specs.NetworkNamespace}, 1174 {Type: specs.PIDNamespace}, 1175 {Type: specs.UTSNamespace}, 1176 } 1177 1178 rootlessEUID := unix.Geteuid() != 0 1179 // Setup any uid/gid mappings, and create or join the configured user 1180 // namespace so the gofer's view of the filesystem aligns with the 1181 // users in the sandbox. 1182 if !rootlessEUID { 1183 if userNS, ok := specutils.GetNS(specs.UserNamespace, spec); ok { 1184 nss = append(nss, userNS) 1185 specutils.SetUIDGIDMappings(cmd, spec) 1186 // We need to set UID and GID to have capabilities in a new user namespace. 1187 cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} 1188 } 1189 } else { 1190 userNS, ok := specutils.GetNS(specs.UserNamespace, spec) 1191 if !ok { 1192 return nil, nil, fmt.Errorf("unable to run a rootless container without userns") 1193 } 1194 nss = append(nss, userNS) 1195 syncFile, err := sandbox.ConfigureCmdForRootless(cmd, &donations) 1196 if err != nil { 1197 return nil, nil, err 1198 } 1199 defer syncFile.Close() 1200 } 1201 1202 nvProxySetup, err := nvproxySetupAfterGoferUserns(spec, conf, cmd, &donations) 1203 if err != nil { 1204 return nil, nil, fmt.Errorf("setting up nvproxy for gofer: %w", err) 1205 } 1206 1207 donations.Transfer(cmd, nextFD) 1208 1209 // Start the gofer in the given namespace. 1210 donation.LogDonations(cmd) 1211 log.Debugf("Starting gofer: %s %v", cmd.Path, cmd.Args) 1212 if err := specutils.StartInNS(cmd, nss); err != nil { 1213 return nil, nil, fmt.Errorf("gofer: %v", err) 1214 } 1215 log.Infof("Gofer started, PID: %d", cmd.Process.Pid) 1216 c.GoferPid = cmd.Process.Pid 1217 c.goferIsChild = true 1218 1219 // Set up and synchronize rootless mode userns mappings. 1220 if rootlessEUID { 1221 if err := sandbox.SetUserMappings(spec, cmd.Process.Pid); err != nil { 1222 return nil, nil, err 1223 } 1224 } 1225 1226 // Set up nvproxy within the Gofer namespace. 1227 if err := nvProxySetup(); err != nil { 1228 return nil, nil, fmt.Errorf("nvproxy setup: %w", err) 1229 } 1230 1231 return sandEnds, mountsSand, nil 1232 } 1233 1234 // changeStatus transitions from one status to another ensuring that the 1235 // transition is valid. 1236 func (c *Container) changeStatus(s Status) { 1237 switch s { 1238 case Creating: 1239 // Initial state, never transitions to it. 1240 panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) 1241 1242 case Created: 1243 if c.Status != Creating { 1244 panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) 1245 } 1246 if c.Sandbox == nil { 1247 panic("sandbox cannot be nil") 1248 } 1249 1250 case Paused: 1251 if c.Status != Running { 1252 panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) 1253 } 1254 if c.Sandbox == nil { 1255 panic("sandbox cannot be nil") 1256 } 1257 1258 case Running: 1259 if c.Status != Created && c.Status != Paused { 1260 panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) 1261 } 1262 if c.Sandbox == nil { 1263 panic("sandbox cannot be nil") 1264 } 1265 1266 case Stopped: 1267 if c.Status != Creating && c.Status != Created && c.Status != Running && c.Status != Stopped { 1268 panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s)) 1269 } 1270 1271 default: 1272 panic(fmt.Sprintf("invalid new state: %v", s)) 1273 } 1274 c.Status = s 1275 } 1276 1277 // IsSandboxRunning returns true if the sandbox exists and is running. 1278 func (c *Container) IsSandboxRunning() bool { 1279 return c.Sandbox != nil && c.Sandbox.IsRunning() 1280 } 1281 1282 // HasCapabilityInAnySet returns true if the given capability is in any of the 1283 // capability sets of the container process. 1284 func (c *Container) HasCapabilityInAnySet(capability linux.Capability) bool { 1285 capString := capability.String() 1286 for _, set := range [5][]string{ 1287 c.Spec.Process.Capabilities.Bounding, 1288 c.Spec.Process.Capabilities.Effective, 1289 c.Spec.Process.Capabilities.Inheritable, 1290 c.Spec.Process.Capabilities.Permitted, 1291 c.Spec.Process.Capabilities.Ambient, 1292 } { 1293 for _, c := range set { 1294 if c == capString { 1295 return true 1296 } 1297 } 1298 } 1299 return false 1300 } 1301 1302 // RunsAsUID0 returns true if the container process runs with UID 0 (root). 1303 func (c *Container) RunsAsUID0() bool { 1304 return c.Spec.Process.User.UID == 0 1305 } 1306 1307 func (c *Container) requireStatus(action string, statuses ...Status) error { 1308 for _, s := range statuses { 1309 if c.Status == s { 1310 return nil 1311 } 1312 } 1313 return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status) 1314 } 1315 1316 // IsSandboxRoot returns true if this container is its sandbox's root container. 1317 func (c *Container) IsSandboxRoot() bool { 1318 return isRoot(c.Spec) 1319 } 1320 1321 func isRoot(spec *specs.Spec) bool { 1322 return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer 1323 } 1324 1325 // runInCgroup executes fn inside the specified cgroup. If cg is nil, execute 1326 // it in the current context. 1327 func runInCgroup(cg cgroup.Cgroup, fn func() error) error { 1328 if cg == nil { 1329 return fn() 1330 } 1331 restore, err := cg.Join() 1332 if err != nil { 1333 return err 1334 } 1335 defer restore() 1336 return fn() 1337 } 1338 1339 // adjustGoferOOMScoreAdj sets the oom_store_adj for the container's gofer. 1340 func (c *Container) adjustGoferOOMScoreAdj() error { 1341 if c.GoferPid == 0 || c.Spec.Process.OOMScoreAdj == nil { 1342 return nil 1343 } 1344 return setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj) 1345 } 1346 1347 // adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox. 1348 // oom_score_adj is set to the lowest oom_score_adj among the containers 1349 // running in the sandbox. 1350 // 1351 // TODO(gvisor.dev/issue/238): This call could race with other containers being 1352 // created at the same time and end up setting the wrong oom_score_adj to the 1353 // sandbox. Use rpc client to synchronize. 1354 func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, spec *specs.Spec, rootDir string, destroy bool) error { 1355 // Adjustment can be skipped if the root container is exiting, because it 1356 // brings down the entire sandbox. 1357 if isRoot(spec) && destroy { 1358 return nil 1359 } 1360 1361 containers, err := LoadSandbox(rootDir, s.ID, LoadOpts{}) 1362 if err != nil { 1363 return fmt.Errorf("loading sandbox containers: %v", err) 1364 } 1365 1366 // Do nothing if the sandbox has been terminated. 1367 if len(containers) == 0 { 1368 return nil 1369 } 1370 1371 // Get the lowest score for all containers. 1372 var lowScore int 1373 scoreFound := false 1374 for _, container := range containers { 1375 // Special multi-container support for CRI. Ignore the root container when 1376 // calculating oom_score_adj for the sandbox because it is the 1377 // infrastructure (pause) container and always has a very low oom_score_adj. 1378 // 1379 // We will use OOMScoreAdj in the single-container case where the 1380 // containerd container-type annotation is not present. 1381 if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox { 1382 continue 1383 } 1384 1385 if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) { 1386 scoreFound = true 1387 lowScore = *container.Spec.Process.OOMScoreAdj 1388 } 1389 } 1390 1391 // If the container is destroyed and remaining containers have no 1392 // oomScoreAdj specified then we must revert to the original oom_score_adj 1393 // saved with the root container. 1394 if !scoreFound && destroy { 1395 lowScore = containers[0].Sandbox.OriginalOOMScoreAdj 1396 scoreFound = true 1397 } 1398 1399 // Only set oom_score_adj if one of the containers has oom_score_adj set. If 1400 // not, oom_score_adj is inherited from the parent process. 1401 // 1402 // See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process 1403 if !scoreFound { 1404 return nil 1405 } 1406 1407 // Set the lowest of all containers oom_score_adj to the sandbox. 1408 return setOOMScoreAdj(s.Getpid(), lowScore) 1409 } 1410 1411 // setOOMScoreAdj sets oom_score_adj to the given value for the given PID. 1412 // /proc must be available and mounted read-write. scoreAdj should be between 1413 // -1000 and 1000. It's a noop if the process has already exited. 1414 func setOOMScoreAdj(pid int, scoreAdj int) error { 1415 f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644) 1416 if err != nil { 1417 // Ignore NotExist errors because it can race with process exit. 1418 if os.IsNotExist(err) { 1419 log.Warningf("Process (%d) not found setting oom_score_adj", pid) 1420 return nil 1421 } 1422 return err 1423 } 1424 defer f.Close() 1425 if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil { 1426 if errors.Is(err, unix.ESRCH) { 1427 log.Warningf("Process (%d) exited while setting oom_score_adj", pid) 1428 return nil 1429 } 1430 return fmt.Errorf("setting oom_score_adj to %q: %v", scoreAdj, err) 1431 } 1432 return nil 1433 } 1434 1435 // populateStats populates event with stats estimates based on cgroups and the 1436 // sentry's accounting. 1437 // TODO(gvisor.dev/issue/172): This is an estimation; we should do more 1438 // detailed accounting. 1439 func (c *Container) populateStats(event *boot.EventOut) { 1440 // The events command, when run for all running containers, should 1441 // account for the full cgroup CPU usage. We split cgroup usage 1442 // proportionally according to the sentry-internal usage measurements, 1443 // only counting Running containers. 1444 log.Debugf("event.ContainerUsage: %v", event.ContainerUsage) 1445 var containerUsage uint64 1446 var allContainersUsage uint64 1447 for ID, usage := range event.ContainerUsage { 1448 allContainersUsage += usage 1449 if ID == c.ID { 1450 containerUsage = usage 1451 } 1452 } 1453 1454 cgroup, err := c.Sandbox.NewCGroup() 1455 if err != nil { 1456 // No cgroup, so rely purely on the sentry's accounting. 1457 log.Warningf("events: no cgroups") 1458 event.Event.Data.CPU.Usage.Total = containerUsage 1459 return 1460 } 1461 1462 // Get the host cgroup CPU usage. 1463 cgroupsUsage, err := cgroup.CPUUsage() 1464 if err != nil { 1465 // No cgroup usage, so rely purely on the sentry's accounting. 1466 log.Warningf("events: failed when getting cgroup CPU usage for container: %v", err) 1467 event.Event.Data.CPU.Usage.Total = containerUsage 1468 return 1469 } 1470 1471 // If the sentry reports no CPU usage, fall back on cgroups and split usage 1472 // equally across containers. 1473 if allContainersUsage == 0 { 1474 log.Warningf("events: no sentry CPU usage reported") 1475 allContainersUsage = cgroupsUsage 1476 containerUsage = cgroupsUsage / uint64(len(event.ContainerUsage)) 1477 } 1478 1479 // Scaling can easily overflow a uint64 (e.g. a containerUsage and 1480 // cgroupsUsage of 16 seconds each will overflow), so use floats. 1481 total := float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage)) 1482 log.Debugf("Usage, container: %d, cgroups: %d, all: %d, total: %.0f", containerUsage, cgroupsUsage, allContainersUsage, total) 1483 event.Event.Data.CPU.Usage.Total = uint64(total) 1484 return 1485 } 1486 1487 // setupCgroupForRoot configures and returns cgroup for the sandbox and the 1488 // root container. If `cgroupParentAnnotation` is set, use that path as the 1489 // sandbox cgroup and use Spec.Linux.CgroupsPath as the root container cgroup. 1490 func (c *Container) setupCgroupForRoot(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, cgroup.Cgroup, error) { 1491 var parentCgroup cgroup.Cgroup 1492 if parentPath, ok := spec.Annotations[cgroupParentAnnotation]; ok { 1493 var err error 1494 parentCgroup, err = cgroup.NewFromPath(parentPath, conf.SystemdCgroup) 1495 if err != nil { 1496 return nil, nil, err 1497 } 1498 } else { 1499 var err error 1500 parentCgroup, err = cgroup.NewFromSpec(spec, conf.SystemdCgroup) 1501 if parentCgroup == nil || err != nil { 1502 return nil, nil, err 1503 } 1504 } 1505 1506 var err error 1507 parentCgroup, err = cgroupInstall(conf, parentCgroup, spec.Linux.Resources) 1508 if parentCgroup == nil || err != nil { 1509 return nil, nil, err 1510 } 1511 1512 subCgroup, err := c.setupCgroupForSubcontainer(conf, spec) 1513 if err != nil { 1514 _ = parentCgroup.Uninstall() 1515 return nil, nil, err 1516 } 1517 return parentCgroup, subCgroup, nil 1518 } 1519 1520 // setupCgroupForSubcontainer sets up empty cgroups for subcontainers. Since 1521 // subcontainers run exclusively inside the sandbox, subcontainer cgroups on the 1522 // host have no effect on them. However, some tools (e.g. cAdvisor) uses cgroups 1523 // paths to discover new containers and report stats for them. 1524 func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.Spec) (cgroup.Cgroup, error) { 1525 if isRoot(spec) { 1526 if _, ok := spec.Annotations[cgroupParentAnnotation]; !ok { 1527 return nil, nil 1528 } 1529 } 1530 1531 cg, err := cgroup.NewFromSpec(spec, conf.SystemdCgroup) 1532 if cg == nil || err != nil { 1533 return nil, err 1534 } 1535 // Use empty resources, just want the directory structure created. 1536 return cgroupInstall(conf, cg, &specs.LinuxResources{}) 1537 } 1538 1539 // donateGoferProfileFDs will open profile files and donate their FDs to the 1540 // gofer. 1541 func (c *Container) donateGoferProfileFDs(conf *config.Config, donations *donation.Agency) error { 1542 // The gofer profile files are named based on the provided flag, but 1543 // suffixed with "gofer" and the container ID to avoid collisions with 1544 // sentry profile files or profile files from other gofers. 1545 // 1546 // TODO(b/243183772): Merge gofer profile data with sentry profile data 1547 // into a single file. 1548 profSuffix := ".gofer." + c.ID 1549 const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC 1550 if conf.ProfileBlock != "" { 1551 if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock+profSuffix, profFlags); err != nil { 1552 return err 1553 } 1554 } 1555 if conf.ProfileCPU != "" { 1556 if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU+profSuffix, profFlags); err != nil { 1557 return err 1558 } 1559 } 1560 if conf.ProfileHeap != "" { 1561 if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap+profSuffix, profFlags); err != nil { 1562 return err 1563 } 1564 } 1565 if conf.ProfileMutex != "" { 1566 if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex+profSuffix, profFlags); err != nil { 1567 return err 1568 } 1569 } 1570 if conf.TraceFile != "" { 1571 if err := donations.OpenAndDonate("trace-fd", conf.TraceFile+profSuffix, profFlags); err != nil { 1572 return err 1573 } 1574 } 1575 return nil 1576 } 1577 1578 // cgroupInstall creates cgroups dir structure and sets their respective 1579 // resources. In case of success, returns the cgroups instance and nil error. 1580 // For rootless, it's possible that cgroups operations fail, in this case the 1581 // error is suppressed and a nil cgroups instance is returned to indicate that 1582 // no cgroups was configured. 1583 func cgroupInstall(conf *config.Config, cg cgroup.Cgroup, res *specs.LinuxResources) (cgroup.Cgroup, error) { 1584 if err := cg.Install(res); err != nil { 1585 switch { 1586 case (errors.Is(err, unix.EACCES) || errors.Is(err, unix.EROFS)) && conf.Rootless: 1587 log.Warningf("Skipping cgroup configuration in rootless mode: %v", err) 1588 return nil, nil 1589 default: 1590 return nil, fmt.Errorf("configuring cgroup: %v", err) 1591 } 1592 } 1593 return cg, nil 1594 } 1595 1596 func modifySpecForDirectfs(conf *config.Config, spec *specs.Spec) error { 1597 if !conf.DirectFS || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 1598 return nil 1599 } 1600 if conf.Network == config.NetworkHost { 1601 // Hostnet feature requires the sandbox to run in the current user 1602 // namespace, in which the network namespace is configured. 1603 return nil 1604 } 1605 if _, ok := specutils.GetNS(specs.UserNamespace, spec); ok { 1606 // If the spec already defines a userns, use that. 1607 return nil 1608 } 1609 if spec.Linux == nil { 1610 spec.Linux = &specs.Linux{} 1611 } 1612 if len(spec.Linux.UIDMappings) > 0 || len(spec.Linux.GIDMappings) > 0 { 1613 // The spec can only define UID/GID mappings with a userns (checked above). 1614 return fmt.Errorf("spec defines UID/GID mappings without defining userns") 1615 } 1616 // Run the sandbox in a new user namespace with identity UID/GID mappings. 1617 log.Debugf("Configuring container with a new userns with identity user mappings into current userns") 1618 spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{Type: specs.UserNamespace}) 1619 uidMappings, err := getIdentityMapping("uid_map") 1620 if err != nil { 1621 return err 1622 } 1623 spec.Linux.UIDMappings = uidMappings 1624 logIDMappings(uidMappings, "UID") 1625 gidMappings, err := getIdentityMapping("gid_map") 1626 if err != nil { 1627 return err 1628 } 1629 spec.Linux.GIDMappings = gidMappings 1630 logIDMappings(gidMappings, "GID") 1631 return nil 1632 } 1633 1634 func getIdentityMapping(mapFileName string) ([]specs.LinuxIDMapping, error) { 1635 // See user_namespaces(7) to understand how /proc/self/{uid/gid}_map files 1636 // are organized. 1637 mapFile := path.Join("/proc/self", mapFileName) 1638 file, err := os.Open(mapFile) 1639 if err != nil { 1640 return nil, fmt.Errorf("failed to open %s: %v", mapFile, err) 1641 } 1642 defer file.Close() 1643 1644 var mappings []specs.LinuxIDMapping 1645 scanner := bufio.NewScanner(file) 1646 for scanner.Scan() { 1647 line := scanner.Text() 1648 var myStart, parentStart, rangeLen uint32 1649 numParsed, err := fmt.Sscanf(line, "%d %d %d", &myStart, &parentStart, &rangeLen) 1650 if err != nil { 1651 return nil, fmt.Errorf("failed to parse line %q in file %s: %v", line, mapFile, err) 1652 } 1653 if numParsed != 3 { 1654 return nil, fmt.Errorf("failed to parse 3 integers from line %q in file %s", line, mapFile) 1655 } 1656 // Create an identity mapping with the current userns. 1657 mappings = append(mappings, specs.LinuxIDMapping{ 1658 ContainerID: myStart, 1659 HostID: myStart, 1660 Size: rangeLen, 1661 }) 1662 } 1663 if err := scanner.Err(); err != nil { 1664 return nil, fmt.Errorf("failed to scan file %s: %v", mapFile, err) 1665 } 1666 return mappings, nil 1667 } 1668 1669 func logIDMappings(mappings []specs.LinuxIDMapping, idType string) { 1670 if !log.IsLogging(log.Debug) { 1671 return 1672 } 1673 log.Debugf("%s Mappings:", idType) 1674 for _, m := range mappings { 1675 log.Debugf("\tContainer ID: %d, Host ID: %d, Range Length: %d", m.ContainerID, m.HostID, m.Size) 1676 } 1677 } 1678 1679 // nvProxyPreGoferHostSetup sets up nvproxy on the host. It runs before any 1680 // Gofers start. 1681 // It verifies that all the required dependencies are in place, loads kernel 1682 // modules, and ensures the correct device files exist and are accessible. 1683 // This should only be necessary once on the host. It should be run during the 1684 // root container setup sequence to make sure it has run at least once. 1685 func nvProxyPreGoferHostSetup(spec *specs.Spec, conf *config.Config) error { 1686 if !specutils.GPUFunctionalityRequested(spec, conf) || !conf.NVProxyDocker { 1687 return nil 1688 } 1689 1690 // Locate binaries. For security reasons, unlike 1691 // nvidia-container-runtime-hook, we don't add the container's filesystem 1692 // to the search path. We also don't support 1693 // /etc/nvidia-container-runtime/config.toml to avoid importing a TOML 1694 // parser. 1695 cliPath, err := exec.LookPath("nvidia-container-cli") 1696 if err != nil { 1697 return fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err) 1698 } 1699 1700 // nvidia-container-cli --load-kmods seems to be a noop; load kernel modules ourselves. 1701 nvproxyLoadKernelModules() 1702 1703 // Run `nvidia-container-cli info`. 1704 // This has the side-effect of automatically creating GPU device files. 1705 argv := []string{cliPath, "--load-kmods", "info"} 1706 log.Debugf("Executing %q", argv) 1707 var infoOut, infoErr strings.Builder 1708 cmd := exec.Cmd{ 1709 Path: argv[0], 1710 Args: argv, 1711 Env: os.Environ(), 1712 Stdout: &infoOut, 1713 Stderr: &infoErr, 1714 } 1715 if err := cmd.Run(); err != nil { 1716 return fmt.Errorf("nvidia-container-cli info failed, err: %v\nstdout: %s\nstderr: %s", err, infoOut.String(), infoErr.String()) 1717 } 1718 log.Debugf("nvidia-container-cli info: %v", infoOut.String()) 1719 1720 return nil 1721 } 1722 1723 // nvproxyLoadKernelModules loads NVIDIA-related kernel modules with modprobe. 1724 func nvproxyLoadKernelModules() { 1725 for _, mod := range [...]string{ 1726 "nvidia", 1727 "nvidia-uvm", 1728 } { 1729 argv := []string{ 1730 "/sbin/modprobe", 1731 mod, 1732 } 1733 log.Debugf("Executing %q", argv) 1734 var stdout, stderr strings.Builder 1735 cmd := exec.Cmd{ 1736 Path: argv[0], 1737 Args: argv, 1738 Env: os.Environ(), 1739 Stdout: &stdout, 1740 Stderr: &stderr, 1741 } 1742 if err := cmd.Run(); err != nil { 1743 // This might not be fatal since modules may already be loaded. Log 1744 // the failure but continue. 1745 log.Warningf("modprobe %s failed, err: %v\nstdout: %s\nstderr: %s", mod, err, stdout.String(), stderr.String()) 1746 } 1747 } 1748 } 1749 1750 // nvproxySetupAfterGoferUserns runs `nvidia-container-cli configure`. 1751 // This sets up the container filesystem with bind mounts that allow it to 1752 // use NVIDIA devices. 1753 // 1754 // This should be called during the Gofer setup process, as the bind mounts 1755 // are created in the Gofer's mount namespace. 1756 // If successful, it returns a callback function that must be called once the 1757 // Gofer process has started. 1758 // This function has no effect if nvproxy functionality is not requested. 1759 // 1760 // This function essentially replicates 1761 // nvidia-container-toolkit:cmd/nvidia-container-runtime-hook, i.e. the 1762 // binary that executeHook() is hard-coded to skip, with differences noted 1763 // inline. We do this rather than move the prestart hook because the 1764 // "runtime environment" in which prestart hooks execute is vaguely 1765 // defined, such that nvidia-container-runtime-hook and existing runsc 1766 // hooks differ in their expected environment. 1767 // 1768 // Note that nvidia-container-cli will set up files in /dev and /proc which 1769 // are useless, since they will be hidden by sentry devtmpfs and procfs 1770 // respectively (and some device files will have the wrong device numbers 1771 // from the application's perspective since nvproxy may register device 1772 // numbers in sentry VFS that differ from those on the host, e.g. for 1773 // nvidia-uvm). These files are separately created during sandbox VFS 1774 // construction. For this reason, we don't need to parse 1775 // NVIDIA_VISIBLE_DEVICES or pass --device to nvidia-container-cli. 1776 func nvproxySetupAfterGoferUserns(spec *specs.Spec, conf *config.Config, goferCmd *exec.Cmd, goferDonations *donation.Agency) (func() error, error) { 1777 if !specutils.GPUFunctionalityRequested(spec, conf) || !conf.NVProxyDocker { 1778 return func() error { return nil }, nil 1779 } 1780 1781 if spec.Root == nil { 1782 return nil, fmt.Errorf("spec missing root filesystem") 1783 } 1784 1785 // nvidia-container-cli does not create this directory. 1786 if err := os.MkdirAll(path.Join(spec.Root.Path, "proc", "driver", "nvidia"), 0555); err != nil { 1787 return nil, fmt.Errorf("failed to create /proc/driver/nvidia in app filesystem: %w", err) 1788 } 1789 1790 cliPath, err := exec.LookPath("nvidia-container-cli") 1791 if err != nil { 1792 return nil, fmt.Errorf("failed to locate nvidia-container-cli in PATH: %w", err) 1793 } 1794 1795 // On Ubuntu, ldconfig is a wrapper around ldconfig.real, and we need the latter. 1796 var ldconfigPath string 1797 if _, err := os.Stat("/sbin/ldconfig.real"); err == nil { 1798 ldconfigPath = "/sbin/ldconfig.real" 1799 } else { 1800 ldconfigPath = "/sbin/ldconfig" 1801 } 1802 1803 var nvidiaDevices strings.Builder 1804 deviceIDs, err := specutils.NvidiaDeviceNumbers(spec, conf) 1805 if err != nil { 1806 return nil, fmt.Errorf("failed to get nvidia device numbers: %w", err) 1807 } 1808 for i, deviceID := range deviceIDs { 1809 if i > 0 { 1810 nvidiaDevices.WriteRune(',') 1811 } 1812 nvidiaDevices.WriteString(fmt.Sprintf("%d", uint32(deviceID))) 1813 } 1814 1815 // Create synchronization FD for nvproxy. 1816 fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) 1817 if err != nil { 1818 return nil, err 1819 } 1820 ourEnd := os.NewFile(uintptr(fds[0]), "nvproxy sync runsc FD") 1821 goferEnd := os.NewFile(uintptr(fds[1]), "nvproxy sync gofer FD") 1822 goferDonations.DonateAndClose("sync-nvproxy-fd", goferEnd) 1823 1824 return func() error { 1825 defer ourEnd.Close() 1826 argv := []string{ 1827 cliPath, 1828 "--load-kmods", 1829 "configure", 1830 fmt.Sprintf("--ldconfig=@%s", ldconfigPath), 1831 "--no-cgroups", // runsc doesn't configure device cgroups yet 1832 "--utility", 1833 "--compute", 1834 fmt.Sprintf("--pid=%d", goferCmd.Process.Pid), 1835 fmt.Sprintf("--device=%s", nvidiaDevices.String()), 1836 spec.Root.Path, 1837 } 1838 log.Debugf("Executing %q", argv) 1839 var stdout, stderr strings.Builder 1840 cmd := exec.Cmd{ 1841 Path: argv[0], 1842 Args: argv, 1843 Env: os.Environ(), 1844 Stdout: &stdout, 1845 Stderr: &stderr, 1846 } 1847 if err := cmd.Run(); err != nil { 1848 return fmt.Errorf("nvidia-container-cli configure failed, err: %v\nstdout: %s\nstderr: %s", err, stdout.String(), stderr.String()) 1849 } 1850 return nil 1851 }, nil 1852 }