// parentProcess is the set of operations the runtime needs on a process it
// has spawned for a container. Both setnsProcess and initProcess in this file
// provide these methods.
type parentProcess interface {
	// start starts the process execution.
	start() error

	// pid returns the pid for the running process.
	pid() int

	// startTime returns the process start time.
	startTime() (uint64, error)

	// signal delivers sig to the process.
	signal(sig os.Signal) error

	// wait waits on the process and returns the resulting process state.
	wait() (*os.ProcessState, error)

	// terminate sends a SIGKILL to the process and waits for the exit.
	terminate() error

	// externalDescriptors returns the descriptor names previously stored
	// with setExternalDescriptors.
	externalDescriptors() []string

	// setExternalDescriptors stores the descriptor names for later
	// retrieval through externalDescriptors.
	setExternalDescriptors(fds []string)

	// forwardChildLogs returns a channel carrying the outcome of forwarding
	// the child's logs.
	forwardChildLogs() chan error
}
66 logPipeParent *os.File 67 logPipeChild *os.File 68 } 69 70 func newProcessComm() (*processComm, error) { 71 var ( 72 comm processComm 73 err error 74 ) 75 comm.initSockParent, comm.initSockChild, err = utils.NewSockPair("init") 76 if err != nil { 77 return nil, fmt.Errorf("unable to create init pipe: %w", err) 78 } 79 comm.syncSockParent, comm.syncSockChild, err = newSyncSockpair("sync") 80 if err != nil { 81 return nil, fmt.Errorf("unable to create sync pipe: %w", err) 82 } 83 comm.logPipeParent, comm.logPipeChild, err = os.Pipe() 84 if err != nil { 85 return nil, fmt.Errorf("unable to create log pipe: %w", err) 86 } 87 return &comm, nil 88 } 89 90 func (c *processComm) closeChild() { 91 _ = c.initSockChild.Close() 92 _ = c.syncSockChild.Close() 93 _ = c.logPipeChild.Close() 94 } 95 96 func (c *processComm) closeParent() { 97 _ = c.initSockParent.Close() 98 _ = c.syncSockParent.Close() 99 // c.logPipeParent is kept alive for ForwardLogs 100 } 101 102 type setnsProcess struct { 103 cmd *exec.Cmd 104 comm *processComm 105 cgroupPaths map[string]string 106 rootlessCgroups bool 107 manager cgroups.Manager 108 intelRdtPath string 109 config *initConfig 110 fds []string 111 process *Process 112 bootstrapData io.Reader 113 initProcessPid int 114 } 115 116 func (p *setnsProcess) startTime() (uint64, error) { 117 stat, err := system.Stat(p.pid()) 118 return stat.StartTime, err 119 } 120 121 func (p *setnsProcess) signal(sig os.Signal) error { 122 s, ok := sig.(unix.Signal) 123 if !ok { 124 return errors.New("os: unsupported signal type") 125 } 126 return unix.Kill(p.pid(), s) 127 } 128 129 func (p *setnsProcess) start() (retErr error) { 130 defer p.comm.closeParent() 131 132 if p.process.IOPriority != nil { 133 if err := setIOPriority(p.process.IOPriority); err != nil { 134 return err 135 } 136 } 137 138 // get the "before" value of oom kill count 139 oom, _ := p.manager.OOMKillCount() 140 141 // When greater or equal to zero, it will set a temporary single CPU 142 // 
affinity before cgroup cpuset transition, this handles a corner 143 // case when joining a container having all the processes running 144 // exclusively on isolated CPU cores to force the kernel to schedule 145 // runc process on the first CPU core within the cgroups cpuset. 146 // The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 147 // in 5.7 has affected this deterministic scheduling behavior by 148 // distributing tasks across CPU cores within the cgroups cpuset. 149 // Some intensive real-time application are relying on this 150 // deterministic behavior and use the first CPU core to run a slow 151 // thread while other CPU cores are fully used by real-time threads 152 // with SCHED_FIFO policy. Such applications prevent runc process 153 // from joining a container when the runc process is randomly 154 // scheduled on a CPU core owned by a real-time thread. 155 cpuAffinity := -1 156 resetCPUAffinity := true 157 158 if len(p.manager.GetPaths()) > 0 { 159 // Get the target container cgroup. 160 if cg, err := p.manager.GetCgroups(); err != nil { 161 // Close the pipe to not be blocked in the parent. 162 p.comm.closeChild() 163 return fmt.Errorf("getting container cgroups: %w", err) 164 } else if cg.CpusetCpus != "" { 165 definitive := false 166 167 _, annotations := utils.Annotations(p.config.Config.Labels) 168 cpuAffinity, definitive, err = isolatedCPUAffinityTransition( 169 os.DirFS("/"), 170 cg.CpusetCpus, 171 annotations, 172 ) 173 if err != nil { 174 // Close the pipe to not be blocked in the parent. 175 p.comm.closeChild() 176 return fmt.Errorf("getting CPU affinity: %w", err) 177 } else if definitive { 178 resetCPUAffinity = false 179 } 180 } 181 } 182 183 var err error 184 185 if cpuAffinity < 0 { 186 err = p.cmd.Start() 187 } else { 188 err = startCommandWithCPUAffinity(p.cmd, cpuAffinity) 189 } 190 191 // Close the write-side of the pipes (controlled by child). 
192 p.comm.closeChild() 193 if err != nil { 194 return fmt.Errorf("error starting setns process: %w", err) 195 } 196 197 waitInit := initWaiter(p.comm.initSockParent) 198 defer func() { 199 if retErr != nil { 200 if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom { 201 // Someone in this cgroup was killed, this _might_ be us. 202 retErr = fmt.Errorf("%w (possibly OOM-killed)", retErr) 203 } 204 werr := <-waitInit 205 if werr != nil { 206 logrus.WithError(werr).Warn() 207 } 208 err := ignoreTerminateErrors(p.terminate()) 209 if err != nil { 210 logrus.WithError(err).Warn("unable to terminate setnsProcess") 211 } 212 } 213 }() 214 215 if p.bootstrapData != nil { 216 if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil { 217 return fmt.Errorf("error copying bootstrap data to pipe: %w", err) 218 } 219 } 220 err = <-waitInit 221 if err != nil { 222 return err 223 } 224 if err := p.execSetns(); err != nil { 225 return fmt.Errorf("error executing setns process: %w", err) 226 } 227 for _, path := range p.cgroupPaths { 228 if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups { 229 // On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY. 230 // https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643 231 // Try to join the cgroup of InitProcessPid. 232 if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 { 233 initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid) 234 initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile) 235 if initCgErr == nil { 236 if initCgPath, ok := initCg[""]; ok { 237 initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath) 238 logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)", 239 p.pid(), p.cgroupPaths, err, initCg, initCgDirpath) 240 // NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container. 
241 err = cgroups.WriteCgroupProc(initCgDirpath, p.pid()) 242 } 243 } 244 } 245 if err != nil { 246 return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err) 247 } 248 } 249 } 250 251 if resetCPUAffinity { 252 // Fix the container process CPU affinity to match container cgroup cpuset, 253 // since kernel 6.2, the runc CPU affinity might affect the container process 254 // CPU affinity after cgroup cpuset transition, by example if runc is running 255 // with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the 256 // resulting container process CPU affinity will be 1 instead of 1-2. 257 if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil { 258 return fmt.Errorf("error resetting container process CPU affinity: %w", err) 259 } 260 } 261 262 if p.intelRdtPath != "" { 263 // if Intel RDT "resource control" filesystem path exists 264 _, err := os.Stat(p.intelRdtPath) 265 if err == nil { 266 if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil { 267 return fmt.Errorf("error adding pid %d to Intel RDT: %w", p.pid(), err) 268 } 269 } 270 } 271 272 if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil { 273 return fmt.Errorf("error writing config to pipe: %w", err) 274 } 275 276 var seenProcReady bool 277 ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error { 278 switch sync.Type { 279 case procReady: 280 seenProcReady = true 281 // Set rlimits, this has to be done here because we lose permissions 282 // to raise the limits once we enter a user-namespace 283 if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { 284 return fmt.Errorf("error setting rlimits for ready process: %w", err) 285 } 286 287 // Sync with child. 288 if err := writeSync(p.comm.syncSockParent, procRun); err != nil { 289 return err 290 } 291 case procHooks: 292 // This shouldn't happen. 293 panic("unexpected procHooks in setns") 294 case procMountPlease: 295 // This shouldn't happen. 
296 panic("unexpected procMountPlease in setns") 297 case procSeccomp: 298 if p.config.Config.Seccomp.ListenerPath == "" { 299 return errors.New("seccomp listenerPath is not set") 300 } 301 if sync.Arg == nil { 302 return fmt.Errorf("sync %q is missing an argument", sync.Type) 303 } 304 var srcFd int 305 if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil { 306 return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err) 307 } 308 seccompFd, err := pidGetFd(p.pid(), srcFd) 309 if err != nil { 310 return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err) 311 } 312 defer seccompFd.Close() 313 // We have a copy, the child can keep working. We don't need to 314 // wait for the seccomp notify listener to get the fd before we 315 // permit the child to continue because the child will happily wait 316 // for the listener if it hits SCMP_ACT_NOTIFY. 317 if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil { 318 return err 319 } 320 321 bundle, annotations := utils.Annotations(p.config.Config.Labels) 322 containerProcessState := &specs.ContainerProcessState{ 323 Version: specs.Version, 324 Fds: []string{specs.SeccompFdName}, 325 Pid: p.cmd.Process.Pid, 326 Metadata: p.config.Config.Seccomp.ListenerMetadata, 327 State: specs.State{ 328 Version: specs.Version, 329 ID: p.config.ContainerID, 330 Status: specs.StateRunning, 331 Pid: p.initProcessPid, 332 Bundle: bundle, 333 Annotations: annotations, 334 }, 335 } 336 if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath, 337 containerProcessState, seccompFd); err != nil { 338 return err 339 } 340 default: 341 return errors.New("invalid JSON payload from child") 342 } 343 return nil 344 }) 345 346 if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil { 347 return err 348 } 349 if !seenProcReady && ierr == nil { 350 ierr = errors.New("procReady not received") 351 } 352 // Must be done after Shutdown so the child will exit and 
we can wait for it. 353 if ierr != nil { 354 _, _ = p.wait() 355 return ierr 356 } 357 return nil 358 } 359 360 // execSetns runs the process that executes C code to perform the setns calls 361 // because setns support requires the C process to fork off a child and perform the setns 362 // before the go runtime boots, we wait on the process to die and receive the child's pid 363 // over the provided pipe. 364 func (p *setnsProcess) execSetns() error { 365 status, err := p.cmd.Process.Wait() 366 if err != nil { 367 _ = p.cmd.Wait() 368 return fmt.Errorf("error waiting on setns process to finish: %w", err) 369 } 370 if !status.Success() { 371 _ = p.cmd.Wait() 372 return &exec.ExitError{ProcessState: status} 373 } 374 var pid *pid 375 if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil { 376 _ = p.cmd.Wait() 377 return fmt.Errorf("error reading pid from init pipe: %w", err) 378 } 379 380 // Clean up the zombie parent process 381 // On Unix systems FindProcess always succeeds. 382 firstChildProcess, _ := os.FindProcess(pid.PidFirstChild) 383 384 // Ignore the error in case the child has already been reaped for any reason 385 _, _ = firstChildProcess.Wait() 386 387 process, err := os.FindProcess(pid.Pid) 388 if err != nil { 389 return err 390 } 391 p.cmd.Process = process 392 p.process.ops = p 393 return nil 394 } 395 396 // terminate sends a SIGKILL to the forked process for the setns routine then waits to 397 // avoid the process becoming a zombie. 
398 func (p *setnsProcess) terminate() error { 399 if p.cmd.Process == nil { 400 return nil 401 } 402 err := p.cmd.Process.Kill() 403 if _, werr := p.wait(); err == nil { 404 err = werr 405 } 406 return err 407 } 408 409 func (p *setnsProcess) wait() (*os.ProcessState, error) { 410 err := p.cmd.Wait() 411 412 // Return actual ProcessState even on Wait error 413 return p.cmd.ProcessState, err 414 } 415 416 func (p *setnsProcess) pid() int { 417 return p.cmd.Process.Pid 418 } 419 420 func (p *setnsProcess) externalDescriptors() []string { 421 return p.fds 422 } 423 424 func (p *setnsProcess) setExternalDescriptors(newFds []string) { 425 p.fds = newFds 426 } 427 428 func (p *setnsProcess) forwardChildLogs() chan error { 429 return logs.ForwardLogs(p.comm.logPipeParent) 430 } 431 432 type initProcess struct { 433 cmd *exec.Cmd 434 comm *processComm 435 config *initConfig 436 manager cgroups.Manager 437 intelRdtManager *intelrdt.Manager 438 container *Container 439 fds []string 440 process *Process 441 bootstrapData io.Reader 442 } 443 444 func (p *initProcess) pid() int { 445 return p.cmd.Process.Pid 446 } 447 448 func (p *initProcess) externalDescriptors() []string { 449 return p.fds 450 } 451 452 // getChildPid receives the final child's pid over the provided pipe. 453 func (p *initProcess) getChildPid() (int, error) { 454 var pid pid 455 if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil { 456 _ = p.cmd.Wait() 457 return -1, err 458 } 459 460 // Clean up the zombie parent process 461 // On Unix systems FindProcess always succeeds. 
462 firstChildProcess, _ := os.FindProcess(pid.PidFirstChild) 463 464 // Ignore the error in case the child has already been reaped for any reason 465 _, _ = firstChildProcess.Wait() 466 467 return pid.Pid, nil 468 } 469 470 func (p *initProcess) waitForChildExit(childPid int) error { 471 status, err := p.cmd.Process.Wait() 472 if err != nil { 473 _ = p.cmd.Wait() 474 return err 475 } 476 if !status.Success() { 477 _ = p.cmd.Wait() 478 return &exec.ExitError{ProcessState: status} 479 } 480 481 process, err := os.FindProcess(childPid) 482 if err != nil { 483 return err 484 } 485 p.cmd.Process = process 486 p.process.ops = p 487 return nil 488 } 489 490 type mountSourceRequestFn func(*configs.Mount) (*mountSource, error) 491 492 // goCreateMountSources spawns a goroutine which creates open_tree(2)-style 493 // mountfds based on the requested configs.Mount configuration. The returned 494 // requestFn and cancelFn are used to interact with the goroutine. 495 // 496 // The caller of the returned mountSourceRequestFn is responsible for closing 497 // the returned file. 498 func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) { 499 type response struct { 500 src *mountSource 501 err error 502 } 503 504 errCh := make(chan error, 1) 505 requestCh := make(chan *configs.Mount) 506 responseCh := make(chan response) 507 508 ctx, cancelFn := context.WithTimeout(ctx, 1*time.Minute) 509 go func() { 510 // We lock this thread because we need to setns(2) here. There is no 511 // UnlockOSThread() here, to ensure that the Go runtime will kill this 512 // thread once this goroutine returns (ensuring no other goroutines run 513 // in this context). 514 runtime.LockOSThread() 515 516 // Detach from the shared fs of the rest of the Go process in order to 517 // be able to CLONE_NEWNS. 
518 if err := unix.Unshare(unix.CLONE_FS); err != nil { 519 err = os.NewSyscallError("unshare(CLONE_FS)", err) 520 errCh <- fmt.Errorf("mount source thread: %w", err) 521 return 522 } 523 524 // Attach to the container's mount namespace. 525 nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid())) 526 if err != nil { 527 errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err) 528 return 529 } 530 defer nsFd.Close() 531 if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil { 532 err = os.NewSyscallError("setns", err) 533 errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err) 534 return 535 } 536 537 // No errors during setup! 538 close(errCh) 539 logrus.Debugf("mount source thread: successfully running in container mntns") 540 541 nsHandles := new(userns.Handles) 542 defer nsHandles.Release() 543 loop: 544 for { 545 select { 546 case m, ok := <-requestCh: 547 if !ok { 548 break loop 549 } 550 src, err := mountFd(nsHandles, m) 551 logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err) 552 responseCh <- response{ 553 src: src, 554 err: err, 555 } 556 case <-ctx.Done(): 557 break loop 558 } 559 } 560 logrus.Debugf("mount source thread: closing thread: %v", ctx.Err()) 561 close(responseCh) 562 }() 563 564 // Check for setup errors. 565 err := <-errCh 566 if err != nil { 567 cancelFn() 568 return nil, nil, err 569 } 570 571 // TODO: Switch to context.AfterFunc when we switch to Go 1.21. 
572 var requestChCloseOnce sync.Once 573 requestFn := func(m *configs.Mount) (*mountSource, error) { 574 var err error 575 select { 576 case requestCh <- m: 577 select { 578 case resp, ok := <-responseCh: 579 if ok { 580 return resp.src, resp.err 581 } 582 case <-ctx.Done(): 583 err = fmt.Errorf("receive mount source context cancelled: %w", ctx.Err()) 584 } 585 case <-ctx.Done(): 586 err = fmt.Errorf("send mount request cancelled: %w", ctx.Err()) 587 } 588 requestChCloseOnce.Do(func() { close(requestCh) }) 589 return nil, err 590 } 591 return requestFn, cancelFn, nil 592 } 593 594 func (p *initProcess) start() (retErr error) { 595 defer p.comm.closeParent() 596 err := p.cmd.Start() 597 p.process.ops = p 598 // close the child-side of the pipes (controlled by child) 599 p.comm.closeChild() 600 if err != nil { 601 p.process.ops = nil 602 return fmt.Errorf("unable to start init: %w", err) 603 } 604 605 waitInit := initWaiter(p.comm.initSockParent) 606 defer func() { 607 if retErr != nil { 608 // Find out if init is killed by the kernel's OOM killer. 609 // Get the count before killing init as otherwise cgroup 610 // might be removed by systemd. 611 oom, err := p.manager.OOMKillCount() 612 if err != nil { 613 logrus.WithError(err).Warn("unable to get oom kill count") 614 } else if oom > 0 { 615 // Does not matter what the particular error was, 616 // its cause is most probably OOM, so report that. 617 const oomError = "container init was OOM-killed (memory limit too low?)" 618 619 if logrus.GetLevel() >= logrus.DebugLevel { 620 // Only show the original error if debug is set, 621 // as it is not generally very useful. 622 retErr = fmt.Errorf(oomError+": %w", retErr) 623 } else { 624 retErr = errors.New(oomError) 625 } 626 } 627 628 werr := <-waitInit 629 if werr != nil { 630 logrus.WithError(werr).Warn() 631 } 632 633 // Terminate the process to ensure we can remove cgroups. 
634 if err := ignoreTerminateErrors(p.terminate()); err != nil { 635 logrus.WithError(err).Warn("unable to terminate initProcess") 636 } 637 638 _ = p.manager.Destroy() 639 if p.intelRdtManager != nil { 640 _ = p.intelRdtManager.Destroy() 641 } 642 } 643 }() 644 645 // Do this before syncing with child so that no children can escape the 646 // cgroup. We don't need to worry about not doing this and not being root 647 // because we'd be using the rootless cgroup manager in that case. 648 if err := p.manager.Apply(p.pid()); err != nil { 649 return fmt.Errorf("unable to apply cgroup configuration: %w", err) 650 } 651 if p.intelRdtManager != nil { 652 if err := p.intelRdtManager.Apply(p.pid()); err != nil { 653 return fmt.Errorf("unable to apply Intel RDT configuration: %w", err) 654 } 655 } 656 if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil { 657 return fmt.Errorf("can't copy bootstrap data to pipe: %w", err) 658 } 659 err = <-waitInit 660 if err != nil { 661 return err 662 } 663 664 childPid, err := p.getChildPid() 665 if err != nil { 666 return fmt.Errorf("can't get final child's PID from pipe: %w", err) 667 } 668 669 // Save the standard descriptor names before the container process 670 // can potentially move them (e.g., via dup2()). If we don't do this now, 671 // we won't know at checkpoint time which file descriptor to look up. 672 fds, err := getPipeFds(childPid) 673 if err != nil { 674 return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err) 675 } 676 p.setExternalDescriptors(fds) 677 678 // Wait for our first child to exit 679 if err := p.waitForChildExit(childPid); err != nil { 680 return fmt.Errorf("error waiting for our first child to exit: %w", err) 681 } 682 683 // Spin up a goroutine to handle remapping mount requests by runc init. 684 // There is no point doing this for rootless containers because they cannot 685 // configure MOUNT_ATTR_IDMAP, nor do OPEN_TREE_CLONE. 
We could just 686 // service plain-open requests for plain bind-mounts but there's no need 687 // (rootless containers will never have permission issues on a source mount 688 // that the parent process can help with -- they are the same user). 689 var mountRequest mountSourceRequestFn 690 if !p.container.config.RootlessEUID { 691 request, cancel, err := p.goCreateMountSources(context.Background()) 692 if err != nil { 693 return fmt.Errorf("error spawning mount remapping thread: %w", err) 694 } 695 defer cancel() 696 mountRequest = request 697 } 698 699 if err := p.createNetworkInterfaces(); err != nil { 700 return fmt.Errorf("error creating network interfaces: %w", err) 701 } 702 if err := p.updateSpecState(); err != nil { 703 return fmt.Errorf("error updating spec state: %w", err) 704 } 705 if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil { 706 return fmt.Errorf("error sending config to init process: %w", err) 707 } 708 709 var seenProcReady bool 710 ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error { 711 switch sync.Type { 712 case procMountPlease: 713 if mountRequest == nil { 714 return fmt.Errorf("cannot fulfil mount requests as a rootless user") 715 } 716 var m *configs.Mount 717 if sync.Arg == nil { 718 return fmt.Errorf("sync %q is missing an argument", sync.Type) 719 } 720 if err := json.Unmarshal(*sync.Arg, &m); err != nil { 721 return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err) 722 } 723 mnt, err := mountRequest(m) 724 if err != nil { 725 return fmt.Errorf("failed to fulfil mount request: %w", err) 726 } 727 defer mnt.file.Close() 728 729 arg, err := json.Marshal(mnt) 730 if err != nil { 731 return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err) 732 } 733 argMsg := json.RawMessage(arg) 734 if err := doWriteSync(p.comm.syncSockParent, syncT{ 735 Type: procMountFd, 736 Arg: &argMsg, 737 File: mnt.file, 738 }); err != nil { 739 return err 740 } 741 case procSeccomp: 742 if 
p.config.Config.Seccomp.ListenerPath == "" { 743 return errors.New("seccomp listenerPath is not set") 744 } 745 var srcFd int 746 if sync.Arg == nil { 747 return fmt.Errorf("sync %q is missing an argument", sync.Type) 748 } 749 if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil { 750 return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err) 751 } 752 seccompFd, err := pidGetFd(p.pid(), srcFd) 753 if err != nil { 754 return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err) 755 } 756 defer seccompFd.Close() 757 // We have a copy, the child can keep working. We don't need to 758 // wait for the seccomp notify listener to get the fd before we 759 // permit the child to continue because the child will happily wait 760 // for the listener if it hits SCMP_ACT_NOTIFY. 761 if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil { 762 return err 763 } 764 765 s, err := p.container.currentOCIState() 766 if err != nil { 767 return err 768 } 769 770 // initProcessStartTime hasn't been set yet. 
771 s.Pid = p.cmd.Process.Pid 772 s.Status = specs.StateCreating 773 containerProcessState := &specs.ContainerProcessState{ 774 Version: specs.Version, 775 Fds: []string{specs.SeccompFdName}, 776 Pid: s.Pid, 777 Metadata: p.config.Config.Seccomp.ListenerMetadata, 778 State: *s, 779 } 780 if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath, 781 containerProcessState, seccompFd); err != nil { 782 return err 783 } 784 case procReady: 785 seenProcReady = true 786 // Set rlimits, this has to be done here because we lose permissions 787 // to raise the limits once we enter a user-namespace 788 if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { 789 return fmt.Errorf("error setting rlimits for ready process: %w", err) 790 } 791 792 // generate a timestamp indicating when the container was started 793 p.container.created = time.Now().UTC() 794 p.container.state = &createdState{ 795 c: p.container, 796 } 797 798 // NOTE: If the procRun state has been synced and the 799 // runc-create process has been killed for some reason, 800 // the runc-init[2:stage] process will be leaky. And 801 // the runc command also fails to parse root directory 802 // because the container doesn't have state.json. 803 // 804 // In order to cleanup the runc-init[2:stage] by 805 // runc-delete/stop, we should store the status before 806 // procRun sync. 807 state, uerr := p.container.updateState(p) 808 if uerr != nil { 809 return fmt.Errorf("unable to store init state: %w", uerr) 810 } 811 p.container.initProcessStartTime = state.InitProcessStartTime 812 813 // Sync with child. 814 if err := writeSync(p.comm.syncSockParent, procRun); err != nil { 815 return err 816 } 817 case procHooks: 818 // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. 
819 if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil { 820 return fmt.Errorf("error setting cgroup config for procHooks process: %w", err) 821 } 822 // Reset container process CPU affinity to match container cgroup cpuset, 823 // since kernel 6.2, the runc CPU affinity might affect the container process 824 // CPU affinity after cgroup cpuset transition, by example if runc is running 825 // with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the 826 // resulting container process CPU affinity will be 1 instead of 1-2. 827 if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil { 828 return fmt.Errorf("error resetting container process CPU affinity: %w", err) 829 } 830 if p.intelRdtManager != nil { 831 if err := p.intelRdtManager.Set(p.config.Config); err != nil { 832 return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err) 833 } 834 } 835 if len(p.config.Config.Hooks) != 0 { 836 s, err := p.container.currentOCIState() 837 if err != nil { 838 return err 839 } 840 // initProcessStartTime hasn't been set yet. 841 s.Pid = p.cmd.Process.Pid 842 s.Status = specs.StateCreating 843 hooks := p.config.Config.Hooks 844 845 if err := hooks.Run(configs.Prestart, s); err != nil { 846 return err 847 } 848 if err := hooks.Run(configs.CreateRuntime, s); err != nil { 849 return err 850 } 851 } 852 // Sync with child. 
853 if err := writeSync(p.comm.syncSockParent, procHooksDone); err != nil { 854 return err 855 } 856 default: 857 return errors.New("invalid JSON payload from child") 858 } 859 return nil 860 }) 861 862 if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil { 863 return err 864 } 865 if !seenProcReady && ierr == nil { 866 ierr = errors.New("procReady not received") 867 } 868 if ierr != nil { 869 return fmt.Errorf("error during container init: %w", ierr) 870 } 871 return nil 872 } 873 874 func (p *initProcess) wait() (*os.ProcessState, error) { 875 err := p.cmd.Wait() 876 return p.cmd.ProcessState, err 877 } 878 879 func (p *initProcess) terminate() error { 880 if p.cmd.Process == nil { 881 return nil 882 } 883 err := p.cmd.Process.Kill() 884 if _, werr := p.wait(); err == nil { 885 err = werr 886 } 887 return err 888 } 889 890 func (p *initProcess) startTime() (uint64, error) { 891 stat, err := system.Stat(p.pid()) 892 return stat.StartTime, err 893 } 894 895 func (p *initProcess) updateSpecState() error { 896 s, err := p.container.currentOCIState() 897 if err != nil { 898 return err 899 } 900 901 p.config.SpecState = s 902 return nil 903 } 904 905 func (p *initProcess) createNetworkInterfaces() error { 906 for _, config := range p.config.Config.Networks { 907 strategy, err := getStrategy(config.Type) 908 if err != nil { 909 return err 910 } 911 n := &network{ 912 Network: *config, 913 } 914 if err := strategy.create(n, p.pid()); err != nil { 915 return err 916 } 917 p.config.Networks = append(p.config.Networks, n) 918 } 919 return nil 920 } 921 922 func (p *initProcess) signal(sig os.Signal) error { 923 s, ok := sig.(unix.Signal) 924 if !ok { 925 return errors.New("os: unsupported signal type") 926 } 927 return unix.Kill(p.pid(), s) 928 } 929 930 func (p *initProcess) setExternalDescriptors(newFds []string) { 931 p.fds = newFds 932 } 933 934 func (p *initProcess) forwardChildLogs() chan error { 935 return 
logs.ForwardLogs(p.comm.logPipeParent) 936 } 937 938 func pidGetFd(pid, srcFd int) (*os.File, error) { 939 pidFd, err := unix.PidfdOpen(pid, 0) 940 if err != nil { 941 return nil, os.NewSyscallError("pidfd_open", err) 942 } 943 defer unix.Close(pidFd) 944 fd, err := unix.PidfdGetfd(pidFd, srcFd, 0) 945 if err != nil { 946 return nil, os.NewSyscallError("pidfd_getfd", err) 947 } 948 return os.NewFile(uintptr(fd), "[pidfd_getfd]"), nil 949 } 950 951 func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, file *os.File) error { 952 conn, err := net.Dial("unix", listenerPath) 953 if err != nil { 954 return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err) 955 } 956 957 socket, err := conn.(*net.UnixConn).File() 958 if err != nil { 959 return fmt.Errorf("cannot get seccomp socket: %w", err) 960 } 961 defer socket.Close() 962 963 b, err := json.Marshal(state) 964 if err != nil { 965 return fmt.Errorf("cannot marshall seccomp state: %w", err) 966 } 967 968 if err := utils.SendRawFd(socket, string(b), file.Fd()); err != nil { 969 return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err) 970 } 971 runtime.KeepAlive(file) 972 return nil 973 } 974 975 func getPipeFds(pid int) ([]string, error) { 976 fds := make([]string, 3) 977 978 dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd") 979 for i := 0; i < 3; i++ { 980 // XXX: This breaks if the path is not a valid symlink (which can 981 // happen in certain particularly unlucky mount namespace setups). 982 f := filepath.Join(dirPath, strconv.Itoa(i)) 983 target, err := os.Readlink(f) 984 if err != nil { 985 // Ignore permission errors, for rootless containers and other 986 // non-dumpable processes. if we can't get the fd for a particular 987 // file, there's not much we can do. 
988 if os.IsPermission(err) { 989 continue 990 } 991 return fds, err 992 } 993 fds[i] = target 994 } 995 return fds, nil 996 } 997 998 // InitializeIO creates pipes for use with the process's stdio and returns the 999 // opposite side for each. Do not use this if you want to have a pseudoterminal 1000 // set up for you by libcontainer (TODO: fix that too). 1001 // TODO: This is mostly unnecessary, and should be handled by clients. 1002 func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { 1003 var fds []uintptr 1004 i = &IO{} 1005 // cleanup in case of an error 1006 defer func() { 1007 if err != nil { 1008 for _, fd := range fds { 1009 _ = unix.Close(int(fd)) 1010 } 1011 } 1012 }() 1013 // STDIN 1014 r, w, err := os.Pipe() 1015 if err != nil { 1016 return nil, err 1017 } 1018 fds = append(fds, r.Fd(), w.Fd()) 1019 p.Stdin, i.Stdin = r, w 1020 // STDOUT 1021 if r, w, err = os.Pipe(); err != nil { 1022 return nil, err 1023 } 1024 fds = append(fds, r.Fd(), w.Fd()) 1025 p.Stdout, i.Stdout = w, r 1026 // STDERR 1027 if r, w, err = os.Pipe(); err != nil { 1028 return nil, err 1029 } 1030 fds = append(fds, r.Fd(), w.Fd()) 1031 p.Stderr, i.Stderr = w, r 1032 // change ownership of the pipes in case we are in a user namespace 1033 for _, fd := range fds { 1034 if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil { 1035 return nil, &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err} 1036 } 1037 } 1038 return i, nil 1039 } 1040 1041 // initWaiter returns a channel to wait on for making sure 1042 // runc init has finished the initial setup. 
1043 func initWaiter(r io.Reader) chan error { 1044 ch := make(chan error, 1) 1045 go func() { 1046 defer close(ch) 1047 1048 inited := make([]byte, 1) 1049 n, err := r.Read(inited) 1050 if err == nil { 1051 if n < 1 { 1052 err = errors.New("short read") 1053 } else if inited[0] != 0 { 1054 err = fmt.Errorf("unexpected %d != 0", inited[0]) 1055 } else { 1056 ch <- nil 1057 return 1058 } 1059 } 1060 ch <- fmt.Errorf("waiting for init preliminary setup: %w", err) 1061 }() 1062 1063 return ch 1064 } 1065 1066 func setIOPriority(ioprio *configs.IOPriority) error { 1067 const ioprioWhoPgrp = 1 1068 1069 class, ok := configs.IOPrioClassMapping[ioprio.Class] 1070 if !ok { 1071 return fmt.Errorf("invalid io priority class: %s", ioprio.Class) 1072 } 1073 1074 // Combine class and priority into a single value 1075 // https://github.com/torvalds/linux/blob/v5.18/include/uapi/linux/ioprio.h#L5-L17 1076 iop := (class << 13) | ioprio.Priority 1077 _, _, errno := unix.RawSyscall(unix.SYS_IOPRIO_SET, ioprioWhoPgrp, 0, uintptr(iop)) 1078 if errno != 0 { 1079 return fmt.Errorf("failed to set io priority: %w", errno) 1080 } 1081 1082 return nil 1083 } 1084 1085 // isolatedCPUAffinityTransition returns a CPU affinity if necessary based on heuristics 1086 // and org.opencontainers.runc.exec.isolated-cpu-affinity-transition annotation value. 
1087 func isolatedCPUAffinityTransition(rootFS fs.FS, cpusetList string, annotations map[string]string) (int, bool, error) { 1088 const ( 1089 isolatedCPUAffinityTransitionAnnotation = "org.opencontainers.runc.exec.isolated-cpu-affinity-transition" 1090 nohzFullParam = "nohz_full" 1091 ) 1092 1093 definitive := false 1094 1095 transition := annotations[isolatedCPUAffinityTransitionAnnotation] 1096 switch transition { 1097 case "temporary": 1098 case "definitive": 1099 definitive = true 1100 default: 1101 if transition != "" { 1102 return -1, false, fmt.Errorf( 1103 "unknown transition value %q for annotation %s", 1104 transition, isolatedCPUAffinityTransitionAnnotation, 1105 ) 1106 } 1107 return -1, false, nil 1108 } 1109 1110 kernelParams, err := kernelparam.LookupKernelBootParameters( 1111 rootFS, 1112 nohzFullParam, 1113 ) 1114 if err != nil { 1115 // If /proc/cmdline does not exist or isn't readable, continue to read 1116 // nohz_full from sysfs below. 1117 if !errors.Is(err, os.ErrNotExist) && !errors.Is(err, os.ErrPermission) { 1118 return -1, false, err 1119 } 1120 } 1121 1122 // First get nohz_full value from kernel boot params, if not 1123 // present, get the value from sysfs, to cover the case where 1124 // CONFIG_NO_HZ_FULL_ALL is set, it also makes the integration 1125 // tests not dependent on /sys/devices/system/cpu/nohz_full. 1126 isolatedList := kernelParams[nohzFullParam] 1127 if isolatedList == "" { 1128 // Get the isolated CPU list, the error is not checked here because 1129 // no matter what the error is, it returns without error the same way 1130 // as with empty data. 
1131 isolatedData, _ := fs.ReadFile(rootFS, "sys/devices/system/cpu/nohz_full") 1132 isolatedList = string(bytes.TrimSpace(isolatedData)) 1133 if isolatedList == "" || isolatedList == "(null)" { 1134 return -1, false, nil 1135 } 1136 } 1137 1138 cpu, err := getEligibleCPU(cpusetList, isolatedList) 1139 if err != nil { 1140 return -1, false, fmt.Errorf("getting eligible cpu: %w", err) 1141 } else if cpu == -1 { 1142 definitive = false 1143 } 1144 1145 return cpu, definitive, nil 1146 } 1147 1148 // getEligibleCPU returns the first eligible CPU for CPU affinity before 1149 // entering in a cgroup cpuset: 1150 // - when there is not cpuset cores: no eligible CPU (-1) 1151 // - when there is not isolated cores: no eligible CPU (-1) 1152 // - when cpuset cores are not in isolated cores: no eligible CPU (-1) 1153 // - when cpuset cores are all isolated cores: return the first CPU of the cpuset 1154 // - when cpuset cores are mixed between housekeeping/isolated cores: return the 1155 // first housekeeping CPU not in isolated CPUs. 1156 func getEligibleCPU(cpusetList, isolatedList string) (int, error) { 1157 if isolatedList == "" || cpusetList == "" { 1158 return -1, nil 1159 } 1160 1161 // The target container has a cgroup cpuset, get the bit range. 1162 cpusetBits, err := systemd.RangeToBits(cpusetList) 1163 if err != nil { 1164 return -1, fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) 1165 } 1166 1167 isolatedBits, err := systemd.RangeToBits(isolatedList) 1168 if err != nil { 1169 return -1, fmt.Errorf("parsing isolated cpus list %s: %w", isolatedList, err) 1170 } 1171 1172 eligibleCore := -1 1173 isolatedCores := 0 1174 1175 // Start from cpu core #0. 1176 currentCore := 0 1177 // Handle mixed sets. 1178 mixed := false 1179 1180 // CPU core start from the first slice element and bits are read 1181 // from the least to the most significant bit. 
1182 for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { 1183 if byteRange >= len(isolatedBits) { 1184 // No more isolated cores. 1185 break 1186 } 1187 for bit := 0; bit < 8; bit++ { 1188 if cpusetBits[byteRange]&(1<<bit) != 0 { 1189 // Mark the first core of the cgroup cpuset as eligible. 1190 if eligibleCore < 0 { 1191 eligibleCore = currentCore 1192 } 1193 1194 // Isolated cores count. 1195 if isolatedBits[byteRange]&(1<<bit) != 0 { 1196 isolatedCores++ 1197 } else if !mixed { 1198 // Not an isolated core, mark the current core as eligible once. 1199 mixed = true 1200 eligibleCore = currentCore 1201 } 1202 if mixed && isolatedCores > 0 { 1203 return eligibleCore, nil 1204 } 1205 } 1206 currentCore++ 1207 } 1208 } 1209 1210 // We have an eligible CPU if there is at least one isolated CPU in the cpuset. 1211 if isolatedCores == 0 { 1212 return -1, nil 1213 } 1214 1215 return eligibleCore, nil 1216 } 1217 1218 // startCommandWithCPUAffinity starts a command on a specific CPU if set. 1219 func startCommandWithCPUAffinity(cmd *exec.Cmd, cpuAffinity int) error { 1220 errCh := make(chan error) 1221 defer close(errCh) 1222 1223 // Use a goroutine to dedicate an OS thread. 1224 go func() { 1225 cpuSet := new(unix.CPUSet) 1226 cpuSet.Zero() 1227 cpuSet.Set(cpuAffinity) 1228 1229 // Don't call runtime.UnlockOSThread to terminate the OS thread 1230 // when goroutine exits. 1231 runtime.LockOSThread() 1232 1233 // Command inherits the CPU affinity. 1234 if err := unix.SchedSetaffinity(unix.Gettid(), cpuSet); err != nil { 1235 errCh <- fmt.Errorf("setting os thread CPU affinity: %w", err) 1236 return 1237 } 1238 1239 errCh <- cmd.Start() 1240 }() 1241 1242 return <-errCh 1243 } 1244 1245 // fixProcessCPUAffinity sets the CPU affinity of a container process 1246 // to all CPUs allowed by container cgroup cpuset. 
1247 func fixProcessCPUAffinity(pid int, manager cgroups.Manager) error { 1248 cpusetList := manager.GetEffectiveCPUs() 1249 if cpusetList == "" { 1250 // If the cgroup cpuset is not present, the container will inherit 1251 // this process CPU affinity, so it can return without further actions. 1252 return nil 1253 } 1254 1255 cpusetBits, err := systemd.RangeToBits(cpusetList) 1256 if err != nil { 1257 return fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) 1258 } 1259 1260 processCPUSet := new(unix.CPUSet) 1261 1262 for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { 1263 for bit := 0; bit < 8; bit++ { 1264 processCPUSet.Set(byteRange*8 + bit) 1265 } 1266 } 1267 1268 if err := unix.SchedSetaffinity(pid, processCPUSet); err != nil { 1269 return fmt.Errorf("setting process PID %d CPU affinity: %w", pid, err) 1270 } 1271 1272 return nil 1273 }