github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/container_linux.go (about) 1 package libcontainer 2 3 import ( 4 "bytes" 5 "errors" 6 "fmt" 7 "io" 8 "os" 9 "os/exec" 10 "path" 11 "path/filepath" 12 "reflect" 13 "strconv" 14 "strings" 15 "sync" 16 "time" 17 18 "github.com/opencontainers/runtime-spec/specs-go" 19 "github.com/sirupsen/logrus" 20 "github.com/vishvananda/netlink/nl" 21 "golang.org/x/sys/execabs" 22 "golang.org/x/sys/unix" 23 24 "github.com/opencontainers/runc/libcontainer/cgroups" 25 "github.com/opencontainers/runc/libcontainer/configs" 26 "github.com/opencontainers/runc/libcontainer/dmz" 27 "github.com/opencontainers/runc/libcontainer/intelrdt" 28 "github.com/opencontainers/runc/libcontainer/system" 29 "github.com/opencontainers/runc/libcontainer/system/kernelversion" 30 "github.com/opencontainers/runc/libcontainer/utils" 31 ) 32 33 const stdioFdCount = 3 34 35 // Container is a libcontainer container object. 36 type Container struct { 37 id string 38 stateDir string 39 config *configs.Config 40 cgroupManager cgroups.Manager 41 intelRdtManager *intelrdt.Manager 42 initProcess parentProcess 43 initProcessStartTime uint64 44 m sync.Mutex 45 criuVersion int 46 state containerState 47 created time.Time 48 fifo *os.File 49 } 50 51 // State represents a running container's state 52 type State struct { 53 BaseState 54 55 // Platform specific fields below here 56 57 // Specified if the container was started under the rootless mode. 58 // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups 59 Rootless bool `json:"rootless"` 60 61 // Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths 62 // 63 // For cgroup v1, a key is cgroup subsystem name, and the value is the path 64 // to the cgroup for this subsystem. 65 // 66 // For cgroup v2 unified hierarchy, a key is "", and the value is the unified path. 67 CgroupPaths map[string]string `json:"cgroup_paths"` 68 69 // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type 70 // with the value as the path. 71 NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"` 72 73 // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore 74 ExternalDescriptors []string `json:"external_descriptors,omitempty"` 75 76 // Intel RDT "resource control" filesystem path 77 IntelRdtPath string `json:"intel_rdt_path"` 78 } 79 80 // ID returns the container's unique ID 81 func (c *Container) ID() string { 82 return c.id 83 } 84 85 // Config returns the container's configuration 86 func (c *Container) Config() configs.Config { 87 return *c.config 88 } 89 90 // Status returns the current status of the container. 91 func (c *Container) Status() (Status, error) { 92 c.m.Lock() 93 defer c.m.Unlock() 94 return c.currentStatus() 95 } 96 97 // State returns the current container's state information. 98 func (c *Container) State() (*State, error) { 99 c.m.Lock() 100 defer c.m.Unlock() 101 return c.currentState() 102 } 103 104 // OCIState returns the current container's state information. 105 func (c *Container) OCIState() (*specs.State, error) { 106 c.m.Lock() 107 defer c.m.Unlock() 108 return c.currentOCIState() 109 } 110 111 // ignoreCgroupError filters out cgroup-related errors that can be ignored, 112 // because the container is stopped and its cgroup is gone. 113 func (c *Container) ignoreCgroupError(err error) error { 114 if err == nil { 115 return nil 116 } 117 if errors.Is(err, os.ErrNotExist) && !c.hasInit() && !c.cgroupManager.Exists() { 118 return nil 119 } 120 return err 121 } 122 123 // Processes returns the PIDs inside this container. The PIDs are in the 124 // namespace of the calling process. 125 // 126 // Some of the returned PIDs may no longer refer to processes in the container, 127 // unless the container state is PAUSED in which case every PID in the slice is 128 // valid. 129 func (c *Container) Processes() ([]int, error) { 130 pids, err := c.cgroupManager.GetAllPids() 131 if err = c.ignoreCgroupError(err); err != nil { 132 return nil, fmt.Errorf("unable to get all container pids: %w", err) 133 } 134 return pids, nil 135 } 136 137 // Stats returns statistics for the container. 138 func (c *Container) Stats() (*Stats, error) { 139 var ( 140 err error 141 stats = &Stats{} 142 ) 143 if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { 144 return stats, fmt.Errorf("unable to get container cgroup stats: %w", err) 145 } 146 if c.intelRdtManager != nil { 147 if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil { 148 return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err) 149 } 150 } 151 for _, iface := range c.config.Networks { 152 switch iface.Type { 153 case "veth": 154 istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) 155 if err != nil { 156 return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err) 157 } 158 stats.Interfaces = append(stats.Interfaces, istats) 159 } 160 } 161 return stats, nil 162 } 163 164 // Set resources of container as configured. Can be used to change resources 165 // when the container is running. 166 func (c *Container) Set(config configs.Config) error { 167 c.m.Lock() 168 defer c.m.Unlock() 169 status, err := c.currentStatus() 170 if err != nil { 171 return err 172 } 173 if status == Stopped { 174 return ErrNotRunning 175 } 176 if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil { 177 // Set configs back 178 if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil { 179 logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) 180 } 181 return err 182 } 183 if c.intelRdtManager != nil { 184 if err := c.intelRdtManager.Set(&config); err != nil { 185 // Set configs back 186 if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil { 187 logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) 188 } 189 if err2 := c.intelRdtManager.Set(c.config); err2 != nil { 190 logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) 191 } 192 return err 193 } 194 } 195 // After config setting succeed, update config and states 196 c.config = &config 197 _, err = c.updateState(nil) 198 return err 199 } 200 201 // Start starts a process inside the container. Returns error if process fails 202 // to start. You can track process lifecycle with passed Process structure. 203 func (c *Container) Start(process *Process) error { 204 c.m.Lock() 205 defer c.m.Unlock() 206 if c.config.Cgroups.Resources.SkipDevices { 207 return errors.New("can't start container with SkipDevices set") 208 } 209 if process.Init { 210 if err := c.createExecFifo(); err != nil { 211 return err 212 } 213 } 214 if err := c.start(process); err != nil { 215 if process.Init { 216 c.deleteExecFifo() 217 } 218 return err 219 } 220 return nil 221 } 222 223 // Run immediately starts the process inside the container. Returns an error if 224 // the process fails to start. It does not block waiting for the exec fifo 225 // after start returns but opens the fifo after start returns. 226 func (c *Container) Run(process *Process) error { 227 if err := c.Start(process); err != nil { 228 return err 229 } 230 if process.Init { 231 return c.exec() 232 } 233 return nil 234 } 235 236 // Exec signals the container to exec the users process at the end of the init. 237 func (c *Container) Exec() error { 238 c.m.Lock() 239 defer c.m.Unlock() 240 return c.exec() 241 } 242 243 func (c *Container) exec() error { 244 path := filepath.Join(c.stateDir, execFifoFilename) 245 pid := c.initProcess.pid() 246 blockingFifoOpenCh := awaitFifoOpen(path) 247 for { 248 select { 249 case result := <-blockingFifoOpenCh: 250 return handleFifoResult(result) 251 252 case <-time.After(time.Millisecond * 100): 253 stat, err := system.Stat(pid) 254 if err != nil || stat.State == system.Zombie { 255 // could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check. 256 // see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete). 257 if err := handleFifoResult(fifoOpen(path, false)); err != nil { 258 return errors.New("container process is already dead") 259 } 260 return nil 261 } 262 } 263 } 264 } 265 266 func readFromExecFifo(execFifo io.Reader) error { 267 data, err := io.ReadAll(execFifo) 268 if err != nil { 269 return err 270 } 271 if len(data) <= 0 { 272 return errors.New("cannot start an already running container") 273 } 274 return nil 275 } 276 277 func awaitFifoOpen(path string) <-chan openResult { 278 fifoOpened := make(chan openResult) 279 go func() { 280 result := fifoOpen(path, true) 281 fifoOpened <- result 282 }() 283 return fifoOpened 284 } 285 286 func fifoOpen(path string, block bool) openResult { 287 flags := os.O_RDONLY 288 if !block { 289 flags |= unix.O_NONBLOCK 290 } 291 f, err := os.OpenFile(path, flags, 0) 292 if err != nil { 293 return openResult{err: fmt.Errorf("exec fifo: %w", err)} 294 } 295 return openResult{file: f} 296 } 297 298 func handleFifoResult(result openResult) error { 299 if result.err != nil { 300 return result.err 301 } 302 f := result.file 303 defer f.Close() 304 if err := readFromExecFifo(f); err != nil { 305 return err 306 } 307 return os.Remove(f.Name()) 308 } 309 310 type openResult struct { 311 file *os.File 312 err error 313 } 314 315 func (c *Container) start(process *Process) (retErr error) { 316 parent, err := c.newParentProcess(process) 317 if err != nil { 318 return fmt.Errorf("unable to create new parent process: %w", err) 319 } 320 // We do not need the cloned binaries once the process is spawned. 321 defer process.closeClonedExes() 322 323 logsDone := parent.forwardChildLogs() 324 if logsDone != nil { 325 defer func() { 326 // Wait for log forwarder to finish. This depends on 327 // runc init closing the _LIBCONTAINER_LOGPIPE log fd. 328 err := <-logsDone 329 if err != nil && retErr == nil { 330 retErr = fmt.Errorf("unable to forward init logs: %w", err) 331 } 332 }() 333 } 334 335 // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC 336 // to make sure we don't leak any files into "runc init". Any files to be 337 // passed to "runc init" through ExtraFiles will get dup2'd by the Go 338 // runtime and thus their O_CLOEXEC flag will be cleared. This is some 339 // additional protection against attacks like CVE-2024-21626, by making 340 // sure we never leak files to "runc init" we didn't intend to. 341 if err := utils.CloseExecFrom(3); err != nil { 342 return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) 343 } 344 if err := parent.start(); err != nil { 345 return fmt.Errorf("unable to start container process: %w", err) 346 } 347 348 if process.Init { 349 c.fifo.Close() 350 if c.config.Hooks != nil { 351 s, err := c.currentOCIState() 352 if err != nil { 353 return err 354 } 355 356 if err := c.config.Hooks.Run(configs.Poststart, s); err != nil { 357 if err := ignoreTerminateErrors(parent.terminate()); err != nil { 358 logrus.Warn(fmt.Errorf("error running poststart hook: %w", err)) 359 } 360 return err 361 } 362 } 363 } 364 return nil 365 } 366 367 // Signal sends a specified signal to container's init. 368 // 369 // When s is SIGKILL and the container does not have its own PID namespace, all 370 // the container's processes are killed. In this scenario, the libcontainer 371 // user may be required to implement a proper child reaper. 372 func (c *Container) Signal(s os.Signal) error { 373 c.m.Lock() 374 defer c.m.Unlock() 375 376 // When a container has its own PID namespace, inside it the init PID 377 // is 1, and thus it is handled specially by the kernel. In particular, 378 // killing init with SIGKILL from an ancestor namespace will also kill 379 // all other processes in that PID namespace (see pid_namespaces(7)). 380 // 381 // OTOH, if PID namespace is shared, we should kill all pids to avoid 382 // leftover processes. Handle this special case here. 383 if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) { 384 if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil { 385 return fmt.Errorf("unable to kill all processes: %w", err) 386 } 387 return nil 388 } 389 390 // To avoid a PID reuse attack, don't kill non-running container. 391 if !c.hasInit() { 392 return ErrNotRunning 393 } 394 if err := c.initProcess.signal(s); err != nil { 395 return fmt.Errorf("unable to signal init: %w", err) 396 } 397 if s == unix.SIGKILL { 398 // For cgroup v1, killing a process in a frozen cgroup 399 // does nothing until it's thawed. Only thaw the cgroup 400 // for SIGKILL. 401 if paused, _ := c.isPaused(); paused { 402 _ = c.cgroupManager.Freeze(configs.Thawed) 403 } 404 } 405 return nil 406 } 407 408 func (c *Container) createExecFifo() error { 409 rootuid, err := c.Config().HostRootUID() 410 if err != nil { 411 return err 412 } 413 rootgid, err := c.Config().HostRootGID() 414 if err != nil { 415 return err 416 } 417 418 fifoName := filepath.Join(c.stateDir, execFifoFilename) 419 if _, err := os.Stat(fifoName); err == nil { 420 return fmt.Errorf("exec fifo %s already exists", fifoName) 421 } 422 if err := unix.Mkfifo(fifoName, 0o622); err != nil { 423 return &os.PathError{Op: "mkfifo", Path: fifoName, Err: err} 424 } 425 // Ensure permission bits (can be different because of umask). 426 if err := os.Chmod(fifoName, 0o622); err != nil { 427 return err 428 } 429 return os.Chown(fifoName, rootuid, rootgid) 430 } 431 432 func (c *Container) deleteExecFifo() { 433 fifoName := filepath.Join(c.stateDir, execFifoFilename) 434 os.Remove(fifoName) 435 } 436 437 // includeExecFifo opens the container's execfifo as a pathfd, so that the 438 // container cannot access the statedir (and the FIFO itself remains 439 // un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited 440 // fd, with _LIBCONTAINER_FIFOFD set to its fd number. 441 func (c *Container) includeExecFifo(cmd *exec.Cmd) error { 442 fifoName := filepath.Join(c.stateDir, execFifoFilename) 443 fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) 444 if err != nil { 445 return err 446 } 447 c.fifo = fifo 448 449 cmd.ExtraFiles = append(cmd.ExtraFiles, fifo) 450 cmd.Env = append(cmd.Env, 451 "_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) 452 return nil 453 } 454 455 // No longer needed in Go 1.21. 456 func slicesContains[S ~[]E, E comparable](slice S, needle E) bool { 457 for _, val := range slice { 458 if val == needle { 459 return true 460 } 461 } 462 return false 463 } 464 465 func isDmzBinarySafe(c *configs.Config) bool { 466 // Because we set the dumpable flag in nsexec, the only time when it is 467 // unsafe to use runc-dmz is when the container process would be able to 468 // race against "runc init" and bypass the ptrace_may_access() checks. 469 // 470 // This is only the case if the container processes could have 471 // CAP_SYS_PTRACE somehow (i.e. the capability is present in the bounding, 472 // inheritable, or ambient sets). Luckily, most containers do not have this 473 // capability. 474 if c.Capabilities == nil || 475 (!slicesContains(c.Capabilities.Bounding, "CAP_SYS_PTRACE") && 476 !slicesContains(c.Capabilities.Inheritable, "CAP_SYS_PTRACE") && 477 !slicesContains(c.Capabilities.Ambient, "CAP_SYS_PTRACE")) { 478 return true 479 } 480 481 // Since Linux 4.10 (see bfedb589252c0) user namespaced containers cannot 482 // access /proc/$pid/exe of runc after it joins the namespace (until it 483 // does an exec), regardless of the capability set. This has been 484 // backported to other distribution kernels, but there's no way of checking 485 // this cheaply -- better to be safe than sorry here. 486 linux410 := kernelversion.KernelVersion{Kernel: 4, Major: 10} 487 if ok, err := kernelversion.GreaterEqualThan(linux410); ok && err == nil { 488 if c.Namespaces.Contains(configs.NEWUSER) { 489 return true 490 } 491 } 492 493 // Assume it's unsafe otherwise. 494 return false 495 } 496 497 func (c *Container) newParentProcess(p *Process) (parentProcess, error) { 498 comm, err := newProcessComm() 499 if err != nil { 500 return nil, err 501 } 502 503 // Make sure we use a new safe copy of /proc/self/exe or the runc-dmz 504 // binary each time this is called, to make sure that if a container 505 // manages to overwrite the file it cannot affect other containers on the 506 // system. For runc, this code will only ever be called once, but 507 // libcontainer users might call this more than once. 508 p.closeClonedExes() 509 var ( 510 exePath string 511 // only one of dmzExe or safeExe are used at a time 512 dmzExe, safeExe *os.File 513 ) 514 if dmz.IsSelfExeCloned() { 515 // /proc/self/exe is already a cloned binary -- no need to do anything 516 logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!") 517 // We don't need to use /proc/thread-self here because the exe mm of a 518 // thread-group is guaranteed to be the same for all threads by 519 // definition. This lets us avoid having to do runtime.LockOSThread. 520 exePath = "/proc/self/exe" 521 } else { 522 var err error 523 if isDmzBinarySafe(c.config) { 524 dmzExe, err = dmz.Binary(c.stateDir) 525 if err == nil { 526 // We can use our own executable without cloning if we are 527 // using runc-dmz. We don't need to use /proc/thread-self here 528 // because the exe mm of a thread-group is guaranteed to be the 529 // same for all threads by definition. This lets us avoid 530 // having to do runtime.LockOSThread. 531 exePath = "/proc/self/exe" 532 p.clonedExes = append(p.clonedExes, dmzExe) 533 logrus.Debug("runc-dmz: using runc-dmz") // used for tests 534 } else if errors.Is(err, dmz.ErrNoDmzBinary) { 535 logrus.Debug("runc-dmz binary not embedded in runc binary, falling back to /proc/self/exe clone") 536 } else if err != nil { 537 return nil, fmt.Errorf("failed to create runc-dmz binary clone: %w", err) 538 } 539 } else { 540 // If the configuration makes it unsafe to use runc-dmz, pretend we 541 // don't have it embedded so we do /proc/self/exe cloning. 542 logrus.Debug("container configuration unsafe for runc-dmz, falling back to /proc/self/exe clone") 543 err = dmz.ErrNoDmzBinary 544 } 545 if errors.Is(err, dmz.ErrNoDmzBinary) { 546 safeExe, err = dmz.CloneSelfExe(c.stateDir) 547 if err != nil { 548 return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err) 549 } 550 exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd())) 551 p.clonedExes = append(p.clonedExes, safeExe) 552 logrus.Debug("runc-dmz: using /proc/self/exe clone") // used for tests 553 } 554 // Just to make sure we don't run without protection. 555 if dmzExe == nil && safeExe == nil { 556 // This should never happen. 557 return nil, fmt.Errorf("[internal error] attempted to spawn a container with no /proc/self/exe protection") 558 } 559 } 560 561 cmd := exec.Command(exePath, "init") 562 cmd.Args[0] = os.Args[0] 563 cmd.Stdin = p.Stdin 564 cmd.Stdout = p.Stdout 565 cmd.Stderr = p.Stderr 566 cmd.Dir = c.config.Rootfs 567 if cmd.SysProcAttr == nil { 568 cmd.SysProcAttr = &unix.SysProcAttr{} 569 } 570 cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS")) 571 cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) 572 if p.ConsoleSocket != nil { 573 cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) 574 cmd.Env = append(cmd.Env, 575 "_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), 576 ) 577 } 578 579 cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild) 580 cmd.Env = append(cmd.Env, 581 "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), 582 ) 583 cmd.ExtraFiles = append(cmd.ExtraFiles, comm.syncSockChild.File()) 584 cmd.Env = append(cmd.Env, 585 "_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), 586 ) 587 588 if dmzExe != nil { 589 cmd.ExtraFiles = append(cmd.ExtraFiles, dmzExe) 590 cmd.Env = append(cmd.Env, 591 "_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) 592 } 593 594 cmd.ExtraFiles = append(cmd.ExtraFiles, comm.logPipeChild) 595 cmd.Env = append(cmd.Env, 596 "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) 597 if p.LogLevel != "" { 598 cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel) 599 } 600 601 if p.PidfdSocket != nil { 602 cmd.ExtraFiles = append(cmd.ExtraFiles, p.PidfdSocket) 603 cmd.Env = append(cmd.Env, 604 "_LIBCONTAINER_PIDFD_SOCK="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), 605 ) 606 } 607 608 if safeExe != nil { 609 // Due to a Go stdlib bug, we need to add safeExe to the set of 610 // ExtraFiles otherwise it is possible for the stdlib to clobber the fd 611 // during forkAndExecInChild1 and replace it with some other file that 612 // might be malicious. This is less than ideal (because the descriptor 613 // will be non-O_CLOEXEC) however we have protections in "runc init" to 614 // stop us from leaking extra file descriptors. 615 // 616 // See <https://github.com/golang/go/issues/61751>. 617 cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe) 618 } 619 620 // NOTE: when running a container with no PID namespace and the parent 621 // process spawning the container is PID1 the pdeathsig is being 622 // delivered to the container's init process by the kernel for some 623 // reason even with the parent still running. 624 if c.config.ParentDeathSignal > 0 { 625 cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal) 626 } 627 628 if p.Init { 629 // We only set up fifoFd if we're not doing a `runc exec`. The historic 630 // reason for this is that previously we would pass a dirfd that allowed 631 // for container rootfs escape (and not doing it in `runc exec` avoided 632 // that problem), but we no longer do that. However, there's no need to do 633 // this for `runc exec` so we just keep it this way to be safe. 634 if err := c.includeExecFifo(cmd); err != nil { 635 return nil, fmt.Errorf("unable to setup exec fifo: %w", err) 636 } 637 return c.newInitProcess(p, cmd, comm) 638 } 639 return c.newSetnsProcess(p, cmd, comm) 640 } 641 642 func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) { 643 cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) 644 nsMaps := make(map[configs.NamespaceType]string) 645 for _, ns := range c.config.Namespaces { 646 if ns.Path != "" { 647 nsMaps[ns.Type] = ns.Path 648 } 649 } 650 data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) 651 if err != nil { 652 return nil, err 653 } 654 655 init := &initProcess{ 656 cmd: cmd, 657 comm: comm, 658 manager: c.cgroupManager, 659 intelRdtManager: c.intelRdtManager, 660 config: c.newInitConfig(p), 661 container: c, 662 process: p, 663 bootstrapData: data, 664 } 665 c.initProcess = init 666 return init, nil 667 } 668 669 func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*setnsProcess, error) { 670 cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) 671 state, err := c.currentState() 672 if err != nil { 673 return nil, fmt.Errorf("unable to get container state: %w", err) 674 } 675 // for setns process, we don't have to set cloneflags as the process namespaces 676 // will only be set via setns syscall 677 data, err := c.bootstrapData(0, state.NamespacePaths) 678 if err != nil { 679 return nil, err 680 } 681 proc := &setnsProcess{ 682 cmd: cmd, 683 cgroupPaths: state.CgroupPaths, 684 rootlessCgroups: c.config.RootlessCgroups, 685 intelRdtPath: state.IntelRdtPath, 686 comm: comm, 687 manager: c.cgroupManager, 688 config: c.newInitConfig(p), 689 process: p, 690 bootstrapData: data, 691 initProcessPid: state.InitProcessPid, 692 } 693 if len(p.SubCgroupPaths) > 0 { 694 if add, ok := p.SubCgroupPaths[""]; ok { 695 // cgroup v1: using the same path for all controllers. 696 // cgroup v2: the only possible way. 697 for k := range proc.cgroupPaths { 698 subPath := path.Join(proc.cgroupPaths[k], add) 699 if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) { 700 return nil, fmt.Errorf("%s is not a sub cgroup path", add) 701 } 702 proc.cgroupPaths[k] = subPath 703 } 704 // cgroup v2: do not try to join init process's cgroup 705 // as a fallback (see (*setnsProcess).start). 706 proc.initProcessPid = 0 707 } else { 708 // Per-controller paths. 709 for ctrl, add := range p.SubCgroupPaths { 710 if val, ok := proc.cgroupPaths[ctrl]; ok { 711 subPath := path.Join(val, add) 712 if !strings.HasPrefix(subPath, val) { 713 return nil, fmt.Errorf("%s is not a sub cgroup path", add) 714 } 715 proc.cgroupPaths[ctrl] = subPath 716 } else { 717 return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl) 718 } 719 } 720 } 721 } 722 return proc, nil 723 } 724 725 func (c *Container) newInitConfig(process *Process) *initConfig { 726 cfg := &initConfig{ 727 Config: c.config, 728 Args: process.Args, 729 Env: process.Env, 730 User: process.User, 731 AdditionalGroups: process.AdditionalGroups, 732 Cwd: process.Cwd, 733 Capabilities: process.Capabilities, 734 PassedFilesCount: len(process.ExtraFiles), 735 ContainerID: c.ID(), 736 NoNewPrivileges: c.config.NoNewPrivileges, 737 RootlessEUID: c.config.RootlessEUID, 738 RootlessCgroups: c.config.RootlessCgroups, 739 AppArmorProfile: c.config.AppArmorProfile, 740 ProcessLabel: c.config.ProcessLabel, 741 Rlimits: c.config.Rlimits, 742 CreateConsole: process.ConsoleSocket != nil, 743 ConsoleWidth: process.ConsoleWidth, 744 ConsoleHeight: process.ConsoleHeight, 745 } 746 if process.NoNewPrivileges != nil { 747 cfg.NoNewPrivileges = *process.NoNewPrivileges 748 } 749 if process.AppArmorProfile != "" { 750 cfg.AppArmorProfile = process.AppArmorProfile 751 } 752 if process.Label != "" { 753 cfg.ProcessLabel = process.Label 754 } 755 if len(process.Rlimits) > 0 { 756 cfg.Rlimits = process.Rlimits 757 } 758 if cgroups.IsCgroup2UnifiedMode() { 759 cfg.Cgroup2Path = c.cgroupManager.Path("") 760 } 761 762 return cfg 763 } 764 765 // Destroy destroys the container, if its in a valid state. 766 // 767 // Any event registrations are removed before the container is destroyed. 768 // No error is returned if the container is already destroyed. 769 // 770 // Running containers must first be stopped using Signal. 771 // Paused containers must first be resumed using Resume. 772 func (c *Container) Destroy() error { 773 c.m.Lock() 774 defer c.m.Unlock() 775 if err := c.state.destroy(); err != nil { 776 return fmt.Errorf("unable to destroy container: %w", err) 777 } 778 return nil 779 } 780 781 // Pause pauses the container, if its state is RUNNING or CREATED, changing 782 // its state to PAUSED. If the state is already PAUSED, does nothing. 783 func (c *Container) Pause() error { 784 c.m.Lock() 785 defer c.m.Unlock() 786 status, err := c.currentStatus() 787 if err != nil { 788 return err 789 } 790 switch status { 791 case Running, Created: 792 if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { 793 return err 794 } 795 return c.state.transition(&pausedState{ 796 c: c, 797 }) 798 } 799 return ErrNotRunning 800 } 801 802 // Resume resumes the execution of any user processes in the 803 // container before setting the container state to RUNNING. 804 // This is only performed if the current state is PAUSED. 805 // If the Container state is RUNNING, does nothing. 806 func (c *Container) Resume() error { 807 c.m.Lock() 808 defer c.m.Unlock() 809 status, err := c.currentStatus() 810 if err != nil { 811 return err 812 } 813 if status != Paused { 814 return ErrNotPaused 815 } 816 if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { 817 return err 818 } 819 return c.state.transition(&runningState{ 820 c: c, 821 }) 822 } 823 824 // NotifyOOM returns a read-only channel signaling when the container receives 825 // an OOM notification. 826 func (c *Container) NotifyOOM() (<-chan struct{}, error) { 827 // XXX(cyphar): This requires cgroups. 828 if c.config.RootlessCgroups { 829 logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups") 830 } 831 path := c.cgroupManager.Path("memory") 832 if cgroups.IsCgroup2UnifiedMode() { 833 return notifyOnOOMV2(path) 834 } 835 return notifyOnOOM(path) 836 } 837 838 // NotifyMemoryPressure returns a read-only channel signaling when the 839 // container reaches a given pressure level. 840 func (c *Container) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { 841 // XXX(cyphar): This requires cgroups. 842 if c.config.RootlessCgroups { 843 logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups") 844 } 845 return notifyMemoryPressure(c.cgroupManager.Path("memory"), level) 846 } 847 848 func (c *Container) updateState(process parentProcess) (*State, error) { 849 if process != nil { 850 c.initProcess = process 851 } 852 state, err := c.currentState() 853 if err != nil { 854 return nil, err 855 } 856 err = c.saveState(state) 857 if err != nil { 858 return nil, err 859 } 860 return state, nil 861 } 862 863 func (c *Container) saveState(s *State) (retErr error) { 864 tmpFile, err := os.CreateTemp(c.stateDir, "state-") 865 if err != nil { 866 return err 867 } 868 869 defer func() { 870 if retErr != nil { 871 tmpFile.Close() 872 os.Remove(tmpFile.Name()) 873 } 874 }() 875 876 err = utils.WriteJSON(tmpFile, s) 877 if err != nil { 878 return err 879 } 880 err = tmpFile.Close() 881 if err != nil { 882 return err 883 } 884 885 stateFilePath := filepath.Join(c.stateDir, stateFilename) 886 return os.Rename(tmpFile.Name(), stateFilePath) 887 } 888 889 func (c *Container) currentStatus() (Status, error) { 890 if err := c.refreshState(); err != nil { 891 return -1, err 892 } 893 return c.state.status(), nil 894 } 895 896 // refreshState needs to be called to verify that the current state on the 897 // container is what is true. Because consumers of libcontainer can use it 898 // out of process we need to verify the container's status based on runtime 899 // information and not rely on our in process info. 900 func (c *Container) refreshState() error { 901 paused, err := c.isPaused() 902 if err != nil { 903 return err 904 } 905 if paused { 906 return c.state.transition(&pausedState{c: c}) 907 } 908 if !c.hasInit() { 909 return c.state.transition(&stoppedState{c: c}) 910 } 911 // The presence of exec fifo helps to distinguish between 912 // the created and the running states. 913 if _, err := os.Stat(filepath.Join(c.stateDir, execFifoFilename)); err == nil { 914 return c.state.transition(&createdState{c: c}) 915 } 916 return c.state.transition(&runningState{c: c}) 917 } 918 919 // hasInit tells whether the container init process exists. 920 func (c *Container) hasInit() bool { 921 if c.initProcess == nil { 922 return false 923 } 924 pid := c.initProcess.pid() 925 stat, err := system.Stat(pid) 926 if err != nil { 927 return false 928 } 929 if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead { 930 return false 931 } 932 return true 933 } 934 935 func (c *Container) isPaused() (bool, error) { 936 state, err := c.cgroupManager.GetFreezerState() 937 if err != nil { 938 return false, err 939 } 940 return state == configs.Frozen, nil 941 } 942 943 func (c *Container) currentState() (*State, error) { 944 var ( 945 startTime uint64 946 externalDescriptors []string 947 pid = -1 948 ) 949 if c.initProcess != nil { 950 pid = c.initProcess.pid() 951 startTime, _ = c.initProcess.startTime() 952 externalDescriptors = c.initProcess.externalDescriptors() 953 } 954 955 intelRdtPath := "" 956 if c.intelRdtManager != nil { 957 intelRdtPath = c.intelRdtManager.GetPath() 958 } 959 state := &State{ 960 BaseState: BaseState{ 961 ID: c.ID(), 962 Config: *c.config, 963 InitProcessPid: pid, 964 InitProcessStartTime: startTime, 965 Created: c.created, 966 }, 967 Rootless: c.config.RootlessEUID && c.config.RootlessCgroups, 968 CgroupPaths: c.cgroupManager.GetPaths(), 969 IntelRdtPath: intelRdtPath, 970 NamespacePaths: make(map[configs.NamespaceType]string), 971 ExternalDescriptors: externalDescriptors, 972 } 973 if pid > 0 { 974 for _, ns := range c.config.Namespaces { 975 state.NamespacePaths[ns.Type] = ns.GetPath(pid) 976 } 977 for _, nsType := range configs.NamespaceTypes() { 978 if !configs.IsNamespaceSupported(nsType) { 979 continue 980 } 981 if _, ok := state.NamespacePaths[nsType]; !ok { 982 ns := configs.Namespace{Type: nsType} 983 state.NamespacePaths[ns.Type] = ns.GetPath(pid) 984 } 985 } 986 } 987 return state, nil 988 } 989 990 func (c *Container) currentOCIState() (*specs.State, error) { 991 bundle, annotations := utils.Annotations(c.config.Labels) 992 state := &specs.State{ 993 Version: specs.Version, 994 ID: c.ID(), 995 Bundle: bundle, 996 Annotations: annotations, 997 } 998 status, err := c.currentStatus() 999 if err != nil { 1000 return nil, err 1001 } 1002 state.Status = specs.ContainerState(status.String()) 1003 if status != Stopped { 1004 if c.initProcess != nil { 1005 state.Pid = c.initProcess.pid() 1006 } 1007 } 1008 return state, nil 1009 } 1010 1011 // orderNamespacePaths sorts namespace paths into a list of paths that we 1012 // can setns in order. 1013 func (c *Container) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { 1014 paths := []string{} 1015 for _, ns := range configs.NamespaceTypes() { 1016 1017 // Remove namespaces that we don't need to join. 1018 if !c.config.Namespaces.Contains(ns) { 1019 continue 1020 } 1021 1022 if p, ok := namespaces[ns]; ok && p != "" { 1023 // check if the requested namespace is supported 1024 if !configs.IsNamespaceSupported(ns) { 1025 return nil, fmt.Errorf("namespace %s is not supported", ns) 1026 } 1027 // only set to join this namespace if it exists 1028 if _, err := os.Lstat(p); err != nil { 1029 return nil, fmt.Errorf("namespace path: %w", err) 1030 } 1031 // do not allow namespace path with comma as we use it to separate 1032 // the namespace paths 1033 if strings.ContainsRune(p, ',') { 1034 return nil, fmt.Errorf("invalid namespace path %s", p) 1035 } 1036 paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p)) 1037 } 1038 1039 } 1040 1041 return paths, nil 1042 } 1043 1044 func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { 1045 data := bytes.NewBuffer(nil) 1046 for _, im := range idMap { 1047 line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) 1048 if _, err := data.WriteString(line); err != nil { 1049 return nil, err 1050 } 1051 } 1052 return data.Bytes(), nil 1053 } 1054 1055 // netlinkError is an error wrapper type for use by custom netlink message 1056 // types. Panics with errors are wrapped in netlinkError so that the recover 1057 // in bootstrapData can distinguish intentional panics. 1058 type netlinkError struct{ error } 1059 1060 // bootstrapData encodes the necessary data in netlink binary format 1061 // as a io.Reader. 1062 // Consumer can write the data to a bootstrap program 1063 // such as one that uses nsenter package to bootstrap the container's 1064 // init process correctly, i.e. with correct namespaces, uid/gid 1065 // mapping etc. 1066 func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) { 1067 // create the netlink message 1068 r := nl.NewNetlinkRequest(int(InitMsg), 0) 1069 1070 // Our custom messages cannot bubble up an error using returns, instead 1071 // they will panic with the specific error type, netlinkError. In that 1072 // case, recover from the panic and return that as an error. 1073 defer func() { 1074 if r := recover(); r != nil { 1075 if e, ok := r.(netlinkError); ok { 1076 Err = e.error 1077 } else { 1078 panic(r) 1079 } 1080 } 1081 }() 1082 1083 // write cloneFlags 1084 r.AddData(&Int32msg{ 1085 Type: CloneFlagsAttr, 1086 Value: uint32(cloneFlags), 1087 }) 1088 1089 // write custom namespace paths 1090 if len(nsMaps) > 0 { 1091 nsPaths, err := c.orderNamespacePaths(nsMaps) 1092 if err != nil { 1093 return nil, err 1094 } 1095 r.AddData(&Bytemsg{ 1096 Type: NsPathsAttr, 1097 Value: []byte(strings.Join(nsPaths, ",")), 1098 }) 1099 } 1100 1101 // write namespace paths only when we are not joining an existing user ns 1102 _, joinExistingUser := nsMaps[configs.NEWUSER] 1103 if !joinExistingUser { 1104 // write uid mappings 1105 if len(c.config.UIDMappings) > 0 { 1106 if c.config.RootlessEUID { 1107 // We resolve the paths for new{u,g}idmap from 1108 // the context of runc to avoid doing a path 1109 // lookup in the nsexec context. 1110 if path, err := execabs.LookPath("newuidmap"); err == nil { 1111 r.AddData(&Bytemsg{ 1112 Type: UidmapPathAttr, 1113 Value: []byte(path), 1114 }) 1115 } 1116 } 1117 b, err := encodeIDMapping(c.config.UIDMappings) 1118 if err != nil { 1119 return nil, err 1120 } 1121 r.AddData(&Bytemsg{ 1122 Type: UidmapAttr, 1123 Value: b, 1124 }) 1125 } 1126 1127 // write gid mappings 1128 if len(c.config.GIDMappings) > 0 { 1129 b, err := encodeIDMapping(c.config.GIDMappings) 1130 if err != nil { 1131 return nil, err 1132 } 1133 r.AddData(&Bytemsg{ 1134 Type: GidmapAttr, 1135 Value: b, 1136 }) 1137 if c.config.RootlessEUID { 1138 if path, err := execabs.LookPath("newgidmap"); err == nil { 1139 r.AddData(&Bytemsg{ 1140 Type: GidmapPathAttr, 1141 Value: []byte(path), 1142 }) 1143 } 1144 } 1145 if requiresRootOrMappingTool(c.config) { 1146 r.AddData(&Boolmsg{ 1147 Type: SetgroupAttr, 1148 Value: true, 1149 }) 1150 } 1151 } 1152 } 1153 1154 if c.config.OomScoreAdj != nil { 1155 // write oom_score_adj 1156 r.AddData(&Bytemsg{ 1157 Type: OomScoreAdjAttr, 1158 Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)), 1159 }) 1160 } 1161 1162 // write rootless 1163 r.AddData(&Boolmsg{ 1164 Type: RootlessEUIDAttr, 1165 Value: c.config.RootlessEUID, 1166 }) 1167 1168 // write boottime and monotonic time ns offsets. 1169 if c.config.TimeOffsets != nil { 1170 var offsetSpec bytes.Buffer 1171 for clock, offset := range c.config.TimeOffsets { 1172 fmt.Fprintf(&offsetSpec, "%s %d %d\n", clock, offset.Secs, offset.Nanosecs) 1173 } 1174 r.AddData(&Bytemsg{ 1175 Type: TimeOffsetsAttr, 1176 Value: offsetSpec.Bytes(), 1177 }) 1178 } 1179 1180 return bytes.NewReader(r.Serialize()), nil 1181 } 1182 1183 // ignoreTerminateErrors returns nil if the given err matches an error known 1184 // to indicate that the terminate occurred successfully or err was nil, otherwise 1185 // err is returned unaltered. 1186 func ignoreTerminateErrors(err error) error { 1187 if err == nil { 1188 return nil 1189 } 1190 // terminate() might return an error from either Kill or Wait. 1191 // The (*Cmd).Wait documentation says: "If the command fails to run 1192 // or doesn't complete successfully, the error is of type *ExitError". 1193 // Filter out such errors (like "exit status 1" or "signal: killed"). 1194 var exitErr *exec.ExitError 1195 if errors.As(err, &exitErr) { 1196 return nil 1197 } 1198 if errors.Is(err, os.ErrProcessDone) { 1199 return nil 1200 } 1201 s := err.Error() 1202 if strings.Contains(s, "Wait was already called") { 1203 return nil 1204 } 1205 return err 1206 } 1207 1208 func requiresRootOrMappingTool(c *configs.Config) bool { 1209 gidMap := []configs.IDMap{ 1210 {ContainerID: 0, HostID: int64(os.Getegid()), Size: 1}, 1211 } 1212 return !reflect.DeepEqual(c.GIDMappings, gidMap) 1213 }