github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/init_linux.go (about) 1 package libcontainer 2 3 import ( 4 "bytes" 5 "encoding/json" 6 "errors" 7 "fmt" 8 "net" 9 "os" 10 "path/filepath" 11 "runtime" 12 "runtime/debug" 13 "strconv" 14 "strings" 15 16 "github.com/containerd/console" 17 "github.com/moby/sys/user" 18 "github.com/opencontainers/runtime-spec/specs-go" 19 "github.com/sirupsen/logrus" 20 "github.com/vishvananda/netlink" 21 "golang.org/x/sys/unix" 22 23 "github.com/opencontainers/runc/libcontainer/capabilities" 24 "github.com/opencontainers/runc/libcontainer/cgroups" 25 "github.com/opencontainers/runc/libcontainer/configs" 26 "github.com/opencontainers/runc/libcontainer/system" 27 "github.com/opencontainers/runc/libcontainer/utils" 28 ) 29 30 type initType string 31 32 const ( 33 initSetns initType = "setns" 34 initStandard initType = "standard" 35 ) 36 37 type pid struct { 38 Pid int `json:"stage2_pid"` 39 PidFirstChild int `json:"stage1_pid"` 40 } 41 42 // network is an internal struct used to setup container networks. 43 type network struct { 44 configs.Network 45 46 // TempVethPeerName is a unique temporary veth peer name that was placed into 47 // the container's namespace. 48 TempVethPeerName string `json:"temp_veth_peer_name"` 49 } 50 51 // initConfig is used for transferring parameters from Exec() to Init() 52 type initConfig struct { 53 Args []string `json:"args"` 54 Env []string `json:"env"` 55 Cwd string `json:"cwd"` 56 Capabilities *configs.Capabilities `json:"capabilities"` 57 ProcessLabel string `json:"process_label"` 58 AppArmorProfile string `json:"apparmor_profile"` 59 NoNewPrivileges bool `json:"no_new_privileges"` 60 User string `json:"user"` 61 AdditionalGroups []string `json:"additional_groups"` 62 Config *configs.Config `json:"config"` 63 Networks []*network `json:"network"` 64 PassedFilesCount int `json:"passed_files_count"` 65 ContainerID string `json:"containerid"` 66 Rlimits []configs.Rlimit `json:"rlimits"` 67 CreateConsole bool `json:"create_console"` 68 ConsoleWidth uint16 `json:"console_width"` 69 ConsoleHeight uint16 `json:"console_height"` 70 RootlessEUID bool `json:"rootless_euid,omitempty"` 71 RootlessCgroups bool `json:"rootless_cgroups,omitempty"` 72 SpecState *specs.State `json:"spec_state,omitempty"` 73 Cgroup2Path string `json:"cgroup2_path,omitempty"` 74 } 75 76 // Init is part of "runc init" implementation. 77 func Init() { 78 runtime.GOMAXPROCS(1) 79 runtime.LockOSThread() 80 81 if err := startInitialization(); err != nil { 82 // If the error is returned, it was not communicated 83 // back to the parent (which is not a common case), 84 // so print it to stderr here as a last resort. 85 // 86 // Do not use logrus as we are not sure if it has been 87 // set up yet, but most important, if the parent is 88 // alive (and its log forwarding is working). 89 fmt.Fprintln(os.Stderr, err) 90 } 91 // Normally, StartInitialization() never returns, meaning 92 // if we are here, it had failed. 93 os.Exit(255) 94 } 95 96 // Normally, this function does not return. If it returns, with or without an 97 // error, it means the initialization has failed. If the error is returned, 98 // it means the error can not be communicated back to the parent. 99 func startInitialization() (retErr error) { 100 // Get the synchronisation pipe. 101 envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE") 102 syncPipeFd, err := strconv.Atoi(envSyncPipe) 103 if err != nil { 104 return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err) 105 } 106 syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync")) 107 defer syncPipe.Close() 108 109 defer func() { 110 // If this defer is ever called, this means initialization has failed. 111 // Send the error back to the parent process in the form of an initError 112 // if the sync socket has not been closed. 113 if syncPipe.isClosed() { 114 return 115 } 116 ierr := initError{Message: retErr.Error()} 117 if err := writeSyncArg(syncPipe, procError, ierr); err != nil { 118 fmt.Fprintln(os.Stderr, err) 119 return 120 } 121 // The error is sent, no need to also return it (or it will be reported twice). 122 retErr = nil 123 }() 124 125 // Get the INITPIPE. 126 envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE") 127 initPipeFd, err := strconv.Atoi(envInitPipe) 128 if err != nil { 129 return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err) 130 } 131 initPipe := os.NewFile(uintptr(initPipeFd), "init") 132 defer initPipe.Close() 133 134 // Set up logging. This is used rarely, and mostly for init debugging. 135 136 // Passing log level is optional; currently libcontainer/integration does not do it. 137 if levelStr := os.Getenv("_LIBCONTAINER_LOGLEVEL"); levelStr != "" { 138 logLevel, err := strconv.Atoi(levelStr) 139 if err != nil { 140 return fmt.Errorf("unable to convert _LIBCONTAINER_LOGLEVEL: %w", err) 141 } 142 logrus.SetLevel(logrus.Level(logLevel)) 143 } 144 145 logFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE")) 146 if err != nil { 147 return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err) 148 } 149 logPipe := os.NewFile(uintptr(logFd), "logpipe") 150 151 logrus.SetOutput(logPipe) 152 logrus.SetFormatter(new(logrus.JSONFormatter)) 153 logrus.Debug("child process in init()") 154 155 // Only init processes have FIFOFD. 156 var fifoFile *os.File 157 envInitType := os.Getenv("_LIBCONTAINER_INITTYPE") 158 it := initType(envInitType) 159 if it == initStandard { 160 fifoFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_FIFOFD")) 161 if err != nil { 162 return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err) 163 } 164 fifoFile = os.NewFile(uintptr(fifoFd), "initfifo") 165 } 166 167 var consoleSocket *os.File 168 if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" { 169 console, err := strconv.Atoi(envConsole) 170 if err != nil { 171 return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err) 172 } 173 consoleSocket = os.NewFile(uintptr(console), "console-socket") 174 defer consoleSocket.Close() 175 } 176 177 var pidfdSocket *os.File 178 if envSockFd := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envSockFd != "" { 179 sockFd, err := strconv.Atoi(envSockFd) 180 if err != nil { 181 return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err) 182 } 183 pidfdSocket = os.NewFile(uintptr(sockFd), "pidfd-socket") 184 defer pidfdSocket.Close() 185 } 186 187 // Get runc-dmz fds. 188 var dmzExe *os.File 189 if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" { 190 dmzFd, err := strconv.Atoi(dmzFdStr) 191 if err != nil { 192 return fmt.Errorf("unable to convert _LIBCONTAINER_DMZEXEFD: %w", err) 193 } 194 unix.CloseOnExec(dmzFd) 195 dmzExe = os.NewFile(uintptr(dmzFd), "runc-dmz") 196 } 197 198 // clear the current process's environment to clean any libcontainer 199 // specific env vars. 200 os.Clearenv() 201 202 defer func() { 203 if err := recover(); err != nil { 204 if err2, ok := err.(error); ok { 205 retErr = fmt.Errorf("panic from initialization: %w, %s", err2, debug.Stack()) 206 } else { 207 retErr = fmt.Errorf("panic from initialization: %v, %s", err, debug.Stack()) 208 } 209 } 210 }() 211 212 var config initConfig 213 if err := json.NewDecoder(initPipe).Decode(&config); err != nil { 214 return err 215 } 216 217 // If init succeeds, it will not return, hence none of the defers will be called. 218 return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe, dmzExe) 219 } 220 221 func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket, fifoFile, logPipe, dmzExe *os.File) error { 222 if err := populateProcessEnvironment(config.Env); err != nil { 223 return err 224 } 225 226 // Clean the RLIMIT_NOFILE cache in go runtime. 227 // Issue: https://github.com/opencontainers/runc/issues/4195 228 if containsRlimit(config.Rlimits, unix.RLIMIT_NOFILE) { 229 system.ClearRlimitNofileCache() 230 } 231 232 switch t { 233 case initSetns: 234 i := &linuxSetnsInit{ 235 pipe: pipe, 236 consoleSocket: consoleSocket, 237 pidfdSocket: pidfdSocket, 238 config: config, 239 logPipe: logPipe, 240 dmzExe: dmzExe, 241 } 242 return i.Init() 243 case initStandard: 244 i := &linuxStandardInit{ 245 pipe: pipe, 246 consoleSocket: consoleSocket, 247 pidfdSocket: pidfdSocket, 248 parentPid: unix.Getppid(), 249 config: config, 250 fifoFile: fifoFile, 251 logPipe: logPipe, 252 dmzExe: dmzExe, 253 } 254 return i.Init() 255 } 256 return fmt.Errorf("unknown init type %q", t) 257 } 258 259 // populateProcessEnvironment loads the provided environment variables into the 260 // current processes's environment. 261 func populateProcessEnvironment(env []string) error { 262 for _, pair := range env { 263 p := strings.SplitN(pair, "=", 2) 264 if len(p) < 2 { 265 return errors.New("invalid environment variable: missing '='") 266 } 267 name, val := p[0], p[1] 268 if name == "" { 269 return errors.New("invalid environment variable: name cannot be empty") 270 } 271 if strings.IndexByte(name, 0) >= 0 { 272 return fmt.Errorf("invalid environment variable %q: name contains nul byte (\\x00)", name) 273 } 274 if strings.IndexByte(val, 0) >= 0 { 275 return fmt.Errorf("invalid environment variable %q: value contains nul byte (\\x00)", name) 276 } 277 if err := os.Setenv(name, val); err != nil { 278 return err 279 } 280 } 281 return nil 282 } 283 284 // verifyCwd ensures that the current directory is actually inside the mount 285 // namespace root of the current process. 286 func verifyCwd() error { 287 // getcwd(2) on Linux detects if cwd is outside of the rootfs of the 288 // current mount namespace root, and in that case prefixes "(unreachable)" 289 // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect 290 // when this happens and return ENOENT rather than returning a non-absolute 291 // path. In both cases we can therefore easily detect if we have an invalid 292 // cwd by checking the return value of getcwd(3). See getcwd(3) for more 293 // details, and CVE-2024-21626 for the security issue that motivated this 294 // check. 295 // 296 // We have to use unix.Getwd() here because os.Getwd() has a workaround for 297 // $PWD which involves doing stat(.), which can fail if the current 298 // directory is inaccessible to the container process. 299 if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) { 300 return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected") 301 } else if err != nil { 302 return fmt.Errorf("failed to verify if current working directory is safe: %w", err) 303 } else if !filepath.IsAbs(wd) { 304 // We shouldn't ever hit this, but check just in case. 305 return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd) 306 } 307 return nil 308 } 309 310 // finalizeNamespace drops the caps, sets the correct user 311 // and working dir, and closes any leaked file descriptors 312 // before executing the command inside the namespace 313 func finalizeNamespace(config *initConfig) error { 314 // Ensure that all unwanted fds we may have accidentally 315 // inherited are marked close-on-exec so they stay out of the 316 // container 317 if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { 318 return fmt.Errorf("error closing exec fds: %w", err) 319 } 320 321 // we only do chdir if it's specified 322 doChdir := config.Cwd != "" 323 if doChdir { 324 // First, attempt the chdir before setting up the user. 325 // This could allow us to access a directory that the user running runc can access 326 // but the container user cannot. 327 err := unix.Chdir(config.Cwd) 328 switch { 329 case err == nil: 330 doChdir = false 331 case os.IsPermission(err): 332 // If we hit an EPERM, we should attempt again after setting up user. 333 // This will allow us to successfully chdir if the container user has access 334 // to the directory, but the user running runc does not. 335 // This is useful in cases where the cwd is also a volume that's been chowned to the container user. 336 default: 337 return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err) 338 } 339 } 340 341 caps := &configs.Capabilities{} 342 if config.Capabilities != nil { 343 caps = config.Capabilities 344 } else if config.Config.Capabilities != nil { 345 caps = config.Config.Capabilities 346 } 347 w, err := capabilities.New(caps) 348 if err != nil { 349 return err 350 } 351 // drop capabilities in bounding set before changing user 352 if err := w.ApplyBoundingSet(); err != nil { 353 return fmt.Errorf("unable to apply bounding set: %w", err) 354 } 355 // preserve existing capabilities while we change users 356 if err := system.SetKeepCaps(); err != nil { 357 return fmt.Errorf("unable to set keep caps: %w", err) 358 } 359 if err := setupUser(config); err != nil { 360 return fmt.Errorf("unable to setup user: %w", err) 361 } 362 // Change working directory AFTER the user has been set up, if we haven't done it yet. 363 if doChdir { 364 if err := unix.Chdir(config.Cwd); err != nil { 365 return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err) 366 } 367 } 368 // Make sure our final working directory is inside the container. 369 if err := verifyCwd(); err != nil { 370 return err 371 } 372 if err := system.ClearKeepCaps(); err != nil { 373 return fmt.Errorf("unable to clear keep caps: %w", err) 374 } 375 if err := w.ApplyCaps(); err != nil { 376 return fmt.Errorf("unable to apply caps: %w", err) 377 } 378 return nil 379 } 380 381 // setupConsole sets up the console from inside the container, and sends the 382 // master pty fd to the config.Pipe (using cmsg). This is done to ensure that 383 // consoles are scoped to a container properly (see runc#814 and the many 384 // issues related to that). This has to be run *after* we've pivoted to the new 385 // rootfs (and the users' configuration is entirely set up). 386 func setupConsole(socket *os.File, config *initConfig, mount bool) error { 387 defer socket.Close() 388 // At this point, /dev/ptmx points to something that we would expect. We 389 // used to change the owner of the slave path, but since the /dev/pts mount 390 // can have gid=X set (at the users' option). So touching the owner of the 391 // slave PTY is not necessary, as the kernel will handle that for us. Note 392 // however, that setupUser (specifically fixStdioPermissions) *will* change 393 // the UID owner of the console to be the user the process will run as (so 394 // they can actually control their console). 395 396 pty, slavePath, err := console.NewPty() 397 if err != nil { 398 return err 399 } 400 // After we return from here, we don't need the console anymore. 401 defer pty.Close() 402 403 if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 { 404 err = pty.Resize(console.WinSize{ 405 Height: config.ConsoleHeight, 406 Width: config.ConsoleWidth, 407 }) 408 if err != nil { 409 return err 410 } 411 } 412 413 // Mount the console inside our rootfs. 414 if mount { 415 if err := mountConsole(slavePath); err != nil { 416 return err 417 } 418 } 419 // While we can access console.master, using the API is a good idea. 420 if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil { 421 return err 422 } 423 runtime.KeepAlive(pty) 424 425 // Now, dup over all the things. 426 return dupStdio(slavePath) 427 } 428 429 // syncParentReady sends to the given pipe a JSON payload which indicates that 430 // the init is ready to Exec the child process. It then waits for the parent to 431 // indicate that it is cleared to Exec. 432 func syncParentReady(pipe *syncSocket) error { 433 // Tell parent. 434 if err := writeSync(pipe, procReady); err != nil { 435 return err 436 } 437 // Wait for parent to give the all-clear. 438 return readSync(pipe, procRun) 439 } 440 441 // syncParentHooks sends to the given pipe a JSON payload which indicates that 442 // the parent should execute pre-start hooks. It then waits for the parent to 443 // indicate that it is cleared to resume. 444 func syncParentHooks(pipe *syncSocket) error { 445 // Tell parent. 446 if err := writeSync(pipe, procHooks); err != nil { 447 return err 448 } 449 // Wait for parent to give the all-clear. 450 return readSync(pipe, procHooksDone) 451 } 452 453 // syncParentSeccomp sends the fd associated with the seccomp file descriptor 454 // to the parent, and wait for the parent to do pidfd_getfd() to grab a copy. 455 func syncParentSeccomp(pipe *syncSocket, seccompFd *os.File) error { 456 if seccompFd == nil { 457 return nil 458 } 459 defer seccompFd.Close() 460 461 // Tell parent to grab our fd. 462 // 463 // Notably, we do not use writeSyncFile here because a container might have 464 // an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest 465 // possible number of system calls here because all of those syscalls 466 // cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here 467 // before the parent gets the file descriptor would deadlock "runc init" if 468 // we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more 469 // details. 470 if err := writeSyncArg(pipe, procSeccomp, seccompFd.Fd()); err != nil { 471 return err 472 } 473 // Wait for parent to tell us they've grabbed the seccompfd. 474 return readSync(pipe, procSeccompDone) 475 } 476 477 // setupUser changes the groups, gid, and uid for the user inside the container 478 func setupUser(config *initConfig) error { 479 // Set up defaults. 480 defaultExecUser := user.ExecUser{ 481 Uid: 0, 482 Gid: 0, 483 Home: "/", 484 } 485 486 passwdPath, err := user.GetPasswdPath() 487 if err != nil { 488 return err 489 } 490 491 groupPath, err := user.GetGroupPath() 492 if err != nil { 493 return err 494 } 495 496 execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) 497 if err != nil { 498 return err 499 } 500 501 var addGroups []int 502 if len(config.AdditionalGroups) > 0 { 503 addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath) 504 if err != nil { 505 return err 506 } 507 } 508 509 if config.RootlessEUID { 510 // We cannot set any additional groups in a rootless container and thus 511 // we bail if the user asked us to do so. TODO: We currently can't do 512 // this check earlier, but if libcontainer.Process.User was typesafe 513 // this might work. 514 if len(addGroups) > 0 { 515 return errors.New("cannot set any additional groups in a rootless container") 516 } 517 } 518 519 // Before we change to the container's user make sure that the processes 520 // STDIO is correctly owned by the user that we are switching to. 521 if err := fixStdioPermissions(execUser); err != nil { 522 return err 523 } 524 525 // We don't need to use /proc/thread-self here because setgroups is a 526 // per-userns file and thus is global to all threads in a thread-group. 527 // This lets us avoid having to do runtime.LockOSThread. 528 setgroups, err := os.ReadFile("/proc/self/setgroups") 529 if err != nil && !os.IsNotExist(err) { 530 return err 531 } 532 533 // This isn't allowed in an unprivileged user namespace since Linux 3.19. 534 // There's nothing we can do about /etc/group entries, so we silently 535 // ignore setting groups here (since the user didn't explicitly ask us to 536 // set the group). 537 allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny" 538 539 if allowSupGroups { 540 suppGroups := append(execUser.Sgids, addGroups...) 541 if err := unix.Setgroups(suppGroups); err != nil { 542 return &os.SyscallError{Syscall: "setgroups", Err: err} 543 } 544 } 545 546 if err := unix.Setgid(execUser.Gid); err != nil { 547 if err == unix.EINVAL { 548 return fmt.Errorf("cannot setgid to unmapped gid %d in user namespace", execUser.Gid) 549 } 550 return err 551 } 552 if err := unix.Setuid(execUser.Uid); err != nil { 553 if err == unix.EINVAL { 554 return fmt.Errorf("cannot setuid to unmapped uid %d in user namespace", execUser.Uid) 555 } 556 return err 557 } 558 559 // if we didn't get HOME already, set it based on the user's HOME 560 if envHome := os.Getenv("HOME"); envHome == "" { 561 if err := os.Setenv("HOME", execUser.Home); err != nil { 562 return err 563 } 564 } 565 return nil 566 } 567 568 // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. 569 // The ownership needs to match because it is created outside of the container and needs to be 570 // localized. 571 func fixStdioPermissions(u *user.ExecUser) error { 572 var null unix.Stat_t 573 if err := unix.Stat("/dev/null", &null); err != nil { 574 return &os.PathError{Op: "stat", Path: "/dev/null", Err: err} 575 } 576 for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} { 577 var s unix.Stat_t 578 if err := unix.Fstat(int(file.Fd()), &s); err != nil { 579 return &os.PathError{Op: "fstat", Path: file.Name(), Err: err} 580 } 581 582 // Skip chown if uid is already the one we want or any of the STDIO descriptors 583 // were redirected to /dev/null. 584 if int(s.Uid) == u.Uid || s.Rdev == null.Rdev { 585 continue 586 } 587 588 // We only change the uid (as it is possible for the mount to 589 // prefer a different gid, and there's no reason for us to change it). 590 // The reason why we don't just leave the default uid=X mount setup is 591 // that users expect to be able to actually use their console. Without 592 // this code, you couldn't effectively run as a non-root user inside a 593 // container and also have a console set up. 594 if err := file.Chown(u.Uid, int(s.Gid)); err != nil { 595 // If we've hit an EINVAL then s.Gid isn't mapped in the user 596 // namespace. If we've hit an EPERM then the inode's current owner 597 // is not mapped in our user namespace (in particular, 598 // privileged_wrt_inode_uidgid() has failed). Read-only 599 // /dev can result in EROFS error. In any case, it's 600 // better for us to just not touch the stdio rather 601 // than bail at this point. 602 603 if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) { 604 continue 605 } 606 return err 607 } 608 } 609 return nil 610 } 611 612 // setupNetwork sets up and initializes any network interface inside the container. 613 func setupNetwork(config *initConfig) error { 614 for _, config := range config.Networks { 615 strategy, err := getStrategy(config.Type) 616 if err != nil { 617 return err 618 } 619 if err := strategy.initialize(config); err != nil { 620 return err 621 } 622 } 623 return nil 624 } 625 626 func setupRoute(config *configs.Config) error { 627 for _, config := range config.Routes { 628 _, dst, err := net.ParseCIDR(config.Destination) 629 if err != nil { 630 return err 631 } 632 src := net.ParseIP(config.Source) 633 if src == nil { 634 return fmt.Errorf("Invalid source for route: %s", config.Source) 635 } 636 gw := net.ParseIP(config.Gateway) 637 if gw == nil { 638 return fmt.Errorf("Invalid gateway for route: %s", config.Gateway) 639 } 640 l, err := netlink.LinkByName(config.InterfaceName) 641 if err != nil { 642 return err 643 } 644 route := &netlink.Route{ 645 Scope: netlink.SCOPE_UNIVERSE, 646 Dst: dst, 647 Src: src, 648 Gw: gw, 649 LinkIndex: l.Attrs().Index, 650 } 651 if err := netlink.RouteAdd(route); err != nil { 652 return err 653 } 654 } 655 return nil 656 } 657 658 func containsRlimit(limits []configs.Rlimit, resource int) bool { 659 for _, rlimit := range limits { 660 if rlimit.Type == resource { 661 return true 662 } 663 } 664 return false 665 } 666 667 func setupRlimits(limits []configs.Rlimit, pid int) error { 668 for _, rlimit := range limits { 669 if err := unix.Prlimit(pid, rlimit.Type, &unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}, nil); err != nil { 670 return fmt.Errorf("error setting rlimit type %v: %w", rlimit.Type, err) 671 } 672 } 673 return nil 674 } 675 676 func setupScheduler(config *configs.Config) error { 677 attr, err := configs.ToSchedAttr(config.Scheduler) 678 if err != nil { 679 return err 680 } 681 if err := unix.SchedSetAttr(0, attr, 0); err != nil { 682 if errors.Is(err, unix.EPERM) && config.Cgroups.CpusetCpus != "" { 683 return errors.New("process scheduler can't be used together with AllowedCPUs") 684 } 685 return fmt.Errorf("error setting scheduler: %w", err) 686 } 687 return nil 688 } 689 690 func setupPersonality(config *configs.Config) error { 691 return system.SetLinuxPersonality(config.Personality.Domain) 692 } 693 694 // signalAllProcesses freezes then iterates over all the processes inside the 695 // manager's cgroups sending the signal s to them. 696 func signalAllProcesses(m cgroups.Manager, s unix.Signal) error { 697 if !m.Exists() { 698 return ErrNotRunning 699 } 700 // Use cgroup.kill, if available. 701 if s == unix.SIGKILL { 702 if p := m.Path(""); p != "" { // Either cgroup v2 or hybrid. 703 err := cgroups.WriteFile(p, "cgroup.kill", "1") 704 if err == nil || !errors.Is(err, os.ErrNotExist) { 705 return err 706 } 707 // Fallback to old implementation. 708 } 709 } 710 711 if err := m.Freeze(configs.Frozen); err != nil { 712 logrus.Warn(err) 713 } 714 pids, err := m.GetAllPids() 715 if err != nil { 716 if err := m.Freeze(configs.Thawed); err != nil { 717 logrus.Warn(err) 718 } 719 return err 720 } 721 for _, pid := range pids { 722 err := unix.Kill(pid, s) 723 if err != nil && err != unix.ESRCH { 724 logrus.Warnf("kill %d: %v", pid, err) 725 } 726 } 727 if err := m.Freeze(configs.Thawed); err != nil { 728 logrus.Warn(err) 729 } 730 731 return nil 732 } 733 734 // setupPidfd opens a process file descriptor of init process, and sends the 735 // file descriptor back to the socket. 736 func setupPidfd(socket *os.File, initType string) error { 737 defer socket.Close() 738 739 pidFd, err := unix.PidfdOpen(os.Getpid(), 0) 740 if err != nil { 741 return fmt.Errorf("failed to pidfd_open: %w", err) 742 } 743 744 if err := utils.SendRawFd(socket, initType, uintptr(pidFd)); err != nil { 745 unix.Close(pidFd) 746 return fmt.Errorf("failed to send pidfd on socket: %w", err) 747 } 748 return unix.Close(pidFd) 749 }