github.com/hernad/nomad@v1.6.112/drivers/shared/executor/executor_linux.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

//go:build linux

package executor

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"strings"
	"syscall"
	"time"

	"github.com/armon/circbuf"
	"github.com/hernad/consul-template/signals"
	hclog "github.com/hashicorp/go-hclog"
	"github.com/hernad/nomad/client/allocdir"
	"github.com/hernad/nomad/client/lib/cgutil"
	"github.com/hernad/nomad/client/lib/resources"
	cstructs "github.com/hernad/nomad/client/structs"
	"github.com/hernad/nomad/drivers/shared/capabilities"
	"github.com/hernad/nomad/helper/stats"
	"github.com/hernad/nomad/helper/uuid"
	"github.com/hernad/nomad/nomad/structs"
	"github.com/hernad/nomad/plugins/drivers"
	"github.com/opencontainers/runc/libcontainer"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	lconfigs "github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/devices"
	ldevices "github.com/opencontainers/runc/libcontainer/devices"
	"github.com/opencontainers/runc/libcontainer/specconv"
	lutils "github.com/opencontainers/runc/libcontainer/utils"
	"github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
)

var (
	// ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v1.
	ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"}

	// ExecutorCgroupV2MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v2.
	// cgroup-v2 exposes different memory stats and no longer reports rss or max usage.
	ExecutorCgroupV2MeasuredMemStats = []string{"Cache", "Swap", "Usage"}

	// ExecutorCgroupMeasuredCpuStats is the list of CPU stats captured by the executor.
	ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"}
)

// LibcontainerExecutor implements an Executor with the runc/libcontainer api
type LibcontainerExecutor struct {
	id      string
	command *ExecCommand

	logger hclog.Logger

	totalCpuStats  *stats.CpuStats
	userCpuStats   *stats.CpuStats
	systemCpuStats *stats.CpuStats
	pidCollector   *pidCollector

	container      libcontainer.Container
	userProc       *libcontainer.Process
	userProcExited chan interface{}
	exitState      *ProcessState
}

func NewExecutorWithIsolation(logger hclog.Logger, cpuTotalTicks uint64) Executor {
	logger = logger.Named("isolated_executor")
	stats.SetCpuTotalTicks(cpuTotalTicks)

	return &LibcontainerExecutor{
		id:             strings.ReplaceAll(uuid.Generate(), "-", "_"),
		logger:         logger,
		totalCpuStats:  stats.NewCpuStats(),
		userCpuStats:   stats.NewCpuStats(),
		systemCpuStats: stats.NewCpuStats(),
		pidCollector:   newPidCollector(logger),
	}
}

// Launch creates a new container in libcontainer and starts a new process with it
func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) {
	l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " "))

	if command.Resources == nil {
		command.Resources = &drivers.Resources{
			NomadResources: &structs.AllocatedTaskResources{},
		}
	}

	l.command = command

	// create a new factory which will store the container state in the allocDir
	factory, err := libcontainer.New(
		path.Join(command.TaskDir, "../alloc/container"),
		// note that os.Args[0] typically refers to the executor shim, and the
		// first argument is ignored until
		// https://github.com/opencontainers/runc/pull/1888 is merged
		libcontainer.InitArgs(os.Args[0], "libcontainer-shim"),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to create factory: %v", err)
	}

	// A container groups processes under the same isolation enforcement
	containerCfg, err := newLibcontainerConfig(command)
	if err != nil {
		return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err)
	}

	container, err := factory.Create(l.id, containerCfg)
	if err != nil {
		return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err)
	}
	l.container = container

	// Look up the binary path and make it executable
	taskPath, hostPath, err := lookupTaskBin(command)
	if err != nil {
		return nil, err
	}
	if err := makeExecutable(hostPath); err != nil {
		return nil, err
	}

	combined := append([]string{taskPath}, command.Args...)
	stdout, err := command.Stdout()
	if err != nil {
		return nil, err
	}
	stderr, err := command.Stderr()
	if err != nil {
		return nil, err
	}

	l.logger.Debug("launching", "command", command.Cmd, "args", strings.Join(command.Args, " "))

	// the task process will be started by the container
	process := &libcontainer.Process{
		Args:   combined,
		Env:    command.Env,
		Stdout: stdout,
		Stderr: stderr,
		Init:   true,
	}

	if command.User != "" {
		process.User = command.User
	}
	l.userProc = process

	l.totalCpuStats = stats.NewCpuStats()
	l.userCpuStats = stats.NewCpuStats()
	l.systemCpuStats = stats.NewCpuStats()

	// Starts the task
	if err := container.Run(process); err != nil {
		container.Destroy()
		return nil, err
	}

	pid, err := process.Pid()
	if err != nil {
		container.Destroy()
		return nil, err
	}

	// start a goroutine to wait on the process to complete, so Wait calls can
	// be multiplexed
	l.userProcExited = make(chan interface{})
	go l.pidCollector.collectPids(l.userProcExited, l.getAllPids)
	go l.wait()

	return &ProcessState{
		Pid:      pid,
		ExitCode: -1,
		Time:     time.Now(),
	}, nil
}

func (l *LibcontainerExecutor) getAllPids() (resources.PIDs, error) {
	pids, err := l.container.Processes()
	if err != nil {
		return nil, err
	}
	m := make(resources.PIDs, 1)
	for _, pid := range pids {
		m[pid] = resources.NewPID(pid)
	}
	return m, nil
}

// Wait waits until a process has exited and returns its exit code and errors
func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) {
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case <-l.userProcExited:
		return l.exitState, nil
	}
}

func (l *LibcontainerExecutor) wait() {
	defer close(l.userProcExited)

	ps, err := l.userProc.Wait()
	if err != nil {
		// If the process has exited before we called wait, an error is
		// returned and the process state is embedded in the error
		if exitErr, ok := err.(*exec.ExitError); ok {
			ps = exitErr.ProcessState
		} else {
			l.logger.Error("failed to call wait on user process", "error", err)
			l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()}
			return
		}
	}

	l.command.Close()

	exitCode := 1
	var signal int
	if status, ok := ps.Sys().(syscall.WaitStatus); ok {
		exitCode = status.ExitStatus()
		if status.Signaled() {
			const exitSignalBase = 128
			signal = int(status.Signal())
			exitCode = exitSignalBase + signal
		}
	}

	l.exitState = &ProcessState{
		Pid:      ps.Pid(),
		ExitCode: exitCode,
		Signal:   signal,
		Time:     time.Now(),
	}
}

// Shutdown stops all processes started and cleans up any resources
// created (such as mountpoints, devices, etc).
func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error {
	if l.container == nil {
		return nil
	}

	status, err := l.container.Status()
	if err != nil {
		return err
	}

	defer l.container.Destroy()

	if status == libcontainer.Stopped {
		return nil
	}

	if grace > 0 {
		if signal == "" {
			signal = "SIGINT"
		}

		sig, ok := signals.SignalLookup[signal]
		if !ok {
			return fmt.Errorf("error unknown signal given for shutdown: %s", signal)
		}

		// Signal initial container processes only during graceful
		// shutdown; hence `false` arg.
		err = l.container.Signal(sig, false)
		if err != nil {
			return err
		}

		select {
		case <-l.userProcExited:
			return nil
		case <-time.After(grace):
			// Force kill all container processes after grace period,
			// hence `true` argument.
			if err := l.container.Signal(os.Kill, true); err != nil {
				return err
			}
		}
	} else {
		err := l.container.Signal(os.Kill, true)
		if err != nil {
			return err
		}
	}

	select {
	case <-l.userProcExited:
		return nil
	case <-time.After(time.Second * 15):
		return fmt.Errorf("process failed to exit after 15 seconds")
	}
}

// UpdateResources updates the resource isolation with new values to be enforced
func (l *LibcontainerExecutor) UpdateResources(resources *drivers.Resources) error {
	return nil
}

// Version returns the api version of the executor
func (l *LibcontainerExecutor) Version() (*ExecutorVersion, error) {
	return &ExecutorVersion{Version: ExecutorVersionLatest}, nil
}

// Stats returns the resource statistics for processes managed by the executor
func (l *LibcontainerExecutor) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
	ch := make(chan *cstructs.TaskResourceUsage)
	go l.handleStats(ch, ctx, interval)
	return ch, nil
}

func (l *LibcontainerExecutor) handleStats(ch chan *cstructs.TaskResourceUsage, ctx context.Context, interval time.Duration) {
	defer close(ch)
	timer := time.NewTimer(0)

	measuredMemStats := ExecutorCgroupV1MeasuredMemStats
	if cgroups.IsCgroup2UnifiedMode() {
		measuredMemStats = ExecutorCgroupV2MeasuredMemStats
	}

	for {
		select {
		case <-ctx.Done():
			return

		case <-timer.C:
			timer.Reset(interval)
		}

		lstats, err := l.container.Stats()
		if err != nil {
			l.logger.Warn("error collecting stats", "error", err)
			return
		}

		pidStats, err := l.pidCollector.pidStats()
		if err != nil {
			l.logger.Warn("error collecting stats", "error", err)
			return
		}

		ts := time.Now()
		stats := lstats.CgroupStats

		// Memory Related Stats
		swap := stats.MemoryStats.SwapUsage
		maxUsage := stats.MemoryStats.Usage.MaxUsage
		rss := stats.MemoryStats.Stats["rss"]
		cache := stats.MemoryStats.Stats["cache"]
		mapped_file := stats.MemoryStats.Stats["mapped_file"]
		ms := &cstructs.MemoryStats{
			RSS:            rss,
			Cache:          cache,
			Swap:           swap.Usage,
			MappedFile:     mapped_file,
			Usage:          stats.MemoryStats.Usage.Usage,
			MaxUsage:       maxUsage,
			KernelUsage:    stats.MemoryStats.KernelUsage.Usage,
			KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage,
			Measured:       measuredMemStats,
		}

		// CPU Related Stats
		totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage)
		userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode)
		kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode)

		totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage)
		cs := &cstructs.CpuStats{
			SystemMode:       l.systemCpuStats.Percent(kernelModeTime),
			UserMode:         l.userCpuStats.Percent(userModeTime),
			Percent:          totalPercent,
			ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods,
			ThrottledTime:    stats.CpuStats.ThrottlingData.ThrottledTime,
			TotalTicks:       l.systemCpuStats.TicksConsumed(totalPercent),
			Measured:         ExecutorCgroupMeasuredCpuStats,
		}
		taskResUsage := cstructs.TaskResourceUsage{
			ResourceUsage: &cstructs.ResourceUsage{
				MemoryStats: ms,
				CpuStats:    cs,
			},
			Timestamp: ts.UTC().UnixNano(),
			Pids:      pidStats,
		}

		select {
		case <-ctx.Done():
			return
		case ch <- &taskResUsage:
		}
	}
}

// Signal sends a signal to the process managed by the executor
func (l *LibcontainerExecutor) Signal(s os.Signal) error {
	return l.userProc.Signal(s)
}

// Exec starts an additional process inside the container
func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) {
	combined := append([]string{cmd}, args...)
	// Capture output
	buf, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize))

	process := &libcontainer.Process{
		Args:   combined,
		Env:    l.command.Env,
		Stdout: buf,
		Stderr: buf,
	}

	err := l.container.Run(process)
	if err != nil {
		return nil, 0, err
	}

	waitCh := make(chan *waitResult)
	defer close(waitCh)
	go l.handleExecWait(waitCh, process)

	select {
	case result := <-waitCh:
		ps := result.ps
		if result.err != nil {
			if exitErr, ok := result.err.(*exec.ExitError); ok {
				ps = exitErr.ProcessState
			} else {
				return nil, 0, result.err
			}
		}
		var exitCode int
		if status, ok := ps.Sys().(syscall.WaitStatus); ok {
			exitCode = status.ExitStatus()
		}
		return buf.Bytes(), exitCode, nil

	case <-time.After(time.Until(deadline)):
		process.Signal(os.Kill)
		return nil, 0, context.DeadlineExceeded
	}
}

func (l *LibcontainerExecutor) newTerminalSocket() (pty func() (*os.File, error), tty *os.File, err error) {
	parent, child, err := lutils.NewSockPair("socket")
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create terminal: %v", err)
	}

	return func() (*os.File, error) { return lutils.RecvFd(parent) }, child, err
}

func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool,
	stream drivers.ExecTaskStream) error {

	// the task process will be started by the container
	process := &libcontainer.Process{
		Args: cmd,
		Env:  l.userProc.Env,
		User: l.userProc.User,
		Init: false,
		Cwd:  "/",
	}

	execHelper := &execHelper{
		logger: l.logger,

		newTerminal: l.newTerminalSocket,
		setTTY: func(tty *os.File) error {
			process.ConsoleSocket = tty
			return nil
		},
		setIO: func(stdin io.Reader, stdout, stderr io.Writer) error {
			process.Stdin = stdin
			process.Stdout = stdout
			process.Stderr = stderr
			return nil
		},

		processStart: func() error { return l.container.Run(process) },
		processWait: func() (*os.ProcessState, error) {
			return process.Wait()
		},
	}

	return execHelper.run(ctx, tty, stream)
}
type waitResult struct {
	ps  *os.ProcessState
	err error
}

func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) {
	ps, err := process.Wait()
	ch <- &waitResult{ps, err}
}

func configureCapabilities(cfg *lconfigs.Config, command *ExecCommand) {
	switch command.User {
	case "root":
		// when running as root, use the legacy set of system capabilities, so
		// that we do not break existing nomad clusters using this "feature"
		legacyCaps := capabilities.LegacySupported().Slice(true)
		cfg.Capabilities = &lconfigs.Capabilities{
			Bounding:    legacyCaps,
			Permitted:   legacyCaps,
			Effective:   legacyCaps,
			Ambient:     nil,
			Inheritable: nil,
		}
	default:
		// otherwise apply the plugin + task capability configuration
		//
		// The capabilities must be set in the Ambient set as libcontainer
		// performs `execve` as an unprivileged user. Ambient also requires
		// that capabilities are Permitted and Inheritable. Setting Effective
		// is unnecessary, because we only need the capabilities to become
		// effective _after_ execve, not before.
		cfg.Capabilities = &lconfigs.Capabilities{
			Bounding:    command.Capabilities,
			Permitted:   command.Capabilities,
			Inheritable: command.Capabilities,
			Ambient:     command.Capabilities,
		}
	}
}

func configureNamespaces(pidMode, ipcMode string) lconfigs.Namespaces {
	namespaces := lconfigs.Namespaces{{Type: lconfigs.NEWNS}}
	if pidMode == IsolationModePrivate {
		namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWPID})
	}
	if ipcMode == IsolationModePrivate {
		namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWIPC})
	}
	return namespaces
}

// configureIsolation prepares the isolation primitives of the container.
// The process runs in a container configured with the following:
//
// * the task directory as the chroot
// * a dedicated mount namespace, but shares the PID, User, domain, and network namespaces with the host
// * a small subset of devices (e.g. stdout/stderr/stdin, tty, shm, pts); defaults to the same set of devices as Docker
// * some special filesystems: `/proc`, `/sys`. Care is taken to avoid exec escaping or setting malicious values through them.
func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error {
	defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV

	// set the new root directory for the container
	cfg.Rootfs = command.TaskDir

	// disable pivot_root if set in the driver's configuration
	cfg.NoPivotRoot = command.NoPivotRoot

	// set up default namespaces as configured
	cfg.Namespaces = configureNamespaces(command.ModePID, command.ModeIPC)

	if command.NetworkIsolation != nil {
		cfg.Namespaces = append(cfg.Namespaces, lconfigs.Namespace{
			Type: lconfigs.NEWNET,
			Path: command.NetworkIsolation.Path,
		})
	}

	// paths to mask using a bind mount to /dev/null to prevent reading
	cfg.MaskPaths = []string{
		"/proc/kcore",
		"/sys/firmware",
	}

	// paths that should be remounted as readonly inside the container
	cfg.ReadonlyPaths = []string{
		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
	}

	cfg.Devices = specconv.AllowedDevices
	if len(command.Devices) > 0 {
		devs, err := cmdDevices(command.Devices)
		if err != nil {
			return err
		}
		cfg.Devices = append(cfg.Devices, devs...)
	}

	cfg.Mounts = []*lconfigs.Mount{
		{
			Source:      "tmpfs",
			Destination: "/dev",
			Device:      "tmpfs",
			Flags:       syscall.MS_NOSUID | syscall.MS_STRICTATIME,
			Data:        "mode=755",
		},
		{
			Source:      "proc",
			Destination: "/proc",
			Device:      "proc",
			Flags:       defaultMountFlags,
		},
		{
			Source:      "devpts",
			Destination: "/dev/pts",
			Device:      "devpts",
			Flags:       syscall.MS_NOSUID | syscall.MS_NOEXEC,
			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
		},
		{
			Device:      "tmpfs",
			Source:      "shm",
			Destination: "/dev/shm",
			Data:        "mode=1777,size=65536k",
			Flags:       defaultMountFlags,
		},
		{
			Source:      "mqueue",
			Destination: "/dev/mqueue",
			Device:      "mqueue",
			Flags:       defaultMountFlags,
		},
		{
			Source:      "sysfs",
			Destination: "/sys",
			Device:      "sysfs",
			Flags:       defaultMountFlags | syscall.MS_RDONLY,
		},
	}

	if len(command.Mounts) > 0 {
		cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...)
	}

	return nil
}

func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error {
	// If resources are not limited then manually create cgroups needed
	if !command.ResourceLimits {
		return cgutil.ConfigureBasicCgroups(cfg)
	}

	// set cgroups path
	if cgutil.UseV2 {
		// in v2, the cgroup must have been created by the client already,
		// which breaks a lot of existing tests that run drivers without a client
		if command.Resources == nil || command.Resources.LinuxResources == nil || command.Resources.LinuxResources.CpusetCgroupPath == "" {
			return errors.New("cgroup path must be set")
		}
		parent, cgroup := cgutil.SplitPath(command.Resources.LinuxResources.CpusetCgroupPath)
		cfg.Cgroups.Path = filepath.Join("/", parent, cgroup)
	} else {
		// in v1, the cgroup is created using /nomad, which is a bug because it
		// does not respect the cgroup_parent client configuration
		// (but makes testing easy)
		id := uuid.Generate()
		cfg.Cgroups.Path = filepath.Join("/", cgutil.DefaultCgroupV1Parent, id)
	}

	if command.Resources == nil || command.Resources.NomadResources == nil {
		return nil
	}

	// Total amount of memory the task is allowed to consume
	res := command.Resources.NomadResources
	memHard, memSoft := res.Memory.MemoryMaxMB, res.Memory.MemoryMB
	if memHard <= 0 {
		memHard = res.Memory.MemoryMB
		memSoft = 0
	}

	if memHard > 0 {
		cfg.Cgroups.Resources.Memory = memHard * 1024 * 1024
		cfg.Cgroups.Resources.MemoryReservation = memSoft * 1024 * 1024

		// Disable swap if possible, to avoid issues on the machine
		cfg.Cgroups.Resources.MemorySwappiness = cgutil.MaybeDisableMemorySwappiness()
	}

	cpuShares := res.Cpu.CpuShares
	if cpuShares < 2 {
		return fmt.Errorf("resources.Cpu.CpuShares must be equal to or greater than 2: %v", cpuShares)
	}

	// Set the relative CPU shares for this cgroup, and convert for cgroup-v2
	cfg.Cgroups.Resources.CpuShares = uint64(cpuShares)
	cfg.Cgroups.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(uint64(cpuShares))

	if command.Resources.LinuxResources != nil && command.Resources.LinuxResources.CpusetCgroupPath != "" {
		cfg.Hooks = lconfigs.Hooks{
			lconfigs.CreateRuntime: lconfigs.HookList{
				newSetCPUSetCgroupHook(command.Resources.LinuxResources.CpusetCgroupPath),
			},
		}
	}

	return nil
}

func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) {
	cfg := &lconfigs.Config{
		Cgroups: &lconfigs.Cgroup{
			Resources: &lconfigs.Resources{
				MemorySwappiness: nil,
			},
		},
		Version: "1.0.0",
	}

	for _, device := range specconv.AllowedDevices {
		cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.Rule)
	}

	configureCapabilities(cfg, command)

	// children should not inherit Nomad agent oom_score_adj value
	oomScoreAdj := 0
	cfg.OomScoreAdj = &oomScoreAdj

	if err := configureIsolation(cfg, command); err != nil {
		return nil, err
	}

	if err := configureCgroups(cfg, command); err != nil {
		return nil, err
	}

	return cfg, nil
}

// cmdDevices converts a list of driver.DeviceConfigs into executor.Devices.
func cmdDevices(driverDevices []*drivers.DeviceConfig) ([]*devices.Device, error) {
	if len(driverDevices) == 0 {
		return nil, nil
	}

	r := make([]*devices.Device, len(driverDevices))

	for i, d := range driverDevices {
		ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions)
		if err != nil {
			return nil, fmt.Errorf("failed to make device out for %s: %v", d.HostPath, err)
		}
		ed.Path = d.TaskPath
		r[i] = ed
	}

	return r, nil
}

var userMountToUnixMount = map[string]int{
	// Empty string maps to `rprivate` for backwards compatibility in restored
	// older tasks, where mount propagation will not be present.
	"": unix.MS_PRIVATE | unix.MS_REC, // rprivate
	structs.VolumeMountPropagationPrivate:       unix.MS_PRIVATE | unix.MS_REC, // rprivate
	structs.VolumeMountPropagationHostToTask:    unix.MS_SLAVE | unix.MS_REC,   // rslave
	structs.VolumeMountPropagationBidirectional: unix.MS_SHARED | unix.MS_REC,  // rshared
}

// cmdMounts converts a list of driver.MountConfigs into executor.Mounts.
func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount {
	if len(mounts) == 0 {
		return nil
	}

	r := make([]*lconfigs.Mount, len(mounts))

	for i, m := range mounts {
		flags := unix.MS_BIND
		if m.Readonly {
			flags |= unix.MS_RDONLY
		}

		r[i] = &lconfigs.Mount{
			Source:           m.HostPath,
			Destination:      m.TaskPath,
			Device:           "bind",
			Flags:            flags,
			PropagationFlags: []int{userMountToUnixMount[m.PropagationMode]},
		}
	}

	return r
}

// lookupTaskBin finds the file `bin`, searching in order:
// - taskDir/local
// - taskDir
// - each mount, in the order listed in the jobspec
// - a PATH-like search of usr/local/bin/, usr/bin/, and bin/ inside the taskDir
//
// Returns an absolute path inside the container that will get passed as arg[0]
// to the launched process, and the absolute path to that binary as seen by the
// host (these will be identical for binaries that don't come from mounts).
//
// See also executor.lookupBin for a version used by non-isolated drivers.
func lookupTaskBin(command *ExecCommand) (string, string, error) {
	taskDir := command.TaskDir
	bin := command.Cmd

	// Check in the local directory
	localDir := filepath.Join(taskDir, allocdir.TaskLocal)
	taskPath, hostPath, err := getPathInTaskDir(command.TaskDir, localDir, bin)
	if err == nil {
		return taskPath, hostPath, nil
	}

	// Check at the root of the task's directory
	taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, command.TaskDir, bin)
	if err == nil {
		return taskPath, hostPath, nil
	}

	// Check in our mounts
	for _, mount := range command.Mounts {
		taskPath, hostPath, err = getPathInMount(mount.HostPath, mount.TaskPath, bin)
		if err == nil {
			return taskPath, hostPath, nil
		}
	}

	// If there's a / in the binary's path, we can't fall back to a PATH search
	if strings.Contains(bin, "/") {
		return "", "", fmt.Errorf("file %s not found under path %s", bin, taskDir)
	}

	// look for a file using a PATH-style lookup inside the directory
	// root. Similar to the stdlib's exec.LookPath except:
	// - uses a restricted lookup PATH rather than the agent process's PATH env var.
	// - does not require that the file is already executable (this will be
	//   ensured by the caller)
	// - does not prevent using a relative path as added to exec.LookPath in
	//   go1.19 (this gets fixed up in the caller)

	// This is a fake PATH so that we're not using the agent's PATH
	restrictedPaths := []string{"/usr/local/bin", "/usr/bin", "/bin"}

	for _, dir := range restrictedPaths {
		pathDir := filepath.Join(command.TaskDir, dir)
		taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, pathDir, bin)
		if err == nil {
			return taskPath, hostPath, nil
		}
	}

	return "", "", fmt.Errorf("file %s not found under path", bin)
}

// getPathInTaskDir searches for the binary in the task directory and nested
// search directory. It returns the absolute path rooted inside the container
// and the absolute path on the host.
func getPathInTaskDir(taskDir, searchDir, bin string) (string, string, error) {

	hostPath := filepath.Join(searchDir, bin)
	err := filepathIsRegular(hostPath)
	if err != nil {
		return "", "", err
	}

	// Find the path relative to the task directory
	rel, err := filepath.Rel(taskDir, hostPath)
	if rel == "" || err != nil {
		return "", "", fmt.Errorf(
			"failed to determine relative path base=%q target=%q: %v",
			taskDir, hostPath, err)
	}

	// Turn the relative-to-taskdir path into a re-rooted absolute path to avoid
	// libcontainer trying to resolve the binary using $PATH.
	// Do *not* use filepath.Join as it will translate ".."s returned by
	// filepath.Rel. Prepending "/" will cause the path to be rooted in the
	// chroot, which is the desired behavior.
	return filepath.Clean("/" + rel), hostPath, nil
}

// getPathInMount searches for the binary in the mount's host path, constructing
// the path considering that the bin path is rooted in the mount's task path and
// not its host path. It returns the absolute path rooted inside the container
// and the absolute path on the host.
func getPathInMount(mountHostPath, mountTaskPath, bin string) (string, string, error) {

	// Find the path relative to the mount point in the task so that we can
	// trim off any shared prefix when we search on the host path
	mountRel, err := filepath.Rel(mountTaskPath, bin)
	if mountRel == "" || err != nil {
		return "", "", fmt.Errorf("path was not relative to the mount task path")
	}

	hostPath := filepath.Join(mountHostPath, mountRel)

	err = filepathIsRegular(hostPath)
	if err != nil {
		return "", "", err
	}

	// Turn the relative-to-taskdir path into a re-rooted absolute path to avoid
	// libcontainer trying to resolve the binary using $PATH.
	// Do *not* use filepath.Join as it will translate ".."s returned by
	// filepath.Rel. Prepending "/" will cause the path to be rooted in the
	// chroot, which is the desired behavior.
	return filepath.Clean("/" + bin), hostPath, nil
}

// filepathIsRegular verifies that a filepath is a regular file (i.e. not a
// directory, socket, device, etc.)
func filepathIsRegular(path string) error {
	f, err := os.Stat(path)
	if err != nil {
		return err
	}
	if !f.Mode().Type().IsRegular() {
		return fmt.Errorf("path was not a regular file")
	}
	return nil
}

func newSetCPUSetCgroupHook(cgroupPath string) lconfigs.Hook {
	return lconfigs.NewFunctionHook(func(state *specs.State) error {
		return cgroups.WriteCgroupProc(cgroupPath, state.Pid)
	})
}