github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/drivers/shared/executor/executor_linux.go (about) 1 //go:build linux 2 3 package executor 4 5 import ( 6 "context" 7 "errors" 8 "fmt" 9 "io" 10 "os" 11 "os/exec" 12 "path" 13 "path/filepath" 14 "strings" 15 "syscall" 16 "time" 17 18 "github.com/armon/circbuf" 19 "github.com/hashicorp/consul-template/signals" 20 hclog "github.com/hashicorp/go-hclog" 21 "github.com/hashicorp/nomad/client/allocdir" 22 "github.com/hashicorp/nomad/client/lib/cgutil" 23 "github.com/hashicorp/nomad/client/lib/resources" 24 "github.com/hashicorp/nomad/client/stats" 25 cstructs "github.com/hashicorp/nomad/client/structs" 26 "github.com/hashicorp/nomad/drivers/shared/capabilities" 27 shelpers "github.com/hashicorp/nomad/helper/stats" 28 "github.com/hashicorp/nomad/helper/uuid" 29 "github.com/hashicorp/nomad/nomad/structs" 30 "github.com/hashicorp/nomad/plugins/drivers" 31 "github.com/opencontainers/runc/libcontainer" 32 "github.com/opencontainers/runc/libcontainer/cgroups" 33 lconfigs "github.com/opencontainers/runc/libcontainer/configs" 34 "github.com/opencontainers/runc/libcontainer/devices" 35 ldevices "github.com/opencontainers/runc/libcontainer/devices" 36 "github.com/opencontainers/runc/libcontainer/specconv" 37 lutils "github.com/opencontainers/runc/libcontainer/utils" 38 "github.com/opencontainers/runtime-spec/specs-go" 39 "golang.org/x/sys/unix" 40 ) 41 42 var ( 43 // ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v1 44 ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"} 45 46 // ExecutorCgroupV2MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v2. cgroup-v2 exposes different memory stats and no longer reports rss or max usage. 47 ExecutorCgroupV2MeasuredMemStats = []string{"Cache", "Swap", "Usage"} 48 49 // ExecutorCgroupMeasuredCpuStats is the list of CPU stats captures by the executor 50 ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"} 51 ) 52 53 // LibcontainerExecutor implements an Executor with the runc/libcontainer api 54 type LibcontainerExecutor struct { 55 id string 56 command *ExecCommand 57 58 logger hclog.Logger 59 60 totalCpuStats *stats.CpuStats 61 userCpuStats *stats.CpuStats 62 systemCpuStats *stats.CpuStats 63 pidCollector *pidCollector 64 65 container libcontainer.Container 66 userProc *libcontainer.Process 67 userProcExited chan interface{} 68 exitState *ProcessState 69 } 70 71 func NewExecutorWithIsolation(logger hclog.Logger) Executor { 72 logger = logger.Named("isolated_executor") 73 if err := shelpers.Init(); err != nil { 74 logger.Error("unable to initialize stats", "error", err) 75 } 76 return &LibcontainerExecutor{ 77 id: strings.ReplaceAll(uuid.Generate(), "-", "_"), 78 logger: logger, 79 totalCpuStats: stats.NewCpuStats(), 80 userCpuStats: stats.NewCpuStats(), 81 systemCpuStats: stats.NewCpuStats(), 82 pidCollector: newPidCollector(logger), 83 } 84 } 85 86 // Launch creates a new container in libcontainer and starts a new process with it 87 func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) { 88 l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " ")) 89 90 if command.Resources == nil { 91 command.Resources = &drivers.Resources{ 92 NomadResources: &structs.AllocatedTaskResources{}, 93 } 94 } 95 96 l.command = command 97 98 // create a new factory which will store the container state in the allocDir 99 factory, err := libcontainer.New( 100 path.Join(command.TaskDir, "../alloc/container"), 101 // note that os.Args[0] refers to the executor shim typically 102 // and first args arguments is ignored now due 103 // until https://github.com/opencontainers/runc/pull/1888 is merged 104 libcontainer.InitArgs(os.Args[0], "libcontainer-shim"), 105 ) 106 if err != nil { 107 return nil, fmt.Errorf("failed to create factory: %v", err) 108 } 109 110 // A container groups processes under the same isolation enforcement 111 containerCfg, err := newLibcontainerConfig(command) 112 if err != nil { 113 return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err) 114 } 115 116 container, err := factory.Create(l.id, containerCfg) 117 if err != nil { 118 return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err) 119 } 120 l.container = container 121 122 // Look up the binary path and make it executable 123 taskPath, hostPath, err := lookupTaskBin(command) 124 if err != nil { 125 return nil, err 126 } 127 if err := makeExecutable(hostPath); err != nil { 128 return nil, err 129 } 130 131 combined := append([]string{taskPath}, command.Args...) 132 stdout, err := command.Stdout() 133 if err != nil { 134 return nil, err 135 } 136 stderr, err := command.Stderr() 137 if err != nil { 138 return nil, err 139 } 140 141 l.logger.Debug("launching", "command", command.Cmd, "args", strings.Join(command.Args, " ")) 142 143 // the task process will be started by the container 144 process := &libcontainer.Process{ 145 Args: combined, 146 Env: command.Env, 147 Stdout: stdout, 148 Stderr: stderr, 149 Init: true, 150 } 151 152 if command.User != "" { 153 process.User = command.User 154 } 155 l.userProc = process 156 157 l.totalCpuStats = stats.NewCpuStats() 158 l.userCpuStats = stats.NewCpuStats() 159 l.systemCpuStats = stats.NewCpuStats() 160 161 // Starts the task 162 if err := container.Run(process); err != nil { 163 container.Destroy() 164 return nil, err 165 } 166 167 pid, err := process.Pid() 168 if err != nil { 169 container.Destroy() 170 return nil, err 171 } 172 173 // start a goroutine to wait on the process to complete, so Wait calls can 174 // be multiplexed 175 l.userProcExited = make(chan interface{}) 176 go l.pidCollector.collectPids(l.userProcExited, l.getAllPids) 177 go l.wait() 178 179 return &ProcessState{ 180 Pid: pid, 181 ExitCode: -1, 182 Time: time.Now(), 183 }, nil 184 } 185 186 func (l *LibcontainerExecutor) getAllPids() (resources.PIDs, error) { 187 pids, err := l.container.Processes() 188 if err != nil { 189 return nil, err 190 } 191 m := make(resources.PIDs, 1) 192 for _, pid := range pids { 193 m[pid] = resources.NewPID(pid) 194 } 195 return m, nil 196 } 197 198 // Wait waits until a process has exited and returns it's exitcode and errors 199 func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) { 200 select { 201 case <-ctx.Done(): 202 return nil, ctx.Err() 203 case <-l.userProcExited: 204 return l.exitState, nil 205 } 206 } 207 208 func (l *LibcontainerExecutor) wait() { 209 defer close(l.userProcExited) 210 211 ps, err := l.userProc.Wait() 212 if err != nil { 213 // If the process has exited before we called wait an error is returned 214 // the process state is embedded in the error 215 if exitErr, ok := err.(*exec.ExitError); ok { 216 ps = exitErr.ProcessState 217 } else { 218 l.logger.Error("failed to call wait on user process", "error", err) 219 l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()} 220 return 221 } 222 } 223 224 l.command.Close() 225 226 exitCode := 1 227 var signal int 228 if status, ok := ps.Sys().(syscall.WaitStatus); ok { 229 exitCode = status.ExitStatus() 230 if status.Signaled() { 231 const exitSignalBase = 128 232 signal = int(status.Signal()) 233 exitCode = exitSignalBase + signal 234 } 235 } 236 237 l.exitState = &ProcessState{ 238 Pid: ps.Pid(), 239 ExitCode: exitCode, 240 Signal: signal, 241 Time: time.Now(), 242 } 243 } 244 245 // Shutdown stops all processes started and cleans up any resources 246 // created (such as mountpoints, devices, etc). 247 func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error { 248 if l.container == nil { 249 return nil 250 } 251 252 status, err := l.container.Status() 253 if err != nil { 254 return err 255 } 256 257 defer l.container.Destroy() 258 259 if status == libcontainer.Stopped { 260 return nil 261 } 262 263 if grace > 0 { 264 if signal == "" { 265 signal = "SIGINT" 266 } 267 268 sig, ok := signals.SignalLookup[signal] 269 if !ok { 270 return fmt.Errorf("error unknown signal given for shutdown: %s", signal) 271 } 272 273 // Signal initial container processes only during graceful 274 // shutdown; hence `false` arg. 275 err = l.container.Signal(sig, false) 276 if err != nil { 277 return err 278 } 279 280 select { 281 case <-l.userProcExited: 282 return nil 283 case <-time.After(grace): 284 // Force kill all container processes after grace period, 285 // hence `true` argument. 286 if err := l.container.Signal(os.Kill, true); err != nil { 287 return err 288 } 289 } 290 } else { 291 err := l.container.Signal(os.Kill, true) 292 if err != nil { 293 return err 294 } 295 } 296 297 select { 298 case <-l.userProcExited: 299 return nil 300 case <-time.After(time.Second * 15): 301 return fmt.Errorf("process failed to exit after 15 seconds") 302 } 303 } 304 305 // UpdateResources updates the resource isolation with new values to be enforced 306 func (l *LibcontainerExecutor) UpdateResources(resources *drivers.Resources) error { 307 return nil 308 } 309 310 // Version returns the api version of the executor 311 func (l *LibcontainerExecutor) Version() (*ExecutorVersion, error) { 312 return &ExecutorVersion{Version: ExecutorVersionLatest}, nil 313 } 314 315 // Stats returns the resource statistics for processes managed by the executor 316 func (l *LibcontainerExecutor) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) { 317 ch := make(chan *cstructs.TaskResourceUsage) 318 go l.handleStats(ch, ctx, interval) 319 return ch, nil 320 321 } 322 323 func (l *LibcontainerExecutor) handleStats(ch chan *cstructs.TaskResourceUsage, ctx context.Context, interval time.Duration) { 324 defer close(ch) 325 timer := time.NewTimer(0) 326 327 measuredMemStats := ExecutorCgroupV1MeasuredMemStats 328 if cgroups.IsCgroup2UnifiedMode() { 329 measuredMemStats = ExecutorCgroupV2MeasuredMemStats 330 } 331 332 for { 333 select { 334 case <-ctx.Done(): 335 return 336 337 case <-timer.C: 338 timer.Reset(interval) 339 } 340 341 lstats, err := l.container.Stats() 342 if err != nil { 343 l.logger.Warn("error collecting stats", "error", err) 344 return 345 } 346 347 pidStats, err := l.pidCollector.pidStats() 348 if err != nil { 349 l.logger.Warn("error collecting stats", "error", err) 350 return 351 } 352 353 ts := time.Now() 354 stats := lstats.CgroupStats 355 356 // Memory Related Stats 357 swap := stats.MemoryStats.SwapUsage 358 maxUsage := stats.MemoryStats.Usage.MaxUsage 359 rss := stats.MemoryStats.Stats["rss"] 360 cache := stats.MemoryStats.Stats["cache"] 361 mapped_file := stats.MemoryStats.Stats["mapped_file"] 362 ms := &cstructs.MemoryStats{ 363 RSS: rss, 364 Cache: cache, 365 Swap: swap.Usage, 366 MappedFile: mapped_file, 367 Usage: stats.MemoryStats.Usage.Usage, 368 MaxUsage: maxUsage, 369 KernelUsage: stats.MemoryStats.KernelUsage.Usage, 370 KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage, 371 Measured: measuredMemStats, 372 } 373 374 // CPU Related Stats 375 totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage) 376 userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode) 377 kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode) 378 379 totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage) 380 cs := &cstructs.CpuStats{ 381 SystemMode: l.systemCpuStats.Percent(kernelModeTime), 382 UserMode: l.userCpuStats.Percent(userModeTime), 383 Percent: totalPercent, 384 ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods, 385 ThrottledTime: stats.CpuStats.ThrottlingData.ThrottledTime, 386 TotalTicks: l.systemCpuStats.TicksConsumed(totalPercent), 387 Measured: ExecutorCgroupMeasuredCpuStats, 388 } 389 taskResUsage := cstructs.TaskResourceUsage{ 390 ResourceUsage: &cstructs.ResourceUsage{ 391 MemoryStats: ms, 392 CpuStats: cs, 393 }, 394 Timestamp: ts.UTC().UnixNano(), 395 Pids: pidStats, 396 } 397 398 select { 399 case <-ctx.Done(): 400 return 401 case ch <- &taskResUsage: 402 } 403 404 } 405 } 406 407 // Signal sends a signal to the process managed by the executor 408 func (l *LibcontainerExecutor) Signal(s os.Signal) error { 409 return l.userProc.Signal(s) 410 } 411 412 // Exec starts an additional process inside the container 413 func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) { 414 combined := append([]string{cmd}, args...) 415 // Capture output 416 buf, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize)) 417 418 process := &libcontainer.Process{ 419 Args: combined, 420 Env: l.command.Env, 421 Stdout: buf, 422 Stderr: buf, 423 } 424 425 err := l.container.Run(process) 426 if err != nil { 427 return nil, 0, err 428 } 429 430 waitCh := make(chan *waitResult) 431 defer close(waitCh) 432 go l.handleExecWait(waitCh, process) 433 434 select { 435 case result := <-waitCh: 436 ps := result.ps 437 if result.err != nil { 438 if exitErr, ok := result.err.(*exec.ExitError); ok { 439 ps = exitErr.ProcessState 440 } else { 441 return nil, 0, result.err 442 } 443 } 444 var exitCode int 445 if status, ok := ps.Sys().(syscall.WaitStatus); ok { 446 exitCode = status.ExitStatus() 447 } 448 return buf.Bytes(), exitCode, nil 449 450 case <-time.After(time.Until(deadline)): 451 process.Signal(os.Kill) 452 return nil, 0, context.DeadlineExceeded 453 } 454 455 } 456 457 func (l *LibcontainerExecutor) newTerminalSocket() (pty func() (*os.File, error), tty *os.File, err error) { 458 parent, child, err := lutils.NewSockPair("socket") 459 if err != nil { 460 return nil, nil, fmt.Errorf("failed to create terminal: %v", err) 461 } 462 463 return func() (*os.File, error) { return lutils.RecvFd(parent) }, child, err 464 465 } 466 467 func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool, 468 stream drivers.ExecTaskStream) error { 469 470 // the task process will be started by the container 471 process := &libcontainer.Process{ 472 Args: cmd, 473 Env: l.userProc.Env, 474 User: l.userProc.User, 475 Init: false, 476 Cwd: "/", 477 } 478 479 execHelper := &execHelper{ 480 logger: l.logger, 481 482 newTerminal: l.newTerminalSocket, 483 setTTY: func(tty *os.File) error { 484 process.ConsoleSocket = tty 485 return nil 486 }, 487 setIO: func(stdin io.Reader, stdout, stderr io.Writer) error { 488 process.Stdin = stdin 489 process.Stdout = stdout 490 process.Stderr = stderr 491 return nil 492 }, 493 494 processStart: func() error { return l.container.Run(process) }, 495 processWait: func() (*os.ProcessState, error) { 496 return process.Wait() 497 }, 498 } 499 500 return execHelper.run(ctx, tty, stream) 501 502 } 503 504 type waitResult struct { 505 ps *os.ProcessState 506 err error 507 } 508 509 func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) { 510 ps, err := process.Wait() 511 ch <- &waitResult{ps, err} 512 } 513 514 func configureCapabilities(cfg *lconfigs.Config, command *ExecCommand) { 515 switch command.User { 516 case "root": 517 // when running as root, use the legacy set of system capabilities, so 518 // that we do not break existing nomad clusters using this "feature" 519 legacyCaps := capabilities.LegacySupported().Slice(true) 520 cfg.Capabilities = &lconfigs.Capabilities{ 521 Bounding: legacyCaps, 522 Permitted: legacyCaps, 523 Effective: legacyCaps, 524 Ambient: nil, 525 Inheritable: nil, 526 } 527 default: 528 // otherwise apply the plugin + task capability configuration 529 cfg.Capabilities = &lconfigs.Capabilities{ 530 Bounding: command.Capabilities, 531 } 532 } 533 } 534 535 func configureNamespaces(pidMode, ipcMode string) lconfigs.Namespaces { 536 namespaces := lconfigs.Namespaces{{Type: lconfigs.NEWNS}} 537 if pidMode == IsolationModePrivate { 538 namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWPID}) 539 } 540 if ipcMode == IsolationModePrivate { 541 namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWIPC}) 542 } 543 return namespaces 544 } 545 546 // configureIsolation prepares the isolation primitives of the container. 547 // The process runs in a container configured with the following: 548 // 549 // * the task directory as the chroot 550 // * dedicated mount points namespace, but shares the PID, User, domain, network namespaces with host 551 // * small subset of devices (e.g. stdout/stderr/stdin, tty, shm, pts); default to using the same set of devices as Docker 552 // * some special filesystems: `/proc`, `/sys`. Some case is given to avoid exec escaping or setting malicious values through them. 553 func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error { 554 defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV 555 556 // set the new root directory for the container 557 cfg.Rootfs = command.TaskDir 558 559 // disable pivot_root if set in the driver's configuration 560 cfg.NoPivotRoot = command.NoPivotRoot 561 562 // set up default namespaces as configured 563 cfg.Namespaces = configureNamespaces(command.ModePID, command.ModeIPC) 564 565 if command.NetworkIsolation != nil { 566 cfg.Namespaces = append(cfg.Namespaces, lconfigs.Namespace{ 567 Type: lconfigs.NEWNET, 568 Path: command.NetworkIsolation.Path, 569 }) 570 } 571 572 // paths to mask using a bind mount to /dev/null to prevent reading 573 cfg.MaskPaths = []string{ 574 "/proc/kcore", 575 "/sys/firmware", 576 } 577 578 // paths that should be remounted as readonly inside the container 579 cfg.ReadonlyPaths = []string{ 580 "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", 581 } 582 583 cfg.Devices = specconv.AllowedDevices 584 if len(command.Devices) > 0 { 585 devs, err := cmdDevices(command.Devices) 586 if err != nil { 587 return err 588 } 589 cfg.Devices = append(cfg.Devices, devs...) 590 } 591 592 cfg.Mounts = []*lconfigs.Mount{ 593 { 594 Source: "tmpfs", 595 Destination: "/dev", 596 Device: "tmpfs", 597 Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, 598 Data: "mode=755", 599 }, 600 { 601 Source: "proc", 602 Destination: "/proc", 603 Device: "proc", 604 Flags: defaultMountFlags, 605 }, 606 { 607 Source: "devpts", 608 Destination: "/dev/pts", 609 Device: "devpts", 610 Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, 611 Data: "newinstance,ptmxmode=0666,mode=0620,gid=5", 612 }, 613 { 614 Device: "tmpfs", 615 Source: "shm", 616 Destination: "/dev/shm", 617 Data: "mode=1777,size=65536k", 618 Flags: defaultMountFlags, 619 }, 620 { 621 Source: "mqueue", 622 Destination: "/dev/mqueue", 623 Device: "mqueue", 624 Flags: defaultMountFlags, 625 }, 626 { 627 Source: "sysfs", 628 Destination: "/sys", 629 Device: "sysfs", 630 Flags: defaultMountFlags | syscall.MS_RDONLY, 631 }, 632 } 633 634 if len(command.Mounts) > 0 { 635 cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...) 636 } 637 638 return nil 639 } 640 641 func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error { 642 // If resources are not limited then manually create cgroups needed 643 if !command.ResourceLimits { 644 return cgutil.ConfigureBasicCgroups(cfg) 645 } 646 647 // set cgroups path 648 if cgutil.UseV2 { 649 // in v2, the cgroup must have been created by the client already, 650 // which breaks a lot of existing tests that run drivers without a client 651 if command.Resources == nil || command.Resources.LinuxResources == nil || command.Resources.LinuxResources.CpusetCgroupPath == "" { 652 return errors.New("cgroup path must be set") 653 } 654 parent, cgroup := cgutil.SplitPath(command.Resources.LinuxResources.CpusetCgroupPath) 655 cfg.Cgroups.Path = filepath.Join("/", parent, cgroup) 656 } else { 657 // in v1, the cgroup is created using /nomad, which is a bug because it 658 // does not respect the cgroup_parent client configuration 659 // (but makes testing easy) 660 id := uuid.Generate() 661 cfg.Cgroups.Path = filepath.Join("/", cgutil.DefaultCgroupV1Parent, id) 662 } 663 664 if command.Resources == nil || command.Resources.NomadResources == nil { 665 return nil 666 } 667 668 // Total amount of memory allowed to consume 669 res := command.Resources.NomadResources 670 memHard, memSoft := res.Memory.MemoryMaxMB, res.Memory.MemoryMB 671 if memHard <= 0 { 672 memHard = res.Memory.MemoryMB 673 memSoft = 0 674 } 675 676 if memHard > 0 { 677 cfg.Cgroups.Resources.Memory = memHard * 1024 * 1024 678 cfg.Cgroups.Resources.MemoryReservation = memSoft * 1024 * 1024 679 680 // Disable swap to avoid issues on the machine 681 var memSwappiness uint64 682 cfg.Cgroups.Resources.MemorySwappiness = &memSwappiness 683 } 684 685 cpuShares := res.Cpu.CpuShares 686 if cpuShares < 2 { 687 return fmt.Errorf("resources.Cpu.CpuShares must be equal to or greater than 2: %v", cpuShares) 688 } 689 690 // Set the relative CPU shares for this cgroup, and convert for cgroupv2 691 cfg.Cgroups.Resources.CpuShares = uint64(cpuShares) 692 cfg.Cgroups.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(uint64(cpuShares)) 693 694 if command.Resources.LinuxResources != nil && command.Resources.LinuxResources.CpusetCgroupPath != "" { 695 cfg.Hooks = lconfigs.Hooks{ 696 lconfigs.CreateRuntime: lconfigs.HookList{ 697 newSetCPUSetCgroupHook(command.Resources.LinuxResources.CpusetCgroupPath), 698 }, 699 } 700 } 701 702 return nil 703 } 704 705 func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) { 706 cfg := &lconfigs.Config{ 707 Cgroups: &lconfigs.Cgroup{ 708 Resources: &lconfigs.Resources{ 709 MemorySwappiness: nil, 710 }, 711 }, 712 Version: "1.0.0", 713 } 714 715 for _, device := range specconv.AllowedDevices { 716 cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.Rule) 717 } 718 719 configureCapabilities(cfg, command) 720 721 // children should not inherit Nomad agent oom_score_adj value 722 oomScoreAdj := 0 723 cfg.OomScoreAdj = &oomScoreAdj 724 725 if err := configureIsolation(cfg, command); err != nil { 726 return nil, err 727 } 728 729 if err := configureCgroups(cfg, command); err != nil { 730 return nil, err 731 } 732 733 return cfg, nil 734 } 735 736 // cmdDevices converts a list of driver.DeviceConfigs into excutor.Devices. 737 func cmdDevices(driverDevices []*drivers.DeviceConfig) ([]*devices.Device, error) { 738 if len(driverDevices) == 0 { 739 return nil, nil 740 } 741 742 r := make([]*devices.Device, len(driverDevices)) 743 744 for i, d := range driverDevices { 745 ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions) 746 if err != nil { 747 return nil, fmt.Errorf("failed to make device out for %s: %v", d.HostPath, err) 748 } 749 ed.Path = d.TaskPath 750 r[i] = ed 751 } 752 753 return r, nil 754 } 755 756 var userMountToUnixMount = map[string]int{ 757 // Empty string maps to `rprivate` for backwards compatibility in restored 758 // older tasks, where mount propagation will not be present. 759 "": unix.MS_PRIVATE | unix.MS_REC, // rprivate 760 structs.VolumeMountPropagationPrivate: unix.MS_PRIVATE | unix.MS_REC, // rprivate 761 structs.VolumeMountPropagationHostToTask: unix.MS_SLAVE | unix.MS_REC, // rslave 762 structs.VolumeMountPropagationBidirectional: unix.MS_SHARED | unix.MS_REC, // rshared 763 } 764 765 // cmdMounts converts a list of driver.MountConfigs into excutor.Mounts. 766 func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount { 767 if len(mounts) == 0 { 768 return nil 769 } 770 771 r := make([]*lconfigs.Mount, len(mounts)) 772 773 for i, m := range mounts { 774 flags := unix.MS_BIND 775 if m.Readonly { 776 flags |= unix.MS_RDONLY 777 } 778 779 r[i] = &lconfigs.Mount{ 780 Source: m.HostPath, 781 Destination: m.TaskPath, 782 Device: "bind", 783 Flags: flags, 784 PropagationFlags: []int{userMountToUnixMount[m.PropagationMode]}, 785 } 786 } 787 788 return r 789 } 790 791 // lookupTaskBin finds the file `bin`, searching in order: 792 // - taskDir/local 793 // - taskDir 794 // - each mount, in order listed in the jobspec 795 // - a PATH-like search of usr/local/bin/, usr/bin/, and bin/ inside the taskDir 796 // 797 // Returns an absolute path inside the container that will get passed as arg[0] 798 // to the launched process, and the absolute path to that binary as seen by the 799 // host (these will be identical for binaries that don't come from mounts). 800 // 801 // See also executor.lookupBin for a version used by non-isolated drivers. 802 func lookupTaskBin(command *ExecCommand) (string, string, error) { 803 taskDir := command.TaskDir 804 bin := command.Cmd 805 806 // Check in the local directory 807 localDir := filepath.Join(taskDir, allocdir.TaskLocal) 808 taskPath, hostPath, err := getPathInTaskDir(command.TaskDir, localDir, bin) 809 if err == nil { 810 return taskPath, hostPath, nil 811 } 812 813 // Check at the root of the task's directory 814 taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, command.TaskDir, bin) 815 if err == nil { 816 return taskPath, hostPath, nil 817 } 818 819 // Check in our mounts 820 for _, mount := range command.Mounts { 821 taskPath, hostPath, err = getPathInMount(mount.HostPath, mount.TaskPath, bin) 822 if err == nil { 823 return taskPath, hostPath, nil 824 } 825 } 826 827 // If there's a / in the binary's path, we can't fallback to a PATH search 828 if strings.Contains(bin, "/") { 829 return "", "", fmt.Errorf("file %s not found under path %s", bin, taskDir) 830 } 831 832 // look for a file using a PATH-style lookup inside the directory 833 // root. Similar to the stdlib's exec.LookPath except: 834 // - uses a restricted lookup PATH rather than the agent process's PATH env var. 835 // - does not require that the file is already executable (this will be ensured 836 // by the caller) 837 // - does not prevent using relative path as added to exec.LookPath in go1.19 838 // (this gets fixed-up in the caller) 839 840 // This is a fake PATH so that we're not using the agent's PATH 841 restrictedPaths := []string{"/usr/local/bin", "/usr/bin", "/bin"} 842 843 for _, dir := range restrictedPaths { 844 pathDir := filepath.Join(command.TaskDir, dir) 845 taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, pathDir, bin) 846 if err == nil { 847 return taskPath, hostPath, nil 848 } 849 } 850 851 return "", "", fmt.Errorf("file %s not found under path", bin) 852 } 853 854 // getPathInTaskDir searches for the binary in the task directory and nested 855 // search directory. It returns the absolute path rooted inside the container 856 // and the absolute path on the host. 857 func getPathInTaskDir(taskDir, searchDir, bin string) (string, string, error) { 858 859 hostPath := filepath.Join(searchDir, bin) 860 err := filepathIsRegular(hostPath) 861 if err != nil { 862 return "", "", err 863 } 864 865 // Find the path relative to the task directory 866 rel, err := filepath.Rel(taskDir, hostPath) 867 if rel == "" || err != nil { 868 return "", "", fmt.Errorf( 869 "failed to determine relative path base=%q target=%q: %v", 870 taskDir, hostPath, err) 871 } 872 873 // Turn relative-to-taskdir path into re-rooted absolute path to avoid 874 // libcontainer trying to resolve the binary using $PATH. 875 // Do *not* use filepath.Join as it will translate ".."s returned by 876 // filepath.Rel. Prepending "/" will cause the path to be rooted in the 877 // chroot which is the desired behavior. 878 return filepath.Clean("/" + rel), hostPath, nil 879 } 880 881 // getPathInMount for the binary in the mount's host path, constructing the path 882 // considering that the bin path is rooted in the mount's task path and not its 883 // host path. It returns the absolute path rooted inside the container and the 884 // absolute path on the host. 885 func getPathInMount(mountHostPath, mountTaskPath, bin string) (string, string, error) { 886 887 // Find the path relative to the mount point in the task so that we can 888 // trim off any shared prefix when we search on the host path 889 mountRel, err := filepath.Rel(mountTaskPath, bin) 890 if mountRel == "" || err != nil { 891 return "", "", fmt.Errorf("path was not relative to the mount task path") 892 } 893 894 hostPath := filepath.Join(mountHostPath, mountRel) 895 896 err = filepathIsRegular(hostPath) 897 if err != nil { 898 return "", "", err 899 } 900 901 // Turn relative-to-taskdir path into re-rooted absolute path to avoid 902 // libcontainer trying to resolve the binary using $PATH. 903 // Do *not* use filepath.Join as it will translate ".."s returned by 904 // filepath.Rel. Prepending "/" will cause the path to be rooted in the 905 // chroot which is the desired behavior. 906 return filepath.Clean("/" + bin), hostPath, nil 907 } 908 909 // filepathIsRegular verifies that a filepath is a regular file (i.e. not a 910 // directory, socket, device, etc.) 911 func filepathIsRegular(path string) error { 912 f, err := os.Stat(path) 913 if err != nil { 914 return err 915 } 916 if !f.Mode().Type().IsRegular() { 917 return fmt.Errorf("path was not a regular file") 918 } 919 return nil 920 } 921 922 func newSetCPUSetCgroupHook(cgroupPath string) lconfigs.Hook { 923 return lconfigs.NewFunctionHook(func(state *specs.State) error { 924 return cgroups.WriteCgroupProc(cgroupPath, state.Pid) 925 }) 926 }