github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/drivers/shared/executor/executor_linux.go (about)

     1  //go:build linux
     2  
     3  package executor
     4  
     5  import (
     6  	"context"
     7  	"errors"
     8  	"fmt"
     9  	"io"
    10  	"os"
    11  	"os/exec"
    12  	"path"
    13  	"path/filepath"
    14  	"strings"
    15  	"syscall"
    16  	"time"
    17  
    18  	"github.com/armon/circbuf"
    19  	"github.com/hashicorp/consul-template/signals"
    20  	hclog "github.com/hashicorp/go-hclog"
    21  	"github.com/hashicorp/nomad/client/allocdir"
    22  	"github.com/hashicorp/nomad/client/lib/cgutil"
    23  	"github.com/hashicorp/nomad/client/lib/resources"
    24  	"github.com/hashicorp/nomad/client/stats"
    25  	cstructs "github.com/hashicorp/nomad/client/structs"
    26  	"github.com/hashicorp/nomad/drivers/shared/capabilities"
    27  	shelpers "github.com/hashicorp/nomad/helper/stats"
    28  	"github.com/hashicorp/nomad/helper/uuid"
    29  	"github.com/hashicorp/nomad/nomad/structs"
    30  	"github.com/hashicorp/nomad/plugins/drivers"
    31  	"github.com/opencontainers/runc/libcontainer"
    32  	"github.com/opencontainers/runc/libcontainer/cgroups"
    33  	lconfigs "github.com/opencontainers/runc/libcontainer/configs"
    34  	"github.com/opencontainers/runc/libcontainer/devices"
    35  	ldevices "github.com/opencontainers/runc/libcontainer/devices"
    36  	"github.com/opencontainers/runc/libcontainer/specconv"
    37  	lutils "github.com/opencontainers/runc/libcontainer/utils"
    38  	"github.com/opencontainers/runtime-spec/specs-go"
    39  	"golang.org/x/sys/unix"
    40  )
    41  
var (
	// ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by
	// the executor with cgroup-v1.
	ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"}

	// ExecutorCgroupV2MeasuredMemStats is the list of memory stats captured by
	// the executor with cgroup-v2. cgroup-v2 exposes different memory stats and
	// no longer reports rss or max usage.
	ExecutorCgroupV2MeasuredMemStats = []string{"Cache", "Swap", "Usage"}

	// ExecutorCgroupMeasuredCpuStats is the list of CPU stats captured by the
	// executor.
	ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"}
)
    52  
// LibcontainerExecutor implements an Executor with the runc/libcontainer api
type LibcontainerExecutor struct {
	// id is the container name handed to the libcontainer factory in Launch;
	// command is the ExecCommand recorded by Launch.
	id      string
	command *ExecCommand

	logger hclog.Logger

	// The three CpuStats trackers receive the cgroup total/user/kernel CPU
	// usage values in handleStats (via Percent). pidCollector is started by
	// Launch (collectPids) and queried by handleStats (pidStats).
	totalCpuStats  *stats.CpuStats
	userCpuStats   *stats.CpuStats
	systemCpuStats *stats.CpuStats
	pidCollector   *pidCollector

	// container and userProc are populated by Launch; Shutdown treats a nil
	// container as "nothing to do". userProcExited is closed by wait() after
	// exitState has been recorded, which is what unblocks Wait.
	container      libcontainer.Container
	userProc       *libcontainer.Process
	userProcExited chan interface{}
	exitState      *ProcessState
}
    70  
    71  func NewExecutorWithIsolation(logger hclog.Logger) Executor {
    72  	logger = logger.Named("isolated_executor")
    73  	if err := shelpers.Init(); err != nil {
    74  		logger.Error("unable to initialize stats", "error", err)
    75  	}
    76  	return &LibcontainerExecutor{
    77  		id:             strings.ReplaceAll(uuid.Generate(), "-", "_"),
    78  		logger:         logger,
    79  		totalCpuStats:  stats.NewCpuStats(),
    80  		userCpuStats:   stats.NewCpuStats(),
    81  		systemCpuStats: stats.NewCpuStats(),
    82  		pidCollector:   newPidCollector(logger),
    83  	}
    84  }
    85  
    86  // Launch creates a new container in libcontainer and starts a new process with it
    87  func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) {
    88  	l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " "))
    89  
    90  	if command.Resources == nil {
    91  		command.Resources = &drivers.Resources{
    92  			NomadResources: &structs.AllocatedTaskResources{},
    93  		}
    94  	}
    95  
    96  	l.command = command
    97  
    98  	// create a new factory which will store the container state in the allocDir
    99  	factory, err := libcontainer.New(
   100  		path.Join(command.TaskDir, "../alloc/container"),
   101  		// note that os.Args[0] refers to the executor shim typically
   102  		// and first args arguments is ignored now due
   103  		// until https://github.com/opencontainers/runc/pull/1888 is merged
   104  		libcontainer.InitArgs(os.Args[0], "libcontainer-shim"),
   105  	)
   106  	if err != nil {
   107  		return nil, fmt.Errorf("failed to create factory: %v", err)
   108  	}
   109  
   110  	// A container groups processes under the same isolation enforcement
   111  	containerCfg, err := newLibcontainerConfig(command)
   112  	if err != nil {
   113  		return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err)
   114  	}
   115  
   116  	container, err := factory.Create(l.id, containerCfg)
   117  	if err != nil {
   118  		return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err)
   119  	}
   120  	l.container = container
   121  
   122  	// Look up the binary path and make it executable
   123  	taskPath, hostPath, err := lookupTaskBin(command)
   124  	if err != nil {
   125  		return nil, err
   126  	}
   127  	if err := makeExecutable(hostPath); err != nil {
   128  		return nil, err
   129  	}
   130  
   131  	combined := append([]string{taskPath}, command.Args...)
   132  	stdout, err := command.Stdout()
   133  	if err != nil {
   134  		return nil, err
   135  	}
   136  	stderr, err := command.Stderr()
   137  	if err != nil {
   138  		return nil, err
   139  	}
   140  
   141  	l.logger.Debug("launching", "command", command.Cmd, "args", strings.Join(command.Args, " "))
   142  
   143  	// the task process will be started by the container
   144  	process := &libcontainer.Process{
   145  		Args:   combined,
   146  		Env:    command.Env,
   147  		Stdout: stdout,
   148  		Stderr: stderr,
   149  		Init:   true,
   150  	}
   151  
   152  	if command.User != "" {
   153  		process.User = command.User
   154  	}
   155  	l.userProc = process
   156  
   157  	l.totalCpuStats = stats.NewCpuStats()
   158  	l.userCpuStats = stats.NewCpuStats()
   159  	l.systemCpuStats = stats.NewCpuStats()
   160  
   161  	// Starts the task
   162  	if err := container.Run(process); err != nil {
   163  		container.Destroy()
   164  		return nil, err
   165  	}
   166  
   167  	pid, err := process.Pid()
   168  	if err != nil {
   169  		container.Destroy()
   170  		return nil, err
   171  	}
   172  
   173  	// start a goroutine to wait on the process to complete, so Wait calls can
   174  	// be multiplexed
   175  	l.userProcExited = make(chan interface{})
   176  	go l.pidCollector.collectPids(l.userProcExited, l.getAllPids)
   177  	go l.wait()
   178  
   179  	return &ProcessState{
   180  		Pid:      pid,
   181  		ExitCode: -1,
   182  		Time:     time.Now(),
   183  	}, nil
   184  }
   185  
   186  func (l *LibcontainerExecutor) getAllPids() (resources.PIDs, error) {
   187  	pids, err := l.container.Processes()
   188  	if err != nil {
   189  		return nil, err
   190  	}
   191  	m := make(resources.PIDs, 1)
   192  	for _, pid := range pids {
   193  		m[pid] = resources.NewPID(pid)
   194  	}
   195  	return m, nil
   196  }
   197  
   198  // Wait waits until a process has exited and returns it's exitcode and errors
   199  func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) {
   200  	select {
   201  	case <-ctx.Done():
   202  		return nil, ctx.Err()
   203  	case <-l.userProcExited:
   204  		return l.exitState, nil
   205  	}
   206  }
   207  
   208  func (l *LibcontainerExecutor) wait() {
   209  	defer close(l.userProcExited)
   210  
   211  	ps, err := l.userProc.Wait()
   212  	if err != nil {
   213  		// If the process has exited before we called wait an error is returned
   214  		// the process state is embedded in the error
   215  		if exitErr, ok := err.(*exec.ExitError); ok {
   216  			ps = exitErr.ProcessState
   217  		} else {
   218  			l.logger.Error("failed to call wait on user process", "error", err)
   219  			l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()}
   220  			return
   221  		}
   222  	}
   223  
   224  	l.command.Close()
   225  
   226  	exitCode := 1
   227  	var signal int
   228  	if status, ok := ps.Sys().(syscall.WaitStatus); ok {
   229  		exitCode = status.ExitStatus()
   230  		if status.Signaled() {
   231  			const exitSignalBase = 128
   232  			signal = int(status.Signal())
   233  			exitCode = exitSignalBase + signal
   234  		}
   235  	}
   236  
   237  	l.exitState = &ProcessState{
   238  		Pid:      ps.Pid(),
   239  		ExitCode: exitCode,
   240  		Signal:   signal,
   241  		Time:     time.Now(),
   242  	}
   243  }
   244  
   245  // Shutdown stops all processes started and cleans up any resources
   246  // created (such as mountpoints, devices, etc).
   247  func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error {
   248  	if l.container == nil {
   249  		return nil
   250  	}
   251  
   252  	status, err := l.container.Status()
   253  	if err != nil {
   254  		return err
   255  	}
   256  
   257  	defer l.container.Destroy()
   258  
   259  	if status == libcontainer.Stopped {
   260  		return nil
   261  	}
   262  
   263  	if grace > 0 {
   264  		if signal == "" {
   265  			signal = "SIGINT"
   266  		}
   267  
   268  		sig, ok := signals.SignalLookup[signal]
   269  		if !ok {
   270  			return fmt.Errorf("error unknown signal given for shutdown: %s", signal)
   271  		}
   272  
   273  		// Signal initial container processes only during graceful
   274  		// shutdown; hence `false` arg.
   275  		err = l.container.Signal(sig, false)
   276  		if err != nil {
   277  			return err
   278  		}
   279  
   280  		select {
   281  		case <-l.userProcExited:
   282  			return nil
   283  		case <-time.After(grace):
   284  			// Force kill all container processes after grace period,
   285  			// hence `true` argument.
   286  			if err := l.container.Signal(os.Kill, true); err != nil {
   287  				return err
   288  			}
   289  		}
   290  	} else {
   291  		err := l.container.Signal(os.Kill, true)
   292  		if err != nil {
   293  			return err
   294  		}
   295  	}
   296  
   297  	select {
   298  	case <-l.userProcExited:
   299  		return nil
   300  	case <-time.After(time.Second * 15):
   301  		return fmt.Errorf("process failed to exit after 15 seconds")
   302  	}
   303  }
   304  
   305  // UpdateResources updates the resource isolation with new values to be enforced
   306  func (l *LibcontainerExecutor) UpdateResources(resources *drivers.Resources) error {
   307  	return nil
   308  }
   309  
   310  // Version returns the api version of the executor
   311  func (l *LibcontainerExecutor) Version() (*ExecutorVersion, error) {
   312  	return &ExecutorVersion{Version: ExecutorVersionLatest}, nil
   313  }
   314  
   315  // Stats returns the resource statistics for processes managed by the executor
   316  func (l *LibcontainerExecutor) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
   317  	ch := make(chan *cstructs.TaskResourceUsage)
   318  	go l.handleStats(ch, ctx, interval)
   319  	return ch, nil
   320  
   321  }
   322  
   323  func (l *LibcontainerExecutor) handleStats(ch chan *cstructs.TaskResourceUsage, ctx context.Context, interval time.Duration) {
   324  	defer close(ch)
   325  	timer := time.NewTimer(0)
   326  
   327  	measuredMemStats := ExecutorCgroupV1MeasuredMemStats
   328  	if cgroups.IsCgroup2UnifiedMode() {
   329  		measuredMemStats = ExecutorCgroupV2MeasuredMemStats
   330  	}
   331  
   332  	for {
   333  		select {
   334  		case <-ctx.Done():
   335  			return
   336  
   337  		case <-timer.C:
   338  			timer.Reset(interval)
   339  		}
   340  
   341  		lstats, err := l.container.Stats()
   342  		if err != nil {
   343  			l.logger.Warn("error collecting stats", "error", err)
   344  			return
   345  		}
   346  
   347  		pidStats, err := l.pidCollector.pidStats()
   348  		if err != nil {
   349  			l.logger.Warn("error collecting stats", "error", err)
   350  			return
   351  		}
   352  
   353  		ts := time.Now()
   354  		stats := lstats.CgroupStats
   355  
   356  		// Memory Related Stats
   357  		swap := stats.MemoryStats.SwapUsage
   358  		maxUsage := stats.MemoryStats.Usage.MaxUsage
   359  		rss := stats.MemoryStats.Stats["rss"]
   360  		cache := stats.MemoryStats.Stats["cache"]
   361  		mapped_file := stats.MemoryStats.Stats["mapped_file"]
   362  		ms := &cstructs.MemoryStats{
   363  			RSS:            rss,
   364  			Cache:          cache,
   365  			Swap:           swap.Usage,
   366  			MappedFile:     mapped_file,
   367  			Usage:          stats.MemoryStats.Usage.Usage,
   368  			MaxUsage:       maxUsage,
   369  			KernelUsage:    stats.MemoryStats.KernelUsage.Usage,
   370  			KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage,
   371  			Measured:       measuredMemStats,
   372  		}
   373  
   374  		// CPU Related Stats
   375  		totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage)
   376  		userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode)
   377  		kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode)
   378  
   379  		totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage)
   380  		cs := &cstructs.CpuStats{
   381  			SystemMode:       l.systemCpuStats.Percent(kernelModeTime),
   382  			UserMode:         l.userCpuStats.Percent(userModeTime),
   383  			Percent:          totalPercent,
   384  			ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods,
   385  			ThrottledTime:    stats.CpuStats.ThrottlingData.ThrottledTime,
   386  			TotalTicks:       l.systemCpuStats.TicksConsumed(totalPercent),
   387  			Measured:         ExecutorCgroupMeasuredCpuStats,
   388  		}
   389  		taskResUsage := cstructs.TaskResourceUsage{
   390  			ResourceUsage: &cstructs.ResourceUsage{
   391  				MemoryStats: ms,
   392  				CpuStats:    cs,
   393  			},
   394  			Timestamp: ts.UTC().UnixNano(),
   395  			Pids:      pidStats,
   396  		}
   397  
   398  		select {
   399  		case <-ctx.Done():
   400  			return
   401  		case ch <- &taskResUsage:
   402  		}
   403  
   404  	}
   405  }
   406  
   407  // Signal sends a signal to the process managed by the executor
   408  func (l *LibcontainerExecutor) Signal(s os.Signal) error {
   409  	return l.userProc.Signal(s)
   410  }
   411  
   412  // Exec starts an additional process inside the container
   413  func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) {
   414  	combined := append([]string{cmd}, args...)
   415  	// Capture output
   416  	buf, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize))
   417  
   418  	process := &libcontainer.Process{
   419  		Args:   combined,
   420  		Env:    l.command.Env,
   421  		Stdout: buf,
   422  		Stderr: buf,
   423  	}
   424  
   425  	err := l.container.Run(process)
   426  	if err != nil {
   427  		return nil, 0, err
   428  	}
   429  
   430  	waitCh := make(chan *waitResult)
   431  	defer close(waitCh)
   432  	go l.handleExecWait(waitCh, process)
   433  
   434  	select {
   435  	case result := <-waitCh:
   436  		ps := result.ps
   437  		if result.err != nil {
   438  			if exitErr, ok := result.err.(*exec.ExitError); ok {
   439  				ps = exitErr.ProcessState
   440  			} else {
   441  				return nil, 0, result.err
   442  			}
   443  		}
   444  		var exitCode int
   445  		if status, ok := ps.Sys().(syscall.WaitStatus); ok {
   446  			exitCode = status.ExitStatus()
   447  		}
   448  		return buf.Bytes(), exitCode, nil
   449  
   450  	case <-time.After(time.Until(deadline)):
   451  		process.Signal(os.Kill)
   452  		return nil, 0, context.DeadlineExceeded
   453  	}
   454  
   455  }
   456  
   457  func (l *LibcontainerExecutor) newTerminalSocket() (pty func() (*os.File, error), tty *os.File, err error) {
   458  	parent, child, err := lutils.NewSockPair("socket")
   459  	if err != nil {
   460  		return nil, nil, fmt.Errorf("failed to create terminal: %v", err)
   461  	}
   462  
   463  	return func() (*os.File, error) { return lutils.RecvFd(parent) }, child, err
   464  
   465  }
   466  
   467  func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool,
   468  	stream drivers.ExecTaskStream) error {
   469  
   470  	// the task process will be started by the container
   471  	process := &libcontainer.Process{
   472  		Args: cmd,
   473  		Env:  l.userProc.Env,
   474  		User: l.userProc.User,
   475  		Init: false,
   476  		Cwd:  "/",
   477  	}
   478  
   479  	execHelper := &execHelper{
   480  		logger: l.logger,
   481  
   482  		newTerminal: l.newTerminalSocket,
   483  		setTTY: func(tty *os.File) error {
   484  			process.ConsoleSocket = tty
   485  			return nil
   486  		},
   487  		setIO: func(stdin io.Reader, stdout, stderr io.Writer) error {
   488  			process.Stdin = stdin
   489  			process.Stdout = stdout
   490  			process.Stderr = stderr
   491  			return nil
   492  		},
   493  
   494  		processStart: func() error { return l.container.Run(process) },
   495  		processWait: func() (*os.ProcessState, error) {
   496  			return process.Wait()
   497  		},
   498  	}
   499  
   500  	return execHelper.run(ctx, tty, stream)
   501  
   502  }
   503  
// waitResult bundles the two return values of a process Wait call so they can
// be sent over a channel together (see handleExecWait).
type waitResult struct {
	ps  *os.ProcessState
	err error
}
   508  
   509  func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) {
   510  	ps, err := process.Wait()
   511  	ch <- &waitResult{ps, err}
   512  }
   513  
   514  func configureCapabilities(cfg *lconfigs.Config, command *ExecCommand) {
   515  	switch command.User {
   516  	case "root":
   517  		// when running as root, use the legacy set of system capabilities, so
   518  		// that we do not break existing nomad clusters using this "feature"
   519  		legacyCaps := capabilities.LegacySupported().Slice(true)
   520  		cfg.Capabilities = &lconfigs.Capabilities{
   521  			Bounding:    legacyCaps,
   522  			Permitted:   legacyCaps,
   523  			Effective:   legacyCaps,
   524  			Ambient:     nil,
   525  			Inheritable: nil,
   526  		}
   527  	default:
   528  		// otherwise apply the plugin + task capability configuration
   529  		cfg.Capabilities = &lconfigs.Capabilities{
   530  			Bounding: command.Capabilities,
   531  		}
   532  	}
   533  }
   534  
   535  func configureNamespaces(pidMode, ipcMode string) lconfigs.Namespaces {
   536  	namespaces := lconfigs.Namespaces{{Type: lconfigs.NEWNS}}
   537  	if pidMode == IsolationModePrivate {
   538  		namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWPID})
   539  	}
   540  	if ipcMode == IsolationModePrivate {
   541  		namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWIPC})
   542  	}
   543  	return namespaces
   544  }
   545  
   546  // configureIsolation prepares the isolation primitives of the container.
   547  // The process runs in a container configured with the following:
   548  //
   549  // * the task directory as the chroot
   550  // * dedicated mount points namespace, but shares the PID, User, domain, network namespaces with host
   551  // * small subset of devices (e.g. stdout/stderr/stdin, tty, shm, pts); default to using the same set of devices as Docker
   552  // * some special filesystems: `/proc`, `/sys`.  Some case is given to avoid exec escaping or setting malicious values through them.
   553  func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error {
   554  	defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
   555  
   556  	// set the new root directory for the container
   557  	cfg.Rootfs = command.TaskDir
   558  
   559  	// disable pivot_root if set in the driver's configuration
   560  	cfg.NoPivotRoot = command.NoPivotRoot
   561  
   562  	// set up default namespaces as configured
   563  	cfg.Namespaces = configureNamespaces(command.ModePID, command.ModeIPC)
   564  
   565  	if command.NetworkIsolation != nil {
   566  		cfg.Namespaces = append(cfg.Namespaces, lconfigs.Namespace{
   567  			Type: lconfigs.NEWNET,
   568  			Path: command.NetworkIsolation.Path,
   569  		})
   570  	}
   571  
   572  	// paths to mask using a bind mount to /dev/null to prevent reading
   573  	cfg.MaskPaths = []string{
   574  		"/proc/kcore",
   575  		"/sys/firmware",
   576  	}
   577  
   578  	// paths that should be remounted as readonly inside the container
   579  	cfg.ReadonlyPaths = []string{
   580  		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
   581  	}
   582  
   583  	cfg.Devices = specconv.AllowedDevices
   584  	if len(command.Devices) > 0 {
   585  		devs, err := cmdDevices(command.Devices)
   586  		if err != nil {
   587  			return err
   588  		}
   589  		cfg.Devices = append(cfg.Devices, devs...)
   590  	}
   591  
   592  	cfg.Mounts = []*lconfigs.Mount{
   593  		{
   594  			Source:      "tmpfs",
   595  			Destination: "/dev",
   596  			Device:      "tmpfs",
   597  			Flags:       syscall.MS_NOSUID | syscall.MS_STRICTATIME,
   598  			Data:        "mode=755",
   599  		},
   600  		{
   601  			Source:      "proc",
   602  			Destination: "/proc",
   603  			Device:      "proc",
   604  			Flags:       defaultMountFlags,
   605  		},
   606  		{
   607  			Source:      "devpts",
   608  			Destination: "/dev/pts",
   609  			Device:      "devpts",
   610  			Flags:       syscall.MS_NOSUID | syscall.MS_NOEXEC,
   611  			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
   612  		},
   613  		{
   614  			Device:      "tmpfs",
   615  			Source:      "shm",
   616  			Destination: "/dev/shm",
   617  			Data:        "mode=1777,size=65536k",
   618  			Flags:       defaultMountFlags,
   619  		},
   620  		{
   621  			Source:      "mqueue",
   622  			Destination: "/dev/mqueue",
   623  			Device:      "mqueue",
   624  			Flags:       defaultMountFlags,
   625  		},
   626  		{
   627  			Source:      "sysfs",
   628  			Destination: "/sys",
   629  			Device:      "sysfs",
   630  			Flags:       defaultMountFlags | syscall.MS_RDONLY,
   631  		},
   632  	}
   633  
   634  	if len(command.Mounts) > 0 {
   635  		cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...)
   636  	}
   637  
   638  	return nil
   639  }
   640  
   641  func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error {
   642  	// If resources are not limited then manually create cgroups needed
   643  	if !command.ResourceLimits {
   644  		return cgutil.ConfigureBasicCgroups(cfg)
   645  	}
   646  
   647  	// set cgroups path
   648  	if cgutil.UseV2 {
   649  		// in v2, the cgroup must have been created by the client already,
   650  		// which breaks a lot of existing tests that run drivers without a client
   651  		if command.Resources == nil || command.Resources.LinuxResources == nil || command.Resources.LinuxResources.CpusetCgroupPath == "" {
   652  			return errors.New("cgroup path must be set")
   653  		}
   654  		parent, cgroup := cgutil.SplitPath(command.Resources.LinuxResources.CpusetCgroupPath)
   655  		cfg.Cgroups.Path = filepath.Join("/", parent, cgroup)
   656  	} else {
   657  		// in v1, the cgroup is created using /nomad, which is a bug because it
   658  		// does not respect the cgroup_parent client configuration
   659  		// (but makes testing easy)
   660  		id := uuid.Generate()
   661  		cfg.Cgroups.Path = filepath.Join("/", cgutil.DefaultCgroupV1Parent, id)
   662  	}
   663  
   664  	if command.Resources == nil || command.Resources.NomadResources == nil {
   665  		return nil
   666  	}
   667  
   668  	// Total amount of memory allowed to consume
   669  	res := command.Resources.NomadResources
   670  	memHard, memSoft := res.Memory.MemoryMaxMB, res.Memory.MemoryMB
   671  	if memHard <= 0 {
   672  		memHard = res.Memory.MemoryMB
   673  		memSoft = 0
   674  	}
   675  
   676  	if memHard > 0 {
   677  		cfg.Cgroups.Resources.Memory = memHard * 1024 * 1024
   678  		cfg.Cgroups.Resources.MemoryReservation = memSoft * 1024 * 1024
   679  
   680  		// Disable swap to avoid issues on the machine
   681  		var memSwappiness uint64
   682  		cfg.Cgroups.Resources.MemorySwappiness = &memSwappiness
   683  	}
   684  
   685  	cpuShares := res.Cpu.CpuShares
   686  	if cpuShares < 2 {
   687  		return fmt.Errorf("resources.Cpu.CpuShares must be equal to or greater than 2: %v", cpuShares)
   688  	}
   689  
   690  	// Set the relative CPU shares for this cgroup, and convert for cgroupv2
   691  	cfg.Cgroups.Resources.CpuShares = uint64(cpuShares)
   692  	cfg.Cgroups.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(uint64(cpuShares))
   693  
   694  	if command.Resources.LinuxResources != nil && command.Resources.LinuxResources.CpusetCgroupPath != "" {
   695  		cfg.Hooks = lconfigs.Hooks{
   696  			lconfigs.CreateRuntime: lconfigs.HookList{
   697  				newSetCPUSetCgroupHook(command.Resources.LinuxResources.CpusetCgroupPath),
   698  			},
   699  		}
   700  	}
   701  
   702  	return nil
   703  }
   704  
   705  func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) {
   706  	cfg := &lconfigs.Config{
   707  		Cgroups: &lconfigs.Cgroup{
   708  			Resources: &lconfigs.Resources{
   709  				MemorySwappiness: nil,
   710  			},
   711  		},
   712  		Version: "1.0.0",
   713  	}
   714  
   715  	for _, device := range specconv.AllowedDevices {
   716  		cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.Rule)
   717  	}
   718  
   719  	configureCapabilities(cfg, command)
   720  
   721  	// children should not inherit Nomad agent oom_score_adj value
   722  	oomScoreAdj := 0
   723  	cfg.OomScoreAdj = &oomScoreAdj
   724  
   725  	if err := configureIsolation(cfg, command); err != nil {
   726  		return nil, err
   727  	}
   728  
   729  	if err := configureCgroups(cfg, command); err != nil {
   730  		return nil, err
   731  	}
   732  
   733  	return cfg, nil
   734  }
   735  
   736  // cmdDevices converts a list of driver.DeviceConfigs into excutor.Devices.
   737  func cmdDevices(driverDevices []*drivers.DeviceConfig) ([]*devices.Device, error) {
   738  	if len(driverDevices) == 0 {
   739  		return nil, nil
   740  	}
   741  
   742  	r := make([]*devices.Device, len(driverDevices))
   743  
   744  	for i, d := range driverDevices {
   745  		ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions)
   746  		if err != nil {
   747  			return nil, fmt.Errorf("failed to make device out for %s: %v", d.HostPath, err)
   748  		}
   749  		ed.Path = d.TaskPath
   750  		r[i] = ed
   751  	}
   752  
   753  	return r, nil
   754  }
   755  
// userMountToUnixMount maps a volume mount propagation mode to the unix mount
// propagation flags that cmdMounts passes to libcontainer. Every entry is
// combined with MS_REC.
var userMountToUnixMount = map[string]int{
	// Empty string maps to `rprivate` for backwards compatibility in restored
	// older tasks, where mount propagation will not be present.
	"":                                       unix.MS_PRIVATE | unix.MS_REC, // rprivate
	structs.VolumeMountPropagationPrivate:    unix.MS_PRIVATE | unix.MS_REC, // rprivate
	structs.VolumeMountPropagationHostToTask: unix.MS_SLAVE | unix.MS_REC,   // rslave
	structs.VolumeMountPropagationBidirectional: unix.MS_SHARED | unix.MS_REC, // rshared
}
   764  
   765  // cmdMounts converts a list of driver.MountConfigs into excutor.Mounts.
   766  func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount {
   767  	if len(mounts) == 0 {
   768  		return nil
   769  	}
   770  
   771  	r := make([]*lconfigs.Mount, len(mounts))
   772  
   773  	for i, m := range mounts {
   774  		flags := unix.MS_BIND
   775  		if m.Readonly {
   776  			flags |= unix.MS_RDONLY
   777  		}
   778  
   779  		r[i] = &lconfigs.Mount{
   780  			Source:           m.HostPath,
   781  			Destination:      m.TaskPath,
   782  			Device:           "bind",
   783  			Flags:            flags,
   784  			PropagationFlags: []int{userMountToUnixMount[m.PropagationMode]},
   785  		}
   786  	}
   787  
   788  	return r
   789  }
   790  
   791  // lookupTaskBin finds the file `bin`, searching in order:
   792  //   - taskDir/local
   793  //   - taskDir
   794  //   - each mount, in order listed in the jobspec
   795  //   - a PATH-like search of usr/local/bin/, usr/bin/, and bin/ inside the taskDir
   796  //
   797  // Returns an absolute path inside the container that will get passed as arg[0]
   798  // to the launched process, and the absolute path to that binary as seen by the
   799  // host (these will be identical for binaries that don't come from mounts).
   800  //
   801  // See also executor.lookupBin for a version used by non-isolated drivers.
   802  func lookupTaskBin(command *ExecCommand) (string, string, error) {
   803  	taskDir := command.TaskDir
   804  	bin := command.Cmd
   805  
   806  	// Check in the local directory
   807  	localDir := filepath.Join(taskDir, allocdir.TaskLocal)
   808  	taskPath, hostPath, err := getPathInTaskDir(command.TaskDir, localDir, bin)
   809  	if err == nil {
   810  		return taskPath, hostPath, nil
   811  	}
   812  
   813  	// Check at the root of the task's directory
   814  	taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, command.TaskDir, bin)
   815  	if err == nil {
   816  		return taskPath, hostPath, nil
   817  	}
   818  
   819  	// Check in our mounts
   820  	for _, mount := range command.Mounts {
   821  		taskPath, hostPath, err = getPathInMount(mount.HostPath, mount.TaskPath, bin)
   822  		if err == nil {
   823  			return taskPath, hostPath, nil
   824  		}
   825  	}
   826  
   827  	// If there's a / in the binary's path, we can't fallback to a PATH search
   828  	if strings.Contains(bin, "/") {
   829  		return "", "", fmt.Errorf("file %s not found under path %s", bin, taskDir)
   830  	}
   831  
   832  	// look for a file using a PATH-style lookup inside the directory
   833  	// root. Similar to the stdlib's exec.LookPath except:
   834  	//   - uses a restricted lookup PATH rather than the agent process's PATH env var.
   835  	//   - does not require that the file is already executable (this will be ensured
   836  	//     by the caller)
   837  	//   - does not prevent using relative path as added to exec.LookPath in go1.19
   838  	//     (this gets fixed-up in the caller)
   839  
   840  	// This is a fake PATH so that we're not using the agent's PATH
   841  	restrictedPaths := []string{"/usr/local/bin", "/usr/bin", "/bin"}
   842  
   843  	for _, dir := range restrictedPaths {
   844  		pathDir := filepath.Join(command.TaskDir, dir)
   845  		taskPath, hostPath, err = getPathInTaskDir(command.TaskDir, pathDir, bin)
   846  		if err == nil {
   847  			return taskPath, hostPath, nil
   848  		}
   849  	}
   850  
   851  	return "", "", fmt.Errorf("file %s not found under path", bin)
   852  }
   853  
   854  // getPathInTaskDir searches for the binary in the task directory and nested
   855  // search directory. It returns the absolute path rooted inside the container
   856  // and the absolute path on the host.
   857  func getPathInTaskDir(taskDir, searchDir, bin string) (string, string, error) {
   858  
   859  	hostPath := filepath.Join(searchDir, bin)
   860  	err := filepathIsRegular(hostPath)
   861  	if err != nil {
   862  		return "", "", err
   863  	}
   864  
   865  	// Find the path relative to the task directory
   866  	rel, err := filepath.Rel(taskDir, hostPath)
   867  	if rel == "" || err != nil {
   868  		return "", "", fmt.Errorf(
   869  			"failed to determine relative path base=%q target=%q: %v",
   870  			taskDir, hostPath, err)
   871  	}
   872  
   873  	// Turn relative-to-taskdir path into re-rooted absolute path to avoid
   874  	// libcontainer trying to resolve the binary using $PATH.
   875  	// Do *not* use filepath.Join as it will translate ".."s returned by
   876  	// filepath.Rel. Prepending "/" will cause the path to be rooted in the
   877  	// chroot which is the desired behavior.
   878  	return filepath.Clean("/" + rel), hostPath, nil
   879  }
   880  
   881  // getPathInMount for the binary in the mount's host path, constructing the path
   882  // considering that the bin path is rooted in the mount's task path and not its
   883  // host path. It returns the absolute path rooted inside the container and the
   884  // absolute path on the host.
   885  func getPathInMount(mountHostPath, mountTaskPath, bin string) (string, string, error) {
   886  
   887  	// Find the path relative to the mount point in the task so that we can
   888  	// trim off any shared prefix when we search on the host path
   889  	mountRel, err := filepath.Rel(mountTaskPath, bin)
   890  	if mountRel == "" || err != nil {
   891  		return "", "", fmt.Errorf("path was not relative to the mount task path")
   892  	}
   893  
   894  	hostPath := filepath.Join(mountHostPath, mountRel)
   895  
   896  	err = filepathIsRegular(hostPath)
   897  	if err != nil {
   898  		return "", "", err
   899  	}
   900  
   901  	// Turn relative-to-taskdir path into re-rooted absolute path to avoid
   902  	// libcontainer trying to resolve the binary using $PATH.
   903  	// Do *not* use filepath.Join as it will translate ".."s returned by
   904  	// filepath.Rel. Prepending "/" will cause the path to be rooted in the
   905  	// chroot which is the desired behavior.
   906  	return filepath.Clean("/" + bin), hostPath, nil
   907  }
   908  
   909  // filepathIsRegular verifies that a filepath is a regular file (i.e. not a
   910  // directory, socket, device, etc.)
   911  func filepathIsRegular(path string) error {
   912  	f, err := os.Stat(path)
   913  	if err != nil {
   914  		return err
   915  	}
   916  	if !f.Mode().Type().IsRegular() {
   917  		return fmt.Errorf("path was not a regular file")
   918  	}
   919  	return nil
   920  }
   921  
   922  func newSetCPUSetCgroupHook(cgroupPath string) lconfigs.Hook {
   923  	return lconfigs.NewFunctionHook(func(state *specs.State) error {
   924  		return cgroups.WriteCgroupProc(cgroupPath, state.Pid)
   925  	})
   926  }