github.com/hernad/nomad@v1.6.112/drivers/nix/_executor/executor_linux.go (about)

     1  //go:build linux
     2  
     3  package executor
     4  
     5  import (
     6  	"context"
     7  	"errors"
     8  	"fmt"
     9  	"io"
    10  	"io/fs"
    11  	"os"
    12  	"os/exec"
    13  	"path"
    14  	"path/filepath"
    15  	"strings"
    16  	"syscall"
    17  	"time"
    18  
    19  	"github.com/armon/circbuf"
    20  	"github.com/hernad/consul-template/signals"
    21  	hclog "github.com/hashicorp/go-hclog"
    22  	"github.com/hernad/nomad/client/allocdir"
    23  	"github.com/hernad/nomad/client/lib/cgutil"
    24  	"github.com/hernad/nomad/client/lib/resources"
    25  	"github.com/hernad/nomad/client/stats"
    26  	cstructs "github.com/hernad/nomad/client/structs"
    27  	"github.com/hernad/nomad/drivers/shared/capabilities"
    28  	shelpers "github.com/hernad/nomad/helper/stats"
    29  	"github.com/hernad/nomad/helper/uuid"
    30  	"github.com/hernad/nomad/nomad/structs"
    31  	"github.com/hernad/nomad/plugins/drivers"
    32  	"github.com/opencontainers/runc/libcontainer"
    33  	"github.com/opencontainers/runc/libcontainer/cgroups"
    34  	lconfigs "github.com/opencontainers/runc/libcontainer/configs"
    35  	"github.com/opencontainers/runc/libcontainer/devices"
    36  	ldevices "github.com/opencontainers/runc/libcontainer/devices"
    37  	"github.com/opencontainers/runc/libcontainer/specconv"
    38  	lutils "github.com/opencontainers/runc/libcontainer/utils"
    39  	"github.com/opencontainers/runtime-spec/specs-go"
    40  	"golang.org/x/sys/unix"
    41  )
    42  
    43  var (
    44  	// ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v1
    45  	ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"}
    46  
    47  	// ExecutorCgroupV2MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v2. cgroup-v2 exposes different memory stats and no longer reports rss or max usage.
    48  	ExecutorCgroupV2MeasuredMemStats = []string{"Cache", "Swap", "Usage"}
    49  
    50  	// ExecutorCgroupMeasuredCpuStats is the list of CPU stats captures by the executor
    51  	ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"}
    52  )
    53  
// LibcontainerExecutor implements an Executor with the runc/libcontainer api
type LibcontainerExecutor struct {
	// id is the container id passed to the libcontainer factory in Launch;
	// NewExecutorWithIsolation derives it from a generated UUID with "-"
	// replaced by "_".
	id      string
	// command is the command most recently passed to Launch.
	command *ExecCommand

	logger hclog.Logger

	// CPU usage trackers; handleStats feeds them cumulative cgroup CPU
	// counters to obtain percentages.
	totalCpuStats  *stats.CpuStats
	userCpuStats   *stats.CpuStats
	systemCpuStats *stats.CpuStats
	// pidCollector is started in Launch with getAllPids as its pid source
	// and is queried by handleStats for per-pid stats.
	pidCollector   *pidCollector

	// container is created in Launch and stays nil until then.
	container      libcontainer.Container
	// userProc is the task process started inside the container by Launch.
	userProc       *libcontainer.Process
	// userProcExited is closed by wait() once userProc.Wait has returned.
	userProcExited chan interface{}
	// exitState is written by wait() before userProcExited is closed and is
	// returned by Wait after that channel is closed.
	exitState      *ProcessState
}
    71  
    72  func NewExecutorWithIsolation(logger hclog.Logger) Executor {
    73  	logger = logger.Named("isolated_executor")
    74  	if err := shelpers.Init(); err != nil {
    75  		logger.Error("unable to initialize stats", "error", err)
    76  	}
    77  	return &LibcontainerExecutor{
    78  		id:             strings.ReplaceAll(uuid.Generate(), "-", "_"),
    79  		logger:         logger,
    80  		totalCpuStats:  stats.NewCpuStats(),
    81  		userCpuStats:   stats.NewCpuStats(),
    82  		systemCpuStats: stats.NewCpuStats(),
    83  		pidCollector:   newPidCollector(logger),
    84  	}
    85  }
    86  
    87  // Launch creates a new container in libcontainer and starts a new process with it
    88  func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) {
    89  	l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " "))
    90  
    91  	if command.Resources == nil {
    92  		command.Resources = &drivers.Resources{
    93  			NomadResources: &structs.AllocatedTaskResources{},
    94  		}
    95  	}
    96  
    97  	l.command = command
    98  
    99  	// create a new factory which will store the container state in the allocDir
   100  	factory, err := libcontainer.New(
   101  		path.Join(command.TaskDir, "../alloc/container"),
   102  		// note that os.Args[0] refers to the executor shim typically
   103  		// and first args arguments is ignored now due
   104  		// until https://github.com/opencontainers/runc/pull/1888 is merged
   105  		libcontainer.InitArgs(os.Args[0], "libcontainer-shim"),
   106  	)
   107  	if err != nil {
   108  		return nil, fmt.Errorf("failed to create factory: %v", err)
   109  	}
   110  
   111  	// A container groups processes under the same isolation enforcement
   112  	containerCfg, err := newLibcontainerConfig(command)
   113  	if err != nil {
   114  		return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err)
   115  	}
   116  
   117  	container, err := factory.Create(l.id, containerCfg)
   118  	if err != nil {
   119  		return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err)
   120  	}
   121  	l.container = container
   122  
   123  	// Look up the binary path and make it executable
   124  	taskPath, hostPath, err := lookupTaskBin(command)
   125  	if err != nil {
   126  		return nil, err
   127  	}
   128  	if err := makeExecutable(hostPath); err != nil {
   129  		return nil, err
   130  	}
   131  
   132  	combined := append([]string{taskPath}, command.Args...)
   133  	stdout, err := command.Stdout()
   134  	if err != nil {
   135  		return nil, err
   136  	}
   137  	stderr, err := command.Stderr()
   138  	if err != nil {
   139  		return nil, err
   140  	}
   141  
   142  	l.logger.Debug("launching", "command", command.Cmd, "args", strings.Join(command.Args, " "))
   143  
   144  	// the task process will be started by the container
   145  	process := &libcontainer.Process{
   146  		Args:   combined,
   147  		Env:    command.Env,
   148  		Stdout: stdout,
   149  		Stderr: stderr,
   150  		Init:   true,
   151  	}
   152  
   153  	if command.User != "" {
   154  		process.User = command.User
   155  	}
   156  	l.userProc = process
   157  
   158  	l.totalCpuStats = stats.NewCpuStats()
   159  	l.userCpuStats = stats.NewCpuStats()
   160  	l.systemCpuStats = stats.NewCpuStats()
   161  
   162  	// Starts the task
   163  	if err := container.Run(process); err != nil {
   164  		container.Destroy()
   165  		return nil, err
   166  	}
   167  
   168  	pid, err := process.Pid()
   169  	if err != nil {
   170  		container.Destroy()
   171  		return nil, err
   172  	}
   173  
   174  	// start a goroutine to wait on the process to complete, so Wait calls can
   175  	// be multiplexed
   176  	l.userProcExited = make(chan interface{})
   177  	go l.pidCollector.collectPids(l.userProcExited, l.getAllPids)
   178  	go l.wait()
   179  
   180  	return &ProcessState{
   181  		Pid:      pid,
   182  		ExitCode: -1,
   183  		Time:     time.Now(),
   184  	}, nil
   185  }
   186  
   187  func (l *LibcontainerExecutor) getAllPids() (resources.PIDs, error) {
   188  	pids, err := l.container.Processes()
   189  	if err != nil {
   190  		return nil, err
   191  	}
   192  	m := make(resources.PIDs, 1)
   193  	for _, pid := range pids {
   194  		m[pid] = resources.NewPID(pid)
   195  	}
   196  	return m, nil
   197  }
   198  
   199  // Wait waits until a process has exited and returns it's exitcode and errors
   200  func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) {
   201  	select {
   202  	case <-ctx.Done():
   203  		return nil, ctx.Err()
   204  	case <-l.userProcExited:
   205  		return l.exitState, nil
   206  	}
   207  }
   208  
   209  func (l *LibcontainerExecutor) wait() {
   210  	defer close(l.userProcExited)
   211  
   212  	ps, err := l.userProc.Wait()
   213  	if err != nil {
   214  		// If the process has exited before we called wait an error is returned
   215  		// the process state is embedded in the error
   216  		if exitErr, ok := err.(*exec.ExitError); ok {
   217  			ps = exitErr.ProcessState
   218  		} else {
   219  			l.logger.Error("failed to call wait on user process", "error", err)
   220  			l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()}
   221  			return
   222  		}
   223  	}
   224  
   225  	l.command.Close()
   226  
   227  	exitCode := 1
   228  	var signal int
   229  	if status, ok := ps.Sys().(syscall.WaitStatus); ok {
   230  		exitCode = status.ExitStatus()
   231  		if status.Signaled() {
   232  			const exitSignalBase = 128
   233  			signal = int(status.Signal())
   234  			exitCode = exitSignalBase + signal
   235  		}
   236  	}
   237  
   238  	l.exitState = &ProcessState{
   239  		Pid:      ps.Pid(),
   240  		ExitCode: exitCode,
   241  		Signal:   signal,
   242  		Time:     time.Now(),
   243  	}
   244  }
   245  
   246  // Shutdown stops all processes started and cleans up any resources
   247  // created (such as mountpoints, devices, etc).
   248  func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error {
   249  	if l.container == nil {
   250  		return nil
   251  	}
   252  
   253  	status, err := l.container.Status()
   254  	if err != nil {
   255  		return err
   256  	}
   257  
   258  	defer l.container.Destroy()
   259  
   260  	if status == libcontainer.Stopped {
   261  		return nil
   262  	}
   263  
   264  	if grace > 0 {
   265  		if signal == "" {
   266  			signal = "SIGINT"
   267  		}
   268  
   269  		sig, ok := signals.SignalLookup[signal]
   270  		if !ok {
   271  			return fmt.Errorf("error unknown signal given for shutdown: %s", signal)
   272  		}
   273  
   274  		// Signal initial container processes only during graceful
   275  		// shutdown; hence `false` arg.
   276  		err = l.container.Signal(sig, false)
   277  		if err != nil {
   278  			return err
   279  		}
   280  
   281  		select {
   282  		case <-l.userProcExited:
   283  			return nil
   284  		case <-time.After(grace):
   285  			// Force kill all container processes after grace period,
   286  			// hence `true` argument.
   287  			if err := l.container.Signal(os.Kill, true); err != nil {
   288  				return err
   289  			}
   290  		}
   291  	} else {
   292  		err := l.container.Signal(os.Kill, true)
   293  		if err != nil {
   294  			return err
   295  		}
   296  	}
   297  
   298  	select {
   299  	case <-l.userProcExited:
   300  		return nil
   301  	case <-time.After(time.Second * 15):
   302  		return fmt.Errorf("process failed to exit after 15 seconds")
   303  	}
   304  }
   305  
// UpdateResources updates the resource isolation with new values to be enforced.
//
// In this executor it is a no-op: the argument is ignored and nil is always
// returned.
func (l *LibcontainerExecutor) UpdateResources(resources *drivers.Resources) error {
	return nil
}
   310  
   311  // Version returns the api version of the executor
   312  func (l *LibcontainerExecutor) Version() (*ExecutorVersion, error) {
   313  	return &ExecutorVersion{Version: ExecutorVersionLatest}, nil
   314  }
   315  
   316  // Stats returns the resource statistics for processes managed by the executor
   317  func (l *LibcontainerExecutor) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
   318  	ch := make(chan *cstructs.TaskResourceUsage)
   319  	go l.handleStats(ch, ctx, interval)
   320  	return ch, nil
   321  
   322  }
   323  
   324  func (l *LibcontainerExecutor) handleStats(ch chan *cstructs.TaskResourceUsage, ctx context.Context, interval time.Duration) {
   325  	defer close(ch)
   326  	timer := time.NewTimer(0)
   327  
   328  	measuredMemStats := ExecutorCgroupV1MeasuredMemStats
   329  	if cgroups.IsCgroup2UnifiedMode() {
   330  		measuredMemStats = ExecutorCgroupV2MeasuredMemStats
   331  	}
   332  
   333  	for {
   334  		select {
   335  		case <-ctx.Done():
   336  			return
   337  
   338  		case <-timer.C:
   339  			timer.Reset(interval)
   340  		}
   341  
   342  		lstats, err := l.container.Stats()
   343  		if err != nil {
   344  			l.logger.Warn("error collecting stats", "error", err)
   345  			return
   346  		}
   347  
   348  		pidStats, err := l.pidCollector.pidStats()
   349  		if err != nil {
   350  			l.logger.Warn("error collecting stats", "error", err)
   351  			return
   352  		}
   353  
   354  		ts := time.Now()
   355  		stats := lstats.CgroupStats
   356  
   357  		// Memory Related Stats
   358  		swap := stats.MemoryStats.SwapUsage
   359  		maxUsage := stats.MemoryStats.Usage.MaxUsage
   360  		rss := stats.MemoryStats.Stats["rss"]
   361  		cache := stats.MemoryStats.Stats["cache"]
   362  		mapped_file := stats.MemoryStats.Stats["mapped_file"]
   363  		ms := &cstructs.MemoryStats{
   364  			RSS:            rss,
   365  			Cache:          cache,
   366  			Swap:           swap.Usage,
   367  			MappedFile:     mapped_file,
   368  			Usage:          stats.MemoryStats.Usage.Usage,
   369  			MaxUsage:       maxUsage,
   370  			KernelUsage:    stats.MemoryStats.KernelUsage.Usage,
   371  			KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage,
   372  			Measured:       measuredMemStats,
   373  		}
   374  
   375  		// CPU Related Stats
   376  		totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage)
   377  		userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode)
   378  		kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode)
   379  
   380  		totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage)
   381  		cs := &cstructs.CpuStats{
   382  			SystemMode:       l.systemCpuStats.Percent(kernelModeTime),
   383  			UserMode:         l.userCpuStats.Percent(userModeTime),
   384  			Percent:          totalPercent,
   385  			ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods,
   386  			ThrottledTime:    stats.CpuStats.ThrottlingData.ThrottledTime,
   387  			TotalTicks:       l.systemCpuStats.TicksConsumed(totalPercent),
   388  			Measured:         ExecutorCgroupMeasuredCpuStats,
   389  		}
   390  		taskResUsage := cstructs.TaskResourceUsage{
   391  			ResourceUsage: &cstructs.ResourceUsage{
   392  				MemoryStats: ms,
   393  				CpuStats:    cs,
   394  			},
   395  			Timestamp: ts.UTC().UnixNano(),
   396  			Pids:      pidStats,
   397  		}
   398  
   399  		select {
   400  		case <-ctx.Done():
   401  			return
   402  		case ch <- &taskResUsage:
   403  		}
   404  
   405  	}
   406  }
   407  
   408  // Signal sends a signal to the process managed by the executor
   409  func (l *LibcontainerExecutor) Signal(s os.Signal) error {
   410  	return l.userProc.Signal(s)
   411  }
   412  
   413  // Exec starts an additional process inside the container
   414  func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) {
   415  	combined := append([]string{cmd}, args...)
   416  	// Capture output
   417  	buf, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize))
   418  
   419  	process := &libcontainer.Process{
   420  		Args:   combined,
   421  		Env:    l.command.Env,
   422  		Stdout: buf,
   423  		Stderr: buf,
   424  	}
   425  
   426  	err := l.container.Run(process)
   427  	if err != nil {
   428  		return nil, 0, err
   429  	}
   430  
   431  	waitCh := make(chan *waitResult)
   432  	defer close(waitCh)
   433  	go l.handleExecWait(waitCh, process)
   434  
   435  	select {
   436  	case result := <-waitCh:
   437  		ps := result.ps
   438  		if result.err != nil {
   439  			if exitErr, ok := result.err.(*exec.ExitError); ok {
   440  				ps = exitErr.ProcessState
   441  			} else {
   442  				return nil, 0, result.err
   443  			}
   444  		}
   445  		var exitCode int
   446  		if status, ok := ps.Sys().(syscall.WaitStatus); ok {
   447  			exitCode = status.ExitStatus()
   448  		}
   449  		return buf.Bytes(), exitCode, nil
   450  
   451  	case <-time.After(time.Until(deadline)):
   452  		process.Signal(os.Kill)
   453  		return nil, 0, context.DeadlineExceeded
   454  	}
   455  
   456  }
   457  
   458  func (l *LibcontainerExecutor) newTerminalSocket() (pty func() (*os.File, error), tty *os.File, err error) {
   459  	parent, child, err := lutils.NewSockPair("socket")
   460  	if err != nil {
   461  		return nil, nil, fmt.Errorf("failed to create terminal: %v", err)
   462  	}
   463  
   464  	return func() (*os.File, error) { return lutils.RecvFd(parent) }, child, err
   465  
   466  }
   467  
   468  func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool,
   469  	stream drivers.ExecTaskStream) error {
   470  
   471  	// the task process will be started by the container
   472  	process := &libcontainer.Process{
   473  		Args: cmd,
   474  		Env:  l.userProc.Env,
   475  		User: l.userProc.User,
   476  		Init: false,
   477  		Cwd:  "/",
   478  	}
   479  
   480  	execHelper := &execHelper{
   481  		logger: l.logger,
   482  
   483  		newTerminal: l.newTerminalSocket,
   484  		setTTY: func(tty *os.File) error {
   485  			process.ConsoleSocket = tty
   486  			return nil
   487  		},
   488  		setIO: func(stdin io.Reader, stdout, stderr io.Writer) error {
   489  			process.Stdin = stdin
   490  			process.Stdout = stdout
   491  			process.Stderr = stderr
   492  			return nil
   493  		},
   494  
   495  		processStart: func() error { return l.container.Run(process) },
   496  		processWait: func() (*os.ProcessState, error) {
   497  			return process.Wait()
   498  		},
   499  	}
   500  
   501  	return execHelper.run(ctx, tty, stream)
   502  
   503  }
   504  
// waitResult carries the outcome of a process Wait call across a channel.
type waitResult struct {
	// ps is the process state returned by Wait.
	ps  *os.ProcessState
	// err is the error returned by Wait.
	err error
}
   509  
   510  func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) {
   511  	ps, err := process.Wait()
   512  	ch <- &waitResult{ps, err}
   513  }
   514  
   515  func configureCapabilities(cfg *lconfigs.Config, command *ExecCommand) {
   516  	switch command.User {
   517  	case "root":
   518  		// when running as root, use the legacy set of system capabilities, so
   519  		// that we do not break existing nomad clusters using this "feature"
   520  		legacyCaps := capabilities.LegacySupported().Slice(true)
   521  		cfg.Capabilities = &lconfigs.Capabilities{
   522  			Bounding:    legacyCaps,
   523  			Permitted:   legacyCaps,
   524  			Effective:   legacyCaps,
   525  			Ambient:     nil,
   526  			Inheritable: nil,
   527  		}
   528  	default:
   529  		// otherwise apply the plugin + task capability configuration
   530  		cfg.Capabilities = &lconfigs.Capabilities{
   531  			Bounding: command.Capabilities,
   532  		}
   533  	}
   534  }
   535  
   536  func configureNamespaces(pidMode, ipcMode string) lconfigs.Namespaces {
   537  	namespaces := lconfigs.Namespaces{{Type: lconfigs.NEWNS}}
   538  	if pidMode == IsolationModePrivate {
   539  		namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWPID})
   540  	}
   541  	if ipcMode == IsolationModePrivate {
   542  		namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWIPC})
   543  	}
   544  	return namespaces
   545  }
   546  
   547  // configureIsolation prepares the isolation primitives of the container.
   548  // The process runs in a container configured with the following:
   549  //
   550  // * the task directory as the chroot
   551  // * dedicated mount points namespace, but shares the PID, User, domain, network namespaces with host
   552  // * small subset of devices (e.g. stdout/stderr/stdin, tty, shm, pts); default to using the same set of devices as Docker
   553  // * some special filesystems: `/proc`, `/sys`.  Some case is given to avoid exec escaping or setting malicious values through them.
   554  func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error {
   555  	defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
   556  
   557  	// set the new root directory for the container
   558  	cfg.Rootfs = command.TaskDir
   559  
   560  	// disable pivot_root if set in the driver's configuration
   561  	cfg.NoPivotRoot = command.NoPivotRoot
   562  
   563  	// set up default namespaces as configured
   564  	cfg.Namespaces = configureNamespaces(command.ModePID, command.ModeIPC)
   565  
   566  	if command.NetworkIsolation != nil {
   567  		cfg.Namespaces = append(cfg.Namespaces, lconfigs.Namespace{
   568  			Type: lconfigs.NEWNET,
   569  			Path: command.NetworkIsolation.Path,
   570  		})
   571  	}
   572  
   573  	// paths to mask using a bind mount to /dev/null to prevent reading
   574  	cfg.MaskPaths = []string{
   575  		"/proc/kcore",
   576  		"/sys/firmware",
   577  	}
   578  
   579  	// paths that should be remounted as readonly inside the container
   580  	cfg.ReadonlyPaths = []string{
   581  		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
   582  	}
   583  
   584  	cfg.Devices = specconv.AllowedDevices
   585  	if len(command.Devices) > 0 {
   586  		devs, err := cmdDevices(command.Devices)
   587  		if err != nil {
   588  			return err
   589  		}
   590  		cfg.Devices = append(cfg.Devices, devs...)
   591  	}
   592  
   593  	cfg.Mounts = []*lconfigs.Mount{
   594  		{
   595  			Source:      "tmpfs",
   596  			Destination: "/dev",
   597  			Device:      "tmpfs",
   598  			Flags:       syscall.MS_NOSUID | syscall.MS_STRICTATIME,
   599  			Data:        "mode=755",
   600  		},
   601  		{
   602  			Source:      "proc",
   603  			Destination: "/proc",
   604  			Device:      "proc",
   605  			Flags:       defaultMountFlags,
   606  		},
   607  		{
   608  			Source:      "devpts",
   609  			Destination: "/dev/pts",
   610  			Device:      "devpts",
   611  			Flags:       syscall.MS_NOSUID | syscall.MS_NOEXEC,
   612  			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
   613  		},
   614  		{
   615  			Device:      "tmpfs",
   616  			Source:      "shm",
   617  			Destination: "/dev/shm",
   618  			Data:        "mode=1777,size=65536k",
   619  			Flags:       defaultMountFlags,
   620  		},
   621  		{
   622  			Source:      "mqueue",
   623  			Destination: "/dev/mqueue",
   624  			Device:      "mqueue",
   625  			Flags:       defaultMountFlags,
   626  		},
   627  		{
   628  			Source:      "sysfs",
   629  			Destination: "/sys",
   630  			Device:      "sysfs",
   631  			Flags:       defaultMountFlags | syscall.MS_RDONLY,
   632  		},
   633  	}
   634  
   635  	if len(command.Mounts) > 0 {
   636  		cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...)
   637  	}
   638  
   639  	return nil
   640  }
   641  
   642  func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error {
   643  	// If resources are not limited then manually create cgroups needed
   644  	if !command.ResourceLimits {
   645  		return cgutil.ConfigureBasicCgroups(cfg)
   646  	}
   647  
   648  	// set cgroups path
   649  	if cgutil.UseV2 {
   650  		// in v2, the cgroup must have been created by the client already,
   651  		// which breaks a lot of existing tests that run drivers without a client
   652  		if command.Resources == nil || command.Resources.LinuxResources == nil || command.Resources.LinuxResources.CpusetCgroupPath == "" {
   653  			return errors.New("cgroup path must be set")
   654  		}
   655  		parent, cgroup := cgutil.SplitPath(command.Resources.LinuxResources.CpusetCgroupPath)
   656  		cfg.Cgroups.Path = filepath.Join("/", parent, cgroup)
   657  	} else {
   658  		// in v1, the cgroup is created using /nomad, which is a bug because it
   659  		// does not respect the cgroup_parent client configuration
   660  		// (but makes testing easy)
   661  		id := uuid.Generate()
   662  		cfg.Cgroups.Path = filepath.Join("/", cgutil.DefaultCgroupV1Parent, id)
   663  	}
   664  
   665  	if command.Resources == nil || command.Resources.NomadResources == nil {
   666  		return nil
   667  	}
   668  
   669  	// Total amount of memory allowed to consume
   670  	res := command.Resources.NomadResources
   671  	memHard, memSoft := res.Memory.MemoryMaxMB, res.Memory.MemoryMB
   672  	if memHard <= 0 {
   673  		memHard = res.Memory.MemoryMB
   674  		memSoft = 0
   675  	}
   676  
   677  	if memHard > 0 {
   678  		cfg.Cgroups.Resources.Memory = memHard * 1024 * 1024
   679  		cfg.Cgroups.Resources.MemoryReservation = memSoft * 1024 * 1024
   680  
   681  		// Disable swap to avoid issues on the machine
   682  		var memSwappiness uint64
   683  		cfg.Cgroups.Resources.MemorySwappiness = &memSwappiness
   684  	}
   685  
   686  	cpuShares := res.Cpu.CpuShares
   687  	if cpuShares < 2 {
   688  		return fmt.Errorf("resources.Cpu.CpuShares must be equal to or greater than 2: %v", cpuShares)
   689  	}
   690  
   691  	// Set the relative CPU shares for this cgroup, and convert for cgroupv2
   692  	cfg.Cgroups.Resources.CpuShares = uint64(cpuShares)
   693  	cfg.Cgroups.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(uint64(cpuShares))
   694  
   695  	if command.Resources.LinuxResources != nil && command.Resources.LinuxResources.CpusetCgroupPath != "" {
   696  		cfg.Hooks = lconfigs.Hooks{
   697  			lconfigs.CreateRuntime: lconfigs.HookList{
   698  				newSetCPUSetCgroupHook(command.Resources.LinuxResources.CpusetCgroupPath),
   699  			},
   700  		}
   701  	}
   702  
   703  	return nil
   704  }
   705  
   706  func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) {
   707  	cfg := &lconfigs.Config{
   708  		Cgroups: &lconfigs.Cgroup{
   709  			Resources: &lconfigs.Resources{
   710  				MemorySwappiness: nil,
   711  			},
   712  		},
   713  		Version: "1.0.0",
   714  	}
   715  
   716  	for _, device := range specconv.AllowedDevices {
   717  		cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.Rule)
   718  	}
   719  
   720  	configureCapabilities(cfg, command)
   721  
   722  	// children should not inherit Nomad agent oom_score_adj value
   723  	oomScoreAdj := 0
   724  	cfg.OomScoreAdj = &oomScoreAdj
   725  
   726  	if err := configureIsolation(cfg, command); err != nil {
   727  		return nil, err
   728  	}
   729  
   730  	if err := configureCgroups(cfg, command); err != nil {
   731  		return nil, err
   732  	}
   733  
   734  	return cfg, nil
   735  }
   736  
   737  // cmdDevices converts a list of driver.DeviceConfigs into excutor.Devices.
   738  func cmdDevices(driverDevices []*drivers.DeviceConfig) ([]*devices.Device, error) {
   739  	if len(driverDevices) == 0 {
   740  		return nil, nil
   741  	}
   742  
   743  	r := make([]*devices.Device, len(driverDevices))
   744  
   745  	for i, d := range driverDevices {
   746  		ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions)
   747  		if err != nil {
   748  			return nil, fmt.Errorf("failed to make device out for %s: %v", d.HostPath, err)
   749  		}
   750  		ed.Path = d.TaskPath
   751  		r[i] = ed
   752  	}
   753  
   754  	return r, nil
   755  }
   756  
   757  var userMountToUnixMount = map[string]int{
   758  	// Empty string maps to `rprivate` for backwards compatibility in restored
   759  	// older tasks, where mount propagation will not be present.
   760  	"":                                       unix.MS_PRIVATE | unix.MS_REC, // rprivate
   761  	structs.VolumeMountPropagationPrivate:    unix.MS_PRIVATE | unix.MS_REC, // rprivate
   762  	structs.VolumeMountPropagationHostToTask: unix.MS_SLAVE | unix.MS_REC,   // rslave
   763  	structs.VolumeMountPropagationBidirectional: unix.MS_SHARED | unix.MS_REC, // rshared
   764  }
   765  
   766  // cmdMounts converts a list of driver.MountConfigs into excutor.Mounts.
   767  func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount {
   768  	if len(mounts) == 0 {
   769  		return nil
   770  	}
   771  
   772  	r := make([]*lconfigs.Mount, len(mounts))
   773  
   774  	for i, m := range mounts {
   775  		flags := unix.MS_BIND
   776  		if m.Readonly {
   777  			flags |= unix.MS_RDONLY
   778  		}
   779  
   780  		r[i] = &lconfigs.Mount{
   781  			Source:           m.HostPath,
   782  			Destination:      m.TaskPath,
   783  			Device:           "bind",
   784  			Flags:            flags,
   785  			PropagationFlags: []int{userMountToUnixMount[m.PropagationMode]},
   786  		}
   787  	}
   788  
   789  	return r
   790  }
   791  
   792  // lookupTaskBin finds the file `bin`, searching in order:
   793  //   - taskDir/local
   794  //   - taskDir
   795  //   - each mount, in order listed in the jobspec
   796  //   - a PATH-like search of usr/local/bin/, usr/bin/, and bin/ inside the taskDir
   797  //
   798  // Returns an absolute path inside the container that will get passed as arg[0]
   799  // to the launched process, and the absolute path to that binary as seen by the
   800  // host (these will be identical for binaries that don't come from mounts).
   801  //
   802  // See also executor.lookupBin for a version used by non-isolated drivers.
   803  func lookupTaskBin(command *ExecCommand) (string, string, error) {
   804  	cmd := command.Cmd
   805  
   806  	taskPath, hostPath, err := lookupBinFile(command, cmd)
   807  	if err == nil {
   808  		return taskPath, hostPath, nil
   809  	}
   810  
   811  	if !strings.Contains(cmd, "/") {
   812  		// Look up also in /bin
   813  		bin := filepath.Join("/bin", cmd)
   814  		taskPath, hostPath, err = lookupBinFile(command, bin)
   815  		if err == nil {
   816  			return taskPath, hostPath, nil
   817  		}
   818  
   819  		return "", "", fmt.Errorf("file %s not found in task dir or in mounts, even when looking up /bin", cmd)
   820  	} else {
   821  		// If there's a / in the binary's path, we can't fallback to a PATH search
   822  		return "", "", fmt.Errorf("file %s not found in task dir or in mounts", cmd)
   823  	}
   824  
   825  }
   826  
   827  func lookupBinFile(command *ExecCommand, bin string) (string, string, error) {
   828  	taskDir := command.TaskDir
   829  
   830  	// Check in the local directory
   831  	localDir := filepath.Join(taskDir, allocdir.TaskLocal)
   832  	taskPath, hostPath, err := getPathInTaskDir(taskDir, localDir, bin)
   833  	if err == nil {
   834  		return taskPath, hostPath, nil
   835  	}
   836  
   837  	// Check at the root of the task's directory
   838  	taskPath, hostPath, err = getPathInTaskDir(taskDir, taskDir, bin)
   839  	if err == nil {
   840  		return taskPath, hostPath, nil
   841  	}
   842  
   843  	// Check in our mounts
   844  	for _, mount := range command.Mounts {
   845  		taskPath, hostPath, err = getPathInMount(mount.HostPath, mount.TaskPath, bin)
   846  		if err == nil {
   847  			return taskPath, hostPath, nil
   848  		}
   849  	}
   850  
   851  	return "", "", fmt.Errorf("file %s not found in task dir or in mounts", bin)
   852  }
   853  
   854  // getPathInTaskDir searches for the binary in the task directory and nested
   855  // search directory. It returns the absolute path rooted inside the container
   856  // and the absolute path on the host.
   857  func getPathInTaskDir(taskDir, searchDir, bin string) (string, string, error) {
   858  
   859  	hostPath := filepath.Join(searchDir, bin)
   860  	err := filepathIsRegular(hostPath)
   861  	if err != nil {
   862  		return "", "", err
   863  	}
   864  
   865  	// Find the path relative to the task directory
   866  	rel, err := filepath.Rel(taskDir, hostPath)
   867  	if rel == "" || err != nil {
   868  		return "", "", fmt.Errorf(
   869  			"failed to determine relative path base=%q target=%q: %v",
   870  			taskDir, hostPath, err)
   871  	}
   872  
   873  	// Turn relative-to-taskdir path into re-rooted absolute path to avoid
   874  	// libcontainer trying to resolve the binary using $PATH.
   875  	// Do *not* use filepath.Join as it will translate ".."s returned by
   876  	// filepath.Rel. Prepending "/" will cause the path to be rooted in the
   877  	// chroot which is the desired behavior.
   878  	return filepath.Clean("/" + rel), hostPath, nil
   879  }
   880  
   881  // getPathInMount for the binary in the mount's host path, constructing the path
   882  // considering that the bin path is rooted in the mount's task path and not its
   883  // host path. It returns the absolute path rooted inside the container and the
   884  // absolute path on the host.
   885  func getPathInMount(mountHostPath, mountTaskPath, bin string) (string, string, error) {
   886  
   887  	// Find the path relative to the mount point in the task so that we can
   888  	// trim off any shared prefix when we search on the host path
   889  	mountRel, err := filepath.Rel(mountTaskPath, bin)
   890  	if mountRel == "" || err != nil {
   891  		return "", "", fmt.Errorf("path was not relative to the mount task path")
   892  	}
   893  
   894  	hostPath := filepath.Join(mountHostPath, mountRel)
   895  
   896  	err = filepathIsRegular(hostPath)
   897  	if err != nil {
   898  		return "", "", err
   899  	}
   900  
   901  	// Turn relative-to-taskdir path into re-rooted absolute path to avoid
   902  	// libcontainer trying to resolve the binary using $PATH.
   903  	// Do *not* use filepath.Join as it will translate ".."s returned by
   904  	// filepath.Rel. Prepending "/" will cause the path to be rooted in the
   905  	// chroot which is the desired behavior.
   906  	return filepath.Clean("/" + bin), hostPath, nil
   907  }
   908  
   909  // filepathIsRegular verifies that a filepath is a regular file (i.e. not a
   910  // directory, socket, device, etc.)
   911  func filepathIsRegular(path string) error {
   912  	f, err := os.Stat(path)
   913  	if err != nil {
   914  		return err
   915  	}
   916  	if !(f.Mode().Type().IsRegular() || f.Mode().Type() & fs.ModeType == fs.ModeSymlink) {
   917  		return fmt.Errorf("path was not a regular file")
   918  	}
   919  	return nil
   920  }
   921  
   922  func newSetCPUSetCgroupHook(cgroupPath string) lconfigs.Hook {
   923  	return lconfigs.NewFunctionHook(func(state *specs.State) error {
   924  		return cgroups.WriteCgroupProc(cgroupPath, state.Pid)
   925  	})
   926  }