github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/drivers/shared/executor/executor_linux.go (about)

     1  // +build linux
     2  
     3  package executor
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"io"
     9  	"os"
    10  	"os/exec"
    11  	"path"
    12  	"path/filepath"
    13  	"strings"
    14  	"syscall"
    15  	"time"
    16  
    17  	"github.com/armon/circbuf"
    18  	"github.com/hashicorp/consul-template/signals"
    19  	hclog "github.com/hashicorp/go-hclog"
    20  	"github.com/hashicorp/nomad/client/allocdir"
    21  	"github.com/hashicorp/nomad/client/stats"
    22  	cstructs "github.com/hashicorp/nomad/client/structs"
    23  	shelpers "github.com/hashicorp/nomad/helper/stats"
    24  	"github.com/hashicorp/nomad/helper/uuid"
    25  	"github.com/hashicorp/nomad/nomad/structs"
    26  	"github.com/hashicorp/nomad/plugins/drivers"
    27  	"github.com/opencontainers/runc/libcontainer"
    28  	"github.com/opencontainers/runc/libcontainer/cgroups"
    29  	lconfigs "github.com/opencontainers/runc/libcontainer/configs"
    30  	ldevices "github.com/opencontainers/runc/libcontainer/devices"
    31  	lutils "github.com/opencontainers/runc/libcontainer/utils"
    32  	"github.com/syndtr/gocapability/capability"
    33  	"golang.org/x/sys/unix"
    34  )
    35  
    36  const (
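        	// defaultCgroupParent is the parent cgroup path under which per-task cgroups are created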
    37  	defaultCgroupParent = "/nomad"
    38  )
    39  
    40  var (
    41  	// ExecutorCgroupMeasuredMemStats is the list of memory stats captured by the executor
    42  	ExecutorCgroupMeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"}
    43  
    44  	// ExecutorCgroupMeasuredCpuStats is the list of CPU stats captured by the executor
    45  	ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"}
    46  )
    47  
    48  // LibcontainerExecutor implements an Executor with the runc/libcontainer api
    49  type LibcontainerExecutor struct {
    50  	id      string
    51  	command *ExecCommand
    52  
    53  	logger hclog.Logger
    54  
    55  	totalCpuStats  *stats.CpuStats
    56  	userCpuStats   *stats.CpuStats
    57  	systemCpuStats *stats.CpuStats
    58  	pidCollector   *pidCollector
    59  
    60  	container      libcontainer.Container
    61  	userProc       *libcontainer.Process
    62  	userProcExited chan interface{}
    63  	exitState      *ProcessState
    64  }
    65  
    66  func NewExecutorWithIsolation(logger hclog.Logger) Executor {
    67  	logger = logger.Named("isolated_executor")
    68  	if err := shelpers.Init(); err != nil {
    69  		logger.Error("unable to initialize stats", "error", err)
    70  	}
    71  	return &LibcontainerExecutor{
    72  		id:             strings.Replace(uuid.Generate(), "-", "_", -1),
    73  		logger:         logger,
    74  		totalCpuStats:  stats.NewCpuStats(),
    75  		userCpuStats:   stats.NewCpuStats(),
    76  		systemCpuStats: stats.NewCpuStats(),
    77  		pidCollector:   newPidCollector(logger),
    78  	}
    79  }
    80  
    81  // Launch creates a new container in libcontainer and starts a new process with it
    82  func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) {
    83  	l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " "))
    84  
    85  	if command.Resources == nil {
    86  		command.Resources = &drivers.Resources{
    87  			NomadResources: &structs.AllocatedTaskResources{},
    88  		}
    89  	}
    90  
    91  	l.command = command
    92  
    93  	// create a new factory which will store the container state in the allocDir
    94  	factory, err := libcontainer.New(
    95  		path.Join(command.TaskDir, "../alloc/container"),
    96  		libcontainer.Cgroupfs,
    97  		// note that os.Args[0] typically refers to the executor shim,
    98  		// and the arguments after the first are currently ignored
    99  		// until https://github.com/opencontainers/runc/pull/1888 is merged
   100  		libcontainer.InitArgs(os.Args[0], "libcontainer-shim"),
   101  	)
   102  	if err != nil {
   103  		return nil, fmt.Errorf("failed to create factory: %v", err)
   104  	}
   105  
   106  	// A container groups processes under the same isolation enforcement
   107  	containerCfg, err := newLibcontainerConfig(command)
   108  	if err != nil {
   109  		return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err)
   110  	}
   111  
   112  	container, err := factory.Create(l.id, containerCfg)
   113  	if err != nil {
   114  		return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err)
   115  	}
   116  	l.container = container
   117  
   118  	// Look up the binary path and make it executable
   119  	absPath, err := lookupTaskBin(command)
   120  
   121  	if err != nil {
   122  		return nil, err
   123  	}
   124  
   125  	if err := makeExecutable(absPath); err != nil {
   126  		return nil, err
   127  	}
   128  
   129  	path := absPath
   130  
   131  	// Ensure that the path is contained in the chroot, and find it relative to the container
   132  	rel, err := filepath.Rel(command.TaskDir, path)
   133  	if err != nil {
   134  		return nil, fmt.Errorf("failed to determine relative path base=%q target=%q: %v", command.TaskDir, path, err)
   135  	}
   136  
   137  	// Turn relative-to-chroot path into absolute path to avoid
   138  	// libcontainer trying to resolve the binary using $PATH.
   139  	// Do *not* use filepath.Join as it will translate ".."s returned by
   140  	// filepath.Rel. Prepending "/" will cause the path to be rooted in the
   141  	// chroot which is the desired behavior.
   142  	path = "/" + rel
   143  
   144  	combined := append([]string{path}, command.Args...)
   145  	stdout, err := command.Stdout()
   146  	if err != nil {
   147  		return nil, err
   148  	}
   149  	stderr, err := command.Stderr()
   150  	if err != nil {
   151  		return nil, err
   152  	}
   153  
   154  	l.logger.Debug("launching", "command", command.Cmd, "args", strings.Join(command.Args, " "))
   155  
   156  	// the task process will be started by the container
   157  	process := &libcontainer.Process{
   158  		Args:   combined,
   159  		Env:    command.Env,
   160  		Stdout: stdout,
   161  		Stderr: stderr,
   162  		Init:   true,
   163  	}
   164  
   165  	if command.User != "" {
   166  		process.User = command.User
   167  	}
   168  	l.userProc = process
   169  
   170  	l.totalCpuStats = stats.NewCpuStats()
   171  	l.userCpuStats = stats.NewCpuStats()
   172  	l.systemCpuStats = stats.NewCpuStats()
   173  
   174  	// Starts the task
   175  	if err := container.Run(process); err != nil {
   176  		container.Destroy()
   177  		return nil, err
   178  	}
   179  
   180  	pid, err := process.Pid()
   181  	if err != nil {
   182  		container.Destroy()
   183  		return nil, err
   184  	}
   185  
   186  	// start a goroutine to wait on the process to complete, so Wait calls can
   187  	// be multiplexed
   188  	l.userProcExited = make(chan interface{})
   189  	go l.pidCollector.collectPids(l.userProcExited, l.getAllPids)
   190  	go l.wait()
   191  
   192  	return &ProcessState{
   193  		Pid:      pid,
   194  		ExitCode: -1,
   195  		Time:     time.Now(),
   196  	}, nil
   197  }
   198  
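        // getAllPids returns the PIDs of all processes currently running in the
        // container, each paired with fresh CPU stats trackers for the pid collector.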
   199  func (l *LibcontainerExecutor) getAllPids() (map[int]*nomadPid, error) {
   200  	pids, err := l.container.Processes()
   201  	if err != nil {
   202  		return nil, err
   203  	}
   204  	nPids := make(map[int]*nomadPid)
   205  	for _, pid := range pids {
   206  		nPids[pid] = &nomadPid{
   207  			pid:           pid,
   208  			cpuStatsTotal: stats.NewCpuStats(),
   209  			cpuStatsUser:  stats.NewCpuStats(),
   210  			cpuStatsSys:   stats.NewCpuStats(),
   211  		}
   212  	}
   213  	return nPids, nil
   214  }
   215  
   216  // Wait waits until the process has exited and returns its exit code and any error
   217  func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) {
   218  	select {
   219  	case <-ctx.Done():
   220  		return nil, ctx.Err()
   221  	case <-l.userProcExited:
   222  		return l.exitState, nil
   223  	}
   224  }
   225  
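        // wait blocks on the user process, records its exit state, and closes
        // userProcExited so that Wait and Shutdown callers are unblocked.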
   226  func (l *LibcontainerExecutor) wait() {
   227  	defer close(l.userProcExited)
   228  
   229  	ps, err := l.userProc.Wait()
   230  	if err != nil {
   231  		// If the process exited before we called Wait, an error is returned
   232  		// and the process state is embedded in the error.
   233  		if exitErr, ok := err.(*exec.ExitError); ok {
   234  			ps = exitErr.ProcessState
   235  		} else {
   236  			l.logger.Error("failed to call wait on user process", "error", err)
   237  			l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()}
   238  			return
   239  		}
   240  	}
   241  
   242  	l.command.Close()
   243  
   244  	exitCode := 1
   245  	var signal int
   246  	if status, ok := ps.Sys().(syscall.WaitStatus); ok {
   247  		exitCode = status.ExitStatus()
   248  		if status.Signaled() {
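        			// follow the common shell convention of reporting a
        			// signal-terminated process as 128 + the signal number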
   249  			const exitSignalBase = 128
   250  			signal = int(status.Signal())
   251  			exitCode = exitSignalBase + signal
   252  		}
   253  	}
   254  
   255  	l.exitState = &ProcessState{
   256  		Pid:      ps.Pid(),
   257  		ExitCode: exitCode,
   258  		Signal:   signal,
   259  		Time:     time.Now(),
   260  	}
   261  }
   262  
   263  // Shutdown stops all processes started and cleans up any resources
   264  // created (such as mountpoints, devices, etc).
   265  func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error {
   266  	if l.container == nil {
   267  		return nil
   268  	}
   269  
   270  	status, err := l.container.Status()
   271  	if err != nil {
   272  		return err
   273  	}
   274  
   275  	defer l.container.Destroy()
   276  
   277  	if status == libcontainer.Stopped {
   278  		return nil
   279  	}
   280  
   281  	if grace > 0 {
   282  		if signal == "" {
   283  			signal = "SIGINT"
   284  		}
   285  
   286  		sig, ok := signals.SignalLookup[signal]
   287  		if !ok {
   288  			return fmt.Errorf("error unknown signal given for shutdown: %s", signal)
   289  		}
   290  
   291  		// Signal initial container processes only during graceful
   292  		// shutdown; hence `false` arg.
   293  		err = l.container.Signal(sig, false)
   294  		if err != nil {
   295  			return err
   296  		}
   297  
   298  		select {
   299  		case <-l.userProcExited:
   300  			return nil
   301  		case <-time.After(grace):
   302  			// Force kill all container processes after grace period,
   303  			// hence `true` argument.
   304  			if err := l.container.Signal(os.Kill, true); err != nil {
   305  				return err
   306  			}
   307  		}
   308  	} else {
   309  		if err := l.container.Signal(os.Kill, true); err != nil {
   310  			return err
   311  		}
   312  	}
   313  
   314  	select {
   315  	case <-l.userProcExited:
   316  		return nil
   317  	case <-time.After(time.Second * 15):
   318  		return fmt.Errorf("process failed to exit after 15 seconds")
   319  	}
   320  }
   321  
   322  // UpdateResources updates the resource isolation with new values to be enforced; it is currently a no-op for the libcontainer executor
   323  func (l *LibcontainerExecutor) UpdateResources(resources *drivers.Resources) error {
   324  	return nil
   325  }
   326  
   327  // Version returns the API version of the executor
   328  func (l *LibcontainerExecutor) Version() (*ExecutorVersion, error) {
   329  	return &ExecutorVersion{Version: ExecutorVersionLatest}, nil
   330  }
   331  
   332  // Stats returns the resource statistics for processes managed by the executor
   333  func (l *LibcontainerExecutor) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
   334  	ch := make(chan *cstructs.TaskResourceUsage)
   335  	go l.handleStats(ch, ctx, interval)
   336  	return ch, nil
   337  
   338  }
   339  
   340  func (l *LibcontainerExecutor) handleStats(ch chan *cstructs.TaskResourceUsage, ctx context.Context, interval time.Duration) {
   341  	defer close(ch)
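        	// a zero-duration timer fires immediately, so the first sample is
        	// collected right away before settling into the requested interval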
   342  	timer := time.NewTimer(0)
   343  	for {
   344  		select {
   345  		case <-ctx.Done():
   346  			return
   347  
   348  		case <-timer.C:
   349  			timer.Reset(interval)
   350  		}
   351  
   352  		lstats, err := l.container.Stats()
   353  		if err != nil {
   354  			l.logger.Warn("error collecting stats", "error", err)
   355  			return
   356  		}
   357  
   358  		pidStats, err := l.pidCollector.pidStats()
   359  		if err != nil {
   360  			l.logger.Warn("error collecting stats", "error", err)
   361  			return
   362  		}
   363  
   364  		ts := time.Now()
   365  		stats := lstats.CgroupStats
   366  
   367  		// Memory Related Stats
   368  		swap := stats.MemoryStats.SwapUsage
   369  		maxUsage := stats.MemoryStats.Usage.MaxUsage
   370  		rss := stats.MemoryStats.Stats["rss"]
   371  		cache := stats.MemoryStats.Stats["cache"]
   372  		ms := &cstructs.MemoryStats{
   373  			RSS:            rss,
   374  			Cache:          cache,
   375  			Swap:           swap.Usage,
   376  			Usage:          stats.MemoryStats.Usage.Usage,
   377  			MaxUsage:       maxUsage,
   378  			KernelUsage:    stats.MemoryStats.KernelUsage.Usage,
   379  			KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage,
   380  			Measured:       ExecutorCgroupMeasuredMemStats,
   381  		}
   382  
   383  		// CPU Related Stats
   384  		totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage)
   385  		userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode)
   386  		kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode)
   387  
   388  		totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage)
   389  		cs := &cstructs.CpuStats{
   390  			SystemMode:       l.systemCpuStats.Percent(kernelModeTime),
   391  			UserMode:         l.userCpuStats.Percent(userModeTime),
   392  			Percent:          totalPercent,
   393  			ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods,
   394  			ThrottledTime:    stats.CpuStats.ThrottlingData.ThrottledTime,
   395  			TotalTicks:       l.systemCpuStats.TicksConsumed(totalPercent),
   396  			Measured:         ExecutorCgroupMeasuredCpuStats,
   397  		}
   398  		taskResUsage := cstructs.TaskResourceUsage{
   399  			ResourceUsage: &cstructs.ResourceUsage{
   400  				MemoryStats: ms,
   401  				CpuStats:    cs,
   402  			},
   403  			Timestamp: ts.UTC().UnixNano(),
   404  			Pids:      pidStats,
   405  		}
   406  
   407  		select {
   408  		case <-ctx.Done():
   409  			return
   410  		case ch <- &taskResUsage:
   411  		}
   412  
   413  	}
   414  }
   415  
   416  // Signal sends a signal to the process managed by the executor
   417  func (l *LibcontainerExecutor) Signal(s os.Signal) error {
   418  	return l.userProc.Signal(s)
   419  }
   420  
   421  // Exec starts an additional process inside the container
   422  func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) {
   423  	combined := append([]string{cmd}, args...)
   424  	// Capture output
   425  	buf, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize))
   426  
   427  	process := &libcontainer.Process{
   428  		Args:   combined,
   429  		Env:    l.command.Env,
   430  		Stdout: buf,
   431  		Stderr: buf,
   432  	}
   433  
   434  	err := l.container.Run(process)
   435  	if err != nil {
   436  		return nil, 0, err
   437  	}
   438  
   439  	// buffered so the wait goroutine can still deliver its result if the deadline fires and Exec returns first
   440  	waitCh := make(chan *waitResult, 1)
   441  	go l.handleExecWait(waitCh, process)
   442  
   443  	select {
   444  	case result := <-waitCh:
   445  		ps := result.ps
   446  		if result.err != nil {
   447  			if exitErr, ok := result.err.(*exec.ExitError); ok {
   448  				ps = exitErr.ProcessState
   449  			} else {
   450  				return nil, 0, result.err
   451  			}
   452  		}
   453  		var exitCode int
   454  		if status, ok := ps.Sys().(syscall.WaitStatus); ok {
   455  			exitCode = status.ExitStatus()
   456  		}
   457  		return buf.Bytes(), exitCode, nil
   458  
   459  	case <-time.After(time.Until(deadline)):
   460  		process.Signal(os.Kill)
   461  		return nil, 0, context.DeadlineExceeded
   462  	}
   463  
   464  }
   465  
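        // newTerminalSocket creates a unix socket pair: tty is the child end handed
        // to the container process as its console socket, and the returned pty func
        // receives the pty master file descriptor sent back over the parent end.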
   466  func (l *LibcontainerExecutor) newTerminalSocket() (pty func() (*os.File, error), tty *os.File, err error) {
   467  	parent, child, err := lutils.NewSockPair("socket")
   468  	if err != nil {
   469  		return nil, nil, fmt.Errorf("failed to create terminal: %v", err)
   470  	}
   471  
   472  	return func() (*os.File, error) { return lutils.RecvFd(parent) }, child, err
   473  
   474  }
   475  
   476  func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool,
   477  	stream drivers.ExecTaskStream) error {
   478  
   479  	// the streamed exec process is started inside the task's existing container
   480  	process := &libcontainer.Process{
   481  		Args: cmd,
   482  		Env:  l.userProc.Env,
   483  		User: l.userProc.User,
   484  		Init: false,
   485  		Cwd:  "/",
   486  	}
   487  
   488  	execHelper := &execHelper{
   489  		logger: l.logger,
   490  
   491  		newTerminal: l.newTerminalSocket,
   492  		setTTY: func(tty *os.File) error {
   493  			process.ConsoleSocket = tty
   494  			return nil
   495  		},
   496  		setIO: func(stdin io.Reader, stdout, stderr io.Writer) error {
   497  			process.Stdin = stdin
   498  			process.Stdout = stdout
   499  			process.Stderr = stderr
   500  			return nil
   501  		},
   502  
   503  		processStart: func() error { return l.container.Run(process) },
   504  		processWait: func() (*os.ProcessState, error) {
   505  			return process.Wait()
   506  		},
   507  	}
   508  
   509  	return execHelper.run(ctx, tty, stream)
   510  
   511  }
   512  
   513  type waitResult struct {
   514  	ps  *os.ProcessState
   515  	err error
   516  }
   517  
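        // handleExecWait waits on the exec'd process and delivers its result to ch.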
   518  func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) {
   519  	ps, err := process.Wait()
   520  	ch <- &waitResult{ps, err}
   521  }
   522  
   523  func configureCapabilities(cfg *lconfigs.Config, command *ExecCommand) error {
   524  	// TODO: allow better control of these
   525  	// use the same capabilities list as was used prior to adopting libcontainer in 0.9
   526  	allCaps := supportedCaps()
   527  
   528  	// match capabilities used in Nomad 0.8
   529  	if command.User == "root" {
   530  		cfg.Capabilities = &lconfigs.Capabilities{
   531  			Bounding:    allCaps,
   532  			Permitted:   allCaps,
   533  			Effective:   allCaps,
   534  			Ambient:     nil,
   535  			Inheritable: nil,
   536  		}
   537  	} else {
   538  		cfg.Capabilities = &lconfigs.Capabilities{
   539  			Bounding: allCaps,
   540  		}
   541  	}
   542  
   543  	return nil
   544  }
   545  
   546  // supportedCaps returns a list of all capabilities supported by the kernel
   547  func supportedCaps() []string {
   548  	allCaps := []string{}
   549  	last := capability.CAP_LAST_CAP
   550  	// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
   551  	if last == capability.Cap(63) {
   552  		last = capability.CAP_BLOCK_SUSPEND
   553  	}
   554  	for _, cap := range capability.List() {
   555  		if cap > last {
   556  			continue
   557  		}
   558  		allCaps = append(allCaps, fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String())))
   559  	}
   560  	return allCaps
   561  }
   562  
   563  // configureIsolation prepares the isolation primitives of the container.
   564  // The process runs in a container configured with the following:
   565  //
   566  // * the task directory as the chroot
   567  // * a dedicated mount namespace, while sharing the PID, user, UTS (domain), and network namespaces with the host
   568  // * a small subset of devices (e.g. stdin/stdout/stderr, tty, shm, pts); defaults to the same set of devices as Docker
   569  // * some special filesystems: `/proc`, `/sys`. Some care is given to avoid the exec'd process escaping or setting malicious values through them.
   570  func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error {
   571  	defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
   572  
   573  	// set the new root directory for the container
   574  	cfg.Rootfs = command.TaskDir
   575  
   576  	// disable pivot_root if set in the driver's configuration
   577  	cfg.NoPivotRoot = command.NoPivotRoot
   578  
   579  	// launch with mount namespace
   580  	cfg.Namespaces = lconfigs.Namespaces{
   581  		{Type: lconfigs.NEWNS},
   582  	}
   583  
   584  	if command.NetworkIsolation != nil {
   585  		cfg.Namespaces = append(cfg.Namespaces, lconfigs.Namespace{
   586  			Type: lconfigs.NEWNET,
   587  			Path: command.NetworkIsolation.Path,
   588  		})
   589  	}
   590  
   591  	// paths to mask using a bind mount to /dev/null to prevent reading
   592  	cfg.MaskPaths = []string{
   593  		"/proc/kcore",
   594  		"/sys/firmware",
   595  	}
   596  
   597  	// paths that should be remounted as readonly inside the container
   598  	cfg.ReadonlyPaths = []string{
   599  		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
   600  	}
   601  
   602  	cfg.Devices = lconfigs.DefaultAutoCreatedDevices
   603  	if len(command.Devices) > 0 {
   604  		devs, err := cmdDevices(command.Devices)
   605  		if err != nil {
   606  			return err
   607  		}
   608  		cfg.Devices = append(cfg.Devices, devs...)
   609  	}
   610  
   611  	cfg.Mounts = []*lconfigs.Mount{
   612  		{
   613  			Source:      "tmpfs",
   614  			Destination: "/dev",
   615  			Device:      "tmpfs",
   616  			Flags:       syscall.MS_NOSUID | syscall.MS_STRICTATIME,
   617  			Data:        "mode=755",
   618  		},
   619  		{
   620  			Source:      "proc",
   621  			Destination: "/proc",
   622  			Device:      "proc",
   623  			Flags:       defaultMountFlags,
   624  		},
   625  		{
   626  			Source:      "devpts",
   627  			Destination: "/dev/pts",
   628  			Device:      "devpts",
   629  			Flags:       syscall.MS_NOSUID | syscall.MS_NOEXEC,
   630  			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
   631  		},
   632  		{
   633  			Device:      "tmpfs",
   634  			Source:      "shm",
   635  			Destination: "/dev/shm",
   636  			Data:        "mode=1777,size=65536k",
   637  			Flags:       defaultMountFlags,
   638  		},
   639  		{
   640  			Source:      "mqueue",
   641  			Destination: "/dev/mqueue",
   642  			Device:      "mqueue",
   643  			Flags:       defaultMountFlags,
   644  		},
   645  		{
   646  			Source:      "sysfs",
   647  			Destination: "/sys",
   648  			Device:      "sysfs",
   649  			Flags:       defaultMountFlags | syscall.MS_RDONLY,
   650  		},
   651  	}
   652  
   653  	if len(command.Mounts) > 0 {
   654  		cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...)
   655  	}
   656  
   657  	return nil
   658  }
   659  
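        // configureCgroups sets up the cgroup configuration for the command, applying
        // memory and CPU limits when resource limits are enforced.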
   660  func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error {
   661  
   662  	// If resources are not limited then manually create cgroups needed
   663  	if !command.ResourceLimits {
   664  		return configureBasicCgroups(cfg)
   665  	}
   666  
   667  	id := uuid.Generate()
   668  	cfg.Cgroups.Path = filepath.Join("/", defaultCgroupParent, id)
   669  
   670  	if command.Resources == nil || command.Resources.NomadResources == nil {
   671  		return nil
   672  	}
   673  
   674  	if mb := command.Resources.NomadResources.Memory.MemoryMB; mb > 0 {
   675  		// Total amount of memory the task is allowed to consume
   676  		cfg.Cgroups.Resources.Memory = mb * 1024 * 1024
   677  		// Disable swap to avoid issues on the machine
   678  		var memSwappiness uint64
   679  		cfg.Cgroups.Resources.MemorySwappiness = &memSwappiness
   680  	}
   681  
   682  	cpuShares := command.Resources.NomadResources.Cpu.CpuShares
   683  	if cpuShares < 2 {
   684  		return fmt.Errorf("resources.Cpu.CpuShares must be equal to or greater than 2: %v", cpuShares)
   685  	}
   686  
   687  	// Set the relative CPU shares for this cgroup.
   688  	cfg.Cgroups.Resources.CpuShares = uint64(cpuShares)
   689  
   690  	return nil
   691  }
   692  
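        // configureBasicCgroups creates only a freezer cgroup for the task so that its
        // processes can still be tracked when resource limits are not enforced.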
   693  func configureBasicCgroups(cfg *lconfigs.Config) error {
   694  	id := uuid.Generate()
   695  
   696  	// Manually create freezer cgroup
   697  
   698  	subsystem := "freezer"
   699  
   700  	path, err := getCgroupPathHelper(subsystem, filepath.Join(defaultCgroupParent, id))
   701  	if err != nil {
   702  		return fmt.Errorf("failed to find %s cgroup mountpoint: %v", subsystem, err)
   703  	}
   704  
   705  	if err = os.MkdirAll(path, 0755); err != nil {
   706  		return err
   707  	}
   708  
   709  	cfg.Cgroups.Paths = map[string]string{
   710  		subsystem: path,
   711  	}
   712  	return nil
   713  }
   714  
   715  func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
   716  	mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem)
   717  	if err != nil {
   718  		return "", err
   719  	}
   720  
   721  	// This is needed for nested containers, because in /proc/self/cgroup we
   722  	// see paths from the host, which don't exist in the container.
   723  	relCgroup, err := filepath.Rel(root, cgroup)
   724  	if err != nil {
   725  		return "", err
   726  	}
   727  
   728  	return filepath.Join(mnt, relCgroup), nil
   729  }
   730  
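        // newLibcontainerConfig builds the libcontainer configuration for the command:
        // capabilities, isolation (namespaces, mounts, devices), and cgroups.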
   731  func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) {
   732  	cfg := &lconfigs.Config{
   733  		Cgroups: &lconfigs.Cgroup{
   734  			Resources: &lconfigs.Resources{
   735  				AllowAllDevices:  nil,
   736  				MemorySwappiness: nil,
   737  				AllowedDevices:   lconfigs.DefaultAllowedDevices,
   738  			},
   739  		},
   740  		Version: "1.0.0",
   741  	}
   742  
   743  	if err := configureCapabilities(cfg, command); err != nil {
   744  		return nil, err
   745  	}
   746  	if err := configureIsolation(cfg, command); err != nil {
   747  		return nil, err
   748  	}
   749  	if err := configureCgroups(cfg, command); err != nil {
   750  		return nil, err
   751  	}
   752  	return cfg, nil
   753  }
   754  
   755  // cmdDevices converts a list of drivers.DeviceConfig into libcontainer devices.
   756  func cmdDevices(devices []*drivers.DeviceConfig) ([]*lconfigs.Device, error) {
   757  	if len(devices) == 0 {
   758  		return nil, nil
   759  	}
   760  
   761  	r := make([]*lconfigs.Device, len(devices))
   762  
   763  	for i, d := range devices {
   764  		ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions)
   765  		if err != nil {
   766  			return nil, fmt.Errorf("failed to make device for %s: %v", d.HostPath, err)
   767  		}
   768  		ed.Path = d.TaskPath
   769  		r[i] = ed
   770  	}
   771  
   772  	return r, nil
   773  }
   774  
   775  var userMountToUnixMount = map[string]int{
   776  	// Empty string maps to `rprivate` for backwards compatibility in restored
   777  	// older tasks, where mount propagation will not be present.
   778  	"":                                       unix.MS_PRIVATE | unix.MS_REC, // rprivate
   779  	structs.VolumeMountPropagationPrivate:    unix.MS_PRIVATE | unix.MS_REC, // rprivate
   780  	structs.VolumeMountPropagationHostToTask: unix.MS_SLAVE | unix.MS_REC,   // rslave
   781  	structs.VolumeMountPropagationBidirectional: unix.MS_SHARED | unix.MS_REC, // rshared
   782  }
   783  
   784  // cmdMounts converts a list of drivers.MountConfig into libcontainer mounts.
   785  func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount {
   786  	if len(mounts) == 0 {
   787  		return nil
   788  	}
   789  
   790  	r := make([]*lconfigs.Mount, len(mounts))
   791  
   792  	for i, m := range mounts {
   793  		flags := unix.MS_BIND
   794  		if m.Readonly {
   795  			flags |= unix.MS_RDONLY
   796  		}
   797  
   798  		r[i] = &lconfigs.Mount{
   799  			Source:           m.HostPath,
   800  			Destination:      m.TaskPath,
   801  			Device:           "bind",
   802  			Flags:            flags,
   803  			PropagationFlags: []int{userMountToUnixMount[m.PropagationMode]},
   804  		}
   805  	}
   806  
   807  	return r
   808  }
   809  
   810  // lookupTaskBin finds the command's binary, checking taskDir/local and then taskDir,
   811  // and finally performing a PATH search inside taskDir. It returns an absolute path. See also executor.lookupBin.
   812  func lookupTaskBin(command *ExecCommand) (string, error) {
   813  	taskDir := command.TaskDir
   814  	bin := command.Cmd
   815  
   816  	// Check in the local directory
   817  	localDir := filepath.Join(taskDir, allocdir.TaskLocal)
   818  	local := filepath.Join(localDir, bin)
   819  	if _, err := os.Stat(local); err == nil {
   820  		return local, nil
   821  	}
   822  
   823  	// Check at the root of the task's directory
   824  	root := filepath.Join(taskDir, bin)
   825  	if _, err := os.Stat(root); err == nil {
   826  		return root, nil
   827  	}
   828  
   829  	if strings.Contains(bin, "/") {
   830  		return "", fmt.Errorf("file %s not found under path %s", bin, taskDir)
   831  	}
   832  
   833  	// Otherwise fall back to a PATH search, using a default PATH if one is not set in the task environment
   834  	path := "/usr/local/bin:/usr/bin:/bin"
   835  	for _, e := range command.Env {
   836  		if strings.HasPrefix(e, "PATH=") {
   837  			path = e[5:]
   838  		}
   839  	}
   840  
   841  	return lookPathIn(path, taskDir, bin)
   842  }
   843  
   844  // lookPathIn looks for the file bin in each directory of path, rooted at root. Like exec.LookPath.
   845  func lookPathIn(path string, root string, bin string) (string, error) {
   846  	// walk the PATH entries the same way exec.LookPath does
   847  	for _, dir := range filepath.SplitList(path) {
   848  		if dir == "" {
   849  			// match unix shell behavior, empty path element == .
   850  			dir = "."
   851  		}
   852  		path := filepath.Join(root, dir, bin)
   853  		f, err := os.Stat(path)
   854  		if err != nil {
   855  			continue
   856  		}
   857  		if m := f.Mode(); !m.IsDir() {
   858  			return path, nil
   859  		}
   860  	}
   861  	return "", fmt.Errorf("file %s not found under path %s", bin, root)
   862  }