github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/drivers/shared/executor/executor_linux.go (about)

     1  // +build linux
     2  
     3  package executor
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"io"
     9  	"os"
    10  	"os/exec"
    11  	"path"
    12  	"path/filepath"
    13  	"strings"
    14  	"syscall"
    15  	"time"
    16  
    17  	"github.com/armon/circbuf"
    18  	"github.com/hashicorp/consul-template/signals"
    19  	hclog "github.com/hashicorp/go-hclog"
    20  	"github.com/hashicorp/nomad/client/allocdir"
    21  	"github.com/hashicorp/nomad/client/stats"
    22  	cstructs "github.com/hashicorp/nomad/client/structs"
    23  	shelpers "github.com/hashicorp/nomad/helper/stats"
    24  	"github.com/hashicorp/nomad/helper/uuid"
    25  	"github.com/hashicorp/nomad/nomad/structs"
    26  	"github.com/hashicorp/nomad/plugins/drivers"
    27  	"github.com/opencontainers/runc/libcontainer"
    28  	"github.com/opencontainers/runc/libcontainer/cgroups"
    29  	lconfigs "github.com/opencontainers/runc/libcontainer/configs"
    30  	ldevices "github.com/opencontainers/runc/libcontainer/devices"
    31  	"github.com/opencontainers/runc/libcontainer/specconv"
    32  	lutils "github.com/opencontainers/runc/libcontainer/utils"
    33  	"github.com/syndtr/gocapability/capability"
    34  	"golang.org/x/sys/unix"
    35  )
    36  
    37  const (
    38  	defaultCgroupParent = "/nomad"
    39  )
    40  
    41  var (
    42  	// ExecutorCgroupMeasuredMemStats is the list of memory stats captured by the executor
    43  	ExecutorCgroupMeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"}
    44  
    45  	// ExecutorCgroupMeasuredCpuStats is the list of CPU stats captures by the executor
    46  	ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"}
    47  )
    48  
    49  // LibcontainerExecutor implements an Executor with the runc/libcontainer api
    50  type LibcontainerExecutor struct {
    51  	id      string
    52  	command *ExecCommand
    53  
    54  	logger hclog.Logger
    55  
    56  	totalCpuStats  *stats.CpuStats
    57  	userCpuStats   *stats.CpuStats
    58  	systemCpuStats *stats.CpuStats
    59  	pidCollector   *pidCollector
    60  
    61  	container      libcontainer.Container
    62  	userProc       *libcontainer.Process
    63  	userProcExited chan interface{}
    64  	exitState      *ProcessState
    65  }
    66  
    67  func NewExecutorWithIsolation(logger hclog.Logger) Executor {
    68  	logger = logger.Named("isolated_executor")
    69  	if err := shelpers.Init(); err != nil {
    70  		logger.Error("unable to initialize stats", "error", err)
    71  	}
    72  	return &LibcontainerExecutor{
    73  		id:             strings.Replace(uuid.Generate(), "-", "_", -1),
    74  		logger:         logger,
    75  		totalCpuStats:  stats.NewCpuStats(),
    76  		userCpuStats:   stats.NewCpuStats(),
    77  		systemCpuStats: stats.NewCpuStats(),
    78  		pidCollector:   newPidCollector(logger),
    79  	}
    80  }
    81  
    82  // Launch creates a new container in libcontainer and starts a new process with it
    83  func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) {
    84  	l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " "))
    85  
    86  	if command.Resources == nil {
    87  		command.Resources = &drivers.Resources{
    88  			NomadResources: &structs.AllocatedTaskResources{},
    89  		}
    90  	}
    91  
    92  	l.command = command
    93  
    94  	// create a new factory which will store the container state in the allocDir
    95  	factory, err := libcontainer.New(
    96  		path.Join(command.TaskDir, "../alloc/container"),
    97  		libcontainer.Cgroupfs,
    98  		// note that os.Args[0] refers to the executor shim typically
    99  		// and first args arguments is ignored now due
   100  		// until https://github.com/opencontainers/runc/pull/1888 is merged
   101  		libcontainer.InitArgs(os.Args[0], "libcontainer-shim"),
   102  	)
   103  	if err != nil {
   104  		return nil, fmt.Errorf("failed to create factory: %v", err)
   105  	}
   106  
   107  	// A container groups processes under the same isolation enforcement
   108  	containerCfg, err := newLibcontainerConfig(command)
   109  	if err != nil {
   110  		return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err)
   111  	}
   112  
   113  	container, err := factory.Create(l.id, containerCfg)
   114  	if err != nil {
   115  		return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err)
   116  	}
   117  	l.container = container
   118  
   119  	// Look up the binary path and make it executable
   120  	absPath, err := lookupTaskBin(command)
   121  
   122  	if err != nil {
   123  		return nil, err
   124  	}
   125  
   126  	if err := makeExecutable(absPath); err != nil {
   127  		return nil, err
   128  	}
   129  
   130  	path := absPath
   131  
   132  	// Ensure that the path is contained in the chroot, and find it relative to the container
   133  	rel, err := filepath.Rel(command.TaskDir, path)
   134  	if err != nil {
   135  		return nil, fmt.Errorf("failed to determine relative path base=%q target=%q: %v", command.TaskDir, path, err)
   136  	}
   137  
   138  	// Turn relative-to-chroot path into absolute path to avoid
   139  	// libcontainer trying to resolve the binary using $PATH.
   140  	// Do *not* use filepath.Join as it will translate ".."s returned by
   141  	// filepath.Rel. Prepending "/" will cause the path to be rooted in the
   142  	// chroot which is the desired behavior.
   143  	path = "/" + rel
   144  
   145  	combined := append([]string{path}, command.Args...)
   146  	stdout, err := command.Stdout()
   147  	if err != nil {
   148  		return nil, err
   149  	}
   150  	stderr, err := command.Stderr()
   151  	if err != nil {
   152  		return nil, err
   153  	}
   154  
   155  	l.logger.Debug("launching", "command", command.Cmd, "args", strings.Join(command.Args, " "))
   156  
   157  	// the task process will be started by the container
   158  	process := &libcontainer.Process{
   159  		Args:   combined,
   160  		Env:    command.Env,
   161  		Stdout: stdout,
   162  		Stderr: stderr,
   163  		Init:   true,
   164  	}
   165  
   166  	if command.User != "" {
   167  		process.User = command.User
   168  	}
   169  	l.userProc = process
   170  
   171  	l.totalCpuStats = stats.NewCpuStats()
   172  	l.userCpuStats = stats.NewCpuStats()
   173  	l.systemCpuStats = stats.NewCpuStats()
   174  
   175  	// Starts the task
   176  	if err := container.Run(process); err != nil {
   177  		container.Destroy()
   178  		return nil, err
   179  	}
   180  
   181  	pid, err := process.Pid()
   182  	if err != nil {
   183  		container.Destroy()
   184  		return nil, err
   185  	}
   186  
   187  	// start a goroutine to wait on the process to complete, so Wait calls can
   188  	// be multiplexed
   189  	l.userProcExited = make(chan interface{})
   190  	go l.pidCollector.collectPids(l.userProcExited, l.getAllPids)
   191  	go l.wait()
   192  
   193  	return &ProcessState{
   194  		Pid:      pid,
   195  		ExitCode: -1,
   196  		Time:     time.Now(),
   197  	}, nil
   198  }
   199  
   200  func (l *LibcontainerExecutor) getAllPids() (map[int]*nomadPid, error) {
   201  	pids, err := l.container.Processes()
   202  	if err != nil {
   203  		return nil, err
   204  	}
   205  	nPids := make(map[int]*nomadPid)
   206  	for _, pid := range pids {
   207  		nPids[pid] = &nomadPid{
   208  			pid:           pid,
   209  			cpuStatsTotal: stats.NewCpuStats(),
   210  			cpuStatsUser:  stats.NewCpuStats(),
   211  			cpuStatsSys:   stats.NewCpuStats(),
   212  		}
   213  	}
   214  	return nPids, nil
   215  }
   216  
   217  // Wait waits until a process has exited and returns it's exitcode and errors
   218  func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) {
   219  	select {
   220  	case <-ctx.Done():
   221  		return nil, ctx.Err()
   222  	case <-l.userProcExited:
   223  		return l.exitState, nil
   224  	}
   225  }
   226  
   227  func (l *LibcontainerExecutor) wait() {
   228  	defer close(l.userProcExited)
   229  
   230  	ps, err := l.userProc.Wait()
   231  	if err != nil {
   232  		// If the process has exited before we called wait an error is returned
   233  		// the process state is embedded in the error
   234  		if exitErr, ok := err.(*exec.ExitError); ok {
   235  			ps = exitErr.ProcessState
   236  		} else {
   237  			l.logger.Error("failed to call wait on user process", "error", err)
   238  			l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()}
   239  			return
   240  		}
   241  	}
   242  
   243  	l.command.Close()
   244  
   245  	exitCode := 1
   246  	var signal int
   247  	if status, ok := ps.Sys().(syscall.WaitStatus); ok {
   248  		exitCode = status.ExitStatus()
   249  		if status.Signaled() {
   250  			const exitSignalBase = 128
   251  			signal = int(status.Signal())
   252  			exitCode = exitSignalBase + signal
   253  		}
   254  	}
   255  
   256  	l.exitState = &ProcessState{
   257  		Pid:      ps.Pid(),
   258  		ExitCode: exitCode,
   259  		Signal:   signal,
   260  		Time:     time.Now(),
   261  	}
   262  }
   263  
   264  // Shutdown stops all processes started and cleans up any resources
   265  // created (such as mountpoints, devices, etc).
   266  func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error {
   267  	if l.container == nil {
   268  		return nil
   269  	}
   270  
   271  	status, err := l.container.Status()
   272  	if err != nil {
   273  		return err
   274  	}
   275  
   276  	defer l.container.Destroy()
   277  
   278  	if status == libcontainer.Stopped {
   279  		return nil
   280  	}
   281  
   282  	if grace > 0 {
   283  		if signal == "" {
   284  			signal = "SIGINT"
   285  		}
   286  
   287  		sig, ok := signals.SignalLookup[signal]
   288  		if !ok {
   289  			return fmt.Errorf("error unknown signal given for shutdown: %s", signal)
   290  		}
   291  
   292  		// Signal initial container processes only during graceful
   293  		// shutdown; hence `false` arg.
   294  		err = l.container.Signal(sig, false)
   295  		if err != nil {
   296  			return err
   297  		}
   298  
   299  		select {
   300  		case <-l.userProcExited:
   301  			return nil
   302  		case <-time.After(grace):
   303  			// Force kill all container processes after grace period,
   304  			// hence `true` argument.
   305  			if err := l.container.Signal(os.Kill, true); err != nil {
   306  				return err
   307  			}
   308  		}
   309  	} else {
   310  		err := l.container.Signal(os.Kill, true)
   311  		if err != nil {
   312  			return err
   313  		}
   314  	}
   315  
   316  	select {
   317  	case <-l.userProcExited:
   318  		return nil
   319  	case <-time.After(time.Second * 15):
   320  		return fmt.Errorf("process failed to exit after 15 seconds")
   321  	}
   322  }
   323  
   324  // UpdateResources updates the resource isolation with new values to be enforced
   325  func (l *LibcontainerExecutor) UpdateResources(resources *drivers.Resources) error {
   326  	return nil
   327  }
   328  
   329  // Version returns the api version of the executor
   330  func (l *LibcontainerExecutor) Version() (*ExecutorVersion, error) {
   331  	return &ExecutorVersion{Version: ExecutorVersionLatest}, nil
   332  }
   333  
   334  // Stats returns the resource statistics for processes managed by the executor
   335  func (l *LibcontainerExecutor) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
   336  	ch := make(chan *cstructs.TaskResourceUsage)
   337  	go l.handleStats(ch, ctx, interval)
   338  	return ch, nil
   339  
   340  }
   341  
   342  func (l *LibcontainerExecutor) handleStats(ch chan *cstructs.TaskResourceUsage, ctx context.Context, interval time.Duration) {
   343  	defer close(ch)
   344  	timer := time.NewTimer(0)
   345  	for {
   346  		select {
   347  		case <-ctx.Done():
   348  			return
   349  
   350  		case <-timer.C:
   351  			timer.Reset(interval)
   352  		}
   353  
   354  		lstats, err := l.container.Stats()
   355  		if err != nil {
   356  			l.logger.Warn("error collecting stats", "error", err)
   357  			return
   358  		}
   359  
   360  		pidStats, err := l.pidCollector.pidStats()
   361  		if err != nil {
   362  			l.logger.Warn("error collecting stats", "error", err)
   363  			return
   364  		}
   365  
   366  		ts := time.Now()
   367  		stats := lstats.CgroupStats
   368  
   369  		// Memory Related Stats
   370  		swap := stats.MemoryStats.SwapUsage
   371  		maxUsage := stats.MemoryStats.Usage.MaxUsage
   372  		rss := stats.MemoryStats.Stats["rss"]
   373  		cache := stats.MemoryStats.Stats["cache"]
   374  		ms := &cstructs.MemoryStats{
   375  			RSS:            rss,
   376  			Cache:          cache,
   377  			Swap:           swap.Usage,
   378  			Usage:          stats.MemoryStats.Usage.Usage,
   379  			MaxUsage:       maxUsage,
   380  			KernelUsage:    stats.MemoryStats.KernelUsage.Usage,
   381  			KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage,
   382  			Measured:       ExecutorCgroupMeasuredMemStats,
   383  		}
   384  
   385  		// CPU Related Stats
   386  		totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage)
   387  		userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode)
   388  		kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode)
   389  
   390  		totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage)
   391  		cs := &cstructs.CpuStats{
   392  			SystemMode:       l.systemCpuStats.Percent(kernelModeTime),
   393  			UserMode:         l.userCpuStats.Percent(userModeTime),
   394  			Percent:          totalPercent,
   395  			ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods,
   396  			ThrottledTime:    stats.CpuStats.ThrottlingData.ThrottledTime,
   397  			TotalTicks:       l.systemCpuStats.TicksConsumed(totalPercent),
   398  			Measured:         ExecutorCgroupMeasuredCpuStats,
   399  		}
   400  		taskResUsage := cstructs.TaskResourceUsage{
   401  			ResourceUsage: &cstructs.ResourceUsage{
   402  				MemoryStats: ms,
   403  				CpuStats:    cs,
   404  			},
   405  			Timestamp: ts.UTC().UnixNano(),
   406  			Pids:      pidStats,
   407  		}
   408  
   409  		select {
   410  		case <-ctx.Done():
   411  			return
   412  		case ch <- &taskResUsage:
   413  		}
   414  
   415  	}
   416  }
   417  
   418  // Signal sends a signal to the process managed by the executor
   419  func (l *LibcontainerExecutor) Signal(s os.Signal) error {
   420  	return l.userProc.Signal(s)
   421  }
   422  
   423  // Exec starts an additional process inside the container
   424  func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) {
   425  	combined := append([]string{cmd}, args...)
   426  	// Capture output
   427  	buf, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize))
   428  
   429  	process := &libcontainer.Process{
   430  		Args:   combined,
   431  		Env:    l.command.Env,
   432  		Stdout: buf,
   433  		Stderr: buf,
   434  	}
   435  
   436  	err := l.container.Run(process)
   437  	if err != nil {
   438  		return nil, 0, err
   439  	}
   440  
   441  	waitCh := make(chan *waitResult)
   442  	defer close(waitCh)
   443  	go l.handleExecWait(waitCh, process)
   444  
   445  	select {
   446  	case result := <-waitCh:
   447  		ps := result.ps
   448  		if result.err != nil {
   449  			if exitErr, ok := result.err.(*exec.ExitError); ok {
   450  				ps = exitErr.ProcessState
   451  			} else {
   452  				return nil, 0, result.err
   453  			}
   454  		}
   455  		var exitCode int
   456  		if status, ok := ps.Sys().(syscall.WaitStatus); ok {
   457  			exitCode = status.ExitStatus()
   458  		}
   459  		return buf.Bytes(), exitCode, nil
   460  
   461  	case <-time.After(time.Until(deadline)):
   462  		process.Signal(os.Kill)
   463  		return nil, 0, context.DeadlineExceeded
   464  	}
   465  
   466  }
   467  
   468  func (l *LibcontainerExecutor) newTerminalSocket() (pty func() (*os.File, error), tty *os.File, err error) {
   469  	parent, child, err := lutils.NewSockPair("socket")
   470  	if err != nil {
   471  		return nil, nil, fmt.Errorf("failed to create terminal: %v", err)
   472  	}
   473  
   474  	return func() (*os.File, error) { return lutils.RecvFd(parent) }, child, err
   475  
   476  }
   477  
   478  func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool,
   479  	stream drivers.ExecTaskStream) error {
   480  
   481  	// the task process will be started by the container
   482  	process := &libcontainer.Process{
   483  		Args: cmd,
   484  		Env:  l.userProc.Env,
   485  		User: l.userProc.User,
   486  		Init: false,
   487  		Cwd:  "/",
   488  	}
   489  
   490  	execHelper := &execHelper{
   491  		logger: l.logger,
   492  
   493  		newTerminal: l.newTerminalSocket,
   494  		setTTY: func(tty *os.File) error {
   495  			process.ConsoleSocket = tty
   496  			return nil
   497  		},
   498  		setIO: func(stdin io.Reader, stdout, stderr io.Writer) error {
   499  			process.Stdin = stdin
   500  			process.Stdout = stdout
   501  			process.Stderr = stderr
   502  			return nil
   503  		},
   504  
   505  		processStart: func() error { return l.container.Run(process) },
   506  		processWait: func() (*os.ProcessState, error) {
   507  			return process.Wait()
   508  		},
   509  	}
   510  
   511  	return execHelper.run(ctx, tty, stream)
   512  
   513  }
   514  
   515  type waitResult struct {
   516  	ps  *os.ProcessState
   517  	err error
   518  }
   519  
   520  func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) {
   521  	ps, err := process.Wait()
   522  	ch <- &waitResult{ps, err}
   523  }
   524  
   525  func configureCapabilities(cfg *lconfigs.Config, command *ExecCommand) error {
   526  	// TODO: allow better control of these
   527  	// use capabilities list as prior to adopting libcontainer in 0.9
   528  	allCaps := supportedCaps()
   529  
   530  	// match capabilities used in Nomad 0.8
   531  	if command.User == "root" {
   532  		cfg.Capabilities = &lconfigs.Capabilities{
   533  			Bounding:    allCaps,
   534  			Permitted:   allCaps,
   535  			Effective:   allCaps,
   536  			Ambient:     nil,
   537  			Inheritable: nil,
   538  		}
   539  	} else {
   540  		cfg.Capabilities = &lconfigs.Capabilities{
   541  			Bounding: allCaps,
   542  		}
   543  	}
   544  
   545  	return nil
   546  }
   547  
   548  // supportedCaps returns a list of all supported capabilities in kernel
   549  func supportedCaps() []string {
   550  	allCaps := []string{}
   551  	last := capability.CAP_LAST_CAP
   552  	// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
   553  	if last == capability.Cap(63) {
   554  		last = capability.CAP_BLOCK_SUSPEND
   555  	}
   556  	for _, cap := range capability.List() {
   557  		if cap > last {
   558  			continue
   559  		}
   560  		allCaps = append(allCaps, fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String())))
   561  	}
   562  	return allCaps
   563  }
   564  
   565  func configureNamespaces(pidMode, ipcMode string) lconfigs.Namespaces {
   566  	namespaces := lconfigs.Namespaces{{Type: lconfigs.NEWNS}}
   567  	if pidMode == IsolationModePrivate {
   568  		namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWPID})
   569  	}
   570  	if ipcMode == IsolationModePrivate {
   571  		namespaces = append(namespaces, lconfigs.Namespace{Type: lconfigs.NEWIPC})
   572  	}
   573  	return namespaces
   574  }
   575  
   576  // configureIsolation prepares the isolation primitives of the container.
   577  // The process runs in a container configured with the following:
   578  //
   579  // * the task directory as the chroot
   580  // * dedicated mount points namespace, but shares the PID, User, domain, network namespaces with host
   581  // * small subset of devices (e.g. stdout/stderr/stdin, tty, shm, pts); default to using the same set of devices as Docker
   582  // * some special filesystems: `/proc`, `/sys`.  Some case is given to avoid exec escaping or setting malicious values through them.
   583  func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error {
   584  	defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
   585  
   586  	// set the new root directory for the container
   587  	cfg.Rootfs = command.TaskDir
   588  
   589  	// disable pivot_root if set in the driver's configuration
   590  	cfg.NoPivotRoot = command.NoPivotRoot
   591  
   592  	// set up default namespaces as configured
   593  	cfg.Namespaces = configureNamespaces(command.ModePID, command.ModeIPC)
   594  
   595  	if command.NetworkIsolation != nil {
   596  		cfg.Namespaces = append(cfg.Namespaces, lconfigs.Namespace{
   597  			Type: lconfigs.NEWNET,
   598  			Path: command.NetworkIsolation.Path,
   599  		})
   600  	}
   601  
   602  	// paths to mask using a bind mount to /dev/null to prevent reading
   603  	cfg.MaskPaths = []string{
   604  		"/proc/kcore",
   605  		"/sys/firmware",
   606  	}
   607  
   608  	// paths that should be remounted as readonly inside the container
   609  	cfg.ReadonlyPaths = []string{
   610  		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
   611  	}
   612  
   613  	cfg.Devices = specconv.AllowedDevices
   614  	if len(command.Devices) > 0 {
   615  		devs, err := cmdDevices(command.Devices)
   616  		if err != nil {
   617  			return err
   618  		}
   619  		cfg.Devices = append(cfg.Devices, devs...)
   620  	}
   621  
   622  	cfg.Mounts = []*lconfigs.Mount{
   623  		{
   624  			Source:      "tmpfs",
   625  			Destination: "/dev",
   626  			Device:      "tmpfs",
   627  			Flags:       syscall.MS_NOSUID | syscall.MS_STRICTATIME,
   628  			Data:        "mode=755",
   629  		},
   630  		{
   631  			Source:      "proc",
   632  			Destination: "/proc",
   633  			Device:      "proc",
   634  			Flags:       defaultMountFlags,
   635  		},
   636  		{
   637  			Source:      "devpts",
   638  			Destination: "/dev/pts",
   639  			Device:      "devpts",
   640  			Flags:       syscall.MS_NOSUID | syscall.MS_NOEXEC,
   641  			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
   642  		},
   643  		{
   644  			Device:      "tmpfs",
   645  			Source:      "shm",
   646  			Destination: "/dev/shm",
   647  			Data:        "mode=1777,size=65536k",
   648  			Flags:       defaultMountFlags,
   649  		},
   650  		{
   651  			Source:      "mqueue",
   652  			Destination: "/dev/mqueue",
   653  			Device:      "mqueue",
   654  			Flags:       defaultMountFlags,
   655  		},
   656  		{
   657  			Source:      "sysfs",
   658  			Destination: "/sys",
   659  			Device:      "sysfs",
   660  			Flags:       defaultMountFlags | syscall.MS_RDONLY,
   661  		},
   662  	}
   663  
   664  	if len(command.Mounts) > 0 {
   665  		cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...)
   666  	}
   667  
   668  	return nil
   669  }
   670  
   671  func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error {
   672  
   673  	// If resources are not limited then manually create cgroups needed
   674  	if !command.ResourceLimits {
   675  		return configureBasicCgroups(cfg)
   676  	}
   677  
   678  	id := uuid.Generate()
   679  	cfg.Cgroups.Path = filepath.Join("/", defaultCgroupParent, id)
   680  
   681  	if command.Resources == nil || command.Resources.NomadResources == nil {
   682  		return nil
   683  	}
   684  
   685  	if mb := command.Resources.NomadResources.Memory.MemoryMB; mb > 0 {
   686  		// Total amount of memory allowed to consume
   687  		cfg.Cgroups.Resources.Memory = mb * 1024 * 1024
   688  		// Disable swap to avoid issues on the machine
   689  		var memSwappiness uint64
   690  		cfg.Cgroups.Resources.MemorySwappiness = &memSwappiness
   691  	}
   692  
   693  	cpuShares := command.Resources.NomadResources.Cpu.CpuShares
   694  	if cpuShares < 2 {
   695  		return fmt.Errorf("resources.Cpu.CpuShares must be equal to or greater than 2: %v", cpuShares)
   696  	}
   697  
   698  	// Set the relative CPU shares for this cgroup.
   699  	cfg.Cgroups.Resources.CpuShares = uint64(cpuShares)
   700  
   701  	return nil
   702  }
   703  
   704  func configureBasicCgroups(cfg *lconfigs.Config) error {
   705  	id := uuid.Generate()
   706  
   707  	// Manually create freezer cgroup
   708  
   709  	subsystem := "freezer"
   710  
   711  	path, err := getCgroupPathHelper(subsystem, filepath.Join(defaultCgroupParent, id))
   712  	if err != nil {
   713  		return fmt.Errorf("failed to find %s cgroup mountpoint: %v", subsystem, err)
   714  	}
   715  
   716  	if err = os.MkdirAll(path, 0755); err != nil {
   717  		return err
   718  	}
   719  
   720  	cfg.Cgroups.Paths = map[string]string{
   721  		subsystem: path,
   722  	}
   723  	return nil
   724  }
   725  
   726  func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
   727  	mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem)
   728  	if err != nil {
   729  		return "", err
   730  	}
   731  
   732  	// This is needed for nested containers, because in /proc/self/cgroup we
   733  	// see paths from host, which don't exist in container.
   734  	relCgroup, err := filepath.Rel(root, cgroup)
   735  	if err != nil {
   736  		return "", err
   737  	}
   738  
   739  	return filepath.Join(mnt, relCgroup), nil
   740  }
   741  
   742  func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) {
   743  	cfg := &lconfigs.Config{
   744  		Cgroups: &lconfigs.Cgroup{
   745  			Resources: &lconfigs.Resources{
   746  				MemorySwappiness: nil,
   747  			},
   748  		},
   749  		Version: "1.0.0",
   750  	}
   751  	for _, device := range specconv.AllowedDevices {
   752  		cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.DeviceRule)
   753  	}
   754  
   755  	if err := configureCapabilities(cfg, command); err != nil {
   756  		return nil, err
   757  	}
   758  	if err := configureIsolation(cfg, command); err != nil {
   759  		return nil, err
   760  	}
   761  	if err := configureCgroups(cfg, command); err != nil {
   762  		return nil, err
   763  	}
   764  	return cfg, nil
   765  }
   766  
   767  // cmdDevices converts a list of driver.DeviceConfigs into excutor.Devices.
   768  func cmdDevices(devices []*drivers.DeviceConfig) ([]*lconfigs.Device, error) {
   769  	if len(devices) == 0 {
   770  		return nil, nil
   771  	}
   772  
   773  	r := make([]*lconfigs.Device, len(devices))
   774  
   775  	for i, d := range devices {
   776  		ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions)
   777  		if err != nil {
   778  			return nil, fmt.Errorf("failed to make device out for %s: %v", d.HostPath, err)
   779  		}
   780  		ed.Path = d.TaskPath
   781  		r[i] = ed
   782  	}
   783  
   784  	return r, nil
   785  }
   786  
   787  var userMountToUnixMount = map[string]int{
   788  	// Empty string maps to `rprivate` for backwards compatibility in restored
   789  	// older tasks, where mount propagation will not be present.
   790  	"":                                       unix.MS_PRIVATE | unix.MS_REC, // rprivate
   791  	structs.VolumeMountPropagationPrivate:    unix.MS_PRIVATE | unix.MS_REC, // rprivate
   792  	structs.VolumeMountPropagationHostToTask: unix.MS_SLAVE | unix.MS_REC,   // rslave
   793  	structs.VolumeMountPropagationBidirectional: unix.MS_SHARED | unix.MS_REC, // rshared
   794  }
   795  
   796  // cmdMounts converts a list of driver.MountConfigs into excutor.Mounts.
   797  func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount {
   798  	if len(mounts) == 0 {
   799  		return nil
   800  	}
   801  
   802  	r := make([]*lconfigs.Mount, len(mounts))
   803  
   804  	for i, m := range mounts {
   805  		flags := unix.MS_BIND
   806  		if m.Readonly {
   807  			flags |= unix.MS_RDONLY
   808  		}
   809  
   810  		r[i] = &lconfigs.Mount{
   811  			Source:           m.HostPath,
   812  			Destination:      m.TaskPath,
   813  			Device:           "bind",
   814  			Flags:            flags,
   815  			PropagationFlags: []int{userMountToUnixMount[m.PropagationMode]},
   816  		}
   817  	}
   818  
   819  	return r
   820  }
   821  
   822  // lookupTaskBin finds the file `bin` in taskDir/local, taskDir in that order, then performs
   823  // a PATH search inside taskDir. It returns an absolute path. See also executor.lookupBin
   824  func lookupTaskBin(command *ExecCommand) (string, error) {
   825  	taskDir := command.TaskDir
   826  	bin := command.Cmd
   827  
   828  	// Check in the local directory
   829  	localDir := filepath.Join(taskDir, allocdir.TaskLocal)
   830  	local := filepath.Join(localDir, bin)
   831  	if _, err := os.Stat(local); err == nil {
   832  		return local, nil
   833  	}
   834  
   835  	// Check at the root of the task's directory
   836  	root := filepath.Join(taskDir, bin)
   837  	if _, err := os.Stat(root); err == nil {
   838  		return root, nil
   839  	}
   840  
   841  	if strings.Contains(bin, "/") {
   842  		return "", fmt.Errorf("file %s not found under path %s", bin, taskDir)
   843  	}
   844  
   845  	path := "/usr/local/bin:/usr/bin:/bin"
   846  
   847  	return lookPathIn(path, taskDir, bin)
   848  }
   849  
   850  // lookPathIn looks for a file with PATH inside the directory root. Like exec.LookPath
   851  func lookPathIn(path string, root string, bin string) (string, error) {
   852  	// exec.LookPath(file string)
   853  	for _, dir := range filepath.SplitList(path) {
   854  		if dir == "" {
   855  			// match unix shell behavior, empty path element == .
   856  			dir = "."
   857  		}
   858  		path := filepath.Join(root, dir, bin)
   859  		f, err := os.Stat(path)
   860  		if err != nil {
   861  			continue
   862  		}
   863  		if m := f.Mode(); !m.IsDir() {
   864  			return path, nil
   865  		}
   866  	}
   867  	return "", fmt.Errorf("file %s not found under path %s", bin, root)
   868  }