github.com/bigcommerce/nomad@v0.9.3-bc/drivers/shared/executor/executor_linux.go (about)

     1  // +build linux
     2  
     3  package executor
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"io"
     9  	"os"
    10  	"os/exec"
    11  	"path"
    12  	"path/filepath"
    13  	"strings"
    14  	"syscall"
    15  	"time"
    16  
    17  	"github.com/armon/circbuf"
    18  	"github.com/hashicorp/consul-template/signals"
    19  	hclog "github.com/hashicorp/go-hclog"
    20  	multierror "github.com/hashicorp/go-multierror"
    21  	"github.com/hashicorp/nomad/client/allocdir"
    22  	"github.com/hashicorp/nomad/client/stats"
    23  	cstructs "github.com/hashicorp/nomad/client/structs"
    24  	shelpers "github.com/hashicorp/nomad/helper/stats"
    25  	"github.com/hashicorp/nomad/helper/uuid"
    26  	"github.com/hashicorp/nomad/nomad/structs"
    27  	"github.com/hashicorp/nomad/plugins/drivers"
    28  	"github.com/opencontainers/runc/libcontainer"
    29  	"github.com/opencontainers/runc/libcontainer/cgroups"
    30  	cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
    31  	lconfigs "github.com/opencontainers/runc/libcontainer/configs"
    32  	ldevices "github.com/opencontainers/runc/libcontainer/devices"
    33  	lutils "github.com/opencontainers/runc/libcontainer/utils"
    34  	"github.com/syndtr/gocapability/capability"
    35  	"golang.org/x/sys/unix"
    36  )
    37  
const (
	// defaultCgroupParent is the name of the parent cgroup directory under
	// which the per-task cgroups of this executor are created.
	defaultCgroupParent = "nomad"
)
    41  
var (
	// ExecutorCgroupMeasuredMemStats is the list of memory stats captured by
	// the executor. It is reported as the Measured field of the memory stats.
	ExecutorCgroupMeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"}

	// ExecutorCgroupMeasuredCpuStats is the list of CPU stats captured by the
	// executor. It is reported as the Measured field of the CPU stats.
	ExecutorCgroupMeasuredCpuStats = []string{"System Mode", "User Mode", "Throttled Periods", "Throttled Time", "Percent"}
)
    49  
// LibcontainerExecutor implements an Executor with the runc/libcontainer api
type LibcontainerExecutor struct {
	// id is the container id handed to libcontainer when the container is
	// created.
	id string
	// command is the command most recently passed to Launch.
	command *ExecCommand

	logger hclog.Logger

	// CPU usage trackers used when converting cgroup CPU counters into
	// percentages for the reported stats.
	totalCpuStats  *stats.CpuStats
	userCpuStats   *stats.CpuStats
	systemCpuStats *stats.CpuStats
	// pidCollector provides the per-pid stats included in resource usage.
	pidCollector *pidCollector

	// container is the libcontainer container created by Launch; it is nil
	// until Launch has created it.
	container libcontainer.Container
	// userProc is the task process started inside the container by Launch.
	userProc *libcontainer.Process
	// userProcExited is closed once the wait on userProc has finished.
	userProcExited chan interface{}
	// exitState is recorded before userProcExited is closed and is what
	// Wait returns.
	exitState *ProcessState
}
    67  
    68  func NewExecutorWithIsolation(logger hclog.Logger) Executor {
    69  	logger = logger.Named("isolated_executor")
    70  	if err := shelpers.Init(); err != nil {
    71  		logger.Error("unable to initialize stats", "error", err)
    72  	}
    73  	return &LibcontainerExecutor{
    74  		id:             strings.Replace(uuid.Generate(), "-", "_", -1),
    75  		logger:         logger,
    76  		totalCpuStats:  stats.NewCpuStats(),
    77  		userCpuStats:   stats.NewCpuStats(),
    78  		systemCpuStats: stats.NewCpuStats(),
    79  		pidCollector:   newPidCollector(logger),
    80  	}
    81  }
    82  
    83  // Launch creates a new container in libcontainer and starts a new process with it
    84  func (l *LibcontainerExecutor) Launch(command *ExecCommand) (*ProcessState, error) {
    85  	l.logger.Trace("preparing to launch command", "command", command.Cmd, "args", strings.Join(command.Args, " "))
    86  
    87  	if command.Resources == nil {
    88  		command.Resources = &drivers.Resources{
    89  			NomadResources: &structs.AllocatedTaskResources{},
    90  		}
    91  	}
    92  
    93  	l.command = command
    94  
    95  	// Move to the root cgroup until process is started
    96  	subsystems, err := cgroups.GetAllSubsystems()
    97  	if err != nil {
    98  		return nil, err
    99  	}
   100  	if err := JoinRootCgroup(subsystems); err != nil {
   101  		return nil, err
   102  	}
   103  
   104  	// create a new factory which will store the container state in the allocDir
   105  	factory, err := libcontainer.New(
   106  		path.Join(command.TaskDir, "../alloc/container"),
   107  		libcontainer.Cgroupfs,
   108  		// note that os.Args[0] refers to the executor shim typically
   109  		// and first args arguments is ignored now due
   110  		// until https://github.com/opencontainers/runc/pull/1888 is merged
   111  		libcontainer.InitArgs(os.Args[0], "libcontainer-shim"),
   112  	)
   113  	if err != nil {
   114  		return nil, fmt.Errorf("failed to create factory: %v", err)
   115  	}
   116  
   117  	// A container groups processes under the same isolation enforcement
   118  	containerCfg, err := newLibcontainerConfig(command)
   119  	if err != nil {
   120  		return nil, fmt.Errorf("failed to configure container(%s): %v", l.id, err)
   121  	}
   122  
   123  	container, err := factory.Create(l.id, containerCfg)
   124  	if err != nil {
   125  		return nil, fmt.Errorf("failed to create container(%s): %v", l.id, err)
   126  	}
   127  	l.container = container
   128  
   129  	// Look up the binary path and make it executable
   130  	absPath, err := lookupTaskBin(command)
   131  
   132  	if err != nil {
   133  		return nil, err
   134  	}
   135  
   136  	if err := makeExecutable(absPath); err != nil {
   137  		return nil, err
   138  	}
   139  
   140  	path := absPath
   141  
   142  	// Ensure that the path is contained in the chroot, and find it relative to the container
   143  	rel, err := filepath.Rel(command.TaskDir, path)
   144  	if err != nil {
   145  		return nil, fmt.Errorf("failed to determine relative path base=%q target=%q: %v", command.TaskDir, path, err)
   146  	}
   147  
   148  	// Turn relative-to-chroot path into absolute path to avoid
   149  	// libcontainer trying to resolve the binary using $PATH.
   150  	// Do *not* use filepath.Join as it will translate ".."s returned by
   151  	// filepath.Rel. Prepending "/" will cause the path to be rooted in the
   152  	// chroot which is the desired behavior.
   153  	path = "/" + rel
   154  
   155  	combined := append([]string{path}, command.Args...)
   156  	stdout, err := command.Stdout()
   157  	if err != nil {
   158  		return nil, err
   159  	}
   160  	stderr, err := command.Stderr()
   161  	if err != nil {
   162  		return nil, err
   163  	}
   164  
   165  	l.logger.Debug("launching", "command", command.Cmd, "args", strings.Join(command.Args, " "))
   166  
   167  	// the task process will be started by the container
   168  	process := &libcontainer.Process{
   169  		Args:   combined,
   170  		Env:    command.Env,
   171  		Stdout: stdout,
   172  		Stderr: stderr,
   173  		Init:   true,
   174  	}
   175  
   176  	if command.User != "" {
   177  		process.User = command.User
   178  	}
   179  	l.userProc = process
   180  
   181  	l.totalCpuStats = stats.NewCpuStats()
   182  	l.userCpuStats = stats.NewCpuStats()
   183  	l.systemCpuStats = stats.NewCpuStats()
   184  
   185  	// Starts the task
   186  	if err := container.Run(process); err != nil {
   187  		container.Destroy()
   188  		return nil, err
   189  	}
   190  
   191  	pid, err := process.Pid()
   192  	if err != nil {
   193  		container.Destroy()
   194  		return nil, err
   195  	}
   196  
   197  	// Join process cgroups
   198  	containerState, err := container.State()
   199  	if err != nil {
   200  		l.logger.Error("error entering user process cgroups", "executor_pid", os.Getpid(), "error", err)
   201  	}
   202  	if err := cgroups.EnterPid(containerState.CgroupPaths, os.Getpid()); err != nil {
   203  		l.logger.Error("error entering user process cgroups", "executor_pid", os.Getpid(), "error", err)
   204  	}
   205  
   206  	// start a goroutine to wait on the process to complete, so Wait calls can
   207  	// be multiplexed
   208  	l.userProcExited = make(chan interface{})
   209  	go l.pidCollector.collectPids(l.userProcExited, l.getAllPids)
   210  	go l.wait()
   211  
   212  	return &ProcessState{
   213  		Pid:      pid,
   214  		ExitCode: -1,
   215  		Time:     time.Now(),
   216  	}, nil
   217  }
   218  
   219  func (l *LibcontainerExecutor) getAllPids() (map[int]*nomadPid, error) {
   220  	pids, err := l.container.Processes()
   221  	if err != nil {
   222  		return nil, err
   223  	}
   224  	nPids := make(map[int]*nomadPid)
   225  	for _, pid := range pids {
   226  		nPids[pid] = &nomadPid{
   227  			pid:           pid,
   228  			cpuStatsTotal: stats.NewCpuStats(),
   229  			cpuStatsUser:  stats.NewCpuStats(),
   230  			cpuStatsSys:   stats.NewCpuStats(),
   231  		}
   232  	}
   233  	return nPids, nil
   234  }
   235  
   236  // Wait waits until a process has exited and returns it's exitcode and errors
   237  func (l *LibcontainerExecutor) Wait(ctx context.Context) (*ProcessState, error) {
   238  	select {
   239  	case <-ctx.Done():
   240  		return nil, ctx.Err()
   241  	case <-l.userProcExited:
   242  		return l.exitState, nil
   243  	}
   244  }
   245  
   246  func (l *LibcontainerExecutor) wait() {
   247  	defer close(l.userProcExited)
   248  
   249  	ps, err := l.userProc.Wait()
   250  	if err != nil {
   251  		// If the process has exited before we called wait an error is returned
   252  		// the process state is embedded in the error
   253  		if exitErr, ok := err.(*exec.ExitError); ok {
   254  			ps = exitErr.ProcessState
   255  		} else {
   256  			l.logger.Error("failed to call wait on user process", "error", err)
   257  			l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()}
   258  			return
   259  		}
   260  	}
   261  
   262  	l.command.Close()
   263  
   264  	exitCode := 1
   265  	var signal int
   266  	if status, ok := ps.Sys().(syscall.WaitStatus); ok {
   267  		exitCode = status.ExitStatus()
   268  		if status.Signaled() {
   269  			const exitSignalBase = 128
   270  			signal = int(status.Signal())
   271  			exitCode = exitSignalBase + signal
   272  		}
   273  	}
   274  
   275  	l.exitState = &ProcessState{
   276  		Pid:      ps.Pid(),
   277  		ExitCode: exitCode,
   278  		Signal:   signal,
   279  		Time:     time.Now(),
   280  	}
   281  }
   282  
   283  // Shutdown stops all processes started and cleans up any resources
   284  // created (such as mountpoints, devices, etc).
   285  func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) error {
   286  	if l.container == nil {
   287  		return nil
   288  	}
   289  
   290  	// move executor to root cgroup
   291  	subsystems, err := cgroups.GetAllSubsystems()
   292  	if err != nil {
   293  		return err
   294  	}
   295  	if err := JoinRootCgroup(subsystems); err != nil {
   296  		return err
   297  	}
   298  
   299  	status, err := l.container.Status()
   300  	if err != nil {
   301  		return err
   302  	}
   303  
   304  	defer l.container.Destroy()
   305  
   306  	if status == libcontainer.Stopped {
   307  		return nil
   308  	}
   309  
   310  	if grace > 0 {
   311  		if signal == "" {
   312  			signal = "SIGINT"
   313  		}
   314  
   315  		sig, ok := signals.SignalLookup[signal]
   316  		if !ok {
   317  			return fmt.Errorf("error unknown signal given for shutdown: %s", signal)
   318  		}
   319  
   320  		// Signal initial container processes only during graceful
   321  		// shutdown; hence `false` arg.
   322  		err = l.container.Signal(sig, false)
   323  		if err != nil {
   324  			return err
   325  		}
   326  
   327  		select {
   328  		case <-l.userProcExited:
   329  			return nil
   330  		case <-time.After(grace):
   331  			// Force kill all container processes after grace period,
   332  			// hence `true` argument.
   333  			if err := l.container.Signal(os.Kill, true); err != nil {
   334  				return err
   335  			}
   336  		}
   337  	} else {
   338  		if err := l.container.Signal(os.Kill, true); err != nil {
   339  			return err
   340  		}
   341  	}
   342  
   343  	select {
   344  	case <-l.userProcExited:
   345  		return nil
   346  	case <-time.After(time.Second * 15):
   347  		return fmt.Errorf("process failed to exit after 15 seconds")
   348  	}
   349  }
   350  
// UpdateResources updates the resource isolation with new values to be enforced.
// It is currently a no-op: the resources argument is ignored and nil is
// always returned.
func (l *LibcontainerExecutor) UpdateResources(resources *drivers.Resources) error {
	return nil
}
   355  
// Version returns the api version of the executor. It always reports
// ExecutorVersionLatest and never returns an error.
func (l *LibcontainerExecutor) Version() (*ExecutorVersion, error) {
	return &ExecutorVersion{Version: ExecutorVersionLatest}, nil
}
   360  
   361  // Stats returns the resource statistics for processes managed by the executor
   362  func (l *LibcontainerExecutor) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
   363  	ch := make(chan *cstructs.TaskResourceUsage)
   364  	go l.handleStats(ch, ctx, interval)
   365  	return ch, nil
   366  
   367  }
   368  
   369  func (l *LibcontainerExecutor) handleStats(ch chan *cstructs.TaskResourceUsage, ctx context.Context, interval time.Duration) {
   370  	defer close(ch)
   371  	timer := time.NewTimer(0)
   372  	for {
   373  		select {
   374  		case <-ctx.Done():
   375  			return
   376  
   377  		case <-timer.C:
   378  			timer.Reset(interval)
   379  		}
   380  
   381  		lstats, err := l.container.Stats()
   382  		if err != nil {
   383  			l.logger.Warn("error collecting stats", "error", err)
   384  			return
   385  		}
   386  
   387  		pidStats, err := l.pidCollector.pidStats()
   388  		if err != nil {
   389  			l.logger.Warn("error collecting stats", "error", err)
   390  			return
   391  		}
   392  
   393  		ts := time.Now()
   394  		stats := lstats.CgroupStats
   395  
   396  		// Memory Related Stats
   397  		swap := stats.MemoryStats.SwapUsage
   398  		maxUsage := stats.MemoryStats.Usage.MaxUsage
   399  		rss := stats.MemoryStats.Stats["rss"]
   400  		cache := stats.MemoryStats.Stats["cache"]
   401  		ms := &cstructs.MemoryStats{
   402  			RSS:            rss,
   403  			Cache:          cache,
   404  			Swap:           swap.Usage,
   405  			Usage:          stats.MemoryStats.Usage.Usage,
   406  			MaxUsage:       maxUsage,
   407  			KernelUsage:    stats.MemoryStats.KernelUsage.Usage,
   408  			KernelMaxUsage: stats.MemoryStats.KernelUsage.MaxUsage,
   409  			Measured:       ExecutorCgroupMeasuredMemStats,
   410  		}
   411  
   412  		// CPU Related Stats
   413  		totalProcessCPUUsage := float64(stats.CpuStats.CpuUsage.TotalUsage)
   414  		userModeTime := float64(stats.CpuStats.CpuUsage.UsageInUsermode)
   415  		kernelModeTime := float64(stats.CpuStats.CpuUsage.UsageInKernelmode)
   416  
   417  		totalPercent := l.totalCpuStats.Percent(totalProcessCPUUsage)
   418  		cs := &cstructs.CpuStats{
   419  			SystemMode:       l.systemCpuStats.Percent(kernelModeTime),
   420  			UserMode:         l.userCpuStats.Percent(userModeTime),
   421  			Percent:          totalPercent,
   422  			ThrottledPeriods: stats.CpuStats.ThrottlingData.ThrottledPeriods,
   423  			ThrottledTime:    stats.CpuStats.ThrottlingData.ThrottledTime,
   424  			TotalTicks:       l.systemCpuStats.TicksConsumed(totalPercent),
   425  			Measured:         ExecutorCgroupMeasuredCpuStats,
   426  		}
   427  		taskResUsage := cstructs.TaskResourceUsage{
   428  			ResourceUsage: &cstructs.ResourceUsage{
   429  				MemoryStats: ms,
   430  				CpuStats:    cs,
   431  			},
   432  			Timestamp: ts.UTC().UnixNano(),
   433  			Pids:      pidStats,
   434  		}
   435  
   436  		select {
   437  		case <-ctx.Done():
   438  			return
   439  		case ch <- &taskResUsage:
   440  		}
   441  
   442  	}
   443  }
   444  
// Signal sends a signal to the process managed by the executor. The signal is
// delivered to the task process started by Launch and the result of that
// delivery is returned.
func (l *LibcontainerExecutor) Signal(s os.Signal) error {
	return l.userProc.Signal(s)
}
   449  
   450  // Exec starts an additional process inside the container
   451  func (l *LibcontainerExecutor) Exec(deadline time.Time, cmd string, args []string) ([]byte, int, error) {
   452  	combined := append([]string{cmd}, args...)
   453  	// Capture output
   454  	buf, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize))
   455  
   456  	process := &libcontainer.Process{
   457  		Args:   combined,
   458  		Env:    l.command.Env,
   459  		Stdout: buf,
   460  		Stderr: buf,
   461  	}
   462  
   463  	err := l.container.Run(process)
   464  	if err != nil {
   465  		return nil, 0, err
   466  	}
   467  
   468  	waitCh := make(chan *waitResult)
   469  	defer close(waitCh)
   470  	go l.handleExecWait(waitCh, process)
   471  
   472  	select {
   473  	case result := <-waitCh:
   474  		ps := result.ps
   475  		if result.err != nil {
   476  			if exitErr, ok := result.err.(*exec.ExitError); ok {
   477  				ps = exitErr.ProcessState
   478  			} else {
   479  				return nil, 0, result.err
   480  			}
   481  		}
   482  		var exitCode int
   483  		if status, ok := ps.Sys().(syscall.WaitStatus); ok {
   484  			exitCode = status.ExitStatus()
   485  		}
   486  		return buf.Bytes(), exitCode, nil
   487  
   488  	case <-time.After(time.Until(deadline)):
   489  		process.Signal(os.Kill)
   490  		return nil, 0, context.DeadlineExceeded
   491  	}
   492  
   493  }
   494  
   495  func (l *LibcontainerExecutor) newTerminalSocket() (pty func() (*os.File, error), tty *os.File, err error) {
   496  	parent, child, err := lutils.NewSockPair("socket")
   497  	if err != nil {
   498  		return nil, nil, fmt.Errorf("failed to create terminal: %v", err)
   499  	}
   500  
   501  	return func() (*os.File, error) { return lutils.RecvFd(parent) }, child, err
   502  
   503  }
   504  
   505  func (l *LibcontainerExecutor) ExecStreaming(ctx context.Context, cmd []string, tty bool,
   506  	stream drivers.ExecTaskStream) error {
   507  
   508  	// the task process will be started by the container
   509  	process := &libcontainer.Process{
   510  		Args: cmd,
   511  		Env:  l.userProc.Env,
   512  		User: l.userProc.User,
   513  		Init: false,
   514  		Cwd:  "/",
   515  	}
   516  
   517  	execHelper := &execHelper{
   518  		logger: l.logger,
   519  
   520  		newTerminal: l.newTerminalSocket,
   521  		setTTY: func(tty *os.File) error {
   522  			process.ConsoleSocket = tty
   523  			return nil
   524  		},
   525  		setIO: func(stdin io.Reader, stdout, stderr io.Writer) error {
   526  			process.Stdin = stdin
   527  			process.Stdout = stdout
   528  			process.Stderr = stderr
   529  			return nil
   530  		},
   531  
   532  		processStart: func() error { return l.container.Run(process) },
   533  		processWait: func() (*os.ProcessState, error) {
   534  			return process.Wait()
   535  		},
   536  	}
   537  
   538  	return execHelper.run(ctx, tty, stream)
   539  
   540  }
   541  
// waitResult carries the outcome of waiting on a container process: the
// process state and the error returned by the wait call.
type waitResult struct {
	ps  *os.ProcessState
	err error
}
   546  
   547  func (l *LibcontainerExecutor) handleExecWait(ch chan *waitResult, process *libcontainer.Process) {
   548  	ps, err := process.Wait()
   549  	ch <- &waitResult{ps, err}
   550  }
   551  
   552  func configureCapabilities(cfg *lconfigs.Config, command *ExecCommand) error {
   553  	// TODO: allow better control of these
   554  	// use capabilities list as prior to adopting libcontainer in 0.9
   555  	allCaps := supportedCaps()
   556  
   557  	// match capabilities used in Nomad 0.8
   558  	if command.User == "root" {
   559  		cfg.Capabilities = &lconfigs.Capabilities{
   560  			Bounding:    allCaps,
   561  			Permitted:   allCaps,
   562  			Effective:   allCaps,
   563  			Ambient:     nil,
   564  			Inheritable: nil,
   565  		}
   566  	} else {
   567  		cfg.Capabilities = &lconfigs.Capabilities{
   568  			Bounding: allCaps,
   569  		}
   570  	}
   571  
   572  	return nil
   573  }
   574  
   575  // supportedCaps returns a list of all supported capabilities in kernel
   576  func supportedCaps() []string {
   577  	allCaps := []string{}
   578  	last := capability.CAP_LAST_CAP
   579  	// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
   580  	if last == capability.Cap(63) {
   581  		last = capability.CAP_BLOCK_SUSPEND
   582  	}
   583  	for _, cap := range capability.List() {
   584  		if cap > last {
   585  			continue
   586  		}
   587  		allCaps = append(allCaps, fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String())))
   588  	}
   589  	return allCaps
   590  }
   591  
   592  // configureIsolation prepares the isolation primitives of the container.
   593  // The process runs in a container configured with the following:
   594  //
   595  // * the task directory as the chroot
   596  // * dedicated mount points namespace, but shares the PID, User, domain, network namespaces with host
   597  // * small subset of devices (e.g. stdout/stderr/stdin, tty, shm, pts); default to using the same set of devices as Docker
   598  // * some special filesystems: `/proc`, `/sys`.  Some case is given to avoid exec escaping or setting malicious values through them.
   599  func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error {
   600  	defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
   601  
   602  	// set the new root directory for the container
   603  	cfg.Rootfs = command.TaskDir
   604  
   605  	// launch with mount namespace
   606  	cfg.Namespaces = lconfigs.Namespaces{
   607  		{Type: lconfigs.NEWNS},
   608  	}
   609  
   610  	// paths to mask using a bind mount to /dev/null to prevent reading
   611  	cfg.MaskPaths = []string{
   612  		"/proc/kcore",
   613  		"/sys/firmware",
   614  	}
   615  
   616  	// paths that should be remounted as readonly inside the container
   617  	cfg.ReadonlyPaths = []string{
   618  		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
   619  	}
   620  
   621  	cfg.Devices = lconfigs.DefaultAutoCreatedDevices
   622  	if len(command.Devices) > 0 {
   623  		devs, err := cmdDevices(command.Devices)
   624  		if err != nil {
   625  			return err
   626  		}
   627  		cfg.Devices = append(cfg.Devices, devs...)
   628  	}
   629  
   630  	cfg.Mounts = []*lconfigs.Mount{
   631  		{
   632  			Source:      "tmpfs",
   633  			Destination: "/dev",
   634  			Device:      "tmpfs",
   635  			Flags:       syscall.MS_NOSUID | syscall.MS_STRICTATIME,
   636  			Data:        "mode=755",
   637  		},
   638  		{
   639  			Source:      "proc",
   640  			Destination: "/proc",
   641  			Device:      "proc",
   642  			Flags:       defaultMountFlags,
   643  		},
   644  		{
   645  			Source:      "devpts",
   646  			Destination: "/dev/pts",
   647  			Device:      "devpts",
   648  			Flags:       syscall.MS_NOSUID | syscall.MS_NOEXEC,
   649  			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
   650  		},
   651  		{
   652  			Device:      "tmpfs",
   653  			Source:      "shm",
   654  			Destination: "/dev/shm",
   655  			Data:        "mode=1777,size=65536k",
   656  			Flags:       defaultMountFlags,
   657  		},
   658  		{
   659  			Source:      "mqueue",
   660  			Destination: "/dev/mqueue",
   661  			Device:      "mqueue",
   662  			Flags:       defaultMountFlags,
   663  		},
   664  		{
   665  			Source:      "sysfs",
   666  			Destination: "/sys",
   667  			Device:      "sysfs",
   668  			Flags:       defaultMountFlags | syscall.MS_RDONLY,
   669  		},
   670  	}
   671  
   672  	if len(command.Mounts) > 0 {
   673  		cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...)
   674  	}
   675  
   676  	return nil
   677  }
   678  
   679  func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error {
   680  
   681  	// If resources are not limited then manually create cgroups needed
   682  	if !command.ResourceLimits {
   683  		return configureBasicCgroups(cfg)
   684  	}
   685  
   686  	id := uuid.Generate()
   687  	cfg.Cgroups.Path = filepath.Join("/", defaultCgroupParent, id)
   688  
   689  	if command.Resources == nil || command.Resources.NomadResources == nil {
   690  		return nil
   691  	}
   692  
   693  	if mb := command.Resources.NomadResources.Memory.MemoryMB; mb > 0 {
   694  		// Total amount of memory allowed to consume
   695  		cfg.Cgroups.Resources.Memory = mb * 1024 * 1024
   696  		// Disable swap to avoid issues on the machine
   697  		var memSwappiness uint64
   698  		cfg.Cgroups.Resources.MemorySwappiness = &memSwappiness
   699  	}
   700  
   701  	cpuShares := command.Resources.NomadResources.Cpu.CpuShares
   702  	if cpuShares < 2 {
   703  		return fmt.Errorf("resources.Cpu.CpuShares must be equal to or greater than 2: %v", cpuShares)
   704  	}
   705  
   706  	// Set the relative CPU shares for this cgroup.
   707  	cfg.Cgroups.Resources.CpuShares = uint64(cpuShares)
   708  
   709  	return nil
   710  }
   711  
   712  func configureBasicCgroups(cfg *lconfigs.Config) error {
   713  	id := uuid.Generate()
   714  
   715  	// Manually create freezer cgroup
   716  	cfg.Cgroups.Paths = map[string]string{}
   717  	root, err := cgroups.FindCgroupMountpointDir()
   718  	if err != nil {
   719  		return err
   720  	}
   721  
   722  	if _, err := os.Stat(root); err != nil {
   723  		return err
   724  	}
   725  
   726  	freezer := cgroupFs.FreezerGroup{}
   727  	subsystem := freezer.Name()
   728  	path, err := cgroups.FindCgroupMountpoint("", subsystem)
   729  	if err != nil {
   730  		return fmt.Errorf("failed to find %s cgroup mountpoint: %v", subsystem, err)
   731  	}
   732  	// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
   733  	path = filepath.Join(root, filepath.Base(path), defaultCgroupParent, id)
   734  
   735  	if err = os.MkdirAll(path, 0755); err != nil {
   736  		return err
   737  	}
   738  
   739  	cfg.Cgroups.Paths[subsystem] = path
   740  	return nil
   741  }
   742  
   743  func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) {
   744  	cfg := &lconfigs.Config{
   745  		Cgroups: &lconfigs.Cgroup{
   746  			Resources: &lconfigs.Resources{
   747  				AllowAllDevices:  nil,
   748  				MemorySwappiness: nil,
   749  				AllowedDevices:   lconfigs.DefaultAllowedDevices,
   750  			},
   751  		},
   752  		Version: "1.0.0",
   753  	}
   754  
   755  	if err := configureCapabilities(cfg, command); err != nil {
   756  		return nil, err
   757  	}
   758  	if err := configureIsolation(cfg, command); err != nil {
   759  		return nil, err
   760  	}
   761  	if err := configureCgroups(cfg, command); err != nil {
   762  		return nil, err
   763  	}
   764  	return cfg, nil
   765  }
   766  
   767  // JoinRootCgroup moves the current process to the cgroups of the init process
   768  func JoinRootCgroup(subsystems []string) error {
   769  	mErrs := new(multierror.Error)
   770  	paths := map[string]string{}
   771  	for _, s := range subsystems {
   772  		mnt, _, err := cgroups.FindCgroupMountpointAndRoot("", s)
   773  		if err != nil {
   774  			multierror.Append(mErrs, fmt.Errorf("error getting cgroup path for subsystem: %s", s))
   775  			continue
   776  		}
   777  
   778  		paths[s] = mnt
   779  	}
   780  
   781  	err := cgroups.EnterPid(paths, os.Getpid())
   782  	if err != nil {
   783  		multierror.Append(mErrs, err)
   784  	}
   785  
   786  	return mErrs.ErrorOrNil()
   787  }
   788  
   789  // cmdDevices converts a list of driver.DeviceConfigs into excutor.Devices.
   790  func cmdDevices(devices []*drivers.DeviceConfig) ([]*lconfigs.Device, error) {
   791  	if len(devices) == 0 {
   792  		return nil, nil
   793  	}
   794  
   795  	r := make([]*lconfigs.Device, len(devices))
   796  
   797  	for i, d := range devices {
   798  		ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions)
   799  		if err != nil {
   800  			return nil, fmt.Errorf("failed to make device out for %s: %v", d.HostPath, err)
   801  		}
   802  		ed.Path = d.TaskPath
   803  		r[i] = ed
   804  	}
   805  
   806  	return r, nil
   807  }
   808  
   809  // cmdMounts converts a list of driver.MountConfigs into excutor.Mounts.
   810  func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount {
   811  	if len(mounts) == 0 {
   812  		return nil
   813  	}
   814  
   815  	r := make([]*lconfigs.Mount, len(mounts))
   816  
   817  	for i, m := range mounts {
   818  		flags := unix.MS_BIND
   819  		if m.Readonly {
   820  			flags |= unix.MS_RDONLY
   821  		}
   822  		r[i] = &lconfigs.Mount{
   823  			Source:      m.HostPath,
   824  			Destination: m.TaskPath,
   825  			Device:      "bind",
   826  			Flags:       flags,
   827  		}
   828  	}
   829  
   830  	return r
   831  }
   832  
   833  // lookupTaskBin finds the file `bin` in taskDir/local, taskDir in that order, then performs
   834  // a PATH search inside taskDir. It returns an absolute path. See also executor.lookupBin
   835  func lookupTaskBin(command *ExecCommand) (string, error) {
   836  	taskDir := command.TaskDir
   837  	bin := command.Cmd
   838  
   839  	// Check in the local directory
   840  	localDir := filepath.Join(taskDir, allocdir.TaskLocal)
   841  	local := filepath.Join(localDir, bin)
   842  	if _, err := os.Stat(local); err == nil {
   843  		return local, nil
   844  	}
   845  
   846  	// Check at the root of the task's directory
   847  	root := filepath.Join(taskDir, bin)
   848  	if _, err := os.Stat(root); err == nil {
   849  		return root, nil
   850  	}
   851  
   852  	if strings.Contains(bin, "/") {
   853  		return "", fmt.Errorf("file %s not found under path %s", bin, taskDir)
   854  	}
   855  
   856  	// Find the PATH
   857  	path := "/usr/local/bin:/usr/bin:/bin"
   858  	for _, e := range command.Env {
   859  		if strings.HasPrefix("PATH=", e) {
   860  			path = e[5:]
   861  		}
   862  	}
   863  
   864  	return lookPathIn(path, taskDir, bin)
   865  }
   866  
   867  // lookPathIn looks for a file with PATH inside the directory root. Like exec.LookPath
   868  func lookPathIn(path string, root string, bin string) (string, error) {
   869  	// exec.LookPath(file string)
   870  	for _, dir := range filepath.SplitList(path) {
   871  		if dir == "" {
   872  			// match unix shell behavior, empty path element == .
   873  			dir = "."
   874  		}
   875  		path := filepath.Join(root, dir, bin)
   876  		f, err := os.Stat(path)
   877  		if err != nil {
   878  			continue
   879  		}
   880  		if m := f.Mode(); !m.IsDir() {
   881  			return path, nil
   882  		}
   883  	}
   884  	return "", fmt.Errorf("file %s not found under path %s", bin, root)
   885  }