github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/client/executor/exec_linux.go (about)

     1  package executor
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/json"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  	"os"
    10  	"os/exec"
    11  	"os/user"
    12  	"path/filepath"
    13  	"strconv"
    14  	"strings"
    15  	"syscall"
    16  
    17  	"github.com/hashicorp/go-multierror"
    18  	"github.com/hashicorp/nomad/client/allocdir"
    19  	"github.com/hashicorp/nomad/client/driver/args"
    20  	"github.com/hashicorp/nomad/client/driver/environment"
    21  	"github.com/hashicorp/nomad/command"
    22  	"github.com/hashicorp/nomad/helper/discover"
    23  	"github.com/hashicorp/nomad/nomad/structs"
    24  
    25  	cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
    26  	cgroupConfig "github.com/opencontainers/runc/libcontainer/configs"
    27  )
    28  
    29  const (
    30  	cgroupMount = "/sys/fs/cgroup"
    31  )
    32  
    33  var (
    34  	// A mapping of directories on the host OS to attempt to embed inside each
    35  	// task's chroot.
    36  	chrootEnv = map[string]string{
    37  		"/bin":     "/bin",
    38  		"/etc":     "/etc",
    39  		"/lib":     "/lib",
    40  		"/lib32":   "/lib32",
    41  		"/lib64":   "/lib64",
    42  		"/usr/bin": "/usr/bin",
    43  		"/usr/lib": "/usr/lib",
    44  	}
    45  )
    46  
    47  func NewExecutor() Executor {
    48  	e := LinuxExecutor{}
    49  
    50  	// TODO: In a follow-up PR make it so this only happens once per client.
    51  	// Fingerprinting shouldn't happen per task.
    52  
    53  	// Check that cgroups are available.
    54  	if _, err := os.Stat(cgroupMount); err == nil {
    55  		e.cgroupEnabled = true
    56  	}
    57  
    58  	return &e
    59  }
    60  
// LinuxExecutor is the Executor implementation returned by NewExecutor. It
// runs a task through a spawn-daemon child process, inside the task directory
// and, when cgroups are available and limits were configured, inside a
// cgroup.
//
// Linux executor is designed to run on linux kernel 2.8+.
// NOTE(review): there is no 2.8 kernel series; presumably 2.6+ was meant —
// confirm and correct.
type LinuxExecutor struct {
	cmd
	// user is the account the task runs as; it is populated by runAs.
	user *user.User

	// Fingerprinted capabilities: set by NewExecutor when the cgroup mount
	// point exists.
	cgroupEnabled bool

	// Isolation configurations. groups is filled in by configureCgroups (or
	// by Open when re-attaching); alloc, taskName and taskDir are recorded by
	// ConfigureTaskDir.
	groups   *cgroupConfig.Cgroup
	alloc    *allocdir.AllocDir
	taskName string
	taskDir  string

	// Tracking of child process. spawnChild is a copy of the spawn-daemon
	// command stored once it has reported a successful start; the two files
	// are the ends of the pipe attached to the spawn-daemon's stdout.
	spawnChild        exec.Cmd
	spawnOutputWriter *os.File
	spawnOutputReader *os.File

	// Track whether there are filesystems mounted in the task dir. Set at the
	// end of ConfigureTaskDir and cleared by cleanTaskDir.
	mounts bool
}
    83  
    84  func (e *LinuxExecutor) Limit(resources *structs.Resources) error {
    85  	if resources == nil {
    86  		return errNoResources
    87  	}
    88  
    89  	if e.cgroupEnabled {
    90  		return e.configureCgroups(resources)
    91  	}
    92  
    93  	return nil
    94  }
    95  
    96  func (e *LinuxExecutor) ConfigureTaskDir(taskName string, alloc *allocdir.AllocDir) error {
    97  	e.taskName = taskName
    98  	taskDir, ok := alloc.TaskDirs[taskName]
    99  	if !ok {
   100  		fmt.Errorf("Couldn't find task directory for task %v", taskName)
   101  	}
   102  	e.taskDir = taskDir
   103  
   104  	if err := alloc.MountSharedDir(taskName); err != nil {
   105  		return err
   106  	}
   107  
   108  	if err := alloc.Embed(taskName, chrootEnv); err != nil {
   109  		return err
   110  	}
   111  
   112  	// Mount dev
   113  	dev := filepath.Join(taskDir, "dev")
   114  	if err := os.Mkdir(dev, 0777); err != nil {
   115  		return fmt.Errorf("Mkdir(%v) failed: %v", dev, err)
   116  	}
   117  
   118  	if err := syscall.Mount("", dev, "devtmpfs", syscall.MS_RDONLY, ""); err != nil {
   119  		return fmt.Errorf("Couldn't mount /dev to %v: %v", dev, err)
   120  	}
   121  
   122  	// Mount proc
   123  	proc := filepath.Join(taskDir, "proc")
   124  	if err := os.Mkdir(proc, 0777); err != nil {
   125  		return fmt.Errorf("Mkdir(%v) failed: %v", proc, err)
   126  	}
   127  
   128  	if err := syscall.Mount("", proc, "proc", syscall.MS_RDONLY, ""); err != nil {
   129  		return fmt.Errorf("Couldn't mount /proc to %v: %v", proc, err)
   130  	}
   131  
   132  	// Set the tasks AllocDir environment variable.
   133  	env, err := environment.ParseFromList(e.Cmd.Env)
   134  	if err != nil {
   135  		return err
   136  	}
   137  	env.SetAllocDir(filepath.Join("/", allocdir.SharedAllocName))
   138  	e.Cmd.Env = env.List()
   139  
   140  	e.alloc = alloc
   141  	e.mounts = true
   142  	return nil
   143  }
   144  
   145  func (e *LinuxExecutor) cleanTaskDir() error {
   146  	if e.alloc == nil {
   147  		return errors.New("ConfigureTaskDir() must be called before Start()")
   148  	}
   149  
   150  	if !e.mounts {
   151  		return nil
   152  	}
   153  
   154  	// Unmount dev.
   155  	errs := new(multierror.Error)
   156  	dev := filepath.Join(e.taskDir, "dev")
   157  	if err := syscall.Unmount(dev, 0); err != nil {
   158  		errs = multierror.Append(errs, fmt.Errorf("Failed to unmount dev (%v): %v", dev, err))
   159  	}
   160  
   161  	// Unmount proc.
   162  	proc := filepath.Join(e.taskDir, "proc")
   163  	if err := syscall.Unmount(proc, 0); err != nil {
   164  		errs = multierror.Append(errs, fmt.Errorf("Failed to unmount proc (%v): %v", proc, err))
   165  	}
   166  
   167  	e.mounts = false
   168  	return errs.ErrorOrNil()
   169  }
   170  
   171  func (e *LinuxExecutor) configureCgroups(resources *structs.Resources) error {
   172  	if !e.cgroupEnabled {
   173  		return nil
   174  	}
   175  
   176  	e.groups = &cgroupConfig.Cgroup{}
   177  
   178  	// Groups will be created in a heiarchy according to the resource being
   179  	// constrained, current session, and then this unique name. Restraints are
   180  	// then placed in the corresponding files.
   181  	// Ex: restricting a process to 2048Mhz CPU and 2MB of memory:
   182  	//   $ cat /sys/fs/cgroup/cpu/user/1000.user/4.session/<uuid>/cpu.shares
   183  	//		2028
   184  	//   $ cat /sys/fs/cgroup/memory/user/1000.user/4.session/<uuid>/memory.limit_in_bytes
   185  	//		2097152
   186  	e.groups.Name = structs.GenerateUUID()
   187  
   188  	// TODO: verify this is needed for things like network access
   189  	e.groups.AllowAllDevices = true
   190  
   191  	if resources.MemoryMB > 0 {
   192  		// Total amount of memory allowed to consume
   193  		e.groups.Memory = int64(resources.MemoryMB * 1024 * 1024)
   194  		// Disable swap to avoid issues on the machine
   195  		e.groups.MemorySwap = int64(-1)
   196  	}
   197  
   198  	if resources.CPU != 0 {
   199  		if resources.CPU < 2 {
   200  			return fmt.Errorf("resources.CPU must be equal to or greater than 2: %v", resources.CPU)
   201  		}
   202  
   203  		// Set the relative CPU shares for this cgroup.
   204  		// The simplest scale is 1 share to 1 MHz so 1024 = 1GHz. This means any
   205  		// given process will have at least that amount of resources, but likely
   206  		// more since it is (probably) rare that the machine will run at 100%
   207  		// CPU. This scale will cease to work if a node is overprovisioned.
   208  		e.groups.CpuShares = int64(resources.CPU)
   209  	}
   210  
   211  	if resources.IOPS != 0 {
   212  		// Validate it is in an acceptable range.
   213  		if resources.IOPS < 10 || resources.IOPS > 1000 {
   214  			return fmt.Errorf("resources.IOPS must be between 10 and 1000: %d", resources.IOPS)
   215  		}
   216  
   217  		e.groups.BlkioWeight = uint16(resources.IOPS)
   218  	}
   219  
   220  	return nil
   221  }
   222  
   223  func (e *LinuxExecutor) runAs(userid string) error {
   224  	errs := new(multierror.Error)
   225  
   226  	// First, try to lookup the user by uid
   227  	u, err := user.LookupId(userid)
   228  	if err == nil {
   229  		e.user = u
   230  		return nil
   231  	} else {
   232  		errs = multierror.Append(errs, err)
   233  	}
   234  
   235  	// Lookup failed, so try by username instead
   236  	u, err = user.Lookup(userid)
   237  	if err == nil {
   238  		e.user = u
   239  		return nil
   240  	} else {
   241  		errs = multierror.Append(errs, err)
   242  	}
   243  
   244  	// If we got here we failed to lookup based on id and username, so we'll
   245  	// return those errors.
   246  	return fmt.Errorf("Failed to identify user to run as: %s", errs)
   247  }
   248  
   249  func (e *LinuxExecutor) Start() error {
   250  	// Run as "nobody" user so we don't leak root privilege to the
   251  	// spawned process.
   252  	if err := e.runAs("nobody"); err == nil && e.user != nil {
   253  		e.cmd.SetUID(e.user.Uid)
   254  		e.cmd.SetGID(e.user.Gid)
   255  	}
   256  
   257  	if e.alloc == nil {
   258  		return errors.New("ConfigureTaskDir() must be called before Start()")
   259  	}
   260  
   261  	// Parse the commands arguments and replace instances of Nomad environment
   262  	// variables.
   263  	envVars, err := environment.ParseFromList(e.Cmd.Env)
   264  	if err != nil {
   265  		return err
   266  	}
   267  
   268  	combined := strings.Join(e.Cmd.Args, " ")
   269  	parsed, err := args.ParseAndReplace(combined, envVars.Map())
   270  	if err != nil {
   271  		return err
   272  	}
   273  	e.Cmd.Args = parsed
   274  
   275  	return e.spawnDaemon()
   276  }
   277  
   278  // spawnDaemon executes a double fork to start the user command with proper
   279  // isolation. Stores the child process for use in Wait.
   280  func (e *LinuxExecutor) spawnDaemon() error {
   281  	bin, err := discover.NomadExecutable()
   282  	if err != nil {
   283  		return fmt.Errorf("Failed to determine the nomad executable: %v", err)
   284  	}
   285  
   286  	// Serialize the cmd and the cgroup configuration so it can be passed to the
   287  	// sub-process.
   288  	var buffer bytes.Buffer
   289  	enc := json.NewEncoder(&buffer)
   290  
   291  	c := command.DaemonConfig{
   292  		Cmd:        e.cmd.Cmd,
   293  		Chroot:     e.taskDir,
   294  		StdoutFile: filepath.Join(e.taskDir, allocdir.TaskLocal, fmt.Sprintf("%v.stdout", e.taskName)),
   295  		StderrFile: filepath.Join(e.taskDir, allocdir.TaskLocal, fmt.Sprintf("%v.stderr", e.taskName)),
   296  		StdinFile:  "/dev/null",
   297  	}
   298  	if err := enc.Encode(c); err != nil {
   299  		return fmt.Errorf("Failed to serialize daemon configuration: %v", err)
   300  	}
   301  
   302  	// Create a pipe to capture Stdout.
   303  	pr, pw, err := os.Pipe()
   304  	if err != nil {
   305  		return err
   306  	}
   307  	e.spawnOutputWriter = pw
   308  	e.spawnOutputReader = pr
   309  
   310  	// Call ourselves using a hidden flag. The new instance of nomad will join
   311  	// the passed cgroup, forkExec the cmd, and output status codes through
   312  	// Stdout.
   313  	escaped := strconv.Quote(buffer.String())
   314  	spawn := exec.Command(bin, "spawn-daemon", escaped)
   315  	spawn.Stdout = e.spawnOutputWriter
   316  
   317  	// Capture its Stdin.
   318  	spawnStdIn, err := spawn.StdinPipe()
   319  	if err != nil {
   320  		return err
   321  	}
   322  
   323  	if err := spawn.Start(); err != nil {
   324  		fmt.Errorf("Failed to call spawn-daemon on nomad executable: %v", err)
   325  	}
   326  
   327  	// Join the spawn-daemon to the cgroup.
   328  	if e.groups != nil {
   329  		manager := cgroupFs.Manager{}
   330  		manager.Cgroups = e.groups
   331  
   332  		// Apply will place the current pid into the tasks file for each of the
   333  		// created cgroups:
   334  		//  /sys/fs/cgroup/memory/user/1000.user/4.session/<uuid>/tasks
   335  		//
   336  		// Apply requires superuser permissions, and may fail if Nomad is not run with
   337  		// the required permissions
   338  		if err := manager.Apply(spawn.Process.Pid); err != nil {
   339  			errs := new(multierror.Error)
   340  			errs = multierror.Append(errs, fmt.Errorf("Failed to join spawn-daemon to the cgroup (config => %+v): %v", manager.Cgroups, err))
   341  
   342  			if err := sendAbortCommand(spawnStdIn); err != nil {
   343  				errs = multierror.Append(errs, err)
   344  			}
   345  
   346  			return errs
   347  		}
   348  	}
   349  
   350  	// Tell it to start.
   351  	if err := sendStartCommand(spawnStdIn); err != nil {
   352  		return err
   353  	}
   354  
   355  	// Parse the response.
   356  	dec := json.NewDecoder(e.spawnOutputReader)
   357  	var resp command.SpawnStartStatus
   358  	if err := dec.Decode(&resp); err != nil {
   359  		return fmt.Errorf("Failed to parse spawn-daemon start response: %v", err)
   360  	}
   361  
   362  	if resp.ErrorMsg != "" {
   363  		return fmt.Errorf("Failed to execute user command: %s", resp.ErrorMsg)
   364  	}
   365  
   366  	e.spawnChild = *spawn
   367  	return nil
   368  }
   369  
   370  func sendStartCommand(w io.Writer) error {
   371  	enc := json.NewEncoder(w)
   372  	if err := enc.Encode(true); err != nil {
   373  		return fmt.Errorf("Failed to serialize start command: %v", err)
   374  	}
   375  
   376  	return nil
   377  }
   378  
   379  func sendAbortCommand(w io.Writer) error {
   380  	enc := json.NewEncoder(w)
   381  	if err := enc.Encode(false); err != nil {
   382  		return fmt.Errorf("Failed to serialize abort command: %v", err)
   383  	}
   384  
   385  	return nil
   386  }
   387  
   388  // Open's behavior is to kill all processes associated with the id and return an
   389  // error. This is done because it is not possible to re-attach to the
   390  // spawn-daemon's stdout to retrieve status messages.
   391  func (e *LinuxExecutor) Open(id string) error {
   392  	parts := strings.SplitN(id, ":", 2)
   393  	if len(parts) != 2 {
   394  		return fmt.Errorf("Invalid id: %v", id)
   395  	}
   396  
   397  	switch parts[0] {
   398  	case "PID":
   399  		pid, err := strconv.Atoi(parts[1])
   400  		if err != nil {
   401  			return fmt.Errorf("Invalid id: failed to parse pid %v", parts[1])
   402  		}
   403  
   404  		process, err := os.FindProcess(pid)
   405  		if err != nil {
   406  			return fmt.Errorf("Failed to find Pid %v: %v", pid, err)
   407  		}
   408  
   409  		if err := process.Kill(); err != nil {
   410  			return fmt.Errorf("Failed to kill Pid %v: %v", pid, err)
   411  		}
   412  	case "CGROUP":
   413  		if !e.cgroupEnabled {
   414  			return errors.New("Passed a a cgroup identifier, but cgroups are disabled")
   415  		}
   416  
   417  		// De-serialize the cgroup configuration.
   418  		dec := json.NewDecoder(strings.NewReader(parts[1]))
   419  		var groups cgroupConfig.Cgroup
   420  		if err := dec.Decode(&groups); err != nil {
   421  			return fmt.Errorf("Failed to parse cgroup configuration: %v", err)
   422  		}
   423  
   424  		e.groups = &groups
   425  		if err := e.destroyCgroup(); err != nil {
   426  			return err
   427  		}
   428  		// TODO: cleanTaskDir is a little more complicated here because the OS
   429  		// may have already unmounted in the case of a restart. Need to scan.
   430  	default:
   431  		return fmt.Errorf("Invalid id type: %v", parts[0])
   432  	}
   433  
   434  	return errors.New("Could not re-open to id (intended).")
   435  }
   436  
   437  func (e *LinuxExecutor) Wait() error {
   438  	if e.spawnChild.Process == nil {
   439  		return errors.New("Can not find child to wait on")
   440  	}
   441  
   442  	defer e.spawnOutputWriter.Close()
   443  	defer e.spawnOutputReader.Close()
   444  
   445  	errs := new(multierror.Error)
   446  	if err := e.spawnChild.Wait(); err != nil {
   447  		errs = multierror.Append(errs, fmt.Errorf("Wait failed on pid %v: %v", e.spawnChild.Process.Pid, err))
   448  	}
   449  
   450  	// If they fork/exec and then exit, wait will return but they will be still
   451  	// running processes so we need to kill the full cgroup.
   452  	if e.groups != nil {
   453  		if err := e.destroyCgroup(); err != nil {
   454  			errs = multierror.Append(errs, err)
   455  		}
   456  	}
   457  
   458  	if err := e.cleanTaskDir(); err != nil {
   459  		errs = multierror.Append(errs, err)
   460  	}
   461  
   462  	return errs.ErrorOrNil()
   463  }
   464  
   465  // If cgroups are used, the ID is the cgroup structurue. Otherwise, it is the
   466  // PID of the spawn-daemon process. An error is returned if the process was
   467  // never started.
   468  func (e *LinuxExecutor) ID() (string, error) {
   469  	if e.spawnChild.Process != nil {
   470  		if e.cgroupEnabled && e.groups != nil {
   471  			// Serialize the cgroup structure so it can be undone on suabsequent
   472  			// opens.
   473  			var buffer bytes.Buffer
   474  			enc := json.NewEncoder(&buffer)
   475  			if err := enc.Encode(e.groups); err != nil {
   476  				return "", fmt.Errorf("Failed to serialize daemon configuration: %v", err)
   477  			}
   478  
   479  			return fmt.Sprintf("CGROUP:%v", buffer.String()), nil
   480  		}
   481  
   482  		return fmt.Sprintf("PID:%d", e.spawnChild.Process.Pid), nil
   483  	}
   484  
   485  	return "", fmt.Errorf("Process has finished or was never started")
   486  }
   487  
// Shutdown stops the task. It simply delegates to ForceStop; no separate
// graceful path is attempted here.
func (e *LinuxExecutor) Shutdown() error {
	return e.ForceStop()
}
   491  
   492  func (e *LinuxExecutor) ForceStop() error {
   493  	if e.spawnOutputReader != nil {
   494  		e.spawnOutputReader.Close()
   495  	}
   496  
   497  	if e.spawnOutputWriter != nil {
   498  		e.spawnOutputWriter.Close()
   499  	}
   500  
   501  	// If the task is not running inside a cgroup then just the spawn-daemon child is killed.
   502  	// TODO: Find a good way to kill the children of the spawn-daemon.
   503  	if e.groups == nil {
   504  		if err := e.spawnChild.Process.Kill(); err != nil {
   505  			return fmt.Errorf("Failed to kill child (%v): %v", e.spawnChild.Process.Pid, err)
   506  		}
   507  
   508  		return nil
   509  	}
   510  
   511  	errs := new(multierror.Error)
   512  	if e.groups != nil {
   513  		if err := e.destroyCgroup(); err != nil {
   514  			errs = multierror.Append(errs, err)
   515  		}
   516  	}
   517  
   518  	if err := e.cleanTaskDir(); err != nil {
   519  		errs = multierror.Append(errs, err)
   520  	}
   521  
   522  	return errs.ErrorOrNil()
   523  }
   524  
   525  func (e *LinuxExecutor) destroyCgroup() error {
   526  	if e.groups == nil {
   527  		return errors.New("Can't destroy: cgroup configuration empty")
   528  	}
   529  
   530  	manager := cgroupFs.Manager{}
   531  	manager.Cgroups = e.groups
   532  	pids, err := manager.GetPids()
   533  	if err != nil {
   534  		return fmt.Errorf("Failed to get pids in the cgroup %v: %v", e.groups.Name, err)
   535  	}
   536  
   537  	errs := new(multierror.Error)
   538  	for _, pid := range pids {
   539  		process, err := os.FindProcess(pid)
   540  		if err != nil {
   541  			multierror.Append(errs, fmt.Errorf("Failed to find Pid %v: %v", pid, err))
   542  			continue
   543  		}
   544  
   545  		if err := process.Kill(); err != nil {
   546  			multierror.Append(errs, fmt.Errorf("Failed to kill Pid %v: %v", pid, err))
   547  			continue
   548  		}
   549  
   550  		if _, err := process.Wait(); err != nil {
   551  			multierror.Append(errs, fmt.Errorf("Failed to wait Pid %v: %v", pid, err))
   552  			continue
   553  		}
   554  	}
   555  
   556  	// Remove the cgroup.
   557  	if err := manager.Destroy(); err != nil {
   558  		multierror.Append(errs, fmt.Errorf("Failed to delete the cgroup directories: %v", err))
   559  	}
   560  
   561  	if len(errs.Errors) != 0 {
   562  		return fmt.Errorf("Failed to destroy cgroup: %v", errs)
   563  	}
   564  
   565  	return nil
   566  }
   567  
// Command returns a pointer to the executor's embedded cmd, so changes made
// through the returned pointer affect the command this executor runs.
func (e *LinuxExecutor) Command() *cmd {
	return &e.cmd
}