github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/client/driver/spawn/spawn.go (about)

     1  package spawn
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/json"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"os/exec"
    10  	"strconv"
    11  	"time"
    12  
    13  	"github.com/hashicorp/go-multierror"
    14  	"github.com/hashicorp/nomad/client/driver/structs"
    15  	"github.com/hashicorp/nomad/command"
    16  	"github.com/hashicorp/nomad/helper/discover"
    17  )
    18  
    19  // Spawner is used to start a user command in an isolated fashion that is
    20  // resistent to Nomad agent failure.
    21  type Spawner struct {
    22  	spawn     *os.Process
    23  	SpawnPid  int
    24  	SpawnPpid int
    25  	StateFile string
    26  	UserPid   int
    27  
    28  	// User configuration
    29  	UserCmd *exec.Cmd
    30  	Logs    *Logs
    31  	Chroot  string
    32  }
    33  
    34  // Logs is used to define the filepaths the user command's logs should be
    35  // redirected to. The files do not need to exist.
    36  type Logs struct {
    37  	Stdin, Stdout, Stderr string
    38  }
    39  
    40  // NewSpawner takes a path to a state file. This state file can be used to
    41  // create a new Spawner that can be used to wait on the exit status of a
    42  // process even through Nomad restarts.
    43  func NewSpawner(stateFile string) *Spawner {
    44  	return &Spawner{StateFile: stateFile}
    45  }
    46  
    47  // SetCommand sets the user command to spawn.
    48  func (s *Spawner) SetCommand(cmd *exec.Cmd) {
    49  	s.UserCmd = cmd
    50  }
    51  
    52  // SetLogs sets the redirection of user command log files.
    53  func (s *Spawner) SetLogs(l *Logs) {
    54  	s.Logs = l
    55  }
    56  
    57  // SetChroot puts the user command into a chroot.
    58  func (s *Spawner) SetChroot(root string) {
    59  	s.Chroot = root
    60  }
    61  
    62  // Spawn does a double-fork to start and isolate the user command. It takes a
    63  // call-back that is invoked with the pid of the intermediary process. If the
    64  // call back returns an error, the user command is not started and the spawn is
    65  // cancelled. This can be used to put the process into a cgroup or jail and
    66  // cancel starting the user process if that was not successful. An error is
    67  // returned if the call-back returns an error or the user-command couldn't be
    68  // started.
    69  func (s *Spawner) Spawn(cb func(pid int) error) error {
    70  	bin, err := discover.NomadExecutable()
    71  	if err != nil {
    72  		return fmt.Errorf("Failed to determine the nomad executable: %v", err)
    73  	}
    74  
    75  	exitFile, err := os.OpenFile(s.StateFile, os.O_CREATE|os.O_WRONLY, 0666)
    76  	if err != nil {
    77  		return fmt.Errorf("Error opening file to store exit status: %v", err)
    78  	}
    79  	defer exitFile.Close()
    80  
    81  	config, err := s.spawnConfig()
    82  	if err != nil {
    83  		return err
    84  	}
    85  
    86  	spawn := exec.Command(bin, "spawn-daemon", config)
    87  
    88  	// Capture stdout
    89  	spawnStdout, err := spawn.StdoutPipe()
    90  	if err != nil {
    91  		return fmt.Errorf("Failed to capture spawn-daemon stdout: %v", err)
    92  	}
    93  	defer spawnStdout.Close()
    94  
    95  	// Capture stdin.
    96  	spawnStdin, err := spawn.StdinPipe()
    97  	if err != nil {
    98  		return fmt.Errorf("Failed to capture spawn-daemon stdin: %v", err)
    99  	}
   100  	defer spawnStdin.Close()
   101  
   102  	if err := spawn.Start(); err != nil {
   103  		return fmt.Errorf("Failed to call spawn-daemon on nomad executable: %v", err)
   104  	}
   105  
   106  	if cb != nil {
   107  		cbErr := cb(spawn.Process.Pid)
   108  		if cbErr != nil {
   109  			errs := new(multierror.Error)
   110  			errs = multierror.Append(errs, cbErr)
   111  			if err := s.sendAbortCommand(spawnStdin); err != nil {
   112  				errs = multierror.Append(errs, err)
   113  			}
   114  
   115  			return errs
   116  		}
   117  	}
   118  
   119  	if err := s.sendStartCommand(spawnStdin); err != nil {
   120  		return err
   121  	}
   122  
   123  	respCh := make(chan command.SpawnStartStatus, 1)
   124  	errCh := make(chan error, 1)
   125  
   126  	go func() {
   127  		var resp command.SpawnStartStatus
   128  		dec := json.NewDecoder(spawnStdout)
   129  		if err := dec.Decode(&resp); err != nil {
   130  			errCh <- fmt.Errorf("Failed to parse spawn-daemon start response: %v", err)
   131  		}
   132  		respCh <- resp
   133  	}()
   134  
   135  	select {
   136  	case err := <-errCh:
   137  		return err
   138  	case resp := <-respCh:
   139  		if resp.ErrorMsg != "" {
   140  			return fmt.Errorf("Failed to execute user command: %s", resp.ErrorMsg)
   141  		}
   142  		s.UserPid = resp.UserPID
   143  	case <-time.After(5 * time.Second):
   144  		return fmt.Errorf("timed out waiting for response")
   145  	}
   146  
   147  	// Store the spawn process.
   148  	s.spawn = spawn.Process
   149  	s.SpawnPid = s.spawn.Pid
   150  	s.SpawnPpid = os.Getpid()
   151  	return nil
   152  }
   153  
   154  // spawnConfig returns a serialized config to pass to the Nomad spawn-daemon
   155  // command.
   156  func (s *Spawner) spawnConfig() (string, error) {
   157  	if s.UserCmd == nil {
   158  		return "", fmt.Errorf("Must specify user command")
   159  	}
   160  
   161  	config := command.DaemonConfig{
   162  		Cmd:            *s.UserCmd,
   163  		Chroot:         s.Chroot,
   164  		ExitStatusFile: s.StateFile,
   165  	}
   166  
   167  	if s.Logs != nil {
   168  		config.StdoutFile = s.Logs.Stdout
   169  		config.StdinFile = s.Logs.Stdin
   170  		config.StderrFile = s.Logs.Stderr
   171  	}
   172  
   173  	var buffer bytes.Buffer
   174  	enc := json.NewEncoder(&buffer)
   175  	if err := enc.Encode(config); err != nil {
   176  		return "", fmt.Errorf("Failed to serialize configuration: %v", err)
   177  	}
   178  
   179  	return strconv.Quote(buffer.String()), nil
   180  }
   181  
   182  // sendStartCommand sends the necessary command to the spawn-daemon to have it
   183  // start the user process.
   184  func (s *Spawner) sendStartCommand(w io.Writer) error {
   185  	enc := json.NewEncoder(w)
   186  	if err := enc.Encode(true); err != nil {
   187  		return fmt.Errorf("Failed to serialize start command: %v", err)
   188  	}
   189  
   190  	return nil
   191  }
   192  
   193  // sendAbortCommand sends the necessary command to the spawn-daemon to have it
   194  // abort starting the user process. This should be invoked if the spawn-daemon
   195  // could not be isolated into a cgroup.
   196  func (s *Spawner) sendAbortCommand(w io.Writer) error {
   197  	enc := json.NewEncoder(w)
   198  	if err := enc.Encode(false); err != nil {
   199  		return fmt.Errorf("Failed to serialize abort command: %v", err)
   200  	}
   201  
   202  	return nil
   203  }
   204  
   205  // Wait returns the exit code of the user process or an error if the wait
   206  // failed.
   207  func (s *Spawner) Wait() *structs.WaitResult {
   208  	if os.Getpid() == s.SpawnPpid {
   209  		return s.waitAsParent()
   210  	}
   211  
   212  	return s.pollWait()
   213  }
   214  
   215  // waitAsParent waits on the process if the current process was the spawner.
   216  func (s *Spawner) waitAsParent() *structs.WaitResult {
   217  	if s.SpawnPpid != os.Getpid() {
   218  		return structs.NewWaitResult(-1, 0, fmt.Errorf("not the parent. Spawner parent is %v; current pid is %v", s.SpawnPpid, os.Getpid()))
   219  	}
   220  
   221  	// Try to reattach to the spawn.
   222  	if s.spawn == nil {
   223  		// If it can't be reattached, it means the spawn process has exited so
   224  		// we should just read its exit file.
   225  		var err error
   226  		if s.spawn, err = os.FindProcess(s.SpawnPid); err != nil {
   227  			return s.pollWait()
   228  		}
   229  	}
   230  
   231  	if _, err := s.spawn.Wait(); err != nil {
   232  		return structs.NewWaitResult(-1, 0, err)
   233  	}
   234  
   235  	return s.pollWait()
   236  }
   237  
   238  // pollWait polls on the spawn daemon to determine when it exits. After it
   239  // exits, it reads the state file and returns the exit code and possibly an
   240  // error.
   241  func (s *Spawner) pollWait() *structs.WaitResult {
   242  	// Stat to check if it is there to avoid a race condition.
   243  	stat, err := os.Stat(s.StateFile)
   244  	if err != nil {
   245  		return structs.NewWaitResult(-1, 0, fmt.Errorf("Failed to Stat exit status file %v: %v", s.StateFile, err))
   246  	}
   247  
   248  	// If there is data it means that the file has already been written.
   249  	if stat.Size() > 0 {
   250  		return s.readExitCode()
   251  	}
   252  
   253  	// Read after the process exits.
   254  	ticker := time.NewTicker(5 * time.Second)
   255  	defer ticker.Stop()
   256  	for range ticker.C {
   257  		if !s.Alive() {
   258  			break
   259  		}
   260  	}
   261  
   262  	return s.readExitCode()
   263  }
   264  
   265  // readExitCode parses the state file and returns the exit code of the task. It
   266  // returns an error if the file can't be read.
   267  func (s *Spawner) readExitCode() *structs.WaitResult {
   268  	f, err := os.Open(s.StateFile)
   269  	if err != nil {
   270  		return structs.NewWaitResult(-1, 0, fmt.Errorf("Failed to open %v to read exit code: %v", s.StateFile, err))
   271  	}
   272  	defer f.Close()
   273  
   274  	stat, err := f.Stat()
   275  	if err != nil {
   276  		return structs.NewWaitResult(-1, 0, fmt.Errorf("Failed to stat file %v: %v", s.StateFile, err))
   277  	}
   278  
   279  	if stat.Size() == 0 {
   280  		return structs.NewWaitResult(-1, 0, fmt.Errorf("Empty state file: %v", s.StateFile))
   281  	}
   282  
   283  	var exitStatus command.SpawnExitStatus
   284  	dec := json.NewDecoder(f)
   285  	if err := dec.Decode(&exitStatus); err != nil {
   286  		return structs.NewWaitResult(-1, 0, fmt.Errorf("Failed to parse exit status from %v: %v", s.StateFile, err))
   287  	}
   288  
   289  	return structs.NewWaitResult(exitStatus.ExitCode, 0, nil)
   290  }
   291  
   292  // Valid checks that the state of the Spawner is valid and that a subsequent
   293  // Wait could be called. This is useful to call when reopening a Spawner
   294  // through client restarts. If Valid a nil error is returned.
   295  func (s *Spawner) Valid() error {
   296  	// If the spawner is still alive, then the task is running and we can wait
   297  	// on it.
   298  	if s.Alive() {
   299  		return nil
   300  	}
   301  
   302  	// The task isn't alive so check that there is a valid exit code file.
   303  	if res := s.readExitCode(); res.Err == nil {
   304  		return nil
   305  	}
   306  
   307  	return fmt.Errorf("Spawner not alive and exit code not written")
   308  }