github.com/orofarne/hammy@v0.0.0-20130409105742-374fadfd6ecb/src/hammy/spexecuter.go (about)

     1  package hammy
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"time"
     7  	"os"
     8  	"os/exec"
     9  	"bytes"
    10  	"bufio"
    11  	"syscall"
    12  	"log"
    13  	"strings"
    14  	"github.com/ugorji/go-msgpack"
    15  )
    16  
    17  
// process is one slot of the worker pool: the subprocess currently attached
// to the slot (if any) together with the pipes used to talk to it.
type process struct {
	// Embedded command of the attached subprocess. It is nil while no
	// subprocess is attached; getWorker then spawns a new one.
	*exec.Cmd
	// Number of times the slot was released since the current subprocess was
	// spawned (reset to 0 on spawn, incremented by freeWorker).
	Count uint
	// Write end of the subprocess stdin (obtained from Cmd.StdinPipe).
	PStdin io.Writer
	// Read end of the subprocess stdout (obtained from Cmd.StdoutPipe).
	PStdout io.Reader
	// Accumulates everything the subprocess writes to stderr; it is reset
	// when a new subprocess is spawned.
	PStderr bytes.Buffer
}
    25  
// WorkerProcessInput is the request that ProcessTrigger serializes with
// MessagePack and writes to the worker's stdin.
type WorkerProcessInput struct {
	// Filled from the key argument of ProcessTrigger.
	Hostname string
	// Trigger passed to ProcessTrigger, forwarded unchanged.
	Trigger string
	// State passed to ProcessTrigger, forwarded unchanged.
	State *State
	// Incoming data passed to ProcessTrigger, forwarded unchanged.
	IData IncomingHostData
}
    32  
// WorkerProcessOutput is the reply that ProcessTrigger decodes (MessagePack)
// from the worker's stdout.
type WorkerProcessOutput struct {
	// Command buffer produced by the worker.
	CmdBuffer *CmdBuffer
	// State produced by the worker.
	State *State
}
    37  
// SPExecuter is an Executer implementation that runs triggers in a pool of
// subprocesses and talks to them over stdin/stdout using MessagePack-based RPC.
type SPExecuter struct {
	// Program started for every worker (passed to exec.Command as is).
	cmdLine string
	// A worker is killed on release once its Count reaches this value.
	maxIter uint
	// Pool of worker slots; taking a slot blocks while all are in use.
	workers chan *process
	// Time after which a busy worker is killed by workerTimeout.
	timeout time.Duration

	// Metrics
	ms *MetricSet
	// "request": whole ProcessTrigger call.
	mRequest *TimerMetric
	// "exec": part of ProcessTrigger after a worker has been obtained.
	mExecTimer *TimerMetric
	// "worker_wait": time spent in getWorker.
	mWorkerWaitTimer *TimerMetric
	// "errors": ProcessTrigger calls that returned an error.
	mErrors *CounterMetric
	// "create": time spent spawning a new subprocess.
	mCreate *TimerMetric
	// "kill": subprocesses killed by workerKill.
	mKills *CounterMetric
	// "timeouts": requests whose worker exceeded the timeout.
	mTimeouts *CounterMetric
}
    55  
    56  // Create new instance of SPExecutor
    57  // per process
    58  func NewSPExecuter(cfg Config, metricNamespace string) *SPExecuter {
    59  	if cfg.Workers.PoolSize < 1 || cfg.Workers.CmdLine == "" {
    60  		panic("Invalid argument")
    61  	}
    62  
    63  	e := new(SPExecuter)
    64  	e.cmdLine = cfg.Workers.CmdLine
    65  	e.maxIter = cfg.Workers.MaxIter
    66  	e.workers = make(chan *process, cfg.Workers.PoolSize)
    67  	e.timeout = time.Duration(cfg.Workers.Timeout) * time.Second
    68  
    69  	for i := uint(0); i < cfg.Workers.PoolSize; i++ {
    70  		e.workers <- &process{}
    71  	}
    72  
    73  	e.ms = NewMetricSet(metricNamespace, 30 * time.Second)
    74  	e.mRequest = e.ms.NewTimer("request")
    75  	e.mExecTimer = e.ms.NewTimer("exec")
    76  	e.mWorkerWaitTimer = e.ms.NewTimer("worker_wait")
    77  	e.mErrors = e.ms.NewCounter("errors")
    78  	e.mCreate = e.ms.NewTimer("create")
    79  	e.mKills = e.ms.NewCounter("kill")
    80  	e.mTimeouts = e.ms.NewCounter("timeouts")
    81  
    82  	return e
    83  }
    84  
    85  func (e *SPExecuter) ProcessTrigger(key string, trigger string, state *State,
    86  		data IncomingHostData) (newState *State, cmdb *CmdBuffer, err error) {
    87  //
    88  	ζ := e.mRequest.NewObservation()
    89  	defer func() { ζ.End() } ()
    90  
    91  	cmdb = NewCmdBuffer(0)
    92  	newState = NewState()
    93  	res := WorkerProcessOutput{
    94  		CmdBuffer: cmdb,
    95  		State: newState,
    96  	}
    97  
    98  	defer func() { if err != nil { e.mErrors.Add(1) } } ()
    99  
   100  	// Fetch worker (may be wait for free worker)
   101  	worker, err := e.getWorker()
   102  	defer e.freeWorker(worker)
   103  	if err != nil {
   104  		return
   105  	}
   106  
   107  	//Setup statistics
   108  	τ := e.mExecTimer.NewObservation()
   109  	defer func() { τ.End() } ()
   110  
   111  	// Set up timeout
   112  	cEnd := make(chan int)
   113  	go e.workerTimeout(worker, cEnd)
   114  
   115  	// marshal and send args
   116  	pInput := WorkerProcessInput{
   117  		Hostname: key,
   118  		Trigger: trigger,
   119  		State: state,
   120  		IData: data,
   121  	}
   122  
   123  	var errDec error
   124  	buf, errEnc := msgpack.Marshal(pInput)
   125  	if errEnc == nil {
   126  		cEnc := make(chan error)
   127  		go func() {
   128  			_, e := worker.PStdin.Write(buf)
   129  			cEnc <- e
   130  		}()
   131  
   132  		// wait, read and unmarshal result
   133  		buffer := bufio.NewReader(worker.PStdout)
   134  		dec := msgpack.NewDecoder(buffer, nil)
   135  		errDec = dec.Decode(&res)
   136  		errEnc = <- cEnc
   137  	}
   138  
   139  	cEnd <- 1
   140  	toRes := <- cEnd
   141  	switch {
   142  		case toRes == 2 && errEnc == nil && errDec == nil:
   143  			// FIXME
   144  			log.Printf(">_<")
   145  		case toRes == 2:
   146  			err = fmt.Errorf("SPExexuter timeout for host %v, errors: encoding(%v), decoding(%v), child stderr: %#v",
   147  					key, errEnc, errDec, worker.PStderr.String())
   148  		case errEnc != nil || errDec != nil:
   149  			inf := e.workerInfo(worker)
   150  			e2 := e.workerKill(worker)
   151  			err = fmt.Errorf("SPExexuter error: encoding(%v), decoding(%v), child stderr: %#v, additional info: %s, killed (%v)",
   152  					errEnc, errDec, worker.PStderr.String(), inf, e2)
   153  	}
   154  
   155  	if err == nil && worker.PStderr.String() != "" {
   156  		log.Printf("Not empty worker stderr: \"%s\"", worker.PStderr.String())
   157  	}
   158  
   159  	return
   160  }
   161  
   162  // timeout task
   163  func (e *SPExecuter) workerTimeout(worker *process, cEnd chan int) {
   164  	select {
   165  	case <-cEnd:
   166  		cEnd <- 1
   167  		return
   168  	case <-time.After(e.timeout):
   169  		e.mTimeouts.Add(1)
   170  		err := e.workerKill(worker)
   171  		if err != nil {
   172  			log.Printf("%s", err)
   173  		}
   174  		<- cEnd
   175  		cEnd <- 2
   176  		return
   177  	}
   178  	panic("?!")
   179  }
   180  
   181  func (e *SPExecuter) workerInfo(worker *process) string {
   182  	if worker.Cmd == nil { return "<worker.Cmd == nil>" }
   183  
   184  	var status syscall.WaitStatus
   185  	wpid, err := syscall.Wait4(worker.Process.Pid, &status, syscall.WNOHANG, nil)
   186  	if err != nil {
   187  		return fmt.Sprintf("Wait4 error: %v", err)
   188  	}
   189  
   190  	_ = wpid
   191  	return fmt.Sprintf("exit code = %v", status.ExitStatus())
   192  }
   193  
   194  func (e *SPExecuter) workerKill(worker *process) error {
   195  	defer func() {
   196  		worker.Cmd = nil
   197  	}()
   198  
   199  	if worker.Cmd == nil || worker.Cmd.Process == nil {
   200  		return nil
   201  	}
   202  
   203  	e.mKills.Add(1)
   204  
   205  	err := worker.Process.Kill()
   206  	switch err {
   207  		case nil:
   208  			//
   209  		case syscall.ECHILD:
   210  			return nil
   211  		default:
   212  			if e, ok := err.(*os.SyscallError); ok && (e.Err == syscall.ECHILD || e.Err == syscall.ESRCH) {
   213  				return nil
   214  			}
   215  			return fmt.Errorf("SPExecuter: Process.Kill error: %#v", err)
   216  	}
   217  
   218  	// Zombies is not good for us...
   219  	_, err = worker.Process.Wait()
   220  	switch err {
   221  		case nil:
   222  			//
   223  		case syscall.ECHILD:
   224  			return nil
   225  		default:
   226  			if e, ok := err.(*os.SyscallError); ok && e.Err == syscall.ECHILD {
   227  				return nil
   228  			}
   229  			return fmt.Errorf("SPExecuter: Process.Wait error: %#v", err)
   230  	}
   231  
   232  	return nil
   233  }
   234  
   235  // Fetch worker (may be wait for free worker)
   236  func (e *SPExecuter) getWorker() (worker *process, err error) {
   237  	//Statistics
   238  	τ := e.mWorkerWaitTimer.NewObservation()
   239  	defer func() { τ.End() } ()
   240  
   241  	worker = <- e.workers
   242  
   243  	if worker == nil {
   244  		panic("nil worker")
   245  	}
   246  
   247  	if worker.Cmd != nil {
   248  		// Check process state
   249  		var status syscall.WaitStatus
   250  
   251  		// We can't use worker.ProcessState (it's available only after a call to Wait or Run)
   252  		wpid, err := syscall.Wait4(worker.Process.Pid, &status, syscall.WNOHANG, nil)
   253  
   254  		switch {
   255  			case err == nil && wpid == 0:
   256  				// Do nothing
   257  			case err == nil && status.Exited() || err == syscall.ECHILD:
   258  				worker.Cmd = nil
   259  			case err != nil:
   260  				if err2, ok := err.(*os.SyscallError); ok && err2.Err == syscall.ECHILD {
   261  					worker.Cmd = nil
   262  				} else {
   263  					log.Printf("SPExecuter: syscall.Wait4 error: %#v", err)
   264  					err = e.workerKill(worker)
   265  					if err != nil {
   266  						log.Printf("%s", err)
   267  					}
   268  				}
   269  			default:
   270  				// Do nothing
   271  		}
   272  	}
   273  
   274  	if worker.Cmd == nil {
   275  		ζ := e.mCreate.NewObservation()
   276  		defer func() { ζ.End() } ()
   277  
   278  		// Creating new subprocess
   279  		worker.Count = 0
   280  		worker.Cmd = exec.Command(e.cmdLine)
   281  		worker.PStdin, err = worker.Cmd.StdinPipe()
   282  		if err != nil {
   283  			worker.Cmd = nil
   284  			return
   285  		}
   286  		worker.PStdout, err = worker.Cmd.StdoutPipe()
   287  		if err != nil {
   288  			worker.Cmd = nil
   289  			return
   290  		}
   291  		worker.PStderr.Reset()
   292  		worker.Cmd.Stderr = &worker.PStderr
   293  		err = worker.Start()
   294  		if err != nil {
   295  			if strings.Contains(err.Error(), "cannot allocate memory") {
   296  				panic("cannot allocate memory")
   297  			}
   298  			worker.Cmd = nil
   299  			return
   300  		}
   301  	}
   302  
   303  	return
   304  }
   305  
   306  // Return worker to buffer
   307  func (e *SPExecuter) freeWorker(worker *process) {
   308  	// Increment count of execution for the worker
   309  	worker.Count++
   310  
   311  	// Check iteration count
   312  	if worker.Count >= e.maxIter {
   313  		err := e.workerKill(worker)
   314  		if err != nil {
   315  			log.Printf("%s", err)
   316  		}
   317  	}
   318  
   319  	// Return worker to the queue
   320  	e.workers <- worker
   321  }