github.com/orofarne/hammy@v0.0.0-20130409105742-374fadfd6ecb/src/hammy/spexecuter.go (about) 1 package hammy 2 3 import ( 4 "fmt" 5 "io" 6 "time" 7 "os" 8 "os/exec" 9 "bytes" 10 "bufio" 11 "syscall" 12 "log" 13 "strings" 14 "github.com/ugorji/go-msgpack" 15 ) 16 17 18 type process struct { 19 *exec.Cmd 20 Count uint 21 PStdin io.Writer 22 PStdout io.Reader 23 PStderr bytes.Buffer 24 } 25 26 type WorkerProcessInput struct { 27 Hostname string 28 Trigger string 29 State *State 30 IData IncomingHostData 31 } 32 33 type WorkerProcessOutput struct { 34 CmdBuffer *CmdBuffer 35 State *State 36 } 37 38 // Executer implementation for subprocesses with MessagePack-based RPC 39 type SPExecuter struct { 40 cmdLine string 41 maxIter uint 42 workers chan *process 43 timeout time.Duration 44 45 //Metrics 46 ms *MetricSet 47 mRequest *TimerMetric 48 mExecTimer *TimerMetric 49 mWorkerWaitTimer *TimerMetric 50 mErrors *CounterMetric 51 mCreate *TimerMetric 52 mKills *CounterMetric 53 mTimeouts *CounterMetric 54 } 55 56 // Create new instance of SPExecutor 57 // per process 58 func NewSPExecuter(cfg Config, metricNamespace string) *SPExecuter { 59 if cfg.Workers.PoolSize < 1 || cfg.Workers.CmdLine == "" { 60 panic("Invalid argument") 61 } 62 63 e := new(SPExecuter) 64 e.cmdLine = cfg.Workers.CmdLine 65 e.maxIter = cfg.Workers.MaxIter 66 e.workers = make(chan *process, cfg.Workers.PoolSize) 67 e.timeout = time.Duration(cfg.Workers.Timeout) * time.Second 68 69 for i := uint(0); i < cfg.Workers.PoolSize; i++ { 70 e.workers <- &process{} 71 } 72 73 e.ms = NewMetricSet(metricNamespace, 30 * time.Second) 74 e.mRequest = e.ms.NewTimer("request") 75 e.mExecTimer = e.ms.NewTimer("exec") 76 e.mWorkerWaitTimer = e.ms.NewTimer("worker_wait") 77 e.mErrors = e.ms.NewCounter("errors") 78 e.mCreate = e.ms.NewTimer("create") 79 e.mKills = e.ms.NewCounter("kill") 80 e.mTimeouts = e.ms.NewCounter("timeouts") 81 82 return e 83 } 84 85 func (e *SPExecuter) ProcessTrigger(key string, trigger string, state *State, 86 data IncomingHostData) (newState *State, cmdb *CmdBuffer, err error) { 87 // 88 ζ := e.mRequest.NewObservation() 89 defer func() { ζ.End() } () 90 91 cmdb = NewCmdBuffer(0) 92 newState = NewState() 93 res := WorkerProcessOutput{ 94 CmdBuffer: cmdb, 95 State: newState, 96 } 97 98 defer func() { if err != nil { e.mErrors.Add(1) } } () 99 100 // Fetch worker (may be wait for free worker) 101 worker, err := e.getWorker() 102 defer e.freeWorker(worker) 103 if err != nil { 104 return 105 } 106 107 //Setup statistics 108 τ := e.mExecTimer.NewObservation() 109 defer func() { τ.End() } () 110 111 // Set up timeout 112 cEnd := make(chan int) 113 go e.workerTimeout(worker, cEnd) 114 115 // marshal and send args 116 pInput := WorkerProcessInput{ 117 Hostname: key, 118 Trigger: trigger, 119 State: state, 120 IData: data, 121 } 122 123 var errDec error 124 buf, errEnc := msgpack.Marshal(pInput) 125 if errEnc == nil { 126 cEnc := make(chan error) 127 go func() { 128 _, e := worker.PStdin.Write(buf) 129 cEnc <- e 130 }() 131 132 // wait, read and unmarshal result 133 buffer := bufio.NewReader(worker.PStdout) 134 dec := msgpack.NewDecoder(buffer, nil) 135 errDec = dec.Decode(&res) 136 errEnc = <- cEnc 137 } 138 139 cEnd <- 1 140 toRes := <- cEnd 141 switch { 142 case toRes == 2 && errEnc == nil && errDec == nil: 143 // FIXME 144 log.Printf(">_<") 145 case toRes == 2: 146 err = fmt.Errorf("SPExexuter timeout for host %v, errors: encoding(%v), decoding(%v), child stderr: %#v", 147 key, errEnc, errDec, worker.PStderr.String()) 148 case errEnc != nil || errDec != nil: 149 inf := e.workerInfo(worker) 150 e2 := e.workerKill(worker) 151 err = fmt.Errorf("SPExexuter error: encoding(%v), decoding(%v), child stderr: %#v, additional info: %s, killed (%v)", 152 errEnc, errDec, worker.PStderr.String(), inf, e2) 153 } 154 155 if err == nil && worker.PStderr.String() != "" { 156 log.Printf("Not empty worker stderr: \"%s\"", worker.PStderr.String()) 157 } 158 159 return 160 } 161 162 // timeout task 163 func (e *SPExecuter) workerTimeout(worker *process, cEnd chan int) { 164 select { 165 case <-cEnd: 166 cEnd <- 1 167 return 168 case <-time.After(e.timeout): 169 e.mTimeouts.Add(1) 170 err := e.workerKill(worker) 171 if err != nil { 172 log.Printf("%s", err) 173 } 174 <- cEnd 175 cEnd <- 2 176 return 177 } 178 panic("?!") 179 } 180 181 func (e *SPExecuter) workerInfo(worker *process) string { 182 if worker.Cmd == nil { return "<worker.Cmd == nil>" } 183 184 var status syscall.WaitStatus 185 wpid, err := syscall.Wait4(worker.Process.Pid, &status, syscall.WNOHANG, nil) 186 if err != nil { 187 return fmt.Sprintf("Wait4 error: %v", err) 188 } 189 190 _ = wpid 191 return fmt.Sprintf("exit code = %v", status.ExitStatus()) 192 } 193 194 func (e *SPExecuter) workerKill(worker *process) error { 195 defer func() { 196 worker.Cmd = nil 197 }() 198 199 if worker.Cmd == nil || worker.Cmd.Process == nil { 200 return nil 201 } 202 203 e.mKills.Add(1) 204 205 err := worker.Process.Kill() 206 switch err { 207 case nil: 208 // 209 case syscall.ECHILD: 210 return nil 211 default: 212 if e, ok := err.(*os.SyscallError); ok && (e.Err == syscall.ECHILD || e.Err == syscall.ESRCH) { 213 return nil 214 } 215 return fmt.Errorf("SPExecuter: Process.Kill error: %#v", err) 216 } 217 218 // Zombies is not good for us... 219 _, err = worker.Process.Wait() 220 switch err { 221 case nil: 222 // 223 case syscall.ECHILD: 224 return nil 225 default: 226 if e, ok := err.(*os.SyscallError); ok && e.Err == syscall.ECHILD { 227 return nil 228 } 229 return fmt.Errorf("SPExecuter: Process.Wait error: %#v", err) 230 } 231 232 return nil 233 } 234 235 // Fetch worker (may be wait for free worker) 236 func (e *SPExecuter) getWorker() (worker *process, err error) { 237 //Statistics 238 τ := e.mWorkerWaitTimer.NewObservation() 239 defer func() { τ.End() } () 240 241 worker = <- e.workers 242 243 if worker == nil { 244 panic("nil worker") 245 } 246 247 if worker.Cmd != nil { 248 // Check process state 249 var status syscall.WaitStatus 250 251 // We can't use worker.ProcessState (it's available only after a call to Wait or Run) 252 wpid, err := syscall.Wait4(worker.Process.Pid, &status, syscall.WNOHANG, nil) 253 254 switch { 255 case err == nil && wpid == 0: 256 // Do nothing 257 case err == nil && status.Exited() || err == syscall.ECHILD: 258 worker.Cmd = nil 259 case err != nil: 260 if err2, ok := err.(*os.SyscallError); ok && err2.Err == syscall.ECHILD { 261 worker.Cmd = nil 262 } else { 263 log.Printf("SPExecuter: syscall.Wait4 error: %#v", err) 264 err = e.workerKill(worker) 265 if err != nil { 266 log.Printf("%s", err) 267 } 268 } 269 default: 270 // Do nothing 271 } 272 } 273 274 if worker.Cmd == nil { 275 ζ := e.mCreate.NewObservation() 276 defer func() { ζ.End() } () 277 278 // Creating new subprocess 279 worker.Count = 0 280 worker.Cmd = exec.Command(e.cmdLine) 281 worker.PStdin, err = worker.Cmd.StdinPipe() 282 if err != nil { 283 worker.Cmd = nil 284 return 285 } 286 worker.PStdout, err = worker.Cmd.StdoutPipe() 287 if err != nil { 288 worker.Cmd = nil 289 return 290 } 291 worker.PStderr.Reset() 292 worker.Cmd.Stderr = &worker.PStderr 293 err = worker.Start() 294 if err != nil { 295 if strings.Contains(err.Error(), "cannot allocate memory") { 296 panic("cannot allocate memory") 297 } 298 worker.Cmd = nil 299 return 300 } 301 } 302 303 return 304 } 305 306 // Return worker to buffer 307 func (e *SPExecuter) freeWorker(worker *process) { 308 // Increment count of execution for the worker 309 worker.Count++ 310 311 // Check iteration count 312 if worker.Count >= e.maxIter { 313 err := e.workerKill(worker) 314 if err != nil { 315 log.Printf("%s", err) 316 } 317 } 318 319 // Return worker to the queue 320 e.workers <- worker 321 }