github.com/kontera-technologies/go-supervisor/v2@v2.1.0/supervisor.go (about) 1 package supervisor 2 3 import ( 4 "errors" 5 "fmt" 6 "io" 7 "log" 8 "math" 9 "math/rand" 10 "os" 11 "os/exec" 12 "sync" 13 "sync/atomic" 14 "syscall" 15 "time" 16 ) 17 18 const maxDuration = 1<<63 - 1 19 const ( 20 defaultMaxSpawns = 1 21 defaultMaxSpawnAttempts = 10 22 defaultMaxSpawnBackOff = 2 * time.Minute 23 defaultMaxRespawnBackOff = 2 * time.Minute 24 defaultMaxInterruptAttempts = 5 25 defaultMaxTerminateAttempts = 5 26 defaultNotifyEventTimeout = time.Millisecond 27 defaultParserBufferSize = 4096 28 defaultIdleTimeout = 10 * time.Second 29 defaultRunTimeout = time.Duration(maxDuration) 30 defaultTerminationGraceTimeout = time.Second 31 defaultEventTimeFormat = time.RFC3339Nano 32 ) 33 34 var EnsureClosedTimeout = time.Second 35 36 type Event struct { 37 Id string 38 Code string 39 Message string 40 Time time.Time 41 TimeFormat string 42 } 43 44 func (ev Event) String() string { 45 if len(ev.Message) == 0 { 46 return fmt.Sprintf("[%30s][%s] %s", ev.Time.Format(ev.TimeFormat), ev.Id, ev.Code) 47 } 48 return fmt.Sprintf("[%s][%30s] %s - %s", ev.Time.Format(ev.TimeFormat), ev.Id, ev.Code, ev.Message) 49 } 50 51 const ( 52 ready uint32 = 1 << iota 53 running 54 respawning 55 stopped 56 errored 57 ) 58 59 func phaseString(s uint32) string { 60 str := "unknown" 61 switch s { 62 case ready: 63 str = "ready" 64 case running: 65 str = "running" 66 case respawning: 67 str = "respawning" 68 case stopped: 69 str = "stopped" 70 case errored: 71 str = "errored" 72 } 73 return fmt.Sprintf("%s(%d)", str, s) 74 } 75 76 type ProduceFn func() (*interface{}, error) 77 78 type Process struct { 79 cmd *exec.Cmd 80 pid int64 81 spawnCount int64 82 stopC chan bool 83 ensureAllClosed func() 84 85 phase uint32 86 phaseMu sync.Mutex 87 88 lastError atomic.Value 89 lastProcessState atomic.Value 90 91 opts *ProcessOptions 92 93 eventTimer *time.Timer 94 eventNotifierMu sync.Mutex 95 96 doneNotifier chan bool 97 rand *rand.Rand 98 stopSleep chan bool 99 } 100 101 func (p *Process) Input() chan<- []byte { 102 return p.opts.In 103 } 104 105 // EmptyInput empties all messages from the Input channel. 106 func (p *Process) EmptyInput() { 107 for { 108 select { 109 case _, ok := <-p.opts.In: 110 if !ok { 111 return 112 } 113 default: 114 return 115 } 116 } 117 } 118 119 func (p *Process) Stdout() <-chan *interface{} { 120 return p.opts.Out 121 } 122 123 func (p *Process) Stderr() <-chan *interface{} { 124 return p.opts.Err 125 } 126 127 func (p *Process) LastProcessState() *os.ProcessState { 128 v := p.lastProcessState.Load() 129 if v == nil { 130 return nil 131 } 132 return v.(*os.ProcessState) 133 } 134 135 func (p *Process) LastError() error { 136 v := p.lastError.Load() 137 if v == nil { 138 return nil 139 } 140 if x, ok := v.(error); ok { 141 return x 142 } 143 return nil 144 } 145 146 func (p *Process) Pid() int { 147 return int(atomic.LoadInt64(&p.pid)) 148 } 149 150 func (p *Process) Start() (err error) { 151 p.phaseMu.Lock() 152 defer p.phaseMu.Unlock() 153 if p.phase != ready && p.phase != respawning { 154 return fmt.Errorf(`process phase is "%s" and not "ready" or "respawning"`, phaseString(p.phase)) 155 } 156 157 for attempt := 0; p.opts.MaxSpawnAttempts == -1 || attempt < p.opts.MaxSpawnAttempts; attempt++ { 158 err = p.unprotectedStart() 159 if err == nil { 160 p.phase = running 161 return 162 } 163 if !p.sleep(p.CalcBackOff(attempt, time.Second, p.opts.MaxSpawnBackOff)) { 164 break 165 } 166 } 167 168 p.phase = errored 169 p.notifyDone() 170 return 171 } 172 173 func (p *Process) unprotectedStart() error { 174 p.cmd = newCommand(p.opts) 175 176 inPipe, err := p.cmd.StdinPipe() 177 if err != nil { 178 return fmt.Errorf("failed to fetch stdin pipe: %s", err) 179 } 180 181 outPipe, err := p.cmd.StdoutPipe() 182 if err != nil { 183 return fmt.Errorf("failed to fetch stdout pipe: %s", err) 184 } 185 186 errPipe, err := p.cmd.StderrPipe() 187 if err != nil { 188 return fmt.Errorf("failed to fetch stderr pipe: %s", err) 189 } 190 191 if p.opts.OutputParser == nil { 192 return errors.New("missing output streamer") 193 } 194 195 if p.opts.ErrorParser == nil { 196 return errors.New("missing error streamer") 197 } 198 199 if err = p.cmd.Start(); err != nil { 200 return err 201 } 202 203 atomic.AddInt64(&p.spawnCount, 1) 204 atomic.StoreInt64(&p.pid, int64(p.cmd.Process.Pid)) 205 206 p.stopC = make(chan bool) 207 heartbeat, isMonitorClosed, isInClosed, isOutClosed, isErrClosed := make(chan bool), make(chan bool), make(chan bool), make(chan bool), make(chan bool) 208 209 go chanToWriter(p.opts.In, inPipe, p.notifyEvent, isInClosed, p.stopC, heartbeat) 210 go readerToChan(p.opts.OutputParser(outPipe, p.opts.ParserBufferSize), p.opts.Out, isOutClosed, p.stopC, heartbeat) 211 go readerToChan(p.opts.ErrorParser(errPipe, p.opts.ParserBufferSize), p.opts.Err, isErrClosed, p.stopC, nil) 212 213 go MonitorHeartBeat(p.opts.IdleTimeout, p.opts.RunTimeout, heartbeat, isMonitorClosed, p.stopC, p.Stop, p.notifyEvent) 214 215 var ensureOnce sync.Once 216 p.ensureAllClosed = func() { 217 ensureOnce.Do(func() { 218 select { 219 case <-p.stopC: 220 default: 221 log.Printf("[%s] ensureAllClosed was called before stopC channel was closed.", p.opts.Id) 222 } 223 if p.opts.Debug { 224 log.Printf("[%s] Starting to ensure all pipes have closed.", p.opts.Id) 225 } 226 if cErr := ensureClosed("stdin", isInClosed, inPipe.Close); cErr != nil { 227 log.Printf("[%s] Possible memory leak, stdin go-routine not closed. Error: %s", p.opts.Id, cErr) 228 } 229 if cErr := ensureClosed("stdout", isOutClosed, outPipe.Close); cErr != nil { 230 log.Printf("[%s] Possible memory leak, stdout go-routine not closed. Error: %s", p.opts.Id, cErr) 231 } 232 if cErr := ensureClosed("stderr", isErrClosed, errPipe.Close); cErr != nil { 233 log.Printf("[%s] Possible memory leak, stderr go-routine not closed. Error: %s", p.opts.Id, cErr) 234 } 235 if cErr := ensureClosed("heartbeat monitor", isMonitorClosed, nil); cErr != nil { 236 log.Printf("[%s] Possible memory leak, monitoring go-routine not closed. Error: %s", p.opts.Id, cErr) 237 } 238 }) 239 } 240 241 go p.waitAndNotify() 242 243 p.notifyEvent("ProcessStart", fmt.Sprintf("pid: %d", p.Pid())) 244 return nil 245 } 246 247 func chanToWriter(in <-chan []byte, out io.Writer, notifyEvent func(string, ...interface{}), closeWhenDone, stopC, heartbeat chan bool) { 248 defer close(closeWhenDone) 249 for { 250 select { 251 case <-stopC: 252 return 253 case raw, chanOpen := <-in: 254 if !chanOpen { 255 notifyEvent("Error", "Input channel closed unexpectedly.") 256 return 257 } 258 259 _, err := out.Write(raw) 260 if err != nil { 261 notifyEvent("WriteError", err.Error()) 262 return 263 } 264 heartbeat <- true 265 } 266 } 267 } 268 269 func readerToChan(producer ProduceFn, out chan<- *interface{}, closeWhenDone, stopC, heartbeat chan bool) { 270 defer close(closeWhenDone) 271 272 cleanPipe := func() { 273 for { 274 if res, err := producer(); res != nil { 275 select { 276 case out <- res: 277 default: 278 // During cleaning, throw out messages if they are not collect right away. 279 } 280 } else if err != nil { 281 return 282 } 283 } 284 } 285 286 for { 287 if res, err := producer(); res != nil { 288 select { 289 case out <- res: 290 select { 291 case heartbeat <- true: 292 default: 293 } 294 case <-stopC: 295 cleanPipe() 296 return 297 } 298 } else if err != nil { 299 return 300 } 301 302 select { 303 case <-stopC: 304 cleanPipe() 305 return 306 default: 307 } 308 } 309 } 310 311 // MonitorHeartBeat monitors the heartbeat channel and stops the process if idleTimeout time is passed without a 312 // positive heartbeat, or if a negative heartbeat is passed, or if the run timeout passed. 313 // 314 // isMonitorClosed will be closed when this function exists. 315 // 316 // When stopC closes, this function will exit immediately. 317 func MonitorHeartBeat(idleTimeout time.Duration, runTimeout time.Duration, heartbeat, isMonitorClosed, stopC chan bool, stop func() error, notifyEvent func(string, ...interface{})) { 318 t := time.NewTimer(idleTimeout) 319 r := time.NewTimer(runTimeout) 320 defer t.Stop() 321 defer r.Stop() 322 for alive := true; alive; { 323 select { 324 case <-stopC: 325 notifyEvent("StoppingHeartbeatMonitoring", "Stop signal received.") 326 close(isMonitorClosed) 327 return // Return early to avoid calling stop() 328 329 case alive = <-heartbeat: 330 if alive { 331 if !t.Stop() { 332 <-t.C 333 } 334 t.Reset(idleTimeout) 335 } else { 336 notifyEvent("NegativeHeartbeat", "Stopping process.") 337 } 338 339 case <-t.C: 340 alive = false 341 notifyEvent("MissingHeartbeat", "Stopping process.") 342 case <-r.C: 343 alive = false 344 notifyEvent("RunTimePassed", "Stopping process.") 345 } 346 } 347 348 close(isMonitorClosed) 349 if err := stop(); err != nil { 350 notifyEvent("StopError", err.Error()) 351 } 352 } 353 354 func (p *Process) waitAndNotify() { 355 state, waitErr := p.cmd.Process.Wait() 356 357 p.phaseMu.Lock() 358 automaticUnlock := true 359 defer func() { 360 if automaticUnlock { 361 p.phaseMu.Unlock() 362 } 363 }() 364 365 p.lastProcessState.Store(state) 366 367 if p.phase == stopped { 368 return 369 } else if p.phase != running && p.phase != respawning { 370 p.notifyEvent("RespawnError", fmt.Sprintf(`process phase is "%s" and not "running" or "respawning"`, phaseString(p.phase))) 371 } 372 373 p.phase = stopped 374 375 if waitErr != nil { 376 p.notifyEvent("WaitError", fmt.Sprintf("os.Process.Wait returned an error - %s", waitErr.Error())) 377 p.phase = errored 378 return 379 } 380 381 if state.Success() { 382 p.notifyEvent("ProcessDone", state.String()) 383 } else { 384 p.notifyEvent("ProcessCrashed", state.String()) 385 p.lastError.Store(errors.New(state.String())) 386 } 387 388 // Cleanup resources 389 select { 390 case <-p.stopC: 391 default: 392 close(p.stopC) 393 } 394 p.ensureAllClosed() 395 396 if !p.canRespawn() { 397 p.notifyEvent("RespawnError", "Max number of respawns reached.") 398 p.notifyDone() 399 return 400 } 401 402 sleepFor := p.CalcBackOff(int(atomic.LoadInt64(&p.spawnCount))-1, time.Second, p.opts.MaxRespawnBackOff) 403 p.notifyEvent("Sleep", fmt.Sprintf("Sleeping for %s before respwaning instance.", sleepFor.String())) 404 if !p.sleep(sleepFor) { 405 return 406 } 407 408 p.phase = respawning 409 p.notifyEvent("ProcessRespawn", "Trying to respawn instance.") 410 411 automaticUnlock = false 412 p.phaseMu.Unlock() 413 err := p.Start() 414 415 if err != nil { 416 p.notifyEvent("RespawnError", err.Error()) 417 } 418 } 419 420 func (p *Process) sleep(d time.Duration) bool { 421 t := time.NewTimer(d) 422 select { 423 case <-t.C: 424 return true 425 case <-p.stopSleep: 426 t.Stop() 427 return false 428 } 429 } 430 431 func (p *Process) canRespawn() bool { 432 return p.opts.MaxSpawns == -1 || atomic.LoadInt64(&p.spawnCount) < int64(p.opts.MaxSpawns) 433 } 434 435 // Stop tries to stop the process. 436 // Entering this function will change the phase from "running" to "stopping" (any other initial phase will cause an error 437 // to be returned). 438 // 439 // This function will call notifyDone when it is done. 440 // 441 // If it fails to stop the process, the phase will change to errored and an error will be returned. 442 // Otherwise, the phase changes to stopped. 443 func (p *Process) Stop() error { 444 select { 445 case <-p.stopSleep: 446 default: 447 close(p.stopSleep) 448 } 449 p.phaseMu.Lock() 450 defer p.phaseMu.Unlock() 451 defer p.notifyDone() 452 err := p.unprotectedStop() 453 if err != nil { 454 p.phase = errored 455 return err 456 } 457 p.phase = stopped 458 return nil 459 } 460 461 func (p *Process) unprotectedStop() (err error) { 462 p.notifyEvent("ProcessStop") 463 464 select { 465 case <-p.stopC: 466 default: 467 close(p.stopC) 468 } 469 defer p.ensureAllClosed() 470 471 if !p.IsAlive() { 472 return nil 473 } 474 475 attempt := 0 476 for ; attempt < p.opts.MaxInterruptAttempts; attempt++ { 477 p.notifyEvent("Interrupt", fmt.Sprintf("sending intterupt signal to %d - attempt #%d", -p.Pid(), attempt+1)) 478 err = p.interrupt() 479 if err == nil { 480 return nil 481 } 482 } 483 if p.opts.MaxInterruptAttempts > 0 { 484 p.notifyEvent("InterruptError", fmt.Sprintf("interrupt signal failed - %d attempts", attempt)) 485 } 486 487 err = nil 488 for attempt = 0; attempt < p.opts.MaxTerminateAttempts; attempt++ { 489 p.notifyEvent("Terminate", fmt.Sprintf("sending terminate signal to %d - attempt #%d", -p.Pid(), attempt+1)) 490 err = p.terminate() 491 if err == nil { 492 return nil 493 } 494 } 495 if p.opts.MaxTerminateAttempts > 0 { 496 p.notifyEvent("TerminateError", fmt.Sprintf("terminate signal failed - %d attempts", attempt)) 497 } 498 499 p.notifyEvent("Killing", fmt.Sprintf("sending kill signal to %d", p.Pid())) 500 err = syscall.Kill(-p.Pid(), syscall.SIGKILL) 501 502 if err != nil { 503 p.notifyEvent("KillError", err.Error()) 504 return err 505 } 506 507 return nil 508 } 509 510 // Restart tries to stop and start the process. 511 // Entering this function will change the phase from running to respawning (any other initial phase will cause an error 512 // to be returned). 513 // 514 // If it fails to stop the process the phase will change to errored and notifyDone will be called. 515 // If there are no more allowed respawns the phase will change to stopped and notifyDone will be called. 516 // 517 // This function calls Process.Start to start the process which will change the phase to "running" (or "errored" if it 518 // fails) 519 // If Start fails, notifyDone will be called. 520 func (p *Process) Restart() error { 521 p.phaseMu.Lock() 522 defer p.phaseMu.Unlock() 523 if p.phase != running { 524 return fmt.Errorf(`process phase is "%s" and not "running"`, phaseString(p.phase)) 525 } 526 p.phase = respawning 527 err := p.unprotectedStop() 528 529 if err != nil { 530 p.phase = errored 531 p.notifyDone() 532 return err 533 } 534 535 if !p.canRespawn() { 536 p.phase = stopped 537 p.notifyDone() 538 return errors.New("max number of respawns reached") 539 } 540 541 return nil 542 } 543 544 func (p *Process) IsAlive() bool { 545 err := syscall.Kill(-p.Pid(), syscall.Signal(0)) 546 if errno, ok := err.(syscall.Errno); ok { 547 return errno != syscall.ESRCH 548 } 549 return true 550 } 551 552 func (p *Process) IsDone() bool { 553 select { 554 case <-p.doneNotifier: 555 return true 556 default: 557 return false 558 } 559 } 560 561 func (p *Process) DoneNotifier() <-chan bool { 562 return p.doneNotifier 563 } 564 565 // notifyDone closes the DoneNotifier channel (if it isn't already closed). 566 func (p *Process) notifyDone() { 567 select { 568 case <-p.doneNotifier: 569 default: 570 close(p.doneNotifier) 571 } 572 } 573 574 // EventNotifier returns the eventNotifier channel (and creates one if none exists). 575 // 576 // It is protected by Process.eventNotifierMu. 577 func (p *Process) EventNotifier() chan Event { 578 p.eventNotifierMu.Lock() 579 defer p.eventNotifierMu.Unlock() 580 581 if p.opts.EventNotifier == nil { 582 p.opts.EventNotifier = make(chan Event) 583 } 584 585 return p.opts.EventNotifier 586 } 587 588 // notifyEvent creates and passes an event struct from an event code string and an optional event message. 589 // fmt.Sprint will be called on the message slice. 590 // 591 // It is protected by Process.eventNotifierMu. 592 func (p *Process) notifyEvent(code string, message ...interface{}) { 593 // Create the event before calling Lock. 594 ev := Event{ 595 Id: p.opts.Id, 596 Code: code, 597 Message: fmt.Sprint(message...), 598 Time: time.Now(), 599 TimeFormat: p.opts.EventTimeFormat, 600 } 601 602 // Log the event before calling Lock. 603 if p.opts.Debug { 604 fmt.Println(ev) 605 } 606 607 p.eventNotifierMu.Lock() 608 defer p.eventNotifierMu.Unlock() 609 610 if notifier := p.opts.EventNotifier; notifier != nil { 611 if p.eventTimer == nil { 612 p.eventTimer = time.NewTimer(p.opts.NotifyEventTimeout) 613 } else { 614 p.eventTimer.Reset(p.opts.NotifyEventTimeout) 615 } 616 617 select { 618 case notifier <- ev: 619 if !p.eventTimer.Stop() { 620 <-p.eventTimer.C 621 } 622 case <-p.eventTimer.C: 623 log.Printf("Failed to sent %#v. EventNotifier is set, but isn't accepting any events.", ev) 624 } 625 } 626 } 627 628 func (p *Process) interrupt() (err error) { 629 err = syscall.Kill(-p.Pid(), syscall.SIGINT) 630 if err != nil { 631 return 632 } 633 634 time.Sleep(p.opts.TerminationGraceTimeout) // Sleep for a second to allow the process to end. 635 if p.IsAlive() { 636 err = errors.New("interrupt signal failed") 637 } 638 return 639 } 640 641 func (p *Process) terminate() (err error) { 642 err = syscall.Kill(-p.Pid(), syscall.SIGTERM) 643 if err != nil { 644 return 645 } 646 647 time.Sleep(p.opts.TerminationGraceTimeout) // Sleep for a second to allow the process to end. 648 if p.IsAlive() { 649 err = errors.New("terminate signal failed") 650 } 651 return 652 } 653 654 func (p *Process) CalcBackOff(attempt int, step time.Duration, maxBackOff time.Duration) time.Duration { 655 randBuffer := (step / 1000) * time.Duration(p.rand.Intn(1000)) 656 backOff := randBuffer + step*time.Duration(math.Exp2(float64(attempt))) 657 if backOff > maxBackOff { 658 return maxBackOff 659 } 660 return backOff 661 } 662 663 func NewProcess(opts ProcessOptions) *Process { 664 return &Process{ 665 phase: ready, 666 opts: initProcessOptions(opts), 667 doneNotifier: make(chan bool), 668 stopSleep: make(chan bool), 669 rand: rand.New(rand.NewSource(time.Now().UTC().UnixNano())), 670 } 671 } 672 673 // newCommand creates a new exec.Cmd struct. 674 func newCommand(opts *ProcessOptions) *exec.Cmd { 675 cmd := exec.Command(opts.Name, opts.Args...) 676 cmd.Env = opts.Env 677 cmd.Dir = opts.Dir 678 cmd.ExtraFiles = opts.ExtraFiles 679 cmd.SysProcAttr = opts.SysProcAttr 680 return cmd 681 } 682 683 // todo: test if panics on double-close 684 func ensureClosed(name string, isStopped chan bool, forceClose func() error) error { 685 t := time.NewTimer(EnsureClosedTimeout) 686 defer t.Stop() 687 688 select { 689 case <-isStopped: 690 return nil 691 case <-t.C: 692 if forceClose == nil { 693 return fmt.Errorf("stopped waiting for %s after %s", name, EnsureClosedTimeout) 694 } 695 if err := forceClose(); err != nil { 696 return fmt.Errorf("%s - %s", name, err.Error()) 697 } 698 699 return ensureClosed(name, isStopped, nil) 700 } 701 }