github.com/keybase/client/go@v0.0.0-20241007131713-f10651d043c8/updater/watchdog/watchdog.go (about)

     1  // Copyright 2016 Keybase, Inc. All rights reserved. Use of
     2  // this source code is governed by the included BSD license.
     3  
     4  package watchdog
     5  
import (
	"os"
	"os/exec"
	"sync"
	"time"

	"github.com/keybase/client/go/updater/process"
)
    13  
    14  // ExitOn describes when a program should exit (not-restart)
    15  type ExitOn string
    16  
    17  const (
    18  	// ExitOnNone means the program should always be restarted
    19  	ExitOnNone ExitOn = ""
    20  	// ExitOnSuccess means the program should only restart if errored
    21  	ExitOnSuccess ExitOn = "success"
    22  	// ExitAllOnSuccess means the program should only restart if errored,
    23  	// otherwise exit this watchdog. Intended for Windows
    24  	ExitAllOnSuccess ExitOn = "all"
    25  )
    26  
    27  const terminationDelay = 200 * time.Millisecond
    28  const heartbealDelay = 1 * time.Hour
    29  
// Program is a program at path with arguments
type Program struct {
	// Path is the executable to launch; it is also what already-running
	// processes are matched against before the watchdog starts.
	Path string
	// Name is the label used for this program in log messages.
	Name string
	// Args are the command-line arguments passed to Path.
	Args []string
	// ExitOn controls whether the program is restarted after a clean exit.
	ExitOn ExitOn
	// runningPid is the pid of the process started by Run, or 0 when no
	// process is known to be running.
	runningPid int
}
    38  
// Log is the logging interface for the watchdog package.
// Each method takes a printf-style format string followed by its arguments.
type Log interface {
	Debugf(s string, args ...interface{})
	Infof(s string, args ...interface{})
	Warningf(s string, args ...interface{})
	Errorf(s string, args ...interface{})
}
    46  
    47  type Watchdog struct {
    48  	Programs     []Program
    49  	RestartDelay time.Duration
    50  	Log          Log
    51  	shutdownCh   chan (struct{})
    52  }
    53  
    54  func Watch(programs []Program, restartDelay time.Duration, log Log) error {
    55  	w := Watchdog{
    56  		Programs:     programs,
    57  		RestartDelay: restartDelay,
    58  		Log:          log,
    59  		shutdownCh:   make(chan struct{}),
    60  	}
    61  	w.Log.Infof("Terminating any existing programs we will be monitoring")
    62  	w.terminateExistingMatches()
    63  	// Start monitoring all the programs
    64  	w.Log.Infof("about to start %+v\n", w.Programs)
    65  	for idx := range w.Programs {
    66  		// modifies the underlying
    67  		go w.startProgram(idx)
    68  	}
    69  	go w.heartbeatToLog(heartbealDelay)
    70  	return nil
    71  }
    72  
    73  func (w *Watchdog) Shutdown() {
    74  	w.Log.Infof("attempting a graceful exit of all of the watchdog's programs")
    75  	close(w.shutdownCh)
    76  	time.Sleep(terminationDelay)
    77  	for i := 1; i <= 3; i++ {
    78  		for _, p := range w.Programs {
    79  			_ = p.dieIfRunning(w.Log)
    80  		}
    81  		time.Sleep(terminationDelay)
    82  	}
    83  	w.Log.Infof("done terminating all watched programs - exiting process")
    84  	os.Exit(0)
    85  }
    86  
    87  func (p *Program) dieIfRunning(log Log) bool {
    88  	if p.runningPid != 0 {
    89  		log.Infof("%s running at %d is asked to die", p.Name, p.runningPid)
    90  		_ = process.TerminatePID(p.runningPid, terminationDelay, log)
    91  		p.runningPid = 0
    92  		return true
    93  	}
    94  	log.Debugf("%s did not appear to be running so it was not terminated", p.Name)
    95  	return false
    96  }
    97  
    98  func (p *Program) Run(log Log, shutdownCh chan struct{}) (err error) {
    99  	p.dieIfRunning(log)
   100  	cmd := exec.Command(p.Path, p.Args...)
   101  	if err = cmd.Start(); err != nil {
   102  		log.Errorf("Error starting %#v, err: %s", p, err.Error())
   103  		return err
   104  	}
   105  	p.runningPid = cmd.Process.Pid
   106  	log.Infof("Started %s at %d", p.Name, cmd.Process.Pid)
   107  	err = cmd.Wait()
   108  	p.runningPid = 0
   109  	return err
   110  }
   111  
// heartbeat is one program's entry in a heartbeat log line.
type heartbeat struct {
	name string // the program's Name
	pid  int    // the program's runningPid when the heartbeat was taken
}
   116  
   117  func (w *Watchdog) heartbeatToLog(delay time.Duration) {
   118  	// wait enough time for the first heartbeat so it's actually useful
   119  	time.Sleep(1 * time.Minute)
   120  	for {
   121  		var heartbeats []heartbeat
   122  		for _, p := range w.Programs {
   123  			heartbeats = append(heartbeats, heartbeat{p.Name, p.runningPid})
   124  		}
   125  		w.Log.Infof("heartbeating programs: %v", heartbeats)
   126  		select {
   127  		case <-w.shutdownCh:
   128  			w.Log.Infof("watchdog is shutting down, stop heartbeating")
   129  			return
   130  		case <-time.After(delay):
   131  			continue
   132  		}
   133  	}
   134  }
   135  
   136  // watchProgram will monitor a program and restart it if it exits.
   137  // This method will run forever.
   138  func (w *Watchdog) startProgram(idx int) {
   139  	program := &(w.Programs[idx])
   140  	for {
   141  		start := time.Now()
   142  		err := program.Run(w.Log, w.shutdownCh)
   143  		if err != nil {
   144  			w.Log.Errorf("Error running %s: %+v; %s", program.Name, program, err)
   145  		} else {
   146  			w.Log.Infof("%s finished: %+v", program.Name, program)
   147  			if program.ExitOn == ExitOnSuccess {
   148  				w.Log.Infof("Program %s configured to exit on success, not restarting", program.Name)
   149  				break
   150  			} else if program.ExitOn == ExitAllOnSuccess {
   151  				w.Log.Infof("Program %s configured to trigger full watchdog shutdown", program.Name)
   152  				w.Shutdown()
   153  			}
   154  		}
   155  		w.Log.Infof("Program %s ran for %s", program.Name, time.Since(start))
   156  		select {
   157  		case <-w.shutdownCh:
   158  			w.Log.Infof("watchdog is shutting down, not restarting %s", program.Name)
   159  			return
   160  		default:
   161  		}
   162  		if time.Since(start) < w.RestartDelay {
   163  			w.Log.Infof("Waiting %s before trying to start %s command again", w.RestartDelay, program.Name)
   164  			time.Sleep(w.RestartDelay)
   165  		}
   166  	}
   167  }
   168  
   169  // terminateExistingMatches aggressively kills anything running that looks like similar
   170  // to what this watchdog will be running. the goal here is to be sure that, if multiple
   171  // calls attempt to start a watchdog, the last one will be the only one that survives.
   172  func (w *Watchdog) terminateExistingMatches() {
   173  	w.Log.Infof("Terminate any existing programs that look like matches")
   174  	var killedPids []int
   175  	for i := 1; i <= 3; i++ {
   176  		killedPids = w.killSimilarRunningPrograms()
   177  		if !includesARealProcess(killedPids) {
   178  			w.Log.Infof("none of these programs are running")
   179  			return
   180  		}
   181  		w.Log.Infof("Terminated pids %v", killedPids)
   182  		time.Sleep(terminationDelay)
   183  	}
   184  }
   185  
   186  func includesARealProcess(pids []int) bool {
   187  	for _, p := range pids {
   188  		if p > 0 {
   189  			return true
   190  		}
   191  	}
   192  	return false
   193  }
   194  
   195  func (w *Watchdog) killSimilarRunningPrograms() (killedPids []int) {
   196  	// kill any running processes that look like the ones this watchdog wants to watch
   197  	// this logic also exists in the updater, so if you want to change it, look there too.
   198  	ospid := os.Getpid()
   199  	for _, program := range w.Programs {
   200  		matcher := process.NewMatcher(program.Path, process.PathEqual, w.Log)
   201  		matcher.ExceptPID(ospid)
   202  		w.Log.Infof("Terminating %s", program.Name)
   203  		pids := process.TerminateAll(matcher, time.Second, w.Log)
   204  		killedPids = append(killedPids, pids...)
   205  	}
   206  	return killedPids
   207  }