github.com/keybase/client/go@v0.0.0-20241007131713-f10651d043c8/updater/watchdog/watchdog.go (about) 1 // Copyright 2016 Keybase, Inc. All rights reserved. Use of 2 // this source code is governed by the included BSD license. 3 4 package watchdog 5 6 import ( 7 "os" 8 "os/exec" 9 "time" 10 11 "github.com/keybase/client/go/updater/process" 12 ) 13 14 // ExitOn describes when a program should exit (not-restart) 15 type ExitOn string 16 17 const ( 18 // ExitOnNone means the program should always be restarted 19 ExitOnNone ExitOn = "" 20 // ExitOnSuccess means the program should only restart if errored 21 ExitOnSuccess ExitOn = "success" 22 // ExitAllOnSuccess means the program should only restart if errored, 23 // otherwise exit this watchdog. Intended for Windows 24 ExitAllOnSuccess ExitOn = "all" 25 ) 26 27 const terminationDelay = 200 * time.Millisecond 28 const heartbealDelay = 1 * time.Hour 29 30 // Program is a program at path with arguments 31 type Program struct { 32 Path string 33 Name string 34 Args []string 35 ExitOn ExitOn 36 runningPid int 37 } 38 39 // Log is the logging interface for the watchdog package 40 type Log interface { 41 Debugf(s string, args ...interface{}) 42 Infof(s string, args ...interface{}) 43 Warningf(s string, args ...interface{}) 44 Errorf(s string, args ...interface{}) 45 } 46 47 type Watchdog struct { 48 Programs []Program 49 RestartDelay time.Duration 50 Log Log 51 shutdownCh chan (struct{}) 52 } 53 54 func Watch(programs []Program, restartDelay time.Duration, log Log) error { 55 w := Watchdog{ 56 Programs: programs, 57 RestartDelay: restartDelay, 58 Log: log, 59 shutdownCh: make(chan struct{}), 60 } 61 w.Log.Infof("Terminating any existing programs we will be monitoring") 62 w.terminateExistingMatches() 63 // Start monitoring all the programs 64 w.Log.Infof("about to start %+v\n", w.Programs) 65 for idx := range w.Programs { 66 // modifies the underlying 67 go w.startProgram(idx) 68 } 69 go w.heartbeatToLog(heartbealDelay) 70 return nil 71 } 72 73 func (w *Watchdog) Shutdown() { 74 w.Log.Infof("attempting a graceful exit of all of the watchdog's programs") 75 close(w.shutdownCh) 76 time.Sleep(terminationDelay) 77 for i := 1; i <= 3; i++ { 78 for _, p := range w.Programs { 79 _ = p.dieIfRunning(w.Log) 80 } 81 time.Sleep(terminationDelay) 82 } 83 w.Log.Infof("done terminating all watched programs - exiting process") 84 os.Exit(0) 85 } 86 87 func (p *Program) dieIfRunning(log Log) bool { 88 if p.runningPid != 0 { 89 log.Infof("%s running at %d is asked to die", p.Name, p.runningPid) 90 _ = process.TerminatePID(p.runningPid, terminationDelay, log) 91 p.runningPid = 0 92 return true 93 } 94 log.Debugf("%s did not appear to be running so it was not terminated", p.Name) 95 return false 96 } 97 98 func (p *Program) Run(log Log, shutdownCh chan struct{}) (err error) { 99 p.dieIfRunning(log) 100 cmd := exec.Command(p.Path, p.Args...) 101 if err = cmd.Start(); err != nil { 102 log.Errorf("Error starting %#v, err: %s", p, err.Error()) 103 return err 104 } 105 p.runningPid = cmd.Process.Pid 106 log.Infof("Started %s at %d", p.Name, cmd.Process.Pid) 107 err = cmd.Wait() 108 p.runningPid = 0 109 return err 110 } 111 112 type heartbeat struct { 113 name string 114 pid int 115 } 116 117 func (w *Watchdog) heartbeatToLog(delay time.Duration) { 118 // wait enough time for the first heartbeat so it's actually useful 119 time.Sleep(1 * time.Minute) 120 for { 121 var heartbeats []heartbeat 122 for _, p := range w.Programs { 123 heartbeats = append(heartbeats, heartbeat{p.Name, p.runningPid}) 124 } 125 w.Log.Infof("heartbeating programs: %v", heartbeats) 126 select { 127 case <-w.shutdownCh: 128 w.Log.Infof("watchdog is shutting down, stop heartbeating") 129 return 130 case <-time.After(delay): 131 continue 132 } 133 } 134 } 135 136 // watchProgram will monitor a program and restart it if it exits. 137 // This method will run forever. 138 func (w *Watchdog) startProgram(idx int) { 139 program := &(w.Programs[idx]) 140 for { 141 start := time.Now() 142 err := program.Run(w.Log, w.shutdownCh) 143 if err != nil { 144 w.Log.Errorf("Error running %s: %+v; %s", program.Name, program, err) 145 } else { 146 w.Log.Infof("%s finished: %+v", program.Name, program) 147 if program.ExitOn == ExitOnSuccess { 148 w.Log.Infof("Program %s configured to exit on success, not restarting", program.Name) 149 break 150 } else if program.ExitOn == ExitAllOnSuccess { 151 w.Log.Infof("Program %s configured to trigger full watchdog shutdown", program.Name) 152 w.Shutdown() 153 } 154 } 155 w.Log.Infof("Program %s ran for %s", program.Name, time.Since(start)) 156 select { 157 case <-w.shutdownCh: 158 w.Log.Infof("watchdog is shutting down, not restarting %s", program.Name) 159 return 160 default: 161 } 162 if time.Since(start) < w.RestartDelay { 163 w.Log.Infof("Waiting %s before trying to start %s command again", w.RestartDelay, program.Name) 164 time.Sleep(w.RestartDelay) 165 } 166 } 167 } 168 169 // terminateExistingMatches aggressively kills anything running that looks like similar 170 // to what this watchdog will be running. the goal here is to be sure that, if multiple 171 // calls attempt to start a watchdog, the last one will be the only one that survives. 172 func (w *Watchdog) terminateExistingMatches() { 173 w.Log.Infof("Terminate any existing programs that look like matches") 174 var killedPids []int 175 for i := 1; i <= 3; i++ { 176 killedPids = w.killSimilarRunningPrograms() 177 if !includesARealProcess(killedPids) { 178 w.Log.Infof("none of these programs are running") 179 return 180 } 181 w.Log.Infof("Terminated pids %v", killedPids) 182 time.Sleep(terminationDelay) 183 } 184 } 185 186 func includesARealProcess(pids []int) bool { 187 for _, p := range pids { 188 if p > 0 { 189 return true 190 } 191 } 192 return false 193 } 194 195 func (w *Watchdog) killSimilarRunningPrograms() (killedPids []int) { 196 // kill any running processes that look like the ones this watchdog wants to watch 197 // this logic also exists in the updater, so if you want to change it, look there too. 198 ospid := os.Getpid() 199 for _, program := range w.Programs { 200 matcher := process.NewMatcher(program.Path, process.PathEqual, w.Log) 201 matcher.ExceptPID(ospid) 202 w.Log.Infof("Terminating %s", program.Name) 203 pids := process.TerminateAll(matcher, time.Second, w.Log) 204 killedPids = append(killedPids, pids...) 205 } 206 return killedPids 207 }