github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/worker/runner.go (about) 1 // Copyright 2012, 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package worker 5 6 import ( 7 "errors" 8 "time" 9 10 "launchpad.net/tomb" 11 ) 12 13 // RestartDelay holds the length of time that a worker 14 // will wait between exiting and restarting. 15 var RestartDelay = 3 * time.Second 16 17 // Worker is implemented by a running worker. 18 type Worker interface { 19 // Kill asks the worker to stop without necessarily 20 // waiting for it to do so. 21 Kill() 22 // Wait waits for the worker to exit and returns any 23 // error encountered when it was running. 24 Wait() error 25 } 26 27 // Runner is implemented by instances capable of starting and stopping workers. 28 type Runner interface { 29 Worker 30 StartWorker(id string, startFunc func() (Worker, error)) error 31 StopWorker(id string) error 32 } 33 34 // runner runs a set of workers, restarting them as necessary 35 // when they fail. 36 type runner struct { 37 tomb tomb.Tomb 38 startc chan startReq 39 stopc chan string 40 donec chan doneInfo 41 startedc chan startInfo 42 isFatal func(error) bool 43 moreImportant func(err0, err1 error) bool 44 } 45 46 var _ Runner = (*runner)(nil) 47 48 type startReq struct { 49 id string 50 start func() (Worker, error) 51 } 52 53 type startInfo struct { 54 id string 55 worker Worker 56 } 57 58 type doneInfo struct { 59 id string 60 err error 61 } 62 63 // NewRunner creates a new Runner. When a worker finishes, if its error 64 // is deemed fatal (determined by calling isFatal), all the other workers 65 // will be stopped and the runner itself will finish. Of all the fatal errors 66 // returned by the stopped workers, only the most important one, 67 // determined by calling moreImportant, will be returned from 68 // Runner.Wait. Non-fatal errors will not be returned. 69 // 70 // The function isFatal(err) returns whether err is a fatal error. The 71 // function moreImportant(err0, err1) returns whether err0 is considered 72 // more important than err1. 73 func NewRunner(isFatal func(error) bool, moreImportant func(err0, err1 error) bool) Runner { 74 runner := &runner{ 75 startc: make(chan startReq), 76 stopc: make(chan string), 77 donec: make(chan doneInfo), 78 startedc: make(chan startInfo), 79 isFatal: isFatal, 80 moreImportant: moreImportant, 81 } 82 go func() { 83 defer runner.tomb.Done() 84 runner.tomb.Kill(runner.run()) 85 }() 86 return runner 87 } 88 89 var ErrDead = errors.New("worker runner is not running") 90 91 // StartWorker starts a worker running associated with the given id. 92 // The startFunc function will be called to create the worker; 93 // when the worker exits, it will be restarted as long as it 94 // does not return a fatal error. 95 // 96 // If there is already a worker with the given id, nothing will be done. 97 // 98 // StartWorker returns ErrDead if the runner is not running. 99 func (runner *runner) StartWorker(id string, startFunc func() (Worker, error)) error { 100 select { 101 case runner.startc <- startReq{id, startFunc}: 102 return nil 103 case <-runner.tomb.Dead(): 104 } 105 return ErrDead 106 } 107 108 // StopWorker stops the worker associated with the given id. 109 // It does nothing if there is no such worker. 110 // 111 // StopWorker returns ErrDead if the runner is not running. 112 func (runner *runner) StopWorker(id string) error { 113 select { 114 case runner.stopc <- id: 115 return nil 116 case <-runner.tomb.Dead(): 117 } 118 return ErrDead 119 } 120 121 func (runner *runner) Wait() error { 122 return runner.tomb.Wait() 123 } 124 125 func (runner *runner) Kill() { 126 logger.Debugf("killing runner %p", runner) 127 runner.tomb.Kill(nil) 128 } 129 130 // Stop kills the given worker and waits for it to exit. 131 func Stop(worker Worker) error { 132 worker.Kill() 133 return worker.Wait() 134 } 135 136 type workerInfo struct { 137 start func() (Worker, error) 138 worker Worker 139 restartDelay time.Duration 140 stopping bool 141 } 142 143 func (runner *runner) run() error { 144 // workers holds the current set of workers. All workers with a 145 // running goroutine have an entry here. 146 workers := make(map[string]*workerInfo) 147 var finalError error 148 149 // isDying holds whether the runner is currently dying. When it 150 // is dying (whether as a result of being killed or due to a 151 // fatal error), all existing workers are killed, no new workers 152 // will be started, and the loop will exit when all existing 153 // workers have stopped. 154 isDying := false 155 tombDying := runner.tomb.Dying() 156 for { 157 if isDying && len(workers) == 0 { 158 return finalError 159 } 160 select { 161 case <-tombDying: 162 logger.Infof("runner is dying") 163 isDying = true 164 killAll(workers) 165 tombDying = nil 166 case req := <-runner.startc: 167 if isDying { 168 logger.Infof("ignoring start request for %q when dying", req.id) 169 break 170 } 171 info := workers[req.id] 172 if info == nil { 173 workers[req.id] = &workerInfo{ 174 start: req.start, 175 restartDelay: RestartDelay, 176 } 177 go runner.runWorker(0, req.id, req.start) 178 break 179 } 180 if !info.stopping { 181 // The worker is already running, so leave it alone 182 break 183 } 184 // The worker previously existed and is 185 // currently being stopped. When it eventually 186 // does stop, we'll restart it immediately with 187 // the new start function. 188 info.start = req.start 189 info.restartDelay = 0 190 case id := <-runner.stopc: 191 logger.Debugf("stop %q", id) 192 if info := workers[id]; info != nil { 193 killWorker(id, info) 194 } 195 case info := <-runner.startedc: 196 logger.Debugf("%q started", info.id) 197 workerInfo := workers[info.id] 198 workerInfo.worker = info.worker 199 if isDying || workerInfo.stopping { 200 killWorker(info.id, workerInfo) 201 } 202 case info := <-runner.donec: 203 logger.Debugf("%q done: %v", info.id, info.err) 204 workerInfo := workers[info.id] 205 if !workerInfo.stopping && info.err == nil { 206 logger.Debugf("removing %q from known workers", info.id) 207 delete(workers, info.id) 208 break 209 } 210 if info.err != nil { 211 if runner.isFatal(info.err) { 212 logger.Errorf("fatal %q: %v", info.id, info.err) 213 if finalError == nil || runner.moreImportant(info.err, finalError) { 214 finalError = info.err 215 } 216 delete(workers, info.id) 217 if !isDying { 218 isDying = true 219 killAll(workers) 220 } 221 break 222 } else { 223 logger.Errorf("exited %q: %v", info.id, info.err) 224 } 225 } 226 if workerInfo.start == nil { 227 logger.Debugf("no restart, removing %q from known workers", info.id) 228 229 // The worker has been deliberately stopped; 230 // we can now remove it from the list of workers. 231 delete(workers, info.id) 232 break 233 } 234 go runner.runWorker(workerInfo.restartDelay, info.id, workerInfo.start) 235 workerInfo.restartDelay = RestartDelay 236 } 237 } 238 } 239 240 func killAll(workers map[string]*workerInfo) { 241 for id, info := range workers { 242 killWorker(id, info) 243 } 244 } 245 246 func killWorker(id string, info *workerInfo) { 247 if info.worker != nil { 248 logger.Debugf("killing %q", id) 249 info.worker.Kill() 250 info.worker = nil 251 } else { 252 logger.Debugf("couldn't kill %q, not yet started", id) 253 } 254 info.stopping = true 255 info.start = nil 256 } 257 258 // runWorker starts the given worker after waiting for the given delay. 259 func (runner *runner) runWorker(delay time.Duration, id string, start func() (Worker, error)) { 260 if delay > 0 { 261 logger.Infof("restarting %q in %v", id, delay) 262 select { 263 case <-runner.tomb.Dying(): 264 runner.donec <- doneInfo{id, nil} 265 return 266 case <-time.After(delay): 267 } 268 } 269 logger.Infof("start %q", id) 270 worker, err := start() 271 if err == nil { 272 runner.startedc <- startInfo{id, worker} 273 err = worker.Wait() 274 } 275 logger.Infof("stopped %q, err: %v", id, err) 276 runner.donec <- doneInfo{id, err} 277 }