github.com/altoros/juju-vmware@v0.0.0-20150312064031-f19ae857ccca/worker/runner.go (about) 1 // Copyright 2012, 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package worker 5 6 import ( 7 "errors" 8 "time" 9 10 "launchpad.net/tomb" 11 ) 12 13 // RestartDelay holds the length of time that a worker 14 // will wait between exiting and restarting. 15 var RestartDelay = 3 * time.Second 16 17 // Worker is implemented by a running worker. 18 type Worker interface { 19 // Kill asks the worker to stop without necessarily 20 // waiting for it to do so. 21 Kill() 22 // Wait waits for the worker to exit and returns any 23 // error encountered when it was running. 24 Wait() error 25 } 26 27 // Runner is implemented by instances capable of starting and stopping workers. 28 type Runner interface { 29 Worker 30 StartWorker(id string, startFunc func() (Worker, error)) error 31 StopWorker(id string) error 32 Dying() <-chan struct{} 33 } 34 35 // runner runs a set of workers, restarting them as necessary 36 // when they fail. 37 type runner struct { 38 tomb tomb.Tomb 39 startc chan startReq 40 stopc chan string 41 donec chan doneInfo 42 startedc chan startInfo 43 isFatal func(error) bool 44 moreImportant func(err0, err1 error) bool 45 } 46 47 var _ Runner = (*runner)(nil) 48 49 type startReq struct { 50 id string 51 start func() (Worker, error) 52 } 53 54 type startInfo struct { 55 id string 56 worker Worker 57 } 58 59 type doneInfo struct { 60 id string 61 err error 62 } 63 64 // NewRunner creates a new Runner. When a worker finishes, if its error 65 // is deemed fatal (determined by calling isFatal), all the other workers 66 // will be stopped and the runner itself will finish. Of all the fatal errors 67 // returned by the stopped workers, only the most important one, 68 // determined by calling moreImportant, will be returned from 69 // Runner.Wait. Non-fatal errors will not be returned. 70 // 71 // The function isFatal(err) returns whether err is a fatal error. The 72 // function moreImportant(err0, err1) returns whether err0 is considered 73 // more important than err1. 74 func NewRunner(isFatal func(error) bool, moreImportant func(err0, err1 error) bool) Runner { 75 runner := &runner{ 76 startc: make(chan startReq), 77 stopc: make(chan string), 78 donec: make(chan doneInfo), 79 startedc: make(chan startInfo), 80 isFatal: isFatal, 81 moreImportant: moreImportant, 82 } 83 go func() { 84 defer runner.tomb.Done() 85 runner.tomb.Kill(runner.run()) 86 }() 87 return runner 88 } 89 90 var ErrDead = errors.New("worker runner is not running") 91 92 // StartWorker starts a worker running associated with the given id. 93 // The startFunc function will be called to create the worker; 94 // when the worker exits, it will be restarted as long as it 95 // does not return a fatal error. 96 // 97 // If there is already a worker with the given id, nothing will be done. 98 // 99 // StartWorker returns ErrDead if the runner is not running. 100 func (runner *runner) StartWorker(id string, startFunc func() (Worker, error)) error { 101 select { 102 case runner.startc <- startReq{id, startFunc}: 103 return nil 104 case <-runner.tomb.Dead(): 105 } 106 return ErrDead 107 } 108 109 // StopWorker stops the worker associated with the given id. 110 // It does nothing if there is no such worker. 111 // 112 // StopWorker returns ErrDead if the runner is not running. 113 func (runner *runner) StopWorker(id string) error { 114 select { 115 case runner.stopc <- id: 116 return nil 117 case <-runner.tomb.Dead(): 118 } 119 return ErrDead 120 } 121 122 func (runner *runner) Wait() error { 123 return runner.tomb.Wait() 124 } 125 126 func (runner *runner) Kill() { 127 logger.Debugf("killing runner %p", runner) 128 runner.tomb.Kill(nil) 129 } 130 131 func (runner *runner) Dying() <-chan struct{} { 132 return runner.tomb.Dying() 133 } 134 135 // Stop kills the given worker and waits for it to exit. 136 func Stop(worker Worker) error { 137 worker.Kill() 138 return worker.Wait() 139 } 140 141 type workerInfo struct { 142 start func() (Worker, error) 143 worker Worker 144 restartDelay time.Duration 145 stopping bool 146 } 147 148 func (runner *runner) run() error { 149 // workers holds the current set of workers. All workers with a 150 // running goroutine have an entry here. 151 workers := make(map[string]*workerInfo) 152 var finalError error 153 154 // isDying holds whether the runner is currently dying. When it 155 // is dying (whether as a result of being killed or due to a 156 // fatal error), all existing workers are killed, no new workers 157 // will be started, and the loop will exit when all existing 158 // workers have stopped. 159 isDying := false 160 tombDying := runner.tomb.Dying() 161 for { 162 if isDying && len(workers) == 0 { 163 return finalError 164 } 165 select { 166 case <-tombDying: 167 logger.Infof("runner is dying") 168 isDying = true 169 killAll(workers) 170 tombDying = nil 171 case req := <-runner.startc: 172 if isDying { 173 logger.Infof("ignoring start request for %q when dying", req.id) 174 break 175 } 176 info := workers[req.id] 177 if info == nil { 178 workers[req.id] = &workerInfo{ 179 start: req.start, 180 restartDelay: RestartDelay, 181 } 182 go runner.runWorker(0, req.id, req.start) 183 break 184 } 185 if !info.stopping { 186 // The worker is already running, so leave it alone 187 break 188 } 189 // The worker previously existed and is 190 // currently being stopped. When it eventually 191 // does stop, we'll restart it immediately with 192 // the new start function. 193 info.start = req.start 194 info.restartDelay = 0 195 case id := <-runner.stopc: 196 if info := workers[id]; info != nil { 197 killWorker(id, info) 198 } 199 case info := <-runner.startedc: 200 workerInfo := workers[info.id] 201 workerInfo.worker = info.worker 202 if isDying { 203 killWorker(info.id, workerInfo) 204 } 205 case info := <-runner.donec: 206 workerInfo := workers[info.id] 207 if !workerInfo.stopping && info.err == nil { 208 delete(workers, info.id) 209 break 210 } 211 if info.err != nil { 212 if runner.isFatal(info.err) { 213 logger.Errorf("fatal %q: %v", info.id, info.err) 214 if finalError == nil || runner.moreImportant(info.err, finalError) { 215 finalError = info.err 216 } 217 delete(workers, info.id) 218 if !isDying { 219 isDying = true 220 killAll(workers) 221 } 222 break 223 } else { 224 logger.Errorf("exited %q: %v", info.id, info.err) 225 } 226 } 227 if workerInfo.start == nil { 228 // The worker has been deliberately stopped; 229 // we can now remove it from the list of workers. 230 delete(workers, info.id) 231 break 232 } 233 go runner.runWorker(workerInfo.restartDelay, info.id, workerInfo.start) 234 workerInfo.restartDelay = RestartDelay 235 } 236 } 237 } 238 239 func killAll(workers map[string]*workerInfo) { 240 for id, info := range workers { 241 killWorker(id, info) 242 } 243 } 244 245 func killWorker(id string, info *workerInfo) { 246 if info.worker != nil { 247 logger.Debugf("killing %q", id) 248 info.worker.Kill() 249 info.worker = nil 250 } 251 info.stopping = true 252 info.start = nil 253 } 254 255 // runWorker starts the given worker after waiting for the given delay. 256 func (runner *runner) runWorker(delay time.Duration, id string, start func() (Worker, error)) { 257 if delay > 0 { 258 logger.Infof("restarting %q in %v", id, delay) 259 select { 260 case <-runner.tomb.Dying(): 261 runner.donec <- doneInfo{id, nil} 262 return 263 case <-time.After(delay): 264 } 265 } 266 logger.Infof("start %q", id) 267 worker, err := start() 268 if err == nil { 269 runner.startedc <- startInfo{id, worker} 270 err = worker.Wait() 271 } 272 runner.donec <- doneInfo{id, err} 273 }