github.com/makyo/juju@v0.0.0-20160425123129-2608902037e9/worker/runner.go (about) 1 // Copyright 2012, 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package worker 5 6 import ( 7 "time" 8 9 "github.com/juju/errors" 10 "github.com/juju/loggo" 11 "launchpad.net/tomb" 12 ) 13 14 var logger = loggo.GetLogger("juju.worker") 15 16 // RestartDelay holds the length of time that a worker 17 // will wait between exiting and restarting. 18 const RestartDelay = 3 * time.Second 19 20 // Worker is implemented by a running worker. 21 type Worker interface { 22 // Kill asks the worker to stop without necessarily 23 // waiting for it to do so. 24 Kill() 25 // Wait waits for the worker to exit and returns any 26 // error encountered when it was running. 27 Wait() error 28 } 29 30 // Runner is implemented by instances capable of starting and stopping workers. 31 type Runner interface { 32 Worker 33 StartWorker(id string, startFunc func() (Worker, error)) error 34 StopWorker(id string) error 35 } 36 37 // runner runs a set of workers, restarting them as necessary 38 // when they fail. 39 type runner struct { 40 tomb tomb.Tomb 41 startc chan startReq 42 stopc chan string 43 donec chan doneInfo 44 startedc chan startInfo 45 isFatal func(error) bool 46 moreImportant func(err0, err1 error) bool 47 48 // restartDelay holds the length of time that a worker 49 // will wait between exiting and restarting. 50 restartDelay time.Duration 51 } 52 53 type startReq struct { 54 id string 55 start func() (Worker, error) 56 } 57 58 type startInfo struct { 59 id string 60 worker Worker 61 } 62 63 type doneInfo struct { 64 id string 65 err error 66 } 67 68 // NewRunner creates a new Runner. When a worker finishes, if its error 69 // is deemed fatal (determined by calling isFatal), all the other workers 70 // will be stopped and the runner itself will finish. Of all the fatal errors 71 // returned by the stopped workers, only the most important one, 72 // determined by calling moreImportant, will be returned from 73 // Runner.Wait. Non-fatal errors will not be returned. 74 // 75 // The function isFatal(err) returns whether err is a fatal error. The 76 // function moreImportant(err0, err1) returns whether err0 is considered 77 // more important than err1. 78 func NewRunner(isFatal func(error) bool, moreImportant func(err0, err1 error) bool, restartDelay time.Duration) Runner { 79 runner := &runner{ 80 startc: make(chan startReq), 81 stopc: make(chan string), 82 donec: make(chan doneInfo), 83 startedc: make(chan startInfo), 84 isFatal: isFatal, 85 moreImportant: moreImportant, 86 restartDelay: restartDelay, 87 } 88 go func() { 89 defer runner.tomb.Done() 90 runner.tomb.Kill(runner.run()) 91 }() 92 return runner 93 } 94 95 var ErrDead = errors.New("worker runner is not running") 96 97 // StartWorker starts a worker running associated with the given id. 98 // The startFunc function will be called to create the worker; 99 // when the worker exits, it will be restarted as long as it 100 // does not return a fatal error. 101 // 102 // If there is already a worker with the given id, nothing will be done. 103 // 104 // StartWorker returns ErrDead if the runner is not running. 105 func (runner *runner) StartWorker(id string, startFunc func() (Worker, error)) error { 106 select { 107 case runner.startc <- startReq{id, startFunc}: 108 return nil 109 case <-runner.tomb.Dead(): 110 } 111 return ErrDead 112 } 113 114 // StopWorker stops the worker associated with the given id. 115 // It does nothing if there is no such worker. 116 // 117 // StopWorker returns ErrDead if the runner is not running. 118 func (runner *runner) StopWorker(id string) error { 119 select { 120 case runner.stopc <- id: 121 return nil 122 case <-runner.tomb.Dead(): 123 } 124 return ErrDead 125 } 126 127 func (runner *runner) Wait() error { 128 return runner.tomb.Wait() 129 } 130 131 func (runner *runner) Kill() { 132 logger.Debugf("killing runner %p", runner) 133 runner.tomb.Kill(nil) 134 } 135 136 // Stop kills the given worker and waits for it to exit. 137 func Stop(worker Worker) error { 138 worker.Kill() 139 return worker.Wait() 140 } 141 142 type workerInfo struct { 143 start func() (Worker, error) 144 worker Worker 145 restartDelay time.Duration 146 stopping bool 147 } 148 149 func (runner *runner) run() error { 150 // workers holds the current set of workers. All workers with a 151 // running goroutine have an entry here. 152 workers := make(map[string]*workerInfo) 153 var finalError error 154 155 // isDying holds whether the runner is currently dying. When it 156 // is dying (whether as a result of being killed or due to a 157 // fatal error), all existing workers are killed, no new workers 158 // will be started, and the loop will exit when all existing 159 // workers have stopped. 160 isDying := false 161 tombDying := runner.tomb.Dying() 162 for { 163 if isDying && len(workers) == 0 { 164 return finalError 165 } 166 select { 167 case <-tombDying: 168 logger.Infof("runner is dying") 169 isDying = true 170 killAll(workers) 171 tombDying = nil 172 case req := <-runner.startc: 173 if isDying { 174 logger.Infof("ignoring start request for %q when dying", req.id) 175 break 176 } 177 info := workers[req.id] 178 if info == nil { 179 workers[req.id] = &workerInfo{ 180 start: req.start, 181 restartDelay: runner.restartDelay, 182 } 183 go runner.runWorker(0, req.id, req.start) 184 break 185 } 186 if !info.stopping { 187 // The worker is already running, so leave it alone 188 break 189 } 190 // The worker previously existed and is 191 // currently being stopped. When it eventually 192 // does stop, we'll restart it immediately with 193 // the new start function. 194 info.start = req.start 195 info.restartDelay = 0 196 case id := <-runner.stopc: 197 logger.Debugf("stop %q", id) 198 if info := workers[id]; info != nil { 199 killWorker(id, info) 200 } 201 case info := <-runner.startedc: 202 logger.Debugf("%q started", info.id) 203 workerInfo := workers[info.id] 204 workerInfo.worker = info.worker 205 if isDying || workerInfo.stopping { 206 killWorker(info.id, workerInfo) 207 } 208 case info := <-runner.donec: 209 logger.Debugf("%q done: %v", info.id, info.err) 210 workerInfo := workers[info.id] 211 if !workerInfo.stopping && info.err == nil { 212 logger.Debugf("removing %q from known workers", info.id) 213 delete(workers, info.id) 214 break 215 } 216 if info.err != nil { 217 if runner.isFatal(info.err) { 218 logger.Errorf("fatal %q: %v", info.id, info.err) 219 if finalError == nil || runner.moreImportant(info.err, finalError) { 220 finalError = info.err 221 } 222 delete(workers, info.id) 223 if !isDying { 224 isDying = true 225 killAll(workers) 226 } 227 break 228 } else { 229 logger.Errorf("exited %q: %v", info.id, info.err) 230 } 231 } 232 if workerInfo.start == nil { 233 logger.Debugf("no restart, removing %q from known workers", info.id) 234 235 // The worker has been deliberately stopped; 236 // we can now remove it from the list of workers. 237 delete(workers, info.id) 238 break 239 } 240 go runner.runWorker(workerInfo.restartDelay, info.id, workerInfo.start) 241 workerInfo.restartDelay = runner.restartDelay 242 } 243 } 244 } 245 246 func killAll(workers map[string]*workerInfo) { 247 for id, info := range workers { 248 killWorker(id, info) 249 } 250 } 251 252 func killWorker(id string, info *workerInfo) { 253 if info.worker != nil { 254 logger.Debugf("killing %q", id) 255 info.worker.Kill() 256 info.worker = nil 257 } else { 258 logger.Debugf("couldn't kill %q, not yet started", id) 259 } 260 info.stopping = true 261 info.start = nil 262 } 263 264 // runWorker starts the given worker after waiting for the given delay. 265 func (runner *runner) runWorker(delay time.Duration, id string, start func() (Worker, error)) { 266 if delay > 0 { 267 logger.Infof("restarting %q in %v", id, delay) 268 select { 269 case <-runner.tomb.Dying(): 270 runner.donec <- doneInfo{id, nil} 271 return 272 case <-time.After(delay): 273 } 274 } 275 logger.Infof("start %q", id) 276 worker, err := start() 277 if err == nil { 278 runner.startedc <- startInfo{id, worker} 279 err = worker.Wait() 280 } 281 logger.Infof("stopped %q, err: %v", id, err) 282 runner.donec <- doneInfo{id, err} 283 } 284 285 // Workers is an order-preserving registry of worker factory functions. 286 type Workers struct { 287 ids []string 288 funcs map[string]func() (Worker, error) 289 } 290 291 // NewWorkers returns a new Workers. 292 func NewWorkers() Workers { 293 return Workers{ 294 funcs: make(map[string]func() (Worker, error)), 295 } 296 } 297 298 // IDs returns the list of registered worker IDs. 299 func (r Workers) IDs() []string { 300 ids := make([]string, len(r.ids)) 301 copy(ids, r.ids) 302 return ids 303 } 304 305 // Add registered the factory function for the identified worker. 306 func (r *Workers) Add(id string, newWorker func() (Worker, error)) error { 307 if _, ok := r.funcs[id]; ok { 308 return errors.Errorf("worker %q already registered", id) 309 } 310 r.funcs[id] = newWorker 311 r.ids = append(r.ids, id) 312 return nil 313 } 314 315 // Start starts all the registered workers under the given runner. 316 func (r *Workers) Start(runner Runner) error { 317 for _, id := range r.ids { 318 newWorker := r.funcs[id] 319 if err := runner.StartWorker(id, newWorker); err != nil { 320 return errors.Annotatef(err, "worker %q failed to start", id) 321 } 322 } 323 return nil 324 }