github.com/Pankov404/juju@v0.0.0-20150703034450-be266991dceb/worker/dependency/engine.go (about) 1 // Copyright 2015 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package dependency 5 6 import ( 7 "time" 8 9 "github.com/juju/errors" 10 "github.com/juju/loggo" 11 "launchpad.net/tomb" 12 13 "github.com/juju/juju/worker" 14 ) 15 16 var logger = loggo.GetLogger("juju.worker.dependency") 17 18 // NewEngine returns an Engine that will maintain any installed Manifolds until 19 // either the engine is stopped or one of the manifolds' workers returns an error 20 // that satisfies isFatal. The caller takes responsibility for the returned Engine: 21 // it's responsible for Kill()ing the Engine when no longer used, and must handle 22 // any error from Wait(). 23 func NewEngine(isFatal IsFatalFunc, errorDelay, bounceDelay time.Duration) Engine { 24 engine := &engine{ 25 isFatal: isFatal, 26 errorDelay: errorDelay, 27 bounceDelay: bounceDelay, 28 29 manifolds: map[string]Manifold{}, 30 dependents: map[string][]string{}, 31 current: map[string]workerInfo{}, 32 33 install: make(chan installTicket), 34 started: make(chan startedTicket), 35 stopped: make(chan stoppedTicket), 36 } 37 go func() { 38 defer engine.tomb.Done() 39 engine.tomb.Kill(engine.loop()) 40 }() 41 return engine 42 } 43 44 // engine maintains workers corresponding to its installed manifolds, and 45 // restarts them whenever their inputs change. 46 type engine struct { 47 tomb tomb.Tomb 48 49 // isFatal allows errors generated by workers to stop the engine. 50 isFatal IsFatalFunc 51 52 // errorDelay controls how long the engine waits before restarting a worker 53 // that encountered an unknown error. 54 errorDelay time.Duration 55 56 // bounceDelay controls how long the engine waits before restarting a worker 57 // that was deliberately shut down because its dependencies changed. 58 bounceDelay time.Duration 59 60 // manifolds holds the installed manifolds by name. 61 manifolds map[string]Manifold 62 63 // dependents holds, for each named manifold, those that depend on it. 64 dependents map[string][]string 65 66 // current holds the active worker information for each installed manifold. 67 current map[string]workerInfo 68 69 // install, started, and stopped each communicate requests and changes into 70 // the loop goroutine. 71 install chan installTicket 72 started chan startedTicket 73 stopped chan stoppedTicket 74 } 75 76 // loop serializes manifold install operations and worker start/stop notifications. 77 // It's notable for its oneShotDying var, which is necessary because any number of 78 // start/stop notification could be in flight at the point the engine needs to stop; 79 // we need to handle all those, and any subsequent messages, until the main loop is 80 // confident that every worker has stopped. (The usual pattern -- to defer a cleanup 81 // method to run before tomb.Done in NewEngine -- is not cleanly applicable, because 82 // it needs to duplicate that start/stop message handling; better to localise that 83 // in this method.) 84 func (engine *engine) loop() error { 85 oneShotDying := engine.tomb.Dying() 86 for { 87 select { 88 case <-oneShotDying: 89 oneShotDying = nil 90 for name := range engine.current { 91 engine.requestStop(name) 92 } 93 case ticket := <-engine.install: 94 // This is safe so long as the Install method reads the result. 95 ticket.result <- engine.gotInstall(ticket.name, ticket.manifold) 96 case ticket := <-engine.started: 97 engine.gotStarted(ticket.name, ticket.worker) 98 case ticket := <-engine.stopped: 99 engine.gotStopped(ticket.name, ticket.error) 100 } 101 if engine.isDying() { 102 if engine.allStopped() { 103 return tomb.ErrDying 104 } 105 } 106 } 107 } 108 109 // Kill is part of the worker.Worker interface. 110 func (engine *engine) Kill() { 111 engine.tomb.Kill(nil) 112 } 113 114 // Wait is part of the worker.Worker interface. 115 func (engine *engine) Wait() error { 116 return engine.tomb.Wait() 117 } 118 119 // Install is part of the Engine interface. 120 func (engine *engine) Install(name string, manifold Manifold) error { 121 result := make(chan error) 122 select { 123 case <-engine.tomb.Dying(): 124 return errors.New("engine is shutting down") 125 case engine.install <- installTicket{name, manifold, result}: 126 // This is safe so long as the loop sends a result. 127 return <-result 128 } 129 } 130 131 // gotInstall handles the params originally supplied to Install. It must only be 132 // called from the loop goroutine. 133 func (engine *engine) gotInstall(name string, manifold Manifold) error { 134 logger.Infof("installing %q manifold...", name) 135 if _, found := engine.manifolds[name]; found { 136 return errors.Errorf("%q manifold already installed", name) 137 } 138 engine.manifolds[name] = manifold 139 for _, input := range manifold.Inputs { 140 engine.dependents[input] = append(engine.dependents[input], name) 141 } 142 engine.current[name] = workerInfo{} 143 engine.requestStart(name, 0) 144 return nil 145 } 146 147 // requestStart invokes a runWorker goroutine for the manifold with the supplied 148 // name. It must only be called from the loop goroutine. 149 func (engine *engine) requestStart(name string, delay time.Duration) { 150 151 // Check preconditions. 152 manifold, found := engine.manifolds[name] 153 if !found { 154 engine.tomb.Kill(errors.Errorf("fatal: unknown manifold %q", name)) 155 } 156 157 // Copy current info and check more preconditions. 158 info := engine.current[name] 159 if !info.stopped() { 160 engine.tomb.Kill(errors.Errorf("fatal: trying to start a second %q manifold worker", name)) 161 } 162 163 // Final check that we're not shutting down yet... 164 if engine.isDying() { 165 logger.Debugf("not starting %q manifold worker (shutting down)", name) 166 return 167 } 168 169 // ...then update the info, copy it back to the engine, and start a worker 170 // goroutine based on current known state. 171 info.starting = true 172 engine.current[name] = info 173 getResource := engine.getResourceFunc(name, manifold.Inputs) 174 go engine.runWorker(name, delay, manifold.Start, getResource) 175 } 176 177 // getResourceFunc returns a GetResourceFunc backed by a snapshot of current 178 // worker state, restricted to those workers declared in inputs. It must only 179 // be called from the loop goroutine; see inside for a detailed dicsussion of 180 // why we took this appproach. 181 func (engine *engine) getResourceFunc(name string, inputs []string) GetResourceFunc { 182 // We snapshot the resources available at invocation time, rather than adding an 183 // additional communicate-resource-request channel. The latter approach is not 184 // unreasonable... but is prone to inelegant scrambles when starting several 185 // dependent workers at once. For example: 186 // 187 // * Install manifold A; loop starts worker A 188 // * Install manifold B; loop starts worker B 189 // * A communicates its worker back to loop; main thread bounces B 190 // * B asks for A, gets A, doesn't react to bounce (*) 191 // * B communicates its worker back to loop; loop kills it immediately in 192 // response to earlier bounce 193 // * loop starts worker B again, now everything's fine; but, still, yuck. 194 // This is not a happy path to take by default. 195 // 196 // The problem, of course, is in the (*); the main thread *does* know that B 197 // needs to bounce soon anyway, and it *could* communicate that fact back via 198 // an error over a channel back into getResource; the StartFunc could then 199 // just return (say) that ErrResourceChanged and avoid the hassle of creating 200 // a worker. But that adds a whole layer of complexity (and unpredictability 201 // in tests, which is not much fun) for very little benefit. 202 // 203 // In the analogous scenario with snapshotted dependencies, we see a happier 204 // picture at startup time: 205 // 206 // * Install manifold A; loop starts worker A 207 // * Install manifold B; loop starts worker B with empty resource snapshot 208 // * A communicates its worker back to loop; main thread bounces B 209 // * B's StartFunc asks for A, gets nothing, returns ErrUnmetDependencies 210 // * loop restarts worker B with an up-to-date snapshot, B works fine 211 // 212 // We assume that, in the common case, most workers run without error most 213 // of the time; and, thus, that the vast majority of worker startups will 214 // happen as an agent starts. Furthermore, most of them will have simple 215 // hard dependencies, and their Start funcs will be easy to write; the only 216 // components that may be impacted by such a strategy will be those workers 217 // which still want to run (with reduced functionality) with some dependency 218 // unmet. 219 // 220 // Those may indeed suffer the occasional extra bounce as the system comes 221 // to stability as it starts, or after a change; but workers *must* be 222 // written for resilience in the face of arbitrary bounces *anyway*, so it 223 // shouldn't be harmful 224 outputs := map[string]OutputFunc{} 225 workers := map[string]worker.Worker{} 226 for _, resourceName := range inputs { 227 outputs[resourceName] = engine.manifolds[resourceName].Output 228 workers[resourceName] = engine.current[resourceName].worker 229 } 230 return func(resourceName string, out interface{}) error { 231 logger.Debugf("%q manifold requested %q resource", name, resourceName) 232 input := workers[resourceName] 233 if input == nil { 234 // No worker running (or not declared). 235 return ErrMissing 236 } 237 convert := outputs[resourceName] 238 if convert == nil { 239 // No conversion func available... 240 if out != nil { 241 // ...and the caller wants a resource. 242 return ErrMissing 243 } 244 // ...but it's ok, because the caller depends on existence only. 245 return nil 246 } 247 return convert(input, out) 248 } 249 } 250 251 // runWorker starts the supplied manifold's worker and communicates it back to the 252 // loop goroutine; waits for worker completion; and communicates any error encountered 253 // back to the loop goroutine. It must not be run on the loop goroutine. 254 func (engine *engine) runWorker(name string, delay time.Duration, start StartFunc, getResource GetResourceFunc) { 255 startWorkerAndWait := func() error { 256 logger.Infof("starting %q manifold worker in %s...", name, delay) 257 select { 258 case <-time.After(delay): 259 case <-engine.tomb.Dying(): 260 logger.Debugf("not starting %q manifold worker (shutting down)", name) 261 return tomb.ErrDying 262 } 263 264 logger.Debugf("starting %q manifold worker", name) 265 worker, err := start(getResource) 266 if err != nil { 267 logger.Warningf("failed to start %q manifold worker: %v", name, err) 268 return err 269 } 270 271 logger.Debugf("running %q manifold worker", name) 272 select { 273 case <-engine.tomb.Dying(): 274 logger.Debugf("stopping %q manifold worker (shutting down)", name) 275 worker.Kill() 276 case engine.started <- startedTicket{name, worker}: 277 logger.Debugf("registered %q manifold worker", name) 278 } 279 return worker.Wait() 280 } 281 282 // We may or may not send on started, but we *must* send on stopped. 283 engine.stopped <- stoppedTicket{name, startWorkerAndWait()} 284 } 285 286 // gotStarted updates the engine to reflect the creation of a worker. It must 287 // only be called from the loop goroutine. 288 func (engine *engine) gotStarted(name string, worker worker.Worker) { 289 // Copy current info; check preconditions and abort the workers if we've 290 // already been asked to stop it. 291 info := engine.current[name] 292 switch { 293 case info.worker != nil: 294 engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker start", name)) 295 fallthrough 296 case info.stopping, engine.isDying(): 297 logger.Debugf("%q manifold worker no longer required", name) 298 worker.Kill() 299 default: 300 // It's fine to use this worker; update info and copy back. 301 logger.Infof("%q manifold worker started", name) 302 info.starting = false 303 info.worker = worker 304 engine.current[name] = info 305 306 // Any manifold that declares this one as an input needs to be restarted. 307 engine.bounceDependents(name) 308 } 309 } 310 311 // gotStopped updates the engine to reflect the demise of (or failure to create) 312 // a worker. It must only be called from the loop goroutine. 313 func (engine *engine) gotStopped(name string, err error) { 314 logger.Infof("%q manifold worker stopped: %v", name, err) 315 316 // Copy current info and check for reasons to stop the engine. 317 info := engine.current[name] 318 if info.stopped() { 319 engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker stop", name)) 320 } else if engine.isFatal(err) { 321 engine.tomb.Kill(err) 322 } 323 324 // Reset engine info; and bail out if we can be sure there's no need to bounce. 325 engine.current[name] = workerInfo{} 326 if engine.isDying() { 327 logger.Debugf("permanently stopped %q manifold worker (shutting down)", name) 328 return 329 } 330 331 // If we told the worker to stop, we should start it again immediately, 332 // whatever else happened. 333 if info.stopping { 334 engine.requestStart(name, engine.bounceDelay) 335 } else { 336 // If we didn't stop it ourselves, we need to interpret the error. 337 switch err { 338 case nil: 339 // Nothing went wrong; the task completed successfully. Nothing 340 // needs to be done (unless the inputs change, in which case it 341 // gets to check again). 342 case ErrMissing: 343 // The task can't even start with the current state. Nothing more 344 // can be done (until the inputs change, in which case we retry 345 // anyway). 346 default: 347 // Something went wrong but we don't know what. Try again soon. 348 engine.requestStart(name, engine.errorDelay) 349 } 350 } 351 352 // Manifolds that declared a dependency on this one only need to be notified 353 // if the worker has changed; if it was already nil, nobody needs to know. 354 if info.worker != nil { 355 engine.bounceDependents(name) 356 } 357 } 358 359 // requestStop ensures that any running or starting worker will be stopped in the 360 // near future. It must only be called from the loop goroutine. 361 func (engine *engine) requestStop(name string) { 362 363 // If already stopping or stopped, just don't do anything. 364 info := engine.current[name] 365 if info.stopping || info.stopped() { 366 return 367 } 368 369 // Update info, kill worker if present, and copy info back to engine. 370 info.stopping = true 371 if info.worker != nil { 372 info.worker.Kill() 373 } 374 engine.current[name] = info 375 } 376 377 // isDying returns true if the engine is shutting down. It's safe to call it 378 // from any goroutine. 379 func (engine *engine) isDying() bool { 380 select { 381 case <-engine.tomb.Dying(): 382 return true 383 default: 384 return false 385 } 386 } 387 388 // allStopped returns true if no workers are running or starting. It must only 389 // be called from the loop goroutine. 390 func (engine *engine) allStopped() bool { 391 for _, info := range engine.current { 392 if !info.stopped() { 393 return false 394 } 395 } 396 return true 397 } 398 399 // bounceDependents starts every stopped dependent of the named manifold, and 400 // stops every started one (and trusts the rest of the engine to restart them). 401 // It must only be called from the loop goroutine. 402 func (engine *engine) bounceDependents(name string) { 403 logger.Debugf("restarting dependents of %q manifold", name) 404 for _, dependentName := range engine.dependents[name] { 405 if engine.current[dependentName].stopped() { 406 engine.requestStart(dependentName, engine.bounceDelay) 407 } else { 408 engine.requestStop(dependentName) 409 } 410 } 411 } 412 413 // workerInfo stores what an engine's loop goroutine needs to know about the 414 // worker for a given Manifold. 415 type workerInfo struct { 416 starting bool 417 stopping bool 418 worker worker.Worker 419 } 420 421 // stopped returns true unless the worker is either assigned or starting. 422 func (info workerInfo) stopped() bool { 423 switch { 424 case info.worker != nil: 425 return false 426 case info.starting: 427 return false 428 } 429 return true 430 } 431 432 // installTicket is used by engine to induce installation of a named manifold 433 // and pass on any errors encountered in the process. 434 type installTicket struct { 435 name string 436 manifold Manifold 437 result chan<- error 438 } 439 440 // startedTicket is used by engine to notify the loop of the creation of the 441 // worker for a particular manifold. 442 type startedTicket struct { 443 name string 444 worker worker.Worker 445 } 446 447 // stoppedTicket is used by engine to notify the loop of the demise of (or 448 // failure to create) the worker for a particular manifold. 449 type stoppedTicket struct { 450 name string 451 error error 452 }