github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/worker/dependency/engine.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package dependency

import (
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"launchpad.net/tomb"

	"github.com/juju/juju/worker"
)

var logger = loggo.GetLogger("juju.worker.dependency")

// EngineConfig defines the parameters needed to create a new engine.
type EngineConfig struct {
	// IsFatal allows errors generated by workers to stop the engine.
	IsFatal IsFatalFunc

	// MoreImportant allows fatal errors to be ranked according to importance.
	MoreImportant MoreImportantFunc

	// ErrorDelay controls how long the engine waits before restarting a worker
	// that encountered an unknown error.
	ErrorDelay time.Duration

	// BounceDelay controls how long the engine waits before restarting a worker
	// that was deliberately shut down because its dependencies changed.
	BounceDelay time.Duration
}

// Validate checks that the config values are sensible.
func (config *EngineConfig) Validate() error {
	if config.IsFatal == nil {
		return errors.New("engineconfig validation failed: IsFatal not specified")
	}
	if config.MoreImportant == nil {
		return errors.New("engineconfig validation failed: MoreImportant not specified")
	}
	if config.ErrorDelay <= 0 {
		return errors.New("engineconfig validation failed: ErrorDelay must be positive")
	}
	if config.BounceDelay <= 0 {
		return errors.New("engineconfig validation failed: BounceDelay must be positive")
	}
	return nil
}

// NewEngine returns an Engine that will maintain any installed Manifolds until
// either the engine is stopped or one of the manifolds' workers returns an error
// that satisfies isFatal. The caller takes responsibility for the returned Engine:
// it must Kill() the Engine when it's no longer needed, and must handle any
// error from Wait().
func NewEngine(config EngineConfig) (Engine, error) {
	if err := config.Validate(); err != nil {
		return nil, errors.Trace(err)
	}
	engine := &engine{
		config: config,

		manifolds:  map[string]Manifold{},
		dependents: map[string][]string{},
		current:    map[string]workerInfo{},

		install: make(chan installTicket),
		started: make(chan startedTicket),
		stopped: make(chan stoppedTicket),
	}
	go func() {
		defer engine.tomb.Done()
		engine.tomb.Kill(engine.loop())
	}()
	return engine, nil
}
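// The sketch below shows how a caller might construct and drive an engine. It
// is illustrative only: exampleEngineUsage is not part of the package API, the
// delay values are arbitrary, and it assumes the signatures implied by this
// file's use of the config funcs (IsFatalFunc as func(error) bool, and
// MoreImportantFunc as func(err0, err1 error) error).
func exampleEngineUsage() error {
	eng, err := NewEngine(EngineConfig{
		// Treat no error as fatal; failed workers are simply restarted.
		IsFatal: func(error) bool { return false },
		// No ranking to apply; keep whichever error arrived first.
		MoreImportant: func(err0, err1 error) error { return err1 },
		ErrorDelay:    3 * time.Second,
		BounceDelay:   10 * time.Millisecond,
	})
	if err != nil {
		return errors.Trace(err)
	}
	// The caller owns the engine: Kill it when done, and handle Wait's error.
	defer func() {
		eng.Kill()
		if err := eng.Wait(); err != nil {
			logger.Errorf("engine shut down with: %v", err)
		}
	}()
	// Manifolds would be installed here with eng.Install(name, manifold).
	return nil
}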
// engine maintains workers corresponding to its installed manifolds, and
// restarts them whenever their inputs change.
type engine struct {
	tomb tomb.Tomb

	// config contains values passed in as config when the engine was created.
	config EngineConfig

	// worstError tracks the most important error yet received from a manifold.
	// We use tomb.Tomb to track the engine's life cycle, but the first error we
	// get is not necessarily the most important one; MoreImportant ranks the
	// errors so that Wait can return the worst.
	worstError error

	// manifolds holds the installed manifolds by name.
	manifolds map[string]Manifold

	// dependents holds, for each named manifold, the names of those that
	// depend on it.
	dependents map[string][]string

	// current holds the active worker information for each installed manifold.
	current map[string]workerInfo

	// install, started, and stopped each communicate requests and changes into
	// the loop goroutine.
	install chan installTicket
	started chan startedTicket
	stopped chan stoppedTicket
}

// loop serializes manifold install operations and worker start/stop notifications.
// It's notable for its oneShotDying var, which is necessary because any number of
// start/stop notifications could be in flight at the point the engine needs to
// stop; we need to handle all those, and any subsequent messages, until the main
// loop is confident that every worker has stopped. (The usual pattern -- to defer
// a cleanup method to run before tomb.Done in NewEngine -- is not cleanly
// applicable, because it would need to duplicate that start/stop message handling;
// better to localise it in this method.)
func (engine *engine) loop() error {
	oneShotDying := engine.tomb.Dying()
	for {
		select {
		case <-oneShotDying:
			oneShotDying = nil
			for name := range engine.current {
				engine.requestStop(name)
			}
		case ticket := <-engine.install:
			// This is safe so long as the Install method reads the result.
			ticket.result <- engine.gotInstall(ticket.name, ticket.manifold)
		case ticket := <-engine.started:
			engine.gotStarted(ticket.name, ticket.worker)
		case ticket := <-engine.stopped:
			engine.gotStopped(ticket.name, ticket.error)
		}
		if engine.isDying() {
			if engine.allStopped() {
				if engine.worstError == nil {
					return tomb.ErrDying
				}
				return engine.worstError
			}
		}
	}
}

// Kill is part of the worker.Worker interface.
func (engine *engine) Kill() {
	engine.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (engine *engine) Wait() error {
	<-engine.tomb.Dead()
	return engine.worstError
}

// Install is part of the Engine interface.
func (engine *engine) Install(name string, manifold Manifold) error {
	result := make(chan error)
	select {
	case <-engine.tomb.Dying():
		return errors.New("engine is shutting down")
	case engine.install <- installTicket{name, manifold, result}:
		// This is safe so long as the loop sends a result.
		return <-result
	}
}

// gotInstall handles the params originally supplied to Install. It must only be
// called from the loop goroutine.
func (engine *engine) gotInstall(name string, manifold Manifold) error {
	logger.Infof("installing %q manifold...", name)
	if _, found := engine.manifolds[name]; found {
		return errors.Errorf("%q manifold already installed", name)
	}
	engine.manifolds[name] = manifold
	for _, input := range manifold.Inputs {
		engine.dependents[input] = append(engine.dependents[input], name)
	}
	engine.current[name] = workerInfo{}
	engine.requestStart(name, 0)
	return nil
}
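// What follows is a hypothetical sketch of a manifold being defined and
// installed: the "pinger" and "api-caller" names, and pingWorker itself, are
// invented for illustration. The Manifold fields (Inputs, Start, Output) and
// the GetResourceFunc behaviour are those used by this file.
type pingWorker struct {
	tomb tomb.Tomb
}

func (w *pingWorker) Kill()       { w.tomb.Kill(nil) }
func (w *pingWorker) Wait() error { return w.tomb.Wait() }

func newPingWorker() (worker.Worker, error) {
	w := &pingWorker{}
	go func() {
		defer w.tomb.Done()
		<-w.tomb.Dying() // a real worker would also select on actual work here
	}()
	return w, nil
}

// exampleInstall registers the sketch manifold with an engine. Its Start func
// declares a dependency on the mere existence of "api-caller" by passing a
// nil out value to getResource.
func exampleInstall(eng Engine) error {
	return eng.Install("pinger", Manifold{
		Inputs: []string{"api-caller"},
		Start: func(getResource GetResourceFunc) (worker.Worker, error) {
			if err := getResource("api-caller", nil); err != nil {
				return nil, err
			}
			return newPingWorker()
		},
	})
}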
// requestStart invokes a runWorker goroutine for the manifold with the supplied
// name. It must only be called from the loop goroutine.
func (engine *engine) requestStart(name string, delay time.Duration) {

	// Check preconditions.
	manifold, found := engine.manifolds[name]
	if !found {
		engine.tomb.Kill(errors.Errorf("fatal: unknown manifold %q", name))
	}

	// Copy current info and check more preconditions.
	info := engine.current[name]
	if !info.stopped() {
		engine.tomb.Kill(errors.Errorf("fatal: trying to start a second %q manifold worker", name))
	}

	// Final check that we're not shutting down yet...
	if engine.isDying() {
		logger.Debugf("not starting %q manifold worker (shutting down)", name)
		return
	}

	// ...then update the info, copy it back to the engine, and start a worker
	// goroutine based on current known state.
	info.starting = true
	engine.current[name] = info
	getResource := engine.getResourceFunc(name, manifold.Inputs)
	go engine.runWorker(name, delay, manifold.Start, getResource)
}

// getResourceFunc returns a GetResourceFunc backed by a snapshot of current
// worker state, restricted to those workers declared in inputs. It must only
// be called from the loop goroutine; see inside for a detailed discussion of
// why we took this approach.
func (engine *engine) getResourceFunc(name string, inputs []string) GetResourceFunc {
	// We snapshot the resources available at invocation time, rather than adding
	// an additional communicate-resource-request channel. The latter approach is
	// not unreasonable... but is prone to inelegant scrambles when starting
	// several dependent workers at once. For example:
	//
	//  * Install manifold A; loop starts worker A
	//  * Install manifold B; loop starts worker B
	//  * A communicates its worker back to loop; main thread bounces B
	//  * B asks for A, gets A, doesn't react to bounce (*)
	//  * B communicates its worker back to loop; loop kills it immediately in
	//    response to earlier bounce
	//  * loop starts worker B again, now everything's fine; but, still, yuck.
	//    This is not a happy path to take by default.
	//
	// The problem, of course, is in the (*); the main thread *does* know that B
	// needs to bounce soon anyway, and it *could* communicate that fact back via
	// an error over a channel back into getResource; the StartFunc could then
	// just return (say) ErrResourceChanged and avoid the hassle of creating a
	// worker. But that adds a whole layer of complexity (and unpredictability
	// in tests, which is not much fun) for very little benefit.
	//
	// In the analogous scenario with snapshotted dependencies, we see a happier
	// picture at startup time:
	//
	//  * Install manifold A; loop starts worker A
	//  * Install manifold B; loop starts worker B with empty resource snapshot
	//  * A communicates its worker back to loop; main thread bounces B
	//  * B's StartFunc asks for A, gets nothing, returns ErrMissing
	//  * loop restarts worker B with an up-to-date snapshot, B works fine
	//
	// We assume that, in the common case, most workers run without error most
	// of the time; and, thus, that the vast majority of worker startups will
	// happen as an agent starts. Furthermore, most of them will have simple
	// hard dependencies, and their Start funcs will be easy to write; the only
	// components that may be impacted by such a strategy will be those workers
	// which still want to run (with reduced functionality) with some dependency
	// unmet.
	//
	// Those may indeed suffer the occasional extra bounce as the system comes
	// to stability as it starts, or after a change; but workers *must* be
	// written for resilience in the face of arbitrary bounces *anyway*, so it
	// shouldn't be harmful.
	outputs := map[string]OutputFunc{}
	workers := map[string]worker.Worker{}
	for _, resourceName := range inputs {
		outputs[resourceName] = engine.manifolds[resourceName].Output
		workers[resourceName] = engine.current[resourceName].worker
	}
	return func(resourceName string, out interface{}) error {
		logger.Debugf("%q manifold requested %q resource", name, resourceName)
		input := workers[resourceName]
		if input == nil {
			// No worker running (or not declared).
			return ErrMissing
		}
		convert := outputs[resourceName]
		if convert == nil {
			// No conversion func available...
			if out != nil {
				// ...and the caller wants a resource.
				return ErrMissing
			}
			// ...but it's ok, because the caller depends on existence only.
			return nil
		}
		return convert(input, out)
	}
}
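// The closure above resolves a request in one of three ways: no worker means
// ErrMissing; a nil out means an existence-only check; otherwise the input
// manifold's Output func fills the out value. The hypothetical sketch below
// shows both sides of that conversion; APIConn and connWorker are invented,
// and pingWorker/newPingWorker come from the earlier sketch.
type APIConn struct{}

// connWorker is a sketch worker that owns a connection its Output can expose.
type connWorker struct {
	pingWorker
	conn APIConn
}

// exampleOutput is an OutputFunc: it projects a *connWorker into a **APIConn.
func exampleOutput(in worker.Worker, out interface{}) error {
	w, ok := in.(*connWorker)
	if !ok {
		return errors.Errorf("expected *connWorker, got %T", in)
	}
	p, ok := out.(**APIConn)
	if !ok {
		return errors.Errorf("cannot output to %T", out)
	}
	*p = &w.conn
	return nil
}

// exampleStart is the consuming side: it asks a hypothetical "api-caller"
// manifold for a *APIConn, which exampleOutput above would fill in.
func exampleStart(getResource GetResourceFunc) (worker.Worker, error) {
	var conn *APIConn
	if err := getResource("api-caller", &conn); err != nil {
		// Usually ErrMissing: the dependency isn't running (yet). Returning
		// the error lets the engine restart this worker as inputs change.
		return nil, err
	}
	_ = conn // a real worker would hold on to the connection
	return newPingWorker()
}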
// runWorker starts the supplied manifold's worker and communicates it back to the
// loop goroutine; waits for worker completion; and communicates any error
// encountered back to the loop goroutine. It must not be run on the loop goroutine.
func (engine *engine) runWorker(name string, delay time.Duration, start StartFunc, getResource GetResourceFunc) {
	startWorkerAndWait := func() error {
		logger.Infof("starting %q manifold worker in %s...", name, delay)
		select {
		case <-time.After(delay):
		case <-engine.tomb.Dying():
			logger.Debugf("not starting %q manifold worker (shutting down)", name)
			return tomb.ErrDying
		}

		logger.Debugf("starting %q manifold worker", name)
		worker, err := start(getResource)
		if err != nil {
			logger.Warningf("failed to start %q manifold worker: %v", name, err)
			return err
		}

		logger.Debugf("running %q manifold worker", name)
		select {
		case <-engine.tomb.Dying():
			logger.Debugf("stopping %q manifold worker (shutting down)", name)
			worker.Kill()
		case engine.started <- startedTicket{name, worker}:
			logger.Debugf("registered %q manifold worker", name)
		}
		return worker.Wait()
	}

	// We may or may not send on started, but we *must* send on stopped.
	engine.stopped <- stoppedTicket{name, startWorkerAndWait()}
}

// gotStarted updates the engine to reflect the creation of a worker. It must
// only be called from the loop goroutine.
func (engine *engine) gotStarted(name string, worker worker.Worker) {
	// Copy current info; check preconditions and abort the worker if we've
	// already been asked to stop it.
	info := engine.current[name]
	switch {
	case info.worker != nil:
		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker start", name))
		fallthrough
	case info.stopping, engine.isDying():
		logger.Debugf("%q manifold worker no longer required", name)
		worker.Kill()
	default:
		// It's fine to use this worker; update info and copy back.
		logger.Infof("%q manifold worker started", name)
		info.starting = false
		info.worker = worker
		engine.current[name] = info

		// Any manifold that declares this one as an input needs to be restarted.
		engine.bounceDependents(name)
	}
}
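// gotStopped (below) feeds worker errors through config.IsFatal and
// config.MoreImportant to decide whether to stop the whole engine, and which
// error Wait should eventually report. A plausible, hypothetical pair of
// implementations, built around an invented errTerminate sentinel:
var errTerminate = errors.New("machine is dead")

func exampleIsFatal(err error) bool {
	return errors.Cause(err) == errTerminate
}

func exampleMoreImportant(err0, err1 error) error {
	// With only one fatal error value there is nothing to rank, so keep the
	// error seen first; a richer implementation would order by severity.
	if err1 != nil {
		return err1
	}
	return err0
}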
// gotStopped updates the engine to reflect the demise of (or failure to create)
// a worker. It must only be called from the loop goroutine.
func (engine *engine) gotStopped(name string, err error) {
	logger.Infof("%q manifold worker stopped: %v", name, err)

	// Copy current info and check for reasons to stop the engine.
	info := engine.current[name]
	if info.stopped() {
		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker stop", name))
	} else if engine.config.IsFatal(err) {
		// Rank this error against the worst seen so far, so that Wait reports
		// the most important error rather than merely the first.
		if engine.worstError == nil {
			engine.worstError = err
		} else {
			engine.worstError = engine.config.MoreImportant(err, engine.worstError)
		}
		engine.tomb.Kill(err)
	}

	// Reset engine info; and bail out if we can be sure there's no need to bounce.
	engine.current[name] = workerInfo{}
	if engine.isDying() {
		logger.Debugf("permanently stopped %q manifold worker (shutting down)", name)
		return
	}

	// If we told the worker to stop, we should start it again immediately,
	// whatever else happened.
	if info.stopping {
		engine.requestStart(name, engine.config.BounceDelay)
	} else {
		// If we didn't stop it ourselves, we need to interpret the error.
		switch err {
		case nil:
			// Nothing went wrong; the task completed successfully. Nothing
			// needs to be done (unless the inputs change, in which case it
			// gets to check again).
		case ErrMissing:
			// The task can't even start with the current state. Nothing more
			// can be done (until the inputs change, in which case we retry
			// anyway).
		default:
			// Something went wrong, but we don't know what. Try again soon.
			engine.requestStart(name, engine.config.ErrorDelay)
		}
	}

	// Manifolds that declared a dependency on this one only need to be notified
	// if the worker has changed; if it was already nil, nobody needs to know.
	if info.worker != nil {
		engine.bounceDependents(name)
	}
}

// requestStop ensures that any running or starting worker will be stopped in the
// near future. It must only be called from the loop goroutine.
func (engine *engine) requestStop(name string) {

	// If the worker is already stopping or stopped, there's nothing to do.
	info := engine.current[name]
	if info.stopping || info.stopped() {
		return
	}

	// Update info, kill the worker if present, and copy the info back.
	info.stopping = true
	if info.worker != nil {
		info.worker.Kill()
	}
	engine.current[name] = info
}

// isDying returns true if the engine is shutting down. It's safe to call from
// any goroutine.
func (engine *engine) isDying() bool {
	select {
	case <-engine.tomb.Dying():
		return true
	default:
		return false
	}
}

// allStopped returns true if no workers are running or starting. It must only
// be called from the loop goroutine.
func (engine *engine) allStopped() bool {
	for _, info := range engine.current {
		if !info.stopped() {
			return false
		}
	}
	return true
}
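// End to end, the pieces combine like this: if "a"'s worker stops, gotStopped
// calls bounceDependents (below), which stops "b"; when "b"'s own stop is
// observed, info.stopping triggers a restart after BounceDelay, with a fresh
// resource snapshot. A hypothetical two-manifold wiring, reusing the sketch
// helpers defined earlier:
func exampleWiring(eng Engine) error {
	if err := eng.Install("a", Manifold{
		Start: func(_ GetResourceFunc) (worker.Worker, error) {
			return newPingWorker()
		},
	}); err != nil {
		return errors.Trace(err)
	}
	return eng.Install("b", Manifold{
		Inputs: []string{"a"},
		Start: func(getResource GetResourceFunc) (worker.Worker, error) {
			if err := getResource("a", nil); err != nil {
				return nil, err // ErrMissing until "a" is running
			}
			return newPingWorker()
		},
	})
}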
// bounceDependents starts every stopped dependent of the named manifold, and
// stops every started one (and trusts the rest of the engine to restart them).
// It must only be called from the loop goroutine.
func (engine *engine) bounceDependents(name string) {
	logger.Debugf("restarting dependents of %q manifold", name)
	for _, dependentName := range engine.dependents[name] {
		if engine.current[dependentName].stopped() {
			engine.requestStart(dependentName, engine.config.BounceDelay)
		} else {
			engine.requestStop(dependentName)
		}
	}
}

// workerInfo stores what an engine's loop goroutine needs to know about the
// worker for a given Manifold.
type workerInfo struct {
	starting bool
	stopping bool
	worker   worker.Worker
}

// stopped returns true unless the worker is either assigned or starting.
func (info workerInfo) stopped() bool {
	switch {
	case info.worker != nil:
		return false
	case info.starting:
		return false
	}
	return true
}

// installTicket is used by engine to induce installation of a named manifold
// and pass on any errors encountered in the process.
type installTicket struct {
	name     string
	manifold Manifold
	result   chan<- error
}

// startedTicket is used by engine to notify the loop of the creation of the
// worker for a particular manifold.
type startedTicket struct {
	name   string
	worker worker.Worker
}

// stoppedTicket is used by engine to notify the loop of the demise of (or
// failure to create) the worker for a particular manifold.
type stoppedTicket struct {
	name  string
	error error
}
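// Taken together, the three ticket types give the loop goroutine exclusive
// ownership of the engine's mutable state: every external event -- an Install
// call, a worker starting, a worker stopping -- reaches that state only via a
// channel send into loop, which is why the engine needs no locking.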