// Copyright 2015-2016 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package dependency

import (
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/utils/set"
	"gopkg.in/tomb.v1"

	"github.com/juju/juju/worker"
)

var logger = loggo.GetLogger("juju.worker.dependency")

// EngineConfig defines the parameters needed to create a new engine.
type EngineConfig struct {

	// IsFatal returns true when passed an error that should stop
	// the engine. It must not be nil.
	IsFatal IsFatalFunc

	// WorstError returns the more important of two fatal errors
	// passed to it, and is used to determine which fatal error to
	// report when there's more than one. It must not be nil.
	WorstError WorstErrorFunc

	// Filter, if not nil, will modify any fatal error reported from
	// Wait().
	Filter FilterFunc

	// ErrorDelay controls how long the engine waits before restarting
	// a worker that stopped with an unknown error. It must not be
	// negative.
	ErrorDelay time.Duration

	// BounceDelay controls how long the engine waits before restarting
	// a worker that was deliberately stopped because its dependencies
	// changed. It must not be negative.
	BounceDelay time.Duration
}

// Validate returns an error if any field is invalid.
func (config *EngineConfig) Validate() error {
	if config.IsFatal == nil {
		return errors.New("IsFatal not specified")
	}
	if config.WorstError == nil {
		return errors.New("WorstError not specified")
	}
	if config.ErrorDelay < 0 {
		return errors.New("ErrorDelay is negative")
	}
	if config.BounceDelay < 0 {
		return errors.New("BounceDelay is negative")
	}
	return nil
}

// NewEngine returns an Engine that will maintain any installed Manifolds until
// either the engine is stopped or one of the manifolds' workers returns an error
// that satisfies IsFatal. The caller takes responsibility for the returned Engine:
// it must Kill() the Engine when it is no longer needed, and must handle any error
// from Wait().
func NewEngine(config EngineConfig) (*Engine, error) {
	if err := config.Validate(); err != nil {
		return nil, errors.Annotatef(err, "invalid config")
	}
	engine := &Engine{
		config: config,

		manifolds:  Manifolds{},
		dependents: map[string][]string{},
		current:    map[string]workerInfo{},

		install: make(chan installTicket),
		started: make(chan startedTicket),
		stopped: make(chan stoppedTicket),
		report:  make(chan reportTicket),
	}
	go func() {
		defer engine.tomb.Done()
		engine.tomb.Kill(engine.loop())
	}()
	return engine, nil
}
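// A minimal usage sketch for the above (as a comment, since this file is the
// implementation itself). The IsFatal/WorstError policies below are
// illustrative assumptions; real callers supply their own.
//
//	engine, err := NewEngine(EngineConfig{
//		IsFatal:     func(err error) bool { return false },
//		WorstError:  func(err0, err1 error) error { return err0 },
//		ErrorDelay:  3 * time.Second,
//		BounceDelay: 10 * time.Millisecond,
//	})
//	if err != nil {
//		return errors.Trace(err)
//	}
//	defer func() {
//		engine.Kill()
//		if err := engine.Wait(); err != nil {
//			logger.Errorf("engine failed: %v", err)
//		}
//	}()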
// Engine maintains workers corresponding to its installed manifolds, and
// restarts them whenever their inputs change.
type Engine struct {

	// config contains values passed in as config when the engine was created.
	config EngineConfig

	// As usual, we use tomb.Tomb to track the lifecycle and error state of the
	// engine worker itself; but we *only* report *internal* errors via the tomb.
	// Fatal errors received from workers are *not* used to kill the tomb; they
	// are tracked separately, and will only be exposed to the client when the
	// engine's tomb has completed its job and encountered no errors.
	tomb tomb.Tomb

	// worstError is used to track the most important fatal error we've received
	// from any manifold. This should be the only place fatal errors are stored;
	// they must *not* be passed into the tomb.
	worstError error

	// manifolds holds the installed manifolds by name.
	manifolds Manifolds

	// dependents holds, for each named manifold, those that depend on it.
	dependents map[string][]string

	// current holds the active worker information for each installed manifold.
	current map[string]workerInfo

	// install, started, report and stopped each communicate requests and changes
	// into the loop goroutine.
	install chan installTicket
	started chan startedTicket
	stopped chan stoppedTicket
	report  chan reportTicket
}

// loop serializes manifold install operations and worker start/stop notifications.
// It's notable for its oneShotDying var, which is necessary because any number of
// start/stop notifications could be in flight at the point the engine needs to stop;
// we need to handle all those, and any subsequent messages, until the main loop is
// confident that every worker has stopped. (The usual pattern -- to defer a cleanup
// method to run before tomb.Done in NewEngine -- is not cleanly applicable, because
// it would need to duplicate that start/stop message handling; better to localise
// it in this method.)
func (engine *Engine) loop() error {
	oneShotDying := engine.tomb.Dying()
	for {
		select {
		case <-oneShotDying:
			oneShotDying = nil
			for name := range engine.current {
				engine.requestStop(name)
			}
		case ticket := <-engine.report:
			// This is safe so long as the Report method reads the result.
			ticket.result <- engine.liveReport()
		case ticket := <-engine.install:
			// This is safe so long as the Install method reads the result.
			ticket.result <- engine.gotInstall(ticket.name, ticket.manifold)
		case ticket := <-engine.started:
			engine.gotStarted(ticket.name, ticket.worker, ticket.resourceLog)
		case ticket := <-engine.stopped:
			engine.gotStopped(ticket.name, ticket.error, ticket.resourceLog)
		}
		if engine.isDying() {
			if engine.allOthersStopped() {
				return tomb.ErrDying
			}
		}
	}
}

// Kill is part of the worker.Worker interface.
func (engine *Engine) Kill() {
	engine.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (engine *Engine) Wait() error {
	if tombError := engine.tomb.Wait(); tombError != nil {
		return tombError
	}
	err := engine.worstError
	if engine.config.Filter != nil {
		return engine.config.Filter(err)
	}
	return err
}
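// A sketch of how EngineConfig.Filter interacts with Wait: whatever fatal
// error survives the run is passed through the filter before the caller sees
// it. The sentinel error below is an illustrative assumption.
//
//	config.Filter = func(err error) error {
//		if errors.Cause(err) == errRestartAgent { // hypothetical sentinel
//			return errors.Annotate(err, "engine stopped")
//		}
//		return err
//	}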
// Report is part of the Reporter interface.
func (engine *Engine) Report() map[string]interface{} {
	report := make(chan map[string]interface{})
	select {
	case engine.report <- reportTicket{report}:
		// This is safe so long as the loop sends a result.
		return <-report
	case <-engine.tomb.Dead():
		// Note that we don't abort on Dying as we usually would; the
		// oneShotDying approach in loop means that it can continue to
		// process requests until the last possible moment. Only once
		// loop has exited do we fall back to this report.
		report := map[string]interface{}{
			KeyState:     "stopped",
			KeyManifolds: engine.manifoldsReport(),
		}
		if err := engine.Wait(); err != nil {
			report[KeyError] = err.Error()
		}
		return report
	}
}

// liveReport collects and returns information about the engine, its manifolds,
// and their workers. It must only be called from the loop goroutine.
func (engine *Engine) liveReport() map[string]interface{} {
	var reportError error
	state := "started"
	if engine.isDying() {
		state = "stopping"
		if tombError := engine.tomb.Err(); tombError != nil {
			reportError = tombError
		} else {
			reportError = engine.worstError
		}
	}
	report := map[string]interface{}{
		KeyState:     state,
		KeyManifolds: engine.manifoldsReport(),
	}
	if reportError != nil {
		report[KeyError] = reportError.Error()
	}
	return report
}

// manifoldsReport collects and returns information about the engine's manifolds
// and their workers. Until the tomb is Dead, it should only be called from the
// loop goroutine; after that, it's goroutine-safe.
func (engine *Engine) manifoldsReport() map[string]interface{} {
	manifolds := map[string]interface{}{}
	for name, info := range engine.current {
		report := map[string]interface{}{
			KeyState:       info.state(),
			KeyInputs:      engine.manifolds[name].Inputs,
			KeyResourceLog: resourceLogReport(info.resourceLog),
		}
		if info.err != nil {
			report[KeyError] = info.err.Error()
		}
		if reporter, ok := info.worker.(Reporter); ok {
			if reporter != engine {
				report[KeyReport] = reporter.Report()
			}
		}
		manifolds[name] = report
	}
	return manifolds
}

// Install is part of the Engine interface.
func (engine *Engine) Install(name string, manifold Manifold) error {
	result := make(chan error)
	select {
	case <-engine.tomb.Dying():
		return errors.New("engine is shutting down")
	case engine.install <- installTicket{name, manifold, result}:
		// This is safe so long as the loop sends a result.
		return <-result
	}
}

// gotInstall handles the params originally supplied to Install. It must only be
// called from the loop goroutine.
func (engine *Engine) gotInstall(name string, manifold Manifold) error {
	logger.Tracef("installing %q manifold...", name)
	if _, found := engine.manifolds[name]; found {
		return errors.Errorf("%q manifold already installed", name)
	}
	if err := engine.checkAcyclic(name, manifold); err != nil {
		return errors.Annotatef(err, "cannot install %q manifold", name)
	}
	engine.manifolds[name] = manifold
	for _, input := range manifold.Inputs {
		engine.dependents[input] = append(engine.dependents[input], name)
	}
	engine.current[name] = workerInfo{}
	engine.requestStart(name, 0)
	return nil
}
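// A sketch of installing a manifold with one declared input. The names and
// helpers are illustrative assumptions, and the Context-based StartFunc
// signature is assumed from this package's usage:
//
//	err := engine.Install("api-caller", Manifold{
//		Inputs: []string{"agent"},
//		Start: func(context Context) (worker.Worker, error) {
//			var a agent.Agent
//			if err := context.Get("agent", &a); err != nil {
//				return nil, err // typically ErrMissing until "agent" is up
//			}
//			return newAPICallerWorker(a) // hypothetical constructor
//		},
//	})
//	if err != nil {
//		return errors.Trace(err)
//	}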
// uninstall removes the named manifold from the engine's records.
func (engine *Engine) uninstall(name string) {
	// Note that we *don't* want to remove dependents[name] -- all those other
	// manifolds do still depend on this, and another manifold with the same
	// name might be installed in the future -- but we do want to remove the
	// named manifold from all *values* in the dependents map.
	for dName, dependents := range engine.dependents {
		depSet := set.NewStrings(dependents...)
		depSet.Remove(name)
		engine.dependents[dName] = depSet.Values()
	}
	delete(engine.current, name)
	delete(engine.manifolds, name)
}

// checkAcyclic returns an error if the introduction of the supplied manifold
// would cause the dependency graph to contain cycles.
func (engine *Engine) checkAcyclic(name string, manifold Manifold) error {
	manifolds := Manifolds{name: manifold}
	for name, manifold := range engine.manifolds {
		manifolds[name] = manifold
	}
	return Validate(manifolds)
}
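// As a concrete illustration of checkAcyclic: if manifold "a" is already
// installed with Inputs: []string{"b"}, then
//
//	engine.Install("b", Manifold{Inputs: []string{"a"}, Start: start})
//
// would close the cycle a -> b -> a, so gotInstall's checkAcyclic call (via
// Validate) rejects it before any engine records are modified. (The start
// func here is an arbitrary placeholder.)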
// requestStart invokes a runWorker goroutine for the manifold with the supplied
// name. It must only be called from the loop goroutine.
func (engine *Engine) requestStart(name string, delay time.Duration) {

	// Check preconditions.
	manifold, found := engine.manifolds[name]
	if !found {
		engine.tomb.Kill(errors.Errorf("fatal: unknown manifold %q", name))
	}

	// Copy current info and check more preconditions.
	info := engine.current[name]
	if !info.stopped() {
		engine.tomb.Kill(errors.Errorf("fatal: trying to start a second %q manifold worker", name))
	}

	// Final check that we're not shutting down yet...
	if engine.isDying() {
		logger.Tracef("not starting %q manifold worker (shutting down)", name)
		return
	}

	// ...then update the info, copy it back to the engine, and start a worker
	// goroutine based on current known state.
	info.starting = true
	info.abort = make(chan struct{})
	engine.current[name] = info
	context := engine.context(name, manifold.Inputs, info.abort)
	go engine.runWorker(name, delay, manifold.Start, context)
}

// context returns a context backed by a snapshot of current
// worker state, restricted to those workers declared in inputs. It must only
// be called from the loop goroutine; see inside for a detailed discussion of
// why we took this approach.
func (engine *Engine) context(name string, inputs []string, abort <-chan struct{}) *context {
	// We snapshot the resources available at invocation time, rather than adding an
	// additional communicate-resource-request channel. The latter approach is not
	// unreasonable... but is prone to inelegant scrambles when starting several
	// dependent workers at once. For example:
	//
	//  * Install manifold A; loop starts worker A
	//  * Install manifold B; loop starts worker B
	//  * A communicates its worker back to loop; main thread bounces B
	//  * B asks for A, gets A, doesn't react to bounce (*)
	//  * B communicates its worker back to loop; loop kills it immediately in
	//    response to earlier bounce
	//  * loop starts worker B again; now everything's fine but, still, yuck.
	//    This is not a happy path to take by default.
	//
	// The problem, of course, is in the (*); the main thread *does* know that B
	// needs to bounce soon anyway, and it *could* communicate that fact back via
	// an error over a channel back into context.Get; the StartFunc could then
	// just return (say) that ErrResourceChanged and avoid the hassle of creating
	// a worker. But that adds a whole layer of complexity (and unpredictability
	// in tests, which is not much fun) for very little benefit.
	//
	// In the analogous scenario with snapshotted dependencies, we see a happier
	// picture at startup time:
	//
	//  * Install manifold A; loop starts worker A
	//  * Install manifold B; loop starts worker B with empty resource snapshot
	//  * A communicates its worker back to loop; main thread bounces B
	//  * B's StartFunc asks for A, gets nothing, returns ErrMissing
	//  * loop restarts worker B with an up-to-date snapshot, B works fine
	//
	// We assume that, in the common case, most workers run without error most
	// of the time; and, thus, that the vast majority of worker startups will
	// happen as an agent starts. Furthermore, most of them will have simple
	// hard dependencies, and their Start funcs will be easy to write; the only
	// components that may be impacted by such a strategy will be those workers
	// which still want to run (with reduced functionality) with some dependency
	// unmet.
	//
	// Those may indeed suffer the occasional extra bounce as the system comes
	// to stability as it starts, or after a change; but workers *must* be
	// written for resilience in the face of arbitrary bounces *anyway*, so it
	// shouldn't be harmful.
	outputs := map[string]OutputFunc{}
	workers := map[string]worker.Worker{}
	for _, resourceName := range inputs {
		outputs[resourceName] = engine.manifolds[resourceName].Output
		workers[resourceName] = engine.current[resourceName].worker
	}
	return &context{
		clientName: name,
		abort:      abort,
		expired:    make(chan struct{}),
		workers:    workers,
		outputs:    outputs,
	}
}
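// A sketch of a StartFunc written against the snapshotting behaviour above:
// it treats a missing *optional* input as "run degraded" rather than failing.
// All names below are illustrative assumptions.
//
//	Start: func(context Context) (worker.Worker, error) {
//		var recorder MetricsRecorder // hypothetical optional dependency
//		if err := context.Get("metrics", &recorder); err != nil {
//			if errors.Cause(err) != ErrMissing {
//				return nil, err
//			}
//			// Run with reduced functionality; when the "metrics"
//			// worker does start, bounceDependents will restart us
//			// with a fresh snapshot.
//			recorder = nil
//		}
//		return newMainWorker(recorder) // hypothetical constructor
//	},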
// runWorker starts the supplied manifold's worker and communicates it back to the
// loop goroutine; waits for worker completion; and communicates any error encountered
// back to the loop goroutine. It must not be run on the loop goroutine.
func (engine *Engine) runWorker(name string, delay time.Duration, start StartFunc, context *context) {

	errAborted := errors.New("aborted before delay elapsed")

	startAfterDelay := func() (worker.Worker, error) {
		// NOTE: the context will expire *after* the worker is started.
		// This is tolerable because
		//  1) we'll still correctly block access attempts most of the time
		//  2) failing to block them won't cause data races anyway
		//  3) it's not worth complicating the interface for every client just
		//     to eliminate the possibility of one harmlessly dumb interaction.
		defer context.expire()
		logger.Tracef("starting %q manifold worker in %s...", name, delay)
		select {
		case <-engine.tomb.Dying():
			return nil, errAborted
		case <-context.Abort():
			return nil, errAborted
		// TODO(fwereade): 2016-03-17 lp:1558657
		case <-time.After(delay):
		}
		logger.Tracef("starting %q manifold worker", name)
		return start(context)
	}

	startWorkerAndWait := func() error {
		worker, err := startAfterDelay()
		switch errors.Cause(err) {
		case errAborted:
			return nil
		case nil:
			logger.Tracef("running %q manifold worker", name)
		default:
			logger.Tracef("failed to start %q manifold worker: %v", name, err)
			return err
		}
		select {
		case <-engine.tomb.Dying():
			logger.Tracef("stopping %q manifold worker (shutting down)", name)
			// Doesn't matter whether worker == engine: if we're already Dying
			// then cleanly Kill()ing ourselves again won't hurt anything.
			worker.Kill()
		case engine.started <- startedTicket{name, worker, context.accessLog}:
			logger.Tracef("registered %q manifold worker", name)
		}
		if worker == engine {
			// We mustn't Wait() for ourselves to complete here, or we'll
			// deadlock. But we should wait until we're Dying, because we
			// need this func to keep running to keep the self manifold
			// accessible as a resource.
			<-engine.tomb.Dying()
			return tomb.ErrDying
		}

		return worker.Wait()
	}

	// We may or may not send on started, but we *must* send on stopped.
	engine.stopped <- stoppedTicket{name, startWorkerAndWait(), context.accessLog}
}

// gotStarted updates the engine to reflect the creation of a worker. It must
// only be called from the loop goroutine.
func (engine *Engine) gotStarted(name string, worker worker.Worker, resourceLog []resourceAccess) {
	// Copy current info; check preconditions and abort the worker if we've
	// already been asked to stop it.
	info := engine.current[name]
	switch {
	case info.worker != nil:
		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker start", name))
		fallthrough
	case info.stopping, engine.isDying():
		logger.Tracef("%q manifold worker no longer required", name)
		worker.Kill()
	default:
		// It's fine to use this worker; update info and copy back.
		logger.Debugf("%q manifold worker started", name)
		engine.current[name] = workerInfo{
			worker:      worker,
			resourceLog: resourceLog,
		}

		// Any manifold that declares this one as an input needs to be restarted.
		engine.bounceDependents(name)
	}
}
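// A sketch of the special error protocol that gotStopped (below) interprets.
// The worker type here is an illustrative assumption; what matters is the
// error it eventually returns from Wait:
//
//	func (w *oneShotWorker) loop() error {
//		if err := w.doTheOneThing(); err != nil {
//			return err // unrecognised error: restarted after ErrorDelay
//		}
//		return ErrUninstall // done forever: the manifold is removed
//	}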
// gotStopped updates the engine to reflect the demise of (or failure to create)
// a worker. It must only be called from the loop goroutine.
func (engine *Engine) gotStopped(name string, err error, resourceLog []resourceAccess) {
	logger.Debugf("%q manifold worker stopped: %v", name, err)
	if filter := engine.manifolds[name].Filter; filter != nil {
		err = filter(err)
	}

	// Copy current info and check for reasons to stop the engine.
	info := engine.current[name]
	if info.stopped() {
		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker stop", name))
	} else if engine.config.IsFatal(err) {
		engine.worstError = engine.config.WorstError(err, engine.worstError)
		engine.tomb.Kill(nil)
	}

	// Reset engine info; and bail out if we can be sure there's no need to bounce.
	engine.current[name] = workerInfo{
		err:         err,
		resourceLog: resourceLog,
	}
	if engine.isDying() {
		logger.Tracef("permanently stopped %q manifold worker (shutting down)", name)
		return
	}

	// If we told the worker to stop, we should start it again immediately,
	// whatever else happened.
	if info.stopping {
		engine.requestStart(name, engine.config.BounceDelay)
	} else {
		// If we didn't stop it ourselves, we need to interpret the error.
		switch errors.Cause(err) {
		case nil:
			// Nothing went wrong; the task completed successfully. Nothing
			// needs to be done (unless the inputs change, in which case it
			// gets to check again).
		case ErrMissing:
			// The task can't even start with the current state. Nothing more
			// can be done (until the inputs change, in which case we retry
			// anyway).
		case ErrBounce:
			// The task exited but wanted to restart immediately.
			engine.requestStart(name, engine.config.BounceDelay)
		case ErrUninstall:
			// The task should never run again, and can be removed completely.
			engine.uninstall(name)
		default:
			// Something went wrong but we don't know what. Try again soon.
			logger.Errorf("%q manifold worker returned unexpected error: %v", name, err)
			engine.requestStart(name, engine.config.ErrorDelay)
		}
	}

	// Manifolds that declared a dependency on this one only need to be notified
	// if the worker has changed; if it was already nil, nobody needs to know.
	if info.worker != nil {
		engine.bounceDependents(name)
	}
}

// requestStop ensures that any running or starting worker will be stopped in the
// near future. It must only be called from the loop goroutine.
func (engine *Engine) requestStop(name string) {

	// If already stopping or stopped, just don't do anything.
	info := engine.current[name]
	if info.stopping || info.stopped() {
		return
	}

	// Update info, kill worker if present, and copy info back to engine.
	info.stopping = true
	if info.abort != nil {
		close(info.abort)
		info.abort = nil
	}
	if info.worker != nil {
		info.worker.Kill()
	}
	engine.current[name] = info
}

// isDying returns true if the engine is shutting down. It's safe to call it
// from any goroutine.
func (engine *Engine) isDying() bool {
	select {
	case <-engine.tomb.Dying():
		return true
	default:
		return false
	}
}

// allOthersStopped returns true if no workers (other than the engine itself,
// if it happens to have been injected) are running or starting. It must only
// be called from the loop goroutine.
func (engine *Engine) allOthersStopped() bool {
	for _, info := range engine.current {
		if !info.stopped() && info.worker != engine {
			return false
		}
	}
	return true
}

// bounceDependents starts every stopped dependent of the named manifold, and
// stops every started one (and trusts the rest of the engine to restart them).
// It must only be called from the loop goroutine.
func (engine *Engine) bounceDependents(name string) {
	logger.Tracef("restarting dependents of %q manifold", name)
	for _, dependentName := range engine.dependents[name] {
		if engine.current[dependentName].stopped() {
			engine.requestStart(dependentName, engine.config.BounceDelay)
		} else {
			engine.requestStop(dependentName)
		}
	}
}

// workerInfo stores what an engine's loop goroutine needs to know about the
// worker for a given Manifold.
type workerInfo struct {
	starting    bool
	stopping    bool
	abort       chan struct{}
	worker      worker.Worker
	err         error
	resourceLog []resourceAccess
}

// stopped returns true unless the worker is either assigned or starting.
func (info workerInfo) stopped() bool {
	switch {
	case info.worker != nil:
		return false
	case info.starting:
		return false
	}
	return true
}

// state returns the latest known state of the worker, for use in reports.
func (info workerInfo) state() string {
	switch {
	case info.starting:
		return "starting"
	case info.stopping:
		return "stopping"
	case info.worker != nil:
		return "started"
	}
	return "stopped"
}
// installTicket is used by engine to induce installation of a named manifold
// and pass on any errors encountered in the process.
type installTicket struct {
	name     string
	manifold Manifold
	result   chan<- error
}

// startedTicket is used by engine to notify the loop of the creation of the
// worker for a particular manifold.
type startedTicket struct {
	name        string
	worker      worker.Worker
	resourceLog []resourceAccess
}

// stoppedTicket is used by engine to notify the loop of the demise of (or
// failure to create) the worker for a particular manifold.
type stoppedTicket struct {
	name        string
	error       error
	resourceLog []resourceAccess
}

// reportTicket is used by the engine to notify the loop that a status report
// should be generated.
type reportTicket struct {
	result chan map[string]interface{}
}