github.com/makyo/juju@v0.0.0-20160425123129-2608902037e9/worker/dependency/engine.go

// Copyright 2015-2016 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package dependency

import (
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/utils/set"
	"launchpad.net/tomb"

	"github.com/juju/juju/worker"
)

var logger = loggo.GetLogger("juju.worker.dependency")

// EngineConfig defines the parameters needed to create a new engine.
type EngineConfig struct {

	// IsFatal returns true when passed an error that should stop
	// the engine. It must not be nil.
	IsFatal IsFatalFunc

	// WorstError returns the more important of two fatal errors
	// passed to it, and is used to determine which fatal error to
	// report when there's more than one. It must not be nil.
	WorstError WorstErrorFunc

	// Filter, if not nil, will modify any fatal error reported from
	// Wait().
	Filter FilterFunc

	// ErrorDelay controls how long the engine waits before starting
	// a worker that stopped with an unknown error. It must not be
	// negative.
	ErrorDelay time.Duration

	// BounceDelay controls how long the engine waits before starting
	// a worker that was deliberately stopped because its dependencies
	// changed. It must not be negative.
	BounceDelay time.Duration
}

// Validate returns an error if any field is invalid.
func (config *EngineConfig) Validate() error {
	if config.IsFatal == nil {
		return errors.New("IsFatal not specified")
	}
	if config.WorstError == nil {
		return errors.New("WorstError not specified")
	}
	if config.ErrorDelay < 0 {
		return errors.New("ErrorDelay is negative")
	}
	if config.BounceDelay < 0 {
		return errors.New("BounceDelay is negative")
	}
	return nil
}

// NewEngine returns an Engine that will maintain any installed Manifolds until
// either the engine is stopped or one of the manifolds' workers returns an error
// that satisfies isFatal. The caller takes responsibility for the returned Engine:
// it's responsible for Kill()ing the Engine when no longer used, and must handle
// any error from Wait().
func NewEngine(config EngineConfig) (*Engine, error) {
	if err := config.Validate(); err != nil {
		return nil, errors.Annotatef(err, "invalid config")
	}
	engine := &Engine{
		config: config,

		manifolds:  Manifolds{},
		dependents: map[string][]string{},
		current:    map[string]workerInfo{},

		install: make(chan installTicket),
		started: make(chan startedTicket),
		stopped: make(chan stoppedTicket),
		report:  make(chan reportTicket),
	}
	go func() {
		defer engine.tomb.Done()
		engine.tomb.Kill(engine.loop())
	}()
	return engine, nil
}
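
// The following is a hypothetical usage sketch, not part of the original
// file: it builds a minimal EngineConfig and hands it to NewEngine, per
// the contract documented above. The function literals shown are
// stand-ins chosen for illustration; their shapes are inferred from how
// the engine calls IsFatal and WorstError elsewhere in this file.
func exampleNewEngine() (*Engine, error) {
	config := EngineConfig{
		// Treat no error as fatal; a real config would inspect err.
		IsFatal: func(err error) bool { return false },
		// With no meaningful ordering, just keep the first error seen.
		WorstError: func(err0, err1 error) error {
			if err0 != nil {
				return err0
			}
			return err1
		},
		ErrorDelay:  3 * time.Second,
		BounceDelay: 10 * time.Millisecond,
	}
	return NewEngine(config)
}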

// Engine maintains workers corresponding to its installed manifolds, and
// restarts them whenever their inputs change.
type Engine struct {

	// config contains values passed in as config when the engine was created.
	config EngineConfig

	// As usual, we use tomb.Tomb to track the lifecycle and error state of the
	// engine worker itself; but we *only* report *internal* errors via the tomb.
	// Fatal errors received from workers are *not* used to kill the tomb; they
	// are tracked separately, and will only be exposed to the client when the
	// engine's tomb has completed its job and encountered no errors.
	tomb tomb.Tomb

	// worstError is used to track the most important fatal error we've received
	// from any manifold. This should be the only place fatal errors are stored;
	// they must *not* be passed into the tomb.
	worstError error

	// manifolds holds the installed manifolds by name.
	manifolds Manifolds

	// dependents holds, for each named manifold, those that depend on it.
	dependents map[string][]string

	// current holds the active worker information for each installed manifold.
	current map[string]workerInfo

	// install, started, report and stopped each communicate requests and changes into
	// the loop goroutine.
	install chan installTicket
	started chan startedTicket
	stopped chan stoppedTicket
	report  chan reportTicket
}

// loop serializes manifold install operations and worker start/stop notifications.
// It's notable for its oneShotDying var, which is necessary because any number of
// start/stop notifications could be in flight at the point the engine needs to stop;
// we need to handle all those, and any subsequent messages, until the main loop is
// confident that every worker has stopped. (The usual pattern -- to defer a cleanup
// method to run before tomb.Done in NewEngine -- is not cleanly applicable, because
// it would need to duplicate that start/stop message handling; better to localise it
// in this method.)
func (engine *Engine) loop() error {
	oneShotDying := engine.tomb.Dying()
	for {
		select {
		case <-oneShotDying:
			oneShotDying = nil
			for name := range engine.current {
				engine.requestStop(name)
			}
		case ticket := <-engine.report:
			// This is safe so long as the Report method reads the result.
			ticket.result <- engine.liveReport()
		case ticket := <-engine.install:
			// This is safe so long as the Install method reads the result.
			ticket.result <- engine.gotInstall(ticket.name, ticket.manifold)
		case ticket := <-engine.started:
			engine.gotStarted(ticket.name, ticket.worker, ticket.resourceLog)
		case ticket := <-engine.stopped:
			engine.gotStopped(ticket.name, ticket.error, ticket.resourceLog)
		}
		if engine.isDying() {
			if engine.allOthersStopped() {
				return tomb.ErrDying
			}
		}
	}
}

// Kill is part of the worker.Worker interface.
func (engine *Engine) Kill() {
	engine.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (engine *Engine) Wait() error {
	if tombError := engine.tomb.Wait(); tombError != nil {
		return tombError
	}
	err := engine.worstError
	if engine.config.Filter != nil {
		return engine.config.Filter(err)
	}
	return err
}
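
// A hypothetical caller-side sketch (not part of the original file) of
// the ownership contract NewEngine documents: the caller Kill()s the
// engine when it's no longer needed and must handle the error reported
// by Wait(). The abort channel is a placeholder for whatever shutdown
// signal the caller uses.
func exampleRun(engine *Engine, abort <-chan struct{}) error {
	<-abort
	engine.Kill()
	return engine.Wait()
}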

// Report is part of the Reporter interface.
func (engine *Engine) Report() map[string]interface{} {
	report := make(chan map[string]interface{})
	select {
	case engine.report <- reportTicket{report}:
		// This is safe so long as the loop sends a result.
		return <-report
	case <-engine.tomb.Dead():
		// Note that we don't abort on Dying as we usually would; the
		// oneShotDying approach in loop means that it can continue to
		// process requests until the last possible moment. Only once
		// loop has exited do we fall back to this report.
		return map[string]interface{}{
			KeyState:     "stopped",
			KeyError:     engine.Wait(),
			KeyManifolds: engine.manifoldsReport(),
		}
	}
}

// liveReport collects and returns information about the engine, its manifolds,
// and their workers. It must only be called from the loop goroutine.
func (engine *Engine) liveReport() map[string]interface{} {
	var reportError error
	state := "started"
	if engine.isDying() {
		state = "stopping"
		if tombError := engine.tomb.Err(); tombError != nil {
			reportError = tombError
		} else {
			reportError = engine.worstError
		}
	}
	return map[string]interface{}{
		KeyState:     state,
		KeyError:     reportError,
		KeyManifolds: engine.manifoldsReport(),
	}
}

// manifoldsReport collects and returns information about the engine's manifolds
// and their workers. Until the tomb is Dead, it should only be called from the
// loop goroutine; after that, it's goroutine-safe.
func (engine *Engine) manifoldsReport() map[string]interface{} {
	manifolds := map[string]interface{}{}
	for name, info := range engine.current {
		manifolds[name] = map[string]interface{}{
			KeyState:       info.state(),
			KeyError:       info.err,
			KeyInputs:      engine.manifolds[name].Inputs,
			KeyReport:      info.report(),
			KeyResourceLog: resourceLogReport(info.resourceLog),
		}
	}
	return manifolds
}

// Install is part of the Engine interface.
func (engine *Engine) Install(name string, manifold Manifold) error {
	result := make(chan error)
	select {
	case <-engine.tomb.Dying():
		return errors.New("engine is shutting down")
	case engine.install <- installTicket{name, manifold, result}:
		// This is safe so long as the loop sends a result.
		return <-result
	}
}
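
// A hypothetical sketch (not part of the original file) of installing a
// manifold: "example" and "dependency-name" are placeholder names, and
// the StartFunc is supplied by the caller. Inputs declares which other
// manifolds' outputs the StartFunc may ask for.
func exampleInstall(engine *Engine, start StartFunc) error {
	return engine.Install("example", Manifold{
		Inputs: []string{"dependency-name"},
		Start:  start,
	})
}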

// gotInstall handles the params originally supplied to Install. It must only be
// called from the loop goroutine.
func (engine *Engine) gotInstall(name string, manifold Manifold) error {
	logger.Tracef("installing %q manifold...", name)
	if _, found := engine.manifolds[name]; found {
		return errors.Errorf("%q manifold already installed", name)
	}
	if err := engine.checkAcyclic(name, manifold); err != nil {
		return errors.Annotatef(err, "cannot install %q manifold", name)
	}
	engine.manifolds[name] = manifold
	for _, input := range manifold.Inputs {
		engine.dependents[input] = append(engine.dependents[input], name)
	}
	engine.current[name] = workerInfo{}
	engine.requestStart(name, 0)
	return nil
}

// uninstall removes the named manifold from the engine's records.
func (engine *Engine) uninstall(name string) {
	// Note that we *don't* want to remove dependents[name] -- all those other
	// manifolds do still depend on this, and another manifold with the same
	// name might be installed in the future -- but we do want to remove the
	// named manifold from all *values* in the dependents map.
	for dName, dependents := range engine.dependents {
		depSet := set.NewStrings(dependents...)
		depSet.Remove(name)
		engine.dependents[dName] = depSet.Values()
	}
	delete(engine.current, name)
	delete(engine.manifolds, name)
}

// checkAcyclic returns an error if the introduction of the supplied manifold
// would cause the dependency graph to contain cycles.
func (engine *Engine) checkAcyclic(name string, manifold Manifold) error {
	manifolds := Manifolds{name: manifold}
	for name, manifold := range engine.manifolds {
		manifolds[name] = manifold
	}
	return Validate(manifolds)
}

// requestStart invokes a runWorker goroutine for the manifold with the supplied
// name. It must only be called from the loop goroutine.
func (engine *Engine) requestStart(name string, delay time.Duration) {

	// Check preconditions.
	manifold, found := engine.manifolds[name]
	if !found {
		engine.tomb.Kill(errors.Errorf("fatal: unknown manifold %q", name))
	}

	// Copy current info and check more preconditions.
	info := engine.current[name]
	if !info.stopped() {
		engine.tomb.Kill(errors.Errorf("fatal: trying to start a second %q manifold worker", name))
	}

	// Final check that we're not shutting down yet...
	if engine.isDying() {
		logger.Tracef("not starting %q manifold worker (shutting down)", name)
		return
	}

	// ...then update the info, copy it back to the engine, and start a worker
	// goroutine based on current known state.
	info.starting = true
	info.abort = make(chan struct{})
	engine.current[name] = info
	context := engine.context(name, manifold.Inputs, info.abort)
	go engine.runWorker(name, delay, manifold.Start, context)
}

// context returns a context backed by a snapshot of current
// worker state, restricted to those workers declared in inputs. It must only
// be called from the loop goroutine; see inside for a detailed discussion of
// why we took this approach.
func (engine *Engine) context(name string, inputs []string, abort <-chan struct{}) *context {
	// We snapshot the resources available at invocation time, rather than adding an
	// additional communicate-resource-request channel. The latter approach is not
	// unreasonable... but is prone to inelegant scrambles when starting several
	// dependent workers at once. For example:
	//
	//  * Install manifold A; loop starts worker A
	//  * Install manifold B; loop starts worker B
	//  * A communicates its worker back to loop; main thread bounces B
	//  * B asks for A, gets A, doesn't react to bounce (*)
	//  * B communicates its worker back to loop; loop kills it immediately in
	//    response to earlier bounce
	//  * loop starts worker B again, now everything's fine; but, still, yuck.
	//    This is not a happy path to take by default.
	//
	// The problem, of course, is in the (*); the main thread *does* know that B
	// needs to bounce soon anyway, and it *could* communicate that fact back via
	// an error over a channel back into context.Get; the StartFunc could then
	// just return (say) ErrResourceChanged and avoid the hassle of creating
	// a worker. But that adds a whole layer of complexity (and unpredictability
	// in tests, which is not much fun) for very little benefit.
	//
	// In the analogous scenario with snapshotted dependencies, we see a happier
	// picture at startup time:
	//
	//  * Install manifold A; loop starts worker A
	//  * Install manifold B; loop starts worker B with empty resource snapshot
	//  * A communicates its worker back to loop; main thread bounces B
	//  * B's StartFunc asks for A, gets nothing, returns ErrMissing
	//  * loop restarts worker B with an up-to-date snapshot, B works fine
	//
	// We assume that, in the common case, most workers run without error most
	// of the time; and, thus, that the vast majority of worker startups will
	// happen as an agent starts. Furthermore, most of them will have simple
	// hard dependencies, and their Start funcs will be easy to write; the only
	// components that may be impacted by such a strategy will be those workers
	// which still want to run (with reduced functionality) with some dependency
	// unmet.
	//
	// Those may indeed suffer the occasional extra bounce as the system comes
	// to stability as it starts, or after a change; but workers *must* be
	// written for resilience in the face of arbitrary bounces *anyway*, so it
	// shouldn't be harmful.
	outputs := map[string]OutputFunc{}
	workers := map[string]worker.Worker{}
	for _, resourceName := range inputs {
		outputs[resourceName] = engine.manifolds[resourceName].Output
		workers[resourceName] = engine.current[resourceName].worker
	}
	return &context{
		clientName: name,
		abort:      abort,
		expired:    make(chan struct{}),
		workers:    workers,
		outputs:    outputs,
	}
}

// runWorker starts the supplied manifold's worker and communicates it back to the
// loop goroutine; waits for worker completion; and communicates any error encountered
// back to the loop goroutine. It must not be run on the loop goroutine.
func (engine *Engine) runWorker(name string, delay time.Duration, start StartFunc, context *context) {

	errAborted := errors.New("aborted before delay elapsed")

	startAfterDelay := func() (worker.Worker, error) {
		// NOTE: the context will expire *after* the worker is started.
		// This is tolerable because
		//  1) we'll still correctly block access attempts most of the time
		//  2) failing to block them won't cause data races anyway
		//  3) it's not worth complicating the interface for every client just
		//     to eliminate the possibility of one harmlessly dumb interaction.
		defer context.expire()
		logger.Tracef("starting %q manifold worker in %s...", name, delay)
		select {
		case <-engine.tomb.Dying():
			return nil, errAborted
		case <-context.Abort():
			return nil, errAborted
		// TODO(fwereade): 2016-03-17 lp:1558657
		case <-time.After(delay):
		}
		logger.Tracef("starting %q manifold worker", name)
		return start(context)
	}

	startWorkerAndWait := func() error {
		worker, err := startAfterDelay()
		switch errors.Cause(err) {
		case errAborted:
			return nil
		case nil:
			logger.Tracef("running %q manifold worker", name)
		default:
			logger.Tracef("failed to start %q manifold worker: %v", name, err)
			return err
		}
		select {
		case <-engine.tomb.Dying():
			logger.Tracef("stopping %q manifold worker (shutting down)", name)
			// Doesn't matter whether worker == engine: if we're already Dying
			// then cleanly Kill()ing ourselves again won't hurt anything.
			worker.Kill()
		case engine.started <- startedTicket{name, worker, context.accessLog}:
			logger.Tracef("registered %q manifold worker", name)
		}
		if worker == engine {
			// We mustn't Wait() for ourselves to complete here, or we'll
			// deadlock. But we should wait until we're Dying, because we
			// need this func to keep running to keep the self manifold
			// accessible as a resource.
			<-engine.tomb.Dying()
			return tomb.ErrDying
		}

		return worker.Wait()
	}

	// We may or may not send on started, but we *must* send on stopped.
	engine.stopped <- stoppedTicket{name, startWorkerAndWait(), context.accessLog}
}
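
// A hypothetical StartFunc sketch (not part of the original file)
// showing the snapshot protocol described in context() above. It takes
// the resource accessor as a plain function parameter because the real
// StartFunc signature is defined elsewhere in this package; all names
// here are illustrative placeholders.
func exampleStart(getResource func(name string, out interface{}) error) (worker.Worker, error) {
	// Ask the snapshot for a declared input. If it isn't available yet
	// -- typically ErrMissing -- return the error and trust the engine
	// to restart us once the dependency is up.
	var dep worker.Worker // stand-in for whatever type the input's OutputFunc fills in
	if err := getResource("example-dependency", &dep); err != nil {
		return nil, err
	}
	// ...construct and return the real worker here...
	return nil, errors.NotImplementedf("example worker")
}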

// gotStarted updates the engine to reflect the creation of a worker. It must
// only be called from the loop goroutine.
func (engine *Engine) gotStarted(name string, worker worker.Worker, resourceLog []resourceAccess) {
	// Copy current info; check preconditions and abort the worker if we've
	// already been asked to stop it.
	info := engine.current[name]
	switch {
	case info.worker != nil:
		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker start", name))
		fallthrough
	case info.stopping, engine.isDying():
		logger.Tracef("%q manifold worker no longer required", name)
		worker.Kill()
	default:
		// It's fine to use this worker; update info and copy back.
		logger.Debugf("%q manifold worker started", name)
		engine.current[name] = workerInfo{
			worker:      worker,
			resourceLog: resourceLog,
		}

		// Any manifold that declares this one as an input needs to be restarted.
		engine.bounceDependents(name)
	}
}

// gotStopped updates the engine to reflect the demise of (or failure to create)
// a worker. It must only be called from the loop goroutine.
func (engine *Engine) gotStopped(name string, err error, resourceLog []resourceAccess) {
	logger.Debugf("%q manifold worker stopped: %v", name, err)
	if filter := engine.manifolds[name].Filter; filter != nil {
		err = filter(err)
	}

	// Copy current info and check for reasons to stop the engine.
	info := engine.current[name]
	if info.stopped() {
		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker stop", name))
	} else if engine.config.IsFatal(err) {
		engine.worstError = engine.config.WorstError(err, engine.worstError)
		engine.tomb.Kill(nil)
	}

	// Reset engine info; and bail out if we can be sure there's no need to bounce.
	engine.current[name] = workerInfo{
		err:         err,
		resourceLog: resourceLog,
	}
	if engine.isDying() {
		logger.Tracef("permanently stopped %q manifold worker (shutting down)", name)
		return
	}

	// If we told the worker to stop, we should start it again immediately,
	// whatever else happened.
	if info.stopping {
		engine.requestStart(name, engine.config.BounceDelay)
	} else {
		// If we didn't stop it ourselves, we need to interpret the error.
		switch errors.Cause(err) {
		case nil:
			// Nothing went wrong; the task completed successfully. Nothing
			// needs to be done (unless the inputs change, in which case it
			// gets to check again).
		case ErrMissing:
			// The task can't even start with the current state. Nothing more
			// can be done (until the inputs change, in which case we retry
			// anyway).
		case ErrBounce:
			// The task exited but wanted to restart immediately.
			engine.requestStart(name, engine.config.BounceDelay)
		case ErrUninstall:
			// The task should never run again, and can be removed completely.
			engine.uninstall(name)
		default:
			// Something went wrong but we don't know what. Try again soon.
			logger.Errorf("%q manifold worker returned unexpected error: %v", name, err)
			engine.requestStart(name, engine.config.ErrorDelay)
		}
	}

	// Manifolds that declared a dependency on this one only need to be notified
	// if the worker has changed; if it was already nil, nobody needs to know.
	if info.worker != nil {
		engine.bounceDependents(name)
	}
}
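
// The switch above defines the engine's restart protocol from a worker's
// point of view: the error a worker's Wait() eventually returns selects
// what happens to its manifold next. As a hypothetical sketch (not part
// of the original file), a one-shot task could remove its own manifold
// for good by finishing with ErrUninstall:
type oneShotWorker struct {
	tomb tomb.Tomb
}

func newOneShotWorker(task func() error) *oneShotWorker {
	w := &oneShotWorker{}
	go func() {
		defer w.tomb.Done()
		if err := task(); err != nil {
			w.tomb.Kill(err) // unknown error: the engine retries after ErrorDelay
			return
		}
		// Done for good: the ErrUninstall case above removes the manifold.
		w.tomb.Kill(ErrUninstall)
	}()
	return w
}

func (w *oneShotWorker) Kill()       { w.tomb.Kill(nil) }
func (w *oneShotWorker) Wait() error { return w.tomb.Wait() }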

// requestStop ensures that any running or starting worker will be stopped in the
// near future. It must only be called from the loop goroutine.
func (engine *Engine) requestStop(name string) {

	// If already stopping or stopped, just don't do anything.
	info := engine.current[name]
	if info.stopping || info.stopped() {
		return
	}

	// Update info, kill worker if present, and copy info back to engine.
	info.stopping = true
	if info.abort != nil {
		close(info.abort)
		info.abort = nil
	}
	if info.worker != nil {
		info.worker.Kill()
	}
	engine.current[name] = info
}

// isDying returns true if the engine is shutting down. It's safe to call it
// from any goroutine.
func (engine *Engine) isDying() bool {
	select {
	case <-engine.tomb.Dying():
		return true
	default:
		return false
	}
}

// allOthersStopped returns true if no workers (other than the engine itself,
// if it happens to have been injected) are running or starting. It must only
// be called from the loop goroutine.
func (engine *Engine) allOthersStopped() bool {
	for _, info := range engine.current {
		if !info.stopped() && info.worker != engine {
			return false
		}
	}
	return true
}

// bounceDependents starts every stopped dependent of the named manifold, and
// stops every started one (and trusts the rest of the engine to restart them).
// It must only be called from the loop goroutine.
func (engine *Engine) bounceDependents(name string) {
	logger.Tracef("restarting dependents of %q manifold", name)
	for _, dependentName := range engine.dependents[name] {
		if engine.current[dependentName].stopped() {
			engine.requestStart(dependentName, engine.config.BounceDelay)
		} else {
			engine.requestStop(dependentName)
		}
	}
}

// workerInfo stores what an engine's loop goroutine needs to know about the
// worker for a given Manifold.
type workerInfo struct {
	starting    bool
	stopping    bool
	abort       chan struct{}
	worker      worker.Worker
	err         error
	resourceLog []resourceAccess
}

// stopped returns true unless the worker is either assigned or starting.
func (info workerInfo) stopped() bool {
	switch {
	case info.worker != nil:
		return false
	case info.starting:
		return false
	}
	return true
}

// state returns the latest known state of the worker, for use in reports.
func (info workerInfo) state() string {
	switch {
	case info.starting:
		return "starting"
	case info.stopping:
		return "stopping"
	case info.worker != nil:
		return "started"
	}
	return "stopped"
}

// report returns any available report from the worker. If the worker is not
// a Reporter, or is not present, this method will return nil.
func (info workerInfo) report() map[string]interface{} {
	if reporter, ok := info.worker.(Reporter); ok {
		return reporter.Report()
	}
	return nil
}

// installTicket is used by engine to induce installation of a named manifold
// and pass on any errors encountered in the process.
type installTicket struct {
	name     string
	manifold Manifold
	result   chan<- error
}

// startedTicket is used by engine to notify the loop of the creation of the
// worker for a particular manifold.
type startedTicket struct {
	name        string
	worker      worker.Worker
	resourceLog []resourceAccess
}

// stoppedTicket is used by engine to notify the loop of the demise of (or
// failure to create) the worker for a particular manifold.
type stoppedTicket struct {
	name        string
	error       error
	resourceLog []resourceAccess
}

// reportTicket is used by the engine to notify the loop that a status report
// should be generated.
type reportTicket struct {
	result chan map[string]interface{}
}
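
// A hypothetical sketch (not part of the original file) of opting a
// worker into the engine's reports: any worker whose type satisfies
// this package's Reporter interface -- as checked in report() above --
// contributes its own map under KeyReport in manifoldsReport. The type
// and the report key below are illustrative placeholders.
type reportingWorker struct {
	worker.Worker // the wrapped worker supplies Kill and Wait
	startedAt     time.Time
}

func (w *reportingWorker) Report() map[string]interface{} {
	return map[string]interface{}{
		"started-at": w.startedAt, // arbitrary illustrative key
	}
}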