github.com/cloud-green/juju@v0.0.0-20151002100041-a00291338d3d/worker/dependency/engine.go (about) 1 // Copyright 2015 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package dependency 5 6 import ( 7 "time" 8 9 "github.com/juju/errors" 10 "github.com/juju/loggo" 11 "launchpad.net/tomb" 12 13 "github.com/juju/juju/worker" 14 ) 15 16 var logger = loggo.GetLogger("juju.worker.dependency") 17 18 // EngineConfig defines the parameters needed to create a new engine. 19 type EngineConfig struct { 20 21 // IsFatal returns true when passed an error that should stop the engine. 22 // It must not be nil. 23 IsFatal IsFatalFunc 24 25 // WorstError returns the more important of two fatal errors passed to it, 26 // and is used to determine which fatal error to report when there's more 27 // than one. It must not be nil. 28 WorstError WorstErrorFunc 29 30 // ErrorDelay controls how long the engine waits before restarting a worker 31 // that encountered an unknown error. It must not be negative. 32 ErrorDelay time.Duration 33 34 // BounceDelay controls how long the engine waits before restarting a worker 35 // that was deliberately shut down because its dependencies changed. It must 36 // not be negative. 37 BounceDelay time.Duration 38 } 39 40 // Validate returns an error if any field is invalid. 41 func (config *EngineConfig) Validate() error { 42 if config.IsFatal == nil { 43 return errors.New("IsFatal not specified") 44 } 45 if config.WorstError == nil { 46 return errors.New("WorstError not specified") 47 } 48 if config.ErrorDelay < 0 { 49 return errors.New("ErrorDelay is negative") 50 } 51 if config.BounceDelay < 0 { 52 return errors.New("BounceDelay is negative") 53 } 54 return nil 55 } 56 57 // NewEngine returns an Engine that will maintain any installed Manifolds until 58 // either the engine is stopped or one of the manifolds' workers returns an error 59 // that satisfies isFatal. The caller takes responsibility for the returned Engine: 60 // it's responsible for Kill()ing the Engine when no longer used, and must handle 61 // any error from Wait(). 62 func NewEngine(config EngineConfig) (Engine, error) { 63 if err := config.Validate(); err != nil { 64 return nil, errors.Annotatef(err, "invalid config") 65 } 66 engine := &engine{ 67 config: config, 68 69 manifolds: Manifolds{}, 70 dependents: map[string][]string{}, 71 current: map[string]workerInfo{}, 72 73 install: make(chan installTicket), 74 started: make(chan startedTicket), 75 stopped: make(chan stoppedTicket), 76 report: make(chan reportTicket), 77 } 78 go func() { 79 defer engine.tomb.Done() 80 engine.tomb.Kill(engine.loop()) 81 }() 82 return engine, nil 83 } 84 85 // engine maintains workers corresponding to its installed manifolds, and 86 // restarts them whenever their inputs change. 87 type engine struct { 88 89 // config contains values passed in as config when the engine was created. 90 config EngineConfig 91 92 // As usual, we use tomb.Tomb to track the lifecycle and error state of the 93 // engine worker itself; but we *only* report *internal* errors via the tomb. 94 // Fatal errors received from workers are *not* used to kill the tomb; they 95 // are tracked separately, and will only be exposed to the client when the 96 // engine's tomb has completed its job and encountered no errors. 97 tomb tomb.Tomb 98 99 // worstError is used to track the most important fatal error we've received 100 // from any manifold. This should be the only place fatal errors are stored; 101 // they must *not* be passed into the tomb. 102 worstError error 103 104 // manifolds holds the installed manifolds by name. 105 manifolds Manifolds 106 107 // dependents holds, for each named manifold, those that depend on it. 108 dependents map[string][]string 109 110 // current holds the active worker information for each installed manifold. 111 current map[string]workerInfo 112 113 // install, started, report and stopped each communicate requests and changes into 114 // the loop goroutine. 115 install chan installTicket 116 started chan startedTicket 117 stopped chan stoppedTicket 118 report chan reportTicket 119 } 120 121 // loop serializes manifold install operations and worker start/stop notifications. 122 // It's notable for its oneShotDying var, which is necessary because any number of 123 // start/stop notification could be in flight at the point the engine needs to stop; 124 // we need to handle all those, and any subsequent messages, until the main loop is 125 // confident that every worker has stopped. (The usual pattern -- to defer a cleanup 126 // method to run before tomb.Done in NewEngine -- is not cleanly applicable, because 127 // it needs to duplicate that start/stop message handling; better to localise that 128 // in this method.) 129 func (engine *engine) loop() error { 130 oneShotDying := engine.tomb.Dying() 131 for { 132 select { 133 case <-oneShotDying: 134 oneShotDying = nil 135 for name := range engine.current { 136 engine.requestStop(name) 137 } 138 case ticket := <-engine.report: 139 // This is safe so long as the Report method reads the result. 140 ticket.result <- engine.liveReport() 141 case ticket := <-engine.install: 142 // This is safe so long as the Install method reads the result. 143 ticket.result <- engine.gotInstall(ticket.name, ticket.manifold) 144 case ticket := <-engine.started: 145 engine.gotStarted(ticket.name, ticket.worker, ticket.resourceLog) 146 case ticket := <-engine.stopped: 147 engine.gotStopped(ticket.name, ticket.error, ticket.resourceLog) 148 } 149 if engine.isDying() { 150 if engine.allStopped() { 151 return tomb.ErrDying 152 } 153 } 154 } 155 } 156 157 // Kill is part of the worker.Worker interface. 158 func (engine *engine) Kill() { 159 engine.tomb.Kill(nil) 160 } 161 162 // Wait is part of the worker.Worker interface. 163 func (engine *engine) Wait() error { 164 if tombError := engine.tomb.Wait(); tombError != nil { 165 return tombError 166 } 167 return engine.worstError 168 } 169 170 // Report is part of the Reporter interface. 171 func (engine *engine) Report() map[string]interface{} { 172 report := make(chan map[string]interface{}) 173 select { 174 case engine.report <- reportTicket{report}: 175 // This is safe so long as the loop sends a result. 176 return <-report 177 case <-engine.tomb.Dead(): 178 // Note that we don't abort on Dying as we usually would; the 179 // oneShotDying approach in loop means that it can continue to 180 // process requests until the last possible moment. Only once 181 // loop has exited do we fall back to this report. 182 return map[string]interface{}{ 183 KeyState: "stopped", 184 KeyError: engine.Wait(), 185 KeyManifolds: engine.manifoldsReport(), 186 } 187 } 188 } 189 190 // liveReport collects and returns information about the engine, its manifolds, 191 // and their workers. It must only be called from the loop goroutine. 192 func (engine *engine) liveReport() map[string]interface{} { 193 var reportError error 194 state := "started" 195 if engine.isDying() { 196 state = "stopping" 197 if tombError := engine.tomb.Err(); tombError != nil { 198 reportError = tombError 199 } else { 200 reportError = engine.worstError 201 } 202 } 203 return map[string]interface{}{ 204 KeyState: state, 205 KeyError: reportError, 206 KeyManifolds: engine.manifoldsReport(), 207 } 208 } 209 210 // manifoldsReport collects and returns information about the engine's manifolds 211 // and their workers. Until the tomb is Dead, it should only be called from the 212 // loop goroutine; after that, it's goroutine-safe. 213 func (engine *engine) manifoldsReport() map[string]interface{} { 214 manifolds := map[string]interface{}{} 215 for name, info := range engine.current { 216 manifolds[name] = map[string]interface{}{ 217 KeyState: info.state(), 218 KeyError: info.err, 219 KeyInputs: engine.manifolds[name].Inputs, 220 KeyReport: info.report(), 221 KeyResourceLog: resourceLogReport(info.resourceLog), 222 } 223 } 224 return manifolds 225 } 226 227 // Install is part of the Engine interface. 228 func (engine *engine) Install(name string, manifold Manifold) error { 229 result := make(chan error) 230 select { 231 case <-engine.tomb.Dying(): 232 return errors.New("engine is shutting down") 233 case engine.install <- installTicket{name, manifold, result}: 234 // This is safe so long as the loop sends a result. 235 return <-result 236 } 237 } 238 239 // gotInstall handles the params originally supplied to Install. It must only be 240 // called from the loop goroutine. 241 func (engine *engine) gotInstall(name string, manifold Manifold) error { 242 logger.Tracef("installing %q manifold...", name) 243 if _, found := engine.manifolds[name]; found { 244 return errors.Errorf("%q manifold already installed", name) 245 } 246 if err := engine.checkAcyclic(name, manifold); err != nil { 247 return errors.Annotatef(err, "cannot install %q manifold", name) 248 } 249 engine.manifolds[name] = manifold 250 for _, input := range manifold.Inputs { 251 engine.dependents[input] = append(engine.dependents[input], name) 252 } 253 engine.current[name] = workerInfo{} 254 engine.requestStart(name, 0) 255 return nil 256 } 257 258 // checkAcyclic returns an error if the introduction of the supplied manifold 259 // would cause the dependency graph to contain cycles. 260 func (engine *engine) checkAcyclic(name string, manifold Manifold) error { 261 manifolds := Manifolds{name: manifold} 262 for name, manifold := range engine.manifolds { 263 manifolds[name] = manifold 264 } 265 return Validate(manifolds) 266 } 267 268 // requestStart invokes a runWorker goroutine for the manifold with the supplied 269 // name. It must only be called from the loop goroutine. 270 func (engine *engine) requestStart(name string, delay time.Duration) { 271 272 // Check preconditions. 273 manifold, found := engine.manifolds[name] 274 if !found { 275 engine.tomb.Kill(errors.Errorf("fatal: unknown manifold %q", name)) 276 } 277 278 // Copy current info and check more preconditions. 279 info := engine.current[name] 280 if !info.stopped() { 281 engine.tomb.Kill(errors.Errorf("fatal: trying to start a second %q manifold worker", name)) 282 } 283 284 // Final check that we're not shutting down yet... 285 if engine.isDying() { 286 logger.Tracef("not starting %q manifold worker (shutting down)", name) 287 return 288 } 289 290 // ...then update the info, copy it back to the engine, and start a worker 291 // goroutine based on current known state. 292 info.starting = true 293 engine.current[name] = info 294 resourceGetter := engine.resourceGetter(name, manifold.Inputs) 295 go engine.runWorker(name, delay, manifold.Start, resourceGetter) 296 } 297 298 // resourceGetter returns a resourceGetter backed by a snapshot of current 299 // worker state, restricted to those workers declared in inputs. It must only 300 // be called from the loop goroutine; see inside for a detailed dicsussion of 301 // why we took this appproach. 302 func (engine *engine) resourceGetter(name string, inputs []string) *resourceGetter { 303 // We snapshot the resources available at invocation time, rather than adding an 304 // additional communicate-resource-request channel. The latter approach is not 305 // unreasonable... but is prone to inelegant scrambles when starting several 306 // dependent workers at once. For example: 307 // 308 // * Install manifold A; loop starts worker A 309 // * Install manifold B; loop starts worker B 310 // * A communicates its worker back to loop; main thread bounces B 311 // * B asks for A, gets A, doesn't react to bounce (*) 312 // * B communicates its worker back to loop; loop kills it immediately in 313 // response to earlier bounce 314 // * loop starts worker B again, now everything's fine; but, still, yuck. 315 // This is not a happy path to take by default. 316 // 317 // The problem, of course, is in the (*); the main thread *does* know that B 318 // needs to bounce soon anyway, and it *could* communicate that fact back via 319 // an error over a channel back into getResource; the StartFunc could then 320 // just return (say) that ErrResourceChanged and avoid the hassle of creating 321 // a worker. But that adds a whole layer of complexity (and unpredictability 322 // in tests, which is not much fun) for very little benefit. 323 // 324 // In the analogous scenario with snapshotted dependencies, we see a happier 325 // picture at startup time: 326 // 327 // * Install manifold A; loop starts worker A 328 // * Install manifold B; loop starts worker B with empty resource snapshot 329 // * A communicates its worker back to loop; main thread bounces B 330 // * B's StartFunc asks for A, gets nothing, returns ErrMissing 331 // * loop restarts worker B with an up-to-date snapshot, B works fine 332 // 333 // We assume that, in the common case, most workers run without error most 334 // of the time; and, thus, that the vast majority of worker startups will 335 // happen as an agent starts. Furthermore, most of them will have simple 336 // hard dependencies, and their Start funcs will be easy to write; the only 337 // components that may be impacted by such a strategy will be those workers 338 // which still want to run (with reduced functionality) with some dependency 339 // unmet. 340 // 341 // Those may indeed suffer the occasional extra bounce as the system comes 342 // to stability as it starts, or after a change; but workers *must* be 343 // written for resilience in the face of arbitrary bounces *anyway*, so it 344 // shouldn't be harmful. 345 outputs := map[string]OutputFunc{} 346 workers := map[string]worker.Worker{} 347 for _, resourceName := range inputs { 348 outputs[resourceName] = engine.manifolds[resourceName].Output 349 workers[resourceName] = engine.current[resourceName].worker 350 } 351 return &resourceGetter{ 352 clientName: name, 353 expired: make(chan struct{}), 354 workers: workers, 355 outputs: outputs, 356 } 357 } 358 359 // runWorker starts the supplied manifold's worker and communicates it back to the 360 // loop goroutine; waits for worker completion; and communicates any error encountered 361 // back to the loop goroutine. It must not be run on the loop goroutine. 362 func (engine *engine) runWorker(name string, delay time.Duration, start StartFunc, resourceGetter *resourceGetter) { 363 364 errAborted := errors.New("aborted before delay elapsed") 365 366 startAfterDelay := func() (worker.Worker, error) { 367 // NOTE: the resourceGetter will expire *after* the worker is started. 368 // This is tolerable because 369 // 1) we'll still correctly block access attempts most of the time 370 // 2) failing to block them won't cause data races anyway 371 // 3) it's not worth complicating the interface for every client just 372 // to eliminate the possibility of one harmlessly dumb interaction. 373 defer resourceGetter.expire() 374 logger.Tracef("starting %q manifold worker in %s...", name, delay) 375 select { 376 case <-time.After(delay): 377 case <-engine.tomb.Dying(): 378 return nil, errAborted 379 } 380 logger.Tracef("starting %q manifold worker", name) 381 return start(resourceGetter.getResource) 382 } 383 384 startWorkerAndWait := func() error { 385 worker, err := startAfterDelay() 386 switch errors.Cause(err) { 387 case errAborted: 388 return nil 389 case nil: 390 logger.Tracef("running %q manifold worker", name) 391 default: 392 logger.Tracef("failed to start %q manifold worker: %v", name, err) 393 return err 394 } 395 select { 396 case <-engine.tomb.Dying(): 397 logger.Tracef("stopping %q manifold worker (shutting down)", name) 398 worker.Kill() 399 case engine.started <- startedTicket{name, worker, resourceGetter.accessLog}: 400 logger.Tracef("registered %q manifold worker", name) 401 } 402 return worker.Wait() 403 } 404 405 // We may or may not send on started, but we *must* send on stopped. 406 engine.stopped <- stoppedTicket{name, startWorkerAndWait(), resourceGetter.accessLog} 407 } 408 409 // gotStarted updates the engine to reflect the creation of a worker. It must 410 // only be called from the loop goroutine. 411 func (engine *engine) gotStarted(name string, worker worker.Worker, resourceLog []resourceAccess) { 412 // Copy current info; check preconditions and abort the workers if we've 413 // already been asked to stop it. 414 info := engine.current[name] 415 switch { 416 case info.worker != nil: 417 engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker start", name)) 418 fallthrough 419 case info.stopping, engine.isDying(): 420 logger.Tracef("%q manifold worker no longer required", name) 421 worker.Kill() 422 default: 423 // It's fine to use this worker; update info and copy back. 424 logger.Tracef("%q manifold worker started", name) 425 engine.current[name] = workerInfo{ 426 worker: worker, 427 resourceLog: resourceLog, 428 } 429 430 // Any manifold that declares this one as an input needs to be restarted. 431 engine.bounceDependents(name) 432 } 433 } 434 435 // gotStopped updates the engine to reflect the demise of (or failure to create) 436 // a worker. It must only be called from the loop goroutine. 437 func (engine *engine) gotStopped(name string, err error, resourceLog []resourceAccess) { 438 logger.Tracef("%q manifold worker stopped: %v", name, err) 439 440 // Copy current info and check for reasons to stop the engine. 441 info := engine.current[name] 442 if info.stopped() { 443 engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker stop", name)) 444 } else if engine.config.IsFatal(err) { 445 engine.worstError = engine.config.WorstError(err, engine.worstError) 446 engine.tomb.Kill(nil) 447 } 448 449 // Reset engine info; and bail out if we can be sure there's no need to bounce. 450 engine.current[name] = workerInfo{ 451 err: err, 452 resourceLog: resourceLog, 453 } 454 if engine.isDying() { 455 logger.Tracef("permanently stopped %q manifold worker (shutting down)", name) 456 return 457 } 458 459 // If we told the worker to stop, we should start it again immediately, 460 // whatever else happened. 461 if info.stopping { 462 engine.requestStart(name, engine.config.BounceDelay) 463 } else { 464 // If we didn't stop it ourselves, we need to interpret the error. 465 switch errors.Cause(err) { 466 case nil: 467 // Nothing went wrong; the task completed successfully. Nothing 468 // needs to be done (unless the inputs change, in which case it 469 // gets to check again). 470 case ErrMissing: 471 // The task can't even start with the current state. Nothing more 472 // can be done (until the inputs change, in which case we retry 473 // anyway). 474 default: 475 // Something went wrong but we don't know what. Try again soon. 476 logger.Errorf("%q manifold worker returned unexpected error: %v", name, err) 477 engine.requestStart(name, engine.config.ErrorDelay) 478 } 479 } 480 481 // Manifolds that declared a dependency on this one only need to be notified 482 // if the worker has changed; if it was already nil, nobody needs to know. 483 if info.worker != nil { 484 engine.bounceDependents(name) 485 } 486 } 487 488 // requestStop ensures that any running or starting worker will be stopped in the 489 // near future. It must only be called from the loop goroutine. 490 func (engine *engine) requestStop(name string) { 491 492 // If already stopping or stopped, just don't do anything. 493 info := engine.current[name] 494 if info.stopping || info.stopped() { 495 return 496 } 497 498 // Update info, kill worker if present, and copy info back to engine. 499 info.stopping = true 500 if info.worker != nil { 501 info.worker.Kill() 502 } 503 engine.current[name] = info 504 } 505 506 // isDying returns true if the engine is shutting down. It's safe to call it 507 // from any goroutine. 508 func (engine *engine) isDying() bool { 509 select { 510 case <-engine.tomb.Dying(): 511 return true 512 default: 513 return false 514 } 515 } 516 517 // allStopped returns true if no workers are running or starting. It must only 518 // be called from the loop goroutine. 519 func (engine *engine) allStopped() bool { 520 for _, info := range engine.current { 521 if !info.stopped() { 522 return false 523 } 524 } 525 return true 526 } 527 528 // bounceDependents starts every stopped dependent of the named manifold, and 529 // stops every started one (and trusts the rest of the engine to restart them). 530 // It must only be called from the loop goroutine. 531 func (engine *engine) bounceDependents(name string) { 532 logger.Tracef("restarting dependents of %q manifold", name) 533 for _, dependentName := range engine.dependents[name] { 534 if engine.current[dependentName].stopped() { 535 engine.requestStart(dependentName, engine.config.BounceDelay) 536 } else { 537 engine.requestStop(dependentName) 538 } 539 } 540 } 541 542 // workerInfo stores what an engine's loop goroutine needs to know about the 543 // worker for a given Manifold. 544 type workerInfo struct { 545 starting bool 546 stopping bool 547 worker worker.Worker 548 err error 549 resourceLog []resourceAccess 550 } 551 552 // stopped returns true unless the worker is either assigned or starting. 553 func (info workerInfo) stopped() bool { 554 switch { 555 case info.worker != nil: 556 return false 557 case info.starting: 558 return false 559 } 560 return true 561 } 562 563 // state returns the latest known state of the worker, for use in reports. 564 func (info workerInfo) state() string { 565 switch { 566 case info.starting: 567 return "starting" 568 case info.stopping: 569 return "stopping" 570 case info.worker != nil: 571 return "started" 572 } 573 return "stopped" 574 } 575 576 // report returns any available report from the worker. If the worker is not 577 // a Reporter, or is not present, this method will return nil. 578 func (info workerInfo) report() map[string]interface{} { 579 if reporter, ok := info.worker.(Reporter); ok { 580 return reporter.Report() 581 } 582 return nil 583 } 584 585 // installTicket is used by engine to induce installation of a named manifold 586 // and pass on any errors encountered in the process. 587 type installTicket struct { 588 name string 589 manifold Manifold 590 result chan<- error 591 } 592 593 // startedTicket is used by engine to notify the loop of the creation of the 594 // worker for a particular manifold. 595 type startedTicket struct { 596 name string 597 worker worker.Worker 598 resourceLog []resourceAccess 599 } 600 601 // stoppedTicket is used by engine to notify the loop of the demise of (or 602 // failure to create) the worker for a particular manifold. 603 type stoppedTicket struct { 604 name string 605 error error 606 resourceLog []resourceAccess 607 } 608 609 // reportTicket is used by the engine to notify the loop that a status report 610 // should be generated. 611 type reportTicket struct { 612 result chan map[string]interface{} 613 }