github.com/cloud-green/juju@v0.0.0-20151002100041-a00291338d3d/worker/dependency/engine.go (about)

     1  // Copyright 2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package dependency
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/errors"
    10  	"github.com/juju/loggo"
    11  	"launchpad.net/tomb"
    12  
    13  	"github.com/juju/juju/worker"
    14  )
    15  
// logger is the package-level logger, registered with loggo under the
// "juju.worker.dependency" name.
var logger = loggo.GetLogger("juju.worker.dependency")
    17  
// EngineConfig defines the parameters needed to create a new engine. Use
// Validate to check that a config is acceptable; NewEngine refuses configs
// that fail validation.
type EngineConfig struct {

	// IsFatal returns true when passed an error that should stop the engine.
	// It must not be nil.
	IsFatal IsFatalFunc

	// WorstError returns the more important of two fatal errors passed to it,
	// and is used to determine which fatal error to report when there's more
	// than one. It must not be nil.
	WorstError WorstErrorFunc

	// ErrorDelay controls how long the engine waits before restarting a worker
	// that encountered an unknown error. It must not be negative.
	ErrorDelay time.Duration

	// BounceDelay controls how long the engine waits before restarting a worker
	// that was deliberately shut down because its dependencies changed. It must
	// not be negative.
	BounceDelay time.Duration
}
    39  
    40  // Validate returns an error if any field is invalid.
    41  func (config *EngineConfig) Validate() error {
    42  	if config.IsFatal == nil {
    43  		return errors.New("IsFatal not specified")
    44  	}
    45  	if config.WorstError == nil {
    46  		return errors.New("WorstError not specified")
    47  	}
    48  	if config.ErrorDelay < 0 {
    49  		return errors.New("ErrorDelay is negative")
    50  	}
    51  	if config.BounceDelay < 0 {
    52  		return errors.New("BounceDelay is negative")
    53  	}
    54  	return nil
    55  }
    56  
    57  // NewEngine returns an Engine that will maintain any installed Manifolds until
    58  // either the engine is stopped or one of the manifolds' workers returns an error
    59  // that satisfies isFatal. The caller takes responsibility for the returned Engine:
    60  // it's responsible for Kill()ing the Engine when no longer used, and must handle
    61  // any error from Wait().
    62  func NewEngine(config EngineConfig) (Engine, error) {
    63  	if err := config.Validate(); err != nil {
    64  		return nil, errors.Annotatef(err, "invalid config")
    65  	}
    66  	engine := &engine{
    67  		config: config,
    68  
    69  		manifolds:  Manifolds{},
    70  		dependents: map[string][]string{},
    71  		current:    map[string]workerInfo{},
    72  
    73  		install: make(chan installTicket),
    74  		started: make(chan startedTicket),
    75  		stopped: make(chan stoppedTicket),
    76  		report:  make(chan reportTicket),
    77  	}
    78  	go func() {
    79  		defer engine.tomb.Done()
    80  		engine.tomb.Kill(engine.loop())
    81  	}()
    82  	return engine, nil
    83  }
    84  
// engine maintains workers corresponding to its installed manifolds, and
// restarts them whenever their inputs change. Instances are created by
// NewEngine, which also starts the loop goroutine that owns the mutable
// fields below.
type engine struct {

	// config contains values passed in as config when the engine was created.
	config EngineConfig

	// As usual, we use tomb.Tomb to track the lifecycle and error state of the
	// engine worker itself; but we *only* report *internal* errors via the tomb.
	// Fatal errors received from workers are *not* used to kill the tomb; they
	// are tracked separately, and will only be exposed to the client when the
	// engine's tomb has completed its job and encountered no errors.
	tomb tomb.Tomb

	// worstError is used to track the most important fatal error we've received
	// from any manifold. This should be the only place fatal errors are stored;
	// they must *not* be passed into the tomb.
	worstError error

	// manifolds holds the installed manifolds by name.
	manifolds Manifolds

	// dependents holds, for each named manifold, those that depend on it.
	dependents map[string][]string

	// current holds the active worker information for each installed manifold.
	current map[string]workerInfo

	// install, started, report and stopped each communicate requests and changes into
	// the loop goroutine. NewEngine creates all four as unbuffered channels.
	install chan installTicket
	started chan startedTicket
	stopped chan stoppedTicket
	report  chan reportTicket
}
   120  
   121  // loop serializes manifold install operations and worker start/stop notifications.
   122  // It's notable for its oneShotDying var, which is necessary because any number of
   123  // start/stop notification could be in flight at the point the engine needs to stop;
   124  // we need to handle all those, and any subsequent messages, until the main loop is
   125  // confident that every worker has stopped. (The usual pattern -- to defer a cleanup
   126  // method to run before tomb.Done in NewEngine -- is not cleanly applicable, because
   127  // it needs to duplicate that start/stop message handling; better to localise that
   128  // in this method.)
   129  func (engine *engine) loop() error {
   130  	oneShotDying := engine.tomb.Dying()
   131  	for {
   132  		select {
   133  		case <-oneShotDying:
   134  			oneShotDying = nil
   135  			for name := range engine.current {
   136  				engine.requestStop(name)
   137  			}
   138  		case ticket := <-engine.report:
   139  			// This is safe so long as the Report method reads the result.
   140  			ticket.result <- engine.liveReport()
   141  		case ticket := <-engine.install:
   142  			// This is safe so long as the Install method reads the result.
   143  			ticket.result <- engine.gotInstall(ticket.name, ticket.manifold)
   144  		case ticket := <-engine.started:
   145  			engine.gotStarted(ticket.name, ticket.worker, ticket.resourceLog)
   146  		case ticket := <-engine.stopped:
   147  			engine.gotStopped(ticket.name, ticket.error, ticket.resourceLog)
   148  		}
   149  		if engine.isDying() {
   150  			if engine.allStopped() {
   151  				return tomb.ErrDying
   152  			}
   153  		}
   154  	}
   155  }
   156  
// Kill is part of the worker.Worker interface. It kills the engine's tomb
// with a nil reason, which asks the loop goroutine to stop every worker and
// then exit; it does not wait for that to happen (see Wait).
func (engine *engine) Kill() {
	engine.tomb.Kill(nil)
}
   161  
   162  // Wait is part of the worker.Worker interface.
   163  func (engine *engine) Wait() error {
   164  	if tombError := engine.tomb.Wait(); tombError != nil {
   165  		return tombError
   166  	}
   167  	return engine.worstError
   168  }
   169  
   170  // Report is part of the Reporter interface.
   171  func (engine *engine) Report() map[string]interface{} {
   172  	report := make(chan map[string]interface{})
   173  	select {
   174  	case engine.report <- reportTicket{report}:
   175  		// This is safe so long as the loop sends a result.
   176  		return <-report
   177  	case <-engine.tomb.Dead():
   178  		// Note that we don't abort on Dying as we usually would; the
   179  		// oneShotDying approach in loop means that it can continue to
   180  		// process requests until the last possible moment. Only once
   181  		// loop has exited do we fall back to this report.
   182  		return map[string]interface{}{
   183  			KeyState:     "stopped",
   184  			KeyError:     engine.Wait(),
   185  			KeyManifolds: engine.manifoldsReport(),
   186  		}
   187  	}
   188  }
   189  
   190  // liveReport collects and returns information about the engine, its manifolds,
   191  // and their workers. It must only be called from the loop goroutine.
   192  func (engine *engine) liveReport() map[string]interface{} {
   193  	var reportError error
   194  	state := "started"
   195  	if engine.isDying() {
   196  		state = "stopping"
   197  		if tombError := engine.tomb.Err(); tombError != nil {
   198  			reportError = tombError
   199  		} else {
   200  			reportError = engine.worstError
   201  		}
   202  	}
   203  	return map[string]interface{}{
   204  		KeyState:     state,
   205  		KeyError:     reportError,
   206  		KeyManifolds: engine.manifoldsReport(),
   207  	}
   208  }
   209  
   210  // manifoldsReport collects and returns information about the engine's manifolds
   211  // and their workers. Until the tomb is Dead, it should only be called from the
   212  // loop goroutine; after that, it's goroutine-safe.
   213  func (engine *engine) manifoldsReport() map[string]interface{} {
   214  	manifolds := map[string]interface{}{}
   215  	for name, info := range engine.current {
   216  		manifolds[name] = map[string]interface{}{
   217  			KeyState:       info.state(),
   218  			KeyError:       info.err,
   219  			KeyInputs:      engine.manifolds[name].Inputs,
   220  			KeyReport:      info.report(),
   221  			KeyResourceLog: resourceLogReport(info.resourceLog),
   222  		}
   223  	}
   224  	return manifolds
   225  }
   226  
   227  // Install is part of the Engine interface.
   228  func (engine *engine) Install(name string, manifold Manifold) error {
   229  	result := make(chan error)
   230  	select {
   231  	case <-engine.tomb.Dying():
   232  		return errors.New("engine is shutting down")
   233  	case engine.install <- installTicket{name, manifold, result}:
   234  		// This is safe so long as the loop sends a result.
   235  		return <-result
   236  	}
   237  }
   238  
   239  // gotInstall handles the params originally supplied to Install. It must only be
   240  // called from the loop goroutine.
   241  func (engine *engine) gotInstall(name string, manifold Manifold) error {
   242  	logger.Tracef("installing %q manifold...", name)
   243  	if _, found := engine.manifolds[name]; found {
   244  		return errors.Errorf("%q manifold already installed", name)
   245  	}
   246  	if err := engine.checkAcyclic(name, manifold); err != nil {
   247  		return errors.Annotatef(err, "cannot install %q manifold", name)
   248  	}
   249  	engine.manifolds[name] = manifold
   250  	for _, input := range manifold.Inputs {
   251  		engine.dependents[input] = append(engine.dependents[input], name)
   252  	}
   253  	engine.current[name] = workerInfo{}
   254  	engine.requestStart(name, 0)
   255  	return nil
   256  }
   257  
   258  // checkAcyclic returns an error if the introduction of the supplied manifold
   259  // would cause the dependency graph to contain cycles.
   260  func (engine *engine) checkAcyclic(name string, manifold Manifold) error {
   261  	manifolds := Manifolds{name: manifold}
   262  	for name, manifold := range engine.manifolds {
   263  		manifolds[name] = manifold
   264  	}
   265  	return Validate(manifolds)
   266  }
   267  
   268  // requestStart invokes a runWorker goroutine for the manifold with the supplied
   269  // name. It must only be called from the loop goroutine.
   270  func (engine *engine) requestStart(name string, delay time.Duration) {
   271  
   272  	// Check preconditions.
   273  	manifold, found := engine.manifolds[name]
   274  	if !found {
   275  		engine.tomb.Kill(errors.Errorf("fatal: unknown manifold %q", name))
   276  	}
   277  
   278  	// Copy current info and check more preconditions.
   279  	info := engine.current[name]
   280  	if !info.stopped() {
   281  		engine.tomb.Kill(errors.Errorf("fatal: trying to start a second %q manifold worker", name))
   282  	}
   283  
   284  	// Final check that we're not shutting down yet...
   285  	if engine.isDying() {
   286  		logger.Tracef("not starting %q manifold worker (shutting down)", name)
   287  		return
   288  	}
   289  
   290  	// ...then update the info, copy it back to the engine, and start a worker
   291  	// goroutine based on current known state.
   292  	info.starting = true
   293  	engine.current[name] = info
   294  	resourceGetter := engine.resourceGetter(name, manifold.Inputs)
   295  	go engine.runWorker(name, delay, manifold.Start, resourceGetter)
   296  }
   297  
   298  // resourceGetter returns a resourceGetter backed by a snapshot of current
   299  // worker state, restricted to those workers declared in inputs. It must only
   300  // be called from the loop goroutine; see inside for a detailed dicsussion of
   301  // why we took this appproach.
   302  func (engine *engine) resourceGetter(name string, inputs []string) *resourceGetter {
   303  	// We snapshot the resources available at invocation time, rather than adding an
   304  	// additional communicate-resource-request channel. The latter approach is not
   305  	// unreasonable... but is prone to inelegant scrambles when starting several
   306  	// dependent workers at once. For example:
   307  	//
   308  	//  * Install manifold A; loop starts worker A
   309  	//  * Install manifold B; loop starts worker B
   310  	//  * A communicates its worker back to loop; main thread bounces B
   311  	//  * B asks for A, gets A, doesn't react to bounce (*)
   312  	//  * B communicates its worker back to loop; loop kills it immediately in
   313  	//    response to earlier bounce
   314  	//  * loop starts worker B again, now everything's fine; but, still, yuck.
   315  	//    This is not a happy path to take by default.
   316  	//
   317  	// The problem, of course, is in the (*); the main thread *does* know that B
   318  	// needs to bounce soon anyway, and it *could* communicate that fact back via
   319  	// an error over a channel back into getResource; the StartFunc could then
   320  	// just return (say) that ErrResourceChanged and avoid the hassle of creating
   321  	// a worker. But that adds a whole layer of complexity (and unpredictability
   322  	// in tests, which is not much fun) for very little benefit.
   323  	//
   324  	// In the analogous scenario with snapshotted dependencies, we see a happier
   325  	// picture at startup time:
   326  	//
   327  	//  * Install manifold A; loop starts worker A
   328  	//  * Install manifold B; loop starts worker B with empty resource snapshot
   329  	//  * A communicates its worker back to loop; main thread bounces B
   330  	//  * B's StartFunc asks for A, gets nothing, returns ErrMissing
   331  	//  * loop restarts worker B with an up-to-date snapshot, B works fine
   332  	//
   333  	// We assume that, in the common case, most workers run without error most
   334  	// of the time; and, thus, that the vast majority of worker startups will
   335  	// happen as an agent starts. Furthermore, most of them will have simple
   336  	// hard dependencies, and their Start funcs will be easy to write; the only
   337  	// components that may be impacted by such a strategy will be those workers
   338  	// which still want to run (with reduced functionality) with some dependency
   339  	// unmet.
   340  	//
   341  	// Those may indeed suffer the occasional extra bounce as the system comes
   342  	// to stability as it starts, or after a change; but workers *must* be
   343  	// written for resilience in the face of arbitrary bounces *anyway*, so it
   344  	// shouldn't be harmful.
   345  	outputs := map[string]OutputFunc{}
   346  	workers := map[string]worker.Worker{}
   347  	for _, resourceName := range inputs {
   348  		outputs[resourceName] = engine.manifolds[resourceName].Output
   349  		workers[resourceName] = engine.current[resourceName].worker
   350  	}
   351  	return &resourceGetter{
   352  		clientName: name,
   353  		expired:    make(chan struct{}),
   354  		workers:    workers,
   355  		outputs:    outputs,
   356  	}
   357  }
   358  
   359  // runWorker starts the supplied manifold's worker and communicates it back to the
   360  // loop goroutine; waits for worker completion; and communicates any error encountered
   361  // back to the loop goroutine. It must not be run on the loop goroutine.
   362  func (engine *engine) runWorker(name string, delay time.Duration, start StartFunc, resourceGetter *resourceGetter) {
   363  
   364  	errAborted := errors.New("aborted before delay elapsed")
   365  
   366  	startAfterDelay := func() (worker.Worker, error) {
   367  		// NOTE: the resourceGetter will expire *after* the worker is started.
   368  		// This is tolerable because
   369  		//  1) we'll still correctly block access attempts most of the time
   370  		//  2) failing to block them won't cause data races anyway
   371  		//  3) it's not worth complicating the interface for every client just
   372  		//     to eliminate the possibility of one harmlessly dumb interaction.
   373  		defer resourceGetter.expire()
   374  		logger.Tracef("starting %q manifold worker in %s...", name, delay)
   375  		select {
   376  		case <-time.After(delay):
   377  		case <-engine.tomb.Dying():
   378  			return nil, errAborted
   379  		}
   380  		logger.Tracef("starting %q manifold worker", name)
   381  		return start(resourceGetter.getResource)
   382  	}
   383  
   384  	startWorkerAndWait := func() error {
   385  		worker, err := startAfterDelay()
   386  		switch errors.Cause(err) {
   387  		case errAborted:
   388  			return nil
   389  		case nil:
   390  			logger.Tracef("running %q manifold worker", name)
   391  		default:
   392  			logger.Tracef("failed to start %q manifold worker: %v", name, err)
   393  			return err
   394  		}
   395  		select {
   396  		case <-engine.tomb.Dying():
   397  			logger.Tracef("stopping %q manifold worker (shutting down)", name)
   398  			worker.Kill()
   399  		case engine.started <- startedTicket{name, worker, resourceGetter.accessLog}:
   400  			logger.Tracef("registered %q manifold worker", name)
   401  		}
   402  		return worker.Wait()
   403  	}
   404  
   405  	// We may or may not send on started, but we *must* send on stopped.
   406  	engine.stopped <- stoppedTicket{name, startWorkerAndWait(), resourceGetter.accessLog}
   407  }
   408  
   409  // gotStarted updates the engine to reflect the creation of a worker. It must
   410  // only be called from the loop goroutine.
   411  func (engine *engine) gotStarted(name string, worker worker.Worker, resourceLog []resourceAccess) {
   412  	// Copy current info; check preconditions and abort the workers if we've
   413  	// already been asked to stop it.
   414  	info := engine.current[name]
   415  	switch {
   416  	case info.worker != nil:
   417  		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker start", name))
   418  		fallthrough
   419  	case info.stopping, engine.isDying():
   420  		logger.Tracef("%q manifold worker no longer required", name)
   421  		worker.Kill()
   422  	default:
   423  		// It's fine to use this worker; update info and copy back.
   424  		logger.Tracef("%q manifold worker started", name)
   425  		engine.current[name] = workerInfo{
   426  			worker:      worker,
   427  			resourceLog: resourceLog,
   428  		}
   429  
   430  		// Any manifold that declares this one as an input needs to be restarted.
   431  		engine.bounceDependents(name)
   432  	}
   433  }
   434  
   435  // gotStopped updates the engine to reflect the demise of (or failure to create)
   436  // a worker. It must only be called from the loop goroutine.
   437  func (engine *engine) gotStopped(name string, err error, resourceLog []resourceAccess) {
   438  	logger.Tracef("%q manifold worker stopped: %v", name, err)
   439  
   440  	// Copy current info and check for reasons to stop the engine.
   441  	info := engine.current[name]
   442  	if info.stopped() {
   443  		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker stop", name))
   444  	} else if engine.config.IsFatal(err) {
   445  		engine.worstError = engine.config.WorstError(err, engine.worstError)
   446  		engine.tomb.Kill(nil)
   447  	}
   448  
   449  	// Reset engine info; and bail out if we can be sure there's no need to bounce.
   450  	engine.current[name] = workerInfo{
   451  		err:         err,
   452  		resourceLog: resourceLog,
   453  	}
   454  	if engine.isDying() {
   455  		logger.Tracef("permanently stopped %q manifold worker (shutting down)", name)
   456  		return
   457  	}
   458  
   459  	// If we told the worker to stop, we should start it again immediately,
   460  	// whatever else happened.
   461  	if info.stopping {
   462  		engine.requestStart(name, engine.config.BounceDelay)
   463  	} else {
   464  		// If we didn't stop it ourselves, we need to interpret the error.
   465  		switch errors.Cause(err) {
   466  		case nil:
   467  			// Nothing went wrong; the task completed successfully. Nothing
   468  			// needs to be done (unless the inputs change, in which case it
   469  			// gets to check again).
   470  		case ErrMissing:
   471  			// The task can't even start with the current state. Nothing more
   472  			// can be done (until the inputs change, in which case we retry
   473  			// anyway).
   474  		default:
   475  			// Something went wrong but we don't know what. Try again soon.
   476  			logger.Errorf("%q manifold worker returned unexpected error: %v", name, err)
   477  			engine.requestStart(name, engine.config.ErrorDelay)
   478  		}
   479  	}
   480  
   481  	// Manifolds that declared a dependency on this one only need to be notified
   482  	// if the worker has changed; if it was already nil, nobody needs to know.
   483  	if info.worker != nil {
   484  		engine.bounceDependents(name)
   485  	}
   486  }
   487  
   488  // requestStop ensures that any running or starting worker will be stopped in the
   489  // near future. It must only be called from the loop goroutine.
   490  func (engine *engine) requestStop(name string) {
   491  
   492  	// If already stopping or stopped, just don't do anything.
   493  	info := engine.current[name]
   494  	if info.stopping || info.stopped() {
   495  		return
   496  	}
   497  
   498  	// Update info, kill worker if present, and copy info back to engine.
   499  	info.stopping = true
   500  	if info.worker != nil {
   501  		info.worker.Kill()
   502  	}
   503  	engine.current[name] = info
   504  }
   505  
   506  // isDying returns true if the engine is shutting down. It's safe to call it
   507  // from any goroutine.
   508  func (engine *engine) isDying() bool {
   509  	select {
   510  	case <-engine.tomb.Dying():
   511  		return true
   512  	default:
   513  		return false
   514  	}
   515  }
   516  
   517  // allStopped returns true if no workers are running or starting. It must only
   518  // be called from the loop goroutine.
   519  func (engine *engine) allStopped() bool {
   520  	for _, info := range engine.current {
   521  		if !info.stopped() {
   522  			return false
   523  		}
   524  	}
   525  	return true
   526  }
   527  
   528  // bounceDependents starts every stopped dependent of the named manifold, and
   529  // stops every started one (and trusts the rest of the engine to restart them).
   530  // It must only be called from the loop goroutine.
   531  func (engine *engine) bounceDependents(name string) {
   532  	logger.Tracef("restarting dependents of %q manifold", name)
   533  	for _, dependentName := range engine.dependents[name] {
   534  		if engine.current[dependentName].stopped() {
   535  			engine.requestStart(dependentName, engine.config.BounceDelay)
   536  		} else {
   537  			engine.requestStop(dependentName)
   538  		}
   539  	}
   540  }
   541  
// workerInfo stores what an engine's loop goroutine needs to know about the
// worker for a given Manifold. The zero value describes a manifold with no
// worker running or starting.
type workerInfo struct {
	// starting is set by requestStart when a worker goroutine is launched.
	starting bool
	// stopping is set by requestStop once the worker has been asked to stop.
	stopping bool
	// worker is the running worker, as recorded by gotStarted; nil otherwise.
	worker worker.Worker
	// err is the error recorded by gotStopped when the worker last stopped.
	err error
	// resourceLog is the resource access log most recently delivered with a
	// started or stopped notification.
	resourceLog []resourceAccess
}
   551  
   552  // stopped returns true unless the worker is either assigned or starting.
   553  func (info workerInfo) stopped() bool {
   554  	switch {
   555  	case info.worker != nil:
   556  		return false
   557  	case info.starting:
   558  		return false
   559  	}
   560  	return true
   561  }
   562  
   563  // state returns the latest known state of the worker, for use in reports.
   564  func (info workerInfo) state() string {
   565  	switch {
   566  	case info.starting:
   567  		return "starting"
   568  	case info.stopping:
   569  		return "stopping"
   570  	case info.worker != nil:
   571  		return "started"
   572  	}
   573  	return "stopped"
   574  }
   575  
   576  // report returns any available report from the worker. If the worker is not
   577  // a Reporter, or is not present, this method will return nil.
   578  func (info workerInfo) report() map[string]interface{} {
   579  	if reporter, ok := info.worker.(Reporter); ok {
   580  		return reporter.Report()
   581  	}
   582  	return nil
   583  }
   584  
// installTicket is used by engine to induce installation of a named manifold
// and pass on any errors encountered in the process. Install sends one to the
// loop goroutine, which replies on result with the outcome of gotInstall.
type installTicket struct {
	name     string
	manifold Manifold
	result   chan<- error
}
   592  
// startedTicket is used by engine to notify the loop of the creation of the
// worker for a particular manifold. It is sent by runWorker, and carries the
// resource access log along with the new worker.
type startedTicket struct {
	name        string
	worker      worker.Worker
	resourceLog []resourceAccess
}
   600  
// stoppedTicket is used by engine to notify the loop of the demise of (or
// failure to create) the worker for a particular manifold. It is sent by
// runWorker, once for every call.
type stoppedTicket struct {
	name        string
	error       error
	resourceLog []resourceAccess
}
   608  
// reportTicket is used by the engine to notify the loop that a status report
// should be generated. The loop replies on result with its live report.
type reportTicket struct {
	result chan map[string]interface{}
}