github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/worker/dependency/engine.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package dependency

import (
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"launchpad.net/tomb"

	"github.com/juju/juju/worker"
)

var logger = loggo.GetLogger("juju.worker.dependency")

// EngineConfig defines the parameters needed to create a new engine.
type EngineConfig struct {
	// IsFatal allows errors generated by workers to stop the engine.
	IsFatal IsFatalFunc

	// MoreImportant allows fatal errors to be ranked according to importance.
	MoreImportant MoreImportantFunc

	// ErrorDelay controls how long the engine waits before restarting a worker
	// that encountered an unknown error.
	ErrorDelay time.Duration

	// BounceDelay controls how long the engine waits before restarting a worker
	// that was deliberately shut down because its dependencies changed.
	BounceDelay time.Duration
}

// Validate checks that the config values are sensible.
func (config *EngineConfig) Validate() error {
	if config.IsFatal == nil {
		return errors.New("engineconfig validation failed: IsFatal not specified")
	}
	if config.MoreImportant == nil {
		return errors.New("engineconfig validation failed: MoreImportant not specified")
	}
	if config.ErrorDelay <= 0 {
		return errors.New("engineconfig validation failed: ErrorDelay must be positive")
	}
	if config.BounceDelay <= 0 {
		return errors.New("engineconfig validation failed: BounceDelay must be positive")
	}
	return nil
}

// NewEngine returns an Engine that will maintain any installed Manifolds until
// either the engine is stopped or one of the manifolds' workers returns an error
// that satisfies isFatal. The caller takes responsibility for the returned Engine:
// it must Kill() the Engine when it's no longer needed, and must handle any
// error from Wait().
func NewEngine(config EngineConfig) (Engine, error) {
	if err := config.Validate(); err != nil {
		return nil, errors.Trace(err)
	}
	engine := &engine{
		config: config,

		manifolds:  map[string]Manifold{},
		dependents: map[string][]string{},
		current:    map[string]workerInfo{},

		install: make(chan installTicket),
		started: make(chan startedTicket),
		stopped: make(chan stoppedTicket),
	}
	go func() {
		defer engine.tomb.Done()
		engine.tomb.Kill(engine.loop())
	}()
	return engine, nil
}
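// The sketch below shows how a caller might construct and drive an engine. It
// is illustrative only: exampleEngineUsage is not part of the package API, the
// delay values are arbitrary, and it assumes the signatures implied by this
// file's use of the config funcs (IsFatalFunc as func(error) bool, and
// MoreImportantFunc as func(err0, err1 error) error).
func exampleEngineUsage() error {
	eng, err := NewEngine(EngineConfig{
		// Treat no error as fatal; failed workers are simply restarted.
		IsFatal: func(error) bool { return false },
		// No ranking to apply; keep whichever error arrived first.
		MoreImportant: func(err0, err1 error) error { return err1 },
		ErrorDelay:    3 * time.Second,
		BounceDelay:   10 * time.Millisecond,
	})
	if err != nil {
		return errors.Trace(err)
	}
	// The caller owns the engine: Kill it when done, and handle Wait's error.
	defer func() {
		eng.Kill()
		if err := eng.Wait(); err != nil {
			logger.Errorf("engine shut down with: %v", err)
		}
	}()
	// Manifolds would be installed here with eng.Install(name, manifold).
	return nil
}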
// engine maintains workers corresponding to its installed manifolds, and
// restarts them whenever their inputs change.
type engine struct {
	tomb tomb.Tomb

	// config contains values passed in as config when the engine was created.
	config EngineConfig

	// worstError tracks the most important error yet received from a manifold.
	// We use tomb.Tomb to track the engine's life cycle, but the first error we
	// get is not necessarily the most important one; MoreImportant ranks the
	// errors so that Wait can return the worst.
	worstError error

	// manifolds holds the installed manifolds by name.
	manifolds map[string]Manifold

	// dependents holds, for each named manifold, the names of those that
	// depend on it.
	dependents map[string][]string

	// current holds the active worker information for each installed manifold.
	current map[string]workerInfo

	// install, started, and stopped each communicate requests and changes into
	// the loop goroutine.
	install chan installTicket
	started chan startedTicket
	stopped chan stoppedTicket
}

// loop serializes manifold install operations and worker start/stop notifications.
// It's notable for its oneShotDying var, which is necessary because any number of
// start/stop notifications could be in flight at the point the engine needs to
// stop; we need to handle all those, and any subsequent messages, until the main
// loop is confident that every worker has stopped. (The usual pattern -- to defer
// a cleanup method to run before tomb.Done in NewEngine -- is not cleanly
// applicable, because it would need to duplicate that start/stop message handling;
// better to localise it in this method.)
func (engine *engine) loop() error {
	oneShotDying := engine.tomb.Dying()
	for {
		select {
		case <-oneShotDying:
			oneShotDying = nil
			for name := range engine.current {
				engine.requestStop(name)
			}
		case ticket := <-engine.install:
			// This is safe so long as the Install method reads the result.
			ticket.result <- engine.gotInstall(ticket.name, ticket.manifold)
		case ticket := <-engine.started:
			engine.gotStarted(ticket.name, ticket.worker)
		case ticket := <-engine.stopped:
			engine.gotStopped(ticket.name, ticket.error)
		}
		if engine.isDying() {
			if engine.allStopped() {
				if engine.worstError == nil {
					return tomb.ErrDying
				}
				return engine.worstError
			}
		}
	}
}

// Kill is part of the worker.Worker interface.
func (engine *engine) Kill() {
	engine.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (engine *engine) Wait() error {
	<-engine.tomb.Dead()
	return engine.worstError
}

// Install is part of the Engine interface.
func (engine *engine) Install(name string, manifold Manifold) error {
	result := make(chan error)
	select {
	case <-engine.tomb.Dying():
		return errors.New("engine is shutting down")
	case engine.install <- installTicket{name, manifold, result}:
		// This is safe so long as the loop sends a result.
		return <-result
	}
}

// gotInstall handles the params originally supplied to Install. It must only be
// called from the loop goroutine.
func (engine *engine) gotInstall(name string, manifold Manifold) error {
	logger.Infof("installing %q manifold...", name)
	if _, found := engine.manifolds[name]; found {
		return errors.Errorf("%q manifold already installed", name)
	}
	engine.manifolds[name] = manifold
	for _, input := range manifold.Inputs {
		engine.dependents[input] = append(engine.dependents[input], name)
	}
	engine.current[name] = workerInfo{}
	engine.requestStart(name, 0)
	return nil
}
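// What follows is a hypothetical sketch of a manifold being defined and
// installed: the "pinger" and "api-caller" names, and pingWorker itself, are
// invented for illustration. The Manifold fields (Inputs, Start, Output) and
// the GetResourceFunc behaviour are those used by this file.
type pingWorker struct {
	tomb tomb.Tomb
}

func (w *pingWorker) Kill()       { w.tomb.Kill(nil) }
func (w *pingWorker) Wait() error { return w.tomb.Wait() }

func newPingWorker() (worker.Worker, error) {
	w := &pingWorker{}
	go func() {
		defer w.tomb.Done()
		<-w.tomb.Dying() // a real worker would also select on actual work here
	}()
	return w, nil
}

// exampleInstall registers the sketch manifold with an engine. Its Start func
// declares a dependency on the mere existence of "api-caller" by passing a
// nil out value to getResource.
func exampleInstall(eng Engine) error {
	return eng.Install("pinger", Manifold{
		Inputs: []string{"api-caller"},
		Start: func(getResource GetResourceFunc) (worker.Worker, error) {
			if err := getResource("api-caller", nil); err != nil {
				return nil, err
			}
			return newPingWorker()
		},
	})
}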
// requestStart invokes a runWorker goroutine for the manifold with the supplied
// name. It must only be called from the loop goroutine.
func (engine *engine) requestStart(name string, delay time.Duration) {

	// Check preconditions.
	manifold, found := engine.manifolds[name]
	if !found {
		engine.tomb.Kill(errors.Errorf("fatal: unknown manifold %q", name))
	}

	// Copy current info and check more preconditions.
	info := engine.current[name]
	if !info.stopped() {
		engine.tomb.Kill(errors.Errorf("fatal: trying to start a second %q manifold worker", name))
	}

	// Final check that we're not shutting down yet...
	if engine.isDying() {
		logger.Debugf("not starting %q manifold worker (shutting down)", name)
		return
	}

	// ...then update the info, copy it back to the engine, and start a worker
	// goroutine based on current known state.
	info.starting = true
	engine.current[name] = info
	getResource := engine.getResourceFunc(name, manifold.Inputs)
	go engine.runWorker(name, delay, manifold.Start, getResource)
}

// getResourceFunc returns a GetResourceFunc backed by a snapshot of current
// worker state, restricted to those workers declared in inputs. It must only
// be called from the loop goroutine; see inside for a detailed discussion of
// why we took this approach.
func (engine *engine) getResourceFunc(name string, inputs []string) GetResourceFunc {
	// We snapshot the resources available at invocation time, rather than adding
	// an additional communicate-resource-request channel. The latter approach is
	// not unreasonable... but is prone to inelegant scrambles when starting
	// several dependent workers at once. For example:
	//
	//  * Install manifold A; loop starts worker A
	//  * Install manifold B; loop starts worker B
	//  * A communicates its worker back to loop; main thread bounces B
	//  * B asks for A, gets A, doesn't react to bounce (*)
	//  * B communicates its worker back to loop; loop kills it immediately in
	//    response to earlier bounce
	//  * loop starts worker B again, now everything's fine; but, still, yuck.
	//    This is not a happy path to take by default.
	//
	// The problem, of course, is in the (*); the main thread *does* know that B
	// needs to bounce soon anyway, and it *could* communicate that fact back via
	// an error over a channel back into getResource; the StartFunc could then
	// just return (say) ErrResourceChanged and avoid the hassle of creating a
	// worker. But that adds a whole layer of complexity (and unpredictability
	// in tests, which is not much fun) for very little benefit.
	//
	// In the analogous scenario with snapshotted dependencies, we see a happier
	// picture at startup time:
	//
	//  * Install manifold A; loop starts worker A
	//  * Install manifold B; loop starts worker B with empty resource snapshot
	//  * A communicates its worker back to loop; main thread bounces B
	//  * B's StartFunc asks for A, gets nothing, returns ErrMissing
	//  * loop restarts worker B with an up-to-date snapshot, B works fine
	//
	// We assume that, in the common case, most workers run without error most
	// of the time; and, thus, that the vast majority of worker startups will
	// happen as an agent starts. Furthermore, most of them will have simple
	// hard dependencies, and their Start funcs will be easy to write; the only
	// components that may be impacted by such a strategy will be those workers
	// which still want to run (with reduced functionality) with some dependency
	// unmet.
	//
	// Those may indeed suffer the occasional extra bounce as the system comes
	// to stability as it starts, or after a change; but workers *must* be
	// written for resilience in the face of arbitrary bounces *anyway*, so it
	// shouldn't be harmful.
	outputs := map[string]OutputFunc{}
	workers := map[string]worker.Worker{}
	for _, resourceName := range inputs {
		outputs[resourceName] = engine.manifolds[resourceName].Output
		workers[resourceName] = engine.current[resourceName].worker
	}
	return func(resourceName string, out interface{}) error {
		logger.Debugf("%q manifold requested %q resource", name, resourceName)
		input := workers[resourceName]
		if input == nil {
			// No worker running (or not declared).
			return ErrMissing
		}
		convert := outputs[resourceName]
		if convert == nil {
			// No conversion func available...
			if out != nil {
				// ...and the caller wants a resource.
				return ErrMissing
			}
			// ...but it's ok, because the caller depends on existence only.
			return nil
		}
		return convert(input, out)
	}
}
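// The closure above resolves a request in one of three ways: no worker means
// ErrMissing; a nil out means an existence-only check; otherwise the input
// manifold's Output func fills the out value. The hypothetical sketch below
// shows both sides of that conversion; APIConn and connWorker are invented,
// and pingWorker/newPingWorker come from the earlier sketch.
type APIConn struct{}

// connWorker is a sketch worker that owns a connection its Output can expose.
type connWorker struct {
	pingWorker
	conn APIConn
}

// exampleOutput is an OutputFunc: it projects a *connWorker into a **APIConn.
func exampleOutput(in worker.Worker, out interface{}) error {
	w, ok := in.(*connWorker)
	if !ok {
		return errors.Errorf("expected *connWorker, got %T", in)
	}
	p, ok := out.(**APIConn)
	if !ok {
		return errors.Errorf("cannot output to %T", out)
	}
	*p = &w.conn
	return nil
}

// exampleStart is the consuming side: it asks a hypothetical "api-caller"
// manifold for a *APIConn, which exampleOutput above would fill in.
func exampleStart(getResource GetResourceFunc) (worker.Worker, error) {
	var conn *APIConn
	if err := getResource("api-caller", &conn); err != nil {
		// Usually ErrMissing: the dependency isn't running (yet). Returning
		// the error lets the engine restart this worker as inputs change.
		return nil, err
	}
	_ = conn // a real worker would hold on to the connection
	return newPingWorker()
}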
// runWorker starts the supplied manifold's worker and communicates it back to the
// loop goroutine; waits for worker completion; and communicates any error
// encountered back to the loop goroutine. It must not be run on the loop goroutine.
func (engine *engine) runWorker(name string, delay time.Duration, start StartFunc, getResource GetResourceFunc) {
	startWorkerAndWait := func() error {
		logger.Infof("starting %q manifold worker in %s...", name, delay)
		select {
		case <-time.After(delay):
		case <-engine.tomb.Dying():
			logger.Debugf("not starting %q manifold worker (shutting down)", name)
			return tomb.ErrDying
		}

		logger.Debugf("starting %q manifold worker", name)
		worker, err := start(getResource)
		if err != nil {
			logger.Warningf("failed to start %q manifold worker: %v", name, err)
			return err
		}

		logger.Debugf("running %q manifold worker", name)
		select {
		case <-engine.tomb.Dying():
			logger.Debugf("stopping %q manifold worker (shutting down)", name)
			worker.Kill()
		case engine.started <- startedTicket{name, worker}:
			logger.Debugf("registered %q manifold worker", name)
		}
		return worker.Wait()
	}

	// We may or may not send on started, but we *must* send on stopped.
	engine.stopped <- stoppedTicket{name, startWorkerAndWait()}
}

// gotStarted updates the engine to reflect the creation of a worker. It must
// only be called from the loop goroutine.
func (engine *engine) gotStarted(name string, worker worker.Worker) {
	// Copy current info; check preconditions and abort the worker if we've
	// already been asked to stop it.
	info := engine.current[name]
	switch {
	case info.worker != nil:
		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker start", name))
		fallthrough
	case info.stopping, engine.isDying():
		logger.Debugf("%q manifold worker no longer required", name)
		worker.Kill()
	default:
		// It's fine to use this worker; update info and copy back.
		logger.Infof("%q manifold worker started", name)
		info.starting = false
		info.worker = worker
		engine.current[name] = info

		// Any manifold that declares this one as an input needs to be restarted.
		engine.bounceDependents(name)
	}
}
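// gotStopped (below) feeds worker errors through config.IsFatal and
// config.MoreImportant to decide whether to stop the whole engine, and which
// error Wait should eventually report. A plausible, hypothetical pair of
// implementations, built around an invented errTerminate sentinel:
var errTerminate = errors.New("machine is dead")

func exampleIsFatal(err error) bool {
	return errors.Cause(err) == errTerminate
}

func exampleMoreImportant(err0, err1 error) error {
	// With only one fatal error value there is nothing to rank, so keep the
	// error seen first; a richer implementation would order by severity.
	if err1 != nil {
		return err1
	}
	return err0
}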
// gotStopped updates the engine to reflect the demise of (or failure to create)
// a worker. It must only be called from the loop goroutine.
func (engine *engine) gotStopped(name string, err error) {
	logger.Infof("%q manifold worker stopped: %v", name, err)

	// Copy current info and check for reasons to stop the engine.
	info := engine.current[name]
	if info.stopped() {
		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker stop", name))
	} else if engine.config.IsFatal(err) {
		// Rank this error against the worst seen so far, so that Wait reports
		// the most important error rather than merely the first.
		if engine.worstError == nil {
			engine.worstError = err
		} else {
			engine.worstError = engine.config.MoreImportant(err, engine.worstError)
		}
		engine.tomb.Kill(err)
	}

	// Reset engine info; and bail out if we can be sure there's no need to bounce.
	engine.current[name] = workerInfo{}
	if engine.isDying() {
		logger.Debugf("permanently stopped %q manifold worker (shutting down)", name)
		return
	}

	// If we told the worker to stop, we should start it again immediately,
	// whatever else happened.
	if info.stopping {
		engine.requestStart(name, engine.config.BounceDelay)
	} else {
		// If we didn't stop it ourselves, we need to interpret the error.
		switch err {
		case nil:
			// Nothing went wrong; the task completed successfully. Nothing
			// needs to be done (unless the inputs change, in which case it
			// gets to check again).
		case ErrMissing:
			// The task can't even start with the current state. Nothing more
			// can be done (until the inputs change, in which case we retry
			// anyway).
		default:
			// Something went wrong, but we don't know what. Try again soon.
			engine.requestStart(name, engine.config.ErrorDelay)
		}
	}

	// Manifolds that declared a dependency on this one only need to be notified
	// if the worker has changed; if it was already nil, nobody needs to know.
	if info.worker != nil {
		engine.bounceDependents(name)
	}
}

// requestStop ensures that any running or starting worker will be stopped in the
// near future. It must only be called from the loop goroutine.
func (engine *engine) requestStop(name string) {

	// If the worker is already stopping or stopped, there's nothing to do.
	info := engine.current[name]
	if info.stopping || info.stopped() {
		return
	}

	// Update info, kill the worker if present, and copy the info back.
	info.stopping = true
	if info.worker != nil {
		info.worker.Kill()
	}
	engine.current[name] = info
}

// isDying returns true if the engine is shutting down. It's safe to call from
// any goroutine.
func (engine *engine) isDying() bool {
	select {
	case <-engine.tomb.Dying():
		return true
	default:
		return false
	}
}

// allStopped returns true if no workers are running or starting. It must only
// be called from the loop goroutine.
func (engine *engine) allStopped() bool {
	for _, info := range engine.current {
		if !info.stopped() {
			return false
		}
	}
	return true
}
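// End to end, the pieces combine like this: if "a"'s worker stops, gotStopped
// calls bounceDependents (below), which stops "b"; when "b"'s own stop is
// observed, info.stopping triggers a restart after BounceDelay, with a fresh
// resource snapshot. A hypothetical two-manifold wiring, reusing the sketch
// helpers defined earlier:
func exampleWiring(eng Engine) error {
	if err := eng.Install("a", Manifold{
		Start: func(_ GetResourceFunc) (worker.Worker, error) {
			return newPingWorker()
		},
	}); err != nil {
		return errors.Trace(err)
	}
	return eng.Install("b", Manifold{
		Inputs: []string{"a"},
		Start: func(getResource GetResourceFunc) (worker.Worker, error) {
			if err := getResource("a", nil); err != nil {
				return nil, err // ErrMissing until "a" is running
			}
			return newPingWorker()
		},
	})
}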
// bounceDependents starts every stopped dependent of the named manifold, and
// stops every started one (and trusts the rest of the engine to restart them).
// It must only be called from the loop goroutine.
func (engine *engine) bounceDependents(name string) {
	logger.Debugf("restarting dependents of %q manifold", name)
	for _, dependentName := range engine.dependents[name] {
		if engine.current[dependentName].stopped() {
			engine.requestStart(dependentName, engine.config.BounceDelay)
		} else {
			engine.requestStop(dependentName)
		}
	}
}

// workerInfo stores what an engine's loop goroutine needs to know about the
// worker for a given Manifold.
type workerInfo struct {
	starting bool
	stopping bool
	worker   worker.Worker
}

// stopped returns true unless the worker is either assigned or starting.
func (info workerInfo) stopped() bool {
	switch {
	case info.worker != nil:
		return false
	case info.starting:
		return false
	}
	return true
}

// installTicket is used by engine to induce installation of a named manifold
// and pass on any errors encountered in the process.
type installTicket struct {
	name     string
	manifold Manifold
	result   chan<- error
}

// startedTicket is used by engine to notify the loop of the creation of the
// worker for a particular manifold.
type startedTicket struct {
	name   string
	worker worker.Worker
}

// stoppedTicket is used by engine to notify the loop of the demise of (or
// failure to create) the worker for a particular manifold.
type stoppedTicket struct {
	name  string
	error error
}
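// Taken together, the three ticket types give the loop goroutine exclusive
// ownership of the engine's mutable state: every external event -- an Install
// call, a worker starting, a worker stopping -- reaches that state only via a
// channel send into loop, which is why the engine needs no locking.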