github.com/Pankov404/juju@v0.0.0-20150703034450-be266991dceb/worker/dependency/engine.go (about)

     1  // Copyright 2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package dependency
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/errors"
    10  	"github.com/juju/loggo"
    11  	"launchpad.net/tomb"
    12  
    13  	"github.com/juju/juju/worker"
    14  )
    15  
// logger is the logger shared by everything in this package; it is obtained
// from loggo under the "juju.worker.dependency" name.
var logger = loggo.GetLogger("juju.worker.dependency")
    17  
    18  // NewEngine returns an Engine that will maintain any installed Manifolds until
    19  // either the engine is stopped or one of the manifolds' workers returns an error
    20  // that satisfies isFatal. The caller takes responsibility for the returned Engine:
    21  // it's responsible for Kill()ing the Engine when no longer used, and must handle
    22  // any error from Wait().
    23  func NewEngine(isFatal IsFatalFunc, errorDelay, bounceDelay time.Duration) Engine {
    24  	engine := &engine{
    25  		isFatal:     isFatal,
    26  		errorDelay:  errorDelay,
    27  		bounceDelay: bounceDelay,
    28  
    29  		manifolds:  map[string]Manifold{},
    30  		dependents: map[string][]string{},
    31  		current:    map[string]workerInfo{},
    32  
    33  		install: make(chan installTicket),
    34  		started: make(chan startedTicket),
    35  		stopped: make(chan stoppedTicket),
    36  	}
    37  	go func() {
    38  		defer engine.tomb.Done()
    39  		engine.tomb.Kill(engine.loop())
    40  	}()
    41  	return engine
    42  }
    43  
// engine maintains workers corresponding to its installed manifolds, and
// restarts them whenever their inputs change. Apart from tomb and the three
// channels, its fields are only read and written by the loop goroutine (see
// the "must only be called from the loop goroutine" notes on its methods).
type engine struct {
	// tomb tracks the lifetime of the loop goroutine; Kill and Wait delegate
	// to it, and its Dying channel signals shutdown to every other goroutine.
	tomb tomb.Tomb

	// isFatal allows errors generated by workers to stop the engine.
	isFatal IsFatalFunc

	// errorDelay controls how long the engine waits before restarting a worker
	// that encountered an unknown error.
	errorDelay time.Duration

	// bounceDelay controls how long the engine waits before restarting a worker
	// that was deliberately shut down because its dependencies changed.
	bounceDelay time.Duration

	// manifolds holds the installed manifolds by name.
	manifolds map[string]Manifold

	// dependents holds, for each named manifold, those that depend on it
	// (i.e. those that list it among their Inputs).
	dependents map[string][]string

	// current holds the active worker information for each installed manifold.
	current map[string]workerInfo

	// install, started, and stopped each communicate requests and changes into
	// the loop goroutine. All three are unbuffered.
	install chan installTicket
	started chan startedTicket
	stopped chan stoppedTicket
}
    75  
    76  // loop serializes manifold install operations and worker start/stop notifications.
    77  // It's notable for its oneShotDying var, which is necessary because any number of
    78  // start/stop notification could be in flight at the point the engine needs to stop;
    79  // we need to handle all those, and any subsequent messages, until the main loop is
    80  // confident that every worker has stopped. (The usual pattern -- to defer a cleanup
    81  // method to run before tomb.Done in NewEngine -- is not cleanly applicable, because
    82  // it needs to duplicate that start/stop message handling; better to localise that
    83  // in this method.)
    84  func (engine *engine) loop() error {
    85  	oneShotDying := engine.tomb.Dying()
    86  	for {
    87  		select {
    88  		case <-oneShotDying:
    89  			oneShotDying = nil
    90  			for name := range engine.current {
    91  				engine.requestStop(name)
    92  			}
    93  		case ticket := <-engine.install:
    94  			// This is safe so long as the Install method reads the result.
    95  			ticket.result <- engine.gotInstall(ticket.name, ticket.manifold)
    96  		case ticket := <-engine.started:
    97  			engine.gotStarted(ticket.name, ticket.worker)
    98  		case ticket := <-engine.stopped:
    99  			engine.gotStopped(ticket.name, ticket.error)
   100  		}
   101  		if engine.isDying() {
   102  			if engine.allStopped() {
   103  				return tomb.ErrDying
   104  			}
   105  		}
   106  	}
   107  }
   108  
   109  // Kill is part of the worker.Worker interface.
   110  func (engine *engine) Kill() {
   111  	engine.tomb.Kill(nil)
   112  }
   113  
   114  // Wait is part of the worker.Worker interface.
   115  func (engine *engine) Wait() error {
   116  	return engine.tomb.Wait()
   117  }
   118  
   119  // Install is part of the Engine interface.
   120  func (engine *engine) Install(name string, manifold Manifold) error {
   121  	result := make(chan error)
   122  	select {
   123  	case <-engine.tomb.Dying():
   124  		return errors.New("engine is shutting down")
   125  	case engine.install <- installTicket{name, manifold, result}:
   126  		// This is safe so long as the loop sends a result.
   127  		return <-result
   128  	}
   129  }
   130  
   131  // gotInstall handles the params originally supplied to Install. It must only be
   132  // called from the loop goroutine.
   133  func (engine *engine) gotInstall(name string, manifold Manifold) error {
   134  	logger.Infof("installing %q manifold...", name)
   135  	if _, found := engine.manifolds[name]; found {
   136  		return errors.Errorf("%q manifold already installed", name)
   137  	}
   138  	engine.manifolds[name] = manifold
   139  	for _, input := range manifold.Inputs {
   140  		engine.dependents[input] = append(engine.dependents[input], name)
   141  	}
   142  	engine.current[name] = workerInfo{}
   143  	engine.requestStart(name, 0)
   144  	return nil
   145  }
   146  
   147  // requestStart invokes a runWorker goroutine for the manifold with the supplied
   148  // name. It must only be called from the loop goroutine.
   149  func (engine *engine) requestStart(name string, delay time.Duration) {
   150  
   151  	// Check preconditions.
   152  	manifold, found := engine.manifolds[name]
   153  	if !found {
   154  		engine.tomb.Kill(errors.Errorf("fatal: unknown manifold %q", name))
   155  	}
   156  
   157  	// Copy current info and check more preconditions.
   158  	info := engine.current[name]
   159  	if !info.stopped() {
   160  		engine.tomb.Kill(errors.Errorf("fatal: trying to start a second %q manifold worker", name))
   161  	}
   162  
   163  	// Final check that we're not shutting down yet...
   164  	if engine.isDying() {
   165  		logger.Debugf("not starting %q manifold worker (shutting down)", name)
   166  		return
   167  	}
   168  
   169  	// ...then update the info, copy it back to the engine, and start a worker
   170  	// goroutine based on current known state.
   171  	info.starting = true
   172  	engine.current[name] = info
   173  	getResource := engine.getResourceFunc(name, manifold.Inputs)
   174  	go engine.runWorker(name, delay, manifold.Start, getResource)
   175  }
   176  
   177  // getResourceFunc returns a GetResourceFunc backed by a snapshot of current
   178  // worker state, restricted to those workers declared in inputs. It must only
   179  // be called from the loop goroutine; see inside for a detailed dicsussion of
   180  // why we took this appproach.
   181  func (engine *engine) getResourceFunc(name string, inputs []string) GetResourceFunc {
   182  	// We snapshot the resources available at invocation time, rather than adding an
   183  	// additional communicate-resource-request channel. The latter approach is not
   184  	// unreasonable... but is prone to inelegant scrambles when starting several
   185  	// dependent workers at once. For example:
   186  	//
   187  	//  * Install manifold A; loop starts worker A
   188  	//  * Install manifold B; loop starts worker B
   189  	//  * A communicates its worker back to loop; main thread bounces B
   190  	//  * B asks for A, gets A, doesn't react to bounce (*)
   191  	//  * B communicates its worker back to loop; loop kills it immediately in
   192  	//    response to earlier bounce
   193  	//  * loop starts worker B again, now everything's fine; but, still, yuck.
   194  	//    This is not a happy path to take by default.
   195  	//
   196  	// The problem, of course, is in the (*); the main thread *does* know that B
   197  	// needs to bounce soon anyway, and it *could* communicate that fact back via
   198  	// an error over a channel back into getResource; the StartFunc could then
   199  	// just return (say) that ErrResourceChanged and avoid the hassle of creating
   200  	// a worker. But that adds a whole layer of complexity (and unpredictability
   201  	// in tests, which is not much fun) for very little benefit.
   202  	//
   203  	// In the analogous scenario with snapshotted dependencies, we see a happier
   204  	// picture at startup time:
   205  	//
   206  	//  * Install manifold A; loop starts worker A
   207  	//  * Install manifold B; loop starts worker B with empty resource snapshot
   208  	//  * A communicates its worker back to loop; main thread bounces B
   209  	//  * B's StartFunc asks for A, gets nothing, returns ErrUnmetDependencies
   210  	//  * loop restarts worker B with an up-to-date snapshot, B works fine
   211  	//
   212  	// We assume that, in the common case, most workers run without error most
   213  	// of the time; and, thus, that the vast majority of worker startups will
   214  	// happen as an agent starts. Furthermore, most of them will have simple
   215  	// hard dependencies, and their Start funcs will be easy to write; the only
   216  	// components that may be impacted by such a strategy will be those workers
   217  	// which still want to run (with reduced functionality) with some dependency
   218  	// unmet.
   219  	//
   220  	// Those may indeed suffer the occasional extra bounce as the system comes
   221  	// to stability as it starts, or after a change; but workers *must* be
   222  	// written for resilience in the face of arbitrary bounces *anyway*, so it
   223  	// shouldn't be harmful
   224  	outputs := map[string]OutputFunc{}
   225  	workers := map[string]worker.Worker{}
   226  	for _, resourceName := range inputs {
   227  		outputs[resourceName] = engine.manifolds[resourceName].Output
   228  		workers[resourceName] = engine.current[resourceName].worker
   229  	}
   230  	return func(resourceName string, out interface{}) error {
   231  		logger.Debugf("%q manifold requested %q resource", name, resourceName)
   232  		input := workers[resourceName]
   233  		if input == nil {
   234  			// No worker running (or not declared).
   235  			return ErrMissing
   236  		}
   237  		convert := outputs[resourceName]
   238  		if convert == nil {
   239  			// No conversion func available...
   240  			if out != nil {
   241  				// ...and the caller wants a resource.
   242  				return ErrMissing
   243  			}
   244  			// ...but it's ok, because the caller depends on existence only.
   245  			return nil
   246  		}
   247  		return convert(input, out)
   248  	}
   249  }
   250  
   251  // runWorker starts the supplied manifold's worker and communicates it back to the
   252  // loop goroutine; waits for worker completion; and communicates any error encountered
   253  // back to the loop goroutine. It must not be run on the loop goroutine.
   254  func (engine *engine) runWorker(name string, delay time.Duration, start StartFunc, getResource GetResourceFunc) {
   255  	startWorkerAndWait := func() error {
   256  		logger.Infof("starting %q manifold worker in %s...", name, delay)
   257  		select {
   258  		case <-time.After(delay):
   259  		case <-engine.tomb.Dying():
   260  			logger.Debugf("not starting %q manifold worker (shutting down)", name)
   261  			return tomb.ErrDying
   262  		}
   263  
   264  		logger.Debugf("starting %q manifold worker", name)
   265  		worker, err := start(getResource)
   266  		if err != nil {
   267  			logger.Warningf("failed to start %q manifold worker: %v", name, err)
   268  			return err
   269  		}
   270  
   271  		logger.Debugf("running %q manifold worker", name)
   272  		select {
   273  		case <-engine.tomb.Dying():
   274  			logger.Debugf("stopping %q manifold worker (shutting down)", name)
   275  			worker.Kill()
   276  		case engine.started <- startedTicket{name, worker}:
   277  			logger.Debugf("registered %q manifold worker", name)
   278  		}
   279  		return worker.Wait()
   280  	}
   281  
   282  	// We may or may not send on started, but we *must* send on stopped.
   283  	engine.stopped <- stoppedTicket{name, startWorkerAndWait()}
   284  }
   285  
   286  // gotStarted updates the engine to reflect the creation of a worker. It must
   287  // only be called from the loop goroutine.
   288  func (engine *engine) gotStarted(name string, worker worker.Worker) {
   289  	// Copy current info; check preconditions and abort the workers if we've
   290  	// already been asked to stop it.
   291  	info := engine.current[name]
   292  	switch {
   293  	case info.worker != nil:
   294  		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker start", name))
   295  		fallthrough
   296  	case info.stopping, engine.isDying():
   297  		logger.Debugf("%q manifold worker no longer required", name)
   298  		worker.Kill()
   299  	default:
   300  		// It's fine to use this worker; update info and copy back.
   301  		logger.Infof("%q manifold worker started", name)
   302  		info.starting = false
   303  		info.worker = worker
   304  		engine.current[name] = info
   305  
   306  		// Any manifold that declares this one as an input needs to be restarted.
   307  		engine.bounceDependents(name)
   308  	}
   309  }
   310  
   311  // gotStopped updates the engine to reflect the demise of (or failure to create)
   312  // a worker. It must only be called from the loop goroutine.
   313  func (engine *engine) gotStopped(name string, err error) {
   314  	logger.Infof("%q manifold worker stopped: %v", name, err)
   315  
   316  	// Copy current info and check for reasons to stop the engine.
   317  	info := engine.current[name]
   318  	if info.stopped() {
   319  		engine.tomb.Kill(errors.Errorf("fatal: unexpected %q manifold worker stop", name))
   320  	} else if engine.isFatal(err) {
   321  		engine.tomb.Kill(err)
   322  	}
   323  
   324  	// Reset engine info; and bail out if we can be sure there's no need to bounce.
   325  	engine.current[name] = workerInfo{}
   326  	if engine.isDying() {
   327  		logger.Debugf("permanently stopped %q manifold worker (shutting down)", name)
   328  		return
   329  	}
   330  
   331  	// If we told the worker to stop, we should start it again immediately,
   332  	// whatever else happened.
   333  	if info.stopping {
   334  		engine.requestStart(name, engine.bounceDelay)
   335  	} else {
   336  		// If we didn't stop it ourselves, we need to interpret the error.
   337  		switch err {
   338  		case nil:
   339  			// Nothing went wrong; the task completed successfully. Nothing
   340  			// needs to be done (unless the inputs change, in which case it
   341  			// gets to check again).
   342  		case ErrMissing:
   343  			// The task can't even start with the current state. Nothing more
   344  			// can be done (until the inputs change, in which case we retry
   345  			// anyway).
   346  		default:
   347  			// Something went wrong but we don't know what. Try again soon.
   348  			engine.requestStart(name, engine.errorDelay)
   349  		}
   350  	}
   351  
   352  	// Manifolds that declared a dependency on this one only need to be notified
   353  	// if the worker has changed; if it was already nil, nobody needs to know.
   354  	if info.worker != nil {
   355  		engine.bounceDependents(name)
   356  	}
   357  }
   358  
   359  // requestStop ensures that any running or starting worker will be stopped in the
   360  // near future. It must only be called from the loop goroutine.
   361  func (engine *engine) requestStop(name string) {
   362  
   363  	// If already stopping or stopped, just don't do anything.
   364  	info := engine.current[name]
   365  	if info.stopping || info.stopped() {
   366  		return
   367  	}
   368  
   369  	// Update info, kill worker if present, and copy info back to engine.
   370  	info.stopping = true
   371  	if info.worker != nil {
   372  		info.worker.Kill()
   373  	}
   374  	engine.current[name] = info
   375  }
   376  
   377  // isDying returns true if the engine is shutting down. It's safe to call it
   378  // from any goroutine.
   379  func (engine *engine) isDying() bool {
   380  	select {
   381  	case <-engine.tomb.Dying():
   382  		return true
   383  	default:
   384  		return false
   385  	}
   386  }
   387  
   388  // allStopped returns true if no workers are running or starting. It must only
   389  // be called from the loop goroutine.
   390  func (engine *engine) allStopped() bool {
   391  	for _, info := range engine.current {
   392  		if !info.stopped() {
   393  			return false
   394  		}
   395  	}
   396  	return true
   397  }
   398  
   399  // bounceDependents starts every stopped dependent of the named manifold, and
   400  // stops every started one (and trusts the rest of the engine to restart them).
   401  // It must only be called from the loop goroutine.
   402  func (engine *engine) bounceDependents(name string) {
   403  	logger.Debugf("restarting dependents of %q manifold", name)
   404  	for _, dependentName := range engine.dependents[name] {
   405  		if engine.current[dependentName].stopped() {
   406  			engine.requestStart(dependentName, engine.bounceDelay)
   407  		} else {
   408  			engine.requestStop(dependentName)
   409  		}
   410  	}
   411  }
   412  
// workerInfo stores what an engine's loop goroutine needs to know about the
// worker for a given Manifold. The zero value describes a stopped worker.
type workerInfo struct {
	// starting is set when a start has been requested and cleared once the
	// worker has been recorded.
	starting bool
	// stopping is set when the engine has asked the worker to stop.
	stopping bool
	// worker is the running worker, or nil if none has been recorded.
	worker   worker.Worker
}
   420  
   421  // stopped returns true unless the worker is either assigned or starting.
   422  func (info workerInfo) stopped() bool {
   423  	switch {
   424  	case info.worker != nil:
   425  		return false
   426  	case info.starting:
   427  		return false
   428  	}
   429  	return true
   430  }
   431  
// installTicket is used by engine to induce installation of a named manifold
// and pass on any errors encountered in the process.
type installTicket struct {
	// name is the name to install the manifold under.
	name     string
	// manifold is the manifold to install.
	manifold Manifold
	// result receives the outcome of the installation from the loop goroutine.
	result   chan<- error
}
   439  
// startedTicket is used by engine to notify the loop of the creation of the
// worker for a particular manifold.
type startedTicket struct {
	// name identifies the manifold the worker belongs to.
	name   string
	// worker is the newly created worker.
	worker worker.Worker
}
   446  
// stoppedTicket is used by engine to notify the loop of the demise of (or
// failure to create) the worker for a particular manifold.
type stoppedTicket struct {
	// name identifies the manifold the worker belonged to.
	name  string
	// error is the error the start attempt or the worker finished with; it
	// may be nil.
	error error
}