github.com/cloud-green/juju@v0.0.0-20151002100041-a00291338d3d/worker/runner.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package worker
     5  
     6  import (
     7  	"errors"
     8  	"time"
     9  
    10  	"launchpad.net/tomb"
    11  )
    12  
    13  // RestartDelay holds the length of time that a worker
    14  // will wait between exiting and restarting.
    15  var RestartDelay = 3 * time.Second
    16  
// Worker is implemented by a running worker.
//
// The two methods are used together to shut a worker down: Kill
// requests the stop and Wait collects the result (see Stop).
type Worker interface {
	// Kill asks the worker to stop without necessarily
	// waiting for it to do so.
	Kill()
	// Wait waits for the worker to exit and returns any
	// error encountered when it was running.
	Wait() error
}
    26  
// Runner is implemented by instances capable of starting and stopping workers.
type Runner interface {
	// Worker is embedded, so a Runner can itself be killed and waited
	// for like any other worker.
	Worker
	// StartWorker starts a worker associated with id; startFunc is
	// called to create the worker.
	StartWorker(id string, startFunc func() (Worker, error)) error
	// StopWorker stops the worker associated with id.
	StopWorker(id string) error
}
    33  
// runner runs a set of workers, restarting them as necessary
// when they fail.
//
// The set of workers is owned by the goroutine executing run; the
// exported methods and the per-worker goroutines communicate with it
// through the channels held here:
//
//   - tomb is the runner's own lifecycle; Kill and Wait delegate to it.
//   - startc carries StartWorker requests to run.
//   - stopc carries the ids passed to StopWorker to run.
//   - donec is sent on by runWorker when a worker has finished, failed
//     to start, or had its delayed start abandoned.
//   - startedc is sent on by runWorker once a worker has been created.
//   - isFatal reports whether a worker error should bring the runner down.
//   - moreImportant reports whether err0 should replace err1 as the
//     error the runner finishes with.
type runner struct {
	tomb          tomb.Tomb
	startc        chan startReq
	stopc         chan string
	donec         chan doneInfo
	startedc      chan startInfo
	isFatal       func(error) bool
	moreImportant func(err0, err1 error) bool
}
    45  
// Compile-time check that *runner implements Runner.
var _ Runner = (*runner)(nil)
    47  
// startReq is the message StartWorker sends to the run loop: the id of
// the worker to start and the function that creates it.
type startReq struct {
	id    string
	start func() (Worker, error)
}
    52  
// startInfo is sent by runWorker to the run loop once the start
// function has returned a worker without error; it carries the worker
// id and the newly created worker.
type startInfo struct {
	id     string
	worker Worker
}
    57  
// doneInfo is sent by runWorker to the run loop when the goroutine for
// the worker with the given id is finishing. err is the error from the
// start function or from the worker's Wait, and is nil when the worker
// exited cleanly or its delayed start was abandoned.
type doneInfo struct {
	id  string
	err error
}
    62  
    63  // NewRunner creates a new Runner.  When a worker finishes, if its error
    64  // is deemed fatal (determined by calling isFatal), all the other workers
    65  // will be stopped and the runner itself will finish.  Of all the fatal errors
    66  // returned by the stopped workers, only the most important one,
    67  // determined by calling moreImportant, will be returned from
    68  // Runner.Wait. Non-fatal errors will not be returned.
    69  //
    70  // The function isFatal(err) returns whether err is a fatal error.  The
    71  // function moreImportant(err0, err1) returns whether err0 is considered
    72  // more important than err1.
    73  func NewRunner(isFatal func(error) bool, moreImportant func(err0, err1 error) bool) Runner {
    74  	runner := &runner{
    75  		startc:        make(chan startReq),
    76  		stopc:         make(chan string),
    77  		donec:         make(chan doneInfo),
    78  		startedc:      make(chan startInfo),
    79  		isFatal:       isFatal,
    80  		moreImportant: moreImportant,
    81  	}
    82  	go func() {
    83  		defer runner.tomb.Done()
    84  		runner.tomb.Kill(runner.run())
    85  	}()
    86  	return runner
    87  }
    88  
// ErrDead is returned by StartWorker and StopWorker when the runner
// is not running.
var ErrDead = errors.New("worker runner is not running")
    90  
    91  // StartWorker starts a worker running associated with the given id.
    92  // The startFunc function will be called to create the worker;
    93  // when the worker exits, it will be restarted as long as it
    94  // does not return a fatal error.
    95  //
    96  // If there is already a worker with the given id, nothing will be done.
    97  //
    98  // StartWorker returns ErrDead if the runner is not running.
    99  func (runner *runner) StartWorker(id string, startFunc func() (Worker, error)) error {
   100  	select {
   101  	case runner.startc <- startReq{id, startFunc}:
   102  		return nil
   103  	case <-runner.tomb.Dead():
   104  	}
   105  	return ErrDead
   106  }
   107  
   108  // StopWorker stops the worker associated with the given id.
   109  // It does nothing if there is no such worker.
   110  //
   111  // StopWorker returns ErrDead if the runner is not running.
   112  func (runner *runner) StopWorker(id string) error {
   113  	select {
   114  	case runner.stopc <- id:
   115  		return nil
   116  	case <-runner.tomb.Dead():
   117  	}
   118  	return ErrDead
   119  }
   120  
// Wait implements Worker.Wait by returning the result of the runner's
// tomb Wait. NewRunner kills that tomb with the value returned by the
// run loop before marking it done.
func (runner *runner) Wait() error {
	return runner.tomb.Wait()
}
   124  
// Kill implements Worker.Kill by killing the runner's tomb with a nil
// error. It does not wait; the run loop reacts to the tomb dying by
// killing its workers.
func (runner *runner) Kill() {
	logger.Debugf("killing runner %p", runner)
	runner.tomb.Kill(nil)
}
   129  
   130  // Stop kills the given worker and waits for it to exit.
   131  func Stop(worker Worker) error {
   132  	worker.Kill()
   133  	return worker.Wait()
   134  }
   135  
// workerInfo is the run loop's record for one worker id.
//
//   - start is the function used to (re)start the worker; killWorker
//     sets it to nil, which the run loop treats as "do not restart".
//   - worker is the running worker once it has been reported on
//     startedc; it is nil before that and after killWorker has killed it.
//   - restartDelay is the delay handed to runWorker for the next
//     restart.
//   - stopping is set by killWorker to record that the worker has been
//     asked to stop.
type workerInfo struct {
	start        func() (Worker, error)
	worker       Worker
	restartDelay time.Duration
	stopping     bool
}
   142  
   143  func (runner *runner) run() error {
   144  	// workers holds the current set of workers.  All workers with a
   145  	// running goroutine have an entry here.
   146  	workers := make(map[string]*workerInfo)
   147  	var finalError error
   148  
   149  	// isDying holds whether the runner is currently dying.  When it
   150  	// is dying (whether as a result of being killed or due to a
   151  	// fatal error), all existing workers are killed, no new workers
   152  	// will be started, and the loop will exit when all existing
   153  	// workers have stopped.
   154  	isDying := false
   155  	tombDying := runner.tomb.Dying()
   156  	for {
   157  		if isDying && len(workers) == 0 {
   158  			return finalError
   159  		}
   160  		select {
   161  		case <-tombDying:
   162  			logger.Infof("runner is dying")
   163  			isDying = true
   164  			killAll(workers)
   165  			tombDying = nil
   166  		case req := <-runner.startc:
   167  			if isDying {
   168  				logger.Infof("ignoring start request for %q when dying", req.id)
   169  				break
   170  			}
   171  			info := workers[req.id]
   172  			if info == nil {
   173  				workers[req.id] = &workerInfo{
   174  					start:        req.start,
   175  					restartDelay: RestartDelay,
   176  				}
   177  				go runner.runWorker(0, req.id, req.start)
   178  				break
   179  			}
   180  			if !info.stopping {
   181  				// The worker is already running, so leave it alone
   182  				break
   183  			}
   184  			// The worker previously existed and is
   185  			// currently being stopped.  When it eventually
   186  			// does stop, we'll restart it immediately with
   187  			// the new start function.
   188  			info.start = req.start
   189  			info.restartDelay = 0
   190  		case id := <-runner.stopc:
   191  			logger.Debugf("stop %q", id)
   192  			if info := workers[id]; info != nil {
   193  				killWorker(id, info)
   194  			}
   195  		case info := <-runner.startedc:
   196  			logger.Debugf("%q started", info.id)
   197  			workerInfo := workers[info.id]
   198  			workerInfo.worker = info.worker
   199  			if isDying || workerInfo.stopping {
   200  				killWorker(info.id, workerInfo)
   201  			}
   202  		case info := <-runner.donec:
   203  			logger.Debugf("%q done: %v", info.id, info.err)
   204  			workerInfo := workers[info.id]
   205  			if !workerInfo.stopping && info.err == nil {
   206  				logger.Debugf("removing %q from known workers", info.id)
   207  				delete(workers, info.id)
   208  				break
   209  			}
   210  			if info.err != nil {
   211  				if runner.isFatal(info.err) {
   212  					logger.Errorf("fatal %q: %v", info.id, info.err)
   213  					if finalError == nil || runner.moreImportant(info.err, finalError) {
   214  						finalError = info.err
   215  					}
   216  					delete(workers, info.id)
   217  					if !isDying {
   218  						isDying = true
   219  						killAll(workers)
   220  					}
   221  					break
   222  				} else {
   223  					logger.Errorf("exited %q: %v", info.id, info.err)
   224  				}
   225  			}
   226  			if workerInfo.start == nil {
   227  				logger.Debugf("no restart, removing %q from known workers", info.id)
   228  
   229  				// The worker has been deliberately stopped;
   230  				// we can now remove it from the list of workers.
   231  				delete(workers, info.id)
   232  				break
   233  			}
   234  			go runner.runWorker(workerInfo.restartDelay, info.id, workerInfo.start)
   235  			workerInfo.restartDelay = RestartDelay
   236  		}
   237  	}
   238  }
   239  
   240  func killAll(workers map[string]*workerInfo) {
   241  	for id, info := range workers {
   242  		killWorker(id, info)
   243  	}
   244  }
   245  
   246  func killWorker(id string, info *workerInfo) {
   247  	if info.worker != nil {
   248  		logger.Debugf("killing %q", id)
   249  		info.worker.Kill()
   250  		info.worker = nil
   251  	} else {
   252  		logger.Debugf("couldn't kill %q, not yet started", id)
   253  	}
   254  	info.stopping = true
   255  	info.start = nil
   256  }
   257  
   258  // runWorker starts the given worker after waiting for the given delay.
   259  func (runner *runner) runWorker(delay time.Duration, id string, start func() (Worker, error)) {
   260  	if delay > 0 {
   261  		logger.Infof("restarting %q in %v", id, delay)
   262  		select {
   263  		case <-runner.tomb.Dying():
   264  			runner.donec <- doneInfo{id, nil}
   265  			return
   266  		case <-time.After(delay):
   267  		}
   268  	}
   269  	logger.Infof("start %q", id)
   270  	worker, err := start()
   271  	if err == nil {
   272  		runner.startedc <- startInfo{id, worker}
   273  		err = worker.Wait()
   274  	}
   275  	logger.Infof("stopped %q, err: %v", id, err)
   276  	runner.donec <- doneInfo{id, err}
   277  }