github.com/altoros/juju-vmware@v0.0.0-20150312064031-f19ae857ccca/worker/runner.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package worker
     5  
     6  import (
     7  	"errors"
     8  	"time"
     9  
    10  	"launchpad.net/tomb"
    11  )
    12  
// RestartDelay holds the length of time that a worker
// will wait between exiting and restarting. It is the delay
// given to newly registered workers and re-armed after every
// restart; a worker that is replaced via StartWorker while it
// is being stopped is restarted once without any delay.
var RestartDelay = 3 * time.Second
    16  
// Worker is implemented by a running worker.
//
// The runner in this file obtains workers from a start function,
// calls Wait to learn when (and with what error) they exit, and
// calls Kill to ask them to stop.
type Worker interface {
	// Kill asks the worker to stop without necessarily
	// waiting for it to do so.
	Kill()
	// Wait waits for the worker to exit and returns any
	// error encountered when it was running.
	Wait() error
}
    26  
// Runner is implemented by instances capable of starting and stopping workers.
// A Runner is itself a Worker, so it can be killed and waited for.
type Runner interface {
	Worker
	// StartWorker asks the runner to start a worker associated with
	// id, using startFunc to create it.
	StartWorker(id string, startFunc func() (Worker, error)) error
	// StopWorker asks the runner to stop the worker associated with id.
	StopWorker(id string) error
	// Dying returns a channel; the implementation in this file
	// returns the Dying channel of its tomb.
	Dying() <-chan struct{}
}
    34  
// runner runs a set of workers, restarting them as necessary
// when they fail.
//
// All worker bookkeeping is done by the run method; the channels
// below are how the other methods and the per-worker goroutines
// communicate with it.
type runner struct {
	// tomb tracks the lifetime of the run loop.
	tomb tomb.Tomb
	// startc carries requests from StartWorker to the run loop.
	startc chan startReq
	// stopc carries the ids passed to StopWorker to the run loop.
	stopc chan string
	// donec is sent on by runWorker when a worker has exited (or
	// failed to start, or its restart was abandoned).
	donec chan doneInfo
	// startedc is sent on by runWorker once a worker has been
	// successfully created by its start function.
	startedc chan startInfo
	// isFatal reports whether a worker error should bring down
	// the whole runner.
	isFatal func(error) bool
	// moreImportant reports whether err0 should take precedence
	// over err1 as the runner's final error.
	moreImportant func(err0, err1 error) bool
}
    46  
// Compile-time check that *runner implements Runner.
var _ Runner = (*runner)(nil)
    48  
// startReq is the message sent on runner.startc by StartWorker.
type startReq struct {
	// id identifies the worker to start.
	id string
	// start creates the worker.
	start func() (Worker, error)
}
    53  
// startInfo is the message sent on runner.startedc by runWorker
// when a worker's start function has succeeded.
type startInfo struct {
	// id identifies the worker that was started.
	id string
	// worker is the value returned by the start function.
	worker Worker
}
    58  
// doneInfo is the message sent on runner.donec by runWorker when
// the goroutine for a worker is finishing.
type doneInfo struct {
	// id identifies the worker that finished.
	id string
	// err is the error from the start function or from the
	// worker's Wait method; it is nil when the worker exited
	// cleanly or when a delayed restart was abandoned because
	// the runner was dying.
	err error
}
    63  
    64  // NewRunner creates a new Runner.  When a worker finishes, if its error
    65  // is deemed fatal (determined by calling isFatal), all the other workers
    66  // will be stopped and the runner itself will finish.  Of all the fatal errors
    67  // returned by the stopped workers, only the most important one,
    68  // determined by calling moreImportant, will be returned from
    69  // Runner.Wait. Non-fatal errors will not be returned.
    70  //
    71  // The function isFatal(err) returns whether err is a fatal error.  The
    72  // function moreImportant(err0, err1) returns whether err0 is considered
    73  // more important than err1.
    74  func NewRunner(isFatal func(error) bool, moreImportant func(err0, err1 error) bool) Runner {
    75  	runner := &runner{
    76  		startc:        make(chan startReq),
    77  		stopc:         make(chan string),
    78  		donec:         make(chan doneInfo),
    79  		startedc:      make(chan startInfo),
    80  		isFatal:       isFatal,
    81  		moreImportant: moreImportant,
    82  	}
    83  	go func() {
    84  		defer runner.tomb.Done()
    85  		runner.tomb.Kill(runner.run())
    86  	}()
    87  	return runner
    88  }
    89  
// ErrDead is returned by StartWorker and StopWorker when the
// runner's tomb is dead and the request could not be delivered.
var ErrDead = errors.New("worker runner is not running")
    91  
    92  // StartWorker starts a worker running associated with the given id.
    93  // The startFunc function will be called to create the worker;
    94  // when the worker exits, it will be restarted as long as it
    95  // does not return a fatal error.
    96  //
    97  // If there is already a worker with the given id, nothing will be done.
    98  //
    99  // StartWorker returns ErrDead if the runner is not running.
   100  func (runner *runner) StartWorker(id string, startFunc func() (Worker, error)) error {
   101  	select {
   102  	case runner.startc <- startReq{id, startFunc}:
   103  		return nil
   104  	case <-runner.tomb.Dead():
   105  	}
   106  	return ErrDead
   107  }
   108  
   109  // StopWorker stops the worker associated with the given id.
   110  // It does nothing if there is no such worker.
   111  //
   112  // StopWorker returns ErrDead if the runner is not running.
   113  func (runner *runner) StopWorker(id string) error {
   114  	select {
   115  	case runner.stopc <- id:
   116  		return nil
   117  	case <-runner.tomb.Dead():
   118  	}
   119  	return ErrDead
   120  }
   121  
// Wait implements Worker.Wait by waiting on the runner's tomb and
// returning its result.
func (runner *runner) Wait() error {
	return runner.tomb.Wait()
}
   125  
// Kill implements Worker.Kill by killing the runner's tomb with a
// nil error; the run loop observes this through the tomb's Dying
// channel and shuts the workers down.
func (runner *runner) Kill() {
	logger.Debugf("killing runner %p", runner)
	runner.tomb.Kill(nil)
}
   130  
// Dying implements Runner.Dying by returning the Dying channel of
// the runner's tomb.
func (runner *runner) Dying() <-chan struct{} {
	return runner.tomb.Dying()
}
   134  
   135  // Stop kills the given worker and waits for it to exit.
   136  func Stop(worker Worker) error {
   137  	worker.Kill()
   138  	return worker.Wait()
   139  }
   140  
// workerInfo is the run loop's bookkeeping record for one worker id.
type workerInfo struct {
	// start is the function used to (re)start the worker.  It is
	// set to nil by killWorker, which marks the worker as not to
	// be restarted.
	start func() (Worker, error)
	// worker is the running worker as reported on startedc.  It
	// is nil before the worker has reported as started and after
	// killWorker has killed it.
	worker Worker
	// restartDelay is the delay passed to runWorker the next time
	// the worker is restarted.
	restartDelay time.Duration
	// stopping is set by killWorker to record that the worker has
	// been asked to stop.
	stopping bool
}
   147  
   148  func (runner *runner) run() error {
   149  	// workers holds the current set of workers.  All workers with a
   150  	// running goroutine have an entry here.
   151  	workers := make(map[string]*workerInfo)
   152  	var finalError error
   153  
   154  	// isDying holds whether the runner is currently dying.  When it
   155  	// is dying (whether as a result of being killed or due to a
   156  	// fatal error), all existing workers are killed, no new workers
   157  	// will be started, and the loop will exit when all existing
   158  	// workers have stopped.
   159  	isDying := false
   160  	tombDying := runner.tomb.Dying()
   161  	for {
   162  		if isDying && len(workers) == 0 {
   163  			return finalError
   164  		}
   165  		select {
   166  		case <-tombDying:
   167  			logger.Infof("runner is dying")
   168  			isDying = true
   169  			killAll(workers)
   170  			tombDying = nil
   171  		case req := <-runner.startc:
   172  			if isDying {
   173  				logger.Infof("ignoring start request for %q when dying", req.id)
   174  				break
   175  			}
   176  			info := workers[req.id]
   177  			if info == nil {
   178  				workers[req.id] = &workerInfo{
   179  					start:        req.start,
   180  					restartDelay: RestartDelay,
   181  				}
   182  				go runner.runWorker(0, req.id, req.start)
   183  				break
   184  			}
   185  			if !info.stopping {
   186  				// The worker is already running, so leave it alone
   187  				break
   188  			}
   189  			// The worker previously existed and is
   190  			// currently being stopped.  When it eventually
   191  			// does stop, we'll restart it immediately with
   192  			// the new start function.
   193  			info.start = req.start
   194  			info.restartDelay = 0
   195  		case id := <-runner.stopc:
   196  			if info := workers[id]; info != nil {
   197  				killWorker(id, info)
   198  			}
   199  		case info := <-runner.startedc:
   200  			workerInfo := workers[info.id]
   201  			workerInfo.worker = info.worker
   202  			if isDying {
   203  				killWorker(info.id, workerInfo)
   204  			}
   205  		case info := <-runner.donec:
   206  			workerInfo := workers[info.id]
   207  			if !workerInfo.stopping && info.err == nil {
   208  				delete(workers, info.id)
   209  				break
   210  			}
   211  			if info.err != nil {
   212  				if runner.isFatal(info.err) {
   213  					logger.Errorf("fatal %q: %v", info.id, info.err)
   214  					if finalError == nil || runner.moreImportant(info.err, finalError) {
   215  						finalError = info.err
   216  					}
   217  					delete(workers, info.id)
   218  					if !isDying {
   219  						isDying = true
   220  						killAll(workers)
   221  					}
   222  					break
   223  				} else {
   224  					logger.Errorf("exited %q: %v", info.id, info.err)
   225  				}
   226  			}
   227  			if workerInfo.start == nil {
   228  				// The worker has been deliberately stopped;
   229  				// we can now remove it from the list of workers.
   230  				delete(workers, info.id)
   231  				break
   232  			}
   233  			go runner.runWorker(workerInfo.restartDelay, info.id, workerInfo.start)
   234  			workerInfo.restartDelay = RestartDelay
   235  		}
   236  	}
   237  }
   238  
   239  func killAll(workers map[string]*workerInfo) {
   240  	for id, info := range workers {
   241  		killWorker(id, info)
   242  	}
   243  }
   244  
   245  func killWorker(id string, info *workerInfo) {
   246  	if info.worker != nil {
   247  		logger.Debugf("killing %q", id)
   248  		info.worker.Kill()
   249  		info.worker = nil
   250  	}
   251  	info.stopping = true
   252  	info.start = nil
   253  }
   254  
   255  // runWorker starts the given worker after waiting for the given delay.
   256  func (runner *runner) runWorker(delay time.Duration, id string, start func() (Worker, error)) {
   257  	if delay > 0 {
   258  		logger.Infof("restarting %q in %v", id, delay)
   259  		select {
   260  		case <-runner.tomb.Dying():
   261  			runner.donec <- doneInfo{id, nil}
   262  			return
   263  		case <-time.After(delay):
   264  		}
   265  	}
   266  	logger.Infof("start %q", id)
   267  	worker, err := start()
   268  	if err == nil {
   269  		runner.startedc <- startInfo{id, worker}
   270  		err = worker.Wait()
   271  	}
   272  	runner.donec <- doneInfo{id, err}
   273  }