github.com/wallyworld/juju@v0.0.0-20161013125918-6cf1bc9d917a/worker/runner.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package worker
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/errors"
    10  	"gopkg.in/tomb.v1"
    11  )
    12  
// RestartDelay holds the length of time that a worker
// will wait between exiting and restarting.
//
// The code in this file does not read this constant directly:
// NewRunner takes the delay as an explicit argument.
const RestartDelay = 3 * time.Second
    16  
// Runner is implemented by instances capable of starting and stopping workers.
// It embeds Worker, so a Runner can itself be used wherever a Worker is
// expected.
type Runner interface {
	Worker
	// StartWorker asks for a worker to be run under the given id; startFunc
	// is called to create the worker.
	StartWorker(id string, startFunc func() (Worker, error)) error
	// StopWorker asks for the worker registered under the given id to be
	// stopped.
	StopWorker(id string) error
}
    23  
// runner runs a set of workers, restarting them as necessary
// when they fail.
//
// All bookkeeping is done by the single goroutine executing run; the
// channels below are how the public methods and the per-worker goroutines
// communicate with it.
type runner struct {
	// tomb tracks the lifetime of the run loop started by NewRunner.
	tomb          tomb.Tomb
	// startc carries requests made through StartWorker.
	startc        chan startReq
	// stopc carries the ids passed to StopWorker.
	stopc         chan string
	// donec is sent on by runWorker when a worker goroutine finishes.
	donec         chan doneInfo
	// startedc is sent on by runWorker once a start function has
	// successfully returned a worker.
	startedc      chan startInfo
	// isFatal reports whether a worker's error should bring down the
	// whole runner.
	isFatal       func(error) bool
	// moreImportant reports whether err0 should replace err1 as the
	// error returned by the runner.
	moreImportant func(err0, err1 error) bool

	// restartDelay holds the length of time that a worker
	// will wait between exiting and restarting.
	restartDelay time.Duration
}
    39  
// startReq is the message sent on runner.startc by StartWorker: a request
// to run the worker created by start under the given id.
type startReq struct {
	id    string
	start func() (Worker, error)
}
    44  
// startInfo is the message sent on runner.startedc by runWorker after a
// start function has returned a worker without error.
type startInfo struct {
	id     string
	worker Worker
}
    49  
// doneInfo is the message sent on runner.donec by runWorker when the
// goroutine for the worker with the given id finishes. err is the error
// from the start function or from the worker's Wait, or nil.
type doneInfo struct {
	id  string
	err error
}
    54  
    55  // NewRunner creates a new Runner.  When a worker finishes, if its error
    56  // is deemed fatal (determined by calling isFatal), all the other workers
    57  // will be stopped and the runner itself will finish.  Of all the fatal errors
    58  // returned by the stopped workers, only the most important one,
    59  // determined by calling moreImportant, will be returned from
    60  // Runner.Wait. Non-fatal errors will not be returned.
    61  //
    62  // The function isFatal(err) returns whether err is a fatal error.  The
    63  // function moreImportant(err0, err1) returns whether err0 is considered
    64  // more important than err1.
    65  func NewRunner(isFatal func(error) bool, moreImportant func(err0, err1 error) bool, restartDelay time.Duration) Runner {
    66  	runner := &runner{
    67  		startc:        make(chan startReq),
    68  		stopc:         make(chan string),
    69  		donec:         make(chan doneInfo),
    70  		startedc:      make(chan startInfo),
    71  		isFatal:       isFatal,
    72  		moreImportant: moreImportant,
    73  		restartDelay:  restartDelay,
    74  	}
    75  	go func() {
    76  		defer runner.tomb.Done()
    77  		runner.tomb.Kill(runner.run())
    78  	}()
    79  	return runner
    80  }
    81  
// ErrDead is returned by StartWorker and StopWorker when the runner's
// tomb is already dead, so the request cannot be delivered.
var ErrDead = errors.New("worker runner is not running")
    83  
    84  // StartWorker starts a worker running associated with the given id.
    85  // The startFunc function will be called to create the worker;
    86  // when the worker exits, it will be restarted as long as it
    87  // does not return a fatal error.
    88  //
    89  // If there is already a worker with the given id, nothing will be done.
    90  //
    91  // StartWorker returns ErrDead if the runner is not running.
    92  func (runner *runner) StartWorker(id string, startFunc func() (Worker, error)) error {
    93  	select {
    94  	case runner.startc <- startReq{id, startFunc}:
    95  		return nil
    96  	case <-runner.tomb.Dead():
    97  	}
    98  	return ErrDead
    99  }
   100  
   101  // StopWorker stops the worker associated with the given id.
   102  // It does nothing if there is no such worker.
   103  //
   104  // StopWorker returns ErrDead if the runner is not running.
   105  func (runner *runner) StopWorker(id string) error {
   106  	select {
   107  	case runner.stopc <- id:
   108  		return nil
   109  	case <-runner.tomb.Dead():
   110  	}
   111  	return ErrDead
   112  }
   113  
// Wait returns the result of waiting on the runner's tomb. The tomb is
// killed with the value returned by the run loop (see NewRunner), which
// is the most important fatal worker error, if any.
func (runner *runner) Wait() error {
	return runner.tomb.Wait()
}
   117  
// Kill asks the runner to shut down by killing its tomb with a nil
// error. It does not wait: the run loop notices the tomb is dying, kills
// all of its workers and exits once they have all finished.
func (runner *runner) Kill() {
	logger.Debugf("killing runner %p", runner)
	runner.tomb.Kill(nil)
}
   122  
// workerInfo is the run loop's bookkeeping record for one worker id.
type workerInfo struct {
	// start is the function used to (re)create the worker. It is set to
	// nil by killWorker, which the run loop treats as "do not restart".
	start        func() (Worker, error)
	// worker is the worker most recently reported on startedc. It is nil
	// until the worker has started, and is reset to nil by killWorker.
	worker       Worker
	// restartDelay is the delay to apply the next time this worker is
	// restarted.
	restartDelay time.Duration
	// stopping is set by killWorker to record that the worker has been
	// asked to stop (or will be, as soon as it has started).
	stopping     bool
}
   129  
   130  func (runner *runner) run() error {
   131  	// workers holds the current set of workers.  All workers with a
   132  	// running goroutine have an entry here.
   133  	workers := make(map[string]*workerInfo)
   134  	var finalError error
   135  
   136  	// isDying holds whether the runner is currently dying.  When it
   137  	// is dying (whether as a result of being killed or due to a
   138  	// fatal error), all existing workers are killed, no new workers
   139  	// will be started, and the loop will exit when all existing
   140  	// workers have stopped.
   141  	isDying := false
   142  	tombDying := runner.tomb.Dying()
   143  	for {
   144  		if isDying && len(workers) == 0 {
   145  			return finalError
   146  		}
   147  		select {
   148  		case <-tombDying:
   149  			logger.Infof("runner is dying")
   150  			isDying = true
   151  			killAll(workers)
   152  			tombDying = nil
   153  		case req := <-runner.startc:
   154  			if isDying {
   155  				logger.Infof("ignoring start request for %q when dying", req.id)
   156  				break
   157  			}
   158  			info := workers[req.id]
   159  			if info == nil {
   160  				workers[req.id] = &workerInfo{
   161  					start:        req.start,
   162  					restartDelay: runner.restartDelay,
   163  				}
   164  				go runner.runWorker(0, req.id, req.start)
   165  				break
   166  			}
   167  			if !info.stopping {
   168  				// The worker is already running, so leave it alone
   169  				break
   170  			}
   171  			// The worker previously existed and is
   172  			// currently being stopped.  When it eventually
   173  			// does stop, we'll restart it immediately with
   174  			// the new start function.
   175  			info.start = req.start
   176  			info.restartDelay = 0
   177  		case id := <-runner.stopc:
   178  			logger.Debugf("stop %q", id)
   179  			if info := workers[id]; info != nil {
   180  				killWorker(id, info)
   181  			}
   182  		case info := <-runner.startedc:
   183  			logger.Debugf("%q started", info.id)
   184  			workerInfo := workers[info.id]
   185  			workerInfo.worker = info.worker
   186  			if isDying || workerInfo.stopping {
   187  				killWorker(info.id, workerInfo)
   188  			}
   189  		case info := <-runner.donec:
   190  			logger.Debugf("%q done: %v", info.id, info.err)
   191  			workerInfo := workers[info.id]
   192  			if !workerInfo.stopping && info.err == nil {
   193  				logger.Debugf("removing %q from known workers", info.id)
   194  				delete(workers, info.id)
   195  				break
   196  			}
   197  			if info.err != nil {
   198  				if runner.isFatal(info.err) {
   199  					logger.Errorf("fatal %q: %v", info.id, info.err)
   200  					if finalError == nil || runner.moreImportant(info.err, finalError) {
   201  						finalError = info.err
   202  					}
   203  					delete(workers, info.id)
   204  					if !isDying {
   205  						isDying = true
   206  						killAll(workers)
   207  					}
   208  					break
   209  				} else {
   210  					logger.Errorf("exited %q: %v", info.id, info.err)
   211  				}
   212  			}
   213  			if workerInfo.start == nil {
   214  				logger.Debugf("no restart, removing %q from known workers", info.id)
   215  
   216  				// The worker has been deliberately stopped;
   217  				// we can now remove it from the list of workers.
   218  				delete(workers, info.id)
   219  				break
   220  			}
   221  			go runner.runWorker(workerInfo.restartDelay, info.id, workerInfo.start)
   222  			workerInfo.restartDelay = runner.restartDelay
   223  		}
   224  	}
   225  }
   226  
   227  func killAll(workers map[string]*workerInfo) {
   228  	for id, info := range workers {
   229  		killWorker(id, info)
   230  	}
   231  }
   232  
   233  func killWorker(id string, info *workerInfo) {
   234  	if info.worker != nil {
   235  		logger.Debugf("killing %q", id)
   236  		info.worker.Kill()
   237  		info.worker = nil
   238  	} else {
   239  		logger.Debugf("couldn't kill %q, not yet started", id)
   240  	}
   241  	info.stopping = true
   242  	info.start = nil
   243  }
   244  
   245  // runWorker starts the given worker after waiting for the given delay.
   246  func (runner *runner) runWorker(delay time.Duration, id string, start func() (Worker, error)) {
   247  	if delay > 0 {
   248  		logger.Infof("restarting %q in %v", id, delay)
   249  		select {
   250  		case <-runner.tomb.Dying():
   251  			runner.donec <- doneInfo{id, nil}
   252  			return
   253  		case <-time.After(delay):
   254  		}
   255  	}
   256  	logger.Infof("start %q", id)
   257  	worker, err := start()
   258  	if err == nil {
   259  		runner.startedc <- startInfo{id, worker}
   260  		err = worker.Wait()
   261  	}
   262  	logger.Infof("stopped %q, err: %v", id, err)
   263  	runner.donec <- doneInfo{id, err}
   264  }